]> git.ipfire.org Git - thirdparty/man-pages.git/blame - man2/perf_event_open.2
strftime.3: BUGS: 'errno' is not set if the result string would exceed 'max' bytes
[thirdparty/man-pages.git] / man2 / perf_event_open.2
CommitLineData
f2b1d720
MK
1.\" Copyright (c) 2012, Vincent Weaver
2.\"
1dd72f9c 3.\" %%%LICENSE_START(GPLv2+_DOC_FULL)
f2b1d720
MK
4.\" This is free documentation; you can redistribute it and/or
5.\" modify it under the terms of the GNU General Public License as
6.\" published by the Free Software Foundation; either version 2 of
7.\" the License, or (at your option) any later version.
8.\"
9.\" The GNU General Public License's references to "object code"
10.\" and "executables" are to be interpreted as the output of any
11.\" document formatting or typesetting system, including
12.\" intermediate and printed output.
13.\"
14.\" This manual is distributed in the hope that it will be useful,
15.\" but WITHOUT ANY WARRANTY; without even the implied warranty of
16.\" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17.\" GNU General Public License for more details.
18.\"
19.\" You should have received a copy of the GNU General Public
20.\" License along with this manual; if not, see
21.\" <http://www.gnu.org/licenses/>.
6a8d8745 22.\" %%%LICENSE_END
f2b1d720
MK
23.\"
24.\" This document is based on the perf_event.h header file, the
25.\" tools/perf/design.txt file, and a lot of bitter experience.
26.\"
f9293d98 27.TH PERF_EVENT_OPEN 2 2013-06-21 "Linux" "Linux Programmer's Manual"
f2b1d720
MK
28.SH NAME
29perf_event_open \- set up performance monitoring
30.SH SYNOPSIS
31.nf
32.B #include <linux/perf_event.h>
33.B #include <linux/hw_breakpoint.h>
34.sp
35.BI "int perf_event_open(struct perf_event_attr *" attr ,
36.BI " pid_t " pid ", int " cpu ", int " group_fd ,
37.BI " unsigned long " flags );
38.fi
39
40.IR Note :
41There is no glibc wrapper for this system call; see NOTES.
42.SH DESCRIPTION
43Given a list of parameters,
44.BR perf_event_open ()
45returns a file descriptor, for use in subsequent system calls
46.RB ( read "(2), " mmap "(2), " prctl "(2), " fcntl "(2), etc.)."
47.PP
48A call to
49.BR perf_event_open ()
50creates a file descriptor that allows measuring performance
51information.
52Each file descriptor corresponds to one
53event that is measured; these can be grouped together
54to measure multiple events simultaneously.
55.PP
56Events can be enabled and disabled in two ways: via
57.BR ioctl (2)
58and via
59.BR prctl (2) .
60When an event is disabled it does not count or generate overflows but does
61continue to exist and maintain its count value.
62.PP
63Events come in two flavors: counting and sampled.
64A
65.I counting
66event is one that is used for counting the aggregate number of events
67that occur.
68In general, counting event results are gathered with a
69.BR read (2)
70call.
71A
72.I sampling
73event periodically writes measurements to a buffer that can then
74be accessed via
75.BR mmap (2) .
76.SS Arguments
77.P
78The argument
79.I pid
80allows events to be attached to processes in various ways.
81If
82.I pid
83is 0, measurements happen on the current thread; if
84.I pid
85is greater than 0, the process indicated by
86.I pid
87is measured, and if
88.I pid
89is \-1, all processes are counted.
90
91The
92.I cpu
93argument allows measurements to be specific to a CPU.
94If
95.I cpu
96is greater than or equal to 0,
97measurements are restricted to the specified CPU;
98if
99.I cpu
100is \-1, the events are measured on all CPUs.
101.P
102Note that the combination of
103.IR pid " == \-1"
104and
105.IR cpu " == \-1"
106is not valid.
107.P
108A
109.IR pid " > 0"
110and
111.IR cpu " == \-1"
112setting measures per-process and follows that process to whatever CPU the
113process gets scheduled to.
114Per-process events can be created by any user.
115.P
116A
117.IR pid " == \-1"
118and
119.IR cpu " >= 0"
120setting is per-CPU and measures all processes on the specified CPU.
121Per-CPU events need the
122.B CAP_SYS_ADMIN
123capability or a
124.I /proc/sys/kernel/perf_event_paranoid
125value of less than 1.
126.P
127The
128.I group_fd
129argument allows event groups to be created.
130An event group has one event which is the group leader.
131The leader is created first, with
132.IR group_fd " = \-1."
133The rest of the group members are created with subsequent
134.BR perf_event_open ()
135calls with
136.IR group_fd
137being set to the fd of the group leader.
138(A single event on its own is created with
139.IR group_fd " = \-1"
140and is considered to be a group with only 1 member.)
33a0ccb2
MK
141An event group is scheduled onto the CPU as a unit: it will
142be put onto the CPU only if all of the events in the group can be put onto
f2b1d720
MK
143the CPU.
144This means that the values of the member events can be
145meaningfully compared, added, divided (to get ratios), etc., with each
146other, since they have counted events for the same set of executed
147instructions.
148.P
149The
150.I flags
08e325e8 151argument is formed by ORing together zero or more of the following values:
f2b1d720
MK
152.TP
153.BR PERF_FLAG_FD_NO_GROUP
154.\" FIXME The following sentence is unclear
155This flag allows creating an event as part of an event group but
156having no group leader.
157It is unclear why this is useful.
158.\" FIXME So, why is it useful?
159.TP
160.BR PERF_FLAG_FD_OUTPUT
161This flag re-routes the output from an event to the group leader.
162.TP
163.BR PERF_FLAG_PID_CGROUP " (Since Linux 2.6.39)."
164This flag activates per-container system-wide monitoring.
165A container
166is an abstraction that isolates a set of resources for finer grain
167control (CPUs, memory, etc...).
168In this mode, the event is measured
169only if the thread running on the monitored CPU belongs to the designated
170container (cgroup).
171The cgroup is identified by passing a file descriptor
172opened on its directory in the cgroupfs filesystem.
173For instance, if the
174cgroup to monitor is called
175.IR test ,
176then a file descriptor opened on
177.I /dev/cgroup/test
178(assuming cgroupfs is mounted on
179.IR /dev/cgroup )
180must be passed as the
181.I pid
182parameter.
33a0ccb2 183cgroup monitoring is available only
f2b1d720
MK
184for system-wide events and may therefore require extra permissions.
185.P
186The
187.I perf_event_attr
188structure provides detailed configuration information
189for the event being created.
190
191.in +4n
192.nf
193struct perf_event_attr {
194 __u32 type; /* Type of event */
195 __u32 size; /* Size of attribute structure */
196 __u64 config; /* Type-specific configuration */
197
198 union {
199 __u64 sample_period; /* Period of sampling */
200 __u64 sample_freq; /* Frequency of sampling */
201 };
202
203 __u64 sample_type; /* Specifies values included in sample */
204 __u64 read_format; /* Specifies values returned in read */
205
7db515ef
MK
206 __u64 disabled : 1, /* off by default */
207 inherit : 1, /* children inherit it */
208 pinned : 1, /* must always be on PMU */
209 exclusive : 1, /* only group on PMU */
210 exclude_user : 1, /* don't count user */
211 exclude_kernel : 1, /* don't count kernel */
f2b1d720 212 exclude_hv : 1, /* don't count hypervisor */
7db515ef
MK
213 exclude_idle : 1, /* don't count when idle */
214 mmap : 1, /* include mmap data */
215 comm : 1, /* include comm data */
216 freq : 1, /* use freq, not period */
217 inherit_stat : 1, /* per task counts */
218 enable_on_exec : 1, /* next exec enables */
219 task : 1, /* trace fork/exit */
220 watermark : 1, /* wakeup_watermark */
221 precise_ip : 2, /* skid constraint */
222 mmap_data : 1, /* non-exec mmap data */
f2b1d720 223 sample_id_all : 1, /* sample_type all events */
7db515ef
MK
224 exclude_host : 1, /* don't count in host */
225 exclude_guest : 1, /* don't count in guest */
226 exclude_callchain_kernel : 1,
227 /* exclude kernel callchains */
228 exclude_callchain_user : 1,
229 /* exclude user callchains */
f2b1d720
MK
230 __reserved_1 : 41;
231
232 union {
233 __u32 wakeup_events; /* wakeup every n events */
7db515ef 234 __u32 wakeup_watermark; /* bytes before wakeup */
f2b1d720
MK
235 };
236
237 __u32 bp_type; /* breakpoint type */
238
239 union {
240 __u64 bp_addr; /* breakpoint address */
241 __u64 config1; /* extension of config */
242 };
243
244 union {
245 __u64 bp_len; /* breakpoint length */
246 __u64 config2; /* extension of config1 */
247 };
7db515ef
MK
248 __u64 branch_sample_type; /* enum perf_branch_sample_type */
249 __u64 sample_regs_user; /* user regs to dump on samples */
250 __u32 sample_stack_user; /* size of stack to dump on
251 samples */
252 __u32 __reserved_2; /* Align to u64 */
f2b1d720
MK
253
254};
255.fi
256.in
257
258The fields of the
259.I perf_event_attr
260structure are described in more detail below:
f2b1d720
MK
261.TP
262.I type
263This field specifies the overall event type.
264It has one of the following values:
265.RS
266.TP
267.B PERF_TYPE_HARDWARE
268This indicates one of the "generalized" hardware events provided
269by the kernel.
270See the
271.I config
272field definition for more details.
273.TP
274.B PERF_TYPE_SOFTWARE
275This indicates one of the software-defined events provided by the kernel
276(even if no hardware support is available).
277.TP
278.B PERF_TYPE_TRACEPOINT
279This indicates a tracepoint
280provided by the kernel tracepoint infrastructure.
281.TP
282.B PERF_TYPE_HW_CACHE
283This indicates a hardware cache event.
284This has a special encoding, described in the
285.I config
286field definition.
287.TP
288.B PERF_TYPE_RAW
289This indicates a "raw" implementation-specific event in the
290.IR config " field."
291.TP
292.BR PERF_TYPE_BREAKPOINT " (Since Linux 2.6.33)"
293This indicates a hardware breakpoint as provided by the CPU.
294Breakpoints can be read/write accesses to an address as well as
295execution of an instruction address.
296.TP
297.RB "dynamic PMU"
298Since Linux 2.6.39,
7db515ef 299.BR perf_event_open ()
f2b1d720
MK
300can support multiple PMUs.
301To enable this, a value exported by the kernel can be used in the
302.I type
303field to indicate which PMU to use.
304The value to use can be found in the sysfs filesystem:
305there is a subdirectory per PMU instance under
306.IR /sys/bus/event_source/devices .
307In each sub-directory there is a
308.I type
309file whose content is an integer that can be used in the
310.I type
311field.
312For instance,
313.I /sys/bus/event_source/devices/cpu/type
314contains the value for the core CPU PMU, which is usually 4.
315.RE
f2b1d720
MK
316.TP
317.I "size"
318The size of the
319.I perf_event_attr
320structure for forward/backward compatibility.
321Set this using
322.I sizeof(struct perf_event_attr)
323to allow the kernel to see
324the struct size at the time of compilation.
325
326The related define
327.B PERF_ATTR_SIZE_VER0
328is set to 64; this was the size of the first published struct.
329.B PERF_ATTR_SIZE_VER1
330is 72, corresponding to the addition of breakpoints in Linux 2.6.33.
331.B PERF_ATTR_SIZE_VER2
332is 80 corresponding to the addition of branch sampling in Linux 3.4.
333.B PERF_ATTR_SIZE_VER3
334is 96 corresponding to the addition
7ede2f66
DP
335of
336.I sample_regs_user
337and
338.I sample_stack_user
339in Linux 3.7.
f2b1d720
MK
340.TP
341.I "config"
342This specifies which event you want, in conjunction with
343the
344.I type
345field.
346The
347.IR config1 " and " config2
348fields are also taken into account in cases where 64 bits is not
349enough to fully specify the event.
350The encoding of these fields is event dependent.
351
352The most significant bit (bit 63) of
353.I config
354signifies CPU-specific (raw) counter configuration data;
355if the most significant bit is unset, the next 7 bits are an event
356type and the rest of the bits are the event identifier.
357
358There are various ways to set the
359.I config
360field that are dependent on the value of the previously
361described
362.I type
363field.
364What follows are various possible settings for
365.I config
366separated out by
367.IR type .
368
369If
370.I type
371is
372.BR PERF_TYPE_HARDWARE ,
373we are measuring one of the generalized hardware CPU events.
374Not all of these are available on all platforms.
375Set
376.I config
377to one of the following:
378.RS 12
379.TP
380.B PERF_COUNT_HW_CPU_CYCLES
381Total cycles.
382Be wary of what happens during CPU frequency scaling
383.TP
384.B PERF_COUNT_HW_INSTRUCTIONS
385Retired instructions.
386Be careful, these can be affected by various
387issues, most notably hardware interrupt counts
388.TP
389.B PERF_COUNT_HW_CACHE_REFERENCES
390Cache accesses.
391Usually this indicates Last Level Cache accesses but this may
392vary depending on your CPU.
393This may include prefetches and coherency messages; again this
394depends on the design of your CPU.
395.TP
396.B PERF_COUNT_HW_CACHE_MISSES
397Cache misses.
398Usually this indicates Last Level Cache misses; this is intended to be
399used in conjunction with the
400.B PERF_COUNT_HW_CACHE_REFERENCES
401event to calculate cache miss rates.
402.TP
403.B PERF_COUNT_HW_BRANCH_INSTRUCTIONS
404Retired branch instructions.
405Prior to Linux 2.6.34, this used
406the wrong event on AMD processors.
407.TP
408.B PERF_COUNT_HW_BRANCH_MISSES
409Mispredicted branch instructions.
410.TP
411.B PERF_COUNT_HW_BUS_CYCLES
412Bus cycles, which can be different from total cycles.
413.TP
414.BR PERF_COUNT_HW_STALLED_CYCLES_FRONTEND " (Since Linux 3.0)"
415Stalled cycles during issue.
416.TP
417.BR PERF_COUNT_HW_STALLED_CYCLES_BACKEND " (Since Linux 3.0)"
418Stalled cycles during retirement.
419.TP
420.BR PERF_COUNT_HW_REF_CPU_CYCLES " (Since Linux 3.3)"
421Total cycles; not affected by CPU frequency scaling.
422.RE
423.IP
424If
425.I type
426is
427.BR PERF_TYPE_SOFTWARE ,
428we are measuring software events provided by the kernel.
429Set
430.I config
431to one of the following:
432.RS 12
433.TP
434.B PERF_COUNT_SW_CPU_CLOCK
435This reports the CPU clock, a high-resolution per-CPU timer.
436.TP
437.B PERF_COUNT_SW_TASK_CLOCK
438This reports a clock count specific to the task that is running.
439.TP
440.B PERF_COUNT_SW_PAGE_FAULTS
441This reports the number of page faults.
442.TP
443.B PERF_COUNT_SW_CONTEXT_SWITCHES
444This counts context switches.
445Until Linux 2.6.34, these were all reported as user-space
446events, after that they are reported as happening in the kernel.
447.TP
448.B PERF_COUNT_SW_CPU_MIGRATIONS
449This reports the number of times the process
450has migrated to a new CPU.
451.TP
452.B PERF_COUNT_SW_PAGE_FAULTS_MIN
453This counts the number of minor page faults.
454These did not require disk I/O to handle.
455.TP
456.B PERF_COUNT_SW_PAGE_FAULTS_MAJ
457This counts the number of major page faults.
458These required disk I/O to handle.
459.TP
460.BR PERF_COUNT_SW_ALIGNMENT_FAULTS " (Since Linux 2.6.33)"
461This counts the number of alignment faults.
462These happen when unaligned memory accesses happen; the kernel
463can handle these but it reduces performance.
33a0ccb2 464This happens only on some architectures (never on x86).
f2b1d720
MK
465.TP
466.BR PERF_COUNT_SW_EMULATION_FAULTS " (Since Linux 2.6.33)"
467This counts the number of emulation faults.
468The kernel sometimes traps on unimplemented instructions
7db515ef 469and emulates them for user space.
f2b1d720
MK
470This can negatively impact performance.
471.RE
f2b1d720 472
f2b1d720
MK
473.RS
474If
475.I type
476is
477.BR PERF_TYPE_TRACEPOINT ,
478then we are measuring kernel tracepoints.
479The value to use in
480.I config
481can be obtained from under debugfs
482.I tracing/events/*/*/id
483if ftrace is enabled in the kernel.
f2b1d720 484.RE
1f22e274 485
f2b1d720
MK
486.RS
487If
488.I type
489is
490.BR PERF_TYPE_HW_CACHE ,
491then we are measuring a hardware CPU cache event.
492To calculate the appropriate
493.I config
494value use the following equation:
495.RS 4
496.nf
497
498 (perf_hw_cache_id) | (perf_hw_cache_op_id << 8) |
499 (perf_hw_cache_op_result_id << 16)
500.fi
501.P
502where
503.I perf_hw_cache_id
504is one of:
7db515ef 505.RS 4
f2b1d720
MK
506.TP
507.B PERF_COUNT_HW_CACHE_L1D
508for measuring Level 1 Data Cache
509.TP
510.B PERF_COUNT_HW_CACHE_L1I
511for measuring Level 1 Instruction Cache
512.TP
513.B PERF_COUNT_HW_CACHE_LL
514for measuring Last-Level Cache
515.TP
516.B PERF_COUNT_HW_CACHE_DTLB
517for measuring the Data TLB
518.TP
519.B PERF_COUNT_HW_CACHE_ITLB
520for measuring the Instruction TLB
521.TP
522.B PERF_COUNT_HW_CACHE_BPU
523for measuring the branch prediction unit
524.TP
525.BR PERF_COUNT_HW_CACHE_NODE " (Since Linux 3.0)"
526for measuring local memory accesses
527.RE
f2b1d720
MK
528.P
529and
530.I perf_hw_cache_op_id
531is one of
7db515ef 532.RS 4
f2b1d720
MK
533.TP
534.B PERF_COUNT_HW_CACHE_OP_READ
535for read accesses
536.TP
537.B PERF_COUNT_HW_CACHE_OP_WRITE
538for write accesses
539.TP
540.B PERF_COUNT_HW_CACHE_OP_PREFETCH
541for prefetch accesses
542.RE
f2b1d720
MK
543.P
544and
545.I perf_hw_cache_op_result_id
546is one of
7db515ef 547.RS 4
f2b1d720
MK
548.TP
549.B PERF_COUNT_HW_CACHE_RESULT_ACCESS
550to measure accesses
551.TP
552.B PERF_COUNT_HW_CACHE_RESULT_MISS
553to measure misses
554.RE
555.RE
556
557If
558.I type
559is
560.BR PERF_TYPE_RAW ,
561then a custom "raw"
562.I config
563value is needed.
564Most CPUs support events that are not covered by the "generalized" events.
565These are implementation defined; see your CPU manual (for example
566the Intel Volume 3B documentation or the AMD BIOS and Kernel Developer
567Guide).
568The libpfm4 library can be used to translate from the name in the
569architectural manuals to the raw hex value
570.BR perf_event_open ()
571expects in this field.
572
573If
574.I type
575is
576.BR PERF_TYPE_BREAKPOINT ,
577then leave
578.I config
579set to zero.
580Its parameters are set in other places.
581.RE
582.TP
583.IR sample_period ", " sample_freq
584A "sampling" counter is one that generates an interrupt
585every N events, where N is given by
586.IR sample_period .
587A sampling counter has
588.IR sample_period " > 0."
589When an overflow interrupt occurs, requested data is recorded
590in the mmap buffer.
591The
592.I sample_type
593field controls what data is recorded on each interrupt.
594
595.I sample_freq
596can be used if you wish to use frequency rather than period.
597In this case you set the
598.I freq
599flag.
600The kernel will adjust the sampling period
601to try and achieve the desired rate.
602The rate of adjustment is a
603timer tick.
f2b1d720
MK
604.TP
605.I "sample_type"
606The various bits in this field specify which values to include
607in the sample.
608They will be recorded in a ring-buffer,
ad73a2cc 609which is available to user space using
f2b1d720
MK
610.BR mmap (2).
611The order in which the values are saved in the
612sample is documented in the MMAP Layout subsection below;
613it is not the
614.I "enum perf_event_sample_format"
615order.
616.RS
617.TP
618.B PERF_SAMPLE_IP
619Records instruction pointer.
620.TP
621.B PERF_SAMPLE_TID
7db515ef 622Records the process and thread IDs.
f2b1d720
MK
623.TP
624.B PERF_SAMPLE_TIME
625Records a timestamp.
626.TP
627.B PERF_SAMPLE_ADDR
628Records an address, if applicable.
629.TP
630.B PERF_SAMPLE_READ
631Records counter values for all events in a group, not just the group leader.
632.TP
633.B PERF_SAMPLE_CALLCHAIN
634Records the callchain (stack backtrace).
635.TP
636.B PERF_SAMPLE_ID
637Records a unique ID for the opened event's group leader.
638.TP
639.B PERF_SAMPLE_CPU
640Records CPU number.
641.TP
642.B PERF_SAMPLE_PERIOD
643Records the current sampling period.
644.TP
645.B PERF_SAMPLE_STREAM_ID
646Records a unique ID for the opened event.
647Unlike
648.B PERF_SAMPLE_ID
649the actual ID is returned, not the group leader.
650This ID is the same as the one returned by PERF_FORMAT_ID.
651.TP
652.B PERF_SAMPLE_RAW
653Records additional data, if applicable.
654Usually returned by tracepoint events.
655.TP
656.BR PERF_SAMPLE_BRANCH_STACK " (Since Linux 3.4)"
7db515ef
MK
657Records the branch stack.
658See branch_sample_type.
f2b1d720
MK
659.TP
660.BR PERF_SAMPLE_REGS_USER " (Since Linux 3.7)"
661Records the current register state.
662.TP
663.BR PERF_SAMPLE_STACK_USER " (Since Linux 3.7)"
664[To be documented]
665.RE
f2b1d720
MK
666.TP
667.IR "read_format"
668This field specifies the format of the data returned by
669.BR read (2)
670on a
7db515ef 671.BR perf_event_open ()
f2b1d720
MK
672file descriptor.
673.RS
674.TP
675.B PERF_FORMAT_TOTAL_TIME_ENABLED
7ede2f66
DP
676Adds the 64-bit
677.I time_enabled
678field.
f2b1d720
MK
679This can be used to calculate estimated totals if
680the PMU is overcommitted and multiplexing is happening.
681.TP
682.B PERF_FORMAT_TOTAL_TIME_RUNNING
7ede2f66
DP
683Adds the 64-bit
684.I time_running
685field.
f2b1d720
MK
686This can be used to calculate estimated totals if
687the PMU is overcommitted and multiplexing is happening.
688.TP
689.B PERF_FORMAT_ID
690Adds a 64-bit unique value that corresponds to the event group.
691.TP
692.B PERF_FORMAT_GROUP
693Allows all counter values in an event group to be read with one read.
694.RE
f2b1d720
MK
695.TP
696.IR "disabled"
697The
698.I disabled
699bit specifies whether the counter starts out disabled or enabled.
700If disabled, the event can later be enabled by
701.BR ioctl (2),
702.BR prctl (2),
703or
704.IR enable_on_exec .
f2b1d720
MK
705.TP
706.IR "inherit"
707The
708.I inherit
709bit specifies that this counter should count events of child
710tasks as well as the task specified.
33a0ccb2 711This applies only to new children, not to any existing children at
f2b1d720
MK
712the time the counter is created (nor to any new children of
713existing children).
714
715Inherit does not work for some combinations of
716.IR read_format s,
717such as
718.BR PERF_FORMAT_GROUP .
f2b1d720
MK
719.TP
720.IR "pinned"
721The
722.I pinned
723bit specifies that the counter should always be on the CPU if at all
724possible.
33a0ccb2 725It applies only to hardware counters and only to group leaders.
f2b1d720
MK
726If a pinned counter cannot be put onto the CPU (e.g., because there are
727not enough hardware counters or because of a conflict with some other
728event), then the counter goes into an 'error' state, where reads
729return end-of-file (i.e.,
730.BR read (2)
731returns 0) until the counter is subsequently enabled or disabled.
f2b1d720
MK
732.TP
733.IR "exclusive"
734The
735.I exclusive
736bit specifies that when this counter's group is on the CPU,
737it should be the only group using the CPU's counters.
738In the future this may allow monitoring programs to
739support PMU features that need to run alone so that they do not
740disrupt other hardware counters.
f2b1d720
MK
741.TP
742.IR "exclude_user"
ad73a2cc 743If this bit is set, the count excludes events that happen in user space.
f2b1d720
MK
744.TP
745.IR "exclude_kernel"
746If this bit is set, the count excludes events that happen in kernel-space.
f2b1d720
MK
747.TP
748.IR "exclude_hv"
749If this bit is set, the count excludes events that happen in the
750hypervisor.
751This is mainly for PMUs that have built-in support for handling this
752(such as POWER).
753Extra support is needed for handling hypervisor measurements on most
754machines.
f2b1d720
MK
755.TP
756.IR "exclude_idle"
757If set, don't count when the CPU is idle.
f2b1d720
MK
758.TP
759.IR "mmap"
760The
761.I mmap
762bit enables recording of exec mmap events.
f2b1d720
MK
763.TP
764.IR "comm"
765The
766.I comm
767bit enables tracking of process command name as modified by the
768.IR exec (2)
769and
770.IR prctl (PR_SET_NAME)
771system calls.
772Unfortunately for tools,
773there is no way to distinguish one system call versus the other.
f2b1d720
MK
774.TP
775.IR "freq"
776If this bit is set, then
777.I sample_freq
778not
779.I sample_period
780is used when setting up the sampling interval.
f2b1d720
MK
781.TP
782.IR "inherit_stat"
783This bit enables saving of event counts on context switch for
784inherited tasks.
33a0ccb2 785This is meaningful only if the
f2b1d720
MK
786.I inherit
787field is set.
f2b1d720
MK
788.TP
789.IR "enable_on_exec"
790If this bit is set, a counter is automatically
791enabled after a call to
792.BR exec (2).
f2b1d720
MK
793.TP
794.IR "task"
795If this bit is set, then
796fork/exit notifications are included in the ring buffer.
f2b1d720
MK
797.TP
798.IR "watermark"
799If set, have a sampling interrupt happen when we cross the
800.I wakeup_watermark
801boundary.
802Otherwise interrupts happen after
803.I wakeup_events
804samples.
f2b1d720
MK
805.TP
806.IR "precise_ip" " (Since Linux 2.6.35)"
807This controls the amount of skid.
808Skid is how many instructions
809execute between an event of interest happening and the kernel
810being able to stop and record the event.
811Smaller skid is
812better and allows more accurate reporting of which events
813correspond to which instructions, but hardware is often limited
814with how small this can be.
815
816The values of this are the following:
817.RS
818.TP
8190 -
820.B SAMPLE_IP
821can have arbitrary skid
822.TP
8231 -
824.B SAMPLE_IP
825must have constant skid
826.TP
8272 -
828.B SAMPLE_IP
829requested to have 0 skid
830.TP
8313 -
832.B SAMPLE_IP
833must have 0 skid.
834See also
835.BR PERF_RECORD_MISC_EXACT_IP .
836.RE
f2b1d720
MK
837.TP
838.IR "mmap_data" " (Since Linux 2.6.36)"
839The counterpart of the
840.I mmap
841field, but enables including data mmap events
842in the ring-buffer.
f2b1d720
MK
843.TP
844.IR "sample_id_all" " (Since Linux 2.6.38)"
845If set, then TID, TIME, ID, CPU, and STREAM_ID can
846additionally be included in
847.RB non- PERF_RECORD_SAMPLE s
848if the corresponding
849.I sample_type
850is selected.
f2b1d720
MK
851.TP
852.IR "exclude_host" " (Since Linux 3.2)"
853Do not measure time spent in VM host
f2b1d720
MK
854.TP
855.IR "exclude_guest" " (Since Linux 3.2)"
856Do not measure time spent in VM guest
f2b1d720
MK
857.TP
858.IR "exclude_callchain_kernel" " (Since Linux 3.7)"
859Do not include kernel callchains.
f2b1d720
MK
860.TP
861.IR "exclude_callchain_user" " (Since Linux 3.7)"
862Do not include user callchains.
f2b1d720
MK
863.TP
864.IR "wakeup_events" ", " "wakeup_watermark"
865This union sets how many samples
866.RI ( wakeup_events )
867or bytes
868.RI ( wakeup_watermark )
869happen before an overflow signal happens.
870Which one is used is selected by the
871.I watermark
872bitflag.
751c0f1a
VW
873
874.I wakeup_events
875only counts
876.B PERF_RECORD_SAMPLE
877record types.
878To receive a signal for every incoming
879.B PERF_RECORD
880type set
881.I wakeup_watermark
882to 1.
f2b1d720
MK
883.TP
884.IR "bp_type" " (Since Linux 2.6.33)"
885This chooses the breakpoint type.
886It is one of:
887.RS
888.TP
889.BR HW_BREAKPOINT_EMPTY
890no breakpoint
891.TP
892.BR HW_BREAKPOINT_R
893count when we read the memory location
894.TP
895.BR HW_BREAKPOINT_W
896count when we write the memory location
897.TP
898.BR HW_BREAKPOINT_RW
899count when we read or write the memory location
900.TP
901.BR HW_BREAKPOINT_X
902count when we execute code at the memory location
f2b1d720 903.LP
7db515ef 904The values can be combined via a bitwise or, but the
f2b1d720
MK
905combination of
906.B HW_BREAKPOINT_R
907or
908.B HW_BREAKPOINT_W
909with
910.B HW_BREAKPOINT_X
911is not allowed.
912.RE
f2b1d720
MK
913.TP
914.IR "bp_addr" " (Since Linux 2.6.33)"
915.I bp_addr
916is the address of the breakpoint.
917For execution breakpoints this is the memory address of the instruction
918of interest; for read and write breakpoints it is the memory address
919of the memory location of interest.
f2b1d720
MK
920.TP
921.IR "config1" " (Since Linux 2.6.39)"
922.I config1
923is used for setting events that need an extra register or otherwise
924do not fit in the regular config field.
925Raw OFFCORE_EVENTS on Nehalem/Westmere/SandyBridge use this field
926on 3.3 and later kernels.
f2b1d720
MK
927.TP
928.IR "bp_len" " (Since Linux 2.6.33)"
929.I bp_len
930is the length of the breakpoint being measured if
931.I type
932is
933.BR PERF_TYPE_BREAKPOINT .
934Options are
935.BR HW_BREAKPOINT_LEN_1 ,
936.BR HW_BREAKPOINT_LEN_2 ,
937.BR HW_BREAKPOINT_LEN_4 ,
938.BR HW_BREAKPOINT_LEN_8 .
939For an execution breakpoint, set this to
940.IR sizeof(long) .
f2b1d720
MK
941.TP
942.IR "config2" " (Since Linux 2.6.39)"
943
944.I config2
945is a further extension of the
946.I config1
947field.
f2b1d720
MK
948.TP
949.IR "branch_sample_type" " (Since Linux 3.4)"
950This is used with the CPU's hardware branch sampling, if available.
951It can have one of the following values:
952.RS
953.TP
954.B PERF_SAMPLE_BRANCH_USER
955Branch target is in user space
956.TP
957.B PERF_SAMPLE_BRANCH_KERNEL
958Branch target is in kernel space
959.TP
960.B PERF_SAMPLE_BRANCH_HV
961Branch target is in hypervisor
962.TP
963.B PERF_SAMPLE_BRANCH_ANY
964Any branch type.
965.TP
966.B PERF_SAMPLE_BRANCH_ANY_CALL
967Any call branch
968.TP
969.B PERF_SAMPLE_BRANCH_ANY_RETURN
970Any return branch
971.TP
972.BR PERF_SAMPLE_BRANCH_IND_CALL
973Indirect calls
974.TP
975.BR PERF_SAMPLE_BRANCH_PLM_ALL
976User, kernel, and hv
977.RE
f2b1d720
MK
978.TP
979.IR "sample_regs_user" " (Since Linux 3.7)"
980This defines the set of user registers to dump on samples.
7db515ef 981See
12eb3e64 982.\" FIXME: The following reference seems to be not quite right:
7db515ef 983.IR asm/perf_regs.h .
f2b1d720
MK
984.TP
985.IR "sample_stack_user" " (Since Linux 3.7)"
7ede2f66 986This defines the size of the user stack to dump on samples.
73d8cece 987.SS Reading results
f2b1d720 988Once a
7db515ef 989.BR perf_event_open ()
f2b1d720
MK
990file descriptor has been opened, the values
991of the events can be read from the file descriptor.
992The values that are there are specified by the
993.I read_format
7db515ef
MK
994field in the
995.I attr
996structure at open time.
f2b1d720
MK
997
998If you attempt to read into a buffer that is not big enough to hold the
999data,
1000.B ENOSPC
1001is returned.
1002
1003Here is the layout of the data returned by a read:
e525b89f 1004.IP * 2
f2b1d720
MK
1005If
1006.B PERF_FORMAT_GROUP
1007was specified to allow reading all events in a group at once:
1008
1009.in +4n
1010.nf
1011struct read_format {
e525b89f
MK
1012 u64 nr; /* The number of events */
1013 u64 time_enabled; /* if PERF_FORMAT_TOTAL_TIME_ENABLED */
1014 u64 time_running; /* if PERF_FORMAT_TOTAL_TIME_RUNNING */
50b2aa27 1015 struct {
e525b89f
MK
1016 u64 value; /* The value of the event */
1017 u64 id; /* if PERF_FORMAT_ID */
f2b1d720
MK
1018 } values[nr];
1019};
1020.fi
1021.in
e525b89f 1022.IP *
f2b1d720
MK
1023If
1024.B PERF_FORMAT_GROUP
1025was
1026.I not
e525b89f 1027specified:
f2b1d720
MK
1028
1029.in +4n
1030.nf
1031struct read_format {
1032 u64 value; /* The value of the event */
1033 u64 time_enabled; /* if PERF_FORMAT_TOTAL_TIME_ENABLED */
1034 u64 time_running; /* if PERF_FORMAT_TOTAL_TIME_RUNNING */
1035 u64 id; /* if PERF_FORMAT_ID */
1036};
1037.fi
1038.in
e525b89f
MK
1039.PP
1040The values read are as follows:
f2b1d720
MK
1041.TP
1042.I nr
1043The number of events in this file descriptor.
1044Only available if
1045.B PERF_FORMAT_GROUP
1046was specified.
f2b1d720
MK
1047.TP
1048.IR time_enabled ", " time_running
1049Total time the event was enabled and running.
1050Normally these are the same.
1051If more events are started
1052than available counter slots on the PMU, then multiplexing
33a0ccb2 1053happens and events run only part of the time.
f2b1d720
MK
1054In that case the
1055.I time_enabled
1056and
1057.I time_running
1058values can be used to scale an estimated value for the count.
f2b1d720
MK
1059.TP
1060.I value
1061An unsigned 64-bit value containing the counter result.
f2b1d720
MK
1062.TP
1063.I id
1064A globally unique value for this particular event, only there if
1065.B PERF_FORMAT_ID
e525b89f
MK
1066was specified in
1067.IR read_format .
73d8cece 1068.SS MMAP layout
f2b1d720 1069When using
7db515ef 1070.BR perf_event_open ()
f2b1d720
MK
1071in sampled mode, asynchronous events
1072(like counter overflow or
1073.B PROT_EXEC
1074mmap tracking)
1075are logged into a ring-buffer.
1076This ring-buffer is created and accessed through
1077.BR mmap (2).
1078
1079The mmap size should be 1+2^n pages, where the first page is a
1080metadata page
e525b89f 1081.RI ( "struct perf_event_mmap_page" )
f2b1d720
MK
1082that contains various
1083bits of information such as where the ring-buffer head is.
1084
1085Before kernel 2.6.39, there is a bug that means you must allocate a mmap
1086ring buffer when sampling even if you do not plan to access it.
1087
1088The structure of the first metadata mmap page is as follows:
1089
1090.in +4n
1091.nf
1092struct perf_event_mmap_page {
7db515ef 1093 __u32 version; /* version number of this structure */
f2b1d720 1094 __u32 compat_version; /* lowest version this is compat with */
7db515ef
MK
1095 __u32 lock; /* seqlock for synchronization */
1096 __u32 index; /* hardware counter identifier */
1097 __s64 offset; /* add to hardware counter value */
1098 __u64 time_enabled; /* time event active */
1099 __u64 time_running; /* time event on CPU */
f2b1d720
MK
1100 union {
1101 __u64 capabilities;
1102 __u64 cap_usr_time : 1,
1103 cap_usr_rdpmc : 1;
1104 };
1105 __u16 pmc_width;
1106 __u16 time_shift;
1107 __u32 time_mult;
1108 __u64 time_offset;
7db515ef 1109 __u64 __reserved[120]; /* Pad to 1k */
f2b1d720 1110 __u64 data_head; /* head in the data section */
7db515ef 1111 __u64 data_tail; /* user-space written tail */
f2b1d720
MK
1112};
1113.fi
1114.in
1115
f2b1d720
MK
1116The following looks at the fields in the
1117.I perf_event_mmap_page
e525b89f 1118structure in more detail:
f2b1d720
MK
1119.TP
1120.I version
1121Version number of this structure.
f2b1d720
MK
1122.TP
1123.I compat_version
1124The lowest version this is compatible with.
f2b1d720
MK
1125.TP
1126.I lock
1127A seqlock for synchronization.
f2b1d720
MK
1128.TP
1129.I index
1130A unique hardware counter identifier.
f2b1d720
MK
1131.TP
1132.I offset
1133.\" FIXME clarify
1134Add this to hardware counter value??
f2b1d720
MK
1135.TP
1136.I time_enabled
1137Time the event was active.
f2b1d720
MK
1138.TP
1139.I time_running
1140Time the event was running.
f2b1d720
MK
1141.TP
1142.I cap_usr_time
1143User time capability.
f2b1d720
MK
1144.TP
1145.I cap_usr_rdpmc
1146If the hardware supports user-space read of performance counters
1147without syscall (this is the "rdpmc" instruction on x86), then
1148the following code can be used to do a read:
1149
1150.in +4n
1151.nf
1152u32 seq, time_mult, time_shift, idx, width;
1153u64 count, enabled, running;
1154u64 cyc, time_offset;
1155s64 pmc = 0;
1156
1157do {
1158 seq = pc\->lock;
1159 barrier();
1160 enabled = pc\->time_enabled;
1161 running = pc\->time_running;
1162
1163 if (pc\->cap_usr_time && enabled != running) {
1164 cyc = rdtsc();
1165 time_offset = pc\->time_offset;
1166 time_mult = pc\->time_mult;
1167 time_shift = pc\->time_shift;
1168 }
1169
1170 idx = pc\->index;
1171 count = pc\->offset;
1172
1173 if (pc\->cap_usr_rdpmc && idx) {
1174 width = pc\->pmc_width;
1175 pmc = rdpmc(idx \- 1);
1176 }
1177
1178 barrier();
1179} while (pc\->lock != seq);
1180.fi
1181.in
f2b1d720
MK
1182.TP
1183.I pmc_width
1184If
1185.IR cap_usr_rdpmc ,
1186this field provides the bit-width of the value
1187read using the rdpmc or equivalent instruction.
1188This can be used to sign extend the result like:
1189
1190.in +4n
1191.nf
1192pmc <<= 64 \- pmc_width;
1193pmc >>= 64 \- pmc_width; // signed shift right
1194count += pmc;
1195.fi
1196.in
f2b1d720
MK
1197.TP
1198.IR time_shift ", " time_mult ", " time_offset
1199
1200If
1201.IR cap_usr_time ,
1202these fields can be used to compute the time
7db515ef 1203delta since time_enabled (in nanoseconds) using rdtsc or similar.
f2b1d720
MK
1204.nf
1205
1206 u64 quot, rem;
1207 u64 delta;
1208 quot = (cyc >> time_shift);
1209 rem = cyc & ((1 << time_shift) \- 1);
1210 delta = time_offset + quot * time_mult +
1211 ((rem * time_mult) >> time_shift);
1212.fi
1213
7db515ef
MK
1214Where
1215.IR time_offset ,
1216.IR time_mult ,
1217.IR time_shift ,
1218and
1219.IR cyc
1220are read in the
f2b1d720
MK
1221seqcount loop described above.
1222This delta can then be added to
1223enabled and possibly running (if idx), improving the scaling:
1224.nf
1225
1226 enabled += delta;
1227 if (idx)
1228 running += delta;
1229 quot = count / running;
1230 rem = count % running;
1231 count = quot * enabled + (rem * enabled) / running;
1232.fi
f2b1d720
MK
1233.TP
1234.I data_head
1235This points to the head of the data section.
7db515ef
MK
1236The value continuously increases; it does not wrap.
1237The value needs to be manually wrapped by the size of the mmap buffer
f2b1d720
MK
1238before accessing the samples.
1239
1240On SMP-capable platforms, after reading the data_head value,
ad73a2cc 1241user space should issue an rmb().
f2b1d720
MK
1242.TP
1243.I data_tail
1244When the mapping is
1245.BR PROT_WRITE ,
7db515ef
MK
1246the
1247.I data_tail
1248value should be written by user space to reflect the last read data.
f2b1d720 1249In this case the kernel will not over-write unread data.
e525b89f 1250.PP
f2b1d720
MK
1251The following 2^n ring-buffer pages have the layout described below.
1252
1253If
1254.I perf_event_attr.sample_id_all
1255is set, then all event types will
1256have the sample_type selected fields related to where/when (identity)
1257an event took place (TID, TIME, ID, CPU, STREAM_ID) described in
1258.B PERF_RECORD_SAMPLE
1259below, it will be stashed just after the
7db515ef
MK
1260.I perf_event_header
1261and the fields already present for the existing
f2b1d720
MK
1262fields, i.e., at the end of the payload.
1263That way a newer perf.data
1264file will be supported by older perf tools, with these new optional
1265fields being ignored.
1266
1267The mmap values start with a header:
1268
1269.in +4n
1270.nf
1271struct perf_event_header {
1272 __u32 type;
1273 __u16 misc;
1274 __u16 size;
1275};
1276.fi
1277.in
1278
1279Below, we describe the
1280.I perf_event_header
1281fields in more detail.
f2b1d720
MK
1282.TP
1283.I type
1284The
1285.I type
1286value is one of the below.
1287The values in the corresponding record (that follows the header)
1288depend on the
1289.I type
1290selected as shown.
f2b1d720 1291.RS
7db515ef 1292.TP 4
f2b1d720
MK
1293.B PERF_RECORD_MMAP
1294The MMAP events record the
1295.B PROT_EXEC
1296mappings so that we can correlate
ad73a2cc 1297user-space IPs to code.
f2b1d720
MK
1298They have the following structure:
1299
1300.in +4n
1301.nf
1302struct {
1303 struct perf_event_header header;
1304 u32 pid, tid;
1305 u64 addr;
1306 u64 len;
1307 u64 pgoff;
1308 char filename[];
1309};
1310.fi
1311.in
f2b1d720
MK
1312.TP
1313.B PERF_RECORD_LOST
1314This record indicates when events are lost.
1315
1316.in +4n
1317.nf
1318struct {
1319 struct perf_event_header header;
1320 u64 id;
1321 u64 lost;
1322};
1323.fi
1324.in
f2b1d720
MK
1325.RS
1326.TP
1327.I id
1328is the unique event ID for the samples that were lost.
1329.TP
1330.I lost
1331is the number of events that were lost.
1332.RE
f2b1d720
MK
1333.TP
1334.B PERF_RECORD_COMM
1335This record indicates a change in the process name.
1336
1337.in +4n
1338.nf
1339struct {
1340 struct perf_event_header header;
1341 u32 pid, tid;
1342 char comm[];
1343};
1344.fi
1345.in
f2b1d720
MK
1346.TP
1347.B PERF_RECORD_EXIT
1348This record indicates a process exit event.
1349
1350.in +4n
1351.nf
1352struct {
1353 struct perf_event_header header;
1354 u32 pid, ppid;
1355 u32 tid, ptid;
1356 u64 time;
1357};
1358.fi
1359.in
f2b1d720
MK
1360.TP
1361.BR PERF_RECORD_THROTTLE ", " PERF_RECORD_UNTHROTTLE
1362This record indicates a throttle/unthrottle event.
1363
1364.in +4n
1365.nf
1366struct {
1367 struct perf_event_header header;
1368 u64 time;
1369 u64 id;
1370 u64 stream_id;
1371};
1372.fi
1373.in
f2b1d720
MK
1374.TP
1375.B PERF_RECORD_FORK
1376This record indicates a fork event.
1377
1378.in +4n
1379.nf
1380struct {
1381 struct perf_event_header header;
1382 u32 pid, ppid;
1383 u32 tid, ptid;
1384 u64 time;
1385};
1386.fi
1387.in
f2b1d720
MK
1388.TP
1389.B PERF_RECORD_READ
1390This record indicates a read event.
1391
1392.in +4n
1393.nf
1394struct {
1395 struct perf_event_header header;
1396 u32 pid, tid;
1397 struct read_format values;
1398};
1399.fi
1400.in
f2b1d720
MK
1401.TP
1402.B PERF_RECORD_SAMPLE
1403This record indicates a sample.
1404
1405.in +4n
1406.nf
1407struct {
1408 struct perf_event_header header;
7db515ef
MK
1409 u64 ip; /* if PERF_SAMPLE_IP */
1410 u32 pid, tid; /* if PERF_SAMPLE_TID */
1411 u64 time; /* if PERF_SAMPLE_TIME */
1412 u64 addr; /* if PERF_SAMPLE_ADDR */
1413 u64 id; /* if PERF_SAMPLE_ID */
1414 u64 stream_id; /* if PERF_SAMPLE_STREAM_ID */
1415 u32 cpu, res; /* if PERF_SAMPLE_CPU */
1416 u64 period; /* if PERF_SAMPLE_PERIOD */
f2b1d720 1417 struct read_format v; /* if PERF_SAMPLE_READ */
7db515ef
MK
1418 u64 nr; /* if PERF_SAMPLE_CALLCHAIN */
1419 u64 ips[nr]; /* if PERF_SAMPLE_CALLCHAIN */
1420 u32 size; /* if PERF_SAMPLE_RAW */
1421 char data[size]; /* if PERF_SAMPLE_RAW */
1422 u64 bnr; /* if PERF_SAMPLE_BRANCH_STACK */
1423 struct perf_branch_entry lbr[bnr];
1424 /* if PERF_SAMPLE_BRANCH_STACK */
1425 u64 abi; /* if PERF_SAMPLE_REGS_USER */
1426 u64 regs[weight(mask)];
1427 /* if PERF_SAMPLE_REGS_USER */
1428 u64 size; /* if PERF_SAMPLE_STACK_USER */
1429 char data[size]; /* if PERF_SAMPLE_STACK_USER */
1430 u64 dyn_size; /* if PERF_SAMPLE_STACK_USER */
f2b1d720
MK
1431};
1432.fi
f2b1d720
MK
1433.RS
1434.TP
1435.I ip
7db515ef
MK
1436If
1437.B PERF_SAMPLE_IP
1438is enabled, then a 64-bit instruction
f2b1d720 1439pointer value is included.
f2b1d720 1440.TP
7db515ef
MK
1441.IR pid ", " tid
1442If
1443.B PERF_SAMPLE_TID
1444is enabled, then a 32-bit process ID
1445and 32-bit thread ID are included.
f2b1d720
MK
1446.TP
1447.I time
7db515ef
MK
1448If
1449.B PERF_SAMPLE_TIME
1450is enabled, then a 64-bit timestamp
f2b1d720
MK
1451is included.
1452This is obtained via local_clock() which is a hardware timestamp
1453if available and the jiffies value if not.
f2b1d720
MK
1454.TP
1455.I addr
7db515ef
MK
1456If
1457.B PERF_SAMPLE_ADDR
1458is enabled, then a 64-bit address is included.
f2b1d720
MK
1459This is usually the address of a tracepoint,
1460breakpoint, or software event; otherwise the value is 0.
f2b1d720
MK
1461.TP
1462.I id
7db515ef
MK
1463If
1464.B PERF_SAMPLE_ID
1465is enabled, a 64-bit unique ID is included.
f2b1d720 1466If the event is a member of an event group, the group leader ID is returned.
7db515ef
MK
1467This ID is the same as the one returned by
1468.BR PERF_FORMAT_ID .
f2b1d720
MK
1469.TP
1470.I stream_id
7db515ef
MK
1471If
1472.B PERF_SAMPLE_STREAM_ID
1473is enabled, a 64-bit unique ID is included.
f2b1d720
MK
1474Unlike
1475.B PERF_SAMPLE_ID
1476the actual ID is returned, not the group leader.
7db515ef
MK
1477This ID is the same as the one returned by
1478.BR PERF_FORMAT_ID .
f2b1d720 1479.TP
7db515ef
MK
1480.IR cpu ", " res
1481If
1482.B PERF_SAMPLE_CPU
1483is enabled, this is a 32-bit value indicating
f2b1d720
MK
1484which CPU was being used, in addition to a reserved (unused)
148532-bit value.
f2b1d720
MK
1486.TP
1487.I period
7db515ef
MK
1488If
1489.B PERF_SAMPLE_PERIOD
1490is enabled, a 64-bit value indicating
f2b1d720 1491the current sampling period is written.
f2b1d720
MK
1492.TP
1493.I v
7db515ef
MK
1494If
1495.B PERF_SAMPLE_READ
1496is enabled, a structure of type read_format
f2b1d720
MK
1497is included which has values for all events in the event group.
1498The values included depend on the
1499.I read_format
7db515ef
MK
1500value used at
1501.BR perf_event_open ()
1502time.
f2b1d720 1503.TP
7db515ef
MK
1504.IR nr ", " ips[nr]
1505If
1506.B PERF_SAMPLE_CALLCHAIN
1507is enabled, then a 64-bit number is included
f2b1d720 1508which indicates how many 64-bit instruction pointers will
7db515ef
MK
1509follow.
1510This is the current callchain.
f2b1d720 1511.TP
7ede2f66 1512.IR size ", " data[size]
7db515ef
MK
1513If
1514.B PERF_SAMPLE_RAW
1515is enabled, then a 32-bit value indicating size
f2b1d720
MK
1516is included followed by an array of 8-bit values of length size.
1517The values are padded with 0 to have 64-bit alignment.
1518
1519This RAW record data is opaque with respect to the ABI.
1520The ABI doesn't make any promises with respect to the stability
1521of its content; it may vary depending
1522on event, hardware, and kernel version.
f2b1d720 1523.TP
7db515ef
MK
1524.IR bnr ", " lbr[bnr]
1525If
1526.B PERF_SAMPLE_BRANCH_STACK
1527is enabled, then a 64-bit value indicating
1528the number of records is included, followed by
1529.I bnr
1530.I perf_branch_entry
1531structures.
1532These structures have from, to, and flags values indicating
f2b1d720 1533the from and to addresses from the branches on the callstack.
f2b1d720 1534.TP
7db515ef
MK
1535.IR abi ", " regs[weight(mask)]
1536If
1537.B PERF_SAMPLE_REGS_USER
1538is enabled, then
f2b1d720
MK
1539[to be documented].
1540
1541The
1542.I abi
1543field is one of
1544.BR PERF_SAMPLE_REGS_ABI_NONE ", " PERF_SAMPLE_REGS_ABI_32 " or "
7db515ef 1545.BR PERF_SAMPLE_REGS_ABI_64 .
f2b1d720 1546.TP
7db515ef
MK
1547.IR size ", " data[size] ", " dyn_size
1548If
1549.B PERF_SAMPLE_STACK_USER
1550is enabled, then
f2b1d720 1551[to be documented].
f2b1d720 1552.RE
f2b1d720 1553.RE
f2b1d720
MK
1554.TP
1555.I misc
1556The
1557.I misc
1558field contains additional information about the sample.
1559
1560The CPU mode can be determined from this value by masking with
1561.B PERF_RECORD_MISC_CPUMODE_MASK
1562and looking for one of the following (note these are not
f3ae1918 1563bit masks, only one can be set at a time):
f2b1d720
MK
1564.RS
1565.TP
1566.B PERF_RECORD_MISC_CPUMODE_UNKNOWN
1567Unknown CPU mode.
1568.TP
1569.B PERF_RECORD_MISC_KERNEL
1570Sample happened in the kernel.
1571.TP
1572.B PERF_RECORD_MISC_USER
1573Sample happened in user code.
1574.TP
1575.B PERF_RECORD_MISC_HYPERVISOR
1576Sample happened in the hypervisor.
1577.TP
1578.B PERF_RECORD_MISC_GUEST_KERNEL
1579Sample happened in the guest kernel.
1580.TP
1581.B PERF_RECORD_MISC_GUEST_USER
1582Sample happened in guest user code.
1583.RE
1584
7db515ef 1585In addition, one of the following bits can be set:
f2b1d720
MK
1586.RS
1587.TP
1588.B PERF_RECORD_MISC_EXACT_IP
1589This indicates that the content of
1590.B PERF_SAMPLE_IP
1591points
1592to the actual instruction that triggered the event.
1593See also
1594.IR perf_event_attr.precise_ip .
f2b1d720
MK
1595.TP
1596.B PERF_RECORD_MISC_EXT_RESERVED
1597This indicates there is extended data available (currently not used).
f2b1d720
MK
1598.TP
1599.I size
1600This indicates the size of the record.
f2b1d720 1601.RE
73d8cece 1602.SS Signal overflow
f2b1d720
MK
1603Events can be set to deliver a signal when a threshold is crossed.
1604The signal handler is set up using the
1605.BR poll (2),
1606.BR select (2),
1607.BR epoll (7)
1608and
1609.BR fcntl (2)
1610system calls.
1611
1612To generate signals, sampling must be enabled
1613.RI ( sample_period
1614must have a non-zero value).
1615
1616There are two ways to generate signals.
1617
1618The first is to set a
1619.I wakeup_events
1620or
1621.I wakeup_watermark
1622value that will generate a signal if a certain number of samples
1623or bytes have been written to the mmap ring buffer.
7db515ef
MK
1624In this case a signal of type
1625.B POLL_IN
1626is sent.
f2b1d720
MK
1627
1628The other way is by use of the
7db515ef 1629.B PERF_EVENT_IOC_REFRESH
f2b1d720
MK
1630ioctl.
1631This ioctl adds to a counter that decrements each time the event overflows.
7db515ef
MK
1632When non-zero, a
1633.B POLL_IN
1634signal is sent on overflow, but
1635once the value reaches 0, a signal is sent of type
1636.B POLL_HUP
1637and
f2b1d720
MK
1638the underlying event is disabled.
1639
1640Note: on newer kernels (definitely noticed with 3.2)
7db515ef 1641.\" FIXME(Vince) : Find out when this was introduced
f2b1d720
MK
1642a signal is provided for every overflow, even if
1643.I wakeup_events
1644is not set.
73d8cece 1645.SS rdpmc instruction
f2b1d720
MK
1646Starting with Linux 3.4 on x86, you can use the
1647.I rdpmc
1648instruction to get low-latency reads without having to enter the kernel.
1649Note that using
1650.I rdpmc
1651is not necessarily faster than other methods for reading event values.
1652
1653Support for this can be detected with the
1654.I cap_usr_rdpmc
1655field in the mmap page; documentation on how
1656to calculate event values can be found in that section.
73d8cece 1657.SS perf_event ioctl calls
f2b1d720
MK
1658.PP
1659Various ioctls act on
7db515ef 1660.BR perf_event_open ()
f2b1d720 1661file descriptors.
f2b1d720
MK
1662.TP
1663.B PERF_EVENT_IOC_ENABLE
36127c0e 1664Enables the individual event or event group specified by the
7db515ef 1665file descriptor argument.
f2b1d720
MK
1666
1667The ioctl argument is ignored.
f2b1d720
MK
1668.TP
1669.B PERF_EVENT_IOC_DISABLE
36127c0e 1670Disables the individual counter or event group specified by the
7db515ef 1671file descriptor argument.
f2b1d720
MK
1672
1673Enabling or disabling the leader of a group enables or disables the
1674entire group; that is, while the group leader is disabled, none of the
1675counters in the group will count.
33a0ccb2
MK
1676Enabling or disabling a member of a group other than the leader
1677affects only that counter; disabling a non-leader
f2b1d720
MK
1678stops that counter from counting but doesn't affect any other counter.
1679
1680The ioctl argument is ignored.
f2b1d720
MK
1681.TP
1682.B PERF_EVENT_IOC_REFRESH
1683Non-inherited overflow counters can use this
1684to enable a counter for a number of overflows specified by the argument,
1685after which it is disabled.
1686Subsequent calls of this ioctl add the argument value to the current
1687count.
7db515ef
MK
1688A signal with
1689.B POLL_IN
1690set will happen on each overflow until the
1691count reaches 0; when that happens a signal with
1692.B POLL_HUP
1693set is sent and the event is disabled.
f2b1d720 1694Using an argument of 0 is considered undefined behavior.
f2b1d720
MK
1695.TP
1696.B PERF_EVENT_IOC_RESET
36127c0e 1697Reset the event count specified by the
7db515ef 1698file descriptor argument to zero.
33a0ccb2 1699This resets only the counts; there is no way to reset the
f2b1d720
MK
1700multiplexing
1701.I time_enabled
1702or
1703.I time_running
1704values.
1705When sent to a group leader, only
1706the leader is reset (child events are not).
1707
1708The ioctl argument is ignored.
f2b1d720
MK
1709.TP
1710.B PERF_EVENT_IOC_PERIOD
1711This is the command to update the sampling period; it
1712does not change the current period, but instead takes effect from the next period.
1713
1714The argument is a pointer to a 64-bit value containing the
1715desired new period.
f2b1d720
MK
1716.TP
1717.B PERF_EVENT_IOC_SET_OUTPUT
1718This tells the kernel to report event notifications to the specified
1719file descriptor rather than the default one.
1720The file descriptors must all be on the same CPU.
1721
1722The argument specifies the desired file descriptor, or \-1 if
1723output should be ignored.
f2b1d720
MK
1724.TP
1725.BR PERF_EVENT_IOC_SET_FILTER " (Since Linux 2.6.33)"
1726This adds an ftrace filter to this event.
1727
1728The argument is a pointer to the desired ftrace filter.
73d8cece 1729.SS Using prctl
f2b1d720
MK
1730A process can enable or disable all the event groups that are
1731attached to it using the
1732.BR prctl (2)
1733.B PR_TASK_PERF_EVENTS_ENABLE
1734and
1735.B PR_TASK_PERF_EVENTS_DISABLE
1736operations.
1737This applies to all counters on the current process, whether created by
1738this process or by another, and does not affect any counters that this
1739process has created on other processes.
33a0ccb2 1740It enables or disables only
f2b1d720 1741the group leaders, not any other members in the groups.
f2b1d720 1742.SS perf_event related configuration files
7db515ef
MK
1743Files in
1744.I /proc/sys/kernel/
7db515ef 1745.RS 4
f2b1d720 1746.TP
7db515ef 1747.I /proc/sys/kernel/perf_event_paranoid
f2b1d720
MK
1748
1749The
1750.I perf_event_paranoid
1751file can be set to restrict access to the performance counters.
1752
7db515ef 17532 - only allow user-space measurements
f2b1d720
MK
1754
17551 - (default) allow both kernel and user measurements
1756
17570 - allow access to CPU-specific data but not raw tracepoint samples
1758
1759\-1 - no restrictions
1760
1761The existence of the
1762.I perf_event_paranoid
1763file is the official method for determining if a kernel supports
7db515ef 1764.BR perf_event_open ().
f2b1d720
MK
1765.TP
1766.I /proc/sys/kernel/perf_event_max_sample_rate
1767
7db515ef
MK
1768This sets the maximum sample rate.
1769Setting this too high can allow
f2b1d720 1770users to sample at a rate that impacts overall machine performance
7db515ef
MK
1771and potentially lock up the machine.
1772The default value is
f2b1d720 1773100000 (samples per second).
f2b1d720
MK
1774.TP
1775.I /proc/sys/kernel/perf_event_mlock_kb
1776
7db515ef 1777Maximum number of pages an unprivileged user can mlock(2).
f2b1d720
MK
1778The default is 516 (kB).
1779.RE
7db515ef
MK
1780Files in
1781.I /sys/bus/event_source/devices/
7db515ef 1782.RS 4
f2b1d720
MK
1783Since Linux 2.6.34 the kernel supports having multiple PMUs
1784available for monitoring.
1785Information on how to program these PMUs can be found under
1786.IR /sys/bus/event_source/devices/ .
1787Each subdirectory corresponds to a different PMU.
f2b1d720
MK
1788.TP
1789.I /sys/bus/event_source/devices/*/type
1790This contains an integer that can be used in the
1791.I type
1792field of perf_event_attr to indicate you wish to use this PMU.
f2b1d720
MK
1793.TP
1794.I /sys/bus/event_source/devices/*/rdpmc
1795[To be documented]
f2b1d720
MK
1796.TP
1797.I /sys/bus/event_source/devices/*/format/
1798This sub-directory contains information on what bits in the
1799.I config
1800field of perf_event_attr correspond to.
f2b1d720
MK
1801.TP
1802.I /sys/bus/event_source/devices/*/events/
1803This sub-directory contains files with pre-defined events.
1804The contents are strings describing the event settings
1805expressed in terms of the fields found in the
1806.I ./format/
1807directory.
1808These are not necessarily complete lists of all events supported by
1809a PMU, but usually a subset of events deemed useful or interesting.
f2b1d720
MK
1810.TP
1811.I /sys/bus/event_source/devices/*/uevent
1812[To be documented]
f2b1d720 1813.RE
47297adb 1814.SH RETURN VALUE
f2b1d720
MK
1815.BR perf_event_open ()
1816returns the new file descriptor, or \-1 if an error occurred
1817(in which case,
1818.I errno
1819is set appropriately).
1820.SH ERRORS
1821.TP
1822.B EINVAL
1823Returned if the specified event is not available.
1824.TP
1825.B ENOSPC
1826Prior to Linux 3.3, if there was not enough room for the event,
1827.B ENOSPC
1828was returned.
1829Linus did not like this, and this was changed to
1830.BR EINVAL .
1831.B ENOSPC
1832is still returned if you try to read results into
1833too small of a buffer.
f2b1d720 1834.SH VERSIONS
f2b1d720
MK
1835.BR perf_event_open ()
1836was introduced in Linux 2.6.31 but was called
1837.BR perf_counter_open ().
1838It was renamed in Linux 2.6.32.
f2b1d720 1839.SH CONFORMING TO
7db515ef
MK
1840This
1841.BR perf_event_open ()
1842system call is Linux-specific
f2b1d720 1843and should not be used in programs intended to be portable.
f2b1d720
MK
1844.SH NOTES
1845Glibc does not provide a wrapper for this system call; call it using
1846.BR syscall (2).
7db515ef 1847See the example below.
f2b1d720
MK
1848
1849The official way of knowing if
7db515ef 1850.BR perf_event_open ()
f2b1d720
MK
1851support is enabled is checking
1852for the existence of the file
7db515ef 1853.IR /proc/sys/kernel/perf_event_paranoid .
f2b1d720 1854.SH BUGS
f2b1d720
MK
1855The
1856.B F_SETOWN_EX
1857option to
7db515ef 1858.BR fcntl (2)
f2b1d720
MK
1859is needed to properly get overflow signals in threads.
1860This was introduced in Linux 2.6.32.
1861
1862Prior to Linux 2.6.33 (at least for x86) the kernel did not check
1863if events could be scheduled together until read time.
1864The same happens on all known kernels if the NMI watchdog is enabled.
1865This means to see if a given set of events works you have to
1866.BR perf_event_open (),
1867start, then read before you know for sure you
1868can get valid measurements.
1869
1870Prior to Linux 2.6.34 event constraints were not enforced by the kernel.
1871In that case, some events would silently return "0" if the kernel
1872scheduled them in an improper counter slot.
1873
1874Prior to Linux 2.6.34 there was a bug when multiplexing where the
1875wrong results could be returned.
1876
1877Kernels from Linux 2.6.35 to Linux 2.6.39 can quickly crash the kernel if
1878"inherit" is enabled and many threads are started.
1879
1880Prior to Linux 2.6.35,
1881.B PERF_FORMAT_GROUP
1882did not work with attached processes.
1883
1884In older Linux 2.6 versions,
1885refreshing an event group leader refreshed all siblings,
1886and refreshing with a parameter of 0 enabled infinite refresh.
1887This behavior is unsupported and should not be relied on.
1888
1889There is a bug in the kernel code between
1890Linux 2.6.36 and Linux 3.0 that ignores the
1891"watermark" field and acts as if a wakeup_event
1892was chosen if the union has a
1893non-zero value in it.
1894
7db515ef
MK
1895Always double-check your results!
1896Various generalized events have had wrong values.
f2b1d720
MK
1897For example, retired branches measured
1898the wrong thing on AMD machines until Linux 2.6.35.
f2b1d720
MK
1899.SH EXAMPLE
1900The following is a short example that measures the total
7db515ef
MK
1901instruction count of a call to
1902.BR printf (3).
f2b1d720
MK
1903.nf
1904
1905#include <stdlib.h>
1906#include <stdio.h>
1907#include <unistd.h>
1908#include <string.h>
1909#include <sys/ioctl.h>
1910#include <linux/perf_event.h>
1911#include <asm/unistd.h>
1912
7db515ef
MK
1913long
1914perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
1915 int cpu, int group_fd, unsigned long flags)
f2b1d720
MK
1916{
1917 int ret;
1918
7db515ef
MK
1919 ret = syscall(__NR_perf_event_open, hw_event, pid, cpu,
1920 group_fd, flags);
f2b1d720
MK
1921 return ret;
1922}
1923
f2b1d720
MK
1924int
1925main(int argc, char **argv)
1926{
f2b1d720
MK
1927 struct perf_event_attr pe;
1928 long long count;
1929 int fd;
1930
1931 memset(&pe, 0, sizeof(struct perf_event_attr));
1932 pe.type = PERF_TYPE_HARDWARE;
1933 pe.size = sizeof(struct perf_event_attr);
1934 pe.config = PERF_COUNT_HW_INSTRUCTIONS;
1935 pe.disabled = 1;
1936 pe.exclude_kernel = 1;
1937 pe.exclude_hv = 1;
1938
1939 fd = perf_event_open(&pe, 0, \-1, \-1, 0);
7db515ef 1940 if (fd == \-1) {
f2b1d720 1941 fprintf(stderr, "Error opening leader %llx\\n", pe.config);
7db515ef 1942 exit(EXIT_FAILURE);
f2b1d720
MK
1943 }
1944
1945 ioctl(fd, PERF_EVENT_IOC_RESET, 0);
1946 ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
1947
1948 printf("Measuring instruction count for this printf\\n");
1949
1950 ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
1951 read(fd, &count, sizeof(long long));
1952
1953 printf("Used %lld instructions\\n", count);
1954
1955 close(fd);
1956}
1957.fi
47297adb 1958.SH SEE ALSO
f2b1d720
MK
1959.BR fcntl (2),
1960.BR mmap (2),
1961.BR open (2),
1962.BR prctl (2),
1963.BR read (2)