]> git.ipfire.org Git - thirdparty/man-pages.git/blame - man2/perf_event_open.2
random.4: spfix
[thirdparty/man-pages.git] / man2 / perf_event_open.2
CommitLineData
f2b1d720
MK
1.\" Hey Emacs! This file is -*- nroff -*- source.
2.\"
3.\" Copyright (c) 2012, Vincent Weaver
4.\"
5.\" This is free documentation; you can redistribute it and/or
6.\" modify it under the terms of the GNU General Public License as
7.\" published by the Free Software Foundation; either version 2 of
8.\" the License, or (at your option) any later version.
9.\"
10.\" The GNU General Public License's references to "object code"
11.\" and "executables" are to be interpreted as the output of any
12.\" document formatting or typesetting system, including
13.\" intermediate and printed output.
14.\"
15.\" This manual is distributed in the hope that it will be useful,
16.\" but WITHOUT ANY WARRANTY; without even the implied warranty of
17.\" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18.\" GNU General Public License for more details.
19.\"
20.\" You should have received a copy of the GNU General Public
21.\" License along with this manual; if not, see
22.\" <http://www.gnu.org/licenses/>.
23.\"
24.\" This document is based on the perf_event.h header file, the
25.\" tools/perf/design.txt file, and a lot of bitter experience.
26.\"
27.TH PERF_EVENT_OPEN 2 2013-02-04 "Linux" "Linux Programmer's Manual"
28.SH NAME
29perf_event_open \- set up performance monitoring
30.SH SYNOPSIS
31.nf
32.B #include <linux/perf_event.h>
33.B #include <linux/hw_breakpoint.h>
34.sp
35.BI "int perf_event_open(struct perf_event_attr *" attr ,
36.BI " pid_t " pid ", int " cpu ", int " group_fd ,
37.BI " unsigned long " flags );
38.fi
39
40.IR Note :
41There is no glibc wrapper for this system call; see NOTES.
42.SH DESCRIPTION
43Given a list of parameters,
44.BR perf_event_open ()
45returns a file descriptor, for use in subsequent system calls
46.RB ( read "(2), " mmap "(2), " prctl "(2), " fcntl "(2), etc.)."
47.PP
48A call to
49.BR perf_event_open ()
50creates a file descriptor that allows measuring performance
51information.
52Each file descriptor corresponds to one
53event that is measured; these can be grouped together
54to measure multiple events simultaneously.
55.PP
56Events can be enabled and disabled in two ways: via
57.BR ioctl (2)
58and via
59.BR prctl (2) .
60When an event is disabled it does not count or generate overflows but does
61continue to exist and maintain its count value.
62.PP
63Events come in two flavors: counting and sampled.
64A
65.I counting
66event is one that is used for counting the aggregate number of events
67that occur.
68In general, counting event results are gathered with a
69.BR read (2)
70call.
71A
72.I sampling
73event periodically writes measurements to a buffer that can then
74be accessed via
75.BR mmap (2) .
76.SS Arguments
77.P
78The argument
79.I pid
80allows events to be attached to processes in various ways.
81If
82.I pid
83is 0, measurements happen on the current thread, if
84.I pid
85is greater than 0, the process indicated by
86.I pid
87is measured, and if
88.I pid
89is \-1, all processes are counted.
90
91The
92.I cpu
93argument allows measurements to be specific to a CPU.
94If
95.I cpu
96is greater than or equal to 0,
97measurements are restricted to the specified CPU;
98if
99.I cpu
100is \-1, the events are measured on all CPUs.
101.P
102Note that the combination of
103.IR pid " == \-1"
104and
105.IR cpu " == \-1"
106is not valid.
107.P
108A
109.IR pid " > 0"
110and
111.IR cpu " == \-1"
112setting measures per-process and follows that process to whatever CPU the
113process gets scheduled to.
114Per-process events can be created by any user.
115.P
116A
117.IR pid " == \-1"
118and
119.IR cpu " >= 0"
120setting is per-CPU and measures all processes on the specified CPU.
121Per-CPU events need the
122.B CAP_SYS_ADMIN
123capability or a
124.I /proc/sys/kernel/perf_event_paranoid
125value of less than 1.
126.P
127The
128.I group_fd
129argument allows event groups to be created.
130An event group has one event which is the group leader.
131The leader is created first, with
132.IR group_fd " = \-1."
133The rest of the group members are created with subsequent
134.BR perf_event_open ()
135calls with
136.IR group_fd
137being set to the fd of the group leader.
138(A single event on its own is created with
139.IR group_fd " = \-1"
140and is considered to be a group with only 1 member.)
141An event group is scheduled onto the CPU as a unit: it will only
142be put onto the CPU if all of the events in the group can be put onto
143the CPU.
144This means that the values of the member events can be
145meaningfully compared, added, divided (to get ratios), etc., with each
146other, since they have counted events for the same set of executed
147instructions.
148.P
149The
150.I flags
151argument is formed by ORing together zero or more of the following values:
152.TP
153.BR PERF_FLAG_FD_NO_GROUP
154.\" FIXME The following sentence is unclear
155This flag allows creating an event as part of an event group but
156having no group leader.
157It is unclear why this is useful.
158.\" FIXME So, why is it useful?
159.TP
160.BR PERF_FLAG_FD_OUTPUT
161This flag re-routes the output from an event to the group leader.
162.TP
163.BR PERF_FLAG_PID_CGROUP " (Since Linux 2.6.39)."
164This flag activates per-container system-wide monitoring.
165A container
166is an abstraction that isolates a set of resources for finer grain
167control (CPUs, memory, etc...).
168In this mode, the event is measured
169only if the thread running on the monitored CPU belongs to the designated
170container (cgroup).
171The cgroup is identified by passing a file descriptor
172opened on its directory in the cgroupfs filesystem.
173For instance, if the
174cgroup to monitor is called
175.IR test ,
176then a file descriptor opened on
177.I /dev/cgroup/test
178(assuming cgroupfs is mounted on
179.IR /dev/cgroup )
180must be passed as the
181.I pid
182parameter.
183cgroup monitoring is only available
184for system-wide events and may therefore require extra permissions.
185.P
186The
187.I perf_event_attr
188structure provides detailed configuration information
189for the event being created.
190
191.in +4n
192.nf
193struct perf_event_attr {
194 __u32 type; /* Type of event */
195 __u32 size; /* Size of attribute structure */
196 __u64 config; /* Type-specific configuration */
197
198 union {
199 __u64 sample_period; /* Period of sampling */
200 __u64 sample_freq; /* Frequency of sampling */
201 };
202
203 __u64 sample_type; /* Specifies values included in sample */
204 __u64 read_format; /* Specifies values returned in read */
205
7db515ef
MK
206 __u64 disabled : 1, /* off by default */
207 inherit : 1, /* children inherit it */
208 pinned : 1, /* must always be on PMU */
209 exclusive : 1, /* only group on PMU */
210 exclude_user : 1, /* don't count user */
211 exclude_kernel : 1, /* don't count kernel */
f2b1d720 212 exclude_hv : 1, /* don't count hypervisor */
7db515ef
MK
213 exclude_idle : 1, /* don't count when idle */
214 mmap : 1, /* include mmap data */
215 comm : 1, /* include comm data */
216 freq : 1, /* use freq, not period */
217 inherit_stat : 1, /* per task counts */
218 enable_on_exec : 1, /* next exec enables */
219 task : 1, /* trace fork/exit */
220 watermark : 1, /* wakeup_watermark */
221 precise_ip : 2, /* skid constraint */
222 mmap_data : 1, /* non-exec mmap data */
f2b1d720 223 sample_id_all : 1, /* sample_type all events */
7db515ef
MK
224 exclude_host : 1, /* don't count in host */
225 exclude_guest : 1, /* don't count in guest */
226 exclude_callchain_kernel : 1,
227 /* exclude kernel callchains */
228 exclude_callchain_user : 1,
229 /* exclude user callchains */
f2b1d720
MK
230 __reserved_1 : 41;
231
232 union {
233 __u32 wakeup_events; /* wakeup every n events */
7db515ef 234 __u32 wakeup_watermark; /* bytes before wakeup */
f2b1d720
MK
235 };
236
237 __u32 bp_type; /* breakpoint type */
238
239 union {
240 __u64 bp_addr; /* breakpoint address */
241 __u64 config1; /* extension of config */
242 };
243
244 union {
245 __u64 bp_len; /* breakpoint length */
246 __u64 config2; /* extension of config1 */
247 };
7db515ef
MK
248 __u64 branch_sample_type; /* enum perf_branch_sample_type */
249 __u64 sample_regs_user; /* user regs to dump on samples */
250 __u32 sample_stack_user; /* size of stack to dump on
251 samples */
252 __u32 __reserved_2; /* Align to u64 */
f2b1d720
MK
253
254};
255.fi
256.in
257
258The fields of the
259.I perf_event_attr
260structure are described in more detail below:
f2b1d720
MK
261.TP
262.I type
263This field specifies the overall event type.
264It has one of the following values:
265.RS
266.TP
267.B PERF_TYPE_HARDWARE
268This indicates one of the "generalized" hardware events provided
269by the kernel.
270See the
271.I config
272field definition for more details.
273.TP
274.B PERF_TYPE_SOFTWARE
275This indicates one of the software-defined events provided by the kernel
276(even if no hardware support is available).
277.TP
278.B PERF_TYPE_TRACEPOINT
279This indicates a tracepoint
280provided by the kernel tracepoint infrastructure.
281.TP
282.B PERF_TYPE_HW_CACHE
283This indicates a hardware cache event.
284This has a special encoding, described in the
285.I config
286field definition.
287.TP
288.B PERF_TYPE_RAW
289This indicates a "raw" implementation-specific event in the
290.IR config " field."
291.TP
292.BR PERF_TYPE_BREAKPOINT " (Since Linux 2.6.33)"
293This indicates a hardware breakpoint as provided by the CPU.
294Breakpoints can be read/write accesses to an address as well as
295execution of an instruction address.
296.TP
297.RB "dynamic PMU"
298Since Linux 2.6.39,
7db515ef 299.BR perf_event_open ()
f2b1d720
MK
300can support multiple PMUs.
301To enable this, a value exported by the kernel can be used in the
302.I type
303field to indicate which PMU to use.
304The value to use can be found in the sysfs filesystem:
305there is a subdirectory per PMU instance under
306.IR /sys/bus/event_source/devices .
307In each sub-directory there is a
308.I type
309file whose content is an integer that can be used in the
310.I type
311field.
312For instance,
313.I /sys/bus/event_source/devices/cpu/type
314contains the value for the core CPU PMU, which is usually 4.
315.RE
f2b1d720
MK
316.TP
317.I "size"
318The size of the
319.I perf_event_attr
320structure for forward/backward compatibility.
321Set this using
322.I sizeof(struct perf_event_attr)
323to allow the kernel to see
324the struct size at the time of compilation.
325
326The related define
327.B PERF_ATTR_SIZE_VER0
328is set to 64; this was the size of the first published struct.
329.B PERF_ATTR_SIZE_VER1
330is 72, corresponding to the addition of breakpoints in Linux 2.6.33.
331.B PERF_ATTR_SIZE_VER2
332is 80 corresponding to the addition of branch sampling in Linux 3.4.
333.B PERF_ATTR_SIZE_VER3
334is 96 corresponding to the addition
335of sample_regs_user and sample_stack_user in Linux 3.7.
336
337.TP
338.I "config"
339This specifies which event you want, in conjunction with
340the
341.I type
342field.
343The
344.IR config1 " and " config2
345fields are also taken into account in cases where 64 bits is not
346enough to fully specify the event.
347The encoding of these fields is event dependent.
348
349The most significant bit (bit 63) of
350.I config
351signifies CPU-specific (raw) counter configuration data;
352if the most significant bit is unset, the next 7 bits are an event
353type and the rest of the bits are the event identifier.
354
355There are various ways to set the
356.I config
357field that are dependent on the value of the previously
358described
359.I type
360field.
361What follows are various possible settings for
362.I config
363separated out by
364.IR type .
365
366If
367.I type
368is
369.BR PERF_TYPE_HARDWARE ,
370we are measuring one of the generalized hardware CPU events.
371Not all of these are available on all platforms.
372Set
373.I config
374to one of the following:
375.RS 12
376.TP
377.B PERF_COUNT_HW_CPU_CYCLES
378Total cycles.
379Be wary of what happens during CPU frequency scaling.
380.TP
381.B PERF_COUNT_HW_INSTRUCTIONS
382Retired instructions.
383Be careful, these can be affected by various
384issues, most notably hardware interrupt counts.
385.TP
386.B PERF_COUNT_HW_CACHE_REFERENCES
387Cache accesses.
388Usually this indicates Last Level Cache accesses but this may
389vary depending on your CPU.
390This may include prefetches and coherency messages; again this
391depends on the design of your CPU.
392.TP
393.B PERF_COUNT_HW_CACHE_MISSES
394Cache misses.
395Usually this indicates Last Level Cache misses; this is intended to be
396used in conjunction with the
397.B PERF_COUNT_HW_CACHE_REFERENCES
398event to calculate cache miss rates.
399.TP
400.B PERF_COUNT_HW_BRANCH_INSTRUCTIONS
401Retired branch instructions.
402Prior to Linux 2.6.34, this used
403the wrong event on AMD processors.
404.TP
405.B PERF_COUNT_HW_BRANCH_MISSES
406Mispredicted branch instructions.
407.TP
408.B PERF_COUNT_HW_BUS_CYCLES
409Bus cycles, which can be different from total cycles.
410.TP
411.BR PERF_COUNT_HW_STALLED_CYCLES_FRONTEND " (Since Linux 3.0)"
412Stalled cycles during issue.
413.TP
414.BR PERF_COUNT_HW_STALLED_CYCLES_BACKEND " (Since Linux 3.0)"
415Stalled cycles during retirement.
416.TP
417.BR PERF_COUNT_HW_REF_CPU_CYCLES " (Since Linux 3.3)"
418Total cycles; not affected by CPU frequency scaling.
419.RE
420.IP
421If
422.I type
423is
424.BR PERF_TYPE_SOFTWARE ,
425we are measuring software events provided by the kernel.
426Set
427.I config
428to one of the following:
429.RS 12
430.TP
431.B PERF_COUNT_SW_CPU_CLOCK
432This reports the CPU clock, a high-resolution per-CPU timer.
433.TP
434.B PERF_COUNT_SW_TASK_CLOCK
435This reports a clock count specific to the task that is running.
436.TP
437.B PERF_COUNT_SW_PAGE_FAULTS
438This reports the number of page faults.
439.TP
440.B PERF_COUNT_SW_CONTEXT_SWITCHES
441This counts context switches.
442Until Linux 2.6.34, these were all reported as user-space
443events, after that they are reported as happening in the kernel.
444.TP
445.B PERF_COUNT_SW_CPU_MIGRATIONS
446This reports the number of times the process
447has migrated to a new CPU.
448.TP
449.B PERF_COUNT_SW_PAGE_FAULTS_MIN
450This counts the number of minor page faults.
451These did not require disk I/O to handle.
452.TP
453.B PERF_COUNT_SW_PAGE_FAULTS_MAJ
454This counts the number of major page faults.
455These required disk I/O to handle.
456.TP
457.BR PERF_COUNT_SW_ALIGNMENT_FAULTS " (Since Linux 2.6.33)"
458This counts the number of alignment faults.
459These happen when unaligned memory accesses happen; the kernel
460can handle these but it reduces performance.
461This only happens on some architectures (never on x86).
462.TP
463.BR PERF_COUNT_SW_EMULATION_FAULTS " (Since Linux 2.6.33)"
464This counts the number of emulation faults.
465The kernel sometimes traps on unimplemented instructions
7db515ef 466and emulates them for user space.
f2b1d720
MK
467This can negatively impact performance.
468.RE
469.RE
470
f2b1d720
MK
471.RS
472If
473.I type
474is
475.BR PERF_TYPE_TRACEPOINT ,
476then we are measuring kernel tracepoints.
477The value to use in
478.I config
479can be obtained from under debugfs
480.I tracing/events/*/*/id
481if ftrace is enabled in the kernel.
482
483.RE
484
485.RS
486If
487.I type
488is
489.BR PERF_TYPE_HW_CACHE ,
490then we are measuring a hardware CPU cache event.
491To calculate the appropriate
492.I config
493value use the following equation:
494.RS 4
495.nf
496
497 (perf_hw_cache_id) | (perf_hw_cache_op_id << 8) |
498 (perf_hw_cache_op_result_id << 16)
499.fi
500.P
501where
502.I perf_hw_cache_id
503is one of:
7db515ef 504.RS 4
f2b1d720
MK
505.TP
506.B PERF_COUNT_HW_CACHE_L1D
507for measuring Level 1 Data Cache
508.TP
509.B PERF_COUNT_HW_CACHE_L1I
510for measuring Level 1 Instruction Cache
511.TP
512.B PERF_COUNT_HW_CACHE_LL
513for measuring Last-Level Cache
514.TP
515.B PERF_COUNT_HW_CACHE_DTLB
516for measuring the Data TLB
517.TP
518.B PERF_COUNT_HW_CACHE_ITLB
519for measuring the Instruction TLB
520.TP
521.B PERF_COUNT_HW_CACHE_BPU
522for measuring the branch prediction unit
523.TP
524.BR PERF_COUNT_HW_CACHE_NODE " (Since Linux 3.0)"
525for measuring local memory accesses
526.RE
527
528.P
529and
530.I perf_hw_cache_op_id
531is one of
7db515ef 532.RS 4
f2b1d720
MK
533.TP
534.B PERF_COUNT_HW_CACHE_OP_READ
535for read accesses
536.TP
537.B PERF_COUNT_HW_CACHE_OP_WRITE
538for write accesses
539.TP
540.B PERF_COUNT_HW_CACHE_OP_PREFETCH
541for prefetch accesses
542.RE
543
544.P
545and
546.I perf_hw_cache_op_result_id
547is one of
7db515ef 548.RS 4
f2b1d720
MK
549.TP
550.B PERF_COUNT_HW_CACHE_RESULT_ACCESS
551to measure accesses
552.TP
553.B PERF_COUNT_HW_CACHE_RESULT_MISS
554to measure misses
555.RE
556.RE
557
558If
559.I type
560is
561.BR PERF_TYPE_RAW ,
562then a custom "raw"
563.I config
564value is needed.
565Most CPUs support events that are not covered by the "generalized" events.
566These are implementation defined; see your CPU manual (for example
567the Intel Volume 3B documentation or the AMD BIOS and Kernel Developer
568Guide).
569The libpfm4 library can be used to translate from the name in the
570architectural manuals to the raw hex value
571.BR perf_event_open ()
572expects in this field.
573
574If
575.I type
576is
577.BR PERF_TYPE_BREAKPOINT ,
578then leave
579.I config
580set to zero.
581Its parameters are set in other places.
582.RE
583.TP
584.IR sample_period ", " sample_freq
585A "sampling" counter is one that generates an interrupt
586every N events, where N is given by
587.IR sample_period .
588A sampling counter has
589.IR sample_period " > 0."
590When an overflow interrupt occurs, requested data is recorded
591in the mmap buffer.
592The
593.I sample_type
594field controls what data is recorded on each interrupt.
595
596.I sample_freq
597can be used if you wish to use frequency rather than period.
598In this case you set the
599.I freq
600flag.
601The kernel will adjust the sampling period
602to try and achieve the desired rate.
603The rate of adjustment is a
604timer tick.
605
606
607.TP
608.I "sample_type"
609The various bits in this field specify which values to include
610in the sample.
611They will be recorded in a ring-buffer,
612which is available to user-space using
613.BR mmap (2).
614The order in which the values are saved in the
615sample is documented in the MMAP Layout subsection below;
616it is not the
617.I "enum perf_event_sample_format"
618order.
619.RS
620.TP
621.B PERF_SAMPLE_IP
622Records instruction pointer.
623.TP
624.B PERF_SAMPLE_TID
7db515ef 625Records the process and thread IDs.
f2b1d720
MK
626.TP
627.B PERF_SAMPLE_TIME
628Records a timestamp.
629.TP
630.B PERF_SAMPLE_ADDR
631Records an address, if applicable.
632.TP
633.B PERF_SAMPLE_READ
634Record counter values for all events in a group, not just the group leader.
635.TP
636.B PERF_SAMPLE_CALLCHAIN
637Records the callchain (stack backtrace).
638.TP
639.B PERF_SAMPLE_ID
640Records a unique ID for the opened event's group leader.
641.TP
642.B PERF_SAMPLE_CPU
643Records CPU number.
644.TP
645.B PERF_SAMPLE_PERIOD
646Records the current sampling period.
647.TP
648.B PERF_SAMPLE_STREAM_ID
649Records a unique ID for the opened event.
650Unlike
651.B PERF_SAMPLE_ID
652the actual ID is returned, not the group leader.
653This ID is the same as the one returned by PERF_FORMAT_ID.
654.TP
655.B PERF_SAMPLE_RAW
656Records additional data, if applicable.
657Usually returned by tracepoint events.
658.TP
659.BR PERF_SAMPLE_BRANCH_STACK " (Since Linux 3.4)"
7db515ef
MK
660Records the branch stack.
661See branch_sample_type.
f2b1d720
MK
662.TP
663.BR PERF_SAMPLE_REGS_USER " (Since Linux 3.7)"
664Records the current register state.
665.TP
666.BR PERF_SAMPLE_STACK_USER " (Since Linux 3.7)"
667[To be documented]
668.RE
669
670.TP
671.IR "read_format"
672This field specifies the format of the data returned by
673.BR read (2)
674on a
7db515ef 675.BR perf_event_open ()
f2b1d720
MK
676file descriptor.
677.RS
678.TP
679.B PERF_FORMAT_TOTAL_TIME_ENABLED
680Adds the 64-bit "time_enabled" field.
681This can be used to calculate estimated totals if
682the PMU is overcommitted and multiplexing is happening.
683.TP
684.B PERF_FORMAT_TOTAL_TIME_RUNNING
685Adds the 64-bit "time_running" field.
686This can be used to calculate estimated totals if
687the PMU is overcommitted and multiplexing is happening.
688.TP
689.B PERF_FORMAT_ID
690Adds a 64-bit unique value that corresponds to the event group.
691.TP
692.B PERF_FORMAT_GROUP
693Allows all counter values in an event group to be read with one read.
694.RE
695
696.TP
697.IR "disabled"
698The
699.I disabled
700bit specifies whether the counter starts out disabled or enabled.
701If disabled, the event can later be enabled by
702.BR ioctl (2),
703.BR prctl (2),
704or
705.IR enable_on_exec .
706
707.TP
708.IR "inherit"
709The
710.I inherit
711bit specifies that this counter should count events of child
712tasks as well as the task specified.
713This only applies to new children, not to any existing children at
714the time the counter is created (nor to any new children of
715existing children).
716
717Inherit does not work for some combinations of
718.IR read_format s,
719such as
720.BR PERF_FORMAT_GROUP .
721
722.TP
723.IR "pinned"
724The
725.I pinned
726bit specifies that the counter should always be on the CPU if at all
727possible.
728It only applies to hardware counters and only to group leaders.
729If a pinned counter cannot be put onto the CPU (e.g., because there are
730not enough hardware counters or because of a conflict with some other
731event), then the counter goes into an 'error' state, where reads
732return end-of-file (i.e.,
733.BR read (2)
734returns 0) until the counter is subsequently enabled or disabled.
735
736.TP
737.IR "exclusive"
738The
739.I exclusive
740bit specifies that when this counter's group is on the CPU,
741it should be the only group using the CPU's counters.
742In the future this may allow monitoring programs to
743support PMU features that need to run alone so that they do not
744disrupt other hardware counters.
745
746.TP
747.IR "exclude_user"
748If this bit is set, the count excludes events that happen in user-space.
749
750.TP
751.IR "exclude_kernel"
752If this bit is set, the count excludes events that happen in kernel-space.
753
754.TP
755.IR "exclude_hv"
756If this bit is set, the count excludes events that happen in the
757hypervisor.
758This is mainly for PMUs that have built-in support for handling this
759(such as POWER).
760Extra support is needed for handling hypervisor measurements on most
761machines.
762
763.TP
764.IR "exclude_idle"
765If set, don't count when the CPU is idle.
766
767.TP
768.IR "mmap"
769The
770.I mmap
771bit enables recording of exec mmap events.
772
773.TP
774.IR "comm"
775The
776.I comm
777bit enables tracking of process command name as modified by the
778.BR exec (2)
779and
780.IR prctl (PR_SET_NAME)
781system calls.
782Unfortunately for tools,
783there is no way to distinguish one system call versus the other.
784
785.TP
786.IR "freq"
787If this bit is set, then
788.I sample_freq
789not
790.I sample_period
791is used when setting up the sampling interval.
792
793.TP
794.IR "inherit_stat"
795This bit enables saving of event counts on context switch for
796inherited tasks.
797This is only meaningful if the
798.I inherit
799field is set.
800
801.TP
802.IR "enable_on_exec"
803If this bit is set, a counter is automatically
804enabled after a call to
805.BR exec (2).
806
807.TP
808.IR "task"
809If this bit is set, then
810fork/exit notifications are included in the ring buffer.
811
812.TP
813.IR "watermark"
814If set, have a sampling interrupt happen when we cross the
815.I wakeup_watermark
816boundary.
817Otherwise interrupts happen after
818.I wakeup_events
819samples.
820
821.TP
822.IR "precise_ip" " (Since Linux 2.6.35)"
823This controls the amount of skid.
824Skid is how many instructions
825execute between an event of interest happening and the kernel
826being able to stop and record the event.
827Smaller skid is
828better and allows more accurate reporting of which events
829correspond to which instructions, but hardware is often limited
830with how small this can be.
831
832The values of this are the following:
833.RS
834.TP
8350 -
836.B SAMPLE_IP
837can have arbitrary skid
838.TP
8391 -
840.B SAMPLE_IP
841must have constant skid
842.TP
8432 -
844.B SAMPLE_IP
845requested to have 0 skid
846.TP
8473 -
848.B SAMPLE_IP
849must have 0 skid.
850See also
851.BR PERF_RECORD_MISC_EXACT_IP .
852.RE
853
854.TP
855.IR "mmap_data" " (Since Linux 2.6.36)"
856The counterpart of the
857.I mmap
858field, but enables including data mmap events
859in the ring-buffer.
860
861.TP
862.IR "sample_id_all" " (Since Linux 2.6.38)"
863If set, then TID, TIME, ID, CPU, and STREAM_ID can
864additionally be included in
865.RB non- PERF_RECORD_SAMPLE s
866if the corresponding
867.I sample_type
868is selected.
869
870.TP
871.IR "exclude_host" " (Since Linux 3.2)"
872Do not measure time spent in VM host.
873
874.TP
875.IR "exclude_guest" " (Since Linux 3.2)"
876Do not measure time spent in VM guest.
877
878.TP
879.IR "exclude_callchain_kernel" " (Since Linux 3.7)"
880Do not include kernel callchains.
881
882.TP
883.IR "exclude_callchain_user" " (Since Linux 3.7)"
884Do not include user callchains.
885
886.TP
887.IR "wakeup_events" ", " "wakeup_watermark"
888This union sets how many samples
889.RI ( wakeup_events )
890or bytes
891.RI ( wakeup_watermark )
892happen before an overflow signal happens.
893Which one is used is selected by the
894.I watermark
895bitflag.
896
897.TP
898.IR "bp_type" " (Since Linux 2.6.33)"
899This chooses the breakpoint type.
900It is one of:
901.RS
902.TP
903.BR HW_BREAKPOINT_EMPTY
904no breakpoint
905.TP
906.BR HW_BREAKPOINT_R
907count when we read the memory location
908.TP
909.BR HW_BREAKPOINT_W
910count when we write the memory location
911.TP
912.BR HW_BREAKPOINT_RW
913count when we read or write the memory location
914.TP
915.BR HW_BREAKPOINT_X
916count when we execute code at the memory location
917
918.LP
7db515ef 919The values can be combined via a bitwise or, but the
f2b1d720
MK
920combination of
921.B HW_BREAKPOINT_R
922or
923.B HW_BREAKPOINT_W
924with
925.B HW_BREAKPOINT_X
926is not allowed.
927.RE
928
929.TP
930.IR "bp_addr" " (Since Linux 2.6.33)"
931.I bp_addr
932is the address of the breakpoint.
933For execution breakpoints this is the memory address of the instruction
934of interest; for read and write breakpoints it is the memory address
935of the memory location of interest.
936
937.TP
938.IR "config1" " (Since Linux 2.6.39)"
939.I config1
940is used for setting events that need an extra register or otherwise
941do not fit in the regular config field.
942Raw OFFCORE_EVENTS on Nehalem/Westmere/SandyBridge use this field
943on 3.3 and later kernels.
944
945.TP
946.IR "bp_len" " (Since Linux 2.6.33)"
947.I bp_len
948is the length of the breakpoint being measured if
949.I type
950is
951.BR PERF_TYPE_BREAKPOINT .
952Options are
953.BR HW_BREAKPOINT_LEN_1 ,
954.BR HW_BREAKPOINT_LEN_2 ,
955.BR HW_BREAKPOINT_LEN_4 ,
956.BR HW_BREAKPOINT_LEN_8 .
957For an execution breakpoint, set this to
958.IR sizeof(long) .
959
960.TP
961.IR "config2" " (Since Linux 2.6.39)"
962
963.I config2
964is a further extension of the
965.I config1
966field.
967
968.TP
969.IR "branch_sample_type" " (Since Linux 3.4)"
970This is used with the CPU's hardware branch sampling, if available.
971It can have one of the following values:
972.RS
973.TP
974.B PERF_SAMPLE_BRANCH_USER
975Branch target is in user space
976.TP
977.B PERF_SAMPLE_BRANCH_KERNEL
978Branch target is in kernel space
979.TP
980.B PERF_SAMPLE_BRANCH_HV
981Branch target is in hypervisor
982.TP
983.B PERF_SAMPLE_BRANCH_ANY
984Any branch type.
985.TP
986.B PERF_SAMPLE_BRANCH_ANY_CALL
987Any call branch
988.TP
989.B PERF_SAMPLE_BRANCH_ANY_RETURN
990Any return branch
991.TP
992.BR PERF_SAMPLE_BRANCH_IND_CALL
993Indirect calls
994.TP
995.BR PERF_SAMPLE_BRANCH_PLM_ALL
996User, kernel, and hv
997.RE
998
999.TP
1000.IR "sample_regs_user" " (Since Linux 3.7)"
1001This defines the set of user registers to dump on samples.
7db515ef 1002See
12eb3e64 1003.\" FIXME: The following reference seems to be not quite right:
7db515ef 1004.IR asm/perf_regs.h .
f2b1d720
MK
1005
1006.TP
1007.IR "sample_stack_user" " (Since Linux 3.7)"
1008This defines the size of the user stack to dump on sample.
1009
1010.RE
1011
c634028a 1012.SS "Reading results"
f2b1d720 1013Once a
7db515ef 1014.BR perf_event_open ()
f2b1d720
MK
1015file descriptor has been opened, the values
1016of the events can be read from the file descriptor.
1017The values that are there are specified by the
1018.I read_format
7db515ef
MK
1019field in the
1020.I attr
1021structure at open time.
f2b1d720
MK
1022
1023If you attempt to read into a buffer that is not big enough to hold the
1024data,
1025.B ENOSPC
1026is returned.
1027
1028Here is the layout of the data returned by a read:
1029
1030If
1031.B PERF_FORMAT_GROUP
1032was specified to allow reading all events in a group at once:
1033
1034.in +4n
1035.nf
1036struct read_format {
1037 u64 nr; /* The number of events */
1038 u64 time_enabled; /* if PERF_FORMAT_TOTAL_TIME_ENABLED */
1039 u64 time_running; /* if PERF_FORMAT_TOTAL_TIME_RUNNING */
1040 struct {
1041 u64 value; /* The value of the event */
1042 u64 id; /* if PERF_FORMAT_ID */
1043 } values[nr];
1044};
1045.fi
1046.in
1047
1048If
1049.B PERF_FORMAT_GROUP
1050was
1051.I not
1052specified, then the read values look as follows:
1053
1054.in +4n
1055.nf
1056struct read_format {
1057 u64 value; /* The value of the event */
1058 u64 time_enabled; /* if PERF_FORMAT_TOTAL_TIME_ENABLED */
1059 u64 time_running; /* if PERF_FORMAT_TOTAL_TIME_RUNNING */
1060 u64 id; /* if PERF_FORMAT_ID */
1061};
1062.fi
1063.in
1064
1065The values read are described in more detail below.
1066.RS
1067.TP
1068.I nr
1069The number of events in this file descriptor.
1070Only available if
1071.B PERF_FORMAT_GROUP
1072was specified.
1073
1074.TP
1075.IR time_enabled ", " time_running
1076Total time the event was enabled and running.
1077Normally these are the same.
1078If more events are started
1079than available counter slots on the PMU, then multiplexing
1080happens and events only run part of the time.
1081In that case the
1082.I time_enabled
1083and
1084.I time_running
1085values can be used to scale an estimated value for the count.
1086
1087.TP
1088.I value
1089An unsigned 64-bit value containing the counter result.
1090
1091.TP
1092.I id
1093A globally unique value for this particular event, only there if
1094.B PERF_FORMAT_ID
1095was specified in read_format.
1096
1097.RE
1098.RE
1099
1100
1101
c634028a 1102.SS "MMAP layout"
f2b1d720
MK
1103
1104When using
7db515ef 1105.BR perf_event_open ()
f2b1d720
MK
1106in sampled mode, asynchronous events
1107(like counter overflow or
1108.B PROT_EXEC
1109mmap tracking)
1110are logged into a ring-buffer.
1111This ring-buffer is created and accessed through
1112.BR mmap (2).
1113
1114The mmap size should be 1+2^n pages, where the first page is a
1115metadata page
1116.RI ( "struct perf_event_mmap_page" )
1117that contains various
1118bits of information such as where the ring-buffer head is.
1119
1120Before kernel 2.6.39, there is a bug that means you must allocate a mmap
1121ring buffer when sampling even if you do not plan to access it.
1122
1123The structure of the first metadata mmap page is as follows:
1124
1125.in +4n
1126.nf
1127struct perf_event_mmap_page {
7db515ef 1128 __u32 version; /* version number of this structure */
f2b1d720 1129 __u32 compat_version; /* lowest version this is compat with */
7db515ef
MK
1130 __u32 lock; /* seqlock for synchronization */
1131 __u32 index; /* hardware counter identifier */
1132 __s64 offset; /* add to hardware counter value */
1133 __u64 time_enabled; /* time event active */
1134 __u64 time_running; /* time event on CPU */
f2b1d720
MK
1135 union {
1136 __u64 capabilities;
1137 __u64 cap_usr_time : 1,
1138 cap_usr_rdpmc : 1,
1139 };
1140 __u16 pmc_width;
1141 __u16 time_shift;
1142 __u32 time_mult;
1143 __u64 time_offset;
7db515ef 1144 __u64 __reserved[120]; /* Pad to 1k */
f2b1d720 1145 __u64 data_head; /* head in the data section */
7db515ef 1146 __u64 data_tail; /* user-space written tail */
f2b1d720
MK
1147}
1148.fi
1149.in
1150
1151
1152
1153The following looks at the fields in the
1154.I perf_event_mmap_page
1155structure in more detail.
1156
7db515ef 1157.RS 4
f2b1d720
MK
1158
1159.TP
1160.I version
1161Version number of this structure.
1162
1163.TP
1164.I compat_version
1165The lowest version this is compatible with.
1166
1167.TP
1168.I lock
1169A seqlock for synchronization.
1170
1171.TP
1172.I index
1173A unique hardware counter identifier.
1174
1175.TP
1176.I offset
1177.\" FIXME clarify
1178Add this to the hardware counter value to obtain the event count.
1179
1180.TP
1181.I time_enabled
1182Time the event was active.
1183
1184.TP
1185.I time_running
1186Time the event was running.
1187
1188.TP
1189.I cap_usr_time
1190User time capability.
1191
1192.TP
1193.I cap_usr_rdpmc
1194If the hardware supports user-space read of performance counters
1195without syscall (this is the "rdpmc" instruction on x86), then
1196the following code can be used to do a read:
1197
1198.in +4n
1199.nf
1200u32 seq, time_mult, time_shift, idx, width;
1201u64 count, enabled, running;
1202u64 cyc, time_offset;
1203s64 pmc = 0;
1204
1205do {
1206 seq = pc\->lock;
1207 barrier();
1208 enabled = pc\->time_enabled;
1209 running = pc\->time_running;
1210
1211 if (pc\->cap_usr_time && enabled != running) {
1212 cyc = rdtsc();
1213 time_offset = pc\->time_offset;
1214 time_mult = pc\->time_mult;
1215 time_shift = pc\->time_shift;
1216 }
1217
1218 idx = pc\->index;
1219 count = pc\->offset;
1220
1221 if (pc\->cap_usr_rdpmc && idx) {
1222 width = pc\->pmc_width;
1223 pmc = rdpmc(idx \- 1);
1224 }
1225
1226 barrier();
1227} while (pc\->lock != seq);
1228.fi
1229.in
1230
1231
1232
1233.TP
1234.I pmc_width
1235If
1236.IR cap_usr_rdpmc ,
1237this field provides the bit-width of the value
1238read using the rdpmc or equivalent instruction.
1239This can be used to sign extend the result like:
1240
1241.in +4n
1242.nf
1243pmc <<= 64 \- pmc_width;
1244pmc >>= 64 \- pmc_width; // signed shift right
1245count += pmc;
1246.fi
1247.in
1248
1249
1250.TP
1251.IR time_shift ", " time_mult ", " time_offset
1252
1253If
1254.IR cap_usr_time ,
1255these fields can be used to compute the time
7db515ef 1256delta since time_enabled (in nanoseconds) using rdtsc or similar.
f2b1d720
MK
1257.nf
1258
1259 u64 quot, rem;
1260 u64 delta;
1261 quot = (cyc >> time_shift);
1262 rem = cyc & ((1 << time_shift) \- 1);
1263 delta = time_offset + quot * time_mult +
1264 ((rem * time_mult) >> time_shift);
1265.fi
1266
7db515ef
MK
1267Where
1268.IR time_offset ,
1269.IR time_mult ,
1270.IR time_shift ,
1271and
1272.IR cyc
1273are read in the
f2b1d720
MK
1274seqcount loop described above.
1275This delta can then be added to
1276enabled and possibly running (if idx), improving the scaling:
1277.nf
1278
1279 enabled += delta;
1280 if (idx)
1281 running += delta;
1282 quot = count / running;
1283 rem = count % running;
1284 count = quot * enabled + (rem * enabled) / running;
1285.fi
1286
1287.TP
1288.I data_head
1289This points to the head of the data section.
7db515ef
MK
1290The value continuously increases; it does not wrap.
1291The value needs to be manually wrapped by the size of the mmap buffer
f2b1d720
MK
1292before accessing the samples.
1293
1294On SMP-capable platforms, after reading the data_head value,
1295user-space should issue an rmb().
1296
1297.TP
1298.I data_tail
1299When the mapping is
1300.BR PROT_WRITE ,
7db515ef
MK
1301the
1302.I data_tail
1303value should be written by user space to reflect the last read data.
f2b1d720
MK
1304In this case the kernel will not over-write unread data.
1305
1306.RE
1307
1308
1309The following 2^n ring-buffer pages have the layout described below.
1310
1311If
1312.I perf_event_attr.sample_id_all
1313is set, then all event types will
1314have the sample_type selected fields related to where/when (identity)
1315an event took place (TID, TIME, ID, CPU, STREAM_ID) described in
1316.B PERF_RECORD_SAMPLE
1317below, it will be stashed just after the
7db515ef
MK
1318.I perf_event_header
1319and the fields already present for the existing
f2b1d720
MK
1320fields, i.e., at the end of the payload.
1321That way a newer perf.data
1322file will be supported by older perf tools, with these new optional
1323fields being ignored.
1324
1325The mmap values start with a header:
1326
1327.in +4n
1328.nf
1329struct perf_event_header {
1330 __u32 type;
1331 __u16 misc;
1332 __u16 size;
1333};
1334.fi
1335.in
1336
1337Below, we describe the
1338.I perf_event_header
1339fields in more detail.
1340
1341.TP
1342.I type
1343The
1344.I type
1345value is one of the below.
1346The values in the corresponding record (that follows the header)
1347depend on the
1348.I type
1349selected as shown.
1350
1351.RS
7db515ef 1352.TP 4
f2b1d720
MK
1353.B PERF_RECORD_MMAP
1354The MMAP events record the
1355.B PROT_EXEC
1356mappings so that we can correlate
7db515ef 1357user space IPs to code.
f2b1d720
MK
1358They have the following structure:
1359
1360.in +4n
1361.nf
1362struct {
1363 struct perf_event_header header;
1364 u32 pid, tid;
1365 u64 addr;
1366 u64 len;
1367 u64 pgoff;
1368 char filename[];
1369};
1370.fi
1371.in
1372
1373.TP
1374.B PERF_RECORD_LOST
1375This record indicates when events are lost.
1376
1377.in +4n
1378.nf
1379struct {
1380 struct perf_event_header header;
1381 u64 id;
1382 u64 lost;
1383};
1384.fi
1385.in
1386
1387.RS
1388.TP
1389.I id
1390is the unique event ID for the samples that were lost.
1391.TP
1392.I lost
1393is the number of events that were lost.
1394.RE
1395
1396.TP
1397.B PERF_RECORD_COMM
1398This record indicates a change in the process name.
1399
1400.in +4n
1401.nf
1402struct {
1403 struct perf_event_header header;
1404 u32 pid, tid;
1405 char comm[];
1406};
1407.fi
1408.in
1409
1410.TP
1411.B PERF_RECORD_EXIT
1412This record indicates a process exit event.
1413
1414.in +4n
1415.nf
1416struct {
1417 struct perf_event_header header;
1418 u32 pid, ppid;
1419 u32 tid, ptid;
1420 u64 time;
1421};
1422.fi
1423.in
1424
1425.TP
1426.BR PERF_RECORD_THROTTLE ", " PERF_RECORD_UNTHROTTLE
1427This record indicates a throttle/unthrottle event.
1428
1429.in +4n
1430.nf
1431struct {
1432 struct perf_event_header header;
1433 u64 time;
1434 u64 id;
1435 u64 stream_id;
1436};
1437.fi
1438.in
1439
1440.TP
1441.B PERF_RECORD_FORK
1442This record indicates a fork event.
1443
1444.in +4n
1445.nf
1446struct {
1447 struct perf_event_header header;
1448 u32 pid, ppid;
1449 u32 tid, ptid;
1450 u64 time;
1451};
1452.fi
1453.in
1454
1455.TP
1456.B PERF_RECORD_READ
1457This record indicates a read event.
1458
1459.in +4n
1460.nf
1461struct {
1462 struct perf_event_header header;
1463 u32 pid, tid;
1464 struct read_format values;
1465};
1466.fi
1467.in
1468
1469.TP
1470.B PERF_RECORD_SAMPLE
1471This record indicates a sample.
1472
1473.in +4n
1474.nf
1475struct {
1476 struct perf_event_header header;
7db515ef
MK
1477 u64 ip; /* if PERF_SAMPLE_IP */
1478 u32 pid, tid; /* if PERF_SAMPLE_TID */
1479 u64 time; /* if PERF_SAMPLE_TIME */
1480 u64 addr; /* if PERF_SAMPLE_ADDR */
1481 u64 id; /* if PERF_SAMPLE_ID */
1482 u64 stream_id; /* if PERF_SAMPLE_STREAM_ID */
1483 u32 cpu, res; /* if PERF_SAMPLE_CPU */
1484 u64 period; /* if PERF_SAMPLE_PERIOD */
f2b1d720 1485 struct read_format v; /* if PERF_SAMPLE_READ */
7db515ef
MK
1486 u64 nr; /* if PERF_SAMPLE_CALLCHAIN */
1487 u64 ips[nr]; /* if PERF_SAMPLE_CALLCHAIN */
1488 u32 size; /* if PERF_SAMPLE_RAW */
1489 char data[size]; /* if PERF_SAMPLE_RAW */
1490 u64 bnr; /* if PERF_SAMPLE_BRANCH_STACK */
1491 struct perf_branch_entry lbr[bnr];
1492 /* if PERF_SAMPLE_BRANCH_STACK */
1493 u64 abi; /* if PERF_SAMPLE_REGS_USER */
1494 u64 regs[weight(mask)];
1495 /* if PERF_SAMPLE_REGS_USER */
1496 u64 size; /* if PERF_SAMPLE_STACK_USER */
1497 char data[size]; /* if PERF_SAMPLE_STACK_USER */
1498 u64 dyn_size; /* if PERF_SAMPLE_STACK_USER */
f2b1d720
MK
1499};
1500.fi
1501
1502.RS
1503.TP
1504.I ip
7db515ef
MK
1505If
1506.B PERF_SAMPLE_IP
1507is enabled, then a 64-bit instruction
f2b1d720
MK
1508pointer value is included.
1509
1510.TP
7db515ef
MK
1511.IR pid ", " tid
1512If
1513.B PERF_SAMPLE_TID
1514is enabled, then a 32-bit process ID
1515and 32-bit thread ID are included.
f2b1d720
MK
1516
1517.TP
1518.I time
7db515ef
MK
1519If
1520.B PERF_SAMPLE_TIME
1521is enabled, then a 64-bit timestamp
f2b1d720
MK
1522is included.
1523This is obtained via local_clock() which is a hardware timestamp
1524if available and the jiffies value if not.
1525
1526.TP
1527.I addr
7db515ef
MK
1528If
1529.B PERF_SAMPLE_ADDR
1530is enabled, then a 64-bit address is included.
f2b1d720
MK
1531This is usually the address of a tracepoint,
1532breakpoint, or software event; otherwise the value is 0.
1533
1534.TP
1535.I id
7db515ef
MK
1536If
1537.B PERF_SAMPLE_ID
1538is enabled, a 64-bit unique ID is included.
f2b1d720 1539If the event is a member of an event group, the group leader ID is returned.
7db515ef
MK
1540This ID is the same as the one returned by
1541.BR PERF_FORMAT_ID .
f2b1d720
MK
1542
1543.TP
1544.I stream_id
7db515ef
MK
1545If
1546.B PERF_SAMPLE_STREAM_ID
1547is enabled, a 64-bit unique ID is included.
f2b1d720
MK
1548Unlike
1549.B PERF_SAMPLE_ID
1550the actual ID is returned, not the group leader.
7db515ef
MK
1551This ID is the same as the one returned by
1552.BR PERF_FORMAT_ID .
f2b1d720
MK
1553
1554.TP
7db515ef
MK
1555.IR cpu ", " res
1556If
1557.B PERF_SAMPLE_CPU
1558is enabled, this is a 32-bit value indicating
f2b1d720
MK
1559which CPU was being used, in addition to a reserved (unused)
156032-bit value.
1561
1562.TP
1563.I period
7db515ef
MK
1564If
1565.B PERF_SAMPLE_PERIOD
1566is enabled, a 64-bit value indicating
f2b1d720
MK
1567the current sampling period is written.
1568
1569.TP
1570.I v
7db515ef
MK
1571If
1572.B PERF_SAMPLE_READ
1573is enabled, a structure of type read_format
f2b1d720
MK
1574is included which has values for all events in the event group.
1575The values included depend on the
1576.I read_format
7db515ef
MK
1577value used at
1578.BR perf_event_open ()
1579time.
f2b1d720
MK
1580
1581.TP
7db515ef
MK
1582.IR nr ", " ips[nr]
1583If
1584.B PERF_SAMPLE_CALLCHAIN
1585is enabled, then a 64-bit number is included
f2b1d720 1586which indicates how many 64-bit instruction pointers will
7db515ef
MK
1587follow.
1588This is the current callchain.
f2b1d720
MK
1589
1590.TP
7db515ef
MK
1591.IR size ", " data
1592If
1593.B PERF_SAMPLE_RAW
1594is enabled, then a 32-bit value indicating size
f2b1d720
MK
1595is included followed by an array of 8-bit values of length size.
1596The values are padded with 0 to have 64-bit alignment.
1597
1598This RAW record data is opaque with respect to the ABI.
1599The ABI doesn't make any promises with respect to the stability
1600of its content, it may vary depending
1601on event, hardware, and kernel version.
1602
1603.TP
7db515ef
MK
1604.IR bnr ", " lbr[bnr]
1605If
1606.B PERF_SAMPLE_BRANCH_STACK
1607is enabled, then a 64-bit value indicating
1608the number of records is included, followed by
1609.I bnr
1610.I perf_branch_entry
1611structures.
1612These structures have from, to, and flags values indicating
f2b1d720
MK
1613the from and to addresses from the branches on the callstack.
1614
1615.TP
7db515ef
MK
1616.IR abi ", " regs[weight(mask)]
1617If
1618.B PERF_SAMPLE_REGS_USER
1619is enabled, then
f2b1d720
MK
1620[to be documented].
1621
1622The
1623.I abi
1624field is one of
1625.BR PERF_SAMPLE_REGS_ABI_NONE ", " PERF_SAMPLE_REGS_ABI_32 " or "
7db515ef 1626.BR PERF_SAMPLE_REGS_ABI_64 .
f2b1d720
MK
1627
1628.TP
7db515ef
MK
1629.IR size ", " data[size] ", " dyn_size
1630If
1631.B PERF_SAMPLE_STACK_USER
1632is enabled, then
f2b1d720
MK
1633[to be documented].
1634
1635.RE
1636
1637.RE
1638
1639
1640.TP
1641.I misc
1642The
1643.I misc
1644field contains additional information about the sample.
1645
1646The CPU mode can be determined from this value by masking with
1647.B PERF_RECORD_MISC_CPUMODE_MASK
1648and looking for one of the following (note these are not
1649bitmasks, only one can be set at a time):
1650.RS
1651.TP
1652.B PERF_RECORD_MISC_CPUMODE_UNKNOWN
1653Unknown CPU mode.
1654.TP
1655.B PERF_RECORD_MISC_KERNEL
1656Sample happened in the kernel.
1657.TP
1658.B PERF_RECORD_MISC_USER
1659Sample happened in user code.
1660.TP
1661.B PERF_RECORD_MISC_HYPERVISOR
1662Sample happened in the hypervisor.
1663.TP
1664.B PERF_RECORD_MISC_GUEST_KERNEL
1665Sample happened in the guest kernel.
1666.TP
1667.B PERF_RECORD_MISC_GUEST_USER
1668Sample happened in guest user code.
1669.RE
1670
7db515ef 1671In addition, one of the following bits can be set:
f2b1d720
MK
1672.RS
1673.TP
1674.B PERF_RECORD_MISC_EXACT_IP
1675This indicates that the content of
1676.B PERF_SAMPLE_IP
1677points
1678to the actual instruction that triggered the event.
1679See also
1680.IR perf_event_attr.precise_ip .
1681
1682.TP
1683.B PERF_RECORD_MISC_EXT_RESERVED
1684This indicates there is extended data available (currently not used).
1685
1686.RE
1687
1688.TP
1689.I size
1690This indicates the size of the record.
1691
1692.RE
1693
c634028a 1694.SS "Signal overflow"
f2b1d720
MK
1695
1696Events can be set to deliver a signal when a threshold is crossed.
1697The signal handler is set up using the
1698.BR poll (2),
1699.BR select (2),
1700.BR epoll (7)
1701and
1702.BR fcntl (2)
1703system calls.
1704
1705To generate signals, sampling must be enabled
1706.RI ( sample_period
1707must have a non-zero value).
1708
1709There are two ways to generate signals.
1710
1711The first is to set a
1712.I wakeup_events
1713or
1714.I wakeup_watermark
1715value that will generate a signal if a certain number of samples
1716or bytes have been written to the mmap ring buffer.
7db515ef
MK
1717In this case a signal of type
1718.B POLL_IN
1719is sent.
f2b1d720
MK
1720
1721The other way is by use of the
7db515ef 1722.B PERF_EVENT_IOC_REFRESH
f2b1d720
MK
1723ioctl.
1724This ioctl adds to a counter that decrements each time the event overflows.
7db515ef
MK
1725When non-zero, a
1726.B POLL_IN
1727signal is sent on overflow, but
1728once the value reaches 0, a signal is sent of type
1729.B POLL_HUP
1730and
f2b1d720
MK
1731the underlying event is disabled.
1732
1733Note: on newer kernels (definitely noticed with 3.2)
7db515ef 1734.\" FIXME(Vince) : Find out when this was introduced
f2b1d720
MK
1735a signal is provided for every overflow, even if
1736.I wakeup_events
1737is not set.
1738
1739.SS "rdpmc instruction"
1740Starting with Linux 3.4 on x86, you can use the
1741.I rdpmc
1742instruction to get low-latency reads without having to enter the kernel.
1743Note that using
1744.I rdpmc
1745is not necessarily faster than other methods for reading event values.
1746
1747Support for this can be detected with the
1748.I cap_usr_rdpmc
1749field in the mmap page; documentation on how
1750to calculate event values can be found in that section.
1751
1752.SS "perf_event ioctl calls"
1753.PP
1754Various ioctls act on
7db515ef 1755.BR perf_event_open ()
f2b1d720
MK
1756file descriptors.
1757
1758.TP
1759.B PERF_EVENT_IOC_ENABLE
36127c0e 1760Enables the individual event or event group specified by the
7db515ef 1761file descriptor argument.
f2b1d720
MK
1762
1763The ioctl argument is ignored.
1764
1765.TP
1766.B PERF_EVENT_IOC_DISABLE
36127c0e 1767Disables the individual counter or event group specified by the
7db515ef 1768file descriptor argument.
f2b1d720
MK
1769
1770Enabling or disabling the leader of a group enables or disables the
1771entire group; that is, while the group leader is disabled, none of the
1772counters in the group will count.
1773Enabling or disabling a member of a group other than the leader only
1774affects that counter; disabling a non-leader
1775stops that counter from counting but doesn't affect any other counter.
1776
1777The ioctl argument is ignored.
1778
1779.TP
1780.B PERF_EVENT_IOC_REFRESH
1781Non-inherited overflow counters can use this
1782to enable a counter for a number of overflows specified by the argument,
1783after which it is disabled.
1784Subsequent calls of this ioctl add the argument value to the current
1785count.
7db515ef
MK
1786A signal with
1787.B POLL_IN
1788set will happen on each overflow until the
1789count reaches 0; when that happens a signal with
1790.B POLL_HUP
1791set is sent and the event is disabled.
f2b1d720
MK
1792Using an argument of 0 is considered undefined behavior.
1793
1794.TP
1795.B PERF_EVENT_IOC_RESET
36127c0e 1796Reset the event count specified by the
7db515ef 1797file descriptor argument to zero.
f2b1d720
MK
1798This only resets the counts; there is no way to reset the
1799multiplexing
1800.I time_enabled
1801or
1802.I time_running
1803values.
1804When sent to a group leader, only
1805the leader is reset (child events are not).
1806
1807The ioctl argument is ignored.
1808
1809.TP
1810.B PERF_EVENT_IOC_PERIOD
1811IOC_PERIOD is the command to update the period; it
1812does not update the current period but instead defers until the next one.
1813
1814The argument is a pointer to a 64-bit value containing the
1815desired new period.
1816
1817.TP
1818.B PERF_EVENT_IOC_SET_OUTPUT
1819This tells the kernel to report event notifications to the specified
1820file descriptor rather than the default one.
1821The file descriptors must all be on the same CPU.
1822
1823The argument specifies the desired file descriptor, or \-1 if
1824output should be ignored.
1825
1826.TP
1827.BR PERF_EVENT_IOC_SET_FILTER " (Since Linux 2.6.33)"
1828This adds an ftrace filter to this event.
1829
1830The argument is a pointer to the desired ftrace filter.
1831
1832.SS "Using prctl"
1833A process can enable or disable all the event groups that are
1834attached to it using the
1835.BR prctl (2)
1836.B PR_TASK_PERF_EVENTS_ENABLE
1837and
1838.B PR_TASK_PERF_EVENTS_DISABLE
1839operations.
1840This applies to all counters on the current process, whether created by
1841this process or by another, and does not affect any counters that this
1842process has created on other processes.
1843It only enables or disables
1844the group leaders, not any other members in the groups.
1845
1846.SS perf_event related configuration files
1847
7db515ef
MK
1848Files in
1849.I /proc/sys/kernel/
f2b1d720 1850
7db515ef 1851.RS 4
f2b1d720 1852.TP
7db515ef 1853.I /proc/sys/kernel/perf_event_paranoid
f2b1d720
MK
1854
1855The
1856.I perf_event_paranoid
1857file can be set to restrict access to the performance counters.
1858
7db515ef 18592 - only allow user-space measurements
f2b1d720
MK
1860
18611 - (default) allow both kernel and user measurements
1862
18630 - allow access to CPU-specific data but not raw tracepoint samples
1864
1865\-1 - no restrictions
1866
1867The existence of the
1868.I perf_event_paranoid
1869file is the official method for determining if a kernel supports
7db515ef 1870.BR perf_event_open ().
f2b1d720
MK
1871
1872.TP
1873.I /proc/sys/kernel/perf_event_max_sample_rate
1874
7db515ef
MK
1875This sets the maximum sample rate.
1876Setting this too high can allow
f2b1d720 1877users to sample at a rate that impacts overall machine performance
7db515ef
MK
1878and potentially lock up the machine.
1879The default value is
f2b1d720
MK
1880100000 (samples per second).
1881
1882.TP
1883.I /proc/sys/kernel/perf_event_mlock_kb
1884
7db515ef 1885Maximum number of pages an unprivileged user can mlock(2).
f2b1d720
MK
1886The default is 516 (kB).
1887.RE
1888
7db515ef
MK
1889Files in
1890.I /sys/bus/event_source/devices/
f2b1d720 1891
7db515ef 1892.RS 4
f2b1d720
MK
1893Since Linux 2.6.34 the kernel supports having multiple PMUs
1894available for monitoring.
1895Information on how to program these PMUs can be found under
1896.IR /sys/bus/event_source/devices/ .
1897Each subdirectory corresponds to a different PMU.
1898
f2b1d720
MK
1899.TP
1900.I /sys/bus/event_source/devices/*/type
1901This contains an integer that can be used in the
1902.I type
1903field of perf_event_attr to indicate you wish to use this PMU.
1904
1905.TP
1906.I /sys/bus/event_source/devices/*/rdpmc
1907[To be documented]
1908
1909.TP
1910.I /sys/bus/event_source/devices/*/format/
1911This sub-directory contains information on what bits in the
1912.I config
1913field of perf_event_attr correspond to.
1914
1915.TP
1916.I /sys/bus/event_source/devices/*/events/
1917This sub-directory contains files with pre-defined events.
1918The contents are strings describing the event settings
1919expressed in terms of the fields found in the
1920.I ./format/
1921directory.
1922These are not necessarily complete lists of all events supported by
1923a PMU, but usually a subset of events deemed useful or interesting.
1924
1925.TP
1926.I /sys/bus/event_source/devices/*/uevent
1927[To be documented]
1928
1929.RE
1930
1931
1932.SH "RETURN VALUE"
1933.BR perf_event_open ()
1934returns the new file descriptor, or \-1 if an error occurred
1935(in which case,
1936.I errno
1937is set appropriately).
1938.SH ERRORS
1939.TP
1940.B EINVAL
1941Returned if the specified event is not available.
1942.TP
1943.B ENOSPC
1944Prior to Linux 3.3, if there was not enough room for the event,
1945.B ENOSPC
1946was returned.
1947Linus did not like this, and this was changed to
1948.BR EINVAL .
1949.B ENOSPC
1950is still returned if you try to read results into
1951too small of a buffer.
1952
1953.SH VERSION
1954
1955.BR perf_event_open ()
1956was introduced in Linux 2.6.31 but was called
1957.BR perf_counter_open ().
1958It was renamed in Linux 2.6.32.
1959
1960.SH CONFORMING TO
1961
7db515ef
MK
1962This
1963.BR perf_event_open ()
1964system call is Linux-specific
f2b1d720
MK
1965and should not be used in programs intended to be portable.
1966
1967.SH NOTES
1968Glibc does not provide a wrapper for this system call; call it using
1969.BR syscall (2).
7db515ef 1970See the example below.
f2b1d720
MK
1971
1972The official way of knowing if
7db515ef 1973.BR perf_event_open ()
f2b1d720
MK
1974support is enabled is checking
1975for the existence of the file
7db515ef 1976.IR /proc/sys/kernel/perf_event_paranoid .
f2b1d720
MK
1977
1978.SH BUGS
1979
1980The
1981.B F_SETOWN_EX
1982option to
7db515ef 1983.BR fcntl (2)
f2b1d720
MK
1984is needed to properly get overflow signals in threads.
1985This was introduced in Linux 2.6.32.
1986
1987Prior to Linux 2.6.33 (at least for x86) the kernel did not check
1988if events could be scheduled together until read time.
1989The same happens on all known kernels if the NMI watchdog is enabled.
1990This means to see if a given set of events works you have to
1991.BR perf_event_open (),
1992start, then read before you know for sure you
1993can get valid measurements.
1994
1995Prior to Linux 2.6.34 event constraints were not enforced by the kernel.
1996In that case, some events would silently return "0" if the kernel
1997scheduled them in an improper counter slot.
1998
1999Prior to Linux 2.6.34 there was a bug when multiplexing where the
2000wrong results could be returned.
2001
2002Kernels from Linux 2.6.35 to Linux 2.6.39 can quickly crash the kernel if
2003"inherit" is enabled and many threads are started.
2004
2005Prior to Linux 2.6.35,
2006.B PERF_FORMAT_GROUP
2007did not work with attached processes.
2008
2009In older Linux 2.6 versions,
2010refreshing an event group leader refreshed all siblings,
2011and refreshing with a parameter of 0 enabled infinite refresh.
2012This behavior is unsupported and should not be relied on.
2013
2014There is a bug in the kernel code between
2015Linux 2.6.36 and Linux 3.0 that ignores the
2016"watermark" field and acts as if a wakeup_event
2017was chosen if the union has a
2018non-zero value in it.
2019
7db515ef
MK
2020Always double-check your results!
2021Various generalized events have had wrong values.
f2b1d720
MK
2022For example, retired branches measured
2023the wrong thing on AMD machines until Linux 2.6.35.
2024
2025.SH EXAMPLE
2026The following is a short example that measures the total
7db515ef
MK
2027instruction count of a call to
2028.BR printf (3).
f2b1d720
MK
2029.nf
2030
2031#include <stdlib.h>
2032#include <stdio.h>
2033#include <unistd.h>
2034#include <string.h>
2035#include <sys/ioctl.h>
2036#include <linux/perf_event.h>
2037#include <asm/unistd.h>
2038
7db515ef
MK
2039long
2040perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
2041 int cpu, int group_fd, unsigned long flags)
f2b1d720
MK
2042{
2043 int ret;
2044
7db515ef
MK
2045 ret = syscall(__NR_perf_event_open, hw_event, pid, cpu,
2046 group_fd, flags);
f2b1d720
MK
2047 return ret;
2048}
2049
2050
2051int
2052main(int argc, char **argv)
2053{
2054
2055 struct perf_event_attr pe;
2056 long long count;
2057 int fd;
2058
2059 memset(&pe, 0, sizeof(struct perf_event_attr));
2060 pe.type = PERF_TYPE_HARDWARE;
2061 pe.size = sizeof(struct perf_event_attr);
2062 pe.config = PERF_COUNT_HW_INSTRUCTIONS;
2063 pe.disabled = 1;
2064 pe.exclude_kernel = 1;
2065 pe.exclude_hv = 1;
2066
2067 fd = perf_event_open(&pe, 0, \-1, \-1, 0);
7db515ef 2068 if (fd == \-1) {
f2b1d720 2069 fprintf(stderr, "Error opening leader %llx\\n", pe.config);
7db515ef 2070 exit(EXIT_FAILURE);
f2b1d720
MK
2071 }
2072
2073 ioctl(fd, PERF_EVENT_IOC_RESET, 0);
2074 ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
2075
2076 printf("Measuring instruction count for this printf\\n");
2077
2078 ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
2079 read(fd, &count, sizeof(long long));
2080
2081 printf("Used %lld instructions\\n", count);
2082
2083 close(fd);
2084}
2085.fi
2086
2087.SH "SEE ALSO"
2088.BR fcntl (2),
2089.BR mmap (2),
2090.BR open (2),
2091.BR prctl (2),
2092.BR read (2)