1From: Cliff Wickman <cpw@sgi.com>
2Subject: perfmon2
3References: bnc#430298
4Patch-mainline: never
5
6This is Stephane Eranian's patch
7from http://perfmon2.sourceforge.net/
8but backfitted to the SuSE KOTD for 10/20/2008
9
10[greg's note: I really don't like this, as perfmon2 has been rejected
11from upstream, and perfmon3 is being worked on. This should be going
12away for SP1, and no one should count on the userspace interface
13remaining the same...]
14
15Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
16
17---
18 Documentation/ABI/testing/sysfs-perfmon | 87 +
19 Documentation/ABI/testing/sysfs-perfmon-fmt | 18
20 Documentation/ABI/testing/sysfs-perfmon-pmu | 46
21 Documentation/kernel-parameters.txt | 3
22 Documentation/perfmon2-debugfs.txt | 126 ++
23 Documentation/perfmon2.txt | 213 +++
24 MAINTAINERS | 8
25 Makefile | 1
26 arch/ia64/Kconfig | 10
27 arch/ia64/Makefile | 1
28 arch/ia64/configs/generic_defconfig | 11
29 arch/ia64/include/asm/Kbuild | 4
30 arch/ia64/include/asm/hw_irq.h | 2
31 arch/ia64/include/asm/perfmon.h | 302 -----
32 arch/ia64/include/asm/perfmon_compat.h | 167 +++
33 arch/ia64/include/asm/perfmon_default_smpl.h | 121 +-
34 arch/ia64/include/asm/perfmon_kern.h | 356 ++++++
35 arch/ia64/include/asm/processor.h | 10
36 arch/ia64/include/asm/system.h | 18
37 arch/ia64/include/asm/thread_info.h | 4
38 arch/ia64/include/asm/unistd.h | 14
39 arch/ia64/kernel/Makefile | 3
40 arch/ia64/kernel/entry.S | 12
41 arch/ia64/kernel/irq_ia64.c | 7
42 arch/ia64/kernel/perfmon_default_smpl.c | 296 -----
43 arch/ia64/kernel/perfmon_generic.h | 45
44 arch/ia64/kernel/perfmon_itanium.h | 115 --
45 arch/ia64/kernel/perfmon_mckinley.h | 187 ---
46 arch/ia64/kernel/perfmon_montecito.h | 269 -----
47 arch/ia64/kernel/process.c | 98 -
48 arch/ia64/kernel/ptrace.c | 8
49 arch/ia64/kernel/setup.c | 3
50 arch/ia64/kernel/smpboot.c | 10
51 arch/ia64/kernel/sys_ia64.c | 8
52 arch/ia64/lib/Makefile | 1
53 arch/ia64/oprofile/init.c | 8
54 arch/ia64/oprofile/perfmon.c | 39
55 arch/ia64/perfmon/Kconfig | 67 +
56 arch/ia64/perfmon/Makefile | 11
57 arch/ia64/perfmon/perfmon.c | 946 +++++++++++++++++
58 arch/ia64/perfmon/perfmon_compat.c | 1210 ++++++++++++++++++++++
59 arch/ia64/perfmon/perfmon_default_smpl.c | 273 +++++
60 arch/ia64/perfmon/perfmon_generic.c | 148 ++
61 arch/ia64/perfmon/perfmon_itanium.c | 232 ++++
62 arch/ia64/perfmon/perfmon_mckinley.c | 290 +++++
63 arch/ia64/perfmon/perfmon_montecito.c | 412 +++++++
64 arch/mips/Kconfig | 2
65 arch/mips/Makefile | 6
66 arch/mips/kernel/process.c | 4
67 arch/mips/kernel/scall32-o32.S | 12
68 arch/mips/kernel/scall64-64.S | 12
69 arch/mips/kernel/scall64-n32.S | 16
70 arch/mips/kernel/scall64-o32.S | 12
71 arch/mips/kernel/signal.c | 6
72 arch/mips/kernel/time.c | 3
73 arch/mips/kernel/traps.c | 16
74 arch/mips/mti-malta/malta-time.c | 1
75 arch/mips/perfmon/Kconfig | 61 +
76 arch/mips/perfmon/Makefile | 2
77 arch/mips/perfmon/perfmon.c | 313 +++++
78 arch/mips/perfmon/perfmon_mips64.c | 218 ++++
79 arch/powerpc/Kconfig | 2
80 arch/powerpc/Makefile | 1
81 arch/powerpc/include/asm/Kbuild | 1
82 arch/powerpc/include/asm/cell-pmu.h | 5
83 arch/powerpc/include/asm/cell-regs.h | 30
84 arch/powerpc/include/asm/paca.h | 4
85 arch/powerpc/include/asm/perfmon.h | 33
86 arch/powerpc/include/asm/perfmon_kern.h | 390 +++++++
87 arch/powerpc/include/asm/reg.h | 1
88 arch/powerpc/include/asm/systbl.h | 12
89 arch/powerpc/include/asm/thread_info.h | 4
90 arch/powerpc/include/asm/unistd.h | 14
91 arch/powerpc/kernel/entry_32.S | 2
92 arch/powerpc/kernel/entry_64.S | 4
93 arch/powerpc/kernel/irq.c | 31
94 arch/powerpc/kernel/process.c | 10
95 arch/powerpc/perfmon/Kconfig | 67 +
96 arch/powerpc/perfmon/Makefile | 6
97 arch/powerpc/perfmon/perfmon.c | 334 ++++++
98 arch/powerpc/perfmon/perfmon_cell.c | 1449 +++++++++++++++++++++++++++
99 arch/powerpc/perfmon/perfmon_power4.c | 309 +++++
100 arch/powerpc/perfmon/perfmon_power5.c | 326 ++++++
101 arch/powerpc/perfmon/perfmon_power6.c | 520 +++++++++
102 arch/powerpc/perfmon/perfmon_ppc32.c | 340 ++++++
103 arch/powerpc/platforms/cell/cbe_regs.c | 27
104 arch/sparc/include/asm/hypervisor.h | 24
105 arch/sparc/include/asm/irq_64.h | 3
106 arch/sparc/include/asm/perfmon.h | 11
107 arch/sparc/include/asm/perfmon_kern.h | 286 +++++
108 arch/sparc/include/asm/system_64.h | 34
109 arch/sparc/include/asm/thread_info_64.h | 28
110 arch/sparc/include/asm/unistd_32.h | 14
111 arch/sparc/include/asm/unistd_64.h | 14
112 arch/sparc/kernel/systbls.S | 4
113 arch/sparc64/Kconfig | 2
114 arch/sparc64/Makefile | 2
115 arch/sparc64/kernel/cpu.c | 47
116 arch/sparc64/kernel/hvcalls.S | 41
117 arch/sparc64/kernel/irq.c | 63 +
118 arch/sparc64/kernel/process.c | 26
119 arch/sparc64/kernel/rtrap.S | 51
120 arch/sparc64/kernel/setup.c | 2
121 arch/sparc64/kernel/signal.c | 4
122 arch/sparc64/kernel/sys_sparc.c | 101 -
123 arch/sparc64/kernel/syscalls.S | 23
124 arch/sparc64/kernel/systbls.S | 8
125 arch/sparc64/kernel/traps.c | 158 +-
126 arch/sparc64/kernel/ttable.S | 2
127 arch/sparc64/perfmon/Kconfig | 26
128 arch/sparc64/perfmon/Makefile | 1
129 arch/sparc64/perfmon/perfmon.c | 422 +++++++
130 arch/x86/Kconfig | 2
131 arch/x86/Makefile | 2
132 arch/x86/ia32/ia32entry.S | 12
133 arch/x86/kernel/apic_32.c | 5
134 arch/x86/kernel/apic_64.c | 1
135 arch/x86/kernel/cpu/common.c | 3
136 arch/x86/kernel/entry_32.S | 2
137 arch/x86/kernel/entry_64.S | 8
138 arch/x86/kernel/irqinit_64.c | 5
139 arch/x86/kernel/process_32.c | 10
140 arch/x86/kernel/process_64.c | 10
141 arch/x86/kernel/signal_32.c | 5
142 arch/x86/kernel/signal_64.c | 6
143 arch/x86/kernel/smpboot.c | 2
144 arch/x86/kernel/syscall_table_32.S | 12
145 arch/x86/oprofile/nmi_int.c | 10
146 arch/x86/perfmon/Kconfig | 89 +
147 arch/x86/perfmon/Makefile | 13
148 arch/x86/perfmon/perfmon.c | 761 ++++++++++++++
149 arch/x86/perfmon/perfmon_amd64.c | 754 ++++++++++++++
150 arch/x86/perfmon/perfmon_intel_arch.c | 610 +++++++++++
151 arch/x86/perfmon/perfmon_intel_atom.c | 541 ++++++++++
152 arch/x86/perfmon/perfmon_intel_core.c | 449 ++++++++
153 arch/x86/perfmon/perfmon_p4.c | 913 +++++++++++++++++
154 arch/x86/perfmon/perfmon_p6.c | 310 +++++
155 arch/x86/perfmon/perfmon_pebs_core_smpl.c | 256 ++++
156 arch/x86/perfmon/perfmon_pebs_p4_smpl.c | 253 ++++
157 include/asm-mips/Kbuild | 1
158 include/asm-mips/perfmon.h | 34
159 include/asm-mips/perfmon_kern.h | 412 +++++++
160 include/asm-mips/system.h | 4
161 include/asm-mips/thread_info.h | 4
162 include/asm-mips/unistd.h | 46
163 include/asm-x86/Kbuild | 1
164 include/asm-x86/ia32_unistd.h | 13
165 include/asm-x86/irq_vectors.h | 5
166 include/asm-x86/mach-default/entry_arch.h | 4
167 include/asm-x86/perfmon.h | 34
168 include/asm-x86/perfmon_kern.h | 548 ++++++++++
169 include/asm-x86/perfmon_pebs_core_smpl.h | 164 +++
170 include/asm-x86/perfmon_pebs_p4_smpl.h | 193 +++
171 include/asm-x86/thread_info.h | 8
172 include/asm-x86/unistd_32.h | 14
173 include/asm-x86/unistd_64.h | 25
174 include/linux/Kbuild | 2
175 include/linux/perfmon.h | 213 +++
176 include/linux/perfmon_dfl_smpl.h | 78 +
177 include/linux/perfmon_fmt.h | 74 +
178 include/linux/perfmon_kern.h | 551 ++++++++++
179 include/linux/perfmon_pmu.h | 192 +++
180 include/linux/sched.h | 4
181 include/linux/syscalls.h | 30
182 kernel/sched.c | 1
183 kernel/sys_ni.c | 13
184 perfmon/Makefile | 12
185 perfmon/perfmon_activate.c | 265 ++++
186 perfmon/perfmon_attach.c | 474 ++++++++
187 perfmon/perfmon_ctx.c | 314 +++++
188 perfmon/perfmon_ctxsw.c | 342 ++++++
189 perfmon/perfmon_debugfs.c | 168 +++
190 perfmon/perfmon_dfl_smpl.c | 298 +++++
191 perfmon/perfmon_file.c | 751 +++++++++++++
192 perfmon/perfmon_fmt.c | 219 ++++
193 perfmon/perfmon_hotplug.c | 151 ++
194 perfmon/perfmon_init.c | 131 ++
195 perfmon/perfmon_intr.c | 648 ++++++++++++
196 perfmon/perfmon_msg.c | 229 ++++
197 perfmon/perfmon_pmu.c | 590 ++++++++++
198 perfmon/perfmon_priv.h | 182 +++
199 perfmon/perfmon_res.c | 450 ++++++++
200 perfmon/perfmon_rw.c | 733 +++++++++++++
201 perfmon/perfmon_sets.c | 873 ++++++++++++++++
202 perfmon/perfmon_smpl.c | 865 ++++++++++++++++
203 perfmon/perfmon_syscalls.c | 1060 +++++++++++++++++++
204 perfmon/perfmon_sysfs.c | 525 +++++++++
205 187 files changed, 27484 insertions(+), 1731 deletions(-)
206
207--- /dev/null
208+++ b/Documentation/ABI/testing/sysfs-perfmon
209@@ -0,0 +1,87 @@
210+What: /sys/kernel/perfmon
211+Date: Nov 2007
212+KernelVersion: 2.6.24
213+Contact: eranian@gmail.com
214+
215+Description: provide the configuration interface for the perfmon2 subsystem.
216+ The tree contains information about the detected hardware, current
217+ state of the subsystem as well as some configuration parameters.
218+
219+ The tree consists of the following entries:
220+
221+ /sys/kernel/perfmon/debug (read-write):
222+
223+ Enable perfmon2 debugging output via klogd. Debug messages produced during
224+ PMU interrupt handling are not controlled by this entry. The traces are rate-limited
225+ to avoid flooding the console. It is possible to change the throttling
226+ via /proc/sys/kernel/printk_ratelimit. The value is interpreted as a bitmask.
227+ Each bit enables a particular type of debug message. Refer to the file
228+ include/linux/perfmon_kern.h for more information.
229+
230+ /sys/kernel/perfmon/pmc_max_fast_arg (read-only):
231+
232+ Number of perfmon2 syscall arguments copied directly onto the
233+ stack (copy_from_user) for pfm_write_pmcs(). Copying to the stack avoids
234+ having to allocate a buffer. The unit is the number of pfarg_pmc_t
235+ structures.
236+
237+ /sys/kernel/perfmon/pmd_max_fast_arg (read-only):
238+
239+ Number of perfmon2 syscall arguments copied directly onto the
240+ stack (copy_from_user) for pfm_write_pmds()/pfm_read_pmds(). Copying
241+ to the stack avoids having to allocate a buffer. The unit is the number
242+ of pfarg_pmd_t structures.
243+
244+
245+ /sys/kernel/perfmon/reset_stats (write-only):
246+
247+ Reset the statistics collected by perfmon2. Stats are available
248+ per-cpu via debugfs.
249+
250+ /sys/kernel/perfmon/smpl_buffer_mem_cur (read-only):
251+
252+ Reports the amount of memory currently dedicated to sampling
253+ buffers by the kernel. The unit is byte.
254+
255+ /sys/kernel/perfmon/smpl_buffer_mem_max (read-write):
256+
257+ Maximum amount of kernel memory usable for sampling buffers. -1 means
258+ everything that is available. Unit is byte.
259+
260+ /sys/kernel/perfmon/smpl_buffer_mem_cur (read-only):
261+
262+ Current utilization of kernel memory in bytes.
263+
264+ /sys/kernel/perfmon/sys_group (read-write):
265+
266+ User group allowed to create a system-wide perfmon2 context (session).
267+ -1 means any group. This control will be kept until we find a package
268+ able to control capabilities via PAM.
269+
270+ /sys/kernel/perfmon/task_group (read-write):
271+
272+ User group allowed to create a per-thread context (session).
273+ -1 means any group. This control will be kept until we find a
274+ package able to control capabilities via PAM.
275+
276+ /sys/kernel/perfmon/sys_sessions_count (read-only):
277+
278+ Number of system-wide contexts currently attached to CPUs.
279+
280+ /sys/kernel/perfmon/task_sessions_count (read-only):
281+
282+ Number of per-thread contexts currently attached to threads.
283+
284+ /sys/kernel/perfmon/version (read-only):
285+
286+ Perfmon2 interface revision number.
287+
288+ /sys/kernel/perfmon/arg_mem_max (read-write):
289+
290+ Maximum size of vector arguments, expressed in bytes. Can be modified.
291+
292+ /sys/kernel/perfmon/mode (read-write):
293+
294+ Bitmask to enable/disable certain perfmon2 features.
295+ Currently defined:
296+ - bit 0: if set, then reserved bitfields are ignored on PMC writes
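
 Since all of the entries above are plain text files, a tool can query or tune
 them with ordinary file I/O. A minimal sketch follows (the entries used mirror
 the descriptions above; the value written to debug is just an example bit and
 writing requires sufficient privileges):

  #include <stdio.h>

  int main(void)
  {
      char buf[64];
      FILE *f;

      /* query the interface revision documented above */
      f = fopen("/sys/kernel/perfmon/version", "r");
      if (f) {
          if (fgets(buf, sizeof(buf), f))
              printf("perfmon2 version: %s", buf);
          fclose(f);
      }

      /* turn on debug bit 0 (bitmask semantics described above) */
      f = fopen("/sys/kernel/perfmon/debug", "w");
      if (f) {
          fprintf(f, "1\n");
          fclose(f);
      }
      return 0;
  }
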
297--- /dev/null
298+++ b/Documentation/ABI/testing/sysfs-perfmon-fmt
299@@ -0,0 +1,18 @@
300+What: /sys/kernel/perfmon/formats
301+Date: 2007
302+KernelVersion: 2.6.24
303+Contact: eranian@gmail.com
304+
305+Description: provide a description of the available perfmon2 custom sampling buffer formats
306+ which are implemented as independent kernel modules. Each format gets
307+ a subdir with a few entries.
308+
309+ The name of the subdir is the name of the sampling format. The same name
310+ must be passed to pfm_create_context() to use the format.
311+
312+ Each subdir XX contains the following entries:
313+
314+ /sys/kernel/perfmon/formats/XX/version (read-only):
315+
316+ Version number of the format in clear text and null terminated.
317+
318--- /dev/null
319+++ b/Documentation/ABI/testing/sysfs-perfmon-pmu
320@@ -0,0 +1,46 @@
321+What: /sys/kernel/perfmon/pmu_desc
322+Date: Nov 2007
323+KernelVersion: 2.6.24
324+Contact: eranian@gmail.com
325+
326+Description: provide information about the currently loaded PMU description module.
327+ The module contains the mapping of the actual performance counter registers
328+ onto the logical PMU exposed by perfmon. There is at most one PMU description
329+ module loaded at any time.
330+
331+ The sysfs PMU tree provides a description of the mapping for each register.
332+ There is one subdir per config and data register, along with an entry for the
333+ name of the PMU model.
334+
335+ The model entry is as follows:
336+
337+ /sys/kernel/perfmon/pmu_desc/model (read-only):
338+
339+ Name of the PMU model in clear text and zero terminated.
340+
341+ Then each logical PMU register, XX, gets a subtree with the following entries:
342+
343+ /sys/kernel/perfmon/pmu_desc/pm*XX/addr (read-only):
344+
345+ The physical address or index of the actual underlying hardware register.
346+ On Itanium, it corresponds to the index. On X86 processors, this is
347+ the actual MSR address.
348+
349+ /sys/kernel/perfmon/pmu_desc/pm*XX/dfl_val (read-only):
350+
351+ The default value of the register in hexadecimal.
352+
353+ /sys/kernel/perfmon/pmu_desc/pm*XX/name (read-only):
354+
355+ The name of the hardware register.
356+
357+ /sys/kernel/perfmon/pmu_desc/pm*XX/rsvd_msk (read-only):
358+
359+ The bitmask of reserved bits, i.e., bits which cannot be changed by
360+ applications. When a bit is set, it means the corresponding bit in the
361+ actual register is reserved.
362+
363+ /sys/kernel/perfmon/pmu_desc/pm*XX/width (read-only):
364+
365+ The width in bits of the register. This field is only relevant for counter
366+ registers.
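
 The per-register layout above lends itself to simple enumeration. The sketch
 below lists the logical registers and their hardware names; matching the pm
 prefix of the subdirectory names is inferred from the pm*XX naming shown above:

  #include <dirent.h>
  #include <stdio.h>
  #include <string.h>

  int main(void)
  {
      const char *base = "/sys/kernel/perfmon/pmu_desc";
      char path[256], name[64];
      struct dirent *de;
      FILE *f;
      DIR *d = opendir(base);

      if (!d)
          return 1;
      while ((de = readdir(d)) != NULL) {
          /* register subdirs are named pm*XX as described above */
          if (strncmp(de->d_name, "pm", 2) != 0)
              continue;
          snprintf(path, sizeof(path), "%s/%s/name", base, de->d_name);
          f = fopen(path, "r");
          if (f) {
              if (fgets(name, sizeof(name), f))
                  printf("%s -> %s", de->d_name, name);
              fclose(f);
          }
      }
      closedir(d);
      return 0;
  }
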
367--- a/Documentation/kernel-parameters.txt
368+++ b/Documentation/kernel-parameters.txt
369@@ -1698,6 +1698,9 @@ and is between 256 and 4096 characters.
370 Format: { 0 | 1 }
371 See arch/parisc/kernel/pdc_chassis.c
372
373+ perfmon_debug [PERFMON] Enables Perfmon debug messages. Needed
374+ to see traces of the early startup phase.
375+
376 pf. [PARIDE]
377 See Documentation/paride.txt.
378
379--- /dev/null
380+++ b/Documentation/perfmon2-debugfs.txt
381@@ -0,0 +1,126 @@
382+ The perfmon2 debug and statistics interface
383+ ------------------------------------------
384+ Stephane Eranian
385+ <eranian@gmail.com>
386+
387+The perfmon2 interface exports a set of statistics which are used to tune and
388+debug the implementation. The data is composed of a set of very simple metrics,
389+mostly aggregated counts and durations. They instrument key points in the
390+perfmon2 code, such as context switching and interrupt handling.
391+
392+The data is accessible via the debug filesystem (debugfs). Thus you need to
393+have the filesystem support enabled in your kernel. Furthermore, since 2.6.25,
394+the perfmon2 statistics interface is an optional component. It needs to be
395+explicitly enabled in the kernel config file (CONFIG_PERFMON_DEBUG_FS).
396+
397+To access the data, the debugfs filesystem must be mounted. Supposing the mount
398+point is /debugfs, you would need to do:
399+ $ mount -t debugfs none /debugfs
400+
401+The data is located under the perfmon subdirectory and is organized per CPU.
402+For each CPU, the same set of metrics is available, one metric per file in
403+clear ASCII text.
404+
405+The metrics are as follows:
406+
407+ ctxswin_count (read-only):
408+
409+ Number of PMU context switches in.
410+
411+ ctxswin_ns (read-only):
412+
413+ Number of nanoseconds spent in the PMU context switch in
414+ routine. Dividing this number by the value of ctxswin_count
415+ yields the average cost of the PMU context switch in.
416+
417+ ctxswout_count (read-only):
418+
419+ Number of PMU context switches out.
420+
421+ ctxswout_ns (read-only):
422+
423+ Number of nanoseconds spent in the PMU context switch out
424+ routine. Dividing this number by the value of ctxswout_count
425+ yields the average cost of the PMU context switch out.
426+
427+ fmt_handler_calls (read-only):
428+
429+ Number of calls to the sampling format routine that handles
430+ PMU interrupts, i.e., typically the routine that records a
431+ sample.
432+
433+ fmt_handler_ns (read-only):
434+
435+ Number of nanoseconds spent in the routine that handles PMU
436+ interrupts in the sampling format. Dividing this number by
437+ the number of calls provided by fmt_handler_calls yields the
438+ average time spent in this routine.
439+
440+ ovfl_intr_all_count (read-only):
441+
442+ Number of PMU interrupts received by the kernel.
443+
444+
445+ ovfl_intr_nmi_count (read-only):
446+
447+ Number of Non-Maskable Interrupts (NMI) received by the kernel
448+ for perfmon. This is relevant only on X86 hardware.
449+
450+ ovfl_intr_ns (read-only):
451+
452+ Number of nanoseconds spent in the perfmon2 PMU interrupt
453+ handler routine. Dividing this number by ovfl_intr_all_count
454+ yields the average time to handle one PMU interrupt.
455+
456+ ovfl_intr_regular_count (read-only):
457+
458+ Number of PMU interrupts which are actually processed by
459+ the perfmon interrupt handler. There may be spurious or replay
460+ interrupts.
461+
462+ ovfl_intr_replay_count (read-only):
463+
464+ Number of PMU interrupts which were replayed on context switch
465+ in or on event set switching. Interrupts get replayed when they
466+ were in flight at the time monitoring had to be stopped.
467+
468+ ovfl_intr_spurious_count (read-only):
469+
470+ Number of PMU interrupts which were dropped because there was
471+ no active context (session).
472+
473+ ovfl_notify_count (read-only):
474+
475+ Number of user level notifications sent. Notifications are
476+ appended as messages to the context queue. Notifications may
477+ be sent on PMU interrupts.
478+
479+ pfm_restart_count (read-only):
480+
481+ Number of times pfm_restart() is called.
482+
483+ reset_pmds_count (read-only):
484+
485+ Number of times pfm_reset_pmds() is called.
486+
487+ set_switch_count (read-only):
488+
489+ Number of event set switches.
490+
491+ set_switch_ns (read-only):
492+
493+ Number of nanoseconds spent in the set switching routine.
494+ Dividing this number by set_switch_count yields the average
495+ cost of switching sets.
496+
497+ handle_timeout_count (read-only):
498+
499+ Number of times the pfm_handle_timeout() routine is called.
500+ It is used for timeout-based set switching.
501+
502+ handle_work_count (read-only):
503+
504+ Number of times pfm_handle_work() is called. The routine
505+ handles asynchronous perfmon2 work for per-thread contexts
506+ (sessions).
507+
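
 As an example of how the *_count and *_ns pairs above combine into an average
 cost, the sketch below computes the average PMU context-switch-in cost for one
 CPU. It assumes debugfs is mounted on /debugfs as in the mount command above
 and that the per-CPU subdirectories are named cpu0, cpu1, ... (the exact
 directory names are an assumption; the text only states that the data is
 organized per CPU):

  #include <stdio.h>

  /* read one metric file for cpu0, returning 0 if it cannot be read */
  static unsigned long long read_metric(const char *name)
  {
      char path[128];
      unsigned long long v = 0;
      FILE *f;

      snprintf(path, sizeof(path), "/debugfs/perfmon/cpu0/%s", name);
      f = fopen(path, "r");
      if (f) {
          if (fscanf(f, "%llu", &v) != 1)
              v = 0;
          fclose(f);
      }
      return v;
  }

  int main(void)
  {
      unsigned long long ns = read_metric("ctxswin_ns");
      unsigned long long n  = read_metric("ctxswin_count");

      if (n)
          printf("avg ctxsw-in cost: %llu ns\n", ns / n);
      return 0;
  }
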
508--- /dev/null
509+++ b/Documentation/perfmon2.txt
510@@ -0,0 +1,213 @@
511+ The perfmon2 hardware monitoring interface
512+ ------------------------------------------
513+ Stephane Eranian
514+ <eranian@gmail.com>
515+
516+I/ Introduction
517+
518+ The perfmon2 interface provides access to the hardware performance counters of
519+ major processors. Nowadays, all processors implement some flavors of performance
520+ counters which capture micro-architectural level information such as the number
521+ of elapsed cycles, number of cache misses, and so on.
522+
523+ The interface is implemented as a set of new system calls and a set of config files
524+ in /sys.
525+
526+ It is possible to monitor a single thread or a CPU. In either mode, applications
527+ can count or collect samples. System-wide monitoring is supported by running a
528+ monitoring session on each CPU. The interface supports event-based sampling where the
529+ sampling period is expressed as a number of occurrences of an event, instead of just a
530+ timeout. This approach provides much better granularity and flexibility.
531+
532+ For performance reasons, it is possible to use a kernel-level sampling buffer to minimize
533+ the overhead incurred by sampling. The format of the buffer, i.e., what is recorded, how
534+ it is recorded, and how it is exported to user-land is controlled by a kernel module called
535+ a custom sampling format. The current implementation comes with a default format but
536+ it is possible to create additional formats. There is an in-kernel registration
537+ interface for formats. Each format is identified by a simple string which a tool
538+ can pass when a monitoring session is created.
539+
540+ The interface also provides support for event set and multiplexing to work around
541+ hardware limitations in the number of available counters or in how events can be
542+ combined. Each set defines as many counters as the hardware can support. The kernel
543+ then multiplexes the sets. The interface supports time-based switching but also
544+ overflow based switching, i.e., after n overflows of designated counters.
545+
546+ Applications never manipulate the actual performance counter registers. Instead they see
547+ a logical Performance Monitoring Unit (PMU) composed of a set of config registers (PMC)
548+ and a set of data registers (PMD). Note that PMD are not necessarily counters, they
549+ can be buffers. The logical PMU is then mapped onto the actual PMU using a mapping
550+ table which is implemented as a kernel module. The mapping is chosen once for each
551+ new processor. It is visible in /sys/kernel/perfmon/pmu_desc. The kernel module
552+ is automatically loaded on first use.
553+
554+ A monitoring session, or context, is uniquely identified by a file descriptor
555+ obtained when the context is created. File sharing semantics apply to access
556+ the context inside a process. A context is never inherited across fork. The file
557+ descriptor can be used to receive counter overflow notifications or when the
558+ sampling buffer is full. It is possible to use poll/select on the descriptor
559+ to wait for notifications from multiple contexts. Similarly, the descriptor
560+ supports asynchronous notification via SIGIO.
561+
562+ Counters are always exported as being 64-bit wide regardless of what the underlying
563+ hardware implements.
564+
565+II/ Kernel compilation
566+
567+ To enable perfmon2, you need to enable CONFIG_PERFMON
568+
569+III/ OProfile interactions
570+
571+ The set of features offered by perfmon2 is rich enough to support migrating
572+ Oprofile on top of it. That means that PMU programming and low-level interrupt
573+ handling could be done by perfmon2. The Oprofile sampling buffer management code
574+ in the kernel, as well as how samples are exported to users, could remain unchanged through
575+ the use of a custom sampling buffer format. This is how Oprofile works on Itanium.
576+
577+ The current interactions with Oprofile are:
578+ - on X86: Both subsystems can be compiled into the same kernel. There is enforced
579+ mutual exclusion between the two subsystems. When there is an Oprofile
580+ session, no perfmon2 session can exist and vice-versa. Perfmon2 sessions
581+ here include both per-thread and system-wide sessions.
582+
583+ - On IA-64: Oprofile works on top of perfmon2. Oprofile being a system-wide monitoring
584+ tool, the regular per-thread vs. system-wide session restrictions apply.
585+
586+ - on PPC: no integration yet. You need to enable/disable one of the two subsystems
587+ - on MIPS: no integration yet. You need to enable/disable one of the two subsystems
588+
589+IV/ User tools
590+
591+ We have released a simple monitoring tool to demonstrate the features of the
592+ interface. The tool is called pfmon and it comes with a simple helper library
593+ called libpfm. The library comes with a set of examples to show how to use the
594+ kernel perfmon2 interface. Visit http://perfmon2.sf.net for details.
595+
596+ There may be other tools available for perfmon2.
597+
598+V/ How to program?
599+
600+ The best way to learn how to program perfmon2 is to take a look at the source
601+ code for the examples in libpfm. The source code is available from:
602+ http://perfmon2.sf.net
603+
604+VI/ System calls overview
605+
606+ The interface is implemented by the following system calls:
607+
608+ * int pfm_create_context(pfarg_ctx_t *ctx, char *fmt, void *arg, size_t arg_size)
609+
610+ This function creates a perfmon2 context. The type of context is per-thread by
611+ default unless PFM_FL_SYSTEM_WIDE is passed in ctx. The sampling format name
612+ is passed in fmt. Arguments to the format are passed in arg which is of size
613+ arg_size. Upon successful return, the file descriptor identifying the context
614+ is returned.
615+
616+ * int pfm_write_pmds(int fd, pfarg_pmd_t *pmds, int n)
617+
618+ This function is used to program the PMD registers. It is possible to pass
619+ vectors of PMDs.
620+
621+ * int pfm_write_pmcs(int fd, pfarg_pmc_t *pmcs, int n)
622+
623+ This function is used to program the PMC registers. It is possible to pass
624+ vectors of PMCs.
625+
626+ * int pfm_read_pmds(int fd, pfarg_pmd_t *pmds, int n)
627+
628+ This function is used to read the PMD registers. It is possible to pass
629+ vectors of PMDs.
630+
631+ * int pfm_load_context(int fd, pfarg_load_t *load)
632+
633+ This function is used to attach the context to a thread or CPU.
634+ Thread means kernel-visible thread (NPTL). The thread identification
635+ as obtained by gettid must be passed to load->load_target.
636+
637+ To operate on another thread (not self), it is mandatory that the thread
638+ be stopped via ptrace().
639+
640+ To attach to a CPU, the CPU number must be specified in load->load_target
641+ AND the call must be issued on that CPU. To monitor a CPU, a thread MUST
642+ be pinned on that CPU.
643+
644+ Until the context is attached, the actual counters are not accessed.
645+
646+ * int pfm_unload_context(int fd)
647+
648+ The context is detached from the thread or CPU it was attached to.
649+ As a consequence monitoring is stopped.
650+
651+ When monitoring another thread, the thread MUST be stopped via ptrace()
652+ for this function to succeed.
653+
654+ * int pfm_start(int fd, pfarg_start_t *st)
655+
656+ Start monitoring. The context must be attached for this function to succeed.
657+ Optionally, it is possible to specify the event set on which to start using the
658+ st argument, otherwise just pass NULL.
659+
660+ When monitoring another thread, the thread MUST be stopped via ptrace()
661+ for this function to succeed.
662+
663+ * int pfm_stop(int fd)
664+
665+ Stop monitoring. The context must be attached for this function to succeed.
666+
667+ When monitoring another thread, the thread MUST be stopped via ptrace()
668+ for this function to succeed.
669+
670+
671+ * int pfm_create_evtsets(int fd, pfarg_setdesc_t *sets, int n)
672+
673+ This function is used to create or change event sets. By default set 0 exists.
674+ It is possible to create/change multiple sets in one call.
675+
676+ The context must be detached for this call to succeed.
677+
678+ Sets are identified by a 16-bit integer. They are sorted based on this
679+ identifier and switching occurs in a round-robin fashion.
680+
681+ * int pfm_delete_evtsets(int fd, pfarg_setdesc_t *sets, int n)
682+
683+ Delete event sets. The context must be detached for this call to succeed.
684+
685+
686+ * int pfm_getinfo_evtsets(int fd, pfarg_setinfo_t *sets, int n)
687+
688+ Retrieve information about event sets. In particular it is possible
689+ to get the number of activations of a set. It is possible to retrieve
690+ information about multiple sets in one call.
691+
692+
693+ * int pfm_restart(int fd)
694+
695+ Indicate to the kernel that the application is done processing an overflow
696+ notification. A consequence of this call could be that monitoring resumes.
697+
698+ * int read(fd, pfm_msg_t *msg, sizeof(pfm_msg_t))
699+
700+ The regular read() system call can be used with the context file descriptor to
701+ receive overflow notification messages. Non-blocking read() is supported.
702+
703+ Each message carries information about the overflow such as which counter overflowed
704+ and where the program was (interrupted instruction pointer).
705+
706+ * int close(int fd)
707+
708+ To destroy a context, the regular close() system call is used.
709+
710+
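
 Putting these calls together, a minimal counting sketch for the calling thread
 could look like the following. It assumes the user-level syscall wrappers and
 pfarg_* structure declarations installed by libpfm (the <perfmon/perfmon.h>
 header name is an assumption), uses the load_target, reg_num and reg_value
 field names exactly as described above, takes the PMU-specific event encoding
 as a parameter (libpfm normally computes it), and picks the PMC4/PMD4 pairing
 purely as an example:

  #include <stdint.h>
  #include <stdio.h>
  #include <string.h>
  #include <unistd.h>
  #include <sys/syscall.h>
  #include <perfmon/perfmon.h>   /* assumed: libpfm wrappers and pfarg_* types */

  int count_one_event(uint64_t event_code)
  {
      pfarg_ctx_t  ctx;
      pfarg_pmc_t  pmc;
      pfarg_pmd_t  pmd;
      pfarg_load_t load;
      int fd;

      memset(&ctx, 0, sizeof(ctx));
      memset(&pmc, 0, sizeof(pmc));
      memset(&pmd, 0, sizeof(pmd));
      memset(&load, 0, sizeof(load));

      /* per-thread context (default); no sampling format needed for counting,
       * passing NULL for the format name is assumed to be accepted */
      fd = pfm_create_context(&ctx, NULL, NULL, 0);
      if (fd < 0)
          return -1;

      pmc.reg_num   = 4;           /* example: config register PMC4 */
      pmc.reg_value = event_code;  /* PMU-specific event encoding */
      pmd.reg_num   = 4;           /* counter driven by PMC4 */
      pmd.reg_value = 0;

      /* attach to self: kernel-visible thread id, as returned by gettid() */
      load.load_target = syscall(SYS_gettid);

      if (pfm_write_pmcs(fd, &pmc, 1)
          || pfm_write_pmds(fd, &pmd, 1)
          || pfm_load_context(fd, &load)
          || pfm_start(fd, NULL))
          goto fail;

      /* ... run the code to be measured ... */

      if (pfm_stop(fd) || pfm_read_pmds(fd, &pmd, 1))
          goto fail;

      printf("counter value: %llu\n", (unsigned long long)pmd.reg_value);
      close(fd);
      return 0;
  fail:
      close(fd);
      return -1;
  }
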
711+VII/ /sys interface overview
712+
713+ Refer to Documentation/ABI/testing/sysfs-perfmon-* for a detailed description
714+ of the sysfs interface of perfmon2.
715+
716+VIII/ debugfs interface overview
717+
718+ Refer to Documentation/perfmon2-debugfs.txt for a detailed description of the
719+ debug and statistics interface of perfmon2.
720+
721+IX/ Documentation
722+
723+ Visit http://perfmon2.sf.net
724--- a/MAINTAINERS
725+++ b/MAINTAINERS
726@@ -3244,6 +3244,14 @@ M: balbir@linux.vnet.ibm.com
727 L: linux-kernel@vger.kernel.org
728 S: Maintained
729
730+PERFMON SUBSYSTEM
731+P: Stephane Eranian
732+M: eranian@gmail.com
733+L: perfmon2-devel@lists.sf.net
734+W: http://perfmon2.sf.net
735+T: git kernel.org:/pub/scm/linux/kernel/git/eranian/linux-2.6
736+S: Maintained
737+
738 PERSONALITY HANDLING
739 P: Christoph Hellwig
740 M: hch@infradead.org
741--- a/Makefile
742+++ b/Makefile
743@@ -651,6 +651,7 @@ export mod_strip_cmd
744 ifeq ($(KBUILD_EXTMOD),)
745 core-y += kernel/ mm/ fs/ ipc/ security/ crypto/ block/
746 core-$(CONFIG_KDB) += kdb/
747+core-$(CONFIG_PERFMON) += perfmon/
748
749 vmlinux-dirs := $(patsubst %/,%,$(filter %/, $(init-y) $(init-m) \
750 $(core-y) $(core-m) $(drivers-y) $(drivers-m) \
751--- a/arch/ia64/Kconfig
752+++ b/arch/ia64/Kconfig
753@@ -479,14 +479,6 @@ config IA64_CPE_MIGRATE
754 build this functionality as a kernel loadable module. Installing
755 the module will turn on the functionality.
756
757-config PERFMON
758- bool "Performance monitor support"
759- help
760- Selects whether support for the IA-64 performance monitor hardware
761- is included in the kernel. This makes some kernel data-structures a
762- little bigger and slows down execution a bit, but it is generally
763- a good idea to turn this on. If you're unsure, say Y.
764-
765 config IA64_PALINFO
766 tristate "/proc/pal support"
767 help
768@@ -558,6 +550,8 @@ source "drivers/firmware/Kconfig"
769
770 source "fs/Kconfig.binfmt"
771
772+source "arch/ia64/perfmon/Kconfig"
773+
774 endmenu
775
776 menu "Power management and ACPI"
777--- a/arch/ia64/Makefile
778+++ b/arch/ia64/Makefile
779@@ -57,6 +57,7 @@ core-$(CONFIG_IA64_GENERIC) += arch/ia6
780 core-$(CONFIG_IA64_HP_ZX1) += arch/ia64/dig/
781 core-$(CONFIG_IA64_HP_ZX1_SWIOTLB) += arch/ia64/dig/
782 core-$(CONFIG_IA64_SGI_SN2) += arch/ia64/sn/
783+core-$(CONFIG_PERFMON) += arch/ia64/perfmon/
784 core-$(CONFIG_IA64_SGI_UV) += arch/ia64/uv/
785 core-$(CONFIG_KVM) += arch/ia64/kvm/
786
787--- a/arch/ia64/configs/generic_defconfig
788+++ b/arch/ia64/configs/generic_defconfig
789@@ -209,7 +209,6 @@ CONFIG_IA32_SUPPORT=y
790 CONFIG_COMPAT=y
791 CONFIG_COMPAT_FOR_U64_ALIGNMENT=y
792 CONFIG_IA64_MCA_RECOVERY=y
793-CONFIG_PERFMON=y
794 CONFIG_IA64_PALINFO=y
795 # CONFIG_IA64_MC_ERR_INJECT is not set
796 CONFIG_SGI_SN=y
797@@ -234,6 +233,16 @@ CONFIG_BINFMT_ELF=y
798 CONFIG_BINFMT_MISC=m
799
800 #
801+# Hardware Performance Monitoring support
802+#
803+CONFIG_PERFMON=y
804+CONFIG_IA64_PERFMON_COMPAT=y
805+CONFIG_IA64_PERFMON_GENERIC=m
806+CONFIG_IA64_PERFMON_ITANIUM=y
807+CONFIG_IA64_PERFMON_MCKINLEY=y
808+CONFIG_IA64_PERFMON_MONTECITO=y
809+
810+#
811 # Power management and ACPI
812 #
813 CONFIG_PM=y
814--- a/arch/ia64/include/asm/Kbuild
815+++ b/arch/ia64/include/asm/Kbuild
816@@ -5,10 +5,12 @@ header-y += fpu.h
817 header-y += fpswa.h
818 header-y += ia64regs.h
819 header-y += intel_intrin.h
820-header-y += perfmon_default_smpl.h
821 header-y += ptrace_offsets.h
822 header-y += rse.h
823 header-y += ucontext.h
824+header-y += perfmon.h
825+header-y += perfmon_compat.h
826+header-y += perfmon_default_smpl.h
827
828 unifdef-y += gcc_intrin.h
829 unifdef-y += intrinsics.h
830--- a/arch/ia64/include/asm/hw_irq.h
831+++ b/arch/ia64/include/asm/hw_irq.h
832@@ -67,9 +67,9 @@ extern int ia64_last_device_vector;
833 #define IA64_NUM_DEVICE_VECTORS (IA64_LAST_DEVICE_VECTOR - IA64_FIRST_DEVICE_VECTOR + 1)
834
835 #define IA64_MCA_RENDEZ_VECTOR 0xe8 /* MCA rendez interrupt */
836-#define IA64_PERFMON_VECTOR 0xee /* performance monitor interrupt vector */
837 #define IA64_TIMER_VECTOR 0xef /* use highest-prio group 15 interrupt for timer */
838 #define IA64_MCA_WAKEUP_VECTOR 0xf0 /* MCA wakeup (must be >MCA_RENDEZ_VECTOR) */
839+#define IA64_PERFMON_VECTOR 0xf1 /* performance monitor interrupt vector */
840 #define IA64_IPI_LOCAL_TLB_FLUSH 0xfc /* SMP flush local TLB */
841 #define IA64_IPI_RESCHEDULE 0xfd /* SMP reschedule */
842 #define IA64_IPI_VECTOR 0xfe /* inter-processor interrupt vector */
843--- a/arch/ia64/include/asm/perfmon.h
844+++ b/arch/ia64/include/asm/perfmon.h
845@@ -1,279 +1,59 @@
846 /*
847- * Copyright (C) 2001-2003 Hewlett-Packard Co
848- * Stephane Eranian <eranian@hpl.hp.com>
849- */
850-
851-#ifndef _ASM_IA64_PERFMON_H
852-#define _ASM_IA64_PERFMON_H
853-
854-/*
855- * perfmon comamnds supported on all CPU models
856- */
857-#define PFM_WRITE_PMCS 0x01
858-#define PFM_WRITE_PMDS 0x02
859-#define PFM_READ_PMDS 0x03
860-#define PFM_STOP 0x04
861-#define PFM_START 0x05
862-#define PFM_ENABLE 0x06 /* obsolete */
863-#define PFM_DISABLE 0x07 /* obsolete */
864-#define PFM_CREATE_CONTEXT 0x08
865-#define PFM_DESTROY_CONTEXT 0x09 /* obsolete use close() */
866-#define PFM_RESTART 0x0a
867-#define PFM_PROTECT_CONTEXT 0x0b /* obsolete */
868-#define PFM_GET_FEATURES 0x0c
869-#define PFM_DEBUG 0x0d
870-#define PFM_UNPROTECT_CONTEXT 0x0e /* obsolete */
871-#define PFM_GET_PMC_RESET_VAL 0x0f
872-#define PFM_LOAD_CONTEXT 0x10
873-#define PFM_UNLOAD_CONTEXT 0x11
874-
875-/*
876- * PMU model specific commands (may not be supported on all PMU models)
877- */
878-#define PFM_WRITE_IBRS 0x20
879-#define PFM_WRITE_DBRS 0x21
880-
881-/*
882- * context flags
883- */
884-#define PFM_FL_NOTIFY_BLOCK 0x01 /* block task on user level notifications */
885-#define PFM_FL_SYSTEM_WIDE 0x02 /* create a system wide context */
886-#define PFM_FL_OVFL_NO_MSG 0x80 /* do not post overflow/end messages for notification */
887-
888-/*
889- * event set flags
890- */
891-#define PFM_SETFL_EXCL_IDLE 0x01 /* exclude idle task (syswide only) XXX: DO NOT USE YET */
892-
893-/*
894- * PMC flags
895- */
896-#define PFM_REGFL_OVFL_NOTIFY 0x1 /* send notification on overflow */
897-#define PFM_REGFL_RANDOM 0x2 /* randomize sampling interval */
898-
899-/*
900- * PMD/PMC/IBR/DBR return flags (ignored on input)
901+ * Copyright (c) 2001-2007 Hewlett-Packard Development Company, L.P.
902+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
903 *
904- * Those flags are used on output and must be checked in case EAGAIN is returned
905- * by any of the calls using a pfarg_reg_t or pfarg_dbreg_t structure.
906- */
907-#define PFM_REG_RETFL_NOTAVAIL (1UL<<31) /* set if register is implemented but not available */
908-#define PFM_REG_RETFL_EINVAL (1UL<<30) /* set if register entry is invalid */
909-#define PFM_REG_RETFL_MASK (PFM_REG_RETFL_NOTAVAIL|PFM_REG_RETFL_EINVAL)
910-
911-#define PFM_REG_HAS_ERROR(flag) (((flag) & PFM_REG_RETFL_MASK) != 0)
912-
913-typedef unsigned char pfm_uuid_t[16]; /* custom sampling buffer identifier type */
914-
915-/*
916- * Request structure used to define a context
917- */
918-typedef struct {
919- pfm_uuid_t ctx_smpl_buf_id; /* which buffer format to use (if needed) */
920- unsigned long ctx_flags; /* noblock/block */
921- unsigned short ctx_nextra_sets; /* number of extra event sets (you always get 1) */
922- unsigned short ctx_reserved1; /* for future use */
923- int ctx_fd; /* return arg: unique identification for context */
924- void *ctx_smpl_vaddr; /* return arg: virtual address of sampling buffer, is used */
925- unsigned long ctx_reserved2[11];/* for future use */
926-} pfarg_context_t;
927-
928-/*
929- * Request structure used to write/read a PMC or PMD
930- */
931-typedef struct {
932- unsigned int reg_num; /* which register */
933- unsigned short reg_set; /* event set for this register */
934- unsigned short reg_reserved1; /* for future use */
935-
936- unsigned long reg_value; /* initial pmc/pmd value */
937- unsigned long reg_flags; /* input: pmc/pmd flags, return: reg error */
938-
939- unsigned long reg_long_reset; /* reset after buffer overflow notification */
940- unsigned long reg_short_reset; /* reset after counter overflow */
941-
942- unsigned long reg_reset_pmds[4]; /* which other counters to reset on overflow */
943- unsigned long reg_random_seed; /* seed value when randomization is used */
944- unsigned long reg_random_mask; /* bitmask used to limit random value */
945- unsigned long reg_last_reset_val;/* return: PMD last reset value */
946-
947- unsigned long reg_smpl_pmds[4]; /* which pmds are accessed when PMC overflows */
948- unsigned long reg_smpl_eventid; /* opaque sampling event identifier */
949-
950- unsigned long reg_reserved2[3]; /* for future use */
951-} pfarg_reg_t;
952-
953-typedef struct {
954- unsigned int dbreg_num; /* which debug register */
955- unsigned short dbreg_set; /* event set for this register */
956- unsigned short dbreg_reserved1; /* for future use */
957- unsigned long dbreg_value; /* value for debug register */
958- unsigned long dbreg_flags; /* return: dbreg error */
959- unsigned long dbreg_reserved2[1]; /* for future use */
960-} pfarg_dbreg_t;
961-
962-typedef struct {
963- unsigned int ft_version; /* perfmon: major [16-31], minor [0-15] */
964- unsigned int ft_reserved; /* reserved for future use */
965- unsigned long reserved[4]; /* for future use */
966-} pfarg_features_t;
967-
968-typedef struct {
969- pid_t load_pid; /* process to load the context into */
970- unsigned short load_set; /* first event set to load */
971- unsigned short load_reserved1; /* for future use */
972- unsigned long load_reserved2[3]; /* for future use */
973-} pfarg_load_t;
974-
975-typedef struct {
976- int msg_type; /* generic message header */
977- int msg_ctx_fd; /* generic message header */
978- unsigned long msg_ovfl_pmds[4]; /* which PMDs overflowed */
979- unsigned short msg_active_set; /* active set at the time of overflow */
980- unsigned short msg_reserved1; /* for future use */
981- unsigned int msg_reserved2; /* for future use */
982- unsigned long msg_tstamp; /* for perf tuning/debug */
983-} pfm_ovfl_msg_t;
984-
985-typedef struct {
986- int msg_type; /* generic message header */
987- int msg_ctx_fd; /* generic message header */
988- unsigned long msg_tstamp; /* for perf tuning */
989-} pfm_end_msg_t;
990-
991-typedef struct {
992- int msg_type; /* type of the message */
993- int msg_ctx_fd; /* unique identifier for the context */
994- unsigned long msg_tstamp; /* for perf tuning */
995-} pfm_gen_msg_t;
996-
997-#define PFM_MSG_OVFL 1 /* an overflow happened */
998-#define PFM_MSG_END 2 /* task to which context was attached ended */
999-
1000-typedef union {
1001- pfm_ovfl_msg_t pfm_ovfl_msg;
1002- pfm_end_msg_t pfm_end_msg;
1003- pfm_gen_msg_t pfm_gen_msg;
1004-} pfm_msg_t;
1005-
1006-/*
1007- * Define the version numbers for both perfmon as a whole and the sampling buffer format.
1008+ * This file contains Itanium Processor Family specific definitions
1009+ * for the perfmon interface.
1010+ *
1011+ * This program is free software; you can redistribute it and/or
1012+ * modify it under the terms of version 2 of the GNU General Public
1013+ * License as published by the Free Software Foundation.
1014+ *
1015+ * This program is distributed in the hope that it will be useful,
1016+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
1017+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
1018+ * General Public License for more details.
1019+ *
1020+ * You should have received a copy of the GNU General Public License
1021+ * along with this program; if not, write to the Free Software
1022+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
1023+ * 02111-1307 USA
1024 */
1025-#define PFM_VERSION_MAJ 2U
1026-#define PFM_VERSION_MIN 0U
1027-#define PFM_VERSION (((PFM_VERSION_MAJ&0xffff)<<16)|(PFM_VERSION_MIN & 0xffff))
1028-#define PFM_VERSION_MAJOR(x) (((x)>>16) & 0xffff)
1029-#define PFM_VERSION_MINOR(x) ((x) & 0xffff)
1030-
1031+#ifndef _ASM_IA64_PERFMON_H_
1032+#define _ASM_IA64_PERFMON_H_
1033
1034 /*
1035- * miscellaneous architected definitions
1036+ * arch-specific user visible interface definitions
1037 */
1038-#define PMU_FIRST_COUNTER 4 /* first counting monitor (PMC/PMD) */
1039-#define PMU_MAX_PMCS 256 /* maximum architected number of PMC registers */
1040-#define PMU_MAX_PMDS 256 /* maximum architected number of PMD registers */
1041-
1042-#ifdef __KERNEL__
1043-
1044-extern long perfmonctl(int fd, int cmd, void *arg, int narg);
1045-
1046-typedef struct {
1047- void (*handler)(int irq, void *arg, struct pt_regs *regs);
1048-} pfm_intr_handler_desc_t;
1049-
1050-extern void pfm_save_regs (struct task_struct *);
1051-extern void pfm_load_regs (struct task_struct *);
1052
1053-extern void pfm_exit_thread(struct task_struct *);
1054-extern int pfm_use_debug_registers(struct task_struct *);
1055-extern int pfm_release_debug_registers(struct task_struct *);
1056-extern void pfm_syst_wide_update_task(struct task_struct *, unsigned long info, int is_ctxswin);
1057-extern void pfm_inherit(struct task_struct *task, struct pt_regs *regs);
1058-extern void pfm_init_percpu(void);
1059-extern void pfm_handle_work(void);
1060-extern int pfm_install_alt_pmu_interrupt(pfm_intr_handler_desc_t *h);
1061-extern int pfm_remove_alt_pmu_interrupt(pfm_intr_handler_desc_t *h);
1062+#define PFM_ARCH_MAX_PMCS (256+64)
1063+#define PFM_ARCH_MAX_PMDS (256+64)
1064
1065-
1066-
1067-/*
1068- * Reset PMD register flags
1069- */
1070-#define PFM_PMD_SHORT_RESET 0
1071-#define PFM_PMD_LONG_RESET 1
1072-
1073-typedef union {
1074- unsigned int val;
1075- struct {
1076- unsigned int notify_user:1; /* notify user program of overflow */
1077- unsigned int reset_ovfl_pmds:1; /* reset overflowed PMDs */
1078- unsigned int block_task:1; /* block monitored task on kernel exit */
1079- unsigned int mask_monitoring:1; /* mask monitors via PMCx.plm */
1080- unsigned int reserved:28; /* for future use */
1081- } bits;
1082-} pfm_ovfl_ctrl_t;
1083-
1084-typedef struct {
1085- unsigned char ovfl_pmd; /* index of overflowed PMD */
1086- unsigned char ovfl_notify; /* =1 if monitor requested overflow notification */
1087- unsigned short active_set; /* event set active at the time of the overflow */
1088- pfm_ovfl_ctrl_t ovfl_ctrl; /* return: perfmon controls to set by handler */
1089-
1090- unsigned long pmd_last_reset; /* last reset value of of the PMD */
1091- unsigned long smpl_pmds[4]; /* bitmask of other PMD of interest on overflow */
1092- unsigned long smpl_pmds_values[PMU_MAX_PMDS]; /* values for the other PMDs of interest */
1093- unsigned long pmd_value; /* current 64-bit value of the PMD */
1094- unsigned long pmd_eventid; /* eventid associated with PMD */
1095-} pfm_ovfl_arg_t;
1096-
1097-
1098-typedef struct {
1099- char *fmt_name;
1100- pfm_uuid_t fmt_uuid;
1101- size_t fmt_arg_size;
1102- unsigned long fmt_flags;
1103-
1104- int (*fmt_validate)(struct task_struct *task, unsigned int flags, int cpu, void *arg);
1105- int (*fmt_getsize)(struct task_struct *task, unsigned int flags, int cpu, void *arg, unsigned long *size);
1106- int (*fmt_init)(struct task_struct *task, void *buf, unsigned int flags, int cpu, void *arg);
1107- int (*fmt_handler)(struct task_struct *task, void *buf, pfm_ovfl_arg_t *arg, struct pt_regs *regs, unsigned long stamp);
1108- int (*fmt_restart)(struct task_struct *task, pfm_ovfl_ctrl_t *ctrl, void *buf, struct pt_regs *regs);
1109- int (*fmt_restart_active)(struct task_struct *task, pfm_ovfl_ctrl_t *ctrl, void *buf, struct pt_regs *regs);
1110- int (*fmt_exit)(struct task_struct *task, void *buf, struct pt_regs *regs);
1111-
1112- struct list_head fmt_list;
1113-} pfm_buffer_fmt_t;
1114-
1115-extern int pfm_register_buffer_fmt(pfm_buffer_fmt_t *fmt);
1116-extern int pfm_unregister_buffer_fmt(pfm_uuid_t uuid);
1117+#define PFM_ARCH_PMD_STK_ARG 8
1118+#define PFM_ARCH_PMC_STK_ARG 8
1119
1120 /*
1121- * perfmon interface exported to modules
1122+ * Itanium specific context flags
1123+ *
1124+ * bits[00-15]: generic flags (see asm/perfmon.h)
1125+ * bits[16-31]: arch-specific flags
1126 */
1127-extern int pfm_mod_read_pmds(struct task_struct *, void *req, unsigned int nreq, struct pt_regs *regs);
1128-extern int pfm_mod_write_pmcs(struct task_struct *, void *req, unsigned int nreq, struct pt_regs *regs);
1129-extern int pfm_mod_write_ibrs(struct task_struct *task, void *req, unsigned int nreq, struct pt_regs *regs);
1130-extern int pfm_mod_write_dbrs(struct task_struct *task, void *req, unsigned int nreq, struct pt_regs *regs);
1131+#define PFM_ITA_FL_INSECURE 0x10000 /* clear psr.sp on non system, non self */
1132
1133 /*
1134- * describe the content of the local_cpu_date->pfm_syst_info field
1135+ * Itanium specific public event set flags (set_flags)
1136+ *
1137+ * event set flags layout:
1138+ * bits[00-15] : generic flags
1139+ * bits[16-31] : arch-specific flags
1140 */
1141-#define PFM_CPUINFO_SYST_WIDE 0x1 /* if set a system wide session exists */
1142-#define PFM_CPUINFO_DCR_PP 0x2 /* if set the system wide session has started */
1143-#define PFM_CPUINFO_EXCL_IDLE 0x4 /* the system wide session excludes the idle task */
1144+#define PFM_ITA_SETFL_EXCL_INTR 0x10000 /* exclude interrupt execution */
1145+#define PFM_ITA_SETFL_INTR_ONLY 0x20000 /* include only interrupt execution */
1146+#define PFM_ITA_SETFL_IDLE_EXCL 0x40000 /* stop monitoring in idle loop */
1147
1148 /*
1149- * sysctl control structure. visible to sampling formats
1150+ * compatibility for version v2.0 of the interface
1151 */
1152-typedef struct {
1153- int debug; /* turn on/off debugging via syslog */
1154- int debug_ovfl; /* turn on/off debug printk in overflow handler */
1155- int fastctxsw; /* turn on/off fast (unsecure) ctxsw */
1156- int expert_mode; /* turn on/off value checking */
1157-} pfm_sysctl_t;
1158-extern pfm_sysctl_t pfm_sysctl;
1159-
1160-
1161-#endif /* __KERNEL__ */
1162+#include <asm/perfmon_compat.h>
1163
1164-#endif /* _ASM_IA64_PERFMON_H */
1165+#endif /* _ASM_IA64_PERFMON_H_ */
1166--- /dev/null
1167+++ b/arch/ia64/include/asm/perfmon_compat.h
1168@@ -0,0 +1,167 @@
1169+/*
1170+ * Copyright (c) 2001-2006 Hewlett-Packard Development Company, L.P.
1171+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
1172+ *
1173+ * This header file contains perfmon interface definitions
1174+ * that are now obsolete and should be dropped in favor
1175+ * of their equivalent functions as explained below.
1176+ *
1177+ * This program is free software; you can redistribute it and/or
1178+ * modify it under the terms of version 2 of the GNU General Public
1179+ * License as published by the Free Software Foundation.
1180+ *
1181+ * This program is distributed in the hope that it will be useful,
1182+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
1183+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
1184+ * General Public License for more details.
1185+ *
1186+ * You should have received a copy of the GNU General Public License
1187+ * along with this program; if not, write to the Free Software
1188+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
1189+ * 02111-1307 USA
1190+ */
1191+
1192+#ifndef _ASM_IA64_PERFMON_COMPAT_H_
1193+#define _ASM_IA64_PERFMON_COMPAT_H_
1194+
1195+/*
1196+ * custom sampling buffer identifier type
1197+ */
1198+typedef __u8 pfm_uuid_t[16];
1199+
1200+/*
1201+ * obsolete perfmon commands. Supported only on IA-64 for
1202+ * backward compatibility reasons with perfmon v2.0.
1203+ */
1204+#define PFM_WRITE_PMCS 0x01 /* use pfm_write_pmcs */
1205+#define PFM_WRITE_PMDS 0x02 /* use pfm_write_pmds */
1206+#define PFM_READ_PMDS 0x03 /* use pfm_read_pmds */
1207+#define PFM_STOP 0x04 /* use pfm_stop */
1208+#define PFM_START 0x05 /* use pfm_start */
1209+#define PFM_ENABLE 0x06 /* obsolete */
1210+#define PFM_DISABLE 0x07 /* obsolete */
1211+#define PFM_CREATE_CONTEXT 0x08 /* use pfm_create_context */
1212+#define PFM_DESTROY_CONTEXT 0x09 /* use close() */
1213+#define PFM_RESTART 0x0a /* use pfm_restart */
1214+#define PFM_PROTECT_CONTEXT 0x0b /* obsolete */
1215+#define PFM_GET_FEATURES 0x0c /* use /proc/sys/perfmon */
1216+#define PFM_DEBUG 0x0d /* /proc/sys/kernel/perfmon/debug */
1217+#define PFM_UNPROTECT_CONTEXT 0x0e /* obsolete */
1218+#define PFM_GET_PMC_RESET_VAL 0x0f /* use /proc/perfmon_map */
1219+#define PFM_LOAD_CONTEXT 0x10 /* use pfm_load_context */
1220+#define PFM_UNLOAD_CONTEXT 0x11 /* use pfm_unload_context */
1221+
1222+/*
1223+ * PMU model specific commands (may not be supported on all PMU models)
1224+ */
1225+#define PFM_WRITE_IBRS 0x20 /* obsolete: use PFM_WRITE_PMCS[256-263]*/
1226+#define PFM_WRITE_DBRS 0x21 /* obsolete: use PFM_WRITE_PMCS[264-271]*/
1227+
1228+/*
1229+ * argument to PFM_CREATE_CONTEXT
1230+ */
1231+struct pfarg_context {
1232+ pfm_uuid_t ctx_smpl_buf_id; /* buffer format to use */
1233+ unsigned long ctx_flags; /* noblock/block */
1234+ unsigned int ctx_reserved1; /* for future use */
1235+ int ctx_fd; /* return: fildesc */
1236+ void *ctx_smpl_vaddr; /* return: vaddr of buffer */
1237+ unsigned long ctx_reserved3[11];/* for future use */
1238+};
1239+
1240+/*
1241+ * argument structure for PFM_WRITE_PMCS/PFM_WRITE_PMDS/PFM_READ_PMDS
1242+ */
1243+struct pfarg_reg {
1244+ unsigned int reg_num; /* which register */
1245+ unsigned short reg_set; /* event set for this register */
1246+ unsigned short reg_reserved1; /* for future use */
1247+
1248+ unsigned long reg_value; /* initial pmc/pmd value */
1249+ unsigned long reg_flags; /* input: flags, ret: error */
1250+
1251+ unsigned long reg_long_reset; /* reset value after notification */
1252+ unsigned long reg_short_reset; /* reset after counter overflow */
1253+
1254+ unsigned long reg_reset_pmds[4]; /* registers to reset on overflow */
1255+ unsigned long reg_random_seed; /* seed for randomization */
1256+ unsigned long reg_random_mask; /* random range limit */
1257+ unsigned long reg_last_reset_val;/* return: PMD last reset value */
1258+
1259+ unsigned long reg_smpl_pmds[4]; /* pmds to be saved on overflow */
1260+ unsigned long reg_smpl_eventid; /* opaque sampling event id */
1261+ unsigned long reg_ovfl_switch_cnt;/* #overflows to switch */
1262+
1263+ unsigned long reg_reserved2[2]; /* for future use */
1264+};
1265+
1266+/*
1267+ * argument to PFM_WRITE_IBRS/PFM_WRITE_DBRS
1268+ */
1269+struct pfarg_dbreg {
1270+ unsigned int dbreg_num; /* which debug register */
1271+ unsigned short dbreg_set; /* event set */
1272+ unsigned short dbreg_reserved1; /* for future use */
1273+ unsigned long dbreg_value; /* value for debug register */
1274+ unsigned long dbreg_flags; /* return: dbreg error */
1275+ unsigned long dbreg_reserved2[1]; /* for future use */
1276+};
1277+
1278+/*
1279+ * argument to PFM_GET_FEATURES
1280+ */
1281+struct pfarg_features {
1282+ unsigned int ft_version; /* major [16-31], minor [0-15] */
1283+ unsigned int ft_reserved; /* reserved for future use */
1284+ unsigned long reserved[4]; /* for future use */
1285+};
1286+
1287+typedef struct {
1288+ int msg_type; /* generic message header */
1289+ int msg_ctx_fd; /* generic message header */
1290+ unsigned long msg_ovfl_pmds[4]; /* which PMDs overflowed */
1291+ unsigned short msg_active_set; /* active set on overflow */
1292+ unsigned short msg_reserved1; /* for future use */
1293+ unsigned int msg_reserved2; /* for future use */
1294+ unsigned long msg_tstamp; /* for perf tuning/debug */
1295+} pfm_ovfl_msg_t;
1296+
1297+typedef struct {
1298+ int msg_type; /* generic message header */
1299+ int msg_ctx_fd; /* generic message header */
1300+ unsigned long msg_tstamp; /* for perf tuning */
1301+} pfm_end_msg_t;
1302+
1303+typedef struct {
1304+ int msg_type; /* type of the message */
1305+ int msg_ctx_fd; /* context file descriptor */
1306+ unsigned long msg_tstamp; /* for perf tuning */
1307+} pfm_gen_msg_t;
1308+
1309+typedef union {
1310+ int type;
1311+ pfm_ovfl_msg_t pfm_ovfl_msg;
1312+ pfm_end_msg_t pfm_end_msg;
1313+ pfm_gen_msg_t pfm_gen_msg;
1314+} pfm_msg_t;
1315+
1316+/*
1317+ * PMD/PMC return flags in case of error (ignored on input)
1318+ *
1319+ * reg_flags layout:
1320+ * bit 00-15 : generic flags
1321+ * bits[16-23] : arch-specific flags (see asm/perfmon.h)
1322+ * bit 24-31 : error codes
1323+ *
1324+ * Those flags are used on output and must be checked in case EINVAL is
1325+ * returned by a command accepting a vector of values and each has a flag
1326+ * field, such as pfarg_reg or pfarg_dbreg
1327+ */
1328+#define PFM_REG_RETFL_NOTAVAIL (1<<31) /* not implemented or unaccessible */
1329+#define PFM_REG_RETFL_EINVAL (1<<30) /* entry is invalid */
1330+#define PFM_REG_RETFL_MASK (PFM_REG_RETFL_NOTAVAIL|\
1331+ PFM_REG_RETFL_EINVAL)
1332+
1333+#define PFM_REG_HAS_ERROR(flag) (((flag) & PFM_REG_RETFL_MASK) != 0)
1334+
1335+#endif /* _ASM_IA64_PERFMON_COMPAT_H_ */
1336--- a/arch/ia64/include/asm/perfmon_default_smpl.h
1337+++ b/arch/ia64/include/asm/perfmon_default_smpl.h
1338@@ -1,83 +1,106 @@
1339 /*
1340- * Copyright (C) 2002-2003 Hewlett-Packard Co
1341- * Stephane Eranian <eranian@hpl.hp.com>
1342+ * Copyright (c) 2002-2006 Hewlett-Packard Development Company, L.P.
1343+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
1344 *
1345- * This file implements the default sampling buffer format
1346- * for Linux/ia64 perfmon subsystem.
1347+ * This file implements the old default sampling buffer format
1348+ * for the perfmon2 subsystem. For IA-64 only.
1349+ *
1350+ * It requires the use of the perfmon_compat.h header. It is recommended
1351+ * that applications be ported to the new format instead.
1352+ *
1353+ * This program is free software; you can redistribute it and/or
1354+ * modify it under the terms of version 2 of the GNU General Public
1355+ * License as published by the Free Software Foundation.
1356+ *
1357+ * This program is distributed in the hope that it will be useful,
1358+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
1359+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
1360+ * General Public License for more details.
1361+ *
1362+ * You should have received a copy of the GNU General Public License
1363+ * along with this program; if not, write to the Free Software
1364+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
1365+ * 02111-1307 USA
1366 */
1367-#ifndef __PERFMON_DEFAULT_SMPL_H__
1368-#define __PERFMON_DEFAULT_SMPL_H__ 1
1369+#ifndef __ASM_IA64_PERFMON_DEFAULT_SMPL_H__
1370+#define __ASM_IA64_PERFMON_DEFAULT_SMPL_H__ 1
1371+
1372+#ifndef __ia64__
1373+#error "this file must be used for compatibility reasons only on IA-64"
1374+#endif
1375
1376 #define PFM_DEFAULT_SMPL_UUID { \
1377- 0x4d, 0x72, 0xbe, 0xc0, 0x06, 0x64, 0x41, 0x43, 0x82, 0xb4, 0xd3, 0xfd, 0x27, 0x24, 0x3c, 0x97}
1378+ 0x4d, 0x72, 0xbe, 0xc0, 0x06, 0x64, 0x41, 0x43, 0x82,\
1379+ 0xb4, 0xd3, 0xfd, 0x27, 0x24, 0x3c, 0x97}
1380
1381 /*
1382 * format specific parameters (passed at context creation)
1383 */
1384-typedef struct {
1385+struct pfm_default_smpl_arg {
1386 unsigned long buf_size; /* size of the buffer in bytes */
1387 unsigned int flags; /* buffer specific flags */
1388 unsigned int res1; /* for future use */
1389 unsigned long reserved[2]; /* for future use */
1390-} pfm_default_smpl_arg_t;
1391+};
1392
1393 /*
1394 * combined context+format specific structure. Can be passed
1395- * to PFM_CONTEXT_CREATE
1396+ * to PFM_CONTEXT_CREATE (not PFM_CONTEXT_CREATE2)
1397 */
1398-typedef struct {
1399- pfarg_context_t ctx_arg;
1400- pfm_default_smpl_arg_t buf_arg;
1401-} pfm_default_smpl_ctx_arg_t;
1402+struct pfm_default_smpl_ctx_arg {
1403+ struct pfarg_context ctx_arg;
1404+ struct pfm_default_smpl_arg buf_arg;
1405+};
1406
1407 /*
1408 * This header is at the beginning of the sampling buffer returned to the user.
1409 * It is directly followed by the first record.
1410 */
1411-typedef struct {
1412- unsigned long hdr_count; /* how many valid entries */
1413- unsigned long hdr_cur_offs; /* current offset from top of buffer */
1414- unsigned long hdr_reserved2; /* reserved for future use */
1415-
1416- unsigned long hdr_overflows; /* how many times the buffer overflowed */
1417- unsigned long hdr_buf_size; /* how many bytes in the buffer */
1418-
1419- unsigned int hdr_version; /* contains perfmon version (smpl format diffs) */
1420- unsigned int hdr_reserved1; /* for future use */
1421- unsigned long hdr_reserved[10]; /* for future use */
1422-} pfm_default_smpl_hdr_t;
1423+struct pfm_default_smpl_hdr {
1424+ u64 hdr_count; /* how many valid entries */
1425+ u64 hdr_cur_offs; /* current offset from top of buffer */
1426+ u64 dr_reserved2; /* reserved for future use */
1427+
1428+ u64 hdr_overflows; /* how many times the buffer overflowed */
1429+ u64 hdr_buf_size; /* how many bytes in the buffer */
1430+
1431+ u32 hdr_version; /* smpl format version */
1432+ u32 hdr_reserved1; /* for future use */
1433+ u64 hdr_reserved[10]; /* for future use */
1434+};
1435
1436 /*
1437 * Entry header in the sampling buffer. The header is directly followed
1438- * with the values of the PMD registers of interest saved in increasing
1439- * index order: PMD4, PMD5, and so on. How many PMDs are present depends
1440+ * with the values of the PMD registers of interest saved in increasing
1441+ * index order: PMD4, PMD5, and so on. How many PMDs are present depends
1442 * on how the session was programmed.
1443 *
1444 * In the case where multiple counters overflow at the same time, multiple
1445 * entries are written consecutively.
1446 *
1447- * last_reset_value member indicates the initial value of the overflowed PMD.
1448+ * last_reset_value member indicates the initial value of the overflowed PMD.
1449 */
1450-typedef struct {
1451- int pid; /* thread id (for NPTL, this is gettid()) */
1452- unsigned char reserved1[3]; /* reserved for future use */
1453- unsigned char ovfl_pmd; /* index of overflowed PMD */
1454-
1455- unsigned long last_reset_val; /* initial value of overflowed PMD */
1456- unsigned long ip; /* where did the overflow interrupt happened */
1457- unsigned long tstamp; /* ar.itc when entering perfmon intr. handler */
1458-
1459- unsigned short cpu; /* cpu on which the overfow occured */
1460- unsigned short set; /* event set active when overflow ocurred */
1461- int tgid; /* thread group id (for NPTL, this is getpid()) */
1462-} pfm_default_smpl_entry_t;
1463-
1464-#define PFM_DEFAULT_MAX_PMDS 64 /* how many pmds supported by data structures (sizeof(unsigned long) */
1465-#define PFM_DEFAULT_MAX_ENTRY_SIZE (sizeof(pfm_default_smpl_entry_t)+(sizeof(unsigned long)*PFM_DEFAULT_MAX_PMDS))
1466-#define PFM_DEFAULT_SMPL_MIN_BUF_SIZE (sizeof(pfm_default_smpl_hdr_t)+PFM_DEFAULT_MAX_ENTRY_SIZE)
1467+struct pfm_default_smpl_entry {
1468+ pid_t pid; /* thread id (for NPTL, this is gettid()) */
1469+ uint8_t reserved1[3]; /* for future use */
1470+ uint8_t ovfl_pmd; /* overflow pmd for this sample */
1471+ u64 last_reset_val; /* initial value of overflowed PMD */
1472+ unsigned long ip; /* where the overflow interrupt happened */
1473+ u64 tstamp; /* overflow timestamp */
1474+ u16 cpu; /* cpu on which the overflow occurred */
1475+ u16 set; /* event set active when overflow occurred */
1476+ pid_t tgid; /* thread group id (for NPTL, this is getpid()) */
1477+};
1478+
1479+#define PFM_DEFAULT_MAX_PMDS 64 /* #pmds supported */
1480+#define PFM_DEFAULT_MAX_ENTRY_SIZE (sizeof(struct pfm_default_smpl_entry)+\
1481+ (sizeof(u64)*PFM_DEFAULT_MAX_PMDS))
1482+#define PFM_DEFAULT_SMPL_MIN_BUF_SIZE (sizeof(struct pfm_default_smpl_hdr)+\
1483+ PFM_DEFAULT_MAX_ENTRY_SIZE)
1484
1485 #define PFM_DEFAULT_SMPL_VERSION_MAJ 2U
1486-#define PFM_DEFAULT_SMPL_VERSION_MIN 0U
1487-#define PFM_DEFAULT_SMPL_VERSION (((PFM_DEFAULT_SMPL_VERSION_MAJ&0xffff)<<16)|(PFM_DEFAULT_SMPL_VERSION_MIN & 0xffff))
1488+#define PFM_DEFAULT_SMPL_VERSION_MIN 1U
1489+#define PFM_DEFAULT_SMPL_VERSION (((PFM_DEFAULT_SMPL_VERSION_MAJ&0xffff)<<16)|\
1490+ (PFM_DEFAULT_SMPL_VERSION_MIN & 0xffff))
1491
1492-#endif /* __PERFMON_DEFAULT_SMPL_H__ */
1493+#endif /* __ASM_IA64_PERFMON_DEFAULT_SMPL_H__ */
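
For orientation, a hedged user-space sketch of walking a buffer laid out as above: samples begin immediately after the header, and each sample is the fixed entry struct followed by the recorded PMD values as 64-bit words in increasing index order. The number of PMDs per sample (npmds below) is not stored in the buffer and must match how the session was programmed; the struct mirrors use plain fixed-width types purely for illustration.

#include <stdint.h>
#include <stdio.h>

/* user-space mirrors of the structures defined above */
struct smpl_hdr {
	uint64_t hdr_count;		/* how many valid entries */
	uint64_t hdr_cur_offs;		/* current offset from top of buffer */
	uint64_t dr_reserved2;
	uint64_t hdr_overflows;		/* how many times the buffer overflowed */
	uint64_t hdr_buf_size;		/* how many bytes in the buffer */
	uint32_t hdr_version;
	uint32_t hdr_reserved1;
	uint64_t hdr_reserved[10];
};

struct smpl_entry {
	int32_t  pid;			/* thread id */
	uint8_t  reserved1[3];
	uint8_t  ovfl_pmd;		/* overflowed PMD for this sample */
	uint64_t last_reset_val;
	uint64_t ip;			/* interrupted instruction pointer */
	uint64_t tstamp;
	uint16_t cpu;
	uint16_t set;
	int32_t  tgid;			/* thread group id */
};

/* dump every sample; buf points at the start of the sampling buffer,
 * npmds = number of PMD values recorded per sample */
static void walk_samples(const void *buf, unsigned int npmds)
{
	const struct smpl_hdr *hdr = buf;
	const char *pos = (const char *)(hdr + 1);
	uint64_t i;
	unsigned int j;

	for (i = 0; i < hdr->hdr_count; i++) {
		const struct smpl_entry *ent = (const void *)pos;
		const uint64_t *pmds = (const uint64_t *)(ent + 1);

		printf("pid=%d cpu=%u ovfl_pmd=%u ip=0x%llx",
		       ent->pid, (unsigned int)ent->cpu,
		       (unsigned int)ent->ovfl_pmd,
		       (unsigned long long)ent->ip);
		for (j = 0; j < npmds; j++)
			printf(" pmd[%u]=%llu", j, (unsigned long long)pmds[j]);
		printf("\n");
		pos += sizeof(*ent) + npmds * sizeof(uint64_t);
	}
}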
1494--- /dev/null
1495+++ b/arch/ia64/include/asm/perfmon_kern.h
1496@@ -0,0 +1,356 @@
1497+/*
1498+ * Copyright (c) 2001-2007 Hewlett-Packard Development Company, L.P.
1499+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
1500+ *
1501+ * This file contains Itanium Processor Family specific definitions
1502+ * for the perfmon interface.
1503+ *
1504+ * This program is free software; you can redistribute it and/or
1505+ * modify it under the terms of version 2 of the GNU General Public
1506+ * License as published by the Free Software Foundation.
1507+ *
1508+ * This program is distributed in the hope that it will be useful,
1509+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
1510+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
1511+ * General Public License for more details.
1512+ *
1513+ * You should have received a copy of the GNU General Public License
1514+ * along with this program; if not, write to the Free Software
1515+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
1516+ * 02111-1307 USA
1517+ */
1518+#ifndef _ASM_IA64_PERFMON_KERN_H_
1519+#define _ASM_IA64_PERFMON_KERN_H_
1520+
1521+#ifdef __KERNEL__
1522+
1523+#ifdef CONFIG_PERFMON
1524+#include <asm/unistd.h>
1525+#include <asm/hw_irq.h>
1526+
1527+/*
1528+ * describe the content of the pfm_syst_info field
1529+ * layout:
1530+ * bits[00-15] : generic flags
1531+ * bits[16-31] : arch-specific flags
1532+ */
1533+#define PFM_ITA_CPUINFO_IDLE_EXCL 0x10000 /* stop monitoring in idle loop */
1534+
1535+/*
1536+ * For some CPUs, the upper bits of a counter must be set in order for the
1537+ * overflow interrupt to happen. On overflow, the counter has wrapped around,
1538+ * and the upper bits are cleared. This function may be used to set them back.
1539+ */
1540+static inline void pfm_arch_ovfl_reset_pmd(struct pfm_context *ctx,
1541+ unsigned int cnum)
1542+{}
1543+
1544+/*
1545+ * called from __pfm_interrupt_handler(). ctx is not NULL.
1546+ * ctx is locked. PMU interrupt is masked.
1547+ *
1548+ * must stop all monitoring to ensure the handler has a consistent view.
1549+ * must collect the overflowed PMDs bitmask into povfl_pmds and
1550+ * npend_ovfls. If no interrupt is detected then npend_ovfls
1551+ * must be set to zero.
1552+ */
1553+static inline void pfm_arch_intr_freeze_pmu(struct pfm_context *ctx,
1554+ struct pfm_event_set *set)
1555+{
1556+ u64 tmp;
1557+
1558+ /*
1559+ * do not overwrite existing value, must
1560+ * process those first (coming from context switch replay)
1561+ */
1562+ if (set->npend_ovfls)
1563+ return;
1564+
1565+ ia64_srlz_d();
1566+
1567+ tmp = ia64_get_pmc(0) & ~0xf;
1568+
1569+ set->povfl_pmds[0] = tmp;
1570+
1571+ set->npend_ovfls = ia64_popcnt(tmp);
1572+}
1573+
1574+static inline int pfm_arch_init_pmu_config(void)
1575+{
1576+ return 0;
1577+}
1578+
1579+static inline void pfm_arch_resend_irq(struct pfm_context *ctx)
1580+{
1581+ ia64_resend_irq(IA64_PERFMON_VECTOR);
1582+}
1583+
1584+static inline void pfm_arch_clear_pmd_ovfl_cond(struct pfm_context *ctx,
1585+ struct pfm_event_set *set)
1586+{}
1587+
1588+static inline void pfm_arch_serialize(void)
1589+{
1590+ ia64_srlz_d();
1591+}
1592+
1593+static inline void pfm_arch_intr_unfreeze_pmu(struct pfm_context *ctx)
1594+{
1595+ PFM_DBG_ovfl("state=%d", ctx->state);
1596+ ia64_set_pmc(0, 0);
1597+ /* no serialization */
1598+}
1599+
1600+static inline void pfm_arch_write_pmc(struct pfm_context *ctx,
1601+ unsigned int cnum, u64 value)
1602+{
1603+ if (cnum < 256) {
1604+ ia64_set_pmc(pfm_pmu_conf->pmc_desc[cnum].hw_addr, value);
1605+ } else if (cnum < 264) {
1606+ ia64_set_ibr(cnum-256, value);
1607+ ia64_dv_serialize_instruction();
1608+ } else {
1609+ ia64_set_dbr(cnum-264, value);
1610+ ia64_dv_serialize_instruction();
1611+ }
1612+}
1613+
1614+/*
1615+ * On IA-64, for per-thread context which have the ITA_FL_INSECURE
1616+ * flag, it is possible to start/stop monitoring directly from user level
1617+ * without calling pfm_start()/pfm_stop(). This allows very lightweight
1618+ * control yet the kernel sometimes needs to know if monitoring is actually
1619+ * on or off.
1620+ *
1621+ * Tracking of this information is normally done by pfm_start/pfm_stop
1622+ * in flags.started. Here we need to compensate by checking the actual
1623+ * psr bits.
1624+ */
1625+static inline int pfm_arch_is_active(struct pfm_context *ctx)
1626+{
1627+ return ctx->flags.started
1628+ || ia64_getreg(_IA64_REG_PSR) & (IA64_PSR_UP|IA64_PSR_PP);
1629+}
1630+
1631+static inline void pfm_arch_write_pmd(struct pfm_context *ctx,
1632+ unsigned int cnum, u64 value)
1633+{
1634+ /*
1635+ * for a counting PMD, overflow bit must be cleared
1636+ */
1637+ if (pfm_pmu_conf->pmd_desc[cnum].type & PFM_REG_C64)
1638+ value &= pfm_pmu_conf->ovfl_mask;
1639+
1640+ /*
1641+ * for counters, write to upper bits are ignored, no need to mask
1642+ */
1643+ ia64_set_pmd(pfm_pmu_conf->pmd_desc[cnum].hw_addr, value);
1644+}
1645+
1646+static inline u64 pfm_arch_read_pmd(struct pfm_context *ctx, unsigned int cnum)
1647+{
1648+ return ia64_get_pmd(pfm_pmu_conf->pmd_desc[cnum].hw_addr);
1649+}
1650+
1651+static inline u64 pfm_arch_read_pmc(struct pfm_context *ctx, unsigned int cnum)
1652+{
1653+ return ia64_get_pmc(pfm_pmu_conf->pmc_desc[cnum].hw_addr);
1654+}
1655+
1656+static inline void pfm_arch_ctxswout_sys(struct task_struct *task,
1657+ struct pfm_context *ctx)
1658+{
1659+ struct pt_regs *regs;
1660+
1661+ regs = task_pt_regs(task);
1662+ ia64_psr(regs)->pp = 0;
1663+}
1664+
1665+static inline void pfm_arch_ctxswin_sys(struct task_struct *task,
1666+ struct pfm_context *ctx)
1667+{
1668+ struct pt_regs *regs;
1669+
1670+ if (!(ctx->active_set->flags & PFM_ITA_SETFL_INTR_ONLY)) {
1671+ regs = task_pt_regs(task);
1672+ ia64_psr(regs)->pp = 1;
1673+ }
1674+}
1675+
1676+/*
1677+ * On IA-64, the PMDs are NOT saved by pfm_arch_freeze_pmu()
1678+ * when entering the PMU interrupt handler; thus we need
1679+ * to save them in pfm_switch_sets_from_intr()
1680+ */
1681+static inline void pfm_arch_save_pmds_from_intr(struct pfm_context *ctx,
1682+ struct pfm_event_set *set)
1683+{
1684+ pfm_save_pmds(ctx, set);
1685+}
1686+
1687+int pfm_arch_context_create(struct pfm_context *ctx, u32 ctx_flags);
1688+
1689+static inline void pfm_arch_context_free(struct pfm_context *ctx)
1690+{}
1691+
1692+int pfm_arch_ctxswout_thread(struct task_struct *task, struct pfm_context *ctx);
1693+void pfm_arch_ctxswin_thread(struct task_struct *task,
1694+ struct pfm_context *ctx);
1695+
1696+void pfm_arch_unload_context(struct pfm_context *ctx);
1697+int pfm_arch_load_context(struct pfm_context *ctx);
1698+int pfm_arch_setfl_sane(struct pfm_context *ctx, u32 flags);
1699+
1700+void pfm_arch_mask_monitoring(struct pfm_context *ctx,
1701+ struct pfm_event_set *set);
1702+void pfm_arch_unmask_monitoring(struct pfm_context *ctx,
1703+ struct pfm_event_set *set);
1704+
1705+void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set);
1706+void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set);
1707+
1708+void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx);
1709+void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx);
1710+
1711+int pfm_arch_init(void);
1712+void pfm_arch_init_percpu(void);
1713+char *pfm_arch_get_pmu_module_name(void);
1714+
1715+int __pfm_use_dbregs(struct task_struct *task);
1716+int __pfm_release_dbregs(struct task_struct *task);
1717+int pfm_ia64_mark_dbregs_used(struct pfm_context *ctx,
1718+ struct pfm_event_set *set);
1719+
1720+void pfm_arch_show_session(struct seq_file *m);
1721+
1722+static inline int pfm_arch_pmu_acquire(u64 *unavail_pmcs, u64 *unavail_pmds)
1723+{
1724+ return 0;
1725+}
1726+
1727+static inline void pfm_arch_pmu_release(void)
1728+{}
1729+
1730+/* not necessary on IA-64 */
1731+static inline void pfm_cacheflush(void *addr, unsigned int len)
1732+{}
1733+
1734+/*
1735+ * miscellaneous architected definitions
1736+ */
1737+#define PFM_ITA_FCNTR 4 /* first counting monitor (PMC/PMD) */
1738+
1739+/*
1740+ * private event set flags (set_priv_flags)
1741+ */
1742+#define PFM_ITA_SETFL_USE_DBR 0x1000000 /* set uses debug registers */
1743+
1744+
1745+/*
1746+ * Itanium-specific data structures
1747+ */
1748+struct pfm_ia64_context_flags {
1749+ unsigned int use_dbr:1; /* use range restrictions (debug registers) */
1750+ unsigned int insecure:1; /* insecure monitoring for non-self session */
1751+ unsigned int reserved:30;/* for future use */
1752+};
1753+
1754+struct pfm_arch_context {
1755+ struct pfm_ia64_context_flags flags; /* arch specific ctx flags */
1756+ u64 ctx_saved_psr_up;/* storage for psr_up */
1757+#ifdef CONFIG_IA64_PERFMON_COMPAT
1758+ void *ctx_smpl_vaddr; /* vaddr of user mapping */
1759+#endif
1760+};
1761+
1762+#ifdef CONFIG_IA64_PERFMON_COMPAT
1763+ssize_t pfm_arch_compat_read(struct pfm_context *ctx,
1764+ char __user *buf,
1765+ int non_block,
1766+ size_t size);
1767+int pfm_ia64_compat_init(void);
1768+int pfm_smpl_buf_alloc_compat(struct pfm_context *ctx,
1769+ size_t rsize, struct file *filp);
1770+#else
1771+static inline ssize_t pfm_arch_compat_read(struct pfm_context *ctx,
1772+ char __user *buf,
1773+ int non_block,
1774+ size_t size)
1775+{
1776+ return -EINVAL;
1777+}
1778+
1779+static inline int pfm_smpl_buf_alloc_compat(struct pfm_context *ctx,
1780+ size_t rsize, struct file *filp)
1781+{
1782+ return -EINVAL;
1783+}
1784+#endif
1785+
1786+static inline void pfm_arch_arm_handle_work(struct task_struct *task)
1787+{
1788+ /*
1789+ * On IA-64, we ran out of room in the bottom 7 bits of the
1790+ * thread_info flags mask. Thus we use a 2-stage approach: piggyback
1791+ * on NOTIFY_RESUME, then demultiplex in do_notify_resume() and
1792+ * call pfm_handle_work() if needed.
1793+ */
1794+ set_tsk_thread_flag(task, TIF_NOTIFY_RESUME);
1795+}
1796+
1797+static inline void pfm_arch_disarm_handle_work(struct task_struct *task)
1798+{
1799+ /*
1800+ * we cannot just clear TIF_NOTIFY_RESUME because other TIF flags are
1801+ * piggybacked onto it: TIF_PERFMON_WORK, TIF_RESTORE_RSE
1802+ *
1803+ * tsk_clear_notify_resume() checks if any of those are set before
1804+ * clearing the bit.
1805+ */
1806+ tsk_clear_notify_resume(task);
1807+}
1808+
1809+static inline int pfm_arch_pmu_config_init(struct pfm_pmu_config *cfg)
1810+{
1811+ return 0;
1812+}
1813+
1814+extern struct pfm_ia64_pmu_info *pfm_ia64_pmu_info;
1815+
1816+#define PFM_ARCH_CTX_SIZE (sizeof(struct pfm_arch_context))
1817+
1818+/*
1819+ * IA-64 does not need extra alignment requirements for the sampling buffer
1820+ */
1821+#define PFM_ARCH_SMPL_ALIGN_SIZE 0
1822+
1823+
1824+static inline void pfm_release_dbregs(struct task_struct *task)
1825+{
1826+ if (task->thread.flags & IA64_THREAD_DBG_VALID)
1827+ __pfm_release_dbregs(task);
1828+}
1829+
1830+#define pfm_use_dbregs(_t) __pfm_use_dbregs(_t)
1831+
1832+static inline int pfm_arch_get_base_syscall(void)
1833+{
1834+ return __NR_pfm_create_context;
1835+}
1836+
1837+struct pfm_arch_pmu_info {
1838+ unsigned long mask_pmcs[PFM_PMC_BV]; /* PMCs to modify when masking monitoring */
1839+};
1840+
1841+DECLARE_PER_CPU(u32, pfm_syst_info);
1842+#else /* !CONFIG_PERFMON */
1843+/*
1844+ * perfmon ia64-specific hooks
1845+ */
1846+#define pfm_release_dbregs(_t) do { } while (0)
1847+#define pfm_use_dbregs(_t) (0)
1848+
1849+#endif /* CONFIG_PERFMON */
1850+
1851+#endif /* __KERNEL__ */
1852+#endif /* _ASM_IA64_PERFMON_KERN_H_ */
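
The cnum ranges in pfm_arch_write_pmc() above multiplex three register spaces through one index: 0-255 map to PMCs, 256-263 to instruction debug registers, and 264 upward to data debug registers. A tiny stand-alone sketch of that decoding, with hypothetical names, purely to spell out the convention:

#include <stdio.h>

enum reg_space { REG_PMC, REG_IBR, REG_DBR };

/* mirrors the index convention of pfm_arch_write_pmc() above:
 * 0..255 -> PMC (remapped via pmc_desc[].hw_addr), 256..263 -> IBR, 264.. -> DBR */
static enum reg_space decode_cnum(unsigned int cnum, unsigned int *idx)
{
	if (cnum < 256) {
		*idx = cnum;
		return REG_PMC;
	}
	if (cnum < 264) {
		*idx = cnum - 256;	/* instruction debug register index */
		return REG_IBR;
	}
	*idx = cnum - 264;		/* data debug register index */
	return REG_DBR;
}

int main(void)
{
	unsigned int idx;
	enum reg_space sp = decode_cnum(258, &idx);

	printf("cnum 258 -> space %d, index %u\n", (int)sp, idx);	/* REG_IBR, 2 */
	return 0;
}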
1853--- a/arch/ia64/include/asm/processor.h
1854+++ b/arch/ia64/include/asm/processor.h
1855@@ -42,7 +42,6 @@
1856
1857 #define IA64_THREAD_FPH_VALID (__IA64_UL(1) << 0) /* floating-point high state valid? */
1858 #define IA64_THREAD_DBG_VALID (__IA64_UL(1) << 1) /* debug registers valid? */
1859-#define IA64_THREAD_PM_VALID (__IA64_UL(1) << 2) /* performance registers valid? */
1860 #define IA64_THREAD_UAC_NOPRINT (__IA64_UL(1) << 3) /* don't log unaligned accesses */
1861 #define IA64_THREAD_UAC_SIGBUS (__IA64_UL(1) << 4) /* generate SIGBUS on unaligned acc. */
1862 #define IA64_THREAD_MIGRATION (__IA64_UL(1) << 5) /* require migration
1863@@ -321,14 +320,6 @@ struct thread_struct {
1864 #else
1865 # define INIT_THREAD_IA32
1866 #endif /* CONFIG_IA32_SUPPORT */
1867-#ifdef CONFIG_PERFMON
1868- void *pfm_context; /* pointer to detailed PMU context */
1869- unsigned long pfm_needs_checking; /* when >0, pending perfmon work on kernel exit */
1870-# define INIT_THREAD_PM .pfm_context = NULL, \
1871- .pfm_needs_checking = 0UL,
1872-#else
1873-# define INIT_THREAD_PM
1874-#endif
1875 __u64 dbr[IA64_NUM_DBG_REGS];
1876 __u64 ibr[IA64_NUM_DBG_REGS];
1877 struct ia64_fpreg fph[96]; /* saved/loaded on demand */
1878@@ -343,7 +334,6 @@ struct thread_struct {
1879 .task_size = DEFAULT_TASK_SIZE, \
1880 .last_fph_cpu = -1, \
1881 INIT_THREAD_IA32 \
1882- INIT_THREAD_PM \
1883 .dbr = {0, }, \
1884 .ibr = {0, }, \
1885 .fph = {{{{0}}}, } \
1886--- a/arch/ia64/include/asm/system.h
1887+++ b/arch/ia64/include/asm/system.h
1888@@ -217,6 +217,7 @@ struct task_struct;
1889 extern void ia64_save_extra (struct task_struct *task);
1890 extern void ia64_load_extra (struct task_struct *task);
1891
1892+
1893 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
1894 extern void ia64_account_on_switch (struct task_struct *prev, struct task_struct *next);
1895 # define IA64_ACCOUNT_ON_SWITCH(p,n) ia64_account_on_switch(p,n)
1896@@ -224,16 +225,9 @@ extern void ia64_account_on_switch (stru
1897 # define IA64_ACCOUNT_ON_SWITCH(p,n)
1898 #endif
1899
1900-#ifdef CONFIG_PERFMON
1901- DECLARE_PER_CPU(unsigned long, pfm_syst_info);
1902-# define PERFMON_IS_SYSWIDE() (__get_cpu_var(pfm_syst_info) & 0x1)
1903-#else
1904-# define PERFMON_IS_SYSWIDE() (0)
1905-#endif
1906-
1907-#define IA64_HAS_EXTRA_STATE(t) \
1908- ((t)->thread.flags & (IA64_THREAD_DBG_VALID|IA64_THREAD_PM_VALID) \
1909- || IS_IA32_PROCESS(task_pt_regs(t)) || PERFMON_IS_SYSWIDE())
1910+#define IA64_HAS_EXTRA_STATE(t) \
1911+ (((t)->thread.flags & IA64_THREAD_DBG_VALID) \
1912+ || IS_IA32_PROCESS(task_pt_regs(t)))
1913
1914 #define __switch_to(prev,next,last) do { \
1915 IA64_ACCOUNT_ON_SWITCH(prev, next); \
1916@@ -241,6 +235,10 @@ extern void ia64_account_on_switch (stru
1917 ia64_save_extra(prev); \
1918 if (IA64_HAS_EXTRA_STATE(next)) \
1919 ia64_load_extra(next); \
1920+ if (test_tsk_thread_flag(prev, TIF_PERFMON_CTXSW)) \
1921+ pfm_ctxsw_out(prev, next); \
1922+ if (test_tsk_thread_flag(next, TIF_PERFMON_CTXSW)) \
1923+ pfm_ctxsw_in(prev, next); \
1924 ia64_psr(task_pt_regs(next))->dfh = !ia64_is_local_fpu_owner(next); \
1925 (last) = ia64_switch_to((next)); \
1926 } while (0)
1927--- a/arch/ia64/include/asm/thread_info.h
1928+++ b/arch/ia64/include/asm/thread_info.h
1929@@ -110,6 +110,8 @@ extern void tsk_clear_notify_resume(stru
1930 #define TIF_DB_DISABLED 19 /* debug trap disabled for fsyscall */
1931 #define TIF_FREEZE 20 /* is freezing for suspend */
1932 #define TIF_RESTORE_RSE 21 /* user RBS is newer than kernel RBS */
1933+#define TIF_PERFMON_CTXSW 22 /* perfmon needs ctxsw calls */
1934+#define TIF_PERFMON_WORK 23 /* work for pfm_handle_work() */
1935
1936 #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
1937 #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
1938@@ -123,6 +125,8 @@ extern void tsk_clear_notify_resume(stru
1939 #define _TIF_DB_DISABLED (1 << TIF_DB_DISABLED)
1940 #define _TIF_FREEZE (1 << TIF_FREEZE)
1941 #define _TIF_RESTORE_RSE (1 << TIF_RESTORE_RSE)
1942+#define _TIF_PERFMON_CTXSW (1 << TIF_PERFMON_CTXSW)
1943+#define _TIF_PERFMON_WORK (1 << TIF_PERFMON_WORK)
1944
1945 /* "work to do on user-return" bits */
1946 #define TIF_ALLWORK_MASK (_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SYSCALL_AUDIT|\
1947--- a/arch/ia64/include/asm/unistd.h
1948+++ b/arch/ia64/include/asm/unistd.h
1949@@ -308,11 +308,23 @@
1950 #define __NR_dup3 1316
1951 #define __NR_pipe2 1317
1952 #define __NR_inotify_init1 1318
1953+#define __NR_pfm_create_context 1319
1954+#define __NR_pfm_write_pmcs (__NR_pfm_create_context+1)
1955+#define __NR_pfm_write_pmds (__NR_pfm_create_context+2)
1956+#define __NR_pfm_read_pmds (__NR_pfm_create_context+3)
1957+#define __NR_pfm_load_context (__NR_pfm_create_context+4)
1958+#define __NR_pfm_start (__NR_pfm_create_context+5)
1959+#define __NR_pfm_stop (__NR_pfm_create_context+6)
1960+#define __NR_pfm_restart (__NR_pfm_create_context+7)
1961+#define __NR_pfm_create_evtsets (__NR_pfm_create_context+8)
1962+#define __NR_pfm_getinfo_evtsets (__NR_pfm_create_context+9)
1963+#define __NR_pfm_delete_evtsets (__NR_pfm_create_context+10)
1964+#define __NR_pfm_unload_context (__NR_pfm_create_context+11)
1965
1966 #ifdef __KERNEL__
1967
1968
1969-#define NR_syscalls 295 /* length of syscall table */
1970+#define NR_syscalls 307 /* length of syscall table */
1971
1972 /*
1973 * The following defines stop scripts/checksyscalls.sh from complaining about
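
A quick sanity check on the numbering above: ia64 syscall numbers are offset by 1024 from their table index (stated here as an assumption for the sketch), so __NR_inotify_init1 = 1318 corresponds to the old table length of 295, and the twelve new pfm entries (1319 through 1330) bring it to 307.

#include <assert.h>

#define IA64_SYSCALL_BASE   1024			/* assumed first ia64 syscall number */
#define NR_PFM_FIRST        1319			/* __NR_pfm_create_context */
#define NR_PFM_LAST         (NR_PFM_FIRST + 11)		/* __NR_pfm_unload_context */

int main(void)
{
	assert(NR_PFM_LAST == 1330);				/* 12 new entries */
	assert(1318 - IA64_SYSCALL_BASE + 1 == 295);		/* old NR_syscalls */
	assert(NR_PFM_LAST - IA64_SYSCALL_BASE + 1 == 307);	/* new NR_syscalls */
	return 0;
}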
1974--- a/arch/ia64/kernel/Makefile
1975+++ b/arch/ia64/kernel/Makefile
1976@@ -5,7 +5,7 @@
1977 extra-y := head.o init_task.o vmlinux.lds
1978
1979 obj-y := acpi.o entry.o efi.o efi_stub.o gate-data.o fsys.o ia64_ksyms.o irq.o irq_ia64.o \
1980- irq_lsapic.o ivt.o machvec.o pal.o patch.o process.o perfmon.o ptrace.o sal.o \
1981+ irq_lsapic.o ivt.o machvec.o pal.o patch.o process.o ptrace.o sal.o \
1982 salinfo.o setup.o signal.o sys_ia64.o time.o traps.o unaligned.o \
1983 unwind.o mca.o mca_asm.o topology.o
1984
1985@@ -23,7 +23,6 @@ obj-$(CONFIG_IOSAPIC) += iosapic.o
1986 obj-$(CONFIG_MODULES) += module.o
1987 obj-$(CONFIG_SMP) += smp.o smpboot.o
1988 obj-$(CONFIG_NUMA) += numa.o
1989-obj-$(CONFIG_PERFMON) += perfmon_default_smpl.o
1990 obj-$(CONFIG_IA64_CYCLONE) += cyclone.o
1991 obj-$(CONFIG_CPU_FREQ) += cpufreq/
1992 obj-$(CONFIG_IA64_MCA_RECOVERY) += mca_recovery.o
1993--- a/arch/ia64/kernel/entry.S
1994+++ b/arch/ia64/kernel/entry.S
1995@@ -1697,6 +1697,18 @@ sys_call_table:
1996 data8 sys_dup3
1997 data8 sys_pipe2
1998 data8 sys_inotify_init1
1999+ data8 sys_pfm_create_context
2000+ data8 sys_pfm_write_pmcs // 1320
2001+ data8 sys_pfm_write_pmds
2002+ data8 sys_pfm_read_pmds
2003+ data8 sys_pfm_load_context
2004+ data8 sys_pfm_start
2005+ data8 sys_pfm_stop // 1325
2006+ data8 sys_pfm_restart
2007+ data8 sys_pfm_create_evtsets
2008+ data8 sys_pfm_getinfo_evtsets
2009+ data8 sys_pfm_delete_evtsets
2010+ data8 sys_pfm_unload_context // 1330
2011
2012 .org sys_call_table + 8*NR_syscalls // guard against failures to increase NR_syscalls
2013 #endif /* __IA64_ASM_PARAVIRTUALIZED_NATIVE */
2014--- a/arch/ia64/kernel/irq_ia64.c
2015+++ b/arch/ia64/kernel/irq_ia64.c
2016@@ -40,10 +40,6 @@
2017 #include <asm/system.h>
2018 #include <asm/tlbflush.h>
2019
2020-#ifdef CONFIG_PERFMON
2021-# include <asm/perfmon.h>
2022-#endif
2023-
2024 #define IRQ_DEBUG 0
2025
2026 #define IRQ_VECTOR_UNASSIGNED (0)
2027@@ -660,9 +656,6 @@ init_IRQ (void)
2028 }
2029 #endif
2030 #endif
2031-#ifdef CONFIG_PERFMON
2032- pfm_init_percpu();
2033-#endif
2034 platform_irq_init();
2035 }
2036
2037--- a/arch/ia64/kernel/perfmon_default_smpl.c
2038+++ /dev/null
2039@@ -1,296 +0,0 @@
2040-/*
2041- * Copyright (C) 2002-2003 Hewlett-Packard Co
2042- * Stephane Eranian <eranian@hpl.hp.com>
2043- *
2044- * This file implements the default sampling buffer format
2045- * for the Linux/ia64 perfmon-2 subsystem.
2046- */
2047-#include <linux/kernel.h>
2048-#include <linux/types.h>
2049-#include <linux/module.h>
2050-#include <linux/init.h>
2051-#include <asm/delay.h>
2052-#include <linux/smp.h>
2053-
2054-#include <asm/perfmon.h>
2055-#include <asm/perfmon_default_smpl.h>
2056-
2057-MODULE_AUTHOR("Stephane Eranian <eranian@hpl.hp.com>");
2058-MODULE_DESCRIPTION("perfmon default sampling format");
2059-MODULE_LICENSE("GPL");
2060-
2061-#define DEFAULT_DEBUG 1
2062-
2063-#ifdef DEFAULT_DEBUG
2064-#define DPRINT(a) \
2065- do { \
2066- if (unlikely(pfm_sysctl.debug >0)) { printk("%s.%d: CPU%d ", __func__, __LINE__, smp_processor_id()); printk a; } \
2067- } while (0)
2068-
2069-#define DPRINT_ovfl(a) \
2070- do { \
2071- if (unlikely(pfm_sysctl.debug > 0 && pfm_sysctl.debug_ovfl >0)) { printk("%s.%d: CPU%d ", __func__, __LINE__, smp_processor_id()); printk a; } \
2072- } while (0)
2073-
2074-#else
2075-#define DPRINT(a)
2076-#define DPRINT_ovfl(a)
2077-#endif
2078-
2079-static int
2080-default_validate(struct task_struct *task, unsigned int flags, int cpu, void *data)
2081-{
2082- pfm_default_smpl_arg_t *arg = (pfm_default_smpl_arg_t*)data;
2083- int ret = 0;
2084-
2085- if (data == NULL) {
2086- DPRINT(("[%d] no argument passed\n", task_pid_nr(task)));
2087- return -EINVAL;
2088- }
2089-
2090- DPRINT(("[%d] validate flags=0x%x CPU%d\n", task_pid_nr(task), flags, cpu));
2091-
2092- /*
2093- * must hold at least the buffer header + one minimally sized entry
2094- */
2095- if (arg->buf_size < PFM_DEFAULT_SMPL_MIN_BUF_SIZE) return -EINVAL;
2096-
2097- DPRINT(("buf_size=%lu\n", arg->buf_size));
2098-
2099- return ret;
2100-}
2101-
2102-static int
2103-default_get_size(struct task_struct *task, unsigned int flags, int cpu, void *data, unsigned long *size)
2104-{
2105- pfm_default_smpl_arg_t *arg = (pfm_default_smpl_arg_t *)data;
2106-
2107- /*
2108- * size has been validated in default_validate
2109- */
2110- *size = arg->buf_size;
2111-
2112- return 0;
2113-}
2114-
2115-static int
2116-default_init(struct task_struct *task, void *buf, unsigned int flags, int cpu, void *data)
2117-{
2118- pfm_default_smpl_hdr_t *hdr;
2119- pfm_default_smpl_arg_t *arg = (pfm_default_smpl_arg_t *)data;
2120-
2121- hdr = (pfm_default_smpl_hdr_t *)buf;
2122-
2123- hdr->hdr_version = PFM_DEFAULT_SMPL_VERSION;
2124- hdr->hdr_buf_size = arg->buf_size;
2125- hdr->hdr_cur_offs = sizeof(*hdr);
2126- hdr->hdr_overflows = 0UL;
2127- hdr->hdr_count = 0UL;
2128-
2129- DPRINT(("[%d] buffer=%p buf_size=%lu hdr_size=%lu hdr_version=%u cur_offs=%lu\n",
2130- task_pid_nr(task),
2131- buf,
2132- hdr->hdr_buf_size,
2133- sizeof(*hdr),
2134- hdr->hdr_version,
2135- hdr->hdr_cur_offs));
2136-
2137- return 0;
2138-}
2139-
2140-static int
2141-default_handler(struct task_struct *task, void *buf, pfm_ovfl_arg_t *arg, struct pt_regs *regs, unsigned long stamp)
2142-{
2143- pfm_default_smpl_hdr_t *hdr;
2144- pfm_default_smpl_entry_t *ent;
2145- void *cur, *last;
2146- unsigned long *e, entry_size;
2147- unsigned int npmds, i;
2148- unsigned char ovfl_pmd;
2149- unsigned char ovfl_notify;
2150-
2151- if (unlikely(buf == NULL || arg == NULL|| regs == NULL || task == NULL)) {
2152- DPRINT(("[%d] invalid arguments buf=%p arg=%p\n", task->pid, buf, arg));
2153- return -EINVAL;
2154- }
2155-
2156- hdr = (pfm_default_smpl_hdr_t *)buf;
2157- cur = buf+hdr->hdr_cur_offs;
2158- last = buf+hdr->hdr_buf_size;
2159- ovfl_pmd = arg->ovfl_pmd;
2160- ovfl_notify = arg->ovfl_notify;
2161-
2162- /*
2163- * precheck for sanity
2164- */
2165- if ((last - cur) < PFM_DEFAULT_MAX_ENTRY_SIZE) goto full;
2166-
2167- npmds = hweight64(arg->smpl_pmds[0]);
2168-
2169- ent = (pfm_default_smpl_entry_t *)cur;
2170-
2171- prefetch(arg->smpl_pmds_values);
2172-
2173- entry_size = sizeof(*ent) + (npmds << 3);
2174-
2175- /* position for first pmd */
2176- e = (unsigned long *)(ent+1);
2177-
2178- hdr->hdr_count++;
2179-
2180- DPRINT_ovfl(("[%d] count=%lu cur=%p last=%p free_bytes=%lu ovfl_pmd=%d ovfl_notify=%d npmds=%u\n",
2181- task->pid,
2182- hdr->hdr_count,
2183- cur, last,
2184- last-cur,
2185- ovfl_pmd,
2186- ovfl_notify, npmds));
2187-
2188- /*
2189- * current = task running at the time of the overflow.
2190- *
2191- * per-task mode:
2192- * - this is ususally the task being monitored.
2193- * Under certain conditions, it might be a different task
2194- *
2195- * system-wide:
2196- * - this is not necessarily the task controlling the session
2197- */
2198- ent->pid = current->pid;
2199- ent->ovfl_pmd = ovfl_pmd;
2200- ent->last_reset_val = arg->pmd_last_reset; //pmd[0].reg_last_reset_val;
2201-
2202- /*
2203- * where did the fault happen (includes slot number)
2204- */
2205- ent->ip = regs->cr_iip | ((regs->cr_ipsr >> 41) & 0x3);
2206-
2207- ent->tstamp = stamp;
2208- ent->cpu = smp_processor_id();
2209- ent->set = arg->active_set;
2210- ent->tgid = current->tgid;
2211-
2212- /*
2213- * selectively store PMDs in increasing index number
2214- */
2215- if (npmds) {
2216- unsigned long *val = arg->smpl_pmds_values;
2217- for(i=0; i < npmds; i++) {
2218- *e++ = *val++;
2219- }
2220- }
2221-
2222- /*
2223- * update position for next entry
2224- */
2225- hdr->hdr_cur_offs += entry_size;
2226- cur += entry_size;
2227-
2228- /*
2229- * post check to avoid losing the last sample
2230- */
2231- if ((last - cur) < PFM_DEFAULT_MAX_ENTRY_SIZE) goto full;
2232-
2233- /*
2234- * keep same ovfl_pmds, ovfl_notify
2235- */
2236- arg->ovfl_ctrl.bits.notify_user = 0;
2237- arg->ovfl_ctrl.bits.block_task = 0;
2238- arg->ovfl_ctrl.bits.mask_monitoring = 0;
2239- arg->ovfl_ctrl.bits.reset_ovfl_pmds = 1; /* reset before returning from interrupt handler */
2240-
2241- return 0;
2242-full:
2243- DPRINT_ovfl(("sampling buffer full free=%lu, count=%lu, ovfl_notify=%d\n", last-cur, hdr->hdr_count, ovfl_notify));
2244-
2245- /*
2246- * increment number of buffer overflow.
2247- * important to detect duplicate set of samples.
2248- */
2249- hdr->hdr_overflows++;
2250-
2251- /*
2252- * if no notification requested, then we saturate the buffer
2253- */
2254- if (ovfl_notify == 0) {
2255- arg->ovfl_ctrl.bits.notify_user = 0;
2256- arg->ovfl_ctrl.bits.block_task = 0;
2257- arg->ovfl_ctrl.bits.mask_monitoring = 1;
2258- arg->ovfl_ctrl.bits.reset_ovfl_pmds = 0;
2259- } else {
2260- arg->ovfl_ctrl.bits.notify_user = 1;
2261- arg->ovfl_ctrl.bits.block_task = 1; /* ignored for non-blocking context */
2262- arg->ovfl_ctrl.bits.mask_monitoring = 1;
2263- arg->ovfl_ctrl.bits.reset_ovfl_pmds = 0; /* no reset now */
2264- }
2265- return -1; /* we are full, sorry */
2266-}
2267-
2268-static int
2269-default_restart(struct task_struct *task, pfm_ovfl_ctrl_t *ctrl, void *buf, struct pt_regs *regs)
2270-{
2271- pfm_default_smpl_hdr_t *hdr;
2272-
2273- hdr = (pfm_default_smpl_hdr_t *)buf;
2274-
2275- hdr->hdr_count = 0UL;
2276- hdr->hdr_cur_offs = sizeof(*hdr);
2277-
2278- ctrl->bits.mask_monitoring = 0;
2279- ctrl->bits.reset_ovfl_pmds = 1; /* uses long-reset values */
2280-
2281- return 0;
2282-}
2283-
2284-static int
2285-default_exit(struct task_struct *task, void *buf, struct pt_regs *regs)
2286-{
2287- DPRINT(("[%d] exit(%p)\n", task_pid_nr(task), buf));
2288- return 0;
2289-}
2290-
2291-static pfm_buffer_fmt_t default_fmt={
2292- .fmt_name = "default_format",
2293- .fmt_uuid = PFM_DEFAULT_SMPL_UUID,
2294- .fmt_arg_size = sizeof(pfm_default_smpl_arg_t),
2295- .fmt_validate = default_validate,
2296- .fmt_getsize = default_get_size,
2297- .fmt_init = default_init,
2298- .fmt_handler = default_handler,
2299- .fmt_restart = default_restart,
2300- .fmt_restart_active = default_restart,
2301- .fmt_exit = default_exit,
2302-};
2303-
2304-static int __init
2305-pfm_default_smpl_init_module(void)
2306-{
2307- int ret;
2308-
2309- ret = pfm_register_buffer_fmt(&default_fmt);
2310- if (ret == 0) {
2311- printk("perfmon_default_smpl: %s v%u.%u registered\n",
2312- default_fmt.fmt_name,
2313- PFM_DEFAULT_SMPL_VERSION_MAJ,
2314- PFM_DEFAULT_SMPL_VERSION_MIN);
2315- } else {
2316- printk("perfmon_default_smpl: %s cannot register ret=%d\n",
2317- default_fmt.fmt_name,
2318- ret);
2319- }
2320-
2321- return ret;
2322-}
2323-
2324-static void __exit
2325-pfm_default_smpl_cleanup_module(void)
2326-{
2327- int ret;
2328- ret = pfm_unregister_buffer_fmt(default_fmt.fmt_uuid);
2329-
2330- printk("perfmon_default_smpl: unregister %s=%d\n", default_fmt.fmt_name, ret);
2331-}
2332-
2333-module_init(pfm_default_smpl_init_module);
2334-module_exit(pfm_default_smpl_cleanup_module);
2335-
2336--- a/arch/ia64/kernel/perfmon_generic.h
2337+++ /dev/null
2338@@ -1,45 +0,0 @@
2339-/*
2340- * This file contains the generic PMU register description tables
2341- * and pmc checker used by perfmon.c.
2342- *
2343- * Copyright (C) 2002-2003 Hewlett Packard Co
2344- * Stephane Eranian <eranian@hpl.hp.com>
2345- */
2346-
2347-static pfm_reg_desc_t pfm_gen_pmc_desc[PMU_MAX_PMCS]={
2348-/* pmc0 */ { PFM_REG_CONTROL , 0, 0x1UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
2349-/* pmc1 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
2350-/* pmc2 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
2351-/* pmc3 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
2352-/* pmc4 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {RDEP(4),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
2353-/* pmc5 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {RDEP(5),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
2354-/* pmc6 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {RDEP(6),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
2355-/* pmc7 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {RDEP(7),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
2356- { PFM_REG_END , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, /* end marker */
2357-};
2358-
2359-static pfm_reg_desc_t pfm_gen_pmd_desc[PMU_MAX_PMDS]={
2360-/* pmd0 */ { PFM_REG_NOTIMPL , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}},
2361-/* pmd1 */ { PFM_REG_NOTIMPL , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}},
2362-/* pmd2 */ { PFM_REG_NOTIMPL , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}},
2363-/* pmd3 */ { PFM_REG_NOTIMPL , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}},
2364-/* pmd4 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(4),0UL, 0UL, 0UL}},
2365-/* pmd5 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(5),0UL, 0UL, 0UL}},
2366-/* pmd6 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(6),0UL, 0UL, 0UL}},
2367-/* pmd7 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(7),0UL, 0UL, 0UL}},
2368- { PFM_REG_END , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, /* end marker */
2369-};
2370-
2371-/*
2372- * impl_pmcs, impl_pmds are computed at runtime to minimize errors!
2373- */
2374-static pmu_config_t pmu_conf_gen={
2375- .pmu_name = "Generic",
2376- .pmu_family = 0xff, /* any */
2377- .ovfl_val = (1UL << 32) - 1,
2378- .num_ibrs = 0, /* does not use */
2379- .num_dbrs = 0, /* does not use */
2380- .pmd_desc = pfm_gen_pmd_desc,
2381- .pmc_desc = pfm_gen_pmc_desc
2382-};
2383-
2384--- a/arch/ia64/kernel/perfmon_itanium.h
2385+++ /dev/null
2386@@ -1,115 +0,0 @@
2387-/*
2388- * This file contains the Itanium PMU register description tables
2389- * and pmc checker used by perfmon.c.
2390- *
2391- * Copyright (C) 2002-2003 Hewlett Packard Co
2392- * Stephane Eranian <eranian@hpl.hp.com>
2393- */
2394-static int pfm_ita_pmc_check(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs);
2395-
2396-static pfm_reg_desc_t pfm_ita_pmc_desc[PMU_MAX_PMCS]={
2397-/* pmc0 */ { PFM_REG_CONTROL , 0, 0x1UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
2398-/* pmc1 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
2399-/* pmc2 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
2400-/* pmc3 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
2401-/* pmc4 */ { PFM_REG_COUNTING, 6, 0x0UL, -1UL, NULL, NULL, {RDEP(4),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
2402-/* pmc5 */ { PFM_REG_COUNTING, 6, 0x0UL, -1UL, NULL, NULL, {RDEP(5),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
2403-/* pmc6 */ { PFM_REG_COUNTING, 6, 0x0UL, -1UL, NULL, NULL, {RDEP(6),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
2404-/* pmc7 */ { PFM_REG_COUNTING, 6, 0x0UL, -1UL, NULL, NULL, {RDEP(7),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
2405-/* pmc8 */ { PFM_REG_CONFIG , 0, 0xf00000003ffffff8UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
2406-/* pmc9 */ { PFM_REG_CONFIG , 0, 0xf00000003ffffff8UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
2407-/* pmc10 */ { PFM_REG_MONITOR , 6, 0x0UL, -1UL, NULL, NULL, {RDEP(0)|RDEP(1),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
2408-/* pmc11 */ { PFM_REG_MONITOR , 6, 0x0000000010000000UL, -1UL, NULL, pfm_ita_pmc_check, {RDEP(2)|RDEP(3)|RDEP(17),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
2409-/* pmc12 */ { PFM_REG_MONITOR , 6, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
2410-/* pmc13 */ { PFM_REG_CONFIG , 0, 0x0003ffff00000001UL, -1UL, NULL, pfm_ita_pmc_check, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
2411- { PFM_REG_END , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, /* end marker */
2412-};
2413-
2414-static pfm_reg_desc_t pfm_ita_pmd_desc[PMU_MAX_PMDS]={
2415-/* pmd0 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(1),0UL, 0UL, 0UL}, {RDEP(10),0UL, 0UL, 0UL}},
2416-/* pmd1 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(0),0UL, 0UL, 0UL}, {RDEP(10),0UL, 0UL, 0UL}},
2417-/* pmd2 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(3)|RDEP(17),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}},
2418-/* pmd3 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(2)|RDEP(17),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}},
2419-/* pmd4 */ { PFM_REG_COUNTING, 0, 0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(4),0UL, 0UL, 0UL}},
2420-/* pmd5 */ { PFM_REG_COUNTING, 0, 0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(5),0UL, 0UL, 0UL}},
2421-/* pmd6 */ { PFM_REG_COUNTING, 0, 0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(6),0UL, 0UL, 0UL}},
2422-/* pmd7 */ { PFM_REG_COUNTING, 0, 0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(7),0UL, 0UL, 0UL}},
2423-/* pmd8 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
2424-/* pmd9 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
2425-/* pmd10 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
2426-/* pmd11 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
2427-/* pmd12 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
2428-/* pmd13 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
2429-/* pmd14 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
2430-/* pmd15 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
2431-/* pmd16 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
2432-/* pmd17 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(2)|RDEP(3),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}},
2433- { PFM_REG_END , 0, 0UL, -1UL, NULL, NULL, {0,}, {0,}}, /* end marker */
2434-};
2435-
2436-static int
2437-pfm_ita_pmc_check(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs)
2438-{
2439- int ret;
2440- int is_loaded;
2441-
2442- /* sanitfy check */
2443- if (ctx == NULL) return -EINVAL;
2444-
2445- is_loaded = ctx->ctx_state == PFM_CTX_LOADED || ctx->ctx_state == PFM_CTX_MASKED;
2446-
2447- /*
2448- * we must clear the (instruction) debug registers if pmc13.ta bit is cleared
2449- * before they are written (fl_using_dbreg==0) to avoid picking up stale information.
2450- */
2451- if (cnum == 13 && is_loaded && ((*val & 0x1) == 0UL) && ctx->ctx_fl_using_dbreg == 0) {
2452-
2453- DPRINT(("pmc[%d]=0x%lx has active pmc13.ta cleared, clearing ibr\n", cnum, *val));
2454-
2455- /* don't mix debug with perfmon */
2456- if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL;
2457-
2458- /*
2459- * a count of 0 will mark the debug registers as in use and also
2460- * ensure that they are properly cleared.
2461- */
2462- ret = pfm_write_ibr_dbr(1, ctx, NULL, 0, regs);
2463- if (ret) return ret;
2464- }
2465-
2466- /*
2467- * we must clear the (data) debug registers if pmc11.pt bit is cleared
2468- * before they are written (fl_using_dbreg==0) to avoid picking up stale information.
2469- */
2470- if (cnum == 11 && is_loaded && ((*val >> 28)& 0x1) == 0 && ctx->ctx_fl_using_dbreg == 0) {
2471-
2472- DPRINT(("pmc[%d]=0x%lx has active pmc11.pt cleared, clearing dbr\n", cnum, *val));
2473-
2474- /* don't mix debug with perfmon */
2475- if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL;
2476-
2477- /*
2478- * a count of 0 will mark the debug registers as in use and also
2479- * ensure that they are properly cleared.
2480- */
2481- ret = pfm_write_ibr_dbr(0, ctx, NULL, 0, regs);
2482- if (ret) return ret;
2483- }
2484- return 0;
2485-}
2486-
2487-/*
2488- * impl_pmcs, impl_pmds are computed at runtime to minimize errors!
2489- */
2490-static pmu_config_t pmu_conf_ita={
2491- .pmu_name = "Itanium",
2492- .pmu_family = 0x7,
2493- .ovfl_val = (1UL << 32) - 1,
2494- .pmd_desc = pfm_ita_pmd_desc,
2495- .pmc_desc = pfm_ita_pmc_desc,
2496- .num_ibrs = 8,
2497- .num_dbrs = 8,
2498- .use_rr_dbregs = 1, /* debug register are use for range retrictions */
2499-};
2500-
2501-
2502--- a/arch/ia64/kernel/perfmon_mckinley.h
2503+++ /dev/null
2504@@ -1,187 +0,0 @@
2505-/*
2506- * This file contains the McKinley PMU register description tables
2507- * and pmc checker used by perfmon.c.
2508- *
2509- * Copyright (C) 2002-2003 Hewlett Packard Co
2510- * Stephane Eranian <eranian@hpl.hp.com>
2511- */
2512-static int pfm_mck_pmc_check(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs);
2513-
2514-static pfm_reg_desc_t pfm_mck_pmc_desc[PMU_MAX_PMCS]={
2515-/* pmc0 */ { PFM_REG_CONTROL , 0, 0x1UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
2516-/* pmc1 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
2517-/* pmc2 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
2518-/* pmc3 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
2519-/* pmc4 */ { PFM_REG_COUNTING, 6, 0x0000000000800000UL, 0xfffff7fUL, NULL, pfm_mck_pmc_check, {RDEP(4),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
2520-/* pmc5 */ { PFM_REG_COUNTING, 6, 0x0UL, 0xfffff7fUL, NULL, pfm_mck_pmc_check, {RDEP(5),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
2521-/* pmc6 */ { PFM_REG_COUNTING, 6, 0x0UL, 0xfffff7fUL, NULL, pfm_mck_pmc_check, {RDEP(6),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
2522-/* pmc7 */ { PFM_REG_COUNTING, 6, 0x0UL, 0xfffff7fUL, NULL, pfm_mck_pmc_check, {RDEP(7),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
2523-/* pmc8 */ { PFM_REG_CONFIG , 0, 0xffffffff3fffffffUL, 0xffffffff3ffffffbUL, NULL, pfm_mck_pmc_check, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
2524-/* pmc9 */ { PFM_REG_CONFIG , 0, 0xffffffff3ffffffcUL, 0xffffffff3ffffffbUL, NULL, pfm_mck_pmc_check, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
2525-/* pmc10 */ { PFM_REG_MONITOR , 4, 0x0UL, 0xffffUL, NULL, pfm_mck_pmc_check, {RDEP(0)|RDEP(1),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
2526-/* pmc11 */ { PFM_REG_MONITOR , 6, 0x0UL, 0x30f01cf, NULL, pfm_mck_pmc_check, {RDEP(2)|RDEP(3)|RDEP(17),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
2527-/* pmc12 */ { PFM_REG_MONITOR , 6, 0x0UL, 0xffffUL, NULL, pfm_mck_pmc_check, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
2528-/* pmc13 */ { PFM_REG_CONFIG , 0, 0x00002078fefefefeUL, 0x1e00018181818UL, NULL, pfm_mck_pmc_check, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
2529-/* pmc14 */ { PFM_REG_CONFIG , 0, 0x0db60db60db60db6UL, 0x2492UL, NULL, pfm_mck_pmc_check, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
2530-/* pmc15 */ { PFM_REG_CONFIG , 0, 0x00000000fffffff0UL, 0xfUL, NULL, pfm_mck_pmc_check, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}},
2531- { PFM_REG_END , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, /* end marker */
2532-};
2533-
2534-static pfm_reg_desc_t pfm_mck_pmd_desc[PMU_MAX_PMDS]={
2535-/* pmd0 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(1),0UL, 0UL, 0UL}, {RDEP(10),0UL, 0UL, 0UL}},
2536-/* pmd1 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(0),0UL, 0UL, 0UL}, {RDEP(10),0UL, 0UL, 0UL}},
2537-/* pmd2 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(3)|RDEP(17),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}},
2538-/* pmd3 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(2)|RDEP(17),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}},
2539-/* pmd4 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(4),0UL, 0UL, 0UL}},
2540-/* pmd5 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(5),0UL, 0UL, 0UL}},
2541-/* pmd6 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(6),0UL, 0UL, 0UL}},
2542-/* pmd7 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(7),0UL, 0UL, 0UL}},
2543-/* pmd8 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
2544-/* pmd9 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
2545-/* pmd10 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
2546-/* pmd11 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
2547-/* pmd12 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
2548-/* pmd13 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
2549-/* pmd14 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
2550-/* pmd15 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
2551-/* pmd16 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
2552-/* pmd17 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(2)|RDEP(3),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}},
2553- { PFM_REG_END , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, /* end marker */
2554-};
2555-
2556-/*
2557- * PMC reserved fields must have their power-up values preserved
2558- */
2559-static int
2560-pfm_mck_reserved(unsigned int cnum, unsigned long *val, struct pt_regs *regs)
2561-{
2562- unsigned long tmp1, tmp2, ival = *val;
2563-
2564- /* remove reserved areas from user value */
2565- tmp1 = ival & PMC_RSVD_MASK(cnum);
2566-
2567- /* get reserved fields values */
2568- tmp2 = PMC_DFL_VAL(cnum) & ~PMC_RSVD_MASK(cnum);
2569-
2570- *val = tmp1 | tmp2;
2571-
2572- DPRINT(("pmc[%d]=0x%lx, mask=0x%lx, reset=0x%lx, val=0x%lx\n",
2573- cnum, ival, PMC_RSVD_MASK(cnum), PMC_DFL_VAL(cnum), *val));
2574- return 0;
2575-}
2576-
2577-/*
2578- * task can be NULL if the context is unloaded
2579- */
2580-static int
2581-pfm_mck_pmc_check(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs)
2582-{
2583- int ret = 0, check_case1 = 0;
2584- unsigned long val8 = 0, val14 = 0, val13 = 0;
2585- int is_loaded;
2586-
2587- /* first preserve the reserved fields */
2588- pfm_mck_reserved(cnum, val, regs);
2589-
2590- /* sanitfy check */
2591- if (ctx == NULL) return -EINVAL;
2592-
2593- is_loaded = ctx->ctx_state == PFM_CTX_LOADED || ctx->ctx_state == PFM_CTX_MASKED;
2594-
2595- /*
2596- * we must clear the debug registers if pmc13 has a value which enable
2597- * memory pipeline event constraints. In this case we need to clear the
2598- * the debug registers if they have not yet been accessed. This is required
2599- * to avoid picking stale state.
2600- * PMC13 is "active" if:
2601- * one of the pmc13.cfg_dbrpXX field is different from 0x3
2602- * AND
2603- * at the corresponding pmc13.ena_dbrpXX is set.
2604- */
2605- DPRINT(("cnum=%u val=0x%lx, using_dbreg=%d loaded=%d\n", cnum, *val, ctx->ctx_fl_using_dbreg, is_loaded));
2606-
2607- if (cnum == 13 && is_loaded
2608- && (*val & 0x1e00000000000UL) && (*val & 0x18181818UL) != 0x18181818UL && ctx->ctx_fl_using_dbreg == 0) {
2609-
2610- DPRINT(("pmc[%d]=0x%lx has active pmc13 settings, clearing dbr\n", cnum, *val));
2611-
2612- /* don't mix debug with perfmon */
2613- if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL;
2614-
2615- /*
2616- * a count of 0 will mark the debug registers as in use and also
2617- * ensure that they are properly cleared.
2618- */
2619- ret = pfm_write_ibr_dbr(PFM_DATA_RR, ctx, NULL, 0, regs);
2620- if (ret) return ret;
2621- }
2622- /*
2623- * we must clear the (instruction) debug registers if any pmc14.ibrpX bit is enabled
2624- * before they are (fl_using_dbreg==0) to avoid picking up stale information.
2625- */
2626- if (cnum == 14 && is_loaded && ((*val & 0x2222UL) != 0x2222UL) && ctx->ctx_fl_using_dbreg == 0) {
2627-
2628- DPRINT(("pmc[%d]=0x%lx has active pmc14 settings, clearing ibr\n", cnum, *val));
2629-
2630- /* don't mix debug with perfmon */
2631- if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL;
2632-
2633- /*
2634- * a count of 0 will mark the debug registers as in use and also
2635- * ensure that they are properly cleared.
2636- */
2637- ret = pfm_write_ibr_dbr(PFM_CODE_RR, ctx, NULL, 0, regs);
2638- if (ret) return ret;
2639-
2640- }
2641-
2642- switch(cnum) {
2643- case 4: *val |= 1UL << 23; /* force power enable bit */
2644- break;
2645- case 8: val8 = *val;
2646- val13 = ctx->ctx_pmcs[13];
2647- val14 = ctx->ctx_pmcs[14];
2648- check_case1 = 1;
2649- break;
2650- case 13: val8 = ctx->ctx_pmcs[8];
2651- val13 = *val;
2652- val14 = ctx->ctx_pmcs[14];
2653- check_case1 = 1;
2654- break;
2655- case 14: val8 = ctx->ctx_pmcs[8];
2656- val13 = ctx->ctx_pmcs[13];
2657- val14 = *val;
2658- check_case1 = 1;
2659- break;
2660- }
2661- /* check illegal configuration which can produce inconsistencies in tagging
2662- * i-side events in L1D and L2 caches
2663- */
2664- if (check_case1) {
2665- ret = ((val13 >> 45) & 0xf) == 0
2666- && ((val8 & 0x1) == 0)
2667- && ((((val14>>1) & 0x3) == 0x2 || ((val14>>1) & 0x3) == 0x0)
2668- ||(((val14>>4) & 0x3) == 0x2 || ((val14>>4) & 0x3) == 0x0));
2669-
2670- if (ret) DPRINT((KERN_DEBUG "perfmon: failure check_case1\n"));
2671- }
2672-
2673- return ret ? -EINVAL : 0;
2674-}
2675-
2676-/*
2677- * impl_pmcs, impl_pmds are computed at runtime to minimize errors!
2678- */
2679-static pmu_config_t pmu_conf_mck={
2680- .pmu_name = "Itanium 2",
2681- .pmu_family = 0x1f,
2682- .flags = PFM_PMU_IRQ_RESEND,
2683- .ovfl_val = (1UL << 47) - 1,
2684- .pmd_desc = pfm_mck_pmd_desc,
2685- .pmc_desc = pfm_mck_pmc_desc,
2686- .num_ibrs = 8,
2687- .num_dbrs = 8,
2688- .use_rr_dbregs = 1 /* debug register are use for range restrictions */
2689-};
2690-
2691-
2692--- a/arch/ia64/kernel/perfmon_montecito.h
2693+++ /dev/null
2694@@ -1,269 +0,0 @@
2695-/*
2696- * This file contains the Montecito PMU register description tables
2697- * and pmc checker used by perfmon.c.
2698- *
2699- * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P.
2700- * Contributed by Stephane Eranian <eranian@hpl.hp.com>
2701- */
2702-static int pfm_mont_pmc_check(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs);
2703-
2704-#define RDEP_MONT_ETB (RDEP(38)|RDEP(39)|RDEP(48)|RDEP(49)|RDEP(50)|RDEP(51)|RDEP(52)|RDEP(53)|RDEP(54)|\
2705- RDEP(55)|RDEP(56)|RDEP(57)|RDEP(58)|RDEP(59)|RDEP(60)|RDEP(61)|RDEP(62)|RDEP(63))
2706-#define RDEP_MONT_DEAR (RDEP(32)|RDEP(33)|RDEP(36))
2707-#define RDEP_MONT_IEAR (RDEP(34)|RDEP(35))
2708-
2709-static pfm_reg_desc_t pfm_mont_pmc_desc[PMU_MAX_PMCS]={
2710-/* pmc0 */ { PFM_REG_CONTROL , 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {0,0, 0, 0}},
2711-/* pmc1 */ { PFM_REG_CONTROL , 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {0,0, 0, 0}},
2712-/* pmc2 */ { PFM_REG_CONTROL , 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {0,0, 0, 0}},
2713-/* pmc3 */ { PFM_REG_CONTROL , 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {0,0, 0, 0}},
2714-/* pmc4 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(4),0, 0, 0}, {0,0, 0, 0}},
2715-/* pmc5 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(5),0, 0, 0}, {0,0, 0, 0}},
2716-/* pmc6 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(6),0, 0, 0}, {0,0, 0, 0}},
2717-/* pmc7 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(7),0, 0, 0}, {0,0, 0, 0}},
2718-/* pmc8 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(8),0, 0, 0}, {0,0, 0, 0}},
2719-/* pmc9 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(9),0, 0, 0}, {0,0, 0, 0}},
2720-/* pmc10 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(10),0, 0, 0}, {0,0, 0, 0}},
2721-/* pmc11 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(11),0, 0, 0}, {0,0, 0, 0}},
2722-/* pmc12 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(12),0, 0, 0}, {0,0, 0, 0}},
2723-/* pmc13 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(13),0, 0, 0}, {0,0, 0, 0}},
2724-/* pmc14 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(14),0, 0, 0}, {0,0, 0, 0}},
2725-/* pmc15 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(15),0, 0, 0}, {0,0, 0, 0}},
2726-/* pmc16 */ { PFM_REG_NOTIMPL, },
2727-/* pmc17 */ { PFM_REG_NOTIMPL, },
2728-/* pmc18 */ { PFM_REG_NOTIMPL, },
2729-/* pmc19 */ { PFM_REG_NOTIMPL, },
2730-/* pmc20 */ { PFM_REG_NOTIMPL, },
2731-/* pmc21 */ { PFM_REG_NOTIMPL, },
2732-/* pmc22 */ { PFM_REG_NOTIMPL, },
2733-/* pmc23 */ { PFM_REG_NOTIMPL, },
2734-/* pmc24 */ { PFM_REG_NOTIMPL, },
2735-/* pmc25 */ { PFM_REG_NOTIMPL, },
2736-/* pmc26 */ { PFM_REG_NOTIMPL, },
2737-/* pmc27 */ { PFM_REG_NOTIMPL, },
2738-/* pmc28 */ { PFM_REG_NOTIMPL, },
2739-/* pmc29 */ { PFM_REG_NOTIMPL, },
2740-/* pmc30 */ { PFM_REG_NOTIMPL, },
2741-/* pmc31 */ { PFM_REG_NOTIMPL, },
2742-/* pmc32 */ { PFM_REG_CONFIG, 0, 0x30f01ffffffffffUL, 0x30f01ffffffffffUL, NULL, pfm_mont_pmc_check, {0,0, 0, 0}, {0,0, 0, 0}},
2743-/* pmc33 */ { PFM_REG_CONFIG, 0, 0x0, 0x1ffffffffffUL, NULL, pfm_mont_pmc_check, {0,0, 0, 0}, {0,0, 0, 0}},
2744-/* pmc34 */ { PFM_REG_CONFIG, 0, 0xf01ffffffffffUL, 0xf01ffffffffffUL, NULL, pfm_mont_pmc_check, {0,0, 0, 0}, {0,0, 0, 0}},
2745-/* pmc35 */ { PFM_REG_CONFIG, 0, 0x0, 0x1ffffffffffUL, NULL, pfm_mont_pmc_check, {0,0, 0, 0}, {0,0, 0, 0}},
2746-/* pmc36 */ { PFM_REG_CONFIG, 0, 0xfffffff0, 0xf, NULL, pfm_mont_pmc_check, {0,0, 0, 0}, {0,0, 0, 0}},
2747-/* pmc37 */ { PFM_REG_MONITOR, 4, 0x0, 0x3fff, NULL, pfm_mont_pmc_check, {RDEP_MONT_IEAR, 0, 0, 0}, {0, 0, 0, 0}},
2748-/* pmc38 */ { PFM_REG_CONFIG, 0, 0xdb6, 0x2492, NULL, pfm_mont_pmc_check, {0,0, 0, 0}, {0,0, 0, 0}},
2749-/* pmc39 */ { PFM_REG_MONITOR, 6, 0x0, 0xffcf, NULL, pfm_mont_pmc_check, {RDEP_MONT_ETB,0, 0, 0}, {0,0, 0, 0}},
2750-/* pmc40 */ { PFM_REG_MONITOR, 6, 0x2000000, 0xf01cf, NULL, pfm_mont_pmc_check, {RDEP_MONT_DEAR,0, 0, 0}, {0,0, 0, 0}},
2751-/* pmc41 */ { PFM_REG_CONFIG, 0, 0x00002078fefefefeUL, 0x1e00018181818UL, NULL, pfm_mont_pmc_check, {0,0, 0, 0}, {0,0, 0, 0}},
2752-/* pmc42 */ { PFM_REG_MONITOR, 6, 0x0, 0x7ff4f, NULL, pfm_mont_pmc_check, {RDEP_MONT_ETB,0, 0, 0}, {0,0, 0, 0}},
2753- { PFM_REG_END , 0, 0x0, -1, NULL, NULL, {0,}, {0,}}, /* end marker */
2754-};
2755-
2756-static pfm_reg_desc_t pfm_mont_pmd_desc[PMU_MAX_PMDS]={
2757-/* pmd0 */ { PFM_REG_NOTIMPL, },
2758-/* pmd1 */ { PFM_REG_NOTIMPL, },
2759-/* pmd2 */ { PFM_REG_NOTIMPL, },
2760-/* pmd3 */ { PFM_REG_NOTIMPL, },
2761-/* pmd4 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(4),0, 0, 0}},
2762-/* pmd5 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(5),0, 0, 0}},
2763-/* pmd6 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(6),0, 0, 0}},
2764-/* pmd7 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(7),0, 0, 0}},
2765-/* pmd8 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(8),0, 0, 0}},
2766-/* pmd9 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(9),0, 0, 0}},
2767-/* pmd10 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(10),0, 0, 0}},
2768-/* pmd11 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(11),0, 0, 0}},
2769-/* pmd12 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(12),0, 0, 0}},
2770-/* pmd13 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(13),0, 0, 0}},
2771-/* pmd14 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(14),0, 0, 0}},
2772-/* pmd15 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(15),0, 0, 0}},
2773-/* pmd16 */ { PFM_REG_NOTIMPL, },
2774-/* pmd17 */ { PFM_REG_NOTIMPL, },
2775-/* pmd18 */ { PFM_REG_NOTIMPL, },
2776-/* pmd19 */ { PFM_REG_NOTIMPL, },
2777-/* pmd20 */ { PFM_REG_NOTIMPL, },
2778-/* pmd21 */ { PFM_REG_NOTIMPL, },
2779-/* pmd22 */ { PFM_REG_NOTIMPL, },
2780-/* pmd23 */ { PFM_REG_NOTIMPL, },
2781-/* pmd24 */ { PFM_REG_NOTIMPL, },
2782-/* pmd25 */ { PFM_REG_NOTIMPL, },
2783-/* pmd26 */ { PFM_REG_NOTIMPL, },
2784-/* pmd27 */ { PFM_REG_NOTIMPL, },
2785-/* pmd28 */ { PFM_REG_NOTIMPL, },
2786-/* pmd29 */ { PFM_REG_NOTIMPL, },
2787-/* pmd30 */ { PFM_REG_NOTIMPL, },
2788-/* pmd31 */ { PFM_REG_NOTIMPL, },
2789-/* pmd32 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP(33)|RDEP(36),0, 0, 0}, {RDEP(40),0, 0, 0}},
2790-/* pmd33 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP(32)|RDEP(36),0, 0, 0}, {RDEP(40),0, 0, 0}},
2791-/* pmd34 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP(35),0, 0, 0}, {RDEP(37),0, 0, 0}},
2792-/* pmd35 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP(34),0, 0, 0}, {RDEP(37),0, 0, 0}},
2793-/* pmd36 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP(32)|RDEP(33),0, 0, 0}, {RDEP(40),0, 0, 0}},
2794-/* pmd37 */ { PFM_REG_NOTIMPL, },
2795-/* pmd38 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}},
2796-/* pmd39 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}},
2797-/* pmd40 */ { PFM_REG_NOTIMPL, },
2798-/* pmd41 */ { PFM_REG_NOTIMPL, },
2799-/* pmd42 */ { PFM_REG_NOTIMPL, },
2800-/* pmd43 */ { PFM_REG_NOTIMPL, },
2801-/* pmd44 */ { PFM_REG_NOTIMPL, },
2802-/* pmd45 */ { PFM_REG_NOTIMPL, },
2803-/* pmd46 */ { PFM_REG_NOTIMPL, },
2804-/* pmd47 */ { PFM_REG_NOTIMPL, },
2805-/* pmd48 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}},
2806-/* pmd49 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}},
2807-/* pmd50 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}},
2808-/* pmd51 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}},
2809-/* pmd52 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}},
2810-/* pmd53 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}},
2811-/* pmd54 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}},
2812-/* pmd55 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}},
2813-/* pmd56 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}},
2814-/* pmd57 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}},
2815-/* pmd58 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}},
2816-/* pmd59 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}},
2817-/* pmd60 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}},
2818-/* pmd61 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}},
2819-/* pmd62 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}},
2820-/* pmd63 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}},
2821- { PFM_REG_END , 0, 0x0, -1, NULL, NULL, {0,}, {0,}}, /* end marker */
2822-};
2823-
2824-/*
2825- * PMC reserved fields must have their power-up values preserved
2826- */
2827-static int
2828-pfm_mont_reserved(unsigned int cnum, unsigned long *val, struct pt_regs *regs)
2829-{
2830- unsigned long tmp1, tmp2, ival = *val;
2831-
2832- /* remove reserved areas from user value */
2833- tmp1 = ival & PMC_RSVD_MASK(cnum);
2834-
2835- /* get reserved fields values */
2836- tmp2 = PMC_DFL_VAL(cnum) & ~PMC_RSVD_MASK(cnum);
2837-
2838- *val = tmp1 | tmp2;
2839-
2840- DPRINT(("pmc[%d]=0x%lx, mask=0x%lx, reset=0x%lx, val=0x%lx\n",
2841- cnum, ival, PMC_RSVD_MASK(cnum), PMC_DFL_VAL(cnum), *val));
2842- return 0;
2843-}
2844-
2845-/*
2846- * task can be NULL if the context is unloaded
2847- */
2848-static int
2849-pfm_mont_pmc_check(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs)
2850-{
2851- int ret = 0;
2852- unsigned long val32 = 0, val38 = 0, val41 = 0;
2853- unsigned long tmpval;
2854- int check_case1 = 0;
2855- int is_loaded;
2856-
2857- /* first preserve the reserved fields */
2858- pfm_mont_reserved(cnum, val, regs);
2859-
2860- tmpval = *val;
2861-
2862- /* sanity check */
2863- if (ctx == NULL) return -EINVAL;
2864-
2865- is_loaded = ctx->ctx_state == PFM_CTX_LOADED || ctx->ctx_state == PFM_CTX_MASKED;
2866-
2867- /*
2868- * we must clear the debug registers if pmc41 has a value which enable
2869- * memory pipeline event constraints. In this case we need to clear the
2870- * the debug registers if they have not yet been accessed. This is required
2871- * to avoid picking stale state.
2872- * PMC41 is "active" if:
2873- * one of the pmc41.cfg_dtagXX field is different from 0x3
2874- * AND
2875- * at the corresponding pmc41.en_dbrpXX is set.
2876- * AND
2877- * ctx_fl_using_dbreg == 0 (i.e., dbr not yet used)
2878- */
2879- DPRINT(("cnum=%u val=0x%lx, using_dbreg=%d loaded=%d\n", cnum, tmpval, ctx->ctx_fl_using_dbreg, is_loaded));
2880-
2881- if (cnum == 41 && is_loaded
2882- && (tmpval & 0x1e00000000000UL) && (tmpval & 0x18181818UL) != 0x18181818UL && ctx->ctx_fl_using_dbreg == 0) {
2883-
2884- DPRINT(("pmc[%d]=0x%lx has active pmc41 settings, clearing dbr\n", cnum, tmpval));
2885-
2886- /* don't mix debug with perfmon */
2887- if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL;
2888-
2889- /*
2890- * a count of 0 will mark the debug registers if:
2891- * AND
2892- */
2893- ret = pfm_write_ibr_dbr(PFM_DATA_RR, ctx, NULL, 0, regs);
2894- if (ret) return ret;
2895- }
2896- /*
2897- * we must clear the (instruction) debug registers if:
2898- * pmc38.ig_ibrpX is 0 (enabled)
2899- * AND
2900- * ctx_fl_using_dbreg == 0 (i.e., dbr not yet used)
2901- */
2902- if (cnum == 38 && is_loaded && ((tmpval & 0x492UL) != 0x492UL) && ctx->ctx_fl_using_dbreg == 0) {
2903-
2904- DPRINT(("pmc38=0x%lx has active pmc38 settings, clearing ibr\n", tmpval));
2905-
2906- /* don't mix debug with perfmon */
2907- if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL;
2908-
2909- /*
2910- * a count of 0 will mark the debug registers as in use and also
2911- * ensure that they are properly cleared.
2912- */
2913- ret = pfm_write_ibr_dbr(PFM_CODE_RR, ctx, NULL, 0, regs);
2914- if (ret) return ret;
2915-
2916- }
2917- switch(cnum) {
2918- case 32: val32 = *val;
2919- val38 = ctx->ctx_pmcs[38];
2920- val41 = ctx->ctx_pmcs[41];
2921- check_case1 = 1;
2922- break;
2923- case 38: val38 = *val;
2924- val32 = ctx->ctx_pmcs[32];
2925- val41 = ctx->ctx_pmcs[41];
2926- check_case1 = 1;
2927- break;
2928- case 41: val41 = *val;
2929- val32 = ctx->ctx_pmcs[32];
2930- val38 = ctx->ctx_pmcs[38];
2931- check_case1 = 1;
2932- break;
2933- }
2934- /* check illegal configuration which can produce inconsistencies in tagging
2935- * i-side events in L1D and L2 caches
2936- */
2937- if (check_case1) {
2938- ret = (((val41 >> 45) & 0xf) == 0 && ((val32>>57) & 0x1) == 0)
2939- && ((((val38>>1) & 0x3) == 0x2 || ((val38>>1) & 0x3) == 0)
2940- || (((val38>>4) & 0x3) == 0x2 || ((val38>>4) & 0x3) == 0));
2941- if (ret) {
2942- DPRINT(("invalid config pmc38=0x%lx pmc41=0x%lx pmc32=0x%lx\n", val38, val41, val32));
2943- return -EINVAL;
2944- }
2945- }
2946- *val = tmpval;
2947- return 0;
2948-}
2949-
2950-/*
2951- * impl_pmcs, impl_pmds are computed at runtime to minimize errors!
2952- */
2953-static pmu_config_t pmu_conf_mont={
2954- .pmu_name = "Montecito",
2955- .pmu_family = 0x20,
2956- .flags = PFM_PMU_IRQ_RESEND,
2957- .ovfl_val = (1UL << 47) - 1,
2958- .pmd_desc = pfm_mont_pmd_desc,
2959- .pmc_desc = pfm_mont_pmc_desc,
2960- .num_ibrs = 8,
2961- .num_dbrs = 8,
2962- .use_rr_dbregs = 1 /* debug register are use for range retrictions */
2963-};
2964--- a/arch/ia64/kernel/process.c
2965+++ b/arch/ia64/kernel/process.c
2966@@ -28,6 +28,7 @@
2967 #include <linux/delay.h>
2968 #include <linux/kdebug.h>
2969 #include <linux/utsname.h>
2970+#include <linux/perfmon_kern.h>
2971
2972 #include <asm/cpu.h>
2973 #include <asm/delay.h>
2974@@ -45,10 +46,6 @@
2975
2976 #include "entry.h"
2977
2978-#ifdef CONFIG_PERFMON
2979-# include <asm/perfmon.h>
2980-#endif
2981-
2982 #include "sigframe.h"
2983
2984 void (*ia64_mark_idle)(int);
2985@@ -162,10 +159,8 @@ show_regs (struct pt_regs *regs)
2986
2987 void tsk_clear_notify_resume(struct task_struct *tsk)
2988 {
2989-#ifdef CONFIG_PERFMON
2990- if (tsk->thread.pfm_needs_checking)
2991+ if (test_ti_thread_flag(task_thread_info(tsk), TIF_PERFMON_WORK))
2992 return;
2993-#endif
2994 if (test_ti_thread_flag(task_thread_info(tsk), TIF_RESTORE_RSE))
2995 return;
2996 clear_ti_thread_flag(task_thread_info(tsk), TIF_NOTIFY_RESUME);
2997@@ -188,14 +183,9 @@ do_notify_resume_user(sigset_t *unused,
2998 return;
2999 }
3000
3001-#ifdef CONFIG_PERFMON
3002- if (current->thread.pfm_needs_checking)
3003- /*
3004- * Note: pfm_handle_work() allow us to call it with interrupts
3005- * disabled, and may enable interrupts within the function.
3006- */
3007- pfm_handle_work();
3008-#endif
3009+ /* process perfmon asynchronous work (e.g. block thread or reset) */
3010+ if (test_thread_flag(TIF_PERFMON_WORK))
3011+ pfm_handle_work(task_pt_regs(current));
3012
3013 /* deal with pending signal delivery */
3014 if (test_thread_flag(TIF_SIGPENDING)) {
3015@@ -212,22 +202,15 @@ do_notify_resume_user(sigset_t *unused,
3016 local_irq_disable(); /* force interrupt disable */
3017 }
3018
3019-static int pal_halt = 1;
3020 static int can_do_pal_halt = 1;
3021
3022 static int __init nohalt_setup(char * str)
3023 {
3024- pal_halt = can_do_pal_halt = 0;
3025+ can_do_pal_halt = 0;
3026 return 1;
3027 }
3028 __setup("nohalt", nohalt_setup);
3029
3030-void
3031-update_pal_halt_status(int status)
3032-{
3033- can_do_pal_halt = pal_halt && status;
3034-}
3035-
3036 /*
3037 * We use this if we don't have any better idle routine..
3038 */
3039@@ -236,6 +219,22 @@ default_idle (void)
3040 {
3041 local_irq_enable();
3042 while (!need_resched()) {
3043+#ifdef CONFIG_PERFMON
3044+ u64 psr = 0;
3045+ /*
3046+ * If requested, we stop the PMU to avoid
3047+ * measuring across the core idle loop.
3048+ *
3049+ * dcr.pp is not modified on purpose;
3050+ * it is used when coming out of
3051+ * safe_halt() via interrupt
3052+ */
3053+ if ((__get_cpu_var(pfm_syst_info) & PFM_ITA_CPUINFO_IDLE_EXCL)) {
3054+ psr = ia64_getreg(_IA64_REG_PSR);
3055+ if (psr & IA64_PSR_PP)
3056+ ia64_rsm(IA64_PSR_PP);
3057+ }
3058+#endif
3059 if (can_do_pal_halt) {
3060 local_irq_disable();
3061 if (!need_resched()) {
3062@@ -244,6 +243,12 @@ default_idle (void)
3063 local_irq_enable();
3064 } else
3065 cpu_relax();
3066+#ifdef CONFIG_PERFMON
3067+ if ((__get_cpu_var(pfm_syst_info) & PFM_ITA_CPUINFO_IDLE_EXCL)) {
3068+ if (psr & IA64_PSR_PP)
3069+ ia64_ssm(IA64_PSR_PP);
3070+ }
3071+#endif
3072 }
3073 }
3074
3075@@ -344,22 +349,9 @@ cpu_idle (void)
3076 void
3077 ia64_save_extra (struct task_struct *task)
3078 {
3079-#ifdef CONFIG_PERFMON
3080- unsigned long info;
3081-#endif
3082-
3083 if ((task->thread.flags & IA64_THREAD_DBG_VALID) != 0)
3084 ia64_save_debug_regs(&task->thread.dbr[0]);
3085
3086-#ifdef CONFIG_PERFMON
3087- if ((task->thread.flags & IA64_THREAD_PM_VALID) != 0)
3088- pfm_save_regs(task);
3089-
3090- info = __get_cpu_var(pfm_syst_info);
3091- if (info & PFM_CPUINFO_SYST_WIDE)
3092- pfm_syst_wide_update_task(task, info, 0);
3093-#endif
3094-
3095 #ifdef CONFIG_IA32_SUPPORT
3096 if (IS_IA32_PROCESS(task_pt_regs(task)))
3097 ia32_save_state(task);
3098@@ -369,22 +361,9 @@ ia64_save_extra (struct task_struct *tas
3099 void
3100 ia64_load_extra (struct task_struct *task)
3101 {
3102-#ifdef CONFIG_PERFMON
3103- unsigned long info;
3104-#endif
3105-
3106 if ((task->thread.flags & IA64_THREAD_DBG_VALID) != 0)
3107 ia64_load_debug_regs(&task->thread.dbr[0]);
3108
3109-#ifdef CONFIG_PERFMON
3110- if ((task->thread.flags & IA64_THREAD_PM_VALID) != 0)
3111- pfm_load_regs(task);
3112-
3113- info = __get_cpu_var(pfm_syst_info);
3114- if (info & PFM_CPUINFO_SYST_WIDE)
3115- pfm_syst_wide_update_task(task, info, 1);
3116-#endif
3117-
3118 #ifdef CONFIG_IA32_SUPPORT
3119 if (IS_IA32_PROCESS(task_pt_regs(task)))
3120 ia32_load_state(task);
3121@@ -510,8 +489,7 @@ copy_thread (int nr, unsigned long clone
3122 * call behavior where scratch registers are preserved across
3123 * system calls (unless used by the system call itself).
3124 */
3125-# define THREAD_FLAGS_TO_CLEAR (IA64_THREAD_FPH_VALID | IA64_THREAD_DBG_VALID \
3126- | IA64_THREAD_PM_VALID)
3127+# define THREAD_FLAGS_TO_CLEAR (IA64_THREAD_FPH_VALID | IA64_THREAD_DBG_VALID)
3128 # define THREAD_FLAGS_TO_SET 0
3129 p->thread.flags = ((current->thread.flags & ~THREAD_FLAGS_TO_CLEAR)
3130 | THREAD_FLAGS_TO_SET);
3131@@ -533,10 +511,8 @@ copy_thread (int nr, unsigned long clone
3132 }
3133 #endif
3134
3135-#ifdef CONFIG_PERFMON
3136- if (current->thread.pfm_context)
3137- pfm_inherit(p, child_ptregs);
3138-#endif
3139+ pfm_copy_thread(p);
3140+
3141 return retval;
3142 }
3143
3144@@ -745,15 +721,13 @@ exit_thread (void)
3145 {
3146
3147 ia64_drop_fpu(current);
3148-#ifdef CONFIG_PERFMON
3149- /* if needed, stop monitoring and flush state to perfmon context */
3150- if (current->thread.pfm_context)
3151- pfm_exit_thread(current);
3152+
3153+ /* if needed, stop monitoring and flush state to perfmon context */
3154+ pfm_exit_thread();
3155
3156 /* free debug register resources */
3157- if (current->thread.flags & IA64_THREAD_DBG_VALID)
3158- pfm_release_debug_registers(current);
3159-#endif
3160+ pfm_release_dbregs(current);
3161+
3162 if (IS_IA32_PROCESS(task_pt_regs(current)))
3163 ia32_drop_ia64_partial_page_list(current);
3164 }
3165--- a/arch/ia64/kernel/ptrace.c
3166+++ b/arch/ia64/kernel/ptrace.c
3167@@ -20,6 +20,7 @@
3168 #include <linux/security.h>
3169 #include <linux/audit.h>
3170 #include <linux/signal.h>
3171+#include <linux/perfmon_kern.h>
3172 #include <linux/regset.h>
3173 #include <linux/elf.h>
3174
3175@@ -30,9 +31,6 @@
3176 #include <asm/system.h>
3177 #include <asm/uaccess.h>
3178 #include <asm/unwind.h>
3179-#ifdef CONFIG_PERFMON
3180-#include <asm/perfmon.h>
3181-#endif
3182
3183 #include "entry.h"
3184
3185@@ -2124,7 +2122,6 @@ access_uarea(struct task_struct *child,
3186 "address 0x%lx\n", addr);
3187 return -1;
3188 }
3189-#ifdef CONFIG_PERFMON
3190 /*
3191 * Check if debug registers are used by perfmon. This
3192 * test must be done once we know that we can do the
3193@@ -2142,9 +2139,8 @@ access_uarea(struct task_struct *child,
3194 * IA64_THREAD_DBG_VALID. The registers are restored
3195 * by the PMU context switch code.
3196 */
3197- if (pfm_use_debug_registers(child))
3198+ if (pfm_use_dbregs(child))
3199 return -1;
3200-#endif
3201
3202 if (!(child->thread.flags & IA64_THREAD_DBG_VALID)) {
3203 child->thread.flags |= IA64_THREAD_DBG_VALID;
3204--- a/arch/ia64/kernel/setup.c
3205+++ b/arch/ia64/kernel/setup.c
3206@@ -45,6 +45,7 @@
3207 #include <linux/cpufreq.h>
3208 #include <linux/kexec.h>
3209 #include <linux/crash_dump.h>
3210+#include <linux/perfmon_kern.h>
3211
3212 #include <asm/ia32.h>
3213 #include <asm/machvec.h>
3214@@ -1052,6 +1053,8 @@ cpu_init (void)
3215 }
3216 platform_cpu_init();
3217 pm_idle = default_idle;
3218+
3219+ pfm_init_percpu();
3220 }
3221
3222 void __init
3223--- a/arch/ia64/kernel/smpboot.c
3224+++ b/arch/ia64/kernel/smpboot.c
3225@@ -39,6 +39,7 @@
3226 #include <linux/efi.h>
3227 #include <linux/percpu.h>
3228 #include <linux/bitops.h>
3229+#include <linux/perfmon_kern.h>
3230
3231 #include <asm/atomic.h>
3232 #include <asm/cache.h>
3233@@ -381,10 +382,6 @@ smp_callin (void)
3234 extern void ia64_init_itm(void);
3235 extern volatile int time_keeper_id;
3236
3237-#ifdef CONFIG_PERFMON
3238- extern void pfm_init_percpu(void);
3239-#endif
3240-
3241 cpuid = smp_processor_id();
3242 phys_id = hard_smp_processor_id();
3243 itc_master = time_keeper_id;
3244@@ -410,10 +407,6 @@ smp_callin (void)
3245
3246 ia64_mca_cmc_vector_setup(); /* Setup vector on AP */
3247
3248-#ifdef CONFIG_PERFMON
3249- pfm_init_percpu();
3250-#endif
3251-
3252 local_irq_enable();
3253
3254 if (!(sal_platform_features & IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT)) {
3255@@ -749,6 +742,7 @@ int __cpu_disable(void)
3256 fixup_irqs();
3257 local_flush_tlb_all();
3258 cpu_clear(cpu, cpu_callin_map);
3259+ pfm_cpu_disable();
3260 return 0;
3261 }
3262
3263--- a/arch/ia64/kernel/sys_ia64.c
3264+++ b/arch/ia64/kernel/sys_ia64.c
3265@@ -293,3 +293,11 @@ sys_pciconfig_write (unsigned long bus,
3266 }
3267
3268 #endif /* CONFIG_PCI */
3269+
3270+#ifndef CONFIG_IA64_PERFMON_COMPAT
3271+asmlinkage long
3272+sys_perfmonctl (int fd, int cmd, void __user *arg, int count)
3273+{
3274+ return -ENOSYS;
3275+}
3276+#endif
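With CONFIG_IA64_PERFMON_COMPAT disabled, the stub above makes the legacy perfmonctl() system call fail with ENOSYS, so an old perfmon-2 tool can detect the situation and fall back to the new sys_pfm_*() interface. A minimal user-space sketch of such a probe (the prototype and probe arguments are illustrative only; the only thing relied upon is the ENOSYS return):

#include <errno.h>
#include <stddef.h>

/* historical IA-64 wrapper; prototype shown here only for illustration */
extern long perfmonctl(int fd, int cmd, void *arg, int narg);

static int old_perfmon_available(void)
{
	/* any probe will do: ENOSYS means the stub above was compiled in */
	if (perfmonctl(-1, 0, NULL, 0) == -1 && errno == ENOSYS)
		return 0;	/* compat interface not built into this kernel */
	return 1;		/* call failed differently: the syscall exists */
}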
3277--- a/arch/ia64/lib/Makefile
3278+++ b/arch/ia64/lib/Makefile
3279@@ -13,7 +13,6 @@ lib-y := __divsi3.o __udivsi3.o __modsi3
3280
3281 obj-$(CONFIG_ITANIUM) += copy_page.o copy_user.o memcpy.o
3282 obj-$(CONFIG_MCKINLEY) += copy_page_mck.o memcpy_mck.o
3283-lib-$(CONFIG_PERFMON) += carta_random.o
3284
3285 AFLAGS___divdi3.o =
3286 AFLAGS___udivdi3.o = -DUNSIGNED
3287--- a/arch/ia64/oprofile/init.c
3288+++ b/arch/ia64/oprofile/init.c
3289@@ -12,8 +12,8 @@
3290 #include <linux/init.h>
3291 #include <linux/errno.h>
3292
3293-extern int perfmon_init(struct oprofile_operations * ops);
3294-extern void perfmon_exit(void);
3295+extern int op_perfmon_init(struct oprofile_operations * ops);
3296+extern void op_perfmon_exit(void);
3297 extern void ia64_backtrace(struct pt_regs * const regs, unsigned int depth);
3298
3299 int __init oprofile_arch_init(struct oprofile_operations * ops)
3300@@ -22,7 +22,7 @@ int __init oprofile_arch_init(struct opr
3301
3302 #ifdef CONFIG_PERFMON
3303 /* perfmon_init() can fail, but we have no way to report it */
3304- ret = perfmon_init(ops);
3305+ ret = op_perfmon_init(ops);
3306 #endif
3307 ops->backtrace = ia64_backtrace;
3308
3309@@ -33,6 +33,6 @@ int __init oprofile_arch_init(struct opr
3310 void oprofile_arch_exit(void)
3311 {
3312 #ifdef CONFIG_PERFMON
3313- perfmon_exit();
3314+ op_perfmon_exit();
3315 #endif
3316 }
3317--- a/arch/ia64/oprofile/perfmon.c
3318+++ b/arch/ia64/oprofile/perfmon.c
3319@@ -10,25 +10,30 @@
3320 #include <linux/kernel.h>
3321 #include <linux/oprofile.h>
3322 #include <linux/sched.h>
3323-#include <asm/perfmon.h>
3324+#include <linux/module.h>
3325+#include <linux/perfmon_kern.h>
3326 #include <asm/ptrace.h>
3327 #include <asm/errno.h>
3328
3329 static int allow_ints;
3330
3331 static int
3332-perfmon_handler(struct task_struct *task, void *buf, pfm_ovfl_arg_t *arg,
3333- struct pt_regs *regs, unsigned long stamp)
3334+perfmon_handler(struct pfm_context *ctx,
3335+ unsigned long ip, u64 stamp, void *data)
3336 {
3337- int event = arg->pmd_eventid;
3338+ struct pt_regs *regs;
3339+ struct pfm_ovfl_arg *arg;
3340+
3341+ regs = data;
3342+ arg = &ctx->ovfl_arg;
3343
3344- arg->ovfl_ctrl.bits.reset_ovfl_pmds = 1;
3345+ arg->ovfl_ctrl = PFM_OVFL_CTRL_RESET;
3346
3347 /* the owner of the oprofile event buffer may have exited
3348 * without perfmon being shutdown (e.g. SIGSEGV)
3349 */
3350 if (allow_ints)
3351- oprofile_add_sample(regs, event);
3352+ oprofile_add_sample(regs, arg->pmd_eventid);
3353 return 0;
3354 }
3355
3356@@ -45,17 +50,13 @@ static void perfmon_stop(void)
3357 allow_ints = 0;
3358 }
3359
3360-
3361-#define OPROFILE_FMT_UUID { \
3362- 0x77, 0x7a, 0x6e, 0x61, 0x20, 0x65, 0x73, 0x69, 0x74, 0x6e, 0x72, 0x20, 0x61, 0x65, 0x0a, 0x6c }
3363-
3364-static pfm_buffer_fmt_t oprofile_fmt = {
3365- .fmt_name = "oprofile_format",
3366- .fmt_uuid = OPROFILE_FMT_UUID,
3367- .fmt_handler = perfmon_handler,
3368+static struct pfm_smpl_fmt oprofile_fmt = {
3369+ .fmt_name = "OProfile",
3370+ .fmt_handler = perfmon_handler,
3371+ .fmt_flags = PFM_FMT_BUILTIN_FLAG,
3372+ .owner = THIS_MODULE
3373 };
3374
3375-
3376 static char * get_cpu_type(void)
3377 {
3378 __u8 family = local_cpu_data->family;
3379@@ -75,9 +76,9 @@ static char * get_cpu_type(void)
3380
3381 static int using_perfmon;
3382
3383-int perfmon_init(struct oprofile_operations * ops)
3384+int __init op_perfmon_init(struct oprofile_operations * ops)
3385 {
3386- int ret = pfm_register_buffer_fmt(&oprofile_fmt);
3387+ int ret = pfm_fmt_register(&oprofile_fmt);
3388 if (ret)
3389 return -ENODEV;
3390
3391@@ -90,10 +91,10 @@ int perfmon_init(struct oprofile_operati
3392 }
3393
3394
3395-void perfmon_exit(void)
3396+void op_perfmon_exit(void)
3397 {
3398 if (!using_perfmon)
3399 return;
3400
3401- pfm_unregister_buffer_fmt(oprofile_fmt.fmt_uuid);
3402+ pfm_fmt_unregister(&oprofile_fmt);
3403 }
3404--- /dev/null
3405+++ b/arch/ia64/perfmon/Kconfig
3406@@ -0,0 +1,67 @@
3407+menu "Hardware Performance Monitoring support"
3408+config PERFMON
3409+ bool "Perfmon2 performance monitoring interface"
3410+ default n
3411+ help
3412+ Enables the perfmon2 interface to access the hardware
3413+ performance counters. See <http://perfmon2.sf.net/> for
3414+ more details.
3415+
3416+config PERFMON_DEBUG
3417+ bool "Perfmon debugging"
3418+ default n
3419+ depends on PERFMON
3420+ help
3421+ Enables perfmon debugging support
3422+
3423+config PERFMON_DEBUG_FS
3424+ bool "Enable perfmon statistics reporting via debugfs"
3425+ default y
3426+ depends on PERFMON && DEBUG_FS
3427+ help
3428+ Enable collection and reporting of perfmon timing statistics under
3429+ debugfs. This is used for debugging and performance analysis of the
3430+ subsystem. The debugfs filesystem must be mounted.
3431+
3432+config IA64_PERFMON_COMPAT
2433+ bool "Enable old perfmon-2 compatibility mode"
3434+ default n
3435+ depends on PERFMON
3436+ help
3437+ Enable this option to allow performance tools which used the old
3438+ perfmon-2 interface to continue to work. Old tools are those using
3439+ the obsolete commands and arguments. Check your programs and look
3440+ in include/asm-ia64/perfmon_compat.h for more information.
3441+
3442+config IA64_PERFMON_GENERIC
3443+ tristate "Generic IA-64 PMU support"
3444+ depends on PERFMON
3445+ default n
3446+ help
3447+ Enables generic IA-64 PMU support.
3448+ The generic PMU is defined by the IA-64 architecture document.
3449+ This option should only be necessary when running with a PMU that
2450+ is not yet explicitly supported. Even then, there is no guarantee
3451+ that this support will work.
3452+
3453+config IA64_PERFMON_ITANIUM
3454+ tristate "Itanium (Merced) Performance Monitoring support"
3455+ depends on PERFMON
3456+ default n
3457+ help
3458+ Enables Itanium (Merced) PMU support.
3459+
3460+config IA64_PERFMON_MCKINLEY
3461+ tristate "Itanium 2 (McKinley) Performance Monitoring support"
3462+ depends on PERFMON
3463+ default n
3464+ help
3465+ Enables Itanium 2 (McKinley, Madison, Deerfield) PMU support.
3466+
3467+config IA64_PERFMON_MONTECITO
3468+ tristate "Itanium 2 9000 (Montecito) Performance Monitoring support"
3469+ depends on PERFMON
3470+ default n
3471+ help
3472+ Enables support for Itanium 2 9000 (Montecito) PMU.
3473+endmenu
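As an illustration of how these options combine, a plausible .config fragment for a kernel that builds the core interface, keeps the compat layer, and ships the Montecito PMU description as a module might read (a sketch only, not something shipped with this patch):

CONFIG_PERFMON=y
# CONFIG_PERFMON_DEBUG is not set
CONFIG_PERFMON_DEBUG_FS=y
CONFIG_IA64_PERFMON_COMPAT=y
CONFIG_IA64_PERFMON_MONTECITO=m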
3474--- /dev/null
3475+++ b/arch/ia64/perfmon/Makefile
3476@@ -0,0 +1,11 @@
3477+#
3478+# Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P.
3479+# Contributed by Stephane Eranian <eranian@hpl.hp.com>
3480+#
3481+obj-$(CONFIG_PERFMON) += perfmon.o
3482+obj-$(CONFIG_IA64_PERFMON_COMPAT) += perfmon_default_smpl.o \
3483+ perfmon_compat.o
3484+obj-$(CONFIG_IA64_PERFMON_GENERIC) += perfmon_generic.o
3485+obj-$(CONFIG_IA64_PERFMON_ITANIUM) += perfmon_itanium.o
3486+obj-$(CONFIG_IA64_PERFMON_MCKINLEY) += perfmon_mckinley.o
3487+obj-$(CONFIG_IA64_PERFMON_MONTECITO) += perfmon_montecito.o
3488--- /dev/null
3489+++ b/arch/ia64/perfmon/perfmon.c
3490@@ -0,0 +1,946 @@
3491+/*
3492+ * This file implements the IA-64 specific
3493+ * support for the perfmon2 interface
3494+ *
3495+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
3496+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
3497+ *
3498+ * This program is free software; you can redistribute it and/or
3499+ * modify it under the terms of version 2 of the GNU General Public
3500+ * License as published by the Free Software Foundation.
3501+ *
3502+ * This program is distributed in the hope that it will be useful,
3503+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
3504+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
3505+ * General Public License for more details.
3506+ *
3507+ * You should have received a copy of the GNU General Public License
3508+ * along with this program; if not, write to the Free Software
3509+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
3510+ * 02111-1307 USA
3511+ */
3512+#include <linux/module.h>
3513+#include <linux/perfmon_kern.h>
3514+
3515+struct pfm_arch_session {
3516+ u32 pfs_sys_use_dbr; /* syswide session uses dbr */
3517+ u32 pfs_ptrace_use_dbr; /* a thread uses dbr via ptrace()*/
3518+};
3519+
3520+DEFINE_PER_CPU(u32, pfm_syst_info);
3521+
3522+static struct pfm_arch_session pfm_arch_sessions;
3523+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pfm_arch_sessions_lock);
3524+
3525+static inline void pfm_clear_psr_pp(void)
3526+{
3527+ ia64_rsm(IA64_PSR_PP);
3528+}
3529+
3530+static inline void pfm_set_psr_pp(void)
3531+{
3532+ ia64_ssm(IA64_PSR_PP);
3533+}
3534+
3535+static inline void pfm_clear_psr_up(void)
3536+{
3537+ ia64_rsm(IA64_PSR_UP);
3538+}
3539+
3540+static inline void pfm_set_psr_up(void)
3541+{
3542+ ia64_ssm(IA64_PSR_UP);
3543+}
3544+
3545+static inline void pfm_set_psr_l(u64 val)
3546+{
3547+ ia64_setreg(_IA64_REG_PSR_L, val);
3548+}
3549+
3550+static inline void pfm_restore_ibrs(u64 *ibrs, unsigned int nibrs)
3551+{
3552+ unsigned int i;
3553+
3554+ for (i = 0; i < nibrs; i++) {
3555+ ia64_set_ibr(i, ibrs[i]);
3556+ ia64_dv_serialize_instruction();
3557+ }
3558+ ia64_srlz_i();
3559+}
3560+
3561+static inline void pfm_restore_dbrs(u64 *dbrs, unsigned int ndbrs)
3562+{
3563+ unsigned int i;
3564+
3565+ for (i = 0; i < ndbrs; i++) {
3566+ ia64_set_dbr(i, dbrs[i]);
3567+ ia64_dv_serialize_data();
3568+ }
3569+ ia64_srlz_d();
3570+}
3571+
3572+irqreturn_t pmu_interrupt_handler(int irq, void *arg)
3573+{
3574+ struct pt_regs *regs;
3575+ regs = get_irq_regs();
3576+ irq_enter();
3577+ pfm_interrupt_handler(instruction_pointer(regs), regs);
3578+ irq_exit();
3579+ return IRQ_HANDLED;
3580+}
3581+static struct irqaction perfmon_irqaction = {
3582+ .handler = pmu_interrupt_handler,
3583+ .flags = IRQF_DISABLED, /* means keep interrupts masked */
3584+ .name = "perfmon"
3585+};
3586+
3587+void pfm_arch_quiesce_pmu_percpu(void)
3588+{
3589+ u64 dcr;
3590+ /*
3591+ * make sure no measurement is active
3592+ * (may inherit programmed PMCs from EFI).
3593+ */
3594+ pfm_clear_psr_pp();
3595+ pfm_clear_psr_up();
3596+
3597+ /*
3598+ * ensure dcr.pp is cleared
3599+ */
3600+ dcr = ia64_getreg(_IA64_REG_CR_DCR);
3601+ ia64_setreg(_IA64_REG_CR_DCR, dcr & ~IA64_DCR_PP);
3602+
3603+ /*
3604+ * we run with the PMU not frozen at all times
3605+ */
3606+ ia64_set_pmc(0, 0);
3607+ ia64_srlz_d();
3608+}
3609+
3610+void pfm_arch_init_percpu(void)
3611+{
3612+ pfm_arch_quiesce_pmu_percpu();
3613+ /*
3614+ * program PMU interrupt vector
3615+ */
3616+ ia64_setreg(_IA64_REG_CR_PMV, IA64_PERFMON_VECTOR);
3617+ ia64_srlz_d();
3618+}
3619+
3620+int pfm_arch_context_create(struct pfm_context *ctx, u32 ctx_flags)
3621+{
3622+ struct pfm_arch_context *ctx_arch;
3623+
3624+ ctx_arch = pfm_ctx_arch(ctx);
3625+
3626+ ctx_arch->flags.use_dbr = 0;
3627+ ctx_arch->flags.insecure = (ctx_flags & PFM_ITA_FL_INSECURE) ? 1: 0;
3628+
3629+ PFM_DBG("insecure=%d", ctx_arch->flags.insecure);
3630+
3631+ return 0;
3632+}
3633+
3634+/*
3635+ * Called from pfm_ctxsw(). Task is guaranteed to be current.
3636+ * Context is locked. Interrupts are masked. Monitoring may be active.
3637+ * PMU access is guaranteed. PMC and PMD registers are live in PMU.
3638+ *
3639+ * Return:
3640+ * non-zero : did not save PMDs (as part of stopping the PMU)
3641+ * 0 : saved PMDs (no need to save them in caller)
3642+ */
3643+int pfm_arch_ctxswout_thread(struct task_struct *task, struct pfm_context *ctx)
3644+{
3645+ struct pfm_arch_context *ctx_arch;
3646+ struct pfm_event_set *set;
3647+ u64 psr, tmp;
3648+
3649+ ctx_arch = pfm_ctx_arch(ctx);
3650+ set = ctx->active_set;
3651+
3652+ /*
3653+ * save current PSR: needed because we modify it
3654+ */
3655+ ia64_srlz_d();
3656+ psr = ia64_getreg(_IA64_REG_PSR);
3657+
3658+ /*
3659+ * stop monitoring:
3660+ * This is the last instruction which may generate an overflow
3661+ *
3662+ * we do not clear ipsr.up
3663+ */
3664+ pfm_clear_psr_up();
3665+ ia64_srlz_d();
3666+
3667+ /*
3668+ * extract overflow status bits
3669+ */
3670+ tmp = ia64_get_pmc(0) & ~0xf;
3671+
3672+ /*
3673+ * keep a copy of psr.up (for reload)
3674+ */
3675+ ctx_arch->ctx_saved_psr_up = psr & IA64_PSR_UP;
3676+
3677+ /*
3678+ * save overflow status bits
3679+ */
3680+ set->povfl_pmds[0] = tmp;
3681+
3682+ /*
3683+ * record how many pending overflows
3684+ * XXX: assume identity mapping for counters
3685+ */
3686+ set->npend_ovfls = ia64_popcnt(tmp);
3687+
3688+ /*
3689+ * make sure the PMU is unfrozen for the next task
3690+ */
3691+ if (set->npend_ovfls) {
3692+ ia64_set_pmc(0, 0);
3693+ ia64_srlz_d();
3694+ }
3695+ return 1;
3696+}
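The contract described above means the generic context-switch path must save the PMDs itself whenever this function returns non-zero. A minimal sketch of that caller side, assuming a generic helper named pfm_save_pmds() that writes back the active set's PMDs:

/* sketch of the generic ctxsw-out path implied by the contract above */
static void ctxswout_sketch(struct task_struct *task, struct pfm_context *ctx)
{
	/*
	 * non-zero return: the PMU was stopped but the PMD values are
	 * still live in hardware, so the caller saves them itself
	 */
	if (pfm_arch_ctxswout_thread(task, ctx))
		pfm_save_pmds(ctx, ctx->active_set);	/* assumed generic helper */
}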
3697+
3698+/*
3699+ * Called from pfm_ctxsw(). Task is guaranteed to be current.
3700+ * set cannot be NULL. Context is locked. Interrupts are masked.
3701+ * Caller has already restored all PMD and PMC registers.
3702+ *
3703+ * must reactivate monitoring
3704+ */
3705+void pfm_arch_ctxswin_thread(struct task_struct *task, struct pfm_context *ctx)
3706+{
3707+ struct pfm_arch_context *ctx_arch;
3708+
3709+ ctx_arch = pfm_ctx_arch(ctx);
3710+
3711+ /*
3712+ * when monitoring is not explicitly started
3713+ * then psr_up = 0, in which case we do not
3714+ * need to restore
3715+ */
3716+ if (likely(ctx_arch->ctx_saved_psr_up)) {
3717+ pfm_set_psr_up();
3718+ ia64_srlz_d();
3719+ }
3720+}
3721+
3722+int pfm_arch_reserve_session(struct pfm_context *ctx, u32 cpu)
3723+{
3724+ struct pfm_arch_context *ctx_arch;
3725+ int is_system;
3726+ int ret = 0;
3727+
3728+ ctx_arch = pfm_ctx_arch(ctx);
3729+ is_system = ctx->flags.system;
3730+
3731+ spin_lock(&pfm_arch_sessions_lock);
3732+
3733+ if (is_system && ctx_arch->flags.use_dbr) {
3734+ PFM_DBG("syswide context uses dbregs");
3735+
3736+ if (pfm_arch_sessions.pfs_ptrace_use_dbr) {
3737+ PFM_DBG("cannot reserve syswide context: "
3738+ "dbregs in use by ptrace");
3739+ ret = -EBUSY;
3740+ } else {
3741+ pfm_arch_sessions.pfs_sys_use_dbr++;
3742+ }
3743+ }
3744+ spin_unlock(&pfm_arch_sessions_lock);
3745+
3746+ return ret;
3747+}
3748+
3749+void pfm_arch_release_session(struct pfm_context *ctx, u32 cpu)
3750+{
3751+ struct pfm_arch_context *ctx_arch;
3752+ int is_system;
3753+
3754+ ctx_arch = pfm_ctx_arch(ctx);
3755+ is_system = ctx->flags.system;
3756+
3757+ spin_lock(&pfm_arch_sessions_lock);
3758+
3759+ if (is_system && ctx_arch->flags.use_dbr)
3760+ pfm_arch_sessions.pfs_sys_use_dbr--;
3761+ spin_unlock(&pfm_arch_sessions_lock);
3762+}
3763+
3764+/*
3765+ * function called from pfm_load_context_*(). Task is not guaranteed to be
3766+ * current task. If not then other task is guaranteed stopped and off any CPU.
3767+ * context is locked and interrupts are masked.
3768+ *
3769+ * On PFM_LOAD_CONTEXT, the interface guarantees monitoring is stopped.
3770+ *
3771+ * For system-wide task is NULL
3772+ */
3773+int pfm_arch_load_context(struct pfm_context *ctx)
3774+{
3775+ struct pfm_arch_context *ctx_arch;
3776+ struct pt_regs *regs;
3777+ int ret = 0;
3778+
3779+ ctx_arch = pfm_ctx_arch(ctx);
3780+
3781+ /*
3782+ * cannot load a context which is using range restrictions,
3783+ * into a thread that is being debugged.
3784+ *
3785+ * if one set out of several is using the debug registers, then
3786+ * we assume the context as a whole is using them.
3787+ */
3788+ if (ctx_arch->flags.use_dbr) {
3789+ if (ctx->flags.system) {
3790+ spin_lock(&pfm_arch_sessions_lock);
3791+
3792+ if (pfm_arch_sessions.pfs_ptrace_use_dbr) {
3793+ PFM_DBG("cannot reserve syswide context: "
3794+ "dbregs in use by ptrace");
3795+ ret = -EBUSY;
3796+ } else {
3797+ pfm_arch_sessions.pfs_sys_use_dbr++;
3798+ PFM_DBG("pfs_sys_use_dbr=%u",
3799+ pfm_arch_sessions.pfs_sys_use_dbr);
3800+ }
3801+ spin_unlock(&pfm_arch_sessions_lock);
3802+
3803+ } else if (ctx->task->thread.flags & IA64_THREAD_DBG_VALID) {
3804+ PFM_DBG("load_pid [%d] thread is debugged, cannot "
3805+ "use range restrictions", ctx->task->pid);
3806+ ret = -EBUSY;
3807+ }
3808+ if (ret)
3809+ return ret;
3810+ }
3811+
3812+ /*
3813+ * We need to intervene on context switch to toggle the
3814+ * psr.pp bit in system-wide. As such, we set the TIF
3815+ * flag so that pfm_arch_ctxswout_sys() and the
3816+ * pfm_arch_ctxswin_sys() functions get called
3817+ * from pfm_ctxsw_sys();
3818+ */
3819+ if (ctx->flags.system) {
3820+ set_thread_flag(TIF_PERFMON_CTXSW);
3821+ PFM_DBG("[%d] set TIF", current->pid);
3822+ return 0;
3823+ }
3824+
3825+ regs = task_pt_regs(ctx->task);
3826+
3827+ /*
3828+ * self-monitoring systematically allows user level control
3829+ */
3830+ if (ctx->task != current) {
3831+ /*
3832+ * when not current, task is stopped, so this is safe
3833+ */
3834+ ctx_arch->ctx_saved_psr_up = 0;
3835+ ia64_psr(regs)->up = ia64_psr(regs)->pp = 0;
3836+ } else
3837+ ctx_arch->flags.insecure = 1;
3838+
3839+ /*
3840+ * allow user level control (start/stop/read pmd) if:
3841+ * - self-monitoring
3842+ * - requested at context creation (PFM_IA64_FL_INSECURE)
3843+ *
3844+ * There is no security hole with PFM_IA64_FL_INSECURE because
3845+ * when not self-monitored, the caller must have permission to
3846+ * attach to the task.
3847+ */
3848+ if (ctx_arch->flags.insecure) {
3849+ ia64_psr(regs)->sp = 0;
3850+ PFM_DBG("clearing psr.sp for [%d]", ctx->task->pid);
3851+ }
3852+ return 0;
3853+}
3854+
3855+int pfm_arch_setfl_sane(struct pfm_context *ctx, u32 flags)
3856+{
3857+#define PFM_SETFL_BOTH_SWITCH (PFM_SETFL_OVFL_SWITCH|PFM_SETFL_TIME_SWITCH)
3858+#define PFM_ITA_SETFL_BOTH_INTR (PFM_ITA_SETFL_INTR_ONLY|\
3859+ PFM_ITA_SETFL_EXCL_INTR)
3860+
3861+/* exclude return value field */
3862+#define PFM_SETFL_ALL_MASK (PFM_ITA_SETFL_BOTH_INTR \
3863+ | PFM_SETFL_BOTH_SWITCH \
3864+ | PFM_ITA_SETFL_IDLE_EXCL)
3865+
3866+ if ((flags & ~PFM_SETFL_ALL_MASK)) {
3867+ PFM_DBG("invalid flags=0x%x", flags);
3868+ return -EINVAL;
3869+ }
3870+
3871+ if ((flags & PFM_ITA_SETFL_BOTH_INTR) == PFM_ITA_SETFL_BOTH_INTR) {
3872+ PFM_DBG("both excl intr and ontr only are set");
3873+ return -EINVAL;
3874+ }
3875+
3876+ if ((flags & PFM_ITA_SETFL_IDLE_EXCL) && !ctx->flags.system) {
3877+ PFM_DBG("idle exclude flag only for system-wide context");
3878+ return -EINVAL;
3879+ }
3880+ return 0;
3881+}
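A few representative calls make the checks above concrete (results follow directly from the tests in the function; ctx is assumed to be a per-thread context unless noted):

/* illustrative only: exercises the flag checks above */
static void setfl_examples(struct pfm_context *ctx)
{
	/* valid on any context: overflow-driven set switching */
	pfm_arch_setfl_sane(ctx, PFM_SETFL_OVFL_SWITCH);		/* -> 0 */

	/* invalid: "intr only" and "exclude intr" are mutually exclusive */
	pfm_arch_setfl_sane(ctx, PFM_ITA_SETFL_INTR_ONLY |
				 PFM_ITA_SETFL_EXCL_INTR);		/* -> -EINVAL */

	/* invalid unless ctx->flags.system is set */
	pfm_arch_setfl_sane(ctx, PFM_ITA_SETFL_IDLE_EXCL);		/* -> -EINVAL */
}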
3882+
3883+/*
3884+ * function called from pfm_unload_context_*(). Context is locked.
3885+ * interrupts are masked. task is not guaranteed to be current task.
3886+ * Access to PMU is not guaranteed.
3887+ *
3888+ * function must do whatever arch-specific action is required on unload
3889+ * of a context.
3890+ *
3891+ * called for both system-wide and per-thread. task is NULL for system-wide
3892+ */
3893+void pfm_arch_unload_context(struct pfm_context *ctx)
3894+{
3895+ struct pfm_arch_context *ctx_arch;
3896+ struct pt_regs *regs;
3897+
3898+ ctx_arch = pfm_ctx_arch(ctx);
3899+
3900+ if (ctx->flags.system) {
3901+ /*
3902+ * disable context switch hook
3903+ */
3904+ clear_thread_flag(TIF_PERFMON_CTXSW);
3905+
3906+ if (ctx_arch->flags.use_dbr) {
3907+ spin_lock(&pfm_arch_sessions_lock);
3908+ pfm_arch_sessions.pfs_sys_use_dbr--;
3909+ PFM_DBG("sys_use_dbr=%u", pfm_arch_sessions.pfs_sys_use_dbr);
3910+ spin_unlock(&pfm_arch_sessions_lock);
3911+ }
3912+ } else {
3913+ regs = task_pt_regs(ctx->task);
3914+
3915+ /*
3916+ * cancel user level control for per-task context
3917+ */
3918+ ia64_psr(regs)->sp = 1;
3919+ PFM_DBG("setting psr.sp for [%d]", ctx->task->pid);
3920+ }
3921+}
3922+
3923+/*
3924+ * mask monitoring by setting the privilege level to 0
3925+ * we cannot use psr.pp/psr.up for this; it is controlled by
3926+ * the user
3927+ */
3928+void pfm_arch_mask_monitoring(struct pfm_context *ctx, struct pfm_event_set *set)
3929+{
3930+ struct pfm_arch_pmu_info *arch_info;
3931+ unsigned long mask;
3932+ unsigned int i;
3933+
3934+ arch_info = pfm_pmu_info();
3935+ /*
3936+ * as an optimization we look at the first 64 PMC
3937+ * registers only starting at PMC4.
3938+ */
3939+ mask = arch_info->mask_pmcs[0] >> PFM_ITA_FCNTR;
3940+ for (i = PFM_ITA_FCNTR; mask; i++, mask >>= 1) {
3941+ if (likely(mask & 0x1))
3942+ ia64_set_pmc(i, set->pmcs[i] & ~0xfUL);
3943+ }
3944+ /*
3945+ * make changes visible
3946+ */
3947+ ia64_srlz_d();
3948+}
3949+
3950+/*
3951+ * function called from pfm_switch_sets(), pfm_context_load_thread(),
3952+ * pfm_context_load_sys() and pfm_ctxsw().
3953+ * context is locked. Interrupts are masked. set cannot be NULL.
3954+ * Access to the PMU is guaranteed.
3955+ *
3956+ * function must restore all PMD registers from set.
3957+ */
3958+void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set)
3959+{
3960+ struct pfm_arch_context *ctx_arch;
3961+ unsigned long *mask;
3962+ u16 i, num;
3963+
3964+ ctx_arch = pfm_ctx_arch(ctx);
3965+
3966+ if (ctx_arch->flags.insecure) {
3967+ num = ctx->regs.num_rw_pmd;
3968+ mask = ctx->regs.rw_pmds;
3969+ } else {
3970+ num = set->nused_pmds;
3971+ mask = set->used_pmds;
3972+ }
3973+ /*
3974+ * must restore all implemented read-write PMDS to avoid leaking
3975+ * information especially when PFM_IA64_FL_INSECURE is set.
3976+ *
3977+ * XXX: should check PFM_IA64_FL_INSECURE==0 and use used_pmd instead
3978+ */
3979+ for (i = 0; num; i++) {
3980+ if (likely(test_bit(i, mask))) {
3981+ pfm_arch_write_pmd(ctx, i, set->pmds[i].value);
3982+ num--;
3983+ }
3984+ }
3985+ ia64_srlz_d();
3986+}
3987+
3988+/*
3989+ * function called from pfm_switch_sets(), pfm_context_load_thread(),
3990+ * pfm_context_load_sys() and pfm_ctxsw().
3991+ * context is locked. Interrupts are masked. set cannot be NULL.
3992+ * Access to the PMU is guaranteed.
3993+ *
3994+ * function must restore all PMC registers from set if needed
3995+ */
3996+void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set)
3997+{
3998+ struct pfm_arch_pmu_info *arch_info;
3999+ u64 mask2 = 0, val, plm;
4000+ unsigned long impl_mask, mask_pmcs;
4001+ unsigned int i;
4002+
4003+ arch_info = pfm_pmu_info();
4004+ /*
4005+ * as an optimization we only look at the first 64
4006+ * PMC registers. In fact, we should never scan the
4007+ * entire impl_pmcs because ibr/dbr are implemented
4008+ * separately.
4009+ *
4010+ * always skip PMC0-PMC3. PMC0 taken care of when saving
4011+ * state. PMC1-PMC3 not used until we get counters in
4012+ * the 60 and above index range.
4013+ */
4014+ impl_mask = ctx->regs.pmcs[0] >> PFM_ITA_FCNTR;
4015+ mask_pmcs = arch_info->mask_pmcs[0] >> PFM_ITA_FCNTR;
4016+ plm = ctx->state == PFM_CTX_MASKED ? ~0xf : ~0x0;
4017+
4018+ for (i = PFM_ITA_FCNTR;
4019+ impl_mask;
4020+ i++, impl_mask >>= 1, mask_pmcs >>= 1) {
4021+ if (likely(impl_mask & 0x1)) {
4022+ mask2 = mask_pmcs & 0x1 ? plm : ~0;
4023+ val = set->pmcs[i] & mask2;
4024+ ia64_set_pmc(i, val);
4025+ PFM_DBG_ovfl("pmc%u=0x%lx", i, val);
4026+ }
4027+ }
4028+ /*
4029+ * restore DBR/IBR
4030+ */
4031+ if (set->priv_flags & PFM_ITA_SETFL_USE_DBR) {
4032+ pfm_restore_ibrs(set->pmcs+256, 8);
4033+ pfm_restore_dbrs(set->pmcs+264, 8);
4034+ }
4035+ ia64_srlz_d();
4036+}
4037+
4038+void pfm_arch_unmask_monitoring(struct pfm_context *ctx, struct pfm_event_set *set)
4039+{
4040+ u64 psr;
4041+ int is_system;
4042+
4043+ is_system = ctx->flags.system;
4044+
4045+ psr = ia64_getreg(_IA64_REG_PSR);
4046+
4047+ /*
4048+ * monitoring is masked via the PMC.plm
4049+ *
4050+ * As we restore their value, we do not want each counter to
4051+ * restart right away. We stop monitoring using the PSR,
4052+ * restore the PMC (and PMD) and then re-establish the psr
4053+ * as it was. Note that there can be no pending overflow at
4054+ * this point, because monitoring is still MASKED.
4055+ *
4056+ * Because interrupts are masked we can avoid changing
4057+ * DCR.pp.
4058+ */
4059+ if (is_system)
4060+ pfm_clear_psr_pp();
4061+ else
4062+ pfm_clear_psr_up();
4063+
4064+ ia64_srlz_d();
4065+
4066+ pfm_arch_restore_pmcs(ctx, set);
4067+
4068+ /*
4069+ * restore psr
4070+ *
4071+ * monitoring may start right now but interrupts
4072+ * are still masked
4073+ */
4074+ pfm_set_psr_l(psr);
4075+ ia64_srlz_d();
4076+}
4077+
4078+/*
4079+ * Called from pfm_stop()
4080+ *
4081+ * For per-thread:
4082+ * task is not necessarily current. If not current task, then
4083+ * task is guaranteed stopped and off any cpu. Access to PMU
4084+ * is not guaranteed. Interrupts are masked. Context is locked.
4085+ * Set is the active set.
4086+ *
4087+ * must disable active monitoring. ctx cannot be NULL
4088+ */
4089+void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx)
4090+{
4091+ struct pfm_arch_context *ctx_arch;
4092+ struct pt_regs *regs;
4093+ u64 dcr, psr;
4094+
4095+ ctx_arch = pfm_ctx_arch(ctx);
4096+ regs = task_pt_regs(task);
4097+
4098+ if (!ctx->flags.system) {
4099+ /*
4100+ * in ZOMBIE state we always have task == current due to
4101+ * pfm_exit_thread()
4102+ */
4103+ ia64_psr(regs)->up = 0;
4104+ ctx_arch->ctx_saved_psr_up = 0;
4105+
4106+ /*
4107+ * in case of ZOMBIE state, there is no unload to clear
4108+ * insecure monitoring, so we do it in stop instead.
4109+ */
4110+ if (ctx->state == PFM_CTX_ZOMBIE)
4111+ ia64_psr(regs)->sp = 1;
4112+
4113+ if (task == current) {
4114+ pfm_clear_psr_up();
4115+ ia64_srlz_d();
4116+ }
4117+ } else if (ctx->flags.started) { /* do not stop twice */
4118+ dcr = ia64_getreg(_IA64_REG_CR_DCR);
4119+ psr = ia64_getreg(_IA64_REG_PSR);
4120+
4121+ ia64_psr(regs)->pp = 0;
4122+ ia64_setreg(_IA64_REG_CR_DCR, dcr & ~IA64_DCR_PP);
4123+ pfm_clear_psr_pp();
4124+ ia64_srlz_d();
4125+
4126+ if (ctx->active_set->flags & PFM_ITA_SETFL_IDLE_EXCL) {
4127+ PFM_DBG("disabling idle exclude");
4128+ __get_cpu_var(pfm_syst_info) &= ~PFM_ITA_CPUINFO_IDLE_EXCL;
4129+ }
4130+ }
4131+}
4132+
4133+/*
4134+ * called from pfm_start()
4135+ *
4136+ * Interrupts are masked. Context is locked. Set is the active set.
4137+ *
4138+ * For per-thread:
4139+ * Task is not necessarily current. If not current task, then task
4140+ * is guaranteed stopped and off any cpu. No access to PMU if task
4141+ * is not current.
4142+ *
4143+ * For system-wide:
4144+ * task is always current
4145+ *
4146+ * must enable active monitoring.
4147+ */
4148+void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx)
4149+{
4150+ struct pfm_arch_context *ctx_arch;
4151+ struct pt_regs *regs;
4152+ u64 dcr, dcr_pp, psr_pp;
4153+ u32 flags;
4154+
4155+ ctx_arch = pfm_ctx_arch(ctx);
4156+ regs = task_pt_regs(task);
4157+ flags = ctx->active_set->flags;
4158+
4159+ /*
4160+ * per-thread mode
4161+ */
4162+ if (!ctx->flags.system) {
4163+
4164+ ia64_psr(regs)->up = 1;
4165+
4166+ if (task == current) {
4167+ pfm_set_psr_up();
4168+ ia64_srlz_d();
4169+ } else {
4170+ /*
4171+ * activate monitoring at next ctxswin
4172+ */
4173+ ctx_arch->ctx_saved_psr_up = IA64_PSR_UP;
4174+ }
4175+ return;
4176+ }
4177+
4178+ /*
4179+ * system-wide mode
4180+ */
4181+ dcr = ia64_getreg(_IA64_REG_CR_DCR);
4182+ if (flags & PFM_ITA_SETFL_INTR_ONLY) {
4183+ dcr_pp = 1;
4184+ psr_pp = 0;
4185+ } else if (flags & PFM_ITA_SETFL_EXCL_INTR) {
4186+ dcr_pp = 0;
4187+ psr_pp = 1;
4188+ } else {
4189+ dcr_pp = psr_pp = 1;
4190+ }
4191+ PFM_DBG("dcr_pp=%lu psr_pp=%lu", dcr_pp, psr_pp);
4192+
4193+ /*
4194+ * update dcr_pp and psr_pp
4195+ */
4196+ if (dcr_pp)
4197+ ia64_setreg(_IA64_REG_CR_DCR, dcr | IA64_DCR_PP);
4198+ else
4199+ ia64_setreg(_IA64_REG_CR_DCR, dcr & ~IA64_DCR_PP);
4200+
4201+ if (psr_pp) {
4202+ pfm_set_psr_pp();
4203+ ia64_psr(regs)->pp = 1;
4204+ } else {
4205+ pfm_clear_psr_pp();
4206+ ia64_psr(regs)->pp = 0;
4207+ }
4208+ ia64_srlz_d();
4209+
4210+ if (ctx->active_set->flags & PFM_ITA_SETFL_IDLE_EXCL) {
4211+ PFM_DBG("enable idle exclude");
4212+ __get_cpu_var(pfm_syst_info) |= PFM_ITA_CPUINFO_IDLE_EXCL;
4213+ }
4214+}
4215+
4216+/*
4217+ * Only call this function when a process is trying to
4218+ * write the debug registers (reading is always allowed)
4219+ * called from arch/ia64/kernel/ptrace.c:access_uarea()
4220+ */
4221+int __pfm_use_dbregs(struct task_struct *task)
4222+{
4223+ struct pfm_arch_context *ctx_arch;
4224+ struct pfm_context *ctx;
4225+ unsigned long flags;
4226+ int ret = 0;
4227+
4228+ PFM_DBG("called for [%d]", task->pid);
4229+
4230+ ctx = task->pfm_context;
4231+
4232+ /*
4233+ * do it only once
4234+ */
4235+ if (task->thread.flags & IA64_THREAD_DBG_VALID) {
4236+ PFM_DBG("IA64_THREAD_DBG_VALID already set");
4237+ return 0;
4238+ }
4239+ if (ctx) {
4240+ spin_lock_irqsave(&ctx->lock, flags);
4241+ ctx_arch = pfm_ctx_arch(ctx);
4242+
4243+ if (ctx_arch->flags.use_dbr == 1) {
4244+ PFM_DBG("PMU using dbregs already, no ptrace access");
4245+ ret = -1;
4246+ }
4247+ spin_unlock_irqrestore(&ctx->lock, flags);
4248+ if (ret)
4249+ return ret;
4250+ }
4251+
4252+ spin_lock(&pfm_arch_sessions_lock);
4253+
4254+ /*
4255+ * We cannot allow setting breakpoints when system wide monitoring
4256+ * sessions are using the debug registers.
4257+ */
4258+ if (!pfm_arch_sessions.pfs_sys_use_dbr)
4259+ pfm_arch_sessions.pfs_ptrace_use_dbr++;
4260+ else
4261+ ret = -1;
4262+
4263+ PFM_DBG("ptrace_use_dbr=%u sys_use_dbr=%u by [%d] ret = %d",
4264+ pfm_arch_sessions.pfs_ptrace_use_dbr,
4265+ pfm_arch_sessions.pfs_sys_use_dbr,
4266+ task->pid, ret);
4267+
4268+ spin_unlock(&pfm_arch_sessions_lock);
4269+ if (ret)
4270+ return ret;
4271+#ifndef CONFIG_SMP
4272+ /*
4273+ * in UP, we need to check whether the current
4274+ * owner of the PMU is not using the debug registers
4275+ * for monitoring. Because we are using a lazy
4276+ * save on ctxswout, we must force a save in this
4277+ * case because the debug registers are being
4278+ * modified by another task. We save the current
4279+ * PMD registers, and clear ownership. In ctxswin,
4280+ * full state will be reloaded.
4281+ *
4282+ * Note: we overwrite task.
4283+ */
4284+ task = __get_cpu_var(pmu_owner);
4285+ ctx = __get_cpu_var(pmu_ctx);
4286+
4287+ if (task == NULL)
4288+ return 0;
4289+
4290+ ctx_arch = pfm_ctx_arch(ctx);
4291+
4292+ if (ctx_arch->flags.use_dbr)
4293+ pfm_save_pmds_release(ctx);
4294+#endif
4295+ return 0;
4296+}
4297+
4298+/*
4299+ * This function is called for every task that exits with the
4300+ * IA64_THREAD_DBG_VALID set. This indicates a task which was
4301+ * able to use the debug registers for debugging purposes via
4302+ * ptrace(). Therefore we know it was not using them for
4303+ * perfmormance monitoring, so we only decrement the number
4304+ * of "ptraced" debug register users to keep the count up to date
4305+ */
4306+int __pfm_release_dbregs(struct task_struct *task)
4307+{
4308+ int ret;
4309+
4310+ spin_lock(&pfm_arch_sessions_lock);
4311+
4312+ if (pfm_arch_sessions.pfs_ptrace_use_dbr == 0) {
4313+ PFM_ERR("invalid release for [%d] ptrace_use_dbr=0", task->pid);
4314+ ret = -1;
4315+ } else {
4316+ pfm_arch_sessions.pfs_ptrace_use_dbr--;
4317+ ret = 0;
4318+ }
4319+ spin_unlock(&pfm_arch_sessions_lock);
4320+
4321+ return ret;
4322+}
4323+
4324+int pfm_ia64_mark_dbregs_used(struct pfm_context *ctx,
4325+ struct pfm_event_set *set)
4326+{
4327+ struct pfm_arch_context *ctx_arch;
4328+ struct task_struct *task;
4329+ struct thread_struct *thread;
4330+ int ret = 0, state;
4331+ int i, can_access_pmu = 0;
4332+ int is_loaded, is_system;
4333+
4334+ ctx_arch = pfm_ctx_arch(ctx);
4335+ state = ctx->state;
4336+ task = ctx->task;
4337+ is_loaded = state == PFM_CTX_LOADED || state == PFM_CTX_MASKED;
4338+ is_system = ctx->flags.system;
4339+ can_access_pmu = __get_cpu_var(pmu_owner) == task || is_system;
4340+
4341+ if (is_loaded == 0)
4342+ goto done;
4343+
4344+ if (is_system == 0) {
4345+ thread = &(task->thread);
4346+
4347+ /*
4348+ * cannot use debug registers for monitoring if they are
4349+ * already used for debugging
4350+ */
4351+ if (thread->flags & IA64_THREAD_DBG_VALID) {
4352+ PFM_DBG("debug registers already in use for [%d]",
4353+ task->pid);
4354+ return -EBUSY;
4355+ }
4356+ }
4357+
4358+ /*
4359+ * check for debug registers in system wide mode
4360+ */
4361+ spin_lock(&pfm_arch_sessions_lock);
4362+
4363+ if (is_system) {
4364+ if (pfm_arch_sessions.pfs_ptrace_use_dbr)
4365+ ret = -EBUSY;
4366+ else
4367+ pfm_arch_sessions.pfs_sys_use_dbr++;
4368+ }
4369+
4370+ spin_unlock(&pfm_arch_sessions_lock);
4371+
4372+ if (ret != 0)
4373+ return ret;
4374+
4375+ /*
4376+ * clear hardware registers to make sure we don't
4377+ * pick up stale state.
4378+ */
4379+ if (can_access_pmu) {
4380+ PFM_DBG("clearing ibrs, dbrs");
4381+ for (i = 0; i < 8; i++) {
4382+ ia64_set_ibr(i, 0);
4383+ ia64_dv_serialize_instruction();
4384+ }
4385+ ia64_srlz_i();
4386+ for (i = 0; i < 8; i++) {
4387+ ia64_set_dbr(i, 0);
4388+ ia64_dv_serialize_data();
4389+ }
4390+ ia64_srlz_d();
4391+ }
4392+done:
4393+ /*
4394+ * debug registers are now in use
4395+ */
4396+ ctx_arch->flags.use_dbr = 1;
4397+ set->priv_flags |= PFM_ITA_SETFL_USE_DBR;
4398+ PFM_DBG("set%u use_dbr=1", set->id);
4399+ return 0;
4400+}
4401+EXPORT_SYMBOL(pfm_ia64_mark_dbregs_used);
4402+
4403+char *pfm_arch_get_pmu_module_name(void)
4404+{
4405+ switch (local_cpu_data->family) {
4406+ case 0x07:
4407+ return "perfmon_itanium";
4408+ case 0x1f:
4409+ return "perfmon_mckinley";
4410+ case 0x20:
4411+ return "perfmon_montecito";
4412+ default:
4413+ return "perfmon_generic";
4414+ }
4415+ return NULL;
4416+}
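The string returned here is meant to be fed to the module loader so the matching PMU description module gets pulled in automatically. A hedged sketch of such a call site (request_module() is the standard kernel helper; the surrounding code is an assumption, not part of this file):

/* illustrative call site on the generic side (assumed) */
char *name = pfm_arch_get_pmu_module_name();
if (name)
	request_module("%s", name);	/* e.g. loads perfmon_montecito */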
4417+
4418+/*
4419+ * global arch-specific initialization, called only once
4420+ */
4421+int __init pfm_arch_init(void)
4422+{
4423+ int ret;
4424+
4425+ spin_lock_init(&pfm_arch_sessions_lock);
4426+
4427+#ifdef CONFIG_IA64_PERFMON_COMPAT
4428+ ret = pfm_ia64_compat_init();
4429+ if (ret)
4430+ return ret;
4431+#endif
4432+ register_percpu_irq(IA64_PERFMON_VECTOR, &perfmon_irqaction);
4433+
4434+
4435+ return 0;
4436+}
4437--- /dev/null
4438+++ b/arch/ia64/perfmon/perfmon_compat.c
4439@@ -0,0 +1,1210 @@
4440+/*
4441+ * This file implements the IA-64 specific
4442+ * support for the perfmon2 interface
4443+ *
4444+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
4445+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
4446+ *
4447+ * This program is free software; you can redistribute it and/or
4448+ * modify it under the terms of version 2 of the GNU General Public
4449+ * License as published by the Free Software Foundation.
4450+ *
4451+ * This program is distributed in the hope that it will be useful,
4452+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
4453+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
4454+ * General Public License for more details.
4455+ *
4456+ * You should have received a copy of the GNU General Public License
4457+ * along with this program; if not, write to the Free Software
4458+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
4459+ * 02111-1307 USA
4460+ */
4461+#include <linux/interrupt.h>
4462+#include <linux/module.h>
4463+#include <linux/file.h>
4464+#include <linux/fdtable.h>
4465+#include <linux/seq_file.h>
4466+#include <linux/vmalloc.h>
4467+#include <linux/proc_fs.h>
4468+#include <linux/perfmon_kern.h>
4469+#include <linux/uaccess.h>
4470+
4471+asmlinkage long sys_pfm_stop(int fd);
4472+asmlinkage long sys_pfm_start(int fd, struct pfarg_start __user *st);
4473+asmlinkage long sys_pfm_unload_context(int fd);
4474+asmlinkage long sys_pfm_restart(int fd);
4475+asmlinkage long sys_pfm_load_context(int fd, struct pfarg_load __user *ld);
4476+
4477+ssize_t pfm_sysfs_res_show(char *buf, size_t sz, int what);
4478+
4479+extern ssize_t __pfm_read(struct pfm_context *ctx,
4480+ union pfarg_msg *msg_buf,
4481+ int non_block);
4482+/*
4483+ * function providing some help for backward compatibility with old IA-64
4484+ * applications. In the old model, certain attributes of a counter were
4485+ * passed via the PMC; now they are passed via the PMD.
4486+ */
4487+static int pfm_compat_update_pmd(struct pfm_context *ctx, u16 set_id, u16 cnum,
4488+ u32 rflags,
4489+ unsigned long *smpl_pmds,
4490+ unsigned long *reset_pmds,
4491+ u64 eventid)
4492+{
4493+ struct pfm_event_set *set;
4494+ int is_counting;
4495+ unsigned long *impl_pmds;
4496+ u32 flags = 0;
4497+ u16 max_pmd;
4498+
4499+ impl_pmds = ctx->regs.pmds;
4500+ max_pmd = ctx->regs.max_pmd;
4501+
4502+ /*
4503+ * given that we do not maintain PMC ->PMD dependencies
4504+ * we cannot figure out what to do in case PMCxx != PMDxx
4505+ */
4506+ if (cnum > max_pmd)
4507+ return 0;
4508+
4509+ /*
4510+ * assumes PMCxx controls PMDxx which is always true for counters
4511+ * on Itanium PMUs.
4512+ */
4513+ is_counting = pfm_pmu_conf->pmd_desc[cnum].type & PFM_REG_C64;
4514+ set = pfm_find_set(ctx, set_id, 0);
4515+
4516+ /*
4517+ * for v2.0, we only allowed counting PMD to generate
4518+ * user-level notifications. Same thing with randomization.
4519+ */
4520+ if (is_counting) {
4521+ if (rflags & PFM_REGFL_OVFL_NOTIFY)
4522+ flags |= PFM_REGFL_OVFL_NOTIFY;
4523+ if (rflags & PFM_REGFL_RANDOM)
4524+ flags |= PFM_REGFL_RANDOM;
4525+ /*
4526+ * verify validity of smpl_pmds
4527+ */
4528+ if (unlikely(bitmap_subset(smpl_pmds,
4529+ impl_pmds, max_pmd) == 0)) {
4530+ PFM_DBG("invalid smpl_pmds=0x%llx for pmd%u",
4531+ (unsigned long long)smpl_pmds[0], cnum);
4532+ return -EINVAL;
4533+ }
4534+ /*
4535+ * verify validity of reset_pmds
4536+ */
4537+ if (unlikely(bitmap_subset(reset_pmds,
4538+ impl_pmds, max_pmd) == 0)) {
4539+ PFM_DBG("invalid reset_pmds=0x%lx for pmd%u",
4540+ reset_pmds[0], cnum);
4541+ return -EINVAL;
4542+ }
4543+ /*
4544+ * ensures that a PFM_READ_PMDS succeeds with a
4545+ * corresponding PFM_WRITE_PMDS
4546+ */
4547+ __set_bit(cnum, set->used_pmds);
4548+
4549+ } else if (rflags & (PFM_REGFL_OVFL_NOTIFY|PFM_REGFL_RANDOM)) {
4550+ PFM_DBG("cannot set ovfl_notify or random on pmd%u", cnum);
4551+ return -EINVAL;
4552+ }
4553+
4554+ set->pmds[cnum].flags = flags;
4555+
4556+ if (is_counting) {
4557+ bitmap_copy(set->pmds[cnum].reset_pmds,
4558+ reset_pmds,
4559+ max_pmd);
4560+
4561+ bitmap_copy(set->pmds[cnum].smpl_pmds,
4562+ smpl_pmds,
4563+ max_pmd);
4564+
4565+ set->pmds[cnum].eventid = eventid;
4566+
4567+ /*
4568+ * update ovfl_notify
4569+ */
4570+ if (rflags & PFM_REGFL_OVFL_NOTIFY)
4571+ __set_bit(cnum, set->ovfl_notify);
4572+ else
4573+ __clear_bit(cnum, set->ovfl_notify);
4574+
4575+ }
4576+ PFM_DBG("pmd%u flags=0x%x eventid=0x%lx r_pmds=0x%lx s_pmds=0x%lx",
4577+ cnum, flags,
4578+ eventid,
4579+ reset_pmds[0],
4580+ smpl_pmds[0]);
4581+
4582+ return 0;
4583+}
4584+
4585+
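For illustration, a hedged sketch of what this translation amounts to for a typical v2.0 request (not part of the patch; the event value and eventid below are made up, the struct and flag names are the ones used above):

	/*
	 * Illustrative only: a v2.0 application programs PMC4 as a counter
	 * with overflow notification by passing everything in one pfarg_reg.
	 */
	struct pfarg_reg old_req = {
		.reg_num          = 4,			/* PMC4 controls PMD4 */
		.reg_value        = 0x20,		/* example event selection */
		.reg_flags        = PFM_REGFL_OVFL_NOTIFY,
		.reg_smpl_eventid = 42,			/* arbitrary user cookie */
	};
	/*
	 * __pfm_write_pmcs_old() writes the PMC itself, then
	 * pfm_compat_update_pmd() moves the per-counter attributes onto
	 * PMD4 of the target event set, roughly:
	 *
	 *	set->pmds[4].flags   = PFM_REGFL_OVFL_NOTIFY;
	 *	set->pmds[4].eventid = 42;
	 *	__set_bit(4, set->ovfl_notify);
	 *	__set_bit(4, set->used_pmds);
	 */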
4586+int __pfm_write_ibrs_old(struct pfm_context *ctx, void *arg, int count)
4587+{
4588+ struct pfarg_dbreg *req = arg;
4589+ struct pfarg_pmc pmc;
4590+ int i, ret = 0;
4591+
4592+ memset(&pmc, 0, sizeof(pmc));
4593+
4594+ for (i = 0; i < count; i++, req++) {
4595+ pmc.reg_num = 256+req->dbreg_num;
4596+ pmc.reg_value = req->dbreg_value;
4597+ pmc.reg_flags = 0;
4598+ pmc.reg_set = req->dbreg_set;
4599+
4600+ ret = __pfm_write_pmcs(ctx, &pmc, 1);
4601+
4602+ req->dbreg_flags &= ~PFM_REG_RETFL_MASK;
4603+ req->dbreg_flags |= pmc.reg_flags;
4604+
4605+ if (ret)
4606+ return ret;
4607+ }
4608+ return 0;
4609+}
4610+
4611+static long pfm_write_ibrs_old(int fd, void __user *ureq, int count)
4612+{
4613+ struct pfm_context *ctx;
4614+ struct task_struct *task;
4615+ struct file *filp;
4616+ struct pfarg_dbreg *req = NULL;
4617+ void *fptr, *resume;
4618+ unsigned long flags;
4619+ size_t sz;
4620+ int ret, fput_needed;
4621+
4622+ if (count < 1 || count >= PFM_MAX_ARG_COUNT(req))
4623+ return -EINVAL;
4624+
4625+ sz = count*sizeof(*req);
4626+
4627+ filp = fget_light(fd, &fput_needed);
4628+ if (unlikely(filp == NULL)) {
4629+ PFM_DBG("invalid fd %d", fd);
4630+ return -EBADF;
4631+ }
4632+
4633+ ctx = filp->private_data;
4634+ ret = -EBADF;
4635+
4636+ if (unlikely(!ctx || filp->f_op != &pfm_file_ops)) {
4637+ PFM_DBG("fd %d not related to perfmon", fd);
4638+ goto error;
4639+ }
4640+
4641+ ret = pfm_get_args(ureq, sz, 0, NULL, (void **)&req, &fptr);
4642+ if (ret)
4643+ goto error;
4644+
4645+ spin_lock_irqsave(&ctx->lock, flags);
4646+
4647+ task = ctx->task;
4648+
4649+ ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags, &resume);
4650+ if (ret == 0)
4651+ ret = __pfm_write_ibrs_old(ctx, req, count);
4652+
4653+ spin_unlock_irqrestore(&ctx->lock, flags);
4654+
4655+ if (resume)
4656+ pfm_resume_task(task, resume);
4657+
4658+ if (copy_to_user(ureq, req, sz))
4659+ ret = -EFAULT;
4660+
4661+ kfree(fptr);
4662+error:
4663+ fput_light(filp, fput_needed);
4664+ return ret;
4665+}
4666+
4667+int __pfm_write_dbrs_old(struct pfm_context *ctx, void *arg, int count)
4668+{
4669+ struct pfarg_dbreg *req = arg;
4670+ struct pfarg_pmc pmc;
4671+ int i, ret = 0;
4672+
4673+ memset(&pmc, 0, sizeof(pmc));
4674+
4675+ for (i = 0; i < count; i++, req++) {
4676+ pmc.reg_num = 264+req->dbreg_num;
4677+ pmc.reg_value = req->dbreg_value;
4678+ pmc.reg_flags = 0;
4679+ pmc.reg_set = req->dbreg_set;
4680+
4681+ ret = __pfm_write_pmcs(ctx, &pmc, 1);
4682+
4683+ req->dbreg_flags &= ~PFM_REG_RETFL_MASK;
4684+ req->dbreg_flags |= pmc.reg_flags;
4685+ if (ret)
4686+ return ret;
4687+ }
4688+ return 0;
4689+}
4690+
4691+static long pfm_write_dbrs_old(int fd, void __user *ureq, int count)
4692+{
4693+ struct pfm_context *ctx;
4694+ struct task_struct *task;
4695+ struct file *filp;
4696+ struct pfarg_dbreg *req = NULL;
4697+ void *fptr, *resume;
4698+ unsigned long flags;
4699+ size_t sz;
4700+ int ret, fput_needed;
4701+
4702+ if (count < 1 || count >= PFM_MAX_ARG_COUNT(req))
4703+ return -EINVAL;
4704+
4705+ sz = count*sizeof(*req);
4706+
4707+ filp = fget_light(fd, &fput_needed);
4708+ if (unlikely(filp == NULL)) {
4709+ PFM_DBG("invalid fd %d", fd);
4710+ return -EBADF;
4711+ }
4712+
4713+ ctx = filp->private_data;
4714+ ret = -EBADF;
4715+
4716+ if (unlikely(!ctx || filp->f_op != &pfm_file_ops)) {
4717+ PFM_DBG("fd %d not related to perfmon", fd);
4718+ goto error;
4719+ }
4720+
4721+ ret = pfm_get_args(ureq, sz, 0, NULL, (void **)&req, &fptr);
4722+ if (ret)
4723+ goto error;
4724+
4725+ spin_lock_irqsave(&ctx->lock, flags);
4726+
4727+ task = ctx->task;
4728+
4729+ ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags, &resume);
4730+ if (ret == 0)
4731+ ret = __pfm_write_dbrs_old(ctx, req, count);
4732+
4733+ spin_unlock_irqrestore(&ctx->lock, flags);
4734+
4735+ if (resume)
4736+ pfm_resume_task(task, resume);
4737+
4738+ if (copy_to_user(ureq, req, sz))
4739+ ret = -EFAULT;
4740+
4741+ kfree(fptr);
4742+error:
4743+ fput_light(filp, fput_needed);
4744+ return ret;
4745+}
4746+
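The 256+ and 264+ offsets used above correspond to the virtual PMC slots reserved for the debug registers in the PMU description tables later in this patch (IBR0-7 at pmc256-pmc263, DBR0-7 at pmc264-pmc271). A tiny sketch purely to spell the mapping out; pfm_old_dbreg_to_pmc() is hypothetical and not defined anywhere in the patch:

	/* IBRy -> PMC256+y, DBRy -> PMC264+y (illustrative helper only) */
	static inline u16 pfm_old_dbreg_to_pmc(unsigned int dbreg_num, int is_dbr)
	{
		return (is_dbr ? 264 : 256) + dbreg_num;
	}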
4747+int __pfm_write_pmcs_old(struct pfm_context *ctx, struct pfarg_reg *req_old,
4748+ int count)
4749+{
4750+ struct pfarg_pmc req;
4751+ unsigned int i;
4752+ int ret, error_code;
4753+
4754+ memset(&req, 0, sizeof(req));
4755+
4756+ for (i = 0; i < count; i++, req_old++) {
4757+ req.reg_num = req_old->reg_num;
4758+ req.reg_set = req_old->reg_set;
4759+ req.reg_flags = 0;
4760+ req.reg_value = req_old->reg_value;
4761+
4762+ ret = __pfm_write_pmcs(ctx, (void *)&req, 1);
4763+ req_old->reg_flags &= ~PFM_REG_RETFL_MASK;
4764+ req_old->reg_flags |= req.reg_flags;
4765+
4766+ if (ret)
4767+ return ret;
4768+
4769+ ret = pfm_compat_update_pmd(ctx, req_old->reg_set,
4770+ req_old->reg_num,
4771+ (u32)req_old->reg_flags,
4772+ req_old->reg_smpl_pmds,
4773+ req_old->reg_reset_pmds,
4774+ req_old->reg_smpl_eventid);
4775+
4776+ error_code = ret ? PFM_REG_RETFL_EINVAL : 0;
4777+ req_old->reg_flags &= ~PFM_REG_RETFL_MASK;
4778+ req_old->reg_flags |= error_code;
4779+
4780+ if (ret)
4781+ return ret;
4782+ }
4783+ return 0;
4784+}
4785+
4786+static long pfm_write_pmcs_old(int fd, void __user *ureq, int count)
4787+{
4788+ struct pfm_context *ctx;
4789+ struct task_struct *task;
4790+ struct file *filp;
4791+ struct pfarg_reg *req = NULL;
4792+ void *fptr, *resume;
4793+ unsigned long flags;
4794+ size_t sz;
4795+ int ret, fput_needed;
4796+
4797+ if (count < 1 || count >= PFM_MAX_ARG_COUNT(req))
4798+ return -EINVAL;
4799+
4800+ sz = count*sizeof(*req);
4801+
4802+ filp = fget_light(fd, &fput_needed);
4803+ if (unlikely(filp == NULL)) {
4804+ PFM_DBG("invalid fd %d", fd);
4805+ return -EBADF;
4806+ }
4807+
4808+ ctx = filp->private_data;
4809+ ret = -EBADF;
4810+
4811+ if (unlikely(!ctx || filp->f_op != &pfm_file_ops)) {
4812+ PFM_DBG("fd %d not related to perfmon", fd);
4813+ goto error;
4814+ }
4815+
4816+ ret = pfm_get_args(ureq, sz, 0, NULL, (void **)&req, &fptr);
4817+ if (ret)
4818+ goto error;
4819+
4820+ spin_lock_irqsave(&ctx->lock, flags);
4821+
4822+ task = ctx->task;
4823+
4824+ ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags, &resume);
4825+ if (ret == 0)
4826+ ret = __pfm_write_pmcs_old(ctx, req, count);
4827+
4828+ spin_unlock_irqrestore(&ctx->lock, flags);
4829+
4830+ if (resume)
4831+ pfm_resume_task(task, resume);
4832+
4833+ if (copy_to_user(ureq, req, sz))
4834+ ret = -EFAULT;
4835+
4836+ kfree(fptr);
4837+
4838+error:
4839+ fput_light(filp, fput_needed);
4840+ return ret;
4841+}
4842+
4843+int __pfm_write_pmds_old(struct pfm_context *ctx, struct pfarg_reg *req_old,
4844+ int count)
4845+{
4846+ struct pfarg_pmd req;
4847+ int i, ret;
4848+
4849+ memset(&req, 0, sizeof(req));
4850+
4851+ for (i = 0; i < count; i++, req_old++) {
4852+ req.reg_num = req_old->reg_num;
4853+ req.reg_set = req_old->reg_set;
4854+ req.reg_value = req_old->reg_value;
4855+ /* flags passed with pmcs in v2.0 */
4856+
4857+ req.reg_long_reset = req_old->reg_long_reset;
4858+ req.reg_short_reset = req_old->reg_short_reset;
4859+ req.reg_random_mask = req_old->reg_random_mask;
4860+ /*
4861+ * reg_random_seed is ignored since v2.3
4862+ */
4863+
4864+ /*
4865+ * skip last_reset_val not used for writing
4866+ * skip smpl_pmds, reset_pmds, eventid, ovfl_swtch_cnt
4867+ * as set in pfm_write_pmcs_old.
4868+ *
4869+ * ovfl_switch_cnt ignored, not implemented in v2.0
4870+ */
4871+ ret = __pfm_write_pmds(ctx, (void *)&req, 1, 1);
4872+
4873+ req_old->reg_flags &= ~PFM_REG_RETFL_MASK;
4874+ req_old->reg_flags |= req.reg_flags;
4875+
4876+ if (ret)
4877+ return ret;
4878+ }
4879+ return 0;
4880+}
4881+
4882+static long pfm_write_pmds_old(int fd, void __user *ureq, int count)
4883+{
4884+ struct pfm_context *ctx;
4885+ struct task_struct *task;
4886+ struct file *filp;
4887+ struct pfarg_reg *req = NULL;
4888+ void *fptr, *resume;
4889+ unsigned long flags;
4890+ size_t sz;
4891+ int ret, fput_needed;
4892+
4893+ if (count < 1 || count >= PFM_MAX_ARG_COUNT(req))
4894+ return -EINVAL;
4895+
4896+ sz = count*sizeof(*req);
4897+
4898+ filp = fget_light(fd, &fput_needed);
4899+ if (unlikely(filp == NULL)) {
4900+ PFM_DBG("invalid fd %d", fd);
4901+ return -EBADF;
4902+ }
4903+
4904+ ctx = filp->private_data;
4905+ ret = -EBADF;
4906+
4907+ if (unlikely(!ctx || filp->f_op != &pfm_file_ops)) {
4908+ PFM_DBG("fd %d not related to perfmon", fd);
4909+ goto error;
4910+ }
4911+
4912+ ret = pfm_get_args(ureq, sz, 0, NULL, (void **)&req, &fptr);
4913+ if (ret)
4914+ goto error;
4915+
4916+ spin_lock_irqsave(&ctx->lock, flags);
4917+
4918+ task = ctx->task;
4919+
4920+ ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags, &resume);
4921+ if (ret == 0)
4922+ ret = __pfm_write_pmds_old(ctx, req, count);
4923+
4924+ spin_unlock_irqrestore(&ctx->lock, flags);
4925+
4926+ if (copy_to_user(ureq, req, sz))
4927+ ret = -EFAULT;
4928+
4929+ if (resume)
4930+ pfm_resume_task(task, resume);
4931+
4932+ kfree(fptr);
4933+error:
4934+ fput_light(filp, fput_needed);
4935+ return ret;
4936+}
4937+
4938+int __pfm_read_pmds_old(struct pfm_context *ctx, struct pfarg_reg *req_old,
4939+ int count)
4940+{
4941+ struct pfarg_pmd req;
4942+ int i, ret;
4943+
4944+ memset(&req, 0, sizeof(req));
4945+
4946+ for (i = 0; i < count; i++, req_old++) {
4947+ req.reg_num = req_old->reg_num;
4948+ req.reg_set = req_old->reg_set;
4949+
4950+ /* skip value not used for reading */
4951+ req.reg_flags = req_old->reg_flags;
4952+
4953+ /* skip short/long_reset not used for reading */
4954+ /* skip last_reset_val not used for reading */
4955+ /* skip ovfl_switch_cnt not used for reading */
4956+
4957+ ret = __pfm_read_pmds(ctx, (void *)&req, 1);
4958+
4959+ req_old->reg_flags &= ~PFM_REG_RETFL_MASK;
4960+ req_old->reg_flags |= req.reg_flags;
4961+ if (ret)
4962+ return ret;
4963+
4964+ /* update fields */
4965+ req_old->reg_value = req.reg_value;
4966+
4967+ req_old->reg_last_reset_val = req.reg_last_reset_val;
4968+ req_old->reg_ovfl_switch_cnt = req.reg_ovfl_switch_cnt;
4969+ }
4970+ return 0;
4971+}
4972+
4973+static long pfm_read_pmds_old(int fd, void __user *ureq, int count)
4974+{
4975+ struct pfm_context *ctx;
4976+ struct task_struct *task;
4977+ struct file *filp;
4978+ struct pfarg_reg *req = NULL;
4979+ void *fptr, *resume;
4980+ unsigned long flags;
4981+ size_t sz;
4982+ int ret, fput_needed;
4983+
4984+ if (count < 1 || count >= PFM_MAX_ARG_COUNT(req))
4985+ return -EINVAL;
4986+
4987+ sz = count*sizeof(*req);
4988+
4989+ filp = fget_light(fd, &fput_needed);
4990+ if (unlikely(filp == NULL)) {
4991+ PFM_DBG("invalid fd %d", fd);
4992+ return -EBADF;
4993+ }
4994+
4995+ ctx = filp->private_data;
4996+ ret = -EBADF;
4997+
4998+ if (unlikely(!ctx || filp->f_op != &pfm_file_ops)) {
4999+ PFM_DBG("fd %d not related to perfmon", fd);
5000+ goto error;
5001+ }
5002+
5003+ ret = pfm_get_args(ureq, sz, 0, NULL, (void **)&req, &fptr);
5004+ if (ret)
5005+ goto error;
5006+
5007+ spin_lock_irqsave(&ctx->lock, flags);
5008+
5009+ task = ctx->task;
5010+
5011+ ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags, &resume);
5012+ if (ret == 0)
5013+ ret = __pfm_read_pmds_old(ctx, req, count);
5014+
5015+ spin_unlock_irqrestore(&ctx->lock, flags);
5016+
5017+ if (resume)
5018+ pfm_resume_task(task, resume);
5019+
5020+ if (copy_to_user(ureq, req, sz))
5021+ ret = -EFAULT;
5022+
5023+ kfree(fptr);
5024+error:
5025+ fput_light(filp, fput_needed);
5026+ return ret;
5027+}
5028+
5029+/*
5030+ * OBSOLETE: use /proc/perfmon_map instead
5031+ */
5032+static long pfm_get_default_pmcs_old(int fd, void __user *ureq, int count)
5033+{
5034+ struct pfarg_reg *req = NULL;
5035+ void *fptr;
5036+ size_t sz;
5037+ int ret, i;
5038+ unsigned int cnum;
5039+
5040+	if (count < 1 || count >= PFM_MAX_ARG_COUNT(req))
5041+		return -EINVAL;
5042+
5043+ /*
5044+ * ensure the pfm_pmu_conf does not disappear while
5045+ * we use it
5046+ */
5047+ ret = pfm_pmu_conf_get(1);
5048+ if (ret)
5049+ return ret;
5050+
5051+	sz = count*sizeof(*req);
5052+
5053+ ret = pfm_get_args(ureq, sz, 0, NULL, (void **)&req, &fptr);
5054+ if (ret)
5055+ goto error;
5056+
5057+
5058+	for (i = 0; i < count; i++) {
5059+		cnum = req[i].reg_num;
5060+
5061+		if (i >= PFM_MAX_PMCS || cnum >= pfm_pmu_conf->num_pmc_entries ||
5062+		    (pfm_pmu_conf->pmc_desc[cnum].type & PFM_REG_I) == 0) {
5063+			req[i].reg_flags = PFM_REG_RETFL_EINVAL;
5064+			break;
5065+		}
5066+		req[i].reg_value = pfm_pmu_conf->pmc_desc[cnum].dfl_val;
5067+		req[i].reg_flags = 0;
5068+
5069+		PFM_DBG("pmc[%u]=0x%lx", cnum, req[i].reg_value);
5070+	}
5071+
5072+ if (copy_to_user(ureq, req, sz))
5073+ ret = -EFAULT;
5074+
5075+ kfree(fptr);
5076+error:
5077+ pfm_pmu_conf_put();
5078+
5079+ return ret;
5080+}
5081+
5082+/*
5083+ * allocates a sampling buffer and remaps it into the user address space of
5084+ * the task. This is used only in compatibility mode.
5085+ *
5086+ * function called ONLY on current task
5087+ */
5088+int pfm_smpl_buf_alloc_compat(struct pfm_context *ctx, size_t rsize,
5089+ struct file *filp)
5090+{
5091+ struct mm_struct *mm = current->mm;
5092+ struct vm_area_struct *vma = NULL;
5093+ struct pfm_arch_context *ctx_arch;
5094+ size_t size;
5095+ int ret;
5096+ extern struct vm_operations_struct pfm_buf_map_vm_ops;
5097+
5098+ ctx_arch = pfm_ctx_arch(ctx);
5099+
5100+ /*
5101+ * allocate buffer + map desc
5102+ */
5103+ ret = pfm_smpl_buf_alloc(ctx, rsize);
5104+ if (ret)
5105+ return ret;
5106+
5107+ size = ctx->smpl_size;
5108+
5109+
5110+ /* allocate vma */
5111+ vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
5112+ if (!vma) {
5113+ PFM_DBG("Cannot allocate vma");
5114+ goto error_kmem;
5115+ }
5116+ memset(vma, 0, sizeof(*vma));
5117+
5118+ /*
5119+ * partially initialize the vma for the sampling buffer
5120+ */
5121+ vma->vm_mm = mm;
5122+ vma->vm_flags = VM_READ | VM_MAYREAD | VM_RESERVED;
5123+ vma->vm_page_prot = PAGE_READONLY;
5124+ vma->vm_ops = &pfm_buf_map_vm_ops;
5125+ vma->vm_file = filp;
5126+ vma->vm_private_data = ctx;
5127+ vma->vm_pgoff = 0;
5128+
5129+ /*
5130+ * simulate effect of mmap()
5131+ */
5132+ get_file(filp);
5133+
5134+ /*
5135+ * Let's do the difficult operations next.
5136+ *
5137+ * now we atomically find some area in the address space and
5138+ * remap the buffer into it.
5139+ */
5140+ down_write(&current->mm->mmap_sem);
5141+
5142+ /* find some free area in address space, must have mmap sem held */
5143+ vma->vm_start = get_unmapped_area(NULL, 0, size, 0,
5144+ MAP_PRIVATE|MAP_ANONYMOUS);
5145+ if (vma->vm_start == 0) {
5146+ PFM_DBG("cannot find unmapped area of size %zu", size);
5147+ up_write(&current->mm->mmap_sem);
5148+ goto error;
5149+ }
5150+ vma->vm_end = vma->vm_start + size;
5151+
5152+ PFM_DBG("aligned_size=%zu mapped @0x%lx", size, vma->vm_start);
5153+ /*
5154+ * now insert the vma in the vm list for the process, must be
5155+ * done with mmap lock held
5156+ */
5157+ insert_vm_struct(mm, vma);
5158+
5159+ mm->total_vm += size >> PAGE_SHIFT;
5160+
5161+ up_write(&current->mm->mmap_sem);
5162+
5163+ /*
5164+ * IMPORTANT: we do not issue the fput()
5165+ * because we want to increase the ref count
5166+ * on the descriptor to simulate what mmap()
5167+ * would do
5168+ */
5169+
5170+ /*
5171+ * used to propagate vaddr to syscall stub
5172+ */
5173+ ctx_arch->ctx_smpl_vaddr = (void *)vma->vm_start;
5174+
5175+ return 0;
5176+error:
5177+ kmem_cache_free(vm_area_cachep, vma);
5178+error_kmem:
5179+ pfm_smpl_buf_space_release(ctx, ctx->smpl_size);
5180+ vfree(ctx->smpl_addr);
5181+ return -ENOMEM;
5182+}
5183+
5184+#define PFM_DEFAULT_SMPL_UUID { \
5185+ 0x4d, 0x72, 0xbe, 0xc0, 0x06, 0x64, 0x41, 0x43, 0x82,\
5186+ 0xb4, 0xd3, 0xfd, 0x27, 0x24, 0x3c, 0x97}
5187+
5188+static pfm_uuid_t old_default_uuid = PFM_DEFAULT_SMPL_UUID;
5189+static pfm_uuid_t null_uuid;
5190+
5191+/*
5192+ * function invoked when pfm_context_create fails
5193+ * at the last operation, copy_to_user. It needs to
5194+ * undo memory allocations and free the file descriptor
5195+ */
5196+static void pfm_undo_create_context_fd(int fd, struct pfm_context *ctx)
5197+{
5198+ struct files_struct *files = current->files;
5199+ struct file *file;
5200+ int fput_needed;
5201+
5202+ file = fget_light(fd, &fput_needed);
5203+ /*
5204+ * there is no fd_uninstall(), so we do it
5205+ * here. put_unused_fd() does not remove the
5206+ * effect of fd_install().
5207+ */
5208+
5209+ spin_lock(&files->file_lock);
5210+ files->fd_array[fd] = NULL;
5211+ spin_unlock(&files->file_lock);
5212+
5213+ fput_light(file, fput_needed);
5214+
5215+ /*
5216+ * decrement ref count and kill file
5217+ */
5218+ put_filp(file);
5219+
5220+ put_unused_fd(fd);
5221+
5222+ pfm_free_context(ctx);
5223+}
5224+
5225+static int pfm_get_smpl_arg_old(pfm_uuid_t uuid, void __user *fmt_uarg,
5226+ size_t usize, void **arg,
5227+ struct pfm_smpl_fmt **fmt)
5228+{
5229+ struct pfm_smpl_fmt *f;
5230+ void *addr = NULL;
5231+ size_t sz;
5232+ int ret;
5233+
5234+ if (!memcmp(uuid, null_uuid, sizeof(pfm_uuid_t)))
5235+ return 0;
5236+
5237+ if (memcmp(uuid, old_default_uuid, sizeof(pfm_uuid_t))) {
5238+ PFM_DBG("compatibility mode supports only default sampling format");
5239+ return -EINVAL;
5240+ }
5241+ /*
5242+ * find fmt and increase refcount
5243+ */
5244+ f = pfm_smpl_fmt_get("default-old");
5245+ if (f == NULL) {
5246+ PFM_DBG("default-old buffer format not found");
5247+ return -EINVAL;
5248+ }
5249+
5250+ /*
5251+ * expected format argument size
5252+ */
5253+ sz = f->fmt_arg_size;
5254+
5255+ /*
5256+ * check user size matches expected size
5257+ * usize = -1 is for IA-64 backward compatibility
5258+ */
5259+ ret = -EINVAL;
5260+ if (sz != usize && usize != -1) {
5261+ PFM_DBG("invalid arg size %zu, format expects %zu",
5262+ usize, sz);
5263+ goto error;
5264+ }
5265+
5266+ ret = -ENOMEM;
5267+ addr = kmalloc(sz, GFP_KERNEL);
5268+ if (addr == NULL)
5269+ goto error;
5270+
5271+ ret = -EFAULT;
5272+ if (copy_from_user(addr, fmt_uarg, sz))
5273+ goto error;
5274+
5275+ *arg = addr;
5276+ *fmt = f;
5277+ return 0;
5278+
5279+error:
5280+ kfree(addr);
5281+ pfm_smpl_fmt_put(f);
5282+ return ret;
5283+}
5284+
5285+static long pfm_create_context_old(int fd, void __user *ureq, int count)
5286+{
5287+ struct pfm_context *new_ctx;
5288+ struct pfm_arch_context *ctx_arch;
5289+ struct pfm_smpl_fmt *fmt = NULL;
5290+ struct pfarg_context req_old;
5291+ void __user *usmpl_arg;
5292+ void *smpl_arg = NULL;
5293+ struct pfarg_ctx req;
5294+ int ret;
5295+
5296+ if (count != 1)
5297+ return -EINVAL;
5298+
5299+ if (copy_from_user(&req_old, ureq, sizeof(req_old)))
5300+ return -EFAULT;
5301+
5302+ memset(&req, 0, sizeof(req));
5303+
5304+ /*
5305+ * sampling format args are following pfarg_context
5306+ */
5307+ usmpl_arg = ureq+sizeof(req_old);
5308+
5309+ ret = pfm_get_smpl_arg_old(req_old.ctx_smpl_buf_id, usmpl_arg, -1,
5310+ &smpl_arg, &fmt);
5311+ if (ret)
5312+ return ret;
5313+
5314+ req.ctx_flags = req_old.ctx_flags;
5315+
5316+	/* returns a file descriptor if >= 0, otherwise an error code */
5318+ ret = __pfm_create_context(&req, fmt, smpl_arg, PFM_COMPAT, &new_ctx);
5319+ if (ret >= 0) {
5320+ ctx_arch = pfm_ctx_arch(new_ctx);
5321+ req_old.ctx_fd = ret;
5322+ req_old.ctx_smpl_vaddr = ctx_arch->ctx_smpl_vaddr;
5323+ }
5324+
5325+	if (copy_to_user(ureq, &req_old, sizeof(req_old))) {
5326+		if (ret >= 0)	/* only undo if the context was actually created */
5326+			pfm_undo_create_context_fd(req_old.ctx_fd, new_ctx);
5327+		ret = -EFAULT;
5328+	}
5329+
5330+ kfree(smpl_arg);
5331+
5332+ return ret;
5333+}
5334+
5335+/*
5336+ * obsolete call: use /proc/perfmon
5337+ */
5338+static long pfm_get_features_old(int fd, void __user *arg, int count)
5339+{
5340+ struct pfarg_features req;
5341+ int ret = 0;
5342+
5343+ if (count != 1)
5344+ return -EINVAL;
5345+
5346+ memset(&req, 0, sizeof(req));
5347+
5348+ req.ft_version = PFM_VERSION;
5349+
5350+ if (copy_to_user(arg, &req, sizeof(req)))
5351+ ret = -EFAULT;
5352+
5353+ return ret;
5354+}
5355+
5356+static long pfm_debug_old(int fd, void __user *arg, int count)
5357+{
5358+ int m;
5359+
5360+ if (count != 1)
5361+ return -EINVAL;
5362+
5363+ if (get_user(m, (int __user *)arg))
5364+ return -EFAULT;
5365+
5366+
5367+ pfm_controls.debug = m == 0 ? 0 : 1;
5368+
5369+ PFM_INFO("debugging %s (timing reset)",
5370+ pfm_controls.debug ? "on" : "off");
5371+
5372+ if (m == 0)
5373+ for_each_online_cpu(m) {
5374+ memset(&per_cpu(pfm_stats, m), 0,
5375+ sizeof(struct pfm_stats));
5376+ }
5377+ return 0;
5378+}
5379+
5380+static long pfm_unload_context_old(int fd, void __user *arg, int count)
5381+{
5382+ if (count)
5383+ return -EINVAL;
5384+
5385+ return sys_pfm_unload_context(fd);
5386+}
5387+
5388+static long pfm_restart_old(int fd, void __user *arg, int count)
5389+{
5390+ if (count)
5391+ return -EINVAL;
5392+
5393+ return sys_pfm_restart(fd);
5394+}
5395+
5396+static long pfm_stop_old(int fd, void __user *arg, int count)
5397+{
5398+ if (count)
5399+ return -EINVAL;
5400+
5401+ return sys_pfm_stop(fd);
5402+}
5403+
5404+static long pfm_start_old(int fd, void __user *arg, int count)
5405+{
5406+ if (count > 1)
5407+ return -EINVAL;
5408+
5409+ return sys_pfm_start(fd, arg);
5410+}
5411+
5412+static long pfm_load_context_old(int fd, void __user *ureq, int count)
5413+{
5414+ if (count != 1)
5415+ return -EINVAL;
5416+
5417+ return sys_pfm_load_context(fd, ureq);
5418+}
5419+
5420+/*
5421+ * perfmon command descriptions
5422+ */
5423+struct pfm_cmd_desc {
5424+ long (*cmd_func)(int fd, void __user *arg, int count);
5425+};
5426+
5427+/*
5428+ * functions MUST be listed in the increasing order of
5429+ * their index (see perfmon.h)
5430+ */
5431+#define PFM_CMD(name) \
5432+ { .cmd_func = name, \
5433+ }
5434+#define PFM_CMD_NONE \
5435+ { .cmd_func = NULL \
5436+ }
5437+
5438+static struct pfm_cmd_desc pfm_cmd_tab[] = {
5439+/* 0 */PFM_CMD_NONE,
5440+/* 1 */PFM_CMD(pfm_write_pmcs_old),
5441+/* 2 */PFM_CMD(pfm_write_pmds_old),
5442+/* 3 */PFM_CMD(pfm_read_pmds_old),
5443+/* 4 */PFM_CMD(pfm_stop_old),
5444+/* 5 */PFM_CMD(pfm_start_old),
5445+/* 6 */PFM_CMD_NONE,
5446+/* 7 */PFM_CMD_NONE,
5447+/* 8 */PFM_CMD(pfm_create_context_old),
5448+/* 9 */PFM_CMD_NONE,
5449+/* 10 */PFM_CMD(pfm_restart_old),
5450+/* 11 */PFM_CMD_NONE,
5451+/* 12 */PFM_CMD(pfm_get_features_old),
5452+/* 13 */PFM_CMD(pfm_debug_old),
5453+/* 14 */PFM_CMD_NONE,
5454+/* 15 */PFM_CMD(pfm_get_default_pmcs_old),
5455+/* 16 */PFM_CMD(pfm_load_context_old),
5456+/* 17 */PFM_CMD(pfm_unload_context_old),
5457+/* 18 */PFM_CMD_NONE,
5458+/* 19 */PFM_CMD_NONE,
5459+/* 20 */PFM_CMD_NONE,
5460+/* 21 */PFM_CMD_NONE,
5461+/* 22 */PFM_CMD_NONE,
5462+/* 23 */PFM_CMD_NONE,
5463+/* 24 */PFM_CMD_NONE,
5464+/* 25 */PFM_CMD_NONE,
5465+/* 26 */PFM_CMD_NONE,
5466+/* 27 */PFM_CMD_NONE,
5467+/* 28 */PFM_CMD_NONE,
5468+/* 29 */PFM_CMD_NONE,
5469+/* 30 */PFM_CMD_NONE,
5470+/* 31 */PFM_CMD_NONE,
5471+/* 32 */PFM_CMD(pfm_write_ibrs_old),
5472+/* 33 */PFM_CMD(pfm_write_dbrs_old),
5473+};
5474+#define PFM_CMD_COUNT ARRAY_SIZE(pfm_cmd_tab)
5475+
5476+/*
5477+ * system-call entry point (must return long)
5478+ */
5479+asmlinkage long sys_perfmonctl(int fd, int cmd, void __user *arg, int count)
5480+{
5481+ if (perfmon_disabled)
5482+ return -ENOSYS;
5483+
5484+ if (unlikely(cmd < 0 || cmd >= PFM_CMD_COUNT
5485+ || pfm_cmd_tab[cmd].cmd_func == NULL)) {
5486+ PFM_DBG("invalid cmd=%d", cmd);
5487+ return -EINVAL;
5488+ }
5489+ return (long)pfm_cmd_tab[cmd].cmd_func(fd, arg, count);
5490+}
5491+
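To make the dispatch concrete, here is a hedged userspace sketch of how a legacy v2.0 tool drives this entry point. The command indices come from pfm_cmd_tab[] above; the perfmonctl() wrapper, the header path, and the old pfarg_context_t/pfarg_load_t layouts are assumptions based on the pre-existing IA-64 interface (the table forwards the load argument unchanged to sys_pfm_load_context(), so the old layout is assumed to remain compatible):

	#include <string.h>
	#include <unistd.h>
	#include <perfmon/perfmon.h>	/* old v2.0 user-visible types; path assumed */

	/* ia64 libc wrapper for the syscall (assumed available) */
	extern int perfmonctl(int fd, int cmd, void *arg, int narg);

	static int monitor_self(void)
	{
		pfarg_context_t ctx;
		pfarg_load_t load;
		int fd;

		memset(&ctx, 0, sizeof(ctx));
		if (perfmonctl(0, 8 /* create context */, &ctx, 1) < 0)
			return -1;
		fd = ctx.ctx_fd;		/* filled in by pfm_create_context_old() */

		memset(&load, 0, sizeof(load));
		load.load_pid = getpid();	/* attach to the calling thread */
		perfmonctl(fd, 16 /* load context */, &load, 1);
		perfmonctl(fd, 5  /* start */, NULL, 0);
		/* ... run the workload; PMCs/PMDs are programmed via cmds 1/2/3 ... */
		perfmonctl(fd, 4  /* stop */, NULL, 0);
		perfmonctl(fd, 17 /* unload context */, NULL, 0);
		return fd;
	}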
5492+/*
5493+ * Called from pfm_read() for a perfmon v2.0 context.
5494+ *
5495+ * compatibility mode pfm_read() routine. We need a separate
5496+ * routine because the definition of the message has changed.
5497+ * The pfm_msg and pfarg_msg structures are different.
5498+ *
5499+ * return: sizeof(pfm_msg_t) on success, -errno otherwise
5500+ */
5501+ssize_t pfm_arch_compat_read(struct pfm_context *ctx,
5502+ char __user *buf,
5503+ int non_block,
5504+ size_t size)
5505+{
5506+ union pfarg_msg msg_buf;
5507+ pfm_msg_t old_msg_buf;
5508+ pfm_ovfl_msg_t *o_msg;
5509+ struct pfarg_ovfl_msg *n_msg;
5510+ int ret;
5511+
5512+ PFM_DBG("msg=%p size=%zu", buf, size);
5513+
5514+ /*
5515+ * cannot extract partial messages.
5516+ * check even when there is no message
5517+ *
5518+ * cannot extract more than one message per call. Bytes
5519+ * above sizeof(msg) are ignored.
5520+ */
5521+ if (size < sizeof(old_msg_buf)) {
5522+		PFM_DBG("message is too small: size=%zu, must be >=%zu",
5523+ size,
5524+ sizeof(old_msg_buf));
5525+ return -EINVAL;
5526+ }
5527+
5528+ ret = __pfm_read(ctx, &msg_buf, non_block);
5529+ if (ret < 1)
5530+ return ret;
5531+
5532+ /*
5533+ * force return value to old message size
5534+ */
5535+ ret = sizeof(old_msg_buf);
5536+
5537+ o_msg = &old_msg_buf.pfm_ovfl_msg;
5538+ n_msg = &msg_buf.pfm_ovfl_msg;
5539+
5540+ switch (msg_buf.type) {
5541+ case PFM_MSG_OVFL:
5542+ o_msg->msg_type = PFM_MSG_OVFL;
5543+ o_msg->msg_ctx_fd = 0;
5544+ o_msg->msg_active_set = n_msg->msg_active_set;
5545+ o_msg->msg_tstamp = 0;
5546+
5547+ o_msg->msg_ovfl_pmds[0] = n_msg->msg_ovfl_pmds[0];
5548+ o_msg->msg_ovfl_pmds[1] = n_msg->msg_ovfl_pmds[1];
5549+ o_msg->msg_ovfl_pmds[2] = n_msg->msg_ovfl_pmds[2];
5550+ o_msg->msg_ovfl_pmds[3] = n_msg->msg_ovfl_pmds[3];
5551+ break;
5552+ case PFM_MSG_END:
5553+ o_msg->msg_type = PFM_MSG_END;
5554+ o_msg->msg_ctx_fd = 0;
5555+ o_msg->msg_tstamp = 0;
5556+ break;
5557+ default:
5558+ PFM_DBG("unknown msg type=%d", msg_buf.type);
5559+ }
5560+ if (copy_to_user(buf, &old_msg_buf, sizeof(old_msg_buf)))
5561+ ret = -EFAULT;
5562+ PFM_DBG_ovfl("ret=%d", ret);
5563+ return ret;
5564+}
5565+
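On the consuming side, a hedged sketch of the old-style notification loop this conversion serves: a v2.0 tool issues blocking read()s of exactly sizeof(pfm_msg_t) on the context file descriptor. handle_overflow() is a hypothetical user routine and the pfm_msg_t layout is the old one assumed from the pre-existing headers:

	pfm_msg_t msg;

	for (;;) {
		if (read(ctx_fd, &msg, sizeof(msg)) != sizeof(msg))
			break;				/* error or short read */
		if (msg.pfm_ovfl_msg.msg_type == PFM_MSG_OVFL) {
			/* msg_ovfl_pmds[0..3] say which counters overflowed */
			handle_overflow(msg.pfm_ovfl_msg.msg_ovfl_pmds);
			perfmonctl(ctx_fd, 10 /* restart */, NULL, 0);
		} else if (msg.pfm_ovfl_msg.msg_type == PFM_MSG_END) {
			break;				/* monitored task is gone */
		}
	}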
5566+/*
5567+ * legacy /proc/perfmon simplified interface (we only maintain the
5568+ * global information; no more per-cpu stats, use
5569+ * /sys/devices/system/cpu/cpuXX/perfmon)
5570+ */
5571+static struct proc_dir_entry *perfmon_proc;
5572+
5573+static void *pfm_proc_start(struct seq_file *m, loff_t *pos)
5574+{
5575+ if (*pos == 0)
5576+ return (void *)1;
5577+
5578+ return NULL;
5579+}
5580+
5581+static void *pfm_proc_next(struct seq_file *m, void *v, loff_t *pos)
5582+{
5583+ ++*pos;
5584+ return pfm_proc_start(m, pos);
5585+}
5586+
5587+static void pfm_proc_stop(struct seq_file *m, void *v)
5588+{
5589+}
5590+
5591+/*
5592+ * this is a simplified version of the legacy /proc/perfmon.
5593+ * We have retained ONLY the key information that tools are actually
5594+ * using
5595+ */
5596+static void pfm_proc_show_header(struct seq_file *m)
5597+{
5598+ char buf[128];
5599+
5600+ pfm_sysfs_res_show(buf, sizeof(buf), 3);
5601+
5602+ seq_printf(m, "perfmon version : %u.%u\n",
5603+ PFM_VERSION_MAJ, PFM_VERSION_MIN);
5604+
5605+ seq_printf(m, "model : %s", buf);
5606+}
5607+
5608+static int pfm_proc_show(struct seq_file *m, void *v)
5609+{
5610+ pfm_proc_show_header(m);
5611+ return 0;
5612+}
5613+
5614+struct seq_operations pfm_proc_seq_ops = {
5615+ .start = pfm_proc_start,
5616+ .next = pfm_proc_next,
5617+ .stop = pfm_proc_stop,
5618+ .show = pfm_proc_show
5619+};
5620+
5621+static int pfm_proc_open(struct inode *inode, struct file *file)
5622+{
5623+ return seq_open(file, &pfm_proc_seq_ops);
5624+}
5625+
5626+
5627+static struct file_operations pfm_proc_fops = {
5628+ .open = pfm_proc_open,
5629+ .read = seq_read,
5630+ .llseek = seq_lseek,
5631+ .release = seq_release,
5632+};
5633+
5634+/*
5635+ * called from pfm_arch_init(), global initialization, called once
5636+ */
5637+int __init pfm_ia64_compat_init(void)
5638+{
5639+ /*
5640+ * create /proc/perfmon
5641+ */
5642+ perfmon_proc = create_proc_entry("perfmon", S_IRUGO, NULL);
5643+ if (perfmon_proc == NULL) {
5644+ PFM_ERR("cannot create /proc entry, perfmon disabled");
5645+ return -1;
5646+ }
5647+ perfmon_proc->proc_fops = &pfm_proc_fops;
5648+ return 0;
5649+}
5650--- /dev/null
5651+++ b/arch/ia64/perfmon/perfmon_default_smpl.c
5652@@ -0,0 +1,273 @@
5653+/*
5654+ * Copyright (c) 2002-2006 Hewlett-Packard Development Company, L.P.
5655+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
5656+ *
5657+ * This file implements the old default sampling buffer format
5658+ * for the Linux/ia64 perfmon-2 subsystem. This is for backward
5659+ * compatibility only. Use the new default format in perfmon/ instead.
5660+ *
5661+ * This program is free software; you can redistribute it and/or
5662+ * modify it under the terms of version 2 of the GNU General Public
5663+ * License as published by the Free Software Foundation.
5664+ *
5665+ * This program is distributed in the hope that it will be useful,
5666+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
5667+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5668+ * General Public License for more details.
5669+ *
5670+ * You should have received a copy of the GNU General Public License
5671+ * along with this program; if not, write to the Free Software
5672+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
5673+ * 02111-1307 USA
5674+ */
5675+#include <linux/kernel.h>
5676+#include <linux/types.h>
5677+#include <linux/module.h>
5678+#include <linux/init.h>
5679+#include <linux/delay.h>
5680+#include <linux/smp.h>
5681+#include <linux/sysctl.h>
5682+
5683+#ifdef MODULE
5684+#define FMT_FLAGS 0
5685+#else
5686+#define FMT_FLAGS PFM_FMTFL_IS_BUILTIN
5687+#endif
5688+
5689+#include <linux/perfmon_kern.h>
5690+#include <asm/perfmon_default_smpl.h>
5691+
5692+MODULE_AUTHOR("Stephane Eranian <eranian@hpl.hp.com>");
5693+MODULE_DESCRIPTION("perfmon old default sampling format");
5694+MODULE_LICENSE("GPL");
5695+
5696+static int pfm_default_fmt_validate(u32 flags, u16 npmds, void *data)
5697+{
5698+ struct pfm_default_smpl_arg *arg = data;
5699+ size_t min_buf_size;
5700+
5701+ if (data == NULL) {
5702+ PFM_DBG("no argument passed");
5703+ return -EINVAL;
5704+ }
5705+
5706+ /*
5707+	 * compute the minimum buffer size. All PMDs are manipulated as 64-bit entities
5708+ */
5709+ min_buf_size = sizeof(struct pfm_default_smpl_hdr)
5710+ + (sizeof(struct pfm_default_smpl_entry) + (npmds*sizeof(u64)));
5711+
5712+ PFM_DBG("validate flags=0x%x npmds=%u min_buf_size=%lu "
5713+ "buf_size=%lu CPU%d", flags, npmds, min_buf_size,
5714+ arg->buf_size, smp_processor_id());
5715+
5716+ /*
5717+ * must hold at least the buffer header + one minimally sized entry
5718+ */
5719+ if (arg->buf_size < min_buf_size)
5720+ return -EINVAL;
5721+
5722+ return 0;
5723+}
5724+
5725+static int pfm_default_fmt_get_size(unsigned int flags, void *data,
5726+ size_t *size)
5727+{
5728+ struct pfm_default_smpl_arg *arg = data;
5729+
5730+ /*
5731+ * size has been validated in default_validate
5732+ */
5733+ *size = arg->buf_size;
5734+
5735+ return 0;
5736+}
5737+
5738+static int pfm_default_fmt_init(struct pfm_context *ctx, void *buf,
5739+ u32 flags, u16 npmds, void *data)
5740+{
5741+ struct pfm_default_smpl_hdr *hdr;
5742+ struct pfm_default_smpl_arg *arg = data;
5743+
5744+ hdr = buf;
5745+
5746+ hdr->hdr_version = PFM_DEFAULT_SMPL_VERSION;
5747+ hdr->hdr_buf_size = arg->buf_size;
5748+ hdr->hdr_cur_offs = sizeof(*hdr);
5749+ hdr->hdr_overflows = 0;
5750+ hdr->hdr_count = 0;
5751+
5752+ PFM_DBG("buffer=%p buf_size=%lu hdr_size=%lu "
5753+ "hdr_version=%u cur_offs=%lu",
5754+ buf,
5755+ hdr->hdr_buf_size,
5756+ sizeof(*hdr),
5757+ hdr->hdr_version,
5758+ hdr->hdr_cur_offs);
5759+
5760+ return 0;
5761+}
5762+
5763+static int pfm_default_fmt_handler(struct pfm_context *ctx,
5764+ unsigned long ip, u64 tstamp, void *data)
5765+{
5766+ struct pfm_default_smpl_hdr *hdr;
5767+ struct pfm_default_smpl_entry *ent;
5768+ void *cur, *last, *buf;
5769+ u64 *e;
5770+ size_t entry_size;
5771+ u16 npmds, i, ovfl_pmd;
5772+ struct pfm_ovfl_arg *arg;
5773+
5774+ hdr = ctx->smpl_addr;
5775+ arg = &ctx->ovfl_arg;
5776+
5777+ buf = hdr;
5778+ cur = buf+hdr->hdr_cur_offs;
5779+ last = buf+hdr->hdr_buf_size;
5780+ ovfl_pmd = arg->ovfl_pmd;
5781+
5782+ /*
5783+ * precheck for sanity
5784+ */
5785+ if ((last - cur) < PFM_DEFAULT_MAX_ENTRY_SIZE)
5786+ goto full;
5787+
5788+ npmds = arg->num_smpl_pmds;
5789+
5790+ ent = cur;
5791+
5792+ prefetch(arg->smpl_pmds_values);
5793+
5794+ entry_size = sizeof(*ent) + (npmds << 3);
5795+
5796+ /* position for first pmd */
5797+ e = (unsigned long *)(ent+1);
5798+
5799+ hdr->hdr_count++;
5800+
5801+ PFM_DBG_ovfl("count=%lu cur=%p last=%p free_bytes=%lu "
5802+ "ovfl_pmd=%d npmds=%u",
5803+ hdr->hdr_count,
5804+ cur, last,
5805+ last-cur,
5806+ ovfl_pmd,
5807+ npmds);
5808+
5809+ /*
5810+ * current = task running at the time of the overflow.
5811+ *
5812+ * per-task mode:
5813+	 * - this is usually the task being monitored.
5814+ * Under certain conditions, it might be a different task
5815+ *
5816+ * system-wide:
5817+ * - this is not necessarily the task controlling the session
5818+ */
5819+ ent->pid = current->pid;
5820+ ent->ovfl_pmd = ovfl_pmd;
5821+ ent->last_reset_val = arg->pmd_last_reset;
5822+
5823+ /*
5824+ * where did the fault happen (includes slot number)
5825+ */
5826+ ent->ip = ip;
5827+
5828+ ent->tstamp = tstamp;
5829+ ent->cpu = smp_processor_id();
5830+ ent->set = arg->active_set;
5831+ ent->tgid = current->tgid;
5832+
5833+ /*
5834+ * selectively store PMDs in increasing index number
5835+ */
5836+ if (npmds) {
5837+ u64 *val = arg->smpl_pmds_values;
5838+ for (i = 0; i < npmds; i++)
5839+ *e++ = *val++;
5840+ }
5841+
5842+ /*
5843+ * update position for next entry
5844+ */
5845+ hdr->hdr_cur_offs += entry_size;
5846+ cur += entry_size;
5847+
5848+ /*
5849+ * post check to avoid losing the last sample
5850+ */
5851+ if ((last - cur) < PFM_DEFAULT_MAX_ENTRY_SIZE)
5852+ goto full;
5853+
5854+ /*
5855+ * reset before returning from interrupt handler
5856+ */
5857+ arg->ovfl_ctrl = PFM_OVFL_CTRL_RESET;
5858+ return 0;
5859+full:
5860+ PFM_DBG_ovfl("smpl buffer full free=%lu, count=%lu",
5861+ last-cur, hdr->hdr_count);
5862+
5863+ /*
5864+	 * increment the number of buffer overflows; this is
5865+	 * important to detect duplicate sets of samples.
5866+ */
5867+ hdr->hdr_overflows++;
5868+
5869+ /*
5870+ * request notification and masking of monitoring.
5871+	 * Notification is still subject to the overflowed PMD having its notify flag set.
5872+ */
5873+ arg->ovfl_ctrl = PFM_OVFL_CTRL_NOTIFY | PFM_OVFL_CTRL_MASK;
5874+
5875+ return -ENOBUFS; /* we are full, sorry */
5876+}
5877+
5878+static int pfm_default_fmt_restart(int is_active, u32 *ovfl_ctrl, void *buf)
5879+{
5880+ struct pfm_default_smpl_hdr *hdr;
5881+
5882+ hdr = buf;
5883+
5884+ hdr->hdr_count = 0;
5885+ hdr->hdr_cur_offs = sizeof(*hdr);
5886+
5887+ *ovfl_ctrl = PFM_OVFL_CTRL_RESET;
5888+
5889+ return 0;
5890+}
5891+
5892+static int pfm_default_fmt_exit(void *buf)
5893+{
5894+ return 0;
5895+}
5896+
5897+static struct pfm_smpl_fmt default_fmt = {
5898+ .fmt_name = "default-old",
5899+ .fmt_version = 0x10000,
5900+ .fmt_arg_size = sizeof(struct pfm_default_smpl_arg),
5901+ .fmt_validate = pfm_default_fmt_validate,
5902+ .fmt_getsize = pfm_default_fmt_get_size,
5903+ .fmt_init = pfm_default_fmt_init,
5904+ .fmt_handler = pfm_default_fmt_handler,
5905+ .fmt_restart = pfm_default_fmt_restart,
5906+ .fmt_exit = pfm_default_fmt_exit,
5907+ .fmt_flags = FMT_FLAGS,
5908+ .owner = THIS_MODULE
5909+};
5910+
5911+static int pfm_default_fmt_init_module(void)
5912+{
5915+	return pfm_fmt_register(&default_fmt);
5917+}
5918+
5919+static void pfm_default_fmt_cleanup_module(void)
5920+{
5921+ pfm_fmt_unregister(&default_fmt);
5922+}
5923+
5924+module_init(pfm_default_fmt_init_module);
5925+module_exit(pfm_default_fmt_cleanup_module);
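For orientation, a hedged sketch of how a consumer walks the buffer this format fills: the header sits at the start of the mapping returned through ctx_smpl_vaddr, and each record is a pfm_default_smpl_entry followed by the sampled PMD values, matching the entry_size = sizeof(*ent) + npmds*8 layout appended by the handler above. walk_smpl_buffer() is hypothetical; the header path is assumed, and npmds is whatever the tool programmed via reg_smpl_pmds, so it is passed in by the caller:

	#include <stdio.h>
	#include <perfmon/perfmon_default_smpl.h>	/* old format header; path assumed */

	static void walk_smpl_buffer(void *vaddr, unsigned int npmds)
	{
		struct pfm_default_smpl_hdr *hdr = vaddr;
		void *pos = vaddr + sizeof(*hdr);
		unsigned long i;

		for (i = 0; i < hdr->hdr_count; i++) {
			struct pfm_default_smpl_entry *ent = pos;
			unsigned long *pmds = (unsigned long *)(ent + 1);

			printf("pid=%d cpu=%u ip=%#lx ovfl_pmd=%u\n",
			       ent->pid, (unsigned int)ent->cpu,
			       (unsigned long)ent->ip, (unsigned int)ent->ovfl_pmd);
			(void)pmds;	/* npmds sampled values follow each entry */

			pos += sizeof(*ent) + npmds * sizeof(unsigned long);
		}
	}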
5926--- /dev/null
5927+++ b/arch/ia64/perfmon/perfmon_generic.c
5928@@ -0,0 +1,148 @@
5929+/*
5930+ * This file contains the generic PMU register description tables
5931+ * and pmc checker used by perfmon.c.
5932+ *
5933+ * Copyright (c) 2002-2006 Hewlett-Packard Development Company, L.P.
5934+ * contributed by Stephane Eranian <eranian@hpl.hp.com>
5935+ *
5936+ * This program is free software; you can redistribute it and/or
5937+ * modify it under the terms of version 2 of the GNU General Public
5938+ * License as published by the Free Software Foundation.
5939+ *
5940+ * This program is distributed in the hope that it will be useful,
5941+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
5942+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5943+ * General Public License for more details.
5944+ *
5945+ * You should have received a copy of the GNU General Public License
5946+ * along with this program; if not, write to the Free Software
5947+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
5948+ * 02111-1307 USA
5949+ */
5950+#include <linux/module.h>
5951+#include <linux/perfmon_kern.h>
5952+#include <asm/pal.h>
5953+
5954+MODULE_AUTHOR("Stephane Eranian <eranian@hpl.hp.com>");
5955+MODULE_DESCRIPTION("Generic IA-64 PMU description tables");
5956+MODULE_LICENSE("GPL");
5957+
5958+#define RDEP(x) (1UL << (x))
5959+
5960+#define PFM_IA64GEN_MASK_PMCS (RDEP(4)|RDEP(5)|RDEP(6)|RDEP(7))
5961+#define PFM_IA64GEN_RSVD (0xffffffffffff0080UL)
5962+#define PFM_IA64GEN_NO64 (1UL<<5)
5963+
5964+/* forward declaration */
5965+static struct pfm_pmu_config pfm_ia64gen_pmu_conf;
5966+
5967+static struct pfm_arch_pmu_info pfm_ia64gen_pmu_info = {
5968+ .mask_pmcs = {PFM_IA64GEN_MASK_PMCS,},
5969+};
5970+
5971+static struct pfm_regmap_desc pfm_ia64gen_pmc_desc[] = {
5972+/* pmc0 */ PMX_NA,
5973+/* pmc1 */ PMX_NA,
5974+/* pmc2 */ PMX_NA,
5975+/* pmc3 */ PMX_NA,
5976+/* pmc4 */ PMC_D(PFM_REG_W64, "PMC4", 0x0, PFM_IA64GEN_RSVD, PFM_IA64GEN_NO64, 4),
5977+/* pmc5 */ PMC_D(PFM_REG_W64, "PMC5", 0x0, PFM_IA64GEN_RSVD, PFM_IA64GEN_NO64, 5),
5978+/* pmc6 */ PMC_D(PFM_REG_W64, "PMC6", 0x0, PFM_IA64GEN_RSVD, PFM_IA64GEN_NO64, 6),
5979+/* pmc7 */ PMC_D(PFM_REG_W64, "PMC7", 0x0, PFM_IA64GEN_RSVD, PFM_IA64GEN_NO64, 7)
5980+};
5981+#define PFM_IA64GEN_NUM_PMCS ARRAY_SIZE(pfm_ia64gen_pmc_desc)
5982+
5983+static struct pfm_regmap_desc pfm_ia64gen_pmd_desc[] = {
5984+/* pmd0 */ PMX_NA,
5985+/* pmd1 */ PMX_NA,
5986+/* pmd2 */ PMX_NA,
5987+/* pmd3 */ PMX_NA,
5988+/* pmd4 */ PMD_DP(PFM_REG_C, "PMD4", 4, 1ull << 4),
5989+/* pmd5 */ PMD_DP(PFM_REG_C, "PMD5", 5, 1ull << 5),
5990+/* pmd6 */ PMD_DP(PFM_REG_C, "PMD6", 6, 1ull << 6),
5991+/* pmd7 */ PMD_DP(PFM_REG_C, "PMD7", 7, 1ull << 7)
5992+};
5993+#define PFM_IA64GEN_NUM_PMDS ARRAY_SIZE(pfm_ia64gen_pmd_desc)
5994+
5995+static int pfm_ia64gen_pmc_check(struct pfm_context *ctx,
5996+ struct pfm_event_set *set,
5997+ struct pfarg_pmc *req)
5998+{
5999+#define PFM_IA64GEN_PMC_PM_POS6 (1UL<<6)
6000+ u64 tmpval;
6001+ int is_system;
6002+
6003+ is_system = ctx->flags.system;
6004+ tmpval = req->reg_value;
6005+
6006+ switch (req->reg_num) {
6007+ case 4:
6008+ case 5:
6009+ case 6:
6010+ case 7:
6011+ /* set pmc.oi for 64-bit emulation */
6012+ tmpval |= 1UL << 5;
6013+
6014+ if (is_system)
6015+ tmpval |= PFM_IA64GEN_PMC_PM_POS6;
6016+ else
6017+ tmpval &= ~PFM_IA64GEN_PMC_PM_POS6;
6018+ break;
6019+
6020+ }
6021+ req->reg_value = tmpval;
6022+
6023+ return 0;
6024+}
6025+
6026+/*
6027+ * matches anything
6028+ */
6029+static int pfm_ia64gen_probe_pmu(void)
6030+{
6031+ u64 pm_buffer[16];
6032+ pal_perf_mon_info_u_t pm_info;
6033+
6034+ /*
6035+ * call PAL_PERFMON_INFO to retrieve counter width which
6036+ * is implementation specific
6037+ */
6038+ if (ia64_pal_perf_mon_info(pm_buffer, &pm_info))
6039+ return -1;
6040+
6041+ pfm_ia64gen_pmu_conf.counter_width = pm_info.pal_perf_mon_info_s.width;
6042+
6043+ return 0;
6044+}
6045+
6046+/*
6047+ * impl_pmcs, impl_pmds are computed at runtime to minimize errors!
6048+ */
6049+static struct pfm_pmu_config pfm_ia64gen_pmu_conf = {
6050+ .pmu_name = "Generic IA-64",
6051+ .counter_width = 0, /* computed from PAL_PERFMON_INFO */
6052+ .pmd_desc = pfm_ia64gen_pmd_desc,
6053+ .pmc_desc = pfm_ia64gen_pmc_desc,
6054+ .probe_pmu = pfm_ia64gen_probe_pmu,
6055+ .num_pmc_entries = PFM_IA64GEN_NUM_PMCS,
6056+ .num_pmd_entries = PFM_IA64GEN_NUM_PMDS,
6057+ .pmc_write_check = pfm_ia64gen_pmc_check,
6058+ .version = "1.0",
6059+ .flags = PFM_PMU_BUILTIN_FLAG,
6060+ .owner = THIS_MODULE,
6061+ .pmu_info = &pfm_ia64gen_pmu_info
6062+ /* no read/write checkers */
6063+};
6064+
6065+static int __init pfm_gen_pmu_init_module(void)
6066+{
6067+ return pfm_pmu_register(&pfm_ia64gen_pmu_conf);
6068+}
6069+
6070+static void __exit pfm_gen_pmu_cleanup_module(void)
6071+{
6072+ pfm_pmu_unregister(&pfm_ia64gen_pmu_conf);
6073+}
6074+
6075+module_init(pfm_gen_pmu_init_module);
6076+module_exit(pfm_gen_pmu_cleanup_module);
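One detail worth spelling out (hedged sketch, not code from this patch): probe_pmu() queries PAL_PERFMON_INFO because the hardware counters are narrower than 64 bits, and the perfmon core uses counter_width to virtualize them into full 64-bit counters, conceptually along these lines; pfm_virt_counter() is illustrative only:

	/*
	 * Illustrative: combine the software-maintained upper bits with the
	 * 'width'-bit hardware PMD value (width as reported by PAL_PERFMON_INFO).
	 */
	static inline u64 pfm_virt_counter(u64 sw_accum, u64 hw_val, unsigned int width)
	{
		u64 ovfl_mask = (1ULL << width) - 1;

		return (sw_accum & ~ovfl_mask) | (hw_val & ovfl_mask);
	}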
6077--- /dev/null
6078+++ b/arch/ia64/perfmon/perfmon_itanium.c
6079@@ -0,0 +1,232 @@
6080+/*
6081+ * This file contains the Itanium PMU register description tables
6082+ * and pmc checker used by perfmon.c.
6083+ *
6084+ * Copyright (c) 2002-2006 Hewlett-Packard Development Company, L.P.
6085+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
6086+ *
6087+ * This program is free software; you can redistribute it and/or
6088+ * modify it under the terms of version 2 of the GNU General Public
6089+ * License as published by the Free Software Foundation.
6090+ *
6091+ * This program is distributed in the hope that it will be useful,
6092+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
6093+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
6094+ * General Public License for more details.
6095+ *
6096+ * You should have received a copy of the GNU General Public License
6097+ * along with this program; if not, write to the Free Software
6098+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
6099+ * 02111-1307 USA
6100+ */
6101+#include <linux/module.h>
6102+#include <linux/perfmon_kern.h>
6103+
6104+MODULE_AUTHOR("Stephane Eranian <eranian@hpl.hp.com>");
6105+MODULE_DESCRIPTION("Itanium (Merced) PMU description tables");
6106+MODULE_LICENSE("GPL");
6107+
6108+#define RDEP(x) (1ULL << (x))
6109+
6110+#define PFM_ITA_MASK_PMCS (RDEP(4)|RDEP(5)|RDEP(6)|RDEP(7)|RDEP(10)|RDEP(11)|\
6111+ RDEP(12))
6112+
6113+#define PFM_ITA_NO64 (1ULL<<5)
6114+
6115+static struct pfm_arch_pmu_info pfm_ita_pmu_info = {
6116+ .mask_pmcs = {PFM_ITA_MASK_PMCS,},
6117+};
6118+/* reserved bits are 1 in the mask */
6119+#define PFM_ITA_RSVD 0xfffffffffc8000a0UL
6120+/*
6121+ * For debug registers, writing xBR(y) means we use also xBR(y+1). Hence using
6122+ * PMC256+y means we use PMC256+y+1. Yet, we do not have dependency information
6123+ * but this is fine because they are handled separately in the IA-64 specific
6124+ * code.
6125+ */
6126+static struct pfm_regmap_desc pfm_ita_pmc_desc[] = {
6127+/* pmc0 */ PMX_NA,
6128+/* pmc1 */ PMX_NA,
6129+/* pmc2 */ PMX_NA,
6130+/* pmc3 */ PMX_NA,
6131+/* pmc4 */ PMC_D(PFM_REG_W64, "PMC4" , 0x20, PFM_ITA_RSVD, PFM_ITA_NO64, 4),
6132+/* pmc5 */ PMC_D(PFM_REG_W64, "PMC5" , 0x20, PFM_ITA_RSVD, PFM_ITA_NO64, 5),
6133+/* pmc6 */ PMC_D(PFM_REG_W64, "PMC6" , 0x20, PFM_ITA_RSVD, PFM_ITA_NO64, 6),
6134+/* pmc7 */ PMC_D(PFM_REG_W64, "PMC7" , 0x20, PFM_ITA_RSVD, PFM_ITA_NO64, 7),
6135+/* pmc8 */ PMC_D(PFM_REG_W , "PMC8" , 0xfffffffe3ffffff8UL, 0xfff00000001c0000UL, 0, 8),
6136+/* pmc9 */ PMC_D(PFM_REG_W , "PMC9" , 0xfffffffe3ffffff8UL, 0xfff00000001c0000UL, 0, 9),
6137+/* pmc10 */ PMC_D(PFM_REG_W , "PMC10", 0x0, 0xfffffffff3f0ff30UL, 0, 10),
6138+/* pmc11 */ PMC_D(PFM_REG_W , "PMC11", 0x10000000UL, 0xffffffffecf0ff30UL, 0, 11),
6139+/* pmc12 */ PMC_D(PFM_REG_W , "PMC12", 0x0, 0xffffffffffff0030UL, 0, 12),
6140+/* pmc13 */ PMC_D(PFM_REG_W , "PMC13", 0x3ffff00000001UL, 0xfffffffffffffffeUL, 0, 13),
6141+/* pmc14 */ PMX_NA,
6142+/* pmc15 */ PMX_NA,
6143+/* pmc16 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6144+/* pmc24 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6145+/* pmc32 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6146+/* pmc40 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6147+/* pmc48 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6148+/* pmc56 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6149+/* pmc64 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6150+/* pmc72 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6151+/* pmc80 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6152+/* pmc88 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6153+/* pmc96 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6154+/* pmc104 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6155+/* pmc112 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6156+/* pmc120 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6157+/* pmc128 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6158+/* pmc136 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6159+/* pmc144 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6160+/* pmc152 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6161+/* pmc160 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6162+/* pmc168 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6163+/* pmc176 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6164+/* pmc184 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6165+/* pmc192 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6166+/* pmc200 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6167+/* pmc208 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6168+/* pmc216 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6169+/* pmc224 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6170+/* pmc232 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6171+/* pmc240 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6172+/* pmc248 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6173+/* pmc256 */ PMC_D(PFM_REG_W , "IBR0", 0x0, 0, 0, 0),
6174+/* pmc257 */ PMC_D(PFM_REG_W , "IBR1", 0x0, 0x8000000000000000UL, 0, 1),
6175+/* pmc258 */ PMC_D(PFM_REG_W , "IBR2", 0x0, 0, 0, 2),
6176+/* pmc259 */ PMC_D(PFM_REG_W , "IBR3", 0x0, 0x8000000000000000UL, 0, 3),
6177+/* pmc260 */ PMC_D(PFM_REG_W , "IBR4", 0x0, 0, 0, 4),
6178+/* pmc261 */ PMC_D(PFM_REG_W , "IBR5", 0x0, 0x8000000000000000UL, 0, 5),
6179+/* pmc262 */ PMC_D(PFM_REG_W , "IBR6", 0x0, 0, 0, 6),
6180+/* pmc263 */ PMC_D(PFM_REG_W , "IBR7", 0x0, 0x8000000000000000UL, 0, 7),
6181+/* pmc264 */ PMC_D(PFM_REG_W , "DBR0", 0x0, 0, 0, 0),
6182+/* pmc265 */ PMC_D(PFM_REG_W , "DBR1", 0x0, 0xc000000000000000UL, 0, 1),
6183+/* pmc266 */ PMC_D(PFM_REG_W , "DBR2", 0x0, 0, 0, 2),
6184+/* pmc267 */ PMC_D(PFM_REG_W , "DBR3", 0x0, 0xc000000000000000UL, 0, 3),
6185+/* pmc268 */ PMC_D(PFM_REG_W , "DBR4", 0x0, 0, 0, 4),
6186+/* pmc269 */ PMC_D(PFM_REG_W , "DBR5", 0x0, 0xc000000000000000UL, 0, 5),
6187+/* pmc270 */ PMC_D(PFM_REG_W , "DBR6", 0x0, 0, 0, 6),
6188+/* pmc271 */ PMC_D(PFM_REG_W , "DBR7", 0x0, 0xc000000000000000UL, 0, 7)
6189+};
6190+#define PFM_ITA_NUM_PMCS ARRAY_SIZE(pfm_ita_pmc_desc)
6191+
6192+static struct pfm_regmap_desc pfm_ita_pmd_desc[] = {
6193+/* pmd0 */ PMD_DP(PFM_REG_I , "PMD0", 0, 1ull << 10),
6194+/* pmd1 */ PMD_DP(PFM_REG_I , "PMD1", 1, 1ull << 10),
6195+/* pmd2 */ PMD_DP(PFM_REG_I , "PMD2", 2, 1ull << 11),
6196+/* pmd3 */ PMD_DP(PFM_REG_I , "PMD3", 3, 1ull << 11),
6197+/* pmd4 */ PMD_DP(PFM_REG_C , "PMD4", 4, 1ull << 4),
6198+/* pmd5 */ PMD_DP(PFM_REG_C , "PMD5", 5, 1ull << 5),
6199+/* pmd6 */ PMD_DP(PFM_REG_C , "PMD6", 6, 1ull << 6),
6200+/* pmd7 */ PMD_DP(PFM_REG_C , "PMD7", 7, 1ull << 7),
6201+/* pmd8 */ PMD_DP(PFM_REG_I , "PMD8", 8, 1ull << 12),
6202+/* pmd9 */ PMD_DP(PFM_REG_I , "PMD9", 9, 1ull << 12),
6203+/* pmd10 */ PMD_DP(PFM_REG_I , "PMD10", 10, 1ull << 12),
6204+/* pmd11 */ PMD_DP(PFM_REG_I , "PMD11", 11, 1ull << 12),
6205+/* pmd12 */ PMD_DP(PFM_REG_I , "PMD12", 12, 1ull << 12),
6206+/* pmd13 */ PMD_DP(PFM_REG_I , "PMD13", 13, 1ull << 12),
6207+/* pmd14 */ PMD_DP(PFM_REG_I , "PMD14", 14, 1ull << 12),
6208+/* pmd15 */ PMD_DP(PFM_REG_I , "PMD15", 15, 1ull << 12),
6209+/* pmd16 */ PMD_DP(PFM_REG_I , "PMD16", 16, 1ull << 12),
6210+/* pmd17 */ PMD_DP(PFM_REG_I , "PMD17", 17, 1ull << 11)
6211+};
6212+#define PFM_ITA_NUM_PMDS ARRAY_SIZE(pfm_ita_pmd_desc)
6213+
6214+static int pfm_ita_pmc_check(struct pfm_context *ctx,
6215+ struct pfm_event_set *set,
6216+ struct pfarg_pmc *req)
6217+{
6218+#define PFM_ITA_PMC_PM_POS6 (1UL<<6)
6219+ struct pfm_arch_context *ctx_arch;
6220+ u64 tmpval;
6221+ u16 cnum;
6222+ int ret = 0, is_system;
6223+
6224+ tmpval = req->reg_value;
6225+ cnum = req->reg_num;
6226+ ctx_arch = pfm_ctx_arch(ctx);
6227+ is_system = ctx->flags.system;
6228+
6229+ switch (cnum) {
6230+ case 4:
6231+ case 5:
6232+ case 6:
6233+ case 7:
6234+ case 10:
6235+ case 11:
6236+ case 12:
6237+ if (is_system)
6238+ tmpval |= PFM_ITA_PMC_PM_POS6;
6239+ else
6240+ tmpval &= ~PFM_ITA_PMC_PM_POS6;
6241+ break;
6242+ }
6243+
6244+ /*
6245+ * we must clear the (instruction) debug registers if pmc13.ta bit is
6246+ * cleared before they are written (fl_using_dbreg==0) to avoid
6247+ * picking up stale information.
6248+ */
6249+ if (cnum == 13 && ((tmpval & 0x1) == 0)
6250+ && ctx_arch->flags.use_dbr == 0) {
6251+ PFM_DBG("pmc13 has pmc13.ta cleared, clearing ibr");
6252+ ret = pfm_ia64_mark_dbregs_used(ctx, set);
6253+ if (ret)
6254+ return ret;
6255+ }
6256+
6257+ /*
6258+ * we must clear the (data) debug registers if pmc11.pt bit is cleared
6259+ * before they are written (fl_using_dbreg==0) to avoid picking up
6260+ * stale information.
6261+ */
6262+ if (cnum == 11 && ((tmpval >> 28) & 0x1) == 0
6263+ && ctx_arch->flags.use_dbr == 0) {
6264+ PFM_DBG("pmc11 has pmc11.pt cleared, clearing dbr");
6265+ ret = pfm_ia64_mark_dbregs_used(ctx, set);
6266+ if (ret)
6267+ return ret;
6268+ }
6269+
6270+ req->reg_value = tmpval;
6271+
6272+ return 0;
6273+}
6274+
6275+static int pfm_ita_probe_pmu(void)
6276+{
6277+ return local_cpu_data->family == 0x7 && !ia64_platform_is("hpsim")
6278+ ? 0 : -1;
6279+}
6280+
6281+/*
6282+ * impl_pmcs, impl_pmds are computed at runtime to minimize errors!
6283+ */
6284+static struct pfm_pmu_config pfm_ita_pmu_conf = {
6285+ .pmu_name = "Itanium",
6286+ .counter_width = 32,
6287+ .pmd_desc = pfm_ita_pmd_desc,
6288+ .pmc_desc = pfm_ita_pmc_desc,
6289+ .pmc_write_check = pfm_ita_pmc_check,
6290+ .num_pmc_entries = PFM_ITA_NUM_PMCS,
6291+ .num_pmd_entries = PFM_ITA_NUM_PMDS,
6292+ .probe_pmu = pfm_ita_probe_pmu,
6293+ .version = "1.0",
6294+ .flags = PFM_PMU_BUILTIN_FLAG,
6295+ .owner = THIS_MODULE,
6296+ .pmu_info = &pfm_ita_pmu_info
6297+};
6298+
6299+static int __init pfm_ita_pmu_init_module(void)
6300+{
6301+ return pfm_pmu_register(&pfm_ita_pmu_conf);
6302+}
6303+
6304+static void __exit pfm_ita_pmu_cleanup_module(void)
6305+{
6306+ pfm_pmu_unregister(&pfm_ita_pmu_conf);
6307+}
6308+
6309+module_init(pfm_ita_pmu_init_module);
6310+module_exit(pfm_ita_pmu_cleanup_module);
6311+
6312--- /dev/null
6313+++ b/arch/ia64/perfmon/perfmon_mckinley.c
6314@@ -0,0 +1,290 @@
6315+/*
6316+ * This file contains the McKinley PMU register description tables
6317+ * and pmc checker used by perfmon.c.
6318+ *
6319+ * Copyright (c) 2002-2006 Hewlett-Packard Development Company, L.P.
6320+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
6321+ *
6322+ * This program is free software; you can redistribute it and/or
6323+ * modify it under the terms of version 2 of the GNU General Public
6324+ * License as published by the Free Software Foundation.
6325+ *
6326+ * This program is distributed in the hope that it will be useful,
6327+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
6328+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
6329+ * General Public License for more details.
6330+ *
6331+ * You should have received a copy of the GNU General Public License
6332+ * along with this program; if not, write to the Free Software
6333+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
6334+ * 02111-1307 USA
6335+ */
6336+#include <linux/module.h>
6337+#include <linux/perfmon_kern.h>
6338+
6339+MODULE_AUTHOR("Stephane Eranian <eranian@hpl.hp.com>");
6340+MODULE_DESCRIPTION("Itanium 2 (McKinley) PMU description tables");
6341+MODULE_LICENSE("GPL");
6342+
6343+#define RDEP(x) (1UL << (x))
6344+
6345+#define PFM_MCK_MASK_PMCS (RDEP(4)|RDEP(5)|RDEP(6)|RDEP(7)|RDEP(10)|RDEP(11)|\
6346+ RDEP(12))
6347+
6348+#define PFM_MCK_NO64 (1UL<<5)
6349+
6350+static struct pfm_arch_pmu_info pfm_mck_pmu_info = {
6351+ .mask_pmcs = {PFM_MCK_MASK_PMCS,},
6352+};
6353+
6354+/* reserved bits are 1 in the mask */
6355+#define PFM_ITA2_RSVD 0xfffffffffc8000a0UL
6356+
6357+/*
6358+ * For debug registers, writing xBR(y) means we use also xBR(y+1). Hence using
6359+ * PMC256+y means we use PMC256+y+1. Yet, we do not have dependency information
6360+ * but this is fine because they are handled separately in the IA-64 specific
6361+ * code.
6362+ */
6363+static struct pfm_regmap_desc pfm_mck_pmc_desc[] = {
6364+/* pmc0 */ PMX_NA,
6365+/* pmc1 */ PMX_NA,
6366+/* pmc2 */ PMX_NA,
6367+/* pmc3 */ PMX_NA,
6368+/* pmc4 */ PMC_D(PFM_REG_W64, "PMC4" , 0x800020UL, 0xfffffffffc8000a0, PFM_MCK_NO64, 4),
6369+/* pmc5 */ PMC_D(PFM_REG_W64, "PMC5" , 0x20UL, PFM_ITA2_RSVD, PFM_MCK_NO64, 5),
6370+/* pmc6 */ PMC_D(PFM_REG_W64, "PMC6" , 0x20UL, PFM_ITA2_RSVD, PFM_MCK_NO64, 6),
6371+/* pmc7 */ PMC_D(PFM_REG_W64, "PMC7" , 0x20UL, PFM_ITA2_RSVD, PFM_MCK_NO64, 7),
6372+/* pmc8 */ PMC_D(PFM_REG_W , "PMC8" , 0xffffffff3fffffffUL, 0xc0000004UL, 0, 8),
6373+/* pmc9 */ PMC_D(PFM_REG_W , "PMC9" , 0xffffffff3ffffffcUL, 0xc0000004UL, 0, 9),
6374+/* pmc10 */ PMC_D(PFM_REG_W , "PMC10", 0x0, 0xffffffffffff0000UL, 0, 10),
6375+/* pmc11 */ PMC_D(PFM_REG_W , "PMC11", 0x0, 0xfffffffffcf0fe30UL, 0, 11),
6376+/* pmc12 */ PMC_D(PFM_REG_W , "PMC12", 0x0, 0xffffffffffff0000UL, 0, 12),
6377+/* pmc13 */ PMC_D(PFM_REG_W , "PMC13", 0x2078fefefefeUL, 0xfffe1fffe7e7e7e7UL, 0, 13),
6378+/* pmc14 */ PMC_D(PFM_REG_W , "PMC14", 0x0db60db60db60db6UL, 0xffffffffffffdb6dUL, 0, 14),
6379+/* pmc15 */ PMC_D(PFM_REG_W , "PMC15", 0xfffffff0UL, 0xfffffffffffffff0UL, 0, 15),
6380+/* pmc16 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6381+/* pmc24 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6382+/* pmc32 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6383+/* pmc40 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6384+/* pmc48 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6385+/* pmc56 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6386+/* pmc64 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6387+/* pmc72 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6388+/* pmc80 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6389+/* pmc88 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6390+/* pmc96 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6391+/* pmc104 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6392+/* pmc112 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6393+/* pmc120 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6394+/* pmc128 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6395+/* pmc136 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6396+/* pmc144 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6397+/* pmc152 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6398+/* pmc160 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6399+/* pmc168 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6400+/* pmc176 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6401+/* pmc184 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6402+/* pmc192 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6403+/* pmc200 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6404+/* pmc208 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6405+/* pmc216 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6406+/* pmc224 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6407+/* pmc232 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6408+/* pmc240 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6409+/* pmc248 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6410+/* pmc256 */ PMC_D(PFM_REG_W , "IBR0", 0x0, 0, 0, 0),
6411+/* pmc257 */ PMC_D(PFM_REG_W , "IBR1", 0x0, 0x8000000000000000UL, 0, 1),
6412+/* pmc258 */ PMC_D(PFM_REG_W , "IBR2", 0x0, 0, 0, 2),
6413+/* pmc259 */ PMC_D(PFM_REG_W , "IBR3", 0x0, 0x8000000000000000UL, 0, 3),
6414+/* pmc260 */ PMC_D(PFM_REG_W , "IBR4", 0x0, 0, 0, 4),
6415+/* pmc261 */ PMC_D(PFM_REG_W , "IBR5", 0x0, 0x8000000000000000UL, 0, 5),
6416+/* pmc262 */ PMC_D(PFM_REG_W , "IBR6", 0x0, 0, 0, 6),
6417+/* pmc263 */ PMC_D(PFM_REG_W , "IBR7", 0x0, 0x8000000000000000UL, 0, 7),
6418+/* pmc264 */ PMC_D(PFM_REG_W , "DBR0", 0x0, 0, 0, 0),
6419+/* pmc265 */ PMC_D(PFM_REG_W , "DBR1", 0x0, 0xc000000000000000UL, 0, 1),
6420+/* pmc266 */ PMC_D(PFM_REG_W , "DBR2", 0x0, 0, 0, 2),
6421+/* pmc267 */ PMC_D(PFM_REG_W , "DBR3", 0x0, 0xc000000000000000UL, 0, 3),
6422+/* pmc268 */ PMC_D(PFM_REG_W , "DBR4", 0x0, 0, 0, 4),
6423+/* pmc269 */ PMC_D(PFM_REG_W , "DBR5", 0x0, 0xc000000000000000UL, 0, 5),
6424+/* pmc270 */ PMC_D(PFM_REG_W , "DBR6", 0x0, 0, 0, 6),
6425+/* pmc271 */ PMC_D(PFM_REG_W , "DBR7", 0x0, 0xc000000000000000UL, 0, 7)
6426+};
6427+#define PFM_MCK_NUM_PMCS ARRAY_SIZE(pfm_mck_pmc_desc)
6428+
6429+static struct pfm_regmap_desc pfm_mck_pmd_desc[] = {
6430+/* pmd0 */ PMD_DP(PFM_REG_I, "PMD0", 0, 1ull << 10),
6431+/* pmd1 */ PMD_DP(PFM_REG_I, "PMD1", 1, 1ull << 10),
6432+/* pmd2 */ PMD_DP(PFM_REG_I, "PMD2", 2, 1ull << 11),
6433+/* pmd3 */ PMD_DP(PFM_REG_I, "PMD3", 3, 1ull << 11),
6434+/* pmd4 */ PMD_DP(PFM_REG_C, "PMD4", 4, 1ull << 4),
6435+/* pmd5 */ PMD_DP(PFM_REG_C, "PMD5", 5, 1ull << 5),
6436+/* pmd6 */ PMD_DP(PFM_REG_C, "PMD6", 6, 1ull << 6),
6437+/* pmd7 */ PMD_DP(PFM_REG_C, "PMD7", 7, 1ull << 7),
6438+/* pmd8 */ PMD_DP(PFM_REG_I, "PMD8", 8, 1ull << 12),
6439+/* pmd9 */ PMD_DP(PFM_REG_I, "PMD9", 9, 1ull << 12),
6440+/* pmd10 */ PMD_DP(PFM_REG_I, "PMD10", 10, 1ull << 12),
6441+/* pmd11 */ PMD_DP(PFM_REG_I, "PMD11", 11, 1ull << 12),
6442+/* pmd12 */ PMD_DP(PFM_REG_I, "PMD12", 12, 1ull << 12),
6443+/* pmd13 */ PMD_DP(PFM_REG_I, "PMD13", 13, 1ull << 12),
6444+/* pmd14 */ PMD_DP(PFM_REG_I, "PMD14", 14, 1ull << 12),
6445+/* pmd15 */ PMD_DP(PFM_REG_I, "PMD15", 15, 1ull << 12),
6446+/* pmd16 */ PMD_DP(PFM_REG_I, "PMD16", 16, 1ull << 12),
6447+/* pmd17 */ PMD_DP(PFM_REG_I, "PMD17", 17, 1ull << 11)
6448+};
6449+#define PFM_MCK_NUM_PMDS ARRAY_SIZE(pfm_mck_pmd_desc)
6450+
6451+static int pfm_mck_pmc_check(struct pfm_context *ctx,
6452+ struct pfm_event_set *set,
6453+ struct pfarg_pmc *req)
6454+{
6455+ struct pfm_arch_context *ctx_arch;
6456+ u64 val8 = 0, val14 = 0, val13 = 0;
6457+ u64 tmpval;
6458+ u16 cnum;
6459+ int ret = 0, check_case1 = 0;
6460+ int is_system;
6461+
6462+ tmpval = req->reg_value;
6463+ cnum = req->reg_num;
6464+ ctx_arch = pfm_ctx_arch(ctx);
6465+ is_system = ctx->flags.system;
6466+
6467+#define PFM_MCK_PMC_PM_POS6 (1UL<<6)
6468+#define PFM_MCK_PMC_PM_POS4 (1UL<<4)
6469+
6470+ switch (cnum) {
6471+ case 4:
6472+ case 5:
6473+ case 6:
6474+ case 7:
6475+ case 11:
6476+ case 12:
6477+ if (is_system)
6478+ tmpval |= PFM_MCK_PMC_PM_POS6;
6479+ else
6480+ tmpval &= ~PFM_MCK_PMC_PM_POS6;
6481+ break;
6482+
6483+ case 8:
6484+ val8 = tmpval;
6485+ val13 = set->pmcs[13];
6486+ val14 = set->pmcs[14];
6487+ check_case1 = 1;
6488+ break;
6489+
6490+ case 10:
6491+ if (is_system)
6492+ tmpval |= PFM_MCK_PMC_PM_POS4;
6493+ else
6494+ tmpval &= ~PFM_MCK_PMC_PM_POS4;
6495+ break;
6496+
6497+ case 13:
6498+ val8 = set->pmcs[8];
6499+ val13 = tmpval;
6500+ val14 = set->pmcs[14];
6501+ check_case1 = 1;
6502+ break;
6503+
6504+ case 14:
6505+ val8 = set->pmcs[8];
6506+ val13 = set->pmcs[13];
6507+ val14 = tmpval;
6508+ check_case1 = 1;
6509+ break;
6510+ }
6511+
6512+ /*
6513+	 * check for an illegal configuration which can produce inconsistencies
6514+	 * in tagging i-side events in the L1D and L2 caches
6515+ */
6516+ if (check_case1) {
6517+ ret = (((val13 >> 45) & 0xf) == 0 && ((val8 & 0x1) == 0))
6518+ && ((((val14>>1) & 0x3) == 0x2 || ((val14>>1) & 0x3) == 0x0)
6519+ || (((val14>>4) & 0x3) == 0x2 || ((val14>>4) & 0x3) == 0x0));
6520+
6521+ if (ret) {
6522+ PFM_DBG("perfmon: invalid config pmc8=0x%lx "
6523+ "pmc13=0x%lx pmc14=0x%lx",
6524+ val8, val13, val14);
6525+ return -EINVAL;
6526+ }
6527+ }
6528+
6529+ /*
6530+	 * check if the configuration implicitly activates the use of
6531+	 * the debug registers. If true, then we ensure that this is
6532+	 * possible and that we do not pick up stale values from the HW
6533+	 * registers.
6534+	 *
6535+	 * We postpone the checks of pmc13 and pmc14 to avoid side effects
6536+	 * in case of errors.
6537+ */
6538+
6539+ /*
6540+ * pmc13 is "active" if:
6541+	 * one of the pmc13.cfg_dbrpXX fields is different from 0x3
6542+	 * AND
6543+	 * the corresponding pmc13.ena_dbrpXX bit is set.
6544+ */
6545+ if (cnum == 13 && (tmpval & 0x1e00000000000UL)
6546+ && (tmpval & 0x18181818UL) != 0x18181818UL
6547+ && ctx_arch->flags.use_dbr == 0) {
6548+ PFM_DBG("pmc13=0x%lx active", tmpval);
6549+ ret = pfm_ia64_mark_dbregs_used(ctx, set);
6550+ if (ret)
6551+ return ret;
6552+ }
6553+
6554+ /*
6555+ * if any pmc14.ibrpX bit is enabled we must clear the ibrs
6556+ */
6557+ if (cnum == 14 && ((tmpval & 0x2222UL) != 0x2222UL)
6558+ && ctx_arch->flags.use_dbr == 0) {
6559+ PFM_DBG("pmc14=0x%lx active", tmpval);
6560+ ret = pfm_ia64_mark_dbregs_used(ctx, set);
6561+ if (ret)
6562+ return ret;
6563+ }
6564+
6565+ req->reg_value = tmpval;
6566+
6567+ return 0;
6568+}
6569+
6570+static int pfm_mck_probe_pmu(void)
6571+{
6572+ return local_cpu_data->family == 0x1f ? 0 : -1;
6573+}
6574+
6575+/*
6576+ * impl_pmcs, impl_pmds are computed at runtime to minimize errors!
6577+ */
6578+static struct pfm_pmu_config pfm_mck_pmu_conf = {
6579+ .pmu_name = "Itanium 2",
6580+ .counter_width = 47,
6581+ .pmd_desc = pfm_mck_pmd_desc,
6582+ .pmc_desc = pfm_mck_pmc_desc,
6583+ .pmc_write_check = pfm_mck_pmc_check,
6584+ .num_pmc_entries = PFM_MCK_NUM_PMCS,
6585+ .num_pmd_entries = PFM_MCK_NUM_PMDS,
6586+ .probe_pmu = pfm_mck_probe_pmu,
6587+ .version = "1.0",
6588+ .flags = PFM_PMU_BUILTIN_FLAG,
6589+ .owner = THIS_MODULE,
6590+ .pmu_info = &pfm_mck_pmu_info,
6591+};
6592+
6593+static int __init pfm_mck_pmu_init_module(void)
6594+{
6595+ return pfm_pmu_register(&pfm_mck_pmu_conf);
6596+}
6597+
6598+static void __exit pfm_mck_pmu_cleanup_module(void)
6599+{
6600+ pfm_pmu_unregister(&pfm_mck_pmu_conf);
6601+}
6602+
6603+module_init(pfm_mck_pmu_init_module);
6604+module_exit(pfm_mck_pmu_cleanup_module);
6605--- /dev/null
6606+++ b/arch/ia64/perfmon/perfmon_montecito.c
6607@@ -0,0 +1,412 @@
6608+/*
6609+ * This file contains the Montecito (Dual-Core Itanium 2) PMU register description tables
6610+ * and pmc checker used by perfmon.c.
6611+ *
6612+ * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P.
6613+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
6614+ *
6615+ * This program is free software; you can redistribute it and/or
6616+ * modify it under the terms of version 2 of the GNU General Public
6617+ * License as published by the Free Software Foundation.
6618+ *
6619+ * This program is distributed in the hope that it will be useful,
6620+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
6621+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
6622+ * General Public License for more details.
6623+ *
6624+ * You should have received a copy of the GNU General Public License
6625+ * along with this program; if not, write to the Free Software
6626+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
6627+ * 02111-1307 USA
6628+ */
6629+#include <linux/module.h>
6630+#include <linux/smp.h>
6631+#include <linux/perfmon_kern.h>
6632+
6633+MODULE_AUTHOR("Stephane Eranian <eranian@hpl.hp.com>");
6634+MODULE_DESCRIPTION("Dual-Core Itanium 2 (Montecito) PMU description table");
6635+MODULE_LICENSE("GPL");
6636+
6637+#define RDEP(x) (1UL << (x))
6638+
6639+#define PFM_MONT_MASK_PMCS (RDEP(4)|RDEP(5)|RDEP(6)|RDEP(7)|\
6640+ RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|\
6641+ RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|\
6642+ RDEP(37)|RDEP(39)|RDEP(40)|RDEP(42))
6643+
6644+#define PFM_MONT_NO64 (1UL<<5)
6645+
6646+static struct pfm_arch_pmu_info pfm_mont_pmu_info = {
6647+ .mask_pmcs = {PFM_MONT_MASK_PMCS,},
6648+};
6649+
6650+#define PFM_MONT_RSVD 0xffffffff838000a0UL
6651+/*
6652+ *
6653+ * For debug registers, writing xBR(y) means we also use xBR(y+1). Hence using
6654+ * PMC256+y means we also use PMC256+y+1. We do not have dependency information
6655+ * here, but this is fine because the debug registers are handled separately in
6656+ * the IA-64 specific code.
6657+ *
6658+ * For PMC4-PMC15, PMC40: we force pmc.ism=2 (IA-64 mode only)
6659+ */
6660+static struct pfm_regmap_desc pfm_mont_pmc_desc[] = {
6661+/* pmc0 */ PMX_NA,
6662+/* pmc1 */ PMX_NA,
6663+/* pmc2 */ PMX_NA,
6664+/* pmc3 */ PMX_NA,
6665+/* pmc4 */ PMC_D(PFM_REG_W64, "PMC4" , 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 4),
6666+/* pmc5 */ PMC_D(PFM_REG_W64, "PMC5" , 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 5),
6667+/* pmc6 */ PMC_D(PFM_REG_W64, "PMC6" , 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 6),
6668+/* pmc7 */ PMC_D(PFM_REG_W64, "PMC7" , 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 7),
6669+/* pmc8 */ PMC_D(PFM_REG_W64, "PMC8" , 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 8),
6670+/* pmc9 */ PMC_D(PFM_REG_W64, "PMC9" , 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 9),
6671+/* pmc10 */ PMC_D(PFM_REG_W64, "PMC10", 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 10),
6672+/* pmc11 */ PMC_D(PFM_REG_W64, "PMC11", 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 11),
6673+/* pmc12 */ PMC_D(PFM_REG_W64, "PMC12", 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 12),
6674+/* pmc13 */ PMC_D(PFM_REG_W64, "PMC13", 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 13),
6675+/* pmc14 */ PMC_D(PFM_REG_W64, "PMC14", 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 14),
6676+/* pmc15 */ PMC_D(PFM_REG_W64, "PMC15", 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 15),
6677+/* pmc16 */ PMX_NA,
6678+/* pmc17 */ PMX_NA,
6679+/* pmc18 */ PMX_NA,
6680+/* pmc19 */ PMX_NA,
6681+/* pmc20 */ PMX_NA,
6682+/* pmc21 */ PMX_NA,
6683+/* pmc22 */ PMX_NA,
6684+/* pmc23 */ PMX_NA,
6685+/* pmc24 */ PMX_NA,
6686+/* pmc25 */ PMX_NA,
6687+/* pmc26 */ PMX_NA,
6688+/* pmc27 */ PMX_NA,
6689+/* pmc28 */ PMX_NA,
6690+/* pmc29 */ PMX_NA,
6691+/* pmc30 */ PMX_NA,
6692+/* pmc31 */ PMX_NA,
6693+/* pmc32 */ PMC_D(PFM_REG_W , "PMC32", 0x30f01ffffffffffUL, 0xfcf0fe0000000000UL, 0, 32),
6694+/* pmc33 */ PMC_D(PFM_REG_W , "PMC33", 0x0, 0xfffffe0000000000UL, 0, 33),
6695+/* pmc34 */ PMC_D(PFM_REG_W , "PMC34", 0xf01ffffffffffUL, 0xfff0fe0000000000UL, 0, 34),
6696+/* pmc35 */ PMC_D(PFM_REG_W , "PMC35", 0x0, 0x1ffffffffffUL, 0, 35),
6697+/* pmc36 */ PMC_D(PFM_REG_W , "PMC36", 0xfffffff0UL, 0xfffffffffffffff0UL, 0, 36),
6698+/* pmc37 */ PMC_D(PFM_REG_W , "PMC37", 0x0, 0xffffffffffffc000UL, 0, 37),
6699+/* pmc38 */ PMC_D(PFM_REG_W , "PMC38", 0xdb6UL, 0xffffffffffffdb6dUL, 0, 38),
6700+/* pmc39 */ PMC_D(PFM_REG_W , "PMC39", 0x0, 0xffffffffffff0030UL, 0, 39),
6701+/* pmc40 */ PMC_D(PFM_REG_W , "PMC40", 0x2000000UL, 0xfffffffffff0fe30UL, 0, 40),
6702+/* pmc41 */ PMC_D(PFM_REG_W , "PMC41", 0x00002078fefefefeUL, 0xfffe1fffe7e7e7e7UL, 0, 41),
6703+/* pmc42 */ PMC_D(PFM_REG_W , "PMC42", 0x0, 0xfff800b0UL, 0, 42),
6704+/* pmc43 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6705+/* pmc48 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6706+/* pmc56 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6707+/* pmc64 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6708+/* pmc72 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6709+/* pmc80 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6710+/* pmc88 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6711+/* pmc96 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6712+/* pmc104 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6713+/* pmc112 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6714+/* pmc120 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6715+/* pmc128 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6716+/* pmc136 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6717+/* pmc144 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6718+/* pmc152 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6719+/* pmc160 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6720+/* pmc168 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6721+/* pmc176 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6722+/* pmc184 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6723+/* pmc192 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6724+/* pmc200 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6725+/* pmc208 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6726+/* pmc216 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6727+/* pmc224 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6728+/* pmc232 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6729+/* pmc240 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6730+/* pmc248 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA,
6731+/* pmc256 */ PMC_D(PFM_REG_W, "IBR0", 0x0, 0, 0, 0),
6732+/* pmc257 */ PMC_D(PFM_REG_W, "IBR1", 0x0, 0x8000000000000000UL, 0, 1),
6733+/* pmc258 */ PMC_D(PFM_REG_W, "IBR2", 0x0, 0, 0, 2),
6734+/* pmc259 */ PMC_D(PFM_REG_W, "IBR3", 0x0, 0x8000000000000000UL, 0, 3),
6735+/* pmc260 */ PMC_D(PFM_REG_W, "IBR4", 0x0, 0, 0, 4),
6736+/* pmc261 */ PMC_D(PFM_REG_W, "IBR5", 0x0, 0x8000000000000000UL, 0, 5),
6737+/* pmc262 */ PMC_D(PFM_REG_W, "IBR6", 0x0, 0, 0, 6),
6738+/* pmc263 */ PMC_D(PFM_REG_W, "IBR7", 0x0, 0x8000000000000000UL, 0, 7),
6739+/* pmc264 */ PMC_D(PFM_REG_W, "DBR0", 0x0, 0, 0, 0),
6740+/* pmc265 */ PMC_D(PFM_REG_W, "DBR1", 0x0, 0xc000000000000000UL, 0, 1),
6741+/* pmc266 */ PMC_D(PFM_REG_W, "DBR2", 0x0, 0, 0, 2),
6742+/* pmc267 */ PMC_D(PFM_REG_W, "DBR3", 0x0, 0xc000000000000000UL, 0, 3),
6743+/* pmc268 */ PMC_D(PFM_REG_W, "DBR4", 0x0, 0, 0, 4),
6744+/* pmc269 */ PMC_D(PFM_REG_W, "DBR5", 0x0, 0xc000000000000000UL, 0, 5),
6745+/* pmc270 */ PMC_D(PFM_REG_W, "DBR6", 0x0, 0, 0, 6),
6746+/* pmc271 */ PMC_D(PFM_REG_W, "DBR7", 0x0, 0xc000000000000000UL, 0, 7)
6747+};
6748+#define PFM_MONT_NUM_PMCS ARRAY_SIZE(pfm_mont_pmc_desc)
6749+
6750+static struct pfm_regmap_desc pfm_mont_pmd_desc[] = {
6751+/* pmd0 */ PMX_NA,
6752+/* pmd1 */ PMX_NA,
6753+/* pmd2 */ PMX_NA,
6754+/* pmd3 */ PMX_NA,
6755+/* pmd4 */ PMD_DP(PFM_REG_C, "PMD4", 4, 1ull << 4),
6756+/* pmd5 */ PMD_DP(PFM_REG_C, "PMD5", 5, 1ull << 5),
6757+/* pmd6 */ PMD_DP(PFM_REG_C, "PMD6", 6, 1ull << 6),
6758+/* pmd7 */ PMD_DP(PFM_REG_C, "PMD7", 7, 1ull << 7),
6759+/* pmd8 */ PMD_DP(PFM_REG_C, "PMD8", 8, 1ull << 8),
6760+/* pmd9 */ PMD_DP(PFM_REG_C, "PMD9", 9, 1ull << 9),
6761+/* pmd10 */ PMD_DP(PFM_REG_C, "PMD10", 10, 1ull << 10),
6762+/* pmd11 */ PMD_DP(PFM_REG_C, "PMD11", 11, 1ull << 11),
6763+/* pmd12 */ PMD_DP(PFM_REG_C, "PMD12", 12, 1ull << 12),
6764+/* pmd13 */ PMD_DP(PFM_REG_C, "PMD13", 13, 1ull << 13),
6765+/* pmd14 */ PMD_DP(PFM_REG_C, "PMD14", 14, 1ull << 14),
6766+/* pmd15 */ PMD_DP(PFM_REG_C, "PMD15", 15, 1ull << 15),
6767+/* pmd16 */ PMX_NA,
6768+/* pmd17 */ PMX_NA,
6769+/* pmd18 */ PMX_NA,
6770+/* pmd19 */ PMX_NA,
6771+/* pmd20 */ PMX_NA,
6772+/* pmd21 */ PMX_NA,
6773+/* pmd22 */ PMX_NA,
6774+/* pmd23 */ PMX_NA,
6775+/* pmd24 */ PMX_NA,
6776+/* pmd25 */ PMX_NA,
6777+/* pmd26 */ PMX_NA,
6778+/* pmd27 */ PMX_NA,
6779+/* pmd28 */ PMX_NA,
6780+/* pmd29 */ PMX_NA,
6781+/* pmd30 */ PMX_NA,
6782+/* pmd31 */ PMX_NA,
6783+/* pmd32 */ PMD_DP(PFM_REG_I, "PMD32", 32, 1ull << 40),
6784+/* pmd33 */ PMD_DP(PFM_REG_I, "PMD33", 33, 1ull << 40),
6785+/* pmd34 */ PMD_DP(PFM_REG_I, "PMD34", 34, 1ull << 37),
6786+/* pmd35 */ PMD_DP(PFM_REG_I, "PMD35", 35, 1ull << 37),
6787+/* pmd36 */ PMD_DP(PFM_REG_I, "PMD36", 36, 1ull << 40),
6788+/* pmd37 */ PMX_NA,
6789+/* pmd38 */ PMD_DP(PFM_REG_I, "PMD38", 38, (1ull<<39)|(1ull<<42)),
6790+/* pmd39 */ PMD_DP(PFM_REG_I, "PMD39", 39, (1ull<<39)|(1ull<<42)),
6791+/* pmd40 */ PMX_NA,
6792+/* pmd41 */ PMX_NA,
6793+/* pmd42 */ PMX_NA,
6794+/* pmd43 */ PMX_NA,
6795+/* pmd44 */ PMX_NA,
6796+/* pmd45 */ PMX_NA,
6797+/* pmd46 */ PMX_NA,
6798+/* pmd47 */ PMX_NA,
6799+/* pmd48 */ PMD_DP(PFM_REG_I, "PMD48", 48, (1ull<<39)|(1ull<<42)),
6800+/* pmd49 */ PMD_DP(PFM_REG_I, "PMD49", 49, (1ull<<39)|(1ull<<42)),
6801+/* pmd50 */ PMD_DP(PFM_REG_I, "PMD50", 50, (1ull<<39)|(1ull<<42)),
6802+/* pmd51 */ PMD_DP(PFM_REG_I, "PMD51", 51, (1ull<<39)|(1ull<<42)),
6803+/* pmd52 */ PMD_DP(PFM_REG_I, "PMD52", 52, (1ull<<39)|(1ull<<42)),
6804+/* pmd53 */ PMD_DP(PFM_REG_I, "PMD53", 53, (1ull<<39)|(1ull<<42)),
6805+/* pmd54 */ PMD_DP(PFM_REG_I, "PMD54", 54, (1ull<<39)|(1ull<<42)),
6806+/* pmd55 */ PMD_DP(PFM_REG_I, "PMD55", 55, (1ull<<39)|(1ull<<42)),
6807+/* pmd56 */ PMD_DP(PFM_REG_I, "PMD56", 56, (1ull<<39)|(1ull<<42)),
6808+/* pmd57 */ PMD_DP(PFM_REG_I, "PMD57", 57, (1ull<<39)|(1ull<<42)),
6809+/* pmd58 */ PMD_DP(PFM_REG_I, "PMD58", 58, (1ull<<39)|(1ull<<42)),
6810+/* pmd59 */ PMD_DP(PFM_REG_I, "PMD59", 59, (1ull<<39)|(1ull<<42)),
6811+/* pmd60 */ PMD_DP(PFM_REG_I, "PMD60", 60, (1ull<<39)|(1ull<<42)),
6812+/* pmd61 */ PMD_DP(PFM_REG_I, "PMD61", 61, (1ull<<39)|(1ull<<42)),
6813+/* pmd62 */ PMD_DP(PFM_REG_I, "PMD62", 62, (1ull<<39)|(1ull<<42)),
6814+/* pmd63 */ PMD_DP(PFM_REG_I, "PMD63", 63, (1ull<<39)|(1ull<<42))
6815+};
6816+#define PFM_MONT_NUM_PMDS ARRAY_SIZE(pfm_mont_pmd_desc)
6817+
6818+static int pfm_mont_has_ht;
6819+
6820+static int pfm_mont_pmc_check(struct pfm_context *ctx,
6821+ struct pfm_event_set *set,
6822+ struct pfarg_pmc *req)
6823+{
6824+ struct pfm_arch_context *ctx_arch;
6825+ u64 val32 = 0, val38 = 0, val41 = 0;
6826+ u64 tmpval;
6827+ u16 cnum;
6828+ int ret = 0, check_case1 = 0;
6829+ int is_system;
6830+
6831+ tmpval = req->reg_value;
6832+ cnum = req->reg_num;
6833+ ctx_arch = pfm_ctx_arch(ctx);
6834+ is_system = ctx->flags.system;
6835+
6836+#define PFM_MONT_PMC_PM_POS6 (1UL<<6)
6837+#define PFM_MONT_PMC_PM_POS4 (1UL<<4)
6838+
6839+ switch (cnum) {
6840+ case 4:
6841+ case 5:
6842+ case 6:
6843+ case 7:
6844+ case 8:
6845+ case 9:
6846+ if (is_system)
6847+ tmpval |= PFM_MONT_PMC_PM_POS6;
6848+ else
6849+ tmpval &= ~PFM_MONT_PMC_PM_POS6;
6850+ break;
6851+ case 10:
6852+ case 11:
6853+ case 12:
6854+ case 13:
6855+ case 14:
6856+ case 15:
6857+ if ((req->reg_flags & PFM_REGFL_NO_EMUL64) == 0) {
6858+ if (pfm_mont_has_ht) {
6859+				PFM_INFO("perfmon: Errata 121: PMC10-PMC15 cannot be used to overflow "
6860+					 "when threads are on");
6861+ return -EINVAL;
6862+ }
6863+ }
6864+ if (is_system)
6865+ tmpval |= PFM_MONT_PMC_PM_POS6;
6866+ else
6867+ tmpval &= ~PFM_MONT_PMC_PM_POS6;
6868+ break;
6869+ case 39:
6870+ case 40:
6871+ case 42:
6872+ if (pfm_mont_has_ht && ((req->reg_value >> 8) & 0x7) == 4) {
6873+ PFM_INFO("perfmon: Errata 120: IP-EAR not available when threads are on");
6874+ return -EINVAL;
6875+ }
6876+ if (is_system)
6877+ tmpval |= PFM_MONT_PMC_PM_POS6;
6878+ else
6879+ tmpval &= ~PFM_MONT_PMC_PM_POS6;
6880+ break;
6881+
6882+ case 32:
6883+ val32 = tmpval;
6884+ val38 = set->pmcs[38];
6885+ val41 = set->pmcs[41];
6886+ check_case1 = 1;
6887+ break;
6888+
6889+ case 37:
6890+ if (is_system)
6891+ tmpval |= PFM_MONT_PMC_PM_POS4;
6892+ else
6893+ tmpval &= ~PFM_MONT_PMC_PM_POS4;
6894+ break;
6895+
6896+ case 38:
6897+ val38 = tmpval;
6898+ val32 = set->pmcs[32];
6899+ val41 = set->pmcs[41];
6900+ check_case1 = 1;
6901+ break;
6902+ case 41:
6903+ val41 = tmpval;
6904+ val32 = set->pmcs[32];
6905+ val38 = set->pmcs[38];
6906+ check_case1 = 1;
6907+ break;
6908+ }
6909+
6910+ if (check_case1) {
6911+ ret = (((val41 >> 45) & 0xf) == 0 && ((val32>>57) & 0x1) == 0)
6912+ && ((((val38>>1) & 0x3) == 0x2 || ((val38>>1) & 0x3) == 0)
6913+ || (((val38>>4) & 0x3) == 0x2 || ((val38>>4) & 0x3) == 0));
6914+ if (ret) {
6915+ PFM_DBG("perfmon: invalid config pmc38=0x%lx "
6916+ "pmc41=0x%lx pmc32=0x%lx",
6917+ val38, val41, val32);
6918+ return -EINVAL;
6919+ }
6920+ }
6921+
6922+ /*
6923+	 * check if the configuration implicitly activates the use of the
6924+	 * debug registers. If true, then we ensure that this is possible
6925+	 * and that we do not pick up stale values from the HW registers.
6926+ */
6927+
6928+ /*
6929+ *
6930+ * pmc41 is "active" if:
6931+	 * one of the pmc41.cfgdtagXX fields is different from 0x3
6932+	 * AND
6933+	 * the corresponding pmc41.en_dbrpXX bit is set
6934+	 * AND
6935+	 * ctx_fl_use_dbr == 0 (dbr not yet used)
6936+ */
6937+ if (cnum == 41
6938+ && (tmpval & 0x1e00000000000)
6939+ && (tmpval & 0x18181818) != 0x18181818
6940+ && ctx_arch->flags.use_dbr == 0) {
6941+ PFM_DBG("pmc41=0x%lx active, clearing dbr", tmpval);
6942+ ret = pfm_ia64_mark_dbregs_used(ctx, set);
6943+ if (ret)
6944+ return ret;
6945+ }
6946+ /*
6947+ * we must clear the (instruction) debug registers if:
6948+ * pmc38.ig_ibrpX is 0 (enabled)
6949+ * and
6950+ * fl_use_dbr == 0 (dbr not yet used)
6951+ */
6952+ if (cnum == 38 && ((tmpval & 0x492) != 0x492)
6953+ && ctx_arch->flags.use_dbr == 0) {
6954+		PFM_DBG("pmc38=0x%lx active, clearing ibr", tmpval);
6955+ ret = pfm_ia64_mark_dbregs_used(ctx, set);
6956+ if (ret)
6957+ return ret;
6958+
6959+ }
6960+ req->reg_value = tmpval;
6961+ return 0;
6962+}
6963+
6964+static void pfm_handle_errata(void)
6965+{
6966+ pfm_mont_has_ht = 1;
6967+
6968+ PFM_INFO("activating workaround for errata 120 "
6969+ "(Disable IP-EAR when threads are on)");
6970+
6971+	PFM_INFO("activating workaround for errata 121 "
6972+		 "(PMC10-PMC15 cannot be used to overflow"
6973+		 " when threads are on)");
6974+}
6975+static int pfm_mont_probe_pmu(void)
6976+{
6977+ if (local_cpu_data->family != 0x20)
6978+ return -1;
6979+
6980+ /*
6981+	 * the two errata workarounds must be activated when
6982+	 * threads are, or can be, enabled
6983+ */
6984+ if (is_multithreading_enabled())
6985+ pfm_handle_errata();
6986+
6987+ return 0;
6988+}
6989+
6990+/*
6991+ * impl_pmcs, impl_pmds are computed at runtime to minimize errors!
6992+ */
6993+static struct pfm_pmu_config pfm_mont_pmu_conf = {
6994+ .pmu_name = "Montecito",
6995+ .counter_width = 47,
6996+ .pmd_desc = pfm_mont_pmd_desc,
6997+ .pmc_desc = pfm_mont_pmc_desc,
6998+ .num_pmc_entries = PFM_MONT_NUM_PMCS,
6999+ .num_pmd_entries = PFM_MONT_NUM_PMDS,
7000+ .pmc_write_check = pfm_mont_pmc_check,
7001+ .probe_pmu = pfm_mont_probe_pmu,
7002+ .version = "1.0",
7003+ .pmu_info = &pfm_mont_pmu_info,
7004+ .flags = PFM_PMU_BUILTIN_FLAG,
7005+ .owner = THIS_MODULE
7006+};
7007+
7008+static int __init pfm_mont_pmu_init_module(void)
7009+{
7010+ return pfm_pmu_register(&pfm_mont_pmu_conf);
7011+}
7012+
7013+static void __exit pfm_mont_pmu_cleanup_module(void)
7014+{
7015+ pfm_pmu_unregister(&pfm_mont_pmu_conf);
7016+}
7017+
7018+module_init(pfm_mont_pmu_init_module);
7019+module_exit(pfm_mont_pmu_cleanup_module);
7020--- a/arch/mips/Kconfig
7021+++ b/arch/mips/Kconfig
7022@@ -1858,6 +1858,8 @@ config SECCOMP
7023
7024 If unsure, say Y. Only embedded should say N here.
7025
7026+source "arch/mips/perfmon/Kconfig"
7027+
7028 endmenu
7029
7030 config RWSEM_GENERIC_SPINLOCK
7031--- a/arch/mips/Makefile
7032+++ b/arch/mips/Makefile
7033@@ -154,6 +154,12 @@ endif
7034 endif
7035
7036 #
7037+# Perfmon support
7038+#
7039+
7040+core-$(CONFIG_PERFMON) += arch/mips/perfmon/
7041+
7042+#
7043 # Firmware support
7044 #
7045 libs-$(CONFIG_ARC) += arch/mips/fw/arc/
7046--- a/arch/mips/kernel/process.c
7047+++ b/arch/mips/kernel/process.c
7048@@ -27,6 +27,7 @@
7049 #include <linux/completion.h>
7050 #include <linux/kallsyms.h>
7051 #include <linux/random.h>
7052+#include <linux/perfmon_kern.h>
7053
7054 #include <asm/asm.h>
7055 #include <asm/bootinfo.h>
7056@@ -94,6 +95,7 @@ void start_thread(struct pt_regs * regs,
7057
7058 void exit_thread(void)
7059 {
7060+ pfm_exit_thread();
7061 }
7062
7063 void flush_thread(void)
7064@@ -162,6 +164,8 @@ int copy_thread(int nr, unsigned long cl
7065 if (clone_flags & CLONE_SETTLS)
7066 ti->tp_value = regs->regs[7];
7067
7068+ pfm_copy_thread(p);
7069+
7070 return 0;
7071 }
7072
7073--- a/arch/mips/kernel/scall32-o32.S
7074+++ b/arch/mips/kernel/scall32-o32.S
7075@@ -653,6 +653,18 @@ einval: li v0, -EINVAL
7076 sys sys_dup3 3
7077 sys sys_pipe2 2
7078 sys sys_inotify_init1 1
7079+ sys sys_pfm_create_context 4 /* 4330 */
7080+ sys sys_pfm_write_pmcs 3
7081+ sys sys_pfm_write_pmds 4
7082+ sys sys_pfm_read_pmds 3
7083+ sys sys_pfm_load_context 2
7084+ sys sys_pfm_start 2 /* 4335 */
7085+ sys sys_pfm_stop 1
7086+ sys sys_pfm_restart 1
7087+ sys sys_pfm_create_evtsets 3
7088+ sys sys_pfm_getinfo_evtsets 3
7089+ sys sys_pfm_delete_evtsets 3 /* 4340 */
7090+ sys sys_pfm_unload_context 1
7091 .endm
7092
7093 /* We pre-compute the number of _instruction_ bytes needed to
7094--- a/arch/mips/kernel/scall64-64.S
7095+++ b/arch/mips/kernel/scall64-64.S
7096@@ -487,4 +487,16 @@ sys_call_table:
7097 PTR sys_dup3
7098 PTR sys_pipe2
7099 PTR sys_inotify_init1
7100+ PTR sys_pfm_create_context
7101+ PTR sys_pfm_write_pmcs /* 5290 */
7102+ PTR sys_pfm_write_pmds
7103+ PTR sys_pfm_read_pmds
7104+ PTR sys_pfm_load_context
7105+ PTR sys_pfm_start
7106+ PTR sys_pfm_stop /* 5295 */
7107+ PTR sys_pfm_restart
7108+ PTR sys_pfm_create_evtsets
7109+ PTR sys_pfm_getinfo_evtsets
7110+ PTR sys_pfm_delete_evtsets
7111+ PTR sys_pfm_unload_context /* 5300 */
7112 .size sys_call_table,.-sys_call_table
7113--- a/arch/mips/kernel/scall64-n32.S
7114+++ b/arch/mips/kernel/scall64-n32.S
7115@@ -400,12 +400,12 @@ EXPORT(sysn32_call_table)
7116 PTR sys_ioprio_set
7117 PTR sys_ioprio_get
7118 PTR compat_sys_utimensat
7119- PTR compat_sys_signalfd /* 5280 */
7120+ PTR compat_sys_signalfd /* 6280 */
7121 PTR sys_ni_syscall
7122 PTR sys_eventfd
7123 PTR sys_fallocate
7124 PTR sys_timerfd_create
7125- PTR sys_timerfd_gettime /* 5285 */
7126+ PTR sys_timerfd_gettime /* 6285 */
7127 PTR sys_timerfd_settime
7128 PTR sys_signalfd4
7129 PTR sys_eventfd2
7130@@ -413,4 +413,16 @@ EXPORT(sysn32_call_table)
7131 PTR sys_dup3 /* 5290 */
7132 PTR sys_pipe2
7133 PTR sys_inotify_init1
7134+ PTR sys_pfm_create_context
7135+ PTR sys_pfm_write_pmcs
7136+ PTR sys_pfm_write_pmds /* 6295 */
7137+ PTR sys_pfm_read_pmds
7138+ PTR sys_pfm_load_context
7139+ PTR sys_pfm_start
7140+ PTR sys_pfm_stop
7141+ PTR sys_pfm_restart /* 6300 */
7142+ PTR sys_pfm_create_evtsets
7143+ PTR sys_pfm_getinfo_evtsets
7144+ PTR sys_pfm_delete_evtsets
7145+ PTR sys_pfm_unload_context
7146 .size sysn32_call_table,.-sysn32_call_table
7147--- a/arch/mips/kernel/scall64-o32.S
7148+++ b/arch/mips/kernel/scall64-o32.S
7149@@ -535,4 +535,16 @@ sys_call_table:
7150 PTR sys_dup3
7151 PTR sys_pipe2
7152 PTR sys_inotify_init1
7153+ PTR sys_pfm_create_context /* 4330 */
7154+ PTR sys_pfm_write_pmcs
7155+ PTR sys_pfm_write_pmds
7156+ PTR sys_pfm_read_pmds
7157+ PTR sys_pfm_load_context
7158+ PTR sys_pfm_start /* 4335 */
7159+ PTR sys_pfm_stop
7160+ PTR sys_pfm_restart
7161+ PTR sys_pfm_create_evtsets
7162+ PTR sys_pfm_getinfo_evtsets
7163+ PTR sys_pfm_delete_evtsets /* 4340 */
7164+ PTR sys_pfm_unload_context
7165 .size sys_call_table,.-sys_call_table
7166--- a/arch/mips/kernel/signal.c
7167+++ b/arch/mips/kernel/signal.c
7168@@ -21,6 +21,7 @@
7169 #include <linux/compiler.h>
7170 #include <linux/syscalls.h>
7171 #include <linux/uaccess.h>
7172+#include <linux/perfmon_kern.h>
7173
7174 #include <asm/abi.h>
7175 #include <asm/asm.h>
7176@@ -695,8 +696,11 @@ static void do_signal(struct pt_regs *re
7177 * - triggered by the TIF_WORK_MASK flags
7178 */
7179 asmlinkage void do_notify_resume(struct pt_regs *regs, void *unused,
7180- __u32 thread_info_flags)
7181+ __u32 thread_info_flags)
7182 {
7183+ if (thread_info_flags & _TIF_PERFMON_WORK)
7184+ pfm_handle_work(regs);
7185+
7186 /* deal with pending signal delivery */
7187 if (thread_info_flags & (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK))
7188 do_signal(regs);
7189--- a/arch/mips/kernel/time.c
7190+++ b/arch/mips/kernel/time.c
7191@@ -49,10 +49,11 @@ int update_persistent_clock(struct times
7192 return rtc_mips_set_mmss(now.tv_sec);
7193 }
7194
7195-static int null_perf_irq(void)
7196+int null_perf_irq(void)
7197 {
7198 return 0;
7199 }
7200+EXPORT_SYMBOL(null_perf_irq);
7201
7202 int (*perf_irq)(void) = null_perf_irq;
7203
7204--- a/arch/mips/kernel/traps.c
7205+++ b/arch/mips/kernel/traps.c
7206@@ -92,17 +92,15 @@ static void show_raw_backtrace(unsigned
7207 #ifdef CONFIG_KALLSYMS
7208 printk("\n");
7209 #endif
7210- while (!kstack_end(sp)) {
7211- unsigned long __user *p =
7212- (unsigned long __user *)(unsigned long)sp++;
7213- if (__get_user(addr, p)) {
7214- printk(" (Bad stack address)");
7215- break;
7216+#define IS_KVA01(a) ((((unsigned long)a) & 0xc0000000) == 0x80000000)
7217+ if (IS_KVA01(sp)) {
7218+ while (!kstack_end(sp)) {
7219+ addr = *sp++;
7220+ if (__kernel_text_address(addr))
7221+ print_ip_sym(addr);
7222 }
7223- if (__kernel_text_address(addr))
7224- print_ip_sym(addr);
7225+ printk("\n");
7226 }
7227- printk("\n");
7228 }
7229
7230 #ifdef CONFIG_KALLSYMS
7231--- a/arch/mips/mti-malta/malta-time.c
7232+++ b/arch/mips/mti-malta/malta-time.c
7233@@ -27,6 +27,7 @@
7234 #include <linux/time.h>
7235 #include <linux/timex.h>
7236 #include <linux/mc146818rtc.h>
7237+#include <linux/perfmon_kern.h>
7238
7239 #include <asm/mipsregs.h>
7240 #include <asm/mipsmtregs.h>
7241--- /dev/null
7242+++ b/arch/mips/perfmon/Kconfig
7243@@ -0,0 +1,61 @@
7244+menu "Hardware Performance Monitoring support"
7245+config PERFMON
7246+ bool "Perfmon2 performance monitoring interface"
7247+ default n
7248+ help
7249+ Enables the perfmon2 interface to access the hardware
7250+ performance counters. See <http://perfmon2.sf.net/> for
7251+ more details.
7252+
7253+config PERFMON_DEBUG
7254+ bool "Perfmon debugging"
7255+ default n
7256+ depends on PERFMON
7257+ help
7258+ Enables perfmon debugging support
7259+
7260+config PERFMON_DEBUG_FS
7261+ bool "Enable perfmon statistics reporting via debugfs"
7262+ default y
7263+ depends on PERFMON && DEBUG_FS
7264+ help
7265+ Enable collection and reporting of perfmon timing statistics under
7266+ debugfs. This is used for debugging and performance analysis of the
7267+ subsystem. The debugfs filesystem must be mounted.
7268+
7269+config PERFMON_FLUSH
7270+ bool "Flush sampling buffer when modified"
7271+ depends on PERFMON
7272+ default n
7273+ help
7274+ On some MIPS models, cache aliasing may cause invalid
7275+ data to be read from the perfmon sampling buffer. Use this option
7276+ to flush the buffer when it is modified to ensure valid data is
7277+ visible at the user level.
7278+
7279+config PERFMON_ALIGN
7280+ bool "Align sampling buffer to avoid cache aliasing"
7281+ depends on PERFMON
7282+ default n
7283+ help
7284+ On some MIPS models, cache aliasing may cause invalid
7285+ data to be read from the perfmon sampling buffer. By forcing a bigger
7286+ page alignment (4-page), one can guarantee the buffer virtual address
7287+ will conflict in the cache with the user level mapping of the buffer
7288+ thereby ensuring a consistent view by user programs.
7289+
7297+
7298+config PERFMON_MIPS64
7299+ tristate "Support for MIPS64 hardware performance counters"
7300+ depends on PERFMON
7301+ default n
7302+ help
7303+	  Enables support for the MIPS64 hardware performance counters.
7304+endmenu
7305--- /dev/null
7306+++ b/arch/mips/perfmon/Makefile
7307@@ -0,0 +1,2 @@
7308+obj-$(CONFIG_PERFMON) += perfmon.o
7309+obj-$(CONFIG_PERFMON_MIPS64) += perfmon_mips64.o
7310--- /dev/null
7311+++ b/arch/mips/perfmon/perfmon.c
7312@@ -0,0 +1,313 @@
7313+/*
7314+ * This file implements the MIPS64 specific
7315+ * support for the perfmon2 interface
7316+ *
7317+ * Copyright (c) 2005 Philip J. Mucci
7318+ *
7319+ * based on versions for other architectures:
7320+ * Copyright (c) 2005 Hewlett-Packard Development Company, L.P.
7321+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
7322+ *
7323+ * This program is free software; you can redistribute it and/or
7324+ * modify it under the terms of version 2 of the GNU General Public
7325+ * License as published by the Free Software Foundation.
7326+ *
7327+ * This program is distributed in the hope that it will be useful,
7328+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
7329+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
7330+ * General Public License for more details.
7331+ *
7332+ * You should have received a copy of the GNU General Public License
7333+ * along with this program; if not, write to the Free Software
7334+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
7335+ * 02111-1307 USA
7336+ */
7337+#include <linux/interrupt.h>
7338+#include <linux/module.h>
7339+#include <linux/perfmon_kern.h>
7340+
7341+/*
7342+ * collect pending overflowed PMDs. Called from pfm_ctxsw()
7343+ * and from PMU interrupt handler. Must fill in set->povfl_pmds[]
7344+ * and set->npend_ovfls. Interrupts are masked
7345+ */
7346+static void __pfm_get_ovfl_pmds(struct pfm_context *ctx, struct pfm_event_set *set)
7347+{
7348+ u64 new_val, wmask;
7349+ u64 *used_mask, *intr_pmds;
7350+ u64 mask[PFM_PMD_BV];
7351+ unsigned int i, max;
7352+
7353+ max = ctx->regs.max_intr_pmd;
7354+ intr_pmds = ctx->regs.intr_pmds;
7355+ used_mask = set->used_pmds;
7356+
7357+ wmask = 1ULL << pfm_pmu_conf->counter_width;
7358+
7359+ bitmap_and(cast_ulp(mask),
7360+ cast_ulp(intr_pmds),
7361+ cast_ulp(used_mask),
7362+ max);
7363+
7364+ /*
7365+ * check all PMD that can generate interrupts
7366+ * (that includes counters)
7367+ */
7368+ for (i = 0; i < max; i++) {
7369+		if (test_bit(i, cast_ulp(mask))) {
7370+ new_val = pfm_arch_read_pmd(ctx, i);
7371+
7372+ PFM_DBG_ovfl("pmd%u new_val=0x%llx bit=%d\n",
7373+ i, (unsigned long long)new_val,
7374+ (new_val&wmask) ? 1 : 0);
7375+
7376+ if (new_val & wmask) {
7377+ __set_bit(i, set->povfl_pmds);
7378+ set->npend_ovfls++;
7379+ }
7380+ }
7381+ }
7382+}
7383+
7384+static void pfm_stop_active(struct task_struct *task, struct pfm_context *ctx,
7385+ struct pfm_event_set *set)
7386+{
7387+ unsigned int i, max;
7388+
7389+ max = ctx->regs.max_pmc;
7390+
7391+ /*
7392+	 * clear the enable bits; we assume all PMCs act as enable registers
7393+ */
7394+ for (i = 0; i < max; i++) {
7395+ if (test_bit(i, set->used_pmcs))
7396+ pfm_arch_write_pmc(ctx, i, 0);
7397+ }
7398+
7399+ if (set->npend_ovfls)
7400+ return;
7401+
7402+ __pfm_get_ovfl_pmds(ctx, set);
7403+}
7404+
7405+/*
7406+ * Called from pfm_ctxsw(). Task is guaranteed to be current.
7407+ * Context is locked. Interrupts are masked. Monitoring is active.
7408+ * PMU access is guaranteed. PMC and PMD registers are live in PMU.
7409+ *
7410+ * for per-thread:
7411+ * must stop monitoring for the task
7412+ *
7413+ * Return:
7414+ * non-zero : did not save PMDs (as part of stopping the PMU)
7415+ * 0 : saved PMDs (no need to save them in caller)
7416+ */
7417+int pfm_arch_ctxswout_thread(struct task_struct *task, struct pfm_context *ctx)
7418+{
7419+ /*
7420+ * disable lazy restore of PMC registers.
7421+ */
7422+ ctx->active_set->priv_flags |= PFM_SETFL_PRIV_MOD_PMCS;
7423+
7424+ /*
7425+ * if masked, monitoring is stopped, thus there is no
7426+ * need to stop the PMU again and there is no need to
7427+ * check for pending overflows. This is not just an
7428+ * optimization, this is also for correctness as you
7429+ * may end up detecting overflows twice.
7430+ */
7431+ if (ctx->state == PFM_CTX_MASKED)
7432+ return 1;
7433+
7434+ pfm_stop_active(task, ctx, ctx->active_set);
7435+
7436+ return 1;
7437+}
7438+
7439+/*
7440+ * Called from pfm_stop() and pfm_ctxsw()
7441+ * Interrupts are masked. Context is locked. Set is the active set.
7442+ *
7443+ * For per-thread:
7444+ * task is not necessarily current. If not current task, then
7445+ * task is guaranteed stopped and off any cpu. Access to PMU
7446+ * is not guaranteed. Interrupts are masked. Context is locked.
7447+ * Set is the active set.
7448+ *
7449+ * For system-wide:
7450+ * task is current
7451+ *
7452+ * must disable active monitoring. ctx cannot be NULL
7453+ */
7454+void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx)
7455+{
7456+ /*
7457+ * no need to go through stop_save()
7458+ * if we are already stopped
7459+ */
7460+ if (!ctx->flags.started || ctx->state == PFM_CTX_MASKED)
7461+ return;
7462+
7463+ /*
7464+ * stop live registers and collect pending overflow
7465+ */
7466+ if (task == current)
7467+ pfm_stop_active(task, ctx, ctx->active_set);
7468+}
7469+
7470+/*
7471+ * called from pfm_start() or pfm_ctxsw() when idle task and
7472+ * EXCL_IDLE is on.
7473+ *
7474+ * Interrupts are masked. Context is locked. Set is the active set.
7475+ *
7476+ * For per-thread:
7477+ * Task is not necessarily current. If not current task, then task
7478+ * is guaranteed stopped and off any cpu. Access to PMU is not guaranteed.
7479+ *
7480+ * For system-wide:
7481+ * task is always current
7482+ *
7483+ * must enable active monitoring.
7484+ */
7485+void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx)
7486+{
7487+ struct pfm_event_set *set;
7488+ unsigned int i, max_pmc;
7489+
7490+ if (task != current)
7491+ return;
7492+
7493+ set = ctx->active_set;
7494+ max_pmc = ctx->regs.max_pmc;
7495+
7496+ for (i = 0; i < max_pmc; i++) {
7497+ if (test_bit(i, set->used_pmcs))
7498+ pfm_arch_write_pmc(ctx, i, set->pmcs[i]);
7499+ }
7500+}
7501+
7502+/*
7503+ * function called from pfm_switch_sets(), pfm_context_load_thread(),
7504+ * pfm_context_load_sys(), pfm_ctxsw().
7505+ * context is locked. Interrupts are masked. set cannot be NULL.
7506+ * Access to the PMU is guaranteed.
7507+ *
7508+ * function must restore all PMD registers from set.
7509+ */
7510+void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set)
7511+{
7512+ u64 ovfl_mask, val;
7513+ u64 *impl_pmds;
7514+ unsigned int i;
7515+ unsigned int max_pmd;
7516+
7517+ max_pmd = ctx->regs.max_pmd;
7518+ ovfl_mask = pfm_pmu_conf->ovfl_mask;
7519+ impl_pmds = ctx->regs.pmds;
7520+
7521+ /*
7522+ * must restore all pmds to avoid leaking
7523+ * information to user.
7524+ */
7525+ for (i = 0; i < max_pmd; i++) {
7526+
7527+ if (test_bit(i, impl_pmds) == 0)
7528+ continue;
7529+
7530+ val = set->pmds[i].value;
7531+
7532+ /*
7533+ * set upper bits for counter to ensure
7534+ * overflow will trigger
7535+ */
7536+ val &= ovfl_mask;
7537+
7538+ pfm_arch_write_pmd(ctx, i, val);
7539+ }
7540+}
7541+
7542+/*
7543+ * function called from pfm_switch_sets(), pfm_context_load_thread(),
7544+ * pfm_context_load_sys(), pfm_ctxsw().
7545+ * Context is locked. Interrupts are masked. set cannot be NULL.
7546+ * Access to the PMU is guaranteed.
7547+ *
7548+ * function must restore all PMC registers from set, if needed.
7549+ */
7550+void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set)
7551+{
7552+ u64 *impl_pmcs;
7553+ unsigned int i, max_pmc;
7554+
7555+ max_pmc = ctx->regs.max_pmc;
7556+ impl_pmcs = ctx->regs.pmcs;
7557+
7558+ /*
7559+	 * - by default no PMC measures anything
7560+ * - on ctxswout, all used PMCs are disabled (cccr enable bit cleared)
7561+ * hence when masked we do not need to restore anything
7562+ */
7563+ if (ctx->state == PFM_CTX_MASKED || ctx->flags.started == 0)
7564+ return;
7565+
7566+ /*
7567+ * restore all pmcs
7568+ */
7569+ for (i = 0; i < max_pmc; i++)
7570+ if (test_bit(i, impl_pmcs))
7571+ pfm_arch_write_pmc(ctx, i, set->pmcs[i]);
7572+}
7573+
7574+char *pfm_arch_get_pmu_module_name(void)
7575+{
7576+ switch (cpu_data->cputype) {
7577+#ifndef CONFIG_SMP
7578+ case CPU_34K:
7579+#if defined(CPU_74K)
7580+ case CPU_74K:
7581+#endif
7582+#endif
7583+ case CPU_SB1:
7584+ case CPU_SB1A:
7585+ case CPU_R12000:
7586+ case CPU_25KF:
7587+ case CPU_24K:
7588+ case CPU_20KC:
7589+ case CPU_5KC:
7590+ return "perfmon_mips64";
7591+ default:
7592+ return NULL;
7593+ }
7594+ return NULL;
7595+}
7596+
7597+int perfmon_perf_irq(void)
7598+{
7599+ /* BLATANTLY STOLEN FROM OPROFILE, then modified */
7600+ struct pt_regs *regs;
7601+ unsigned int counters = pfm_pmu_conf->regs_all.max_pmc;
7602+ unsigned int control;
7603+ unsigned int counter;
7604+
7605+ regs = get_irq_regs();
7606+ switch (counters) {
7607+#define HANDLE_COUNTER(n) \
7608+ case n + 1: \
7609+ control = read_c0_perfctrl ## n(); \
7610+ counter = read_c0_perfcntr ## n(); \
7611+ if ((control & MIPS64_PMC_INT_ENABLE_MASK) && \
7612+ (counter & MIPS64_PMD_INTERRUPT)) { \
7613+ pfm_interrupt_handler(instruction_pointer(regs),\
7614+ regs); \
7615+ return(1); \
7616+ }
7617+ HANDLE_COUNTER(3)
7618+ HANDLE_COUNTER(2)
7619+ HANDLE_COUNTER(1)
7620+ HANDLE_COUNTER(0)
7621+ }
7622+
7623+ return 0;
7624+}
7625+EXPORT_SYMBOL(perfmon_perf_irq);
7626--- /dev/null
7627+++ b/arch/mips/perfmon/perfmon_mips64.c
7628@@ -0,0 +1,218 @@
7629+/*
7630+ * This file contains the MIPS64 and descendant PMU register description tables
7631+ * and pmc checker used by perfmon.c.
7632+ *
7633+ * Copyright (c) 2005 Philip Mucci
7634+ *
7635+ * Based on perfmon_p6.c:
7636+ * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P.
7637+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
7638+ *
7639+ * This program is free software; you can redistribute it and/or
7640+ * modify it under the terms of version 2 of the GNU General Public
7641+ * License as published by the Free Software Foundation.
7642+ *
7643+ * This program is distributed in the hope that it will be useful,
7644+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
7645+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
7646+ * General Public License for more details.
7647+ *
7648+ * You should have received a copy of the GNU General Public License
7649+ * along with this program; if not, write to the Free Software
7650+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
7651+ * 02111-1307 USA
7652+ */
7653+#include <linux/module.h>
7654+#include <linux/perfmon_kern.h>
7655+
7656+MODULE_AUTHOR("Philip Mucci <mucci@cs.utk.edu>");
7657+MODULE_DESCRIPTION("MIPS64 PMU description tables");
7658+MODULE_LICENSE("GPL");
7659+
7660+/*
7661+ * reserved:
7662+ * - bit 63-9
7663+ * RSVD: reserved bits must be 1
7664+ */
7665+#define PFM_MIPS64_PMC_RSVD 0xfffffffffffff810ULL
7666+#define PFM_MIPS64_PMC_VAL (1ULL<<4)
7667+
7668+extern int null_perf_irq(void);
7669+extern int (*perf_irq)(void);
7670+extern int perfmon_perf_irq(void);
7671+
7672+static struct pfm_arch_pmu_info pfm_mips64_pmu_info;
7673+
7674+static struct pfm_regmap_desc pfm_mips64_pmc_desc[] = {
7675+/* pmc0 */ PMC_D(PFM_REG_I64, "CP0_25_0", PFM_MIPS64_PMC_VAL, PFM_MIPS64_PMC_RSVD, 0, 0),
7676+/* pmc1 */ PMC_D(PFM_REG_I64, "CP0_25_1", PFM_MIPS64_PMC_VAL, PFM_MIPS64_PMC_RSVD, 0, 1),
7677+/* pmc2 */ PMC_D(PFM_REG_I64, "CP0_25_2", PFM_MIPS64_PMC_VAL, PFM_MIPS64_PMC_RSVD, 0, 2),
7678+/* pmc3 */ PMC_D(PFM_REG_I64, "CP0_25_3", PFM_MIPS64_PMC_VAL, PFM_MIPS64_PMC_RSVD, 0, 3)
7679+};
7680+#define PFM_MIPS64_NUM_PMCS ARRAY_SIZE(pfm_mips64_pmc_desc)
7681+
7682+static struct pfm_regmap_desc pfm_mips64_pmd_desc[] = {
7683+/* pmd0 */ PMD_D(PFM_REG_C, "CP0_25_0", 0),
7684+/* pmd1 */ PMD_D(PFM_REG_C, "CP0_25_1", 1),
7685+/* pmd2 */ PMD_D(PFM_REG_C, "CP0_25_2", 2),
7686+/* pmd3 */ PMD_D(PFM_REG_C, "CP0_25_3", 3)
7687+};
7688+#define PFM_MIPS64_NUM_PMDS ARRAY_SIZE(pfm_mips64_pmd_desc)
7689+
7690+static int pfm_mips64_probe_pmu(void)
7691+{
7692+ struct cpuinfo_mips *c = &current_cpu_data;
7693+
7694+ switch (c->cputype) {
7695+#ifndef CONFIG_SMP
7696+ case CPU_34K:
7697+#if defined(CPU_74K)
7698+ case CPU_74K:
7699+#endif
7700+#endif
7701+ case CPU_SB1:
7702+ case CPU_SB1A:
7703+ case CPU_R12000:
7704+ case CPU_25KF:
7705+ case CPU_24K:
7706+ case CPU_20KC:
7707+ case CPU_5KC:
7708+ return 0;
7709+ break;
7710+ default:
7711+ PFM_INFO("Unknown cputype 0x%x", c->cputype);
7712+ }
7713+ return -1;
7714+}
7715+
7716+/*
7717+ * impl_pmcs, impl_pmds are computed at runtime to minimize errors!
7718+ */
7719+static struct pfm_pmu_config pfm_mips64_pmu_conf = {
7720+ .pmu_name = "MIPS", /* placeholder */
7721+ .counter_width = 31,
7722+ .pmd_desc = pfm_mips64_pmd_desc,
7723+ .pmc_desc = pfm_mips64_pmc_desc,
7724+ .num_pmc_entries = PFM_MIPS64_NUM_PMCS,
7725+ .num_pmd_entries = PFM_MIPS64_NUM_PMDS,
7726+ .probe_pmu = pfm_mips64_probe_pmu,
7727+ .flags = PFM_PMU_BUILTIN_FLAG,
7728+ .owner = THIS_MODULE,
7729+ .pmu_info = &pfm_mips64_pmu_info
7730+};
7731+
7732+static inline int n_counters(void)
7733+{
7734+ if (!(read_c0_config1() & MIPS64_CONFIG_PMC_MASK))
7735+ return 0;
7736+ if (!(read_c0_perfctrl0() & MIPS64_PMC_CTR_MASK))
7737+ return 1;
7738+ if (!(read_c0_perfctrl1() & MIPS64_PMC_CTR_MASK))
7739+ return 2;
7740+ if (!(read_c0_perfctrl2() & MIPS64_PMC_CTR_MASK))
7741+ return 3;
7742+ return 4;
7743+}
7744+
7745+static int __init pfm_mips64_pmu_init_module(void)
7746+{
7747+ struct cpuinfo_mips *c = &current_cpu_data;
7748+ int i, ret, num;
7749+ u64 temp_mask;
7750+
7751+ switch (c->cputype) {
7752+ case CPU_5KC:
7753+ pfm_mips64_pmu_conf.pmu_name = "MIPS5KC";
7754+ break;
7755+ case CPU_R12000:
7756+ pfm_mips64_pmu_conf.pmu_name = "MIPSR12000";
7757+ break;
7758+ case CPU_20KC:
7759+ pfm_mips64_pmu_conf.pmu_name = "MIPS20KC";
7760+ break;
7761+ case CPU_24K:
7762+ pfm_mips64_pmu_conf.pmu_name = "MIPS24K";
7763+ break;
7764+ case CPU_25KF:
7765+ pfm_mips64_pmu_conf.pmu_name = "MIPS25KF";
7766+ break;
7767+ case CPU_SB1:
7768+ pfm_mips64_pmu_conf.pmu_name = "SB1";
7769+ break;
7770+ case CPU_SB1A:
7771+ pfm_mips64_pmu_conf.pmu_name = "SB1A";
7772+ break;
7773+#ifndef CONFIG_SMP
7774+ case CPU_34K:
7775+ pfm_mips64_pmu_conf.pmu_name = "MIPS34K";
7776+ break;
7777+#if defined(CPU_74K)
7778+ case CPU_74K:
7779+ pfm_mips64_pmu_conf.pmu_name = "MIPS74K";
7780+ break;
7781+#endif
7782+#endif
7783+ default:
7784+ PFM_INFO("Unknown cputype 0x%x", c->cputype);
7785+ return -1;
7786+ }
7787+
7788+ /* The R14k and older performance counters have to */
7789+ /* be hard-coded, as there is no support for auto-detection */
7790+ if ((c->cputype == CPU_R12000) || (c->cputype == CPU_R14000))
7791+ num = 4;
7792+ else if (c->cputype == CPU_R10000)
7793+ num = 2;
7794+ else
7795+ num = n_counters();
7796+
7797+ if (num == 0) {
7798+ PFM_INFO("cputype 0x%x has no counters", c->cputype);
7799+ return -1;
7800+ }
7801+ /* mark remaining counters unavailable */
7802+ for (i = num; i < PFM_MIPS64_NUM_PMCS; i++)
7803+ pfm_mips64_pmc_desc[i].type = PFM_REG_NA;
7804+
7805+ for (i = num; i < PFM_MIPS64_NUM_PMDS; i++)
7806+ pfm_mips64_pmd_desc[i].type = PFM_REG_NA;
7807+
7808+ /* set the PMC_RSVD mask */
7809+ switch (c->cputype) {
7810+ case CPU_5KC:
7811+ case CPU_R10000:
7812+ case CPU_20KC:
7813+ /* 4-bits for event */
7814+ temp_mask = 0xfffffffffffffe10ULL;
7815+ break;
7816+ case CPU_R12000:
7817+ case CPU_R14000:
7818+ /* 5-bits for event */
7819+ temp_mask = 0xfffffffffffffc10ULL;
7820+ break;
7821+ default:
7822+ /* 6-bits for event */
7823+ temp_mask = 0xfffffffffffff810ULL;
7824+ }
7825+ for (i = 0; i < PFM_MIPS64_NUM_PMCS; i++)
7826+ pfm_mips64_pmc_desc[i].rsvd_msk = temp_mask;
7827+
7828+ pfm_mips64_pmu_conf.num_pmc_entries = num;
7829+ pfm_mips64_pmu_conf.num_pmd_entries = num;
7830+
7831+ pfm_mips64_pmu_info.pmu_style = c->cputype;
7832+
7833+ ret = pfm_pmu_register(&pfm_mips64_pmu_conf);
7834+ if (ret == 0)
7835+ perf_irq = perfmon_perf_irq;
7836+ return ret;
7837+}
7838+
7839+static void __exit pfm_mips64_pmu_cleanup_module(void)
7840+{
7841+ pfm_pmu_unregister(&pfm_mips64_pmu_conf);
7842+ perf_irq = null_perf_irq;
7843+}
7844+
7845+module_init(pfm_mips64_pmu_init_module);
7846+module_exit(pfm_mips64_pmu_cleanup_module);
7847--- a/arch/powerpc/Kconfig
7848+++ b/arch/powerpc/Kconfig
7849@@ -231,6 +231,8 @@ source "init/Kconfig"
7850 source "arch/powerpc/sysdev/Kconfig"
7851 source "arch/powerpc/platforms/Kconfig"
7852
7853+source "arch/powerpc/perfmon/Kconfig"
7854+
7855 menu "Kernel options"
7856
7857 config HIGHMEM
7858--- a/arch/powerpc/Makefile
7859+++ b/arch/powerpc/Makefile
7860@@ -148,6 +148,7 @@ core-y += arch/powerpc/kernel/ \
7861 arch/powerpc/platforms/
7862 core-$(CONFIG_MATH_EMULATION) += arch/powerpc/math-emu/
7863 core-$(CONFIG_XMON) += arch/powerpc/xmon/
7864+core-$(CONFIG_PERFMON) += arch/powerpc/perfmon/
7865 core-$(CONFIG_KVM) += arch/powerpc/kvm/
7866
7867 drivers-$(CONFIG_OPROFILE) += arch/powerpc/oprofile/
7868--- a/arch/powerpc/include/asm/Kbuild
7869+++ b/arch/powerpc/include/asm/Kbuild
7870@@ -21,6 +21,7 @@ header-y += resource.h
7871 header-y += sigcontext.h
7872 header-y += statfs.h
7873 header-y += ps3fb.h
7874+header-y += perfmon.h
7875
7876 unifdef-y += bootx.h
7877 unifdef-y += byteorder.h
7878--- a/arch/powerpc/include/asm/cell-pmu.h
7879+++ b/arch/powerpc/include/asm/cell-pmu.h
7880@@ -61,6 +61,11 @@
7881
7882 /* Macros for the pm_status register. */
7883 #define CBE_PM_CTR_OVERFLOW_INTR(ctr) (1 << (31 - ((ctr) & 7)))
7884+#define CBE_PM_OVERFLOW_CTRS(pm_status) (((pm_status) >> 24) & 0xff)
7885+#define CBE_PM_ALL_OVERFLOW_INTR 0xff000000
7886+#define CBE_PM_INTERVAL_INTR 0x00800000
7887+#define CBE_PM_TRACE_BUFFER_FULL_INTR 0x00400000
7888+#define CBE_PM_TRACE_BUFFER_UNDERFLOW_INTR 0x00200000
7889
7890 enum pm_reg_name {
7891 group_control,
7892--- a/arch/powerpc/include/asm/cell-regs.h
7893+++ b/arch/powerpc/include/asm/cell-regs.h
7894@@ -117,8 +117,9 @@ struct cbe_pmd_regs {
7895 u8 pad_0x0c1c_0x0c20 [4]; /* 0x0c1c */
7896 #define CBE_PMD_FIR_MODE_M8 0x00800
7897 u64 fir_enable_mask; /* 0x0c20 */
7898-
7899- u8 pad_0x0c28_0x0ca8 [0x0ca8 - 0x0c28]; /* 0x0c28 */
7900+ u8 pad_0x0c28_0x0c98 [0x0c98 - 0x0c28]; /* 0x0c28 */
7901+ u64 on_ramp_trace; /* 0x0c98 */
7902+ u64 pad_0x0ca0; /* 0x0ca0 */
7903 u64 ras_esc_0; /* 0x0ca8 */
7904 u8 pad_0x0cb0_0x1000 [0x1000 - 0x0cb0]; /* 0x0cb0 */
7905 };
7906@@ -218,7 +219,11 @@ extern struct cbe_iic_regs __iomem *cbe_
7907
7908
7909 struct cbe_mic_tm_regs {
7910- u8 pad_0x0000_0x0040[0x0040 - 0x0000]; /* 0x0000 */
7911+ u8 pad_0x0000_0x0010[0x0010 - 0x0000]; /* 0x0000 */
7912+
7913+ u64 MBL_debug; /* 0x0010 */
7914+
7915+ u8 pad_0x0018_0x0040[0x0040 - 0x0018]; /* 0x0018 */
7916
7917 u64 mic_ctl_cnfg2; /* 0x0040 */
7918 #define CBE_MIC_ENABLE_AUX_TRC 0x8000000000000000LL
7919@@ -303,6 +308,25 @@ struct cbe_mic_tm_regs {
7920 extern struct cbe_mic_tm_regs __iomem *cbe_get_mic_tm_regs(struct device_node *np);
7921 extern struct cbe_mic_tm_regs __iomem *cbe_get_cpu_mic_tm_regs(int cpu);
7922
7923+/*
7924+ *
7925+ * PPE Privileged MMIO Registers definition. (offset 0x500000 - 0x500fff)
7926+ *
7927+ */
7928+struct cbe_ppe_priv_regs {
7929+ u8 pad_0x0000_0x0858[0x0858 - 0x0000]; /* 0x0000 */
7930+
7931+ u64 L2_debug1; /* 0x0858 */
7932+
7933+ u8 pad_0x0860_0x0958[0x0958 - 0x0860]; /* 0x0860 */
7934+
7935+ u64 ciu_dr1; /* 0x0958 */
7936+
7937+ u8 pad_0x0960_0x1000[0x1000 - 0x0960]; /* 0x0960 */
7938+};
7939+
7940+extern struct cbe_ppe_priv_regs __iomem *cbe_get_cpu_ppe_priv_regs(int cpu);
7941+
7942 /* some utility functions to deal with SMT */
7943 extern u32 cbe_get_hw_thread_id(int cpu);
7944 extern u32 cbe_cpu_to_node(int cpu);
7945--- a/arch/powerpc/include/asm/paca.h
7946+++ b/arch/powerpc/include/asm/paca.h
7947@@ -97,6 +97,10 @@ struct paca_struct {
7948 u8 soft_enabled; /* irq soft-enable flag */
7949 u8 hard_enabled; /* set if irqs are enabled in MSR */
7950 u8 io_sync; /* writel() needs spin_unlock sync */
7951+#ifdef CONFIG_PERFMON
7952+ u8 pmu_except_pending; /* PMU exception occurred while soft
7953+ * disabled */
7954+#endif
7955
7956 /* Stuff for accurate time accounting */
7957 u64 user_time; /* accumulated usermode TB ticks */
7958--- /dev/null
7959+++ b/arch/powerpc/include/asm/perfmon.h
7960@@ -0,0 +1,33 @@
7961+/*
7962+ * Copyright (c) 2007 Hewlett-Packard Development Company, L.P.
7963+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
7964+ *
7965+ * This file contains powerpc specific definitions for the perfmon
7966+ * interface.
7967+ *
7968+ * This file MUST never be included directly. Use linux/perfmon.h.
7969+ *
7970+ * This program is free software; you can redistribute it and/or
7971+ * modify it under the terms of version 2 of the GNU General Public
7972+ * License as published by the Free Software Foundation.
7973+ *
7974+ * This program is distributed in the hope that it will be useful,
7975+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
7976+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
7977+ * General Public License for more details.
7978+ *
7979+ * You should have received a copy of the GNU General Public License
7980+ * along with this program; if not, write to the Free Software
7981+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
7982+ * 02111-1307 USA
7983+ */
7984+#ifndef _ASM_POWERPC_PERFMON_H_
7985+#define _ASM_POWERPC_PERFMON_H_
7986+
7987+/*
7988+ * arch-specific user visible interface definitions
7989+ */
7990+#define PFM_ARCH_MAX_PMCS (256+64) /* 256 HW 64 SW */
7991+#define PFM_ARCH_MAX_PMDS (256+64) /* 256 HW 64 SW */
7992+
7993+#endif /* _ASM_POWERPC_PERFMON_H_ */
7994--- /dev/null
7995+++ b/arch/powerpc/include/asm/perfmon_kern.h
7996@@ -0,0 +1,390 @@
7997+/*
7998+ * Copyright (c) 2005 David Gibson, IBM Corporation.
7999+ *
8000+ * Based on other versions:
8001+ * Copyright (c) 2005 Hewlett-Packard Development Company, L.P.
8002+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
8003+ *
8004+ * This file contains powerpc specific definitions for the perfmon
8005+ * interface.
8006+ *
8007+ * This program is free software; you can redistribute it and/or
8008+ * modify it under the terms of version 2 of the GNU General Public
8009+ * License as published by the Free Software Foundation.
8010+ *
8011+ * This program is distributed in the hope that it will be useful,
8012+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
8013+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
8014+ * General Public License for more details.
8015+ *
8016+ * You should have received a copy of the GNU General Public License
8017+ * along with this program; if not, write to the Free Software
8018+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
8019+ * 02111-1307 USA
8020+ */
8021+#ifndef _ASM_POWERPC_PERFMON_KERN_H_
8022+#define _ASM_POWERPC_PERFMON_KERN_H_
8023+
8024+#ifdef __KERNEL__
8025+
8026+#ifdef CONFIG_PERFMON
8027+
8028+#include <asm/pmc.h>
8029+#include <asm/unistd.h>
8030+
8031+#define HID0_PMC5_6_GR_MODE (1UL << (63 - 40))
8032+
8033+enum powerpc_pmu_type {
8034+ PFM_POWERPC_PMU_NONE,
8035+ PFM_POWERPC_PMU_604,
8036+ PFM_POWERPC_PMU_604e,
8037+ PFM_POWERPC_PMU_750, /* XXX: Minor event set diffs between IBM and Moto. */
8038+ PFM_POWERPC_PMU_7400,
8039+ PFM_POWERPC_PMU_7450,
8040+ PFM_POWERPC_PMU_POWER4,
8041+ PFM_POWERPC_PMU_POWER5,
8042+ PFM_POWERPC_PMU_POWER5p,
8043+ PFM_POWERPC_PMU_POWER6,
8044+ PFM_POWERPC_PMU_CELL,
8045+};
8046+
8047+struct pfm_arch_pmu_info {
8048+ enum powerpc_pmu_type pmu_style;
8049+
8050+ void (*write_pmc)(unsigned int cnum, u64 value);
8051+ void (*write_pmd)(unsigned int cnum, u64 value);
8052+
8053+ u64 (*read_pmd)(unsigned int cnum);
8054+
8055+ void (*enable_counters)(struct pfm_context *ctx,
8056+ struct pfm_event_set *set);
8057+ void (*disable_counters)(struct pfm_context *ctx,
8058+ struct pfm_event_set *set);
8059+
8060+ void (*irq_handler)(struct pt_regs *regs, struct pfm_context *ctx);
8061+ void (*get_ovfl_pmds)(struct pfm_context *ctx,
8062+ struct pfm_event_set *set);
8063+
8064+ /* The following routines are optional. */
8065+ void (*restore_pmcs)(struct pfm_context *ctx,
8066+ struct pfm_event_set *set);
8067+ void (*restore_pmds)(struct pfm_context *ctx,
8068+ struct pfm_event_set *set);
8069+
8070+ int (*ctxswout_thread)(struct task_struct *task,
8071+ struct pfm_context *ctx,
8072+ struct pfm_event_set *set);
8073+ void (*ctxswin_thread)(struct task_struct *task,
8074+ struct pfm_context *ctx,
8075+ struct pfm_event_set *set);
8076+ int (*load_context)(struct pfm_context *ctx);
8077+ void (*unload_context)(struct pfm_context *ctx);
8078+ int (*acquire_pmu)(u64 *unavail_pmcs, u64 *unavail_pmds);
8079+ void (*release_pmu)(void);
8080+ void *platform_info;
8081+ void (*resend_irq)(struct pfm_context *ctx);
8082+};
8083+
8084+#ifdef CONFIG_PPC32
8085+#define PFM_ARCH_PMD_STK_ARG 6 /* conservative value */
8086+#define PFM_ARCH_PMC_STK_ARG 6 /* conservative value */
8087+#else
8088+#define PFM_ARCH_PMD_STK_ARG 8 /* conservative value */
8089+#define PFM_ARCH_PMC_STK_ARG 8 /* conservative value */
8090+#endif
8091+
8092+static inline void pfm_arch_resend_irq(struct pfm_context *ctx)
8093+{
8094+ struct pfm_arch_pmu_info *arch_info;
8095+
8096+ arch_info = pfm_pmu_info();
8097+ arch_info->resend_irq(ctx);
8098+}
8099+
8100+static inline void pfm_arch_serialize(void)
8101+{}
8102+
8103+static inline void pfm_arch_write_pmc(struct pfm_context *ctx,
8104+ unsigned int cnum,
8105+ u64 value)
8106+{
8107+ struct pfm_arch_pmu_info *arch_info;
8108+
8109+ arch_info = pfm_pmu_info();
8110+
8111+ /*
8112+ * we only write to the actual register when monitoring is
8113+ * active (pfm_start was issued)
8114+ */
8115+ if (ctx && ctx->flags.started == 0)
8116+ return;
8117+
8118+ BUG_ON(!arch_info->write_pmc);
8119+
8120+ arch_info->write_pmc(cnum, value);
8121+}
8122+
8123+static inline void pfm_arch_write_pmd(struct pfm_context *ctx,
8124+ unsigned int cnum, u64 value)
8125+{
8126+ struct pfm_arch_pmu_info *arch_info;
8127+
8128+ arch_info = pfm_pmu_info();
8129+
8130+ value &= pfm_pmu_conf->ovfl_mask;
8131+
8132+ BUG_ON(!arch_info->write_pmd);
8133+
8134+ arch_info->write_pmd(cnum, value);
8135+}
8136+
8137+static inline u64 pfm_arch_read_pmd(struct pfm_context *ctx, unsigned int cnum)
8138+{
8139+ struct pfm_arch_pmu_info *arch_info;
8140+
8141+ arch_info = pfm_pmu_info();
8142+
8143+ BUG_ON(!arch_info->read_pmd);
8144+
8145+ return arch_info->read_pmd(cnum);
8146+}
8147+
8148+/*
8149+ * For some CPUs, the upper bits of a counter must be set in order for the
8150+ * overflow interrupt to happen. On overflow, the counter has wrapped around,
8151+ * and the upper bits are cleared. This function may be used to set them back.
8152+ */
8153+static inline void pfm_arch_ovfl_reset_pmd(struct pfm_context *ctx,
8154+ unsigned int cnum)
8155+{
8156+ u64 val = pfm_arch_read_pmd(ctx, cnum);
8157+
8158+ /* This masks out overflow bit 31 */
8159+ pfm_arch_write_pmd(ctx, cnum, val);
8160+}
8161+
8162+/*
8163+ * At certain points, perfmon needs to know if monitoring has been
8164+ * explicitly started/stopped by the user via pfm_start/pfm_stop. The
8165+ * information is tracked in flags.started. However on certain
8166+ * architectures, it may be possible to start/stop directly from
8167+ * user level with a single assembly instruction bypassing
8168+ * the kernel. This function must be used to determine, by
8169+ * arch-specific means, whether monitoring is actually started/stopped.
8170+ */
8171+static inline int pfm_arch_is_active(struct pfm_context *ctx)
8172+{
8173+ return ctx->flags.started;
8174+}
8175+
8176+static inline void pfm_arch_ctxswout_sys(struct task_struct *task,
8177+ struct pfm_context *ctx)
8178+{}
8179+
8180+static inline void pfm_arch_ctxswin_sys(struct task_struct *task,
8181+ struct pfm_context *ctx)
8182+{}
8183+
8184+void pfm_arch_init_percpu(void);
8185+int pfm_arch_is_monitoring_active(struct pfm_context *ctx);
8186+int pfm_arch_ctxswout_thread(struct task_struct *task, struct pfm_context *ctx);
8187+void pfm_arch_ctxswin_thread(struct task_struct *task, struct pfm_context *ctx);
8188+void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx);
8189+void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx);
8190+void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set);
8191+void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set);
8192+void pfm_arch_clear_pmd_ovfl_cond(struct pfm_context *ctx, struct pfm_event_set *set);
8193+int pfm_arch_get_ovfl_pmds(struct pfm_context *ctx,
8194+ struct pfm_event_set *set);
8195+char *pfm_arch_get_pmu_module_name(void);
8196+/*
8197+ * called from __pfm_interrupt_handler(). ctx is not NULL.
8198+ * ctx is locked. PMU interrupt is masked.
8199+ *
8200+ * must stop all monitoring to ensure handler has consistent view.
8201+ * must collect the bitmask of overflowed PMDs into povfls_pmds and
8202+ * npend_ovfls. If no interrupt is detected, then npend_ovfls
8203+ * must be set to zero.
8204+ */
8205+static inline void pfm_arch_intr_freeze_pmu(struct pfm_context *ctx, struct pfm_event_set *set)
8206+{
8207+ pfm_arch_stop(current, ctx);
8208+}
8209+
8210+void powerpc_irq_handler(struct pt_regs *regs);
8211+
8212+/*
8213+ * unfreeze PMU from pfm_do_interrupt_handler()
8214+ * ctx may be NULL for spurious interrupts
8215+ */
8216+static inline void pfm_arch_intr_unfreeze_pmu(struct pfm_context *ctx)
8217+{
8218+ struct pfm_arch_pmu_info *arch_info;
8219+
8220+ if (!ctx)
8221+ return;
8222+
8223+ PFM_DBG_ovfl("state=%d", ctx->state);
8224+
8225+ ctx->flags.started = 1;
8226+
8227+ if (ctx->state == PFM_CTX_MASKED)
8228+ return;
8229+
8230+ arch_info = pfm_pmu_info();
8231+ BUG_ON(!arch_info->enable_counters);
8232+ arch_info->enable_counters(ctx, ctx->active_set);
8233+}
8234+
8235+/*
8236+ * PowerPC does not save the PMDs during pfm_arch_intr_freeze_pmu(), thus
8237+ * this routine needs to do it when switching sets on overflow
8238+ */
8239+static inline void pfm_arch_save_pmds_from_intr(struct pfm_context *ctx,
8240+ struct pfm_event_set *set)
8241+{
8242+ pfm_save_pmds(ctx, set);
8243+}
8244+
8245+/*
8246+ * this function is called from the PMU interrupt handler ONLY.
8247+ * On PPC, the PMU is frozen via pfm_arch_stop(), and masking would be
8248+ * implemented the same way. Given that the PMU is already stopped when
8249+ * entering the interrupt handler, we do not need to stop it again, so
8250+ * this function is a nop.
8251+ */
8252+static inline void pfm_arch_mask_monitoring(struct pfm_context *ctx,
8253+ struct pfm_event_set *set)
8254+{}
8255+
8256+/*
8257+ * Simply need to start the context in order to unmask.
8258+ */
8259+static inline void pfm_arch_unmask_monitoring(struct pfm_context *ctx,
8260+ struct pfm_event_set *set)
8261+{
8262+ pfm_arch_start(current, ctx);
8263+}
8264+
8265+
8266+static inline int pfm_arch_pmu_config_init(struct pfm_pmu_config *cfg)
8267+{
8268+ return 0;
8269+}
8270+
8271+static inline int pfm_arch_context_create(struct pfm_context *ctx,
8272+ u32 ctx_flags)
8273+{
8274+ return 0;
8275+}
8276+
8277+static inline void pfm_arch_context_free(struct pfm_context *ctx)
8278+{}
8279+
8280+/* not necessary on PowerPC */
8281+static inline void pfm_cacheflush(void *addr, unsigned int len)
8282+{}
8283+
8284+/*
8285+ * function called from pfm_setfl_sane(). Context is locked
8286+ * and interrupts are masked.
8287+ * The value of flags is the value of ctx_flags as passed by
8288+ * user.
8289+ *
8290+ * function must check arch-specific set flags.
8291+ * Return:
8292+ * 0 when flags are valid
8293+ * <0 on error
8294+ */
8295+static inline int pfm_arch_setfl_sane(struct pfm_context *ctx, u32 flags)
8296+{
8297+ return 0;
8298+}
8299+
8300+static inline int pfm_arch_init(void)
8301+{
8302+ return 0;
8303+}
8304+
8305+static inline int pfm_arch_load_context(struct pfm_context *ctx)
8306+{
8307+ struct pfm_arch_pmu_info *arch_info;
8308+ int rc = 0;
8309+
8310+ arch_info = pfm_pmu_info();
8311+ if (arch_info->load_context)
8312+ rc = arch_info->load_context(ctx);
8313+
8314+ return rc;
8315+}
8316+
8317+static inline void pfm_arch_unload_context(struct pfm_context *ctx)
8318+{
8319+ struct pfm_arch_pmu_info *arch_info;
8320+
8321+ arch_info = pfm_pmu_info();
8322+ if (arch_info->unload_context)
8323+ arch_info->unload_context(ctx);
8324+}
8325+
8326+static inline int pfm_arch_pmu_acquire(u64 *unavail_pmcs, u64 *unavail_pmds)
8327+{
8328+ struct pfm_arch_pmu_info *arch_info;
8329+ int rc = 0;
8330+
8331+ arch_info = pfm_pmu_info();
8332+ if (arch_info->acquire_pmu) {
8333+ rc = arch_info->acquire_pmu(unavail_pmcs, unavail_pmds);
8334+ if (rc)
8335+ return rc;
8336+ }
8337+
8338+ return reserve_pmc_hardware(powerpc_irq_handler);
8339+}
8340+
8341+static inline void pfm_arch_pmu_release(void)
8342+{
8343+ struct pfm_arch_pmu_info *arch_info;
8344+
8345+ arch_info = pfm_pmu_info();
8346+ if (arch_info->release_pmu)
8347+ arch_info->release_pmu();
8348+
8349+ release_pmc_hardware();
8350+}
8351+
8352+static inline void pfm_arch_arm_handle_work(struct task_struct *task)
8353+{}
8354+
8355+static inline void pfm_arch_disarm_handle_work(struct task_struct *task)
8356+{}
8357+
8358+static inline int pfm_arch_get_base_syscall(void)
8359+{
8360+ return __NR_pfm_create_context;
8361+}
8362+
8363+struct pfm_arch_context {
8364+ /* Cell: Most recent value of the pm_status
8365+ * register read by the interrupt handler.
8366+ *
8367+ * Interrupt handler sets last_read_updated if it
8368+ * just read and updated last_read_pm_status
8369+ */
8370+ u32 last_read_pm_status;
8371+ u32 last_read_updated;
8372+ u64 powergs_pmc5, powergs_pmc6;
8373+ u64 delta_tb, delta_tb_start;
8374+ u64 delta_purr, delta_purr_start;
8375+};
8376+
8377+#define PFM_ARCH_CTX_SIZE sizeof(struct pfm_arch_context)
8378+/*
8379+ * PowerPC does not need extra alignment requirements for the sampling buffer
8380+ */
8381+#define PFM_ARCH_SMPL_ALIGN_SIZE 0
8382+
8383+#endif /* CONFIG_PERFMON */
8384+
8385+#endif /* __KERNEL__ */
8386+#endif /* _ASM_POWERPC_PERFMON_KERN_H_ */
8387--- a/arch/powerpc/include/asm/reg.h
8388+++ b/arch/powerpc/include/asm/reg.h
8389@@ -698,6 +698,7 @@
8390 #define PV_POWER5 0x003A
8391 #define PV_POWER5p 0x003B
8392 #define PV_970FX 0x003C
8393+#define PV_POWER6 0x003E
8394 #define PV_630 0x0040
8395 #define PV_630p 0x0041
8396 #define PV_970MP 0x0044
8397--- a/arch/powerpc/include/asm/systbl.h
8398+++ b/arch/powerpc/include/asm/systbl.h
8399@@ -322,3 +322,15 @@ SYSCALL_SPU(epoll_create1)
8400 SYSCALL_SPU(dup3)
8401 SYSCALL_SPU(pipe2)
8402 SYSCALL(inotify_init1)
8403+SYSCALL(pfm_create_context)
8404+SYSCALL(pfm_write_pmcs)
8405+SYSCALL(pfm_write_pmds)
8406+SYSCALL(pfm_read_pmds)
8407+SYSCALL(pfm_load_context)
8408+SYSCALL(pfm_start)
8409+SYSCALL(pfm_stop)
8410+SYSCALL(pfm_restart)
8411+SYSCALL(pfm_create_evtsets)
8412+SYSCALL(pfm_getinfo_evtsets)
8413+SYSCALL(pfm_delete_evtsets)
8414+SYSCALL(pfm_unload_context)
8415--- a/arch/powerpc/include/asm/thread_info.h
8416+++ b/arch/powerpc/include/asm/thread_info.h
8417@@ -130,10 +130,12 @@ static inline struct thread_info *curren
8418 #define _TIF_FREEZE (1<<TIF_FREEZE)
8419 #define _TIF_RUNLATCH (1<<TIF_RUNLATCH)
8420 #define _TIF_ABI_PENDING (1<<TIF_ABI_PENDING)
8421+#define _TIF_PERFMON_WORK (1<<TIF_PERFMON_WORK)
8422+#define _TIF_PERFMON_CTXSW (1<<TIF_PERFMON_CTXSW)
8423 #define _TIF_SYSCALL_T_OR_A (_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP)
8424
8425 #define _TIF_USER_WORK_MASK (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \
8426- _TIF_NOTIFY_RESUME)
8427+ _TIF_NOTIFY_RESUME | _TIF_PERFMON_WORK)
8428 #define _TIF_PERSYSCALL_MASK (_TIF_RESTOREALL|_TIF_NOERROR)
8429
8430 /* Bits in local_flags */
8431--- a/arch/powerpc/include/asm/unistd.h
8432+++ b/arch/powerpc/include/asm/unistd.h
8433@@ -341,10 +341,22 @@
8434 #define __NR_dup3 316
8435 #define __NR_pipe2 317
8436 #define __NR_inotify_init1 318
8437+#define __NR_pfm_create_context 319
8438+#define __NR_pfm_write_pmcs 320
8439+#define __NR_pfm_write_pmds 321
8440+#define __NR_pfm_read_pmds 322
8441+#define __NR_pfm_load_context 323
8442+#define __NR_pfm_start 324
8443+#define __NR_pfm_stop 325
8444+#define __NR_pfm_restart 326
8445+#define __NR_pfm_create_evtsets 327
8446+#define __NR_pfm_getinfo_evtsets 328
8447+#define __NR_pfm_delete_evtsets 329
8448+#define __NR_pfm_unload_context 330
8449
8450 #ifdef __KERNEL__
8451
8452-#define __NR_syscalls 319
8453+#define __NR_syscalls 331
8454
8455 #define __NR__exit __NR_exit
8456 #define NR_syscalls __NR_syscalls
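
With the syscall numbers above wired into the table, user space can reach them through syscall(2). A minimal probe, assuming only the powerpc numbers defined in this hunk (it passes no real perfmon2 arguments, so it can only tell a wired-up syscall apart from ENOSYS):

/* hypothetical user-space probe, not part of the patch */
#include <errno.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef __NR_pfm_create_context
#define __NR_pfm_create_context 319	/* powerpc value from the hunk above */
#endif

int main(void)
{
	/* NULL/0 arguments are enough to tell ENOSYS (syscall not wired up)
	 * apart from EFAULT/EINVAL (syscall present, arguments rejected). */
	long ret = syscall(__NR_pfm_create_context, NULL, NULL, NULL, 0UL);

	if (ret < 0 && errno == ENOSYS)
		printf("perfmon2 syscalls are not available\n");
	else
		printf("perfmon2 syscalls are wired up (ret=%ld, errno=%d)\n",
		       ret, errno);
	return 0;
}
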
8457--- a/arch/powerpc/kernel/entry_32.S
8458+++ b/arch/powerpc/kernel/entry_32.S
8459@@ -39,7 +39,7 @@
8460 * MSR_KERNEL is > 0x10000 on 4xx/Book-E since it include MSR_CE.
8461 */
8462 #if MSR_KERNEL >= 0x10000
8463-#define LOAD_MSR_KERNEL(r, x) lis r,(x)@h; ori r,r,(x)@l
8464+#define LOAD_MSR_KERNEL(r, x) lis r,(x)@ha; ori r,r,(x)@l
8465 #else
8466 #define LOAD_MSR_KERNEL(r, x) li r,(x)
8467 #endif
8468--- a/arch/powerpc/kernel/entry_64.S
8469+++ b/arch/powerpc/kernel/entry_64.S
8470@@ -643,6 +643,10 @@ user_work:
8471 b .ret_from_except_lite
8472
8473 1: bl .save_nvgprs
8474+#ifdef CONFIG_PERFMON
8475+ addi r3,r1,STACK_FRAME_OVERHEAD
8476+ bl .pfm_handle_work
8477+#endif /* CONFIG_PERFMON */
8478 addi r3,r1,STACK_FRAME_OVERHEAD
8479 bl .do_signal
8480 b .ret_from_except
8481--- a/arch/powerpc/kernel/irq.c
8482+++ b/arch/powerpc/kernel/irq.c
8483@@ -104,6 +104,24 @@ static inline notrace void set_soft_enab
8484 : : "r" (enable), "i" (offsetof(struct paca_struct, soft_enabled)));
8485 }
8486
8487+#ifdef CONFIG_PERFMON
8488+static inline unsigned long get_pmu_except_pending(void)
8489+{
8490+ unsigned long pending;
8491+
8492+ __asm__ __volatile__("lbz %0,%1(13)"
8493+ : "=r" (pending) : "i" (offsetof(struct paca_struct, pmu_except_pending)));
8494+
8495+ return pending;
8496+}
8497+
8498+static inline void set_pmu_except_pending(unsigned long pending)
8499+{
8500+ __asm__ __volatile__("stb %0,%1(13)"
8501+ : : "r" (pending), "i" (offsetof(struct paca_struct, pmu_except_pending)));
8502+}
8503+#endif /* CONFIG_PERFMON */
8504+
8505 notrace void raw_local_irq_restore(unsigned long en)
8506 {
8507 /*
8508@@ -162,6 +180,19 @@ notrace void raw_local_irq_restore(unsig
8509 lv1_get_version_info(&tmp);
8510 }
8511
8512+#ifdef CONFIG_PERFMON
8513+ /*
8514+ * If a PMU exception occurred while interrupts were soft disabled,
8515+ * force a PMU exception.
8516+ */
8517+ if (get_pmu_except_pending()) {
8518+ set_pmu_except_pending(0);
8519+ /* Make sure we trigger the edge detection circuitry */
8520+ mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) & ~MMCR0_PMAO);
8521+ mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) | MMCR0_PMAO);
8522+ }
8523+#endif /* CONFIG_PERFMON */
8524+
8525 __hard_irq_enable();
8526 }
8527 EXPORT_SYMBOL(raw_local_irq_restore);
8528--- a/arch/powerpc/kernel/process.c
8529+++ b/arch/powerpc/kernel/process.c
8530@@ -33,6 +33,7 @@
8531 #include <linux/mqueue.h>
8532 #include <linux/hardirq.h>
8533 #include <linux/utsname.h>
8534+#include <linux/perfmon_kern.h>
8535
8536 #include <asm/pgtable.h>
8537 #include <asm/uaccess.h>
8538@@ -393,9 +394,14 @@ struct task_struct *__switch_to(struct t
8539 new_thread->start_tb = current_tb;
8540 }
8541 #endif
8542-
8543 local_irq_save(flags);
8544
8545+ if (test_tsk_thread_flag(prev, TIF_PERFMON_CTXSW))
8546+ pfm_ctxsw_out(prev, new);
8547+
8548+ if (test_tsk_thread_flag(new, TIF_PERFMON_CTXSW))
8549+ pfm_ctxsw_in(prev, new);
8550+
8551 account_system_vtime(current);
8552 account_process_vtime(current);
8553 calculate_steal_time();
8554@@ -544,6 +550,7 @@ void show_regs(struct pt_regs * regs)
8555 void exit_thread(void)
8556 {
8557 discard_lazy_cpu_state();
8558+ pfm_exit_thread();
8559 }
8560
8561 void flush_thread(void)
8562@@ -669,6 +676,7 @@ int copy_thread(int nr, unsigned long cl
8563 #else
8564 kregs->nip = (unsigned long)ret_from_fork;
8565 #endif
8566+ pfm_copy_thread(p);
8567
8568 return 0;
8569 }
8570--- /dev/null
8571+++ b/arch/powerpc/perfmon/Kconfig
8572@@ -0,0 +1,67 @@
8573+menu "Hardware Performance Monitoring support"
8574+config PERFMON
8575+ bool "Perfmon2 performance monitoring interface"
8576+ default n
8577+ help
8578+ Enables the perfmon2 interface to access the hardware
8579+ performance counters. See <http://perfmon2.sf.net/> for
8580+ more details.
8581+
8582+config PERFMON_DEBUG
8583+ bool "Perfmon debugging"
8584+ default n
8585+ depends on PERFMON
8586+ help
8587+ Enables perfmon debugging support
8588+
8589+config PERFMON_DEBUG_FS
8590+ bool "Enable perfmon statistics reporting via debugfs"
8591+ default y
8592+ depends on PERFMON && DEBUG_FS
8593+ help
8594+ Enable collection and reporting of perfmon timing statistics under
8595+ debugfs. This is used for debugging and performance analysis of the
8596+ subsystem. The debugfs filesystem must be mounted.
8597+
8598+config PERFMON_POWER4
8599+ tristate "Support for Power4 hardware performance counters"
8600+ depends on PERFMON && PPC64
8601+ default n
8602+ help
8603+	  Enables support for the Power4 hardware performance counters.
8604+ If unsure, say M.
8605+
8606+config PERFMON_POWER5
8607+ tristate "Support for Power5 hardware performance counters"
8608+ depends on PERFMON && PPC64
8609+ default n
8610+ help
8611+	  Enables support for the Power5 hardware performance counters.
8612+ If unsure, say M.
8613+
8614+config PERFMON_POWER6
8615+ tristate "Support for Power6 hardware performance counters"
8616+ depends on PERFMON && PPC64
8617+ default n
8618+ help
8619+	  Enables support for the Power6 hardware performance counters.
8620+ If unsure, say M.
8621+
8622+config PERFMON_PPC32
8623+ tristate "Support for PPC32 hardware performance counters"
8624+ depends on PERFMON && PPC32
8625+ default n
8626+ help
8627+	  Enables support for the PPC32 hardware performance counters.
8628+ If unsure, say M.
8629+
8630+config PERFMON_CELL
8631+ tristate "Support for Cell hardware performance counters"
8632+ depends on PERFMON && PPC_CELL
8633+ select PS3_LPM if PPC_PS3
8634+ default n
8635+ help
8636+ Enables support for the Cell hardware performance counters.
8637+ If unsure, say M.
8638+
8639+endmenu
8640--- /dev/null
8641+++ b/arch/powerpc/perfmon/Makefile
8642@@ -0,0 +1,6 @@
8643+obj-$(CONFIG_PERFMON) += perfmon.o
8644+obj-$(CONFIG_PERFMON_POWER4) += perfmon_power4.o
8645+obj-$(CONFIG_PERFMON_POWER5) += perfmon_power5.o
8646+obj-$(CONFIG_PERFMON_POWER6) += perfmon_power6.o
8647+obj-$(CONFIG_PERFMON_PPC32) += perfmon_ppc32.o
8648+obj-$(CONFIG_PERFMON_CELL) += perfmon_cell.o
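
Taken together with the Kconfig entries above, a typical configuration fragment for a POWER5/POWER6 or Cell machine might look like the following (illustrative only; enable whichever model-specific modules match the target CPUs):

CONFIG_PERFMON=y
# CONFIG_PERFMON_DEBUG is not set
CONFIG_PERFMON_DEBUG_FS=y
CONFIG_PERFMON_POWER5=m
CONFIG_PERFMON_POWER6=m
CONFIG_PERFMON_CELL=m
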
8649--- /dev/null
8650+++ b/arch/powerpc/perfmon/perfmon.c
8651@@ -0,0 +1,334 @@
8652+/*
8653+ * This file implements the powerpc specific
8654+ * support for the perfmon2 interface
8655+ *
8656+ * Copyright (c) 2005 David Gibson, IBM Corporation.
8657+ *
8658+ * based on versions for other architectures:
8659+ * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P.
8660+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
8661+ *
8662+ * This program is free software; you can redistribute it and/or
8663+ * modify it under the terms of version 2 of the GNU General Public
8664+ * License as published by the Free Software Foundation.
8665+ *
8666+ * This program is distributed in the hope that it will be useful,
8667+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
8668+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
8669+ * General Public License for more details.
8670+ *
8671+ * You should have received a copy of the GNU General Public License
8672+ * along with this program; if not, write to the Free Software
8673+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
8674+ * 02111-1307 USA
8675+ */
8676+#include <linux/interrupt.h>
8677+#include <linux/perfmon_kern.h>
8678+
8679+static void pfm_stop_active(struct task_struct *task,
8680+ struct pfm_context *ctx, struct pfm_event_set *set)
8681+{
8682+ struct pfm_arch_pmu_info *arch_info;
8683+
8684+ arch_info = pfm_pmu_info();
8685+ BUG_ON(!arch_info->disable_counters || !arch_info->get_ovfl_pmds);
8686+
8687+ arch_info->disable_counters(ctx, set);
8688+
8689+ if (set->npend_ovfls)
8690+ return;
8691+
8692+ arch_info->get_ovfl_pmds(ctx, set);
8693+}
8694+
8695+/*
8696+ * Called from pfm_save_pmds(). Interrupts are masked. Registers are
8697+ * already saved away.
8698+ */
8699+void pfm_arch_clear_pmd_ovfl_cond(struct pfm_context *ctx,
8700+ struct pfm_event_set *set)
8701+{
8702+ int i, num;
8703+ u64 *used_pmds, *intr_pmds;
8704+
8705+ num = set->nused_pmds;
8706+ used_pmds = set->used_pmds;
8707+ intr_pmds = ctx->regs.intr_pmds;
8708+
8709+ for (i = 0; num; i++)
8710+ if (likely(test_bit(i, used_pmds))) {
8711+ if (likely(test_bit(i, intr_pmds)))
8712+ pfm_write_pmd(ctx, i, 0);
8713+ num--;
8714+ }
8715+}
8716+
8717+/*
8718+ * Called from pfm_ctxsw(). Task is guaranteed to be current.
8719+ * Context is locked. Interrupts are masked. Monitoring is active.
8720+ * PMU access is guaranteed. PMC and PMD registers are live in PMU.
8721+ *
8722+ * for per-thread:
8723+ * must stop monitoring for the task
8724+ * Return:
8725+ * non-zero : did not save PMDs (as part of stopping the PMU)
8726+ * 0 : saved PMDs (no need to save them in caller)
8727+ */
8728+int pfm_arch_ctxswout_thread(struct task_struct *task, struct pfm_context *ctx)
8729+{
8730+ struct pfm_arch_pmu_info *arch_info;
8731+
8732+ arch_info = pfm_pmu_info();
8733+ /*
8734+ * disable lazy restore of the PMC/PMD registers.
8735+ */
8736+ ctx->active_set->priv_flags |= PFM_SETFL_PRIV_MOD_BOTH;
8737+
8738+ if (ctx->state == PFM_CTX_MASKED)
8739+ return 1;
8740+
8741+ pfm_stop_active(task, ctx, ctx->active_set);
8742+
8743+ if (arch_info->ctxswout_thread)
8744+ arch_info->ctxswout_thread(task, ctx, ctx->active_set);
8745+
8746+ return pfm_arch_is_active(ctx);
8747+}
8748+
8749+/*
8750+ * Called from pfm_ctxsw
8751+ */
8752+void pfm_arch_ctxswin_thread(struct task_struct *task, struct pfm_context *ctx)
8753+{
8754+ struct pfm_arch_pmu_info *arch_info;
8755+
8756+ arch_info = pfm_pmu_info();
8757+ if (ctx->state != PFM_CTX_MASKED && ctx->flags.started == 1) {
8758+ BUG_ON(!arch_info->enable_counters);
8759+ arch_info->enable_counters(ctx, ctx->active_set);
8760+ }
8761+
8762+ if (arch_info->ctxswin_thread)
8763+ arch_info->ctxswin_thread(task, ctx, ctx->active_set);
8764+}
8765+
8766+/*
8767+ * Called from pfm_stop() and idle notifier
8768+ *
8769+ * Interrupts are masked. Context is locked. Set is the active set.
8770+ *
8771+ * For per-thread:
8772+ * task is not necessarily current. If not current task, then
8773+ * task is guaranteed stopped and off any cpu. Access to PMU
8774+ * is not guaranteed. Interrupts are masked. Context is locked.
8775+ * Set is the active set.
8776+ *
8777+ * For system-wide:
8778+ * task is current
8779+ *
8780+ * must disable active monitoring. ctx cannot be NULL
8781+ */
8782+void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx)
8783+{
8784+ /*
8785+ * no need to go through stop_save()
8786+ * if we are already stopped
8787+ */
8788+ if (!ctx->flags.started || ctx->state == PFM_CTX_MASKED)
8789+ return;
8790+
8791+ /*
8792+ * stop live registers and collect pending overflow
8793+ */
8794+ if (task == current)
8795+ pfm_stop_active(task, ctx, ctx->active_set);
8796+}
8797+
8798+/*
8799+ * Enable active monitoring. Called from pfm_start() and
8800+ * pfm_arch_unmask_monitoring().
8801+ *
8802+ * Interrupts are masked. Context is locked. Set is the active set.
8803+ *
8804+ * For per-thread:
8805+ * Task is not necessarily current. If not current task, then task
8806+ * is guaranteed stopped and off any cpu. No access to PMU if task
8807+ * is not current.
8808+ *
8809+ * For system-wide:
8810+ * Task is always current
8811+ */
8812+void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx)
8813+{
8814+ struct pfm_arch_pmu_info *arch_info;
8815+
8816+ arch_info = pfm_pmu_info();
8817+ if (task != current)
8818+ return;
8819+
8820+ BUG_ON(!arch_info->enable_counters);
8821+
8822+ arch_info->enable_counters(ctx, ctx->active_set);
8823+}
8824+
8825+/*
8826+ * function called from pfm_switch_sets(), pfm_context_load_thread(),
8827+ * pfm_context_load_sys(), and pfm_ctxsw().
8828+ * context is locked. Interrupts are masked. set cannot be NULL.
8829+ * Access to the PMU is guaranteed.
8830+ *
8831+ * function must restore all PMD registers from set.
8832+ */
8833+void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set)
8834+{
8835+ struct pfm_arch_pmu_info *arch_info;
8836+ u64 *used_pmds;
8837+ u16 i, num;
8838+
8839+ arch_info = pfm_pmu_info();
8840+
8841+ /* The model-specific module can override the default
8842+ * restore-PMD method.
8843+ */
8844+ if (arch_info->restore_pmds)
8845+ return arch_info->restore_pmds(ctx, set);
8846+
8847+ num = set->nused_pmds;
8848+ used_pmds = set->used_pmds;
8849+
8850+ for (i = 0; num; i++) {
8851+ if (likely(test_bit(i, used_pmds))) {
8852+ pfm_write_pmd(ctx, i, set->pmds[i].value);
8853+ num--;
8854+ }
8855+ }
8856+}
8857+
8858+/*
8859+ * function called from pfm_switch_sets(), pfm_context_load_thread(),
8860+ * pfm_context_load_sys(), and pfm_ctxsw().
8861+ * context is locked. Interrupts are masked. set cannot be NULL.
8862+ * Access to the PMU is guaranteed.
8863+ *
8864+ * function must restore all PMC registers from set, if needed.
8865+ */
8866+void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set)
8867+{
8868+ struct pfm_arch_pmu_info *arch_info;
8869+ u64 *impl_pmcs;
8870+ unsigned int i, max_pmc, reg;
8871+
8872+ arch_info = pfm_pmu_info();
8873+ /* The model-specific module can override the default
8874+ * restore-PMC method.
8875+ */
8876+ if (arch_info->restore_pmcs)
8877+ return arch_info->restore_pmcs(ctx, set);
8878+
8879+	/* The "common" powerpc models enable the counters simply by writing
8880+ * all the control registers. Therefore, if we're masked or stopped we
8881+ * don't need to bother restoring the PMCs now.
8882+ */
8883+ if (ctx->state == PFM_CTX_MASKED || ctx->flags.started == 0)
8884+ return;
8885+
8886+ max_pmc = ctx->regs.max_pmc;
8887+ impl_pmcs = ctx->regs.pmcs;
8888+
8889+ /*
8890+ * Restore all pmcs in reverse order to ensure the counters aren't
8891+ * enabled before their event selectors are set correctly.
8892+ */
8893+ reg = max_pmc - 1;
8894+ for (i = 0; i < max_pmc; i++) {
8895+ if (test_bit(reg, impl_pmcs))
8896+ pfm_arch_write_pmc(ctx, reg, set->pmcs[reg]);
8897+ reg--;
8898+ }
8899+}
8900+
8901+char *pfm_arch_get_pmu_module_name(void)
8902+{
8903+ unsigned int pvr = mfspr(SPRN_PVR);
8904+
8905+ switch (PVR_VER(pvr)) {
8906+ case 0x0004: /* 604 */
8907+	case 0x0009: /* 604e */
8908+ case 0x000A: /* 604ev */
8909+ case 0x0008: /* 750/740 */
8910+ case 0x7000: /* 750FX */
8911+ case 0x7001:
8912+ case 0x7002: /* 750GX */
8913+ case 0x000C: /* 7400 */
8914+ case 0x800C: /* 7410 */
8915+ case 0x8000: /* 7451/7441 */
8916+ case 0x8001: /* 7455/7445 */
8917+ case 0x8002: /* 7457/7447 */
8918+ case 0x8003: /* 7447A */
8919+ case 0x8004: /* 7448 */
8920+		return "perfmon_ppc32";
8921+ case PV_POWER4:
8922+ case PV_POWER4p:
8923+ return "perfmon_power4";
8924+ case PV_POWER5:
8925+ return "perfmon_power5";
8926+ case PV_POWER5p:
8927+ if (PVR_REV(pvr) < 0x300)
8928+ /* PMU behaves like POWER5 */
8929+ return "perfmon_power5";
8930+ else
8931+ /* PMU behaves like POWER6 */
8932+ return "perfmon_power6";
8933+ case PV_POWER6:
8934+ return "perfmon_power6";
8935+ case PV_970:
8936+ case PV_970FX:
8937+ case PV_970MP:
8938+ return "perfmon_ppc970";
8939+ case PV_BE:
8940+ return "perfmon_cell";
8941+ }
8942+ return NULL;
8943+}
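
As a worked example of the mapping above (the PVR value is made up for illustration; only the upper 16 bits extracted by PVR_VER() matter):

/*
 * Example only: a hypothetical PVR of 0x003E0201 gives
 *	PVR_VER(0x003E0201) == 0x003E == PV_POWER6
 * so pfm_arch_get_pmu_module_name() would return "perfmon_power6".
 */
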
8944+
8945+void pfm_arch_init_percpu(void)
8946+{
8947+#ifdef CONFIG_PPC64
8948+ extern void ppc64_enable_pmcs(void);
8949+ ppc64_enable_pmcs();
8950+#endif
8951+}
8952+
8953+/**
8954+ * powerpc_irq_handler
8955+ *
8956+ * Get the perfmon context that belongs to the current CPU, and call the
8957+ * model-specific interrupt handler.
8958+ **/
8959+void powerpc_irq_handler(struct pt_regs *regs)
8960+{
8961+ struct pfm_arch_pmu_info *arch_info;
8962+ struct pfm_context *ctx;
8963+
8964+	if (!regs->softe) {
8965+ /*
8966+ * We got a PMU interrupt while interrupts were soft
8967+ * disabled. Disable hardware interrupts by clearing
8968+ * MSR_EE and also clear PMAO because we will need to set
8969+ * that again later when interrupts are re-enabled and
8970+ * raw_local_irq_restore() sees that the pmu_except_pending
8971+ * flag is set.
8972+ */
8973+ regs->msr &= ~MSR_EE;
8974+ get_paca()->pmu_except_pending = 1;
8975+ mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) & ~MMCR0_PMAO);
8976+ return;
8977+ }
8978+
8979+ arch_info = pfm_pmu_info();
8980+ if (arch_info->irq_handler) {
8981+ ctx = __get_cpu_var(pmu_ctx);
8982+ if (likely(ctx))
8983+ arch_info->irq_handler(regs, ctx);
8984+ }
8985+}
8986--- /dev/null
8987+++ b/arch/powerpc/perfmon/perfmon_cell.c
8988@@ -0,0 +1,1449 @@
8989+/*
8990+ * This file contains the Cell PMU register description tables
8991+ * and pmc checker used by perfmon.c.
8992+ *
8993+ * Copyright IBM Corporation 2007
8994+ * (C) Copyright 2007 TOSHIBA CORPORATION
8995+ *
8996+ * Based on other Perfmon2 PMU modules.
8997+ * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P.
8998+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
8999+ *
9000+ * This program is free software; you can redistribute it and/or
9001+ * modify it under the terms of version 2 of the GNU General Public
9002+ * License as published by the Free Software Foundation.
9003+ *
9004+ * This program is distributed in the hope that it will be useful,
9005+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
9006+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
9007+ * General Public License for more details.
9008+ *
9009+ * You should have received a copy of the GNU General Public License
9010+ * along with this program; if not, write to the Free Software
9011+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
9012+ * 02111-1307 USA
9013+ */
9014+
9015+#include <linux/module.h>
9016+#include <linux/perfmon_kern.h>
9017+#include <linux/io.h>
9018+#include <asm/cell-pmu.h>
9019+#include <asm/cell-regs.h>
9020+#include <asm/machdep.h>
9021+#include <asm/rtas.h>
9022+#include <asm/ps3.h>
9023+#include <asm/spu.h>
9024+
9025+MODULE_AUTHOR("Kevin Corry <kevcorry@us.ibm.com>, "
9026+ "Carl Love <carll@us.ibm.com>");
9027+MODULE_DESCRIPTION("Cell PMU description table");
9028+MODULE_LICENSE("GPL");
9029+
9030+struct pfm_cell_platform_pmu_info {
9031+ u32 (*read_ctr)(u32 cpu, u32 ctr);
9032+ void (*write_ctr)(u32 cpu, u32 ctr, u32 val);
9033+ void (*write_pm07_control)(u32 cpu, u32 ctr, u32 val);
9034+ void (*write_pm)(u32 cpu, enum pm_reg_name reg, u32 val);
9035+ void (*enable_pm)(u32 cpu);
9036+ void (*disable_pm)(u32 cpu);
9037+ void (*enable_pm_interrupts)(u32 cpu, u32 thread, u32 mask);
9038+ u32 (*get_and_clear_pm_interrupts)(u32 cpu);
9039+ u32 (*get_hw_thread_id)(int cpu);
9040+ struct cbe_ppe_priv_regs __iomem *(*get_cpu_ppe_priv_regs)(int cpu);
9041+ struct cbe_pmd_regs __iomem *(*get_cpu_pmd_regs)(int cpu);
9042+ struct cbe_mic_tm_regs __iomem *(*get_cpu_mic_tm_regs)(int cpu);
9043+ int (*rtas_token)(const char *service);
9044+ int (*rtas_call)(int token, int param1, int param2, int *param3, ...);
9045+};
9046+
9047+/*
9048+ * Mapping from Perfmon logical control registers to Cell hardware registers.
9049+ */
9050+static struct pfm_regmap_desc pfm_cell_pmc_desc[] = {
9051+ /* Per-counter control registers. */
9052+ PMC_D(PFM_REG_I, "pm0_control", 0, 0, 0, 0),
9053+ PMC_D(PFM_REG_I, "pm1_control", 0, 0, 0, 0),
9054+ PMC_D(PFM_REG_I, "pm2_control", 0, 0, 0, 0),
9055+ PMC_D(PFM_REG_I, "pm3_control", 0, 0, 0, 0),
9056+ PMC_D(PFM_REG_I, "pm4_control", 0, 0, 0, 0),
9057+ PMC_D(PFM_REG_I, "pm5_control", 0, 0, 0, 0),
9058+ PMC_D(PFM_REG_I, "pm6_control", 0, 0, 0, 0),
9059+ PMC_D(PFM_REG_I, "pm7_control", 0, 0, 0, 0),
9060+
9061+ /* Per-counter RTAS arguments. Each of these registers has three fields.
9062+ * bits 63-48: debug-bus word
9063+ * bits 47-32: sub-unit
9064+ * bits 31-0 : full signal number
9065+ * (MSB = 63, LSB = 0)
9066+ */
9067+ PMC_D(PFM_REG_I, "pm0_event", 0, 0, 0, 0),
9068+ PMC_D(PFM_REG_I, "pm1_event", 0, 0, 0, 0),
9069+ PMC_D(PFM_REG_I, "pm2_event", 0, 0, 0, 0),
9070+ PMC_D(PFM_REG_I, "pm3_event", 0, 0, 0, 0),
9071+ PMC_D(PFM_REG_I, "pm4_event", 0, 0, 0, 0),
9072+ PMC_D(PFM_REG_I, "pm5_event", 0, 0, 0, 0),
9073+ PMC_D(PFM_REG_I, "pm6_event", 0, 0, 0, 0),
9074+ PMC_D(PFM_REG_I, "pm7_event", 0, 0, 0, 0),
9075+
9076+ /* Global control registers. Same order as enum pm_reg_name. */
9077+ PMC_D(PFM_REG_I, "group_control", 0, 0, 0, 0),
9078+ PMC_D(PFM_REG_I, "debug_bus_control", 0, 0, 0, 0),
9079+ PMC_D(PFM_REG_I, "trace_address", 0, 0, 0, 0),
9080+ PMC_D(PFM_REG_I, "ext_trace_timer", 0, 0, 0, 0),
9081+ PMC_D(PFM_REG_I, "pm_status", 0, 0, 0, 0),
9082+	/* Set the interrupt overflow bits for the four 32-bit counters
9083+	 * that are currently supported. Will need to fix this when both
9084+	 * 32-bit and 16-bit counters are supported.
9085+ */
9086+ PMC_D(PFM_REG_I, "pm_control", 0xF0000000, 0xF0000000, 0, 0),
9087+ PMC_D(PFM_REG_I, "pm_interval", 0, 0, 0, 0), /* FIX: Does user-space also need read access to this one? */
9088+ PMC_D(PFM_REG_I, "pm_start_stop", 0, 0, 0, 0),
9089+};
9090+#define PFM_PM_NUM_PMCS ARRAY_SIZE(pfm_cell_pmc_desc)
9091+
9092+#define CELL_PMC_GROUP_CONTROL 16
9093+#define CELL_PMC_PM_STATUS 20
9094+#define CELL_PMC_PM_CONTROL 21
9095+#define CELL_PMC_PM_CONTROL_CNTR_MASK 0x01E00000UL
9096+#define CELL_PMC_PM_CONTROL_CNTR_16 0x01E00000UL
9097+
9098+/*
9099+ * Mapping from Perfmon logical data counters to Cell hardware counters.
9100+ */
9101+static struct pfm_regmap_desc pfm_cell_pmd_desc[] = {
9102+ PMD_D(PFM_REG_C, "pm0", 0),
9103+ PMD_D(PFM_REG_C, "pm1", 0),
9104+ PMD_D(PFM_REG_C, "pm2", 0),
9105+ PMD_D(PFM_REG_C, "pm3", 0),
9106+ PMD_D(PFM_REG_C, "pm4", 0),
9107+ PMD_D(PFM_REG_C, "pm5", 0),
9108+ PMD_D(PFM_REG_C, "pm6", 0),
9109+ PMD_D(PFM_REG_C, "pm7", 0),
9110+};
9111+#define PFM_PM_NUM_PMDS ARRAY_SIZE(pfm_cell_pmd_desc)
9112+
9113+#define PFM_EVENT_PMC_BUS_WORD(x) (((x) >> 48) & 0x00ff)
9114+#define PFM_EVENT_PMC_FULL_SIGNAL_NUMBER(x) ((x) & 0xffffffff)
9115+#define PFM_EVENT_PMC_SIGNAL_GROUP(x) (((x) & 0xffffffff) / 100)
9116+#define PFM_PM_CTR_INPUT_MUX_BIT(pm07_control) (((pm07_control) >> 26) & 0x1f)
9117+#define PFM_PM_CTR_INPUT_MUX_GROUP_INDEX(pm07_control) ((pm07_control) >> 31)
9118+#define PFM_GROUP_CONTROL_GROUP0_WORD(grp_ctrl) ((grp_ctrl) >> 30)
9119+#define PFM_GROUP_CONTROL_GROUP1_WORD(grp_ctrl) (((grp_ctrl) >> 28) & 0x3)
9120+#define PFM_NUM_OF_GROUPS 2
9121+#define PFM_PPU_IU1_THREAD1_BASE_BIT 19
9122+#define PFM_PPU_XU_THREAD1_BASE_BIT 16
9123+#define PFM_COUNTER_CTRL_PMC_PPU_TH0 0x100000000ULL
9124+#define PFM_COUNTER_CTRL_PMC_PPU_TH1 0x200000000ULL
9125+
9126+/*
9127+ * Debug-bus signal handling.
9128+ *
9129+ * Some Cell systems have firmware that can handle the debug-bus signal
9130+ * routing. For systems without this firmware, we have a minimal in-kernel
9131+ * implementation as well.
9132+ */
9133+
9134+/* The firmware only sees physical CPUs, so divide by 2 if SMT is on. */
9135+#ifdef CONFIG_SCHED_SMT
9136+#define RTAS_CPU(cpu) ((cpu) / 2)
9137+#else
9138+#define RTAS_CPU(cpu) (cpu)
9139+#endif
9140+#define RTAS_BUS_WORD(x) (u16)(((x) >> 48) & 0x0000ffff)
9141+#define RTAS_SUB_UNIT(x) (u16)(((x) >> 32) & 0x0000ffff)
9142+#define RTAS_SIGNAL_NUMBER(x) (s32)( (x) & 0xffffffff)
9143+#define RTAS_SIGNAL_GROUP(x) (RTAS_SIGNAL_NUMBER(x) / 100)
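
A worked decode of a per-counter pmX_event value through these macros (the value is invented for illustration and is not a real event encoding):

u64 ev = 0x0002000000000834ULL;	/* made-up example value */
/* RTAS_BUS_WORD(ev)      == 0x0002            -> debug-bus word 2    */
/* RTAS_SUB_UNIT(ev)      == 0x0000            -> no sub-unit         */
/* RTAS_SIGNAL_NUMBER(ev) == 0x834 == 2100     -> full signal number  */
/* RTAS_SIGNAL_GROUP(ev)  == 2100 / 100 == 21  -> SIG_GROUP_PPU_IU1   */
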
9144+
9145+#define subfunc_RESET 1
9146+#define subfunc_ACTIVATE 2
9147+
9148+#define passthru_ENABLE 1
9149+#define passthru_DISABLE 2
9150+
9151+/**
9152+ * struct cell_rtas_arg
9153+ *
9154+ * @cpu: Processor to modify. Linux numbers CPUs based on SMT IDs, but the
9155+ * firmware only sees the physical CPUs. So this value should be the
9156+ * SMT ID (from smp_processor_id() or get_cpu()) divided by 2.
9157+ * @sub_unit: Hardware subunit this applies to (if applicable).
9158+ * @signal_group: Signal group to enable/disable on the trace bus.
9159+ * @bus_word: For signal groups that propagate via the trace bus, this trace
9160+ * bus word will be used. This is a mask of (1 << TraceBusWord).
9161+ * For other signal groups, this specifies the trigger or event bus.
9162+ * @bit: Trigger/Event bit, if applicable for the signal group.
9163+ *
9164+ * An array of these structures is passed to rtas_call() to set up the
9165+ * signals on the debug bus.
9166+ **/
9167+struct cell_rtas_arg {
9168+ u16 cpu;
9169+ u16 sub_unit;
9170+ s16 signal_group;
9171+ u8 bus_word;
9172+ u8 bit;
9173+};
9174+
9175+/**
9176+ * rtas_reset_signals
9177+ *
9178+ * Use the firmware RTAS call to disable signal pass-thru and to reset the
9179+ * debug-bus signals.
9180+ **/
9181+static int rtas_reset_signals(u32 cpu)
9182+{
9183+ struct cell_rtas_arg signal;
9184+ u64 real_addr = virt_to_phys(&signal);
9185+ int rc;
9186+ struct pfm_cell_platform_pmu_info *info =
9187+ ((struct pfm_arch_pmu_info *)
9188+ (pfm_pmu_conf->pmu_info))->platform_info;
9189+
9190+ memset(&signal, 0, sizeof(signal));
9191+ signal.cpu = RTAS_CPU(cpu);
9192+ rc = info->rtas_call(info->rtas_token("ibm,cbe-perftools"),
9193+ 5, 1, NULL,
9194+ subfunc_RESET,
9195+ passthru_DISABLE,
9196+ real_addr >> 32,
9197+ real_addr & 0xffffffff,
9198+ sizeof(signal));
9199+
9200+ return rc;
9201+}
9202+
9203+/**
9204+ * rtas_activate_signals
9205+ *
9206+ * Use the firmware RTAS call to enable signal pass-thru and to activate the
9207+ * desired signal groups on the debug-bus.
9208+ **/
9209+static int rtas_activate_signals(struct cell_rtas_arg *signals,
9210+ int num_signals)
9211+{
9212+ u64 real_addr = virt_to_phys(signals);
9213+ int rc;
9214+ struct pfm_cell_platform_pmu_info *info =
9215+ ((struct pfm_arch_pmu_info *)
9216+ (pfm_pmu_conf->pmu_info))->platform_info;
9217+
9218+ rc = info->rtas_call(info->rtas_token("ibm,cbe-perftools"),
9219+ 5, 1, NULL,
9220+ subfunc_ACTIVATE,
9221+ passthru_ENABLE,
9222+ real_addr >> 32,
9223+ real_addr & 0xffffffff,
9224+ num_signals * sizeof(*signals));
9225+
9226+ return rc;
9227+}
9228+
9229+#define HID1_RESET_MASK (~0x00000001ffffffffUL)
9230+#define PPU_IU1_WORD0_HID1_EN_MASK (~0x00000001f0c0802cUL)
9231+#define PPU_IU1_WORD0_HID1_EN_WORD ( 0x00000001f0400000UL)
9232+#define PPU_IU1_WORD1_HID1_EN_MASK (~0x000000010fc08023UL)
9233+#define PPU_IU1_WORD1_HID1_EN_WORD ( 0x000000010f400001UL)
9234+#define PPU_XU_WORD0_HID1_EN_MASK (~0x00000001f038402cUL)
9235+#define PPU_XU_WORD0_HID1_EN_WORD ( 0x00000001f0080008UL)
9236+#define PPU_XU_WORD1_HID1_EN_MASK (~0x000000010f074023UL)
9237+#define PPU_XU_WORD1_HID1_EN_WORD ( 0x000000010f030002UL)
9238+
9239+/* The bus_word field in the cell_rtas_arg structure is a bit-mask
9240+ * indicating which debug-bus word(s) to use.
9241+ */
9242+enum {
9243+ BUS_WORD_0 = 1,
9244+ BUS_WORD_1 = 2,
9245+ BUS_WORD_2 = 4,
9246+ BUS_WORD_3 = 8,
9247+};
9248+
9249+/* Definitions of the signal-groups that the built-in signal-activation
9250+ * code can handle.
9251+ */
9252+enum {
9253+ SIG_GROUP_NONE = 0,
9254+
9255+ /* 2.x PowerPC Processor Unit (PPU) Signal Groups */
9256+ SIG_GROUP_PPU_BASE = 20,
9257+ SIG_GROUP_PPU_IU1 = 21,
9258+ SIG_GROUP_PPU_XU = 22,
9259+
9260+ /* 3.x PowerPC Storage Subsystem (PPSS) Signal Groups */
9261+ SIG_GROUP_PPSS_BASE = 30,
9262+
9263+ /* 4.x Synergistic Processor Unit (SPU) Signal Groups */
9264+ SIG_GROUP_SPU_BASE = 40,
9265+
9266+ /* 5.x Memory Flow Controller (MFC) Signal Groups */
9267+ SIG_GROUP_MFC_BASE = 50,
9268+
9269+	/* 6.x Element Interconnect Bus (EIB) Signal Groups */
9270+ SIG_GROUP_EIB_BASE = 60,
9271+
9272+ /* 7.x Memory Interface Controller (MIC) Signal Groups */
9273+ SIG_GROUP_MIC_BASE = 70,
9274+
9275+ /* 8.x Cell Broadband Engine Interface (BEI) Signal Groups */
9276+ SIG_GROUP_BEI_BASE = 80,
9277+};
9278+
9279+/**
9280+ * rmw_spr
9281+ *
9282+ * Read-modify-write for a special-purpose-register.
9283+ **/
9284+#define rmw_spr(spr_id, a_mask, o_mask) \
9285+ do { \
9286+ u64 value = mfspr(spr_id); \
9287+ value &= (u64)(a_mask); \
9288+ value |= (u64)(o_mask); \
9289+ mtspr((spr_id), value); \
9290+ } while (0)
9291+
9292+/**
9293+ * rmw_mmio_reg64
9294+ *
9295+ * Read-modify-write for a 64-bit MMIO register.
9296+ **/
9297+#define rmw_mmio_reg64(mem, a_mask, o_mask) \
9298+ do { \
9299+ u64 value = in_be64(&(mem)); \
9300+ value &= (u64)(a_mask); \
9301+ value |= (u64)(o_mask); \
9302+ out_be64(&(mem), value); \
9303+ } while (0)
9304+
9305+/**
9306+ * rmwb_mmio_reg64
9307+ *
9308+ * Set or unset a specified bit within a 64-bit MMIO register.
9309+ **/
9310+#define rmwb_mmio_reg64(mem, bit_num, set_bit) \
9311+ rmw_mmio_reg64((mem), ~(1UL << (63 - (bit_num))), \
9312+ ((set_bit) << (63 - (bit_num))))
9313+
9314+/**
9315+ * passthru
9316+ *
9317+ * Enable or disable passthru mode in all the Cell signal islands.
9318+ **/
9319+static int passthru(u32 cpu, u64 enable)
9320+{
9321+ struct cbe_ppe_priv_regs __iomem *ppe_priv_regs;
9322+ struct cbe_pmd_regs __iomem *pmd_regs;
9323+ struct cbe_mic_tm_regs __iomem *mic_tm_regs;
9324+ struct pfm_cell_platform_pmu_info *info =
9325+ ((struct pfm_arch_pmu_info *)
9326+ (pfm_pmu_conf->pmu_info))->platform_info;
9327+
9328+ ppe_priv_regs = info->get_cpu_ppe_priv_regs(cpu);
9329+ pmd_regs = info->get_cpu_pmd_regs(cpu);
9330+ mic_tm_regs = info->get_cpu_mic_tm_regs(cpu);
9331+
9332+ if (!ppe_priv_regs || !pmd_regs || !mic_tm_regs) {
9333+ PFM_ERR("Error getting Cell PPE, PMD, and MIC "
9334+ "register maps: 0x%p, 0x%p, 0x%p",
9335+ ppe_priv_regs, pmd_regs, mic_tm_regs);
9336+ return -EINVAL;
9337+ }
9338+
9339+ rmwb_mmio_reg64(ppe_priv_regs->L2_debug1, 61, enable);
9340+ rmwb_mmio_reg64(ppe_priv_regs->ciu_dr1, 5, enable);
9341+ rmwb_mmio_reg64(pmd_regs->on_ramp_trace, 39, enable);
9342+ rmwb_mmio_reg64(mic_tm_regs->MBL_debug, 20, enable);
9343+
9344+ return 0;
9345+}
9346+
9347+#define passthru_enable(cpu) passthru(cpu, 1)
9348+#define passthru_disable(cpu) passthru(cpu, 0)
9349+
9350+static inline void reset_signal_registers(u32 cpu)
9351+{
9352+ rmw_spr(SPRN_HID1, HID1_RESET_MASK, 0);
9353+}
9354+
9355+/**
9356+ * celleb_reset_signals
9357+ *
9358+ * Non-rtas version of resetting the debug-bus signals.
9359+ **/
9360+static int celleb_reset_signals(u32 cpu)
9361+{
9362+ int rc;
9363+ rc = passthru_disable(cpu);
9364+ if (!rc)
9365+ reset_signal_registers(cpu);
9366+ return rc;
9367+}
9368+
9369+/**
9370+ * ppu_selection
9371+ *
9372+ * Write the HID1 register to connect the specified PPU signal-group to the
9373+ * debug-bus.
9374+ **/
9375+static int ppu_selection(struct cell_rtas_arg *signal)
9376+{
9377+ u64 hid1_enable_word = 0;
9378+ u64 hid1_enable_mask = 0;
9379+
9380+ switch (signal->signal_group) {
9381+
9382+ case SIG_GROUP_PPU_IU1: /* 2.1 PPU Instruction Unit - Group 1 */
9383+ switch (signal->bus_word) {
9384+ case BUS_WORD_0:
9385+ hid1_enable_mask = PPU_IU1_WORD0_HID1_EN_MASK;
9386+ hid1_enable_word = PPU_IU1_WORD0_HID1_EN_WORD;
9387+ break;
9388+ case BUS_WORD_1:
9389+ hid1_enable_mask = PPU_IU1_WORD1_HID1_EN_MASK;
9390+ hid1_enable_word = PPU_IU1_WORD1_HID1_EN_WORD;
9391+ break;
9392+ default:
9393+ PFM_ERR("Invalid bus-word (0x%x) for signal-group %d.",
9394+ signal->bus_word, signal->signal_group);
9395+ return -EINVAL;
9396+ }
9397+ break;
9398+
9399+ case SIG_GROUP_PPU_XU: /* 2.2 PPU Execution Unit */
9400+ switch (signal->bus_word) {
9401+ case BUS_WORD_0:
9402+ hid1_enable_mask = PPU_XU_WORD0_HID1_EN_MASK;
9403+ hid1_enable_word = PPU_XU_WORD0_HID1_EN_WORD;
9404+ break;
9405+ case BUS_WORD_1:
9406+ hid1_enable_mask = PPU_XU_WORD1_HID1_EN_MASK;
9407+ hid1_enable_word = PPU_XU_WORD1_HID1_EN_WORD;
9408+ break;
9409+ default:
9410+ PFM_ERR("Invalid bus-word (0x%x) for signal-group %d.",
9411+ signal->bus_word, signal->signal_group);
9412+ return -EINVAL;
9413+ }
9414+ break;
9415+
9416+ default:
9417+ PFM_ERR("Signal-group %d not implemented.",
9418+ signal->signal_group);
9419+ return -EINVAL;
9420+ }
9421+
9422+ rmw_spr(SPRN_HID1, hid1_enable_mask, hid1_enable_word);
9423+
9424+ return 0;
9425+}
9426+
9427+/**
9428+ * celleb_activate_signals
9429+ *
9430+ * Non-rtas version of activating the debug-bus signals.
9431+ **/
9432+static int celleb_activate_signals(struct cell_rtas_arg *signals,
9433+ int num_signals)
9434+{
9435+ int i, rc = -EINVAL;
9436+
9437+ for (i = 0; i < num_signals; i++) {
9438+ switch (signals[i].signal_group) {
9439+
9440+ /* 2.x PowerPC Processor Unit (PPU) Signal Selection */
9441+ case SIG_GROUP_PPU_IU1:
9442+ case SIG_GROUP_PPU_XU:
9443+ rc = ppu_selection(signals + i);
9444+ if (rc)
9445+ return rc;
9446+ break;
9447+
9448+ default:
9449+ PFM_ERR("Signal-group %d not implemented.",
9450+ signals[i].signal_group);
9451+ return -EINVAL;
9452+ }
9453+ }
9454+
9455+ if (0 < i)
9456+ rc = passthru_enable(signals[0].cpu);
9457+
9458+ return rc;
9459+}
9460+
9461+/**
9462+ * ps3_reset_signals
9463+ *
9464+ * ps3 version of resetting the debug-bus signals.
9465+ **/
9466+static int ps3_reset_signals(u32 cpu)
9467+{
9468+#ifdef CONFIG_PPC_PS3
9469+ return ps3_set_signal(0, 0, 0, 0);
9470+#else
9471+ return 0;
9472+#endif
9473+}
9474+
9475+/**
9476+ * ps3_activate_signals
9477+ *
9478+ * ps3 version of activating the debug-bus signals.
9479+ **/
9480+static int ps3_activate_signals(struct cell_rtas_arg *signals,
9481+ int num_signals)
9482+{
9483+#ifdef CONFIG_PPC_PS3
9484+ int i;
9485+
9486+ for (i = 0; i < num_signals; i++)
9487+ ps3_set_signal(signals[i].signal_group, signals[i].bit,
9488+ signals[i].sub_unit, signals[i].bus_word);
9489+#endif
9490+ return 0;
9491+}
9492+
9493+
9494+/**
9495+ * reset_signals
9496+ *
9497+ * Call to the firmware (if available) to reset the debug-bus signals.
9498+ * Otherwise call the built-in version.
9499+ **/
9500+int reset_signals(u32 cpu)
9501+{
9502+ int rc;
9503+
9504+ if (machine_is(celleb))
9505+ rc = celleb_reset_signals(cpu);
9506+ else if (machine_is(ps3))
9507+ rc = ps3_reset_signals(cpu);
9508+ else
9509+ rc = rtas_reset_signals(cpu);
9510+
9511+ return rc;
9512+}
9513+
9514+/**
9515+ * activate_signals
9516+ *
9517+ * Call to the firmware (if available) to activate the debug-bus signals.
9518+ * Otherwise call the built-in version.
9519+ **/
9520+int activate_signals(struct cell_rtas_arg *signals, int num_signals)
9521+{
9522+ int rc;
9523+
9524+ if (machine_is(celleb))
9525+ rc = celleb_activate_signals(signals, num_signals);
9526+ else if (machine_is(ps3))
9527+ rc = ps3_activate_signals(signals, num_signals);
9528+ else
9529+ rc = rtas_activate_signals(signals, num_signals);
9530+
9531+ return rc;
9532+}
9533+
9534+/**
9535+ * pfm_cell_pmc_check
9536+ *
9537+ * Verify that we are going to write a valid value to the specified PMC.
9538+ **/
9539+int pfm_cell_pmc_check(struct pfm_context *ctx,
9540+ struct pfm_event_set *set,
9541+ struct pfarg_pmc *req)
9542+{
9543+ u16 cnum, reg_num = req->reg_num;
9544+ s16 signal_group = RTAS_SIGNAL_GROUP(req->reg_value);
9545+ u8 bus_word = RTAS_BUS_WORD(req->reg_value);
9546+
9547+ if (reg_num < NR_CTRS || reg_num >= (NR_CTRS * 2))
9548+ return -EINVAL;
9549+
9550+ switch (signal_group) {
9551+ case SIG_GROUP_PPU_IU1:
9552+ case SIG_GROUP_PPU_XU:
9553+ if ((bus_word != 0) && (bus_word != 1)) {
9554+ PFM_ERR("Invalid bus word (%d) for signal-group %d",
9555+ bus_word, signal_group);
9556+ return -EINVAL;
9557+ }
9558+ break;
9559+ default:
9560+ PFM_ERR("Signal-group %d not implemented.", signal_group);
9561+ return -EINVAL;
9562+ }
9563+
9564+ for (cnum = NR_CTRS; cnum < (NR_CTRS * 2); cnum++) {
9565+ if (test_bit(cnum, cast_ulp(set->used_pmcs)) &&
9566+ bus_word == RTAS_BUS_WORD(set->pmcs[cnum]) &&
9567+ signal_group != RTAS_SIGNAL_GROUP(set->pmcs[cnum])) {
9568+ PFM_ERR("Impossible signal-group combination: "
9569+ "(%u,%u,%d) (%u,%u,%d)",
9570+ reg_num, bus_word, signal_group, cnum,
9571+ RTAS_BUS_WORD(set->pmcs[cnum]),
9572+ RTAS_SIGNAL_GROUP(set->pmcs[cnum]));
9573+ return -EBUSY;
9574+ }
9575+ }
9576+
9577+ return 0;
9578+}
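
An example of the conflict this check rejects (register numbers assume NR_CTRS is 8, as the register tables above suggest):

/*
 * Example only: if pmc8 (pm0_event) already selects signal group 21 on
 * debug-bus word 0, a later write to pmc9 (pm1_event) selecting signal
 * group 22 on the same bus word 0 is refused with -EBUSY, because the
 * check allows only one signal group per debug-bus word.
 */
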
9579+
9580+/**
9581+ * write_pm07_event
9582+ *
9583+ * Pull out the RTAS arguments from the 64-bit register value and make the
9584+ * RTAS activate-signals call.
9585+ **/
9586+static void write_pm07_event(int cpu, unsigned int ctr, u64 value)
9587+{
9588+ struct cell_rtas_arg signal;
9589+ s32 signal_number;
9590+ int rc;
9591+
9592+ signal_number = RTAS_SIGNAL_NUMBER(value);
9593+ if (!signal_number) {
9594+ /* Don't include counters that are counting cycles. */
9595+ return;
9596+ }
9597+
9598+ signal.cpu = RTAS_CPU(cpu);
9599+ signal.bus_word = 1 << RTAS_BUS_WORD(value);
9600+ signal.sub_unit = RTAS_SUB_UNIT(value);
9601+ signal.signal_group = signal_number / 100;
9602+ signal.bit = abs(signal_number) % 100;
9603+
9604+ rc = activate_signals(&signal, 1);
9605+ if (rc) {
9606+ PFM_WARN("%s(%d, %u, %lu): Error calling "
9607+ "activate_signals(): %d\n", __func__,
9608+ cpu, ctr, (unsigned long)value, rc);
9609+ /* FIX: Could we change this routine to return an error? */
9610+ }
9611+}
9612+
9613+/**
9614+ * pfm_cell_probe_pmu
9615+ *
9616+ * Simply check the processor version register to see if we're currently
9617+ * on a Cell system.
9618+ **/
9619+static int pfm_cell_probe_pmu(void)
9620+{
9621+ unsigned long pvr = mfspr(SPRN_PVR);
9622+
9623+ if (PVR_VER(pvr) != PV_BE)
9624+ return -1;
9625+
9626+ return 0;
9627+}
9628+
9629+/**
9630+ * pfm_cell_write_pmc
9631+ **/
9632+static void pfm_cell_write_pmc(unsigned int cnum, u64 value)
9633+{
9634+ int cpu = smp_processor_id();
9635+ struct pfm_cell_platform_pmu_info *info =
9636+ ((struct pfm_arch_pmu_info *)
9637+ (pfm_pmu_conf->pmu_info))->platform_info;
9638+
9639+ if (cnum < NR_CTRS) {
9640+ info->write_pm07_control(cpu, cnum, value);
9641+
9642+ } else if (cnum < NR_CTRS * 2) {
9643+ write_pm07_event(cpu, cnum - NR_CTRS, value);
9644+
9645+ } else if (cnum == CELL_PMC_PM_STATUS) {
9646+ /* The pm_status register must be treated separately from
9647+ * the other "global" PMCs. This call will ensure that
9648+ * the interrupts are routed to the correct CPU, as well
9649+ * as writing the desired value to the pm_status register.
9650+ */
9651+ info->enable_pm_interrupts(cpu, info->get_hw_thread_id(cpu),
9652+ value);
9653+
9654+ } else if (cnum < PFM_PM_NUM_PMCS) {
9655+ info->write_pm(cpu, cnum - (NR_CTRS * 2), value);
9656+ }
9657+}
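
For orientation, the logical PMC numbering this switch relies on (assuming NR_CTRS is 8, as the register tables above suggest):

/*
 *	pmc  0.. 7   pm0_control .. pm7_control  -> write_pm07_control()
 *	pmc  8..15   pm0_event   .. pm7_event    -> write_pm07_event() / RTAS
 *	pmc 16..23   global registers in enum pm_reg_name order; pm_status
 *	             (pmc 20) is special-cased through enable_pm_interrupts(),
 *	             the rest go through write_pm()
 */
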
9658+
9659+/**
9660+ * pfm_cell_write_pmd
9661+ **/
9662+static void pfm_cell_write_pmd(unsigned int cnum, u64 value)
9663+{
9664+ int cpu = smp_processor_id();
9665+ struct pfm_cell_platform_pmu_info *info =
9666+ ((struct pfm_arch_pmu_info *)
9667+ (pfm_pmu_conf->pmu_info))->platform_info;
9668+
9669+ if (cnum < NR_CTRS)
9670+ info->write_ctr(cpu, cnum, value);
9671+}
9672+
9673+/**
9674+ * pfm_cell_read_pmd
9675+ **/
9676+static u64 pfm_cell_read_pmd(unsigned int cnum)
9677+{
9678+ int cpu = smp_processor_id();
9679+ struct pfm_cell_platform_pmu_info *info =
9680+ ((struct pfm_arch_pmu_info *)
9681+ (pfm_pmu_conf->pmu_info))->platform_info;
9682+
9683+ if (cnum < NR_CTRS)
9684+ return info->read_ctr(cpu, cnum);
9685+
9686+ return -EINVAL;
9687+}
9688+
9689+/**
9690+ * pfm_cell_enable_counters
9691+ *
9692+ * Just need to turn on the global enable bit in pm_control.
9693+ **/
9694+static void pfm_cell_enable_counters(struct pfm_context *ctx,
9695+ struct pfm_event_set *set)
9696+{
9697+ struct pfm_cell_platform_pmu_info *info =
9698+ ((struct pfm_arch_pmu_info *)
9699+ (pfm_pmu_conf->pmu_info))->platform_info;
9700+
9701+ info->enable_pm(smp_processor_id());
9702+}
9703+
9704+/**
9705+ * pfm_cell_disable_counters
9706+ *
9707+ * Just need to turn off the global enable bit in pm_control.
9708+ **/
9709+static void pfm_cell_disable_counters(struct pfm_context *ctx,
9710+ struct pfm_event_set *set)
9711+{
9712+ struct pfm_cell_platform_pmu_info *info =
9713+ ((struct pfm_arch_pmu_info *)
9714+ (pfm_pmu_conf->pmu_info))->platform_info;
9715+
9716+ info->disable_pm(smp_processor_id());
9717+ if (machine_is(ps3))
9718+ reset_signals(smp_processor_id());
9719+}
9720+
9721+/*
9722+ * Return the thread id of the specified ppu signal.
9723+ */
9724+static inline u32 get_target_ppu_thread_id(u32 group, u32 bit)
9725+{
9726+ if ((group == SIG_GROUP_PPU_IU1 &&
9727+ bit < PFM_PPU_IU1_THREAD1_BASE_BIT) ||
9728+ (group == SIG_GROUP_PPU_XU &&
9729+ bit < PFM_PPU_XU_THREAD1_BASE_BIT))
9730+ return 0;
9731+ else
9732+ return 1;
9733+}
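
A quick worked example of the thread split above, using the base-bit constants defined earlier (the bit numbers are arbitrary picks for illustration):

/*
 *	get_target_ppu_thread_id(SIG_GROUP_PPU_IU1,  5) == 0   (5  < 19)
 *	get_target_ppu_thread_id(SIG_GROUP_PPU_IU1, 20) == 1   (20 >= 19)
 *	get_target_ppu_thread_id(SIG_GROUP_PPU_XU,  16) == 1   (16 >= 16)
 */
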
9734+
9735+/*
9736+ * Return whether the specified counter is for a PPU signal group.
9737+ */
9738+static inline int is_counter_for_ppu_sig_grp(u32 counter_control, u32 sig_grp)
9739+{
9740+ if (!(counter_control & CBE_PM_CTR_INPUT_CONTROL) &&
9741+ (counter_control & CBE_PM_CTR_ENABLE) &&
9742+ ((sig_grp == SIG_GROUP_PPU_IU1) || (sig_grp == SIG_GROUP_PPU_XU)))
9743+ return 1;
9744+ else
9745+ return 0;
9746+}
9747+
9748+/*
9749+ * Search ppu signal groups.
9750+ */
9751+static int get_ppu_signal_groups(struct pfm_event_set *set,
9752+ u32 *ppu_sig_grp0, u32 *ppu_sig_grp1)
9753+{
9754+ u64 pm_event, *used_pmcs = set->used_pmcs;
9755+ int i, j;
9756+ u32 grp0_wd, grp1_wd, wd, sig_grp;
9757+
9758+ *ppu_sig_grp0 = 0;
9759+ *ppu_sig_grp1 = 0;
9760+ grp0_wd = PFM_GROUP_CONTROL_GROUP0_WORD(
9761+ set->pmcs[CELL_PMC_GROUP_CONTROL]);
9762+ grp1_wd = PFM_GROUP_CONTROL_GROUP1_WORD(
9763+ set->pmcs[CELL_PMC_GROUP_CONTROL]);
9764+
9765+ for (i = 0, j = 0; (i < NR_CTRS) && (j < PFM_NUM_OF_GROUPS); i++) {
9766+ if (test_bit(i + NR_CTRS, used_pmcs)) {
9767+ pm_event = set->pmcs[i + NR_CTRS];
9768+ wd = PFM_EVENT_PMC_BUS_WORD(pm_event);
9769+ sig_grp = PFM_EVENT_PMC_SIGNAL_GROUP(pm_event);
9770+ if ((sig_grp == SIG_GROUP_PPU_IU1) ||
9771+ (sig_grp == SIG_GROUP_PPU_XU)) {
9772+
9773+ if (wd == grp0_wd && *ppu_sig_grp0 == 0) {
9774+ *ppu_sig_grp0 = sig_grp;
9775+ j++;
9776+ } else if (wd == grp1_wd &&
9777+ *ppu_sig_grp1 == 0) {
9778+ *ppu_sig_grp1 = sig_grp;
9779+ j++;
9780+ }
9781+ }
9782+ }
9783+ }
9784+ return j;
9785+}
9786+
9787+/**
9788+ * pfm_cell_restore_pmcs
9789+ *
9790+ * Write all control register values that are saved in the specified event
9791+ * set. We could use the pfm_arch_write_pmc() function to restore each PMC
9792+ * individually (as is done in other architectures), but that results in
9793+ * multiple RTAS calls. As an optimization, we will setup the RTAS argument
9794+ * array so we can do all event-control registers in one RTAS call.
9795+ *
9796+ * In per-thread mode,
9797+ * the counter enable bit of the pmX_control PMC is set only while the target
9798+ * task runs on the target HW thread.
9799+ **/
9800+void pfm_cell_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set)
9801+{
9802+ u64 ctr_ctrl;
9803+ u64 *used_pmcs = set->used_pmcs;
9804+ int i;
9805+ int cpu = smp_processor_id();
9806+ u32 current_th_id;
9807+ struct pfm_cell_platform_pmu_info *info =
9808+ ((struct pfm_arch_pmu_info *)
9809+ (pfm_pmu_conf->pmu_info))->platform_info;
9810+
9811+ for (i = 0; i < NR_CTRS; i++) {
9812+ ctr_ctrl = set->pmcs[i];
9813+
9814+ if (ctr_ctrl & PFM_COUNTER_CTRL_PMC_PPU_TH0) {
9815+ current_th_id = info->get_hw_thread_id(cpu);
9816+
9817+ /*
9818+			 * Clear the counter enable bit if the current
9819+			 * HW thread is not 0.
9820+			 */
9821+ if (current_th_id)
9822+ ctr_ctrl = ctr_ctrl & ~CBE_PM_CTR_ENABLE;
9823+
9824+ } else if (ctr_ctrl & PFM_COUNTER_CTRL_PMC_PPU_TH1) {
9825+ current_th_id = info->get_hw_thread_id(cpu);
9826+
9827+ /*
9828+			 * Clear the counter enable bit if the current
9829+			 * HW thread is 0.
9830+			 */
9831+ if (!current_th_id)
9832+ ctr_ctrl = ctr_ctrl & ~CBE_PM_CTR_ENABLE;
9833+ }
9834+
9835+ /* Write the per-counter control register. If the PMC is not
9836+ * in use, then it will simply clear the register, which will
9837+ * disable the associated counter.
9838+ */
9839+ info->write_pm07_control(cpu, i, ctr_ctrl);
9840+
9841+ if (test_bit(i + NR_CTRS, used_pmcs))
9842+ write_pm07_event(cpu, 0, set->pmcs[i + NR_CTRS]);
9843+ }
9844+
9845+ /* Write all the global PMCs. Need to call pfm_cell_write_pmc()
9846+ * instead of cbe_write_pm() due to special handling for the
9847+ * pm_status register.
9848+ */
9849+ for (i *= 2; i < PFM_PM_NUM_PMCS; i++)
9850+ pfm_cell_write_pmc(i, set->pmcs[i]);
9851+}
9852+
9853+/**
9854+ * pfm_cell_restore_pmds
9855+ *
9856+ * Write the pm_control register before writing to the counter registers
9857+ * so that the counter width is known before the counters are written.
9858+ **/
9859+void pfm_cell_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set)
9860+{
9861+ u64 *used_pmds;
9862+ unsigned int i, max_pmd;
9863+ int cpu = smp_processor_id();
9864+ struct pfm_cell_platform_pmu_info *info =
9865+ ((struct pfm_arch_pmu_info *)
9866+ (pfm_pmu_conf->pmu_info))->platform_info;
9867+
9868+ /*
9869+ * Write pm_control register value
9870+ */
9871+ info->write_pm(cpu, pm_control,
9872+ set->pmcs[CELL_PMC_PM_CONTROL] &
9873+ ~CBE_PM_ENABLE_PERF_MON);
9874+ PFM_DBG("restore pm_control(0x%lx) before restoring pmds",
9875+ set->pmcs[CELL_PMC_PM_CONTROL]);
9876+
9877+ max_pmd = ctx->regs.max_pmd;
9878+ used_pmds = set->used_pmds;
9879+
9880+ for (i = 0; i < max_pmd; i++)
9881+ if (test_bit(i, used_pmds) &&
9882+ !(pfm_pmu_conf->pmd_desc[i].type & PFM_REG_RO))
9883+ pfm_cell_write_pmd(i, set->pmds[i].value);
9884+}
9885+
9886+/**
9887+ * pfm_cell_get_cntr_width
9888+ *
9889+ * This function checks the 16-bit counter field in the pm_control PMC.
9890+ *
9891+ * Return value
9892+ *	16 : all counters are 16 bits wide.
9893+ *	32 : all counters are 32 bits wide.
9894+ *	 0 : counters of mixed widths exist.
9895+ **/
9896+static int pfm_cell_get_cntr_width(struct pfm_context *ctx,
9897+ struct pfm_event_set *s)
9898+{
9899+ int width = 0;
9900+ int tmp = 0;
9901+ u64 cntr_field;
9902+
9903+ if (ctx->flags.switch_ovfl || ctx->flags.switch_time) {
9904+ list_for_each_entry(s, &ctx->set_list, list) {
9905+ cntr_field = s->pmcs[CELL_PMC_PM_CONTROL] &
9906+ CELL_PMC_PM_CONTROL_CNTR_MASK;
9907+
9908+ if (cntr_field == CELL_PMC_PM_CONTROL_CNTR_16)
9909+ tmp = 16;
9910+ else if (cntr_field == 0x0)
9911+ tmp = 32;
9912+ else
9913+ return 0;
9914+
9915+ if (tmp != width && width != 0)
9916+ return 0;
9917+
9918+ width = tmp;
9919+ }
9920+ } else {
9921+ cntr_field = s->pmcs[CELL_PMC_PM_CONTROL] &
9922+ CELL_PMC_PM_CONTROL_CNTR_MASK;
9923+
9924+ if (cntr_field == CELL_PMC_PM_CONTROL_CNTR_16)
9925+ width = 16;
9926+ else if (cntr_field == 0x0)
9927+ width = 32;
9928+ else
9929+ width = 0;
9930+ }
9931+ return width;
9932+}
9933+
9934+/**
 9935+ * pfm_cell_check_cntr_ovfl
9936+ *
9937+ * Return value
9938+ * 1 : cntr_ovfl interrupt is used.
9939+ * 0 : cntr_ovfl interrupt is not used.
9940+ **/
9941+static int pfm_cell_check_cntr_ovfl(struct pfm_context *ctx,
9942+ struct pfm_event_set *s)
9943+{
9944+ if (ctx->flags.switch_ovfl || ctx->flags.switch_time) {
9945+ list_for_each_entry(s, &ctx->set_list, list) {
9946+ if (CBE_PM_OVERFLOW_CTRS(s->pmcs[CELL_PMC_PM_STATUS]))
9947+ return 1;
9948+ }
9949+ } else {
9950+ if (CBE_PM_OVERFLOW_CTRS(s->pmcs[CELL_PMC_PM_STATUS]))
9951+ return 1;
9952+ }
9953+ return 0;
9954+}
9955+
9956+#ifdef CONFIG_PPC_PS3
9957+/**
9958+ * update_sub_unit_field
9959+ *
9960+ **/
9961+static inline u64 update_sub_unit_field(u64 pm_event, u64 spe_id)
9962+{
9963+ return ((pm_event & 0xFFFF0000FFFFFFFF) | (spe_id << 32));
9964+}
9965+
9966+/**
9967+ * pfm_get_spe_id
9968+ *
9969+ **/
9970+static u64 pfm_get_spe_id(void *arg)
9971+{
9972+ struct spu *spu = arg;
9973+ u64 spe_id;
9974+
9975+ if (machine_is(ps3))
9976+ spe_id = ps3_get_spe_id(arg);
9977+ else
9978+ spe_id = spu->spe_id;
9979+
9980+ return spe_id;
9981+}
9982+
9983+/**
9984+ * pfm_spu_number_to_id
9985+ *
9986+ **/
9987+static int pfm_spu_number_to_id(int number, u64 *spe_id)
9988+{
9989+ struct spu *spu;
9990+ int i;
9991+
9992+ for (i = 0; i < MAX_NUMNODES; i++) {
9993+ if (cbe_spu_info[i].n_spus == 0)
9994+ continue;
9995+
9996+ list_for_each_entry(spu, &cbe_spu_info[i].spus, cbe_list)
9997+ if (spu->number == number) {
9998+ *spe_id = pfm_get_spe_id(spu);
9999+ return 0;
10000+ }
10001+ }
10002+ return -ENODEV;
10003+}
10004+
10005+/**
10006+ * pfm_update_pmX_event_subunit_field
10007+ *
 10008+ * In system-wide mode, this function updates the subunit field of
 10009+ * the SPE pmX_event PMCs.
10010+ **/
10011+static int pfm_update_pmX_event_subunit_field(struct pfm_context *ctx)
10012+{
10013+ struct pfm_event_set *set;
10014+ int i, last_pmc, ret;
10015+ u64 signal_group, spe_id;
10016+ int sub_unit;
10017+ u64 *used_pmcs;
10018+
10019+ last_pmc = NR_CTRS + 8;
10020+ ret = 0;
10021+ list_for_each_entry(set, &ctx->set_list, list) {
10022+
10023+ used_pmcs = set->used_pmcs;
10024+ for (i = NR_CTRS; i < last_pmc; i++) {
10025+ if (!test_bit(i, used_pmcs))
10026+ continue;
10027+
10028+ signal_group = PFM_EVENT_PMC_SIGNAL_GROUP(set->pmcs[i]);
10029+
10030+ /*
10031+ * If the target event is a SPE signal group event,
 10032+ * the sub_unit field in the pmX_event pmc is changed to the
10033+ * specified spe_id.
10034+ */
10035+ if (SIG_GROUP_SPU_BASE < signal_group &&
10036+ signal_group < SIG_GROUP_EIB_BASE) {
10037+ sub_unit = RTAS_SUB_UNIT(set->pmcs[i]);
10038+
10039+ ret = pfm_spu_number_to_id(sub_unit, &spe_id);
10040+ if (ret)
10041+ return ret;
10042+
10043+ set->pmcs[i] = update_sub_unit_field(
10044+ set->pmcs[i], spe_id);
10045+ }
10046+ }
10047+ }
10048+ return 0;
10049+}
10050+#endif
10051+
10052+/**
10053+ * pfm_cell_load_context
10054+ *
10055+ * In per-thread mode,
 10056+ * the pmX_control PMCs which are used for PPU IU/XU events are marked
 10057+ * with the thread id (PFM_COUNTER_CTRL_PMC_PPU_TH0/TH1).
10058+ **/
10059+static int pfm_cell_load_context(struct pfm_context *ctx)
10060+{
10061+ int i;
10062+ u32 ppu_sig_grp[PFM_NUM_OF_GROUPS] = {SIG_GROUP_NONE, SIG_GROUP_NONE};
10063+ u32 bit;
10064+ int index;
10065+ u32 target_th_id;
10066+ int ppu_sig_num = 0;
10067+ struct pfm_event_set *s;
10068+ int cntr_width = 32;
10069+ int ret = 0;
10070+
10071+ if (pfm_cell_check_cntr_ovfl(ctx, ctx->active_set)) {
10072+ cntr_width = pfm_cell_get_cntr_width(ctx, ctx->active_set);
10073+
10074+ /*
 10075+ * The counter overflow interrupt only works with 32-bit counters,
 10076+ * because the perfmon core uses pfm_cell_pmu_conf.counter_width
 10077+ * to deal with the counter overflow. We can't change the
 10078+ * counter width here.
10079+ */
10080+ if (cntr_width != 32)
10081+ return -EINVAL;
10082+ }
10083+
10084+ if (ctx->flags.system) {
10085+#ifdef CONFIG_PPC_PS3
10086+ if (machine_is(ps3))
10087+ ret = pfm_update_pmX_event_subunit_field(ctx);
10088+#endif
10089+ return ret;
10090+ }
10091+
10092+ list_for_each_entry(s, &ctx->set_list, list) {
10093+ ppu_sig_num = get_ppu_signal_groups(s, &ppu_sig_grp[0],
10094+ &ppu_sig_grp[1]);
10095+
10096+ for (i = 0; i < NR_CTRS; i++) {
10097+ index = PFM_PM_CTR_INPUT_MUX_GROUP_INDEX(s->pmcs[i]);
10098+ if (ppu_sig_num &&
10099+ (ppu_sig_grp[index] != SIG_GROUP_NONE) &&
10100+ is_counter_for_ppu_sig_grp(s->pmcs[i],
10101+ ppu_sig_grp[index])) {
10102+
10103+ bit = PFM_PM_CTR_INPUT_MUX_BIT(s->pmcs[i]);
10104+ target_th_id = get_target_ppu_thread_id(
10105+ ppu_sig_grp[index], bit);
10106+ if (!target_th_id)
10107+ s->pmcs[i] |=
10108+ PFM_COUNTER_CTRL_PMC_PPU_TH0;
10109+ else
10110+ s->pmcs[i] |=
10111+ PFM_COUNTER_CTRL_PMC_PPU_TH1;
10112+ PFM_DBG("set:%d mark ctr:%d target_thread:%d",
10113+ s->id, i, target_th_id);
10114+ }
10115+ }
10116+ }
10117+
10118+ return ret;
10119+}
10120+
10121+/**
10122+ * pfm_cell_unload_context
10123+ *
10124+ * For system-wide contexts and self-monitored contexts, make the RTAS call
10125+ * to reset the debug-bus signals.
10126+ *
10127+ * For non-self-monitored contexts, the monitored thread will already have
10128+ * been taken off the CPU and we don't need to do anything additional.
10129+ **/
10130+static void pfm_cell_unload_context(struct pfm_context *ctx)
10131+{
10132+ if (ctx->task == current || ctx->flags.system)
10133+ reset_signals(smp_processor_id());
10134+}
10135+
10136+/**
10137+ * pfm_cell_ctxswout_thread
10138+ *
10139+ * When a monitored thread is switched out (self-monitored or externally
10140+ * monitored) we need to reset the debug-bus signals so the next context that
10141+ * gets switched in can start from a clean set of signals.
10142+ **/
10143+int pfm_cell_ctxswout_thread(struct task_struct *task,
10144+ struct pfm_context *ctx, struct pfm_event_set *set)
10145+{
10146+ reset_signals(smp_processor_id());
10147+ return 0;
10148+}
10149+
10150+/**
10151+ * pfm_cell_get_ovfl_pmds
10152+ *
10153+ * Determine which counters in this set have overflowed and fill in the
10154+ * set->povfl_pmds mask and set->npend_ovfls count. On Cell, the pm_status
10155+ * register contains a bit for each counter to indicate overflow. However,
 10156+ * those 8 bits are in the reverse order from what Perfmon2 expects,
10157+ * so we need to reverse the order of the overflow bits.
10158+ **/
10159+static void pfm_cell_get_ovfl_pmds(struct pfm_context *ctx,
10160+ struct pfm_event_set *set)
10161+{
10162+ struct pfm_arch_context *ctx_arch = pfm_ctx_arch(ctx);
10163+ u32 pm_status, ovfl_ctrs;
10164+ u64 povfl_pmds = 0;
10165+ int i;
10166+ struct pfm_cell_platform_pmu_info *info =
10167+ ((struct pfm_arch_pmu_info *)
10168+ (pfm_pmu_conf->pmu_info))->platform_info;
10169+
10170+ if (!ctx_arch->last_read_updated)
10171+ /* This routine was not called via the interrupt handler.
10172+ * Need to start by getting interrupts and updating
10173+ * last_read_pm_status.
10174+ */
10175+ ctx_arch->last_read_pm_status =
10176+ info->get_and_clear_pm_interrupts(smp_processor_id());
10177+
10178+ /* Reset the flag that the interrupt handler last read pm_status. */
10179+ ctx_arch->last_read_updated = 0;
10180+
10181+ pm_status = ctx_arch->last_read_pm_status &
10182+ set->pmcs[CELL_PMC_PM_STATUS];
10183+ ovfl_ctrs = CBE_PM_OVERFLOW_CTRS(pm_status);
10184+
10185+ /* Reverse the order of the bits in ovfl_ctrs
10186+ * and store the result in povfl_pmds.
10187+ */
10188+ for (i = 0; i < PFM_PM_NUM_PMDS; i++) {
10189+ povfl_pmds = (povfl_pmds << 1) | (ovfl_ctrs & 1);
10190+ ovfl_ctrs >>= 1;
10191+ }
10192+
10193+ /* Mask povfl_pmds with set->used_pmds to get set->povfl_pmds.
10194+ * Count the bits set in set->povfl_pmds to get set->npend_ovfls.
10195+ */
10196+ bitmap_and(set->povfl_pmds, &povfl_pmds,
10197+ set->used_pmds, PFM_PM_NUM_PMDS);
10198+ set->npend_ovfls = bitmap_weight(set->povfl_pmds, PFM_PM_NUM_PMDS);
10199+}
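
/*
 * Illustrative sketch (not part of the patch): the bit reversal done in
 * pfm_cell_get_ovfl_pmds() above.  Assuming CBE_PM_OVERFLOW_CTRS()
 * yields the eight overflow bits right-justified with pmd0 in the most
 * significant of them (per the layout comment in the irq handler), the
 * loop mirrors them so that pmd0 ends up at bit 0 of povfl_pmds.
 * Standalone C with arbitrary sample values.
 */
#include <assert.h>
#include <stdint.h>

static uint64_t sketch_reverse_ovfl_bits(uint32_t ovfl_ctrs, int nr_pmds)
{
	uint64_t povfl_pmds = 0;
	int i;

	for (i = 0; i < nr_pmds; i++) {
		povfl_pmds = (povfl_pmds << 1) | (ovfl_ctrs & 1);
		ovfl_ctrs >>= 1;
	}
	return povfl_pmds;
}

int main(void)
{
	assert(sketch_reverse_ovfl_bits(0x80, 8) == 0x01);	/* MSB -> bit 0 */
	assert(sketch_reverse_ovfl_bits(0x01, 8) == 0x80);	/* bit 0 -> MSB */
	assert(sketch_reverse_ovfl_bits(0xA0, 8) == 0x05);
	return 0;
}
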
10200+
10201+/**
10202+ * pfm_cell_acquire_pmu
10203+ *
10204+ * acquire PMU resource.
10205+ * This acquisition is done when the first context is created.
10206+ **/
10207+int pfm_cell_acquire_pmu(u64 *unavail_pmcs, u64 *unavail_pmds)
10208+{
10209+#ifdef CONFIG_PPC_PS3
10210+ int ret;
10211+
10212+ if (machine_is(ps3)) {
10213+ PFM_DBG("");
10214+ ret = ps3_lpm_open(PS3_LPM_TB_TYPE_INTERNAL, NULL, 0);
10215+ if (ret) {
10216+ PFM_ERR("Can't create PS3 lpm. error:%d", ret);
10217+ return -EFAULT;
10218+ }
10219+ }
10220+#endif
10221+ return 0;
10222+}
10223+
10224+/**
10225+ * pfm_cell_release_pmu
10226+ *
10227+ * release PMU resource.
10228+ * actual release happens when last context is destroyed
10229+ **/
10230+void pfm_cell_release_pmu(void)
10231+{
10232+#ifdef CONFIG_PPC_PS3
10233+ if (machine_is(ps3)) {
10234+ if (ps3_lpm_close())
10235+ PFM_ERR("Can't delete PS3 lpm.");
10236+ }
10237+#endif
10238+}
10239+
10240+/**
10241+ * handle_trace_buffer_interrupts
10242+ *
10243+ * This routine is for processing just the interval timer and trace buffer
10244+ * overflow interrupts. Performance counter interrupts are handled by the
10245+ * perf_irq_handler() routine, which reads and saves the pm_status register.
10246+ * This routine should not read the actual pm_status register, but rather
10247+ * the value passed in.
10248+ **/
10249+static void handle_trace_buffer_interrupts(unsigned long iip,
10250+ struct pt_regs *regs,
10251+ struct pfm_context *ctx,
10252+ u32 pm_status)
10253+{
10254+ /* FIX: Currently ignoring trace-buffer interrupts. */
10255+ return;
10256+}
10257+
10258+/**
10259+ * pfm_cell_irq_handler
10260+ *
10261+ * Handler for all Cell performance-monitor interrupts.
10262+ **/
10263+static void pfm_cell_irq_handler(struct pt_regs *regs, struct pfm_context *ctx)
10264+{
10265+ struct pfm_arch_context *ctx_arch = pfm_ctx_arch(ctx);
10266+ u32 last_read_pm_status;
10267+ int cpu = smp_processor_id();
10268+ struct pfm_cell_platform_pmu_info *info =
10269+ ((struct pfm_arch_pmu_info *)
10270+ (pfm_pmu_conf->pmu_info))->platform_info;
10271+
10272+ /* Need to disable and reenable the performance counters to get the
10273+ * desired behavior from the hardware. This is specific to the Cell
10274+ * PMU hardware.
10275+ */
10276+ info->disable_pm(cpu);
10277+
10278+ /* Read the pm_status register to get the interrupt bits. If a
 10279+ * performance counter overflow interrupt occurred, call the core
10280+ * perfmon interrupt handler to service the counter overflow. If the
10281+ * interrupt was for the interval timer or the trace_buffer,
10282+ * call the interval timer and trace buffer interrupt handler.
10283+ *
10284+ * The value read from the pm_status register is stored in the
 10285+ * pfm_arch_context structure for use by other routines. Note that
10286+ * reading the pm_status register resets the interrupt flags to zero.
10287+ * Hence, it is important that the register is only read in one place.
10288+ *
10289+ * The pm_status reg interrupt reg format is:
10290+ * [pmd0:pmd1:pmd2:pmd3:pmd4:pmd5:pmd6:pmd7:intt:tbf:tbu:]
 10291+ * - pmd0 to pmd7 are the perf counter overflow interrupts.
 10292+ * - intt is the interval timer overflow interrupt.
10293+ * - tbf is the trace buffer full interrupt.
10294+ * - tbu is the trace buffer underflow interrupt.
10295+ * - The pmd0 bit is the MSB of the 32 bit register.
10296+ */
10297+ ctx_arch->last_read_pm_status = last_read_pm_status =
10298+ info->get_and_clear_pm_interrupts(cpu);
10299+
10300+ /* Set flag for pfm_cell_get_ovfl_pmds() routine so it knows
10301+ * last_read_pm_status was updated by the interrupt handler.
10302+ */
10303+ ctx_arch->last_read_updated = 1;
10304+
10305+ if (last_read_pm_status & CBE_PM_ALL_OVERFLOW_INTR)
10306+ /* At least one counter overflowed. */
10307+ pfm_interrupt_handler(instruction_pointer(regs), regs);
10308+
10309+ if (last_read_pm_status & (CBE_PM_INTERVAL_INTR |
10310+ CBE_PM_TRACE_BUFFER_FULL_INTR |
10311+ CBE_PM_TRACE_BUFFER_UNDERFLOW_INTR))
10312+ /* Trace buffer or interval timer overflow. */
10313+ handle_trace_buffer_interrupts(instruction_pointer(regs),
10314+ regs, ctx, last_read_pm_status);
10315+
 10316+ /* The interrupt setting is the value written to the pm_status
10317+ * register. It is saved in the context when the register is
10318+ * written.
10319+ */
10320+ info->enable_pm_interrupts(cpu, info->get_hw_thread_id(cpu),
10321+ ctx->active_set->pmcs[CELL_PMC_PM_STATUS]);
10322+
 10323+ /* Writes to the various performance counters only write to a
10324+ * latch. The new values (interrupt setting bits, reset counter value
10325+ * etc.) are not copied to the actual registers until the performance
10326+ * monitor is enabled. In order to get this to work as desired, the
 10327+ * performance monitor needs to be disabled while writing to the
10328+ * latches. This is a HW design issue.
10329+ */
10330+ info->enable_pm(cpu);
10331+}
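
/*
 * Illustrative sketch (not part of the patch): decoding the pm_status
 * interrupt word under the layout described in the comment above
 * (pmd0..pmd7 in the top eight bits with pmd0 at bit 31, followed by
 * intt, tbf and tbu).  The masks below are derived from that comment
 * only and are assumptions; the authoritative definitions are the
 * CBE_PM_* macros.  Standalone C.
 */
#include <stdint.h>
#include <stdio.h>

#define SKETCH_PM_OVFL_MASK	0xff000000u	/* pmd0..pmd7, pmd0 = bit 31 */
#define SKETCH_PM_INTT		(1u << 23)	/* interval timer overflow */
#define SKETCH_PM_TBF		(1u << 22)	/* trace buffer full */
#define SKETCH_PM_TBU		(1u << 21)	/* trace buffer underflow */

static void sketch_decode_pm_status(uint32_t status)
{
	if (status & SKETCH_PM_OVFL_MASK)
		printf("counter overflow bits (raw, pmd0 in MSB): 0x%02x\n",
		       (status >> 24) & 0xff);
	if (status & (SKETCH_PM_INTT | SKETCH_PM_TBF | SKETCH_PM_TBU))
		printf("interval timer / trace buffer interrupt pending\n");
}

int main(void)
{
	sketch_decode_pm_status(0x80000000u);	/* pmd0 overflowed */
	sketch_decode_pm_status(0x00800000u);	/* interval timer expired */
	return 0;
}
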
10332+
10333+
10334+static struct pfm_cell_platform_pmu_info ps3_platform_pmu_info = {
10335+#ifdef CONFIG_PPC_PS3
10336+ .read_ctr = ps3_read_ctr,
10337+ .write_ctr = ps3_write_ctr,
10338+ .write_pm07_control = ps3_write_pm07_control,
10339+ .write_pm = ps3_write_pm,
10340+ .enable_pm = ps3_enable_pm,
10341+ .disable_pm = ps3_disable_pm,
10342+ .enable_pm_interrupts = ps3_enable_pm_interrupts,
10343+ .get_and_clear_pm_interrupts = ps3_get_and_clear_pm_interrupts,
10344+ .get_hw_thread_id = ps3_get_hw_thread_id,
10345+ .get_cpu_ppe_priv_regs = NULL,
10346+ .get_cpu_pmd_regs = NULL,
10347+ .get_cpu_mic_tm_regs = NULL,
10348+ .rtas_token = NULL,
10349+ .rtas_call = NULL,
10350+#endif
10351+};
10352+
10353+static struct pfm_cell_platform_pmu_info native_platform_pmu_info = {
10354+#ifdef CONFIG_PPC_CELL_NATIVE
10355+ .read_ctr = cbe_read_ctr,
10356+ .write_ctr = cbe_write_ctr,
10357+ .write_pm07_control = cbe_write_pm07_control,
10358+ .write_pm = cbe_write_pm,
10359+ .enable_pm = cbe_enable_pm,
10360+ .disable_pm = cbe_disable_pm,
10361+ .enable_pm_interrupts = cbe_enable_pm_interrupts,
10362+ .get_and_clear_pm_interrupts = cbe_get_and_clear_pm_interrupts,
10363+ .get_hw_thread_id = cbe_get_hw_thread_id,
10364+ .get_cpu_ppe_priv_regs = cbe_get_cpu_ppe_priv_regs,
10365+ .get_cpu_pmd_regs = cbe_get_cpu_pmd_regs,
10366+ .get_cpu_mic_tm_regs = cbe_get_cpu_mic_tm_regs,
10367+ .rtas_token = rtas_token,
10368+ .rtas_call = rtas_call,
10369+#endif
10370+};
10371+
10372+static struct pfm_arch_pmu_info pfm_cell_pmu_info = {
10373+ .pmu_style = PFM_POWERPC_PMU_CELL,
10374+ .acquire_pmu = pfm_cell_acquire_pmu,
10375+ .release_pmu = pfm_cell_release_pmu,
10376+ .write_pmc = pfm_cell_write_pmc,
10377+ .write_pmd = pfm_cell_write_pmd,
10378+ .read_pmd = pfm_cell_read_pmd,
10379+ .enable_counters = pfm_cell_enable_counters,
10380+ .disable_counters = pfm_cell_disable_counters,
10381+ .irq_handler = pfm_cell_irq_handler,
10382+ .get_ovfl_pmds = pfm_cell_get_ovfl_pmds,
10383+ .restore_pmcs = pfm_cell_restore_pmcs,
10384+ .restore_pmds = pfm_cell_restore_pmds,
10385+ .ctxswout_thread = pfm_cell_ctxswout_thread,
10386+ .load_context = pfm_cell_load_context,
10387+ .unload_context = pfm_cell_unload_context,
10388+};
10389+
10390+static struct pfm_pmu_config pfm_cell_pmu_conf = {
10391+ .pmu_name = "Cell",
10392+ .version = "0.1",
10393+ .counter_width = 32,
10394+ .pmd_desc = pfm_cell_pmd_desc,
10395+ .pmc_desc = pfm_cell_pmc_desc,
10396+ .num_pmc_entries = PFM_PM_NUM_PMCS,
10397+ .num_pmd_entries = PFM_PM_NUM_PMDS,
10398+ .probe_pmu = pfm_cell_probe_pmu,
10399+ .pmu_info = &pfm_cell_pmu_info,
10400+ .flags = PFM_PMU_BUILTIN_FLAG,
10401+ .owner = THIS_MODULE,
10402+};
10403+
10404+/**
10405+ * pfm_cell_platform_probe
10406+ *
10407+ * If we're on a system without the firmware rtas call available, set up the
10408+ * PMC write-checker for all the pmX_event control registers.
10409+ **/
10410+static void pfm_cell_platform_probe(void)
10411+{
10412+ if (machine_is(celleb)) {
10413+ int cnum;
10414+ pfm_cell_pmu_conf.pmc_write_check = pfm_cell_pmc_check;
10415+ for (cnum = NR_CTRS; cnum < (NR_CTRS * 2); cnum++)
10416+ pfm_cell_pmc_desc[cnum].type |= PFM_REG_WC;
10417+ }
10418+
10419+ if (machine_is(ps3))
10420+ pfm_cell_pmu_info.platform_info = &ps3_platform_pmu_info;
10421+ else
10422+ pfm_cell_pmu_info.platform_info = &native_platform_pmu_info;
10423+}
10424+
10425+static int __init pfm_cell_pmu_init_module(void)
10426+{
10427+ pfm_cell_platform_probe();
10428+ return pfm_pmu_register(&pfm_cell_pmu_conf);
10429+}
10430+
10431+static void __exit pfm_cell_pmu_cleanup_module(void)
10432+{
10433+ pfm_pmu_unregister(&pfm_cell_pmu_conf);
10434+}
10435+
10436+module_init(pfm_cell_pmu_init_module);
10437+module_exit(pfm_cell_pmu_cleanup_module);
10438--- /dev/null
10439+++ b/arch/powerpc/perfmon/perfmon_power4.c
10440@@ -0,0 +1,309 @@
10441+/*
10442+ * This file contains the POWER4 PMU register description tables
10443+ * and pmc checker used by perfmon.c.
10444+ *
10445+ * Copyright (c) 2007, IBM Corporation.
10446+ *
10447+ * Based on a simple modification of perfmon_power5.c for POWER4 by
10448+ * Corey Ashford <cjashfor@us.ibm.com>.
10449+ *
10450+ * This program is free software; you can redistribute it and/or
10451+ * modify it under the terms of version 2 of the GNU General Public
10452+ * License as published by the Free Software Foundation.
10453+ *
10454+ * This program is distributed in the hope that it will be useful,
10455+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
10456+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
10457+ * General Public License for more details.
10458+ *
10459+ * You should have received a copy of the GNU General Public License
10460+ * along with this program; if not, write to the Free Software
10461+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
10462+ * 02111-1307 USA
10463+ */
10464+#include <linux/module.h>
10465+#include <linux/perfmon_kern.h>
10466+
10467+MODULE_AUTHOR("Corey Ashford <cjashfor@us.ibm.com>");
10468+MODULE_DESCRIPTION("POWER4 PMU description table");
10469+MODULE_LICENSE("GPL");
10470+
10471+static struct pfm_regmap_desc pfm_power4_pmc_desc[] = {
10472+/* mmcr0 */ PMC_D(PFM_REG_I, "MMCR0", MMCR0_FC, 0, 0, SPRN_MMCR0),
10473+/* mmcr1 */ PMC_D(PFM_REG_I, "MMCR1", 0, 0, 0, SPRN_MMCR1),
10474+/* mmcra */ PMC_D(PFM_REG_I, "MMCRA", 0, 0, 0, SPRN_MMCRA)
10475+};
10476+#define PFM_PM_NUM_PMCS ARRAY_SIZE(pfm_power4_pmc_desc)
10477+
10478+/* The TB and PURR registers are read-only. Also, note that the TB register
10479+ * actually consists of both the 32-bit SPRN_TBRU and SPRN_TBRL registers.
10480+ * For Perfmon2's purposes, we'll treat it as a single 64-bit register.
10481+ */
10482+static struct pfm_regmap_desc pfm_power4_pmd_desc[] = {
10483+/* tb */ PMD_D((PFM_REG_I|PFM_REG_RO), "TB", SPRN_TBRL),
10484+/* pmd1 */ PMD_D(PFM_REG_C, "PMC1", SPRN_PMC1),
10485+/* pmd2 */ PMD_D(PFM_REG_C, "PMC2", SPRN_PMC2),
10486+/* pmd3 */ PMD_D(PFM_REG_C, "PMC3", SPRN_PMC3),
10487+/* pmd4 */ PMD_D(PFM_REG_C, "PMC4", SPRN_PMC4),
10488+/* pmd5 */ PMD_D(PFM_REG_C, "PMC5", SPRN_PMC5),
10489+/* pmd6 */ PMD_D(PFM_REG_C, "PMC6", SPRN_PMC6),
10490+/* pmd7 */ PMD_D(PFM_REG_C, "PMC7", SPRN_PMC7),
10491+/* pmd8 */ PMD_D(PFM_REG_C, "PMC8", SPRN_PMC8)
10492+};
10493+#define PFM_PM_NUM_PMDS ARRAY_SIZE(pfm_power4_pmd_desc)
10494+
10495+static int pfm_power4_probe_pmu(void)
10496+{
10497+ unsigned long pvr = mfspr(SPRN_PVR);
10498+ int ver = PVR_VER(pvr);
10499+
10500+ if ((ver == PV_POWER4) || (ver == PV_POWER4p))
10501+ return 0;
10502+
10503+ return -1;
10504+}
10505+
10506+static void pfm_power4_write_pmc(unsigned int cnum, u64 value)
10507+{
10508+ switch (pfm_pmu_conf->pmc_desc[cnum].hw_addr) {
10509+ case SPRN_MMCR0:
10510+ mtspr(SPRN_MMCR0, value);
10511+ break;
10512+ case SPRN_MMCR1:
10513+ mtspr(SPRN_MMCR1, value);
10514+ break;
10515+ case SPRN_MMCRA:
10516+ mtspr(SPRN_MMCRA, value);
10517+ break;
10518+ default:
10519+ BUG();
10520+ }
10521+}
10522+
10523+static void pfm_power4_write_pmd(unsigned int cnum, u64 value)
10524+{
10525+ u64 ovfl_mask = pfm_pmu_conf->ovfl_mask;
10526+
10527+ switch (pfm_pmu_conf->pmd_desc[cnum].hw_addr) {
10528+ case SPRN_PMC1:
10529+ mtspr(SPRN_PMC1, value & ovfl_mask);
10530+ break;
10531+ case SPRN_PMC2:
10532+ mtspr(SPRN_PMC2, value & ovfl_mask);
10533+ break;
10534+ case SPRN_PMC3:
10535+ mtspr(SPRN_PMC3, value & ovfl_mask);
10536+ break;
10537+ case SPRN_PMC4:
10538+ mtspr(SPRN_PMC4, value & ovfl_mask);
10539+ break;
10540+ case SPRN_PMC5:
10541+ mtspr(SPRN_PMC5, value & ovfl_mask);
10542+ break;
10543+ case SPRN_PMC6:
10544+ mtspr(SPRN_PMC6, value & ovfl_mask);
10545+ break;
10546+ case SPRN_PMC7:
10547+ mtspr(SPRN_PMC7, value & ovfl_mask);
10548+ break;
10549+ case SPRN_PMC8:
10550+ mtspr(SPRN_PMC8, value & ovfl_mask);
10551+ break;
10552+ case SPRN_TBRL:
10553+ case SPRN_PURR:
10554+ /* Ignore writes to read-only registers. */
10555+ break;
10556+ default:
10557+ BUG();
10558+ }
10559+}
10560+
10561+static u64 pfm_power4_read_pmd(unsigned int cnum)
10562+{
10563+ switch (pfm_pmu_conf->pmd_desc[cnum].hw_addr) {
10564+ case SPRN_PMC1:
10565+ return mfspr(SPRN_PMC1);
10566+ case SPRN_PMC2:
10567+ return mfspr(SPRN_PMC2);
10568+ case SPRN_PMC3:
10569+ return mfspr(SPRN_PMC3);
10570+ case SPRN_PMC4:
10571+ return mfspr(SPRN_PMC4);
10572+ case SPRN_PMC5:
10573+ return mfspr(SPRN_PMC5);
10574+ case SPRN_PMC6:
10575+ return mfspr(SPRN_PMC6);
10576+ case SPRN_PMC7:
10577+ return mfspr(SPRN_PMC7);
10578+ case SPRN_PMC8:
10579+ return mfspr(SPRN_PMC8);
10580+ case SPRN_TBRL:
10581+ return ((u64)mfspr(SPRN_TBRU) << 32) | mfspr(SPRN_TBRL);
10582+ case SPRN_PURR:
10583+ if (cpu_has_feature(CPU_FTR_PURR))
10584+ return mfspr(SPRN_PURR);
10585+ else
10586+ return 0;
10587+ default:
10588+ BUG();
10589+ }
10590+}
10591+
10592+/* forward decl */
10593+static void pfm_power4_disable_counters(struct pfm_context *ctx,
10594+ struct pfm_event_set *set);
10595+
10596+/**
10597+ * pfm_power4_enable_counters
10598+ *
10599+ **/
10600+static void pfm_power4_enable_counters(struct pfm_context *ctx,
10601+ struct pfm_event_set *set)
10602+{
10603+ unsigned int i, max_pmc;
10604+
10605+ /* Make sure the counters are disabled before touching the other
10606+ control registers */
10607+ pfm_power4_disable_counters(ctx, set);
10608+
10609+ max_pmc = ctx->regs.max_pmc;
10610+
10611+ /* Write MMCR0 last, and a fairly easy way to do this is to write
10612+ the registers in the reverse order */
10613+ for (i = max_pmc; i != 0; i--)
10614+ if (test_bit(i - 1, set->used_pmcs))
10615+ pfm_power4_write_pmc(i - 1, set->pmcs[i - 1]);
10616+}
10617+
10618+/**
10619+ * pfm_power4_disable_counters
10620+ *
10621+ **/
10622+static void pfm_power4_disable_counters(struct pfm_context *ctx,
10623+ struct pfm_event_set *set)
10624+{
10625+ /* Set the Freeze Counters bit */
10626+ mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) | MMCR0_FC);
10627+ asm volatile ("sync");
10628+}
10629+
10630+/**
10631+ * pfm_power4_get_ovfl_pmds
10632+ *
10633+ * Determine which counters in this set have overflowed and fill in the
10634+ * set->povfl_pmds mask and set->npend_ovfls count.
10635+ **/
10636+static void pfm_power4_get_ovfl_pmds(struct pfm_context *ctx,
10637+ struct pfm_event_set *set)
10638+{
10639+ unsigned int i;
10640+ unsigned int max_pmd = ctx->regs.max_intr_pmd;
10641+ u64 *used_pmds = set->used_pmds;
10642+ u64 *cntr_pmds = ctx->regs.cnt_pmds;
 10643+ u64 width_mask = (u64)1 << pfm_pmu_conf->counter_width;
10644+ u64 new_val, mask[PFM_PMD_BV];
10645+
10646+ bitmap_and(cast_ulp(mask), cast_ulp(cntr_pmds),
10647+ cast_ulp(used_pmds), max_pmd);
10648+
10649+ for (i = 0; i < max_pmd; i++) {
10650+ if (test_bit(i, mask)) {
10651+ new_val = pfm_power4_read_pmd(i);
10652+ if (new_val & width_mask) {
10653+ set_bit(i, set->povfl_pmds);
10654+ set->npend_ovfls++;
10655+ }
10656+ }
10657+ }
10658+}
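
/*
 * Illustrative sketch (not part of the patch): the overflow test used
 * in pfm_power4_get_ovfl_pmds() above.  With counter_width = 31,
 * width_mask selects bit 31, so a counter value is flagged as
 * overflowed once it reaches 2^31 (the point at which the hardware
 * raises the interrupt).  Standalone C.
 */
#include <assert.h>
#include <stdint.h>

int main(void)
{
	unsigned int counter_width = 31;
	uint64_t width_mask = (uint64_t)1 << counter_width;

	assert(!(UINT64_C(0x7fffffff) & width_mask));	/* 2^31 - 1: not yet */
	assert(UINT64_C(0x80000000) & width_mask);	/* 2^31: overflow pending */
	return 0;
}
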
10659+
10660+static void pfm_power4_irq_handler(struct pt_regs *regs,
10661+ struct pfm_context *ctx)
10662+{
10663+ u32 mmcr0;
10664+
 10665+ /* Disable the counters (set the freeze bit) to avoid polluting
10666+ * the counts.
10667+ */
10668+ mmcr0 = mfspr(SPRN_MMCR0);
10669+ mtspr(SPRN_MMCR0, (mmcr0 | MMCR0_FC));
10670+
10671+ /* Set the PMM bit (see comment below). */
10672+ mtmsrd(mfmsr() | MSR_PMM);
10673+
10674+ pfm_interrupt_handler(instruction_pointer(regs), regs);
10675+
10676+ mmcr0 = mfspr(SPRN_MMCR0);
10677+
10678+ /*
10679+ * Reset the perfmon trigger if
10680+ * not in masking mode.
10681+ */
10682+ if (ctx->state != PFM_CTX_MASKED)
10683+ mmcr0 |= MMCR0_PMXE;
10684+
10685+ /*
10686+ * We must clear the PMAO bit on some (GQ) chips. Just do it
10687+ * all the time.
10688+ */
10689+ mmcr0 &= ~MMCR0_PMAO;
10690+
10691+ /*
10692+ * Now clear the freeze bit, counting will not start until we
10693+ * rfid from this exception, because only at that point will
10694+ * the PMM bit be cleared.
10695+ */
10696+ mmcr0 &= ~MMCR0_FC;
10697+ mtspr(SPRN_MMCR0, mmcr0);
10698+}
10699+
10700+static void pfm_power4_resend_irq(struct pfm_context *ctx)
10701+{
10702+ /*
10703+ * Assert the PMAO bit to cause a PMU interrupt. Make sure we
10704+ * trigger the edge detection circuitry for PMAO
10705+ */
10706+ mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) & ~MMCR0_PMAO);
10707+ mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) | MMCR0_PMAO);
10708+}
10709+
10710+struct pfm_arch_pmu_info pfm_power4_pmu_info = {
10711+ .pmu_style = PFM_POWERPC_PMU_POWER4,
10712+ .write_pmc = pfm_power4_write_pmc,
10713+ .write_pmd = pfm_power4_write_pmd,
10714+ .read_pmd = pfm_power4_read_pmd,
10715+ .irq_handler = pfm_power4_irq_handler,
10716+ .get_ovfl_pmds = pfm_power4_get_ovfl_pmds,
10717+ .enable_counters = pfm_power4_enable_counters,
10718+ .disable_counters = pfm_power4_disable_counters,
10719+ .resend_irq = pfm_power4_resend_irq
10720+};
10721+
10722+/*
10723+ * impl_pmcs, impl_pmds are computed at runtime to minimize errors!
10724+ */
10725+static struct pfm_pmu_config pfm_power4_pmu_conf = {
10726+ .pmu_name = "POWER4",
10727+ .counter_width = 31,
10728+ .pmd_desc = pfm_power4_pmd_desc,
10729+ .pmc_desc = pfm_power4_pmc_desc,
10730+ .num_pmc_entries = PFM_PM_NUM_PMCS,
10731+ .num_pmd_entries = PFM_PM_NUM_PMDS,
10732+ .probe_pmu = pfm_power4_probe_pmu,
10733+ .pmu_info = &pfm_power4_pmu_info,
10734+ .flags = PFM_PMU_BUILTIN_FLAG,
10735+ .owner = THIS_MODULE
10736+};
10737+
10738+static int __init pfm_power4_pmu_init_module(void)
10739+{
10740+ return pfm_pmu_register(&pfm_power4_pmu_conf);
10741+}
10742+
10743+static void __exit pfm_power4_pmu_cleanup_module(void)
10744+{
10745+ pfm_pmu_unregister(&pfm_power4_pmu_conf);
10746+}
10747+
10748+module_init(pfm_power4_pmu_init_module);
10749+module_exit(pfm_power4_pmu_cleanup_module);
10750--- /dev/null
10751+++ b/arch/powerpc/perfmon/perfmon_power5.c
10752@@ -0,0 +1,326 @@
10753+/*
10754+ * This file contains the POWER5 PMU register description tables
10755+ * and pmc checker used by perfmon.c.
10756+ *
10757+ * Copyright (c) 2005 David Gibson, IBM Corporation.
10758+ *
10759+ * Based on perfmon_p6.c:
10760+ * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P.
10761+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
10762+ *
10763+ * This program is free software; you can redistribute it and/or
10764+ * modify it under the terms of version 2 of the GNU General Public
10765+ * License as published by the Free Software Foundation.
10766+ *
10767+ * This program is distributed in the hope that it will be useful,
10768+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
10769+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
10770+ * General Public License for more details.
10771+ *
10772+ * You should have received a copy of the GNU General Public License
10773+ * along with this program; if not, write to the Free Software
10774+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
10775+ * 02111-1307 USA
10776+ */
10777+#include <linux/module.h>
10778+#include <linux/perfmon_kern.h>
10779+
10780+MODULE_AUTHOR("David Gibson <dwg@au1.ibm.com>");
10781+MODULE_DESCRIPTION("POWER5 PMU description table");
10782+MODULE_LICENSE("GPL");
10783+
10784+static struct pfm_regmap_desc pfm_power5_pmc_desc[] = {
10785+/* mmcr0 */ PMC_D(PFM_REG_I, "MMCR0", MMCR0_FC, 0, 0, SPRN_MMCR0),
10786+/* mmcr1 */ PMC_D(PFM_REG_I, "MMCR1", 0, 0, 0, SPRN_MMCR1),
10787+/* mmcra */ PMC_D(PFM_REG_I, "MMCRA", 0, 0, 0, SPRN_MMCRA)
10788+};
10789+#define PFM_PM_NUM_PMCS ARRAY_SIZE(pfm_power5_pmc_desc)
10790+
10791+/* The TB and PURR registers are read-only. Also, note that the TB register
10792+ * actually consists of both the 32-bit SPRN_TBRU and SPRN_TBRL registers.
10793+ * For Perfmon2's purposes, we'll treat it as a single 64-bit register.
10794+ */
10795+static struct pfm_regmap_desc pfm_power5_pmd_desc[] = {
10796+/* tb */ PMD_D((PFM_REG_I|PFM_REG_RO), "TB", SPRN_TBRL),
10797+/* pmd1 */ PMD_D(PFM_REG_C, "PMC1", SPRN_PMC1),
10798+/* pmd2 */ PMD_D(PFM_REG_C, "PMC2", SPRN_PMC2),
10799+/* pmd3 */ PMD_D(PFM_REG_C, "PMC3", SPRN_PMC3),
10800+/* pmd4 */ PMD_D(PFM_REG_C, "PMC4", SPRN_PMC4),
10801+/* pmd5 */ PMD_D(PFM_REG_C, "PMC5", SPRN_PMC5),
10802+/* pmd6 */ PMD_D(PFM_REG_C, "PMC6", SPRN_PMC6),
10803+/* purr */ PMD_D((PFM_REG_I|PFM_REG_RO), "PURR", SPRN_PURR),
10804+};
10805+#define PFM_PM_NUM_PMDS ARRAY_SIZE(pfm_power5_pmd_desc)
10806+
10807+/* forward decl */
10808+static void pfm_power5_disable_counters(struct pfm_context *ctx,
10809+ struct pfm_event_set *set);
10810+
10811+static int pfm_power5_probe_pmu(void)
10812+{
10813+ unsigned long pvr = mfspr(SPRN_PVR);
10814+
10815+ switch (PVR_VER(pvr)) {
10816+ case PV_POWER5:
10817+ return 0;
10818+ case PV_POWER5p:
10819+ return (PVR_REV(pvr) < 0x300) ? 0 : -1;
10820+ default:
10821+ return -1;
10822+ }
10823+}
10824+
10825+static void pfm_power5_write_pmc(unsigned int cnum, u64 value)
10826+{
10827+ switch (pfm_pmu_conf->pmc_desc[cnum].hw_addr) {
10828+ case SPRN_MMCR0:
10829+ mtspr(SPRN_MMCR0, value);
10830+ break;
10831+ case SPRN_MMCR1:
10832+ mtspr(SPRN_MMCR1, value);
10833+ break;
10834+ case SPRN_MMCRA:
10835+ mtspr(SPRN_MMCRA, value);
10836+ break;
10837+ default:
10838+ BUG();
10839+ }
10840+}
10841+
10842+static void pfm_power5_write_pmd(unsigned int cnum, u64 value)
10843+{
10844+ u64 ovfl_mask = pfm_pmu_conf->ovfl_mask;
10845+
10846+ switch (pfm_pmu_conf->pmd_desc[cnum].hw_addr) {
10847+ case SPRN_PMC1:
10848+ mtspr(SPRN_PMC1, value & ovfl_mask);
10849+ break;
10850+ case SPRN_PMC2:
10851+ mtspr(SPRN_PMC2, value & ovfl_mask);
10852+ break;
10853+ case SPRN_PMC3:
10854+ mtspr(SPRN_PMC3, value & ovfl_mask);
10855+ break;
10856+ case SPRN_PMC4:
10857+ mtspr(SPRN_PMC4, value & ovfl_mask);
10858+ break;
10859+ case SPRN_PMC5:
10860+ mtspr(SPRN_PMC5, value & ovfl_mask);
10861+ break;
10862+ case SPRN_PMC6:
10863+ mtspr(SPRN_PMC6, value & ovfl_mask);
10864+ break;
10865+ case SPRN_TBRL:
10866+ case SPRN_PURR:
10867+ /* Ignore writes to read-only registers. */
10868+ break;
10869+ default:
10870+ BUG();
10871+ }
10872+}
10873+
10874+static u64 pfm_power5_read_pmd(unsigned int cnum)
10875+{
10876+ switch (pfm_pmu_conf->pmd_desc[cnum].hw_addr) {
10877+ case SPRN_PMC1:
10878+ return mfspr(SPRN_PMC1);
10879+ case SPRN_PMC2:
10880+ return mfspr(SPRN_PMC2);
10881+ case SPRN_PMC3:
10882+ return mfspr(SPRN_PMC3);
10883+ case SPRN_PMC4:
10884+ return mfspr(SPRN_PMC4);
10885+ case SPRN_PMC5:
10886+ return mfspr(SPRN_PMC5);
10887+ case SPRN_PMC6:
10888+ return mfspr(SPRN_PMC6);
10889+ case SPRN_TBRL:
10890+ return ((u64)mfspr(SPRN_TBRU) << 32) | mfspr(SPRN_TBRL);
10891+ case SPRN_PURR:
10892+ if (cpu_has_feature(CPU_FTR_PURR))
10893+ return mfspr(SPRN_PURR);
10894+ else
10895+ return 0;
10896+ default:
10897+ BUG();
10898+ }
10899+}
10900+
10901+/**
10902+ * pfm_power5_enable_counters
10903+ *
10904+ **/
10905+static void pfm_power5_enable_counters(struct pfm_context *ctx,
10906+ struct pfm_event_set *set)
10907+{
10908+ unsigned int i, max_pmc;
10909+
10910+ /*
10911+ * Make sure the counters are disabled before touching the
10912+ * other control registers
10913+ */
10914+ pfm_power5_disable_counters(ctx, set);
10915+
10916+ max_pmc = ctx->regs.max_pmc;
10917+
10918+ /*
10919+ * Write MMCR0 last, and a fairly easy way to do
10920+ * this is to write the registers in the reverse
10921+ * order
10922+ */
10923+ for (i = max_pmc; i != 0; i--)
10924+ if (test_bit(i - 1, set->used_pmcs))
10925+ pfm_power5_write_pmc(i - 1, set->pmcs[i - 1]);
10926+}
10927+
10928+/**
10929+ * pfm_power5_disable_counters
10930+ *
 10931+ * Just need to set the freeze bit in MMCR0 to stop the counters.
10932+ **/
10933+static void pfm_power5_disable_counters(struct pfm_context *ctx,
10934+ struct pfm_event_set *set)
10935+{
10936+ /* Set the Freeze Counters bit */
10937+ mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) | MMCR0_FC);
10938+ asm volatile ("sync");
10939+}
10940+
10941+/**
10942+ * pfm_power5_get_ovfl_pmds
10943+ *
10944+ * Determine which counters in this set have overflowed and fill in the
10945+ * set->povfl_pmds mask and set->npend_ovfls count.
10946+ **/
10947+static void pfm_power5_get_ovfl_pmds(struct pfm_context *ctx,
10948+ struct pfm_event_set *set)
10949+{
10950+ unsigned int i;
10951+ unsigned int max = ctx->regs.max_intr_pmd;
10952+ u64 *used_pmds = set->used_pmds;
10953+ u64 *intr_pmds = ctx->regs.intr_pmds;
 10954+ u64 width_mask = (u64)1 << pfm_pmu_conf->counter_width;
10955+ u64 new_val, mask[PFM_PMD_BV];
10956+
10957+ bitmap_and(cast_ulp(mask), cast_ulp(intr_pmds),
10958+ cast_ulp(used_pmds), max);
10959+ /*
10960+ * If either PMC5 or PMC6 are not being used, just zero out the unused
10961+ * ones so that they won't interrupt again for another 2^31 counts.
10962+ * Note that if no other counters overflowed, set->npend_ovfls will
10963+ * be zero upon returning from this call (i.e. a spurious
10964+ * interrupt), but that should be ok.
10965+ *
10966+ * If neither PMC5 nor PMC6 are used, the counters should be frozen
10967+ * via MMCR0_FC5_6 and zeroed out.
10968+ *
10969+ * If both PMC5 and PMC6 are used, they can be handled correctly by
10970+ * the loop that follows.
10971+ */
10972+
10973+ if (!test_bit(5, cast_ulp(used_pmds)))
10974+ mtspr(SPRN_PMC5, 0);
10975+ if (!test_bit(6, cast_ulp(used_pmds)))
10976+ mtspr(SPRN_PMC6, 0);
10977+
10978+ for (i = 0; i < max; i++) {
10979+ if (test_bit(i, mask)) {
10980+ new_val = pfm_power5_read_pmd(i);
10981+ if (new_val & width_mask) {
10982+ set_bit(i, set->povfl_pmds);
10983+ set->npend_ovfls++;
10984+ }
10985+ }
10986+ }
10987+}
10988+
10989+static void pfm_power5_irq_handler(struct pt_regs *regs,
10990+ struct pfm_context *ctx)
10991+{
10992+ u32 mmcr0;
10993+
 10994+ /* Disable the counters (set the freeze bit) to avoid polluting
10995+ * the counts.
10996+ */
10997+ mmcr0 = mfspr(SPRN_MMCR0);
10998+ mtspr(SPRN_MMCR0, (mmcr0 | MMCR0_FC));
10999+
11000+ /* Set the PMM bit (see comment below). */
11001+ mtmsrd(mfmsr() | MSR_PMM);
11002+
11003+ pfm_interrupt_handler(instruction_pointer(regs), regs);
11004+
11005+ mmcr0 = mfspr(SPRN_MMCR0);
11006+
11007+ /*
11008+ * Reset the perfmon trigger if
11009+ * not in masking mode.
11010+ */
11011+ if (ctx->state != PFM_CTX_MASKED)
11012+ mmcr0 |= MMCR0_PMXE;
11013+
11014+ /*
11015+ * We must clear the PMAO bit on some (GQ) chips. Just do it
11016+ * all the time.
11017+ */
11018+ mmcr0 &= ~MMCR0_PMAO;
11019+
11020+ /*
11021+ * Now clear the freeze bit, counting will not start until we
11022+ * rfid from this exception, because only at that point will
11023+ * the PMM bit be cleared.
11024+ */
11025+ mmcr0 &= ~MMCR0_FC;
11026+ mtspr(SPRN_MMCR0, mmcr0);
11027+}
11028+
11029+static void pfm_power5_resend_irq(struct pfm_context *ctx)
11030+{
11031+ /*
11032+ * Assert the PMAO bit to cause a PMU interrupt. Make sure we
11033+ * trigger the edge detection circuitry for PMAO
11034+ */
11035+ mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) & ~MMCR0_PMAO);
11036+ mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) | MMCR0_PMAO);
11037+}
11038+
11039+struct pfm_arch_pmu_info pfm_power5_pmu_info = {
11040+ .pmu_style = PFM_POWERPC_PMU_POWER5,
11041+ .write_pmc = pfm_power5_write_pmc,
11042+ .write_pmd = pfm_power5_write_pmd,
11043+ .read_pmd = pfm_power5_read_pmd,
11044+ .irq_handler = pfm_power5_irq_handler,
11045+ .get_ovfl_pmds = pfm_power5_get_ovfl_pmds,
11046+ .enable_counters = pfm_power5_enable_counters,
11047+ .disable_counters = pfm_power5_disable_counters,
11048+ .resend_irq = pfm_power5_resend_irq
11049+};
11050+
11051+/*
11052+ * impl_pmcs, impl_pmds are computed at runtime to minimize errors!
11053+ */
11054+static struct pfm_pmu_config pfm_power5_pmu_conf = {
11055+ .pmu_name = "POWER5",
11056+ .counter_width = 31,
11057+ .pmd_desc = pfm_power5_pmd_desc,
11058+ .pmc_desc = pfm_power5_pmc_desc,
11059+ .num_pmc_entries = PFM_PM_NUM_PMCS,
11060+ .num_pmd_entries = PFM_PM_NUM_PMDS,
11061+ .probe_pmu = pfm_power5_probe_pmu,
11062+ .pmu_info = &pfm_power5_pmu_info,
11063+ .flags = PFM_PMU_BUILTIN_FLAG,
11064+ .owner = THIS_MODULE
11065+};
11066+
11067+static int __init pfm_power5_pmu_init_module(void)
11068+{
11069+ return pfm_pmu_register(&pfm_power5_pmu_conf);
11070+}
11071+
11072+static void __exit pfm_power5_pmu_cleanup_module(void)
11073+{
11074+ pfm_pmu_unregister(&pfm_power5_pmu_conf);
11075+}
11076+
11077+module_init(pfm_power5_pmu_init_module);
11078+module_exit(pfm_power5_pmu_cleanup_module);
11079--- /dev/null
11080+++ b/arch/powerpc/perfmon/perfmon_power6.c
11081@@ -0,0 +1,520 @@
11082+/*
11083+ * This file contains the POWER6 PMU register description tables
11084+ * and pmc checker used by perfmon.c.
11085+ *
11086+ * Copyright (c) 2007, IBM Corporation
11087+ *
11088+ * Based on perfmon_power5.c, and written by Carl Love <carll@us.ibm.com>
11089+ * and Kevin Corry <kevcorry@us.ibm.com>. Some fixes and refinement by
11090+ * Corey Ashford <cjashfor@us.ibm.com>
11091+ *
11092+ * This program is free software; you can redistribute it and/or
11093+ * modify it under the terms of version 2 of the GNU General Public
11094+ * License as published by the Free Software Foundation.
11095+ *
11096+ * This program is distributed in the hope that it will be useful,
11097+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
11098+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11099+ * General Public License for more details.
11100+ *
11101+ * You should have received a copy of the GNU General Public License
11102+ * along with this program; if not, write to the Free Software
11103+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
11104+ * 02111-1307 USA
11105+ */
11106+#include <linux/module.h>
11107+#include <linux/perfmon_kern.h>
11108+
11109+MODULE_AUTHOR("Corey Ashford <cjashfor@us.ibm.com>");
11110+MODULE_DESCRIPTION("POWER6 PMU description table");
11111+MODULE_LICENSE("GPL");
11112+
11113+static struct pfm_regmap_desc pfm_power6_pmc_desc[] = {
11114+/* mmcr0 */ PMC_D(PFM_REG_I, "MMCR0", MMCR0_FC, 0, 0, SPRN_MMCR0),
11115+/* mmcr1 */ PMC_D(PFM_REG_I, "MMCR1", 0, 0, 0, SPRN_MMCR1),
11116+/* mmcra */ PMC_D(PFM_REG_I, "MMCRA", 0, 0, 0, SPRN_MMCRA)
11117+};
11118+#define PFM_PM_NUM_PMCS ARRAY_SIZE(pfm_power6_pmc_desc)
11119+#define PFM_DELTA_TB 10000 /* Not real registers */
11120+#define PFM_DELTA_PURR 10001
11121+
11122+/*
 11123+ * Counters wrap to zero when they would reach 2^32. Note: an
 11124+ * interrupt is generated at the transition from 2^31-1 to 2^31.
11125+ */
11126+#define OVERFLOW_VALUE 0x100000000UL
11127+
11128+/* The TB and PURR registers are read-only. Also, note that the TB register
11129+ * actually consists of both the 32-bit SPRN_TBRU and SPRN_TBRL registers.
11130+ * For Perfmon2's purposes, we'll treat it as a single 64-bit register.
11131+ */
11132+static struct pfm_regmap_desc pfm_power6_pmd_desc[] = {
 11133+ /* On POWER6, PMC5 and PMC6 are not writable, they do not
11134+ * generate interrupts, and do not qualify their counts
11135+ * based on problem mode, supervisor mode or hypervisor mode.
11136+ * These two counters are implemented as virtual counters
 11137+ * to make them appear to work like the other counters. A
 11138+ * kernel timer is used to sample the real PMC5 and PMC6 and
11139+ * update the virtual counters.
11140+ */
11141+/* tb */ PMD_D((PFM_REG_I|PFM_REG_RO), "TB", SPRN_TBRL),
11142+/* pmd1 */ PMD_D(PFM_REG_C, "PMC1", SPRN_PMC1),
11143+/* pmd2 */ PMD_D(PFM_REG_C, "PMC2", SPRN_PMC2),
11144+/* pmd3 */ PMD_D(PFM_REG_C, "PMC3", SPRN_PMC3),
11145+/* pmd4 */ PMD_D(PFM_REG_C, "PMC4", SPRN_PMC4),
11146+/* pmd5 */ PMD_D((PFM_REG_I|PFM_REG_V), "PMC5", SPRN_PMC5),
11147+/* pmd6 */ PMD_D((PFM_REG_I|PFM_REG_V), "PMC6", SPRN_PMC6),
11148+/* purr */ PMD_D((PFM_REG_I|PFM_REG_RO), "PURR", SPRN_PURR),
 11149+/* delta tb */ PMD_D((PFM_REG_I|PFM_REG_V), "DELTA_TB", PFM_DELTA_TB),
 11150+/* delta purr */ PMD_D((PFM_REG_I|PFM_REG_V), "DELTA_PURR", PFM_DELTA_PURR),
11151+};
11152+
11153+#define PFM_PM_NUM_PMDS ARRAY_SIZE(pfm_power6_pmd_desc)
11154+
11155+u32 pmc5_start_save[NR_CPUS];
11156+u32 pmc6_start_save[NR_CPUS];
11157+
11158+static struct timer_list pmc5_6_update[NR_CPUS];
11159+u64 enable_cntrs_cnt;
11160+u64 disable_cntrs_cnt;
11161+u64 call_delta;
11162+u64 pm5_6_interrupt;
11163+u64 pm1_4_interrupt;
 11164+/* We need ctx_arch in the kernel timer handler, but it can't be looked
 11165+ * up from within the timer context, so it is cached per CPU.
11166+ */
11167+struct pfm_arch_context *pmc5_6_ctx_arch[NR_CPUS];
11168+long int update_time;
11169+
11170+static void delta(int cpu_num, struct pfm_arch_context *ctx_arch)
11171+{
11172+ u32 tmp5, tmp6;
11173+
11174+ call_delta++;
11175+
11176+ tmp5 = (u32) mfspr(SPRN_PMC5);
11177+ tmp6 = (u32) mfspr(SPRN_PMC6);
11178+
11179+ /*
11180+ * The following difference calculation relies on 32-bit modular
11181+ * arithmetic for the deltas to come out correct (especially in the
11182+ * presence of a 32-bit counter wrap).
11183+ */
11184+ ctx_arch->powergs_pmc5 += (u64)(tmp5 - pmc5_start_save[cpu_num]);
11185+ ctx_arch->powergs_pmc6 += (u64)(tmp6 - pmc6_start_save[cpu_num]);
11186+
11187+ pmc5_start_save[cpu_num] = tmp5;
11188+ pmc6_start_save[cpu_num] = tmp6;
11189+
11190+ return;
11191+}
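
/*
 * Illustrative sketch (not part of the patch): why the u32 subtraction
 * in delta() above stays correct across a hardware counter wrap.  The
 * difference is taken modulo 2^32, so as long as fewer than 2^32 events
 * occur between two samples the accumulated value is exact.
 * Standalone C with arbitrary sample values.
 */
#include <assert.h>
#include <stdint.h>

static uint64_t sketch_accumulate(uint64_t total, uint32_t prev, uint32_t now)
{
	return total + (uint32_t)(now - prev);	/* modular difference */
}

int main(void)
{
	/* counter wrapped from 0xfffffff0 to 0x00000010: 0x20 new events */
	assert(sketch_accumulate(0, 0xfffffff0u, 0x00000010u) == 0x20);
	/* no wrap: plain difference */
	assert(sketch_accumulate(100, 0x1000u, 0x1040u) == 100 + 0x40);
	return 0;
}
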
11192+
11193+
11194+static void pmc5_6_updater(unsigned long cpu_num)
11195+{
11196+ /* update the virtual pmd 5 and pmd 6 counters */
11197+
11198+ delta(cpu_num, pmc5_6_ctx_arch[cpu_num]);
11199+ mod_timer(&pmc5_6_update[cpu_num], jiffies + update_time);
11200+}
11201+
11202+
11203+static int pfm_power6_probe_pmu(void)
11204+{
11205+ unsigned long pvr = mfspr(SPRN_PVR);
11206+
11207+ switch (PVR_VER(pvr)) {
11208+ case PV_POWER6:
11209+ return 0;
11210+ case PV_POWER5p:
11211+ /* If this is a POWER5+ and the revision is less than 0x300,
11212+ don't treat it as a POWER6. */
11213+ return (PVR_REV(pvr) < 0x300) ? -1 : 0;
11214+ default:
11215+ return -1;
11216+ }
11217+}
11218+
11219+static void pfm_power6_write_pmc(unsigned int cnum, u64 value)
11220+{
11221+ switch (pfm_pmu_conf->pmc_desc[cnum].hw_addr) {
11222+ case SPRN_MMCR0:
11223+ mtspr(SPRN_MMCR0, value);
11224+ break;
11225+ case SPRN_MMCR1:
11226+ mtspr(SPRN_MMCR1, value);
11227+ break;
11228+ case SPRN_MMCRA:
11229+ mtspr(SPRN_MMCRA, value);
11230+ break;
11231+ default:
11232+ BUG();
11233+ }
11234+}
11235+
11236+static void pfm_power6_write_pmd(unsigned int cnum, u64 value)
11237+{
 11238+ /* On POWER6, PMC5 and PMC6 are implemented as
11239+ * virtual counters. See comment in pfm_power6_pmd_desc
11240+ * definition.
11241+ */
11242+ u64 ovfl_mask = pfm_pmu_conf->ovfl_mask;
11243+
11244+ switch (pfm_pmu_conf->pmd_desc[cnum].hw_addr) {
11245+ case SPRN_PMC1:
11246+ mtspr(SPRN_PMC1, value & ovfl_mask);
11247+ break;
11248+ case SPRN_PMC2:
11249+ mtspr(SPRN_PMC2, value & ovfl_mask);
11250+ break;
11251+ case SPRN_PMC3:
11252+ mtspr(SPRN_PMC3, value & ovfl_mask);
11253+ break;
11254+ case SPRN_PMC4:
11255+ mtspr(SPRN_PMC4, value & ovfl_mask);
11256+ break;
11257+ case SPRN_TBRL:
11258+ case SPRN_PURR:
11259+ /* Ignore writes to read-only registers. */
11260+ break;
11261+ default:
11262+ BUG();
11263+ }
11264+}
11265+
11266+static u64 pfm_power6_sread(struct pfm_context *ctx, unsigned int cnum)
11267+{
11268+ struct pfm_arch_context *ctx_arch = pfm_ctx_arch(ctx);
11269+ int cpu_num = smp_processor_id();
11270+
 11271+ /* On POWER6, PMC5 and PMC6 are implemented as
11272+ * virtual counters. See comment in pfm_power6_pmd_desc
11273+ * definition.
11274+ */
11275+
11276+ switch (pfm_pmu_conf->pmd_desc[cnum].hw_addr) {
11277+ case SPRN_PMC5:
11278+ return ctx_arch->powergs_pmc5 + (u64)((u32)mfspr(SPRN_PMC5) - pmc5_start_save[cpu_num]);
11279+ break;
11280+
11281+ case SPRN_PMC6:
11282+ return ctx_arch->powergs_pmc6 + (u64)((u32)mfspr(SPRN_PMC6) - pmc6_start_save[cpu_num]);
11283+ break;
11284+
11285+ case PFM_DELTA_TB:
11286+ return ctx_arch->delta_tb
11287+ + (((u64)mfspr(SPRN_TBRU) << 32) | mfspr(SPRN_TBRL))
11288+ - ctx_arch->delta_tb_start;
11289+ break;
11290+
11291+ case PFM_DELTA_PURR:
11292+ return ctx_arch->delta_purr
11293+ + mfspr(SPRN_PURR)
11294+ - ctx_arch->delta_purr_start;
11295+ break;
11296+
11297+ default:
11298+ BUG();
11299+ }
11300+}
11301+
11302+void pfm_power6_swrite(struct pfm_context *ctx, unsigned int cnum,
11303+ u64 val)
11304+{
11305+ struct pfm_arch_context *ctx_arch = pfm_ctx_arch(ctx);
11306+ int cpu_num = smp_processor_id();
11307+
11308+ switch (pfm_pmu_conf->pmd_desc[cnum].hw_addr) {
11309+ case SPRN_PMC5:
11310+ pmc5_start_save[cpu_num] = mfspr(SPRN_PMC5);
11311+ ctx_arch->powergs_pmc5 = val;
11312+ break;
11313+
11314+ case SPRN_PMC6:
11315+ pmc6_start_save[cpu_num] = mfspr(SPRN_PMC6);
11316+ ctx_arch->powergs_pmc6 = val;
11317+ break;
11318+
11319+ case PFM_DELTA_TB:
11320+ ctx_arch->delta_tb_start =
11321+ (((u64)mfspr(SPRN_TBRU) << 32) | mfspr(SPRN_TBRL));
11322+ ctx_arch->delta_tb = val;
11323+ break;
11324+
11325+ case PFM_DELTA_PURR:
11326+ ctx_arch->delta_purr_start = mfspr(SPRN_PURR);
11327+ ctx_arch->delta_purr = val;
11328+ break;
11329+
11330+ default:
11331+ BUG();
11332+ }
11333+}
11334+
11335+static u64 pfm_power6_read_pmd(unsigned int cnum)
11336+{
11337+ switch (pfm_pmu_conf->pmd_desc[cnum].hw_addr) {
11338+ case SPRN_PMC1:
11339+ return mfspr(SPRN_PMC1);
11340+ case SPRN_PMC2:
11341+ return mfspr(SPRN_PMC2);
11342+ case SPRN_PMC3:
11343+ return mfspr(SPRN_PMC3);
11344+ case SPRN_PMC4:
11345+ return mfspr(SPRN_PMC4);
11346+ case SPRN_TBRL:
11347+ return ((u64)mfspr(SPRN_TBRU) << 32) | mfspr(SPRN_TBRL);
11348+ case SPRN_PURR:
11349+ if (cpu_has_feature(CPU_FTR_PURR))
11350+ return mfspr(SPRN_PURR);
11351+ else
11352+ return 0;
11353+ default:
11354+ BUG();
11355+ }
11356+}
11357+
11358+
11359+/**
11360+ * pfm_power6_enable_counters
11361+ *
11362+ **/
11363+static void pfm_power6_enable_counters(struct pfm_context *ctx,
11364+ struct pfm_event_set *set)
11365+{
11366+
11367+ unsigned int i, max_pmc;
11368+ int cpu_num = smp_processor_id();
11369+ struct pfm_arch_context *ctx_arch;
11370+
11371+ enable_cntrs_cnt++;
11372+
11373+ /* need the ctx passed down to the routine */
11374+ ctx_arch = pfm_ctx_arch(ctx);
11375+ max_pmc = ctx->regs.max_pmc;
11376+
11377+ /* Write MMCR0 last, and a fairly easy way to do this is to write
11378+ the registers in the reverse order */
11379+ for (i = max_pmc; i != 0; i--)
11380+ if (test_bit(i - 1, set->used_pmcs))
11381+ pfm_power6_write_pmc(i - 1, set->pmcs[i - 1]);
11382+
11383+ /* save current free running HW event count */
11384+ pmc5_start_save[cpu_num] = mfspr(SPRN_PMC5);
11385+ pmc6_start_save[cpu_num] = mfspr(SPRN_PMC6);
11386+
11387+ ctx_arch->delta_purr_start = mfspr(SPRN_PURR);
11388+
11389+ if (cpu_has_feature(CPU_FTR_PURR))
11390+ ctx_arch->delta_tb_start =
11391+ ((u64)mfspr(SPRN_TBRU) << 32) | mfspr(SPRN_TBRL);
11392+ else
11393+ ctx_arch->delta_tb_start = 0;
11394+
11395+ /* Start kernel timer for this cpu to periodically update
11396+ * the virtual counters.
11397+ */
11398+ init_timer(&pmc5_6_update[cpu_num]);
11399+ pmc5_6_update[cpu_num].function = pmc5_6_updater;
11400+ pmc5_6_update[cpu_num].data = (unsigned long) cpu_num;
11401+ pmc5_6_update[cpu_num].expires = jiffies + update_time;
 11402+ /* Context for this timer; the timer will be removed if the context
 11403+ * is switched out, because the counters will be stopped first.
 11404+ * NEEDS WORK: I think this is all OK, but I am a little concerned
 11405+ * about a race between the kernel timer going off right as the
 11406+ * counters are being stopped and the context switching. Needs
 11407+ * more thought.
11408+ */
11409+ pmc5_6_ctx_arch[cpu_num] = ctx_arch;
11410+ add_timer(&pmc5_6_update[cpu_num]);
11411+}
11412+
11413+/**
11414+ * pfm_power6_disable_counters
11415+ *
11416+ **/
11417+static void pfm_power6_disable_counters(struct pfm_context *ctx,
11418+ struct pfm_event_set *set)
11419+{
11420+ struct pfm_arch_context *ctx_arch;
11421+ int cpu_num = smp_processor_id();
11422+
11423+ disable_cntrs_cnt++;
11424+
11425+ /* Set the Freeze Counters bit */
11426+ mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) | MMCR0_FC);
11427+ asm volatile ("sync");
11428+
11429+ /* delete kernel update timer */
11430+ del_timer_sync(&pmc5_6_update[cpu_num]);
11431+
11432+ /* Update the virtual pmd 5 and 6 counters from the free running
11433+ * HW counters
11434+ */
11435+ ctx_arch = pfm_ctx_arch(ctx);
11436+ delta(cpu_num, ctx_arch);
11437+
11438+ ctx_arch->delta_tb +=
11439+ (((u64)mfspr(SPRN_TBRU) << 32) | mfspr(SPRN_TBRL))
11440+ - ctx_arch->delta_tb_start;
11441+
11442+ ctx_arch->delta_purr += mfspr(SPRN_PURR)
11443+ - ctx_arch->delta_purr_start;
11444+}
11445+
11446+/**
11447+ * pfm_power6_get_ovfl_pmds
11448+ *
11449+ * Determine which counters in this set have overflowed and fill in the
11450+ * set->povfl_pmds mask and set->npend_ovfls count.
11451+ **/
11452+static void pfm_power6_get_ovfl_pmds(struct pfm_context *ctx,
11453+ struct pfm_event_set *set)
11454+{
11455+ unsigned int i;
11456+ unsigned int first_intr_pmd = ctx->regs.first_intr_pmd;
11457+ unsigned int max_intr_pmd = ctx->regs.max_intr_pmd;
11458+ u64 *used_pmds = set->used_pmds;
11459+ u64 *cntr_pmds = ctx->regs.cnt_pmds;
 11460+ u64 width_mask = (u64)1 << pfm_pmu_conf->counter_width;
11461+ u64 new_val, mask[PFM_PMD_BV];
11462+
11463+ bitmap_and(cast_ulp(mask), cast_ulp(cntr_pmds), cast_ulp(used_pmds), max_intr_pmd);
11464+
11465+ /* max_intr_pmd is actually the last interrupting pmd register + 1 */
11466+ for (i = first_intr_pmd; i < max_intr_pmd; i++) {
11467+ if (test_bit(i, mask)) {
11468+ new_val = pfm_power6_read_pmd(i);
11469+ if (new_val & width_mask) {
11470+ set_bit(i, set->povfl_pmds);
11471+ set->npend_ovfls++;
11472+ }
11473+ }
11474+ }
11475+}
11476+
11477+static void pfm_power6_irq_handler(struct pt_regs *regs,
11478+ struct pfm_context *ctx)
11479+{
11480+ u32 mmcr0;
11481+ u64 mmcra;
11482+
 11483+ /* Disable the counters (set the freeze bit) to avoid polluting
11484+ * the counts.
11485+ */
11486+ mmcr0 = mfspr(SPRN_MMCR0);
11487+ mtspr(SPRN_MMCR0, (mmcr0 | MMCR0_FC));
11488+ mmcra = mfspr(SPRN_MMCRA);
11489+
11490+ /* Set the PMM bit (see comment below). */
11491+ mtmsrd(mfmsr() | MSR_PMM);
11492+
11493+ pm1_4_interrupt++;
11494+
11495+ pfm_interrupt_handler(instruction_pointer(regs), regs);
11496+
11497+ mmcr0 = mfspr(SPRN_MMCR0);
11498+
11499+ /*
11500+ * Reset the perfmon trigger if
11501+ * not in masking mode.
11502+ */
11503+ if (ctx->state != PFM_CTX_MASKED)
11504+ mmcr0 |= MMCR0_PMXE;
11505+
11506+ /*
11507+ * Clear the PMU Alert Occurred bit
11508+ */
11509+ mmcr0 &= ~MMCR0_PMAO;
11510+
11511+ /* Clear the appropriate bits in the MMCRA. */
11512+ mmcra &= ~(POWER6_MMCRA_THRM | POWER6_MMCRA_OTHER);
11513+ mtspr(SPRN_MMCRA, mmcra);
11514+
11515+ /*
11516+ * Now clear the freeze bit, counting will not start until we
11517+ * rfid from this exception, because only at that point will
11518+ * the PMM bit be cleared.
11519+ */
11520+ mmcr0 &= ~MMCR0_FC;
11521+ mtspr(SPRN_MMCR0, mmcr0);
11522+}
11523+
11524+static void pfm_power6_resend_irq(struct pfm_context *ctx)
11525+{
11526+ /*
11527+ * Assert the PMAO bit to cause a PMU interrupt. Make sure we
11528+ * trigger the edge detection circuitry for PMAO
11529+ */
11530+ mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) & ~MMCR0_PMAO);
11531+ mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) | MMCR0_PMAO);
11532+}
11533+
11534+struct pfm_arch_pmu_info pfm_power6_pmu_info = {
11535+ .pmu_style = PFM_POWERPC_PMU_POWER6,
11536+ .write_pmc = pfm_power6_write_pmc,
11537+ .write_pmd = pfm_power6_write_pmd,
11538+ .read_pmd = pfm_power6_read_pmd,
11539+ .irq_handler = pfm_power6_irq_handler,
11540+ .get_ovfl_pmds = pfm_power6_get_ovfl_pmds,
11541+ .enable_counters = pfm_power6_enable_counters,
11542+ .disable_counters = pfm_power6_disable_counters,
11543+ .resend_irq = pfm_power6_resend_irq
11544+};
11545+
11546+/*
11547+ * impl_pmcs, impl_pmds are computed at runtime to minimize errors!
11548+ */
11549+static struct pfm_pmu_config pfm_power6_pmu_conf = {
11550+ .pmu_name = "POWER6",
11551+ .counter_width = 31,
11552+ .pmd_desc = pfm_power6_pmd_desc,
11553+ .pmc_desc = pfm_power6_pmc_desc,
11554+ .num_pmc_entries = PFM_PM_NUM_PMCS,
11555+ .num_pmd_entries = PFM_PM_NUM_PMDS,
11556+ .probe_pmu = pfm_power6_probe_pmu,
11557+ .pmu_info = &pfm_power6_pmu_info,
11558+ .pmd_sread = pfm_power6_sread,
11559+ .pmd_swrite = pfm_power6_swrite,
11560+ .flags = PFM_PMU_BUILTIN_FLAG,
11561+ .owner = THIS_MODULE
11562+};
11563+
11564+static int __init pfm_power6_pmu_init_module(void)
11565+{
11566+ int ret;
11567+ disable_cntrs_cnt = 0;
11568+ enable_cntrs_cnt = 0;
11569+ call_delta = 0;
11570+ pm5_6_interrupt = 0;
11571+ pm1_4_interrupt = 0;
11572+
11573+ /* calculate the time for updating counters 5 and 6 */
11574+
11575+ /*
11576+ * MAX_EVENT_RATE assumes a max instruction issue rate of 2
11577+ * instructions per clock cycle. Experience shows that this factor
11578+ * of 2 is more than adequate.
11579+ */
11580+
11581+# define MAX_EVENT_RATE (ppc_proc_freq * 2)
11582+
11583+ /*
11584+ * Calculate the time, in jiffies, it takes for event counter 5 or
11585+ * 6 to completely wrap when counting at the max event rate, and
11586+ * then figure on sampling at twice that rate.
11587+ */
11588+ update_time = (((unsigned long)HZ * OVERFLOW_VALUE)
11589+ / ((unsigned long)MAX_EVENT_RATE)) / 2;
11590+
11591+ ret = pfm_pmu_register(&pfm_power6_pmu_conf);
11592+ return ret;
11593+}
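
/*
 * Illustrative worked example (not part of the patch) for the
 * update_time computation in pfm_power6_pmu_init_module() above.
 * The processor frequency and HZ below are sample values, not taken
 * from the patch:
 *
 *   ppc_proc_freq  = 4e9 Hz (assumed)
 *   MAX_EVENT_RATE = 2 * 4e9 = 8e9 events/s
 *   time to wrap   = 2^32 / 8e9 ~ 0.54 s
 *   update_time    = (HZ * 2^32 / 8e9) / 2 = 67 jiffies at HZ = 250
 *
 * i.e. the virtual PMC5/PMC6 counters are sampled roughly twice per
 * worst-case wrap period.  Standalone C check of the integer math.
 */
#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t hz = 250;			/* assumed HZ */
	uint64_t proc_freq = 4000000000ULL;	/* assumed ppc_proc_freq, 4 GHz */
	uint64_t overflow_value = 0x100000000ULL;	/* 2^32 */
	uint64_t max_event_rate = 2 * proc_freq;
	uint64_t update_time = ((hz * overflow_value) / max_event_rate) / 2;

	assert(update_time == 67);	/* ~268 ms at HZ = 250 */
	return 0;
}
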
11594+
11595+static void __exit pfm_power6_pmu_cleanup_module(void)
11596+{
11597+ pfm_pmu_unregister(&pfm_power6_pmu_conf);
11598+}
11599+
11600+module_init(pfm_power6_pmu_init_module);
11601+module_exit(pfm_power6_pmu_cleanup_module);
11602--- /dev/null
11603+++ b/arch/powerpc/perfmon/perfmon_ppc32.c
11604@@ -0,0 +1,340 @@
11605+/*
11606+ * This file contains the PPC32 PMU register description tables
11607+ * and pmc checker used by perfmon.c.
11608+ *
11609+ * Philip Mucci, mucci@cs.utk.edu
11610+ *
11611+ * Based on code from:
11612+ * Copyright (c) 2005 David Gibson, IBM Corporation.
11613+ *
11614+ * Based on perfmon_p6.c:
11615+ * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P.
11616+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
11617+ *
11618+ * This program is free software; you can redistribute it and/or
11619+ * modify it under the terms of version 2 of the GNU General Public
11620+ * License as published by the Free Software Foundation.
11621+ *
11622+ * This program is distributed in the hope that it will be useful,
11623+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
11624+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11625+ * General Public License for more details.
11626+ *
11627+ * You should have received a copy of the GNU General Public License
11628+ * along with this program; if not, write to the Free Software
11629+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
11630+ * 02111-1307 USA
11631+ */
11632+#include <linux/module.h>
11633+#include <linux/perfmon_kern.h>
11634+#include <asm/reg.h>
11635+
11636+MODULE_AUTHOR("Philip Mucci <mucci@cs.utk.edu>");
11637+MODULE_DESCRIPTION("PPC32 PMU description table");
11638+MODULE_LICENSE("GPL");
11639+
11640+static struct pfm_pmu_config pfm_ppc32_pmu_conf;
11641+
11642+static struct pfm_regmap_desc pfm_ppc32_pmc_desc[] = {
11643+/* mmcr0 */ PMC_D(PFM_REG_I, "MMCR0", 0x0, 0, 0, SPRN_MMCR0),
11644+/* mmcr1 */ PMC_D(PFM_REG_I, "MMCR1", 0x0, 0, 0, SPRN_MMCR1),
11645+/* mmcr2 */ PMC_D(PFM_REG_I, "MMCR2", 0x0, 0, 0, SPRN_MMCR2),
11646+};
11647+#define PFM_PM_NUM_PMCS ARRAY_SIZE(pfm_ppc32_pmc_desc)
11648+
11649+static struct pfm_regmap_desc pfm_ppc32_pmd_desc[] = {
11650+/* pmd0 */ PMD_D(PFM_REG_C, "PMC1", SPRN_PMC1),
11651+/* pmd1 */ PMD_D(PFM_REG_C, "PMC2", SPRN_PMC2),
11652+/* pmd2 */ PMD_D(PFM_REG_C, "PMC3", SPRN_PMC3),
11653+/* pmd3 */ PMD_D(PFM_REG_C, "PMC4", SPRN_PMC4),
11654+/* pmd4 */ PMD_D(PFM_REG_C, "PMC5", SPRN_PMC5),
11655+/* pmd5 */ PMD_D(PFM_REG_C, "PMC6", SPRN_PMC6),
11656+};
11657+#define PFM_PM_NUM_PMDS ARRAY_SIZE(pfm_ppc32_pmd_desc)
11658+
11659+static void perfmon_perf_irq(struct pt_regs *regs)
11660+{
11661+ u32 mmcr0;
11662+
11663+ /* BLATANTLY STOLEN FROM OPROFILE, then modified */
11664+
11665+ /* set the PMM bit (see comment below) */
11666+ mtmsr(mfmsr() | MSR_PMM);
11667+
11668+ pfm_interrupt_handler(instruction_pointer(regs), regs);
11669+
11670+ /* The freeze bit was set by the interrupt.
11671+ * Clear the freeze bit, and reenable the interrupt.
11672+ * The counters won't actually start until the rfi clears
11673+ * the PMM bit.
11674+ */
11675+
11676+ /* Unfreezes the counters on this CPU, enables the interrupt,
11677+ * enables the counters to trigger the interrupt, and sets the
11678+ * counters to only count when the mark bit is not set.
11679+ */
11680+ mmcr0 = mfspr(SPRN_MMCR0);
11681+
11682+ mmcr0 &= ~(MMCR0_FC | MMCR0_FCM0);
11683+ mmcr0 |= (MMCR0_FCECE | MMCR0_PMC1CE | MMCR0_PMCnCE | MMCR0_PMXE);
11684+
11685+ mtspr(SPRN_MMCR0, mmcr0);
11686+}
11687+
11688+static int pfm_ppc32_probe_pmu(void)
11689+{
11690+ enum ppc32_pmu_type pm_type;
11691+ int nmmcr = 0, npmds = 0, intsok = 0, i;
11692+ unsigned int pvr;
11693+ char *str;
11694+
11695+ pvr = mfspr(SPRN_PVR);
11696+
11697+ switch (PVR_VER(pvr)) {
11698+ case 0x0004: /* 604 */
11699+ str = "PPC604";
11700+ pm_type = PFM_POWERPC_PMU_604;
11701+ nmmcr = 1;
11702+ npmds = 2;
11703+ break;
11704+ case 0x0009: /* 604e; */
11705+ case 0x000A: /* 604ev */
11706+ str = "PPC604e";
11707+ pm_type = PFM_POWERPC_PMU_604e;
11708+ nmmcr = 2;
11709+ npmds = 4;
11710+ break;
11711+ case 0x0008: /* 750/740 */
11712+ str = "PPC750";
11713+ pm_type = PFM_POWERPC_PMU_750;
11714+ nmmcr = 2;
11715+ npmds = 4;
11716+ break;
11717+ case 0x7000: /* 750FX */
11718+ case 0x7001:
11719+ str = "PPC750";
11720+ pm_type = PFM_POWERPC_PMU_750;
11721+ nmmcr = 2;
11722+ npmds = 4;
11723+ if ((pvr & 0xFF0F) >= 0x0203)
11724+ intsok = 1;
11725+ break;
11726+ case 0x7002: /* 750GX */
11727+ str = "PPC750";
11728+ pm_type = PFM_POWERPC_PMU_750;
11729+ nmmcr = 2;
11730+ npmds = 4;
11731+ intsok = 1; break;
11732+ case 0x000C: /* 7400 */
11733+ str = "PPC7400";
11734+ pm_type = PFM_POWERPC_PMU_7400;
11735+ nmmcr = 3;
11736+ npmds = 4;
11737+ break;
11738+ case 0x800C: /* 7410 */
11739+ str = "PPC7410";
11740+ pm_type = PFM_POWERPC_PMU_7400;
11741+ nmmcr = 3;
11742+ npmds = 4;
11743+ if ((pvr & 0xFFFF) >= 0x01103)
11744+ intsok = 1;
11745+ break;
11746+ case 0x8000: /* 7451/7441 */
11747+ case 0x8001: /* 7455/7445 */
11748+ case 0x8002: /* 7457/7447 */
11749+ case 0x8003: /* 7447A */
11750+ case 0x8004: /* 7448 */
11751+ str = "PPC7450";
11752+ pm_type = PFM_POWERPC_PMU_7450;
11753+ nmmcr = 3; npmds = 6;
11754+ intsok = 1;
11755+ break;
11756+ default:
11757+ PFM_INFO("Unknown PVR_VER(0x%x)\n", PVR_VER(pvr));
11758+ return -1;
11759+ }
11760+
11761+ /*
11762+ * deconfigure unimplemented registers
11763+ */
11764+ for (i = npmds; i < PFM_PM_NUM_PMDS; i++)
11765+ pfm_ppc32_pmd_desc[i].type = PFM_REG_NA;
11766+
11767+ for (i = nmmcr; i < PFM_PM_NUM_PMCS; i++)
11768+ pfm_ppc32_pmc_desc[i].type = PFM_REG_NA;
11769+
11770+ /*
11771+ * update PMU description structure
11772+ */
11773+ pfm_ppc32_pmu_conf.pmu_name = str;
11774+ pfm_ppc32_pmu_info.pmu_style = pm_type;
11775+ pfm_ppc32_pmu_conf.num_pmc_entries = nmmcr;
11776+ pfm_ppc32_pmu_conf.num_pmd_entries = npmds;
11777+
11778+ if (intsok == 0)
11779+ PFM_INFO("Interrupts unlikely to work\n");
11780+
11781+ return reserve_pmc_hardware(perfmon_perf_irq);
11782+}
11783+
11784+static void pfm_ppc32_write_pmc(unsigned int cnum, u64 value)
11785+{
11786+ switch (pfm_pmu_conf->pmc_desc[cnum].hw_addr) {
11787+ case SPRN_MMCR0:
11788+ mtspr(SPRN_MMCR0, value);
11789+ break;
11790+ case SPRN_MMCR1:
11791+ mtspr(SPRN_MMCR1, value);
11792+ break;
11793+ case SPRN_MMCR2:
11794+ mtspr(SPRN_MMCR2, value);
11795+ break;
11796+ default:
11797+ BUG();
11798+ }
11799+}
11800+
11801+static void pfm_ppc32_write_pmd(unsigned int cnum, u64 value)
11802+{
11803+ switch (pfm_pmu_conf->pmd_desc[cnum].hw_addr) {
11804+ case SPRN_PMC1:
11805+ mtspr(SPRN_PMC1, value);
11806+ break;
11807+ case SPRN_PMC2:
11808+ mtspr(SPRN_PMC2, value);
11809+ break;
11810+ case SPRN_PMC3:
11811+ mtspr(SPRN_PMC3, value);
11812+ break;
11813+ case SPRN_PMC4:
11814+ mtspr(SPRN_PMC4, value);
11815+ break;
11816+ case SPRN_PMC5:
11817+ mtspr(SPRN_PMC5, value);
11818+ break;
11819+ case SPRN_PMC6:
11820+ mtspr(SPRN_PMC6, value);
11821+ break;
11822+ default:
11823+ BUG();
11824+ }
11825+}
11826+
11827+static u64 pfm_ppc32_read_pmd(unsigned int cnum)
11828+{
11829+ switch (pfm_pmu_conf->pmd_desc[cnum].hw_addr) {
11830+ case SPRN_PMC1:
11831+ return mfspr(SPRN_PMC1);
11832+ case SPRN_PMC2:
11833+ return mfspr(SPRN_PMC2);
11834+ case SPRN_PMC3:
11835+ return mfspr(SPRN_PMC3);
11836+ case SPRN_PMC4:
11837+ return mfspr(SPRN_PMC4);
11838+ case SPRN_PMC5:
11839+ return mfspr(SPRN_PMC5);
11840+ case SPRN_PMC6:
11841+ return mfspr(SPRN_PMC6);
11842+ default:
11843+ BUG();
11844+ }
11845+}
11846+
11847+/**
11848+ * pfm_ppc32_enable_counters
11849+ *
11850+ * Just need to load the current values into the control registers.
11851+ **/
11852+static void pfm_ppc32_enable_counters(struct pfm_context *ctx,
11853+ struct pfm_event_set *set)
11854+{
11855+ unsigned int i, max_pmc;
11856+
11857+ max_pmc = pfm_pmu_conf->regs.max_pmc;
11858+
11859+ for (i = 0; i < max_pmc; i++)
11860+ if (test_bit(i, set->used_pmcs))
11861+ pfm_ppc32_write_pmc(i, set->pmcs[i]);
11862+}
11863+
11864+/**
11865+ * pfm_ppc32_disable_counters
11866+ *
11867+ * Just need to zero all the control registers.
11868+ **/
11869+static void pfm_ppc32_disable_counters(struct pfm_context *ctx,
11870+ struct pfm_event_set *set)
11871+{
11872+ unsigned int i, max;
11873+
11874+ max = pfm_pmu_conf->regs.max_pmc;
11875+
11876+ for (i = 0; i < max; i++)
11877+ if (test_bit(i, set->used_pmcs))
11878+ pfm_ppc32_write_pmc(i, 0);
11879+}
11880+
11881+/**
11882+ * pfm_ppc32_get_ovfl_pmds
11883+ *
11884+ * Determine which counters in this set have overflowed and fill in the
11885+ * set->povfl_pmds mask and set->npend_ovfls count.
11886+ **/
11887+static void pfm_ppc32_get_ovfl_pmds(struct pfm_context *ctx,
11888+ struct pfm_event_set *set)
11889+{
11890+ unsigned int i;
11891+ unsigned int max_pmd = pfm_pmu_conf->regs.max_cnt_pmd;
11892+ u64 *used_pmds = set->used_pmds;
11893+ u64 *cntr_pmds = pfm_pmu_conf->regs.cnt_pmds;
11894+ u64 width_mask = 1ULL << pfm_pmu_conf->counter_width;
11895+ u64 new_val, mask[PFM_PMD_BV];
11896+
11897+ bitmap_and(cast_ulp(mask), cast_ulp(cntr_pmds),
11898+ cast_ulp(used_pmds), max_pmd);
11899+
11900+ for (i = 0; i < max_pmd; i++) {
11901+ if (test_bit(i, mask)) {
11902+ new_val = pfm_ppc32_read_pmd(i);
11903+ if (new_val & width_mask) {
11904+ set_bit(i, set->povfl_pmds);
11905+ set->npend_ovfls++;
11906+ }
11907+ }
11908+ }
11909+}
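+
+/*
+ * Note on the check above (added illustration, not in the original patch):
+ * with counter_width = 31, width_mask is 0x80000000. The 32-bit PPC PMCs
+ * signal an overflow condition when their most-significant bit becomes set
+ * (enabled via MMCR0_PMC1CE/MMCR0_PMCnCE), so any counted PMD whose current
+ * value has bit 31 set is recorded in povfl_pmds as a pending overflow.
+ */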
11910+
11911+struct pfm_arch_pmu_info pfm_ppc32_pmu_info = {
11912+ .pmu_style = PFM_POWERPC_PMU_NONE,
11913+ .write_pmc = pfm_ppc32_write_pmc,
11914+ .write_pmd = pfm_ppc32_write_pmd,
11915+ .read_pmd = pfm_ppc32_read_pmd,
11916+ .get_ovfl_pmds = pfm_ppc32_get_ovfl_pmds,
11917+ .enable_counters = pfm_ppc32_enable_counters,
11918+ .disable_counters = pfm_ppc32_disable_counters,
11919+};
11920+
11921+static struct pfm_pmu_config pfm_ppc32_pmu_conf = {
11922+ .counter_width = 31,
11923+ .pmd_desc = pfm_ppc32_pmd_desc,
11924+ .pmc_desc = pfm_ppc32_pmc_desc,
11925+ .probe_pmu = pfm_ppc32_probe_pmu,
11926+ .flags = PFM_PMU_BUILTIN_FLAG,
11927+ .owner = THIS_MODULE,
11928+ .version = "0.1",
11929+ .arch_info = &pfm_ppc32_pmu_info,
11930+};
11931+
11932+static int __init pfm_ppc32_pmu_init_module(void)
11933+{
11934+ return pfm_pmu_register(&pfm_ppc32_pmu_conf);
11935+}
11936+
11937+static void __exit pfm_ppc32_pmu_cleanup_module(void)
11938+{
11939+ release_pmc_hardware();
11940+ pfm_pmu_unregister(&pfm_ppc32_pmu_conf);
11941+}
11942+
11943+module_init(pfm_ppc32_pmu_init_module);
11944+module_exit(pfm_ppc32_pmu_cleanup_module);
11945--- a/arch/powerpc/platforms/cell/cbe_regs.c
11946+++ b/arch/powerpc/platforms/cell/cbe_regs.c
11947@@ -33,6 +33,7 @@ static struct cbe_regs_map
11948 struct cbe_iic_regs __iomem *iic_regs;
11949 struct cbe_mic_tm_regs __iomem *mic_tm_regs;
11950 struct cbe_pmd_shadow_regs pmd_shadow_regs;
11951+ struct cbe_ppe_priv_regs __iomem *ppe_priv_regs;
11952 } cbe_regs_maps[MAX_CBE];
11953 static int cbe_regs_map_count;
11954
11955@@ -145,6 +146,23 @@ struct cbe_mic_tm_regs __iomem *cbe_get_
11956 }
11957 EXPORT_SYMBOL_GPL(cbe_get_cpu_mic_tm_regs);
11958
11959+struct cbe_ppe_priv_regs __iomem *cbe_get_ppe_priv_regs(struct device_node *np)
11960+{
11961+ struct cbe_regs_map *map = cbe_find_map(np);
11962+ if (map == NULL)
11963+ return NULL;
11964+ return map->ppe_priv_regs;
11965+}
11966+
11967+struct cbe_ppe_priv_regs __iomem *cbe_get_cpu_ppe_priv_regs(int cpu)
11968+{
11969+ struct cbe_regs_map *map = cbe_thread_map[cpu].regs;
11970+ if (map == NULL)
11971+ return NULL;
11972+ return map->ppe_priv_regs;
11973+}
11974+EXPORT_SYMBOL_GPL(cbe_get_cpu_ppe_priv_regs);
11975+
11976 u32 cbe_get_hw_thread_id(int cpu)
11977 {
11978 return cbe_thread_map[cpu].thread_id;
11979@@ -206,6 +224,11 @@ void __init cbe_fill_regs_map(struct cbe
11980 for_each_node_by_type(np, "mic-tm")
11981 if (of_get_parent(np) == be)
11982 map->mic_tm_regs = of_iomap(np, 0);
11983+
11984+ for_each_node_by_type(np, "ppe-mmio")
11985+ if (of_get_parent(np) == be)
11986+ map->ppe_priv_regs = of_iomap(np, 0);
11987+
11988 } else {
11989 struct device_node *cpu;
11990 /* That hack must die die die ! */
11991@@ -227,6 +250,10 @@ void __init cbe_fill_regs_map(struct cbe
11992 prop = of_get_property(cpu, "mic-tm", NULL);
11993 if (prop != NULL)
11994 map->mic_tm_regs = ioremap(prop->address, prop->len);
11995+
11996+ prop = of_get_property(cpu, "ppe-mmio", NULL);
11997+ if (prop != NULL)
11998+ map->ppe_priv_regs = ioremap(prop->address, prop->len);
11999 }
12000 }
12001
12002--- a/arch/sparc/include/asm/hypervisor.h
12003+++ b/arch/sparc/include/asm/hypervisor.h
12004@@ -2713,6 +2713,30 @@ extern unsigned long sun4v_ldc_revoke(un
12005 */
12006 #define HV_FAST_SET_PERFREG 0x101
12007
12008+#define HV_N2_PERF_SPARC_CTL 0x0
12009+#define HV_N2_PERF_DRAM_CTL0 0x1
12010+#define HV_N2_PERF_DRAM_CNT0 0x2
12011+#define HV_N2_PERF_DRAM_CTL1 0x3
12012+#define HV_N2_PERF_DRAM_CNT1 0x4
12013+#define HV_N2_PERF_DRAM_CTL2 0x5
12014+#define HV_N2_PERF_DRAM_CNT2 0x6
12015+#define HV_N2_PERF_DRAM_CTL3 0x7
12016+#define HV_N2_PERF_DRAM_CNT3 0x8
12017+
12018+#define HV_FAST_N2_GET_PERFREG 0x104
12019+#define HV_FAST_N2_SET_PERFREG 0x105
12020+
12021+#ifndef __ASSEMBLY__
12022+extern unsigned long sun4v_niagara_getperf(unsigned long reg,
12023+ unsigned long *val);
12024+extern unsigned long sun4v_niagara_setperf(unsigned long reg,
12025+ unsigned long val);
12026+extern unsigned long sun4v_niagara2_getperf(unsigned long reg,
12027+ unsigned long *val);
12028+extern unsigned long sun4v_niagara2_setperf(unsigned long reg,
12029+ unsigned long val);
12030+#endif
12031+
12032 /* MMU statistics services.
12033 *
12034 * The hypervisor maintains MMU statistics and privileged code provides
12035--- a/arch/sparc/include/asm/irq_64.h
12036+++ b/arch/sparc/include/asm/irq_64.h
12037@@ -67,6 +67,9 @@ extern void virt_irq_free(unsigned int v
12038 extern void __init init_IRQ(void);
12039 extern void fixup_irqs(void);
12040
12041+extern int register_perfctr_intr(void (*handler)(struct pt_regs *));
12042+extern void release_perfctr_intr(void (*handler)(struct pt_regs *));
12043+
12044 static inline void set_softint(unsigned long bits)
12045 {
12046 __asm__ __volatile__("wr %0, 0x0, %%set_softint"
12047--- /dev/null
12048+++ b/arch/sparc/include/asm/perfmon.h
12049@@ -0,0 +1,11 @@
12050+#ifndef _SPARC64_PERFMON_H_
12051+#define _SPARC64_PERFMON_H_
12052+
12053+/*
12054+ * arch-specific user visible interface definitions
12055+ */
12056+
12057+#define PFM_ARCH_MAX_PMCS 2
12058+#define PFM_ARCH_MAX_PMDS 3
12059+
12060+#endif /* _SPARC64_PERFMON_H_ */
12061--- /dev/null
12062+++ b/arch/sparc/include/asm/perfmon_kern.h
12063@@ -0,0 +1,286 @@
12064+#ifndef _SPARC64_PERFMON_KERN_H_
12065+#define _SPARC64_PERFMON_KERN_H_
12066+
12067+#ifdef __KERNEL__
12068+
12069+#ifdef CONFIG_PERFMON
12070+
12071+#include <linux/irq.h>
12072+#include <asm/system.h>
12073+
12074+#define PFM_ARCH_PMD_STK_ARG 2
12075+#define PFM_ARCH_PMC_STK_ARG 1
12076+
12077+struct pfm_arch_pmu_info {
12078+ u32 pmu_style;
12079+};
12080+
12081+static inline void pfm_arch_resend_irq(struct pfm_context *ctx)
12082+{
12083+}
12084+
12085+static inline void pfm_arch_clear_pmd_ovfl_cond(struct pfm_context *ctx,
12086+ struct pfm_event_set *set)
12087+{}
12088+
12089+static inline void pfm_arch_serialize(void)
12090+{
12091+}
12092+
12093+/*
12094+ * SPARC does not save the PMDs during pfm_arch_intr_freeze_pmu(), thus
12095+ * this routine needs to do it when switching sets on overflow
12096+ */
12097+static inline void pfm_arch_save_pmds_from_intr(struct pfm_context *ctx,
12098+ struct pfm_event_set *set)
12099+{
12100+ pfm_save_pmds(ctx, set);
12101+}
12102+
12103+extern void pfm_arch_write_pmc(struct pfm_context *ctx,
12104+ unsigned int cnum, u64 value);
12105+extern u64 pfm_arch_read_pmc(struct pfm_context *ctx, unsigned int cnum);
12106+
12107+static inline void pfm_arch_write_pmd(struct pfm_context *ctx,
12108+ unsigned int cnum, u64 value)
12109+{
12110+ u64 pic;
12111+
12112+ value &= pfm_pmu_conf->ovfl_mask;
12113+
12114+ read_pic(pic);
12115+
12116+ switch (cnum) {
12117+ case 0:
12118+ pic = (pic & 0xffffffff00000000UL) |
12119+ (value & 0xffffffffUL);
12120+ break;
12121+ case 1:
12122+ pic = (pic & 0xffffffffUL) |
12123+ (value << 32UL);
12124+ break;
12125+ default:
12126+ BUG();
12127+ }
12128+
12129+ write_pic(pic);
12130+}
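+
+/*
+ * Layout illustration (added note, not in the original patch): both 32-bit
+ * counters share the single 64-bit %pic register, PIC0 in bits 31:0 and
+ * PIC1 in bits 63:32. Writing, say, 0x1234 to cnum 1 therefore keeps the
+ * low word and yields pic = (old_pic & 0xffffffff) | (0x1234ULL << 32).
+ */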
12131+
12132+static inline u64 pfm_arch_read_pmd(struct pfm_context *ctx,
12133+ unsigned int cnum)
12134+{
12135+ u64 pic;
12136+
12137+ read_pic(pic);
12138+
12139+ switch (cnum) {
12140+ case 0:
12141+ return pic & 0xffffffffUL;
12142+ case 1:
12143+ return pic >> 32UL;
12144+ default:
12145+ BUG();
12146+ return 0;
12147+ }
12148+}
12149+
12150+/*
12151+ * For some CPUs, the upper bits of a counter must be set in order for the
12152+ * overflow interrupt to happen. On overflow, the counter has wrapped around,
12153+ * and the upper bits are cleared. This function may be used to set them back.
12154+ */
12155+static inline void pfm_arch_ovfl_reset_pmd(struct pfm_context *ctx,
12156+ unsigned int cnum)
12157+{
12158+ u64 val = pfm_arch_read_pmd(ctx, cnum);
12159+
12160+ /* This masks out overflow bit 31 */
12161+ pfm_arch_write_pmd(ctx, cnum, val);
12162+}
12163+
12164+/*
12165+ * At certain points, perfmon needs to know if monitoring has been
12166+ * explicitly started/stopped by the user via pfm_start/pfm_stop. The
12167+ * information is tracked in ctx.flags.started. However on certain
12168+ * architectures, it may be possible to start/stop directly from
12169+ * user level with a single assembly instruction bypassing
12170+ * the kernel. This function must be used to determine, by
12171+ * arch-specific means, whether monitoring is actually started/stopped.
12172+ */
12173+static inline int pfm_arch_is_active(struct pfm_context *ctx)
12174+{
12175+ return ctx->flags.started;
12176+}
12177+
12178+static inline void pfm_arch_ctxswout_sys(struct task_struct *task,
12179+ struct pfm_context *ctx)
12180+{
12181+}
12182+
12183+static inline void pfm_arch_ctxswin_sys(struct task_struct *task,
12184+ struct pfm_context *ctx)
12185+{
12186+}
12187+
12188+static inline void pfm_arch_ctxswin_thread(struct task_struct *task,
12189+ struct pfm_context *ctx)
12190+{
12191+}
12192+
12193+int pfm_arch_is_monitoring_active(struct pfm_context *ctx);
12194+int pfm_arch_ctxswout_thread(struct task_struct *task,
12195+ struct pfm_context *ctx);
12196+void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx);
12197+void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx);
12198+void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set);
12199+void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set);
12200+char *pfm_arch_get_pmu_module_name(void);
12201+
12202+static inline void pfm_arch_intr_freeze_pmu(struct pfm_context *ctx,
12203+ struct pfm_event_set *set)
12204+{
12205+ pfm_arch_stop(current, ctx);
12206+ /*
12207+ * we mark monitoring as stopped to avoid
12208+ * certain side effects especially in
12209+ * pfm_switch_sets_from_intr() on
12210+ * pfm_arch_restore_pmcs()
12211+ */
12212+ ctx->flags.started = 0;
12213+}
12214+
12215+/*
12216+ * unfreeze PMU from pfm_do_interrupt_handler()
12217+ * ctx may be NULL for spurious
12218+ */
12219+static inline void pfm_arch_intr_unfreeze_pmu(struct pfm_context *ctx)
12220+{
12221+ if (!ctx)
12222+ return;
12223+
12224+ PFM_DBG_ovfl("state=%d", ctx->state);
12225+
12226+ ctx->flags.started = 1;
12227+
12228+ if (ctx->state == PFM_CTX_MASKED)
12229+ return;
12230+
12231+ pfm_arch_restore_pmcs(ctx, ctx->active_set);
12232+}
12233+
12234+/*
12235+ * this function is called from the PMU interrupt handler ONLY.
12236+ * On SPARC, the PMU is frozen via arch_stop, masking would be implemented
12237+ * via arch-stop as well. Given that the PMU is already stopped when
12238+ * entering the interrupt handler, we do not need to stop it again, so
12239+ * this function is a nop.
12240+ */
12241+static inline void pfm_arch_mask_monitoring(struct pfm_context *ctx,
12242+ struct pfm_event_set *set)
12243+{
12244+}
12245+
12246+/*
12247+ * On SPARC, masking/unmasking uses the start/stop mechanism, so we simply
12248+ * need to start here.
12249+ */
12250+static inline void pfm_arch_unmask_monitoring(struct pfm_context *ctx,
12251+ struct pfm_event_set *set)
12252+{
12253+ pfm_arch_start(current, ctx);
12254+}
12255+
12256+static inline void pfm_arch_pmu_config_remove(void)
12257+{
12258+}
12259+
12260+static inline int pfm_arch_context_create(struct pfm_context *ctx,
12261+ u32 ctx_flags)
12262+{
12263+ return 0;
12264+}
12265+
12266+static inline void pfm_arch_context_free(struct pfm_context *ctx)
12267+{
12268+}
12269+
12270+/*
12271+ * function called from pfm_setfl_sane(). Context is locked
12272+ * and interrupts are masked.
12273+ * The value of flags is the value of ctx_flags as passed by
12274+ * user.
12275+ *
12276+ * function must check arch-specific set flags.
12277+ * Return:
12278+ * 0 when flags are valid
12279+ * <0 on error
12280+ */
12281+static inline int pfm_arch_setfl_sane(struct pfm_context *ctx, u32 flags)
12282+{
12283+ return 0;
12284+}
12285+
12286+static inline int pfm_arch_init(void)
12287+{
12288+ return 0;
12289+}
12290+
12291+static inline void pfm_arch_init_percpu(void)
12292+{
12293+}
12294+
12295+static inline int pfm_arch_load_context(struct pfm_context *ctx)
12296+{
12297+ return 0;
12298+}
12299+
12300+static inline void pfm_arch_unload_context(struct pfm_context *ctx)
12301+{}
12302+
12303+extern void perfmon_interrupt(struct pt_regs *);
12304+
12305+static inline int pfm_arch_pmu_acquire(u64 *unavail_pmcs, u64 *unavail_pmds)
12306+{
12307+ return register_perfctr_intr(perfmon_interrupt);
12308+}
12309+
12310+static inline void pfm_arch_pmu_release(void)
12311+{
12312+ release_perfctr_intr(perfmon_interrupt);
12313+}
12314+
12315+static inline void pfm_arch_arm_handle_work(struct task_struct *task)
12316+{}
12317+
12318+static inline void pfm_arch_disarm_handle_work(struct task_struct *task)
12319+{}
12320+
12321+static inline int pfm_arch_pmu_config_init(struct pfm_pmu_config *cfg)
12322+{
12323+ return 0;
12324+}
12325+
12326+static inline int pfm_arch_get_base_syscall(void)
12327+{
12328+ return __NR_pfm_create_context;
12329+}
12330+
12331+struct pfm_arch_context {
12332+ /* empty */
12333+};
12334+
12335+#define PFM_ARCH_CTX_SIZE sizeof(struct pfm_arch_context)
12336+/*
12337+ * SPARC needs extra alignment for the sampling buffer
12338+ */
12339+#define PFM_ARCH_SMPL_ALIGN_SIZE (16 * 1024)
12340+
12341+static inline void pfm_cacheflush(void *addr, unsigned int len)
12342+{
12343+}
12344+
12345+#endif /* CONFIG_PERFMON */
12346+
12347+#endif /* __KERNEL__ */
12348+
12349+#endif /* _SPARC64_PERFMON_KERN_H_ */
12350--- a/arch/sparc/include/asm/system_64.h
12351+++ b/arch/sparc/include/asm/system_64.h
12352@@ -30,6 +30,9 @@ enum sparc_cpu {
12353 #define ARCH_SUN4C_SUN4 0
12354 #define ARCH_SUN4 0
12355
12356+extern char *sparc_cpu_type;
12357+extern char *sparc_fpu_type;
12358+extern char *sparc_pmu_type;
12359 extern char reboot_command[];
12360
12361 /* These are here in an effort to more fully work around Spitfire Errata
12362@@ -104,15 +107,13 @@ do { __asm__ __volatile__("ba,pt %%xcc,
12363 #define write_pcr(__p) __asm__ __volatile__("wr %0, 0x0, %%pcr" : : "r" (__p))
12364 #define read_pic(__p) __asm__ __volatile__("rd %%pic, %0" : "=r" (__p))
12365
12366-/* Blackbird errata workaround. See commentary in
12367- * arch/sparc64/kernel/smp.c:smp_percpu_timer_interrupt()
12368- * for more information.
12369- */
12370-#define reset_pic() \
12371- __asm__ __volatile__("ba,pt %xcc, 99f\n\t" \
12372+/* Blackbird errata workaround. */
12373+#define write_pic(val) \
12374+ __asm__ __volatile__("ba,pt %%xcc, 99f\n\t" \
12375 ".align 64\n" \
12376- "99:wr %g0, 0x0, %pic\n\t" \
12377- "rd %pic, %g0")
12378+ "99:wr %0, 0x0, %%pic\n\t" \
12379+ "rd %%pic, %%g0" : : "r" (val))
12380+#define reset_pic() write_pic(0)
12381
12382 #ifndef __ASSEMBLY__
12383
12384@@ -145,14 +146,10 @@ do { \
12385 * and 2 stores in this critical code path. -DaveM
12386 */
12387 #define switch_to(prev, next, last) \
12388-do { if (test_thread_flag(TIF_PERFCTR)) { \
12389- unsigned long __tmp; \
12390- read_pcr(__tmp); \
12391- current_thread_info()->pcr_reg = __tmp; \
12392- read_pic(__tmp); \
12393- current_thread_info()->kernel_cntd0 += (unsigned int)(__tmp);\
12394- current_thread_info()->kernel_cntd1 += ((__tmp) >> 32); \
12395- } \
12396+do { if (test_tsk_thread_flag(prev, TIF_PERFMON_CTXSW)) \
12397+ pfm_ctxsw_out(prev, next); \
12398+ if (test_tsk_thread_flag(next, TIF_PERFMON_CTXSW)) \
12399+ pfm_ctxsw_in(prev, next); \
12400 flush_tlb_pending(); \
12401 save_and_clear_fpu(); \
12402 /* If you are tempted to conditionalize the following */ \
12403@@ -197,11 +194,6 @@ do { if (test_thread_flag(TIF_PERFCTR))
12404 "l1", "l2", "l3", "l4", "l5", "l6", "l7", \
12405 "i0", "i1", "i2", "i3", "i4", "i5", \
12406 "o0", "o1", "o2", "o3", "o4", "o5", "o7"); \
12407- /* If you fuck with this, update ret_from_syscall code too. */ \
12408- if (test_thread_flag(TIF_PERFCTR)) { \
12409- write_pcr(current_thread_info()->pcr_reg); \
12410- reset_pic(); \
12411- } \
12412 } while(0)
12413
12414 static inline unsigned long xchg32(__volatile__ unsigned int *m, unsigned int val)
12415--- a/arch/sparc/include/asm/thread_info_64.h
12416+++ b/arch/sparc/include/asm/thread_info_64.h
12417@@ -58,11 +58,6 @@ struct thread_info {
12418 unsigned long gsr[7];
12419 unsigned long xfsr[7];
12420
12421- __u64 __user *user_cntd0;
12422- __u64 __user *user_cntd1;
12423- __u64 kernel_cntd0, kernel_cntd1;
12424- __u64 pcr_reg;
12425-
12426 struct restart_block restart_block;
12427
12428 struct pt_regs *kern_una_regs;
12429@@ -96,15 +91,10 @@ struct thread_info {
12430 #define TI_RWIN_SPTRS 0x000003c8
12431 #define TI_GSR 0x00000400
12432 #define TI_XFSR 0x00000438
12433-#define TI_USER_CNTD0 0x00000470
12434-#define TI_USER_CNTD1 0x00000478
12435-#define TI_KERN_CNTD0 0x00000480
12436-#define TI_KERN_CNTD1 0x00000488
12437-#define TI_PCR 0x00000490
12438-#define TI_RESTART_BLOCK 0x00000498
12439-#define TI_KUNA_REGS 0x000004c0
12440-#define TI_KUNA_INSN 0x000004c8
12441-#define TI_FPREGS 0x00000500
12442+#define TI_RESTART_BLOCK 0x00000470
12443+#define TI_KUNA_REGS 0x00000498
12444+#define TI_KUNA_INSN 0x000004a0
12445+#define TI_FPREGS 0x000004c0
12446
12447 /* We embed this in the uppermost byte of thread_info->flags */
12448 #define FAULT_CODE_WRITE 0x01 /* Write access, implies D-TLB */
12449@@ -222,11 +212,11 @@ register struct thread_info *current_thr
12450 #define TIF_NOTIFY_RESUME 1 /* callback before returning to user */
12451 #define TIF_SIGPENDING 2 /* signal pending */
12452 #define TIF_NEED_RESCHED 3 /* rescheduling necessary */
12453-#define TIF_PERFCTR 4 /* performance counters active */
12454+/* Bit 4 is available */
12455 #define TIF_UNALIGNED 5 /* allowed to do unaligned accesses */
12456 /* flag bit 6 is available */
12457 #define TIF_32BIT 7 /* 32-bit binary */
12458-/* flag bit 8 is available */
12459+#define TIF_PERFMON_WORK 8 /* work for pfm_handle_work() */
12460 #define TIF_SECCOMP 9 /* secure computing */
12461 #define TIF_SYSCALL_AUDIT 10 /* syscall auditing active */
12462 /* flag bit 11 is available */
12463@@ -237,22 +227,24 @@ register struct thread_info *current_thr
12464 #define TIF_ABI_PENDING 12
12465 #define TIF_MEMDIE 13
12466 #define TIF_POLLING_NRFLAG 14
12467+#define TIF_PERFMON_CTXSW 15 /* perfmon needs ctxsw calls */
12468
12469 #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE)
12470 #define _TIF_NOTIFY_RESUME (1<<TIF_NOTIFY_RESUME)
12471 #define _TIF_SIGPENDING (1<<TIF_SIGPENDING)
12472 #define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED)
12473-#define _TIF_PERFCTR (1<<TIF_PERFCTR)
12474 #define _TIF_UNALIGNED (1<<TIF_UNALIGNED)
12475 #define _TIF_32BIT (1<<TIF_32BIT)
12476+#define _TIF_PERFMON_WORK (1<<TIF_PERFMON_WORK)
12477 #define _TIF_SECCOMP (1<<TIF_SECCOMP)
12478 #define _TIF_SYSCALL_AUDIT (1<<TIF_SYSCALL_AUDIT)
12479 #define _TIF_ABI_PENDING (1<<TIF_ABI_PENDING)
12480 #define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG)
12481+#define _TIF_PERFMON_CTXSW (1<<TIF_PERFMON_CTXSW)
12482
12483 #define _TIF_USER_WORK_MASK ((0xff << TI_FLAG_WSAVED_SHIFT) | \
12484 _TIF_DO_NOTIFY_RESUME_MASK | \
12485- _TIF_NEED_RESCHED | _TIF_PERFCTR)
12486+ _TIF_NEED_RESCHED)
12487 #define _TIF_DO_NOTIFY_RESUME_MASK (_TIF_NOTIFY_RESUME | _TIF_SIGPENDING)
12488
12489 /*
12490--- a/arch/sparc/include/asm/unistd_32.h
12491+++ b/arch/sparc/include/asm/unistd_32.h
12492@@ -338,8 +338,20 @@
12493 #define __NR_dup3 320
12494 #define __NR_pipe2 321
12495 #define __NR_inotify_init1 322
12496+#define __NR_pfm_create_context 323
12497+#define __NR_pfm_write_pmcs 324
12498+#define __NR_pfm_write_pmds 325
12499+#define __NR_pfm_read_pmds 326
12500+#define __NR_pfm_load_context 327
12501+#define __NR_pfm_start 328
12502+#define __NR_pfm_stop 329
12503+#define __NR_pfm_restart 330
12504+#define __NR_pfm_create_evtsets 331
12505+#define __NR_pfm_getinfo_evtsets 332
12506+#define __NR_pfm_delete_evtsets 333
12507+#define __NR_pfm_unload_context 334
12508
12509-#define NR_SYSCALLS 323
12510+#define NR_SYSCALLS 335
12511
12512 /* Sparc 32-bit only has the "setresuid32", "getresuid32" variants,
12513 * it never had the plain ones and there is no value to adding those
12514--- a/arch/sparc/include/asm/unistd_64.h
12515+++ b/arch/sparc/include/asm/unistd_64.h
12516@@ -340,8 +340,20 @@
12517 #define __NR_dup3 320
12518 #define __NR_pipe2 321
12519 #define __NR_inotify_init1 322
12520+#define __NR_pfm_create_context 323
12521+#define __NR_pfm_write_pmcs 324
12522+#define __NR_pfm_write_pmds 325
12523+#define __NR_pfm_read_pmds 326
12524+#define __NR_pfm_load_context 327
12525+#define __NR_pfm_start 328
12526+#define __NR_pfm_stop 329
12527+#define __NR_pfm_restart 330
12528+#define __NR_pfm_create_evtsets 331
12529+#define __NR_pfm_getinfo_evtsets 332
12530+#define __NR_pfm_delete_evtsets 333
12531+#define __NR_pfm_unload_context 334
12532
12533-#define NR_SYSCALLS 323
12534+#define NR_SYSCALLS 335
12535
12536 #ifdef __KERNEL__
12537 #define __ARCH_WANT_IPC_PARSE_VERSION
12538--- a/arch/sparc/kernel/systbls.S
12539+++ b/arch/sparc/kernel/systbls.S
12540@@ -81,4 +81,6 @@ sys_call_table:
12541 /*305*/ .long sys_set_mempolicy, sys_kexec_load, sys_move_pages, sys_getcpu, sys_epoll_pwait
12542 /*310*/ .long sys_utimensat, sys_signalfd, sys_timerfd_create, sys_eventfd, sys_fallocate
12543 /*315*/ .long sys_timerfd_settime, sys_timerfd_gettime, sys_signalfd4, sys_eventfd2, sys_epoll_create1
12544-/*320*/ .long sys_dup3, sys_pipe2, sys_inotify_init1
12545+/*320*/ .long sys_dup3, sys_pipe2, sys_inotify_init1, sys_pfm_create_context, sys_pfm_write_pmcs
12546+/*325*/ .long sys_pfm_write_pmds, sys_pfm_read_pmds, sys_pfm_load_context, sys_pfm_start, sys_pfm_stop
12547+/*330*/ .long sys_pfm_restart, sys_pfm_create_evtsets, sys_pfm_getinfo_evtsets, sys_pfm_delete_evtsets, sys_pfm_unload_context
12548--- a/arch/sparc64/Kconfig
12549+++ b/arch/sparc64/Kconfig
12550@@ -402,6 +402,8 @@ source "drivers/sbus/char/Kconfig"
12551
12552 source "fs/Kconfig"
12553
12554+source "arch/sparc64/perfmon/Kconfig"
12555+
12556 source "arch/sparc64/Kconfig.debug"
12557
12558 source "security/Kconfig"
12559--- a/arch/sparc64/Makefile
12560+++ b/arch/sparc64/Makefile
12561@@ -32,6 +32,8 @@ core-y += arch/sparc64/math-emu/
12562 libs-y += arch/sparc64/prom/ arch/sparc64/lib/
12563 drivers-$(CONFIG_OPROFILE) += arch/sparc64/oprofile/
12564
12565+core-$(CONFIG_PERFMON) += arch/sparc64/perfmon/
12566+
12567 boot := arch/sparc64/boot
12568
12569 image tftpboot.img vmlinux.aout: vmlinux
12570--- a/arch/sparc64/kernel/cpu.c
12571+++ b/arch/sparc64/kernel/cpu.c
12572@@ -20,16 +20,17 @@
12573 DEFINE_PER_CPU(cpuinfo_sparc, __cpu_data) = { 0 };
12574
12575 struct cpu_iu_info {
12576- short manuf;
12577- short impl;
12578- char* cpu_name; /* should be enough I hope... */
12579+ short manuf;
12580+ short impl;
12581+ char *cpu_name;
12582+ char *pmu_name;
12583 };
12584
12585 struct cpu_fp_info {
12586- short manuf;
12587- short impl;
12588- char fpu_vers;
12589- char* fp_name;
12590+ short manuf;
12591+ short impl;
12592+ char fpu_vers;
12593+ char* fp_name;
12594 };
12595
12596 static struct cpu_fp_info linux_sparc_fpu[] = {
12597@@ -49,23 +50,24 @@ static struct cpu_fp_info linux_sparc_fp
12598 #define NSPARCFPU ARRAY_SIZE(linux_sparc_fpu)
12599
12600 static struct cpu_iu_info linux_sparc_chips[] = {
12601- { 0x17, 0x10, "TI UltraSparc I (SpitFire)"},
12602- { 0x22, 0x10, "TI UltraSparc I (SpitFire)"},
12603- { 0x17, 0x11, "TI UltraSparc II (BlackBird)"},
12604- { 0x17, 0x12, "TI UltraSparc IIi (Sabre)"},
12605- { 0x17, 0x13, "TI UltraSparc IIe (Hummingbird)"},
12606- { 0x3e, 0x14, "TI UltraSparc III (Cheetah)"},
12607- { 0x3e, 0x15, "TI UltraSparc III+ (Cheetah+)"},
12608- { 0x3e, 0x16, "TI UltraSparc IIIi (Jalapeno)"},
12609- { 0x3e, 0x18, "TI UltraSparc IV (Jaguar)"},
12610- { 0x3e, 0x19, "TI UltraSparc IV+ (Panther)"},
12611- { 0x3e, 0x22, "TI UltraSparc IIIi+ (Serrano)"},
12612-};
12613+ { 0x17, 0x10, "TI UltraSparc I (SpitFire)", "ultra12"},
12614+ { 0x22, 0x10, "TI UltraSparc I (SpitFire)", "ultra12"},
12615+ { 0x17, 0x11, "TI UltraSparc II (BlackBird)", "ultra12"},
12616+ { 0x17, 0x12, "TI UltraSparc IIi (Sabre)", "ultra12"},
12617+ { 0x17, 0x13, "TI UltraSparc IIe (Hummingbird)", "ultra12"},
12618+ { 0x3e, 0x14, "TI UltraSparc III (Cheetah)", "ultra3"},
12619+ { 0x3e, 0x15, "TI UltraSparc III+ (Cheetah+)", "ultra3+"},
12620+ { 0x3e, 0x16, "TI UltraSparc IIIi (Jalapeno)", "ultra3i"},
12621+ { 0x3e, 0x18, "TI UltraSparc IV (Jaguar)", "ultra4"},
12622+ { 0x3e, 0x19, "TI UltraSparc IV+ (Panther)", "ultra4+"},
12623+ { 0x3e, 0x22, "TI UltraSparc IIIi+ (Serrano)", "ultra3+"},
12624+ };
12625
12626 #define NSPARCCHIPS ARRAY_SIZE(linux_sparc_chips)
12627
12628 char *sparc_cpu_type;
12629 char *sparc_fpu_type;
12630+char *sparc_pmu_type;
12631
12632 static void __init sun4v_cpu_probe(void)
12633 {
12634@@ -73,11 +75,13 @@ static void __init sun4v_cpu_probe(void)
12635 case SUN4V_CHIP_NIAGARA1:
12636 sparc_cpu_type = "UltraSparc T1 (Niagara)";
12637 sparc_fpu_type = "UltraSparc T1 integrated FPU";
12638+ sparc_pmu_type = "niagara";
12639 break;
12640
12641 case SUN4V_CHIP_NIAGARA2:
12642 sparc_cpu_type = "UltraSparc T2 (Niagara2)";
12643 sparc_fpu_type = "UltraSparc T2 integrated FPU";
12644+ sparc_pmu_type = "niagara2";
12645 break;
12646
12647 default:
12648@@ -85,6 +89,7 @@ static void __init sun4v_cpu_probe(void)
12649 prom_cpu_compatible);
12650 sparc_cpu_type = "Unknown SUN4V CPU";
12651 sparc_fpu_type = "Unknown SUN4V FPU";
12652+ sparc_pmu_type = "Unknown SUN4V PMU";
12653 break;
12654 }
12655 }
12656@@ -117,6 +122,8 @@ retry:
12657 if (linux_sparc_chips[i].impl == impl) {
12658 sparc_cpu_type =
12659 linux_sparc_chips[i].cpu_name;
12660+ sparc_pmu_type =
12661+ linux_sparc_chips[i].pmu_name;
12662 break;
12663 }
12664 }
12665@@ -134,7 +141,7 @@ retry:
12666 printk("DEBUG: manuf[%lx] impl[%lx]\n",
12667 manuf, impl);
12668 }
12669- sparc_cpu_type = "Unknown CPU";
12670+ sparc_pmu_type = "Unknown PMU";
12671 }
12672
12673 for (i = 0; i < NSPARCFPU; i++) {
12674--- a/arch/sparc64/kernel/hvcalls.S
12675+++ b/arch/sparc64/kernel/hvcalls.S
12676@@ -884,3 +884,44 @@ sun4v_mmu_demap_all:
12677 retl
12678 nop
12679 .size sun4v_mmu_demap_all, .-sun4v_mmu_demap_all
12680+
12681+ .globl sun4v_niagara_getperf
12682+ .type sun4v_niagara_getperf,#function
12683+sun4v_niagara_getperf:
12684+ mov %o0, %o4
12685+ mov HV_FAST_GET_PERFREG, %o5
12686+ ta HV_FAST_TRAP
12687+ stx %o1, [%o4]
12688+ retl
12689+ nop
12690+ .size sun4v_niagara_getperf, .-sun4v_niagara_getperf
12691+
12692+ .globl sun4v_niagara_setperf
12693+ .type sun4v_niagara_setperf,#function
12694+sun4v_niagara_setperf:
12695+ mov HV_FAST_SET_PERFREG, %o5
12696+ ta HV_FAST_TRAP
12697+ retl
12698+ nop
12699+ .size sun4v_niagara_setperf, .-sun4v_niagara_setperf
12700+
12701+ .globl sun4v_niagara2_getperf
12702+ .type sun4v_niagara2_getperf,#function
12703+sun4v_niagara2_getperf:
12704+ mov %o0, %o4
12705+ mov HV_FAST_N2_GET_PERFREG, %o5
12706+ ta HV_FAST_TRAP
12707+ stx %o1, [%o4]
12708+ retl
12709+ nop
12710+ .size sun4v_niagara2_getperf, .-sun4v_niagara2_getperf
12711+
12712+ .globl sun4v_niagara2_setperf
12713+ .type sun4v_niagara2_setperf,#function
12714+sun4v_niagara2_setperf:
12715+ mov HV_FAST_N2_SET_PERFREG, %o5
12716+ ta HV_FAST_TRAP
12717+ retl
12718+ nop
12719+ .size sun4v_niagara2_setperf, .-sun4v_niagara2_setperf
12720+
12721--- a/arch/sparc64/kernel/irq.c
12722+++ b/arch/sparc64/kernel/irq.c
12723@@ -758,6 +758,20 @@ void handler_irq(int irq, struct pt_regs
12724 irq_exit();
12725 set_irq_regs(old_regs);
12726 }
12727+static void unhandled_perf_irq(struct pt_regs *regs)
12728+{
12729+ unsigned long pcr, pic;
12730+
12731+ read_pcr(pcr);
12732+ read_pic(pic);
12733+
12734+ write_pcr(0);
12735+
12736+ printk(KERN_EMERG "CPU %d: Got unexpected perf counter IRQ.\n",
12737+ smp_processor_id());
12738+ printk(KERN_EMERG "CPU %d: PCR[%016lx] PIC[%016lx]\n",
12739+ smp_processor_id(), pcr, pic);
12740+}
12741
12742 void do_softirq(void)
12743 {
12744@@ -785,6 +799,55 @@ void do_softirq(void)
12745 local_irq_restore(flags);
12746 }
12747
12748+/* Almost a direct copy of the powerpc PMC code. */
12749+static DEFINE_SPINLOCK(perf_irq_lock);
12750+static void *perf_irq_owner_caller; /* mostly for debugging */
12751+static void (*perf_irq)(struct pt_regs *regs) = unhandled_perf_irq;
12752+
12753+/* Invoked from level 15 PIL handler in trap table. */
12754+void perfctr_irq(int irq, struct pt_regs *regs)
12755+{
12756+ clear_softint(1 << irq);
12757+ perf_irq(regs);
12758+}
12759+
12760+int register_perfctr_intr(void (*handler)(struct pt_regs *))
12761+{
12762+ int ret;
12763+
12764+ if (!handler)
12765+ return -EINVAL;
12766+
12767+ spin_lock(&perf_irq_lock);
12768+ if (perf_irq != unhandled_perf_irq) {
12769+ printk(KERN_WARNING "register_perfctr_intr: "
12770+ "perf IRQ busy (reserved by caller %p)\n",
12771+ perf_irq_owner_caller);
12772+ ret = -EBUSY;
12773+ goto out;
12774+ }
12775+
12776+ perf_irq_owner_caller = __builtin_return_address(0);
12777+ perf_irq = handler;
12778+
12779+ ret = 0;
12780+out:
12781+ spin_unlock(&perf_irq_lock);
12782+
12783+ return ret;
12784+}
12785+EXPORT_SYMBOL_GPL(register_perfctr_intr);
12786+
12787+void release_perfctr_intr(void (*handler)(struct pt_regs *))
12788+{
12789+ spin_lock(&perf_irq_lock);
12790+ perf_irq_owner_caller = NULL;
12791+ perf_irq = unhandled_perf_irq;
12792+ spin_unlock(&perf_irq_lock);
12793+}
12794+EXPORT_SYMBOL_GPL(release_perfctr_intr);
12795+
12796+
12797 #ifdef CONFIG_HOTPLUG_CPU
12798 void fixup_irqs(void)
12799 {
12800--- a/arch/sparc64/kernel/process.c
12801+++ b/arch/sparc64/kernel/process.c
12802@@ -30,6 +30,7 @@
12803 #include <linux/cpu.h>
12804 #include <linux/elfcore.h>
12805 #include <linux/sysrq.h>
12806+#include <linux/perfmon_kern.h>
12807
12808 #include <asm/oplib.h>
12809 #include <asm/uaccess.h>
12810@@ -385,11 +386,7 @@ void exit_thread(void)
12811 t->utraps[0]--;
12812 }
12813
12814- if (test_and_clear_thread_flag(TIF_PERFCTR)) {
12815- t->user_cntd0 = t->user_cntd1 = NULL;
12816- t->pcr_reg = 0;
12817- write_pcr(0);
12818- }
12819+ pfm_exit_thread();
12820 }
12821
12822 void flush_thread(void)
12823@@ -411,13 +408,6 @@ void flush_thread(void)
12824
12825 set_thread_wsaved(0);
12826
12827- /* Turn off performance counters if on. */
12828- if (test_and_clear_thread_flag(TIF_PERFCTR)) {
12829- t->user_cntd0 = t->user_cntd1 = NULL;
12830- t->pcr_reg = 0;
12831- write_pcr(0);
12832- }
12833-
12834 /* Clear FPU register state. */
12835 t->fpsaved[0] = 0;
12836
12837@@ -631,16 +621,6 @@ int copy_thread(int nr, unsigned long cl
12838 t->kregs->u_regs[UREG_FP] =
12839 ((unsigned long) child_sf) - STACK_BIAS;
12840
12841- /* Special case, if we are spawning a kernel thread from
12842- * a userspace task (usermode helper, NFS or similar), we
12843- * must disable performance counters in the child because
12844- * the address space and protection realm are changing.
12845- */
12846- if (t->flags & _TIF_PERFCTR) {
12847- t->user_cntd0 = t->user_cntd1 = NULL;
12848- t->pcr_reg = 0;
12849- t->flags &= ~_TIF_PERFCTR;
12850- }
12851 t->flags |= ((long)ASI_P << TI_FLAG_CURRENT_DS_SHIFT);
12852 t->kregs->u_regs[UREG_G6] = (unsigned long) t;
12853 t->kregs->u_regs[UREG_G4] = (unsigned long) t->task;
12854@@ -673,6 +653,8 @@ int copy_thread(int nr, unsigned long cl
12855 if (clone_flags & CLONE_SETTLS)
12856 t->kregs->u_regs[UREG_G7] = regs->u_regs[UREG_I3];
12857
12858+ pfm_copy_thread(p);
12859+
12860 return 0;
12861 }
12862
12863--- a/arch/sparc64/kernel/rtrap.S
12864+++ b/arch/sparc64/kernel/rtrap.S
12865@@ -65,55 +65,14 @@ __handle_user_windows:
12866 ba,pt %xcc, __handle_user_windows_continue
12867
12868 andn %l1, %l4, %l1
12869-__handle_perfctrs:
12870- call update_perfctrs
12871- wrpr %g0, RTRAP_PSTATE, %pstate
12872- wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate
12873- ldub [%g6 + TI_WSAVED], %o2
12874- brz,pt %o2, 1f
12875- nop
12876- /* Redo userwin+sched+sig checks */
12877- call fault_in_user_windows
12878-
12879- wrpr %g0, RTRAP_PSTATE, %pstate
12880- wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate
12881- ldx [%g6 + TI_FLAGS], %l0
12882- andcc %l0, _TIF_NEED_RESCHED, %g0
12883- be,pt %xcc, 1f
12884-
12885- nop
12886- call schedule
12887- wrpr %g0, RTRAP_PSTATE, %pstate
12888- wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate
12889- ldx [%g6 + TI_FLAGS], %l0
12890-1: andcc %l0, _TIF_DO_NOTIFY_RESUME_MASK, %g0
12891-
12892- be,pt %xcc, __handle_perfctrs_continue
12893- sethi %hi(TSTATE_PEF), %o0
12894- mov %l5, %o1
12895- add %sp, PTREGS_OFF, %o0
12896- mov %l0, %o2
12897- call do_notify_resume
12898-
12899- wrpr %g0, RTRAP_PSTATE, %pstate
12900- wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate
12901- /* Signal delivery can modify pt_regs tstate, so we must
12902- * reload it.
12903- */
12904- ldx [%sp + PTREGS_OFF + PT_V9_TSTATE], %l1
12905- sethi %hi(0xf << 20), %l4
12906- and %l1, %l4, %l4
12907- andn %l1, %l4, %l1
12908- ba,pt %xcc, __handle_perfctrs_continue
12909-
12910- sethi %hi(TSTATE_PEF), %o0
12911 __handle_userfpu:
12912 rd %fprs, %l5
12913 andcc %l5, FPRS_FEF, %g0
12914 sethi %hi(TSTATE_PEF), %o0
12915 be,a,pn %icc, __handle_userfpu_continue
12916 andn %l1, %o0, %l1
12917- ba,a,pt %xcc, __handle_userfpu_continue
12918+ ba,pt %xcc, __handle_userfpu_continue
12919+ nop
12920
12921 __handle_signal:
12922 mov %l5, %o1
12923@@ -202,12 +161,8 @@ __handle_signal_continue:
12924 brnz,pn %o2, __handle_user_windows
12925 nop
12926 __handle_user_windows_continue:
12927- ldx [%g6 + TI_FLAGS], %l5
12928- andcc %l5, _TIF_PERFCTR, %g0
12929 sethi %hi(TSTATE_PEF), %o0
12930- bne,pn %xcc, __handle_perfctrs
12931-__handle_perfctrs_continue:
12932- andcc %l1, %o0, %g0
12933+ andcc %l1, %o0, %g0
12934
12935 /* This fpdepth clear is necessary for non-syscall rtraps only */
12936 user_nowork:
12937--- a/arch/sparc64/kernel/setup.c
12938+++ b/arch/sparc64/kernel/setup.c
12939@@ -352,6 +352,7 @@ static int show_cpuinfo(struct seq_file
12940 seq_printf(m,
12941 "cpu\t\t: %s\n"
12942 "fpu\t\t: %s\n"
12943+ "pmu\t\t: %s\n"
12944 "prom\t\t: %s\n"
12945 "type\t\t: %s\n"
12946 "ncpus probed\t: %d\n"
12947@@ -364,6 +365,7 @@ static int show_cpuinfo(struct seq_file
12948 ,
12949 sparc_cpu_type,
12950 sparc_fpu_type,
12951+ sparc_pmu_type,
12952 prom_version,
12953 ((tlb_type == hypervisor) ?
12954 "sun4v" :
12955--- a/arch/sparc64/kernel/signal.c
12956+++ b/arch/sparc64/kernel/signal.c
12957@@ -23,6 +23,7 @@
12958 #include <linux/tty.h>
12959 #include <linux/binfmts.h>
12960 #include <linux/bitops.h>
12961+#include <linux/perfmon_kern.h>
12962
12963 #include <asm/uaccess.h>
12964 #include <asm/ptrace.h>
12965@@ -608,6 +609,9 @@ static void do_signal(struct pt_regs *re
12966
12967 void do_notify_resume(struct pt_regs *regs, unsigned long orig_i0, unsigned long thread_info_flags)
12968 {
12969+ if (thread_info_flags & _TIF_PERFMON_WORK)
12970+ pfm_handle_work(regs);
12971+
12972 if (thread_info_flags & _TIF_SIGPENDING)
12973 do_signal(regs, orig_i0);
12974 if (thread_info_flags & _TIF_NOTIFY_RESUME) {
12975--- a/arch/sparc64/kernel/sys_sparc.c
12976+++ b/arch/sparc64/kernel/sys_sparc.c
12977@@ -26,7 +26,6 @@
12978
12979 #include <asm/uaccess.h>
12980 #include <asm/utrap.h>
12981-#include <asm/perfctr.h>
12982 #include <asm/unistd.h>
12983
12984 #include "entry.h"
12985@@ -788,107 +787,11 @@ SYSCALL_DEFINE5(rt_sigaction, int, sig,
12986 return ret;
12987 }
12988
12989-/* Invoked by rtrap code to update performance counters in
12990- * user space.
12991- */
12992-asmlinkage void update_perfctrs(void)
12993-{
12994- unsigned long pic, tmp;
12995-
12996- read_pic(pic);
12997- tmp = (current_thread_info()->kernel_cntd0 += (unsigned int)pic);
12998- __put_user(tmp, current_thread_info()->user_cntd0);
12999- tmp = (current_thread_info()->kernel_cntd1 += (pic >> 32));
13000- __put_user(tmp, current_thread_info()->user_cntd1);
13001- reset_pic();
13002-}
13003-
13004 SYSCALL_DEFINE4(perfctr, int, opcode, unsigned long, arg0,
13005 unsigned long, arg1, unsigned long, arg2)
13006 {
13007- int err = 0;
13008-
13009- switch(opcode) {
13010- case PERFCTR_ON:
13011- current_thread_info()->pcr_reg = arg2;
13012- current_thread_info()->user_cntd0 = (u64 __user *) arg0;
13013- current_thread_info()->user_cntd1 = (u64 __user *) arg1;
13014- current_thread_info()->kernel_cntd0 =
13015- current_thread_info()->kernel_cntd1 = 0;
13016- write_pcr(arg2);
13017- reset_pic();
13018- set_thread_flag(TIF_PERFCTR);
13019- break;
13020-
13021- case PERFCTR_OFF:
13022- err = -EINVAL;
13023- if (test_thread_flag(TIF_PERFCTR)) {
13024- current_thread_info()->user_cntd0 =
13025- current_thread_info()->user_cntd1 = NULL;
13026- current_thread_info()->pcr_reg = 0;
13027- write_pcr(0);
13028- clear_thread_flag(TIF_PERFCTR);
13029- err = 0;
13030- }
13031- break;
13032-
13033- case PERFCTR_READ: {
13034- unsigned long pic, tmp;
13035-
13036- if (!test_thread_flag(TIF_PERFCTR)) {
13037- err = -EINVAL;
13038- break;
13039- }
13040- read_pic(pic);
13041- tmp = (current_thread_info()->kernel_cntd0 += (unsigned int)pic);
13042- err |= __put_user(tmp, current_thread_info()->user_cntd0);
13043- tmp = (current_thread_info()->kernel_cntd1 += (pic >> 32));
13044- err |= __put_user(tmp, current_thread_info()->user_cntd1);
13045- reset_pic();
13046- break;
13047- }
13048-
13049- case PERFCTR_CLRPIC:
13050- if (!test_thread_flag(TIF_PERFCTR)) {
13051- err = -EINVAL;
13052- break;
13053- }
13054- current_thread_info()->kernel_cntd0 =
13055- current_thread_info()->kernel_cntd1 = 0;
13056- reset_pic();
13057- break;
13058-
13059- case PERFCTR_SETPCR: {
13060- u64 __user *user_pcr = (u64 __user *)arg0;
13061-
13062- if (!test_thread_flag(TIF_PERFCTR)) {
13063- err = -EINVAL;
13064- break;
13065- }
13066- err |= __get_user(current_thread_info()->pcr_reg, user_pcr);
13067- write_pcr(current_thread_info()->pcr_reg);
13068- current_thread_info()->kernel_cntd0 =
13069- current_thread_info()->kernel_cntd1 = 0;
13070- reset_pic();
13071- break;
13072- }
13073-
13074- case PERFCTR_GETPCR: {
13075- u64 __user *user_pcr = (u64 __user *)arg0;
13076-
13077- if (!test_thread_flag(TIF_PERFCTR)) {
13078- err = -EINVAL;
13079- break;
13080- }
13081- err |= __put_user(current_thread_info()->pcr_reg, user_pcr);
13082- break;
13083- }
13084-
13085- default:
13086- err = -EINVAL;
13087- break;
13088- };
13089- return err;
13090+ /* Superseded by perfmon2 */
13091+ return -ENOSYS;
13092 }
13093
13094 /*
13095--- a/arch/sparc64/kernel/syscalls.S
13096+++ b/arch/sparc64/kernel/syscalls.S
13097@@ -117,26 +117,9 @@ ret_from_syscall:
13098 stb %g0, [%g6 + TI_NEW_CHILD]
13099 ldx [%g6 + TI_FLAGS], %l0
13100 call schedule_tail
13101- mov %g7, %o0
13102- andcc %l0, _TIF_PERFCTR, %g0
13103- be,pt %icc, 1f
13104- nop
13105- ldx [%g6 + TI_PCR], %o7
13106- wr %g0, %o7, %pcr
13107-
13108- /* Blackbird errata workaround. See commentary in
13109- * smp.c:smp_percpu_timer_interrupt() for more
13110- * information.
13111- */
13112- ba,pt %xcc, 99f
13113- nop
13114-
13115- .align 64
13116-99: wr %g0, %g0, %pic
13117- rd %pic, %g0
13118-
13119-1: ba,pt %xcc, ret_sys_call
13120- ldx [%sp + PTREGS_OFF + PT_V9_I0], %o0
13121+ mov %g7, %o0
13122+ ba,pt %xcc, ret_sys_call
13123+ ldx [%sp + PTREGS_OFF + PT_V9_I0], %o0
13124
13125 .globl sparc_exit
13126 .type sparc_exit,#function
13127--- a/arch/sparc64/kernel/systbls.S
13128+++ b/arch/sparc64/kernel/systbls.S
13129@@ -82,7 +82,9 @@ sys_call_table32:
13130 .word compat_sys_set_mempolicy, compat_sys_kexec_load, compat_sys_move_pages, sys_getcpu, compat_sys_epoll_pwait
13131 /*310*/ .word compat_sys_utimensat, compat_sys_signalfd, sys_timerfd_create, sys_eventfd, compat_sys_fallocate
13132 .word compat_sys_timerfd_settime, compat_sys_timerfd_gettime, compat_sys_signalfd4, sys_eventfd2, sys_epoll_create1
13133-/*320*/ .word sys_dup3, sys_pipe2, sys_inotify_init1
13134+/*320*/ .word sys_dup3, sys_pipe2, sys_inotify_init1, sys_pfm_create_context, sys_pfm_write_pmcs
13135+ .word sys_pfm_write_pmds, sys_pfm_read_pmds, sys_pfm_load_context, sys_pfm_start, sys_pfm_stop
13136+/*330*/ .word sys_pfm_restart, sys_pfm_create_evtsets, sys_pfm_getinfo_evtsets, sys_pfm_delete_evtsets, sys_pfm_unload_context
13137
13138 #endif /* CONFIG_COMPAT */
13139
13140@@ -156,4 +158,6 @@ sys_call_table:
13141 .word sys_set_mempolicy, sys_kexec_load, sys_move_pages, sys_getcpu, sys_epoll_pwait
13142 /*310*/ .word sys_utimensat, sys_signalfd, sys_timerfd_create, sys_eventfd, sys_fallocate
13143 .word sys_timerfd_settime, sys_timerfd_gettime, sys_signalfd4, sys_eventfd2, sys_epoll_create1
13144-/*320*/ .word sys_dup3, sys_pipe2, sys_inotify_init1
13145+/*320*/ .word sys_dup3, sys_pipe2, sys_inotify_init1, sys_pfm_create_context, sys_pfm_write_pmcs
13146+ .word sys_pfm_write_pmds, sys_pfm_read_pmds, sys_pfm_load_context, sys_pfm_start, sys_pfm_stop
13147+/*330*/ .word sys_pfm_restart, sys_pfm_create_evtsets, sys_pfm_getinfo_evtsets, sys_pfm_delete_evtsets, sys_pfm_unload_context
13148--- a/arch/sparc64/kernel/traps.c
13149+++ b/arch/sparc64/kernel/traps.c
13150@@ -2485,85 +2485,89 @@ extern void tsb_config_offsets_are_bolix
13151 /* Only invoked on boot processor. */
13152 void __init trap_init(void)
13153 {
13154- /* Compile time sanity check. */
13155- if (TI_TASK != offsetof(struct thread_info, task) ||
13156- TI_FLAGS != offsetof(struct thread_info, flags) ||
13157- TI_CPU != offsetof(struct thread_info, cpu) ||
13158- TI_FPSAVED != offsetof(struct thread_info, fpsaved) ||
13159- TI_KSP != offsetof(struct thread_info, ksp) ||
13160- TI_FAULT_ADDR != offsetof(struct thread_info, fault_address) ||
13161- TI_KREGS != offsetof(struct thread_info, kregs) ||
13162- TI_UTRAPS != offsetof(struct thread_info, utraps) ||
13163- TI_EXEC_DOMAIN != offsetof(struct thread_info, exec_domain) ||
13164- TI_REG_WINDOW != offsetof(struct thread_info, reg_window) ||
13165- TI_RWIN_SPTRS != offsetof(struct thread_info, rwbuf_stkptrs) ||
13166- TI_GSR != offsetof(struct thread_info, gsr) ||
13167- TI_XFSR != offsetof(struct thread_info, xfsr) ||
13168- TI_USER_CNTD0 != offsetof(struct thread_info, user_cntd0) ||
13169- TI_USER_CNTD1 != offsetof(struct thread_info, user_cntd1) ||
13170- TI_KERN_CNTD0 != offsetof(struct thread_info, kernel_cntd0) ||
13171- TI_KERN_CNTD1 != offsetof(struct thread_info, kernel_cntd1) ||
13172- TI_PCR != offsetof(struct thread_info, pcr_reg) ||
13173- TI_PRE_COUNT != offsetof(struct thread_info, preempt_count) ||
13174- TI_NEW_CHILD != offsetof(struct thread_info, new_child) ||
13175- TI_SYS_NOERROR != offsetof(struct thread_info, syscall_noerror) ||
13176- TI_RESTART_BLOCK != offsetof(struct thread_info, restart_block) ||
13177- TI_KUNA_REGS != offsetof(struct thread_info, kern_una_regs) ||
13178- TI_KUNA_INSN != offsetof(struct thread_info, kern_una_insn) ||
13179- TI_FPREGS != offsetof(struct thread_info, fpregs) ||
13180- (TI_FPREGS & (64 - 1)))
13181- thread_info_offsets_are_bolixed_dave();
13182+ BUILD_BUG_ON(TI_TASK != offsetof(struct thread_info, task));
13183+ BUILD_BUG_ON(TI_FLAGS != offsetof(struct thread_info, flags));
13184+ BUILD_BUG_ON(TI_CPU != offsetof(struct thread_info, cpu));
13185+ BUILD_BUG_ON(TI_FPSAVED != offsetof(struct thread_info, fpsaved));
13186+ BUILD_BUG_ON(TI_KSP != offsetof(struct thread_info, ksp));
13187+ BUILD_BUG_ON(TI_FAULT_ADDR !=
13188+ offsetof(struct thread_info, fault_address));
13189+ BUILD_BUG_ON(TI_KREGS != offsetof(struct thread_info, kregs));
13190+ BUILD_BUG_ON(TI_UTRAPS != offsetof(struct thread_info, utraps));
13191+ BUILD_BUG_ON(TI_EXEC_DOMAIN !=
13192+ offsetof(struct thread_info, exec_domain));
13193+ BUILD_BUG_ON(TI_REG_WINDOW !=
13194+ offsetof(struct thread_info, reg_window));
13195+ BUILD_BUG_ON(TI_RWIN_SPTRS !=
13196+ offsetof(struct thread_info, rwbuf_stkptrs));
13197+ BUILD_BUG_ON(TI_GSR != offsetof(struct thread_info, gsr));
13198+ BUILD_BUG_ON(TI_XFSR != offsetof(struct thread_info, xfsr));
13199+ BUILD_BUG_ON(TI_PRE_COUNT !=
13200+ offsetof(struct thread_info, preempt_count));
13201+ BUILD_BUG_ON(TI_NEW_CHILD !=
13202+ offsetof(struct thread_info, new_child));
13203+ BUILD_BUG_ON(TI_SYS_NOERROR !=
13204+ offsetof(struct thread_info, syscall_noerror));
13205+ BUILD_BUG_ON(TI_RESTART_BLOCK !=
13206+ offsetof(struct thread_info, restart_block));
13207+ BUILD_BUG_ON(TI_KUNA_REGS !=
13208+ offsetof(struct thread_info, kern_una_regs));
13209+ BUILD_BUG_ON(TI_KUNA_INSN !=
13210+ offsetof(struct thread_info, kern_una_insn));
13211+ BUILD_BUG_ON(TI_FPREGS != offsetof(struct thread_info, fpregs));
13212+ BUILD_BUG_ON((TI_FPREGS & (64 - 1)));
13213
13214- if (TRAP_PER_CPU_THREAD != offsetof(struct trap_per_cpu, thread) ||
13215- (TRAP_PER_CPU_PGD_PADDR !=
13216- offsetof(struct trap_per_cpu, pgd_paddr)) ||
13217- (TRAP_PER_CPU_CPU_MONDO_PA !=
13218- offsetof(struct trap_per_cpu, cpu_mondo_pa)) ||
13219- (TRAP_PER_CPU_DEV_MONDO_PA !=
13220- offsetof(struct trap_per_cpu, dev_mondo_pa)) ||
13221- (TRAP_PER_CPU_RESUM_MONDO_PA !=
13222- offsetof(struct trap_per_cpu, resum_mondo_pa)) ||
13223- (TRAP_PER_CPU_RESUM_KBUF_PA !=
13224- offsetof(struct trap_per_cpu, resum_kernel_buf_pa)) ||
13225- (TRAP_PER_CPU_NONRESUM_MONDO_PA !=
13226- offsetof(struct trap_per_cpu, nonresum_mondo_pa)) ||
13227- (TRAP_PER_CPU_NONRESUM_KBUF_PA !=
13228- offsetof(struct trap_per_cpu, nonresum_kernel_buf_pa)) ||
13229- (TRAP_PER_CPU_FAULT_INFO !=
13230- offsetof(struct trap_per_cpu, fault_info)) ||
13231- (TRAP_PER_CPU_CPU_MONDO_BLOCK_PA !=
13232- offsetof(struct trap_per_cpu, cpu_mondo_block_pa)) ||
13233- (TRAP_PER_CPU_CPU_LIST_PA !=
13234- offsetof(struct trap_per_cpu, cpu_list_pa)) ||
13235- (TRAP_PER_CPU_TSB_HUGE !=
13236- offsetof(struct trap_per_cpu, tsb_huge)) ||
13237- (TRAP_PER_CPU_TSB_HUGE_TEMP !=
13238- offsetof(struct trap_per_cpu, tsb_huge_temp)) ||
13239- (TRAP_PER_CPU_IRQ_WORKLIST_PA !=
13240- offsetof(struct trap_per_cpu, irq_worklist_pa)) ||
13241- (TRAP_PER_CPU_CPU_MONDO_QMASK !=
13242- offsetof(struct trap_per_cpu, cpu_mondo_qmask)) ||
13243- (TRAP_PER_CPU_DEV_MONDO_QMASK !=
13244- offsetof(struct trap_per_cpu, dev_mondo_qmask)) ||
13245- (TRAP_PER_CPU_RESUM_QMASK !=
13246- offsetof(struct trap_per_cpu, resum_qmask)) ||
13247- (TRAP_PER_CPU_NONRESUM_QMASK !=
13248- offsetof(struct trap_per_cpu, nonresum_qmask)))
13249- trap_per_cpu_offsets_are_bolixed_dave();
13250+ BUILD_BUG_ON(TRAP_PER_CPU_THREAD !=
13251+ offsetof(struct trap_per_cpu, thread));
13252+ BUILD_BUG_ON(TRAP_PER_CPU_PGD_PADDR !=
13253+ offsetof(struct trap_per_cpu, pgd_paddr));
13254+ BUILD_BUG_ON(TRAP_PER_CPU_CPU_MONDO_PA !=
13255+ offsetof(struct trap_per_cpu, cpu_mondo_pa));
13256+ BUILD_BUG_ON(TRAP_PER_CPU_DEV_MONDO_PA !=
13257+ offsetof(struct trap_per_cpu, dev_mondo_pa));
13258+ BUILD_BUG_ON(TRAP_PER_CPU_RESUM_MONDO_PA !=
13259+ offsetof(struct trap_per_cpu, resum_mondo_pa));
13260+ BUILD_BUG_ON(TRAP_PER_CPU_RESUM_KBUF_PA !=
13261+ offsetof(struct trap_per_cpu, resum_kernel_buf_pa));
13262+ BUILD_BUG_ON(TRAP_PER_CPU_NONRESUM_MONDO_PA !=
13263+ offsetof(struct trap_per_cpu, nonresum_mondo_pa));
13264+ BUILD_BUG_ON(TRAP_PER_CPU_NONRESUM_KBUF_PA !=
13265+ offsetof(struct trap_per_cpu, nonresum_kernel_buf_pa));
13266+ BUILD_BUG_ON(TRAP_PER_CPU_FAULT_INFO !=
13267+ offsetof(struct trap_per_cpu, fault_info));
13268+ BUILD_BUG_ON(TRAP_PER_CPU_CPU_MONDO_BLOCK_PA !=
13269+ offsetof(struct trap_per_cpu, cpu_mondo_block_pa));
13270+ BUILD_BUG_ON(TRAP_PER_CPU_CPU_LIST_PA !=
13271+ offsetof(struct trap_per_cpu, cpu_list_pa));
13272+ BUILD_BUG_ON(TRAP_PER_CPU_TSB_HUGE !=
13273+ offsetof(struct trap_per_cpu, tsb_huge));
13274+ BUILD_BUG_ON(TRAP_PER_CPU_TSB_HUGE_TEMP !=
13275+ offsetof(struct trap_per_cpu, tsb_huge_temp));
13276+#if 0
13277+ BUILD_BUG_ON(TRAP_PER_CPU_IRQ_WORKLIST !=
13278+ offsetof(struct trap_per_cpu, irq_worklist));
13279+#endif
13280+ BUILD_BUG_ON(TRAP_PER_CPU_CPU_MONDO_QMASK !=
13281+ offsetof(struct trap_per_cpu, cpu_mondo_qmask));
13282+ BUILD_BUG_ON(TRAP_PER_CPU_DEV_MONDO_QMASK !=
13283+ offsetof(struct trap_per_cpu, dev_mondo_qmask));
13284+ BUILD_BUG_ON(TRAP_PER_CPU_RESUM_QMASK !=
13285+ offsetof(struct trap_per_cpu, resum_qmask));
13286+ BUILD_BUG_ON(TRAP_PER_CPU_NONRESUM_QMASK !=
13287+ offsetof(struct trap_per_cpu, nonresum_qmask));
13288
13289- if ((TSB_CONFIG_TSB !=
13290- offsetof(struct tsb_config, tsb)) ||
13291- (TSB_CONFIG_RSS_LIMIT !=
13292- offsetof(struct tsb_config, tsb_rss_limit)) ||
13293- (TSB_CONFIG_NENTRIES !=
13294- offsetof(struct tsb_config, tsb_nentries)) ||
13295- (TSB_CONFIG_REG_VAL !=
13296- offsetof(struct tsb_config, tsb_reg_val)) ||
13297- (TSB_CONFIG_MAP_VADDR !=
13298- offsetof(struct tsb_config, tsb_map_vaddr)) ||
13299- (TSB_CONFIG_MAP_PTE !=
13300- offsetof(struct tsb_config, tsb_map_pte)))
13301- tsb_config_offsets_are_bolixed_dave();
13302+ BUILD_BUG_ON(TSB_CONFIG_TSB !=
13303+ offsetof(struct tsb_config, tsb));
13304+ BUILD_BUG_ON(TSB_CONFIG_RSS_LIMIT !=
13305+ offsetof(struct tsb_config, tsb_rss_limit));
13306+ BUILD_BUG_ON(TSB_CONFIG_NENTRIES !=
13307+ offsetof(struct tsb_config, tsb_nentries));
13308+ BUILD_BUG_ON(TSB_CONFIG_REG_VAL !=
13309+ offsetof(struct tsb_config, tsb_reg_val));
13310+ BUILD_BUG_ON(TSB_CONFIG_MAP_VADDR !=
13311+ offsetof(struct tsb_config, tsb_map_vaddr));
13312+ BUILD_BUG_ON(TSB_CONFIG_MAP_PTE !=
13313+ offsetof(struct tsb_config, tsb_map_pte));
13314
13315 /* Attach to the address space of init_task. On SMP we
13316 * do this in smp.c:smp_callin for other cpus.
13317--- a/arch/sparc64/kernel/ttable.S
13318+++ b/arch/sparc64/kernel/ttable.S
13319@@ -72,7 +72,7 @@ tl0_irq8: BTRAP(0x48)
13320 tl0_irq9: BTRAP(0x49)
13321 tl0_irq10: BTRAP(0x4a) BTRAP(0x4b) BTRAP(0x4c) BTRAP(0x4d)
13322 tl0_irq14: TRAP_IRQ(timer_interrupt, 14)
13323-tl0_irq15: TRAP_IRQ(handler_irq, 15)
13324+tl0_irq15: TRAP_IRQ(perfctr_irq, 15)
13325 tl0_resv050: BTRAP(0x50) BTRAP(0x51) BTRAP(0x52) BTRAP(0x53) BTRAP(0x54) BTRAP(0x55)
13326 tl0_resv056: BTRAP(0x56) BTRAP(0x57) BTRAP(0x58) BTRAP(0x59) BTRAP(0x5a) BTRAP(0x5b)
13327 tl0_resv05c: BTRAP(0x5c) BTRAP(0x5d) BTRAP(0x5e) BTRAP(0x5f)
13328--- /dev/null
13329+++ b/arch/sparc64/perfmon/Kconfig
13330@@ -0,0 +1,26 @@
13331+menu "Hardware Performance Monitoring support"
13332+config PERFMON
13333+ bool "Perfmon2 performance monitoring interface"
13334+ default n
13335+ help
13336+ Enables the perfmon2 interface to access the hardware
13337+ performance counters. See <http://perfmon2.sf.net/> for
13338+ more details.
13339+
13340+config PERFMON_DEBUG
13341+ bool "Perfmon debugging"
13342+ depends on PERFMON
13343+ default n
13344+ help
13345+ Enables perfmon debugging support
13346+
13347+config PERFMON_DEBUG_FS
13348+ bool "Enable perfmon statistics reporting via debugfs"
13349+ default y
13350+ depends on PERFMON && DEBUG_FS
13351+ help
13352+ Enable collection and reporting of perfmon timing statistics under
13353+ debugfs. This is used for debugging and performance analysis of the
13354+ subsystem. The debugfs filesystem must be mounted.
13355+
13356+endmenu
13357--- /dev/null
13358+++ b/arch/sparc64/perfmon/Makefile
13359@@ -0,0 +1 @@
13360+obj-$(CONFIG_PERFMON) += perfmon.o
13361--- /dev/null
13362+++ b/arch/sparc64/perfmon/perfmon.c
13363@@ -0,0 +1,422 @@
13364+/* perfmon.c: sparc64 perfmon support
13365+ *
13366+ * Copyright (C) 2007 David S. Miller (davem@davemloft.net)
13367+ */
13368+
13369+#include <linux/kernel.h>
13370+#include <linux/module.h>
13371+#include <linux/irq.h>
13372+#include <linux/perfmon_kern.h>
13373+
13374+#include <asm/system.h>
13375+#include <asm/spitfire.h>
13376+#include <asm/hypervisor.h>
13377+
13378+struct pcr_ops {
13379+ void (*write)(u64);
13380+ u64 (*read)(void);
13381+};
13382+
13383+static void direct_write_pcr(u64 val)
13384+{
13385+ write_pcr(val);
13386+}
13387+
13388+static u64 direct_read_pcr(void)
13389+{
13390+ u64 pcr;
13391+
13392+ read_pcr(pcr);
13393+
13394+ return pcr;
13395+}
13396+
13397+static struct pcr_ops direct_pcr_ops = {
13398+ .write = direct_write_pcr,
13399+ .read = direct_read_pcr,
13400+};
13401+
13402+/* Using the hypervisor call is needed so that we can set the
13403+ * hypervisor trace bit correctly, which is hyperprivileged.
13404+ */
13405+static void n2_write_pcr(u64 val)
13406+{
13407+ unsigned long ret;
13408+
13409+ ret = sun4v_niagara2_setperf(HV_N2_PERF_SPARC_CTL, val);
13410+	if (ret != HV_EOK)
13411+ write_pcr(val);
13412+}
13413+
13414+static u64 n2_read_pcr(void)
13415+{
13416+ u64 pcr;
13417+
13418+ read_pcr(pcr);
13419+
13420+ return pcr;
13421+}
13422+
13423+static struct pcr_ops n2_pcr_ops = {
13424+ .write = n2_write_pcr,
13425+ .read = n2_read_pcr,
13426+};
13427+
13428+static struct pcr_ops *pcr_ops;
13429+
13430+void pfm_arch_write_pmc(struct pfm_context *ctx,
13431+ unsigned int cnum, u64 value)
13432+{
13433+ /*
13434+ * we only write to the actual register when monitoring is
13435+ * active (pfm_start was issued)
13436+ */
13437+ if (ctx && ctx->flags.started == 0)
13438+ return;
13439+
13440+ pcr_ops->write(value);
13441+}
13442+
13443+u64 pfm_arch_read_pmc(struct pfm_context *ctx, unsigned int cnum)
13444+{
13445+ return pcr_ops->read();
13446+}
13447+
13448+/*
13449+ * collect pending overflowed PMDs. Called from pfm_ctxsw()
13450+ * and from PMU interrupt handler. Must fill in set->povfl_pmds[]
13451+ * and set->npend_ovfls. Interrupts are masked
13452+ */
13453+static void __pfm_get_ovfl_pmds(struct pfm_context *ctx, struct pfm_event_set *set)
13454+{
13455+ unsigned int max = ctx->regs.max_intr_pmd;
13456+ u64 wmask = 1ULL << pfm_pmu_conf->counter_width;
13457+ u64 *intr_pmds = ctx->regs.intr_pmds;
13458+ u64 *used_mask = set->used_pmds;
13459+ u64 mask[PFM_PMD_BV];
13460+ unsigned int i;
13461+
13462+ bitmap_and(cast_ulp(mask),
13463+ cast_ulp(intr_pmds),
13464+ cast_ulp(used_mask),
13465+ max);
13466+
13467+ /*
13468+ * check all PMD that can generate interrupts
13469+ * (that includes counters)
13470+ */
13471+ for (i = 0; i < max; i++) {
13472+ if (test_bit(i, mask)) {
13473+ u64 new_val = pfm_arch_read_pmd(ctx, i);
13474+
13475+ PFM_DBG_ovfl("pmd%u new_val=0x%llx bit=%d\n",
13476+ i, (unsigned long long)new_val,
13477+ (new_val&wmask) ? 1 : 0);
13478+
13479+ if (new_val & wmask) {
13480+ __set_bit(i, set->povfl_pmds);
13481+ set->npend_ovfls++;
13482+ }
13483+ }
13484+ }
13485+}
13486+
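The test above works because this sparc64 module declares a 31-bit counter width (see pmu_sparc64_pmu_conf further down): wmask selects bit 31, which becomes set once a PIC counter wraps. The following stand-alone sketch is illustration only, not part of the patch; it simply replays that wrap test in plain user-space C.

#include <stdio.h>

int main(void)
{
	unsigned int counter_width = 31;                          /* as in pmu_sparc64_pmu_conf */
	unsigned long long wmask = 1ULL << counter_width;         /* same formula as above */
	unsigned long long before = (1ULL << counter_width) - 2;  /* two ticks from wrapping */
	unsigned long long after  = before + 3;                   /* counter has wrapped past 2^31 */

	printf("before: overflow pending = %d\n", (before & wmask) ? 1 : 0); /* prints 0 */
	printf("after : overflow pending = %d\n", (after  & wmask) ? 1 : 0); /* prints 1 */
	return 0;
}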
13487+static void pfm_stop_active(struct task_struct *task, struct pfm_context *ctx,
13488+ struct pfm_event_set *set)
13489+{
13490+ unsigned int i, max = ctx->regs.max_pmc;
13491+
13492+ /*
13493+ * clear enable bits, assume all pmcs are enable pmcs
13494+ */
13495+ for (i = 0; i < max; i++) {
13496+ if (test_bit(i, set->used_pmcs))
13497+ pfm_arch_write_pmc(ctx, i, 0);
13498+ }
13499+
13500+ if (set->npend_ovfls)
13501+ return;
13502+
13503+ __pfm_get_ovfl_pmds(ctx, set);
13504+}
13505+
13506+/*
13507+ * Called from pfm_ctxsw(). Task is guaranteed to be current.
13508+ * Context is locked. Interrupts are masked. Monitoring is active.
13509+ * PMU access is guaranteed. PMC and PMD registers are live in PMU.
13510+ *
13511+ * for per-thread:
13512+ * must stop monitoring for the task
13513+ *
13514+ * Return:
13515+ * non-zero : did not save PMDs (as part of stopping the PMU)
13516+ * 0 : saved PMDs (no need to save them in caller)
13517+ */
13518+int pfm_arch_ctxswout_thread(struct task_struct *task, struct pfm_context *ctx)
13519+{
13520+ /*
13521+ * disable lazy restore of PMC registers.
13522+ */
13523+ ctx->active_set->priv_flags |= PFM_SETFL_PRIV_MOD_PMCS;
13524+
13525+ pfm_stop_active(task, ctx, ctx->active_set);
13526+
13527+ return 1;
13528+}
13529+
13530+/*
13531+ * Called from pfm_stop() and idle notifier
13532+ *
13533+ * Interrupts are masked. Context is locked. Set is the active set.
13534+ *
13535+ * For per-thread:
13536+ * task is not necessarily current. If not current task, then
13537+ * task is guaranteed stopped and off any cpu. Access to PMU
13538+ * is not guaranteed. Interrupts are masked. Context is locked.
13539+ * Set is the active set.
13540+ *
13541+ * For system-wide:
13542+ * task is current
13543+ *
13544+ * must disable active monitoring. ctx cannot be NULL
13545+ */
13546+void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx)
13547+{
13548+ /*
13549+	 * no need to go through pfm_stop_active()
13550+	 * if we are already stopped
13551+ */
13552+ if (!ctx->flags.started || ctx->state == PFM_CTX_MASKED)
13553+ return;
13554+
13555+ /*
13556+ * stop live registers and collect pending overflow
13557+ */
13558+ if (task == current)
13559+ pfm_stop_active(task, ctx, ctx->active_set);
13560+}
13561+
13562+/*
13563+ * Enable active monitoring. Called from pfm_start() and
13564+ * pfm_arch_unmask_monitoring().
13565+ *
13566+ * Interrupts are masked. Context is locked. Set is the active set.
13567+ *
13568+ * For per-thread:
13569+ * Task is not necessarily current. If not current task, then task
13570+ * is guaranteed stopped and off any cpu. Access to PMU is not guaranteed.
13571+ *
13572+ * For system-wide:
13573+ * task is always current
13574+ *
13575+ * must enable active monitoring.
13576+ */
13577+void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx)
13578+{
13579+ struct pfm_event_set *set;
13580+ unsigned int max_pmc = ctx->regs.max_pmc;
13581+ unsigned int i;
13582+
13583+ if (task != current)
13584+ return;
13585+
13586+ set = ctx->active_set;
13587+ for (i = 0; i < max_pmc; i++) {
13588+ if (test_bit(i, set->used_pmcs))
13589+ pfm_arch_write_pmc(ctx, i, set->pmcs[i]);
13590+ }
13591+}
13592+
13593+/*
13594+ * function called from pfm_switch_sets(), pfm_context_load_thread(),
13595+ * pfm_context_load_sys(), pfm_ctxsw()
13596+ * context is locked. Interrupts are masked. set cannot be NULL.
13597+ * Access to the PMU is guaranteed.
13598+ *
13599+ * function must restore all PMD registers from set.
13600+ */
13601+void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set)
13602+{
13603+ unsigned int max_pmd = ctx->regs.max_pmd;
13604+ u64 ovfl_mask = pfm_pmu_conf->ovfl_mask;
13605+ u64 *impl_pmds = ctx->regs.pmds;
13606+ unsigned int i;
13607+
13608+ /*
13609+ * must restore all pmds to avoid leaking
13610+ * information to user.
13611+ */
13612+ for (i = 0; i < max_pmd; i++) {
13613+ u64 val;
13614+
13615+ if (test_bit(i, impl_pmds) == 0)
13616+ continue;
13617+
13618+ val = set->pmds[i].value;
13619+
13620+ /*
13621+ * set upper bits for counter to ensure
13622+ * overflow will trigger
13623+ */
13624+ val &= ovfl_mask;
13625+
13626+ pfm_arch_write_pmd(ctx, i, val);
13627+ }
13628+}
13629+
13630+/*
13631+ * function called from pfm_switch_sets(), pfm_context_load_thread(),
13632+ * pfm_context_load_sys(), pfm_ctxsw().
13633+ * Context is locked. Interrupts are masked. set cannot be NULL.
13634+ * Access to the PMU is guaranteed.
13635+ *
13636+ * function must restore all PMC registers from set, if needed.
13637+ */
13638+void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set)
13639+{
13640+ unsigned int max_pmc = ctx->regs.max_pmc;
13641+ u64 *impl_pmcs = ctx->regs.pmcs;
13642+ unsigned int i;
13643+
13644+ /* If we're masked or stopped we don't need to bother restoring
13645+ * the PMCs now.
13646+ */
13647+ if (ctx->state == PFM_CTX_MASKED || ctx->flags.started == 0)
13648+ return;
13649+
13650+ /*
13651+ * restore all pmcs
13652+ */
13653+ for (i = 0; i < max_pmc; i++)
13654+ if (test_bit(i, impl_pmcs))
13655+ pfm_arch_write_pmc(ctx, i, set->pmcs[i]);
13656+}
13657+
13658+char *pfm_arch_get_pmu_module_name(void)
13659+{
13660+ return NULL;
13661+}
13662+
13663+void perfmon_interrupt(struct pt_regs *regs)
13664+{
13665+ pfm_interrupt_handler(instruction_pointer(regs), regs);
13666+}
13667+
13668+static struct pfm_regmap_desc pfm_sparc64_pmc_desc[] = {
13669+ PMC_D(PFM_REG_I, "PCR", 0, 0, 0, 0),
13670+};
13671+
13672+static struct pfm_regmap_desc pfm_sparc64_pmd_desc[] = {
13673+ PMD_D(PFM_REG_C, "PIC0", 0),
13674+ PMD_D(PFM_REG_C, "PIC1", 0),
13675+};
13676+
13677+static int pfm_sparc64_probe(void)
13678+{
13679+ return 0;
13680+}
13681+
13682+static struct pfm_pmu_config pmu_sparc64_pmu_conf = {
13683+ .counter_width = 31,
13684+ .pmd_desc = pfm_sparc64_pmd_desc,
13685+ .num_pmd_entries = 2,
13686+ .pmc_desc = pfm_sparc64_pmc_desc,
13687+ .num_pmc_entries = 1,
13688+ .probe_pmu = pfm_sparc64_probe,
13689+ .flags = PFM_PMU_BUILTIN_FLAG,
13690+ .owner = THIS_MODULE,
13691+};
13692+
13693+static unsigned long perf_hsvc_group;
13694+static unsigned long perf_hsvc_major;
13695+static unsigned long perf_hsvc_minor;
13696+
13697+static int __init register_perf_hsvc(void)
13698+{
13699+ if (tlb_type == hypervisor) {
13700+ switch (sun4v_chip_type) {
13701+ case SUN4V_CHIP_NIAGARA1:
13702+ perf_hsvc_group = HV_GRP_N2_CPU;
13703+ break;
13704+
13705+ case SUN4V_CHIP_NIAGARA2:
13706+ perf_hsvc_group = HV_GRP_N2_CPU;
13707+ break;
13708+
13709+ default:
13710+ return -ENODEV;
13711+ }
13712+
13713+
13714+ perf_hsvc_major = 1;
13715+ perf_hsvc_minor = 0;
13716+ if (sun4v_hvapi_register(perf_hsvc_group,
13717+ perf_hsvc_major,
13718+ &perf_hsvc_minor)) {
13719+			printk(KERN_ERR "perfmon: Could not register N2 hvapi.\n");
13720+ return -ENODEV;
13721+ }
13722+ }
13723+ return 0;
13724+}
13725+
13726+static void unregister_perf_hsvc(void)
13727+{
13728+ if (tlb_type != hypervisor)
13729+ return;
13730+ sun4v_hvapi_unregister(perf_hsvc_group);
13731+}
13732+
13733+static int __init pfm_sparc64_pmu_init(void)
13734+{
13735+ u64 mask;
13736+ int err;
13737+
13738+ err = register_perf_hsvc();
13739+ if (err)
13740+ return err;
13741+
13742+ if (tlb_type == hypervisor &&
13743+ sun4v_chip_type == SUN4V_CHIP_NIAGARA2)
13744+ pcr_ops = &n2_pcr_ops;
13745+ else
13746+ pcr_ops = &direct_pcr_ops;
13747+
13748+ if (!strcmp(sparc_pmu_type, "ultra12"))
13749+ mask = (0xf << 11) | (0xf << 4) | 0x7;
13750+ else if (!strcmp(sparc_pmu_type, "ultra3") ||
13751+ !strcmp(sparc_pmu_type, "ultra3i") ||
13752+ !strcmp(sparc_pmu_type, "ultra3+") ||
13753+ !strcmp(sparc_pmu_type, "ultra4+"))
13754+ mask = (0x3f << 11) | (0x3f << 4) | 0x7;
13755+ else if (!strcmp(sparc_pmu_type, "niagara2"))
13756+ mask = ((1UL << 63) | (1UL << 62) |
13757+ (1UL << 31) | (0xfUL << 27) | (0xffUL << 19) |
13758+ (1UL << 18) | (0xfUL << 14) | (0xff << 6) |
13759+ (0x3UL << 4) | 0x7UL);
13760+ else if (!strcmp(sparc_pmu_type, "niagara"))
13761+ mask = ((1UL << 9) | (1UL << 8) |
13762+ (0x7UL << 4) | 0x7UL);
13763+ else {
13764+ err = -ENODEV;
13765+ goto out_err;
13766+ }
13767+
13768+ pmu_sparc64_pmu_conf.pmu_name = sparc_pmu_type;
13769+ pfm_sparc64_pmc_desc[0].rsvd_msk = ~mask;
13770+
13771+ return pfm_pmu_register(&pmu_sparc64_pmu_conf);
13772+
13773+out_err:
13774+ unregister_perf_hsvc();
13775+ return err;
13776+}
13777+
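As a quick sanity check on the masks computed above, here is the ultra12 branch worked out numerically; the other sparc_pmu_type branches follow the same pattern. This is an illustration only, not part of the patch: it merely reproduces the arithmetic whose complement ends up in pfm_sparc64_pmc_desc[0].rsvd_msk.

#include <stdio.h>

int main(void)
{
	/* ultra12 branch above: PCR bits 14:11, 7:4 and 2:0 are left writable */
	unsigned long long mask = (0xfULL << 11) | (0xfULL << 4) | 0x7ULL;

	printf("mask     = 0x%llx\n", mask);   /* 0x78f7 */
	printf("rsvd_msk = 0x%llx\n", ~mask);  /* every other PCR bit is treated as reserved */
	return 0;
}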
13778+static void __exit pfm_sparc64_pmu_exit(void)
13779+{
13780+ unregister_perf_hsvc();
13781+ return pfm_pmu_unregister(&pmu_sparc64_pmu_conf);
13782+}
13783+
13784+module_init(pfm_sparc64_pmu_init);
13785+module_exit(pfm_sparc64_pmu_exit);
13786--- a/arch/x86/Kconfig
13787+++ b/arch/x86/Kconfig
13788@@ -1448,6 +1448,8 @@ config COMPAT_VDSO
13789
13790 If unsure, say Y.
13791
13792+source "arch/x86/perfmon/Kconfig"
13793+
13794 endmenu
13795
13796 config ARCH_ENABLE_MEMORY_HOTPLUG
13797--- a/arch/x86/Makefile
13798+++ b/arch/x86/Makefile
13799@@ -152,6 +152,8 @@ core-$(CONFIG_LGUEST_GUEST) += arch/x86/
13800 core-y += arch/x86/kernel/
13801 core-y += arch/x86/mm/
13802
13803+core-$(CONFIG_PERFMON) += arch/x86/perfmon/
13804+
13805 # Remaining sub architecture files
13806 core-y += $(mcore-y)
13807
13808--- a/arch/x86/ia32/ia32entry.S
13809+++ b/arch/x86/ia32/ia32entry.S
13810@@ -834,4 +834,16 @@ ia32_sys_call_table:
13811 .quad sys_dup3 /* 330 */
13812 .quad sys_pipe2
13813 .quad sys_inotify_init1
13814+ .quad sys_pfm_create_context
13815+ .quad sys_pfm_write_pmcs
13816+ .quad sys_pfm_write_pmds /* 335 */
13817+ .quad sys_pfm_read_pmds
13818+ .quad sys_pfm_load_context
13819+ .quad sys_pfm_start
13820+ .quad sys_pfm_stop
13821+ .quad sys_pfm_restart /* 340 */
13822+ .quad sys_pfm_create_evtsets
13823+ .quad sys_pfm_getinfo_evtsets
13824+ .quad sys_pfm_delete_evtsets
13825+ .quad sys_pfm_unload_context
13826 ia32_syscall_end:
13827--- a/arch/x86/kernel/apic_32.c
13828+++ b/arch/x86/kernel/apic_32.c
13829@@ -28,6 +28,7 @@
13830 #include <linux/acpi_pmtmr.h>
13831 #include <linux/module.h>
13832 #include <linux/dmi.h>
13833+#include <linux/perfmon_kern.h>
13834
13835 #include <asm/atomic.h>
13836 #include <asm/smp.h>
13837@@ -697,6 +698,7 @@ u8 setup_APIC_eilvt_ibs(u8 vector, u8 ms
13838 setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask);
13839 return APIC_EILVT_LVTOFF_IBS;
13840 }
13841+EXPORT_SYMBOL(setup_APIC_eilvt_ibs);
13842
13843 /*
13844 * Local APIC start and shutdown
13845@@ -1397,6 +1399,9 @@ void __init apic_intr_init(void)
13846 #ifdef CONFIG_X86_MCE_P4THERMAL
13847 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
13848 #endif
13849+#ifdef CONFIG_PERFMON
13850+ set_intr_gate(LOCAL_PERFMON_VECTOR, pmu_interrupt);
13851+#endif
13852 }
13853
13854 /**
13855--- a/arch/x86/kernel/apic_64.c
13856+++ b/arch/x86/kernel/apic_64.c
13857@@ -299,6 +299,7 @@ u8 setup_APIC_eilvt_ibs(u8 vector, u8 ms
13858 setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask);
13859 return APIC_EILVT_LVTOFF_IBS;
13860 }
13861+EXPORT_SYMBOL(setup_APIC_eilvt_ibs);
13862
13863 /*
13864 * Program the next event, relative to now
13865--- a/arch/x86/kernel/cpu/common.c
13866+++ b/arch/x86/kernel/cpu/common.c
13867@@ -5,6 +5,7 @@
13868 #include <linux/module.h>
13869 #include <linux/percpu.h>
13870 #include <linux/bootmem.h>
13871+#include <linux/perfmon_kern.h>
13872 #include <asm/processor.h>
13873 #include <asm/i387.h>
13874 #include <asm/msr.h>
13875@@ -728,6 +729,8 @@ void __cpuinit cpu_init(void)
13876 current_thread_info()->status = 0;
13877 clear_used_math();
13878 mxcsr_feature_mask_init();
13879+
13880+ pfm_init_percpu();
13881 }
13882
13883 #ifdef CONFIG_HOTPLUG_CPU
13884--- a/arch/x86/kernel/entry_32.S
13885+++ b/arch/x86/kernel/entry_32.S
13886@@ -513,7 +513,7 @@ ENDPROC(system_call)
13887 ALIGN
13888 RING0_PTREGS_FRAME # can't unwind into user space anyway
13889 work_pending:
13890- testb $_TIF_NEED_RESCHED, %cl
13891+ testw $(_TIF_NEED_RESCHED|_TIF_PERFMON_WORK), %cx
13892 jz work_notifysig
13893 work_resched:
13894 call schedule
13895--- a/arch/x86/kernel/entry_64.S
13896+++ b/arch/x86/kernel/entry_64.S
13897@@ -890,7 +890,13 @@ END(error_interrupt)
13898 ENTRY(spurious_interrupt)
13899 apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt
13900 END(spurious_interrupt)
13901-
13902+
13903+#ifdef CONFIG_PERFMON
13904+ENTRY(pmu_interrupt)
13905+ apicinterrupt LOCAL_PERFMON_VECTOR,smp_pmu_interrupt
13906+END(pmu_interrupt)
13907+#endif
13908+
13909 /*
13910 * Exception entry points.
13911 */
13912--- a/arch/x86/kernel/irqinit_64.c
13913+++ b/arch/x86/kernel/irqinit_64.c
13914@@ -11,6 +11,7 @@
13915 #include <linux/kernel_stat.h>
13916 #include <linux/sysdev.h>
13917 #include <linux/bitops.h>
13918+#include <linux/perfmon_kern.h>
13919
13920 #include <asm/acpi.h>
13921 #include <asm/atomic.h>
13922@@ -217,6 +218,10 @@ void __init native_init_IRQ(void)
13923 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
13924 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
13925
13926+#ifdef CONFIG_PERFMON
13927+ alloc_intr_gate(LOCAL_PERFMON_VECTOR, pmu_interrupt);
13928+#endif
13929+
13930 if (!acpi_ioapic)
13931 setup_irq(2, &irq2);
13932 }
13933--- a/arch/x86/kernel/process_32.c
13934+++ b/arch/x86/kernel/process_32.c
13935@@ -36,6 +36,7 @@
13936 #include <linux/personality.h>
13937 #include <linux/tick.h>
13938 #include <linux/percpu.h>
13939+#include <linux/perfmon_kern.h>
13940 #include <linux/prctl.h>
13941
13942 #include <asm/uaccess.h>
13943@@ -277,6 +278,7 @@ void exit_thread(void)
13944 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
13945 put_cpu();
13946 }
13947+ pfm_exit_thread();
13948 }
13949
13950 void flush_thread(void)
13951@@ -334,6 +336,8 @@ int copy_thread(int nr, unsigned long cl
13952
13953 savesegment(gs, p->thread.gs);
13954
13955+ pfm_copy_thread(p);
13956+
13957 tsk = current;
13958 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
13959 p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
13960@@ -450,6 +454,9 @@ __switch_to_xtra(struct task_struct *pre
13961 prev = &prev_p->thread;
13962 next = &next_p->thread;
13963
13964+ if (test_tsk_thread_flag(prev_p, TIF_PERFMON_CTXSW))
13965+ pfm_ctxsw_out(prev_p, next_p);
13966+
13967 debugctl = prev->debugctlmsr;
13968 if (next->ds_area_msr != prev->ds_area_msr) {
13969 /* we clear debugctl to make sure DS
13970@@ -462,6 +469,9 @@ __switch_to_xtra(struct task_struct *pre
13971 if (next->debugctlmsr != debugctl)
13972 update_debugctlmsr(next->debugctlmsr);
13973
13974+ if (test_tsk_thread_flag(next_p, TIF_PERFMON_CTXSW))
13975+ pfm_ctxsw_in(prev_p, next_p);
13976+
13977 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
13978 set_debugreg(next->debugreg0, 0);
13979 set_debugreg(next->debugreg1, 1);
13980--- a/arch/x86/kernel/process_64.c
13981+++ b/arch/x86/kernel/process_64.c
13982@@ -36,6 +36,7 @@
13983 #include <linux/kprobes.h>
13984 #include <linux/kdebug.h>
13985 #include <linux/tick.h>
13986+#include <linux/perfmon_kern.h>
13987 #include <linux/prctl.h>
13988
13989 #include <asm/uaccess.h>
13990@@ -240,6 +241,7 @@ void exit_thread(void)
13991 t->io_bitmap_max = 0;
13992 put_cpu();
13993 }
13994+ pfm_exit_thread();
13995 }
13996
13997 void flush_thread(void)
13998@@ -344,6 +346,8 @@ int copy_thread(int nr, unsigned long cl
13999 savesegment(es, p->thread.es);
14000 savesegment(ds, p->thread.ds);
14001
14002+ pfm_copy_thread(p);
14003+
14004 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
14005 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
14006 if (!p->thread.io_bitmap_ptr) {
14007@@ -474,6 +478,9 @@ static inline void __switch_to_xtra(stru
14008 prev = &prev_p->thread,
14009 next = &next_p->thread;
14010
14011+ if (test_tsk_thread_flag(prev_p, TIF_PERFMON_CTXSW))
14012+ pfm_ctxsw_out(prev_p, next_p);
14013+
14014 debugctl = prev->debugctlmsr;
14015 if (next->ds_area_msr != prev->ds_area_msr) {
14016 /* we clear debugctl to make sure DS
14017@@ -486,6 +493,9 @@ static inline void __switch_to_xtra(stru
14018 if (next->debugctlmsr != debugctl)
14019 update_debugctlmsr(next->debugctlmsr);
14020
14021+ if (test_tsk_thread_flag(next_p, TIF_PERFMON_CTXSW))
14022+ pfm_ctxsw_in(prev_p, next_p);
14023+
14024 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
14025 loaddebug(next, 0);
14026 loaddebug(next, 1);
14027--- a/arch/x86/kernel/signal_32.c
14028+++ b/arch/x86/kernel/signal_32.c
14029@@ -19,6 +19,7 @@
14030 #include <linux/wait.h>
14031 #include <linux/tracehook.h>
14032 #include <linux/elf.h>
14033+#include <linux/perfmon_kern.h>
14034 #include <linux/smp.h>
14035 #include <linux/mm.h>
14036
14037@@ -664,6 +665,10 @@ static void do_signal(struct pt_regs *re
14038 void
14039 do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
14040 {
14041+ /* process perfmon asynchronous work (e.g. block thread or reset) */
14042+ if (thread_info_flags & _TIF_PERFMON_WORK)
14043+ pfm_handle_work(regs);
14044+
14045 /* deal with pending signal delivery */
14046 if (thread_info_flags & _TIF_SIGPENDING)
14047 do_signal(regs);
14048--- a/arch/x86/kernel/signal_64.c
14049+++ b/arch/x86/kernel/signal_64.c
14050@@ -20,6 +20,7 @@
14051 #include <linux/stddef.h>
14052 #include <linux/personality.h>
14053 #include <linux/compiler.h>
14054+#include <linux/perfmon_kern.h>
14055 #include <asm/processor.h>
14056 #include <asm/ucontext.h>
14057 #include <asm/uaccess.h>
14058@@ -528,12 +529,17 @@ static void do_signal(struct pt_regs *re
14059 void do_notify_resume(struct pt_regs *regs, void *unused,
14060 __u32 thread_info_flags)
14061 {
14062+
14063 #ifdef CONFIG_X86_MCE
14064 /* notify userspace of pending MCEs */
14065 if (thread_info_flags & _TIF_MCE_NOTIFY)
14066 mce_notify_user();
14067 #endif /* CONFIG_X86_MCE */
14068
14069+ /* process perfmon asynchronous work (e.g. block thread or reset) */
14070+ if (thread_info_flags & _TIF_PERFMON_WORK)
14071+ pfm_handle_work(regs);
14072+
14073 /* deal with pending signal delivery */
14074 if (thread_info_flags & _TIF_SIGPENDING)
14075 do_signal(regs);
14076--- a/arch/x86/kernel/smpboot.c
14077+++ b/arch/x86/kernel/smpboot.c
14078@@ -42,6 +42,7 @@
14079 #include <linux/init.h>
14080 #include <linux/smp.h>
14081 #include <linux/module.h>
14082+#include <linux/perfmon_kern.h>
14083 #include <linux/sched.h>
14084 #include <linux/percpu.h>
14085 #include <linux/bootmem.h>
14086@@ -1377,6 +1378,7 @@ int __cpu_disable(void)
14087 remove_cpu_from_maps(cpu);
14088 unlock_vector_lock();
14089 fixup_irqs(cpu_online_map);
14090+ pfm_cpu_disable();
14091 return 0;
14092 }
14093
14094--- a/arch/x86/kernel/syscall_table_32.S
14095+++ b/arch/x86/kernel/syscall_table_32.S
14096@@ -332,3 +332,15 @@ ENTRY(sys_call_table)
14097 .long sys_dup3 /* 330 */
14098 .long sys_pipe2
14099 .long sys_inotify_init1
14100+ .long sys_pfm_create_context
14101+ .long sys_pfm_write_pmcs
14102+ .long sys_pfm_write_pmds /* 335 */
14103+ .long sys_pfm_read_pmds
14104+ .long sys_pfm_load_context
14105+ .long sys_pfm_start
14106+ .long sys_pfm_stop
14107+ .long sys_pfm_restart /* 340 */
14108+ .long sys_pfm_create_evtsets
14109+ .long sys_pfm_getinfo_evtsets
14110+ .long sys_pfm_delete_evtsets
14111+ .long sys_pfm_unload_context
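Given the layout above (sys_dup3 is 330, so sys_inotify_init1 is 332), the new entries start at 333 with sys_pfm_create_context, which also matches the /* 335 */ comment on sys_pfm_write_pmds. The sketch below is a hedged userspace probe, not part of the patch: the number 333 is specific to this patched 32-bit kernel, and the NULL arguments are only meant to provoke an error return rather than create a real context.

#include <stdio.h>
#include <errno.h>
#include <unistd.h>
#include <sys/syscall.h>

#define __NR_pfm_create_context 333   /* derived from the table above; patch-specific */

int main(void)
{
	long ret = syscall(__NR_pfm_create_context, NULL, NULL, NULL, (size_t)0);

	if (ret == -1 && errno == ENOSYS)
		printf("perfmon2 syscalls are not wired up in this kernel\n");
	else
		printf("sys_pfm_create_context returned %ld (errno=%d)\n", ret, errno);
	return 0;
}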
14112--- a/arch/x86/oprofile/nmi_int.c
14113+++ b/arch/x86/oprofile/nmi_int.c
14114@@ -16,6 +16,7 @@
14115 #include <linux/moduleparam.h>
14116 #include <linux/kdebug.h>
14117 #include <linux/cpu.h>
14118+#include <linux/perfmon_kern.h>
14119 #include <asm/nmi.h>
14120 #include <asm/msr.h>
14121 #include <asm/apic.h>
14122@@ -217,12 +218,18 @@ static int nmi_setup(void)
14123 int err = 0;
14124 int cpu;
14125
14126- if (!allocate_msrs())
14127+ if (pfm_session_allcpus_acquire())
14128+ return -EBUSY;
14129+
14130+ if (!allocate_msrs()) {
14131+ pfm_session_allcpus_release();
14132 return -ENOMEM;
14133+ }
14134
14135 err = register_die_notifier(&profile_exceptions_nb);
14136 if (err) {
14137 free_msrs();
14138+ pfm_session_allcpus_release();
14139 return err;
14140 }
14141
14142@@ -304,6 +311,7 @@ static void nmi_shutdown(void)
14143 model->shutdown(msrs);
14144 free_msrs();
14145 put_cpu_var(cpu_msrs);
14146+ pfm_session_allcpus_release();
14147 }
14148
14149 static void nmi_cpu_start(void *dummy)
14150--- /dev/null
14151+++ b/arch/x86/perfmon/Kconfig
14152@@ -0,0 +1,89 @@
14153+menu "Hardware Performance Monitoring support"
14154+config PERFMON
14155+ bool "Perfmon2 performance monitoring interface"
14156+ select X86_LOCAL_APIC
14157+ default n
14158+ help
14159+ Enables the perfmon2 interface to access the hardware
14160+ performance counters. See <http://perfmon2.sf.net/> for
14161+ more details.
14162+
14163+config PERFMON_DEBUG
14164+ bool "Perfmon debugging"
14165+ default n
14166+ depends on PERFMON
14167+ help
14168+ Enables perfmon debugging support
14169+
14170+config PERFMON_DEBUG_FS
14171+ bool "Enable perfmon statistics reporting via debugfs"
14172+ default y
14173+ depends on PERFMON && DEBUG_FS
14174+ help
14175+ Enable collection and reporting of perfmon timing statistics under
14176+ debugfs. This is used for debugging and performance analysis of the
14177+	  subsystem. The debugfs filesystem must be mounted.
14178+
14179+config X86_PERFMON_P6
14180+ tristate "Support for Intel P6/Pentium M processor hardware performance counters"
14181+ depends on PERFMON && X86_32
14182+ default n
14183+ help
14184+ Enables support for Intel P6-style hardware performance counters.
14185+	  To be used with Intel Pentium III, Pentium Pro, and Pentium M processors.
14186+
14187+config X86_PERFMON_P4
14188+ tristate "Support for Intel Pentium 4/Xeon hardware performance counters"
14189+ depends on PERFMON
14190+ default n
14191+ help
14192+ Enables support for Intel Pentium 4/Xeon (Netburst) hardware performance
14193+ counters.
14194+
14195+config X86_PERFMON_PEBS_P4
14196+ tristate "Support for Intel Netburst Precise Event-Based Sampling (PEBS)"
14197+ depends on PERFMON && X86_PERFMON_P4
14198+ default n
14199+ help
14200+ Enables support for Precise Event-Based Sampling (PEBS) on the Intel
14201+ Netburst processors such as Pentium 4, Xeon which support it.
14202+	  Netburst processors, such as the Pentium 4 and Xeon, that support it.
14203+config X86_PERFMON_CORE
14204+ tristate "Support for Intel Core-based performance counters"
14205+ depends on PERFMON
14206+ default n
14207+ help
14208+ Enables support for Intel Core-based performance counters. Enable
14209+ this option to support Intel Core 2 processors.
14210+
14211+config X86_PERFMON_PEBS_CORE
14212+ tristate "Support for Intel Core Precise Event-Based Sampling (PEBS)"
14213+ depends on PERFMON && X86_PERFMON_CORE
14214+ default n
14215+ help
14216+ Enables support for Precise Event-Based Sampling (PEBS) on the Intel
14217+ Core processors.
14218+
14219+config X86_PERFMON_INTEL_ATOM
14220+ tristate "Support for Intel Atom processor"
14221+ depends on PERFMON
14222+ default n
14223+ help
14224+ Enables support for Intel Atom processors.
14225+
14226+config X86_PERFMON_INTEL_ARCH
14227+ tristate "Support for Intel architectural perfmon v1/v2"
14228+ depends on PERFMON
14229+ default n
14230+ help
14231+ Enables support for Intel architectural performance counters.
14232+ This feature was introduced with Intel Core Solo/Core Duo processors.
14233+
14234+config X86_PERFMON_AMD64
14235+ tristate "Support AMD Athlon64/Opteron64 hardware performance counters"
14236+ depends on PERFMON
14237+ default n
14238+ help
14239+	  Enables support for Athlon64/Opteron64 hardware performance counters.
14240+	  Supports family 6, 15 and 16 (10h) processors.
14241+endmenu
14242--- /dev/null
14243+++ b/arch/x86/perfmon/Makefile
14244@@ -0,0 +1,13 @@
14245+#
14246+# Copyright (c) 2005-2007 Hewlett-Packard Development Company, L.P.
14247+# Contributed by Stephane Eranian <eranian@hpl.hp.com>
14248+#
14249+obj-$(CONFIG_PERFMON) += perfmon.o
14250+obj-$(CONFIG_X86_PERFMON_P6) += perfmon_p6.o
14251+obj-$(CONFIG_X86_PERFMON_P4) += perfmon_p4.o
14252+obj-$(CONFIG_X86_PERFMON_CORE) += perfmon_intel_core.o
14253+obj-$(CONFIG_X86_PERFMON_INTEL_ARCH) += perfmon_intel_arch.o
14254+obj-$(CONFIG_X86_PERFMON_PEBS_P4) += perfmon_pebs_p4_smpl.o
14255+obj-$(CONFIG_X86_PERFMON_PEBS_CORE) += perfmon_pebs_core_smpl.o
14256+obj-$(CONFIG_X86_PERFMON_AMD64) += perfmon_amd64.o
14257+obj-$(CONFIG_X86_PERFMON_INTEL_ATOM) += perfmon_intel_atom.o
14258--- /dev/null
14259+++ b/arch/x86/perfmon/perfmon.c
14260@@ -0,0 +1,761 @@
14261+/*
14262+ * This file implements the X86 specific support for the perfmon2 interface
14263+ *
14264+ * Copyright (c) 2005-2007 Hewlett-Packard Development Company, L.P.
14265+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
14266+ *
14267+ * Copyright (c) 2007 Advanced Micro Devices, Inc.
14268+ * Contributed by Robert Richter <robert.richter@amd.com>
14269+ *
14270+ * This program is free software; you can redistribute it and/or
14271+ * modify it under the terms of version 2 of the GNU General Public
14272+ * License as published by the Free Software Foundation.
14273+ *
14274+ * This program is distributed in the hope that it will be useful,
14275+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14276+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14277+ * General Public License for more details.
14278+ *
14279+ * You should have received a copy of the GNU General Public License
14280+ * along with this program; if not, write to the Free Software
14281+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
14282+ * 02111-1307 USA
14283+ */
14284+#include <linux/interrupt.h>
14285+#include <linux/perfmon_kern.h>
14286+#include <linux/kprobes.h>
14287+#include <linux/kdebug.h>
14288+#include <linux/nmi.h>
14289+
14290+#include <asm/apic.h>
14291+
14292+DEFINE_PER_CPU(unsigned long, real_iip);
14293+DEFINE_PER_CPU(int, pfm_using_nmi);
14294+DEFINE_PER_CPU(unsigned long, saved_lvtpc);
14295+
14296+/**
14297+ * pfm_arch_ctxswin_thread - thread context switch in
14298+ * @task: task switched in
14299+ * @ctx: context for the task
14300+ *
14301+ * Called from pfm_ctxsw(). Task is guaranteed to be current.
14302+ * set cannot be NULL. Context is locked. Interrupts are masked.
14303+ *
14304+ * Caller has already restored all PMD and PMC registers, if
14305+ * necessary (i.e., lazy restore scheme).
14306+ *
14307+ * On x86, the common code only needs to unsecure RDPMC if necessary.
14308+ *
14309+ * Model-specific features, e.g., PEBS and IBS, are taken care of in the
14310+ * corresponding PMU description module.
14311+ */
14312+void pfm_arch_ctxswin_thread(struct task_struct *task, struct pfm_context *ctx)
14313+{
14314+ struct pfm_arch_context *ctx_arch;
14315+
14316+ ctx_arch = pfm_ctx_arch(ctx);
14317+
14318+ /*
14319+ * restore saved real iip
14320+ */
14321+ if (ctx->active_set->npend_ovfls)
14322+ __get_cpu_var(real_iip) = ctx_arch->saved_real_iip;
14323+
14324+ /*
14325+ * enable RDPMC on this CPU
14326+ */
14327+ if (ctx_arch->flags.insecure)
14328+ set_in_cr4(X86_CR4_PCE);
14329+}
14330+
14331+/**
14332+ * pfm_arch_ctxswout_thread - context switch out thread
14333+ * @task: task switched out
14334+ * @ctx : context switched out
14335+ *
14336+ * Called from pfm_ctxsw(). Task is guaranteed to be current.
14337+ * Context is locked. Interrupts are masked. Monitoring may be active.
14338+ * PMU access is guaranteed. PMC and PMD registers are live in PMU.
14339+ *
14340+ * Return:
14341+ * non-zero : did not save PMDs (as part of stopping the PMU)
14342+ * 0 : saved PMDs (no need to save them in caller)
14343+ */
14344+int pfm_arch_ctxswout_thread(struct task_struct *task, struct pfm_context *ctx)
14345+{
14346+ struct pfm_arch_context *ctx_arch;
14347+ struct pfm_arch_pmu_info *pmu_info;
14348+
14349+ ctx_arch = pfm_ctx_arch(ctx);
14350+ pmu_info = pfm_pmu_info();
14351+
14352+ /*
14353+ * disable lazy restore of PMCS on ctxswin because
14354+ * we modify some of them.
14355+ */
14356+ ctx->active_set->priv_flags |= PFM_SETFL_PRIV_MOD_PMCS;
14357+
14358+ if (ctx->active_set->npend_ovfls)
14359+ ctx_arch->saved_real_iip = __get_cpu_var(real_iip);
14360+
14361+ /*
14362+ * disable RDPMC on this CPU
14363+ */
14364+ if (ctx_arch->flags.insecure)
14365+ clear_in_cr4(X86_CR4_PCE);
14366+
14367+ if (ctx->state == PFM_CTX_MASKED)
14368+ return 1;
14369+
14370+ return pmu_info->stop_save(ctx, ctx->active_set);
14371+}
14372+
14373+/**
14374+ * pfm_arch_stop - deactivate monitoring
14375+ * @task: task to stop
14376+ * @ctx: context to stop
14377+ *
14378+ * Called from pfm_stop()
14379+ * Interrupts are masked. Context is locked. Set is the active set.
14380+ *
14381+ * For per-thread:
14382+ * task is not necessarily current. If not current task, then
14383+ * task is guaranteed stopped and off any cpu. Access to PMU
14384+ * is not guaranteed.
14385+ *
14386+ * For system-wide:
14387+ * task is current
14388+ *
14389+ * must disable active monitoring. ctx cannot be NULL
14390+ */
14391+void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx)
14392+{
14393+ struct pfm_arch_pmu_info *pmu_info;
14394+
14395+ pmu_info = pfm_pmu_info();
14396+
14397+ /*
14398+ * no need to go through stop_save()
14399+ * if we are already stopped
14400+ */
14401+ if (!ctx->flags.started || ctx->state == PFM_CTX_MASKED)
14402+ return;
14403+
14404+ if (task != current)
14405+ return;
14406+
14407+ pmu_info->stop_save(ctx, ctx->active_set);
14408+}
14409+
14410+
14411+/**
14412+ * pfm_arch_start - activate monitoring
14413+ * @task: task to start
14414+ * @ctx: context to start
14415+ *
14416+ * Interrupts are masked. Context is locked.
14417+ *
14418+ * For per-thread:
14419+ * Task is not necessarily current. If not current task, then task
14420+ * is guaranteed stopped and off any cpu. There is no access to the
14421+ * PMU if the task is not current.
14422+ *
14423+ * For system-wide:
14424+ * task is always current
14425+ */
14426+void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx)
14427+{
14428+ struct pfm_event_set *set;
14429+
14430+ set = ctx->active_set;
14431+
14432+ if (task != current)
14433+ return;
14434+
14435+ /*
14436+ * cannot restore PMC if no access to PMU. Will be done
14437+ * when the thread is switched back in
14438+ */
14439+
14440+ pfm_arch_restore_pmcs(ctx, set);
14441+}
14442+
14443+/**
14444+ * pfm_arch_restore_pmds - reload PMD registers
14445+ * @ctx: context to restore from
14446+ * @set: current event set
14447+ *
14448+ * function called from pfm_switch_sets(), pfm_context_load_thread(),
14449+ * pfm_context_load_sys(), pfm_ctxsw()
14450+ *
14451+ * Context is locked. Interrupts are masked. Set cannot be NULL.
14452+ * Access to the PMU is guaranteed.
14453+ */
14454+void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set)
14455+{
14456+ struct pfm_arch_pmu_info *pmu_info;
14457+ u16 i, num;
14458+
14459+ pmu_info = pfm_pmu_info();
14460+
14461+ num = set->nused_pmds;
14462+
14463+ /*
14464+ * model-specific override
14465+ */
14466+ if (pmu_info->restore_pmds) {
14467+ pmu_info->restore_pmds(ctx, set);
14468+ return;
14469+ }
14470+
14471+ /*
14472+ * we can restore only the PMD we use because:
14473+ *
14474+	 * - pfm_read_pmds() can only read the registers declared used
14475+	 *   via pfm_write_pmds(), smpl_pmds, reset_pmds
14476+ *
14477+ * - if cr4.pce=1, only counters are exposed to user. RDPMC
14478+	 * does not work with other types of PMU registers. Thus, no
14479+ * address is ever exposed by counters
14480+ *
14481+ * - there is never a dependency between one pmd register and
14482+ * another
14483+ */
14484+ for (i = 0; num; i++) {
14485+ if (likely(test_bit(i, cast_ulp(set->used_pmds)))) {
14486+ pfm_write_pmd(ctx, i, set->pmds[i].value);
14487+ num--;
14488+ }
14489+ }
14490+}
14491+
14492+/**
14493+ * pfm_arch_restore_pmcs - reload PMC registers
14494+ * @ctx: context to restore from
14495+ * @set: current event set
14496+ *
14497+ * function called from pfm_switch_sets(), pfm_context_load_thread(),
14498+ * pfm_context_load_sys(), pfm_ctxsw().
14499+ *
14500+ * Context is locked. Interrupts are masked. set cannot be NULL.
14501+ * Access to the PMU is guaranteed.
14502+ *
14503+ * function must restore all PMC registers from set
14504+ */
14505+void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set)
14506+{
14507+ struct pfm_arch_pmu_info *pmu_info;
14508+ u64 *mask;
14509+ u16 i, num;
14510+
14511+ pmu_info = pfm_pmu_info();
14512+
14513+ /*
14514+ * we need to restore PMCs only when:
14515+ * - context is not masked
14516+ * - monitoring activated
14517+ *
14518+ * Masking monitoring after an overflow does not change the
14519+ * value of flags.started
14520+ */
14521+ if (ctx->state == PFM_CTX_MASKED || !ctx->flags.started)
14522+ return;
14523+
14524+ /*
14525+ * model-specific override
14526+ */
14527+ if (pmu_info->restore_pmcs) {
14528+ pmu_info->restore_pmcs(ctx, set);
14529+ return;
14530+ }
14531+ /*
14532+ * restore all pmcs
14533+ *
14534+ * It is not possible to restore only the pmcs we used because
14535+ * certain PMU models (e.g. Pentium 4) have dependencies. Thus
14536+	 * we do not want one application using stale PMC values coming from
14537+ * another one.
14538+ *
14539+	 * On PMU models where there are no dependencies between PMCs,
14540+ * it is possible to optimize by only restoring the registers that
14541+	 * are used, and this can be done with the model-specific override
14542+ * for this function.
14543+ *
14544+	 * The default code takes the safest approach, i.e., assume the worst
14545+ */
14546+ mask = ctx->regs.pmcs;
14547+ num = ctx->regs.num_pmcs;
14548+ for (i = 0; num; i++) {
14549+ if (test_bit(i, cast_ulp(mask))) {
14550+ pfm_arch_write_pmc(ctx, i, set->pmcs[i]);
14551+ num--;
14552+ }
14553+ }
14554+}
14555+
14556+/**
14557+ * smp_pmu_interrupt - lowest level PMU interrupt handler for X86
14558+ * @regs: machine state
14559+ *
14560+ * The PMU interrupt is handled through an interrupt gate, therefore
14561+ * the CPU automatically clears EFLAGS.IF, i.e., interrupts are masked.
14562+ *
14563+ * The perfmon interrupt handler MUST run with interrupts disabled due
14564+ * to possible race with other, higher priority interrupts, such as timer
14565+ * or IPI function calls.
14566+ *
14567+ * See description in IA-32 architecture manual, Vol 3 section 5.8.1
14568+ */
14569+void smp_pmu_interrupt(struct pt_regs *regs)
14570+{
14571+ struct pfm_arch_pmu_info *pmu_info;
14572+ struct pfm_context *ctx;
14573+ unsigned long iip;
14574+ int using_nmi;
14575+
14576+ using_nmi = __get_cpu_var(pfm_using_nmi);
14577+
14578+ ack_APIC_irq();
14579+
14580+ irq_enter();
14581+
14582+ /*
14583+ * when using NMI, pfm_handle_nmi() gets called
14584+	 * first. It stops monitoring and records the
14585+	 * iip into real_iip, then it reposts the interrupt
14586+	 * using the lower-priority vector LOCAL_PERFMON_VECTOR.
14587+ *
14588+ * On some processors, e.g., P4, it may be that some
14589+ * state is already recorded from pfm_handle_nmi()
14590+ * and it only needs to be copied back into the normal
14591+ * fields so it can be used transparently by higher level
14592+ * code.
14593+ */
14594+ if (using_nmi) {
14595+ ctx = __get_cpu_var(pmu_ctx);
14596+ pmu_info = pfm_pmu_info();
14597+ iip = __get_cpu_var(real_iip);
14598+ if (ctx && pmu_info->nmi_copy_state)
14599+ pmu_info->nmi_copy_state(ctx);
14600+ } else
14601+ iip = instruction_pointer(regs);
14602+
14603+ pfm_interrupt_handler(iip, regs);
14604+
14605+ /*
14606+ * On Intel P6, Pentium M, P4, Intel Core:
14607+ * - it is necessary to clear the MASK field for the LVTPC
14608+ * vector. Otherwise interrupts remain masked. See
14609+ * section 8.5.1
14610+ * AMD X86-64:
14611+ * - the documentation does not stipulate the behavior.
14612+ * To be safe, we also rewrite the vector to clear the
14613+ * mask field
14614+ */
14615+ if (!using_nmi && current_cpu_data.x86_vendor == X86_VENDOR_INTEL)
14616+ apic_write(APIC_LVTPC, LOCAL_PERFMON_VECTOR);
14617+
14618+ irq_exit();
14619+}
14620+
14621+/**
14622+ * pfm_handle_nmi - PMU NMI handler notifier callback
14623+ * @nb: notifier block
14624+ * @val: type of die notifier
14625+ * @data: die notifier-specific data
14626+ *
14627+ * called from the notify_die() notifier from a trap handler path. We only
14628+ * care about NMI-related callbacks and ignore everything else.
14629+ *
14630+ * Cannot grab any locks, including the perfmon context lock
14631+ *
14632+ * Must detect if NMI interrupt comes from perfmon, and if so it must
14633+ * stop the PMU and repost a lower-priority interrupt. The perfmon interrupt
14634+ * handler needs to grab the context lock, thus it cannot be run directly
14635+ * from the NMI interrupt call path.
14636+ */
14637+static int __kprobes pfm_handle_nmi(struct notifier_block *nb,
14638+ unsigned long val,
14639+ void *data)
14640+{
14641+ struct die_args *args = data;
14642+ struct pfm_context *ctx;
14643+ struct pfm_arch_pmu_info *pmu_info;
14644+
14645+ /*
14646+ * only NMI related calls
14647+ */
14648+ if (val != DIE_NMI_IPI)
14649+ return NOTIFY_DONE;
14650+
14651+ /*
14652+ * perfmon not using NMI
14653+ */
14654+ if (!__get_cpu_var(pfm_using_nmi))
14655+ return NOTIFY_DONE;
14656+
14657+ /*
14658+ * No context
14659+ */
14660+ ctx = __get_cpu_var(pmu_ctx);
14661+ if (!ctx) {
14662+ PFM_DBG_ovfl("no ctx");
14663+ return NOTIFY_DONE;
14664+ }
14665+
14666+ /*
14667+ * Detect if we have overflows, i.e., NMI interrupt
14668+ * caused by PMU
14669+ */
14670+ pmu_info = pfm_pmu_conf->pmu_info;
14671+ if (!pmu_info->has_ovfls(ctx)) {
14672+ PFM_DBG_ovfl("no ovfl");
14673+ return NOTIFY_DONE;
14674+ }
14675+
14676+ /*
14677+ * we stop the PMU to avoid further overflow before this
14678+ * one is treated by lower priority interrupt handler
14679+ */
14680+ pmu_info->quiesce();
14681+
14682+ /*
14683+ * record actual instruction pointer
14684+ */
14685+ __get_cpu_var(real_iip) = instruction_pointer(args->regs);
14686+
14687+ /*
14688+ * post lower priority interrupt (LOCAL_PERFMON_VECTOR)
14689+ */
14690+ pfm_arch_resend_irq(ctx);
14691+
14692+ pfm_stats_inc(ovfl_intr_nmi_count);
14693+
14694+ /*
14695+ * we need to rewrite the APIC vector on Intel
14696+ */
14697+ if (current_cpu_data.x86_vendor == X86_VENDOR_INTEL)
14698+ apic_write(APIC_LVTPC, APIC_DM_NMI);
14699+
14700+ /*
14701+ * the notification was for us
14702+ */
14703+ return NOTIFY_STOP;
14704+}
14705+
14706+static struct notifier_block pfm_nmi_nb = {
14707+ .notifier_call = pfm_handle_nmi
14708+};
14709+
14710+/**
14711+ * pfm_arch_get_pmu_module_name - get PMU description module name for autoload
14712+ *
14713+ * called from pfm_pmu_request_module
14714+ */
14715+char *pfm_arch_get_pmu_module_name(void)
14716+{
14717+ switch (current_cpu_data.x86) {
14718+ case 6:
14719+ switch (current_cpu_data.x86_model) {
14720+ case 3: /* Pentium II */
14721+ case 7 ... 11:
14722+ case 13:
14723+ return "perfmon_p6";
14724+ case 15: /* Merom */
14725+ case 23: /* Penryn */
14726+ return "perfmon_intel_core";
14727+ case 28: /* Atom/Silverthorne */
14728+ return "perfmon_intel_atom";
14729+ case 29: /* Dunnington */
14730+ return "perfmon_intel_core";
14731+ default:
14732+ goto try_arch;
14733+ }
14734+ case 15:
14735+ case 16:
14736+ /* All Opteron processors */
14737+ if (current_cpu_data.x86_vendor == X86_VENDOR_AMD)
14738+ return "perfmon_amd64";
14739+
14740+ switch (current_cpu_data.x86_model) {
14741+ case 0 ... 6:
14742+ return "perfmon_p4";
14743+ }
14744+ /* FALL THROUGH */
14745+ default:
14746+try_arch:
14747+ if (boot_cpu_has(X86_FEATURE_ARCH_PERFMON))
14748+ return "perfmon_intel_arch";
14749+ return NULL;
14750+ }
14751+ return NULL;
14752+}
14753+
14754+/**
14755+ * pfm_arch_resend_irq - post perfmon interrupt on regular vector
14756+ *
14757+ * called from pfm_ctxswin_thread() and pfm_handle_nmi()
14758+ */
14759+void pfm_arch_resend_irq(struct pfm_context *ctx)
14760+{
14761+ unsigned long val, dest;
14762+ /*
14763+ * we cannot use hw_resend_irq() because it goes to
14764+ * the I/O APIC. We need to go to the Local APIC.
14765+ *
14766+ * The "int vec" is not the right solution either
14767+ * because it triggers a software intr. We need
14768+ * to regenerate the interrupt and have it pended
14769+ * until we unmask interrupts.
14770+ *
14771+ * Instead we send ourself an IPI on the perfmon
14772+ * vector.
14773+ */
14774+ val = APIC_DEST_SELF|APIC_INT_ASSERT|
14775+ APIC_DM_FIXED|LOCAL_PERFMON_VECTOR;
14776+
14777+ dest = apic_read(APIC_ID);
14778+ apic_write(APIC_ICR2, dest);
14779+ apic_write(APIC_ICR, val);
14780+}
14781+
14782+/**
14783+ * pfm_arch_pmu_acquire_percpu - setup APIC per CPU
14784+ * @data: contains pmu flags
14785+ */
14786+static void pfm_arch_pmu_acquire_percpu(void *data)
14787+{
14788+
14789+ struct pfm_arch_pmu_info *pmu_info;
14790+ unsigned int tmp, vec;
14791+ unsigned long flags = (unsigned long)data;
14792+ unsigned long lvtpc;
14793+
14794+ pmu_info = pfm_pmu_conf->pmu_info;
14795+
14796+ /*
14797+ * we only reprogram the LVTPC vector if we have detected
14798+ * no sharing, otherwise it means the APIC is already programmed
14799+ * and we use whatever vector (likely NMI) is there
14800+ */
14801+ if (!(flags & PFM_X86_FL_SHARING)) {
14802+ if (flags & PFM_X86_FL_USE_NMI)
14803+ vec = APIC_DM_NMI;
14804+ else
14805+ vec = LOCAL_PERFMON_VECTOR;
14806+
14807+ tmp = apic_read(APIC_LVTERR);
14808+ apic_write(APIC_LVTERR, tmp | APIC_LVT_MASKED);
14809+ apic_write(APIC_LVTPC, vec);
14810+ apic_write(APIC_LVTERR, tmp);
14811+ }
14812+ lvtpc = (unsigned long)apic_read(APIC_LVTPC);
14813+
14814+ __get_cpu_var(pfm_using_nmi) = lvtpc == APIC_DM_NMI;
14815+
14816+	PFM_DBG("LVTPC=0x%lx using_nmi=%d", lvtpc, __get_cpu_var(pfm_using_nmi));
14817+
14818+ /*
14819+ * invoke model specific acquire routine. May be used for
14820+ * model-specific initializations
14821+ */
14822+ if (pmu_info->acquire_pmu_percpu)
14823+ pmu_info->acquire_pmu_percpu();
14824+}
14825+
14826+/**
14827+ * pfm_arch_pmu_acquire - acquire PMU resource from system
14828+ * @unavail_pmcs: bitmask to use to set unavailable pmcs
14829+ * @unavail_pmds: bitmask to use to set unavailable pmds
14830+ *
14831+ * interrupts are not masked
14832+ *
14833+ * Grab PMU registers from lower level MSR allocator
14834+ *
14835+ * Program the APIC according the possible interrupt vector
14836+ * either LOCAL_PERFMON_VECTOR or NMI
14837+ */
14838+int pfm_arch_pmu_acquire(u64 *unavail_pmcs, u64 *unavail_pmds)
14839+{
14840+ struct pfm_arch_pmu_info *pmu_info;
14841+ struct pfm_regmap_desc *d;
14842+ u16 i, nlost;
14843+
14844+ pmu_info = pfm_pmu_conf->pmu_info;
14845+ pmu_info->flags &= ~PFM_X86_FL_SHARING;
14846+
14847+ nlost = 0;
14848+
14849+ d = pfm_pmu_conf->pmc_desc;
14850+ for (i = 0; i < pfm_pmu_conf->num_pmc_entries; i++, d++) {
14851+ if (!(d->type & PFM_REG_I))
14852+ continue;
14853+
14854+ if (d->type & PFM_REG_V)
14855+ continue;
14856+ /*
14857+ * reserve register with lower-level allocator
14858+ */
14859+ if (!reserve_evntsel_nmi(d->hw_addr)) {
14860+ PFM_DBG("pmc%d(%s) already used", i, d->desc);
14861+ __set_bit(i, cast_ulp(unavail_pmcs));
14862+ nlost++;
14863+ continue;
14864+ }
14865+ }
14866+ PFM_DBG("nlost=%d info_flags=0x%x\n", nlost, pmu_info->flags);
14867+ /*
14868+ * some PMU models (e.g., P6) do not support sharing
14869+ * so check if we found less than the expected number of PMC registers
14870+ */
14871+ if (nlost) {
14872+ if (pmu_info->flags & PFM_X86_FL_NO_SHARING) {
14873+ PFM_INFO("PMU already used by another subsystem, "
14874+ "PMU does not support sharing, "
14875+ "try disabling Oprofile or "
14876+ "reboot with nmi_watchdog=0");
14877+ goto undo;
14878+ }
14879+ pmu_info->flags |= PFM_X86_FL_SHARING;
14880+ }
14881+
14882+ d = pfm_pmu_conf->pmd_desc;
14883+ for (i = 0; i < pfm_pmu_conf->num_pmd_entries; i++, d++) {
14884+ if (!(d->type & PFM_REG_I))
14885+ continue;
14886+
14887+ if (d->type & PFM_REG_V)
14888+ continue;
14889+
14890+ if (!reserve_perfctr_nmi(d->hw_addr)) {
14891+ PFM_DBG("pmd%d(%s) already used", i, d->desc);
14892+ __set_bit(i, cast_ulp(unavail_pmds));
14893+ }
14894+ }
14895+ /*
14896+ * program APIC on each CPU
14897+ */
14898+ on_each_cpu(pfm_arch_pmu_acquire_percpu,
14899+ (void *)(unsigned long)pmu_info->flags , 1);
14900+
14901+ return 0;
14902+undo:
14903+ /*
14904+ * must undo reservation of pmcs in case of error
14905+ */
14906+ d = pfm_pmu_conf->pmc_desc;
14907+ for (i = 0; i < pfm_pmu_conf->num_pmc_entries; i++, d++) {
14908+ if (!(d->type & (PFM_REG_I|PFM_REG_V)))
14909+ continue;
14910+ if (!test_bit(i, cast_ulp(unavail_pmcs)))
14911+ release_evntsel_nmi(d->hw_addr);
14912+ }
14913+ return -EBUSY;
14914+}
14915+/**
14916+ * pfm_arch_pmu_release_percpu - clear NMI state for one CPU
14917+ *
14918+ */
14919+static void pfm_arch_pmu_release_percpu(void *data)
14920+{
14921+ struct pfm_arch_pmu_info *pmu_info;
14922+
14923+ pmu_info = pfm_pmu_conf->pmu_info;
14924+
14925+ __get_cpu_var(pfm_using_nmi) = 0;
14926+
14927+ /*
14928+ * invoke model specific release routine.
14929+ * May be used to undo certain initializations
14930+	 * or free some model-specific resources.
14931+ */
14932+ if (pmu_info->release_pmu_percpu)
14933+ pmu_info->release_pmu_percpu();
14934+}
14935+
14936+/**
14937+ * pfm_arch_pmu_release - release PMU resource to system
14938+ *
14939+ * called from pfm_pmu_release()
14940+ * interrupts are not masked
14941+ *
14942+ * On x86, we return the PMU registers to the MSR allocator
14943+ */
14944+void pfm_arch_pmu_release(void)
14945+{
14946+ struct pfm_regmap_desc *d;
14947+ u16 i, n;
14948+
14949+ d = pfm_pmu_conf->pmc_desc;
14950+ n = pfm_pmu_conf->regs_all.num_pmcs;
14951+ for (i = 0; n; i++, d++) {
14952+ if (!test_bit(i, cast_ulp(pfm_pmu_conf->regs_all.pmcs)))
14953+ continue;
14954+ release_evntsel_nmi(d->hw_addr);
14955+ n--;
14956+ PFM_DBG("pmc%u released", i);
14957+ }
14958+ d = pfm_pmu_conf->pmd_desc;
14959+ n = pfm_pmu_conf->regs_all.num_pmds;
14960+ for (i = 0; n; i++, d++) {
14961+ if (!test_bit(i, cast_ulp(pfm_pmu_conf->regs_all.pmds)))
14962+ continue;
14963+ release_perfctr_nmi(d->hw_addr);
14964+ n--;
14965+ PFM_DBG("pmd%u released", i);
14966+ }
14967+
14968+ /* clear NMI variable if used */
14969+ if (__get_cpu_var(pfm_using_nmi))
14970+ on_each_cpu(pfm_arch_pmu_release_percpu, NULL , 1);
14971+}
14972+
14973+/**
14974+ * pfm_arch_pmu_config_init - validate PMU description structure
14975+ * @cfg: PMU description structure
14976+ *
14977+ * return:
14978+ * 0 if valid
14979+ * errno otherwise
14980+ *
14981+ * called from pfm_pmu_register()
14982+ */
14983+int pfm_arch_pmu_config_init(struct pfm_pmu_config *cfg)
14984+{
14985+ struct pfm_arch_pmu_info *pmu_info;
14986+
14987+ pmu_info = pfm_pmu_info();
14988+ if (!pmu_info) {
14989+ PFM_DBG("%s missing pmu_info", cfg->pmu_name);
14990+ return -EINVAL;
14991+ }
14992+ if (!pmu_info->has_ovfls) {
14993+ PFM_DBG("%s missing has_ovfls callback", cfg->pmu_name);
14994+ return -EINVAL;
14995+ }
14996+ if (!pmu_info->quiesce) {
14997+ PFM_DBG("%s missing quiesce callback", cfg->pmu_name);
14998+ return -EINVAL;
14999+ }
15000+ if (!pmu_info->stop_save) {
15001+ PFM_DBG("%s missing stop_save callback", cfg->pmu_name);
15002+ return -EINVAL;
15003+ }
15004+ return 0;
15005+}
15006+
15007+/**
15008+ * pfm_arch_init - one time global arch-specific initialization
15009+ *
15010+ * called from pfm_init()
15011+ */
15012+int __init pfm_arch_init(void)
15013+{
15014+ /*
15015+ * we need to register our NMI handler when the kernels boots
15016+ * to avoid a deadlock condition with the NMI watchdog or Oprofile
15017+ * if we were to try and register/unregister on-demand.
15018+ */
15019+ register_die_notifier(&pfm_nmi_nb);
15020+ return 0;
15021+}
15022--- /dev/null
15023+++ b/arch/x86/perfmon/perfmon_amd64.c
15024@@ -0,0 +1,754 @@
15025+/*
15026+ * This file contains the PMU description for the Athlon64 and Opteron64
15027+ * processors. It supports 32 and 64-bit modes.
15028+ *
15029+ * Copyright (c) 2005-2007 Hewlett-Packard Development Company, L.P.
15030+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
15031+ *
15032+ * Copyright (c) 2007 Advanced Micro Devices, Inc.
15033+ * Contributed by Robert Richter <robert.richter@amd.com>
15034+ *
15035+ * This program is free software; you can redistribute it and/or
15036+ * modify it under the terms of version 2 of the GNU General Public
15037+ * License as published by the Free Software Foundation.
15038+ *
15039+ * This program is distributed in the hope that it will be useful,
15040+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
15041+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15042+ * General Public License for more details.
15043+ *
15044+ * You should have received a copy of the GNU General Public License
15045+ * along with this program; if not, write to the Free Software
15046+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
15047+ * 02111-1307 USA
15048+ */
15049+#include <linux/module.h>
15050+#include <linux/vmalloc.h>
15051+#include <linux/topology.h>
15052+#include <linux/kprobes.h>
15053+#include <linux/pci.h>
15054+#include <linux/perfmon_kern.h>
15055+#include <asm/hw_irq.h>
15056+#include <asm/apic.h>
15057+
15058+MODULE_AUTHOR("Stephane Eranian <eranian@hpl.hp.com>");
15059+MODULE_AUTHOR("Robert Richter <robert.richter@amd.com>");
15060+MODULE_DESCRIPTION("AMD64 PMU description table");
15061+MODULE_LICENSE("GPL");
15062+
15063+#define PCI_DEVICE_ID_AMD_10H_NB_MISC 0x1203
15064+
15065+static int force_nmi;
15066+MODULE_PARM_DESC(force_nmi, "bool: force use of NMI for PMU interrupt");
15067+module_param(force_nmi, bool, 0600);
15068+
15069+#define HAS_IBS 0x01 /* has IBS support */
15070+
15071+static u8 ibs_eilvt_off, ibs_status; /* AMD: extended interrupt LVT offset */
15072+
15073+static void pfm_amd64_restore_pmcs(struct pfm_context *ctx,
15074+ struct pfm_event_set *set);
15075+static void __kprobes pfm_amd64_quiesce(void);
15076+static int pfm_amd64_has_ovfls(struct pfm_context *ctx);
15077+static int pfm_amd64_stop_save(struct pfm_context *ctx,
15078+ struct pfm_event_set *set);
15079+
15080+#define IBSFETCHCTL_PMC 4 /* pmc4 */
15081+#define IBSFETCHCTL_PMD 4 /* pmd4 */
15082+#define IBSOPSCTL_PMC 5 /* pmc5 */
15083+#define IBSOPSCTL_PMD 7 /* pmd7 */
15084+
15085+static u64 enable_mask[PFM_MAX_PMCS];
15086+static u16 max_enable;
15087+
15088+static struct pfm_arch_pmu_info pfm_amd64_pmu_info = {
15089+ .stop_save = pfm_amd64_stop_save,
15090+ .has_ovfls = pfm_amd64_has_ovfls,
15091+ .quiesce = pfm_amd64_quiesce,
15092+ .restore_pmcs = pfm_amd64_restore_pmcs
15093+};
15094+
15095+#define PFM_AMD64_IBSFETCHVAL (1ULL<<49) /* valid fetch sample */
15096+#define PFM_AMD64_IBSFETCHEN (1ULL<<48) /* fetch sampling enabled */
15097+#define PFM_AMD64_IBSOPVAL (1ULL<<18) /* valid execution sample */
15098+#define PFM_AMD64_IBSOPEN (1ULL<<17) /* execution sampling enabled */
15099+
15100+/*
15101+ * force Local APIC interrupt on overflow
15102+ */
15103+#define PFM_K8_VAL (1ULL<<20)
15104+#define PFM_K8_NO64 (1ULL<<20)
15105+
15106+/*
15107+ * reserved bits must be 1
15108+ *
15109+ * for family 15:
15110+ * - upper 32 bits are reserved
15111+ * - bit 20, bit 21
15112+ *
15113+ * for family 16:
15114+ * - bits 36-39 are reserved
15115+ * - bits 42-63 are reserved
15116+ * - bit 20, bit 21
15117+ *
15118+ * for IBS registers:
15119+ * IBSFETCHCTL: all bits are reserved except bits 57, 48, 15:0
15120+ * IBSOPSCTL : all bits are reserved except bits 17, 15:0
15121+ */
15122+#define PFM_K8_RSVD ((~((1ULL<<32)-1)) | (1ULL<<20) | (1ULL<<21))
15123+#define PFM_16_RSVD ((0x3fffffULL<<42) | (0xfULL<<36) | (1ULL<<20) | (1ULL<<21))
15124+#define PFM_AMD64_IBSFETCHCTL_RSVD (~((1ULL<<48)|(1ULL<<57)|0xffffULL))
15125+#define PFM_AMD64_IBSOPCTL_RSVD (~((1ULL<<17)|0xffffULL))
15126+
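To make the reserved-bit comment and the PFM_K8_RSVD/PFM_16_RSVD definitions above concrete, the two family masks evaluate as follows; this stand-alone check is illustration only, not part of the patch.

#include <stdio.h>

int main(void)
{
	/* same expressions as PFM_K8_RSVD and PFM_16_RSVD above */
	unsigned long long k8  = (~((1ULL << 32) - 1)) | (1ULL << 20) | (1ULL << 21);
	unsigned long long f16 = (0x3fffffULL << 42) | (0xfULL << 36) |
				 (1ULL << 20) | (1ULL << 21);

	printf("PFM_K8_RSVD = 0x%016llx\n", k8);  /* 0xffffffff00300000: bits 63:32, 21 and 20 */
	printf("PFM_16_RSVD = 0x%016llx\n", f16); /* 0xfffffcf000300000: bits 63:42, 39:36, 21 and 20 */
	return 0;
}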
15127+static struct pfm_regmap_desc pfm_amd64_pmc_desc[] = {
15128+/* pmc0 */ PMC_D(PFM_REG_I64, "PERFSEL0", PFM_K8_VAL, PFM_K8_RSVD, PFM_K8_NO64, MSR_K7_EVNTSEL0),
15129+/* pmc1 */ PMC_D(PFM_REG_I64, "PERFSEL1", PFM_K8_VAL, PFM_K8_RSVD, PFM_K8_NO64, MSR_K7_EVNTSEL1),
15130+/* pmc2 */ PMC_D(PFM_REG_I64, "PERFSEL2", PFM_K8_VAL, PFM_K8_RSVD, PFM_K8_NO64, MSR_K7_EVNTSEL2),
15131+/* pmc3 */ PMC_D(PFM_REG_I64, "PERFSEL3", PFM_K8_VAL, PFM_K8_RSVD, PFM_K8_NO64, MSR_K7_EVNTSEL3),
15132+/* pmc4 */ PMC_D(PFM_REG_I, "IBSFETCHCTL", 0, PFM_AMD64_IBSFETCHCTL_RSVD, 0, MSR_AMD64_IBSFETCHCTL),
15133+/* pmc5 */ PMC_D(PFM_REG_I, "IBSOPCTL", 0, PFM_AMD64_IBSOPCTL_RSVD, 0, MSR_AMD64_IBSOPCTL),
15134+};
15135+#define PFM_AMD_NUM_PMCS ARRAY_SIZE(pfm_amd64_pmc_desc)
15136+
15137+#define PFM_REG_IBS (PFM_REG_I|PFM_REG_INTR)
15138+
15139+/*
15140+ * AMD64 counters are 48 bits, upper bits are reserved
15141+ */
15142+#define PFM_AMD64_CTR_RSVD (~((1ULL<<48)-1))
15143+
15144+#define PFM_AMD_D(n) \
15145+ { .type = PFM_REG_C, \
15146+ .desc = "PERFCTR"#n, \
15147+ .hw_addr = MSR_K7_PERFCTR0+n, \
15148+ .rsvd_msk = PFM_AMD64_CTR_RSVD, \
15149+ .dep_pmcs[0] = 1ULL << n \
15150+ }
15151+
15152+#define PFM_AMD_IBSO(t, s, a) \
15153+ { .type = t, \
15154+ .desc = s, \
15155+ .hw_addr = a, \
15156+ .rsvd_msk = 0, \
15157+ .dep_pmcs[0] = 1ULL << 5 \
15158+ }
15159+
15160+#define PFM_AMD_IBSF(t, s, a) \
15161+ { .type = t, \
15162+ .desc = s, \
15163+ .hw_addr = a, \
15164+ .rsvd_msk = 0, \
15165+ .dep_pmcs[0] = 1ULL << 6 \
15166+ }
15167+
15168+static struct pfm_regmap_desc pfm_amd64_pmd_desc[] = {
15169+/* pmd0 */ PFM_AMD_D(0),
15170+/* pmd1 */ PFM_AMD_D(1),
15171+/* pmd2 */ PFM_AMD_D(2),
15172+/* pmd3 */ PFM_AMD_D(3),
15173+/* pmd4 */ PFM_AMD_IBSF(PFM_REG_IBS, "IBSFETCHCTL", MSR_AMD64_IBSFETCHCTL),
15174+/* pmd5 */ PFM_AMD_IBSF(PFM_REG_IRO, "IBSFETCHLINAD", MSR_AMD64_IBSFETCHLINAD),
15175+/* pmd6 */ PFM_AMD_IBSF(PFM_REG_IRO, "IBSFETCHPHYSAD", MSR_AMD64_IBSFETCHPHYSAD),
15176+/* pmd7 */ PFM_AMD_IBSO(PFM_REG_IBS, "IBSOPCTL", MSR_AMD64_IBSOPCTL),
15177+/* pmd8 */ PFM_AMD_IBSO(PFM_REG_IRO, "IBSOPRIP", MSR_AMD64_IBSOPRIP),
15178+/* pmd9 */ PFM_AMD_IBSO(PFM_REG_IRO, "IBSOPDATA", MSR_AMD64_IBSOPDATA),
15179+/* pmd10 */ PFM_AMD_IBSO(PFM_REG_IRO, "IBSOPDATA2", MSR_AMD64_IBSOPDATA2),
15180+/* pmd11 */ PFM_AMD_IBSO(PFM_REG_IRO, "IBSOPDATA3", MSR_AMD64_IBSOPDATA3),
15181+/* pmd12 */ PFM_AMD_IBSO(PFM_REG_IRO, "IBSDCLINAD", MSR_AMD64_IBSDCLINAD),
15182+/* pmd13 */ PFM_AMD_IBSO(PFM_REG_IRO, "IBSDCPHYSAD", MSR_AMD64_IBSDCPHYSAD),
15183+};
15184+#define PFM_AMD_NUM_PMDS ARRAY_SIZE(pfm_amd64_pmd_desc)
15185+
15186+static struct pfm_context **pfm_nb_sys_owners;
15187+static struct pfm_context *pfm_nb_task_owner;
15188+
15189+static struct pfm_pmu_config pfm_amd64_pmu_conf;
15190+
15191+#define is_ibs_pmc(x) (x == 4 || x == 5)
15192+
15193+static void pfm_amd64_setup_eilvt_per_cpu(void *info)
15194+{
15195+ u8 lvt_off;
15196+
15197+ /* program the IBS vector to the perfmon vector */
15198+ lvt_off = setup_APIC_eilvt_ibs(LOCAL_PERFMON_VECTOR,
15199+ APIC_EILVT_MSG_FIX, 0);
15200+ PFM_DBG("APIC_EILVT%d set to 0x%x", lvt_off, LOCAL_PERFMON_VECTOR);
15201+ ibs_eilvt_off = lvt_off;
15202+}
15203+
15204+static int pfm_amd64_setup_eilvt(void)
15205+{
15206+#define IBSCTL_LVTOFFSETVAL (1 << 8)
15207+#define IBSCTL 0x1cc
15208+ struct pci_dev *cpu_cfg;
15209+ int nodes;
15210+ u32 value = 0;
15211+
15212+ /* per CPU setup */
15213+ on_each_cpu(pfm_amd64_setup_eilvt_per_cpu, NULL, 1);
15214+
15215+ nodes = 0;
15216+ cpu_cfg = NULL;
15217+ do {
15218+ cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD,
15219+ PCI_DEVICE_ID_AMD_10H_NB_MISC,
15220+ cpu_cfg);
15221+ if (!cpu_cfg)
15222+ break;
15223+ ++nodes;
15224+ pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off
15225+ | IBSCTL_LVTOFFSETVAL);
15226+ pci_read_config_dword(cpu_cfg, IBSCTL, &value);
15227+ if (value != (ibs_eilvt_off | IBSCTL_LVTOFFSETVAL)) {
15228+ PFM_DBG("Failed to setup IBS LVT offset, "
15229+ "IBSCTL = 0x%08x", value);
15230+ return 1;
15231+ }
15232+ } while (1);
15233+
15234+ if (!nodes) {
15235+ PFM_DBG("No CPU node configured for IBS");
15236+ return 1;
15237+ }
15238+
15239+#ifdef CONFIG_NUMA
15240+ /* Sanity check */
15241+ /* Works only for 64-bit with a proper NUMA implementation. */
15242+ if (nodes != num_possible_nodes()) {
15243+ PFM_DBG("Failed to setup CPU node(s) for IBS, "
15244+ "found: %d, expected %d",
15245+ nodes, num_possible_nodes());
15246+ return 1;
15247+ }
15248+#endif
15249+ return 0;
15250+}
15251+
15252+/*
15253+ * There can only be one user per socket for the Northbridge (NB) events,
15254+ * so we enforce mutual exclusion as follows:
15255+ * - per-thread : only one context machine-wide can use NB events
15256+ * - system-wide: only one context per processor socket
15257+ *
15258+ * Exclusion is enforced at:
15259+ * - pfm_load_context()
15260+ * - pfm_write_pmcs() for attached contexts
15261+ *
15262+ * Exclusion is released at:
15263+ * - pfm_unload_context() or any call that implicitly uses it
15264+ *
15265+ * return:
15266+ * 0 : successfully acquire NB access
15267+ * < 0: errno, failed to acquire NB access
15268+ */
15269+static int pfm_amd64_acquire_nb(struct pfm_context *ctx)
15270+{
15271+ struct pfm_context **entry, *old;
15272+ int proc_id;
15273+
15274+#ifdef CONFIG_SMP
15275+ proc_id = cpu_data(smp_processor_id()).phys_proc_id;
15276+#else
15277+ proc_id = 0;
15278+#endif
15279+
15280+ if (ctx->flags.system)
15281+ entry = &pfm_nb_sys_owners[proc_id];
15282+ else
15283+ entry = &pfm_nb_task_owner;
15284+
15285+ old = cmpxchg(entry, NULL, ctx);
15286+ if (!old) {
15287+ if (ctx->flags.system)
15288+ PFM_DBG("acquired Northbridge event access on socket %u", proc_id);
15289+ else
15290+ PFM_DBG("acquired Northbridge event access globally");
15291+ } else if (old != ctx) {
15292+ if (ctx->flags.system)
15293+ PFM_DBG("NorthBridge event conflict on socket %u", proc_id);
15294+ else
15295+ PFM_DBG("global NorthBridge event conflict");
15296+ return -EBUSY;
15297+ }
15298+ return 0;
15299+}
15300+
15301+/*
15302+ * invoked from pfm_write_pmcs() when pfm_nb_sys_owners is not NULL, i.e.,
15303+ * when we have detected a multi-core processor.
15304+ *
15305+ * context is locked, interrupts are masked
15306+ */
15307+static int pfm_amd64_pmc_write_check(struct pfm_context *ctx,
15308+ struct pfm_event_set *set,
15309+ struct pfarg_pmc *req)
15310+{
15311+ unsigned int event;
15312+
15313+ /*
15314+ * delay checking NB event until we load the context
15315+ */
15316+ if (ctx->state == PFM_CTX_UNLOADED)
15317+ return 0;
15318+
15319+ /*
15320+ * check event is NB event
15321+ */
15322+ event = (unsigned int)(req->reg_value & 0xff);
15323+ if (event < 0xee)
15324+ return 0;
15325+
15326+ return pfm_amd64_acquire_nb(ctx);
15327+}
15328+
15329+/*
15330+ * invoked on pfm_load_context().
15331+ * context is locked, interrupts are masked
15332+ */
15333+static int pfm_amd64_load_context(struct pfm_context *ctx)
15334+{
15335+ struct pfm_event_set *set;
15336+ unsigned int i, n;
15337+
15338+ /*
15339+ * scan all sets for NB events
15340+ */
15341+ list_for_each_entry(set, &ctx->set_list, list) {
15342+ n = set->nused_pmcs;
15343+ for (i = 0; n; i++) {
15344+ if (!test_bit(i, cast_ulp(set->used_pmcs)))
15345+ continue;
15346+
15347+ if (!is_ibs_pmc(i) && (set->pmcs[i] & 0xff) >= 0xee)
15348+ goto found;
15349+ n--;
15350+ }
15351+ }
15352+ return 0;
15353+found:
15354+ return pfm_amd64_acquire_nb(ctx);
15355+}
15356+
15357+/*
15358+ * invoked on pfm_unload_context()
15359+ */
15360+static void pfm_amd64_unload_context(struct pfm_context *ctx)
15361+{
15362+ struct pfm_context **entry, *old;
15363+ int proc_id;
15364+
15365+#ifdef CONFIG_SMP
15366+ proc_id = cpu_data(smp_processor_id()).phys_proc_id;
15367+#else
15368+ proc_id = 0;
15369+#endif
15370+
15371+ /*
15372+ * unload always happens on the monitored CPU in system-wide
15373+ */
15374+ if (ctx->flags.system)
15375+ entry = &pfm_nb_sys_owners[proc_id];
15376+ else
15377+ entry = &pfm_nb_task_owner;
15378+
15379+ old = cmpxchg(entry, ctx, NULL);
15380+ if (old == ctx) {
15381+ if (ctx->flags.system)
15382+ PFM_DBG("released NorthBridge on socket %u", proc_id);
15383+ else
15384+ PFM_DBG("released NorthBridge events globally");
15385+ }
15386+}
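
/*
 * Illustrative user-space sketch (not part of the patch): the same
 * single-owner claim/release protocol that pfm_amd64_acquire_nb() and
 * pfm_amd64_unload_context() implement with cmpxchg, expressed here
 * with C11 atomics. All names and values below are hypothetical.
 */
#include <stdatomic.h>
#include <stdio.h>

static _Atomic(void *) nb_owner;		/* one slot per socket in the real code */

static int claim_nb(void *ctx)
{
	void *expected = NULL;
	/* succeeds only if the slot is free or already owned by ctx */
	if (atomic_compare_exchange_strong(&nb_owner, &expected, ctx))
		return 0;
	return expected == ctx ? 0 : -1;	/* -1 stands in for -EBUSY */
}

static void release_nb(void *ctx)
{
	void *expected = ctx;
	/* only the current owner may clear the slot */
	atomic_compare_exchange_strong(&nb_owner, &expected, NULL);
}

int main(void)
{
	static int ctx_a, ctx_b;
	printf("first claim=%d second claim=%d\n",
	       claim_nb(&ctx_a), claim_nb(&ctx_b));
	release_nb(&ctx_a);
	return 0;
}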
15387+
15388+/*
15389+ * detect if we need to activate NorthBridge event access control
15390+ */
15391+static int pfm_amd64_setup_nb_event_control(void)
15392+{
15393+ unsigned int c, n = 0;
15394+ unsigned int max_phys = 0;
15395+
15396+#ifdef CONFIG_SMP
15397+ for_each_possible_cpu(c) {
15398+ if (cpu_data(c).phys_proc_id > max_phys)
15399+ max_phys = cpu_data(c).phys_proc_id;
15400+ }
15401+#else
15402+ max_phys = 0;
15403+#endif
15404+ if (max_phys > 255) {
15405+ PFM_INFO("socket id %d is too big to handle", max_phys);
15406+ return -ENOMEM;
15407+ }
15408+
15409+ n = max_phys + 1;
15410+ if (n < 2)
15411+ return 0;
15412+
15413+ pfm_nb_sys_owners = vmalloc(n * sizeof(*pfm_nb_sys_owners));
15414+ if (!pfm_nb_sys_owners)
15415+ return -ENOMEM;
15416+
15417+ memset(pfm_nb_sys_owners, 0, n * sizeof(*pfm_nb_sys_owners));
15418+ pfm_nb_task_owner = NULL;
15419+
15420+ /*
15421+ * activate write-checker for PMC registers
15422+ */
15423+ for (c = 0; c < PFM_AMD_NUM_PMCS; c++) {
15424+ if (!is_ibs_pmc(c))
15425+ pfm_amd64_pmc_desc[c].type |= PFM_REG_WC;
15426+ }
15427+
15428+ pfm_amd64_pmu_info.load_context = pfm_amd64_load_context;
15429+ pfm_amd64_pmu_info.unload_context = pfm_amd64_unload_context;
15430+
15431+ pfm_amd64_pmu_conf.pmc_write_check = pfm_amd64_pmc_write_check;
15432+
15433+ PFM_INFO("NorthBridge event access control enabled");
15434+
15435+ return 0;
15436+}
15437+
15438+/*
15439+ * disable registers which are not available on
15440+ * the host (applies to IBS registers)
15441+ */
15442+static void pfm_amd64_check_registers(void)
15443+{
15444+ u16 i;
15445+
15446+ PFM_DBG("has_ibs=%d", !!(ibs_status & HAS_IBS));
15447+
15448+ __set_bit(0, cast_ulp(enable_mask));
15449+ __set_bit(1, cast_ulp(enable_mask));
15450+ __set_bit(2, cast_ulp(enable_mask));
15451+ __set_bit(3, cast_ulp(enable_mask));
15452+ max_enable = 3+1;
15453+
15454+
15455+ /*
15456+ * remove IBS registers if feature not present
15457+ */
15458+ if (!(ibs_status & HAS_IBS)) {
15459+ pfm_amd64_pmc_desc[4].type = PFM_REG_NA;
15460+ pfm_amd64_pmc_desc[5].type = PFM_REG_NA;
15461+ for (i = 4; i < 14; i++)
15462+ pfm_amd64_pmd_desc[i].type = PFM_REG_NA;
15463+ } else {
15464+ __set_bit(16, cast_ulp(enable_mask));
15465+ __set_bit(17, cast_ulp(enable_mask));
15466+ max_enable = 17 + 1;
15467+ }
15468+
15469+ /*
15470+ * adjust reserved bit fields for family 16
15471+ */
15472+ if (current_cpu_data.x86 == 16) {
15473+ for (i = 0; i < PFM_AMD_NUM_PMCS; i++)
15474+ if (pfm_amd64_pmc_desc[i].rsvd_msk == PFM_K8_RSVD)
15475+ pfm_amd64_pmc_desc[i].rsvd_msk = PFM_16_RSVD;
15476+ }
15477+}
15478+
15479+static int pfm_amd64_probe_pmu(void)
15480+{
15481+ u64 val = 0;
15482+ if (current_cpu_data.x86_vendor != X86_VENDOR_AMD) {
15483+ PFM_INFO("not an AMD processor");
15484+ return -1;
15485+ }
15486+
15487+ switch (current_cpu_data.x86) {
15488+ case 16:
15489+ case 15:
15490+ case 6:
15491+ break;
15492+ default:
15493+ PFM_INFO("unsupported family=%d", current_cpu_data.x86);
15494+ return -1;
15495+ }
15496+
15497+ /* check for IBS */
15498+ if (cpu_has(&current_cpu_data, X86_FEATURE_IBS)) {
15499+ ibs_status |= HAS_IBS;
15500+ rdmsrl(MSR_AMD64_IBSCTL, val);
15501+ }
15502+
15503+ PFM_INFO("found family=%d IBSCTL=0x%llx", current_cpu_data.x86, (unsigned long long)val);
15504+
15505+ /*
15506+ * check for local APIC (required)
15507+ */
15508+ if (!cpu_has_apic) {
15509+ PFM_INFO("no local APIC, unsupported");
15510+ return -1;
15511+ }
15512+
15513+ if (current_cpu_data.x86_max_cores > 1
15514+ && pfm_amd64_setup_nb_event_control())
15515+ return -1;
15516+
15517+ if (force_nmi)
15518+ pfm_amd64_pmu_info.flags |= PFM_X86_FL_USE_NMI;
15519+
15520+ if (ibs_status & HAS_IBS) {
15521+ /* Setup extended interrupt */
15522+ if (pfm_amd64_setup_eilvt()) {
15523+ PFM_INFO("Failed to initialize extended interrupts "
15524+ "for IBS");
15525+ ibs_status &= ~HAS_IBS;
15526+ PFM_INFO("Unable to use IBS");
15527+ } else {
15528+ PFM_INFO("IBS supported");
15529+ }
15530+ }
15531+
15532+ pfm_amd64_check_registers();
15533+
15534+ return 0;
15535+}
15536+
15537+/*
15538+ * detect if counters have overflowed.
15539+ * return:
15540+ * 0 : no overflow
15541+ * 1 : at least one overflow
15542+ */
15543+static int __kprobes pfm_amd64_has_ovfls(struct pfm_context *ctx)
15544+{
15545+ struct pfm_regmap_desc *xrd;
15546+ u64 *cnt_mask;
15547+ u64 wmask, val;
15548+ u16 i, num;
15549+
15550+ /*
15551+ * Check for IBS events
15552+ */
15553+ if (ibs_status & HAS_IBS) {
15554+ rdmsrl(MSR_AMD64_IBSFETCHCTL, val);
15555+ if (val & PFM_AMD64_IBSFETCHVAL)
15556+ return 1;
15557+ rdmsrl(MSR_AMD64_IBSOPCTL, val);
15558+ if (val & PFM_AMD64_IBSOPVAL)
15559+ return 1;
15560+ }
15561+ /*
15562+ * Check regular counters
15563+ */
15564+ cnt_mask = ctx->regs.cnt_pmds;
15565+ num = ctx->regs.num_counters;
15566+ wmask = 1ULL << pfm_pmu_conf->counter_width;
15567+ xrd = pfm_amd64_pmd_desc;
15568+
15569+ for (i = 0; num; i++) {
15570+ if (test_bit(i, cast_ulp(cnt_mask))) {
15571+ rdmsrl(xrd[i].hw_addr, val);
15572+ if (!(val & wmask))
15573+ return 1;
15574+ num--;
15575+ }
15576+ }
15577+ return 0;
15578+}
15579+
15580+/*
15581+ * Must check for IBS events BEFORE stopping monitoring because
15582+ * stopping destroys the IBS state information held in
15583+ * IBSFETCHCTL/IBSOPCTL, as they are tagged as enable
15584+ * registers.
15585+ */
15586+static int pfm_amd64_stop_save(struct pfm_context *ctx, struct pfm_event_set *set)
15587+{
15588+ struct pfm_arch_pmu_info *pmu_info;
15589+ u64 used_mask[PFM_PMC_BV];
15590+ u64 *cnt_pmds;
15591+ u64 val, wmask, ovfl_mask;
15592+ u32 i, count, use_ibs;
15593+
15594+ pmu_info = pfm_pmu_info();
15595+
15596+ /*
15597+ * IBS used if:
15598+ * - on family 10h processor with IBS
15599+ * - at least one of the IBS PMD registers is used
15600+ */
15601+ use_ibs = (ibs_status & HAS_IBS)
15602+ && (test_bit(IBSFETCHCTL_PMD, cast_ulp(set->used_pmds))
15603+ || test_bit(IBSOPSCTL_PMD, cast_ulp(set->used_pmds)));
15604+
15605+ wmask = 1ULL << pfm_pmu_conf->counter_width;
15606+
15607+ bitmap_and(cast_ulp(used_mask),
15608+ cast_ulp(set->used_pmcs),
15609+ cast_ulp(enable_mask),
15610+ max_enable);
15611+
15612+ count = bitmap_weight(cast_ulp(used_mask), max_enable);
15613+
15614+ /*
15615+ * stop monitoring
15616+ * Unfortunately, this is very expensive!
15617+ * wrmsrl() is serializing.
15618+ *
15619+ * With IBS, we need to do read-modify-write to preserve the content
15620+ * for OpsCTL and FetchCTL because they are also used as PMDs and saved
15621+ * below
15622+ */
15623+ if (use_ibs) {
15624+ for (i = 0; count; i++) {
15625+ if (test_bit(i, cast_ulp(used_mask))) {
15626+ if (i == IBSFETCHCTL_PMC) {
15627+ rdmsrl(pfm_pmu_conf->pmc_desc[i].hw_addr, val);
15628+ val &= ~PFM_AMD64_IBSFETCHEN;
15629+ } else if (i == IBSOPSCTL_PMC) {
15630+ rdmsrl(pfm_pmu_conf->pmc_desc[i].hw_addr, val);
15631+ val &= ~PFM_AMD64_IBSOPEN;
15632+ } else
15633+ val = 0;
15634+ wrmsrl(pfm_pmu_conf->pmc_desc[i].hw_addr, val);
15635+ count--;
15636+ }
15637+ }
15638+ } else {
15639+ for (i = 0; count; i++) {
15640+ if (test_bit(i, cast_ulp(used_mask))) {
15641+ wrmsrl(pfm_pmu_conf->pmc_desc[i].hw_addr, 0);
15642+ count--;
15643+ }
15644+ }
15645+ }
15646+
15647+ /*
15648+ * if we already have a pending overflow condition, we simply
15649+ * return to take care of this first.
15650+ */
15651+ if (set->npend_ovfls)
15652+ return 1;
15653+
15654+ ovfl_mask = pfm_pmu_conf->ovfl_mask;
15655+ cnt_pmds = ctx->regs.cnt_pmds;
15656+
15657+ /*
15658+ * check for pending overflows and save PMDs (combo)
15659+ * we employ used_pmds because we also need to save
15660+ * and not just check for pending interrupts.
15661+ *
15662+ * Must check for counting PMDs because of virtual PMDs and IBS
15663+ */
15664+ count = set->nused_pmds;
15665+ for (i = 0; count; i++) {
15666+ if (test_bit(i, cast_ulp(set->used_pmds))) {
15667+ val = pfm_arch_read_pmd(ctx, i);
15668+ if (likely(test_bit(i, cast_ulp(cnt_pmds)))) {
15669+ if (!(val & wmask)) {
15670+ __set_bit(i, cast_ulp(set->povfl_pmds));
15671+ set->npend_ovfls++;
15672+ }
15673+ val = (set->pmds[i].value & ~ovfl_mask) | (val & ovfl_mask);
15674+ }
15675+ set->pmds[i].value = val;
15676+ count--;
15677+ }
15678+ }
15679+
15680+ /*
15681+ * check if IBS contains valid data, and mark the corresponding
15682+ * PMD as overflowed
15683+ */
15684+ if (use_ibs) {
15685+ if (set->pmds[IBSFETCHCTL_PMD].value & PFM_AMD64_IBSFETCHVAL) {
15686+ __set_bit(IBSFETCHCTL_PMD, cast_ulp(set->povfl_pmds));
15687+ set->npend_ovfls++;
15688+ }
15689+ if (set->pmds[IBSOPSCTL_PMD].value & PFM_AMD64_IBSOPVAL) {
15690+ __set_bit(IBSOPSCTL_PMD, cast_ulp(set->povfl_pmds));
15691+ set->npend_ovfls++;
15692+ }
15693+ }
15694+ /* 0 means: no need to save PMDs at upper level */
15695+ return 0;
15696+}
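
/*
 * Illustrative user-space sketch (not part of the patch): how the
 * 64-bit virtual counter value is merged with the hardware bits in
 * pfm_amd64_stop_save() above, assuming (as the code implies) that
 * ovfl_mask covers the low counter_width bits and that a cleared
 * top bit signals a wrap. All values below are made up.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const unsigned int counter_width = 47;		/* AMD64: 48-bit counters */
	const uint64_t wmask = 1ULL << counter_width;	/* overflow indicator bit */
	const uint64_t ovfl_mask = wmask - 1;		/* low hardware bits */
	uint64_t sw_value = 0xabc800000000f000ULL;	/* 64-bit virtual counter */
	uint64_t hw_value = 0x0000000000000020ULL;	/* hardware counter after a wrap */

	if (!(hw_value & wmask))
		printf("pending overflow detected\n");

	/* keep the software upper bits, take the hardware lower bits */
	sw_value = (sw_value & ~ovfl_mask) | (hw_value & ovfl_mask);
	printf("merged value: 0x%016llx\n", (unsigned long long)sw_value);
	return 0;
}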
15697+
15698+/**
15699+ * pfm_amd64_quiesce -- stop monitoring without grabbing any lock
15700+ *
15701+ * called from NMI interrupt handler to immediately stop monitoring
15702+ * cannot grab any lock, including perfmon related locks
15703+ */
15704+static void __kprobes pfm_amd64_quiesce(void)
15705+{
15706+ /*
15707+ * quiesce PMU by clearing available registers that have
15708+ * the start/stop capability
15709+ */
15710+ if (test_bit(0, cast_ulp(pfm_pmu_conf->regs_all.pmcs)))
15711+ wrmsrl(MSR_K7_EVNTSEL0, 0);
15712+ if (test_bit(1, cast_ulp(pfm_pmu_conf->regs_all.pmcs)))
15713+ wrmsrl(MSR_K7_EVNTSEL0+1, 0);
15714+ if (test_bit(2, cast_ulp(pfm_pmu_conf->regs_all.pmcs)))
15715+ wrmsrl(MSR_K7_EVNTSEL0+2, 0);
15716+ if (test_bit(3, cast_ulp(pfm_pmu_conf->regs_all.pmcs)))
15717+ wrmsrl(MSR_K7_EVNTSEL0+3, 0);
15718+
15719+ if (test_bit(4, cast_ulp(pfm_pmu_conf->regs_all.pmcs)))
15720+ wrmsrl(MSR_AMD64_IBSFETCHCTL, 0);
15721+ if (test_bit(5, cast_ulp(pfm_pmu_conf->regs_all.pmcs)))
15722+ wrmsrl(MSR_AMD64_IBSOPCTL, 0);
15723+}
15724+
15725+/**
15726+ * pfm_amd64_restore_pmcs - reload PMC registers
15727+ * @ctx: context to restore from
15728+ * @set: current event set
15729+ *
15730+ * optimized version of pfm_arch_restore_pmcs(). On AMD64, we can
15731+ * afford to only restore the pmcs registers we use, because they are
15732+ * all independent from each other.
15733+ */
15734+static void pfm_amd64_restore_pmcs(struct pfm_context *ctx,
15735+ struct pfm_event_set *set)
15736+{
15737+ u64 *mask;
15738+ u16 i, num;
15739+
15740+ mask = set->used_pmcs;
15741+ num = set->nused_pmcs;
15742+ for (i = 0; num; i++) {
15743+ if (test_bit(i, cast_ulp(mask))) {
15744+ wrmsrl(pfm_amd64_pmc_desc[i].hw_addr, set->pmcs[i]);
15745+ num--;
15746+ }
15747+ }
15748+}
15749+
15750+static struct pfm_pmu_config pfm_amd64_pmu_conf = {
15751+ .pmu_name = "AMD64",
15752+ .counter_width = 47,
15753+ .pmd_desc = pfm_amd64_pmd_desc,
15754+ .pmc_desc = pfm_amd64_pmc_desc,
15755+ .num_pmc_entries = PFM_AMD_NUM_PMCS,
15756+ .num_pmd_entries = PFM_AMD_NUM_PMDS,
15757+ .probe_pmu = pfm_amd64_probe_pmu,
15758+ .version = "1.2",
15759+ .pmu_info = &pfm_amd64_pmu_info,
15760+ .flags = PFM_PMU_BUILTIN_FLAG,
15761+ .owner = THIS_MODULE,
15762+};
15763+
15764+static int __init pfm_amd64_pmu_init_module(void)
15765+{
15766+ return pfm_pmu_register(&pfm_amd64_pmu_conf);
15767+}
15768+
15769+static void __exit pfm_amd64_pmu_cleanup_module(void)
15770+{
15771+ if (pfm_nb_sys_owners)
15772+ vfree(pfm_nb_sys_owners);
15773+
15774+ pfm_pmu_unregister(&pfm_amd64_pmu_conf);
15775+}
15776+
15777+module_init(pfm_amd64_pmu_init_module);
15778+module_exit(pfm_amd64_pmu_cleanup_module);
15779--- /dev/null
15780+++ b/arch/x86/perfmon/perfmon_intel_arch.c
15781@@ -0,0 +1,610 @@
15782+/*
15783+ * This file contains the Intel architectural perfmon v1, v2, v3
15784+ * description tables.
15785+ *
15786+ * Architectural perfmon was introduced with Intel Core Solo/Duo
15787+ * processors.
15788+ *
15789+ * Copyright (c) 2006-2007 Hewlett-Packard Development Company, L.P.
15790+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
15791+ *
15792+ * This program is free software; you can redistribute it and/or
15793+ * modify it under the terms of version 2 of the GNU General Public
15794+ * License as published by the Free Software Foundation.
15795+ *
15796+ * This program is distributed in the hope that it will be useful,
15797+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
15798+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15799+ * General Public License for more details.
15800+ *
15801+ * You should have received a copy of the GNU General Public License
15802+ * along with this program; if not, write to the Free Software
15803+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
15804+ * 02111-1307 USA
15805+ */
15806+#include <linux/module.h>
15807+#include <linux/kprobes.h>
15808+#include <linux/perfmon_kern.h>
15809+#include <linux/nmi.h>
15810+#include <asm/msr.h>
15811+#include <asm/apic.h>
15812+
15813+MODULE_AUTHOR("Stephane Eranian <eranian@hpl.hp.com>");
15814+MODULE_DESCRIPTION("Intel architectural perfmon v1");
15815+MODULE_LICENSE("GPL");
15816+
15817+static int force, force_nmi;
15818+MODULE_PARM_DESC(force, "bool: force module to load successfully");
15819+MODULE_PARM_DESC(force_nmi, "bool: force use of NMI for PMU interrupt");
15820+module_param(force, bool, 0600);
15821+module_param(force_nmi, bool, 0600);
15822+
15823+static u64 enable_mask[PFM_MAX_PMCS];
15824+static u16 max_enable;
15825+
15826+/*
15827+ * - upper 32 bits are reserved
15828+ * - INT: APIC enable bit is reserved (forced to 1)
15829+ * - bit 21 is reserved
15830+ *
15831+ * RSVD: reserved bits are 1
15832+ */
15833+#define PFM_IA_PMC_RSVD ((~((1ULL<<32)-1)) \
15834+ | (1ULL<<20) \
15835+ | (1ULL<<21))
15836+
15837+/*
15838+ * force Local APIC interrupt on overflow
15839+ * disable with NO_EMUL64
15840+ */
15841+#define PFM_IA_PMC_VAL (1ULL<<20)
15842+#define PFM_IA_NO64 (1ULL<<20)
15843+
15844+/*
15845+ * architecture specifies that:
15846+ * IA32_PMCx MSR : starts at 0x0c1 & occupy a contiguous block of MSR
15847+ * IA32_PERFEVTSELx MSR : starts at 0x186 & occupy a contiguous block of MSR
15848+ * MSR_GEN_FIXED_CTR0 : starts at 0x309 & occupy a contiguous block of MSR
15849+ */
15850+#define MSR_GEN_SEL_BASE MSR_P6_EVNTSEL0
15851+#define MSR_GEN_PMC_BASE MSR_P6_PERFCTR0
15852+#define MSR_GEN_FIXED_PMC_BASE MSR_CORE_PERF_FIXED_CTR0
15853+
15854+/*
15855+ * layout of EAX for CPUID.0xa leaf function
15856+ */
15857+struct pmu_eax {
15858+ unsigned int version:8; /* architectural perfmon version */
15859+ unsigned int num_cnt:8; /* number of generic counters */
15860+ unsigned int cnt_width:8; /* width of generic counters */
15861+ unsigned int ebx_length:8; /* number of architected events */
15862+};
15863+
15864+/*
15865+ * layout of EDX for CPUID.0xa leaf function when perfmon v2 is detected
15866+ */
15867+struct pmu_edx {
15868+ unsigned int num_cnt:5; /* number of fixed counters */
15869+ unsigned int cnt_width:8; /* width of fixed counters */
15870+ unsigned int reserved:19;
15871+};
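
/*
 * Illustrative user-space sketch (not part of the patch): decoding
 * CPUID leaf 0xa with the same EAX/EDX layout as the pmu_eax/pmu_edx
 * structures above, assuming an x86 machine and GCC/Clang's <cpuid.h>.
 */
#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(0xa, &eax, &ebx, &ecx, &edx)) {
		fprintf(stderr, "CPUID leaf 0xa not available\n");
		return 1;
	}
	printf("version=%u generic counters=%u generic width=%u\n",
	       eax & 0xff, (eax >> 8) & 0xff, (eax >> 16) & 0xff);
	printf("fixed counters=%u fixed width=%u\n",
	       edx & 0x1f, (edx >> 5) & 0xff);
	return 0;
}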
15872+
15873+static void pfm_intel_arch_restore_pmcs(struct pfm_context *ctx,
15874+ struct pfm_event_set *set);
15875+static int pfm_intel_arch_stop_save(struct pfm_context *ctx,
15876+ struct pfm_event_set *set);
15877+static int pfm_intel_arch_has_ovfls(struct pfm_context *ctx);
15878+static void __kprobes pfm_intel_arch_quiesce(void);
15879+
15880+/*
15881+ * physical addresses of MSR controlling the perfevtsel and counter registers
15882+ */
15883+struct pfm_arch_pmu_info pfm_intel_arch_pmu_info = {
15884+ .stop_save = pfm_intel_arch_stop_save,
15885+ .has_ovfls = pfm_intel_arch_has_ovfls,
15886+ .quiesce = pfm_intel_arch_quiesce,
15887+ .restore_pmcs = pfm_intel_arch_restore_pmcs
15888+};
15889+
15890+#define PFM_IA_C(n) { \
15891+ .type = PFM_REG_I64, \
15892+ .desc = "PERFEVTSEL"#n, \
15893+ .dfl_val = PFM_IA_PMC_VAL, \
15894+ .rsvd_msk = PFM_IA_PMC_RSVD, \
15895+ .no_emul64_msk = PFM_IA_NO64, \
15896+ .hw_addr = MSR_GEN_SEL_BASE+(n) \
15897+ }
15898+
15899+#define PFM_IA_D(n) \
15900+ { .type = PFM_REG_C, \
15901+ .desc = "PMC"#n, \
15902+ .hw_addr = MSR_P6_PERFCTR0+n, \
15903+ .dep_pmcs[0] = 1ULL << n \
15904+ }
15905+
15906+#define PFM_IA_FD(n) \
15907+ { .type = PFM_REG_C, \
15908+ .desc = "FIXED_CTR"#n, \
15909+ .hw_addr = MSR_CORE_PERF_FIXED_CTR0+n,\
15910+ .dep_pmcs[0] = 1ULL << 16 \
15911+ }
15912+
15913+static struct pfm_regmap_desc pfm_intel_arch_pmc_desc[] = {
15914+/* pmc0 */ PFM_IA_C(0), PFM_IA_C(1), PFM_IA_C(2), PFM_IA_C(3),
15915+/* pmc4 */ PFM_IA_C(4), PFM_IA_C(5), PFM_IA_C(6), PFM_IA_C(7),
15916+/* pmc8 */ PFM_IA_C(8), PFM_IA_C(9), PFM_IA_C(10), PFM_IA_C(11),
15917+/* pmc12 */ PFM_IA_C(12), PFM_IA_C(13), PFM_IA_C(14), PFM_IA_C(15),
15918+
15919+/* pmc16 */ { .type = PFM_REG_I,
15920+ .desc = "FIXED_CTRL",
15921+ .dfl_val = 0x8888888888888888ULL, /* force PMI */
15922+ .rsvd_msk = 0, /* set dynamically */
15923+ .no_emul64_msk = 0,
15924+ .hw_addr = MSR_CORE_PERF_FIXED_CTR_CTRL
15925+ },
15926+};
15927+#define PFM_IA_MAX_PMCS ARRAY_SIZE(pfm_intel_arch_pmc_desc)
15928+
15929+static struct pfm_regmap_desc pfm_intel_arch_pmd_desc[] = {
15930+/* pmd0 */ PFM_IA_D(0), PFM_IA_D(1), PFM_IA_D(2), PFM_IA_D(3),
15931+/* pmd4 */ PFM_IA_D(4), PFM_IA_D(5), PFM_IA_D(6), PFM_IA_D(7),
15932+/* pmd8 */ PFM_IA_D(8), PFM_IA_D(9), PFM_IA_D(10), PFM_IA_D(11),
15933+/* pmd12 */ PFM_IA_D(12), PFM_IA_D(13), PFM_IA_D(14), PFM_IA_D(15),
15934+
15935+/* pmd16 */ PFM_IA_FD(0), PFM_IA_FD(1), PFM_IA_FD(2), PFM_IA_FD(3),
15936+/* pmd20 */ PFM_IA_FD(4), PFM_IA_FD(5), PFM_IA_FD(6), PFM_IA_FD(7),
15937+/* pmd24 */ PFM_IA_FD(8), PFM_IA_FD(9), PFM_IA_FD(10), PFM_IA_FD(11),
15938+/* pmd28 */ PFM_IA_FD(16), PFM_IA_FD(17), PFM_IA_FD(18), PFM_IA_FD(19)
15939+};
15940+#define PFM_IA_MAX_PMDS ARRAY_SIZE(pfm_intel_arch_pmd_desc)
15941+
15942+#define PFM_IA_MAX_CNT 16 /* # generic counters in mapping table */
15943+#define PFM_IA_MAX_FCNT 16 /* # of fixed counters in mapping table */
15944+#define PFM_IA_FCNT_BASE 16 /* base index of fixed counters PMD */
15945+
15946+static struct pfm_pmu_config pfm_intel_arch_pmu_conf;
15947+
15948+static void pfm_intel_arch_check_errata(void)
15949+{
15950+ /*
15951+ * Core Duo erratum AE49 (no fix). Both counters share a single
15952+ * enable bit in PERFEVTSEL0
15953+ */
15954+ if (current_cpu_data.x86 == 6 && current_cpu_data.x86_model == 14)
15955+ pfm_intel_arch_pmu_info.flags |= PFM_X86_FL_NO_SHARING;
15956+}
15957+
15958+static inline void set_enable_mask(unsigned int i)
15959+{
15960+ __set_bit(i, cast_ulp(enable_mask));
15961+
15962+ /* max_enable = highest + 1 */
15963+ if ((i+1) > max_enable)
15964+ max_enable = i+ 1;
15965+}
15966+
15967+static void pfm_intel_arch_setup_generic(unsigned int version,
15968+ unsigned int width,
15969+ unsigned int count)
15970+{
15971+ u64 rsvd;
15972+ unsigned int i;
15973+
15974+ /*
15975+ * first we handle the generic counters:
15976+ *
15977+ * - ensure HW does not have more registers than hardcoded in the tables
15978+ * - adjust rsvd_msk to actual counter width
15979+ * - initialize enable_mask (list of PMC with start/stop capability)
15980+ * - mark unused hardcoded generic counters as unimplemented
15981+ */
15982+
15983+ /*
15984+ * min of number of Hw counters and hardcoded in the tables
15985+ */
15986+ if (count >= PFM_IA_MAX_CNT) {
15987+ printk(KERN_INFO "perfmon: Limiting number of generic counters"
15988+ " to %u, HW supports %u",
15989+ PFM_IA_MAX_CNT, count);
15990+ count = PFM_IA_MAX_CNT;
15991+ }
15992+
15993+ /*
15994+ * adjust rsvd_msk for generic counters based on actual width
15995+ * initialize enable_mask (1 per pmd)
15996+ */
15997+ rsvd = ~((1ULL << width)-1);
15998+ for (i = 0; i < count; i++) {
15999+ pfm_intel_arch_pmd_desc[i].rsvd_msk = rsvd;
16000+ set_enable_mask(i);
16001+ }
16002+
16003+ /*
16004+ * handle version 3 new anythread bit (21)
16005+ */
16006+ if (version == 3) {
16007+ for (i = 0; i < count; i++)
16008+ pfm_intel_arch_pmc_desc[i].rsvd_msk &= ~(1ULL << 21);
16009+ }
16010+
16011+
16012+ /*
16013+ * mark unused generic counters as not available
16014+ */
16015+ for (i = count ; i < PFM_IA_MAX_CNT; i++) {
16016+ pfm_intel_arch_pmd_desc[i].type = PFM_REG_NA;
16017+ pfm_intel_arch_pmc_desc[i].type = PFM_REG_NA;
16018+ }
16019+}
16020+
16021+static void pfm_intel_arch_setup_fixed(unsigned int version,
16022+ unsigned int width,
16023+ unsigned int count)
16024+{
16025+ u64 rsvd, dfl;
16026+ unsigned int i;
16027+
16028+ /*
16029+ * handle the fixed counters (if any):
16030+ *
16031+ * - ensure HW does not have more registers than hardcoded in the tables
16032+ * - adjust rsvd_msk to actual counter width
16033+ * - initialize enable_mask (list of PMC with start/stop capability)
16034+ * - mark unused hardcoded fixed counters as unimplemented
16035+ */
16036+ if (count >= PFM_IA_MAX_FCNT) {
16037+ printk(KERN_INFO "perfmon: Limiting number of fixed counters"
16038+ " to %u, HW supports %u",
16039+ PFM_IA_MAX_FCNT, count);
16040+ count = PFM_IA_MAX_FCNT;
16041+ }
16042+ /*
16043+ * adjust rsvd_msk for fixed counters based on actual width
16044+ */
16045+ rsvd = ~((1ULL << width)-1);
16046+ for (i = 0; i < count; i++)
16047+ pfm_intel_arch_pmd_desc[PFM_IA_FCNT_BASE+i].rsvd_msk = rsvd;
16048+
16049+ /*
16050+ * handle version 3 new anythread bit (bit 2)
16051+ */
16052+ if (version == 3)
16053+ rsvd = 1ULL << 3;
16054+ else
16055+ rsvd = 3ULL << 2;
16056+
16057+ pfm_intel_arch_pmc_desc[16].rsvd_msk = 0;
16058+ for (i = 0; i < count; i++)
16059+ pfm_intel_arch_pmc_desc[16].rsvd_msk |= rsvd << (i<<2);
16060+
16061+ /*
16062+ * mark unused fixed counters as unimplemented
16063+ *
16064+ * update the rsvd_msk, dfl_val in FIXED_CTRL:
16065+ * - rsvd_msk: set all 4 bits
16066+ * - dfl_val : clear all 4 bits
16067+ */
16068+ dfl = pfm_intel_arch_pmc_desc[16].dfl_val;
16069+ rsvd = pfm_intel_arch_pmc_desc[16].rsvd_msk;
16070+
16071+ for (i = count ; i < PFM_IA_MAX_FCNT; i++) {
16072+ pfm_intel_arch_pmd_desc[PFM_IA_FCNT_BASE+i].type = PFM_REG_NA;
16073+ rsvd |= 0xfULL << (i<<2);
16074+ dfl &= ~(0xfULL << (i<<2));
16075+ }
16076+
16077+ /*
16078+ * FIXED_CTR_CTRL unavailable when no fixed counters are defined
16079+ */
16080+ if (!count) {
16081+ pfm_intel_arch_pmc_desc[16].type = PFM_REG_NA;
16082+ } else {
16083+ /* update rsvd_mask and dfl_val */
16084+ pfm_intel_arch_pmc_desc[16].rsvd_msk = rsvd;
16085+ pfm_intel_arch_pmc_desc[16].dfl_val = dfl;
16086+ set_enable_mask(16);
16087+ }
16088+}
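
/*
 * Illustrative user-space sketch (not part of the patch): the 4-bit
 * per-counter layout of FIXED_CTRL that pfm_intel_arch_setup_fixed()
 * builds above. With three fixed counters and sixteen table slots it
 * reproduces the 0x888 default and 0xfffffffffffffccc reserved mask
 * seen in the descriptors; the counts below are made up.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const unsigned int count = 3;	/* fixed counters present */
	const unsigned int max = 16;	/* slots hardcoded in the table */
	uint64_t rsvd = 0, dfl = 0;
	unsigned int i;

	for (i = 0; i < count; i++) {
		rsvd |= 0xcULL << (i << 2);	/* pre-v3: bits 2-3 of the nibble reserved */
		dfl  |= 0x8ULL << (i << 2);	/* force PMI in each nibble */
	}
	for (i = count; i < max; i++)
		rsvd |= 0xfULL << (i << 2);	/* unused nibbles fully reserved */

	printf("rsvd=0x%016llx dfl=0x%03llx\n",
	       (unsigned long long)rsvd, (unsigned long long)dfl);
	return 0;
}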
16089+
16090+static int pfm_intel_arch_probe_pmu(void)
16091+{
16092+ union {
16093+ unsigned int val;
16094+ struct pmu_eax eax;
16095+ struct pmu_edx edx;
16096+ } eax, edx;
16097+ unsigned int ebx, ecx;
16098+ unsigned int width = 0;
16099+
16100+ edx.val = 0;
16101+
16102+ if (!(cpu_has_arch_perfmon || force)) {
16103+ PFM_INFO("no support for Intel architectural PMU");
16104+ return -1;
16105+ }
16106+
16107+ if (!cpu_has_apic) {
16108+ PFM_INFO("no Local APIC, try rebooting with lapic option");
16109+ return -1;
16110+ }
16111+
16112+ /* cpuid() call protected by cpu_has_arch_perfmon */
16113+ cpuid(0xa, &eax.val, &ebx, &ecx, &edx.val);
16114+
16115+ /*
16116+ * reject processors supported by perfmon_intel_core
16117+ *
16118+ * We need to do this explicitly to avoid depending
16119+ * on the link order in case the modules are compiled as
16120+ * builtin.
16121+ *
16122+ * non-Intel processors are rejected by cpu_has_arch_perfmon
16123+ */
16124+ if (current_cpu_data.x86 == 6 && !force) {
16125+ switch (current_cpu_data.x86_model) {
16126+ case 15: /* Merom: use perfmon_intel_core */
16127+ case 23: /* Penryn: use perfmon_intel_core */
16128+ return -1;
16129+ default:
16130+ break;
16131+ }
16132+ }
16133+
16134+ /*
16135+ * some 6/15 models have buggy BIOS
16136+ */
16137+ if (eax.eax.version == 0
16138+ && current_cpu_data.x86 == 6 && current_cpu_data.x86_model == 15) {
16139+ PFM_INFO("buggy v2 BIOS, adjusting for 2 generic counters");
16140+ eax.eax.version = 2;
16141+ eax.eax.num_cnt = 2;
16142+ eax.eax.cnt_width = 40;
16143+ }
16144+
16145+ /*
16146+ * Intel Atom processors have buggy firmware which does not report
16147+ * the correct number of fixed counters
16148+ */
16149+ if (eax.eax.version == 3 && edx.edx.num_cnt < 3
16150+ && current_cpu_data.x86 == 6 && current_cpu_data.x86_model == 28) {
16151+ PFM_INFO("buggy v3 BIOS, adjusting for 3 fixed counters");
16152+ edx.edx.num_cnt = 3;
16153+ }
16154+
16155+ /*
16156+ * some v2 BIOSes are incomplete
16157+ */
16158+ if (eax.eax.version == 2 && !edx.edx.num_cnt) {
16159+ PFM_INFO("buggy v2 BIOS, adjusting for 3 fixed counters");
16160+ edx.edx.num_cnt = 3;
16161+ edx.edx.cnt_width = 40;
16162+ }
16163+
16164+ /*
16165+ * no fixed counters on earlier versions
16166+ */
16167+ if (eax.eax.version < 2) {
16168+ edx.val = 0;
16169+ } else {
16170+ /*
16171+ * use the min value of both widths until we support
16172+ * variable width counters
16173+ */
16174+ width = eax.eax.cnt_width < edx.edx.cnt_width ?
16175+ eax.eax.cnt_width : edx.edx.cnt_width;
16176+ }
16177+
16178+ PFM_INFO("detected architecural perfmon v%d", eax.eax.version);
16179+ PFM_INFO("num_gen=%d width=%d num_fixed=%d width=%d",
16180+ eax.eax.num_cnt,
16181+ eax.eax.cnt_width,
16182+ edx.edx.num_cnt,
16183+ edx.edx.cnt_width);
16184+
16185+
16186+ pfm_intel_arch_setup_generic(eax.eax.version,
16187+ width,
16188+ eax.eax.num_cnt);
16189+
16190+ pfm_intel_arch_setup_fixed(eax.eax.version,
16191+ width,
16192+ edx.edx.num_cnt);
16193+
16194+ if (force_nmi)
16195+ pfm_intel_arch_pmu_info.flags |= PFM_X86_FL_USE_NMI;
16196+
16197+ pfm_intel_arch_check_errata();
16198+
16199+ return 0;
16200+}
16201+
16202+/**
16203+ * pfm_intel_arch_has_ovfls - check for pending overflow condition
16204+ * @ctx: context to work on
16205+ *
16206+ * detect if counters have overflowed.
16207+ * return:
16208+ * 0 : no overflow
16209+ * 1 : at least one overflow
16210+ */
16211+static int __kprobes pfm_intel_arch_has_ovfls(struct pfm_context *ctx)
16212+{
16213+ u64 *cnt_mask;
16214+ u64 wmask, val;
16215+ u16 i, num;
16216+
16217+ cnt_mask = ctx->regs.cnt_pmds;
16218+ num = ctx->regs.num_counters;
16219+ wmask = 1ULL << pfm_pmu_conf->counter_width;
16220+
16221+ /*
16222+ * we can leverage the fact that we know the mapping
16223+ * to hardcode the MSR address and avoid accessing
16224+ * more cachelines
16225+ *
16226+ * We need to check cnt_mask because not all registers
16227+ * may be available.
16228+ */
16229+ for (i = 0; num; i++) {
16230+ if (test_bit(i, cast_ulp(cnt_mask))) {
16231+ rdmsrl(pfm_intel_arch_pmd_desc[i].hw_addr, val);
16232+ if (!(val & wmask))
16233+ return 1;
16234+ num--;
16235+ }
16236+ }
16237+ return 0;
16238+}
16239+
16240+static int pfm_intel_arch_stop_save(struct pfm_context *ctx,
16241+ struct pfm_event_set *set)
16242+{
16243+ u64 used_mask[PFM_PMC_BV];
16244+ u64 *cnt_pmds;
16245+ u64 val, wmask, ovfl_mask;
16246+ u32 i, count;
16247+
16248+ wmask = 1ULL << pfm_pmu_conf->counter_width;
16249+
16250+ bitmap_and(cast_ulp(used_mask),
16251+ cast_ulp(set->used_pmcs),
16252+ cast_ulp(enable_mask),
16253+ max_enable);
16254+
16255+ count = bitmap_weight(cast_ulp(used_mask), max_enable);
16256+
16257+ /*
16258+ * stop monitoring
16259+ * Unfortunately, this is very expensive!
16260+ * wrmsrl() is serializing.
16261+ */
16262+ for (i = 0; count; i++) {
16263+ if (test_bit(i, cast_ulp(used_mask))) {
16264+ wrmsrl(pfm_pmu_conf->pmc_desc[i].hw_addr, 0);
16265+ count--;
16266+ }
16267+ }
16268+
16269+ /*
16270+ * if we already have a pending overflow condition, we simply
16271+ * return to take care of this first.
16272+ */
16273+ if (set->npend_ovfls)
16274+ return 1;
16275+
16276+ ovfl_mask = pfm_pmu_conf->ovfl_mask;
16277+ cnt_pmds = ctx->regs.cnt_pmds;
16278+
16279+ /*
16280+ * check for pending overflows and save PMDs (combo)
16281+ * we employ used_pmds because we also need to save
16282+ * and not just check for pending interrupts.
16283+ *
16284+ * Must check for counting PMDs because of virtual PMDs
16285+ */
16286+ count = set->nused_pmds;
16287+ for (i = 0; count; i++) {
16288+ if (test_bit(i, cast_ulp(set->used_pmds))) {
16289+ val = pfm_arch_read_pmd(ctx, i);
16290+ if (likely(test_bit(i, cast_ulp(cnt_pmds)))) {
16291+ if (!(val & wmask)) {
16292+ __set_bit(i, cast_ulp(set->povfl_pmds));
16293+ set->npend_ovfls++;
16294+ }
16295+ val = (set->pmds[i].value & ~ovfl_mask)
16296+ | (val & ovfl_mask);
16297+ }
16298+ set->pmds[i].value = val;
16299+ count--;
16300+ }
16301+ }
16302+ /* 0 means: no need to save PMDs at upper level */
16303+ return 0;
16304+}
16305+
16306+/**
16307+ * pfm_intel_arch_quiesce - stop monitoring without grabbing any lock
16308+ *
16309+ * called from NMI interrupt handler to immediately stop monitoring
16310+ * cannot grab any lock, including perfmon related locks
16311+ */
16312+static void __kprobes pfm_intel_arch_quiesce(void)
16313+{
16314+ u16 i;
16315+
16316+ /*
16317+ * PMC16 is the fixed counter control register so it has a
16318+ * distinct MSR address
16319+ *
16320+ * We do not use the hw_addr field in the table to avoid touching
16321+ * too many cachelines
16322+ */
16323+ for (i = 0; i < pfm_pmu_conf->regs_all.max_pmc; i++) {
16324+ if (test_bit(i, cast_ulp(pfm_pmu_conf->regs_all.pmcs))) {
16325+ if (i == 16)
16326+ wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
16327+ else
16328+ wrmsrl(MSR_P6_EVNTSEL0+i, 0);
16329+ }
16330+ }
16331+}
16332+
16333+/**
16334+ * pfm_intel_arch_restore_pmcs - reload PMC registers
16335+ * @ctx: context to restore from
16336+ * @set: current event set
16337+ *
16338+ * optimized version of pfm_arch_restore_pmcs(). On architectural perfmon,
16339+ * we can afford to only restore the pmcs registers we use, because they
16340+ * are all independent from each other.
16341+ */
16342+static void pfm_intel_arch_restore_pmcs(struct pfm_context *ctx,
16343+ struct pfm_event_set *set)
16344+{
16345+ u64 *mask;
16346+ u16 i, num;
16347+
16348+ mask = set->used_pmcs;
16349+ num = set->nused_pmcs;
16350+ for (i = 0; num; i++) {
16351+ if (test_bit(i, cast_ulp(mask))) {
16352+ wrmsrl(pfm_pmu_conf->pmc_desc[i].hw_addr, set->pmcs[i]);
16353+ num--;
16354+ }
16355+ }
16356+}
16357+/*
16358+ * Counters may have model-specific width. Yet the documentation says
16359+ * that only the lower 32 bits can be written to due to the specification
16360+ * of wrmsr. Bits [32-(w-1)] are sign extensions of bit 31. Bits [w-63] must
16361+ * not be set (see rsvd_msk for PMDs). As such the effective width of a
16362+ * counter is 31 bits only regardless of what CPUID.0xa returns.
16363+ *
16364+ * See IA-32 Intel Architecture Software developer manual Vol 3B chapter 18
16365+ */
16366+static struct pfm_pmu_config pfm_intel_arch_pmu_conf = {
16367+ .pmu_name = "Intel architectural",
16368+ .pmd_desc = pfm_intel_arch_pmd_desc,
16369+ .counter_width = 31,
16370+ .num_pmc_entries = PFM_IA_MAX_PMCS,
16371+ .num_pmd_entries = PFM_IA_MAX_PMDS,
16372+ .pmc_desc = pfm_intel_arch_pmc_desc,
16373+ .probe_pmu = pfm_intel_arch_probe_pmu,
16374+ .version = "1.0",
16375+ .flags = PFM_PMU_BUILTIN_FLAG,
16376+ .owner = THIS_MODULE,
16377+ .pmu_info = &pfm_intel_arch_pmu_info
16378+};
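
/*
 * Illustrative user-space sketch (not part of the patch): why only
 * 31 bits are usable, per the comment above. The write takes its value
 * from the low 32 bits and the counter hardware sign-extends bit 31 up
 * to the implemented width, simulated here with plain integer
 * arithmetic. The width and the value are made up.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const unsigned int width = 40;			/* model-specific counter width */
	uint64_t wanted = 0x00000000ffff0000ULL;	/* bit 31 happens to be set */
	uint64_t effective;

	/* bits 32..width-1 become copies of bit 31 */
	effective = (uint64_t)(int64_t)(int32_t)(wanted & 0xffffffffULL)
			& ((1ULL << width) - 1);

	printf("wanted    = 0x%016llx\n", (unsigned long long)wanted);
	printf("effective = 0x%016llx\n", (unsigned long long)effective);
	return 0;
}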
16379+
16380+static int __init pfm_intel_arch_pmu_init_module(void)
16381+{
16382+ return pfm_pmu_register(&pfm_intel_arch_pmu_conf);
16383+}
16384+
16385+static void __exit pfm_intel_arch_pmu_cleanup_module(void)
16386+{
16387+ pfm_pmu_unregister(&pfm_intel_arch_pmu_conf);
16388+}
16389+
16390+module_init(pfm_intel_arch_pmu_init_module);
16391+module_exit(pfm_intel_arch_pmu_cleanup_module);
16392--- /dev/null
16393+++ b/arch/x86/perfmon/perfmon_intel_atom.c
16394@@ -0,0 +1,541 @@
16395+/*
16396+ * perfmon support for Intel Atom (architectural perfmon v3 + PEBS)
16397+ *
16398+ * Copyright (c) 2008 Google,Inc
16399+ * Contributed by Stephane Eranian <eranian@gmail.com>
16400+ *
16401+ * This program is free software; you can redistribute it and/or
16402+ * modify it under the terms of version 2 of the GNU General Public
16403+ * License as published by the Free Software Foundation.
16404+ *
16405+ * This program is distributed in the hope that it will be useful,
16406+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
16407+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16408+ * General Public License for more details.
16409+ *
16410+ * You should have received a copy of the GNU General Public License
16411+ * along with this program; if not, write to the Free Software
16412+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
16413+ * 02111-1307 USA
16414+ */
16415+#include <linux/module.h>
16416+#include <linux/kprobes.h>
16417+#include <linux/perfmon_kern.h>
16418+#include <asm/msr.h>
16419+
16420+MODULE_AUTHOR("Stephane Eranian <eranian@gmail.com>");
16421+MODULE_DESCRIPTION("Intel Atom");
16422+MODULE_LICENSE("GPL");
16423+
16424+static int force, force_nmi;
16425+MODULE_PARM_DESC(force, "bool: force module to load successfully");
16426+MODULE_PARM_DESC(force_nmi, "bool: force use of NMI for PMU interrupt");
16427+module_param(force, bool, 0600);
16428+module_param(force_nmi, bool, 0600);
16429+
16430+/*
16431+ * - upper 32 bits are reserved
16432+ * - INT: APIC enable bit is reserved (forced to 1)
16433+ *
16434+ * RSVD: reserved bits are 1
16435+ */
16436+#define PFM_ATOM_PMC_RSVD ((~((1ULL<<32)-1)) | (1ULL<<20))
16437+
16438+/*
16439+ * force Local APIC interrupt on overflow
16440+ * disable with NO_EMUL64
16441+ */
16442+#define PFM_ATOM_PMC_VAL (1ULL<<20)
16443+#define PFM_ATOM_NO64 (1ULL<<20)
16444+
16445+/*
16446+ * Atom counters are 40 bits wide. All 40 bits can be read but only 31 can be
16447+ * written to due to a limitation of wrmsr. Bits [63-32] are sign extensions of
16448+ * bit 31. Bits [63-40] must not be set.
16449+ *
16450+ * See IA-32 Intel Architecture Software developer manual Vol 3B chapter 18
16451+ */
16452+#define PFM_ATOM_PMD_WIDTH 31
16453+#define PFM_ATOM_PMD_RSVD ~((1ULL << 40)-1)
16454+
16455+static void pfm_intel_atom_acquire_pmu_percpu(void);
16456+static void pfm_intel_atom_release_pmu_percpu(void);
16457+static void pfm_intel_atom_restore_pmcs(struct pfm_context *ctx,
16458+ struct pfm_event_set *set);
16459+static int pfm_intel_atom_stop_save(struct pfm_context *ctx,
16460+ struct pfm_event_set *set);
16461+static int pfm_intel_atom_has_ovfls(struct pfm_context *ctx);
16462+static void __kprobes pfm_intel_atom_quiesce(void);
16463+
16464+struct pfm_arch_pmu_info pfm_intel_atom_pmu_info = {
16465+ .stop_save = pfm_intel_atom_stop_save,
16466+ .has_ovfls = pfm_intel_atom_has_ovfls,
16467+ .quiesce = pfm_intel_atom_quiesce,
16468+ .restore_pmcs = pfm_intel_atom_restore_pmcs,
16469+ .acquire_pmu_percpu = pfm_intel_atom_acquire_pmu_percpu,
16470+ .release_pmu_percpu = pfm_intel_atom_release_pmu_percpu
16471+
16472+};
16473+
16474+#define PFM_ATOM_C(n) { \
16475+ .type = PFM_REG_I64, \
16476+ .desc = "PERFEVTSEL"#n, \
16477+ .dfl_val = PFM_ATOM_PMC_VAL, \
16478+ .rsvd_msk = PFM_ATOM_PMC_RSVD, \
16479+ .no_emul64_msk = PFM_ATOM_NO64, \
16480+ .hw_addr = MSR_P6_EVNTSEL0 + (n) \
16481+ }
16482+
16483+
16484+static struct pfm_regmap_desc pfm_intel_atom_pmc_desc[] = {
16485+/* pmc0 */ PFM_ATOM_C(0),
16486+/* pmc1 */ PFM_ATOM_C(1),
16487+/* pmc2 */ PMX_NA, PMX_NA,
16488+/* pmc4 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA,
16489+/* pmc8 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA,
16490+/* pmc12 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA,
16491+/* pmc16 */ { .type = PFM_REG_I,
16492+ .desc = "FIXED_CTRL",
16493+ .dfl_val = 0x0000000000000888ULL, /* force PMI */
16494+ .rsvd_msk = 0xfffffffffffffcccULL, /* 3 fixed counters defined */
16495+ .no_emul64_msk = 0,
16496+ .hw_addr = MSR_CORE_PERF_FIXED_CTR_CTRL
16497+ },
16498+/* pmc17 */{ .type = PFM_REG_W,
16499+ .desc = "PEBS_ENABLE",
16500+ .dfl_val = 0,
16501+ .rsvd_msk = 0xfffffffffffffffeULL,
16502+ .no_emul64_msk = 0,
16503+ .hw_addr = MSR_IA32_PEBS_ENABLE
16504+ }
16505+};
16506+#define PFM_ATOM_MAX_PMCS ARRAY_SIZE(pfm_intel_atom_pmc_desc)
16507+
16508+#define PFM_ATOM_D(n) \
16509+ { .type = PFM_REG_C, \
16510+ .desc = "PMC"#n, \
16511+ .rsvd_msk = PFM_ATOM_PMD_RSVD, \
16512+ .hw_addr = MSR_P6_PERFCTR0+n, \
16513+ .dep_pmcs[0] = 1ULL << n \
16514+ }
16515+
16516+#define PFM_ATOM_FD(n) \
16517+ { .type = PFM_REG_C, \
16518+ .desc = "FIXED_CTR"#n, \
16519+ .rsvd_msk = PFM_ATOM_PMD_RSVD, \
16520+ .hw_addr = MSR_CORE_PERF_FIXED_CTR0+n,\
16521+ .dep_pmcs[0] = 1ULL << 16 \
16522+ }
16523+
16524+static struct pfm_regmap_desc pfm_intel_atom_pmd_desc[] = {
16525+/* pmd0 */ PFM_ATOM_D(0),
16526+/* pmd1 */ PFM_ATOM_D(1),
16527+/* pmd2 */ PMX_NA,
16528+/* pmd3 */ PMX_NA,
16529+/* pmd4 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA,
16530+/* pmd8 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA,
16531+/* pmd12 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA,
16532+/* pmd16 */ PFM_ATOM_FD(0),
16533+/* pmd17 */ PFM_ATOM_FD(1),
16534+/* pmd18 */ PFM_ATOM_FD(2)
16535+};
16536+#define PFM_ATOM_MAX_PMDS ARRAY_SIZE(pfm_intel_atom_pmd_desc)
16537+
16538+static struct pfm_pmu_config pfm_intel_atom_pmu_conf;
16539+
16540+static int pfm_intel_atom_probe_pmu(void)
16541+{
16542+ if (force)
16543+ goto doit;
16544+
16545+ if (current_cpu_data.x86_vendor != X86_VENDOR_INTEL)
16546+ return -1;
16547+
16548+ if (current_cpu_data.x86 != 6)
16549+ return -1;
16550+
16551+ if (current_cpu_data.x86_model != 28)
16552+ return -1;
16553+doit:
16554+ /*
16555+ * having APIC is mandatory, so disregard force option
16556+ */
16557+ if (!cpu_has_apic) {
16558+ PFM_INFO("no Local APIC, try rebooting with lapic option");
16559+ return -1;
16560+ }
16561+
16562+ PFM_INFO("detected Intel Atom PMU");
16563+
16564+ if (force_nmi)
16565+ pfm_intel_atom_pmu_info.flags |= PFM_X86_FL_USE_NMI;
16566+
16567+ return 0;
16568+}
16569+
16570+/**
16571+ * pfm_intel_atom_has_ovfls - check for pending overflow condition
16572+ * @ctx: context to work on
16573+ *
16574+ * detect if counters have overflowed.
16575+ * return:
16576+ * 0 : no overflow
16577+ * 1 : at least one overflow
16578+ */
16579+static int __kprobes pfm_intel_atom_has_ovfls(struct pfm_context *ctx)
16580+{
16581+ struct pfm_regmap_desc *d;
16582+ u64 ovf;
16583+
16584+ d = pfm_pmu_conf->pmd_desc;
16585+ /*
16586+ * read global overflow status register
16587+ * if sharing the PMU, then not all bits are ours, so we must
16588+ * check only the ones we actually use
16589+ */
16590+ rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, ovf);
16591+
16592+ /*
16593+ * for pmd0, we also check PEBS overflow on bit 62
16594+ */
16595+ if ((d[0].type & PFM_REG_I) && (ovf & ((1ull << 62) | 1ull)))
16596+ return 1;
16597+
16598+ if ((d[1].type & PFM_REG_I) && (ovf & 2ull))
16599+ return 1;
16600+
16601+ if ((d[16].type & PFM_REG_I) && (ovf & (1ull << 32)))
16602+ return 1;
16603+
16604+ if ((d[17].type & PFM_REG_I) && (ovf & (2ull << 32)))
16605+ return 1;
16606+
16607+ if ((d[18].type & PFM_REG_I) && (ovf & (4ull << 32)))
16608+ return 1;
16609+
16610+ return 0;
16611+}
16612+
16613+/**
16614+ * pfm_intel_atom_stop_save - stop monitoring, collect pending overflow, save pmds
16615+ * @ctx: context to work on
16616+ * @set: active set
16617+ *
16618+ * return:
16619+ * 1: caller needs to save pmds
16620+ * 0: caller does not need to save pmds, they have been saved by this call
16621+ */
16622+static int pfm_intel_atom_stop_save(struct pfm_context *ctx,
16623+ struct pfm_event_set *set)
16624+{
16625+#define PFM_ATOM_WMASK (1ULL << 31)
16626+#define PFM_ATOM_OMASK ((1ULL << 31)-1)
16627+ u64 clear_ovf = 0;
16628+ u64 ovf, ovf2, val;
16629+
16630+ /*
16631+ * read global overflow status register
16632+ * if sharing the PMU, then not all bits are ours, so we must
16633+ * check only the ones we actually use.
16634+ *
16635+ * XXX: Atom seems to have a bug with the stickiness of
16636+ * GLOBAL_STATUS. If we read GLOBAL_STATUS after we
16637+ * clear the generic counters, then their bits in
16638+ * GLOBAL_STATUS are cleared. This should not be the
16639+ * case according to the architected PMU. To work around
16640+ * the problem, we read GLOBAL_STATUS BEFORE we stop
16641+ * all monitoring.
16642+ */
16643+ rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, ovf);
16644+
16645+ /*
16646+ * stop monitoring
16647+ */
16648+ if (test_bit(0, cast_ulp(set->used_pmcs)))
16649+ wrmsrl(MSR_P6_EVNTSEL0, 0);
16650+
16651+ if (test_bit(1, cast_ulp(set->used_pmcs)))
16652+ wrmsrl(MSR_P6_EVNTSEL1, 0);
16653+
16654+ if (test_bit(16, cast_ulp(set->used_pmcs)))
16655+ wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
16656+
16657+ if (test_bit(17, cast_ulp(set->used_pmcs)))
16658+ wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
16659+
16660+ /*
16661+ * XXX: related to bug mentioned above
16662+ *
16663+ * read GLOBAL_STATUS again to avoid race condition
16664+ * with overflows happening after first read and
16665+ * before stop. That avoids missing overflows on
16666+ * the fixed counters and PEBS
16667+ */
16668+ rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, ovf2);
16669+ ovf |= ovf2;
16670+
16671+ /*
16672+ * if we already have a pending overflow condition, we simply
16673+ * return to take care of it first.
16674+ */
16675+ if (set->npend_ovfls)
16676+ return 1;
16677+
16678+ /*
16679+ * check PMD 0,1,16,17,18 for overflow and save their value
16680+ */
16681+ if (test_bit(0, cast_ulp(set->used_pmds))) {
16682+ rdmsrl(MSR_P6_PERFCTR0, val);
16683+ if (ovf & ((1ull<<62)|1ull)) {
16684+ __set_bit(0, cast_ulp(set->povfl_pmds));
16685+ set->npend_ovfls++;
16686+ clear_ovf = (1ull << 62) | 1ull;
16687+ }
16688+ val = (set->pmds[0].value & ~PFM_ATOM_OMASK)
16689+ | (val & PFM_ATOM_OMASK);
16690+ set->pmds[0].value = val;
16691+ }
16692+
16693+ if (test_bit(1, cast_ulp(set->used_pmds))) {
16694+ rdmsrl(MSR_P6_PERFCTR1, val);
16695+ if (ovf & 2ull) {
16696+ __set_bit(1, cast_ulp(set->povfl_pmds));
16697+ set->npend_ovfls++;
16698+ clear_ovf |= 2ull;
16699+ }
16700+ val = (set->pmds[1].value & ~PFM_ATOM_OMASK)
16701+ | (val & PFM_ATOM_OMASK);
16702+ set->pmds[1].value = val;
16703+ }
16704+
16705+ if (test_bit(16, cast_ulp(set->used_pmds))) {
16706+ rdmsrl(MSR_CORE_PERF_FIXED_CTR0, val);
16707+ if (ovf & (1ull << 32)) {
16708+ __set_bit(16, cast_ulp(set->povfl_pmds));
16709+ set->npend_ovfls++;
16710+ clear_ovf |= 1ull << 32;
16711+ }
16712+ val = (set->pmds[16].value & ~PFM_ATOM_OMASK)
16713+ | (val & PFM_ATOM_OMASK);
16714+ set->pmds[16].value = val;
16715+ }
16716+
16717+ if (test_bit(17, cast_ulp(set->used_pmds))) {
16718+ rdmsrl(MSR_CORE_PERF_FIXED_CTR0+1, val);
16719+ if (ovf & (2ull << 32)) {
16720+ __set_bit(17, cast_ulp(set->povfl_pmds));
16721+ set->npend_ovfls++;
16722+ clear_ovf |= 2ull << 32;
16723+ }
16724+ val = (set->pmds[17].value & ~PFM_ATOM_OMASK)
16725+ | (val & PFM_ATOM_OMASK);
16726+ set->pmds[17].value = val;
16727+ }
16728+
16729+ if (test_bit(18, cast_ulp(set->used_pmds))) {
16730+ rdmsrl(MSR_CORE_PERF_FIXED_CTR0+2, val);
16731+ if (ovf & (4ull << 32)) {
16732+ __set_bit(18, cast_ulp(set->povfl_pmds));
16733+ set->npend_ovfls++;
16734+ clear_ovf |= 4ull << 32;
16735+ }
16736+ val = (set->pmds[18].value & ~PFM_ATOM_OMASK)
16737+ | (val & PFM_ATOM_OMASK);
16738+ set->pmds[18].value = val;
16739+ }
16740+
16741+ if (clear_ovf)
16742+ wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, clear_ovf);
16743+
16744+ /* 0 means: no need to save PMDs at upper level */
16745+ return 0;
16746+}
16747+
16748+/**
16749+ * pfm_intel_atom_quiesce - stop monitoring without grabbing any lock
16750+ *
16751+ * called from NMI interrupt handler to immediately stop monitoring
16752+ * cannot grab any lock, including perfmon related locks
16753+ */
16754+static void __kprobes pfm_intel_atom_quiesce(void)
16755+{
16756+ /*
16757+ * quiesce PMU by clearing available registers that have
16758+ * the start/stop capability
16759+ */
16760+ if (test_bit(0, cast_ulp(pfm_pmu_conf->regs_all.pmcs)))
16761+ wrmsrl(MSR_P6_EVNTSEL0, 0);
16762+
16763+ if (test_bit(1, cast_ulp(pfm_pmu_conf->regs_all.pmcs)))
16764+ wrmsrl(MSR_P6_EVNTSEL1, 0);
16765+
16766+ if (test_bit(16, cast_ulp(pfm_pmu_conf->regs_all.pmcs)))
16767+ wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
16768+
16769+ if (test_bit(17, cast_ulp(pfm_pmu_conf->regs_all.pmcs)))
16770+ wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
16771+}
16772+
16773+/**
16774+ * pfm_intel_atom_restore_pmcs - reload PMC registers
16775+ * @ctx: context to restore from
16776+ * @set: current event set
16777+ *
16778+ * restores pmcs and also PEBS Data Save area pointer
16779+ */
16780+static void pfm_intel_atom_restore_pmcs(struct pfm_context *ctx,
16781+ struct pfm_event_set *set)
16782+{
16783+ struct pfm_arch_context *ctx_arch;
16784+ u64 clear_ovf = 0;
16785+
16786+ ctx_arch = pfm_ctx_arch(ctx);
16787+ /*
16788+ * must restore DS pointer before restoring PMCs
16789+ * as this can potentially reactivate monitoring
16790+ */
16791+ if (ctx_arch->flags.use_ds)
16792+ wrmsrl(MSR_IA32_DS_AREA, (unsigned long)ctx_arch->ds_area);
16793+
16794+ if (test_bit(0, cast_ulp(set->used_pmcs))) {
16795+ wrmsrl(MSR_P6_EVNTSEL0, set->pmcs[0]);
16796+ clear_ovf = 1ull;
16797+ }
16798+
16799+ if (test_bit(1, cast_ulp(set->used_pmcs))) {
16800+ wrmsrl(MSR_P6_EVNTSEL1, set->pmcs[1]);
16801+ clear_ovf |= 2ull;
16802+ }
16803+
16804+ if (test_bit(16, cast_ulp(set->used_pmcs))) {
16805+ wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, set->pmcs[16]);
16806+ clear_ovf |= 7ull << 32;
16807+ }
16808+
16809+ if (test_bit(17, cast_ulp(set->used_pmcs))) {
16810+ wrmsrl(MSR_IA32_PEBS_ENABLE, set->pmcs[17]);
16811+ clear_ovf |= 1ull << 62;
16812+ }
16813+
16814+ if (clear_ovf)
16815+ wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, clear_ovf);
16816+}
16817+
16818+static int pfm_intel_atom_pmc17_check(struct pfm_context *ctx,
16819+ struct pfm_event_set *set,
16820+ struct pfarg_pmc *req)
16821+{
16822+ struct pfm_arch_context *ctx_arch;
16823+ ctx_arch = pfm_ctx_arch(ctx);
16824+
16825+ /*
16826+ * if user activates PEBS_ENABLE, then we need to have a valid
16827+ * DS Area setup. This only happens when the PEBS sampling format is
16828+ * used in which case PFM_X86_USE_PEBS is set. We must reject all other
16829+ * requests.
16830+ *
16831+ * Otherwise we may pick up stale MSR_IA32_DS_AREA values. It appears
16832+ * that a value of 0 for this MSR does crash the system with
16833+ * PEBS_ENABLE=1.
16834+ */
16835+ if (!ctx_arch->flags.use_pebs && req->reg_value) {
16836+ PFM_DBG("pmc17 useable only with a PEBS sampling format");
16837+ return -EINVAL;
16838+ }
16839+ return 0;
16840+}
16841+
16842+DEFINE_PER_CPU(u64, saved_global_ctrl);
16843+
16844+/**
16845+ * pfm_intel_atom_acquire_pmu_percpu - acquire PMU resource per CPU
16846+ *
16847+ * For Atom, it is necessary to enable all available
16848+ * registers. The firmware rightfully has the fixed counters
16849+ * disabled for backward compatibility with architectural perfmon
16850+ * v1
16851+ *
16852+ * This function is invoked on each online CPU
16853+ */
16854+static void pfm_intel_atom_acquire_pmu_percpu(void)
16855+{
16856+ struct pfm_regmap_desc *d;
16857+ u64 mask = 0;
16858+ unsigned int i;
16859+
16860+ /*
16861+ * build bitmask of registers that are available to
16862+ * us. In some cases, there may be fewer registers than
16863+ * what Atom supports due to sharing with other kernel
16864+ * subsystems, such as NMI
16865+ */
16866+ d = pfm_pmu_conf->pmd_desc;
16867+ for (i=0; i < 16; i++) {
16868+ if ((d[i].type & PFM_REG_I) == 0)
16869+ continue;
16870+ mask |= 1ull << i;
16871+ }
16872+ for (i=16; i < PFM_ATOM_MAX_PMDS; i++) {
16873+ if ((d[i].type & PFM_REG_I) == 0)
16874+ continue;
16875+ mask |= 1ull << (32+i-16);
16876+ }
16877+
16878+ /*
16879+ * keep a local copy of the current MSR_CORE_PERF_GLOBAL_CTRL
16880+ */
16881+ rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, __get_cpu_var(saved_global_ctrl));
16882+
16883+ PFM_DBG("global=0x%llx set to 0x%llx",
16884+ __get_cpu_var(saved_global_ctrl),
16885+ mask);
16886+
16887+ /*
16888+ * enable all registers
16889+ *
16890+ * No need to quiesce PMU. If there is a overflow, it will be
16891+ * treated as spurious by the handler
16892+ */
16893+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, mask);
16894+}
16895+
16896+/**
16897+ * pfm_intel_atom_release_pmu_percpu - release PMU resource per CPU
16898+ *
16899+ * For Atom, we restore MSR_CORE_PERF_GLOBAL_CTRL to its original value
16900+ */
16901+static void pfm_intel_atom_release_pmu_percpu(void)
16902+{
16903+ PFM_DBG("global_ctrl restored to 0x%llx\n",
16904+ __get_cpu_var(saved_global_ctrl));
16905+
16906+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, __get_cpu_var(saved_global_ctrl));
16907+}
16908+
16909+static struct pfm_pmu_config pfm_intel_atom_pmu_conf = {
16910+ .pmu_name = "Intel Atom",
16911+ .pmd_desc = pfm_intel_atom_pmd_desc,
16912+ .counter_width = PFM_ATOM_PMD_WIDTH,
16913+ .num_pmc_entries = PFM_ATOM_MAX_PMCS,
16914+ .num_pmd_entries = PFM_ATOM_MAX_PMDS,
16915+ .pmc_desc = pfm_intel_atom_pmc_desc,
16916+ .probe_pmu = pfm_intel_atom_probe_pmu,
16917+ .version = "1.0",
16918+ .flags = PFM_PMU_BUILTIN_FLAG,
16919+ .owner = THIS_MODULE,
16920+ .pmc_write_check = pfm_intel_atom_pmc17_check,
16921+ .pmu_info = &pfm_intel_atom_pmu_info
16922+};
16923+
16924+static int __init pfm_intel_atom_pmu_init_module(void)
16925+{
16926+ return pfm_pmu_register(&pfm_intel_atom_pmu_conf);
16927+}
16928+
16929+static void __exit pfm_intel_atom_pmu_cleanup_module(void)
16930+{
16931+ pfm_pmu_unregister(&pfm_intel_atom_pmu_conf);
16932+}
16933+
16934+module_init(pfm_intel_atom_pmu_init_module);
16935+module_exit(pfm_intel_atom_pmu_cleanup_module);
16936--- /dev/null
16937+++ b/arch/x86/perfmon/perfmon_intel_core.c
16938@@ -0,0 +1,449 @@
16939+/*
16940+ * This file contains the Intel Core PMU registers description tables.
16941+ * Intel Core-based processors support architectural perfmon v2 + PEBS
16942+ *
16943+ * Copyright (c) 2006-2007 Hewlett-Packard Development Company, L.P.
16944+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
16945+ */
16946+#include <linux/module.h>
16947+#include <linux/kprobes.h>
16948+#include <linux/perfmon_kern.h>
16949+#include <linux/nmi.h>
16950+
16951+MODULE_AUTHOR("Stephane Eranian <eranian@hpl.hp.com>");
16952+MODULE_DESCRIPTION("Intel Core");
16953+MODULE_LICENSE("GPL");
16954+
16955+static int force_nmi;
16956+MODULE_PARM_DESC(force_nmi, "bool: force use of NMI for PMU interrupt");
16957+module_param(force_nmi, bool, 0600);
16958+
16959+/*
16960+ * - upper 32 bits are reserved
16961+ * - INT: APIC enable bit is reserved (forced to 1)
16962+ * - bit 21 is reserved
16963+ *
16964+ * RSVD: reserved bits must be 1
16965+ */
16966+#define PFM_CORE_PMC_RSVD ((~((1ULL<<32)-1)) \
16967+ | (1ULL<<20) \
16968+ | (1ULL<<21))
16969+
16970+/*
16971+ * Core counters are 40-bits
16972+ */
16973+#define PFM_CORE_CTR_RSVD (~((1ULL<<40)-1))
16974+
16975+/*
16976+ * force Local APIC interrupt on overflow
16977+ * disable with NO_EMUL64
16978+ */
16979+#define PFM_CORE_PMC_VAL (1ULL<<20)
16980+#define PFM_CORE_NO64 (1ULL<<20)
16981+
16982+#define PFM_CORE_NA { .reg_type = PFM_REGT_NA}
16983+
16984+#define PFM_CORE_CA(m, c, t) \
16985+ { \
16986+ .addrs[0] = m, \
16987+ .ctr = c, \
16988+ .reg_type = t \
16989+ }
16990+
16991+struct pfm_ds_area_intel_core {
16992+ u64 bts_buf_base;
16993+ u64 bts_index;
16994+ u64 bts_abs_max;
16995+ u64 bts_intr_thres;
16996+ u64 pebs_buf_base;
16997+ u64 pebs_index;
16998+ u64 pebs_abs_max;
16999+ u64 pebs_intr_thres;
17000+ u64 pebs_cnt_reset;
17001+};
17002+
17003+static void pfm_core_restore_pmcs(struct pfm_context *ctx,
17004+ struct pfm_event_set *set);
17005+static int pfm_core_has_ovfls(struct pfm_context *ctx);
17006+static int pfm_core_stop_save(struct pfm_context *ctx,
17007+ struct pfm_event_set *set);
17008+static void __kprobes pfm_core_quiesce(void);
17009+
17010+static u64 enable_mask[PFM_MAX_PMCS];
17011+static u16 max_enable;
17012+
17013+struct pfm_arch_pmu_info pfm_core_pmu_info = {
17014+ .stop_save = pfm_core_stop_save,
17015+ .has_ovfls = pfm_core_has_ovfls,
17016+ .quiesce = pfm_core_quiesce,
17017+ .restore_pmcs = pfm_core_restore_pmcs
17018+};
17019+
17020+static struct pfm_regmap_desc pfm_core_pmc_desc[] = {
17021+/* pmc0 */ {
17022+ .type = PFM_REG_I64,
17023+ .desc = "PERFEVTSEL0",
17024+ .dfl_val = PFM_CORE_PMC_VAL,
17025+ .rsvd_msk = PFM_CORE_PMC_RSVD,
17026+ .no_emul64_msk = PFM_CORE_NO64,
17027+ .hw_addr = MSR_P6_EVNTSEL0
17028+ },
17029+/* pmc1 */ {
17030+ .type = PFM_REG_I64,
17031+ .desc = "PERFEVTSEL1",
17032+ .dfl_val = PFM_CORE_PMC_VAL,
17033+ .rsvd_msk = PFM_CORE_PMC_RSVD,
17034+ .no_emul64_msk = PFM_CORE_NO64,
17035+ .hw_addr = MSR_P6_EVNTSEL1
17036+ },
17037+/* pmc2 */ PMX_NA, PMX_NA,
17038+/* pmc4 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA,
17039+/* pmc8 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA,
17040+/* pmc12 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA,
17041+/* pmc16 */ { .type = PFM_REG_I,
17042+ .desc = "FIXED_CTRL",
17043+ .dfl_val = 0x888ULL,
17044+ .rsvd_msk = 0xfffffffffffffcccULL,
17045+ .no_emul64_msk = 0,
17046+ .hw_addr = MSR_CORE_PERF_FIXED_CTR_CTRL
17047+ },
17048+/* pmc17 */ { .type = PFM_REG_W,
17049+ .desc = "PEBS_ENABLE",
17050+ .dfl_val = 0,
17051+ .rsvd_msk = 0xfffffffffffffffeULL,
17052+ .no_emul64_msk = 0,
17053+ .hw_addr = MSR_IA32_PEBS_ENABLE
17054+ }
17055+};
17056+
17057+#define PFM_CORE_D(n) \
17058+ { .type = PFM_REG_C, \
17059+ .desc = "PMC"#n, \
17060+ .rsvd_msk = PFM_CORE_CTR_RSVD, \
17061+ .hw_addr = MSR_P6_PERFCTR0+n, \
17062+ .dep_pmcs[0] = 1ULL << n \
17063+ }
17064+
17065+#define PFM_CORE_FD(n) \
17066+ { .type = PFM_REG_C, \
17067+ .desc = "FIXED_CTR"#n, \
17068+ .rsvd_msk = PFM_CORE_CTR_RSVD, \
17069+ .hw_addr = MSR_CORE_PERF_FIXED_CTR0+n,\
17070+ .dep_pmcs[0] = 1ULL << 16 \
17071+ }
17072+
17073+static struct pfm_regmap_desc pfm_core_pmd_desc[] = {
17074+/* pmd0 */ PFM_CORE_D(0),
17075+/* pmd1 */ PFM_CORE_D(1),
17076+/* pmd2 */ PMX_NA, PMX_NA,
17077+/* pmd4 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA,
17078+/* pmd8 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA,
17079+/* pmd12 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA,
17080+/* pmd16 */ PFM_CORE_FD(0),
17081+/* pmd17 */ PFM_CORE_FD(1),
17082+/* pmd18 */ PFM_CORE_FD(2)
17083+};
17084+#define PFM_CORE_NUM_PMCS ARRAY_SIZE(pfm_core_pmc_desc)
17085+#define PFM_CORE_NUM_PMDS ARRAY_SIZE(pfm_core_pmd_desc)
17086+
17087+static struct pfm_pmu_config pfm_core_pmu_conf;
17088+
17089+static int pfm_core_probe_pmu(void)
17090+{
17091+ /*
17092+ * Check for an Intel Core processor explicitly.
17093+ * Checking for cpu_has_perfmon is not enough as this
17094+ * matches Intel Core Duo/Core Solo, but neither supports
17095+ * PEBS.
17096+ *
17097+ * Intel Core = arch perfmon v2 + PEBS
17098+ */
17099+ if (current_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
17100+ PFM_INFO("not an AMD processor");
17101+ return -1;
17102+ }
17103+
17104+ if (current_cpu_data.x86 != 6)
17105+ return -1;
17106+
17107+ switch (current_cpu_data.x86_model) {
17108+ case 15: /* Merom */
17109+ break;
17110+ case 23: /* Penryn */
17111+ break;
17112+ case 29: /* Dunnington */
17113+ break;
17114+ default:
17115+ return -1;
17116+ }
17117+
17118+ if (!cpu_has_apic) {
17119+ PFM_INFO("no Local APIC, unsupported");
17120+ return -1;
17121+ }
17122+
17123+ PFM_INFO("nmi_watchdog=%d nmi_active=%d force_nmi=%d",
17124+ nmi_watchdog, atomic_read(&nmi_active), force_nmi);
17125+
17126+ /*
17127+ * Intel Core processors implement DS and PEBS, no need to check
17128+ */
17129+ if (cpu_has_pebs)
17130+ PFM_INFO("PEBS supported, enabled");
17131+
17132+ /*
17133+ * initialize the bitmask of registers with enable capability, i.e.,
17134+ * start/stop. This is used to restrict the number of registers
17135+ * touched on start/stop.
17136+ * max_enable: number of bits to scan in enable_mask = highest + 1
17137+ *
17138+ * may be adjusted in pfm_arch_pmu_acquire()
17139+ */
17140+ __set_bit(0, cast_ulp(enable_mask));
17141+ __set_bit(1, cast_ulp(enable_mask));
17142+ __set_bit(16, cast_ulp(enable_mask));
17143+ __set_bit(17, cast_ulp(enable_mask));
17144+ max_enable = 17+1;
17145+
17146+ if (force_nmi)
17147+ pfm_core_pmu_info.flags |= PFM_X86_FL_USE_NMI;
17148+
17149+ return 0;
17150+}
17151+
17152+static int pfm_core_pmc17_check(struct pfm_context *ctx,
17153+ struct pfm_event_set *set,
17154+ struct pfarg_pmc *req)
17155+{
17156+ struct pfm_arch_context *ctx_arch;
17157+ ctx_arch = pfm_ctx_arch(ctx);
17158+
17159+ /*
17160+ * if user activates PEBS_ENABLE, then we need to have a valid
17161+ * DS Area setup. This only happens when the PEBS sampling format is
17162+ * used in which case PFM_X86_USE_PEBS is set. We must reject all other
17163+ * requests.
17164+ *
17165+ * Otherwise we may pickup stale MSR_IA32_DS_AREA values. It appears
17166+ * that a value of 0 for this MSR does crash the system with
17167+ * PEBS_ENABLE=1.
17168+ */
17169+ if (!ctx_arch->flags.use_pebs && req->reg_value) {
17170+ PFM_DBG("pmc17 useable only with a PEBS sampling format");
17171+ return -EINVAL;
17172+ }
17173+ return 0;
17174+}
17175+
17176+/*
17177+ * detect if counters have overflowed.
17178+ * return:
17179+ * 0 : no overflow
17180+ * 1 : at least one overflow
17181+ *
17182+ * used by Intel Core-based processors
17183+ */
17184+static int __kprobes pfm_core_has_ovfls(struct pfm_context *ctx)
17185+{
17186+ struct pfm_arch_pmu_info *pmu_info;
17187+ u64 *cnt_mask;
17188+ u64 wmask, val;
17189+ u16 i, num;
17190+
17191+ pmu_info = &pfm_core_pmu_info;
17192+ cnt_mask = ctx->regs.cnt_pmds;
17193+ num = ctx->regs.num_counters;
17194+ wmask = 1ULL << pfm_pmu_conf->counter_width;
17195+
17196+ for (i = 0; num; i++) {
17197+ if (test_bit(i, cast_ulp(cnt_mask))) {
17198+ rdmsrl(pfm_core_pmd_desc[i].hw_addr, val);
17199+ if (!(val & wmask))
17200+ return 1;
17201+ num--;
17202+ }
17203+ }
17204+ return 0;
17205+}
17206+
17207+static int pfm_core_stop_save(struct pfm_context *ctx,
17208+ struct pfm_event_set *set)
17209+{
17210+ struct pfm_arch_context *ctx_arch;
17211+ struct pfm_ds_area_intel_core *ds = NULL;
17212+ u64 used_mask[PFM_PMC_BV];
17213+ u64 *cnt_mask;
17214+ u64 val, wmask, ovfl_mask;
17215+ u16 count, has_ovfl;
17216+ u16 i, pebs_idx = ~0;
17217+
17218+ ctx_arch = pfm_ctx_arch(ctx);
17219+
17220+ wmask = 1ULL << pfm_pmu_conf->counter_width;
17221+
17222+ /*
17223+ * bitmask of used PMCs that have start/stop (enable) capability
17224+ */
17225+ bitmap_and(cast_ulp(used_mask),
17226+ cast_ulp(set->used_pmcs),
17227+ cast_ulp(enable_mask),
17228+ max_enable);
17229+
17230+ count = bitmap_weight(cast_ulp(used_mask), max_enable);
17231+ /*
17232+ * stop monitoring
17233+ * Unfortunately, this is very expensive!
17234+ * wrmsrl() is serializing.
17235+ */
17236+ for (i = 0; count; i++) {
17237+ if (test_bit(i, cast_ulp(used_mask))) {
17238+ wrmsrl(pfm_pmu_conf->pmc_desc[i].hw_addr, 0);
17239+ count--;
17240+ }
17241+ }
17242+ /*
17243+ * if we already have a pending overflow condition, we simply
17244+ * return to take care of it first.
17245+ */
17246+ if (set->npend_ovfls)
17247+ return 1;
17248+
17249+ ovfl_mask = pfm_pmu_conf->ovfl_mask;
17250+ cnt_mask = ctx->regs.cnt_pmds;
17251+
17252+ if (ctx_arch->flags.use_pebs) {
17253+ ds = ctx_arch->ds_area;
17254+ pebs_idx = 0; /* PMC0/PMD0 */
17255+ PFM_DBG("ds=%p pebs_idx=0x%llx thres=0x%llx",
17256+ ds,
17257+ (unsigned long long)ds->pebs_index,
17258+ (unsigned long long)ds->pebs_intr_thres);
17259+ }
17260+
17261+ /*
17262+ * Check for pending overflows and save PMDs (combo)
17263+ * We employ used_pmds and not intr_pmds because we must
17264+ * also save the PMD registers (see the sketch after this function).
17265+ * Must check for counting PMDs because of virtual PMDs.
17266+ *
17267+ * XXX: should use the ovf_status register instead, yet
17268+ * we would have to check if NMI is used and fall back
17269+ * to individual pmd inspection.
17270+ */
17271+ count = set->nused_pmds;
17272+
17273+ for (i = 0; count; i++) {
17274+ if (test_bit(i, cast_ulp(set->used_pmds))) {
17275+ val = pfm_arch_read_pmd(ctx, i);
17276+ if (likely(test_bit(i, cast_ulp(cnt_mask)))) {
17277+ if (i == pebs_idx)
17278+ has_ovfl = (ds->pebs_index >=
17279+ ds->pebs_intr_thres);
17280+ else
17281+ has_ovfl = !(val & wmask);
17282+ if (has_ovfl) {
17283+ __set_bit(i, cast_ulp(set->povfl_pmds));
17284+ set->npend_ovfls++;
17285+ }
17286+ val = (set->pmds[i].value & ~ovfl_mask)
17287+ | (val & ovfl_mask);
17288+ }
17289+ set->pmds[i].value = val;
17290+ count--;
17291+ }
17292+ }
17293+ /* 0 means: no need to save PMDs at upper level */
17294+ return 0;
17295+}
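The splice at the end of the loop above is how perfmon maintains 64-bit software counters on top of narrower hardware counters: the low counter_width bits come from the hardware register and the upper bits from the saved 64-bit value, while a cleared bit counter_width signals that the counter wrapped (counters are pre-loaded so that this sentinel bit stays set until an overflow). A standalone sketch of that arithmetic, assuming ovfl_mask is the usual (1 << counter_width) - 1 low-bit mask and using the 31-bit width from this PMU description:

#include <stdint.h>
#include <stdio.h>

#define COUNTER_WIDTH	31	/* effective Core counter width */

/* merge a fresh hardware read into the 64-bit software view of the counter */
static uint64_t merge_counter(uint64_t soft64, uint64_t hw, uint64_t ovfl_mask)
{
	return (soft64 & ~ovfl_mask) | (hw & ovfl_mask);
}

int main(void)
{
	uint64_t ovfl_mask = (1ULL << COUNTER_WIDTH) - 1;	/* low, hardware-backed bits */
	uint64_t wmask = 1ULL << COUNTER_WIDTH;			/* sentinel, cleared on wrap */

	uint64_t soft = 0x0000000300000000ULL;	/* previously accumulated 64-bit value */
	uint64_t hw_ok = wmask | 0x123456ULL;	/* sentinel still set -> no overflow   */
	uint64_t hw_ovf = 0x42ULL;		/* sentinel cleared   -> overflow      */

	printf("hw_ok : ovfl=%d merged=0x%llx\n", !(hw_ok & wmask),
	       (unsigned long long)merge_counter(soft, hw_ok, ovfl_mask));
	printf("hw_ovf: ovfl=%d merged=0x%llx\n", !(hw_ovf & wmask),
	       (unsigned long long)merge_counter(soft, hw_ovf, ovfl_mask));
	return 0;
}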
17296+
17297+/**
17298+ * pfm_core_quiesce - stop monitoring without grabbing any lock
17299+ *
17300+ * called from NMI interrupt handler to immediately stop monitoring
17301+ * cannot grab any lock, including perfmon related locks
17302+ */
17303+static void __kprobes pfm_core_quiesce(void)
17304+{
17305+ /*
17306+ * quiesce PMU by clearing available registers that have
17307+ * the start/stop capability
17308+ */
17309+ if (test_bit(0, cast_ulp(pfm_pmu_conf->regs_all.pmcs)))
17310+ wrmsrl(MSR_P6_EVNTSEL0, 0);
17311+ if (test_bit(1, cast_ulp(pfm_pmu_conf->regs_all.pmcs)))
17312+ wrmsrl(MSR_P6_EVNTSEL1, 0);
17313+ if (test_bit(16, cast_ulp(pfm_pmu_conf->regs_all.pmcs)))
17314+ wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
17315+ if (test_bit(17, cast_ulp(pfm_pmu_conf->regs_all.pmcs)))
17316+ wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
17317+}
17318+/**
17319+ * pfm_core_restore_pmcs - reload PMC registers
17320+ * @ctx: context to restore from
17321+ * @set: current event set
17322+ *
17323+ * optimized version of pfm_arch_restore_pmcs(). On Core, we can
17324+ * afford to only restore the pmcs registers we use, because they are
17325+ * all independent from each other.
17326+ */
17327+static void pfm_core_restore_pmcs(struct pfm_context *ctx,
17328+ struct pfm_event_set *set)
17329+{
17330+ struct pfm_arch_context *ctx_arch;
17331+ u64 *mask;
17332+ u16 i, num;
17333+
17334+ ctx_arch = pfm_ctx_arch(ctx);
17335+
17336+ /*
17337+ * must restore DS pointer before restoring PMCs
17338+ * as this can potentially reactivate monitoring
17339+ */
17340+ if (ctx_arch->flags.use_ds)
17341+ wrmsrl(MSR_IA32_DS_AREA, (unsigned long)ctx_arch->ds_area);
17342+
17343+ mask = set->used_pmcs;
17344+ num = set->nused_pmcs;
17345+ for (i = 0; num; i++) {
17346+ if (test_bit(i, cast_ulp(mask))) {
17347+ wrmsrl(pfm_pmu_conf->pmc_desc[i].hw_addr, set->pmcs[i]);
17348+ num--;
17349+ }
17350+ }
17351+}
17352+
17353+/*
17354+ * Counters may have a model-specific width which can be probed using
17355+ * the CPUID.0xa leaf. Yet, the documentation says that, in the initial
17356+ * implementation, only the read bit width is reported by CPUID;
17357+ * write operations are limited to the low 32 bits, and bits [w-32]
17358+ * are sign extensions of bit 31. As such, the effective width
17359+ * of a counter is 31 bits only.
17360+ */
17361+static struct pfm_pmu_config pfm_core_pmu_conf = {
17362+ .pmu_name = "Intel Core",
17363+ .pmd_desc = pfm_core_pmd_desc,
17364+ .counter_width = 31,
17365+ .num_pmc_entries = PFM_CORE_NUM_PMCS,
17366+ .num_pmd_entries = PFM_CORE_NUM_PMDS,
17367+ .pmc_desc = pfm_core_pmc_desc,
17368+ .probe_pmu = pfm_core_probe_pmu,
17369+ .version = "1.2",
17370+ .flags = PFM_PMU_BUILTIN_FLAG,
17371+ .owner = THIS_MODULE,
17372+ .pmu_info = &pfm_core_pmu_info,
17373+ .pmc_write_check = pfm_core_pmc17_check
17374+};
17375+
17376+static int __init pfm_core_pmu_init_module(void)
17377+{
17378+ return pfm_pmu_register(&pfm_core_pmu_conf);
17379+}
17380+
17381+static void __exit pfm_core_pmu_cleanup_module(void)
17382+{
17383+ pfm_pmu_unregister(&pfm_core_pmu_conf);
17384+}
17385+
17386+module_init(pfm_core_pmu_init_module);
17387+module_exit(pfm_core_pmu_cleanup_module);
17388--- /dev/null
17389+++ b/arch/x86/perfmon/perfmon_p4.c
17390@@ -0,0 +1,913 @@
17391+/*
17392+ * This file contains the P4/Xeon PMU register description tables
17393+ * for both 32 and 64 bit modes.
17394+ *
17395+ * Copyright (c) 2005 Intel Corporation
17396+ * Contributed by Bryan Wilkerson <bryan.p.wilkerson@intel.com>
17397+ *
17398+ * This program is free software; you can redistribute it and/or
17399+ * modify it under the terms of version 2 of the GNU General Public
17400+ * License as published by the Free Software Foundation.
17401+ *
17402+ * This program is distributed in the hope that it will be useful,
17403+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
17404+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17405+ * General Public License for more details.
17406+ *
17407+ * You should have received a copy of the GNU General Public License
17408+ * along with this program; if not, write to the Free Software
17409+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
17410+ * 02111-1307 USA
17411+ */
17412+#include <linux/module.h>
17413+#include <linux/perfmon_kern.h>
17414+#include <linux/kprobes.h>
17415+#include <linux/nmi.h>
17416+#include <asm/msr.h>
17417+#include <asm/apic.h>
17418+
17419+MODULE_AUTHOR("Bryan Wilkerson <bryan.p.wilkerson@intel.com>");
17420+MODULE_DESCRIPTION("P4/Xeon/EM64T PMU description table");
17421+MODULE_LICENSE("GPL");
17422+
17423+static int force;
17424+MODULE_PARM_DESC(force, "bool: force module to load successfully");
17425+module_param(force, bool, 0600);
17426+
17427+static int force_nmi;
17428+MODULE_PARM_DESC(force_nmi, "bool: force use of NMI for PMU interrupt");
17429+module_param(force_nmi, bool, 0600);
17430+
17431+/*
17432+ * For extended register information in addition to address that is used
17433+ * at runtime to figure out the mapping of reg addresses to logical procs
17434+ * and association of registers to hardware specific features
17435+ */
17436+struct pfm_p4_regmap {
17437+ /*
17438+ * one each for the logical CPUs. Index 0 corresponds to T0 and
17439+ * index 1 corresponds to T1. Index 1 can be zero if no T1
17440+ * complement reg exists.
17441+ */
17442+ unsigned long addrs[2]; /* 2 = number of threads */
17443+ unsigned int ctr; /* for CCCR/PERFEVTSEL, associated counter */
17444+ unsigned int reg_type;
17445+};
17446+
17447+/*
17448+ * bitmask for pfm_p4_regmap.reg_type
17449+ */
17450+#define PFM_REGT_NA 0x0000 /* not available */
17451+#define PFM_REGT_EN 0x0001 /* has enable bit (cleared on ctxsw) */
17452+#define PFM_REGT_ESCR 0x0002 /* P4: ESCR */
17453+#define PFM_REGT_CCCR 0x0004 /* P4: CCCR */
17454+#define PFM_REGT_PEBS 0x0010 /* PEBS related */
17455+#define PFM_REGT_NOHT 0x0020 /* unavailable with HT */
17456+#define PFM_REGT_CTR 0x0040 /* counter */
17457+
17458+/*
17459+ * architecture specific context extension.
17460+ * located at: (struct pfm_arch_context *)(ctx+1)
17461+ */
17462+struct pfm_arch_p4_context {
17463+ u32 npend_ovfls; /* P4 NMI #pending ovfls */
17464+ u32 reserved;
17465+ u64 povfl_pmds[PFM_PMD_BV]; /* P4 NMI overflowed counters */
17466+ u64 saved_cccrs[PFM_MAX_PMCS];
17467+};
17468+
17469+/*
17470+ * ESCR reserved bitmask:
17471+ * - bits 31 - 63 reserved
17472+ * - T1_OS and T1_USR bits are reserved; they are set as needed depending on
17473+ *   the logical processor. User mode applications should use T0_OS and T0_USR.
17474+ * RSVD: reserved bits must be 1
17475+ */
17476+#define PFM_ESCR_RSVD ~0x000000007ffffffcULL
17477+
17478+/*
17479+ * CCCR default value:
17480+ * - OVF_PMI_T0=1 (bit 26)
17481+ * - OVF_PMI_T1=0 (bit 27) (set if necessary in pfm_write_reg())
17482+ * - reserved bits 16-17 are forced to 1, all other bits are zero
17483+ *
17484+ * OVF_PMI is forced to zero if PFM_REGFL_NO_EMUL64 is set on CCCR
17485+ */
17486+#define PFM_CCCR_DFL	((1ULL<<26) | (3ULL<<16))
17487+
17488+/*
17489+ * CCCR reserved fields:
17490+ * - bits 0-11, 25-29, 31-63
17491+ * - OVF_PMI (26-27), override with REGFL_NO_EMUL64
17492+ *
17493+ * RSVD: reserved bits must be 1
17494+ */
17495+#define PFM_CCCR_RSVD ~((0xfull<<12) \
17496+ | (0x7full<<18) \
17497+ | (0x1ull<<30))
17498+
17499+#define PFM_P4_NO64 (3ULL<<26) /* use 3 even in non HT mode */
17500+
17501+#define PEBS_PMD 8 /* thread0: IQ_CTR4, thread1: IQ_CTR5 */
17502+
17503+/*
17504+ * With HyperThreading enabled:
17505+ *
17506+ * The ESCRs and CCCRs are divided in half with the top half
17507+ * belonging to logical processor 0 and the bottom half going to
17508+ * logical processor 1. Thus only half of the PMU resources are
17509+ * accessible to applications.
17510+ *
17511+ * PEBS is not available due to the fact that:
17512+ * - MSR_PEBS_MATRIX_VERT is shared between the threads
17513+ * - IA32_PEBS_ENABLE is shared between the threads
17514+ *
17515+ * With HyperThreading disabled:
17516+ *
17517+ * The full set of PMU resources is exposed to applications.
17518+ *
17519+ * The mapping is chosen such that PMCxx -> MSR is the same
17520+ * in HT and non HT mode, if register is present in HT mode.
17521+ *
17522+ */
17523+#define PFM_REGT_NHTESCR (PFM_REGT_ESCR|PFM_REGT_NOHT)
17524+#define PFM_REGT_NHTCCCR (PFM_REGT_CCCR|PFM_REGT_NOHT|PFM_REGT_EN)
17525+#define PFM_REGT_NHTPEBS (PFM_REGT_PEBS|PFM_REGT_NOHT|PFM_REGT_EN)
17526+#define PFM_REGT_NHTCTR (PFM_REGT_CTR|PFM_REGT_NOHT)
17527+#define PFM_REGT_ENAC (PFM_REGT_CCCR|PFM_REGT_EN)
17528+
17529+static void pfm_p4_write_pmc(struct pfm_context *ctx, unsigned int cnum, u64 value);
17530+static void pfm_p4_write_pmd(struct pfm_context *ctx, unsigned int cnum, u64 value);
17531+static u64 pfm_p4_read_pmd(struct pfm_context *ctx, unsigned int cnum);
17532+static u64 pfm_p4_read_pmc(struct pfm_context *ctx, unsigned int cnum);
17533+static int pfm_p4_create_context(struct pfm_context *ctx, u32 ctx_flags);
17534+static void pfm_p4_free_context(struct pfm_context *ctx);
17535+static int pfm_p4_has_ovfls(struct pfm_context *ctx);
17536+static int pfm_p4_stop_save(struct pfm_context *ctx, struct pfm_event_set *set);
17537+static void pfm_p4_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set);
17538+static void pfm_p4_nmi_copy_state(struct pfm_context *ctx);
17539+static void __kprobes pfm_p4_quiesce(void);
17540+
17541+static u64 enable_mask[PFM_MAX_PMCS];
17542+static u16 max_enable;
17543+
17544+static struct pfm_p4_regmap pmc_addrs[PFM_MAX_PMCS] = {
17545+ /*pmc 0 */ {{MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1}, 0, PFM_REGT_ESCR}, /* BPU_ESCR0,1 */
17546+ /*pmc 1 */ {{MSR_P4_IS_ESCR0, MSR_P4_IS_ESCR1}, 0, PFM_REGT_ESCR}, /* IS_ESCR0,1 */
17547+ /*pmc 2 */ {{MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1}, 0, PFM_REGT_ESCR}, /* MOB_ESCR0,1 */
17548+ /*pmc 3 */ {{MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1}, 0, PFM_REGT_ESCR}, /* ITLB_ESCR0,1 */
17549+ /*pmc 4 */ {{MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1}, 0, PFM_REGT_ESCR}, /* PMH_ESCR0,1 */
17550+ /*pmc 5 */ {{MSR_P4_IX_ESCR0, MSR_P4_IX_ESCR1}, 0, PFM_REGT_ESCR}, /* IX_ESCR0,1 */
17551+ /*pmc 6 */ {{MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1}, 0, PFM_REGT_ESCR}, /* FSB_ESCR0,1 */
17552+ /*pmc 7 */ {{MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1}, 0, PFM_REGT_ESCR}, /* BSU_ESCR0,1 */
17553+ /*pmc 8 */ {{MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1}, 0, PFM_REGT_ESCR}, /* MS_ESCR0,1 */
17554+ /*pmc 9 */ {{MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1}, 0, PFM_REGT_ESCR}, /* TC_ESCR0,1 */
17555+ /*pmc 10*/ {{MSR_P4_TBPU_ESCR0, MSR_P4_TBPU_ESCR1}, 0, PFM_REGT_ESCR}, /* TBPU_ESCR0,1 */
17556+ /*pmc 11*/ {{MSR_P4_FLAME_ESCR0, MSR_P4_FLAME_ESCR1}, 0, PFM_REGT_ESCR}, /* FLAME_ESCR0,1 */
17557+ /*pmc 12*/ {{MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1}, 0, PFM_REGT_ESCR}, /* FIRM_ESCR0,1 */
17558+ /*pmc 13*/ {{MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1}, 0, PFM_REGT_ESCR}, /* SAAT_ESCR0,1 */
17559+ /*pmc 14*/ {{MSR_P4_U2L_ESCR0, MSR_P4_U2L_ESCR1}, 0, PFM_REGT_ESCR}, /* U2L_ESCR0,1 */
17560+ /*pmc 15*/ {{MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1}, 0, PFM_REGT_ESCR}, /* DAC_ESCR0,1 */
17561+ /*pmc 16*/ {{MSR_P4_IQ_ESCR0, MSR_P4_IQ_ESCR1}, 0, PFM_REGT_ESCR}, /* IQ_ESCR0,1 (only model 1 and 2) */
17562+ /*pmc 17*/ {{MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1}, 0, PFM_REGT_ESCR}, /* ALF_ESCR0,1 */
17563+ /*pmc 18*/ {{MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1}, 0, PFM_REGT_ESCR}, /* RAT_ESCR0,1 */
17564+ /*pmc 19*/ {{MSR_P4_SSU_ESCR0, 0}, 0, PFM_REGT_ESCR}, /* SSU_ESCR0 */
17565+ /*pmc 20*/ {{MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1}, 0, PFM_REGT_ESCR}, /* CRU_ESCR0,1 */
17566+ /*pmc 21*/ {{MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3}, 0, PFM_REGT_ESCR}, /* CRU_ESCR2,3 */
17567+ /*pmc 22*/ {{MSR_P4_CRU_ESCR4, MSR_P4_CRU_ESCR5}, 0, PFM_REGT_ESCR}, /* CRU_ESCR4,5 */
17568+
17569+ /*pmc 23*/ {{MSR_P4_BPU_CCCR0, MSR_P4_BPU_CCCR2}, 0, PFM_REGT_ENAC}, /* BPU_CCCR0,2 */
17570+ /*pmc 24*/ {{MSR_P4_BPU_CCCR1, MSR_P4_BPU_CCCR3}, 1, PFM_REGT_ENAC}, /* BPU_CCCR1,3 */
17571+ /*pmc 25*/ {{MSR_P4_MS_CCCR0, MSR_P4_MS_CCCR2}, 2, PFM_REGT_ENAC}, /* MS_CCCR0,2 */
17572+ /*pmc 26*/ {{MSR_P4_MS_CCCR1, MSR_P4_MS_CCCR3}, 3, PFM_REGT_ENAC}, /* MS_CCCR1,3 */
17573+ /*pmc 27*/ {{MSR_P4_FLAME_CCCR0, MSR_P4_FLAME_CCCR2}, 4, PFM_REGT_ENAC}, /* FLAME_CCCR0,2 */
17574+ /*pmc 28*/ {{MSR_P4_FLAME_CCCR1, MSR_P4_FLAME_CCCR3}, 5, PFM_REGT_ENAC}, /* FLAME_CCCR1,3 */
17575+ /*pmc 29*/ {{MSR_P4_IQ_CCCR0, MSR_P4_IQ_CCCR2}, 6, PFM_REGT_ENAC}, /* IQ_CCCR0,2 */
17576+ /*pmc 30*/ {{MSR_P4_IQ_CCCR1, MSR_P4_IQ_CCCR3}, 7, PFM_REGT_ENAC}, /* IQ_CCCR1,3 */
17577+ /*pmc 31*/ {{MSR_P4_IQ_CCCR4, MSR_P4_IQ_CCCR5}, 8, PFM_REGT_ENAC}, /* IQ_CCCR4,5 */
17578+ /* non HT extensions */
17579+ /*pmc 32*/ {{MSR_P4_BPU_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* BPU_ESCR1 */
17580+ /*pmc 33*/ {{MSR_P4_IS_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* IS_ESCR1 */
17581+ /*pmc 34*/ {{MSR_P4_MOB_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* MOB_ESCR1 */
17582+ /*pmc 35*/ {{MSR_P4_ITLB_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* ITLB_ESCR1 */
17583+ /*pmc 36*/ {{MSR_P4_PMH_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* PMH_ESCR1 */
17584+ /*pmc 37*/ {{MSR_P4_IX_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* IX_ESCR1 */
17585+ /*pmc 38*/ {{MSR_P4_FSB_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* FSB_ESCR1 */
17586+ /*pmc 39*/ {{MSR_P4_BSU_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* BSU_ESCR1 */
17587+ /*pmc 40*/ {{MSR_P4_MS_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* MS_ESCR1 */
17588+ /*pmc 41*/ {{MSR_P4_TC_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* TC_ESCR1 */
17589+ /*pmc 42*/ {{MSR_P4_TBPU_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* TBPU_ESCR1 */
17590+ /*pmc 43*/ {{MSR_P4_FLAME_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* FLAME_ESCR1 */
17591+ /*pmc 44*/ {{MSR_P4_FIRM_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* FIRM_ESCR1 */
17592+ /*pmc 45*/ {{MSR_P4_SAAT_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* SAAT_ESCR1 */
17593+ /*pmc 46*/ {{MSR_P4_U2L_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* U2L_ESCR1 */
17594+ /*pmc 47*/ {{MSR_P4_DAC_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* DAC_ESCR1 */
17595+ /*pmc 48*/ {{MSR_P4_IQ_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* IQ_ESCR1 (only model 1 and 2) */
17596+ /*pmc 49*/ {{MSR_P4_ALF_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* ALF_ESCR1 */
17597+ /*pmc 50*/ {{MSR_P4_RAT_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* RAT_ESCR1 */
17598+ /*pmc 51*/ {{MSR_P4_CRU_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* CRU_ESCR1 */
17599+ /*pmc 52*/ {{MSR_P4_CRU_ESCR3, 0}, 0, PFM_REGT_NHTESCR}, /* CRU_ESCR3 */
17600+ /*pmc 53*/ {{MSR_P4_CRU_ESCR5, 0}, 0, PFM_REGT_NHTESCR}, /* CRU_ESCR5 */
17601+ /*pmc 54*/ {{MSR_P4_BPU_CCCR1, 0}, 9, PFM_REGT_NHTCCCR}, /* BPU_CCCR1 */
17602+ /*pmc 55*/ {{MSR_P4_BPU_CCCR3, 0}, 10, PFM_REGT_NHTCCCR}, /* BPU_CCCR3 */
17603+ /*pmc 56*/ {{MSR_P4_MS_CCCR1, 0}, 11, PFM_REGT_NHTCCCR}, /* MS_CCCR1 */
17604+ /*pmc 57*/ {{MSR_P4_MS_CCCR3, 0}, 12, PFM_REGT_NHTCCCR}, /* MS_CCCR3 */
17605+ /*pmc 58*/ {{MSR_P4_FLAME_CCCR1, 0}, 13, PFM_REGT_NHTCCCR}, /* FLAME_CCCR1 */
17606+ /*pmc 59*/ {{MSR_P4_FLAME_CCCR3, 0}, 14, PFM_REGT_NHTCCCR}, /* FLAME_CCCR3 */
17607+ /*pmc 60*/ {{MSR_P4_IQ_CCCR2, 0}, 15, PFM_REGT_NHTCCCR}, /* IQ_CCCR2 */
17608+ /*pmc 61*/ {{MSR_P4_IQ_CCCR3, 0}, 16, PFM_REGT_NHTCCCR}, /* IQ_CCCR3 */
17609+ /*pmc 62*/ {{MSR_P4_IQ_CCCR5, 0}, 17, PFM_REGT_NHTCCCR}, /* IQ_CCCR5 */
17610+ /*pmc 63*/ {{0x3f2, 0}, 0, PFM_REGT_NHTPEBS},/* PEBS_MATRIX_VERT */
17611+ /*pmc 64*/ {{0x3f1, 0}, 0, PFM_REGT_NHTPEBS} /* PEBS_ENABLE */
17612+};
17613+
17614+static struct pfm_p4_regmap pmd_addrs[PFM_MAX_PMDS] = {
17615+ /*pmd 0 */ {{MSR_P4_BPU_PERFCTR0, MSR_P4_BPU_PERFCTR2}, 0, PFM_REGT_CTR}, /* BPU_CTR0,2 */
17616+ /*pmd 1 */ {{MSR_P4_BPU_PERFCTR1, MSR_P4_BPU_PERFCTR3}, 0, PFM_REGT_CTR}, /* BPU_CTR1,3 */
17617+ /*pmd 2 */ {{MSR_P4_MS_PERFCTR0, MSR_P4_MS_PERFCTR2}, 0, PFM_REGT_CTR}, /* MS_CTR0,2 */
17618+ /*pmd 3 */ {{MSR_P4_MS_PERFCTR1, MSR_P4_MS_PERFCTR3}, 0, PFM_REGT_CTR}, /* MS_CTR1,3 */
17619+ /*pmd 4 */ {{MSR_P4_FLAME_PERFCTR0, MSR_P4_FLAME_PERFCTR2}, 0, PFM_REGT_CTR}, /* FLAME_CTR0,2 */
17620+ /*pmd 5 */ {{MSR_P4_FLAME_PERFCTR1, MSR_P4_FLAME_PERFCTR3}, 0, PFM_REGT_CTR}, /* FLAME_CTR1,3 */
17621+ /*pmd 6 */ {{MSR_P4_IQ_PERFCTR0, MSR_P4_IQ_PERFCTR2}, 0, PFM_REGT_CTR}, /* IQ_CTR0,2 */
17622+ /*pmd 7 */ {{MSR_P4_IQ_PERFCTR1, MSR_P4_IQ_PERFCTR3}, 0, PFM_REGT_CTR}, /* IQ_CTR1,3 */
17623+ /*pmd 8 */ {{MSR_P4_IQ_PERFCTR4, MSR_P4_IQ_PERFCTR5}, 0, PFM_REGT_CTR}, /* IQ_CTR4,5 */
17624+ /*
17625+ * non HT extensions
17626+ */
17627+ /*pmd 9 */ {{MSR_P4_BPU_PERFCTR2, 0}, 0, PFM_REGT_NHTCTR}, /* BPU_CTR2 */
17628+ /*pmd 10*/ {{MSR_P4_BPU_PERFCTR3, 0}, 0, PFM_REGT_NHTCTR}, /* BPU_CTR3 */
17629+ /*pmd 11*/ {{MSR_P4_MS_PERFCTR2, 0}, 0, PFM_REGT_NHTCTR}, /* MS_CTR2 */
17630+ /*pmd 12*/ {{MSR_P4_MS_PERFCTR3, 0}, 0, PFM_REGT_NHTCTR}, /* MS_CTR3 */
17631+ /*pmd 13*/ {{MSR_P4_FLAME_PERFCTR2, 0}, 0, PFM_REGT_NHTCTR}, /* FLAME_CTR2 */
17632+ /*pmd 14*/ {{MSR_P4_FLAME_PERFCTR3, 0}, 0, PFM_REGT_NHTCTR}, /* FLAME_CTR3 */
17633+ /*pmd 15*/ {{MSR_P4_IQ_PERFCTR2, 0}, 0, PFM_REGT_NHTCTR}, /* IQ_CTR2 */
17634+ /*pmd 16*/ {{MSR_P4_IQ_PERFCTR3, 0}, 0, PFM_REGT_NHTCTR}, /* IQ_CTR3 */
17635+ /*pmd 17*/ {{MSR_P4_IQ_PERFCTR5, 0}, 0, PFM_REGT_NHTCTR}, /* IQ_CTR5 */
17636+};
17637+
17638+static struct pfm_arch_pmu_info pfm_p4_pmu_info = {
17639+ .write_pmc = pfm_p4_write_pmc,
17640+ .write_pmd = pfm_p4_write_pmd,
17641+ .read_pmc = pfm_p4_read_pmc,
17642+ .read_pmd = pfm_p4_read_pmd,
17643+ .create_context = pfm_p4_create_context,
17644+ .free_context = pfm_p4_free_context,
17645+ .has_ovfls = pfm_p4_has_ovfls,
17646+ .stop_save = pfm_p4_stop_save,
17647+ .restore_pmcs = pfm_p4_restore_pmcs,
17648+ .nmi_copy_state = pfm_p4_nmi_copy_state,
17649+ .quiesce = pfm_p4_quiesce
17650+};
17651+
17652+static struct pfm_regmap_desc pfm_p4_pmc_desc[] = {
17653+/* pmc0 */ PMC_D(PFM_REG_I, "BPU_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_BPU_ESCR0),
17654+/* pmc1 */ PMC_D(PFM_REG_I, "IS_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_IQ_ESCR0),
17655+/* pmc2 */ PMC_D(PFM_REG_I, "MOB_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_MOB_ESCR0),
17656+/* pmc3 */ PMC_D(PFM_REG_I, "ITLB_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_ITLB_ESCR0),
17657+/* pmc4 */ PMC_D(PFM_REG_I, "PMH_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_PMH_ESCR0),
17658+/* pmc5 */ PMC_D(PFM_REG_I, "IX_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_IX_ESCR0),
17659+/* pmc6 */ PMC_D(PFM_REG_I, "FSB_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_FSB_ESCR0),
17660+/* pmc7 */ PMC_D(PFM_REG_I, "BSU_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_BSU_ESCR0),
17661+/* pmc8 */ PMC_D(PFM_REG_I, "MS_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_MS_ESCR0),
17662+/* pmc9 */ PMC_D(PFM_REG_I, "TC_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_TC_ESCR0),
17663+/* pmc10 */ PMC_D(PFM_REG_I, "TBPU_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_TBPU_ESCR0),
17664+/* pmc11 */ PMC_D(PFM_REG_I, "FLAME_ESCR0", 0x0, PFM_ESCR_RSVD, 0, MSR_P4_FLAME_ESCR0),
17665+/* pmc12 */ PMC_D(PFM_REG_I, "FIRM_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_FIRM_ESCR0),
17666+/* pmc13 */ PMC_D(PFM_REG_I, "SAAT_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_SAAT_ESCR0),
17667+/* pmc14 */ PMC_D(PFM_REG_I, "U2L_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_U2L_ESCR0),
17668+/* pmc15 */ PMC_D(PFM_REG_I, "DAC_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_DAC_ESCR0),
17669+/* pmc16 */ PMC_D(PFM_REG_I, "IQ_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_IQ_ESCR0), /* only model 1 and 2*/
17670+/* pmc17 */ PMC_D(PFM_REG_I, "ALF_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_ALF_ESCR0),
17671+/* pmc18 */ PMC_D(PFM_REG_I, "RAT_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_RAT_ESCR0),
17672+/* pmc19 */ PMC_D(PFM_REG_I, "SSU_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_SSU_ESCR0),
17673+/* pmc20 */ PMC_D(PFM_REG_I, "CRU_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_CRU_ESCR0),
17674+/* pmc21 */ PMC_D(PFM_REG_I, "CRU_ESCR2" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_CRU_ESCR2),
17675+/* pmc22 */ PMC_D(PFM_REG_I, "CRU_ESCR4" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_CRU_ESCR4),
17676+/* pmc23 */ PMC_D(PFM_REG_I64, "BPU_CCCR0" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_BPU_CCCR0),
17677+/* pmc24 */ PMC_D(PFM_REG_I64, "BPU_CCCR1" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_BPU_CCCR1),
17678+/* pmc25 */ PMC_D(PFM_REG_I64, "MS_CCCR0" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_MS_CCCR0),
17679+/* pmc26 */ PMC_D(PFM_REG_I64, "MS_CCCR1" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_MS_CCCR1),
17680+/* pmc27 */ PMC_D(PFM_REG_I64, "FLAME_CCCR0", PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_FLAME_CCCR0),
17681+/* pmc28 */ PMC_D(PFM_REG_I64, "FLAME_CCCR1", PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_FLAME_CCCR1),
17682+/* pmc29 */ PMC_D(PFM_REG_I64, "IQ_CCCR0" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_IQ_CCCR0),
17683+/* pmc30 */ PMC_D(PFM_REG_I64, "IQ_CCCR1" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_IQ_CCCR1),
17684+/* pmc31 */ PMC_D(PFM_REG_I64, "IQ_CCCR4" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_IQ_CCCR4),
17685+ /* No HT extension */
17686+/* pmc32 */ PMC_D(PFM_REG_I, "BPU_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_BPU_ESCR1),
17687+/* pmc33 */ PMC_D(PFM_REG_I, "IS_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_IS_ESCR1),
17688+/* pmc34 */ PMC_D(PFM_REG_I, "MOB_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_MOB_ESCR1),
17689+/* pmc35 */ PMC_D(PFM_REG_I, "ITLB_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_ITLB_ESCR1),
17690+/* pmc36 */ PMC_D(PFM_REG_I, "PMH_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_PMH_ESCR1),
17691+/* pmc37 */ PMC_D(PFM_REG_I, "IX_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_IX_ESCR1),
17692+/* pmc38 */ PMC_D(PFM_REG_I, "FSB_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_FSB_ESCR1),
17693+/* pmc39 */ PMC_D(PFM_REG_I, "BSU_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_BSU_ESCR1),
17694+/* pmc40 */ PMC_D(PFM_REG_I, "MS_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_MS_ESCR1),
17695+/* pmc41 */ PMC_D(PFM_REG_I, "TC_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_TC_ESCR1),
17696+/* pmc42 */ PMC_D(PFM_REG_I, "TBPU_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_TBPU_ESCR1),
17697+/* pmc43 */ PMC_D(PFM_REG_I, "FLAME_ESCR1", 0x0, PFM_ESCR_RSVD, 0, MSR_P4_FLAME_ESCR1),
17698+/* pmc44 */ PMC_D(PFM_REG_I, "FIRM_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_FIRM_ESCR1),
17699+/* pmc45 */ PMC_D(PFM_REG_I, "SAAT_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_SAAT_ESCR1),
17700+/* pmc46 */ PMC_D(PFM_REG_I, "U2L_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_U2L_ESCR1),
17701+/* pmc47 */ PMC_D(PFM_REG_I, "DAC_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_DAC_ESCR1),
17702+/* pmc48 */ PMC_D(PFM_REG_I, "IQ_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_IQ_ESCR1), /* only model 1 and 2 */
17703+/* pmc49 */ PMC_D(PFM_REG_I, "ALF_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_ALF_ESCR1),
17704+/* pmc50 */ PMC_D(PFM_REG_I, "RAT_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_RAT_ESCR1),
17705+/* pmc51 */ PMC_D(PFM_REG_I, "CRU_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_CRU_ESCR1),
17706+/* pmc52 */ PMC_D(PFM_REG_I, "CRU_ESCR3" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_CRU_ESCR3),
17707+/* pmc53 */ PMC_D(PFM_REG_I, "CRU_ESCR5" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_CRU_ESCR5),
17708+/* pmc54 */ PMC_D(PFM_REG_I64, "BPU_CCCR2" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_BPU_CCCR2),
17709+/* pmc55 */ PMC_D(PFM_REG_I64, "BPU_CCCR3" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_BPU_CCCR3),
17710+/* pmc56 */ PMC_D(PFM_REG_I64, "MS_CCCR2" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_MS_CCCR2),
17711+/* pmc57 */ PMC_D(PFM_REG_I64, "MS_CCCR3" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_MS_CCCR3),
17712+/* pmc58 */ PMC_D(PFM_REG_I64, "FLAME_CCCR2", PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_FLAME_CCCR2),
17713+/* pmc59 */ PMC_D(PFM_REG_I64, "FLAME_CCCR3", PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_FLAME_CCCR3),
17714+/* pmc60 */ PMC_D(PFM_REG_I64, "IQ_CCCR2" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_IQ_CCCR2),
17715+/* pmc61 */ PMC_D(PFM_REG_I64, "IQ_CCCR3" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_IQ_CCCR3),
17716+/* pmc62 */ PMC_D(PFM_REG_I64, "IQ_CCCR5" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_IQ_CCCR5),
17717+/* pmc63 */ PMC_D(PFM_REG_I, "PEBS_MATRIX_VERT", 0, 0xffffffffffffffecULL, 0, 0x3f2),
17718+/* pmc64 */ PMC_D(PFM_REG_I, "PEBS_ENABLE", 0, 0xfffffffff8ffe000ULL, 0, 0x3f1)
17719+};
17720+#define PFM_P4_NUM_PMCS ARRAY_SIZE(pfm_p4_pmc_desc)
17721+
17722+/*
17723+ * See section 15.10.6.6 for details about the IQ block
17724+ */
17725+static struct pfm_regmap_desc pfm_p4_pmd_desc[] = {
17726+/* pmd0 */ PMD_D(PFM_REG_C, "BPU_CTR0", MSR_P4_BPU_PERFCTR0),
17727+/* pmd1 */ PMD_D(PFM_REG_C, "BPU_CTR1", MSR_P4_BPU_PERFCTR1),
17728+/* pmd2 */ PMD_D(PFM_REG_C, "MS_CTR0", MSR_P4_MS_PERFCTR0),
17729+/* pmd3 */ PMD_D(PFM_REG_C, "MS_CTR1", MSR_P4_MS_PERFCTR1),
17730+/* pmd4 */ PMD_D(PFM_REG_C, "FLAME_CTR0", MSR_P4_FLAME_PERFCTR0),
17731+/* pmd5 */ PMD_D(PFM_REG_C, "FLAME_CTR1", MSR_P4_FLAME_PERFCTR1),
17732+/* pmd6 */ PMD_D(PFM_REG_C, "IQ_CTR0", MSR_P4_IQ_PERFCTR0),
17733+/* pmd7 */ PMD_D(PFM_REG_C, "IQ_CTR1", MSR_P4_IQ_PERFCTR1),
17734+/* pmd8 */ PMD_D(PFM_REG_C, "IQ_CTR4", MSR_P4_IQ_PERFCTR4),
17735+ /* no HT extension */
17736+/* pmd9 */ PMD_D(PFM_REG_C, "BPU_CTR2", MSR_P4_BPU_PERFCTR2),
17737+/* pmd10 */ PMD_D(PFM_REG_C, "BPU_CTR3", MSR_P4_BPU_PERFCTR3),
17738+/* pmd11 */ PMD_D(PFM_REG_C, "MS_CTR2", MSR_P4_MS_PERFCTR2),
17739+/* pmd12 */ PMD_D(PFM_REG_C, "MS_CTR3", MSR_P4_MS_PERFCTR3),
17740+/* pmd13 */ PMD_D(PFM_REG_C, "FLAME_CTR2", MSR_P4_FLAME_PERFCTR2),
17741+/* pmd14 */ PMD_D(PFM_REG_C, "FLAME_CTR3", MSR_P4_FLAME_PERFCTR3),
17742+/* pmd15 */ PMD_D(PFM_REG_C, "IQ_CTR2", MSR_P4_IQ_PERFCTR2),
17743+/* pmd16 */ PMD_D(PFM_REG_C, "IQ_CTR3", MSR_P4_IQ_PERFCTR3),
17744+/* pmd17 */ PMD_D(PFM_REG_C, "IQ_CTR5", MSR_P4_IQ_PERFCTR5)
17745+};
17746+#define PFM_P4_NUM_PMDS ARRAY_SIZE(pfm_p4_pmd_desc)
17747+
17748+/*
17749+ * Due to hotplug CPU support, threads may not necessarily
17750+ * be activated at the time the module is inserted. We need
17751+ * to check whether they could be activated by looking at
17752+ * the present CPU (present != online).
17753+ */
17754+static int pfm_p4_probe_pmu(void)
17755+{
17756+ unsigned int i;
17757+ int ht_enabled;
17758+
17759+ /*
17760+ * only works on Intel processors
17761+ */
17762+ if (current_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
17763+ PFM_INFO("not running on Intel processor");
17764+ return -1;
17765+ }
17766+
17767+ if (current_cpu_data.x86 != 15) {
17768+ PFM_INFO("unsupported family=%d", current_cpu_data.x86);
17769+ return -1;
17770+ }
17771+
17772+ switch (current_cpu_data.x86_model) {
17773+ case 0 ... 2:
17774+ break;
17775+ case 3 ... 6:
17776+ /*
17777+ * IQ_ESCR0, IQ_ESCR1 only present on model 1, 2
17778+ */
17779+ pfm_p4_pmc_desc[16].type = PFM_REG_NA;
17780+ pfm_p4_pmc_desc[48].type = PFM_REG_NA;
17781+ break;
17782+ default:
17783+ /*
17784+ * do not know if they all work the same, so reject
17785+ * for now
17786+ */
17787+ if (!force) {
17788+ PFM_INFO("unsupported model %d",
17789+ current_cpu_data.x86_model);
17790+ return -1;
17791+ }
17792+ }
17793+
17794+ /*
17795+ * check for local APIC (required)
17796+ */
17797+ if (!cpu_has_apic) {
17798+ PFM_INFO("no local APIC, unsupported");
17799+ return -1;
17800+ }
17801+#ifdef CONFIG_SMP
17802+ ht_enabled = (cpus_weight(__get_cpu_var(cpu_core_map))
17803+ / current_cpu_data.x86_max_cores) > 1;
17804+#else
17805+ ht_enabled = 0;
17806+#endif
17807+ if (cpu_has_ht) {
17808+
17809+ PFM_INFO("HyperThreading supported, status %s",
17810+ ht_enabled ? "on": "off");
17811+ /*
17812+ * disable registers not supporting HT
17813+ */
17814+ if (ht_enabled) {
17815+ PFM_INFO("disabling half the registers for HT");
17816+ for (i = 0; i < PFM_P4_NUM_PMCS; i++) {
17817+ if (pmc_addrs[(i)].reg_type & PFM_REGT_NOHT)
17818+ pfm_p4_pmc_desc[i].type = PFM_REG_NA;
17819+ }
17820+ for (i = 0; i < PFM_P4_NUM_PMDS; i++) {
17821+ if (pmd_addrs[(i)].reg_type & PFM_REGT_NOHT)
17822+ pfm_p4_pmd_desc[i].type = PFM_REG_NA;
17823+ }
17824+ }
17825+ }
17826+
17827+ if (cpu_has_ds) {
17828+ PFM_INFO("Data Save Area (DS) supported");
17829+
17830+ if (cpu_has_pebs) {
17831+ /*
17832+ * PEBS does not work with HyperThreading enabled
17833+ */
17834+ if (ht_enabled)
17835+ PFM_INFO("PEBS supported, status off (because of HT)");
17836+ else
17837+ PFM_INFO("PEBS supported, status on");
17838+ }
17839+ }
17840+
17841+ /*
17842+ * build enable mask
17843+ */
17844+ for (i = 0; i < PFM_P4_NUM_PMCS; i++) {
17845+ if (pmc_addrs[(i)].reg_type & PFM_REGT_EN) {
17846+ __set_bit(i, cast_ulp(enable_mask));
17847+ max_enable = i + 1;
17848+ }
17849+ }
17850+
17851+ if (force_nmi)
17852+ pfm_p4_pmu_info.flags |= PFM_X86_FL_USE_NMI;
17853+ return 0;
17854+}
17855+static inline int get_smt_id(void)
17856+{
17857+#ifdef CONFIG_SMP
17858+ int cpu = smp_processor_id();
17859+ return (cpu != first_cpu(__get_cpu_var(cpu_sibling_map)));
17860+#else
17861+ return 0;
17862+#endif
17863+}
17864+
17865+static void __pfm_write_reg_p4(const struct pfm_p4_regmap *xreg, u64 val)
17866+{
17867+ u64 pmi;
17868+ int smt_id;
17869+
17870+ smt_id = get_smt_id();
17871+ /*
17872+ * HT is only supported by P4-style PMU
17873+ *
17874+ * Adjust for T1 if necessary:
17875+ *
17876+ * - move the T0_OS/T0_USR bits into T1 slots
17877+ * - move the OVF_PMI_T0 bit into the T1 slot
17878+ *
17879+ * The P4/EM64T T1 fields are cleared by the description table;
17880+ * users only program the T0 fields (see the sketch after this function).
17881+ */
17882+ if (smt_id) {
17883+ if (xreg->reg_type & PFM_REGT_ESCR) {
17884+
17885+ /* copy T0_USR & T0_OS to T1 */
17886+ val |= ((val & 0xc) >> 2);
17887+
17888+ /* clear bits T0_USR & T0_OS */
17889+ val &= ~0xc;
17890+
17891+ } else if (xreg->reg_type & PFM_REGT_CCCR) {
17892+ pmi = (val >> 26) & 0x1;
17893+ if (pmi) {
17894+ val &= ~(1UL<<26);
17895+ val |= 1UL<<27;
17896+ }
17897+ }
17898+ }
17899+ if (xreg->addrs[smt_id])
17900+ wrmsrl(xreg->addrs[smt_id], val);
17901+}
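On the second hardware thread, the T0 control bits visible to the user have to be rewritten into the T1 positions before the MSR is written, as done above. A standalone sketch of that bit shuffling (same shifts as the function; in this layout the T0 privilege bits sit at bits 3:2, the T1 bits at 1:0, and OVF_PMI_T0/T1 at bits 26/27):

#include <stdint.h>
#include <stdio.h>

/* remap an ESCR value programmed for thread 0 so it applies to thread 1 */
static uint64_t escr_t0_to_t1(uint64_t val)
{
	val |= (val & 0xc) >> 2;	/* copy the T0 bits (3:2) into the T1 slots (1:0) */
	val &= ~0xcULL;			/* clear the original T0 bits */
	return val;
}

/* remap a CCCR value: move OVF_PMI_T0 (bit 26) to OVF_PMI_T1 (bit 27) */
static uint64_t cccr_t0_to_t1(uint64_t val)
{
	if ((val >> 26) & 0x1) {
		val &= ~(1ULL << 26);
		val |= 1ULL << 27;
	}
	return val;
}

int main(void)
{
	uint64_t escr = 0xcULL | (0x25ULL << 25);	/* T0 privilege bits plus an event select */
	uint64_t cccr = 1ULL << 26;			/* OVF_PMI_T0 set */

	printf("escr 0x%llx -> 0x%llx\n", (unsigned long long)escr,
	       (unsigned long long)escr_t0_to_t1(escr));
	printf("cccr 0x%llx -> 0x%llx\n", (unsigned long long)cccr,
	       (unsigned long long)cccr_t0_to_t1(cccr));
	return 0;
}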
17902+
17903+void __pfm_read_reg_p4(const struct pfm_p4_regmap *xreg, u64 *val)
17904+{
17905+ int smt_id;
17906+
17907+ smt_id = get_smt_id();
17908+
17909+ if (likely(xreg->addrs[smt_id])) {
17910+ rdmsrl(xreg->addrs[smt_id], *val);
17911+ /*
17912+ * HT is only supported by P4-style PMU
17913+ *
17914+ * move the Tx_OS and Tx_USR bits into
17915+ * T0 slots setting the T1 slots to zero
17916+ */
17917+ if (xreg->reg_type & PFM_REGT_ESCR) {
17918+ if (smt_id)
17919+ *val |= (((*val) & 0x3) << 2);
17920+
17921+ /*
17922+ * zero out bits that are reserved
17923+ * (including T1_OS and T1_USR)
17924+ */
17925+ *val &= PFM_ESCR_RSVD;
17926+ }
17927+ } else {
17928+ *val = 0;
17929+ }
17930+}
17931+static void pfm_p4_write_pmc(struct pfm_context *ctx, unsigned int cnum, u64 value)
17932+{
17933+ __pfm_write_reg_p4(&pmc_addrs[cnum], value);
17934+}
17935+
17936+static void pfm_p4_write_pmd(struct pfm_context *ctx, unsigned int cnum, u64 value)
17937+{
17938+ __pfm_write_reg_p4(&pmd_addrs[cnum], value);
17939+}
17940+
17941+static u64 pfm_p4_read_pmd(struct pfm_context *ctx, unsigned int cnum)
17942+{
17943+ u64 tmp;
17944+ __pfm_read_reg_p4(&pmd_addrs[cnum], &tmp);
17945+ return tmp;
17946+}
17947+
17948+static u64 pfm_p4_read_pmc(struct pfm_context *ctx, unsigned int cnum)
17949+{
17950+ u64 tmp;
17951+ __pfm_read_reg_p4(&pmc_addrs[cnum], &tmp);
17952+ return tmp;
17953+}
17954+
17955+struct pfm_ds_area_p4 {
17956+ unsigned long bts_buf_base;
17957+ unsigned long bts_index;
17958+ unsigned long bts_abs_max;
17959+ unsigned long bts_intr_thres;
17960+ unsigned long pebs_buf_base;
17961+ unsigned long pebs_index;
17962+ unsigned long pebs_abs_max;
17963+ unsigned long pebs_intr_thres;
17964+ u64 pebs_cnt_reset;
17965+};
17966+
17967+
17968+static int pfm_p4_stop_save(struct pfm_context *ctx, struct pfm_event_set *set)
17969+{
17970+ struct pfm_arch_pmu_info *pmu_info;
17971+ struct pfm_arch_context *ctx_arch;
17972+ struct pfm_ds_area_p4 *ds = NULL;
17973+ u64 used_mask[PFM_PMC_BV];
17974+ u16 i, j, count, pebs_idx = ~0;
17975+ u16 max_pmc;
17976+ u64 cccr, ctr1, ctr2, ovfl_mask;
17977+
17978+ pmu_info = &pfm_p4_pmu_info;
17979+ ctx_arch = pfm_ctx_arch(ctx);
17980+ max_pmc = ctx->regs.max_pmc;
17981+ ovfl_mask = pfm_pmu_conf->ovfl_mask;
17982+
17983+ /*
17984+ * build used enable PMC bitmask
17985+ * if user did not set any CCCR, then mask is
17986+ * empty and there is nothing to do because nothing
17987+ * was started
17988+ */
17989+ bitmap_and(cast_ulp(used_mask),
17990+ cast_ulp(set->used_pmcs),
17991+ cast_ulp(enable_mask),
17992+ max_enable);
17993+
17994+ count = bitmap_weight(cast_ulp(used_mask), max_enable);
17995+
17996+ PFM_DBG_ovfl("npend=%u ena_mask=0x%llx u_pmcs=0x%llx count=%u num=%u",
17997+ set->npend_ovfls,
17998+ (unsigned long long)enable_mask[0],
17999+ (unsigned long long)set->used_pmcs[0],
18000+ count, max_enable);
18001+
18002+ /*
18003+ * ensures we do not destroy pending overflow
18004+ * information. If pending overflows are already
18005+ * known, then we just stop monitoring.
18006+ */
18007+ if (set->npend_ovfls) {
18008+ /*
18009+ * clear enable bit
18010+ * unfortunately, this is very expensive!
18011+ */
18012+ for (i = 0; count; i++) {
18013+ if (test_bit(i, cast_ulp(used_mask))) {
18014+ __pfm_write_reg_p4(pmc_addrs+i, 0);
18015+ count--;
18016+ }
18017+ }
18018+ /* need save PMDs at upper level */
18019+ return 1;
18020+ }
18021+
18022+ if (ctx_arch->flags.use_pebs) {
18023+ ds = ctx_arch->ds_area;
18024+ pebs_idx = PEBS_PMD;
18025+ PFM_DBG("ds=%p pebs_idx=0x%llx thres=0x%llx",
18026+ ds,
18027+ (unsigned long long)ds->pebs_index,
18028+ (unsigned long long)ds->pebs_intr_thres);
18029+ }
18030+
18031+ /*
18032+ * stop monitoring AND collect pending overflow information AND
18033+ * save pmds.
18034+ *
18035+ * We need to access the CCCR twice, once to get the overflow info
18036+ * and a second time to stop monitoring (which destroys the OVF flag).
18037+ * Similarly, we need to read the counter twice to check whether it
18038+ * overflowed between the CCCR read and the CCCR write (see the sketch after this function).
18039+ */
18040+ for (i = 0; count; i++) {
18041+ if (i != pebs_idx && test_bit(i, cast_ulp(used_mask))) {
18042+ /*
18043+ * controlled counter
18044+ */
18045+ j = pmc_addrs[i].ctr;
18046+
18047+ /* read CCCR (PMC) value */
18048+ __pfm_read_reg_p4(pmc_addrs+i, &cccr);
18049+
18050+ /* read counter (PMD) controlled by PMC */
18051+ __pfm_read_reg_p4(pmd_addrs+j, &ctr1);
18052+
18053+ /* clear CCCR value: stop counter but destroy OVF */
18054+ __pfm_write_reg_p4(pmc_addrs+i, 0);
18055+
18056+ /* read counter controlled by CCCR again */
18057+ __pfm_read_reg_p4(pmd_addrs+j, &ctr2);
18058+
18059+ /*
18060+ * there is an overflow if either:
18061+ * - CCCR.ovf is set (and we just cleared it)
18062+ * - ctr2 < ctr1
18063+ * in that case we set the bit corresponding to the
18064+ * overflowed PMD in povfl_pmds.
18065+ */
18066+ if ((cccr & (1ULL<<31)) || (ctr2 < ctr1)) {
18067+ __set_bit(j, cast_ulp(set->povfl_pmds));
18068+ set->npend_ovfls++;
18069+ }
18070+ ctr2 = (set->pmds[j].value & ~ovfl_mask) | (ctr2 & ovfl_mask);
18071+ set->pmds[j].value = ctr2;
18072+ count--;
18073+ }
18074+ }
18075+ /*
18076+ * check for PEBS buffer full and set the corresponding PMD overflow
18077+ */
18078+ if (ctx_arch->flags.use_pebs) {
18079+ PFM_DBG("ds=%p pebs_idx=0x%lx thres=0x%lx", ds, ds->pebs_index, ds->pebs_intr_thres);
18080+ if (ds->pebs_index >= ds->pebs_intr_thres
18081+ && test_bit(PEBS_PMD, cast_ulp(set->used_pmds))) {
18082+ __set_bit(PEBS_PMD, cast_ulp(set->povfl_pmds));
18083+ set->npend_ovfls++;
18084+ }
18085+ }
18086+ /* 0 means: no need to save the PMD at higher level */
18087+ return 0;
18088+}
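The stop/save loop above has to cope with a race: the counter can wrap between the CCCR read and the CCCR write that clears it, and clearing the CCCR destroys the OVF flag. Reading the counter on both sides of the write and comparing the two values closes that window. A standalone model of the decision, with the two counter reads passed in as plain values:

#include <stdint.h>
#include <stdio.h>

#define CCCR_OVF	(1ULL << 31)	/* CCCR overflow flag, lost when the CCCR is cleared */

/*
 * Overflow happened if either the CCCR already had OVF set, or the counter
 * value read after stopping is lower than the value read before (it wrapped
 * in the window between the two reads).
 */
static int p4_counter_overflowed(uint64_t cccr, uint64_t ctr_before, uint64_t ctr_after)
{
	return (cccr & CCCR_OVF) || (ctr_after < ctr_before);
}

int main(void)
{
	/* flag already set when the CCCR was sampled */
	printf("%d\n", p4_counter_overflowed(CCCR_OVF, 10, 11));
	/* flag not set, but the counter wrapped between the two reads */
	printf("%d\n", p4_counter_overflowed(0, 0xfffffff0ULL, 0x4ULL));
	/* no overflow at all */
	printf("%d\n", p4_counter_overflowed(0, 100, 105));
	return 0;
}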
18089+
18090+static int pfm_p4_create_context(struct pfm_context *ctx, u32 ctx_flags)
18091+{
18092+ struct pfm_arch_context *ctx_arch;
18093+
18094+ ctx_arch = pfm_ctx_arch(ctx);
18095+
18096+ ctx_arch->data = kzalloc(sizeof(struct pfm_arch_p4_context), GFP_KERNEL);
18097+ if (!ctx_arch->data)
18098+ return -ENOMEM;
18099+
18100+ return 0;
18101+}
18102+
18103+static void pfm_p4_free_context(struct pfm_context *ctx)
18104+{
18105+ struct pfm_arch_context *ctx_arch;
18106+
18107+ ctx_arch = pfm_ctx_arch(ctx);
18108+ /*
18109+ * no need to check whether the P4 data was ever allocated:
18110+ * kfree() can deal with a NULL pointer
18111+ */
18112+ kfree(ctx_arch->data);
18113+}
18114+
18115+/*
18116+ * detect if counters have overflowed.
18117+ * return:
18118+ * 0 : no overflow
18119+ * 1 : at least one overflow
18120+ *
18121+ * used by Intel P4
18122+ */
18123+static int __kprobes pfm_p4_has_ovfls(struct pfm_context *ctx)
18124+{
18125+ struct pfm_arch_pmu_info *pmu_info;
18126+ struct pfm_p4_regmap *xrc, *xrd;
18127+ struct pfm_arch_context *ctx_arch;
18128+ struct pfm_arch_p4_context *p4;
18129+ u64 ena_mask[PFM_PMC_BV];
18130+ u64 cccr, ctr1, ctr2;
18131+ int n, i, j;
18132+
18133+ pmu_info = &pfm_p4_pmu_info;
18134+
18135+ ctx_arch = pfm_ctx_arch(ctx);
18136+ xrc = pmc_addrs;
18137+ xrd = pmd_addrs;
18138+ p4 = ctx_arch->data;
18139+
18140+ bitmap_and(cast_ulp(ena_mask),
18141+ cast_ulp(ctx->regs.pmcs),
18142+ cast_ulp(enable_mask),
18143+ max_enable);
18144+
18145+ n = bitmap_weight(cast_ulp(ena_mask), max_enable);
18146+
18147+ for (i = 0; n; i++) {
18148+ if (!test_bit(i, cast_ulp(ena_mask)))
18149+ continue;
18150+ /*
18151+ * controlled counter
18152+ */
18153+ j = xrc[i].ctr;
18154+
18155+ /* read CCCR (PMC) value */
18156+ __pfm_read_reg_p4(xrc+i, &cccr);
18157+
18158+ /* read counter (PMD) controlled by PMC */
18159+ __pfm_read_reg_p4(xrd+j, &ctr1);
18160+
18161+ /* clear CCCR value: stop counter but destroy OVF */
18162+ __pfm_write_reg_p4(xrc+i, 0);
18163+
18164+ /* read counter controlled by CCCR again */
18165+ __pfm_read_reg_p4(xrd+j, &ctr2);
18166+
18167+ /*
18168+ * there is an overflow if either:
18169+ * - CCCR.ovf is set (and we just cleared it)
18170+ * - ctr2 < ctr1
18171+ * in that case we set the bit corresponding to the
18172+ * overflowed PMD in povfl_pmds.
18173+ */
18174+ if ((cccr & (1ULL<<31)) || (ctr2 < ctr1)) {
18175+ __set_bit(j, cast_ulp(p4->povfl_pmds));
18176+ p4->npend_ovfls++;
18177+ }
18178+ p4->saved_cccrs[i] = cccr;
18179+ n--;
18180+ }
18181+ /*
18182+ * if there was no overflow, then it means the NMI was not really
18183+ * for us, so we have to resume monitoring
18184+ */
18185+ if (unlikely(!p4->npend_ovfls)) {
18186+ for (i = 0; i < ctx->regs.max_pmc; i++) {
18187+ if (!test_bit(i, cast_ulp(ena_mask)))
18188+ continue;
18189+ __pfm_write_reg_p4(xrc+i, p4->saved_cccrs[i]);
18190+ }
18191+ }
18192+ return 0;
18193+}
18194+
18195+void pfm_p4_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set)
18196+{
18197+ struct pfm_arch_pmu_info *pmu_info;
18198+ struct pfm_arch_context *ctx_arch;
18199+ u64 *mask;
18200+ u16 i, num;
18201+
18202+ ctx_arch = pfm_ctx_arch(ctx);
18203+ pmu_info = pfm_pmu_info();
18204+
18205+ /*
18206+ * must restore DS pointer before restoring PMCs
18207+ * as this can potentially reactivate monitoring
18208+ */
18209+ if (ctx_arch->flags.use_ds)
18210+ wrmsrl(MSR_IA32_DS_AREA, (unsigned long)ctx_arch->ds_area);
18211+
18212+ /*
18213+ * must restore everything because there are some dependencies
18214+ * (e.g., ESCR and CCCR)
18215+ */
18216+ num = ctx->regs.num_pmcs;
18217+ mask = ctx->regs.pmcs;
18218+ for (i = 0; num; i++) {
18219+ if (test_bit(i, cast_ulp(mask))) {
18220+ pfm_arch_write_pmc(ctx, i, set->pmcs[i]);
18221+ num--;
18222+ }
18223+ }
18224+}
18225+
18226+/*
18227+ * invoked only when NMI is used. Called from the LOCAL_PERFMON_VECTOR
18228+ * handler to copy P4 overflow state captured when the NMI triggered.
18229+ * Given that, on P4, stopping monitoring destroys the overflow
18230+ * information, we save it in pfm_p4_has_ovfls() where monitoring is also stopped.
18231+ *
18232+ * Here we propagate the overflow state to the current active set. The
18233+ * freeze_pmu() call will not overwrite this state because npend_ovfls
18234+ * is non-zero.
18235+ */
18236+static void pfm_p4_nmi_copy_state(struct pfm_context *ctx)
18237+{
18238+ struct pfm_arch_context *ctx_arch;
18239+ struct pfm_event_set *set;
18240+ struct pfm_arch_p4_context *p4;
18241+
18242+ ctx_arch = pfm_ctx_arch(ctx);
18243+ p4 = ctx_arch->data;
18244+ set = ctx->active_set;
18245+
18246+ if (p4->npend_ovfls) {
18247+ set->npend_ovfls = p4->npend_ovfls;
18248+
18249+ bitmap_copy(cast_ulp(set->povfl_pmds),
18250+ cast_ulp(p4->povfl_pmds),
18251+ ctx->regs.max_pmd);
18252+
18253+ p4->npend_ovfls = 0;
18254+ }
18255+}
18256+
18257+/**
18258+ * pfm_p4_quiesce - stop monitoring without grabbing any lock
18259+ *
18260+ * called from NMI interrupt handler to immediately stop monitoring
18261+ * cannot grab any lock, including perfmon related locks
18262+ */
18263+static void __kprobes pfm_p4_quiesce(void)
18264+{
18265+ u16 i;
18266+ /*
18267+ * quiesce PMU by clearing available registers that have
18268+ * the start/stop capability
18269+ */
18270+ for (i = 0; i < pfm_pmu_conf->regs_all.max_pmc; i++) {
18271+ if (test_bit(i, cast_ulp(pfm_pmu_conf->regs_all.pmcs))
18272+ && test_bit(i, cast_ulp(enable_mask)))
18273+ __pfm_write_reg_p4(pmc_addrs+i, 0);
18274+ }
18275+}
18276+
18277+
18278+static struct pfm_pmu_config pfm_p4_pmu_conf = {
18279+ .pmu_name = "Intel P4",
18280+ .counter_width = 40,
18281+ .pmd_desc = pfm_p4_pmd_desc,
18282+ .pmc_desc = pfm_p4_pmc_desc,
18283+ .num_pmc_entries = PFM_P4_NUM_PMCS,
18284+ .num_pmd_entries = PFM_P4_NUM_PMDS,
18285+ .probe_pmu = pfm_p4_probe_pmu,
18286+ .version = "1.0",
18287+ .flags = PFM_PMU_BUILTIN_FLAG,
18288+ .owner = THIS_MODULE,
18289+ .pmu_info = &pfm_p4_pmu_info
18290+};
18291+
18292+static int __init pfm_p4_pmu_init_module(void)
18293+{
18294+ return pfm_pmu_register(&pfm_p4_pmu_conf);
18295+}
18296+
18297+static void __exit pfm_p4_pmu_cleanup_module(void)
18298+{
18299+ pfm_pmu_unregister(&pfm_p4_pmu_conf);
18300+}
18301+
18302+module_init(pfm_p4_pmu_init_module);
18303+module_exit(pfm_p4_pmu_cleanup_module);
18304--- /dev/null
18305+++ b/arch/x86/perfmon/perfmon_p6.c
18306@@ -0,0 +1,310 @@
18307+/*
18308+ * This file contains the P6 family processor PMU register description tables
18309+ *
18310+ * This module supports original P6 processors
18311+ * (Pentium II, Pentium Pro, Pentium III) and Pentium M.
18312+ *
18313+ * Copyright (c) 2005-2007 Hewlett-Packard Development Company, L.P.
18314+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
18315+ *
18316+ * This program is free software; you can redistribute it and/or
18317+ * modify it under the terms of version 2 of the GNU General Public
18318+ * License as published by the Free Software Foundation.
18319+ *
18320+ * This program is distributed in the hope that it will be useful,
18321+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
18322+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18323+ * General Public License for more details.
18324+ *
18325+ * You should have received a copy of the GNU General Public License
18326+ * along with this program; if not, write to the Free Software
18327+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
18328+ * 02111-1307 USA
18329+ */
18330+#include <linux/module.h>
18331+#include <linux/kprobes.h>
18332+#include <linux/perfmon_kern.h>
18333+#include <linux/nmi.h>
18334+#include <asm/msr.h>
18335+
18336+MODULE_AUTHOR("Stephane Eranian <eranian@hpl.hp.com>");
18337+MODULE_DESCRIPTION("P6 PMU description table");
18338+MODULE_LICENSE("GPL");
18339+
18340+static int force_nmi;
18341+MODULE_PARM_DESC(force_nmi, "bool: force use of NMI for PMU interrupt");
18342+module_param(force_nmi, bool, 0600);
18343+
18344+/*
18345+ * - upper 32 bits are reserved
18346+ * - INT: APIC enable bit is reserved (forced to 1)
18347+ * - bit 21 is reserved
18348+ * - bit 22 is reserved on PERFEVTSEL1
18349+ *
18350+ * RSVD: reserved bits are 1
18351+ */
18352+#define PFM_P6_PMC0_RSVD ((~((1ULL<<32)-1)) | (1ULL<<20) | (1ULL<<21))
18353+#define PFM_P6_PMC1_RSVD ((~((1ULL<<32)-1)) | (1ULL<<20) | (3ULL<<21))
18354+
18355+/*
18356+ * force Local APIC interrupt on overflow
18357+ * disable with NO_EMUL64
18358+ */
18359+#define PFM_P6_PMC_VAL (1ULL<<20)
18360+#define PFM_P6_NO64 (1ULL<<20)
18361+
18362+
18363+static void __kprobes pfm_p6_quiesce(void);
18364+static int pfm_p6_has_ovfls(struct pfm_context *ctx);
18365+static int pfm_p6_stop_save(struct pfm_context *ctx,
18366+ struct pfm_event_set *set);
18367+
18368+static u64 enable_mask[PFM_MAX_PMCS];
18369+static u16 max_enable;
18370+
18371+/*
18372+ * PFM_X86_FL_NO_SHARING: because of the single enable bit on MSR_P6_EVNTSEL0
18373+ * the PMU cannot be shared with NMI watchdog or Oprofile
18374+ */
18375+struct pfm_arch_pmu_info pfm_p6_pmu_info = {
18376+ .stop_save = pfm_p6_stop_save,
18377+ .has_ovfls = pfm_p6_has_ovfls,
18378+ .quiesce = pfm_p6_quiesce,
18379+ .flags = PFM_X86_FL_NO_SHARING,
18380+};
18381+
18382+static struct pfm_regmap_desc pfm_p6_pmc_desc[] = {
18383+/* pmc0 */ PMC_D(PFM_REG_I64, "PERFEVTSEL0", PFM_P6_PMC_VAL, PFM_P6_PMC0_RSVD, PFM_P6_NO64, MSR_P6_EVNTSEL0),
18384+/* pmc1 */ PMC_D(PFM_REG_I64, "PERFEVTSEL1", PFM_P6_PMC_VAL, PFM_P6_PMC1_RSVD, PFM_P6_NO64, MSR_P6_EVNTSEL1)
18385+};
18386+#define PFM_P6_NUM_PMCS ARRAY_SIZE(pfm_p6_pmc_desc)
18387+
18388+#define PFM_P6_D(n) \
18389+ { .type = PFM_REG_C, \
18390+ .desc = "PERFCTR"#n, \
18391+ .hw_addr = MSR_P6_PERFCTR0+n, \
18392+ .rsvd_msk = 0, \
18393+ .dep_pmcs[0] = 1ULL << n \
18394+ }
18395+
18396+static struct pfm_regmap_desc pfm_p6_pmd_desc[] = {
18397+/* pmd0 */ PFM_P6_D(0),
18398+/* pmd1 */ PFM_P6_D(1)
18399+};
18400+#define PFM_P6_NUM_PMDS ARRAY_SIZE(pfm_p6_pmd_desc)
18401+
18402+static int pfm_p6_probe_pmu(void)
18403+{
18404+ int high, low;
18405+
18406+ if (current_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
18407+ PFM_INFO("not an Intel processor");
18408+ return -1;
18409+ }
18410+
18411+ /*
18412+ * check for P6 processor family
18413+ */
18414+ if (current_cpu_data.x86 != 6) {
18415+ PFM_INFO("unsupported family=%d", current_cpu_data.x86);
18416+ return -1;
18417+ }
18418+
18419+ switch (current_cpu_data.x86_model) {
18420+ case 1: /* Pentium Pro */
18421+ case 3:
18422+ case 5: /* Pentium II Deschutes */
18423+ case 7 ... 11:
18424+ break;
18425+ case 13:
18426+ /* for Pentium M, we need to check if the PMU exists (see the sketch after this function) */
18427+ rdmsr(MSR_IA32_MISC_ENABLE, low, high);
18428+ if (low & (1U << 7))
18429+ break;
18430+ default:
18431+ PFM_INFO("unsupported CPU model %d",
18432+ current_cpu_data.x86_model);
18433+ return -1;
18434+
18435+ }
18436+
18437+ if (!cpu_has_apic) {
18438+ PFM_INFO("no Local APIC, try rebooting with lapic");
18439+ return -1;
18440+ }
18441+ __set_bit(0, cast_ulp(enable_mask));
18442+ __set_bit(1, cast_ulp(enable_mask));
18443+ max_enable = 1 + 1;
18444+ /*
18445+ * force NMI interrupt?
18446+ */
18447+ if (force_nmi)
18448+ pfm_p6_pmu_info.flags |= PFM_X86_FL_USE_NMI;
18449+
18450+ return 0;
18451+}
18452+
18453+/**
18454+ * pfm_p6_has_ovfls - check for pending overflow condition
18455+ * @ctx: context to work on
18456+ *
18457+ * detect if counters have overflowed.
18458+ * return:
18459+ * 0 : no overflow
18460+ * 1 : at least one overflow
18461+ */
18462+static int __kprobes pfm_p6_has_ovfls(struct pfm_context *ctx)
18463+{
18464+ u64 *cnt_mask;
18465+ u64 wmask, val;
18466+ u16 i, num;
18467+
18468+ cnt_mask = ctx->regs.cnt_pmds;
18469+ num = ctx->regs.num_counters;
18470+ wmask = 1ULL << pfm_pmu_conf->counter_width;
18471+
18472+ /*
18473+ * we can leverage the fact that we know the mapping
18474+ * to hardcode the MSR address and avoid accessing
18475+ * more cachelines
18476+ *
18477+ * We need to check cnt_mask because not all registers
18478+ * may be available.
18479+ */
18480+ for (i = 0; num; i++) {
18481+ if (test_bit(i, cast_ulp(cnt_mask))) {
18482+ rdmsrl(MSR_P6_PERFCTR0+i, val);
18483+ if (!(val & wmask))
18484+ return 1;
18485+ num--;
18486+ }
18487+ }
18488+ return 0;
18489+}
18490+
18491+/**
18492+ * pfm_p6_stop_save -- stop monitoring and save PMD values
18493+ * @ctx: context to work on
18494+ * @set: current event set
18495+ *
18496+ * return value:
18497+ * 0 - no need to save PMDs in caller
18498+ * 1 - need to save PMDs in caller
18499+ */
18500+static int pfm_p6_stop_save(struct pfm_context *ctx, struct pfm_event_set *set)
18501+{
18502+ struct pfm_arch_pmu_info *pmu_info;
18503+ u64 used_mask[PFM_PMC_BV];
18504+ u64 *cnt_pmds;
18505+ u64 val, wmask, ovfl_mask;
18506+ u32 i, count;
18507+
18508+ pmu_info = pfm_pmu_info();
18509+
18510+ wmask = 1ULL << pfm_pmu_conf->counter_width;
18511+ bitmap_and(cast_ulp(used_mask),
18512+ cast_ulp(set->used_pmcs),
18513+ cast_ulp(enable_mask),
18514+ max_enable);
18515+
18516+ count = bitmap_weight(cast_ulp(used_mask), ctx->regs.max_pmc);
18517+
18518+ /*
18519+ * stop monitoring
18520+ * Unfortunately, this is very expensive!
18521+ * wrmsrl() is serializing.
18522+ */
18523+ for (i = 0; count; i++) {
18524+ if (test_bit(i, cast_ulp(used_mask))) {
18525+ wrmsrl(MSR_P6_EVNTSEL0+i, 0);
18526+ count--;
18527+ }
18528+ }
18529+
18530+ /*
18531+	 * if we already have a pending overflow condition, we simply
18532+	 * return to take care of it first.
18533+ */
18534+ if (set->npend_ovfls)
18535+ return 1;
18536+
18537+ ovfl_mask = pfm_pmu_conf->ovfl_mask;
18538+ cnt_pmds = ctx->regs.cnt_pmds;
18539+
18540+ /*
18541+ * check for pending overflows and save PMDs (combo)
18542+ * we employ used_pmds because we also need to save
18543+ * and not just check for pending interrupts.
18544+ *
18545+ * Must check for counting PMDs because of virtual PMDs
18546+ */
18547+ count = set->nused_pmds;
18548+ for (i = 0; count; i++) {
18549+ if (test_bit(i, cast_ulp(set->used_pmds))) {
18550+ val = pfm_arch_read_pmd(ctx, i);
18551+ if (likely(test_bit(i, cast_ulp(cnt_pmds)))) {
18552+ if (!(val & wmask)) {
18553+ __set_bit(i, cast_ulp(set->povfl_pmds));
18554+ set->npend_ovfls++;
18555+ }
18556+ val = (set->pmds[i].value & ~ovfl_mask) | (val & ovfl_mask);
18557+ }
18558+ set->pmds[i].value = val;
18559+ count--;
18560+ }
18561+ }
18562+ /* 0 means: no need to save PMDs at upper level */
18563+ return 0;
18564+}
18565+
18566+/**
18567+ * pfm_p6_quiesce -- stop monitoring without grabbing any lock
18568+ *
18569+ * called from NMI interrupt handler to immediately stop monitoring
18570+ * cannot grab any lock, including perfmon related locks
18571+ */
18572+static void __kprobes pfm_p6_quiesce(void)
18573+{
18574+ /*
18575+ * quiesce PMU by clearing available registers that have
18576+ * the start/stop capability
18577+ *
18578+ * P6 processors only have an enable bit on PERFEVTSEL0
18579+ */
18580+ if (test_bit(0, cast_ulp(pfm_pmu_conf->regs_all.pmcs)))
18581+ wrmsrl(MSR_P6_EVNTSEL0, 0);
18582+}
18583+
18584+/*
18585+ * Counters have 40 bits implemented. However they are designed such
18586+ * that bits [32-39] are sign extensions of bit 31. As such the
18587+ * effective width of a counter for P6-like PMU is 31 bits only.
18588+ *
18589+ * See IA-32 Intel Architecture Software developer manual Vol 3B
18590+ */
18591+static struct pfm_pmu_config pfm_p6_pmu_conf = {
18592+ .pmu_name = "Intel P6 processor Family",
18593+ .counter_width = 31,
18594+ .pmd_desc = pfm_p6_pmd_desc,
18595+ .pmc_desc = pfm_p6_pmc_desc,
18596+ .num_pmc_entries = PFM_P6_NUM_PMCS,
18597+ .num_pmd_entries = PFM_P6_NUM_PMDS,
18598+ .probe_pmu = pfm_p6_probe_pmu,
18599+ .version = "1.0",
18600+ .flags = PFM_PMU_BUILTIN_FLAG,
18601+ .owner = THIS_MODULE,
18602+ .pmu_info = &pfm_p6_pmu_info
18603+};
18604+
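With the 31-bit effective width described above, the masks used by pfm_p6_has_ovfls() and pfm_p6_stop_save() work out as follows (a sketch; it assumes the perfmon core derives ovfl_mask as the usual (1 << counter_width) - 1, which is not shown in this file):

	u64 wmask     = 1ULL << 31;		/* bit tested: still set means no overflow yet */
	u64 ovfl_mask = (1ULL << 31) - 1;	/* low-order bits taken from the hardware counter */
	/* 64-bit software value reconstructed on save:
	 *   val = (set->pmds[i].value & ~ovfl_mask) | (hw_val & ovfl_mask);
	 */
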
18605+static int __init pfm_p6_pmu_init_module(void)
18606+{
18607+ return pfm_pmu_register(&pfm_p6_pmu_conf);
18608+}
18609+
18610+static void __exit pfm_p6_pmu_cleanup_module(void)
18611+{
18612+ pfm_pmu_unregister(&pfm_p6_pmu_conf);
18613+}
18614+
18615+module_init(pfm_p6_pmu_init_module);
18616+module_exit(pfm_p6_pmu_cleanup_module);
18617--- /dev/null
18618+++ b/arch/x86/perfmon/perfmon_pebs_core_smpl.c
18619@@ -0,0 +1,256 @@
18620+/*
18621+ * Copyright (c) 2005-2007 Hewlett-Packard Development Company, L.P.
18622+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
18623+ *
18624+ * This file implements the Precise Event Based Sampling (PEBS)
18625+ * sampling format for Intel Core and Atom processors.
18626+ *
18627+ * This program is free software; you can redistribute it and/or
18628+ * modify it under the terms of version 2 of the GNU General Public
18629+ * License as published by the Free Software Foundation.
18630+ *
18631+ * This program is distributed in the hope that it will be useful,
18632+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
18633+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18634+ * General Public License for more details.
18635+ *
18636+ * You should have received a copy of the GNU General Public License
18637+ * along with this program; if not, write to the Free Software
18638+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
18639+ * 02111-1307 USA
18640+ */
18641+#include <linux/kernel.h>
18642+#include <linux/types.h>
18643+#include <linux/module.h>
18644+#include <linux/init.h>
18645+#include <linux/smp.h>
18646+#include <linux/perfmon_kern.h>
18647+
18648+#include <asm/msr.h>
18649+#include <asm/perfmon_pebs_core_smpl.h>
18650+
18651+MODULE_AUTHOR("Stephane Eranian <eranian@hpl.hp.com>");
18652+MODULE_DESCRIPTION("Intel Core Precise Event-Based Sampling (PEBS)");
18653+MODULE_LICENSE("GPL");
18654+
18655+#define ALIGN_PEBS(a, order) \
18656+ ((a)+(1UL<<(order))-1) & ~((1UL<<(order))-1)
18657+
18658+#define PEBS_PADDING_ORDER 8 /* log2(256) padding for PEBS alignment constraint */
18659+
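A quick worked example of the ALIGN_PEBS() arithmetic with PEBS_PADDING_ORDER = 8 (256-byte alignment):

	/* ALIGN_PEBS(0x1234, 8) == (0x1234 + 255) & ~255 == 0x1300 */
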
18660+static int pfm_pebs_core_fmt_validate(u32 flags, u16 npmds, void *data)
18661+{
18662+ struct pfm_pebs_core_smpl_arg *arg = data;
18663+ size_t min_buf_size;
18664+
18665+ /*
18666+ * need to define at least the size of the buffer
18667+ */
18668+ if (data == NULL) {
18669+ PFM_DBG("no argument passed");
18670+ return -EINVAL;
18671+ }
18672+
18673+ /*
18674+ * compute min buf size. npmds is the maximum number
18675+ * of implemented PMD registers.
18676+ */
18677+ min_buf_size = sizeof(struct pfm_pebs_core_smpl_hdr)
18678+ + sizeof(struct pfm_pebs_core_smpl_entry)
18679+ + (1UL<<PEBS_PADDING_ORDER); /* padding for alignment */
18680+
18681+ PFM_DBG("validate flags=0x%x min_buf_size=%zu buf_size=%zu",
18682+ flags,
18683+ min_buf_size,
18684+ arg->buf_size);
18685+
18686+ /*
18687+ * must hold at least the buffer header + one minimally sized entry
18688+ */
18689+ if (arg->buf_size < min_buf_size)
18690+ return -EINVAL;
18691+
18692+ return 0;
18693+}
18694+
18695+static int pfm_pebs_core_fmt_get_size(unsigned int flags, void *data, size_t *size)
18696+{
18697+ struct pfm_pebs_core_smpl_arg *arg = data;
18698+
18699+ /*
18700+ * size has been validated in pfm_pebs_core_fmt_validate()
18701+ */
18702+ *size = arg->buf_size + (1UL<<PEBS_PADDING_ORDER);
18703+
18704+ return 0;
18705+}
18706+
18707+static int pfm_pebs_core_fmt_init(struct pfm_context *ctx, void *buf,
18708+ u32 flags, u16 npmds, void *data)
18709+{
18710+ struct pfm_arch_context *ctx_arch;
18711+ struct pfm_pebs_core_smpl_hdr *hdr;
18712+ struct pfm_pebs_core_smpl_arg *arg = data;
18713+ u64 pebs_start, pebs_end;
18714+ struct pfm_ds_area_core *ds;
18715+
18716+ ctx_arch = pfm_ctx_arch(ctx);
18717+
18718+ hdr = buf;
18719+ ds = &hdr->ds;
18720+
18721+ /*
18722+ * align PEBS buffer base
18723+ */
18724+ pebs_start = ALIGN_PEBS((unsigned long)(hdr+1), PEBS_PADDING_ORDER);
18725+ pebs_end = pebs_start + arg->buf_size + 1;
18726+
18727+ hdr->version = PFM_PEBS_CORE_SMPL_VERSION;
18728+ hdr->buf_size = arg->buf_size;
18729+ hdr->overflows = 0;
18730+
18731+ /*
18732+ * express PEBS buffer base as offset from the end of the header
18733+ */
18734+ hdr->start_offs = pebs_start - (unsigned long)(hdr+1);
18735+
18736+ /*
18737+ * PEBS buffer boundaries
18738+ */
18739+ ds->pebs_buf_base = pebs_start;
18740+ ds->pebs_abs_max = pebs_end;
18741+
18742+ /*
18743+ * PEBS starting position
18744+ */
18745+ ds->pebs_index = pebs_start;
18746+
18747+ /*
18748+ * PEBS interrupt threshold
18749+ */
18750+ ds->pebs_intr_thres = pebs_start
18751+ + arg->intr_thres
18752+ * sizeof(struct pfm_pebs_core_smpl_entry);
18753+
18754+ /*
18755+ * save counter reset value for PEBS counter
18756+ */
18757+ ds->pebs_cnt_reset = arg->cnt_reset;
18758+
18759+ /*
18760+ * keep track of DS AREA
18761+ */
18762+ ctx_arch->ds_area = ds;
18763+ ctx_arch->flags.use_ds = 1;
18764+ ctx_arch->flags.use_pebs = 1;
18765+
18766+ PFM_DBG("buffer=%p buf_size=%llu offs=%llu pebs_start=0x%llx "
18767+ "pebs_end=0x%llx ds=%p pebs_thres=0x%llx cnt_reset=0x%llx",
18768+ buf,
18769+ (unsigned long long)hdr->buf_size,
18770+ (unsigned long long)hdr->start_offs,
18771+ (unsigned long long)pebs_start,
18772+ (unsigned long long)pebs_end,
18773+ ds,
18774+ (unsigned long long)ds->pebs_intr_thres,
18775+ (unsigned long long)ds->pebs_cnt_reset);
18776+
18777+ return 0;
18778+}
18779+
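To make the layout above concrete, here is a minimal sketch of how a caller might fill the sampling-format argument before creating a context that uses this format. The field names come from struct pfm_pebs_core_smpl_arg as used by the validate/init callbacks above; the entry count, threshold and reset value are purely illustrative:

	#include <asm/perfmon_pebs_core_smpl.h>	/* header added by this patch */

	static void fill_pebs_arg(struct pfm_pebs_core_smpl_arg *arg)
	{
		/* header + room for 4096 PEBS entries + 256 bytes of alignment padding */
		arg->buf_size = sizeof(struct pfm_pebs_core_smpl_hdr)
			      + 4096 * sizeof(struct pfm_pebs_core_smpl_entry)
			      + 256;
		/* raise the PMU interrupt once ~3/4 of the entries are used */
		arg->intr_thres = 3072;
		/* illustrative sampling period: counter reloaded with -100000 */
		arg->cnt_reset = -100000ULL;
	}
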
18780+static int pfm_pebs_core_fmt_handler(struct pfm_context *ctx,
18781+ unsigned long ip, u64 tstamp, void *data)
18782+{
18783+ struct pfm_pebs_core_smpl_hdr *hdr;
18784+ struct pfm_ovfl_arg *arg;
18785+
18786+ hdr = ctx->smpl_addr;
18787+ arg = &ctx->ovfl_arg;
18788+
18789+ PFM_DBG_ovfl("buffer full");
18790+ /*
18791+ * increment number of buffer overflows.
18792+	 * important to detect duplicate sets of samples.
18793+ */
18794+ hdr->overflows++;
18795+
18796+ /*
18797+ * request notification and masking of monitoring.
18798+ * Notification is still subject to the overflowed
18799+ * register having the FL_NOTIFY flag set.
18800+ */
18801+ arg->ovfl_ctrl = PFM_OVFL_CTRL_NOTIFY | PFM_OVFL_CTRL_MASK;
18802+
18803+ return -ENOBUFS; /* we are full, sorry */
18804+}
18805+
18806+static int pfm_pebs_core_fmt_restart(int is_active, u32 *ovfl_ctrl,
18807+ void *buf)
18808+{
18809+ struct pfm_pebs_core_smpl_hdr *hdr = buf;
18810+
18811+ /*
18812+ * reset index to base of buffer
18813+ */
18814+ hdr->ds.pebs_index = hdr->ds.pebs_buf_base;
18815+
18816+ *ovfl_ctrl = PFM_OVFL_CTRL_RESET;
18817+
18818+ return 0;
18819+}
18820+
18821+static int pfm_pebs_core_fmt_exit(void *buf)
18822+{
18823+ return 0;
18824+}
18825+
18826+static struct pfm_smpl_fmt pebs_core_fmt = {
18827+ .fmt_name = PFM_PEBS_CORE_SMPL_NAME,
18828+ .fmt_version = 0x1,
18829+ .fmt_arg_size = sizeof(struct pfm_pebs_core_smpl_arg),
18830+ .fmt_validate = pfm_pebs_core_fmt_validate,
18831+ .fmt_getsize = pfm_pebs_core_fmt_get_size,
18832+ .fmt_init = pfm_pebs_core_fmt_init,
18833+ .fmt_handler = pfm_pebs_core_fmt_handler,
18834+ .fmt_restart = pfm_pebs_core_fmt_restart,
18835+ .fmt_exit = pfm_pebs_core_fmt_exit,
18836+ .fmt_flags = PFM_FMT_BUILTIN_FLAG,
18837+ .owner = THIS_MODULE,
18838+};
18839+
18840+static int __init pfm_pebs_core_fmt_init_module(void)
18841+{
18842+ if (!cpu_has_pebs) {
18843+ PFM_INFO("processor does not have PEBS support");
18844+ return -1;
18845+ }
18846+ /*
18847+ * cpu_has_pebs is not enough to identify Intel Core PEBS
18848+	 * which is different from Pentium 4 PEBS. Therefore we do
18849+ * a more detailed check here
18850+ */
18851+ if (current_cpu_data.x86 != 6) {
18852+ PFM_INFO("not a supported Intel processor");
18853+ return -1;
18854+ }
18855+
18856+ switch (current_cpu_data.x86_model) {
18857+ case 15: /* Merom */
18858+ case 23: /* Penryn */
18859+ case 28: /* Atom (Silverthorne) */
18860+ case 29: /* Dunnington */
18861+ break;
18862+ default:
18863+ PFM_INFO("not a supported Intel processor");
18864+ return -1;
18865+ }
18866+ return pfm_fmt_register(&pebs_core_fmt);
18867+}
18868+
18869+static void __exit pfm_pebs_core_fmt_cleanup_module(void)
18870+{
18871+ pfm_fmt_unregister(&pebs_core_fmt);
18872+}
18873+
18874+module_init(pfm_pebs_core_fmt_init_module);
18875+module_exit(pfm_pebs_core_fmt_cleanup_module);
18876--- /dev/null
18877+++ b/arch/x86/perfmon/perfmon_pebs_p4_smpl.c
18878@@ -0,0 +1,253 @@
18879+/*
18880+ * Copyright (c) 2005-2007 Hewlett-Packard Development Company, L.P.
18881+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
18882+ *
18883+ * This file implements the Precise Event Based Sampling (PEBS)
18884+ * sampling format. It supports the following processors:
18885+ * - 32-bit Pentium 4 or other Netburst-based processors
18886+ * - 64-bit Pentium 4 or other Netburst-based processors
18887+ *
18888+ * This program is free software; you can redistribute it and/or
18889+ * modify it under the terms of version 2 of the GNU General Public
18890+ * License as published by the Free Software Foundation.
18891+ *
18892+ * This program is distributed in the hope that it will be useful,
18893+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
18894+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18895+ * General Public License for more details.
18896+ *
18897+ * You should have received a copy of the GNU General Public License
18898+ * along with this program; if not, write to the Free Software
18899+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
18900+ * 02111-1307 USA
18901+ */
18902+#include <linux/kernel.h>
18903+#include <linux/types.h>
18904+#include <linux/module.h>
18905+#include <linux/init.h>
18906+#include <linux/smp.h>
18907+#include <linux/perfmon_kern.h>
18908+
18909+#include <asm/msr.h>
18910+#include <asm/perfmon_pebs_p4_smpl.h>
18911+
18912+MODULE_AUTHOR("Stephane Eranian <eranian@hpl.hp.com>");
18913+MODULE_DESCRIPTION("Intel P4 Precise Event-Based Sampling (PEBS)");
18914+MODULE_LICENSE("GPL");
18915+
18916+#define ALIGN_PEBS(a, order) \
18917+ ((a)+(1UL<<(order))-1) & ~((1UL<<(order))-1)
18918+
18919+#define PEBS_PADDING_ORDER 8 /* log2(256) padding for PEBS alignment constraint */
18920+
18921+static int pfm_pebs_p4_fmt_validate(u32 flags, u16 npmds, void *data)
18922+{
18923+ struct pfm_pebs_p4_smpl_arg *arg = data;
18924+ size_t min_buf_size;
18925+
18926+ /*
18927+ * need to define at least the size of the buffer
18928+ */
18929+ if (data == NULL) {
18930+ PFM_DBG("no argument passed");
18931+ return -EINVAL;
18932+ }
18933+
18934+ /*
18935+ * compute min buf size. npmds is the maximum number
18936+ * of implemented PMD registers.
18937+ */
18938+ min_buf_size = sizeof(struct pfm_pebs_p4_smpl_hdr)
18939+ + sizeof(struct pfm_pebs_p4_smpl_entry)
18940+ + (1UL<<PEBS_PADDING_ORDER); /* padding for alignment */
18941+
18942+ PFM_DBG("validate flags=0x%x min_buf_size=%zu buf_size=%zu",
18943+ flags,
18944+ min_buf_size,
18945+ arg->buf_size);
18946+
18947+ /*
18948+ * must hold at least the buffer header + one minimally sized entry
18949+ */
18950+ if (arg->buf_size < min_buf_size)
18951+ return -EINVAL;
18952+
18953+ return 0;
18954+}
18955+
18956+static int pfm_pebs_p4_fmt_get_size(unsigned int flags, void *data, size_t *size)
18957+{
18958+ struct pfm_pebs_p4_smpl_arg *arg = data;
18959+
18960+ /*
18961+ * size has been validated in pfm_pebs_p4_fmt_validate()
18962+ */
18963+ *size = arg->buf_size + (1UL<<PEBS_PADDING_ORDER);
18964+
18965+ return 0;
18966+}
18967+
18968+static int pfm_pebs_p4_fmt_init(struct pfm_context *ctx, void *buf,
18969+ u32 flags, u16 npmds, void *data)
18970+{
18971+ struct pfm_arch_context *ctx_arch;
18972+ struct pfm_pebs_p4_smpl_hdr *hdr;
18973+ struct pfm_pebs_p4_smpl_arg *arg = data;
18974+ unsigned long pebs_start, pebs_end;
18975+ struct pfm_ds_area_p4 *ds;
18976+
18977+ ctx_arch = pfm_ctx_arch(ctx);
18978+
18979+ hdr = buf;
18980+ ds = &hdr->ds;
18981+
18982+ /*
18983+ * align PEBS buffer base
18984+ */
18985+ pebs_start = ALIGN_PEBS((unsigned long)(hdr+1), PEBS_PADDING_ORDER);
18986+ pebs_end = pebs_start + arg->buf_size + 1;
18987+
18988+ hdr->version = PFM_PEBS_P4_SMPL_VERSION;
18989+ hdr->buf_size = arg->buf_size;
18990+ hdr->overflows = 0;
18991+
18992+ /*
18993+ * express PEBS buffer base as offset from the end of the header
18994+ */
18995+ hdr->start_offs = pebs_start - (unsigned long)(hdr+1);
18996+
18997+ /*
18998+ * PEBS buffer boundaries
18999+ */
19000+ ds->pebs_buf_base = pebs_start;
19001+ ds->pebs_abs_max = pebs_end;
19002+
19003+ /*
19004+ * PEBS starting position
19005+ */
19006+ ds->pebs_index = pebs_start;
19007+
19008+ /*
19009+ * PEBS interrupt threshold
19010+ */
19011+ ds->pebs_intr_thres = pebs_start
19012+ + arg->intr_thres * sizeof(struct pfm_pebs_p4_smpl_entry);
19013+
19014+ /*
19015+ * save counter reset value for PEBS counter
19016+ */
19017+ ds->pebs_cnt_reset = arg->cnt_reset;
19018+
19019+ /*
19020+ * keep track of DS AREA
19021+ */
19022+ ctx_arch->ds_area = ds;
19023+ ctx_arch->flags.use_pebs = 1;
19024+ ctx_arch->flags.use_ds = 1;
19025+
19026+ PFM_DBG("buffer=%p buf_size=%llu offs=%llu pebs_start=0x%lx "
19027+ "pebs_end=0x%lx ds=%p pebs_thres=0x%lx cnt_reset=0x%llx",
19028+ buf,
19029+ (unsigned long long)hdr->buf_size,
19030+ (unsigned long long)hdr->start_offs,
19031+ pebs_start,
19032+ pebs_end,
19033+ ds,
19034+ ds->pebs_intr_thres,
19035+ (unsigned long long)ds->pebs_cnt_reset);
19036+
19037+ return 0;
19038+}
19039+
19040+static int pfm_pebs_p4_fmt_handler(struct pfm_context *ctx,
19041+ unsigned long ip, u64 tstamp, void *data)
19042+{
19043+ struct pfm_pebs_p4_smpl_hdr *hdr;
19044+ struct pfm_ovfl_arg *arg;
19045+
19046+ hdr = ctx->smpl_addr;
19047+ arg = &ctx->ovfl_arg;
19048+
19049+ PFM_DBG_ovfl("buffer full");
19050+ /*
19051+ * increment number of buffer overflows.
19052+	 * important to detect duplicate sets of samples.
19053+ */
19054+ hdr->overflows++;
19055+
19056+ /*
19057+ * request notification and masking of monitoring.
19058+ * Notification is still subject to the overflowed
19059+ * register having the FL_NOTIFY flag set.
19060+ */
19061+ arg->ovfl_ctrl = PFM_OVFL_CTRL_NOTIFY | PFM_OVFL_CTRL_MASK;
19062+
19063+ return -ENOBUFS; /* we are full, sorry */
19064+}
19065+
19066+static int pfm_pebs_p4_fmt_restart(int is_active, u32 *ovfl_ctrl,
19067+ void *buf)
19068+{
19069+ struct pfm_pebs_p4_smpl_hdr *hdr = buf;
19070+
19071+ /*
19072+ * reset index to base of buffer
19073+ */
19074+ hdr->ds.pebs_index = hdr->ds.pebs_buf_base;
19075+
19076+ *ovfl_ctrl = PFM_OVFL_CTRL_RESET;
19077+
19078+ return 0;
19079+}
19080+
19081+static int pfm_pebs_p4_fmt_exit(void *buf)
19082+{
19083+ return 0;
19084+}
19085+
19086+static struct pfm_smpl_fmt pebs_p4_fmt = {
19087+ .fmt_name = PFM_PEBS_P4_SMPL_NAME,
19088+ .fmt_version = 0x1,
19089+ .fmt_arg_size = sizeof(struct pfm_pebs_p4_smpl_arg),
19090+ .fmt_validate = pfm_pebs_p4_fmt_validate,
19091+ .fmt_getsize = pfm_pebs_p4_fmt_get_size,
19092+ .fmt_init = pfm_pebs_p4_fmt_init,
19093+ .fmt_handler = pfm_pebs_p4_fmt_handler,
19094+ .fmt_restart = pfm_pebs_p4_fmt_restart,
19095+ .fmt_exit = pfm_pebs_p4_fmt_exit,
19096+ .fmt_flags = PFM_FMT_BUILTIN_FLAG,
19097+ .owner = THIS_MODULE,
19098+};
19099+
19100+static int __init pfm_pebs_p4_fmt_init_module(void)
19101+{
19102+ int ht_enabled;
19103+
19104+ if (!cpu_has_pebs) {
19105+ PFM_INFO("processor does not have PEBS support");
19106+ return -1;
19107+ }
19108+ if (current_cpu_data.x86 != 15) {
19109+ PFM_INFO("not an Intel Pentium 4");
19110+ return -1;
19111+ }
19112+#ifdef CONFIG_SMP
19113+ ht_enabled = (cpus_weight(__get_cpu_var(cpu_core_map))
19114+ / current_cpu_data.x86_max_cores) > 1;
19115+#else
19116+ ht_enabled = 0;
19117+#endif
19118+ if (ht_enabled) {
19119+ PFM_INFO("PEBS not available because HyperThreading is on");
19120+ return -1;
19121+ }
19122+ return pfm_fmt_register(&pebs_p4_fmt);
19123+}
19124+
19125+static void __exit pfm_pebs_p4_fmt_cleanup_module(void)
19126+{
19127+ pfm_fmt_unregister(&pebs_p4_fmt);
19128+}
19129+
19130+module_init(pfm_pebs_p4_fmt_init_module);
19131+module_exit(pfm_pebs_p4_fmt_cleanup_module);
19132--- a/include/asm-mips/Kbuild
19133+++ b/include/asm-mips/Kbuild
19134@@ -1,3 +1,4 @@
19135 include include/asm-generic/Kbuild.asm
19136
19137 header-y += cachectl.h sgidefs.h sysmips.h
19138+header-y += perfmon.h
19139--- /dev/null
19140+++ b/include/asm-mips/perfmon.h
19141@@ -0,0 +1,34 @@
19142+/*
19143+ * Copyright (c) 2007 Hewlett-Packard Development Company, L.P.
19144+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
19145+ *
19146+ * This file contains mips64 specific definitions for the perfmon
19147+ * interface.
19148+ *
19149+ * This file MUST never be included directly. Use linux/perfmon.h.
19150+ *
19151+ * This program is free software; you can redistribute it and/or
19152+ * modify it under the terms of version 2 of the GNU General Public
19153+ * License as published by the Free Software Foundation.
19154+ *
19155+ * This program is distributed in the hope that it will be useful,
19156+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
19157+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19158+ * General Public License for more details.
19159+ *
19160+ * You should have received a copy of the GNU General Public License
19161+ * along with this program; if not, write to the Free Software
19162+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19163+ * 02111-1307 USA
19164+ */
19165+#ifndef _ASM_MIPS64_PERFMON_H_
19166+#define _ASM_MIPS64_PERFMON_H_
19167+
19168+/*
19169+ * arch-specific user visible interface definitions
19170+ */
19171+
19172+#define PFM_ARCH_MAX_PMCS (256+64) /* 256 HW 64 SW */
19173+#define PFM_ARCH_MAX_PMDS (256+64) /* 256 HW 64 SW */
19174+
19175+#endif /* _ASM_MIPS64_PERFMON_H_ */
19176--- /dev/null
19177+++ b/include/asm-mips/perfmon_kern.h
19178@@ -0,0 +1,412 @@
19179+/*
19180+ * Copyright (c) 2005 Philip Mucci.
19181+ *
19182+ * Based on other versions:
19183+ * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P.
19184+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
19185+ *
19186+ * This file contains mips64 specific definitions for the perfmon
19187+ * interface.
19188+ *
19189+ * This program is free software; you can redistribute it and/or
19190+ * modify it under the terms of version 2 of the GNU General Public
19191+ * License as published by the Free Software Foundation.
19192+ *
19193+ * This program is distributed in the hope that it will be useful,
19194+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
19195+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19196+ * General Public License for more details.
19197+ *
19198+ * You should have received a copy of the GNU General Public License
19199+ * along with this program; if not, write to the Free Software
19200+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19201+ * 02111-1307 USA
19202+ */
19203+#ifndef _ASM_MIPS64_PERFMON_KERN_H_
19204+#define _ASM_MIPS64_PERFMON_KERN_H_
19205+
19206+#ifdef __KERNEL__
19207+
19208+#ifdef CONFIG_PERFMON
19209+#include <linux/unistd.h>
19210+#include <asm/cacheflush.h>
19211+
19212+#define PFM_ARCH_PMD_STK_ARG 2
19213+#define PFM_ARCH_PMC_STK_ARG 2
19214+
19215+struct pfm_arch_pmu_info {
19216+ u32 pmu_style;
19217+};
19218+
19219+#define MIPS64_CONFIG_PMC_MASK (1 << 4)
19220+#define MIPS64_PMC_INT_ENABLE_MASK (1 << 4)
19221+#define MIPS64_PMC_CNT_ENABLE_MASK (0xf)
19222+#define MIPS64_PMC_EVT_MASK (0x7 << 6)
19223+#define MIPS64_PMC_CTR_MASK (1 << 31)
19224+#define MIPS64_PMD_INTERRUPT (1 << 31)
19225+
19226+/* Coprocessor register 25 contains the PMU interface. */
19227+/* Sel 0 is control for counter 0 */
19228+/* Sel 1 is count for counter 0. */
19229+/* Sel 2 is control for counter 1. */
19230+/* Sel 3 is count for counter 1. */
19231+
19232+/*
19233+
19234+31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
19235+M 0--------------------------------------------------------------0 Event-- IE U S K EXL
19236+
19237+M 31 If this bit is one, another pair of Performance Control
19238+and Counter registers is implemented at a MTC0
19239+
19240+Event 8:5 Counter event enabled for this counter. Possible events
19241+are listed in Table 6-30. R/W Undefined
19242+
19243+IE 4 Counter Interrupt Enable. This bit masks bit 31 of the
19244+associated count register from the interrupt exception
19245+request output. R/W 0
19246+
19247+U 3 Count in User Mode. When this bit is set, the specified
19248+event is counted in User Mode. R/W Undefined
19249+
19250+S 2 Count in Supervisor Mode. When this bit is set, the
19251+specified event is counted in Supervisor Mode. R/W Undefined
19252+
19253+K 1 Count in Kernel Mode. When this bit is set, count the
19254+event in Kernel Mode when EXL and ERL both are 0. R/W Undefined
19255+
19256+EXL 0 Count when EXL. When this bit is set, count the event
19257+when EXL = 1 and ERL = 0. R/W Undefined
19258+*/
19259+
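A small sketch composing a control value from the fields documented above (the event number is illustrative; in practice the value is programmed through pfm_arch_write_pmc() below):

	/* count event 5 in user and kernel mode, overflow interrupt enabled */
	u64 ctrl = (5 << 5)			/* Event field, bits 8:5 */
		 | MIPS64_PMC_INT_ENABLE_MASK	/* IE, bit 4 */
		 | (1 << 3)			/* U: count in user mode */
		 | (1 << 1);			/* K: count in kernel mode */
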
19260+static inline void pfm_arch_resend_irq(struct pfm_context *ctx)
19261+{}
19262+
19263+static inline void pfm_arch_clear_pmd_ovfl_cond(struct pfm_context *ctx,
19264+ struct pfm_event_set *set)
19265+{}
19266+
19267+static inline void pfm_arch_serialize(void)
19268+{}
19269+
19270+
19271+/*
19272+ * MIPS does not save the PMDs during pfm_arch_intr_freeze_pmu(), thus
19273+ * this routine needs to do it when switching sets on overflow
19274+ */
19275+static inline void pfm_arch_save_pmds_from_intr(struct pfm_context *ctx,
19276+ struct pfm_event_set *set)
19277+{
19278+ pfm_save_pmds(ctx, set);
19279+}
19280+
19281+static inline void pfm_arch_write_pmc(struct pfm_context *ctx,
19282+ unsigned int cnum, u64 value)
19283+{
19284+ /*
19285+ * we only write to the actual register when monitoring is
19286+ * active (pfm_start was issued)
19287+ */
19288+ if (ctx && (ctx->flags.started == 0))
19289+ return;
19290+
19291+ switch (pfm_pmu_conf->pmc_desc[cnum].hw_addr) {
19292+ case 0:
19293+ write_c0_perfctrl0(value);
19294+ break;
19295+ case 1:
19296+ write_c0_perfctrl1(value);
19297+ break;
19298+ case 2:
19299+ write_c0_perfctrl2(value);
19300+ break;
19301+ case 3:
19302+ write_c0_perfctrl3(value);
19303+ break;
19304+ default:
19305+ BUG();
19306+ }
19307+}
19308+
19309+static inline void pfm_arch_write_pmd(struct pfm_context *ctx,
19310+ unsigned int cnum, u64 value)
19311+{
19312+ value &= pfm_pmu_conf->ovfl_mask;
19313+
19314+ switch (pfm_pmu_conf->pmd_desc[cnum].hw_addr) {
19315+ case 0:
19316+ write_c0_perfcntr0(value);
19317+ break;
19318+ case 1:
19319+ write_c0_perfcntr1(value);
19320+ break;
19321+ case 2:
19322+ write_c0_perfcntr2(value);
19323+ break;
19324+ case 3:
19325+ write_c0_perfcntr3(value);
19326+ break;
19327+ default:
19328+ BUG();
19329+ }
19330+}
19331+
19332+static inline u64 pfm_arch_read_pmd(struct pfm_context *ctx, unsigned int cnum)
19333+{
19334+ switch (pfm_pmu_conf->pmd_desc[cnum].hw_addr) {
19335+ case 0:
19336+ return read_c0_perfcntr0();
19337+ break;
19338+ case 1:
19339+ return read_c0_perfcntr1();
19340+ break;
19341+ case 2:
19342+ return read_c0_perfcntr2();
19343+ break;
19344+ case 3:
19345+ return read_c0_perfcntr3();
19346+ break;
19347+ default:
19348+ BUG();
19349+ return 0;
19350+ }
19351+}
19352+
19353+static inline u64 pfm_arch_read_pmc(struct pfm_context *ctx, unsigned int cnum)
19354+{
19355+ switch (pfm_pmu_conf->pmc_desc[cnum].hw_addr) {
19356+ case 0:
19357+ return read_c0_perfctrl0();
19358+ break;
19359+ case 1:
19360+ return read_c0_perfctrl1();
19361+ break;
19362+ case 2:
19363+ return read_c0_perfctrl2();
19364+ break;
19365+ case 3:
19366+ return read_c0_perfctrl3();
19367+ break;
19368+ default:
19369+ BUG();
19370+ return 0;
19371+ }
19372+}
19373+
19374+/*
19375+ * For some CPUs, the upper bits of a counter must be set in order for the
19376+ * overflow interrupt to happen. On overflow, the counter has wrapped around,
19377+ * and the upper bits are cleared. This function may be used to set them back.
19378+ */
19379+static inline void pfm_arch_ovfl_reset_pmd(struct pfm_context *ctx,
19380+ unsigned int cnum)
19381+{
19382+ u64 val;
19383+ val = pfm_arch_read_pmd(ctx, cnum);
19384+ /* This masks out overflow bit 31 */
19385+ pfm_arch_write_pmd(ctx, cnum, val);
19386+}
19387+
19388+/*
19389+ * At certain points, perfmon needs to know if monitoring has been
19390+ * explicitly started/stopped by the user via pfm_start/pfm_stop. The
19391+ * information is tracked in ctx.flags.started. However, on certain
19392+ * architectures, it may be possible to start/stop directly from
19393+ * user level with a single assembly instruction, bypassing
19394+ * the kernel. This function must be used to determine, by
19395+ * an arch-specific means, whether monitoring is actually started/stopped.
19396+ */
19397+static inline int pfm_arch_is_active(struct pfm_context *ctx)
19398+{
19399+ return ctx->flags.started;
19400+}
19401+
19402+static inline void pfm_arch_ctxswout_sys(struct task_struct *task,
19403+ struct pfm_context *ctx)
19404+{}
19405+
19406+static inline void pfm_arch_ctxswin_sys(struct task_struct *task,
19407+ struct pfm_context *ctx)
19408+{}
19409+
19410+static inline void pfm_arch_ctxswin_thread(struct task_struct *task,
19411+ struct pfm_context *ctx)
19412+{}
19413+int pfm_arch_ctxswout_thread(struct task_struct *task,
19414+ struct pfm_context *ctx);
19415+
19416+int pfm_arch_is_monitoring_active(struct pfm_context *ctx);
19417+void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx);
19418+void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx);
19419+void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set);
19420+void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set);
19421+char *pfm_arch_get_pmu_module_name(void);
19422+
19423+static inline void pfm_arch_intr_freeze_pmu(struct pfm_context *ctx,
19424+ struct pfm_event_set *set)
19425+{
19426+ pfm_arch_stop(current, ctx);
19427+ /*
19428+ * we mark monitoring as stopped to avoid
19429+ * certain side effects especially in
19430+	 * pfm_switch_sets_from_intr() and
19431+ * pfm_arch_restore_pmcs()
19432+ */
19433+ ctx->flags.started = 0;
19434+}
19435+
19436+/*
19437+ * unfreeze PMU from pfm_do_interrupt_handler()
19438+ * ctx may be NULL for spurious interrupts
19439+ */
19440+static inline void pfm_arch_intr_unfreeze_pmu(struct pfm_context *ctx)
19441+{
19442+ if (!ctx)
19443+ return;
19444+
19445+ PFM_DBG_ovfl("state=%d", ctx->state);
19446+
19447+ ctx->flags.started = 1;
19448+
19449+ if (ctx->state == PFM_CTX_MASKED)
19450+ return;
19451+
19452+ pfm_arch_restore_pmcs(ctx, ctx->active_set);
19453+}
19454+
19455+/*
19456+ * this function is called from the PMU interrupt handler ONLY.
19457+ * On MIPS, the PMU is frozen via pfm_arch_stop(); masking would be implemented
19458+ * via pfm_arch_stop() as well. Given that the PMU is already stopped when
19459+ * entering the interrupt handler, we do not need to stop it again, so
19460+ * this function is a nop.
19461+ */
19462+static inline void pfm_arch_mask_monitoring(struct pfm_context *ctx,
19463+ struct pfm_event_set *set)
19464+{}
19465+
19466+/*
19467+ * on MIPS masking/unmasking uses the start/stop mechanism, so we simply
19468+ * need to start here.
19469+ */
19470+static inline void pfm_arch_unmask_monitoring(struct pfm_context *ctx,
19471+ struct pfm_event_set *set)
19472+{
19473+ pfm_arch_start(current, ctx);
19474+}
19475+
19476+static inline int pfm_arch_context_create(struct pfm_context *ctx,
19477+ u32 ctx_flags)
19478+{
19479+ return 0;
19480+}
19481+
19482+static inline void pfm_arch_context_free(struct pfm_context *ctx)
19483+{}
19484+
19485+
19486+
19487+
19488+
19489+/*
19490+ * function called from pfm_setfl_sane(). Context is locked
19491+ * and interrupts are masked.
19492+ * The value of flags is the value of ctx_flags as passed by
19493+ * user.
19494+ *
19495+ * function must check arch-specific set flags.
19496+ * Return:
19497+ * 0 when flags are valid
19498+ * 1 on error
19499+ */
19500+static inline int
19501+pfm_arch_setfl_sane(struct pfm_context *ctx, u32 flags)
19502+{
19503+ return 0;
19504+}
19505+
19506+static inline int pfm_arch_init(void)
19507+{
19508+ return 0;
19509+}
19510+
19511+static inline void pfm_arch_init_percpu(void)
19512+{}
19513+
19514+static inline int pfm_arch_load_context(struct pfm_context *ctx)
19515+{
19516+ return 0;
19517+}
19518+
19519+static inline void pfm_arch_unload_context(struct pfm_context *ctx)
19520+{}
19521+
19522+static inline int pfm_arch_pmu_acquire(u64 *unavail_pmcs, u64 *unavail_pmds)
19523+{
19524+ return 0;
19525+}
19526+
19527+static inline void pfm_arch_pmu_release(void)
19528+{}
19529+
19530+#ifdef CONFIG_PERFMON_FLUSH
19531+/*
19532+ * due to the cache aliasing problem on MIPS, it is necessary to flush
19533+ * pages out of the cache when they are modified.
19534+ */
19535+static inline void pfm_cacheflush(void *addr, unsigned int len)
19536+{
19537+ unsigned long start, end;
19538+
19539+ start = (unsigned long)addr & PAGE_MASK;
19540+ end = ((unsigned long)addr + len + PAGE_SIZE - 1) & PAGE_MASK;
19541+
19542+ while (start < end) {
19543+ flush_data_cache_page(start);
19544+ start += PAGE_SIZE;
19545+ }
19546+}
19547+#else
19548+static inline void pfm_cacheflush(void *addr, unsigned int len)
19549+{}
19550+#endif
19551+
19552+static inline void pfm_arch_arm_handle_work(struct task_struct *task)
19553+{}
19554+
19555+static inline void pfm_arch_disarm_handle_work(struct task_struct *task)
19556+{}
19557+
19558+static inline int pfm_arch_pmu_config_init(struct pfm_pmu_config *cfg)
19559+{
19560+ return 0;
19561+}
19562+
19563+static inline int pfm_arch_get_base_syscall(void)
19564+{
19565+ if (test_thread_flag(TIF_32BIT_ADDR)) {
19566+ if (test_thread_flag(TIF_32BIT_REGS))
19567+ return __NR_O32_Linux+330;
19568+ return __NR_N32_Linux+293;
19569+ }
19570+ return __NR_64_Linux+289;
19571+}
19572+
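As a worked example of the bases returned above, matched against the unistd.h additions later in this patch: for the 64-bit ABI the perfmon syscalls start at __NR_64_Linux + 289, so:

	/* n64: __NR_pfm_create_context = 5000 + 289 = 5289, __NR_pfm_stop = 5289 + 6 = 5295 */
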
19573+struct pfm_arch_context {
19574+ /* empty */
19575+};
19576+
19577+#define PFM_ARCH_CTX_SIZE sizeof(struct pfm_arch_context)
19578+/*
19579+ * MIPS may impose extra alignment requirements on the sampling buffer
19580+ */
19581+#ifdef CONFIG_PERFMON_SMPL_ALIGN
19582+#define PFM_ARCH_SMPL_ALIGN_SIZE 0x4000
19583+#else
19584+#define PFM_ARCH_SMPL_ALIGN_SIZE 0
19585+#endif
19586+
19587+#endif /* CONFIG_PERFMON */
19588+
19589+#endif /* __KERNEL__ */
19590+#endif /* _ASM_MIPS64_PERFMON_KERN_H_ */
19591--- a/include/asm-mips/system.h
19592+++ b/include/asm-mips/system.h
19593@@ -67,6 +67,10 @@ do { \
19594 __mips_mt_fpaff_switch_to(prev); \
19595 if (cpu_has_dsp) \
19596 __save_dsp(prev); \
19597+ if (test_tsk_thread_flag(prev, TIF_PERFMON_CTXSW)) \
19598+ pfm_ctxsw_out(prev, next); \
19599+ if (test_tsk_thread_flag(next, TIF_PERFMON_CTXSW)) \
19600+ pfm_ctxsw_in(prev, next); \
19601 (last) = resume(prev, next, task_thread_info(next)); \
19602 } while (0)
19603
19604--- a/include/asm-mips/thread_info.h
19605+++ b/include/asm-mips/thread_info.h
19606@@ -114,6 +114,7 @@ register struct thread_info *__current_t
19607 #define TIF_NEED_RESCHED 2 /* rescheduling necessary */
19608 #define TIF_SYSCALL_AUDIT 3 /* syscall auditing active */
19609 #define TIF_SECCOMP 4 /* secure computing */
19610+#define TIF_PERFMON_WORK 5 /* work for pfm_handle_work() */
19611 #define TIF_RESTORE_SIGMASK 9 /* restore signal mask in do_signal() */
19612 #define TIF_USEDFPU 16 /* FPU was used by this task this quantum (SMP) */
19613 #define TIF_POLLING_NRFLAG 17 /* true if poll_idle() is polling TIF_NEED_RESCHED */
19614@@ -124,6 +125,7 @@ register struct thread_info *__current_t
19615 #define TIF_32BIT_REGS 22 /* also implies 16/32 fprs */
19616 #define TIF_32BIT_ADDR 23 /* 32-bit address space (o32/n32) */
19617 #define TIF_FPUBOUND 24 /* thread bound to FPU-full CPU set */
19618+#define TIF_PERFMON_CTXSW 25 /* perfmon needs ctxsw calls */
19619 #define TIF_SYSCALL_TRACE 31 /* syscall trace active */
19620
19621 #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE)
19622@@ -140,6 +142,8 @@ register struct thread_info *__current_t
19623 #define _TIF_32BIT_REGS (1<<TIF_32BIT_REGS)
19624 #define _TIF_32BIT_ADDR (1<<TIF_32BIT_ADDR)
19625 #define _TIF_FPUBOUND (1<<TIF_FPUBOUND)
19626+#define _TIF_PERFMON_WORK (1<<TIF_PERFMON_WORK)
19627+#define _TIF_PERFMON_CTXSW (1<<TIF_PERFMON_CTXSW)
19628
19629 /* work to do on interrupt/exception return */
19630 #define _TIF_WORK_MASK (0x0000ffef & ~_TIF_SECCOMP)
19631--- a/include/asm-mips/unistd.h
19632+++ b/include/asm-mips/unistd.h
19633@@ -350,11 +350,23 @@
19634 #define __NR_dup3 (__NR_Linux + 327)
19635 #define __NR_pipe2 (__NR_Linux + 328)
19636 #define __NR_inotify_init1 (__NR_Linux + 329)
19637+#define __NR_pfm_create_context (__NR_Linux + 330)
19638+#define __NR_pfm_write_pmcs (__NR_pfm_create_context+1)
19639+#define __NR_pfm_write_pmds (__NR_pfm_create_context+2)
19640+#define __NR_pfm_read_pmds (__NR_pfm_create_context+3)
19641+#define __NR_pfm_load_context (__NR_pfm_create_context+4)
19642+#define __NR_pfm_start (__NR_pfm_create_context+5)
19643+#define __NR_pfm_stop (__NR_pfm_create_context+6)
19644+#define __NR_pfm_restart (__NR_pfm_create_context+7)
19645+#define __NR_pfm_create_evtsets (__NR_pfm_create_context+8)
19646+#define __NR_pfm_getinfo_evtsets (__NR_pfm_create_context+9)
19647+#define __NR_pfm_delete_evtsets (__NR_pfm_create_context+10)
19648+#define __NR_pfm_unload_context (__NR_pfm_create_context+11)
19649
19650 /*
19651 * Offset of the last Linux o32 flavoured syscall
19652 */
19653-#define __NR_Linux_syscalls 329
19654+#define __NR_Linux_syscalls 341
19655
19656 #endif /* _MIPS_SIM == _MIPS_SIM_ABI32 */
19657
19658@@ -656,16 +668,28 @@
19659 #define __NR_dup3 (__NR_Linux + 286)
19660 #define __NR_pipe2 (__NR_Linux + 287)
19661 #define __NR_inotify_init1 (__NR_Linux + 288)
19662+#define __NR_pfm_create_context (__NR_Linux + 289)
19663+#define __NR_pfm_write_pmcs (__NR_pfm_create_context+1)
19664+#define __NR_pfm_write_pmds (__NR_pfm_create_context+2)
19665+#define __NR_pfm_read_pmds (__NR_pfm_create_context+3)
19666+#define __NR_pfm_load_context (__NR_pfm_create_context+4)
19667+#define __NR_pfm_start (__NR_pfm_create_context+5)
19668+#define __NR_pfm_stop (__NR_pfm_create_context+6)
19669+#define __NR_pfm_restart (__NR_pfm_create_context+7)
19670+#define __NR_pfm_create_evtsets (__NR_pfm_create_context+8)
19671+#define __NR_pfm_getinfo_evtsets (__NR_pfm_create_context+9)
19672+#define __NR_pfm_delete_evtsets (__NR_pfm_create_context+10)
19673+#define __NR_pfm_unload_context (__NR_pfm_create_context+11)
19674
19675 /*
19676 * Offset of the last Linux 64-bit flavoured syscall
19677 */
19678-#define __NR_Linux_syscalls 288
19679+#define __NR_Linux_syscalls 300
19680
19681 #endif /* _MIPS_SIM == _MIPS_SIM_ABI64 */
19682
19683 #define __NR_64_Linux 5000
19684-#define __NR_64_Linux_syscalls 288
19685+#define __NR_64_Linux_syscalls 300
19686
19687 #if _MIPS_SIM == _MIPS_SIM_NABI32
19688
19689@@ -966,16 +990,28 @@
19690 #define __NR_dup3 (__NR_Linux + 290)
19691 #define __NR_pipe2 (__NR_Linux + 291)
19692 #define __NR_inotify_init1 (__NR_Linux + 292)
19693+#define __NR_pfm_create_context (__NR_Linux + 293)
19694+#define __NR_pfm_write_pmcs (__NR_pfm_create_context+1)
19695+#define __NR_pfm_write_pmds (__NR_pfm_create_context+2)
19696+#define __NR_pfm_read_pmds (__NR_pfm_create_context+3)
19697+#define __NR_pfm_load_context (__NR_pfm_create_context+4)
19698+#define __NR_pfm_start (__NR_pfm_create_context+5)
19699+#define __NR_pfm_stop (__NR_pfm_create_context+6)
19700+#define __NR_pfm_restart (__NR_pfm_create_context+7)
19701+#define __NR_pfm_create_evtsets (__NR_pfm_create_context+8)
19702+#define __NR_pfm_getinfo_evtsets (__NR_pfm_create_context+9)
19703+#define __NR_pfm_delete_evtsets (__NR_pfm_create_context+10)
19704+#define __NR_pfm_unload_context (__NR_pfm_create_context+11)
19705
19706 /*
19707 * Offset of the last N32 flavoured syscall
19708 */
19709-#define __NR_Linux_syscalls 292
19710+#define __NR_Linux_syscalls 304
19711
19712 #endif /* _MIPS_SIM == _MIPS_SIM_NABI32 */
19713
19714 #define __NR_N32_Linux 6000
19715-#define __NR_N32_Linux_syscalls 292
19716+#define __NR_N32_Linux_syscalls 304
19717
19718 #ifdef __KERNEL__
19719
19720--- a/include/asm-x86/Kbuild
19721+++ b/include/asm-x86/Kbuild
19722@@ -9,6 +9,7 @@ header-y += prctl.h
19723 header-y += ptrace-abi.h
19724 header-y += sigcontext32.h
19725 header-y += ucontext.h
19726+header-y += perfmon.h
19727 header-y += processor-flags.h
19728
19729 unifdef-y += e820.h
19730--- a/include/asm-x86/ia32_unistd.h
19731+++ b/include/asm-x86/ia32_unistd.h
19732@@ -8,11 +8,12 @@
19733 * the number. This should be otherwise in sync with asm-x86/unistd_32.h. -AK
19734 */
19735
19736-#define __NR_ia32_restart_syscall 0
19737-#define __NR_ia32_exit 1
19738-#define __NR_ia32_read 3
19739-#define __NR_ia32_write 4
19740-#define __NR_ia32_sigreturn 119
19741-#define __NR_ia32_rt_sigreturn 173
19742+#define __NR_ia32_restart_syscall 0
19743+#define __NR_ia32_exit 1
19744+#define __NR_ia32_read 3
19745+#define __NR_ia32_write 4
19746+#define __NR_ia32_sigreturn 119
19747+#define __NR_ia32_rt_sigreturn 173
19748+#define __NR_ia32_pfm_create_context 333
19749
19750 #endif /* _ASM_X86_64_IA32_UNISTD_H_ */
19751--- a/include/asm-x86/irq_vectors.h
19752+++ b/include/asm-x86/irq_vectors.h
19753@@ -99,6 +99,11 @@
19754 #define LOCAL_TIMER_VECTOR 0xef
19755
19756 /*
19757+ * Perfmon PMU interrupt vector
19758+ */
19759+#define LOCAL_PERFMON_VECTOR 0xee
19760+
19761+/*
19762 * First APIC vector available to drivers: (vectors 0x30-0xee) we
19763 * start at 0x31(0x41) to spread out vectors evenly between priority
19764 * levels. (0x80 is the syscall vector)
19765--- a/include/asm-x86/mach-default/entry_arch.h
19766+++ b/include/asm-x86/mach-default/entry_arch.h
19767@@ -32,4 +32,8 @@ BUILD_INTERRUPT(spurious_interrupt,SPURI
19768 BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR)
19769 #endif
19770
19771+#ifdef CONFIG_PERFMON
19772+BUILD_INTERRUPT(pmu_interrupt,LOCAL_PERFMON_VECTOR)
19773+#endif
19774+
19775 #endif
19776--- /dev/null
19777+++ b/include/asm-x86/perfmon.h
19778@@ -0,0 +1,34 @@
19779+/*
19780+ * Copyright (c) 2007 Hewlett-Packard Development Company, L.P.
19781+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
19782+ *
19783+ * This file contains i386/x86_64 specific definitions for the perfmon
19784+ * interface.
19785+ *
19786+ * This file MUST never be included directly. Use linux/perfmon.h.
19787+ *
19788+ * This program is free software; you can redistribute it and/or
19789+ * modify it under the terms of version 2 of the GNU General Public
19790+ * License as published by the Free Software Foundation.
19791+ *
19792+ * This program is distributed in the hope that it will be useful,
19793+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
19794+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19795+ * General Public License for more details.
19796+ *
19797+ * You should have received a copy of the GNU General Public License
19798+ * along with this program; if not, write to the Free Software
19799+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19800+ * 02111-1307 USA
19801+ */
19802+#ifndef _ASM_X86_PERFMON__H_
19803+#define _ASM_X86_PERFMON__H_
19804+
19805+/*
19806+ * arch-specific user visible interface definitions
19807+ */
19808+
19809+#define PFM_ARCH_MAX_PMCS (256+64) /* 256 HW 64 SW */
19810+#define PFM_ARCH_MAX_PMDS (256+64) /* 256 HW 64 SW */
19811+
19812+#endif /* _ASM_X86_PERFMON__H_ */
19813--- /dev/null
19814+++ b/include/asm-x86/perfmon_kern.h
19815@@ -0,0 +1,548 @@
19816+/*
19817+ * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P.
19818+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
19819+ *
19820+ * Copyright (c) 2007 Advanced Micro Devices, Inc.
19821+ * Contributed by Robert Richter <robert.richter@amd.com>
19822+ *
19823+ * This file contains X86 Processor Family specific definitions
19824+ * for the perfmon interface. This covers P6, Pentium M, P4/Xeon
19825+ * (32-bit and 64-bit, i.e., EM64T) and AMD X86-64.
19826+ *
19827+ * This program is free software; you can redistribute it and/or
19828+ * modify it under the terms of version 2 of the GNU General Public
19829+ * License as published by the Free Software Foundation.
19830+ *
19831+ * This program is distributed in the hope that it will be useful,
19832+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
19833+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19834+ * General Public License for more details.
19835+ *
19836+ * You should have received a copy of the GNU General Public License
19837+ * along with this program; if not, write to the Free Software
19838+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19839+ * 02111-1307 USA
19840+ */
19841+#ifndef _ASM_X86_PERFMON_KERN_H_
19842+#define _ASM_X86_PERFMON_KERN_H_
19843+
19844+#ifdef CONFIG_PERFMON
19845+#include <linux/unistd.h>
19846+#ifdef CONFIG_4KSTACKS
19847+#define PFM_ARCH_PMD_STK_ARG 2
19848+#define PFM_ARCH_PMC_STK_ARG 2
19849+#else
19850+#define PFM_ARCH_PMD_STK_ARG 4 /* about 700 bytes of stack space */
19851+#define PFM_ARCH_PMC_STK_ARG 4 /* about 200 bytes of stack space */
19852+#endif
19853+
19854+struct pfm_arch_pmu_info {
19855+ u32 flags; /* PMU feature flags */
19856+ /*
19857+ * mandatory model-specific callbacks
19858+ */
19859+ int (*stop_save)(struct pfm_context *ctx, struct pfm_event_set *set);
19860+ int (*has_ovfls)(struct pfm_context *ctx);
19861+ void (*quiesce)(void);
19862+
19863+ /*
19864+ * optional model-specific callbacks
19865+ */
19866+ void (*acquire_pmu_percpu)(void);
19867+ void (*release_pmu_percpu)(void);
19868+ int (*create_context)(struct pfm_context *ctx, u32 ctx_flags);
19869+ void (*free_context)(struct pfm_context *ctx);
19870+ int (*load_context)(struct pfm_context *ctx);
19871+ void (*unload_context)(struct pfm_context *ctx);
19872+ void (*write_pmc)(struct pfm_context *ctx, unsigned int cnum, u64 value);
19873+ void (*write_pmd)(struct pfm_context *ctx, unsigned int cnum, u64 value);
19874+ u64 (*read_pmd)(struct pfm_context *ctx, unsigned int cnum);
19875+ u64 (*read_pmc)(struct pfm_context *ctx, unsigned int cnum);
19876+ void (*nmi_copy_state)(struct pfm_context *ctx);
19877+ void (*restore_pmcs)(struct pfm_context *ctx,
19878+ struct pfm_event_set *set);
19879+ void (*restore_pmds)(struct pfm_context *ctx,
19880+ struct pfm_event_set *set);
19881+};
19882+
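For orientation, a minimal sketch of how a model-specific module fills the mandatory callbacks of this structure (compare the P6 table earlier in this patch; the my_* names are hypothetical):

	static struct pfm_arch_pmu_info my_pmu_info = {
		.stop_save = my_stop_save,	/* stop monitoring and save PMDs */
		.has_ovfls = my_has_ovfls,	/* detect pending counter overflows */
		.quiesce   = my_quiesce,	/* stop the PMU from NMI context, lock-free */
		.flags     = PFM_X86_FL_NO_SHARING,
	};
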
19883+/*
19884+ * PMU feature flags
19885+ */
19886+#define PFM_X86_FL_USE_NMI 0x01 /* user asking for NMI */
19887+#define PFM_X86_FL_NO_SHARING 0x02 /* no sharing with other subsystems */
19888+#define PFM_X86_FL_SHARING 0x04 /* PMU is being shared */
19889+
19890+struct pfm_x86_ctx_flags {
19891+ unsigned int insecure:1; /* rdpmc per-thread self-monitoring */
19892+ unsigned int use_pebs:1; /* PEBS used */
19893+ unsigned int use_ds:1; /* DS used */
19894+ unsigned int reserved:29; /* for future use */
19895+};
19896+
19897+struct pfm_arch_context {
19898+ u64 saved_real_iip; /* instr pointer of last NMI intr */
19899+ struct pfm_x86_ctx_flags flags; /* flags */
19900+ void *ds_area; /* address of DS area (to go away) */
19901+ void *data; /* model-specific data */
19902+};
19903+
19904+/*
19905+ * functions implemented as inline on x86
19906+ */
19907+
19908+/**
19909+ * pfm_arch_write_pmc - write a single PMC register
19910+ * @ctx: context to work on
19911+ * @cnum: PMC index
19912+ * @value: PMC 64-bit value
19913+ *
19914+ * in certain situations, ctx may be NULL
19915+ */
19916+static inline void pfm_arch_write_pmc(struct pfm_context *ctx,
19917+ unsigned int cnum, u64 value)
19918+{
19919+ struct pfm_arch_pmu_info *pmu_info;
19920+
19921+ pmu_info = pfm_pmu_info();
19922+
19923+ /*
19924+ * we only write to the actual register when monitoring is
19925+ * active (pfm_start was issued)
19926+ */
19927+ if (ctx && ctx->flags.started == 0)
19928+ return;
19929+
19930+ /*
19931+ * model-specific override, if any
19932+ */
19933+ if (pmu_info->write_pmc) {
19934+ pmu_info->write_pmc(ctx, cnum, value);
19935+ return;
19936+ }
19937+
19938+ PFM_DBG_ovfl("pfm_arch_write_pmc(0x%lx, 0x%Lx)",
19939+ pfm_pmu_conf->pmc_desc[cnum].hw_addr,
19940+ (unsigned long long) value);
19941+
19942+ wrmsrl(pfm_pmu_conf->pmc_desc[cnum].hw_addr, value);
19943+}
19944+
19945+/**
19946+ * pfm_arch_write_pmd - write a single PMD register
19947+ * @ctx: context to work on
19948+ * @cnum: PMD index
19949+ * @value: PMD 64-bit value
19950+ */
19951+static inline void pfm_arch_write_pmd(struct pfm_context *ctx,
19952+ unsigned int cnum, u64 value)
19953+{
19954+ struct pfm_arch_pmu_info *pmu_info;
19955+
19956+ pmu_info = pfm_pmu_info();
19957+
19958+ /*
19959+ * to make sure the counter overflows, we set the
19960+	 * upper bits. We also clear any other unimplemented
19961+	 * bits, as writing them may cause a crash on some processors.
19962+ */
19963+ if (pfm_pmu_conf->pmd_desc[cnum].type & PFM_REG_C64)
19964+ value = (value | ~pfm_pmu_conf->ovfl_mask)
19965+ & ~pfm_pmu_conf->pmd_desc[cnum].rsvd_msk;
19966+
19967+ PFM_DBG_ovfl("pfm_arch_write_pmd(0x%lx, 0x%Lx)",
19968+ pfm_pmu_conf->pmd_desc[cnum].hw_addr,
19969+ (unsigned long long) value);
19970+
19971+ /*
19972+ * model-specific override, if any
19973+ */
19974+ if (pmu_info->write_pmd) {
19975+ pmu_info->write_pmd(ctx, cnum, value);
19976+ return;
19977+ }
19978+
19979+ wrmsrl(pfm_pmu_conf->pmd_desc[cnum].hw_addr, value);
19980+}
19981+
19982+/**
19983+ * pfm_arch_read_pmd - read a single PMD register
19984+ * @ctx: context to work on
19985+ * @cnum: PMD index
19986+ *
19987+ * return value is register 64-bit value
19988+ */
19989+static inline u64 pfm_arch_read_pmd(struct pfm_context *ctx, unsigned int cnum)
19990+{
19991+ struct pfm_arch_pmu_info *pmu_info;
19992+ u64 tmp;
19993+
19994+ pmu_info = pfm_pmu_info();
19995+
19996+ /*
19997+ * model-specific override, if any
19998+ */
19999+ if (pmu_info->read_pmd)
20000+ tmp = pmu_info->read_pmd(ctx, cnum);
20001+ else
20002+ rdmsrl(pfm_pmu_conf->pmd_desc[cnum].hw_addr, tmp);
20003+
20004+ PFM_DBG_ovfl("pfm_arch_read_pmd(0x%lx) = 0x%Lx",
20005+ pfm_pmu_conf->pmd_desc[cnum].hw_addr,
20006+ (unsigned long long) tmp);
20007+ return tmp;
20008+}
20009+
20010+/**
20011+ * pfm_arch_read_pmc - read a single PMC register
20012+ * @ctx: context to work on
20013+ * @cnum: PMC index
20014+ *
20015+ * return value is register 64-bit value
20016+ */
20017+static inline u64 pfm_arch_read_pmc(struct pfm_context *ctx, unsigned int cnum)
20018+{
20019+ struct pfm_arch_pmu_info *pmu_info;
20020+ u64 tmp;
20021+
20022+ pmu_info = pfm_pmu_info();
20023+
20024+ /*
20025+ * model-specific override, if any
20026+ */
20027+ if (pmu_info->read_pmc)
20028+ tmp = pmu_info->read_pmc(ctx, cnum);
20029+ else
20030+ rdmsrl(pfm_pmu_conf->pmc_desc[cnum].hw_addr, tmp);
20031+
20032+ PFM_DBG_ovfl("pfm_arch_read_pmc(0x%lx) = 0x%016Lx",
20033+ pfm_pmu_conf->pmc_desc[cnum].hw_addr,
20034+ (unsigned long long) tmp);
20035+ return tmp;
20036+}
20037+
20038+/**
20039+ * pfm_arch_is_active - return non-zero if monitoring has been started
20040+ * @ctx: context to check
20041+ *
20042+ * At certain points, perfmon needs to know if monitoring has been
20043+ * explicitly started.
20044+ *
20045+ * On x86, there is no other way but to use pfm_start/pfm_stop
20046+ * to activate monitoring, thus we can simply check flags.started
20047+ */
20048+static inline int pfm_arch_is_active(struct pfm_context *ctx)
20049+{
20050+ return ctx->flags.started;
20051+}
20052+
20053+
20054+/**
20055+ * pfm_arch_unload_context - detach context from thread or CPU
20056+ * @ctx: context to detach
20057+ *
20058+ * in system-wide mode, ctx->task is NULL, otherwise it points to the
20059+ * attached thread
20060+ */
20061+static inline void pfm_arch_unload_context(struct pfm_context *ctx)
20062+{
20063+ struct pfm_arch_pmu_info *pmu_info;
20064+ struct pfm_arch_context *ctx_arch;
20065+
20066+ ctx_arch = pfm_ctx_arch(ctx);
20067+ pmu_info = pfm_pmu_info();
20068+
20069+ if (ctx_arch->flags.insecure) {
20070+ PFM_DBG("clear cr4.pce");
20071+ clear_in_cr4(X86_CR4_PCE);
20072+ }
20073+
20074+ if (pmu_info->unload_context)
20075+ pmu_info->unload_context(ctx);
20076+}
20077+
20078+/**
20079+ * pfm_arch_load_context - attach context to thread or CPU
20080+ * @ctx: context to attach
20081+ */
20082+static inline int pfm_arch_load_context(struct pfm_context *ctx)
20083+{
20084+ struct pfm_arch_pmu_info *pmu_info;
20085+ struct pfm_arch_context *ctx_arch;
20086+ int ret = 0;
20087+
20088+ ctx_arch = pfm_ctx_arch(ctx);
20089+ pmu_info = pfm_pmu_info();
20090+
20091+ /*
20092+ * RDPMC authorized in system-wide and
20093+ * per-thread self-monitoring.
20094+ *
20095+ * RDPMC only gives access to counts.
20096+ *
20097+	 * The context-switch code does not restore
20098+	 * all the PMD registers (an optimization), thus there
20099+	 * is a possible leak of counts in per-thread
20100+	 * mode.
20101+ */
20102+ if (ctx->task == current || ctx->flags.system) {
20103+ PFM_DBG("set cr4.pce");
20104+ set_in_cr4(X86_CR4_PCE);
20105+ ctx_arch->flags.insecure = 1;
20106+ }
20107+
20108+ if (pmu_info->load_context)
20109+ ret = pmu_info->load_context(ctx);
20110+
20111+ return ret;
20112+}
20113+
20114+void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set);
20115+void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx);
20116+void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx);
20117+
20118+/**
20119+ * pfm_arch_unmask_monitoring - unmask monitoring
20120+ * @ctx: context to mask
20121+ * @set: current event set
20122+ *
20123+ * masking is slightly different from stopping in that it does not undo
20124+ * the pfm_start() issued by the user. This is used in conjunction with
20125+ * sampling. Masking means stop monitoring, but do not authorize the user
20126+ * to issue pfm_start/stop during that time. Unmasking is achieved via
20127+ * pfm_restart() and may also depend on the sampling format used.
20128+ *
20129+ * on x86 masking/unmasking use the start/stop mechanism, except
20130+ * that flags.started is not modified.
20131+ */
20132+static inline void pfm_arch_unmask_monitoring(struct pfm_context *ctx,
20133+ struct pfm_event_set *set)
20134+{
20135+ pfm_arch_start(current, ctx);
20136+}
20137+
20138+/**
20139+ * pfm_arch_intr_freeze_pmu - stop monitoring when handling PMU interrupt
20140+ * @ctx: current context
20141+ * @set: current event set
20142+ *
20143+ * called from __pfm_interrupt_handler().
20144+ * ctx is not NULL. ctx is locked. interrupts are masked
20145+ *
20146+ * The following actions must take place:
20147+ * - stop all monitoring to ensure handler has consistent view.
20148+ * - collect overflowed PMDs bitmask into povfls_pmds and
20149+ * npend_ovfls. If no interrupt detected then npend_ovfls
20150+ * must be set to zero.
20151+ */
20152+static inline void pfm_arch_intr_freeze_pmu(struct pfm_context *ctx,
20153+ struct pfm_event_set *set)
20154+{
20155+ /*
20156+ * on X86, freezing is equivalent to stopping
20157+ */
20158+ pfm_arch_stop(current, ctx);
20159+
20160+ /*
20161+ * we mark monitoring as stopped to avoid
20162+ * certain side effects especially in
20163+ * pfm_switch_sets_from_intr() and
20164+ * pfm_arch_restore_pmcs()
20165+ */
20166+ ctx->flags.started = 0;
20167+}
20168+
20169+/**
20170+ * pfm_arch_intr_unfreeze_pmu - conditionally reactivate monitoring
20171+ * @ctx: current context
20172+ *
20173+ * current context may be NULL when dealing with spurious interrupts
20174+ *
20175+ * Must re-activate monitoring if context is not MASKED.
20176+ * interrupts are masked.
20177+ */
20178+static inline void pfm_arch_intr_unfreeze_pmu(struct pfm_context *ctx)
20179+{
20180+ if (ctx == NULL)
20181+ return;
20182+
20183+ PFM_DBG_ovfl("state=%d", ctx->state);
20184+
20185+ /*
20186+ * restore flags.started which is cleared in
20187+ * pfm_arch_intr_freeze_pmu()
20188+ */
20189+ ctx->flags.started = 1;
20190+
20191+ if (ctx->state == PFM_CTX_MASKED)
20192+ return;
20193+
20194+ pfm_arch_restore_pmcs(ctx, ctx->active_set);
20195+}
20196+
20197+/**
20198+ * pfm_arch_setfl_sane - check arch/model specific event set flags
20199+ * @ctx: context to work on
20200+ * @flags: event set flags as passed by user
20201+ *
20202+ * called from pfm_setfl_sane(). Context is locked. Interrupts are masked.
20203+ *
20204+ * Return:
20205+ * 0 when flags are valid
20206+ * 1 on error
20207+ */
20208+static inline int pfm_arch_setfl_sane(struct pfm_context *ctx, u32 flags)
20209+{
20210+ return 0;
20211+}
20212+
20213+/**
20214+ * pfm_arch_ovfl_reset_pmd - reset pmd on overflow
20215+ * @ctx: current context
20216+ * @cnum: PMD index
20217+ *
20218+ * On some CPUs, the upper bits of a counter must be set in order for the
20219+ * overflow interrupt to happen. On overflow, the counter has wrapped around,
20220+ * and the upper bits are cleared. This function may be used to set them back.
20221+ *
20222+ * For x86, the current version loses whatever remains in the counter,
20223+ * which is usually a small count. In order not to lose this count,
20224+ * we do a read-modify-write to set the upper bits while preserving the
20225+ * low-order bits. This is slow but works.
20226+ */
20227+static inline void pfm_arch_ovfl_reset_pmd(struct pfm_context *ctx, unsigned int cnum)
20228+{
20229+ u64 val;
20230+ val = pfm_arch_read_pmd(ctx, cnum);
20231+ pfm_arch_write_pmd(ctx, cnum, val);
20232+}
20233+
20234+/**
20235+ * pfm_arch_context_create - create context
20236+ * @ctx: newly created context
20237+ * @flags: context flags as passed by user
20238+ *
20239+ * called from __pfm_create_context()
20240+ */
20241+static inline int pfm_arch_context_create(struct pfm_context *ctx, u32 ctx_flags)
20242+{
20243+ struct pfm_arch_pmu_info *pmu_info;
20244+
20245+ pmu_info = pfm_pmu_info();
20246+
20247+ if (pmu_info->create_context)
20248+ return pmu_info->create_context(ctx, ctx_flags);
20249+
20250+ return 0;
20251+}
20252+
20253+/**
20254+ * pfm_arch_context_free - free context
20255+ * @ctx: context to free
20256+ */
20257+static inline void pfm_arch_context_free(struct pfm_context *ctx)
20258+{
20259+ struct pfm_arch_pmu_info *pmu_info;
20260+
20261+ pmu_info = pfm_pmu_info();
20262+
20263+ if (pmu_info->free_context)
20264+ pmu_info->free_context(ctx);
20265+}
20266+
20267+/*
20268+ * pfm_arch_clear_pmd_ovfl_cond - alter the pmds in such a way that they
20269+ * will not cause interrupts when unused.
20270+ *
20271+ * This is a nop on x86
20272+ */
20273+static inline void pfm_arch_clear_pmd_ovfl_cond(struct pfm_context *ctx,
20274+ struct pfm_event_set *set)
20275+{}
20276+
20277+/*
20278+ * functions implemented in arch/x86/perfmon/perfmon.c
20279+ */
20280+int pfm_arch_init(void);
20281+void pfm_arch_resend_irq(struct pfm_context *ctx);
20282+
20283+int pfm_arch_ctxswout_thread(struct task_struct *task, struct pfm_context *ctx);
20284+void pfm_arch_ctxswin_thread(struct task_struct *task, struct pfm_context *ctx);
20285+
20286+void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set);
20287+int pfm_arch_pmu_config_init(struct pfm_pmu_config *cfg);
20288+void pfm_arch_pmu_config_remove(void);
20289+char *pfm_arch_get_pmu_module_name(void);
20290+int pfm_arch_pmu_acquire(u64 *unavail_pmcs, u64 *unavail_pmds);
20291+void pfm_arch_pmu_release(void);
20292+
20293+/*
20294+ * pfm_arch_serialize - make PMU modifications visible to subsequent instructions
20295+ *
20296+ * This is a nop on x86
20297+ */
20298+static inline void pfm_arch_serialize(void)
20299+{}
20300+
20301+/*
20302+ * on x86, the PMDs are already saved by pfm_arch_freeze_pmu()
20303+ * when entering the PMU interrupt handler, thus, we do not need
20304+ * to save them again in pfm_switch_sets_from_intr()
20305+ */
20306+static inline void pfm_arch_save_pmds_from_intr(struct pfm_context *ctx,
20307+ struct pfm_event_set *set)
20308+{}
20309+
20310+
20311+static inline void pfm_arch_ctxswout_sys(struct task_struct *task,
20312+ struct pfm_context *ctx)
20313+{}
20314+
20315+static inline void pfm_arch_ctxswin_sys(struct task_struct *task,
20316+ struct pfm_context *ctx)
20317+{}
20318+
20319+static inline void pfm_arch_init_percpu(void)
20320+{}
20321+
20322+static inline void pfm_cacheflush(void *addr, unsigned int len)
20323+{}
20324+
20325+/*
20326+ * this function is called from the PMU interrupt handler ONLY.
20327+ * On x86, the PMU is frozen via pfm_arch_stop(); masking would be implemented
20328+ * via pfm_arch_stop() as well. Given that the PMU is already stopped when
20329+ * entering the interrupt handler, we do not need to stop it again, so
20330+ * this function is a nop.
20331+ */
20332+static inline void pfm_arch_mask_monitoring(struct pfm_context *ctx,
20333+ struct pfm_event_set *set)
20334+{}
20335+
20336+
20337+static inline void pfm_arch_arm_handle_work(struct task_struct *task)
20338+{}
20339+
20340+static inline void pfm_arch_disarm_handle_work(struct task_struct *task)
20341+{}
20342+
20343+static inline int pfm_arch_get_base_syscall(void)
20344+{
20345+#ifdef __x86_64__
20346+ /* 32-bit syscall definition coming from ia32_unistd.h */
20347+ if (test_thread_flag(TIF_IA32))
20348+ return __NR_ia32_pfm_create_context;
20349+#endif
20350+ return __NR_pfm_create_context;
20351+}
20352+
20353+#define PFM_ARCH_CTX_SIZE (sizeof(struct pfm_arch_context))
20354+/*
20355+ * x86 does not need extra alignment requirements for the sampling buffer
20356+ */
20357+#define PFM_ARCH_SMPL_ALIGN_SIZE 0
20358+
20359+asmlinkage void pmu_interrupt(void);
20360+
20361+#endif /* CONFIG_PERFMON */
20362+
20363+#endif /* _ASM_X86_PERFMON_KERN_H_ */
20364--- /dev/null
20365+++ b/include/asm-x86/perfmon_pebs_core_smpl.h
20366@@ -0,0 +1,164 @@
20367+/*
20368+ * Copyright (c) 2005-2007 Hewlett-Packard Development Company, L.P.
20369+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
20370+ *
20371+ * This program is free software; you can redistribute it and/or
20372+ * modify it under the terms of version 2 of the GNU General Public
20373+ * License as published by the Free Software Foundation.
20374+ *
20375+ * This program is distributed in the hope that it will be useful,
20376+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
20377+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20378+ * General Public License for more details.
20379+ *
20380+ * You should have received a copy of the GNU General Public License
20381+ * along with this program; if not, write to the Free Software
20382+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
20383+ * 02111-1307 USA
20384+ *
20385+ * This file implements the sampling format to support Intel
20386+ * Precise Event Based Sampling (PEBS) feature of Intel Core
20387+ * processors, such as Intel Core 2.
20388+ *
20389+ * What is PEBS?
20390+ * ------------
20391+ * This is a hardware feature to enhance sampling by providing
20392+ * better precision as to where a sample is taken. This avoids the
20393+ * typical skew in the instruction one can observe with any
20394+ * interrupt-based sampling technique.
20395+ *
20396+ * PEBS also lowers sampling overhead significantly by having the
20397+ * processor store samples instead of the OS. PMU interrupts are only
20398+ * generated after multiple samples are written.
20399+ *
20400+ * Another benefit of PEBS is that samples can be captured inside
20401+ * critical sections where interrupts are masked.
20402+ *
20403+ * How does it work?
20404+ * PEBS effectively implements a hardware buffer. The OS must pass a region
20405+ * of memory where samples are to be stored. The region can have any
20406+ * size. The OS must also specify the counter reset (reload) value. The PMU
20407+ * will interrupt when it reaches the end of the buffer or a specified
20408+ * threshold location inside the memory region.
20409+ *
20410+ * The description of the buffer is stored in the Data Save Area (DS).
20411+ * The samples are stored sequentially in the buffer. The format of the
20412+ * buffer is fixed and specified in the PEBS documentation. The sample
20413+ * format does not change between 32-bit and 64-bit modes unlike on the
20414+ * Pentium 4 version of PEBS.
20415+ *
20416+ * PEBS does not work when HyperThreading is enabled due to certain MSRs
20417+ * being shared between the two threads.
20418+ *
20419+ * What does the format do?
20420+ * It provides access to the PEBS feature for both 32-bit and 64-bit
20421+ * processors that support it.
20422+ *
20423+ * The same code and data structures are used for both 32-bit and 64-bit
20424+ * modes. A single format name is used for both modes. In 32-bit mode,
20425+ * some of the extended registers are written to zero in each sample.
20426+ *
20427+ * It is important to realize that the format provides a zero-copy
20428+ * environment for the samples, i.e., the OS never touches the
20429+ * samples. Whatever the processor writes is directly accessible to
20430+ * the user.
20431+ *
20432+ * Parameters to the buffer can be passed via pfm_create_context() in
20433+ * the pfm_pebs_smpl_arg structure.
20434+ */
20435+#ifndef __PERFMON_PEBS_CORE_SMPL_H__
20436+#define __PERFMON_PEBS_CORE_SMPL_H__ 1
20437+
20438+/*
20439+ * The 32-bit and 64-bit formats are identical, thus we use only
20440+ * one name for the format.
20441+ */
20442+#define PFM_PEBS_CORE_SMPL_NAME "pebs_core"
20443+
20444+/*
20445+ * format specific parameters (passed at context creation)
20446+ *
20447+ * intr_thres: index from start of buffer of entry where the
20448+ * PMU interrupt must be triggered. It must be several samples
20449+ * short of the end of the buffer.
20450+ */
20451+struct pfm_pebs_core_smpl_arg {
20452+ u64 cnt_reset; /* counter reset value */
20453+ size_t buf_size; /* size of the PEBS buffer in bytes */
20454+ size_t intr_thres;/* index of PEBS interrupt threshold entry */
20455+ u64 reserved[6]; /* for future use */
20456+};
20457+
20458+/*
20459+ * Data Save Area (32 and 64-bit mode)
20460+ *
20461+ * The DS area is exposed to the user. To determine the number
20462+ * of samples available in PEBS, it is necessary to subtract
20463+ * pebs_buf_base from pebs_index.
20464+ *
20465+ * Layout of the structure is mandated by hardware and specified
20466+ * in the Intel documentation.
20467+ */
20468+struct pfm_ds_area_core {
20469+ u64 bts_buf_base;
20470+ u64 bts_index;
20471+ u64 bts_abs_max;
20472+ u64 bts_intr_thres;
20473+ u64 pebs_buf_base;
20474+ u64 pebs_index;
20475+ u64 pebs_abs_max;
20476+ u64 pebs_intr_thres;
20477+ u64 pebs_cnt_reset;
20478+};
20479+
20480+/*
20481+ * This header is at the beginning of the sampling buffer returned to the user.
20482+ *
20483+ * Because of PEBS alignment constraints, the actual PEBS buffer area does
20484+ * not necessarily begin right after the header. The start_offs field must be
20485+ * used to compute the first byte of the buffer. The offset is defined as
20486+ * the number of bytes between the end of the header and the beginning of
20487+ * the buffer. As such the formula is:
20488+ * actual_buffer = (unsigned long)(hdr+1)+hdr->start_offs
20489+ */
20490+struct pfm_pebs_core_smpl_hdr {
20491+ u64 overflows; /* #overflows for buffer */
20492+ size_t buf_size; /* bytes in the buffer */
20493+ size_t start_offs; /* actual buffer start offset */
20494+ u32 version; /* smpl format version */
20495+ u32 reserved1; /* for future use */
20496+ u64 reserved2[5]; /* for future use */
20497+ struct pfm_ds_area_core ds; /* data save area */
20498+};
20499+
20500+/*
20501+ * Sample format as mandated by Intel documentation.
20502+ * The same format is used in both 32 and 64 bit modes.
20503+ */
20504+struct pfm_pebs_core_smpl_entry {
20505+ u64 eflags;
20506+ u64 ip;
20507+ u64 eax;
20508+ u64 ebx;
20509+ u64 ecx;
20510+ u64 edx;
20511+ u64 esi;
20512+ u64 edi;
20513+ u64 ebp;
20514+ u64 esp;
20515+ u64 r8; /* 0 in 32-bit mode */
20516+ u64 r9; /* 0 in 32-bit mode */
20517+ u64 r10; /* 0 in 32-bit mode */
20518+ u64 r11; /* 0 in 32-bit mode */
20519+ u64 r12; /* 0 in 32-bit mode */
20520+ u64 r13; /* 0 in 32-bit mode */
20521+ u64 r14; /* 0 in 32-bit mode */
20522+ u64 r15; /* 0 in 32-bit mode */
20523+};
20524+
20525+#define PFM_PEBS_CORE_SMPL_VERSION_MAJ 1U
20526+#define PFM_PEBS_CORE_SMPL_VERSION_MIN 0U
20527+#define PFM_PEBS_CORE_SMPL_VERSION (((PFM_PEBS_CORE_SMPL_VERSION_MAJ&0xffff)<<16)|\
20528+ (PFM_PEBS_CORE_SMPL_VERSION_MIN & 0xffff))
20529+
20530+#endif /* __PERFMON_PEBS_CORE_SMPL_H__ */
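Illustrative sketch (editorial, not part of the patch): how user code could consume the header defined above once the sampling buffer has been mapped. It assumes the exported header is installed as perfmon_pebs_core_smpl.h and that kernel-style u64/size_t typedefs are provided; the walk relies on start_offs and the ds.pebs_index/ds.pebs_buf_base fields described in the comments.

#include <stdio.h>
#include <stddef.h>
typedef unsigned long long u64;		/* the header uses kernel-style u64 */
#include "perfmon_pebs_core_smpl.h"	/* install path is an assumption */

/* hypothetical helper: print the samples currently recorded by PEBS */
static void walk_pebs_core_buffer(void *buf)
{
	struct pfm_pebs_core_smpl_hdr *hdr = buf;
	/* the PEBS area starts start_offs bytes after the header */
	struct pfm_pebs_core_smpl_entry *ent =
		(void *)((unsigned long)(hdr + 1) + hdr->start_offs);
	/* pebs_index points past the last record written by the CPU */
	unsigned long n = (hdr->ds.pebs_index - hdr->ds.pebs_buf_base)
			  / sizeof(*ent);
	unsigned long i;

	for (i = 0; i < n; i++)
		printf("sample %lu: ip=0x%llx\n", i,
		       (unsigned long long)ent[i].ip);
}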
20531--- /dev/null
20532+++ b/include/asm-x86/perfmon_pebs_p4_smpl.h
20533@@ -0,0 +1,193 @@
20534+/*
20535+ * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P.
20536+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
20537+ *
20538+ * This program is free software; you can redistribute it and/or
20539+ * modify it under the terms of version 2 of the GNU General Public
20540+ * License as published by the Free Software Foundation.
20541+ *
20542+ * This program is distributed in the hope that it will be useful,
20543+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
20544+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20545+ * General Public License for more details.
20546+ *
20547+ * You should have received a copy of the GNU General Public License
20548+ * along with this program; if not, write to the Free Software
20549+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
20550+ * 02111-1307 USA
20551+ *
20552+ * This file implements the sampling format to support Intel
20553+ * Precise Event Based Sampling (PEBS) feature of Pentium 4
20554+ * and other Netburst-based processors. Not to be used for
20555+ * Intel Core-based processors.
20556+ *
20557+ * What is PEBS?
20558+ * ------------
20559+ * This is a hardware feature to enhance sampling by providing
20560+ * better precision as to where a sample is taken. This avoids the
20561+ * typical skew in the instruction one can observe with any
20562+ * interrupt-based sampling technique.
20563+ *
20564+ * PEBS also lowers sampling overhead significantly by having the
20565+ * processor store samples instead of the OS. PMU interrupts are only
20566+ * generated after multiple samples are written.
20567+ *
20568+ * Another benefit of PEBS is that samples can be captured inside
20569+ * critical sections where interrupts are masked.
20570+ *
20571+ * How does it work?
20572+ * PEBS effectively implements a hardware buffer. The OS must pass a region
20573+ * of memory where samples are to be stored. The region can have any
20574+ * size. The OS must also specify the counter reset (reload) value. The PMU
20575+ * will interrupt when it reaches the end of the buffer or a specified
20576+ * threshold location inside the memory region.
20577+ *
20578+ * The description of the buffer is stored in the Data Save Area (DS).
20579+ * The samples are stored sequentially in the buffer. The format of the
20580+ * buffer is fixed and specified in the PEBS documentation. The sample
20581+ * format changes between 32-bit and 64-bit modes due to the extended
20582+ * register file.
20583+ *
20584+ * PEBS does not work when HyperThreading is enabled due to certain MSRs
20585+ * being shared between the two threads.
20586+ *
20587+ * What does the format do?
20588+ * It provides access to the PEBS feature for both 32-bit and 64-bit
20589+ * processors that support it.
20590+ *
20591+ * The same code is used for both 32-bit and 64-bit modes, but different
20592+ * format names are used because the two modes are not compatible due to
20593+ * data model and register file differences. Similarly the public data
20594+ * structures describing the samples are different.
20595+ *
20596+ * It is important to realize that the format provides a zero-copy environment
20597+ * for the samples, i.e., the OS never touches the samples. Whatever the
20598+ * processor writes is directly accessible to the user.
20599+ *
20600+ * Parameters to the buffer can be passed via pfm_create_context() in
20601+ * the pfm_pebs_smpl_arg structure.
20602+ *
20603+ * It is not possible to run a 32-bit PEBS application on top of a 64-bit
20604+ * host kernel.
20605+ */
20606+#ifndef __PERFMON_PEBS_P4_SMPL_H__
20607+#define __PERFMON_PEBS_P4_SMPL_H__ 1
20608+
20609+#ifdef __i386__
20610+/*
20611+ * The 32-bit and 64-bit formats are not compatible, thus we have
20612+ * two different identifications so that 32-bit programs running on
20613+ * 64-bit OS will fail to use the 64-bit PEBS support.
20614+ */
20615+#define PFM_PEBS_P4_SMPL_NAME "pebs32_p4"
20616+#else
20617+#define PFM_PEBS_P4_SMPL_NAME "pebs64_p4"
20618+#endif
20619+
20620+/*
20621+ * format specific parameters (passed at context creation)
20622+ *
20623+ * intr_thres: index from start of buffer of entry where the
20624+ * PMU interrupt must be triggered. It must be several samples
20625+ * short of the end of the buffer.
20626+ */
20627+struct pfm_pebs_p4_smpl_arg {
20628+ u64 cnt_reset; /* counter reset value */
20629+ size_t buf_size; /* size of the PEBS buffer in bytes */
20630+ size_t intr_thres;/* index of PEBS interrupt threshold entry */
20631+ u64 reserved[6]; /* for future use */
20632+};
20633+
20634+/*
20635+ * Data Save Area (32 and 64-bit mode)
20636+ *
20637+ * The DS area must be exposed to the user because this is the only
20638+ * way to report on the number of valid entries recorded by the CPU.
20639+ * This is required when the buffer is not full, i.e., there was no
20640+ * PMU interrupt.
20641+ *
20642+ * Layout of the structure is mandated by hardware and specified in
20643+ * the Intel documentation.
20644+ */
20645+struct pfm_ds_area_p4 {
20646+ unsigned long bts_buf_base;
20647+ unsigned long bts_index;
20648+ unsigned long bts_abs_max;
20649+ unsigned long bts_intr_thres;
20650+ unsigned long pebs_buf_base;
20651+ unsigned long pebs_index;
20652+ unsigned long pebs_abs_max;
20653+ unsigned long pebs_intr_thres;
20654+ u64 pebs_cnt_reset;
20655+};
20656+
20657+/*
20658+ * This header is at the beginning of the sampling buffer returned to the user.
20659+ *
20660+ * Because of PEBS alignment constraints, the actual PEBS buffer area does
20661+ * not necessarily begin right after the header. The start_offs field must be
20662+ * used to compute the first byte of the buffer. The offset is defined as
20663+ * the number of bytes between the end of the header and the beginning of
20664+ * the buffer. As such the formula is:
20665+ * actual_buffer = (unsigned long)(hdr+1)+hdr->start_offs
20666+ */
20667+struct pfm_pebs_p4_smpl_hdr {
20668+ u64 overflows; /* #overflows for buffer */
20669+ size_t buf_size; /* bytes in the buffer */
20670+ size_t start_offs; /* actual buffer start offset */
20671+ u32 version; /* smpl format version */
20672+ u32 reserved1; /* for future use */
20673+ u64 reserved2[5]; /* for future use */
20674+ struct pfm_ds_area_p4 ds; /* data save area */
20675+};
20676+
20677+/*
20678+ * 64-bit PEBS record format is described in
20679+ * http://www.intel.com/technology/64bitextensions/30083502.pdf
20680+ *
20681+ * The format does not peek at samples. The sample structure is only
20682+ * used to ensure that the buffer is large enough to accommodate one
20683+ * sample.
20684+ */
20685+#ifdef __i386__
20686+struct pfm_pebs_p4_smpl_entry {
20687+ u32 eflags;
20688+ u32 ip;
20689+ u32 eax;
20690+ u32 ebx;
20691+ u32 ecx;
20692+ u32 edx;
20693+ u32 esi;
20694+ u32 edi;
20695+ u32 ebp;
20696+ u32 esp;
20697+};
20698+#else
20699+struct pfm_pebs_p4_smpl_entry {
20700+ u64 eflags;
20701+ u64 ip;
20702+ u64 eax;
20703+ u64 ebx;
20704+ u64 ecx;
20705+ u64 edx;
20706+ u64 esi;
20707+ u64 edi;
20708+ u64 ebp;
20709+ u64 esp;
20710+ u64 r8;
20711+ u64 r9;
20712+ u64 r10;
20713+ u64 r11;
20714+ u64 r12;
20715+ u64 r13;
20716+ u64 r14;
20717+ u64 r15;
20718+};
20719+#endif
20720+
20721+#define PFM_PEBS_P4_SMPL_VERSION_MAJ 1U
20722+#define PFM_PEBS_P4_SMPL_VERSION_MIN 0U
20723+#define PFM_PEBS_P4_SMPL_VERSION (((PFM_PEBS_P4_SMPL_VERSION_MAJ&0xffff)<<16)|\
20724+ (PFM_PEBS_P4_SMPL_VERSION_MIN & 0xffff))
20725+
20726+#endif /* __PERFMON_PEBS_P4_SMPL_H__ */
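Editorial sketch (not part of the patch): filling the P4 format's creation argument; the reload value, buffer size, and threshold below are illustrative assumptions, with intr_thres kept a few entries short of the end of the buffer as required above.

#include <string.h>
#include <stddef.h>
typedef unsigned long long u64;		/* kernel-style type used by the header */
#include "perfmon_pebs_p4_smpl.h"	/* install path is an assumption */

/* hypothetical: prepare the format name and argument for pfm_create_context() */
static void prepare_pebs_p4_args(char *name, size_t name_len,
				 struct pfm_pebs_p4_smpl_arg *arg)
{
	/* resolves to "pebs32_p4" or "pebs64_p4" depending on the build */
	strncpy(name, PFM_PEBS_P4_SMPL_NAME, name_len - 1);
	name[name_len - 1] = '\0';

	memset(arg, 0, sizeof(*arg));
	arg->cnt_reset = (u64)-2000;	/* assumed sampling period */
	arg->buf_size = 64 * 1024;	/* assumed buffer size */
	/* interrupt a few entries before the end of the buffer */
	arg->intr_thres = arg->buf_size
			  / sizeof(struct pfm_pebs_p4_smpl_entry) - 4;
}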
20727--- a/include/asm-x86/thread_info.h
20728+++ b/include/asm-x86/thread_info.h
20729@@ -79,6 +79,7 @@ struct thread_info {
20730 #define TIF_SYSCALL_EMU 6 /* syscall emulation active */
20731 #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
20732 #define TIF_SECCOMP 8 /* secure computing */
20733+#define TIF_PERFMON_WORK 9 /* work for pfm_handle_work() */
20734 #define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */
20735 #define TIF_NOTSC 16 /* TSC is not accessible in userland */
20736 #define TIF_IA32 17 /* 32bit process */
20737@@ -92,6 +93,7 @@ struct thread_info {
20738 #define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */
20739 #define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */
20740 #define TIF_BTS_TRACE_TS 27 /* record scheduling event timestamps */
20741+#define TIF_PERFMON_CTXSW 28 /* perfmon needs ctxsw calls */
20742
20743 #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
20744 #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
20745@@ -114,6 +116,8 @@ struct thread_info {
20746 #define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR)
20747 #define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR)
20748 #define _TIF_BTS_TRACE_TS (1 << TIF_BTS_TRACE_TS)
20749+#define _TIF_PERFMON_WORK (1 << TIF_PERFMON_WORK)
20750+#define _TIF_PERFMON_CTXSW (1 << TIF_PERFMON_CTXSW)
20751
20752 /* work to do in syscall_trace_enter() */
20753 #define _TIF_WORK_SYSCALL_ENTRY \
20754@@ -135,12 +139,12 @@ struct thread_info {
20755
20756 /* Only used for 64 bit */
20757 #define _TIF_DO_NOTIFY_MASK \
20758- (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME)
20759+ (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME|_TIF_PERFMON_WORK)
20760
20761 /* flags to check in __switch_to() */
20762 #define _TIF_WORK_CTXSW \
20763 (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_BTS_TRACE_TS| \
20764- _TIF_NOTSC)
20765+ _TIF_NOTSC|_TIF_PERFMON_CTXSW)
20766
20767 #define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
20768 #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
20769--- a/include/asm-x86/unistd_32.h
20770+++ b/include/asm-x86/unistd_32.h
20771@@ -338,9 +338,23 @@
20772 #define __NR_dup3 330
20773 #define __NR_pipe2 331
20774 #define __NR_inotify_init1 332
20775+#define __NR_pfm_create_context 333
20776+#define __NR_pfm_write_pmcs (__NR_pfm_create_context+1)
20777+#define __NR_pfm_write_pmds (__NR_pfm_create_context+2)
20778+#define __NR_pfm_read_pmds (__NR_pfm_create_context+3)
20779+#define __NR_pfm_load_context (__NR_pfm_create_context+4)
20780+#define __NR_pfm_start (__NR_pfm_create_context+5)
20781+#define __NR_pfm_stop (__NR_pfm_create_context+6)
20782+#define __NR_pfm_restart (__NR_pfm_create_context+7)
20783+#define __NR_pfm_create_evtsets (__NR_pfm_create_context+8)
20784+#define __NR_pfm_getinfo_evtsets (__NR_pfm_create_context+9)
20785+#define __NR_pfm_delete_evtsets (__NR_pfm_create_context+10)
20786+#define __NR_pfm_unload_context (__NR_pfm_create_context+11)
20787
20788 #ifdef __KERNEL__
20789
20790+#define NR_syscalls 345
20791+
20792 #define __ARCH_WANT_IPC_PARSE_VERSION
20793 #define __ARCH_WANT_OLD_READDIR
20794 #define __ARCH_WANT_OLD_STAT
20795--- a/include/asm-x86/unistd_64.h
20796+++ b/include/asm-x86/unistd_64.h
20797@@ -653,7 +653,30 @@ __SYSCALL(__NR_dup3, sys_dup3)
20798 __SYSCALL(__NR_pipe2, sys_pipe2)
20799 #define __NR_inotify_init1 294
20800 __SYSCALL(__NR_inotify_init1, sys_inotify_init1)
20801-
20802+#define __NR_pfm_create_context 295
20803+__SYSCALL(__NR_pfm_create_context, sys_pfm_create_context)
20804+#define __NR_pfm_write_pmcs (__NR_pfm_create_context+1)
20805+__SYSCALL(__NR_pfm_write_pmcs, sys_pfm_write_pmcs)
20806+#define __NR_pfm_write_pmds (__NR_pfm_create_context+2)
20807+__SYSCALL(__NR_pfm_write_pmds, sys_pfm_write_pmds)
20808+#define __NR_pfm_read_pmds (__NR_pfm_create_context+3)
20809+__SYSCALL(__NR_pfm_read_pmds, sys_pfm_read_pmds)
20810+#define __NR_pfm_load_context (__NR_pfm_create_context+4)
20811+__SYSCALL(__NR_pfm_load_context, sys_pfm_load_context)
20812+#define __NR_pfm_start (__NR_pfm_create_context+5)
20813+__SYSCALL(__NR_pfm_start, sys_pfm_start)
20814+#define __NR_pfm_stop (__NR_pfm_create_context+6)
20815+__SYSCALL(__NR_pfm_stop, sys_pfm_stop)
20816+#define __NR_pfm_restart (__NR_pfm_create_context+7)
20817+__SYSCALL(__NR_pfm_restart, sys_pfm_restart)
20818+#define __NR_pfm_create_evtsets (__NR_pfm_create_context+8)
20819+__SYSCALL(__NR_pfm_create_evtsets, sys_pfm_create_evtsets)
20820+#define __NR_pfm_getinfo_evtsets (__NR_pfm_create_context+9)
20821+__SYSCALL(__NR_pfm_getinfo_evtsets, sys_pfm_getinfo_evtsets)
20822+#define __NR_pfm_delete_evtsets (__NR_pfm_create_context+10)
20823+__SYSCALL(__NR_pfm_delete_evtsets, sys_pfm_delete_evtsets)
20824+#define __NR_pfm_unload_context (__NR_pfm_create_context+11)
20825+__SYSCALL(__NR_pfm_unload_context, sys_pfm_unload_context)
20826
20827 #ifndef __NO_STUBS
20828 #define __ARCH_WANT_OLD_READDIR
20829--- a/include/linux/Kbuild
20830+++ b/include/linux/Kbuild
20831@@ -163,6 +163,8 @@ header-y += video_decoder.h
20832 header-y += video_encoder.h
20833 header-y += videotext.h
20834 header-y += x25.h
20835+header-y += perfmon.h
20836+header-y += perfmon_dfl_smpl.h
20837
20838 unifdef-y += acct.h
20839 unifdef-y += adb.h
20840--- /dev/null
20841+++ b/include/linux/perfmon.h
20842@@ -0,0 +1,213 @@
20843+/*
20844+ * Copyright (c) 2001-2006 Hewlett-Packard Development Company, L.P.
20845+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
20846+ *
20847+ * This program is free software; you can redistribute it and/or
20848+ * modify it under the terms of version 2 of the GNU General Public
20849+ * License as published by the Free Software Foundation.
20850+ *
20851+ * This program is distributed in the hope that it will be useful,
20852+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
20853+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20854+ * General Public License for more details.
20855+ *
20856+ * You should have received a copy of the GNU General Public License
20857+ * along with this program; if not, write to the Free Software
20858+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
20859+ * 02111-1307 USA
20860+ */
20861+
20862+#ifndef __LINUX_PERFMON_H__
20863+#define __LINUX_PERFMON_H__
20864+
20865+/*
20866+ * This file contains all the user visible generic definitions for the
20867+ * interface. Model-specific user-visible definitions are located in
20868+ * the asm/perfmon.h file.
20869+ */
20870+
20871+/*
20872+ * include arch-specific user interface definitions
20873+ */
20874+#include <asm/perfmon.h>
20875+
20876+/*
20877+ * defined by each arch
20878+ */
20879+#define PFM_MAX_PMCS PFM_ARCH_MAX_PMCS
20880+#define PFM_MAX_PMDS PFM_ARCH_MAX_PMDS
20881+
20882+/*
20883+ * number of elements for each type of bitvector
20884+ * all bitvectors use u64 fixed size type on all architectures.
20885+ */
20886+#define PFM_BVSIZE(x) (((x)+(sizeof(__u64)<<3)-1) / (sizeof(__u64)<<3))
20887+#define PFM_PMD_BV PFM_BVSIZE(PFM_MAX_PMDS)
20888+#define PFM_PMC_BV PFM_BVSIZE(PFM_MAX_PMCS)
20889+
20890+/*
20891+ * register flags layout:
20892+ * bit[00-15] : generic flags
20893+ * bit[16-31] : arch-specific flags
20894+ *
20895+ * PFM_REGFL_NO_EMUL64: must be set on the PMC controlling the PMD
20896+ */
20897+#define PFM_REGFL_OVFL_NOTIFY 0x1 /* PMD: send notification on event */
20898+#define PFM_REGFL_RANDOM 0x2 /* PMD: randomize value after event */
20899+#define PFM_REGFL_NO_EMUL64 0x4 /* PMC: no 64-bit emulation */
20900+
20901+/*
20902+ * event set flags layout:
20903+ * bits[00-15] : generic flags
20904+ * bits[16-31] : arch-specific flags (see asm/perfmon.h)
20905+ */
20906+#define PFM_SETFL_OVFL_SWITCH 0x01 /* enable switch on overflow */
20907+#define PFM_SETFL_TIME_SWITCH 0x02 /* enable switch on timeout */
20908+
20909+/*
20910+ * argument to pfm_create_context() system call
20911+ * structure shared with user level
20912+ */
20913+struct pfarg_ctx {
20914+ __u32 ctx_flags; /* noblock/block/syswide */
20915+ __u32 ctx_reserved1; /* for future use */
20916+ __u64 ctx_reserved2[7]; /* for future use */
20917+};
20918+
20919+/*
20920+ * context flags layout:
20921+ * bits[00-15]: generic flags
20922+ * bits[16-31]: arch-specific flags (see perfmon_const.h)
20923+ */
20924+#define PFM_FL_NOTIFY_BLOCK 0x01 /* block task on user notifications */
20925+#define PFM_FL_SYSTEM_WIDE 0x02 /* create a system wide context */
20926+#define PFM_FL_OVFL_NO_MSG 0x80 /* no overflow msgs */
20927+
20928+/*
20929+ * argument to pfm_write_pmcs() system call.
20930+ * structure shared with user level
20931+ */
20932+struct pfarg_pmc {
20933+ __u16 reg_num; /* which register */
20934+ __u16 reg_set; /* event set for this register */
20935+ __u32 reg_flags; /* REGFL flags */
20936+ __u64 reg_value; /* pmc value */
20937+ __u64 reg_reserved2[4]; /* for future use */
20938+};
20939+
20940+/*
20941+ * argument to pfm_write_pmds() and pfm_read_pmds() system calls.
20942+ * structure shared with user level
20943+ */
20944+struct pfarg_pmd {
20945+ __u16 reg_num; /* which register */
20946+ __u16 reg_set; /* event set for this register */
20947+ __u32 reg_flags; /* REGFL flags */
20948+ __u64 reg_value; /* initial pmc/pmd value */
20949+ __u64 reg_long_reset; /* value to reload after notification */
20950+ __u64 reg_short_reset; /* reset after counter overflow */
20951+ __u64 reg_last_reset_val; /* return: PMD last reset value */
20952+ __u64 reg_ovfl_switch_cnt; /* #overflows before switch */
20953+ __u64 reg_reset_pmds[PFM_PMD_BV]; /* reset on overflow */
20954+ __u64 reg_smpl_pmds[PFM_PMD_BV]; /* record in sample */
20955+ __u64 reg_smpl_eventid; /* opaque event identifier */
20956+ __u64 reg_random_mask; /* bitmask used to limit random value */
20957+ __u32 reg_random_seed; /* seed for randomization (OBSOLETE) */
20958+ __u32 reg_reserved2[7]; /* for future use */
20959+};
20960+
20961+/*
20962+ * optional argument to pfm_start() system call. Pass NULL if not needed.
20963+ * structure shared with user level
20964+ */
20965+struct pfarg_start {
20966+ __u16 start_set; /* event set to start with */
20967+ __u16 start_reserved1; /* for future use */
20968+ __u32 start_reserved2; /* for future use */
20969+ __u64 reserved3[3]; /* for future use */
20970+};
20971+
20972+/*
20973+ * argument to pfm_load_context() system call.
20974+ * structure shared with user level
20975+ */
20976+struct pfarg_load {
20977+ __u32 load_pid; /* thread or CPU to attach to */
20978+ __u16 load_set; /* set to load first */
20979+ __u16 load_reserved1; /* for future use */
20980+ __u64 load_reserved2[3]; /* for future use */
20981+};
20982+
20983+/*
20984+ * argument to pfm_create_evtsets() and pfm_delete_evtsets() system calls.
20985+ * structure shared with user level.
20986+ */
20987+struct pfarg_setdesc {
20988+ __u16 set_id; /* which set */
20989+ __u16 set_reserved1; /* for future use */
20990+ __u32 set_flags; /* SETFL flags */
20991+ __u64 set_timeout; /* switch timeout in nsecs */
20992+ __u64 reserved[6]; /* for future use */
20993+};
20994+
20995+/*
20996+ * argument to pfm_getinfo_evtsets() system call.
20997+ * structure shared with user level
20998+ */
20999+struct pfarg_setinfo {
21000+ __u16 set_id; /* which set */
21001+ __u16 set_reserved1; /* for future use */
21002+ __u32 set_flags; /* out: SETFL flags */
21003+ __u64 set_ovfl_pmds[PFM_PMD_BV]; /* out: last ovfl PMDs */
21004+ __u64 set_runs; /* out: #times the set was active */
21005+ __u64 set_timeout; /* out: eff/leftover timeout (nsecs) */
21006+ __u64 set_act_duration; /* out: time set was active in nsecs */
21007+ __u64 set_avail_pmcs[PFM_PMC_BV];/* out: available PMCs */
21008+ __u64 set_avail_pmds[PFM_PMD_BV];/* out: available PMDs */
21009+ __u64 set_reserved3[6]; /* for future use */
21010+};
21011+
21012+/*
21013+ * default value for the user and group security parameters in
21014+ * /proc/sys/kernel/perfmon/sys_group
21015+ * /proc/sys/kernel/perfmon/task_group
21016+ */
21017+#define PFM_GROUP_PERM_ANY -1 /* any user/group */
21018+
21019+/*
21020+ * overflow notification message.
21021+ * structure shared with user level
21022+ */
21023+struct pfarg_ovfl_msg {
21024+ __u32 msg_type; /* message type: PFM_MSG_OVFL */
21025+ __u32 msg_ovfl_pid; /* process id */
21026+ __u16 msg_active_set; /* active set at overflow */
21027+ __u16 msg_ovfl_cpu; /* cpu of PMU interrupt */
21028+ __u32 msg_ovfl_tid; /* thread id */
21029+ __u64 msg_ovfl_ip; /* IP on PMU intr */
21030+ __u64 msg_ovfl_pmds[PFM_PMD_BV];/* overflowed PMDs */
21031+};
21032+
21033+#define PFM_MSG_OVFL 1 /* an overflow happened */
21034+#define PFM_MSG_END 2 /* task to which context was attached ended */
21035+
21036+/*
21037+ * generic notification message (union).
21038+ * union shared with user level
21039+ */
21040+union pfarg_msg {
21041+ __u32 type;
21042+ struct pfarg_ovfl_msg pfm_ovfl_msg;
21043+};
21044+
21045+/*
21046+ * perfmon version number
21047+ */
21048+#define PFM_VERSION_MAJ 2U
21049+#define PFM_VERSION_MIN 82U
21050+#define PFM_VERSION (((PFM_VERSION_MAJ&0xffff)<<16)|\
21051+ (PFM_VERSION_MIN & 0xffff))
21052+#define PFM_VERSION_MAJOR(x) (((x)>>16) & 0xffff)
21053+#define PFM_VERSION_MINOR(x) ((x) & 0xffff)
21054+
21055+#endif /* __LINUX_PERFMON_H__ */
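Illustrative sketch (editorial, not part of the patch): a minimal self-monitoring session driven from user space with the structures above, invoked through syscall(2) and the __NR_pfm_* numbers added earlier. The exact trailing arguments of pfm_create_context() (sampling-format name, argument and size) and the PMC/PMD numbers and values are assumptions that depend on the PMU model.

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perfmon.h>	/* structures defined above */

/* hypothetical session: program one PMC/PMD pair, attach to self, start */
static int start_self_monitoring(void)
{
	struct pfarg_ctx ctx;
	struct pfarg_pmc pmc;
	struct pfarg_pmd pmd;
	struct pfarg_load load;
	int fd;

	memset(&ctx, 0, sizeof(ctx));
	/* no sampling format: name/arg/size assumed NULL/NULL/0 */
	fd = syscall(__NR_pfm_create_context, &ctx, NULL, NULL, 0);
	if (fd < 0)
		return -1;

	memset(&pmc, 0, sizeof(pmc));
	pmc.reg_num = 0;	/* placeholder: PMU-model specific */
	pmc.reg_value = 0;	/* placeholder event encoding */
	memset(&pmd, 0, sizeof(pmd));
	pmd.reg_num = 0;	/* counter paired with the PMC above */

	if (syscall(__NR_pfm_write_pmcs, fd, &pmc, 1) ||
	    syscall(__NR_pfm_write_pmds, fd, &pmd, 1))
		return -1;

	memset(&load, 0, sizeof(load));
	load.load_pid = getpid();	/* attach to the calling thread */
	if (syscall(__NR_pfm_load_context, fd, &load))
		return -1;

	return syscall(__NR_pfm_start, fd, NULL);
}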
21056--- /dev/null
21057+++ b/include/linux/perfmon_dfl_smpl.h
21058@@ -0,0 +1,78 @@
21059+/*
21060+ * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P.
21061+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
21062+ *
21063+ * This file implements the new dfl sampling buffer format
21064+ * for perfmon2 subsystem.
21065+ *
21066+ * This program is free software; you can redistribute it and/or
21067+ * modify it under the terms of version 2 of the GNU General Public
21068+ * License as published by the Free Software Foundation.
21069+ *
21070+ * This program is distributed in the hope that it will be useful,
21071+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
21072+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21073+ * General Public License for more details.
21074+ *
21075+ * You should have received a copy of the GNU General Public License
21076+ * along with this program; if not, write to the Free Software
21077+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21078+ * 02111-1307 USA
21079+ */
21080+#ifndef __PERFMON_DFL_SMPL_H__
21081+#define __PERFMON_DFL_SMPL_H__ 1
21082+
21083+/*
21084+ * format specific parameters (passed at context creation)
21085+ */
21086+struct pfm_dfl_smpl_arg {
21087+ __u64 buf_size; /* size of the buffer in bytes */
21088+ __u32 buf_flags; /* buffer specific flags */
21089+ __u32 reserved1; /* for future use */
21090+ __u64 reserved[6]; /* for future use */
21091+};
21092+
21093+/*
21094+ * This header is at the beginning of the sampling buffer returned to the user.
21095+ * It is directly followed by the first record.
21096+ */
21097+struct pfm_dfl_smpl_hdr {
21098+ __u64 hdr_count; /* how many valid entries */
21099+ __u64 hdr_cur_offs; /* current offset from top of buffer */
21100+ __u64 hdr_overflows; /* #overflows for buffer */
21101+ __u64 hdr_buf_size; /* bytes in the buffer */
21102+ __u64 hdr_min_buf_space;/* minimal buffer size (internal use) */
21103+ __u32 hdr_version; /* smpl format version */
21104+ __u32 hdr_buf_flags; /* copy of buf_flags */
21105+ __u64 hdr_reserved[10]; /* for future use */
21106+};
21107+
21108+/*
21109+ * Entry header in the sampling buffer. The header is directly followed
21110+ * with the values of the PMD registers of interest saved in increasing
21111+ * index order: PMD4, PMD5, and so on. How many PMDs are present depends
21112+ * on how the session was programmed.
21113+ *
21114+ * In the case where multiple counters overflow at the same time, multiple
21115+ * entries are written consecutively.
21116+ *
21117+ * last_reset_value member indicates the initial value of the overflowed PMD.
21118+ */
21119+struct pfm_dfl_smpl_entry {
21120+ __u32 pid; /* thread id (for NPTL, this is gettid()) */
21121+ __u16 ovfl_pmd; /* index of overflowed PMD for this sample */
21122+ __u16 reserved; /* for future use */
21123+ __u64 last_reset_val; /* initial value of overflowed PMD */
21124+ __u64 ip; /* where the overflow interrupt happened */
21125+ __u64 tstamp; /* overflow timestamp */
21126+ __u16 cpu; /* cpu on which the overflow occurred */
21127+ __u16 set; /* event set active when overflow occurred */
21128+ __u32 tgid; /* thread group id (getpid() for NPTL) */
21129+};
21130+
21131+#define PFM_DFL_SMPL_VERSION_MAJ 1U
21132+#define PFM_DFL_SMPL_VERSION_MIN 0U
21133+#define PFM_DFL_SMPL_VERSION (((PFM_DFL_SMPL_VERSION_MAJ&0xffff)<<16)|\
21134+ (PFM_DFL_SMPL_VERSION_MIN & 0xffff))
21135+
21136+#endif /* __PERFMON_DFL_SMPL_H__ */
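Illustrative sketch (editorial, not part of the patch): walking the default-format buffer described above. It assumes the number of PMD values recorded per entry (nsmpl_pmds) is known from how the session was programmed, since each entry is directly followed by that many 64-bit values.

#include <stdio.h>
#include <linux/types.h>
#include <linux/perfmon_dfl_smpl.h>	/* structures defined above */

/* hypothetical: print every sample currently in the buffer */
static void walk_dfl_buffer(void *buf, unsigned int nsmpl_pmds)
{
	struct pfm_dfl_smpl_hdr *hdr = buf;
	struct pfm_dfl_smpl_entry *ent = (void *)(hdr + 1);
	__u64 i;

	for (i = 0; i < hdr->hdr_count; i++) {
		__u64 *pmds = (__u64 *)(ent + 1);	/* PMD values follow the entry */

		printf("pid=%u pmd%u ip=0x%llx\n", ent->pid, ent->ovfl_pmd,
		       (unsigned long long)ent->ip);

		/* the next entry starts right after the recorded PMD values */
		ent = (void *)(pmds + nsmpl_pmds);
	}
}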
21137--- /dev/null
21138+++ b/include/linux/perfmon_fmt.h
21139@@ -0,0 +1,74 @@
21140+/*
21141+ * Copyright (c) 2001-2006 Hewlett-Packard Development Company, L.P.
21142+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
21143+ *
21144+ * Interface for custom sampling buffer format modules
21145+ *
21146+ * This program is free software; you can redistribute it and/or
21147+ * modify it under the terms of version 2 of the GNU General Public
21148+ * License as published by the Free Software Foundation.
21149+ *
21150+ * This program is distributed in the hope that it will be useful,
21151+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
21152+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21153+ * General Public License for more details.
21154+ *
21155+ * You should have received a copy of the GNU General Public License
21156+ * along with this program; if not, write to the Free Software
21157+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21158+ * 02111-1307 USA
21159+ */
21160+#ifndef __PERFMON_FMT_H__
21161+#define __PERFMON_FMT_H__ 1
21162+
21163+#include <linux/kobject.h>
21164+
21165+typedef int (*fmt_validate_t)(u32 flags, u16 npmds, void *arg);
21166+typedef int (*fmt_getsize_t)(u32 flags, void *arg, size_t *size);
21167+typedef int (*fmt_init_t)(struct pfm_context *ctx, void *buf, u32 flags,
21168+ u16 npmds, void *arg);
21169+typedef int (*fmt_restart_t)(int is_active, u32 *ovfl_ctrl, void *buf);
21170+typedef int (*fmt_exit_t)(void *buf);
21171+typedef int (*fmt_handler_t)(struct pfm_context *ctx,
21172+ unsigned long ip, u64 stamp, void *data);
21173+
21174+struct pfm_smpl_fmt {
21175+ char *fmt_name; /* name of the format (required) */
21176+ size_t fmt_arg_size; /* size of fmt args for ctx create */
21177+ u32 fmt_flags; /* format specific flags */
21178+ u32 fmt_version; /* format version number */
21179+
21180+ fmt_validate_t fmt_validate; /* validate context flags */
21181+ fmt_getsize_t fmt_getsize; /* get size for sampling buffer */
21182+ fmt_init_t fmt_init; /* initialize buffer area */
21183+ fmt_handler_t fmt_handler; /* overflow handler (required) */
21184+ fmt_restart_t fmt_restart; /* restart after notification */
21185+ fmt_exit_t fmt_exit; /* context termination */
21186+
21187+ struct list_head fmt_list; /* internal use only */
21188+
21189+ struct kobject kobj; /* sysfs internal use only */
21190+ struct module *owner; /* pointer to module owner */
21191+ u32 fmt_qdepth; /* Max notify queue depth (required) */
21192+};
21193+#define to_smpl_fmt(n) container_of(n, struct pfm_smpl_fmt, kobj)
21194+
21195+#define PFM_FMTFL_IS_BUILTIN 0x1 /* fmt is compiled in */
21196+/*
21197+ * we need to know whether the format is builtin or compiled
21198+ * as a module
21199+ */
21200+#ifdef MODULE
21201+#define PFM_FMT_BUILTIN_FLAG 0 /* built as a module */
21202+#else
21203+#define PFM_FMT_BUILTIN_FLAG PFM_PMUFL_IS_BUILTIN /* compiled into the kernel */
21204+#endif
21205+
21206+int pfm_fmt_register(struct pfm_smpl_fmt *fmt);
21207+int pfm_fmt_unregister(struct pfm_smpl_fmt *fmt);
21208+void pfm_sysfs_builtin_fmt_add(void);
21209+
21210+int pfm_sysfs_add_fmt(struct pfm_smpl_fmt *fmt);
21211+void pfm_sysfs_remove_fmt(struct pfm_smpl_fmt *fmt);
21212+
21213+#endif /* __PERFMON_FMT_H__ */
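Editorial sketch (not part of the patch): the skeleton of a kernel module that registers a custom sampling format through the interface above. The format name is a placeholder and the handler body is reduced to a comment; fmt_name, fmt_handler and fmt_qdepth are the fields marked required by the structure.

#include <linux/module.h>
#include <linux/perfmon_kern.h>

static int demo_fmt_handler(struct pfm_context *ctx, unsigned long ip,
			    u64 stamp, void *data)
{
	/* a real handler would copy a sample into its buffer here and
	 * indicate via the overflow control flags (e.g. PFM_OVFL_CTRL_RESET)
	 * what the core should do next */
	return 0;
}

static struct pfm_smpl_fmt demo_fmt = {
	.fmt_name    = "demo",			/* placeholder format name */
	.fmt_version = 0x10000,			/* 1.0 */
	.fmt_handler = demo_fmt_handler,	/* required */
	.fmt_qdepth  = PFM_MSGS_COUNT,		/* required */
	.fmt_flags   = PFM_FMT_BUILTIN_FLAG,
	.owner       = THIS_MODULE,
};

static int __init demo_fmt_init(void)
{
	return pfm_fmt_register(&demo_fmt);
}

static void __exit demo_fmt_exit(void)
{
	pfm_fmt_unregister(&demo_fmt);
}

module_init(demo_fmt_init);
module_exit(demo_fmt_exit);
MODULE_LICENSE("GPL");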
21214--- /dev/null
21215+++ b/include/linux/perfmon_kern.h
21216@@ -0,0 +1,551 @@
21217+/*
21218+ * Copyright (c) 2001-2006 Hewlett-Packard Development Company, L.P.
21219+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
21220+ *
21221+ * This program is free software; you can redistribute it and/or
21222+ * modify it under the terms of version 2 of the GNU General Public
21223+ * License as published by the Free Software Foundation.
21224+ *
21225+ * This program is distributed in the hope that it will be useful,
21226+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
21227+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21228+ * General Public License for more details.
21229+ *
21230+ * You should have received a copy of the GNU General Public License
21231+ * along with this program; if not, write to the Free Software
21232+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21233+ * 02111-1307 USA
21234+ */
21235+
21236+#ifndef __LINUX_PERFMON_KERN_H__
21237+#define __LINUX_PERFMON_KERN_H__
21238+/*
21239+ * This file contains all the definitions of data structures, variables, macros
21240+ * that are to be shared between generic code and arch-specific code
21241+ *
21242+ * For generic only definitions, use perfmon/perfmon_priv.h
21243+ */
21244+#ifdef CONFIG_PERFMON
21245+
21246+#include <linux/file.h>
21247+#include <linux/sched.h>
21248+#include <linux/perfmon.h>
21249+
21250+/*
21251+ * system administrator configuration controls available via
21252+ * the /sys/kernel/perfmon interface
21253+ */
21254+struct pfm_controls {
21255+ u32 debug; /* debugging control bitmask */
21256+ gid_t sys_group; /* gid to create a syswide context */
21257+ gid_t task_group; /* gid to create a per-task context */
21258+ u32 flags; /* control flags (see below) */
21259+ size_t arg_mem_max; /* maximum vector argument size */
21260+ size_t smpl_buffer_mem_max; /* max buf mem, -1 for infinity */
21261+};
21262+extern struct pfm_controls pfm_controls;
21263+
21264+/*
21265+ * control flags
21266+ */
21267+#define PFM_CTRL_FL_RW_EXPERT 0x1 /* bypass reserved fields on read/write */
21268+
21269+/*
21270+ * software PMD
21271+ */
21272+struct pfm_pmd {
21273+ u64 value; /* 64-bit value */
21274+ u64 lval; /* last reset value */
21275+ u64 ovflsw_thres; /* #ovfls left before switch */
21276+ u64 long_reset; /* long reset value on overflow */
21277+ u64 short_reset; /* short reset value on overflow */
21278+ u64 reset_pmds[PFM_PMD_BV]; /* pmds to reset on overflow */
21279+ u64 smpl_pmds[PFM_PMD_BV]; /* pmds to record on overflow */
21280+ u64 mask; /* range mask for random value */
21281+ u64 ovflsw_ref_thres; /* #ovfls before next set */
21282+ u64 eventid; /* opaque event identifier */
21283+ u32 flags; /* notify/do not notify */
21284+};
21285+
21286+/*
21287+ * event_set: encapsulates the full PMU state
21288+ */
21289+struct pfm_event_set {
21290+ struct list_head list; /* ordered chain of sets */
21291+ u16 id; /* set identification */
21292+ u16 nused_pmds; /* max number of used PMDs */
21293+ u16 nused_pmcs; /* max number of used PMCs */
21294+ u16 pad1; /* padding */
21295+ u32 flags; /* public flags */
21296+ u32 priv_flags; /* private flags (see below) */
21297+ u64 runs; /* # of activations */
21298+ u32 npend_ovfls; /* number of pending PMD overflow */
21299+ u32 pad2; /* padding */
21300+ u64 used_pmds[PFM_PMD_BV]; /* used PMDs */
21301+ u64 povfl_pmds[PFM_PMD_BV]; /* pending overflowed PMDs */
21302+ u64 ovfl_pmds[PFM_PMD_BV]; /* last overflowed PMDs */
21303+ u64 reset_pmds[PFM_PMD_BV]; /* PMDs to reset after overflow */
21304+ u64 ovfl_notify[PFM_PMD_BV]; /* notify on overflow */
21305+ u64 used_pmcs[PFM_PMC_BV]; /* used PMCs */
21306+ u64 pmcs[PFM_MAX_PMCS]; /* PMC values */
21307+
21308+ struct pfm_pmd pmds[PFM_MAX_PMDS];
21309+
21310+ ktime_t hrtimer_exp; /* switch timeout reference */
21311+ ktime_t hrtimer_rem; /* per-thread remainder timeout */
21312+
21313+ u64 duration_start; /* start time in ns */
21314+ u64 duration; /* total active ns */
21315+};
21316+
21317+/*
21318+ * common private event set flags (priv_flags)
21319+ *
21320+ * upper 16 bits: for arch-specific use
21321+ * lower 16 bits: for common use
21322+ */
21323+#define PFM_SETFL_PRIV_MOD_PMDS 0x1 /* PMD register(s) modified */
21324+#define PFM_SETFL_PRIV_MOD_PMCS 0x2 /* PMC register(s) modified */
21325+#define PFM_SETFL_PRIV_SWITCH 0x4 /* must switch set on restart */
21326+#define PFM_SETFL_PRIV_MOD_BOTH (PFM_SETFL_PRIV_MOD_PMDS \
21327+ | PFM_SETFL_PRIV_MOD_PMCS)
21328+
21329+/*
21330+ * context flags
21331+ */
21332+struct pfm_context_flags {
21333+ unsigned int block:1; /* task blocks on user notifications */
21334+ unsigned int system:1; /* do system wide monitoring */
21335+ unsigned int no_msg:1; /* no message sent on overflow */
21336+ unsigned int switch_ovfl:1; /* switch set on counter ovfl */
21337+ unsigned int switch_time:1; /* switch set on timeout */
21338+ unsigned int started:1; /* pfm_start() issued */
21339+ unsigned int work_type:2; /* type of work for pfm_handle_work */
21340+ unsigned int mmap_nlock:1; /* no lock in pfm_release_buf_space */
21341+ unsigned int ia64_v20_compat:1; /* context is IA-64 v2.0 mode */
21342+ unsigned int can_restart:8; /* allowed to issue a PFM_RESTART */
21343+ unsigned int reset_count:8; /* number of pending resets */
21344+ unsigned int is_self:1; /* per-thread and self-monitoring */
21345+ unsigned int reserved:5; /* for future use */
21346+};
21347+
21348+/*
21349+ * values for work_type (TIF_PERFMON_WORK must be set)
21350+ */
21351+#define PFM_WORK_NONE 0 /* nothing to do */
21352+#define PFM_WORK_RESET 1 /* reset overflowed counters */
21353+#define PFM_WORK_BLOCK 2 /* block current thread */
21354+#define PFM_WORK_ZOMBIE 3 /* cleanup zombie context */
21355+
21356+/*
21357+ * overflow description argument passed to sampling format
21358+ */
21359+struct pfm_ovfl_arg {
21360+ u16 ovfl_pmd; /* index of overflowed PMD */
21361+ u16 active_set; /* set active at the time of the overflow */
21362+ u32 ovfl_ctrl; /* control flags */
21363+ u64 pmd_last_reset; /* last reset value of overflowed PMD */
21364+ u64 smpl_pmds_values[PFM_MAX_PMDS]; /* values of other PMDs */
21365+ u64 pmd_eventid; /* eventid associated with PMD */
21366+ u16 num_smpl_pmds; /* number of PMDS in smpl_pmd_values */
21367+};
21368+/*
21369+ * depth of message queue
21370+ *
21371+ * Depth cannot be bigger than 255 (see reset_count)
21372+ */
21373+#define PFM_MSGS_ORDER 3 /* log2(number of messages) */
21374+#define PFM_MSGS_COUNT (1<<PFM_MSGS_ORDER) /* number of messages */
21375+#define PFM_MSGQ_MASK (PFM_MSGS_COUNT-1)
21376+
21377+/*
21378+ * perfmon context state
21379+ */
21380+#define PFM_CTX_UNLOADED 1 /* context is not loaded onto any task */
21381+#define PFM_CTX_LOADED 2 /* context is loaded onto a task */
21382+#define PFM_CTX_MASKED 3 /* context is loaded, monitoring is masked */
21383+#define PFM_CTX_ZOMBIE 4 /* context lost owner but still attached */
21384+
21385+/*
21386+ * registers description
21387+ */
21388+struct pfm_regdesc {
21389+ u64 pmcs[PFM_PMC_BV]; /* available PMC */
21390+ u64 pmds[PFM_PMD_BV]; /* available PMD */
21391+ u64 rw_pmds[PFM_PMD_BV]; /* available RW PMD */
21392+ u64 intr_pmds[PFM_PMD_BV]; /* PMD generating intr */
21393+ u64 cnt_pmds[PFM_PMD_BV]; /* PMD counters */
21394+ u16 max_pmc; /* highest+1 avail PMC */
21395+ u16 max_pmd; /* highest+1 avail PMD */
21396+ u16 max_rw_pmd; /* highest+1 avail RW PMD */
21397+ u16 first_intr_pmd; /* first intr PMD */
21398+ u16 max_intr_pmd; /* highest+1 intr PMD */
21399+ u16 num_rw_pmd; /* number of avail RW PMD */
21400+ u16 num_pmcs; /* number of logical PMCS */
21401+ u16 num_pmds; /* number of logical PMDS */
21402+ u16 num_counters; /* number of counting PMD */
21403+};
21404+
21405+/*
21406+ * context: contains all the state of a session
21407+ */
21408+struct pfm_context {
21409+ spinlock_t lock; /* context protection */
21410+
21411+ struct pfm_context_flags flags;
21412+ u32 state; /* current state */
21413+ struct task_struct *task; /* attached task */
21414+
21415+ struct completion restart_complete;/* block on notification */
21416+ u64 last_act; /* last activation */
21417+ u32 last_cpu; /* last CPU used (SMP only) */
21418+ u32 cpu; /* cpu bound to context */
21419+
21420+ struct pfm_smpl_fmt *smpl_fmt; /* sampling format callbacks */
21421+ void *smpl_addr; /* user smpl buffer base */
21422+ size_t smpl_size; /* user smpl buffer size */
21423+ void *smpl_real_addr;/* actual smpl buffer base */
21424+ size_t smpl_real_size; /* actual smpl buffer size */
21425+
21426+ wait_queue_head_t msgq_wait; /* pfm_read() wait queue */
21427+
21428+ union pfarg_msg msgq[PFM_MSGS_COUNT];
21429+ int msgq_head;
21430+ int msgq_tail;
21431+
21432+ struct fasync_struct *async_queue; /* async notification */
21433+
21434+ struct pfm_event_set *active_set; /* active set */
21435+ struct list_head set_list; /* ordered list of sets */
21436+
21437+ struct pfm_regdesc regs; /* registers available to context */
21438+
21439+ /*
21440+ * save stack space by allocating temporary variables for
21441+ * pfm_overflow_handler() in pfm_context
21442+ */
21443+ struct pfm_ovfl_arg ovfl_arg;
21444+ u64 tmp_ovfl_notify[PFM_PMD_BV];
21445+};
21446+
21447+/*
21448+ * ovfl_ctrl bitmask (used by interrupt handler)
21449+ */
21450+#define PFM_OVFL_CTRL_NOTIFY 0x1 /* notify user */
21451+#define PFM_OVFL_CTRL_RESET 0x2 /* reset overflowed pmds */
21452+#define PFM_OVFL_CTRL_MASK 0x4 /* mask monitoring */
21453+#define PFM_OVFL_CTRL_SWITCH 0x8 /* switch sets */
21454+
21455+/*
21456+ * logging
21457+ */
21458+#define PFM_ERR(f, x...) printk(KERN_ERR "perfmon: " f "\n", ## x)
21459+#define PFM_WARN(f, x...) printk(KERN_WARNING "perfmon: " f "\n", ## x)
21460+#define PFM_LOG(f, x...) printk(KERN_NOTICE "perfmon: " f "\n", ## x)
21461+#define PFM_INFO(f, x...) printk(KERN_INFO "perfmon: " f "\n", ## x)
21462+
21463+/*
21464+ * debugging
21465+ *
21466+ * Printk rate limiting is enforced to avoid getting flooded with too many
21467+ * error messages on the console (which could render the machine unresponsive).
21468+ * To get full debug output (turn off ratelimit):
21469+ * $ echo 0 >/proc/sys/kernel/printk_ratelimit
21470+ *
21471+ * debug is a bitmask where bits are defined as follows:
21472+ * bit 0: enable non-interrupt code debug messages
21473+ * bit 1: enable interrupt code debug messages
21474+ */
21475+#ifdef CONFIG_PERFMON_DEBUG
21476+#define _PFM_DBG(lm, f, x...) \
21477+ do { \
21478+ if (unlikely((pfm_controls.debug & lm) && printk_ratelimit())) { \
21479+ preempt_disable(); \
21480+ printk("perfmon: %s.%d: CPU%d [%d]: " f "\n", \
21481+ __func__, __LINE__, \
21482+ smp_processor_id(), current->pid , ## x); \
21483+ preempt_enable(); \
21484+ } \
21485+ } while (0)
21486+
21487+#define PFM_DBG(f, x...) _PFM_DBG(0x1, f, ##x)
21488+#define PFM_DBG_ovfl(f, x...) _PFM_DBG(0x2, f, ## x)
21489+#else
21490+#define PFM_DBG(f, x...) do {} while (0)
21491+#define PFM_DBG_ovfl(f, x...) do {} while (0)
21492+#endif
21493+
21494+extern struct pfm_pmu_config *pfm_pmu_conf;
21495+extern int perfmon_disabled;
21496+
21497+static inline struct pfm_arch_context *pfm_ctx_arch(struct pfm_context *c)
21498+{
21499+ return (struct pfm_arch_context *)(c+1);
21500+}
21501+
21502+int pfm_get_args(void __user *ureq, size_t sz, size_t lsz, void *laddr,
21503+ void **req, void **to_free);
21504+
21505+int pfm_get_smpl_arg(char __user *fmt_uname, void __user *uaddr, size_t usize,
21506+ void **arg, struct pfm_smpl_fmt **fmt);
21507+
21508+int __pfm_write_pmcs(struct pfm_context *ctx, struct pfarg_pmc *req,
21509+ int count);
21510+int __pfm_write_pmds(struct pfm_context *ctx, struct pfarg_pmd *req, int count,
21511+ int compat);
21512+int __pfm_read_pmds(struct pfm_context *ctx, struct pfarg_pmd *req, int count);
21513+
21514+int __pfm_load_context(struct pfm_context *ctx, struct pfarg_load *req,
21515+ struct task_struct *task);
21516+int __pfm_unload_context(struct pfm_context *ctx, int *can_release);
21517+
21518+int __pfm_stop(struct pfm_context *ctx, int *release_info);
21519+int __pfm_restart(struct pfm_context *ctx, int *unblock);
21520+int __pfm_start(struct pfm_context *ctx, struct pfarg_start *start);
21521+
21522+void pfm_free_context(struct pfm_context *ctx);
21523+
21524+void pfm_smpl_buf_space_release(struct pfm_context *ctx, size_t size);
21525+
21526+int pfm_check_task_state(struct pfm_context *ctx, int check_mask,
21527+ unsigned long *flags, void **resume);
21528+/*
21529+ * check_mask bitmask values for pfm_check_task_state()
21530+ */
21531+#define PFM_CMD_STOPPED 0x01 /* command needs thread stopped */
21532+#define PFM_CMD_UNLOADED 0x02 /* command needs ctx unloaded */
21533+#define PFM_CMD_UNLOAD 0x04 /* command is unload */
21534+
21535+int __pfm_create_context(struct pfarg_ctx *req,
21536+ struct pfm_smpl_fmt *fmt,
21537+ void *fmt_arg,
21538+ int mode,
21539+ struct pfm_context **new_ctx);
21540+
21541+struct pfm_event_set *pfm_find_set(struct pfm_context *ctx, u16 set_id,
21542+ int alloc);
21543+
21544+int pfm_pmu_conf_get(int autoload);
21545+void pfm_pmu_conf_put(void);
21546+
21547+int pfm_session_allcpus_acquire(void);
21548+void pfm_session_allcpus_release(void);
21549+
21550+int pfm_smpl_buf_alloc(struct pfm_context *ctx, size_t rsize);
21551+void pfm_smpl_buf_free(struct pfm_context *ctx);
21552+
21553+struct pfm_smpl_fmt *pfm_smpl_fmt_get(char *name);
21554+void pfm_smpl_fmt_put(struct pfm_smpl_fmt *fmt);
21555+
21556+void pfm_interrupt_handler(unsigned long iip, struct pt_regs *regs);
21557+
21558+void pfm_resume_task(struct task_struct *t, void *data);
21559+
21560+#include <linux/perfmon_pmu.h>
21561+#include <linux/perfmon_fmt.h>
21562+
21563+extern const struct file_operations pfm_file_ops;
21564+/*
21565+ * upper limit for count in calls that take vector arguments. This is used
21566+ * to prevent multiplication overflow when we compute the actual storage size
21567+ */
21568+#define PFM_MAX_ARG_COUNT(m) (INT_MAX/sizeof(*(m)))
21569+
21570+#define cast_ulp(_x) ((unsigned long *)_x)
21571+
21572+#define PFM_NORMAL 0
21573+#define PFM_COMPAT 1
21574+
21575+void __pfm_exit_thread(void);
21576+void pfm_ctxsw_in(struct task_struct *prev, struct task_struct *next);
21577+void pfm_ctxsw_out(struct task_struct *prev, struct task_struct *next);
21578+void pfm_handle_work(struct pt_regs *regs);
21579+void __pfm_init_percpu(void *dummy);
21580+void pfm_save_pmds(struct pfm_context *ctx, struct pfm_event_set *set);
21581+
21582+static inline void pfm_exit_thread(void)
21583+{
21584+ if (current->pfm_context)
21585+ __pfm_exit_thread();
21586+}
21587+
21588+/*
21589+ * include arch-specific kernel level definitions
21590+ */
21591+#include <asm/perfmon_kern.h>
21592+
21593+static inline void pfm_copy_thread(struct task_struct *task)
21594+{
21595+ /*
21596+ * context or perfmon TIF state is NEVER inherited
21597+ * in child task. Holds for per-thread and system-wide
21598+ */
21599+ task->pfm_context = NULL;
21600+ clear_tsk_thread_flag(task, TIF_PERFMON_CTXSW);
21601+ clear_tsk_thread_flag(task, TIF_PERFMON_WORK);
21602+ pfm_arch_disarm_handle_work(task);
21603+}
21604+
21605+
21606+/*
21607+ * read a single PMD register.
21608+ *
21609+ * virtual PMD registers have a special handler.
21610+ * Depends on definitions in asm/perfmon_kern.h
21611+ */
21612+static inline u64 pfm_read_pmd(struct pfm_context *ctx, unsigned int cnum)
21613+{
21614+ if (unlikely(pfm_pmu_conf->pmd_desc[cnum].type & PFM_REG_V))
21615+ return pfm_pmu_conf->pmd_sread(ctx, cnum);
21616+
21617+ return pfm_arch_read_pmd(ctx, cnum);
21618+}
21619+/*
21620+ * write a single PMD register.
21621+ *
21622+ * virtual PMD registers have a special handler.
21623+ * Depends on definitions in asm/perfmon_kern.h
21624+ */
21625+static inline void pfm_write_pmd(struct pfm_context *ctx, unsigned int cnum,
21626+ u64 value)
21627+{
21628+ /*
21629+ * PMD writes are ignored for read-only registers
21630+ */
21631+ if (pfm_pmu_conf->pmd_desc[cnum].type & PFM_REG_RO)
21632+ return;
21633+
21634+ if (pfm_pmu_conf->pmd_desc[cnum].type & PFM_REG_V) {
21635+ pfm_pmu_conf->pmd_swrite(ctx, cnum, value);
21636+ return;
21637+ }
21638+ /*
21639+ * clear unimplemented bits
21640+ */
21641+ value &= ~pfm_pmu_conf->pmd_desc[cnum].rsvd_msk;
21642+
21643+ pfm_arch_write_pmd(ctx, cnum, value);
21644+}
21645+
21646+void __pfm_init_percpu(void *dummy);
21647+
21648+static inline void pfm_init_percpu(void)
21649+{
21650+ __pfm_init_percpu(NULL);
21651+}
21652+
21653+/*
21654+ * pfm statistics are available via debugfs,
21655+ * under the perfmon subdirectory.
21656+ *
21657+ * When adding/removing new stats, make sure you also
21658+ * update the name table in perfmon_debugfs.c
21659+ */
21660+enum pfm_stats_names {
21661+ PFM_ST_ovfl_intr_all_count = 0,
21662+ PFM_ST_ovfl_intr_ns,
21663+ PFM_ST_ovfl_intr_spurious_count,
21664+ PFM_ST_ovfl_intr_replay_count,
21665+ PFM_ST_ovfl_intr_regular_count,
21666+ PFM_ST_handle_work_count,
21667+ PFM_ST_ovfl_notify_count,
21668+ PFM_ST_reset_pmds_count,
21669+ PFM_ST_pfm_restart_count,
21670+ PFM_ST_fmt_handler_calls,
21671+ PFM_ST_fmt_handler_ns,
21672+ PFM_ST_set_switch_count,
21673+ PFM_ST_set_switch_ns,
21674+ PFM_ST_set_switch_exp,
21675+ PFM_ST_ctxswin_count,
21676+ PFM_ST_ctxswin_ns,
21677+ PFM_ST_handle_timeout_count,
21678+ PFM_ST_ovfl_intr_nmi_count,
21679+ PFM_ST_ctxswout_count,
21680+ PFM_ST_ctxswout_ns,
21681+ PFM_ST_LAST /* last entry marker */
21682+};
21683+#define PFM_NUM_STATS PFM_ST_LAST
21684+
21685+struct pfm_stats {
21686+ u64 v[PFM_NUM_STATS];
21687+ struct dentry *dirs[PFM_NUM_STATS];
21688+ struct dentry *cpu_dir;
21689+ char cpu_name[8];
21690+};
21691+
21692+#ifdef CONFIG_PERFMON_DEBUG_FS
21693+#define pfm_stats_get(x) __get_cpu_var(pfm_stats).v[PFM_ST_##x]
21694+#define pfm_stats_inc(x) __get_cpu_var(pfm_stats).v[PFM_ST_##x]++
21695+#define pfm_stats_add(x, y) __get_cpu_var(pfm_stats).v[PFM_ST_##x] += (y)
21696+void pfm_reset_stats(int cpu);
21697+#else
21698+#define pfm_stats_get(x)
21699+#define pfm_stats_inc(x)
21700+#define pfm_stats_add(x, y)
21701+static inline void pfm_reset_stats(int cpu)
21702+{}
21703+#endif
21704+
21705+
21706+
21707+DECLARE_PER_CPU(struct pfm_context *, pmu_ctx);
21708+DECLARE_PER_CPU(struct pfm_stats, pfm_stats);
21709+DECLARE_PER_CPU(struct task_struct *, pmu_owner);
21710+
21711+void pfm_cpu_disable(void);
21712+
21713+
21714+/*
21715+ * max vector argument elements for local storage (no kmalloc/kfree)
21716+ * The PFM_ARCH_PM*_ARG should be defined in perfmon_kern.h.
21717+ * If not, default (conservative) values are used
21718+ */
21719+#ifndef PFM_ARCH_PMC_STK_ARG
21720+#define PFM_ARCH_PMC_STK_ARG 1
21721+#endif
21722+
21723+#ifndef PFM_ARCH_PMD_STK_ARG
21724+#define PFM_ARCH_PMD_STK_ARG 1
21725+#endif
21726+
21727+#define PFM_PMC_STK_ARG PFM_ARCH_PMC_STK_ARG
21728+#define PFM_PMD_STK_ARG PFM_ARCH_PMD_STK_ARG
21729+
21730+#else /* !CONFIG_PERFMON */
21731+
21732+
21733+/*
21734+ * perfmon hooks are nops when CONFIG_PERFMON is undefined
21735+ */
21736+static inline void pfm_cpu_disable(void)
21737+{}
21738+
21739+static inline void pfm_exit_thread(void)
21740+{}
21741+
21742+static inline void pfm_handle_work(struct pt_regs *regs)
21743+{}
21744+
21745+static inline void pfm_copy_thread(struct task_struct *t)
21746+{}
21747+
21748+static inline void pfm_ctxsw_in(struct task_struct *p, struct task_struct *n)
21749+{}
21750+
21751+static inline void pfm_ctxsw_out(struct task_struct *p, struct task_struct *n)
21752+{}
21753+
21754+static inline void pfm_session_allcpus_release(void)
21755+{}
21756+
21757+static inline int pfm_session_allcpus_acquire(void)
21758+{
21759+ return 0;
21760+}
21761+
21762+static inline void pfm_init_percpu(void)
21763+{}
21764+
21765+#endif /* CONFIG_PERFMON */
21766+
21767+#endif /* __LINUX_PERFMON_KERN_H__ */
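
A rough, hypothetical sketch (not part of the patch) of how the helpers declared above combine: pfm_write_pmd() skips read-only registers and routes virtual PMDs to pmd_swrite(), so generic code can walk a set's used_pmds bitmap without special-casing register types, and the pfm_stats_* macros compile away when CONFIG_PERFMON_DEBUG_FS is off. The set field names are taken from later hunks of this same patch.

static void example_reload_used_pmds(struct pfm_context *ctx,
				     struct pfm_event_set *set)
{
	u16 i, num;

	num = set->nused_pmds;
	for (i = 0; num; i++) {
		if (test_bit(i, cast_ulp(set->used_pmds))) {
			/* read-only and virtual PMDs are handled inside */
			pfm_write_pmd(ctx, i, set->pmds[i].value);
			num--;
		}
	}
	/* expands to nothing unless CONFIG_PERFMON_DEBUG_FS is set */
	pfm_stats_inc(reset_pmds_count);
}
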
21768--- /dev/null
21769+++ b/include/linux/perfmon_pmu.h
21770@@ -0,0 +1,192 @@
21771+/*
21772+ * Copyright (c) 2006 Hewlett-Packard Development Company, L.P.
21773+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
21774+ *
21775+ * Interface for PMU description modules
21776+ *
21777+ * This program is free software; you can redistribute it and/or
21778+ * modify it under the terms of version 2 of the GNU General Public
21779+ * License as published by the Free Software Foundation.
21780+ *
21781+ * This program is distributed in the hope that it will be useful,
21782+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
21783+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21784+ * General Public License for more details.
21785+ *
21786+ * You should have received a copy of the GNU General Public License
21787+ * along with this program; if not, write to the Free Software
21788+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21789+ * 02111-1307 USA
21790+ */
21791+#ifndef __PERFMON_PMU_H__
21792+#define __PERFMON_PMU_H__ 1
21793+
21794+/*
21795+ * generic information about a PMC or PMD register
21796+ *
21797+ * Dependency bitmasks:
21798+ * They are used to allow lazy save/restore in the context switch
21799+ * code and to avoid picking up stale configuration from a previous
21800+ * thread. Using the bitmask, the generic read/write routines can
21801+ * ensure that all registers needed to support the measurement are
21802+ * restored properly on context switch in.
21803+ */
21804+struct pfm_regmap_desc {
21805+ u16 type; /* role of the register */
21806+ u16 reserved1; /* for future use */
21807+ u32 reserved2; /* for future use */
21808+ u64 dfl_val; /* power-on default value (quiescent) */
21809+ u64 rsvd_msk; /* reserved bits: 1 means reserved */
21810+ u64 no_emul64_msk; /* bits to clear for PFM_REGFL_NO_EMUL64 */
21811+ unsigned long hw_addr; /* HW register address or index */
21812+ struct kobject kobj; /* for internal use only */
21813+ char *desc; /* HW register description string */
21814+ u64 dep_pmcs[PFM_PMC_BV];/* depending PMC registers */
21815+};
21816+#define to_reg(n) container_of(n, struct pfm_regmap_desc, kobj)
21817+
21818+/*
21819+ * pfm_reg_desc helper macros
21820+ */
21821+#define PMC_D(t, d, v, r, n, h) \
21822+ { .type = t, \
21823+ .desc = d, \
21824+ .dfl_val = v, \
21825+ .rsvd_msk = r, \
21826+ .no_emul64_msk = n, \
21827+ .hw_addr = h \
21828+ }
21829+
21830+#define PMD_D(t, d, h) \
21831+ { .type = t, \
21832+ .desc = d, \
21833+ .rsvd_msk = 0, \
21834+ .no_emul64_msk = 0, \
21835+ .hw_addr = h \
21836+ }
21837+
21838+#define PMD_DR(t, d, h, r) \
21839+ { .type = t, \
21840+ .desc = d, \
21841+ .rsvd_msk = r, \
21842+ .no_emul64_msk = 0, \
21843+ .hw_addr = h \
21844+ }
21845+
21846+#define PMX_NA \
21847+ { .type = PFM_REG_NA }
21848+
21849+#define PMD_DP(t, d, h, p) \
21850+ { .type = t, \
21851+ .desc = d, \
21852+ .rsvd_msk = 0, \
21853+ .no_emul64_msk = 0, \
21854+ .dep_pmcs[0] = p, \
21855+ .hw_addr = h \
21856+ }
21857+
21858+/*
21859+ * type of a PMU register (16-bit bitmask) for use with pfm_reg_desc.type
21860+ */
21861+#define PFM_REG_NA 0x00 /* not avail. (not impl.,no access) must be 0 */
21862+#define PFM_REG_I 0x01 /* PMC/PMD: implemented */
21863+#define PFM_REG_WC 0x02 /* PMC: has write_checker */
21864+#define PFM_REG_C64 0x04 /* PMD: 64-bit virtualization */
21865+#define PFM_REG_RO 0x08 /* PMD: read-only (writes ignored) */
21866+#define PFM_REG_V 0x10 /* PMD: virtual reg */
21867+#define PFM_REG_INTR 0x20 /* PMD: register can generate interrupt */
21868+#define PFM_REG_SYS 0x40 /* PMC/PMD: register is for system-wide only */
21869+#define PFM_REG_THR 0x80 /* PMC/PMD: register is for per-thread only */
21870+#define PFM_REG_NO64 0x100 /* PMC: supports PFM_REGFL_NO_EMUL64 */
21871+
21872+/*
21873+ * define some shortcuts for common types
21874+ */
21875+#define PFM_REG_W (PFM_REG_WC|PFM_REG_I)
21876+#define PFM_REG_W64 (PFM_REG_WC|PFM_REG_NO64|PFM_REG_I)
21877+#define PFM_REG_C (PFM_REG_C64|PFM_REG_INTR|PFM_REG_I)
21878+#define PFM_REG_I64 (PFM_REG_NO64|PFM_REG_I)
21879+#define PFM_REG_IRO (PFM_REG_I|PFM_REG_RO)
21880+
21881+typedef int (*pfm_pmc_check_t)(struct pfm_context *ctx,
21882+ struct pfm_event_set *set,
21883+ struct pfarg_pmc *req);
21884+
21885+typedef int (*pfm_pmd_check_t)(struct pfm_context *ctx,
21886+ struct pfm_event_set *set,
21887+ struct pfarg_pmd *req);
21888+
21889+
21890+typedef u64 (*pfm_sread_t)(struct pfm_context *ctx, unsigned int cnum);
21891+typedef void (*pfm_swrite_t)(struct pfm_context *ctx, unsigned int cnum, u64 val);
21892+
21893+/*
21894+ * structure used by pmu description modules
21895+ *
21896+ * probe_pmu() routine return value:
21897+ * - 1 means recognized PMU
21898+ * - 0 means not recognized PMU
21899+ */
21900+struct pfm_pmu_config {
21901+ char *pmu_name; /* PMU family name */
21902+ char *version; /* config module version */
21903+
21904+ int counter_width; /* width of hardware counter */
21905+
21906+ struct pfm_regmap_desc *pmc_desc; /* PMC register descriptions */
21907+ struct pfm_regmap_desc *pmd_desc; /* PMD register descriptions */
21908+
21909+ pfm_pmc_check_t pmc_write_check;/* write checker (optional) */
21910+ pfm_pmd_check_t pmd_write_check;/* write checker (optional) */
21911+ pfm_pmd_check_t pmd_read_check; /* read checker (optional) */
21912+
21913+ pfm_sread_t pmd_sread; /* virtual pmd read */
21914+ pfm_swrite_t pmd_swrite; /* virtual pmd write */
21915+
21916+ int (*probe_pmu)(void);/* probe PMU routine */
21917+
21918+ u16 num_pmc_entries;/* #entries in pmc_desc */
21919+ u16 num_pmd_entries;/* #entries in pmd_desc */
21920+
21921+ void *pmu_info; /* model-specific infos */
21922+ u32 flags; /* set of flags */
21923+
21924+ struct module *owner; /* pointer to module struct */
21925+
21926+ /*
21927+ * fields computed internally, do not set in module
21928+ */
21929+ struct pfm_regdesc regs_all; /* regs available to all */
21930+ struct pfm_regdesc regs_thr; /* regs avail per-thread */
21931+ struct pfm_regdesc regs_sys; /* regs avail system-wide */
21932+
21933+ u64 ovfl_mask; /* overflow mask */
21934+};
21935+
21936+static inline void *pfm_pmu_info(void)
21937+{
21938+ return pfm_pmu_conf->pmu_info;
21939+}
21940+
21941+/*
21942+ * pfm_pmu_config flags
21943+ */
21944+#define PFM_PMUFL_IS_BUILTIN 0x1 /* pmu config is compiled in */
21945+
21946+/*
21947+ * we need to know whether the PMU description is builtin or compiled
21948+ * as a module
21949+ */
21950+#ifdef MODULE
21951+#define PFM_PMU_BUILTIN_FLAG 0 /* built as a module, not builtin */
21952+#else
21953+#define PFM_PMU_BUILTIN_FLAG PFM_PMUFL_IS_BUILTIN /* compiled into the kernel */
21954+#endif
21955+
21956+int pfm_pmu_register(struct pfm_pmu_config *cfg);
21957+void pfm_pmu_unregister(struct pfm_pmu_config *cfg);
21958+
21959+int pfm_sysfs_remove_pmu(struct pfm_pmu_config *pmu);
21960+int pfm_sysfs_add_pmu(struct pfm_pmu_config *pmu);
21961+
21962+#endif /* __PERFMON_PMU_H__ */
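
A hypothetical skeleton (not part of the patch) of a PMU description module built on the interface above; register roles, widths and hardware addresses are invented for illustration only. probe_pmu() returns 1 when the host PMU is recognized, and PFM_PMU_BUILTIN_FLAG selects the right flag depending on whether the description is modular or compiled in.

#include <linux/module.h>
#include <linux/perfmon_kern.h>

static struct pfm_regmap_desc example_pmc_desc[] = {
	PMC_D(PFM_REG_I, "PMC0", 0, 0, 0, 0x100),
	PMX_NA,
};

static struct pfm_regmap_desc example_pmd_desc[] = {
	PMD_D(PFM_REG_C, "PMD0", 0x200),
	PMX_NA,
};

static int example_probe_pmu(void)
{
	return 1;	/* 1: PMU recognized, 0: not recognized */
}

static struct pfm_pmu_config example_pmu_conf = {
	.pmu_name	 = "example",
	.version	 = "0.1",
	.counter_width	 = 47,
	.pmc_desc	 = example_pmc_desc,
	.num_pmc_entries = 2,
	.pmd_desc	 = example_pmd_desc,
	.num_pmd_entries = 2,
	.probe_pmu	 = example_probe_pmu,
	.flags		 = PFM_PMU_BUILTIN_FLAG,
	.owner		 = THIS_MODULE,
};

static int __init example_pmu_init(void)
{
	return pfm_pmu_register(&example_pmu_conf);
}
module_init(example_pmu_init);
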
21963--- a/include/linux/sched.h
21964+++ b/include/linux/sched.h
21965@@ -96,6 +96,7 @@ struct exec_domain;
21966 struct futex_pi_state;
21967 struct robust_list_head;
21968 struct bio;
21969+struct pfm_context;
21970
21971 /*
21972 * List of flags we want to share for kernel threads,
21973@@ -1309,6 +1310,9 @@ struct task_struct {
21974 struct latency_record latency_record[LT_SAVECOUNT];
21975 #endif
21976 u64 instrumentation;
21977+#ifdef CONFIG_PERFMON
21978+ struct pfm_context *pfm_context;
21979+#endif
21980 };
21981
21982 /*
21983--- a/include/linux/syscalls.h
21984+++ b/include/linux/syscalls.h
21985@@ -29,6 +29,13 @@ struct msqid_ds;
21986 struct new_utsname;
21987 struct nfsctl_arg;
21988 struct __old_kernel_stat;
21989+struct pfarg_ctx;
21990+struct pfarg_pmc;
21991+struct pfarg_pmd;
21992+struct pfarg_start;
21993+struct pfarg_load;
21994+struct pfarg_setinfo;
21995+struct pfarg_setdesc;
21996 struct pollfd;
21997 struct rlimit;
21998 struct rusage;
21999@@ -690,4 +697,27 @@ asmlinkage long sys_pipe(int __user *);
22000
22001 int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
22002
22003+asmlinkage long sys_pfm_create_context(struct pfarg_ctx __user *ureq,
22004+ void __user *uarg, size_t smpl_size);
22005+asmlinkage long sys_pfm_write_pmcs(int fd, struct pfarg_pmc __user *ureq,
22006+ int count);
22007+asmlinkage long sys_pfm_write_pmds(int fd, struct pfarg_pmd __user *ureq,
22008+ int count);
22009+asmlinkage long sys_pfm_read_pmds(int fd, struct pfarg_pmd __user *ureq,
22010+ int count);
22011+asmlinkage long sys_pfm_restart(int fd);
22012+asmlinkage long sys_pfm_stop(int fd);
22013+asmlinkage long sys_pfm_start(int fd, struct pfarg_start __user *ureq);
22014+asmlinkage long sys_pfm_load_context(int fd, struct pfarg_load __user *ureq);
22015+asmlinkage long sys_pfm_unload_context(int fd);
22016+asmlinkage long sys_pfm_delete_evtsets(int fd,
22017+ struct pfarg_setinfo __user *ureq,
22018+ int count);
22019+asmlinkage long sys_pfm_create_evtsets(int fd,
22020+ struct pfarg_setdesc __user *ureq,
22021+ int count);
22022+asmlinkage long sys_pfm_getinfo_evtsets(int fd,
22023+ struct pfarg_setinfo __user *ureq,
22024+ int count);
22025+
22026 #endif
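
A hypothetical user-level sketch (not part of the patch) of the call sequence these system calls implement, assuming thin wrappers with the same prototypes as the sys_pfm_*() declarations above and a user-space header exporting the pfarg structures; real tools normally go through the libpfm package from perfmon2.sf.net.

#include <string.h>
#include <unistd.h>
#include <perfmon/perfmon.h>	/* assumed user-space header/wrappers */

int example_self_monitor(void)
{
	struct pfarg_ctx ctx;
	struct pfarg_load load;
	int fd;

	memset(&ctx, 0, sizeof(ctx));
	fd = pfm_create_context(&ctx, NULL, 0);	/* per-thread context, no sampling format */
	if (fd < 0)
		return -1;

	/* ... program counters with pfm_write_pmcs()/pfm_write_pmds() ... */

	memset(&load, 0, sizeof(load));
	load.load_pid = getpid();		/* attach to the calling thread */
	pfm_load_context(fd, &load);

	pfm_start(fd, NULL);			/* NULL: use default/last active set */
	/* ... run the code to be measured ... */
	pfm_stop(fd);

	/* ... collect results with pfm_read_pmds(), then detach ... */
	pfm_unload_context(fd);
	close(fd);
	return 0;
}
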
22027--- a/kernel/sched.c
22028+++ b/kernel/sched.c
22029@@ -71,6 +71,7 @@
22030 #include <linux/debugfs.h>
22031 #include <linux/ctype.h>
22032 #include <linux/ftrace.h>
22033+#include <linux/perfmon_kern.h>
22034
22035 #include <asm/tlb.h>
22036 #include <asm/irq_regs.h>
22037--- a/kernel/sys_ni.c
22038+++ b/kernel/sys_ni.c
22039@@ -127,6 +127,19 @@ cond_syscall(compat_sys_ipc);
22040 cond_syscall(compat_sys_sysctl);
22041 cond_syscall(sys_syslog);
22042
22043+cond_syscall(sys_pfm_create_context);
22044+cond_syscall(sys_pfm_write_pmcs);
22045+cond_syscall(sys_pfm_write_pmds);
22046+cond_syscall(sys_pfm_read_pmds);
22047+cond_syscall(sys_pfm_restart);
22048+cond_syscall(sys_pfm_start);
22049+cond_syscall(sys_pfm_stop);
22050+cond_syscall(sys_pfm_load_context);
22051+cond_syscall(sys_pfm_unload_context);
22052+cond_syscall(sys_pfm_create_evtsets);
22053+cond_syscall(sys_pfm_delete_evtsets);
22054+cond_syscall(sys_pfm_getinfo_evtsets);
22055+
22056 /* arch-specific weak syscall entries */
22057 cond_syscall(sys_pciconfig_read);
22058 cond_syscall(sys_pciconfig_write);
22059--- /dev/null
22060+++ b/perfmon/Makefile
22061@@ -0,0 +1,12 @@
22062+#
22063+# Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P.
22064+# Contributed by Stephane Eranian <eranian@hpl.hp.com>
22065+#
22066+obj-y = perfmon_init.o perfmon_rw.o perfmon_res.o \
22067+ perfmon_pmu.o perfmon_sysfs.o perfmon_syscalls.o \
22068+ perfmon_file.o perfmon_ctxsw.o perfmon_intr.o \
22069+ perfmon_dfl_smpl.o perfmon_sets.o perfmon_hotplug.o \
22070+ perfmon_msg.o perfmon_smpl.o perfmon_attach.o \
22071+ perfmon_activate.o perfmon_ctx.o perfmon_fmt.o
22072+
22073+obj-$(CONFIG_PERFMON_DEBUG_FS) += perfmon_debugfs.o
22074--- /dev/null
22075+++ b/perfmon/perfmon_activate.c
22076@@ -0,0 +1,265 @@
22077+/*
22078+ * perfmon_activate.c: perfmon2 start/stop functions
22079+ *
22080+ * This file implements the perfmon2 interface which
22081+ * provides access to the hardware performance counters
22082+ * of the host processor.
22083+ *
22084+ *
22085+ * The initial version of perfmon.c was written by
22086+ * Ganesh Venkitachalam, IBM Corp.
22087+ *
22088+ * Then it was modified for perfmon-1.x by Stephane Eranian and
22089+ * David Mosberger, Hewlett Packard Co.
22090+ *
22091+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
22092+ * by Stephane Eranian, Hewlett Packard Co.
22093+ *
22094+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
22095+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
22096+ * David Mosberger-Tang <davidm@hpl.hp.com>
22097+ *
22098+ * More information about perfmon available at:
22099+ * http://perfmon2.sf.net
22100+ *
22101+ * This program is free software; you can redistribute it and/or
22102+ * modify it under the terms of version 2 of the GNU General Public
22103+ * License as published by the Free Software Foundation.
22104+ *
22105+ * This program is distributed in the hope that it will be useful,
22106+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
22107+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22108+ * General Public License for more details.
22109+ *
22110+ * You should have received a copy of the GNU General Public License
22111+ * along with this program; if not, write to the Free Software
22112+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
22113+ * 02111-1307 USA
22114+ */
22115+#include <linux/kernel.h>
22116+#include <linux/perfmon_kern.h>
22117+#include "perfmon_priv.h"
22118+
22119+/**
22120+ * __pfm_start - activate monitoring
22121+ * @ctx: context to operate on
22122+ * @start: pfarg_start as passed by user
22123+ *
22124+ * When operating in per-thread mode and not self-monitoring, the monitored
22125+ * thread must be stopped. Activation will be effective next time the thread
22126+ * is context switched in.
22127+ *
22128+ * The pfarg_start argument is optional and may be used to designate
22129+ * the initial event set to activate. When not provided, the last active
22130+ * set is used. For the first activation, set0 is used when start is NULL.
22131+ *
22132+ * On some architectures, e.g., IA-64, it may be possible to start monitoring
22133+ * without calling this function under certain conditions (per-thread and self
22134+ * monitoring). In this case, either set0 or the last active set is used.
22135+ *
22136+ * the context is locked and interrupts are disabled.
22137+ */
22138+int __pfm_start(struct pfm_context *ctx, struct pfarg_start *start)
22139+{
22140+ struct task_struct *task, *owner_task;
22141+ struct pfm_event_set *new_set, *old_set;
22142+ int is_self;
22143+
22144+ task = ctx->task;
22145+
22146+ /*
22147+ * UNLOADED: error
22148+ * LOADED : normal start, nop if started unless set is different
22149+ * MASKED : nop or change set when unmasking
22150+ * ZOMBIE : cannot happen
22151+ */
22152+ if (ctx->state == PFM_CTX_UNLOADED)
22153+ return -EINVAL;
22154+
22155+ old_set = new_set = ctx->active_set;
22156+
22157+ /*
22158+ * always the case for system-wide
22159+ */
22160+ if (task == NULL)
22161+ task = current;
22162+
22163+ is_self = task == current;
22164+
22165+ /*
22166+ * argument is provided?
22167+ */
22168+ if (start) {
22169+ /*
22170+ * find the set to load first
22171+ */
22172+ new_set = pfm_find_set(ctx, start->start_set, 0);
22173+ if (new_set == NULL) {
22174+ PFM_DBG("event set%u does not exist",
22175+ start->start_set);
22176+ return -EINVAL;
22177+ }
22178+ }
22179+
22180+ PFM_DBG("cur_set=%u req_set=%u", old_set->id, new_set->id);
22181+
22182+ /*
22183+ * if we need to change the active set we need
22184+ * to check if we can access the PMU
22185+ */
22186+ if (new_set != old_set) {
22187+
22188+ owner_task = __get_cpu_var(pmu_owner);
22189+ /*
22190+ * system-wide: must run on the right CPU
22191+ * per-thread : must be the owner of the PMU context
22192+ *
22193+ * pfm_switch_sets() returns with monitoring stopped
22194+ */
22195+ if (is_self) {
22196+ pfm_switch_sets(ctx, new_set, PFM_PMD_RESET_LONG, 1);
22197+ } else {
22198+ /*
22199+ * In a UP kernel, the PMU may contain the state
22200+ * of the task we want to operate on, yet the task
22201+ * may be switched out (lazy save). We need to save
22202+ * current state (old_set), switch active_set and
22203+ * mark it for reload.
22204+ */
22205+ if (owner_task == task)
22206+ pfm_save_pmds(ctx, old_set);
22207+ ctx->active_set = new_set;
22208+ new_set->priv_flags |= PFM_SETFL_PRIV_MOD_BOTH;
22209+ }
22210+ }
22211+
22212+ /*
22213+ * mark as started
22214+ * must be done before calling pfm_arch_start()
22215+ */
22216+ ctx->flags.started = 1;
22217+
22218+ pfm_arch_start(task, ctx);
22219+
22220+ /*
22221+ * we check whether we had a pending ovfl before restarting.
22222+ * If so we need to regenerate the interrupt to make sure we
22223+ * keep recorded samples. For non-self monitoring this check
22224+ * is done in the pfm_ctxswin_thread() routine.
22225+ *
22226+ * we check new_set/old_set because pfm_switch_sets() already
22227+ * takes care of replaying the pending interrupts
22228+ */
22229+ if (is_self && new_set != old_set && new_set->npend_ovfls) {
22230+ pfm_arch_resend_irq(ctx);
22231+ pfm_stats_inc(ovfl_intr_replay_count);
22232+ }
22233+
22234+ /*
22235+ * always start with full timeout
22236+ */
22237+ new_set->hrtimer_rem = new_set->hrtimer_exp;
22238+
22239+ /*
22240+ * activate timeout for system-wide, self-monitoring
22241+ * Always start with full timeout
22242+ * Timeout is at least one tick away, so no risk of
22243+ * having hrtimer_start() trying to wakeup softirqd
22244+ * and thus causing trouble. This cannot happen anyway
22245+ * because cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ
22246+ */
22247+ if (is_self && new_set->flags & PFM_SETFL_TIME_SWITCH) {
22248+ hrtimer_start(&__get_cpu_var(pfm_hrtimer),
22249+ new_set->hrtimer_rem,
22250+ HRTIMER_MODE_REL);
22251+
22252+ PFM_DBG("set%u started timeout=%lld",
22253+ new_set->id,
22254+ (unsigned long long)new_set->hrtimer_rem.tv64);
22255+ }
22256+
22257+ /*
22258+ * we restart total duration even if context was
22259+ * already started. In that case, counts are simply
22260+ * reset.
22261+ *
22262+ * For per-thread, if not self-monitoring, the statement
22263+ * below will have no effect because thread is stopped.
22264+ * The field is reset on ctxsw in.
22265+ */
22266+ new_set->duration_start = sched_clock();
22267+
22268+ return 0;
22269+}
22270+
22271+/**
22272+ * __pfm_stop - stop monitoring
22273+ * @ctx: context to operate on
22274+ * @release_info: infos for caller (see below)
22275+ *
22276+ * When operating in per-thread mode and when not self-monitoring,
22277+ * the monitored thread must be stopped.
22278+ *
22279+ * the context is locked and interrupts are disabled.
22280+ *
22281+ * release_info value upon return:
22282+ * - bit 0 : unused
22283+ * - bit 1 : when set, must cancel hrtimer
22284+ */
22285+int __pfm_stop(struct pfm_context *ctx, int *release_info)
22286+{
22287+ struct pfm_event_set *set;
22288+ struct task_struct *task;
22289+ u64 now;
22290+ int state;
22291+
22292+ *release_info = 0;
22293+
22294+ now = sched_clock();
22295+ state = ctx->state;
22296+ set = ctx->active_set;
22297+
22298+ /*
22299+ * context must be attached (zombie cannot happen)
22300+ */
22301+ if (state == PFM_CTX_UNLOADED)
22302+ return -EINVAL;
22303+
22304+ task = ctx->task;
22305+
22306+ PFM_DBG("ctx_task=[%d] ctx_state=%d is_system=%d",
22307+ task ? task->pid : -1,
22308+ state,
22309+ !task);
22310+
22311+ /*
22312+ * this happens for system-wide context
22313+ */
22314+ if (task == NULL)
22315+ task = current;
22316+
22317+ /*
22318+ * compute elapsed time
22319+ *
22320+ * unless masked, compute elapsed duration, stop timeout
22321+ */
22322+ if (task == current && state == PFM_CTX_LOADED) {
22323+ /*
22324+ * timeout cancel must be deferred until context is
22325+ * unlocked to avoid race with pfm_handle_switch_timeout()
22326+ */
22327+ if (set->flags & PFM_SETFL_TIME_SWITCH)
22328+ *release_info |= 0x2;
22329+
22330+ set->duration += now - set->duration_start;
22331+ }
22332+
22333+ pfm_arch_stop(task, ctx);
22334+
22335+ ctx->flags.started = 0;
22336+ /*
22337+ * starting now, in-flight PMU interrupts for this context
22338+ * are treated as spurious
22339+ */
22340+ return 0;
22341+}
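
A hypothetical sketch (not part of the patch) of how a caller consumes the release_info bits documented for __pfm_stop() above: the hrtimer cancel is deferred until the context lock is dropped, the same pattern __pfm_exit_thread() uses for unload in perfmon_attach.c below.

static int example_stop(struct pfm_context *ctx)
{
	unsigned long flags;
	int release_info = 0;
	int ret;

	spin_lock_irqsave(&ctx->lock, flags);
	ret = __pfm_stop(ctx, &release_info);
	spin_unlock_irqrestore(&ctx->lock, flags);

	/* bit 1 set: a switch timeout was armed, cancel it outside the lock */
	if (!ret && (release_info & 0x2))
		hrtimer_cancel(&__get_cpu_var(pfm_hrtimer));

	return ret;
}
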
22342--- /dev/null
22343+++ b/perfmon/perfmon_attach.c
22344@@ -0,0 +1,474 @@
22345+/*
22346+ * perfmon_attach.c: perfmon2 load/unload functions
22347+ *
22348+ * This file implements the perfmon2 interface which
22349+ * provides access to the hardware performance counters
22350+ * of the host processor.
22351+ *
22352+ *
22353+ * The initial version of perfmon.c was written by
22354+ * Ganesh Venkitachalam, IBM Corp.
22355+ *
22356+ * Then it was modified for perfmon-1.x by Stephane Eranian and
22357+ * David Mosberger, Hewlett Packard Co.
22358+ *
22359+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
22360+ * by Stephane Eranian, Hewlett Packard Co.
22361+ *
22362+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
22363+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
22364+ * David Mosberger-Tang <davidm@hpl.hp.com>
22365+ *
22366+ * More information about perfmon available at:
22367+ * http://perfmon2.sf.net
22368+ *
22369+ * This program is free software; you can redistribute it and/or
22370+ * modify it under the terms of version 2 of the GNU General Public
22371+ * License as published by the Free Software Foundation.
22372+ *
22373+ * This program is distributed in the hope that it will be useful,
22374+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
22375+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22376+ * General Public License for more details.
22377+ *
22378+ * You should have received a copy of the GNU General Public License
22379+ * along with this program; if not, write to the Free Software
22380+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
22381+ * 02111-1307 USA
22382+ */
22383+#include <linux/kernel.h>
22384+#include <linux/fs.h>
22385+#include <linux/perfmon_kern.h>
22386+#include "perfmon_priv.h"
22387+
22388+/**
22389+ * pfm_load_ctx_sys - attach context to a CPU in system-wide mode
22390+ * @ctx: context to operate on
22391+ * @set_id: set to activate first
22392+ * @cpu: CPU to monitor
22393+ *
22394+ * The cpu specified in the pfarg_load.load_pid argument must be the current
22395+ * CPU.
22396+ *
22397+ * The function must be called with the context locked and interrupts disabled.
22398+ */
22399+static int pfm_load_ctx_sys(struct pfm_context *ctx, u16 set_id, u32 cpu)
22400+{
22401+ struct pfm_event_set *set;
22402+ int mycpu;
22403+ int ret;
22404+
22405+ mycpu = smp_processor_id();
22406+
22407+ /*
22408+ * system-wide: check we are running on the desired CPU
22409+ */
22410+ if (cpu != mycpu) {
22411+ PFM_DBG("wrong CPU: asking %u but on %u", cpu, mycpu);
22412+ return -EINVAL;
22413+ }
22414+
22415+ /*
22416+ * initialize sets
22417+ */
22418+ set = pfm_prepare_sets(ctx, set_id);
22419+ if (!set) {
22420+ PFM_DBG("event set%u does not exist", set_id);
22421+ return -EINVAL;
22422+ }
22423+
22424+ PFM_DBG("set=%u set_flags=0x%x", set->id, set->flags);
22425+
22426+ ctx->cpu = mycpu;
22427+ ctx->task = NULL;
22428+ ctx->active_set = set;
22429+
22430+ /*
22431+ * perform any architecture specific actions
22432+ */
22433+ ret = pfm_arch_load_context(ctx);
22434+ if (ret)
22435+ goto error_noload;
22436+
22437+ /*
22438+ * now reserve the session, before we can proceed with
22439+ * actually accessing the PMU hardware
22440+ */
22441+ ret = pfm_session_acquire(1, mycpu);
22442+ if (ret)
22443+ goto error;
22444+
22445+
22446+ /*
22447+ * caller must be on monitored CPU to access PMU, thus this is
22448+ * a form of self-monitoring
22449+ */
22450+ ctx->flags.is_self = 1;
22451+
22452+ set->runs++;
22453+
22454+ /*
22455+ * load PMD from set
22456+ * load PMC from set
22457+ */
22458+ pfm_arch_restore_pmds(ctx, set);
22459+ pfm_arch_restore_pmcs(ctx, set);
22460+
22461+ /*
22462+ * set new ownership
22463+ */
22464+ pfm_set_pmu_owner(NULL, ctx);
22465+
22466+ /*
22467+ * reset pending work
22468+ */
22469+ ctx->flags.work_type = PFM_WORK_NONE;
22470+ ctx->flags.reset_count = 0;
22471+
22472+ /*
22473+ * reset message queue
22474+ */
22475+ ctx->msgq_head = ctx->msgq_tail = 0;
22476+
22477+ ctx->state = PFM_CTX_LOADED;
22478+
22479+ return 0;
22480+error:
22481+ pfm_arch_unload_context(ctx);
22482+error_noload:
22483+ return ret;
22484+}
22485+
22486+/**
22487+ * pfm_load_ctx_thread - attach context to a thread
22488+ * @ctx: context to operate on
22489+ * @set_id: first set
22490+ * @task: thread to attach to
22491+ *
22492+ * The function must be called with the context locked and interrupts disabled.
22493+ */
22494+static int pfm_load_ctx_thread(struct pfm_context *ctx, u16 set_id,
22495+ struct task_struct *task)
22496+{
22497+ struct pfm_event_set *set;
22498+ struct pfm_context *old;
22499+ int ret;
22500+
22501+ PFM_DBG("load_pid=%d set=%u", task->pid, set_id);
22502+ /*
22503+ * per-thread:
22504+ * - task to attach to is checked in sys_pfm_load_context() to avoid
22505+ * locking issues. if found, and not self, task refcount was
22506+ * incremented.
22507+ */
22508+ old = cmpxchg(&task->pfm_context, NULL, ctx);
22509+ if (old) {
22510+ PFM_DBG("load_pid=%d has a context "
22511+ "old=%p new=%p cur=%p",
22512+ task->pid,
22513+ old,
22514+ ctx,
22515+ task->pfm_context);
22516+ return -EEXIST;
22517+ }
22518+
22519+ /*
22520+ * initialize sets
22521+ */
22522+ set = pfm_prepare_sets(ctx, set_id);
22523+ if (!set) {
22524+ PFM_DBG("event set%u does not exist", set_id);
22525+ return -EINVAL;
22526+ }
22527+
22528+
22529+ ctx->task = task;
22530+ ctx->cpu = -1;
22531+ ctx->active_set = set;
22532+
22533+ /*
22534+ * perform any architecture specific actions
22535+ */
22536+ ret = pfm_arch_load_context(ctx);
22537+ if (ret)
22538+ goto error_noload;
22539+
22540+ /*
22541+ * now reserve the session, before we can proceed with
22542+ * actually accessing the PMU hardware
22543+ */
22544+ ret = pfm_session_acquire(0, -1);
22545+ if (ret)
22546+ goto error;
22547+
22548+
22549+ set->runs++;
22550+ if (ctx->task != current) {
22551+
22552+ ctx->flags.is_self = 0;
22553+
22554+ /* force a full reload */
22555+ ctx->last_act = PFM_INVALID_ACTIVATION;
22556+ ctx->last_cpu = -1;
22557+ set->priv_flags |= PFM_SETFL_PRIV_MOD_BOTH;
22558+
22559+ } else {
22560+ pfm_check_save_prev_ctx();
22561+
22562+ ctx->last_cpu = smp_processor_id();
22563+ __get_cpu_var(pmu_activation_number)++;
22564+ ctx->last_act = __get_cpu_var(pmu_activation_number);
22565+
22566+ ctx->flags.is_self = 1;
22567+
22568+ /*
22569+ * load PMD from set
22570+ * load PMC from set
22571+ */
22572+ pfm_arch_restore_pmds(ctx, set);
22573+ pfm_arch_restore_pmcs(ctx, set);
22574+
22575+ /*
22576+ * set new ownership
22577+ */
22578+ pfm_set_pmu_owner(ctx->task, ctx);
22579+ }
22580+ set_tsk_thread_flag(task, TIF_PERFMON_CTXSW);
22581+
22582+ /*
22583+ * reset pending work
22584+ */
22585+ ctx->flags.work_type = PFM_WORK_NONE;
22586+ ctx->flags.reset_count = 0;
22587+
22588+ /*
22589+ * reset message queue
22590+ */
22591+ ctx->msgq_head = ctx->msgq_tail = 0;
22592+
22593+ ctx->state = PFM_CTX_LOADED;
22594+
22595+ return 0;
22596+
22597+error:
22598+ pfm_arch_unload_context(ctx);
22599+ ctx->task = NULL;
22600+error_noload:
22601+ /*
22602+ * detach context
22603+ */
22604+ task->pfm_context = NULL;
22605+ return ret;
22606+}
22607+
22608+/**
22609+ * __pfm_load_context - attach context to a CPU or thread
22610+ * @ctx: context to operate on
22611+ * @load: pfarg_load as passed by user
22612+ * @task: thread to attach to, NULL for system-wide
22613+ */
22614+int __pfm_load_context(struct pfm_context *ctx, struct pfarg_load *load,
22615+ struct task_struct *task)
22616+{
22617+ if (ctx->flags.system)
22618+ return pfm_load_ctx_sys(ctx, load->load_set, load->load_pid);
22619+ return pfm_load_ctx_thread(ctx, load->load_set, task);
22620+}
22621+
22622+/**
22623+ * pfm_update_ovfl_pmds - account for pending ovfls on PMDs
22624+ * @ctx: context to operate on
22625+ *
22626+ * This function is always called after pfm_stop has been issued
22627+ */
22628+static void pfm_update_ovfl_pmds(struct pfm_context *ctx)
22629+{
22630+ struct pfm_event_set *set;
22631+ u64 *cnt_pmds;
22632+ u64 ovfl_mask;
22633+ u16 num_ovfls, i, first;
22634+
22635+ ovfl_mask = pfm_pmu_conf->ovfl_mask;
22636+ first = ctx->regs.first_intr_pmd;
22637+ cnt_pmds = ctx->regs.cnt_pmds;
22638+
22639+ /*
22640+ * look for pending interrupts and adjust PMD values accordingly
22641+ */
22642+ list_for_each_entry(set, &ctx->set_list, list) {
22643+
22644+ if (!set->npend_ovfls)
22645+ continue;
22646+
22647+ num_ovfls = set->npend_ovfls;
22648+ PFM_DBG("set%u nintrs=%u", set->id, num_ovfls);
22649+
22650+ for (i = first; num_ovfls; i++) {
22651+ if (test_bit(i, cast_ulp(set->povfl_pmds))) {
22652+ /* only correct value for counters */
22653+ if (test_bit(i, cast_ulp(cnt_pmds)))
22654+ set->pmds[i].value += 1 + ovfl_mask;
22655+ num_ovfls--;
22656+ }
22657+ PFM_DBG("pmd%u set=%u val=0x%llx",
22658+ i,
22659+ set->id,
22660+ (unsigned long long)set->pmds[i].value);
22661+ }
22662+ /*
22663+ * we need to clear to prevent a pfm_getinfo_evtsets() from
22664+ * returning stale data even after the context is unloaded
22665+ */
22666+ set->npend_ovfls = 0;
22667+ bitmap_zero(cast_ulp(set->povfl_pmds), ctx->regs.max_intr_pmd);
22668+ }
22669+}
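
The "value += 1 + ovfl_mask" adjustment above credits exactly one hardware wrap per pending overflow, assuming ovfl_mask covers the low counter_width bits (i.e. ovfl_mask = (1 << counter_width) - 1, as suggested by the pfm_pmu_config fields). A small stand-alone illustration of that arithmetic, with made-up values:

#include <stdio.h>

int main(void)
{
	unsigned int counter_width = 47;			/* example width */
	unsigned long long ovfl_mask = (1ULL << counter_width) - 1;
	unsigned long long value = 5;				/* counter just wrapped past 0 */

	value += 1 + ovfl_mask;					/* account for one overflow */
	printf("value=0x%llx\n", value);			/* prints 2^47 + 5 */
	return 0;
}
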
22670+
22671+
22672+/**
22673+ * __pfm_unload_context - detach context from CPU or thread
22674+ * @ctx: context to operate on
22675+ * @release_info: pointer to return info (see below)
22676+ *
22677+ * The function must be called with the context locked and interrupts disabled.
22678+ *
22679+ * release_info value upon return:
22680+ * - bit 0: when set, must free context
22681+ * - bit 1: when set, must cancel hrtimer
22682+ */
22683+int __pfm_unload_context(struct pfm_context *ctx, int *release_info)
22684+{
22685+ struct task_struct *task;
22686+ int ret;
22687+
22688+ PFM_DBG("ctx_state=%d task [%d]",
22689+ ctx->state,
22690+ ctx->task ? ctx->task->pid : -1);
22691+
22692+ *release_info = 0;
22693+
22694+ /*
22695+ * unload only when necessary
22696+ */
22697+ if (ctx->state == PFM_CTX_UNLOADED)
22698+ return 0;
22699+
22700+ task = ctx->task;
22701+
22702+ /*
22703+ * stop monitoring
22704+ */
22705+ ret = __pfm_stop(ctx, release_info);
22706+ if (ret)
22707+ return ret;
22708+
22709+ ctx->state = PFM_CTX_UNLOADED;
22710+ ctx->flags.can_restart = 0;
22711+
22712+ /*
22713+ * save active set
22714+ * UP:
22715+ * if not current task and due to lazy, state may
22716+ * still be live
22717+ * for system-wide, guaranteed to run on correct CPU
22718+ */
22719+ if (__get_cpu_var(pmu_ctx) == ctx) {
22720+ /*
22721+ * pending overflows have been saved by pfm_stop()
22722+ */
22723+ pfm_save_pmds(ctx, ctx->active_set);
22724+ pfm_set_pmu_owner(NULL, NULL);
22725+ PFM_DBG("released ownership");
22726+ }
22727+
22728+ /*
22729+ * account for pending overflows
22730+ */
22731+ pfm_update_ovfl_pmds(ctx);
22732+
22733+ /*
22734+ * arch-specific unload operations
22735+ */
22736+ pfm_arch_unload_context(ctx);
22737+
22738+ /*
22739+ * per-thread: disconnect from monitored task
22740+ */
22741+ if (task) {
22742+ task->pfm_context = NULL;
22743+ ctx->task = NULL;
22744+ clear_tsk_thread_flag(task, TIF_PERFMON_CTXSW);
22745+ clear_tsk_thread_flag(task, TIF_PERFMON_WORK);
22746+ pfm_arch_disarm_handle_work(task);
22747+ }
22748+ /*
22749+ * session can be freed, must have interrupts enabled
22750+ * thus we release in the caller. Bit 0 signals to the
22751+ * caller that the session can be released.
22752+ */
22753+ *release_info |= 0x1;
22754+
22755+ return 0;
22756+}
22757+
22758+/**
22759+ * __pfm_exit_thread - detach and free context on thread exit
22760+ */
22761+void __pfm_exit_thread(void)
22762+{
22763+ struct pfm_context *ctx;
22764+ unsigned long flags;
22765+ int free_ok = 0, release_info = 0;
22766+ int ret;
22767+
22768+ ctx = current->pfm_context;
22769+
22770+ BUG_ON(ctx->flags.system);
22771+
22772+ spin_lock_irqsave(&ctx->lock, flags);
22773+
22774+ PFM_DBG("state=%d is_self=%d", ctx->state, ctx->flags.is_self);
22775+
22776+ /*
22777+ * __pfm_unload_context() cannot fail
22778+ * in the context states we are interested in
22779+ */
22780+ switch (ctx->state) {
22781+ case PFM_CTX_LOADED:
22782+ case PFM_CTX_MASKED:
22783+ __pfm_unload_context(ctx, &release_info);
22784+ /*
22785+ * end notification only sent for non
22786+ * self-monitoring context
22787+ */
22788+ if (!ctx->flags.is_self)
22789+ pfm_end_notify(ctx);
22790+ break;
22791+ case PFM_CTX_ZOMBIE:
22792+ __pfm_unload_context(ctx, &release_info);
22793+ free_ok = 1;
22794+ break;
22795+ default:
22796+ BUG_ON(ctx->state != PFM_CTX_LOADED);
22797+ break;
22798+ }
22799+ spin_unlock_irqrestore(&ctx->lock, flags);
22800+
22801+ /*
22802+ * cancel timer now that context is unlocked
22803+ */
22804+ if (release_info & 0x2) {
22805+ ret = hrtimer_cancel(&__get_cpu_var(pfm_hrtimer));
22806+ PFM_DBG("timeout cancel=%d", ret);
22807+ }
22808+
22809+ if (release_info & 0x1)
22810+ pfm_session_release(0, 0);
22811+
22812+ /*
22813+ * All memory free operations (especially for vmalloc'ed memory)
22814+ * MUST be done with interrupts ENABLED.
22815+ */
22816+ if (free_ok)
22817+ pfm_free_context(ctx);
22818+}
22819--- /dev/null
22820+++ b/perfmon/perfmon_ctx.c
22821@@ -0,0 +1,314 @@
22822+/*
22823+ * perfmon_ctx.c: perfmon2 context functions
22824+ *
22825+ * This file implements the perfmon2 interface which
22826+ * provides access to the hardware performance counters
22827+ * of the host processor.
22828+ *
22829+ *
22830+ * The initial version of perfmon.c was written by
22831+ * Ganesh Venkitachalam, IBM Corp.
22832+ *
22833+ * Then it was modified for perfmon-1.x by Stephane Eranian and
22834+ * David Mosberger, Hewlett Packard Co.
22835+ *
22836+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
22837+ * by Stephane Eranian, Hewlett Packard Co.
22838+ *
22839+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
22840+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
22841+ * David Mosberger-Tang <davidm@hpl.hp.com>
22842+ *
22843+ * More information about perfmon available at:
22844+ * http://perfmon2.sf.net
22845+ *
22846+ * This program is free software; you can redistribute it and/or
22847+ * modify it under the terms of version 2 of the GNU General Public
22848+ * License as published by the Free Software Foundation.
22849+ *
22850+ * This program is distributed in the hope that it will be useful,
22851+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
22852+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22853+ * General Public License for more details.
22854+ *
22855+ * You should have received a copy of the GNU General Public License
22856+ * along with this program; if not, write to the Free Software
22857+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
22858+ * 02111-1307 USA
22859+ */
22860+#include <linux/kernel.h>
22861+#include <linux/fs.h>
22862+#include <linux/perfmon_kern.h>
22863+#include "perfmon_priv.h"
22864+
22865+/*
22866+ * context memory pool pointer
22867+ */
22868+static struct kmem_cache *pfm_ctx_cachep;
22869+
22870+/**
22871+ * pfm_free_context - de-allocate context and associated resources
22872+ * @ctx: context to free
22873+ */
22874+void pfm_free_context(struct pfm_context *ctx)
22875+{
22876+ pfm_arch_context_free(ctx);
22877+
22878+ pfm_free_sets(ctx);
22879+
22880+ pfm_smpl_buf_free(ctx);
22881+
22882+ PFM_DBG("free ctx @0x%p", ctx);
22883+ kmem_cache_free(pfm_ctx_cachep, ctx);
22884+ /*
22885+ * decrease refcount on:
22886+ * - PMU description table
22887+ * - sampling format
22888+ */
22889+ pfm_pmu_conf_put();
22890+ pfm_pmu_release();
22891+}
22892+
22893+/**
22894+ * pfm_ctx_flags_sane - check if context flags passed by user are okay
22895+ * @ctx_flags: flags passed user on pfm_create_context
22896+ *
22897+ * return:
22898+ * 0 if successful
22899+ * <0 and error code otherwise
22900+ */
22901+static inline int pfm_ctx_flags_sane(u32 ctx_flags)
22902+{
22903+ if (ctx_flags & PFM_FL_SYSTEM_WIDE) {
22904+ if (ctx_flags & PFM_FL_NOTIFY_BLOCK) {
22905+ PFM_DBG("cannot use blocking mode in syswide mode");
22906+ return -EINVAL;
22907+ }
22908+ }
22909+ return 0;
22910+}
22911+
22912+/**
22913+ * pfm_ctx_permissions - check authorization to create new context
22914+ * @ctx_flags: context flags passed by user
22915+ *
22916+ * check for permissions to create a context.
22917+ *
22918+ * A sysadmin may decide to restrict creation of per-thread
22919+ * and/or system-wide context to a group of users using the
22920+ * group id via /sys/kernel/perfmon/task_group and
22921+ * /sys/kernel/perfmon/sys_group.
22922+ *
22923+ * Once we identify a user level package which can be used
22924+ * to grant/revoke Linux capabilities at login via PAM, we will
22925+ * be able to use capabilities. We would also need to increase
22926+ * the size of cap_t to support more than 32 capabilities (it
22927+ * is currently defined as u32 and 32 capabilities are already
22928+ * defined).
22929+ */
22930+static inline int pfm_ctx_permissions(u32 ctx_flags)
22931+{
22932+ if ((ctx_flags & PFM_FL_SYSTEM_WIDE)
22933+ && pfm_controls.sys_group != PFM_GROUP_PERM_ANY
22934+ && !in_group_p(pfm_controls.sys_group)) {
22935+ PFM_DBG("user group not allowed to create a syswide ctx");
22936+ return -EPERM;
22937+ } else if (pfm_controls.task_group != PFM_GROUP_PERM_ANY
22938+ && !in_group_p(pfm_controls.task_group)) {
22939+ PFM_DBG("user group not allowed to create a task context");
22940+ return -EPERM;
22941+ }
22942+ return 0;
22943+}
22944+
22945+/**
22946+ * __pfm_create_context - allocate and initialize a perfmon context
22947+ * @req : pfarg_ctx from user
22948+ * @fmt : pointer sampling format, NULL if not used
22949+ * @fmt_arg: pointer to argument to sampling format, NULL if not used
22950+ * @mode: PFM_NORMAL or PFM_COMPAT(IA-64 v2.0 compatibility)
22951+ * @ctx : address of new context upon successful return, undefined otherwise
22952+ *
22953+ * function used to allocate a new context. A context is allocated along
22954+ * with the default event set. If a sampling format is used, the buffer
22955+ * may be allocated and initialized.
22956+ *
22957+ * The file descriptor identifying the context is allocated and returned
22958+ * to caller.
22959+ *
22960+ * This function operates with no locks and interrupts are enabled.
22961+ * return:
22962+ * >=0: the file descriptor to identify the context
22963+ * <0 : the error code
22964+ */
22965+int __pfm_create_context(struct pfarg_ctx *req,
22966+ struct pfm_smpl_fmt *fmt,
22967+ void *fmt_arg,
22968+ int mode,
22969+ struct pfm_context **new_ctx)
22970+{
22971+ struct pfm_context *ctx;
22972+ struct file *filp = NULL;
22973+ u32 ctx_flags;
22974+ int fd = 0, ret;
22975+
22976+ ctx_flags = req->ctx_flags;
22977+
22978+ /* Increase refcount on PMU description */
22979+ ret = pfm_pmu_conf_get(1);
22980+ if (ret < 0)
22981+ goto error_conf;
22982+
22983+ ret = pfm_ctx_flags_sane(ctx_flags);
22984+ if (ret < 0)
22985+ goto error_alloc;
22986+
22987+ ret = pfm_ctx_permissions(ctx_flags);
22988+ if (ret < 0)
22989+ goto error_alloc;
22990+
22991+ /*
22992+ * we can use GFP_KERNEL and potentially sleep because we do
22993+ * not hold any lock at this point.
22994+ */
22995+ might_sleep();
22996+ ret = -ENOMEM;
22997+ ctx = kmem_cache_zalloc(pfm_ctx_cachep, GFP_KERNEL);
22998+ if (!ctx)
22999+ goto error_alloc;
23000+
23001+ PFM_DBG("alloc ctx @0x%p", ctx);
23002+
23003+ INIT_LIST_HEAD(&ctx->set_list);
23004+ spin_lock_init(&ctx->lock);
23005+ init_completion(&ctx->restart_complete);
23006+ init_waitqueue_head(&ctx->msgq_wait);
23007+
23008+ /*
23009+ * context is unloaded
23010+ */
23011+ ctx->state = PFM_CTX_UNLOADED;
23012+
23013+ /*
23014+ * initialization of context's flags
23015+ * must be done before pfm_find_set()
23016+ */
23017+ ctx->flags.block = (ctx_flags & PFM_FL_NOTIFY_BLOCK) ? 1 : 0;
23018+ ctx->flags.system = (ctx_flags & PFM_FL_SYSTEM_WIDE) ? 1: 0;
23019+ ctx->flags.no_msg = (ctx_flags & PFM_FL_OVFL_NO_MSG) ? 1: 0;
23020+ ctx->flags.ia64_v20_compat = mode == PFM_COMPAT ? 1 : 0;
23021+
23022+ ret = pfm_pmu_acquire(ctx);
23023+ if (ret)
23024+ goto error_file;
23025+ /*
23026+ * check if PMU is usable
23027+ */
23028+ if (!(ctx->regs.num_pmcs && ctx->regs.num_pmds)) {
23029+ PFM_DBG("no usable PMU registers");
23030+ ret = -EBUSY;
23031+ goto error_file;
23032+ }
23033+
23034+ /*
23035+ * link to format, must be done first for correct
23036+ * error handling in pfm_context_free()
23037+ */
23038+ ctx->smpl_fmt = fmt;
23039+
23040+ ret = -ENFILE;
23041+ fd = pfm_alloc_fd(&filp);
23042+ if (fd < 0)
23043+ goto error_file;
23044+
23045+ /*
23046+ * initialize arch-specific section
23047+ * must be done before fmt_init()
23048+ */
23049+ ret = pfm_arch_context_create(ctx, ctx_flags);
23050+ if (ret)
23051+ goto error_set;
23052+
23053+ ret = -ENOMEM;
23054+
23055+ /*
23056+ * add initial set
23057+ */
23058+ if (pfm_create_initial_set(ctx))
23059+ goto error_set;
23060+
23061+ /*
23062+ * does the user want to sample?
23063+ * must be done after pfm_pmu_acquire() because
23064+ * it needs ctx->regs
23065+ */
23066+ if (fmt) {
23067+ ret = pfm_setup_smpl_fmt(ctx, ctx_flags, fmt_arg, filp);
23068+ if (ret)
23069+ goto error_set;
23070+ }
23071+
23072+ filp->private_data = ctx;
23073+
23074+ ctx->last_act = PFM_INVALID_ACTIVATION;
23075+ ctx->last_cpu = -1;
23076+
23077+ /*
23078+ * initialize notification message queue
23079+ */
23080+ ctx->msgq_head = ctx->msgq_tail = 0;
23081+
23082+ PFM_DBG("flags=0x%x system=%d notify_block=%d no_msg=%d"
23083+ " use_fmt=%d ctx_fd=%d mode=%d",
23084+ ctx_flags,
23085+ ctx->flags.system,
23086+ ctx->flags.block,
23087+ ctx->flags.no_msg,
23088+ !!fmt,
23089+ fd, mode);
23090+
23091+ if (new_ctx)
23092+ *new_ctx = ctx;
23093+
23094+ /*
23095+ * we defer the fd_install until we are certain the call succeeded
23096+ * to ensure we do not have to undo its effect. Neither put_filp()
23097+ * nor put_unused_fd() undoes the effect of fd_install().
23098+ */
23099+ fd_install(fd, filp);
23100+
23101+ return fd;
23102+
23103+error_set:
23104+ put_filp(filp);
23105+ put_unused_fd(fd);
23106+error_file:
23107+ /*
23108+ * calls the right *_put() functions
23109+ * calls pfm_release_pmu()
23110+ */
23111+ pfm_free_context(ctx);
23112+ return ret;
23113+error_alloc:
23114+ pfm_pmu_conf_put();
23115+error_conf:
23116+ pfm_smpl_fmt_put(fmt);
23117+ return ret;
23118+}
23119+
23120+/**
23121+ * pfm_init_ctx -- initialize context SLAB
23122+ *
23123+ * called from pfm_init
23124+ */
23125+int __init pfm_init_ctx(void)
23126+{
23127+ pfm_ctx_cachep = kmem_cache_create("pfm_context",
23128+ sizeof(struct pfm_context)+PFM_ARCH_CTX_SIZE,
23129+ SLAB_HWCACHE_ALIGN, 0, NULL);
23130+ if (!pfm_ctx_cachep) {
23131+ PFM_ERR("cannot initialize context slab");
23132+ return -ENOMEM;
23133+ }
23134+ return 0;
23135+}
23136--- /dev/null
23137+++ b/perfmon/perfmon_ctxsw.c
23138@@ -0,0 +1,342 @@
23139+/*
23140+ * perfmon_ctxsw.c: perfmon2 context switch code
23141+ *
23142+ * This file implements the perfmon2 interface which
23143+ * provides access to the hardware performance counters
23144+ * of the host processor.
23145+ *
23146+ * The initial version of perfmon.c was written by
23147+ * Ganesh Venkitachalam, IBM Corp.
23148+ *
23149+ * Then it was modified for perfmon-1.x by Stephane Eranian and
23150+ * David Mosberger, Hewlett Packard Co.
23151+ *
23152+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
23153+ * by Stephane Eranian, Hewlett Packard Co.
23154+ *
23155+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
23156+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
23157+ * David Mosberger-Tang <davidm@hpl.hp.com>
23158+ *
23159+ * More information about perfmon available at:
23160+ * http://perfmon2.sf.net
23161+ *
23162+ * This program is free software; you can redistribute it and/or
23163+ * modify it under the terms of version 2 of the GNU General Public
23164+ * License as published by the Free Software Foundation.
23165+ *
23166+ * This program is distributed in the hope that it will be useful,
23167+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
23168+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23169+ * General Public License for more details.
23170+ *
23171+ * You should have received a copy of the GNU General Public License
23172+ * along with this program; if not, write to the Free Software
23173+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
23174+ * 02111-1307 USA
23175+ */
23176+#include <linux/kernel.h>
23177+#include <linux/perfmon_kern.h>
23178+#include "perfmon_priv.h"
23179+
23180+void pfm_save_pmds(struct pfm_context *ctx, struct pfm_event_set *set)
23181+{
23182+ u64 val, ovfl_mask;
23183+ u64 *used_pmds, *cnt_pmds;
23184+ u16 i, num;
23185+
23186+ ovfl_mask = pfm_pmu_conf->ovfl_mask;
23187+ num = set->nused_pmds;
23188+ cnt_pmds = ctx->regs.cnt_pmds;
23189+ used_pmds = set->used_pmds;
23190+
23191+ /*
23192+ * save HW PMD, for counters, reconstruct 64-bit value
23193+ */
23194+ for (i = 0; num; i++) {
23195+ if (test_bit(i, cast_ulp(used_pmds))) {
23196+ val = pfm_read_pmd(ctx, i);
23197+ if (likely(test_bit(i, cast_ulp(cnt_pmds))))
23198+ val = (set->pmds[i].value & ~ovfl_mask) |
23199+ (val & ovfl_mask);
23200+ set->pmds[i].value = val;
23201+ num--;
23202+ }
23203+ }
23204+ pfm_arch_clear_pmd_ovfl_cond(ctx, set);
23205+}
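
A stand-alone illustration (not part of the patch) of the 64-bit reconstruction performed in pfm_save_pmds() above: the software image keeps the upper bits, the live hardware register supplies the low counter_width bits. Values and the 47-bit width are made up for the example.

#include <stdio.h>

int main(void)
{
	unsigned long long ovfl_mask = (1ULL << 47) - 1;	/* low counter bits */
	unsigned long long soft = 0x0003000000000042ULL;	/* 64-bit software image */
	unsigned long long hw   = 0x0000000123456789ULL;	/* current hardware value */
	unsigned long long val;

	val = (soft & ~ovfl_mask) | (hw & ovfl_mask);
	printf("val=0x%llx\n", val);	/* 0x3000123456789: high bits from soft, low 47 from hw */
	return 0;
}
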
23206+
23207+/*
23208+ * interrupts are disabled (no preemption)
23209+ */
23210+void __pfm_ctxswin_thread(struct task_struct *task,
23211+ struct pfm_context *ctx, u64 now)
23212+{
23213+ u64 cur_act;
23214+ struct pfm_event_set *set;
23215+ int reload_pmcs, reload_pmds;
23216+ int mycpu, is_active;
23217+
23218+ mycpu = smp_processor_id();
23219+
23220+ cur_act = __get_cpu_var(pmu_activation_number);
23221+ /*
23222+ * we need to lock context because it could be accessed
23223+ * from another CPU. Normally the schedule() function
23224+ * has masked interrupts which should be enough to
23225+ * protect against PMU interrupts.
23226+ */
23227+ spin_lock(&ctx->lock);
23228+
23229+ is_active = pfm_arch_is_active(ctx);
23230+
23231+ set = ctx->active_set;
23232+
23233+ /*
23234+ * in case of zombie, we do not complete the ctxswin of the
23235+ * PMU, and we force a call to pfm_handle_work() to finish
23236+ * cleanup, i.e., free context + smpl_buff. The reason for
23237+ * deferring to pfm_handle_work() is that it is not possible
23238+ * to vfree() with interrupts disabled.
23239+ */
23240+ if (unlikely(ctx->state == PFM_CTX_ZOMBIE)) {
23241+ pfm_post_work(task, ctx, PFM_WORK_ZOMBIE);
23242+ goto done;
23243+ }
23244+
23245+ /*
23246+ * if we were the last user of the PMU on that CPU,
23247+ * then nothing to do except restore psr
23248+ */
23249+ if (ctx->last_cpu == mycpu && ctx->last_act == cur_act) {
23250+ /*
23251+ * check for forced reload conditions
23252+ */
23253+ reload_pmcs = set->priv_flags & PFM_SETFL_PRIV_MOD_PMCS;
23254+ reload_pmds = set->priv_flags & PFM_SETFL_PRIV_MOD_PMDS;
23255+ } else {
23256+#ifndef CONFIG_SMP
23257+ pfm_check_save_prev_ctx();
23258+#endif
23259+ reload_pmcs = 1;
23260+ reload_pmds = 1;
23261+ }
23262+ /* consumed */
23263+ set->priv_flags &= ~PFM_SETFL_PRIV_MOD_BOTH;
23264+
23265+ if (reload_pmds)
23266+ pfm_arch_restore_pmds(ctx, set);
23267+
23268+ /*
23269+ * need to check if we had an in-flight interrupt in
23270+ * pfm_ctxswout_thread(). If at least one bit set, then we must replay
23271+ * the interrupt to avoid losing some important performance data.
23272+ *
23273+ * npend_ovfls is cleared in interrupt handler
23274+ */
23275+ if (set->npend_ovfls) {
23276+ pfm_arch_resend_irq(ctx);
23277+ pfm_stats_inc(ovfl_intr_replay_count);
23278+ }
23279+
23280+ if (reload_pmcs)
23281+ pfm_arch_restore_pmcs(ctx, set);
23282+
23283+ /*
23284+ * record current activation for this context
23285+ */
23286+ __get_cpu_var(pmu_activation_number)++;
23287+ ctx->last_cpu = mycpu;
23288+ ctx->last_act = __get_cpu_var(pmu_activation_number);
23289+
23290+ /*
23291+ * establish new ownership.
23292+ */
23293+ pfm_set_pmu_owner(task, ctx);
23294+
23295+ pfm_arch_ctxswin_thread(task, ctx);
23296+ /*
23297+ * set->duration does not count when the context is in MASKED state.
23298+ * set->duration_start is reset in unmask_monitoring()
23299+ */
23300+ set->duration_start = now;
23301+
23302+ /*
23303+ * re-arm switch timeout, if necessary
23304+ * Timeout is active only if monitoring is active,
23305+ * i.e., LOADED + started
23306+ *
23307+ * We reload the remainder timeout or the full timeout.
23308+ * Remainder is recorded on context switch out or in
23309+ * pfm_load_context()
23310+ */
23311+ if (ctx->state == PFM_CTX_LOADED
23312+ && (set->flags & PFM_SETFL_TIME_SWITCH) && is_active) {
23313+ pfm_restart_timer(ctx, set);
23314+ /* careful here as pfm_restart_timer may switch sets */
23315+ }
23316+done:
23317+ spin_unlock(&ctx->lock);
23318+}
23319+
23320+/*
23321+ * interrupts are masked, runqueue lock is held.
23322+ *
23323+ * In UP, we simply stop monitoring and leave the state
23324+ * in place, i.e., lazy save
23325+ */
23326+void __pfm_ctxswout_thread(struct task_struct *task,
23327+ struct pfm_context *ctx, u64 now)
23328+{
23329+ struct pfm_event_set *set;
23330+ int need_save_pmds, is_active;
23331+
23332+ /*
23333+ * we need to lock context because it could be accessed
23334+ * from another CPU. Normally the schedule() function
23335+ * has masked interrupts which should be enough to
23336+ * protect against PMU interrupts.
23337+ */
23338+
23339+ spin_lock(&ctx->lock);
23340+
23341+ is_active = pfm_arch_is_active(ctx);
23342+ set = ctx->active_set;
23343+
23344+ /*
23345+ * stop monitoring and
23346+ * collect pending overflow information
23347+ * needed on ctxswin. We cannot afford to lose
23348+ * a PMU interrupt.
23349+ */
23350+ need_save_pmds = pfm_arch_ctxswout_thread(task, ctx);
23351+
23352+ if (ctx->state == PFM_CTX_LOADED) {
23353+ /*
23354+ * accumulate only when set is actively monitoring,
23355+ */
23356+ set->duration += now - set->duration_start;
23357+
23358+ /*
23359+ * record remaining timeout
23360+ * reload in pfm_ctxsw_in()
23361+ */
23362+ if (is_active && (set->flags & PFM_SETFL_TIME_SWITCH)) {
23363+ struct hrtimer *h = NULL;
23364+ h = &__get_cpu_var(pfm_hrtimer);
23365+ hrtimer_cancel(h);
23366+ set->hrtimer_rem = hrtimer_get_remaining(h);
23367+ PFM_DBG_ovfl("hrtimer=%lld",
23368+ (long long)set->hrtimer_rem.tv64);
23369+ }
23370+ }
23371+
23372+#ifdef CONFIG_SMP
23373+ /*
23374+ * in SMP, release ownership of this PMU.
23375+ * PMU interrupts are masked, so nothing
23376+ * can happen.
23377+ */
23378+ pfm_set_pmu_owner(NULL, NULL);
23379+
23380+ /*
23381+ * On some architectures, it is necessary to read the
23382+ * PMD registers to check for pending overflow in
23383+ * pfm_arch_ctxswout_thread(). In that case, saving of
23384+ * the PMDs may be done there and not here.
23385+ */
23386+ if (need_save_pmds)
23387+ pfm_save_pmds(ctx, set);
23388+#endif
23389+ spin_unlock(&ctx->lock);
23390+}
23391+
23392+/*
23393+ *
23394+ */
23395+static void __pfm_ctxswout_sys(struct task_struct *prev,
23396+ struct task_struct *next)
23397+{
23398+ struct pfm_context *ctx;
23399+
23400+ ctx = __get_cpu_var(pmu_ctx);
23401+ BUG_ON(!ctx);
23402+
23403+ /*
23404+ * propagate TIF_PERFMON_CTXSW to ensure that:
23405+ * - previous task has TIF_PERFMON_CTXSW cleared, in case it is
23406+ * scheduled onto another CPU where there is syswide monitoring
23407+ * - next task has TIF_PERFMON_CTXSW set to ensure it will come back
23408+ * here when context switched out
23409+ */
23410+ clear_tsk_thread_flag(prev, TIF_PERFMON_CTXSW);
23411+ set_tsk_thread_flag(next, TIF_PERFMON_CTXSW);
23412+
23413+ /*
23414+ * nothing to do until actually started
23415+ * XXX: assumes no means to start from user level
23416+ */
23417+ if (!ctx->flags.started)
23418+ return;
23419+
23420+ pfm_arch_ctxswout_sys(prev, ctx);
23421+}
23422+
23423+/*
23424+ *
23425+ */
23426+static void __pfm_ctxswin_sys(struct task_struct *prev,
23427+ struct task_struct *next)
23428+{
23429+ struct pfm_context *ctx;
23430+
23431+ ctx = __get_cpu_var(pmu_ctx);
23432+ BUG_ON(!ctx);
23433+
23434+ /*
23435+ * nothing to do until actually started
23436+ * XXX: assumes no means to start from user level
23437+ */
23438+ if (!ctx->flags.started)
23439+ return;
23440+
23441+ pfm_arch_ctxswin_sys(next, ctx);
23442+}
23443+
23444+void pfm_ctxsw_out(struct task_struct *prev,
23445+ struct task_struct *next)
23446+{
23447+ struct pfm_context *ctxp;
23448+ u64 now;
23449+
23450+ now = sched_clock();
23451+
23452+ ctxp = prev->pfm_context;
23453+
23454+ if (ctxp)
23455+ __pfm_ctxswout_thread(prev, ctxp, now);
23456+ else
23457+ __pfm_ctxswout_sys(prev, next);
23458+
23459+ pfm_stats_inc(ctxswout_count);
23460+ pfm_stats_add(ctxswout_ns, sched_clock() - now);
23461+}
23462+
23463+void pfm_ctxsw_in(struct task_struct *prev,
23464+ struct task_struct *next)
23465+{
23466+ struct pfm_context *ctxn;
23467+ u64 now;
23468+
23469+ now = sched_clock();
23470+
23471+ ctxn = next->pfm_context;
23472+
23473+ if (ctxn)
23474+ __pfm_ctxswin_thread(next, ctxn, now);
23475+ else
23476+ __pfm_ctxswin_sys(prev, next);
23477+
23478+ pfm_stats_inc(ctxswin_count);
23479+ pfm_stats_add(ctxswin_ns, sched_clock() - now);
23480+}
23481--- /dev/null
23482+++ b/perfmon/perfmon_debugfs.c
23483@@ -0,0 +1,168 @@
23484+/*
23485+ * perfmon_debugfs.c: perfmon2 statistics interface to debugfs
23486+ *
23487+ * This file implements the perfmon2 interface which
23488+ * provides access to the hardware performance counters
23489+ * of the host processor.
23490+ *
23491+ * The initial version of perfmon.c was written by
23492+ * Ganesh Venkitachalam, IBM Corp.
23493+ *
23494+ * Then it was modified for perfmon-1.x by Stephane Eranian and
23495+ * David Mosberger, Hewlett Packard Co.
23496+ *
23497+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
23498+ * by Stephane Eranian, Hewlett Packard Co.
23499+ *
23500+ * Copyright (c) 2007 Hewlett-Packard Development Company, L.P.
23501+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
23502+ *
23503+ * More information about perfmon available at:
23504+ * http://perfmon2.sf.net
23505+ *
23506+ * This program is free software; you can redistribute it and/or
23507+ * modify it under the terms of version 2 of the GNU General Public
23508+ * License as published by the Free Software Foundation.
23509+ *
23510+ * This program is distributed in the hope that it will be useful,
23511+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
23512+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23513+ * General Public License for more details.
23514+ *
23515+ * You should have received a copy of the GNU General Public License
23516+ * along with this program; if not, write to the Free Software
23517+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
23518+ * 02111-1307 USA
23519+ */
23520+#include <linux/kernel.h>
23521+#include <linux/debugfs.h>
23522+#include <linux/perfmon_kern.h>
23523+
23524+/*
23525+ * to make the statistics visible to user space:
23526+ * $ mount -t debugfs none /mnt
23527+ * $ cd /mnt/perfmon
23528+ * then choose a CPU subdir
23529+ */
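+/*
+ * illustrative example: cat /mnt/perfmon/cpu0/ctxswout_count
+ * each statistic listed below is exposed as a read-only u64 file
+ * in the per-CPU subdirectory
+ */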
23530+DECLARE_PER_CPU(struct pfm_stats, pfm_stats);
23531+
23532+static struct dentry *pfm_debugfs_dir;
23533+
23534+void pfm_reset_stats(int cpu)
23535+{
23536+ struct pfm_stats *st;
23537+ unsigned long flags;
23538+
23539+ st = &per_cpu(pfm_stats, cpu);
23540+
23541+ local_irq_save(flags);
23542+ memset(st->v, 0, sizeof(st->v));
23543+ local_irq_restore(flags);
23544+}
23545+
23546+static const char *pfm_stats_strs[] = {
23547+ "ovfl_intr_all_count",
23548+ "ovfl_intr_ns",
23549+ "ovfl_intr_spurious_count",
23550+ "ovfl_intr_replay_count",
23551+ "ovfl_intr_regular_count",
23552+ "handle_work_count",
23553+ "ovfl_notify_count",
23554+ "reset_pmds_count",
23555+ "pfm_restart_count",
23556+ "fmt_handler_calls",
23557+ "fmt_handler_ns",
23558+ "set_switch_count",
23559+ "set_switch_ns",
23560+ "set_switch_exp",
23561+ "ctxswin_count",
23562+ "ctxswin_ns",
23563+ "handle_timeout_count",
23564+ "ovfl_intr_nmi_count",
23565+ "ctxswout_count",
23566+ "ctxswout_ns",
23567+};
23568+#define PFM_NUM_STRS ARRAY_SIZE(pfm_stats_strs)
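+/*
+ * the order of the strings above must match the index order used by
+ * pfm_stats_inc()/pfm_stats_add() (e.g., ctxswout_count, ctxswout_ns);
+ * pfm_debugfs_add_cpu() below checks that PFM_NUM_STRS == PFM_NUM_STATS
+ */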
23569+
23570+void pfm_debugfs_del_cpu(int cpu)
23571+{
23572+ struct pfm_stats *st;
23573+ int i;
23574+
23575+ st = &per_cpu(pfm_stats, cpu);
23576+
23577+ for (i = 0; i < PFM_NUM_STATS; i++) {
23578+ if (st->dirs[i])
23579+ debugfs_remove(st->dirs[i]);
23580+ st->dirs[i] = NULL;
23581+ }
23582+ if (st->cpu_dir)
23583+ debugfs_remove(st->cpu_dir);
23584+ st->cpu_dir = NULL;
23585+}
23586+
23587+int pfm_debugfs_add_cpu(int cpu)
23588+{
23589+ struct pfm_stats *st;
23590+ int i;
23591+
23592+ /*
23593+ * sanity check between stats names and the number
23594+ * of entries in the pfm_stats value array.
23595+ */
23596+ if (PFM_NUM_STRS != PFM_NUM_STATS) {
23597+ PFM_ERR("PFM_NUM_STRS != PFM_NUM_STATS error");
23598+ return -1;
23599+ }
23600+
23601+ st = &per_cpu(pfm_stats, cpu);
23602+ sprintf(st->cpu_name, "cpu%d", cpu);
23603+
23604+ st->cpu_dir = debugfs_create_dir(st->cpu_name, pfm_debugfs_dir);
23605+ if (!st->cpu_dir)
23606+ return -1;
23607+
23608+ for (i = 0; i < PFM_NUM_STATS; i++) {
23609+ st->dirs[i] = debugfs_create_u64(pfm_stats_strs[i],
23610+ S_IRUGO,
23611+ st->cpu_dir,
23612+ &st->v[i]);
23613+ if (!st->dirs[i])
23614+ goto error;
23615+ }
23616+ pfm_reset_stats(cpu);
23617+ return 0;
23618+error:
23619+ while (i >= 0) {
23620+ debugfs_remove(st->dirs[i]);
23621+ i--;
23622+ }
23623+ debugfs_remove(st->cpu_dir);
23624+ return -1;
23625+}
23626+
23627+/*
23628+ * called once from pfm_init()
23629+ */
23630+int __init pfm_init_debugfs(void)
23631+{
23632+ int cpu1, cpu2, ret;
23633+
23634+ pfm_debugfs_dir = debugfs_create_dir("perfmon", NULL);
23635+ if (!pfm_debugfs_dir)
23636+ return -1;
23637+
23638+ for_each_online_cpu(cpu1) {
23639+ ret = pfm_debugfs_add_cpu(cpu1);
23640+ if (ret)
23641+ goto error;
23642+ }
23643+ return 0;
23644+error:
23645+ for_each_online_cpu(cpu2) {
23646+ if (cpu2 == cpu1)
23647+ break;
23648+ pfm_debugfs_del_cpu(cpu2);
23649+ }
23650+ return -1;
23651+}
23652--- /dev/null
23653+++ b/perfmon/perfmon_dfl_smpl.c
23654@@ -0,0 +1,298 @@
23655+/*
23656+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
23657+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
23658+ *
23659+ * This file implements the new default sampling buffer format
23660+ * for the perfmon2 subsystem.
23661+ *
23662+ * This program is free software; you can redistribute it and/or
23663+ * modify it under the terms of version 2 of the GNU General Public
23664+ * License as published by the Free Software Foundation.
23665+ *
23666+ * This program is distributed in the hope that it will be useful,
23667+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
23668+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23669+ * General Public License for more details.
23670+ *
23671+ * You should have received a copy of the GNU General Public License
23672+ * along with this program; if not, write to the Free Software
23673+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
23674+ * 02111-1307 USA
23675+ */
23676+#include <linux/kernel.h>
23677+#include <linux/types.h>
23678+#include <linux/module.h>
23679+#include <linux/init.h>
23680+#include <linux/smp.h>
23681+
23682+#include <linux/perfmon_kern.h>
23683+#include <linux/perfmon_dfl_smpl.h>
23684+
23685+MODULE_AUTHOR("Stephane Eranian <eranian@hpl.hp.com>");
23686+MODULE_DESCRIPTION("new perfmon default sampling format");
23687+MODULE_LICENSE("GPL");
23688+
23689+static int pfm_dfl_fmt_validate(u32 ctx_flags, u16 npmds, void *data)
23690+{
23691+ struct pfm_dfl_smpl_arg *arg = data;
23692+ u64 min_buf_size;
23693+
23694+ if (data == NULL) {
23695+ PFM_DBG("no argument passed");
23696+ return -EINVAL;
23697+ }
23698+
23699+ /*
23700+	 * sanity check in case size_t is smaller than u64
23701+ */
23702+#if BITS_PER_LONG == 4
23703+#define MAX_SIZE_T (1ULL<<(sizeof(size_t)<<3))
23704+ if (sizeof(size_t) < sizeof(arg->buf_size)) {
23705+ if (arg->buf_size >= MAX_SIZE_T)
23706+ return -ETOOBIG;
23707+ }
23708+#endif
23709+
23710+ /*
23711+ * compute min buf size. npmds is the maximum number
23712+ * of implemented PMD registers.
23713+ */
23714+ min_buf_size = sizeof(struct pfm_dfl_smpl_hdr)
23715+ + (sizeof(struct pfm_dfl_smpl_entry) + (npmds*sizeof(u64)));
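+	/*
+	 * illustrative example: with npmds = 4, the buffer must hold at
+	 * least sizeof(hdr) + sizeof(entry) + 4 * sizeof(u64) bytes
+	 */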
23716+
23717+ PFM_DBG("validate ctx_flags=0x%x flags=0x%x npmds=%u "
23718+ "min_buf_size=%llu buf_size=%llu\n",
23719+ ctx_flags,
23720+ arg->buf_flags,
23721+ npmds,
23722+ (unsigned long long)min_buf_size,
23723+ (unsigned long long)arg->buf_size);
23724+
23725+ /*
23726+ * must hold at least the buffer header + one minimally sized entry
23727+ */
23728+ if (arg->buf_size < min_buf_size)
23729+ return -EINVAL;
23730+
23731+ return 0;
23732+}
23733+
23734+static int pfm_dfl_fmt_get_size(u32 flags, void *data, size_t *size)
23735+{
23736+ struct pfm_dfl_smpl_arg *arg = data;
23737+
23738+ /*
23739+ * size has been validated in default_validate
23740+	 * we can never lose bits from buf_size.
23741+ */
23742+ *size = (size_t)arg->buf_size;
23743+
23744+ return 0;
23745+}
23746+
23747+static int pfm_dfl_fmt_init(struct pfm_context *ctx, void *buf, u32 ctx_flags,
23748+ u16 npmds, void *data)
23749+{
23750+ struct pfm_dfl_smpl_hdr *hdr;
23751+ struct pfm_dfl_smpl_arg *arg = data;
23752+
23753+ hdr = buf;
23754+
23755+ hdr->hdr_version = PFM_DFL_SMPL_VERSION;
23756+ hdr->hdr_buf_size = arg->buf_size;
23757+ hdr->hdr_buf_flags = arg->buf_flags;
23758+ hdr->hdr_cur_offs = sizeof(*hdr);
23759+ hdr->hdr_overflows = 0;
23760+ hdr->hdr_count = 0;
23761+ hdr->hdr_min_buf_space = sizeof(struct pfm_dfl_smpl_entry) + (npmds*sizeof(u64));
23762+ /*
23763+ * due to cache aliasing, it may be necessary to flush the cache
23764+ * on certain architectures (e.g., MIPS)
23765+ */
23766+ pfm_cacheflush(hdr, sizeof(*hdr));
23767+
23768+ PFM_DBG("buffer=%p buf_size=%llu hdr_size=%zu hdr_version=%u.%u "
23769+ "min_space=%llu npmds=%u",
23770+ buf,
23771+ (unsigned long long)hdr->hdr_buf_size,
23772+ sizeof(*hdr),
23773+ PFM_VERSION_MAJOR(hdr->hdr_version),
23774+ PFM_VERSION_MINOR(hdr->hdr_version),
23775+ (unsigned long long)hdr->hdr_min_buf_space,
23776+ npmds);
23777+
23778+ return 0;
23779+}
23780+
23781+/*
23782+ * called from pfm_overflow_handler() to record a new sample
23783+ *
23784+ * context is locked, interrupts are disabled (no preemption)
23785+ */
23786+static int pfm_dfl_fmt_handler(struct pfm_context *ctx,
23787+ unsigned long ip, u64 tstamp, void *data)
23788+{
23789+ struct pfm_dfl_smpl_hdr *hdr;
23790+ struct pfm_dfl_smpl_entry *ent;
23791+ struct pfm_ovfl_arg *arg;
23792+ void *cur, *last;
23793+ u64 *e;
23794+ size_t entry_size, min_size;
23795+ u16 npmds, i;
23796+ u16 ovfl_pmd;
23797+ void *buf;
23798+
23799+ hdr = ctx->smpl_addr;
23800+ arg = &ctx->ovfl_arg;
23801+
23802+ buf = hdr;
23803+ cur = buf+hdr->hdr_cur_offs;
23804+ last = buf+hdr->hdr_buf_size;
23805+ ovfl_pmd = arg->ovfl_pmd;
23806+ min_size = hdr->hdr_min_buf_space;
23807+
23808+ /*
23809+ * precheck for sanity
23810+ */
23811+ if ((last - cur) < min_size)
23812+ goto full;
23813+
23814+ npmds = arg->num_smpl_pmds;
23815+
23816+ ent = (struct pfm_dfl_smpl_entry *)cur;
23817+
23818+ entry_size = sizeof(*ent) + (npmds << 3);
23819+
23820+ /* position for first pmd */
23821+ e = (u64 *)(ent+1);
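+	/*
+	 * record layout in the buffer: a pfm_dfl_smpl_entry header immediately
+	 * followed by npmds 64-bit PMD values (npmds << 3 == npmds * sizeof(u64))
+	 */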
23822+
23823+ hdr->hdr_count++;
23824+
23825+ PFM_DBG_ovfl("count=%llu cur=%p last=%p free_bytes=%zu ovfl_pmd=%d "
23826+ "npmds=%u",
23827+ (unsigned long long)hdr->hdr_count,
23828+ cur, last,
23829+ (last-cur),
23830+ ovfl_pmd,
23831+ npmds);
23832+
23833+ /*
23834+ * current = task running at the time of the overflow.
23835+ *
23836+ * per-task mode:
23837+ * - this is usually the task being monitored.
23838+ * Under certain conditions, it might be a different task
23839+ *
23840+ * system-wide:
23841+ * - this is not necessarily the task controlling the session
23842+ */
23843+ ent->pid = current->pid;
23844+ ent->ovfl_pmd = ovfl_pmd;
23845+ ent->last_reset_val = arg->pmd_last_reset;
23846+
23847+ /*
23848+ * where did the fault happen (includes slot number)
23849+ */
23850+ ent->ip = ip;
23851+
23852+ ent->tstamp = tstamp;
23853+ ent->cpu = smp_processor_id();
23854+ ent->set = arg->active_set;
23855+ ent->tgid = current->tgid;
23856+
23857+ /*
23858+ * selectively store PMDs in increasing index number
23859+ */
23860+ if (npmds) {
23861+ u64 *val = arg->smpl_pmds_values;
23862+ for (i = 0; i < npmds; i++)
23863+ *e++ = *val++;
23864+ }
23865+
23866+ /*
23867+ * update position for next entry
23868+ */
23869+ hdr->hdr_cur_offs += entry_size;
23870+ cur += entry_size;
23871+
23872+ pfm_cacheflush(hdr, sizeof(*hdr));
23873+ pfm_cacheflush(ent, entry_size);
23874+
23875+ /*
23876+ * post check to avoid losing the last sample
23877+ */
23878+ if ((last - cur) < min_size)
23879+ goto full;
23880+
23881+ /* reset before returning from interrupt handler */
23882+ arg->ovfl_ctrl = PFM_OVFL_CTRL_RESET;
23883+
23884+ return 0;
23885+full:
23886+ PFM_DBG_ovfl("sampling buffer full free=%zu, count=%llu",
23887+ last-cur,
23888+ (unsigned long long)hdr->hdr_count);
23889+
23890+ /*
23891+ * increment number of buffer overflows.
23892+	 * important to detect duplicate sets of samples.
23893+ */
23894+ hdr->hdr_overflows++;
23895+
23896+ /*
23897+ * request notification and masking of monitoring.
23898+ * Notification is still subject to the overflowed
23899+ * register having the FL_NOTIFY flag set.
23900+ */
23901+ arg->ovfl_ctrl = PFM_OVFL_CTRL_NOTIFY | PFM_OVFL_CTRL_MASK;
23902+
23903+ return -ENOBUFS; /* we are full, sorry */
23904+}
23905+
23906+static int pfm_dfl_fmt_restart(int is_active, u32 *ovfl_ctrl, void *buf)
23907+{
23908+ struct pfm_dfl_smpl_hdr *hdr;
23909+
23910+ hdr = buf;
23911+
23912+ hdr->hdr_count = 0;
23913+ hdr->hdr_cur_offs = sizeof(*hdr);
23914+
23915+ pfm_cacheflush(hdr, sizeof(*hdr));
23916+
23917+ *ovfl_ctrl = PFM_OVFL_CTRL_RESET;
23918+
23919+ return 0;
23920+}
23921+
23922+static int pfm_dfl_fmt_exit(void *buf)
23923+{
23924+ return 0;
23925+}
23926+
23927+static struct pfm_smpl_fmt dfl_fmt = {
23928+ .fmt_name = "default",
23929+ .fmt_version = 0x10000,
23930+ .fmt_arg_size = sizeof(struct pfm_dfl_smpl_arg),
23931+ .fmt_validate = pfm_dfl_fmt_validate,
23932+ .fmt_getsize = pfm_dfl_fmt_get_size,
23933+ .fmt_init = pfm_dfl_fmt_init,
23934+ .fmt_handler = pfm_dfl_fmt_handler,
23935+ .fmt_restart = pfm_dfl_fmt_restart,
23936+ .fmt_exit = pfm_dfl_fmt_exit,
23937+ .fmt_flags = PFM_FMT_BUILTIN_FLAG,
23938+ .owner = THIS_MODULE
23939+};
23940+
23941+static int pfm_dfl_fmt_init_module(void)
23942+{
23943+ return pfm_fmt_register(&dfl_fmt);
23944+}
23945+
23946+static void pfm_dfl_fmt_cleanup_module(void)
23947+{
23948+ pfm_fmt_unregister(&dfl_fmt);
23949+}
23950+
23951+module_init(pfm_dfl_fmt_init_module);
23952+module_exit(pfm_dfl_fmt_cleanup_module);
23953--- /dev/null
23954+++ b/perfmon/perfmon_file.c
23955@@ -0,0 +1,751 @@
23956+/*
23957+ * perfmon_file.c: perfmon2 file input/output functions
23958+ *
23959+ * This file implements the perfmon2 interface which
23960+ * provides access to the hardware performance counters
23961+ * of the host processor.
23962+ *
23963+ * The initial version of perfmon.c was written by
23964+ * Ganesh Venkitachalam, IBM Corp.
23965+ *
23966+ * Then it was modified for perfmon-1.x by Stephane Eranian and
23967+ * David Mosberger, Hewlett Packard Co.
23968+ *
23969+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
23970+ * by Stephane Eranian, Hewlett Packard Co.
23971+ *
23972+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
23973+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
23974+ * David Mosberger-Tang <davidm@hpl.hp.com>
23975+ *
23976+ * More information about perfmon available at:
23977+ * http://perfmon2.sf.net
23978+ *
23979+ * This program is free software; you can redistribute it and/or
23980+ * modify it under the terms of version 2 of the GNU General Public
23981+ * License as published by the Free Software Foundation.
23982+ *
23983+ * This program is distributed in the hope that it will be useful,
23984+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
23985+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23986+ * General Public License for more details.
23987+ *
23988+ * You should have received a copy of the GNU General Public License
23989+ * along with this program; if not, write to the Free Software
23990+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
23991+ * 02111-1307 USA
23992+ */
23993+#include <linux/kernel.h>
23994+#include <linux/module.h>
23995+#include <linux/file.h>
23996+#include <linux/poll.h>
23997+#include <linux/vfs.h>
23998+#include <linux/pagemap.h>
23999+#include <linux/mount.h>
24000+#include <linux/perfmon_kern.h>
24001+#include "perfmon_priv.h"
24002+
24003+#define PFMFS_MAGIC 0xa0b4d889 /* perfmon filesystem magic number */
24004+
24005+struct pfm_controls pfm_controls = {
24006+ .sys_group = PFM_GROUP_PERM_ANY,
24007+ .task_group = PFM_GROUP_PERM_ANY,
24008+ .arg_mem_max = PAGE_SIZE,
24009+ .smpl_buffer_mem_max = ~0,
24010+};
24011+EXPORT_SYMBOL(pfm_controls);
24012+
24013+static int __init enable_debug(char *str)
24014+{
24015+ pfm_controls.debug = 1;
24016+ PFM_INFO("debug output enabled\n");
24017+ return 1;
24018+}
24019+__setup("perfmon_debug", enable_debug);
24020+
24021+static int pfmfs_delete_dentry(struct dentry *dentry)
24022+{
24023+ return 1;
24024+}
24025+
24026+static struct dentry_operations pfmfs_dentry_operations = {
24027+ .d_delete = pfmfs_delete_dentry,
24028+};
24029+
24030+int pfm_buf_map_pagefault(struct vm_area_struct *vma, struct vm_fault *vmf)
24031+{
24032+ void *kaddr;
24033+ unsigned long address;
24034+ struct pfm_context *ctx;
24035+ size_t size;
24036+
24037+ address = (unsigned long)vmf->virtual_address;
24038+
24039+ ctx = vma->vm_private_data;
24040+ if (ctx == NULL) {
24041+ PFM_DBG("no ctx");
24042+ return VM_FAULT_SIGBUS;
24043+ }
24044+ /*
24045+	 * size available to user (may be different from real_smpl_size)
24046+ */
24047+ size = ctx->smpl_size;
24048+
24049+ if ((address < vma->vm_start) ||
24050+ (address >= (vma->vm_start + size)))
24051+ return VM_FAULT_SIGBUS;
24052+
24053+ kaddr = ctx->smpl_addr + (address - vma->vm_start);
24054+
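+	/*
+	 * the buffer is assumed to be vmalloc'ed memory (vmalloc_to_page()
+	 * below relies on it), so pages are looked up and referenced one
+	 * at a time as they are faulted in
+	 */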
24055+ vmf->page = vmalloc_to_page(kaddr);
24056+ get_page(vmf->page);
24057+
24058+ PFM_DBG("[%d] start=%p ref_count=%d",
24059+ current->pid,
24060+ kaddr, page_count(vmf->page));
24061+
24062+ return 0;
24063+}
24064+
24065+/*
24066+ * we need to determine whether or not we are closing the last reference
24067+ * to the file and thus are going to end up in pfm_close() which eventually
24068+ * calls pfm_release_buf_space(). In that function, we update the accounting
24069+ * for locked_vm given that we are actually freeing the sampling buffer. The
24070+ * issue is that there are multiple paths leading to pfm_release_buf_space(),
24071+ * from exit(), munmap(), close(). The path coming from munmap() is problematic
24072+ * because do_munmap() grabs mmap_sem in write-mode which is also what
24073+ * pfm_release_buf_space does. To avoid deadlock, we need to determine where
24074+ * we are calling from and skip the locking. The vm_ops->close() callback
24075+ * is invoked for each remove_vma() independently of the number of references
24076+ * left on the file descriptor, therefore a simple reference counter does not
24077+ * work. We need to determine if this is the last call, and then set a flag
24078+ * to skip the locking.
24079+ */
24080+static void pfm_buf_map_close(struct vm_area_struct *vma)
24081+{
24082+ struct file *file;
24083+ struct pfm_context *ctx;
24084+
24085+ file = vma->vm_file;
24086+ ctx = vma->vm_private_data;
24087+
24088+ /*
24089+ * if file is going to close, then pfm_close() will
24090+ * be called, do not lock in pfm_release_buf
24091+ */
24092+ if (atomic_read(&file->f_count) == 1)
24093+ ctx->flags.mmap_nlock = 1;
24094+}
24095+
24096+/*
24097+ * the close callback does not free the buffer because the locked
24098+ * memory accounting must be done when the actual buffer
24099+ * is freed. munmap() does not free the pages backing the vma
24100+ * because they may still be in use by the PMU interrupt handler.
24101+ */
24102+struct vm_operations_struct pfm_buf_map_vm_ops = {
24103+ .fault = pfm_buf_map_pagefault,
24104+ .close = pfm_buf_map_close
24105+};
24106+
24107+static int pfm_mmap_buffer(struct pfm_context *ctx, struct vm_area_struct *vma,
24108+ size_t size)
24109+{
24110+ if (ctx->smpl_addr == NULL) {
24111+ PFM_DBG("no sampling buffer to map");
24112+ return -EINVAL;
24113+ }
24114+
24115+ if (size > ctx->smpl_size) {
24116+		PFM_DBG("mmap size=%zu > actual buf size=%zu",
24117+ size,
24118+ ctx->smpl_size);
24119+ return -EINVAL;
24120+ }
24121+
24122+ vma->vm_ops = &pfm_buf_map_vm_ops;
24123+ vma->vm_private_data = ctx;
24124+
24125+ return 0;
24126+}
24127+
24128+static int pfm_mmap(struct file *file, struct vm_area_struct *vma)
24129+{
24130+ size_t size;
24131+ struct pfm_context *ctx;
24132+ unsigned long flags;
24133+ int ret;
24134+
24135+ PFM_DBG("pfm_file_ops");
24136+
24137+ ctx = file->private_data;
24138+ size = (vma->vm_end - vma->vm_start);
24139+
24140+ if (ctx == NULL)
24141+ return -EINVAL;
24142+
24143+ ret = -EINVAL;
24144+
24145+ spin_lock_irqsave(&ctx->lock, flags);
24146+
24147+ if (vma->vm_flags & VM_WRITE) {
24148+ PFM_DBG("cannot map buffer for writing");
24149+ goto done;
24150+ }
24151+
24152+ PFM_DBG("vm_pgoff=%lu size=%zu vm_start=0x%lx",
24153+ vma->vm_pgoff,
24154+ size,
24155+ vma->vm_start);
24156+
24157+ ret = pfm_mmap_buffer(ctx, vma, size);
24158+ if (ret == 0)
24159+ vma->vm_flags |= VM_RESERVED;
24160+
24161+ PFM_DBG("ret=%d vma_flags=0x%lx vma_start=0x%lx vma_size=%lu",
24162+ ret,
24163+ vma->vm_flags,
24164+ vma->vm_start,
24165+ vma->vm_end-vma->vm_start);
24166+done:
24167+ spin_unlock_irqrestore(&ctx->lock, flags);
24168+
24169+ return ret;
24170+}
24171+
24172+/*
24173+ * Extract one message from queue.
24174+ *
24175+ * return:
24176+ * 	-EAGAIN: when non-blocking and nothing is in the queue.
24177+ * -ERESTARTSYS: when blocking and signal is pending
24178+ * Otherwise returns size of message (sizeof(pfarg_msg))
24179+ */
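+/*
+ * typical user-level usage (illustrative, not part of this patch):
+ *   union pfarg_msg msg;
+ *   n = read(pfd, &msg, sizeof(msg));   where pfd is the perfmon context
+ *   file descriptor and size must be >= sizeof(union pfarg_msg)
+ */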
24180+ssize_t __pfm_read(struct pfm_context *ctx, union pfarg_msg *msg_buf, int non_block)
24181+{
24182+ ssize_t ret = 0;
24183+ unsigned long flags;
24184+ DECLARE_WAITQUEUE(wait, current);
24185+
24186+ /*
24187+	 * we must mask interrupts to avoid a race condition
24188+ * with the PMU interrupt handler.
24189+ */
24190+ spin_lock_irqsave(&ctx->lock, flags);
24191+
24192+ while (pfm_msgq_is_empty(ctx)) {
24193+
24194+ /*
24195+ * handle non-blocking reads
24196+ * return -EAGAIN
24197+ */
24198+ ret = -EAGAIN;
24199+ if (non_block)
24200+ break;
24201+
24202+ add_wait_queue(&ctx->msgq_wait, &wait);
24203+ set_current_state(TASK_INTERRUPTIBLE);
24204+
24205+ spin_unlock_irqrestore(&ctx->lock, flags);
24206+
24207+ schedule();
24208+
24209+ /*
24210+ * during this window, another thread may call
24211+ * pfm_read() and steal our message
24212+ */
24213+
24214+ spin_lock_irqsave(&ctx->lock, flags);
24215+
24216+ remove_wait_queue(&ctx->msgq_wait, &wait);
24217+ set_current_state(TASK_RUNNING);
24218+
24219+ /*
24220+ * check for pending signals
24221+ * return -ERESTARTSYS
24222+ */
24223+ ret = -ERESTARTSYS;
24224+ if (signal_pending(current))
24225+ break;
24226+
24227+ /*
24228+ * we may have a message
24229+ */
24230+ ret = 0;
24231+ }
24232+
24233+ /*
24234+ * extract message
24235+ */
24236+ if (ret == 0) {
24237+ /*
24238+ * copy the oldest message into msg_buf.
24239+ * We cannot directly call copy_to_user()
24240+		 * because interrupts are masked. This is done
24241+ * in the caller
24242+ */
24243+ pfm_get_next_msg(ctx, msg_buf);
24244+
24245+ ret = sizeof(*msg_buf);
24246+
24247+ PFM_DBG("extracted type=%d", msg_buf->type);
24248+ }
24249+
24250+ spin_unlock_irqrestore(&ctx->lock, flags);
24251+
24252+ PFM_DBG("blocking=%d ret=%zd", non_block, ret);
24253+
24254+ return ret;
24255+}
24256+
24257+static ssize_t pfm_read(struct file *filp, char __user *buf, size_t size,
24258+ loff_t *ppos)
24259+{
24260+ struct pfm_context *ctx;
24261+ union pfarg_msg msg_buf;
24262+ int non_block, ret;
24263+
24264+ PFM_DBG_ovfl("buf=%p size=%zu", buf, size);
24265+
24266+ ctx = filp->private_data;
24267+ if (ctx == NULL) {
24268+ PFM_ERR("no ctx for pfm_read");
24269+ return -EINVAL;
24270+ }
24271+
24272+ non_block = filp->f_flags & O_NONBLOCK;
24273+
24274+#ifdef CONFIG_IA64_PERFMON_COMPAT
24275+ /*
24276+ * detect IA-64 v2.0 context read (message size is different)
24277+ * nops on all other architectures
24278+ */
24279+ if (unlikely(ctx->flags.ia64_v20_compat))
24280+ return pfm_arch_compat_read(ctx, buf, non_block, size);
24281+#endif
24282+ /*
24283+ * cannot extract partial messages.
24284+ * check even when there is no message
24285+ *
24286+ * cannot extract more than one message per call. Bytes
24287+ * above sizeof(msg) are ignored.
24288+ */
24289+ if (size < sizeof(msg_buf)) {
24290+		PFM_DBG("message is too small (size=%zu, must be >=%zu)",
24291+ size,
24292+ sizeof(msg_buf));
24293+ return -EINVAL;
24294+ }
24295+
24296+ ret = __pfm_read(ctx, &msg_buf, non_block);
24297+ if (ret > 0) {
24298+ if (copy_to_user(buf, &msg_buf, sizeof(msg_buf)))
24299+ ret = -EFAULT;
24300+ }
24301+ PFM_DBG_ovfl("ret=%d", ret);
24302+ return ret;
24303+}
24304+
24305+static ssize_t pfm_write(struct file *file, const char __user *ubuf,
24306+ size_t size, loff_t *ppos)
24307+{
24308+ PFM_DBG("pfm_write called");
24309+ return -EINVAL;
24310+}
24311+
24312+static unsigned int pfm_poll(struct file *filp, poll_table *wait)
24313+{
24314+ struct pfm_context *ctx;
24315+ unsigned long flags;
24316+ unsigned int mask = 0;
24317+
24318+ PFM_DBG("pfm_file_ops");
24319+
24320+ if (filp->f_op != &pfm_file_ops) {
24321+ PFM_ERR("pfm_poll bad magic");
24322+ return 0;
24323+ }
24324+
24325+ ctx = filp->private_data;
24326+ if (ctx == NULL) {
24327+ PFM_ERR("pfm_poll no ctx");
24328+ return 0;
24329+ }
24330+
24331+ PFM_DBG("before poll_wait");
24332+
24333+ poll_wait(filp, &ctx->msgq_wait, wait);
24334+
24335+ /*
24336+ * pfm_msgq_is_empty() is non-atomic
24337+ *
24338+ * filp is protected by fget() at upper level
24339+ * context cannot be closed by another thread.
24340+ *
24341+ * There may be a race with a PMU interrupt adding
24342+	 * messages to the queue. But we are interested in whether
24343+	 * the queue is non-empty, so adding more messages should
24344+ * not really be a problem.
24345+ *
24346+ * There may be a race with another thread issuing
24347+	 * a read() and stealing messages from the queue, thus poll()
24348+	 * may return the wrong answer. This could potentially
24349+ * lead to a blocking read, because nothing is
24350+ * available in the queue
24351+ */
24352+ spin_lock_irqsave(&ctx->lock, flags);
24353+
24354+ if (!pfm_msgq_is_empty(ctx))
24355+ mask = POLLIN | POLLRDNORM;
24356+
24357+ spin_unlock_irqrestore(&ctx->lock, flags);
24358+
24359+ PFM_DBG("after poll_wait mask=0x%x", mask);
24360+
24361+ return mask;
24362+}
24363+
24364+static int pfm_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
24365+ unsigned long arg)
24366+{
24367+ PFM_DBG("pfm_ioctl called");
24368+ return -EINVAL;
24369+}
24370+
24371+/*
24372+ * interrupt cannot be masked when entering this function
24373+ */
24374+static inline int __pfm_fasync(int fd, struct file *filp,
24375+ struct pfm_context *ctx, int on)
24376+{
24377+ int ret;
24378+
24379+ PFM_DBG("in fd=%d on=%d async_q=%p",
24380+ fd,
24381+ on,
24382+ ctx->async_queue);
24383+
24384+ ret = fasync_helper(fd, filp, on, &ctx->async_queue);
24385+
24386+ PFM_DBG("out fd=%d on=%d async_q=%p ret=%d",
24387+ fd,
24388+ on,
24389+ ctx->async_queue, ret);
24390+
24391+ return ret;
24392+}
24393+
24394+static int pfm_fasync(int fd, struct file *filp, int on)
24395+{
24396+ struct pfm_context *ctx;
24397+ int ret;
24398+
24399+ PFM_DBG("pfm_file_ops");
24400+
24401+ ctx = filp->private_data;
24402+ if (ctx == NULL) {
24403+ PFM_ERR("pfm_fasync no ctx");
24404+ return -EBADF;
24405+ }
24406+
24407+ /*
24408+	 * we cannot mask interrupts during this call because it may
24409+	 * go to sleep if memory is not readily available.
24410+ *
24411+ * We are protected from the context disappearing by the
24412+ * get_fd()/put_fd() done in caller. Serialization of this function
24413+ * is ensured by caller.
24414+ */
24415+ ret = __pfm_fasync(fd, filp, ctx, on);
24416+
24417+ PFM_DBG("pfm_fasync called on fd=%d on=%d async_queue=%p ret=%d",
24418+ fd,
24419+ on,
24420+ ctx->async_queue, ret);
24421+
24422+ return ret;
24423+}
24424+
24425+#ifdef CONFIG_SMP
24426+static void __pfm_close_remote_cpu(void *info)
24427+{
24428+ struct pfm_context *ctx = info;
24429+ int can_release;
24430+
24431+ BUG_ON(ctx != __get_cpu_var(pmu_ctx));
24432+
24433+ /*
24434+ * we are in IPI interrupt handler which has always higher
24435+ * priority than PMU interrupt, therefore we do not need to
24436+ * mask interrupts. context locking is not needed because we
24437+ * are in close(), no more user references.
24438+ *
24439+ * can_release is ignored, release done on calling CPU
24440+ */
24441+ __pfm_unload_context(ctx, &can_release);
24442+
24443+ /*
24444+ * we cannot free context here because we are in_interrupt().
24445+ * we free on the calling CPU
24446+ */
24447+}
24448+
24449+static int pfm_close_remote_cpu(u32 cpu, struct pfm_context *ctx)
24450+{
24451+ BUG_ON(irqs_disabled());
24452+ return smp_call_function_single(cpu, __pfm_close_remote_cpu, ctx, 1);
24453+}
24454+#endif /* CONFIG_SMP */
24455+
24456+/*
24457+ * called either on explicit close() or from exit_files().
24458+ * Only the LAST user of the file gets to this point, i.e., it is
24459+ * called only ONCE.
24460+ *
24461+ * IMPORTANT: we get called ONLY when the refcnt on the file gets to zero
24462+ * (fput()), i.e., the last task to access the file. Nobody else can access the
24463+ * file at this point.
24464+ *
24465+ * When called from exit_files(), the VMA has been freed because exit_mm()
24466+ * is executed before exit_files().
24467+ *
24468+ * When called from exit_files(), the current task is not yet ZOMBIE but we
24469+ * flush the PMU state to the context.
24470+ */
24471+int __pfm_close(struct pfm_context *ctx, struct file *filp)
24472+{
24473+ unsigned long flags;
24474+ int state;
24475+ int can_free = 1, can_unload = 1;
24476+ int is_system, can_release = 0;
24477+ u32 cpu;
24478+
24479+ /*
24480+	 * no risk of ctx or filp disappearing so we can operate outside
24481+ * of spin_lock(). fasync_helper() runs with interrupts masked,
24482+ * thus there is no risk with the PMU interrupt handler
24483+ *
24484+ * In case of zombie, we will not have the async struct anymore
24485+ * thus kill_fasync() will not do anything
24486+ *
24487+ * fd is not used when removing the entry so we pass -1
24488+ */
24489+ if (filp->f_flags & FASYNC)
24490+ __pfm_fasync (-1, filp, ctx, 0);
24491+
24492+ spin_lock_irqsave(&ctx->lock, flags);
24493+
24494+ state = ctx->state;
24495+ is_system = ctx->flags.system;
24496+ cpu = ctx->cpu;
24497+
24498+ PFM_DBG("state=%d", state);
24499+
24500+ /*
24501+ * check if unload is needed
24502+ */
24503+ if (state == PFM_CTX_UNLOADED)
24504+ goto doit;
24505+
24506+#ifdef CONFIG_SMP
24507+ /*
24508+ * we need to release the resource on the ORIGINAL cpu.
24509+ * we need to release the context lock to avoid deadlocks
24510+ * on the original CPU, especially in the context switch
24511+ * routines. It is safe to unlock because we are in close(),
24512+ * in other words, there is no more access from user level.
24513+ * we can also unmask interrupts on this CPU because the
24514+ * context is running on the original CPU. Context will be
24515+ * unloaded and the session will be released on the original
24516+ * CPU. Upon return, the caller is guaranteed that the context
24517+ * is gone from original CPU.
24518+ */
24519+ if (is_system && cpu != smp_processor_id()) {
24520+ spin_unlock_irqrestore(&ctx->lock, flags);
24521+ pfm_close_remote_cpu(cpu, ctx);
24522+ can_release = 1;
24523+ goto free_it;
24524+ }
24525+
24526+ if (!is_system && ctx->task != current) {
24527+ /*
24528+ * switch context to zombie state
24529+ */
24530+ ctx->state = PFM_CTX_ZOMBIE;
24531+
24532+ PFM_DBG("zombie ctx for [%d]", ctx->task->pid);
24533+ /*
24534+ * must check if other thread is using block overflow
24535+ * notification mode. If so make sure it will not block
24536+ * because there will not be any pfm_restart() issued.
24537+ * When the thread notices the ZOMBIE state, it will clean
24538+ * up what is left of the context
24539+ */
24540+ if (state == PFM_CTX_MASKED && ctx->flags.block) {
24541+ /*
24542+ * force task to wake up from MASKED state
24543+ */
24544+ PFM_DBG("waking up [%d]", ctx->task->pid);
24545+
24546+ complete(&ctx->restart_complete);
24547+ }
24548+ /*
24549+		 * PMU session will be released by the monitored task when it notices
24550+ * ZOMBIE state as part of pfm_unload_context()
24551+ */
24552+ can_unload = can_free = 0;
24553+ }
24554+#endif
24555+ if (can_unload)
24556+ __pfm_unload_context(ctx, &can_release);
24557+doit:
24558+ spin_unlock_irqrestore(&ctx->lock, flags);
24559+
24560+#ifdef CONFIG_SMP
24561+free_it:
24562+#endif
24563+ if (can_release)
24564+ pfm_session_release(is_system, cpu);
24565+
24566+ if (can_free)
24567+ pfm_free_context(ctx);
24568+
24569+ return 0;
24570+}
24571+
24572+static int pfm_close(struct inode *inode, struct file *filp)
24573+{
24574+ struct pfm_context *ctx;
24575+
24576+ PFM_DBG("called filp=%p", filp);
24577+
24578+ ctx = filp->private_data;
24579+ if (ctx == NULL) {
24580+ PFM_ERR("no ctx");
24581+ return -EBADF;
24582+ }
24583+ return __pfm_close(ctx, filp);
24584+}
24585+
24586+static int pfm_no_open(struct inode *irrelevant, struct file *dontcare)
24587+{
24588+ PFM_DBG("pfm_file_ops");
24589+
24590+ return -ENXIO;
24591+}
24592+
24593+
24594+const struct file_operations pfm_file_ops = {
24595+ .llseek = no_llseek,
24596+ .read = pfm_read,
24597+ .write = pfm_write,
24598+ .poll = pfm_poll,
24599+ .ioctl = pfm_ioctl,
24600+ .open = pfm_no_open, /* special open to disallow open via /proc */
24601+ .fasync = pfm_fasync,
24602+ .release = pfm_close,
24603+ .mmap = pfm_mmap
24604+};
24605+
24606+static int pfmfs_get_sb(struct file_system_type *fs_type,
24607+ int flags, const char *dev_name,
24608+ void *data, struct vfsmount *mnt)
24609+{
24610+ return get_sb_pseudo(fs_type, "pfm:", NULL, PFMFS_MAGIC, mnt);
24611+}
24612+
24613+static struct file_system_type pfm_fs_type = {
24614+ .name = "pfmfs",
24615+ .get_sb = pfmfs_get_sb,
24616+ .kill_sb = kill_anon_super,
24617+};
24618+
24619+/*
24620+ * pfmfs should _never_ be mounted by userland - too much of security hassle,
24621+ * no real gain from having the whole whorehouse mounted. So we don't need
24622+ * any operations on the root directory. However, we need a non-trivial
24623+ * d_name - pfm: will go nicely and kill the special-casing in procfs.
24624+ */
24625+static struct vfsmount *pfmfs_mnt;
24626+
24627+int __init pfm_init_fs(void)
24628+{
24629+ int err = register_filesystem(&pfm_fs_type);
24630+ if (!err) {
24631+ pfmfs_mnt = kern_mount(&pfm_fs_type);
24632+ err = PTR_ERR(pfmfs_mnt);
24633+ if (IS_ERR(pfmfs_mnt))
24634+ unregister_filesystem(&pfm_fs_type);
24635+ else
24636+ err = 0;
24637+ }
24638+ return err;
24639+}
24640+
24641+int pfm_alloc_fd(struct file **cfile)
24642+{
24643+ int fd, ret = 0;
24644+ struct file *file = NULL;
24645+ struct inode * inode;
24646+ char name[32];
24647+ struct qstr this;
24648+
24649+ fd = get_unused_fd();
24650+ if (fd < 0)
24651+ return -ENFILE;
24652+
24653+ ret = -ENFILE;
24654+
24655+ file = get_empty_filp();
24656+ if (!file)
24657+ goto out;
24658+
24659+ /*
24660+ * allocate a new inode
24661+ */
24662+ inode = new_inode(pfmfs_mnt->mnt_sb);
24663+ if (!inode)
24664+ goto out;
24665+
24666+ PFM_DBG("new inode ino=%ld @%p", inode->i_ino, inode);
24667+
24668+ inode->i_sb = pfmfs_mnt->mnt_sb;
24669+ inode->i_mode = S_IFCHR|S_IRUGO;
24670+ inode->i_uid = current->fsuid;
24671+ inode->i_gid = current->fsgid;
24672+
24673+ sprintf(name, "[%lu]", inode->i_ino);
24674+ this.name = name;
24675+ this.hash = inode->i_ino;
24676+ this.len = strlen(name);
24677+
24678+ ret = -ENOMEM;
24679+
24680+ /*
24681+ * allocate a new dcache entry
24682+ */
24683+ file->f_dentry = d_alloc(pfmfs_mnt->mnt_sb->s_root, &this);
24684+ if (!file->f_dentry)
24685+ goto out;
24686+
24687+ file->f_dentry->d_op = &pfmfs_dentry_operations;
24688+
24689+ d_add(file->f_dentry, inode);
24690+ file->f_vfsmnt = mntget(pfmfs_mnt);
24691+ file->f_mapping = inode->i_mapping;
24692+
24693+ file->f_op = &pfm_file_ops;
24694+ file->f_mode = FMODE_READ;
24695+ file->f_flags = O_RDONLY;
24696+ file->f_pos = 0;
24697+
24698+ *cfile = file;
24699+
24700+ return fd;
24701+out:
24702+ if (file)
24703+ put_filp(file);
24704+ put_unused_fd(fd);
24705+ return ret;
24706+}
24707--- /dev/null
24708+++ b/perfmon/perfmon_fmt.c
24709@@ -0,0 +1,219 @@
24710+/*
24711+ * perfmon_fmt.c: perfmon2 sampling buffer format management
24712+ *
24713+ * This file implements the perfmon2 interface which
24714+ * provides access to the hardware performance counters
24715+ * of the host processor.
24716+ *
24717+ * The initial version of perfmon.c was written by
24718+ * Ganesh Venkitachalam, IBM Corp.
24719+ *
24720+ * Then it was modified for perfmon-1.x by Stephane Eranian and
24721+ * David Mosberger, Hewlett Packard Co.
24722+ *
24723+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
24724+ * by Stephane Eranian, Hewlett Packard Co.
24725+ *
24726+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
24727+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
24728+ * David Mosberger-Tang <davidm@hpl.hp.com>
24729+ *
24730+ * More information about perfmon available at:
24731+ * http://perfmon2.sf.net
24732+ *
24733+ * This program is free software; you can redistribute it and/or
24734+ * modify it under the terms of version 2 of the GNU General Public
24735+ * License as published by the Free Software Foundation.
24736+ *
24737+ * This program is distributed in the hope that it will be useful,
24738+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
24739+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
24740+ * General Public License for more details.
24741+ *
24742+ * You should have received a copy of the GNU General Public License
24743+ * along with this program; if not, write to the Free Software
24744+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
24745+ * 02111-1307 USA
24746+ */
24747+#include <linux/module.h>
24748+#include <linux/perfmon_kern.h>
24749+#include "perfmon_priv.h"
24750+
24751+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pfm_smpl_fmt_lock);
24752+static LIST_HEAD(pfm_smpl_fmt_list);
24753+
24754+static inline int fmt_is_mod(struct pfm_smpl_fmt *f)
24755+{
24756+ return !(f->fmt_flags & PFM_FMTFL_IS_BUILTIN);
24757+}
24758+
24759+static struct pfm_smpl_fmt *pfm_find_fmt(char *name)
24760+{
24761+ struct pfm_smpl_fmt *entry;
24762+
24763+ list_for_each_entry(entry, &pfm_smpl_fmt_list, fmt_list) {
24764+ if (!strcmp(entry->fmt_name, name))
24765+ return entry;
24766+ }
24767+ return NULL;
24768+}
24769+/*
24770+ * find a buffer format based on its name
24771+ */
24772+struct pfm_smpl_fmt *pfm_smpl_fmt_get(char *name)
24773+{
24774+ struct pfm_smpl_fmt *fmt;
24775+
24776+ spin_lock(&pfm_smpl_fmt_lock);
24777+
24778+ fmt = pfm_find_fmt(name);
24779+
24780+ /*
24781+ * increase module refcount
24782+ */
24783+ if (fmt && fmt_is_mod(fmt) && !try_module_get(fmt->owner))
24784+ fmt = NULL;
24785+
24786+ spin_unlock(&pfm_smpl_fmt_lock);
24787+
24788+ return fmt;
24789+}
24790+
24791+void pfm_smpl_fmt_put(struct pfm_smpl_fmt *fmt)
24792+{
24793+ if (fmt == NULL || !fmt_is_mod(fmt))
24794+ return;
24795+ BUG_ON(fmt->owner == NULL);
24796+
24797+ spin_lock(&pfm_smpl_fmt_lock);
24798+ module_put(fmt->owner);
24799+ spin_unlock(&pfm_smpl_fmt_lock);
24800+}
24801+
24802+int pfm_fmt_register(struct pfm_smpl_fmt *fmt)
24803+{
24804+ int ret = 0;
24805+
24806+ if (perfmon_disabled) {
24807+ PFM_INFO("perfmon disabled, cannot add sampling format");
24808+ return -ENOSYS;
24809+ }
24810+
24811+ /* some sanity checks */
24812+ if (fmt == NULL) {
24813+ PFM_INFO("perfmon: NULL format for register");
24814+ return -EINVAL;
24815+ }
24816+
24817+ if (fmt->fmt_name == NULL) {
24818+ PFM_INFO("perfmon: format has no name");
24819+ return -EINVAL;
24820+ }
24821+
24822+ if (fmt->fmt_qdepth > PFM_MSGS_COUNT) {
24823+ PFM_INFO("perfmon: format %s requires %u msg queue depth (max %d)",
24824+ fmt->fmt_name,
24825+ fmt->fmt_qdepth,
24826+ PFM_MSGS_COUNT);
24827+ return -EINVAL;
24828+ }
24829+
24830+ /*
24831+	 * a format missing the .owner = THIS_MODULE initialization is
24832+	 * only acceptable when it is built in, not compiled as a module
24833+ */
24834+ if (fmt->owner == NULL && fmt_is_mod(fmt)) {
24835+ PFM_INFO("format %s has no module owner", fmt->fmt_name);
24836+ return -EINVAL;
24837+ }
24838+ /*
24839+ * we need at least a handler
24840+ */
24841+ if (fmt->fmt_handler == NULL) {
24842+ PFM_INFO("format %s has no handler", fmt->fmt_name);
24843+ return -EINVAL;
24844+ }
24845+
24846+ /*
24847+ * format argument size cannot be bigger than PAGE_SIZE
24848+ */
24849+ if (fmt->fmt_arg_size > PAGE_SIZE) {
24850+ PFM_INFO("format %s arguments too big", fmt->fmt_name);
24851+ return -EINVAL;
24852+ }
24853+
24854+ spin_lock(&pfm_smpl_fmt_lock);
24855+
24856+ /*
24857+ * because of sysfs, we cannot have two formats with the same name
24858+ */
24859+ if (pfm_find_fmt(fmt->fmt_name)) {
24860+ PFM_INFO("format %s already registered", fmt->fmt_name);
24861+ ret = -EBUSY;
24862+ goto out;
24863+ }
24864+
24865+ ret = pfm_sysfs_add_fmt(fmt);
24866+ if (ret) {
24867+ PFM_INFO("sysfs cannot add format entry for %s", fmt->fmt_name);
24868+ goto out;
24869+ }
24870+
24871+ list_add(&fmt->fmt_list, &pfm_smpl_fmt_list);
24872+
24873+ PFM_INFO("added sampling format %s", fmt->fmt_name);
24874+out:
24875+ spin_unlock(&pfm_smpl_fmt_lock);
24876+
24877+ return ret;
24878+}
24879+EXPORT_SYMBOL(pfm_fmt_register);
24880+
24881+int pfm_fmt_unregister(struct pfm_smpl_fmt *fmt)
24882+{
24883+ struct pfm_smpl_fmt *fmt2;
24884+ int ret = 0;
24885+
24886+ if (!fmt || !fmt->fmt_name) {
24887+ PFM_DBG("invalid fmt");
24888+ return -EINVAL;
24889+ }
24890+
24891+ spin_lock(&pfm_smpl_fmt_lock);
24892+
24893+ fmt2 = pfm_find_fmt(fmt->fmt_name);
24894+	if (!fmt2) {
24895+ PFM_INFO("unregister failed, format not registered");
24896+ ret = -EINVAL;
24897+ goto out;
24898+ }
24899+ list_del_init(&fmt->fmt_list);
24900+
24901+ pfm_sysfs_remove_fmt(fmt);
24902+
24903+ PFM_INFO("removed sampling format: %s", fmt->fmt_name);
24904+
24905+out:
24906+ spin_unlock(&pfm_smpl_fmt_lock);
24907+ return ret;
24908+
24909+}
24910+EXPORT_SYMBOL(pfm_fmt_unregister);
24911+
24912+/*
24913+ * we defer adding the builtin formats to /sys/kernel/perfmon/formats
24914+ * until after the pfm sysfs subsystem is initialized. This function
24915+ * is called from pfm_init_sysfs()
24916+ */
24917+void __init pfm_sysfs_builtin_fmt_add(void)
24918+{
24919+ struct pfm_smpl_fmt *entry;
24920+
24921+ /*
24922+ * locking not needed, kernel not fully booted
24923+ * when called
24924+ */
24925+ list_for_each_entry(entry, &pfm_smpl_fmt_list, fmt_list) {
24926+ pfm_sysfs_add_fmt(entry);
24927+ }
24928+}
24929--- /dev/null
24930+++ b/perfmon/perfmon_hotplug.c
24931@@ -0,0 +1,151 @@
24932+/*
24933+ * perfmon_hotplug.c: handling of CPU hotplug
24934+ *
24935+ * The initial version of perfmon.c was written by
24936+ * Ganesh Venkitachalam, IBM Corp.
24937+ *
24938+ * Then it was modified for perfmon-1.x by Stephane Eranian and
24939+ * David Mosberger, Hewlett Packard Co.
24940+ *
24941+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
24942+ * by Stephane Eranian, Hewlett Packard Co.
24943+ *
24944+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
24945+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
24946+ * David Mosberger-Tang <davidm@hpl.hp.com>
24947+ *
24948+ * More information about perfmon available at:
24949+ * http://perfmon2.sf.net
24950+ *
24951+ * This program is free software; you can redistribute it and/or
24952+ * modify it under the terms of version 2 of the GNU General Public
24953+ * License as published by the Free Software Foundation.
24954+ *
24955+ * This program is distributed in the hope that it will be useful,
24956+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
24957+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
24958+ * General Public License for more details.
24959+ *
24960+ * You should have received a copy of the GNU General Public License
24961+ * along with this program; if not, write to the Free Software
24962+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
24963+ * 02111-1307 USA
24964+ */
24965+#include <linux/kernel.h>
24966+#include <linux/perfmon_kern.h>
24967+#include <linux/cpu.h>
24968+#include "perfmon_priv.h"
24969+
24970+#ifndef CONFIG_HOTPLUG_CPU
24971+void pfm_cpu_disable(void)
24972+{}
24973+
24974+int __init pfm_init_hotplug(void)
24975+{
24976+ return 0;
24977+}
24978+#else /* CONFIG_HOTPLUG_CPU */
24979+/*
24980+ * CPU hotplug event notification callback
24981+ *
24982+ * We use the callback to manage the per-CPU debugfs entries.
24983+ * Note that the actual shutdown of monitoring on the CPU
24984+ * is done in pfm_cpu_disable(), see comments there for more
24985+ * information.
24986+ */
24987+static int pfm_cpu_notify(struct notifier_block *nfb,
24988+ unsigned long action, void *hcpu)
24989+{
24990+ unsigned int cpu = (unsigned long)hcpu;
24991+ int ret = NOTIFY_OK;
24992+
24993+ pfm_pmu_conf_get(0);
24994+
24995+ switch (action) {
24996+ case CPU_ONLINE:
24997+ pfm_debugfs_add_cpu(cpu);
24998+ PFM_INFO("CPU%d is online", cpu);
24999+ break;
25000+ case CPU_UP_PREPARE:
25001+ PFM_INFO("CPU%d prepare online", cpu);
25002+ break;
25003+ case CPU_UP_CANCELED:
25004+ pfm_debugfs_del_cpu(cpu);
25005+ PFM_INFO("CPU%d is up canceled", cpu);
25006+ break;
25007+ case CPU_DOWN_PREPARE:
25008+ PFM_INFO("CPU%d prepare offline", cpu);
25009+ break;
25010+ case CPU_DOWN_FAILED:
25011+ PFM_INFO("CPU%d is down failed", cpu);
25012+ break;
25013+ case CPU_DEAD:
25014+ pfm_debugfs_del_cpu(cpu);
25015+ PFM_INFO("CPU%d is offline", cpu);
25016+ break;
25017+ }
25018+ pfm_pmu_conf_put();
25019+ return ret;
25020+}
25021+
25022+/*
25023+ * called from cpu_disable() to detach the perfmon context
25024+ * from the CPU going down.
25025+ *
25026+ * We cannot use the cpu hotplug notifier because we MUST run
25027+ * on the CPU that is going down to save the PMU state
25028+ */
25029+void pfm_cpu_disable(void)
25030+{
25031+ struct pfm_context *ctx;
25032+ unsigned long flags;
25033+ int is_system, release_info = 0;
25034+ u32 cpu;
25035+ int r;
25036+
25037+ ctx = __get_cpu_var(pmu_ctx);
25038+ if (ctx == NULL)
25039+ return;
25040+
25041+ is_system = ctx->flags.system;
25042+ cpu = ctx->cpu;
25043+
25044+ /*
25045+ * context is LOADED or MASKED
25046+ *
25047+	 * we unload from the CPU. That stops monitoring and does
25048+	 * all the bookkeeping of saving values and updating duration
25049+ */
25050+ spin_lock_irqsave(&ctx->lock, flags);
25051+ if (is_system)
25052+ __pfm_unload_context(ctx, &release_info);
25053+ spin_unlock_irqrestore(&ctx->lock, flags);
25054+
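+	/*
+	 * release_info is filled in by __pfm_unload_context(); as used below,
+	 * bit 1 (0x2) means the switch timeout timer must be cancelled and
+	 * bit 0 (0x1) means the session must be released
+	 */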
25055+ /*
25056+ * cancel timer
25057+ */
25058+ if (release_info & 0x2) {
25059+ r = hrtimer_cancel(&__get_cpu_var(pfm_hrtimer));
25060+ PFM_DBG("timeout cancel=%d", r);
25061+ }
25062+
25063+ if (release_info & 0x1)
25064+ pfm_session_release(is_system, cpu);
25065+}
25066+
25067+static struct notifier_block pfm_cpu_notifier = {
25068+ .notifier_call = pfm_cpu_notify
25069+};
25070+
25071+int __init pfm_init_hotplug(void)
25072+{
25073+ int ret = 0;
25074+ /*
25075+ * register CPU hotplug event notifier
25076+ */
25077+ ret = register_cpu_notifier(&pfm_cpu_notifier);
25078+ if (!ret)
25079+ PFM_LOG("CPU hotplug support enabled");
25080+ return ret;
25081+}
25082+#endif /* CONFIG_HOTPLUG_CPU */
25083--- /dev/null
25084+++ b/perfmon/perfmon_init.c
25085@@ -0,0 +1,131 @@
25086+/*
25087+ * perfmon.c: perfmon2 global initialization functions
25088+ *
25089+ * This file implements the perfmon2 interface which
25090+ * provides access to the hardware performance counters
25091+ * of the host processor.
25092+ *
25093+ *
25094+ * The initial version of perfmon.c was written by
25095+ * Ganesh Venkitachalam, IBM Corp.
25096+ *
25097+ * Then it was modified for perfmon-1.x by Stephane Eranian and
25098+ * David Mosberger, Hewlett Packard Co.
25099+ *
25100+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
25101+ * by Stephane Eranian, Hewlett Packard Co.
25102+ *
25103+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
25104+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
25105+ * David Mosberger-Tang <davidm@hpl.hp.com>
25106+ *
25107+ * More information about perfmon available at:
25108+ * http://perfmon2.sf.net
25109+ *
25110+ * This program is free software; you can redistribute it and/or
25111+ * modify it under the terms of version 2 of the GNU General Public
25112+ * License as published by the Free Software Foundation.
25113+ *
25114+ * This program is distributed in the hope that it will be useful,
25115+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
25116+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
25117+ * General Public License for more details.
25118+ *
25119+ * You should have received a copy of the GNU General Public License
25120+ * along with this program; if not, write to the Free Software
25121+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
25122+ * 02111-1307 USA
25123+ */
25124+#include <linux/kernel.h>
25125+#include <linux/perfmon_kern.h>
25126+#include "perfmon_priv.h"
25127+
25128+/*
25129+ * external variables
25130+ */
25131+DEFINE_PER_CPU(struct task_struct *, pmu_owner);
25132+DEFINE_PER_CPU(struct pfm_context *, pmu_ctx);
25133+DEFINE_PER_CPU(u64, pmu_activation_number);
25134+DEFINE_PER_CPU(struct pfm_stats, pfm_stats);
25135+DEFINE_PER_CPU(struct hrtimer, pfm_hrtimer);
25136+
25137+
25138+int perfmon_disabled; /* >0 if perfmon is disabled */
25139+
25140+/*
25141+ * called from cpu_init() and pfm_pmu_register()
25142+ */
25143+void __pfm_init_percpu(void *dummy)
25144+{
25145+ struct hrtimer *h;
25146+
25147+ h = &__get_cpu_var(pfm_hrtimer);
25148+
25149+ pfm_arch_init_percpu();
25150+
25151+ /*
25152+ * initialize per-cpu high res timer
25153+ */
25154+ hrtimer_init(h, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
25155+#ifdef CONFIG_HIGH_RES_TIMERS
25156+ /*
25157+ * avoid potential deadlock on the runqueue lock
25158+ * during context switch when multiplexing. Situation
25159+ * arises on architectures which run switch_to() with
25160+ * the runqueue lock held, e.g., x86. On others, e.g.,
25161+ * IA-64, the problem does not exist.
25162+	 * Set the callback mode to HRTIMER_CB_IRQSAFE_UNLOCKED
25163+	 * so that the callback routine is only called in hardirq
25164+	 * context, not in softirq; thus the context switch will not
25165+	 * end up trying to wake up softirqd
25166+ */
25167+ h->cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
25168+#endif
25169+ h->function = pfm_handle_switch_timeout;
25170+}
25171+
25172+/*
25173+ * global initialization routine, executed only once
25174+ */
25175+int __init pfm_init(void)
25176+{
25177+ PFM_LOG("version %u.%u", PFM_VERSION_MAJ, PFM_VERSION_MIN);
25178+
25179+ if (pfm_init_ctx())
25180+ goto error_disable;
25181+
25182+
25183+ if (pfm_init_sets())
25184+ goto error_disable;
25185+
25186+ if (pfm_init_fs())
25187+ goto error_disable;
25188+
25189+ if (pfm_init_sysfs())
25190+ goto error_disable;
25191+
25192+ /* not critical, so no error checking */
25193+ pfm_init_debugfs();
25194+
25195+ /*
25196+ * one time, arch-specific global initialization
25197+ */
25198+ if (pfm_arch_init())
25199+ goto error_disable;
25200+
25201+ if (pfm_init_hotplug())
25202+ goto error_disable;
25203+ return 0;
25204+
25205+error_disable:
25206+ PFM_ERR("perfmon is disabled due to initialization error");
25207+ perfmon_disabled = 1;
25208+ return -1;
25209+}
25210+
25211+/*
25212+ * must use subsys_initcall() to ensure that the perfmon2 core
25213+ * is initialized before any PMU description module when they are
25214+ * compiled in.
25215+ */
25216+subsys_initcall(pfm_init);
25217--- /dev/null
25218+++ b/perfmon/perfmon_intr.c
25219@@ -0,0 +1,648 @@
25220+/*
25221+ * perfmon_intr.c: perfmon2 interrupt handling
25222+ *
25223+ * This file implements the perfmon2 interface which
25224+ * provides access to the hardware performance counters
25225+ * of the host processor.
25226+ *
25227+ * The initial version of perfmon.c was written by
25228+ * Ganesh Venkitachalam, IBM Corp.
25229+ *
25230+ * Then it was modified for perfmon-1.x by Stephane Eranian and
25231+ * David Mosberger, Hewlett Packard Co.
25232+ *
25233+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
25234+ * by Stephane Eranian, Hewlett Packard Co.
25235+ *
25236+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
25237+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
25238+ * David Mosberger-Tang <davidm@hpl.hp.com>
25239+ *
25240+ * More information about perfmon available at:
25241+ * http://perfmon2.sf.net
25242+ *
25243+ * This program is free software; you can redistribute it and/or
25244+ * modify it under the terms of version 2 of the GNU General Public
25245+ * License as published by the Free Software Foundation.
25246+ *
25247+ * This program is distributed in the hope that it will be useful,
25248+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
25249+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
25250+ * General Public License for more details.
25251+ *
25252+ * You should have received a copy of the GNU General Public License
25253+ * along with this program; if not, write to the Free Software
25254+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
25255+ * 02111-1307 USA
25256+ */
25257+#include <linux/kernel.h>
25258+#include <linux/module.h>
25259+#include <linux/perfmon_kern.h>
25260+#include "perfmon_priv.h"
25261+
25262+/**
25263+ * pfm_intr_process_64bit_ovfls - handle 64-bit counter emulation
25264+ * @ctx: context to operate on
25265+ * @set: set to operate on
25266+ *
25267+ * The function returns the number of 64-bit overflows detected.
25268+ *
25269+ * 64-bit software pmds are updated for overflowed pmd registers
25270+ * the set->reset_pmds is updated to the list of pmds to reset
25271+ *
25272+ * In any case, set->npend_ovfls is cleared
25273+ */
25274+static u16 pfm_intr_process_64bit_ovfls(struct pfm_context *ctx,
25275+ struct pfm_event_set *set,
25276+ u32 *ovfl_ctrl)
25277+{
25278+ u16 i, num_ovfls, max_pmd, max_intr;
25279+ u16 num_64b_ovfls, has_ovfl_sw, must_switch;
25280+ u64 ovfl_thres, old_val, new_val, ovfl_mask;
25281+
25282+ num_64b_ovfls = must_switch = 0;
25283+
25284+ ovfl_mask = pfm_pmu_conf->ovfl_mask;
25285+ max_pmd = ctx->regs.max_pmd;
25286+ max_intr = ctx->regs.max_intr_pmd;
25287+
25288+ num_ovfls = set->npend_ovfls;
25289+ has_ovfl_sw = set->flags & PFM_SETFL_OVFL_SWITCH;
25290+
25291+ bitmap_zero(cast_ulp(set->reset_pmds), max_pmd);
25292+
25293+ for (i = ctx->regs.first_intr_pmd; num_ovfls; i++) {
25294+ /*
25295+ * skip pmd which did not overflow
25296+ */
25297+ if (!test_bit(i, cast_ulp(set->povfl_pmds)))
25298+ continue;
25299+
25300+ num_ovfls--;
25301+
25302+ /*
25303+ * Update software value for counters ONLY
25304+ *
25305+ * Note that the pmd is not necessarily 0 at this point as
25306+ * qualified events may have happened before the PMU was
25307+ * frozen. The residual count is not taken into consideration
25308+ * here but will be with any read of the pmd
25309+ */
25310+ ovfl_thres = set->pmds[i].ovflsw_thres;
25311+
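+		/*
+		 * 64-bit emulation: each hardware wrap adds ovfl_mask + 1 to
+		 * the software value. Illustrative example: for a 47-bit
+		 * counter, ovfl_mask = (1ULL << 47) - 1, so the value grows
+		 * by 2^47 per overflow
+		 */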
25312+ if (likely(test_bit(i, cast_ulp(ctx->regs.cnt_pmds)))) {
25313+ old_val = new_val = set->pmds[i].value;
25314+ new_val += 1 + ovfl_mask;
25315+ set->pmds[i].value = new_val;
25316+ } else {
25317+ /*
25318+ * for non counters which interrupt, e.g., AMD IBS,
25319+ * we consider this equivalent to a 64-bit counter
25320+ * overflow.
25321+ */
25322+ old_val = 1; new_val = 0;
25323+ }
25324+
25325+ /*
25326+ * check for 64-bit overflow condition
25327+ */
25328+ if (likely(old_val > new_val)) {
25329+ num_64b_ovfls++;
25330+ if (has_ovfl_sw && ovfl_thres > 0) {
25331+ if (ovfl_thres == 1)
25332+ must_switch = 1;
25333+ set->pmds[i].ovflsw_thres = ovfl_thres - 1;
25334+ }
25335+
25336+ /*
25337+ * what to reset because of this overflow
25338+ * - the overflowed register
25339+ * - its reset_smpls
25340+ */
25341+ __set_bit(i, cast_ulp(set->reset_pmds));
25342+
25343+ bitmap_or(cast_ulp(set->reset_pmds),
25344+ cast_ulp(set->reset_pmds),
25345+ cast_ulp(set->pmds[i].reset_pmds),
25346+ max_pmd);
25347+ } else {
25348+ /*
25349+ * only keep track of 64-bit overflows or
25350+ * assimilated
25351+ */
25352+ __clear_bit(i, cast_ulp(set->povfl_pmds));
25353+
25354+ /*
25355+ * on some PMU, it may be necessary to re-arm the PMD
25356+ */
25357+ pfm_arch_ovfl_reset_pmd(ctx, i);
25358+ }
25359+
25360+ PFM_DBG_ovfl("ovfl=%s pmd%u new=0x%llx old=0x%llx "
25361+ "hw_pmd=0x%llx o_pmds=0x%llx must_switch=%u "
25362+ "o_thres=%llu o_thres_ref=%llu",
25363+ old_val > new_val ? "64-bit" : "HW",
25364+ i,
25365+ (unsigned long long)new_val,
25366+ (unsigned long long)old_val,
25367+ (unsigned long long)pfm_read_pmd(ctx, i),
25368+ (unsigned long long)set->povfl_pmds[0],
25369+ must_switch,
25370+ (unsigned long long)set->pmds[i].ovflsw_thres,
25371+ (unsigned long long)set->pmds[i].ovflsw_ref_thres);
25372+ }
25373+ /*
25374+ * update public bitmask of 64-bit overflowed pmds
25375+ */
25376+ if (num_64b_ovfls)
25377+ bitmap_copy(cast_ulp(set->ovfl_pmds), cast_ulp(set->povfl_pmds),
25378+ max_intr);
25379+
25380+ if (must_switch)
25381+ *ovfl_ctrl |= PFM_OVFL_CTRL_SWITCH;
25382+
25383+ /*
25384+ * mark the overflows as consumed
25385+ */
25386+ set->npend_ovfls = 0;
25387+ bitmap_zero(cast_ulp(set->povfl_pmds), max_intr);
25388+
25389+ return num_64b_ovfls;
25390+}
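The emulation above adds a fixed 2^counter_width (i.e., 1 + ovfl_mask) to the 64-bit software value on every hardware interrupt and treats a wrap of that 64-bit value (new below old) as the 64-bit overflow. A minimal standalone sketch of the same arithmetic, assuming a hypothetical 47-bit counter width (illustration only):

#include <stdint.h>
#include <stdio.h>

/* account one hardware overflow of a counter_width-bit PMD into the
 * 64-bit software value; returns 1 when the 64-bit value itself wraps */
static int account_hw_ovfl(uint64_t *sw_value, unsigned int counter_width)
{
	uint64_t ovfl_mask = (1ULL << counter_width) - 1;
	uint64_t old_val = *sw_value;
	uint64_t new_val = old_val + 1 + ovfl_mask;	/* += 2^counter_width */

	*sw_value = new_val;
	return old_val > new_val;			/* wrapped past 2^64 */
}

int main(void)
{
	uint64_t v = ~0ULL - 5;				/* just below the 64-bit limit */

	printf("64-bit ovfl=%d new value=0x%llx\n",
	       account_hw_ovfl(&v, 47), (unsigned long long)v);
	return 0;
}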
25391+
25392+/**
25393+ * pfm_intr_get_smpl_pmds_values - copy 64-bit pmd values for sampling format
25394+ * @ctx: context to work on
25395+ * @set: current event set
25396+ * @arg: overflow arg to be passed to format
25397+ * @smpl_pmds: list of PMDs of interest for the overflowed register
25398+ *
25399+ * build an array of 64-bit PMD values based on smpl_pmds. Values are
25400+ * stored in increasing order of the PMD indexes
25401+ */
25402+static void pfm_intr_get_smpl_pmds_values(struct pfm_context *ctx,
25403+ struct pfm_event_set *set,
25404+ struct pfm_ovfl_arg *arg,
25405+ u64 *smpl_pmds)
25406+{
25407+ u16 j, k, max_pmd;
25408+ u64 new_val, ovfl_mask;
25409+ u64 *cnt_pmds;
25410+
25411+ cnt_pmds = ctx->regs.cnt_pmds;
25412+ max_pmd = ctx->regs.max_pmd;
25413+ ovfl_mask = pfm_pmu_conf->ovfl_mask;
25414+
25415+ for (j = k = 0; j < max_pmd; j++) {
25416+
25417+ if (!test_bit(j, cast_ulp(smpl_pmds)))
25418+ continue;
25419+
25420+ new_val = pfm_read_pmd(ctx, j);
25421+
25422+ /* for counters, build 64-bit value */
25423+ if (test_bit(j, cast_ulp(cnt_pmds)))
25424+ new_val = (set->pmds[j].value & ~ovfl_mask)
25425+ | (new_val & ovfl_mask);
25426+
25427+ arg->smpl_pmds_values[k++] = new_val;
25428+
25429+ PFM_DBG_ovfl("s_pmd_val[%u]=pmd%u=0x%llx", k, j,
25430+ (unsigned long long)new_val);
25431+ }
25432+ arg->num_smpl_pmds = k;
25433+}
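For counter registers, the loop above rebuilds the full 64-bit value by taking the upper bits from the software-maintained value and the low counter_width bits from the live hardware register. The same merge in isolation (illustrative sketch):

#include <stdint.h>

/* upper bits come from the 64-bit software value, lower counter_width
 * bits come from the current hardware register */
static uint64_t merge_pmd(uint64_t sw_value, uint64_t hw_value,
			  unsigned int counter_width)
{
	uint64_t ovfl_mask = (1ULL << counter_width) - 1;

	return (sw_value & ~ovfl_mask) | (hw_value & ovfl_mask);
}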
25434+
25435+/**
25436+ * pfm_intr_process_smpl_fmt -- handle sampling format callback
25437+ * @ctx: context to work on
25438+ * @set: current event set
25439+ * @ip: interrupted instruction pointer
25440+ * @now: timestamp
25441+ * @num_ovfls: number of 64-bit overflows
25442+ * @ovfl_ctrl: set of controls for interrupt handler tail processing
25443+ * @regs: register state
25444+ *
25445+ * Prepare argument (ovfl_arg) to be passed to sampling format callback, then
25446+ * invoke the callback (fmt_handler)
25447+ */
25448+static int pfm_intr_process_smpl_fmt(struct pfm_context *ctx,
25449+ struct pfm_event_set *set,
25450+ unsigned long ip,
25451+ u64 now,
25452+ u64 num_ovfls,
25453+ u32 *ovfl_ctrl,
25454+ struct pt_regs *regs)
25455+{
25456+ struct pfm_ovfl_arg *ovfl_arg;
25457+ u64 start_cycles, end_cycles;
25458+ u16 i, max_pmd;
25459+ int ret = 0;
25460+
25461+ ovfl_arg = &ctx->ovfl_arg;
25462+
25463+ ovfl_arg->active_set = set->id;
25464+ max_pmd = ctx->regs.max_pmd;
25465+
25466+ /*
25467+ * first_intr_pmd: first PMD which can generate PMU interrupts
25468+ */
25469+ for (i = ctx->regs.first_intr_pmd; num_ovfls; i++) {
25470+ /*
25471+ * skip pmd which did not have 64-bit overflows
25472+ */
25473+ if (!test_bit(i, cast_ulp(set->ovfl_pmds)))
25474+ continue;
25475+
25476+ num_ovfls--;
25477+
25478+ /*
25479+ * prepare argument to fmt_handler
25480+ */
25481+ ovfl_arg->ovfl_pmd = i;
25482+ ovfl_arg->ovfl_ctrl = 0;
25483+
25484+ ovfl_arg->pmd_last_reset = set->pmds[i].lval;
25485+ ovfl_arg->pmd_eventid = set->pmds[i].eventid;
25486+ ovfl_arg->num_smpl_pmds = 0;
25487+
25488+ /*
25489+ * copy values of pmds of interest, if any
25490+ * Sampling format may use them
25491+ * We do not initialize the unused smpl_pmds_values
25492+ */
25493+ if (!bitmap_empty(cast_ulp(set->pmds[i].smpl_pmds), max_pmd))
25494+ pfm_intr_get_smpl_pmds_values(ctx, set, ovfl_arg,
25495+ set->pmds[i].smpl_pmds);
25496+
25497+ pfm_stats_inc(fmt_handler_calls);
25498+
25499+ /*
25500+ * call format record (handler) routine
25501+ */
25502+ start_cycles = sched_clock();
25503+ ret = (*ctx->smpl_fmt->fmt_handler)(ctx, ip, now, regs);
25504+ end_cycles = sched_clock();
25505+
25506+ /*
25507+ * The reset_pmds mask is constructed automatically
25508+ * on overflow. When the actual reset takes place
25509+ * depends on the masking, switch and notification
25510+ * status. It may be deferred until pfm_restart().
25511+ */
25512+ *ovfl_ctrl |= ovfl_arg->ovfl_ctrl;
25513+
25514+ pfm_stats_add(fmt_handler_ns, end_cycles - start_cycles);
25515+ }
25516+ /*
25517+ * when the format cannot handle the rest of the overflow, we abort
25518+ */
25519+ if (ret)
25520+ PFM_DBG_ovfl("handler aborted at PMD%u ret=%d", i, ret);
25521+ return ret;
25522+}
25523+/**
25524+ * pfm_overflow_handler - main overflow processing routine.
25525+ * @ctx: context to work on (always current context)
25526+ * @set: current event set
25527+ * @ip: interrupt instruction pointer
25528+ * @regs: machine state
25529+ *
25530+ * set->npend_ovfls is 0 when returning from this function even though
25531+ * set->ovfl_pmds[] may have bits set. After this function returns,
25532+ * set->npend_ovfls must never be used to determine if there was a pending overflow.
25533+ */
25534+static void pfm_overflow_handler(struct pfm_context *ctx,
25535+ struct pfm_event_set *set,
25536+ unsigned long ip,
25537+ struct pt_regs *regs)
25538+{
25539+ struct pfm_event_set *set_orig;
25540+ u64 now;
25541+ u32 ovfl_ctrl;
25542+ u16 max_intr, max_pmd;
25543+ u16 num_ovfls;
25544+ int ret, has_notify;
25545+
25546+ /*
25547+ * take timestamp
25548+ */
25549+ now = sched_clock();
25550+
25551+ max_pmd = ctx->regs.max_pmd;
25552+ max_intr = ctx->regs.max_intr_pmd;
25553+
25554+ set_orig = set;
25555+ ovfl_ctrl = 0;
25556+
25557+ /*
25558+ * skip ZOMBIE case
25559+ */
25560+ if (unlikely(ctx->state == PFM_CTX_ZOMBIE))
25561+ goto stop_monitoring;
25562+
25563+ PFM_DBG_ovfl("intr_pmds=0x%llx npend=%u ip=%p, blocking=%d "
25564+ "u_pmds=0x%llx use_fmt=%u",
25565+ (unsigned long long)set->povfl_pmds[0],
25566+ set->npend_ovfls,
25567+ (void *)ip,
25568+ ctx->flags.block,
25569+ (unsigned long long)set->used_pmds[0],
25570+ !!ctx->smpl_fmt);
25571+
25572+ /*
25573+ * return number of 64-bit overflows
25574+ */
25575+ num_ovfls = pfm_intr_process_64bit_ovfls(ctx, set, &ovfl_ctrl);
25576+
25577+ /*
25578+ * there were no 64-bit overflows
25579+ * nothing else to do
25580+ */
25581+ if (!num_ovfls)
25582+ return;
25583+
25584+ /*
25585+ * tmp_ovfl_notify = ovfl_pmds & ovfl_notify
25586+ * with:
25587+ * - ovfl_pmds: last 64-bit overflowed pmds
25588+ * - ovfl_notify: notify on overflow registers
25589+ */
25590+ bitmap_and(cast_ulp(ctx->tmp_ovfl_notify),
25591+ cast_ulp(set->ovfl_pmds),
25592+ cast_ulp(set->ovfl_notify),
25593+ max_intr);
25594+
25595+ has_notify = !bitmap_empty(cast_ulp(ctx->tmp_ovfl_notify), max_intr);
25596+
25597+ /*
25598+ * check for sampling format and invoke fmt_handler
25599+ */
25600+ if (likely(ctx->smpl_fmt)) {
25601+ pfm_intr_process_smpl_fmt(ctx, set, ip, now, num_ovfls,
25602+ &ovfl_ctrl, regs);
25603+ } else {
25604+ /*
25605+ * When no sampling format is used, the default
25606+ * is:
25607+ * - mask monitoring if not switching
25608+ * - notify user if requested
25609+ *
25610+ * If notification is not requested, monitoring is masked
25611+ * and overflowed registers are not reset (saturation).
25612+ * This mimics the behavior of the default sampling format.
25613+ */
25614+ ovfl_ctrl |= PFM_OVFL_CTRL_NOTIFY;
25615+ if (has_notify || !(ovfl_ctrl & PFM_OVFL_CTRL_SWITCH))
25616+ ovfl_ctrl |= PFM_OVFL_CTRL_MASK;
25617+ }
25618+
25619+ PFM_DBG_ovfl("set%u o_notify=0x%llx o_pmds=0x%llx "
25620+ "r_pmds=0x%llx ovfl_ctrl=0x%x",
25621+ set->id,
25622+ (unsigned long long)ctx->tmp_ovfl_notify[0],
25623+ (unsigned long long)set->ovfl_pmds[0],
25624+ (unsigned long long)set->reset_pmds[0],
25625+ ovfl_ctrl);
25626+
25627+ /*
25628+ * execute the various controls
25629+ * ORDER MATTERS
25630+ */
25631+
25632+
25633+ /*
25634+ * mask monitoring
25635+ */
25636+ if (ovfl_ctrl & PFM_OVFL_CTRL_MASK) {
25637+ pfm_mask_monitoring(ctx, set);
25638+ /*
25639+ * when masking, reset is deferred until
25640+ * pfm_restart()
25641+ */
25642+ ovfl_ctrl &= ~PFM_OVFL_CTRL_RESET;
25643+
25644+ /*
25645+ * when masking, switching is deferred until
25646+ * pfm_restart and we need to remember it
25647+ */
25648+ if (ovfl_ctrl & PFM_OVFL_CTRL_SWITCH) {
25649+ set->priv_flags |= PFM_SETFL_PRIV_SWITCH;
25650+ ovfl_ctrl &= ~PFM_OVFL_CTRL_SWITCH;
25651+ }
25652+ }
25653+
25654+ /*
25655+ * switch event set
25656+ */
25657+ if (ovfl_ctrl & PFM_OVFL_CTRL_SWITCH) {
25658+ pfm_switch_sets_from_intr(ctx);
25659+ /* update view of active set */
25660+ set = ctx->active_set;
25661+ }
25662+ /*
25663+ * send overflow notification
25664+ *
25665+ * only necessary if at least one overflowed
25666+ * register had the notify flag set
25667+ */
25668+ if (has_notify && (ovfl_ctrl & PFM_OVFL_CTRL_NOTIFY)) {
25669+ /*
25670+ * block on notify, not on masking
25671+ */
25672+ if (ctx->flags.block)
25673+ pfm_post_work(current, ctx, PFM_WORK_BLOCK);
25674+
25675+ /*
25676+ * send notification, passing the original set id.
25677+ * On error (queue full, for instance), default
25678+ * to masking monitoring, i.e., saturate
25679+ */
25680+ ret = pfm_ovfl_notify(ctx, set_orig, ip);
25681+ if (unlikely(ret)) {
25682+ if (ctx->state == PFM_CTX_LOADED) {
25683+ pfm_mask_monitoring(ctx, set);
25684+ ovfl_ctrl &= ~PFM_OVFL_CTRL_RESET;
25685+ }
25686+ } else {
25687+ ctx->flags.can_restart++;
25688+ PFM_DBG_ovfl("can_restart=%u", ctx->flags.can_restart);
25689+ }
25690+ }
25691+
25692+ /*
25693+ * reset overflowed registers
25694+ */
25695+ if (ovfl_ctrl & PFM_OVFL_CTRL_RESET) {
25696+ u16 nn;
25697+ nn = bitmap_weight(cast_ulp(set->reset_pmds), max_pmd);
25698+ if (nn)
25699+ pfm_reset_pmds(ctx, set, nn, PFM_PMD_RESET_SHORT);
25700+ }
25701+ return;
25702+
25703+stop_monitoring:
25704+ /*
25705+ * Does not happen for a system-wide context nor for a
25706+ * self-monitored context. We cannot attach to a kernel-only
25707+ * thread, thus it is safe to set TIF bits, i.e., the thread
25708+ * will eventually leave the kernel or die and either we will
25709+ * catch the context and clean it up in pfm_handle_work() or
25710+ * pfm_exit_thread().
25711+ *
25712+ * Mask until we get to pfm_handle_work()
25713+ */
25714+ pfm_mask_monitoring(ctx, set);
25715+
25716+ PFM_DBG_ovfl("ctx is zombie, converted to spurious");
25717+ pfm_post_work(current, ctx, PFM_WORK_ZOMBIE);
25718+}
25719+
25720+/**
25721+ * __pfm_interrupt_handler - 1st level interrupt handler
25722+ * @ip: interrupted instruction pointer
25723+ * @regs: machine state
25724+ *
25725+ * Function is static because we use a wrapper to easily capture timing info.
25726+ *
25727+ *
25728+ * Context locking is necessary to avoid concurrent accesses from other CPUs:
25729+ * - For per-thread, we must prevent pfm_restart(), which works when the
25730+ * context is LOADED or MASKED
25731+ */
25732+static void __pfm_interrupt_handler(unsigned long ip, struct pt_regs *regs)
25733+{
25734+ struct task_struct *task;
25735+ struct pfm_context *ctx;
25736+ struct pfm_event_set *set;
25737+
25738+
25739+ task = __get_cpu_var(pmu_owner);
25740+ ctx = __get_cpu_var(pmu_ctx);
25741+
25742+ /*
25743+ * verify if there is a context on this CPU
25744+ */
25745+ if (unlikely(ctx == NULL)) {
25746+ PFM_DBG_ovfl("no ctx");
25747+ goto spurious;
25748+ }
25749+
25750+ /*
25751+ * we need to lock context because it could be accessed
25752+ * from another CPU. Depending on the priority level of
25753+ * the PMU interrupt or the arch, it may be necessary to
25754+ * mask interrupts altogether to avoid a race condition with
25755+ * the timer interrupt in case of time-based set switching,
25756+ * for instance.
25757+ */
25758+ spin_lock(&ctx->lock);
25759+
25760+ set = ctx->active_set;
25761+
25762+ /*
25763+ * For SMP per-thread, it is not possible to have
25764+ * owner != NULL && task != current.
25765+ *
25766+ * For UP per-thread, because of lazy save, it
25767+ * is possible to receive an interrupt in another task
25768+ * which is not using the PMU. This means
25769+ * that the interrupt was in-flight at the
25770+ * time of pfm_ctxswout_thread(). In that
25771+ * case, it will be replayed when the task
25772+ * is scheduled again. Hence we convert to spurious.
25773+ *
25774+ * The basic rule is that an overflow is always
25775+ * processed in the context of the task that
25776+ * generated it for all per-thread contexts.
25777+ *
25778+ * for system-wide, task is always NULL
25779+ */
25780+#ifndef CONFIG_SMP
25781+ if (unlikely((task && current->pfm_context != ctx))) {
25782+ PFM_DBG_ovfl("spurious: not owned by current task");
25783+ goto spurious;
25784+ }
25785+#endif
25786+ if (unlikely(ctx->state == PFM_CTX_MASKED)) {
25787+ PFM_DBG_ovfl("spurious: monitoring masked");
25788+ goto spurious;
25789+ }
25790+
25791+ /*
25792+ * check that monitoring is active, otherwise convert
25793+ * to spurious
25794+ */
25795+ if (unlikely(!pfm_arch_is_active(ctx))) {
25796+ PFM_DBG_ovfl("spurious: monitoring non active");
25797+ goto spurious;
25798+ }
25799+
25800+ /*
25801+ * freeze PMU and collect overflowed PMD registers
25802+ * into set->povfl_pmds. Number of overflowed PMDs
25803+ * reported in set->npend_ovfls
25804+ */
25805+ pfm_arch_intr_freeze_pmu(ctx, set);
25806+
25807+ /*
25808+ * no overflow detected, interrupt may have come
25809+ * from the previous thread running on this CPU
25810+ */
25811+ if (unlikely(!set->npend_ovfls)) {
25812+ PFM_DBG_ovfl("no npend_ovfls");
25813+ goto spurious;
25814+ }
25815+
25816+ pfm_stats_inc(ovfl_intr_regular_count);
25817+
25818+ /*
25819+ * invoke actual handler
25820+ */
25821+ pfm_overflow_handler(ctx, set, ip, regs);
25822+
25823+ /*
25824+ * unfreeze PMU, monitoring may not actually be restarted
25825+ * if context is MASKED
25826+ */
25827+ pfm_arch_intr_unfreeze_pmu(ctx);
25828+
25829+ spin_unlock(&ctx->lock);
25830+
25831+ return;
25832+
25833+spurious:
25834+ /* ctx may be NULL */
25835+ pfm_arch_intr_unfreeze_pmu(ctx);
25836+ if (ctx)
25837+ spin_unlock(&ctx->lock);
25838+
25839+ pfm_stats_inc(ovfl_intr_spurious_count);
25840+}
25841+
25842+
25843+/**
25844+ * pfm_interrupt_handler - 1st level interrupt handler
25845+ * @ip: interrupt instruction pointer
25846+ * @regs: machine state
25847+ *
25848+ * Function called from the low-level assembly code or arch-specific perfmon
25849+ * code. Simple wrapper used for timing purposes. Actual work is done in
25850+ * __pfm_interrupt_handler()
25851+ */
25852+void pfm_interrupt_handler(unsigned long ip, struct pt_regs *regs)
25853+{
25854+ u64 start;
25855+
25856+ pfm_stats_inc(ovfl_intr_all_count);
25857+
25858+ BUG_ON(!irqs_disabled());
25859+
25860+ start = sched_clock();
25861+
25862+ __pfm_interrupt_handler(ip, regs);
25863+
25864+ pfm_stats_add(ovfl_intr_ns, sched_clock() - start);
25865+}
25866+EXPORT_SYMBOL(pfm_interrupt_handler);
25867+
25868--- /dev/null
25869+++ b/perfmon/perfmon_msg.c
25870@@ -0,0 +1,229 @@
25871+/*
25872+ * perfmon_msg.c: perfmon2 notification message queue management
25873+ *
25874+ * This file implements the perfmon2 interface which
25875+ * provides access to the hardware performance counters
25876+ * of the host processor.
25877+ *
25878+ * The initial version of perfmon.c was written by
25879+ * Ganesh Venkitachalam, IBM Corp.
25880+ *
25881+ * Then it was modified for perfmon-1.x by Stephane Eranian and
25882+ * David Mosberger, Hewlett Packard Co.
25883+ *
25884+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
25885+ * by Stephane Eranian, Hewlett Packard Co.
25886+ *
25887+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
25888+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
25889+ * David Mosberger-Tang <davidm@hpl.hp.com>
25890+ *
25891+ * More information about perfmon available at:
25892+ * http://perfmon2.sf.net
25893+ *
25894+ * This program is free software; you can redistribute it and/or
25895+ * modify it under the terms of version 2 of the GNU General Public
25896+ * License as published by the Free Software Foundation.
25897+ *
25898+ * This program is distributed in the hope that it will be useful,
25899+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
25900+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
25901+ * General Public License for more details.
25902+ *
25903+ * You should have received a copy of the GNU General Public License
25904+ * along with this program; if not, write to the Free Software
25905+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
25906+ * 02111-1307 USA
25907+ */
25908+#include <linux/kernel.h>
25909+#include <linux/poll.h>
25910+#include <linux/perfmon_kern.h>
25911+
25912+/**
25913+ * pfm_get_new_msg - get a new message slot from the queue
25914+ * @ctx: context to operate on
25915+ *
25916+ * if the queue is full, NULL is returned
25917+ */
25918+static union pfarg_msg *pfm_get_new_msg(struct pfm_context *ctx)
25919+{
25920+ int next;
25921+
25922+ next = ctx->msgq_head & PFM_MSGQ_MASK;
25923+
25924+ if ((ctx->msgq_head - ctx->msgq_tail) == PFM_MSGS_COUNT)
25925+ return NULL;
25926+
25927+ /*
25928+ * move to next possible slot
25929+ */
25930+ ctx->msgq_head++;
25931+
25932+ PFM_DBG_ovfl("head=%d tail=%d msg=%d",
25933+ ctx->msgq_head & PFM_MSGQ_MASK,
25934+ ctx->msgq_tail & PFM_MSGQ_MASK,
25935+ next);
25936+
25937+ return ctx->msgq+next;
25938+}
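The queue uses free-running head/tail counters that are masked only when indexing the slot array: the queue is full when head - tail equals the slot count and empty when head equals tail. A compilable sketch of the same indexing scheme, assuming (as the masking here suggests) that the slot count is a power of two; all names below are hypothetical:

#include <stddef.h>

#define SLOT_COUNT	8			/* must be a power of two */
#define SLOT_MASK	(SLOT_COUNT - 1)

struct msg_ring {
	unsigned int head, tail;		/* free-running counters */
	int slot[SLOT_COUNT];
};

/* reserve the next slot, or return NULL when the ring is full */
static int *ring_get_new(struct msg_ring *r)
{
	if (r->head - r->tail == SLOT_COUNT)
		return NULL;
	return &r->slot[r->head++ & SLOT_MASK];
}

/* copy out the oldest entry; returns -1 when the ring is empty */
static int ring_get_next(struct msg_ring *r, int *out)
{
	if (r->head == r->tail)
		return -1;
	*out = r->slot[r->tail++ & SLOT_MASK];
	return 0;
}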
25939+
25940+/**
25941+ * pfm_notify_user - wake up any thread waiting on msg queue, post SIGIO
25942+ * @ctx: context to operate on
25943+ *
25944+ * message is already enqueued
25945+ */
25946+static void pfm_notify_user(struct pfm_context *ctx)
25947+{
25948+ if (ctx->state == PFM_CTX_ZOMBIE) {
25949+ PFM_DBG("no notification, context is zombie");
25950+ return;
25951+ }
25952+
25953+ PFM_DBG_ovfl("waking up");
25954+
25955+ wake_up_interruptible(&ctx->msgq_wait);
25956+
25957+ /*
25958+ * it is safe to call kill_fasync() from an interrupt
25959+ * handler. kill_fasync() grabs two RW locks (fasync_lock,
25960+ * tasklist_lock) in read mode. There is conflict only in
25961+ * case the PMU interrupt occurs during a write mode critical
25962+ * section. This cannot happen because for both locks, the
25963+ * write mode is always using interrupt masking (write_lock_irq).
25964+ */
25965+ kill_fasync(&ctx->async_queue, SIGIO, POLL_IN);
25966+}
25967+
25968+/**
25969+ * pfm_ovfl_notify - send overflow notification
25970+ * @ctx: context to operate on
25971+ * @set: which set the overflow comes from
25972+ * @ip: overflow interrupt instruction address (IIP)
25973+ *
25974+ * Appends an overflow notification message to context queue.
25975+ * calls pfm_notify_user() to wake up any threads and/or send a signal
25976+ *
25977+ * Context is locked and interrupts are disabled (no preemption).
25978+ */
25979+int pfm_ovfl_notify(struct pfm_context *ctx,
25980+ struct pfm_event_set *set,
25981+ unsigned long ip)
25982+{
25983+ union pfarg_msg *msg = NULL;
25984+ u64 *ovfl_pmds;
25985+
25986+ if (!ctx->flags.no_msg) {
25987+ msg = pfm_get_new_msg(ctx);
25988+ if (msg == NULL) {
25989+ /*
25990+ * when the message queue fills up, it is because the user
25991+ * did not extract the messages yet still issued
25992+ * pfm_restart(). At this point, we stop sending
25993+ * notification, thus the user will not be able to get
25994+ * new samples when using the default format.
25995+ */
25996+ PFM_DBG_ovfl("no more notification msgs");
25997+ return -1;
25998+ }
25999+
26000+ msg->pfm_ovfl_msg.msg_type = PFM_MSG_OVFL;
26001+ msg->pfm_ovfl_msg.msg_ovfl_pid = current->pid;
26002+ msg->pfm_ovfl_msg.msg_active_set = set->id;
26003+
26004+ ovfl_pmds = msg->pfm_ovfl_msg.msg_ovfl_pmds;
26005+
26006+ /*
26007+ * copy bitmask of all pmd that interrupted last
26008+ */
26009+ bitmap_copy(cast_ulp(ovfl_pmds), cast_ulp(set->ovfl_pmds),
26010+ ctx->regs.max_intr_pmd);
26011+
26012+ msg->pfm_ovfl_msg.msg_ovfl_cpu = smp_processor_id();
26013+ msg->pfm_ovfl_msg.msg_ovfl_tid = current->tgid;
26014+ msg->pfm_ovfl_msg.msg_ovfl_ip = ip;
26015+
26016+ pfm_stats_inc(ovfl_notify_count);
26017+ }
26018+
26019+ PFM_DBG_ovfl("ip=0x%lx o_pmds=0x%llx",
26020+ ip,
26021+ (unsigned long long)set->ovfl_pmds[0]);
26022+
26023+ pfm_notify_user(ctx);
26024+ return 0;
26025+}
26026+
26027+/**
26028+ * pfm_end_notify_user - notify of thread termination
26029+ * @ctx: context to operate on
26030+ *
26031+ * In per-thread mode, when not self-monitoring, perfmon
26032+ * sends an 'end' notification message when the monitored
26033+ * thread to which the context is attached is exiting.
26034+ *
26035+ * This helper message alleviates the need to track the activity
26036+ * of the thread/process when it is not directly related, i.e.,
26037+ * merely attached. In other words, there is no need to keep the thread
26038+ * ptraced.
26039+ *
26040+ * The context must be locked and interrupts disabled.
26041+ */
26042+int pfm_end_notify(struct pfm_context *ctx)
26043+{
26044+ union pfarg_msg *msg;
26045+
26046+ msg = pfm_get_new_msg(ctx);
26047+ if (msg == NULL) {
26048+ PFM_ERR("%s no more msgs", __func__);
26049+ return -1;
26050+ }
26051+ /* no leak */
26052+ memset(msg, 0, sizeof(*msg));
26053+
26054+ msg->type = PFM_MSG_END;
26055+
26056+ PFM_DBG("end msg: msg=%p no_msg=%d",
26057+ msg,
26058+ ctx->flags.no_msg);
26059+
26060+ pfm_notify_user(ctx);
26061+ return 0;
26062+}
26063+
26064+/**
26065+ * pfm_get_next_msg - copy the oldest message from the queue and move tail
26066+ * @ctx: context to use
26067+ * @m: where to copy the message into
26068+ *
26069+ * The tail of the queue is moved as a consequence of this call
26070+ */
26071+void pfm_get_next_msg(struct pfm_context *ctx, union pfarg_msg *m)
26072+{
26073+ union pfarg_msg *next;
26074+
26075+ PFM_DBG_ovfl("in head=%d tail=%d",
26076+ ctx->msgq_head & PFM_MSGQ_MASK,
26077+ ctx->msgq_tail & PFM_MSGQ_MASK);
26078+
26079+ /*
26080+ * get oldest message
26081+ */
26082+ next = ctx->msgq + (ctx->msgq_tail & PFM_MSGQ_MASK);
26083+
26084+ /*
26085+ * move tail forward
26086+ */
26087+ ctx->msgq_tail++;
26088+
26089+ /*
26090+ * copy message, we cannot simply point to it
26091+ * as it may be re-used before we copy it out
26092+ */
26093+ *m = *next;
26094+
26095+ PFM_DBG_ovfl("out head=%d tail=%d type=%d",
26096+ ctx->msgq_head & PFM_MSGQ_MASK,
26097+ ctx->msgq_tail & PFM_MSGQ_MASK,
26098+ m->type);
26099+}
26100--- /dev/null
26101+++ b/perfmon/perfmon_pmu.c
26102@@ -0,0 +1,590 @@
26103+/*
26104+ * perfmon_pmu.c: perfmon2 PMU configuration management
26105+ *
26106+ * This file implements the perfmon2 interface which
26107+ * provides access to the hardware performance counters
26108+ * of the host processor.
26109+ *
26110+ * The initial version of perfmon.c was written by
26111+ * Ganesh Venkitachalam, IBM Corp.
26112+ *
26113+ * Then it was modified for perfmon-1.x by Stephane Eranian and
26114+ * David Mosberger, Hewlett Packard Co.
26115+ *
26116+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
26117+ * by Stephane Eranian, Hewlett Packard Co.
26118+ *
26119+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
26120+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
26121+ * David Mosberger-Tang <davidm@hpl.hp.com>
26122+ *
26123+ * More information about perfmon available at:
26124+ * http://perfmon2.sf.net
26125+ *
26126+ * This program is free software; you can redistribute it and/or
26127+ * modify it under the terms of version 2 of the GNU General Public
26128+ * License as published by the Free Software Foundation.
26129+ *
26130+ * This program is distributed in the hope that it will be useful,
26131+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
26132+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
26133+ * General Public License for more details.
26134+ *
26135+ * You should have received a copy of the GNU General Public License
26136+ * along with this program; if not, write to the Free Software
26137+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
26138+ * 02111-1307 USA
26139+ */
26140+#include <linux/module.h>
26141+#include <linux/perfmon_kern.h>
26142+#include "perfmon_priv.h"
26143+
26144+#ifndef CONFIG_MODULE_UNLOAD
26145+#define module_refcount(n) 1
26146+#endif
26147+
26148+static __cacheline_aligned_in_smp int request_mod_in_progress;
26149+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pfm_pmu_conf_lock);
26150+
26151+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pfm_pmu_acq_lock);
26152+static u32 pfm_pmu_acquired;
26153+
26154+/*
26155+ * perfmon core must access PMU information ONLY through pfm_pmu_conf;
26156+ * if pfm_pmu_conf is NULL, then no description is registered
26157+ */
26158+struct pfm_pmu_config *pfm_pmu_conf;
26159+EXPORT_SYMBOL(pfm_pmu_conf);
26160+
26161+static inline int pmu_is_module(struct pfm_pmu_config *c)
26162+{
26163+ return !(c->flags & PFM_PMUFL_IS_BUILTIN);
26164+}
26165+/**
26166+ * pfm_pmu_regdesc_init -- initialize regdesc structure from PMU table
26167+ * @regs: the regdesc structure to initialize
26168+ * @excl_type: the register type(s) to exclude from this regdesc
26169+ * @unavail_pmcs: unavailable PMC registers
26170+ * @unavail_pmds: unavailable PMD registers
26171+ *
26172+ * Return:
26173+ * 0 success
26174+ * errno in case of error
26175+ */
26176+static int pfm_pmu_regdesc_init(struct pfm_regdesc *regs, int excl_type,
26177+ u64 *unavail_pmcs, u64 *unavail_pmds)
26178+{
26179+ struct pfm_regmap_desc *d;
26180+ u16 n, n2, n_counters, i;
26181+ int first_intr_pmd = -1, max1, max2, max3;
26182+
26183+ /*
26184+ * compute the number of implemented PMC from the
26185+ * description table
26186+ */
26187+ n = 0;
26188+ max1 = max2 = -1;
26189+ d = pfm_pmu_conf->pmc_desc;
26190+ for (i = 0; i < pfm_pmu_conf->num_pmc_entries; i++, d++) {
26191+ if (!(d->type & PFM_REG_I))
26192+ continue;
26193+
26194+ if (test_bit(i, cast_ulp(unavail_pmcs)))
26195+ continue;
26196+
26197+ if (d->type & excl_type)
26198+ continue;
26199+
26200+ __set_bit(i, cast_ulp(regs->pmcs));
26201+
26202+ max1 = i;
26203+ n++;
26204+ }
26205+
26206+ if (!n) {
26207+ PFM_INFO("%s PMU description has no PMC registers",
26208+ pfm_pmu_conf->pmu_name);
26209+ return -EINVAL;
26210+ }
26211+
26212+ regs->max_pmc = max1 + 1;
26213+ regs->num_pmcs = n;
26214+
26215+ n = n_counters = n2 = 0;
26216+ max1 = max2 = max3 = -1;
26217+ d = pfm_pmu_conf->pmd_desc;
26218+ for (i = 0; i < pfm_pmu_conf->num_pmd_entries; i++, d++) {
26219+ if (!(d->type & PFM_REG_I))
26220+ continue;
26221+
26222+ if (test_bit(i, cast_ulp(unavail_pmds)))
26223+ continue;
26224+
26225+ if (d->type & excl_type)
26226+ continue;
26227+
26228+ __set_bit(i, cast_ulp(regs->pmds));
26229+ max1 = i;
26230+ n++;
26231+
26232+ /*
26233+ * read-write registers
26234+ */
26235+ if (!(d->type & PFM_REG_RO)) {
26236+ __set_bit(i, cast_ulp(regs->rw_pmds));
26237+ max3 = i;
26238+ n2++;
26239+ }
26240+
26241+ /*
26242+ * counter registers
26243+ */
26244+ if (d->type & PFM_REG_C64) {
26245+ __set_bit(i, cast_ulp(regs->cnt_pmds));
26246+ n_counters++;
26247+ }
26248+
26249+ /*
26250+ * PMD with intr capabilities
26251+ */
26252+ if (d->type & PFM_REG_INTR) {
26253+ __set_bit(i, cast_ulp(regs->intr_pmds));
26254+ if (first_intr_pmd == -1)
26255+ first_intr_pmd = i;
26256+ max2 = i;
26257+ }
26258+ }
26259+
26260+ if (!n) {
26261+ PFM_INFO("%s PMU description has no PMD registers",
26262+ pfm_pmu_conf->pmu_name);
26263+ return -EINVAL;
26264+ }
26265+
26266+ regs->max_pmd = max1 + 1;
26267+ regs->first_intr_pmd = first_intr_pmd;
26268+ regs->max_intr_pmd = max2 + 1;
26269+
26270+ regs->num_counters = n_counters;
26271+ regs->num_pmds = n;
26272+ regs->max_rw_pmd = max3 + 1;
26273+ regs->num_rw_pmd = n2;
26274+
26275+ return 0;
26276+}
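Note the two different quantities recorded per register class above: num_* counts the implemented registers, while max_* is one past the highest implemented index and is the bound later bitmap scans use. A small standalone illustration of that convention over a 64-bit mask (hypothetical helper, for illustration):

#include <stdint.h>

/* compute the register count and the scan bound (highest set bit + 1)
 * from a bitmask of implemented registers, as the init loop above does */
static void regmask_stats(uint64_t impl_mask, unsigned int *num,
			  unsigned int *max)
{
	unsigned int i;

	*num = 0;
	*max = 0;
	for (i = 0; i < 64; i++) {
		if (impl_mask & (1ULL << i)) {
			(*num)++;
			*max = i + 1;
		}
	}
}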
26277+
26278+/**
26279+ * pfm_pmu_regdesc_init_all -- initialize all regdesc structures
26280+ * @una_pmcs : unavailable PMC registers
26281+ * @una_pmds : unavailable PMD registers
26282+ *
26283+ * Return:
26284+ * 0 success
26285+ * errno if error
26286+ *
26287+ * We maintain 3 regdesc:
26288+ * regs_all: all available registers
26289+ * regs_sys: registers available to system-wide contexts only
26290+ * regs_thr: registers available to per-thread contexts only
26291+ */
26292+static int pfm_pmu_regdesc_init_all(u64 *una_pmcs, u64 *una_pmds)
26293+{
26294+ int ret;
26295+
26296+ memset(&pfm_pmu_conf->regs_all, 0, sizeof(struct pfm_regdesc));
26297+ memset(&pfm_pmu_conf->regs_thr, 0, sizeof(struct pfm_regdesc));
26298+ memset(&pfm_pmu_conf->regs_sys, 0, sizeof(struct pfm_regdesc));
26299+
26300+ ret = pfm_pmu_regdesc_init(&pfm_pmu_conf->regs_all,
26301+ 0,
26302+ una_pmcs, una_pmds);
26303+ if (ret)
26304+ return ret;
26305+
26306+ PFM_DBG("regs_all.pmcs=0x%llx",
26307+ (unsigned long long)pfm_pmu_conf->regs_all.pmcs[0]);
26308+
26309+ ret = pfm_pmu_regdesc_init(&pfm_pmu_conf->regs_thr,
26310+ PFM_REG_SYS,
26311+ una_pmcs, una_pmds);
26312+ if (ret)
26313+ return ret;
26314+ PFM_DBG("regs.thr.pmcs=0x%llx",
26315+ (unsigned long long)pfm_pmu_conf->regs_thr.pmcs[0]);
26316+
26317+ ret = pfm_pmu_regdesc_init(&pfm_pmu_conf->regs_sys,
26318+ PFM_REG_THR,
26319+ una_pmcs, una_pmds);
26320+
26321+ PFM_DBG("regs_sys.pmcs=0x%llx",
26322+ (unsigned long long)pfm_pmu_conf->regs_sys.pmcs[0]);
26323+
26324+ return ret;
26325+}
26326+
26327+int pfm_pmu_register(struct pfm_pmu_config *cfg)
26328+{
26329+ u16 i, nspec, nspec_ro, num_pmcs, num_pmds, num_wc = 0;
26330+ int type, ret = -EBUSY;
26331+
26332+ if (perfmon_disabled) {
26333+ PFM_INFO("perfmon disabled, cannot add PMU description");
26334+ return -ENOSYS;
26335+ }
26336+
26337+ nspec = nspec_ro = num_pmds = num_pmcs = 0;
26338+
26339+ /* some sanity checks */
26340+ if (cfg == NULL || cfg->pmu_name == NULL) {
26341+ PFM_INFO("PMU config descriptor is invalid");
26342+ return -EINVAL;
26343+ }
26344+
26345+ /* must have a probe */
26346+ if (cfg->probe_pmu == NULL) {
26347+ PFM_INFO("PMU config has no probe routine");
26348+ return -EINVAL;
26349+ }
26350+
26351+ /*
26352+ * execute probe routine before anything else as it
26353+ * may update configuration tables
26354+ */
26355+ if ((*cfg->probe_pmu)() == -1) {
26356+ PFM_INFO("%s PMU detection failed", cfg->pmu_name);
26357+ return -EINVAL;
26358+ }
26359+
26360+ if (!(cfg->flags & PFM_PMUFL_IS_BUILTIN) && cfg->owner == NULL) {
26361+ PFM_INFO("PMU config %s is missing owner", cfg->pmu_name);
26362+ return -EINVAL;
26363+ }
26364+
26365+ if (!cfg->num_pmd_entries) {
26366+ PFM_INFO("%s needs to define num_pmd_entries", cfg->pmu_name);
26367+ return -EINVAL;
26368+ }
26369+
26370+ if (!cfg->num_pmc_entries) {
26371+ PFM_INFO("%s needs to define num_pmc_entries", cfg->pmu_name);
26372+ return -EINVAL;
26373+ }
26374+
26375+ if (!cfg->counter_width) {
26376+ PFM_INFO("PMU config %s, zero width counters", cfg->pmu_name);
26377+ return -EINVAL;
26378+ }
26379+
26380+ /*
26381+ * REG_RO, REG_V not supported on PMC registers
26382+ */
26383+ for (i = 0; i < cfg->num_pmc_entries; i++) {
26384+
26385+ type = cfg->pmc_desc[i].type;
26386+
26387+ if (type & PFM_REG_I)
26388+ num_pmcs++;
26389+
26390+ if (type & PFM_REG_WC)
26391+ num_wc++;
26392+
26393+ if (type & PFM_REG_V) {
26394+ PFM_INFO("PFM_REG_V is not supported on "
26395+ "PMCs (PMC%d)", i);
26396+ return -EINVAL;
26397+ }
26398+ if (type & PFM_REG_RO) {
26399+ PFM_INFO("PFM_REG_RO meaningless on "
26400+ "PMCs (PMC%u)", i);
26401+ return -EINVAL;
26402+ }
26403+ }
26404+
26405+ if (num_wc && cfg->pmc_write_check == NULL) {
26406+ PFM_INFO("some PMCs have a write-checker but no callback provided");
26407+ return -EINVAL;
26408+ }
26409+
26410+ /*
26411+ * check virtual PMD registers
26412+ */
26413+ num_wc = 0;
26414+ for (i = 0; i < cfg->num_pmd_entries; i++) {
26415+
26416+ type = cfg->pmd_desc[i].type;
26417+
26418+ if (type & PFM_REG_I)
26419+ num_pmds++;
26420+
26421+ if (type & PFM_REG_V) {
26422+ nspec++;
26423+ if (type & PFM_REG_RO)
26424+ nspec_ro++;
26425+ }
26426+
26427+ if (type & PFM_REG_WC)
26428+ num_wc++;
26429+ }
26430+
26431+ if (num_wc && cfg->pmd_write_check == NULL) {
26432+ PFM_INFO("PMDs have a write-checker but no callback provided");
26433+ return -EINVAL;
26434+ }
26435+
26436+ if (nspec && cfg->pmd_sread == NULL) {
26437+ PFM_INFO("PMU config is missing pmd_sread()");
26438+ return -EINVAL;
26439+ }
26440+
26441+ nspec = nspec - nspec_ro;
26442+ if (nspec && cfg->pmd_swrite == NULL) {
26443+ PFM_INFO("PMU config is missing pmd_swrite()");
26444+ return -EINVAL;
26445+ }
26446+
26447+ if (num_pmcs >= PFM_MAX_PMCS) {
26448+ PFM_INFO("%s PMCS registers exceed name space [0-%u]",
26449+ cfg->pmu_name,
26450+ PFM_MAX_PMCS);
26451+ return -EINVAL;
26452+ }
26453+ if (num_pmds >= PFM_MAX_PMDS) {
26454+ PFM_INFO("%s PMDS registers exceed name space [0-%u]",
26455+ cfg->pmu_name,
26456+ PFM_MAX_PMDS);
26457+ return -EINVAL;
26458+ }
26459+ spin_lock(&pfm_pmu_conf_lock);
26460+
26461+ if (pfm_pmu_conf)
26462+ goto unlock;
26463+
26464+ if (!cfg->version)
26465+ cfg->version = "0.0";
26466+
26467+ pfm_pmu_conf = cfg;
26468+ pfm_pmu_conf->ovfl_mask = (1ULL << cfg->counter_width) - 1;
26469+
26470+ ret = pfm_arch_pmu_config_init(cfg);
26471+ if (ret)
26472+ goto unlock;
26473+
26474+ ret = pfm_sysfs_add_pmu(pfm_pmu_conf);
26475+ if (ret)
26476+ pfm_pmu_conf = NULL;
26477+
26478+unlock:
26479+ spin_unlock(&pfm_pmu_conf_lock);
26480+
26481+ if (ret) {
26482+ PFM_INFO("register %s PMU error %d", cfg->pmu_name, ret);
26483+ } else {
26484+ PFM_INFO("%s PMU installed", cfg->pmu_name);
26485+ /*
26486+ * (re)initialize PMU on each PMU now that we have a description
26487+ */
26488+ on_each_cpu(__pfm_init_percpu, cfg, 0);
26489+ }
26490+ return ret;
26491+}
26492+EXPORT_SYMBOL(pfm_pmu_register);
26493+
26494+/*
26495+ * remove PMU description. Caller must pass address of current
26496+ * configuration. This is mostly for sanity checking as only
26497+ * one config can exist at any time.
26498+ *
26499+ * We are using the module refcount mechanism to protect against
26500+ * removal while the configuration is being used. As long as there is
26501+ * one context, a PMU configuration cannot be removed. The protection is
26502+ * managed in module logic.
26503+ */
26504+void pfm_pmu_unregister(struct pfm_pmu_config *cfg)
26505+{
26506+ if (!(cfg || pfm_pmu_conf))
26507+ return;
26508+
26509+ spin_lock(&pfm_pmu_conf_lock);
26510+
26511+ BUG_ON(module_refcount(pfm_pmu_conf->owner));
26512+
26513+ if (cfg->owner == pfm_pmu_conf->owner) {
26514+ pfm_sysfs_remove_pmu(pfm_pmu_conf);
26515+ pfm_pmu_conf = NULL;
26516+ }
26517+
26518+ spin_unlock(&pfm_pmu_conf_lock);
26519+}
26520+EXPORT_SYMBOL(pfm_pmu_unregister);
26521+
26522+static int pfm_pmu_request_module(void)
26523+{
26524+ char *mod_name;
26525+ int ret;
26526+
26527+ mod_name = pfm_arch_get_pmu_module_name();
26528+ if (mod_name == NULL)
26529+ return -ENOSYS;
26530+
26531+ ret = request_module(mod_name);
26532+
26533+ PFM_DBG("mod=%s ret=%d\n", mod_name, ret);
26534+ return ret;
26535+}
26536+
26537+/*
26538+ * autoload:
26539+ * 0 : do not try to autoload the PMU description module
26540+ * not 0 : try to autoload the PMU description module
26541+ */
26542+int pfm_pmu_conf_get(int autoload)
26543+{
26544+ int ret;
26545+
26546+ spin_lock(&pfm_pmu_conf_lock);
26547+
26548+ if (request_mod_in_progress) {
26549+ ret = -ENOSYS;
26550+ goto skip;
26551+ }
26552+
26553+ if (autoload && pfm_pmu_conf == NULL) {
26554+
26555+ request_mod_in_progress = 1;
26556+
26557+ spin_unlock(&pfm_pmu_conf_lock);
26558+
26559+ pfm_pmu_request_module();
26560+
26561+ spin_lock(&pfm_pmu_conf_lock);
26562+
26563+ request_mod_in_progress = 0;
26564+
26565+ /*
26566+ * request_module() may succeed but the module
26567+ * may not have registered properly so we need
26568+ * to check
26569+ */
26570+ }
26571+
26572+ ret = pfm_pmu_conf == NULL ? -ENOSYS : 0;
26573+ if (!ret && pmu_is_module(pfm_pmu_conf)
26574+ && !try_module_get(pfm_pmu_conf->owner))
26575+ ret = -ENOSYS;
26576+
26577+skip:
26578+ spin_unlock(&pfm_pmu_conf_lock);
26579+
26580+ return ret;
26581+}
26582+
26583+void pfm_pmu_conf_put(void)
26584+{
26585+ if (pfm_pmu_conf == NULL || !pmu_is_module(pfm_pmu_conf))
26586+ return;
26587+
26588+ spin_lock(&pfm_pmu_conf_lock);
26589+ module_put(pfm_pmu_conf->owner);
26590+ spin_unlock(&pfm_pmu_conf_lock);
26591+}
26592+
26593+
26594+/*
26595+ * acquire PMU resource from lower-level PMU register allocator
26596+ * (currently perfctr-watchdog.c)
26597+ *
26598+ * acquisition is done when the first context is created (and not
26599+ * when it is loaded). We grab all that is defined in the description
26600+ * module and then we make adjustments at the arch-specific level.
26601+ *
26602+ * The PMU resource is released when the last perfmon context is
26603+ * destroyed.
26604+ *
26605+ * interrupts are not masked
26606+ */
26607+int pfm_pmu_acquire(struct pfm_context *ctx)
26608+{
26609+ u64 unavail_pmcs[PFM_PMC_BV];
26610+ u64 unavail_pmds[PFM_PMD_BV];
26611+ int ret = 0;
26612+
26613+ spin_lock(&pfm_pmu_acq_lock);
26614+
26615+ PFM_DBG("pmu_acquired=%u", pfm_pmu_acquired);
26616+
26617+ pfm_pmu_acquired++;
26618+
26619+ /*
26620+ * we need to re-initialize the regdesc each time the PMU is
26621+ * acquired by its first user, as there may have been changes
26622+ * in the list of available registers, e.g., NMI may have
26623+ * been disabled. Checking on PMU module insert is not
26624+ * enough
26625+ */
26626+ if (pfm_pmu_acquired == 1) {
26627+ memset(unavail_pmcs, 0, sizeof(unavail_pmcs));
26628+ memset(unavail_pmds, 0, sizeof(unavail_pmds));
26629+
26630+ ret = pfm_arch_pmu_acquire(unavail_pmcs, unavail_pmds);
26631+ if (ret) {
26632+ pfm_pmu_acquired--;
26633+ } else {
26634+ pfm_pmu_regdesc_init_all(unavail_pmcs, unavail_pmds);
26635+
26636+ /* available PMU resources */
26637+ PFM_DBG("PMU acquired: %u PMCs, %u PMDs, %u counters",
26638+ pfm_pmu_conf->regs_all.num_pmcs,
26639+ pfm_pmu_conf->regs_all.num_pmds,
26640+ pfm_pmu_conf->regs_all.num_counters);
26641+ }
26642+ }
26643+ spin_unlock(&pfm_pmu_acq_lock);
26644+
26645+ /*
26646+ * copy the regdesc that corresponds to the context.
26647+ * We copy rather than just point because it helps with
26648+ * memory locality. The regdesc structure is accessed
26649+ * very frequently in performance-critical code such
26650+ * as context switch and interrupt handling. By using
26651+ * a local copy, we increase the memory footprint, but
26652+ * also increase the chance of local memory access,
26653+ * especially for system-wide contexts.
26654+ */
26655+ if (ctx->flags.system)
26656+ ctx->regs = pfm_pmu_conf->regs_sys;
26657+ else
26658+ ctx->regs = pfm_pmu_conf->regs_thr;
26659+
26660+ return ret;
26661+}
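pfm_pmu_acquire() and pfm_pmu_release() (below) follow a first-user-acquires, last-user-releases pattern: only the 0 -> 1 transition calls into the arch allocator and rebuilds the regdescs, and only the 1 -> 0 transition hands the hardware back. A condensed sketch of the counting, with hypothetical hw_grab()/hw_drop() standing in for the arch hooks and the spinlock left out:

static unsigned int pmu_users;

static int hw_grab(void)	{ return 0; }	/* stand-in for the arch acquire hook */
static void hw_drop(void)	{ }		/* stand-in for the arch release hook */

static int pmu_get(void)
{
	int ret = 0;

	if (++pmu_users == 1) {
		ret = hw_grab();
		if (ret)
			pmu_users--;		/* roll back on failure */
	}
	return ret;
}

static void pmu_put(void)
{
	if (pmu_users > 0 && --pmu_users == 0)
		hw_drop();
}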
26662+
26663+/*
26664+ * release the PMU resource
26665+ *
26666+ * actual release happens when last context is destroyed
26667+ *
26668+ * interrupts are not masked
26669+ */
26670+void pfm_pmu_release(void)
26671+{
26672+ BUG_ON(irqs_disabled());
26673+
26674+ /*
26675+ * we need to use a spinlock because release takes some time
26676+ * and we may have a race with pfm_pmu_acquire()
26677+ */
26678+ spin_lock(&pfm_pmu_acq_lock);
26679+
26680+ PFM_DBG("pmu_acquired=%d", pfm_pmu_acquired);
26681+
26682+ /*
26683+ * we decouple test and decrement because if we had errors
26684+ * in pfm_pmu_acquire(), we still come here on pfm_context_free()
26685+ * but with pfm_pmu_acquired == 0
26686+ */
26687+ if (pfm_pmu_acquired > 0 && --pfm_pmu_acquired == 0) {
26688+ pfm_arch_pmu_release();
26689+ PFM_DBG("PMU released");
26690+ }
26691+ spin_unlock(&pfm_pmu_acq_lock);
26692+}
26693--- /dev/null
26694+++ b/perfmon/perfmon_priv.h
26695@@ -0,0 +1,182 @@
26696+/*
26697+ * Copyright (c) 2001-2006 Hewlett-Packard Development Company, L.P.
26698+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
26699+ *
26700+ * This program is free software; you can redistribute it and/or
26701+ * modify it under the terms of version 2 of the GNU General Public
26702+ * License as published by the Free Software Foundation.
26703+ *
26704+ * This program is distributed in the hope that it will be useful,
26705+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
26706+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
26707+ * General Public License for more details.
26708+ *
26709+ * You should have received a copy of the GNU General Public License
26710+ * along with this program; if not, write to the Free Software
26711+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
26712+ * 02111-1307 USA
26713+ */
26714+
26715+#ifndef __PERFMON_PRIV_H__
26716+#define __PERFMON_PRIV_H__
26717+/*
26718+ * This file contains all the definitions of data structures, variables, macros
26719+ * that are private to the generic code, i.e., not shared with any code that
26720+ * lives under arch/ or include/asm-XX
26721+ *
26722+ * For shared definitions, use include/linux/perfmon_kern.h
26723+ */
26724+
26725+#ifdef CONFIG_PERFMON
26726+
26727+/*
26728+ * type of PMD reset for pfm_reset_pmds() or pfm_switch_sets*()
26729+ */
26730+#define PFM_PMD_RESET_SHORT 1 /* use short reset value */
26731+#define PFM_PMD_RESET_LONG 2 /* use long reset value */
26732+
26733+/*
26734+ * context lazy save/restore activation count
26735+ */
26736+#define PFM_INVALID_ACTIVATION ((u64)~0)
26737+
26738+DECLARE_PER_CPU(u64, pmu_activation_number);
26739+DECLARE_PER_CPU(struct hrtimer, pfm_hrtimer);
26740+
26741+static inline void pfm_set_pmu_owner(struct task_struct *task,
26742+ struct pfm_context *ctx)
26743+{
26744+ __get_cpu_var(pmu_owner) = task;
26745+ __get_cpu_var(pmu_ctx) = ctx;
26746+}
26747+
26748+static inline int pfm_msgq_is_empty(struct pfm_context *ctx)
26749+{
26750+ return ctx->msgq_head == ctx->msgq_tail;
26751+}
26752+
26753+void pfm_get_next_msg(struct pfm_context *ctx, union pfarg_msg *m);
26754+int pfm_end_notify(struct pfm_context *ctx);
26755+int pfm_ovfl_notify(struct pfm_context *ctx, struct pfm_event_set *set,
26756+ unsigned long ip);
26757+
26758+int pfm_alloc_fd(struct file **cfile);
26759+
26760+int __pfm_delete_evtsets(struct pfm_context *ctx, void *arg, int count);
26761+int __pfm_getinfo_evtsets(struct pfm_context *ctx, struct pfarg_setinfo *req,
26762+ int count);
26763+int __pfm_create_evtsets(struct pfm_context *ctx, struct pfarg_setdesc *req,
26764+ int count);
26765+
26766+
26767+int pfm_init_ctx(void);
26768+
26769+int pfm_pmu_acquire(struct pfm_context *ctx);
26770+void pfm_pmu_release(void);
26771+
26772+int pfm_session_acquire(int is_system, u32 cpu);
26773+void pfm_session_release(int is_system, u32 cpu);
26774+
26775+int pfm_smpl_buf_space_acquire(struct pfm_context *ctx, size_t size);
26776+int pfm_smpl_buf_load_context(struct pfm_context *ctx);
26777+void pfm_smpl_buf_unload_context(struct pfm_context *ctx);
26778+
26779+int pfm_init_sysfs(void);
26780+
26781+#ifdef CONFIG_PERFMON_DEBUG_FS
26782+int pfm_init_debugfs(void);
26783+int pfm_debugfs_add_cpu(int mycpu);
26784+void pfm_debugfs_del_cpu(int mycpu);
26785+#else
26786+static inline int pfm_init_debugfs(void)
26787+{
26788+ return 0;
26789+}
26790+static inline int pfm_debugfs_add_cpu(int mycpu)
26791+{
26792+ return 0;
26793+}
26794+
26795+static inline void pfm_debugfs_del_cpu(int mycpu)
26796+{}
26797+#endif
26798+
26799+
26800+void pfm_reset_pmds(struct pfm_context *ctx, struct pfm_event_set *set,
26801+ int num_pmds,
26802+ int reset_mode);
26803+
26804+struct pfm_event_set *pfm_prepare_sets(struct pfm_context *ctx, u16 load_set);
26805+int pfm_init_sets(void);
26806+
26807+ssize_t pfm_sysfs_res_show(char *buf, size_t sz, int what);
26808+
26809+void pfm_free_sets(struct pfm_context *ctx);
26810+int pfm_create_initial_set(struct pfm_context *ctx);
26811+void pfm_switch_sets_from_intr(struct pfm_context *ctx);
26812+void pfm_restart_timer(struct pfm_context *ctx, struct pfm_event_set *set);
26813+enum hrtimer_restart pfm_handle_switch_timeout(struct hrtimer *t);
26814+
26815+enum hrtimer_restart pfm_switch_sets(struct pfm_context *ctx,
26816+ struct pfm_event_set *new_set,
26817+ int reset_mode,
26818+ int no_restart);
26819+
26820+/**
26821+ * pfm_check_save_prev_ctx - check if previous context exists and save state
26822+ *
26823+ * called from pfm_load_ctx_thread() and __pfm_ctxswin_thread() to
26824+ * check if a previous context exists. If so, save its PMU state. This is used
26825+ * only for UP kernels.
26826+ *
26827+ * PMU ownership is not cleared because the function is always called while
26828+ * trying to install a new owner.
26829+ */
26830+static inline void pfm_check_save_prev_ctx(void)
26831+{
26832+#ifndef CONFIG_SMP
26833+ struct pfm_event_set *set;
26834+ struct pfm_context *ctxp;
26835+
26836+ ctxp = __get_cpu_var(pmu_ctx);
26837+ if (!ctxp)
26838+ return;
26839+ /*
26840+ * in UP per-thread, due to lazy save
26841+ * there could be a context from another
26842+ * task. We need to push it first before
26843+ * installing our new state
26844+ */
26845+ set = ctxp->active_set;
26846+ pfm_save_pmds(ctxp, set);
26847+ /*
26848+ * do not clear ownership because we rewrite
26849+ * right away
26850+ */
26851+#endif
26852+}
26853+
26854+
26855+int pfm_init_fs(void);
26856+
26857+int pfm_init_hotplug(void);
26858+
26859+void pfm_mask_monitoring(struct pfm_context *ctx, struct pfm_event_set *set);
26860+void pfm_resume_after_ovfl(struct pfm_context *ctx);
26861+int pfm_setup_smpl_fmt(struct pfm_context *ctx, u32 ctx_flags, void *fmt_arg,
26862+ struct file *filp);
26863+
26864+static inline void pfm_post_work(struct task_struct *task,
26865+ struct pfm_context *ctx, int type)
26866+{
26867+ ctx->flags.work_type = type;
26868+ set_tsk_thread_flag(task, TIF_PERFMON_WORK);
26869+ pfm_arch_arm_handle_work(task);
26870+}
26871+
26872+#define PFM_PMC_STK_ARG PFM_ARCH_PMC_STK_ARG
26873+#define PFM_PMD_STK_ARG PFM_ARCH_PMD_STK_ARG
26874+
26875+#endif /* CONFIG_PERFMON */
26876+
26877+#endif /* __PERFMON_PRIV_H__ */
26878--- /dev/null
26879+++ b/perfmon/perfmon_res.c
26880@@ -0,0 +1,450 @@
26881+/*
26882+ * perfmon_res.c: perfmon2 resource allocations
26883+ *
26884+ * This file implements the perfmon2 interface which
26885+ * provides access to the hardware performance counters
26886+ * of the host processor.
26887+ *
26888+ * The initial version of perfmon.c was written by
26889+ * Ganesh Venkitachalam, IBM Corp.
26890+ *
26891+ * Then it was modified for perfmon-1.x by Stephane Eranian and
26892+ * David Mosberger, Hewlett Packard Co.
26893+ *
26894+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
26895+ * by Stephane Eranian, Hewlett Packard Co.
26896+ *
26897+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
26898+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
26899+ * David Mosberger-Tang <davidm@hpl.hp.com>
26900+ *
26901+ * More information about perfmon available at:
26902+ * http://perfmon2.sf.net
26903+ *
26904+ * This program is free software; you can redistribute it and/or
26905+ * modify it under the terms of version 2 of the GNU General Public
26906+ * License as published by the Free Software Foundation.
26907+ *
26908+ * This program is distributed in the hope that it will be useful,
26909+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
26910+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
26911+ * General Public License for more details.
26912+ *
26913+ * You should have received a copy of the GNU General Public License
26914+ * along with this program; if not, write to the Free Software
26915+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
26916+ * 02111-1307 USA
26917+ */
26918+#include <linux/kernel.h>
26919+#include <linux/module.h>
26920+#include <linux/perfmon_kern.h>
26921+#include "perfmon_priv.h"
26922+
26923+/*
26924+ * global information about all sessions
26925+ * mostly used to synchronize between system wide and per-process
26926+ */
26927+struct pfm_resources {
26928+ size_t smpl_buf_mem_cur;/* current smpl buf mem usage */
26929+ cpumask_t sys_cpumask; /* bitmask of used cpus */
26930+ u32 thread_sessions; /* #num loaded per-thread sessions */
26931+};
26932+
26933+static struct pfm_resources pfm_res;
26934+
26935+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pfm_res_lock);
26936+
26937+/**
26938+ * pfm_smpl_buf_space_acquire - check memory resource usage for sampling buffer
26939+ * @ctx: context of interest
26940+ * @size: size of requested buffer
26941+ *
26942+ * sampling buffer allocated by perfmon must be
26943+ * checked against max locked memory usage thresholds
26944+ * for security reasons.
26945+ *
26946+ * The first level check is against the system wide limit
26947+ * as indicated by the system administrator in /sys/kernel/perfmon
26948+ *
26949+ * The second level check is on a per-process basis using
26950+ * RLIMIT_MEMLOCK limit.
26951+ *
26952+ * Operating on the current task only.
26953+ */
26954+int pfm_smpl_buf_space_acquire(struct pfm_context *ctx, size_t size)
26955+{
26956+ struct mm_struct *mm;
26957+ unsigned long locked;
26958+ unsigned long buf_mem, buf_mem_max;
26959+ unsigned long flags;
26960+
26961+ spin_lock_irqsave(&pfm_res_lock, flags);
26962+
26963+ /*
26964+ * check against global buffer limit
26965+ */
26966+ buf_mem_max = pfm_controls.smpl_buffer_mem_max;
26967+ buf_mem = pfm_res.smpl_buf_mem_cur + size;
26968+
26969+ if (buf_mem <= buf_mem_max) {
26970+ pfm_res.smpl_buf_mem_cur = buf_mem;
26971+
26972+ PFM_DBG("buf_mem_max=%lu current_buf_mem=%lu",
26973+ buf_mem_max,
26974+ buf_mem);
26975+ }
26976+
26977+ spin_unlock_irqrestore(&pfm_res_lock, flags);
26978+
26979+ if (buf_mem > buf_mem_max) {
26980+ PFM_DBG("smpl buffer memory threshold reached");
26981+ return -ENOMEM;
26982+ }
26983+
26984+ /*
26985+ * check against per-process RLIMIT_MEMLOCK
26986+ */
26987+ mm = get_task_mm(current);
26988+
26989+ down_write(&mm->mmap_sem);
26990+
26991+ locked = mm->locked_vm << PAGE_SHIFT;
26992+ locked += size;
26993+
26994+ if (locked > current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur) {
26995+
26996+ PFM_DBG("RLIMIT_MEMLOCK reached ask_locked=%lu rlim_cur=%lu",
26997+ locked,
26998+ current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur);
26999+
27000+ up_write(&mm->mmap_sem);
27001+ mmput(mm);
27002+ goto unres;
27003+ }
27004+
27005+ mm->locked_vm = locked >> PAGE_SHIFT;
27006+
27007+ up_write(&mm->mmap_sem);
27008+
27009+ mmput(mm);
27010+
27011+ return 0;
27012+
27013+unres:
27014+ /*
27015+ * remove global buffer memory allocation
27016+ */
27017+ spin_lock_irqsave(&pfm_res_lock, flags);
27018+
27019+ pfm_res.smpl_buf_mem_cur -= size;
27020+
27021+ spin_unlock_irqrestore(&pfm_res_lock, flags);
27022+
27023+ return -ENOMEM;
27024+}
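The per-process half of the check above works in bytes: the task's locked page count is scaled by PAGE_SHIFT, the requested buffer size is added, and the sum is compared against the RLIMIT_MEMLOCK soft limit before being written back as pages. The same accounting in isolation, assuming 4 KiB pages for the example:

#include <stddef.h>

#define EX_PAGE_SHIFT	12	/* assumed 4 KiB pages for this example */

/* charge 'size' bytes against the locked-memory limit; returns 0 and
 * updates the locked page count on success, -1 when the limit is hit */
static int charge_locked(unsigned long *locked_pages, size_t size,
			 unsigned long rlim_cur_bytes)
{
	unsigned long locked = (*locked_pages << EX_PAGE_SHIFT) + size;

	if (locked > rlim_cur_bytes)
		return -1;
	*locked_pages = locked >> EX_PAGE_SHIFT;
	return 0;
}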
27025+/**
27026+ * pfm_smpl_buf_space_release - release resource usage for sampling buffer
27027+ * @ctx: perfmon context of interest
27028+ *
27029+ * There exist multiple paths leading to this function. We need to
27030+ * be very careful with locking on the mmap_sem as it may already be
27031+ * held by the time we come here.
27032+ * The following paths exist:
27033+ *
27034+ * exit path:
27035+ * sys_exit_group
27036+ * do_group_exit
27037+ * do_exit
27038+ * exit_mm
27039+ * mmput
27040+ * exit_mmap
27041+ * remove_vma
27042+ * fput
27043+ * __fput
27044+ * pfm_close
27045+ * __pfm_close
27046+ * pfm_context_free
27047+ * pfm_release_buf_space
27048+ * munmap path:
27049+ * sys_munmap
27050+ * do_munmap
27051+ * remove_vma
27052+ * fput
27053+ * __fput
27054+ * pfm_close
27055+ * __pfm_close
27056+ * pfm_context_free
27057+ * pfm_release_buf_space
27058+ *
27059+ * close path:
27060+ * sys_close
27061+ * filp_close
27062+ * fput
27063+ * __fput
27064+ * pfm_close
27065+ * __pfm_close
27066+ * pfm_context_free
27067+ * pfm_release_buf_space
27068+ *
27069+ * The issue is that on the munmap() path, the mmap_sem is already held
27070+ * in write-mode by the time we come here. To avoid the deadlock, we need
27071+ * to know where we are coming from and skip down_write(). It is fairly
27072+ * difficult to know this because of the lack of good hooks and
27073+ * the fact that there may not have been any mmap() of the sampling buffer
27074+ * (i.e. create_context() followed by close() or exit()).
27075+ *
27076+ * We use a flag, ctx->flags.mmap_nlock, which is toggled in the vm_ops
27077+ * callback invoked from remove_vma(); that callback runs whenever a vma is
27078+ * removed, so on all but the pure close() path. The exit path does not hold
27079+ * the lock but this is exit so there is no task->mm by the time we come here.
27080+ *
27081+ * The mmap_nlock is set only when unmapping and this is the LAST reference
27082+ * to the file (i.e., close() followed by munmap()).
27083+ */
27084+void pfm_smpl_buf_space_release(struct pfm_context *ctx, size_t size)
27085+{
27086+ unsigned long flags;
27087+ struct mm_struct *mm;
27088+
27089+ mm = get_task_mm(current);
27090+ if (mm) {
27091+ if (ctx->flags.mmap_nlock == 0) {
27092+ PFM_DBG("doing down_write");
27093+ down_write(&mm->mmap_sem);
27094+ }
27095+
27096+ mm->locked_vm -= size >> PAGE_SHIFT;
27097+
27098+ PFM_DBG("size=%zu locked_vm=%lu", size, mm->locked_vm);
27099+
27100+ if (ctx->flags.mmap_nlock == 0)
27101+ up_write(&mm->mmap_sem);
27102+
27103+ mmput(mm);
27104+ }
27105+
27106+ spin_lock_irqsave(&pfm_res_lock, flags);
27107+
27108+ pfm_res.smpl_buf_mem_cur -= size;
27109+
27110+ spin_unlock_irqrestore(&pfm_res_lock, flags);
27111+}
27112+
27113+/**
27114+ * pfm_session_acquire - reserve a per-thread or per-cpu session
27115+ * @is_system: true if per-cpu session
27116+ * @cpu: cpu number for per-cpu session
27117+ *
27118+ * return:
27119+ * 0 : success
27120+ * -EBUSY: if conflicting session exist
27121+ */
27122+int pfm_session_acquire(int is_system, u32 cpu)
27123+{
27124+ unsigned long flags;
27125+ u32 nsys_cpus;
27126+ int ret = 0;
27127+
27128+ /*
27129+ * validity checks on cpu_mask have been done upstream
27130+ */
27131+ spin_lock_irqsave(&pfm_res_lock, flags);
27132+
27133+ nsys_cpus = cpus_weight(pfm_res.sys_cpumask);
27134+
27135+ PFM_DBG("in sys=%u task=%u is_sys=%d cpu=%u",
27136+ nsys_cpus,
27137+ pfm_res.thread_sessions,
27138+ is_system,
27139+ cpu);
27140+
27141+ if (is_system) {
27142+ /*
27143+ * cannot mix system wide and per-task sessions
27144+ */
27145+ if (pfm_res.thread_sessions > 0) {
27146+ PFM_DBG("%u conflicting thread_sessions",
27147+ pfm_res.thread_sessions);
27148+ ret = -EBUSY;
27149+ goto abort;
27150+ }
27151+
27152+ if (cpu_isset(cpu, pfm_res.sys_cpumask)) {
27153+ PFM_DBG("conflicting session on CPU%u", cpu);
27154+ ret = -EBUSY;
27155+ goto abort;
27156+ }
27157+
27158+ PFM_DBG("reserved session on CPU%u", cpu);
27159+
27160+ cpu_set(cpu, pfm_res.sys_cpumask);
27161+ nsys_cpus++;
27162+ } else {
27163+ if (nsys_cpus) {
27164+ ret = -EBUSY;
27165+ goto abort;
27166+ }
27167+ pfm_res.thread_sessions++;
27168+ }
27169+
27170+ PFM_DBG("out sys=%u task=%u is_sys=%d cpu=%u",
27171+ nsys_cpus,
27172+ pfm_res.thread_sessions,
27173+ is_system,
27174+ cpu);
27175+
27176+abort:
27177+ spin_unlock_irqrestore(&pfm_res_lock, flags);
27178+
27179+ return ret;
27180+}
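The reservation above enforces a simple invariant: per-thread and system-wide sessions are mutually exclusive, and a given CPU carries at most one system-wide session. A compact sketch of just that admission check, with a plain bitmap standing in for the cpumask (illustrative, hypothetical names):

#include <stdbool.h>

struct sessions {
	unsigned int thread_sessions;	/* loaded per-thread sessions */
	unsigned long sys_cpu_bitmap;	/* one bit per CPU with a session */
};

/* true when a new session of the requested kind may be reserved */
static bool session_allowed(const struct sessions *s, bool is_system,
			    unsigned int cpu)
{
	if (is_system)
		return s->thread_sessions == 0 &&
		       !(s->sys_cpu_bitmap & (1UL << cpu));
	return s->sys_cpu_bitmap == 0;
}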
27181+
27182+/**
27183+ * pfm_session_release - release a per-cpu or per-thread session
27184+ * @is_system: true if per-cpu session
27185+ * @cpu: cpu number for per-cpu session
27186+ *
27187+ * called from __pfm_unload_context()
27188+ */
27189+void pfm_session_release(int is_system, u32 cpu)
27190+{
27191+ unsigned long flags;
27192+
27193+ spin_lock_irqsave(&pfm_res_lock, flags);
27194+
27195+ PFM_DBG("in sys_sessions=%u thread_sessions=%u syswide=%d cpu=%u",
27196+ cpus_weight(pfm_res.sys_cpumask),
27197+ pfm_res.thread_sessions,
27198+ is_system, cpu);
27199+
27200+ if (is_system)
27201+ cpu_clear(cpu, pfm_res.sys_cpumask);
27202+ else
27203+ pfm_res.thread_sessions--;
27204+
27205+ PFM_DBG("out sys_sessions=%u thread_sessions=%u syswide=%d cpu=%u",
27206+ cpus_weight(pfm_res.sys_cpumask),
27207+ pfm_res.thread_sessions,
27208+ is_system, cpu);
27209+
27210+ spin_unlock_irqrestore(&pfm_res_lock, flags);
27211+}
27212+
27213+/**
27214+ * pfm_session_allcpus_acquire - acquire per-cpu sessions on all available cpus
27215+ *
27216+ * currently used by Oprofile on X86
27217+ */
27218+int pfm_session_allcpus_acquire(void)
27219+{
27220+ unsigned long flags;
27221+ u32 nsys_cpus, cpu;
27222+ int ret = -EBUSY;
27223+
27224+ spin_lock_irqsave(&pfm_res_lock, flags);
27225+
27226+ nsys_cpus = cpus_weight(pfm_res.sys_cpumask);
27227+
27228+ PFM_DBG("in sys=%u task=%u",
27229+ nsys_cpus,
27230+ pfm_res.thread_sessions);
27231+
27232+ if (nsys_cpus) {
27233+ PFM_DBG("already some system-wide sessions");
27234+ goto abort;
27235+ }
27236+
27237+ /*
27238+ * cannot mix system wide and per-task sessions
27239+ */
27240+ if (pfm_res.thread_sessions) {
27241+ PFM_DBG("%u conflicting thread_sessions",
27242+ pfm_res.thread_sessions);
27243+ goto abort;
27244+ }
27245+
27246+ for_each_online_cpu(cpu) {
27247+ cpu_set(cpu, pfm_res.sys_cpumask);
27248+ nsys_cpus++;
27249+ }
27250+
27251+ PFM_DBG("out sys=%u task=%u",
27252+ nsys_cpus,
27253+ pfm_res.thread_sessions);
27254+
27255+ ret = 0;
27256+abort:
27257+ spin_unlock_irqrestore(&pfm_res_lock, flags);
27258+
27259+ return ret;
27260+}
27261+EXPORT_SYMBOL(pfm_session_allcpus_acquire);
27262+
27263+/**
27264+ * pfm_session_allcpus_release - release per-cpu sessions on all cpus
27265+ *
27266+ * currently used by Oprofile code
27267+ */
27268+void pfm_session_allcpus_release(void)
27269+{
27270+ unsigned long flags;
27271+ u32 nsys_cpus, cpu;
27272+
27273+ spin_lock_irqsave(&pfm_res_lock, flags);
27274+
27275+ nsys_cpus = cpus_weight(pfm_res.sys_cpumask);
27276+
27277+ PFM_DBG("in sys=%u task=%u",
27278+ nsys_cpus,
27279+ pfm_res.thread_sessions);
27280+
27281+ /*
27282+ * XXX: could use __cpus_clear() with nbits
27283+ */
27284+ for_each_online_cpu(cpu) {
27285+ cpu_clear(cpu, pfm_res.sys_cpumask);
27286+ nsys_cpus--;
27287+ }
27288+
27289+ PFM_DBG("out sys=%u task=%u",
27290+ nsys_cpus,
27291+ pfm_res.thread_sessions);
27292+
27293+ spin_unlock_irqrestore(&pfm_res_lock, flags);
27294+}
27295+EXPORT_SYMBOL(pfm_session_allcpus_release);
27296+
27297+/**
27298+ * pfm_sysfs_res_show - return current resource usage for sysfs
27299+ * @buf: buffer to hold string in return
27300+ * @sz: size of buf
27301+ * @what: what to produce
27302+ * what=0 : thread_sessions
27303+ * what=1 : cpus_weight(sys_cpumask)
27304+ * what=2 : smpl_buf_mem_cur
27305+ * what=3 : pmu model name
27306+ *
27307+ * called from perfmon_sysfs.c
27308+ * return number of bytes written into buf (up to sz)
27309+ */
27310+ssize_t pfm_sysfs_res_show(char *buf, size_t sz, int what)
27311+{
27312+ unsigned long flags;
27313+
27314+ spin_lock_irqsave(&pfm_res_lock, flags);
27315+
27316+ switch (what) {
27317+ case 0: snprintf(buf, sz, "%u\n", pfm_res.thread_sessions);
27318+ break;
27319+ case 1: snprintf(buf, sz, "%d\n", cpus_weight(pfm_res.sys_cpumask));
27320+ break;
27321+ case 2: snprintf(buf, sz, "%zu\n", pfm_res.smpl_buf_mem_cur);
27322+ break;
27323+ case 3:
27324+ snprintf(buf, sz, "%s\n",
27325+ pfm_pmu_conf ? pfm_pmu_conf->pmu_name
27326+ : "unknown");
27327+ }
27328+ spin_unlock_irqrestore(&pfm_res_lock, flags);
27329+ return strlen(buf);
27330+}
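+
+/*
+ * Hedged example of a caller: a sysfs attribute "show" handler can
+ * simply forward its page-sized buffer, e.g.
+ *
+ *	return pfm_sysfs_res_show(buf, PAGE_SIZE, 0);
+ *
+ * where what=0 selects thread_sessions. The actual wrapper lives in
+ * perfmon_sysfs.c and is not shown in this file.
+ */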
27331--- /dev/null
27332+++ b/perfmon/perfmon_rw.c
27333@@ -0,0 +1,733 @@
27334+/*
27335+ * perfmon.c: perfmon2 PMC/PMD read/write system calls
27336+ *
27337+ * This file implements the perfmon2 interface which
27338+ * provides access to the hardware performance counters
27339+ * of the host processor.
27340+ *
27341+ * The initial version of perfmon.c was written by
27342+ * Ganesh Venkitachalam, IBM Corp.
27343+ *
27344+ * Then it was modified for perfmon-1.x by Stephane Eranian and
27345+ * David Mosberger, Hewlett Packard Co.
27346+ *
27347+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
27348+ * by Stephane Eranian, Hewlett Packard Co.
27349+ *
27350+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
27351+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
27352+ * David Mosberger-Tang <davidm@hpl.hp.com>
27353+ *
27354+ * More information about perfmon available at:
27355+ * http://perfmon2.sf.net/
27356+ *
27357+ * This program is free software; you can redistribute it and/or
27358+ * modify it under the terms of version 2 of the GNU General Public
27359+ * License as published by the Free Software Foundation.
27360+ *
27361+ * This program is distributed in the hope that it will be useful,
27362+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
27363+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
27364+ * General Public License for more details.
27365+ *
27366+ * You should have received a copy of the GNU General Public License
27367+ * along with this program; if not, write to the Free Software
27368+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
27369+ * 02111-1307 USA
27370+ */
27371+#include <linux/module.h>
27372+#include <linux/kernel.h>
27373+#include <linux/perfmon_kern.h>
27374+#include "perfmon_priv.h"
27375+
27376+#define PFM_REGFL_PMC_ALL (PFM_REGFL_NO_EMUL64)
27377+#define PFM_REGFL_PMD_ALL (PFM_REGFL_RANDOM|PFM_REGFL_OVFL_NOTIFY)
27378+
27379+/**
27380+ * update_used_reg -- update used_pmcs for a single PMD
27381+ * @set: set to update
27382+ * @cnum: new PMD to add
27383+ *
27384+ * This function adds to used_pmcs the PMCs on which PMD cnum depends
27385+ */
27386+static inline void update_used_reg(struct pfm_context *ctx,
27387+ struct pfm_event_set *set, u16 cnum)
27388+{
27389+ bitmap_or(cast_ulp(set->used_pmcs),
27390+ cast_ulp(set->used_pmcs),
27391+ cast_ulp(pfm_pmu_conf->pmd_desc[cnum].dep_pmcs),
27392+ ctx->regs.max_pmc);
27393+}
27394+
27395+/**
27396+ * update_used_pmcs -- update used_pmcs bitmask
27397+ * @set: event set to update
27398+ * @bv: bitmask to inspect for new PMD registers
27399+ *
27400+ * This function updates the used_pmcs bitmask for
27401+ * the set using bv, a bitmask of pmds. For each pmd in bv,
27402+ * its depending pmcs are added to used_pmcs.
27403+ */
27404+static void update_used_pmcs(struct pfm_context *ctx,
27405+ struct pfm_event_set *set, unsigned long *bv)
27406+{
27407+ u16 max_pmd;
27408+ int n, p, q;
27409+
27410+ max_pmd = ctx->regs.max_pmd;
27411+
27412+ n = bitmap_weight(bv, max_pmd);
27413+ for(p = 0; n; n--, p = q+1) {
27414+ q = find_next_bit(bv, max_pmd, p);
27415+ update_used_reg(ctx, set, q);
27416+ }
27417+}
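+
+/*
+ * Note on the bitmap_weight()/find_next_bit() loop used above (and in
+ * several functions below): the weight bounds the number of iterations
+ * and find_next_bit() walks the set bits, so the body runs exactly once
+ * per PMD present in @bv. A minimal equivalent sketch:
+ *
+ *	for (q = find_first_bit(bv, max_pmd);
+ *	     q < max_pmd;
+ *	     q = find_next_bit(bv, max_pmd, q + 1))
+ *		update_used_reg(ctx, set, q);
+ */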
27418+
27419+/**
27420+ * update_changes -- update nused_pmcs, nused_pmds, write newly touched pmcs
27421+ * @ctx: context to use
27422+ * @set: event set to use
27423+ * @old_used_pmcs: former used_pmcs bitmask
27424+ *
27425+ * This function updates nused_pmcs and nused_pmds after the last modification
27427+ * to an event set. When new pmcs are used, then they must be initialized such
27428+ * that we do not pick up stale values from another session.
27429+ */
27430+static inline int update_changes(struct pfm_context *ctx, struct pfm_event_set *set,
27431+ unsigned long *old_used_pmcs)
27432+{
27433+ struct pfarg_pmc req;
27434+ u16 max_pmc, max_pmd;
27435+ int n, p, q, ret = 0;
27436+
27437+ max_pmd = ctx->regs.max_pmd;
27438+ max_pmc = ctx->regs.max_pmc;
27439+
27440+ /*
27441+ * update used counts
27442+ */
27443+ set->nused_pmds = bitmap_weight(cast_ulp(set->used_pmds), max_pmd);
27444+ set->nused_pmcs = bitmap_weight(cast_ulp(set->used_pmcs), max_pmc);
27445+
27446+ PFM_DBG("set%u u_pmds=0x%llx nu_pmds=%u u_pmcs=0x%llx nu_pmcs=%u",
27447+ set->id,
27448+ (unsigned long long)set->used_pmds[0],
27449+ set->nused_pmds,
27450+ (unsigned long long)set->used_pmcs[0],
27451+ set->nused_pmcs);
27452+
27453+ memset(&req, 0, sizeof(req));
27454+
27455+ n = bitmap_weight(cast_ulp(set->used_pmcs), max_pmc);
27456+ for(p = 0; n; n--, p = q+1) {
27457+ q = find_next_bit(cast_ulp(set->used_pmcs), max_pmc, p);
27458+
27459+ if (test_bit(q, cast_ulp(old_used_pmcs)))
27460+ continue;
27461+
27462+ req.reg_num = q;
27463+ req.reg_value = set->pmcs[q];
27464+
27465+ ret = __pfm_write_pmcs(ctx, &req, 1);
27466+ if (ret)
27467+ break;
27468+ }
27469+ return ret;
27470+}
27471+
27472+/**
27473+ * handle_smpl_bv - checks sampling bitmasks for new PMDs
27474+ * @ctx: context to use
27475+ * @set: set to use
27476+ * @bv: sampling bitmask
27477+ *
27478+ * scans the smpl bitmask looking for new PMDs (not yet used), if found
27479+ * invoke pfm_write_pmds() on them to get them initialized and marked used
27480+ */
27481+static int handle_smpl_bv(struct pfm_context *ctx, struct pfm_event_set *set,
27482+ unsigned long *bv)
27483+{
27484+ struct pfarg_pmd req;
27485+ int p, q, n, ret = 0;
27486+ u16 max_pmd;
27487+
27488+ memset(&req, 0, sizeof(req));
27489+
27490+ max_pmd = ctx->regs.max_pmd;
27491+
27492+ n = bitmap_weight(cast_ulp(bv), max_pmd);
27493+
27494+ for(p = 0; n; n--, p = q+1) {
27495+ q = find_next_bit(cast_ulp(bv), max_pmd, p);
27496+
27497+ if (test_bit(q, cast_ulp(set->used_pmds)))
27498+ continue;
27499+
27500+ req.reg_num = q;
27501+ req.reg_value = 0;
27502+
27503+ ret = __pfm_write_pmds(ctx, &req, 1, 0);
27504+ if (ret)
27505+ break;
27506+ }
27507+ return ret;
27508+}
27509+
27510+/**
27511+ * is_invalid -- check if register index is within limits
27512+ * @cnum: register index
27513+ * @impl: bitmask of implemented registers
27514+ * @max: highest implemented register index + 1
27515+ *
27516+ * return:
27517+ * 0 if register index is valid
27518+ * 1 if invalid
27519+ */
27520+static inline int is_invalid(u16 cnum, unsigned long *impl, u16 max)
27521+{
27522+ return cnum >= max || !test_bit(cnum, impl);
27523+}
27524+
27525+/**
27526+ * __pfm_write_pmds - modify data registers
27527+ * @ctx: context to operate on
27528+ * @req: pfarg_pmd_t request from user
27529+ * @count: number of elements in the pfarg_pmd_t vector
27530+ * @compat: used only on IA-64 to maintain backward compatibility with v2.0
27531+ *
27532+ * The function succeeds whether the context is attached or not.
27533+ * When attached to another thread, that thread must be stopped.
27534+ *
27535+ * The context is locked and interrupts are disabled.
27536+ */
27537+int __pfm_write_pmds(struct pfm_context *ctx, struct pfarg_pmd *req, int count,
27538+ int compat)
27539+{
27540+ struct pfm_event_set *set, *active_set;
27541+ u64 old_used_pmcs[PFM_PMC_BV];
27542+ unsigned long *smpl_pmds, *reset_pmds, *impl_pmds, *impl_rw_pmds;
27543+ u32 req_flags, flags;
27544+ u16 cnum, pmd_type, max_pmd;
27545+ u16 set_id;
27546+ int i, can_access_pmu;
27547+ int ret;
27548+ pfm_pmd_check_t wr_func;
27549+
27550+ active_set = ctx->active_set;
27551+ max_pmd = ctx->regs.max_pmd;
27552+ impl_pmds = cast_ulp(ctx->regs.pmds);
27553+ impl_rw_pmds = cast_ulp(ctx->regs.rw_pmds);
27554+ wr_func = pfm_pmu_conf->pmd_write_check;
27555+ set = list_first_entry(&ctx->set_list, struct pfm_event_set, list);
27556+
27557+ can_access_pmu = 0;
27558+
27559+ /*
27560+ * we cannot access the actual PMD registers when monitoring is masked
27561+ */
27562+ if (unlikely(ctx->state == PFM_CTX_LOADED))
27563+ can_access_pmu = __get_cpu_var(pmu_owner) == ctx->task
27564+ || ctx->flags.system;
27565+
27566+ bitmap_copy(cast_ulp(old_used_pmcs),
27567+ cast_ulp(set->used_pmcs),
27568+ ctx->regs.max_pmc);
27569+
27570+ ret = -EINVAL;
27571+ for (i = 0; i < count; i++, req++) {
27572+
27573+ cnum = req->reg_num;
27574+ set_id = req->reg_set;
27575+ req_flags = req->reg_flags;
27576+ smpl_pmds = cast_ulp(req->reg_smpl_pmds);
27577+ reset_pmds = cast_ulp(req->reg_reset_pmds);
27578+ flags = 0;
27579+
27580+ /*
27581+ * cannot write to non-existing registers;
27582+ * writes to read-only registers are ignored
27583+ */
27584+ if (unlikely(is_invalid(cnum, impl_pmds, max_pmd))) {
27585+ PFM_DBG("pmd%u is not available", cnum);
27586+ goto error;
27587+ }
27588+
27589+ pmd_type = pfm_pmu_conf->pmd_desc[cnum].type;
27590+
27591+ /*
27592+ * ensure only valid flags are set
27593+ */
27594+ if (req_flags & ~(PFM_REGFL_PMD_ALL)) {
27595+ PFM_DBG("pmd%u: invalid flags=0x%x",
27596+ cnum, req_flags);
27597+ goto error;
27598+ }
27599+
27600+ /*
27601+ * OVFL_NOTIFY is valid for all types of PMD.
27602+ * A non-counting PMD may trigger a PMU interrupt
27603+ * and thus may trigger the recording of a sample.
27604+ * This is true with IBS on AMD family 16.
27605+ */
27606+ if (req_flags & PFM_REGFL_OVFL_NOTIFY)
27607+ flags |= PFM_REGFL_OVFL_NOTIFY;
27608+
27609+ /*
27610+ * We allow randomization on non-counting PMDs
27611+ */
27612+ if (req_flags & PFM_REGFL_RANDOM)
27613+ flags |= PFM_REGFL_RANDOM;
27614+
27615+ /*
27616+ * verify validity of smpl_pmds
27617+ */
27618+ if (unlikely(!bitmap_subset(smpl_pmds, impl_pmds, PFM_MAX_PMDS))) {
27619+ PFM_DBG("invalid smpl_pmds=0x%llx for pmd%u",
27620+ (unsigned long long)req->reg_smpl_pmds[0],
27621+ cnum);
27622+ goto error;
27623+ }
27624+
27625+ /*
27626+ * verify validity of reset_pmds
27627+ * check against impl_rw_pmds because it is not
27628+ * possible to reset read-only PMDs
27629+ */
27630+ if (unlikely(!bitmap_subset(reset_pmds, impl_rw_pmds, PFM_MAX_PMDS))) {
27631+ PFM_DBG("invalid reset_pmds=0x%llx for pmd%u",
27632+ (unsigned long long)req->reg_reset_pmds[0],
27633+ cnum);
27634+ goto error;
27635+ }
27636+
27637+ /*
27638+ * locate event set
27639+ */
27640+ if (set_id != set->id) {
27641+ /* update number of used register for previous set */
27642+ if (i) {
27643+ ret = update_changes(ctx, set, cast_ulp(old_used_pmcs));
27644+ if (ret)
27645+ goto error;
27646+ }
27647+
27648+ set = pfm_find_set(ctx, set_id, 0);
27649+ if (set == NULL) {
27650+ PFM_DBG("event set%u does not exist",
27651+ set_id);
27652+ goto error;
27653+ }
27654+ bitmap_copy(cast_ulp(old_used_pmcs),
27655+ cast_ulp(set->used_pmcs),
27656+ ctx->regs.max_pmc);
27657+ }
27658+
27659+ /*
27660+ * execute write checker, if any
27661+ */
27662+ if (unlikely(wr_func && (pmd_type & PFM_REG_WC))) {
27663+ ret = (*wr_func)(ctx, set, req);
27664+ if (ret)
27665+ goto error;
27666+
27667+ }
27668+
27669+
27670+ /*
27671+ * now commit changes to software state
27672+ */
27673+
27674+ if (unlikely(compat))
27675+ goto skip_set;
27676+
27677+ if (bitmap_weight(smpl_pmds, max_pmd)) {
27678+ ret = handle_smpl_bv(ctx, set, smpl_pmds);
27679+ if (ret)
27680+ goto error;
27681+ update_used_pmcs(ctx, set, cast_ulp(smpl_pmds));
27682+ }
27683+
27684+ bitmap_copy(cast_ulp(set->pmds[cnum].smpl_pmds),
27685+ smpl_pmds,
27686+ max_pmd);
27687+
27688+
27689+ if (bitmap_weight(reset_pmds, max_pmd)) {
27690+ ret = handle_smpl_bv(ctx, set, reset_pmds);
27691+ if (ret)
27692+ goto error;
27693+ update_used_pmcs(ctx, set, cast_ulp(reset_pmds));
27694+ }
27695+
27696+ bitmap_copy(cast_ulp(set->pmds[cnum].reset_pmds),
27697+ reset_pmds,
27698+ max_pmd);
27699+
27700+ set->pmds[cnum].flags = flags;
27701+
27702+ __set_bit(cnum, cast_ulp(set->used_pmds));
27703+ update_used_reg(ctx, set, cnum);
27704+
27705+ /*
27706+ * we reprogram the PMD, hence we clear any pending
27707+ * ovfl. This does affect ovfl switching on restart, but the new
27708+ * value has already been established here
27709+ */
27710+ if (test_bit(cnum, cast_ulp(set->povfl_pmds))) {
27711+ set->npend_ovfls--;
27712+ __clear_bit(cnum, cast_ulp(set->povfl_pmds));
27713+ }
27714+ __clear_bit(cnum, cast_ulp(set->ovfl_pmds));
27715+
27716+ /*
27717+ * update ovfl_notify
27718+ */
27719+ if (flags & PFM_REGFL_OVFL_NOTIFY)
27720+ __set_bit(cnum, cast_ulp(set->ovfl_notify));
27721+ else
27722+ __clear_bit(cnum, cast_ulp(set->ovfl_notify));
27723+
27724+ /*
27725+ * establish new switch count
27726+ */
27727+ set->pmds[cnum].ovflsw_thres = req->reg_ovfl_switch_cnt;
27728+ set->pmds[cnum].ovflsw_ref_thres = req->reg_ovfl_switch_cnt;
27729+skip_set:
27730+
27731+ /*
27732+ * set last value to new value for all types of PMD
27733+ */
27734+ set->pmds[cnum].lval = req->reg_value;
27735+ set->pmds[cnum].value = req->reg_value;
27736+
27737+ /*
27738+ * update reset values (not just for counters)
27739+ */
27740+ set->pmds[cnum].long_reset = req->reg_long_reset;
27741+ set->pmds[cnum].short_reset = req->reg_short_reset;
27742+
27743+ /*
27744+ * update randomization mask
27745+ */
27746+ set->pmds[cnum].mask = req->reg_random_mask;
27747+
27748+ set->pmds[cnum].eventid = req->reg_smpl_eventid;
27749+
27750+ if (set == active_set) {
27751+ set->priv_flags |= PFM_SETFL_PRIV_MOD_PMDS;
27752+ if (can_access_pmu)
27753+ pfm_write_pmd(ctx, cnum, req->reg_value);
27754+ }
27755+
27756+
27757+ PFM_DBG("set%u pmd%u=0x%llx flags=0x%x a_pmu=%d "
27758+ "ctx_pmd=0x%llx s_reset=0x%llx "
27759+ "l_reset=0x%llx s_pmds=0x%llx "
27760+ "r_pmds=0x%llx o_pmds=0x%llx "
27761+ "o_thres=%llu compat=%d eventid=%llx",
27762+ set->id,
27763+ cnum,
27764+ (unsigned long long)req->reg_value,
27765+ set->pmds[cnum].flags,
27766+ can_access_pmu,
27767+ (unsigned long long)set->pmds[cnum].value,
27768+ (unsigned long long)set->pmds[cnum].short_reset,
27769+ (unsigned long long)set->pmds[cnum].long_reset,
27770+ (unsigned long long)set->pmds[cnum].smpl_pmds[0],
27771+ (unsigned long long)set->pmds[cnum].reset_pmds[0],
27772+ (unsigned long long)set->ovfl_pmds[0],
27773+ (unsigned long long)set->pmds[cnum].ovflsw_thres,
27774+ compat,
27775+ (unsigned long long)set->pmds[cnum].eventid);
27776+ }
27777+ ret = 0;
27778+
27779+error:
27780+ update_changes(ctx, set, cast_ulp(old_used_pmcs));
27781+
27782+ /*
27783+ * make changes visible
27784+ */
27785+ if (can_access_pmu)
27786+ pfm_arch_serialize();
27787+
27788+ return ret;
27789+}
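+
+/*
+ * Hedged user-level sketch of a request that ends up in
+ * __pfm_write_pmds(): program one counting PMD with a sampling period
+ * of 100000 events and ask for overflow notification. Field names
+ * follow struct pfarg_pmd; the file descriptor and syscall wrapper
+ * come from the perfmon2 userland and are assumed here.
+ *
+ *	struct pfarg_pmd pd;
+ *
+ *	memset(&pd, 0, sizeof(pd));
+ *	pd.reg_num   = 0;
+ *	pd.reg_set   = 0;
+ *	pd.reg_value = (u64)-100000;	// overflows after 100000 increments
+ *	pd.reg_flags = PFM_REGFL_OVFL_NOTIFY;
+ *	pfm_write_pmds(fd, &pd, 1);
+ */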
27790+
27791+/**
27792+ * __pfm_write_pmcs - modify config registers
27793+ * @ctx: context to operate on
27794+ * @req: pfarg_pmc_t request from user
27795+ * @count: number of elements in the pfarg_pmc_t vector
27796+ *
27797+ *
27798+ * The function succeeds whether the context is attached or not.
27799+ * When attached to another thread, that thread must be stopped.
27800+ *
27801+ * The context is locked and interrupts are disabled.
27802+ */
27803+int __pfm_write_pmcs(struct pfm_context *ctx, struct pfarg_pmc *req, int count)
27804+{
27805+ struct pfm_event_set *set, *active_set;
27806+ u64 value, dfl_val, rsvd_msk;
27807+ unsigned long *impl_pmcs;
27808+ int i, can_access_pmu;
27809+ int ret;
27810+ u16 set_id;
27811+ u16 cnum, pmc_type, max_pmc;
27812+ u32 flags, expert;
27813+ pfm_pmc_check_t wr_func;
27814+
27815+ active_set = ctx->active_set;
27816+
27817+ wr_func = pfm_pmu_conf->pmc_write_check;
27818+ max_pmc = ctx->regs.max_pmc;
27819+ impl_pmcs = cast_ulp(ctx->regs.pmcs);
27820+ set = list_first_entry(&ctx->set_list, struct pfm_event_set, list);
27821+
27822+ expert = pfm_controls.flags & PFM_CTRL_FL_RW_EXPERT;
27823+
27824+ can_access_pmu = 0;
27825+
27826+ /*
27827+ * we cannot access the actual PMC registers when monitoring is masked
27828+ */
27829+ if (unlikely(ctx->state == PFM_CTX_LOADED))
27830+ can_access_pmu = __get_cpu_var(pmu_owner) == ctx->task
27831+ || ctx->flags.system;
27832+
27833+ ret = -EINVAL;
27834+
27835+ for (i = 0; i < count; i++, req++) {
27836+
27837+ cnum = req->reg_num;
27838+ set_id = req->reg_set;
27839+ value = req->reg_value;
27840+ flags = req->reg_flags;
27841+
27842+ /*
27843+ * no access to unavailable PMC register
27844+ */
27845+ if (unlikely(is_invalid(cnum, impl_pmcs, max_pmc))) {
27846+ PFM_DBG("pmc%u is not available", cnum);
27847+ goto error;
27848+ }
27849+
27850+ pmc_type = pfm_pmu_conf->pmc_desc[cnum].type;
27851+ dfl_val = pfm_pmu_conf->pmc_desc[cnum].dfl_val;
27852+ rsvd_msk = pfm_pmu_conf->pmc_desc[cnum].rsvd_msk;
27853+
27854+ /*
27855+ * ensure only valid flags are set
27856+ */
27857+ if (flags & ~PFM_REGFL_PMC_ALL) {
27858+ PFM_DBG("pmc%u: invalid flags=0x%x", cnum, flags);
27859+ goto error;
27860+ }
27861+
27862+ /*
27863+ * locate event set
27864+ */
27865+ if (set_id != set->id) {
27866+ set = pfm_find_set(ctx, set_id, 0);
27867+ if (set == NULL) {
27868+ PFM_DBG("event set%u does not exist",
27869+ set_id);
27870+ goto error;
27871+ }
27872+ }
27873+
27874+ /*
27875+ * set reserved bits to default values
27876+ * (reserved bits must be 1 in rsvd_msk)
27877+ *
27878+ * bypass via /sys/kernel/perfmon/mode = 1
27879+ */
27880+ if (likely(!expert))
27881+ value = (value & ~rsvd_msk) | (dfl_val & rsvd_msk);
27882+
27883+ if (flags & PFM_REGFL_NO_EMUL64) {
27884+ if (!(pmc_type & PFM_REG_NO64)) {
27885+ PFM_DBG("pmc%u no support for "
27886+ "PFM_REGFL_NO_EMUL64", cnum);
27887+ goto error;
27888+ }
27889+ value &= ~pfm_pmu_conf->pmc_desc[cnum].no_emul64_msk;
27890+ }
27891+
27892+ /*
27893+ * execute write checker, if any
27894+ */
27895+ if (likely(wr_func && (pmc_type & PFM_REG_WC))) {
27896+ req->reg_value = value;
27897+ ret = (*wr_func)(ctx, set, req);
27898+ if (ret)
27899+ goto error;
27900+ value = req->reg_value;
27901+ }
27902+
27903+ /*
27904+ * Now we commit the changes
27905+ */
27906+
27907+ /*
27908+ * mark PMC register as used
27909+ * We do not track associated PMC register based on
27910+ * the fact that they will likely need to be written
27911+ * in order to become useful at which point the statement
27912+ * below will catch that.
27913+ *
27914+ * The used_pmcs bitmask is only useful on architectures where
27915+ * the PMC needs to be modified for particular bits, especially
27916+ * on overflow or to stop/start.
27917+ */
27918+ if (!test_bit(cnum, cast_ulp(set->used_pmcs))) {
27919+ __set_bit(cnum, cast_ulp(set->used_pmcs));
27920+ set->nused_pmcs++;
27921+ }
27922+
27923+ set->pmcs[cnum] = value;
27924+
27925+ if (set == active_set) {
27926+ set->priv_flags |= PFM_SETFL_PRIV_MOD_PMCS;
27927+ if (can_access_pmu)
27928+ pfm_arch_write_pmc(ctx, cnum, value);
27929+ }
27930+
27931+ PFM_DBG("set%u pmc%u=0x%llx a_pmu=%d "
27932+ "u_pmcs=0x%llx nu_pmcs=%u",
27933+ set->id,
27934+ cnum,
27935+ (unsigned long long)value,
27936+ can_access_pmu,
27937+ (unsigned long long)set->used_pmcs[0],
27938+ set->nused_pmcs);
27939+ }
27940+ ret = 0;
27941+error:
27942+ /*
27943+ * make sure the changes are visible
27944+ */
27945+ if (can_access_pmu)
27946+ pfm_arch_serialize();
27947+
27948+ return ret;
27949+}
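+
+/*
+ * Worked example of the reserved-bit handling above (all values are
+ * illustrative, not taken from a real PMU description table):
+ *
+ *	dfl_val  = 0x0000000000000003	default, reserved bits at their safe value
+ *	rsvd_msk = 0xffffffff00000003	1 = reserved bit
+ *	value    = 0x000000000053ff00	user-supplied event encoding
+ *
+ *	(value & ~rsvd_msk) | (dfl_val & rsvd_msk) == 0x000000000053ff03
+ *
+ * user bits survive only where rsvd_msk is 0; reserved bits are forced
+ * back to their defaults unless the sysfs expert-mode bypass is set.
+ */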
27950+
27951+/**
27952+ * __pfm_read_pmds - read data registers
27953+ * @ctx: context to operate on
27954+ * @req: pfarg_pmd_t request from user
27955+ * @count: number of elements in the pfarg_pmd_t vector
27956+ *
27957+ *
27958+ * The function succeeds whether the context is attached or not.
27959+ * When attached to another thread, that thread must be stopped.
27960+ *
27961+ * The context is locked and interrupts are disabled.
27962+ */
27963+int __pfm_read_pmds(struct pfm_context *ctx, struct pfarg_pmd *req, int count)
27964+{
27965+ u64 val = 0, lval, ovfl_mask, hw_val;
27966+ u64 sw_cnt;
27967+ unsigned long *impl_pmds;
27968+ struct pfm_event_set *set, *active_set;
27969+ int i, ret, can_access_pmu = 0;
27970+ u16 cnum, pmd_type, set_id, max_pmd;
27971+
27972+ ovfl_mask = pfm_pmu_conf->ovfl_mask;
27973+ impl_pmds = cast_ulp(ctx->regs.pmds);
27974+ max_pmd = ctx->regs.max_pmd;
27975+ active_set = ctx->active_set;
27976+ set = list_first_entry(&ctx->set_list, struct pfm_event_set, list);
27977+
27978+ if (likely(ctx->state == PFM_CTX_LOADED)) {
27979+ can_access_pmu = __get_cpu_var(pmu_owner) == ctx->task
27980+ || ctx->flags.system;
27981+
27982+ if (can_access_pmu)
27983+ pfm_arch_serialize();
27984+ }
27985+
27986+ /*
27987+ * on both UP and SMP, we can only read the PMD from the hardware
27988+ * register when the task is the owner of the local PMU.
27989+ */
27990+ ret = -EINVAL;
27991+ for (i = 0; i < count; i++, req++) {
27992+
27993+ cnum = req->reg_num;
27994+ set_id = req->reg_set;
27995+
27996+ if (unlikely(is_invalid(cnum, impl_pmds, max_pmd))) {
27997+ PFM_DBG("pmd%u is not implemented/unaccessible", cnum);
27998+ goto error;
27999+ }
28000+
28001+ pmd_type = pfm_pmu_conf->pmd_desc[cnum].type;
28002+
28003+ /*
28004+ * locate event set
28005+ */
28006+ if (set_id != set->id) {
28007+ set = pfm_find_set(ctx, set_id, 0);
28008+ if (set == NULL) {
28009+ PFM_DBG("event set%u does not exist",
28010+ set_id);
28011+ goto error;
28012+ }
28013+ }
28014+ /*
28015+ * it is not possible to read a PMD which was not requested:
28016+ * - explicitly written via pfm_write_pmds()
28017+ * - provided as a reg_smpl_pmds[] to another PMD during
28018+ * pfm_write_pmds()
28019+ *
28020+ * This is motivated by security and for optimization purposes:
28021+ * - on context switch restore, we can restore only what
28022+ * we use (except when regs directly readable at user
28023+ * level, e.g., IA-64 self-monitoring, I386 RDPMC).
28024+ * - do not need to maintain PMC -> PMD dependencies
28025+ */
28026+ if (unlikely(!test_bit(cnum, cast_ulp(set->used_pmds)))) {
28027+ PFM_DBG("pmd%u cannot read, because not used", cnum);
28028+ goto error;
28029+ }
28030+
28031+ val = set->pmds[cnum].value;
28032+ lval = set->pmds[cnum].lval;
28033+
28034+ /*
28035+ * extract remaining ovfl to switch
28036+ */
28037+ sw_cnt = set->pmds[cnum].ovflsw_thres;
28038+
28039+ /*
28040+ * If the task is not the current one, then we check if the
28041+ * PMU state is still in the local live register due to lazy
28042+ * ctxsw. If true, then we read directly from the registers.
28043+ */
28044+ if (set == active_set && can_access_pmu) {
28045+ hw_val = pfm_read_pmd(ctx, cnum);
28046+ if (pmd_type & PFM_REG_C64)
28047+ val = (val & ~ovfl_mask) | (hw_val & ovfl_mask);
28048+ else
28049+ val = hw_val;
28050+ }
28051+
28052+ PFM_DBG("set%u pmd%u=0x%llx sw_thr=%llu lval=0x%llx",
28053+ set->id,
28054+ cnum,
28055+ (unsigned long long)val,
28056+ (unsigned long long)sw_cnt,
28057+ (unsigned long long)lval);
28058+
28059+ req->reg_value = val;
28060+ req->reg_last_reset_val = lval;
28061+ req->reg_ovfl_switch_cnt = sw_cnt;
28062+ }
28063+ ret = 0;
28064+error:
28065+ return ret;
28066+}
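+
+/*
+ * Worked example of the 64-bit emulation merge above, assuming a PMU
+ * with 48-bit counters, i.e. ovfl_mask = 0x0000ffffffffffff (the value
+ * is illustrative):
+ *
+ *	val    (64-bit software view) = 0x0001000000001000
+ *	hw_val (48-bit live register) = 0x0000000000002345
+ *
+ *	(val & ~ovfl_mask) | (hw_val & ovfl_mask) = 0x0001000000002345
+ *
+ * the software-maintained upper bits are preserved while the low bits
+ * come from the hardware register.
+ */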
28067--- /dev/null
28068+++ b/perfmon/perfmon_sets.c
28069@@ -0,0 +1,873 @@
28070+/*
28071+ * perfmon_sets.c: perfmon2 event sets and multiplexing functions
28072+ *
28073+ * This file implements the perfmon2 interface which
28074+ * provides access to the hardware performance counters
28075+ * of the host processor.
28076+ *
28077+ * The initial version of perfmon.c was written by
28078+ * Ganesh Venkitachalam, IBM Corp.
28079+ *
28080+ * Then it was modified for perfmon-1.x by Stephane Eranian and
28081+ * David Mosberger, Hewlett Packard Co.
28082+ *
28083+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
28084+ * by Stephane Eranian, Hewlett Packard Co.
28085+ *
28086+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
28087+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
28088+ * David Mosberger-Tang <davidm@hpl.hp.com>
28089+ *
28090+ * More information about perfmon available at:
28091+ * http://perfmon2.sf.net
28092+ *
28093+ * This program is free software; you can redistribute it and/or
28094+ * modify it under the terms of version 2 of the GNU General Public
28095+ * License as published by the Free Software Foundation.
28096+ *
28097+ * This program is distributed in the hope that it will be useful,
28098+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
28099+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
28100+ * General Public License for more details.
28101+ *
28102+ * You should have received a copy of the GNU General Public License
28103+ * along with this program; if not, write to the Free Software
28104+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
28105+ * 02111-1307 USA
28106+ */
28107+#include <linux/kernel.h>
28108+#include <linux/perfmon_kern.h>
28109+#include "perfmon_priv.h"
28110+
28111+static struct kmem_cache *pfm_set_cachep;
28112+
28113+/**
28114+ * pfm_reload_switch_thresholds - reload overflow-based switch thresholds per set
28115+ * @ctx: context the set belongs to
28116+ * @set: the set for which to reload thresholds
28117+ */
28118+static void pfm_reload_switch_thresholds(struct pfm_context *ctx,
28119+ struct pfm_event_set *set)
28120+{
28121+ u64 *used_pmds;
28122+ u16 i, max, first;
28123+
28124+ used_pmds = set->used_pmds;
28125+ first = ctx->regs.first_intr_pmd;
28126+ max = ctx->regs.max_intr_pmd;
28127+
28128+ for (i = first; i < max; i++) {
28129+ if (test_bit(i, cast_ulp(used_pmds))) {
28130+ set->pmds[i].ovflsw_thres = set->pmds[i].ovflsw_ref_thres;
28131+
28132+ PFM_DBG("set%u pmd%u ovflsw_thres=%llu",
28133+ set->id,
28134+ i,
28135+ (unsigned long long)set->pmds[i].ovflsw_thres);
28136+ }
28137+ }
28138+}
28139+
28140+/**
28141+ * pfm_prepare_sets - initialize sets on pfm_load_context
28142+ * @ctx : context to operate on
28143+ * @load_set: set to activate first
28144+ *
28145+ * connect all sets, reset internal fields
28146+ */
28147+struct pfm_event_set *pfm_prepare_sets(struct pfm_context *ctx, u16 load_set)
28148+{
28149+ struct pfm_event_set *set, *p;
28150+ u16 max;
28151+
28152+ /*
28153+ * locate first set to activate
28154+ */
28155+ set = pfm_find_set(ctx, load_set, 0);
28156+ if (!set)
28157+ return NULL;
28158+
28159+ if (set->flags & PFM_SETFL_OVFL_SWITCH)
28160+ pfm_reload_switch_thresholds(ctx, set);
28161+
28162+ max = ctx->regs.max_intr_pmd;
28163+
28164+ list_for_each_entry(p, &ctx->set_list, list) {
28165+ /*
28166+ * cleanup bitvectors
28167+ */
28168+ bitmap_zero(cast_ulp(p->ovfl_pmds), max);
28169+ bitmap_zero(cast_ulp(p->povfl_pmds), max);
28170+
28171+ p->npend_ovfls = 0;
28172+
28173+ /*
28174+ * we cannot just use plain clear because of arch-specific flags
28175+ */
28176+ p->priv_flags &= ~(PFM_SETFL_PRIV_MOD_BOTH|PFM_SETFL_PRIV_SWITCH);
28177+ /*
28178+ * neither duration nor runs are reset because typically loading/unloading
28179+ * does not mean counts are reset. To reset, the set must be modified
28180+ */
28181+ }
28182+ return set;
28183+}
28184+
28185+/*
28186+ * called by hrtimer_interrupt()
28187+ *
28188+ * This is the only function where we come with
28189+ * cpu_base->lock held before ctx->lock
28190+ *
28191+ * interrupts are disabled
28192+ */
28193+enum hrtimer_restart pfm_handle_switch_timeout(struct hrtimer *t)
28194+{
28195+ struct pfm_event_set *set;
28196+ struct pfm_context *ctx;
28197+ unsigned long flags;
28198+ enum hrtimer_restart ret = HRTIMER_NORESTART;
28199+
28200+ /*
28201+ * prevent against race with unload
28202+ */
28203+ ctx = __get_cpu_var(pmu_ctx);
28204+ if (!ctx)
28205+ return HRTIMER_NORESTART;
28206+
28207+ spin_lock_irqsave(&ctx->lock, flags);
28208+
28209+ set = ctx->active_set;
28210+
28211+ /*
28212+ * switching occurs only when context is attached
28213+ */
28214+ if (ctx->state != PFM_CTX_LOADED)
28215+ goto done;
28216+ /*
28217+ * timer does not run while monitoring is inactive (not started)
28218+ */
28219+ if (!pfm_arch_is_active(ctx))
28220+ goto done;
28221+
28222+ pfm_stats_inc(handle_timeout_count);
28223+
28224+ ret = pfm_switch_sets(ctx, NULL, PFM_PMD_RESET_SHORT, 0);
28225+done:
28226+ spin_unlock_irqrestore(&ctx->lock, flags);
28227+ return ret;
28228+}
28229+
28230+/*
28231+ *
28232+ * always operating on the current task
28233+ * interrupts are masked
28234+ *
28235+ * input:
28236+ * - new_set: new set to switch to, if NULL follow normal chain
28237+ */
28238+enum hrtimer_restart pfm_switch_sets(struct pfm_context *ctx,
28239+ struct pfm_event_set *new_set,
28240+ int reset_mode,
28241+ int no_restart)
28242+{
28243+ struct pfm_event_set *set;
28244+ u64 now, end;
28245+ u32 new_flags;
28246+ int is_system, is_active, nn;
28247+ enum hrtimer_restart ret = HRTIMER_NORESTART;
28248+
28249+ now = sched_clock();
28250+ set = ctx->active_set;
28251+ is_active = pfm_arch_is_active(ctx);
28252+
28253+ /*
28254+ * if no set is explicitly requested,
28255+ * use the set_switch_next field
28256+ */
28257+ if (!new_set) {
28258+ /*
28259+ * we use round-robin unless the user specified
28260+ * a particular set to go to.
28261+ */
28262+ new_set = list_first_entry(&set->list, struct pfm_event_set, list);
28263+ if (&new_set->list == &ctx->set_list)
28264+ new_set = list_first_entry(&ctx->set_list, struct pfm_event_set, list);
28265+ }
28266+
28267+ PFM_DBG_ovfl("state=%d act=%d cur_set=%u cur_runs=%llu cur_npend=%d next_set=%u "
28268+ "next_runs=%llu new_npend=%d reset_mode=%d reset_pmds=%llx",
28269+ ctx->state,
28270+ is_active,
28271+ set->id,
28272+ (unsigned long long)set->runs,
28273+ set->npend_ovfls,
28274+ new_set->id,
28275+ (unsigned long long)new_set->runs,
28276+ new_set->npend_ovfls,
28277+ reset_mode,
28278+ (unsigned long long)new_set->reset_pmds[0]);
28279+
28280+ is_system = ctx->flags.system;
28281+ new_flags = new_set->flags;
28282+
28283+ /*
28284+ * nothing more to do
28285+ */
28286+ if (new_set == set)
28287+ goto skip_same_set;
28288+
28289+ if (is_active) {
28290+ pfm_arch_stop(current, ctx);
28291+ pfm_save_pmds(ctx, set);
28292+ /*
28293+ * compute elapsed ns for active set
28294+ */
28295+ set->duration += now - set->duration_start;
28296+ }
28297+
28298+ pfm_arch_restore_pmds(ctx, new_set);
28299+ /*
28300+ * if masked, we must restore the pmcs such that they
28301+ * do not capture anything.
28302+ */
28303+ pfm_arch_restore_pmcs(ctx, new_set);
28304+
28305+ if (new_set->npend_ovfls) {
28306+ pfm_arch_resend_irq(ctx);
28307+ pfm_stats_inc(ovfl_intr_replay_count);
28308+ }
28309+
28310+ new_set->priv_flags &= ~PFM_SETFL_PRIV_MOD_BOTH;
28311+
28312+skip_same_set:
28313+ new_set->runs++;
28314+ /*
28315+ * reset switch threshold
28316+ */
28317+ if (new_flags & PFM_SETFL_OVFL_SWITCH)
28318+ pfm_reload_switch_thresholds(ctx, new_set);
28319+
28320+ /*
28321+ * reset overflowed PMD registers in new set
28322+ */
28323+ nn = bitmap_weight(cast_ulp(new_set->reset_pmds), ctx->regs.max_pmd);
28324+ if (nn)
28325+ pfm_reset_pmds(ctx, new_set, nn, reset_mode);
28326+
28327+
28328+ /*
28329+ * This is needed when coming from pfm_start()
28330+ *
28331+ * When switching to the same set, there is no
28332+ * need to restart
28333+ */
28334+ if (no_restart)
28335+ goto skip_restart;
28336+
28337+ if (is_active) {
28338+ /*
28339+ * do not need to restart when same set
28340+ */
28341+ if (new_set != set) {
28342+ ctx->active_set = new_set;
28343+ new_set->duration_start = now;
28344+ pfm_arch_start(current, ctx);
28345+ }
28346+ /*
28347+ * install new timeout if necessary
28348+ */
28349+ if (new_flags & PFM_SETFL_TIME_SWITCH) {
28350+ struct hrtimer *h;
28351+ h = &__get_cpu_var(pfm_hrtimer);
28352+ hrtimer_forward(h, h->base->get_time(), new_set->hrtimer_exp);
28353+ new_set->hrtimer_rem = new_set->hrtimer_exp;
28354+ ret = HRTIMER_RESTART;
28355+ }
28356+ }
28357+
28358+skip_restart:
28359+ ctx->active_set = new_set;
28360+
28361+ end = sched_clock();
28362+
28363+ pfm_stats_inc(set_switch_count);
28364+ pfm_stats_add(set_switch_ns, end - now);
28365+
28366+ return ret;
28367+}
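+
+/*
+ * Round-robin example for the default (new_set == NULL) case above:
+ * with sets {0, 1, 2} and set 1 currently active, the next switch goes
+ * to set 2, then wraps around to set 0, following the (sorted) order
+ * of ctx->set_list.
+ */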
28368+
28369+/*
28370+ * called from __pfm_overflow_handler() to switch event sets.
28371+ * monitoring is stopped, task is current, interrupts are masked.
28372+ * compared to pfm_switch_sets(), this version is simplified because
28373+ * it knows about the call path. There is no need to stop monitoring
28374+ * because it is already frozen by PMU handler.
28375+ */
28376+void pfm_switch_sets_from_intr(struct pfm_context *ctx)
28377+{
28378+ struct pfm_event_set *set, *new_set;
28379+ u64 now, end;
28380+ u32 new_flags;
28381+ int is_system, n;
28382+
28383+ now = sched_clock();
28384+ set = ctx->active_set;
28385+ new_set = list_first_entry(&set->list, struct pfm_event_set, list);
28386+ if (&new_set->list == &ctx->set_list)
28387+ new_set = list_first_entry(&ctx->set_list, struct pfm_event_set, list);
28388+
28389+ PFM_DBG_ovfl("state=%d cur_set=%u cur_runs=%llu cur_npend=%d next_set=%u "
28390+ "next_runs=%llu new_npend=%d new_r_pmds=%llx",
28391+ ctx->state,
28392+ set->id,
28393+ (unsigned long long)set->runs,
28394+ set->npend_ovfls,
28395+ new_set->id,
28396+ (unsigned long long)new_set->runs,
28397+ new_set->npend_ovfls,
28398+ (unsigned long long)new_set->reset_pmds[0]);
28399+
28400+ is_system = ctx->flags.system;
28401+ new_flags = new_set->flags;
28402+
28403+ /*
28404+ * nothing more to do
28405+ */
28406+ if (new_set == set)
28407+ goto skip_same_set;
28408+
28409+ /*
28410+ * switch on intr only when set has OVFL_SWITCH
28411+ */
28412+ BUG_ON(set->flags & PFM_SETFL_TIME_SWITCH);
28413+
28414+ /*
28415+ * when called from PMU intr handler, monitoring
28416+ * is already stopped
28417+ *
28418+ * save current PMD registers, we use a special
28419+ * form for performance reason. On some architectures,
28420+ * such as x86, the pmds are already saved when entering
28421+ * the PMU interrupt handler via pfm-arch_intr_freeze()
28422+ * so we don't need to save them again. On the contrary,
28423+ * on IA-64, they are not saved by freeze, thus we have to
28424+ * to it here.
28425+ */
28426+ pfm_arch_save_pmds_from_intr(ctx, set);
28427+
28428+ /*
28429+ * compute elapsed ns for active set
28430+ */
28431+ set->duration += now - set->duration_start;
28432+
28433+ pfm_arch_restore_pmds(ctx, new_set);
28434+
28435+ /*
28436+ * must not be restored as active because we are still executing in the
28437+ * PMU interrupt handler; activation is deferred until the PMU is unfrozen
28438+ */
28439+ pfm_arch_restore_pmcs(ctx, new_set);
28440+
28441+ /*
28442+ * check for pending interrupt on incoming set.
28443+ * interrupts are masked so handler call deferred
28444+ */
28445+ if (new_set->npend_ovfls) {
28446+ pfm_arch_resend_irq(ctx);
28447+ pfm_stats_inc(ovfl_intr_replay_count);
28448+ }
28449+ /*
28450+ * no need to restore anything, that is already done
28451+ */
28452+ new_set->priv_flags &= ~PFM_SETFL_PRIV_MOD_BOTH;
28453+ /*
28454+ * reset duration counter
28455+ */
28456+ new_set->duration_start = now;
28457+
28458+skip_same_set:
28459+ new_set->runs++;
28460+
28461+ /*
28462+ * reset switch threshold
28463+ */
28464+ if (new_flags & PFM_SETFL_OVFL_SWITCH)
28465+ pfm_reload_switch_thresholds(ctx, new_set);
28466+
28467+ /*
28468+ * reset overflowed PMD registers
28469+ */
28470+ n = bitmap_weight(cast_ulp(new_set->reset_pmds), ctx->regs.max_pmd);
28471+ if (n)
28472+ pfm_reset_pmds(ctx, new_set, n, PFM_PMD_RESET_SHORT);
28473+
28474+ /*
28475+ * XXX: isactive?
28476+ *
28477+ * We came here following an interrupt which triggered a switch, i.e.,
28478+ * the previous set was using OVFL_SWITCH, thus we just need to
28479+ * check if the next set is using a timeout, and if so arm the timer.
28480+ *
28481+ * Timeout is always at least one tick away. No risk of having to
28482+ * invoke the timeout handler right now. In any case, cb_mode is
28483+ * set to HRTIMER_CB_IRQSAFE_NO_SOFTIRQ such that hrtimer_start
28484+ * will not try to wakeup the softirqd which could cause a locking
28485+ * problem.
28486+ */
28487+ if (new_flags & PFM_SETFL_TIME_SWITCH) {
28488+ hrtimer_start(&__get_cpu_var(pfm_hrtimer), new_set->hrtimer_exp, HRTIMER_MODE_REL);
28489+ PFM_DBG("armed new timeout for set%u", new_set->id);
28490+ }
28491+
28492+ ctx->active_set = new_set;
28493+
28494+ end = sched_clock();
28495+
28496+ pfm_stats_inc(set_switch_count);
28497+ pfm_stats_add(set_switch_ns, end - now);
28498+}
28499+
28500+
28501+static int pfm_setfl_sane(struct pfm_context *ctx, u32 flags)
28502+{
28503+#define PFM_SETFL_BOTH_SWITCH (PFM_SETFL_OVFL_SWITCH|PFM_SETFL_TIME_SWITCH)
28504+ int ret;
28505+
28506+ ret = pfm_arch_setfl_sane(ctx, flags);
28507+ if (ret)
28508+ return ret;
28509+
28510+ if ((flags & PFM_SETFL_BOTH_SWITCH) == PFM_SETFL_BOTH_SWITCH) {
28511+ PFM_DBG("both switch ovfl and switch time are set");
28512+ return -EINVAL;
28513+ }
28514+ return 0;
28515+}
28516+
28517+/*
28518+ * it is never possible to change the identification of an existing set
28519+ */
28520+static int pfm_change_evtset(struct pfm_context *ctx,
28521+ struct pfm_event_set *set,
28522+ struct pfarg_setdesc *req)
28523+{
28524+ struct timeval tv;
28525+ struct timespec ts;
28526+ ktime_t kt;
28527+ long d, res_ns;
28528+ s32 rem;
28529+ u32 flags;
28530+ int ret;
28531+ u16 set_id;
28532+
28533+ BUG_ON(ctx->state == PFM_CTX_LOADED);
28534+
28535+ set_id = req->set_id;
28536+ flags = req->set_flags;
28537+
28538+ ret = pfm_setfl_sane(ctx, flags);
28539+ if (ret) {
28540+ PFM_DBG("invalid flags 0x%x set %u", flags, set_id);
28541+ return -EINVAL;
28542+ }
28543+
28544+ /*
28545+ * compute timeout value
28546+ */
28547+ if (flags & PFM_SETFL_TIME_SWITCH) {
28548+ /*
28549+ * timeout value of zero is illegal
28550+ */
28551+ if (req->set_timeout == 0) {
28552+ PFM_DBG("invalid timeout 0");
28553+ return -EINVAL;
28554+ }
28555+
28556+ hrtimer_get_res(CLOCK_MONOTONIC, &ts);
28557+ res_ns = (long)ktime_to_ns(timespec_to_ktime(ts));
28558+
28559+ /*
28560+ * round-up to multiple of clock resolution
28561+ * timeout = ((req->set_timeout+res_ns-1)/res_ns)*res_ns;
28562+ *
28563+ * u64 division missing on 32-bit arch, so use div_s64_rem
28564+ */
28565+ d = div_s64_rem(req->set_timeout, res_ns, &rem);
28566+
28567+ PFM_DBG("set%u flags=0x%x req_timeout=%lluns "
28568+ "HZ=%u TICK_NSEC=%lu clock_res=%ldns rem=%dns",
28569+ set_id,
28570+ flags,
28571+ (unsigned long long)req->set_timeout,
28572+ HZ, TICK_NSEC,
28573+ res_ns,
28574+ rem);
28575+
28576+ /*
28577+ * Only accept timeouts we can actually achieve.
28578+ * Users can invoke clock_getres(CLOCK_MONOTONIC)
28579+ * to figure out the resolution and adjust the timeout
28580+ */
28581+ if (rem) {
28582+ PFM_DBG("set%u invalid timeout=%llu",
28583+ set_id,
28584+ (unsigned long long)req->set_timeout);
28585+ return -EINVAL;
28586+ }
28587+
28588+ tv = ns_to_timeval(req->set_timeout);
28589+ kt = timeval_to_ktime(tv);
28590+ set->hrtimer_exp = kt;
28591+ } else {
28592+ set->hrtimer_exp = ktime_set(0, 0);
28593+ }
28594+
28595+ /*
28596+ * commit changes
28597+ */
28598+ set->id = set_id;
28599+ set->flags = flags;
28600+ set->priv_flags = 0;
28601+
28602+ /*
28603+ * activation and duration counters are reset as
28604+ * most likely major things will change in the set
28605+ */
28606+ set->runs = 0;
28607+ set->duration = 0;
28608+
28609+ return 0;
28610+}
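+
+/*
+ * Worked example of the timeout validation above (the resolution value
+ * is illustrative): with res_ns = 1000000 (a 1ms clock resolution),
+ *
+ *	set_timeout = 2500000  ->  rem = 500000  ->  rejected (-EINVAL)
+ *	set_timeout = 3000000  ->  rem = 0       ->  accepted
+ *
+ * so callers are expected to query clock_getres(CLOCK_MONOTONIC) and
+ * request a multiple of the resolution.
+ */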
28611+
28612+/*
28613+ * this function does not modify the next field
28614+ */
28615+static void pfm_initialize_set(struct pfm_context *ctx,
28616+ struct pfm_event_set *set)
28617+{
28618+ u64 *impl_pmcs;
28619+ u16 i, max_pmc;
28620+
28621+ max_pmc = ctx->regs.max_pmc;
28622+ impl_pmcs = ctx->regs.pmcs;
28623+
28624+ /*
28625+ * install default values for all PMC registers
28626+ */
28627+ for (i = 0; i < max_pmc; i++) {
28628+ if (test_bit(i, cast_ulp(impl_pmcs))) {
28629+ set->pmcs[i] = pfm_pmu_conf->pmc_desc[i].dfl_val;
28630+ PFM_DBG("set%u pmc%u=0x%llx",
28631+ set->id,
28632+ i,
28633+ (unsigned long long)set->pmcs[i]);
28634+ }
28635+ }
28636+
28637+ /*
28638+ * PMD registers are set to 0 when the event set is allocated,
28639+ * hence we do not need to explicitly initialize them.
28640+ *
28641+ * For virtual PMD registers (i.e., those tied to a SW resource)
28642+ * their value becomes meaningful once the context is attached.
28643+ */
28644+}
28645+
28646+/*
28647+ * look for an event set using its identification. If the set does not
28648+ * exist:
28649+ * - if alloc == 0 then return error
28650+ * - if alloc == 1 then allocate set
28651+ *
28652+ * alloc is 1 ONLY when coming from pfm_create_evtsets(), which can only
28653+ * be called when the context is detached, i.e. monitoring is stopped.
28654+ */
28655+struct pfm_event_set *pfm_find_set(struct pfm_context *ctx, u16 set_id, int alloc)
28656+{
28657+ struct pfm_event_set *set = NULL, *prev, *new_set;
28658+
28659+ PFM_DBG("looking for set=%u", set_id);
28660+
28661+ prev = NULL;
28662+ list_for_each_entry(set, &ctx->set_list, list) {
28663+ if (set->id == set_id)
28664+ return set;
28665+ if (set->id > set_id)
28666+ break;
28667+ prev = set;
28668+ }
28669+
28670+ if (!alloc)
28671+ return NULL;
28672+
28673+ /*
28674+ * we are holding the context spinlock and interrupts
28675+ * are unmasked. We must use GFP_ATOMIC as we cannot
28676+ * sleep while holding a spin lock.
28677+ */
28678+ new_set = kmem_cache_zalloc(pfm_set_cachep, GFP_ATOMIC);
28679+ if (!new_set)
28680+ return NULL;
28681+
28682+ new_set->id = set_id;
28683+
28684+ INIT_LIST_HEAD(&new_set->list);
28685+
28686+ if (prev == NULL) {
28687+ list_add(&(new_set->list), &ctx->set_list);
28688+ } else {
28689+ PFM_DBG("add after set=%u", prev->id);
28690+ list_add(&(new_set->list), &prev->list);
28691+ }
28692+ return new_set;
28693+}
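+
+/*
+ * Example of the sorted insertion above: with existing sets {0, 2, 5},
+ * pfm_find_set(ctx, 3, 1) allocates a new set and links it after set 2,
+ * so ctx->set_list stays ordered as {0, 2, 3, 5}.
+ */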
28694+
28695+/**
28696+ * pfm_create_initial_set - create initial set from __pfm_create_context
28697+ * @ctx: context to attach the set to
28698+ */
28699+int pfm_create_initial_set(struct pfm_context *ctx)
28700+{
28701+ struct pfm_event_set *set;
28702+
28703+ /*
28704+ * create initial set0
28705+ */
28706+ if (!pfm_find_set(ctx, 0, 1))
28707+ return -ENOMEM;
28708+
28709+ set = list_first_entry(&ctx->set_list, struct pfm_event_set, list);
28710+
28711+ pfm_initialize_set(ctx, set);
28712+
28713+ return 0;
28714+}
28715+
28716+/*
28717+ * context is unloaded for this command. Interrupts are enabled
28718+ */
28719+int __pfm_create_evtsets(struct pfm_context *ctx, struct pfarg_setdesc *req,
28720+ int count)
28721+{
28722+ struct pfm_event_set *set;
28723+ u16 set_id;
28724+ int i, ret;
28725+
28726+ for (i = 0; i < count; i++, req++) {
28727+ set_id = req->set_id;
28728+
28729+ PFM_DBG("set_id=%u", set_id);
28730+
28731+ set = pfm_find_set(ctx, set_id, 1);
28732+ if (set == NULL)
28733+ goto error_mem;
28734+
28735+ ret = pfm_change_evtset(ctx, set, req);
28736+ if (ret)
28737+ goto error_params;
28738+
28739+ pfm_initialize_set(ctx, set);
28740+ }
28741+ return 0;
28742+error_mem:
28743+ PFM_DBG("cannot allocate set %u", set_id);
28744+ return -ENOMEM;
28745+error_params:
28746+ return ret;
28747+}
28748+
28749+int __pfm_getinfo_evtsets(struct pfm_context *ctx, struct pfarg_setinfo *req,
28750+ int count)
28751+{
28752+ struct pfm_event_set *set;
28753+ int i, is_system, is_loaded, is_self, ret;
28754+ u16 set_id;
28755+ u64 end;
28756+
28757+ end = sched_clock();
28758+
28759+ is_system = ctx->flags.system;
28760+ is_loaded = ctx->state == PFM_CTX_LOADED;
28761+ is_self = ctx->task == current || is_system;
28762+
28763+ ret = -EINVAL;
28764+ for (i = 0; i < count; i++, req++) {
28765+
28766+ set_id = req->set_id;
28767+
28768+ list_for_each_entry(set, &ctx->set_list, list) {
28769+ if (set->id == set_id)
28770+ goto found;
28771+ if (set->id > set_id)
28772+ goto error;
28773+ }
28774+found:
28775+ req->set_flags = set->flags;
28776+
28777+ /*
28778+ * compute leftover timeout
28779+ *
28780+ * lockdep may complain about lock inversion
28781+ * because of hrtimer_get_remaining(); however, this
28782+ * applies to self-monitoring only, thus the
28783+ * thread cannot be in the timeout handler
28784+ * and here at the same time given that we
28785+ * run with interrupts disabled
28786+ */
28787+ if (is_loaded && is_self) {
28788+ struct hrtimer *h;
28789+ h = &__get_cpu_var(pfm_hrtimer);
28790+ req->set_timeout = ktime_to_ns(hrtimer_get_remaining(h));
28791+ } else {
28792+ /*
28793+ * hrtimer_rem zero when not using
28794+ * timeout-based switching
28795+ */
28796+ req->set_timeout = ktime_to_ns(set->hrtimer_rem);
28797+ }
28798+
28799+ req->set_runs = set->runs;
28800+ req->set_act_duration = set->duration;
28801+
28802+ /*
28803+ * adjust for active set if needed
28804+ */
28805+ if (is_system && is_loaded && ctx->flags.started
28806+ && set == ctx->active_set)
28807+ req->set_act_duration += end - set->duration_start;
28808+
28809+ /*
28810+ * copy the list of pmds which last overflowed
28811+ */
28812+ bitmap_copy(cast_ulp(req->set_ovfl_pmds),
28813+ cast_ulp(set->ovfl_pmds),
28814+ PFM_MAX_PMDS);
28815+
28816+ /*
28817+ * copy bitmask of available PMU registers
28818+ *
28819+ * must copy over the entire vector to avoid
28820+ * returning bogus upper bits pass by user
28821+ */
28822+ bitmap_copy(cast_ulp(req->set_avail_pmcs),
28823+ cast_ulp(ctx->regs.pmcs),
28824+ PFM_MAX_PMCS);
28825+
28826+ bitmap_copy(cast_ulp(req->set_avail_pmds),
28827+ cast_ulp(ctx->regs.pmds),
28828+ PFM_MAX_PMDS);
28829+
28830+ PFM_DBG("set%u flags=0x%x eff_usec=%llu runs=%llu "
28831+ "a_pmcs=0x%llx a_pmds=0x%llx",
28832+ set_id,
28833+ set->flags,
28834+ (unsigned long long)req->set_timeout,
28835+ (unsigned long long)set->runs,
28836+ (unsigned long long)ctx->regs.pmcs[0],
28837+ (unsigned long long)ctx->regs.pmds[0]);
28838+ }
28839+ ret = 0;
28840+error:
28841+ return ret;
28842+}
28843+
28844+/*
28845+ * context is unloaded for this command. Interrupts are enabled
28846+ */
28847+int __pfm_delete_evtsets(struct pfm_context *ctx, void *arg, int count)
28848+{
28849+ struct pfarg_setdesc *req = arg;
28850+ struct pfm_event_set *set;
28851+ u16 set_id;
28852+ int i, ret;
28853+
28854+ ret = -EINVAL;
28855+ for (i = 0; i < count; i++, req++) {
28856+ set_id = req->set_id;
28857+
28858+ list_for_each_entry(set, &ctx->set_list, list) {
28859+ if (set->id == set_id)
28860+ goto found;
28861+ if (set->id > set_id)
28862+ goto error;
28863+ }
28864+ goto error;
28865+found:
28866+ /*
28867+ * clear active set if necessary.
28868+ * will be updated when context is loaded
28869+ */
28870+ if (set == ctx->active_set)
28871+ ctx->active_set = NULL;
28872+
28873+ list_del(&set->list);
28874+
28875+ kmem_cache_free(pfm_set_cachep, set);
28876+
28877+ PFM_DBG("set%u deleted", set_id);
28878+ }
28879+ ret = 0;
28880+error:
28881+ return ret;
28882+}
28883+
28884+/*
28885+ * called from pfm_context_free() to free all sets
28886+ */
28887+void pfm_free_sets(struct pfm_context *ctx)
28888+{
28889+ struct pfm_event_set *set, *tmp;
28890+
28891+ list_for_each_entry_safe(set, tmp, &ctx->set_list, list) {
28892+ list_del(&set->list);
28893+ kmem_cache_free(pfm_set_cachep, set);
28894+ }
28895+}
28896+
28897+/**
28898+ * pfm_restart_timer - restart hrtimer taking care of expired timeout
28899+ * @ctx : context to work with
28900+ * @set : current active set
28901+ *
28902+ * Must be called on the processor on which the timer is to be armed.
28903+ * Assumes context is locked and interrupts are masked
28904+ *
28905+ * Upon return the active set for the context may have changed
28906+ */
28907+void pfm_restart_timer(struct pfm_context *ctx, struct pfm_event_set *set)
28908+{
28909+ struct hrtimer *h;
28910+ enum hrtimer_restart ret;
28911+
28912+ h = &__get_cpu_var(pfm_hrtimer);
28913+
28914+ PFM_DBG_ovfl("hrtimer=%lld", (long long)ktime_to_ns(set->hrtimer_rem));
28915+
28916+ if (ktime_to_ns(set->hrtimer_rem) > 0) {
28917+ hrtimer_start(h, set->hrtimer_rem, HRTIMER_MODE_REL);
28918+ } else {
28919+ /*
28920+ * timer was not re-armed because it has already expired
28921+ * timer was not enqueued, we need to switch set now
28922+ */
28923+ pfm_stats_inc(set_switch_exp);
28924+
28925+ ret = pfm_switch_sets(ctx, NULL, 1, 0);
28926+ set = ctx->active_set;
28927+ if (ret == HRTIMER_RESTART)
28928+ hrtimer_start(h, set->hrtimer_rem, HRTIMER_MODE_REL);
28929+ }
28930+}
28931+
28932+int __init pfm_init_sets(void)
28933+{
28934+ pfm_set_cachep = kmem_cache_create("pfm_event_set",
28935+ sizeof(struct pfm_event_set),
28936+ SLAB_HWCACHE_ALIGN, 0, NULL);
28937+ if (!pfm_set_cachep) {
28938+ PFM_ERR("cannot initialize event set slab");
28939+ return -ENOMEM;
28940+ }
28941+ return 0;
28942+}
28943--- /dev/null
28944+++ b/perfmon/perfmon_smpl.c
28945@@ -0,0 +1,865 @@
28946+/*
28947+ * perfmon_smpl.c: perfmon2 sampling management
28948+ *
28949+ * This file implements the perfmon2 interface which
28950+ * provides access to the hardware performance counters
28951+ * of the host processor.
28952+ *
28953+ *
28954+ * The initial version of perfmon.c was written by
28955+ * Ganesh Venkitachalam, IBM Corp.
28956+ *
28957+ * Then it was modified for perfmon-1.x by Stephane Eranian and
28958+ * David Mosberger, Hewlett Packard Co.
28959+ *
28960+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
28961+ * by Stephane Eranian, Hewlett Packard Co.
28962+ *
28963+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
28964+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
28965+ * David Mosberger-Tang <davidm@hpl.hp.com>
28966+ *
28967+ * More information about perfmon available at:
28968+ * http://perfmon2.sf.net
28969+ *
28970+ * This program is free software; you can redistribute it and/or
28971+ * modify it under the terms of version 2 of the GNU General Public
28972+ * License as published by the Free Software Foundation.
28973+ *
28974+ * This program is distributed in the hope that it will be useful,
28975+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
28976+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
28977+ * General Public License for more details.
28978+ *
28979+ * You should have received a copy of the GNU General Public License
28980+ * along with this program; if not, write to the Free Software
28981+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
28982+ * 02111-1307 USA
28983+ */
28984+#include <linux/module.h>
28985+#include <linux/kernel.h>
28986+#include <linux/vmalloc.h>
28987+#include <linux/fs.h>
28988+#include <linux/mm.h>
28989+#include <linux/random.h>
28990+#include <linux/uaccess.h>
28991+#include <linux/perfmon_kern.h>
28992+
28993+#include "perfmon_priv.h"
28994+
28995+/**
28996+ * pfm_smpl_buf_alloc - allocate memory for sampling buffer
28997+ * @ctx: context to operate on
28998+ * @rsize: requested size
28999+ *
29000+ * called from pfm_smpl_buffer_alloc_old() (IA64-COMPAT)
29001+ * and pfm_setup_smpl_fmt()
29002+ *
29003+ * interrupts are enabled, context is not locked.
29004+ *
29005+ * function is not static because it is called from the IA-64
29006+ * compatibility module (perfmon_compat.c)
29007+ */
29008+int pfm_smpl_buf_alloc(struct pfm_context *ctx, size_t rsize)
29009+{
29010+#if PFM_ARCH_SMPL_ALIGN_SIZE > 0
29011+#define PFM_ALIGN_SMPL(a, f) (void *)((((unsigned long)(a))+(f-1)) & ~(f-1))
29012+#else
29013+#define PFM_ALIGN_SMPL(a, f) (a)
29014+#endif
29015+ void *addr, *real_addr;
29016+ size_t size, real_size;
29017+ int ret;
29018+
29019+ might_sleep();
29020+
29021+ /*
29022+ * align page boundary
29023+ */
29024+ size = PAGE_ALIGN(rsize);
29025+
29026+ /*
29027+ * On some arch, it may be necessary to get an alignment greater
29028+ * than page size to avoid certain cache effects (e.g., MIPS).
29029+ * This is the reason for PFM_ARCH_SMPL_ALIGN_SIZE.
29030+ */
29031+ real_size = size + PFM_ARCH_SMPL_ALIGN_SIZE;
29032+
29033+ PFM_DBG("req_size=%zu size=%zu real_size=%zu",
29034+ rsize,
29035+ size,
29036+ real_size);
29037+
29038+ ret = pfm_smpl_buf_space_acquire(ctx, real_size);
29039+ if (ret)
29040+ return ret;
29041+
29042+ /*
29043+ * vmalloc can sleep. we do not hold
29044+ * any spinlock and interrupts are enabled
29045+ */
29046+ real_addr = addr = vmalloc(real_size);
29047+ if (!real_addr) {
29048+ PFM_DBG("cannot allocate sampling buffer");
29049+ goto unres;
29050+ }
29051+
29052+ /*
29053+ * align the usable sampling buffer address to the arch requirement
29054+ * This is a nop on most architectures
29055+ */
29056+ addr = PFM_ALIGN_SMPL(real_addr, PFM_ARCH_SMPL_ALIGN_SIZE);
29057+
29058+ memset(addr, 0, real_size);
29059+
29060+ /*
29061+ * due to cache aliasing, it may be necessary to flush the pages
29062+ * on certain architectures (e.g., MIPS)
29063+ */
29064+ pfm_cacheflush(addr, real_size);
29065+
29066+ /*
29067+ * what needs to be freed
29068+ */
29069+ ctx->smpl_real_addr = real_addr;
29070+ ctx->smpl_real_size = real_size;
29071+
29072+ /*
29073+ * what is actually available to user
29074+ */
29075+ ctx->smpl_addr = addr;
29076+ ctx->smpl_size = size;
29077+
29078+ PFM_DBG("addr=%p real_addr=%p", addr, real_addr);
29079+
29080+ return 0;
29081+unres:
29082+ /*
29083+ * smpl_addr is NULL, no double freeing possible in pfm_context_free()
29084+ */
29085+ pfm_smpl_buf_space_release(ctx, real_size);
29086+
29087+ return -ENOMEM;
29088+}
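+
+/*
+ * Worked example of PFM_ALIGN_SMPL above, assuming an architecture
+ * that defines PFM_ARCH_SMPL_ALIGN_SIZE = 16384 (the value is
+ * illustrative):
+ *
+ *	real_addr = 0x...c1000  ->  addr = 0x...c4000
+ *
+ * the usable buffer start is rounded up to the next 16KB boundary; the
+ * slack is accounted for by real_size = size + PFM_ARCH_SMPL_ALIGN_SIZE.
+ */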
29089+
29090+/**
29091+ * pfm_smpl_buf_free - free resources associated with sampling
29092+ * @ctx: context to operate on
29093+ */
29094+void pfm_smpl_buf_free(struct pfm_context *ctx)
29095+{
29096+ struct pfm_smpl_fmt *fmt;
29097+
29098+ fmt = ctx->smpl_fmt;
29099+
29100+ /*
29101+ * some formats may not use a buffer, yet they may
29102+ * need to be called on exit
29103+ */
29104+ if (fmt) {
29105+ if (fmt->fmt_exit)
29106+ (*fmt->fmt_exit)(ctx->smpl_addr);
29107+ /*
29108+ * decrease refcount of sampling format
29109+ */
29110+ pfm_smpl_fmt_put(fmt);
29111+ }
29112+
29113+ if (ctx->smpl_addr) {
29114+ pfm_smpl_buf_space_release(ctx, ctx->smpl_real_size);
29115+
29116+ PFM_DBG("free buffer real_addr=0x%p real_size=%zu",
29117+ ctx->smpl_real_addr,
29118+ ctx->smpl_real_size);
29119+
29120+ vfree(ctx->smpl_real_addr);
29121+ }
29122+}
29123+
29124+/**
29125+ * pfm_setup_smpl_fmt - initialization of sampling format and buffer
29126+ * @ctx: context to operate on
29127+ * @fmt_arg: sampling format arguments
29128+ * @ctx_flags: context flags as passed by user
29129+ * @filp: file descriptor associated with context
29130+ *
29131+ * called from __pfm_create_context()
29132+ */
29133+int pfm_setup_smpl_fmt(struct pfm_context *ctx, u32 ctx_flags, void *fmt_arg,
29134+ struct file *filp)
29135+{
29136+ struct pfm_smpl_fmt *fmt;
29137+ size_t size = 0;
29138+ int ret = 0;
29139+
29140+ fmt = ctx->smpl_fmt;
29141+
29142+ /*
29143+ * validate parameters
29144+ */
29145+ if (fmt->fmt_validate) {
29146+ ret = (*fmt->fmt_validate)(ctx_flags,
29147+ ctx->regs.num_pmds,
29148+ fmt_arg);
29149+ PFM_DBG("validate(0x%x,%p)=%d", ctx_flags, fmt_arg, ret);
29150+ if (ret)
29151+ goto error;
29152+ }
29153+
29154+ /*
29155+ * check if buffer format needs buffer allocation
29156+ */
29157+ size = 0;
29158+ if (fmt->fmt_getsize) {
29159+ ret = (*fmt->fmt_getsize)(ctx_flags, fmt_arg, &size);
29160+ if (ret) {
29161+ PFM_DBG("cannot get size ret=%d", ret);
29162+ goto error;
29163+ }
29164+ }
29165+
29166+ /*
29167+ * allocate buffer
29168+ * v20_compat is for IA-64 backward compatibility with perfmon v2.0
29169+ */
29170+ if (size) {
29171+#ifdef CONFIG_IA64_PERFMON_COMPAT
29172+ /*
29173+	 * backward compatibility with perfmon v2.0 on IA-64
29174+ */
29175+ if (ctx->flags.ia64_v20_compat)
29176+ ret = pfm_smpl_buf_alloc_compat(ctx, size, filp);
29177+ else
29178+#endif
29179+ ret = pfm_smpl_buf_alloc(ctx, size);
29180+
29181+ if (ret)
29182+ goto error;
29183+
29184+ }
29185+
29186+ if (fmt->fmt_init) {
29187+ ret = (*fmt->fmt_init)(ctx, ctx->smpl_addr, ctx_flags,
29188+ ctx->regs.num_pmds,
29189+ fmt_arg);
29190+ }
29191+ /*
29192+	 * if there was an error, the buffer/resource will be freed
29193+	 * via pfm_context_free()
29194+ */
29195+error:
29196+ return ret;
29197+}
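(Editorial illustration, not part of the patch.) The callback sequence above -- fmt_validate(), fmt_getsize(), optional buffer allocation, then fmt_init() -- defines the contract a sampling format must honor. A minimal sketch of such a format is shown below; the member names are taken from the call sites in this file, while the exact callback prototypes and the registration helper live in the perfmon headers elsewhere in the patch, so the signatures here are assumptions.

	static int demo_fmt_validate(u32 ctx_flags, u16 npmds, void *arg)
	{
		/* refuse a context created without a format argument */
		return arg ? 0 : -EINVAL;
	}

	static int demo_fmt_getsize(u32 ctx_flags, void *arg, size_t *size)
	{
		/* request one page; pfm_smpl_buf_alloc() aligns and accounts for it */
		*size = PAGE_SIZE;
		return 0;
	}

	static int demo_fmt_init(struct pfm_context *ctx, void *buf,
				 u32 ctx_flags, u16 npmds, void *arg)
	{
		/* buf is the vmalloc'ed, aligned, zeroed buffer set up above */
		return 0;
	}

	static struct pfm_smpl_fmt demo_fmt = {
		.fmt_arg_size	= sizeof(u64),	/* checked against usize in pfm_get_smpl_arg() */
		.fmt_validate	= demo_fmt_validate,
		.fmt_getsize	= demo_fmt_getsize,
		.fmt_init	= demo_fmt_init,
	};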
29198+
29199+void pfm_mask_monitoring(struct pfm_context *ctx, struct pfm_event_set *set)
29200+{
29201+ u64 now;
29202+
29203+ now = sched_clock();
29204+
29205+ /*
29206+	 * we save the PMD values so that they can be read while the
29207+	 * context is MASKED without requiring the monitored thread to be
29208+	 * stopped, since monitoring is stopped anyway at that point
29209+ *
29210+ * pfm_save_pmds() could be avoided if we knew
29211+ * that pfm_arch_intr_freeze() had saved them already
29212+ */
29213+ pfm_save_pmds(ctx, set);
29214+ pfm_arch_mask_monitoring(ctx, set);
29215+ /*
29216+ * accumulate the set duration up to this point
29217+ */
29218+ set->duration += now - set->duration_start;
29219+
29220+ ctx->state = PFM_CTX_MASKED;
29221+
29222+ /*
29223+ * need to stop timer and remember remaining time
29224+ * will be reloaded in pfm_unmask_monitoring
29225+ * hrtimer is cancelled in the tail of the interrupt
29226+ * handler once the context is unlocked
29227+ */
29228+ if (set->flags & PFM_SETFL_TIME_SWITCH) {
29229+ struct hrtimer *h = &__get_cpu_var(pfm_hrtimer);
29230+ hrtimer_cancel(h);
29231+ set->hrtimer_rem = hrtimer_get_remaining(h);
29232+ }
29233+ PFM_DBG_ovfl("can_restart=%u", ctx->flags.can_restart);
29234+}
29235+
29236+/**
29237+ * pfm_unmask_monitoring - unmask monitoring
29238+ * @ctx: context to work with
29239+ * @set: current active set
29240+ *
29241+ * interrupts are masked when entering this function.
29242+ * context must be in MASKED state when calling.
29243+ *
29244+ * Upon return, the active set may have changed when using timeout
29245+ * based switching.
29246+ */
29247+static void pfm_unmask_monitoring(struct pfm_context *ctx, struct pfm_event_set *set)
29248+{
29249+ if (ctx->state != PFM_CTX_MASKED)
29250+ return;
29251+
29252+ PFM_DBG_ovfl("unmasking monitoring");
29253+
29254+ /*
29255+ * must be done before calling
29256+ * pfm_arch_unmask_monitoring()
29257+ */
29258+ ctx->state = PFM_CTX_LOADED;
29259+
29260+ /*
29261+ * we need to restore the PMDs because they
29262+ * may have been modified by user while MASKED in
29263+ * which case the actual registers have no yet
29264+	 * been updated
29265+ */
29266+ pfm_arch_restore_pmds(ctx, set);
29267+
29268+ /*
29269+ * call arch specific handler
29270+ */
29271+ pfm_arch_unmask_monitoring(ctx, set);
29272+
29273+ /*
29274+ * clear force reload flag. May have been set
29275+ * in pfm_write_pmcs or pfm_write_pmds
29276+ */
29277+ set->priv_flags &= ~PFM_SETFL_PRIV_MOD_BOTH;
29278+
29279+ /*
29280+ * reset set duration timer
29281+ */
29282+ set->duration_start = sched_clock();
29283+
29284+ /*
29285+ * restart hrtimer if needed
29286+ */
29287+ if (set->flags & PFM_SETFL_TIME_SWITCH) {
29288+ pfm_restart_timer(ctx, set);
29289+ /* careful here as pfm_restart_timer may switch sets */
29290+ }
29291+}
29292+
29293+void pfm_reset_pmds(struct pfm_context *ctx,
29294+ struct pfm_event_set *set,
29295+ int num_pmds,
29296+ int reset_mode)
29297+{
29298+ u64 val, mask, new_seed;
29299+ struct pfm_pmd *reg;
29300+ unsigned int i, not_masked;
29301+
29302+ not_masked = ctx->state != PFM_CTX_MASKED;
29303+
29304+ PFM_DBG_ovfl("%s r_pmds=0x%llx not_masked=%d",
29305+ reset_mode == PFM_PMD_RESET_LONG ? "long" : "short",
29306+ (unsigned long long)set->reset_pmds[0],
29307+ not_masked);
29308+
29309+ pfm_stats_inc(reset_pmds_count);
29310+
29311+ for (i = 0; num_pmds; i++) {
29312+ if (test_bit(i, cast_ulp(set->reset_pmds))) {
29313+ num_pmds--;
29314+
29315+ reg = set->pmds + i;
29316+
29317+ val = reset_mode == PFM_PMD_RESET_LONG ?
29318+ reg->long_reset : reg->short_reset;
29319+
29320+ if (reg->flags & PFM_REGFL_RANDOM) {
29321+ mask = reg->mask;
29322+ new_seed = random32();
29323+
29324+ /* construct a full 64-bit random value: */
29325+ if ((unlikely(mask >> 32) != 0))
29326+ new_seed |= (u64)random32() << 32;
29327+
29328+ /* counter values are negative numbers! */
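				/*
				 * (editorial note) with overflow-based sampling a counter armed
				 * to fire after N events holds -N (modulo the counter width);
				 * subtracting a random value in [0, mask] therefore randomizes
				 * the effective sampling period and avoids lock-step sampling.
				 */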
29329+ val -= (new_seed & mask);
29330+ }
29331+
29332+ set->pmds[i].value = val;
29333+ reg->lval = val;
29334+
29335+ /*
29336+			 * not all PMDs to reset are necessarily
29337+ * counters
29338+ */
29339+ if (not_masked)
29340+ pfm_write_pmd(ctx, i, val);
29341+
29342+ PFM_DBG_ovfl("set%u pmd%u sval=0x%llx",
29343+ set->id,
29344+ i,
29345+ (unsigned long long)val);
29346+ }
29347+ }
29348+
29349+ /*
29350+ * done with reset
29351+ */
29352+ bitmap_zero(cast_ulp(set->reset_pmds), i);
29353+
29354+ /*
29355+ * make changes visible
29356+ */
29357+ if (not_masked)
29358+ pfm_arch_serialize();
29359+}
29360+
29361+/*
29362+ * called from pfm_handle_work() and __pfm_restart()
29363+ * for system-wide and per-thread context to resume
29364+ * monitoring after a user level notification.
29365+ *
29366+ * In both cases, the context is locked and interrupts
29367+ * are disabled.
29368+ */
29369+void pfm_resume_after_ovfl(struct pfm_context *ctx)
29370+{
29371+ struct pfm_smpl_fmt *fmt;
29372+ u32 rst_ctrl;
29373+ struct pfm_event_set *set;
29374+ u64 *reset_pmds;
29375+ void *hdr;
29376+ int state, ret;
29377+
29378+ hdr = ctx->smpl_addr;
29379+ fmt = ctx->smpl_fmt;
29380+ state = ctx->state;
29381+ set = ctx->active_set;
29382+ ret = 0;
29383+
29384+ if (hdr) {
29385+ rst_ctrl = 0;
29386+ prefetch(hdr);
29387+ } else {
29388+ rst_ctrl = PFM_OVFL_CTRL_RESET;
29389+ }
29390+
29391+ /*
29392+ * if using a sampling buffer format and it has a restart callback,
29393+ * then invoke it. hdr may be NULL, if the format does not use a
29394+ * perfmon buffer
29395+ */
29396+ if (fmt && fmt->fmt_restart)
29397+ ret = (*fmt->fmt_restart)(state == PFM_CTX_LOADED, &rst_ctrl,
29398+ hdr);
29399+
29400+ reset_pmds = set->reset_pmds;
29401+
29402+ PFM_DBG("fmt_restart=%d reset_count=%d set=%u r_pmds=0x%llx switch=%d "
29403+ "ctx_state=%d",
29404+ ret,
29405+ ctx->flags.reset_count,
29406+ set->id,
29407+ (unsigned long long)reset_pmds[0],
29408+ (set->priv_flags & PFM_SETFL_PRIV_SWITCH),
29409+ state);
29410+
29411+ if (!ret) {
29412+ /*
29413+ * switch set if needed
29414+ */
29415+ if (set->priv_flags & PFM_SETFL_PRIV_SWITCH) {
29416+ set->priv_flags &= ~PFM_SETFL_PRIV_SWITCH;
29417+ pfm_switch_sets(ctx, NULL, PFM_PMD_RESET_LONG, 0);
29418+ set = ctx->active_set;
29419+ } else if (rst_ctrl & PFM_OVFL_CTRL_RESET) {
29420+ int nn;
29421+ nn = bitmap_weight(cast_ulp(set->reset_pmds),
29422+ ctx->regs.max_pmd);
29423+ if (nn)
29424+ pfm_reset_pmds(ctx, set, nn, PFM_PMD_RESET_LONG);
29425+ }
29426+
29427+ if (!(rst_ctrl & PFM_OVFL_CTRL_MASK))
29428+ pfm_unmask_monitoring(ctx, set);
29429+ else
29430+ PFM_DBG("stopping monitoring?");
29431+ ctx->state = PFM_CTX_LOADED;
29432+ }
29433+}
29434+
29435+/*
29436+ * This function is called when we need to perform asynchronous
29437+ * work on a context. This function is called ONLY when about to
29438+ * return to user mode (very much like with signal handling).
29439+ *
29440+ * There are several reasons why we come here:
29441+ *
29442+ * - per-thread mode, not self-monitoring, to reset the counters
29443+ * after a pfm_restart()
29444+ *
29445+ * - we are zombie and we need to cleanup our state
29446+ *
29447+ * - we need to block after an overflow notification
29448+ * on a context with the PFM_OVFL_NOTIFY_BLOCK flag
29449+ *
29450+ * This function is never called for a system-wide context.
29451+ *
29452+ * pfm_handle_work() can be called with interrupts enabled
29453+ * (TIF_NEED_RESCHED) or disabled. The wait for completion
29454+ * below may sleep, therefore we must re-enable interrupts
29455+ * to avoid deadlocks. It is safe to do so because this function
29456+ * is called ONLY when returning to user level, in which case
29457+ * there is no risk of kernel stack overflow due to deep
29458+ * interrupt nesting.
29459+ */
29460+void pfm_handle_work(struct pt_regs *regs)
29461+{
29462+ struct pfm_context *ctx;
29463+ unsigned long flags, dummy_flags;
29464+ int type, ret, info;
29465+
29466+#ifdef CONFIG_PPC
29467+ /*
29468+ * This is just a temporary fix. Obviously we'd like to fix the powerpc
29469+ * code to make that check before calling __pfm_handle_work() to
29470+ * prevent the function call overhead, but the call is made from
29471+ * assembly code, so it will take a little while to figure out how to
29472+ * perform the check correctly.
29473+ */
29474+ if (!test_thread_flag(TIF_PERFMON_WORK))
29475+ return;
29476+#endif
29477+
29478+ if (!user_mode(regs))
29479+ return;
29480+
29481+ clear_thread_flag(TIF_PERFMON_WORK);
29482+
29483+ pfm_stats_inc(handle_work_count);
29484+
29485+ ctx = current->pfm_context;
29486+ if (ctx == NULL) {
29487+ PFM_DBG("[%d] has no ctx", current->pid);
29488+ return;
29489+ }
29490+
29491+ BUG_ON(ctx->flags.system);
29492+
29493+ spin_lock_irqsave(&ctx->lock, flags);
29494+
29495+ type = ctx->flags.work_type;
29496+ ctx->flags.work_type = PFM_WORK_NONE;
29497+
29498+ PFM_DBG("work_type=%d reset_count=%d",
29499+ type,
29500+ ctx->flags.reset_count);
29501+
29502+ switch (type) {
29503+ case PFM_WORK_ZOMBIE:
29504+ goto do_zombie;
29505+ case PFM_WORK_RESET:
29506+ /* simply reset, no blocking */
29507+ goto skip_blocking;
29508+ case PFM_WORK_NONE:
29509+ PFM_DBG("unexpected PFM_WORK_NONE");
29510+ goto nothing_todo;
29511+ case PFM_WORK_BLOCK:
29512+ break;
29513+ default:
29514+ PFM_DBG("unkown type=%d", type);
29515+ goto nothing_todo;
29516+ }
29517+
29518+ /*
29519+ * restore interrupt mask to what it was on entry.
29520+ * Could be enabled/disabled.
29521+ */
29522+ spin_unlock_irqrestore(&ctx->lock, flags);
29523+
29524+ /*
29525+	 * force interrupts on because wait_for_completion_interruptible() may sleep
29526+ */
29527+ local_irq_enable();
29528+
29529+ PFM_DBG("before block sleeping");
29530+
29531+ /*
29532+ * may go through without blocking on SMP systems
29533+	 * if restart has been received already by the time we wait
29534+ */
29535+ ret = wait_for_completion_interruptible(&ctx->restart_complete);
29536+
29537+ PFM_DBG("after block sleeping ret=%d", ret);
29538+
29539+ /*
29540+ * lock context and mask interrupts again
29541+ * We save flags into a dummy because we may have
29542+ * altered interrupts mask compared to entry in this
29543+ * function.
29544+ */
29545+ spin_lock_irqsave(&ctx->lock, dummy_flags);
29546+
29547+ if (ctx->state == PFM_CTX_ZOMBIE)
29548+ goto do_zombie;
29549+
29550+ /*
29551+	 * if the wait was interrupted, we don't restart anything
29552+ */
29553+ if (ret < 0)
29554+ goto nothing_todo;
29555+
29556+skip_blocking:
29557+ /*
29558+ * iterate over the number of pending resets
29559+ * There are certain situations where there may be
29560+ * multiple notifications sent before a pfm_restart().
29561+ * As such, it may be that multiple pfm_restart() are
29562+ * issued before the monitored thread gets to
29563+ * pfm_handle_work(). To avoid losing restarts, pfm_restart()
29564+	 * increments a counter (reset_count). Here, we take this
29565+ * into account by potentially calling pfm_resume_after_ovfl()
29566+ * multiple times. It is up to the sampling format to take the
29567+ * appropriate actions.
29568+ */
29569+ while (ctx->flags.reset_count) {
29570+ pfm_resume_after_ovfl(ctx);
29571+ /* careful as active set may have changed */
29572+ ctx->flags.reset_count--;
29573+ }
29574+
29575+nothing_todo:
29576+ /*
29577+ * restore flags as they were upon entry
29578+ */
29579+ spin_unlock_irqrestore(&ctx->lock, flags);
29580+ return;
29581+
29582+do_zombie:
29583+ PFM_DBG("context is zombie, bailing out");
29584+
29585+ __pfm_unload_context(ctx, &info);
29586+
29587+ /*
29588+ * keep the spinlock check happy
29589+ */
29590+ spin_unlock(&ctx->lock);
29591+
29592+ /*
29593+ * enable interrupt for vfree()
29594+ */
29595+ local_irq_enable();
29596+
29597+ /*
29598+ * cancel timer now that context is unlocked
29599+ */
29600+ if (info & 0x2) {
29601+ ret = hrtimer_cancel(&__get_cpu_var(pfm_hrtimer));
29602+ PFM_DBG("timeout cancel=%d", ret);
29603+ }
29604+
29605+ /*
29606+ * actual context free
29607+ */
29608+ pfm_free_context(ctx);
29609+
29610+ /*
29611+ * restore interrupts as they were upon entry
29612+ */
29613+ local_irq_restore(flags);
29614+
29615+ /* always true */
29616+ if (info & 0x1)
29617+ pfm_session_release(0, 0);
29618+}
29619+
29620+/**
29621+ * __pfm_restart - resume monitoring after user-level notification
29622+ * @ctx: context to operate on
29623+ * @info: return information used to free resource once unlocked
29624+ *
29625+ * function called from sys_pfm_restart(). It is used when overflow
29626+ * notification is requested. For each notification received, the user
29627+ * must call pfm_restart() to indicate to the kernel that it is done
29628+ * processing the notification.
29629+ *
29630+ * When the caller is doing user level sampling, this function resets
29631+ * the overflowed counters and resumes monitoring which is normally stopped
29632+ * during notification (always the consequence of a counter overflow).
29633+ *
29634+ * When using a sampling format, the format restart() callback is invoked,
29635+ * and overflowed PMDs may be reset based on the decision of the sampling format.
29636+ *
29637+ * When operating in per-thread mode, and when not self-monitoring, the
29638+ * monitored thread DOES NOT need to be stopped, unlike for many other calls.
29639+ *
29640+ * This means that the effect of the restart may not necessarily be observed
29641+ * right when returning from the call. For instance, counters may not already
29642+ * be reset in the other thread.
29643+ *
29644+ * When operating in system-wide, the caller must be running on the monitored
29645+ * CPU.
29646+ *
29647+ * The context is locked and interrupts are disabled.
29648+ *
29649+ * info value upon return:
29650+ * 	- bit 0: when set, must issue complete() on the restart completion
29651+ */
29652+int __pfm_restart(struct pfm_context *ctx, int *info)
29653+{
29654+ int state;
29655+
29656+ state = ctx->state;
29657+
29658+ PFM_DBG("state=%d can_restart=%d reset_count=%d",
29659+ state,
29660+ ctx->flags.can_restart,
29661+ ctx->flags.reset_count);
29662+
29663+ *info = 0;
29664+
29665+ switch (state) {
29666+ case PFM_CTX_MASKED:
29667+ break;
29668+ case PFM_CTX_LOADED:
29669+ if (ctx->smpl_addr && ctx->smpl_fmt->fmt_restart)
29670+ break;
29671+ default:
29672+ PFM_DBG("invalid state=%d", state);
29673+ return -EBUSY;
29674+ }
29675+
29676+ /*
29677+ * first check if allowed to restart, i.e., notifications received
29678+ */
29679+ if (!ctx->flags.can_restart) {
29680+ PFM_DBG("no restart can_restart=0");
29681+ return -EBUSY;
29682+ }
29683+
29684+ pfm_stats_inc(pfm_restart_count);
29685+
29686+ /*
29687+ * at this point, the context is either LOADED or MASKED
29688+ */
29689+ ctx->flags.can_restart--;
29690+
29691+ /*
29692+ * handle self-monitoring case and system-wide
29693+ */
29694+ if (ctx->task == current || ctx->flags.system) {
29695+ pfm_resume_after_ovfl(ctx);
29696+ return 0;
29697+ }
29698+
29699+ /*
29700+ * restart another task
29701+ */
29702+
29703+ /*
29704+ * if blocking, then post the semaphore if PFM_CTX_MASKED, i.e.
29705+ * the task is blocked or on its way to block. That's the normal
29706+ * restart path. If the monitoring is not masked, then the task
29707+ * can be actively monitoring and we cannot directly intervene.
29708+ * Therefore we use the trap mechanism to catch the task and
29709+ * force it to reset the buffer/reset PMDs.
29710+ *
29711+ * if non-blocking, then we ensure that the task will go into
29712+ * pfm_handle_work() before returning to user mode.
29713+ *
29714+ * We cannot explicitly reset another task, it MUST always
29715+ * be done by the task itself. This works for system wide because
29716+ * the tool that is controlling the session is logically doing
29717+ * "self-monitoring".
29718+ */
29719+ if (ctx->flags.block && state == PFM_CTX_MASKED) {
29720+ PFM_DBG("unblocking [%d]", ctx->task->pid);
29721+ /*
29722+ * It is not possible to call complete() with the context locked
29723+ * otherwise we have a potential deadlock with the PMU context
29724+ * switch code due to a lock inversion between task_rq_lock()
29725+ * and the context lock.
29726+ * Instead we mark whether or not we need to issue the complete
29727+ * and we invoke the function once the context lock is released
29728+ * in sys_pfm_restart()
29729+ */
29730+ *info = 1;
29731+ } else {
29732+ PFM_DBG("[%d] armed exit trap", ctx->task->pid);
29733+ pfm_post_work(ctx->task, ctx, PFM_WORK_RESET);
29734+ }
29735+ ctx->flags.reset_count++;
29736+ return 0;
29737+}
29738+
29739+/**
29740+ * pfm_get_smpl_arg -- copy user arguments to pfm_create_context() related to sampling format
29741+ * @fmt_uname: format name as passed by user
29742+ * @fmt_uarg: format optional argument as passed by user
29743+ * @usize: size of structure passed in fmt_uarg
29744+ * @arg: kernel copy of fmt_uarg
29745+ * @fmt: pointer to sampling format upon success
29746+ *
29747+ * arg is kmalloc'ed, thus it needs a kfree by caller
29748+ */
29749+int pfm_get_smpl_arg(char __user *fmt_uname, void __user *fmt_uarg, size_t usize, void **arg,
29750+ struct pfm_smpl_fmt **fmt)
29751+{
29752+ struct pfm_smpl_fmt *f;
29753+ char *fmt_name;
29754+ void *addr = NULL;
29755+ size_t sz;
29756+ int ret;
29757+
29758+ fmt_name = getname(fmt_uname);
29759+ if (!fmt_name) {
29760+ PFM_DBG("getname failed");
29761+ return -ENOMEM;
29762+ }
29763+
29764+ /*
29765+ * find fmt and increase refcount
29766+ */
29767+ f = pfm_smpl_fmt_get(fmt_name);
29768+
29769+ putname(fmt_name);
29770+
29771+ if (f == NULL) {
29772+ PFM_DBG("buffer format not found");
29773+ return -EINVAL;
29774+ }
29775+
29776+ /*
29777+ * expected format argument size
29778+ */
29779+ sz = f->fmt_arg_size;
29780+
29781+ /*
29782+ * check user size matches expected size
29783+ * usize = -1 is for IA-64 backward compatibility
29784+ */
29785+ ret = -EINVAL;
29786+ if (sz != usize && usize != -1) {
29787+ PFM_DBG("invalid arg size %zu, format expects %zu",
29788+ usize, sz);
29789+ goto error;
29790+ }
29791+
29792+ if (sz) {
29793+ ret = -ENOMEM;
29794+ addr = kmalloc(sz, GFP_KERNEL);
29795+ if (addr == NULL)
29796+ goto error;
29797+
29798+ ret = -EFAULT;
29799+ if (copy_from_user(addr, fmt_uarg, sz))
29800+ goto error;
29801+ }
29802+ *arg = addr;
29803+ *fmt = f;
29804+ return 0;
29805+
29806+error:
29807+ kfree(addr);
29808+ pfm_smpl_fmt_put(f);
29809+ return ret;
29810+}
29811--- /dev/null
29812+++ b/perfmon/perfmon_syscalls.c
29813@@ -0,0 +1,1060 @@
29814+/*
29815+ * perfmon_syscalls.c: perfmon2 system call interface
29816+ *
29817+ * This file implements the perfmon2 interface which
29818+ * provides access to the hardware performance counters
29819+ * of the host processor.
29820+ *
29821+ * The initial version of perfmon.c was written by
29822+ * Ganesh Venkitachalam, IBM Corp.
29823+ *
29824+ * Then it was modified for perfmon-1.x by Stephane Eranian and
29825+ * David Mosberger, Hewlett Packard Co.
29826+ *
29827+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
29828+ * by Stephane Eranian, Hewlett Packard Co.
29829+ *
29830+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
29831+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
29832+ * David Mosberger-Tang <davidm@hpl.hp.com>
29833+ *
29834+ * More information about perfmon available at:
29835+ * http://perfmon2.sf.net
29836+ *
29837+ * This program is free software; you can redistribute it and/or
29838+ * modify it under the terms of version 2 of the GNU General Public
29839+ * License as published by the Free Software Foundation.
29840+ *
29841+ * This program is distributed in the hope that it will be useful,
29842+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
29843+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
29844+ * General Public License for more details.
29845+ *
29846+ * You should have received a copy of the GNU General Public License
29847+ * along with this program; if not, write to the Free Software
29848+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
29849+ * 02111-1307 USA
29850+ */
29851+#include <linux/kernel.h>
29852+#include <linux/fs.h>
29853+#include <linux/ptrace.h>
29854+#include <linux/perfmon_kern.h>
29855+#include <linux/uaccess.h>
29856+#include "perfmon_priv.h"
29857+
29858+/*
29859+ * Context locking rules:
29860+ * ---------------------
29861+ * - any thread with access to the file descriptor of a context can
29862+ * potentially issue perfmon calls
29863+ *
29864+ * - calls must be serialized to guarantee correctness
29865+ *
29866+ * - as soon as a context is attached to a thread or CPU, it may be
29867+ * actively monitoring. On some architectures, such as IA-64, this
29868+ * is true even though the pfm_start() call has not been made. This
29869+ * comes from the fact that on some architectures, it is possible to
29870+ * start/stop monitoring from userland.
29871+ *
29872+ * - If monitoring is active, then there can be PMU interrupts. Because
29873+ * context accesses must be serialized, the perfmon system calls
29874+ * must mask interrupts as soon as the context is attached.
29875+ *
29876+ * - perfmon system calls that operate with the context unloaded cannot
29877+ * assume it is actually unloaded when they are called. They first need
29878+ * to check and for that they need interrupts masked. Then, if the
29879+ * context is actually unloaded, they can unmask interrupts.
29880+ *
29881+ * - interrupt masking holds true for other internal perfmon functions as
29882+ *   well, except for the PMU interrupt handler, because those interrupts
29883+ * cannot be nested.
29884+ *
29885+ * - we mask ALL interrupts instead of just the PMU interrupt because we
29886+ * also need to protect against timer interrupts which could trigger
29887+ * a set switch.
29888+ */
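(Editorial illustration, not part of the patch.) The rules above translate into a common skeleton that each sys_pfm_*() entry point below instantiates; a condensed sketch, using the helpers defined later in this file:

	static long pfm_syscall_skeleton(int fd)
	{
		struct pfm_context *ctx;
		struct pfm_syscall_cookie cookie;
		struct task_struct *task;
		void *resume;
		unsigned long flags;
		int ret;

		/* pin the file and extract the context */
		ret = pfm_acquire_ctx_from_fd(fd, &ctx, &cookie);
		if (ret)
			return ret;

		/* mask ALL interrupts and lock the context, per the rules above */
		spin_lock_irqsave(&ctx->lock, flags);

		task = ctx->task;

		ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags, &resume);
		if (!ret)
			ret = 0;	/* ... invoke the __pfm_*() worker here ... */

		spin_unlock_irqrestore(&ctx->lock, flags);

		/* detach from the stopped thread outside the lock */
		if (resume)
			pfm_resume_task(task, resume);

		/* drop the file reference */
		pfm_release_ctx_from_fd(&cookie);
		return ret;
	}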
29889+#ifdef CONFIG_UTRACE
29890+#include <linux/utrace.h>
29891+
29892+static u32
29893+stopper_quiesce(struct utrace_attached_engine *engine, struct task_struct *tsk)
29894+{
29895+ PFM_DBG("quiesced [%d]", tsk->pid);
29896+ complete(engine->data);
29897+ return UTRACE_ACTION_RESUME;
29898+}
29899+
29900+void
29901+pfm_resume_task(struct task_struct *t, void *data)
29902+{
29903+ PFM_DBG("utrace detach [%d]", t->pid);
29904+ (void) utrace_detach(t, data);
29905+}
29906+
29907+static const struct utrace_engine_ops utrace_ops =
29908+{
29909+ .report_quiesce = stopper_quiesce,
29910+};
29911+
29912+static int pfm_wait_task_stopped(struct task_struct *task, void **data)
29913+{
29914+ DECLARE_COMPLETION_ONSTACK(done);
29915+ struct utrace_attached_engine *eng;
29916+ int ret;
29917+
29918+ eng = utrace_attach(task, UTRACE_ATTACH_CREATE, &utrace_ops, &done);
29919+ if (IS_ERR(eng))
29920+ return PTR_ERR(eng);
29921+
29922+ ret = utrace_set_flags(task, eng,
29923+ UTRACE_ACTION_QUIESCE | UTRACE_EVENT(QUIESCE));
29924+ PFM_DBG("wait quiesce [%d]", task->pid);
29925+ if (!ret)
29926+ ret = wait_for_completion_interruptible(&done);
29927+
29928+ if (ret)
29929+ (void) utrace_detach(task, eng);
29930+ else
29931+ *data = eng;
29932+	return ret;
29933+}
29934+#else /* !CONFIG_UTRACE */
29935+static int pfm_wait_task_stopped(struct task_struct *task, void **data)
29936+{
29937+ int ret;
29938+
29939+ *data = NULL;
29940+
29941+ /*
29942+ * returns 0 if cannot attach
29943+ */
29944+ ret = ptrace_may_access(task, PTRACE_MODE_ATTACH);
29945+ PFM_DBG("may_attach=%d", ret);
29946+ if (!ret)
29947+ return -EPERM;
29948+
29949+ ret = ptrace_check_attach(task, 0);
29950+ PFM_DBG("check_attach=%d", ret);
29951+ return ret;
29952+}
29953+void pfm_resume_task(struct task_struct *t, void *data)
29954+{}
29955+#endif
29956+
29957+struct pfm_syscall_cookie {
29958+ struct file *filp;
29959+ int fput_needed;
29960+};
29961+
29962+/*
29963+ * cannot attach if :
29964+ * - kernel task
29965+ *	- task not owned by caller (checked by ptrace_may_access())
29966+ * - task is dead or zombie
29967+ * - cannot use blocking notification when self-monitoring
29968+ */
29969+static int pfm_task_incompatible(struct pfm_context *ctx,
29970+ struct task_struct *task)
29971+{
29972+ /*
29973+ * cannot attach to a kernel thread
29974+ */
29975+ if (!task->mm) {
29976+ PFM_DBG("cannot attach to kernel thread [%d]", task->pid);
29977+ return -EPERM;
29978+ }
29979+
29980+ /*
29981+ * cannot use block on notification when
29982+ * self-monitoring.
29983+ */
29984+ if (ctx->flags.block && task == current) {
29985+ PFM_DBG("cannot use block on notification when self-monitoring"
29986+ "[%d]", task->pid);
29987+ return -EINVAL;
29988+ }
29989+ /*
29990+ * cannot attach to a zombie task
29991+ */
29992+ if (task->exit_state == EXIT_ZOMBIE || task->exit_state == EXIT_DEAD) {
29993+ PFM_DBG("cannot attach to zombie/dead task [%d]", task->pid);
29994+ return -EBUSY;
29995+ }
29996+ return 0;
29997+}
29998+
29999+/**
30000+ * pfm_get_task -- check permission and acquire task to monitor
30001+ * @ctx: perfmon context
30002+ * @pid: identification of the task to check
30003+ * @task: upon return, a pointer to the task to monitor
30004+ *
30005+ * This function is used in per-thread mode only AND when not
30006+ * self-monitoring. It finds the task to monitor and checks
30007+ * that the caller has permissions to attach. It also checks
30008+ * that the task is stopped via ptrace so that we can safely
30009+ * modify its state.
30010+ *
30011+ * task refcount is incremented when successful.
30012+ */
30013+static int pfm_get_task(struct pfm_context *ctx, pid_t pid,
30014+ struct task_struct **task, void **data)
30015+{
30016+ struct task_struct *p;
30017+ int ret = 0, ret1 = 0;
30018+
30019+ *data = NULL;
30020+
30021+ /*
30022+ * When attaching to another thread we must ensure
30023+ * that the thread is actually stopped.
30024+ *
30025+ * As a consequence, only the ptracing parent can actually
30026+ * attach a context to a thread. Obviously, this constraint
30027+ * does not exist for self-monitoring threads.
30028+ *
30029+	 * We use ptrace_may_access() to check for permission.
30030+ */
30031+ read_lock(&tasklist_lock);
30032+
30033+ p = find_task_by_vpid(pid);
30034+ if (p)
30035+ get_task_struct(p);
30036+
30037+ read_unlock(&tasklist_lock);
30038+
30039+ if (!p) {
30040+ PFM_DBG("task not found %d", pid);
30041+ return -ESRCH;
30042+ }
30043+
30044+ ret = pfm_task_incompatible(ctx, p);
30045+ if (ret)
30046+ goto error;
30047+
30048+ ret = pfm_wait_task_stopped(p, data);
30049+ if (ret)
30050+ goto error;
30051+
30052+ *task = p;
30053+
30054+ return 0;
30055+error:
30056+ if (!(ret1 || ret))
30057+ ret = -EPERM;
30058+
30059+ put_task_struct(p);
30060+
30061+ return ret;
30062+}
30063+
30064+/*
30065+ * context must be locked when calling this function
30066+ */
30067+int pfm_check_task_state(struct pfm_context *ctx, int check_mask,
30068+ unsigned long *flags, void **resume)
30069+{
30070+ struct task_struct *task;
30071+ unsigned long local_flags, new_flags;
30072+ int state, ret;
30073+
30074+ *resume = NULL;
30075+
30076+recheck:
30077+ /*
30078+ * task is NULL for system-wide context
30079+ */
30080+ task = ctx->task;
30081+ state = ctx->state;
30082+ local_flags = *flags;
30083+
30084+ PFM_DBG("state=%d check_mask=0x%x", state, check_mask);
30085+ /*
30086+ * if the context is detached, then we do not touch
30087+	 * hardware, therefore there is no restriction on when we can
30088+ * access it.
30089+ */
30090+ if (state == PFM_CTX_UNLOADED)
30091+ return 0;
30092+ /*
30093+ * no command can operate on a zombie context.
30094+ * A context becomes zombie when the file that identifies
30095+ * it is closed while the context is still attached to the
30096+ * thread it monitors.
30097+ */
30098+ if (state == PFM_CTX_ZOMBIE)
30099+ return -EINVAL;
30100+
30101+ /*
30102+ * at this point, state is PFM_CTX_LOADED or PFM_CTX_MASKED
30103+ */
30104+
30105+ /*
30106+ * some commands require the context to be unloaded to operate
30107+ */
30108+ if (check_mask & PFM_CMD_UNLOADED) {
30109+ PFM_DBG("state=%d, cmd needs context unloaded", state);
30110+ return -EBUSY;
30111+ }
30112+
30113+ /*
30114+ * self-monitoring always ok.
30115+ */
30116+ if (task == current)
30117+ return 0;
30118+
30119+ /*
30120+ * for syswide, the calling thread must be running on the cpu
30121+ * the context is bound to.
30122+ */
30123+ if (ctx->flags.system) {
30124+ if (ctx->cpu != smp_processor_id())
30125+ return -EBUSY;
30126+ return 0;
30127+ }
30128+
30129+ /*
30130+ * at this point, monitoring another thread
30131+ */
30132+
30133+ /*
30134+	 * commands other than pfm_unload_context() are allowed on a MASKED context
30135+ */
30136+ if (state == PFM_CTX_MASKED && !(check_mask & PFM_CMD_UNLOAD))
30137+ return 0;
30138+
30139+ /*
30140+ * When we operate on another thread, we must wait for it to be
30141+ * stopped and completely off any CPU as we need to access the
30142+ * PMU state (or machine state).
30143+ *
30144+ * A thread can be put in the STOPPED state in various ways
30145+ * including PTRACE_ATTACH, or when it receives a SIGSTOP signal.
30146+ * We enforce that the thread must be ptraced, so it is stopped
30147+ * AND it CANNOT wake up while we operate on it because this
30148+ * would require an action from the ptracing parent which is the
30149+ * thread that is calling this function.
30150+ *
30151+ * The dependency on ptrace, imposes that only the ptracing
30152+ * parent can issue command on a thread. This is unfortunate
30153+ * but we do not know of a better way of doing this.
30154+ */
30155+ if (check_mask & PFM_CMD_STOPPED) {
30156+
30157+ spin_unlock_irqrestore(&ctx->lock, local_flags);
30158+
30159+ /*
30160+ * check that the thread is ptraced AND STOPPED
30161+ */
30162+ ret = pfm_wait_task_stopped(task, resume);
30163+
30164+ spin_lock_irqsave(&ctx->lock, new_flags);
30165+
30166+ /*
30167+ * flags may be different than when we released the lock
30168+ */
30169+ *flags = new_flags;
30170+
30171+ if (ret)
30172+ return ret;
30173+ /*
30174+ * we must recheck to verify if state has changed
30175+ */
30176+ if (unlikely(ctx->state != state)) {
30177+ PFM_DBG("old_state=%d new_state=%d",
30178+ state,
30179+ ctx->state);
30180+ goto recheck;
30181+ }
30182+ }
30183+ return 0;
30184+}
30185+
30186+/*
30187+ * pfm_get_args - Function used to copy the syscall argument into kernel memory.
30188+ * @ureq: user argument
30189+ * @sz: user argument size
30190+ * @lsz: size of stack buffer
30191+ * @laddr: stack buffer address
30192+ * @req: point to start of kernel copy of the argument
30193+ * @ptr_free: address of kernel copy to free
30194+ *
30195+ * There are two options:
30196+ * - use a stack buffer described by laddr (addresses) and lsz (size)
30197+ * - allocate memory
30198+ *
30199+ * return:
30200+ * < 0 : in case of error (ptr_free may not be updated)
30201+ * 0 : success
30202+ * - req: points to base of kernel copy of arguments
30203+ * - ptr_free: address of buffer to free by caller on exit.
30204+ * NULL if using the stack buffer
30205+ *
30206+ * when ptr_free is not NULL upon return, the caller must kfree()
30207+ */
30208+int pfm_get_args(void __user *ureq, size_t sz, size_t lsz, void *laddr,
30209+ void **req, void **ptr_free)
30210+{
30211+ void *addr;
30212+
30213+ /*
30214+	 * check sysadmin argument limit
30215+ */
30216+ if (unlikely(sz > pfm_controls.arg_mem_max)) {
30217+ PFM_DBG("argument too big %zu max=%zu",
30218+ sz,
30219+ pfm_controls.arg_mem_max);
30220+ return -E2BIG;
30221+ }
30222+
30223+ /*
30224+ * check if vector fits on stack buffer
30225+ */
30226+ if (sz > lsz) {
30227+ addr = kmalloc(sz, GFP_KERNEL);
30228+ if (unlikely(addr == NULL))
30229+ return -ENOMEM;
30230+ *ptr_free = addr;
30231+ } else {
30232+ addr = laddr;
30233+ *req = laddr;
30234+ *ptr_free = NULL;
30235+ }
30236+
30237+ /*
30238+ * bring the data in
30239+ */
30240+ if (unlikely(copy_from_user(addr, ureq, sz))) {
30241+ if (addr != laddr)
30242+ kfree(addr);
30243+ return -EFAULT;
30244+ }
30245+
30246+ /*
30247+ * base address of kernel buffer
30248+ */
30249+ *req = addr;
30250+
30251+ return 0;
30252+}
30253+
30254+/**
30255+ * pfm_acquire_ctx_from_fd -- get ctx from file descriptor
30256+ * @fd: file descriptor
30257+ * @ctx: pointer to pointer of context updated on return
30258+ * @cookie: opaque structure to use for release
30259+ *
30260+ * This helper function extracts the ctx from the file descriptor.
30261+ * It also increments the refcount of the file structure and records
30262+ * it in the cookie so that the refcount can be dropped via
30263+ * pfm_release_ctx_from_fd() when leaving the perfmon syscall
30264+ */
30265+static int pfm_acquire_ctx_from_fd(int fd, struct pfm_context **ctx,
30266+ struct pfm_syscall_cookie *cookie)
30267+{
30268+ struct file *filp;
30269+ int fput_needed;
30270+
30271+ filp = fget_light(fd, &fput_needed);
30272+ if (unlikely(filp == NULL)) {
30273+ PFM_DBG("invalid fd %d", fd);
30274+ return -EBADF;
30275+ }
30276+
30277+ *ctx = filp->private_data;
30278+
30279+ if (unlikely(!*ctx || filp->f_op != &pfm_file_ops)) {
30280+ PFM_DBG("fd %d not related to perfmon", fd);
30281+ return -EBADF;
30282+ }
30283+ cookie->filp = filp;
30284+ cookie->fput_needed = fput_needed;
30285+
30286+ return 0;
30287+}
30288+
30289+/**
30290+ * pfm_release_ctx_from_fd -- decrease refcount of file associated with context
30291+ * @cookie: the cookie structure initialized by pfm_acquire_ctx_from_fd
30292+ */
30293+static inline void pfm_release_ctx_from_fd(struct pfm_syscall_cookie *cookie)
30294+{
30295+ fput_light(cookie->filp, cookie->fput_needed);
30296+}
30297+
30298+/*
30299+ * unlike the other perfmon system calls, this one returns a file descriptor
30300+ * or a value < 0 in case of error, very much like open() or socket()
30301+ */
30302+asmlinkage long sys_pfm_create_context(struct pfarg_ctx __user *ureq,
30303+ char __user *fmt_name,
30304+ void __user *fmt_uarg, size_t fmt_size)
30305+{
30306+ struct pfarg_ctx req;
30307+ struct pfm_smpl_fmt *fmt = NULL;
30308+ void *fmt_arg = NULL;
30309+ int ret;
30310+
30311+ PFM_DBG("req=%p fmt=%p fmt_arg=%p size=%zu",
30312+ ureq, fmt_name, fmt_uarg, fmt_size);
30313+
30314+ if (perfmon_disabled)
30315+ return -ENOSYS;
30316+
30317+ if (copy_from_user(&req, ureq, sizeof(req)))
30318+ return -EFAULT;
30319+
30320+ if (fmt_name) {
30321+ ret = pfm_get_smpl_arg(fmt_name, fmt_uarg, fmt_size, &fmt_arg, &fmt);
30322+ if (ret)
30323+ goto abort;
30324+ }
30325+
30326+ ret = __pfm_create_context(&req, fmt, fmt_arg, PFM_NORMAL, NULL);
30327+
30328+ kfree(fmt_arg);
30329+abort:
30330+ return ret;
30331+}
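(Editorial illustration, not part of the patch.) From user level, a plain counting context could be created roughly as sketched below. The perfmon syscall numbers are architecture specific (their base is exported through /sys/kernel/perfmon/syscall), so __NR_pfm_create_context is an assumption, as is the user header providing struct pfarg_ctx; the structure is simply zeroed here rather than spelled out.

	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <perfmon/perfmon.h>	/* assumed location of struct pfarg_ctx */

	int create_counting_ctx(void)
	{
		struct pfarg_ctx ctx;

		memset(&ctx, 0, sizeof(ctx));
		/* no sampling format: name, argument and size are all empty */
		return syscall(__NR_pfm_create_context, &ctx, NULL, NULL, 0);
	}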
30332+
30333+asmlinkage long sys_pfm_write_pmcs(int fd, struct pfarg_pmc __user *ureq, int count)
30334+{
30335+ struct pfm_context *ctx;
30336+ struct task_struct *task;
30337+ struct pfm_syscall_cookie cookie;
30338+ struct pfarg_pmc pmcs[PFM_PMC_STK_ARG];
30339+ struct pfarg_pmc *req;
30340+ void *fptr, *resume;
30341+ unsigned long flags;
30342+ size_t sz;
30343+ int ret;
30344+
30345+ PFM_DBG("fd=%d req=%p count=%d", fd, ureq, count);
30346+
30347+ if (count < 0 || count >= PFM_MAX_ARG_COUNT(ureq)) {
30348+ PFM_DBG("invalid arg count %d", count);
30349+ return -EINVAL;
30350+ }
30351+
30352+ sz = count*sizeof(*ureq);
30353+
30354+ ret = pfm_acquire_ctx_from_fd(fd, &ctx, &cookie);
30355+ if (ret)
30356+ return ret;
30357+
30358+ ret = pfm_get_args(ureq, sz, sizeof(pmcs), pmcs, (void **)&req, &fptr);
30359+ if (ret)
30360+ goto error;
30361+
30362+ spin_lock_irqsave(&ctx->lock, flags);
30363+
30364+ task = ctx->task;
30365+
30366+ ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags, &resume);
30367+ if (!ret)
30368+ ret = __pfm_write_pmcs(ctx, req, count);
30369+
30370+ spin_unlock_irqrestore(&ctx->lock, flags);
30371+
30372+ if (resume)
30373+ pfm_resume_task(task, resume);
30374+
30375+ /*
30376+ * This function may be on the critical path.
30377+	 * We want to avoid the branch if unnecessary.
30378+ */
30379+ if (fptr)
30380+ kfree(fptr);
30381+error:
30382+ pfm_release_ctx_from_fd(&cookie);
30383+ return ret;
30384+}
30385+
30386+asmlinkage long sys_pfm_write_pmds(int fd, struct pfarg_pmd __user *ureq, int count)
30387+{
30388+ struct pfm_context *ctx;
30389+ struct task_struct *task;
30390+ struct pfm_syscall_cookie cookie;
30391+ struct pfarg_pmd pmds[PFM_PMD_STK_ARG];
30392+ struct pfarg_pmd *req;
30393+ void *fptr, *resume;
30394+ unsigned long flags;
30395+ size_t sz;
30396+ int ret;
30397+
30398+ PFM_DBG("fd=%d req=%p count=%d", fd, ureq, count);
30399+
30400+ if (count < 0 || count >= PFM_MAX_ARG_COUNT(ureq)) {
30401+ PFM_DBG("invalid arg count %d", count);
30402+ return -EINVAL;
30403+ }
30404+
30405+ sz = count*sizeof(*ureq);
30406+
30407+ ret = pfm_acquire_ctx_from_fd(fd, &ctx, &cookie);
30408+ if (ret)
30409+ return ret;
30410+
30411+ ret = pfm_get_args(ureq, sz, sizeof(pmds), pmds, (void **)&req, &fptr);
30412+ if (ret)
30413+ goto error;
30414+
30415+ spin_lock_irqsave(&ctx->lock, flags);
30416+
30417+ task = ctx->task;
30418+
30419+ ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags, &resume);
30420+ if (!ret)
30421+ ret = __pfm_write_pmds(ctx, req, count, 0);
30422+
30423+ spin_unlock_irqrestore(&ctx->lock, flags);
30424+
30425+ if (resume)
30426+ pfm_resume_task(task, resume);
30427+
30428+ if (fptr)
30429+ kfree(fptr);
30430+error:
30431+ pfm_release_ctx_from_fd(&cookie);
30432+ return ret;
30433+}
30434+
30435+asmlinkage long sys_pfm_read_pmds(int fd, struct pfarg_pmd __user *ureq, int count)
30436+{
30437+ struct pfm_context *ctx;
30438+ struct task_struct *task;
30439+ struct pfm_syscall_cookie cookie;
30440+ struct pfarg_pmd pmds[PFM_PMD_STK_ARG];
30441+ struct pfarg_pmd *req;
30442+ void *fptr, *resume;
30443+ unsigned long flags;
30444+ size_t sz;
30445+ int ret;
30446+
30447+ PFM_DBG("fd=%d req=%p count=%d", fd, ureq, count);
30448+
30449+ if (count < 0 || count >= PFM_MAX_ARG_COUNT(ureq))
30450+ return -EINVAL;
30451+
30452+ sz = count*sizeof(*ureq);
30453+
30454+ ret = pfm_acquire_ctx_from_fd(fd, &ctx, &cookie);
30455+ if (ret)
30456+ return ret;
30457+
30458+ ret = pfm_get_args(ureq, sz, sizeof(pmds), pmds, (void **)&req, &fptr);
30459+ if (ret)
30460+ goto error;
30461+
30462+ spin_lock_irqsave(&ctx->lock, flags);
30463+
30464+ task = ctx->task;
30465+
30466+ ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags, &resume);
30467+ if (!ret)
30468+ ret = __pfm_read_pmds(ctx, req, count);
30469+
30470+ spin_unlock_irqrestore(&ctx->lock, flags);
30471+
30472+ if (copy_to_user(ureq, req, sz))
30473+ ret = -EFAULT;
30474+
30475+ if (resume)
30476+ pfm_resume_task(task, resume);
30477+
30478+ if (fptr)
30479+ kfree(fptr);
30480+error:
30481+ pfm_release_ctx_from_fd(&cookie);
30482+ return ret;
30483+}
30484+
30485+asmlinkage long sys_pfm_restart(int fd)
30486+{
30487+ struct pfm_context *ctx;
30488+ struct task_struct *task;
30489+ struct pfm_syscall_cookie cookie;
30490+ void *resume;
30491+ unsigned long flags;
30492+ int ret, info;
30493+
30494+ PFM_DBG("fd=%d", fd);
30495+
30496+ ret = pfm_acquire_ctx_from_fd(fd, &ctx, &cookie);
30497+ if (ret)
30498+ return ret;
30499+
30500+ spin_lock_irqsave(&ctx->lock, flags);
30501+
30502+ task = ctx->task;
30503+
30504+ ret = pfm_check_task_state(ctx, 0, &flags, &resume);
30505+ if (!ret)
30506+ ret = __pfm_restart(ctx, &info);
30507+
30508+ spin_unlock_irqrestore(&ctx->lock, flags);
30509+
30510+ if (resume)
30511+ pfm_resume_task(task, resume);
30512+ /*
30513+ * In per-thread mode with blocking notification, i.e.
30514+ * ctx->flags.blocking=1, we need to defer issuing the
30515+ * complete to unblock the blocked monitored thread.
30516+ * Otherwise we have a potential deadlock due to a lock
30517+ * inversion between the context lock and the task_rq_lock()
30518+ * which can happen if one thread is in this call and the other
30519+ * (the monitored thread) is in the context switch code.
30520+ *
30521+ * It is safe to access the context outside the critical section
30522+ * because:
30523+ * - we are protected by the fget_light(), thus the context
30524+ * cannot disappear
30525+ */
30526+ if (ret == 0 && info == 1)
30527+ complete(&ctx->restart_complete);
30528+
30529+ pfm_release_ctx_from_fd(&cookie);
30530+ return ret;
30531+}
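(Editorial illustration, not part of the patch.) A sketch of the user-level half of this handshake, assuming overflow notifications are consumed by read()ing the context file descriptor (handled by the pfm_file_ops code, not shown in this hunk) and that pfm_restart() is a thin wrapper around this system call; both names are placeholders:

	#include <unistd.h>

	extern int pfm_restart(int fd);			/* hypothetical syscall wrapper */
	extern void process_samples(void *smpl_buf);	/* hypothetical, tool specific */

	static void monitor_loop(int ctx_fd, void *smpl_buf)
	{
		char msg[64];	/* placeholder for the overflow notification message */

		for (;;) {
			/* blocks until the kernel posts an overflow notification */
			if (read(ctx_fd, msg, sizeof(msg)) < 0)
				break;
			process_samples(smpl_buf);
			/* tell the kernel we are done: ends up in __pfm_restart() */
			if (pfm_restart(ctx_fd))
				break;
		}
	}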
30532+
30533+asmlinkage long sys_pfm_stop(int fd)
30534+{
30535+ struct pfm_context *ctx;
30536+ struct task_struct *task;
30537+ struct pfm_syscall_cookie cookie;
30538+ void *resume;
30539+ unsigned long flags;
30540+ int ret;
30541+	int release_info = 0;	/* stays 0 if __pfm_stop() is not reached */
30542+
30543+ PFM_DBG("fd=%d", fd);
30544+
30545+ ret = pfm_acquire_ctx_from_fd(fd, &ctx, &cookie);
30546+ if (ret)
30547+ return ret;
30548+
30549+ spin_lock_irqsave(&ctx->lock, flags);
30550+
30551+ task = ctx->task;
30552+
30553+ ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags, &resume);
30554+ if (!ret)
30555+ ret = __pfm_stop(ctx, &release_info);
30556+
30557+ spin_unlock_irqrestore(&ctx->lock, flags);
30558+
30559+ if (resume)
30560+ pfm_resume_task(task, resume);
30561+
30562+ /*
30563+ * defer cancellation of timer to avoid race
30564+ * with pfm_handle_switch_timeout()
30565+ *
30566+ * applies only when self-monitoring
30567+ */
30568+ if (release_info & 0x2)
30569+ hrtimer_cancel(&__get_cpu_var(pfm_hrtimer));
30570+
30571+ pfm_release_ctx_from_fd(&cookie);
30572+ return ret;
30573+}
30574+
30575+asmlinkage long sys_pfm_start(int fd, struct pfarg_start __user *ureq)
30576+{
30577+ struct pfm_context *ctx;
30578+ struct task_struct *task;
30579+ struct pfm_syscall_cookie cookie;
30580+ void *resume;
30581+ struct pfarg_start req;
30582+ unsigned long flags;
30583+ int ret;
30584+
30585+ PFM_DBG("fd=%d req=%p", fd, ureq);
30586+
30587+ ret = pfm_acquire_ctx_from_fd(fd, &ctx, &cookie);
30588+ if (ret)
30589+ return ret;
30590+
30591+ /*
30592+ * the one argument is actually optional
30593+ */
30594+ if (ureq && copy_from_user(&req, ureq, sizeof(req)))
30595+ return -EFAULT;
30596+
30597+ spin_lock_irqsave(&ctx->lock, flags);
30598+
30599+ task = ctx->task;
30600+
30601+ ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags, &resume);
30602+ if (!ret)
30603+ ret = __pfm_start(ctx, ureq ? &req : NULL);
30604+
30605+ spin_unlock_irqrestore(&ctx->lock, flags);
30606+
30607+ if (resume)
30608+ pfm_resume_task(task, resume);
30609+
30610+ pfm_release_ctx_from_fd(&cookie);
30611+ return ret;
30612+}
30613+
30614+asmlinkage long sys_pfm_load_context(int fd, struct pfarg_load __user *ureq)
30615+{
30616+ struct pfm_context *ctx;
30617+ struct task_struct *task;
30618+ struct pfm_syscall_cookie cookie;
30619+	void *resume = NULL, *dummy_resume;	/* resume stays NULL unless pfm_get_task() succeeds */
30620+ unsigned long flags;
30621+ struct pfarg_load req;
30622+ int ret;
30623+
30624+ PFM_DBG("fd=%d req=%p", fd, ureq);
30625+
30626+ if (copy_from_user(&req, ureq, sizeof(req)))
30627+ return -EFAULT;
30628+
30629+ ret = pfm_acquire_ctx_from_fd(fd, &ctx, &cookie);
30630+ if (ret)
30631+ return ret;
30632+
30633+ task = current;
30634+
30635+ /*
30636+ * in per-thread mode (not self-monitoring), get a reference
30637+ * on task to monitor. This must be done with interrupts enabled
30638+	 * Upon successful return, refcount on task is increased.
30639+ *
30640+ * fget_light() is protecting the context.
30641+ */
30642+ if (!ctx->flags.system && req.load_pid != current->pid) {
30643+ ret = pfm_get_task(ctx, req.load_pid, &task, &resume);
30644+ if (ret)
30645+ goto error;
30646+ }
30647+
30648+ /*
30649+ * irqsave is required to avoid race in case context is already
30650+ * loaded or with switch timeout in the case of self-monitoring
30651+ */
30652+ spin_lock_irqsave(&ctx->lock, flags);
30653+
30654+ ret = pfm_check_task_state(ctx, PFM_CMD_UNLOADED, &flags, &dummy_resume);
30655+ if (!ret)
30656+ ret = __pfm_load_context(ctx, &req, task);
30657+
30658+ spin_unlock_irqrestore(&ctx->lock, flags);
30659+
30660+ if (resume)
30661+ pfm_resume_task(task, resume);
30662+
30663+ /*
30664+ * in per-thread mode (not self-monitoring), we need
30665+ * to decrease refcount on task to monitor:
30666+ * - load successful: we have a reference to the task in ctx->task
30667+ * - load failed : undo the effect of pfm_get_task()
30668+ */
30669+ if (task != current)
30670+ put_task_struct(task);
30671+error:
30672+ pfm_release_ctx_from_fd(&cookie);
30673+ return ret;
30674+}
30675+
30676+asmlinkage long sys_pfm_unload_context(int fd)
30677+{
30678+ struct pfm_context *ctx;
30679+ struct task_struct *task;
30680+ struct pfm_syscall_cookie cookie;
30681+ void *resume;
30682+ unsigned long flags;
30683+ int ret;
30684+ int is_system, release_info = 0;
30685+ u32 cpu;
30686+
30687+ PFM_DBG("fd=%d", fd);
30688+
30689+ ret = pfm_acquire_ctx_from_fd(fd, &ctx, &cookie);
30690+ if (ret)
30691+ return ret;
30692+
30693+ is_system = ctx->flags.system;
30694+
30695+ spin_lock_irqsave(&ctx->lock, flags);
30696+
30697+ cpu = ctx->cpu;
30698+ task = ctx->task;
30699+
30700+ ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED|PFM_CMD_UNLOAD,
30701+ &flags, &resume);
30702+ if (!ret)
30703+ ret = __pfm_unload_context(ctx, &release_info);
30704+
30705+ spin_unlock_irqrestore(&ctx->lock, flags);
30706+
30707+ if (resume)
30708+ pfm_resume_task(task, resume);
30709+
30710+ /*
30711+	 * cancel timer now that context is unlocked
30712+ * avoid race with pfm_handle_switch_timeout()
30713+ */
30714+ if (release_info & 0x2) {
30715+ int r;
30716+ r = hrtimer_cancel(&__get_cpu_var(pfm_hrtimer));
30717+ PFM_DBG("timeout cancel=%d", r);
30718+ }
30719+
30720+ if (release_info & 0x1)
30721+ pfm_session_release(is_system, cpu);
30722+
30723+ pfm_release_ctx_from_fd(&cookie);
30724+ return ret;
30725+}
30726+
30727+asmlinkage long sys_pfm_create_evtsets(int fd, struct pfarg_setdesc __user *ureq, int count)
30728+{
30729+ struct pfm_context *ctx;
30730+ struct pfm_syscall_cookie cookie;
30731+ struct pfarg_setdesc *req;
30732+ void *fptr, *resume;
30733+ unsigned long flags;
30734+ size_t sz;
30735+ int ret;
30736+
30737+ PFM_DBG("fd=%d req=%p count=%d", fd, ureq, count);
30738+
30739+ if (count < 0 || count >= PFM_MAX_ARG_COUNT(ureq))
30740+ return -EINVAL;
30741+
30742+ sz = count*sizeof(*ureq);
30743+
30744+ ret = pfm_acquire_ctx_from_fd(fd, &ctx, &cookie);
30745+ if (ret)
30746+ return ret;
30747+
30748+ ret = pfm_get_args(ureq, sz, 0, NULL, (void **)&req, &fptr);
30749+ if (ret)
30750+ goto error;
30751+
30752+ /*
30753+	 * must mask interrupts because we do not know the state of the context:
30754+	 * it could be attached and we could be receiving PMU interrupts. So we
30755+	 * mask interrupts, lock the context, check and possibly relax the masking
30756+ */
30757+ spin_lock_irqsave(&ctx->lock, flags);
30758+
30759+ ret = pfm_check_task_state(ctx, PFM_CMD_UNLOADED, &flags, &resume);
30760+ if (!ret)
30761+ ret = __pfm_create_evtsets(ctx, req, count);
30762+
30763+ spin_unlock_irqrestore(&ctx->lock, flags);
30764+ /*
30765+ * context must be unloaded for this command. The resume pointer
30766+ * is necessarily NULL, thus no need to call pfm_resume_task()
30767+ */
30768+ kfree(fptr);
30769+
30770+error:
30771+ pfm_release_ctx_from_fd(&cookie);
30772+ return ret;
30773+}
30774+
30775+asmlinkage long sys_pfm_getinfo_evtsets(int fd, struct pfarg_setinfo __user *ureq, int count)
30776+{
30777+ struct pfm_context *ctx;
30778+ struct task_struct *task;
30779+ struct pfm_syscall_cookie cookie;
30780+ struct pfarg_setinfo *req;
30781+ void *fptr, *resume;
30782+ unsigned long flags;
30783+ size_t sz;
30784+ int ret;
30785+
30786+ PFM_DBG("fd=%d req=%p count=%d", fd, ureq, count);
30787+
30788+ if (count < 0 || count >= PFM_MAX_ARG_COUNT(ureq))
30789+ return -EINVAL;
30790+
30791+ sz = count*sizeof(*ureq);
30792+
30793+ ret = pfm_acquire_ctx_from_fd(fd, &ctx, &cookie);
30794+ if (ret)
30795+ return ret;
30796+
30797+ ret = pfm_get_args(ureq, sz, 0, NULL, (void **)&req, &fptr);
30798+ if (ret)
30799+ goto error;
30800+
30801+ /*
30802+ * this command operates even when context is loaded, so we need
30803+ * to keep interrupts masked to avoid a race with PMU interrupt
30804+ * which may switch the active set
30805+ */
30806+ spin_lock_irqsave(&ctx->lock, flags);
30807+
30808+ task = ctx->task;
30809+
30810+ ret = pfm_check_task_state(ctx, 0, &flags, &resume);
30811+ if (!ret)
30812+ ret = __pfm_getinfo_evtsets(ctx, req, count);
30813+
30814+ spin_unlock_irqrestore(&ctx->lock, flags);
30815+
30816+ if (resume)
30817+ pfm_resume_task(task, resume);
30818+
30819+ if (copy_to_user(ureq, req, sz))
30820+ ret = -EFAULT;
30821+
30822+ kfree(fptr);
30823+error:
30824+ pfm_release_ctx_from_fd(&cookie);
30825+ return ret;
30826+}
30827+
30828+asmlinkage long sys_pfm_delete_evtsets(int fd, struct pfarg_setinfo __user *ureq, int count)
30829+{
30830+ struct pfm_context *ctx;
30831+ struct pfm_syscall_cookie cookie;
30832+ struct pfarg_setinfo *req;
30833+ void *fptr, *resume;
30834+ unsigned long flags;
30835+ size_t sz;
30836+ int ret;
30837+
30838+ PFM_DBG("fd=%d req=%p count=%d", fd, ureq, count);
30839+
30840+ if (count < 0 || count >= PFM_MAX_ARG_COUNT(ureq))
30841+ return -EINVAL;
30842+
30843+ sz = count*sizeof(*ureq);
30844+
30845+ ret = pfm_acquire_ctx_from_fd(fd, &ctx, &cookie);
30846+ if (ret)
30847+ return ret;
30848+
30849+ ret = pfm_get_args(ureq, sz, 0, NULL, (void **)&req, &fptr);
30850+ if (ret)
30851+ goto error;
30852+
30853+ /*
30854+	 * must mask interrupts because we do not know the state of the context:
30855+	 * it could be attached and we could be receiving PMU interrupts
30856+ */
30857+ spin_lock_irqsave(&ctx->lock, flags);
30858+
30859+ ret = pfm_check_task_state(ctx, PFM_CMD_UNLOADED, &flags, &resume);
30860+ if (!ret)
30861+ ret = __pfm_delete_evtsets(ctx, req, count);
30862+
30863+ spin_unlock_irqrestore(&ctx->lock, flags);
30864+ /*
30865+ * context must be unloaded for this command. The resume pointer
30866+ * is necessarily NULL, thus no need to call pfm_resume_task()
30867+ */
30868+ kfree(fptr);
30869+
30870+error:
30871+ pfm_release_ctx_from_fd(&cookie);
30872+ return ret;
30873+}
30874--- /dev/null
30875+++ b/perfmon/perfmon_sysfs.c
30876@@ -0,0 +1,525 @@
30877+/*
30878+ * perfmon_sysfs.c: perfmon2 sysfs interface
30879+ *
30880+ * This file implements the perfmon2 interface which
30881+ * provides access to the hardware performance counters
30882+ * of the host processor.
30883+ *
30884+ * The initial version of perfmon.c was written by
30885+ * Ganesh Venkitachalam, IBM Corp.
30886+ *
30887+ * Then it was modified for perfmon-1.x by Stephane Eranian and
30888+ * David Mosberger, Hewlett Packard Co.
30889+ *
30890+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
30891+ * by Stephane Eranian, Hewlett Packard Co.
30892+ *
30893+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
30894+ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
30895+ * David Mosberger-Tang <davidm@hpl.hp.com>
30896+ *
30897+ * More information about perfmon available at:
30898+ * http://perfmon2.sf.net
30899+ *
30900+ * This program is free software; you can redistribute it and/or
30901+ * modify it under the terms of version 2 of the GNU General Public
30902+ * License as published by the Free Software Foundation.
30903+ *
30904+ * This program is distributed in the hope that it will be useful,
30905+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
30906+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
30907+ * General Public License for more details.
30908+ *
30909+ * You should have received a copy of the GNU General Public License
30910+ * along with this program; if not, write to the Free Software
30911+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
30912+ * 02111-1307 USA
30913+ */
30914+#include <linux/kernel.h>
30915+#include <linux/module.h> /* for EXPORT_SYMBOL */
30916+#include <linux/perfmon_kern.h>
30917+#include "perfmon_priv.h"
30918+
30919+struct pfm_attribute {
30920+ struct attribute attr;
30921+ ssize_t (*show)(void *, struct pfm_attribute *attr, char *);
30922+ ssize_t (*store)(void *, const char *, size_t);
30923+};
30924+#define to_attr(n) container_of(n, struct pfm_attribute, attr)
30925+
30926+#define PFM_RO_ATTR(_name, _show) \
30927+ struct kobj_attribute attr_##_name = __ATTR(_name, 0444, _show, NULL)
30928+
30929+#define PFM_RW_ATTR(_name, _show, _store) \
30930+ struct kobj_attribute attr_##_name = __ATTR(_name, 0644, _show, _store)
30931+
30932+#define PFM_ROS_ATTR(_name, _show) \
30933+ struct pfm_attribute attr_##_name = __ATTR(_name, 0444, _show, NULL)
30934+
30935+#define is_attr_name(a, n) (!strcmp((a)->attr.name, n))
30936+int pfm_sysfs_add_pmu(struct pfm_pmu_config *pmu);
30937+
30938+static struct kobject *pfm_kernel_kobj, *pfm_fmt_kobj;
30939+static struct kobject *pfm_pmu_kobj;
30940+
30941+static ssize_t pfm_regs_attr_show(struct kobject *kobj,
30942+ struct attribute *attr, char *buf)
30943+{
30944+ struct pfm_regmap_desc *reg = to_reg(kobj);
30945+ struct pfm_attribute *attribute = to_attr(attr);
30946+ return attribute->show ? attribute->show(reg, attribute, buf) : -EIO;
30947+}
30948+
30949+static ssize_t pfm_fmt_attr_show(struct kobject *kobj,
30950+ struct attribute *attr, char *buf)
30951+{
30952+ struct pfm_smpl_fmt *fmt = to_smpl_fmt(kobj);
30953+ struct pfm_attribute *attribute = to_attr(attr);
30954+ return attribute->show ? attribute->show(fmt, attribute, buf) : -EIO;
30955+}
30956+
30957+static struct sysfs_ops pfm_regs_sysfs_ops = {
30958+ .show = pfm_regs_attr_show
30959+};
30960+
30961+static struct sysfs_ops pfm_fmt_sysfs_ops = {
30962+ .show = pfm_fmt_attr_show
30963+};
30964+
30965+static struct kobj_type pfm_regs_ktype = {
30966+ .sysfs_ops = &pfm_regs_sysfs_ops,
30967+};
30968+
30969+static struct kobj_type pfm_fmt_ktype = {
30970+ .sysfs_ops = &pfm_fmt_sysfs_ops,
30971+};
30972+
30973+static ssize_t pfm_controls_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
30974+{
30975+ int base;
30976+
30977+ if (is_attr_name(attr, "version"))
30978+ return snprintf(buf, PAGE_SIZE, "%u.%u\n", PFM_VERSION_MAJ, PFM_VERSION_MIN);
30979+
30980+ if (is_attr_name(attr, "task_sessions_count"))
30981+ return pfm_sysfs_res_show(buf, PAGE_SIZE, 0);
30982+
30983+ if (is_attr_name(attr, "debug"))
30984+ return snprintf(buf, PAGE_SIZE, "%d\n", pfm_controls.debug);
30985+
30986+ if (is_attr_name(attr, "task_group"))
30987+ return snprintf(buf, PAGE_SIZE, "%d\n", pfm_controls.task_group);
30988+
30989+ if (is_attr_name(attr, "mode"))
30990+ return snprintf(buf, PAGE_SIZE, "%d\n", pfm_controls.flags);
30991+
30992+ if (is_attr_name(attr, "arg_mem_max"))
30993+ return snprintf(buf, PAGE_SIZE, "%zu\n", pfm_controls.arg_mem_max);
30994+
30995+ if (is_attr_name(attr, "syscall")) {
30996+ base = pfm_arch_get_base_syscall();
30997+ return snprintf(buf, PAGE_SIZE, "%d\n", base);
30998+ }
30999+
31000+ if (is_attr_name(attr, "sys_sessions_count"))
31001+ return pfm_sysfs_res_show(buf, PAGE_SIZE, 1);
31002+
31003+ if (is_attr_name(attr, "smpl_buffer_mem_max"))
31004+ return snprintf(buf, PAGE_SIZE, "%zu\n", pfm_controls.smpl_buffer_mem_max);
31005+
31006+ if (is_attr_name(attr, "smpl_buffer_mem_cur"))
31007+ return pfm_sysfs_res_show(buf, PAGE_SIZE, 2);
31008+
31009+ if (is_attr_name(attr, "sys_group"))
31010+ return snprintf(buf, PAGE_SIZE, "%d\n", pfm_controls.sys_group);
31011+
31012+ /* XXX: could be set to write-only */
31013+ if (is_attr_name(attr, "reset_stats")) {
31014+ buf[0] = '0';
31015+ buf[1] = '\0';
31016+ return strnlen(buf, PAGE_SIZE);
31017+ }
31018+ return 0;
31019+}
31020+
31021+static ssize_t pfm_controls_store(struct kobject *kobj, struct kobj_attribute *attr,
31022+ const char *buf, size_t count)
31023+{
31024+ int i;
31025+ size_t d;
31026+
31027+ if (sscanf(buf, "%zu", &d) != 1)
31028+ goto skip;
31029+
31030+ if (is_attr_name(attr, "debug"))
31031+ pfm_controls.debug = d;
31032+
31033+ if (is_attr_name(attr, "task_group"))
31034+ pfm_controls.task_group = d;
31035+
31036+ if (is_attr_name(attr, "sys_group"))
31037+ pfm_controls.sys_group = d;
31038+
31039+ if (is_attr_name(attr, "mode"))
31040+ pfm_controls.flags = d ? PFM_CTRL_FL_RW_EXPERT : 0;
31041+
31042+ if (is_attr_name(attr, "arg_mem_max")) {
31043+ /*
31044+ * we impose a page as the minimum.
31045+ *
31046+ * This limit may be smaller than the stack buffer
31047+ * available and that is fine.
31048+ */
31049+ if (d >= PAGE_SIZE)
31050+ pfm_controls.arg_mem_max = d;
31051+ }
31052+ if (is_attr_name(attr, "reset_stats")) {
31053+ for_each_online_cpu(i) {
31054+ pfm_reset_stats(i);
31055+ }
31056+ }
31057+
31058+ if (is_attr_name(attr, "smpl_buffer_mem_max")) {
31059+ if (d >= PAGE_SIZE)
31060+ pfm_controls.smpl_buffer_mem_max = d;
31061+ }
31062+skip:
31063+ return count;
31064+}
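
/*
 * Illustrative usage sketch (values shown are examples, not taken from
 * the patch): with perfmon compiled in, the show/store handlers above
 * back a set of flat files under /sys/kernel/perfmon, so the controls
 * can be inspected and tuned from a shell, e.g.:
 *
 *   $ cat /sys/kernel/perfmon/version         -> "PFM_VERSION_MAJ.MIN"
 *   $ cat /sys/kernel/perfmon/arg_mem_max     -> current argument memory limit
 *   # echo 1 > /sys/kernel/perfmon/debug      -> enable debug output
 *   # echo 1 > /sys/kernel/perfmon/reset_stats-> clear per-CPU statistics
 *
 * Writes below PAGE_SIZE to arg_mem_max or smpl_buffer_mem_max are
 * silently ignored, as is any value that sscanf() cannot parse.
 */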
31065+
31066+/*
31067+ * /sys/kernel/perfmon attributes
31068+ */
31069+static PFM_RO_ATTR(version, pfm_controls_show);
31070+static PFM_RO_ATTR(task_sessions_count, pfm_controls_show);
31071+static PFM_RO_ATTR(syscall, pfm_controls_show);
31072+static PFM_RO_ATTR(sys_sessions_count, pfm_controls_show);
31073+static PFM_RO_ATTR(smpl_buffer_mem_cur, pfm_controls_show);
31074+
31075+static PFM_RW_ATTR(debug, pfm_controls_show, pfm_controls_store);
31076+static PFM_RW_ATTR(task_group, pfm_controls_show, pfm_controls_store);
31077+static PFM_RW_ATTR(mode, pfm_controls_show, pfm_controls_store);
31078+static PFM_RW_ATTR(sys_group, pfm_controls_show, pfm_controls_store);
31079+static PFM_RW_ATTR(arg_mem_max, pfm_controls_show, pfm_controls_store);
31080+static PFM_RW_ATTR(smpl_buffer_mem_max, pfm_controls_show, pfm_controls_store);
31081+static PFM_RW_ATTR(reset_stats, pfm_controls_show, pfm_controls_store);
31082+
31083+static struct attribute *pfm_kernel_attrs[] = {
31084+ &attr_version.attr,
31085+ &attr_syscall.attr,
31086+ &attr_task_sessions_count.attr,
31087+ &attr_sys_sessions_count.attr,
31088+ &attr_smpl_buffer_mem_cur.attr,
31089+ &attr_debug.attr,
31090+ &attr_reset_stats.attr,
31091+ &attr_sys_group.attr,
31092+ &attr_task_group.attr,
31093+ &attr_mode.attr,
31094+ &attr_smpl_buffer_mem_max.attr,
31095+ &attr_arg_mem_max.attr,
31096+ NULL
31097+};
31098+
31099+static struct attribute_group pfm_kernel_attr_group = {
31100+ .attrs = pfm_kernel_attrs,
31101+};
31102+
31103+/*
31104+ * per-reg attributes
31105+ */
31106+static ssize_t pfm_reg_show(void *data, struct pfm_attribute *attr, char *buf)
31107+{
31108+ struct pfm_regmap_desc *reg;
31109+ int w;
31110+
31111+ reg = data;
31112+
31113+ if (is_attr_name(attr, "name"))
31114+ return snprintf(buf, PAGE_SIZE, "%s\n", reg->desc);
31115+
31116+ if (is_attr_name(attr, "dfl_val"))
31117+ return snprintf(buf, PAGE_SIZE, "0x%llx\n",
31118+ (unsigned long long)reg->dfl_val);
31119+
31120+ if (is_attr_name(attr, "width")) {
31121+ w = (reg->type & PFM_REG_C64) ?
31122+ pfm_pmu_conf->counter_width : 64;
31123+ return snprintf(buf, PAGE_SIZE, "%d\n", w);
31124+ }
31125+
31126+ if (is_attr_name(attr, "rsvd_msk"))
31127+ return snprintf(buf, PAGE_SIZE, "0x%llx\n",
31128+ (unsigned long long)reg->rsvd_msk);
31129+
31130+ if (is_attr_name(attr, "addr"))
31131+ return snprintf(buf, PAGE_SIZE, "0x%lx\n", reg->hw_addr);
31132+
31133+ return 0;
31134+}
31135+
31136+static PFM_ROS_ATTR(name, pfm_reg_show);
31137+static PFM_ROS_ATTR(dfl_val, pfm_reg_show);
31138+static PFM_ROS_ATTR(rsvd_msk, pfm_reg_show);
31139+static PFM_ROS_ATTR(width, pfm_reg_show);
31140+static PFM_ROS_ATTR(addr, pfm_reg_show);
31141+
31142+static struct attribute *pfm_reg_attrs[] = {
31143+ &attr_name.attr,
31144+ &attr_dfl_val.attr,
31145+ &attr_rsvd_msk.attr,
31146+ &attr_width.attr,
31147+ &attr_addr.attr,
31148+ NULL
31149+};
31150+
31151+static struct attribute_group pfm_reg_attr_group = {
31152+ .attrs = pfm_reg_attrs,
31153+};
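
/*
 * For every implemented register, the group above yields five read-only
 * files. A hypothetical pmc0 directory would therefore contain something
 * along these lines (contents purely illustrative):
 *
 *   pmc0/name      -> mnemonic taken from reg->desc
 *   pmc0/dfl_val   -> default value, hex
 *   pmc0/rsvd_msk  -> reserved-bit mask, hex
 *   pmc0/width     -> counter width for PFM_REG_C64 counters, else 64
 *   pmc0/addr      -> hardware address/index of the register
 */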
31154+
31155+static ssize_t pfm_pmu_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
31156+{
31157+ if (is_attr_name(attr, "model"))
31158+ return snprintf(buf, PAGE_SIZE, "%s\n", pfm_pmu_conf->pmu_name);
31159+ return 0;
31160+}
31161+static PFM_RO_ATTR(model, pfm_pmu_show);
31162+
31163+static struct attribute *pfm_pmu_desc_attrs[] = {
31164+ &attr_model.attr,
31165+ NULL
31166+};
31167+
31168+static struct attribute_group pfm_pmu_desc_attr_group = {
31169+ .attrs = pfm_pmu_desc_attrs,
31170+};
31171+
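/*
 * pfm_sysfs_add_pmu_regs - describe the PMU register file in sysfs
 *
 * Walks the PMC and PMD descriptor tables of @pmu and, for every
 * implemented register (PFM_REG_I), creates a pmcN/pmdN kobject under
 * the pmu_desc directory and attaches the per-register attribute group.
 * On failure, the registers added so far are torn down again via the
 * undo_pmds/undo_pmcs paths and the error from kobject_init_and_add()
 * or sysfs_create_group() is returned.
 */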
31172+static int pfm_sysfs_add_pmu_regs(struct pfm_pmu_config *pmu)
31173+{
31174+ struct pfm_regmap_desc *reg;
31175+ unsigned int i, k;
31176+ int ret;
31177+
31178+ reg = pmu->pmc_desc;
31179+ for (i = 0; i < pmu->num_pmc_entries; i++, reg++) {
31180+
31181+ if (!(reg->type & PFM_REG_I))
31182+ continue;
31183+
31184+ ret = kobject_init_and_add(&reg->kobj, &pfm_regs_ktype,
31185+ pfm_pmu_kobj, "pmc%u", i);
31186+ if (ret)
31187+ goto undo_pmcs;
31188+
31189+ ret = sysfs_create_group(&reg->kobj, &pfm_reg_attr_group);
31190+ if (ret) {
31191+ kobject_del(&reg->kobj);
31192+ goto undo_pmcs;
31193+ }
31194+ }
31195+
31196+ reg = pmu->pmd_desc;
31197+ for (i = 0; i < pmu->num_pmd_entries; i++, reg++) {
31198+
31199+ if (!(reg->type & PFM_REG_I))
31200+ continue;
31201+
31202+ ret = kobject_init_and_add(&reg->kobj, &pfm_regs_ktype,
31203+ pfm_pmu_kobj, "pmd%u", i);
31204+ if (ret)
31205+ goto undo_pmds;
31206+
31207+ ret = sysfs_create_group(&reg->kobj, &pfm_reg_attr_group);
31208+ if (ret) {
31209+ kobject_del(&reg->kobj);
31210+ goto undo_pmds;
31211+ }
31212+ }
31213+ return 0;
31214+undo_pmds:
31215+ reg = pmu->pmd_desc;
31216+ for (k = 0; k < i; k++, reg++) {
31217+ if (!(reg->type & PFM_REG_I))
31218+ continue;
31219+ sysfs_remove_group(&reg->kobj, &pfm_reg_attr_group);
31220+ kobject_del(&reg->kobj);
31221+ }
31222+ i = pmu->num_pmc_entries;
31223+ /* fall through */
31224+undo_pmcs:
31225+ reg = pmu->pmc_desc;
31226+ for (k = 0; k < i; k++, reg++) {
31227+ if (!(reg->type & PFM_REG_I))
31228+ continue;
31229+ sysfs_remove_group(&reg->kobj, &pfm_reg_attr_group);
31230+ kobject_del(&reg->kobj);
31231+ }
31232+ return ret;
31233+}
31234+
31235+static int pfm_sysfs_del_pmu_regs(struct pfm_pmu_config *pmu)
31236+{
31237+ struct pfm_regmap_desc *reg;
31238+ unsigned int i;
31239+
31240+ reg = pmu->pmc_desc;
31241+ for (i = 0; i < pmu->num_pmc_entries; i++, reg++) {
31242+
31243+ if (!(reg->type & PFM_REG_I))
31244+ continue;
31245+
31246+ sysfs_remove_group(&reg->kobj, &pfm_reg_attr_group);
31247+ kobject_del(&reg->kobj);
31248+ }
31249+
31250+ reg = pmu->pmd_desc;
31251+ for (i = 0; i < pmu->num_pmd_entries; i++, reg++) {
31252+
31253+ if (!(reg->type & PFM_REG_I))
31254+ continue;
31255+
31256+ sysfs_remove_group(&reg->kobj, &pfm_reg_attr_group);
31257+ kobject_del(&reg->kobj);
31258+ }
31259+ return 0;
31260+}
31261+
31262+/*
31263+ * When a PMU description module is inserted, we create a
31264+ * pmu_desc subdirectory in sysfs and populate it with
31265+ * PMU-specific information, such as register mappings.
31266+ */
31267+int pfm_sysfs_add_pmu(struct pfm_pmu_config *pmu)
31268+{
31269+ int ret;
31270+
31271+ pfm_pmu_kobj = kobject_create_and_add("pmu_desc", pfm_kernel_kobj);
31272+ if (!pfm_pmu_kobj)
31273+ return -ENOMEM;
31274+
31275+ ret = sysfs_create_group(pfm_pmu_kobj, &pfm_pmu_desc_attr_group);
31276+ if (ret) {
31277+ /* will release pfm_pmu_kobj */
31278+ kobject_put(pfm_pmu_kobj);
31279+ return ret;
31280+ }
31281+
31282+ ret = pfm_sysfs_add_pmu_regs(pmu);
31283+ if (ret) {
31284+ sysfs_remove_group(pfm_pmu_kobj, &pfm_pmu_desc_attr_group);
31285+ /* will release pfm_pmu_kobj */
31286+ kobject_put(pfm_pmu_kobj);
31287+ } else
31288+ kobject_uevent(pfm_pmu_kobj, KOBJ_ADD);
31289+
31290+ return ret;
31291+}
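
/*
 * Putting the pieces together, loading a PMU description module is
 * expected to produce a layout along these lines (the register names and
 * counts depend entirely on the PMU module; shown only as an example):
 *
 *   /sys/kernel/perfmon/pmu_desc/
 *   |-- model            the pmu_name string of the loaded PMU module
 *   |-- pmc0/            per-register files, see pfm_reg_attrs above
 *   |-- pmc1/
 *   |-- pmd0/
 *   `-- ...
 */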
31292+
31293+/*
31294+ * When a PMU description module is removed, all of its
31295+ * information is removed from sysfs as well, i.e., the
31296+ * pmu_desc subdirectory disappears.
31297+ */
31298+int pfm_sysfs_remove_pmu(struct pfm_pmu_config *pmu)
31299+{
31300+ pfm_sysfs_del_pmu_regs(pmu);
31301+ sysfs_remove_group(pfm_pmu_kobj, &pfm_pmu_desc_attr_group);
31302+ kobject_uevent(pfm_pmu_kobj, KOBJ_REMOVE);
31303+ kobject_put(pfm_pmu_kobj);
31304+ pfm_pmu_kobj = NULL;
31305+ return 0;
31306+}
31307+
31308+static ssize_t pfm_fmt_show(void *data, struct pfm_attribute *attr, char *buf)
31309+{
31310+ struct pfm_smpl_fmt *fmt = data;
31311+
31312+ if (is_attr_name(attr, "version"))
31313+ return snprintf(buf, PAGE_SIZE, "%u.%u\n",
31314+ fmt->fmt_version >> 16 & 0xffff,
31315+ fmt->fmt_version & 0xffff);
31316+ return 0;
31317+}
31318+
31319+/*
31320+ * Do not use the predefined attribute macros here: they would
31321+ * generate a second attr_version and clash with the
31322+ * /sys/kernel/perfmon/version attribute defined above.
31323+ */
31323+struct pfm_attribute attr_fmt_version = {
31324+ .attr = { .name = "version", .mode = 0444 },
31325+ .show = pfm_fmt_show,
31326+};
31327+
31328+static struct attribute *pfm_fmt_attrs[] = {
31329+ &attr_fmt_version.attr,
31330+ NULL
31331+};
31332+
31333+static struct attribute_group pfm_fmt_attr_group = {
31334+ .attrs = pfm_fmt_attrs,
31335+};
31336+
31337+/*
31338+ * When a sampling format module is inserted, we create a directory
31339+ * for it under /sys/kernel/perfmon/formats and populate it with the
31340+ * format information exported above (currently only its version).
31341+ */
31341+int pfm_sysfs_add_fmt(struct pfm_smpl_fmt *fmt)
31342+{
31343+ int ret;
31344+
31345+ ret = kobject_init_and_add(&fmt->kobj, &pfm_fmt_ktype,
31346+ pfm_fmt_kobj, "%s", fmt->fmt_name);
31347+ if (ret)
31348+ return ret;
31349+
31350+ ret = sysfs_create_group(&fmt->kobj, &pfm_fmt_attr_group);
31351+ if (ret)
31352+ kobject_del(&fmt->kobj);
31353+ else
31354+ kobject_uevent(&fmt->kobj, KOBJ_ADD);
31355+
31356+ return ret;
31357+}
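
/*
 * The resulting layout mirrors the PMU case: each registered sampling
 * format gets a directory named after fmt_name under
 * /sys/kernel/perfmon/formats/, currently holding a single "version"
 * file. For a format registered as, say, "default" (the name is purely
 * hypothetical here), that would be:
 *
 *   /sys/kernel/perfmon/formats/default/version   -> "major.minor"
 */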
31358+
31359+/*
31360+ * when a sampling format module is removed, its information
31361+ * must also be removed from sysfs
31362+ */
31363+void pfm_sysfs_remove_fmt(struct pfm_smpl_fmt *fmt)
31364+{
31365+ sysfs_remove_group(&fmt->kobj, &pfm_fmt_attr_group);
31366+ kobject_uevent(&fmt->kobj, KOBJ_REMOVE);
31367+ kobject_del(&fmt->kobj);
31368+}
31369+
31370+int __init pfm_init_sysfs(void)
31371+{
31372+ int ret;
31373+
31374+ pfm_kernel_kobj = kobject_create_and_add("perfmon", kernel_kobj);
31375+ if (!pfm_kernel_kobj) {
31376+ PFM_ERR("cannot add kernel object: /sys/kernel/perfmon");
31377+ return -ENOMEM;
31378+ }
31379+
31380+ ret = sysfs_create_group(pfm_kernel_kobj, &pfm_kernel_attr_group);
31381+ if (ret) {
31382+ kobject_put(pfm_kernel_kobj);
31383+ return ret;
31384+ }
31385+
31386+ pfm_fmt_kobj = kobject_create_and_add("formats", pfm_kernel_kobj);
31387+ if (!pfm_fmt_kobj) {
31388+ PFM_ERR("cannot add fmt object: /sys/kernel/perfmon/formats");
31389+ goto error_fmt;
31390+ }
31391+ if (pfm_pmu_conf)
31392+ pfm_sysfs_add_pmu(pfm_pmu_conf);
31393+
31394+ pfm_sysfs_builtin_fmt_add();
31395+
31396+ return 0;
31397+
31398+error_fmt:
31399+ kobject_put(pfm_kernel_kobj);
31400+ return -ENOMEM;
31401+}
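
/*
 * A minimal user-space sketch (not part of the kernel patch) showing how
 * the interface set up by pfm_init_sysfs() is typically consumed: a tool
 * can discover the perfmon version and syscall base without any library
 * support simply by reading the files created above.
 *
 *   #include <stdio.h>
 *   #include <string.h>
 *
 *   int main(void)
 *   {
 *           char ver[32];
 *           int base;
 *           FILE *f = fopen("/sys/kernel/perfmon/version", "r");
 *           if (!f || !fgets(ver, sizeof(ver), f))
 *                   return 1;
 *           fclose(f);
 *           ver[strcspn(ver, "\n")] = '\0';
 *           f = fopen("/sys/kernel/perfmon/syscall", "r");
 *           if (!f || fscanf(f, "%d", &base) != 1)
 *                   return 1;
 *           fclose(f);
 *           printf("perfmon %s, first syscall %d\n", ver, base);
 *           return 0;
 *   }
 */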