1 /* Copyright (C) 2021-2024 Free Software Foundation, Inc.
2 Contributed by Oracle.
3
4 This file is part of GNU Binutils.
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, 51 Franklin Street - Fifth Floor, Boston,
19 MA 02110-1301, USA. */
20
21 #include <errno.h>
22 #include <unistd.h>
23 #include <fcntl.h>
24 #include <sys/mman.h>
25 #include <sys/ioctl.h>
26 #include <sys/syscall.h>
27 #include <linux/perf_event.h>
28
29 #include "hwcdrv.h"
30
31 /*---------------------------------------------------------------------------*/
32 /* macros */
33 #define IS_GLOBAL /* Mark global symbols */
34
35 #include "cpuid.c" /* ftns for identifying a chip */
36
37 static hdrv_pcbe_api_t hdrv_pcbe_core_api;
38 static hdrv_pcbe_api_t hdrv_pcbe_opteron_api;
39 static hdrv_pcbe_api_t *hdrv_pcbe_drivers[] = {
40 &hdrv_pcbe_core_api,
41 &hdrv_pcbe_opteron_api,
42 NULL
43 };
44 #include "opteron_pcbe.c" /* CPU-specific code */
45 #include "core_pcbe.c" /* CPU-specific code */
46
47 extern hwcdrv_api_t hwcdrv_pcl_api;
48 IS_GLOBAL hwcdrv_api_t *hwcdrv_drivers[] = {
49 &hwcdrv_pcl_api,
50 NULL
51 };
52
53 /*---------------------------------------------------------------------------*/
54
55 /* utils for drivers */
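/* Assign a PMC register number to every requested counter.
 * Pass 1 binds counters whose register is already fixed (either
 * preselected or implied by a single-entry reg_list) and marks those
 * registers as taken.  Pass 2 walks each remaining counter's reg_list
 * and claims the first register that is still free.  Fails if a
 * register is out of range or no free register can be found.  */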
56 IS_GLOBAL int
57 hwcdrv_assign_all_regnos (Hwcentry* entries[], unsigned numctrs)
58 {
59 unsigned int pmc_assigned[MAX_PICS];
60 unsigned idx;
61 for (int ii = 0; ii < MAX_PICS; ii++)
62 pmc_assigned[ii] = 0;
63
64 /* assign the HWCs that we already know about */
65 for (idx = 0; idx < numctrs; idx++)
66 {
67 regno_t regno = entries[idx]->reg_num;
68 if (regno == REGNO_ANY)
69 {
70 /* check to see if list of possible registers only contains one entry */
71 regno = REG_LIST_SINGLE_VALID_ENTRY (entries[idx]->reg_list);
72 }
73 if (regno != REGNO_ANY)
74 {
75 if (regno < 0 || regno >= MAX_PICS || !regno_is_valid (entries[idx], regno))
76 {
77 logerr (GTXT ("For counter #%d, register %d is out of range\n"), idx + 1, regno); /*!*/
78 return HWCFUNCS_ERROR_HWCARGS;
79 }
80 TprintfT (DBG_LT2, "hwcfuncs_assign_regnos(): preselected: idx=%d, regno=%d\n", idx, regno);
81 entries[idx]->reg_num = regno; /* assigning back to entries */
82 pmc_assigned[regno] = 1;
83 }
84 }
85
86 /* assign HWCs that are currently REGNO_ANY */
87 for (idx = 0; idx < numctrs; idx++)
88 {
89 if (entries[idx]->reg_num == REGNO_ANY)
90 {
91 int assigned = 0;
92 regno_t *reg_list = entries[idx]->reg_list;
93 for (; reg_list && *reg_list != REGNO_ANY; reg_list++)
94 {
95 regno_t regno = *reg_list;
96 if (regno < 0 || regno >= MAX_PICS)
97 {
98 logerr (GTXT ("For counter #%d, register %d is out of range\n"), idx + 1, regno); /*!*/
99 return HWCFUNCS_ERROR_HWCARGS;
100 }
101 if (pmc_assigned[regno] == 0)
102 {
103 TprintfT (DBG_LT2, "hwcfuncs_assign_regnos(): assigned: idx=%d, regno=%d\n", idx, regno);
104 entries[idx]->reg_num = regno; /* assigning back to entries */
105 pmc_assigned[regno] = 1;
106 assigned = 1;
107 break;
108 }
109 }
110 if (!assigned)
111 {
112 logerr (GTXT ("Counter '%s' could not be bound to a register\n"),
113 entries[idx]->name ? entries[idx]->name : "<NULL>");
114 return HWCFUNCS_ERROR_HWCARGS;
115 }
116 }
117 }
118 return 0;
119 }
120
121 IS_GLOBAL int
122 hwcdrv_lookup_cpuver (const char * cpcN_cciname)
123 {
124 libcpc2_cpu_lookup_t *plookup;
125 static libcpc2_cpu_lookup_t cpu_table[] = {
126 LIBCPC2_CPU_LOOKUP_LIST
127 };
128 if (cpcN_cciname == NULL)
129 return CPUVER_UNDEFINED;
130
131 /* search table for name */
132 for (plookup = cpu_table; plookup->cpc2_cciname; plookup++)
133 {
134 int n = strlen (plookup->cpc2_cciname);
135 if (!strncmp (plookup->cpc2_cciname, cpcN_cciname, n))
136 return plookup->cpc2_cpuver;
137 }
138 /* unknown, but does have a descriptive string */
139 TprintfT (DBG_LT0, "hwcfuncs: CPC2: WARNING: Id of processor '%s' "
140 "could not be determined\n",
141 cpcN_cciname);
142 return CPUVER_GENERIC;
143 }
144
145 /*---------------------------------------------------------------------------*/
146 /* utils to generate x86 register definitions on Linux */
147
148 /*
149 * This code is structured as though we're going to initialize the
150 * HWC by writing the Intel MSR register directly. That is, we
151 * assume the lowest 16 bits of the event number will have the event
152 * and that higher bits will set attributes.
153 *
154 * While SPARC is different, we can nonetheless use basically the
155 * same "x86"-named functions:
156 *
157 * - The event code will still be 16 bits. It will still
158 * be in the lowest 16 bits of the event number. Though
159 * perf_event_code() on SPARC will expect those bits to
160  *   perf_event_code() on SPARC will expect those bits to be
161 *
162 * - On SPARC we support only two attributes, "user" and "system",
163 * which hwcdrv_pcl.c already converts to the "exclude_user"
164 * and "exclude_kernel" fields expected by perf_event_open().
165 * "user" and "system" are stored in event bits 16 and 17.
166 * For M8, a 4-bit mask of supported PICs is stored in bits [23:20].
167 */
168  */
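/* Illustrative example (hypothetical event number, not taken from any
 * chip table): for a raw event 0x3c with attributes "umask=0x1,user",
 * hwcfuncs_get_x86_eventsel() below would produce
 *     0x3c | (0x1 << 8) | (0x1 << 16)      event | umask | usr
 * ORed with the "int" (bit 20) and "enable" (bit 22) bits that
 * perfctr_evntsel_enable_bits always sets.  */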
169 IS_GLOBAL hwcdrv_get_eventnum_fn_t *hwcdrv_get_x86_eventnum = 0;
170
171 static const attr_info_t perfctr_sparc_attrs[] = {
172 {NTXT ("user"), 0, 0x01, 16}, //usr
173 {NTXT ("system"), 0, 0x01, 17}, //os
174 {NULL, 0, 0x00, 0},
175 };
176 static const attr_info_t perfctr_x64_attrs[] = {/* ok for Core2 & later */
177 {NTXT ("umask"), 0, 0xff, 8},
178 {NTXT ("user"), 0, 0x01, 16}, //usr
179 //{NTXT("nouser"), 1, 0x01, 16}, //usr (inverted)
180 {NTXT ("system"), 0, 0x01, 17}, //os
181 {NTXT ("edge"), 0, 0x01, 18},
182 {NTXT ("pc"), 0, 0x01, 19},
183 {NTXT ("inv"), 0, 0x01, 23},
184 {NTXT ("cmask"), 0, 0xff, 24},
185 {NULL, 0, 0x00, 0},
186 };
187 const attr_info_t *perfctr_attrs_table = perfctr_x64_attrs;
188
189 static const eventsel_t perfctr_evntsel_enable_bits = (0x01 << 16) | /* usr */
190 // (0xff << 0) | /* event*/
191 // (0xff << 8) | /* umask */
192 // (0x01 << 17) | /* os */
193 // (0x01 << 18) | /* edge */
194 // (0x01 << 19) | /* pc */
195 (0x01 << 20) | /* int */
196 // (0x01 << 21) | /* reserved */
197 (0x01 << 22) | /* enable */
198 // (0x01 << 23) | /* inv */
199 // (0xff << 24) | /* cmask */
200 0;
201
202 static int
203 myperfctr_get_x86_eventnum (const char *eventname, uint_t pmc,
204 eventsel_t *eventsel, eventsel_t *valid_umask,
205 uint_t *pmc_sel)
206 {
207 if (hwcdrv_get_x86_eventnum &&
208 !hwcdrv_get_x86_eventnum (eventname, pmc, eventsel, valid_umask, pmc_sel))
209 return 0;
210
211 /* check for numerically-specified counters */
212 char * endptr;
213 uint64_t num = strtoull (eventname, &endptr, 0);
214 if (*eventname && !*endptr)
215 {
216 *eventsel = EXTENDED_EVNUM_2_EVSEL (num);
217 *valid_umask = 0xff; /* allow any umask (unused for SPARC?) */
218 *pmc_sel = pmc;
219 return 0;
220 }
221
222 /* name does not specify a numeric value */
223 *eventsel = (eventsel_t) - 1;
224 *valid_umask = 0x0;
225 *pmc_sel = pmc;
226 return -1;
227 }
228
229 static int
230 mask_shift_set (eventsel_t *presult, eventsel_t invalue,
231 eventsel_t mask, eventsel_t shift)
232 {
233 if (invalue & ~mask)
234 return -1; /* invalue attempts to set bits outside of mask */
235 *presult &= ~(mask << shift); /* clear all the mask bits */
236 *presult |= (invalue << shift); /* set bits according to invalue */
237 return 0;
238 }
239
240 static int
241 set_x86_attr_bits (eventsel_t *result_mask, eventsel_t evnt_valid_umask,
242 hwcfuncs_attr_t attrs[], int nattrs, const char*nameOnly)
243 {
244 eventsel_t evntsel = *result_mask;
245 for (int ii = 0; ii < (int) nattrs; ii++)
246 {
247 const char *attrname = attrs[ii].ca_name;
248 eventsel_t attrval = (eventsel_t) attrs[ii].ca_val;
249 const char *tmpname;
250 int attr_found = 0;
251 for (int jj = 0; (tmpname = perfctr_attrs_table[jj].attrname); jj++)
252 {
253 if (strcmp (attrname, tmpname) == 0)
254 {
255 if (strcmp (attrname, "umask") == 0)
256 {
257 if (attrval & ~evnt_valid_umask)
258 {
259 logerr (GTXT ("for `%s', allowable umask bits are: 0x%llx\n"),
260 nameOnly, (long long) evnt_valid_umask);
261 return -1;
262 }
263 }
264 if (mask_shift_set (&evntsel,
265 perfctr_attrs_table[jj].is_inverted ? (attrval^1) : attrval,
266 perfctr_attrs_table[jj].mask,
267 perfctr_attrs_table[jj].shift))
268 {
269 logerr (GTXT ("`%s' attribute `%s' could not be set to 0x%llx\n"),
270 nameOnly, attrname, (long long) attrval);
271 return -1;
272 }
273 TprintfT (DBG_LT2, "hwcfuncs: Counter %s, attribute %s set to 0x%llx\n",
274 nameOnly, attrname, (long long) attrval);
275 attr_found = 1;
276 break;
277 }
278 }
279 if (!attr_found)
280 {
281 logerr (GTXT ("attribute `%s' is invalid\n"), attrname);
282 return -1;
283 }
284 }
285 *result_mask = evntsel;
286 return 0;
287 }
288
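/* Build the full event selector for one counter: parse the internal
 * counter name into a base name plus attributes, resolve the base name
 * (or a numeric event specification) via myperfctr_get_x86_eventnum(),
 * fold the attribute bits on top of perfctr_evntsel_enable_bits, and
 * return the combined selector together with the PMC selection.  */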
289 IS_GLOBAL int
290 hwcfuncs_get_x86_eventsel (unsigned int regno, const char *int_name,
291 eventsel_t *return_event, uint_t *return_pmc_sel)
292 {
293 hwcfuncs_attr_t attrs[HWCFUNCS_MAX_ATTRS + 1];
294 unsigned nattrs = 0;
295 char *nameOnly = NULL;
296 eventsel_t evntsel = 0; // event number
297 eventsel_t evnt_valid_umask = 0;
298 uint_t pmc_sel = 0;
299 int rc = -1;
300 *return_event = 0;
301 *return_pmc_sel = 0;
302 void *attr_mem = hwcfuncs_parse_attrs (int_name, attrs, HWCFUNCS_MAX_ATTRS,
303 &nattrs, NULL);
304 if (!attr_mem)
305 {
306 logerr (GTXT ("out of memory, could not parse attributes\n"));
307 return -1;
308 }
309 hwcfuncs_parse_ctr (int_name, NULL, &nameOnly, NULL, NULL, NULL);
310 if (regno == REGNO_ANY)
311 {
312 logerr (GTXT ("reg# could not be determined for `%s'\n"), nameOnly);
313 goto attr_wrapup;
314 }
315
316 /* look up evntsel */
317 if (myperfctr_get_x86_eventnum (nameOnly, regno,
318 &evntsel, &evnt_valid_umask, &pmc_sel))
319 {
320 logerr (GTXT ("counter `%s' is not valid\n"), nameOnly);
321 goto attr_wrapup;
322 }
323 TprintfT (DBG_LT1, "hwcfuncs: event=0x%llx pmc=0x%x '%s' nattrs = %u\n",
324 (long long) evntsel, pmc_sel, nameOnly, nattrs);
325
326 /* determine event attributes */
327 eventsel_t evnt_attrs = perfctr_evntsel_enable_bits;
328 if (set_x86_attr_bits (&evnt_attrs, evnt_valid_umask, attrs, nattrs, nameOnly))
329 goto attr_wrapup;
330 if (evntsel & evnt_attrs)
331 TprintfT (DBG_LT0, "hwcfuncs: ERROR - evntsel & enable bits overlap: 0x%llx 0x%llx 0x%llx\n",
332 (long long) evntsel, (long long) evnt_attrs,
333 (long long) (evntsel & evnt_attrs));
334 *return_event = evntsel | evnt_attrs;
335 *return_pmc_sel = pmc_sel;
336 rc = 0;
337
338 attr_wrapup:
339 free (attr_mem);
340 free (nameOnly);
341 return rc;
342 }
343
344 #ifdef __x86_64__
345 #define syscall_instr "syscall"
346 #define syscall_clobber "rcx", "r11", "memory"
347 #endif
348 #ifdef __i386__
349 #define syscall_instr "int $0x80"
350 #define syscall_clobber "memory"
351 #endif
352
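/* Thin wrapper around the perf_event_open system call; glibc provides
 * no wrapper for it, so it is invoked through syscall(2).  */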
353 static inline int
354 perf_event_open (struct perf_event_attr *hw_event_uptr, pid_t pid,
355 int cpu, int group_fd, unsigned long flags)
356 {
357 /* It seems that perf_event_open() sometimes fails spuriously,
358    * even though an immediate retry succeeds.
359 * So, let's try a few retries if the call fails just to be sure.
360 */
361 int rc;
362 for (int retry = 0; retry < 5; retry++)
363 {
364 rc = syscall (__NR_perf_event_open, hw_event_uptr, pid, cpu, group_fd, flags);
365 if (rc != -1)
366 return rc;
367 }
368 return rc;
369 }
370
371 /*---------------------------------------------------------------------------*/
372 /* macros & fwd prototypes */
373
374 #define HWCDRV_API static /* Mark functions used by hwcdrv API */
375
376 HWCDRV_API int hwcdrv_start (void);
377 HWCDRV_API int hwcdrv_free_counters ();
378
379 static pid_t
380 hwcdrv_gettid (void)
381 {
382 #ifndef LIBCOLLECTOR_SRC
383 return syscall (__NR_gettid);
384 #elif defined(intel)
385 pid_t r;
386 __asm__ __volatile__(syscall_instr
387 : "=a" (r) : "0" (__NR_gettid)
388 : syscall_clobber);
389 return r;
390 #else
391 return syscall (__NR_gettid); // FIXUP_XXX_SPARC_LINUX // write gettid in asm
392 #endif
393 }
394
395 /*---------------------------------------------------------------------------*/
396 /* types */
397
398 #define NPAGES_PER_BUF 1 // number of pages to be used for perf_event samples
399 // must be a power of 2
400
401 /*---------------------------------------------------------------------------*/
402
403 /* typedefs */
404
405 typedef struct
406 { // event (hwc) definition
407 unsigned int reg_num; // PMC assignment, potentially for detecting conflicts
408 eventsel_t eventsel; // raw event bits (Intel/AMD)
409 uint64_t counter_preload; // number of HWC events before signal
410 struct perf_event_attr hw; // perf_event definition
411 hrtime_t min_time; // minimum time we're targeting between events
412 char *name;
413 } perf_event_def_t;
414
415 typedef struct
416 { // runtime state of perf_event buffer
417 void *buf; // pointer to mmapped buffer
418 size_t pagesz; // size of pages
419 } buffer_state_t;
420
421 typedef struct
422 { // runtime state of counter values
423 uint64_t prev_ena_ts; // previous perf_event "enabled" time
424 uint64_t prev_run_ts; // previous perf_event "running" time
425 uint64_t prev_value; // previous HWC value
426 } counter_value_state_t;
427
428 typedef struct
429 { // per-counter information
430 perf_event_def_t *ev_def; // global HWC definition for one counter
431 int fd; // perf_event fd
432 buffer_state_t buf_state; // perf_event buffer's state
433 counter_value_state_t value_state; // counter state
434 int needs_restart; // workaround for dbx failure to preserve si_fd
435 uint64_t last_overflow_period;
436 hrtime_t last_overflow_time;
437 } counter_state_t;
438
439 typedef struct
440 { // per-thread context
441 counter_state_t *ctr_list;
442 int signal_fd; // fd that caused the most recent signal
443 pid_t tid; // for debugging signal delivery problems
444 } hdrv_pcl_ctx_t;
445
446 /*---------------------------------------------------------------------------*/
447
448 /* static variables */
449 static struct
450 {
451 int library_ok;
452 int internal_open_called;
453 hwcfuncs_tsd_get_fn_t find_vpc_ctx;
454 unsigned hwcdef_cnt; /* number of *active* hardware counters */
455 hwcdrv_get_events_fn_t *get_events;
456 } hdrv_pcl_state;
457
458 static hwcdrv_about_t hdrv_pcl_about = {.cpcN_cpuver = CPUVER_UNDEFINED};
459 static perf_event_def_t global_perf_event_def[MAX_PICS];
460
461 #define COUNTERS_ENABLED() (hdrv_pcl_state.hwcdef_cnt)
462
463
464 /* perf_event buffer formatting and handling */
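/* The area mmap()ed in start_one_ctr() is one metadata page
 * (struct perf_event_mmap_page) followed by NPAGES_PER_BUF data pages
 * used as a ring buffer.  The kernel advances data_head as it appends
 * records; the readers below consume bytes starting at data_tail and
 * then advance data_tail so the kernel knows the space has been
 * drained.  Offsets into the data area wrap modulo NPAGES_PER_BUF
 * times the page size.  */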
465 static void
466 reset_buf (buffer_state_t *bufstate)
467 {
468 TprintfT (0, "hwcdrv: ERROR: perf_event reset_buf() called!\n");
469 struct perf_event_mmap_page *metadata = bufstate->buf;
470 if (metadata)
471 metadata->data_tail = metadata->data_head;
472 }
473
474 static int
475 skip_buf (buffer_state_t *bufstate, size_t sz)
476 {
477 TprintfT (DBG_LT1, "hwcdrv: WARNING: perf_event skip_buf called!\n");
478 struct perf_event_mmap_page *metadata = bufstate->buf;
479 if (metadata == NULL)
480 return -1;
481 size_t pgsz = bufstate->pagesz;
482 size_t bufsz = NPAGES_PER_BUF*pgsz;
483 uint64_t d_tail = metadata->data_tail;
484 uint64_t d_head = metadata->data_head;
485
486 // validate request size
487 if (sz > d_head - d_tail || sz >= bufsz)
488 {
489 reset_buf (bufstate);
490 return -1;
491 }
492 metadata->data_tail = d_tail + sz; // advance tail
493 return 0;
494 }
495
496 static int
497 read_buf (buffer_state_t *bufstate, void *buf, size_t sz)
498 {
499 struct perf_event_mmap_page *metadata = bufstate->buf;
500 if (metadata == NULL)
501 return -1;
502 size_t pgsz = bufstate->pagesz;
503 size_t bufsz = NPAGES_PER_BUF*pgsz;
504 uint64_t d_tail = metadata->data_tail;
505 uint64_t d_head = metadata->data_head;
506
507 // validate request size
508 if (sz > d_head - d_tail || sz >= bufsz)
509 {
510 reset_buf (bufstate);
511 return -1;
512 }
513 char *buf_base = ((char *) metadata) + pgsz; // start of data buffer
514 uint64_t start_pos = d_tail & (bufsz - 1); // char offset into data buffer
515 size_t nbytes = sz;
516 if (start_pos + sz > bufsz)
517 {
518 // will wrap past end of buffer
519 nbytes = bufsz - start_pos;
520 memcpy (buf, buf_base + start_pos, nbytes);
521 start_pos = 0; // wrap to start
522 buf = (void *) (((char *) buf) + nbytes);
523 nbytes = sz - nbytes;
524 }
525 memcpy (buf, buf_base + start_pos, nbytes);
526 metadata->data_tail += sz;
527 return 0;
528 }
529
530 static int
531 read_u64 (buffer_state_t *bufstate, uint64_t *value)
532 {
533 return read_buf (bufstate, value, sizeof (uint64_t));
534 }
535
536 static int
537 read_sample (counter_state_t *ctr_state, int msgsz, uint64_t *rvalue,
538 uint64_t *rlost)
539 {
540   // returns 0 on success or a negative error code;
      // verifies that exactly msgsz bytes were consumed
541 buffer_state_t *bufstate = &ctr_state->buf_state;
542 counter_value_state_t *cntstate = &ctr_state->value_state;
543 int readsz = 0;
544
545 // PERF_SAMPLE_IP
546 uint64_t ipc = 0;
547 int rc = read_u64 (bufstate, &ipc);
548 if (rc)
549 return -1;
550 readsz += sizeof (uint64_t);
551
552 // PERF_SAMPLE_READ: value
553 uint64_t value = 0;
554 rc = read_u64 (bufstate, &value);
555 if (rc)
556 return -2;
557 readsz += sizeof (uint64_t);
558
559 /* Bug 20806896
560 * Old Linux kernels (e.g. 2.6.32) on certain systems return enabled and
561 * running times in the sample data that correspond to the metadata times
562 * metadata->time_enabled
563 * metadata->time_running
564 * from the PREVIOUS (not current) sample. Probably just ignore this bug
565 * since it's on old kernels and we only use the enabled and running times
566 * to construct loss_estimate.
567 */
568 // PERF_SAMPLE_READ: PERF_FORMAT_ENABLED
569 uint64_t enabled_time = 0;
570 rc = read_u64 (bufstate, &enabled_time);
571 if (rc)
572 return -3;
573 readsz += sizeof (uint64_t);
574
575 // PERF_SAMPLE_READ: PERF_FORMAT_RUNNING
576 uint64_t running_time = 0;
577 rc = read_u64 (bufstate, &running_time);
578 if (rc)
579 return -4;
580 readsz += sizeof (uint64_t);
581
582 uint64_t value_delta = value - cntstate->prev_value;
583 uint64_t enabled_delta = enabled_time - cntstate->prev_ena_ts;
584 uint64_t running_delta = running_time - cntstate->prev_run_ts;
585 cntstate->prev_value = value;
586 cntstate->prev_ena_ts = enabled_time;
587 cntstate->prev_run_ts = running_time;
588
589 // 24830461 need workaround for Linux anomalous HWC skid overrun
590 int set_error_flag = 0;
591 if (value_delta > 2 * ctr_state->last_overflow_period + 2000 /* HWC_SKID_TOLERANCE */)
592 set_error_flag = 1;
593
594 uint64_t loss_estimate = 0; // estimate loss of events caused by multiplexing
595 if (running_delta == enabled_delta)
596 {
597 // counter was running 100% of time, no multiplexing
598 }
599 else if (running_delta == 0)
600 loss_estimate = 1; // token amount to aid in debugging perfctr oddities
601 else if ((running_delta > enabled_delta) || (enabled_delta & 0x1000000000000000ll))
602 {
603 // running should be smaller than enabled, can't estimate
604 /*
605 * 21418391 HWC can have a negative count
606 *
607 * We've also seen enabled not only be smaller than running
608 * but in fact go negative. Guard against this.
609 */
610 loss_estimate = 2; // token amount to aid in debugging perfctr oddities
611 }
612 else
613 {
614 // counter was running less than 100% of time
615 // Example: ena=7772268 run=6775669 raw_value=316004 scaled_value=362483 loss_est=46479
616 uint64_t scaled_delta = (double) value_delta * enabled_delta / running_delta;
617 value_delta = scaled_delta;
618 #if 0
619 // We should perhaps warn the user that multiplexing is going on,
620 // but hwcdrv_pcl.c doesn't know about the collector_interface, SP_JCMD_COMMENT, or COL_COMMENT_* values.
621 // For now we simply don't report.
622 // Perhaps we should address the issue not here but in the caller collector_sigemt_handler(),
623 // but at that level "lost" has a meaning that's considerably broader than just multiplexing.
624 collector_interface->writeLog ("<event kind=\"%s\" id=\"%d\">%s %d -> %d</event>\n",
625 SP_JCMD_COMMENT, COL_COMMENT_HWCADJ, global_perf_event_def[idx].name,
626 ctr_list[idx].last_overflow_period, new_period);
627 #endif
628 }
629 TprintfT ((loss_estimate || set_error_flag) ? DBG_LT1 : DBG_LT3,
630 "hwcdrv: '%s' ipc=0x%llx ena=%llu run=%llu "
631 "value_delta=%lld(0x%llx) loss_est=%llu %s error_flag='%s'\n",
632 ctr_state->ev_def->name, (long long) ipc,
633 (long long) enabled_delta, (long long) running_delta,
634 (long long) value_delta, (long long) value_delta,
635 (unsigned long long) loss_estimate,
636 loss_estimate ? ", WARNING - SCALED" : "",
637 set_error_flag ? ", ERRORFLAG" : "");
638 if (set_error_flag == 1)
639 value_delta |= (1ULL << 63) /* HWCVAL_ERR_FLAG */;
640 *rvalue = value_delta;
641 *rlost = loss_estimate;
642 if (readsz != msgsz)
643 {
644 TprintfT (0, "hwcdrv: ERROR: perf_event sample not fully parsed\n");
645 return -5;
646 }
647 return 0;
648 }
649
650 static void
651 dump_perf_event_attr (struct perf_event_attr *at)
652 {
653 TprintfT (DBG_LT2, "dump_perf_event_attr: size=%d type=%d sample_period=%lld\n"
654 " config=0x%llx config1=0x%llx config2=0x%llx wakeup_events=%lld __reserved_1=%lld\n",
655 (int) at->size, (int) at->type, (unsigned long long) at->sample_period,
656 (unsigned long long) at->config, (unsigned long long) at->config1,
657 (unsigned long long) at->config2, (unsigned long long) at->wakeup_events,
658 (unsigned long long) at->__reserved_1);
659 #define DUMP_F(fld) if (at->fld) TprintfT(DBG_LT2, " %-10s : %lld\n", #fld, (long long) at->fld)
660 DUMP_F (disabled);
661 DUMP_F (inherit);
662 DUMP_F (pinned);
663 DUMP_F (exclusive);
664 DUMP_F (exclude_user);
665 DUMP_F (exclude_kernel);
666 DUMP_F (exclude_hv);
667 DUMP_F (exclude_idle);
668 // DUMP_F(xmmap);
669 DUMP_F (comm);
670 DUMP_F (freq);
671 DUMP_F (inherit_stat);
672 DUMP_F (enable_on_exec);
673 DUMP_F (task);
674 DUMP_F (watermark);
675 }
676
677 static void
678 init_perf_event (struct perf_event_attr *hw, uint64_t event, uint64_t period,
679 Hwcentry *hwce)
680 {
681 memset (hw, 0, sizeof (struct perf_event_attr));
682 hw->size = sizeof (struct perf_event_attr);
683 if (hwce && hwce->use_perf_event_type)
684 {
685 hw->config = hwce->config;
686 hw->type = hwce->type;
687 }
688 else
689 { // backward compatibility. The old interface had no 'hwce' argument.
690 hw->config = event;
691 hw->type = PERF_TYPE_RAW; // hw/sw/trace/raw...
692 }
693 hw->sample_period = period;
694 hw->sample_type = PERF_SAMPLE_IP |
695 // PERF_SAMPLE_TID |
696 // PERF_SAMPLE_TIME | // possibly interesting
697 // PERF_SAMPLE_ADDR |
698 PERF_SAMPLE_READ | // HWC value
699 // PERF_SAMPLE_CALLCHAIN | // interesting
700 // PERF_SAMPLE_ID |
701 // PERF_SAMPLE_CPU | // possibly interesting
702 // PERF_SAMPLE_PERIOD |
703 // PERF_SAMPLE_STREAM_ID |
704 // PERF_SAMPLE_RAW |
705 0;
706 hw->read_format =
707 PERF_FORMAT_TOTAL_TIME_ENABLED | // detect when hwc not scheduled
708 PERF_FORMAT_TOTAL_TIME_RUNNING | // detect when hwc not scheduled
709 // PERF_FORMAT_ID |
710 // PERF_FORMAT_GROUP |
711 0;
712 hw->disabled = 1; /* off by default */
713
714 // Note: the following override config.priv bits!
715 hw->exclude_user = (event & (1 << 16)) == 0; /* don't count user */
716 hw->exclude_kernel = (event & (1 << 17)) == 0; /* ditto kernel */
717 hw->exclude_hv = 1; /* ditto hypervisor */
718 hw->wakeup_events = 1; /* wakeup every n events */
719 dump_perf_event_attr (hw);
720 }
721
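/* Open and arm one perf_event counter for the calling thread: open the
 * fd with the current overflow period, mmap the sample ring buffer
 * (one metadata page plus NPAGES_PER_BUF data pages), and set O_ASYNC,
 * F_SETOWN_EX (targeting this tid), and F_SETSIG so that overflows are
 * delivered as SIGIO with siginfo->si_fd identifying the counter.
 * Returns 0 on success, 1 on failure.  */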
722 static int
723 start_one_ctr (int ii, size_t pgsz, hdrv_pcl_ctx_t * pctx, char *error_string)
724 {
725 // pe_attr should have been initialized in hwcdrv_create_counters()
726 struct perf_event_attr pe_attr;
727 memcpy (&pe_attr, &global_perf_event_def[ii].hw, sizeof (pe_attr));
728
729 // but we adjust the period, so make sure that pctx->ctr_list[ii].last_overflow_period has been set
730 pe_attr.sample_period = pctx->ctr_list[ii].last_overflow_period;
731
732 int hwc_fd = perf_event_open (&pe_attr, pctx->tid, -1, -1, 0);
733 if (hwc_fd == -1)
734 {
735 TprintfT (DBG_LT1, "%s idx=%d perf_event_open failed, errno=%d\n",
736 error_string, ii, errno);
737 return 1;
738 }
739
740 size_t buffer_area_sz = (NPAGES_PER_BUF + 1) * pgsz; // add a page for metadata
741 void * buf = mmap (NULL, buffer_area_sz, //YXXX is this a safe call?
742 PROT_READ | PROT_WRITE, MAP_SHARED, hwc_fd, 0);
743 if (buf == MAP_FAILED)
744 {
745 TprintfT (0, "sz = %ld, pgsz = %ld\n err=%s idx=%d mmap failed: %s\n",
746 (long) buffer_area_sz, (long) pgsz, error_string, ii, strerror (errno));
747 return 1;
748 }
749   pctx->ctr_list[ii].ev_def = &global_perf_event_def[ii]; // read_sample() uses ev_def->name for diagnostics
750 pctx->ctr_list[ii].fd = hwc_fd;
751 pctx->ctr_list[ii].buf_state.buf = buf;
752 pctx->ctr_list[ii].buf_state.pagesz = pgsz;
753 pctx->ctr_list[ii].value_state.prev_ena_ts = 0;
754 pctx->ctr_list[ii].value_state.prev_run_ts = 0;
755 pctx->ctr_list[ii].value_state.prev_value = 0;
756 pctx->ctr_list[ii].last_overflow_time = gethrtime ();
757
758 /* set async mode */
759 long flags = fcntl (hwc_fd, F_GETFL, 0) | O_ASYNC;
760 int rc = fcntl (hwc_fd, F_SETFL, flags);
761 if (rc == -1)
762 {
763 TprintfT (0, "%s idx=%d O_ASYNC failed\n", error_string, ii);
764 return 1;
765 }
766
767 /*
768 * set lwp ownership of the fd
769 * See BUGS section of "man perf_event_open":
770 * The F_SETOWN_EX option to fcntl(2) is needed to properly get
771 * overflow signals in threads. This was introduced in Linux 2.6.32.
772 * Legacy references:
773 * see http://lkml.org/lkml/2009/8/4/128
774 * google man fcntl F_SETOWN_EX -conflict
775 * "From Linux 2.6.32 onward, use F_SETOWN_EX to target
776 * SIGIO and SIGURG signals at a particular thread."
777 * http://icl.cs.utk.edu/papi/docs/da/d2a/examples__v2_8x_2self__smpl__multi_8c.html
778 * See 2010 CSCADS presentation by Eranian
779 */
780 struct f_owner_ex fowner_ex;
781 fowner_ex.type = F_OWNER_TID;
782 fowner_ex.pid = pctx->tid;
783 rc = fcntl (hwc_fd, F_SETOWN_EX, (unsigned long) &fowner_ex);
784 if (rc == -1)
785 {
786 TprintfT (0, "%s idx=%d F_SETOWN failed\n", error_string, ii);
787 return 1;
788 }
789
790 /* Use sigio so handler can determine FD via siginfo->si_fd. */
791 rc = fcntl (hwc_fd, F_SETSIG, SIGIO);
792 if (rc == -1)
793 {
794 TprintfT (0, "%s idx=%d F_SETSIG failed\n", error_string, ii);
795 return 1;
796 }
797 return 0;
798 }
799
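/* Disable one counter, unmap its sample buffer, and close its fd.
 * Returns 0 on success or HWCFUNCS_ERROR_GENERIC if any step fails.  */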
800 static int
801 stop_one_ctr (int ii, counter_state_t *ctr_list)
802 {
803 int hwc_rc = 0;
804 if (-1 == ioctl (ctr_list[ii].fd, PERF_EVENT_IOC_DISABLE, 1))
805 {
806 TprintfT (0, "hwcdrv: ERROR: PERF_EVENT_IOC_DISABLE #%d failed: errno=%d\n", ii, errno);
807 hwc_rc = HWCFUNCS_ERROR_GENERIC;
808 }
809 void *buf = ctr_list[ii].buf_state.buf;
810 if (buf)
811 {
812 size_t bufsz = (NPAGES_PER_BUF + 1) * ctr_list[ii].buf_state.pagesz;
813 ctr_list[ii].buf_state.buf = NULL;
814 int tmprc = munmap (buf, bufsz);
815 if (tmprc)
816 {
817 TprintfT (0, "hwcdrv: ERROR: munmap() #%d failed: errno=%d\n", ii, errno);
818 hwc_rc = HWCFUNCS_ERROR_GENERIC;
819 }
820 }
821 if (-1 == close (ctr_list[ii].fd))
822 {
823 TprintfT (0, "hwcdrv: ERROR: close(fd) #%d failed: errno=%d\n", ii, errno);
824 hwc_rc = HWCFUNCS_ERROR_GENERIC;
825 }
826 return hwc_rc;
827 }
828
829 /* HWCDRV_API for thread-specific actions */
830 HWCDRV_API int
831 hwcdrv_lwp_init (void)
832 {
833 return hwcdrv_start ();
834 }
835
836 HWCDRV_API void
837 hwcdrv_lwp_fini (void)
838 {
839 hwcdrv_free_counters (); /* also sets pctx->ctr_list=NULL; */
840 }
841
842 /* open */
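/* Probe whether perf_event profiling is usable: open a trial
 * PERF_COUNT_HW_INSTRUCTIONS event, verify that F_SETOWN_EX works
 * (needed for per-thread signal delivery), then pick the first PCBE
 * backend that recognizes the CPU and record its counter descriptions
 * and limits in hdrv_pcl_about.  */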
843 static int
844 hdrv_pcl_internal_open ()
845 {
846 if (hdrv_pcl_state.internal_open_called)
847 {
848 TprintfT (0, "hwcdrv: WARNING: hdrv_pcl_internal_open: already called\n");
849 return HWCFUNCS_ERROR_ALREADY_CALLED;
850 }
851
852 // determine if PCL is available
853 perf_event_def_t tmp_event_def;
854 memset (&tmp_event_def, 0, sizeof (tmp_event_def));
855 struct perf_event_attr *pe_attr = &tmp_event_def.hw;
856 init_perf_event (pe_attr, 0, 0, NULL);
857 pe_attr->type = PERF_TYPE_HARDWARE; // specify abstracted HW event
858 pe_attr->config = PERF_COUNT_HW_INSTRUCTIONS; // specify abstracted insts
859 int hwc_fd = perf_event_open (pe_attr,
860 0, // pid/tid, 0 is self
861 -1, // cpu, -1 is per-thread mode
862 -1, // group_fd, -1 is root
863 0); // flags
864 if (hwc_fd == -1)
865 {
866 TprintfT (DBG_LT1, "hwcdrv: WARNING: hdrv_pcl_internal_open:"
867 " perf_event_open() failed, errno=%d\n", errno);
868 goto internal_open_error;
869 }
870
871 /* see if the PCL is new enough to know about F_SETOWN_EX */
872 struct f_owner_ex fowner_ex;
873 fowner_ex.type = F_OWNER_TID;
874 fowner_ex.pid = hwcdrv_gettid (); // "pid=tid" is correct w/F_OWNER_TID
875 if (fcntl (hwc_fd, F_SETOWN_EX, (unsigned long) &fowner_ex) == -1)
876 {
877 TprintfT (DBG_LT1, "hwcdrv: WARNING: hdrv_pcl_internal_open: "
878 "F_SETOWN failed, errno=%d\n", errno);
879 close (hwc_fd);
880 goto internal_open_error;
881 }
882 close (hwc_fd);
883
884 hdrv_pcl_state.internal_open_called = 1;
885 hdrv_pcl_state.library_ok = 1; // set to non-zero to show it's initted
886 hdrv_pcl_about.cpcN_cpuver = CPUVER_UNDEFINED;
887 TprintfT (DBG_LT2, "hwcdrv: hdrv_pcl_internal_open()\n");
888 for (int ii = 0; hdrv_pcbe_drivers[ii]; ii++)
889 {
890 hdrv_pcbe_api_t *ppcbe = hdrv_pcbe_drivers[ii];
891 if (!ppcbe->hdrv_pcbe_init ())
892 {
893 hdrv_pcl_about.cpcN_cciname = ppcbe->hdrv_pcbe_impl_name ();
894 hdrv_pcl_about.cpcN_cpuver = hwcdrv_lookup_cpuver (hdrv_pcl_about.cpcN_cciname);
895 if (hdrv_pcl_about.cpcN_cpuver == CPUVER_UNDEFINED)
896 goto internal_open_error;
897 hdrv_pcl_about.cpcN_npics = ppcbe->hdrv_pcbe_ncounters ();
898 hdrv_pcl_about.cpcN_docref = ppcbe->hdrv_pcbe_cpuref ();
899 hdrv_pcl_state.get_events = ppcbe->hdrv_pcbe_get_events;
900 hwcdrv_get_x86_eventnum = ppcbe->hdrv_pcbe_get_eventnum;
901 break;
902 }
903 }
904 if (hdrv_pcl_about.cpcN_npics > MAX_PICS)
905 {
906 TprintfT (0, "hwcdrv: WARNING: hdrv_pcl_internal_open:"
907 " reducing number of HWCs from %u to %u on processor '%s'\n",
908 hdrv_pcl_about.cpcN_npics, MAX_PICS, hdrv_pcl_about.cpcN_cciname);
909 hdrv_pcl_about.cpcN_npics = MAX_PICS;
910 }
911 TprintfT (DBG_LT1, "hwcdrv: hdrv_pcl_internal_open:"
912 " perf_event cpuver=%d, name='%s'\n",
913 hdrv_pcl_about.cpcN_cpuver, hdrv_pcl_about.cpcN_cciname);
914 return 0;
915
916 internal_open_error:
917 hdrv_pcl_about.cpcN_cpuver = CPUVER_UNDEFINED;
918 hdrv_pcl_about.cpcN_npics = 0;
919 hdrv_pcl_about.cpcN_docref = NULL;
920 hdrv_pcl_about.cpcN_cciname = NULL;
921 return HWCFUNCS_ERROR_NOT_SUPPORTED;
922 }
923
924 static void *
925 single_thread_tsd_ftn ()
926 {
927 static hdrv_pcl_ctx_t tsd_context;
928 return &tsd_context;
929 }
930
931 /* HWCDRV_API */
932 HWCDRV_API int
933 hwcdrv_init (hwcfuncs_abort_fn_t abort_ftn, int *tsd_sz)
934 {
935 hdrv_pcl_state.find_vpc_ctx = single_thread_tsd_ftn;
936 if (tsd_sz)
937 *tsd_sz = sizeof (hdrv_pcl_ctx_t);
938
939 if (hdrv_pcl_state.internal_open_called)
940 return HWCFUNCS_ERROR_ALREADY_CALLED;
941 return hdrv_pcl_internal_open ();
942 }
943
944 HWCDRV_API void
945 hwcdrv_get_info (int *cpuver, const char **cciname, uint_t *npics,
946 const char **docref, uint64_t *support)
947 {
948 if (cpuver)
949 *cpuver = hdrv_pcl_about.cpcN_cpuver;
950 if (cciname)
951 *cciname = hdrv_pcl_about.cpcN_cciname;
952 if (npics)
953 *npics = hdrv_pcl_about.cpcN_npics;
954 if (docref)
955 *docref = hdrv_pcl_about.cpcN_docref;
956 if (support)
957 *support = HWCFUNCS_SUPPORT_OVERFLOW_PROFILING | HWCFUNCS_SUPPORT_OVERFLOW_CTR_ID;
958 }
959
960 HWCDRV_API int
961 hwcdrv_enable_mt (hwcfuncs_tsd_get_fn_t tsd_ftn)
962 {
963 if (tsd_ftn)
964 hdrv_pcl_state.find_vpc_ctx = tsd_ftn;
965 else
966 {
967 TprintfT (0, "hwcdrv: ERROR: enable_mt(): tsd_ftn==NULL\n");
968 return HWCFUNCS_ERROR_UNAVAIL;
969 }
970 return 0;
971 }
972
973 HWCDRV_API int
974 hwcdrv_get_descriptions (hwcf_hwc_cb_t *hwc_cb, hwcf_attr_cb_t *attr_cb)
975 {
976 int count = 0;
977 if (hwc_cb && hdrv_pcl_state.get_events)
978 count = hdrv_pcl_state.get_events (hwc_cb);
979 if (attr_cb)
980 for (int ii = 0; perfctr_attrs_table && perfctr_attrs_table[ii].attrname; ii++)
981 attr_cb (perfctr_attrs_table[ii].attrname);
982 if (!count)
983 return -1;
984 return 0;
985 }
986
987 HWCDRV_API int
988 hwcdrv_assign_regnos (Hwcentry* entries[], unsigned numctrs)
989 {
990 return hwcdrv_assign_all_regnos (entries, numctrs);
991 }
992
993 static int
994 internal_hwc_start (int fd)
995 {
996 int rc = ioctl (fd, PERF_EVENT_IOC_REFRESH, 1);
997 if (rc == -1)
998 {
999 TprintfT (DBG_LT0, "hwcdrv: ERROR: internal_hwc_start:"
1000 " PERF_EVENT_IOC_REFRESH(fd=%d) failed: errno=%d\n", fd, errno);
1001 return HWCFUNCS_ERROR_UNAVAIL;
1002 }
1003 TprintfT (DBG_LT3, "hwcdrv: internal_hwc_start(fd=%d)\n", fd);
1004 return 0;
1005 }
1006
1007 HWCDRV_API int
1008 hwcdrv_overflow (siginfo_t *si, hwc_event_t *eventp, hwc_event_t *lost_events)
1009 {
1010 /* set expired counters to overflow value and all others to 0 */
1011 /* return 0: OK, counters should be restarted */
1012 /* return non-zero: eventp not set, counters should not be restarted */
1013 /* clear return values */
1014 int ii;
1015 for (ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
1016 {
1017 eventp->ce_pic[ii] = 0;
1018 lost_events->ce_pic[ii] = 0;
1019 }
1020 hrtime_t sig_ts = gethrtime (); //YXXX get this from HWC event?
1021 eventp->ce_hrt = sig_ts;
1022 lost_events->ce_hrt = sig_ts;
1023
1024 /* determine source signal */
1025 int signal_fd = -1;
1026 switch (si->si_code)
1027 {
1028 case POLL_HUP: /* expected value from pcl */
1029 /* According to Stephane Eranian:
1030 * "expect POLL_HUP instead of POLL_IN because we are
1031 * in one-shot mode (IOC_REFRESH)"
1032 */
1033 signal_fd = si->si_fd;
1034 break;
1035 case SI_TKILL: /* event forwarded by tkill */
1036 /* DBX can only forward SI_TKILL when it detects POLL_HUP
1037 * unfortunately, this means that si->si_fd has been lost...
1038 * We need to process the buffers, but we don't know the fd!
1039 */
1040 TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: WARNING: hwcdrv_overflow:"
1041 " SI_TKILL detected\n", sig_ts);
1042 break;
1043 default:
1044 // "sometimes we see a POLL_IN (1) with very high event rates,"
1045 // according to eranian(?)
1046 TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: ERROR: hwcdrv_overflow:"
1047 " unexpected si_code 0x%x\n", sig_ts, si->si_code);
1048 return HWCFUNCS_ERROR_GENERIC;
1049 }
1050
1051 hdrv_pcl_ctx_t * pctx = hdrv_pcl_state.find_vpc_ctx ();
1052 if (!pctx)
1053 {
1054 TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: ERROR: hwcdrv_overflow:"
1055 " tsd context is NULL\n", sig_ts);
1056 return HWCFUNCS_ERROR_UNEXPECTED;
1057 }
1058 counter_state_t * ctr_list = (counter_state_t *) pctx->ctr_list;
1059 if (!ctr_list)
1060 {
1061 TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: WARNING: hwcdrv_overflow:"
1062 " ctr_list is NULL\n", sig_ts);
1063 return HWCFUNCS_ERROR_UNEXPECTED;
1064 }
1065
1066 /* clear needs_restart flag */
1067 for (ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
1068 ctr_list[ii].needs_restart = 0;
1069
1070 /* attempt to identify the counter to read */
1071 int signal_idx = -1;
1072 pctx->signal_fd = signal_fd; // save the signal provided by siginfo_t
1073 if (signal_fd != -1)
1074 {
1075 for (ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
1076 {
1077 if (ctr_list[ii].fd == signal_fd)
1078 {
1079 signal_idx = ii;
1080 break;
1081 }
1082 }
1083 }
1084
1085 if (signal_idx < 0)
1086 {
1087 TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: ERROR: hwcdrv_overflow:"
1088 " pmc not determined!\n", sig_ts);
1089 lost_events->ce_pic[0] = 1; /* record a bogus value into experiment */
1090 // note: bogus value may get overwritten in loop below
1091 }
1092
1093 /* capture sample(s). In addition to signal_idx, check other counters. */
1094 struct perf_event_header sheader;
1095 int idx;
1096 for (idx = 0; idx < hdrv_pcl_state.hwcdef_cnt; idx++)
1097 {
1098 int num_recs = 0;
1099 while (1)
1100 {
1101 /* check for samples */
1102 struct perf_event_mmap_page *metadata = ctr_list[idx].buf_state.buf;
1103 if (metadata == NULL)
1104 break; // empty
1105 if (metadata->data_tail == metadata->data_head)
1106 break; // empty
1107
1108 /* read header */
1109 if (read_buf (&ctr_list[idx].buf_state, &sheader, sizeof (sheader)))
1110 break;
1111 num_recs++;
1112
1113 /* check for PERF_RECORD_SAMPLE */
1114 size_t datasz = sheader.size - sizeof (struct perf_event_header);
1115 if (sheader.type != PERF_RECORD_SAMPLE)
1116 {
1117 TprintfT (DBG_LT2, "hwcdrv: sig_ts=%llu: WARNING: hwcdrv_overflow:"
1118 " unexpected recd type=%d\n",
1119 sig_ts, sheader.type);
1120 if (skip_buf (&ctr_list[idx].buf_state, datasz))
1121 {
1122 TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: ERROR: hwcdrv_overflow:"
1123 " skip recd type=%d failed\n", sig_ts, sheader.type);
1124 lost_events->ce_pic[idx] = 4; /* record a bogus value */
1125 break; // failed to skip buffer??
1126 }
1127 lost_events->ce_pic[idx] = 2; /* record a bogus value */
1128 continue; // advance to next record
1129 }
1130
1131 /* type is PERF_RECORD_SAMPLE */
1132 uint64_t value, lostv;
1133 if (read_sample (&ctr_list[idx], datasz, &value, &lostv))
1134 {
1135 TprintfT (DBG_LT0, "hwcdrv: sig_ts=%llu: ERROR: hwcdrv_overflow:"
1136 " read_sample() failed\n", sig_ts);
1137 lost_events->ce_pic[idx] = 3; // record a bogus value
1138 break; // failed to read sample data??
1139 }
1140 TprintfT (DBG_LT3, "hwcdrv: sig_ts=%llu: hwcdrv_overflow:"
1141 " idx=%d value=%llu lost=%llu\n", (unsigned long long) sig_ts,
1142 idx, (unsigned long long) value, (unsigned long long) lostv);
1143 if (eventp->ce_pic[idx])
1144 {
1145 TprintfT (DBG_LT2, "hwcdrv: sig_ts=%llu: WARNING: hwcdrv_overflow:"
1146 " idx=%d previous sample recorded as lost_event\n", sig_ts, idx);
1147 lost_events->ce_pic[idx] += eventp->ce_pic[idx];
1148 }
1149 eventp->ce_pic[idx] = value;
1150 lost_events->ce_pic[idx] += lostv;
1151 }
1152
1153 /* debug output for unexpected (but common) cases */
1154 if (idx == signal_idx)
1155 {
1156 if (num_recs != 1)
1157 TprintfT (DBG_LT2, "hwcdrv: sig_ts=%llu: WARNING: hwcdrv_overflow:"
1158 " %d records for signal_idx=%d\n", sig_ts, num_recs, signal_idx);
1159 }
1160 else if (num_recs)
1161 TprintfT (DBG_LT2, "hwcdrv: sig_ts=%llu: WARNING: hwcdrv_overflow:"
1162 " %d unexpected record(s) for idx=%d (signal_idx=%d)\n",
1163 sig_ts, num_recs, idx, signal_idx);
1164
1165 /* trigger counter restart whenever records were found */
1166 if (num_recs)
1167 {
1168 /* check whether to adapt the overflow interval */
1169 /* This is the Linux version.
1170 * The Solaris version is in hwprofile.c collector_update_overflow_counters().
1171 */
1172 hrtime_t min_time = global_perf_event_def[idx].min_time;
1173 if (min_time > 0 // overflow interval is adaptive
1174 && sig_ts - ctr_list[idx].last_overflow_time < min_time) // last interval below min
1175 {
1176 /* pick a new overflow interval */
1177 /* roughly doubled, but add funny numbers */
1178 /* hopefully the result is prime or not a multiple of some # of ops/loop */
1179 uint64_t new_period = 2 * ctr_list[idx].last_overflow_period + 37;
1180 #if 0
1181 // On Solaris, we report the adjustment to the log file.
1182 // On Linux it's hard for us to do so since hwcdrv_pcl.c doesn't know about collector_interface, SP_JCMD_COMMENT, or COL_COMMENT_HWCADJ.
1183 // For now we simply don't report.
1184 collector_interface->writeLog ("<event kind=\"%s\" id=\"%d\">%s %d -> %d</event>\n",
1185 SP_JCMD_COMMENT, COL_COMMENT_HWCADJ, global_perf_event_def[idx].name,
1186 ctr_list[idx].last_overflow_period, new_period);
1187 #endif
1188 /* There are a variety of ways of resetting the period on Linux.
1189 * The most elegant is
1190 * ioctl(fd,PERF_EVENT_IOC_PERIOD,&period)
1191 * but check the perf_event_open man page for PERF_EVENT_IOC_PERIOD:
1192 * > Prior to Linux 2.6.36 this ioctl always failed due to a bug in the kernel.
1193 * > Prior to Linux 3.14 (or 3.7 on ARM), the new period did not take effect
1194 * until after the next overflow.
1195 * So we're kind of stuck shutting the fd down and restarting it with the new period.
1196 */
1197 if (stop_one_ctr (idx, ctr_list))
1198 {
1199 // EUGENE figure out what to do on error
1200 }
1201 ctr_list[idx].last_overflow_period = new_period;
1202 if (start_one_ctr (idx, ctr_list[idx].buf_state.pagesz, pctx, "hwcdrv: ERROR: hwcdrv_overflow (readjust overflow):"))
1203 {
1204 // EUGENE figure out what to do on error
1205 }
1206 }
1207 ctr_list[idx].last_overflow_time = sig_ts;
1208 #if 0
1209 ctr_list[idx].needs_restart = 1;
1210 #else // seems to be more reliable to restart here instead of hwcdrv_sighlr_restart()
1211 internal_hwc_start (ctr_list[idx].fd);
1212 #endif
1213 }
1214 }
1215 return 0; // OK to restart counters
1216 }
1217
1218 HWCDRV_API int
1219 hwcdrv_sighlr_restart (const hwc_event_t *pp)
1220 {
1221 #if 0 // restarting here doesn't seem to work as well as restarting in hwcdrv_overflow()
1222 hdrv_pcl_ctx_t * pctx = hdrv_pcl_state.find_vpc_ctx ();
1223 if (!pctx)
1224 {
1225 TprintfT (DBG_LT0, "hwcdrv: ERROR: hwcdrv_sighlr_restart: find_vpc_ctx()==NULL\n");
1226 return -1;
1227 }
1228 counter_state_t * ctr_list = (counter_state_t *) pctx->ctr_list;
1229 if (!ctr_list)
1230 {
1231 TprintfT (DBG_LT0, "hwcdrv: WARNING: hwcdrv_sighlr_restart: ctr_list is NULL\n");
1232 return -1;
1233 }
1234 int errors = 0;
1235 for (int ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
1236 {
1237 if (ctr_list[ii].needs_restart)
1238 errors |= internal_hwc_start (ctr_list[ii].fd);
1239 ctr_list[ii].needs_restart = 0;
1240 }
1241 return errors;
1242 #else
1243 return 0;
1244 #endif
1245 }
1246
1247 /* create counters based on hwcdef[] */
1248 HWCDRV_API int
1249 hwcdrv_create_counters (unsigned hwcdef_cnt, Hwcentry *hwcdef)
1250 {
1251 if (hwcdef_cnt > hdrv_pcl_about.cpcN_npics)
1252 {
1253 logerr (GTXT ("More than %d counters were specified\n"), hdrv_pcl_about.cpcN_npics); /*!*/
1254 return HWCFUNCS_ERROR_HWCARGS;
1255 }
1256 if (hdrv_pcl_about.cpcN_cpuver == CPUVER_UNDEFINED)
1257 {
1258 logerr (GTXT ("Processor not supported\n"));
1259 return HWCFUNCS_ERROR_HWCARGS;
1260 }
1261
1262 /* add counters */
1263 for (unsigned idx = 0; idx < hwcdef_cnt; idx++)
1264 {
1265 perf_event_def_t *glb_event_def = &global_perf_event_def[idx];
1266 memset (glb_event_def, 0, sizeof (perf_event_def_t));
1267 unsigned int pmc_sel;
1268 eventsel_t evntsel;
1269 if (hwcfuncs_get_x86_eventsel (hwcdef[idx].reg_num,
1270 hwcdef[idx].int_name, &evntsel, &pmc_sel))
1271 {
1272 TprintfT (0, "hwcdrv: ERROR: hwcfuncs_get_x86_eventsel() failed\n");
1273 return HWCFUNCS_ERROR_HWCARGS;
1274 }
1275 glb_event_def->reg_num = pmc_sel;
1276 glb_event_def->eventsel = evntsel;
1277 glb_event_def->counter_preload = hwcdef[idx].val;
1278 glb_event_def->min_time = hwcdef[idx].min_time;
1279 glb_event_def->name = strdup (hwcdef[idx].name); // memory leak??? very minor
1280 init_perf_event (&glb_event_def->hw, glb_event_def->eventsel,
1281 glb_event_def->counter_preload, hwcdef + idx);
1282 TprintfT (DBG_LT1, "hwcdrv: create_counters: pic=%u name='%s' interval=%lld"
1283 "(min_time=%lld): reg_num=0x%x eventsel=0x%llx ireset=%lld usr=%lld sys=%lld\n",
1284 idx, hwcdef[idx].int_name, (long long) glb_event_def->counter_preload,
1285 (long long) glb_event_def->min_time, (int) glb_event_def->reg_num,
1286 (long long) glb_event_def->eventsel,
1287 (long long) HW_INTERVAL_PRESET (hwcdef[idx].val),
1288 (long long) glb_event_def->hw.exclude_user,
1289 (long long) glb_event_def->hw.exclude_kernel);
1290 }
1291
1292 hdrv_pcl_state.hwcdef_cnt = hwcdef_cnt;
1293 return 0;
1294 }
1295
1296 HWCDRV_API int
1297 hwcdrv_free_counters () // note: only performs shutdown for this thread
1298 {
1299 hdrv_pcl_ctx_t * pctx;
1300 if (!COUNTERS_ENABLED ())
1301 return 0;
1302 pctx = hdrv_pcl_state.find_vpc_ctx ();
1303 if (!pctx)
1304 {
1305 TprintfT (0, "hwcdrv: WARNING: hwcdrv_free_counters: tsd context is NULL\n");
1306 return HWCFUNCS_ERROR_GENERIC;
1307 }
1308 counter_state_t *ctr_list = pctx->ctr_list;
1309 if (!ctr_list)
1310 {
1311 // fork child: prolog suspends hwcs, then epilog frees them
1312 TprintfT (DBG_LT1, "hwcdrv: WARNING: hwcdrv_free_counters: ctr_list is already NULL\n");
1313 return 0;
1314 }
1315 int hwc_rc = 0;
1316 for (int ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
1317 if (stop_one_ctr (ii, ctr_list))
1318 hwc_rc = HWCFUNCS_ERROR_GENERIC;
1319 TprintfT (DBG_LT1, "hwcdrv: hwcdrv_free_counters(tid=0x%lx).\n", (long) pctx->tid);
1320 pctx->ctr_list = NULL;
1321 return hwc_rc;
1322 }
1323
1324 HWCDRV_API int
1325 hwcdrv_start (void) /* must be called from each thread ? */
1326 {
1327 hdrv_pcl_ctx_t *pctx = NULL;
1328 if (!COUNTERS_ENABLED ())
1329 {
1330 TprintfT (DBG_LT1, "hwcdrv: WARNING: hwcdrv_start: no counters to start \n");
1331 return 0;
1332 }
1333 if (!hdrv_pcl_state.library_ok)
1334 {
1335 TprintfT (0, "hwcdrv: ERROR: hwcdrv_start: library is not open\n");
1336 return HWCFUNCS_ERROR_NOT_SUPPORTED;
1337 }
1338
1339 /*
1340 * set up per-thread context
1341 */
1342 pctx = hdrv_pcl_state.find_vpc_ctx ();
1343 if (!pctx)
1344 {
1345 TprintfT (0, "hwcdrv: ERROR: hwcdrv_start: tsd context is NULL\n");
1346 return HWCFUNCS_ERROR_UNEXPECTED;
1347 }
1348 pctx->tid = hwcdrv_gettid ();
1349 TprintfT (DBG_LT1, "hwcdrv: hwcdrv_start(tid=0x%lx)\n", (long) pctx->tid);
1350
1351 /*
1352 * create per-thread counter list
1353 */
1354 counter_state_t *ctr_list = (counter_state_t *) calloc (hdrv_pcl_state.hwcdef_cnt,
1355 sizeof (counter_state_t));
1356 if (!ctr_list)
1357 {
1358 TprintfT (0, "hwcdrv: ERROR: hwcdrv_start: calloc(ctr_list) failed\n");
1359 return HWCFUNCS_ERROR_MEMORY;
1360 }
1361 int ii;
1362 for (ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
1363 ctr_list[ii].fd = -1; // invalidate fds in case we have to close prematurely
1364 pctx->ctr_list = ctr_list;
1365
1366 /*
1367 * bind the counters
1368 */
1369 size_t pgsz = sysconf (_SC_PAGESIZE);
1370 for (ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
1371 {
1372 ctr_list[ii].last_overflow_period = global_perf_event_def[ii].hw.sample_period;
1373 if (start_one_ctr (ii, pgsz, pctx, "hwcdrv: ERROR: hwcdrv_start:")) goto hwcdrv_start_cleanup;
1374 }
1375
1376 /*
1377 * start the counters
1378 */
1379 for (ii = 0; ii < hdrv_pcl_state.hwcdef_cnt; ii++)
1380 {
1381 int rc = internal_hwc_start (ctr_list[ii].fd);
1382 if (rc < 0)
1383 goto hwcdrv_start_cleanup;
1384 }
1385 return 0;
1386
1387 hwcdrv_start_cleanup:
1388 hwcdrv_free_counters (); // PERF_EVENT_IOC_DISABLE and close() for all fds
1389 return HWCFUNCS_ERROR_UNAVAIL;
1390 }
1391
1392 HWCDRV_API int
1393 hwcdrv_lwp_suspend (void) /* must be called from each thread */
1394 {
1395 if (!COUNTERS_ENABLED ())
1396 {
1397 TprintfT (DBG_LT1, "hwcdrv: WARNING: hwcdrv_lwp_suspend: no counters\n");
1398 return 0;
1399 }
1400 TprintfT (DBG_LT1, "hwcdrv: hwcdrv_lwp_suspend()\n");
1401 return hwcdrv_free_counters ();
1402 }
1403
1404 HWCDRV_API int
1405 hwcdrv_lwp_resume (void) /* must be called from each thread */
1406 {
1407 if (!COUNTERS_ENABLED ())
1408 {
1409 TprintfT (DBG_LT1, "hwcdrv: WARNING: hwcdrv_lwp_resume: no counters\n");
1410 return 0;
1411 }
1412 TprintfT (DBG_LT1, "hwcdrv: hwcdrv_lwp_resume()\n");
1413 return hwcdrv_start ();
1414 }
1415
1416 HWCDRV_API int
1417 hwcdrv_read_events (hwc_event_t *overflow_data, hwc_event_samples_t *sampled_data)
1418 {
1419 overflow_data->ce_hrt = 0;
1420 for (int i = 0; i < MAX_PICS; i++)
1421 {
1422 overflow_data->ce_pic[i] = 0;
1423 if (sampled_data)
1424 HWCFUNCS_SAMPLE_RESET (&sampled_data->sample[i]);
1425 }
1426 return 0;
1427 }
1428
1429 /*---------------------------------------------------------------------------*/
1430 /* HWCDRV_API */
1431
1432 hwcdrv_api_t hwcdrv_pcl_api = {
1433 hwcdrv_init,
1434 hwcdrv_get_info,
1435 hwcdrv_enable_mt,
1436 hwcdrv_get_descriptions,
1437 hwcdrv_assign_regnos,
1438 hwcdrv_create_counters,
1439 hwcdrv_start,
1440 hwcdrv_overflow,
1441 hwcdrv_read_events,
1442 hwcdrv_sighlr_restart,
1443 hwcdrv_lwp_suspend,
1444 hwcdrv_lwp_resume,
1445 hwcdrv_free_counters,
1446 hwcdrv_lwp_init,
1447 hwcdrv_lwp_fini,
1448 -1 // hwcdrv_init_status
1449 };