]>
Commit | Line | Data |
---|---|---|
88bfe092 | 1 | |
2 | /* | |
507d0a78 | 3 | * $Id: Profiler.c,v 1.7 2007/04/25 11:30:16 adrian Exp $ |
88bfe092 | 4 | * |
507d0a78 | 5 | * DEBUG: section 81 CPU Profiling Routines |
88bfe092 | 6 | * AUTHOR: Andres Kroonmaa, Sep.2000 |
7 | * | |
8 | * SQUID Internet Object Cache http://squid.nlanr.net/Squid/ | |
9 | * ---------------------------------------------------------- | |
10 | * | |
11 | * Squid is the result of efforts by numerous individuals from the | |
12 | * Internet community. Development is led by Duane Wessels of the | |
13 | * National Laboratory for Applied Network Research and funded by the | |
14 | * National Science Foundation. Squid is Copyrighted (C) 1998 by | |
15 | * the Regents of the University of California. Please see the | |
16 | * COPYRIGHT file for full details. Squid incorporates software | |
17 | * developed and/or copyrighted by other sources. Please see the | |
18 | * CREDITS file for full details. | |
19 | * | |
20 | * This program is free software; you can redistribute it and/or modify | |
21 | * it under the terms of the GNU General Public License as published by | |
22 | * the Free Software Foundation; either version 2 of the License, or | |
23 | * (at your option) any later version. | |
24 | * | |
25 | * This program is distributed in the hope that it will be useful, | |
26 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
27 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
28 | * GNU General Public License for more details. | |
29 | * | |
30 | * You should have received a copy of the GNU General Public License | |
31 | * along with this program; if not, write to the Free Software | |
32 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA. | |
33 | * | |
34 | */ | |
35 | ||
36 | /* | |
37 | * CPU Profiling implementation. | |
38 | * | |
39 | * This library implements the Probes needed to gather stats. | |
40 | * See src/ProfStats.c which implements historical recording and | |
41 | * presentation in CacheMgr.cgi. | |
42 | * | |
43 | * For timing we prefer on-CPU ops that retrieve cpu ticks counter. | |
44 | * For Intel, this is "rdtsc", which is 64-bit counter that virtually | |
45 | * never wraps. For alpha, this is "rpcc" which is 32-bit counter and | |
46 | * wraps every few seconds. Currently, no handling of wrapping counters | |
47 | * is implemented. Other CPU's are also not covered. Potentially all | |
48 | * modern CPU's has similar counters. | |
49 | * | |
50 | * Usage. | |
51 | * Insert macro PROF_state(probename) in strategic places in code. | |
52 | * PROF_start(probename); | |
53 | * ... section of code measured ... | |
54 | * PROF_stop(probename); | |
55 | * | |
56 | * probename must be added to profiling.h into xprof_type enum list | |
57 | * with prepended "XPROF_" string. | |
58 | * | |
59 | * Description. | |
60 | * PROF gathers stats per probename into structures. It indexes these | |
61 | * structures by enum type index in an array. | |
62 | * | |
63 | * PROF records best, best, average and worst values for delta time, | |
64 | * also, if UNACCED is defined, it measures "empty" time during which | |
65 | * no probes are in measuring state. This allows to see time "unaccounted" | |
66 | * for. If OVERHEAD is defined, additional calculations are made at every | |
67 | * probe to measure approximate overhead of the probe code itself. | |
68 | * | |
69 | * Probe data is stored in linked-list, so the more probes you define, | |
70 | * the more overhead is added to find the deepest nested probe. To reduce | |
71 | * average overhead, linked list is manipulated each time PR_start is | |
72 | * called, so that probe just started is moved 1 position up in linkedlist. | |
73 | * This way frequently used probes are moved closer to the head of list, | |
74 | * reducing average overhead. | |
75 | * Note that all overhead is on the scale of one hundred of CPU clock | |
76 | * ticks, which on the scale of submicroseconds. Yet, to optimise really | |
77 | * fast and frequent sections of code, we want to reduce this overhead | |
78 | * to absolute minimum possible. | |
79 | * | |
80 | * For actual measurements, probe overhead cancels out mostly. Still, | |
81 | * do not take the measured times as facts, they should be viewed in | |
82 | * relative comparison to overall CPU time and on the same platform. | |
83 | * | |
84 | * Every 1 second, Event within squid is called that parses gathered | |
85 | * statistics of every probe, and accumulates that into historical | |
86 | * structures for last 1,5,30 secs, 1,5,30 mins, and 1,5 and 24 hours. | |
87 | * Each second active probe stats are reset, and only historical data | |
88 | * is presented in cachemgr output. | |
89 | * | |
90 | * Reading stats. | |
91 | * "Worst case" may be misleading. Anything can happen at any section | |
92 | * of code that could delay reaching to probe stop. For eg. system may | |
93 | * need to service interrupt routine, task switch could occur, or page | |
94 | * fault needs to be handled. In this sense, this is quite meaningless | |
95 | * metric. "Best case" shows fastest completion of probe section, and | |
96 | * is also somewhat useless, unless you know that amount of work is | |
97 | * constant. Best metric to watch is "average time" and total cumulated | |
98 | * time in given timeframe, which really show percentage of time spent | |
99 | * in given section of code, and its average completion time. This data | |
100 | * could be used to detect bottlenecks withing squid and optimise them. | |
101 | * | |
102 | * TOTALS are quite off reality. Its there just to summarise cumulative | |
103 | * times and percent column. Percent values over 100% shows that there | |
104 | * have been some probes nested into each other. | |
105 | * | |
106 | */ | |
107 | ||
108 | #include "profiling.h" | |
109 | ||
4fd8d08f | 110 | #include <assert.h> |
111 | ||
88bfe092 | 112 | #ifdef USE_XPROF_STATS |
113 | ||
114 | ||
115 | #if HAVE_GNUMALLLOC_H | |
116 | #include <gnumalloc.h> | |
482aa790 | 117 | #elif HAVE_MALLOC_H |
88bfe092 | 118 | #include <malloc.h> |
119 | #endif | |
120 | #if HAVE_UNISTD_H | |
121 | #include <unistd.h> | |
122 | #endif | |
123 | ||
124 | /* Exported Data */ | |
125 | TimersArray *xprof_Timers = NULL; | |
88bfe092 | 126 | |
127 | /* Private stuff */ | |
128 | ||
4fd8d08f | 129 | |
130 | /* new stuff */ | |
131 | #define MAXSTACKDEPTH 512 | |
132 | ||
133 | struct _callstack_entry { | |
134 | int timer; /* index into timers array */ | |
135 | const char *name; | |
136 | hrtime_t start, stop, accum; | |
137 | }; | |
138 | ||
139 | struct _callstack_entry cstack[MAXSTACKDEPTH]; | |
140 | int cstack_head = 0; | |
141 | ||
0ef0f1de | 142 | #if defined(_MSC_VER) /* Microsoft C Compiler ONLY */ |
143 | static __inline void | |
144 | #else | |
88bfe092 | 145 | static inline void |
0ef0f1de | 146 | #endif |
88bfe092 | 147 | xprof_update(xprof_stats_data * head) |
148 | { | |
88bfe092 | 149 | if (head->delta < head->best) |
150 | head->best = head->delta; | |
151 | if (head->worst < head->delta) | |
152 | head->worst = head->delta; | |
153 | head->summ += head->delta; | |
154 | head->count++; | |
155 | } | |
156 | ||
157 | static xprof_stats_data *xp_UNACCOUNTED; | |
158 | static int xprof_inited = 0; | |
159 | ||
160 | static void | |
161 | xprof_InitLib(void) | |
162 | { | |
163 | if (xprof_inited) | |
164 | return; | |
165 | ||
166 | xprof_Timers = calloc(XPROF_LAST + 2, sizeof(xprof_stats_node)); | |
167 | ||
168 | xprof_Timers[XPROF_PROF_UNACCOUNTED]->name = "PROF_UNACCOUNTED"; | |
169 | xprof_Timers[XPROF_PROF_UNACCOUNTED]->accu.start = get_tick(); | |
170 | xp_UNACCOUNTED = &xprof_Timers[XPROF_PROF_UNACCOUNTED]->accu; | |
4fd8d08f | 171 | cstack_head = 0; |
88bfe092 | 172 | xprof_inited = 1; |
173 | } | |
174 | ||
175 | void | |
176 | xprof_start(xprof_type type, const char *timer) | |
177 | { | |
4fd8d08f | 178 | hrtime_t tt = get_tick(); |
88bfe092 | 179 | if (!xprof_inited) |
180 | xprof_InitLib(); | |
181 | ||
4fd8d08f | 182 | /* If nested, stop current stack frame */ |
183 | if (cstack_head > 0) { | |
184 | cstack[cstack_head - 1].accum += get_tick() - cstack[cstack_head - 1].start; | |
185 | cstack[cstack_head - 1].start = -1; | |
186 | } | |
187 | ||
188 | /* Are we at the first level? If so; stop the unaccounted timer */ | |
189 | if (cstack_head == 0) { | |
190 | assert(xp_UNACCOUNTED->start != -1); | |
191 | xp_UNACCOUNTED->delta = tt - xp_UNACCOUNTED->start; | |
192 | xp_UNACCOUNTED->start = -1; | |
193 | xprof_update(xp_UNACCOUNTED); | |
194 | } | |
195 | ||
196 | /* Allocate new stack frame */ | |
197 | cstack[cstack_head].start = tt; | |
198 | cstack[cstack_head].stop = -1; | |
199 | cstack[cstack_head].accum = 0; | |
200 | cstack[cstack_head].timer = type; | |
201 | cstack[cstack_head].name = timer; | |
202 | cstack_head++; | |
203 | assert(cstack_head < MAXSTACKDEPTH); | |
88bfe092 | 204 | |
88bfe092 | 205 | } |
206 | ||
207 | void | |
208 | xprof_stop(xprof_type type, const char *timer) | |
209 | { | |
4fd8d08f | 210 | hrtime_t tt = get_tick(); |
211 | assert(cstack_head > 0); | |
212 | cstack_head--; | |
213 | assert(cstack[cstack_head].timer == type); | |
214 | ||
215 | /* Record timer details */ | |
216 | cstack[cstack_head].accum += tt - cstack[cstack_head].start; | |
217 | xprof_Timers[type]->accu.delta = cstack[cstack_head].accum; | |
218 | xprof_Timers[type]->name = timer; | |
219 | ||
220 | /* Update */ | |
88bfe092 | 221 | xprof_update(&xprof_Timers[type]->accu); |
4fd8d08f | 222 | |
223 | /* Restart previous timer if we're not at the top level */ | |
224 | if (cstack_head > 0) { | |
225 | cstack[cstack_head - 1].start = tt; | |
226 | cstack[cstack_head - 1].stop = 0; | |
88bfe092 | 227 | return; |
4fd8d08f | 228 | } |
229 | /* Get here? We're at the top level; unaccounted */ | |
230 | xp_UNACCOUNTED->start = tt; | |
88bfe092 | 231 | } |
232 | ||
233 | #endif /* USE_XPROF_STATS */ |