Subject: [PATCH] powerpc/oprofile: Fix mutex locking for cell spu-oprofile
From: Carl Love <cel@us.ibm.com>
References: 422501 - LTC47617

The issue is that the SPU code does not hold the kernel mutex lock while
adding samples to the kernel buffer.

This patch creates per-SPU buffers to hold the data.  Data
is added to the buffers from interrupt context.  The data
is periodically pushed to the kernel buffer via a new OProfile
function, oprofile_put_buff().  The oprofile_put_buff() function
is called via a work queue, enabling the function to acquire the
mutex lock.

The existing user controls for adjusting the per-CPU buffer
size are used to control the size of the per-SPU buffers.
Similarly, overflows of the SPU buffers are reported by
incrementing the per-CPU buffer stats.  This eliminates the
need for architecture-specific controls for the per-SPU
buffers, which would not be acceptable to the OProfile user
tool maintainer.

The export of the OProfile add_event_entry() is removed as it
is no longer needed given this patch.

Note: this patch does not address the issue of indexing arrays
by the SPU number.  This still needs to be fixed, as the SPU
numbering is not guaranteed to be 0 to max_num_spus-1.

Signed-off-by: Carl Love <carll@us.ibm.com>
Signed-off-by: Maynard Johnson <maynardj@us.ibm.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Olaf Hering <olh@suse.de>
---
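As background for the diff below, a minimal, self-contained user-space
sketch (not part of the patch) of the buffering discipline it introduces:
a fixed-size ring buffer that keeps one slot unused so "full" (the next
write position would hit the tail) and "empty" (head == tail) remain
distinguishable.  The producer corresponds to spu_buff_add(), the
consumer drains up to a snapshot of head as oprofile_put_buff() does;
all names and the absence of locking are illustrative simplifications.

	#include <stdio.h>

	#define BUF_SIZE 8		/* stands in for max_spu_buff */

	static unsigned long buff[BUF_SIZE];
	static unsigned int head, tail; /* head: next write; tail: next read */

	/* Producer, cf. spu_buff_add(): drop the sample when full. */
	static int buff_add(unsigned long value)
	{
		unsigned int next = (head + 1) % BUF_SIZE;

		if (next == tail)
			return -1;	/* full: one slot always stays free */
		buff[head] = value;
		head = next;
		return 0;
	}

	/* Consumer, cf. oprofile_put_buff(): drain entries in [tail, stop). */
	static void buff_drain(unsigned int stop)
	{
		while (tail != stop) {
			printf("entry %lu\n", buff[tail]);
			tail = (tail + 1) % BUF_SIZE;
		}
	}

	int main(void)
	{
		unsigned long i;

		for (i = 0; i < 10; i++)  /* capacity is 7; three samples drop */
			if (buff_add(i))
				printf("dropped %lu\n", i);
		buff_drain(head);
		return 0;
	}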
 arch/powerpc/oprofile/cell/pr_util.h       |   13 +
 arch/powerpc/oprofile/cell/spu_profiler.c  |    4
 arch/powerpc/oprofile/cell/spu_task_sync.c |  236 +++++++++++++++++++++++++----
 drivers/oprofile/buffer_sync.c             |   24 ++
 drivers/oprofile/cpu_buffer.c              |   15 +
 drivers/oprofile/event_buffer.h            |    7
 include/linux/oprofile.h                   |   16 +
 7 files changed, 279 insertions(+), 36 deletions(-)

--- a/arch/powerpc/oprofile/cell/pr_util.h
+++ b/arch/powerpc/oprofile/cell/pr_util.h
@@ -24,6 +24,11 @@
 #define SKIP_GENERIC_SYNC 0
 #define SYNC_START_ERROR -1
 #define DO_GENERIC_SYNC 1
+#define SPUS_PER_NODE 8
+#define DEFAULT_TIMER_EXPIRE (HZ / 10)
+
+extern struct delayed_work spu_work;
+extern int spu_prof_running;
 
 struct spu_overlay_info {	/* map of sections within an SPU overlay */
 	unsigned int vma;	/* SPU virtual memory address from elf */
@@ -62,6 +67,14 @@ struct vma_to_fileoffset_map { /* map of
 
 };
 
+struct spu_buffer {
+	int last_guard_val;
+	int ctx_sw_seen;
+	unsigned long *buff;
+	unsigned int head, tail;
+};
+
+
 /* The three functions below are for maintaining and accessing
  * the vma-to-fileoffset map.
  */
--- a/arch/powerpc/oprofile/cell/spu_profiler.c
+++ b/arch/powerpc/oprofile/cell/spu_profiler.c
@@ -24,12 +24,11 @@
 
 static u32 *samples;
 
-static int spu_prof_running;
+int spu_prof_running;
 static unsigned int profiling_interval;
 
 #define NUM_SPU_BITS_TRBUF 16
 #define SPUS_PER_TB_ENTRY 4
-#define SPUS_PER_NODE 8
 
 #define SPU_PC_MASK 0xFFFF
 
@@ -209,6 +208,7 @@ int start_spu_profiling(unsigned int cyc
 
 	spu_prof_running = 1;
 	hrtimer_start(&timer, kt, HRTIMER_MODE_REL);
+	schedule_delayed_work(&spu_work, DEFAULT_TIMER_EXPIRE);
 
 	return 0;
 }
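The hunk above arms the periodic flush.  For reference, a sketch of the
delayed-work lifecycle the patch relies on; the names here are
illustrative only, the real code uses spu_work, wq_sync_spu_buff() and
spu_prof_running:

	#include <linux/workqueue.h>
	#include <linux/jiffies.h>

	#define EXPIRE (HZ / 10)	/* cf. DEFAULT_TIMER_EXPIRE */

	static struct delayed_work flush_work;
	static int running;

	static void flush_fn(struct work_struct *work)
	{
		/* drain the per-SPU buffers; runs in process context,
		 * so the buffer_mutex may be taken here */
		if (running)		/* re-arm only while profiling */
			schedule_delayed_work(&flush_work, EXPIRE);
	}

	static void profiling_start(void)
	{
		INIT_DELAYED_WORK(&flush_work, flush_fn);
		running = 1;
		schedule_delayed_work(&flush_work, EXPIRE);
	}

	static void profiling_stop(void)
	{
		running = 0;
		/* drop a queued-but-not-yet-run instance */
		cancel_delayed_work(&flush_work);
	}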
--- a/arch/powerpc/oprofile/cell/spu_task_sync.c
+++ b/arch/powerpc/oprofile/cell/spu_task_sync.c
@@ -35,7 +35,102 @@ static DEFINE_SPINLOCK(buffer_lock);
 static DEFINE_SPINLOCK(cache_lock);
 static int num_spu_nodes;
 int spu_prof_num_nodes;
-int last_guard_val[MAX_NUMNODES * 8];
+
+struct spu_buffer spu_buff[MAX_NUMNODES * SPUS_PER_NODE];
+struct delayed_work spu_work;
+static unsigned max_spu_buff;
+
+static void spu_buff_add(unsigned long int value, int spu)
+{
+	/* spu buff is a circular buffer.  Add entries to the
+	 * head.  Head is the index to store the next value.
+	 * The buffer is full when there is one available entry
+	 * in the queue, i.e. head and tail can't be equal.
+	 * That way we can tell the difference between the
+	 * buffer being full versus empty.
+	 *
+	 * ASSUMPTION: the buffer_lock is held when this function
+	 * is called to lock the buffer, head and tail.
+	 */
+	int full = 1;
+
+	if (spu_buff[spu].head >= spu_buff[spu].tail) {
+		if ((spu_buff[spu].head - spu_buff[spu].tail)
+		    < (max_spu_buff - 1))
+			full = 0;
+
+	} else if (spu_buff[spu].tail > spu_buff[spu].head) {
+		if ((spu_buff[spu].tail - spu_buff[spu].head)
+		    > 1)
+			full = 0;
+	}
+
+	if (!full) {
+		spu_buff[spu].buff[spu_buff[spu].head] = value;
+		spu_buff[spu].head++;
+
+		if (spu_buff[spu].head >= max_spu_buff)
+			spu_buff[spu].head = 0;
+	} else {
+		/* From the user's perspective make the SPU buffer
+		 * size management/overflow look like we are using
+		 * per cpu buffers.  The user uses the same
+		 * per cpu parameter to adjust the SPU buffer size.
+		 * Increment the sample_lost_overflow to inform
+		 * the user the buffer size needs to be increased.
+		 */
+		oprofile_cpu_buffer_inc_smpl_lost();
+	}
+}
+
+/* This function copies the per SPU buffers to the
+ * OProfile kernel buffer.
+ */
+void sync_spu_buff(void)
+{
+	int spu;
+	unsigned long flags;
+	int curr_head;
+
+	for (spu = 0; spu < num_spu_nodes; spu++) {
+		/* In case there was an issue and the buffer didn't
+		 * get created skip it.
+		 */
+		if (spu_buff[spu].buff == NULL)
+			continue;
+
+		/* Hold the lock to make sure the head/tail
+		 * doesn't change while spu_buff_add() is
+		 * deciding if the buffer is full or not.
+		 * Being a little paranoid.
+		 */
+		spin_lock_irqsave(&buffer_lock, flags);
+		curr_head = spu_buff[spu].head;
+		spin_unlock_irqrestore(&buffer_lock, flags);
+
+		/* Transfer the current contents to the kernel buffer.
+		 * data can still be added to the head of the buffer.
+		 */
+		oprofile_put_buff(spu_buff[spu].buff,
+				  spu_buff[spu].tail,
+				  curr_head, max_spu_buff);
+
+		spin_lock_irqsave(&buffer_lock, flags);
+		spu_buff[spu].tail = curr_head;
+		spin_unlock_irqrestore(&buffer_lock, flags);
+	}
+
+}
+
+static void wq_sync_spu_buff(struct work_struct *work)
+{
+	/* move data from spu buffers to kernel buffer */
+	sync_spu_buff();
+
+	/* only reschedule if profiling is not done */
+	if (spu_prof_running)
+		schedule_delayed_work(&spu_work, DEFAULT_TIMER_EXPIRE);
+}
 
 /* Container for caching information about an active SPU task. */
 struct cached_info {
@@ -305,14 +400,21 @@ static int process_context_switch(struct
 
 	/* Record context info in event buffer */
 	spin_lock_irqsave(&buffer_lock, flags);
-	add_event_entry(ESCAPE_CODE);
-	add_event_entry(SPU_CTX_SWITCH_CODE);
-	add_event_entry(spu->number);
-	add_event_entry(spu->pid);
-	add_event_entry(spu->tgid);
-	add_event_entry(app_dcookie);
-	add_event_entry(spu_cookie);
-	add_event_entry(offset);
+	spu_buff_add(ESCAPE_CODE, spu->number);
+	spu_buff_add(SPU_CTX_SWITCH_CODE, spu->number);
+	spu_buff_add(spu->number, spu->number);
+	spu_buff_add(spu->pid, spu->number);
+	spu_buff_add(spu->tgid, spu->number);
+	spu_buff_add(app_dcookie, spu->number);
+	spu_buff_add(spu_cookie, spu->number);
+	spu_buff_add(offset, spu->number);
+
+	/* Set flag to indicate SPU PC data can now be written out.  If
+	 * the SPU program counter data is seen before an SPU context
+	 * record is seen, the postprocessing will fail.
+	 */
+	spu_buff[spu->number].ctx_sw_seen = 1;
+
 	spin_unlock_irqrestore(&buffer_lock, flags);
 	smp_wmb();	/* insure spu event buffer updates are written */
 			/* don't want entries intermingled... */
@@ -360,6 +462,47 @@ static int number_of_online_nodes(void)
 	return nodes;
 }
 
+static int oprofile_spu_buff_create(void)
+{
+	int spu;
+
+	max_spu_buff = oprofile_get_cpu_buffer_size();
+
+	for (spu = 0; spu < num_spu_nodes; spu++) {
+		/* create circular buffers to store the data in.
+		 * use locks to manage accessing the buffers
+		 */
+		spu_buff[spu].head = 0;
+		spu_buff[spu].tail = 0;
+
+		/*
+		 * Create a buffer for each SPU.  Can't reliably
+		 * create a single buffer for all spus due to not
+		 * enough contiguous kernel memory.
+		 */
+
+		spu_buff[spu].buff = kzalloc((max_spu_buff
+					      * sizeof(unsigned long)),
+					     GFP_KERNEL);
+
+		if (!spu_buff[spu].buff) {
+			printk(KERN_ERR "SPU_PROF: "
+			       "%s, line %d: oprofile_spu_buff_create "
+			       "failed to allocate spu buffer %d.\n",
+			       __func__, __LINE__, spu);
+
+			/* release the spu buffers that have been allocated */
+			while (spu >= 0) {
+				kfree(spu_buff[spu].buff);
+				spu_buff[spu].buff = 0;
+				spu--;
+			}
+			return -ENOMEM;
+		}
+	}
+	return 0;
+}
+
 /* The main purpose of this function is to synchronize
  * OProfile with SPUFS by registering to be notified of
  * SPU task switches.
@@ -372,20 +515,35 @@ static int number_of_online_nodes(void)
  */
 int spu_sync_start(void)
 {
-	int k;
+	int spu;
 	int ret = SKIP_GENERIC_SYNC;
 	int register_ret;
 	unsigned long flags = 0;
 
 	spu_prof_num_nodes = number_of_online_nodes();
 	num_spu_nodes = spu_prof_num_nodes * 8;
+	INIT_DELAYED_WORK(&spu_work, wq_sync_spu_buff);
+
+	/* create buffer for storing the SPU data to put in
+	 * the kernel buffer.
+	 */
+	ret = oprofile_spu_buff_create();
+	if (ret)
+		goto out;
 
 	spin_lock_irqsave(&buffer_lock, flags);
-	add_event_entry(ESCAPE_CODE);
-	add_event_entry(SPU_PROFILING_CODE);
-	add_event_entry(num_spu_nodes);
+	for (spu = 0; spu < num_spu_nodes; spu++) {
+		spu_buff_add(ESCAPE_CODE, spu);
+		spu_buff_add(SPU_PROFILING_CODE, spu);
+		spu_buff_add(num_spu_nodes, spu);
+	}
 	spin_unlock_irqrestore(&buffer_lock, flags);
 
+	for (spu = 0; spu < num_spu_nodes; spu++) {
+		spu_buff[spu].ctx_sw_seen = 0;
+		spu_buff[spu].last_guard_val = 0;
+	}
+
 	/* Register for SPU events */
 	register_ret = spu_switch_event_register(&spu_active);
 	if (register_ret) {
@@ -393,8 +551,6 @@ int spu_sync_start(void)
 		goto out;
 	}
 
-	for (k = 0; k < (MAX_NUMNODES * 8); k++)
-		last_guard_val[k] = 0;
 	pr_debug("spu_sync_start -- running.\n");
 out:
 	return ret;
@@ -446,13 +602,20 @@ void spu_sync_buffer(int spu_num, unsign
 	 * use.  We need to discard samples taken during the time
 	 * period which an overlay occurs (i.e., guard value changes).
 	 */
-	if (grd_val && grd_val != last_guard_val[spu_num]) {
-		last_guard_val[spu_num] = grd_val;
+	if (grd_val && grd_val != spu_buff[spu_num].last_guard_val) {
+		spu_buff[spu_num].last_guard_val = grd_val;
 		/* Drop the rest of the samples. */
 		break;
 	}
 
-	add_event_entry(file_offset | spu_num_shifted);
+	/* We must ensure that the SPU context switch has been written
+	 * out before samples for the SPU.  Otherwise, the SPU context
+	 * information is not available and the postprocessing of the
+	 * SPU PC will fail with no available anonymous map information.
+	 */
+	if (spu_buff[spu_num].ctx_sw_seen)
+		spu_buff_add((file_offset | spu_num_shifted),
+			     spu_num);
 	}
 	spin_unlock(&buffer_lock);
 out:
@@ -463,20 +626,41 @@ out:
 int spu_sync_stop(void)
 {
 	unsigned long flags = 0;
-	int ret = spu_switch_event_unregister(&spu_active);
-	if (ret) {
+	int ret;
+	int k;
+
+	ret = spu_switch_event_unregister(&spu_active);
+
+	if (ret)
 		printk(KERN_ERR "SPU_PROF: "
-		       "%s, line %d: spu_switch_event_unregister returned %d\n",
-		       __func__, __LINE__, ret);
-		goto out;
-	}
+		       "%s, line %d: spu_switch_event_unregister " \
+		       "returned %d\n",
+		       __func__, __LINE__, ret);
+
+	/* flush any remaining data in the per SPU buffers */
+	sync_spu_buff();
 
 	spin_lock_irqsave(&cache_lock, flags);
 	ret = release_cached_info(RELEASE_ALL);
 	spin_unlock_irqrestore(&cache_lock, flags);
-out:
+
+	/* remove scheduled work queue item rather than waiting
+	 * for every queued entry to execute.  Then flush pending
+	 * system wide buffer to event buffer.
+	 */
+	cancel_delayed_work(&spu_work);
+
+	for (k = 0; k < num_spu_nodes; k++) {
+		spu_buff[k].ctx_sw_seen = 0;
+
+		/*
+		 * spu_buff[k].buff will be NULL if there was a problem
+		 * allocating the buffer; kfree() of NULL is safe.
+		 */
+		kfree(spu_buff[k].buff);
+		spu_buff[k].buff = 0;
+	}
 	pr_debug("spu_sync_stop -- done.\n");
 	return ret;
 }
 
-
--- a/drivers/oprofile/buffer_sync.c
+++ b/drivers/oprofile/buffer_sync.c
@@ -551,3 +551,27 @@ void sync_buffer(int cpu)
 
 	mutex_unlock(&buffer_mutex);
 }
+
+/* The function can be used to add a buffer worth of data directly to
+ * the kernel buffer.  The buffer is assumed to be a circular buffer.
+ * Take the entries beginning at index start, ending at index stop,
+ * wrapping at max.
+ */
+void oprofile_put_buff(unsigned long *buf, unsigned int start,
+		       unsigned int stop, unsigned int max)
+{
+	int i;
+
+	i = start;
+
+	mutex_lock(&buffer_mutex);
+	while (i != stop) {
+		add_event_entry(buf[i++]);
+
+		if (i >= max)
+			i = 0;
+	}
+
+	mutex_unlock(&buffer_mutex);
+}
+
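To illustrate the wrap-around (values chosen for illustration, not taken
from the patch): with start == 6, stop == 2 and max == 8, the loop above
adds buf[6], buf[7], buf[0] and buf[1] to the event buffer, in that
order, stopping just before index 2.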
--- a/drivers/oprofile/cpu_buffer.c
+++ b/drivers/oprofile/cpu_buffer.c
@@ -37,13 +37,26 @@ static int work_enabled;
 void free_cpu_buffers(void)
 {
 	int i;
-
+
 	for_each_online_cpu(i) {
 		vfree(per_cpu(cpu_buffer, i).buffer);
 		per_cpu(cpu_buffer, i).buffer = NULL;
 	}
 }
 
+unsigned long oprofile_get_cpu_buffer_size(void)
+{
+	return fs_cpu_buffer_size;
+}
+
+void oprofile_cpu_buffer_inc_smpl_lost(void)
+{
+	struct oprofile_cpu_buffer *cpu_buf
+		= &__get_cpu_var(cpu_buffer);
+
+	cpu_buf->sample_lost_overflow++;
+}
+
 int alloc_cpu_buffers(void)
 {
 	int i;
--- a/drivers/oprofile/event_buffer.h
+++ b/drivers/oprofile/event_buffer.h
@@ -17,6 +17,13 @@ int alloc_event_buffer(void);
 
 void free_event_buffer(void);
 
+/**
+ * Add data to the event buffer.
+ * The data passed is free-form, but typically consists of
+ * file offsets, dcookies, context information, and ESCAPE codes.
+ */
+void add_event_entry(unsigned long data);
+
 /* wake up the process sleeping on the event file */
 void wake_up_buffer_waiter(void);
 
--- a/include/linux/oprofile.h
+++ b/include/linux/oprofile.h
@@ -84,13 +84,6 @@ int oprofile_arch_init(struct oprofile_o
 void oprofile_arch_exit(void);
 
 /**
- * Add data to the event buffer.
- * The data passed is free-form, but typically consists of
- * file offsets, dcookies, context information, and ESCAPE codes.
- */
-void add_event_entry(unsigned long data);
-
-/**
  * Add a sample.  This may be called from any context.  Pass
  * smp_processor_id() as cpu.
  */
@@ -160,5 +153,14 @@ int oprofilefs_ulong_from_user(unsigned
 
 /** lock for read/write safety */
 extern spinlock_t oprofilefs_lock;
+
+/**
+ * Add the contents of a circular buffer to the event buffer.
+ */
+void oprofile_put_buff(unsigned long *buf, unsigned int start,
+		       unsigned int stop, unsigned int max);
+
+unsigned long oprofile_get_cpu_buffer_size(void);
+void oprofile_cpu_buffer_inc_smpl_lost(void);
 
 #endif /* OPROFILE_H */