]>
Commit | Line | Data |
---|---|---|
55a56509 | 1 | /* workitems.c -- The main runtime entry that performs work-item execution in |
2 | various ways and the builtin functions closely related to the | |
3 | implementation. | |
4 | ||
fbd26352 | 5 | Copyright (C) 2015-2019 Free Software Foundation, Inc. |
55a56509 | 6 | Contributed by Pekka Jaaskelainen <pekka.jaaskelainen@parmance.com> |
7 | for General Processor Tech. | |
8 | ||
9 | Permission is hereby granted, free of charge, to any person obtaining a | |
10 | copy of this software and associated documentation files | |
11 | (the "Software"), to deal in the Software without restriction, including | |
12 | without limitation the rights to use, copy, modify, merge, publish, | |
13 | distribute, sublicense, and/or sell copies of the Software, and to | |
14 | permit persons to whom the Software is furnished to do so, subject to | |
15 | the following conditions: | |
16 | ||
17 | The above copyright notice and this permission notice shall be included | |
18 | in all copies or substantial portions of the Software. | |
19 | ||
20 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS | |
21 | OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
22 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. | |
23 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, | |
24 | DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR | |
25 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE | |
26 | USE OR OTHER DEALINGS IN THE SOFTWARE. | |
27 | */ | |
28 | ||
29 | /* The fiber based multiple work-item work-group execution uses ucontext | |
30 | based user mode threading. However, if gccbrig is able to optimize the | |
31 | kernel to a much faster work-group function that implements the multiple | |
32 | WI execution using loops instead of fibers requiring slow context switches, | |
33 | the fiber-based implementation won't be called. | |
34 | */ | |
35 | ||
36 | #include <stdlib.h> | |
37 | #include <signal.h> | |
38 | #include <string.h> | |
39 | ||
40 | #include "workitems.h" | |
41 | #include "phsa-rt.h" | |
42 | ||
43 | #ifdef HAVE_FIBERS | |
44 | #include "fibers.h" | |
45 | #endif | |
46 | ||
47 | #ifdef BENCHMARK_PHSA_RT | |
48 | #include <stdio.h> | |
49 | #include <time.h> | |
50 | ||
51 | static uint64_t wi_count = 0; | |
52 | static uint64_t wis_skipped = 0; | |
53 | static uint64_t wi_total = 0; | |
54 | static clock_t start_time; | |
55 | ||
56 | #endif | |
57 | ||
58 | #ifdef DEBUG_PHSA_RT | |
59 | #include <stdio.h> | |
60 | #endif | |
61 | ||
62 | #define PRIVATE_SEGMENT_ALIGN 256 | |
63 | #define FIBER_STACK_SIZE (64*1024) | |
64 | #define GROUP_SEGMENT_ALIGN 256 | |
65 | ||
c11ac926 | 66 | /* Preserve this amount of additional space in the alloca stack as we need to |
67 | store the alloca frame pointer to the alloca frame, thus must preserve | |
68 | space for it. This thus supports at most 1024 functions with allocas in | |
69 | a call chain. */ | |
70 | #define ALLOCA_OVERHEAD 1024*4 | |
71 | ||
55a56509 | 72 | uint32_t __hsail_workitemabsid (uint32_t dim, PHSAWorkItem *context); |
73 | ||
74 | uint32_t __hsail_workitemid (uint32_t dim, PHSAWorkItem *context); | |
75 | ||
76 | uint32_t __hsail_gridgroups (uint32_t dim, PHSAWorkItem *context); | |
77 | ||
78 | uint32_t __hsail_currentworkgroupsize (uint32_t dim, PHSAWorkItem *wi); | |
79 | ||
80 | uint32_t __hsail_workgroupsize (uint32_t dim, PHSAWorkItem *wi); | |
81 | ||
82 | void | |
83 | phsa_fatal_error (int code) | |
84 | { | |
85 | exit (code); | |
86 | } | |
87 | ||
88 | #ifdef HAVE_FIBERS | |
89 | /* ucontext-based work-item thread implementation. Runs all work-items in | |
90 | separate fibers. */ | |
91 | ||
92 | static void | |
93 | phsa_work_item_thread (int arg0, int arg1) | |
94 | { | |
95 | void *arg = fiber_int_args_to_ptr (arg0, arg1); | |
96 | ||
97 | PHSAWorkItem *wi = (PHSAWorkItem *) arg; | |
98 | volatile PHSAWorkGroup *wg = wi->wg; | |
99 | PHSAKernelLaunchData *l_data = wi->launch_data; | |
100 | ||
101 | do | |
102 | { | |
103 | int retcode | |
104 | = fiber_barrier_reach ((fiber_barrier_t *) l_data->wg_start_barrier); | |
105 | ||
106 | /* At this point the threads can assume that either more_wgs is 0 or | |
107 | the current_work_group_* is set to point to the WG executed next. */ | |
108 | if (!wi->wg->more_wgs) | |
109 | break; | |
4ac4c323 | 110 | |
111 | wi->group_x = wg->x; | |
112 | wi->group_y = wg->y; | |
113 | wi->group_z = wg->z; | |
114 | ||
115 | wi->cur_wg_size_x = __hsail_currentworkgroupsize (0, wi); | |
116 | wi->cur_wg_size_y = __hsail_currentworkgroupsize (1, wi); | |
117 | wi->cur_wg_size_z = __hsail_currentworkgroupsize (2, wi); | |
118 | ||
55a56509 | 119 | #ifdef DEBUG_PHSA_RT |
120 | printf ( | |
121 | "Running work-item %lu/%lu/%lu for wg %lu/%lu/%lu / %lu/%lu/%lu...\n", | |
4ac4c323 | 122 | wi->x, wi->y, wi->z, wi->group_x, wi->group_y, wi->group_z, |
123 | l_data->wg_max_x, l_data->wg_max_y, l_data->wg_max_z); | |
55a56509 | 124 | #endif |
125 | ||
126 | if (wi->x < __hsail_currentworkgroupsize (0, wi) | |
127 | && wi->y < __hsail_currentworkgroupsize (1, wi) | |
128 | && wi->z < __hsail_currentworkgroupsize (2, wi)) | |
129 | { | |
130 | l_data->kernel (l_data->kernarg_addr, wi, wg->group_base_ptr, | |
08ab8585 | 131 | wg->initial_group_offset, wg->private_base_ptr); |
55a56509 | 132 | #ifdef DEBUG_PHSA_RT |
133 | printf ("done.\n"); | |
134 | #endif | |
135 | #ifdef BENCHMARK_PHSA_RT | |
136 | wi_count++; | |
137 | #endif | |
138 | } | |
139 | else | |
140 | { | |
141 | #ifdef DEBUG_PHSA_RT | |
142 | printf ("skipped (partial WG).\n"); | |
143 | #endif | |
144 | #ifdef BENCHMARK_PHSA_RT | |
145 | wis_skipped++; | |
146 | #endif | |
147 | } | |
148 | ||
149 | retcode | |
150 | = fiber_barrier_reach ((fiber_barrier_t *) | |
151 | l_data->wg_completion_barrier); | |
152 | ||
153 | /* The first thread updates the WG to execute next etc. */ | |
154 | ||
155 | if (retcode == 0) | |
156 | { | |
157 | #ifdef EXECUTE_WGS_BACKWARDS | |
158 | if (wg->x == l_data->wg_min_x) | |
159 | { | |
160 | wg->x = l_data->wg_max_x - 1; | |
161 | if (wg->y == l_data->wg_min_y) | |
162 | { | |
163 | wg->y = l_data->wg_max_y - 1; | |
164 | if (wg->z == l_data->wg_min_z) | |
165 | wg->more_wgs = 0; | |
166 | else | |
167 | wg->z--; | |
168 | } | |
169 | else | |
170 | wg->y--; | |
171 | } | |
172 | else | |
173 | wg->x--; | |
174 | #else | |
175 | if (wg->x + 1 >= l_data->wg_max_x) | |
176 | { | |
177 | wg->x = l_data->wg_min_x; | |
178 | if (wg->y + 1 >= l_data->wg_max_y) | |
179 | { | |
180 | wg->y = l_data->wg_min_y; | |
181 | if (wg->z + 1 >= l_data->wg_max_z) | |
182 | wg->more_wgs = 0; | |
183 | else | |
184 | wg->z++; | |
185 | } | |
186 | else | |
187 | wg->y++; | |
188 | } | |
189 | else | |
190 | wg->x++; | |
191 | #endif | |
4ac4c323 | 192 | wi->group_x = wg->x; |
193 | wi->group_y = wg->y; | |
194 | wi->group_z = wg->z; | |
195 | ||
196 | wi->cur_wg_size_x = __hsail_currentworkgroupsize (0, wi); | |
197 | wi->cur_wg_size_y = __hsail_currentworkgroupsize (1, wi); | |
198 | wi->cur_wg_size_z = __hsail_currentworkgroupsize (2, wi); | |
55a56509 | 199 | |
200 | /* Reinitialize the work-group barrier according to the new WG's | |
201 | size, which might not be the same as the previous ones, due | |
202 | to "partial WGs". */ | |
203 | size_t wg_size = __hsail_currentworkgroupsize (0, wi) | |
204 | * __hsail_currentworkgroupsize (1, wi) | |
205 | * __hsail_currentworkgroupsize (2, wi); | |
206 | ||
207 | #ifdef DEBUG_PHSA_RT | |
208 | printf ("Reinitializing the WG barrier to %lu.\n", wg_size); | |
209 | #endif | |
210 | fiber_barrier_init ((fiber_barrier_t *) | |
211 | wi->launch_data->wg_sync_barrier, | |
212 | wg_size); | |
213 | ||
214 | #ifdef BENCHMARK_PHSA_RT | |
215 | if (wi_count % 1000 == 0) | |
216 | { | |
217 | clock_t spent_time = clock () - start_time; | |
218 | double spent_time_sec = (double) spent_time / CLOCKS_PER_SEC; | |
219 | double wis_per_sec = wi_count / spent_time_sec; | |
220 | uint64_t eta_sec | |
221 | = (wi_total - wi_count - wis_skipped) / wis_per_sec; | |
222 | ||
223 | printf ("%lu WIs executed %lu skipped in %lus (%lu WIs/s, ETA in " | |
224 | "%lu s)\n", | |
225 | wi_count, wis_skipped, (uint64_t) spent_time_sec, | |
226 | (uint64_t) wis_per_sec, (uint64_t) eta_sec); | |
227 | } | |
228 | #endif | |
229 | } | |
230 | } | |
231 | while (1); | |
232 | ||
233 | fiber_exit (); | |
234 | } | |
235 | #endif | |
236 | ||
237 | #define MIN(a, b) ((a < b) ? a : b) | |
238 | #define MAX(a, b) ((a > b) ? a : b) | |
239 | ||
240 | #ifdef HAVE_FIBERS | |
241 | /* Spawns a given number of work-items to execute a set of work-groups, | |
242 | blocks until their completion. */ | |
243 | ||
244 | static void | |
245 | phsa_execute_wi_gang (PHSAKernelLaunchData *context, void *group_base_ptr, | |
08ab8585 | 246 | uint32_t group_local_offset, size_t wg_size_x, |
247 | size_t wg_size_y, size_t wg_size_z) | |
55a56509 | 248 | { |
249 | PHSAWorkItem *wi_threads = NULL; | |
250 | PHSAWorkGroup wg; | |
251 | size_t flat_wi_id = 0, x, y, z, max_x, max_y, max_z; | |
4ac4c323 | 252 | uint32_t group_x, group_y, group_z; |
55a56509 | 253 | fiber_barrier_t wg_start_barrier; |
254 | fiber_barrier_t wg_completion_barrier; | |
255 | fiber_barrier_t wg_sync_barrier; | |
256 | ||
257 | max_x = wg_size_x == 0 ? 1 : wg_size_x; | |
258 | max_y = wg_size_y == 0 ? 1 : wg_size_y; | |
259 | max_z = wg_size_z == 0 ? 1 : wg_size_z; | |
260 | ||
261 | size_t wg_size = max_x * max_y * max_z; | |
262 | if (wg_size > PHSA_MAX_WG_SIZE) | |
263 | phsa_fatal_error (2); | |
264 | ||
265 | wg.private_segment_total_size = context->dp->private_segment_size * wg_size; | |
266 | if (wg.private_segment_total_size > 0 | |
267 | && posix_memalign (&wg.private_base_ptr, PRIVATE_SEGMENT_ALIGN, | |
268 | wg.private_segment_total_size) | |
269 | != 0) | |
270 | phsa_fatal_error (3); | |
271 | ||
c11ac926 | 272 | wg.alloca_stack_p = wg.private_segment_total_size + ALLOCA_OVERHEAD; |
55a56509 | 273 | wg.alloca_frame_p = wg.alloca_stack_p; |
08ab8585 | 274 | wg.initial_group_offset = group_local_offset; |
55a56509 | 275 | |
276 | #ifdef EXECUTE_WGS_BACKWARDS | |
4ac4c323 | 277 | group_x = context->wg_max_x - 1; |
278 | group_y = context->wg_max_y - 1; | |
279 | group_z = context->wg_max_z - 1; | |
55a56509 | 280 | #else |
4ac4c323 | 281 | group_x = context->wg_min_x; |
282 | group_y = context->wg_min_y; | |
283 | group_z = context->wg_min_z; | |
55a56509 | 284 | #endif |
285 | ||
286 | fiber_barrier_init (&wg_sync_barrier, wg_size); | |
287 | fiber_barrier_init (&wg_start_barrier, wg_size); | |
288 | fiber_barrier_init (&wg_completion_barrier, wg_size); | |
289 | ||
290 | context->wg_start_barrier = &wg_start_barrier; | |
291 | context->wg_sync_barrier = &wg_sync_barrier; | |
292 | context->wg_completion_barrier = &wg_completion_barrier; | |
293 | ||
294 | wg.more_wgs = 1; | |
295 | wg.group_base_ptr = group_base_ptr; | |
296 | ||
297 | #ifdef BENCHMARK_PHSA_RT | |
298 | wi_count = 0; | |
299 | wis_skipped = 0; | |
300 | start_time = clock (); | |
301 | #endif | |
302 | wi_threads = malloc (sizeof (PHSAWorkItem) * max_x * max_y * max_z); | |
303 | for (x = 0; x < max_x; ++x) | |
304 | for (y = 0; y < max_y; ++y) | |
305 | for (z = 0; z < max_z; ++z) | |
306 | { | |
307 | PHSAWorkItem *wi = &wi_threads[flat_wi_id]; | |
308 | wi->launch_data = context; | |
309 | wi->wg = &wg; | |
4ac4c323 | 310 | |
311 | wg.x = wi->group_x = group_x; | |
312 | wg.y = wi->group_y = group_y; | |
313 | wg.z = wi->group_z = group_z; | |
314 | ||
315 | wi->wg_size_x = context->dp->workgroup_size_x; | |
316 | wi->wg_size_y = context->dp->workgroup_size_y; | |
317 | wi->wg_size_z = context->dp->workgroup_size_z; | |
318 | ||
319 | wi->cur_wg_size_x = __hsail_currentworkgroupsize (0, wi); | |
320 | wi->cur_wg_size_y = __hsail_currentworkgroupsize (1, wi); | |
321 | wi->cur_wg_size_z = __hsail_currentworkgroupsize (2, wi); | |
322 | ||
55a56509 | 323 | wi->x = x; |
324 | wi->y = y; | |
325 | wi->z = z; | |
326 | ||
327 | /* TODO: set the stack size according to the private | |
328 | segment size. Too big stack consumes huge amount of | |
329 | memory in case of huge number of WIs and a too small stack | |
330 | will fail in mysterious and potentially dangerous ways. */ | |
331 | ||
332 | fiber_init (&wi->fiber, phsa_work_item_thread, wi, | |
333 | FIBER_STACK_SIZE, PRIVATE_SEGMENT_ALIGN); | |
334 | ++flat_wi_id; | |
335 | } | |
336 | ||
337 | do | |
338 | { | |
339 | --flat_wi_id; | |
340 | fiber_join (&wi_threads[flat_wi_id].fiber); | |
341 | } | |
342 | while (flat_wi_id > 0); | |
343 | ||
344 | if (wg.private_segment_total_size > 0) | |
345 | free (wg.private_base_ptr); | |
346 | ||
347 | free (wi_threads); | |
348 | } | |
349 | ||
350 | /* Spawn the work-item threads to execute work-groups and let | |
351 | them execute all the WGs, including a potential partial WG. */ | |
352 | ||
353 | static void | |
08ab8585 | 354 | phsa_spawn_work_items (PHSAKernelLaunchData *context, void *group_base_ptr, |
355 | uint32_t group_local_offset) | |
55a56509 | 356 | { |
357 | hsa_kernel_dispatch_packet_t *dp = context->dp; | |
358 | size_t x, y, z; | |
359 | ||
55a56509 | 360 | context->group_segment_start_addr = (size_t) group_base_ptr; |
361 | ||
362 | /* HSA seems to allow the WG size to be larger than the grid size. We need to | |
363 | saturate the effective WG size to the grid size to prevent the extra WIs | |
364 | from executing. */ | |
365 | size_t sat_wg_size_x, sat_wg_size_y, sat_wg_size_z, sat_wg_size; | |
366 | sat_wg_size_x = MIN (dp->workgroup_size_x, dp->grid_size_x); | |
367 | sat_wg_size_y = MIN (dp->workgroup_size_y, dp->grid_size_y); | |
368 | sat_wg_size_z = MIN (dp->workgroup_size_z, dp->grid_size_z); | |
369 | sat_wg_size = sat_wg_size_x * sat_wg_size_y * sat_wg_size_z; | |
370 | ||
371 | #ifdef BENCHMARK_PHSA_RT | |
372 | wi_total = (uint64_t) dp->grid_size_x | |
373 | * (dp->grid_size_y > 0 ? dp->grid_size_y : 1) | |
374 | * (dp->grid_size_z > 0 ? dp->grid_size_z : 1); | |
375 | #endif | |
376 | ||
377 | /* For now execute all work groups in a single coarse thread (does not utilize | |
378 | multicore/multithread). */ | |
379 | context->wg_min_x = context->wg_min_y = context->wg_min_z = 0; | |
380 | ||
381 | int dims = dp->setup & 0x3; | |
382 | ||
383 | context->wg_max_x = ((uint64_t) dp->grid_size_x + dp->workgroup_size_x - 1) | |
384 | / dp->workgroup_size_x; | |
385 | ||
386 | context->wg_max_y | |
387 | = dims < 2 ? 1 : ((uint64_t) dp->grid_size_y + dp->workgroup_size_y - 1) | |
388 | / dp->workgroup_size_y; | |
389 | ||
390 | context->wg_max_z | |
391 | = dims < 3 ? 1 : ((uint64_t) dp->grid_size_z + dp->workgroup_size_z - 1) | |
392 | / dp->workgroup_size_z; | |
393 | ||
394 | #ifdef DEBUG_PHSA_RT | |
395 | printf ("### launching work-groups %lu/%lu/%lu to %lu/%lu/%lu with " | |
396 | "wg size %lu/%lu/%lu grid size %u/%u/%u\n", | |
397 | context->wg_min_x, context->wg_min_y, context->wg_min_z, | |
398 | context->wg_max_x, context->wg_max_y, context->wg_max_z, | |
399 | sat_wg_size_x, sat_wg_size_y, sat_wg_size_z, dp->grid_size_x, | |
400 | dp->grid_size_y, dp->grid_size_z); | |
401 | #endif | |
402 | ||
08ab8585 | 403 | phsa_execute_wi_gang (context, group_base_ptr, group_local_offset, |
404 | sat_wg_size_x, sat_wg_size_y, sat_wg_size_z); | |
55a56509 | 405 | } |
406 | #endif | |
407 | ||
408 | /* Executes the given work-group function for all work groups in the grid. | |
409 | ||
410 | A work-group function is a version of the original kernel which executes | |
411 | the kernel for all work-items in a work-group. It is produced by gccbrig | |
412 | if it can handle the kernel's barrier usage and is much faster way to | |
413 | execute massive numbers of work-items in a non-SPMD machine than fibers | |
414 | (easily 100x faster). */ | |
415 | static void | |
08ab8585 | 416 | phsa_execute_work_groups (PHSAKernelLaunchData *context, void *group_base_ptr, |
417 | uint32_t group_local_offset) | |
55a56509 | 418 | { |
419 | hsa_kernel_dispatch_packet_t *dp = context->dp; | |
420 | size_t x, y, z, wg_x, wg_y, wg_z; | |
421 | ||
55a56509 | 422 | context->group_segment_start_addr = (size_t) group_base_ptr; |
423 | ||
424 | /* HSA seems to allow the WG size to be larger than the grid size. We need | |
425 | to saturate the effective WG size to the grid size to prevent the extra WIs | |
426 | from executing. */ | |
427 | size_t sat_wg_size_x, sat_wg_size_y, sat_wg_size_z, sat_wg_size; | |
428 | sat_wg_size_x = MIN (dp->workgroup_size_x, dp->grid_size_x); | |
429 | sat_wg_size_y = MIN (dp->workgroup_size_y, dp->grid_size_y); | |
430 | sat_wg_size_z = MIN (dp->workgroup_size_z, dp->grid_size_z); | |
431 | sat_wg_size = sat_wg_size_x * sat_wg_size_y * sat_wg_size_z; | |
432 | ||
433 | #ifdef BENCHMARK_PHSA_RT | |
434 | wi_total = (uint64_t) dp->grid_size_x | |
435 | * (dp->grid_size_y > 0 ? dp->grid_size_y : 1) | |
436 | * (dp->grid_size_z > 0 ? dp->grid_size_z : 1); | |
437 | #endif | |
438 | ||
439 | context->wg_min_x = context->wg_min_y = context->wg_min_z = 0; | |
440 | ||
441 | int dims = dp->setup & 0x3; | |
442 | ||
443 | context->wg_max_x = ((uint64_t) dp->grid_size_x + dp->workgroup_size_x - 1) | |
444 | / dp->workgroup_size_x; | |
445 | ||
446 | context->wg_max_y | |
447 | = dims < 2 ? 1 : ((uint64_t) dp->grid_size_y + dp->workgroup_size_y - 1) | |
448 | / dp->workgroup_size_y; | |
449 | ||
450 | context->wg_max_z | |
451 | = dims < 3 ? 1 : ((uint64_t) dp->grid_size_z + dp->workgroup_size_z - 1) | |
452 | / dp->workgroup_size_z; | |
453 | ||
454 | #ifdef DEBUG_PHSA_RT | |
455 | printf ("### launching work-groups %lu/%lu/%lu to %lu/%lu/%lu with " | |
456 | "wg size %lu/%lu/%lu grid size %u/%u/%u\n", | |
457 | context->wg_min_x, context->wg_min_y, context->wg_min_z, | |
458 | context->wg_max_x, context->wg_max_y, context->wg_max_z, | |
459 | sat_wg_size_x, sat_wg_size_y, sat_wg_size_z, dp->grid_size_x, | |
460 | dp->grid_size_y, dp->grid_size_z); | |
461 | #endif | |
462 | ||
463 | PHSAWorkItem wi; | |
464 | PHSAWorkGroup wg; | |
465 | wi.wg = &wg; | |
466 | wi.x = wi.y = wi.z = 0; | |
467 | wi.launch_data = context; | |
468 | ||
469 | #ifdef BENCHMARK_PHSA_RT | |
470 | start_time = clock (); | |
471 | uint64_t wg_count = 0; | |
472 | #endif | |
473 | ||
474 | size_t wg_size = __hsail_workgroupsize (0, &wi) | |
475 | * __hsail_workgroupsize (1, &wi) | |
476 | * __hsail_workgroupsize (2, &wi); | |
477 | ||
478 | void *private_base_ptr = NULL; | |
479 | if (dp->private_segment_size > 0 | |
480 | && posix_memalign (&private_base_ptr, PRIVATE_SEGMENT_ALIGN, | |
481 | dp->private_segment_size * wg_size) | |
482 | != 0) | |
483 | phsa_fatal_error (3); | |
484 | ||
c11ac926 | 485 | wg.alloca_stack_p = dp->private_segment_size * wg_size + ALLOCA_OVERHEAD; |
55a56509 | 486 | wg.alloca_frame_p = wg.alloca_stack_p; |
487 | ||
488 | wg.private_base_ptr = private_base_ptr; | |
489 | wg.group_base_ptr = group_base_ptr; | |
490 | ||
491 | #ifdef DEBUG_PHSA_RT | |
492 | printf ("priv seg size %u wg_size %lu @ %p\n", dp->private_segment_size, | |
493 | wg_size, private_base_ptr); | |
494 | #endif | |
495 | ||
496 | for (wg_z = context->wg_min_z; wg_z < context->wg_max_z; ++wg_z) | |
497 | for (wg_y = context->wg_min_y; wg_y < context->wg_max_y; ++wg_y) | |
498 | for (wg_x = context->wg_min_x; wg_x < context->wg_max_x; ++wg_x) | |
499 | { | |
4ac4c323 | 500 | wi.group_x = wg_x; |
501 | wi.group_y = wg_y; | |
502 | wi.group_z = wg_z; | |
503 | ||
504 | wi.wg_size_x = context->dp->workgroup_size_x; | |
505 | wi.wg_size_y = context->dp->workgroup_size_y; | |
506 | wi.wg_size_z = context->dp->workgroup_size_z; | |
507 | ||
508 | wi.cur_wg_size_x = __hsail_currentworkgroupsize (0, &wi); | |
509 | wi.cur_wg_size_y = __hsail_currentworkgroupsize (1, &wi); | |
510 | wi.cur_wg_size_z = __hsail_currentworkgroupsize (2, &wi); | |
55a56509 | 511 | |
512 | context->kernel (context->kernarg_addr, &wi, group_base_ptr, | |
08ab8585 | 513 | group_local_offset, private_base_ptr); |
55a56509 | 514 | |
515 | #if defined (BENCHMARK_PHSA_RT) | |
516 | wg_count++; | |
517 | if (wg_count % 1000000 == 0) | |
518 | { | |
519 | clock_t spent_time = clock () - start_time; | |
520 | uint64_t wi_count = wg_x * sat_wg_size_x + wg_y * sat_wg_size_y | |
521 | + wg_z * sat_wg_size_z; | |
522 | double spent_time_sec = (double) spent_time / CLOCKS_PER_SEC; | |
523 | double wis_per_sec = wi_count / spent_time_sec; | |
524 | uint64_t eta_sec = (wi_total - wi_count) / wis_per_sec; | |
525 | ||
526 | printf ("%lu WIs executed in %lus (%lu WIs/s, ETA in %lu s)\n", | |
527 | wi_count, (uint64_t) spent_time_sec, | |
528 | (uint64_t) wis_per_sec, (uint64_t) eta_sec); | |
529 | } | |
530 | #endif | |
531 | } | |
532 | ||
533 | #ifdef BENCHMARK_PHSA_RT | |
534 | clock_t spent_time = clock () - start_time; | |
535 | double spent_time_sec = (double) spent_time / CLOCKS_PER_SEC; | |
536 | double wis_per_sec = wi_total / spent_time_sec; | |
537 | ||
538 | printf ("### %lu WIs executed in %lu s (%lu WIs / s)\n", wi_total, | |
539 | (uint64_t) spent_time_sec, (uint64_t) wis_per_sec); | |
540 | #endif | |
55a56509 | 541 | free (private_base_ptr); |
542 | private_base_ptr = NULL; | |
543 | } | |
544 | ||
545 | /* gccbrig generates the following from each HSAIL kernel: | |
546 | ||
547 | 1) The actual kernel function (a single work-item kernel or a work-group | |
548 | function) generated from HSAIL (BRIG). | |
549 | ||
550 | static void _Kernel (void* args, void* context, void* group_base_ptr) | |
551 | { | |
552 | ... | |
553 | } | |
554 | ||
555 | 2) A public facing kernel function that is called from the PHSA runtime: | |
556 | ||
557 | a) A single work-item function (that requires fibers for multi-WI): | |
558 | ||
559 | void Kernel (void* context) | |
560 | { | |
561 | __launch_launch_kernel (_Kernel, context); | |
562 | } | |
563 | ||
564 | or | |
565 | ||
566 | b) a when gccbrig could generate a work-group function: | |
567 | ||
568 | void Kernel (void* context) | |
569 | { | |
570 | __hsail_launch_wg_function (_Kernel, context); | |
571 | } | |
572 | */ | |
573 | ||
574 | #ifdef HAVE_FIBERS | |
575 | ||
576 | void | |
577 | __hsail_launch_kernel (gccbrigKernelFunc kernel, PHSAKernelLaunchData *context, | |
08ab8585 | 578 | void *group_base_ptr, uint32_t group_local_offset) |
55a56509 | 579 | { |
580 | context->kernel = kernel; | |
08ab8585 | 581 | phsa_spawn_work_items (context, group_base_ptr, group_local_offset); |
55a56509 | 582 | } |
583 | #endif | |
584 | ||
585 | void | |
586 | __hsail_launch_wg_function (gccbrigKernelFunc kernel, | |
08ab8585 | 587 | PHSAKernelLaunchData *context, void *group_base_ptr, |
588 | uint32_t group_local_offset) | |
55a56509 | 589 | { |
590 | context->kernel = kernel; | |
08ab8585 | 591 | phsa_execute_work_groups (context, group_base_ptr, group_local_offset); |
55a56509 | 592 | } |
593 | ||
594 | uint32_t | |
595 | __hsail_workitemabsid (uint32_t dim, PHSAWorkItem *context) | |
596 | { | |
597 | hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp; | |
598 | ||
599 | uint32_t id; | |
600 | switch (dim) | |
601 | { | |
602 | default: | |
603 | case 0: | |
604 | /* Overflow semantics in the case of WG dim > grid dim. */ | |
4ac4c323 | 605 | id = ((uint64_t) context->group_x * dp->workgroup_size_x + context->x) |
55a56509 | 606 | % dp->grid_size_x; |
607 | break; | |
608 | case 1: | |
4ac4c323 | 609 | id = ((uint64_t) context->group_y * dp->workgroup_size_y + context->y) |
55a56509 | 610 | % dp->grid_size_y; |
611 | break; | |
612 | case 2: | |
4ac4c323 | 613 | id = ((uint64_t) context->group_z * dp->workgroup_size_z + context->z) |
55a56509 | 614 | % dp->grid_size_z; |
615 | break; | |
616 | } | |
617 | return id; | |
618 | } | |
619 | ||
620 | uint64_t | |
621 | __hsail_workitemabsid_u64 (uint32_t dim, PHSAWorkItem *context) | |
622 | { | |
623 | hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp; | |
624 | ||
625 | uint64_t id; | |
626 | switch (dim) | |
627 | { | |
628 | default: | |
629 | case 0: | |
630 | /* Overflow semantics in the case of WG dim > grid dim. */ | |
4ac4c323 | 631 | id = ((uint64_t) context->group_x * dp->workgroup_size_x + context->x) |
55a56509 | 632 | % dp->grid_size_x; |
633 | break; | |
634 | case 1: | |
4ac4c323 | 635 | id = ((uint64_t) context->group_y * dp->workgroup_size_y + context->y) |
55a56509 | 636 | % dp->grid_size_y; |
637 | break; | |
638 | case 2: | |
4ac4c323 | 639 | id = ((uint64_t) context->group_z * dp->workgroup_size_z + context->z) |
55a56509 | 640 | % dp->grid_size_z; |
641 | break; | |
642 | } | |
643 | return id; | |
644 | } | |
645 | ||
646 | ||
647 | uint32_t | |
648 | __hsail_workitemid (uint32_t dim, PHSAWorkItem *context) | |
649 | { | |
650 | PHSAWorkItem *c = (PHSAWorkItem *) context; | |
651 | hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp; | |
652 | ||
653 | /* The number of dimensions is in the two least significant bits. */ | |
654 | int dims = dp->setup & 0x3; | |
655 | ||
656 | uint32_t id; | |
657 | switch (dim) | |
658 | { | |
659 | default: | |
660 | case 0: | |
661 | id = c->x; | |
662 | break; | |
663 | case 1: | |
664 | id = dims < 2 ? 0 : c->y; | |
665 | break; | |
666 | case 2: | |
667 | id = dims < 3 ? 0 : c->z; | |
668 | break; | |
669 | } | |
670 | return id; | |
671 | } | |
672 | ||
673 | uint32_t | |
674 | __hsail_gridgroups (uint32_t dim, PHSAWorkItem *context) | |
675 | { | |
676 | hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp; | |
677 | int dims = dp->setup & 0x3; | |
678 | ||
679 | uint32_t id; | |
680 | switch (dim) | |
681 | { | |
682 | default: | |
683 | case 0: | |
684 | id = (dp->grid_size_x + dp->workgroup_size_x - 1) / dp->workgroup_size_x; | |
685 | break; | |
686 | case 1: | |
687 | id = dims < 2 ? 1 : (dp->grid_size_y + dp->workgroup_size_y - 1) | |
688 | / dp->workgroup_size_y; | |
689 | break; | |
690 | case 2: | |
691 | id = dims < 3 ? 1 : (dp->grid_size_z + dp->workgroup_size_z - 1) | |
692 | / dp->workgroup_size_z; | |
693 | break; | |
694 | } | |
695 | return id; | |
696 | } | |
697 | ||
698 | uint32_t | |
699 | __hsail_workitemflatid (PHSAWorkItem *c) | |
700 | { | |
701 | hsa_kernel_dispatch_packet_t *dp = c->launch_data->dp; | |
702 | ||
703 | return c->x + c->y * dp->workgroup_size_x | |
704 | + c->z * dp->workgroup_size_x * dp->workgroup_size_y; | |
705 | } | |
706 | ||
707 | uint32_t | |
708 | __hsail_currentworkitemflatid (PHSAWorkItem *c) | |
709 | { | |
710 | hsa_kernel_dispatch_packet_t *dp = c->launch_data->dp; | |
711 | ||
712 | return c->x + c->y * __hsail_currentworkgroupsize (0, c) | |
713 | + c->z * __hsail_currentworkgroupsize (0, c) | |
714 | * __hsail_currentworkgroupsize (1, c); | |
715 | } | |
716 | ||
717 | void | |
718 | __hsail_setworkitemid (uint32_t dim, uint32_t id, PHSAWorkItem *context) | |
719 | { | |
720 | switch (dim) | |
721 | { | |
722 | default: | |
723 | case 0: | |
724 | context->x = id; | |
725 | break; | |
726 | case 1: | |
727 | context->y = id; | |
728 | break; | |
729 | case 2: | |
730 | context->z = id; | |
731 | break; | |
732 | } | |
733 | } | |
734 | ||
735 | uint64_t | |
736 | __hsail_workitemflatabsid_u64 (PHSAWorkItem *context) | |
737 | { | |
738 | PHSAWorkItem *c = (PHSAWorkItem *) context; | |
739 | hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp; | |
740 | ||
741 | /* Work-item flattened absolute ID = ID0 + ID1 * max0 + ID2 * max0 * max1. */ | |
742 | uint64_t id0 = __hsail_workitemabsid (0, context); | |
743 | uint64_t id1 = __hsail_workitemabsid (1, context); | |
744 | uint64_t id2 = __hsail_workitemabsid (2, context); | |
745 | ||
746 | uint64_t max0 = dp->grid_size_x; | |
747 | uint64_t max1 = dp->grid_size_y; | |
748 | uint64_t id = id0 + id1 * max0 + id2 * max0 * max1; | |
749 | ||
750 | return id; | |
751 | } | |
752 | ||
753 | uint32_t | |
754 | __hsail_workitemflatabsid_u32 (PHSAWorkItem *context) | |
755 | { | |
756 | PHSAWorkItem *c = (PHSAWorkItem *) context; | |
757 | hsa_kernel_dispatch_packet_t *dp = context->launch_data->dp; | |
758 | ||
759 | /* work-item flattened absolute ID = ID0 + ID1 * max0 + ID2 * max0 * max1. */ | |
760 | uint64_t id0 = __hsail_workitemabsid (0, context); | |
761 | uint64_t id1 = __hsail_workitemabsid (1, context); | |
762 | uint64_t id2 = __hsail_workitemabsid (2, context); | |
763 | ||
764 | uint64_t max0 = dp->grid_size_x; | |
765 | uint64_t max1 = dp->grid_size_y; | |
766 | uint64_t id = id0 + id1 * max0 + id2 * max0 * max1; | |
767 | return (uint32_t) id; | |
768 | } | |
769 | ||
770 | uint32_t | |
771 | __hsail_currentworkgroupsize (uint32_t dim, PHSAWorkItem *wi) | |
772 | { | |
773 | hsa_kernel_dispatch_packet_t *dp = wi->launch_data->dp; | |
774 | uint32_t wg_size = 0; | |
775 | switch (dim) | |
776 | { | |
777 | default: | |
778 | case 0: | |
4ac4c323 | 779 | if ((uint64_t) wi->group_x < dp->grid_size_x / dp->workgroup_size_x) |
55a56509 | 780 | wg_size = dp->workgroup_size_x; /* Full WG. */ |
781 | else | |
782 | wg_size = dp->grid_size_x % dp->workgroup_size_x; /* Partial WG. */ | |
783 | break; | |
784 | case 1: | |
4ac4c323 | 785 | if ((uint64_t) wi->group_y < dp->grid_size_y / dp->workgroup_size_y) |
55a56509 | 786 | wg_size = dp->workgroup_size_y; /* Full WG. */ |
787 | else | |
788 | wg_size = dp->grid_size_y % dp->workgroup_size_y; /* Partial WG. */ | |
789 | break; | |
790 | case 2: | |
4ac4c323 | 791 | if ((uint64_t) wi->group_z < dp->grid_size_z / dp->workgroup_size_z) |
55a56509 | 792 | wg_size = dp->workgroup_size_z; /* Full WG. */ |
793 | else | |
794 | wg_size = dp->grid_size_z % dp->workgroup_size_z; /* Partial WG. */ | |
795 | break; | |
796 | } | |
797 | return wg_size; | |
798 | } | |
799 | ||
800 | uint32_t | |
801 | __hsail_workgroupsize (uint32_t dim, PHSAWorkItem *wi) | |
802 | { | |
803 | hsa_kernel_dispatch_packet_t *dp = wi->launch_data->dp; | |
804 | switch (dim) | |
805 | { | |
806 | default: | |
807 | case 0: | |
808 | return dp->workgroup_size_x; | |
809 | case 1: | |
810 | return dp->workgroup_size_y; | |
811 | case 2: | |
812 | return dp->workgroup_size_z; | |
813 | } | |
814 | } | |
815 | ||
816 | uint32_t | |
817 | __hsail_gridsize (uint32_t dim, PHSAWorkItem *wi) | |
818 | { | |
819 | hsa_kernel_dispatch_packet_t *dp = wi->launch_data->dp; | |
820 | switch (dim) | |
821 | { | |
822 | default: | |
823 | case 0: | |
824 | return dp->grid_size_x; | |
825 | case 1: | |
826 | return dp->grid_size_y; | |
827 | case 2: | |
828 | return dp->grid_size_z; | |
829 | } | |
830 | } | |
831 | ||
832 | uint32_t | |
833 | __hsail_workgroupid (uint32_t dim, PHSAWorkItem *wi) | |
834 | { | |
835 | switch (dim) | |
836 | { | |
837 | default: | |
838 | case 0: | |
4ac4c323 | 839 | return wi->group_x; |
55a56509 | 840 | case 1: |
4ac4c323 | 841 | return wi->group_y; |
55a56509 | 842 | case 2: |
4ac4c323 | 843 | return wi->group_z; |
55a56509 | 844 | } |
845 | } | |
846 | ||
847 | uint32_t | |
848 | __hsail_dim (PHSAWorkItem *wi) | |
849 | { | |
850 | hsa_kernel_dispatch_packet_t *dp = wi->launch_data->dp; | |
851 | return dp->setup & 0x3; | |
852 | } | |
853 | ||
854 | uint64_t | |
855 | __hsail_packetid (PHSAWorkItem *wi) | |
856 | { | |
857 | return wi->launch_data->packet_id; | |
858 | } | |
859 | ||
860 | uint32_t | |
861 | __hsail_packetcompletionsig_sig32 (PHSAWorkItem *wi) | |
862 | { | |
863 | return (uint32_t) wi->launch_data->dp->completion_signal.handle; | |
864 | } | |
865 | ||
866 | uint64_t | |
867 | __hsail_packetcompletionsig_sig64 (PHSAWorkItem *wi) | |
868 | { | |
869 | return (uint64_t) (wi->launch_data->dp->completion_signal.handle); | |
870 | } | |
871 | ||
872 | #ifdef HAVE_FIBERS | |
873 | void | |
874 | __hsail_barrier (PHSAWorkItem *wi) | |
875 | { | |
876 | fiber_barrier_reach ((fiber_barrier_t *) wi->launch_data->wg_sync_barrier); | |
877 | } | |
878 | #endif | |
879 | ||
880 | /* Return a 32b private segment address that points to a dynamically | |
881 | allocated chunk of 'size' with 'align'. | |
882 | ||
883 | Allocates the space from the end of the private segment allocated | |
884 | for the whole work group. In implementations with separate private | |
885 | memories per WI, we will need to have a stack pointer per WI. But in | |
886 | the current implementation, the segment is shared, so we possibly | |
887 | save some space in case all WIs do not call the alloca. | |
888 | ||
889 | The "alloca frames" are organized as follows: | |
890 | ||
891 | wg->alloca_stack_p points to the last allocated data (initially | |
892 | outside the private segment) | |
893 | wg->alloca_frame_p points to the first address _outside_ the current | |
894 | function's allocations (initially to the same as alloca_stack_p) | |
895 | ||
896 | The data is allocated downwards from the end of the private segment. | |
897 | ||
898 | In the beginning of a new function which has allocas, a new alloca | |
899 | frame is pushed which adds the current alloca_frame_p (the current | |
900 | function's frame starting point) to the top of the alloca stack and | |
901 | alloca_frame_p is set to the current stack position. | |
902 | ||
903 | At the exit points of a function with allocas, the alloca frame | |
904 | is popped before returning. This involves popping the alloca_frame_p | |
905 | to the one of the previous function in the call stack, and alloca_stack_p | |
906 | similarly, to the position of the last word alloca'd by the previous | |
907 | function. | |
908 | */ | |
909 | ||
910 | uint32_t | |
911 | __hsail_alloca (uint32_t size, uint32_t align, PHSAWorkItem *wi) | |
912 | { | |
913 | volatile PHSAWorkGroup *wg = wi->wg; | |
c11ac926 | 914 | int64_t new_pos = wg->alloca_stack_p - size; |
55a56509 | 915 | while (new_pos % align != 0) |
916 | new_pos--; | |
c11ac926 | 917 | if (new_pos < 0) |
918 | phsa_fatal_error (2); | |
919 | ||
55a56509 | 920 | wg->alloca_stack_p = new_pos; |
921 | ||
922 | #ifdef DEBUG_ALLOCA | |
923 | printf ("--- alloca (%u, %u) sp @%u fp @%u\n", size, align, | |
924 | wg->alloca_stack_p, wg->alloca_frame_p); | |
925 | #endif | |
926 | return new_pos; | |
927 | } | |
928 | ||
929 | /* Initializes a new "alloca frame" in the private segment. | |
930 | This should be called at all the function entry points in case | |
931 | the function contains at least one call to alloca. */ | |
932 | ||
933 | void | |
934 | __hsail_alloca_push_frame (PHSAWorkItem *wi) | |
935 | { | |
936 | volatile PHSAWorkGroup *wg = wi->wg; | |
937 | ||
938 | /* Store the alloca_frame_p without any alignment padding so | |
939 | we know exactly where the previous frame ended after popping | |
940 | it. */ | |
941 | #ifdef DEBUG_ALLOCA | |
942 | printf ("--- push frame "); | |
943 | #endif | |
944 | uint32_t last_word_offs = __hsail_alloca (4, 1, wi); | |
945 | memcpy (wg->private_base_ptr + last_word_offs, | |
946 | (const void *) &wg->alloca_frame_p, 4); | |
947 | wg->alloca_frame_p = last_word_offs; | |
948 | ||
949 | #ifdef DEBUG_ALLOCA | |
950 | printf ("--- sp @%u fp @%u\n", wg->alloca_stack_p, wg->alloca_frame_p); | |
951 | #endif | |
952 | } | |
953 | ||
954 | /* Frees the current "alloca frame" and restores the frame | |
955 | pointer. | |
956 | This should be called at all the function return points in case | |
957 | the function contains at least one call to alloca. Restores the | |
958 | alloca stack to the condition it was before pushing the frame | |
959 | the last time. */ | |
960 | void | |
961 | __hsail_alloca_pop_frame (PHSAWorkItem *wi) | |
962 | { | |
963 | volatile PHSAWorkGroup *wg = wi->wg; | |
964 | ||
965 | wg->alloca_stack_p = wg->alloca_frame_p; | |
816d679b | 966 | memcpy ((void *) &wg->alloca_frame_p, |
55a56509 | 967 | (const void *) (wg->private_base_ptr + wg->alloca_frame_p), 4); |
968 | /* Now frame_p points to the beginning of the previous function's | |
969 | frame and stack_p to its end. */ | |
970 | ||
971 | wg->alloca_stack_p += 4; | |
972 | ||
973 | #ifdef DEBUG_ALLOCA | |
974 | printf ("--- pop frame sp @%u fp @%u\n", wg->alloca_stack_p, | |
975 | wg->alloca_frame_p); | |
976 | #endif | |
977 | } |