]> git.ipfire.org Git - thirdparty/gcc.git/blame - gcc/config/gcn/gcn-run.c
Update copyright years.
[thirdparty/gcc.git] / gcc / config / gcn / gcn-run.c
CommitLineData
5326695a
AS
1/* Run a stand-alone AMD GCN kernel.
2
3 Copyright 2017 Mentor Graphics Corporation
99dee823 4 Copyright (C) 2018-2021 Free Software Foundation, Inc.
5326695a
AS
5
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>. */
18
19/* This program will run a compiled stand-alone GCN kernel on a GPU.
20
21 The kernel entry point's signature must use a standard main signature:
22
23 int main(int argc, char **argv)
24*/
25
26#include <stdint.h>
27#include <stdbool.h>
28#include <stdlib.h>
29#include <malloc.h>
30#include <stdio.h>
31#include <string.h>
32#include <dlfcn.h>
33#include <unistd.h>
34#include <elf.h>
35#include <signal.h>
36
5326695a
AS
37#include "hsa.h"
38
39#ifndef HSA_RUNTIME_LIB
f062c3f1 40#define HSA_RUNTIME_LIB "libhsa-runtime64.so.1"
5326695a
AS
41#endif
42
43#ifndef VERSION_STRING
44#define VERSION_STRING "(version unknown)"
45#endif
46
47bool debug = false;
48
49hsa_agent_t device = { 0 };
50hsa_queue_t *queue = NULL;
7039cebf
KCY
51uint64_t init_array_kernel = 0;
52uint64_t fini_array_kernel = 0;
53uint64_t main_kernel = 0;
5326695a
AS
54hsa_executable_t executable = { 0 };
55
56hsa_region_t kernargs_region = { 0 };
e3d0ee4a 57hsa_region_t heap_region = { 0 };
5326695a
AS
58uint32_t kernarg_segment_size = 0;
59uint32_t group_segment_size = 0;
60uint32_t private_segment_size = 0;
61
62static void
63usage (const char *progname)
64{
65 printf ("Usage: %s [options] kernel [kernel-args]\n\n"
66 "Options:\n"
67 " --help\n"
68 " --version\n"
69 " --debug\n", progname);
70}
71
72static void
73version (const char *progname)
74{
75 printf ("%s " VERSION_STRING "\n", progname);
76}
77
78/* As an HSA runtime is dlopened, following structure defines the necessary
79 function pointers.
80 Code adapted from libgomp. */
81
82struct hsa_runtime_fn_info
83{
84 /* HSA runtime. */
85 hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
86 const char **status_string);
87 hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
88 hsa_agent_info_t attribute,
89 void *value);
90 hsa_status_t (*hsa_init_fn) (void);
91 hsa_status_t (*hsa_iterate_agents_fn)
92 (hsa_status_t (*callback) (hsa_agent_t agent, void *data), void *data);
93 hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
94 hsa_region_info_t attribute,
95 void *value);
96 hsa_status_t (*hsa_queue_create_fn)
97 (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
98 void (*callback) (hsa_status_t status, hsa_queue_t *source, void *data),
99 void *data, uint32_t private_segment_size,
100 uint32_t group_segment_size, hsa_queue_t **queue);
101 hsa_status_t (*hsa_agent_iterate_regions_fn)
102 (hsa_agent_t agent,
103 hsa_status_t (*callback) (hsa_region_t region, void *data), void *data);
104 hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
105 hsa_status_t (*hsa_executable_create_fn)
106 (hsa_profile_t profile, hsa_executable_state_t executable_state,
107 const char *options, hsa_executable_t *executable);
108 hsa_status_t (*hsa_executable_global_variable_define_fn)
109 (hsa_executable_t executable, const char *variable_name, void *address);
110 hsa_status_t (*hsa_executable_load_code_object_fn)
111 (hsa_executable_t executable, hsa_agent_t agent,
112 hsa_code_object_t code_object, const char *options);
113 hsa_status_t (*hsa_executable_freeze_fn) (hsa_executable_t executable,
114 const char *options);
115 hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
116 uint32_t num_consumers,
117 const hsa_agent_t *consumers,
118 hsa_signal_t *signal);
119 hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
120 void **ptr);
e3d0ee4a
AS
121 hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent,
122 hsa_access_permission_t access);
5326695a
AS
123 hsa_status_t (*hsa_memory_copy_fn) (void *dst, const void *src,
124 size_t size);
125 hsa_status_t (*hsa_memory_free_fn) (void *ptr);
126 hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
127 hsa_status_t (*hsa_executable_get_symbol_fn)
128 (hsa_executable_t executable, const char *module_name,
129 const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
130 hsa_executable_symbol_t *symbol);
131 hsa_status_t (*hsa_executable_symbol_get_info_fn)
132 (hsa_executable_symbol_t executable_symbol,
133 hsa_executable_symbol_info_t attribute, void *value);
134 void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
135 hsa_signal_value_t value);
136 hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
137 (hsa_signal_t signal, hsa_signal_condition_t condition,
138 hsa_signal_value_t compare_value, uint64_t timeout_hint,
139 hsa_wait_state_t wait_state_hint);
140 hsa_signal_value_t (*hsa_signal_wait_relaxed_fn)
141 (hsa_signal_t signal, hsa_signal_condition_t condition,
142 hsa_signal_value_t compare_value, uint64_t timeout_hint,
143 hsa_wait_state_t wait_state_hint);
144 hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);
145 hsa_status_t (*hsa_code_object_deserialize_fn)
146 (void *serialized_code_object, size_t serialized_code_object_size,
147 const char *options, hsa_code_object_t *code_object);
148 uint64_t (*hsa_queue_load_write_index_relaxed_fn)
149 (const hsa_queue_t *queue);
150 void (*hsa_queue_store_write_index_relaxed_fn)
151 (const hsa_queue_t *queue, uint64_t value);
152 hsa_status_t (*hsa_shut_down_fn) ();
153};
154
155/* HSA runtime functions that are initialized in init_hsa_context.
156 Code adapted from libgomp. */
157
158static struct hsa_runtime_fn_info hsa_fns;
159
160#define DLSYM_FN(function) \
161 *(void**)(&hsa_fns.function##_fn) = dlsym (handle, #function); \
162 if (hsa_fns.function##_fn == NULL) \
163 goto fail;
164
165static void
166init_hsa_runtime_functions (void)
167{
168 void *handle = dlopen (HSA_RUNTIME_LIB, RTLD_LAZY);
169 if (handle == NULL)
170 {
171 fprintf (stderr,
172 "The HSA runtime is required to run GCN kernels on hardware.\n"
173 "%s: File not found or could not be opened\n",
174 HSA_RUNTIME_LIB);
175 exit (1);
176 }
177
178 DLSYM_FN (hsa_status_string)
179 DLSYM_FN (hsa_agent_get_info)
180 DLSYM_FN (hsa_init)
181 DLSYM_FN (hsa_iterate_agents)
182 DLSYM_FN (hsa_region_get_info)
183 DLSYM_FN (hsa_queue_create)
184 DLSYM_FN (hsa_agent_iterate_regions)
185 DLSYM_FN (hsa_executable_destroy)
186 DLSYM_FN (hsa_executable_create)
187 DLSYM_FN (hsa_executable_global_variable_define)
188 DLSYM_FN (hsa_executable_load_code_object)
189 DLSYM_FN (hsa_executable_freeze)
190 DLSYM_FN (hsa_signal_create)
191 DLSYM_FN (hsa_memory_allocate)
e3d0ee4a 192 DLSYM_FN (hsa_memory_assign_agent)
5326695a
AS
193 DLSYM_FN (hsa_memory_copy)
194 DLSYM_FN (hsa_memory_free)
195 DLSYM_FN (hsa_signal_destroy)
196 DLSYM_FN (hsa_executable_get_symbol)
197 DLSYM_FN (hsa_executable_symbol_get_info)
198 DLSYM_FN (hsa_signal_wait_acquire)
199 DLSYM_FN (hsa_signal_wait_relaxed)
200 DLSYM_FN (hsa_signal_store_relaxed)
201 DLSYM_FN (hsa_queue_destroy)
202 DLSYM_FN (hsa_code_object_deserialize)
203 DLSYM_FN (hsa_queue_load_write_index_relaxed)
204 DLSYM_FN (hsa_queue_store_write_index_relaxed)
205 DLSYM_FN (hsa_shut_down)
206
207 return;
208
209fail:
210 fprintf (stderr, "Failed to find HSA functions in " HSA_RUNTIME_LIB "\n");
211 exit (1);
212}
213
214#undef DLSYM_FN
215
216/* Report a fatal error STR together with the HSA error corresponding to
217 STATUS and terminate execution of the current process. */
218
219static void
220hsa_fatal (const char *str, hsa_status_t status)
221{
222 const char *hsa_error_msg;
223 hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
224 fprintf (stderr, "%s: FAILED\nHSA Runtime message: %s\n", str,
225 hsa_error_msg);
226 exit (1);
227}
228
229/* Helper macros to ensure we check the return values from the HSA Runtime.
230 These just keep the rest of the code a bit cleaner. */
231
232#define XHSA_CMP(FN, CMP, MSG) \
233 do { \
234 hsa_status_t status = (FN); \
235 if (!(CMP)) \
236 hsa_fatal ((MSG), status); \
237 else if (debug) \
238 fprintf (stderr, "%s: OK\n", (MSG)); \
239 } while (0)
240#define XHSA(FN, MSG) XHSA_CMP(FN, status == HSA_STATUS_SUCCESS, MSG)
241
242/* Callback of hsa_iterate_agents.
243 Called once for each available device, and returns "break" when a
244 suitable one has been found. */
245
246static hsa_status_t
247get_gpu_agent (hsa_agent_t agent, void *data __attribute__ ((unused)))
248{
249 hsa_device_type_t device_type;
250 XHSA (hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
251 &device_type),
252 "Get agent type");
253
254 /* Select only GPU devices. */
255 /* TODO: support selecting from multiple GPUs. */
256 if (HSA_DEVICE_TYPE_GPU == device_type)
257 {
258 device = agent;
259 return HSA_STATUS_INFO_BREAK;
260 }
261
262 /* The device was not suitable. */
263 return HSA_STATUS_SUCCESS;
264}
265
266/* Callback of hsa_iterate_regions.
267 Called once for each available memory region, and returns "break" when a
268 suitable one has been found. */
269
270static hsa_status_t
e3d0ee4a
AS
271get_memory_region (hsa_region_t region, hsa_region_t *retval,
272 hsa_region_global_flag_t kind)
5326695a
AS
273{
274 /* Reject non-global regions. */
275 hsa_region_segment_t segment;
276 hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT, &segment);
277 if (HSA_REGION_SEGMENT_GLOBAL != segment)
278 return HSA_STATUS_SUCCESS;
279
280 /* Find a region with the KERNARG flag set. */
281 hsa_region_global_flag_t flags;
282 hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
283 &flags);
e3d0ee4a 284 if (flags & kind)
5326695a 285 {
e3d0ee4a 286 *retval = region;
5326695a
AS
287 return HSA_STATUS_INFO_BREAK;
288 }
289
290 /* The region was not suitable. */
291 return HSA_STATUS_SUCCESS;
292}
293
e3d0ee4a
AS
294static hsa_status_t
295get_kernarg_region (hsa_region_t region, void *data __attribute__((unused)))
296{
297 return get_memory_region (region, &kernargs_region,
298 HSA_REGION_GLOBAL_FLAG_KERNARG);
299}
300
301static hsa_status_t
302get_heap_region (hsa_region_t region, void *data __attribute__((unused)))
303{
304 return get_memory_region (region, &heap_region,
305 HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED);
306}
307
5326695a
AS
308/* Initialize the HSA Runtime library and GPU device. */
309
310static void
311init_device ()
312{
313 /* Load the shared library and find the API functions. */
314 init_hsa_runtime_functions ();
315
316 /* Initialize the HSA Runtime. */
317 XHSA (hsa_fns.hsa_init_fn (),
318 "Initialize run-time");
319
320 /* Select a suitable device.
321 The call-back function, get_gpu_agent, does the selection. */
322 XHSA_CMP (hsa_fns.hsa_iterate_agents_fn (get_gpu_agent, NULL),
323 status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK,
324 "Find a device");
325
326 /* Initialize the queue used for launching kernels. */
327 uint32_t queue_size = 0;
328 XHSA (hsa_fns.hsa_agent_get_info_fn (device, HSA_AGENT_INFO_QUEUE_MAX_SIZE,
329 &queue_size),
330 "Find max queue size");
331 XHSA (hsa_fns.hsa_queue_create_fn (device, queue_size,
332 HSA_QUEUE_TYPE_SINGLE, NULL,
333 NULL, UINT32_MAX, UINT32_MAX, &queue),
334 "Set up a device queue");
335
336 /* Select a memory region for the kernel arguments.
337 The call-back function, get_kernarg_region, does the selection. */
338 XHSA_CMP (hsa_fns.hsa_agent_iterate_regions_fn (device, get_kernarg_region,
339 NULL),
340 status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK,
341 "Locate kernargs memory");
e3d0ee4a
AS
342
343 /* Select a memory region for the kernel heap.
344 The call-back function, get_heap_region, does the selection. */
345 XHSA_CMP (hsa_fns.hsa_agent_iterate_regions_fn (device, get_heap_region,
346 NULL),
347 status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK,
348 "Locate device memory");
5326695a
AS
349}
350
351
352/* Read a whole input file.
353 Code copied from mkoffload. */
354
355static char *
356read_file (const char *filename, size_t *plen)
357{
358 size_t alloc = 16384;
359 size_t base = 0;
360 char *buffer;
361
362 FILE *stream = fopen (filename, "rb");
363 if (!stream)
364 {
365 perror (filename);
366 exit (1);
367 }
368
369 if (!fseek (stream, 0, SEEK_END))
370 {
371 /* Get the file size. */
372 long s = ftell (stream);
373 if (s >= 0)
374 alloc = s + 100;
375 fseek (stream, 0, SEEK_SET);
376 }
377 buffer = malloc (alloc);
378
379 for (;;)
380 {
381 size_t n = fread (buffer + base, 1, alloc - base - 1, stream);
382
383 if (!n)
384 break;
385 base += n;
386 if (base + 1 == alloc)
387 {
388 alloc *= 2;
389 buffer = realloc (buffer, alloc);
390 }
391 }
392 buffer[base] = 0;
393 *plen = base;
394
395 fclose (stream);
396
397 return buffer;
398}
399
400/* Read a HSA Code Object (HSACO) from file, and load it into the device. */
401
402static void
403load_image (const char *filename)
404{
405 size_t image_size;
406 Elf64_Ehdr *image = (void *) read_file (filename, &image_size);
407
408 /* An "executable" consists of one or more code objects. */
409 XHSA (hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
410 HSA_EXECUTABLE_STATE_UNFROZEN, "",
411 &executable),
412 "Initialize GCN executable");
413
5326695a
AS
414 /* Add the HSACO to the executable. */
415 hsa_code_object_t co = { 0 };
416 XHSA (hsa_fns.hsa_code_object_deserialize_fn (image, image_size, NULL, &co),
417 "Deserialize GCN code object");
418 XHSA (hsa_fns.hsa_executable_load_code_object_fn (executable, device, co,
419 ""),
420 "Load GCN code object");
421
422 /* We're done modifying he executable. */
423 XHSA (hsa_fns.hsa_executable_freeze_fn (executable, ""),
424 "Freeze GCN executable");
425
7039cebf 426 /* Locate the "_init_array" function, and read the kernel's properties. */
5326695a 427 hsa_executable_symbol_t symbol;
f062c3f1
AS
428 XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL,
429 "_init_array.kd", device, 0,
430 &symbol),
7039cebf
KCY
431 "Find '_init_array' function");
432 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
f062c3f1
AS
433 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
434 &init_array_kernel),
7039cebf
KCY
435 "Extract '_init_array' kernel object kernel object");
436
437 /* Locate the "_fini_array" function, and read the kernel's properties. */
f062c3f1
AS
438 XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL,
439 "_fini_array.kd", device, 0,
440 &symbol),
7039cebf
KCY
441 "Find '_fini_array' function");
442 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
f062c3f1
AS
443 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
444 &fini_array_kernel),
7039cebf
KCY
445 "Extract '_fini_array' kernel object kernel object");
446
447 /* Locate the "main" function, and read the kernel's properties. */
f062c3f1 448 XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "main.kd",
5326695a
AS
449 device, 0, &symbol),
450 "Find 'main' function");
451 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
7039cebf
KCY
452 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &main_kernel),
453 "Extract 'main' kernel object");
5326695a
AS
454 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
455 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
456 &kernarg_segment_size),
457 "Extract kernarg segment size");
458 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
459 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
460 &group_segment_size),
461 "Extract group segment size");
462 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
463 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
464 &private_segment_size),
465 "Extract private segment size");
5326695a
AS
466}
467
468/* Allocate some device memory from the kernargs region.
469 The returned address will be 32-bit (with excess zeroed on 64-bit host),
470 and accessible via the same address on both host and target (via
471 __flat_scalar GCN address space). */
472
473static void *
e3d0ee4a 474device_malloc (size_t size, hsa_region_t region)
5326695a
AS
475{
476 void *result;
e3d0ee4a 477 XHSA (hsa_fns.hsa_memory_allocate_fn (region, size, &result),
5326695a
AS
478 "Allocate device memory");
479 return result;
480}
481
482/* These are the device pointers that will be transferred to the target.
483 The HSA Runtime points the kernargs register here.
484 They correspond to function signature:
485 int main (int argc, char *argv[], int *return_value)
486 The compiler expects this, for kernel functions, and will
487 automatically assign the exit value to *return_value. */
488struct kernargs
489{
490 /* Kernargs. */
491 int32_t argc;
492 int64_t argv;
493 int64_t out_ptr;
494 int64_t heap_ptr;
495
496 /* Output data. */
497 struct output
498 {
499 int return_value;
f9cacebb 500 unsigned int next_output;
5326695a
AS
501 struct printf_data
502 {
503 int written;
504 char msg[128];
505 int type;
506 union
507 {
508 int64_t ivalue;
509 double dvalue;
510 char text[128];
511 };
f9cacebb
AS
512 } queue[1024];
513 unsigned int consumed;
5326695a 514 } output_data;
5326695a
AS
515};
516
e3d0ee4a
AS
517struct heap
518{
519 int64_t size;
520 char data[0];
521} heap;
522
5326695a 523/* Print any console output from the kernel.
f9cacebb
AS
524 We print all entries from "consumed" to the next entry without a "written"
525 flag, or "next_output" is reached. The buffer is circular, but the
526 indices are absolute. It is assumed the kernel will stop writing data
527 if "next_output" wraps (becomes smaller than "consumed"). */
5326695a 528void
f9cacebb 529gomp_print_output (struct kernargs *kernargs, bool final)
5326695a 530{
f9cacebb
AS
531 unsigned int limit = (sizeof (kernargs->output_data.queue)
532 / sizeof (kernargs->output_data.queue[0]));
5326695a 533
f9cacebb
AS
534 unsigned int from = __atomic_load_n (&kernargs->output_data.consumed,
535 __ATOMIC_ACQUIRE);
536 unsigned int to = kernargs->output_data.next_output;
537
538 if (from > to)
539 {
540 /* Overflow. */
541 if (final)
542 printf ("GCN print buffer overflowed.\n");
543 return;
544 }
545
546 unsigned int i;
547 for (i = from; i < to; i++)
5326695a 548 {
f9cacebb 549 struct printf_data *data = &kernargs->output_data.queue[i%limit];
5326695a 550
f9cacebb 551 if (!data->written && !final)
5326695a
AS
552 break;
553
554 switch (data->type)
555 {
556 case 0:
557 printf ("%.128s%ld\n", data->msg, data->ivalue);
558 break;
559 case 1:
560 printf ("%.128s%f\n", data->msg, data->dvalue);
561 break;
562 case 2:
563 printf ("%.128s%.128s\n", data->msg, data->text);
564 break;
565 case 3:
566 printf ("%.128s%.128s", data->msg, data->text);
567 break;
f9cacebb
AS
568 default:
569 printf ("GCN print buffer error!\n");
570 break;
5326695a
AS
571 }
572
573 data->written = 0;
f9cacebb
AS
574 __atomic_store_n (&kernargs->output_data.consumed, i+1,
575 __ATOMIC_RELEASE);
5326695a 576 }
f9cacebb 577 fflush (stdout);
5326695a
AS
578}
579
580/* Execute an already-loaded kernel on the device. */
581
582static void
7039cebf 583run (uint64_t kernel, void *kernargs)
5326695a
AS
584{
585 /* A "signal" is used to launch and monitor the kernel. */
586 hsa_signal_t signal;
587 XHSA (hsa_fns.hsa_signal_create_fn (1, 0, NULL, &signal),
588 "Create signal");
589
590 /* Configure for a single-worker kernel. */
591 uint64_t index = hsa_fns.hsa_queue_load_write_index_relaxed_fn (queue);
592 const uint32_t queueMask = queue->size - 1;
593 hsa_kernel_dispatch_packet_t *dispatch_packet =
594 &(((hsa_kernel_dispatch_packet_t *) (queue->base_address))[index &
595 queueMask]);
596 dispatch_packet->setup |= 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
597 dispatch_packet->workgroup_size_x = (uint16_t) 1;
598 dispatch_packet->workgroup_size_y = (uint16_t) 64;
599 dispatch_packet->workgroup_size_z = (uint16_t) 1;
600 dispatch_packet->grid_size_x = 1;
601 dispatch_packet->grid_size_y = 64;
602 dispatch_packet->grid_size_z = 1;
603 dispatch_packet->completion_signal = signal;
604 dispatch_packet->kernel_object = kernel;
605 dispatch_packet->kernarg_address = (void *) kernargs;
606 dispatch_packet->private_segment_size = private_segment_size;
607 dispatch_packet->group_segment_size = group_segment_size;
608
609 uint16_t header = 0;
610 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
611 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
612 header |= HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
613
614 __atomic_store_n ((uint32_t *) dispatch_packet,
615 header | (dispatch_packet->setup << 16),
616 __ATOMIC_RELEASE);
617
618 if (debug)
619 fprintf (stderr, "Launch kernel\n");
620
621 hsa_fns.hsa_queue_store_write_index_relaxed_fn (queue, index + 1);
622 hsa_fns.hsa_signal_store_relaxed_fn (queue->doorbell_signal, index);
623 /* Kernel running ...... */
5326695a
AS
624 while (hsa_fns.hsa_signal_wait_relaxed_fn (signal, HSA_SIGNAL_CONDITION_LT,
625 1, 1000000,
626 HSA_WAIT_STATE_ACTIVE) != 0)
627 {
628 usleep (10000);
f9cacebb 629 gomp_print_output (kernargs, false);
5326695a
AS
630 }
631
f9cacebb 632 gomp_print_output (kernargs, true);
5326695a
AS
633
634 if (debug)
635 fprintf (stderr, "Kernel exited\n");
636
637 XHSA (hsa_fns.hsa_signal_destroy_fn (signal),
638 "Clean up signal");
639}
640
641int
642main (int argc, char *argv[])
643{
644 int kernel_arg = 0;
645 for (int i = 1; i < argc; i++)
646 {
647 if (!strcmp (argv[i], "--help"))
648 {
649 usage (argv[0]);
650 return 0;
651 }
652 else if (!strcmp (argv[i], "--version"))
653 {
654 version (argv[0]);
655 return 0;
656 }
657 else if (!strcmp (argv[i], "--debug"))
658 debug = true;
659 else if (argv[i][0] == '-')
660 {
661 usage (argv[0]);
662 return 1;
663 }
664 else
665 {
666 kernel_arg = i;
667 break;
668 }
669 }
670
671 if (!kernel_arg)
672 {
673 /* No kernel arguments were found. */
674 usage (argv[0]);
675 return 1;
676 }
677
678 /* The remaining arguments are for the GCN kernel. */
679 int kernel_argc = argc - kernel_arg;
680 char **kernel_argv = &argv[kernel_arg];
681
682 init_device ();
683 load_image (kernel_argv[0]);
684
685 /* Calculate size of function parameters + argv data. */
686 size_t args_size = 0;
687 for (int i = 0; i < kernel_argc; i++)
688 args_size += strlen (kernel_argv[i]) + 1;
689
690 /* Allocate device memory for both function parameters and the argv
691 data. */
e3d0ee4a
AS
692 struct kernargs *kernargs = device_malloc (sizeof (*kernargs),
693 kernargs_region);
5326695a
AS
694 struct argdata
695 {
696 int64_t argv_data[kernel_argc];
697 char strings[args_size];
e3d0ee4a
AS
698 } *args = device_malloc (sizeof (struct argdata), kernargs_region);
699
700 size_t heap_size = 10 * 1024 * 1024; /* 10MB. */
701 struct heap *heap = device_malloc (heap_size, heap_region);
702 XHSA (hsa_fns.hsa_memory_assign_agent_fn (heap, device,
703 HSA_ACCESS_PERMISSION_RW),
704 "Assign heap to device agent");
5326695a
AS
705
706 /* Write the data to the target. */
707 kernargs->argc = kernel_argc;
708 kernargs->argv = (int64_t) args->argv_data;
709 kernargs->out_ptr = (int64_t) &kernargs->output_data;
710 kernargs->output_data.return_value = 0xcafe0000; /* Default return value. */
711 kernargs->output_data.next_output = 0;
712 for (unsigned i = 0; i < (sizeof (kernargs->output_data.queue)
713 / sizeof (kernargs->output_data.queue[0])); i++)
714 kernargs->output_data.queue[i].written = 0;
f9cacebb 715 kernargs->output_data.consumed = 0;
5326695a
AS
716 int offset = 0;
717 for (int i = 0; i < kernel_argc; i++)
718 {
719 size_t arg_len = strlen (kernel_argv[i]) + 1;
720 args->argv_data[i] = (int64_t) &args->strings[offset];
721 memcpy (&args->strings[offset], kernel_argv[i], arg_len + 1);
722 offset += arg_len;
723 }
e3d0ee4a
AS
724 kernargs->heap_ptr = (int64_t) heap;
725 hsa_fns.hsa_memory_copy_fn (&heap->size, &heap_size, sizeof (heap_size));
5326695a 726
7039cebf
KCY
727 /* Run constructors on the GPU. */
728 run (init_array_kernel, kernargs);
729
5326695a 730 /* Run the kernel on the GPU. */
7039cebf 731 run (main_kernel, kernargs);
5326695a
AS
732 unsigned int return_value =
733 (unsigned int) kernargs->output_data.return_value;
734
7039cebf
KCY
735 /* Run destructors on the GPU. */
736 run (fini_array_kernel, kernargs);
737
5326695a
AS
738 unsigned int upper = (return_value & ~0xffff) >> 16;
739 if (upper == 0xcafe)
e8daba7e
AS
740 {
741 printf ("Kernel exit value was never set\n");
742 return_value = 0xff;
743 }
5326695a
AS
744 else if (upper == 0xffff)
745 ; /* Set by exit. */
746 else if (upper == 0)
747 ; /* Set by return from main. */
748 else
749 printf ("Possible kernel exit value corruption, 2 most significant bytes "
750 "aren't 0xffff, 0xcafe, or 0: 0x%x\n", return_value);
751
752 if (upper == 0xffff)
753 {
754 unsigned int signal = (return_value >> 8) & 0xff;
755 if (signal == SIGABRT)
756 printf ("Kernel aborted\n");
757 else if (signal != 0)
758 printf ("Kernel received unkown signal\n");
759 }
760
761 if (debug)
762 printf ("Kernel exit value: %d\n", return_value & 0xff);
763
764 /* Clean shut down. */
765 XHSA (hsa_fns.hsa_memory_free_fn (kernargs),
766 "Clean up device memory");
767 XHSA (hsa_fns.hsa_executable_destroy_fn (executable),
768 "Clean up GCN executable");
769 XHSA (hsa_fns.hsa_queue_destroy_fn (queue),
770 "Clean up device queue");
771 XHSA (hsa_fns.hsa_shut_down_fn (),
772 "Shut down run-time");
773
774 return return_value & 0xff;
775}