]> git.ipfire.org Git - thirdparty/gcc.git/blame - gcc/config/gcn/gcn-run.cc
Update copyright years.
[thirdparty/gcc.git] / gcc / config / gcn / gcn-run.cc
CommitLineData
5326695a
AS
1/* Run a stand-alone AMD GCN kernel.
2
3 Copyright 2017 Mentor Graphics Corporation
a945c346 4 Copyright (C) 2018-2024 Free Software Foundation, Inc.
5326695a
AS
5
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>. */
18
19/* This program will run a compiled stand-alone GCN kernel on a GPU.
20
21 The kernel entry point's signature must use a standard main signature:
22
23 int main(int argc, char **argv)
24*/
25
26#include <stdint.h>
27#include <stdbool.h>
28#include <stdlib.h>
29#include <malloc.h>
30#include <stdio.h>
31#include <string.h>
32#include <dlfcn.h>
33#include <unistd.h>
34#include <elf.h>
35#include <signal.h>
36
5326695a 37#include "hsa.h"
f6fff8a6 38#include "../../../libgomp/config/gcn/libgomp-gcn.h"
5326695a
AS
39
40#ifndef HSA_RUNTIME_LIB
f062c3f1 41#define HSA_RUNTIME_LIB "libhsa-runtime64.so.1"
5326695a
AS
42#endif
43
44#ifndef VERSION_STRING
45#define VERSION_STRING "(version unknown)"
46#endif
47
48bool debug = false;
49
50hsa_agent_t device = { 0 };
51hsa_queue_t *queue = NULL;
7039cebf
KCY
52uint64_t init_array_kernel = 0;
53uint64_t fini_array_kernel = 0;
54uint64_t main_kernel = 0;
5326695a
AS
55hsa_executable_t executable = { 0 };
56
57hsa_region_t kernargs_region = { 0 };
e3d0ee4a 58hsa_region_t heap_region = { 0 };
5326695a
AS
59uint32_t kernarg_segment_size = 0;
60uint32_t group_segment_size = 0;
61uint32_t private_segment_size = 0;
62
63static void
64usage (const char *progname)
65{
66 printf ("Usage: %s [options] kernel [kernel-args]\n\n"
67 "Options:\n"
68 " --help\n"
69 " --version\n"
70 " --debug\n", progname);
71}
72
73static void
74version (const char *progname)
75{
76 printf ("%s " VERSION_STRING "\n", progname);
77}
78
79/* As an HSA runtime is dlopened, following structure defines the necessary
80 function pointers.
81 Code adapted from libgomp. */
82
83struct hsa_runtime_fn_info
84{
85 /* HSA runtime. */
86 hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
87 const char **status_string);
88 hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
89 hsa_agent_info_t attribute,
90 void *value);
91 hsa_status_t (*hsa_init_fn) (void);
92 hsa_status_t (*hsa_iterate_agents_fn)
93 (hsa_status_t (*callback) (hsa_agent_t agent, void *data), void *data);
94 hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
95 hsa_region_info_t attribute,
96 void *value);
97 hsa_status_t (*hsa_queue_create_fn)
98 (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
99 void (*callback) (hsa_status_t status, hsa_queue_t *source, void *data),
100 void *data, uint32_t private_segment_size,
101 uint32_t group_segment_size, hsa_queue_t **queue);
102 hsa_status_t (*hsa_agent_iterate_regions_fn)
103 (hsa_agent_t agent,
104 hsa_status_t (*callback) (hsa_region_t region, void *data), void *data);
105 hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
106 hsa_status_t (*hsa_executable_create_fn)
107 (hsa_profile_t profile, hsa_executable_state_t executable_state,
108 const char *options, hsa_executable_t *executable);
109 hsa_status_t (*hsa_executable_global_variable_define_fn)
110 (hsa_executable_t executable, const char *variable_name, void *address);
111 hsa_status_t (*hsa_executable_load_code_object_fn)
112 (hsa_executable_t executable, hsa_agent_t agent,
113 hsa_code_object_t code_object, const char *options);
114 hsa_status_t (*hsa_executable_freeze_fn) (hsa_executable_t executable,
115 const char *options);
116 hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
117 uint32_t num_consumers,
118 const hsa_agent_t *consumers,
119 hsa_signal_t *signal);
120 hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
121 void **ptr);
e3d0ee4a
AS
122 hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent,
123 hsa_access_permission_t access);
5326695a
AS
124 hsa_status_t (*hsa_memory_copy_fn) (void *dst, const void *src,
125 size_t size);
126 hsa_status_t (*hsa_memory_free_fn) (void *ptr);
127 hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
128 hsa_status_t (*hsa_executable_get_symbol_fn)
129 (hsa_executable_t executable, const char *module_name,
130 const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
131 hsa_executable_symbol_t *symbol);
132 hsa_status_t (*hsa_executable_symbol_get_info_fn)
133 (hsa_executable_symbol_t executable_symbol,
134 hsa_executable_symbol_info_t attribute, void *value);
135 void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
136 hsa_signal_value_t value);
137 hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
138 (hsa_signal_t signal, hsa_signal_condition_t condition,
139 hsa_signal_value_t compare_value, uint64_t timeout_hint,
140 hsa_wait_state_t wait_state_hint);
141 hsa_signal_value_t (*hsa_signal_wait_relaxed_fn)
142 (hsa_signal_t signal, hsa_signal_condition_t condition,
143 hsa_signal_value_t compare_value, uint64_t timeout_hint,
144 hsa_wait_state_t wait_state_hint);
145 hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);
146 hsa_status_t (*hsa_code_object_deserialize_fn)
147 (void *serialized_code_object, size_t serialized_code_object_size,
148 const char *options, hsa_code_object_t *code_object);
149 uint64_t (*hsa_queue_load_write_index_relaxed_fn)
150 (const hsa_queue_t *queue);
151 void (*hsa_queue_store_write_index_relaxed_fn)
152 (const hsa_queue_t *queue, uint64_t value);
153 hsa_status_t (*hsa_shut_down_fn) ();
154};
155
156/* HSA runtime functions that are initialized in init_hsa_context.
157 Code adapted from libgomp. */
158
159static struct hsa_runtime_fn_info hsa_fns;
160
161#define DLSYM_FN(function) \
162 *(void**)(&hsa_fns.function##_fn) = dlsym (handle, #function); \
163 if (hsa_fns.function##_fn == NULL) \
164 goto fail;
165
166static void
167init_hsa_runtime_functions (void)
168{
169 void *handle = dlopen (HSA_RUNTIME_LIB, RTLD_LAZY);
170 if (handle == NULL)
171 {
172 fprintf (stderr,
173 "The HSA runtime is required to run GCN kernels on hardware.\n"
174 "%s: File not found or could not be opened\n",
175 HSA_RUNTIME_LIB);
176 exit (1);
177 }
178
179 DLSYM_FN (hsa_status_string)
180 DLSYM_FN (hsa_agent_get_info)
181 DLSYM_FN (hsa_init)
182 DLSYM_FN (hsa_iterate_agents)
183 DLSYM_FN (hsa_region_get_info)
184 DLSYM_FN (hsa_queue_create)
185 DLSYM_FN (hsa_agent_iterate_regions)
186 DLSYM_FN (hsa_executable_destroy)
187 DLSYM_FN (hsa_executable_create)
188 DLSYM_FN (hsa_executable_global_variable_define)
189 DLSYM_FN (hsa_executable_load_code_object)
190 DLSYM_FN (hsa_executable_freeze)
191 DLSYM_FN (hsa_signal_create)
192 DLSYM_FN (hsa_memory_allocate)
e3d0ee4a 193 DLSYM_FN (hsa_memory_assign_agent)
5326695a
AS
194 DLSYM_FN (hsa_memory_copy)
195 DLSYM_FN (hsa_memory_free)
196 DLSYM_FN (hsa_signal_destroy)
197 DLSYM_FN (hsa_executable_get_symbol)
198 DLSYM_FN (hsa_executable_symbol_get_info)
199 DLSYM_FN (hsa_signal_wait_acquire)
200 DLSYM_FN (hsa_signal_wait_relaxed)
201 DLSYM_FN (hsa_signal_store_relaxed)
202 DLSYM_FN (hsa_queue_destroy)
203 DLSYM_FN (hsa_code_object_deserialize)
204 DLSYM_FN (hsa_queue_load_write_index_relaxed)
205 DLSYM_FN (hsa_queue_store_write_index_relaxed)
206 DLSYM_FN (hsa_shut_down)
207
208 return;
209
210fail:
211 fprintf (stderr, "Failed to find HSA functions in " HSA_RUNTIME_LIB "\n");
212 exit (1);
213}
214
215#undef DLSYM_FN
216
217/* Report a fatal error STR together with the HSA error corresponding to
218 STATUS and terminate execution of the current process. */
219
220static void
221hsa_fatal (const char *str, hsa_status_t status)
222{
223 const char *hsa_error_msg;
224 hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
225 fprintf (stderr, "%s: FAILED\nHSA Runtime message: %s\n", str,
226 hsa_error_msg);
227 exit (1);
228}
229
230/* Helper macros to ensure we check the return values from the HSA Runtime.
231 These just keep the rest of the code a bit cleaner. */
232
233#define XHSA_CMP(FN, CMP, MSG) \
234 do { \
235 hsa_status_t status = (FN); \
236 if (!(CMP)) \
237 hsa_fatal ((MSG), status); \
238 else if (debug) \
239 fprintf (stderr, "%s: OK\n", (MSG)); \
240 } while (0)
241#define XHSA(FN, MSG) XHSA_CMP(FN, status == HSA_STATUS_SUCCESS, MSG)
242
243/* Callback of hsa_iterate_agents.
244 Called once for each available device, and returns "break" when a
245 suitable one has been found. */
246
247static hsa_status_t
248get_gpu_agent (hsa_agent_t agent, void *data __attribute__ ((unused)))
249{
250 hsa_device_type_t device_type;
251 XHSA (hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
252 &device_type),
253 "Get agent type");
254
255 /* Select only GPU devices. */
256 /* TODO: support selecting from multiple GPUs. */
257 if (HSA_DEVICE_TYPE_GPU == device_type)
258 {
259 device = agent;
260 return HSA_STATUS_INFO_BREAK;
261 }
262
263 /* The device was not suitable. */
264 return HSA_STATUS_SUCCESS;
265}
266
267/* Callback of hsa_iterate_regions.
268 Called once for each available memory region, and returns "break" when a
269 suitable one has been found. */
270
271static hsa_status_t
e3d0ee4a
AS
272get_memory_region (hsa_region_t region, hsa_region_t *retval,
273 hsa_region_global_flag_t kind)
5326695a
AS
274{
275 /* Reject non-global regions. */
276 hsa_region_segment_t segment;
277 hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT, &segment);
278 if (HSA_REGION_SEGMENT_GLOBAL != segment)
279 return HSA_STATUS_SUCCESS;
280
281 /* Find a region with the KERNARG flag set. */
282 hsa_region_global_flag_t flags;
283 hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
284 &flags);
e3d0ee4a 285 if (flags & kind)
5326695a 286 {
e3d0ee4a 287 *retval = region;
5326695a
AS
288 return HSA_STATUS_INFO_BREAK;
289 }
290
291 /* The region was not suitable. */
292 return HSA_STATUS_SUCCESS;
293}
294
e3d0ee4a
AS
295static hsa_status_t
296get_kernarg_region (hsa_region_t region, void *data __attribute__((unused)))
297{
298 return get_memory_region (region, &kernargs_region,
299 HSA_REGION_GLOBAL_FLAG_KERNARG);
300}
301
302static hsa_status_t
303get_heap_region (hsa_region_t region, void *data __attribute__((unused)))
304{
305 return get_memory_region (region, &heap_region,
306 HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED);
307}
308
5326695a
AS
309/* Initialize the HSA Runtime library and GPU device. */
310
311static void
312init_device ()
313{
314 /* Load the shared library and find the API functions. */
315 init_hsa_runtime_functions ();
316
317 /* Initialize the HSA Runtime. */
318 XHSA (hsa_fns.hsa_init_fn (),
319 "Initialize run-time");
320
321 /* Select a suitable device.
322 The call-back function, get_gpu_agent, does the selection. */
323 XHSA_CMP (hsa_fns.hsa_iterate_agents_fn (get_gpu_agent, NULL),
324 status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK,
325 "Find a device");
326
327 /* Initialize the queue used for launching kernels. */
328 uint32_t queue_size = 0;
329 XHSA (hsa_fns.hsa_agent_get_info_fn (device, HSA_AGENT_INFO_QUEUE_MAX_SIZE,
330 &queue_size),
331 "Find max queue size");
332 XHSA (hsa_fns.hsa_queue_create_fn (device, queue_size,
333 HSA_QUEUE_TYPE_SINGLE, NULL,
334 NULL, UINT32_MAX, UINT32_MAX, &queue),
335 "Set up a device queue");
336
337 /* Select a memory region for the kernel arguments.
338 The call-back function, get_kernarg_region, does the selection. */
339 XHSA_CMP (hsa_fns.hsa_agent_iterate_regions_fn (device, get_kernarg_region,
340 NULL),
341 status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK,
342 "Locate kernargs memory");
e3d0ee4a
AS
343
344 /* Select a memory region for the kernel heap.
345 The call-back function, get_heap_region, does the selection. */
346 XHSA_CMP (hsa_fns.hsa_agent_iterate_regions_fn (device, get_heap_region,
347 NULL),
348 status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK,
349 "Locate device memory");
5326695a
AS
350}
351
352
353/* Read a whole input file.
354 Code copied from mkoffload. */
355
356static char *
357read_file (const char *filename, size_t *plen)
358{
359 size_t alloc = 16384;
360 size_t base = 0;
361 char *buffer;
362
363 FILE *stream = fopen (filename, "rb");
364 if (!stream)
365 {
366 perror (filename);
367 exit (1);
368 }
369
370 if (!fseek (stream, 0, SEEK_END))
371 {
372 /* Get the file size. */
373 long s = ftell (stream);
374 if (s >= 0)
375 alloc = s + 100;
376 fseek (stream, 0, SEEK_SET);
377 }
378 buffer = malloc (alloc);
379
380 for (;;)
381 {
382 size_t n = fread (buffer + base, 1, alloc - base - 1, stream);
383
384 if (!n)
385 break;
386 base += n;
387 if (base + 1 == alloc)
388 {
389 alloc *= 2;
390 buffer = realloc (buffer, alloc);
391 }
392 }
393 buffer[base] = 0;
394 *plen = base;
395
396 fclose (stream);
397
398 return buffer;
399}
400
401/* Read a HSA Code Object (HSACO) from file, and load it into the device. */
402
403static void
404load_image (const char *filename)
405{
406 size_t image_size;
407 Elf64_Ehdr *image = (void *) read_file (filename, &image_size);
408
409 /* An "executable" consists of one or more code objects. */
410 XHSA (hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
411 HSA_EXECUTABLE_STATE_UNFROZEN, "",
412 &executable),
413 "Initialize GCN executable");
414
5326695a
AS
415 /* Add the HSACO to the executable. */
416 hsa_code_object_t co = { 0 };
417 XHSA (hsa_fns.hsa_code_object_deserialize_fn (image, image_size, NULL, &co),
418 "Deserialize GCN code object");
419 XHSA (hsa_fns.hsa_executable_load_code_object_fn (executable, device, co,
420 ""),
421 "Load GCN code object");
422
423 /* We're done modifying he executable. */
424 XHSA (hsa_fns.hsa_executable_freeze_fn (executable, ""),
425 "Freeze GCN executable");
426
7039cebf 427 /* Locate the "_init_array" function, and read the kernel's properties. */
5326695a 428 hsa_executable_symbol_t symbol;
f062c3f1
AS
429 XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL,
430 "_init_array.kd", device, 0,
431 &symbol),
7039cebf
KCY
432 "Find '_init_array' function");
433 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
f062c3f1
AS
434 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
435 &init_array_kernel),
7039cebf
KCY
436 "Extract '_init_array' kernel object kernel object");
437
438 /* Locate the "_fini_array" function, and read the kernel's properties. */
f062c3f1
AS
439 XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL,
440 "_fini_array.kd", device, 0,
441 &symbol),
7039cebf
KCY
442 "Find '_fini_array' function");
443 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
f062c3f1
AS
444 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
445 &fini_array_kernel),
7039cebf
KCY
446 "Extract '_fini_array' kernel object kernel object");
447
448 /* Locate the "main" function, and read the kernel's properties. */
f062c3f1 449 XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "main.kd",
5326695a
AS
450 device, 0, &symbol),
451 "Find 'main' function");
452 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
7039cebf
KCY
453 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &main_kernel),
454 "Extract 'main' kernel object");
5326695a
AS
455 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
456 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
457 &kernarg_segment_size),
458 "Extract kernarg segment size");
459 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
460 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
461 &group_segment_size),
462 "Extract group segment size");
463 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
464 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
465 &private_segment_size),
466 "Extract private segment size");
5326695a
AS
467}
468
469/* Allocate some device memory from the kernargs region.
470 The returned address will be 32-bit (with excess zeroed on 64-bit host),
471 and accessible via the same address on both host and target (via
472 __flat_scalar GCN address space). */
473
474static void *
e3d0ee4a 475device_malloc (size_t size, hsa_region_t region)
5326695a
AS
476{
477 void *result;
e3d0ee4a 478 XHSA (hsa_fns.hsa_memory_allocate_fn (region, size, &result),
5326695a
AS
479 "Allocate device memory");
480 return result;
481}
482
483/* These are the device pointers that will be transferred to the target.
484 The HSA Runtime points the kernargs register here.
485 They correspond to function signature:
486 int main (int argc, char *argv[], int *return_value)
487 The compiler expects this, for kernel functions, and will
488 automatically assign the exit value to *return_value. */
489struct kernargs
490{
f6fff8a6
AS
491 union {
492 struct {
493 int32_t argc;
494 int64_t argv;
495 } args;
496 struct kernargs_abi abi;
497 };
498 struct output output_data;
5326695a
AS
499};
500
501/* Print any console output from the kernel.
f9cacebb
AS
502 We print all entries from "consumed" to the next entry without a "written"
503 flag, or "next_output" is reached. The buffer is circular, but the
504 indices are absolute. It is assumed the kernel will stop writing data
505 if "next_output" wraps (becomes smaller than "consumed"). */
5326695a 506void
f9cacebb 507gomp_print_output (struct kernargs *kernargs, bool final)
5326695a 508{
f9cacebb
AS
509 unsigned int limit = (sizeof (kernargs->output_data.queue)
510 / sizeof (kernargs->output_data.queue[0]));
5326695a 511
f9cacebb
AS
512 unsigned int from = __atomic_load_n (&kernargs->output_data.consumed,
513 __ATOMIC_ACQUIRE);
514 unsigned int to = kernargs->output_data.next_output;
515
516 if (from > to)
517 {
518 /* Overflow. */
519 if (final)
520 printf ("GCN print buffer overflowed.\n");
521 return;
522 }
523
524 unsigned int i;
525 for (i = from; i < to; i++)
5326695a 526 {
f9cacebb 527 struct printf_data *data = &kernargs->output_data.queue[i%limit];
5326695a 528
f9cacebb 529 if (!data->written && !final)
5326695a
AS
530 break;
531
532 switch (data->type)
533 {
534 case 0:
535 printf ("%.128s%ld\n", data->msg, data->ivalue);
536 break;
537 case 1:
538 printf ("%.128s%f\n", data->msg, data->dvalue);
539 break;
540 case 2:
541 printf ("%.128s%.128s\n", data->msg, data->text);
542 break;
543 case 3:
544 printf ("%.128s%.128s", data->msg, data->text);
545 break;
f9cacebb
AS
546 default:
547 printf ("GCN print buffer error!\n");
548 break;
5326695a
AS
549 }
550
551 data->written = 0;
f9cacebb
AS
552 __atomic_store_n (&kernargs->output_data.consumed, i+1,
553 __ATOMIC_RELEASE);
5326695a 554 }
f9cacebb 555 fflush (stdout);
5326695a
AS
556}
557
558/* Execute an already-loaded kernel on the device. */
559
560static void
7039cebf 561run (uint64_t kernel, void *kernargs)
5326695a
AS
562{
563 /* A "signal" is used to launch and monitor the kernel. */
564 hsa_signal_t signal;
565 XHSA (hsa_fns.hsa_signal_create_fn (1, 0, NULL, &signal),
566 "Create signal");
567
568 /* Configure for a single-worker kernel. */
569 uint64_t index = hsa_fns.hsa_queue_load_write_index_relaxed_fn (queue);
570 const uint32_t queueMask = queue->size - 1;
571 hsa_kernel_dispatch_packet_t *dispatch_packet =
572 &(((hsa_kernel_dispatch_packet_t *) (queue->base_address))[index &
573 queueMask]);
574 dispatch_packet->setup |= 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
575 dispatch_packet->workgroup_size_x = (uint16_t) 1;
576 dispatch_packet->workgroup_size_y = (uint16_t) 64;
577 dispatch_packet->workgroup_size_z = (uint16_t) 1;
578 dispatch_packet->grid_size_x = 1;
579 dispatch_packet->grid_size_y = 64;
580 dispatch_packet->grid_size_z = 1;
581 dispatch_packet->completion_signal = signal;
582 dispatch_packet->kernel_object = kernel;
583 dispatch_packet->kernarg_address = (void *) kernargs;
584 dispatch_packet->private_segment_size = private_segment_size;
585 dispatch_packet->group_segment_size = group_segment_size;
586
587 uint16_t header = 0;
588 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
589 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
590 header |= HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
591
592 __atomic_store_n ((uint32_t *) dispatch_packet,
593 header | (dispatch_packet->setup << 16),
594 __ATOMIC_RELEASE);
595
596 if (debug)
597 fprintf (stderr, "Launch kernel\n");
598
599 hsa_fns.hsa_queue_store_write_index_relaxed_fn (queue, index + 1);
600 hsa_fns.hsa_signal_store_relaxed_fn (queue->doorbell_signal, index);
601 /* Kernel running ...... */
5326695a
AS
602 while (hsa_fns.hsa_signal_wait_relaxed_fn (signal, HSA_SIGNAL_CONDITION_LT,
603 1, 1000000,
604 HSA_WAIT_STATE_ACTIVE) != 0)
605 {
606 usleep (10000);
f9cacebb 607 gomp_print_output (kernargs, false);
5326695a
AS
608 }
609
f9cacebb 610 gomp_print_output (kernargs, true);
5326695a
AS
611
612 if (debug)
613 fprintf (stderr, "Kernel exited\n");
614
615 XHSA (hsa_fns.hsa_signal_destroy_fn (signal),
616 "Clean up signal");
617}
618
619int
620main (int argc, char *argv[])
621{
622 int kernel_arg = 0;
623 for (int i = 1; i < argc; i++)
624 {
625 if (!strcmp (argv[i], "--help"))
626 {
627 usage (argv[0]);
628 return 0;
629 }
630 else if (!strcmp (argv[i], "--version"))
631 {
632 version (argv[0]);
633 return 0;
634 }
635 else if (!strcmp (argv[i], "--debug"))
636 debug = true;
637 else if (argv[i][0] == '-')
638 {
639 usage (argv[0]);
640 return 1;
641 }
642 else
643 {
644 kernel_arg = i;
645 break;
646 }
647 }
648
649 if (!kernel_arg)
650 {
651 /* No kernel arguments were found. */
652 usage (argv[0]);
653 return 1;
654 }
655
656 /* The remaining arguments are for the GCN kernel. */
657 int kernel_argc = argc - kernel_arg;
658 char **kernel_argv = &argv[kernel_arg];
659
660 init_device ();
661 load_image (kernel_argv[0]);
662
663 /* Calculate size of function parameters + argv data. */
664 size_t args_size = 0;
665 for (int i = 0; i < kernel_argc; i++)
666 args_size += strlen (kernel_argv[i]) + 1;
667
f6fff8a6
AS
668 /* The device stack can be adjusted via an environment variable. */
669 char *envvar = getenv ("GCN_STACK_SIZE");
670 int stack_size = 1 * 1024 * 1024; /* 1MB default. */
671 if (envvar)
672 {
673 int val = atoi (envvar);
674 if (val)
675 stack_size = val;
676 }
677
5326695a
AS
678 /* Allocate device memory for both function parameters and the argv
679 data. */
e3d0ee4a
AS
680 struct kernargs *kernargs = device_malloc (sizeof (*kernargs),
681 kernargs_region);
5326695a
AS
682 struct argdata
683 {
684 int64_t argv_data[kernel_argc];
685 char strings[args_size];
e3d0ee4a
AS
686 } *args = device_malloc (sizeof (struct argdata), kernargs_region);
687
688 size_t heap_size = 10 * 1024 * 1024; /* 10MB. */
689 struct heap *heap = device_malloc (heap_size, heap_region);
690 XHSA (hsa_fns.hsa_memory_assign_agent_fn (heap, device,
691 HSA_ACCESS_PERMISSION_RW),
692 "Assign heap to device agent");
f6fff8a6 693 void *stack = device_malloc (stack_size, heap_region);
5326695a
AS
694
695 /* Write the data to the target. */
f6fff8a6
AS
696 kernargs->args.argc = kernel_argc;
697 kernargs->args.argv = (int64_t) args->argv_data;
698 kernargs->abi.out_ptr = (int64_t) &kernargs->output_data;
5326695a
AS
699 kernargs->output_data.return_value = 0xcafe0000; /* Default return value. */
700 kernargs->output_data.next_output = 0;
701 for (unsigned i = 0; i < (sizeof (kernargs->output_data.queue)
702 / sizeof (kernargs->output_data.queue[0])); i++)
703 kernargs->output_data.queue[i].written = 0;
f9cacebb 704 kernargs->output_data.consumed = 0;
5326695a
AS
705 int offset = 0;
706 for (int i = 0; i < kernel_argc; i++)
707 {
708 size_t arg_len = strlen (kernel_argv[i]) + 1;
709 args->argv_data[i] = (int64_t) &args->strings[offset];
710 memcpy (&args->strings[offset], kernel_argv[i], arg_len + 1);
711 offset += arg_len;
712 }
f6fff8a6 713 kernargs->abi.heap_ptr = (int64_t) heap;
e3d0ee4a 714 hsa_fns.hsa_memory_copy_fn (&heap->size, &heap_size, sizeof (heap_size));
f6fff8a6
AS
715 kernargs->abi.arena_ptr = 0;
716 kernargs->abi.stack_ptr = (int64_t) stack;
717 kernargs->abi.stack_size_per_thread = stack_size;
5326695a 718
7039cebf
KCY
719 /* Run constructors on the GPU. */
720 run (init_array_kernel, kernargs);
721
5326695a 722 /* Run the kernel on the GPU. */
7039cebf 723 run (main_kernel, kernargs);
5326695a
AS
724 unsigned int return_value =
725 (unsigned int) kernargs->output_data.return_value;
726
7039cebf
KCY
727 /* Run destructors on the GPU. */
728 run (fini_array_kernel, kernargs);
729
5326695a
AS
730 unsigned int upper = (return_value & ~0xffff) >> 16;
731 if (upper == 0xcafe)
e8daba7e
AS
732 {
733 printf ("Kernel exit value was never set\n");
734 return_value = 0xff;
735 }
5326695a
AS
736 else if (upper == 0xffff)
737 ; /* Set by exit. */
738 else if (upper == 0)
739 ; /* Set by return from main. */
740 else
741 printf ("Possible kernel exit value corruption, 2 most significant bytes "
742 "aren't 0xffff, 0xcafe, or 0: 0x%x\n", return_value);
743
744 if (upper == 0xffff)
745 {
746 unsigned int signal = (return_value >> 8) & 0xff;
747 if (signal == SIGABRT)
748 printf ("Kernel aborted\n");
749 else if (signal != 0)
750 printf ("Kernel received unkown signal\n");
751 }
752
753 if (debug)
754 printf ("Kernel exit value: %d\n", return_value & 0xff);
755
756 /* Clean shut down. */
757 XHSA (hsa_fns.hsa_memory_free_fn (kernargs),
758 "Clean up device memory");
759 XHSA (hsa_fns.hsa_executable_destroy_fn (executable),
760 "Clean up GCN executable");
761 XHSA (hsa_fns.hsa_queue_destroy_fn (queue),
762 "Clean up device queue");
763 XHSA (hsa_fns.hsa_shut_down_fn (),
764 "Shut down run-time");
765
766 return return_value & 0xff;
767}