]> git.ipfire.org Git - thirdparty/gcc.git/blob - gcc/config/gcn/gcn-run.c
Add support for constructors and destuctors on GCN
[thirdparty/gcc.git] / gcc / config / gcn / gcn-run.c
1 /* Run a stand-alone AMD GCN kernel.
2
3 Copyright 2017 Mentor Graphics Corporation
4 Copyright 2018-2019 Free Software Foundation, Inc.
5
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>. */
18
19 /* This program will run a compiled stand-alone GCN kernel on a GPU.
20
21 The kernel entry point's signature must use a standard main signature:
22
23 int main(int argc, char **argv)
24 */
25
26 #include <stdint.h>
27 #include <stdbool.h>
28 #include <stdlib.h>
29 #include <malloc.h>
30 #include <stdio.h>
31 #include <string.h>
32 #include <dlfcn.h>
33 #include <unistd.h>
34 #include <elf.h>
35 #include <signal.h>
36
37 /* These probably won't be in elf.h for a while. */
38 #ifndef R_AMDGPU_NONE
39 #define R_AMDGPU_NONE 0
40 #define R_AMDGPU_ABS32_LO 1 /* (S + A) & 0xFFFFFFFF */
41 #define R_AMDGPU_ABS32_HI 2 /* (S + A) >> 32 */
42 #define R_AMDGPU_ABS64 3 /* S + A */
43 #define R_AMDGPU_REL32 4 /* S + A - P */
44 #define R_AMDGPU_REL64 5 /* S + A - P */
45 #define R_AMDGPU_ABS32 6 /* S + A */
46 #define R_AMDGPU_GOTPCREL 7 /* G + GOT + A - P */
47 #define R_AMDGPU_GOTPCREL32_LO 8 /* (G + GOT + A - P) & 0xFFFFFFFF */
48 #define R_AMDGPU_GOTPCREL32_HI 9 /* (G + GOT + A - P) >> 32 */
49 #define R_AMDGPU_REL32_LO 10 /* (S + A - P) & 0xFFFFFFFF */
50 #define R_AMDGPU_REL32_HI 11 /* (S + A - P) >> 32 */
51 #define reserved 12
52 #define R_AMDGPU_RELATIVE64 13 /* B + A */
53 #endif
54
55 #include "hsa.h"
56
57 #ifndef HSA_RUNTIME_LIB
58 #define HSA_RUNTIME_LIB "libhsa-runtime64.so"
59 #endif
60
61 #ifndef VERSION_STRING
62 #define VERSION_STRING "(version unknown)"
63 #endif
64
65 bool debug = false;
66
67 hsa_agent_t device = { 0 };
68 hsa_queue_t *queue = NULL;
69 uint64_t init_array_kernel = 0;
70 uint64_t fini_array_kernel = 0;
71 uint64_t main_kernel = 0;
72 hsa_executable_t executable = { 0 };
73
74 hsa_region_t kernargs_region = { 0 };
75 uint32_t kernarg_segment_size = 0;
76 uint32_t group_segment_size = 0;
77 uint32_t private_segment_size = 0;
78
79 static void
80 usage (const char *progname)
81 {
82 printf ("Usage: %s [options] kernel [kernel-args]\n\n"
83 "Options:\n"
84 " --help\n"
85 " --version\n"
86 " --debug\n", progname);
87 }
88
89 static void
90 version (const char *progname)
91 {
92 printf ("%s " VERSION_STRING "\n", progname);
93 }
94
95 /* As an HSA runtime is dlopened, following structure defines the necessary
96 function pointers.
97 Code adapted from libgomp. */
98
99 struct hsa_runtime_fn_info
100 {
101 /* HSA runtime. */
102 hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
103 const char **status_string);
104 hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
105 hsa_agent_info_t attribute,
106 void *value);
107 hsa_status_t (*hsa_init_fn) (void);
108 hsa_status_t (*hsa_iterate_agents_fn)
109 (hsa_status_t (*callback) (hsa_agent_t agent, void *data), void *data);
110 hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
111 hsa_region_info_t attribute,
112 void *value);
113 hsa_status_t (*hsa_queue_create_fn)
114 (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
115 void (*callback) (hsa_status_t status, hsa_queue_t *source, void *data),
116 void *data, uint32_t private_segment_size,
117 uint32_t group_segment_size, hsa_queue_t **queue);
118 hsa_status_t (*hsa_agent_iterate_regions_fn)
119 (hsa_agent_t agent,
120 hsa_status_t (*callback) (hsa_region_t region, void *data), void *data);
121 hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
122 hsa_status_t (*hsa_executable_create_fn)
123 (hsa_profile_t profile, hsa_executable_state_t executable_state,
124 const char *options, hsa_executable_t *executable);
125 hsa_status_t (*hsa_executable_global_variable_define_fn)
126 (hsa_executable_t executable, const char *variable_name, void *address);
127 hsa_status_t (*hsa_executable_load_code_object_fn)
128 (hsa_executable_t executable, hsa_agent_t agent,
129 hsa_code_object_t code_object, const char *options);
130 hsa_status_t (*hsa_executable_freeze_fn) (hsa_executable_t executable,
131 const char *options);
132 hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
133 uint32_t num_consumers,
134 const hsa_agent_t *consumers,
135 hsa_signal_t *signal);
136 hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
137 void **ptr);
138 hsa_status_t (*hsa_memory_copy_fn) (void *dst, const void *src,
139 size_t size);
140 hsa_status_t (*hsa_memory_free_fn) (void *ptr);
141 hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
142 hsa_status_t (*hsa_executable_get_symbol_fn)
143 (hsa_executable_t executable, const char *module_name,
144 const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
145 hsa_executable_symbol_t *symbol);
146 hsa_status_t (*hsa_executable_symbol_get_info_fn)
147 (hsa_executable_symbol_t executable_symbol,
148 hsa_executable_symbol_info_t attribute, void *value);
149 void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
150 hsa_signal_value_t value);
151 hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
152 (hsa_signal_t signal, hsa_signal_condition_t condition,
153 hsa_signal_value_t compare_value, uint64_t timeout_hint,
154 hsa_wait_state_t wait_state_hint);
155 hsa_signal_value_t (*hsa_signal_wait_relaxed_fn)
156 (hsa_signal_t signal, hsa_signal_condition_t condition,
157 hsa_signal_value_t compare_value, uint64_t timeout_hint,
158 hsa_wait_state_t wait_state_hint);
159 hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);
160 hsa_status_t (*hsa_code_object_deserialize_fn)
161 (void *serialized_code_object, size_t serialized_code_object_size,
162 const char *options, hsa_code_object_t *code_object);
163 uint64_t (*hsa_queue_load_write_index_relaxed_fn)
164 (const hsa_queue_t *queue);
165 void (*hsa_queue_store_write_index_relaxed_fn)
166 (const hsa_queue_t *queue, uint64_t value);
167 hsa_status_t (*hsa_shut_down_fn) ();
168 };
169
170 /* HSA runtime functions that are initialized in init_hsa_context.
171 Code adapted from libgomp. */
172
173 static struct hsa_runtime_fn_info hsa_fns;
174
175 #define DLSYM_FN(function) \
176 *(void**)(&hsa_fns.function##_fn) = dlsym (handle, #function); \
177 if (hsa_fns.function##_fn == NULL) \
178 goto fail;
179
180 static void
181 init_hsa_runtime_functions (void)
182 {
183 void *handle = dlopen (HSA_RUNTIME_LIB, RTLD_LAZY);
184 if (handle == NULL)
185 {
186 fprintf (stderr,
187 "The HSA runtime is required to run GCN kernels on hardware.\n"
188 "%s: File not found or could not be opened\n",
189 HSA_RUNTIME_LIB);
190 exit (1);
191 }
192
193 DLSYM_FN (hsa_status_string)
194 DLSYM_FN (hsa_agent_get_info)
195 DLSYM_FN (hsa_init)
196 DLSYM_FN (hsa_iterate_agents)
197 DLSYM_FN (hsa_region_get_info)
198 DLSYM_FN (hsa_queue_create)
199 DLSYM_FN (hsa_agent_iterate_regions)
200 DLSYM_FN (hsa_executable_destroy)
201 DLSYM_FN (hsa_executable_create)
202 DLSYM_FN (hsa_executable_global_variable_define)
203 DLSYM_FN (hsa_executable_load_code_object)
204 DLSYM_FN (hsa_executable_freeze)
205 DLSYM_FN (hsa_signal_create)
206 DLSYM_FN (hsa_memory_allocate)
207 DLSYM_FN (hsa_memory_copy)
208 DLSYM_FN (hsa_memory_free)
209 DLSYM_FN (hsa_signal_destroy)
210 DLSYM_FN (hsa_executable_get_symbol)
211 DLSYM_FN (hsa_executable_symbol_get_info)
212 DLSYM_FN (hsa_signal_wait_acquire)
213 DLSYM_FN (hsa_signal_wait_relaxed)
214 DLSYM_FN (hsa_signal_store_relaxed)
215 DLSYM_FN (hsa_queue_destroy)
216 DLSYM_FN (hsa_code_object_deserialize)
217 DLSYM_FN (hsa_queue_load_write_index_relaxed)
218 DLSYM_FN (hsa_queue_store_write_index_relaxed)
219 DLSYM_FN (hsa_shut_down)
220
221 return;
222
223 fail:
224 fprintf (stderr, "Failed to find HSA functions in " HSA_RUNTIME_LIB "\n");
225 exit (1);
226 }
227
228 #undef DLSYM_FN
229
230 /* Report a fatal error STR together with the HSA error corresponding to
231 STATUS and terminate execution of the current process. */
232
233 static void
234 hsa_fatal (const char *str, hsa_status_t status)
235 {
236 const char *hsa_error_msg;
237 hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
238 fprintf (stderr, "%s: FAILED\nHSA Runtime message: %s\n", str,
239 hsa_error_msg);
240 exit (1);
241 }
242
243 /* Helper macros to ensure we check the return values from the HSA Runtime.
244 These just keep the rest of the code a bit cleaner. */
245
246 #define XHSA_CMP(FN, CMP, MSG) \
247 do { \
248 hsa_status_t status = (FN); \
249 if (!(CMP)) \
250 hsa_fatal ((MSG), status); \
251 else if (debug) \
252 fprintf (stderr, "%s: OK\n", (MSG)); \
253 } while (0)
254 #define XHSA(FN, MSG) XHSA_CMP(FN, status == HSA_STATUS_SUCCESS, MSG)
255
256 /* Callback of hsa_iterate_agents.
257 Called once for each available device, and returns "break" when a
258 suitable one has been found. */
259
260 static hsa_status_t
261 get_gpu_agent (hsa_agent_t agent, void *data __attribute__ ((unused)))
262 {
263 hsa_device_type_t device_type;
264 XHSA (hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
265 &device_type),
266 "Get agent type");
267
268 /* Select only GPU devices. */
269 /* TODO: support selecting from multiple GPUs. */
270 if (HSA_DEVICE_TYPE_GPU == device_type)
271 {
272 device = agent;
273 return HSA_STATUS_INFO_BREAK;
274 }
275
276 /* The device was not suitable. */
277 return HSA_STATUS_SUCCESS;
278 }
279
280 /* Callback of hsa_iterate_regions.
281 Called once for each available memory region, and returns "break" when a
282 suitable one has been found. */
283
284 static hsa_status_t
285 get_kernarg_region (hsa_region_t region, void *data __attribute__ ((unused)))
286 {
287 /* Reject non-global regions. */
288 hsa_region_segment_t segment;
289 hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT, &segment);
290 if (HSA_REGION_SEGMENT_GLOBAL != segment)
291 return HSA_STATUS_SUCCESS;
292
293 /* Find a region with the KERNARG flag set. */
294 hsa_region_global_flag_t flags;
295 hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
296 &flags);
297 if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
298 {
299 kernargs_region = region;
300 return HSA_STATUS_INFO_BREAK;
301 }
302
303 /* The region was not suitable. */
304 return HSA_STATUS_SUCCESS;
305 }
306
307 /* Initialize the HSA Runtime library and GPU device. */
308
309 static void
310 init_device ()
311 {
312 /* Load the shared library and find the API functions. */
313 init_hsa_runtime_functions ();
314
315 /* Initialize the HSA Runtime. */
316 XHSA (hsa_fns.hsa_init_fn (),
317 "Initialize run-time");
318
319 /* Select a suitable device.
320 The call-back function, get_gpu_agent, does the selection. */
321 XHSA_CMP (hsa_fns.hsa_iterate_agents_fn (get_gpu_agent, NULL),
322 status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK,
323 "Find a device");
324
325 /* Initialize the queue used for launching kernels. */
326 uint32_t queue_size = 0;
327 XHSA (hsa_fns.hsa_agent_get_info_fn (device, HSA_AGENT_INFO_QUEUE_MAX_SIZE,
328 &queue_size),
329 "Find max queue size");
330 XHSA (hsa_fns.hsa_queue_create_fn (device, queue_size,
331 HSA_QUEUE_TYPE_SINGLE, NULL,
332 NULL, UINT32_MAX, UINT32_MAX, &queue),
333 "Set up a device queue");
334
335 /* Select a memory region for the kernel arguments.
336 The call-back function, get_kernarg_region, does the selection. */
337 XHSA_CMP (hsa_fns.hsa_agent_iterate_regions_fn (device, get_kernarg_region,
338 NULL),
339 status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK,
340 "Locate kernargs memory");
341 }
342
343
344 /* Read a whole input file.
345 Code copied from mkoffload. */
346
347 static char *
348 read_file (const char *filename, size_t *plen)
349 {
350 size_t alloc = 16384;
351 size_t base = 0;
352 char *buffer;
353
354 FILE *stream = fopen (filename, "rb");
355 if (!stream)
356 {
357 perror (filename);
358 exit (1);
359 }
360
361 if (!fseek (stream, 0, SEEK_END))
362 {
363 /* Get the file size. */
364 long s = ftell (stream);
365 if (s >= 0)
366 alloc = s + 100;
367 fseek (stream, 0, SEEK_SET);
368 }
369 buffer = malloc (alloc);
370
371 for (;;)
372 {
373 size_t n = fread (buffer + base, 1, alloc - base - 1, stream);
374
375 if (!n)
376 break;
377 base += n;
378 if (base + 1 == alloc)
379 {
380 alloc *= 2;
381 buffer = realloc (buffer, alloc);
382 }
383 }
384 buffer[base] = 0;
385 *plen = base;
386
387 fclose (stream);
388
389 return buffer;
390 }
391
392 /* Read a HSA Code Object (HSACO) from file, and load it into the device. */
393
394 static void
395 load_image (const char *filename)
396 {
397 size_t image_size;
398 Elf64_Ehdr *image = (void *) read_file (filename, &image_size);
399
400 /* An "executable" consists of one or more code objects. */
401 XHSA (hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
402 HSA_EXECUTABLE_STATE_UNFROZEN, "",
403 &executable),
404 "Initialize GCN executable");
405
406 /* Hide relocations from the HSA runtime loader.
407 Keep a copy of the unmodified section headers to use later. */
408 Elf64_Shdr *image_sections =
409 (Elf64_Shdr *) ((char *) image + image->e_shoff);
410 Elf64_Shdr *sections = malloc (sizeof (Elf64_Shdr) * image->e_shnum);
411 memcpy (sections, image_sections, sizeof (Elf64_Shdr) * image->e_shnum);
412 for (int i = image->e_shnum - 1; i >= 0; i--)
413 {
414 if (image_sections[i].sh_type == SHT_RELA
415 || image_sections[i].sh_type == SHT_REL)
416 /* Change section type to something harmless. */
417 image_sections[i].sh_type = SHT_NOTE;
418 }
419
420 /* Add the HSACO to the executable. */
421 hsa_code_object_t co = { 0 };
422 XHSA (hsa_fns.hsa_code_object_deserialize_fn (image, image_size, NULL, &co),
423 "Deserialize GCN code object");
424 XHSA (hsa_fns.hsa_executable_load_code_object_fn (executable, device, co,
425 ""),
426 "Load GCN code object");
427
428 /* We're done modifying he executable. */
429 XHSA (hsa_fns.hsa_executable_freeze_fn (executable, ""),
430 "Freeze GCN executable");
431
432 /* Locate the "_init_array" function, and read the kernel's properties. */
433 hsa_executable_symbol_t symbol;
434 XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "_init_array",
435 device, 0, &symbol),
436 "Find '_init_array' function");
437 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
438 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &init_array_kernel),
439 "Extract '_init_array' kernel object kernel object");
440
441 /* Locate the "_fini_array" function, and read the kernel's properties. */
442 XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "_fini_array",
443 device, 0, &symbol),
444 "Find '_fini_array' function");
445 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
446 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &fini_array_kernel),
447 "Extract '_fini_array' kernel object kernel object");
448
449 /* Locate the "main" function, and read the kernel's properties. */
450 XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "main",
451 device, 0, &symbol),
452 "Find 'main' function");
453 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
454 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &main_kernel),
455 "Extract 'main' kernel object");
456 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
457 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
458 &kernarg_segment_size),
459 "Extract kernarg segment size");
460 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
461 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
462 &group_segment_size),
463 "Extract group segment size");
464 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
465 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
466 &private_segment_size),
467 "Extract private segment size");
468
469 /* Find main function in ELF, and calculate actual load offset. */
470 Elf64_Addr load_offset;
471 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
472 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
473 &load_offset),
474 "Extract 'main' symbol address");
475 for (int i = 0; i < image->e_shnum; i++)
476 if (sections[i].sh_type == SHT_SYMTAB)
477 {
478 Elf64_Shdr *strtab = &sections[sections[i].sh_link];
479 char *strings = (char *) image + strtab->sh_offset;
480
481 for (size_t offset = 0;
482 offset < sections[i].sh_size;
483 offset += sections[i].sh_entsize)
484 {
485 Elf64_Sym *sym = (Elf64_Sym *) ((char *) image
486 + sections[i].sh_offset + offset);
487 if (strcmp ("main", strings + sym->st_name) == 0)
488 {
489 load_offset -= sym->st_value;
490 goto found_main;
491 }
492 }
493 }
494 /* We only get here when main was not found.
495 This should never happen. */
496 fprintf (stderr, "Error: main function not found.\n");
497 abort ();
498 found_main:;
499
500 /* Find dynamic symbol table. */
501 Elf64_Shdr *dynsym = NULL;
502 for (int i = 0; i < image->e_shnum; i++)
503 if (sections[i].sh_type == SHT_DYNSYM)
504 {
505 dynsym = &sections[i];
506 break;
507 }
508
509 /* Fix up relocations. */
510 for (int i = 0; i < image->e_shnum; i++)
511 {
512 if (sections[i].sh_type == SHT_RELA)
513 for (size_t offset = 0;
514 offset < sections[i].sh_size;
515 offset += sections[i].sh_entsize)
516 {
517 Elf64_Rela *reloc = (Elf64_Rela *) ((char *) image
518 + sections[i].sh_offset
519 + offset);
520 Elf64_Sym *sym =
521 (dynsym
522 ? (Elf64_Sym *) ((char *) image
523 + dynsym->sh_offset
524 + (dynsym->sh_entsize
525 * ELF64_R_SYM (reloc->r_info))) : NULL);
526
527 int64_t S = (sym ? sym->st_value : 0);
528 int64_t P = reloc->r_offset + load_offset;
529 int64_t A = reloc->r_addend;
530 int64_t B = load_offset;
531 int64_t V, size;
532 switch (ELF64_R_TYPE (reloc->r_info))
533 {
534 case R_AMDGPU_ABS32_LO:
535 V = (S + A) & 0xFFFFFFFF;
536 size = 4;
537 break;
538 case R_AMDGPU_ABS32_HI:
539 V = (S + A) >> 32;
540 size = 4;
541 break;
542 case R_AMDGPU_ABS64:
543 V = S + A;
544 size = 8;
545 break;
546 case R_AMDGPU_REL32:
547 V = S + A - P;
548 size = 4;
549 break;
550 case R_AMDGPU_REL64:
551 /* FIXME
552 LLD seems to emit REL64 where the the assembler has ABS64.
553 This is clearly wrong because it's not what the compiler
554 is expecting. Let's assume, for now, that it's a bug.
555 In any case, GCN kernels are always self contained and
556 therefore relative relocations will have been resolved
557 already, so this should be a safe workaround. */
558 V = S + A /* - P */ ;
559 size = 8;
560 break;
561 case R_AMDGPU_ABS32:
562 V = S + A;
563 size = 4;
564 break;
565 /* TODO R_AMDGPU_GOTPCREL */
566 /* TODO R_AMDGPU_GOTPCREL32_LO */
567 /* TODO R_AMDGPU_GOTPCREL32_HI */
568 case R_AMDGPU_REL32_LO:
569 V = (S + A - P) & 0xFFFFFFFF;
570 size = 4;
571 break;
572 case R_AMDGPU_REL32_HI:
573 V = (S + A - P) >> 32;
574 size = 4;
575 break;
576 case R_AMDGPU_RELATIVE64:
577 V = B + A;
578 size = 8;
579 break;
580 default:
581 fprintf (stderr, "Error: unsupported relocation type.\n");
582 exit (1);
583 }
584 XHSA (hsa_fns.hsa_memory_copy_fn ((void *) P, &V, size),
585 "Fix up relocation");
586 }
587 }
588 }
589
590 /* Allocate some device memory from the kernargs region.
591 The returned address will be 32-bit (with excess zeroed on 64-bit host),
592 and accessible via the same address on both host and target (via
593 __flat_scalar GCN address space). */
594
595 static void *
596 device_malloc (size_t size)
597 {
598 void *result;
599 XHSA (hsa_fns.hsa_memory_allocate_fn (kernargs_region, size, &result),
600 "Allocate device memory");
601 return result;
602 }
603
604 /* These are the device pointers that will be transferred to the target.
605 The HSA Runtime points the kernargs register here.
606 They correspond to function signature:
607 int main (int argc, char *argv[], int *return_value)
608 The compiler expects this, for kernel functions, and will
609 automatically assign the exit value to *return_value. */
610 struct kernargs
611 {
612 /* Kernargs. */
613 int32_t argc;
614 int64_t argv;
615 int64_t out_ptr;
616 int64_t heap_ptr;
617
618 /* Output data. */
619 struct output
620 {
621 int return_value;
622 unsigned int next_output;
623 struct printf_data
624 {
625 int written;
626 char msg[128];
627 int type;
628 union
629 {
630 int64_t ivalue;
631 double dvalue;
632 char text[128];
633 };
634 } queue[1024];
635 unsigned int consumed;
636 } output_data;
637
638 struct heap
639 {
640 int64_t size;
641 char data[0];
642 } heap;
643 };
644
645 /* Print any console output from the kernel.
646 We print all entries from "consumed" to the next entry without a "written"
647 flag, or "next_output" is reached. The buffer is circular, but the
648 indices are absolute. It is assumed the kernel will stop writing data
649 if "next_output" wraps (becomes smaller than "consumed"). */
650 void
651 gomp_print_output (struct kernargs *kernargs, bool final)
652 {
653 unsigned int limit = (sizeof (kernargs->output_data.queue)
654 / sizeof (kernargs->output_data.queue[0]));
655
656 unsigned int from = __atomic_load_n (&kernargs->output_data.consumed,
657 __ATOMIC_ACQUIRE);
658 unsigned int to = kernargs->output_data.next_output;
659
660 if (from > to)
661 {
662 /* Overflow. */
663 if (final)
664 printf ("GCN print buffer overflowed.\n");
665 return;
666 }
667
668 unsigned int i;
669 for (i = from; i < to; i++)
670 {
671 struct printf_data *data = &kernargs->output_data.queue[i%limit];
672
673 if (!data->written && !final)
674 break;
675
676 switch (data->type)
677 {
678 case 0:
679 printf ("%.128s%ld\n", data->msg, data->ivalue);
680 break;
681 case 1:
682 printf ("%.128s%f\n", data->msg, data->dvalue);
683 break;
684 case 2:
685 printf ("%.128s%.128s\n", data->msg, data->text);
686 break;
687 case 3:
688 printf ("%.128s%.128s", data->msg, data->text);
689 break;
690 default:
691 printf ("GCN print buffer error!\n");
692 break;
693 }
694
695 data->written = 0;
696 __atomic_store_n (&kernargs->output_data.consumed, i+1,
697 __ATOMIC_RELEASE);
698 }
699 fflush (stdout);
700 }
701
702 /* Execute an already-loaded kernel on the device. */
703
704 static void
705 run (uint64_t kernel, void *kernargs)
706 {
707 /* A "signal" is used to launch and monitor the kernel. */
708 hsa_signal_t signal;
709 XHSA (hsa_fns.hsa_signal_create_fn (1, 0, NULL, &signal),
710 "Create signal");
711
712 /* Configure for a single-worker kernel. */
713 uint64_t index = hsa_fns.hsa_queue_load_write_index_relaxed_fn (queue);
714 const uint32_t queueMask = queue->size - 1;
715 hsa_kernel_dispatch_packet_t *dispatch_packet =
716 &(((hsa_kernel_dispatch_packet_t *) (queue->base_address))[index &
717 queueMask]);
718 dispatch_packet->setup |= 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
719 dispatch_packet->workgroup_size_x = (uint16_t) 1;
720 dispatch_packet->workgroup_size_y = (uint16_t) 64;
721 dispatch_packet->workgroup_size_z = (uint16_t) 1;
722 dispatch_packet->grid_size_x = 1;
723 dispatch_packet->grid_size_y = 64;
724 dispatch_packet->grid_size_z = 1;
725 dispatch_packet->completion_signal = signal;
726 dispatch_packet->kernel_object = kernel;
727 dispatch_packet->kernarg_address = (void *) kernargs;
728 dispatch_packet->private_segment_size = private_segment_size;
729 dispatch_packet->group_segment_size = group_segment_size;
730
731 uint16_t header = 0;
732 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
733 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
734 header |= HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
735
736 __atomic_store_n ((uint32_t *) dispatch_packet,
737 header | (dispatch_packet->setup << 16),
738 __ATOMIC_RELEASE);
739
740 if (debug)
741 fprintf (stderr, "Launch kernel\n");
742
743 hsa_fns.hsa_queue_store_write_index_relaxed_fn (queue, index + 1);
744 hsa_fns.hsa_signal_store_relaxed_fn (queue->doorbell_signal, index);
745 /* Kernel running ...... */
746 while (hsa_fns.hsa_signal_wait_relaxed_fn (signal, HSA_SIGNAL_CONDITION_LT,
747 1, 1000000,
748 HSA_WAIT_STATE_ACTIVE) != 0)
749 {
750 usleep (10000);
751 gomp_print_output (kernargs, false);
752 }
753
754 gomp_print_output (kernargs, true);
755
756 if (debug)
757 fprintf (stderr, "Kernel exited\n");
758
759 XHSA (hsa_fns.hsa_signal_destroy_fn (signal),
760 "Clean up signal");
761 }
762
763 int
764 main (int argc, char *argv[])
765 {
766 int kernel_arg = 0;
767 for (int i = 1; i < argc; i++)
768 {
769 if (!strcmp (argv[i], "--help"))
770 {
771 usage (argv[0]);
772 return 0;
773 }
774 else if (!strcmp (argv[i], "--version"))
775 {
776 version (argv[0]);
777 return 0;
778 }
779 else if (!strcmp (argv[i], "--debug"))
780 debug = true;
781 else if (argv[i][0] == '-')
782 {
783 usage (argv[0]);
784 return 1;
785 }
786 else
787 {
788 kernel_arg = i;
789 break;
790 }
791 }
792
793 if (!kernel_arg)
794 {
795 /* No kernel arguments were found. */
796 usage (argv[0]);
797 return 1;
798 }
799
800 /* The remaining arguments are for the GCN kernel. */
801 int kernel_argc = argc - kernel_arg;
802 char **kernel_argv = &argv[kernel_arg];
803
804 init_device ();
805 load_image (kernel_argv[0]);
806
807 /* Calculate size of function parameters + argv data. */
808 size_t args_size = 0;
809 for (int i = 0; i < kernel_argc; i++)
810 args_size += strlen (kernel_argv[i]) + 1;
811
812 /* Allocate device memory for both function parameters and the argv
813 data. */
814 size_t heap_size = 10 * 1024 * 1024; /* 10MB. */
815 struct kernargs *kernargs = device_malloc (sizeof (*kernargs) + heap_size);
816 struct argdata
817 {
818 int64_t argv_data[kernel_argc];
819 char strings[args_size];
820 } *args = device_malloc (sizeof (struct argdata));
821
822 /* Write the data to the target. */
823 kernargs->argc = kernel_argc;
824 kernargs->argv = (int64_t) args->argv_data;
825 kernargs->out_ptr = (int64_t) &kernargs->output_data;
826 kernargs->output_data.return_value = 0xcafe0000; /* Default return value. */
827 kernargs->output_data.next_output = 0;
828 for (unsigned i = 0; i < (sizeof (kernargs->output_data.queue)
829 / sizeof (kernargs->output_data.queue[0])); i++)
830 kernargs->output_data.queue[i].written = 0;
831 kernargs->output_data.consumed = 0;
832 int offset = 0;
833 for (int i = 0; i < kernel_argc; i++)
834 {
835 size_t arg_len = strlen (kernel_argv[i]) + 1;
836 args->argv_data[i] = (int64_t) &args->strings[offset];
837 memcpy (&args->strings[offset], kernel_argv[i], arg_len + 1);
838 offset += arg_len;
839 }
840 kernargs->heap_ptr = (int64_t) &kernargs->heap;
841 kernargs->heap.size = heap_size;
842
843 /* Run constructors on the GPU. */
844 run (init_array_kernel, kernargs);
845
846 /* Run the kernel on the GPU. */
847 run (main_kernel, kernargs);
848 unsigned int return_value =
849 (unsigned int) kernargs->output_data.return_value;
850
851 /* Run destructors on the GPU. */
852 run (fini_array_kernel, kernargs);
853
854 unsigned int upper = (return_value & ~0xffff) >> 16;
855 if (upper == 0xcafe)
856 printf ("Kernel exit value was never set\n");
857 else if (upper == 0xffff)
858 ; /* Set by exit. */
859 else if (upper == 0)
860 ; /* Set by return from main. */
861 else
862 printf ("Possible kernel exit value corruption, 2 most significant bytes "
863 "aren't 0xffff, 0xcafe, or 0: 0x%x\n", return_value);
864
865 if (upper == 0xffff)
866 {
867 unsigned int signal = (return_value >> 8) & 0xff;
868 if (signal == SIGABRT)
869 printf ("Kernel aborted\n");
870 else if (signal != 0)
871 printf ("Kernel received unkown signal\n");
872 }
873
874 if (debug)
875 printf ("Kernel exit value: %d\n", return_value & 0xff);
876
877 /* Clean shut down. */
878 XHSA (hsa_fns.hsa_memory_free_fn (kernargs),
879 "Clean up device memory");
880 XHSA (hsa_fns.hsa_executable_destroy_fn (executable),
881 "Clean up GCN executable");
882 XHSA (hsa_fns.hsa_queue_destroy_fn (queue),
883 "Clean up device queue");
884 XHSA (hsa_fns.hsa_shut_down_fn (),
885 "Shut down run-time");
886
887 return return_value & 0xff;
888 }