]> git.ipfire.org Git - thirdparty/gcc.git/blob - gcc/config/gcn/gcn-run.c
Implement circular print buffer.
[thirdparty/gcc.git] / gcc / config / gcn / gcn-run.c
1 /* Run a stand-alone AMD GCN kernel.
2
3 Copyright 2017 Mentor Graphics Corporation
4 Copyright 2018-2019 Free Software Foundation, Inc.
5
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>. */
18
19 /* This program will run a compiled stand-alone GCN kernel on a GPU.
20
21 The kernel entry point's signature must use a standard main signature:
22
23 int main(int argc, char **argv)
24 */
25
26 #include <stdint.h>
27 #include <stdbool.h>
28 #include <stdlib.h>
29 #include <malloc.h>
30 #include <stdio.h>
31 #include <string.h>
32 #include <dlfcn.h>
33 #include <unistd.h>
34 #include <elf.h>
35 #include <signal.h>
36
37 /* These probably won't be in elf.h for a while. */
38 #ifndef R_AMDGPU_NONE
39 #define R_AMDGPU_NONE 0
40 #define R_AMDGPU_ABS32_LO 1 /* (S + A) & 0xFFFFFFFF */
41 #define R_AMDGPU_ABS32_HI 2 /* (S + A) >> 32 */
42 #define R_AMDGPU_ABS64 3 /* S + A */
43 #define R_AMDGPU_REL32 4 /* S + A - P */
44 #define R_AMDGPU_REL64 5 /* S + A - P */
45 #define R_AMDGPU_ABS32 6 /* S + A */
46 #define R_AMDGPU_GOTPCREL 7 /* G + GOT + A - P */
47 #define R_AMDGPU_GOTPCREL32_LO 8 /* (G + GOT + A - P) & 0xFFFFFFFF */
48 #define R_AMDGPU_GOTPCREL32_HI 9 /* (G + GOT + A - P) >> 32 */
49 #define R_AMDGPU_REL32_LO 10 /* (S + A - P) & 0xFFFFFFFF */
50 #define R_AMDGPU_REL32_HI 11 /* (S + A - P) >> 32 */
51 #define reserved 12
52 #define R_AMDGPU_RELATIVE64 13 /* B + A */
53 #endif
54
55 #include "hsa.h"
56
57 #ifndef HSA_RUNTIME_LIB
58 #define HSA_RUNTIME_LIB "libhsa-runtime64.so"
59 #endif
60
61 #ifndef VERSION_STRING
62 #define VERSION_STRING "(version unknown)"
63 #endif
64
65 bool debug = false;
66
67 hsa_agent_t device = { 0 };
68 hsa_queue_t *queue = NULL;
69 uint64_t kernel = 0;
70 hsa_executable_t executable = { 0 };
71
72 hsa_region_t kernargs_region = { 0 };
73 uint32_t kernarg_segment_size = 0;
74 uint32_t group_segment_size = 0;
75 uint32_t private_segment_size = 0;
76
77 static void
78 usage (const char *progname)
79 {
80 printf ("Usage: %s [options] kernel [kernel-args]\n\n"
81 "Options:\n"
82 " --help\n"
83 " --version\n"
84 " --debug\n", progname);
85 }
86
87 static void
88 version (const char *progname)
89 {
90 printf ("%s " VERSION_STRING "\n", progname);
91 }
92
93 /* As an HSA runtime is dlopened, following structure defines the necessary
94 function pointers.
95 Code adapted from libgomp. */
96
97 struct hsa_runtime_fn_info
98 {
99 /* HSA runtime. */
100 hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
101 const char **status_string);
102 hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
103 hsa_agent_info_t attribute,
104 void *value);
105 hsa_status_t (*hsa_init_fn) (void);
106 hsa_status_t (*hsa_iterate_agents_fn)
107 (hsa_status_t (*callback) (hsa_agent_t agent, void *data), void *data);
108 hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
109 hsa_region_info_t attribute,
110 void *value);
111 hsa_status_t (*hsa_queue_create_fn)
112 (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
113 void (*callback) (hsa_status_t status, hsa_queue_t *source, void *data),
114 void *data, uint32_t private_segment_size,
115 uint32_t group_segment_size, hsa_queue_t **queue);
116 hsa_status_t (*hsa_agent_iterate_regions_fn)
117 (hsa_agent_t agent,
118 hsa_status_t (*callback) (hsa_region_t region, void *data), void *data);
119 hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
120 hsa_status_t (*hsa_executable_create_fn)
121 (hsa_profile_t profile, hsa_executable_state_t executable_state,
122 const char *options, hsa_executable_t *executable);
123 hsa_status_t (*hsa_executable_global_variable_define_fn)
124 (hsa_executable_t executable, const char *variable_name, void *address);
125 hsa_status_t (*hsa_executable_load_code_object_fn)
126 (hsa_executable_t executable, hsa_agent_t agent,
127 hsa_code_object_t code_object, const char *options);
128 hsa_status_t (*hsa_executable_freeze_fn) (hsa_executable_t executable,
129 const char *options);
130 hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
131 uint32_t num_consumers,
132 const hsa_agent_t *consumers,
133 hsa_signal_t *signal);
134 hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
135 void **ptr);
136 hsa_status_t (*hsa_memory_copy_fn) (void *dst, const void *src,
137 size_t size);
138 hsa_status_t (*hsa_memory_free_fn) (void *ptr);
139 hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
140 hsa_status_t (*hsa_executable_get_symbol_fn)
141 (hsa_executable_t executable, const char *module_name,
142 const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
143 hsa_executable_symbol_t *symbol);
144 hsa_status_t (*hsa_executable_symbol_get_info_fn)
145 (hsa_executable_symbol_t executable_symbol,
146 hsa_executable_symbol_info_t attribute, void *value);
147 void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
148 hsa_signal_value_t value);
149 hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
150 (hsa_signal_t signal, hsa_signal_condition_t condition,
151 hsa_signal_value_t compare_value, uint64_t timeout_hint,
152 hsa_wait_state_t wait_state_hint);
153 hsa_signal_value_t (*hsa_signal_wait_relaxed_fn)
154 (hsa_signal_t signal, hsa_signal_condition_t condition,
155 hsa_signal_value_t compare_value, uint64_t timeout_hint,
156 hsa_wait_state_t wait_state_hint);
157 hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);
158 hsa_status_t (*hsa_code_object_deserialize_fn)
159 (void *serialized_code_object, size_t serialized_code_object_size,
160 const char *options, hsa_code_object_t *code_object);
161 uint64_t (*hsa_queue_load_write_index_relaxed_fn)
162 (const hsa_queue_t *queue);
163 void (*hsa_queue_store_write_index_relaxed_fn)
164 (const hsa_queue_t *queue, uint64_t value);
165 hsa_status_t (*hsa_shut_down_fn) ();
166 };
167
168 /* HSA runtime functions that are initialized in init_hsa_context.
169 Code adapted from libgomp. */
170
171 static struct hsa_runtime_fn_info hsa_fns;
172
173 #define DLSYM_FN(function) \
174 *(void**)(&hsa_fns.function##_fn) = dlsym (handle, #function); \
175 if (hsa_fns.function##_fn == NULL) \
176 goto fail;
177
178 static void
179 init_hsa_runtime_functions (void)
180 {
181 void *handle = dlopen (HSA_RUNTIME_LIB, RTLD_LAZY);
182 if (handle == NULL)
183 {
184 fprintf (stderr,
185 "The HSA runtime is required to run GCN kernels on hardware.\n"
186 "%s: File not found or could not be opened\n",
187 HSA_RUNTIME_LIB);
188 exit (1);
189 }
190
191 DLSYM_FN (hsa_status_string)
192 DLSYM_FN (hsa_agent_get_info)
193 DLSYM_FN (hsa_init)
194 DLSYM_FN (hsa_iterate_agents)
195 DLSYM_FN (hsa_region_get_info)
196 DLSYM_FN (hsa_queue_create)
197 DLSYM_FN (hsa_agent_iterate_regions)
198 DLSYM_FN (hsa_executable_destroy)
199 DLSYM_FN (hsa_executable_create)
200 DLSYM_FN (hsa_executable_global_variable_define)
201 DLSYM_FN (hsa_executable_load_code_object)
202 DLSYM_FN (hsa_executable_freeze)
203 DLSYM_FN (hsa_signal_create)
204 DLSYM_FN (hsa_memory_allocate)
205 DLSYM_FN (hsa_memory_copy)
206 DLSYM_FN (hsa_memory_free)
207 DLSYM_FN (hsa_signal_destroy)
208 DLSYM_FN (hsa_executable_get_symbol)
209 DLSYM_FN (hsa_executable_symbol_get_info)
210 DLSYM_FN (hsa_signal_wait_acquire)
211 DLSYM_FN (hsa_signal_wait_relaxed)
212 DLSYM_FN (hsa_signal_store_relaxed)
213 DLSYM_FN (hsa_queue_destroy)
214 DLSYM_FN (hsa_code_object_deserialize)
215 DLSYM_FN (hsa_queue_load_write_index_relaxed)
216 DLSYM_FN (hsa_queue_store_write_index_relaxed)
217 DLSYM_FN (hsa_shut_down)
218
219 return;
220
221 fail:
222 fprintf (stderr, "Failed to find HSA functions in " HSA_RUNTIME_LIB "\n");
223 exit (1);
224 }
225
226 #undef DLSYM_FN
227
228 /* Report a fatal error STR together with the HSA error corresponding to
229 STATUS and terminate execution of the current process. */
230
231 static void
232 hsa_fatal (const char *str, hsa_status_t status)
233 {
234 const char *hsa_error_msg;
235 hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
236 fprintf (stderr, "%s: FAILED\nHSA Runtime message: %s\n", str,
237 hsa_error_msg);
238 exit (1);
239 }
240
241 /* Helper macros to ensure we check the return values from the HSA Runtime.
242 These just keep the rest of the code a bit cleaner. */
243
244 #define XHSA_CMP(FN, CMP, MSG) \
245 do { \
246 hsa_status_t status = (FN); \
247 if (!(CMP)) \
248 hsa_fatal ((MSG), status); \
249 else if (debug) \
250 fprintf (stderr, "%s: OK\n", (MSG)); \
251 } while (0)
252 #define XHSA(FN, MSG) XHSA_CMP(FN, status == HSA_STATUS_SUCCESS, MSG)
253
254 /* Callback of hsa_iterate_agents.
255 Called once for each available device, and returns "break" when a
256 suitable one has been found. */
257
258 static hsa_status_t
259 get_gpu_agent (hsa_agent_t agent, void *data __attribute__ ((unused)))
260 {
261 hsa_device_type_t device_type;
262 XHSA (hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
263 &device_type),
264 "Get agent type");
265
266 /* Select only GPU devices. */
267 /* TODO: support selecting from multiple GPUs. */
268 if (HSA_DEVICE_TYPE_GPU == device_type)
269 {
270 device = agent;
271 return HSA_STATUS_INFO_BREAK;
272 }
273
274 /* The device was not suitable. */
275 return HSA_STATUS_SUCCESS;
276 }
277
278 /* Callback of hsa_iterate_regions.
279 Called once for each available memory region, and returns "break" when a
280 suitable one has been found. */
281
282 static hsa_status_t
283 get_kernarg_region (hsa_region_t region, void *data __attribute__ ((unused)))
284 {
285 /* Reject non-global regions. */
286 hsa_region_segment_t segment;
287 hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT, &segment);
288 if (HSA_REGION_SEGMENT_GLOBAL != segment)
289 return HSA_STATUS_SUCCESS;
290
291 /* Find a region with the KERNARG flag set. */
292 hsa_region_global_flag_t flags;
293 hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
294 &flags);
295 if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
296 {
297 kernargs_region = region;
298 return HSA_STATUS_INFO_BREAK;
299 }
300
301 /* The region was not suitable. */
302 return HSA_STATUS_SUCCESS;
303 }
304
305 /* Initialize the HSA Runtime library and GPU device. */
306
307 static void
308 init_device ()
309 {
310 /* Load the shared library and find the API functions. */
311 init_hsa_runtime_functions ();
312
313 /* Initialize the HSA Runtime. */
314 XHSA (hsa_fns.hsa_init_fn (),
315 "Initialize run-time");
316
317 /* Select a suitable device.
318 The call-back function, get_gpu_agent, does the selection. */
319 XHSA_CMP (hsa_fns.hsa_iterate_agents_fn (get_gpu_agent, NULL),
320 status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK,
321 "Find a device");
322
323 /* Initialize the queue used for launching kernels. */
324 uint32_t queue_size = 0;
325 XHSA (hsa_fns.hsa_agent_get_info_fn (device, HSA_AGENT_INFO_QUEUE_MAX_SIZE,
326 &queue_size),
327 "Find max queue size");
328 XHSA (hsa_fns.hsa_queue_create_fn (device, queue_size,
329 HSA_QUEUE_TYPE_SINGLE, NULL,
330 NULL, UINT32_MAX, UINT32_MAX, &queue),
331 "Set up a device queue");
332
333 /* Select a memory region for the kernel arguments.
334 The call-back function, get_kernarg_region, does the selection. */
335 XHSA_CMP (hsa_fns.hsa_agent_iterate_regions_fn (device, get_kernarg_region,
336 NULL),
337 status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK,
338 "Locate kernargs memory");
339 }
340
341
342 /* Read a whole input file.
343 Code copied from mkoffload. */
344
345 static char *
346 read_file (const char *filename, size_t *plen)
347 {
348 size_t alloc = 16384;
349 size_t base = 0;
350 char *buffer;
351
352 FILE *stream = fopen (filename, "rb");
353 if (!stream)
354 {
355 perror (filename);
356 exit (1);
357 }
358
359 if (!fseek (stream, 0, SEEK_END))
360 {
361 /* Get the file size. */
362 long s = ftell (stream);
363 if (s >= 0)
364 alloc = s + 100;
365 fseek (stream, 0, SEEK_SET);
366 }
367 buffer = malloc (alloc);
368
369 for (;;)
370 {
371 size_t n = fread (buffer + base, 1, alloc - base - 1, stream);
372
373 if (!n)
374 break;
375 base += n;
376 if (base + 1 == alloc)
377 {
378 alloc *= 2;
379 buffer = realloc (buffer, alloc);
380 }
381 }
382 buffer[base] = 0;
383 *plen = base;
384
385 fclose (stream);
386
387 return buffer;
388 }
389
390 /* Read a HSA Code Object (HSACO) from file, and load it into the device. */
391
392 static void
393 load_image (const char *filename)
394 {
395 size_t image_size;
396 Elf64_Ehdr *image = (void *) read_file (filename, &image_size);
397
398 /* An "executable" consists of one or more code objects. */
399 XHSA (hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
400 HSA_EXECUTABLE_STATE_UNFROZEN, "",
401 &executable),
402 "Initialize GCN executable");
403
404 /* Hide relocations from the HSA runtime loader.
405 Keep a copy of the unmodified section headers to use later. */
406 Elf64_Shdr *image_sections =
407 (Elf64_Shdr *) ((char *) image + image->e_shoff);
408 Elf64_Shdr *sections = malloc (sizeof (Elf64_Shdr) * image->e_shnum);
409 memcpy (sections, image_sections, sizeof (Elf64_Shdr) * image->e_shnum);
410 for (int i = image->e_shnum - 1; i >= 0; i--)
411 {
412 if (image_sections[i].sh_type == SHT_RELA
413 || image_sections[i].sh_type == SHT_REL)
414 /* Change section type to something harmless. */
415 image_sections[i].sh_type = SHT_NOTE;
416 }
417
418 /* Add the HSACO to the executable. */
419 hsa_code_object_t co = { 0 };
420 XHSA (hsa_fns.hsa_code_object_deserialize_fn (image, image_size, NULL, &co),
421 "Deserialize GCN code object");
422 XHSA (hsa_fns.hsa_executable_load_code_object_fn (executable, device, co,
423 ""),
424 "Load GCN code object");
425
426 /* We're done modifying he executable. */
427 XHSA (hsa_fns.hsa_executable_freeze_fn (executable, ""),
428 "Freeze GCN executable");
429
430 /* Locate the "main" function, and read the kernel's properties. */
431 hsa_executable_symbol_t symbol;
432 XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "main",
433 device, 0, &symbol),
434 "Find 'main' function");
435 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
436 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel),
437 "Extract kernel object");
438 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
439 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
440 &kernarg_segment_size),
441 "Extract kernarg segment size");
442 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
443 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
444 &group_segment_size),
445 "Extract group segment size");
446 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
447 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
448 &private_segment_size),
449 "Extract private segment size");
450
451 /* Find main function in ELF, and calculate actual load offset. */
452 Elf64_Addr load_offset;
453 XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
454 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
455 &load_offset),
456 "Extract 'main' symbol address");
457 for (int i = 0; i < image->e_shnum; i++)
458 if (sections[i].sh_type == SHT_SYMTAB)
459 {
460 Elf64_Shdr *strtab = &sections[sections[i].sh_link];
461 char *strings = (char *) image + strtab->sh_offset;
462
463 for (size_t offset = 0;
464 offset < sections[i].sh_size;
465 offset += sections[i].sh_entsize)
466 {
467 Elf64_Sym *sym = (Elf64_Sym *) ((char *) image
468 + sections[i].sh_offset + offset);
469 if (strcmp ("main", strings + sym->st_name) == 0)
470 {
471 load_offset -= sym->st_value;
472 goto found_main;
473 }
474 }
475 }
476 /* We only get here when main was not found.
477 This should never happen. */
478 fprintf (stderr, "Error: main function not found.\n");
479 abort ();
480 found_main:;
481
482 /* Find dynamic symbol table. */
483 Elf64_Shdr *dynsym = NULL;
484 for (int i = 0; i < image->e_shnum; i++)
485 if (sections[i].sh_type == SHT_DYNSYM)
486 {
487 dynsym = &sections[i];
488 break;
489 }
490
491 /* Fix up relocations. */
492 for (int i = 0; i < image->e_shnum; i++)
493 {
494 if (sections[i].sh_type == SHT_RELA)
495 for (size_t offset = 0;
496 offset < sections[i].sh_size;
497 offset += sections[i].sh_entsize)
498 {
499 Elf64_Rela *reloc = (Elf64_Rela *) ((char *) image
500 + sections[i].sh_offset
501 + offset);
502 Elf64_Sym *sym =
503 (dynsym
504 ? (Elf64_Sym *) ((char *) image
505 + dynsym->sh_offset
506 + (dynsym->sh_entsize
507 * ELF64_R_SYM (reloc->r_info))) : NULL);
508
509 int64_t S = (sym ? sym->st_value : 0);
510 int64_t P = reloc->r_offset + load_offset;
511 int64_t A = reloc->r_addend;
512 int64_t B = load_offset;
513 int64_t V, size;
514 switch (ELF64_R_TYPE (reloc->r_info))
515 {
516 case R_AMDGPU_ABS32_LO:
517 V = (S + A) & 0xFFFFFFFF;
518 size = 4;
519 break;
520 case R_AMDGPU_ABS32_HI:
521 V = (S + A) >> 32;
522 size = 4;
523 break;
524 case R_AMDGPU_ABS64:
525 V = S + A;
526 size = 8;
527 break;
528 case R_AMDGPU_REL32:
529 V = S + A - P;
530 size = 4;
531 break;
532 case R_AMDGPU_REL64:
533 /* FIXME
534 LLD seems to emit REL64 where the the assembler has ABS64.
535 This is clearly wrong because it's not what the compiler
536 is expecting. Let's assume, for now, that it's a bug.
537 In any case, GCN kernels are always self contained and
538 therefore relative relocations will have been resolved
539 already, so this should be a safe workaround. */
540 V = S + A /* - P */ ;
541 size = 8;
542 break;
543 case R_AMDGPU_ABS32:
544 V = S + A;
545 size = 4;
546 break;
547 /* TODO R_AMDGPU_GOTPCREL */
548 /* TODO R_AMDGPU_GOTPCREL32_LO */
549 /* TODO R_AMDGPU_GOTPCREL32_HI */
550 case R_AMDGPU_REL32_LO:
551 V = (S + A - P) & 0xFFFFFFFF;
552 size = 4;
553 break;
554 case R_AMDGPU_REL32_HI:
555 V = (S + A - P) >> 32;
556 size = 4;
557 break;
558 case R_AMDGPU_RELATIVE64:
559 V = B + A;
560 size = 8;
561 break;
562 default:
563 fprintf (stderr, "Error: unsupported relocation type.\n");
564 exit (1);
565 }
566 XHSA (hsa_fns.hsa_memory_copy_fn ((void *) P, &V, size),
567 "Fix up relocation");
568 }
569 }
570 }
571
572 /* Allocate some device memory from the kernargs region.
573 The returned address will be 32-bit (with excess zeroed on 64-bit host),
574 and accessible via the same address on both host and target (via
575 __flat_scalar GCN address space). */
576
577 static void *
578 device_malloc (size_t size)
579 {
580 void *result;
581 XHSA (hsa_fns.hsa_memory_allocate_fn (kernargs_region, size, &result),
582 "Allocate device memory");
583 return result;
584 }
585
586 /* These are the device pointers that will be transferred to the target.
587 The HSA Runtime points the kernargs register here.
588 They correspond to function signature:
589 int main (int argc, char *argv[], int *return_value)
590 The compiler expects this, for kernel functions, and will
591 automatically assign the exit value to *return_value. */
592 struct kernargs
593 {
594 /* Kernargs. */
595 int32_t argc;
596 int64_t argv;
597 int64_t out_ptr;
598 int64_t heap_ptr;
599
600 /* Output data. */
601 struct output
602 {
603 int return_value;
604 unsigned int next_output;
605 struct printf_data
606 {
607 int written;
608 char msg[128];
609 int type;
610 union
611 {
612 int64_t ivalue;
613 double dvalue;
614 char text[128];
615 };
616 } queue[1024];
617 unsigned int consumed;
618 } output_data;
619
620 struct heap
621 {
622 int64_t size;
623 char data[0];
624 } heap;
625 };
626
627 /* Print any console output from the kernel.
628 We print all entries from "consumed" to the next entry without a "written"
629 flag, or "next_output" is reached. The buffer is circular, but the
630 indices are absolute. It is assumed the kernel will stop writing data
631 if "next_output" wraps (becomes smaller than "consumed"). */
632 void
633 gomp_print_output (struct kernargs *kernargs, bool final)
634 {
635 unsigned int limit = (sizeof (kernargs->output_data.queue)
636 / sizeof (kernargs->output_data.queue[0]));
637
638 unsigned int from = __atomic_load_n (&kernargs->output_data.consumed,
639 __ATOMIC_ACQUIRE);
640 unsigned int to = kernargs->output_data.next_output;
641
642 if (from > to)
643 {
644 /* Overflow. */
645 if (final)
646 printf ("GCN print buffer overflowed.\n");
647 return;
648 }
649
650 unsigned int i;
651 for (i = from; i < to; i++)
652 {
653 struct printf_data *data = &kernargs->output_data.queue[i%limit];
654
655 if (!data->written && !final)
656 break;
657
658 switch (data->type)
659 {
660 case 0:
661 printf ("%.128s%ld\n", data->msg, data->ivalue);
662 break;
663 case 1:
664 printf ("%.128s%f\n", data->msg, data->dvalue);
665 break;
666 case 2:
667 printf ("%.128s%.128s\n", data->msg, data->text);
668 break;
669 case 3:
670 printf ("%.128s%.128s", data->msg, data->text);
671 break;
672 default:
673 printf ("GCN print buffer error!\n");
674 break;
675 }
676
677 data->written = 0;
678 __atomic_store_n (&kernargs->output_data.consumed, i+1,
679 __ATOMIC_RELEASE);
680 }
681 fflush (stdout);
682 }
683
684 /* Execute an already-loaded kernel on the device. */
685
686 static void
687 run (void *kernargs)
688 {
689 /* A "signal" is used to launch and monitor the kernel. */
690 hsa_signal_t signal;
691 XHSA (hsa_fns.hsa_signal_create_fn (1, 0, NULL, &signal),
692 "Create signal");
693
694 /* Configure for a single-worker kernel. */
695 uint64_t index = hsa_fns.hsa_queue_load_write_index_relaxed_fn (queue);
696 const uint32_t queueMask = queue->size - 1;
697 hsa_kernel_dispatch_packet_t *dispatch_packet =
698 &(((hsa_kernel_dispatch_packet_t *) (queue->base_address))[index &
699 queueMask]);
700 dispatch_packet->setup |= 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
701 dispatch_packet->workgroup_size_x = (uint16_t) 1;
702 dispatch_packet->workgroup_size_y = (uint16_t) 64;
703 dispatch_packet->workgroup_size_z = (uint16_t) 1;
704 dispatch_packet->grid_size_x = 1;
705 dispatch_packet->grid_size_y = 64;
706 dispatch_packet->grid_size_z = 1;
707 dispatch_packet->completion_signal = signal;
708 dispatch_packet->kernel_object = kernel;
709 dispatch_packet->kernarg_address = (void *) kernargs;
710 dispatch_packet->private_segment_size = private_segment_size;
711 dispatch_packet->group_segment_size = group_segment_size;
712
713 uint16_t header = 0;
714 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
715 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
716 header |= HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
717
718 __atomic_store_n ((uint32_t *) dispatch_packet,
719 header | (dispatch_packet->setup << 16),
720 __ATOMIC_RELEASE);
721
722 if (debug)
723 fprintf (stderr, "Launch kernel\n");
724
725 hsa_fns.hsa_queue_store_write_index_relaxed_fn (queue, index + 1);
726 hsa_fns.hsa_signal_store_relaxed_fn (queue->doorbell_signal, index);
727 /* Kernel running ...... */
728 while (hsa_fns.hsa_signal_wait_relaxed_fn (signal, HSA_SIGNAL_CONDITION_LT,
729 1, 1000000,
730 HSA_WAIT_STATE_ACTIVE) != 0)
731 {
732 usleep (10000);
733 gomp_print_output (kernargs, false);
734 }
735
736 gomp_print_output (kernargs, true);
737
738 if (debug)
739 fprintf (stderr, "Kernel exited\n");
740
741 XHSA (hsa_fns.hsa_signal_destroy_fn (signal),
742 "Clean up signal");
743 }
744
745 int
746 main (int argc, char *argv[])
747 {
748 int kernel_arg = 0;
749 for (int i = 1; i < argc; i++)
750 {
751 if (!strcmp (argv[i], "--help"))
752 {
753 usage (argv[0]);
754 return 0;
755 }
756 else if (!strcmp (argv[i], "--version"))
757 {
758 version (argv[0]);
759 return 0;
760 }
761 else if (!strcmp (argv[i], "--debug"))
762 debug = true;
763 else if (argv[i][0] == '-')
764 {
765 usage (argv[0]);
766 return 1;
767 }
768 else
769 {
770 kernel_arg = i;
771 break;
772 }
773 }
774
775 if (!kernel_arg)
776 {
777 /* No kernel arguments were found. */
778 usage (argv[0]);
779 return 1;
780 }
781
782 /* The remaining arguments are for the GCN kernel. */
783 int kernel_argc = argc - kernel_arg;
784 char **kernel_argv = &argv[kernel_arg];
785
786 init_device ();
787 load_image (kernel_argv[0]);
788
789 /* Calculate size of function parameters + argv data. */
790 size_t args_size = 0;
791 for (int i = 0; i < kernel_argc; i++)
792 args_size += strlen (kernel_argv[i]) + 1;
793
794 /* Allocate device memory for both function parameters and the argv
795 data. */
796 size_t heap_size = 10 * 1024 * 1024; /* 10MB. */
797 struct kernargs *kernargs = device_malloc (sizeof (*kernargs) + heap_size);
798 struct argdata
799 {
800 int64_t argv_data[kernel_argc];
801 char strings[args_size];
802 } *args = device_malloc (sizeof (struct argdata));
803
804 /* Write the data to the target. */
805 kernargs->argc = kernel_argc;
806 kernargs->argv = (int64_t) args->argv_data;
807 kernargs->out_ptr = (int64_t) &kernargs->output_data;
808 kernargs->output_data.return_value = 0xcafe0000; /* Default return value. */
809 kernargs->output_data.next_output = 0;
810 for (unsigned i = 0; i < (sizeof (kernargs->output_data.queue)
811 / sizeof (kernargs->output_data.queue[0])); i++)
812 kernargs->output_data.queue[i].written = 0;
813 kernargs->output_data.consumed = 0;
814 int offset = 0;
815 for (int i = 0; i < kernel_argc; i++)
816 {
817 size_t arg_len = strlen (kernel_argv[i]) + 1;
818 args->argv_data[i] = (int64_t) &args->strings[offset];
819 memcpy (&args->strings[offset], kernel_argv[i], arg_len + 1);
820 offset += arg_len;
821 }
822 kernargs->heap_ptr = (int64_t) &kernargs->heap;
823 kernargs->heap.size = heap_size;
824
825 /* Run the kernel on the GPU. */
826 run (kernargs);
827 unsigned int return_value =
828 (unsigned int) kernargs->output_data.return_value;
829
830 unsigned int upper = (return_value & ~0xffff) >> 16;
831 if (upper == 0xcafe)
832 printf ("Kernel exit value was never set\n");
833 else if (upper == 0xffff)
834 ; /* Set by exit. */
835 else if (upper == 0)
836 ; /* Set by return from main. */
837 else
838 printf ("Possible kernel exit value corruption, 2 most significant bytes "
839 "aren't 0xffff, 0xcafe, or 0: 0x%x\n", return_value);
840
841 if (upper == 0xffff)
842 {
843 unsigned int signal = (return_value >> 8) & 0xff;
844 if (signal == SIGABRT)
845 printf ("Kernel aborted\n");
846 else if (signal != 0)
847 printf ("Kernel received unkown signal\n");
848 }
849
850 if (debug)
851 printf ("Kernel exit value: %d\n", return_value & 0xff);
852
853 /* Clean shut down. */
854 XHSA (hsa_fns.hsa_memory_free_fn (kernargs),
855 "Clean up device memory");
856 XHSA (hsa_fns.hsa_executable_destroy_fn (executable),
857 "Clean up GCN executable");
858 XHSA (hsa_fns.hsa_queue_destroy_fn (queue),
859 "Clean up device queue");
860 XHSA (hsa_fns.hsa_shut_down_fn (),
861 "Shut down run-time");
862
863 return return_value & 0xff;
864 }