1 /* Run a stand-alone AMD GCN kernel.
3 Copyright 2017 Mentor Graphics Corporation
4 Copyright (C) 2018-2020 Free Software Foundation, Inc.
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 /* This program will run a compiled stand-alone GCN kernel on a GPU.
21 The kernel entry point's signature must use a standard main signature:
23 int main(int argc, char **argv)
37 /* These probably won't be in elf.h for a while. */
39 #define R_AMDGPU_NONE 0
40 #define R_AMDGPU_ABS32_LO 1 /* (S + A) & 0xFFFFFFFF */
41 #define R_AMDGPU_ABS32_HI 2 /* (S + A) >> 32 */
42 #define R_AMDGPU_ABS64 3 /* S + A */
43 #define R_AMDGPU_REL32 4 /* S + A - P */
44 #define R_AMDGPU_REL64 5 /* S + A - P */
45 #define R_AMDGPU_ABS32 6 /* S + A */
46 #define R_AMDGPU_GOTPCREL 7 /* G + GOT + A - P */
47 #define R_AMDGPU_GOTPCREL32_LO 8 /* (G + GOT + A - P) & 0xFFFFFFFF */
48 #define R_AMDGPU_GOTPCREL32_HI 9 /* (G + GOT + A - P) >> 32 */
49 #define R_AMDGPU_REL32_LO 10 /* (S + A - P) & 0xFFFFFFFF */
50 #define R_AMDGPU_REL32_HI 11 /* (S + A - P) >> 32 */
52 #define R_AMDGPU_RELATIVE64 13 /* B + A */
57 #ifndef HSA_RUNTIME_LIB
58 #define HSA_RUNTIME_LIB "libhsa-runtime64.so"
61 #ifndef VERSION_STRING
62 #define VERSION_STRING "(version unknown)"
67 hsa_agent_t device
= { 0 };
68 hsa_queue_t
*queue
= NULL
;
69 uint64_t init_array_kernel
= 0;
70 uint64_t fini_array_kernel
= 0;
71 uint64_t main_kernel
= 0;
72 hsa_executable_t executable
= { 0 };
74 hsa_region_t kernargs_region
= { 0 };
75 hsa_region_t heap_region
= { 0 };
76 uint32_t kernarg_segment_size
= 0;
77 uint32_t group_segment_size
= 0;
78 uint32_t private_segment_size
= 0;
81 usage (const char *progname
)
83 printf ("Usage: %s [options] kernel [kernel-args]\n\n"
87 " --debug\n", progname
);
91 version (const char *progname
)
93 printf ("%s " VERSION_STRING
"\n", progname
);
96 /* As an HSA runtime is dlopened, following structure defines the necessary
98 Code adapted from libgomp. */
100 struct hsa_runtime_fn_info
103 hsa_status_t (*hsa_status_string_fn
) (hsa_status_t status
,
104 const char **status_string
);
105 hsa_status_t (*hsa_agent_get_info_fn
) (hsa_agent_t agent
,
106 hsa_agent_info_t attribute
,
108 hsa_status_t (*hsa_init_fn
) (void);
109 hsa_status_t (*hsa_iterate_agents_fn
)
110 (hsa_status_t (*callback
) (hsa_agent_t agent
, void *data
), void *data
);
111 hsa_status_t (*hsa_region_get_info_fn
) (hsa_region_t region
,
112 hsa_region_info_t attribute
,
114 hsa_status_t (*hsa_queue_create_fn
)
115 (hsa_agent_t agent
, uint32_t size
, hsa_queue_type_t type
,
116 void (*callback
) (hsa_status_t status
, hsa_queue_t
*source
, void *data
),
117 void *data
, uint32_t private_segment_size
,
118 uint32_t group_segment_size
, hsa_queue_t
**queue
);
119 hsa_status_t (*hsa_agent_iterate_regions_fn
)
121 hsa_status_t (*callback
) (hsa_region_t region
, void *data
), void *data
);
122 hsa_status_t (*hsa_executable_destroy_fn
) (hsa_executable_t executable
);
123 hsa_status_t (*hsa_executable_create_fn
)
124 (hsa_profile_t profile
, hsa_executable_state_t executable_state
,
125 const char *options
, hsa_executable_t
*executable
);
126 hsa_status_t (*hsa_executable_global_variable_define_fn
)
127 (hsa_executable_t executable
, const char *variable_name
, void *address
);
128 hsa_status_t (*hsa_executable_load_code_object_fn
)
129 (hsa_executable_t executable
, hsa_agent_t agent
,
130 hsa_code_object_t code_object
, const char *options
);
131 hsa_status_t (*hsa_executable_freeze_fn
) (hsa_executable_t executable
,
132 const char *options
);
133 hsa_status_t (*hsa_signal_create_fn
) (hsa_signal_value_t initial_value
,
134 uint32_t num_consumers
,
135 const hsa_agent_t
*consumers
,
136 hsa_signal_t
*signal
);
137 hsa_status_t (*hsa_memory_allocate_fn
) (hsa_region_t region
, size_t size
,
139 hsa_status_t (*hsa_memory_assign_agent_fn
) (void *ptr
, hsa_agent_t agent
,
140 hsa_access_permission_t access
);
141 hsa_status_t (*hsa_memory_copy_fn
) (void *dst
, const void *src
,
143 hsa_status_t (*hsa_memory_free_fn
) (void *ptr
);
144 hsa_status_t (*hsa_signal_destroy_fn
) (hsa_signal_t signal
);
145 hsa_status_t (*hsa_executable_get_symbol_fn
)
146 (hsa_executable_t executable
, const char *module_name
,
147 const char *symbol_name
, hsa_agent_t agent
, int32_t call_convention
,
148 hsa_executable_symbol_t
*symbol
);
149 hsa_status_t (*hsa_executable_symbol_get_info_fn
)
150 (hsa_executable_symbol_t executable_symbol
,
151 hsa_executable_symbol_info_t attribute
, void *value
);
152 void (*hsa_signal_store_relaxed_fn
) (hsa_signal_t signal
,
153 hsa_signal_value_t value
);
154 hsa_signal_value_t (*hsa_signal_wait_acquire_fn
)
155 (hsa_signal_t signal
, hsa_signal_condition_t condition
,
156 hsa_signal_value_t compare_value
, uint64_t timeout_hint
,
157 hsa_wait_state_t wait_state_hint
);
158 hsa_signal_value_t (*hsa_signal_wait_relaxed_fn
)
159 (hsa_signal_t signal
, hsa_signal_condition_t condition
,
160 hsa_signal_value_t compare_value
, uint64_t timeout_hint
,
161 hsa_wait_state_t wait_state_hint
);
162 hsa_status_t (*hsa_queue_destroy_fn
) (hsa_queue_t
*queue
);
163 hsa_status_t (*hsa_code_object_deserialize_fn
)
164 (void *serialized_code_object
, size_t serialized_code_object_size
,
165 const char *options
, hsa_code_object_t
*code_object
);
166 uint64_t (*hsa_queue_load_write_index_relaxed_fn
)
167 (const hsa_queue_t
*queue
);
168 void (*hsa_queue_store_write_index_relaxed_fn
)
169 (const hsa_queue_t
*queue
, uint64_t value
);
170 hsa_status_t (*hsa_shut_down_fn
) ();
173 /* HSA runtime functions that are initialized in init_hsa_context.
174 Code adapted from libgomp. */
176 static struct hsa_runtime_fn_info hsa_fns
;
178 #define DLSYM_FN(function) \
179 *(void**)(&hsa_fns.function##_fn) = dlsym (handle, #function); \
180 if (hsa_fns.function##_fn == NULL) \
184 init_hsa_runtime_functions (void)
186 void *handle
= dlopen (HSA_RUNTIME_LIB
, RTLD_LAZY
);
190 "The HSA runtime is required to run GCN kernels on hardware.\n"
191 "%s: File not found or could not be opened\n",
196 DLSYM_FN (hsa_status_string
)
197 DLSYM_FN (hsa_agent_get_info
)
199 DLSYM_FN (hsa_iterate_agents
)
200 DLSYM_FN (hsa_region_get_info
)
201 DLSYM_FN (hsa_queue_create
)
202 DLSYM_FN (hsa_agent_iterate_regions
)
203 DLSYM_FN (hsa_executable_destroy
)
204 DLSYM_FN (hsa_executable_create
)
205 DLSYM_FN (hsa_executable_global_variable_define
)
206 DLSYM_FN (hsa_executable_load_code_object
)
207 DLSYM_FN (hsa_executable_freeze
)
208 DLSYM_FN (hsa_signal_create
)
209 DLSYM_FN (hsa_memory_allocate
)
210 DLSYM_FN (hsa_memory_assign_agent
)
211 DLSYM_FN (hsa_memory_copy
)
212 DLSYM_FN (hsa_memory_free
)
213 DLSYM_FN (hsa_signal_destroy
)
214 DLSYM_FN (hsa_executable_get_symbol
)
215 DLSYM_FN (hsa_executable_symbol_get_info
)
216 DLSYM_FN (hsa_signal_wait_acquire
)
217 DLSYM_FN (hsa_signal_wait_relaxed
)
218 DLSYM_FN (hsa_signal_store_relaxed
)
219 DLSYM_FN (hsa_queue_destroy
)
220 DLSYM_FN (hsa_code_object_deserialize
)
221 DLSYM_FN (hsa_queue_load_write_index_relaxed
)
222 DLSYM_FN (hsa_queue_store_write_index_relaxed
)
223 DLSYM_FN (hsa_shut_down
)
228 fprintf (stderr
, "Failed to find HSA functions in " HSA_RUNTIME_LIB
"\n");
234 /* Report a fatal error STR together with the HSA error corresponding to
235 STATUS and terminate execution of the current process. */
238 hsa_fatal (const char *str
, hsa_status_t status
)
240 const char *hsa_error_msg
;
241 hsa_fns
.hsa_status_string_fn (status
, &hsa_error_msg
);
242 fprintf (stderr
, "%s: FAILED\nHSA Runtime message: %s\n", str
,
247 /* Helper macros to ensure we check the return values from the HSA Runtime.
248 These just keep the rest of the code a bit cleaner. */
250 #define XHSA_CMP(FN, CMP, MSG) \
252 hsa_status_t status = (FN); \
254 hsa_fatal ((MSG), status); \
256 fprintf (stderr, "%s: OK\n", (MSG)); \
258 #define XHSA(FN, MSG) XHSA_CMP(FN, status == HSA_STATUS_SUCCESS, MSG)
260 /* Callback of hsa_iterate_agents.
261 Called once for each available device, and returns "break" when a
262 suitable one has been found. */
265 get_gpu_agent (hsa_agent_t agent
, void *data
__attribute__ ((unused
)))
267 hsa_device_type_t device_type
;
268 XHSA (hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_DEVICE
,
272 /* Select only GPU devices. */
273 /* TODO: support selecting from multiple GPUs. */
274 if (HSA_DEVICE_TYPE_GPU
== device_type
)
277 return HSA_STATUS_INFO_BREAK
;
280 /* The device was not suitable. */
281 return HSA_STATUS_SUCCESS
;
284 /* Callback of hsa_iterate_regions.
285 Called once for each available memory region, and returns "break" when a
286 suitable one has been found. */
289 get_memory_region (hsa_region_t region
, hsa_region_t
*retval
,
290 hsa_region_global_flag_t kind
)
292 /* Reject non-global regions. */
293 hsa_region_segment_t segment
;
294 hsa_fns
.hsa_region_get_info_fn (region
, HSA_REGION_INFO_SEGMENT
, &segment
);
295 if (HSA_REGION_SEGMENT_GLOBAL
!= segment
)
296 return HSA_STATUS_SUCCESS
;
298 /* Find a region with the KERNARG flag set. */
299 hsa_region_global_flag_t flags
;
300 hsa_fns
.hsa_region_get_info_fn (region
, HSA_REGION_INFO_GLOBAL_FLAGS
,
305 return HSA_STATUS_INFO_BREAK
;
308 /* The region was not suitable. */
309 return HSA_STATUS_SUCCESS
;
313 get_kernarg_region (hsa_region_t region
, void *data
__attribute__((unused
)))
315 return get_memory_region (region
, &kernargs_region
,
316 HSA_REGION_GLOBAL_FLAG_KERNARG
);
320 get_heap_region (hsa_region_t region
, void *data
__attribute__((unused
)))
322 return get_memory_region (region
, &heap_region
,
323 HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED
);
326 /* Initialize the HSA Runtime library and GPU device. */
331 /* Load the shared library and find the API functions. */
332 init_hsa_runtime_functions ();
334 /* Initialize the HSA Runtime. */
335 XHSA (hsa_fns
.hsa_init_fn (),
336 "Initialize run-time");
338 /* Select a suitable device.
339 The call-back function, get_gpu_agent, does the selection. */
340 XHSA_CMP (hsa_fns
.hsa_iterate_agents_fn (get_gpu_agent
, NULL
),
341 status
== HSA_STATUS_SUCCESS
|| status
== HSA_STATUS_INFO_BREAK
,
344 /* Initialize the queue used for launching kernels. */
345 uint32_t queue_size
= 0;
346 XHSA (hsa_fns
.hsa_agent_get_info_fn (device
, HSA_AGENT_INFO_QUEUE_MAX_SIZE
,
348 "Find max queue size");
349 XHSA (hsa_fns
.hsa_queue_create_fn (device
, queue_size
,
350 HSA_QUEUE_TYPE_SINGLE
, NULL
,
351 NULL
, UINT32_MAX
, UINT32_MAX
, &queue
),
352 "Set up a device queue");
354 /* Select a memory region for the kernel arguments.
355 The call-back function, get_kernarg_region, does the selection. */
356 XHSA_CMP (hsa_fns
.hsa_agent_iterate_regions_fn (device
, get_kernarg_region
,
358 status
== HSA_STATUS_SUCCESS
|| status
== HSA_STATUS_INFO_BREAK
,
359 "Locate kernargs memory");
361 /* Select a memory region for the kernel heap.
362 The call-back function, get_heap_region, does the selection. */
363 XHSA_CMP (hsa_fns
.hsa_agent_iterate_regions_fn (device
, get_heap_region
,
365 status
== HSA_STATUS_SUCCESS
|| status
== HSA_STATUS_INFO_BREAK
,
366 "Locate device memory");
370 /* Read a whole input file.
371 Code copied from mkoffload. */
374 read_file (const char *filename
, size_t *plen
)
376 size_t alloc
= 16384;
380 FILE *stream
= fopen (filename
, "rb");
387 if (!fseek (stream
, 0, SEEK_END
))
389 /* Get the file size. */
390 long s
= ftell (stream
);
393 fseek (stream
, 0, SEEK_SET
);
395 buffer
= malloc (alloc
);
399 size_t n
= fread (buffer
+ base
, 1, alloc
- base
- 1, stream
);
404 if (base
+ 1 == alloc
)
407 buffer
= realloc (buffer
, alloc
);
418 /* Read a HSA Code Object (HSACO) from file, and load it into the device. */
421 load_image (const char *filename
)
424 Elf64_Ehdr
*image
= (void *) read_file (filename
, &image_size
);
426 /* An "executable" consists of one or more code objects. */
427 XHSA (hsa_fns
.hsa_executable_create_fn (HSA_PROFILE_FULL
,
428 HSA_EXECUTABLE_STATE_UNFROZEN
, "",
430 "Initialize GCN executable");
432 /* Hide relocations from the HSA runtime loader.
433 Keep a copy of the unmodified section headers to use later. */
434 Elf64_Shdr
*image_sections
=
435 (Elf64_Shdr
*) ((char *) image
+ image
->e_shoff
);
436 Elf64_Shdr
*sections
= malloc (sizeof (Elf64_Shdr
) * image
->e_shnum
);
437 memcpy (sections
, image_sections
, sizeof (Elf64_Shdr
) * image
->e_shnum
);
438 for (int i
= image
->e_shnum
- 1; i
>= 0; i
--)
440 if (image_sections
[i
].sh_type
== SHT_RELA
441 || image_sections
[i
].sh_type
== SHT_REL
)
442 /* Change section type to something harmless. */
443 image_sections
[i
].sh_type
= SHT_NOTE
;
446 /* Add the HSACO to the executable. */
447 hsa_code_object_t co
= { 0 };
448 XHSA (hsa_fns
.hsa_code_object_deserialize_fn (image
, image_size
, NULL
, &co
),
449 "Deserialize GCN code object");
450 XHSA (hsa_fns
.hsa_executable_load_code_object_fn (executable
, device
, co
,
452 "Load GCN code object");
454 /* We're done modifying he executable. */
455 XHSA (hsa_fns
.hsa_executable_freeze_fn (executable
, ""),
456 "Freeze GCN executable");
458 /* Locate the "_init_array" function, and read the kernel's properties. */
459 hsa_executable_symbol_t symbol
;
460 XHSA (hsa_fns
.hsa_executable_get_symbol_fn (executable
, NULL
, "_init_array",
462 "Find '_init_array' function");
463 XHSA (hsa_fns
.hsa_executable_symbol_get_info_fn
464 (symbol
, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT
, &init_array_kernel
),
465 "Extract '_init_array' kernel object kernel object");
467 /* Locate the "_fini_array" function, and read the kernel's properties. */
468 XHSA (hsa_fns
.hsa_executable_get_symbol_fn (executable
, NULL
, "_fini_array",
470 "Find '_fini_array' function");
471 XHSA (hsa_fns
.hsa_executable_symbol_get_info_fn
472 (symbol
, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT
, &fini_array_kernel
),
473 "Extract '_fini_array' kernel object kernel object");
475 /* Locate the "main" function, and read the kernel's properties. */
476 XHSA (hsa_fns
.hsa_executable_get_symbol_fn (executable
, NULL
, "main",
478 "Find 'main' function");
479 XHSA (hsa_fns
.hsa_executable_symbol_get_info_fn
480 (symbol
, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT
, &main_kernel
),
481 "Extract 'main' kernel object");
482 XHSA (hsa_fns
.hsa_executable_symbol_get_info_fn
483 (symbol
, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE
,
484 &kernarg_segment_size
),
485 "Extract kernarg segment size");
486 XHSA (hsa_fns
.hsa_executable_symbol_get_info_fn
487 (symbol
, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE
,
488 &group_segment_size
),
489 "Extract group segment size");
490 XHSA (hsa_fns
.hsa_executable_symbol_get_info_fn
491 (symbol
, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE
,
492 &private_segment_size
),
493 "Extract private segment size");
495 /* Find main function in ELF, and calculate actual load offset. */
496 Elf64_Addr load_offset
;
497 XHSA (hsa_fns
.hsa_executable_symbol_get_info_fn
498 (symbol
, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS
,
500 "Extract 'main' symbol address");
501 for (int i
= 0; i
< image
->e_shnum
; i
++)
502 if (sections
[i
].sh_type
== SHT_SYMTAB
)
504 Elf64_Shdr
*strtab
= §ions
[sections
[i
].sh_link
];
505 char *strings
= (char *) image
+ strtab
->sh_offset
;
507 for (size_t offset
= 0;
508 offset
< sections
[i
].sh_size
;
509 offset
+= sections
[i
].sh_entsize
)
511 Elf64_Sym
*sym
= (Elf64_Sym
*) ((char *) image
512 + sections
[i
].sh_offset
+ offset
);
513 if (strcmp ("main", strings
+ sym
->st_name
) == 0)
515 load_offset
-= sym
->st_value
;
520 /* We only get here when main was not found.
521 This should never happen. */
522 fprintf (stderr
, "Error: main function not found.\n");
526 /* Find dynamic symbol table. */
527 Elf64_Shdr
*dynsym
= NULL
;
528 for (int i
= 0; i
< image
->e_shnum
; i
++)
529 if (sections
[i
].sh_type
== SHT_DYNSYM
)
531 dynsym
= §ions
[i
];
535 /* Fix up relocations. */
536 for (int i
= 0; i
< image
->e_shnum
; i
++)
538 if (sections
[i
].sh_type
== SHT_RELA
)
539 for (size_t offset
= 0;
540 offset
< sections
[i
].sh_size
;
541 offset
+= sections
[i
].sh_entsize
)
543 Elf64_Rela
*reloc
= (Elf64_Rela
*) ((char *) image
544 + sections
[i
].sh_offset
548 ? (Elf64_Sym
*) ((char *) image
550 + (dynsym
->sh_entsize
551 * ELF64_R_SYM (reloc
->r_info
))) : NULL
);
553 int64_t S
= (sym
? sym
->st_value
: 0);
554 int64_t P
= reloc
->r_offset
+ load_offset
;
555 int64_t A
= reloc
->r_addend
;
556 int64_t B
= load_offset
;
558 switch (ELF64_R_TYPE (reloc
->r_info
))
560 case R_AMDGPU_ABS32_LO
:
561 V
= (S
+ A
) & 0xFFFFFFFF;
564 case R_AMDGPU_ABS32_HI
:
578 LLD seems to emit REL64 where the the assembler has ABS64.
579 This is clearly wrong because it's not what the compiler
580 is expecting. Let's assume, for now, that it's a bug.
581 In any case, GCN kernels are always self contained and
582 therefore relative relocations will have been resolved
583 already, so this should be a safe workaround. */
584 V
= S
+ A
/* - P */ ;
591 /* TODO R_AMDGPU_GOTPCREL */
592 /* TODO R_AMDGPU_GOTPCREL32_LO */
593 /* TODO R_AMDGPU_GOTPCREL32_HI */
594 case R_AMDGPU_REL32_LO
:
595 V
= (S
+ A
- P
) & 0xFFFFFFFF;
598 case R_AMDGPU_REL32_HI
:
599 V
= (S
+ A
- P
) >> 32;
602 case R_AMDGPU_RELATIVE64
:
607 fprintf (stderr
, "Error: unsupported relocation type.\n");
610 XHSA (hsa_fns
.hsa_memory_copy_fn ((void *) P
, &V
, size
),
611 "Fix up relocation");
616 /* Allocate some device memory from the kernargs region.
617 The returned address will be 32-bit (with excess zeroed on 64-bit host),
618 and accessible via the same address on both host and target (via
619 __flat_scalar GCN address space). */
622 device_malloc (size_t size
, hsa_region_t region
)
625 XHSA (hsa_fns
.hsa_memory_allocate_fn (region
, size
, &result
),
626 "Allocate device memory");
630 /* These are the device pointers that will be transferred to the target.
631 The HSA Runtime points the kernargs register here.
632 They correspond to function signature:
633 int main (int argc, char *argv[], int *return_value)
634 The compiler expects this, for kernel functions, and will
635 automatically assign the exit value to *return_value. */
648 unsigned int next_output
;
661 unsigned int consumed
;
671 /* Print any console output from the kernel.
672 We print all entries from "consumed" to the next entry without a "written"
673 flag, or "next_output" is reached. The buffer is circular, but the
674 indices are absolute. It is assumed the kernel will stop writing data
675 if "next_output" wraps (becomes smaller than "consumed"). */
677 gomp_print_output (struct kernargs
*kernargs
, bool final
)
679 unsigned int limit
= (sizeof (kernargs
->output_data
.queue
)
680 / sizeof (kernargs
->output_data
.queue
[0]));
682 unsigned int from
= __atomic_load_n (&kernargs
->output_data
.consumed
,
684 unsigned int to
= kernargs
->output_data
.next_output
;
690 printf ("GCN print buffer overflowed.\n");
695 for (i
= from
; i
< to
; i
++)
697 struct printf_data
*data
= &kernargs
->output_data
.queue
[i
%limit
];
699 if (!data
->written
&& !final
)
705 printf ("%.128s%ld\n", data
->msg
, data
->ivalue
);
708 printf ("%.128s%f\n", data
->msg
, data
->dvalue
);
711 printf ("%.128s%.128s\n", data
->msg
, data
->text
);
714 printf ("%.128s%.128s", data
->msg
, data
->text
);
717 printf ("GCN print buffer error!\n");
722 __atomic_store_n (&kernargs
->output_data
.consumed
, i
+1,
728 /* Execute an already-loaded kernel on the device. */
731 run (uint64_t kernel
, void *kernargs
)
733 /* A "signal" is used to launch and monitor the kernel. */
735 XHSA (hsa_fns
.hsa_signal_create_fn (1, 0, NULL
, &signal
),
738 /* Configure for a single-worker kernel. */
739 uint64_t index
= hsa_fns
.hsa_queue_load_write_index_relaxed_fn (queue
);
740 const uint32_t queueMask
= queue
->size
- 1;
741 hsa_kernel_dispatch_packet_t
*dispatch_packet
=
742 &(((hsa_kernel_dispatch_packet_t
*) (queue
->base_address
))[index
&
744 dispatch_packet
->setup
|= 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS
;
745 dispatch_packet
->workgroup_size_x
= (uint16_t) 1;
746 dispatch_packet
->workgroup_size_y
= (uint16_t) 64;
747 dispatch_packet
->workgroup_size_z
= (uint16_t) 1;
748 dispatch_packet
->grid_size_x
= 1;
749 dispatch_packet
->grid_size_y
= 64;
750 dispatch_packet
->grid_size_z
= 1;
751 dispatch_packet
->completion_signal
= signal
;
752 dispatch_packet
->kernel_object
= kernel
;
753 dispatch_packet
->kernarg_address
= (void *) kernargs
;
754 dispatch_packet
->private_segment_size
= private_segment_size
;
755 dispatch_packet
->group_segment_size
= group_segment_size
;
758 header
|= HSA_FENCE_SCOPE_SYSTEM
<< HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE
;
759 header
|= HSA_FENCE_SCOPE_SYSTEM
<< HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE
;
760 header
|= HSA_PACKET_TYPE_KERNEL_DISPATCH
<< HSA_PACKET_HEADER_TYPE
;
762 __atomic_store_n ((uint32_t *) dispatch_packet
,
763 header
| (dispatch_packet
->setup
<< 16),
767 fprintf (stderr
, "Launch kernel\n");
769 hsa_fns
.hsa_queue_store_write_index_relaxed_fn (queue
, index
+ 1);
770 hsa_fns
.hsa_signal_store_relaxed_fn (queue
->doorbell_signal
, index
);
771 /* Kernel running ...... */
772 while (hsa_fns
.hsa_signal_wait_relaxed_fn (signal
, HSA_SIGNAL_CONDITION_LT
,
774 HSA_WAIT_STATE_ACTIVE
) != 0)
777 gomp_print_output (kernargs
, false);
780 gomp_print_output (kernargs
, true);
783 fprintf (stderr
, "Kernel exited\n");
785 XHSA (hsa_fns
.hsa_signal_destroy_fn (signal
),
790 main (int argc
, char *argv
[])
793 for (int i
= 1; i
< argc
; i
++)
795 if (!strcmp (argv
[i
], "--help"))
800 else if (!strcmp (argv
[i
], "--version"))
805 else if (!strcmp (argv
[i
], "--debug"))
807 else if (argv
[i
][0] == '-')
821 /* No kernel arguments were found. */
826 /* The remaining arguments are for the GCN kernel. */
827 int kernel_argc
= argc
- kernel_arg
;
828 char **kernel_argv
= &argv
[kernel_arg
];
831 load_image (kernel_argv
[0]);
833 /* Calculate size of function parameters + argv data. */
834 size_t args_size
= 0;
835 for (int i
= 0; i
< kernel_argc
; i
++)
836 args_size
+= strlen (kernel_argv
[i
]) + 1;
838 /* Allocate device memory for both function parameters and the argv
840 struct kernargs
*kernargs
= device_malloc (sizeof (*kernargs
),
844 int64_t argv_data
[kernel_argc
];
845 char strings
[args_size
];
846 } *args
= device_malloc (sizeof (struct argdata
), kernargs_region
);
848 size_t heap_size
= 10 * 1024 * 1024; /* 10MB. */
849 struct heap
*heap
= device_malloc (heap_size
, heap_region
);
850 XHSA (hsa_fns
.hsa_memory_assign_agent_fn (heap
, device
,
851 HSA_ACCESS_PERMISSION_RW
),
852 "Assign heap to device agent");
854 /* Write the data to the target. */
855 kernargs
->argc
= kernel_argc
;
856 kernargs
->argv
= (int64_t) args
->argv_data
;
857 kernargs
->out_ptr
= (int64_t) &kernargs
->output_data
;
858 kernargs
->output_data
.return_value
= 0xcafe0000; /* Default return value. */
859 kernargs
->output_data
.next_output
= 0;
860 for (unsigned i
= 0; i
< (sizeof (kernargs
->output_data
.queue
)
861 / sizeof (kernargs
->output_data
.queue
[0])); i
++)
862 kernargs
->output_data
.queue
[i
].written
= 0;
863 kernargs
->output_data
.consumed
= 0;
865 for (int i
= 0; i
< kernel_argc
; i
++)
867 size_t arg_len
= strlen (kernel_argv
[i
]) + 1;
868 args
->argv_data
[i
] = (int64_t) &args
->strings
[offset
];
869 memcpy (&args
->strings
[offset
], kernel_argv
[i
], arg_len
+ 1);
872 kernargs
->heap_ptr
= (int64_t) heap
;
873 hsa_fns
.hsa_memory_copy_fn (&heap
->size
, &heap_size
, sizeof (heap_size
));
875 /* Run constructors on the GPU. */
876 run (init_array_kernel
, kernargs
);
878 /* Run the kernel on the GPU. */
879 run (main_kernel
, kernargs
);
880 unsigned int return_value
=
881 (unsigned int) kernargs
->output_data
.return_value
;
883 /* Run destructors on the GPU. */
884 run (fini_array_kernel
, kernargs
);
886 unsigned int upper
= (return_value
& ~0xffff) >> 16;
889 printf ("Kernel exit value was never set\n");
892 else if (upper
== 0xffff)
895 ; /* Set by return from main. */
897 printf ("Possible kernel exit value corruption, 2 most significant bytes "
898 "aren't 0xffff, 0xcafe, or 0: 0x%x\n", return_value
);
902 unsigned int signal
= (return_value
>> 8) & 0xff;
903 if (signal
== SIGABRT
)
904 printf ("Kernel aborted\n");
905 else if (signal
!= 0)
906 printf ("Kernel received unkown signal\n");
910 printf ("Kernel exit value: %d\n", return_value
& 0xff);
912 /* Clean shut down. */
913 XHSA (hsa_fns
.hsa_memory_free_fn (kernargs
),
914 "Clean up device memory");
915 XHSA (hsa_fns
.hsa_executable_destroy_fn (executable
),
916 "Clean up GCN executable");
917 XHSA (hsa_fns
.hsa_queue_destroy_fn (queue
),
918 "Clean up device queue");
919 XHSA (hsa_fns
.hsa_shut_down_fn (),
920 "Shut down run-time");
922 return return_value
& 0xff;