1 /* Run a stand-alone AMD GCN kernel.
3 Copyright 2017 Mentor Graphics Corporation
4 Copyright 2018-2019 Free Software Foundation, Inc.
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 /* This program will run a compiled stand-alone GCN kernel on a GPU.
21 The kernel entry point's signature must use a standard main signature:
23 int main(int argc, char **argv)
37 /* These probably won't be in elf.h for a while. */
39 #define R_AMDGPU_NONE 0
40 #define R_AMDGPU_ABS32_LO 1 /* (S + A) & 0xFFFFFFFF */
41 #define R_AMDGPU_ABS32_HI 2 /* (S + A) >> 32 */
42 #define R_AMDGPU_ABS64 3 /* S + A */
43 #define R_AMDGPU_REL32 4 /* S + A - P */
44 #define R_AMDGPU_REL64 5 /* S + A - P */
45 #define R_AMDGPU_ABS32 6 /* S + A */
46 #define R_AMDGPU_GOTPCREL 7 /* G + GOT + A - P */
47 #define R_AMDGPU_GOTPCREL32_LO 8 /* (G + GOT + A - P) & 0xFFFFFFFF */
48 #define R_AMDGPU_GOTPCREL32_HI 9 /* (G + GOT + A - P) >> 32 */
49 #define R_AMDGPU_REL32_LO 10 /* (S + A - P) & 0xFFFFFFFF */
50 #define R_AMDGPU_REL32_HI 11 /* (S + A - P) >> 32 */
52 #define R_AMDGPU_RELATIVE64 13 /* B + A */
57 #ifndef HSA_RUNTIME_LIB
58 #define HSA_RUNTIME_LIB "libhsa-runtime64.so"
61 #ifndef VERSION_STRING
62 #define VERSION_STRING "(version unknown)"
67 hsa_agent_t device
= { 0 };
68 hsa_queue_t
*queue
= NULL
;
70 hsa_executable_t executable
= { 0 };
72 hsa_region_t kernargs_region
= { 0 };
73 uint32_t kernarg_segment_size
= 0;
74 uint32_t group_segment_size
= 0;
75 uint32_t private_segment_size
= 0;
78 usage (const char *progname
)
80 printf ("Usage: %s [options] kernel [kernel-args]\n\n"
84 " --debug\n", progname
);
88 version (const char *progname
)
90 printf ("%s " VERSION_STRING
"\n", progname
);
93 /* As an HSA runtime is dlopened, following structure defines the necessary
95 Code adapted from libgomp. */
97 struct hsa_runtime_fn_info
100 hsa_status_t (*hsa_status_string_fn
) (hsa_status_t status
,
101 const char **status_string
);
102 hsa_status_t (*hsa_agent_get_info_fn
) (hsa_agent_t agent
,
103 hsa_agent_info_t attribute
,
105 hsa_status_t (*hsa_init_fn
) (void);
106 hsa_status_t (*hsa_iterate_agents_fn
)
107 (hsa_status_t (*callback
) (hsa_agent_t agent
, void *data
), void *data
);
108 hsa_status_t (*hsa_region_get_info_fn
) (hsa_region_t region
,
109 hsa_region_info_t attribute
,
111 hsa_status_t (*hsa_queue_create_fn
)
112 (hsa_agent_t agent
, uint32_t size
, hsa_queue_type_t type
,
113 void (*callback
) (hsa_status_t status
, hsa_queue_t
*source
, void *data
),
114 void *data
, uint32_t private_segment_size
,
115 uint32_t group_segment_size
, hsa_queue_t
**queue
);
116 hsa_status_t (*hsa_agent_iterate_regions_fn
)
118 hsa_status_t (*callback
) (hsa_region_t region
, void *data
), void *data
);
119 hsa_status_t (*hsa_executable_destroy_fn
) (hsa_executable_t executable
);
120 hsa_status_t (*hsa_executable_create_fn
)
121 (hsa_profile_t profile
, hsa_executable_state_t executable_state
,
122 const char *options
, hsa_executable_t
*executable
);
123 hsa_status_t (*hsa_executable_global_variable_define_fn
)
124 (hsa_executable_t executable
, const char *variable_name
, void *address
);
125 hsa_status_t (*hsa_executable_load_code_object_fn
)
126 (hsa_executable_t executable
, hsa_agent_t agent
,
127 hsa_code_object_t code_object
, const char *options
);
128 hsa_status_t (*hsa_executable_freeze_fn
) (hsa_executable_t executable
,
129 const char *options
);
130 hsa_status_t (*hsa_signal_create_fn
) (hsa_signal_value_t initial_value
,
131 uint32_t num_consumers
,
132 const hsa_agent_t
*consumers
,
133 hsa_signal_t
*signal
);
134 hsa_status_t (*hsa_memory_allocate_fn
) (hsa_region_t region
, size_t size
,
136 hsa_status_t (*hsa_memory_copy_fn
) (void *dst
, const void *src
,
138 hsa_status_t (*hsa_memory_free_fn
) (void *ptr
);
139 hsa_status_t (*hsa_signal_destroy_fn
) (hsa_signal_t signal
);
140 hsa_status_t (*hsa_executable_get_symbol_fn
)
141 (hsa_executable_t executable
, const char *module_name
,
142 const char *symbol_name
, hsa_agent_t agent
, int32_t call_convention
,
143 hsa_executable_symbol_t
*symbol
);
144 hsa_status_t (*hsa_executable_symbol_get_info_fn
)
145 (hsa_executable_symbol_t executable_symbol
,
146 hsa_executable_symbol_info_t attribute
, void *value
);
147 void (*hsa_signal_store_relaxed_fn
) (hsa_signal_t signal
,
148 hsa_signal_value_t value
);
149 hsa_signal_value_t (*hsa_signal_wait_acquire_fn
)
150 (hsa_signal_t signal
, hsa_signal_condition_t condition
,
151 hsa_signal_value_t compare_value
, uint64_t timeout_hint
,
152 hsa_wait_state_t wait_state_hint
);
153 hsa_signal_value_t (*hsa_signal_wait_relaxed_fn
)
154 (hsa_signal_t signal
, hsa_signal_condition_t condition
,
155 hsa_signal_value_t compare_value
, uint64_t timeout_hint
,
156 hsa_wait_state_t wait_state_hint
);
157 hsa_status_t (*hsa_queue_destroy_fn
) (hsa_queue_t
*queue
);
158 hsa_status_t (*hsa_code_object_deserialize_fn
)
159 (void *serialized_code_object
, size_t serialized_code_object_size
,
160 const char *options
, hsa_code_object_t
*code_object
);
161 uint64_t (*hsa_queue_load_write_index_relaxed_fn
)
162 (const hsa_queue_t
*queue
);
163 void (*hsa_queue_store_write_index_relaxed_fn
)
164 (const hsa_queue_t
*queue
, uint64_t value
);
165 hsa_status_t (*hsa_shut_down_fn
) ();
168 /* HSA runtime functions that are initialized in init_hsa_context.
169 Code adapted from libgomp. */
171 static struct hsa_runtime_fn_info hsa_fns
;
173 #define DLSYM_FN(function) \
174 *(void**)(&hsa_fns.function##_fn) = dlsym (handle, #function); \
175 if (hsa_fns.function##_fn == NULL) \
179 init_hsa_runtime_functions (void)
181 void *handle
= dlopen (HSA_RUNTIME_LIB
, RTLD_LAZY
);
185 "The HSA runtime is required to run GCN kernels on hardware.\n"
186 "%s: File not found or could not be opened\n",
191 DLSYM_FN (hsa_status_string
)
192 DLSYM_FN (hsa_agent_get_info
)
194 DLSYM_FN (hsa_iterate_agents
)
195 DLSYM_FN (hsa_region_get_info
)
196 DLSYM_FN (hsa_queue_create
)
197 DLSYM_FN (hsa_agent_iterate_regions
)
198 DLSYM_FN (hsa_executable_destroy
)
199 DLSYM_FN (hsa_executable_create
)
200 DLSYM_FN (hsa_executable_global_variable_define
)
201 DLSYM_FN (hsa_executable_load_code_object
)
202 DLSYM_FN (hsa_executable_freeze
)
203 DLSYM_FN (hsa_signal_create
)
204 DLSYM_FN (hsa_memory_allocate
)
205 DLSYM_FN (hsa_memory_copy
)
206 DLSYM_FN (hsa_memory_free
)
207 DLSYM_FN (hsa_signal_destroy
)
208 DLSYM_FN (hsa_executable_get_symbol
)
209 DLSYM_FN (hsa_executable_symbol_get_info
)
210 DLSYM_FN (hsa_signal_wait_acquire
)
211 DLSYM_FN (hsa_signal_wait_relaxed
)
212 DLSYM_FN (hsa_signal_store_relaxed
)
213 DLSYM_FN (hsa_queue_destroy
)
214 DLSYM_FN (hsa_code_object_deserialize
)
215 DLSYM_FN (hsa_queue_load_write_index_relaxed
)
216 DLSYM_FN (hsa_queue_store_write_index_relaxed
)
217 DLSYM_FN (hsa_shut_down
)
222 fprintf (stderr
, "Failed to find HSA functions in " HSA_RUNTIME_LIB
"\n");
228 /* Report a fatal error STR together with the HSA error corresponding to
229 STATUS and terminate execution of the current process. */
232 hsa_fatal (const char *str
, hsa_status_t status
)
234 const char *hsa_error_msg
;
235 hsa_fns
.hsa_status_string_fn (status
, &hsa_error_msg
);
236 fprintf (stderr
, "%s: FAILED\nHSA Runtime message: %s\n", str
,
241 /* Helper macros to ensure we check the return values from the HSA Runtime.
242 These just keep the rest of the code a bit cleaner. */
244 #define XHSA_CMP(FN, CMP, MSG) \
246 hsa_status_t status = (FN); \
248 hsa_fatal ((MSG), status); \
250 fprintf (stderr, "%s: OK\n", (MSG)); \
252 #define XHSA(FN, MSG) XHSA_CMP(FN, status == HSA_STATUS_SUCCESS, MSG)
254 /* Callback of hsa_iterate_agents.
255 Called once for each available device, and returns "break" when a
256 suitable one has been found. */
259 get_gpu_agent (hsa_agent_t agent
, void *data
__attribute__ ((unused
)))
261 hsa_device_type_t device_type
;
262 XHSA (hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_DEVICE
,
266 /* Select only GPU devices. */
267 /* TODO: support selecting from multiple GPUs. */
268 if (HSA_DEVICE_TYPE_GPU
== device_type
)
271 return HSA_STATUS_INFO_BREAK
;
274 /* The device was not suitable. */
275 return HSA_STATUS_SUCCESS
;
278 /* Callback of hsa_iterate_regions.
279 Called once for each available memory region, and returns "break" when a
280 suitable one has been found. */
283 get_kernarg_region (hsa_region_t region
, void *data
__attribute__ ((unused
)))
285 /* Reject non-global regions. */
286 hsa_region_segment_t segment
;
287 hsa_fns
.hsa_region_get_info_fn (region
, HSA_REGION_INFO_SEGMENT
, &segment
);
288 if (HSA_REGION_SEGMENT_GLOBAL
!= segment
)
289 return HSA_STATUS_SUCCESS
;
291 /* Find a region with the KERNARG flag set. */
292 hsa_region_global_flag_t flags
;
293 hsa_fns
.hsa_region_get_info_fn (region
, HSA_REGION_INFO_GLOBAL_FLAGS
,
295 if (flags
& HSA_REGION_GLOBAL_FLAG_KERNARG
)
297 kernargs_region
= region
;
298 return HSA_STATUS_INFO_BREAK
;
301 /* The region was not suitable. */
302 return HSA_STATUS_SUCCESS
;
305 /* Initialize the HSA Runtime library and GPU device. */
310 /* Load the shared library and find the API functions. */
311 init_hsa_runtime_functions ();
313 /* Initialize the HSA Runtime. */
314 XHSA (hsa_fns
.hsa_init_fn (),
315 "Initialize run-time");
317 /* Select a suitable device.
318 The call-back function, get_gpu_agent, does the selection. */
319 XHSA_CMP (hsa_fns
.hsa_iterate_agents_fn (get_gpu_agent
, NULL
),
320 status
== HSA_STATUS_SUCCESS
|| status
== HSA_STATUS_INFO_BREAK
,
323 /* Initialize the queue used for launching kernels. */
324 uint32_t queue_size
= 0;
325 XHSA (hsa_fns
.hsa_agent_get_info_fn (device
, HSA_AGENT_INFO_QUEUE_MAX_SIZE
,
327 "Find max queue size");
328 XHSA (hsa_fns
.hsa_queue_create_fn (device
, queue_size
,
329 HSA_QUEUE_TYPE_SINGLE
, NULL
,
330 NULL
, UINT32_MAX
, UINT32_MAX
, &queue
),
331 "Set up a device queue");
333 /* Select a memory region for the kernel arguments.
334 The call-back function, get_kernarg_region, does the selection. */
335 XHSA_CMP (hsa_fns
.hsa_agent_iterate_regions_fn (device
, get_kernarg_region
,
337 status
== HSA_STATUS_SUCCESS
|| status
== HSA_STATUS_INFO_BREAK
,
338 "Locate kernargs memory");
342 /* Read a whole input file.
343 Code copied from mkoffload. */
346 read_file (const char *filename
, size_t *plen
)
348 size_t alloc
= 16384;
352 FILE *stream
= fopen (filename
, "rb");
359 if (!fseek (stream
, 0, SEEK_END
))
361 /* Get the file size. */
362 long s
= ftell (stream
);
365 fseek (stream
, 0, SEEK_SET
);
367 buffer
= malloc (alloc
);
371 size_t n
= fread (buffer
+ base
, 1, alloc
- base
- 1, stream
);
376 if (base
+ 1 == alloc
)
379 buffer
= realloc (buffer
, alloc
);
390 /* Read a HSA Code Object (HSACO) from file, and load it into the device. */
393 load_image (const char *filename
)
396 Elf64_Ehdr
*image
= (void *) read_file (filename
, &image_size
);
398 /* An "executable" consists of one or more code objects. */
399 XHSA (hsa_fns
.hsa_executable_create_fn (HSA_PROFILE_FULL
,
400 HSA_EXECUTABLE_STATE_UNFROZEN
, "",
402 "Initialize GCN executable");
404 /* Hide relocations from the HSA runtime loader.
405 Keep a copy of the unmodified section headers to use later. */
406 Elf64_Shdr
*image_sections
=
407 (Elf64_Shdr
*) ((char *) image
+ image
->e_shoff
);
408 Elf64_Shdr
*sections
= malloc (sizeof (Elf64_Shdr
) * image
->e_shnum
);
409 memcpy (sections
, image_sections
, sizeof (Elf64_Shdr
) * image
->e_shnum
);
410 for (int i
= image
->e_shnum
- 1; i
>= 0; i
--)
412 if (image_sections
[i
].sh_type
== SHT_RELA
413 || image_sections
[i
].sh_type
== SHT_REL
)
414 /* Change section type to something harmless. */
415 image_sections
[i
].sh_type
= SHT_NOTE
;
418 /* Add the HSACO to the executable. */
419 hsa_code_object_t co
= { 0 };
420 XHSA (hsa_fns
.hsa_code_object_deserialize_fn (image
, image_size
, NULL
, &co
),
421 "Deserialize GCN code object");
422 XHSA (hsa_fns
.hsa_executable_load_code_object_fn (executable
, device
, co
,
424 "Load GCN code object");
426 /* We're done modifying he executable. */
427 XHSA (hsa_fns
.hsa_executable_freeze_fn (executable
, ""),
428 "Freeze GCN executable");
430 /* Locate the "main" function, and read the kernel's properties. */
431 hsa_executable_symbol_t symbol
;
432 XHSA (hsa_fns
.hsa_executable_get_symbol_fn (executable
, NULL
, "main",
434 "Find 'main' function");
435 XHSA (hsa_fns
.hsa_executable_symbol_get_info_fn
436 (symbol
, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT
, &kernel
),
437 "Extract kernel object");
438 XHSA (hsa_fns
.hsa_executable_symbol_get_info_fn
439 (symbol
, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE
,
440 &kernarg_segment_size
),
441 "Extract kernarg segment size");
442 XHSA (hsa_fns
.hsa_executable_symbol_get_info_fn
443 (symbol
, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE
,
444 &group_segment_size
),
445 "Extract group segment size");
446 XHSA (hsa_fns
.hsa_executable_symbol_get_info_fn
447 (symbol
, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE
,
448 &private_segment_size
),
449 "Extract private segment size");
451 /* Find main function in ELF, and calculate actual load offset. */
452 Elf64_Addr load_offset
;
453 XHSA (hsa_fns
.hsa_executable_symbol_get_info_fn
454 (symbol
, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS
,
456 "Extract 'main' symbol address");
457 for (int i
= 0; i
< image
->e_shnum
; i
++)
458 if (sections
[i
].sh_type
== SHT_SYMTAB
)
460 Elf64_Shdr
*strtab
= §ions
[sections
[i
].sh_link
];
461 char *strings
= (char *) image
+ strtab
->sh_offset
;
463 for (size_t offset
= 0;
464 offset
< sections
[i
].sh_size
;
465 offset
+= sections
[i
].sh_entsize
)
467 Elf64_Sym
*sym
= (Elf64_Sym
*) ((char *) image
468 + sections
[i
].sh_offset
+ offset
);
469 if (strcmp ("main", strings
+ sym
->st_name
) == 0)
471 load_offset
-= sym
->st_value
;
476 /* We only get here when main was not found.
477 This should never happen. */
478 fprintf (stderr
, "Error: main function not found.\n");
482 /* Find dynamic symbol table. */
483 Elf64_Shdr
*dynsym
= NULL
;
484 for (int i
= 0; i
< image
->e_shnum
; i
++)
485 if (sections
[i
].sh_type
== SHT_DYNSYM
)
487 dynsym
= §ions
[i
];
491 /* Fix up relocations. */
492 for (int i
= 0; i
< image
->e_shnum
; i
++)
494 if (sections
[i
].sh_type
== SHT_RELA
)
495 for (size_t offset
= 0;
496 offset
< sections
[i
].sh_size
;
497 offset
+= sections
[i
].sh_entsize
)
499 Elf64_Rela
*reloc
= (Elf64_Rela
*) ((char *) image
500 + sections
[i
].sh_offset
504 ? (Elf64_Sym
*) ((char *) image
506 + (dynsym
->sh_entsize
507 * ELF64_R_SYM (reloc
->r_info
))) : NULL
);
509 int64_t S
= (sym
? sym
->st_value
: 0);
510 int64_t P
= reloc
->r_offset
+ load_offset
;
511 int64_t A
= reloc
->r_addend
;
512 int64_t B
= load_offset
;
514 switch (ELF64_R_TYPE (reloc
->r_info
))
516 case R_AMDGPU_ABS32_LO
:
517 V
= (S
+ A
) & 0xFFFFFFFF;
520 case R_AMDGPU_ABS32_HI
:
534 LLD seems to emit REL64 where the the assembler has ABS64.
535 This is clearly wrong because it's not what the compiler
536 is expecting. Let's assume, for now, that it's a bug.
537 In any case, GCN kernels are always self contained and
538 therefore relative relocations will have been resolved
539 already, so this should be a safe workaround. */
540 V
= S
+ A
/* - P */ ;
547 /* TODO R_AMDGPU_GOTPCREL */
548 /* TODO R_AMDGPU_GOTPCREL32_LO */
549 /* TODO R_AMDGPU_GOTPCREL32_HI */
550 case R_AMDGPU_REL32_LO
:
551 V
= (S
+ A
- P
) & 0xFFFFFFFF;
554 case R_AMDGPU_REL32_HI
:
555 V
= (S
+ A
- P
) >> 32;
558 case R_AMDGPU_RELATIVE64
:
563 fprintf (stderr
, "Error: unsupported relocation type.\n");
566 XHSA (hsa_fns
.hsa_memory_copy_fn ((void *) P
, &V
, size
),
567 "Fix up relocation");
572 /* Allocate some device memory from the kernargs region.
573 The returned address will be 32-bit (with excess zeroed on 64-bit host),
574 and accessible via the same address on both host and target (via
575 __flat_scalar GCN address space). */
578 device_malloc (size_t size
)
581 XHSA (hsa_fns
.hsa_memory_allocate_fn (kernargs_region
, size
, &result
),
582 "Allocate device memory");
586 /* These are the device pointers that will be transferred to the target.
587 The HSA Runtime points the kernargs register here.
588 They correspond to function signature:
589 int main (int argc, char *argv[], int *return_value)
590 The compiler expects this, for kernel functions, and will
591 automatically assign the exit value to *return_value. */
604 unsigned int next_output
;
617 unsigned int consumed
;
627 /* Print any console output from the kernel.
628 We print all entries from "consumed" to the next entry without a "written"
629 flag, or "next_output" is reached. The buffer is circular, but the
630 indices are absolute. It is assumed the kernel will stop writing data
631 if "next_output" wraps (becomes smaller than "consumed"). */
633 gomp_print_output (struct kernargs
*kernargs
, bool final
)
635 unsigned int limit
= (sizeof (kernargs
->output_data
.queue
)
636 / sizeof (kernargs
->output_data
.queue
[0]));
638 unsigned int from
= __atomic_load_n (&kernargs
->output_data
.consumed
,
640 unsigned int to
= kernargs
->output_data
.next_output
;
646 printf ("GCN print buffer overflowed.\n");
651 for (i
= from
; i
< to
; i
++)
653 struct printf_data
*data
= &kernargs
->output_data
.queue
[i
%limit
];
655 if (!data
->written
&& !final
)
661 printf ("%.128s%ld\n", data
->msg
, data
->ivalue
);
664 printf ("%.128s%f\n", data
->msg
, data
->dvalue
);
667 printf ("%.128s%.128s\n", data
->msg
, data
->text
);
670 printf ("%.128s%.128s", data
->msg
, data
->text
);
673 printf ("GCN print buffer error!\n");
678 __atomic_store_n (&kernargs
->output_data
.consumed
, i
+1,
684 /* Execute an already-loaded kernel on the device. */
689 /* A "signal" is used to launch and monitor the kernel. */
691 XHSA (hsa_fns
.hsa_signal_create_fn (1, 0, NULL
, &signal
),
694 /* Configure for a single-worker kernel. */
695 uint64_t index
= hsa_fns
.hsa_queue_load_write_index_relaxed_fn (queue
);
696 const uint32_t queueMask
= queue
->size
- 1;
697 hsa_kernel_dispatch_packet_t
*dispatch_packet
=
698 &(((hsa_kernel_dispatch_packet_t
*) (queue
->base_address
))[index
&
700 dispatch_packet
->setup
|= 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS
;
701 dispatch_packet
->workgroup_size_x
= (uint16_t) 1;
702 dispatch_packet
->workgroup_size_y
= (uint16_t) 64;
703 dispatch_packet
->workgroup_size_z
= (uint16_t) 1;
704 dispatch_packet
->grid_size_x
= 1;
705 dispatch_packet
->grid_size_y
= 64;
706 dispatch_packet
->grid_size_z
= 1;
707 dispatch_packet
->completion_signal
= signal
;
708 dispatch_packet
->kernel_object
= kernel
;
709 dispatch_packet
->kernarg_address
= (void *) kernargs
;
710 dispatch_packet
->private_segment_size
= private_segment_size
;
711 dispatch_packet
->group_segment_size
= group_segment_size
;
714 header
|= HSA_FENCE_SCOPE_SYSTEM
<< HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE
;
715 header
|= HSA_FENCE_SCOPE_SYSTEM
<< HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE
;
716 header
|= HSA_PACKET_TYPE_KERNEL_DISPATCH
<< HSA_PACKET_HEADER_TYPE
;
718 __atomic_store_n ((uint32_t *) dispatch_packet
,
719 header
| (dispatch_packet
->setup
<< 16),
723 fprintf (stderr
, "Launch kernel\n");
725 hsa_fns
.hsa_queue_store_write_index_relaxed_fn (queue
, index
+ 1);
726 hsa_fns
.hsa_signal_store_relaxed_fn (queue
->doorbell_signal
, index
);
727 /* Kernel running ...... */
728 while (hsa_fns
.hsa_signal_wait_relaxed_fn (signal
, HSA_SIGNAL_CONDITION_LT
,
730 HSA_WAIT_STATE_ACTIVE
) != 0)
733 gomp_print_output (kernargs
, false);
736 gomp_print_output (kernargs
, true);
739 fprintf (stderr
, "Kernel exited\n");
741 XHSA (hsa_fns
.hsa_signal_destroy_fn (signal
),
746 main (int argc
, char *argv
[])
749 for (int i
= 1; i
< argc
; i
++)
751 if (!strcmp (argv
[i
], "--help"))
756 else if (!strcmp (argv
[i
], "--version"))
761 else if (!strcmp (argv
[i
], "--debug"))
763 else if (argv
[i
][0] == '-')
777 /* No kernel arguments were found. */
782 /* The remaining arguments are for the GCN kernel. */
783 int kernel_argc
= argc
- kernel_arg
;
784 char **kernel_argv
= &argv
[kernel_arg
];
787 load_image (kernel_argv
[0]);
789 /* Calculate size of function parameters + argv data. */
790 size_t args_size
= 0;
791 for (int i
= 0; i
< kernel_argc
; i
++)
792 args_size
+= strlen (kernel_argv
[i
]) + 1;
794 /* Allocate device memory for both function parameters and the argv
796 size_t heap_size
= 10 * 1024 * 1024; /* 10MB. */
797 struct kernargs
*kernargs
= device_malloc (sizeof (*kernargs
) + heap_size
);
800 int64_t argv_data
[kernel_argc
];
801 char strings
[args_size
];
802 } *args
= device_malloc (sizeof (struct argdata
));
804 /* Write the data to the target. */
805 kernargs
->argc
= kernel_argc
;
806 kernargs
->argv
= (int64_t) args
->argv_data
;
807 kernargs
->out_ptr
= (int64_t) &kernargs
->output_data
;
808 kernargs
->output_data
.return_value
= 0xcafe0000; /* Default return value. */
809 kernargs
->output_data
.next_output
= 0;
810 for (unsigned i
= 0; i
< (sizeof (kernargs
->output_data
.queue
)
811 / sizeof (kernargs
->output_data
.queue
[0])); i
++)
812 kernargs
->output_data
.queue
[i
].written
= 0;
813 kernargs
->output_data
.consumed
= 0;
815 for (int i
= 0; i
< kernel_argc
; i
++)
817 size_t arg_len
= strlen (kernel_argv
[i
]) + 1;
818 args
->argv_data
[i
] = (int64_t) &args
->strings
[offset
];
819 memcpy (&args
->strings
[offset
], kernel_argv
[i
], arg_len
+ 1);
822 kernargs
->heap_ptr
= (int64_t) &kernargs
->heap
;
823 kernargs
->heap
.size
= heap_size
;
825 /* Run the kernel on the GPU. */
827 unsigned int return_value
=
828 (unsigned int) kernargs
->output_data
.return_value
;
830 unsigned int upper
= (return_value
& ~0xffff) >> 16;
832 printf ("Kernel exit value was never set\n");
833 else if (upper
== 0xffff)
836 ; /* Set by return from main. */
838 printf ("Possible kernel exit value corruption, 2 most significant bytes "
839 "aren't 0xffff, 0xcafe, or 0: 0x%x\n", return_value
);
843 unsigned int signal
= (return_value
>> 8) & 0xff;
844 if (signal
== SIGABRT
)
845 printf ("Kernel aborted\n");
846 else if (signal
!= 0)
847 printf ("Kernel received unkown signal\n");
851 printf ("Kernel exit value: %d\n", return_value
& 0xff);
853 /* Clean shut down. */
854 XHSA (hsa_fns
.hsa_memory_free_fn (kernargs
),
855 "Clean up device memory");
856 XHSA (hsa_fns
.hsa_executable_destroy_fn (executable
),
857 "Clean up GCN executable");
858 XHSA (hsa_fns
.hsa_queue_destroy_fn (queue
),
859 "Clean up device queue");
860 XHSA (hsa_fns
.hsa_shut_down_fn (),
861 "Shut down run-time");
863 return return_value
& 0xff;