1 /* Run a stand-alone AMD GCN kernel.
3 Copyright 2017 Mentor Graphics Corporation
4 Copyright 2018-2019 Free Software Foundation, Inc.
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 /* This program will run a compiled stand-alone GCN kernel on a GPU.
21 The kernel entry point's signature must use a standard main signature:
23 int main(int argc, char **argv)
37 /* These probably won't be in elf.h for a while. */
39 #define R_AMDGPU_NONE 0
40 #define R_AMDGPU_ABS32_LO 1 /* (S + A) & 0xFFFFFFFF */
41 #define R_AMDGPU_ABS32_HI 2 /* (S + A) >> 32 */
42 #define R_AMDGPU_ABS64 3 /* S + A */
43 #define R_AMDGPU_REL32 4 /* S + A - P */
44 #define R_AMDGPU_REL64 5 /* S + A - P */
45 #define R_AMDGPU_ABS32 6 /* S + A */
46 #define R_AMDGPU_GOTPCREL 7 /* G + GOT + A - P */
47 #define R_AMDGPU_GOTPCREL32_LO 8 /* (G + GOT + A - P) & 0xFFFFFFFF */
48 #define R_AMDGPU_GOTPCREL32_HI 9 /* (G + GOT + A - P) >> 32 */
49 #define R_AMDGPU_REL32_LO 10 /* (S + A - P) & 0xFFFFFFFF */
50 #define R_AMDGPU_REL32_HI 11 /* (S + A - P) >> 32 */
52 #define R_AMDGPU_RELATIVE64 13 /* B + A */
57 #ifndef HSA_RUNTIME_LIB
58 #define HSA_RUNTIME_LIB "libhsa-runtime64.so"
61 #ifndef VERSION_STRING
62 #define VERSION_STRING "(version unknown)"
67 hsa_agent_t device
= { 0 };
68 hsa_queue_t
*queue
= NULL
;
69 uint64_t init_array_kernel
= 0;
70 uint64_t fini_array_kernel
= 0;
71 uint64_t main_kernel
= 0;
72 hsa_executable_t executable
= { 0 };
74 hsa_region_t kernargs_region
= { 0 };
75 uint32_t kernarg_segment_size
= 0;
76 uint32_t group_segment_size
= 0;
77 uint32_t private_segment_size
= 0;
80 usage (const char *progname
)
82 printf ("Usage: %s [options] kernel [kernel-args]\n\n"
86 " --debug\n", progname
);
90 version (const char *progname
)
92 printf ("%s " VERSION_STRING
"\n", progname
);
95 /* As an HSA runtime is dlopened, following structure defines the necessary
97 Code adapted from libgomp. */
99 struct hsa_runtime_fn_info
102 hsa_status_t (*hsa_status_string_fn
) (hsa_status_t status
,
103 const char **status_string
);
104 hsa_status_t (*hsa_agent_get_info_fn
) (hsa_agent_t agent
,
105 hsa_agent_info_t attribute
,
107 hsa_status_t (*hsa_init_fn
) (void);
108 hsa_status_t (*hsa_iterate_agents_fn
)
109 (hsa_status_t (*callback
) (hsa_agent_t agent
, void *data
), void *data
);
110 hsa_status_t (*hsa_region_get_info_fn
) (hsa_region_t region
,
111 hsa_region_info_t attribute
,
113 hsa_status_t (*hsa_queue_create_fn
)
114 (hsa_agent_t agent
, uint32_t size
, hsa_queue_type_t type
,
115 void (*callback
) (hsa_status_t status
, hsa_queue_t
*source
, void *data
),
116 void *data
, uint32_t private_segment_size
,
117 uint32_t group_segment_size
, hsa_queue_t
**queue
);
118 hsa_status_t (*hsa_agent_iterate_regions_fn
)
120 hsa_status_t (*callback
) (hsa_region_t region
, void *data
), void *data
);
121 hsa_status_t (*hsa_executable_destroy_fn
) (hsa_executable_t executable
);
122 hsa_status_t (*hsa_executable_create_fn
)
123 (hsa_profile_t profile
, hsa_executable_state_t executable_state
,
124 const char *options
, hsa_executable_t
*executable
);
125 hsa_status_t (*hsa_executable_global_variable_define_fn
)
126 (hsa_executable_t executable
, const char *variable_name
, void *address
);
127 hsa_status_t (*hsa_executable_load_code_object_fn
)
128 (hsa_executable_t executable
, hsa_agent_t agent
,
129 hsa_code_object_t code_object
, const char *options
);
130 hsa_status_t (*hsa_executable_freeze_fn
) (hsa_executable_t executable
,
131 const char *options
);
132 hsa_status_t (*hsa_signal_create_fn
) (hsa_signal_value_t initial_value
,
133 uint32_t num_consumers
,
134 const hsa_agent_t
*consumers
,
135 hsa_signal_t
*signal
);
136 hsa_status_t (*hsa_memory_allocate_fn
) (hsa_region_t region
, size_t size
,
138 hsa_status_t (*hsa_memory_copy_fn
) (void *dst
, const void *src
,
140 hsa_status_t (*hsa_memory_free_fn
) (void *ptr
);
141 hsa_status_t (*hsa_signal_destroy_fn
) (hsa_signal_t signal
);
142 hsa_status_t (*hsa_executable_get_symbol_fn
)
143 (hsa_executable_t executable
, const char *module_name
,
144 const char *symbol_name
, hsa_agent_t agent
, int32_t call_convention
,
145 hsa_executable_symbol_t
*symbol
);
146 hsa_status_t (*hsa_executable_symbol_get_info_fn
)
147 (hsa_executable_symbol_t executable_symbol
,
148 hsa_executable_symbol_info_t attribute
, void *value
);
149 void (*hsa_signal_store_relaxed_fn
) (hsa_signal_t signal
,
150 hsa_signal_value_t value
);
151 hsa_signal_value_t (*hsa_signal_wait_acquire_fn
)
152 (hsa_signal_t signal
, hsa_signal_condition_t condition
,
153 hsa_signal_value_t compare_value
, uint64_t timeout_hint
,
154 hsa_wait_state_t wait_state_hint
);
155 hsa_signal_value_t (*hsa_signal_wait_relaxed_fn
)
156 (hsa_signal_t signal
, hsa_signal_condition_t condition
,
157 hsa_signal_value_t compare_value
, uint64_t timeout_hint
,
158 hsa_wait_state_t wait_state_hint
);
159 hsa_status_t (*hsa_queue_destroy_fn
) (hsa_queue_t
*queue
);
160 hsa_status_t (*hsa_code_object_deserialize_fn
)
161 (void *serialized_code_object
, size_t serialized_code_object_size
,
162 const char *options
, hsa_code_object_t
*code_object
);
163 uint64_t (*hsa_queue_load_write_index_relaxed_fn
)
164 (const hsa_queue_t
*queue
);
165 void (*hsa_queue_store_write_index_relaxed_fn
)
166 (const hsa_queue_t
*queue
, uint64_t value
);
167 hsa_status_t (*hsa_shut_down_fn
) ();
170 /* HSA runtime functions that are initialized in init_hsa_context.
171 Code adapted from libgomp. */
173 static struct hsa_runtime_fn_info hsa_fns
;
175 #define DLSYM_FN(function) \
176 *(void**)(&hsa_fns.function##_fn) = dlsym (handle, #function); \
177 if (hsa_fns.function##_fn == NULL) \
181 init_hsa_runtime_functions (void)
183 void *handle
= dlopen (HSA_RUNTIME_LIB
, RTLD_LAZY
);
187 "The HSA runtime is required to run GCN kernels on hardware.\n"
188 "%s: File not found or could not be opened\n",
193 DLSYM_FN (hsa_status_string
)
194 DLSYM_FN (hsa_agent_get_info
)
196 DLSYM_FN (hsa_iterate_agents
)
197 DLSYM_FN (hsa_region_get_info
)
198 DLSYM_FN (hsa_queue_create
)
199 DLSYM_FN (hsa_agent_iterate_regions
)
200 DLSYM_FN (hsa_executable_destroy
)
201 DLSYM_FN (hsa_executable_create
)
202 DLSYM_FN (hsa_executable_global_variable_define
)
203 DLSYM_FN (hsa_executable_load_code_object
)
204 DLSYM_FN (hsa_executable_freeze
)
205 DLSYM_FN (hsa_signal_create
)
206 DLSYM_FN (hsa_memory_allocate
)
207 DLSYM_FN (hsa_memory_copy
)
208 DLSYM_FN (hsa_memory_free
)
209 DLSYM_FN (hsa_signal_destroy
)
210 DLSYM_FN (hsa_executable_get_symbol
)
211 DLSYM_FN (hsa_executable_symbol_get_info
)
212 DLSYM_FN (hsa_signal_wait_acquire
)
213 DLSYM_FN (hsa_signal_wait_relaxed
)
214 DLSYM_FN (hsa_signal_store_relaxed
)
215 DLSYM_FN (hsa_queue_destroy
)
216 DLSYM_FN (hsa_code_object_deserialize
)
217 DLSYM_FN (hsa_queue_load_write_index_relaxed
)
218 DLSYM_FN (hsa_queue_store_write_index_relaxed
)
219 DLSYM_FN (hsa_shut_down
)
224 fprintf (stderr
, "Failed to find HSA functions in " HSA_RUNTIME_LIB
"\n");
230 /* Report a fatal error STR together with the HSA error corresponding to
231 STATUS and terminate execution of the current process. */
234 hsa_fatal (const char *str
, hsa_status_t status
)
236 const char *hsa_error_msg
;
237 hsa_fns
.hsa_status_string_fn (status
, &hsa_error_msg
);
238 fprintf (stderr
, "%s: FAILED\nHSA Runtime message: %s\n", str
,
243 /* Helper macros to ensure we check the return values from the HSA Runtime.
244 These just keep the rest of the code a bit cleaner. */
246 #define XHSA_CMP(FN, CMP, MSG) \
248 hsa_status_t status = (FN); \
250 hsa_fatal ((MSG), status); \
252 fprintf (stderr, "%s: OK\n", (MSG)); \
254 #define XHSA(FN, MSG) XHSA_CMP(FN, status == HSA_STATUS_SUCCESS, MSG)
256 /* Callback of hsa_iterate_agents.
257 Called once for each available device, and returns "break" when a
258 suitable one has been found. */
261 get_gpu_agent (hsa_agent_t agent
, void *data
__attribute__ ((unused
)))
263 hsa_device_type_t device_type
;
264 XHSA (hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_DEVICE
,
268 /* Select only GPU devices. */
269 /* TODO: support selecting from multiple GPUs. */
270 if (HSA_DEVICE_TYPE_GPU
== device_type
)
273 return HSA_STATUS_INFO_BREAK
;
276 /* The device was not suitable. */
277 return HSA_STATUS_SUCCESS
;
280 /* Callback of hsa_iterate_regions.
281 Called once for each available memory region, and returns "break" when a
282 suitable one has been found. */
285 get_kernarg_region (hsa_region_t region
, void *data
__attribute__ ((unused
)))
287 /* Reject non-global regions. */
288 hsa_region_segment_t segment
;
289 hsa_fns
.hsa_region_get_info_fn (region
, HSA_REGION_INFO_SEGMENT
, &segment
);
290 if (HSA_REGION_SEGMENT_GLOBAL
!= segment
)
291 return HSA_STATUS_SUCCESS
;
293 /* Find a region with the KERNARG flag set. */
294 hsa_region_global_flag_t flags
;
295 hsa_fns
.hsa_region_get_info_fn (region
, HSA_REGION_INFO_GLOBAL_FLAGS
,
297 if (flags
& HSA_REGION_GLOBAL_FLAG_KERNARG
)
299 kernargs_region
= region
;
300 return HSA_STATUS_INFO_BREAK
;
303 /* The region was not suitable. */
304 return HSA_STATUS_SUCCESS
;
307 /* Initialize the HSA Runtime library and GPU device. */
312 /* Load the shared library and find the API functions. */
313 init_hsa_runtime_functions ();
315 /* Initialize the HSA Runtime. */
316 XHSA (hsa_fns
.hsa_init_fn (),
317 "Initialize run-time");
319 /* Select a suitable device.
320 The call-back function, get_gpu_agent, does the selection. */
321 XHSA_CMP (hsa_fns
.hsa_iterate_agents_fn (get_gpu_agent
, NULL
),
322 status
== HSA_STATUS_SUCCESS
|| status
== HSA_STATUS_INFO_BREAK
,
325 /* Initialize the queue used for launching kernels. */
326 uint32_t queue_size
= 0;
327 XHSA (hsa_fns
.hsa_agent_get_info_fn (device
, HSA_AGENT_INFO_QUEUE_MAX_SIZE
,
329 "Find max queue size");
330 XHSA (hsa_fns
.hsa_queue_create_fn (device
, queue_size
,
331 HSA_QUEUE_TYPE_SINGLE
, NULL
,
332 NULL
, UINT32_MAX
, UINT32_MAX
, &queue
),
333 "Set up a device queue");
335 /* Select a memory region for the kernel arguments.
336 The call-back function, get_kernarg_region, does the selection. */
337 XHSA_CMP (hsa_fns
.hsa_agent_iterate_regions_fn (device
, get_kernarg_region
,
339 status
== HSA_STATUS_SUCCESS
|| status
== HSA_STATUS_INFO_BREAK
,
340 "Locate kernargs memory");
344 /* Read a whole input file.
345 Code copied from mkoffload. */
348 read_file (const char *filename
, size_t *plen
)
350 size_t alloc
= 16384;
354 FILE *stream
= fopen (filename
, "rb");
361 if (!fseek (stream
, 0, SEEK_END
))
363 /* Get the file size. */
364 long s
= ftell (stream
);
367 fseek (stream
, 0, SEEK_SET
);
369 buffer
= malloc (alloc
);
373 size_t n
= fread (buffer
+ base
, 1, alloc
- base
- 1, stream
);
378 if (base
+ 1 == alloc
)
381 buffer
= realloc (buffer
, alloc
);
392 /* Read a HSA Code Object (HSACO) from file, and load it into the device. */
395 load_image (const char *filename
)
398 Elf64_Ehdr
*image
= (void *) read_file (filename
, &image_size
);
400 /* An "executable" consists of one or more code objects. */
401 XHSA (hsa_fns
.hsa_executable_create_fn (HSA_PROFILE_FULL
,
402 HSA_EXECUTABLE_STATE_UNFROZEN
, "",
404 "Initialize GCN executable");
406 /* Hide relocations from the HSA runtime loader.
407 Keep a copy of the unmodified section headers to use later. */
408 Elf64_Shdr
*image_sections
=
409 (Elf64_Shdr
*) ((char *) image
+ image
->e_shoff
);
410 Elf64_Shdr
*sections
= malloc (sizeof (Elf64_Shdr
) * image
->e_shnum
);
411 memcpy (sections
, image_sections
, sizeof (Elf64_Shdr
) * image
->e_shnum
);
412 for (int i
= image
->e_shnum
- 1; i
>= 0; i
--)
414 if (image_sections
[i
].sh_type
== SHT_RELA
415 || image_sections
[i
].sh_type
== SHT_REL
)
416 /* Change section type to something harmless. */
417 image_sections
[i
].sh_type
= SHT_NOTE
;
420 /* Add the HSACO to the executable. */
421 hsa_code_object_t co
= { 0 };
422 XHSA (hsa_fns
.hsa_code_object_deserialize_fn (image
, image_size
, NULL
, &co
),
423 "Deserialize GCN code object");
424 XHSA (hsa_fns
.hsa_executable_load_code_object_fn (executable
, device
, co
,
426 "Load GCN code object");
428 /* We're done modifying he executable. */
429 XHSA (hsa_fns
.hsa_executable_freeze_fn (executable
, ""),
430 "Freeze GCN executable");
432 /* Locate the "_init_array" function, and read the kernel's properties. */
433 hsa_executable_symbol_t symbol
;
434 XHSA (hsa_fns
.hsa_executable_get_symbol_fn (executable
, NULL
, "_init_array",
436 "Find '_init_array' function");
437 XHSA (hsa_fns
.hsa_executable_symbol_get_info_fn
438 (symbol
, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT
, &init_array_kernel
),
439 "Extract '_init_array' kernel object kernel object");
441 /* Locate the "_fini_array" function, and read the kernel's properties. */
442 XHSA (hsa_fns
.hsa_executable_get_symbol_fn (executable
, NULL
, "_fini_array",
444 "Find '_fini_array' function");
445 XHSA (hsa_fns
.hsa_executable_symbol_get_info_fn
446 (symbol
, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT
, &fini_array_kernel
),
447 "Extract '_fini_array' kernel object kernel object");
449 /* Locate the "main" function, and read the kernel's properties. */
450 XHSA (hsa_fns
.hsa_executable_get_symbol_fn (executable
, NULL
, "main",
452 "Find 'main' function");
453 XHSA (hsa_fns
.hsa_executable_symbol_get_info_fn
454 (symbol
, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT
, &main_kernel
),
455 "Extract 'main' kernel object");
456 XHSA (hsa_fns
.hsa_executable_symbol_get_info_fn
457 (symbol
, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE
,
458 &kernarg_segment_size
),
459 "Extract kernarg segment size");
460 XHSA (hsa_fns
.hsa_executable_symbol_get_info_fn
461 (symbol
, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE
,
462 &group_segment_size
),
463 "Extract group segment size");
464 XHSA (hsa_fns
.hsa_executable_symbol_get_info_fn
465 (symbol
, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE
,
466 &private_segment_size
),
467 "Extract private segment size");
469 /* Find main function in ELF, and calculate actual load offset. */
470 Elf64_Addr load_offset
;
471 XHSA (hsa_fns
.hsa_executable_symbol_get_info_fn
472 (symbol
, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS
,
474 "Extract 'main' symbol address");
475 for (int i
= 0; i
< image
->e_shnum
; i
++)
476 if (sections
[i
].sh_type
== SHT_SYMTAB
)
478 Elf64_Shdr
*strtab
= §ions
[sections
[i
].sh_link
];
479 char *strings
= (char *) image
+ strtab
->sh_offset
;
481 for (size_t offset
= 0;
482 offset
< sections
[i
].sh_size
;
483 offset
+= sections
[i
].sh_entsize
)
485 Elf64_Sym
*sym
= (Elf64_Sym
*) ((char *) image
486 + sections
[i
].sh_offset
+ offset
);
487 if (strcmp ("main", strings
+ sym
->st_name
) == 0)
489 load_offset
-= sym
->st_value
;
494 /* We only get here when main was not found.
495 This should never happen. */
496 fprintf (stderr
, "Error: main function not found.\n");
500 /* Find dynamic symbol table. */
501 Elf64_Shdr
*dynsym
= NULL
;
502 for (int i
= 0; i
< image
->e_shnum
; i
++)
503 if (sections
[i
].sh_type
== SHT_DYNSYM
)
505 dynsym
= §ions
[i
];
509 /* Fix up relocations. */
510 for (int i
= 0; i
< image
->e_shnum
; i
++)
512 if (sections
[i
].sh_type
== SHT_RELA
)
513 for (size_t offset
= 0;
514 offset
< sections
[i
].sh_size
;
515 offset
+= sections
[i
].sh_entsize
)
517 Elf64_Rela
*reloc
= (Elf64_Rela
*) ((char *) image
518 + sections
[i
].sh_offset
522 ? (Elf64_Sym
*) ((char *) image
524 + (dynsym
->sh_entsize
525 * ELF64_R_SYM (reloc
->r_info
))) : NULL
);
527 int64_t S
= (sym
? sym
->st_value
: 0);
528 int64_t P
= reloc
->r_offset
+ load_offset
;
529 int64_t A
= reloc
->r_addend
;
530 int64_t B
= load_offset
;
532 switch (ELF64_R_TYPE (reloc
->r_info
))
534 case R_AMDGPU_ABS32_LO
:
535 V
= (S
+ A
) & 0xFFFFFFFF;
538 case R_AMDGPU_ABS32_HI
:
552 LLD seems to emit REL64 where the the assembler has ABS64.
553 This is clearly wrong because it's not what the compiler
554 is expecting. Let's assume, for now, that it's a bug.
555 In any case, GCN kernels are always self contained and
556 therefore relative relocations will have been resolved
557 already, so this should be a safe workaround. */
558 V
= S
+ A
/* - P */ ;
565 /* TODO R_AMDGPU_GOTPCREL */
566 /* TODO R_AMDGPU_GOTPCREL32_LO */
567 /* TODO R_AMDGPU_GOTPCREL32_HI */
568 case R_AMDGPU_REL32_LO
:
569 V
= (S
+ A
- P
) & 0xFFFFFFFF;
572 case R_AMDGPU_REL32_HI
:
573 V
= (S
+ A
- P
) >> 32;
576 case R_AMDGPU_RELATIVE64
:
581 fprintf (stderr
, "Error: unsupported relocation type.\n");
584 XHSA (hsa_fns
.hsa_memory_copy_fn ((void *) P
, &V
, size
),
585 "Fix up relocation");
590 /* Allocate some device memory from the kernargs region.
591 The returned address will be 32-bit (with excess zeroed on 64-bit host),
592 and accessible via the same address on both host and target (via
593 __flat_scalar GCN address space). */
596 device_malloc (size_t size
)
599 XHSA (hsa_fns
.hsa_memory_allocate_fn (kernargs_region
, size
, &result
),
600 "Allocate device memory");
604 /* These are the device pointers that will be transferred to the target.
605 The HSA Runtime points the kernargs register here.
606 They correspond to function signature:
607 int main (int argc, char *argv[], int *return_value)
608 The compiler expects this, for kernel functions, and will
609 automatically assign the exit value to *return_value. */
622 unsigned int next_output
;
635 unsigned int consumed
;
645 /* Print any console output from the kernel.
646 We print all entries from "consumed" to the next entry without a "written"
647 flag, or "next_output" is reached. The buffer is circular, but the
648 indices are absolute. It is assumed the kernel will stop writing data
649 if "next_output" wraps (becomes smaller than "consumed"). */
651 gomp_print_output (struct kernargs
*kernargs
, bool final
)
653 unsigned int limit
= (sizeof (kernargs
->output_data
.queue
)
654 / sizeof (kernargs
->output_data
.queue
[0]));
656 unsigned int from
= __atomic_load_n (&kernargs
->output_data
.consumed
,
658 unsigned int to
= kernargs
->output_data
.next_output
;
664 printf ("GCN print buffer overflowed.\n");
669 for (i
= from
; i
< to
; i
++)
671 struct printf_data
*data
= &kernargs
->output_data
.queue
[i
%limit
];
673 if (!data
->written
&& !final
)
679 printf ("%.128s%ld\n", data
->msg
, data
->ivalue
);
682 printf ("%.128s%f\n", data
->msg
, data
->dvalue
);
685 printf ("%.128s%.128s\n", data
->msg
, data
->text
);
688 printf ("%.128s%.128s", data
->msg
, data
->text
);
691 printf ("GCN print buffer error!\n");
696 __atomic_store_n (&kernargs
->output_data
.consumed
, i
+1,
702 /* Execute an already-loaded kernel on the device. */
705 run (uint64_t kernel
, void *kernargs
)
707 /* A "signal" is used to launch and monitor the kernel. */
709 XHSA (hsa_fns
.hsa_signal_create_fn (1, 0, NULL
, &signal
),
712 /* Configure for a single-worker kernel. */
713 uint64_t index
= hsa_fns
.hsa_queue_load_write_index_relaxed_fn (queue
);
714 const uint32_t queueMask
= queue
->size
- 1;
715 hsa_kernel_dispatch_packet_t
*dispatch_packet
=
716 &(((hsa_kernel_dispatch_packet_t
*) (queue
->base_address
))[index
&
718 dispatch_packet
->setup
|= 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS
;
719 dispatch_packet
->workgroup_size_x
= (uint16_t) 1;
720 dispatch_packet
->workgroup_size_y
= (uint16_t) 64;
721 dispatch_packet
->workgroup_size_z
= (uint16_t) 1;
722 dispatch_packet
->grid_size_x
= 1;
723 dispatch_packet
->grid_size_y
= 64;
724 dispatch_packet
->grid_size_z
= 1;
725 dispatch_packet
->completion_signal
= signal
;
726 dispatch_packet
->kernel_object
= kernel
;
727 dispatch_packet
->kernarg_address
= (void *) kernargs
;
728 dispatch_packet
->private_segment_size
= private_segment_size
;
729 dispatch_packet
->group_segment_size
= group_segment_size
;
732 header
|= HSA_FENCE_SCOPE_SYSTEM
<< HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE
;
733 header
|= HSA_FENCE_SCOPE_SYSTEM
<< HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE
;
734 header
|= HSA_PACKET_TYPE_KERNEL_DISPATCH
<< HSA_PACKET_HEADER_TYPE
;
736 __atomic_store_n ((uint32_t *) dispatch_packet
,
737 header
| (dispatch_packet
->setup
<< 16),
741 fprintf (stderr
, "Launch kernel\n");
743 hsa_fns
.hsa_queue_store_write_index_relaxed_fn (queue
, index
+ 1);
744 hsa_fns
.hsa_signal_store_relaxed_fn (queue
->doorbell_signal
, index
);
745 /* Kernel running ...... */
746 while (hsa_fns
.hsa_signal_wait_relaxed_fn (signal
, HSA_SIGNAL_CONDITION_LT
,
748 HSA_WAIT_STATE_ACTIVE
) != 0)
751 gomp_print_output (kernargs
, false);
754 gomp_print_output (kernargs
, true);
757 fprintf (stderr
, "Kernel exited\n");
759 XHSA (hsa_fns
.hsa_signal_destroy_fn (signal
),
764 main (int argc
, char *argv
[])
767 for (int i
= 1; i
< argc
; i
++)
769 if (!strcmp (argv
[i
], "--help"))
774 else if (!strcmp (argv
[i
], "--version"))
779 else if (!strcmp (argv
[i
], "--debug"))
781 else if (argv
[i
][0] == '-')
795 /* No kernel arguments were found. */
800 /* The remaining arguments are for the GCN kernel. */
801 int kernel_argc
= argc
- kernel_arg
;
802 char **kernel_argv
= &argv
[kernel_arg
];
805 load_image (kernel_argv
[0]);
807 /* Calculate size of function parameters + argv data. */
808 size_t args_size
= 0;
809 for (int i
= 0; i
< kernel_argc
; i
++)
810 args_size
+= strlen (kernel_argv
[i
]) + 1;
812 /* Allocate device memory for both function parameters and the argv
814 size_t heap_size
= 10 * 1024 * 1024; /* 10MB. */
815 struct kernargs
*kernargs
= device_malloc (sizeof (*kernargs
) + heap_size
);
818 int64_t argv_data
[kernel_argc
];
819 char strings
[args_size
];
820 } *args
= device_malloc (sizeof (struct argdata
));
822 /* Write the data to the target. */
823 kernargs
->argc
= kernel_argc
;
824 kernargs
->argv
= (int64_t) args
->argv_data
;
825 kernargs
->out_ptr
= (int64_t) &kernargs
->output_data
;
826 kernargs
->output_data
.return_value
= 0xcafe0000; /* Default return value. */
827 kernargs
->output_data
.next_output
= 0;
828 for (unsigned i
= 0; i
< (sizeof (kernargs
->output_data
.queue
)
829 / sizeof (kernargs
->output_data
.queue
[0])); i
++)
830 kernargs
->output_data
.queue
[i
].written
= 0;
831 kernargs
->output_data
.consumed
= 0;
833 for (int i
= 0; i
< kernel_argc
; i
++)
835 size_t arg_len
= strlen (kernel_argv
[i
]) + 1;
836 args
->argv_data
[i
] = (int64_t) &args
->strings
[offset
];
837 memcpy (&args
->strings
[offset
], kernel_argv
[i
], arg_len
+ 1);
840 kernargs
->heap_ptr
= (int64_t) &kernargs
->heap
;
841 kernargs
->heap
.size
= heap_size
;
843 /* Run constructors on the GPU. */
844 run (init_array_kernel
, kernargs
);
846 /* Run the kernel on the GPU. */
847 run (main_kernel
, kernargs
);
848 unsigned int return_value
=
849 (unsigned int) kernargs
->output_data
.return_value
;
851 /* Run destructors on the GPU. */
852 run (fini_array_kernel
, kernargs
);
854 unsigned int upper
= (return_value
& ~0xffff) >> 16;
856 printf ("Kernel exit value was never set\n");
857 else if (upper
== 0xffff)
860 ; /* Set by return from main. */
862 printf ("Possible kernel exit value corruption, 2 most significant bytes "
863 "aren't 0xffff, 0xcafe, or 0: 0x%x\n", return_value
);
867 unsigned int signal
= (return_value
>> 8) & 0xff;
868 if (signal
== SIGABRT
)
869 printf ("Kernel aborted\n");
870 else if (signal
!= 0)
871 printf ("Kernel received unkown signal\n");
875 printf ("Kernel exit value: %d\n", return_value
& 0xff);
877 /* Clean shut down. */
878 XHSA (hsa_fns
.hsa_memory_free_fn (kernargs
),
879 "Clean up device memory");
880 XHSA (hsa_fns
.hsa_executable_destroy_fn (executable
),
881 "Clean up GCN executable");
882 XHSA (hsa_fns
.hsa_queue_destroy_fn (queue
),
883 "Clean up device queue");
884 XHSA (hsa_fns
.hsa_shut_down_fn (),
885 "Shut down run-time");
887 return return_value
& 0xff;