]>
Commit | Line | Data |
---|---|---|
5326695a AS |
1 | /* Run a stand-alone AMD GCN kernel. |
2 | ||
3 | Copyright 2017 Mentor Graphics Corporation | |
a945c346 | 4 | Copyright (C) 2018-2024 Free Software Foundation, Inc. |
5326695a AS |
5 | |
6 | This program is free software: you can redistribute it and/or modify | |
7 | it under the terms of the GNU General Public License as published by | |
8 | the Free Software Foundation, either version 3 of the License, or | |
9 | (at your option) any later version. | |
10 | ||
11 | This program is distributed in the hope that it will be useful, | |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | GNU General Public License for more details. | |
15 | ||
16 | You should have received a copy of the GNU General Public License | |
17 | along with this program. If not, see <http://www.gnu.org/licenses/>. */ | |
18 | ||
19 | /* This program will run a compiled stand-alone GCN kernel on a GPU. | |
20 | ||
21 | The kernel entry point's signature must use a standard main signature: | |
22 | ||
23 | int main(int argc, char **argv) | |
24 | */ | |
25 | ||
26 | #include <stdint.h> | |
27 | #include <stdbool.h> | |
28 | #include <stdlib.h> | |
29 | #include <malloc.h> | |
30 | #include <stdio.h> | |
31 | #include <string.h> | |
32 | #include <dlfcn.h> | |
33 | #include <unistd.h> | |
34 | #include <elf.h> | |
35 | #include <signal.h> | |
36 | ||
5326695a | 37 | #include "hsa.h" |
f6fff8a6 | 38 | #include "../../../libgomp/config/gcn/libgomp-gcn.h" |
5326695a AS |
39 | |
40 | #ifndef HSA_RUNTIME_LIB | |
f062c3f1 | 41 | #define HSA_RUNTIME_LIB "libhsa-runtime64.so.1" |
5326695a AS |
42 | #endif |
43 | ||
44 | #ifndef VERSION_STRING | |
45 | #define VERSION_STRING "(version unknown)" | |
46 | #endif | |
47 | ||
48 | bool debug = false; | |
49 | ||
50 | hsa_agent_t device = { 0 }; | |
51 | hsa_queue_t *queue = NULL; | |
7039cebf KCY |
52 | uint64_t init_array_kernel = 0; |
53 | uint64_t fini_array_kernel = 0; | |
54 | uint64_t main_kernel = 0; | |
5326695a AS |
55 | hsa_executable_t executable = { 0 }; |
56 | ||
57 | hsa_region_t kernargs_region = { 0 }; | |
e3d0ee4a | 58 | hsa_region_t heap_region = { 0 }; |
5326695a AS |
59 | uint32_t kernarg_segment_size = 0; |
60 | uint32_t group_segment_size = 0; | |
61 | uint32_t private_segment_size = 0; | |
62 | ||
63 | static void | |
64 | usage (const char *progname) | |
65 | { | |
66 | printf ("Usage: %s [options] kernel [kernel-args]\n\n" | |
67 | "Options:\n" | |
68 | " --help\n" | |
69 | " --version\n" | |
70 | " --debug\n", progname); | |
71 | } | |
72 | ||
73 | static void | |
74 | version (const char *progname) | |
75 | { | |
76 | printf ("%s " VERSION_STRING "\n", progname); | |
77 | } | |
78 | ||
79 | /* As an HSA runtime is dlopened, following structure defines the necessary | |
80 | function pointers. | |
81 | Code adapted from libgomp. */ | |
82 | ||
83 | struct hsa_runtime_fn_info | |
84 | { | |
85 | /* HSA runtime. */ | |
86 | hsa_status_t (*hsa_status_string_fn) (hsa_status_t status, | |
87 | const char **status_string); | |
88 | hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent, | |
89 | hsa_agent_info_t attribute, | |
90 | void *value); | |
91 | hsa_status_t (*hsa_init_fn) (void); | |
92 | hsa_status_t (*hsa_iterate_agents_fn) | |
93 | (hsa_status_t (*callback) (hsa_agent_t agent, void *data), void *data); | |
94 | hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region, | |
95 | hsa_region_info_t attribute, | |
96 | void *value); | |
97 | hsa_status_t (*hsa_queue_create_fn) | |
98 | (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type, | |
99 | void (*callback) (hsa_status_t status, hsa_queue_t *source, void *data), | |
100 | void *data, uint32_t private_segment_size, | |
101 | uint32_t group_segment_size, hsa_queue_t **queue); | |
102 | hsa_status_t (*hsa_agent_iterate_regions_fn) | |
103 | (hsa_agent_t agent, | |
104 | hsa_status_t (*callback) (hsa_region_t region, void *data), void *data); | |
105 | hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable); | |
106 | hsa_status_t (*hsa_executable_create_fn) | |
107 | (hsa_profile_t profile, hsa_executable_state_t executable_state, | |
108 | const char *options, hsa_executable_t *executable); | |
109 | hsa_status_t (*hsa_executable_global_variable_define_fn) | |
110 | (hsa_executable_t executable, const char *variable_name, void *address); | |
111 | hsa_status_t (*hsa_executable_load_code_object_fn) | |
112 | (hsa_executable_t executable, hsa_agent_t agent, | |
113 | hsa_code_object_t code_object, const char *options); | |
114 | hsa_status_t (*hsa_executable_freeze_fn) (hsa_executable_t executable, | |
115 | const char *options); | |
116 | hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value, | |
117 | uint32_t num_consumers, | |
118 | const hsa_agent_t *consumers, | |
119 | hsa_signal_t *signal); | |
120 | hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size, | |
121 | void **ptr); | |
e3d0ee4a AS |
122 | hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent, |
123 | hsa_access_permission_t access); | |
5326695a AS |
124 | hsa_status_t (*hsa_memory_copy_fn) (void *dst, const void *src, |
125 | size_t size); | |
126 | hsa_status_t (*hsa_memory_free_fn) (void *ptr); | |
127 | hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal); | |
128 | hsa_status_t (*hsa_executable_get_symbol_fn) | |
129 | (hsa_executable_t executable, const char *module_name, | |
130 | const char *symbol_name, hsa_agent_t agent, int32_t call_convention, | |
131 | hsa_executable_symbol_t *symbol); | |
132 | hsa_status_t (*hsa_executable_symbol_get_info_fn) | |
133 | (hsa_executable_symbol_t executable_symbol, | |
134 | hsa_executable_symbol_info_t attribute, void *value); | |
135 | void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal, | |
136 | hsa_signal_value_t value); | |
137 | hsa_signal_value_t (*hsa_signal_wait_acquire_fn) | |
138 | (hsa_signal_t signal, hsa_signal_condition_t condition, | |
139 | hsa_signal_value_t compare_value, uint64_t timeout_hint, | |
140 | hsa_wait_state_t wait_state_hint); | |
141 | hsa_signal_value_t (*hsa_signal_wait_relaxed_fn) | |
142 | (hsa_signal_t signal, hsa_signal_condition_t condition, | |
143 | hsa_signal_value_t compare_value, uint64_t timeout_hint, | |
144 | hsa_wait_state_t wait_state_hint); | |
145 | hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue); | |
146 | hsa_status_t (*hsa_code_object_deserialize_fn) | |
147 | (void *serialized_code_object, size_t serialized_code_object_size, | |
148 | const char *options, hsa_code_object_t *code_object); | |
149 | uint64_t (*hsa_queue_load_write_index_relaxed_fn) | |
150 | (const hsa_queue_t *queue); | |
151 | void (*hsa_queue_store_write_index_relaxed_fn) | |
152 | (const hsa_queue_t *queue, uint64_t value); | |
153 | hsa_status_t (*hsa_shut_down_fn) (); | |
154 | }; | |
155 | ||
156 | /* HSA runtime functions that are initialized in init_hsa_context. | |
157 | Code adapted from libgomp. */ | |
158 | ||
159 | static struct hsa_runtime_fn_info hsa_fns; | |
160 | ||
161 | #define DLSYM_FN(function) \ | |
162 | *(void**)(&hsa_fns.function##_fn) = dlsym (handle, #function); \ | |
163 | if (hsa_fns.function##_fn == NULL) \ | |
164 | goto fail; | |
165 | ||
166 | static void | |
167 | init_hsa_runtime_functions (void) | |
168 | { | |
169 | void *handle = dlopen (HSA_RUNTIME_LIB, RTLD_LAZY); | |
170 | if (handle == NULL) | |
171 | { | |
172 | fprintf (stderr, | |
173 | "The HSA runtime is required to run GCN kernels on hardware.\n" | |
174 | "%s: File not found or could not be opened\n", | |
175 | HSA_RUNTIME_LIB); | |
176 | exit (1); | |
177 | } | |
178 | ||
179 | DLSYM_FN (hsa_status_string) | |
180 | DLSYM_FN (hsa_agent_get_info) | |
181 | DLSYM_FN (hsa_init) | |
182 | DLSYM_FN (hsa_iterate_agents) | |
183 | DLSYM_FN (hsa_region_get_info) | |
184 | DLSYM_FN (hsa_queue_create) | |
185 | DLSYM_FN (hsa_agent_iterate_regions) | |
186 | DLSYM_FN (hsa_executable_destroy) | |
187 | DLSYM_FN (hsa_executable_create) | |
188 | DLSYM_FN (hsa_executable_global_variable_define) | |
189 | DLSYM_FN (hsa_executable_load_code_object) | |
190 | DLSYM_FN (hsa_executable_freeze) | |
191 | DLSYM_FN (hsa_signal_create) | |
192 | DLSYM_FN (hsa_memory_allocate) | |
e3d0ee4a | 193 | DLSYM_FN (hsa_memory_assign_agent) |
5326695a AS |
194 | DLSYM_FN (hsa_memory_copy) |
195 | DLSYM_FN (hsa_memory_free) | |
196 | DLSYM_FN (hsa_signal_destroy) | |
197 | DLSYM_FN (hsa_executable_get_symbol) | |
198 | DLSYM_FN (hsa_executable_symbol_get_info) | |
199 | DLSYM_FN (hsa_signal_wait_acquire) | |
200 | DLSYM_FN (hsa_signal_wait_relaxed) | |
201 | DLSYM_FN (hsa_signal_store_relaxed) | |
202 | DLSYM_FN (hsa_queue_destroy) | |
203 | DLSYM_FN (hsa_code_object_deserialize) | |
204 | DLSYM_FN (hsa_queue_load_write_index_relaxed) | |
205 | DLSYM_FN (hsa_queue_store_write_index_relaxed) | |
206 | DLSYM_FN (hsa_shut_down) | |
207 | ||
208 | return; | |
209 | ||
210 | fail: | |
211 | fprintf (stderr, "Failed to find HSA functions in " HSA_RUNTIME_LIB "\n"); | |
212 | exit (1); | |
213 | } | |
214 | ||
215 | #undef DLSYM_FN | |
216 | ||
217 | /* Report a fatal error STR together with the HSA error corresponding to | |
218 | STATUS and terminate execution of the current process. */ | |
219 | ||
220 | static void | |
221 | hsa_fatal (const char *str, hsa_status_t status) | |
222 | { | |
223 | const char *hsa_error_msg; | |
224 | hsa_fns.hsa_status_string_fn (status, &hsa_error_msg); | |
225 | fprintf (stderr, "%s: FAILED\nHSA Runtime message: %s\n", str, | |
226 | hsa_error_msg); | |
227 | exit (1); | |
228 | } | |
229 | ||
230 | /* Helper macros to ensure we check the return values from the HSA Runtime. | |
231 | These just keep the rest of the code a bit cleaner. */ | |
232 | ||
233 | #define XHSA_CMP(FN, CMP, MSG) \ | |
234 | do { \ | |
235 | hsa_status_t status = (FN); \ | |
236 | if (!(CMP)) \ | |
237 | hsa_fatal ((MSG), status); \ | |
238 | else if (debug) \ | |
239 | fprintf (stderr, "%s: OK\n", (MSG)); \ | |
240 | } while (0) | |
241 | #define XHSA(FN, MSG) XHSA_CMP(FN, status == HSA_STATUS_SUCCESS, MSG) | |
242 | ||
243 | /* Callback of hsa_iterate_agents. | |
244 | Called once for each available device, and returns "break" when a | |
245 | suitable one has been found. */ | |
246 | ||
247 | static hsa_status_t | |
248 | get_gpu_agent (hsa_agent_t agent, void *data __attribute__ ((unused))) | |
249 | { | |
250 | hsa_device_type_t device_type; | |
251 | XHSA (hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE, | |
252 | &device_type), | |
253 | "Get agent type"); | |
254 | ||
255 | /* Select only GPU devices. */ | |
256 | /* TODO: support selecting from multiple GPUs. */ | |
257 | if (HSA_DEVICE_TYPE_GPU == device_type) | |
258 | { | |
259 | device = agent; | |
260 | return HSA_STATUS_INFO_BREAK; | |
261 | } | |
262 | ||
263 | /* The device was not suitable. */ | |
264 | return HSA_STATUS_SUCCESS; | |
265 | } | |
266 | ||
267 | /* Callback of hsa_iterate_regions. | |
268 | Called once for each available memory region, and returns "break" when a | |
269 | suitable one has been found. */ | |
270 | ||
271 | static hsa_status_t | |
e3d0ee4a AS |
272 | get_memory_region (hsa_region_t region, hsa_region_t *retval, |
273 | hsa_region_global_flag_t kind) | |
5326695a AS |
274 | { |
275 | /* Reject non-global regions. */ | |
276 | hsa_region_segment_t segment; | |
277 | hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT, &segment); | |
278 | if (HSA_REGION_SEGMENT_GLOBAL != segment) | |
279 | return HSA_STATUS_SUCCESS; | |
280 | ||
281 | /* Find a region with the KERNARG flag set. */ | |
282 | hsa_region_global_flag_t flags; | |
283 | hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS, | |
284 | &flags); | |
e3d0ee4a | 285 | if (flags & kind) |
5326695a | 286 | { |
e3d0ee4a | 287 | *retval = region; |
5326695a AS |
288 | return HSA_STATUS_INFO_BREAK; |
289 | } | |
290 | ||
291 | /* The region was not suitable. */ | |
292 | return HSA_STATUS_SUCCESS; | |
293 | } | |
294 | ||
e3d0ee4a AS |
295 | static hsa_status_t |
296 | get_kernarg_region (hsa_region_t region, void *data __attribute__((unused))) | |
297 | { | |
298 | return get_memory_region (region, &kernargs_region, | |
299 | HSA_REGION_GLOBAL_FLAG_KERNARG); | |
300 | } | |
301 | ||
302 | static hsa_status_t | |
303 | get_heap_region (hsa_region_t region, void *data __attribute__((unused))) | |
304 | { | |
305 | return get_memory_region (region, &heap_region, | |
306 | HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED); | |
307 | } | |
308 | ||
5326695a AS |
309 | /* Initialize the HSA Runtime library and GPU device. */ |
310 | ||
311 | static void | |
312 | init_device () | |
313 | { | |
314 | /* Load the shared library and find the API functions. */ | |
315 | init_hsa_runtime_functions (); | |
316 | ||
317 | /* Initialize the HSA Runtime. */ | |
318 | XHSA (hsa_fns.hsa_init_fn (), | |
319 | "Initialize run-time"); | |
320 | ||
321 | /* Select a suitable device. | |
322 | The call-back function, get_gpu_agent, does the selection. */ | |
323 | XHSA_CMP (hsa_fns.hsa_iterate_agents_fn (get_gpu_agent, NULL), | |
324 | status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK, | |
325 | "Find a device"); | |
326 | ||
327 | /* Initialize the queue used for launching kernels. */ | |
328 | uint32_t queue_size = 0; | |
329 | XHSA (hsa_fns.hsa_agent_get_info_fn (device, HSA_AGENT_INFO_QUEUE_MAX_SIZE, | |
330 | &queue_size), | |
331 | "Find max queue size"); | |
332 | XHSA (hsa_fns.hsa_queue_create_fn (device, queue_size, | |
333 | HSA_QUEUE_TYPE_SINGLE, NULL, | |
334 | NULL, UINT32_MAX, UINT32_MAX, &queue), | |
335 | "Set up a device queue"); | |
336 | ||
337 | /* Select a memory region for the kernel arguments. | |
338 | The call-back function, get_kernarg_region, does the selection. */ | |
339 | XHSA_CMP (hsa_fns.hsa_agent_iterate_regions_fn (device, get_kernarg_region, | |
340 | NULL), | |
341 | status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK, | |
342 | "Locate kernargs memory"); | |
e3d0ee4a AS |
343 | |
344 | /* Select a memory region for the kernel heap. | |
345 | The call-back function, get_heap_region, does the selection. */ | |
346 | XHSA_CMP (hsa_fns.hsa_agent_iterate_regions_fn (device, get_heap_region, | |
347 | NULL), | |
348 | status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK, | |
349 | "Locate device memory"); | |
5326695a AS |
350 | } |
351 | ||
352 | ||
353 | /* Read a whole input file. | |
354 | Code copied from mkoffload. */ | |
355 | ||
356 | static char * | |
357 | read_file (const char *filename, size_t *plen) | |
358 | { | |
359 | size_t alloc = 16384; | |
360 | size_t base = 0; | |
361 | char *buffer; | |
362 | ||
363 | FILE *stream = fopen (filename, "rb"); | |
364 | if (!stream) | |
365 | { | |
366 | perror (filename); | |
367 | exit (1); | |
368 | } | |
369 | ||
370 | if (!fseek (stream, 0, SEEK_END)) | |
371 | { | |
372 | /* Get the file size. */ | |
373 | long s = ftell (stream); | |
374 | if (s >= 0) | |
375 | alloc = s + 100; | |
376 | fseek (stream, 0, SEEK_SET); | |
377 | } | |
378 | buffer = malloc (alloc); | |
379 | ||
380 | for (;;) | |
381 | { | |
382 | size_t n = fread (buffer + base, 1, alloc - base - 1, stream); | |
383 | ||
384 | if (!n) | |
385 | break; | |
386 | base += n; | |
387 | if (base + 1 == alloc) | |
388 | { | |
389 | alloc *= 2; | |
390 | buffer = realloc (buffer, alloc); | |
391 | } | |
392 | } | |
393 | buffer[base] = 0; | |
394 | *plen = base; | |
395 | ||
396 | fclose (stream); | |
397 | ||
398 | return buffer; | |
399 | } | |
400 | ||
401 | /* Read a HSA Code Object (HSACO) from file, and load it into the device. */ | |
402 | ||
403 | static void | |
404 | load_image (const char *filename) | |
405 | { | |
406 | size_t image_size; | |
407 | Elf64_Ehdr *image = (void *) read_file (filename, &image_size); | |
408 | ||
409 | /* An "executable" consists of one or more code objects. */ | |
410 | XHSA (hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL, | |
411 | HSA_EXECUTABLE_STATE_UNFROZEN, "", | |
412 | &executable), | |
413 | "Initialize GCN executable"); | |
414 | ||
5326695a AS |
415 | /* Add the HSACO to the executable. */ |
416 | hsa_code_object_t co = { 0 }; | |
417 | XHSA (hsa_fns.hsa_code_object_deserialize_fn (image, image_size, NULL, &co), | |
418 | "Deserialize GCN code object"); | |
419 | XHSA (hsa_fns.hsa_executable_load_code_object_fn (executable, device, co, | |
420 | ""), | |
421 | "Load GCN code object"); | |
422 | ||
423 | /* We're done modifying he executable. */ | |
424 | XHSA (hsa_fns.hsa_executable_freeze_fn (executable, ""), | |
425 | "Freeze GCN executable"); | |
426 | ||
7039cebf | 427 | /* Locate the "_init_array" function, and read the kernel's properties. */ |
5326695a | 428 | hsa_executable_symbol_t symbol; |
f062c3f1 AS |
429 | XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, |
430 | "_init_array.kd", device, 0, | |
431 | &symbol), | |
7039cebf KCY |
432 | "Find '_init_array' function"); |
433 | XHSA (hsa_fns.hsa_executable_symbol_get_info_fn | |
f062c3f1 AS |
434 | (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, |
435 | &init_array_kernel), | |
7039cebf KCY |
436 | "Extract '_init_array' kernel object kernel object"); |
437 | ||
438 | /* Locate the "_fini_array" function, and read the kernel's properties. */ | |
f062c3f1 AS |
439 | XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, |
440 | "_fini_array.kd", device, 0, | |
441 | &symbol), | |
7039cebf KCY |
442 | "Find '_fini_array' function"); |
443 | XHSA (hsa_fns.hsa_executable_symbol_get_info_fn | |
f062c3f1 AS |
444 | (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, |
445 | &fini_array_kernel), | |
7039cebf KCY |
446 | "Extract '_fini_array' kernel object kernel object"); |
447 | ||
448 | /* Locate the "main" function, and read the kernel's properties. */ | |
f062c3f1 | 449 | XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "main.kd", |
5326695a AS |
450 | device, 0, &symbol), |
451 | "Find 'main' function"); | |
452 | XHSA (hsa_fns.hsa_executable_symbol_get_info_fn | |
7039cebf KCY |
453 | (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &main_kernel), |
454 | "Extract 'main' kernel object"); | |
5326695a AS |
455 | XHSA (hsa_fns.hsa_executable_symbol_get_info_fn |
456 | (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, | |
457 | &kernarg_segment_size), | |
458 | "Extract kernarg segment size"); | |
459 | XHSA (hsa_fns.hsa_executable_symbol_get_info_fn | |
460 | (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, | |
461 | &group_segment_size), | |
462 | "Extract group segment size"); | |
463 | XHSA (hsa_fns.hsa_executable_symbol_get_info_fn | |
464 | (symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, | |
465 | &private_segment_size), | |
466 | "Extract private segment size"); | |
5326695a AS |
467 | } |
468 | ||
469 | /* Allocate some device memory from the kernargs region. | |
470 | The returned address will be 32-bit (with excess zeroed on 64-bit host), | |
471 | and accessible via the same address on both host and target (via | |
472 | __flat_scalar GCN address space). */ | |
473 | ||
474 | static void * | |
e3d0ee4a | 475 | device_malloc (size_t size, hsa_region_t region) |
5326695a AS |
476 | { |
477 | void *result; | |
e3d0ee4a | 478 | XHSA (hsa_fns.hsa_memory_allocate_fn (region, size, &result), |
5326695a AS |
479 | "Allocate device memory"); |
480 | return result; | |
481 | } | |
482 | ||
483 | /* These are the device pointers that will be transferred to the target. | |
484 | The HSA Runtime points the kernargs register here. | |
485 | They correspond to function signature: | |
486 | int main (int argc, char *argv[], int *return_value) | |
487 | The compiler expects this, for kernel functions, and will | |
488 | automatically assign the exit value to *return_value. */ | |
489 | struct kernargs | |
490 | { | |
f6fff8a6 AS |
491 | union { |
492 | struct { | |
493 | int32_t argc; | |
494 | int64_t argv; | |
495 | } args; | |
496 | struct kernargs_abi abi; | |
497 | }; | |
498 | struct output output_data; | |
5326695a AS |
499 | }; |
500 | ||
501 | /* Print any console output from the kernel. | |
f9cacebb AS |
502 | We print all entries from "consumed" to the next entry without a "written" |
503 | flag, or "next_output" is reached. The buffer is circular, but the | |
504 | indices are absolute. It is assumed the kernel will stop writing data | |
505 | if "next_output" wraps (becomes smaller than "consumed"). */ | |
5326695a | 506 | void |
f9cacebb | 507 | gomp_print_output (struct kernargs *kernargs, bool final) |
5326695a | 508 | { |
f9cacebb AS |
509 | unsigned int limit = (sizeof (kernargs->output_data.queue) |
510 | / sizeof (kernargs->output_data.queue[0])); | |
5326695a | 511 | |
f9cacebb AS |
512 | unsigned int from = __atomic_load_n (&kernargs->output_data.consumed, |
513 | __ATOMIC_ACQUIRE); | |
514 | unsigned int to = kernargs->output_data.next_output; | |
515 | ||
516 | if (from > to) | |
517 | { | |
518 | /* Overflow. */ | |
519 | if (final) | |
520 | printf ("GCN print buffer overflowed.\n"); | |
521 | return; | |
522 | } | |
523 | ||
524 | unsigned int i; | |
525 | for (i = from; i < to; i++) | |
5326695a | 526 | { |
f9cacebb | 527 | struct printf_data *data = &kernargs->output_data.queue[i%limit]; |
5326695a | 528 | |
f9cacebb | 529 | if (!data->written && !final) |
5326695a AS |
530 | break; |
531 | ||
532 | switch (data->type) | |
533 | { | |
534 | case 0: | |
535 | printf ("%.128s%ld\n", data->msg, data->ivalue); | |
536 | break; | |
537 | case 1: | |
538 | printf ("%.128s%f\n", data->msg, data->dvalue); | |
539 | break; | |
540 | case 2: | |
541 | printf ("%.128s%.128s\n", data->msg, data->text); | |
542 | break; | |
543 | case 3: | |
544 | printf ("%.128s%.128s", data->msg, data->text); | |
545 | break; | |
f9cacebb AS |
546 | default: |
547 | printf ("GCN print buffer error!\n"); | |
548 | break; | |
5326695a AS |
549 | } |
550 | ||
551 | data->written = 0; | |
f9cacebb AS |
552 | __atomic_store_n (&kernargs->output_data.consumed, i+1, |
553 | __ATOMIC_RELEASE); | |
5326695a | 554 | } |
f9cacebb | 555 | fflush (stdout); |
5326695a AS |
556 | } |
557 | ||
558 | /* Execute an already-loaded kernel on the device. */ | |
559 | ||
560 | static void | |
7039cebf | 561 | run (uint64_t kernel, void *kernargs) |
5326695a AS |
562 | { |
563 | /* A "signal" is used to launch and monitor the kernel. */ | |
564 | hsa_signal_t signal; | |
565 | XHSA (hsa_fns.hsa_signal_create_fn (1, 0, NULL, &signal), | |
566 | "Create signal"); | |
567 | ||
568 | /* Configure for a single-worker kernel. */ | |
569 | uint64_t index = hsa_fns.hsa_queue_load_write_index_relaxed_fn (queue); | |
570 | const uint32_t queueMask = queue->size - 1; | |
571 | hsa_kernel_dispatch_packet_t *dispatch_packet = | |
572 | &(((hsa_kernel_dispatch_packet_t *) (queue->base_address))[index & | |
573 | queueMask]); | |
574 | dispatch_packet->setup |= 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; | |
575 | dispatch_packet->workgroup_size_x = (uint16_t) 1; | |
576 | dispatch_packet->workgroup_size_y = (uint16_t) 64; | |
577 | dispatch_packet->workgroup_size_z = (uint16_t) 1; | |
578 | dispatch_packet->grid_size_x = 1; | |
579 | dispatch_packet->grid_size_y = 64; | |
580 | dispatch_packet->grid_size_z = 1; | |
581 | dispatch_packet->completion_signal = signal; | |
582 | dispatch_packet->kernel_object = kernel; | |
583 | dispatch_packet->kernarg_address = (void *) kernargs; | |
584 | dispatch_packet->private_segment_size = private_segment_size; | |
585 | dispatch_packet->group_segment_size = group_segment_size; | |
586 | ||
587 | uint16_t header = 0; | |
588 | header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; | |
589 | header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; | |
590 | header |= HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE; | |
591 | ||
592 | __atomic_store_n ((uint32_t *) dispatch_packet, | |
593 | header | (dispatch_packet->setup << 16), | |
594 | __ATOMIC_RELEASE); | |
595 | ||
596 | if (debug) | |
597 | fprintf (stderr, "Launch kernel\n"); | |
598 | ||
599 | hsa_fns.hsa_queue_store_write_index_relaxed_fn (queue, index + 1); | |
600 | hsa_fns.hsa_signal_store_relaxed_fn (queue->doorbell_signal, index); | |
601 | /* Kernel running ...... */ | |
5326695a AS |
602 | while (hsa_fns.hsa_signal_wait_relaxed_fn (signal, HSA_SIGNAL_CONDITION_LT, |
603 | 1, 1000000, | |
604 | HSA_WAIT_STATE_ACTIVE) != 0) | |
605 | { | |
606 | usleep (10000); | |
f9cacebb | 607 | gomp_print_output (kernargs, false); |
5326695a AS |
608 | } |
609 | ||
f9cacebb | 610 | gomp_print_output (kernargs, true); |
5326695a AS |
611 | |
612 | if (debug) | |
613 | fprintf (stderr, "Kernel exited\n"); | |
614 | ||
615 | XHSA (hsa_fns.hsa_signal_destroy_fn (signal), | |
616 | "Clean up signal"); | |
617 | } | |
618 | ||
619 | int | |
620 | main (int argc, char *argv[]) | |
621 | { | |
622 | int kernel_arg = 0; | |
623 | for (int i = 1; i < argc; i++) | |
624 | { | |
625 | if (!strcmp (argv[i], "--help")) | |
626 | { | |
627 | usage (argv[0]); | |
628 | return 0; | |
629 | } | |
630 | else if (!strcmp (argv[i], "--version")) | |
631 | { | |
632 | version (argv[0]); | |
633 | return 0; | |
634 | } | |
635 | else if (!strcmp (argv[i], "--debug")) | |
636 | debug = true; | |
637 | else if (argv[i][0] == '-') | |
638 | { | |
639 | usage (argv[0]); | |
640 | return 1; | |
641 | } | |
642 | else | |
643 | { | |
644 | kernel_arg = i; | |
645 | break; | |
646 | } | |
647 | } | |
648 | ||
649 | if (!kernel_arg) | |
650 | { | |
651 | /* No kernel arguments were found. */ | |
652 | usage (argv[0]); | |
653 | return 1; | |
654 | } | |
655 | ||
656 | /* The remaining arguments are for the GCN kernel. */ | |
657 | int kernel_argc = argc - kernel_arg; | |
658 | char **kernel_argv = &argv[kernel_arg]; | |
659 | ||
660 | init_device (); | |
661 | load_image (kernel_argv[0]); | |
662 | ||
663 | /* Calculate size of function parameters + argv data. */ | |
664 | size_t args_size = 0; | |
665 | for (int i = 0; i < kernel_argc; i++) | |
666 | args_size += strlen (kernel_argv[i]) + 1; | |
667 | ||
f6fff8a6 AS |
668 | /* The device stack can be adjusted via an environment variable. */ |
669 | char *envvar = getenv ("GCN_STACK_SIZE"); | |
670 | int stack_size = 1 * 1024 * 1024; /* 1MB default. */ | |
671 | if (envvar) | |
672 | { | |
673 | int val = atoi (envvar); | |
674 | if (val) | |
675 | stack_size = val; | |
676 | } | |
677 | ||
5326695a AS |
678 | /* Allocate device memory for both function parameters and the argv |
679 | data. */ | |
e3d0ee4a AS |
680 | struct kernargs *kernargs = device_malloc (sizeof (*kernargs), |
681 | kernargs_region); | |
5326695a AS |
682 | struct argdata |
683 | { | |
684 | int64_t argv_data[kernel_argc]; | |
685 | char strings[args_size]; | |
e3d0ee4a AS |
686 | } *args = device_malloc (sizeof (struct argdata), kernargs_region); |
687 | ||
688 | size_t heap_size = 10 * 1024 * 1024; /* 10MB. */ | |
689 | struct heap *heap = device_malloc (heap_size, heap_region); | |
690 | XHSA (hsa_fns.hsa_memory_assign_agent_fn (heap, device, | |
691 | HSA_ACCESS_PERMISSION_RW), | |
692 | "Assign heap to device agent"); | |
f6fff8a6 | 693 | void *stack = device_malloc (stack_size, heap_region); |
5326695a AS |
694 | |
695 | /* Write the data to the target. */ | |
f6fff8a6 AS |
696 | kernargs->args.argc = kernel_argc; |
697 | kernargs->args.argv = (int64_t) args->argv_data; | |
698 | kernargs->abi.out_ptr = (int64_t) &kernargs->output_data; | |
5326695a AS |
699 | kernargs->output_data.return_value = 0xcafe0000; /* Default return value. */ |
700 | kernargs->output_data.next_output = 0; | |
701 | for (unsigned i = 0; i < (sizeof (kernargs->output_data.queue) | |
702 | / sizeof (kernargs->output_data.queue[0])); i++) | |
703 | kernargs->output_data.queue[i].written = 0; | |
f9cacebb | 704 | kernargs->output_data.consumed = 0; |
5326695a AS |
705 | int offset = 0; |
706 | for (int i = 0; i < kernel_argc; i++) | |
707 | { | |
708 | size_t arg_len = strlen (kernel_argv[i]) + 1; | |
709 | args->argv_data[i] = (int64_t) &args->strings[offset]; | |
710 | memcpy (&args->strings[offset], kernel_argv[i], arg_len + 1); | |
711 | offset += arg_len; | |
712 | } | |
f6fff8a6 | 713 | kernargs->abi.heap_ptr = (int64_t) heap; |
e3d0ee4a | 714 | hsa_fns.hsa_memory_copy_fn (&heap->size, &heap_size, sizeof (heap_size)); |
f6fff8a6 AS |
715 | kernargs->abi.arena_ptr = 0; |
716 | kernargs->abi.stack_ptr = (int64_t) stack; | |
717 | kernargs->abi.stack_size_per_thread = stack_size; | |
5326695a | 718 | |
7039cebf KCY |
719 | /* Run constructors on the GPU. */ |
720 | run (init_array_kernel, kernargs); | |
721 | ||
5326695a | 722 | /* Run the kernel on the GPU. */ |
7039cebf | 723 | run (main_kernel, kernargs); |
5326695a AS |
724 | unsigned int return_value = |
725 | (unsigned int) kernargs->output_data.return_value; | |
726 | ||
7039cebf KCY |
727 | /* Run destructors on the GPU. */ |
728 | run (fini_array_kernel, kernargs); | |
729 | ||
5326695a AS |
730 | unsigned int upper = (return_value & ~0xffff) >> 16; |
731 | if (upper == 0xcafe) | |
e8daba7e AS |
732 | { |
733 | printf ("Kernel exit value was never set\n"); | |
734 | return_value = 0xff; | |
735 | } | |
5326695a AS |
736 | else if (upper == 0xffff) |
737 | ; /* Set by exit. */ | |
738 | else if (upper == 0) | |
739 | ; /* Set by return from main. */ | |
740 | else | |
741 | printf ("Possible kernel exit value corruption, 2 most significant bytes " | |
742 | "aren't 0xffff, 0xcafe, or 0: 0x%x\n", return_value); | |
743 | ||
744 | if (upper == 0xffff) | |
745 | { | |
746 | unsigned int signal = (return_value >> 8) & 0xff; | |
747 | if (signal == SIGABRT) | |
748 | printf ("Kernel aborted\n"); | |
749 | else if (signal != 0) | |
750 | printf ("Kernel received unkown signal\n"); | |
751 | } | |
752 | ||
753 | if (debug) | |
754 | printf ("Kernel exit value: %d\n", return_value & 0xff); | |
755 | ||
756 | /* Clean shut down. */ | |
757 | XHSA (hsa_fns.hsa_memory_free_fn (kernargs), | |
758 | "Clean up device memory"); | |
759 | XHSA (hsa_fns.hsa_executable_destroy_fn (executable), | |
760 | "Clean up GCN executable"); | |
761 | XHSA (hsa_fns.hsa_queue_destroy_fn (queue), | |
762 | "Clean up device queue"); | |
763 | XHSA (hsa_fns.hsa_shut_down_fn (), | |
764 | "Shut down run-time"); | |
765 | ||
766 | return return_value & 0xff; | |
767 | } |