/* Plugin for AMD GCN execution.

   Copyright (C) 2013-2020 Free Software Foundation, Inc.

   Contributed by Mentor Embedded

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* {{{ Includes and defines  */

#include "config.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <inttypes.h>
#include <stdbool.h>
#include <limits.h>
#include <hsa.h>
#include <dlfcn.h>
#include <signal.h>
#include "libgomp-plugin.h"
#include "gomp-constants.h"
#include <elf.h>
#include "oacc-plugin.h"
#include "oacc-int.h"
#include <assert.h>

/* Additional definitions not in HSA 1.1.
   FIXME: this needs to be updated in hsa.h for upstream, but the only source
   right now is the ROCr source which may cause license issues.  */
#define HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT 0xA002

/* These probably won't be in elf.h for a while.  */
#define R_AMDGPU_NONE           0
#define R_AMDGPU_ABS32_LO       1   /* (S + A) & 0xFFFFFFFF  */
#define R_AMDGPU_ABS32_HI       2   /* (S + A) >> 32  */
#define R_AMDGPU_ABS64          3   /* S + A  */
#define R_AMDGPU_REL32          4   /* S + A - P  */
#define R_AMDGPU_REL64          5   /* S + A - P  */
#define R_AMDGPU_ABS32          6   /* S + A  */
#define R_AMDGPU_GOTPCREL       7   /* G + GOT + A - P  */
#define R_AMDGPU_GOTPCREL32_LO  8   /* (G + GOT + A - P) & 0xFFFFFFFF  */
#define R_AMDGPU_GOTPCREL32_HI  9   /* (G + GOT + A - P) >> 32  */
#define R_AMDGPU_REL32_LO       10  /* (S + A - P) & 0xFFFFFFFF  */
#define R_AMDGPU_REL32_HI       11  /* (S + A - P) >> 32  */
/* 12 is reserved.  */
#define R_AMDGPU_RELATIVE64     13  /* B + A  */
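
/* For illustration (editor's sketch, not part of the plugin): applying one
   of these relocations means patching the image at address P with a value
   computed from the symbol value S and the addend A, following the formulas
   above.  A hypothetical applier for the two simplest cases:

     static void
     apply_reloc (uint32_t type, char *P, uint64_t S, int64_t A)
     {
       switch (type)
         {
         case R_AMDGPU_ABS64:
           *(uint64_t *) P = S + A;
           break;
         case R_AMDGPU_ABS32_LO:
           *(uint32_t *) P = (S + A) & 0xFFFFFFFF;
           break;
         default:
           break;  // The other types follow their formulas similarly.
         }
     }
*/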

/* GCN specific definitions for asynchronous queues.  */

#define ASYNC_QUEUE_SIZE 64
#define DRAIN_QUEUE_SYNCHRONOUS_P false
#define DEBUG_QUEUES 0
#define DEBUG_THREAD_SLEEP 0
#define DEBUG_THREAD_SIGNAL 0

/* Defaults.  */
#define DEFAULT_GCN_HEAP_SIZE (100*1024*1024)  /* 100MB.  */

/* Secure getenv() which returns NULL if running as SUID/SGID.  */
#ifndef HAVE_SECURE_GETENV
#ifdef HAVE___SECURE_GETENV
#define secure_getenv __secure_getenv
#elif defined (HAVE_UNISTD_H) && defined(HAVE_GETUID) && defined(HAVE_GETEUID) \
  && defined(HAVE_GETGID) && defined(HAVE_GETEGID)

#include <unistd.h>

/* Implementation of secure_getenv() for targets where it is not provided but
   we at least have the means to test real and effective IDs.  */

static char *
secure_getenv (const char *name)
{
  if ((getuid () == geteuid ()) && (getgid () == getegid ()))
    return getenv (name);
  else
    return NULL;
}

#else
#define secure_getenv getenv
#endif
#endif

/* }}}  */
/* {{{ Types  */

/* GCN-specific implementation of the GOMP_PLUGIN_acc_thread data.  */

struct gcn_thread
{
  /* The thread number from the async clause, or GOMP_ASYNC_SYNC.  */
  int async;
};

/* As the HSA runtime is dlopened, the following structure defines the
   function pointers utilized by the plug-in.  */

struct hsa_runtime_fn_info
{
  /* HSA runtime.  */
  hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
					const char **status_string);
  hsa_status_t (*hsa_system_get_info_fn) (hsa_system_info_t attribute,
					  void *value);
  hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
					 hsa_agent_info_t attribute,
					 void *value);
  hsa_status_t (*hsa_isa_get_info_fn) (hsa_isa_t isa,
				       hsa_isa_info_t attribute,
				       uint32_t index,
				       void *value);
  hsa_status_t (*hsa_init_fn) (void);
  hsa_status_t (*hsa_iterate_agents_fn)
    (hsa_status_t (*callback) (hsa_agent_t agent, void *data), void *data);
  hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
					  hsa_region_info_t attribute,
					  void *value);
  hsa_status_t (*hsa_queue_create_fn)
    (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
     void (*callback) (hsa_status_t status, hsa_queue_t *source, void *data),
     void *data, uint32_t private_segment_size,
     uint32_t group_segment_size, hsa_queue_t **queue);
  hsa_status_t (*hsa_agent_iterate_regions_fn)
    (hsa_agent_t agent,
     hsa_status_t (*callback) (hsa_region_t region, void *data), void *data);
  hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
  hsa_status_t (*hsa_executable_create_fn)
    (hsa_profile_t profile, hsa_executable_state_t executable_state,
     const char *options, hsa_executable_t *executable);
  hsa_status_t (*hsa_executable_global_variable_define_fn)
    (hsa_executable_t executable, const char *variable_name, void *address);
  hsa_status_t (*hsa_executable_load_code_object_fn)
    (hsa_executable_t executable, hsa_agent_t agent,
     hsa_code_object_t code_object, const char *options);
  hsa_status_t (*hsa_executable_freeze_fn) (hsa_executable_t executable,
					    const char *options);
  hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
					uint32_t num_consumers,
					const hsa_agent_t *consumers,
					hsa_signal_t *signal);
  hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
					  void **ptr);
  hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent,
					      hsa_access_permission_t access);
  hsa_status_t (*hsa_memory_copy_fn) (void *dst, const void *src, size_t size);
  hsa_status_t (*hsa_memory_free_fn) (void *ptr);
  hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
  hsa_status_t (*hsa_executable_get_symbol_fn)
    (hsa_executable_t executable, const char *module_name,
     const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
     hsa_executable_symbol_t *symbol);
  hsa_status_t (*hsa_executable_symbol_get_info_fn)
    (hsa_executable_symbol_t executable_symbol,
     hsa_executable_symbol_info_t attribute, void *value);
  hsa_status_t (*hsa_executable_iterate_symbols_fn)
    (hsa_executable_t executable,
     hsa_status_t (*callback) (hsa_executable_t executable,
			       hsa_executable_symbol_t symbol, void *data),
     void *data);
  uint64_t (*hsa_queue_add_write_index_release_fn) (const hsa_queue_t *queue,
						    uint64_t value);
  uint64_t (*hsa_queue_load_read_index_acquire_fn) (const hsa_queue_t *queue);
  void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
				       hsa_signal_value_t value);
  void (*hsa_signal_store_release_fn) (hsa_signal_t signal,
				       hsa_signal_value_t value);
  hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
    (hsa_signal_t signal, hsa_signal_condition_t condition,
     hsa_signal_value_t compare_value, uint64_t timeout_hint,
     hsa_wait_state_t wait_state_hint);
  hsa_signal_value_t (*hsa_signal_load_acquire_fn) (hsa_signal_t signal);
  hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);

  hsa_status_t (*hsa_code_object_deserialize_fn)
    (void *serialized_code_object, size_t serialized_code_object_size,
     const char *options, hsa_code_object_t *code_object);
};

/* Structure describing the run-time and grid properties of an HSA kernel
   launch.  This needs to match the format passed to GOMP_OFFLOAD_run.  */

struct GOMP_kernel_launch_attributes
{
  /* Number of dimensions the workload has.  Maximum number is 3.  */
  uint32_t ndim;
  /* Size of the grid in the three respective dimensions.  */
  uint32_t gdims[3];
  /* Size of work-groups in the respective dimensions.  */
  uint32_t wdims[3];
};
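
/* A minimal sketch (editor's example, using hypothetical values) of how a
   caller could describe a two-dimensional launch of 1024x8 work items in
   64x8 work-groups; the structure reaches GOMP_OFFLOAD_run via the target
   arguments mechanism parsed in parse_target_attributes, below:

     struct GOMP_kernel_launch_attributes kla;
     kla.ndim = 2;
     kla.gdims[0] = 1024;  // Grid size in dimension 0.
     kla.gdims[1] = 8;
     kla.wdims[0] = 64;    // Work-group size in dimension 0.
     kla.wdims[1] = 8;
*/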

/* Collection of information needed for a dispatch of a kernel.  */


struct kernel_dispatch
{
  struct agent_info *agent;
  /* Pointer to a command queue associated with a kernel dispatch agent.  */
  void *queue;
  /* Pointer to a memory space used for passing kernel arguments.  */
  void *kernarg_address;
  /* Kernel object.  */
  uint64_t object;
  /* Synchronization signal used for dispatch synchronization.  */
  uint64_t signal;
  /* Private segment size.  */
  uint32_t private_segment_size;
  /* Group segment size.  */
  uint32_t group_segment_size;
};

/* Structure of the kernargs segment, supporting console output.

   This needs to match the definitions in Newlib, and the expectations
   in libgomp target code.  */

struct kernargs {
  /* Leave space for the real kernel arguments.
     OpenACC and OpenMP only use one pointer.  */
  int64_t dummy1;
  int64_t dummy2;

  /* A pointer to struct output, below, for console output data.  */
  int64_t out_ptr;

  /* A pointer to struct heap, below.  */
  int64_t heap_ptr;

  /* A pointer to an ephemeral memory arena.
     Only needed for OpenMP.  */
  int64_t arena_ptr;

  /* Output data.  */
  struct output {
    int return_value;
    unsigned int next_output;
    struct printf_data {
      int written;
      char msg[128];
      int type;
      union {
	int64_t ivalue;
	double dvalue;
	char text[128];
      };
    } queue[1024];
    unsigned int consumed;
  } output_data;
};
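
/* For illustration only (editor's sketch; the real producer lives in the
   device-side Newlib code): a kernel appends a console message by claiming
   the next slot in the circular queue and setting "written" last, which is
   the ordering that console_output, below, relies upon:

     struct kernargs *ka = ...;  // Hypothetical pointer to the kernargs.
     struct output *out = (struct output *) ka->out_ptr;
     unsigned int slot
       = __atomic_fetch_add (&out->next_output, 1, __ATOMIC_ACQUIRE);
     struct printf_data *entry = &out->queue[slot % 1024];
     entry->type = 2;  // Message plus text payload.
     // ... fill in entry->msg and entry->text ...
     __atomic_store_n (&entry->written, 1, __ATOMIC_RELEASE);
*/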

/* A queue entry for a future asynchronous launch.  */

struct kernel_launch
{
  struct kernel_info *kernel;
  void *vars;
  struct GOMP_kernel_launch_attributes kla;
};

/* A queue entry for a future callback.  */

struct callback
{
  void (*fn) (void *);
  void *data;
};

/* A data struct for the copy_data callback.  */

struct copy_data
{
  void *dst;
  const void *src;
  size_t len;
  bool free_src;
  struct goacc_asyncqueue *aq;
};

/* A queue entry for a placeholder.  Each corresponds to a wait event.  */

struct placeholder
{
  int executed;
  pthread_cond_t cond;
  pthread_mutex_t mutex;
};

/* A queue entry for a wait directive.  */

struct asyncwait_info
{
  struct placeholder *placeholderp;
};

/* Encode the type of an entry in an async queue.  */

enum entry_type
{
  KERNEL_LAUNCH,
  CALLBACK,
  ASYNC_WAIT,
  ASYNC_PLACEHOLDER
};

/* An entry in an async queue.  */

struct queue_entry
{
  enum entry_type type;
  union {
    struct kernel_launch launch;
    struct callback callback;
    struct asyncwait_info asyncwait;
    struct placeholder placeholder;
  } u;
};

/* An async queue header.

   OpenMP may create one of these.
   OpenACC may create many.  */

struct goacc_asyncqueue
{
  struct agent_info *agent;
  hsa_queue_t *hsa_queue;

  pthread_t thread_drain_queue;
  pthread_mutex_t mutex;
  pthread_cond_t queue_cond_in;
  pthread_cond_t queue_cond_out;
  struct queue_entry queue[ASYNC_QUEUE_SIZE];
  int queue_first;
  int queue_n;
  int drain_queue_stop;

  int id;
  struct goacc_asyncqueue *prev;
  struct goacc_asyncqueue *next;
};
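
/* The queue array is used as a ring buffer (a sketch, for illustration):
   entries are appended at (queue_first + queue_n) % ASYNC_QUEUE_SIZE and
   consumed from queue_first, so a producer does, in essence:

     // With aq->mutex held, and after waiting on queue_cond_out while
     // aq->queue_n == ASYNC_QUEUE_SIZE:
     struct queue_entry *entry
       = &aq->queue[(aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE];
     entry->type = KERNEL_LAUNCH;
     // ... fill in entry->u ...
     aq->queue_n++;
     pthread_cond_signal (&aq->queue_cond_in);
*/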

/* Mkoffload uses this structure to describe a kernel.

   OpenMP kernel dimensions are passed at runtime.
   OpenACC kernel dimensions are passed at compile time, here.  */

struct hsa_kernel_description
{
  const char *name;
  int oacc_dims[3];  /* Only present for GCN kernels.  */
};

/* Mkoffload uses this structure to describe an offload variable.  */

struct global_var_info
{
  const char *name;
  void *address;
};

/* Mkoffload uses this structure to describe all the kernels in a
   loadable module.  These are passed to libgomp via static constructors.  */

struct gcn_image_desc
{
  struct gcn_image {
    size_t size;
    void *image;
  } *gcn_image;
  const unsigned kernel_count;
  struct hsa_kernel_description *kernel_infos;
  const unsigned global_variable_count;
  struct global_var_info *global_variables;
};

/* Description of an HSA GPU agent (device) and the program associated with
   it.  */

struct agent_info
{
  /* The HSA ID of the agent.  Assigned when hsa_context is initialized.  */
  hsa_agent_t id;
  /* The user-visible device number.  */
  int device_id;
  /* Whether the agent has been initialized.  The fields below are usable only
     if it has been.  */
  bool initialized;
  /* Precomputed check for problem architectures.  */
  bool gfx900_p;

  /* Command queues of the agent.  */
  hsa_queue_t *sync_queue;
  struct goacc_asyncqueue *async_queues, *omp_async_queue;
  pthread_mutex_t async_queues_mutex;

  /* The HSA memory region from which to allocate kernel arguments.  */
  hsa_region_t kernarg_region;

  /* The HSA memory region from which to allocate device data.  */
  hsa_region_t data_region;

  /* Allocated team arenas.  */
  struct team_arena_list *team_arena_list;
  pthread_mutex_t team_arena_write_lock;

  /* Read-write lock that protects kernels which are running or about to be run
     from interference with loading and unloading of images.  Needs to be
     locked for reading while a kernel is being run, and for writing if the
     list of modules is manipulated (and thus the HSA program invalidated).  */
  pthread_rwlock_t module_rwlock;

  /* The module associated with this agent.  */
  struct module_info *module;

  /* Mutex enforcing that only one thread will finalize the HSA program.  A
     thread should have locked agent->module_rwlock for reading before
     acquiring it.  */
  pthread_mutex_t prog_mutex;
  /* Flag whether the HSA program that consists of all the modules has been
     finalized.  */
  bool prog_finalized;
  /* HSA executable - the finalized program that is used to locate kernels.  */
  hsa_executable_t executable;
};

/* Information required to identify, finalize and run any given kernel.  */

enum offload_kind {KIND_UNKNOWN, KIND_OPENMP, KIND_OPENACC};

struct kernel_info
{
  /* Name of the kernel, required to locate it within the GCN object-code
     module.  */
  const char *name;
  /* The specific agent the kernel has been or will be finalized for and run
     on.  */
  struct agent_info *agent;
  /* The specific module in which the kernel resides.  */
  struct module_info *module;
  /* Mutex enforcing that at most one thread ever initializes a kernel for
     use.  A thread should have locked agent->module_rwlock for reading before
     acquiring it.  */
  pthread_mutex_t init_mutex;
  /* Flag indicating whether the kernel has been initialized and all fields
     below it contain valid data.  */
  bool initialized;
  /* Flag indicating that the kernel has a problem that blocks its
     execution.  */
  bool initialization_failed;
  /* The object to be put into the dispatch queue.  */
  uint64_t object;
  /* Required size of kernel arguments.  */
  uint32_t kernarg_segment_size;
  /* Required size of group segment.  */
  uint32_t group_segment_size;
  /* Required size of private segment.  */
  uint32_t private_segment_size;
  /* Set up for OpenMP or OpenACC?  */
  enum offload_kind kind;
};

/* Information about a particular GCN module, its image and kernels.  */

struct module_info
{
  /* The description with which the program has registered the image.  */
  struct gcn_image_desc *image_desc;
  /* GCN heap allocation.  */
  struct heap *heap;
  /* Physical boundaries of the loaded module.  */
  Elf64_Addr phys_address_start;
  Elf64_Addr phys_address_end;

  bool constructors_run_p;
  struct kernel_info *init_array_func, *fini_array_func;

  /* Number of kernels in this module.  */
  int kernel_count;
  /* An array of kernel_info structures describing each kernel in this
     module.  */
  struct kernel_info kernels[];
};

/* A linked list of memory arenas allocated on the device.
   These are only used by OpenMP, as a means to optimize per-team malloc.  */

struct team_arena_list
{
  struct team_arena_list *next;

  /* The number of teams determines the size of the allocation.  */
  int num_teams;
  /* The device address of the arena itself.  */
  void *arena;
  /* A flag to prevent two asynchronous kernels from trying to use the same
     arena.  The mutex is locked until the kernel exits.  */
  pthread_mutex_t in_use;
};

/* Information about the whole HSA environment and all of its agents.  */

struct hsa_context_info
{
  /* Whether the structure has been initialized.  */
  bool initialized;
  /* Number of usable GPU HSA agents in the system.  */
  int agent_count;
  /* Array of agent_info structures describing the individual HSA agents.  */
  struct agent_info *agents;
};

/* Format of the on-device heap.

   This must match the definition in Newlib and gcn-run.  */

struct heap {
  int64_t size;
  char data[0];
};
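
/* For illustration (editor's sketch, assuming a coarse-grained data region
   selected by get_data_memory_region, below): the heap is a single device
   allocation whose first 8 bytes hold its usable size, so initializing one
   amounts to something like:

     struct heap *heap;
     size_t size = gcn_kernel_heap_size;
     hsa_fns.hsa_memory_allocate_fn (data_region, size + sizeof (struct heap),
                                     (void **) &heap);
     int64_t size_value = size;
     // Device memory may not be host-writable, so copy the size in.
     hsa_fns.hsa_memory_copy_fn (&heap->size, &size_value, sizeof (int64_t));
*/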

/* }}}  */
/* {{{ Global variables  */

/* Information about the whole HSA environment and all of its agents.  */

static struct hsa_context_info hsa_context;

/* HSA runtime functions that are initialized in init_hsa_context.  */

static struct hsa_runtime_fn_info hsa_fns;

/* Heap space, allocated target-side, provided for use of newlib malloc.
   Each module should have its own heap allocated.
   Beware that heap usage increases with OpenMP teams.  See also arenas.  */

static size_t gcn_kernel_heap_size = DEFAULT_GCN_HEAP_SIZE;

/* Flag to decide whether to print information about what is going on to
   stderr.  Set in init_debug depending on environment variables.  */

static bool debug;

/* Flag to decide if the runtime should suppress a possible fallback to host
   execution.  */

static bool suppress_host_fallback;

/* Location of the HSA runtime shared library that is dlopened by this
   plug-in.  */

static const char *hsa_runtime_lib;

/* Flag to decide if the runtime should also support CPU devices (can be
   a simulator).  */

static bool support_cpu_devices;

/* Runtime dimension overrides.  Zero indicates default.  */

static int override_x_dim = 0;
static int override_z_dim = 0;

/* }}}  */
/* {{{ Debug & Diagnostic  */

/* Print a message to stderr if GCN_DEBUG is set in the environment.  */

#define DEBUG_PRINT(...) \
  do \
  { \
    if (debug) \
      { \
	fprintf (stderr, __VA_ARGS__); \
      } \
  } \
  while (false)

/* Flush stderr if GCN_DEBUG is set in the environment.  */

#define DEBUG_FLUSH() \
  do { \
    if (debug) \
      fflush (stderr); \
  } while (false)

/* Print a logging message with PREFIX to stderr if GCN_DEBUG is set in the
   environment.  */

#define DEBUG_LOG(prefix, ...) \
  do \
    { \
      DEBUG_PRINT (prefix); \
      DEBUG_PRINT (__VA_ARGS__); \
      DEBUG_FLUSH (); \
    } while (false)

/* Print a debugging message to stderr.  */

#define GCN_DEBUG(...) DEBUG_LOG ("GCN debug: ", __VA_ARGS__)

/* Print a warning message to stderr.  */

#define GCN_WARNING(...) DEBUG_LOG ("GCN warning: ", __VA_ARGS__)
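
/* Usage is printf-style; for example (illustrative only):

     GCN_DEBUG ("Launching kernel %s on device %d\n", name, device);
     GCN_WARNING ("Unrecognized environment variable %s ignored\n", var);

   Both expand to nothing observable unless GCN_DEBUG is set in the
   environment (see init_environment_variables, below).  */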

/* Print HSA warning STR with an HSA STATUS code.  */

static void
hsa_warn (const char *str, hsa_status_t status)
{
  if (!debug)
    return;

  const char *hsa_error_msg = "[unknown]";
  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);

  fprintf (stderr, "GCN warning: %s\nRuntime message: %s\n", str,
	   hsa_error_msg);
}

/* Report a fatal error STR together with the HSA error corresponding to STATUS
   and terminate execution of the current process.  */

static void
hsa_fatal (const char *str, hsa_status_t status)
{
  const char *hsa_error_msg = "[unknown]";
  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
  GOMP_PLUGIN_fatal ("GCN fatal error: %s\nRuntime message: %s\n", str,
		     hsa_error_msg);
}

/* Like hsa_fatal, except only report the error message and return FALSE,
   propagating error handling to the caller outside the plugin.  */

static bool
hsa_error (const char *str, hsa_status_t status)
{
  const char *hsa_error_msg = "[unknown]";
  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
  GOMP_PLUGIN_error ("GCN fatal error: %s\nRuntime message: %s\n", str,
		     hsa_error_msg);
  return false;
}

/* Dump information about the available hardware.  */

static void
dump_hsa_system_info (void)
{
  hsa_status_t status;

  hsa_endianness_t endianness;
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_ENDIANNESS,
					   &endianness);
  if (status == HSA_STATUS_SUCCESS)
    switch (endianness)
      {
      case HSA_ENDIANNESS_LITTLE:
	GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: LITTLE\n");
	break;
      case HSA_ENDIANNESS_BIG:
	GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: BIG\n");
	break;
      default:
	GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: UNKNOWN\n");
      }
  else
    GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: FAILED\n");

  uint8_t extensions[128];
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_EXTENSIONS,
					   &extensions);
  if (status == HSA_STATUS_SUCCESS)
    {
      if (extensions[0] & (1 << HSA_EXTENSION_IMAGES))
	GCN_DEBUG ("HSA_SYSTEM_INFO_EXTENSIONS: IMAGES\n");
    }
  else
    GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n");
}

/* Dump MACHINE_MODEL, prefixed with the label S.  */

static void
dump_machine_model (hsa_machine_model_t machine_model, const char *s)
{
  switch (machine_model)
    {
    case HSA_MACHINE_MODEL_SMALL:
      GCN_DEBUG ("%s: SMALL\n", s);
      break;
    case HSA_MACHINE_MODEL_LARGE:
      GCN_DEBUG ("%s: LARGE\n", s);
      break;
    default:
      GCN_WARNING ("%s: UNKNOWN\n", s);
      break;
    }
}

/* Dump PROFILE, prefixed with the label S.  */

static void
dump_profile (hsa_profile_t profile, const char *s)
{
  switch (profile)
    {
    case HSA_PROFILE_FULL:
      GCN_DEBUG ("%s: FULL\n", s);
      break;
    case HSA_PROFILE_BASE:
      GCN_DEBUG ("%s: BASE\n", s);
      break;
    default:
      GCN_WARNING ("%s: UNKNOWN\n", s);
      break;
    }
}

/* Dump information about a device memory region.  */

static hsa_status_t
dump_hsa_region (hsa_region_t region, void *data __attribute__((unused)))
{
  hsa_status_t status;

  hsa_region_segment_t segment;
  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
					   &segment);
  if (status == HSA_STATUS_SUCCESS)
    {
      if (segment == HSA_REGION_SEGMENT_GLOBAL)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GLOBAL\n");
      else if (segment == HSA_REGION_SEGMENT_READONLY)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: READONLY\n");
      else if (segment == HSA_REGION_SEGMENT_PRIVATE)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: PRIVATE\n");
      else if (segment == HSA_REGION_SEGMENT_GROUP)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GROUP\n");
      else
	GCN_WARNING ("HSA_REGION_INFO_SEGMENT: UNKNOWN\n");
    }
  else
    GCN_WARNING ("HSA_REGION_INFO_SEGMENT: FAILED\n");

  if (status == HSA_STATUS_SUCCESS && segment == HSA_REGION_SEGMENT_GLOBAL)
    {
      uint32_t flags;
      status
	= hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
					  &flags);
      if (status == HSA_STATUS_SUCCESS)
	{
	  if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: KERNARG\n");
	  if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED)
	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: FINE_GRAINED\n");
	  if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED)
	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: COARSE_GRAINED\n");
	}
      else
	GCN_WARNING ("HSA_REGION_INFO_GLOBAL_FLAGS: FAILED\n");
    }

  size_t size;
  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_SIZE: %zu\n", size);
  else
    GCN_WARNING ("HSA_REGION_INFO_SIZE: FAILED\n");

  status
    = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_ALLOC_MAX_SIZE,
				      &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_ALLOC_MAX_SIZE: %zu\n", size);
  else
    GCN_WARNING ("HSA_REGION_INFO_ALLOC_MAX_SIZE: FAILED\n");

  bool alloc_allowed;
  status
    = hsa_fns.hsa_region_get_info_fn (region,
				      HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED,
				      &alloc_allowed);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: %u\n", alloc_allowed);
  else
    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: FAILED\n");

  if (status != HSA_STATUS_SUCCESS || !alloc_allowed)
    return HSA_STATUS_SUCCESS;

  status
    = hsa_fns.hsa_region_get_info_fn (region,
				      HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE,
				      &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: %zu\n", size);
  else
    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: FAILED\n");

  size_t align;
  status
    = hsa_fns.hsa_region_get_info_fn (region,
				      HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT,
				      &align);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: %zu\n", align);
  else
    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: FAILED\n");

  return HSA_STATUS_SUCCESS;
}

/* Dump information about all the device memory regions.  */

static void
dump_hsa_regions (hsa_agent_t agent)
{
  hsa_status_t status;
  status = hsa_fns.hsa_agent_iterate_regions_fn (agent,
						 dump_hsa_region,
						 NULL);
  if (status != HSA_STATUS_SUCCESS)
    hsa_error ("Dumping hsa regions failed", status);
}

/* Dump information about the available devices.  */

static hsa_status_t
dump_hsa_agent_info (hsa_agent_t agent, void *data __attribute__((unused)))
{
  hsa_status_t status;

  char buf[64];
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME,
					  &buf);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_NAME: %s\n", buf);
  else
    GCN_WARNING ("HSA_AGENT_INFO_NAME: FAILED\n");

  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_VENDOR_NAME,
					  &buf);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_VENDOR_NAME: %s\n", buf);
  else
    GCN_WARNING ("HSA_AGENT_INFO_VENDOR_NAME: FAILED\n");

  hsa_machine_model_t machine_model;
  status
    = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_MACHINE_MODEL,
				     &machine_model);
  if (status == HSA_STATUS_SUCCESS)
    dump_machine_model (machine_model, "HSA_AGENT_INFO_MACHINE_MODEL");
  else
    GCN_WARNING ("HSA_AGENT_INFO_MACHINE_MODEL: FAILED\n");

  hsa_profile_t profile;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_PROFILE,
					  &profile);
  if (status == HSA_STATUS_SUCCESS)
    dump_profile (profile, "HSA_AGENT_INFO_PROFILE");
  else
    GCN_WARNING ("HSA_AGENT_INFO_PROFILE: FAILED\n");

  hsa_device_type_t device_type;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
					  &device_type);
  if (status == HSA_STATUS_SUCCESS)
    {
      switch (device_type)
	{
	case HSA_DEVICE_TYPE_CPU:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: CPU\n");
	  break;
	case HSA_DEVICE_TYPE_GPU:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: GPU\n");
	  break;
	case HSA_DEVICE_TYPE_DSP:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: DSP\n");
	  break;
	default:
	  GCN_WARNING ("HSA_AGENT_INFO_DEVICE: UNKNOWN\n");
	  break;
	}
    }
  else
    GCN_WARNING ("HSA_AGENT_INFO_DEVICE: FAILED\n");

  uint32_t cu_count;
  status = hsa_fns.hsa_agent_get_info_fn
    (agent, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: %u\n", cu_count);
  else
    GCN_WARNING ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: FAILED\n");

  uint32_t size;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_WAVEFRONT_SIZE,
					  &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WAVEFRONT_SIZE: %u\n", size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WAVEFRONT_SIZE: FAILED\n");

  uint32_t max_dim;
  status = hsa_fns.hsa_agent_get_info_fn (agent,
					  HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
					  &max_dim);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: %u\n", max_dim);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: FAILED\n");

  uint32_t max_size;
  status = hsa_fns.hsa_agent_get_info_fn (agent,
					  HSA_AGENT_INFO_WORKGROUP_MAX_SIZE,
					  &max_size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: %u\n", max_size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: FAILED\n");

  uint32_t grid_max_dim;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_DIM,
					  &grid_max_dim);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_DIM: %u\n", grid_max_dim);
  else
    GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_DIM: FAILED\n");

  uint32_t grid_max_size;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_SIZE,
					  &grid_max_size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_SIZE: %u\n", grid_max_size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_SIZE: FAILED\n");

  dump_hsa_regions (agent);

  return HSA_STATUS_SUCCESS;
}

/* Forward reference.  */

static char *get_executable_symbol_name (hsa_executable_symbol_t symbol);

/* Helper function for dump_executable_symbols.  */

static hsa_status_t
dump_executable_symbol (hsa_executable_t executable,
			hsa_executable_symbol_t symbol,
			void *data __attribute__((unused)))
{
  char *name = get_executable_symbol_name (symbol);

  if (name)
    {
      GCN_DEBUG ("executable symbol: %s\n", name);
      free (name);
    }

  return HSA_STATUS_SUCCESS;
}

/* Dump all global symbols in an executable.  */

static void
dump_executable_symbols (hsa_executable_t executable)
{
  hsa_status_t status;
  status
    = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
						 dump_executable_symbol,
						 NULL);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not dump HSA executable symbols", status);
}

/* Dump kernel DISPATCH data structure and indent it by INDENT spaces.  */

static void
print_kernel_dispatch (struct kernel_dispatch *dispatch, unsigned indent)
{
  struct kernargs *kernargs = (struct kernargs *)dispatch->kernarg_address;

  fprintf (stderr, "%*sthis: %p\n", indent, "", dispatch);
  fprintf (stderr, "%*squeue: %p\n", indent, "", dispatch->queue);
  fprintf (stderr, "%*skernarg_address: %p\n", indent, "", kernargs);
  fprintf (stderr, "%*sheap address: %p\n", indent, "",
	   (void*)kernargs->heap_ptr);
  fprintf (stderr, "%*sarena address: %p\n", indent, "",
	   (void*)kernargs->arena_ptr);
  fprintf (stderr, "%*sobject: %lu\n", indent, "", dispatch->object);
  fprintf (stderr, "%*sprivate_segment_size: %u\n", indent, "",
	   dispatch->private_segment_size);
  fprintf (stderr, "%*sgroup_segment_size: %u\n", indent, "",
	   dispatch->group_segment_size);
  fprintf (stderr, "\n");
}

/* }}}  */
/* {{{ Utility functions  */

/* Cast the thread local storage to gcn_thread.  */

static inline struct gcn_thread *
gcn_thread (void)
{
  return (struct gcn_thread *) GOMP_PLUGIN_acc_thread ();
}

/* Initialize debug and suppress_host_fallback according to the environment,
   along with the other environment-controlled options.  */

static void
init_environment_variables (void)
{
  if (secure_getenv ("GCN_DEBUG"))
    debug = true;
  else
    debug = false;

  if (secure_getenv ("GCN_SUPPRESS_HOST_FALLBACK"))
    suppress_host_fallback = true;
  else
    suppress_host_fallback = false;

  hsa_runtime_lib = secure_getenv ("HSA_RUNTIME_LIB");
  if (hsa_runtime_lib == NULL)
    hsa_runtime_lib = HSA_RUNTIME_LIB "libhsa-runtime64.so";

  support_cpu_devices = secure_getenv ("GCN_SUPPORT_CPU_DEVICES");

  const char *x = secure_getenv ("GCN_NUM_TEAMS");
  if (!x)
    x = secure_getenv ("GCN_NUM_GANGS");
  if (x)
    override_x_dim = atoi (x);

  const char *z = secure_getenv ("GCN_NUM_THREADS");
  if (!z)
    z = secure_getenv ("GCN_NUM_WORKERS");
  if (z)
    override_z_dim = atoi (z);

  const char *heap = secure_getenv ("GCN_HEAP_SIZE");
  if (heap)
    {
      size_t tmp = atol (heap);
      if (tmp)
	gcn_kernel_heap_size = tmp;
    }
}

/* Return malloc'd string with name of SYMBOL.  */

static char *
get_executable_symbol_name (hsa_executable_symbol_t symbol)
{
  hsa_status_t status;
  char *res;
  uint32_t len;
  const hsa_executable_symbol_info_t info_name_length
    = HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH;

  status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name_length,
						      &len);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not get length of symbol name", status);
      return NULL;
    }

  res = GOMP_PLUGIN_malloc (len + 1);

  const hsa_executable_symbol_info_t info_name
    = HSA_EXECUTABLE_SYMBOL_INFO_NAME;

  status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name, res);

  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not get symbol name", status);
      free (res);
      return NULL;
    }

  res[len] = '\0';

  return res;
}

/* Helper function for find_executable_symbol.  */

static hsa_status_t
find_executable_symbol_1 (hsa_executable_t executable,
			  hsa_executable_symbol_t symbol,
			  void *data)
{
  hsa_executable_symbol_t *res = (hsa_executable_symbol_t *)data;
  *res = symbol;
  return HSA_STATUS_INFO_BREAK;
}

/* Find a global symbol in EXECUTABLE, save to *SYMBOL and return true.  If not
   found, return false.  */

static bool
find_executable_symbol (hsa_executable_t executable,
			hsa_executable_symbol_t *symbol)
{
  hsa_status_t status;

  status
    = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
						 find_executable_symbol_1,
						 symbol);
  if (status != HSA_STATUS_INFO_BREAK)
    {
      hsa_error ("Could not find executable symbol", status);
      return false;
    }

  return true;
}

/* Get the number of GPU Compute Units.  */

static int
get_cu_count (struct agent_info *agent)
{
  uint32_t cu_count;
  hsa_status_t status = hsa_fns.hsa_agent_get_info_fn
    (agent->id, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
  if (status == HSA_STATUS_SUCCESS)
    return cu_count;
  else
    return 64;  /* The usual number for older devices.  */
}

/* Calculate the maximum grid size for OMP threads / OACC workers.
   This depends on the kernel's resource usage levels.  */

static int
limit_worker_threads (int threads)
{
  /* FIXME Do something more intelligent here.
     GCN can always run 4 threads within a Compute Unit, but
     more than that depends on register usage.  */
  if (threads > 16)
    threads = 16;
  return threads;
}

/* Parse the target attributes INPUT provided by the compiler and return true
   if we should run anything at all.  If INPUT is NULL, fill DEF with default
   values, then store INPUT or DEF into *RESULT.

   This is used for OpenMP only.  */

static bool
parse_target_attributes (void **input,
			 struct GOMP_kernel_launch_attributes *def,
			 struct GOMP_kernel_launch_attributes **result,
			 struct agent_info *agent)
{
  if (!input)
    GOMP_PLUGIN_fatal ("No target arguments provided");

  bool grid_attrs_found = false;
  bool gcn_dims_found = false;
  int gcn_teams = 0;
  int gcn_threads = 0;
  while (*input)
    {
      intptr_t id = (intptr_t) *input++, val;

      if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
	val = (intptr_t) *input++;
      else
	val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;

      val = (val > INT_MAX) ? INT_MAX : val;

      if ((id & GOMP_TARGET_ARG_DEVICE_MASK) == GOMP_DEVICE_GCN
	  && ((id & GOMP_TARGET_ARG_ID_MASK)
	      == GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES))
	{
	  grid_attrs_found = true;
	  break;
	}
      else if ((id & GOMP_TARGET_ARG_DEVICE_ALL) == GOMP_TARGET_ARG_DEVICE_ALL)
	{
	  gcn_dims_found = true;
	  switch (id & GOMP_TARGET_ARG_ID_MASK)
	    {
	    case GOMP_TARGET_ARG_NUM_TEAMS:
	      gcn_teams = val;
	      break;
	    case GOMP_TARGET_ARG_THREAD_LIMIT:
	      gcn_threads = limit_worker_threads (val);
	      break;
	    default:
	      ;
	    }
	}
    }

  if (gcn_dims_found)
    {
      if (agent->gfx900_p && gcn_threads == 0 && override_z_dim == 0)
	{
	  gcn_threads = 4;
	  GCN_WARNING ("VEGA BUG WORKAROUND: reducing default number of "
		       "threads to 4 per team.\n");
	  GCN_WARNING (" - If this is not a Vega 10 device, please use "
		       "GCN_NUM_THREADS=16\n");
	}

      def->ndim = 3;
      /* Fiji has 64 CUs, but Vega20 has 60.  */
      def->gdims[0] = (gcn_teams > 0) ? gcn_teams : get_cu_count (agent);
      /* Each thread is 64 work items wide.  */
      def->gdims[1] = 64;
      /* A work group can have 16 wavefronts.  */
      def->gdims[2] = (gcn_threads > 0) ? gcn_threads : 16;
      def->wdims[0] = 1;  /* Single team per work-group.  */
      def->wdims[1] = 64;
      def->wdims[2] = 16;
      *result = def;
      return true;
    }
  else if (!grid_attrs_found)
    {
      def->ndim = 1;
      def->gdims[0] = 1;
      def->gdims[1] = 1;
      def->gdims[2] = 1;
      def->wdims[0] = 1;
      def->wdims[1] = 1;
      def->wdims[2] = 1;
      *result = def;
      GCN_WARNING ("GOMP_OFFLOAD_run called with no launch attributes\n");
      return true;
    }

  struct GOMP_kernel_launch_attributes *kla;
  kla = (struct GOMP_kernel_launch_attributes *) *input;
  *result = kla;
  if (kla->ndim == 0 || kla->ndim > 3)
    GOMP_PLUGIN_fatal ("Invalid number of dimensions (%u)", kla->ndim);

  GCN_DEBUG ("GOMP_OFFLOAD_run called with %u dimensions:\n", kla->ndim);
  unsigned i;
  for (i = 0; i < kla->ndim; i++)
    {
      GCN_DEBUG (" Dimension %u: grid size %u and group size %u\n", i,
		 kla->gdims[i], kla->wdims[i]);
      if (kla->gdims[i] == 0)
	return false;
    }
  return true;
}
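
/* To make the default OpenMP geometry concrete (editor's example, assuming a
   64-CU device and no environment overrides): gdims = {64, 64, 16} and
   wdims = {1, 64, 16} give one work-group per team, each 64 lanes wide and
   16 wavefronts deep:

     int teams = 64;               // gdims[0]: one team per Compute Unit.
     int total = teams * 64 * 16;  // 65536 work items in total.
     int per_group = 1 * 64 * 16;  // 1024 work items per work-group.
*/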

/* Return the group size given the requested GROUP size, GRID size and number
   of grid dimensions NDIM.  */

static uint32_t
get_group_size (uint32_t ndim, uint32_t grid, uint32_t group)
{
  if (group == 0)
    {
      /* TODO: Provide a default via environment or device characteristics.  */
      if (ndim == 1)
	group = 64;
      else if (ndim == 2)
	group = 8;
      else
	group = 4;
    }

  if (group > grid)
    group = grid;
  return group;
}
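
/* For example (illustrative): a one-dimensional launch of 1000 work items
   with no explicit group size gets the default of 64, and a request that
   exceeds the grid is clamped:

     get_group_size (1, 1000, 0);  // returns 64: the 1-D default.
     get_group_size (2, 4, 0);     // returns 4: default 8, clamped to grid.
     get_group_size (1, 32, 128);  // returns 32: group clamped to grid.
*/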

/* Atomically store pair of uint16_t values (HEADER and REST) to a PACKET.  */

static void
packet_store_release (uint32_t* packet, uint16_t header, uint16_t rest)
{
  __atomic_store_n (packet, header | (rest << 16), __ATOMIC_RELEASE);
}
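
/* A sketch of how this is used when dispatching (the field and enum names
   are from the standard HSA headers; the real call site appears later in
   this file): the first 32 bits of an AQL packet hold the 16-bit header and
   the 16-bit setup word, and must become visible atomically once the rest
   of the packet has been written:

     uint16_t header = HSA_PACKET_TYPE_KERNEL_DISPATCH
		       << HSA_PACKET_HEADER_TYPE;
     header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
     header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
     uint16_t setup = (uint16_t) kla->ndim
		      << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
     packet_store_release ((uint32_t *) packet, header, setup);
*/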

/* A never-called callback for the HSA command queues.  These signal events
   that we don't use, so we trigger an error.

   This "queue" is not to be confused with the async queues, below.  */

static void
hsa_queue_callback (hsa_status_t status,
		    hsa_queue_t *queue __attribute__ ((unused)),
		    void *data __attribute__ ((unused)))
{
  hsa_fatal ("Asynchronous queue error", status);
}

/* }}}  */
/* {{{ HSA initialization  */

/* Populate hsa_fns with the function addresses from libhsa-runtime64.so.  */

static bool
init_hsa_runtime_functions (void)
{
#define DLSYM_FN(function) \
  hsa_fns.function##_fn = dlsym (handle, #function); \
  if (hsa_fns.function##_fn == NULL) \
    return false;
  void *handle = dlopen (hsa_runtime_lib, RTLD_LAZY);
  if (handle == NULL)
    return false;

  DLSYM_FN (hsa_status_string)
  DLSYM_FN (hsa_system_get_info)
  DLSYM_FN (hsa_agent_get_info)
  DLSYM_FN (hsa_isa_get_info)
  DLSYM_FN (hsa_init)
  DLSYM_FN (hsa_iterate_agents)
  DLSYM_FN (hsa_region_get_info)
  DLSYM_FN (hsa_queue_create)
  DLSYM_FN (hsa_agent_iterate_regions)
  DLSYM_FN (hsa_executable_destroy)
  DLSYM_FN (hsa_executable_create)
  DLSYM_FN (hsa_executable_global_variable_define)
  DLSYM_FN (hsa_executable_load_code_object)
  DLSYM_FN (hsa_executable_freeze)
  DLSYM_FN (hsa_signal_create)
  DLSYM_FN (hsa_memory_allocate)
  DLSYM_FN (hsa_memory_assign_agent)
  DLSYM_FN (hsa_memory_copy)
  DLSYM_FN (hsa_memory_free)
  DLSYM_FN (hsa_signal_destroy)
  DLSYM_FN (hsa_executable_get_symbol)
  DLSYM_FN (hsa_executable_symbol_get_info)
  DLSYM_FN (hsa_executable_iterate_symbols)
  DLSYM_FN (hsa_queue_add_write_index_release)
  DLSYM_FN (hsa_queue_load_read_index_acquire)
  DLSYM_FN (hsa_signal_wait_acquire)
  DLSYM_FN (hsa_signal_store_relaxed)
  DLSYM_FN (hsa_signal_store_release)
  DLSYM_FN (hsa_signal_load_acquire)
  DLSYM_FN (hsa_queue_destroy)
  DLSYM_FN (hsa_code_object_deserialize)
  return true;
#undef DLSYM_FN
}
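
/* For clarity (illustrative): each DLSYM_FN (foo) above expands to

     hsa_fns.foo_fn = dlsym (handle, "foo");
     if (hsa_fns.foo_fn == NULL)
       return false;

   so a single missing symbol in the runtime library makes initialization
   fail, and the caller falls back to host execution (unless suppressed).  */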

/* Return true if the agent is a GPU and can accept concurrent submissions
   from different threads.  */

static bool
suitable_hsa_agent_p (hsa_agent_t agent)
{
  hsa_device_type_t device_type;
  hsa_status_t status
    = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
				     &device_type);
  if (status != HSA_STATUS_SUCCESS)
    return false;

  switch (device_type)
    {
    case HSA_DEVICE_TYPE_GPU:
      break;
    case HSA_DEVICE_TYPE_CPU:
      if (!support_cpu_devices)
	return false;
      break;
    default:
      return false;
    }

  uint32_t features = 0;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_FEATURE,
					  &features);
  if (status != HSA_STATUS_SUCCESS
      || !(features & HSA_AGENT_FEATURE_KERNEL_DISPATCH))
    return false;
  hsa_queue_type_t queue_type;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_QUEUE_TYPE,
					  &queue_type);
  if (status != HSA_STATUS_SUCCESS
      || (queue_type != HSA_QUEUE_TYPE_MULTI))
    return false;

  return true;
}

/* Callback of hsa_iterate_agents; if AGENT is a GPU device, increment
   agent_count in hsa_context.  */

static hsa_status_t
count_gpu_agents (hsa_agent_t agent, void *data __attribute__ ((unused)))
{
  if (suitable_hsa_agent_p (agent))
    hsa_context.agent_count++;
  return HSA_STATUS_SUCCESS;
}

/* Callback of hsa_iterate_agents; if AGENT is a GPU device, assign the agent
   id to the describing structure in the hsa context.  DATA points to the
   index of that structure; increment it afterwards.  */

static hsa_status_t
assign_agent_ids (hsa_agent_t agent, void *data)
{
  if (suitable_hsa_agent_p (agent))
    {
      int *agent_index = (int *) data;
      hsa_context.agents[*agent_index].id = agent;
      ++*agent_index;
    }
  return HSA_STATUS_SUCCESS;
}

/* Initialize hsa_context if it has not already been done.
   Return TRUE on success.  */

static bool
init_hsa_context (void)
{
  hsa_status_t status;
  int agent_index = 0;

  if (hsa_context.initialized)
    return true;
  init_environment_variables ();
  if (!init_hsa_runtime_functions ())
    {
      GCN_WARNING ("Run-time could not be dynamically opened\n");
      if (suppress_host_fallback)
	GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
      return false;
    }
  status = hsa_fns.hsa_init_fn ();
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Run-time could not be initialized", status);
  GCN_DEBUG ("HSA run-time initialized for GCN\n");

  if (debug)
    dump_hsa_system_info ();

  status = hsa_fns.hsa_iterate_agents_fn (count_gpu_agents, NULL);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("GCN GPU devices could not be enumerated", status);
  GCN_DEBUG ("There are %i GCN GPU devices.\n", hsa_context.agent_count);

  hsa_context.agents
    = GOMP_PLUGIN_malloc_cleared (hsa_context.agent_count
				  * sizeof (struct agent_info));
  status = hsa_fns.hsa_iterate_agents_fn (assign_agent_ids, &agent_index);
  if (agent_index != hsa_context.agent_count)
    {
      GOMP_PLUGIN_error ("Failed to assign IDs to all GCN agents");
      return false;
    }

  if (debug)
    {
      status = hsa_fns.hsa_iterate_agents_fn (dump_hsa_agent_info, NULL);
      if (status != HSA_STATUS_SUCCESS)
	GOMP_PLUGIN_error ("Failed to list all HSA runtime agents");
    }

  hsa_context.initialized = true;
  return true;
}

/* Verify that hsa_context has already been initialized and return the
   agent_info structure describing device number N.  Return NULL on error.  */

static struct agent_info *
get_agent_info (int n)
{
  if (!hsa_context.initialized)
    {
      GOMP_PLUGIN_error ("Attempt to use uninitialized GCN context.");
      return NULL;
    }
  if (n >= hsa_context.agent_count)
    {
      GOMP_PLUGIN_error ("Request to operate on non-existent GCN device %i", n);
      return NULL;
    }
  if (!hsa_context.agents[n].initialized)
    {
      GOMP_PLUGIN_error ("Attempt to use an uninitialized GCN agent.");
      return NULL;
    }
  return &hsa_context.agents[n];
}

/* Callback of hsa_agent_iterate_regions, via get_*_memory_region functions.

   Selects (breaks at) a suitable region of type KIND.  */

static hsa_status_t
get_memory_region (hsa_region_t region, hsa_region_t *retval,
		   hsa_region_global_flag_t kind)
{
  hsa_status_t status;
  hsa_region_segment_t segment;

  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
					   &segment);
  if (status != HSA_STATUS_SUCCESS)
    return status;
  if (segment != HSA_REGION_SEGMENT_GLOBAL)
    return HSA_STATUS_SUCCESS;

  uint32_t flags;
  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
					   &flags);
  if (status != HSA_STATUS_SUCCESS)
    return status;
  if (flags & kind)
    {
      *retval = region;
      return HSA_STATUS_INFO_BREAK;
    }
  return HSA_STATUS_SUCCESS;
}

/* Callback of hsa_agent_iterate_regions.

   Selects a kernargs memory region.  */

static hsa_status_t
get_kernarg_memory_region (hsa_region_t region, void *data)
{
  return get_memory_region (region, (hsa_region_t *)data,
			    HSA_REGION_GLOBAL_FLAG_KERNARG);
}

/* Callback of hsa_agent_iterate_regions.

   Selects a coarse-grained memory region suitable for the heap and
   offload data.  */

static hsa_status_t
get_data_memory_region (hsa_region_t region, void *data)
{
  return get_memory_region (region, (hsa_region_t *)data,
			    HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED);
}
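
/* For illustration (editor's sketch; the real calls appear in the device
   initialization code): the callbacks above are passed to
   hsa_agent_iterate_regions, and HSA_STATUS_INFO_BREAK indicates that a
   suitable region was found and stored through the data pointer:

     hsa_region_t kernarg_region;
     hsa_status_t status
       = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
					       get_kernarg_memory_region,
					       &kernarg_region);
     if (status != HSA_STATUS_INFO_BREAK)
       hsa_error ("No suitable kernarg region found", status);  // Sketch.
*/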

/* }}}  */
/* {{{ Run  */

/* Create or reuse a team arena.

   Team arenas are used by OpenMP to avoid calling malloc multiple times
   while setting up each team.  This is purely a performance optimization.

   Allocating an arena also costs performance, albeit on the host side, so
   this function will reuse an existing arena if a large enough one is idle.
   The arena is released, but not deallocated, when the kernel exits.  */

static void *
get_team_arena (struct agent_info *agent, int num_teams)
{
  struct team_arena_list **next_ptr = &agent->team_arena_list;
  struct team_arena_list *item;

  for (item = *next_ptr; item; next_ptr = &item->next, item = item->next)
    {
      if (item->num_teams < num_teams)
	continue;

      if (pthread_mutex_trylock (&item->in_use))
	continue;

      return item->arena;
    }

  GCN_DEBUG ("Creating a new arena for %d teams\n", num_teams);

  if (pthread_mutex_lock (&agent->team_arena_write_lock))
    {
      GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
      return NULL;
    }
  item = malloc (sizeof (*item));
  item->num_teams = num_teams;
  item->next = NULL;
  *next_ptr = item;

  if (pthread_mutex_init (&item->in_use, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
      return NULL;
    }
  if (pthread_mutex_lock (&item->in_use))
    {
      GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
      return NULL;
    }
  if (pthread_mutex_unlock (&agent->team_arena_write_lock))
    {
      GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
      return NULL;
    }

  const int TEAM_ARENA_SIZE = 64*1024;  /* Must match libgomp.h.  */
  hsa_status_t status;
  status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
					   TEAM_ARENA_SIZE*num_teams,
					   &item->arena);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not allocate memory for GCN kernel arena", status);
  status = hsa_fns.hsa_memory_assign_agent_fn (item->arena, agent->id,
					       HSA_ACCESS_PERMISSION_RW);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not assign arena memory to device", status);

  return item->arena;
}

/* Mark a team arena available for reuse.  */

static void
release_team_arena (struct agent_info* agent, void *arena)
{
  struct team_arena_list *item;

  for (item = agent->team_arena_list; item; item = item->next)
    {
      if (item->arena == arena)
	{
	  if (pthread_mutex_unlock (&item->in_use))
	    GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
	  return;
	}
    }
  GOMP_PLUGIN_error ("Could not find a GCN arena to release.");
}

/* Clean up all the allocated team arenas.  */

static bool
destroy_team_arenas (struct agent_info *agent)
{
  struct team_arena_list *item, *next;

  for (item = agent->team_arena_list; item; item = next)
    {
      next = item->next;
      hsa_fns.hsa_memory_free_fn (item->arena);
      if (pthread_mutex_destroy (&item->in_use))
	{
	  GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex");
	  return false;
	}
      free (item);
    }
  agent->team_arena_list = NULL;

  return true;
}
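
/* Putting the three functions together (editor's sketch): a kernel launch
   with NUM_TEAMS teams borrows an arena before the dispatch and returns it
   when the kernel has finished; destroy_team_arenas only runs when the
   device is finalized:

     void *arena = get_team_arena (agent, num_teams);
     // ... store the pointer in kernargs->arena_ptr and run the kernel ...
     release_team_arena (agent, arena);
*/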

/* Allocate memory on a specified device.  */

static void *
alloc_by_agent (struct agent_info *agent, size_t size)
{
  GCN_DEBUG ("Allocating %zu bytes on device %d\n", size, agent->device_id);

  /* Zero-size allocations are invalid, so in order to return a valid pointer
     we need to pass a valid size.  One source of zero-size allocations is
     kernargs for kernels that have no inputs or outputs (the kernel may
     only use console output, for example).  */
  if (size == 0)
    size = 4;

  void *ptr;
  hsa_status_t status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
							size, &ptr);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not allocate device memory", status);
      return NULL;
    }

  status = hsa_fns.hsa_memory_assign_agent_fn (ptr, agent->id,
					       HSA_ACCESS_PERMISSION_RW);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not assign data memory to device", status);
      return NULL;
    }

  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  bool profiling_dispatch_p
    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
  if (profiling_dispatch_p)
    {
      acc_prof_info *prof_info = thr->prof_info;
      acc_event_info data_event_info;
      acc_api_info *api_info = thr->api_info;

      prof_info->event_type = acc_ev_alloc;

      data_event_info.data_event.event_type = prof_info->event_type;
      data_event_info.data_event.valid_bytes
	= _ACC_DATA_EVENT_INFO_VALID_BYTES;
      data_event_info.data_event.parent_construct
	= acc_construct_parallel;
      data_event_info.data_event.implicit = 1;
      data_event_info.data_event.tool_info = NULL;
      data_event_info.data_event.var_name = NULL;
      data_event_info.data_event.bytes = size;
      data_event_info.data_event.host_ptr = NULL;
      data_event_info.data_event.device_ptr = (void *) ptr;

      api_info->device_api = acc_device_api_other;

      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
					    api_info);
    }

  return ptr;
}

/* Create kernel dispatch data structure for given KERNEL, along with
   the necessary device signals and memory allocations.  */

static struct kernel_dispatch *
create_kernel_dispatch (struct kernel_info *kernel, int num_teams)
{
  struct agent_info *agent = kernel->agent;
  struct kernel_dispatch *shadow
    = GOMP_PLUGIN_malloc_cleared (sizeof (struct kernel_dispatch));

  shadow->agent = kernel->agent;
  shadow->object = kernel->object;

  hsa_signal_t sync_signal;
  hsa_status_t status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &sync_signal);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Error creating the GCN sync signal", status);

  shadow->signal = sync_signal.handle;
  shadow->private_segment_size = kernel->private_segment_size;
  shadow->group_segment_size = kernel->group_segment_size;

  /* We expect kernels to request a single pointer, explicitly, and the
     rest of struct kernargs, implicitly.  If they request anything else
     then something is wrong.  */
  if (kernel->kernarg_segment_size > 8)
    {
      GOMP_PLUGIN_fatal ("Unexpectedly large kernargs segment requested");
      return NULL;
    }

  status = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region,
					   sizeof (struct kernargs),
					   &shadow->kernarg_address);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not allocate memory for GCN kernel arguments", status);
  struct kernargs *kernargs = shadow->kernarg_address;

  /* Zero-initialize the output_data (minimum needed).  */
  kernargs->out_ptr = (int64_t)&kernargs->output_data;
  kernargs->output_data.next_output = 0;
  for (unsigned i = 0;
       i < (sizeof (kernargs->output_data.queue)
	    / sizeof (kernargs->output_data.queue[0]));
       i++)
    kernargs->output_data.queue[i].written = 0;
  kernargs->output_data.consumed = 0;

  /* Pass in the heap location.  */
  kernargs->heap_ptr = (int64_t)kernel->module->heap;

  /* Create an arena.  */
  if (kernel->kind == KIND_OPENMP)
    kernargs->arena_ptr = (int64_t)get_team_arena (agent, num_teams);
  else
    kernargs->arena_ptr = 0;

  /* Ensure we can recognize unset return values.  */
  kernargs->output_data.return_value = 0xcafe0000;

  return shadow;
}
1820
1821 /* Output any data written to console output from the kernel. It is expected
1822 that this function is polled during kernel execution.
1823
1824 We print all entries from the last item printed to the next entry without
1825 a "written" flag. If the "final" flag is set then it'll continue right to
1826 the end.
1827
1828 The print buffer is circular, but the from and to locations don't wrap when
1829 the buffer does, so the output limit is UINT_MAX. The target blocks on
1830 output when the buffer is full. */
1831
1832 static void
1833 console_output (struct kernel_info *kernel, struct kernargs *kernargs,
1834 bool final)
1835 {
1836 unsigned int limit = (sizeof (kernargs->output_data.queue)
1837 / sizeof (kernargs->output_data.queue[0]));
1838
1839 unsigned int from = __atomic_load_n (&kernargs->output_data.consumed,
1840 __ATOMIC_ACQUIRE);
1841 unsigned int to = kernargs->output_data.next_output;
1842
1843 if (from > to)
1844 {
1845 /* Overflow. */
1846 if (final)
1847 printf ("GCN print buffer overflowed.\n");
1848 return;
1849 }
1850
1851 unsigned int i;
1852 for (i = from; i < to; i++)
1853 {
1854 struct printf_data *data = &kernargs->output_data.queue[i%limit];
1855
1856 if (!data->written && !final)
1857 break;
1858
1859 switch (data->type)
1860 {
1861 case 0: printf ("%.128s%ld\n", data->msg, data->ivalue); break;
1862 case 1: printf ("%.128s%f\n", data->msg, data->dvalue); break;
1863 case 2: printf ("%.128s%.128s\n", data->msg, data->text); break;
1864 case 3: printf ("%.128s%.128s", data->msg, data->text); break;
1865 default: printf ("GCN print buffer error!\n"); break;
1866 }
1867 data->written = 0;
1868 __atomic_store_n (&kernargs->output_data.consumed, i+1,
1869 __ATOMIC_RELEASE);
1870 }
1871 fflush (stdout);
1872 }
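/* For reference, the device-side producer presumably mirrors the consumer
   above (a sketch only; the real code lives in the GCN target runtime,
   not in this file):

     // Reserve the next slot in the circular buffer.
     unsigned int slot = __atomic_fetch_add (&output_data->next_output, 1,
                                             __ATOMIC_ACQUIRE);
     struct printf_data *entry = &output_data->queue[slot % limit];

     // Block while the host has not yet consumed this slot.
     while (slot - output_data->consumed >= limit)
       ;

     entry->type = 0;                 // an integer entry, say
     // ... fill in entry->msg and entry->ivalue ...
     __atomic_store_n (&entry->written, 1, __ATOMIC_RELEASE);

   The host loop above prints entries in order and advances "consumed",
   which unblocks a device writer waiting on a full buffer.  */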
1873
1874 /* Release data structure created for a kernel dispatch in SHADOW argument,
1875 and clean up the signal and memory allocations. */
1876
1877 static void
1878 release_kernel_dispatch (struct kernel_dispatch *shadow)
1879 {
1880 GCN_DEBUG ("Released kernel dispatch: %p\n", shadow);
1881
1882 struct kernargs *kernargs = shadow->kernarg_address;
1883 void *arena = (void *)kernargs->arena_ptr;
1884 if (arena)
1885 release_team_arena (shadow->agent, arena);
1886
1887 hsa_fns.hsa_memory_free_fn (shadow->kernarg_address);
1888
1889 hsa_signal_t s;
1890 s.handle = shadow->signal;
1891 hsa_fns.hsa_signal_destroy_fn (s);
1892
1893 free (shadow);
1894 }
1895
1896 /* Extract the properties from a kernel binary. */
1897
1898 static void
1899 init_kernel_properties (struct kernel_info *kernel)
1900 {
1901 hsa_status_t status;
1902 struct agent_info *agent = kernel->agent;
1903 hsa_executable_symbol_t kernel_symbol;
1904 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
1905 kernel->name, agent->id,
1906 0, &kernel_symbol);
1907 if (status != HSA_STATUS_SUCCESS)
1908 {
1909 hsa_warn ("Could not find symbol for kernel in the code object", status);
1910 fprintf (stderr, "not found name: '%s'\n", kernel->name);
1911 dump_executable_symbols (agent->executable);
1912 goto failure;
1913 }
1914 GCN_DEBUG ("Located kernel %s\n", kernel->name);
1915 status = hsa_fns.hsa_executable_symbol_get_info_fn
1916 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel->object);
1917 if (status != HSA_STATUS_SUCCESS)
1918 hsa_fatal ("Could not extract a kernel object from its symbol", status);
1919 status = hsa_fns.hsa_executable_symbol_get_info_fn
1920 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
1921 &kernel->kernarg_segment_size);
1922 if (status != HSA_STATUS_SUCCESS)
1923 hsa_fatal ("Could not get info about kernel argument size", status);
1924 status = hsa_fns.hsa_executable_symbol_get_info_fn
1925 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
1926 &kernel->group_segment_size);
1927 if (status != HSA_STATUS_SUCCESS)
1928 hsa_fatal ("Could not get info about kernel group segment size", status);
1929 status = hsa_fns.hsa_executable_symbol_get_info_fn
1930 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
1931 &kernel->private_segment_size);
1932 if (status != HSA_STATUS_SUCCESS)
1933 hsa_fatal ("Could not get info about kernel private segment size",
1934 status);
1935
1936 /* The kernel type is not known until something tries to launch it. */
1937 kernel->kind = KIND_UNKNOWN;
1938
1939 GCN_DEBUG ("Kernel structure for %s fully initialized with "
1940 "following segment sizes: \n", kernel->name);
1941 GCN_DEBUG (" group_segment_size: %u\n",
1942 (unsigned) kernel->group_segment_size);
1943 GCN_DEBUG (" private_segment_size: %u\n",
1944 (unsigned) kernel->private_segment_size);
1945 GCN_DEBUG (" kernarg_segment_size: %u\n",
1946 (unsigned) kernel->kernarg_segment_size);
1947 return;
1948
1949 failure:
1950 kernel->initialization_failed = true;
1951 }
1952
1953 /* Do all the work that is necessary before running KERNEL for the first time.
1954 The function assumes the program has been created, finalized and frozen by
1955 create_and_finalize_hsa_program. */
1956
1957 static void
1958 init_kernel (struct kernel_info *kernel)
1959 {
1960 if (pthread_mutex_lock (&kernel->init_mutex))
1961 GOMP_PLUGIN_fatal ("Could not lock a GCN kernel initialization mutex");
1962 if (kernel->initialized)
1963 {
1964 if (pthread_mutex_unlock (&kernel->init_mutex))
1965 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
1966 "mutex");
1967
1968 return;
1969 }
1970
1971 init_kernel_properties (kernel);
1972
1973 if (!kernel->initialization_failed)
1974 {
1975 GCN_DEBUG ("\n");
1976
1977 kernel->initialized = true;
1978 }
1979 if (pthread_mutex_unlock (&kernel->init_mutex))
1980 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
1981 "mutex");
1982 }
1983
1984 /* Run KERNEL on its agent, pass VARS to it as arguments and take
1985 launch attributes from KLA.
1986
1987 MODULE_LOCKED indicates that the caller already holds the lock and
1988 run_kernel need not lock it again.
1989 If AQ is NULL then agent->sync_queue will be used. */
1990
1991 static void
1992 run_kernel (struct kernel_info *kernel, void *vars,
1993 struct GOMP_kernel_launch_attributes *kla,
1994 struct goacc_asyncqueue *aq, bool module_locked)
1995 {
1996 GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel->agent->device_id,
1997 (aq ? aq->id : 0));
1998 GCN_DEBUG ("GCN launch attribs: gdims:[");
1999 int i;
2000 for (i = 0; i < kla->ndim; ++i)
2001 {
2002 if (i)
2003 DEBUG_PRINT (", ");
2004 DEBUG_PRINT ("%u", kla->gdims[i]);
2005 }
2006 DEBUG_PRINT ("], normalized gdims:[");
2007 for (i = 0; i < kla->ndim; ++i)
2008 {
2009 if (i)
2010 DEBUG_PRINT (", ");
2011 DEBUG_PRINT ("%u", kla->gdims[i] / kla->wdims[i]);
2012 }
2013 DEBUG_PRINT ("], wdims:[");
2014 for (i = 0; i < kla->ndim; ++i)
2015 {
2016 if (i)
2017 DEBUG_PRINT (", ");
2018 DEBUG_PRINT ("%u", kla->wdims[i]);
2019 }
2020 DEBUG_PRINT ("]\n");
2021 DEBUG_FLUSH ();
2022
2023 struct agent_info *agent = kernel->agent;
2024 if (!module_locked && pthread_rwlock_rdlock (&agent->module_rwlock))
2025 GOMP_PLUGIN_fatal ("Unable to read-lock a GCN agent rwlock");
2026
2027 if (!agent->initialized)
2028 GOMP_PLUGIN_fatal ("Agent must be initialized");
2029
2030 if (!kernel->initialized)
2031 GOMP_PLUGIN_fatal ("Called kernel must be initialized");
2032
2033 hsa_queue_t *command_q = (aq ? aq->hsa_queue : kernel->agent->sync_queue);
2034
2035 uint64_t index
2036 = hsa_fns.hsa_queue_add_write_index_release_fn (command_q, 1);
2037 GCN_DEBUG ("Got AQL index %llu\n", (long long int) index);
2038
2039 /* Wait until the queue is not full before writing the packet. */
2040 while (index - hsa_fns.hsa_queue_load_read_index_acquire_fn (command_q)
2041 >= command_q->size)
2042 ;
2043
2044 /* Do not allow the dimensions to be overridden when running
2045 constructors or destructors. */
2046 int override_x = kernel->kind == KIND_UNKNOWN ? 0 : override_x_dim;
2047 int override_z = kernel->kind == KIND_UNKNOWN ? 0 : override_z_dim;
2048
2049 hsa_kernel_dispatch_packet_t *packet;
2050 packet = ((hsa_kernel_dispatch_packet_t *) command_q->base_address)
2051 + index % command_q->size;
2052
2053 memset (((uint8_t *) packet) + 4, 0, sizeof (*packet) - 4);
2054 packet->grid_size_x = override_x ? : kla->gdims[0];
2055 packet->workgroup_size_x = get_group_size (kla->ndim,
2056 packet->grid_size_x,
2057 kla->wdims[0]);
2058
2059 if (kla->ndim >= 2)
2060 {
2061 packet->grid_size_y = kla->gdims[1];
2062 packet->workgroup_size_y = get_group_size (kla->ndim, kla->gdims[1],
2063 kla->wdims[1]);
2064 }
2065 else
2066 {
2067 packet->grid_size_y = 1;
2068 packet->workgroup_size_y = 1;
2069 }
2070
2071 if (kla->ndim == 3)
2072 {
2073 packet->grid_size_z = limit_worker_threads (override_z
2074 ? : kla->gdims[2]);
2075 packet->workgroup_size_z = get_group_size (kla->ndim,
2076 packet->grid_size_z,
2077 kla->wdims[2]);
2078 }
2079 else
2080 {
2081 packet->grid_size_z = 1;
2082 packet->workgroup_size_z = 1;
2083 }
2084
2085 GCN_DEBUG ("GCN launch actuals: grid:[%u, %u, %u],"
2086 " normalized grid:[%u, %u, %u], workgroup:[%u, %u, %u]\n",
2087 packet->grid_size_x, packet->grid_size_y, packet->grid_size_z,
2088 packet->grid_size_x / packet->workgroup_size_x,
2089 packet->grid_size_y / packet->workgroup_size_y,
2090 packet->grid_size_z / packet->workgroup_size_z,
2091 packet->workgroup_size_x, packet->workgroup_size_y,
2092 packet->workgroup_size_z);
2093
2094 struct kernel_dispatch *shadow
2095 = create_kernel_dispatch (kernel, packet->grid_size_x);
2096 shadow->queue = command_q;
2097
2098 if (debug)
2099 {
2100 fprintf (stderr, "\nKernel has following dependencies:\n");
2101 print_kernel_dispatch (shadow, 2);
2102 }
2103
2104 packet->private_segment_size = kernel->private_segment_size;
2105 packet->group_segment_size = kernel->group_segment_size;
2106 packet->kernel_object = kernel->object;
2107 packet->kernarg_address = shadow->kernarg_address;
2108 hsa_signal_t s;
2109 s.handle = shadow->signal;
2110 packet->completion_signal = s;
2111 hsa_fns.hsa_signal_store_relaxed_fn (s, 1);
2112 memcpy (shadow->kernarg_address, &vars, sizeof (vars));
2113
2114 GCN_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");
2115
2116 uint16_t header;
2117 header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
2118 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
2119 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
2120
2121 GCN_DEBUG ("Going to dispatch kernel %s on device %d\n", kernel->name,
2122 agent->device_id);
2123
2124 packet_store_release ((uint32_t *) packet, header,
2125 (uint16_t) kla->ndim
2126 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);
2127
2128 hsa_fns.hsa_signal_store_release_fn (command_q->doorbell_signal,
2129 index);
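  /* packet_store_release, defined earlier in this file, presumably
     publishes the header and setup words as a single atomic 32-bit
     release store, along the lines of:

       __atomic_store_n (packet, header | (rest << 16), __ATOMIC_RELEASE);

     so the packet processor can never observe a half-written header.
     Ringing the doorbell then tells the device that the packet at INDEX
     is ready to execute.  */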
2130
2131 GCN_DEBUG ("Kernel dispatched, waiting for completion\n");
2132
2133 /* Root signal waits with 1ms timeout. */
2134 while (hsa_fns.hsa_signal_wait_acquire_fn (s, HSA_SIGNAL_CONDITION_LT, 1,
2135 1000 * 1000,
2136 HSA_WAIT_STATE_BLOCKED) != 0)
2137 {
2138 console_output (kernel, shadow->kernarg_address, false);
2139 }
2140 console_output (kernel, shadow->kernarg_address, true);
2141
2142 struct kernargs *kernargs = shadow->kernarg_address;
2143 unsigned int return_value = (unsigned int)kernargs->output_data.return_value;
2144
2145 release_kernel_dispatch (shadow);
2146
2147 if (!module_locked && pthread_rwlock_unlock (&agent->module_rwlock))
2148 GOMP_PLUGIN_fatal ("Unable to unlock a GCN agent rwlock");
2149
2150 unsigned int upper = (return_value & ~0xffff) >> 16;
2151   if (upper == 0xcafe)
2152     ; /* Exit not called; normal termination.  */
2153   else if (upper == 0xffff)
2154     ; /* Exit called.  */
2155 else
2156 {
2157 GOMP_PLUGIN_error ("Possible kernel exit value corruption, 2 most"
2158 " significant bytes aren't 0xffff or 0xcafe: 0x%x\n",
2159 return_value);
2160 abort ();
2161 }
2162
2163 if (upper == 0xffff)
2164 {
2165 unsigned int signal = (return_value >> 8) & 0xff;
2166
2167 if (signal == SIGABRT)
2168 {
2169 GCN_WARNING ("GCN Kernel aborted\n");
2170 abort ();
2171 }
2172 else if (signal != 0)
2173 {
2174 GCN_WARNING ("GCN Kernel received unknown signal\n");
2175 abort ();
2176 }
2177
2178 GCN_DEBUG ("GCN Kernel exited with value: %d\n", return_value & 0xff);
2179 exit (return_value & 0xff);
2180 }
2181 }
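/* A worked example of the return-value encoding decoded above (hedged;
   the encoding itself is produced by the device-side runtime): a kernel
   that calls exit (3) presumably stores 0xffff0003, giving upper == 0xffff,
   signal == 0, and a host exit status of 3; a device-side abort () would
   store 0xffff0600 (SIGABRT == 6 on Linux), which the code above turns
   into a host abort ().  A kernel that never calls exit leaves the
   sentinel 0xcafe0000 written by create_kernel_dispatch untouched.  */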
2182
2183 /* }}} */
2184 /* {{{ Load/Unload */
2185
2186 /* Initialize KERNEL from D and other parameters. Return true on success. */
2187
2188 static bool
2189 init_basic_kernel_info (struct kernel_info *kernel,
2190 struct hsa_kernel_description *d,
2191 struct agent_info *agent,
2192 struct module_info *module)
2193 {
2194 kernel->agent = agent;
2195 kernel->module = module;
2196 kernel->name = d->name;
2197 if (pthread_mutex_init (&kernel->init_mutex, NULL))
2198 {
2199 GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");
2200 return false;
2201 }
2202 return true;
2203 }
2204
2205 /* Find the load_offset for MODULE, save to *LOAD_OFFSET, and return true. If
2206 not found, return false. */
2207
2208 static bool
2209 find_load_offset (Elf64_Addr *load_offset, struct agent_info *agent,
2210 struct module_info *module, Elf64_Ehdr *image,
2211 Elf64_Shdr *sections)
2212 {
2213 bool res = false;
2214
2215 hsa_status_t status;
2216
2217 hsa_executable_symbol_t symbol;
2218 if (!find_executable_symbol (agent->executable, &symbol))
2219 return false;
2220
2221 status = hsa_fns.hsa_executable_symbol_get_info_fn
2222 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, load_offset);
2223 if (status != HSA_STATUS_SUCCESS)
2224 {
2225 hsa_error ("Could not extract symbol address", status);
2226 return false;
2227 }
2228
2229 char *symbol_name = get_executable_symbol_name (symbol);
2230 if (symbol_name == NULL)
2231 return false;
2232
2233 /* Find the kernel function in ELF, and calculate actual load offset. */
2234 for (int i = 0; i < image->e_shnum; i++)
2235 if (sections[i].sh_type == SHT_SYMTAB)
2236 {
2237 Elf64_Shdr *strtab = &sections[sections[i].sh_link];
2238 char *strings = (char *)image + strtab->sh_offset;
2239
2240 for (size_t offset = 0;
2241 offset < sections[i].sh_size;
2242 offset += sections[i].sh_entsize)
2243 {
2244 Elf64_Sym *sym = (Elf64_Sym*)((char*)image
2245 + sections[i].sh_offset
2246 + offset);
2247 if (strcmp (symbol_name, strings + sym->st_name) == 0)
2248 {
2249 *load_offset -= sym->st_value;
2250 res = true;
2251 break;
2252 }
2253 }
2254 }
2255
2256 free (symbol_name);
2257 return res;
2258 }
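/* Example: if the runtime placed the chosen symbol at device address
   0x7f0000001000 and its ELF st_value is 0x1000, then *LOAD_OFFSET ends
   up as 0x7f0000000000; adding that offset to any link-time address in
   the image yields the corresponding device address.  */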
2259
2260 /* Create and finalize the program consisting of all loaded modules. */
2261
2262 static bool
2263 create_and_finalize_hsa_program (struct agent_info *agent)
2264 {
2265 hsa_status_t status;
2266 int reloc_count = 0;
2267 bool res = true;
2268 if (pthread_mutex_lock (&agent->prog_mutex))
2269 {
2270 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
2271 return false;
2272 }
2273 if (agent->prog_finalized)
2274 goto final;
2275
2276 status
2277 = hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
2278 HSA_EXECUTABLE_STATE_UNFROZEN,
2279 "", &agent->executable);
2280 if (status != HSA_STATUS_SUCCESS)
2281 {
2282 hsa_error ("Could not create GCN executable", status);
2283 goto fail;
2284 }
2285
2286 /* Load any GCN modules. */
2287 struct module_info *module = agent->module;
2288 if (module)
2289 {
2290 Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;
2291
2292       /* Hide relocations from the HSA runtime loader by giving the sections
2293 	 a harmless type; the fix-up loop below matches SHT_RELA | 0x80.  */
2294 Elf64_Shdr *image_sections = (Elf64_Shdr *)((char *)image
2295 + image->e_shoff);
2296 for (int i = image->e_shnum - 1; i >= 0; i--)
2297 {
2298 if (image_sections[i].sh_type == SHT_RELA
2299 || image_sections[i].sh_type == SHT_REL)
2300 /* Change section type to something harmless. */
2301 image_sections[i].sh_type |= 0x80;
2302 }
2303
2304 hsa_code_object_t co = { 0 };
2305 status = hsa_fns.hsa_code_object_deserialize_fn
2306 (module->image_desc->gcn_image->image,
2307 module->image_desc->gcn_image->size,
2308 NULL, &co);
2309 if (status != HSA_STATUS_SUCCESS)
2310 {
2311 hsa_error ("Could not deserialize GCN code object", status);
2312 goto fail;
2313 }
2314
2315 status = hsa_fns.hsa_executable_load_code_object_fn
2316 (agent->executable, agent->id, co, "");
2317 if (status != HSA_STATUS_SUCCESS)
2318 {
2319 hsa_error ("Could not load GCN code object", status);
2320 goto fail;
2321 }
2322
2323 if (!module->heap)
2324 {
2325 status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
2326 gcn_kernel_heap_size,
2327 (void**)&module->heap);
2328 if (status != HSA_STATUS_SUCCESS)
2329 {
2330 hsa_error ("Could not allocate memory for GCN heap", status);
2331 goto fail;
2332 }
2333
2334 status = hsa_fns.hsa_memory_assign_agent_fn
2335 (module->heap, agent->id, HSA_ACCESS_PERMISSION_RW);
2336 if (status != HSA_STATUS_SUCCESS)
2337 {
2338 hsa_error ("Could not assign GCN heap memory to device", status);
2339 goto fail;
2340 }
2341
2342 hsa_fns.hsa_memory_copy_fn (&module->heap->size,
2343 &gcn_kernel_heap_size,
2344 sizeof (gcn_kernel_heap_size));
2345 }
2346
2347 }
2348
2349 if (debug)
2350 dump_executable_symbols (agent->executable);
2351
2352 status = hsa_fns.hsa_executable_freeze_fn (agent->executable, "");
2353 if (status != HSA_STATUS_SUCCESS)
2354 {
2355 hsa_error ("Could not freeze the GCN executable", status);
2356 goto fail;
2357 }
2358
2359 if (agent->module)
2360 {
2361 struct module_info *module = agent->module;
2362 Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;
2363 Elf64_Shdr *sections = (Elf64_Shdr *)((char *)image + image->e_shoff);
2364
2365 Elf64_Addr load_offset;
2366 if (!find_load_offset (&load_offset, agent, module, image, sections))
2367 goto fail;
2368
2369 /* Record the physical load address range.
2370 We need this for data copies later. */
2371 Elf64_Phdr *segments = (Elf64_Phdr *)((char*)image + image->e_phoff);
2372 Elf64_Addr low = ~0, high = 0;
2373 for (int i = 0; i < image->e_phnum; i++)
2374 if (segments[i].p_memsz > 0)
2375 {
2376 if (segments[i].p_paddr < low)
2377 low = segments[i].p_paddr;
2378 if (segments[i].p_paddr > high)
2379 high = segments[i].p_paddr + segments[i].p_memsz - 1;
2380 }
2381 module->phys_address_start = low + load_offset;
2382 module->phys_address_end = high + load_offset;
2383
2384       /* Find the dynamic symbol table.  */
2385 Elf64_Shdr *dynsym = NULL;
2386 for (int i = 0; i < image->e_shnum; i++)
2387 if (sections[i].sh_type == SHT_DYNSYM)
2388 {
2389 dynsym = &sections[i];
2390 break;
2391 }
2392
2393 /* Fix up relocations. */
2394 for (int i = 0; i < image->e_shnum; i++)
2395 {
2396 if (sections[i].sh_type == (SHT_RELA | 0x80))
2397 for (size_t offset = 0;
2398 offset < sections[i].sh_size;
2399 offset += sections[i].sh_entsize)
2400 {
2401 Elf64_Rela *reloc = (Elf64_Rela*)((char*)image
2402 + sections[i].sh_offset
2403 + offset);
2404 Elf64_Sym *sym =
2405 (dynsym
2406 ? (Elf64_Sym*)((char*)image
2407 + dynsym->sh_offset
2408 + (dynsym->sh_entsize
2409 * ELF64_R_SYM (reloc->r_info)))
2410 : NULL);
2411
2412 int64_t S = (sym ? sym->st_value : 0);
2413 int64_t P = reloc->r_offset + load_offset;
2414 int64_t A = reloc->r_addend;
2415 int64_t B = load_offset;
2416 int64_t V, size;
2417 switch (ELF64_R_TYPE (reloc->r_info))
2418 {
2419 case R_AMDGPU_ABS32_LO:
2420 V = (S + A) & 0xFFFFFFFF;
2421 size = 4;
2422 break;
2423 case R_AMDGPU_ABS32_HI:
2424 V = (S + A) >> 32;
2425 size = 4;
2426 break;
2427 case R_AMDGPU_ABS64:
2428 V = S + A;
2429 size = 8;
2430 break;
2431 case R_AMDGPU_REL32:
2432 V = S + A - P;
2433 size = 4;
2434 break;
2435 case R_AMDGPU_REL64:
2436 /* FIXME
2437 		     LLD seems to emit REL64 where the assembler has
2438 		     ABS64.  This is clearly wrong because it's not what the
2439 		     compiler is expecting.  Let's assume, for now, that
2440 		     it's a bug.  In any case, GCN kernels are always
2441 		     self-contained and therefore relative relocations will have
2442 been resolved already, so this should be a safe
2443 workaround. */
2444 V = S + A/* - P*/;
2445 size = 8;
2446 break;
2447 case R_AMDGPU_ABS32:
2448 V = S + A;
2449 size = 4;
2450 break;
2451 /* TODO R_AMDGPU_GOTPCREL */
2452 /* TODO R_AMDGPU_GOTPCREL32_LO */
2453 /* TODO R_AMDGPU_GOTPCREL32_HI */
2454 case R_AMDGPU_REL32_LO:
2455 V = (S + A - P) & 0xFFFFFFFF;
2456 size = 4;
2457 break;
2458 case R_AMDGPU_REL32_HI:
2459 V = (S + A - P) >> 32;
2460 size = 4;
2461 break;
2462 case R_AMDGPU_RELATIVE64:
2463 V = B + A;
2464 size = 8;
2465 break;
2466 default:
2467 fprintf (stderr, "Error: unsupported relocation type.\n");
2468 exit (1);
2469 }
2470 status = hsa_fns.hsa_memory_copy_fn ((void*)P, &V, size);
2471 if (status != HSA_STATUS_SUCCESS)
2472 {
2473 hsa_error ("Failed to fix up relocation", status);
2474 goto fail;
2475 }
2476 reloc_count++;
2477 }
2478 }
2479 }
2480
2481 GCN_DEBUG ("Loaded GCN kernels to device %d (%d relocations)\n",
2482 agent->device_id, reloc_count);
2483
2484 final:
2485 agent->prog_finalized = true;
2486
2487 if (pthread_mutex_unlock (&agent->prog_mutex))
2488 {
2489 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
2490 res = false;
2491 }
2492
2493 return res;
2494
2495 fail:
2496 res = false;
2497 goto final;
2498 }
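/* A worked example for the fix-up loop above: an R_AMDGPU_ABS64 entry
   with S = 0x2000 (sym->st_value) and A = 8 (r_addend) patches the
   8 bytes at P = r_offset + load_offset with V = S + A = 0x2008, via
   hsa_memory_copy.  */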
2499
2500 /* Free the HSA program in agent and everything associated with it and set
2501 agent->prog_finalized and the initialized flags of all kernels to false.
2502 Return TRUE on success. */
2503
2504 static bool
2505 destroy_hsa_program (struct agent_info *agent)
2506 {
2507 if (!agent->prog_finalized)
2508 return true;
2509
2510 hsa_status_t status;
2511
2512 GCN_DEBUG ("Destroying the current GCN program.\n");
2513
2514 status = hsa_fns.hsa_executable_destroy_fn (agent->executable);
2515 if (status != HSA_STATUS_SUCCESS)
2516 return hsa_error ("Could not destroy GCN executable", status);
2517
2518 if (agent->module)
2519 {
2520 int i;
2521 for (i = 0; i < agent->module->kernel_count; i++)
2522 agent->module->kernels[i].initialized = false;
2523
2524 if (agent->module->heap)
2525 {
2526 hsa_fns.hsa_memory_free_fn (agent->module->heap);
2527 agent->module->heap = NULL;
2528 }
2529 }
2530 agent->prog_finalized = false;
2531 return true;
2532 }
2533
2534 /* Deinitialize all information associated with MODULE and kernels within
2535 it. Return TRUE on success. */
2536
2537 static bool
2538 destroy_module (struct module_info *module, bool locked)
2539 {
2540 /* Run destructors before destroying module. */
2541 struct GOMP_kernel_launch_attributes kla =
2542 { 3,
2543 /* Grid size. */
2544 { 1, 64, 1 },
2545 /* Work-group size. */
2546 { 1, 64, 1 }
2547 };
2548
2549 if (module->fini_array_func)
2550 {
2551 init_kernel (module->fini_array_func);
2552 run_kernel (module->fini_array_func, NULL, &kla, NULL, locked);
2553 }
2554 module->constructors_run_p = false;
2555
2556 int i;
2557 for (i = 0; i < module->kernel_count; i++)
2558 if (pthread_mutex_destroy (&module->kernels[i].init_mutex))
2559 {
2560 GOMP_PLUGIN_error ("Failed to destroy a GCN kernel initialization "
2561 "mutex");
2562 return false;
2563 }
2564
2565 return true;
2566 }
2567
2568 /* }}} */
2569 /* {{{ Async */
2570
2571 /* Execute a single entry, INDEX, from the async queue AQ.  */
2572
2573 static void
2574 execute_queue_entry (struct goacc_asyncqueue *aq, int index)
2575 {
2576 struct queue_entry *entry = &aq->queue[index];
2577
2578 switch (entry->type)
2579 {
2580 case KERNEL_LAUNCH:
2581 if (DEBUG_QUEUES)
2582 GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d)\n",
2583 aq->agent->device_id, aq->id, index);
2584 run_kernel (entry->u.launch.kernel,
2585 entry->u.launch.vars,
2586 &entry->u.launch.kla, aq, false);
2587 if (DEBUG_QUEUES)
2588 GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d) done\n",
2589 aq->agent->device_id, aq->id, index);
2590 break;
2591
2592 case CALLBACK:
2593 if (DEBUG_QUEUES)
2594 GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d)\n",
2595 aq->agent->device_id, aq->id, index);
2596 entry->u.callback.fn (entry->u.callback.data);
2597 if (DEBUG_QUEUES)
2598 GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d) done\n",
2599 aq->agent->device_id, aq->id, index);
2600 break;
2601
2602 case ASYNC_WAIT:
2603 {
2604 /* FIXME: is it safe to access a placeholder that may already have
2605 been executed? */
2606 struct placeholder *placeholderp = entry->u.asyncwait.placeholderp;
2607
2608 if (DEBUG_QUEUES)
2609 GCN_DEBUG ("Async thread %d:%d: Executing async wait entry (%d)\n",
2610 aq->agent->device_id, aq->id, index);
2611
2612 pthread_mutex_lock (&placeholderp->mutex);
2613
2614 while (!placeholderp->executed)
2615 pthread_cond_wait (&placeholderp->cond, &placeholderp->mutex);
2616
2617 pthread_mutex_unlock (&placeholderp->mutex);
2618
2619 if (pthread_cond_destroy (&placeholderp->cond))
2620 GOMP_PLUGIN_error ("Failed to destroy serialization cond");
2621
2622 if (pthread_mutex_destroy (&placeholderp->mutex))
2623 GOMP_PLUGIN_error ("Failed to destroy serialization mutex");
2624
2625 if (DEBUG_QUEUES)
2626 GCN_DEBUG ("Async thread %d:%d: Executing async wait "
2627 "entry (%d) done\n", aq->agent->device_id, aq->id, index);
2628 }
2629 break;
2630
2631 case ASYNC_PLACEHOLDER:
2632 pthread_mutex_lock (&entry->u.placeholder.mutex);
2633 entry->u.placeholder.executed = 1;
2634 pthread_cond_signal (&entry->u.placeholder.cond);
2635 pthread_mutex_unlock (&entry->u.placeholder.mutex);
2636 break;
2637
2638 default:
2639 GOMP_PLUGIN_fatal ("Unknown queue element");
2640 }
2641 }
2642
2643 /* This function is run as a thread to service an async queue in the
2644 background. It runs continuously until the stop flag is set. */
2645
2646 static void *
2647 drain_queue (void *thread_arg)
2648 {
2649 struct goacc_asyncqueue *aq = thread_arg;
2650
2651 if (DRAIN_QUEUE_SYNCHRONOUS_P)
2652 {
2653 aq->drain_queue_stop = 2;
2654 return NULL;
2655 }
2656
2657 pthread_mutex_lock (&aq->mutex);
2658
2659 while (true)
2660 {
2661 if (aq->drain_queue_stop)
2662 break;
2663
2664 if (aq->queue_n > 0)
2665 {
2666 pthread_mutex_unlock (&aq->mutex);
2667 execute_queue_entry (aq, aq->queue_first);
2668
2669 pthread_mutex_lock (&aq->mutex);
2670 aq->queue_first = ((aq->queue_first + 1)
2671 % ASYNC_QUEUE_SIZE);
2672 aq->queue_n--;
2673
2674 if (DEBUG_THREAD_SIGNAL)
2675 GCN_DEBUG ("Async thread %d:%d: broadcasting queue out update\n",
2676 aq->agent->device_id, aq->id);
2677 pthread_cond_broadcast (&aq->queue_cond_out);
2678 pthread_mutex_unlock (&aq->mutex);
2679
2680 if (DEBUG_QUEUES)
2681 GCN_DEBUG ("Async thread %d:%d: continue\n", aq->agent->device_id,
2682 aq->id);
2683 pthread_mutex_lock (&aq->mutex);
2684 }
2685 else
2686 {
2687 if (DEBUG_THREAD_SLEEP)
2688 GCN_DEBUG ("Async thread %d:%d: going to sleep\n",
2689 aq->agent->device_id, aq->id);
2690 pthread_cond_wait (&aq->queue_cond_in, &aq->mutex);
2691 if (DEBUG_THREAD_SLEEP)
2692 GCN_DEBUG ("Async thread %d:%d: woke up, rechecking\n",
2693 aq->agent->device_id, aq->id);
2694 }
2695 }
2696
2697 aq->drain_queue_stop = 2;
2698 if (DEBUG_THREAD_SIGNAL)
2699 GCN_DEBUG ("Async thread %d:%d: broadcasting last queue out update\n",
2700 aq->agent->device_id, aq->id);
2701 pthread_cond_broadcast (&aq->queue_cond_out);
2702 pthread_mutex_unlock (&aq->mutex);
2703
2704 GCN_DEBUG ("Async thread %d:%d: returning\n", aq->agent->device_id, aq->id);
2705 return NULL;
2706 }
2707
2708 /* This function is used only when DRAIN_QUEUE_SYNCHRONOUS_P is set, which
2709 is not usually the case. This is just a debug tool. */
2710
2711 static void
2712 drain_queue_synchronous (struct goacc_asyncqueue *aq)
2713 {
2714 pthread_mutex_lock (&aq->mutex);
2715
2716 while (aq->queue_n > 0)
2717 {
2718 execute_queue_entry (aq, aq->queue_first);
2719
2720 aq->queue_first = ((aq->queue_first + 1)
2721 % ASYNC_QUEUE_SIZE);
2722 aq->queue_n--;
2723 }
2724
2725 pthread_mutex_unlock (&aq->mutex);
2726 }
2727
2728 /* Block the current thread until an async queue is writable. The aq->mutex
2729 lock should be held on entry, and remains locked on exit. */
2730
2731 static void
2732 wait_for_queue_nonfull (struct goacc_asyncqueue *aq)
2733 {
2734 if (aq->queue_n == ASYNC_QUEUE_SIZE)
2735 {
2736 /* Queue is full. Wait for it to not be full. */
2737 while (aq->queue_n == ASYNC_QUEUE_SIZE)
2738 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
2739 }
2740 }
2741
2742 /* Request an asynchronous kernel launch on the specified queue. This
2743 may block if the queue is full, but returns without waiting for the
2744 kernel to run. */
2745
2746 static void
2747 queue_push_launch (struct goacc_asyncqueue *aq, struct kernel_info *kernel,
2748 void *vars, struct GOMP_kernel_launch_attributes *kla)
2749 {
2750 assert (aq->agent == kernel->agent);
2751
2752 pthread_mutex_lock (&aq->mutex);
2753
2754 wait_for_queue_nonfull (aq);
2755
2756 int queue_last = ((aq->queue_first + aq->queue_n)
2757 % ASYNC_QUEUE_SIZE);
2758 if (DEBUG_QUEUES)
2759 GCN_DEBUG ("queue_push_launch %d:%d: at %i\n", aq->agent->device_id,
2760 aq->id, queue_last);
2761
2762 aq->queue[queue_last].type = KERNEL_LAUNCH;
2763 aq->queue[queue_last].u.launch.kernel = kernel;
2764 aq->queue[queue_last].u.launch.vars = vars;
2765 aq->queue[queue_last].u.launch.kla = *kla;
2766
2767 aq->queue_n++;
2768
2769 if (DEBUG_THREAD_SIGNAL)
2770 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2771 aq->agent->device_id, aq->id);
2772 pthread_cond_signal (&aq->queue_cond_in);
2773
2774 pthread_mutex_unlock (&aq->mutex);
2775 }
2776
2777 /* Request an asynchronous callback on the specified queue. The callback
2778 function will be called, with the given opaque data, from the appropriate
2779 async thread, when all previous items on that queue are complete. */
2780
2781 static void
2782 queue_push_callback (struct goacc_asyncqueue *aq, void (*fn)(void *),
2783 void *data)
2784 {
2785 pthread_mutex_lock (&aq->mutex);
2786
2787 wait_for_queue_nonfull (aq);
2788
2789 int queue_last = ((aq->queue_first + aq->queue_n)
2790 % ASYNC_QUEUE_SIZE);
2791 if (DEBUG_QUEUES)
2792 GCN_DEBUG ("queue_push_callback %d:%d: at %i\n", aq->agent->device_id,
2793 aq->id, queue_last);
2794
2795 aq->queue[queue_last].type = CALLBACK;
2796 aq->queue[queue_last].u.callback.fn = fn;
2797 aq->queue[queue_last].u.callback.data = data;
2798
2799 aq->queue_n++;
2800
2801 if (DEBUG_THREAD_SIGNAL)
2802 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2803 aq->agent->device_id, aq->id);
2804 pthread_cond_signal (&aq->queue_cond_in);
2805
2806 pthread_mutex_unlock (&aq->mutex);
2807 }
2808
2809 /* Request that a given async thread wait for another thread (unspecified) to
2810 reach the given placeholder. The wait will occur when all previous entries
2811 on the queue are complete. A placeholder is effectively a kind of signal
2812 which simply sets a flag when encountered in a queue. */
2813
2814 static void
2815 queue_push_asyncwait (struct goacc_asyncqueue *aq,
2816 struct placeholder *placeholderp)
2817 {
2818 pthread_mutex_lock (&aq->mutex);
2819
2820 wait_for_queue_nonfull (aq);
2821
2822 int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2823 if (DEBUG_QUEUES)
2824 GCN_DEBUG ("queue_push_asyncwait %d:%d: at %i\n", aq->agent->device_id,
2825 aq->id, queue_last);
2826
2827 aq->queue[queue_last].type = ASYNC_WAIT;
2828 aq->queue[queue_last].u.asyncwait.placeholderp = placeholderp;
2829
2830 aq->queue_n++;
2831
2832 if (DEBUG_THREAD_SIGNAL)
2833 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2834 aq->agent->device_id, aq->id);
2835 pthread_cond_signal (&aq->queue_cond_in);
2836
2837 pthread_mutex_unlock (&aq->mutex);
2838 }
2839
2840 /* Add a placeholder into an async queue. When the async thread reaches the
2841 placeholder it will set the "executed" flag to true and continue.
2842 Another thread may be waiting on this thread reaching the placeholder. */
2843
2844 static struct placeholder *
2845 queue_push_placeholder (struct goacc_asyncqueue *aq)
2846 {
2847 struct placeholder *placeholderp;
2848
2849 pthread_mutex_lock (&aq->mutex);
2850
2851 wait_for_queue_nonfull (aq);
2852
2853 int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2854 if (DEBUG_QUEUES)
2855 GCN_DEBUG ("queue_push_placeholder %d:%d: at %i\n", aq->agent->device_id,
2856 aq->id, queue_last);
2857
2858 aq->queue[queue_last].type = ASYNC_PLACEHOLDER;
2859 placeholderp = &aq->queue[queue_last].u.placeholder;
2860
2861 if (pthread_mutex_init (&placeholderp->mutex, NULL))
2862 {
2863 pthread_mutex_unlock (&aq->mutex);
2864 GOMP_PLUGIN_error ("Failed to initialize serialization mutex");
2865 }
2866
2867 if (pthread_cond_init (&placeholderp->cond, NULL))
2868 {
2869 pthread_mutex_unlock (&aq->mutex);
2870 GOMP_PLUGIN_error ("Failed to initialize serialization cond");
2871 }
2872
2873 placeholderp->executed = 0;
2874
2875 aq->queue_n++;
2876
2877 if (DEBUG_THREAD_SIGNAL)
2878 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2879 aq->agent->device_id, aq->id);
2880 pthread_cond_signal (&aq->queue_cond_in);
2881
2882 pthread_mutex_unlock (&aq->mutex);
2883
2884 return placeholderp;
2885 }
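/* These two primitives combine to serialize one queue behind another,
   which is presumably how the OpenACC async-serialize entry point uses
   them (a sketch with hypothetical queue variables aq1 and aq2):

     // Make aq2 wait until aq1 has drained everything queued so far.
     struct placeholder *p = queue_push_placeholder (aq1);
     queue_push_asyncwait (aq2, p);

   aq1's drain thread sets p->executed when it reaches the placeholder;
   until then aq2's drain thread blocks inside execute_queue_entry.  */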
2886
2887 /* Signal an asynchronous thread to terminate, and wait for it to do so. */
2888
2889 static void
2890 finalize_async_thread (struct goacc_asyncqueue *aq)
2891 {
2892 pthread_mutex_lock (&aq->mutex);
2893 if (aq->drain_queue_stop == 2)
2894 {
2895 pthread_mutex_unlock (&aq->mutex);
2896 return;
2897 }
2898
2899 aq->drain_queue_stop = 1;
2900
2901 if (DEBUG_THREAD_SIGNAL)
2902 GCN_DEBUG ("Signalling async thread %d:%d: cond_in\n",
2903 aq->agent->device_id, aq->id);
2904 pthread_cond_signal (&aq->queue_cond_in);
2905
2906 while (aq->drain_queue_stop != 2)
2907 {
2908 if (DEBUG_THREAD_SLEEP)
2909 GCN_DEBUG ("Waiting for async thread %d:%d to finish, putting thread"
2910 " to sleep\n", aq->agent->device_id, aq->id);
2911 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
2912 if (DEBUG_THREAD_SLEEP)
2913 GCN_DEBUG ("Waiting, woke up thread %d:%d. Rechecking\n",
2914 aq->agent->device_id, aq->id);
2915 }
2916
2917 GCN_DEBUG ("Done waiting for async thread %d:%d\n", aq->agent->device_id,
2918 aq->id);
2919 pthread_mutex_unlock (&aq->mutex);
2920
2921 int err = pthread_join (aq->thread_drain_queue, NULL);
2922 if (err != 0)
2923 GOMP_PLUGIN_fatal ("Join async thread %d:%d: failed: %s",
2924 aq->agent->device_id, aq->id, strerror (err));
2925 GCN_DEBUG ("Joined with async thread %d:%d\n", aq->agent->device_id, aq->id);
2926 }
2927
2928 /* Set up an async queue for OpenMP. There will be only one. The
2929 implementation simply uses an OpenACC async queue.
2930 FIXME: is this thread-safe if two threads call this function? */
2931
2932 static void
2933 maybe_init_omp_async (struct agent_info *agent)
2934 {
2935 if (!agent->omp_async_queue)
2936 agent->omp_async_queue
2937 = GOMP_OFFLOAD_openacc_async_construct (agent->device_id);
2938 }
2939
2940 /* A wrapper that works around an issue in the HSA runtime with host-to-device
2941 copies from read-only pages. */
2942
2943 static void
2944 hsa_memory_copy_wrapper (void *dst, const void *src, size_t len)
2945 {
2946 hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, len);
2947
2948 if (status == HSA_STATUS_SUCCESS)
2949 return;
2950
2951 /* It appears that the copy fails if the source data is in a read-only page.
2952 We can't detect that easily, so try copying the data to a temporary buffer
2953 and doing the copy again if we got an error above. */
2954
2955 GCN_WARNING ("Read-only data transfer bug workaround triggered for "
2956 "[%p:+%d]\n", (void *) src, (int) len);
2957
2958 void *src_copy = malloc (len);
2959 memcpy (src_copy, src, len);
2960 status = hsa_fns.hsa_memory_copy_fn (dst, (const void *) src_copy, len);
2961 free (src_copy);
2962 if (status != HSA_STATUS_SUCCESS)
2963 GOMP_PLUGIN_error ("memory copy failed");
2964 }
2965
2966 /* Copy data to or from a device. This is intended for use as an async
2967 callback event. */
2968
2969 static void
2970 copy_data (void *data_)
2971 {
2972 struct copy_data *data = (struct copy_data *)data_;
2973 GCN_DEBUG ("Async thread %d:%d: Copying %zu bytes from (%p) to (%p)\n",
2974 data->aq->agent->device_id, data->aq->id, data->len, data->src,
2975 data->dst);
2976 hsa_memory_copy_wrapper (data->dst, data->src, data->len);
2977 if (data->free_src)
2978 free ((void *) data->src);
2979 free (data);
2980 }
2981
2982 /* Free device data. This is intended for use as an async callback event. */
2983
2984 static void
2985 gomp_offload_free (void *ptr)
2986 {
2987 GCN_DEBUG ("Async thread ?:?: Freeing %p\n", ptr);
2988 GOMP_OFFLOAD_free (0, ptr);
2989 }
2990
2991 /* Request an asynchronous data copy, to or from a device, on a given queue.
2992 The event will be registered as a callback. If FREE_SRC is true
2993 then the source data will be freed following the copy. */
2994
2995 static void
2996 queue_push_copy (struct goacc_asyncqueue *aq, void *dst, const void *src,
2997 size_t len, bool free_src)
2998 {
2999 if (DEBUG_QUEUES)
3000 GCN_DEBUG ("queue_push_copy %d:%d: %zu bytes from (%p) to (%p)\n",
3001 aq->agent->device_id, aq->id, len, src, dst);
3002 struct copy_data *data
3003 = (struct copy_data *)GOMP_PLUGIN_malloc (sizeof (struct copy_data));
3004 data->dst = dst;
3005 data->src = src;
3006 data->len = len;
3007 data->free_src = free_src;
3008 data->aq = aq;
3009 queue_push_callback (aq, copy_data, data);
3010 }
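/* Usage sketch (hypothetical variables aq, dev_dst, src and len): an
   asynchronous host-to-device update where a staging buffer should be
   released once the copy has run:

     void *staging = GOMP_PLUGIN_malloc (len);
     memcpy (staging, src, len);
     queue_push_copy (aq, dev_dst, staging, len, true);

   The final argument makes copy_data free the staging buffer after the
   transfer completes.  */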
3011
3012 /* Return true if the given queue is currently empty. */
3013
3014 static int
3015 queue_empty (struct goacc_asyncqueue *aq)
3016 {
3017 pthread_mutex_lock (&aq->mutex);
3018 int res = aq->queue_n == 0 ? 1 : 0;
3019 pthread_mutex_unlock (&aq->mutex);
3020
3021 return res;
3022 }
3023
3024 /* Wait for a given queue to become empty. This implements an OpenACC wait
3025 directive. */
3026
3027 static void
3028 wait_queue (struct goacc_asyncqueue *aq)
3029 {
3030 if (DRAIN_QUEUE_SYNCHRONOUS_P)
3031 {
3032 drain_queue_synchronous (aq);
3033 return;
3034 }
3035
3036 pthread_mutex_lock (&aq->mutex);
3037
3038 while (aq->queue_n > 0)
3039 {
3040 if (DEBUG_THREAD_SLEEP)
3041 GCN_DEBUG ("waiting for thread %d:%d, putting thread to sleep\n",
3042 aq->agent->device_id, aq->id);
3043 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
3044 if (DEBUG_THREAD_SLEEP)
3045 GCN_DEBUG ("thread %d:%d woke up. Rechecking\n", aq->agent->device_id,
3046 aq->id);
3047 }
3048
3049 pthread_mutex_unlock (&aq->mutex);
3050 GCN_DEBUG ("waiting for thread %d:%d, done\n", aq->agent->device_id, aq->id);
3051 }
3052
3053 /* }}} */
3054 /* {{{ OpenACC support */
3055
3056 /* Execute an OpenACC kernel, synchronously or asynchronously. */
3057
3058 static void
3059 gcn_exec (struct kernel_info *kernel, size_t mapnum, void **hostaddrs,
3060 void **devaddrs, unsigned *dims, void *targ_mem_desc, bool async,
3061 struct goacc_asyncqueue *aq)
3062 {
3063 if (!GOMP_OFFLOAD_can_run (kernel))
3064 GOMP_PLUGIN_fatal ("OpenACC host fallback unimplemented.");
3065
3066 /* If we get here then this must be an OpenACC kernel. */
3067 kernel->kind = KIND_OPENACC;
3068
3069 /* devaddrs must be double-indirect on the target. */
3070 void **ind_da = alloc_by_agent (kernel->agent, sizeof (void*) * mapnum);
3071 for (size_t i = 0; i < mapnum; i++)
3072 hsa_fns.hsa_memory_copy_fn (&ind_da[i],
3073 devaddrs[i] ? &devaddrs[i] : &hostaddrs[i],
3074 sizeof (void *));
3075
3076 struct hsa_kernel_description *hsa_kernel_desc = NULL;
3077 for (unsigned i = 0; i < kernel->module->image_desc->kernel_count; i++)
3078 {
3079 struct hsa_kernel_description *d
3080 = &kernel->module->image_desc->kernel_infos[i];
3081 if (d->name == kernel->name)
3082 {
3083 hsa_kernel_desc = d;
3084 break;
3085 }
3086 }
3087
3088 /* We may have statically-determined dimensions in
3089 hsa_kernel_desc->oacc_dims[] or dimensions passed to this offload kernel
3090 invocation at runtime in dims[]. We allow static dimensions to take
3091 priority over dynamic dimensions when present (non-zero). */
3092 if (hsa_kernel_desc->oacc_dims[0] > 0)
3093 dims[0] = hsa_kernel_desc->oacc_dims[0];
3094 if (hsa_kernel_desc->oacc_dims[1] > 0)
3095 dims[1] = hsa_kernel_desc->oacc_dims[1];
3096 if (hsa_kernel_desc->oacc_dims[2] > 0)
3097 dims[2] = hsa_kernel_desc->oacc_dims[2];
3098
3099 /* If any of the OpenACC dimensions remain 0 then we get to pick a number.
3100 There isn't really a correct answer for this without a clue about the
3101 problem size, so let's do a reasonable number of single-worker gangs.
3102 64 gangs matches a typical Fiji device. */
3103
3104 /* NOTE: Until support for middle-end worker partitioning is merged, use 1
3105 for the default number of workers. */
3106 if (dims[0] == 0) dims[0] = get_cu_count (kernel->agent); /* Gangs. */
3107 if (dims[1] == 0) dims[1] = 1; /* Workers. */
3108
3109 /* The incoming dimensions are expressed in terms of gangs, workers, and
3110 vectors. The HSA dimensions are expressed in terms of "work-items",
3111 which means multiples of vector lanes.
3112
3113 The "grid size" specifies the size of the problem space, and the
3114 "work-group size" specifies how much of that we want a single compute
3115 unit to chew on at once.
3116
3117 The three dimensions do not really correspond to hardware, but the
3118 important thing is that the HSA runtime will launch as many
3119 work-groups as it takes to process the entire grid, and each
3120 work-group will contain as many wave-fronts as it takes to process
3121 the work-items in that group.
3122
3123 Essentially, as long as we set the Y dimension to 64 (the number of
3124 vector lanes in hardware), and the Z group size to the maximum (16),
3125 then we will get the gangs (X) and workers (Z) launched as we expect.
3126
3127 The reason for the apparent reversal of vector and worker dimension
3128 order is to do with the way the run-time distributes work-items across
3129 v1 and v2. */
3130 struct GOMP_kernel_launch_attributes kla =
3131 {3,
3132 /* Grid size. */
3133 {dims[0], 64, dims[1]},
3134 /* Work-group size. */
3135 {1, 64, 16}
3136 };
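  /* Worked example: dims = {4 gangs, 16 workers} yields a grid of
     4 x 64 x 16 work-items in work-groups of 1 x 64 x 16, so the HSA
     runtime launches four work-groups (one per gang), each containing
     sixteen 64-lane wave-fronts (one per worker).  */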
3137
3138 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
3139 acc_prof_info *prof_info = thr->prof_info;
3140 acc_event_info enqueue_launch_event_info;
3141 acc_api_info *api_info = thr->api_info;
3142 bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
3143 if (profiling_dispatch_p)
3144 {
3145 prof_info->event_type = acc_ev_enqueue_launch_start;
3146
3147 enqueue_launch_event_info.launch_event.event_type
3148 = prof_info->event_type;
3149 enqueue_launch_event_info.launch_event.valid_bytes
3150 = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
3151 enqueue_launch_event_info.launch_event.parent_construct
3152 = acc_construct_parallel;
3153 enqueue_launch_event_info.launch_event.implicit = 1;
3154 enqueue_launch_event_info.launch_event.tool_info = NULL;
3155 enqueue_launch_event_info.launch_event.kernel_name
3156 = (char *) kernel->name;
3157 enqueue_launch_event_info.launch_event.num_gangs = kla.gdims[0];
3158 enqueue_launch_event_info.launch_event.num_workers = kla.gdims[2];
3159 enqueue_launch_event_info.launch_event.vector_length = kla.gdims[1];
3160
3161 api_info->device_api = acc_device_api_other;
3162
3163 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3164 &enqueue_launch_event_info, api_info);
3165 }
3166
3167 if (!async)
3168 {
3169 run_kernel (kernel, ind_da, &kla, NULL, false);
3170 gomp_offload_free (ind_da);
3171 }
3172 else
3173 {
3174 queue_push_launch (aq, kernel, ind_da, &kla);
3175 if (DEBUG_QUEUES)
3176 GCN_DEBUG ("queue_push_callback %d:%d gomp_offload_free, %p\n",
3177 aq->agent->device_id, aq->id, ind_da);
3178 queue_push_callback (aq, gomp_offload_free, ind_da);
3179 }
3180
3181 if (profiling_dispatch_p)
3182 {
3183 prof_info->event_type = acc_ev_enqueue_launch_end;
3184 enqueue_launch_event_info.launch_event.event_type = prof_info->event_type;
3185 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3186 &enqueue_launch_event_info,
3187 api_info);
3188 }
3189 }
3190
3191 /* }}} */
3192 /* {{{ Generic Plugin API */
3193
3194 /* Return the name of the accelerator, which is "gcn". */
3195
3196 const char *
3197 GOMP_OFFLOAD_get_name (void)
3198 {
3199 return "gcn";
3200 }
3201
3202 /* Return the specific capabilities the HSA accelerator has.  */
3203
3204 unsigned int
3205 GOMP_OFFLOAD_get_caps (void)
3206 {
3207 /* FIXME: Enable shared memory for APU, but not discrete GPU. */
3208 return /*GOMP_OFFLOAD_CAP_SHARED_MEM |*/ GOMP_OFFLOAD_CAP_OPENMP_400
3209 | GOMP_OFFLOAD_CAP_OPENACC_200;
3210 }
3211
3212 /* Identify as GCN accelerator. */
3213
3214 int
3215 GOMP_OFFLOAD_get_type (void)
3216 {
3217 return OFFLOAD_TARGET_TYPE_GCN;
3218 }
3219
3220 /* Return the libgomp version number we're compatible with. There is
3221 no requirement for cross-version compatibility. */
3222
3223 unsigned
3224 GOMP_OFFLOAD_version (void)
3225 {
3226 return GOMP_VERSION;
3227 }
3228
3229 /* Return the number of GCN devices on the system. */
3230
3231 int
3232 GOMP_OFFLOAD_get_num_devices (void)
3233 {
3234 if (!init_hsa_context ())
3235 return 0;
3236 return hsa_context.agent_count;
3237 }
3238
3239 /* Initialize device (agent) number N so that it can be used for computation.
3240 Return TRUE on success. */
3241
3242 bool
3243 GOMP_OFFLOAD_init_device (int n)
3244 {
3245 if (!init_hsa_context ())
3246 return false;
3247 if (n >= hsa_context.agent_count)
3248 {
3249 GOMP_PLUGIN_error ("Request to initialize non-existent GCN device %i", n);
3250 return false;
3251 }
3252 struct agent_info *agent = &hsa_context.agents[n];
3253
3254 if (agent->initialized)
3255 return true;
3256
3257 agent->device_id = n;
3258
3259 if (pthread_rwlock_init (&agent->module_rwlock, NULL))
3260 {
3261 GOMP_PLUGIN_error ("Failed to initialize a GCN agent rwlock");
3262 return false;
3263 }
3264 if (pthread_mutex_init (&agent->prog_mutex, NULL))
3265 {
3266 GOMP_PLUGIN_error ("Failed to initialize a GCN agent program mutex");
3267 return false;
3268 }
3269 if (pthread_mutex_init (&agent->async_queues_mutex, NULL))
3270 {
3271 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
3272 return false;
3273 }
3274 if (pthread_mutex_init (&agent->team_arena_write_lock, NULL))
3275 {
3276 GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
3277 return false;
3278 }
3279 agent->async_queues = NULL;
3280 agent->omp_async_queue = NULL;
3281 agent->team_arena_list = NULL;
3282
3283 uint32_t queue_size;
3284 hsa_status_t status;
3285 status = hsa_fns.hsa_agent_get_info_fn (agent->id,
3286 HSA_AGENT_INFO_QUEUE_MAX_SIZE,
3287 &queue_size);
3288 if (status != HSA_STATUS_SUCCESS)
3289 return hsa_error ("Error requesting maximum queue size of the GCN agent",
3290 status);
3291
3292 char buf[64];
3293 status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_NAME,
3294 &buf);
3295 if (status != HSA_STATUS_SUCCESS)
3296 return hsa_error ("Error querying the name of the agent", status);
3297 agent->gfx900_p = (strncmp (buf, "gfx900", 6) == 0);
3298
3299 status = hsa_fns.hsa_queue_create_fn (agent->id, queue_size,
3300 HSA_QUEUE_TYPE_MULTI,
3301 hsa_queue_callback, NULL, UINT32_MAX,
3302 UINT32_MAX, &agent->sync_queue);
3303 if (status != HSA_STATUS_SUCCESS)
3304 return hsa_error ("Error creating command queue", status);
3305
3306 agent->kernarg_region.handle = (uint64_t) -1;
3307 status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
3308 get_kernarg_memory_region,
3309 &agent->kernarg_region);
3310 if (agent->kernarg_region.handle == (uint64_t) -1)
3311 {
3312 GOMP_PLUGIN_error ("Could not find suitable memory region for kernel "
3313 "arguments");
3314 return false;
3315 }
3316 GCN_DEBUG ("Selected kernel arguments memory region:\n");
3317 dump_hsa_region (agent->kernarg_region, NULL);
3318
3319 agent->data_region.handle = (uint64_t) -1;
3320 status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
3321 get_data_memory_region,
3322 &agent->data_region);
3323 if (agent->data_region.handle == (uint64_t) -1)
3324 {
3325 GOMP_PLUGIN_error ("Could not find suitable memory region for device "
3326 "data");
3327 return false;
3328 }
3329 GCN_DEBUG ("Selected device data memory region:\n");
3330 dump_hsa_region (agent->data_region, NULL);
3331
3332 GCN_DEBUG ("GCN agent %d initialized\n", n);
3333
3334 agent->initialized = true;
3335 return true;
3336 }
3337
3338 /* Load GCN object-code module described by struct gcn_image_desc in
3339 TARGET_DATA and return references to kernel descriptors in TARGET_TABLE.
3340 If there are any constructors then run them. */
3341
3342 int
3343 GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
3344 struct addr_pair **target_table)
3345 {
3346 if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
3347 {
3348 GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3349 " (expected %u, received %u)",
3350 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
3351 return -1;
3352 }
3353
3354 struct gcn_image_desc *image_desc = (struct gcn_image_desc *) target_data;
3355 struct agent_info *agent;
3356 struct addr_pair *pair;
3357 struct module_info *module;
3358 struct kernel_info *kernel;
3359 int kernel_count = image_desc->kernel_count;
3360 unsigned var_count = image_desc->global_variable_count;
3361
3362 agent = get_agent_info (ord);
3363 if (!agent)
3364 return -1;
3365
3366 if (pthread_rwlock_wrlock (&agent->module_rwlock))
3367 {
3368 GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3369 return -1;
3370 }
3371 if (agent->prog_finalized
3372 && !destroy_hsa_program (agent))
3373 return -1;
3374
3375 GCN_DEBUG ("Encountered %d kernels in an image\n", kernel_count);
3376 GCN_DEBUG ("Encountered %u global variables in an image\n", var_count);
3377   pair = GOMP_PLUGIN_malloc ((kernel_count + var_count - 2)
3378 			     * sizeof (struct addr_pair)); /* -2: init/fini.  */
3379 *target_table = pair;
3380 module = (struct module_info *)
3381 GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info)
3382 + kernel_count * sizeof (struct kernel_info));
3383 module->image_desc = image_desc;
3384 module->kernel_count = kernel_count;
3385 module->heap = NULL;
3386 module->constructors_run_p = false;
3387
3388 kernel = &module->kernels[0];
3389
3390 /* Allocate memory for kernel dependencies. */
3391 for (unsigned i = 0; i < kernel_count; i++)
3392 {
3393 struct hsa_kernel_description *d = &image_desc->kernel_infos[i];
3394 if (!init_basic_kernel_info (kernel, d, agent, module))
3395 return -1;
3396 if (strcmp (d->name, "_init_array") == 0)
3397 module->init_array_func = kernel;
3398 else if (strcmp (d->name, "_fini_array") == 0)
3399 module->fini_array_func = kernel;
3400 else
3401 {
3402 pair->start = (uintptr_t) kernel;
3403 pair->end = (uintptr_t) (kernel + 1);
3404 pair++;
3405 }
3406 kernel++;
3407 }
3408
3409 agent->module = module;
3410 if (pthread_rwlock_unlock (&agent->module_rwlock))
3411 {
3412 GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3413 return -1;
3414 }
3415
3416 if (!create_and_finalize_hsa_program (agent))
3417 return -1;
3418
3419 for (unsigned i = 0; i < var_count; i++)
3420 {
3421 struct global_var_info *v = &image_desc->global_variables[i];
3422 GCN_DEBUG ("Looking for variable %s\n", v->name);
3423
3424 hsa_status_t status;
3425 hsa_executable_symbol_t var_symbol;
3426 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3427 v->name, agent->id,
3428 0, &var_symbol);
3429
3430 if (status != HSA_STATUS_SUCCESS)
3431 hsa_fatal ("Could not find symbol for variable in the code object",
3432 status);
3433
3434 uint64_t var_addr;
3435 uint32_t var_size;
3436 status = hsa_fns.hsa_executable_symbol_get_info_fn
3437 (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &var_addr);
3438 if (status != HSA_STATUS_SUCCESS)
3439 hsa_fatal ("Could not extract a variable from its symbol", status);
3440 status = hsa_fns.hsa_executable_symbol_get_info_fn
3441 (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &var_size);
3442 if (status != HSA_STATUS_SUCCESS)
3443 hsa_fatal ("Could not extract a variable size from its symbol", status);
3444
3445 pair->start = var_addr;
3446 pair->end = var_addr + var_size;
3447 GCN_DEBUG ("Found variable %s at %p with size %u\n", v->name,
3448 (void *)var_addr, var_size);
3449 pair++;
3450 }
3451
3452 /* Ensure that constructors are run first. */
3453 struct GOMP_kernel_launch_attributes kla =
3454 { 3,
3455 /* Grid size. */
3456 { 1, 64, 1 },
3457 /* Work-group size. */
3458 { 1, 64, 1 }
3459 };
3460
3461 if (module->init_array_func)
3462 {
3463 init_kernel (module->init_array_func);
3464 run_kernel (module->init_array_func, NULL, &kla, NULL, false);
3465 }
3466 module->constructors_run_p = true;
3467
3468 /* Don't report kernels that libgomp need not know about. */
3469 if (module->init_array_func)
3470 kernel_count--;
3471 if (module->fini_array_func)
3472 kernel_count--;
3473
3474 return kernel_count + var_count;
3475 }
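/* For example, an image containing kernels K1 and K2, the _init_array
   and _fini_array pseudo-kernels, and one global variable V produces a
   three-entry table:

     table[0] = { &K1, &K1 + 1 }   // kernel entries span the kernel_info
     table[1] = { &K2, &K2 + 1 }
     table[2] = { V_addr, V_addr + V_size }

   and the function returns 3; the pseudo-kernels are run here but never
   reported to libgomp.  */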
3476
3477 /* Unload GCN object-code module described by struct gcn_image_desc in
3478 TARGET_DATA from agent number N. Return TRUE on success. */
3479
3480 bool
3481 GOMP_OFFLOAD_unload_image (int n, unsigned version, const void *target_data)
3482 {
3483 if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
3484 {
3485 GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3486 " (expected %u, received %u)",
3487 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
3488 return false;
3489 }
3490
3491 struct agent_info *agent;
3492 agent = get_agent_info (n);
3493 if (!agent)
3494 return false;
3495
3496 if (pthread_rwlock_wrlock (&agent->module_rwlock))
3497 {
3498 GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3499 return false;
3500 }
3501
3502 if (!agent->module || agent->module->image_desc != target_data)
3503 {
3504 GOMP_PLUGIN_error ("Attempt to unload an image that has never been "
3505 "loaded before");
3506 return false;
3507 }
3508
3509 if (!destroy_module (agent->module, true))
3510 return false;
3511 free (agent->module);
3512 agent->module = NULL;
3513 if (!destroy_hsa_program (agent))
3514 return false;
3515 if (pthread_rwlock_unlock (&agent->module_rwlock))
3516 {
3517 GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3518 return false;
3519 }
3520 return true;
3521 }
3522
3523 /* Deinitialize all information and status associated with agent number N. We
3524 do not attempt any synchronization, assuming the user and libgomp will not
3525 attempt deinitialization of a device that is in any way being used at the
3526 same time. Return TRUE on success. */
3527
3528 bool
3529 GOMP_OFFLOAD_fini_device (int n)
3530 {
3531 struct agent_info *agent = get_agent_info (n);
3532 if (!agent)
3533 return false;
3534
3535 if (!agent->initialized)
3536 return true;
3537
3538 if (agent->omp_async_queue)
3539 {
3540 GOMP_OFFLOAD_openacc_async_destruct (agent->omp_async_queue);
3541 agent->omp_async_queue = NULL;
3542 }
3543
3544 if (agent->module)
3545 {
3546 if (!destroy_module (agent->module, false))
3547 return false;
3548 free (agent->module);
3549 agent->module = NULL;
3550 }
3551
3552 if (!destroy_team_arenas (agent))
3553 return false;
3554
3555 if (!destroy_hsa_program (agent))
3556 return false;
3557
3558 hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (agent->sync_queue);
3559 if (status != HSA_STATUS_SUCCESS)
3560 return hsa_error ("Error destroying command queue", status);
3561
3562 if (pthread_mutex_destroy (&agent->prog_mutex))
3563 {
3564 GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex");
3565 return false;
3566 }
3567 if (pthread_rwlock_destroy (&agent->module_rwlock))
3568 {
3569 GOMP_PLUGIN_error ("Failed to destroy a GCN agent rwlock");
3570 return false;
3571 }
3572
3573 if (pthread_mutex_destroy (&agent->async_queues_mutex))
3574 {
3575 GOMP_PLUGIN_error ("Failed to destroy a GCN agent queue mutex");
3576 return false;
3577 }
3578 if (pthread_mutex_destroy (&agent->team_arena_write_lock))
3579 {
3580 GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex");
3581 return false;
3582 }
3583 agent->initialized = false;
3584 return true;
3585 }
3586
3587 /* Return true if the HSA runtime can run function FN_PTR. */
3588
3589 bool
3590 GOMP_OFFLOAD_can_run (void *fn_ptr)
3591 {
3592 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3593
3594 init_kernel (kernel);
3595 if (kernel->initialization_failed)
3596 goto failure;
3597
3598 return true;
3599
3600 failure:
3601 if (suppress_host_fallback)
3602 GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
3603 GCN_WARNING ("GCN target cannot be launched, doing a host fallback\n");
3604 return false;
3605 }
3606
3607 /* Allocate memory on device N. */
3608
3609 void *
3610 GOMP_OFFLOAD_alloc (int n, size_t size)
3611 {
3612 struct agent_info *agent = get_agent_info (n);
3613 return alloc_by_agent (agent, size);
3614 }

/* Free memory from DEVICE.  */

bool
GOMP_OFFLOAD_free (int device, void *ptr)
{
  GCN_DEBUG ("Freeing memory on device %d\n", device);

  hsa_status_t status = hsa_fns.hsa_memory_free_fn (ptr);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not free device memory", status);
      return false;
    }

  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  bool profiling_dispatch_p
    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
  if (profiling_dispatch_p)
    {
      acc_prof_info *prof_info = thr->prof_info;
      acc_event_info data_event_info;
      acc_api_info *api_info = thr->api_info;

      prof_info->event_type = acc_ev_free;

      data_event_info.data_event.event_type = prof_info->event_type;
      data_event_info.data_event.valid_bytes
        = _ACC_DATA_EVENT_INFO_VALID_BYTES;
      data_event_info.data_event.parent_construct
        = acc_construct_parallel;
      data_event_info.data_event.implicit = 1;
      data_event_info.data_event.tool_info = NULL;
      data_event_info.data_event.var_name = NULL;
      data_event_info.data_event.bytes = 0;
      data_event_info.data_event.host_ptr = NULL;
      data_event_info.data_event.device_ptr = (void *) ptr;

      api_info->device_api = acc_device_api_other;

      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
                                            api_info);
    }

  return true;
}
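
/* As an illustration (not part of the plugin), the acc_ev_free events
   dispatched above can be observed by a host-side tool via the standard
   OpenACC profiling interface; the callback name "on_free" is made up for
   this sketch:

     static void
     on_free (acc_prof_info *pi, acc_event_info *ei, acc_api_info *ai)
     {
       fprintf (stderr, "freed %p on device %d\n",
                ei->data_event.device_ptr, pi->device_number);
     }

     ...
     acc_prof_register (acc_ev_free, on_free, acc_reg);  */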

/* Copy data from DEVICE to host.  */

bool
GOMP_OFFLOAD_dev2host (int device, void *dst, const void *src, size_t n)
{
  GCN_DEBUG ("Copying %zu bytes from device %d (%p) to host (%p)\n", n, device,
             src, dst);
  hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
  if (status != HSA_STATUS_SUCCESS)
    {
      GOMP_PLUGIN_error ("memory copy failed");
      return false;
    }
  return true;
}

/* Copy data from host to DEVICE.  */

bool
GOMP_OFFLOAD_host2dev (int device, void *dst, const void *src, size_t n)
{
  GCN_DEBUG ("Copying %zu bytes from host (%p) to device %d (%p)\n", n, src,
             device, dst);
  hsa_memory_copy_wrapper (dst, src, n);
  return true;
}

/* Copy data within DEVICE.  Do the copy asynchronously, if appropriate.  */

bool
GOMP_OFFLOAD_dev2dev (int device, void *dst, const void *src, size_t n)
{
  struct gcn_thread *thread_data = gcn_thread ();

  if (thread_data && !async_synchronous_p (thread_data->async))
    {
      struct agent_info *agent = get_agent_info (device);
      maybe_init_omp_async (agent);
      queue_push_copy (agent->omp_async_queue, dst, src, n, false);
      return true;
    }

  GCN_DEBUG ("Copying %zu bytes from device %d (%p) to device %d (%p)\n", n,
             device, src, device, dst);
  hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
  if (status != HSA_STATUS_SUCCESS)
    {
      GOMP_PLUGIN_error ("memory copy failed");
      return false;
    }
  return true;
}
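
/* For reference, a device-to-device copy like the one above is reached via
   standard OpenMP host code along these lines (SIZE and DEV are
   placeholders for this sketch):

     void *a = omp_target_alloc (size, dev);
     void *b = omp_target_alloc (size, dev);
     omp_target_memcpy (b, a, size, 0, 0, dev, dev);

   Whether the copy is queued or performed immediately depends on the
   calling thread's async setting, as tested above.  */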

/* }}}  */
/* {{{ OpenMP Plugin API  */

/* Run a synchronous OpenMP kernel on DEVICE and pass it an array of pointers
   in VARS as a parameter.  The kernel is identified by FN_PTR, which must
   point to a kernel_info structure for a kernel that has previously been
   loaded to the specified device.  */

void
GOMP_OFFLOAD_run (int device, void *fn_ptr, void *vars, void **args)
{
  struct agent_info *agent = get_agent_info (device);
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
  struct GOMP_kernel_launch_attributes def;
  struct GOMP_kernel_launch_attributes *kla;
  assert (agent == kernel->agent);

  /* If we get here then the kernel must be OpenMP.  */
  kernel->kind = KIND_OPENMP;

  if (!parse_target_attributes (args, &def, &kla, agent))
    {
      GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
      return;
    }
  run_kernel (kernel, vars, kla, NULL, false);
}
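
/* For reference, parse_target_attributes consumes ARGS as a NULL-terminated
   list encoded per gomp-constants.h; a simplified decoding sketch (not the
   plugin's actual parser) looks like:

     for (void **p = args; *p; p++)
       {
         intptr_t id = (intptr_t) *p, value;
         if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
           value = (intptr_t) *++p;
         else
           value = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
         switch (id & GOMP_TARGET_ARG_ID_MASK)
           {
           case GOMP_TARGET_ARG_NUM_TEAMS:
             ... VALUE is the requested number of teams ...
           case GOMP_TARGET_ARG_THREAD_LIMIT:
             ... VALUE is the requested thread limit ...
           }
       }  */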

/* Run an asynchronous OpenMP kernel on DEVICE.  This is similar to
   GOMP_OFFLOAD_run except that the launch is queued and there is a call to
   GOMP_PLUGIN_target_task_completion when it has finished.  */

void
GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
                        void **args, void *async_data)
{
  GCN_DEBUG ("GOMP_OFFLOAD_async_run invoked\n");
  struct agent_info *agent = get_agent_info (device);
  struct kernel_info *kernel = (struct kernel_info *) tgt_fn;
  struct GOMP_kernel_launch_attributes def;
  struct GOMP_kernel_launch_attributes *kla;
  assert (agent == kernel->agent);

  /* If we get here then the kernel must be OpenMP.  */
  kernel->kind = KIND_OPENMP;

  if (!parse_target_attributes (args, &def, &kla, agent))
    {
      GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
      return;
    }

  maybe_init_omp_async (agent);
  queue_push_launch (agent->omp_async_queue, kernel, tgt_vars, kla);
  queue_push_callback (agent->omp_async_queue,
                       GOMP_PLUGIN_target_task_completion, async_data);
}
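
/* The two queue entries pushed above rely on the FIFO ordering of the
   async queue: the callback entry cannot start until the launch entry has
   completed, so GOMP_PLUGIN_target_task_completion fires only after the
   kernel has finished.  The synchronous equivalent would be:

     run_kernel (kernel, tgt_vars, kla, NULL, false);
     GOMP_PLUGIN_target_task_completion (async_data);  */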

/* }}} */
/* {{{ OpenACC Plugin API  */

/* Run a synchronous OpenACC kernel.  The device number is inferred from the
   already-loaded KERNEL.  */

void
GOMP_OFFLOAD_openacc_exec (void (*fn_ptr) (void *), size_t mapnum,
                           void **hostaddrs, void **devaddrs, unsigned *dims,
                           void *targ_mem_desc)
{
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;

  gcn_exec (kernel, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, false,
            NULL);
}

/* Run an asynchronous OpenACC kernel on the specified queue.  */

void
GOMP_OFFLOAD_openacc_async_exec (void (*fn_ptr) (void *), size_t mapnum,
                                 void **hostaddrs, void **devaddrs,
                                 unsigned *dims, void *targ_mem_desc,
                                 struct goacc_asyncqueue *aq)
{
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;

  gcn_exec (kernel, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, true,
            aq);
}

/* Create a new asynchronous thread and queue for running future kernels.  */

struct goacc_asyncqueue *
GOMP_OFFLOAD_openacc_async_construct (int device)
{
  struct agent_info *agent = get_agent_info (device);

  pthread_mutex_lock (&agent->async_queues_mutex);

  struct goacc_asyncqueue *aq = GOMP_PLUGIN_malloc (sizeof (*aq));
  aq->agent = get_agent_info (device);
  aq->prev = NULL;
  aq->next = agent->async_queues;
  if (aq->next)
    {
      aq->next->prev = aq;
      aq->id = aq->next->id + 1;
    }
  else
    aq->id = 1;
  agent->async_queues = aq;

  aq->queue_first = 0;
  aq->queue_n = 0;
  aq->drain_queue_stop = 0;

  if (pthread_mutex_init (&aq->mutex, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
      pthread_mutex_unlock (&agent->async_queues_mutex);
      return NULL;
    }
  if (pthread_cond_init (&aq->queue_cond_in, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
      pthread_mutex_unlock (&agent->async_queues_mutex);
      return NULL;
    }
  if (pthread_cond_init (&aq->queue_cond_out, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
      pthread_mutex_unlock (&agent->async_queues_mutex);
      return NULL;
    }

  hsa_status_t status = hsa_fns.hsa_queue_create_fn (agent->id,
                                                     ASYNC_QUEUE_SIZE,
                                                     HSA_QUEUE_TYPE_MULTI,
                                                     hsa_queue_callback, NULL,
                                                     UINT32_MAX, UINT32_MAX,
                                                     &aq->hsa_queue);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Error creating command queue", status);

  int err = pthread_create (&aq->thread_drain_queue, NULL, &drain_queue, aq);
  if (err != 0)
    GOMP_PLUGIN_fatal ("GCN asynchronous thread creation failed: %s",
                       strerror (err));
  GCN_DEBUG ("Async thread %d:%d: created\n", aq->agent->device_id,
             aq->id);

  pthread_mutex_unlock (&agent->async_queues_mutex);

  return aq;
}
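
/* For reference, libgomp drives a queue created above through a lifecycle
   along these lines (device 0 is an arbitrary example; FN and DATA are
   placeholders):

     struct goacc_asyncqueue *aq = GOMP_OFFLOAD_openacc_async_construct (0);
     GOMP_OFFLOAD_openacc_async_queue_callback (aq, fn, data);
     if (!GOMP_OFFLOAD_openacc_async_test (aq))
       GOMP_OFFLOAD_openacc_async_synchronize (aq);
     GOMP_OFFLOAD_openacc_async_destruct (aq);  */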

/* Destroy an existing asynchronous thread and queue.  Waits for any
   currently-running task to complete, but cancels any queued tasks.  */

bool
GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = aq->agent;

  finalize_async_thread (aq);

  pthread_mutex_lock (&agent->async_queues_mutex);

  int err;
  if ((err = pthread_mutex_destroy (&aq->mutex)))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue mutex: %d", err);
      goto fail;
    }
  if (pthread_cond_destroy (&aq->queue_cond_in))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
      goto fail;
    }
  if (pthread_cond_destroy (&aq->queue_cond_out))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
      goto fail;
    }
  hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (aq->hsa_queue);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Error destroying command queue", status);
      goto fail;
    }

  if (aq->prev)
    aq->prev->next = aq->next;
  if (aq->next)
    aq->next->prev = aq->prev;
  if (agent->async_queues == aq)
    agent->async_queues = aq->next;

  GCN_DEBUG ("Async thread %d:%d: destroyed\n", agent->device_id, aq->id);

  free (aq);
  pthread_mutex_unlock (&agent->async_queues_mutex);
  return true;

fail:
  pthread_mutex_unlock (&agent->async_queues_mutex);
  return false;
}

/* Return true if the specified async queue is currently empty.  */

int
GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
{
  return queue_empty (aq);
}

/* Block until the specified queue has executed all its tasks and the
   queue is empty.  */

bool
GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
{
  wait_queue (aq);
  return true;
}

/* Add a serialization point across two async queues.  Any new tasks added to
   AQ2 after this call will not run until all tasks that were on AQ1 at the
   time of this call have completed.  */

bool
GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
                                      struct goacc_asyncqueue *aq2)
{
  /* For serialize, stream aq2 waits for aq1 to complete work that has been
     scheduled to run on it up to this point.  */
  if (aq1 != aq2)
    {
      struct placeholder *placeholderp = queue_push_placeholder (aq1);
      queue_push_asyncwait (aq2, placeholderp);
    }
  return true;
}
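
/* For example, for the OpenACC directive

     #pragma acc wait (1) async (2)

   libgomp arranges (roughly; see oacc-async.c for the real dispatch) for
   this hook to be called with AQ1 being queue 1 and AQ2 being queue 2, so
   that work later enqueued on queue 2 waits for everything queue 1 held at
   the time of the wait.  */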

/* Add an opaque callback to the given async queue.  */

void
GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
                                           void (*fn) (void *), void *data)
{
  queue_push_callback (aq, fn, data);
}

/* Queue up an asynchronous data copy from host to DEVICE.  */

bool
GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
                                     size_t n, struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = get_agent_info (device);
  assert (agent == aq->agent);
  /* The source data does not necessarily remain live until the deferred
     copy happens.  Taking a snapshot of the data here avoids reading
     uninitialised data later, but means that (a) the data is copied twice
     and (b) modifications made to the source between the enqueuing of the
     asynchronous copy and its execution will not be seen.  Both are
     acceptable: an asynchronous copy may read its source at any point
     between being enqueued and completing.  */
  void *src_copy = GOMP_PLUGIN_malloc (n);
  memcpy (src_copy, src, n);
  queue_push_copy (aq, dst, src_copy, n, true);
  return true;
}
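
/* The snapshot above matters because callers may pass transient host
   storage here; a sketch of the hazard it avoids (BUF, D_PTR, DEV and AQ
   are placeholders):

     {
       char buf[256];
       prepare (buf);
       GOMP_OFFLOAD_openacc_async_host2dev (dev, d_ptr, buf, 256, aq);
     }
     ... buf is dead here, but the queued copy may not have run yet ...

   Without the memcpy into SRC_COPY, the deferred copy could read freed or
   reused stack memory.  */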

/* Queue up an asynchronous data copy from DEVICE to host.  */

bool
GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src,
                                     size_t n, struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = get_agent_info (device);
  assert (agent == aq->agent);
  queue_push_copy (aq, dst, src, n, false);
  return true;
}

/* Return the value of property PROP of device number DEVICE.  Currently a
   stub: the device number is validated, and the default (zero) value is
   returned for every property.  */

union goacc_property_value
GOMP_OFFLOAD_openacc_get_property (int device, enum goacc_property prop)
{
  /* TODO: Implement this function for real.  */
  get_agent_info (device);

  union goacc_property_value nullval = { .val = 0 };
  return nullval;
}
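
/* A fuller implementation might serve at least the name and memory queries
   from data the plugin already has.  A sketch, assuming the GOACC_PROPERTY_*
   codes from gomp-constants.h; "agent->name" is a hypothetical field that
   would cache HSA_AGENT_INFO_NAME at device-initialization time, and the
   plugin's heap size is only a stand-in for the real device memory:

     union goacc_property_value propval = { .val = 0 };
     struct agent_info *agent = get_agent_info (device);
     switch (prop)
       {
       case GOACC_PROPERTY_NAME:
         propval.ptr = agent->name;
         break;
       case GOACC_PROPERTY_MEMORY:
         propval.val = gcn_kernel_heap_size;
         break;
       default:
         break;
       }
     return propval;  */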

/* Set up plugin-specific thread-local data (host side).  */

void *
GOMP_OFFLOAD_openacc_create_thread_data (int ord __attribute__((unused)))
{
  struct gcn_thread *thread_data
    = GOMP_PLUGIN_malloc (sizeof (struct gcn_thread));

  thread_data->async = GOMP_ASYNC_SYNC;

  return (void *) thread_data;
}

/* Clean up plugin-specific thread-local data.  */

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}

/* }}} */