/* Plugin for AMD GCN execution.

   Copyright (C) 2013-2019 Free Software Foundation, Inc.

   Contributed by Mentor Embedded

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* {{{ Includes and defines  */

#include "config.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <inttypes.h>
#include <stdbool.h>
#include <limits.h>
#include <hsa.h>
#include <dlfcn.h>
#include <signal.h>
#include "libgomp-plugin.h"
#include "gomp-constants.h"
#include <elf.h>
#include "oacc-plugin.h"
#include "oacc-int.h"
#include <assert.h>

/* Additional definitions not in HSA 1.1.
   FIXME: this needs to be updated in hsa.h for upstream, but the only source
   right now is the ROCr source which may cause license issues.  */
#define HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT 0xA002

/* These probably won't be in elf.h for a while.  */
#define R_AMDGPU_NONE           0
#define R_AMDGPU_ABS32_LO       1   /* (S + A) & 0xFFFFFFFF  */
#define R_AMDGPU_ABS32_HI       2   /* (S + A) >> 32  */
#define R_AMDGPU_ABS64          3   /* S + A  */
#define R_AMDGPU_REL32          4   /* S + A - P  */
#define R_AMDGPU_REL64          5   /* S + A - P  */
#define R_AMDGPU_ABS32          6   /* S + A  */
#define R_AMDGPU_GOTPCREL       7   /* G + GOT + A - P  */
#define R_AMDGPU_GOTPCREL32_LO  8   /* (G + GOT + A - P) & 0xFFFFFFFF  */
#define R_AMDGPU_GOTPCREL32_HI  9   /* (G + GOT + A - P) >> 32  */
#define R_AMDGPU_REL32_LO       10  /* (S + A - P) & 0xFFFFFFFF  */
#define R_AMDGPU_REL32_HI       11  /* (S + A - P) >> 32  */
/* Relocation type 12 is reserved; defining a lower-case "reserved" macro
   here would be hazardous, so just leave a gap.  */
#define R_AMDGPU_RELATIVE64     13  /* B + A  */

/* GCN specific definitions for asynchronous queues.  */

#define ASYNC_QUEUE_SIZE 64
#define DRAIN_QUEUE_SYNCHRONOUS_P false
#define DEBUG_QUEUES 0
#define DEBUG_THREAD_SLEEP 0
#define DEBUG_THREAD_SIGNAL 0

/* Defaults.  */
#define DEFAULT_GCN_HEAP_SIZE (100*1024*1024)  /* 100MB.  */

/* Secure getenv() which returns NULL if running as SUID/SGID.  */
#ifndef HAVE_SECURE_GETENV
#ifdef HAVE___SECURE_GETENV
#define secure_getenv __secure_getenv
#elif defined (HAVE_UNISTD_H) && defined(HAVE_GETUID) && defined(HAVE_GETEUID) \
  && defined(HAVE_GETGID) && defined(HAVE_GETEGID)

#include <unistd.h>

/* Implementation of secure_getenv() for targets where it is not provided but
   we at least have the means to test real and effective IDs.  */

static char *
secure_getenv (const char *name)
{
  if ((getuid () == geteuid ()) && (getgid () == getegid ()))
    return getenv (name);
  else
    return NULL;
}

#else
#define secure_getenv getenv
#endif
#endif

/* }}} */
/* {{{ Types */

/* GCN-specific implementation of the GOMP_PLUGIN_acc_thread data.  */

struct gcn_thread
{
  /* The thread number from the async clause, or GOMP_ASYNC_SYNC.  */
  int async;
};

/* As the HSA runtime is dlopened, the following structure defines the
   function pointers utilized by the HSA plug-in.  */

struct hsa_runtime_fn_info
{
  /* HSA runtime.  */
  hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
					const char **status_string);
  hsa_status_t (*hsa_system_get_info_fn) (hsa_system_info_t attribute,
					  void *value);
  hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
					 hsa_agent_info_t attribute,
					 void *value);
  hsa_status_t (*hsa_isa_get_info_fn) (hsa_isa_t isa,
				       hsa_isa_info_t attribute,
				       uint32_t index,
				       void *value);
  hsa_status_t (*hsa_init_fn) (void);
  hsa_status_t (*hsa_iterate_agents_fn)
    (hsa_status_t (*callback) (hsa_agent_t agent, void *data), void *data);
  hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
					  hsa_region_info_t attribute,
					  void *value);
  hsa_status_t (*hsa_queue_create_fn)
    (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
     void (*callback) (hsa_status_t status, hsa_queue_t *source, void *data),
     void *data, uint32_t private_segment_size,
     uint32_t group_segment_size, hsa_queue_t **queue);
  hsa_status_t (*hsa_agent_iterate_regions_fn)
    (hsa_agent_t agent,
     hsa_status_t (*callback) (hsa_region_t region, void *data), void *data);
  hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
  hsa_status_t (*hsa_executable_create_fn)
    (hsa_profile_t profile, hsa_executable_state_t executable_state,
     const char *options, hsa_executable_t *executable);
  hsa_status_t (*hsa_executable_global_variable_define_fn)
    (hsa_executable_t executable, const char *variable_name, void *address);
  hsa_status_t (*hsa_executable_load_code_object_fn)
    (hsa_executable_t executable, hsa_agent_t agent,
     hsa_code_object_t code_object, const char *options);
  hsa_status_t (*hsa_executable_freeze_fn) (hsa_executable_t executable,
					    const char *options);
  hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
					uint32_t num_consumers,
					const hsa_agent_t *consumers,
					hsa_signal_t *signal);
  hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
					  void **ptr);
  hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent,
					      hsa_access_permission_t access);
  hsa_status_t (*hsa_memory_copy_fn) (void *dst, const void *src, size_t size);
  hsa_status_t (*hsa_memory_free_fn) (void *ptr);
  hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
  hsa_status_t (*hsa_executable_get_symbol_fn)
    (hsa_executable_t executable, const char *module_name,
     const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
     hsa_executable_symbol_t *symbol);
  hsa_status_t (*hsa_executable_symbol_get_info_fn)
    (hsa_executable_symbol_t executable_symbol,
     hsa_executable_symbol_info_t attribute, void *value);
  hsa_status_t (*hsa_executable_iterate_symbols_fn)
    (hsa_executable_t executable,
     hsa_status_t (*callback) (hsa_executable_t executable,
			       hsa_executable_symbol_t symbol, void *data),
     void *data);
  uint64_t (*hsa_queue_add_write_index_release_fn) (const hsa_queue_t *queue,
						    uint64_t value);
  uint64_t (*hsa_queue_load_read_index_acquire_fn) (const hsa_queue_t *queue);
  void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
				       hsa_signal_value_t value);
  void (*hsa_signal_store_release_fn) (hsa_signal_t signal,
				       hsa_signal_value_t value);
  hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
    (hsa_signal_t signal, hsa_signal_condition_t condition,
     hsa_signal_value_t compare_value, uint64_t timeout_hint,
     hsa_wait_state_t wait_state_hint);
  hsa_signal_value_t (*hsa_signal_load_acquire_fn) (hsa_signal_t signal);
  hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);

  hsa_status_t (*hsa_code_object_deserialize_fn)
    (void *serialized_code_object, size_t serialized_code_object_size,
     const char *options, hsa_code_object_t *code_object);
};

/* Structure describing the run-time and grid properties of an HSA kernel
   launch.  This needs to match the format passed to GOMP_OFFLOAD_run.  */

struct GOMP_kernel_launch_attributes
{
  /* Number of dimensions the workload has.  Maximum number is 3.  */
  uint32_t ndim;
  /* Size of the grid in the three respective dimensions.  */
  uint32_t gdims[3];
  /* Size of work-groups in the respective dimensions.  */
  uint32_t wdims[3];
};

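/* An illustrative (not ABI-mandated) example: the default OpenMP launch
   built by parse_target_attributes, below, on a hypothetical 40-CU device
   would correspond to

     struct GOMP_kernel_launch_attributes kla = {
       .ndim  = 3,
       .gdims = { 40, 64, 16 },   (teams x work-items x wavefronts)
       .wdims = { 1, 64, 16 }     (one team per work-group)
     };

   The actual values depend on the device CU count and any environment
   overrides.  */
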
/* Collection of information needed for a dispatch of a kernel.  */

struct kernel_dispatch
{
  struct agent_info *agent;
  /* Pointer to a command queue associated with a kernel dispatch agent.  */
  void *queue;
  /* Pointer to a memory space used for passing kernel arguments.  */
  void *kernarg_address;
  /* Kernel object.  */
  uint64_t object;
  /* Synchronization signal used for dispatch synchronization.  */
  uint64_t signal;
  /* Private segment size.  */
  uint32_t private_segment_size;
  /* Group segment size.  */
  uint32_t group_segment_size;
};

/* Structure of the kernargs segment, supporting console output.

   This needs to match the definitions in Newlib, and the expectations
   in libgomp target code.  */

struct kernargs {
  /* Leave space for the real kernel arguments.
     OpenACC and OpenMP only use one pointer.  */
  int64_t dummy1;
  int64_t dummy2;

  /* A pointer to struct output, below, for console output data.  */
  int64_t out_ptr;

  /* A pointer to struct heap, below.  */
  int64_t heap_ptr;

  /* A pointer to an ephemeral memory arena.
     Only needed for OpenMP.  */
  int64_t arena_ptr;

  /* Output data.  */
  struct output {
    int return_value;
    unsigned int next_output;
    struct printf_data {
      int written;
      char msg[128];
      int type;
      union {
        int64_t ivalue;
        double dvalue;
        char text[128];
      };
    } queue[1024];
    unsigned int consumed;
  } output_data;
};

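/* Sketch of the console-output handshake implied by the structure above
   (the device-side producer lives in Newlib, not in this file; slot
   numbers and field values here are hypothetical):

     device:  slot = next_output++;           (atomic increment)
              queue[slot % 1024].type = 2;    (string payload)
              ...fill msg and text...
              queue[slot % 1024].written = 1;

     host:    console_output, below, walks from output_data.consumed up
              to next_output, prints each entry whose "written" flag is
              set, clears the flag, and advances "consumed".  */
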
/* A queue entry for a future asynchronous launch.  */

struct kernel_launch
{
  struct kernel_info *kernel;
  void *vars;
  struct GOMP_kernel_launch_attributes kla;
};

/* A queue entry for a future callback.  */

struct callback
{
  void (*fn)(void *);
  void *data;
};

/* A data struct for the copy_data callback.  */

struct copy_data
{
  void *dst;
  const void *src;
  size_t len;
  bool free_src;
  struct goacc_asyncqueue *aq;
};

/* A queue entry for a placeholder.  These correspond to a wait event.  */

struct placeholder
{
  int executed;
  pthread_cond_t cond;
  pthread_mutex_t mutex;
};

/* A queue entry for a wait directive.  */

struct asyncwait_info
{
  struct placeholder *placeholderp;
};

/* Encode the type of an entry in an async queue.  */

enum entry_type
{
  KERNEL_LAUNCH,
  CALLBACK,
  ASYNC_WAIT,
  ASYNC_PLACEHOLDER
};

/* An entry in an async queue.  */

struct queue_entry
{
  enum entry_type type;
  union {
    struct kernel_launch launch;
    struct callback callback;
    struct asyncwait_info asyncwait;
    struct placeholder placeholder;
  } u;
};

/* An async queue header.

   OpenMP may create one of these.
   OpenACC may create many.  */

struct goacc_asyncqueue
{
  struct agent_info *agent;
  hsa_queue_t *hsa_queue;

  pthread_t thread_drain_queue;
  pthread_mutex_t mutex;
  pthread_cond_t queue_cond_in;
  pthread_cond_t queue_cond_out;
  struct queue_entry queue[ASYNC_QUEUE_SIZE];
  int queue_first;
  int queue_n;
  int drain_queue_stop;

  int id;
  struct goacc_asyncqueue *prev;
  struct goacc_asyncqueue *next;
};

/* Mkoffload uses this structure to describe a kernel.

   OpenMP kernel dimensions are passed at runtime.
   OpenACC kernel dimensions are passed at compile time, here.  */

struct hsa_kernel_description
{
  const char *name;
  int oacc_dims[3];  /* Only present for GCN kernels.  */
};

/* Mkoffload uses this structure to describe an offload variable.  */

struct global_var_info
{
  const char *name;
  void *address;
};

/* Mkoffload uses this structure to describe all the kernels in a
   loadable module.  These are passed to libgomp via static constructors.  */

struct gcn_image_desc
{
  struct gcn_image {
    size_t size;
    void *image;
  } *gcn_image;
  const unsigned kernel_count;
  struct hsa_kernel_description *kernel_infos;
  const unsigned global_variable_count;
  struct global_var_info *global_variables;
};

/* Description of an HSA GPU agent (device) and the program associated with
   it.  */

struct agent_info
{
  /* The HSA ID of the agent.  Assigned when hsa_context is initialized.  */
  hsa_agent_t id;
  /* The user-visible device number.  */
  int device_id;
  /* Whether the agent has been initialized.  The fields below are usable only
     if it has been.  */
  bool initialized;
  /* Precomputed check for problem architectures.  */
  bool gfx900_p;

  /* Command queues of the agent.  */
  hsa_queue_t *sync_queue;
  struct goacc_asyncqueue *async_queues, *omp_async_queue;
  pthread_mutex_t async_queues_mutex;

  /* The HSA memory region from which to allocate kernel arguments.  */
  hsa_region_t kernarg_region;

  /* The HSA memory region from which to allocate device data.  */
  hsa_region_t data_region;

  /* Allocated team arenas.  */
  struct team_arena_list *team_arena_list;
  pthread_mutex_t team_arena_write_lock;

  /* Read-write lock that protects kernels which are running or about to be
     run from interference with loading and unloading of images.  Needs to be
     locked for reading while a kernel is being run, and for writing if the
     list of modules is manipulated (and thus the HSA program invalidated).  */
  pthread_rwlock_t module_rwlock;

  /* The module associated with this agent.  */
  struct module_info *module;

  /* Mutex enforcing that only one thread will finalize the HSA program.  A
     thread should have locked agent->module_rwlock for reading before
     acquiring it.  */
  pthread_mutex_t prog_mutex;
  /* Flag whether the HSA program that consists of all the modules has been
     finalized.  */
  bool prog_finalized;
  /* HSA executable - the finalized program that is used to locate kernels.  */
  hsa_executable_t executable;
};

/* Information required to identify, finalize and run any given kernel.  */

enum offload_kind {KIND_UNKNOWN, KIND_OPENMP, KIND_OPENACC};

struct kernel_info
{
  /* Name of the kernel, required to locate it within the GCN object-code
     module.  */
  const char *name;
  /* The specific agent the kernel has been or will be finalized for and run
     on.  */
  struct agent_info *agent;
  /* The specific module where the kernel resides.  */
  struct module_info *module;
  /* Mutex enforcing that at most one thread ever initializes a kernel for
     use.  A thread should have locked agent->module_rwlock for reading before
     acquiring it.  */
  pthread_mutex_t init_mutex;
  /* Flag indicating whether the kernel has been initialized and all fields
     below it contain valid data.  */
  bool initialized;
  /* Flag indicating that the kernel has a problem that blocks its
     execution.  */
  bool initialization_failed;
  /* The object to be put into the dispatch queue.  */
  uint64_t object;
  /* Required size of kernel arguments.  */
  uint32_t kernarg_segment_size;
  /* Required size of group segment.  */
  uint32_t group_segment_size;
  /* Required size of private segment.  */
  uint32_t private_segment_size;
  /* Set up for OpenMP or OpenACC?  */
  enum offload_kind kind;
};

/* Information about a particular GCN module, its image and kernels.  */

struct module_info
{
  /* The description with which the program has registered the image.  */
  struct gcn_image_desc *image_desc;
  /* GCN heap allocation.  */
  struct heap *heap;
  /* Physical boundaries of the loaded module.  */
  Elf64_Addr phys_address_start;
  Elf64_Addr phys_address_end;

  bool constructors_run_p;
  struct kernel_info *init_array_func, *fini_array_func;

  /* Number of kernels in this module.  */
  int kernel_count;
  /* An array of kernel_info structures describing each kernel in this
     module.  */
  struct kernel_info kernels[];
};

/* A linked list of memory arenas allocated on the device.
   These are only used by OpenMP, as a means to optimize per-team malloc.  */

struct team_arena_list
{
  struct team_arena_list *next;

  /* The number of teams determines the size of the allocation.  */
  int num_teams;
  /* The device address of the arena itself.  */
  void *arena;
  /* A flag to prevent two asynchronous kernels trying to use the same arena.
     The mutex is locked until the kernel exits.  */
  pthread_mutex_t in_use;
};

/* Information about the whole HSA environment and all of its agents.  */

struct hsa_context_info
{
  /* Whether the structure has been initialized.  */
  bool initialized;
  /* Number of usable GPU HSA agents in the system.  */
  int agent_count;
  /* Array of agent_info structures describing the individual HSA agents.  */
  struct agent_info *agents;
};

/* Format of the on-device heap.

   This must match the definition in Newlib and gcn-run.  */

struct heap {
  int64_t size;
  char data[0];
};

/* }}} */
/* {{{ Global variables */

/* Information about the whole HSA environment and all of its agents.  */

static struct hsa_context_info hsa_context;

/* HSA runtime functions that are initialized in init_hsa_context.  */

static struct hsa_runtime_fn_info hsa_fns;

/* Heap space, allocated target-side, provided for use of newlib malloc.
   Each module should have its own heap allocated.
   Beware that heap usage increases with OpenMP teams.  See also arenas.  */

static size_t gcn_kernel_heap_size = DEFAULT_GCN_HEAP_SIZE;

/* Flag to decide whether to print information about what is going on to
   stderr.  Set in init_debug depending on environment variables.  */

static bool debug;

/* Flag to decide if the runtime should suppress a possible fallback to host
   execution.  */

static bool suppress_host_fallback;

/* Name of the HSA runtime shared library that is dlopened by this
   plug-in.  */

static const char *hsa_runtime_lib;

/* Flag to decide if the runtime should also support CPU devices (can be
   a simulator).  */

static bool support_cpu_devices;

/* Runtime dimension overrides.  Zero indicates default.  */

static int override_x_dim = 0;
static int override_z_dim = 0;

586/* {{{ Debug & Diagnostic */
587
588/* Print a message to stderr if GCN_DEBUG value is set to true. */
589
590#define DEBUG_PRINT(...) \
591 do \
592 { \
593 if (debug) \
594 { \
595 fprintf (stderr, __VA_ARGS__); \
596 } \
597 } \
598 while (false);
599
600/* Flush stderr if GCN_DEBUG value is set to true. */
601
602#define DEBUG_FLUSH() \
603 do { \
604 if (debug) \
605 fflush (stderr); \
606 } while (false)
607
608/* Print a logging message with PREFIX to stderr if GCN_DEBUG value
609 is set to true. */
610
611#define DEBUG_LOG(prefix, ...) \
612 do \
613 { \
614 DEBUG_PRINT (prefix); \
615 DEBUG_PRINT (__VA_ARGS__); \
616 DEBUG_FLUSH (); \
617 } while (false)
618
619/* Print a debugging message to stderr. */
620
621#define GCN_DEBUG(...) DEBUG_LOG ("GCN debug: ", __VA_ARGS__)
622
623/* Print a warning message to stderr. */
624
625#define GCN_WARNING(...) DEBUG_LOG ("GCN warning: ", __VA_ARGS__)
626
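/* Usage sketch (hypothetical call sites):

     GCN_DEBUG ("Launching kernel %s\n", kernel->name);
     GCN_WARNING ("Unexpected grid size %u\n", size);

   Each expands to an fprintf guarded by the "debug" flag, plus a flush,
   so the macros cost almost nothing unless GCN_DEBUG is set in the
   environment.  */
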
/* Print HSA warning STR with an HSA STATUS code.  */

static void
hsa_warn (const char *str, hsa_status_t status)
{
  if (!debug)
    return;

  const char *hsa_error_msg = "[unknown]";
  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);

  fprintf (stderr, "GCN warning: %s\nRuntime message: %s\n", str,
	   hsa_error_msg);
}

/* Report a fatal error STR together with the HSA error corresponding to
   STATUS and terminate execution of the current process.  */

static void
hsa_fatal (const char *str, hsa_status_t status)
{
  const char *hsa_error_msg = "[unknown]";
  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
  GOMP_PLUGIN_fatal ("GCN fatal error: %s\nRuntime message: %s\n", str,
		     hsa_error_msg);
}

/* Like hsa_fatal, except only report the error message, and return FALSE
   so that error handling can propagate outside the plugin.  */

static bool
hsa_error (const char *str, hsa_status_t status)
{
  const char *hsa_error_msg = "[unknown]";
  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
  GOMP_PLUGIN_error ("GCN fatal error: %s\nRuntime message: %s\n", str,
		     hsa_error_msg);
  return false;
}

/* Dump information about the available hardware.  */

static void
dump_hsa_system_info (void)
{
  hsa_status_t status;

  hsa_endianness_t endianness;
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_ENDIANNESS,
					   &endianness);
  if (status == HSA_STATUS_SUCCESS)
    switch (endianness)
      {
      case HSA_ENDIANNESS_LITTLE:
	GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: LITTLE\n");
	break;
      case HSA_ENDIANNESS_BIG:
	GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: BIG\n");
	break;
      default:
	GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: UNKNOWN\n");
      }
  else
    GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: FAILED\n");

  uint8_t extensions[128];
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_EXTENSIONS,
					   &extensions);
  if (status == HSA_STATUS_SUCCESS)
    {
      if (extensions[0] & (1 << HSA_EXTENSION_IMAGES))
	GCN_DEBUG ("HSA_SYSTEM_INFO_EXTENSIONS: IMAGES\n");
    }
  else
    GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n");
}

/* Dump information about the agent's machine model.  */

static void
dump_machine_model (hsa_machine_model_t machine_model, const char *s)
{
  switch (machine_model)
    {
    case HSA_MACHINE_MODEL_SMALL:
      GCN_DEBUG ("%s: SMALL\n", s);
      break;
    case HSA_MACHINE_MODEL_LARGE:
      GCN_DEBUG ("%s: LARGE\n", s);
      break;
    default:
      GCN_WARNING ("%s: UNKNOWN\n", s);
      break;
    }
}

/* Dump information about the agent's HSA profile.  */

static void
dump_profile (hsa_profile_t profile, const char *s)
{
  switch (profile)
    {
    case HSA_PROFILE_FULL:
      GCN_DEBUG ("%s: FULL\n", s);
      break;
    case HSA_PROFILE_BASE:
      GCN_DEBUG ("%s: BASE\n", s);
      break;
    default:
      GCN_WARNING ("%s: UNKNOWN\n", s);
      break;
    }
}

/* Dump information about a device memory region.  */

static hsa_status_t
dump_hsa_region (hsa_region_t region, void *data __attribute__((unused)))
{
  hsa_status_t status;

  hsa_region_segment_t segment;
  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
					   &segment);
  /* Remember whether SEGMENT is valid; it must not be read otherwise.  */
  bool segment_known = (status == HSA_STATUS_SUCCESS);
  if (segment_known)
    {
      if (segment == HSA_REGION_SEGMENT_GLOBAL)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GLOBAL\n");
      else if (segment == HSA_REGION_SEGMENT_READONLY)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: READONLY\n");
      else if (segment == HSA_REGION_SEGMENT_PRIVATE)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: PRIVATE\n");
      else if (segment == HSA_REGION_SEGMENT_GROUP)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GROUP\n");
      else
	GCN_WARNING ("HSA_REGION_INFO_SEGMENT: UNKNOWN\n");
    }
  else
    GCN_WARNING ("HSA_REGION_INFO_SEGMENT: FAILED\n");

  if (segment_known && segment == HSA_REGION_SEGMENT_GLOBAL)
    {
      uint32_t flags;
      status
	= hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
					  &flags);
      if (status == HSA_STATUS_SUCCESS)
	{
	  if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: KERNARG\n");
	  if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED)
	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: FINE_GRAINED\n");
	  if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED)
	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: COARSE_GRAINED\n");
	}
      else
	GCN_WARNING ("HSA_REGION_INFO_GLOBAL_FLAGS: FAILED\n");
    }

  size_t size;
  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_SIZE: %zu\n", size);
  else
    GCN_WARNING ("HSA_REGION_INFO_SIZE: FAILED\n");

  status
    = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_ALLOC_MAX_SIZE,
				      &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_ALLOC_MAX_SIZE: %zu\n", size);
  else
    GCN_WARNING ("HSA_REGION_INFO_ALLOC_MAX_SIZE: FAILED\n");

  bool alloc_allowed;
  status
    = hsa_fns.hsa_region_get_info_fn (region,
				      HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED,
				      &alloc_allowed);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: %u\n", alloc_allowed);
  else
    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: FAILED\n");

  if (status != HSA_STATUS_SUCCESS || !alloc_allowed)
    return HSA_STATUS_SUCCESS;

  status
    = hsa_fns.hsa_region_get_info_fn (region,
				      HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE,
				      &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: %zu\n", size);
  else
    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: FAILED\n");

  size_t align;
  status
    = hsa_fns.hsa_region_get_info_fn (region,
				      HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT,
				      &align);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: %zu\n", align);
  else
    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: FAILED\n");

  return HSA_STATUS_SUCCESS;
}

/* Dump information about all the device memory regions.  */

static void
dump_hsa_regions (hsa_agent_t agent)
{
  hsa_status_t status;
  status = hsa_fns.hsa_agent_iterate_regions_fn (agent,
						 dump_hsa_region,
						 NULL);
  if (status != HSA_STATUS_SUCCESS)
    hsa_error ("Dumping hsa regions failed", status);
}

/* Dump information about the available devices.  */

static hsa_status_t
dump_hsa_agent_info (hsa_agent_t agent, void *data __attribute__((unused)))
{
  hsa_status_t status;

  char buf[64];
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME,
					  &buf);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_NAME: %s\n", buf);
  else
    GCN_WARNING ("HSA_AGENT_INFO_NAME: FAILED\n");

  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_VENDOR_NAME,
					  &buf);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_VENDOR_NAME: %s\n", buf);
  else
    GCN_WARNING ("HSA_AGENT_INFO_VENDOR_NAME: FAILED\n");

  hsa_machine_model_t machine_model;
  status
    = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_MACHINE_MODEL,
				     &machine_model);
  if (status == HSA_STATUS_SUCCESS)
    dump_machine_model (machine_model, "HSA_AGENT_INFO_MACHINE_MODEL");
  else
    GCN_WARNING ("HSA_AGENT_INFO_MACHINE_MODEL: FAILED\n");

  hsa_profile_t profile;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_PROFILE,
					  &profile);
  if (status == HSA_STATUS_SUCCESS)
    dump_profile (profile, "HSA_AGENT_INFO_PROFILE");
  else
    GCN_WARNING ("HSA_AGENT_INFO_PROFILE: FAILED\n");

  hsa_device_type_t device_type;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
					  &device_type);
  if (status == HSA_STATUS_SUCCESS)
    {
      switch (device_type)
	{
	case HSA_DEVICE_TYPE_CPU:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: CPU\n");
	  break;
	case HSA_DEVICE_TYPE_GPU:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: GPU\n");
	  break;
	case HSA_DEVICE_TYPE_DSP:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: DSP\n");
	  break;
	default:
	  GCN_WARNING ("HSA_AGENT_INFO_DEVICE: UNKNOWN\n");
	  break;
	}
    }
  else
    GCN_WARNING ("HSA_AGENT_INFO_DEVICE: FAILED\n");

  uint32_t cu_count;
  status = hsa_fns.hsa_agent_get_info_fn
    (agent, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: %u\n", cu_count);
  else
    GCN_WARNING ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: FAILED\n");

  uint32_t size;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_WAVEFRONT_SIZE,
					  &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WAVEFRONT_SIZE: %u\n", size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WAVEFRONT_SIZE: FAILED\n");

  uint32_t max_dim;
  status = hsa_fns.hsa_agent_get_info_fn (agent,
					  HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
					  &max_dim);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: %u\n", max_dim);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: FAILED\n");

  uint32_t max_size;
  status = hsa_fns.hsa_agent_get_info_fn (agent,
					  HSA_AGENT_INFO_WORKGROUP_MAX_SIZE,
					  &max_size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: %u\n", max_size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: FAILED\n");

  uint32_t grid_max_dim;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_DIM,
					  &grid_max_dim);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_DIM: %u\n", grid_max_dim);
  else
    GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_DIM: FAILED\n");

  uint32_t grid_max_size;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_SIZE,
					  &grid_max_size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_SIZE: %u\n", grid_max_size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_SIZE: FAILED\n");

  dump_hsa_regions (agent);

  return HSA_STATUS_SUCCESS;
}

/* Forward reference.  */

static char *get_executable_symbol_name (hsa_executable_symbol_t symbol);

/* Helper function for dump_executable_symbols.  */

static hsa_status_t
dump_executable_symbol (hsa_executable_t executable,
			hsa_executable_symbol_t symbol,
			void *data __attribute__((unused)))
{
  char *name = get_executable_symbol_name (symbol);

  if (name)
    {
      GCN_DEBUG ("executable symbol: %s\n", name);
      free (name);
    }

  return HSA_STATUS_SUCCESS;
}

/* Dump all global symbols in an executable.  */

static void
dump_executable_symbols (hsa_executable_t executable)
{
  hsa_status_t status;
  status
    = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
						 dump_executable_symbol,
						 NULL);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not dump HSA executable symbols", status);
}

/* Dump kernel DISPATCH data structure and indent it by INDENT spaces.  */

static void
print_kernel_dispatch (struct kernel_dispatch *dispatch, unsigned indent)
{
  struct kernargs *kernargs = (struct kernargs *)dispatch->kernarg_address;

  fprintf (stderr, "%*sthis: %p\n", indent, "", dispatch);
  fprintf (stderr, "%*squeue: %p\n", indent, "", dispatch->queue);
  fprintf (stderr, "%*skernarg_address: %p\n", indent, "", kernargs);
  fprintf (stderr, "%*sheap address: %p\n", indent, "",
	   (void*)kernargs->heap_ptr);
  fprintf (stderr, "%*sarena address: %p\n", indent, "",
	   (void*)kernargs->arena_ptr);
  fprintf (stderr, "%*sobject: %" PRIu64 "\n", indent, "", dispatch->object);
  fprintf (stderr, "%*sprivate_segment_size: %u\n", indent, "",
	   dispatch->private_segment_size);
  fprintf (stderr, "%*sgroup_segment_size: %u\n", indent, "",
	   dispatch->group_segment_size);
  fprintf (stderr, "\n");
}

/* }}} */
/* {{{ Utility functions */

/* Cast the thread local storage to gcn_thread.  */

static inline struct gcn_thread *
gcn_thread (void)
{
  return (struct gcn_thread *) GOMP_PLUGIN_acc_thread ();
}

/* Initialize debug and suppress_host_fallback according to the environment.  */

static void
init_environment_variables (void)
{
  if (secure_getenv ("GCN_DEBUG"))
    debug = true;
  else
    debug = false;

  if (secure_getenv ("GCN_SUPPRESS_HOST_FALLBACK"))
    suppress_host_fallback = true;
  else
    suppress_host_fallback = false;

  hsa_runtime_lib = secure_getenv ("HSA_RUNTIME_LIB");
  if (hsa_runtime_lib == NULL)
    hsa_runtime_lib = HSA_RUNTIME_LIB "libhsa-runtime64.so";

  support_cpu_devices = secure_getenv ("GCN_SUPPORT_CPU_DEVICES");

  const char *x = secure_getenv ("GCN_NUM_TEAMS");
  if (!x)
    x = secure_getenv ("GCN_NUM_GANGS");
  if (x)
    override_x_dim = atoi (x);

  const char *z = secure_getenv ("GCN_NUM_THREADS");
  if (!z)
    z = secure_getenv ("GCN_NUM_WORKERS");
  if (z)
    override_z_dim = atoi (z);

  const char *heap = secure_getenv ("GCN_HEAP_SIZE");
  if (heap)
    {
      size_t tmp = atol (heap);
      if (tmp)
	gcn_kernel_heap_size = tmp;
    }
}

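/* For example, a run such as

     GCN_DEBUG=1 GCN_NUM_TEAMS=4 GCN_HEAP_SIZE=268435456 ./a.out

   enables the debug output above, overrides the X dimension to 4 teams,
   and grows the device heap to 256MB.  (Illustrative only; any program
   offloading through this plugin will do.)  */
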
/* Return malloc'd string with name of SYMBOL.  */

static char *
get_executable_symbol_name (hsa_executable_symbol_t symbol)
{
  hsa_status_t status;
  char *res;
  uint32_t len;
  const hsa_executable_symbol_info_t info_name_length
    = HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH;

  status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name_length,
						      &len);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not get length of symbol name", status);
      return NULL;
    }

  res = GOMP_PLUGIN_malloc (len + 1);

  const hsa_executable_symbol_info_t info_name
    = HSA_EXECUTABLE_SYMBOL_INFO_NAME;

  status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name, res);

  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not get symbol name", status);
      free (res);
      return NULL;
    }

  res[len] = '\0';

  return res;
}

/* Helper function for find_executable_symbol.  */

static hsa_status_t
find_executable_symbol_1 (hsa_executable_t executable,
			  hsa_executable_symbol_t symbol,
			  void *data)
{
  hsa_executable_symbol_t *res = (hsa_executable_symbol_t *)data;
  *res = symbol;
  return HSA_STATUS_INFO_BREAK;
}

/* Find a global symbol in EXECUTABLE, save to *SYMBOL and return true.  If not
   found, return false.  */

static bool
find_executable_symbol (hsa_executable_t executable,
			hsa_executable_symbol_t *symbol)
{
  hsa_status_t status;

  status
    = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
						 find_executable_symbol_1,
						 symbol);
  if (status != HSA_STATUS_INFO_BREAK)
    {
      hsa_error ("Could not find executable symbol", status);
      return false;
    }

  return true;
}

/* Get the number of GPU Compute Units.  */

static int
get_cu_count (struct agent_info *agent)
{
  uint32_t cu_count;
  hsa_status_t status = hsa_fns.hsa_agent_get_info_fn
    (agent->id, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
  if (status == HSA_STATUS_SUCCESS)
    return cu_count;
  else
    return 64;  /* The usual number for older devices.  */
}

/* Calculate the maximum grid size for OMP threads / OACC workers.
   This depends on the kernel's resource usage levels.  */

static int
limit_worker_threads (int threads)
{
  /* FIXME: Do something more intelligent here.
     GCN can always run 4 threads within a Compute Unit, but
     more than that depends on register usage.  */
  if (threads > 16)
    threads = 16;
  return threads;
}

/* Parse the target attributes INPUT provided by the compiler and return true
   if we should run anything at all.  If INPUT is NULL, fill DEF with default
   values, then store INPUT or DEF into *RESULT.

   This is used for OpenMP only.  */

static bool
parse_target_attributes (void **input,
			 struct GOMP_kernel_launch_attributes *def,
			 struct GOMP_kernel_launch_attributes **result,
			 struct agent_info *agent)
{
  if (!input)
    GOMP_PLUGIN_fatal ("No target arguments provided");

  bool grid_attrs_found = false;
  bool gcn_dims_found = false;
  int gcn_teams = 0;
  int gcn_threads = 0;
  while (*input)
    {
      intptr_t id = (intptr_t) *input++, val;

      if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
	val = (intptr_t) *input++;
      else
	val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;

      val = (val > INT_MAX) ? INT_MAX : val;

      if ((id & GOMP_TARGET_ARG_DEVICE_MASK) == GOMP_DEVICE_GCN
	  && ((id & GOMP_TARGET_ARG_ID_MASK)
	      == GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES))
	{
	  grid_attrs_found = true;
	  break;
	}
      else if ((id & GOMP_TARGET_ARG_DEVICE_ALL) == GOMP_TARGET_ARG_DEVICE_ALL)
	{
	  gcn_dims_found = true;
	  switch (id & GOMP_TARGET_ARG_ID_MASK)
	    {
	    case GOMP_TARGET_ARG_NUM_TEAMS:
	      gcn_teams = val;
	      break;
	    case GOMP_TARGET_ARG_THREAD_LIMIT:
	      gcn_threads = limit_worker_threads (val);
	      break;
	    default:
	      ;
	    }
	}
    }

  if (gcn_dims_found)
    {
      if (agent->gfx900_p && gcn_threads == 0 && override_z_dim == 0)
	{
	  gcn_threads = 4;
	  GCN_WARNING ("VEGA BUG WORKAROUND: reducing default number of "
		       "threads to 4 per team.\n");
	  GCN_WARNING (" - If this is not a Vega 10 device, please use "
		       "GCN_NUM_THREADS=16\n");
	}

      def->ndim = 3;
      /* Fiji has 64 CUs, but Vega20 has 60.  */
      def->gdims[0] = (gcn_teams > 0) ? gcn_teams : get_cu_count (agent);
      /* Each thread is 64 work items wide.  */
      def->gdims[1] = 64;
      /* A work group can have 16 wavefronts.  */
      def->gdims[2] = (gcn_threads > 0) ? gcn_threads : 16;
      def->wdims[0] = 1;  /* Single team per work-group.  */
      def->wdims[1] = 64;
      def->wdims[2] = 16;
      *result = def;
      return true;
    }
  else if (!grid_attrs_found)
    {
      def->ndim = 1;
      def->gdims[0] = 1;
      def->gdims[1] = 1;
      def->gdims[2] = 1;
      def->wdims[0] = 1;
      def->wdims[1] = 1;
      def->wdims[2] = 1;
      *result = def;
      GCN_WARNING ("GOMP_OFFLOAD_run called with no launch attributes\n");
      return true;
    }

  struct GOMP_kernel_launch_attributes *kla;
  kla = (struct GOMP_kernel_launch_attributes *) *input;
  *result = kla;
  if (kla->ndim == 0 || kla->ndim > 3)
    GOMP_PLUGIN_fatal ("Invalid number of dimensions (%u)", kla->ndim);

  GCN_DEBUG ("GOMP_OFFLOAD_run called with %u dimensions:\n", kla->ndim);
  unsigned i;
  for (i = 0; i < kla->ndim; i++)
    {
      GCN_DEBUG (" Dimension %u: grid size %u and group size %u\n", i,
		 kla->gdims[i], kla->wdims[i]);
      if (kla->gdims[i] == 0)
	return false;
    }
  return true;
}

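/* A worked example of the encoding decoded above, using hypothetical
   values: a compiler-emitted entry

     id = GOMP_TARGET_ARG_DEVICE_ALL | GOMP_TARGET_ARG_NUM_TEAMS
	  | (8 << GOMP_TARGET_ARG_VALUE_SHIFT)

   carries no GOMP_TARGET_ARG_SUBSEQUENT_PARAM bit, so the value 8 is
   recovered as "id >> GOMP_TARGET_ARG_VALUE_SHIFT" and lands in
   gcn_teams; values too large for the shifted field are instead passed
   in the following pointer-sized element of INPUT.  */
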
/* Return the group size given the requested GROUP size, GRID size and number
   of grid dimensions NDIM.  */

static uint32_t
get_group_size (uint32_t ndim, uint32_t grid, uint32_t group)
{
  if (group == 0)
    {
      /* TODO: Provide a default via environment or device characteristics.  */
      if (ndim == 1)
	group = 64;
      else if (ndim == 2)
	group = 8;
      else
	group = 4;
    }

  if (group > grid)
    group = grid;
  return group;
}

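/* For instance, a 1-dimensional grid of 1000 work items with no requested
   group size gets group = 64, whereas a grid of 10 gets group = 10,
   because a group may never exceed the grid.  */
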
/* Atomically store pair of uint16_t values (HEADER and REST) to a PACKET.  */

static void
packet_store_release (uint32_t* packet, uint16_t header, uint16_t rest)
{
  __atomic_store_n (packet, header | (rest << 16), __ATOMIC_RELEASE);
}

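/* A sketch of the intended use (hypothetical values; the real call site
   is the kernel-launch code): releasing an AQL dispatch packet to the
   device might combine

     uint16_t header = (HSA_PACKET_TYPE_KERNEL_DISPATCH
			<< HSA_PACKET_HEADER_TYPE)
		       | (HSA_FENCE_SCOPE_SYSTEM
			  << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE)
		       | (HSA_FENCE_SCOPE_SYSTEM
			  << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
     packet_store_release ((uint32_t *) packet, header, setup);

   so that both 16-bit fields become visible to the hardware in a single
   32-bit atomic release store.  */
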
/* A never-called callback for the HSA command queues.  These signal events
   that we don't use, so we trigger an error.

   This "queue" is not to be confused with the async queues, below.  */

static void
hsa_queue_callback (hsa_status_t status,
		    hsa_queue_t *queue __attribute__ ((unused)),
		    void *data __attribute__ ((unused)))
{
  hsa_fatal ("Asynchronous queue error", status);
}

/* }}} */
/* {{{ HSA initialization */

/* Populate hsa_fns with the function addresses from libhsa-runtime64.so.  */

static bool
init_hsa_runtime_functions (void)
{
#define DLSYM_FN(function) \
  hsa_fns.function##_fn = dlsym (handle, #function); \
  if (hsa_fns.function##_fn == NULL) \
    return false;
  void *handle = dlopen (hsa_runtime_lib, RTLD_LAZY);
  if (handle == NULL)
    return false;

  DLSYM_FN (hsa_status_string)
  DLSYM_FN (hsa_system_get_info)
  DLSYM_FN (hsa_agent_get_info)
  DLSYM_FN (hsa_init)
  DLSYM_FN (hsa_iterate_agents)
  DLSYM_FN (hsa_region_get_info)
  DLSYM_FN (hsa_queue_create)
  DLSYM_FN (hsa_agent_iterate_regions)
  DLSYM_FN (hsa_executable_destroy)
  DLSYM_FN (hsa_executable_create)
  DLSYM_FN (hsa_executable_global_variable_define)
  DLSYM_FN (hsa_executable_load_code_object)
  DLSYM_FN (hsa_executable_freeze)
  DLSYM_FN (hsa_signal_create)
  DLSYM_FN (hsa_memory_allocate)
  DLSYM_FN (hsa_memory_assign_agent)
  DLSYM_FN (hsa_memory_copy)
  DLSYM_FN (hsa_memory_free)
  DLSYM_FN (hsa_signal_destroy)
  DLSYM_FN (hsa_executable_get_symbol)
  DLSYM_FN (hsa_executable_symbol_get_info)
  DLSYM_FN (hsa_executable_iterate_symbols)
  DLSYM_FN (hsa_queue_add_write_index_release)
  DLSYM_FN (hsa_queue_load_read_index_acquire)
  DLSYM_FN (hsa_signal_wait_acquire)
  DLSYM_FN (hsa_signal_store_relaxed)
  DLSYM_FN (hsa_signal_store_release)
  DLSYM_FN (hsa_signal_load_acquire)
  DLSYM_FN (hsa_queue_destroy)
  DLSYM_FN (hsa_code_object_deserialize)
  return true;
#undef DLSYM_FN
}

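/* For instance, "DLSYM_FN (hsa_init)" above expands to

     hsa_fns.hsa_init_fn = dlsym (handle, "hsa_init");
     if (hsa_fns.hsa_init_fn == NULL)
       return false;

   so a single missing symbol in the runtime library makes the whole
   plugin report failure.  */
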
/* Return true if the agent is a GPU and can accept concurrent submissions
   from different threads.  */

static bool
suitable_hsa_agent_p (hsa_agent_t agent)
{
  hsa_device_type_t device_type;
  hsa_status_t status
    = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
				     &device_type);
  if (status != HSA_STATUS_SUCCESS)
    return false;

  switch (device_type)
    {
    case HSA_DEVICE_TYPE_GPU:
      break;
    case HSA_DEVICE_TYPE_CPU:
      if (!support_cpu_devices)
	return false;
      break;
    default:
      return false;
    }

  uint32_t features = 0;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_FEATURE,
					  &features);
  if (status != HSA_STATUS_SUCCESS
      || !(features & HSA_AGENT_FEATURE_KERNEL_DISPATCH))
    return false;
  hsa_queue_type_t queue_type;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_QUEUE_TYPE,
					  &queue_type);
  if (status != HSA_STATUS_SUCCESS
      || (queue_type != HSA_QUEUE_TYPE_MULTI))
    return false;

  return true;
}

/* Callback of hsa_iterate_agents; if AGENT is a GPU device, increment
   agent_count in hsa_context.  */

static hsa_status_t
count_gpu_agents (hsa_agent_t agent, void *data __attribute__ ((unused)))
{
  if (suitable_hsa_agent_p (agent))
    hsa_context.agent_count++;
  return HSA_STATUS_SUCCESS;
}

/* Callback of hsa_iterate_agents; if AGENT is a GPU device, assign the agent
   id to the describing structure in the hsa context.  DATA points to the
   index of the structure; increment it afterwards.  */

static hsa_status_t
assign_agent_ids (hsa_agent_t agent, void *data)
{
  if (suitable_hsa_agent_p (agent))
    {
      int *agent_index = (int *) data;
      hsa_context.agents[*agent_index].id = agent;
      ++*agent_index;
    }
  return HSA_STATUS_SUCCESS;
}

/* Initialize hsa_context if it has not already been done.
   Return TRUE on success.  */

static bool
init_hsa_context (void)
{
  hsa_status_t status;
  int agent_index = 0;

  if (hsa_context.initialized)
    return true;
  init_environment_variables ();
  if (!init_hsa_runtime_functions ())
    {
      GCN_WARNING ("Run-time could not be dynamically opened\n");
      if (suppress_host_fallback)
	GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
      return false;
    }
  status = hsa_fns.hsa_init_fn ();
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Run-time could not be initialized", status);
  GCN_DEBUG ("HSA run-time initialized for GCN\n");

  if (debug)
    dump_hsa_system_info ();

  status = hsa_fns.hsa_iterate_agents_fn (count_gpu_agents, NULL);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("GCN GPU devices could not be enumerated", status);
  GCN_DEBUG ("There are %i GCN GPU devices.\n", hsa_context.agent_count);

  hsa_context.agents
    = GOMP_PLUGIN_malloc_cleared (hsa_context.agent_count
				  * sizeof (struct agent_info));
  status = hsa_fns.hsa_iterate_agents_fn (assign_agent_ids, &agent_index);
  if (agent_index != hsa_context.agent_count)
    {
      GOMP_PLUGIN_error ("Failed to assign IDs to all GCN agents");
      return false;
    }

  if (debug)
    {
      status = hsa_fns.hsa_iterate_agents_fn (dump_hsa_agent_info, NULL);
      if (status != HSA_STATUS_SUCCESS)
	GOMP_PLUGIN_error ("Failed to list all HSA runtime agents");
    }

  hsa_context.initialized = true;
  return true;
}

/* Verify that hsa_context has already been initialized and return the
   agent_info structure describing device number N.  Return NULL on error.  */

static struct agent_info *
get_agent_info (int n)
{
  if (!hsa_context.initialized)
    {
      GOMP_PLUGIN_error ("Attempt to use uninitialized GCN context.");
      return NULL;
    }
  if (n >= hsa_context.agent_count)
    {
      GOMP_PLUGIN_error ("Request to operate on non-existent GCN device %i", n);
      return NULL;
    }
  if (!hsa_context.agents[n].initialized)
    {
      GOMP_PLUGIN_error ("Attempt to use an uninitialized GCN agent.");
      return NULL;
    }
  return &hsa_context.agents[n];
}

/* Callback of hsa_agent_iterate_regions, via get_*_memory_region functions.

   Selects (breaks at) a suitable region of type KIND.  */

static hsa_status_t
get_memory_region (hsa_region_t region, hsa_region_t *retval,
		   hsa_region_global_flag_t kind)
{
  hsa_status_t status;
  hsa_region_segment_t segment;

  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
					   &segment);
  if (status != HSA_STATUS_SUCCESS)
    return status;
  if (segment != HSA_REGION_SEGMENT_GLOBAL)
    return HSA_STATUS_SUCCESS;

  uint32_t flags;
  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
					   &flags);
  if (status != HSA_STATUS_SUCCESS)
    return status;
  if (flags & kind)
    {
      *retval = region;
      return HSA_STATUS_INFO_BREAK;
    }
  return HSA_STATUS_SUCCESS;
}

/* Callback of hsa_agent_iterate_regions.

   Selects a kernargs memory region.  */

static hsa_status_t
get_kernarg_memory_region (hsa_region_t region, void *data)
{
  return get_memory_region (region, (hsa_region_t *)data,
			    HSA_REGION_GLOBAL_FLAG_KERNARG);
}

/* Callback of hsa_agent_iterate_regions.

   Selects a coarse-grained memory region suitable for the heap and
   offload data.  */

static hsa_status_t
get_data_memory_region (hsa_region_t region, void *data)
{
  return get_memory_region (region, (hsa_region_t *)data,
			    HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED);
}

/* }}} */
/* {{{ Run */

/* Create or reuse a team arena.

   Team arenas are used by OpenMP to avoid calling malloc multiple times
   while setting up each team.  This is purely a performance optimization.

   Allocating an arena also costs performance, albeit on the host side, so
   this function will reuse an existing arena if a large enough one is idle.
   The arena is released, but not deallocated, when the kernel exits.  */

static void *
get_team_arena (struct agent_info *agent, int num_teams)
{
  struct team_arena_list **next_ptr = &agent->team_arena_list;
  struct team_arena_list *item;

  for (item = *next_ptr; item; next_ptr = &item->next, item = item->next)
    {
      if (item->num_teams < num_teams)
	continue;

      if (pthread_mutex_trylock (&item->in_use))
	continue;

      return item->arena;
    }

  GCN_DEBUG ("Creating a new arena for %d teams\n", num_teams);

  if (pthread_mutex_lock (&agent->team_arena_write_lock))
    {
      GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
      return NULL;
    }
  item = malloc (sizeof (*item));
  item->num_teams = num_teams;
  item->next = NULL;
  *next_ptr = item;

  if (pthread_mutex_init (&item->in_use, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
      return NULL;
    }
  if (pthread_mutex_lock (&item->in_use))
    {
      GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
      return NULL;
    }
  if (pthread_mutex_unlock (&agent->team_arena_write_lock))
    {
      GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
      return NULL;
    }

  const int TEAM_ARENA_SIZE = 64*1024;  /* Must match libgomp.h.  */
  hsa_status_t status;
  status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
					   TEAM_ARENA_SIZE*num_teams,
					   &item->arena);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not allocate memory for GCN kernel arena", status);
  status = hsa_fns.hsa_memory_assign_agent_fn (item->arena, agent->id,
					       HSA_ACCESS_PERMISSION_RW);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not assign arena memory to device", status);

  return item->arena;
}

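/* The "in_use" mutex above doubles as an allocation flag: it is taken in
   get_team_arena (trylock for reuse, lock for a fresh arena) and only
   dropped again by release_team_arena once the kernel has exited.  A
   sketch of the intended lifecycle (the real call sites are
   create_kernel_dispatch and release_kernel_dispatch, below):

     void *arena = get_team_arena (agent, num_teams);
     ...launch the kernel and wait for it to complete...
     release_team_arena (agent, arena);  */
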
/* Mark a team arena available for reuse.  */

static void
release_team_arena (struct agent_info* agent, void *arena)
{
  struct team_arena_list *item;

  for (item = agent->team_arena_list; item; item = item->next)
    {
      if (item->arena == arena)
	{
	  if (pthread_mutex_unlock (&item->in_use))
	    GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
	  return;
	}
    }
  GOMP_PLUGIN_error ("Could not find a GCN arena to release.");
}

/* Clean up all the allocated team arenas.  */

static bool
destroy_team_arenas (struct agent_info *agent)
{
  struct team_arena_list *item, *next;

  for (item = agent->team_arena_list; item; item = next)
    {
      next = item->next;
      hsa_fns.hsa_memory_free_fn (item->arena);
      if (pthread_mutex_destroy (&item->in_use))
	{
	  GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex");
	  return false;
	}
      free (item);
    }
  agent->team_arena_list = NULL;

  return true;
}

/* Allocate memory on a specified device.  */

static void *
alloc_by_agent (struct agent_info *agent, size_t size)
{
  GCN_DEBUG ("Allocating %zu bytes on device %d\n", size, agent->device_id);

  /* Zero-size allocations are invalid, so in order to return a valid pointer
     we need to pass a valid size.  One source of zero-size allocations is
     kernargs for kernels that have no inputs or outputs (the kernel may
     only use console output, for example).  */
  if (size == 0)
    size = 4;

  void *ptr;
  hsa_status_t status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
							size, &ptr);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not allocate device memory", status);
      return NULL;
    }

  status = hsa_fns.hsa_memory_assign_agent_fn (ptr, agent->id,
					       HSA_ACCESS_PERMISSION_RW);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not assign data memory to device", status);
      return NULL;
    }

  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  bool profiling_dispatch_p
    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
  if (profiling_dispatch_p)
    {
      acc_prof_info *prof_info = thr->prof_info;
      acc_event_info data_event_info;
      acc_api_info *api_info = thr->api_info;

      prof_info->event_type = acc_ev_alloc;

      data_event_info.data_event.event_type = prof_info->event_type;
      data_event_info.data_event.valid_bytes
	= _ACC_DATA_EVENT_INFO_VALID_BYTES;
      data_event_info.data_event.parent_construct
	= acc_construct_parallel;
      data_event_info.data_event.implicit = 1;
      data_event_info.data_event.tool_info = NULL;
      data_event_info.data_event.var_name = NULL;
      data_event_info.data_event.bytes = size;
      data_event_info.data_event.host_ptr = NULL;
      data_event_info.data_event.device_ptr = (void *) ptr;

      api_info->device_api = acc_device_api_other;

      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
					    api_info);
    }

  return ptr;
}

/* Create kernel dispatch data structure for given KERNEL, along with
   the necessary device signals and memory allocations.  */

static struct kernel_dispatch *
create_kernel_dispatch (struct kernel_info *kernel, int num_teams)
{
  struct agent_info *agent = kernel->agent;
  struct kernel_dispatch *shadow
    = GOMP_PLUGIN_malloc_cleared (sizeof (struct kernel_dispatch));

  shadow->agent = kernel->agent;
  shadow->object = kernel->object;

  hsa_signal_t sync_signal;
  hsa_status_t status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &sync_signal);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Error creating the GCN sync signal", status);

  shadow->signal = sync_signal.handle;
  shadow->private_segment_size = kernel->private_segment_size;
  shadow->group_segment_size = kernel->group_segment_size;

  /* We expect kernels to request a single pointer, explicitly, and the
     rest of struct kernargs, implicitly.  If they request anything else
     then something is wrong.  */
  if (kernel->kernarg_segment_size > 8)
    {
      GOMP_PLUGIN_fatal ("Unexpectedly large kernargs segment requested");
      return NULL;
    }

  status = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region,
					   sizeof (struct kernargs),
					   &shadow->kernarg_address);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not allocate memory for GCN kernel arguments", status);
  struct kernargs *kernargs = shadow->kernarg_address;

  /* Zero-initialize the output_data (minimum needed).  */
  kernargs->out_ptr = (int64_t)&kernargs->output_data;
  kernargs->output_data.next_output = 0;
  for (unsigned i = 0;
       i < (sizeof (kernargs->output_data.queue)
	    / sizeof (kernargs->output_data.queue[0]));
       i++)
    kernargs->output_data.queue[i].written = 0;
  kernargs->output_data.consumed = 0;

  /* Pass in the heap location.  */
  kernargs->heap_ptr = (int64_t)kernel->module->heap;

  /* Create an arena.  */
  if (kernel->kind == KIND_OPENMP)
    kernargs->arena_ptr = (int64_t)get_team_arena (agent, num_teams);
  else
    kernargs->arena_ptr = 0;

  /* Ensure we can recognize unset return values.  */
  kernargs->output_data.return_value = 0xcafe0000;

  return shadow;
}

1821/* Output any data written to console output from the kernel. It is expected
1822 that this function is polled during kernel execution.
1823
1824 We print all entries from the last item printed to the next entry without
1825 a "written" flag. If the "final" flag is set then it'll continue right to
1826 the end.
1827
1828 The print buffer is circular, but the from and to locations don't wrap when
1829 the buffer does, so the output limit is UINT_MAX. The target blocks on
1830 output when the buffer is full. */
1831
1832static void
1833console_output (struct kernel_info *kernel, struct kernargs *kernargs,
1834 bool final)
1835{
1836 unsigned int limit = (sizeof (kernargs->output_data.queue)
1837 / sizeof (kernargs->output_data.queue[0]));
1838
1839 unsigned int from = __atomic_load_n (&kernargs->output_data.consumed,
1840 __ATOMIC_ACQUIRE);
1841 unsigned int to = kernargs->output_data.next_output;
1842
1843 if (from > to)
1844 {
1845 /* Overflow. */
1846 if (final)
1847 printf ("GCN print buffer overflowed.\n");
1848 return;
1849 }
1850
1851 unsigned int i;
1852 for (i = from; i < to; i++)
1853 {
1854 struct printf_data *data = &kernargs->output_data.queue[i%limit];
1855
1856 if (!data->written && !final)
1857 break;
1858
1859 switch (data->type)
1860 {
1861 case 0: printf ("%.128s%ld\n", data->msg, data->ivalue); break;
1862 case 1: printf ("%.128s%f\n", data->msg, data->dvalue); break;
1863 case 2: printf ("%.128s%.128s\n", data->msg, data->text); break;
1864 case 3: printf ("%.128s%.128s", data->msg, data->text); break;
1865 default: printf ("GCN print buffer error!\n"); break;
1866 }
1867 data->written = 0;
1868 __atomic_store_n (&kernargs->output_data.consumed, i+1,
1869 __ATOMIC_RELEASE);
1870 }
1871 fflush (stdout);
1872}
1873
1874/* Release data structure created for a kernel dispatch in SHADOW argument,
1875 and clean up the signal and memory allocations. */
1876
1877static void
1878release_kernel_dispatch (struct kernel_dispatch *shadow)
1879{
1880 GCN_DEBUG ("Released kernel dispatch: %p\n", shadow);
1881
1882 struct kernargs *kernargs = shadow->kernarg_address;
1883 void *arena = (void *)kernargs->arena_ptr;
1884 if (arena)
1885 release_team_arena (shadow->agent, arena);
1886
1887 hsa_fns.hsa_memory_free_fn (shadow->kernarg_address);
1888
1889 hsa_signal_t s;
1890 s.handle = shadow->signal;
1891 hsa_fns.hsa_signal_destroy_fn (s);
1892
1893 free (shadow);
1894}
1895
1896/* Extract the properties from a kernel binary. */
1897
1898static void
1899init_kernel_properties (struct kernel_info *kernel)
1900{
1901 hsa_status_t status;
1902 struct agent_info *agent = kernel->agent;
1903 hsa_executable_symbol_t kernel_symbol;
1904 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
1905 kernel->name, agent->id,
1906 0, &kernel_symbol);
1907 if (status != HSA_STATUS_SUCCESS)
1908 {
1909 hsa_warn ("Could not find symbol for kernel in the code object", status);
1910 fprintf (stderr, "not found name: '%s'\n", kernel->name);
1911 dump_executable_symbols (agent->executable);
1912 goto failure;
1913 }
1914 GCN_DEBUG ("Located kernel %s\n", kernel->name);
1915 status = hsa_fns.hsa_executable_symbol_get_info_fn
1916 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel->object);
1917 if (status != HSA_STATUS_SUCCESS)
1918 hsa_fatal ("Could not extract a kernel object from its symbol", status);
1919 status = hsa_fns.hsa_executable_symbol_get_info_fn
1920 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
1921 &kernel->kernarg_segment_size);
1922 if (status != HSA_STATUS_SUCCESS)
1923 hsa_fatal ("Could not get info about kernel argument size", status);
1924 status = hsa_fns.hsa_executable_symbol_get_info_fn
1925 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
1926 &kernel->group_segment_size);
1927 if (status != HSA_STATUS_SUCCESS)
1928 hsa_fatal ("Could not get info about kernel group segment size", status);
1929 status = hsa_fns.hsa_executable_symbol_get_info_fn
1930 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
1931 &kernel->private_segment_size);
1932 if (status != HSA_STATUS_SUCCESS)
1933 hsa_fatal ("Could not get info about kernel private segment size",
1934 status);
1935
1936 /* The kernel type is not known until something tries to launch it. */
1937 kernel->kind = KIND_UNKNOWN;
1938
1939 GCN_DEBUG ("Kernel structure for %s fully initialized with "
1940 "the following segment sizes:\n", kernel->name);
1941 GCN_DEBUG (" group_segment_size: %u\n",
1942 (unsigned) kernel->group_segment_size);
1943 GCN_DEBUG (" private_segment_size: %u\n",
1944 (unsigned) kernel->private_segment_size);
1945 GCN_DEBUG (" kernarg_segment_size: %u\n",
1946 (unsigned) kernel->kernarg_segment_size);
1947 return;
1948
1949failure:
1950 kernel->initialization_failed = true;
1951}
1952
1953/* Do all the work that is necessary before running KERNEL for the first time.
1954 The function assumes the program has been created, finalized and frozen by
1955 create_and_finalize_hsa_program. */
1956
1957static void
1958init_kernel (struct kernel_info *kernel)
1959{
1960 if (pthread_mutex_lock (&kernel->init_mutex))
1961 GOMP_PLUGIN_fatal ("Could not lock a GCN kernel initialization mutex");
1962 if (kernel->initialized)
1963 {
1964 if (pthread_mutex_unlock (&kernel->init_mutex))
1965 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
1966 "mutex");
1967
1968 return;
1969 }
1970
1971 init_kernel_properties (kernel);
1972
1973 if (!kernel->initialization_failed)
1974 {
1975 GCN_DEBUG ("\n");
1976
1977 kernel->initialized = true;
1978 }
1979 if (pthread_mutex_unlock (&kernel->init_mutex))
1980 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
1981 "mutex");
1982}
1983
1984/* Run KERNEL on its agent, pass VARS to it as arguments and take
1985 launch attributes from KLA.
1986
1987 MODULE_LOCKED indicates that the caller already holds the lock and
1988 run_kernel need not lock it again.
1989 If AQ is NULL then agent->sync_queue will be used. */
1990
1991static void
1992run_kernel (struct kernel_info *kernel, void *vars,
1993 struct GOMP_kernel_launch_attributes *kla,
1994 struct goacc_asyncqueue *aq, bool module_locked)
1995{
1996 GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel->agent->device_id,
1997 (aq ? aq->id : 0));
1998 GCN_DEBUG ("GCN launch attribs: gdims:[");
1999 int i;
2000 for (i = 0; i < kla->ndim; ++i)
2001 {
2002 if (i)
2003 DEBUG_PRINT (", ");
2004 DEBUG_PRINT ("%u", kla->gdims[i]);
2005 }
2006 DEBUG_PRINT ("], normalized gdims:[");
2007 for (i = 0; i < kla->ndim; ++i)
2008 {
2009 if (i)
2010 DEBUG_PRINT (", ");
2011 DEBUG_PRINT ("%u", kla->gdims[i] / kla->wdims[i]);
2012 }
2013 DEBUG_PRINT ("], wdims:[");
2014 for (i = 0; i < kla->ndim; ++i)
2015 {
2016 if (i)
2017 DEBUG_PRINT (", ");
2018 DEBUG_PRINT ("%u", kla->wdims[i]);
2019 }
2020 DEBUG_PRINT ("]\n");
2021 DEBUG_FLUSH ();
2022
2023 struct agent_info *agent = kernel->agent;
2024 if (!module_locked && pthread_rwlock_rdlock (&agent->module_rwlock))
2025 GOMP_PLUGIN_fatal ("Unable to read-lock a GCN agent rwlock");
2026
2027 if (!agent->initialized)
2028 GOMP_PLUGIN_fatal ("Agent must be initialized");
2029
2030 if (!kernel->initialized)
2031 GOMP_PLUGIN_fatal ("Called kernel must be initialized");
2032
2033 hsa_queue_t *command_q = (aq ? aq->hsa_queue : kernel->agent->sync_queue);
2034
2035 uint64_t index
2036 = hsa_fns.hsa_queue_add_write_index_release_fn (command_q, 1);
2037 GCN_DEBUG ("Got AQL index %llu\n", (unsigned long long) index);
2038
2039 /* Wait until the queue is not full before writing the packet. */
2040 while (index - hsa_fns.hsa_queue_load_read_index_acquire_fn (command_q)
2041 >= command_q->size)
2042 ;
2043
2044 /* Do not allow the dimensions to be overridden when running
2045 constructors or destructors. */
2046 int override_x = kernel->kind == KIND_UNKNOWN ? 0 : override_x_dim;
2047 int override_z = kernel->kind == KIND_UNKNOWN ? 0 : override_z_dim;
2048
2049 hsa_kernel_dispatch_packet_t *packet;
2050 packet = ((hsa_kernel_dispatch_packet_t *) command_q->base_address)
2051 + index % command_q->size;
2052
2053 memset (((uint8_t *) packet) + 4, 0, sizeof (*packet) - 4);
2054 packet->grid_size_x = override_x ? : kla->gdims[0];
2055 packet->workgroup_size_x = get_group_size (kla->ndim,
2056 packet->grid_size_x,
2057 kla->wdims[0]);
2058
2059 if (kla->ndim >= 2)
2060 {
2061 packet->grid_size_y = kla->gdims[1];
2062 packet->workgroup_size_y = get_group_size (kla->ndim, kla->gdims[1],
2063 kla->wdims[1]);
2064 }
2065 else
2066 {
2067 packet->grid_size_y = 1;
2068 packet->workgroup_size_y = 1;
2069 }
2070
2071 if (kla->ndim == 3)
2072 {
2073 packet->grid_size_z = limit_worker_threads (override_z
2074 ? : kla->gdims[2]);
2075 packet->workgroup_size_z = get_group_size (kla->ndim,
2076 packet->grid_size_z,
2077 kla->wdims[2]);
2078 }
2079 else
2080 {
2081 packet->grid_size_z = 1;
2082 packet->workgroup_size_z = 1;
2083 }
2084
2085 GCN_DEBUG ("GCN launch actuals: grid:[%u, %u, %u],"
2086 " normalized grid:[%u, %u, %u], workgroup:[%u, %u, %u]\n",
2087 packet->grid_size_x, packet->grid_size_y, packet->grid_size_z,
2088 packet->grid_size_x / packet->workgroup_size_x,
2089 packet->grid_size_y / packet->workgroup_size_y,
2090 packet->grid_size_z / packet->workgroup_size_z,
2091 packet->workgroup_size_x, packet->workgroup_size_y,
2092 packet->workgroup_size_z);
2093
2094 struct kernel_dispatch *shadow
2095 = create_kernel_dispatch (kernel, packet->grid_size_x);
2096 shadow->queue = command_q;
2097
2098 if (debug)
2099 {
2100 fprintf (stderr, "\nKernel has the following dependencies:\n");
2101 print_kernel_dispatch (shadow, 2);
2102 }
2103
2104 packet->private_segment_size = kernel->private_segment_size;
2105 packet->group_segment_size = kernel->group_segment_size;
2106 packet->kernel_object = kernel->object;
2107 packet->kernarg_address = shadow->kernarg_address;
2108 hsa_signal_t s;
2109 s.handle = shadow->signal;
2110 packet->completion_signal = s;
2111 hsa_fns.hsa_signal_store_relaxed_fn (s, 1);
2112 memcpy (shadow->kernarg_address, &vars, sizeof (vars));
2113
2114 GCN_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");
2115
2116 uint16_t header;
2117 header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
2118 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
2119 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
2120
2121 GCN_DEBUG ("Going to dispatch kernel %s on device %d\n", kernel->name,
2122 agent->device_id);
2123
2124 packet_store_release ((uint32_t *) packet, header,
2125 (uint16_t) kla->ndim
2126 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);
2127
2128 hsa_fns.hsa_signal_store_release_fn (command_q->doorbell_signal,
2129 index);
2130
2131 GCN_DEBUG ("Kernel dispatched, waiting for completion\n");
2132
2133 /* Wait on the completion signal with a 1ms timeout, polling the console output between waits. */
2134 while (hsa_fns.hsa_signal_wait_acquire_fn (s, HSA_SIGNAL_CONDITION_LT, 1,
2135 1000 * 1000,
2136 HSA_WAIT_STATE_BLOCKED) != 0)
2137 {
2138 console_output (kernel, shadow->kernarg_address, false);
2139 }
2140 console_output (kernel, shadow->kernarg_address, true);
2141
2142 struct kernargs *kernargs = shadow->kernarg_address;
2143 unsigned int return_value = (unsigned int)kernargs->output_data.return_value;
2144
2145 release_kernel_dispatch (shadow);
2146
2147 if (!module_locked && pthread_rwlock_unlock (&agent->module_rwlock))
2148 GOMP_PLUGIN_fatal ("Unable to unlock a GCN agent rwlock");
2149
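/* Decode the kernel's return value. The encoding, inferred from the
   checks below and the 0xcafe0000 sentinel set at dispatch creation, is
   0xXXXXSSVV: the top half is 0xcafe if exit was never called, or 0xffff
   if it was; SS is then the signal number and VV the exit code. For
   example, exit (3) would yield 0xffff0003, and SIGABRT (6 on Linux)
   would yield 0xffff0600. */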
2150 unsigned int upper = (return_value & ~0xffff) >> 16;
2151 if (upper == 0xcafe)
2152 ; /* Exit not called; normal termination. */
2153 else if (upper == 0xffff)
2154 ; /* Exit was called. */
2155 else
2156 {
2157 GOMP_PLUGIN_error ("Possible kernel exit value corruption, 2 most"
2158 " significant bytes aren't 0xffff or 0xcafe: 0x%x\n",
2159 return_value);
2160 abort ();
2161 }
2162
2163 if (upper == 0xffff)
2164 {
2165 unsigned int signal = (return_value >> 8) & 0xff;
2166
2167 if (signal == SIGABRT)
2168 {
2169 GCN_WARNING ("GCN Kernel aborted\n");
2170 abort ();
2171 }
2172 else if (signal != 0)
2173 {
2174 GCN_WARNING ("GCN Kernel received unknown signal\n");
2175 abort ();
2176 }
2177
2178 GCN_DEBUG ("GCN Kernel exited with value: %d\n", return_value & 0xff);
2179 exit (return_value & 0xff);
2180 }
2181}
2182
2183/* }}} */
2184/* {{{ Load/Unload */
2185
2186/* Initialize KERNEL from D and other parameters. Return true on success. */
2187
2188static bool
2189init_basic_kernel_info (struct kernel_info *kernel,
2190 struct hsa_kernel_description *d,
2191 struct agent_info *agent,
2192 struct module_info *module)
2193{
2194 kernel->agent = agent;
2195 kernel->module = module;
2196 kernel->name = d->name;
2197 if (pthread_mutex_init (&kernel->init_mutex, NULL))
2198 {
2199 GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");
2200 return false;
2201 }
2202 return true;
2203}
2204
2205/* Find the load_offset for MODULE, save to *LOAD_OFFSET, and return true. If
2206 not found, return false. */
2207
2208static bool
2209find_load_offset (Elf64_Addr *load_offset, struct agent_info *agent,
2210 struct module_info *module, Elf64_Ehdr *image,
2211 Elf64_Shdr *sections)
2212{
2213 bool res = false;
2214
2215 hsa_status_t status;
2216
2217 hsa_executable_symbol_t symbol;
2218 if (!find_executable_symbol (agent->executable, &symbol))
2219 return false;
2220
2221 status = hsa_fns.hsa_executable_symbol_get_info_fn
2222 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, load_offset);
2223 if (status != HSA_STATUS_SUCCESS)
2224 {
2225 hsa_error ("Could not extract symbol address", status);
2226 return false;
2227 }
2228
2229 char *symbol_name = get_executable_symbol_name (symbol);
2230 if (symbol_name == NULL)
2231 return false;
2232
2233 /* Find the kernel function in ELF, and calculate actual load offset. */
2234 for (int i = 0; i < image->e_shnum; i++)
2235 if (sections[i].sh_type == SHT_SYMTAB)
2236 {
2237 Elf64_Shdr *strtab = &sections[sections[i].sh_link];
2238 char *strings = (char *)image + strtab->sh_offset;
2239
2240 for (size_t offset = 0;
2241 offset < sections[i].sh_size;
2242 offset += sections[i].sh_entsize)
2243 {
2244 Elf64_Sym *sym = (Elf64_Sym*)((char*)image
2245 + sections[i].sh_offset
2246 + offset);
2247 if (strcmp (symbol_name, strings + sym->st_name) == 0)
2248 {
2249 *load_offset -= sym->st_value;
2250 res = true;
2251 break;
2252 }
2253 }
2254 }
2255
2256 free (symbol_name);
2257 return res;
2258}
2259
2260/* Create and finalize the program consisting of all loaded modules. */
2261
2262static bool
2263create_and_finalize_hsa_program (struct agent_info *agent)
2264{
2265 hsa_status_t status;
2266 int reloc_count = 0;
2267 bool res = true;
2268 if (pthread_mutex_lock (&agent->prog_mutex))
2269 {
2270 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
2271 return false;
2272 }
2273 if (agent->prog_finalized)
2274 goto final;
2275
2276 status
2277 = hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
2278 HSA_EXECUTABLE_STATE_UNFROZEN,
2279 "", &agent->executable);
2280 if (status != HSA_STATUS_SUCCESS)
2281 {
2282 hsa_error ("Could not create GCN executable", status);
2283 goto fail;
2284 }
2285
2286 /* Load any GCN modules. */
2287 struct module_info *module = agent->module;
2288 if (module)
2289 {
2290 Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;
2291
2292 /* Hide relocations from the HSA runtime loader by retyping their
2293 sections in place; the 0x80 marker lets us find them again below. */
2294 Elf64_Shdr *image_sections = (Elf64_Shdr *)((char *)image
2295 + image->e_shoff);
2296 for (int i = image->e_shnum - 1; i >= 0; i--)
2297 {
2298 if (image_sections[i].sh_type == SHT_RELA
2299 || image_sections[i].sh_type == SHT_REL)
2300 /* Change section type to something harmless. */
2301 image_sections[i].sh_type |= 0x80;
2302 }
2303
2304 hsa_code_object_t co = { 0 };
2305 status = hsa_fns.hsa_code_object_deserialize_fn
2306 (module->image_desc->gcn_image->image,
2307 module->image_desc->gcn_image->size,
2308 NULL, &co);
2309 if (status != HSA_STATUS_SUCCESS)
2310 {
2311 hsa_error ("Could not deserialize GCN code object", status);
2312 goto fail;
2313 }
2314
2315 status = hsa_fns.hsa_executable_load_code_object_fn
2316 (agent->executable, agent->id, co, "");
2317 if (status != HSA_STATUS_SUCCESS)
2318 {
2319 hsa_error ("Could not load GCN code object", status);
2320 goto fail;
2321 }
2322
2323 if (!module->heap)
2324 {
2325 status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
2326 gcn_kernel_heap_size,
2327 (void**)&module->heap);
2328 if (status != HSA_STATUS_SUCCESS)
2329 {
2330 hsa_error ("Could not allocate memory for GCN heap", status);
2331 goto fail;
2332 }
2333
2334 status = hsa_fns.hsa_memory_assign_agent_fn
2335 (module->heap, agent->id, HSA_ACCESS_PERMISSION_RW);
2336 if (status != HSA_STATUS_SUCCESS)
2337 {
2338 hsa_error ("Could not assign GCN heap memory to device", status);
2339 goto fail;
2340 }
2341
2342 hsa_fns.hsa_memory_copy_fn (&module->heap->size,
2343 &gcn_kernel_heap_size,
2344 sizeof (gcn_kernel_heap_size));
2345 }
2346
2347 }
2348
2349 if (debug)
2350 dump_executable_symbols (agent->executable);
2351
2352 status = hsa_fns.hsa_executable_freeze_fn (agent->executable, "");
2353 if (status != HSA_STATUS_SUCCESS)
2354 {
2355 hsa_error ("Could not freeze the GCN executable", status);
2356 goto fail;
2357 }
2358
2359 if (agent->module)
2360 {
2361 struct module_info *module = agent->module;
2362 Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;
2363 Elf64_Shdr *sections = (Elf64_Shdr *)((char *)image + image->e_shoff);
2364
2365 Elf64_Addr load_offset;
2366 if (!find_load_offset (&load_offset, agent, module, image, sections))
2367 goto fail;
2368
2369 /* Record the physical load address range.
2370 We need this for data copies later. */
2371 Elf64_Phdr *segments = (Elf64_Phdr *)((char*)image + image->e_phoff);
2372 Elf64_Addr low = ~0, high = 0;
2373 for (int i = 0; i < image->e_phnum; i++)
2374 if (segments[i].p_memsz > 0)
2375 {
2376 if (segments[i].p_paddr < low)
2377 low = segments[i].p_paddr;
2378 if (segments[i].p_paddr > high)
2379 high = segments[i].p_paddr + segments[i].p_memsz - 1;
2380 }
2381 module->phys_address_start = low + load_offset;
2382 module->phys_address_end = high + load_offset;
2383
2384 /* Find the dynamic symbol table. */
2385 Elf64_Shdr *dynsym = NULL;
2386 for (int i = 0; i < image->e_shnum; i++)
2387 if (sections[i].sh_type == SHT_DYNSYM)
2388 {
2389 dynsym = &sections[i];
2390 break;
2391 }
2392
2393 /* Fix up relocations. */
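/* Only SHT_RELA sections are processed here; SHT_REL sections were
   hidden above as well, but presumably GCN objects are not expected to
   contain any. */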
2394 for (int i = 0; i < image->e_shnum; i++)
2395 {
2396 if (sections[i].sh_type == (SHT_RELA | 0x80))
2397 for (size_t offset = 0;
2398 offset < sections[i].sh_size;
2399 offset += sections[i].sh_entsize)
2400 {
2401 Elf64_Rela *reloc = (Elf64_Rela*)((char*)image
2402 + sections[i].sh_offset
2403 + offset);
2404 Elf64_Sym *sym =
2405 (dynsym
2406 ? (Elf64_Sym*)((char*)image
2407 + dynsym->sh_offset
2408 + (dynsym->sh_entsize
2409 * ELF64_R_SYM (reloc->r_info)))
2410 : NULL);
2411
2412 int64_t S = (sym ? sym->st_value : 0);
2413 int64_t P = reloc->r_offset + load_offset;
2414 int64_t A = reloc->r_addend;
2415 int64_t B = load_offset;
2416 int64_t V, size;
2417 switch (ELF64_R_TYPE (reloc->r_info))
2418 {
2419 case R_AMDGPU_ABS32_LO:
2420 V = (S + A) & 0xFFFFFFFF;
2421 size = 4;
2422 break;
2423 case R_AMDGPU_ABS32_HI:
2424 V = (S + A) >> 32;
2425 size = 4;
2426 break;
2427 case R_AMDGPU_ABS64:
2428 V = S + A;
2429 size = 8;
2430 break;
2431 case R_AMDGPU_REL32:
2432 V = S + A - P;
2433 size = 4;
2434 break;
2435 case R_AMDGPU_REL64:
2436 /* FIXME
2437 LLD seems to emit REL64 where the assembler has
2438 ABS64. This is clearly wrong because it's not what the
2439 compiler is expecting. Let's assume, for now, that
2440 it's a bug. In any case, GCN kernels are always
2441 self-contained and therefore relative relocations will have
2442 been resolved already, so this should be a safe
2443 workaround. */
2444 V = S + A/* - P*/;
2445 size = 8;
2446 break;
2447 case R_AMDGPU_ABS32:
2448 V = S + A;
2449 size = 4;
2450 break;
2451 /* TODO R_AMDGPU_GOTPCREL */
2452 /* TODO R_AMDGPU_GOTPCREL32_LO */
2453 /* TODO R_AMDGPU_GOTPCREL32_HI */
2454 case R_AMDGPU_REL32_LO:
2455 V = (S + A - P) & 0xFFFFFFFF;
2456 size = 4;
2457 break;
2458 case R_AMDGPU_REL32_HI:
2459 V = (S + A - P) >> 32;
2460 size = 4;
2461 break;
2462 case R_AMDGPU_RELATIVE64:
2463 V = B + A;
2464 size = 8;
2465 break;
2466 default:
2467 fprintf (stderr, "Error: unsupported relocation type.\n");
2468 exit (1);
2469 }
2470 status = hsa_fns.hsa_memory_copy_fn ((void*)P, &V, size);
2471 if (status != HSA_STATUS_SUCCESS)
2472 {
2473 hsa_error ("Failed to fix up relocation", status);
2474 goto fail;
2475 }
2476 reloc_count++;
2477 }
2478 }
2479 }
2480
2481 GCN_DEBUG ("Loaded GCN kernels to device %d (%d relocations)\n",
2482 agent->device_id, reloc_count);
2483
2484final:
2485 agent->prog_finalized = true;
2486
2487 if (pthread_mutex_unlock (&agent->prog_mutex))
2488 {
2489 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
2490 res = false;
2491 }
2492
2493 return res;
2494
2495fail:
2496 res = false;
2497 goto final;
2498}
2499
2500/* Free the HSA program in agent and everything associated with it and set
2501 agent->prog_finalized and the initialized flags of all kernels to false.
2502 Return TRUE on success. */
2503
2504static bool
2505destroy_hsa_program (struct agent_info *agent)
2506{
2507 if (!agent->prog_finalized)
2508 return true;
2509
2510 hsa_status_t status;
2511
2512 GCN_DEBUG ("Destroying the current GCN program.\n");
2513
2514 status = hsa_fns.hsa_executable_destroy_fn (agent->executable);
2515 if (status != HSA_STATUS_SUCCESS)
2516 return hsa_error ("Could not destroy GCN executable", status);
2517
2518 if (agent->module)
2519 {
2520 int i;
2521 for (i = 0; i < agent->module->kernel_count; i++)
2522 agent->module->kernels[i].initialized = false;
2523
2524 if (agent->module->heap)
2525 {
2526 hsa_fns.hsa_memory_free_fn (agent->module->heap);
2527 agent->module->heap = NULL;
2528 }
2529 }
2530 agent->prog_finalized = false;
2531 return true;
2532}
2533
2534/* Deinitialize all information associated with MODULE and kernels within
2535 it. Return TRUE on success. */
2536
2537static bool
2538destroy_module (struct module_info *module, bool locked)
2539{
2540 /* Run destructors before destroying module. */
2541 struct GOMP_kernel_launch_attributes kla =
2542 { 3,
2543 /* Grid size. */
2544 { 1, 64, 1 },
2545 /* Work-group size. */
2546 { 1, 64, 1 }
2547 };
2548
2549 if (module->fini_array_func)
2550 {
2551 init_kernel (module->fini_array_func);
2552 run_kernel (module->fini_array_func, NULL, &kla, NULL, locked);
2553 }
2554 module->constructors_run_p = false;
2555
2556 int i;
2557 for (i = 0; i < module->kernel_count; i++)
2558 if (pthread_mutex_destroy (&module->kernels[i].init_mutex))
2559 {
2560 GOMP_PLUGIN_error ("Failed to destroy a GCN kernel initialization "
2561 "mutex");
2562 return false;
2563 }
2564
2565 return true;
2566}
2567
2568/* }}} */
2569/* {{{ Async */
2570
2571 /* Execute queue entry INDEX of async queue AQ: a kernel launch, a callback, an async wait, or a placeholder. */
2572
2573static void
2574execute_queue_entry (struct goacc_asyncqueue *aq, int index)
2575{
2576 struct queue_entry *entry = &aq->queue[index];
2577
2578 switch (entry->type)
2579 {
2580 case KERNEL_LAUNCH:
2581 if (DEBUG_QUEUES)
2582 GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d)\n",
2583 aq->agent->device_id, aq->id, index);
2584 run_kernel (entry->u.launch.kernel,
2585 entry->u.launch.vars,
2586 &entry->u.launch.kla, aq, false);
2587 if (DEBUG_QUEUES)
2588 GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d) done\n",
2589 aq->agent->device_id, aq->id, index);
2590 break;
2591
2592 case CALLBACK:
2593 if (DEBUG_QUEUES)
2594 GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d)\n",
2595 aq->agent->device_id, aq->id, index);
2596 entry->u.callback.fn (entry->u.callback.data);
2597 if (DEBUG_QUEUES)
2598 GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d) done\n",
2599 aq->agent->device_id, aq->id, index);
2600 break;
2601
2602 case ASYNC_WAIT:
2603 {
2604 /* FIXME: is it safe to access a placeholder that may already have
2605 been executed? */
2606 struct placeholder *placeholderp = entry->u.asyncwait.placeholderp;
2607
2608 if (DEBUG_QUEUES)
2609 GCN_DEBUG ("Async thread %d:%d: Executing async wait entry (%d)\n",
2610 aq->agent->device_id, aq->id, index);
2611
2612 pthread_mutex_lock (&placeholderp->mutex);
2613
2614 while (!placeholderp->executed)
2615 pthread_cond_wait (&placeholderp->cond, &placeholderp->mutex);
2616
2617 pthread_mutex_unlock (&placeholderp->mutex);
2618
2619 if (pthread_cond_destroy (&placeholderp->cond))
2620 GOMP_PLUGIN_error ("Failed to destroy serialization cond");
2621
2622 if (pthread_mutex_destroy (&placeholderp->mutex))
2623 GOMP_PLUGIN_error ("Failed to destroy serialization mutex");
2624
2625 if (DEBUG_QUEUES)
2626 GCN_DEBUG ("Async thread %d:%d: Executing async wait "
2627 "entry (%d) done\n", aq->agent->device_id, aq->id, index);
2628 }
2629 break;
2630
2631 case ASYNC_PLACEHOLDER:
2632 pthread_mutex_lock (&entry->u.placeholder.mutex);
2633 entry->u.placeholder.executed = 1;
2634 pthread_cond_signal (&entry->u.placeholder.cond);
2635 pthread_mutex_unlock (&entry->u.placeholder.mutex);
2636 break;
2637
2638 default:
2639 GOMP_PLUGIN_fatal ("Unknown queue element");
2640 }
2641}
2642
2643/* This function is run as a thread to service an async queue in the
2644 background. It runs continuously until the stop flag is set. */
2645
2646static void *
2647drain_queue (void *thread_arg)
2648{
2649 struct goacc_asyncqueue *aq = thread_arg;
2650
2651 if (DRAIN_QUEUE_SYNCHRONOUS_P)
2652 {
2653 aq->drain_queue_stop = 2;
2654 return NULL;
2655 }
2656
2657 pthread_mutex_lock (&aq->mutex);
2658
2659 while (true)
2660 {
2661 if (aq->drain_queue_stop)
2662 break;
2663
2664 if (aq->queue_n > 0)
2665 {
2666 pthread_mutex_unlock (&aq->mutex);
2667 execute_queue_entry (aq, aq->queue_first);
2668
2669 pthread_mutex_lock (&aq->mutex);
2670 aq->queue_first = ((aq->queue_first + 1)
2671 % ASYNC_QUEUE_SIZE);
2672 aq->queue_n--;
2673
2674 if (DEBUG_THREAD_SIGNAL)
2675 GCN_DEBUG ("Async thread %d:%d: broadcasting queue out update\n",
2676 aq->agent->device_id, aq->id);
2677 pthread_cond_broadcast (&aq->queue_cond_out);
2678 pthread_mutex_unlock (&aq->mutex);
2679
2680 if (DEBUG_QUEUES)
2681 GCN_DEBUG ("Async thread %d:%d: continue\n", aq->agent->device_id,
2682 aq->id);
2683 pthread_mutex_lock (&aq->mutex);
2684 }
2685 else
2686 {
2687 if (DEBUG_THREAD_SLEEP)
2688 GCN_DEBUG ("Async thread %d:%d: going to sleep\n",
2689 aq->agent->device_id, aq->id);
2690 pthread_cond_wait (&aq->queue_cond_in, &aq->mutex);
2691 if (DEBUG_THREAD_SLEEP)
2692 GCN_DEBUG ("Async thread %d:%d: woke up, rechecking\n",
2693 aq->agent->device_id, aq->id);
2694 }
2695 }
2696
2697 aq->drain_queue_stop = 2;
2698 if (DEBUG_THREAD_SIGNAL)
2699 GCN_DEBUG ("Async thread %d:%d: broadcasting last queue out update\n",
2700 aq->agent->device_id, aq->id);
2701 pthread_cond_broadcast (&aq->queue_cond_out);
2702 pthread_mutex_unlock (&aq->mutex);
2703
2704 GCN_DEBUG ("Async thread %d:%d: returning\n", aq->agent->device_id, aq->id);
2705 return NULL;
2706}
2707
2708/* This function is used only when DRAIN_QUEUE_SYNCHRONOUS_P is set, which
2709 is not usually the case. This is just a debug tool. */
2710
2711static void
2712drain_queue_synchronous (struct goacc_asyncqueue *aq)
2713{
2714 pthread_mutex_lock (&aq->mutex);
2715
2716 while (aq->queue_n > 0)
2717 {
2718 execute_queue_entry (aq, aq->queue_first);
2719
2720 aq->queue_first = ((aq->queue_first + 1)
2721 % ASYNC_QUEUE_SIZE);
2722 aq->queue_n--;
2723 }
2724
2725 pthread_mutex_unlock (&aq->mutex);
2726}
2727
2728/* Block the current thread until an async queue is writable. */
2729
2730static void
2731wait_for_queue_nonfull (struct goacc_asyncqueue *aq)
2732{
2733 if (aq->queue_n == ASYNC_QUEUE_SIZE)
2734 {
2735 pthread_mutex_lock (&aq->mutex);
2736
2737 /* Queue is full. Wait for it to not be full. */
2738 while (aq->queue_n == ASYNC_QUEUE_SIZE)
2739 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
2740
2741 pthread_mutex_unlock (&aq->mutex);
2742 }
2743}
2744
2745/* Request an asynchronous kernel launch on the specified queue. This
2746 may block if the queue is full, but returns without waiting for the
2747 kernel to run. */
2748
2749static void
2750queue_push_launch (struct goacc_asyncqueue *aq, struct kernel_info *kernel,
2751 void *vars, struct GOMP_kernel_launch_attributes *kla)
2752{
2753 assert (aq->agent == kernel->agent);
2754
2755 wait_for_queue_nonfull (aq);
2756
2757 pthread_mutex_lock (&aq->mutex);
2758
2759 int queue_last = ((aq->queue_first + aq->queue_n)
2760 % ASYNC_QUEUE_SIZE);
2761 if (DEBUG_QUEUES)
2762 GCN_DEBUG ("queue_push_launch %d:%d: at %i\n", aq->agent->device_id,
2763 aq->id, queue_last);
2764
2765 aq->queue[queue_last].type = KERNEL_LAUNCH;
2766 aq->queue[queue_last].u.launch.kernel = kernel;
2767 aq->queue[queue_last].u.launch.vars = vars;
2768 aq->queue[queue_last].u.launch.kla = *kla;
2769
2770 aq->queue_n++;
2771
2772 if (DEBUG_THREAD_SIGNAL)
2773 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2774 aq->agent->device_id, aq->id);
2775 pthread_cond_signal (&aq->queue_cond_in);
2776
2777 pthread_mutex_unlock (&aq->mutex);
2778}
2779
2780/* Request an asynchronous callback on the specified queue. The callback
2781 function will be called, with the given opaque data, from the appropriate
2782 async thread, when all previous items on that queue are complete. */
2783
2784static void
2785queue_push_callback (struct goacc_asyncqueue *aq, void (*fn)(void *),
2786 void *data)
2787{
2788 wait_for_queue_nonfull (aq);
2789
2790 pthread_mutex_lock (&aq->mutex);
2791
2792 int queue_last = ((aq->queue_first + aq->queue_n)
2793 % ASYNC_QUEUE_SIZE);
2794 if (DEBUG_QUEUES)
2795 GCN_DEBUG ("queue_push_callback %d:%d: at %i\n", aq->agent->device_id,
2796 aq->id, queue_last);
2797
2798 aq->queue[queue_last].type = CALLBACK;
2799 aq->queue[queue_last].u.callback.fn = fn;
2800 aq->queue[queue_last].u.callback.data = data;
2801
2802 aq->queue_n++;
2803
2804 if (DEBUG_THREAD_SIGNAL)
2805 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2806 aq->agent->device_id, aq->id);
2807 pthread_cond_signal (&aq->queue_cond_in);
2808
2809 pthread_mutex_unlock (&aq->mutex);
2810}
2811
2812/* Request that a given async thread wait for another thread (unspecified) to
2813 reach the given placeholder. The wait will occur when all previous entries
2814 on the queue are complete. A placeholder is effectively a kind of signal
2815 which simply sets a flag when encountered in a queue. */
2816
2817static void
2818queue_push_asyncwait (struct goacc_asyncqueue *aq,
2819 struct placeholder *placeholderp)
2820{
2821 wait_for_queue_nonfull (aq);
2822
2823 pthread_mutex_lock (&aq->mutex);
2824
2825 int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2826 if (DEBUG_QUEUES)
2827 GCN_DEBUG ("queue_push_asyncwait %d:%d: at %i\n", aq->agent->device_id,
2828 aq->id, queue_last);
2829
2830 aq->queue[queue_last].type = ASYNC_WAIT;
2831 aq->queue[queue_last].u.asyncwait.placeholderp = placeholderp;
2832
2833 aq->queue_n++;
2834
2835 if (DEBUG_THREAD_SIGNAL)
2836 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2837 aq->agent->device_id, aq->id);
2838 pthread_cond_signal (&aq->queue_cond_in);
2839
2840 pthread_mutex_unlock (&aq->mutex);
2841}
2842
2843/* Add a placeholder into an async queue. When the async thread reaches the
2844 placeholder it will set the "executed" flag to true and continue.
2845 Another thread may be waiting on this thread reaching the placeholder. */
2846
2847static struct placeholder *
2848queue_push_placeholder (struct goacc_asyncqueue *aq)
2849{
2850 struct placeholder *placeholderp;
2851
2852 wait_for_queue_nonfull (aq);
2853
2854 pthread_mutex_lock (&aq->mutex);
2855
2856 int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2857 if (DEBUG_QUEUES)
2858 GCN_DEBUG ("queue_push_placeholder %d:%d: at %i\n", aq->agent->device_id,
2859 aq->id, queue_last);
2860
2861 aq->queue[queue_last].type = ASYNC_PLACEHOLDER;
2862 placeholderp = &aq->queue[queue_last].u.placeholder;
2863
2864 if (pthread_mutex_init (&placeholderp->mutex, NULL))
2865 {
2866 pthread_mutex_unlock (&aq->mutex);
2867 GOMP_PLUGIN_error ("Failed to initialize serialization mutex");
2868 }
2869
2870 if (pthread_cond_init (&placeholderp->cond, NULL))
2871 {
2872 pthread_mutex_unlock (&aq->mutex);
2873 GOMP_PLUGIN_error ("Failed to initialize serialization cond");
2874 }
2875
2876 placeholderp->executed = 0;
2877
2878 aq->queue_n++;
2879
2880 if (DEBUG_THREAD_SIGNAL)
2881 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2882 aq->agent->device_id, aq->id);
2883 pthread_cond_signal (&aq->queue_cond_in);
2884
2885 pthread_mutex_unlock (&aq->mutex);
2886
2887 return placeholderp;
2888}
2889
2890/* Signal an asynchronous thread to terminate, and wait for it to do so. */
2891
2892static void
2893finalize_async_thread (struct goacc_asyncqueue *aq)
2894{
2895 pthread_mutex_lock (&aq->mutex);
2896 if (aq->drain_queue_stop == 2)
2897 {
2898 pthread_mutex_unlock (&aq->mutex);
2899 return;
2900 }
2901
2902 aq->drain_queue_stop = 1;
2903
2904 if (DEBUG_THREAD_SIGNAL)
2905 GCN_DEBUG ("Signalling async thread %d:%d: cond_in\n",
2906 aq->agent->device_id, aq->id);
2907 pthread_cond_signal (&aq->queue_cond_in);
2908
2909 while (aq->drain_queue_stop != 2)
2910 {
2911 if (DEBUG_THREAD_SLEEP)
2912 GCN_DEBUG ("Waiting for async thread %d:%d to finish, putting thread"
2913 " to sleep\n", aq->agent->device_id, aq->id);
2914 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
2915 if (DEBUG_THREAD_SLEEP)
2916 GCN_DEBUG ("Waiting, woke up thread %d:%d. Rechecking\n",
2917 aq->agent->device_id, aq->id);
2918 }
2919
2920 GCN_DEBUG ("Done waiting for async thread %d:%d\n", aq->agent->device_id,
2921 aq->id);
2922 pthread_mutex_unlock (&aq->mutex);
2923
2924 int err = pthread_join (aq->thread_drain_queue, NULL);
2925 if (err != 0)
2926 GOMP_PLUGIN_fatal ("Join async thread %d:%d: failed: %s",
2927 aq->agent->device_id, aq->id, strerror (err));
2928 GCN_DEBUG ("Joined with async thread %d:%d\n", aq->agent->device_id, aq->id);
2929}
2930
2931/* Set up an async queue for OpenMP. There will be only one. The
2932 implementation simply uses an OpenACC async queue.
2933 FIXME: is this thread-safe if two threads call this function? */
2934
2935static void
2936maybe_init_omp_async (struct agent_info *agent)
2937{
2938 if (!agent->omp_async_queue)
2939 agent->omp_async_queue
2940 = GOMP_OFFLOAD_openacc_async_construct (agent->device_id);
2941}
2942
2943/* Copy data to or from a device. This is intended for use as an async
2944 callback event. */
2945
2946static void
2947copy_data (void *data_)
2948{
2949 struct copy_data *data = (struct copy_data *)data_;
2950 GCN_DEBUG ("Async thread %d:%d: Copying %zu bytes from (%p) to (%p)\n",
2951 data->aq->agent->device_id, data->aq->id, data->len, data->src,
2952 data->dst);
2953 hsa_fns.hsa_memory_copy_fn (data->dst, data->src, data->len);
2954 if (data->free_src)
2955 free ((void *) data->src);
2956 free (data);
2957}
2958
2959/* Free device data. This is intended for use as an async callback event. */
2960
2961static void
2962gomp_offload_free (void *ptr)
2963{
2964 GCN_DEBUG ("Async thread ?:?: Freeing %p\n", ptr);
2965 GOMP_OFFLOAD_free (0, ptr);
2966}
2967
2968/* Request an asynchronous data copy, to or from a device, on a given queue.
2969 The event will be registered as a callback. If FREE_SRC is true
2970 then the source data will be freed following the copy. */
2971
2972static void
2973queue_push_copy (struct goacc_asyncqueue *aq, void *dst, const void *src,
2974 size_t len, bool free_src)
2975{
2976 if (DEBUG_QUEUES)
2977 GCN_DEBUG ("queue_push_copy %d:%d: %zu bytes from (%p) to (%p)\n",
2978 aq->agent->device_id, aq->id, len, src, dst);
2979 struct copy_data *data
2980 = (struct copy_data *)GOMP_PLUGIN_malloc (sizeof (struct copy_data));
2981 data->dst = dst;
2982 data->src = src;
2983 data->len = len;
2984 data->free_src = free_src;
2985 data->aq = aq;
2986 queue_push_callback (aq, copy_data, data);
2987}
2988
2989/* Return true if the given queue is currently empty. */
2990
2991static int
2992queue_empty (struct goacc_asyncqueue *aq)
2993{
2994 pthread_mutex_lock (&aq->mutex);
2995 int res = aq->queue_n == 0 ? 1 : 0;
2996 pthread_mutex_unlock (&aq->mutex);
2997
2998 return res;
2999}
3000
3001/* Wait for a given queue to become empty. This implements an OpenACC wait
3002 directive. */
3003
3004static void
3005wait_queue (struct goacc_asyncqueue *aq)
3006{
3007 if (DRAIN_QUEUE_SYNCHRONOUS_P)
3008 {
3009 drain_queue_synchronous (aq);
3010 return;
3011 }
3012
3013 pthread_mutex_lock (&aq->mutex);
3014
3015 while (aq->queue_n > 0)
3016 {
3017 if (DEBUG_THREAD_SLEEP)
3018 GCN_DEBUG ("waiting for thread %d:%d, putting thread to sleep\n",
3019 aq->agent->device_id, aq->id);
3020 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
3021 if (DEBUG_THREAD_SLEEP)
3022 GCN_DEBUG ("thread %d:%d woke up. Rechecking\n", aq->agent->device_id,
3023 aq->id);
3024 }
3025
3026 pthread_mutex_unlock (&aq->mutex);
3027 GCN_DEBUG ("waiting for thread %d:%d, done\n", aq->agent->device_id, aq->id);
3028}
3029
3030/* }}} */
3031/* {{{ OpenACC support */
3032
3033/* Execute an OpenACC kernel, synchronously or asynchronously. */
3034
3035static void
3036gcn_exec (struct kernel_info *kernel, size_t mapnum, void **hostaddrs,
3037 void **devaddrs, unsigned *dims, void *targ_mem_desc, bool async,
3038 struct goacc_asyncqueue *aq)
3039{
3040 if (!GOMP_OFFLOAD_can_run (kernel))
3041 GOMP_PLUGIN_fatal ("OpenACC host fallback unimplemented.");
3042
3043 /* If we get here then this must be an OpenACC kernel. */
3044 kernel->kind = KIND_OPENACC;
3045
3046 /* devaddrs must be double-indirect on the target. */
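/* That is, the kernel receives a single pointer to a device-side array
   holding one address per mapping, allocated and filled in just
   below. */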
3047 void **ind_da = alloc_by_agent (kernel->agent, sizeof (void*) * mapnum);
3048 for (size_t i = 0; i < mapnum; i++)
3049 hsa_fns.hsa_memory_copy_fn (&ind_da[i],
3050 devaddrs[i] ? &devaddrs[i] : &hostaddrs[i],
3051 sizeof (void *));
3052
3053 struct hsa_kernel_description *hsa_kernel_desc = NULL;
3054 for (unsigned i = 0; i < kernel->module->image_desc->kernel_count; i++)
3055 {
3056 struct hsa_kernel_description *d
3057 = &kernel->module->image_desc->kernel_infos[i];
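/* Pointer equality is intentional here: kernel->name aliases the
   descriptor's string (see init_basic_kernel_info). */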
3058 if (d->name == kernel->name)
3059 {
3060 hsa_kernel_desc = d;
3061 break;
3062 }
3063 }
3064
3065 /* We may have statically-determined dimensions in
3066 hsa_kernel_desc->oacc_dims[] or dimensions passed to this offload kernel
3067 invocation at runtime in dims[]. We allow static dimensions to take
3068 priority over dynamic dimensions when present (non-zero). */
3069 if (hsa_kernel_desc->oacc_dims[0] > 0)
3070 dims[0] = hsa_kernel_desc->oacc_dims[0];
3071 if (hsa_kernel_desc->oacc_dims[1] > 0)
3072 dims[1] = hsa_kernel_desc->oacc_dims[1];
3073 if (hsa_kernel_desc->oacc_dims[2] > 0)
3074 dims[2] = hsa_kernel_desc->oacc_dims[2];
3075
3076 /* If any of the OpenACC dimensions remain 0 then we get to pick a number.
3077 There isn't really a correct answer for this without a clue about the
3078 problem size, so let's do a reasonable number of single-worker gangs.
3079 64 gangs matches a typical Fiji device. */
3080
3081 /* NOTE: Until support for middle-end worker partitioning is merged, use 1
3082 for the default number of workers. */
3083 if (dims[0] == 0) dims[0] = get_cu_count (kernel->agent); /* Gangs. */
3084 if (dims[1] == 0) dims[1] = 1; /* Workers. */
3085
3086 /* The incoming dimensions are expressed in terms of gangs, workers, and
3087 vectors. The HSA dimensions are expressed in terms of "work-items",
3088 which means multiples of vector lanes.
3089
3090 The "grid size" specifies the size of the problem space, and the
3091 "work-group size" specifies how much of that we want a single compute
3092 unit to chew on at once.
3093
3094 The three dimensions do not really correspond to hardware, but the
3095 important thing is that the HSA runtime will launch as many
3096 work-groups as it takes to process the entire grid, and each
3097 work-group will contain as many wave-fronts as it takes to process
3098 the work-items in that group.
3099
3100 Essentially, as long as we set the Y dimension to 64 (the number of
3101 vector lanes in hardware), and the Z group size to the maximum (16),
3102 then we will get the gangs (X) and workers (Z) launched as we expect.
3103
3104 The reason for the apparent reversal of vector and worker dimension
3105 order is to do with the way the run-time distributes work-items across
3106 v1 and v2. */
3107 struct GOMP_kernel_launch_attributes kla =
3108 {3,
3109 /* Grid size. */
3110 {dims[0], 64, dims[1]},
3111 /* Work-group size. */
3112 {1, 64, 16}
3113 };
3114
3115 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
3116 acc_prof_info *prof_info = thr->prof_info;
3117 acc_event_info enqueue_launch_event_info;
3118 acc_api_info *api_info = thr->api_info;
3119 bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
3120 if (profiling_dispatch_p)
3121 {
3122 prof_info->event_type = acc_ev_enqueue_launch_start;
3123
3124 enqueue_launch_event_info.launch_event.event_type
3125 = prof_info->event_type;
3126 enqueue_launch_event_info.launch_event.valid_bytes
3127 = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
3128 enqueue_launch_event_info.launch_event.parent_construct
3129 = acc_construct_parallel;
3130 enqueue_launch_event_info.launch_event.implicit = 1;
3131 enqueue_launch_event_info.launch_event.tool_info = NULL;
3132 enqueue_launch_event_info.launch_event.kernel_name
3133 = (char *) kernel->name;
3134 enqueue_launch_event_info.launch_event.num_gangs = kla.gdims[0];
3135 enqueue_launch_event_info.launch_event.num_workers = kla.gdims[2];
3136 enqueue_launch_event_info.launch_event.vector_length = kla.gdims[1];
3137
3138 api_info->device_api = acc_device_api_other;
3139
3140 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3141 &enqueue_launch_event_info, api_info);
3142 }
3143
3144 if (!async)
3145 {
3146 run_kernel (kernel, ind_da, &kla, NULL, false);
3147 gomp_offload_free (ind_da);
3148 }
3149 else
3150 {
3151 queue_push_launch (aq, kernel, ind_da, &kla);
3152 if (DEBUG_QUEUES)
3153 GCN_DEBUG ("queue_push_callback %d:%d gomp_offload_free, %p\n",
3154 aq->agent->device_id, aq->id, ind_da);
3155 queue_push_callback (aq, gomp_offload_free, ind_da);
3156 }
3157
3158 if (profiling_dispatch_p)
3159 {
3160 prof_info->event_type = acc_ev_enqueue_launch_end;
3161 enqueue_launch_event_info.launch_event.event_type = prof_info->event_type;
3162 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3163 &enqueue_launch_event_info,
3164 api_info);
3165 }
3166}
3167
3168/* }}} */
3169/* {{{ Generic Plugin API */
3170
3171/* Return the name of the accelerator, which is "gcn". */
3172
3173const char *
3174GOMP_OFFLOAD_get_name (void)
3175{
3176 return "gcn";
3177}
3178
3179 /* Return the specific capabilities of the HSA accelerator. */
3180
3181unsigned int
3182GOMP_OFFLOAD_get_caps (void)
3183{
3184 /* FIXME: Enable shared memory for APU, but not discrete GPU. */
3185 return /*GOMP_OFFLOAD_CAP_SHARED_MEM |*/ GOMP_OFFLOAD_CAP_OPENMP_400
3186 | GOMP_OFFLOAD_CAP_OPENACC_200;
3187}
3188
3189/* Identify as GCN accelerator. */
3190
3191int
3192GOMP_OFFLOAD_get_type (void)
3193{
3194 return OFFLOAD_TARGET_TYPE_GCN;
3195}
3196
3197/* Return the libgomp version number we're compatible with. There is
3198 no requirement for cross-version compatibility. */
3199
3200unsigned
3201GOMP_OFFLOAD_version (void)
3202{
3203 return GOMP_VERSION;
3204}
3205
3206/* Return the number of GCN devices on the system. */
3207
3208int
3209GOMP_OFFLOAD_get_num_devices (void)
3210{
3211 if (!init_hsa_context ())
3212 return 0;
3213 return hsa_context.agent_count;
3214}
3215
3216/* Initialize device (agent) number N so that it can be used for computation.
3217 Return TRUE on success. */
3218
3219bool
3220GOMP_OFFLOAD_init_device (int n)
3221{
3222 if (!init_hsa_context ())
3223 return false;
3224 if (n >= hsa_context.agent_count)
3225 {
3226 GOMP_PLUGIN_error ("Request to initialize non-existent GCN device %i", n);
3227 return false;
3228 }
3229 struct agent_info *agent = &hsa_context.agents[n];
3230
3231 if (agent->initialized)
3232 return true;
3233
3234 agent->device_id = n;
3235
3236 if (pthread_rwlock_init (&agent->module_rwlock, NULL))
3237 {
3238 GOMP_PLUGIN_error ("Failed to initialize a GCN agent rwlock");
3239 return false;
3240 }
3241 if (pthread_mutex_init (&agent->prog_mutex, NULL))
3242 {
3243 GOMP_PLUGIN_error ("Failed to initialize a GCN agent program mutex");
3244 return false;
3245 }
3246 if (pthread_mutex_init (&agent->async_queues_mutex, NULL))
3247 {
3248 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
3249 return false;
3250 }
3251 if (pthread_mutex_init (&agent->team_arena_write_lock, NULL))
3252 {
3253 GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
3254 return false;
3255 }
3256 agent->async_queues = NULL;
3257 agent->omp_async_queue = NULL;
3258 agent->team_arena_list = NULL;
3259
3260 uint32_t queue_size;
3261 hsa_status_t status;
3262 status = hsa_fns.hsa_agent_get_info_fn (agent->id,
3263 HSA_AGENT_INFO_QUEUE_MAX_SIZE,
3264 &queue_size);
3265 if (status != HSA_STATUS_SUCCESS)
3266 return hsa_error ("Error requesting maximum queue size of the GCN agent",
3267 status);
3268
3269 char buf[64];
3270 status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_NAME,
3271 &buf);
3272 if (status != HSA_STATUS_SUCCESS)
3273 return hsa_error ("Error querying the name of the agent", status);
3274 agent->gfx900_p = (strncmp (buf, "gfx900", 6) == 0);
3275
3276 status = hsa_fns.hsa_queue_create_fn (agent->id, queue_size,
3277 HSA_QUEUE_TYPE_MULTI,
3278 hsa_queue_callback, NULL, UINT32_MAX,
3279 UINT32_MAX, &agent->sync_queue);
3280 if (status != HSA_STATUS_SUCCESS)
3281 return hsa_error ("Error creating command queue", status);
3282
3283 agent->kernarg_region.handle = (uint64_t) -1;
3284 status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
3285 get_kernarg_memory_region,
3286 &agent->kernarg_region);
3287 if (agent->kernarg_region.handle == (uint64_t) -1)
3288 {
3289 GOMP_PLUGIN_error ("Could not find suitable memory region for kernel "
3290 "arguments");
3291 return false;
3292 }
3293 GCN_DEBUG ("Selected kernel arguments memory region:\n");
3294 dump_hsa_region (agent->kernarg_region, NULL);
3295
3296 agent->data_region.handle = (uint64_t) -1;
3297 status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
3298 get_data_memory_region,
3299 &agent->data_region);
3300 if (agent->data_region.handle == (uint64_t) -1)
3301 {
3302 GOMP_PLUGIN_error ("Could not find suitable memory region for device "
3303 "data");
3304 return false;
3305 }
3306 GCN_DEBUG ("Selected device data memory region:\n");
3307 dump_hsa_region (agent->data_region, NULL);
3308
3309 GCN_DEBUG ("GCN agent %d initialized\n", n);
3310
3311 agent->initialized = true;
3312 return true;
3313}
3314
3315/* Load GCN object-code module described by struct gcn_image_desc in
3316 TARGET_DATA and return references to kernel descriptors in TARGET_TABLE.
3317 If there are any constructors then run them. */
3318
3319int
3320GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
3321 struct addr_pair **target_table)
3322{
3323 if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
3324 {
3325 GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3326 " (expected %u, received %u)",
3327 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
3328 return -1;
3329 }
3330
3331 struct gcn_image_desc *image_desc = (struct gcn_image_desc *) target_data;
3332 struct agent_info *agent;
3333 struct addr_pair *pair;
3334 struct module_info *module;
3335 struct kernel_info *kernel;
3336 int kernel_count = image_desc->kernel_count;
3337 unsigned var_count = image_desc->global_variable_count;
3338
3339 agent = get_agent_info (ord);
3340 if (!agent)
3341 return -1;
3342
3343 if (pthread_rwlock_wrlock (&agent->module_rwlock))
3344 {
3345 GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3346 return -1;
3347 }
3348 if (agent->prog_finalized
3349 && !destroy_hsa_program (agent))
3350 return -1;
3351
3352 GCN_DEBUG ("Encountered %d kernels in an image\n", kernel_count);
3353 GCN_DEBUG ("Encountered %u global variables in an image\n", var_count);
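/* Note: the "- 2" assumes the image always provides both "_init_array"
   and "_fini_array" kernels, since those two do not receive addr_pair
   entries in the loop below. */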
3354 pair = GOMP_PLUGIN_malloc ((kernel_count + var_count - 2)
3355 * sizeof (struct addr_pair));
3356 *target_table = pair;
3357 module = (struct module_info *)
3358 GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info)
3359 + kernel_count * sizeof (struct kernel_info));
3360 module->image_desc = image_desc;
3361 module->kernel_count = kernel_count;
3362 module->heap = NULL;
3363 module->constructors_run_p = false;
3364
3365 kernel = &module->kernels[0];
3366
3367 /* Allocate memory for kernel dependencies. */
3368 for (unsigned i = 0; i < kernel_count; i++)
3369 {
3370 struct hsa_kernel_description *d = &image_desc->kernel_infos[i];
3371 if (!init_basic_kernel_info (kernel, d, agent, module))
3372 return -1;
3373 if (strcmp (d->name, "_init_array") == 0)
3374 module->init_array_func = kernel;
3375 else if (strcmp (d->name, "_fini_array") == 0)
3376 module->fini_array_func = kernel;
3377 else
3378 {
3379 pair->start = (uintptr_t) kernel;
3380 pair->end = (uintptr_t) (kernel + 1);
3381 pair++;
3382 }
3383 kernel++;
3384 }
3385
3386 agent->module = module;
3387 if (pthread_rwlock_unlock (&agent->module_rwlock))
3388 {
3389 GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3390 return -1;
3391 }
3392
3393 if (!create_and_finalize_hsa_program (agent))
3394 return -1;
3395
3396 for (unsigned i = 0; i < var_count; i++)
3397 {
3398 struct global_var_info *v = &image_desc->global_variables[i];
3399 GCN_DEBUG ("Looking for variable %s\n", v->name);
3400
3401 hsa_status_t status;
3402 hsa_executable_symbol_t var_symbol;
3403 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3404 v->name, agent->id,
3405 0, &var_symbol);
3406
3407 if (status != HSA_STATUS_SUCCESS)
3408 hsa_fatal ("Could not find symbol for variable in the code object",
3409 status);
3410
3411 uint64_t var_addr;
3412 uint32_t var_size;
3413 status = hsa_fns.hsa_executable_symbol_get_info_fn
3414 (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &var_addr);
3415 if (status != HSA_STATUS_SUCCESS)
3416 hsa_fatal ("Could not extract a variable from its symbol", status);
3417 status = hsa_fns.hsa_executable_symbol_get_info_fn
3418 (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &var_size);
3419 if (status != HSA_STATUS_SUCCESS)
3420 hsa_fatal ("Could not extract a variable size from its symbol", status);
3421
3422 pair->start = var_addr;
3423 pair->end = var_addr + var_size;
3424 GCN_DEBUG ("Found variable %s at %p with size %u\n", v->name,
3425 (void *)var_addr, var_size);
3426 pair++;
3427 }
3428
3429 /* Ensure that constructors are run first. */
3430 struct GOMP_kernel_launch_attributes kla =
3431 { 3,
3432 /* Grid size. */
3433 { 1, 64, 1 },
3434 /* Work-group size. */
3435 { 1, 64, 1 }
3436 };
3437
3438 if (module->init_array_func)
3439 {
3440 init_kernel (module->init_array_func);
3441 run_kernel (module->init_array_func, NULL, &kla, NULL, false);
3442 }
3443 module->constructors_run_p = true;
3444
3445 /* Don't report kernels that libgomp need not know about. */
3446 if (module->init_array_func)
3447 kernel_count--;
3448 if (module->fini_array_func)
3449 kernel_count--;
3450
3451 return kernel_count + var_count;
3452}
3453
3454/* Unload GCN object-code module described by struct gcn_image_desc in
3455 TARGET_DATA from agent number N. Return TRUE on success. */
3456
3457bool
3458GOMP_OFFLOAD_unload_image (int n, unsigned version, const void *target_data)
3459{
3460 if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
3461 {
3462 GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3463 " (expected %u, received %u)",
3464 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
3465 return false;
3466 }
3467
3468 struct agent_info *agent;
3469 agent = get_agent_info (n);
3470 if (!agent)
3471 return false;
3472
3473 if (pthread_rwlock_wrlock (&agent->module_rwlock))
3474 {
3475 GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3476 return false;
3477 }
3478
3479 if (!agent->module || agent->module->image_desc != target_data)
3480 {
3481 GOMP_PLUGIN_error ("Attempt to unload an image that has never been "
3482 "loaded before");
3483 return false;
3484 }
3485
3486 if (!destroy_module (agent->module, true))
3487 return false;
3488 free (agent->module);
3489 agent->module = NULL;
3490 if (!destroy_hsa_program (agent))
3491 return false;
3492 if (pthread_rwlock_unlock (&agent->module_rwlock))
3493 {
3494 GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3495 return false;
3496 }
3497 return true;
3498}
3499
3500/* Deinitialize all information and status associated with agent number N. We
3501 do not attempt any synchronization, assuming the user and libgomp will not
3502 attempt deinitialization of a device that is in any way being used at the
3503 same time. Return TRUE on success. */
3504
3505bool
3506GOMP_OFFLOAD_fini_device (int n)
3507{
3508 struct agent_info *agent = get_agent_info (n);
3509 if (!agent)
3510 return false;
3511
3512 if (!agent->initialized)
3513 return true;
3514
3515 if (agent->omp_async_queue)
3516 {
3517 GOMP_OFFLOAD_openacc_async_destruct (agent->omp_async_queue);
3518 agent->omp_async_queue = NULL;
3519 }
3520
3521 if (agent->module)
3522 {
3523 if (!destroy_module (agent->module, false))
3524 return false;
3525 free (agent->module);
3526 agent->module = NULL;
3527 }
3528
3529 if (!destroy_team_arenas (agent))
3530 return false;
3531
3532 if (!destroy_hsa_program (agent))
3533 return false;
3534
3535 hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (agent->sync_queue);
3536 if (status != HSA_STATUS_SUCCESS)
3537 return hsa_error ("Error destroying command queue", status);
3538
3539 if (pthread_mutex_destroy (&agent->prog_mutex))
3540 {
3541 GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex");
3542 return false;
3543 }
3544 if (pthread_rwlock_destroy (&agent->module_rwlock))
3545 {
3546 GOMP_PLUGIN_error ("Failed to destroy a GCN agent rwlock");
3547 return false;
3548 }
3549
3550 if (pthread_mutex_destroy (&agent->async_queues_mutex))
3551 {
3552 GOMP_PLUGIN_error ("Failed to destroy a GCN agent queue mutex");
3553 return false;
3554 }
3555 if (pthread_mutex_destroy (&agent->team_arena_write_lock))
3556 {
3557 GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex");
3558 return false;
3559 }
3560 agent->initialized = false;
3561 return true;
3562}
3563
3564/* Return true if the HSA runtime can run function FN_PTR. */
3565
3566bool
3567GOMP_OFFLOAD_can_run (void *fn_ptr)
3568{
3569 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3570
3571 init_kernel (kernel);
3572 if (kernel->initialization_failed)
3573 goto failure;
3574
3575 return true;
3576
3577failure:
3578 if (suppress_host_fallback)
3579 GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
3580 GCN_WARNING ("GCN target cannot be launched, doing a host fallback\n");
3581 return false;
3582}
3583
3584/* Allocate memory on device N. */
3585
3586void *
3587GOMP_OFFLOAD_alloc (int n, size_t size)
3588{
3589 struct agent_info *agent = get_agent_info (n);
3590 return alloc_by_agent (agent, size);
3591}
3592
3593/* Free memory from device N. */
3594
3595bool
3596GOMP_OFFLOAD_free (int device, void *ptr)
3597{
3598 GCN_DEBUG ("Freeing memory on device %d\n", device);
3599
3600 hsa_status_t status = hsa_fns.hsa_memory_free_fn (ptr);
3601 if (status != HSA_STATUS_SUCCESS)
3602 {
3603 hsa_error ("Could not free device memory", status);
3604 return false;
3605 }
3606
3607 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
3608 bool profiling_dispatch_p
3609 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
3610 if (profiling_dispatch_p)
3611 {
3612 acc_prof_info *prof_info = thr->prof_info;
3613 acc_event_info data_event_info;
3614 acc_api_info *api_info = thr->api_info;
3615
3616 prof_info->event_type = acc_ev_free;
3617
3618 data_event_info.data_event.event_type = prof_info->event_type;
3619 data_event_info.data_event.valid_bytes
3620 = _ACC_DATA_EVENT_INFO_VALID_BYTES;
3621 data_event_info.data_event.parent_construct
3622 = acc_construct_parallel;
3623 data_event_info.data_event.implicit = 1;
3624 data_event_info.data_event.tool_info = NULL;
3625 data_event_info.data_event.var_name = NULL;
3626 data_event_info.data_event.bytes = 0;
3627 data_event_info.data_event.host_ptr = NULL;
3628 data_event_info.data_event.device_ptr = (void *) ptr;
3629
3630 api_info->device_api = acc_device_api_other;
3631
3632 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
3633 api_info);
3634 }
3635
3636 return true;
3637}

/* Copy data from DEVICE to host.  */

bool
GOMP_OFFLOAD_dev2host (int device, void *dst, const void *src, size_t n)
{
  GCN_DEBUG ("Copying %zu bytes from device %d (%p) to host (%p)\n", n, device,
             src, dst);
  hsa_fns.hsa_memory_copy_fn (dst, src, n);
  return true;
}

/* Copy data from host to DEVICE.  */

bool
GOMP_OFFLOAD_host2dev (int device, void *dst, const void *src, size_t n)
{
  GCN_DEBUG ("Copying %zu bytes from host (%p) to device %d (%p)\n", n, src,
             device, dst);
  hsa_fns.hsa_memory_copy_fn (dst, src, n);
  return true;
}

/* Copy data within DEVICE.  Do the copy asynchronously, if appropriate.  */

bool
GOMP_OFFLOAD_dev2dev (int device, void *dst, const void *src, size_t n)
{
  struct gcn_thread *thread_data = gcn_thread ();

  if (thread_data && !async_synchronous_p (thread_data->async))
    {
      struct agent_info *agent = get_agent_info (device);
      maybe_init_omp_async (agent);
      queue_push_copy (agent->omp_async_queue, dst, src, n, false);
      return true;
    }

  GCN_DEBUG ("Copying %zu bytes from device %d (%p) to device %d (%p)\n", n,
             device, src, device, dst);
  hsa_fns.hsa_memory_copy_fn (dst, src, n);
  return true;
}
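
/* For illustration only: the three copy hooks above are what the
   user-level omp_target_memcpy routine ultimately reaches, with the
   direction selected from the two device numbers.  A hedged host-side
   sketch (device number 0 is an assumption):

     int dev = 0;
     int host = omp_get_initial_device ();
     char buf[64] = "hello";
     void *d = omp_target_alloc (sizeof buf, dev);
     omp_target_memcpy (d, buf, sizeof buf, 0, 0, dev, host);  // host2dev
     omp_target_memcpy (buf, d, sizeof buf, 0, 0, host, dev);  // dev2host
*/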

/* }}}  */
/* {{{ OpenMP Plugin API  */

/* Run a synchronous OpenMP kernel on DEVICE and pass it an array of pointers
   in VARS as a parameter.  The kernel is identified by FN_PTR which must
   point to a kernel_info structure, and must have previously been loaded to
   the specified device.  */

void
GOMP_OFFLOAD_run (int device, void *fn_ptr, void *vars, void **args)
{
  struct agent_info *agent = get_agent_info (device);
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
  struct GOMP_kernel_launch_attributes def;
  struct GOMP_kernel_launch_attributes *kla;
  assert (agent == kernel->agent);

  /* If we get here then the kernel must be OpenMP.  */
  kernel->kind = KIND_OPENMP;

  if (!parse_target_attributes (args, &def, &kla, agent))
    {
      GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
      return;
    }
  run_kernel (kernel, vars, kla, NULL, false);
}
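
/* For illustration only: ARGS is a NULL-terminated vector built by libgomp.
   A hedged sketch of one value-embedded entry, assuming the encoding macros
   from gomp-constants.h compose as in libgomp's target.c (the authoritative
   encoding lives there):

     uintptr_t arg = GOMP_TARGET_ARG_DEVICE_ALL
                     | GOMP_TARGET_ARG_NUM_TEAMS
                     | ((uintptr_t) 4 << GOMP_TARGET_ARG_VALUE_SHIFT);
     void *args[] = { (void *) arg, NULL };   // request 4 teams
*/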

/* Run an asynchronous OpenMP kernel on DEVICE.  This is similar to
   GOMP_OFFLOAD_run except that the launch is queued and there is a call to
   GOMP_PLUGIN_target_task_completion when it has finished.  */

void
GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
                        void **args, void *async_data)
{
  GCN_DEBUG ("GOMP_OFFLOAD_async_run invoked\n");
  struct agent_info *agent = get_agent_info (device);
  struct kernel_info *kernel = (struct kernel_info *) tgt_fn;
  struct GOMP_kernel_launch_attributes def;
  struct GOMP_kernel_launch_attributes *kla;
  assert (agent == kernel->agent);

  /* If we get here then the kernel must be OpenMP.  */
  kernel->kind = KIND_OPENMP;

  if (!parse_target_attributes (args, &def, &kla, agent))
    {
      GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
      return;
    }

  maybe_init_omp_async (agent);
  queue_push_launch (agent->omp_async_queue, kernel, tgt_vars, kla);
  queue_push_callback (agent->omp_async_queue,
                       GOMP_PLUGIN_target_task_completion, async_data);
}
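
/* For illustration only: this hook is reached for deferred target regions,
   e.g. host code such as

     #pragma omp target nowait map(tofrom: x)
     x++;

   where libgomp wraps the region in a target task and expects
   GOMP_PLUGIN_target_task_completion (async_data) to be called once the
   kernel has finished.  */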

/* }}}  */
/* {{{ OpenACC Plugin API  */

/* Run a synchronous OpenACC kernel.  The device number is inferred from the
   already-loaded KERNEL.  */

void
GOMP_OFFLOAD_openacc_exec (void (*fn_ptr) (void *), size_t mapnum,
                           void **hostaddrs, void **devaddrs, unsigned *dims,
                           void *targ_mem_desc)
{
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;

  gcn_exec (kernel, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, false,
            NULL);
}
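
/* For illustration only: DIMS carries the launch geometry indexed by the
   GOMP_DIM_* constants from gomp-constants.h.  A hedged sketch (the gang and
   worker counts are assumptions; on GCN the vector length matches the
   64-lane wavefront):

     unsigned dims[GOMP_DIM_MAX];
     dims[GOMP_DIM_GANG] = 32;
     dims[GOMP_DIM_WORKER] = 4;
     dims[GOMP_DIM_VECTOR] = 64;
*/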

/* Run an asynchronous OpenACC kernel on the specified queue.  */

void
GOMP_OFFLOAD_openacc_async_exec (void (*fn_ptr) (void *), size_t mapnum,
                                 void **hostaddrs, void **devaddrs,
                                 unsigned *dims, void *targ_mem_desc,
                                 struct goacc_asyncqueue *aq)
{
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;

  gcn_exec (kernel, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, true,
            aq);
}

/* Create a new asynchronous thread and queue for running future kernels.  */

struct goacc_asyncqueue *
GOMP_OFFLOAD_openacc_async_construct (int device)
{
  struct agent_info *agent = get_agent_info (device);

  pthread_mutex_lock (&agent->async_queues_mutex);

  struct goacc_asyncqueue *aq = GOMP_PLUGIN_malloc (sizeof (*aq));
  aq->agent = agent;
  aq->prev = NULL;
  aq->next = agent->async_queues;
  if (aq->next)
    {
      aq->next->prev = aq;
      aq->id = aq->next->id + 1;
    }
  else
    aq->id = 1;
  agent->async_queues = aq;

  aq->queue_first = 0;
  aq->queue_n = 0;
  aq->drain_queue_stop = 0;

  if (pthread_mutex_init (&aq->mutex, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
      pthread_mutex_unlock (&agent->async_queues_mutex);
      return NULL;
    }
  if (pthread_cond_init (&aq->queue_cond_in, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
      pthread_mutex_unlock (&agent->async_queues_mutex);
      return NULL;
    }
  if (pthread_cond_init (&aq->queue_cond_out, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
      pthread_mutex_unlock (&agent->async_queues_mutex);
      return NULL;
    }

  hsa_status_t status = hsa_fns.hsa_queue_create_fn (agent->id,
                                                     ASYNC_QUEUE_SIZE,
                                                     HSA_QUEUE_TYPE_MULTI,
                                                     hsa_queue_callback, NULL,
                                                     UINT32_MAX, UINT32_MAX,
                                                     &aq->hsa_queue);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Error creating command queue", status);

  int err = pthread_create (&aq->thread_drain_queue, NULL, &drain_queue, aq);
  if (err != 0)
    GOMP_PLUGIN_fatal ("GCN asynchronous thread creation failed: %s",
                       strerror (err));
  GCN_DEBUG ("Async thread %d:%d: created\n", aq->agent->device_id,
             aq->id);

  pthread_mutex_unlock (&agent->async_queues_mutex);

  return aq;
}
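
/* For illustration only: the OpenACC runtime creates one such queue lazily
   per distinct async value, so host code such as

     #pragma acc parallel async(5)
     { /* ... offloaded work ... */ }

   reaches this hook the first time async(5) is used on this device.  */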

/* Destroy an existing asynchronous thread and queue.  Waits for any
   currently-running task to complete, but cancels any queued tasks.  */

bool
GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = aq->agent;

  finalize_async_thread (aq);

  pthread_mutex_lock (&agent->async_queues_mutex);

  int err;
  if ((err = pthread_mutex_destroy (&aq->mutex)))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue mutex: %d", err);
      goto fail;
    }
  if (pthread_cond_destroy (&aq->queue_cond_in))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
      goto fail;
    }
  if (pthread_cond_destroy (&aq->queue_cond_out))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
      goto fail;
    }
  hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (aq->hsa_queue);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Error destroying command queue", status);
      goto fail;
    }

  if (aq->prev)
    aq->prev->next = aq->next;
  if (aq->next)
    aq->next->prev = aq->prev;
  if (agent->async_queues == aq)
    agent->async_queues = aq->next;

  GCN_DEBUG ("Async thread %d:%d: destroyed\n", agent->device_id, aq->id);

  free (aq);
  pthread_mutex_unlock (&agent->async_queues_mutex);
  return true;

fail:
  pthread_mutex_unlock (&agent->async_queues_mutex);
  return false;
}

/* Return true if the specified async queue is currently empty.  */

int
GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
{
  return queue_empty (aq);
}

/* Block until the specified queue has executed all its tasks and the
   queue is empty.  */

bool
GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
{
  wait_queue (aq);
  return true;
}
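
/* For illustration only: these two hooks back the user-level OpenACC query
   and wait routines for the queue mapped to a given async value:

     if (!acc_async_test (1))   // -> GOMP_OFFLOAD_openacc_async_test
       acc_wait (1);            // -> GOMP_OFFLOAD_openacc_async_synchronize
*/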

/* Add a serialization point across two async queues.  Any new tasks added to
   AQ2, after this call, will not run until all tasks on AQ1, at the time
   of this call, have completed.  */

bool
GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
                                      struct goacc_asyncqueue *aq2)
{
  /* For serialize, stream aq2 waits for aq1 to complete work that has been
     scheduled to run on it up to this point.  */
  if (aq1 != aq2)
    {
      struct placeholder *placeholderp = queue_push_placeholder (aq1);
      queue_push_asyncwait (aq2, placeholderp);
    }
  return true;
}
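
/* For illustration only: this implements the cross-queue dependency behind
   OpenACC waits that do not block the host, e.g.

     #pragma acc parallel async(1)
     { ... }
     #pragma acc wait(1) async(2)   // later async(2) work runs only after
                                    // the async(1) work queued so far
*/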

/* Add an opaque callback to the given async queue.  */

void
GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
                                           void (*fn) (void *), void *data)
{
  queue_push_callback (aq, fn, data);
}

/* Queue up an asynchronous data copy from host to DEVICE.  */

bool
GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
                                     size_t n, struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = get_agent_info (device);
  assert (agent == aq->agent);
  /* The source data does not necessarily remain live until the deferred
     copy happens.  Taking a snapshot of the data here avoids reading
     uninitialised data later, but means that (a) data is copied twice and
     (b) modifications to the copied data between the "spawning" point of
     the asynchronous kernel and when it is executed will not be seen.
     But, that is probably correct.  */
  void *src_copy = GOMP_PLUGIN_malloc (n);
  memcpy (src_copy, src, n);
  queue_push_copy (aq, dst, src_copy, n, true);
  return true;
}
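
/* For illustration only: the snapshot taken above is what makes a host-side
   pattern like this behave predictably:

     int v = 1;
     #pragma acc update device(v) async(1)
     v = 2;               // does not alter the in-flight transfer of 1
     #pragma acc wait(1)
*/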

/* Queue up an asynchronous data copy from DEVICE to host.  */

bool
GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src,
                                     size_t n, struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = get_agent_info (device);
  assert (agent == aq->agent);
  queue_push_copy (aq, dst, src, n, false);
  return true;
}

/* Set up plugin-specific thread-local data (host-side).  */

void *
GOMP_OFFLOAD_openacc_create_thread_data (int ord __attribute__((unused)))
{
  struct gcn_thread *thread_data
    = GOMP_PLUGIN_malloc (sizeof (struct gcn_thread));

  thread_data->async = GOMP_ASYNC_SYNC;

  return (void *) thread_data;
}

/* Clean up plugin-specific thread-local data.  */

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}

/* }}}  */