/* Plugin for AMD GCN execution.

   Copyright (C) 2013-2021 Free Software Foundation, Inc.

   Contributed by Mentor Embedded

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */
/* {{{ Includes and defines  */

#include "config.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <inttypes.h>
#include <stdbool.h>
#include <limits.h>
#include <hsa.h>
#include <hsa_ext_amd.h>
#include <dlfcn.h>
#include <signal.h>
#include "libgomp-plugin.h"
#include "gomp-constants.h"
#include <elf.h>
#include "oacc-plugin.h"
#include "oacc-int.h"
#include <assert.h>
/* These probably won't be in elf.h for a while.  */

#ifndef R_AMDGPU_NONE
#define R_AMDGPU_NONE		0
#define R_AMDGPU_ABS32_LO	1	/* (S + A) & 0xFFFFFFFF  */
#define R_AMDGPU_ABS32_HI	2	/* (S + A) >> 32  */
#define R_AMDGPU_ABS64		3	/* S + A  */
#define R_AMDGPU_REL32		4	/* S + A - P  */
#define R_AMDGPU_REL64		5	/* S + A - P  */
#define R_AMDGPU_ABS32		6	/* S + A  */
#define R_AMDGPU_GOTPCREL	7	/* G + GOT + A - P  */
#define R_AMDGPU_GOTPCREL32_LO	8	/* (G + GOT + A - P) & 0xFFFFFFFF  */
#define R_AMDGPU_GOTPCREL32_HI	9	/* (G + GOT + A - P) >> 32  */
#define R_AMDGPU_REL32_LO	10	/* (S + A - P) & 0xFFFFFFFF  */
#define R_AMDGPU_REL32_HI	11	/* (S + A - P) >> 32  */
#define R_AMDGPU_RELATIVE64	13	/* B + A  */
#endif

/* GCN specific definitions for asynchronous queues.  */

#define ASYNC_QUEUE_SIZE 64
#define DRAIN_QUEUE_SYNCHRONOUS_P false
#define DEBUG_QUEUES 0
#define DEBUG_THREAD_SLEEP 0
#define DEBUG_THREAD_SIGNAL 0

/* Defaults.  */
#define DEFAULT_GCN_HEAP_SIZE (100*1024*1024)  /* 100MB.  */
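/* The default can be overridden at run time via the GCN_HEAP_SIZE
   environment variable; see init_environment_variables below.  */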

/* Secure getenv() which returns NULL if running as SUID/SGID.  */
#ifndef HAVE_SECURE_GETENV
#ifdef HAVE___SECURE_GETENV
#define secure_getenv __secure_getenv
#elif defined (HAVE_UNISTD_H) && defined(HAVE_GETUID) && defined(HAVE_GETEUID) \
  && defined(HAVE_GETGID) && defined(HAVE_GETEGID)

#include <unistd.h>

/* Implementation of secure_getenv() for targets where it is not provided but
   we have at least the means to test real and effective IDs.  */

static char *
secure_getenv (const char *name)
{
  if ((getuid () == geteuid ()) && (getgid () == getegid ()))
    return getenv (name);
  else
    return NULL;
}

#else
#define secure_getenv getenv
#endif
#endif

/* }}}  */
/* {{{ Types  */

/* GCN-specific implementation of the GOMP_PLUGIN_acc_thread data.  */

struct gcn_thread
{
  /* The thread number from the async clause, or GOMP_ASYNC_SYNC.  */
  int async;
};

/* As the HSA runtime is dlopened, the following structure defines the
   function pointers utilized by the HSA plug-in.  */

struct hsa_runtime_fn_info
{
  /* HSA runtime.  */
  hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
					const char **status_string);
  hsa_status_t (*hsa_system_get_info_fn) (hsa_system_info_t attribute,
					  void *value);
  hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
					 hsa_agent_info_t attribute,
					 void *value);
  hsa_status_t (*hsa_isa_get_info_fn) (hsa_isa_t isa,
				       hsa_isa_info_t attribute,
				       uint32_t index,
				       void *value);
  hsa_status_t (*hsa_init_fn) (void);
  hsa_status_t (*hsa_iterate_agents_fn)
    (hsa_status_t (*callback) (hsa_agent_t agent, void *data), void *data);
  hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
					  hsa_region_info_t attribute,
					  void *value);
  hsa_status_t (*hsa_queue_create_fn)
    (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
     void (*callback) (hsa_status_t status, hsa_queue_t *source, void *data),
     void *data, uint32_t private_segment_size,
     uint32_t group_segment_size, hsa_queue_t **queue);
  hsa_status_t (*hsa_agent_iterate_regions_fn)
    (hsa_agent_t agent,
     hsa_status_t (*callback) (hsa_region_t region, void *data), void *data);
  hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
  hsa_status_t (*hsa_executable_create_fn)
    (hsa_profile_t profile, hsa_executable_state_t executable_state,
     const char *options, hsa_executable_t *executable);
  hsa_status_t (*hsa_executable_global_variable_define_fn)
    (hsa_executable_t executable, const char *variable_name, void *address);
  hsa_status_t (*hsa_executable_load_code_object_fn)
    (hsa_executable_t executable, hsa_agent_t agent,
     hsa_code_object_t code_object, const char *options);
  hsa_status_t (*hsa_executable_freeze_fn) (hsa_executable_t executable,
					    const char *options);
  hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
					uint32_t num_consumers,
					const hsa_agent_t *consumers,
					hsa_signal_t *signal);
  hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
					  void **ptr);
  hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent,
					      hsa_access_permission_t access);
  hsa_status_t (*hsa_memory_copy_fn) (void *dst, const void *src, size_t size);
  hsa_status_t (*hsa_memory_free_fn) (void *ptr);
  hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
  hsa_status_t (*hsa_executable_get_symbol_fn)
    (hsa_executable_t executable, const char *module_name,
     const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
     hsa_executable_symbol_t *symbol);
  hsa_status_t (*hsa_executable_symbol_get_info_fn)
    (hsa_executable_symbol_t executable_symbol,
     hsa_executable_symbol_info_t attribute, void *value);
  hsa_status_t (*hsa_executable_iterate_symbols_fn)
    (hsa_executable_t executable,
     hsa_status_t (*callback) (hsa_executable_t executable,
			       hsa_executable_symbol_t symbol, void *data),
     void *data);
  uint64_t (*hsa_queue_add_write_index_release_fn) (const hsa_queue_t *queue,
						    uint64_t value);
  uint64_t (*hsa_queue_load_read_index_acquire_fn) (const hsa_queue_t *queue);
  void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
				       hsa_signal_value_t value);
  void (*hsa_signal_store_release_fn) (hsa_signal_t signal,
				       hsa_signal_value_t value);
  hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
    (hsa_signal_t signal, hsa_signal_condition_t condition,
     hsa_signal_value_t compare_value, uint64_t timeout_hint,
     hsa_wait_state_t wait_state_hint);
  hsa_signal_value_t (*hsa_signal_load_acquire_fn) (hsa_signal_t signal);
  hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);

  hsa_status_t (*hsa_code_object_deserialize_fn)
    (void *serialized_code_object, size_t serialized_code_object_size,
     const char *options, hsa_code_object_t *code_object);
};

/* Structure describing the run-time and grid properties of an HSA kernel
   launch.  This needs to match the format passed to GOMP_OFFLOAD_run.  */

struct GOMP_kernel_launch_attributes
{
  /* Number of dimensions the workload has.  Maximum number is 3.  */
  uint32_t ndim;
  /* Size of the grid in the three respective dimensions.  */
  uint32_t gdims[3];
  /* Size of work-groups in the respective dimensions.  */
  uint32_t wdims[3];
};

/* Collection of information needed for a dispatch of a kernel from a
   kernel.  */

struct kernel_dispatch
{
  struct agent_info *agent;
  /* Pointer to a command queue associated with a kernel dispatch agent.  */
  void *queue;
  /* Pointer to a memory space used for passing kernel arguments.  */
  void *kernarg_address;
  /* Kernel object.  */
  uint64_t object;
  /* Synchronization signal used for dispatch synchronization.  */
  uint64_t signal;
  /* Private segment size.  */
  uint32_t private_segment_size;
  /* Group segment size.  */
  uint32_t group_segment_size;
};

/* Structure of the kernargs segment, supporting console output.

   This needs to match the definitions in Newlib, and the expectations
   in libgomp target code.  */

struct kernargs {
  /* Leave space for the real kernel arguments.
     OpenACC and OpenMP only use one pointer.  */
  int64_t dummy1;
  int64_t dummy2;

  /* A pointer to struct output, below, for console output data.  */
  int64_t out_ptr;

  /* A pointer to struct heap, below.  */
  int64_t heap_ptr;

  /* A pointer to an ephemeral memory arena.
     Only needed for OpenMP.  */
  int64_t arena_ptr;

  /* Output data.  */
  struct output {
    int return_value;
    unsigned int next_output;
    struct printf_data {
      int written;
      char msg[128];
      int type;
      union {
	int64_t ivalue;
	double dvalue;
	char text[128];
      };
    } queue[1024];
    unsigned int consumed;
  } output_data;
};
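
/* How the console-output handshake above is used (see console_output,
   below): the device claims a slot by advancing next_output, fills in the
   printf_data entry, then sets its "written" flag; the host polls, prints
   entries in order, and advances "consumed" to hand slots back.  The
   indices keep counting past the end of the queue even though the buffer
   itself is circular.  */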

/* A queue entry for a future asynchronous launch.  */

struct kernel_launch
{
  struct kernel_info *kernel;
  void *vars;
  struct GOMP_kernel_launch_attributes kla;
};

/* A queue entry for a future callback.  */

struct callback
{
  void (*fn)(void *);
  void *data;
};

/* A data struct for the copy_data callback.  */

struct copy_data
{
  void *dst;
  const void *src;
  size_t len;
  struct goacc_asyncqueue *aq;
};

/* A queue entry for a placeholder.  These correspond to a wait event.  */

struct placeholder
{
  int executed;
  pthread_cond_t cond;
  pthread_mutex_t mutex;
};

/* A queue entry for a wait directive.  */

struct asyncwait_info
{
  struct placeholder *placeholderp;
};

/* Encode the type of an entry in an async queue.  */

enum entry_type
{
  KERNEL_LAUNCH,
  CALLBACK,
  ASYNC_WAIT,
  ASYNC_PLACEHOLDER
};

/* An entry in an async queue.  */

struct queue_entry
{
  enum entry_type type;
  union {
    struct kernel_launch launch;
    struct callback callback;
    struct asyncwait_info asyncwait;
    struct placeholder placeholder;
  } u;
};

/* An async queue header.

   OpenMP may create one of these.
   OpenACC may create many.  */

struct goacc_asyncqueue
{
  struct agent_info *agent;
  hsa_queue_t *hsa_queue;

  pthread_t thread_drain_queue;
  pthread_mutex_t mutex;
  pthread_cond_t queue_cond_in;
  pthread_cond_t queue_cond_out;
  struct queue_entry queue[ASYNC_QUEUE_SIZE];
  int queue_first;
  int queue_n;
  int drain_queue_stop;

  int id;
  struct goacc_asyncqueue *prev;
  struct goacc_asyncqueue *next;
};
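
/* The queue array is used as a circular buffer: queue_first indexes the
   oldest pending entry and queue_n counts the entries in flight, modulo
   ASYNC_QUEUE_SIZE.  The two condition variables, paired with mutex, let
   producers wake the drain thread and let it signal consumers waiting on
   a full or busy queue.  */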

/* Mkoffload uses this structure to describe a kernel.

   OpenMP kernel dimensions are passed at runtime.
   OpenACC kernel dimensions are passed at compile time, here.  */

struct hsa_kernel_description
{
  const char *name;
  int oacc_dims[3];  /* Only present for GCN kernels.  */
  int sgpr_count;
  int vpgr_count;
};

/* Mkoffload uses this structure to describe an offload variable.  */

struct global_var_info
{
  const char *name;
  void *address;
};

/* Mkoffload uses this structure to describe all the kernels in a
   loadable module.  These are passed to libgomp via static constructors.  */

struct gcn_image_desc
{
  struct gcn_image {
    size_t size;
    void *image;
  } *gcn_image;
  const unsigned kernel_count;
  struct hsa_kernel_description *kernel_infos;
  const unsigned global_variable_count;
  struct global_var_info *global_variables;
};

/* This enum mirrors the corresponding LLVM enum's values for all ISAs that we
   support.
   See https://llvm.org/docs/AMDGPUUsage.html#amdgpu-ef-amdgpu-mach-table  */

typedef enum {
  EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a,
  EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c,
  EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f,
  EF_AMDGPU_MACH_AMDGCN_GFX908 = 0x030
} EF_AMDGPU_MACH;

const static int EF_AMDGPU_MACH_MASK = 0x000000ff;
typedef EF_AMDGPU_MACH gcn_isa;
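
/* The ISA code is stored in the low byte of the ELF header's e_flags
   field, hence the 0xff mask; see elf_gcn_isa_field below.  */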

/* Description of an HSA GPU agent (device) and the program associated with
   it.  */

struct agent_info
{
  /* The HSA ID of the agent.  Assigned when hsa_context is initialized.  */
  hsa_agent_t id;
  /* The user-visible device number.  */
  int device_id;
  /* Whether the agent has been initialized.  The fields below are usable only
     if it has been.  */
  bool initialized;

  /* The instruction set architecture of the device.  */
  gcn_isa device_isa;
  /* Name of the agent.  */
  char name[64];
  /* Name of the vendor of the agent.  */
  char vendor_name[64];
  /* Command queues of the agent.  */
  hsa_queue_t *sync_queue;
  struct goacc_asyncqueue *async_queues, *omp_async_queue;
  pthread_mutex_t async_queues_mutex;

  /* The HSA memory region from which to allocate kernel arguments.  */
  hsa_region_t kernarg_region;

  /* The HSA memory region from which to allocate device data.  */
  hsa_region_t data_region;

  /* Allocated team arenas.  */
  struct team_arena_list *team_arena_list;
  pthread_mutex_t team_arena_write_lock;

  /* Read-write lock that protects kernels which are running or about to be run
     from interference with loading and unloading of images.  Needs to be
     locked for reading while a kernel is being run, and for writing if the
     list of modules is manipulated (and thus the HSA program invalidated).  */
  pthread_rwlock_t module_rwlock;

  /* The module associated with this agent.  */
  struct module_info *module;

  /* Mutex enforcing that only one thread will finalize the HSA program.  A
     thread should have locked agent->module_rwlock for reading before
     acquiring it.  */
  pthread_mutex_t prog_mutex;
  /* Flag whether the HSA program that consists of all the modules has been
     finalized.  */
  bool prog_finalized;
  /* HSA executable - the finalized program that is used to locate kernels.  */
  hsa_executable_t executable;
};

/* Information required to identify, finalize and run any given kernel.  */

enum offload_kind {KIND_UNKNOWN, KIND_OPENMP, KIND_OPENACC};

struct kernel_info
{
  /* Name of the kernel, required to locate it within the GCN object-code
     module.  */
  const char *name;
  /* The specific agent the kernel has been or will be finalized for and run
     on.  */
  struct agent_info *agent;
  /* The specific module in which the kernel is located.  */
  struct module_info *module;
  /* Information provided by mkoffload associated with the kernel.  */
  struct hsa_kernel_description *description;
  /* Mutex enforcing that at most one thread ever initializes a kernel for
     use.  A thread should have locked agent->module_rwlock for reading before
     acquiring it.  */
  pthread_mutex_t init_mutex;
  /* Flag indicating whether the kernel has been initialized and all fields
     below it contain valid data.  */
  bool initialized;
  /* Flag indicating that the kernel has a problem that blocks its
     execution.  */
  bool initialization_failed;
  /* The object to be put into the dispatch queue.  */
  uint64_t object;
  /* Required size of kernel arguments.  */
  uint32_t kernarg_segment_size;
  /* Required size of group segment.  */
  uint32_t group_segment_size;
  /* Required size of private segment.  */
  uint32_t private_segment_size;
  /* Set up for OpenMP or OpenACC?  */
  enum offload_kind kind;
};

/* Information about a particular GCN module, its image and kernels.  */

struct module_info
{
  /* The description with which the program has registered the image.  */
  struct gcn_image_desc *image_desc;
  /* GCN heap allocation.  */
  struct heap *heap;
  /* Physical boundaries of the loaded module.  */
  Elf64_Addr phys_address_start;
  Elf64_Addr phys_address_end;

  bool constructors_run_p;
  struct kernel_info *init_array_func, *fini_array_func;

  /* Number of kernels in this module.  */
  int kernel_count;
  /* An array of kernel_info structures describing each kernel in this
     module.  */
  struct kernel_info kernels[];
};
523
524/* A linked list of memory arenas allocated on the device.
525 These are only used by OpenMP, as a means to optimize per-team malloc. */
526
527struct team_arena_list
528{
529 struct team_arena_list *next;
530
531 /* The number of teams determines the size of the allocation. */
532 int num_teams;
533 /* The device address of the arena itself. */
534 void *arena;
535 /* A flag to prevent two asynchronous kernels trying to use the same arena.
536 The mutex is locked until the kernel exits. */
537 pthread_mutex_t in_use;
538};
539
/* Information about the whole HSA environment and all of its agents.  */

struct hsa_context_info
{
  /* Whether the structure has been initialized.  */
  bool initialized;
  /* Number of usable GPU HSA agents in the system.  */
  int agent_count;
  /* Array of agent_info structures describing the individual HSA agents.  */
  struct agent_info *agents;
  /* Driver version string.  */
  char driver_version_s[30];
};

/* Format of the on-device heap.

   This must match the definition in Newlib and gcn-run.  */

struct heap {
  int64_t size;
  char data[0];
};
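
/* data[0] is a GNU C zero-length array, so the structure is in effect an
   8-byte size header immediately followed by the heap storage itself.  */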

/* }}}  */
/* {{{ Global variables  */

/* Information about the whole HSA environment and all of its agents.  */

static struct hsa_context_info hsa_context;

/* HSA runtime functions that are initialized in init_hsa_context.  */

static struct hsa_runtime_fn_info hsa_fns;

/* Heap space, allocated target-side, provided for use of newlib malloc.
   Each module should have its own heap allocated.
   Beware that heap usage increases with OpenMP teams.  See also arenas.  */

static size_t gcn_kernel_heap_size = DEFAULT_GCN_HEAP_SIZE;

/* Flag to decide whether to print information about what is going on to
   stderr.  Set in init_environment_variables depending on environment
   variables.  */

static bool debug;

/* Flag to decide if the runtime should suppress a possible fallback to host
   execution.  */

static bool suppress_host_fallback;

/* Name of the HSA runtime shared library that is dlopened by this
   plug-in.  */

static const char *hsa_runtime_lib;

/* Flag to decide if the runtime should also support CPU devices (can be
   a simulator).  */

static bool support_cpu_devices;

/* Runtime dimension overrides.  Zero indicates default.  */

static int override_x_dim = 0;
static int override_z_dim = 0;

/* }}}  */
/* {{{ Debug & Diagnostic  */

/* Print a message to stderr if GCN_DEBUG value is set to true.  */

#define DEBUG_PRINT(...) \
  do \
  { \
    if (debug) \
      { \
	fprintf (stderr, __VA_ARGS__); \
      } \
  } \
  while (false)

/* Flush stderr if GCN_DEBUG value is set to true.  */

#define DEBUG_FLUSH() \
  do { \
    if (debug) \
      fflush (stderr); \
  } while (false)

/* Print a logging message with PREFIX to stderr if GCN_DEBUG value
   is set to true.  */

#define DEBUG_LOG(prefix, ...) \
  do \
    { \
      DEBUG_PRINT (prefix); \
      DEBUG_PRINT (__VA_ARGS__); \
      DEBUG_FLUSH (); \
    } while (false)

/* Print a debugging message to stderr.  */

#define GCN_DEBUG(...) DEBUG_LOG ("GCN debug: ", __VA_ARGS__)

/* Print a warning message to stderr.  */

#define GCN_WARNING(...) DEBUG_LOG ("GCN warning: ", __VA_ARGS__)

/* Print HSA warning STR with an HSA STATUS code.  */

static void
hsa_warn (const char *str, hsa_status_t status)
{
  if (!debug)
    return;

  const char *hsa_error_msg = "[unknown]";
  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);

  fprintf (stderr, "GCN warning: %s\nRuntime message: %s\n", str,
	   hsa_error_msg);
}

/* Report a fatal error STR together with the HSA error corresponding to STATUS
   and terminate execution of the current process.  */

static void
hsa_fatal (const char *str, hsa_status_t status)
{
  const char *hsa_error_msg = "[unknown]";
  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
  GOMP_PLUGIN_fatal ("GCN fatal error: %s\nRuntime message: %s\n", str,
		     hsa_error_msg);
}

/* Like hsa_fatal, except only report the error message, and return FALSE
   so that error handling can propagate outside the plugin.  */

static bool
hsa_error (const char *str, hsa_status_t status)
{
  const char *hsa_error_msg = "[unknown]";
  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
  GOMP_PLUGIN_error ("GCN fatal error: %s\nRuntime message: %s\n", str,
		     hsa_error_msg);
  return false;
}

/* Dump information about the available hardware.  */

static void
dump_hsa_system_info (void)
{
  hsa_status_t status;

  hsa_endianness_t endianness;
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_ENDIANNESS,
					   &endianness);
  if (status == HSA_STATUS_SUCCESS)
    switch (endianness)
      {
      case HSA_ENDIANNESS_LITTLE:
	GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: LITTLE\n");
	break;
      case HSA_ENDIANNESS_BIG:
	GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: BIG\n");
	break;
      default:
	GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: UNKNOWN\n");
      }
  else
    GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: FAILED\n");

  uint8_t extensions[128];
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_EXTENSIONS,
					   &extensions);
  if (status == HSA_STATUS_SUCCESS)
    {
      if (extensions[0] & (1 << HSA_EXTENSION_IMAGES))
	GCN_DEBUG ("HSA_SYSTEM_INFO_EXTENSIONS: IMAGES\n");
    }
  else
    GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n");
}

/* Dump information about a device's machine model, prefixed by string S.  */

static void
dump_machine_model (hsa_machine_model_t machine_model, const char *s)
{
  switch (machine_model)
    {
    case HSA_MACHINE_MODEL_SMALL:
      GCN_DEBUG ("%s: SMALL\n", s);
      break;
    case HSA_MACHINE_MODEL_LARGE:
      GCN_DEBUG ("%s: LARGE\n", s);
      break;
    default:
      GCN_WARNING ("%s: UNKNOWN\n", s);
      break;
    }
}

/* Dump information about a device's HSA profile, prefixed by string S.  */

static void
dump_profile (hsa_profile_t profile, const char *s)
{
  switch (profile)
    {
    case HSA_PROFILE_FULL:
      GCN_DEBUG ("%s: FULL\n", s);
      break;
    case HSA_PROFILE_BASE:
      GCN_DEBUG ("%s: BASE\n", s);
      break;
    default:
      GCN_WARNING ("%s: UNKNOWN\n", s);
      break;
    }
}

/* Dump information about a device memory region.  */

static hsa_status_t
dump_hsa_region (hsa_region_t region, void *data __attribute__((unused)))
{
  hsa_status_t status;

  hsa_region_segment_t segment;
  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
					   &segment);
  if (status == HSA_STATUS_SUCCESS)
    {
      if (segment == HSA_REGION_SEGMENT_GLOBAL)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GLOBAL\n");
      else if (segment == HSA_REGION_SEGMENT_READONLY)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: READONLY\n");
      else if (segment == HSA_REGION_SEGMENT_PRIVATE)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: PRIVATE\n");
      else if (segment == HSA_REGION_SEGMENT_GROUP)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GROUP\n");
      else
	GCN_WARNING ("HSA_REGION_INFO_SEGMENT: UNKNOWN\n");
    }
  else
    GCN_WARNING ("HSA_REGION_INFO_SEGMENT: FAILED\n");

  /* SEGMENT is uninitialized if the query above failed, so check STATUS
     before using it.  */
  if (status == HSA_STATUS_SUCCESS && segment == HSA_REGION_SEGMENT_GLOBAL)
    {
      uint32_t flags;
      status
	= hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
					  &flags);
      if (status == HSA_STATUS_SUCCESS)
	{
	  if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: KERNARG\n");
	  if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED)
	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: FINE_GRAINED\n");
	  if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED)
	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: COARSE_GRAINED\n");
	}
      else
	GCN_WARNING ("HSA_REGION_INFO_GLOBAL_FLAGS: FAILED\n");
    }

  size_t size;
  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_SIZE: %zu\n", size);
  else
    GCN_WARNING ("HSA_REGION_INFO_SIZE: FAILED\n");

  status
    = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_ALLOC_MAX_SIZE,
				      &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_ALLOC_MAX_SIZE: %zu\n", size);
  else
    GCN_WARNING ("HSA_REGION_INFO_ALLOC_MAX_SIZE: FAILED\n");

  bool alloc_allowed;
  status
    = hsa_fns.hsa_region_get_info_fn (region,
				      HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED,
				      &alloc_allowed);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: %u\n", alloc_allowed);
  else
    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: FAILED\n");

  if (status != HSA_STATUS_SUCCESS || !alloc_allowed)
    return HSA_STATUS_SUCCESS;

  status
    = hsa_fns.hsa_region_get_info_fn (region,
				      HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE,
				      &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: %zu\n", size);
  else
    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: FAILED\n");

  size_t align;
  status
    = hsa_fns.hsa_region_get_info_fn (region,
				      HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT,
				      &align);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: %zu\n", align);
  else
    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: FAILED\n");

  return HSA_STATUS_SUCCESS;
}

/* Dump information about all the device memory regions.  */

static void
dump_hsa_regions (hsa_agent_t agent)
{
  hsa_status_t status;
  status = hsa_fns.hsa_agent_iterate_regions_fn (agent,
						 dump_hsa_region,
						 NULL);
  if (status != HSA_STATUS_SUCCESS)
    hsa_error ("Dumping hsa regions failed", status);
}

/* Dump information about the available devices.  */

static hsa_status_t
dump_hsa_agent_info (hsa_agent_t agent, void *data __attribute__((unused)))
{
  hsa_status_t status;

  char buf[64];
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME,
					  &buf);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_NAME: %s\n", buf);
  else
    GCN_WARNING ("HSA_AGENT_INFO_NAME: FAILED\n");

  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_VENDOR_NAME,
					  &buf);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_VENDOR_NAME: %s\n", buf);
  else
    GCN_WARNING ("HSA_AGENT_INFO_VENDOR_NAME: FAILED\n");

  hsa_machine_model_t machine_model;
  status
    = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_MACHINE_MODEL,
				     &machine_model);
  if (status == HSA_STATUS_SUCCESS)
    dump_machine_model (machine_model, "HSA_AGENT_INFO_MACHINE_MODEL");
  else
    GCN_WARNING ("HSA_AGENT_INFO_MACHINE_MODEL: FAILED\n");

  hsa_profile_t profile;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_PROFILE,
					  &profile);
  if (status == HSA_STATUS_SUCCESS)
    dump_profile (profile, "HSA_AGENT_INFO_PROFILE");
  else
    GCN_WARNING ("HSA_AGENT_INFO_PROFILE: FAILED\n");

  hsa_device_type_t device_type;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
					  &device_type);
  if (status == HSA_STATUS_SUCCESS)
    {
      switch (device_type)
	{
	case HSA_DEVICE_TYPE_CPU:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: CPU\n");
	  break;
	case HSA_DEVICE_TYPE_GPU:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: GPU\n");
	  break;
	case HSA_DEVICE_TYPE_DSP:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: DSP\n");
	  break;
	default:
	  GCN_WARNING ("HSA_AGENT_INFO_DEVICE: UNKNOWN\n");
	  break;
	}
    }
  else
    GCN_WARNING ("HSA_AGENT_INFO_DEVICE: FAILED\n");

  uint32_t cu_count;
  status = hsa_fns.hsa_agent_get_info_fn
    (agent, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: %u\n", cu_count);
  else
    GCN_WARNING ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: FAILED\n");

  uint32_t size;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_WAVEFRONT_SIZE,
					  &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WAVEFRONT_SIZE: %u\n", size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WAVEFRONT_SIZE: FAILED\n");

  uint32_t max_dim;
  status = hsa_fns.hsa_agent_get_info_fn (agent,
					  HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
					  &max_dim);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: %u\n", max_dim);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: FAILED\n");

  uint32_t max_size;
  status = hsa_fns.hsa_agent_get_info_fn (agent,
					  HSA_AGENT_INFO_WORKGROUP_MAX_SIZE,
					  &max_size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: %u\n", max_size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: FAILED\n");

  uint32_t grid_max_dim;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_DIM,
					  &grid_max_dim);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_DIM: %u\n", grid_max_dim);
  else
    GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_DIM: FAILED\n");

  uint32_t grid_max_size;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_SIZE,
					  &grid_max_size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_SIZE: %u\n", grid_max_size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_SIZE: FAILED\n");

  dump_hsa_regions (agent);

  return HSA_STATUS_SUCCESS;
}

/* Forward reference.  */

static char *get_executable_symbol_name (hsa_executable_symbol_t symbol);

/* Helper function for dump_executable_symbols.  */

static hsa_status_t
dump_executable_symbol (hsa_executable_t executable,
			hsa_executable_symbol_t symbol,
			void *data __attribute__((unused)))
{
  char *name = get_executable_symbol_name (symbol);

  if (name)
    {
      GCN_DEBUG ("executable symbol: %s\n", name);
      free (name);
    }

  return HSA_STATUS_SUCCESS;
}

/* Dump all global symbols in an executable.  */

static void
dump_executable_symbols (hsa_executable_t executable)
{
  hsa_status_t status;
  status
    = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
						 dump_executable_symbol,
						 NULL);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not dump HSA executable symbols", status);
}

/* Dump kernel DISPATCH data structure and indent it by INDENT spaces.  */

static void
print_kernel_dispatch (struct kernel_dispatch *dispatch, unsigned indent)
{
  struct kernargs *kernargs = (struct kernargs *)dispatch->kernarg_address;

  fprintf (stderr, "%*sthis: %p\n", indent, "", dispatch);
  fprintf (stderr, "%*squeue: %p\n", indent, "", dispatch->queue);
  fprintf (stderr, "%*skernarg_address: %p\n", indent, "", kernargs);
  fprintf (stderr, "%*sheap address: %p\n", indent, "",
	   (void*)kernargs->heap_ptr);
  fprintf (stderr, "%*sarena address: %p\n", indent, "",
	   (void*)kernargs->arena_ptr);
  fprintf (stderr, "%*sobject: %" PRIu64 "\n", indent, "", dispatch->object);
  fprintf (stderr, "%*sprivate_segment_size: %u\n", indent, "",
	   dispatch->private_segment_size);
  fprintf (stderr, "%*sgroup_segment_size: %u\n", indent, "",
	   dispatch->group_segment_size);
  fprintf (stderr, "\n");
}

/* }}}  */
/* {{{ Utility functions  */

/* Cast the thread local storage to gcn_thread.  */

static inline struct gcn_thread *
gcn_thread (void)
{
  return (struct gcn_thread *) GOMP_PLUGIN_acc_thread ();
}

/* Initialize debug and suppress_host_fallback according to the environment.  */

static void
init_environment_variables (void)
{
  if (secure_getenv ("GCN_DEBUG"))
    debug = true;
  else
    debug = false;

  if (secure_getenv ("GCN_SUPPRESS_HOST_FALLBACK"))
    suppress_host_fallback = true;
  else
    suppress_host_fallback = false;

  hsa_runtime_lib = secure_getenv ("HSA_RUNTIME_LIB");
  if (hsa_runtime_lib == NULL)
    hsa_runtime_lib = "libhsa-runtime64.so.1";

  support_cpu_devices = secure_getenv ("GCN_SUPPORT_CPU_DEVICES");

  const char *x = secure_getenv ("GCN_NUM_TEAMS");
  if (!x)
    x = secure_getenv ("GCN_NUM_GANGS");
  if (x)
    override_x_dim = atoi (x);

  const char *z = secure_getenv ("GCN_NUM_THREADS");
  if (!z)
    z = secure_getenv ("GCN_NUM_WORKERS");
  if (z)
    override_z_dim = atoi (z);

  const char *heap = secure_getenv ("GCN_HEAP_SIZE");
  if (heap)
    {
      size_t tmp = atol (heap);
      if (tmp)
	gcn_kernel_heap_size = tmp;
    }
}

/* Return malloc'd string with name of SYMBOL.  */

static char *
get_executable_symbol_name (hsa_executable_symbol_t symbol)
{
  hsa_status_t status;
  char *res;
  uint32_t len;
  const hsa_executable_symbol_info_t info_name_length
    = HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH;

  status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name_length,
						      &len);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not get length of symbol name", status);
      return NULL;
    }

  res = GOMP_PLUGIN_malloc (len + 1);

  const hsa_executable_symbol_info_t info_name
    = HSA_EXECUTABLE_SYMBOL_INFO_NAME;

  status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name, res);

  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not get symbol name", status);
      free (res);
      return NULL;
    }

  res[len] = '\0';

  return res;
}

/* Get the number of GPU Compute Units.  */

static int
get_cu_count (struct agent_info *agent)
{
  uint32_t cu_count;
  hsa_status_t status = hsa_fns.hsa_agent_get_info_fn
    (agent->id, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
  if (status == HSA_STATUS_SUCCESS)
    return cu_count;
  else
    return 64;  /* The usual number for older devices.  */
}

/* Calculate the maximum grid size for OMP threads / OACC workers.
   This depends on the kernel's resource usage levels.  */

static int
limit_worker_threads (int threads)
{
  /* FIXME: Do something more intelligent here.
     GCN can always run 4 threads within a Compute Unit, but
     more than that depends on register usage.  */
  if (threads > 16)
    threads = 16;
  return threads;
}

/* Parse the target attributes INPUT provided by the compiler and return true
   if we should run anything at all.  If INPUT is NULL, fill DEF with default
   values, then store INPUT or DEF into *RESULT.

   This is used for OpenMP only.  */

static bool
parse_target_attributes (void **input,
			 struct GOMP_kernel_launch_attributes *def,
			 struct GOMP_kernel_launch_attributes **result,
			 struct agent_info *agent)
{
  if (!input)
    GOMP_PLUGIN_fatal ("No target arguments provided");

  bool grid_attrs_found = false;
  bool gcn_dims_found = false;
  int gcn_teams = 0;
  int gcn_threads = 0;
  while (*input)
    {
      intptr_t id = (intptr_t) *input++, val;

      if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
	val = (intptr_t) *input++;
      else
	val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;

      val = (val > INT_MAX) ? INT_MAX : val;

      if ((id & GOMP_TARGET_ARG_DEVICE_MASK) == GOMP_DEVICE_GCN
	  && ((id & GOMP_TARGET_ARG_ID_MASK)
	      == GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES))
	{
	  grid_attrs_found = true;
	  break;
	}
      else if ((id & GOMP_TARGET_ARG_DEVICE_MASK)
	       == GOMP_TARGET_ARG_DEVICE_ALL)
	{
	  gcn_dims_found = true;
	  switch (id & GOMP_TARGET_ARG_ID_MASK)
	    {
	    case GOMP_TARGET_ARG_NUM_TEAMS:
	      gcn_teams = val;
	      break;
	    case GOMP_TARGET_ARG_THREAD_LIMIT:
	      gcn_threads = limit_worker_threads (val);
	      break;
	    default:
	      ;
	    }
	}
    }

  if (gcn_dims_found)
    {
      if (agent->device_isa == EF_AMDGPU_MACH_AMDGCN_GFX900
	  && gcn_threads == 0 && override_z_dim == 0)
	{
	  gcn_threads = 4;
	  GCN_WARNING ("VEGA BUG WORKAROUND: reducing default number of "
		       "threads to 4 per team.\n");
	  GCN_WARNING (" - If this is not a Vega 10 device, please use "
		       "GCN_NUM_THREADS=16\n");
	}

      def->ndim = 3;
      /* Fiji has 64 CUs, but Vega20 has 60.  */
      def->gdims[0] = (gcn_teams > 0) ? gcn_teams : get_cu_count (agent);
      /* Each thread is 64 work items wide.  */
      def->gdims[1] = 64;
      /* A work group can have 16 wavefronts.  */
      def->gdims[2] = (gcn_threads > 0) ? gcn_threads : 16;
      def->wdims[0] = 1;  /* Single team per work-group.  */
      def->wdims[1] = 64;
      def->wdims[2] = 16;
      *result = def;
      return true;
    }
  else if (!grid_attrs_found)
    {
      def->ndim = 1;
      def->gdims[0] = 1;
      def->gdims[1] = 1;
      def->gdims[2] = 1;
      def->wdims[0] = 1;
      def->wdims[1] = 1;
      def->wdims[2] = 1;
      *result = def;
      GCN_WARNING ("GOMP_OFFLOAD_run called with no launch attributes\n");
      return true;
    }

  struct GOMP_kernel_launch_attributes *kla;
  kla = (struct GOMP_kernel_launch_attributes *) *input;
  *result = kla;
  if (kla->ndim == 0 || kla->ndim > 3)
    GOMP_PLUGIN_fatal ("Invalid number of dimensions (%u)", kla->ndim);

  GCN_DEBUG ("GOMP_OFFLOAD_run called with %u dimensions:\n", kla->ndim);
  unsigned i;
  for (i = 0; i < kla->ndim; i++)
    {
      GCN_DEBUG (" Dimension %u: grid size %u and group size %u\n", i,
		 kla->gdims[i], kla->wdims[i]);
      if (kla->gdims[i] == 0)
	return false;
    }
  return true;
}
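
/* With the defaults above, an OpenMP launch on a 64-CU device requests a
   64 x 64 x 16 grid of 1 x 64 x 16 work-groups: one team per compute
   unit, each made of 16 wavefronts that are 64 work items wide.  */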

/* Return the group size given the requested GROUP size, GRID size and number
   of grid dimensions NDIM.  */

static uint32_t
get_group_size (uint32_t ndim, uint32_t grid, uint32_t group)
{
  if (group == 0)
    {
      /* TODO: Provide a default via environment or device characteristics.  */
      if (ndim == 1)
	group = 64;
      else if (ndim == 2)
	group = 8;
      else
	group = 4;
    }

  if (group > grid)
    group = grid;
  return group;
}
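
/* For example, a 1-dimensional launch with GRID = 32 and no explicit
   GROUP gets the default group size of 64, which is then clamped to the
   grid size, giving a single work-group of 32.  */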

/* Atomically store pair of uint16_t values (HEADER and REST) to a PACKET.  */

static void
packet_store_release (uint32_t* packet, uint16_t header, uint16_t rest)
{
  __atomic_store_n (packet, header | (rest << 16), __ATOMIC_RELEASE);
}
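
/* In HSA, the packet processor may start reading an AQL packet as soon as
   its header word leaves the INVALID state, so the header and setup
   fields must be published with a single release store after all other
   packet fields have been written; this helper provides that store.  */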

/* A never-called callback for the HSA command queues.  These signal events
   that we don't use, so we trigger an error.

   This "queue" is not to be confused with the async queues, below.  */

static void
hsa_queue_callback (hsa_status_t status,
		    hsa_queue_t *queue __attribute__ ((unused)),
		    void *data __attribute__ ((unused)))
{
  hsa_fatal ("Asynchronous queue error", status);
}

/* }}}  */
/* {{{ HSA initialization  */

/* Populate hsa_fns with the function addresses from libhsa-runtime64.so.  */

static bool
init_hsa_runtime_functions (void)
{
#define DLSYM_FN(function) \
  hsa_fns.function##_fn = dlsym (handle, #function); \
  if (hsa_fns.function##_fn == NULL) \
    return false;
  void *handle = dlopen (hsa_runtime_lib, RTLD_LAZY);
  if (handle == NULL)
    return false;

  DLSYM_FN (hsa_status_string)
  DLSYM_FN (hsa_system_get_info)
  DLSYM_FN (hsa_agent_get_info)
  DLSYM_FN (hsa_init)
  DLSYM_FN (hsa_iterate_agents)
  DLSYM_FN (hsa_region_get_info)
  DLSYM_FN (hsa_queue_create)
  DLSYM_FN (hsa_agent_iterate_regions)
  DLSYM_FN (hsa_executable_destroy)
  DLSYM_FN (hsa_executable_create)
  DLSYM_FN (hsa_executable_global_variable_define)
  DLSYM_FN (hsa_executable_load_code_object)
  DLSYM_FN (hsa_executable_freeze)
  DLSYM_FN (hsa_signal_create)
  DLSYM_FN (hsa_memory_allocate)
  DLSYM_FN (hsa_memory_assign_agent)
  DLSYM_FN (hsa_memory_copy)
  DLSYM_FN (hsa_memory_free)
  DLSYM_FN (hsa_signal_destroy)
  DLSYM_FN (hsa_executable_get_symbol)
  DLSYM_FN (hsa_executable_symbol_get_info)
  DLSYM_FN (hsa_executable_iterate_symbols)
  DLSYM_FN (hsa_queue_add_write_index_release)
  DLSYM_FN (hsa_queue_load_read_index_acquire)
  DLSYM_FN (hsa_signal_wait_acquire)
  DLSYM_FN (hsa_signal_store_relaxed)
  DLSYM_FN (hsa_signal_store_release)
  DLSYM_FN (hsa_signal_load_acquire)
  DLSYM_FN (hsa_queue_destroy)
  DLSYM_FN (hsa_code_object_deserialize)
  return true;
#undef DLSYM_FN
}
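
/* Each DLSYM_FN (foo) above looks up the symbol "foo" and stores it in
   hsa_fns.foo_fn, failing the whole initialization if any symbol is
   missing.  The dlopen handle is never closed: the runtime is needed for
   the life of the process.  */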

/* Return true if the agent is a GPU and can accept concurrent submissions
   from different threads.  */

static bool
suitable_hsa_agent_p (hsa_agent_t agent)
{
  hsa_device_type_t device_type;
  hsa_status_t status
    = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
				     &device_type);
  if (status != HSA_STATUS_SUCCESS)
    return false;

  switch (device_type)
    {
    case HSA_DEVICE_TYPE_GPU:
      break;
    case HSA_DEVICE_TYPE_CPU:
      if (!support_cpu_devices)
	return false;
      break;
    default:
      return false;
    }

  uint32_t features = 0;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_FEATURE,
					  &features);
  if (status != HSA_STATUS_SUCCESS
      || !(features & HSA_AGENT_FEATURE_KERNEL_DISPATCH))
    return false;
  hsa_queue_type_t queue_type;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_QUEUE_TYPE,
					  &queue_type);
  if (status != HSA_STATUS_SUCCESS
      || (queue_type != HSA_QUEUE_TYPE_MULTI))
    return false;

  return true;
}

/* Callback of hsa_iterate_agents; if AGENT is a GPU device, increment
   agent_count in hsa_context.  */

static hsa_status_t
count_gpu_agents (hsa_agent_t agent, void *data __attribute__ ((unused)))
{
  if (suitable_hsa_agent_p (agent))
    hsa_context.agent_count++;
  return HSA_STATUS_SUCCESS;
}

/* Callback of hsa_iterate_agents; if AGENT is a GPU device, assign the agent
   id to the describing structure in the hsa context.  The index of the
   structure is pointed to by DATA; increment it afterwards.  */

static hsa_status_t
assign_agent_ids (hsa_agent_t agent, void *data)
{
  if (suitable_hsa_agent_p (agent))
    {
      int *agent_index = (int *) data;
      hsa_context.agents[*agent_index].id = agent;
      ++*agent_index;
    }
  return HSA_STATUS_SUCCESS;
}

/* Initialize hsa_context if it has not already been done.
   Return TRUE on success.  */

static bool
init_hsa_context (void)
{
  hsa_status_t status;
  int agent_index = 0;

  if (hsa_context.initialized)
    return true;
  init_environment_variables ();
  if (!init_hsa_runtime_functions ())
    {
      GCN_WARNING ("Run-time could not be dynamically opened\n");
      if (suppress_host_fallback)
	GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
      return false;
    }
  status = hsa_fns.hsa_init_fn ();
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Run-time could not be initialized", status);
  GCN_DEBUG ("HSA run-time initialized for GCN\n");

  if (debug)
    dump_hsa_system_info ();

  status = hsa_fns.hsa_iterate_agents_fn (count_gpu_agents, NULL);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("GCN GPU devices could not be enumerated", status);
  GCN_DEBUG ("There are %i GCN GPU devices.\n", hsa_context.agent_count);

  hsa_context.agents
    = GOMP_PLUGIN_malloc_cleared (hsa_context.agent_count
				  * sizeof (struct agent_info));
  status = hsa_fns.hsa_iterate_agents_fn (assign_agent_ids, &agent_index);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Scanning compute agents failed", status);
  if (agent_index != hsa_context.agent_count)
    {
      GOMP_PLUGIN_error ("Failed to assign IDs to all GCN agents");
      return false;
    }

  if (debug)
    {
      status = hsa_fns.hsa_iterate_agents_fn (dump_hsa_agent_info, NULL);
      if (status != HSA_STATUS_SUCCESS)
	GOMP_PLUGIN_error ("Failed to list all HSA runtime agents");
    }

  uint16_t minor, major;
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MINOR,
					   &minor);
  if (status != HSA_STATUS_SUCCESS)
    GOMP_PLUGIN_error ("Failed to obtain HSA runtime minor version");
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MAJOR,
					   &major);
  if (status != HSA_STATUS_SUCCESS)
    GOMP_PLUGIN_error ("Failed to obtain HSA runtime major version");

  size_t len = sizeof hsa_context.driver_version_s;
  int printed = snprintf (hsa_context.driver_version_s, len,
			  "HSA Runtime %hu.%hu", (unsigned short int)major,
			  (unsigned short int)minor);
  if (printed >= len)
    GCN_WARNING ("HSA runtime version string was truncated. "
		 "Version %hu.%hu is too long.", (unsigned short int)major,
		 (unsigned short int)minor);

  hsa_context.initialized = true;
  return true;
}

/* Verify that hsa_context has already been initialized and return the
   agent_info structure describing device number N.  Return NULL on error.  */

static struct agent_info *
get_agent_info (int n)
{
  if (!hsa_context.initialized)
    {
      GOMP_PLUGIN_error ("Attempt to use uninitialized GCN context.");
      return NULL;
    }
  if (n >= hsa_context.agent_count)
    {
      GOMP_PLUGIN_error ("Request to operate on non-existent GCN device %i", n);
      return NULL;
    }
  if (!hsa_context.agents[n].initialized)
    {
      GOMP_PLUGIN_error ("Attempt to use an uninitialized GCN agent.");
      return NULL;
    }
  return &hsa_context.agents[n];
}

/* Callback of hsa_agent_iterate_regions, via get_*_memory_region functions.

   Selects (breaks at) a suitable region of type KIND.  */

static hsa_status_t
get_memory_region (hsa_region_t region, hsa_region_t *retval,
		   hsa_region_global_flag_t kind)
{
  hsa_status_t status;
  hsa_region_segment_t segment;

  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
					   &segment);
  if (status != HSA_STATUS_SUCCESS)
    return status;
  if (segment != HSA_REGION_SEGMENT_GLOBAL)
    return HSA_STATUS_SUCCESS;

  uint32_t flags;
  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
					   &flags);
  if (status != HSA_STATUS_SUCCESS)
    return status;
  if (flags & kind)
    {
      *retval = region;
      return HSA_STATUS_INFO_BREAK;
    }
  return HSA_STATUS_SUCCESS;
}

/* Callback of hsa_agent_iterate_regions.

   Selects a kernargs memory region.  */

static hsa_status_t
get_kernarg_memory_region (hsa_region_t region, void *data)
{
  return get_memory_region (region, (hsa_region_t *)data,
			    HSA_REGION_GLOBAL_FLAG_KERNARG);
}

/* Callback of hsa_agent_iterate_regions.

   Selects a coarse-grained memory region suitable for the heap and
   offload data.  */

static hsa_status_t
get_data_memory_region (hsa_region_t region, void *data)
{
  return get_memory_region (region, (hsa_region_t *)data,
			    HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED);
}

static int
elf_gcn_isa_field (Elf64_Ehdr *image)
{
  return image->e_flags & EF_AMDGPU_MACH_MASK;
}

const static char *gcn_gfx803_s = "gfx803";
const static char *gcn_gfx900_s = "gfx900";
const static char *gcn_gfx906_s = "gfx906";
const static char *gcn_gfx908_s = "gfx908";
const static int gcn_isa_name_len = 6;

/* Returns the name that the HSA runtime uses for the ISA or NULL if we do not
   support the ISA.  */

static const char*
isa_hsa_name (int isa)
{
  switch (isa)
    {
    case EF_AMDGPU_MACH_AMDGCN_GFX803:
      return gcn_gfx803_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX900:
      return gcn_gfx900_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX906:
      return gcn_gfx906_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX908:
      return gcn_gfx908_s;
    }
  return NULL;
}

/* Returns the user-facing name that GCC uses to identify the architecture
   (e.g. with -march) or NULL if we do not support the ISA.
   Keep in sync with /gcc/config/gcn/gcn.{c,opt}.  */

static const char*
isa_gcc_name (int isa)
{
  switch (isa)
    {
    case EF_AMDGPU_MACH_AMDGCN_GFX803:
      return "fiji";
    default:
      return isa_hsa_name (isa);
    }
}

/* Returns the code which is used in the GCN object code to identify the ISA
   with the given name (as used by the HSA runtime).  */

static gcn_isa
isa_code (const char *isa)
{
  if (!strncmp (isa, gcn_gfx803_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX803;

  if (!strncmp (isa, gcn_gfx900_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX900;

  if (!strncmp (isa, gcn_gfx906_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX906;

  if (!strncmp (isa, gcn_gfx908_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX908;

  return -1;
}

/* }}}  */
/* {{{ Run  */

/* Create or reuse a team arena.

   Team arenas are used by OpenMP to avoid calling malloc multiple times
   while setting up each team.  This is purely a performance optimization.

   Allocating an arena also costs performance, albeit on the host side, so
   this function will reuse an existing arena if a large enough one is idle.
   The arena is released, but not deallocated, when the kernel exits.  */

static void *
get_team_arena (struct agent_info *agent, int num_teams)
{
  struct team_arena_list **next_ptr = &agent->team_arena_list;
  struct team_arena_list *item;

  for (item = *next_ptr; item; next_ptr = &item->next, item = item->next)
    {
      if (item->num_teams < num_teams)
	continue;

      if (pthread_mutex_trylock (&item->in_use))
	continue;

      return item->arena;
    }

  GCN_DEBUG ("Creating a new arena for %d teams\n", num_teams);

  if (pthread_mutex_lock (&agent->team_arena_write_lock))
    {
      GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
      return NULL;
    }
  item = malloc (sizeof (*item));
  item->num_teams = num_teams;
  item->next = NULL;
  *next_ptr = item;

  if (pthread_mutex_init (&item->in_use, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
      return NULL;
    }
  if (pthread_mutex_lock (&item->in_use))
    {
      GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
      return NULL;
    }
  if (pthread_mutex_unlock (&agent->team_arena_write_lock))
    {
      GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
      return NULL;
    }

  const int TEAM_ARENA_SIZE = 64*1024;  /* Must match libgomp.h.  */
  hsa_status_t status;
  status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
					   TEAM_ARENA_SIZE*num_teams,
					   &item->arena);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not allocate memory for GCN kernel arena", status);
  status = hsa_fns.hsa_memory_assign_agent_fn (item->arena, agent->id,
					       HSA_ACCESS_PERMISSION_RW);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not assign arena memory to device", status);

  return item->arena;
}
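
/* An idle arena is claimed with pthread_mutex_trylock, so an arena still
   held by a running kernel is simply skipped; when nothing suitable is
   free, a new arena is allocated under team_arena_write_lock and returned
   still locked, to be unlocked by release_team_arena below when the
   kernel exits.  */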

/* Mark a team arena available for reuse.  */

static void
release_team_arena (struct agent_info* agent, void *arena)
{
  struct team_arena_list *item;

  for (item = agent->team_arena_list; item; item = item->next)
    {
      if (item->arena == arena)
	{
	  if (pthread_mutex_unlock (&item->in_use))
	    GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
	  return;
	}
    }
  GOMP_PLUGIN_error ("Could not find a GCN arena to release.");
}

/* Clean up all the allocated team arenas.  */

static bool
destroy_team_arenas (struct agent_info *agent)
{
  struct team_arena_list *item, *next;

  for (item = agent->team_arena_list; item; item = next)
    {
      next = item->next;
      hsa_fns.hsa_memory_free_fn (item->arena);
      if (pthread_mutex_destroy (&item->in_use))
	{
	  GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex");
	  return false;
	}
      free (item);
    }
  agent->team_arena_list = NULL;

  return true;
}

/* Allocate memory on a specified device.  */

static void *
alloc_by_agent (struct agent_info *agent, size_t size)
{
  GCN_DEBUG ("Allocating %zu bytes on device %d\n", size, agent->device_id);

  /* Zero-size allocations are invalid, so in order to return a valid pointer
     we need to pass a valid size.  One source of zero-size allocations is
     kernargs for kernels that have no inputs or outputs (the kernel may
     only use console output, for example).  */
  if (size == 0)
    size = 4;

  void *ptr;
  hsa_status_t status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
							size, &ptr);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not allocate device memory", status);
      return NULL;
    }

  status = hsa_fns.hsa_memory_assign_agent_fn (ptr, agent->id,
					       HSA_ACCESS_PERMISSION_RW);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not assign data memory to device", status);
      return NULL;
    }

  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  bool profiling_dispatch_p
    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
  if (profiling_dispatch_p)
    {
      acc_prof_info *prof_info = thr->prof_info;
      acc_event_info data_event_info;
      acc_api_info *api_info = thr->api_info;

      prof_info->event_type = acc_ev_alloc;

      data_event_info.data_event.event_type = prof_info->event_type;
      data_event_info.data_event.valid_bytes
	= _ACC_DATA_EVENT_INFO_VALID_BYTES;
      data_event_info.data_event.parent_construct
	= acc_construct_parallel;
      data_event_info.data_event.implicit = 1;
      data_event_info.data_event.tool_info = NULL;
      data_event_info.data_event.var_name = NULL;
      data_event_info.data_event.bytes = size;
      data_event_info.data_event.host_ptr = NULL;
      data_event_info.data_event.device_ptr = (void *) ptr;

      api_info->device_api = acc_device_api_other;

      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
					    api_info);
    }

  return ptr;
}
1832
1833/* Create kernel dispatch data structure for given KERNEL, along with
1834 the necessary device signals and memory allocations. */
1835
1836static struct kernel_dispatch *
1837create_kernel_dispatch (struct kernel_info *kernel, int num_teams)
1838{
1839 struct agent_info *agent = kernel->agent;
1840 struct kernel_dispatch *shadow
1841 = GOMP_PLUGIN_malloc_cleared (sizeof (struct kernel_dispatch));
1842
1843 shadow->agent = kernel->agent;
1844 shadow->object = kernel->object;
1845
1846 hsa_signal_t sync_signal;
1847 hsa_status_t status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &sync_signal);
1848 if (status != HSA_STATUS_SUCCESS)
1849 hsa_fatal ("Error creating the GCN sync signal", status);
1850
1851 shadow->signal = sync_signal.handle;
1852 shadow->private_segment_size = kernel->private_segment_size;
1853 shadow->group_segment_size = kernel->group_segment_size;
1854
1855 /* We expect kernels to request a single pointer, explicitly, and the
1856 rest of struct kernargs, implicitly. If they request anything else
1857 then something is wrong. */
1858 if (kernel->kernarg_segment_size > 8)
1859 {
1860 GOMP_PLUGIN_fatal ("Unexpectedly large kernargs segment requested");
1861 return NULL;
1862 }
1863
1864 status = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region,
1865 sizeof (struct kernargs),
1866 &shadow->kernarg_address);
1867 if (status != HSA_STATUS_SUCCESS)
1868 hsa_fatal ("Could not allocate memory for GCN kernel arguments", status);
1869 struct kernargs *kernargs = shadow->kernarg_address;
1870
1871 /* Zero-initialize the output_data (minimum needed). */
1872 kernargs->out_ptr = (int64_t)&kernargs->output_data;
1873 kernargs->output_data.next_output = 0;
1874 for (unsigned i = 0;
1875 i < (sizeof (kernargs->output_data.queue)
1876 / sizeof (kernargs->output_data.queue[0]));
1877 i++)
1878 kernargs->output_data.queue[i].written = 0;
1879 kernargs->output_data.consumed = 0;
1880
1881 /* Pass in the heap location. */
1882 kernargs->heap_ptr = (int64_t)kernel->module->heap;
1883
1884 /* Create an arena. */
1885 if (kernel->kind == KIND_OPENMP)
1886 kernargs->arena_ptr = (int64_t)get_team_arena (agent, num_teams);
1887 else
1888 kernargs->arena_ptr = 0;
1889
1890 /* Ensure we can recognize unset return values. */
1891 kernargs->output_data.return_value = 0xcafe0000;
1892
1893 return shadow;
1894}
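
/* For illustration only: the checks and stores above imply a device-side
   layout in which the single explicit kernel argument occupies the first
   8 bytes of the kernarg segment (run_kernel memcpy's VARS there), with
   the implicit struct kernargs fields following:

     kernargs->out_ptr    - points at kernargs->output_data (console queue)
     kernargs->heap_ptr   - the module's device heap
     kernargs->arena_ptr  - a team arena for OpenMP kernels, else 0

   The authoritative layout is the struct kernargs definition, which must
   stay in sync with the code generated by the GCN back end.  */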
1895
1896/* Output any data written to console output from the kernel. It is expected
1897 that this function is polled during kernel execution.
1898
1899 We print all entries from the last item printed to the next entry without
1900 a "written" flag. If the "final" flag is set then it'll continue right to
1901 the end.
1902
1903 The print buffer is circular, but the from and to locations don't wrap when
1904 the buffer does, so the output limit is UINT_MAX. The target blocks on
1905 output when the buffer is full. */
1906
1907static void
1908console_output (struct kernel_info *kernel, struct kernargs *kernargs,
1909 bool final)
1910{
1911 unsigned int limit = (sizeof (kernargs->output_data.queue)
1912 / sizeof (kernargs->output_data.queue[0]));
1913
1914 unsigned int from = __atomic_load_n (&kernargs->output_data.consumed,
1915 __ATOMIC_ACQUIRE);
1916 unsigned int to = kernargs->output_data.next_output;
1917
1918 if (from > to)
1919 {
1920 /* Overflow. */
1921 if (final)
1922 printf ("GCN print buffer overflowed.\n");
1923 return;
1924 }
1925
1926 unsigned int i;
1927 for (i = from; i < to; i++)
1928 {
1929 struct printf_data *data = &kernargs->output_data.queue[i%limit];
1930
1931 if (!data->written && !final)
1932 break;
1933
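 /* The type field selects the payload paired with msg: 0 = int64
    (ivalue), 1 = double (dvalue), 2 = text plus newline, 3 = text
    with no trailing newline.  */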
1934 switch (data->type)
1935 {
1936 case 0: printf ("%.128s%ld\n", data->msg, data->ivalue); break;
1937 case 1: printf ("%.128s%f\n", data->msg, data->dvalue); break;
1938 case 2: printf ("%.128s%.128s\n", data->msg, data->text); break;
1939 case 3: printf ("%.128s%.128s", data->msg, data->text); break;
1940 default: printf ("GCN print buffer error!\n"); break;
1941 }
1942 data->written = 0;
1943 __atomic_store_n (&kernargs->output_data.consumed, i+1,
1944 __ATOMIC_RELEASE);
1945 }
1946 fflush (stdout);
1947}
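
/* Worked example of the indexing above (hypothetical values): with
   consumed == 130 and next_output == 133, entries 130..132 are printed
   from slots 130 % limit .. 132 % limit.  Keeping both counters
   monotonic is what makes the "from > to" overflow test possible, at
   the cost of capping lifetime output at UINT_MAX entries.  */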
1948
1949/* Release data structure created for a kernel dispatch in SHADOW argument,
1950 and clean up the signal and memory allocations. */
1951
1952static void
1953release_kernel_dispatch (struct kernel_dispatch *shadow)
1954{
1955 GCN_DEBUG ("Released kernel dispatch: %p\n", shadow);
1956
1957 struct kernargs *kernargs = shadow->kernarg_address;
1958 void *arena = (void *)kernargs->arena_ptr;
1959 if (arena)
1960 release_team_arena (shadow->agent, arena);
1961
1962 hsa_fns.hsa_memory_free_fn (shadow->kernarg_address);
1963
1964 hsa_signal_t s;
1965 s.handle = shadow->signal;
1966 hsa_fns.hsa_signal_destroy_fn (s);
1967
1968 free (shadow);
1969}
1970
1971/* Extract the properties from a kernel binary. */
1972
1973static void
1974init_kernel_properties (struct kernel_info *kernel)
1975{
1976 hsa_status_t status;
1977 struct agent_info *agent = kernel->agent;
1978 hsa_executable_symbol_t kernel_symbol;
1979 char *buf = alloca (strlen (kernel->name) + 4);
1980 sprintf (buf, "%s.kd", kernel->name);
1981 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
1982 buf, agent->id,
1983 0, &kernel_symbol);
1984 if (status != HSA_STATUS_SUCCESS)
1985 {
1986 hsa_warn ("Could not find symbol for kernel in the code object", status);
1987 fprintf (stderr, "not found name: '%s'\n", buf);
1988 dump_executable_symbols (agent->executable);
1989 goto failure;
1990 }
1991 GCN_DEBUG ("Located kernel %s\n", kernel->name);
1992 status = hsa_fns.hsa_executable_symbol_get_info_fn
1993 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel->object);
1994 if (status != HSA_STATUS_SUCCESS)
1995 hsa_fatal ("Could not extract a kernel object from its symbol", status);
1996 status = hsa_fns.hsa_executable_symbol_get_info_fn
1997 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
1998 &kernel->kernarg_segment_size);
1999 if (status != HSA_STATUS_SUCCESS)
2000 hsa_fatal ("Could not get info about kernel argument size", status);
2001 status = hsa_fns.hsa_executable_symbol_get_info_fn
2002 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
2003 &kernel->group_segment_size);
2004 if (status != HSA_STATUS_SUCCESS)
2005 hsa_fatal ("Could not get info about kernel group segment size", status);
2006 status = hsa_fns.hsa_executable_symbol_get_info_fn
2007 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
2008 &kernel->private_segment_size);
2009 if (status != HSA_STATUS_SUCCESS)
2010 hsa_fatal ("Could not get info about kernel private segment size",
2011 status);
2012
2013 /* The kernel type is not known until something tries to launch it. */
2014 kernel->kind = KIND_UNKNOWN;
2015
2016 GCN_DEBUG ("Kernel structure for %s fully initialized with "
2017 "the following segment sizes:\n", kernel->name);
2018 GCN_DEBUG (" group_segment_size: %u\n",
2019 (unsigned) kernel->group_segment_size);
2020 GCN_DEBUG (" private_segment_size: %u\n",
2021 (unsigned) kernel->private_segment_size);
2022 GCN_DEBUG (" kernarg_segment_size: %u\n",
2023 (unsigned) kernel->kernarg_segment_size);
2024 return;
2025
2026failure:
2027 kernel->initialization_failed = true;
2028}
2029
2030/* Do all the work that is necessary before running KERNEL for the first time.
2031 The function assumes the program has been created, finalized and frozen by
2032 create_and_finalize_hsa_program. */
2033
2034static void
2035init_kernel (struct kernel_info *kernel)
2036{
2037 if (pthread_mutex_lock (&kernel->init_mutex))
2038 GOMP_PLUGIN_fatal ("Could not lock a GCN kernel initialization mutex");
2039 if (kernel->initialized)
2040 {
2041 if (pthread_mutex_unlock (&kernel->init_mutex))
2042 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2043 "mutex");
2044
2045 return;
2046 }
2047
2048 init_kernel_properties (kernel);
2049
2050 if (!kernel->initialization_failed)
2051 {
2052 GCN_DEBUG ("\n");
2053
2054 kernel->initialized = true;
2055 }
2056 if (pthread_mutex_unlock (&kernel->init_mutex))
2057 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2058 "mutex");
2059}
2060
2061/* Run KERNEL on its agent, pass VARS to it as arguments and take
2062 launch attributes from KLA.
2063
2064 MODULE_LOCKED indicates that the caller already holds the lock and
2065 run_kernel need not lock it again.
2066 If AQ is NULL then agent->sync_queue will be used. */
2067
2068static void
2069run_kernel (struct kernel_info *kernel, void *vars,
2070 struct GOMP_kernel_launch_attributes *kla,
2071 struct goacc_asyncqueue *aq, bool module_locked)
2072{
2073 GCN_DEBUG ("SGPRs: %d, VGPRs: %d\n", kernel->description->sgpr_count,
2074 kernel->description->vpgr_count);
2075
2076 /* Reduce the number of threads/workers if there are insufficient
2077 VGPRs available to run the kernels together. */
2078 if (kla->ndim == 3 && kernel->description->vpgr_count > 0)
2079 {
2080 int granulated_vgprs = (kernel->description->vpgr_count + 3) & ~3;
2081 int max_threads = (256 / granulated_vgprs) * 4;
2082 if (kla->gdims[2] > max_threads)
2083 {
2084 GCN_WARNING ("Too many VGPRs required to support %d threads/workers"
2085 " per team/gang - reducing to %d threads/workers.\n",
2086 kla->gdims[2], max_threads);
2087 kla->gdims[2] = max_threads;
2088 }
2089 }
2090
2091 GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel->agent->device_id,
2092 (aq ? aq->id : 0));
2093 GCN_DEBUG ("GCN launch attribs: gdims:[");
2094 int i;
2095 for (i = 0; i < kla->ndim; ++i)
2096 {
2097 if (i)
2098 DEBUG_PRINT (", ");
2099 DEBUG_PRINT ("%u", kla->gdims[i]);
2100 }
2101 DEBUG_PRINT ("], normalized gdims:[");
2102 for (i = 0; i < kla->ndim; ++i)
2103 {
2104 if (i)
2105 DEBUG_PRINT (", ");
2106 DEBUG_PRINT ("%u", kla->gdims[i] / kla->wdims[i]);
2107 }
2108 DEBUG_PRINT ("], wdims:[");
2109 for (i = 0; i < kla->ndim; ++i)
2110 {
2111 if (i)
2112 DEBUG_PRINT (", ");
2113 DEBUG_PRINT ("%u", kla->wdims[i]);
2114 }
2115 DEBUG_PRINT ("]\n");
2116 DEBUG_FLUSH ();
2117
2118 struct agent_info *agent = kernel->agent;
2119 if (!module_locked && pthread_rwlock_rdlock (&agent->module_rwlock))
2120 GOMP_PLUGIN_fatal ("Unable to read-lock a GCN agent rwlock");
2121
2122 if (!agent->initialized)
2123 GOMP_PLUGIN_fatal ("Agent must be initialized");
2124
2125 if (!kernel->initialized)
2126 GOMP_PLUGIN_fatal ("Called kernel must be initialized");
2127
2128 hsa_queue_t *command_q = (aq ? aq->hsa_queue : kernel->agent->sync_queue);
2129
2130 uint64_t index
2131 = hsa_fns.hsa_queue_add_write_index_release_fn (command_q, 1);
2132 GCN_DEBUG ("Got AQL index %llu\n", (long long int) index);
2133
2134 /* Wait until the queue is not full before writing the packet. */
2135 while (index - hsa_fns.hsa_queue_load_read_index_acquire_fn (command_q)
2136 >= command_q->size)
2137 ;
2138
2139 /* Do not allow the dimensions to be overridden when running
2140 constructors or destructors. */
2141 int override_x = kernel->kind == KIND_UNKNOWN ? 0 : override_x_dim;
2142 int override_z = kernel->kind == KIND_UNKNOWN ? 0 : override_z_dim;
2143
2144 hsa_kernel_dispatch_packet_t *packet;
2145 packet = ((hsa_kernel_dispatch_packet_t *) command_q->base_address)
2146 + index % command_q->size;
2147
2148 memset (((uint8_t *) packet) + 4, 0, sizeof (*packet) - 4);
2149 packet->grid_size_x = override_x ? : kla->gdims[0];
2150 packet->workgroup_size_x = get_group_size (kla->ndim,
2151 packet->grid_size_x,
2152 kla->wdims[0]);
2153
2154 if (kla->ndim >= 2)
2155 {
2156 packet->grid_size_y = kla->gdims[1];
2157 packet->workgroup_size_y = get_group_size (kla->ndim, kla->gdims[1],
2158 kla->wdims[1]);
2159 }
2160 else
2161 {
2162 packet->grid_size_y = 1;
2163 packet->workgroup_size_y = 1;
2164 }
2165
2166 if (kla->ndim == 3)
2167 {
2168 packet->grid_size_z = limit_worker_threads (override_z
2169 ? : kla->gdims[2]);
2170 packet->workgroup_size_z = get_group_size (kla->ndim,
2171 packet->grid_size_z,
2172 kla->wdims[2]);
2173 }
2174 else
2175 {
2176 packet->grid_size_z = 1;
2177 packet->workgroup_size_z = 1;
2178 }
2179
2180 GCN_DEBUG ("GCN launch actuals: grid:[%u, %u, %u],"
2181 " normalized grid:[%u, %u, %u], workgroup:[%u, %u, %u]\n",
2182 packet->grid_size_x, packet->grid_size_y, packet->grid_size_z,
2183 packet->grid_size_x / packet->workgroup_size_x,
2184 packet->grid_size_y / packet->workgroup_size_y,
2185 packet->grid_size_z / packet->workgroup_size_z,
2186 packet->workgroup_size_x, packet->workgroup_size_y,
2187 packet->workgroup_size_z);
2188
2189 struct kernel_dispatch *shadow
2190 = create_kernel_dispatch (kernel, packet->grid_size_x);
2191 shadow->queue = command_q;
2192
2193 if (debug)
2194 {
2195 fprintf (stderr, "\nKernel has following dependencies:\n");
2196 print_kernel_dispatch (shadow, 2);
2197 }
2198
2199 packet->private_segment_size = kernel->private_segment_size;
2200 packet->group_segment_size = kernel->group_segment_size;
2201 packet->kernel_object = kernel->object;
2202 packet->kernarg_address = shadow->kernarg_address;
2203 hsa_signal_t s;
2204 s.handle = shadow->signal;
2205 packet->completion_signal = s;
2206 hsa_fns.hsa_signal_store_relaxed_fn (s, 1);
2207 memcpy (shadow->kernarg_address, &vars, sizeof (vars));
2208
2209 GCN_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");
2210
2211 uint16_t header;
2212 header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
2213 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
2214 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
2215
2216 GCN_DEBUG ("Going to dispatch kernel %s on device %d\n", kernel->name,
2217 agent->device_id);
2218
2219 packet_store_release ((uint32_t *) packet, header,
2220 (uint16_t) kla->ndim
2221 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);
2222
2223 hsa_fns.hsa_signal_store_release_fn (command_q->doorbell_signal,
2224 index);
2225
2226 GCN_DEBUG ("Kernel dispatched, waiting for completion\n");
2227
2228 /* Root signal waits with 1ms timeout. */
2229 while (hsa_fns.hsa_signal_wait_acquire_fn (s, HSA_SIGNAL_CONDITION_LT, 1,
2230 1000 * 1000,
2231 HSA_WAIT_STATE_BLOCKED) != 0)
2232 {
2233 console_output (kernel, shadow->kernarg_address, false);
2234 }
2235 console_output (kernel, shadow->kernarg_address, true);
2236
2237 struct kernargs *kernargs = shadow->kernarg_address;
2238 unsigned int return_value = (unsigned int)kernargs->output_data.return_value;
2239
2240 release_kernel_dispatch (shadow);
2241
2242 if (!module_locked && pthread_rwlock_unlock (&agent->module_rwlock))
2243 GOMP_PLUGIN_fatal ("Unable to unlock a GCN agent rwlock");
2244
2245 unsigned int upper = (return_value & ~0xffff) >> 16;
2246 if (upper == 0xcafe)
2247 ; // exit not called, normal termination.
2248 else if (upper == 0xffff)
2249 ; // exit called.
2250 else
2251 {
2252 GOMP_PLUGIN_error ("Possible kernel exit value corruption, 2 most"
2253 " significant bytes aren't 0xffff or 0xcafe: 0x%x\n",
2254 return_value);
2255 abort ();
2256 }
2257
2258 if (upper == 0xffff)
2259 {
2260 unsigned int signal = (return_value >> 8) & 0xff;
2261
2262 if (signal == SIGABRT)
2263 {
2264 GCN_WARNING ("GCN Kernel aborted\n");
2265 abort ();
2266 }
2267 else if (signal != 0)
2268 {
2269 GCN_WARNING ("GCN Kernel received unknown signal\n");
2270 abort ();
2271 }
2272
2273 GCN_DEBUG ("GCN Kernel exited with value: %d\n", return_value & 0xff);
2274 exit (return_value & 0xff);
2275 }
2276}
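
/* A sketch of the return-value encoding decoded above (inferred from this
   function and the 0xcafe0000 initializer in create_kernel_dispatch; not
   a documented ABI):

     0xcafe....  exit was never called; normal kernel termination.
     0xffffSSRR  exit was called; SS = signal number (SIGABRT aborts the
                 host too), RR = exit status, re-raised via exit (RR).

   Any other upper half is treated as corruption and aborts.  */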
2277
2278/* }}} */
2279/* {{{ Load/Unload */
2280
2281/* Initialize KERNEL from D and other parameters. Return true on success. */
2282
2283static bool
2284init_basic_kernel_info (struct kernel_info *kernel,
2285 struct hsa_kernel_description *d,
2286 struct agent_info *agent,
2287 struct module_info *module)
2288{
2289 kernel->agent = agent;
2290 kernel->module = module;
2291 kernel->name = d->name;
2292 kernel->description = d;
2293 if (pthread_mutex_init (&kernel->init_mutex, NULL))
2294 {
2295 GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");
2296 return false;
2297 }
2298 return true;
2299}
2300
2301/* Check that the GCN ISA of the given image matches the ISA of the agent. */
2302
2303static bool
2304isa_matches_agent (struct agent_info *agent, Elf64_Ehdr *image)
2305{
2306 int isa_field = elf_gcn_isa_field (image);
2307 const char* isa_s = isa_hsa_name (isa_field);
2308 if (!isa_s)
2309 {
2310 hsa_error ("Unsupported ISA in GCN code object.", HSA_STATUS_ERROR);
2311 return false;
2312 }
2313
2314 if (isa_field != agent->device_isa)
2315 {
2316 char msg[120];
2317 const char *agent_isa_s = isa_hsa_name (agent->device_isa);
2318 const char *agent_isa_gcc_s = isa_gcc_name (agent->device_isa);
2319 assert (agent_isa_s);
2320 assert (agent_isa_gcc_s);
2321
2322 snprintf (msg, sizeof msg,
2323 "GCN code object ISA '%s' does not match GPU ISA '%s'.\n"
2324 "Try to recompile with '-foffload=-march=%s'.\n",
2325 isa_s, agent_isa_s, agent_isa_gcc_s);
2326
2327 hsa_error (msg, HSA_STATUS_ERROR);
2328 return false;
2329 }
2330
2331 return true;
2332}
2333
2334/* Create and finalize the program consisting of all loaded modules. */
2335
2336static bool
2337create_and_finalize_hsa_program (struct agent_info *agent)
2338{
2339 hsa_status_t status;
2340 bool res = true;
2341 if (pthread_mutex_lock (&agent->prog_mutex))
2342 {
2343 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
2344 return false;
2345 }
2346 if (agent->prog_finalized)
2347 goto final;
2348
2349 status
2350 = hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
2351 HSA_EXECUTABLE_STATE_UNFROZEN,
2352 "", &agent->executable);
2353 if (status != HSA_STATUS_SUCCESS)
2354 {
2355 hsa_error ("Could not create GCN executable", status);
2356 goto fail;
2357 }
2358
2359 /* Load any GCN modules. */
2360 struct module_info *module = agent->module;
2361 if (module)
2362 {
2363 Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;
2364
2365 if (!isa_matches_agent (agent, image))
2366 goto fail;
2367
2368 hsa_code_object_t co = { 0 };
2369 status = hsa_fns.hsa_code_object_deserialize_fn
2370 (module->image_desc->gcn_image->image,
2371 module->image_desc->gcn_image->size,
2372 NULL, &co);
2373 if (status != HSA_STATUS_SUCCESS)
2374 {
2375 hsa_error ("Could not deserialize GCN code object", status);
2376 goto fail;
2377 }
2378
2379 status = hsa_fns.hsa_executable_load_code_object_fn
2380 (agent->executable, agent->id, co, "");
2381 if (status != HSA_STATUS_SUCCESS)
2382 {
2383 hsa_error ("Could not load GCN code object", status);
2384 goto fail;
2385 }
2386
2387 if (!module->heap)
2388 {
2389 status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
2390 gcn_kernel_heap_size,
2391 (void**)&module->heap);
2392 if (status != HSA_STATUS_SUCCESS)
2393 {
2394 hsa_error ("Could not allocate memory for GCN heap", status);
2395 goto fail;
2396 }
2397
2398 status = hsa_fns.hsa_memory_assign_agent_fn
2399 (module->heap, agent->id, HSA_ACCESS_PERMISSION_RW);
2400 if (status != HSA_STATUS_SUCCESS)
2401 {
2402 hsa_error ("Could not assign GCN heap memory to device", status);
2403 goto fail;
2404 }
2405
2406 hsa_fns.hsa_memory_copy_fn (&module->heap->size,
2407 &gcn_kernel_heap_size,
2408 sizeof (gcn_kernel_heap_size));
2409 }
2410
2411 }
2412
2413 if (debug)
2414 dump_executable_symbols (agent->executable);
2415
2416 status = hsa_fns.hsa_executable_freeze_fn (agent->executable, "");
2417 if (status != HSA_STATUS_SUCCESS)
2418 {
2419 hsa_error ("Could not freeze the GCN executable", status);
2420 goto fail;
2421 }
2422
2423final:
2424 agent->prog_finalized = true;
2425
2426 if (pthread_mutex_unlock (&agent->prog_mutex))
2427 {
2428 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
2429 res = false;
2430 }
2431
2432 return res;
2433
2434fail:
2435 res = false;
2436 goto final;
2437}
2438
2439/* Free the HSA program in agent and everything associated with it and set
2440 agent->prog_finalized and the initialized flags of all kernels to false.
2441 Return TRUE on success. */
2442
2443static bool
2444destroy_hsa_program (struct agent_info *agent)
2445{
2446 if (!agent->prog_finalized)
2447 return true;
2448
2449 hsa_status_t status;
2450
2451 GCN_DEBUG ("Destroying the current GCN program.\n");
2452
2453 status = hsa_fns.hsa_executable_destroy_fn (agent->executable);
2454 if (status != HSA_STATUS_SUCCESS)
2455 return hsa_error ("Could not destroy GCN executable", status);
2456
2457 if (agent->module)
2458 {
2459 int i;
2460 for (i = 0; i < agent->module->kernel_count; i++)
2461 agent->module->kernels[i].initialized = false;
2462
2463 if (agent->module->heap)
2464 {
2465 hsa_fns.hsa_memory_free_fn (agent->module->heap);
2466 agent->module->heap = NULL;
2467 }
2468 }
2469 agent->prog_finalized = false;
2470 return true;
2471}
2472
2473/* Deinitialize all information associated with MODULE and kernels within
2474 it. Return TRUE on success. */
2475
2476static bool
2477destroy_module (struct module_info *module, bool locked)
2478{
2479 /* Run destructors before destroying module. */
2480 struct GOMP_kernel_launch_attributes kla =
2481 { 3,
2482 /* Grid size. */
2483 { 1, 64, 1 },
2484 /* Work-group size. */
2485 { 1, 64, 1 }
2486 };
2487
2488 if (module->fini_array_func)
2489 {
2490 init_kernel (module->fini_array_func);
2491 run_kernel (module->fini_array_func, NULL, &kla, NULL, locked);
2492 }
2493 module->constructors_run_p = false;
2494
2495 int i;
2496 for (i = 0; i < module->kernel_count; i++)
2497 if (pthread_mutex_destroy (&module->kernels[i].init_mutex))
2498 {
2499 GOMP_PLUGIN_error ("Failed to destroy a GCN kernel initialization "
2500 "mutex");
2501 return false;
2502 }
2503
2504 return true;
2505}
2506
2507/* }}} */
2508/* {{{ Async */
2509
2510/* Execute a single entry from the given async queue AQ, at position INDEX. */
2511
2512static void
2513execute_queue_entry (struct goacc_asyncqueue *aq, int index)
2514{
2515 struct queue_entry *entry = &aq->queue[index];
2516
2517 switch (entry->type)
2518 {
2519 case KERNEL_LAUNCH:
2520 if (DEBUG_QUEUES)
2521 GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d)\n",
2522 aq->agent->device_id, aq->id, index);
2523 run_kernel (entry->u.launch.kernel,
2524 entry->u.launch.vars,
2525 &entry->u.launch.kla, aq, false);
2526 if (DEBUG_QUEUES)
2527 GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d) done\n",
2528 aq->agent->device_id, aq->id, index);
2529 break;
2530
2531 case CALLBACK:
2532 if (DEBUG_QUEUES)
2533 GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d)\n",
2534 aq->agent->device_id, aq->id, index);
2535 entry->u.callback.fn (entry->u.callback.data);
2536 if (DEBUG_QUEUES)
2537 GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d) done\n",
2538 aq->agent->device_id, aq->id, index);
2539 break;
2540
2541 case ASYNC_WAIT:
2542 {
2543 /* FIXME: is it safe to access a placeholder that may already have
2544 been executed? */
2545 struct placeholder *placeholderp = entry->u.asyncwait.placeholderp;
2546
2547 if (DEBUG_QUEUES)
2548 GCN_DEBUG ("Async thread %d:%d: Executing async wait entry (%d)\n",
2549 aq->agent->device_id, aq->id, index);
2550
2551 pthread_mutex_lock (&placeholderp->mutex);
2552
2553 while (!placeholderp->executed)
2554 pthread_cond_wait (&placeholderp->cond, &placeholderp->mutex);
2555
2556 pthread_mutex_unlock (&placeholderp->mutex);
2557
2558 if (pthread_cond_destroy (&placeholderp->cond))
2559 GOMP_PLUGIN_error ("Failed to destroy serialization cond");
2560
2561 if (pthread_mutex_destroy (&placeholderp->mutex))
2562 GOMP_PLUGIN_error ("Failed to destroy serialization mutex");
2563
2564 if (DEBUG_QUEUES)
2565 GCN_DEBUG ("Async thread %d:%d: Executing async wait "
2566 "entry (%d) done\n", aq->agent->device_id, aq->id, index);
2567 }
2568 break;
2569
2570 case ASYNC_PLACEHOLDER:
2571 pthread_mutex_lock (&entry->u.placeholder.mutex);
2572 entry->u.placeholder.executed = 1;
2573 pthread_cond_signal (&entry->u.placeholder.cond);
2574 pthread_mutex_unlock (&entry->u.placeholder.mutex);
2575 break;
2576
2577 default:
2578 GOMP_PLUGIN_fatal ("Unknown queue element");
2579 }
2580}
2581
2582/* This function is run as a thread to service an async queue in the
2583 background. It runs continuously until the stop flag is set. */
2584
2585static void *
2586drain_queue (void *thread_arg)
2587{
2588 struct goacc_asyncqueue *aq = thread_arg;
2589
2590 if (DRAIN_QUEUE_SYNCHRONOUS_P)
2591 {
2592 aq->drain_queue_stop = 2;
2593 return NULL;
2594 }
2595
2596 pthread_mutex_lock (&aq->mutex);
2597
2598 while (true)
2599 {
2600 if (aq->drain_queue_stop)
2601 break;
2602
2603 if (aq->queue_n > 0)
2604 {
2605 pthread_mutex_unlock (&aq->mutex);
2606 execute_queue_entry (aq, aq->queue_first);
2607
2608 pthread_mutex_lock (&aq->mutex);
2609 aq->queue_first = ((aq->queue_first + 1)
2610 % ASYNC_QUEUE_SIZE);
2611 aq->queue_n--;
2612
2613 if (DEBUG_THREAD_SIGNAL)
2614 GCN_DEBUG ("Async thread %d:%d: broadcasting queue out update\n",
2615 aq->agent->device_id, aq->id);
2616 pthread_cond_broadcast (&aq->queue_cond_out);
2617 pthread_mutex_unlock (&aq->mutex);
2618
2619 if (DEBUG_QUEUES)
2620 GCN_DEBUG ("Async thread %d:%d: continue\n", aq->agent->device_id,
2621 aq->id);
2622 pthread_mutex_lock (&aq->mutex);
2623 }
2624 else
2625 {
2626 if (DEBUG_THREAD_SLEEP)
2627 GCN_DEBUG ("Async thread %d:%d: going to sleep\n",
2628 aq->agent->device_id, aq->id);
2629 pthread_cond_wait (&aq->queue_cond_in, &aq->mutex);
2630 if (DEBUG_THREAD_SLEEP)
2631 GCN_DEBUG ("Async thread %d:%d: woke up, rechecking\n",
2632 aq->agent->device_id, aq->id);
2633 }
2634 }
2635
2636 aq->drain_queue_stop = 2;
2637 if (DEBUG_THREAD_SIGNAL)
2638 GCN_DEBUG ("Async thread %d:%d: broadcasting last queue out update\n",
2639 aq->agent->device_id, aq->id);
2640 pthread_cond_broadcast (&aq->queue_cond_out);
2641 pthread_mutex_unlock (&aq->mutex);
2642
2643 GCN_DEBUG ("Async thread %d:%d: returning\n", aq->agent->device_id, aq->id);
2644 return NULL;
2645}
2646
2647/* This function is used only when DRAIN_QUEUE_SYNCHRONOUS_P is set, which
2648 is not usually the case. This is just a debug tool. */
2649
2650static void
2651drain_queue_synchronous (struct goacc_asyncqueue *aq)
2652{
2653 pthread_mutex_lock (&aq->mutex);
2654
2655 while (aq->queue_n > 0)
2656 {
2657 execute_queue_entry (aq, aq->queue_first);
2658
2659 aq->queue_first = ((aq->queue_first + 1)
2660 % ASYNC_QUEUE_SIZE);
2661 aq->queue_n--;
2662 }
2663
2664 pthread_mutex_unlock (&aq->mutex);
2665}
2666
2667/* Block the current thread until an async queue is writable. The aq->mutex
2668 lock should be held on entry, and remains locked on exit. */
2669
2670static void
2671wait_for_queue_nonfull (struct goacc_asyncqueue *aq)
2672{
2673 if (aq->queue_n == ASYNC_QUEUE_SIZE)
2674 {
2675 /* Queue is full. Wait for it to not be full. */
2676 while (aq->queue_n == ASYNC_QUEUE_SIZE)
2677 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
2678 }
2679}
2680
2681/* Request an asynchronous kernel launch on the specified queue. This
2682 may block if the queue is full, but returns without waiting for the
2683 kernel to run. */
2684
2685static void
2686queue_push_launch (struct goacc_asyncqueue *aq, struct kernel_info *kernel,
2687 void *vars, struct GOMP_kernel_launch_attributes *kla)
2688{
2689 assert (aq->agent == kernel->agent);
2690
2691 pthread_mutex_lock (&aq->mutex);
2692
2693 wait_for_queue_nonfull (aq);
2694
2695 int queue_last = ((aq->queue_first + aq->queue_n)
2696 % ASYNC_QUEUE_SIZE);
2697 if (DEBUG_QUEUES)
2698 GCN_DEBUG ("queue_push_launch %d:%d: at %i\n", aq->agent->device_id,
2699 aq->id, queue_last);
2700
2701 aq->queue[queue_last].type = KERNEL_LAUNCH;
2702 aq->queue[queue_last].u.launch.kernel = kernel;
2703 aq->queue[queue_last].u.launch.vars = vars;
2704 aq->queue[queue_last].u.launch.kla = *kla;
2705
2706 aq->queue_n++;
2707
2708 if (DEBUG_THREAD_SIGNAL)
2709 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2710 aq->agent->device_id, aq->id);
2711 pthread_cond_signal (&aq->queue_cond_in);
2712
2713 pthread_mutex_unlock (&aq->mutex);
2714}
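
/* Index arithmetic example (hypothetical state): with queue_first == 62
   and queue_n == 3, the new entry lands at (62 + 3) % ASYNC_QUEUE_SIZE
   == 1, so the ring buffer wraps without any special casing.  */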
2715
2716/* Request an asynchronous callback on the specified queue. The callback
2717 function will be called, with the given opaque data, from the appropriate
2718 async thread, when all previous items on that queue are complete. */
2719
2720static void
2721queue_push_callback (struct goacc_asyncqueue *aq, void (*fn)(void *),
2722 void *data)
2723{
2724 pthread_mutex_lock (&aq->mutex);
2725
2726 wait_for_queue_nonfull (aq);
2727
2728 int queue_last = ((aq->queue_first + aq->queue_n)
2729 % ASYNC_QUEUE_SIZE);
2730 if (DEBUG_QUEUES)
2731 GCN_DEBUG ("queue_push_callback %d:%d: at %i\n", aq->agent->device_id,
2732 aq->id, queue_last);
2733
2734 aq->queue[queue_last].type = CALLBACK;
2735 aq->queue[queue_last].u.callback.fn = fn;
2736 aq->queue[queue_last].u.callback.data = data;
2737
2738 aq->queue_n++;
2739
2740 if (DEBUG_THREAD_SIGNAL)
2741 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2742 aq->agent->device_id, aq->id);
2743 pthread_cond_signal (&aq->queue_cond_in);
2744
2745 pthread_mutex_unlock (&aq->mutex);
2746}
2747
2748/* Request that a given async thread wait for another thread (unspecified) to
2749 reach the given placeholder. The wait will occur when all previous entries
2750 on the queue are complete. A placeholder is effectively a kind of signal
2751 which simply sets a flag when encountered in a queue. */
2752
2753static void
2754queue_push_asyncwait (struct goacc_asyncqueue *aq,
2755 struct placeholder *placeholderp)
2756{
2757 pthread_mutex_lock (&aq->mutex);
2758
2759 wait_for_queue_nonfull (aq);
2760
2761 int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2762 if (DEBUG_QUEUES)
2763 GCN_DEBUG ("queue_push_asyncwait %d:%d: at %i\n", aq->agent->device_id,
2764 aq->id, queue_last);
2765
2766 aq->queue[queue_last].type = ASYNC_WAIT;
2767 aq->queue[queue_last].u.asyncwait.placeholderp = placeholderp;
2768
2769 aq->queue_n++;
2770
2771 if (DEBUG_THREAD_SIGNAL)
2772 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2773 aq->agent->device_id, aq->id);
2774 pthread_cond_signal (&aq->queue_cond_in);
2775
2776 pthread_mutex_unlock (&aq->mutex);
2777}
2778
2779/* Add a placeholder into an async queue. When the async thread reaches the
2780 placeholder it will set the "executed" flag to true and continue.
2781 Another thread may be waiting on this thread reaching the placeholder. */
2782
2783static struct placeholder *
2784queue_push_placeholder (struct goacc_asyncqueue *aq)
2785{
2786 struct placeholder *placeholderp;
2787
2788 pthread_mutex_lock (&aq->mutex);
2789
2790 wait_for_queue_nonfull (aq);
2791
2792 int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2793 if (DEBUG_QUEUES)
2794 GCN_DEBUG ("queue_push_placeholder %d:%d: at %i\n", aq->agent->device_id,
2795 aq->id, queue_last);
2796
2797 aq->queue[queue_last].type = ASYNC_PLACEHOLDER;
2798 placeholderp = &aq->queue[queue_last].u.placeholder;
2799
2800 if (pthread_mutex_init (&placeholderp->mutex, NULL))
2801 {
2802 pthread_mutex_unlock (&aq->mutex);
2803 GOMP_PLUGIN_error ("Failed to initialize serialization mutex");
2804 }
2805
2806 if (pthread_cond_init (&placeholderp->cond, NULL))
2807 {
2808 pthread_mutex_unlock (&aq->mutex);
2809 GOMP_PLUGIN_error ("Failed to initialize serialization cond");
2810 }
2811
2812 placeholderp->executed = 0;
2813
2814 aq->queue_n++;
2815
2816 if (DEBUG_THREAD_SIGNAL)
2817 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2818 aq->agent->device_id, aq->id);
2819 pthread_cond_signal (&aq->queue_cond_in);
2820
2821 pthread_mutex_unlock (&aq->mutex);
2822
2823 return placeholderp;
2824}
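
/* A minimal sketch of how ASYNC_WAIT and ASYNC_PLACEHOLDER combine to
   serialize two queues (hypothetical caller; the function names are real):

     struct placeholder *p = queue_push_placeholder (aq_producer);
     queue_push_asyncwait (aq_consumer, p);

   The consumer's drain thread blocks in its ASYNC_WAIT entry until the
   producer's drain thread reaches the placeholder and signals it.  */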
2825
2826/* Signal an asynchronous thread to terminate, and wait for it to do so. */
2827
2828static void
2829finalize_async_thread (struct goacc_asyncqueue *aq)
2830{
2831 pthread_mutex_lock (&aq->mutex);
2832 if (aq->drain_queue_stop == 2)
2833 {
2834 pthread_mutex_unlock (&aq->mutex);
2835 return;
2836 }
2837
2838 aq->drain_queue_stop = 1;
2839
2840 if (DEBUG_THREAD_SIGNAL)
2841 GCN_DEBUG ("Signalling async thread %d:%d: cond_in\n",
2842 aq->agent->device_id, aq->id);
2843 pthread_cond_signal (&aq->queue_cond_in);
2844
2845 while (aq->drain_queue_stop != 2)
2846 {
2847 if (DEBUG_THREAD_SLEEP)
2848 GCN_DEBUG ("Waiting for async thread %d:%d to finish, putting thread"
2849 " to sleep\n", aq->agent->device_id, aq->id);
2850 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
2851 if (DEBUG_THREAD_SLEEP)
2852 GCN_DEBUG ("Waiting, woke up thread %d:%d. Rechecking\n",
2853 aq->agent->device_id, aq->id);
2854 }
2855
2856 GCN_DEBUG ("Done waiting for async thread %d:%d\n", aq->agent->device_id,
2857 aq->id);
2858 pthread_mutex_unlock (&aq->mutex);
2859
2860 int err = pthread_join (aq->thread_drain_queue, NULL);
2861 if (err != 0)
2862 GOMP_PLUGIN_fatal ("Join async thread %d:%d: failed: %s",
2863 aq->agent->device_id, aq->id, strerror (err));
2864 GCN_DEBUG ("Joined with async thread %d:%d\n", aq->agent->device_id, aq->id);
2865}
2866
2867/* Set up an async queue for OpenMP. There will be only one. The
2868 implementation simply uses an OpenACC async queue.
2869 FIXME: is this thread-safe if two threads call this function? */
2870
2871static void
2872maybe_init_omp_async (struct agent_info *agent)
2873{
2874 if (!agent->omp_async_queue)
2875 agent->omp_async_queue
2876 = GOMP_OFFLOAD_openacc_async_construct (agent->device_id);
2877}
2878
2879/* A wrapper that works around an issue in the HSA runtime with host-to-device
2880 copies from read-only pages. */
2881
2882static void
2883hsa_memory_copy_wrapper (void *dst, const void *src, size_t len)
2884{
2885 hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, len);
2886
2887 if (status == HSA_STATUS_SUCCESS)
2888 return;
2889
2890 /* It appears that the copy fails if the source data is in a read-only page.
2891 We can't detect that easily, so try copying the data to a temporary buffer
2892 and doing the copy again if we got an error above. */
2893
2894 GCN_WARNING ("Read-only data transfer bug workaround triggered for "
2895 "[%p:+%d]\n", (void *) src, (int) len);
2896
2897 void *src_copy = malloc (len);
     if (src_copy == NULL)
       {
	 GOMP_PLUGIN_error ("memory allocation failed in copy workaround");
	 return;
       }
2898 memcpy (src_copy, src, len);
2899 status = hsa_fns.hsa_memory_copy_fn (dst, (const void *) src_copy, len);
2900 free (src_copy);
2901 if (status != HSA_STATUS_SUCCESS)
2902 GOMP_PLUGIN_error ("memory copy failed");
2903}
2904
2905/* Copy data to or from a device. This is intended for use as an async
2906 callback event. */
2907
2908static void
2909copy_data (void *data_)
2910{
2911 struct copy_data *data = (struct copy_data *)data_;
2912 GCN_DEBUG ("Async thread %d:%d: Copying %zu bytes from (%p) to (%p)\n",
2913 data->aq->agent->device_id, data->aq->id, data->len, data->src,
2914 data->dst);
2915 hsa_memory_copy_wrapper (data->dst, data->src, data->len);
2916 free (data);
2917}
2918
2919/* Free device data. This is intended for use as an async callback event. */
2920
2921static void
2922gomp_offload_free (void *ptr)
2923{
2924 GCN_DEBUG ("Async thread ?:?: Freeing %p\n", ptr);
2925 GOMP_OFFLOAD_free (0, ptr);
2926}
2927
2928/* Request an asynchronous data copy, to or from a device, on a given queue.
2929 The event will be registered as a callback. */
2930
2931static void
2932queue_push_copy (struct goacc_asyncqueue *aq, void *dst, const void *src,
2933 size_t len)
2934{
2935 if (DEBUG_QUEUES)
2936 GCN_DEBUG ("queue_push_copy %d:%d: %zu bytes from (%p) to (%p)\n",
2937 aq->agent->device_id, aq->id, len, src, dst);
2938 struct copy_data *data
2939 = (struct copy_data *)GOMP_PLUGIN_malloc (sizeof (struct copy_data));
2940 data->dst = dst;
2941 data->src = src;
2942 data->len = len;
2943 data->aq = aq;
2944 queue_push_callback (aq, copy_data, data);
2945}
2946
2947/* Return true if the given queue is currently empty. */
2948
2949static int
2950queue_empty (struct goacc_asyncqueue *aq)
2951{
2952 pthread_mutex_lock (&aq->mutex);
2953 int res = aq->queue_n == 0 ? 1 : 0;
2954 pthread_mutex_unlock (&aq->mutex);
2955
2956 return res;
2957}
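
/* Note that the result is only a snapshot: another thread may push new
   work the moment the mutex is dropped, so callers must tolerate stale
   answers.  */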
2958
2959/* Wait for a given queue to become empty. This implements an OpenACC wait
2960 directive. */
2961
2962static void
2963wait_queue (struct goacc_asyncqueue *aq)
2964{
2965 if (DRAIN_QUEUE_SYNCHRONOUS_P)
2966 {
2967 drain_queue_synchronous (aq);
2968 return;
2969 }
2970
2971 pthread_mutex_lock (&aq->mutex);
2972
2973 while (aq->queue_n > 0)
2974 {
2975 if (DEBUG_THREAD_SLEEP)
2976 GCN_DEBUG ("waiting for thread %d:%d, putting thread to sleep\n",
2977 aq->agent->device_id, aq->id);
2978 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
2979 if (DEBUG_THREAD_SLEEP)
2980 GCN_DEBUG ("thread %d:%d woke up. Rechecking\n", aq->agent->device_id,
2981 aq->id);
2982 }
2983
2984 pthread_mutex_unlock (&aq->mutex);
2985 GCN_DEBUG ("waiting for thread %d:%d, done\n", aq->agent->device_id, aq->id);
2986}
2987
2988/* }}} */
2989/* {{{ OpenACC support */
2990
2991/* Execute an OpenACC kernel, synchronously or asynchronously. */
2992
2993static void
2994gcn_exec (struct kernel_info *kernel, size_t mapnum, void **hostaddrs,
2995 void **devaddrs, unsigned *dims, void *targ_mem_desc, bool async,
2996 struct goacc_asyncqueue *aq)
2997{
2998 if (!GOMP_OFFLOAD_can_run (kernel))
2999 GOMP_PLUGIN_fatal ("OpenACC host fallback unimplemented.");
3000
3001 /* If we get here then this must be an OpenACC kernel. */
3002 kernel->kind = KIND_OPENACC;
3003
3004 /* devaddrs must be double-indirect on the target. */
3005 void **ind_da = alloc_by_agent (kernel->agent, sizeof (void*) * mapnum);
3006 for (size_t i = 0; i < mapnum; i++)
3007 hsa_fns.hsa_memory_copy_fn (&ind_da[i],
3008 devaddrs[i] ? &devaddrs[i] : &hostaddrs[i],
3009 sizeof (void *));
3010
3011 struct hsa_kernel_description *hsa_kernel_desc = NULL;
3012 for (unsigned i = 0; i < kernel->module->image_desc->kernel_count; i++)
3013 {
3014 struct hsa_kernel_description *d
3015 = &kernel->module->image_desc->kernel_infos[i];
3016 if (d->name == kernel->name)
3017 {
3018 hsa_kernel_desc = d;
3019 break;
3020 }
3021 }
3022
3023 /* We may have statically-determined dimensions in
3024 hsa_kernel_desc->oacc_dims[] or dimensions passed to this offload kernel
3025 invocation at runtime in dims[]. We allow static dimensions to take
3026 priority over dynamic dimensions when present (non-zero). */
3027 if (hsa_kernel_desc->oacc_dims[0] > 0)
3028 dims[0] = hsa_kernel_desc->oacc_dims[0];
3029 if (hsa_kernel_desc->oacc_dims[1] > 0)
3030 dims[1] = hsa_kernel_desc->oacc_dims[1];
3031 if (hsa_kernel_desc->oacc_dims[2] > 0)
3032 dims[2] = hsa_kernel_desc->oacc_dims[2];
3033
3034 /* If any of the OpenACC dimensions remain 0 then we get to pick a number.
3035 There isn't really a correct answer for this without a clue about the
3036 problem size, so let's do a reasonable number of single-worker gangs.
3037 64 gangs matches a typical Fiji device. */
3038
3039 if (dims[0] == 0) dims[0] = get_cu_count (kernel->agent); /* Gangs. */
3040 /* NOTE: Until support for middle-end worker partitioning is merged, force 'num_workers (1)'. */
3041 if (/*TODO dims[1] == 0*/ true) dims[1] = 1; /* Workers. */
3042
3043 /* The incoming dimensions are expressed in terms of gangs, workers, and
3044 vectors. The HSA dimensions are expressed in terms of "work-items",
3045 which means multiples of vector lanes.
3046
3047 The "grid size" specifies the size of the problem space, and the
3048 "work-group size" specifies how much of that we want a single compute
3049 unit to chew on at once.
3050
3051 The three dimensions do not really correspond to hardware, but the
3052 important thing is that the HSA runtime will launch as many
3053 work-groups as it takes to process the entire grid, and each
3054 work-group will contain as many wave-fronts as it takes to process
3055 the work-items in that group.
3056
3057 Essentially, as long as we set the Y dimension to 64 (the number of
3058 vector lanes in hardware), and the Z group size to the maximum (16),
3059 then we will get the gangs (X) and workers (Z) launched as we expect.
3060
3061 The reason for the apparent reversal of vector and worker dimension
3062 order is to do with the way the run-time distributes work-items across
3063 v1 and v2. */
3064 struct GOMP_kernel_launch_attributes kla =
3065 {3,
3066 /* Grid size. */
3067 {dims[0], 64, dims[1]},
3068 /* Work-group size. */
3069 {1, 64, 16}
3070 };
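
 /* Worked example (hypothetical dims): dims = {4, 2} after the
    adjustments above yields kla.gdims = {4, 64, 2} and
    kla.wdims = {1, 64, 16}; X carries the 4 gangs, Y the 64 vector
    lanes, and Z the 2 workers.  */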
3071
3072 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
3073 acc_prof_info *prof_info = thr->prof_info;
3074 acc_event_info enqueue_launch_event_info;
3075 acc_api_info *api_info = thr->api_info;
3076 bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
3077 if (profiling_dispatch_p)
3078 {
3079 prof_info->event_type = acc_ev_enqueue_launch_start;
3080
3081 enqueue_launch_event_info.launch_event.event_type
3082 = prof_info->event_type;
3083 enqueue_launch_event_info.launch_event.valid_bytes
3084 = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
3085 enqueue_launch_event_info.launch_event.parent_construct
3086 = acc_construct_parallel;
3087 enqueue_launch_event_info.launch_event.implicit = 1;
3088 enqueue_launch_event_info.launch_event.tool_info = NULL;
3089 enqueue_launch_event_info.launch_event.kernel_name
3090 = (char *) kernel->name;
3091 enqueue_launch_event_info.launch_event.num_gangs = kla.gdims[0];
3092 enqueue_launch_event_info.launch_event.num_workers = kla.gdims[2];
3093 enqueue_launch_event_info.launch_event.vector_length = kla.gdims[1];
3094
3095 api_info->device_api = acc_device_api_other;
3096
3097 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3098 &enqueue_launch_event_info, api_info);
3099 }
3100
3101 if (!async)
3102 {
3103 run_kernel (kernel, ind_da, &kla, NULL, false);
3104 gomp_offload_free (ind_da);
3105 }
3106 else
3107 {
3108 queue_push_launch (aq, kernel, ind_da, &kla);
3109 if (DEBUG_QUEUES)
3110 GCN_DEBUG ("queue_push_callback %d:%d gomp_offload_free, %p\n",
3111 aq->agent->device_id, aq->id, ind_da);
3112 queue_push_callback (aq, gomp_offload_free, ind_da);
3113 }
3114
3115 if (profiling_dispatch_p)
3116 {
3117 prof_info->event_type = acc_ev_enqueue_launch_end;
3118 enqueue_launch_event_info.launch_event.event_type = prof_info->event_type;
3119 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3120 &enqueue_launch_event_info,
3121 api_info);
3122 }
3123}
3124
3125/* }}} */
3126/* {{{ Generic Plugin API */
3127
3128/* Return the name of the accelerator, which is "gcn". */
3129
3130const char *
3131GOMP_OFFLOAD_get_name (void)
3132{
3133 return "gcn";
3134}
3135
3136/* Return the specific capabilities the HSA accelerator has. */
3137
3138unsigned int
3139GOMP_OFFLOAD_get_caps (void)
3140{
3141 /* FIXME: Enable shared memory for APU, but not discrete GPU. */
3142 return /*GOMP_OFFLOAD_CAP_SHARED_MEM |*/ GOMP_OFFLOAD_CAP_OPENMP_400
3143 | GOMP_OFFLOAD_CAP_OPENACC_200;
3144}
3145
3146/* Identify as a GCN accelerator. */
3147
3148int
3149GOMP_OFFLOAD_get_type (void)
3150{
3151 return OFFLOAD_TARGET_TYPE_GCN;
3152}
3153
3154/* Return the libgomp version number we're compatible with. There is
3155 no requirement for cross-version compatibility. */
3156
3157unsigned
3158GOMP_OFFLOAD_version (void)
3159{
3160 return GOMP_VERSION;
3161}
3162
3163/* Return the number of GCN devices on the system. */
3164
3165int
3166GOMP_OFFLOAD_get_num_devices (void)
3167{
3168 if (!init_hsa_context ())
3169 return 0;
3170 return hsa_context.agent_count;
3171}
3172
3173/* Initialize device (agent) number N so that it can be used for computation.
3174 Return TRUE on success. */
3175
3176bool
3177GOMP_OFFLOAD_init_device (int n)
3178{
3179 if (!init_hsa_context ())
3180 return false;
3181 if (n >= hsa_context.agent_count)
3182 {
3183 GOMP_PLUGIN_error ("Request to initialize non-existent GCN device %i", n);
3184 return false;
3185 }
3186 struct agent_info *agent = &hsa_context.agents[n];
3187
3188 if (agent->initialized)
3189 return true;
3190
3191 agent->device_id = n;
3192
3193 if (pthread_rwlock_init (&agent->module_rwlock, NULL))
3194 {
3195 GOMP_PLUGIN_error ("Failed to initialize a GCN agent rwlock");
3196 return false;
3197 }
3198 if (pthread_mutex_init (&agent->prog_mutex, NULL))
3199 {
3200 GOMP_PLUGIN_error ("Failed to initialize a GCN agent program mutex");
3201 return false;
3202 }
3203 if (pthread_mutex_init (&agent->async_queues_mutex, NULL))
3204 {
3205 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
3206 return false;
3207 }
3208 if (pthread_mutex_init (&agent->team_arena_write_lock, NULL))
3209 {
3210 GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
3211 return false;
3212 }
3213 agent->async_queues = NULL;
3214 agent->omp_async_queue = NULL;
3215 agent->team_arena_list = NULL;
3216
3217 uint32_t queue_size;
3218 hsa_status_t status;
3219 status = hsa_fns.hsa_agent_get_info_fn (agent->id,
3220 HSA_AGENT_INFO_QUEUE_MAX_SIZE,
3221 &queue_size);
3222 if (status != HSA_STATUS_SUCCESS)
3223 return hsa_error ("Error requesting maximum queue size of the GCN agent",
3224 status);
3225
3226 status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_NAME,
3227 &agent->name);
3228 if (status != HSA_STATUS_SUCCESS)
3229 return hsa_error ("Error querying the name of the agent", status);
3230
3231 agent->device_isa = isa_code (agent->name);
3232 if (agent->device_isa < 0)
3233 return hsa_error ("Unknown GCN agent architecture", HSA_STATUS_ERROR);
3234
3235 status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_VENDOR_NAME,
3236 &agent->vendor_name);
3237 if (status != HSA_STATUS_SUCCESS)
3238 return hsa_error ("Error querying the vendor name of the agent", status);
3239
3240 status = hsa_fns.hsa_queue_create_fn (agent->id, queue_size,
3241 HSA_QUEUE_TYPE_MULTI,
3242 hsa_queue_callback, NULL, UINT32_MAX,
3243 UINT32_MAX, &agent->sync_queue);
3244 if (status != HSA_STATUS_SUCCESS)
3245 return hsa_error ("Error creating command queue", status);
3246
3247 agent->kernarg_region.handle = (uint64_t) -1;
3248 status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
3249 get_kernarg_memory_region,
3250 &agent->kernarg_region);
3251 if (status != HSA_STATUS_SUCCESS
3252 && status != HSA_STATUS_INFO_BREAK)
3253 hsa_error ("Scanning memory regions failed", status);
3254 if (agent->kernarg_region.handle == (uint64_t) -1)
3255 {
3256 GOMP_PLUGIN_error ("Could not find suitable memory region for kernel "
3257 "arguments");
3258 return false;
3259 }
3260 GCN_DEBUG ("Selected kernel arguments memory region:\n");
3261 dump_hsa_region (agent->kernarg_region, NULL);
3262
3263 agent->data_region.handle = (uint64_t) -1;
3264 status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
3265 get_data_memory_region,
3266 &agent->data_region);
3267 if (status != HSA_STATUS_SUCCESS
3268 && status != HSA_STATUS_INFO_BREAK)
3269 hsa_error ("Scanning memory regions failed", status);
3270 if (agent->data_region.handle == (uint64_t) -1)
3271 {
3272 GOMP_PLUGIN_error ("Could not find suitable memory region for device "
3273 "data");
3274 return false;
3275 }
3276 GCN_DEBUG ("Selected device data memory region:\n");
3277 dump_hsa_region (agent->data_region, NULL);
3278
3279 GCN_DEBUG ("GCN agent %d initialized\n", n);
3280
3281 agent->initialized = true;
3282 return true;
3283}
3284
3285/* Load GCN object-code module described by struct gcn_image_desc in
3286 TARGET_DATA and return references to kernel descriptors in TARGET_TABLE.
3287 If there are any constructors then run them. */
3288
3289int
3290GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
3291 struct addr_pair **target_table)
3292{
3293 if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
3294 {
3295 GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3296 " (expected %u, received %u)",
3297 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
3298 return -1;
3299 }
3300
3301 struct gcn_image_desc *image_desc = (struct gcn_image_desc *) target_data;
3302 struct agent_info *agent;
3303 struct addr_pair *pair;
3304 struct module_info *module;
3305 struct kernel_info *kernel;
3306 int kernel_count = image_desc->kernel_count;
3307 unsigned var_count = image_desc->global_variable_count;
3308
3309 agent = get_agent_info (ord);
3310 if (!agent)
3311 return -1;
3312
3313 if (pthread_rwlock_wrlock (&agent->module_rwlock))
3314 {
3315 GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3316 return -1;
3317 }
3318 if (agent->prog_finalized
3319 && !destroy_hsa_program (agent))
3320 return -1;
3321
3322 GCN_DEBUG ("Encountered %d kernels in an image\n", kernel_count);
3323 GCN_DEBUG ("Encountered %u global variables in an image\n", var_count);
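 /* One addr_pair is reported per kernel and per global variable; the
    "- 2" below assumes the synthetic _init_array and _fini_array kernels
    (skipped in the loop that follows) are present in every image.  */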
3324 pair = GOMP_PLUGIN_malloc ((kernel_count + var_count - 2)
3325 * sizeof (struct addr_pair));
3326 *target_table = pair;
3327 module = (struct module_info *)
3328 GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info)
3329 + kernel_count * sizeof (struct kernel_info));
3330 module->image_desc = image_desc;
3331 module->kernel_count = kernel_count;
3332 module->heap = NULL;
3333 module->constructors_run_p = false;
3334
3335 kernel = &module->kernels[0];
3336
3337 /* Allocate memory for kernel dependencies. */
3338 for (unsigned i = 0; i < kernel_count; i++)
3339 {
3340 struct hsa_kernel_description *d = &image_desc->kernel_infos[i];
3341 if (!init_basic_kernel_info (kernel, d, agent, module))
3342 return -1;
3343 if (strcmp (d->name, "_init_array") == 0)
3344 module->init_array_func = kernel;
3345 else if (strcmp (d->name, "_fini_array") == 0)
3346 module->fini_array_func = kernel;
3347 else
3348 {
3349 pair->start = (uintptr_t) kernel;
3350 pair->end = (uintptr_t) (kernel + 1);
3351 pair++;
3352 }
3353 kernel++;
3354 }
3355
3356 agent->module = module;
3357 if (pthread_rwlock_unlock (&agent->module_rwlock))
3358 {
3359 GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3360 return -1;
3361 }
3362
3363 if (!create_and_finalize_hsa_program (agent))
3364 return -1;
3365
3366 for (unsigned i = 0; i < var_count; i++)
3367 {
3368 struct global_var_info *v = &image_desc->global_variables[i];
3369 GCN_DEBUG ("Looking for variable %s\n", v->name);
3370
3371 hsa_status_t status;
3372 hsa_executable_symbol_t var_symbol;
3373 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3374 v->name, agent->id,
3375 0, &var_symbol);
3376
3377 if (status != HSA_STATUS_SUCCESS)
3378 hsa_fatal ("Could not find symbol for variable in the code object",
3379 status);
3380
3381 uint64_t var_addr;
3382 uint32_t var_size;
3383 status = hsa_fns.hsa_executable_symbol_get_info_fn
3384 (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &var_addr);
3385 if (status != HSA_STATUS_SUCCESS)
3386 hsa_fatal ("Could not extract a variable from its symbol", status);
3387 status = hsa_fns.hsa_executable_symbol_get_info_fn
3388 (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &var_size);
3389 if (status != HSA_STATUS_SUCCESS)
3390 hsa_fatal ("Could not extract a variable size from its symbol", status);
3391
3392 pair->start = var_addr;
3393 pair->end = var_addr + var_size;
3394 GCN_DEBUG ("Found variable %s at %p with size %u\n", v->name,
3395 (void *)var_addr, var_size);
3396 pair++;
3397 }
3398
3399 /* Ensure that constructors are run first. */
3400 struct GOMP_kernel_launch_attributes kla =
3401 { 3,
3402 /* Grid size. */
3403 { 1, 64, 1 },
3404 /* Work-group size. */
3405 { 1, 64, 1 }
3406 };
3407
3408 if (module->init_array_func)
3409 {
3410 init_kernel (module->init_array_func);
3411 run_kernel (module->init_array_func, NULL, &kla, NULL, false);
3412 }
3413 module->constructors_run_p = true;
3414
3415 /* Don't report kernels that libgomp need not know about. */
3416 if (module->init_array_func)
3417 kernel_count--;
3418 if (module->fini_array_func)
3419 kernel_count--;
3420
3421 return kernel_count + var_count;
3422}
3423
3424/* Unload GCN object-code module described by struct gcn_image_desc in
3425 TARGET_DATA from agent number N. Return TRUE on success. */
3426
3427bool
3428GOMP_OFFLOAD_unload_image (int n, unsigned version, const void *target_data)
3429{
3430 if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
3431 {
3432 GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3433 " (expected %u, received %u)",
3434 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
3435 return false;
3436 }
3437
3438 struct agent_info *agent;
3439 agent = get_agent_info (n);
3440 if (!agent)
3441 return false;
3442
3443 if (pthread_rwlock_wrlock (&agent->module_rwlock))
3444 {
3445 GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3446 return false;
3447 }
3448
3449 if (!agent->module || agent->module->image_desc != target_data)
3450 {
3451 GOMP_PLUGIN_error ("Attempt to unload an image that has never been "
3452 "loaded before");
3453 return false;
3454 }
3455
3456 if (!destroy_module (agent->module, true))
3457 return false;
3458 free (agent->module);
3459 agent->module = NULL;
3460 if (!destroy_hsa_program (agent))
3461 return false;
3462 if (pthread_rwlock_unlock (&agent->module_rwlock))
3463 {
3464 GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3465 return false;
3466 }
3467 return true;
3468}
3469
3470/* Deinitialize all information and status associated with agent number N. We
3471 do not attempt any synchronization, assuming the user and libgomp will not
3472 attempt deinitialization of a device that is in any way being used at the
3473 same time. Return TRUE on success. */
3474
3475bool
3476GOMP_OFFLOAD_fini_device (int n)
3477{
3478 struct agent_info *agent = get_agent_info (n);
3479 if (!agent)
3480 return false;
3481
3482 if (!agent->initialized)
3483 return true;
3484
3485 if (agent->omp_async_queue)
3486 {
3487 GOMP_OFFLOAD_openacc_async_destruct (agent->omp_async_queue);
3488 agent->omp_async_queue = NULL;
3489 }
3490
3491 if (agent->module)
3492 {
3493 if (!destroy_module (agent->module, false))
3494 return false;
3495 free (agent->module);
3496 agent->module = NULL;
3497 }
3498
3499 if (!destroy_team_arenas (agent))
3500 return false;
3501
3502 if (!destroy_hsa_program (agent))
3503 return false;
3504
3505 hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (agent->sync_queue);
3506 if (status != HSA_STATUS_SUCCESS)
3507 return hsa_error ("Error destroying command queue", status);
3508
3509 if (pthread_mutex_destroy (&agent->prog_mutex))
3510 {
3511 GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex");
3512 return false;
3513 }
3514 if (pthread_rwlock_destroy (&agent->module_rwlock))
3515 {
3516 GOMP_PLUGIN_error ("Failed to destroy a GCN agent rwlock");
3517 return false;
3518 }
3519
3520 if (pthread_mutex_destroy (&agent->async_queues_mutex))
3521 {
3522 GOMP_PLUGIN_error ("Failed to destroy a GCN agent queue mutex");
3523 return false;
3524 }
3525 if (pthread_mutex_destroy (&agent->team_arena_write_lock))
3526 {
3527 GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex");
3528 return false;
3529 }
3530 agent->initialized = false;
3531 return true;
3532}

/* Return true if the HSA runtime can run function FN_PTR.  */

bool
GOMP_OFFLOAD_can_run (void *fn_ptr)
{
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;

  init_kernel (kernel);
  if (kernel->initialization_failed)
    goto failure;

  return true;

failure:
  if (suppress_host_fallback)
    GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
  GCN_WARNING ("GCN target cannot be launched, doing a host fallback\n");
  return false;
}

/* Allocate memory on device N.  */

void *
GOMP_OFFLOAD_alloc (int n, size_t size)
{
  struct agent_info *agent = get_agent_info (n);
  return alloc_by_agent (agent, size);
}

/* Free memory from device N.  */

bool
GOMP_OFFLOAD_free (int device, void *ptr)
{
  GCN_DEBUG ("Freeing memory on device %d\n", device);

  hsa_status_t status = hsa_fns.hsa_memory_free_fn (ptr);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not free device memory", status);
      return false;
    }

  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  bool profiling_dispatch_p
    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
  if (profiling_dispatch_p)
    {
      acc_prof_info *prof_info = thr->prof_info;
      acc_event_info data_event_info;
      acc_api_info *api_info = thr->api_info;

      prof_info->event_type = acc_ev_free;

      data_event_info.data_event.event_type = prof_info->event_type;
      data_event_info.data_event.valid_bytes
	= _ACC_DATA_EVENT_INFO_VALID_BYTES;
      data_event_info.data_event.parent_construct
	= acc_construct_parallel;
      data_event_info.data_event.implicit = 1;
      data_event_info.data_event.tool_info = NULL;
      data_event_info.data_event.var_name = NULL;
      data_event_info.data_event.bytes = 0;
      data_event_info.data_event.host_ptr = NULL;
      data_event_info.data_event.device_ptr = (void *) ptr;

      api_info->device_api = acc_device_api_other;

      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
					    api_info);
    }

  return true;
}
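
/* Profiling sketch: a tool registered through the OpenACC profiling
   interface receives the acc_ev_free event dispatched above.  A minimal,
   hypothetical callback (types and registration names from acc_prof.h):

     #include <acc_prof.h>
     #include <stdio.h>

     static void
     on_free (acc_prof_info *pi, acc_event_info *ei, acc_api_info *ai)
     {
       fprintf (stderr, "freed %p on device %d\n",
		ei->data_event.device_ptr, pi->device_number);
     }

     void
     acc_register_library (acc_prof_reg reg, acc_prof_reg unreg,
			   acc_prof_lookup_func lookup)
     {
       reg (acc_ev_free, on_free, acc_reg);
     }
*/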

/* Copy data from DEVICE to host.  */

bool
GOMP_OFFLOAD_dev2host (int device, void *dst, const void *src, size_t n)
{
  GCN_DEBUG ("Copying %zu bytes from device %d (%p) to host (%p)\n", n, device,
	     src, dst);
  hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
  if (status != HSA_STATUS_SUCCESS)
    {
      GOMP_PLUGIN_error ("memory copy failed");
      return false;
    }
  return true;
}

/* Copy data from host to DEVICE.  */

bool
GOMP_OFFLOAD_host2dev (int device, void *dst, const void *src, size_t n)
{
  GCN_DEBUG ("Copying %zu bytes from host (%p) to device %d (%p)\n", n, src,
	     device, dst);
  hsa_memory_copy_wrapper (dst, src, n);
  return true;
}
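
/* hsa_memory_copy_wrapper is defined earlier in this file; assuming it
   simply wraps the raw HSA call with error reporting, the pattern it hides
   is equivalent to this sketch:

     static void
     hsa_memory_copy_wrapper (void *dst, const void *src, size_t len)
     {
       hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, len);
       if (status != HSA_STATUS_SUCCESS)
	 GOMP_PLUGIN_error ("memory copy failed");
     }
*/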

/* Copy data within DEVICE.  Do the copy asynchronously, if appropriate.  */

bool
GOMP_OFFLOAD_dev2dev (int device, void *dst, const void *src, size_t n)
{
  struct gcn_thread *thread_data = gcn_thread ();

  if (thread_data && !async_synchronous_p (thread_data->async))
    {
      struct agent_info *agent = get_agent_info (device);
      maybe_init_omp_async (agent);
      queue_push_copy (agent->omp_async_queue, dst, src, n);
      return true;
    }

  GCN_DEBUG ("Copying %zu bytes from device %d (%p) to device %d (%p)\n", n,
	     device, src, device, dst);
  hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
  if (status != HSA_STATUS_SUCCESS)
    {
      GOMP_PLUGIN_error ("memory copy failed");
      return false;
    }
  return true;
}
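
/* Usage sketch (hypothetical host code): an intra-device copy such as the
   omp_target_memcpy call below arrives here; when the calling thread has a
   non-synchronous async setting, the copy is queued rather than done inline:

     #include <omp.h>
     #include <stddef.h>

     static void
     device_copy (size_t n)
     {
       int dev = omp_get_default_device ();
       void *a = omp_target_alloc (n, dev);
       void *b = omp_target_alloc (n, dev);
       if (a && b)
	 omp_target_memcpy (b, a, n, 0, 0, dev, dev);
       omp_target_free (a, dev);
       omp_target_free (b, dev);
     }
*/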

/* }}} */
/* {{{ OpenMP Plugin API */

/* Run a synchronous OpenMP kernel on DEVICE and pass it an array of pointers
   in VARS as a parameter.  The kernel is identified by FN_PTR which must point
   to a kernel_info structure, and must have previously been loaded to the
   specified device.  */

void
GOMP_OFFLOAD_run (int device, void *fn_ptr, void *vars, void **args)
{
  struct agent_info *agent = get_agent_info (device);
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
  struct GOMP_kernel_launch_attributes def;
  struct GOMP_kernel_launch_attributes *kla;
  assert (agent == kernel->agent);

  /* If we get here then the kernel must be OpenMP.  */
  kernel->kind = KIND_OPENMP;

  if (!parse_target_attributes (args, &def, &kla, agent))
    {
      GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
      return;
    }
  run_kernel (kernel, vars, kla, NULL, false);
}
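
/* Usage sketch: a synchronous target region such as this hypothetical one
   is compiled into a kernel_info kernel and reaches GOMP_OFFLOAD_run with
   its mapped variables packed into VARS:

     int a[32];
     #pragma omp target map(tofrom: a)
     for (int i = 0; i < 32; i++)
       a[i] = i;
*/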

/* Run an asynchronous OpenMP kernel on DEVICE.  This is similar to
   GOMP_OFFLOAD_run except that the launch is queued and there is a call to
   GOMP_PLUGIN_target_task_completion when it has finished.  */

void
GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
			void **args, void *async_data)
{
  GCN_DEBUG ("GOMP_OFFLOAD_async_run invoked\n");
  struct agent_info *agent = get_agent_info (device);
  struct kernel_info *kernel = (struct kernel_info *) tgt_fn;
  struct GOMP_kernel_launch_attributes def;
  struct GOMP_kernel_launch_attributes *kla;
  assert (agent == kernel->agent);

  /* If we get here then the kernel must be OpenMP.  */
  kernel->kind = KIND_OPENMP;

  if (!parse_target_attributes (args, &def, &kla, agent))
    {
      GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
      return;
    }

  maybe_init_omp_async (agent);
  queue_push_launch (agent->omp_async_queue, kernel, tgt_vars, kla);
  queue_push_callback (agent->omp_async_queue,
		       GOMP_PLUGIN_target_task_completion, async_data);
}
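
/* Usage sketch: a deferred target task such as this hypothetical one takes
   the GOMP_OFFLOAD_async_run path; the completion callback queued above is
   what eventually allows the taskwait to proceed:

     int x = 0;
     #pragma omp target nowait map(tofrom: x)
     x += 1;
     #pragma omp taskwait
*/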

/* }}} */
/* {{{ OpenACC Plugin API */

/* Run a synchronous OpenACC kernel.  The device number is inferred from the
   already-loaded KERNEL.  */

void
GOMP_OFFLOAD_openacc_exec (void (*fn_ptr) (void *), size_t mapnum,
			   void **hostaddrs, void **devaddrs, unsigned *dims,
			   void *targ_mem_desc)
{
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;

  gcn_exec (kernel, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, false,
	    NULL);
}

/* Run an asynchronous OpenACC kernel on the specified queue.  */

void
GOMP_OFFLOAD_openacc_async_exec (void (*fn_ptr) (void *), size_t mapnum,
				 void **hostaddrs, void **devaddrs,
				 unsigned *dims, void *targ_mem_desc,
				 struct goacc_asyncqueue *aq)
{
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;

  gcn_exec (kernel, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, true,
	    aq);
}

/* Create a new asynchronous thread and queue for running future kernels.  */

struct goacc_asyncqueue *
GOMP_OFFLOAD_openacc_async_construct (int device)
{
  struct agent_info *agent = get_agent_info (device);

  pthread_mutex_lock (&agent->async_queues_mutex);

  struct goacc_asyncqueue *aq = GOMP_PLUGIN_malloc (sizeof (*aq));
  aq->agent = get_agent_info (device);
  aq->prev = NULL;
  aq->next = agent->async_queues;
  if (aq->next)
    {
      aq->next->prev = aq;
      aq->id = aq->next->id + 1;
    }
  else
    aq->id = 1;
  agent->async_queues = aq;

  aq->queue_first = 0;
  aq->queue_n = 0;
  aq->drain_queue_stop = 0;

  /* This function returns a pointer, so failure paths must yield NULL.  */
  if (pthread_mutex_init (&aq->mutex, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
      return NULL;
    }
  if (pthread_cond_init (&aq->queue_cond_in, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
      return NULL;
    }
  if (pthread_cond_init (&aq->queue_cond_out, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
      return NULL;
    }

  hsa_status_t status = hsa_fns.hsa_queue_create_fn (agent->id,
						     ASYNC_QUEUE_SIZE,
						     HSA_QUEUE_TYPE_MULTI,
						     hsa_queue_callback, NULL,
						     UINT32_MAX, UINT32_MAX,
						     &aq->hsa_queue);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Error creating command queue", status);

  int err = pthread_create (&aq->thread_drain_queue, NULL, &drain_queue, aq);
  if (err != 0)
    GOMP_PLUGIN_fatal ("GCN asynchronous thread creation failed: %s",
		       strerror (err));
  GCN_DEBUG ("Async thread %d:%d: created\n", aq->agent->device_id,
	     aq->id);

  pthread_mutex_unlock (&agent->async_queues_mutex);

  return aq;
}
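
/* Usage sketch: each distinct OpenACC async value in user code gets its own
   queue and drain thread through this hook, e.g. for this hypothetical loop:

     #pragma acc parallel loop async(2) copy(v[0:n])
     for (int i = 0; i < n; i++)
       v[i] *= 2.0;
     #pragma acc wait(2)
*/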

/* Destroy an existing asynchronous thread and queue.  Waits for any
   currently-running task to complete, but cancels any queued tasks.  */

bool
GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = aq->agent;

  finalize_async_thread (aq);

  pthread_mutex_lock (&agent->async_queues_mutex);

  int err;
  if ((err = pthread_mutex_destroy (&aq->mutex)))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue mutex: %d", err);
      goto fail;
    }
  if (pthread_cond_destroy (&aq->queue_cond_in))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
      goto fail;
    }
  if (pthread_cond_destroy (&aq->queue_cond_out))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
      goto fail;
    }
  hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (aq->hsa_queue);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Error destroying command queue", status);
      goto fail;
    }

  if (aq->prev)
    aq->prev->next = aq->next;
  if (aq->next)
    aq->next->prev = aq->prev;
  if (agent->async_queues == aq)
    agent->async_queues = aq->next;

  GCN_DEBUG ("Async thread %d:%d: destroyed\n", agent->device_id, aq->id);

  free (aq);
  pthread_mutex_unlock (&agent->async_queues_mutex);
  return true;

fail:
  pthread_mutex_unlock (&agent->async_queues_mutex);
  return false;
}

/* Return true if the specified async queue is currently empty.  */

int
GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
{
  return queue_empty (aq);
}

/* Block until the specified queue has executed all its tasks and the
   queue is empty.  */

bool
GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
{
  wait_queue (aq);
  return true;
}

/* Add a serialization point across two async queues.  Any new tasks added to
   AQ2, after this call, will not run until all tasks on AQ1, at the time
   of this call, have completed.  */

bool
GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
				      struct goacc_asyncqueue *aq2)
{
  /* For serialize, stream aq2 waits for aq1 to complete work that has been
     scheduled to run on it up to this point.  */
  if (aq1 != aq2)
    {
      struct placeholder *placeholderp = queue_push_placeholder (aq1);
      queue_push_asyncwait (aq2, placeholderp);
    }
  return true;
}
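
/* Usage sketch: this hook implements cross-queue waits such as the
   hypothetical directive below, which makes queue 2 wait for all work
   already queued on queue 1 without blocking the host thread:

     #pragma acc wait(1) async(2)
*/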

/* Add an opaque callback to the given async queue.  */

void
GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
					   void (*fn) (void *), void *data)
{
  queue_push_callback (aq, fn, data);
}

/* Queue up an asynchronous data copy from host to DEVICE.  */

bool
GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
				     size_t n, struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = get_agent_info (device);
  assert (agent == aq->agent);
  queue_push_copy (aq, dst, src, n);
  return true;
}

/* Queue up an asynchronous data copy from DEVICE to host.  */

bool
GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src,
				     size_t n, struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = get_agent_info (device);
  assert (agent == aq->agent);
  queue_push_copy (aq, dst, src, n);
  return true;
}
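
/* Usage sketch: asynchronous update directives map onto the two hooks
   above, e.g. this hypothetical device-to-host update:

     #pragma acc update self(buf[0:n]) async(3)
     #pragma acc wait(3)
*/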

/* Return the value of property PROP of the device numbered DEVICE.  */

union goacc_property_value
GOMP_OFFLOAD_openacc_get_property (int device, enum goacc_property prop)
{
  struct agent_info *agent = get_agent_info (device);

  union goacc_property_value propval = { .val = 0 };

  switch (prop)
    {
    case GOACC_PROPERTY_FREE_MEMORY:
      /* Not supported.  */
      break;
    case GOACC_PROPERTY_MEMORY:
      {
	size_t size;
	hsa_region_t region = agent->data_region;
	hsa_status_t status =
	  hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
	if (status == HSA_STATUS_SUCCESS)
	  propval.val = size;
	break;
      }
    case GOACC_PROPERTY_NAME:
      propval.ptr = agent->name;
      break;
    case GOACC_PROPERTY_VENDOR:
      propval.ptr = agent->vendor_name;
      break;
    case GOACC_PROPERTY_DRIVER:
      propval.ptr = hsa_context.driver_version_s;
      break;
    }

  return propval;
}
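
/* Usage sketch: these properties are what user code sees through the
   OpenACC 2.6 property API (names as in GCC's openacc.h; the device
   number 0 below is hypothetical):

     #include <openacc.h>

     size_t mem = acc_get_property (0, acc_device_radeon,
				    acc_property_memory);
     const char *name = acc_get_property_string (0, acc_device_radeon,
						 acc_property_name);
*/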

/* Set up plugin-specific thread-local-data (host-side).  */

void *
GOMP_OFFLOAD_openacc_create_thread_data (int ord __attribute__((unused)))
{
  struct gcn_thread *thread_data
    = GOMP_PLUGIN_malloc (sizeof (struct gcn_thread));

  thread_data->async = GOMP_ASYNC_SYNC;

  return (void *) thread_data;
}

/* Clean up plugin-specific thread-local-data.  */

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}

/* }}} */