/* Plugin for AMD GCN execution.

   Copyright (C) 2013-2023 Free Software Foundation, Inc.

   Contributed by Mentor Embedded

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* {{{ Includes and defines */

#include "config.h"
#include "symcat.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <inttypes.h>
#include <stdbool.h>
#include <limits.h>
#include <hsa.h>
#include <hsa_ext_amd.h>
#include <dlfcn.h>
#include <signal.h>
#include "libgomp-plugin.h"
#include "config/gcn/libgomp-gcn.h"  /* For struct output.  */
#include "gomp-constants.h"
#include <elf.h>
#include "oacc-plugin.h"
#include "oacc-int.h"
#include <assert.h>

/* These probably won't be in elf.h for a while.  */
#ifndef R_AMDGPU_NONE
#define R_AMDGPU_NONE           0
#define R_AMDGPU_ABS32_LO       1   /* (S + A) & 0xFFFFFFFF  */
#define R_AMDGPU_ABS32_HI       2   /* (S + A) >> 32  */
#define R_AMDGPU_ABS64          3   /* S + A  */
#define R_AMDGPU_REL32          4   /* S + A - P  */
#define R_AMDGPU_REL64          5   /* S + A - P  */
#define R_AMDGPU_ABS32          6   /* S + A  */
#define R_AMDGPU_GOTPCREL       7   /* G + GOT + A - P  */
#define R_AMDGPU_GOTPCREL32_LO  8   /* (G + GOT + A - P) & 0xFFFFFFFF  */
#define R_AMDGPU_GOTPCREL32_HI  9   /* (G + GOT + A - P) >> 32  */
#define R_AMDGPU_REL32_LO       10  /* (S + A - P) & 0xFFFFFFFF  */
#define R_AMDGPU_REL32_HI       11  /* (S + A - P) >> 32  */
#define R_AMDGPU_RELATIVE64     13  /* B + A  */
#endif

/* GCN specific definitions for asynchronous queues.  */

#define ASYNC_QUEUE_SIZE 64
#define DRAIN_QUEUE_SYNCHRONOUS_P false
#define DEBUG_QUEUES 0
#define DEBUG_THREAD_SLEEP 0
#define DEBUG_THREAD_SIGNAL 0

/* Defaults.  */
#define DEFAULT_GCN_HEAP_SIZE (100*1024*1024)  /* 100MB.  */

/* Secure getenv() which returns NULL if running as SUID/SGID.  */
#ifndef HAVE_SECURE_GETENV
#ifdef HAVE___SECURE_GETENV
#define secure_getenv __secure_getenv
#elif defined (HAVE_UNISTD_H) && defined(HAVE_GETUID) && defined(HAVE_GETEUID) \
  && defined(HAVE_GETGID) && defined(HAVE_GETEGID)

#include <unistd.h>

/* Implementation of secure_getenv() for targets where it is not provided but
   we have at least means to test real and effective IDs.  */

static char *
secure_getenv (const char *name)
{
  if ((getuid () == geteuid ()) && (getgid () == getegid ()))
    return getenv (name);
  else
    return NULL;
}

#else
#define secure_getenv getenv
#endif
#endif

/* }}} */
/* {{{ Types */

/* GCN-specific implementation of the GOMP_PLUGIN_acc_thread data.  */

struct gcn_thread
{
  /* The thread number from the async clause, or GOMP_ASYNC_SYNC.  */
  int async;
};

/* As the HSA runtime is dlopened, the following structure defines the
   function pointers utilized by the HSA plug-in.  */

struct hsa_runtime_fn_info
{
  /* HSA runtime.  */
  hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
                                        const char **status_string);
  hsa_status_t (*hsa_system_get_info_fn) (hsa_system_info_t attribute,
                                          void *value);
  hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
                                         hsa_agent_info_t attribute,
                                         void *value);
  hsa_status_t (*hsa_isa_get_info_fn) (hsa_isa_t isa,
                                       hsa_isa_info_t attribute,
                                       uint32_t index,
                                       void *value);
  hsa_status_t (*hsa_init_fn) (void);
  hsa_status_t (*hsa_iterate_agents_fn)
    (hsa_status_t (*callback) (hsa_agent_t agent, void *data), void *data);
  hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
                                          hsa_region_info_t attribute,
                                          void *value);
  hsa_status_t (*hsa_queue_create_fn)
    (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
     void (*callback) (hsa_status_t status, hsa_queue_t *source, void *data),
     void *data, uint32_t private_segment_size,
     uint32_t group_segment_size, hsa_queue_t **queue);
  hsa_status_t (*hsa_agent_iterate_regions_fn)
    (hsa_agent_t agent,
     hsa_status_t (*callback) (hsa_region_t region, void *data), void *data);
  hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
  hsa_status_t (*hsa_executable_create_fn)
    (hsa_profile_t profile, hsa_executable_state_t executable_state,
     const char *options, hsa_executable_t *executable);
  hsa_status_t (*hsa_executable_global_variable_define_fn)
    (hsa_executable_t executable, const char *variable_name, void *address);
  hsa_status_t (*hsa_executable_load_code_object_fn)
    (hsa_executable_t executable, hsa_agent_t agent,
     hsa_code_object_t code_object, const char *options);
  hsa_status_t (*hsa_executable_freeze_fn) (hsa_executable_t executable,
                                            const char *options);
  hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
                                        uint32_t num_consumers,
                                        const hsa_agent_t *consumers,
                                        hsa_signal_t *signal);
  hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
                                          void **ptr);
  hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent,
                                              hsa_access_permission_t access);
  hsa_status_t (*hsa_memory_copy_fn) (void *dst, const void *src, size_t size);
  hsa_status_t (*hsa_memory_free_fn) (void *ptr);
  hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
  hsa_status_t (*hsa_executable_get_symbol_fn)
    (hsa_executable_t executable, const char *module_name,
     const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
     hsa_executable_symbol_t *symbol);
  hsa_status_t (*hsa_executable_symbol_get_info_fn)
    (hsa_executable_symbol_t executable_symbol,
     hsa_executable_symbol_info_t attribute, void *value);
  hsa_status_t (*hsa_executable_iterate_symbols_fn)
    (hsa_executable_t executable,
     hsa_status_t (*callback) (hsa_executable_t executable,
                               hsa_executable_symbol_t symbol, void *data),
     void *data);
  uint64_t (*hsa_queue_add_write_index_release_fn) (const hsa_queue_t *queue,
                                                    uint64_t value);
  uint64_t (*hsa_queue_load_read_index_acquire_fn) (const hsa_queue_t *queue);
  void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
                                       hsa_signal_value_t value);
  void (*hsa_signal_store_release_fn) (hsa_signal_t signal,
                                       hsa_signal_value_t value);
  hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
    (hsa_signal_t signal, hsa_signal_condition_t condition,
     hsa_signal_value_t compare_value, uint64_t timeout_hint,
     hsa_wait_state_t wait_state_hint);
  hsa_signal_value_t (*hsa_signal_load_acquire_fn) (hsa_signal_t signal);
  hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);

  hsa_status_t (*hsa_code_object_deserialize_fn)
    (void *serialized_code_object, size_t serialized_code_object_size,
     const char *options, hsa_code_object_t *code_object);
};

/* Structure describing the run-time and grid properties of an HSA kernel
   launch.  This needs to match the format passed to GOMP_OFFLOAD_run.  */

struct GOMP_kernel_launch_attributes
{
  /* Number of dimensions the workload has.  Maximum number is 3.  */
  uint32_t ndim;
  /* Size of the grid in the three respective dimensions.  */
  uint32_t gdims[3];
  /* Size of work-groups in the respective dimensions.  */
  uint32_t wdims[3];
};
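
/* For illustration: a launch of 4 OpenMP teams of 16 threads, each thread
   being a 64-lane wavefront, would be encoded (matching the defaults
   computed in parse_target_attributes below) as

     ndim = 3, gdims = {4, 64, 16}, wdims = {1, 64, 16}.  */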

/* Collection of information needed for a dispatch of a kernel from a
   kernel.  */

struct kernel_dispatch
{
  struct agent_info *agent;
  /* Pointer to a command queue associated with a kernel dispatch agent.  */
  void *queue;
  /* Pointer to a memory space used for kernel arguments passing.  */
  void *kernarg_address;
  /* Kernel object.  */
  uint64_t object;
  /* Synchronization signal used for dispatch synchronization.  */
  uint64_t signal;
  /* Private segment size.  */
  uint32_t private_segment_size;
  /* Group segment size.  */
  uint32_t group_segment_size;
};

/* Structure of the kernargs segment, supporting console output.

   This needs to match the definitions in Newlib, and the expectations
   in libgomp target code.  */

struct kernargs {
  struct kernargs_abi abi;

  /* Output data.  */
  struct output output_data;
};

/* A queue entry for a future asynchronous launch.  */

struct kernel_launch
{
  struct kernel_info *kernel;
  void *vars;
  struct GOMP_kernel_launch_attributes kla;
};

/* A queue entry for a future callback.  */

struct callback
{
  void (*fn)(void *);
  void *data;
};

/* A data struct for the copy_data callback.  */

struct copy_data
{
  void *dst;
  const void *src;
  size_t len;
  struct goacc_asyncqueue *aq;
};

/* A queue entry for a placeholder.  Each corresponds to a wait event.  */

struct placeholder
{
  int executed;
  pthread_cond_t cond;
  pthread_mutex_t mutex;
};

/* A queue entry for a wait directive.  */

struct asyncwait_info
{
  struct placeholder *placeholderp;
};

/* Encode the type of an entry in an async queue.  */

enum entry_type
{
  KERNEL_LAUNCH,
  CALLBACK,
  ASYNC_WAIT,
  ASYNC_PLACEHOLDER
};

/* An entry in an async queue.  */

struct queue_entry
{
  enum entry_type type;
  union {
    struct kernel_launch launch;
    struct callback callback;
    struct asyncwait_info asyncwait;
    struct placeholder placeholder;
  } u;
};

/* An async queue header.

   OpenMP may create one of these.
   OpenACC may create many.  */

struct goacc_asyncqueue
{
  struct agent_info *agent;
  hsa_queue_t *hsa_queue;

  pthread_t thread_drain_queue;
  pthread_mutex_t mutex;
  pthread_cond_t queue_cond_in;
  pthread_cond_t queue_cond_out;
  struct queue_entry queue[ASYNC_QUEUE_SIZE];
  int queue_first;
  int queue_n;
  int drain_queue_stop;

  int id;
  struct goacc_asyncqueue *prev;
  struct goacc_asyncqueue *next;
};
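
/* A sketch of the ring-buffer convention these fields suggest (the queue
   management code lives further below, outside this excerpt): QUEUE_FIRST
   indexes the oldest pending entry, QUEUE_N counts entries in flight, and
   entry I would be found at

     queue[(queue_first + i) % ASYNC_QUEUE_SIZE]

   with MUTEX and the two condition variables synchronizing producer
   threads against the drain thread.  */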

/* Mkoffload uses this structure to describe a kernel.

   OpenMP kernel dimensions are passed at runtime.
   OpenACC kernel dimensions are passed at compile time, here.  */

struct hsa_kernel_description
{
  const char *name;
  int oacc_dims[3];  /* Only present for GCN kernels.  */
  int sgpr_count;
  int vpgr_count;
};

/* Mkoffload uses this structure to describe an offload variable.  */

struct global_var_info
{
  const char *name;
  void *address;
};

/* Mkoffload uses this structure to describe all the kernels in a
   loadable module.  These are passed to libgomp via static constructors.  */

struct gcn_image_desc
{
  struct gcn_image {
    size_t size;
    void *image;
  } *gcn_image;
  const unsigned kernel_count;
  struct hsa_kernel_description *kernel_infos;
  const unsigned global_variable_count;
};

/* This enum mirrors the corresponding LLVM enum's values for all ISAs that we
   support.
   See https://llvm.org/docs/AMDGPUUsage.html#amdgpu-ef-amdgpu-mach-table */

typedef enum {
  EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a,
  EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c,
  EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f,
  EF_AMDGPU_MACH_AMDGCN_GFX908 = 0x030,
  EF_AMDGPU_MACH_AMDGCN_GFX90a = 0x03f
} EF_AMDGPU_MACH;

const static int EF_AMDGPU_MACH_MASK = 0x000000ff;
typedef EF_AMDGPU_MACH gcn_isa;

/* Description of an HSA GPU agent (device) and the program associated with
   it.  */

struct agent_info
{
  /* The HSA ID of the agent.  Assigned when hsa_context is initialized.  */
  hsa_agent_t id;
  /* The user-visible device number.  */
  int device_id;
  /* Whether the agent has been initialized.  The fields below are usable only
     if it has been.  */
  bool initialized;

  /* The instruction set architecture of the device.  */
  gcn_isa device_isa;
  /* Name of the agent.  */
  char name[64];
  /* Name of the vendor of the agent.  */
  char vendor_name[64];
  /* Command queues of the agent.  */
  hsa_queue_t *sync_queue;
  struct goacc_asyncqueue *async_queues, *omp_async_queue;
  pthread_mutex_t async_queues_mutex;

  /* The HSA memory region from which to allocate kernel arguments.  */
  hsa_region_t kernarg_region;

  /* The HSA memory region from which to allocate device data.  */
  hsa_region_t data_region;

  /* Allocated ephemeral memories (team arena and stack space).  */
  struct ephemeral_memories_list *ephemeral_memories_list;
  pthread_mutex_t ephemeral_memories_write_lock;

  /* Read-write lock that protects kernels which are running or about to be run
     from interference with loading and unloading of images.  Needs to be
     locked for reading while a kernel is being run, and for writing if the
     list of modules is manipulated (and thus the HSA program invalidated).  */
  pthread_rwlock_t module_rwlock;

  /* The module associated with this agent.  */
  struct module_info *module;

  /* Mutex enforcing that only one thread will finalize the HSA program.  A
     thread should have locked agent->module_rwlock for reading before
     acquiring it.  */
  pthread_mutex_t prog_mutex;
  /* Flag whether the HSA program that consists of all the modules has been
     finalized.  */
  bool prog_finalized;
  /* HSA executable - the finalized program that is used to locate kernels.  */
  hsa_executable_t executable;
};

/* Information required to identify, finalize and run any given kernel.  */

enum offload_kind {KIND_UNKNOWN, KIND_OPENMP, KIND_OPENACC};

struct kernel_info
{
  /* Name of the kernel, required to locate it within the GCN object-code
     module.  */
  const char *name;
  /* The specific agent the kernel has been or will be finalized for and run
     on.  */
  struct agent_info *agent;
  /* The specific module where the kernel resides.  */
  struct module_info *module;
  /* Information provided by mkoffload associated with the kernel.  */
  struct hsa_kernel_description *description;
  /* Mutex enforcing that at most one thread ever initializes a kernel for
     use.  A thread should have locked agent->module_rwlock for reading before
     acquiring it.  */
  pthread_mutex_t init_mutex;
  /* Flag indicating whether the kernel has been initialized and all fields
     below it contain valid data.  */
  bool initialized;
  /* Flag indicating that the kernel has a problem that blocks its execution.  */
  bool initialization_failed;
  /* The object to be put into the dispatch queue.  */
  uint64_t object;
  /* Required size of kernel arguments.  */
  uint32_t kernarg_segment_size;
  /* Required size of group segment.  */
  uint32_t group_segment_size;
  /* Required size of private segment.  */
  uint32_t private_segment_size;
  /* Set up for OpenMP or OpenACC?  */
  enum offload_kind kind;
};

/* Information about a particular GCN module, its image and kernels.  */

struct module_info
{
  /* The description with which the program has registered the image.  */
  struct gcn_image_desc *image_desc;
  /* GCN heap allocation.  */
  struct heap *heap;
  /* Physical boundaries of the loaded module.  */
  Elf64_Addr phys_address_start;
  Elf64_Addr phys_address_end;

  bool constructors_run_p;
  struct kernel_info *init_array_func, *fini_array_func;

  /* Number of kernels in this module.  */
  int kernel_count;
  /* An array of kernel_info structures describing each kernel in this
     module.  */
  struct kernel_info kernels[];
};

/* A linked list of memory arenas allocated on the device.
   These are used by OpenMP, as a means to optimize per-team malloc,
   and for host-accessible stack space.  */

struct ephemeral_memories_list
{
  struct ephemeral_memories_list *next;

  /* The size is determined by the number of teams and threads.  */
  size_t size;
  /* The device address of the allocated memory.  */
  void *address;
  /* A flag to prevent two asynchronous kernels trying to use the same memory.
     The mutex is locked until the kernel exits.  */
  pthread_mutex_t in_use;
};

/* Information about the whole HSA environment and all of its agents.  */

struct hsa_context_info
{
  /* Whether the structure has been initialized.  */
  bool initialized;
  /* Number of usable GPU HSA agents in the system.  */
  int agent_count;
  /* Array of agent_info structures describing the individual HSA agents.  */
  struct agent_info *agents;
  /* Driver version string.  */
  char driver_version_s[30];
};

/* }}} */
/* {{{ Global variables */

/* Information about the whole HSA environment and all of its agents.  */

static struct hsa_context_info hsa_context;

/* HSA runtime functions that are initialized in init_hsa_context.  */

static struct hsa_runtime_fn_info hsa_fns;

/* Heap space, allocated target-side, provided for use of newlib malloc.
   Each module should have its own heap allocated.
   Beware that heap usage increases with OpenMP teams.  See also arenas.  */

static size_t gcn_kernel_heap_size = DEFAULT_GCN_HEAP_SIZE;

/* Ephemeral memory sizes for each kernel launch.  */

static int team_arena_size = DEFAULT_TEAM_ARENA_SIZE;
static int stack_size = DEFAULT_GCN_STACK_SIZE;

/* Flag to decide whether to print information to stderr about what is going
   on.  Set in init_environment_variables depending on environment variables.  */

static bool debug;

/* Flag to decide if the runtime should suppress a possible fallback to host
   execution.  */

static bool suppress_host_fallback;

/* Location of the HSA runtime shared library that is dlopened
   by this plug-in.  */

static const char *hsa_runtime_lib;

/* Flag to decide if the runtime should also support CPU devices (which can
   be a simulator).  */

static bool support_cpu_devices;

/* Runtime dimension overrides.  Zero indicates default.  */

static int override_x_dim = 0;
static int override_z_dim = 0;

/* }}} */
/* {{{ Debug & Diagnostic */

/* Print a message to stderr if GCN_DEBUG value is set to true.  */

#define DEBUG_PRINT(...) \
  do \
    { \
      if (debug) \
        { \
          fprintf (stderr, __VA_ARGS__); \
        } \
    } \
  while (false)

/* Flush stderr if GCN_DEBUG value is set to true.  */

#define DEBUG_FLUSH() \
  do { \
    if (debug) \
      fflush (stderr); \
  } while (false)

/* Print a logging message with PREFIX to stderr if GCN_DEBUG value
   is set to true.  */

#define DEBUG_LOG(prefix, ...) \
  do \
    { \
      DEBUG_PRINT (prefix); \
      DEBUG_PRINT (__VA_ARGS__); \
      DEBUG_FLUSH (); \
    } while (false)

/* Print a debugging message to stderr.  */

#define GCN_DEBUG(...) DEBUG_LOG ("GCN debug: ", __VA_ARGS__)

/* Print a warning message to stderr.  */

#define GCN_WARNING(...) DEBUG_LOG ("GCN warning: ", __VA_ARGS__)
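
/* Typical usage of the macros above, as seen throughout this file:

     GCN_DEBUG ("HSA run-time initialized for GCN\n");
     GCN_WARNING ("HSA_AGENT_INFO_NAME: FAILED\n");

   Both expand to fprintf calls guarded by "debug", so they cost almost
   nothing unless GCN_DEBUG is set in the environment.  */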

/* Print HSA warning STR with an HSA STATUS code.  */

static void
hsa_warn (const char *str, hsa_status_t status)
{
  if (!debug)
    return;

  const char *hsa_error_msg = "[unknown]";
  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);

  fprintf (stderr, "GCN warning: %s\nRuntime message: %s\n", str,
           hsa_error_msg);
}

/* Report a fatal error STR together with the HSA error corresponding to STATUS
   and terminate execution of the current process.  */

static void
hsa_fatal (const char *str, hsa_status_t status)
{
  const char *hsa_error_msg = "[unknown]";
  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
  GOMP_PLUGIN_fatal ("GCN fatal error: %s\nRuntime message: %s\n", str,
                     hsa_error_msg);
}

/* Like hsa_fatal, except only report the error message, and return FALSE so
   that error handling can be propagated to code outside the plugin.  */

static bool
hsa_error (const char *str, hsa_status_t status)
{
  const char *hsa_error_msg = "[unknown]";
  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
  GOMP_PLUGIN_error ("GCN fatal error: %s\nRuntime message: %s\n", str,
                     hsa_error_msg);
  return false;
}

/* Dump information about the available hardware.  */

static void
dump_hsa_system_info (void)
{
  hsa_status_t status;

  hsa_endianness_t endianness;
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_ENDIANNESS,
                                           &endianness);
  if (status == HSA_STATUS_SUCCESS)
    switch (endianness)
      {
      case HSA_ENDIANNESS_LITTLE:
        GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: LITTLE\n");
        break;
      case HSA_ENDIANNESS_BIG:
        GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: BIG\n");
        break;
      default:
        GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: UNKNOWN\n");
      }
  else
    GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: FAILED\n");

  uint8_t extensions[128];
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_EXTENSIONS,
                                           &extensions);
  if (status == HSA_STATUS_SUCCESS)
    {
      if (extensions[0] & (1 << HSA_EXTENSION_IMAGES))
        GCN_DEBUG ("HSA_SYSTEM_INFO_EXTENSIONS: IMAGES\n");
    }
  else
    GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n");
}

/* Dump the machine model MACHINE_MODEL, labelled S.  */

static void
dump_machine_model (hsa_machine_model_t machine_model, const char *s)
{
  switch (machine_model)
    {
    case HSA_MACHINE_MODEL_SMALL:
      GCN_DEBUG ("%s: SMALL\n", s);
      break;
    case HSA_MACHINE_MODEL_LARGE:
      GCN_DEBUG ("%s: LARGE\n", s);
      break;
    default:
      GCN_WARNING ("%s: UNKNOWN\n", s);
      break;
    }
}

/* Dump the HSA profile PROFILE, labelled S.  */

static void
dump_profile (hsa_profile_t profile, const char *s)
{
  switch (profile)
    {
    case HSA_PROFILE_FULL:
      GCN_DEBUG ("%s: FULL\n", s);
      break;
    case HSA_PROFILE_BASE:
      GCN_DEBUG ("%s: BASE\n", s);
      break;
    default:
      GCN_WARNING ("%s: UNKNOWN\n", s);
      break;
    }
}

/* Dump information about a device memory region.  */

static hsa_status_t
dump_hsa_region (hsa_region_t region, void *data __attribute__((unused)))
{
  hsa_status_t status;

  hsa_region_segment_t segment;
  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
                                           &segment);
  if (status == HSA_STATUS_SUCCESS)
    {
      if (segment == HSA_REGION_SEGMENT_GLOBAL)
        GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GLOBAL\n");
      else if (segment == HSA_REGION_SEGMENT_READONLY)
        GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: READONLY\n");
      else if (segment == HSA_REGION_SEGMENT_PRIVATE)
        GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: PRIVATE\n");
      else if (segment == HSA_REGION_SEGMENT_GROUP)
        GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GROUP\n");
      else
        GCN_WARNING ("HSA_REGION_INFO_SEGMENT: UNKNOWN\n");
    }
  else
    GCN_WARNING ("HSA_REGION_INFO_SEGMENT: FAILED\n");

  if (status == HSA_STATUS_SUCCESS && segment == HSA_REGION_SEGMENT_GLOBAL)
    {
      uint32_t flags;
      status
        = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
                                          &flags);
      if (status == HSA_STATUS_SUCCESS)
        {
          if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
            GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: KERNARG\n");
          if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED)
            GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: FINE_GRAINED\n");
          if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED)
            GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: COARSE_GRAINED\n");
        }
      else
        GCN_WARNING ("HSA_REGION_INFO_GLOBAL_FLAGS: FAILED\n");
    }

  size_t size;
  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_SIZE: %zu\n", size);
  else
    GCN_WARNING ("HSA_REGION_INFO_SIZE: FAILED\n");

  status
    = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_ALLOC_MAX_SIZE,
                                      &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_ALLOC_MAX_SIZE: %zu\n", size);
  else
    GCN_WARNING ("HSA_REGION_INFO_ALLOC_MAX_SIZE: FAILED\n");

  bool alloc_allowed;
  status
    = hsa_fns.hsa_region_get_info_fn (region,
                                      HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED,
                                      &alloc_allowed);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: %u\n", alloc_allowed);
  else
    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: FAILED\n");

  if (status != HSA_STATUS_SUCCESS || !alloc_allowed)
    return HSA_STATUS_SUCCESS;

  status
    = hsa_fns.hsa_region_get_info_fn (region,
                                      HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE,
                                      &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: %zu\n", size);
  else
    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: FAILED\n");

  size_t align;
  status
    = hsa_fns.hsa_region_get_info_fn (region,
                                      HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT,
                                      &align);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: %zu\n", align);
  else
    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: FAILED\n");

  return HSA_STATUS_SUCCESS;
}

/* Dump information about all the device memory regions.  */

static void
dump_hsa_regions (hsa_agent_t agent)
{
  hsa_status_t status;
  status = hsa_fns.hsa_agent_iterate_regions_fn (agent,
                                                 dump_hsa_region,
                                                 NULL);
  if (status != HSA_STATUS_SUCCESS)
    hsa_error ("Dumping hsa regions failed", status);
}

/* Dump information about the available devices.  */

static hsa_status_t
dump_hsa_agent_info (hsa_agent_t agent, void *data __attribute__((unused)))
{
  hsa_status_t status;

  char buf[64];
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME,
                                          &buf);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_NAME: %s\n", buf);
  else
    GCN_WARNING ("HSA_AGENT_INFO_NAME: FAILED\n");

  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_VENDOR_NAME,
                                          &buf);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_VENDOR_NAME: %s\n", buf);
  else
    GCN_WARNING ("HSA_AGENT_INFO_VENDOR_NAME: FAILED\n");

  hsa_machine_model_t machine_model;
  status
    = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_MACHINE_MODEL,
                                     &machine_model);
  if (status == HSA_STATUS_SUCCESS)
    dump_machine_model (machine_model, "HSA_AGENT_INFO_MACHINE_MODEL");
  else
    GCN_WARNING ("HSA_AGENT_INFO_MACHINE_MODEL: FAILED\n");

  hsa_profile_t profile;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_PROFILE,
                                          &profile);
  if (status == HSA_STATUS_SUCCESS)
    dump_profile (profile, "HSA_AGENT_INFO_PROFILE");
  else
    GCN_WARNING ("HSA_AGENT_INFO_PROFILE: FAILED\n");

  hsa_device_type_t device_type;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
                                          &device_type);
  if (status == HSA_STATUS_SUCCESS)
    {
      switch (device_type)
        {
        case HSA_DEVICE_TYPE_CPU:
          GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: CPU\n");
          break;
        case HSA_DEVICE_TYPE_GPU:
          GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: GPU\n");
          break;
        case HSA_DEVICE_TYPE_DSP:
          GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: DSP\n");
          break;
        default:
          GCN_WARNING ("HSA_AGENT_INFO_DEVICE: UNKNOWN\n");
          break;
        }
    }
  else
    GCN_WARNING ("HSA_AGENT_INFO_DEVICE: FAILED\n");

  uint32_t cu_count;
  status = hsa_fns.hsa_agent_get_info_fn
    (agent, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: %u\n", cu_count);
  else
    GCN_WARNING ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: FAILED\n");

  uint32_t size;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_WAVEFRONT_SIZE,
                                          &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WAVEFRONT_SIZE: %u\n", size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WAVEFRONT_SIZE: FAILED\n");

  uint32_t max_dim;
  status = hsa_fns.hsa_agent_get_info_fn (agent,
                                          HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
                                          &max_dim);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: %u\n", max_dim);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: FAILED\n");

  uint32_t max_size;
  status = hsa_fns.hsa_agent_get_info_fn (agent,
                                          HSA_AGENT_INFO_WORKGROUP_MAX_SIZE,
                                          &max_size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: %u\n", max_size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: FAILED\n");

  uint32_t grid_max_dim;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_DIM,
                                          &grid_max_dim);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_DIM: %u\n", grid_max_dim);
  else
    GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_DIM: FAILED\n");

  uint32_t grid_max_size;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_SIZE,
                                          &grid_max_size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_SIZE: %u\n", grid_max_size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_SIZE: FAILED\n");

  dump_hsa_regions (agent);

  return HSA_STATUS_SUCCESS;
}

/* Forward reference.  */

static char *get_executable_symbol_name (hsa_executable_symbol_t symbol);

/* Helper function for dump_executable_symbols.  */

static hsa_status_t
dump_executable_symbol (hsa_executable_t executable,
                        hsa_executable_symbol_t symbol,
                        void *data __attribute__((unused)))
{
  char *name = get_executable_symbol_name (symbol);

  if (name)
    {
      GCN_DEBUG ("executable symbol: %s\n", name);
      free (name);
    }

  return HSA_STATUS_SUCCESS;
}

/* Dump all global symbols in an executable.  */

static void
dump_executable_symbols (hsa_executable_t executable)
{
  hsa_status_t status;
  status
    = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
                                                 dump_executable_symbol,
                                                 NULL);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not dump HSA executable symbols", status);
}

/* Dump kernel DISPATCH data structure and indent it by INDENT spaces.  */

static void
print_kernel_dispatch (struct kernel_dispatch *dispatch, unsigned indent)
{
  struct kernargs *kernargs = (struct kernargs *)dispatch->kernarg_address;

  fprintf (stderr, "%*sthis: %p\n", indent, "", dispatch);
  fprintf (stderr, "%*squeue: %p\n", indent, "", dispatch->queue);
  fprintf (stderr, "%*skernarg_address: %p\n", indent, "", kernargs);
  fprintf (stderr, "%*sheap address: %p\n", indent, "",
           (void*)kernargs->abi.heap_ptr);
  fprintf (stderr, "%*sarena address: %p (%d bytes per workgroup)\n", indent,
           "", (void*)kernargs->abi.arena_ptr,
           kernargs->abi.arena_size_per_team);
  fprintf (stderr, "%*sstack address: %p (%d bytes per wavefront)\n", indent,
           "", (void*)kernargs->abi.stack_ptr,
           kernargs->abi.stack_size_per_thread);
  fprintf (stderr, "%*sobject: %lu\n", indent, "", dispatch->object);
  fprintf (stderr, "%*sprivate_segment_size: %u\n", indent, "",
           dispatch->private_segment_size);
  fprintf (stderr, "%*sgroup_segment_size: %u\n", indent, "",
           dispatch->group_segment_size);
  fprintf (stderr, "\n");
}

/* }}} */
/* {{{ Utility functions */

/* Cast the thread local storage to gcn_thread.  */

static inline struct gcn_thread *
gcn_thread (void)
{
  return (struct gcn_thread *) GOMP_PLUGIN_acc_thread ();
}

/* Initialize debug and the other configuration flags and sizes according to
   the environment.  */

static void
init_environment_variables (void)
{
  if (secure_getenv ("GCN_DEBUG"))
    debug = true;
  else
    debug = false;

  if (secure_getenv ("GCN_SUPPRESS_HOST_FALLBACK"))
    suppress_host_fallback = true;
  else
    suppress_host_fallback = false;

  hsa_runtime_lib = secure_getenv ("HSA_RUNTIME_LIB");
  if (hsa_runtime_lib == NULL)
    hsa_runtime_lib = "libhsa-runtime64.so.1";

  support_cpu_devices = secure_getenv ("GCN_SUPPORT_CPU_DEVICES");

  const char *x = secure_getenv ("GCN_NUM_TEAMS");
  if (!x)
    x = secure_getenv ("GCN_NUM_GANGS");
  if (x)
    override_x_dim = atoi (x);

  const char *z = secure_getenv ("GCN_NUM_THREADS");
  if (!z)
    z = secure_getenv ("GCN_NUM_WORKERS");
  if (z)
    override_z_dim = atoi (z);

  const char *heap = secure_getenv ("GCN_HEAP_SIZE");
  if (heap)
    {
      size_t tmp = atol (heap);
      if (tmp)
        gcn_kernel_heap_size = tmp;
    }

  const char *arena = secure_getenv ("GCN_TEAM_ARENA_SIZE");
  if (arena)
    {
      int tmp = atoi (arena);
      if (tmp)
        team_arena_size = tmp;
    }

  const char *stack = secure_getenv ("GCN_STACK_SIZE");
  if (stack)
    {
      int tmp = atoi (stack);
      if (tmp)
        stack_size = tmp;
    }
}
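
/* An illustrative debugging environment, using the variables parsed above:

     GCN_DEBUG=1 GCN_NUM_TEAMS=8 GCN_STACK_SIZE=131072 ./a.out

   All values are read via secure_getenv, so they are deliberately ignored
   in SUID/SGID contexts.  */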

/* Return malloc'd string with name of SYMBOL.  */

static char *
get_executable_symbol_name (hsa_executable_symbol_t symbol)
{
  hsa_status_t status;
  char *res;
  uint32_t len;
  const hsa_executable_symbol_info_t info_name_length
    = HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH;

  status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name_length,
                                                      &len);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not get length of symbol name", status);
      return NULL;
    }

  res = GOMP_PLUGIN_malloc (len + 1);

  const hsa_executable_symbol_info_t info_name
    = HSA_EXECUTABLE_SYMBOL_INFO_NAME;

  status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name, res);

  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not get symbol name", status);
      free (res);
      return NULL;
    }

  res[len] = '\0';

  return res;
}

/* Get the number of GPU Compute Units.  */

static int
get_cu_count (struct agent_info *agent)
{
  uint32_t cu_count;
  hsa_status_t status = hsa_fns.hsa_agent_get_info_fn
    (agent->id, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
  if (status == HSA_STATUS_SUCCESS)
    return cu_count;
  else
    return 64;  /* The usual number for older devices.  */
}

/* Calculate the maximum grid size for OMP threads / OACC workers.
   This depends on the kernel's resource usage levels.  */

static int
limit_worker_threads (int threads)
{
  /* FIXME Do something more intelligent here.
     GCN can always run 4 threads within a Compute Unit, but
     more than that depends on register usage.  */
  if (threads > 16)
    threads = 16;
  return threads;
}

/* This sets the maximum number of teams to twice the number of GPU Compute
   Units to avoid memory waste and corresponding memory access faults.  */

static int
limit_teams (int teams, struct agent_info *agent)
{
  int max_teams = 2 * get_cu_count (agent);
  if (teams > max_teams)
    teams = max_teams;
  return teams;
}
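
/* For example, on an agent reporting 60 compute units, limit_teams clamps
   any request above 120 teams to 120, while limit_worker_threads caps the
   thread dimension at 16 regardless of the request.  */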

/* Parse the target attributes INPUT provided by the compiler and return true
   if we should run anything at all.  If INPUT is NULL, fill DEF with default
   values, then store INPUT or DEF into *RESULT.

   This is used for OpenMP only.  */

static bool
parse_target_attributes (void **input,
                         struct GOMP_kernel_launch_attributes *def,
                         struct GOMP_kernel_launch_attributes **result,
                         struct agent_info *agent)
{
  if (!input)
    GOMP_PLUGIN_fatal ("No target arguments provided");

  bool grid_attrs_found = false;
  bool gcn_dims_found = false;
  int gcn_teams = 0;
  int gcn_threads = 0;
  while (*input)
    {
      intptr_t id = (intptr_t) *input++, val;

      if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
        val = (intptr_t) *input++;
      else
        val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;

      val = (val > INT_MAX) ? INT_MAX : val;

      if ((id & GOMP_TARGET_ARG_DEVICE_MASK) == GOMP_DEVICE_GCN
          && ((id & GOMP_TARGET_ARG_ID_MASK)
              == GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES))
        {
          grid_attrs_found = true;
          break;
        }
      else if ((id & GOMP_TARGET_ARG_DEVICE_MASK)
               == GOMP_TARGET_ARG_DEVICE_ALL)
        {
          gcn_dims_found = true;
          switch (id & GOMP_TARGET_ARG_ID_MASK)
            {
            case GOMP_TARGET_ARG_NUM_TEAMS:
              gcn_teams = limit_teams (val, agent);
              break;
            case GOMP_TARGET_ARG_THREAD_LIMIT:
              gcn_threads = limit_worker_threads (val);
              break;
            default:
              ;
            }
        }
    }

  if (gcn_dims_found)
    {
      bool gfx900_workaround_p = false;

      if (agent->device_isa == EF_AMDGPU_MACH_AMDGCN_GFX900
          && gcn_threads == 0 && override_z_dim == 0)
        {
          gfx900_workaround_p = true;
          GCN_WARNING ("VEGA BUG WORKAROUND: reducing default number of "
                       "threads to at most 4 per team.\n");
          GCN_WARNING (" - If this is not a Vega 10 device, please use "
                       "GCN_NUM_THREADS=16\n");
        }

      /* Ideally, when a dimension isn't explicitly specified, we should
         tune it to run 40 (or 32?) threads per CU with no threads getting
         queued.  In practice, we tune for peak performance on BabelStream,
         which for OpenACC is currently 32 threads per CU.  */
      def->ndim = 3;
      if (gcn_teams <= 0 && gcn_threads <= 0)
        {
          /* Set up a reasonable number of teams and threads.  */
          gcn_threads = gfx900_workaround_p ? 4 : 16; // 8;
          def->gdims[0] = get_cu_count (agent); // * (40 / gcn_threads);
          def->gdims[2] = gcn_threads;
        }
      else if (gcn_teams <= 0 && gcn_threads > 0)
        {
          /* Auto-scale the number of teams with the number of threads.  */
          def->gdims[0] = get_cu_count (agent); // * (40 / gcn_threads);
          def->gdims[2] = gcn_threads;
        }
      else if (gcn_teams > 0 && gcn_threads <= 0)
        {
          int max_threads = gfx900_workaround_p ? 4 : 16;

          /* Auto-scale the number of threads with the number of teams.  */
          def->gdims[0] = gcn_teams;
          def->gdims[2] = 16; // get_cu_count (agent) * 40 / gcn_teams;
          if (def->gdims[2] == 0)
            def->gdims[2] = 1;
          else if (def->gdims[2] > max_threads)
            def->gdims[2] = max_threads;
        }
      else
        {
          def->gdims[0] = gcn_teams;
          def->gdims[2] = gcn_threads;
        }
      def->gdims[1] = 64;  /* Each thread is 64 work items wide.  */
      def->wdims[0] = 1;   /* Single team per work-group.  */
      def->wdims[1] = 64;
      def->wdims[2] = 16;
      *result = def;
      return true;
    }
  else if (!grid_attrs_found)
    {
      def->ndim = 1;
      def->gdims[0] = 1;
      def->gdims[1] = 1;
      def->gdims[2] = 1;
      def->wdims[0] = 1;
      def->wdims[1] = 1;
      def->wdims[2] = 1;
      *result = def;
      GCN_WARNING ("GOMP_OFFLOAD_run called with no launch attributes\n");
      return true;
    }

  struct GOMP_kernel_launch_attributes *kla;
  kla = (struct GOMP_kernel_launch_attributes *) *input;
  *result = kla;
  if (kla->ndim == 0 || kla->ndim > 3)
    GOMP_PLUGIN_fatal ("Invalid number of dimensions (%u)", kla->ndim);

  GCN_DEBUG ("GOMP_OFFLOAD_run called with %u dimensions:\n", kla->ndim);
  unsigned i;
  for (i = 0; i < kla->ndim; i++)
    {
      GCN_DEBUG ("  Dimension %u: grid size %u and group size %u\n", i,
                 kla->gdims[i], kla->wdims[i]);
      if (kla->gdims[i] == 0)
        return false;
    }
  return true;
}
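
/* As an illustration (hypothetical clause values): for
   "#pragma omp target teams num_teams(8) thread_limit(4)" libgomp passes
   GOMP_TARGET_ARG_DEVICE_ALL entries whose ID fields decode to
   GOMP_TARGET_ARG_NUM_TEAMS with value 8 and GOMP_TARGET_ARG_THREAD_LIMIT
   with value 4, so the loop above yields gcn_teams = 8 and
   gcn_threads = 4.  */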

/* Return the group size given the requested GROUP size, GRID size and number
   of grid dimensions NDIM.  */

static uint32_t
get_group_size (uint32_t ndim, uint32_t grid, uint32_t group)
{
  if (group == 0)
    {
      /* TODO: Provide a default via environment or device characteristics.  */
      if (ndim == 1)
        group = 64;
      else if (ndim == 2)
        group = 8;
      else
        group = 4;
    }

  if (group > grid)
    group = grid;
  return group;
}
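
/* For instance, get_group_size (1, 1000, 0) selects the 1-D default of 64
   work items, whereas get_group_size (1, 40, 0) clamps the result to the
   grid size, 40.  */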

/* Atomically store pair of uint16_t values (HEADER and REST) to a PACKET.  */

static void
packet_store_release (uint32_t* packet, uint16_t header, uint16_t rest)
{
  __atomic_store_n (packet, header | (rest << 16), __ATOMIC_RELEASE);
}
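
/* This layout matches the start of an HSA AQL packet: a 16-bit header
   followed by a 16-bit setup word.  Publishing both halves in a single
   release store is what hands a completely initialized packet over to the
   GPU's packet processor.  A hypothetical call site might look like:

     packet_store_release ((uint32_t *) aql_packet, header, setup);  */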

/* A never-called callback for the HSA command queues.  These signal events
   that we don't use, so we trigger an error.

   This "queue" is not to be confused with the async queues, below.  */

static void
hsa_queue_callback (hsa_status_t status,
                    hsa_queue_t *queue __attribute__ ((unused)),
                    void *data __attribute__ ((unused)))
{
  hsa_fatal ("Asynchronous queue error", status);
}

/* }}} */
/* {{{ HSA initialization */

/* Populate hsa_fns with the function addresses from libhsa-runtime64.so.  */

static bool
init_hsa_runtime_functions (void)
{
#define DLSYM_FN(function) \
  hsa_fns.function##_fn = dlsym (handle, #function); \
  if (hsa_fns.function##_fn == NULL) \
    return false;
  void *handle = dlopen (hsa_runtime_lib, RTLD_LAZY);
  if (handle == NULL)
    return false;

  DLSYM_FN (hsa_status_string)
  DLSYM_FN (hsa_system_get_info)
  DLSYM_FN (hsa_agent_get_info)
  DLSYM_FN (hsa_init)
  DLSYM_FN (hsa_iterate_agents)
  DLSYM_FN (hsa_region_get_info)
  DLSYM_FN (hsa_queue_create)
  DLSYM_FN (hsa_agent_iterate_regions)
  DLSYM_FN (hsa_executable_destroy)
  DLSYM_FN (hsa_executable_create)
  DLSYM_FN (hsa_executable_global_variable_define)
  DLSYM_FN (hsa_executable_load_code_object)
  DLSYM_FN (hsa_executable_freeze)
  DLSYM_FN (hsa_signal_create)
  DLSYM_FN (hsa_memory_allocate)
  DLSYM_FN (hsa_memory_assign_agent)
  DLSYM_FN (hsa_memory_copy)
  DLSYM_FN (hsa_memory_free)
  DLSYM_FN (hsa_signal_destroy)
  DLSYM_FN (hsa_executable_get_symbol)
  DLSYM_FN (hsa_executable_symbol_get_info)
  DLSYM_FN (hsa_executable_iterate_symbols)
  DLSYM_FN (hsa_queue_add_write_index_release)
  DLSYM_FN (hsa_queue_load_read_index_acquire)
  DLSYM_FN (hsa_signal_wait_acquire)
  DLSYM_FN (hsa_signal_store_relaxed)
  DLSYM_FN (hsa_signal_store_release)
  DLSYM_FN (hsa_signal_load_acquire)
  DLSYM_FN (hsa_queue_destroy)
  DLSYM_FN (hsa_code_object_deserialize)
  return true;
#undef DLSYM_FN
}
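
/* Each DLSYM_FN use above expands to a lookup plus a NULL check; for
   example, DLSYM_FN (hsa_init) becomes

     hsa_fns.hsa_init_fn = dlsym (handle, "hsa_init");
     if (hsa_fns.hsa_init_fn == NULL)
       return false;

   so one missing entry point makes the whole initialization fail
   cleanly.  */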

/* Return true if the agent is a GPU and can accept concurrent submissions
   from different threads.  */

static bool
suitable_hsa_agent_p (hsa_agent_t agent)
{
  hsa_device_type_t device_type;
  hsa_status_t status
    = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
                                     &device_type);
  if (status != HSA_STATUS_SUCCESS)
    return false;

  switch (device_type)
    {
    case HSA_DEVICE_TYPE_GPU:
      break;
    case HSA_DEVICE_TYPE_CPU:
      if (!support_cpu_devices)
        return false;
      break;
    default:
      return false;
    }

  uint32_t features = 0;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_FEATURE,
                                          &features);
  if (status != HSA_STATUS_SUCCESS
      || !(features & HSA_AGENT_FEATURE_KERNEL_DISPATCH))
    return false;
  hsa_queue_type_t queue_type;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_QUEUE_TYPE,
                                          &queue_type);
  if (status != HSA_STATUS_SUCCESS
      || (queue_type != HSA_QUEUE_TYPE_MULTI))
    return false;

  return true;
}

/* Callback of hsa_iterate_agents; if AGENT is a GPU device, increment
   agent_count in hsa_context.  */

static hsa_status_t
count_gpu_agents (hsa_agent_t agent, void *data __attribute__ ((unused)))
{
  if (suitable_hsa_agent_p (agent))
    hsa_context.agent_count++;
  return HSA_STATUS_SUCCESS;
}

/* Callback of hsa_iterate_agents; if AGENT is a GPU device, assign the agent
   id to the describing structure in the hsa context.  The index of the
   structure is pointed to by DATA, increment it afterwards.  */

static hsa_status_t
assign_agent_ids (hsa_agent_t agent, void *data)
{
  if (suitable_hsa_agent_p (agent))
    {
      int *agent_index = (int *) data;
      hsa_context.agents[*agent_index].id = agent;
      ++*agent_index;
    }
  return HSA_STATUS_SUCCESS;
}

/* Initialize hsa_context if it has not already been done.
   Return TRUE on success.  */

static bool
init_hsa_context (void)
{
  hsa_status_t status;
  int agent_index = 0;

  if (hsa_context.initialized)
    return true;
  init_environment_variables ();
  if (!init_hsa_runtime_functions ())
    {
      GCN_WARNING ("Run-time could not be dynamically opened\n");
      if (suppress_host_fallback)
        GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
      return false;
    }
  status = hsa_fns.hsa_init_fn ();
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Run-time could not be initialized", status);
  GCN_DEBUG ("HSA run-time initialized for GCN\n");

  if (debug)
    dump_hsa_system_info ();

  status = hsa_fns.hsa_iterate_agents_fn (count_gpu_agents, NULL);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("GCN GPU devices could not be enumerated", status);
  GCN_DEBUG ("There are %i GCN GPU devices.\n", hsa_context.agent_count);

  hsa_context.agents
    = GOMP_PLUGIN_malloc_cleared (hsa_context.agent_count
                                  * sizeof (struct agent_info));
  status = hsa_fns.hsa_iterate_agents_fn (assign_agent_ids, &agent_index);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Scanning compute agents failed", status);
  if (agent_index != hsa_context.agent_count)
    {
      GOMP_PLUGIN_error ("Failed to assign IDs to all GCN agents");
      return false;
    }

  if (debug)
    {
      status = hsa_fns.hsa_iterate_agents_fn (dump_hsa_agent_info, NULL);
      if (status != HSA_STATUS_SUCCESS)
        GOMP_PLUGIN_error ("Failed to list all HSA runtime agents");
    }

  uint16_t minor, major;
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MINOR,
                                           &minor);
  if (status != HSA_STATUS_SUCCESS)
    GOMP_PLUGIN_error ("Failed to obtain HSA runtime minor version");
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MAJOR,
                                           &major);
  if (status != HSA_STATUS_SUCCESS)
    GOMP_PLUGIN_error ("Failed to obtain HSA runtime major version");

  size_t len = sizeof hsa_context.driver_version_s;
  int printed = snprintf (hsa_context.driver_version_s, len,
                          "HSA Runtime %hu.%hu", (unsigned short int)major,
                          (unsigned short int)minor);
  if (printed >= len)
    GCN_WARNING ("HSA runtime version string was truncated. "
                 "Version %hu.%hu is too long.", (unsigned short int)major,
                 (unsigned short int)minor);

  hsa_context.initialized = true;
  return true;
}

/* Verify that hsa_context has already been initialized and return the
   agent_info structure describing device number N.  Return NULL on error.  */

static struct agent_info *
get_agent_info (int n)
{
  if (!hsa_context.initialized)
    {
      GOMP_PLUGIN_error ("Attempt to use uninitialized GCN context.");
      return NULL;
    }
  if (n >= hsa_context.agent_count)
    {
      GOMP_PLUGIN_error ("Request to operate on non-existent GCN device %i", n);
      return NULL;
    }
  if (!hsa_context.agents[n].initialized)
    {
      GOMP_PLUGIN_error ("Attempt to use an uninitialized GCN agent.");
      return NULL;
    }
  return &hsa_context.agents[n];
}

/* Callback of hsa_agent_iterate_regions, via get_*_memory_region functions.

   Selects (breaks at) a suitable region of type KIND.  */

static hsa_status_t
get_memory_region (hsa_region_t region, hsa_region_t *retval,
                   hsa_region_global_flag_t kind)
{
  hsa_status_t status;
  hsa_region_segment_t segment;

  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
                                           &segment);
  if (status != HSA_STATUS_SUCCESS)
    return status;
  if (segment != HSA_REGION_SEGMENT_GLOBAL)
    return HSA_STATUS_SUCCESS;

  uint32_t flags;
  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
                                           &flags);
  if (status != HSA_STATUS_SUCCESS)
    return status;
  if (flags & kind)
    {
      *retval = region;
      return HSA_STATUS_INFO_BREAK;
    }
  return HSA_STATUS_SUCCESS;
}

/* Callback of hsa_agent_iterate_regions.

   Selects a kernargs memory region.  */

static hsa_status_t
get_kernarg_memory_region (hsa_region_t region, void *data)
{
  return get_memory_region (region, (hsa_region_t *)data,
                            HSA_REGION_GLOBAL_FLAG_KERNARG);
}

/* Callback of hsa_agent_iterate_regions.

   Selects a coarse-grained memory region suitable for the heap and
   offload data.  */

static hsa_status_t
get_data_memory_region (hsa_region_t region, void *data)
{
  return get_memory_region (region, (hsa_region_t *)data,
                            HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED);
}

/* Return the ISA field from an ELF header's e_flags.  */

static int
elf_gcn_isa_field (Elf64_Ehdr *image)
{
  return image->e_flags & EF_AMDGPU_MACH_MASK;
}

const static char *gcn_gfx803_s = "gfx803";
const static char *gcn_gfx900_s = "gfx900";
const static char *gcn_gfx906_s = "gfx906";
const static char *gcn_gfx908_s = "gfx908";
const static char *gcn_gfx90a_s = "gfx90a";
const static int gcn_isa_name_len = 6;

/* Returns the name that the HSA runtime uses for the ISA or NULL if we do not
   support the ISA.  */

static const char*
isa_hsa_name (int isa) {
  switch (isa)
    {
    case EF_AMDGPU_MACH_AMDGCN_GFX803:
      return gcn_gfx803_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX900:
      return gcn_gfx900_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX906:
      return gcn_gfx906_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX908:
      return gcn_gfx908_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX90a:
      return gcn_gfx90a_s;
    }
  return NULL;
}

/* Returns the user-facing name that GCC uses to identify the architecture
   (e.g. with -march) or NULL if we do not support the ISA.
   Keep in sync with /gcc/config/gcn/gcn.{c,opt}.  */

static const char*
isa_gcc_name (int isa) {
  switch (isa)
    {
    case EF_AMDGPU_MACH_AMDGCN_GFX803:
      return "fiji";
    default:
      return isa_hsa_name (isa);
    }
}

/* Returns the code which is used in the GCN object code to identify the ISA
   with the given name (as used by the HSA runtime).  */

static gcn_isa
isa_code (const char *isa) {
  if (!strncmp (isa, gcn_gfx803_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX803;

  if (!strncmp (isa, gcn_gfx900_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX900;

  if (!strncmp (isa, gcn_gfx906_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX906;

  if (!strncmp (isa, gcn_gfx908_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX908;

  if (!strncmp (isa, gcn_gfx90a_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX90a;

  return -1;
}
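
/* The two mappings above are inverses for supported devices; e.g.
   isa_code ("gfx906") returns EF_AMDGPU_MACH_AMDGCN_GFX906 and
   isa_hsa_name (EF_AMDGPU_MACH_AMDGCN_GFX906) returns "gfx906" again.
   Note that isa_code compares only the first gcn_isa_name_len (6)
   characters of its argument.  */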

/* }}} */
/* {{{ Run */

/* Create or reuse a team arena and stack space.

   Team arenas are used by OpenMP to avoid calling malloc multiple times
   while setting up each team.  This is purely a performance optimization.

   The stack space is used by all kernels.  We must allocate it in such a
   way that the reverse offload implementation can access the data.

   Allocating this memory costs performance, so this function will reuse an
   existing allocation if a large enough one is idle.
   The memory lock is released, but not deallocated, when the kernel exits.  */

static void
configure_ephemeral_memories (struct kernel_info *kernel,
                              struct kernargs_abi *kernargs, int num_teams,
                              int num_threads)
{
  struct agent_info *agent = kernel->agent;
  struct ephemeral_memories_list **next_ptr = &agent->ephemeral_memories_list;
  struct ephemeral_memories_list *item;

  int actual_arena_size = (kernel->kind == KIND_OPENMP
                           ? team_arena_size : 0);
  int actual_arena_total_size = actual_arena_size * num_teams;
  size_t size = (actual_arena_total_size
                 + num_teams * num_threads * stack_size);

  for (item = *next_ptr; item; next_ptr = &item->next, item = item->next)
    {
      if (item->size < size)
        continue;

      if (pthread_mutex_trylock (&item->in_use) == 0)
        break;
    }

  if (!item)
    {
      GCN_DEBUG ("Creating a new %sstack for %d teams with %d threads"
                 " (%zd bytes)\n", (actual_arena_size ? "arena and " : ""),
                 num_teams, num_threads, size);

      if (pthread_mutex_lock (&agent->ephemeral_memories_write_lock))
        {
          GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
          return;
        }
      item = malloc (sizeof (*item));
      item->size = size;
      item->next = NULL;
      *next_ptr = item;

      if (pthread_mutex_init (&item->in_use, NULL))
        {
          GOMP_PLUGIN_error ("Failed to initialize a GCN memory write mutex");
          return;
        }
      if (pthread_mutex_lock (&item->in_use))
        {
          GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
          return;
        }
      if (pthread_mutex_unlock (&agent->ephemeral_memories_write_lock))
        {
          GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
          return;
        }

      hsa_status_t status;
      status = hsa_fns.hsa_memory_allocate_fn (agent->data_region, size,
                                               &item->address);
      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not allocate memory for GCN kernel arena", status);
      status = hsa_fns.hsa_memory_assign_agent_fn (item->address, agent->id,
                                                   HSA_ACCESS_PERMISSION_RW);
      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not assign arena & stack memory to device", status);
    }

  kernargs->arena_ptr = (actual_arena_total_size
                         ? (uint64_t)item->address
                         : 0);
  kernargs->stack_ptr = (uint64_t)item->address + actual_arena_total_size;
  kernargs->arena_size_per_team = actual_arena_size;
  kernargs->stack_size_per_thread = stack_size;
}
1786
f6fff8a6 1787/* Mark an ephemeral memory space available for reuse. */
237957cc
AS
1788
1789static void
f6fff8a6 1790release_ephemeral_memories (struct agent_info* agent, void *address)
237957cc 1791{
f6fff8a6 1792 struct ephemeral_memories_list *item;
237957cc 1793
f6fff8a6 1794 for (item = agent->ephemeral_memories_list; item; item = item->next)
237957cc 1795 {
f6fff8a6 1796 if (item->address == address)
237957cc
AS
1797 {
1798 if (pthread_mutex_unlock (&item->in_use))
1799 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1800 return;
1801 }
1802 }
1803 GOMP_PLUGIN_error ("Could not find a GCN arena to release.");
1804}
1805
1806/* Clean up all the allocated team arenas. */
1807
1808static bool
f6fff8a6 1809destroy_ephemeral_memories (struct agent_info *agent)
237957cc 1810{
f6fff8a6 1811 struct ephemeral_memories_list *item, *next;
237957cc 1812
f6fff8a6 1813 for (item = agent->ephemeral_memories_list; item; item = next)
237957cc
AS
1814 {
1815 next = item->next;
f6fff8a6 1816 hsa_fns.hsa_memory_free_fn (item->address);
237957cc
AS
1817 if (pthread_mutex_destroy (&item->in_use))
1818 {
f6fff8a6 1819 GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex");
237957cc
AS
1820 return false;
1821 }
1822 free (item);
1823 }
f6fff8a6 1824 agent->ephemeral_memories_list = NULL;
237957cc
AS
1825
1826 return true;
1827}
1828
1829/* Allocate memory on a specified device. */
1830
1831static void *
1832alloc_by_agent (struct agent_info *agent, size_t size)
1833{
1834 GCN_DEBUG ("Allocating %zu bytes on device %d\n", size, agent->device_id);
1835
1836 /* Zero-size allocations are invalid, so in order to return a valid pointer
1837 we need to pass a valid size. One source of zero-size allocations is
1838 kernargs for kernels that have no inputs or outputs (the kernel may
1839 only use console output, for example). */
1840 if (size == 0)
1841 size = 4;
1842
1843 void *ptr;
1844 hsa_status_t status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
1845 size, &ptr);
1846 if (status != HSA_STATUS_SUCCESS)
1847 {
1848 hsa_error ("Could not allocate device memory", status);
1849 return NULL;
1850 }
1851
1852 status = hsa_fns.hsa_memory_assign_agent_fn (ptr, agent->id,
1853 HSA_ACCESS_PERMISSION_RW);
1854 if (status != HSA_STATUS_SUCCESS)
1855 {
1856 hsa_error ("Could not assign data memory to device", status);
1857 return NULL;
1858 }
1859
1860 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1861 bool profiling_dispatch_p
1862 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1863 if (profiling_dispatch_p)
1864 {
1865 acc_prof_info *prof_info = thr->prof_info;
1866 acc_event_info data_event_info;
1867 acc_api_info *api_info = thr->api_info;
1868
1869 prof_info->event_type = acc_ev_alloc;
1870
1871 data_event_info.data_event.event_type = prof_info->event_type;
1872 data_event_info.data_event.valid_bytes
1873 = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1874 data_event_info.data_event.parent_construct
1875 = acc_construct_parallel;
1876 data_event_info.data_event.implicit = 1;
1877 data_event_info.data_event.tool_info = NULL;
1878 data_event_info.data_event.var_name = NULL;
1879 data_event_info.data_event.bytes = size;
1880 data_event_info.data_event.host_ptr = NULL;
1881 data_event_info.data_event.device_ptr = (void *) ptr;
1882
1883 api_info->device_api = acc_device_api_other;
1884
1885 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1886 api_info);
1887 }
1888
1889 return ptr;
1890}
1891
1892/* Create kernel dispatch data structure for given KERNEL, along with
1893 the necessary device signals and memory allocations. */
1894
1895static struct kernel_dispatch *
f6fff8a6
AS
1896create_kernel_dispatch (struct kernel_info *kernel, int num_teams,
1897 int num_threads)
237957cc
AS
1898{
1899 struct agent_info *agent = kernel->agent;
1900 struct kernel_dispatch *shadow
1901 = GOMP_PLUGIN_malloc_cleared (sizeof (struct kernel_dispatch));
1902
1903 shadow->agent = kernel->agent;
1904 shadow->object = kernel->object;
1905
1906 hsa_signal_t sync_signal;
1907 hsa_status_t status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &sync_signal);
1908 if (status != HSA_STATUS_SUCCESS)
1909 hsa_fatal ("Error creating the GCN sync signal", status);
1910
1911 shadow->signal = sync_signal.handle;
1912 shadow->private_segment_size = kernel->private_segment_size;
1913 shadow->group_segment_size = kernel->group_segment_size;
1914
1915 /* We expect kernels to request a single pointer, explicitly, and the
1916 rest of struct kernargs, implicitly. If they request anything else
1917 then something is wrong. */
1918 if (kernel->kernarg_segment_size > 8)
1919 {
1920 GOMP_PLUGIN_fatal ("Unexpectedly large kernargs segment requested");
1921 return NULL;
1922 }
1923
1924 status = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region,
1925 sizeof (struct kernargs),
1926 &shadow->kernarg_address);
1927 if (status != HSA_STATUS_SUCCESS)
1928 hsa_fatal ("Could not allocate memory for GCN kernel arguments", status);
1929 struct kernargs *kernargs = shadow->kernarg_address;
1930
1931 /* Zero-initialize the output_data (minimum needed). */
f6fff8a6 1932 kernargs->abi.out_ptr = (int64_t)&kernargs->output_data;
237957cc
AS
1933 kernargs->output_data.next_output = 0;
1934 for (unsigned i = 0;
1935 i < (sizeof (kernargs->output_data.queue)
1936 / sizeof (kernargs->output_data.queue[0]));
1937 i++)
1938 kernargs->output_data.queue[i].written = 0;
1939 kernargs->output_data.consumed = 0;
1940
1941 /* Pass in the heap location. */
f6fff8a6 1942 kernargs->abi.heap_ptr = (int64_t)kernel->module->heap;
237957cc 1943
f6fff8a6
AS
1944 /* Create the ephemeral memory spaces. */
1945 configure_ephemeral_memories (kernel, &kernargs->abi, num_teams, num_threads);
237957cc
AS
1946
1947 /* Ensure we can recognize unset return values. */
1948 kernargs->output_data.return_value = 0xcafe0000;
1949
1950 return shadow;
1951}
1952
8c05d8cd 1953static void
6edcb5dc
TB
1954process_reverse_offload (uint64_t fn, uint64_t mapnum, uint64_t hostaddrs,
1955 uint64_t sizes, uint64_t kinds, uint64_t dev_num64)
8c05d8cd
TB
1956{
1957 int dev_num = dev_num64;
6edcb5dc
TB
1958 GOMP_PLUGIN_target_rev (fn, mapnum, hostaddrs, sizes, kinds, dev_num,
1959 NULL, NULL, NULL);
8c05d8cd
TB
1960}
1961
237957cc
AS
1962/* Output any data written to console output from the kernel. It is expected
1963 that this function is polled during kernel execution.
1964
1965 We print all entries from the last item printed to the next entry without
1966 a "written" flag. If the "final" flag is set then it'll continue right to
1967 the end.
1968
1969 The print buffer is circular, but the from and to locations don't wrap when
1970 the buffer does, so the output limit is UINT_MAX. The target blocks on
1971 output when the buffer is full. */
1972
1973static void
1974console_output (struct kernel_info *kernel, struct kernargs *kernargs,
1975 bool final)
1976{
1977 unsigned int limit = (sizeof (kernargs->output_data.queue)
1978 / sizeof (kernargs->output_data.queue[0]));
1979
1980 unsigned int from = __atomic_load_n (&kernargs->output_data.consumed,
1981 __ATOMIC_ACQUIRE);
1982 unsigned int to = kernargs->output_data.next_output;
1983
1984 if (from > to)
1985 {
1986 /* Overflow. */
1987 if (final)
1988 printf ("GCN print buffer overflowed.\n");
1989 return;
1990 }
1991
1992 unsigned int i;
1993 for (i = from; i < to; i++)
1994 {
1995 struct printf_data *data = &kernargs->output_data.queue[i%limit];
1996
1997 if (!data->written && !final)
1998 break;
1999
2000 switch (data->type)
2001 {
2002 case 0: printf ("%.128s%ld\n", data->msg, data->ivalue); break;
2003 case 1: printf ("%.128s%f\n", data->msg, data->dvalue); break;
2004 case 2: printf ("%.128s%.128s\n", data->msg, data->text); break;
2005 case 3: printf ("%.128s%.128s", data->msg, data->text); break;
8c05d8cd 2006 case 4:
6edcb5dc
TB
2007 process_reverse_offload (data->value_u64[0], data->value_u64[1],
2008 data->value_u64[2], data->value_u64[3],
2009 data->value_u64[4], data->value_u64[5]);
8c05d8cd 2010 break;
237957cc
AS
2011 default: printf ("GCN print buffer error!\n"); break;
2012 }
2013 data->written = 0;
2014 __atomic_store_n (&kernargs->output_data.consumed, i+1,
2015 __ATOMIC_RELEASE);
2016 }
2017 fflush (stdout);
2018}
2019
2020/* Release data structure created for a kernel dispatch in SHADOW argument,
2021 and clean up the signal and memory allocations. */
2022
2023static void
2024release_kernel_dispatch (struct kernel_dispatch *shadow)
2025{
2026 GCN_DEBUG ("Released kernel dispatch: %p\n", shadow);
2027
2028 struct kernargs *kernargs = shadow->kernarg_address;
f6fff8a6
AS
2029 void *addr = (void *)kernargs->abi.arena_ptr;
2030 if (!addr)
2031 addr = (void *)kernargs->abi.stack_ptr;
2032 release_ephemeral_memories (shadow->agent, addr);
237957cc
AS
2033
2034 hsa_fns.hsa_memory_free_fn (shadow->kernarg_address);
2035
2036 hsa_signal_t s;
2037 s.handle = shadow->signal;
2038 hsa_fns.hsa_signal_destroy_fn (s);
2039
2040 free (shadow);
2041}
2042
2043/* Extract the properties from a kernel binary. */
2044
2045static void
2046init_kernel_properties (struct kernel_info *kernel)
2047{
2048 hsa_status_t status;
2049 struct agent_info *agent = kernel->agent;
2050 hsa_executable_symbol_t kernel_symbol;
f062c3f1
AS
2051 char *buf = alloca (strlen (kernel->name) + 4);
2052 sprintf (buf, "%s.kd", kernel->name);
237957cc 2053 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
f062c3f1 2054 buf, agent->id,
237957cc
AS
2055 0, &kernel_symbol);
2056 if (status != HSA_STATUS_SUCCESS)
2057 {
2058 hsa_warn ("Could not find symbol for kernel in the code object", status);
f062c3f1 2059 fprintf (stderr, "not found name: '%s'\n", buf);
237957cc
AS
2060 dump_executable_symbols (agent->executable);
2061 goto failure;
2062 }
2063 GCN_DEBUG ("Located kernel %s\n", kernel->name);
2064 status = hsa_fns.hsa_executable_symbol_get_info_fn
2065 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel->object);
2066 if (status != HSA_STATUS_SUCCESS)
2067 hsa_fatal ("Could not extract a kernel object from its symbol", status);
2068 status = hsa_fns.hsa_executable_symbol_get_info_fn
2069 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
2070 &kernel->kernarg_segment_size);
2071 if (status != HSA_STATUS_SUCCESS)
2072 hsa_fatal ("Could not get info about kernel argument size", status);
2073 status = hsa_fns.hsa_executable_symbol_get_info_fn
2074 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
2075 &kernel->group_segment_size);
2076 if (status != HSA_STATUS_SUCCESS)
2077 hsa_fatal ("Could not get info about kernel group segment size", status);
2078 status = hsa_fns.hsa_executable_symbol_get_info_fn
2079 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
2080 &kernel->private_segment_size);
2081 if (status != HSA_STATUS_SUCCESS)
2082 hsa_fatal ("Could not get info about kernel private segment size",
2083 status);
2084
2085 /* The kernel type is not known until something tries to launch it. */
2086 kernel->kind = KIND_UNKNOWN;
2087
2088 GCN_DEBUG ("Kernel structure for %s fully initialized with "
2089 "following segment sizes: \n", kernel->name);
2090 GCN_DEBUG (" group_segment_size: %u\n",
2091 (unsigned) kernel->group_segment_size);
2092 GCN_DEBUG (" private_segment_size: %u\n",
2093 (unsigned) kernel->private_segment_size);
2094 GCN_DEBUG (" kernarg_segment_size: %u\n",
2095 (unsigned) kernel->kernarg_segment_size);
2096 return;
2097
2098failure:
2099 kernel->initialization_failed = true;
2100}
2101
2102/* Do all the work that is necessary before running KERNEL for the first time.
2103 The function assumes the program has been created, finalized and frozen by
2104 create_and_finalize_hsa_program. */
2105
2106static void
2107init_kernel (struct kernel_info *kernel)
2108{
2109 if (pthread_mutex_lock (&kernel->init_mutex))
2110 GOMP_PLUGIN_fatal ("Could not lock a GCN kernel initialization mutex");
2111 if (kernel->initialized)
2112 {
2113 if (pthread_mutex_unlock (&kernel->init_mutex))
2114 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2115 "mutex");
2116
2117 return;
2118 }
2119
2120 init_kernel_properties (kernel);
2121
2122 if (!kernel->initialization_failed)
2123 {
2124 GCN_DEBUG ("\n");
2125
2126 kernel->initialized = true;
2127 }
2128 if (pthread_mutex_unlock (&kernel->init_mutex))
2129 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2130 "mutex");
2131}
2132
2133/* Run KERNEL on its agent, pass VARS to it as arguments and take
2134 launch attributes from KLA.
2135
2136 MODULE_LOCKED indicates that the caller already holds the lock and
2137 run_kernel need not lock it again.
2138 If AQ is NULL then agent->sync_queue will be used. */
2139
2140static void
2141run_kernel (struct kernel_info *kernel, void *vars,
2142 struct GOMP_kernel_launch_attributes *kla,
2143 struct goacc_asyncqueue *aq, bool module_locked)
2144{
5a28e272
KCY
2145 GCN_DEBUG ("SGPRs: %d, VGPRs: %d\n", kernel->description->sgpr_count,
2146 kernel->description->vpgr_count);
2147
2148 /* Reduce the number of threads/workers if there are insufficient
2149 VGPRs available to run the kernels together. */
2150 if (kla->ndim == 3 && kernel->description->vpgr_count > 0)
2151 {
2152 int granulated_vgprs = (kernel->description->vpgr_count + 3) & ~3;
2153 int max_threads = (256 / granulated_vgprs) * 4;
2154 if (kla->gdims[2] > max_threads)
2155 {
2156 GCN_WARNING ("Too many VGPRs required to support %d threads/workers"
2157 " per team/gang - reducing to %d threads/workers.\n",
2158 kla->gdims[2], max_threads);
2159 kla->gdims[2] = max_threads;
2160 }
2161 }
2162
237957cc
AS
2163 GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel->agent->device_id,
2164 (aq ? aq->id : 0));
2165 GCN_DEBUG ("GCN launch attribs: gdims:[");
2166 int i;
2167 for (i = 0; i < kla->ndim; ++i)
2168 {
2169 if (i)
2170 DEBUG_PRINT (", ");
2171 DEBUG_PRINT ("%u", kla->gdims[i]);
2172 }
2173 DEBUG_PRINT ("], normalized gdims:[");
2174 for (i = 0; i < kla->ndim; ++i)
2175 {
2176 if (i)
2177 DEBUG_PRINT (", ");
2178 DEBUG_PRINT ("%u", kla->gdims[i] / kla->wdims[i]);
2179 }
2180 DEBUG_PRINT ("], wdims:[");
2181 for (i = 0; i < kla->ndim; ++i)
2182 {
2183 if (i)
2184 DEBUG_PRINT (", ");
2185 DEBUG_PRINT ("%u", kla->wdims[i]);
2186 }
2187 DEBUG_PRINT ("]\n");
2188 DEBUG_FLUSH ();
2189
2190 struct agent_info *agent = kernel->agent;
2191 if (!module_locked && pthread_rwlock_rdlock (&agent->module_rwlock))
2192 GOMP_PLUGIN_fatal ("Unable to read-lock a GCN agent rwlock");
2193
2194 if (!agent->initialized)
2195 GOMP_PLUGIN_fatal ("Agent must be initialized");
2196
2197 if (!kernel->initialized)
2198 GOMP_PLUGIN_fatal ("Called kernel must be initialized");
2199
2200 hsa_queue_t *command_q = (aq ? aq->hsa_queue : kernel->agent->sync_queue);
2201
2202 uint64_t index
2203 = hsa_fns.hsa_queue_add_write_index_release_fn (command_q, 1);
2204 GCN_DEBUG ("Got AQL index %llu\n", (long long int) index);
2205
2206 /* Wait until the queue is not full before writing the packet. */
2207 while (index - hsa_fns.hsa_queue_load_read_index_acquire_fn (command_q)
2208 >= command_q->size)
2209 ;
2210
2211 /* Do not allow the dimensions to be overridden when running
2212 constructors or destructors. */
2213 int override_x = kernel->kind == KIND_UNKNOWN ? 0 : override_x_dim;
2214 int override_z = kernel->kind == KIND_UNKNOWN ? 0 : override_z_dim;
2215
2216 hsa_kernel_dispatch_packet_t *packet;
2217 packet = ((hsa_kernel_dispatch_packet_t *) command_q->base_address)
2218 + index % command_q->size;
2219
2220 memset (((uint8_t *) packet) + 4, 0, sizeof (*packet) - 4);
2221 packet->grid_size_x = override_x ? : kla->gdims[0];
2222 packet->workgroup_size_x = get_group_size (kla->ndim,
2223 packet->grid_size_x,
2224 kla->wdims[0]);
2225
2226 if (kla->ndim >= 2)
2227 {
2228 packet->grid_size_y = kla->gdims[1];
2229 packet->workgroup_size_y = get_group_size (kla->ndim, kla->gdims[1],
2230 kla->wdims[1]);
2231 }
2232 else
2233 {
2234 packet->grid_size_y = 1;
2235 packet->workgroup_size_y = 1;
2236 }
2237
2238 if (kla->ndim == 3)
2239 {
2240 packet->grid_size_z = limit_worker_threads (override_z
2241 ? : kla->gdims[2]);
2242 packet->workgroup_size_z = get_group_size (kla->ndim,
2243 packet->grid_size_z,
2244 kla->wdims[2]);
2245 }
2246 else
2247 {
2248 packet->grid_size_z = 1;
2249 packet->workgroup_size_z = 1;
2250 }
2251
2252 GCN_DEBUG ("GCN launch actuals: grid:[%u, %u, %u],"
2253 " normalized grid:[%u, %u, %u], workgroup:[%u, %u, %u]\n",
2254 packet->grid_size_x, packet->grid_size_y, packet->grid_size_z,
2255 packet->grid_size_x / packet->workgroup_size_x,
2256 packet->grid_size_y / packet->workgroup_size_y,
2257 packet->grid_size_z / packet->workgroup_size_z,
2258 packet->workgroup_size_x, packet->workgroup_size_y,
2259 packet->workgroup_size_z);
2260
2261 struct kernel_dispatch *shadow
f6fff8a6
AS
2262 = create_kernel_dispatch (kernel, packet->grid_size_x,
2263 packet->grid_size_z);
237957cc
AS
2264 shadow->queue = command_q;
2265
2266 if (debug)
2267 {
2268 fprintf (stderr, "\nKernel has following dependencies:\n");
2269 print_kernel_dispatch (shadow, 2);
2270 }
2271
2272 packet->private_segment_size = kernel->private_segment_size;
2273 packet->group_segment_size = kernel->group_segment_size;
2274 packet->kernel_object = kernel->object;
2275 packet->kernarg_address = shadow->kernarg_address;
2276 hsa_signal_t s;
2277 s.handle = shadow->signal;
2278 packet->completion_signal = s;
2279 hsa_fns.hsa_signal_store_relaxed_fn (s, 1);
2280 memcpy (shadow->kernarg_address, &vars, sizeof (vars));
2281
2282 GCN_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");
2283
2284 uint16_t header;
2285 header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
2286 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
2287 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
2288
2289 GCN_DEBUG ("Going to dispatch kernel %s on device %d\n", kernel->name,
2290 agent->device_id);
2291
2292 packet_store_release ((uint32_t *) packet, header,
2293 (uint16_t) kla->ndim
2294 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);
2295
2296 hsa_fns.hsa_signal_store_release_fn (command_q->doorbell_signal,
2297 index);
2298
2299 GCN_DEBUG ("Kernel dispatched, waiting for completion\n");
2300
2301 /* Root signal waits with 1ms timeout. */
2302 while (hsa_fns.hsa_signal_wait_acquire_fn (s, HSA_SIGNAL_CONDITION_LT, 1,
2303 1000 * 1000,
2304 HSA_WAIT_STATE_BLOCKED) != 0)
2305 {
2306 console_output (kernel, shadow->kernarg_address, false);
2307 }
2308 console_output (kernel, shadow->kernarg_address, true);
2309
2310 struct kernargs *kernargs = shadow->kernarg_address;
2311 unsigned int return_value = (unsigned int)kernargs->output_data.return_value;
2312
2313 release_kernel_dispatch (shadow);
2314
2315 if (!module_locked && pthread_rwlock_unlock (&agent->module_rwlock))
2316 GOMP_PLUGIN_fatal ("Unable to unlock a GCN agent rwlock");
2317
2318 unsigned int upper = (return_value & ~0xffff) >> 16;
2319 if (upper == 0xcafe)
2320 ; // exit not called, normal termination.
2321 else if (upper == 0xffff)
2322 ; // exit called.
2323 else
2324 {
2325 GOMP_PLUGIN_error ("Possible kernel exit value corruption, 2 most"
2326 " significant bytes aren't 0xffff or 0xcafe: 0x%x\n",
2327 return_value);
2328 abort ();
2329 }
2330
2331 if (upper == 0xffff)
2332 {
2333 unsigned int signal = (return_value >> 8) & 0xff;
2334
2335 if (signal == SIGABRT)
2336 {
2337 GCN_WARNING ("GCN Kernel aborted\n");
2338 abort ();
2339 }
2340 else if (signal != 0)
2341 {
2342 GCN_WARNING ("GCN Kernel received unknown signal\n");
2343 abort ();
2344 }
2345
2346 GCN_DEBUG ("GCN Kernel exited with value: %d\n", return_value & 0xff);
2347 exit (return_value & 0xff);
2348 }
2349}
2350
2351/* }}} */
2352/* {{{ Load/Unload */
2353
2354/* Initialize KERNEL from D and other parameters. Return true on success. */
2355
2356static bool
2357init_basic_kernel_info (struct kernel_info *kernel,
2358 struct hsa_kernel_description *d,
2359 struct agent_info *agent,
2360 struct module_info *module)
2361{
2362 kernel->agent = agent;
2363 kernel->module = module;
2364 kernel->name = d->name;
5a28e272 2365 kernel->description = d;
237957cc
AS
2366 if (pthread_mutex_init (&kernel->init_mutex, NULL))
2367 {
2368 GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");
2369 return false;
2370 }
2371 return true;
2372}
2373
7d593fd6
FH
2374/* Check that the GCN ISA of the given image matches the ISA of the agent. */
2375
2376static bool
2377isa_matches_agent (struct agent_info *agent, Elf64_Ehdr *image)
2378{
2379 int isa_field = elf_gcn_isa_field (image);
2380 const char* isa_s = isa_hsa_name (isa_field);
2381 if (!isa_s)
2382 {
2383 hsa_error ("Unsupported ISA in GCN code object.", HSA_STATUS_ERROR);
2384 return false;
2385 }
2386
2387 if (isa_field != agent->device_isa)
2388 {
2389 char msg[120];
2390 const char *agent_isa_s = isa_hsa_name (agent->device_isa);
2391 const char *agent_isa_gcc_s = isa_gcc_name (agent->device_isa);
2392 assert (agent_isa_s);
2393 assert (agent_isa_gcc_s);
2394
2395 snprintf (msg, sizeof msg,
2396 "GCN code object ISA '%s' does not match GPU ISA '%s'.\n"
4a206161 2397 "Try to recompile with '-foffload-options=-march=%s'.\n",
7d593fd6
FH
2398 isa_s, agent_isa_s, agent_isa_gcc_s);
2399
2400 hsa_error (msg, HSA_STATUS_ERROR);
2401 return false;
2402 }
2403
2404 return true;
2405}
2406
237957cc
AS
2407/* Create and finalize the program consisting of all loaded modules. */
2408
2409static bool
2410create_and_finalize_hsa_program (struct agent_info *agent)
2411{
2412 hsa_status_t status;
237957cc
AS
2413 bool res = true;
2414 if (pthread_mutex_lock (&agent->prog_mutex))
2415 {
2416 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
2417 return false;
2418 }
2419 if (agent->prog_finalized)
2420 goto final;
2421
2422 status
2423 = hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
2424 HSA_EXECUTABLE_STATE_UNFROZEN,
2425 "", &agent->executable);
2426 if (status != HSA_STATUS_SUCCESS)
2427 {
2428 hsa_error ("Could not create GCN executable", status);
2429 goto fail;
2430 }
2431
2432 /* Load any GCN modules. */
2433 struct module_info *module = agent->module;
2434 if (module)
2435 {
2436 Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;
2437
7d593fd6
FH
2438 if (!isa_matches_agent (agent, image))
2439 goto fail;
2440
237957cc
AS
2441 hsa_code_object_t co = { 0 };
2442 status = hsa_fns.hsa_code_object_deserialize_fn
2443 (module->image_desc->gcn_image->image,
2444 module->image_desc->gcn_image->size,
2445 NULL, &co);
2446 if (status != HSA_STATUS_SUCCESS)
2447 {
2448 hsa_error ("Could not deserialize GCN code object", status);
2449 goto fail;
2450 }
2451
2452 status = hsa_fns.hsa_executable_load_code_object_fn
2453 (agent->executable, agent->id, co, "");
2454 if (status != HSA_STATUS_SUCCESS)
2455 {
2456 hsa_error ("Could not load GCN code object", status);
2457 goto fail;
2458 }
2459
2460 if (!module->heap)
2461 {
2462 status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
2463 gcn_kernel_heap_size,
2464 (void**)&module->heap);
2465 if (status != HSA_STATUS_SUCCESS)
2466 {
2467 hsa_error ("Could not allocate memory for GCN heap", status);
2468 goto fail;
2469 }
2470
2471 status = hsa_fns.hsa_memory_assign_agent_fn
2472 (module->heap, agent->id, HSA_ACCESS_PERMISSION_RW);
2473 if (status != HSA_STATUS_SUCCESS)
2474 {
2475 hsa_error ("Could not assign GCN heap memory to device", status);
2476 goto fail;
2477 }
2478
2479 hsa_fns.hsa_memory_copy_fn (&module->heap->size,
2480 &gcn_kernel_heap_size,
2481 sizeof (gcn_kernel_heap_size));
2482 }
2483
2484 }
2485
2486 if (debug)
2487 dump_executable_symbols (agent->executable);
2488
2489 status = hsa_fns.hsa_executable_freeze_fn (agent->executable, "");
2490 if (status != HSA_STATUS_SUCCESS)
2491 {
2492 hsa_error ("Could not freeze the GCN executable", status);
2493 goto fail;
2494 }
2495
237957cc
AS
2496final:
2497 agent->prog_finalized = true;
2498
2499 if (pthread_mutex_unlock (&agent->prog_mutex))
2500 {
2501 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
2502 res = false;
2503 }
2504
2505 return res;
2506
2507fail:
2508 res = false;
2509 goto final;
2510}
2511
2512/* Free the HSA program in agent and everything associated with it and set
2513 agent->prog_finalized and the initialized flags of all kernels to false.
2514 Return TRUE on success. */
2515
2516static bool
2517destroy_hsa_program (struct agent_info *agent)
2518{
2519 if (!agent->prog_finalized)
2520 return true;
2521
2522 hsa_status_t status;
2523
2524 GCN_DEBUG ("Destroying the current GCN program.\n");
2525
2526 status = hsa_fns.hsa_executable_destroy_fn (agent->executable);
2527 if (status != HSA_STATUS_SUCCESS)
2528 return hsa_error ("Could not destroy GCN executable", status);
2529
2530 if (agent->module)
2531 {
2532 int i;
2533 for (i = 0; i < agent->module->kernel_count; i++)
2534 agent->module->kernels[i].initialized = false;
2535
2536 if (agent->module->heap)
2537 {
2538 hsa_fns.hsa_memory_free_fn (agent->module->heap);
2539 agent->module->heap = NULL;
2540 }
2541 }
2542 agent->prog_finalized = false;
2543 return true;
2544}
2545
2546/* Deinitialize all information associated with MODULE and kernels within
2547 it. Return TRUE on success. */
2548
2549static bool
2550destroy_module (struct module_info *module, bool locked)
2551{
2552 /* Run destructors before destroying module. */
2553 struct GOMP_kernel_launch_attributes kla =
2554 { 3,
2555 /* Grid size. */
2556 { 1, 64, 1 },
2557 /* Work-group size. */
2558 { 1, 64, 1 }
2559 };
2560
2561 if (module->fini_array_func)
2562 {
2563 init_kernel (module->fini_array_func);
2564 run_kernel (module->fini_array_func, NULL, &kla, NULL, locked);
2565 }
2566 module->constructors_run_p = false;
2567
2568 int i;
2569 for (i = 0; i < module->kernel_count; i++)
2570 if (pthread_mutex_destroy (&module->kernels[i].init_mutex))
2571 {
2572 GOMP_PLUGIN_error ("Failed to destroy a GCN kernel initialization "
2573 "mutex");
2574 return false;
2575 }
2576
2577 return true;
2578}
2579
2580/* }}} */
2581/* {{{ Async */
2582
2583/* Callback of dispatch queues to report errors. */
2584
2585static void
2586execute_queue_entry (struct goacc_asyncqueue *aq, int index)
2587{
2588 struct queue_entry *entry = &aq->queue[index];
2589
2590 switch (entry->type)
2591 {
2592 case KERNEL_LAUNCH:
2593 if (DEBUG_QUEUES)
2594 GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d)\n",
2595 aq->agent->device_id, aq->id, index);
2596 run_kernel (entry->u.launch.kernel,
2597 entry->u.launch.vars,
2598 &entry->u.launch.kla, aq, false);
2599 if (DEBUG_QUEUES)
2600 GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d) done\n",
2601 aq->agent->device_id, aq->id, index);
2602 break;
2603
2604 case CALLBACK:
2605 if (DEBUG_QUEUES)
2606 GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d)\n",
2607 aq->agent->device_id, aq->id, index);
2608 entry->u.callback.fn (entry->u.callback.data);
2609 if (DEBUG_QUEUES)
2610 GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d) done\n",
2611 aq->agent->device_id, aq->id, index);
2612 break;
2613
2614 case ASYNC_WAIT:
2615 {
2616 /* FIXME: is it safe to access a placeholder that may already have
2617 been executed? */
2618 struct placeholder *placeholderp = entry->u.asyncwait.placeholderp;
2619
2620 if (DEBUG_QUEUES)
2621 GCN_DEBUG ("Async thread %d:%d: Executing async wait entry (%d)\n",
2622 aq->agent->device_id, aq->id, index);
2623
2624 pthread_mutex_lock (&placeholderp->mutex);
2625
2626 while (!placeholderp->executed)
2627 pthread_cond_wait (&placeholderp->cond, &placeholderp->mutex);
2628
2629 pthread_mutex_unlock (&placeholderp->mutex);
2630
2631 if (pthread_cond_destroy (&placeholderp->cond))
2632 GOMP_PLUGIN_error ("Failed to destroy serialization cond");
2633
2634 if (pthread_mutex_destroy (&placeholderp->mutex))
2635 GOMP_PLUGIN_error ("Failed to destroy serialization mutex");
2636
2637 if (DEBUG_QUEUES)
2638 GCN_DEBUG ("Async thread %d:%d: Executing async wait "
2639 "entry (%d) done\n", aq->agent->device_id, aq->id, index);
2640 }
2641 break;
2642
2643 case ASYNC_PLACEHOLDER:
2644 pthread_mutex_lock (&entry->u.placeholder.mutex);
2645 entry->u.placeholder.executed = 1;
2646 pthread_cond_signal (&entry->u.placeholder.cond);
2647 pthread_mutex_unlock (&entry->u.placeholder.mutex);
2648 break;
2649
2650 default:
2651 GOMP_PLUGIN_fatal ("Unknown queue element");
2652 }
2653}
2654
2655/* This function is run as a thread to service an async queue in the
2656 background. It runs continuously until the stop flag is set. */
2657
2658static void *
2659drain_queue (void *thread_arg)
2660{
2661 struct goacc_asyncqueue *aq = thread_arg;
2662
2663 if (DRAIN_QUEUE_SYNCHRONOUS_P)
2664 {
2665 aq->drain_queue_stop = 2;
2666 return NULL;
2667 }
2668
2669 pthread_mutex_lock (&aq->mutex);
2670
2671 while (true)
2672 {
2673 if (aq->drain_queue_stop)
2674 break;
2675
2676 if (aq->queue_n > 0)
2677 {
2678 pthread_mutex_unlock (&aq->mutex);
2679 execute_queue_entry (aq, aq->queue_first);
2680
2681 pthread_mutex_lock (&aq->mutex);
2682 aq->queue_first = ((aq->queue_first + 1)
2683 % ASYNC_QUEUE_SIZE);
2684 aq->queue_n--;
2685
2686 if (DEBUG_THREAD_SIGNAL)
2687 GCN_DEBUG ("Async thread %d:%d: broadcasting queue out update\n",
2688 aq->agent->device_id, aq->id);
2689 pthread_cond_broadcast (&aq->queue_cond_out);
2690 pthread_mutex_unlock (&aq->mutex);
2691
2692 if (DEBUG_QUEUES)
2693 GCN_DEBUG ("Async thread %d:%d: continue\n", aq->agent->device_id,
2694 aq->id);
2695 pthread_mutex_lock (&aq->mutex);
2696 }
2697 else
2698 {
2699 if (DEBUG_THREAD_SLEEP)
2700 GCN_DEBUG ("Async thread %d:%d: going to sleep\n",
2701 aq->agent->device_id, aq->id);
2702 pthread_cond_wait (&aq->queue_cond_in, &aq->mutex);
2703 if (DEBUG_THREAD_SLEEP)
2704 GCN_DEBUG ("Async thread %d:%d: woke up, rechecking\n",
2705 aq->agent->device_id, aq->id);
2706 }
2707 }
2708
2709 aq->drain_queue_stop = 2;
2710 if (DEBUG_THREAD_SIGNAL)
2711 GCN_DEBUG ("Async thread %d:%d: broadcasting last queue out update\n",
2712 aq->agent->device_id, aq->id);
2713 pthread_cond_broadcast (&aq->queue_cond_out);
2714 pthread_mutex_unlock (&aq->mutex);
2715
2716 GCN_DEBUG ("Async thread %d:%d: returning\n", aq->agent->device_id, aq->id);
2717 return NULL;
2718}
2719
2720/* This function is used only when DRAIN_QUEUE_SYNCHRONOUS_P is set, which
2721 is not usually the case. This is just a debug tool. */
2722
2723static void
2724drain_queue_synchronous (struct goacc_asyncqueue *aq)
2725{
2726 pthread_mutex_lock (&aq->mutex);
2727
2728 while (aq->queue_n > 0)
2729 {
2730 execute_queue_entry (aq, aq->queue_first);
2731
2732 aq->queue_first = ((aq->queue_first + 1)
2733 % ASYNC_QUEUE_SIZE);
2734 aq->queue_n--;
2735 }
2736
2737 pthread_mutex_unlock (&aq->mutex);
2738}
2739
d88b27da
JB
2740/* Block the current thread until an async queue is writable. The aq->mutex
2741 lock should be held on entry, and remains locked on exit. */
237957cc
AS
2742
2743static void
2744wait_for_queue_nonfull (struct goacc_asyncqueue *aq)
2745{
2746 if (aq->queue_n == ASYNC_QUEUE_SIZE)
2747 {
237957cc
AS
2748 /* Queue is full. Wait for it to not be full. */
2749 while (aq->queue_n == ASYNC_QUEUE_SIZE)
2750 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
237957cc
AS
2751 }
2752}
2753
2754/* Request an asynchronous kernel launch on the specified queue. This
2755 may block if the queue is full, but returns without waiting for the
2756 kernel to run. */
2757
2758static void
2759queue_push_launch (struct goacc_asyncqueue *aq, struct kernel_info *kernel,
2760 void *vars, struct GOMP_kernel_launch_attributes *kla)
2761{
2762 assert (aq->agent == kernel->agent);
2763
237957cc
AS
2764 pthread_mutex_lock (&aq->mutex);
2765
d88b27da
JB
2766 wait_for_queue_nonfull (aq);
2767
237957cc
AS
2768 int queue_last = ((aq->queue_first + aq->queue_n)
2769 % ASYNC_QUEUE_SIZE);
2770 if (DEBUG_QUEUES)
2771 GCN_DEBUG ("queue_push_launch %d:%d: at %i\n", aq->agent->device_id,
2772 aq->id, queue_last);
2773
2774 aq->queue[queue_last].type = KERNEL_LAUNCH;
2775 aq->queue[queue_last].u.launch.kernel = kernel;
2776 aq->queue[queue_last].u.launch.vars = vars;
2777 aq->queue[queue_last].u.launch.kla = *kla;
2778
2779 aq->queue_n++;
2780
2781 if (DEBUG_THREAD_SIGNAL)
2782 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2783 aq->agent->device_id, aq->id);
2784 pthread_cond_signal (&aq->queue_cond_in);
2785
2786 pthread_mutex_unlock (&aq->mutex);
2787}
2788
2789/* Request an asynchronous callback on the specified queue. The callback
2790 function will be called, with the given opaque data, from the appropriate
2791 async thread, when all previous items on that queue are complete. */
2792
2793static void
2794queue_push_callback (struct goacc_asyncqueue *aq, void (*fn)(void *),
2795 void *data)
2796{
237957cc
AS
2797 pthread_mutex_lock (&aq->mutex);
2798
d88b27da
JB
2799 wait_for_queue_nonfull (aq);
2800
237957cc
AS
2801 int queue_last = ((aq->queue_first + aq->queue_n)
2802 % ASYNC_QUEUE_SIZE);
2803 if (DEBUG_QUEUES)
2804 GCN_DEBUG ("queue_push_callback %d:%d: at %i\n", aq->agent->device_id,
2805 aq->id, queue_last);
2806
2807 aq->queue[queue_last].type = CALLBACK;
2808 aq->queue[queue_last].u.callback.fn = fn;
2809 aq->queue[queue_last].u.callback.data = data;
2810
2811 aq->queue_n++;
2812
2813 if (DEBUG_THREAD_SIGNAL)
2814 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2815 aq->agent->device_id, aq->id);
2816 pthread_cond_signal (&aq->queue_cond_in);
2817
2818 pthread_mutex_unlock (&aq->mutex);
2819}
2820
2821/* Request that a given async thread wait for another thread (unspecified) to
2822 reach the given placeholder. The wait will occur when all previous entries
2823 on the queue are complete. A placeholder is effectively a kind of signal
2824 which simply sets a flag when encountered in a queue. */
2825
2826static void
2827queue_push_asyncwait (struct goacc_asyncqueue *aq,
2828 struct placeholder *placeholderp)
2829{
237957cc
AS
2830 pthread_mutex_lock (&aq->mutex);
2831
d88b27da
JB
2832 wait_for_queue_nonfull (aq);
2833
237957cc
AS
2834 int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2835 if (DEBUG_QUEUES)
2836 GCN_DEBUG ("queue_push_asyncwait %d:%d: at %i\n", aq->agent->device_id,
2837 aq->id, queue_last);
2838
2839 aq->queue[queue_last].type = ASYNC_WAIT;
2840 aq->queue[queue_last].u.asyncwait.placeholderp = placeholderp;
2841
2842 aq->queue_n++;
2843
2844 if (DEBUG_THREAD_SIGNAL)
2845 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2846 aq->agent->device_id, aq->id);
2847 pthread_cond_signal (&aq->queue_cond_in);
2848
2849 pthread_mutex_unlock (&aq->mutex);
2850}
2851
2852/* Add a placeholder into an async queue. When the async thread reaches the
2853 placeholder it will set the "executed" flag to true and continue.
2854 Another thread may be waiting on this thread reaching the placeholder. */
2855
2856static struct placeholder *
2857queue_push_placeholder (struct goacc_asyncqueue *aq)
2858{
2859 struct placeholder *placeholderp;
2860
237957cc
AS
2861 pthread_mutex_lock (&aq->mutex);
2862
d88b27da
JB
2863 wait_for_queue_nonfull (aq);
2864
237957cc
AS
2865 int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2866 if (DEBUG_QUEUES)
2867 GCN_DEBUG ("queue_push_placeholder %d:%d: at %i\n", aq->agent->device_id,
2868 aq->id, queue_last);
2869
2870 aq->queue[queue_last].type = ASYNC_PLACEHOLDER;
2871 placeholderp = &aq->queue[queue_last].u.placeholder;
2872
2873 if (pthread_mutex_init (&placeholderp->mutex, NULL))
2874 {
2875 pthread_mutex_unlock (&aq->mutex);
2876 GOMP_PLUGIN_error ("Failed to initialize serialization mutex");
2877 }
2878
2879 if (pthread_cond_init (&placeholderp->cond, NULL))
2880 {
2881 pthread_mutex_unlock (&aq->mutex);
2882 GOMP_PLUGIN_error ("Failed to initialize serialization cond");
2883 }
2884
2885 placeholderp->executed = 0;
2886
2887 aq->queue_n++;
2888
2889 if (DEBUG_THREAD_SIGNAL)
2890 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2891 aq->agent->device_id, aq->id);
2892 pthread_cond_signal (&aq->queue_cond_in);
2893
2894 pthread_mutex_unlock (&aq->mutex);
2895
2896 return placeholderp;
2897}
2898
2899/* Signal an asynchronous thread to terminate, and wait for it to do so. */
2900
2901static void
2902finalize_async_thread (struct goacc_asyncqueue *aq)
2903{
2904 pthread_mutex_lock (&aq->mutex);
2905 if (aq->drain_queue_stop == 2)
2906 {
2907 pthread_mutex_unlock (&aq->mutex);
2908 return;
2909 }
2910
2911 aq->drain_queue_stop = 1;
2912
2913 if (DEBUG_THREAD_SIGNAL)
2914 GCN_DEBUG ("Signalling async thread %d:%d: cond_in\n",
2915 aq->agent->device_id, aq->id);
2916 pthread_cond_signal (&aq->queue_cond_in);
2917
2918 while (aq->drain_queue_stop != 2)
2919 {
2920 if (DEBUG_THREAD_SLEEP)
2921 GCN_DEBUG ("Waiting for async thread %d:%d to finish, putting thread"
2922 " to sleep\n", aq->agent->device_id, aq->id);
2923 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
2924 if (DEBUG_THREAD_SLEEP)
2925 GCN_DEBUG ("Waiting, woke up thread %d:%d. Rechecking\n",
2926 aq->agent->device_id, aq->id);
2927 }
2928
2929 GCN_DEBUG ("Done waiting for async thread %d:%d\n", aq->agent->device_id,
2930 aq->id);
2931 pthread_mutex_unlock (&aq->mutex);
2932
2933 int err = pthread_join (aq->thread_drain_queue, NULL);
2934 if (err != 0)
2935 GOMP_PLUGIN_fatal ("Join async thread %d:%d: failed: %s",
2936 aq->agent->device_id, aq->id, strerror (err));
2937 GCN_DEBUG ("Joined with async thread %d:%d\n", aq->agent->device_id, aq->id);
2938}
2939
2940/* Set up an async queue for OpenMP. There will be only one. The
2941 implementation simply uses an OpenACC async queue.
2942 FIXME: is this thread-safe if two threads call this function? */
2943
2944static void
2945maybe_init_omp_async (struct agent_info *agent)
2946{
2947 if (!agent->omp_async_queue)
2948 agent->omp_async_queue
2949 = GOMP_OFFLOAD_openacc_async_construct (agent->device_id);
2950}
2951
8d2f4ddf
JB
2952/* A wrapper that works around an issue in the HSA runtime with host-to-device
2953 copies from read-only pages. */
2954
2955static void
2956hsa_memory_copy_wrapper (void *dst, const void *src, size_t len)
2957{
2958 hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, len);
2959
2960 if (status == HSA_STATUS_SUCCESS)
2961 return;
2962
2963 /* It appears that the copy fails if the source data is in a read-only page.
2964 We can't detect that easily, so try copying the data to a temporary buffer
2965 and doing the copy again if we got an error above. */
2966
2967 GCN_WARNING ("Read-only data transfer bug workaround triggered for "
2968 "[%p:+%d]\n", (void *) src, (int) len);
2969
2970 void *src_copy = malloc (len);
2971 memcpy (src_copy, src, len);
2972 status = hsa_fns.hsa_memory_copy_fn (dst, (const void *) src_copy, len);
2973 free (src_copy);
2974 if (status != HSA_STATUS_SUCCESS)
2975 GOMP_PLUGIN_error ("memory copy failed");
2976}
2977
237957cc
AS
2978/* Copy data to or from a device. This is intended for use as an async
2979 callback event. */
2980
2981static void
2982copy_data (void *data_)
2983{
2984 struct copy_data *data = (struct copy_data *)data_;
2985 GCN_DEBUG ("Async thread %d:%d: Copying %zu bytes from (%p) to (%p)\n",
2986 data->aq->agent->device_id, data->aq->id, data->len, data->src,
2987 data->dst);
8d2f4ddf 2988 hsa_memory_copy_wrapper (data->dst, data->src, data->len);
237957cc
AS
2989 free (data);
2990}
2991
2992/* Free device data. This is intended for use as an async callback event. */
2993
2994static void
2995gomp_offload_free (void *ptr)
2996{
2997 GCN_DEBUG ("Async thread ?:?: Freeing %p\n", ptr);
2998 GOMP_OFFLOAD_free (0, ptr);
2999}
3000
3001/* Request an asynchronous data copy, to or from a device, on a given queue.
9c41f5b9 3002 The event will be registered as a callback. */
237957cc
AS
3003
3004static void
3005queue_push_copy (struct goacc_asyncqueue *aq, void *dst, const void *src,
9c41f5b9 3006 size_t len)
237957cc
AS
3007{
3008 if (DEBUG_QUEUES)
3009 GCN_DEBUG ("queue_push_copy %d:%d: %zu bytes from (%p) to (%p)\n",
3010 aq->agent->device_id, aq->id, len, src, dst);
3011 struct copy_data *data
3012 = (struct copy_data *)GOMP_PLUGIN_malloc (sizeof (struct copy_data));
3013 data->dst = dst;
3014 data->src = src;
3015 data->len = len;
237957cc
AS
3016 data->aq = aq;
3017 queue_push_callback (aq, copy_data, data);
3018}
3019
3020/* Return true if the given queue is currently empty. */
3021
3022static int
3023queue_empty (struct goacc_asyncqueue *aq)
3024{
3025 pthread_mutex_lock (&aq->mutex);
3026 int res = aq->queue_n == 0 ? 1 : 0;
3027 pthread_mutex_unlock (&aq->mutex);
3028
3029 return res;
3030}
3031
3032/* Wait for a given queue to become empty. This implements an OpenACC wait
3033 directive. */
3034
3035static void
3036wait_queue (struct goacc_asyncqueue *aq)
3037{
3038 if (DRAIN_QUEUE_SYNCHRONOUS_P)
3039 {
3040 drain_queue_synchronous (aq);
3041 return;
3042 }
3043
3044 pthread_mutex_lock (&aq->mutex);
3045
3046 while (aq->queue_n > 0)
3047 {
3048 if (DEBUG_THREAD_SLEEP)
3049 GCN_DEBUG ("waiting for thread %d:%d, putting thread to sleep\n",
3050 aq->agent->device_id, aq->id);
3051 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
3052 if (DEBUG_THREAD_SLEEP)
3053 GCN_DEBUG ("thread %d:%d woke up. Rechecking\n", aq->agent->device_id,
3054 aq->id);
3055 }
3056
3057 pthread_mutex_unlock (&aq->mutex);
3058 GCN_DEBUG ("waiting for thread %d:%d, done\n", aq->agent->device_id, aq->id);
3059}
3060
3061/* }}} */
3062/* {{{ OpenACC support */
3063
3064/* Execute an OpenACC kernel, synchronously or asynchronously. */
3065
3066static void
3067gcn_exec (struct kernel_info *kernel, size_t mapnum, void **hostaddrs,
3068 void **devaddrs, unsigned *dims, void *targ_mem_desc, bool async,
3069 struct goacc_asyncqueue *aq)
3070{
3071 if (!GOMP_OFFLOAD_can_run (kernel))
3072 GOMP_PLUGIN_fatal ("OpenACC host fallback unimplemented.");
3073
3074 /* If we get here then this must be an OpenACC kernel. */
3075 kernel->kind = KIND_OPENACC;
3076
3077 /* devaddrs must be double-indirect on the target. */
3078 void **ind_da = alloc_by_agent (kernel->agent, sizeof (void*) * mapnum);
3079 for (size_t i = 0; i < mapnum; i++)
3080 hsa_fns.hsa_memory_copy_fn (&ind_da[i],
3081 devaddrs[i] ? &devaddrs[i] : &hostaddrs[i],
3082 sizeof (void *));
3083
3084 struct hsa_kernel_description *hsa_kernel_desc = NULL;
3085 for (unsigned i = 0; i < kernel->module->image_desc->kernel_count; i++)
3086 {
3087 struct hsa_kernel_description *d
3088 = &kernel->module->image_desc->kernel_infos[i];
3089 if (d->name == kernel->name)
3090 {
3091 hsa_kernel_desc = d;
3092 break;
3093 }
3094 }
3095
3096 /* We may have statically-determined dimensions in
3097 hsa_kernel_desc->oacc_dims[] or dimensions passed to this offload kernel
3098 invocation at runtime in dims[]. We allow static dimensions to take
3099 priority over dynamic dimensions when present (non-zero). */
3100 if (hsa_kernel_desc->oacc_dims[0] > 0)
3101 dims[0] = hsa_kernel_desc->oacc_dims[0];
3102 if (hsa_kernel_desc->oacc_dims[1] > 0)
3103 dims[1] = hsa_kernel_desc->oacc_dims[1];
3104 if (hsa_kernel_desc->oacc_dims[2] > 0)
3105 dims[2] = hsa_kernel_desc->oacc_dims[2];
3106
a78b1ab1
KCY
3107 /* Ideally, when a dimension isn't explicitly specified, we should
3108 tune it to run 40 (or 32?) threads per CU with no threads getting queued.
3109 In practice, we tune for peak performance on BabelStream, which
3110 for OpenACC is currently 32 threads per CU. */
3111 if (dims[0] == 0 && dims[1] == 0)
3112 {
3113 /* If any of the OpenACC dimensions remain 0 then we get to pick a
3114 number. There isn't really a correct answer for this without a clue
3115 about the problem size, so let's do a reasonable number of workers
3116 and gangs. */
237957cc 3117
a78b1ab1
KCY
3118 dims[0] = get_cu_count (kernel->agent) * 4; /* Gangs. */
3119 dims[1] = 8; /* Workers. */
3120 }
3121 else if (dims[0] == 0 && dims[1] > 0)
3122 {
3123 /* Auto-scale the number of gangs with the requested number of workers. */
3124 dims[0] = get_cu_count (kernel->agent) * (32 / dims[1]);
3125 }
3126 else if (dims[0] > 0 && dims[1] == 0)
3127 {
3128 /* Auto-scale the number of workers with the requested number of gangs. */
3129 dims[1] = get_cu_count (kernel->agent) * 32 / dims[0];
3130 if (dims[1] == 0)
3131 dims[1] = 1;
3132 if (dims[1] > 16)
3133 dims[1] = 16;
3134 }
237957cc
AS
3135
3136 /* The incoming dimensions are expressed in terms of gangs, workers, and
3137 vectors. The HSA dimensions are expressed in terms of "work-items",
3138 which means multiples of vector lanes.
3139
3140 The "grid size" specifies the size of the problem space, and the
3141 "work-group size" specifies how much of that we want a single compute
3142 unit to chew on at once.
3143
3144 The three dimensions do not really correspond to hardware, but the
3145 important thing is that the HSA runtime will launch as many
3146 work-groups as it takes to process the entire grid, and each
3147 work-group will contain as many wave-fronts as it takes to process
3148 the work-items in that group.
3149
3150 Essentially, as long as we set the Y dimension to 64 (the number of
3151 vector lanes in hardware), and the Z group size to the maximum (16),
3152 then we will get the gangs (X) and workers (Z) launched as we expect.
3153
3154 The reason for the apparent reversal of vector and worker dimension
3155 order is to do with the way the run-time distributes work-items across
3156 v1 and v2. */
3157 struct GOMP_kernel_launch_attributes kla =
3158 {3,
3159 /* Grid size. */
3160 {dims[0], 64, dims[1]},
3161 /* Work-group size. */
3162 {1, 64, 16}
3163 };
3164
3165 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
3166 acc_prof_info *prof_info = thr->prof_info;
3167 acc_event_info enqueue_launch_event_info;
3168 acc_api_info *api_info = thr->api_info;
3169 bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
3170 if (profiling_dispatch_p)
3171 {
3172 prof_info->event_type = acc_ev_enqueue_launch_start;
3173
3174 enqueue_launch_event_info.launch_event.event_type
3175 = prof_info->event_type;
3176 enqueue_launch_event_info.launch_event.valid_bytes
3177 = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
3178 enqueue_launch_event_info.launch_event.parent_construct
3179 = acc_construct_parallel;
3180 enqueue_launch_event_info.launch_event.implicit = 1;
3181 enqueue_launch_event_info.launch_event.tool_info = NULL;
3182 enqueue_launch_event_info.launch_event.kernel_name
3183 = (char *) kernel->name;
3184 enqueue_launch_event_info.launch_event.num_gangs = kla.gdims[0];
3185 enqueue_launch_event_info.launch_event.num_workers = kla.gdims[2];
3186 enqueue_launch_event_info.launch_event.vector_length = kla.gdims[1];
3187
3188 api_info->device_api = acc_device_api_other;
3189
3190 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3191 &enqueue_launch_event_info, api_info);
3192 }
3193
3194 if (!async)
649f1939 3195 run_kernel (kernel, ind_da, &kla, NULL, false);
237957cc 3196 else
649f1939 3197 queue_push_launch (aq, kernel, ind_da, &kla);
237957cc
AS
3198
3199 if (profiling_dispatch_p)
3200 {
3201 prof_info->event_type = acc_ev_enqueue_launch_end;
3202 enqueue_launch_event_info.launch_event.event_type = prof_info->event_type;
3203 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3204 &enqueue_launch_event_info,
3205 api_info);
3206 }
649f1939
TS
3207
3208 if (!async)
3209 gomp_offload_free (ind_da);
3210 else
3211 {
3212 if (DEBUG_QUEUES)
3213 GCN_DEBUG ("queue_push_callback %d:%d gomp_offload_free, %p\n",
3214 aq->agent->device_id, aq->id, ind_da);
3215 queue_push_callback (aq, gomp_offload_free, ind_da);
3216 }
237957cc
AS
3217}
3218
3219/* }}} */
3220/* {{{ Generic Plugin API */
3221
3222/* Return the name of the accelerator, which is "gcn". */
3223
3224const char *
3225GOMP_OFFLOAD_get_name (void)
3226{
3227 return "gcn";
3228}
3229
3230/* Return the specific capabilities the HSA accelerator have. */
3231
3232unsigned int
3233GOMP_OFFLOAD_get_caps (void)
3234{
3235 /* FIXME: Enable shared memory for APU, but not discrete GPU. */
3236 return /*GOMP_OFFLOAD_CAP_SHARED_MEM |*/ GOMP_OFFLOAD_CAP_OPENMP_400
3237 | GOMP_OFFLOAD_CAP_OPENACC_200;
3238}
3239
3240/* Identify as GCN accelerator. */
3241
3242int
3243GOMP_OFFLOAD_get_type (void)
3244{
3245 return OFFLOAD_TARGET_TYPE_GCN;
3246}
3247
3248/* Return the libgomp version number we're compatible with. There is
3249 no requirement for cross-version compatibility. */
3250
3251unsigned
3252GOMP_OFFLOAD_version (void)
3253{
3254 return GOMP_VERSION;
3255}
3256
3257/* Return the number of GCN devices on the system. */
3258
3259int
683f1184 3260GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask)
237957cc
AS
3261{
3262 if (!init_hsa_context ())
3263 return 0;
683f1184
TB
3264 /* Return -1 if no omp_requires_mask cannot be fulfilled but
3265 devices were present. */
f84fdb13
TB
3266 if (hsa_context.agent_count > 0
3267 && (omp_requires_mask & ~GOMP_REQUIRES_REVERSE_OFFLOAD) != 0)
683f1184 3268 return -1;
237957cc
AS
3269 return hsa_context.agent_count;
3270}
3271
3272/* Initialize device (agent) number N so that it can be used for computation.
3273 Return TRUE on success. */
3274
3275bool
3276GOMP_OFFLOAD_init_device (int n)
3277{
3278 if (!init_hsa_context ())
3279 return false;
3280 if (n >= hsa_context.agent_count)
3281 {
3282 GOMP_PLUGIN_error ("Request to initialize non-existent GCN device %i", n);
3283 return false;
3284 }
3285 struct agent_info *agent = &hsa_context.agents[n];
3286
3287 if (agent->initialized)
3288 return true;
3289
3290 agent->device_id = n;
3291
3292 if (pthread_rwlock_init (&agent->module_rwlock, NULL))
3293 {
3294 GOMP_PLUGIN_error ("Failed to initialize a GCN agent rwlock");
3295 return false;
3296 }
3297 if (pthread_mutex_init (&agent->prog_mutex, NULL))
3298 {
3299 GOMP_PLUGIN_error ("Failed to initialize a GCN agent program mutex");
3300 return false;
3301 }
3302 if (pthread_mutex_init (&agent->async_queues_mutex, NULL))
3303 {
3304 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
3305 return false;
3306 }
f6fff8a6 3307 if (pthread_mutex_init (&agent->ephemeral_memories_write_lock, NULL))
237957cc
AS
3308 {
3309 GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
3310 return false;
3311 }
3312 agent->async_queues = NULL;
3313 agent->omp_async_queue = NULL;
f6fff8a6 3314 agent->ephemeral_memories_list = NULL;
237957cc
AS
3315
3316 uint32_t queue_size;
3317 hsa_status_t status;
3318 status = hsa_fns.hsa_agent_get_info_fn (agent->id,
3319 HSA_AGENT_INFO_QUEUE_MAX_SIZE,
3320 &queue_size);
3321 if (status != HSA_STATUS_SUCCESS)
3322 return hsa_error ("Error requesting maximum queue size of the GCN agent",
3323 status);
3324
237957cc 3325 status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_NAME,
2e5ea579 3326 &agent->name);
237957cc
AS
3327 if (status != HSA_STATUS_SUCCESS)
3328 return hsa_error ("Error querying the name of the agent", status);
7d593fd6 3329
2e5ea579 3330 agent->device_isa = isa_code (agent->name);
7d593fd6 3331 if (agent->device_isa < 0)
2e5ea579
FH
3332 return hsa_error ("Unknown GCN agent architecture", HSA_STATUS_ERROR);
3333
3334 status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_VENDOR_NAME,
3335 &agent->vendor_name);
3336 if (status != HSA_STATUS_SUCCESS)
3337 return hsa_error ("Error querying the vendor name of the agent", status);
237957cc
AS
3338
3339 status = hsa_fns.hsa_queue_create_fn (agent->id, queue_size,
3340 HSA_QUEUE_TYPE_MULTI,
3341 hsa_queue_callback, NULL, UINT32_MAX,
3342 UINT32_MAX, &agent->sync_queue);
3343 if (status != HSA_STATUS_SUCCESS)
3344 return hsa_error ("Error creating command queue", status);
3345
3346 agent->kernarg_region.handle = (uint64_t) -1;
3347 status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
3348 get_kernarg_memory_region,
3349 &agent->kernarg_region);
966de09b
AS
3350 if (status != HSA_STATUS_SUCCESS
3351 && status != HSA_STATUS_INFO_BREAK)
3352 hsa_error ("Scanning memory regions failed", status);
237957cc
AS
3353 if (agent->kernarg_region.handle == (uint64_t) -1)
3354 {
3355 GOMP_PLUGIN_error ("Could not find suitable memory region for kernel "
3356 "arguments");
3357 return false;
3358 }
3359 GCN_DEBUG ("Selected kernel arguments memory region:\n");
3360 dump_hsa_region (agent->kernarg_region, NULL);
3361
3362 agent->data_region.handle = (uint64_t) -1;
3363 status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
3364 get_data_memory_region,
3365 &agent->data_region);
966de09b
AS
3366 if (status != HSA_STATUS_SUCCESS
3367 && status != HSA_STATUS_INFO_BREAK)
3368 hsa_error ("Scanning memory regions failed", status);
237957cc
AS
3369 if (agent->data_region.handle == (uint64_t) -1)
3370 {
3371 GOMP_PLUGIN_error ("Could not find suitable memory region for device "
3372 "data");
3373 return false;
3374 }
3375 GCN_DEBUG ("Selected device data memory region:\n");
3376 dump_hsa_region (agent->data_region, NULL);
3377
3378 GCN_DEBUG ("GCN agent %d initialized\n", n);
3379
3380 agent->initialized = true;
3381 return true;
3382}
3383
3384/* Load GCN object-code module described by struct gcn_image_desc in
3385 TARGET_DATA and return references to kernel descriptors in TARGET_TABLE.
0fcc0cf9
TB
3386 If there are any constructors then run them. If not NULL, REV_FN_TABLE will
3387 contain the on-device addresses of the functions for reverse offload. To be
3388 freed by the caller. */
237957cc
AS
3389
3390int
3391GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
0fcc0cf9 3392 struct addr_pair **target_table,
dfd75bf7 3393 uint64_t **rev_fn_table)
237957cc
AS
3394{
3395 if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
3396 {
3397 GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3398 " (expected %u, received %u)",
3399 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
3400 return -1;
3401 }
3402
3403 struct gcn_image_desc *image_desc = (struct gcn_image_desc *) target_data;
3404 struct agent_info *agent;
3405 struct addr_pair *pair;
3406 struct module_info *module;
3407 struct kernel_info *kernel;
3408 int kernel_count = image_desc->kernel_count;
3409 unsigned var_count = image_desc->global_variable_count;
9f2fca56 3410 /* Currently, "others" is a struct of ICVS. */
0bac793e 3411 int other_count = 1;
237957cc
AS
3412
3413 agent = get_agent_info (ord);
3414 if (!agent)
3415 return -1;
3416
3417 if (pthread_rwlock_wrlock (&agent->module_rwlock))
3418 {
3419 GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3420 return -1;
3421 }
3422 if (agent->prog_finalized
3423 && !destroy_hsa_program (agent))
3424 return -1;
3425
3426 GCN_DEBUG ("Encountered %d kernels in an image\n", kernel_count);
3427 GCN_DEBUG ("Encountered %u global variables in an image\n", var_count);
0bac793e
CLT
3428 GCN_DEBUG ("Expect %d other variables in an image\n", other_count);
3429 pair = GOMP_PLUGIN_malloc ((kernel_count + var_count + other_count - 2)
237957cc
AS
3430 * sizeof (struct addr_pair));
3431 *target_table = pair;
3432 module = (struct module_info *)
3433 GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info)
3434 + kernel_count * sizeof (struct kernel_info));
3435 module->image_desc = image_desc;
3436 module->kernel_count = kernel_count;
3437 module->heap = NULL;
3438 module->constructors_run_p = false;
3439
3440 kernel = &module->kernels[0];
3441
3442 /* Allocate memory for kernel dependencies. */
3443 for (unsigned i = 0; i < kernel_count; i++)
3444 {
3445 struct hsa_kernel_description *d = &image_desc->kernel_infos[i];
3446 if (!init_basic_kernel_info (kernel, d, agent, module))
3447 return -1;
3448 if (strcmp (d->name, "_init_array") == 0)
3449 module->init_array_func = kernel;
3450 else if (strcmp (d->name, "_fini_array") == 0)
3451 module->fini_array_func = kernel;
3452 else
3453 {
3454 pair->start = (uintptr_t) kernel;
3455 pair->end = (uintptr_t) (kernel + 1);
3456 pair++;
3457 }
3458 kernel++;
3459 }
3460
3461 agent->module = module;
3462 if (pthread_rwlock_unlock (&agent->module_rwlock))
3463 {
3464 GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3465 return -1;
3466 }
3467
3468 if (!create_and_finalize_hsa_program (agent))
3469 return -1;
3470
  if (var_count > 0)
    {
      hsa_status_t status;
      hsa_executable_symbol_t var_symbol;
      status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
                                                     ".offload_var_table",
                                                     agent->id,
                                                     0, &var_symbol);

      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not find symbol for variable in the code object",
                   status);

      uint64_t var_table_addr;
      status = hsa_fns.hsa_executable_symbol_get_info_fn
        (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
         &var_table_addr);
      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not extract a variable from its symbol", status);
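
      /* The device-side .offload_var_table is an array of { address, size }
         pairs, one per global variable in the image; copy it to the host and
         convert each entry into an addr_pair.  */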
      struct {
        uint64_t addr;
        uint64_t size;
      } var_table[var_count];
      GOMP_OFFLOAD_dev2host (agent->device_id, var_table,
                             (void *) var_table_addr, sizeof (var_table));

      for (unsigned i = 0; i < var_count; i++)
        {
          pair->start = var_table[i].addr;
          pair->end = var_table[i].addr + var_table[i].size;
          GCN_DEBUG ("Found variable at %p with size %lu\n",
                     (void *) var_table[i].addr, var_table[i].size);
          pair++;
        }
    }

  GCN_DEBUG ("Looking for variable %s\n", XSTRING (GOMP_ADDITIONAL_ICVS));

  hsa_status_t status;
  hsa_executable_symbol_t var_symbol;
  status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
                                                 XSTRING (GOMP_ADDITIONAL_ICVS),
                                                 agent->id, 0, &var_symbol);
  if (status == HSA_STATUS_SUCCESS)
    {
      uint64_t varptr;
      uint32_t varsize;

      status = hsa_fns.hsa_executable_symbol_get_info_fn
        (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
         &varptr);
      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not extract a variable from its symbol", status);
      status = hsa_fns.hsa_executable_symbol_get_info_fn
        (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
         &varsize);
      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not extract a variable size from its symbol",
                   status);

      pair->start = varptr;
      pair->end = varptr + varsize;
    }
  else
    {
      /* The variable was not in this image.  */
      GCN_DEBUG ("Variable not found in image: %s\n",
                 XSTRING (GOMP_ADDITIONAL_ICVS));
      pair->start = pair->end = 0;
    }

  /* Ensure that constructors are run first.  */
  struct GOMP_kernel_launch_attributes kla =
    { 3,
      /* Grid size.  */
      { 1, 64, 1 },
      /* Work-group size.  */
      { 1, 64, 1 }
    };
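  /* Note: a 1x64x1 grid and work-group is a single work-group of 64
     work-items, i.e. one GCN wavefront.  */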

  if (module->init_array_func)
    {
      init_kernel (module->init_array_func);
      run_kernel (module->init_array_func, NULL, &kla, NULL, false);
    }
  module->constructors_run_p = true;

  /* Don't report kernels that libgomp need not know about.  */
  if (module->init_array_func)
    kernel_count--;
  if (module->fini_array_func)
    kernel_count--;

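  /* If requested, also report the device addresses of the offloaded
     functions, read from the device-side .offload_func_table.  */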
  if (rev_fn_table != NULL && kernel_count == 0)
    *rev_fn_table = NULL;
  else if (rev_fn_table != NULL)
    {
      hsa_status_t status;
      hsa_executable_symbol_t var_symbol;
      status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
                                                     ".offload_func_table",
                                                     agent->id, 0, &var_symbol);
      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not find symbol for variable in the code object",
                   status);
      uint64_t fn_table_addr;
      status = hsa_fns.hsa_executable_symbol_get_info_fn
        (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
         &fn_table_addr);
      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not extract a variable from its symbol", status);
      *rev_fn_table = GOMP_PLUGIN_malloc (kernel_count * sizeof (uint64_t));
      GOMP_OFFLOAD_dev2host (agent->device_id, *rev_fn_table,
                             (void *) fn_table_addr,
                             kernel_count * sizeof (uint64_t));
    }

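  /* The return value is the number of addr_pair entries written above;
     kernel_count has already been reduced here to exclude the init/fini
     helper kernels.  */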
  return kernel_count + var_count + other_count;
}

/* Unload GCN object-code module described by struct gcn_image_desc in
   TARGET_DATA from agent number N.  Return TRUE on success.  */

bool
GOMP_OFFLOAD_unload_image (int n, unsigned version, const void *target_data)
{
  if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
                         " (expected %u, received %u)",
                         GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
      return false;
    }

  struct agent_info *agent;
  agent = get_agent_info (n);
  if (!agent)
    return false;

  if (pthread_rwlock_wrlock (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
      return false;
    }

  if (!agent->module || agent->module->image_desc != target_data)
    {
      GOMP_PLUGIN_error ("Attempt to unload an image that has never been "
                         "loaded before");
      pthread_rwlock_unlock (&agent->module_rwlock);
      return false;
    }

  if (!destroy_module (agent->module, true))
    return false;
  free (agent->module);
  agent->module = NULL;
  if (!destroy_hsa_program (agent))
    return false;
  if (pthread_rwlock_unlock (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
      return false;
    }
  return true;
}

/* Deinitialize all information and status associated with agent number N.  We
   do not attempt any synchronization, assuming the user and libgomp will not
   attempt deinitialization of a device that is in any way being used at the
   same time.  Return TRUE on success.  */

bool
GOMP_OFFLOAD_fini_device (int n)
{
  struct agent_info *agent = get_agent_info (n);
  if (!agent)
    return false;

  if (!agent->initialized)
    return true;

  if (agent->omp_async_queue)
    {
      GOMP_OFFLOAD_openacc_async_destruct (agent->omp_async_queue);
      agent->omp_async_queue = NULL;
    }

  if (agent->module)
    {
      if (!destroy_module (agent->module, false))
        return false;
      free (agent->module);
      agent->module = NULL;
    }

  if (!destroy_ephemeral_memories (agent))
    return false;

  if (!destroy_hsa_program (agent))
    return false;

  hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (agent->sync_queue);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error destroying command queue", status);

  if (pthread_mutex_destroy (&agent->prog_mutex))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex");
      return false;
    }
  if (pthread_rwlock_destroy (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN agent rwlock");
      return false;
    }

  if (pthread_mutex_destroy (&agent->async_queues_mutex))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN agent queue mutex");
      return false;
    }
  if (pthread_mutex_destroy (&agent->ephemeral_memories_write_lock))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex");
      return false;
    }
  agent->initialized = false;
  return true;
}

/* Return true if the HSA runtime can run function FN_PTR.  */

bool
GOMP_OFFLOAD_can_run (void *fn_ptr)
{
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;

  init_kernel (kernel);
  if (kernel->initialization_failed)
    goto failure;

  return true;

failure:
  if (suppress_host_fallback)
    GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
  GCN_WARNING ("GCN target cannot be launched, doing a host fallback\n");
  return false;
}

/* Allocate memory on device N.  */

void *
GOMP_OFFLOAD_alloc (int n, size_t size)
{
  struct agent_info *agent = get_agent_info (n);
  return alloc_by_agent (agent, size);
}

/* Free memory from device N.  */

bool
GOMP_OFFLOAD_free (int device, void *ptr)
{
  GCN_DEBUG ("Freeing memory on device %d\n", device);

  hsa_status_t status = hsa_fns.hsa_memory_free_fn (ptr);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not free device memory", status);
      return false;
    }

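  /* If OpenACC profiling is active on this thread, dispatch an acc_ev_free
     event for the deallocation.  */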
  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  bool profiling_dispatch_p
    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
  if (profiling_dispatch_p)
    {
      acc_prof_info *prof_info = thr->prof_info;
      acc_event_info data_event_info;
      acc_api_info *api_info = thr->api_info;

      prof_info->event_type = acc_ev_free;

      data_event_info.data_event.event_type = prof_info->event_type;
      data_event_info.data_event.valid_bytes
        = _ACC_DATA_EVENT_INFO_VALID_BYTES;
      data_event_info.data_event.parent_construct
        = acc_construct_parallel;
      data_event_info.data_event.implicit = 1;
      data_event_info.data_event.tool_info = NULL;
      data_event_info.data_event.var_name = NULL;
      data_event_info.data_event.bytes = 0;
      data_event_info.data_event.host_ptr = NULL;
      data_event_info.data_event.device_ptr = (void *) ptr;

      api_info->device_api = acc_device_api_other;

      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
                                            api_info);
    }

  return true;
}

/* Copy data from DEVICE to host.  */

bool
GOMP_OFFLOAD_dev2host (int device, void *dst, const void *src, size_t n)
{
  GCN_DEBUG ("Copying %zu bytes from device %d (%p) to host (%p)\n", n, device,
             src, dst);
  hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
  if (status != HSA_STATUS_SUCCESS)
    {
      GOMP_PLUGIN_error ("memory copy failed");
      return false;
    }
  return true;
}

/* Copy data from host to DEVICE.  */

bool
GOMP_OFFLOAD_host2dev (int device, void *dst, const void *src, size_t n)
{
  GCN_DEBUG ("Copying %zu bytes from host (%p) to device %d (%p)\n", n, src,
             device, dst);
  hsa_memory_copy_wrapper (dst, src, n);
  return true;
}

/* Copy data within DEVICE.  Do the copy asynchronously, if appropriate.  */

bool
GOMP_OFFLOAD_dev2dev (int device, void *dst, const void *src, size_t n)
{
  struct gcn_thread *thread_data = gcn_thread ();

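  /* If the current thread has a non-synchronous async value set, queue the
     copy on the agent's async queue rather than copying synchronously.  */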
  if (thread_data && !async_synchronous_p (thread_data->async))
    {
      struct agent_info *agent = get_agent_info (device);
      maybe_init_omp_async (agent);
      queue_push_copy (agent->omp_async_queue, dst, src, n);
      return true;
    }

  GCN_DEBUG ("Copying %zu bytes from device %d (%p) to device %d (%p)\n", n,
             device, src, device, dst);
  hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
  if (status != HSA_STATUS_SUCCESS)
    {
      GOMP_PLUGIN_error ("memory copy failed");
      return false;
    }
  return true;
}

/* }}}  */
/* {{{ OpenMP Plugin API  */

/* Run a synchronous OpenMP kernel on DEVICE and pass it an array of pointers
   in VARS as a parameter.  The kernel is identified by FN_PTR which must point
   to a kernel_info structure, and must have previously been loaded to the
   specified device.  */

void
GOMP_OFFLOAD_run (int device, void *fn_ptr, void *vars, void **args)
{
  struct agent_info *agent = get_agent_info (device);
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
  struct GOMP_kernel_launch_attributes def;
  struct GOMP_kernel_launch_attributes *kla;
  assert (agent == kernel->agent);

  /* If we get here then the kernel must be OpenMP.  */
  kernel->kind = KIND_OPENMP;

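  /* Decode any launch attributes passed in ARGS (falling back to DEF);
     a zero grid size means there is nothing to launch.  */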
  if (!parse_target_attributes (args, &def, &kla, agent))
    {
      GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
      return;
    }
  run_kernel (kernel, vars, kla, NULL, false);
}

/* Run an asynchronous OpenMP kernel on DEVICE.  This is similar to
   GOMP_OFFLOAD_run except that the launch is queued and there is a call to
   GOMP_PLUGIN_target_task_completion when it has finished.  */

void
GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
                        void **args, void *async_data)
{
  GCN_DEBUG ("GOMP_OFFLOAD_async_run invoked\n");
  struct agent_info *agent = get_agent_info (device);
  struct kernel_info *kernel = (struct kernel_info *) tgt_fn;
  struct GOMP_kernel_launch_attributes def;
  struct GOMP_kernel_launch_attributes *kla;
  assert (agent == kernel->agent);

  /* If we get here then the kernel must be OpenMP.  */
  kernel->kind = KIND_OPENMP;

  if (!parse_target_attributes (args, &def, &kla, agent))
    {
      GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
      return;
    }

  maybe_init_omp_async (agent);
  queue_push_launch (agent->omp_async_queue, kernel, tgt_vars, kla);
  queue_push_callback (agent->omp_async_queue,
                       GOMP_PLUGIN_target_task_completion, async_data);
}

/* }}}  */
/* {{{ OpenACC Plugin API  */

/* Run a synchronous OpenACC kernel.  The device number is inferred from the
   already-loaded KERNEL.  */

void
GOMP_OFFLOAD_openacc_exec (void (*fn_ptr) (void *), size_t mapnum,
                           void **hostaddrs, void **devaddrs, unsigned *dims,
                           void *targ_mem_desc)
{
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;

  gcn_exec (kernel, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, false,
            NULL);
}

/* Run an asynchronous OpenACC kernel on the specified queue.  */

void
GOMP_OFFLOAD_openacc_async_exec (void (*fn_ptr) (void *), size_t mapnum,
                                 void **hostaddrs, void **devaddrs,
                                 unsigned *dims, void *targ_mem_desc,
                                 struct goacc_asyncqueue *aq)
{
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;

  gcn_exec (kernel, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, true,
            aq);
}

/* Create a new asynchronous thread and queue for running future kernels.  */

struct goacc_asyncqueue *
GOMP_OFFLOAD_openacc_async_construct (int device)
{
  struct agent_info *agent = get_agent_info (device);

  pthread_mutex_lock (&agent->async_queues_mutex);

  struct goacc_asyncqueue *aq = GOMP_PLUGIN_malloc (sizeof (*aq));
  aq->agent = get_agent_info (device);
  aq->prev = NULL;
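  /* Link the new queue at the head of the agent's queue list; ids are
     assigned in increasing order starting from 1.  */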
  aq->next = agent->async_queues;
  if (aq->next)
    {
      aq->next->prev = aq;
      aq->id = aq->next->id + 1;
    }
  else
    aq->id = 1;
  agent->async_queues = aq;

  aq->queue_first = 0;
  aq->queue_n = 0;
  aq->drain_queue_stop = 0;

  if (pthread_mutex_init (&aq->mutex, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
      pthread_mutex_unlock (&agent->async_queues_mutex);
      return NULL;
    }
  if (pthread_cond_init (&aq->queue_cond_in, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
      pthread_mutex_unlock (&agent->async_queues_mutex);
      return NULL;
    }
  if (pthread_cond_init (&aq->queue_cond_out, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
      pthread_mutex_unlock (&agent->async_queues_mutex);
      return NULL;
    }

  hsa_status_t status = hsa_fns.hsa_queue_create_fn (agent->id,
                                                     ASYNC_QUEUE_SIZE,
                                                     HSA_QUEUE_TYPE_MULTI,
                                                     hsa_queue_callback, NULL,
                                                     UINT32_MAX, UINT32_MAX,
                                                     &aq->hsa_queue);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Error creating command queue", status);

  int err = pthread_create (&aq->thread_drain_queue, NULL, &drain_queue, aq);
  if (err != 0)
    GOMP_PLUGIN_fatal ("GCN asynchronous thread creation failed: %s",
                       strerror (err));
  GCN_DEBUG ("Async thread %d:%d: created\n", aq->agent->device_id,
             aq->id);

  pthread_mutex_unlock (&agent->async_queues_mutex);

  return aq;
}

/* Destroy an existing asynchronous thread and queue.  Waits for any
   currently-running task to complete, but cancels any queued tasks.  */

bool
GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = aq->agent;

  finalize_async_thread (aq);

  pthread_mutex_lock (&agent->async_queues_mutex);

  int err;
  if ((err = pthread_mutex_destroy (&aq->mutex)))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue mutex: %d", err);
      goto fail;
    }
  if (pthread_cond_destroy (&aq->queue_cond_in))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
      goto fail;
    }
  if (pthread_cond_destroy (&aq->queue_cond_out))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
      goto fail;
    }
  hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (aq->hsa_queue);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Error destroying command queue", status);
      goto fail;
    }

  if (aq->prev)
    aq->prev->next = aq->next;
  if (aq->next)
    aq->next->prev = aq->prev;
  if (agent->async_queues == aq)
    agent->async_queues = aq->next;

  GCN_DEBUG ("Async thread %d:%d: destroyed\n", agent->device_id, aq->id);

  free (aq);
  pthread_mutex_unlock (&agent->async_queues_mutex);
  return true;

fail:
  pthread_mutex_unlock (&agent->async_queues_mutex);
  return false;
}

/* Return true if the specified async queue is currently empty.  */

int
GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
{
  return queue_empty (aq);
}

/* Block until the specified queue has executed all its tasks and the
   queue is empty.  */

bool
GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
{
  wait_queue (aq);
  return true;
}

/* Add a serialization point across two async queues.  Any new tasks added to
   AQ2, after this call, will not run until all tasks on AQ1, at the time
   of this call, have completed.  */

bool
GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
                                      struct goacc_asyncqueue *aq2)
{
  /* For serialize, stream aq2 waits for aq1 to complete work that has been
     scheduled to run on it up to this point.  */
  if (aq1 != aq2)
    {
      struct placeholder *placeholderp = queue_push_placeholder (aq1);
      queue_push_asyncwait (aq2, placeholderp);
    }
  return true;
}

/* Add an opaque callback to the given async queue.  */

void
GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
                                           void (*fn) (void *), void *data)
{
  queue_push_callback (aq, fn, data);
}

/* Queue up an asynchronous data copy from host to DEVICE.  */

bool
GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
                                     size_t n, struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = get_agent_info (device);
  assert (agent == aq->agent);
  queue_push_copy (aq, dst, src, n);
  return true;
}

/* Queue up an asynchronous data copy from DEVICE to host.  */

bool
GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src,
                                     size_t n, struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = get_agent_info (device);
  assert (agent == aq->agent);
  queue_push_copy (aq, dst, src, n);
  return true;
}

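/* Return the value of property PROP for agent number DEVICE.  */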
union goacc_property_value
GOMP_OFFLOAD_openacc_get_property (int device, enum goacc_property prop)
{
  struct agent_info *agent = get_agent_info (device);

  union goacc_property_value propval = { .val = 0 };

  switch (prop)
    {
    case GOACC_PROPERTY_FREE_MEMORY:
      /* Not supported.  */
      break;
    case GOACC_PROPERTY_MEMORY:
      {
        size_t size;
        hsa_region_t region = agent->data_region;
        hsa_status_t status =
          hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
        if (status == HSA_STATUS_SUCCESS)
          propval.val = size;
        break;
      }
    case GOACC_PROPERTY_NAME:
      propval.ptr = agent->name;
      break;
    case GOACC_PROPERTY_VENDOR:
      propval.ptr = agent->vendor_name;
      break;
    case GOACC_PROPERTY_DRIVER:
      propval.ptr = hsa_context.driver_version_s;
      break;
    }

  return propval;
}

/* Set up plugin-specific thread-local-data (host-side).  */

void *
GOMP_OFFLOAD_openacc_create_thread_data (int ord __attribute__((unused)))
{
  struct gcn_thread *thread_data
    = GOMP_PLUGIN_malloc (sizeof (struct gcn_thread));

  thread_data->async = GOMP_ASYNC_SYNC;

  return (void *) thread_data;
}

/* Clean up plugin-specific thread-local-data.  */

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}

/* }}}  */