]> git.ipfire.org Git - thirdparty/gcc.git/blame - libgomp/plugin/plugin-gcn.c
amdgcn, libgomp: Manually allocated stacks
[thirdparty/gcc.git] / libgomp / plugin / plugin-gcn.c
CommitLineData
237957cc
AS
1/* Plugin for AMD GCN execution.
2
83ffe9cd 3 Copyright (C) 2013-2023 Free Software Foundation, Inc.
237957cc
AS
4
5 Contributed by Mentor Embedded
6
7 This file is part of the GNU Offloading and Multi Processing Library
8 (libgomp).
9
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
13 any later version.
14
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
18 more details.
19
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
23
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
28
29/* {{{ Includes and defines */
30
31#include "config.h"
0bac793e 32#include "symcat.h"
237957cc
AS
33#include <stdio.h>
34#include <stdlib.h>
35#include <string.h>
36#include <pthread.h>
37#include <inttypes.h>
38#include <stdbool.h>
39#include <limits.h>
40#include <hsa.h>
85f0a4d9 41#include <hsa_ext_amd.h>
237957cc
AS
42#include <dlfcn.h>
43#include <signal.h>
44#include "libgomp-plugin.h"
8c05d8cd 45#include "config/gcn/libgomp-gcn.h" /* For struct output. */
237957cc
AS
46#include "gomp-constants.h"
47#include <elf.h>
48#include "oacc-plugin.h"
49#include "oacc-int.h"
50#include <assert.h>
51
237957cc 52/* These probably won't be in elf.h for a while. */
97981e13 53#ifndef R_AMDGPU_NONE
237957cc
AS
54#define R_AMDGPU_NONE 0
55#define R_AMDGPU_ABS32_LO 1 /* (S + A) & 0xFFFFFFFF */
56#define R_AMDGPU_ABS32_HI 2 /* (S + A) >> 32 */
57#define R_AMDGPU_ABS64 3 /* S + A */
58#define R_AMDGPU_REL32 4 /* S + A - P */
59#define R_AMDGPU_REL64 5 /* S + A - P */
60#define R_AMDGPU_ABS32 6 /* S + A */
61#define R_AMDGPU_GOTPCREL 7 /* G + GOT + A - P */
62#define R_AMDGPU_GOTPCREL32_LO 8 /* (G + GOT + A - P) & 0xFFFFFFFF */
63#define R_AMDGPU_GOTPCREL32_HI 9 /* (G + GOT + A - P) >> 32 */
64#define R_AMDGPU_REL32_LO 10 /* (S + A - P) & 0xFFFFFFFF */
65#define R_AMDGPU_REL32_HI 11 /* (S + A - P) >> 32 */
237957cc 66#define R_AMDGPU_RELATIVE64 13 /* B + A */
97981e13 67#endif
237957cc
AS
68
69/* GCN specific definitions for asynchronous queues. */
70
71#define ASYNC_QUEUE_SIZE 64
72#define DRAIN_QUEUE_SYNCHRONOUS_P false
73#define DEBUG_QUEUES 0
74#define DEBUG_THREAD_SLEEP 0
75#define DEBUG_THREAD_SIGNAL 0
76
77/* Defaults. */
78#define DEFAULT_GCN_HEAP_SIZE (100*1024*1024) /* 100MB. */
79
80/* Secure getenv() which returns NULL if running as SUID/SGID. */
81#ifndef HAVE_SECURE_GETENV
82#ifdef HAVE___SECURE_GETENV
83#define secure_getenv __secure_getenv
84#elif defined (HAVE_UNISTD_H) && defined(HAVE_GETUID) && defined(HAVE_GETEUID) \
85 && defined(HAVE_GETGID) && defined(HAVE_GETEGID)
86
87#include <unistd.h>
88
89/* Implementation of secure_getenv() for targets where it is not provided but
90 we have at least means to test real and effective IDs. */
91
/* Fallback implementation of secure_getenv for targets that lack it:
   only consult the environment when the real and effective user and
   group IDs match, so a SUID/SGID process cannot be influenced by
   attacker-controlled environment variables.  */

static char *
secure_getenv (const char *name)
{
  bool ids_match = (getuid () == geteuid ()) && (getgid () == getegid ());
  return ids_match ? getenv (name) : NULL;
}
100
101#else
102#define secure_getenv getenv
103#endif
104#endif
105
106/* }}} */
107/* {{{ Types */
108
93d90219 109/* GCN-specific implementation of the GOMP_PLUGIN_acc_thread data. */
237957cc
AS
110
struct gcn_thread
{
  /* The thread number from the async clause, or GOMP_ASYNC_SYNC.  */
  int async;
};
116
117/* As an HSA runtime is dlopened, following structure defines function
118 pointers utilized by the HSA plug-in. */
119
struct hsa_runtime_fn_info
{
  /* HSA runtime entry points, resolved at run time from the dlopened
     HSA runtime library (see the <dlfcn.h> include above).  Each field
     mirrors the like-named hsa_* / hsa_ext_* function.  */
  hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
					const char **status_string);
  hsa_status_t (*hsa_system_get_info_fn) (hsa_system_info_t attribute,
					  void *value);
  hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
					 hsa_agent_info_t attribute,
					 void *value);
  hsa_status_t (*hsa_isa_get_info_fn)(hsa_isa_t isa,
				      hsa_isa_info_t attribute,
				      uint32_t index,
				      void *value);
  hsa_status_t (*hsa_init_fn) (void);
  hsa_status_t (*hsa_iterate_agents_fn)
    (hsa_status_t (*callback)(hsa_agent_t agent, void *data), void *data);
  hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
					  hsa_region_info_t attribute,
					  void *value);
  hsa_status_t (*hsa_queue_create_fn)
    (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
     void (*callback)(hsa_status_t status, hsa_queue_t *source, void *data),
     void *data, uint32_t private_segment_size,
     uint32_t group_segment_size, hsa_queue_t **queue);
  hsa_status_t (*hsa_agent_iterate_regions_fn)
    (hsa_agent_t agent,
     hsa_status_t (*callback)(hsa_region_t region, void *data), void *data);
  hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
  hsa_status_t (*hsa_executable_create_fn)
    (hsa_profile_t profile, hsa_executable_state_t executable_state,
     const char *options, hsa_executable_t *executable);
  hsa_status_t (*hsa_executable_global_variable_define_fn)
    (hsa_executable_t executable, const char *variable_name, void *address);
  hsa_status_t (*hsa_executable_load_code_object_fn)
    (hsa_executable_t executable, hsa_agent_t agent,
     hsa_code_object_t code_object, const char *options);
  hsa_status_t (*hsa_executable_freeze_fn)(hsa_executable_t executable,
					   const char *options);
  hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
					uint32_t num_consumers,
					const hsa_agent_t *consumers,
					hsa_signal_t *signal);
  hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
					  void **ptr);
  hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent,
					      hsa_access_permission_t access);
  hsa_status_t (*hsa_memory_copy_fn)(void *dst, const void *src, size_t size);
  hsa_status_t (*hsa_memory_free_fn) (void *ptr);
  hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
  hsa_status_t (*hsa_executable_get_symbol_fn)
    (hsa_executable_t executable, const char *module_name,
     const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
     hsa_executable_symbol_t *symbol);
  hsa_status_t (*hsa_executable_symbol_get_info_fn)
    (hsa_executable_symbol_t executable_symbol,
     hsa_executable_symbol_info_t attribute, void *value);
  hsa_status_t (*hsa_executable_iterate_symbols_fn)
    (hsa_executable_t executable,
     hsa_status_t (*callback)(hsa_executable_t executable,
			      hsa_executable_symbol_t symbol, void *data),
     void *data);
  uint64_t (*hsa_queue_add_write_index_release_fn) (const hsa_queue_t *queue,
						    uint64_t value);
  uint64_t (*hsa_queue_load_read_index_acquire_fn) (const hsa_queue_t *queue);
  void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
				       hsa_signal_value_t value);
  void (*hsa_signal_store_release_fn) (hsa_signal_t signal,
				       hsa_signal_value_t value);
  hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
    (hsa_signal_t signal, hsa_signal_condition_t condition,
     hsa_signal_value_t compare_value, uint64_t timeout_hint,
     hsa_wait_state_t wait_state_hint);
  hsa_signal_value_t (*hsa_signal_load_acquire_fn) (hsa_signal_t signal);
  hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);

  hsa_status_t (*hsa_code_object_deserialize_fn)
    (void *serialized_code_object, size_t serialized_code_object_size,
     const char *options, hsa_code_object_t *code_object);
};
200
201/* Structure describing the run-time and grid properties of an HSA kernel
202 lauch. This needs to match the format passed to GOMP_OFFLOAD_run. */
203
struct GOMP_kernel_launch_attributes
{
  /* Number of dimensions the workload has.  Maximum number is 3.  */
  uint32_t ndim;
  /* Size of the grid in the three respective dimensions.  */
  uint32_t gdims[3];
  /* Size of work-groups in the respective dimensions.  */
  uint32_t wdims[3];
};
213
214/* Collection of information needed for a dispatch of a kernel from a
215 kernel. */
216
struct kernel_dispatch
{
  /* The agent (device) the kernel will run on.  */
  struct agent_info *agent;
  /* Pointer to a command queue associated with a kernel dispatch agent.  */
  void *queue;
  /* Pointer to a memory space used for kernel arguments passing.  */
  void *kernarg_address;
  /* Kernel object.  */
  uint64_t object;
  /* Synchronization signal used for dispatch synchronization.  */
  uint64_t signal;
  /* Private segment size.  */
  uint32_t private_segment_size;
  /* Group segment size.  */
  uint32_t group_segment_size;
};
233
234/* Structure of the kernargs segment, supporting console output.
235
236 This needs to match the definitions in Newlib, and the expectations
237 in libgomp target code. */
238
struct kernargs {
  /* Kernel-agnostic ABI data (heap/arena/stack pointers and sizes).
     The layout comes from config/gcn/libgomp-gcn.h, included above.  */
  struct kernargs_abi abi;

  /* Output data, used for console output from the device (struct output
     is defined in libgomp-gcn.h).  */
  struct output output_data;
};
245
246/* A queue entry for a future asynchronous launch. */
247
struct kernel_launch
{
  /* The kernel to launch.  */
  struct kernel_info *kernel;
  /* The kernel's argument block (device-visible).  */
  void *vars;
  /* Grid/work-group geometry for the launch.  */
  struct GOMP_kernel_launch_attributes kla;
};
254
255/* A queue entry for a future callback. */
256
struct callback
{
  /* Function to invoke when this queue entry is reached.  */
  void (*fn)(void *);
  /* Opaque argument passed to FN.  */
  void *data;
};
262
263/* A data struct for the copy_data callback. */
264
struct copy_data
{
  /* Destination address.  */
  void *dst;
  /* Source address.  */
  const void *src;
  /* Number of bytes to copy.  */
  size_t len;
  /* The async queue on which the copy was requested.  */
  struct goacc_asyncqueue *aq;
};
272
273/* A queue entry for a placeholder. These correspond to a wait event. */
274
struct placeholder
{
  /* Set non-zero once the placeholder has been reached/executed; COND is
     signalled under MUTEX so waiters can observe the transition.  */
  int executed;
  pthread_cond_t cond;
  pthread_mutex_t mutex;
};
281
282/* A queue entry for a wait directive. */
283
struct asyncwait_info
{
  /* The placeholder (in another queue) this wait entry blocks on.  */
  struct placeholder *placeholderp;
};
288
289/* Encode the type of an entry in an async queue. */
290
/* Discriminator for the union inside struct queue_entry.  */

enum entry_type
{
  KERNEL_LAUNCH,
  CALLBACK,
  ASYNC_WAIT,
  ASYNC_PLACEHOLDER
};
298
299/* An entry in an async queue. */
300
struct queue_entry
{
  /* Selects which member of U is active.  */
  enum entry_type type;
  union {
    struct kernel_launch launch;	/* TYPE == KERNEL_LAUNCH.  */
    struct callback callback;		/* TYPE == CALLBACK.  */
    struct asyncwait_info asyncwait;	/* TYPE == ASYNC_WAIT.  */
    struct placeholder placeholder;	/* TYPE == ASYNC_PLACEHOLDER.  */
  } u;
};
311
312/* An async queue header.
313
314 OpenMP may create one of these.
315 OpenACC may create many. */
316
struct goacc_asyncqueue
{
  /* The agent (device) this queue belongs to.  */
  struct agent_info *agent;
  /* The underlying HSA hardware queue used for dispatch.  */
  hsa_queue_t *hsa_queue;

  /* Thread that drains this queue; QUEUE is a fixed-size ring buffer
     protected by MUTEX, with the two condition variables used for
     producer (in) and consumer (out) wakeups.  */
  pthread_t thread_drain_queue;
  pthread_mutex_t mutex;
  pthread_cond_t queue_cond_in;
  pthread_cond_t queue_cond_out;
  struct queue_entry queue[ASYNC_QUEUE_SIZE];
  int queue_first;
  int queue_n;
  /* Set non-zero to ask the drain thread to terminate.  */
  int drain_queue_stop;

  /* Identifier and doubly-linked list of the agent's queues.  */
  int id;
  struct goacc_asyncqueue *prev;
  struct goacc_asyncqueue *next;
};
335
336/* Mkoffload uses this structure to describe a kernel.
337
338 OpenMP kernel dimensions are passed at runtime.
339 OpenACC kernel dimensions are passed at compile time, here. */
340
struct hsa_kernel_description
{
  /* Name of the kernel symbol.  */
  const char *name;
  int oacc_dims[3];  /* Only present for GCN kernels.  */
  /* Scalar register count used by the kernel.  */
  int sgpr_count;
  /* Vector register count.  NOTE(review): the name looks like a
     transposition of "vgpr_count", but this struct is generated by
     mkoffload, so it cannot be renamed here alone.  */
  int vpgr_count;
};
348
349/* Mkoffload uses this structure to describe an offload variable. */
350
struct global_var_info
{
  /* Symbol name of the offload variable.  */
  const char *name;
  /* Host address of the variable.  */
  void *address;
};
356
357/* Mkoffload uses this structure to describe all the kernels in a
358 loadable module. These are passed the libgomp via static constructors. */
359
struct gcn_image_desc
{
  /* The ELF code object and its size in bytes.  */
  struct gcn_image {
    size_t size;
    void *image;
  } *gcn_image;
  /* Number of entries in KERNEL_INFOS.  */
  const unsigned kernel_count;
  struct hsa_kernel_description *kernel_infos;
  /* Number of global offload variables in the module.  */
  const unsigned global_variable_count;
};
370
7d593fd6
FH
371/* This enum mirrors the corresponding LLVM enum's values for all ISAs that we
372 support.
373 See https://llvm.org/docs/AMDGPUUsage.html#amdgpu-ef-amdgpu-mach-table */
374
typedef enum {
  EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a,
  EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c,
  EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f,
  EF_AMDGPU_MACH_AMDGCN_GFX908 = 0x030,
  EF_AMDGPU_MACH_AMDGCN_GFX90a = 0x03f
} EF_AMDGPU_MACH;

/* Mask for the EF_AMDGPU_MACH field within ELF e_flags.  */
const static int EF_AMDGPU_MACH_MASK = 0x000000ff;
typedef EF_AMDGPU_MACH gcn_isa;
385
237957cc
AS
386/* Description of an HSA GPU agent (device) and the program associated with
387 it. */
388
struct agent_info
{
  /* The HSA ID of the agent.  Assigned when hsa_context is initialized.  */
  hsa_agent_t id;
  /* The user-visible device number.  */
  int device_id;
  /* Whether the agent has been initialized.  The fields below are usable only
     if it has been.  */
  bool initialized;

  /* The instruction set architecture of the device.  */
  gcn_isa device_isa;
  /* Name of the agent.  */
  char name[64];
  /* Name of the vendor of the agent.  */
  char vendor_name[64];
  /* Command queues of the agent.  */
  hsa_queue_t *sync_queue;
  struct goacc_asyncqueue *async_queues, *omp_async_queue;
  pthread_mutex_t async_queues_mutex;

  /* The HSA memory region from which to allocate kernel arguments.  */
  hsa_region_t kernarg_region;

  /* The HSA memory region from which to allocate device data.  */
  hsa_region_t data_region;

  /* Allocated ephemeral memories (team arena and stack space).  */
  struct ephemeral_memories_list *ephemeral_memories_list;
  pthread_mutex_t ephemeral_memories_write_lock;

  /* Read-write lock that protects kernels which are running or about to be run
     from interference with loading and unloading of images.  Needs to be
     locked for reading while a kernel is being run, and for writing if the
     list of modules is manipulated (and thus the HSA program invalidated).  */
  pthread_rwlock_t module_rwlock;

  /* The module associated with this kernel.  */
  struct module_info *module;

  /* Mutex enforcing that only one thread will finalize the HSA program.  A
     thread should have locked agent->module_rwlock for reading before
     acquiring it.  */
  pthread_mutex_t prog_mutex;
  /* Flag whether the HSA program that consists of all the modules has been
     finalized.  */
  bool prog_finalized;
  /* HSA executable - the finalized program that is used to locate kernels.  */
  hsa_executable_t executable;
};
439
440/* Information required to identify, finalize and run any given kernel. */
441
442enum offload_kind {KIND_UNKNOWN, KIND_OPENMP, KIND_OPENACC};
443
struct kernel_info
{
  /* Name of the kernel, required to locate it within the GCN object-code
     module.  */
  const char *name;
  /* The specific agent the kernel has been or will be finalized for and run
     on.  */
  struct agent_info *agent;
  /* The specific module where the kernel takes place.  */
  struct module_info *module;
  /* Information provided by mkoffload associated with the kernel.  */
  struct hsa_kernel_description *description;
  /* Mutex enforcing that at most one thread ever initializes a kernel for
     use.  A thread should have locked agent->module_rwlock for reading before
     acquiring it.  */
  pthread_mutex_t init_mutex;
  /* Flag indicating whether the kernel has been initialized and all fields
     below it contain valid data.  */
  bool initialized;
  /* Flag indicating that the kernel has a problem that blocks an execution.  */
  bool initialization_failed;
  /* The object to be put into the dispatch queue.  */
  uint64_t object;
  /* Required size of kernel arguments.  */
  uint32_t kernarg_segment_size;
  /* Required size of group segment.  */
  uint32_t group_segment_size;
  /* Required size of private segment.  */
  uint32_t private_segment_size;
  /* Set up for OpenMP or OpenACC?  */
  enum offload_kind kind;
};
476
477/* Information about a particular GCN module, its image and kernels. */
478
struct module_info
{
  /* The description with which the program has registered the image.  */
  struct gcn_image_desc *image_desc;
  /* GCN heap allocation.  */
  struct heap *heap;
  /* Physical boundaries of the loaded module.  */
  Elf64_Addr phys_address_start;
  Elf64_Addr phys_address_end;

  /* Whether the module's constructors have already been executed.  */
  bool constructors_run_p;
  /* Kernels implementing the module's init/fini arrays, if any.  */
  struct kernel_info *init_array_func, *fini_array_func;

  /* Number of kernels in this module.  */
  int kernel_count;
  /* An array of kernel_info structures describing each kernel in this
     module (flexible array member; allocated with the struct).  */
  struct kernel_info kernels[];
};
498
499/* A linked list of memory arenas allocated on the device.
f6fff8a6
AS
500 These are used by OpenMP, as a means to optimize per-team malloc,
501 and for host-accessible stack space. */
237957cc 502
f6fff8a6 503struct ephemeral_memories_list
237957cc 504{
f6fff8a6 505 struct ephemeral_memories_list *next;
237957cc 506
f6fff8a6
AS
507 /* The size is determined by the number of teams and threads. */
508 size_t size;
509 /* The device address allocated memory. */
510 void *address;
511 /* A flag to prevent two asynchronous kernels trying to use the same memory.
237957cc
AS
512 The mutex is locked until the kernel exits. */
513 pthread_mutex_t in_use;
514};
515
516/* Information about the whole HSA environment and all of its agents. */
517
struct hsa_context_info
{
  /* Whether the structure has been initialized.  */
  bool initialized;
  /* Number of usable GPU HSA agents in the system.  */
  int agent_count;
  /* Array of agent_info structures describing the individual HSA agents.  */
  struct agent_info *agents;
  /* Driver version string.  */
  char driver_version_s[30];
};
529
237957cc
AS
530/* }}} */
531/* {{{ Global variables */
532
/* Information about the whole HSA environment and all of its agents.  */

static struct hsa_context_info hsa_context;

/* HSA runtime functions that are initialized in init_hsa_context.  */

static struct hsa_runtime_fn_info hsa_fns;

/* Heap space, allocated target-side, provided for use of newlib malloc.
   Each module should have its own heap allocated.
   Beware that heap usage increases with OpenMP teams.  See also arenas.  */

static size_t gcn_kernel_heap_size = DEFAULT_GCN_HEAP_SIZE;

/* Ephemeral memory sizes for each kernel launch.  */

static int team_arena_size = DEFAULT_TEAM_ARENA_SIZE;
static int stack_size = DEFAULT_GCN_STACK_SIZE;

/* Flag to decide whether print to stderr information about what is going on.
   Set in init_debug depending on environment variables.  */

static bool debug;

/* Flag to decide if the runtime should suppress a possible fallback to host
   execution.  */

static bool suppress_host_fallback;

/* Flag to locate HSA runtime shared library that is dlopened
   by this plug-in.  */

static const char *hsa_runtime_lib;

/* Flag to decide if the runtime should support also CPU devices (can be
   a simulator).  */

static bool support_cpu_devices;

/* Runtime dimension overrides.  Zero indicates default.  */

static int override_x_dim = 0;
static int override_z_dim = 0;
576
577/* }}} */
578/* {{{ Debug & Diagnostic */
579
/* Print a message to stderr if GCN_DEBUG value is set to true.
   Wrapped in do {...} while (false) -- with NO trailing semicolon --
   so that an invocation followed by ';' forms exactly one statement
   and the macro is safe in unbraced if/else bodies.  (The previous
   definition ended in "while (false);", which left a stray empty
   statement after every use and made "if (c) DEBUG_PRINT (...); else"
   a syntax error.)  */

#define DEBUG_PRINT(...) \
  do \
    { \
      if (debug) \
	{ \
	  fprintf (stderr, __VA_ARGS__); \
	} \
    } \
  while (false)
591
/* Flush stderr if GCN_DEBUG value is set to true.  */

#define DEBUG_FLUSH()				\
  do {						\
    if (debug)					\
      fflush (stderr);				\
  } while (false)

/* Print a logging message with PREFIX to stderr if GCN_DEBUG value
   is set to true.  */

#define DEBUG_LOG(prefix, ...)			\
  do						\
    {						\
      DEBUG_PRINT (prefix);			\
      DEBUG_PRINT (__VA_ARGS__);		\
      DEBUG_FLUSH ();				\
    } while (false)

/* Print a debugging message to stderr.  */

#define GCN_DEBUG(...) DEBUG_LOG ("GCN debug: ", __VA_ARGS__)

/* Print a warning message to stderr.  */

#define GCN_WARNING(...) DEBUG_LOG ("GCN warning: ", __VA_ARGS__)
618
619/* Print HSA warning STR with an HSA STATUS code. */
620
621static void
622hsa_warn (const char *str, hsa_status_t status)
623{
624 if (!debug)
625 return;
626
627 const char *hsa_error_msg = "[unknown]";
628 hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
629
630 fprintf (stderr, "GCN warning: %s\nRuntime message: %s\n", str,
631 hsa_error_msg);
632}
633
634/* Report a fatal error STR together with the HSA error corresponding to STATUS
635 and terminate execution of the current process. */
636
637static void
638hsa_fatal (const char *str, hsa_status_t status)
639{
640 const char *hsa_error_msg = "[unknown]";
641 hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
642 GOMP_PLUGIN_fatal ("GCN fatal error: %s\nRuntime message: %s\n", str,
643 hsa_error_msg);
644}
645
646/* Like hsa_fatal, except only report error message, and return FALSE
647 for propagating error processing to outside of plugin. */
648
649static bool
650hsa_error (const char *str, hsa_status_t status)
651{
652 const char *hsa_error_msg = "[unknown]";
653 hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
654 GOMP_PLUGIN_error ("GCN fatal error: %s\nRuntime message: %s\n", str,
655 hsa_error_msg);
656 return false;
657}
658
659/* Dump information about the available hardware. */
660
static void
dump_hsa_system_info (void)
{
  hsa_status_t status;

  /* Report the system's endianness.  */
  hsa_endianness_t endianness;
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_ENDIANNESS,
					   &endianness);
  if (status == HSA_STATUS_SUCCESS)
    switch (endianness)
      {
      case HSA_ENDIANNESS_LITTLE:
	GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: LITTLE\n");
	break;
      case HSA_ENDIANNESS_BIG:
	GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: BIG\n");
	break;
      default:
	GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: UNKNOWN\n");
      }
  else
    GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: FAILED\n");

  /* Report supported extensions; only the IMAGES bit is checked here.  */
  uint8_t extensions[128];
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_EXTENSIONS,
					   &extensions);
  if (status == HSA_STATUS_SUCCESS)
    {
      if (extensions[0] & (1 << HSA_EXTENSION_IMAGES))
	GCN_DEBUG ("HSA_SYSTEM_INFO_EXTENSIONS: IMAGES\n");
    }
  else
    GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n");
}
695
696/* Dump information about the available hardware. */
697
698static void
699dump_machine_model (hsa_machine_model_t machine_model, const char *s)
700{
701 switch (machine_model)
702 {
703 case HSA_MACHINE_MODEL_SMALL:
704 GCN_DEBUG ("%s: SMALL\n", s);
705 break;
706 case HSA_MACHINE_MODEL_LARGE:
707 GCN_DEBUG ("%s: LARGE\n", s);
708 break;
709 default:
710 GCN_WARNING ("%s: UNKNOWN\n", s);
711 break;
712 }
713}
714
715/* Dump information about the available hardware. */
716
717static void
718dump_profile (hsa_profile_t profile, const char *s)
719{
720 switch (profile)
721 {
722 case HSA_PROFILE_FULL:
723 GCN_DEBUG ("%s: FULL\n", s);
724 break;
725 case HSA_PROFILE_BASE:
726 GCN_DEBUG ("%s: BASE\n", s);
727 break;
728 default:
729 GCN_WARNING ("%s: UNKNOWN\n", s);
730 break;
731 }
732}
733
734/* Dump information about a device memory region. */
735
736static hsa_status_t
737dump_hsa_region (hsa_region_t region, void *data __attribute__((unused)))
738{
739 hsa_status_t status;
740
741 hsa_region_segment_t segment;
742 status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
743 &segment);
744 if (status == HSA_STATUS_SUCCESS)
745 {
746 if (segment == HSA_REGION_SEGMENT_GLOBAL)
747 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GLOBAL\n");
748 else if (segment == HSA_REGION_SEGMENT_READONLY)
749 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: READONLY\n");
750 else if (segment == HSA_REGION_SEGMENT_PRIVATE)
751 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: PRIVATE\n");
752 else if (segment == HSA_REGION_SEGMENT_GROUP)
753 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GROUP\n");
754 else
755 GCN_WARNING ("HSA_REGION_INFO_SEGMENT: UNKNOWN\n");
756 }
757 else
758 GCN_WARNING ("HSA_REGION_INFO_SEGMENT: FAILED\n");
759
760 if (segment == HSA_REGION_SEGMENT_GLOBAL)
761 {
762 uint32_t flags;
763 status
764 = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
765 &flags);
766 if (status == HSA_STATUS_SUCCESS)
767 {
768 if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
769 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: KERNARG\n");
770 if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED)
771 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: FINE_GRAINED\n");
772 if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED)
773 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: COARSE_GRAINED\n");
774 }
775 else
776 GCN_WARNING ("HSA_REGION_INFO_GLOBAL_FLAGS: FAILED\n");
777 }
778
779 size_t size;
780 status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
781 if (status == HSA_STATUS_SUCCESS)
782 GCN_DEBUG ("HSA_REGION_INFO_SIZE: %zu\n", size);
783 else
784 GCN_WARNING ("HSA_REGION_INFO_SIZE: FAILED\n");
785
786 status
787 = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_ALLOC_MAX_SIZE,
788 &size);
789 if (status == HSA_STATUS_SUCCESS)
790 GCN_DEBUG ("HSA_REGION_INFO_ALLOC_MAX_SIZE: %zu\n", size);
791 else
792 GCN_WARNING ("HSA_REGION_INFO_ALLOC_MAX_SIZE: FAILED\n");
793
794 bool alloc_allowed;
795 status
796 = hsa_fns.hsa_region_get_info_fn (region,
797 HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED,
798 &alloc_allowed);
799 if (status == HSA_STATUS_SUCCESS)
800 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: %u\n", alloc_allowed);
801 else
802 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: FAILED\n");
803
804 if (status != HSA_STATUS_SUCCESS || !alloc_allowed)
805 return HSA_STATUS_SUCCESS;
806
807 status
808 = hsa_fns.hsa_region_get_info_fn (region,
809 HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE,
810 &size);
811 if (status == HSA_STATUS_SUCCESS)
812 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: %zu\n", size);
813 else
814 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: FAILED\n");
815
816 size_t align;
817 status
818 = hsa_fns.hsa_region_get_info_fn (region,
819 HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT,
820 &align);
821 if (status == HSA_STATUS_SUCCESS)
822 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: %zu\n", align);
823 else
824 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: FAILED\n");
825
826 return HSA_STATUS_SUCCESS;
827}
828
829/* Dump information about all the device memory regions. */
830
831static void
832dump_hsa_regions (hsa_agent_t agent)
833{
834 hsa_status_t status;
835 status = hsa_fns.hsa_agent_iterate_regions_fn (agent,
836 dump_hsa_region,
837 NULL);
838 if (status != HSA_STATUS_SUCCESS)
839 hsa_error ("Dumping hsa regions failed", status);
840}
841
842/* Dump information about the available devices. */
843
/* Dump information about the available devices.  Callback for
   hsa_iterate_agents; each queried attribute is logged individually,
   with a warning (rather than an abort) on query failure.  Always
   returns success so iteration continues.  */

static hsa_status_t
dump_hsa_agent_info (hsa_agent_t agent, void *data __attribute__((unused)))
{
  hsa_status_t status;

  /* 64 bytes is the size HSA specifies for the name attributes.  */
  char buf[64];
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME,
					  &buf);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_NAME: %s\n", buf);
  else
    GCN_WARNING ("HSA_AGENT_INFO_NAME: FAILED\n");

  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_VENDOR_NAME,
					  &buf);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_VENDOR_NAME: %s\n", buf);
  else
    GCN_WARNING ("HSA_AGENT_INFO_VENDOR_NAME: FAILED\n");

  hsa_machine_model_t machine_model;
  status
    = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_MACHINE_MODEL,
				     &machine_model);
  if (status == HSA_STATUS_SUCCESS)
    dump_machine_model (machine_model, "HSA_AGENT_INFO_MACHINE_MODEL");
  else
    GCN_WARNING ("HSA_AGENT_INFO_MACHINE_MODEL: FAILED\n");

  hsa_profile_t profile;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_PROFILE,
					  &profile);
  if (status == HSA_STATUS_SUCCESS)
    dump_profile (profile, "HSA_AGENT_INFO_PROFILE");
  else
    GCN_WARNING ("HSA_AGENT_INFO_PROFILE: FAILED\n");

  hsa_device_type_t device_type;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
					  &device_type);
  if (status == HSA_STATUS_SUCCESS)
    {
      switch (device_type)
	{
	case HSA_DEVICE_TYPE_CPU:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: CPU\n");
	  break;
	case HSA_DEVICE_TYPE_GPU:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: GPU\n");
	  break;
	case HSA_DEVICE_TYPE_DSP:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: DSP\n");
	  break;
	default:
	  GCN_WARNING ("HSA_AGENT_INFO_DEVICE: UNKNOWN\n");
	  break;
	}
    }
  else
    GCN_WARNING ("HSA_AGENT_INFO_DEVICE: FAILED\n");

  /* This attribute is an AMD vendor extension (hsa_ext_amd.h).  */
  uint32_t cu_count;
  status = hsa_fns.hsa_agent_get_info_fn
    (agent, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: %u\n", cu_count);
  else
    GCN_WARNING ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: FAILED\n");

  uint32_t size;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_WAVEFRONT_SIZE,
					  &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WAVEFRONT_SIZE: %u\n", size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WAVEFRONT_SIZE: FAILED\n");

  uint32_t max_dim;
  status = hsa_fns.hsa_agent_get_info_fn (agent,
					  HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
					  &max_dim);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: %u\n", max_dim);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: FAILED\n");

  uint32_t max_size;
  status = hsa_fns.hsa_agent_get_info_fn (agent,
					  HSA_AGENT_INFO_WORKGROUP_MAX_SIZE,
					  &max_size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: %u\n", max_size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: FAILED\n");

  uint32_t grid_max_dim;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_DIM,
					  &grid_max_dim);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_DIM: %u\n", grid_max_dim);
  else
    GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_DIM: FAILED\n");

  uint32_t grid_max_size;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_SIZE,
					  &grid_max_size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_SIZE: %u\n", grid_max_size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_SIZE: FAILED\n");

  dump_hsa_regions (agent);

  return HSA_STATUS_SUCCESS;
}
959
960/* Forward reference. */
961
962static char *get_executable_symbol_name (hsa_executable_symbol_t symbol);
963
964/* Helper function for dump_executable_symbols. */
965
966static hsa_status_t
967dump_executable_symbol (hsa_executable_t executable,
968 hsa_executable_symbol_t symbol,
969 void *data __attribute__((unused)))
970{
971 char *name = get_executable_symbol_name (symbol);
972
973 if (name)
974 {
975 GCN_DEBUG ("executable symbol: %s\n", name);
976 free (name);
977 }
978
979 return HSA_STATUS_SUCCESS;
980}
981
982/* Dump all global symbol in an executable. */
983
984static void
985dump_executable_symbols (hsa_executable_t executable)
986{
987 hsa_status_t status;
988 status
989 = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
990 dump_executable_symbol,
991 NULL);
992 if (status != HSA_STATUS_SUCCESS)
993 hsa_fatal ("Could not dump HSA executable symbols", status);
994}
995
996/* Dump kernel DISPATCH data structure and indent it by INDENT spaces. */
997
998static void
999print_kernel_dispatch (struct kernel_dispatch *dispatch, unsigned indent)
1000{
1001 struct kernargs *kernargs = (struct kernargs *)dispatch->kernarg_address;
1002
1003 fprintf (stderr, "%*sthis: %p\n", indent, "", dispatch);
1004 fprintf (stderr, "%*squeue: %p\n", indent, "", dispatch->queue);
1005 fprintf (stderr, "%*skernarg_address: %p\n", indent, "", kernargs);
1006 fprintf (stderr, "%*sheap address: %p\n", indent, "",
f6fff8a6
AS
1007 (void*)kernargs->abi.heap_ptr);
1008 fprintf (stderr, "%*sarena address: %p (%d bytes per workgroup)\n", indent,
1009 "", (void*)kernargs->abi.arena_ptr,
1010 kernargs->abi.arena_size_per_team);
1011 fprintf (stderr, "%*sstack address: %p (%d bytes per wavefront)\n", indent,
1012 "", (void*)kernargs->abi.stack_ptr,
1013 kernargs->abi.stack_size_per_thread);
237957cc
AS
1014 fprintf (stderr, "%*sobject: %lu\n", indent, "", dispatch->object);
1015 fprintf (stderr, "%*sprivate_segment_size: %u\n", indent, "",
1016 dispatch->private_segment_size);
1017 fprintf (stderr, "%*sgroup_segment_size: %u\n", indent, "",
1018 dispatch->group_segment_size);
1019 fprintf (stderr, "\n");
1020}
1021
1022/* }}} */
1023/* {{{ Utility functions */
1024
1025/* Cast the thread local storage to gcn_thread. */
1026
static inline struct gcn_thread *
gcn_thread (void)
{
  /* The libgomp plugin API hands back a per-host-thread opaque pointer;
     this plugin stores a struct gcn_thread there (set elsewhere in this
     file), so the cast recovers it.  May be NULL for non-accel threads.  */
  return (struct gcn_thread *) GOMP_PLUGIN_acc_thread ();
}
1032
1033/* Initialize debug and suppress_host_fallback according to the environment. */
1034
1035static void
1036init_environment_variables (void)
1037{
1038 if (secure_getenv ("GCN_DEBUG"))
1039 debug = true;
1040 else
1041 debug = false;
1042
1043 if (secure_getenv ("GCN_SUPPRESS_HOST_FALLBACK"))
1044 suppress_host_fallback = true;
1045 else
1046 suppress_host_fallback = false;
1047
1048 hsa_runtime_lib = secure_getenv ("HSA_RUNTIME_LIB");
1049 if (hsa_runtime_lib == NULL)
7c1e856b 1050 hsa_runtime_lib = "libhsa-runtime64.so.1";
237957cc
AS
1051
1052 support_cpu_devices = secure_getenv ("GCN_SUPPORT_CPU_DEVICES");
1053
1054 const char *x = secure_getenv ("GCN_NUM_TEAMS");
1055 if (!x)
1056 x = secure_getenv ("GCN_NUM_GANGS");
1057 if (x)
1058 override_x_dim = atoi (x);
1059
1060 const char *z = secure_getenv ("GCN_NUM_THREADS");
1061 if (!z)
1062 z = secure_getenv ("GCN_NUM_WORKERS");
1063 if (z)
1064 override_z_dim = atoi (z);
1065
1066 const char *heap = secure_getenv ("GCN_HEAP_SIZE");
1067 if (heap)
1068 {
1069 size_t tmp = atol (heap);
1070 if (tmp)
1071 gcn_kernel_heap_size = tmp;
1072 }
f6fff8a6
AS
1073
1074 const char *arena = secure_getenv ("GCN_TEAM_ARENA_SIZE");
1075 if (arena)
1076 {
1077 int tmp = atoi (arena);
1078 if (tmp)
1079 team_arena_size = tmp;;
1080 }
1081
1082 const char *stack = secure_getenv ("GCN_STACK_SIZE");
1083 if (stack)
1084 {
1085 int tmp = atoi (stack);
1086 if (tmp)
1087 stack_size = tmp;;
1088 }
237957cc
AS
1089}
1090
1091/* Return malloc'd string with name of SYMBOL. */
1092
1093static char *
1094get_executable_symbol_name (hsa_executable_symbol_t symbol)
1095{
1096 hsa_status_t status;
1097 char *res;
1098 uint32_t len;
1099 const hsa_executable_symbol_info_t info_name_length
1100 = HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH;
1101
1102 status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name_length,
1103 &len);
1104 if (status != HSA_STATUS_SUCCESS)
1105 {
1106 hsa_error ("Could not get length of symbol name", status);
1107 return NULL;
1108 }
1109
1110 res = GOMP_PLUGIN_malloc (len + 1);
1111
1112 const hsa_executable_symbol_info_t info_name
1113 = HSA_EXECUTABLE_SYMBOL_INFO_NAME;
1114
1115 status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name, res);
1116
1117 if (status != HSA_STATUS_SUCCESS)
1118 {
1119 hsa_error ("Could not get symbol name", status);
1120 free (res);
1121 return NULL;
1122 }
1123
1124 res[len] = '\0';
1125
1126 return res;
1127}
1128
237957cc
AS
1129/* Get the number of GPU Compute Units. */
1130
1131static int
1132get_cu_count (struct agent_info *agent)
1133{
1134 uint32_t cu_count;
1135 hsa_status_t status = hsa_fns.hsa_agent_get_info_fn
1136 (agent->id, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
1137 if (status == HSA_STATUS_SUCCESS)
1138 return cu_count;
1139 else
1140 return 64; /* The usual number for older devices. */
1141}
1142
1143/* Calculate the maximum grid size for OMP threads / OACC workers.
1144 This depends on the kernel's resource usage levels. */
1145
static int
limit_worker_threads (int threads)
{
  /* FIXME Do something more inteligent here.
     GCN can always run 4 threads within a Compute Unit, but
     more than that depends on register usage.  */
  return threads > 16 ? 16 : threads;
}
1156
81476bc4
MV
1157/* This sets the maximum number of teams to twice the number of GPU Compute
1158 Units to avoid memory waste and corresponding memory access faults. */
1159
static int
limit_teams (int teams, struct agent_info *agent)
{
  /* Cap at twice the CU count; more teams than that just wastes memory.  */
  int cap = 2 * get_cu_count (agent);
  return teams > cap ? cap : teams;
}
1168
237957cc
AS
1169/* Parse the target attributes INPUT provided by the compiler and return true
1170 if we should run anything all. If INPUT is NULL, fill DEF with default
1171 values, then store INPUT or DEF into *RESULT.
1172
1173 This is used for OpenMP only. */
1174
static bool
parse_target_attributes (void **input,
			 struct GOMP_kernel_launch_attributes *def,
			 struct GOMP_kernel_launch_attributes **result,
			 struct agent_info *agent)
{
  if (!input)
    GOMP_PLUGIN_fatal ("No target arguments provided");

  /* Scan the argument vector for either explicit HSA grid attributes or
     generic teams/thread-limit values.  The vector is a NULL-terminated
     sequence of id (+ optional value) entries.  */
  bool grid_attrs_found = false;
  bool gcn_dims_found = false;
  int gcn_teams = 0;
  int gcn_threads = 0;
  while (*input)
    {
      intptr_t id = (intptr_t) *input++, val;

      /* Values either follow as a separate entry or are packed into the
	 high bits of the id itself.  */
      if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
	val = (intptr_t) *input++;
      else
	val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;

      val = (val > INT_MAX) ? INT_MAX : val;

      if ((id & GOMP_TARGET_ARG_DEVICE_MASK) == GOMP_DEVICE_GCN
	  && ((id & GOMP_TARGET_ARG_ID_MASK)
	      == GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES))
	{
	  /* Explicit launch attributes take precedence; stop scanning.  */
	  grid_attrs_found = true;
	  break;
	}
      else if ((id & GOMP_TARGET_ARG_DEVICE_MASK)
	       == GOMP_TARGET_ARG_DEVICE_ALL)
	{
	  gcn_dims_found = true;
	  switch (id & GOMP_TARGET_ARG_ID_MASK)
	    {
	    case GOMP_TARGET_ARG_NUM_TEAMS:
	      gcn_teams = limit_teams (val, agent);
	      break;
	    case GOMP_TARGET_ARG_THREAD_LIMIT:
	      gcn_threads = limit_worker_threads (val);
	      break;
	    default:
	      ;
	    }
	}
    }

  if (gcn_dims_found)
    {
      bool gfx900_workaround_p = false;

      /* gfx900 (Vega 10) misbehaves with the full default thread count
	 unless the user overrode it explicitly.  */
      if (agent->device_isa == EF_AMDGPU_MACH_AMDGCN_GFX900
	  && gcn_threads == 0 && override_z_dim == 0)
	{
	  gfx900_workaround_p = true;
	  GCN_WARNING ("VEGA BUG WORKAROUND: reducing default number of "
		       "threads to at most 4 per team.\n");
	  GCN_WARNING (" - If this is not a Vega 10 device, please use "
		       "GCN_NUM_THREADS=16\n");
	}

      /* Ideally, when a dimension isn't explicitly specified, we should
	 tune it to run 40 (or 32?) threads per CU with no threads getting queued.
	 In practice, we tune for peak performance on BabelStream, which
	 for OpenACC is currently 32 threads per CU. */
      def->ndim = 3;
      if (gcn_teams <= 0 && gcn_threads <= 0)
	{
	  /* Set up a reasonable number of teams and threads. */
	  gcn_threads = gfx900_workaround_p ? 4 : 16; // 8;
	  def->gdims[0] = get_cu_count (agent); // * (40 / gcn_threads);
	  def->gdims[2] = gcn_threads;
	}
      else if (gcn_teams <= 0 && gcn_threads > 0)
	{
	  /* Auto-scale the number of teams with the number of threads. */
	  def->gdims[0] = get_cu_count (agent); // * (40 / gcn_threads);
	  def->gdims[2] = gcn_threads;
	}
      else if (gcn_teams > 0 && gcn_threads <= 0)
	{
	  int max_threads = gfx900_workaround_p ? 4 : 16;

	  /* Auto-scale the number of threads with the number of teams. */
	  def->gdims[0] = gcn_teams;
	  def->gdims[2] = 16; // get_cu_count (agent) * 40 / gcn_teams;
	  if (def->gdims[2] == 0)
	    def->gdims[2] = 1;
	  else if (def->gdims[2] > max_threads)
	    def->gdims[2] = max_threads;
	}
      else
	{
	  def->gdims[0] = gcn_teams;
	  def->gdims[2] = gcn_threads;
	}
      def->gdims[1] = 64; /* Each thread is 64 work items wide. */
      def->wdims[0] = 1; /* Single team per work-group. */
      def->wdims[1] = 64;
      def->wdims[2] = 16;
      *result = def;
      return true;
    }
  else if (!grid_attrs_found)
    {
      /* No attributes at all: launch a single work item.  */
      def->ndim = 1;
      def->gdims[0] = 1;
      def->gdims[1] = 1;
      def->gdims[2] = 1;
      def->wdims[0] = 1;
      def->wdims[1] = 1;
      def->wdims[2] = 1;
      *result = def;
      GCN_WARNING ("GOMP_OFFLOAD_run called with no launch attributes\n");
      return true;
    }

  /* Explicit attributes: validate and pass through.  Returning false for a
     zero grid dimension means "run nothing" rather than an error.  */
  struct GOMP_kernel_launch_attributes *kla;
  kla = (struct GOMP_kernel_launch_attributes *) *input;
  *result = kla;
  if (kla->ndim == 0 || kla->ndim > 3)
    GOMP_PLUGIN_fatal ("Invalid number of dimensions (%u)", kla->ndim);

  GCN_DEBUG ("GOMP_OFFLOAD_run called with %u dimensions:\n", kla->ndim);
  unsigned i;
  for (i = 0; i < kla->ndim; i++)
    {
      GCN_DEBUG (" Dimension %u: grid size %u and group size %u\n", i,
		 kla->gdims[i], kla->wdims[i]);
      if (kla->gdims[i] == 0)
	return false;
    }
  return true;
}
1311
1312/* Return the group size given the requested GROUP size, GRID size and number
1313 of grid dimensions NDIM. */
1314
static uint32_t
get_group_size (uint32_t ndim, uint32_t grid, uint32_t group)
{
  if (group == 0)
    {
      /* TODO: Provide a default via environment or device characteristics. */
      switch (ndim)
	{
	case 1:
	  group = 64;
	  break;
	case 2:
	  group = 8;
	  break;
	default:
	  group = 4;
	  break;
	}
    }

  /* The group can never be larger than the grid it tiles.  */
  return group > grid ? grid : group;
}
1333
1334/* Atomically store pair of uint16_t values (HEADER and REST) to a PACKET. */
1335
static void
packet_store_release (uint32_t* packet, uint16_t header, uint16_t rest)
{
  /* Pack HEADER into the low half-word and REST into the high half-word,
     then publish the whole word with release semantics.  */
  uint32_t word = (uint32_t) header | ((uint32_t) rest << 16);
  __atomic_store_n (packet, word, __ATOMIC_RELEASE);
}
1341
1342/* A never-called callback for the HSA command queues. These signal events
1343 that we don't use, so we trigger an error.
1344
1345 This "queue" is not to be confused with the async queues, below. */
1346
static void
hsa_queue_callback (hsa_status_t status,
		   hsa_queue_t *queue __attribute__ ((unused)),
		   void *data __attribute__ ((unused)))
{
  /* We never arm any queue signals, so any invocation is unexpected and
     treated as fatal.  */
  hsa_fatal ("Asynchronous queue error", status);
}
1354
1355/* }}} */
1356/* {{{ HSA initialization */
1357
1358/* Populate hsa_fns with the function addresses from libhsa-runtime64.so. */
1359
static bool
init_hsa_runtime_functions (void)
{
  /* Resolve one named entry point into the hsa_fns table, failing the
     whole initialization if it is missing.  */
#define DLSYM_FN(function) \
  hsa_fns.function##_fn = dlsym (handle, #function); \
  if (hsa_fns.function##_fn == NULL) \
    return false;
  /* NOTE(review): the handle is never dlclose'd — presumably intentional,
     since the resolved pointers must stay valid for the process lifetime.  */
  void *handle = dlopen (hsa_runtime_lib, RTLD_LAZY);
  if (handle == NULL)
    return false;

  DLSYM_FN (hsa_status_string)
  DLSYM_FN (hsa_system_get_info)
  DLSYM_FN (hsa_agent_get_info)
  DLSYM_FN (hsa_init)
  DLSYM_FN (hsa_iterate_agents)
  DLSYM_FN (hsa_region_get_info)
  DLSYM_FN (hsa_queue_create)
  DLSYM_FN (hsa_agent_iterate_regions)
  DLSYM_FN (hsa_executable_destroy)
  DLSYM_FN (hsa_executable_create)
  DLSYM_FN (hsa_executable_global_variable_define)
  DLSYM_FN (hsa_executable_load_code_object)
  DLSYM_FN (hsa_executable_freeze)
  DLSYM_FN (hsa_signal_create)
  DLSYM_FN (hsa_memory_allocate)
  DLSYM_FN (hsa_memory_assign_agent)
  DLSYM_FN (hsa_memory_copy)
  DLSYM_FN (hsa_memory_free)
  DLSYM_FN (hsa_signal_destroy)
  DLSYM_FN (hsa_executable_get_symbol)
  DLSYM_FN (hsa_executable_symbol_get_info)
  DLSYM_FN (hsa_executable_iterate_symbols)
  DLSYM_FN (hsa_queue_add_write_index_release)
  DLSYM_FN (hsa_queue_load_read_index_acquire)
  DLSYM_FN (hsa_signal_wait_acquire)
  DLSYM_FN (hsa_signal_store_relaxed)
  DLSYM_FN (hsa_signal_store_release)
  DLSYM_FN (hsa_signal_load_acquire)
  DLSYM_FN (hsa_queue_destroy)
  DLSYM_FN (hsa_code_object_deserialize)
  return true;
#undef DLSYM_FN
}
1404
1405/* Return true if the agent is a GPU and can accept of concurrent submissions
1406 from different threads. */
1407
1408static bool
1409suitable_hsa_agent_p (hsa_agent_t agent)
1410{
1411 hsa_device_type_t device_type;
1412 hsa_status_t status
1413 = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
1414 &device_type);
1415 if (status != HSA_STATUS_SUCCESS)
1416 return false;
1417
1418 switch (device_type)
1419 {
1420 case HSA_DEVICE_TYPE_GPU:
1421 break;
1422 case HSA_DEVICE_TYPE_CPU:
1423 if (!support_cpu_devices)
1424 return false;
1425 break;
1426 default:
1427 return false;
1428 }
1429
1430 uint32_t features = 0;
1431 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_FEATURE,
1432 &features);
1433 if (status != HSA_STATUS_SUCCESS
1434 || !(features & HSA_AGENT_FEATURE_KERNEL_DISPATCH))
1435 return false;
1436 hsa_queue_type_t queue_type;
1437 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_QUEUE_TYPE,
1438 &queue_type);
1439 if (status != HSA_STATUS_SUCCESS
1440 || (queue_type != HSA_QUEUE_TYPE_MULTI))
1441 return false;
1442
1443 return true;
1444}
1445
1446/* Callback of hsa_iterate_agents; if AGENT is a GPU device, increment
1447 agent_count in hsa_context. */
1448
1449static hsa_status_t
1450count_gpu_agents (hsa_agent_t agent, void *data __attribute__ ((unused)))
1451{
1452 if (suitable_hsa_agent_p (agent))
1453 hsa_context.agent_count++;
1454 return HSA_STATUS_SUCCESS;
1455}
1456
1457/* Callback of hsa_iterate_agents; if AGENT is a GPU device, assign the agent
1458 id to the describing structure in the hsa context. The index of the
1459 structure is pointed to by DATA, increment it afterwards. */
1460
1461static hsa_status_t
1462assign_agent_ids (hsa_agent_t agent, void *data)
1463{
1464 if (suitable_hsa_agent_p (agent))
1465 {
1466 int *agent_index = (int *) data;
1467 hsa_context.agents[*agent_index].id = agent;
1468 ++*agent_index;
1469 }
1470 return HSA_STATUS_SUCCESS;
1471}
1472
1473/* Initialize hsa_context if it has not already been done.
1474 Return TRUE on success. */
1475
1476static bool
1477init_hsa_context (void)
1478{
1479 hsa_status_t status;
1480 int agent_index = 0;
1481
1482 if (hsa_context.initialized)
1483 return true;
1484 init_environment_variables ();
1485 if (!init_hsa_runtime_functions ())
1486 {
1487 GCN_WARNING ("Run-time could not be dynamically opened\n");
1488 if (suppress_host_fallback)
1489 GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
1490 return false;
1491 }
1492 status = hsa_fns.hsa_init_fn ();
1493 if (status != HSA_STATUS_SUCCESS)
1494 return hsa_error ("Run-time could not be initialized", status);
1495 GCN_DEBUG ("HSA run-time initialized for GCN\n");
1496
1497 if (debug)
1498 dump_hsa_system_info ();
1499
1500 status = hsa_fns.hsa_iterate_agents_fn (count_gpu_agents, NULL);
1501 if (status != HSA_STATUS_SUCCESS)
1502 return hsa_error ("GCN GPU devices could not be enumerated", status);
1503 GCN_DEBUG ("There are %i GCN GPU devices.\n", hsa_context.agent_count);
1504
1505 hsa_context.agents
1506 = GOMP_PLUGIN_malloc_cleared (hsa_context.agent_count
1507 * sizeof (struct agent_info));
1508 status = hsa_fns.hsa_iterate_agents_fn (assign_agent_ids, &agent_index);
966de09b
AS
1509 if (status != HSA_STATUS_SUCCESS)
1510 return hsa_error ("Scanning compute agents failed", status);
237957cc
AS
1511 if (agent_index != hsa_context.agent_count)
1512 {
1513 GOMP_PLUGIN_error ("Failed to assign IDs to all GCN agents");
1514 return false;
1515 }
1516
1517 if (debug)
1518 {
1519 status = hsa_fns.hsa_iterate_agents_fn (dump_hsa_agent_info, NULL);
1520 if (status != HSA_STATUS_SUCCESS)
1521 GOMP_PLUGIN_error ("Failed to list all HSA runtime agents");
1522 }
1523
2e5ea579
FH
1524 uint16_t minor, major;
1525 status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MINOR,
1526 &minor);
1527 if (status != HSA_STATUS_SUCCESS)
1528 GOMP_PLUGIN_error ("Failed to obtain HSA runtime minor version");
1529 status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MAJOR,
1530 &major);
1531 if (status != HSA_STATUS_SUCCESS)
1532 GOMP_PLUGIN_error ("Failed to obtain HSA runtime major version");
1533
1534 size_t len = sizeof hsa_context.driver_version_s;
1535 int printed = snprintf (hsa_context.driver_version_s, len,
1536 "HSA Runtime %hu.%hu", (unsigned short int)major,
1537 (unsigned short int)minor);
1538 if (printed >= len)
1539 GCN_WARNING ("HSA runtime version string was truncated."
1540 "Version %hu.%hu is too long.", (unsigned short int)major,
1541 (unsigned short int)minor);
1542
237957cc
AS
1543 hsa_context.initialized = true;
1544 return true;
1545}
1546
1547/* Verify that hsa_context has already been initialized and return the
1548 agent_info structure describing device number N. Return NULL on error. */
1549
1550static struct agent_info *
1551get_agent_info (int n)
1552{
1553 if (!hsa_context.initialized)
1554 {
1555 GOMP_PLUGIN_error ("Attempt to use uninitialized GCN context.");
1556 return NULL;
1557 }
1558 if (n >= hsa_context.agent_count)
1559 {
1560 GOMP_PLUGIN_error ("Request to operate on non-existent GCN device %i", n);
1561 return NULL;
1562 }
1563 if (!hsa_context.agents[n].initialized)
1564 {
1565 GOMP_PLUGIN_error ("Attempt to use an uninitialized GCN agent.");
1566 return NULL;
1567 }
1568 return &hsa_context.agents[n];
1569}
1570
1571/* Callback of hsa_agent_iterate_regions, via get_*_memory_region functions.
1572
1573 Selects (breaks at) a suitable region of type KIND. */
1574
1575static hsa_status_t
1576get_memory_region (hsa_region_t region, hsa_region_t *retval,
1577 hsa_region_global_flag_t kind)
1578{
1579 hsa_status_t status;
1580 hsa_region_segment_t segment;
1581
1582 status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
1583 &segment);
1584 if (status != HSA_STATUS_SUCCESS)
1585 return status;
1586 if (segment != HSA_REGION_SEGMENT_GLOBAL)
1587 return HSA_STATUS_SUCCESS;
1588
1589 uint32_t flags;
1590 status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
1591 &flags);
1592 if (status != HSA_STATUS_SUCCESS)
1593 return status;
1594 if (flags & kind)
1595 {
1596 *retval = region;
1597 return HSA_STATUS_INFO_BREAK;
1598 }
1599 return HSA_STATUS_SUCCESS;
1600}
1601
1602/* Callback of hsa_agent_iterate_regions.
1603
1604 Selects a kernargs memory region. */
1605
1606static hsa_status_t
1607get_kernarg_memory_region (hsa_region_t region, void *data)
1608{
1609 return get_memory_region (region, (hsa_region_t *)data,
1610 HSA_REGION_GLOBAL_FLAG_KERNARG);
1611}
1612
1613/* Callback of hsa_agent_iterate_regions.
1614
1615 Selects a coarse-grained memory region suitable for the heap and
1616 offload data. */
1617
1618static hsa_status_t
1619get_data_memory_region (hsa_region_t region, void *data)
1620{
1621 return get_memory_region (region, (hsa_region_t *)data,
1622 HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED);
1623}
1624
7d593fd6
FH
/* Return the EF_AMDGPU_MACH field of IMAGE's ELF header flags, which
   identifies the GCN ISA the code object targets.  */

static int
elf_gcn_isa_field (Elf64_Ehdr *image)
{
  return image->e_flags & EF_AMDGPU_MACH_MASK;
}
1630
7d593fd6
FH
/* HSA ISA name strings for the supported GCN devices.  All share the
   six-character "gfxNNN" form, which gcn_isa_name_len relies on when the
   names are compared with strncmp.  */
const static char *gcn_gfx803_s = "gfx803";
const static char *gcn_gfx900_s = "gfx900";
const static char *gcn_gfx906_s = "gfx906";
const static char *gcn_gfx908_s = "gfx908";
const static char *gcn_gfx90a_s = "gfx90a";
const static int gcn_isa_name_len = 6;
1637
1638/* Returns the name that the HSA runtime uses for the ISA or NULL if we do not
1639 support the ISA. */
1640
1641static const char*
1642isa_hsa_name (int isa) {
1643 switch(isa)
1644 {
7d593fd6
FH
1645 case EF_AMDGPU_MACH_AMDGCN_GFX803:
1646 return gcn_gfx803_s;
1647 case EF_AMDGPU_MACH_AMDGCN_GFX900:
1648 return gcn_gfx900_s;
1649 case EF_AMDGPU_MACH_AMDGCN_GFX906:
1650 return gcn_gfx906_s;
3535402e
AS
1651 case EF_AMDGPU_MACH_AMDGCN_GFX908:
1652 return gcn_gfx908_s;
cde52d3a
AS
1653 case EF_AMDGPU_MACH_AMDGCN_GFX90a:
1654 return gcn_gfx90a_s;
7d593fd6
FH
1655 }
1656 return NULL;
1657}
1658
1659/* Returns the user-facing name that GCC uses to identify the architecture (e.g.
1660 with -march) or NULL if we do not support the ISA.
1661 Keep in sync with /gcc/config/gcn/gcn.{c,opt}. */
1662
1663static const char*
1664isa_gcc_name (int isa) {
1665 switch(isa)
1666 {
7d593fd6
FH
1667 case EF_AMDGPU_MACH_AMDGCN_GFX803:
1668 return "fiji";
1669 default:
1670 return isa_hsa_name (isa);
1671 }
1672}
1673
1674/* Returns the code which is used in the GCN object code to identify the ISA with
1675 the given name (as used by the HSA runtime). */
1676
1677static gcn_isa
1678isa_code(const char *isa) {
7d593fd6
FH
1679 if (!strncmp (isa, gcn_gfx803_s, gcn_isa_name_len))
1680 return EF_AMDGPU_MACH_AMDGCN_GFX803;
1681
1682 if (!strncmp (isa, gcn_gfx900_s, gcn_isa_name_len))
1683 return EF_AMDGPU_MACH_AMDGCN_GFX900;
1684
1685 if (!strncmp (isa, gcn_gfx906_s, gcn_isa_name_len))
1686 return EF_AMDGPU_MACH_AMDGCN_GFX906;
1687
3535402e
AS
1688 if (!strncmp (isa, gcn_gfx908_s, gcn_isa_name_len))
1689 return EF_AMDGPU_MACH_AMDGCN_GFX908;
1690
cde52d3a
AS
1691 if (!strncmp (isa, gcn_gfx90a_s, gcn_isa_name_len))
1692 return EF_AMDGPU_MACH_AMDGCN_GFX90a;
1693
7d593fd6
FH
1694 return -1;
1695}
1696
237957cc
AS
1697/* }}} */
1698/* {{{ Run */
1699
f6fff8a6 1700/* Create or reuse a team arena and stack space.
237957cc
AS
1701
1702 Team arenas are used by OpenMP to avoid calling malloc multiple times
1703 while setting up each team. This is purely a performance optimization.
1704
f6fff8a6
AS
1705 The stack space is used by all kernels. We must allocate it in such a
1706 way that the reverse offload implmentation can access the data.
237957cc 1707
f6fff8a6
AS
1708 Allocating this memory costs performance, so this function will reuse an
1709 existing allocation if a large enough one is idle.
1710 The memory lock is released, but not deallocated, when the kernel exits. */
1711
static void
configure_ephemeral_memories (struct kernel_info *kernel,
			      struct kernargs_abi *kernargs, int num_teams,
			      int num_threads)
{
  struct agent_info *agent = kernel->agent;
  struct ephemeral_memories_list **next_ptr = &agent->ephemeral_memories_list;
  struct ephemeral_memories_list *item;

  /* OpenACC kernels get no arena; only OpenMP teams use it.  The single
     allocation holds all arenas first, then all per-wavefront stacks.  */
  int actual_arena_size = (kernel->kind == KIND_OPENMP
			   ? team_arena_size : 0);
  int actual_arena_total_size = actual_arena_size * num_teams;
  size_t size = (actual_arena_total_size
		 + num_teams * num_threads * stack_size);

  /* Try to reuse an idle allocation that is big enough; trylock both tests
     and claims it in one step.  */
  for (item = *next_ptr; item; next_ptr = &item->next, item = item->next)
    {
      if (item->size < size)
	continue;

      if (pthread_mutex_trylock (&item->in_use) == 0)
	break;
    }

  if (!item)
    {
      GCN_DEBUG ("Creating a new %sstack for %d teams with %d threads"
		 " (%zd bytes)\n", (actual_arena_size ? "arena and " : ""),
		 num_teams, num_threads, size);

      /* The write lock serializes list extension against other host
	 threads; the per-item in_use mutex is taken before publishing so
	 nobody else can claim the new entry.  */
      if (pthread_mutex_lock (&agent->ephemeral_memories_write_lock))
	{
	  GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
	  return;
	}
      item = malloc (sizeof (*item));
      item->size = size;
      item->next = NULL;
      *next_ptr = item;

      if (pthread_mutex_init (&item->in_use, NULL))
	{
	  GOMP_PLUGIN_error ("Failed to initialize a GCN memory write mutex");
	  return;
	}
      if (pthread_mutex_lock (&item->in_use))
	{
	  GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
	  return;
	}
      if (pthread_mutex_unlock (&agent->ephemeral_memories_write_lock))
	{
	  GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
	  return;
	}

      hsa_status_t status;
      status = hsa_fns.hsa_memory_allocate_fn (agent->data_region, size,
					       &item->address);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not allocate memory for GCN kernel arena", status);
      status = hsa_fns.hsa_memory_assign_agent_fn (item->address, agent->id,
						   HSA_ACCESS_PERMISSION_RW);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not assign arena & stack memory to device", status);
    }

  /* Publish the layout to the kernel via the kernargs ABI block: arenas
     (if any) at the base, stacks immediately after.  */
  kernargs->arena_ptr = (actual_arena_total_size
			 ? (uint64_t)item->address
			 : 0);
  kernargs->stack_ptr = (uint64_t)item->address + actual_arena_total_size;
  kernargs->arena_size_per_team = actual_arena_size;
  kernargs->stack_size_per_thread = stack_size;
}
1786
f6fff8a6 1787/* Mark an ephemeral memory space available for reuse. */
237957cc
AS
1788
1789static void
f6fff8a6 1790release_ephemeral_memories (struct agent_info* agent, void *address)
237957cc 1791{
f6fff8a6 1792 struct ephemeral_memories_list *item;
237957cc 1793
f6fff8a6 1794 for (item = agent->ephemeral_memories_list; item; item = item->next)
237957cc 1795 {
f6fff8a6 1796 if (item->address == address)
237957cc
AS
1797 {
1798 if (pthread_mutex_unlock (&item->in_use))
1799 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1800 return;
1801 }
1802 }
1803 GOMP_PLUGIN_error ("Could not find a GCN arena to release.");
1804}
1805
1806/* Clean up all the allocated team arenas. */
1807
1808static bool
f6fff8a6 1809destroy_ephemeral_memories (struct agent_info *agent)
237957cc 1810{
f6fff8a6 1811 struct ephemeral_memories_list *item, *next;
237957cc 1812
f6fff8a6 1813 for (item = agent->ephemeral_memories_list; item; item = next)
237957cc
AS
1814 {
1815 next = item->next;
f6fff8a6 1816 hsa_fns.hsa_memory_free_fn (item->address);
237957cc
AS
1817 if (pthread_mutex_destroy (&item->in_use))
1818 {
f6fff8a6 1819 GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex");
237957cc
AS
1820 return false;
1821 }
1822 free (item);
1823 }
f6fff8a6 1824 agent->ephemeral_memories_list = NULL;
237957cc
AS
1825
1826 return true;
1827}
1828
1829/* Allocate memory on a specified device. */
1830
1831static void *
1832alloc_by_agent (struct agent_info *agent, size_t size)
1833{
1834 GCN_DEBUG ("Allocating %zu bytes on device %d\n", size, agent->device_id);
1835
1836 /* Zero-size allocations are invalid, so in order to return a valid pointer
1837 we need to pass a valid size. One source of zero-size allocations is
1838 kernargs for kernels that have no inputs or outputs (the kernel may
1839 only use console output, for example). */
1840 if (size == 0)
1841 size = 4;
1842
1843 void *ptr;
1844 hsa_status_t status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
1845 size, &ptr);
1846 if (status != HSA_STATUS_SUCCESS)
1847 {
1848 hsa_error ("Could not allocate device memory", status);
1849 return NULL;
1850 }
1851
1852 status = hsa_fns.hsa_memory_assign_agent_fn (ptr, agent->id,
1853 HSA_ACCESS_PERMISSION_RW);
1854 if (status != HSA_STATUS_SUCCESS)
1855 {
1856 hsa_error ("Could not assign data memory to device", status);
1857 return NULL;
1858 }
1859
1860 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1861 bool profiling_dispatch_p
1862 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1863 if (profiling_dispatch_p)
1864 {
1865 acc_prof_info *prof_info = thr->prof_info;
1866 acc_event_info data_event_info;
1867 acc_api_info *api_info = thr->api_info;
1868
1869 prof_info->event_type = acc_ev_alloc;
1870
1871 data_event_info.data_event.event_type = prof_info->event_type;
1872 data_event_info.data_event.valid_bytes
1873 = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1874 data_event_info.data_event.parent_construct
1875 = acc_construct_parallel;
1876 data_event_info.data_event.implicit = 1;
1877 data_event_info.data_event.tool_info = NULL;
1878 data_event_info.data_event.var_name = NULL;
1879 data_event_info.data_event.bytes = size;
1880 data_event_info.data_event.host_ptr = NULL;
1881 data_event_info.data_event.device_ptr = (void *) ptr;
1882
1883 api_info->device_api = acc_device_api_other;
1884
1885 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1886 api_info);
1887 }
1888
1889 return ptr;
1890}
1891
1892/* Create kernel dispatch data structure for given KERNEL, along with
1893 the necessary device signals and memory allocations. */
1894
static struct kernel_dispatch *
create_kernel_dispatch (struct kernel_info *kernel, int num_teams,
			int num_threads)
{
  struct agent_info *agent = kernel->agent;
  struct kernel_dispatch *shadow
    = GOMP_PLUGIN_malloc_cleared (sizeof (struct kernel_dispatch));

  shadow->agent = kernel->agent;
  shadow->object = kernel->object;

  /* The sync signal starts at 1 and is decremented by the device on
     completion (standard HSA completion-signal protocol).  */
  hsa_signal_t sync_signal;
  hsa_status_t status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &sync_signal);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Error creating the GCN sync signal", status);

  shadow->signal = sync_signal.handle;
  shadow->private_segment_size = kernel->private_segment_size;
  shadow->group_segment_size = kernel->group_segment_size;

  /* We expect kernels to request a single pointer, explicitly, and the
     rest of struct kernargs, implicitly.  If they request anything else
     then something is wrong.  */
  if (kernel->kernarg_segment_size > 8)
    {
      GOMP_PLUGIN_fatal ("Unexpectedly large kernargs segment requested");
      return NULL;
    }

  /* Allocate the full kernargs ABI block, not just the requested 8 bytes.  */
  status = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region,
					   sizeof (struct kernargs),
					   &shadow->kernarg_address);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not allocate memory for GCN kernel arguments", status);
  struct kernargs *kernargs = shadow->kernarg_address;

  /* Zero-initialize the output_data (minimum needed).  */
  kernargs->abi.out_ptr = (int64_t)&kernargs->output_data;
  kernargs->output_data.next_output = 0;
  for (unsigned i = 0;
       i < (sizeof (kernargs->output_data.queue)
	    / sizeof (kernargs->output_data.queue[0]));
       i++)
    kernargs->output_data.queue[i].written = 0;
  kernargs->output_data.consumed = 0;

  /* Pass in the heap location.  */
  kernargs->abi.heap_ptr = (int64_t)kernel->module->heap;

  /* Create the ephemeral memory spaces.  */
  configure_ephemeral_memories (kernel, &kernargs->abi, num_teams, num_threads);

  /* Ensure we can recognize unset return values.  */
  kernargs->output_data.return_value = 0xcafe0000;

  return shadow;
}
1952
/* Forward a reverse-offload ("target device -> host") request, received via
   the kernel's console-output queue, to libgomp proper.  */

static void
process_reverse_offload (uint64_t fn, uint64_t mapnum, uint64_t hostaddrs,
			 uint64_t sizes, uint64_t kinds, uint64_t dev_num64)
{
  /* The device transmits the device number as a 64-bit value; narrow it
     for the host-side API.  */
  int dev_num = dev_num64;
  GOMP_PLUGIN_target_rev (fn, mapnum, hostaddrs, sizes, kinds, dev_num,
			  NULL, NULL, NULL);
}
1961
237957cc
AS
1962/* Output any data written to console output from the kernel. It is expected
1963 that this function is polled during kernel execution.
1964
1965 We print all entries from the last item printed to the next entry without
1966 a "written" flag. If the "final" flag is set then it'll continue right to
1967 the end.
1968
1969 The print buffer is circular, but the from and to locations don't wrap when
1970 the buffer does, so the output limit is UINT_MAX. The target blocks on
1971 output when the buffer is full. */
1972
static void
console_output (struct kernel_info *kernel, struct kernargs *kernargs,
		bool final)
{
  /* Number of slots in the circular print queue.  */
  unsigned int limit = (sizeof (kernargs->output_data.queue)
			/ sizeof (kernargs->output_data.queue[0]));

  /* FROM is the host-side consumer cursor, TO the device-side producer
     cursor.  Both count monotonically and are reduced modulo LIMIT only
     when indexing, so FROM > TO can only mean corruption/overflow.  */
  unsigned int from = __atomic_load_n (&kernargs->output_data.consumed,
				       __ATOMIC_ACQUIRE);
  unsigned int to = kernargs->output_data.next_output;

  if (from > to)
    {
      /* Overflow. */
      if (final)
	printf ("GCN print buffer overflowed.\n");
      return;
    }

  unsigned int i;
  for (i = from; i < to; i++)
    {
      struct printf_data *data = &kernargs->output_data.queue[i%limit];

      /* The device bumps next_output before it finishes filling in the
	 entry; unless this is the final drain, stop at the first entry
	 that is not yet complete.  */
      if (!data->written && !final)
	break;

      /* TYPE selects the payload layout written by the device.  */
      switch (data->type)
	{
	case 0: printf ("%.128s%ld\n", data->msg, data->ivalue); break;
	case 1: printf ("%.128s%f\n", data->msg, data->dvalue); break;
	case 2: printf ("%.128s%.128s\n", data->msg, data->text); break;
	case 3: printf ("%.128s%.128s", data->msg, data->text); break;
	case 4:
	  /* Not console output at all: a reverse-offload request.  */
	  process_reverse_offload (data->value_u64[0], data->value_u64[1],
				   data->value_u64[2], data->value_u64[3],
				   data->value_u64[4], data->value_u64[5]);
	  break;
	default: printf ("GCN print buffer error!\n"); break;
	}
      /* Release the slot back to the device; the release store on
	 'consumed' publishes the cleared 'written' flag.  */
      data->written = 0;
      __atomic_store_n (&kernargs->output_data.consumed, i+1,
			__ATOMIC_RELEASE);
    }
  fflush (stdout);
}
2019
2020/* Release data structure created for a kernel dispatch in SHADOW argument,
2021 and clean up the signal and memory allocations. */
2022
2023static void
2024release_kernel_dispatch (struct kernel_dispatch *shadow)
2025{
2026 GCN_DEBUG ("Released kernel dispatch: %p\n", shadow);
2027
2028 struct kernargs *kernargs = shadow->kernarg_address;
f6fff8a6
AS
2029 void *addr = (void *)kernargs->abi.arena_ptr;
2030 if (!addr)
2031 addr = (void *)kernargs->abi.stack_ptr;
2032 release_ephemeral_memories (shadow->agent, addr);
237957cc
AS
2033
2034 hsa_fns.hsa_memory_free_fn (shadow->kernarg_address);
2035
2036 hsa_signal_t s;
2037 s.handle = shadow->signal;
2038 hsa_fns.hsa_signal_destroy_fn (s);
2039
2040 free (shadow);
2041}
2042
2043/* Extract the properties from a kernel binary. */
2044
2045static void
2046init_kernel_properties (struct kernel_info *kernel)
2047{
2048 hsa_status_t status;
2049 struct agent_info *agent = kernel->agent;
2050 hsa_executable_symbol_t kernel_symbol;
f062c3f1
AS
2051 char *buf = alloca (strlen (kernel->name) + 4);
2052 sprintf (buf, "%s.kd", kernel->name);
237957cc 2053 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
f062c3f1 2054 buf, agent->id,
237957cc
AS
2055 0, &kernel_symbol);
2056 if (status != HSA_STATUS_SUCCESS)
2057 {
2058 hsa_warn ("Could not find symbol for kernel in the code object", status);
f062c3f1 2059 fprintf (stderr, "not found name: '%s'\n", buf);
237957cc
AS
2060 dump_executable_symbols (agent->executable);
2061 goto failure;
2062 }
2063 GCN_DEBUG ("Located kernel %s\n", kernel->name);
2064 status = hsa_fns.hsa_executable_symbol_get_info_fn
2065 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel->object);
2066 if (status != HSA_STATUS_SUCCESS)
2067 hsa_fatal ("Could not extract a kernel object from its symbol", status);
2068 status = hsa_fns.hsa_executable_symbol_get_info_fn
2069 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
2070 &kernel->kernarg_segment_size);
2071 if (status != HSA_STATUS_SUCCESS)
2072 hsa_fatal ("Could not get info about kernel argument size", status);
2073 status = hsa_fns.hsa_executable_symbol_get_info_fn
2074 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
2075 &kernel->group_segment_size);
2076 if (status != HSA_STATUS_SUCCESS)
2077 hsa_fatal ("Could not get info about kernel group segment size", status);
2078 status = hsa_fns.hsa_executable_symbol_get_info_fn
2079 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
2080 &kernel->private_segment_size);
2081 if (status != HSA_STATUS_SUCCESS)
2082 hsa_fatal ("Could not get info about kernel private segment size",
2083 status);
2084
2085 /* The kernel type is not known until something tries to launch it. */
2086 kernel->kind = KIND_UNKNOWN;
2087
2088 GCN_DEBUG ("Kernel structure for %s fully initialized with "
2089 "following segment sizes: \n", kernel->name);
2090 GCN_DEBUG (" group_segment_size: %u\n",
2091 (unsigned) kernel->group_segment_size);
2092 GCN_DEBUG (" private_segment_size: %u\n",
2093 (unsigned) kernel->private_segment_size);
2094 GCN_DEBUG (" kernarg_segment_size: %u\n",
2095 (unsigned) kernel->kernarg_segment_size);
2096 return;
2097
2098failure:
2099 kernel->initialization_failed = true;
2100}
2101
2102/* Do all the work that is necessary before running KERNEL for the first time.
2103 The function assumes the program has been created, finalized and frozen by
2104 create_and_finalize_hsa_program. */
2105
2106static void
2107init_kernel (struct kernel_info *kernel)
2108{
2109 if (pthread_mutex_lock (&kernel->init_mutex))
2110 GOMP_PLUGIN_fatal ("Could not lock a GCN kernel initialization mutex");
2111 if (kernel->initialized)
2112 {
2113 if (pthread_mutex_unlock (&kernel->init_mutex))
2114 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2115 "mutex");
2116
2117 return;
2118 }
2119
2120 init_kernel_properties (kernel);
2121
2122 if (!kernel->initialization_failed)
2123 {
2124 GCN_DEBUG ("\n");
2125
2126 kernel->initialized = true;
2127 }
2128 if (pthread_mutex_unlock (&kernel->init_mutex))
2129 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2130 "mutex");
2131}
2132
2133/* Run KERNEL on its agent, pass VARS to it as arguments and take
2134 launch attributes from KLA.
2135
2136 MODULE_LOCKED indicates that the caller already holds the lock and
2137 run_kernel need not lock it again.
2138 If AQ is NULL then agent->sync_queue will be used. */
2139
static void
run_kernel (struct kernel_info *kernel, void *vars,
	    struct GOMP_kernel_launch_attributes *kla,
	    struct goacc_asyncqueue *aq, bool module_locked)
{
  GCN_DEBUG ("SGPRs: %d, VGPRs: %d\n", kernel->description->sgpr_count,
	     kernel->description->vpgr_count);

  /* Reduce the number of threads/workers if there are insufficient
     VGPRs available to run the kernels together.  The VGPR count is
     rounded up to a multiple of 4 — presumably the hardware allocation
     granularity (TODO confirm against the ISA docs).  */
  if (kla->ndim == 3 && kernel->description->vpgr_count > 0)
    {
      int granulated_vgprs = (kernel->description->vpgr_count + 3) & ~3;
      int max_threads = (256 / granulated_vgprs) * 4;
      if (kla->gdims[2] > max_threads)
	{
	  GCN_WARNING ("Too many VGPRs required to support %d threads/workers"
		       " per team/gang - reducing to %d threads/workers.\n",
		       kla->gdims[2], max_threads);
	  kla->gdims[2] = max_threads;
	}
    }

  GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel->agent->device_id,
	     (aq ? aq->id : 0));
  GCN_DEBUG ("GCN launch attribs: gdims:[");
  int i;
  for (i = 0; i < kla->ndim; ++i)
    {
      if (i)
	DEBUG_PRINT (", ");
      DEBUG_PRINT ("%u", kla->gdims[i]);
    }
  DEBUG_PRINT ("], normalized gdims:[");
  for (i = 0; i < kla->ndim; ++i)
    {
      if (i)
	DEBUG_PRINT (", ");
      DEBUG_PRINT ("%u", kla->gdims[i] / kla->wdims[i]);
    }
  DEBUG_PRINT ("], wdims:[");
  for (i = 0; i < kla->ndim; ++i)
    {
      if (i)
	DEBUG_PRINT (", ");
      DEBUG_PRINT ("%u", kla->wdims[i]);
    }
  DEBUG_PRINT ("]\n");
  DEBUG_FLUSH ();

  /* Hold the module read-lock for the whole dispatch unless the caller
     already holds it.  */
  struct agent_info *agent = kernel->agent;
  if (!module_locked && pthread_rwlock_rdlock (&agent->module_rwlock))
    GOMP_PLUGIN_fatal ("Unable to read-lock a GCN agent rwlock");

  if (!agent->initialized)
    GOMP_PLUGIN_fatal ("Agent must be initialized");

  if (!kernel->initialized)
    GOMP_PLUGIN_fatal ("Called kernel must be initialized");

  hsa_queue_t *command_q = (aq ? aq->hsa_queue : kernel->agent->sync_queue);

  /* Reserve a packet slot in the AQL queue.  */
  uint64_t index
    = hsa_fns.hsa_queue_add_write_index_release_fn (command_q, 1);
  GCN_DEBUG ("Got AQL index %llu\n", (long long int) index);

  /* Wait until the queue is not full before writing the packet.
     (Busy-wait; the queue is expected to drain quickly.)  */
  while (index - hsa_fns.hsa_queue_load_read_index_acquire_fn (command_q)
	 >= command_q->size)
    ;

  /* Do not allow the dimensions to be overridden when running
     constructors or destructors.  */
  int override_x = kernel->kind == KIND_UNKNOWN ? 0 : override_x_dim;
  int override_z = kernel->kind == KIND_UNKNOWN ? 0 : override_z_dim;

  hsa_kernel_dispatch_packet_t *packet;
  packet = ((hsa_kernel_dispatch_packet_t *) command_q->base_address)
	   + index % command_q->size;

  /* Clear everything but the first 4 bytes: the header word is stored
     last, with release semantics, to publish the packet (see
     packet_store_release below).  */
  memset (((uint8_t *) packet) + 4, 0, sizeof (*packet) - 4);
  /* GNU "a ?: b" — use the override when it is non-zero.  */
  packet->grid_size_x = override_x ? : kla->gdims[0];
  packet->workgroup_size_x = get_group_size (kla->ndim,
					     packet->grid_size_x,
					     kla->wdims[0]);

  if (kla->ndim >= 2)
    {
      packet->grid_size_y = kla->gdims[1];
      packet->workgroup_size_y = get_group_size (kla->ndim, kla->gdims[1],
						 kla->wdims[1]);
    }
  else
    {
      packet->grid_size_y = 1;
      packet->workgroup_size_y = 1;
    }

  if (kla->ndim == 3)
    {
      packet->grid_size_z = limit_worker_threads (override_z
						  ? : kla->gdims[2]);
      packet->workgroup_size_z = get_group_size (kla->ndim,
						 packet->grid_size_z,
						 kla->wdims[2]);
    }
  else
    {
      packet->grid_size_z = 1;
      packet->workgroup_size_z = 1;
    }

  GCN_DEBUG ("GCN launch actuals: grid:[%u, %u, %u],"
	     " normalized grid:[%u, %u, %u], workgroup:[%u, %u, %u]\n",
	     packet->grid_size_x, packet->grid_size_y, packet->grid_size_z,
	     packet->grid_size_x / packet->workgroup_size_x,
	     packet->grid_size_y / packet->workgroup_size_y,
	     packet->grid_size_z / packet->workgroup_size_z,
	     packet->workgroup_size_x, packet->workgroup_size_y,
	     packet->workgroup_size_z);

  struct kernel_dispatch *shadow
    = create_kernel_dispatch (kernel, packet->grid_size_x,
			      packet->grid_size_z);
  shadow->queue = command_q;

  if (debug)
    {
      fprintf (stderr, "\nKernel has following dependencies:\n");
      print_kernel_dispatch (shadow, 2);
    }

  packet->private_segment_size = kernel->private_segment_size;
  packet->group_segment_size = kernel->group_segment_size;
  packet->kernel_object = kernel->object;
  packet->kernarg_address = shadow->kernarg_address;
  hsa_signal_t s;
  s.handle = shadow->signal;
  packet->completion_signal = s;
  hsa_fns.hsa_signal_store_relaxed_fn (s, 1);
  /* The kernarg segment starts with the VARS pointer itself.  */
  memcpy (shadow->kernarg_address, &vars, sizeof (vars));

  GCN_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");

  uint16_t header;
  header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
  header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
  header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;

  GCN_DEBUG ("Going to dispatch kernel %s on device %d\n", kernel->name,
	     agent->device_id);

  /* Publish the packet by storing the header + setup words atomically,
     then ring the doorbell.  */
  packet_store_release ((uint32_t *) packet, header,
			(uint16_t) kla->ndim
			<< HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);

  hsa_fns.hsa_signal_store_release_fn (command_q->doorbell_signal,
				       index);

  GCN_DEBUG ("Kernel dispatched, waiting for completion\n");

  /* Root signal waits with 1ms timeout, so console output can be
     drained while the kernel runs.  */
  while (hsa_fns.hsa_signal_wait_acquire_fn (s, HSA_SIGNAL_CONDITION_LT, 1,
					     1000 * 1000,
					     HSA_WAIT_STATE_BLOCKED) != 0)
    {
      console_output (kernel, shadow->kernarg_address, false);
    }
  console_output (kernel, shadow->kernarg_address, true);

  /* Snapshot the return value before the kernargs memory is freed.  */
  struct kernargs *kernargs = shadow->kernarg_address;
  unsigned int return_value = (unsigned int)kernargs->output_data.return_value;

  release_kernel_dispatch (shadow);

  if (!module_locked && pthread_rwlock_unlock (&agent->module_rwlock))
    GOMP_PLUGIN_fatal ("Unable to unlock a GCN agent rwlock");

  /* The top 16 bits encode how the kernel terminated: 0xcafe is the
     sentinel planted by create_kernel_dispatch (never overwritten unless
     the kernel calls exit), 0xffff means the kernel called exit.  */
  unsigned int upper = (return_value & ~0xffff) >> 16;
  if (upper == 0xcafe)
    ; // exit not called, normal termination.
  else if (upper == 0xffff)
    ; // exit called.
  else
    {
      GOMP_PLUGIN_error ("Possible kernel exit value corruption, 2 most"
			 " significant bytes aren't 0xffff or 0xcafe: 0x%x\n",
			 return_value);
      abort ();
    }

  if (upper == 0xffff)
    {
      /* Byte 1 carries a signal number, byte 0 the exit status.  */
      unsigned int signal = (return_value >> 8) & 0xff;

      if (signal == SIGABRT)
	{
	  GCN_WARNING ("GCN Kernel aborted\n");
	  abort ();
	}
      else if (signal != 0)
	{
	  GCN_WARNING ("GCN Kernel received unknown signal\n");
	  abort ();
	}

      GCN_DEBUG ("GCN Kernel exited with value: %d\n", return_value & 0xff);
      exit (return_value & 0xff);
    }
}
2350
2351/* }}} */
2352/* {{{ Load/Unload */
2353
2354/* Initialize KERNEL from D and other parameters. Return true on success. */
2355
2356static bool
2357init_basic_kernel_info (struct kernel_info *kernel,
2358 struct hsa_kernel_description *d,
2359 struct agent_info *agent,
2360 struct module_info *module)
2361{
2362 kernel->agent = agent;
2363 kernel->module = module;
2364 kernel->name = d->name;
5a28e272 2365 kernel->description = d;
237957cc
AS
2366 if (pthread_mutex_init (&kernel->init_mutex, NULL))
2367 {
2368 GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");
2369 return false;
2370 }
2371 return true;
2372}
2373
7d593fd6
FH
2374/* Check that the GCN ISA of the given image matches the ISA of the agent. */
2375
2376static bool
2377isa_matches_agent (struct agent_info *agent, Elf64_Ehdr *image)
2378{
2379 int isa_field = elf_gcn_isa_field (image);
2380 const char* isa_s = isa_hsa_name (isa_field);
2381 if (!isa_s)
2382 {
2383 hsa_error ("Unsupported ISA in GCN code object.", HSA_STATUS_ERROR);
2384 return false;
2385 }
2386
2387 if (isa_field != agent->device_isa)
2388 {
2389 char msg[120];
2390 const char *agent_isa_s = isa_hsa_name (agent->device_isa);
2391 const char *agent_isa_gcc_s = isa_gcc_name (agent->device_isa);
2392 assert (agent_isa_s);
2393 assert (agent_isa_gcc_s);
2394
2395 snprintf (msg, sizeof msg,
2396 "GCN code object ISA '%s' does not match GPU ISA '%s'.\n"
4a206161 2397 "Try to recompile with '-foffload-options=-march=%s'.\n",
7d593fd6
FH
2398 isa_s, agent_isa_s, agent_isa_gcc_s);
2399
2400 hsa_error (msg, HSA_STATUS_ERROR);
2401 return false;
2402 }
2403
2404 return true;
2405}
2406
237957cc
AS
2407/* Create and finalize the program consisting of all loaded modules. */
2408
static bool
create_and_finalize_hsa_program (struct agent_info *agent)
{
  hsa_status_t status;
  bool res = true;
  /* Serialize against concurrent finalization attempts; the first caller
     does the work, later callers just see prog_finalized set.  */
  if (pthread_mutex_lock (&agent->prog_mutex))
    {
      GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
      return false;
    }
  if (agent->prog_finalized)
    goto final;

  status
    = hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
					HSA_EXECUTABLE_STATE_UNFROZEN,
					"", &agent->executable);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not create GCN executable", status);
      goto fail;
    }

  /* Load any GCN modules. */
  struct module_info *module = agent->module;
  if (module)
    {
      Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;

      if (!isa_matches_agent (agent, image))
	goto fail;

      hsa_code_object_t co = { 0 };
      status = hsa_fns.hsa_code_object_deserialize_fn
	(module->image_desc->gcn_image->image,
	 module->image_desc->gcn_image->size,
	 NULL, &co);
      if (status != HSA_STATUS_SUCCESS)
	{
	  hsa_error ("Could not deserialize GCN code object", status);
	  goto fail;
	}

      status = hsa_fns.hsa_executable_load_code_object_fn
	(agent->executable, agent->id, co, "");
      if (status != HSA_STATUS_SUCCESS)
	{
	  hsa_error ("Could not load GCN code object", status);
	  goto fail;
	}

      /* Allocate the device heap once per module and record its size in
	 the heap header so device code can see it.  */
      if (!module->heap)
	{
	  status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
						   gcn_kernel_heap_size,
						   (void**)&module->heap);
	  if (status != HSA_STATUS_SUCCESS)
	    {
	      hsa_error ("Could not allocate memory for GCN heap", status);
	      goto fail;
	    }

	  status = hsa_fns.hsa_memory_assign_agent_fn
	    (module->heap, agent->id, HSA_ACCESS_PERMISSION_RW);
	  if (status != HSA_STATUS_SUCCESS)
	    {
	      hsa_error ("Could not assign GCN heap memory to device", status);
	      goto fail;
	    }

	  hsa_fns.hsa_memory_copy_fn (&module->heap->size,
				      &gcn_kernel_heap_size,
				      sizeof (gcn_kernel_heap_size));
	}

    }

  if (debug)
    dump_executable_symbols (agent->executable);

  status = hsa_fns.hsa_executable_freeze_fn (agent->executable, "");
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not freeze the GCN executable", status);
      goto fail;
    }

final:
  /* NOTE(review): prog_finalized is set even on the failure path (fail
     falls through to final), so a failed finalization is not retried —
     presumably deliberate; confirm before changing.  */
  agent->prog_finalized = true;

  if (pthread_mutex_unlock (&agent->prog_mutex))
    {
      GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
      res = false;
    }

  return res;

fail:
  res = false;
  goto final;
}
2511
2512/* Free the HSA program in agent and everything associated with it and set
2513 agent->prog_finalized and the initialized flags of all kernels to false.
2514 Return TRUE on success. */
2515
2516static bool
2517destroy_hsa_program (struct agent_info *agent)
2518{
2519 if (!agent->prog_finalized)
2520 return true;
2521
2522 hsa_status_t status;
2523
2524 GCN_DEBUG ("Destroying the current GCN program.\n");
2525
2526 status = hsa_fns.hsa_executable_destroy_fn (agent->executable);
2527 if (status != HSA_STATUS_SUCCESS)
2528 return hsa_error ("Could not destroy GCN executable", status);
2529
2530 if (agent->module)
2531 {
2532 int i;
2533 for (i = 0; i < agent->module->kernel_count; i++)
2534 agent->module->kernels[i].initialized = false;
2535
2536 if (agent->module->heap)
2537 {
2538 hsa_fns.hsa_memory_free_fn (agent->module->heap);
2539 agent->module->heap = NULL;
2540 }
2541 }
2542 agent->prog_finalized = false;
2543 return true;
2544}
2545
2546/* Deinitialize all information associated with MODULE and kernels within
2547 it. Return TRUE on success. */
2548
2549static bool
2550destroy_module (struct module_info *module, bool locked)
2551{
2552 /* Run destructors before destroying module. */
2553 struct GOMP_kernel_launch_attributes kla =
2554 { 3,
2555 /* Grid size. */
2556 { 1, 64, 1 },
2557 /* Work-group size. */
2558 { 1, 64, 1 }
2559 };
2560
2561 if (module->fini_array_func)
2562 {
2563 init_kernel (module->fini_array_func);
2564 run_kernel (module->fini_array_func, NULL, &kla, NULL, locked);
2565 }
2566 module->constructors_run_p = false;
2567
2568 int i;
2569 for (i = 0; i < module->kernel_count; i++)
2570 if (pthread_mutex_destroy (&module->kernels[i].init_mutex))
2571 {
2572 GOMP_PLUGIN_error ("Failed to destroy a GCN kernel initialization "
2573 "mutex");
2574 return false;
2575 }
2576
2577 return true;
2578}
2579
2580/* }}} */
2581/* {{{ Async */
2582
2583/* Execute the queue entry at the given index: run a kernel launch, invoke
       a host callback, wait on a placeholder, or signal one. */
2584
static void
execute_queue_entry (struct goacc_asyncqueue *aq, int index)
{
  struct queue_entry *entry = &aq->queue[index];

  switch (entry->type)
    {
    case KERNEL_LAUNCH:
      if (DEBUG_QUEUES)
	GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d)\n",
		   aq->agent->device_id, aq->id, index);
      run_kernel (entry->u.launch.kernel,
		  entry->u.launch.vars,
		  &entry->u.launch.kla, aq, false);
      if (DEBUG_QUEUES)
	GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d) done\n",
		   aq->agent->device_id, aq->id, index);
      break;

    case CALLBACK:
      if (DEBUG_QUEUES)
	GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d)\n",
		   aq->agent->device_id, aq->id, index);
      entry->u.callback.fn (entry->u.callback.data);
      if (DEBUG_QUEUES)
	GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d) done\n",
		   aq->agent->device_id, aq->id, index);
      break;

    case ASYNC_WAIT:
      {
	/* FIXME: is it safe to access a placeholder that may already have
	   been executed?  */
	struct placeholder *placeholderp = entry->u.asyncwait.placeholderp;

	if (DEBUG_QUEUES)
	  GCN_DEBUG ("Async thread %d:%d: Executing async wait entry (%d)\n",
		     aq->agent->device_id, aq->id, index);

	/* Block until the other queue's ASYNC_PLACEHOLDER entry sets
	   'executed' (see below); then tear down the synchronization
	   objects — this side owns their destruction.  */
	pthread_mutex_lock (&placeholderp->mutex);

	while (!placeholderp->executed)
	  pthread_cond_wait (&placeholderp->cond, &placeholderp->mutex);

	pthread_mutex_unlock (&placeholderp->mutex);

	if (pthread_cond_destroy (&placeholderp->cond))
	  GOMP_PLUGIN_error ("Failed to destroy serialization cond");

	if (pthread_mutex_destroy (&placeholderp->mutex))
	  GOMP_PLUGIN_error ("Failed to destroy serialization mutex");

	if (DEBUG_QUEUES)
	  GCN_DEBUG ("Async thread %d:%d: Executing async wait "
		     "entry (%d) done\n", aq->agent->device_id, aq->id, index);
      }
      break;

    case ASYNC_PLACEHOLDER:
      /* Mark this point in the queue as reached, waking any ASYNC_WAIT
	 entry blocked on it.  */
      pthread_mutex_lock (&entry->u.placeholder.mutex);
      entry->u.placeholder.executed = 1;
      pthread_cond_signal (&entry->u.placeholder.cond);
      pthread_mutex_unlock (&entry->u.placeholder.mutex);
      break;

    default:
      GOMP_PLUGIN_fatal ("Unknown queue element");
    }
}
2654
2655/* This function is run as a thread to service an async queue in the
2656 background. It runs continuously until the stop flag is set. */
2657
static void *
drain_queue (void *thread_arg)
{
  struct goacc_asyncqueue *aq = thread_arg;

  /* Debug mode: the queue is drained inline by drain_queue_synchronous,
     so this thread just marks itself finished and exits.  */
  if (DRAIN_QUEUE_SYNCHRONOUS_P)
    {
      aq->drain_queue_stop = 2;
      return NULL;
    }

  pthread_mutex_lock (&aq->mutex);

  while (true)
    {
      if (aq->drain_queue_stop)
	break;

      if (aq->queue_n > 0)
	{
	  /* Drop the lock while executing the entry — launches and
	     callbacks can take arbitrarily long and may push onto other
	     queues.  The head entry stays valid because only this thread
	     consumes from the queue.  */
	  pthread_mutex_unlock (&aq->mutex);
	  execute_queue_entry (aq, aq->queue_first);

	  pthread_mutex_lock (&aq->mutex);
	  aq->queue_first = ((aq->queue_first + 1)
			     % ASYNC_QUEUE_SIZE);
	  aq->queue_n--;

	  /* Wake producers blocked in wait_for_queue_nonfull.  */
	  if (DEBUG_THREAD_SIGNAL)
	    GCN_DEBUG ("Async thread %d:%d: broadcasting queue out update\n",
		       aq->agent->device_id, aq->id);
	  pthread_cond_broadcast (&aq->queue_cond_out);
	  pthread_mutex_unlock (&aq->mutex);

	  if (DEBUG_QUEUES)
	    GCN_DEBUG ("Async thread %d:%d: continue\n", aq->agent->device_id,
		       aq->id);
	  pthread_mutex_lock (&aq->mutex);
	}
      else
	{
	  /* Queue empty: sleep until a producer signals cond_in.  */
	  if (DEBUG_THREAD_SLEEP)
	    GCN_DEBUG ("Async thread %d:%d: going to sleep\n",
		       aq->agent->device_id, aq->id);
	  pthread_cond_wait (&aq->queue_cond_in, &aq->mutex);
	  if (DEBUG_THREAD_SLEEP)
	    GCN_DEBUG ("Async thread %d:%d: woke up, rechecking\n",
		       aq->agent->device_id, aq->id);
	}
    }

  /* Advertise termination (2) to finalize_async_thread, which waits on
     queue_cond_out for this state.  */
  aq->drain_queue_stop = 2;
  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("Async thread %d:%d: broadcasting last queue out update\n",
	       aq->agent->device_id, aq->id);
  pthread_cond_broadcast (&aq->queue_cond_out);
  pthread_mutex_unlock (&aq->mutex);

  GCN_DEBUG ("Async thread %d:%d: returning\n", aq->agent->device_id, aq->id);
  return NULL;
}
2719
2720/* This function is used only when DRAIN_QUEUE_SYNCHRONOUS_P is set, which
2721 is not usually the case. This is just a debug tool. */
2722
2723static void
2724drain_queue_synchronous (struct goacc_asyncqueue *aq)
2725{
2726 pthread_mutex_lock (&aq->mutex);
2727
2728 while (aq->queue_n > 0)
2729 {
2730 execute_queue_entry (aq, aq->queue_first);
2731
2732 aq->queue_first = ((aq->queue_first + 1)
2733 % ASYNC_QUEUE_SIZE);
2734 aq->queue_n--;
2735 }
2736
2737 pthread_mutex_unlock (&aq->mutex);
2738}
2739
d88b27da
JB
2740/* Block the current thread until an async queue is writable. The aq->mutex
2741 lock should be held on entry, and remains locked on exit. */
237957cc
AS
2742
2743static void
2744wait_for_queue_nonfull (struct goacc_asyncqueue *aq)
2745{
2746 if (aq->queue_n == ASYNC_QUEUE_SIZE)
2747 {
237957cc
AS
2748 /* Queue is full. Wait for it to not be full. */
2749 while (aq->queue_n == ASYNC_QUEUE_SIZE)
2750 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
237957cc
AS
2751 }
2752}
2753
2754/* Request an asynchronous kernel launch on the specified queue. This
2755 may block if the queue is full, but returns without waiting for the
2756 kernel to run. */
2757
2758static void
2759queue_push_launch (struct goacc_asyncqueue *aq, struct kernel_info *kernel,
2760 void *vars, struct GOMP_kernel_launch_attributes *kla)
2761{
2762 assert (aq->agent == kernel->agent);
2763
237957cc
AS
2764 pthread_mutex_lock (&aq->mutex);
2765
d88b27da
JB
2766 wait_for_queue_nonfull (aq);
2767
237957cc
AS
2768 int queue_last = ((aq->queue_first + aq->queue_n)
2769 % ASYNC_QUEUE_SIZE);
2770 if (DEBUG_QUEUES)
2771 GCN_DEBUG ("queue_push_launch %d:%d: at %i\n", aq->agent->device_id,
2772 aq->id, queue_last);
2773
2774 aq->queue[queue_last].type = KERNEL_LAUNCH;
2775 aq->queue[queue_last].u.launch.kernel = kernel;
2776 aq->queue[queue_last].u.launch.vars = vars;
2777 aq->queue[queue_last].u.launch.kla = *kla;
2778
2779 aq->queue_n++;
2780
2781 if (DEBUG_THREAD_SIGNAL)
2782 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2783 aq->agent->device_id, aq->id);
2784 pthread_cond_signal (&aq->queue_cond_in);
2785
2786 pthread_mutex_unlock (&aq->mutex);
2787}
2788
2789/* Request an asynchronous callback on the specified queue. The callback
2790 function will be called, with the given opaque data, from the appropriate
2791 async thread, when all previous items on that queue are complete. */
2792
2793static void
2794queue_push_callback (struct goacc_asyncqueue *aq, void (*fn)(void *),
2795 void *data)
2796{
237957cc
AS
2797 pthread_mutex_lock (&aq->mutex);
2798
d88b27da
JB
2799 wait_for_queue_nonfull (aq);
2800
237957cc
AS
2801 int queue_last = ((aq->queue_first + aq->queue_n)
2802 % ASYNC_QUEUE_SIZE);
2803 if (DEBUG_QUEUES)
2804 GCN_DEBUG ("queue_push_callback %d:%d: at %i\n", aq->agent->device_id,
2805 aq->id, queue_last);
2806
2807 aq->queue[queue_last].type = CALLBACK;
2808 aq->queue[queue_last].u.callback.fn = fn;
2809 aq->queue[queue_last].u.callback.data = data;
2810
2811 aq->queue_n++;
2812
2813 if (DEBUG_THREAD_SIGNAL)
2814 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2815 aq->agent->device_id, aq->id);
2816 pthread_cond_signal (&aq->queue_cond_in);
2817
2818 pthread_mutex_unlock (&aq->mutex);
2819}
2820
2821/* Request that a given async thread wait for another thread (unspecified) to
2822 reach the given placeholder. The wait will occur when all previous entries
2823 on the queue are complete. A placeholder is effectively a kind of signal
2824 which simply sets a flag when encountered in a queue. */
2825
2826static void
2827queue_push_asyncwait (struct goacc_asyncqueue *aq,
2828 struct placeholder *placeholderp)
2829{
237957cc
AS
2830 pthread_mutex_lock (&aq->mutex);
2831
d88b27da
JB
2832 wait_for_queue_nonfull (aq);
2833
237957cc
AS
2834 int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2835 if (DEBUG_QUEUES)
2836 GCN_DEBUG ("queue_push_asyncwait %d:%d: at %i\n", aq->agent->device_id,
2837 aq->id, queue_last);
2838
2839 aq->queue[queue_last].type = ASYNC_WAIT;
2840 aq->queue[queue_last].u.asyncwait.placeholderp = placeholderp;
2841
2842 aq->queue_n++;
2843
2844 if (DEBUG_THREAD_SIGNAL)
2845 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2846 aq->agent->device_id, aq->id);
2847 pthread_cond_signal (&aq->queue_cond_in);
2848
2849 pthread_mutex_unlock (&aq->mutex);
2850}
2851
2852/* Add a placeholder into an async queue. When the async thread reaches the
2853 placeholder it will set the "executed" flag to true and continue.
2854 Another thread may be waiting on this thread reaching the placeholder. */
2855
2856static struct placeholder *
2857queue_push_placeholder (struct goacc_asyncqueue *aq)
2858{
2859 struct placeholder *placeholderp;
2860
237957cc
AS
2861 pthread_mutex_lock (&aq->mutex);
2862
d88b27da
JB
2863 wait_for_queue_nonfull (aq);
2864
237957cc
AS
2865 int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2866 if (DEBUG_QUEUES)
2867 GCN_DEBUG ("queue_push_placeholder %d:%d: at %i\n", aq->agent->device_id,
2868 aq->id, queue_last);
2869
2870 aq->queue[queue_last].type = ASYNC_PLACEHOLDER;
2871 placeholderp = &aq->queue[queue_last].u.placeholder;
2872
2873 if (pthread_mutex_init (&placeholderp->mutex, NULL))
2874 {
2875 pthread_mutex_unlock (&aq->mutex);
2876 GOMP_PLUGIN_error ("Failed to initialize serialization mutex");
2877 }
2878
2879 if (pthread_cond_init (&placeholderp->cond, NULL))
2880 {
2881 pthread_mutex_unlock (&aq->mutex);
2882 GOMP_PLUGIN_error ("Failed to initialize serialization cond");
2883 }
2884
2885 placeholderp->executed = 0;
2886
2887 aq->queue_n++;
2888
2889 if (DEBUG_THREAD_SIGNAL)
2890 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2891 aq->agent->device_id, aq->id);
2892 pthread_cond_signal (&aq->queue_cond_in);
2893
2894 pthread_mutex_unlock (&aq->mutex);
2895
2896 return placeholderp;
2897}
2898
2899/* Signal an asynchronous thread to terminate, and wait for it to do so. */
2900
static void
finalize_async_thread (struct goacc_asyncqueue *aq)
{
  pthread_mutex_lock (&aq->mutex);
  /* drain_queue_stop == 2 means the drain thread has already announced
     termination (or ran in synchronous debug mode); nothing to do but
     NOTE(review): in that case pthread_join is skipped, so the thread is
     never joined — presumably harmless, confirm.  */
  if (aq->drain_queue_stop == 2)
    {
      pthread_mutex_unlock (&aq->mutex);
      return;
    }

  /* Request termination (1) and wake the drain thread if it is sleeping
     on an empty queue.  */
  aq->drain_queue_stop = 1;

  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("Signalling async thread %d:%d: cond_in\n",
	       aq->agent->device_id, aq->id);
  pthread_cond_signal (&aq->queue_cond_in);

  /* Wait for the drain thread to acknowledge (it sets 2 and broadcasts
     queue_cond_out on its way out).  */
  while (aq->drain_queue_stop != 2)
    {
      if (DEBUG_THREAD_SLEEP)
	GCN_DEBUG ("Waiting for async thread %d:%d to finish, putting thread"
		   " to sleep\n", aq->agent->device_id, aq->id);
      pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
      if (DEBUG_THREAD_SLEEP)
	GCN_DEBUG ("Waiting, woke up thread %d:%d.  Rechecking\n",
		   aq->agent->device_id, aq->id);
    }

  GCN_DEBUG ("Done waiting for async thread %d:%d\n", aq->agent->device_id,
	     aq->id);
  pthread_mutex_unlock (&aq->mutex);

  int err = pthread_join (aq->thread_drain_queue, NULL);
  if (err != 0)
    GOMP_PLUGIN_fatal ("Join async thread %d:%d: failed: %s",
		       aq->agent->device_id, aq->id, strerror (err));
  GCN_DEBUG ("Joined with async thread %d:%d\n", aq->agent->device_id, aq->id);
}
2939
2940/* Set up an async queue for OpenMP. There will be only one. The
2941 implementation simply uses an OpenACC async queue.
2942 FIXME: is this thread-safe if two threads call this function? */
2943
static void
maybe_init_omp_async (struct agent_info *agent)
{
  /* Lazily create the single OpenMP async queue for this agent; reuses
     the OpenACC queue machinery.  */
  if (!agent->omp_async_queue)
    agent->omp_async_queue
      = GOMP_OFFLOAD_openacc_async_construct (agent->device_id);
}
2951
8d2f4ddf
JB
2952/* A wrapper that works around an issue in the HSA runtime with host-to-device
2953 copies from read-only pages. */
2954
2955static void
2956hsa_memory_copy_wrapper (void *dst, const void *src, size_t len)
2957{
2958 hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, len);
2959
2960 if (status == HSA_STATUS_SUCCESS)
2961 return;
2962
2963 /* It appears that the copy fails if the source data is in a read-only page.
2964 We can't detect that easily, so try copying the data to a temporary buffer
2965 and doing the copy again if we got an error above. */
2966
2967 GCN_WARNING ("Read-only data transfer bug workaround triggered for "
2968 "[%p:+%d]\n", (void *) src, (int) len);
2969
2970 void *src_copy = malloc (len);
2971 memcpy (src_copy, src, len);
2972 status = hsa_fns.hsa_memory_copy_fn (dst, (const void *) src_copy, len);
2973 free (src_copy);
2974 if (status != HSA_STATUS_SUCCESS)
2975 GOMP_PLUGIN_error ("memory copy failed");
2976}
2977
237957cc
AS
2978/* Copy data to or from a device. This is intended for use as an async
2979 callback event. */
2980
2981static void
2982copy_data (void *data_)
2983{
2984 struct copy_data *data = (struct copy_data *)data_;
2985 GCN_DEBUG ("Async thread %d:%d: Copying %zu bytes from (%p) to (%p)\n",
2986 data->aq->agent->device_id, data->aq->id, data->len, data->src,
2987 data->dst);
8d2f4ddf 2988 hsa_memory_copy_wrapper (data->dst, data->src, data->len);
237957cc
AS
2989 free (data);
2990}
2991
/* Free device data.  This is intended for use as an async callback event.
   The device number is not known in this context, hence the "?:?" in the
   debug output where other async events print "agent:queue".  */

static void
gomp_offload_free (void *ptr)
{
  GCN_DEBUG ("Async thread ?:?: Freeing %p\n", ptr);
  /* Passing device 0 is fine: GOMP_OFFLOAD_free frees by pointer and only
     uses the device number in its debug message.  */
  GOMP_OFFLOAD_free (0, ptr);
}
3000
3001/* Request an asynchronous data copy, to or from a device, on a given queue.
9c41f5b9 3002 The event will be registered as a callback. */
237957cc
AS
3003
3004static void
3005queue_push_copy (struct goacc_asyncqueue *aq, void *dst, const void *src,
9c41f5b9 3006 size_t len)
237957cc
AS
3007{
3008 if (DEBUG_QUEUES)
3009 GCN_DEBUG ("queue_push_copy %d:%d: %zu bytes from (%p) to (%p)\n",
3010 aq->agent->device_id, aq->id, len, src, dst);
3011 struct copy_data *data
3012 = (struct copy_data *)GOMP_PLUGIN_malloc (sizeof (struct copy_data));
3013 data->dst = dst;
3014 data->src = src;
3015 data->len = len;
237957cc
AS
3016 data->aq = aq;
3017 queue_push_callback (aq, copy_data, data);
3018}
3019
3020/* Return true if the given queue is currently empty. */
3021
3022static int
3023queue_empty (struct goacc_asyncqueue *aq)
3024{
3025 pthread_mutex_lock (&aq->mutex);
3026 int res = aq->queue_n == 0 ? 1 : 0;
3027 pthread_mutex_unlock (&aq->mutex);
3028
3029 return res;
3030}
3031
/* Wait for a given queue to become empty.  This implements an OpenACC wait
   directive.  */

static void
wait_queue (struct goacc_asyncqueue *aq)
{
  if (DRAIN_QUEUE_SYNCHRONOUS_P)
    {
      /* No drain thread in this mode: run the queued entries here.  */
      drain_queue_synchronous (aq);
      return;
    }

  pthread_mutex_lock (&aq->mutex);

  /* Sleep on queue_cond_out until the drain thread reports the queue has
     emptied; re-check the count after every wakeup (spurious wakeups and
     new entries are both possible).  */
  while (aq->queue_n > 0)
    {
      if (DEBUG_THREAD_SLEEP)
	GCN_DEBUG ("waiting for thread %d:%d, putting thread to sleep\n",
		   aq->agent->device_id, aq->id);
      pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
      if (DEBUG_THREAD_SLEEP)
	GCN_DEBUG ("thread %d:%d woke up. Rechecking\n", aq->agent->device_id,
		   aq->id);
    }

  pthread_mutex_unlock (&aq->mutex);
  GCN_DEBUG ("waiting for thread %d:%d, done\n", aq->agent->device_id, aq->id);
}
3060
3061/* }}} */
3062/* {{{ OpenACC support */
3063
/* Execute an OpenACC kernel, synchronously or asynchronously.  When ASYNC,
   the launch (and the eventual free of the indirection array) is pushed onto
   AQ; otherwise the kernel is run to completion here.  */

static void
gcn_exec (struct kernel_info *kernel, size_t mapnum, void **hostaddrs,
	  void **devaddrs, unsigned *dims, void *targ_mem_desc, bool async,
	  struct goacc_asyncqueue *aq)
{
  if (!GOMP_OFFLOAD_can_run (kernel))
    GOMP_PLUGIN_fatal ("OpenACC host fallback unimplemented.");

  /* If we get here then this must be an OpenACC kernel. */
  kernel->kind = KIND_OPENACC;

  /* devaddrs must be double-indirect on the target. */
  void **ind_da = alloc_by_agent (kernel->agent, sizeof (void*) * mapnum);
  for (size_t i = 0; i < mapnum; i++)
    hsa_fns.hsa_memory_copy_fn (&ind_da[i],
				devaddrs[i] ? &devaddrs[i] : &hostaddrs[i],
				sizeof (void *));

  /* Find this kernel's launch descriptor in the image.
     NOTE(review): the pointer (not string) comparison below looks
     intentional -- kernel->name appears to alias the descriptor's name
     string; confirm against init_basic_kernel_info.  If no descriptor
     matched, hsa_kernel_desc stays NULL and is dereferenced below.  */
  struct hsa_kernel_description *hsa_kernel_desc = NULL;
  for (unsigned i = 0; i < kernel->module->image_desc->kernel_count; i++)
    {
      struct hsa_kernel_description *d
	= &kernel->module->image_desc->kernel_infos[i];
      if (d->name == kernel->name)
	{
	  hsa_kernel_desc = d;
	  break;
	}
    }

  /* We may have statically-determined dimensions in
     hsa_kernel_desc->oacc_dims[] or dimensions passed to this offload kernel
     invocation at runtime in dims[].  We allow static dimensions to take
     priority over dynamic dimensions when present (non-zero). */
  if (hsa_kernel_desc->oacc_dims[0] > 0)
    dims[0] = hsa_kernel_desc->oacc_dims[0];
  if (hsa_kernel_desc->oacc_dims[1] > 0)
    dims[1] = hsa_kernel_desc->oacc_dims[1];
  if (hsa_kernel_desc->oacc_dims[2] > 0)
    dims[2] = hsa_kernel_desc->oacc_dims[2];

  /* Ideally, when a dimension isn't explicitly specified, we should
     tune it to run 40 (or 32?) threads per CU with no threads getting queued.
     In practice, we tune for peak performance on BabelStream, which
     for OpenACC is currently 32 threads per CU. */
  if (dims[0] == 0 && dims[1] == 0)
    {
      /* If any of the OpenACC dimensions remain 0 then we get to pick a
	 number.  There isn't really a correct answer for this without a clue
	 about the problem size, so let's do a reasonable number of workers
	 and gangs. */

      dims[0] = get_cu_count (kernel->agent) * 4; /* Gangs. */
      dims[1] = 8; /* Workers. */
    }
  else if (dims[0] == 0 && dims[1] > 0)
    {
      /* Auto-scale the number of gangs with the requested number of workers. */
      dims[0] = get_cu_count (kernel->agent) * (32 / dims[1]);
    }
  else if (dims[0] > 0 && dims[1] == 0)
    {
      /* Auto-scale the number of workers with the requested number of gangs. */
      dims[1] = get_cu_count (kernel->agent) * 32 / dims[0];
      if (dims[1] == 0)
	dims[1] = 1;
      if (dims[1] > 16)
	dims[1] = 16;
    }

  /* The incoming dimensions are expressed in terms of gangs, workers, and
     vectors.  The HSA dimensions are expressed in terms of "work-items",
     which means multiples of vector lanes.

     The "grid size" specifies the size of the problem space, and the
     "work-group size" specifies how much of that we want a single compute
     unit to chew on at once.

     The three dimensions do not really correspond to hardware, but the
     important thing is that the HSA runtime will launch as many
     work-groups as it takes to process the entire grid, and each
     work-group will contain as many wave-fronts as it takes to process
     the work-items in that group.

     Essentially, as long as we set the Y dimension to 64 (the number of
     vector lanes in hardware), and the Z group size to the maximum (16),
     then we will get the gangs (X) and workers (Z) launched as we expect.

     The reason for the apparent reversal of vector and worker dimension
     order is to do with the way the run-time distributes work-items across
     v1 and v2.  */
  struct GOMP_kernel_launch_attributes kla =
    {3,
     /* Grid size.  */
     {dims[0], 64, dims[1]},
     /* Work-group size.  */
     {1, 64, 16}
    };

  /* Emit the OpenACC profiling "enqueue launch" start/end events around the
     launch when a profiling callback is registered.  */
  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  acc_prof_info *prof_info = thr->prof_info;
  acc_event_info enqueue_launch_event_info;
  acc_api_info *api_info = thr->api_info;
  bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
  if (profiling_dispatch_p)
    {
      prof_info->event_type = acc_ev_enqueue_launch_start;

      enqueue_launch_event_info.launch_event.event_type
	= prof_info->event_type;
      enqueue_launch_event_info.launch_event.valid_bytes
	= _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
      enqueue_launch_event_info.launch_event.parent_construct
	= acc_construct_parallel;
      enqueue_launch_event_info.launch_event.implicit = 1;
      enqueue_launch_event_info.launch_event.tool_info = NULL;
      enqueue_launch_event_info.launch_event.kernel_name
	= (char *) kernel->name;
      enqueue_launch_event_info.launch_event.num_gangs = kla.gdims[0];
      enqueue_launch_event_info.launch_event.num_workers = kla.gdims[2];
      enqueue_launch_event_info.launch_event.vector_length = kla.gdims[1];

      api_info->device_api = acc_device_api_other;

      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
					    &enqueue_launch_event_info, api_info);
    }

  if (!async)
    {
      /* Synchronous: run now, then free the indirection array directly.  */
      run_kernel (kernel, ind_da, &kla, NULL, false);
      gomp_offload_free (ind_da);
    }
  else
    {
      /* Asynchronous: the free is queued behind the kernel launch.  */
      queue_push_launch (aq, kernel, ind_da, &kla);
      if (DEBUG_QUEUES)
	GCN_DEBUG ("queue_push_callback %d:%d gomp_offload_free, %p\n",
		   aq->agent->device_id, aq->id, ind_da);
      queue_push_callback (aq, gomp_offload_free, ind_da);
    }

  if (profiling_dispatch_p)
    {
      prof_info->event_type = acc_ev_enqueue_launch_end;
      enqueue_launch_event_info.launch_event.event_type = prof_info->event_type;
      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
					    &enqueue_launch_event_info,
					    api_info);
    }
}
3217
3218/* }}} */
3219/* {{{ Generic Plugin API */
3220
/* Return the name of the accelerator, which is "gcn".  */

const char *
GOMP_OFFLOAD_get_name (void)
{
  static const char name[] = "gcn";
  return name;
}
3228
3229/* Return the specific capabilities the HSA accelerator have. */
3230
3231unsigned int
3232GOMP_OFFLOAD_get_caps (void)
3233{
3234 /* FIXME: Enable shared memory for APU, but not discrete GPU. */
3235 return /*GOMP_OFFLOAD_CAP_SHARED_MEM |*/ GOMP_OFFLOAD_CAP_OPENMP_400
3236 | GOMP_OFFLOAD_CAP_OPENACC_200;
3237}
3238
/* Identify as GCN accelerator.  Returned value is one of the
   OFFLOAD_TARGET_TYPE_* enumerators libgomp uses to match plugins.  */

int
GOMP_OFFLOAD_get_type (void)
{
  return OFFLOAD_TARGET_TYPE_GCN;
}
3246
/* Return the libgomp version number we're compatible with.  There is
   no requirement for cross-version compatibility: the plugin is built
   together with libgomp, so it simply reports the current GOMP_VERSION.  */

unsigned
GOMP_OFFLOAD_version (void)
{
  return GOMP_VERSION;
}
3255
3256/* Return the number of GCN devices on the system. */
3257
3258int
683f1184 3259GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask)
237957cc
AS
3260{
3261 if (!init_hsa_context ())
3262 return 0;
683f1184
TB
3263 /* Return -1 if no omp_requires_mask cannot be fulfilled but
3264 devices were present. */
3265 if (hsa_context.agent_count > 0 && omp_requires_mask != 0)
3266 return -1;
237957cc
AS
3267 return hsa_context.agent_count;
3268}
3269
/* Initialize device (agent) number N so that it can be used for computation.
   Sets up the agent's locks, queries its HSA properties, creates its
   synchronous command queue and selects its kernarg/data memory regions.
   Idempotent: returns immediately once AGENT->initialized is set.
   Return TRUE on success. */

bool
GOMP_OFFLOAD_init_device (int n)
{
  if (!init_hsa_context ())
    return false;
  if (n >= hsa_context.agent_count)
    {
      GOMP_PLUGIN_error ("Request to initialize non-existent GCN device %i", n);
      return false;
    }
  struct agent_info *agent = &hsa_context.agents[n];

  if (agent->initialized)
    return true;

  agent->device_id = n;

  if (pthread_rwlock_init (&agent->module_rwlock, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent rwlock");
      return false;
    }
  if (pthread_mutex_init (&agent->prog_mutex, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent program mutex");
      return false;
    }
  if (pthread_mutex_init (&agent->async_queues_mutex, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
      return false;
    }
  if (pthread_mutex_init (&agent->ephemeral_memories_write_lock, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
      return false;
    }
  agent->async_queues = NULL;
  agent->omp_async_queue = NULL;
  agent->ephemeral_memories_list = NULL;

  uint32_t queue_size;
  hsa_status_t status;
  status = hsa_fns.hsa_agent_get_info_fn (agent->id,
					  HSA_AGENT_INFO_QUEUE_MAX_SIZE,
					  &queue_size);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error requesting maximum queue size of the GCN agent",
		      status);

  status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_NAME,
					  &agent->name);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error querying the name of the agent", status);

  /* Map the agent name (e.g. "gfx906") to an internal ISA code; a negative
     result means this GPU generation is not supported by the plugin.  */
  agent->device_isa = isa_code (agent->name);
  if (agent->device_isa < 0)
    return hsa_error ("Unknown GCN agent architecture", HSA_STATUS_ERROR);

  status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_VENDOR_NAME,
					  &agent->vendor_name);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error querying the vendor name of the agent", status);

  status = hsa_fns.hsa_queue_create_fn (agent->id, queue_size,
					HSA_QUEUE_TYPE_MULTI,
					hsa_queue_callback, NULL, UINT32_MAX,
					UINT32_MAX, &agent->sync_queue);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error creating command queue", status);

  /* Scan the agent's memory regions for one suitable for kernel arguments.
     HSA_STATUS_INFO_BREAK just means the iteration callback stopped early
     (region found) and is not an error.  */
  agent->kernarg_region.handle = (uint64_t) -1;
  status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
						 get_kernarg_memory_region,
						 &agent->kernarg_region);
  if (status != HSA_STATUS_SUCCESS
      && status != HSA_STATUS_INFO_BREAK)
    hsa_error ("Scanning memory regions failed", status);
  if (agent->kernarg_region.handle == (uint64_t) -1)
    {
      GOMP_PLUGIN_error ("Could not find suitable memory region for kernel "
			 "arguments");
      return false;
    }
  GCN_DEBUG ("Selected kernel arguments memory region:\n");
  dump_hsa_region (agent->kernarg_region, NULL);

  /* Likewise for general device data allocations.  */
  agent->data_region.handle = (uint64_t) -1;
  status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
						 get_data_memory_region,
						 &agent->data_region);
  if (status != HSA_STATUS_SUCCESS
      && status != HSA_STATUS_INFO_BREAK)
    hsa_error ("Scanning memory regions failed", status);
  if (agent->data_region.handle == (uint64_t) -1)
    {
      GOMP_PLUGIN_error ("Could not find suitable memory region for device "
			 "data");
      return false;
    }
  GCN_DEBUG ("Selected device data memory region:\n");
  dump_hsa_region (agent->data_region, NULL);

  GCN_DEBUG ("GCN agent %d initialized\n", n);

  agent->initialized = true;
  return true;
}
3381
/* Load GCN object-code module described by struct gcn_image_desc in
   TARGET_DATA and return references to kernel descriptors in TARGET_TABLE.
   If there are any constructors then run them.  If not NULL, REV_FN_TABLE will
   contain the on-device addresses of the functions for reverse offload.  To be
   freed by the caller.
   Returns the number of entries written to *TARGET_TABLE, or -1 on error.  */

int
GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
			 struct addr_pair **target_table,
			 uint64_t **rev_fn_table)
{
  if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
			 " (expected %u, received %u)",
			 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
      return -1;
    }

  struct gcn_image_desc *image_desc = (struct gcn_image_desc *) target_data;
  struct agent_info *agent;
  struct addr_pair *pair;
  struct module_info *module;
  struct kernel_info *kernel;
  int kernel_count = image_desc->kernel_count;
  unsigned var_count = image_desc->global_variable_count;
  /* Currently, "others" is a struct of ICVS.  */
  int other_count = 1;

  agent = get_agent_info (ord);
  if (!agent)
    return -1;

  if (pthread_rwlock_wrlock (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
      return -1;
    }
  if (agent->prog_finalized
      && !destroy_hsa_program (agent))
    return -1;

  GCN_DEBUG ("Encountered %d kernels in an image\n", kernel_count);
  GCN_DEBUG ("Encountered %u global variables in an image\n", var_count);
  GCN_DEBUG ("Expect %d other variables in an image\n", other_count);
  /* The "- 2" leaves out the _init_array/_fini_array pseudo-kernels,
     which are filtered from the table in the loop below.
     NOTE(review): this assumes both pseudo-kernels are always present in
     the image -- confirm against the compiler side.  */
  pair = GOMP_PLUGIN_malloc ((kernel_count + var_count + other_count - 2)
			     * sizeof (struct addr_pair));
  *target_table = pair;
  module = (struct module_info *)
    GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info)
				+ kernel_count * sizeof (struct kernel_info));
  module->image_desc = image_desc;
  module->kernel_count = kernel_count;
  module->heap = NULL;
  module->constructors_run_p = false;

  kernel = &module->kernels[0];

  /* Allocate memory for kernel dependencies.  */
  for (unsigned i = 0; i < kernel_count; i++)
    {
      struct hsa_kernel_description *d = &image_desc->kernel_infos[i];
      if (!init_basic_kernel_info (kernel, d, agent, module))
	return -1;
      if (strcmp (d->name, "_init_array") == 0)
	module->init_array_func = kernel;
      else if (strcmp (d->name, "_fini_array") == 0)
	module->fini_array_func = kernel;
      else
	{
	  pair->start = (uintptr_t) kernel;
	  pair->end = (uintptr_t) (kernel + 1);
	  pair++;
	}
      kernel++;
    }

  agent->module = module;
  if (pthread_rwlock_unlock (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
      return -1;
    }

  if (!create_and_finalize_hsa_program (agent))
    return -1;

  if (var_count > 0)
    {
      /* Read the on-device variable table (address/size pairs) and report
	 one addr_pair per global variable.  */
      hsa_status_t status;
      hsa_executable_symbol_t var_symbol;
      status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
						     ".offload_var_table",
						     agent->id,
						     0, &var_symbol);

      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not find symbol for variable in the code object",
		   status);

      uint64_t var_table_addr;
      status = hsa_fns.hsa_executable_symbol_get_info_fn
	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
	 &var_table_addr);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not extract a variable from its symbol", status);

      struct {
	uint64_t addr;
	uint64_t size;
      } var_table[var_count];
      GOMP_OFFLOAD_dev2host (agent->device_id, var_table,
			     (void*)var_table_addr, sizeof (var_table));

      for (unsigned i = 0; i < var_count; i++)
	{
	  pair->start = var_table[i].addr;
	  pair->end = var_table[i].addr + var_table[i].size;
	  GCN_DEBUG ("Found variable at %p with size %lu\n",
		     (void *)var_table[i].addr, var_table[i].size);
	  pair++;
	}
    }

  /* The "other" entry: the device copy of the additional ICVs struct, if
     this image provides one; otherwise a 0/0 placeholder pair.  */
  GCN_DEBUG ("Looking for variable %s\n", XSTRING (GOMP_ADDITIONAL_ICVS));

  hsa_status_t status;
  hsa_executable_symbol_t var_symbol;
  status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
						 XSTRING (GOMP_ADDITIONAL_ICVS),
						 agent->id, 0, &var_symbol);
  if (status == HSA_STATUS_SUCCESS)
    {
      uint64_t varptr;
      uint32_t varsize;

      status = hsa_fns.hsa_executable_symbol_get_info_fn
	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
	 &varptr);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not extract a variable from its symbol", status);
      status = hsa_fns.hsa_executable_symbol_get_info_fn
	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
	 &varsize);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not extract a variable size from its symbol",
		   status);

      pair->start = varptr;
      pair->end = varptr + varsize;
    }
  else
    {
      /* The variable was not in this image.  */
      GCN_DEBUG ("Variable not found in image: %s\n",
		 XSTRING (GOMP_ADDITIONAL_ICVS));
      pair->start = pair->end = 0;
    }

  /* Ensure that constructors are run first.  */
  struct GOMP_kernel_launch_attributes kla =
    { 3,
      /* Grid size.  */
      { 1, 64, 1 },
      /* Work-group size.  */
      { 1, 64, 1 }
    };

  if (module->init_array_func)
    {
      init_kernel (module->init_array_func);
      run_kernel (module->init_array_func, NULL, &kla, NULL, false);
    }
  module->constructors_run_p = true;

  /* Don't report kernels that libgomp need not know about.  */
  if (module->init_array_func)
    kernel_count--;
  if (module->fini_array_func)
    kernel_count--;

  if (rev_fn_table != NULL && kernel_count == 0)
    *rev_fn_table = NULL;
  else if (rev_fn_table != NULL)
    {
      /* Fetch the on-device function table for reverse offload; the caller
	 owns and frees *rev_fn_table.  */
      hsa_status_t status;
      hsa_executable_symbol_t var_symbol;
      status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
						     ".offload_func_table",
						     agent->id, 0, &var_symbol);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not find symbol for variable in the code object",
		   status);
      uint64_t fn_table_addr;
      status = hsa_fns.hsa_executable_symbol_get_info_fn
	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
	 &fn_table_addr);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not extract a variable from its symbol", status);
      *rev_fn_table = GOMP_PLUGIN_malloc (kernel_count * sizeof (uint64_t));
      GOMP_OFFLOAD_dev2host (agent->device_id, *rev_fn_table,
			     (void*) fn_table_addr,
			     kernel_count * sizeof (uint64_t));
    }

  return kernel_count + var_count + other_count;
}
3589
/* Unload GCN object-code module described by struct gcn_image_desc in
   TARGET_DATA from agent number N.  The module must be the one currently
   loaded on the agent.  Return TRUE on success. */

bool
GOMP_OFFLOAD_unload_image (int n, unsigned version, const void *target_data)
{
  if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
			 " (expected %u, received %u)",
			 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
      return false;
    }

  struct agent_info *agent;
  agent = get_agent_info (n);
  if (!agent)
    return false;

  if (pthread_rwlock_wrlock (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
      return false;
    }

  /* Only one module can be loaded at a time; reject a mismatched image.  */
  if (!agent->module || agent->module->image_desc != target_data)
    {
      GOMP_PLUGIN_error ("Attempt to unload an image that has never been "
			 "loaded before");
      return false;
    }

  /* destroy_module with locked==true: destructors run under the rwlock.  */
  if (!destroy_module (agent->module, true))
    return false;
  free (agent->module);
  agent->module = NULL;
  if (!destroy_hsa_program (agent))
    return false;
  if (pthread_rwlock_unlock (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
      return false;
    }
  return true;
}
3635
/* Deinitialize all information and status associated with agent number N.  We
   do not attempt any synchronization, assuming the user and libgomp will not
   attempt deinitialization of a device that is in any way being used at the
   same time.  Return TRUE on success.

   Teardown order: async queue, module, ephemeral memories, HSA program,
   sync queue, then the agent's locks.  */

bool
GOMP_OFFLOAD_fini_device (int n)
{
  struct agent_info *agent = get_agent_info (n);
  if (!agent)
    return false;

  /* Nothing to do for a device that was never initialized.  */
  if (!agent->initialized)
    return true;

  if (agent->omp_async_queue)
    {
      GOMP_OFFLOAD_openacc_async_destruct (agent->omp_async_queue);
      agent->omp_async_queue = NULL;
    }

  if (agent->module)
    {
      /* locked==false: no other users at this point, see above.  */
      if (!destroy_module (agent->module, false))
	return false;
      free (agent->module);
      agent->module = NULL;
    }

  if (!destroy_ephemeral_memories (agent))
    return false;

  if (!destroy_hsa_program (agent))
    return false;

  hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (agent->sync_queue);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error destroying command queue", status);

  if (pthread_mutex_destroy (&agent->prog_mutex))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex");
      return false;
    }
  if (pthread_rwlock_destroy (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN agent rwlock");
      return false;
    }

  if (pthread_mutex_destroy (&agent->async_queues_mutex))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN agent queue mutex");
      return false;
    }
  if (pthread_mutex_destroy (&agent->ephemeral_memories_write_lock))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex");
      return false;
    }
  agent->initialized = false;
  return true;
}
3699
3700/* Return true if the HSA runtime can run function FN_PTR. */
3701
3702bool
3703GOMP_OFFLOAD_can_run (void *fn_ptr)
3704{
3705 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3706
3707 init_kernel (kernel);
3708 if (kernel->initialization_failed)
3709 goto failure;
3710
3711 return true;
3712
3713failure:
3714 if (suppress_host_fallback)
3715 GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
3716 GCN_WARNING ("GCN target cannot be launched, doing a host fallback\n");
3717 return false;
3718}
3719
3720/* Allocate memory on device N. */
3721
3722void *
3723GOMP_OFFLOAD_alloc (int n, size_t size)
3724{
3725 struct agent_info *agent = get_agent_info (n);
3726 return alloc_by_agent (agent, size);
3727}
3728
3729/* Free memory from device N. */
3730
3731bool
3732GOMP_OFFLOAD_free (int device, void *ptr)
3733{
3734 GCN_DEBUG ("Freeing memory on device %d\n", device);
3735
3736 hsa_status_t status = hsa_fns.hsa_memory_free_fn (ptr);
3737 if (status != HSA_STATUS_SUCCESS)
3738 {
3739 hsa_error ("Could not free device memory", status);
3740 return false;
3741 }
3742
3743 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
3744 bool profiling_dispatch_p
3745 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
3746 if (profiling_dispatch_p)
3747 {
3748 acc_prof_info *prof_info = thr->prof_info;
3749 acc_event_info data_event_info;
3750 acc_api_info *api_info = thr->api_info;
3751
3752 prof_info->event_type = acc_ev_free;
3753
3754 data_event_info.data_event.event_type = prof_info->event_type;
3755 data_event_info.data_event.valid_bytes
3756 = _ACC_DATA_EVENT_INFO_VALID_BYTES;
3757 data_event_info.data_event.parent_construct
3758 = acc_construct_parallel;
3759 data_event_info.data_event.implicit = 1;
3760 data_event_info.data_event.tool_info = NULL;
3761 data_event_info.data_event.var_name = NULL;
3762 data_event_info.data_event.bytes = 0;
3763 data_event_info.data_event.host_ptr = NULL;
3764 data_event_info.data_event.device_ptr = (void *) ptr;
3765
3766 api_info->device_api = acc_device_api_other;
3767
3768 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
3769 api_info);
3770 }
3771
3772 return true;
3773}
3774
3775/* Copy data from DEVICE to host. */
3776
3777bool
3778GOMP_OFFLOAD_dev2host (int device, void *dst, const void *src, size_t n)
3779{
3780 GCN_DEBUG ("Copying %zu bytes from device %d (%p) to host (%p)\n", n, device,
3781 src, dst);
8d2f4ddf
JB
3782 hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
3783 if (status != HSA_STATUS_SUCCESS)
3784 GOMP_PLUGIN_error ("memory copy failed");
237957cc
AS
3785 return true;
3786}
3787
/* Copy data from host to DEVICE.  */

bool
GOMP_OFFLOAD_host2dev (int device, void *dst, const void *src, size_t n)
{
  GCN_DEBUG ("Copying %zu bytes from host (%p) to device %d (%p)\n", n, src,
	     device, dst);
  /* Go through the wrapper rather than hsa_memory_copy directly so that
     copies whose source is in a read-only host page are retried via a
     writable bounce buffer (HSA runtime workaround).  */
  hsa_memory_copy_wrapper (dst, src, n);
  return true;
}
3798
3799/* Copy data within DEVICE. Do the copy asynchronously, if appropriate. */
3800
3801bool
3802GOMP_OFFLOAD_dev2dev (int device, void *dst, const void *src, size_t n)
3803{
3804 struct gcn_thread *thread_data = gcn_thread ();
3805
3806 if (thread_data && !async_synchronous_p (thread_data->async))
3807 {
3808 struct agent_info *agent = get_agent_info (device);
3809 maybe_init_omp_async (agent);
9c41f5b9 3810 queue_push_copy (agent->omp_async_queue, dst, src, n);
237957cc
AS
3811 return true;
3812 }
3813
3814 GCN_DEBUG ("Copying %zu bytes from device %d (%p) to device %d (%p)\n", n,
3815 device, src, device, dst);
8d2f4ddf
JB
3816 hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
3817 if (status != HSA_STATUS_SUCCESS)
3818 GOMP_PLUGIN_error ("memory copy failed");
237957cc
AS
3819 return true;
3820}
3821
3822/* }}} */
3823/* {{{ OpenMP Plugin API */
3824
3825/* Run a synchronous OpenMP kernel on DEVICE and pass it an array of pointers
3826 in VARS as a parameter. The kernel is identified by FN_PTR which must point
3827 to a kernel_info structure, and must have previously been loaded to the
3828 specified device. */
3829
3830void
3831GOMP_OFFLOAD_run (int device, void *fn_ptr, void *vars, void **args)
3832{
3833 struct agent_info *agent = get_agent_info (device);
3834 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3835 struct GOMP_kernel_launch_attributes def;
3836 struct GOMP_kernel_launch_attributes *kla;
3837 assert (agent == kernel->agent);
3838
3839 /* If we get here then the kernel must be OpenMP. */
3840 kernel->kind = KIND_OPENMP;
3841
3842 if (!parse_target_attributes (args, &def, &kla, agent))
3843 {
3844 GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
3845 return;
3846 }
3847 run_kernel (kernel, vars, kla, NULL, false);
3848}
3849
3850/* Run an asynchronous OpenMP kernel on DEVICE. This is similar to
3851 GOMP_OFFLOAD_run except that the launch is queued and there is a call to
3852 GOMP_PLUGIN_target_task_completion when it has finished. */
3853
3854void
3855GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
3856 void **args, void *async_data)
3857{
3858 GCN_DEBUG ("GOMP_OFFLOAD_async_run invoked\n");
3859 struct agent_info *agent = get_agent_info (device);
3860 struct kernel_info *kernel = (struct kernel_info *) tgt_fn;
3861 struct GOMP_kernel_launch_attributes def;
3862 struct GOMP_kernel_launch_attributes *kla;
3863 assert (agent == kernel->agent);
3864
3865 /* If we get here then the kernel must be OpenMP. */
3866 kernel->kind = KIND_OPENMP;
3867
3868 if (!parse_target_attributes (args, &def, &kla, agent))
3869 {
3870 GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
3871 return;
3872 }
3873
3874 maybe_init_omp_async (agent);
3875 queue_push_launch (agent->omp_async_queue, kernel, tgt_vars, kla);
3876 queue_push_callback (agent->omp_async_queue,
3877 GOMP_PLUGIN_target_task_completion, async_data);
3878}
3879
3880/* }}} */
3881/* {{{ OpenACC Plugin API */
3882
3883/* Run a synchronous OpenACC kernel. The device number is inferred from the
3884 already-loaded KERNEL. */
3885
3886void
3887GOMP_OFFLOAD_openacc_exec (void (*fn_ptr) (void *), size_t mapnum,
3888 void **hostaddrs, void **devaddrs, unsigned *dims,
3889 void *targ_mem_desc)
3890{
3891 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3892
3893 gcn_exec (kernel, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, false,
3894 NULL);
3895}
3896
3897/* Run an asynchronous OpenACC kernel on the specified queue. */
3898
3899void
3900GOMP_OFFLOAD_openacc_async_exec (void (*fn_ptr) (void *), size_t mapnum,
3901 void **hostaddrs, void **devaddrs,
3902 unsigned *dims, void *targ_mem_desc,
3903 struct goacc_asyncqueue *aq)
3904{
3905 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3906
3907 gcn_exec (kernel, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, true,
3908 aq);
3909}
3910
3911/* Create a new asynchronous thread and queue for running future kernels. */
3912
3913struct goacc_asyncqueue *
3914GOMP_OFFLOAD_openacc_async_construct (int device)
3915{
3916 struct agent_info *agent = get_agent_info (device);
3917
3918 pthread_mutex_lock (&agent->async_queues_mutex);
3919
3920 struct goacc_asyncqueue *aq = GOMP_PLUGIN_malloc (sizeof (*aq));
3921 aq->agent = get_agent_info (device);
3922 aq->prev = NULL;
3923 aq->next = agent->async_queues;
3924 if (aq->next)
3925 {
3926 aq->next->prev = aq;
3927 aq->id = aq->next->id + 1;
3928 }
3929 else
3930 aq->id = 1;
3931 agent->async_queues = aq;
3932
3933 aq->queue_first = 0;
3934 aq->queue_n = 0;
3935 aq->drain_queue_stop = 0;
3936
3937 if (pthread_mutex_init (&aq->mutex, NULL))
3938 {
3939 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
3940 return false;
3941 }
3942 if (pthread_cond_init (&aq->queue_cond_in, NULL))
3943 {
3944 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
3945 return false;
3946 }
3947 if (pthread_cond_init (&aq->queue_cond_out, NULL))
3948 {
3949 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
3950 return false;
3951 }
3952
3953 hsa_status_t status = hsa_fns.hsa_queue_create_fn (agent->id,
3954 ASYNC_QUEUE_SIZE,
3955 HSA_QUEUE_TYPE_MULTI,
3956 hsa_queue_callback, NULL,
3957 UINT32_MAX, UINT32_MAX,
3958 &aq->hsa_queue);
3959 if (status != HSA_STATUS_SUCCESS)
3960 hsa_fatal ("Error creating command queue", status);
3961
3962 int err = pthread_create (&aq->thread_drain_queue, NULL, &drain_queue, aq);
3963 if (err != 0)
3964 GOMP_PLUGIN_fatal ("GCN asynchronous thread creation failed: %s",
3965 strerror (err));
3966 GCN_DEBUG ("Async thread %d:%d: created\n", aq->agent->device_id,
3967 aq->id);
3968
3969 pthread_mutex_unlock (&agent->async_queues_mutex);
3970
3971 return aq;
3972}
3973
/* Destroy an existing asynchronous thread and queue.  Waits for any
   currently-running task to complete, but cancels any queued tasks.  */
3976
3977bool
3978GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
3979{
3980 struct agent_info *agent = aq->agent;
3981
3982 finalize_async_thread (aq);
3983
3984 pthread_mutex_lock (&agent->async_queues_mutex);
3985
3986 int err;
3987 if ((err = pthread_mutex_destroy (&aq->mutex)))
3988 {
3989 GOMP_PLUGIN_error ("Failed to destroy a GCN async queue mutex: %d", err);
3990 goto fail;
3991 }
3992 if (pthread_cond_destroy (&aq->queue_cond_in))
3993 {
3994 GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
3995 goto fail;
3996 }
3997 if (pthread_cond_destroy (&aq->queue_cond_out))
3998 {
3999 GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
4000 goto fail;
4001 }
4002 hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (aq->hsa_queue);
4003 if (status != HSA_STATUS_SUCCESS)
4004 {
4005 hsa_error ("Error destroying command queue", status);
4006 goto fail;
4007 }
4008
4009 if (aq->prev)
4010 aq->prev->next = aq->next;
4011 if (aq->next)
4012 aq->next->prev = aq->prev;
4013 if (agent->async_queues == aq)
4014 agent->async_queues = aq->next;
4015
4016 GCN_DEBUG ("Async thread %d:%d: destroyed\n", agent->device_id, aq->id);
4017
4018 free (aq);
4019 pthread_mutex_unlock (&agent->async_queues_mutex);
4020 return true;
4021
4022fail:
4023 pthread_mutex_unlock (&agent->async_queues_mutex);
4024 return false;
4025}
4026
4027/* Return true if the specified async queue is currently empty. */
4028
int
GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
{
  /* Non-zero iff no tasks remain queued on AQ.  */
  int is_empty = queue_empty (aq);
  return is_empty;
}
4034
4035/* Block until the specified queue has executed all its tasks and the
4036 queue is empty. */
4037
bool
GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
{
  /* Block until AQ has drained completely; this cannot fail.  */
  wait_queue (aq);
  return true;
}
4044
4045/* Add a serialization point across two async queues. Any new tasks added to
4046 AQ2, after this call, will not run until all tasks on AQ1, at the time
4047 of this call, have completed. */
4048
bool
GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
				      struct goacc_asyncqueue *aq2)
{
  /* Serializing a queue against itself is a no-op.  */
  if (aq1 == aq2)
    return true;

  /* Plant a placeholder at AQ1's current tail and make AQ2 wait on it,
     so work later queued on AQ2 runs only after AQ1's current work.  */
  struct placeholder *marker = queue_push_placeholder (aq1);
  queue_push_asyncwait (aq2, marker);
  return true;
}
4062
4063/* Add an opaque callback to the given async queue. */
4064
void
GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
					   void (*fn) (void *), void *data)
{
  /* FN (DATA) will be invoked once all earlier tasks on AQ finish.  */
  queue_push_callback (aq, fn, data);
}
4071
4072/* Queue up an asynchronous data copy from host to DEVICE. */
4073
4074bool
4075GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
4076 size_t n, struct goacc_asyncqueue *aq)
4077{
4078 struct agent_info *agent = get_agent_info (device);
4079 assert (agent == aq->agent);
9c41f5b9 4080 queue_push_copy (aq, dst, src, n);
237957cc
AS
4081 return true;
4082}
4083
4084/* Queue up an asynchronous data copy from DEVICE to host. */
4085
4086bool
4087GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src,
4088 size_t n, struct goacc_asyncqueue *aq)
4089{
4090 struct agent_info *agent = get_agent_info (device);
4091 assert (agent == aq->agent);
9c41f5b9 4092 queue_push_copy (aq, dst, src, n);
237957cc
AS
4093 return true;
4094}
4095
6fc0385c
TS
4096union goacc_property_value
4097GOMP_OFFLOAD_openacc_get_property (int device, enum goacc_property prop)
4098{
2e5ea579
FH
4099 struct agent_info *agent = get_agent_info (device);
4100
4101 union goacc_property_value propval = { .val = 0 };
4102
4103 switch (prop)
4104 {
4105 case GOACC_PROPERTY_FREE_MEMORY:
4106 /* Not supported. */
4107 break;
4108 case GOACC_PROPERTY_MEMORY:
4109 {
4110 size_t size;
4111 hsa_region_t region = agent->data_region;
4112 hsa_status_t status =
4113 hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
4114 if (status == HSA_STATUS_SUCCESS)
4115 propval.val = size;
4116 break;
4117 }
4118 case GOACC_PROPERTY_NAME:
4119 propval.ptr = agent->name;
4120 break;
4121 case GOACC_PROPERTY_VENDOR:
4122 propval.ptr = agent->vendor_name;
4123 break;
4124 case GOACC_PROPERTY_DRIVER:
4125 propval.ptr = hsa_context.driver_version_s;
4126 break;
4127 }
6fc0385c 4128
2e5ea579 4129 return propval;
6fc0385c
TS
4130}
4131
237957cc
AS
4132/* Set up plugin-specific thread-local-data (host-side). */
4133
4134void *
4135GOMP_OFFLOAD_openacc_create_thread_data (int ord __attribute__((unused)))
4136{
4137 struct gcn_thread *thread_data
4138 = GOMP_PLUGIN_malloc (sizeof (struct gcn_thread));
4139
4140 thread_data->async = GOMP_ASYNC_SYNC;
4141
4142 return (void *) thread_data;
4143}
4144
4145/* Clean up plugin-specific thread-local-data. */
4146
void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  /* DATA was allocated by GOMP_OFFLOAD_openacc_create_thread_data.  */
  free (data);
}
4152
4153/* }}} */