/* Plugin for AMD GCN execution.

   Copyright (C) 2013-2024 Free Software Foundation, Inc.

   Contributed by Mentor Embedded

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* {{{ Includes and defines  */

#include "config.h"
#include "symcat.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <inttypes.h>
#include <stdbool.h>
#include <limits.h>
#include <hsa.h>
#include <hsa_ext_amd.h>
#include <dlfcn.h>
#include <signal.h>
#include "libgomp-plugin.h"
#include "config/gcn/libgomp-gcn.h"  /* For struct output.  */
#include "gomp-constants.h"
#include <elf.h>
#include "oacc-plugin.h"
#include "oacc-int.h"
#include <assert.h>

/* These probably won't be in elf.h for a while.  */
#ifndef R_AMDGPU_NONE
#define R_AMDGPU_NONE		0
#define R_AMDGPU_ABS32_LO	1	/* (S + A) & 0xFFFFFFFF  */
#define R_AMDGPU_ABS32_HI	2	/* (S + A) >> 32  */
#define R_AMDGPU_ABS64		3	/* S + A  */
#define R_AMDGPU_REL32		4	/* S + A - P  */
#define R_AMDGPU_REL64		5	/* S + A - P  */
#define R_AMDGPU_ABS32		6	/* S + A  */
#define R_AMDGPU_GOTPCREL	7	/* G + GOT + A - P  */
#define R_AMDGPU_GOTPCREL32_LO	8	/* (G + GOT + A - P) & 0xFFFFFFFF  */
#define R_AMDGPU_GOTPCREL32_HI	9	/* (G + GOT + A - P) >> 32  */
#define R_AMDGPU_REL32_LO	10	/* (S + A - P) & 0xFFFFFFFF  */
#define R_AMDGPU_REL32_HI	11	/* (S + A - P) >> 32  */
#define R_AMDGPU_RELATIVE64	13	/* B + A  */
#endif

/* GCN specific definitions for asynchronous queues.  */

#define ASYNC_QUEUE_SIZE 64
#define DRAIN_QUEUE_SYNCHRONOUS_P false
#define DEBUG_QUEUES 0
#define DEBUG_THREAD_SLEEP 0
#define DEBUG_THREAD_SIGNAL 0

/* Defaults.  */
#define DEFAULT_GCN_HEAP_SIZE (100*1024*1024)  /* 100 MB.  */

/* Secure getenv() which returns NULL if running as SUID/SGID.  */
#ifndef HAVE_SECURE_GETENV
#ifdef HAVE___SECURE_GETENV
#define secure_getenv __secure_getenv
#elif defined (HAVE_UNISTD_H) && defined(HAVE_GETUID) && defined(HAVE_GETEUID) \
  && defined(HAVE_GETGID) && defined(HAVE_GETEGID)

#include <unistd.h>

/* Implementation of secure_getenv() for targets where it is not provided but
   we have at least means to test real and effective IDs.  */

static char *
secure_getenv (const char *name)
{
  if ((getuid () == geteuid ()) && (getgid () == getegid ()))
    return getenv (name);
  else
    return NULL;
}

#else
#define secure_getenv getenv
#endif
#endif

/* }}}  */
/* {{{ Types  */

/* GCN-specific implementation of the GOMP_PLUGIN_acc_thread data.  */

struct gcn_thread
{
  /* The thread number from the async clause, or GOMP_ASYNC_SYNC.  */
  int async;
};

/* As the HSA runtime is dlopened, the following structure defines the
   function pointers utilized by the plug-in.  */

struct hsa_runtime_fn_info
{
  /* HSA runtime.  */
  hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
					const char **status_string);
  hsa_status_t (*hsa_system_get_info_fn) (hsa_system_info_t attribute,
					  void *value);
  hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
					 hsa_agent_info_t attribute,
					 void *value);
  hsa_status_t (*hsa_isa_get_info_fn) (hsa_isa_t isa,
				       hsa_isa_info_t attribute,
				       uint32_t index,
				       void *value);
  hsa_status_t (*hsa_init_fn) (void);
  hsa_status_t (*hsa_iterate_agents_fn)
    (hsa_status_t (*callback) (hsa_agent_t agent, void *data), void *data);
  hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
					  hsa_region_info_t attribute,
					  void *value);
  hsa_status_t (*hsa_queue_create_fn)
    (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
     void (*callback) (hsa_status_t status, hsa_queue_t *source, void *data),
     void *data, uint32_t private_segment_size,
     uint32_t group_segment_size, hsa_queue_t **queue);
  hsa_status_t (*hsa_agent_iterate_regions_fn)
    (hsa_agent_t agent,
     hsa_status_t (*callback) (hsa_region_t region, void *data), void *data);
  hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
  hsa_status_t (*hsa_executable_create_fn)
    (hsa_profile_t profile, hsa_executable_state_t executable_state,
     const char *options, hsa_executable_t *executable);
  hsa_status_t (*hsa_executable_global_variable_define_fn)
    (hsa_executable_t executable, const char *variable_name, void *address);
  hsa_status_t (*hsa_executable_load_code_object_fn)
    (hsa_executable_t executable, hsa_agent_t agent,
     hsa_code_object_t code_object, const char *options);
  hsa_status_t (*hsa_executable_freeze_fn) (hsa_executable_t executable,
					    const char *options);
  hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
					uint32_t num_consumers,
					const hsa_agent_t *consumers,
					hsa_signal_t *signal);
  hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
					  void **ptr);
  hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent,
					      hsa_access_permission_t access);
  hsa_status_t (*hsa_memory_copy_fn) (void *dst, const void *src, size_t size);
  hsa_status_t (*hsa_memory_free_fn) (void *ptr);
  hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
  hsa_status_t (*hsa_executable_get_symbol_fn)
    (hsa_executable_t executable, const char *module_name,
     const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
     hsa_executable_symbol_t *symbol);
  hsa_status_t (*hsa_executable_symbol_get_info_fn)
    (hsa_executable_symbol_t executable_symbol,
     hsa_executable_symbol_info_t attribute, void *value);
  hsa_status_t (*hsa_executable_iterate_symbols_fn)
    (hsa_executable_t executable,
     hsa_status_t (*callback) (hsa_executable_t executable,
			       hsa_executable_symbol_t symbol, void *data),
     void *data);
  uint64_t (*hsa_queue_add_write_index_release_fn) (const hsa_queue_t *queue,
						    uint64_t value);
  uint64_t (*hsa_queue_load_read_index_acquire_fn) (const hsa_queue_t *queue);
  void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
				       hsa_signal_value_t value);
  void (*hsa_signal_store_release_fn) (hsa_signal_t signal,
				       hsa_signal_value_t value);
  hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
    (hsa_signal_t signal, hsa_signal_condition_t condition,
     hsa_signal_value_t compare_value, uint64_t timeout_hint,
     hsa_wait_state_t wait_state_hint);
  hsa_signal_value_t (*hsa_signal_load_acquire_fn) (hsa_signal_t signal);
  hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);

  hsa_status_t (*hsa_code_object_deserialize_fn)
    (void *serialized_code_object, size_t serialized_code_object_size,
     const char *options, hsa_code_object_t *code_object);
  hsa_status_t (*hsa_amd_memory_lock_fn)
    (void *host_ptr, size_t size, hsa_agent_t *agents, int num_agent,
     void **agent_ptr);
  hsa_status_t (*hsa_amd_memory_unlock_fn) (void *host_ptr);
  hsa_status_t (*hsa_amd_memory_async_copy_rect_fn)
    (const hsa_pitched_ptr_t *dst, const hsa_dim3_t *dst_offset,
     const hsa_pitched_ptr_t *src, const hsa_dim3_t *src_offset,
     const hsa_dim3_t *range, hsa_agent_t copy_agent,
     hsa_amd_copy_direction_t dir, uint32_t num_dep_signals,
     const hsa_signal_t *dep_signals, hsa_signal_t completion_signal);
};

/* Structure describing the run-time and grid properties of an HSA kernel
   launch.  This needs to match the format passed to GOMP_OFFLOAD_run.  */

struct GOMP_kernel_launch_attributes
{
  /* Number of dimensions the workload has.  Maximum number is 3.  */
  uint32_t ndim;
  /* Size of the grid in the three respective dimensions.  */
  uint32_t gdims[3];
  /* Size of work-groups in the respective dimensions.  */
  uint32_t wdims[3];
};
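
/* As an illustration (not part of the original source): for the OpenMP
   defaults chosen in parse_target_attributes below, a launch on a
   hypothetical 4-CU device would be described roughly as

     struct GOMP_kernel_launch_attributes kla = {
       .ndim = 3,
       .gdims = {4, 64, 16},   /- teams x 64 work-items x threads -/
       .wdims = {1, 64, 16}    /- one team per work-group -/
     };

   The actual values are computed at run time from the device and the
   compiler-provided target arguments.  */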

/* Collection of information needed for a kernel dispatch.  */

struct kernel_dispatch
{
  struct agent_info *agent;
  /* Pointer to a command queue associated with a kernel dispatch agent.  */
  void *queue;
  /* Pointer to a memory space used for kernel arguments passing.  */
  void *kernarg_address;
  /* Kernel object.  */
  uint64_t object;
  /* Synchronization signal used for dispatch synchronization.  */
  uint64_t signal;
  /* Private segment size.  */
  uint32_t private_segment_size;
  /* Group segment size.  */
  uint32_t group_segment_size;
};

/* Structure of the kernargs segment, supporting console output.

   This needs to match the definitions in Newlib, and the expectations
   in libgomp target code.  */

struct kernargs {
  struct kernargs_abi abi;

  /* Output data.  */
  struct output output_data;
};

/* A queue entry for a future asynchronous launch.  */

struct kernel_launch
{
  struct kernel_info *kernel;
  void *vars;
  struct GOMP_kernel_launch_attributes kla;
};

/* A queue entry for a future callback.  */

struct callback
{
  void (*fn)(void *);
  void *data;
};

/* A data struct for the copy_data callback.  */

struct copy_data
{
  void *dst;
  const void *src;
  size_t len;
  struct goacc_asyncqueue *aq;
};

/* A queue entry for a placeholder.  These correspond to a wait event.  */

struct placeholder
{
  int executed;
  pthread_cond_t cond;
  pthread_mutex_t mutex;
};

/* A queue entry for a wait directive.  */

struct asyncwait_info
{
  struct placeholder *placeholderp;
};

/* Encode the type of an entry in an async queue.  */

enum entry_type
{
  KERNEL_LAUNCH,
  CALLBACK,
  ASYNC_WAIT,
  ASYNC_PLACEHOLDER
};

/* An entry in an async queue.  */

struct queue_entry
{
  enum entry_type type;
  union {
    struct kernel_launch launch;
    struct callback callback;
    struct asyncwait_info asyncwait;
    struct placeholder placeholder;
  } u;
};

/* An async queue header.

   OpenMP may create one of these.
   OpenACC may create many.  */

struct goacc_asyncqueue
{
  struct agent_info *agent;
  hsa_queue_t *hsa_queue;

  pthread_t thread_drain_queue;
  pthread_mutex_t mutex;
  pthread_cond_t queue_cond_in;
  pthread_cond_t queue_cond_out;
  struct queue_entry queue[ASYNC_QUEUE_SIZE];
  int queue_first;
  int queue_n;
  int drain_queue_stop;

  int id;
  struct goacc_asyncqueue *prev;
  struct goacc_asyncqueue *next;
};
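
/* A sketch of how the fields above fit together (inferred from the field
   names, not spelled out in the original source): QUEUE acts as a circular
   buffer, with QUEUE_FIRST indexing the oldest pending entry and QUEUE_N
   counting the entries currently queued, so the i-th pending entry and the
   next free slot would be

     queue[(queue_first + i) % ASYNC_QUEUE_SIZE]
     queue[(queue_first + queue_n) % ASYNC_QUEUE_SIZE]

   with QUEUE_COND_IN/QUEUE_COND_OUT signalling not-full/not-empty between
   the producer and the drain thread.  */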

/* Mkoffload uses this structure to describe a kernel.

   OpenMP kernel dimensions are passed at runtime.
   OpenACC kernel dimensions are passed at compile time, here.  */

struct hsa_kernel_description
{
  const char *name;
  int oacc_dims[3];  /* Only present for GCN kernels.  */
  int sgpr_count;
  int vpgr_count;
};

/* Mkoffload uses this structure to describe an offload variable.  */

struct global_var_info
{
  const char *name;
  void *address;
};

/* Mkoffload uses this structure to describe all the kernels in a
   loadable module.  These are passed to libgomp via static constructors.  */

struct gcn_image_desc
{
  struct gcn_image {
    size_t size;
    void *image;
  } *gcn_image;
  const unsigned kernel_count;
  struct hsa_kernel_description *kernel_infos;
  const unsigned ind_func_count;
  const unsigned global_variable_count;
};

/* This enum mirrors the corresponding LLVM enum's values for all ISAs that we
   support.
   See https://llvm.org/docs/AMDGPUUsage.html#amdgpu-ef-amdgpu-mach-table  */

typedef enum {
  EF_AMDGPU_MACH_UNSUPPORTED = -1,
  EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a,
  EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c,
  EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f,
  EF_AMDGPU_MACH_AMDGCN_GFX908 = 0x030,
  EF_AMDGPU_MACH_AMDGCN_GFX90a = 0x03f,
  EF_AMDGPU_MACH_AMDGCN_GFX1030 = 0x036,
  EF_AMDGPU_MACH_AMDGCN_GFX1100 = 0x041
} EF_AMDGPU_MACH;

const static int EF_AMDGPU_MACH_MASK = 0x000000ff;
typedef EF_AMDGPU_MACH gcn_isa;

/* Description of an HSA GPU agent (device) and the program associated with
   it.  */

struct agent_info
{
  /* The HSA ID of the agent.  Assigned when hsa_context is initialized.  */
  hsa_agent_t id;
  /* The user-visible device number.  */
  int device_id;
  /* Whether the agent has been initialized.  The fields below are usable only
     if it has been.  */
  bool initialized;

  /* The instruction set architecture of the device.  */
  gcn_isa device_isa;
  /* Name of the agent.  */
  char name[64];
  /* Name of the vendor of the agent.  */
  char vendor_name[64];
  /* Command queues of the agent.  */
  hsa_queue_t *sync_queue;
  struct goacc_asyncqueue *async_queues, *omp_async_queue;
  pthread_mutex_t async_queues_mutex;

  /* The HSA memory region from which to allocate kernel arguments.  */
  hsa_region_t kernarg_region;

  /* The HSA memory region from which to allocate device data.  */
  hsa_region_t data_region;

  /* Allocated ephemeral memories (team arena and stack space).  */
  struct ephemeral_memories_list *ephemeral_memories_list;
  pthread_mutex_t ephemeral_memories_write_lock;

  /* Read-write lock that protects kernels which are running or about to be run
     from interference with loading and unloading of images.  Needs to be
     locked for reading while a kernel is being run, and for writing if the
     list of modules is manipulated (and thus the HSA program invalidated).  */
  pthread_rwlock_t module_rwlock;

  /* The module associated with this agent.  */
  struct module_info *module;

  /* Mutex enforcing that only one thread will finalize the HSA program.  A
     thread should have locked agent->module_rwlock for reading before
     acquiring it.  */
  pthread_mutex_t prog_mutex;
  /* Flag whether the HSA program that consists of all the modules has been
     finalized.  */
  bool prog_finalized;
  /* HSA executable - the finalized program that is used to locate kernels.  */
  hsa_executable_t executable;
};

/* Information required to identify, finalize and run any given kernel.  */

enum offload_kind {KIND_UNKNOWN, KIND_OPENMP, KIND_OPENACC};

struct kernel_info
{
  /* Name of the kernel, required to locate it within the GCN object-code
     module.  */
  const char *name;
  /* The specific agent the kernel has been or will be finalized for and run
     on.  */
  struct agent_info *agent;
  /* The specific module where the kernel takes place.  */
  struct module_info *module;
  /* Information provided by mkoffload associated with the kernel.  */
  struct hsa_kernel_description *description;
  /* Mutex enforcing that at most one thread ever initializes a kernel for
     use.  A thread should have locked agent->module_rwlock for reading before
     acquiring it.  */
  pthread_mutex_t init_mutex;
  /* Flag indicating whether the kernel has been initialized and all fields
     below it contain valid data.  */
  bool initialized;
  /* Flag indicating that the kernel has a problem that blocks an execution.  */
  bool initialization_failed;
  /* The object to be put into the dispatch queue.  */
  uint64_t object;
  /* Required size of kernel arguments.  */
  uint32_t kernarg_segment_size;
  /* Required size of group segment.  */
  uint32_t group_segment_size;
  /* Required size of private segment.  */
  uint32_t private_segment_size;
  /* Set up for OpenMP or OpenACC?  */
  enum offload_kind kind;
};

/* Information about a particular GCN module, its image and kernels.  */

struct module_info
{
  /* The description with which the program has registered the image.  */
  struct gcn_image_desc *image_desc;
  /* GCN heap allocation.  */
  struct heap *heap;
  /* Physical boundaries of the loaded module.  */
  Elf64_Addr phys_address_start;
  Elf64_Addr phys_address_end;

  bool constructors_run_p;
  struct kernel_info *init_array_func, *fini_array_func;

  /* Number of kernels in this module.  */
  int kernel_count;
  /* An array of kernel_info structures describing each kernel in this
     module.  */
  struct kernel_info kernels[];
};

/* A linked list of memory arenas allocated on the device.
   These are used by OpenMP, as a means to optimize per-team malloc,
   and for host-accessible stack space.  */

struct ephemeral_memories_list
{
  struct ephemeral_memories_list *next;

  /* The size is determined by the number of teams and threads.  */
  size_t size;
  /* The device address of the allocated memory.  */
  void *address;
  /* A flag to prevent two asynchronous kernels trying to use the same memory.
     The mutex is locked until the kernel exits.  */
  pthread_mutex_t in_use;
};

/* Information about the whole HSA environment and all of its agents.  */

struct hsa_context_info
{
  /* Whether the structure has been initialized.  */
  bool initialized;
  /* Number of usable GPU HSA agents in the system.  */
  int agent_count;
  /* Array of agent_info structures describing the individual HSA agents.  */
  struct agent_info *agents;
  /* Driver version string.  */
  char driver_version_s[30];
};

/* }}}  */
/* {{{ Global variables  */

/* Information about the whole HSA environment and all of its agents.  */

static struct hsa_context_info hsa_context;

/* HSA runtime functions that are initialized in init_hsa_context.  */

static struct hsa_runtime_fn_info hsa_fns;

/* Heap space, allocated target-side, provided for use of newlib malloc.
   Each module should have its own heap allocated.
   Beware that heap usage increases with OpenMP teams.  See also arenas.  */

static size_t gcn_kernel_heap_size = DEFAULT_GCN_HEAP_SIZE;

/* Ephemeral memory sizes for each kernel launch.  */

static int team_arena_size = DEFAULT_TEAM_ARENA_SIZE;
static int stack_size = DEFAULT_GCN_STACK_SIZE;
static int lowlat_size = -1;

/* Flag to decide whether to print information to stderr about what is going
   on.  Set in init_debug depending on environment variables.  */

static bool debug;

/* Flag to decide if the runtime should suppress a possible fallback to host
   execution.  */

static bool suppress_host_fallback;

/* Name of the HSA runtime shared library that is dlopened by this
   plug-in.  */

static const char *hsa_runtime_lib;

/* Flag to decide if the runtime should also support CPU devices (can be
   a simulator).  */

static bool support_cpu_devices;

/* Runtime dimension overrides.  Zero indicates default.  */

static int override_x_dim = 0;
static int override_z_dim = 0;

/* }}}  */
/* {{{ Debug & Diagnostic  */

/* Print a message to stderr if GCN_DEBUG value is set to true.  */

#define DEBUG_PRINT(...) \
  do \
    { \
      if (debug) \
	{ \
	  fprintf (stderr, __VA_ARGS__); \
	} \
    } \
  while (false)

/* Flush stderr if GCN_DEBUG value is set to true.  */

#define DEBUG_FLUSH() \
  do { \
    if (debug) \
      fflush (stderr); \
  } while (false)

/* Print a logging message with PREFIX to stderr if GCN_DEBUG value
   is set to true.  */

#define DEBUG_LOG(prefix, ...) \
  do \
    { \
      DEBUG_PRINT (prefix); \
      DEBUG_PRINT (__VA_ARGS__); \
      DEBUG_FLUSH (); \
    } while (false)

/* Print a debugging message to stderr.  */

#define GCN_DEBUG(...) DEBUG_LOG ("GCN debug: ", __VA_ARGS__)

/* Print a warning message to stderr.  */

#define GCN_WARNING(...) DEBUG_LOG ("GCN warning: ", __VA_ARGS__)
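
/* For instance (illustrative, not from the original source), a call such as

     GCN_DEBUG ("Loading module %s\n", module_name);

   expands to two guarded fprintf calls (prefix, then message) followed by
   an fflush, all of which are no-ops unless the GCN_DEBUG environment
   variable has enabled the 'debug' flag at startup; 'module_name' here is
   a hypothetical argument.  */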

/* Print HSA warning STR with an HSA STATUS code.  */

static void
hsa_warn (const char *str, hsa_status_t status)
{
  if (!debug)
    return;

  const char *hsa_error_msg = "[unknown]";
  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);

  fprintf (stderr, "GCN warning: %s\nRuntime message: %s\n", str,
	   hsa_error_msg);
}

/* Report a fatal error STR together with the HSA error corresponding to STATUS
   and terminate execution of the current process.  */

static void
hsa_fatal (const char *str, hsa_status_t status)
{
  const char *hsa_error_msg = "[unknown]";
  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
  GOMP_PLUGIN_fatal ("GCN fatal error: %s\nRuntime message: %s\n", str,
		     hsa_error_msg);
}

/* Like hsa_fatal, except only report the error message and return FALSE,
   so that error processing can propagate outside of the plugin.  */

static bool
hsa_error (const char *str, hsa_status_t status)
{
  const char *hsa_error_msg = "[unknown]";
  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
  GOMP_PLUGIN_error ("GCN fatal error: %s\nRuntime message: %s\n", str,
		     hsa_error_msg);
  return false;
}

/* Dump information about the available hardware.  */

static void
dump_hsa_system_info (void)
{
  hsa_status_t status;

  hsa_endianness_t endianness;
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_ENDIANNESS,
					   &endianness);
  if (status == HSA_STATUS_SUCCESS)
    switch (endianness)
      {
      case HSA_ENDIANNESS_LITTLE:
	GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: LITTLE\n");
	break;
      case HSA_ENDIANNESS_BIG:
	GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: BIG\n");
	break;
      default:
	GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: UNKNOWN\n");
      }
  else
    GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: FAILED\n");

  uint8_t extensions[128];
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_EXTENSIONS,
					   &extensions);
  if (status == HSA_STATUS_SUCCESS)
    {
      if (extensions[0] & (1 << HSA_EXTENSION_IMAGES))
	GCN_DEBUG ("HSA_SYSTEM_INFO_EXTENSIONS: IMAGES\n");
    }
  else
    GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n");
}

/* Dump the machine model MACHINE_MODEL, prefixed by the string S.  */

static void
dump_machine_model (hsa_machine_model_t machine_model, const char *s)
{
  switch (machine_model)
    {
    case HSA_MACHINE_MODEL_SMALL:
      GCN_DEBUG ("%s: SMALL\n", s);
      break;
    case HSA_MACHINE_MODEL_LARGE:
      GCN_DEBUG ("%s: LARGE\n", s);
      break;
    default:
      GCN_WARNING ("%s: UNKNOWN\n", s);
      break;
    }
}

/* Dump the profile PROFILE, prefixed by the string S.  */

static void
dump_profile (hsa_profile_t profile, const char *s)
{
  switch (profile)
    {
    case HSA_PROFILE_FULL:
      GCN_DEBUG ("%s: FULL\n", s);
      break;
    case HSA_PROFILE_BASE:
      GCN_DEBUG ("%s: BASE\n", s);
      break;
    default:
      GCN_WARNING ("%s: UNKNOWN\n", s);
      break;
    }
}

/* Dump information about a device memory region.  */

static hsa_status_t
dump_hsa_region (hsa_region_t region, void *data __attribute__((unused)))
{
  hsa_status_t status;

  hsa_region_segment_t segment;
  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
					   &segment);
  if (status == HSA_STATUS_SUCCESS)
    {
      if (segment == HSA_REGION_SEGMENT_GLOBAL)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GLOBAL\n");
      else if (segment == HSA_REGION_SEGMENT_READONLY)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: READONLY\n");
      else if (segment == HSA_REGION_SEGMENT_PRIVATE)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: PRIVATE\n");
      else if (segment == HSA_REGION_SEGMENT_GROUP)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GROUP\n");
      else
	GCN_WARNING ("HSA_REGION_INFO_SEGMENT: UNKNOWN\n");
    }
  else
    GCN_WARNING ("HSA_REGION_INFO_SEGMENT: FAILED\n");

  /* Only consult SEGMENT if the query above succeeded; it is otherwise
     uninitialized.  */
  if (status == HSA_STATUS_SUCCESS && segment == HSA_REGION_SEGMENT_GLOBAL)
    {
      uint32_t flags;
      status
	= hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
					  &flags);
      if (status == HSA_STATUS_SUCCESS)
	{
	  if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: KERNARG\n");
	  if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED)
	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: FINE_GRAINED\n");
	  if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED)
	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: COARSE_GRAINED\n");
	}
      else
	GCN_WARNING ("HSA_REGION_INFO_GLOBAL_FLAGS: FAILED\n");
    }

  size_t size;
  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_SIZE: %zu\n", size);
  else
    GCN_WARNING ("HSA_REGION_INFO_SIZE: FAILED\n");

  status
    = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_ALLOC_MAX_SIZE,
				      &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_ALLOC_MAX_SIZE: %zu\n", size);
  else
    GCN_WARNING ("HSA_REGION_INFO_ALLOC_MAX_SIZE: FAILED\n");

  bool alloc_allowed;
  status
    = hsa_fns.hsa_region_get_info_fn (region,
				      HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED,
				      &alloc_allowed);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: %u\n", alloc_allowed);
  else
    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: FAILED\n");

  if (status != HSA_STATUS_SUCCESS || !alloc_allowed)
    return HSA_STATUS_SUCCESS;

  status
    = hsa_fns.hsa_region_get_info_fn (region,
				      HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE,
				      &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: %zu\n", size);
  else
    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: FAILED\n");

  size_t align;
  status
    = hsa_fns.hsa_region_get_info_fn (region,
				      HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT,
				      &align);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: %zu\n", align);
  else
    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: FAILED\n");

  return HSA_STATUS_SUCCESS;
}

/* Dump information about all the device memory regions.  */

static void
dump_hsa_regions (hsa_agent_t agent)
{
  hsa_status_t status;
  status = hsa_fns.hsa_agent_iterate_regions_fn (agent,
						 dump_hsa_region,
						 NULL);
  if (status != HSA_STATUS_SUCCESS)
    hsa_error ("Dumping hsa regions failed", status);
}

/* Dump information about the available devices.  */

static hsa_status_t
dump_hsa_agent_info (hsa_agent_t agent, void *data __attribute__((unused)))
{
  hsa_status_t status;

  char buf[64];
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME,
					  &buf);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_NAME: %s\n", buf);
  else
    GCN_WARNING ("HSA_AGENT_INFO_NAME: FAILED\n");

  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_VENDOR_NAME,
					  &buf);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_VENDOR_NAME: %s\n", buf);
  else
    GCN_WARNING ("HSA_AGENT_INFO_VENDOR_NAME: FAILED\n");

  hsa_machine_model_t machine_model;
  status
    = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_MACHINE_MODEL,
				     &machine_model);
  if (status == HSA_STATUS_SUCCESS)
    dump_machine_model (machine_model, "HSA_AGENT_INFO_MACHINE_MODEL");
  else
    GCN_WARNING ("HSA_AGENT_INFO_MACHINE_MODEL: FAILED\n");

  hsa_profile_t profile;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_PROFILE,
					  &profile);
  if (status == HSA_STATUS_SUCCESS)
    dump_profile (profile, "HSA_AGENT_INFO_PROFILE");
  else
    GCN_WARNING ("HSA_AGENT_INFO_PROFILE: FAILED\n");

  hsa_device_type_t device_type;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
					  &device_type);
  if (status == HSA_STATUS_SUCCESS)
    {
      switch (device_type)
	{
	case HSA_DEVICE_TYPE_CPU:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: CPU\n");
	  break;
	case HSA_DEVICE_TYPE_GPU:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: GPU\n");
	  break;
	case HSA_DEVICE_TYPE_DSP:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: DSP\n");
	  break;
	default:
	  GCN_WARNING ("HSA_AGENT_INFO_DEVICE: UNKNOWN\n");
	  break;
	}
    }
  else
    GCN_WARNING ("HSA_AGENT_INFO_DEVICE: FAILED\n");

  uint32_t cu_count;
  status = hsa_fns.hsa_agent_get_info_fn
    (agent, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: %u\n", cu_count);
  else
    GCN_WARNING ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: FAILED\n");

  uint32_t size;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_WAVEFRONT_SIZE,
					  &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WAVEFRONT_SIZE: %u\n", size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WAVEFRONT_SIZE: FAILED\n");

  uint32_t max_dim;
  status = hsa_fns.hsa_agent_get_info_fn (agent,
					  HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
					  &max_dim);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: %u\n", max_dim);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: FAILED\n");

  uint32_t max_size;
  status = hsa_fns.hsa_agent_get_info_fn (agent,
					  HSA_AGENT_INFO_WORKGROUP_MAX_SIZE,
					  &max_size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: %u\n", max_size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: FAILED\n");

  uint32_t grid_max_dim;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_DIM,
					  &grid_max_dim);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_DIM: %u\n", grid_max_dim);
  else
    GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_DIM: FAILED\n");

  uint32_t grid_max_size;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_SIZE,
					  &grid_max_size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_SIZE: %u\n", grid_max_size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_SIZE: FAILED\n");

  dump_hsa_regions (agent);

  return HSA_STATUS_SUCCESS;
}

/* Forward reference.  */

static char *get_executable_symbol_name (hsa_executable_symbol_t symbol);

/* Helper function for dump_executable_symbols.  */

static hsa_status_t
dump_executable_symbol (hsa_executable_t executable,
			hsa_executable_symbol_t symbol,
			void *data __attribute__((unused)))
{
  char *name = get_executable_symbol_name (symbol);

  if (name)
    {
      GCN_DEBUG ("executable symbol: %s\n", name);
      free (name);
    }

  return HSA_STATUS_SUCCESS;
}

/* Dump all global symbols in an executable.  */

static void
dump_executable_symbols (hsa_executable_t executable)
{
  hsa_status_t status;
  status
    = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
						 dump_executable_symbol,
						 NULL);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not dump HSA executable symbols", status);
}

/* Dump kernel DISPATCH data structure and indent it by INDENT spaces.  */

static void
print_kernel_dispatch (struct kernel_dispatch *dispatch, unsigned indent)
{
  struct kernargs *kernargs = (struct kernargs *)dispatch->kernarg_address;

  fprintf (stderr, "%*sthis: %p\n", indent, "", dispatch);
  fprintf (stderr, "%*squeue: %p\n", indent, "", dispatch->queue);
  fprintf (stderr, "%*skernarg_address: %p\n", indent, "", kernargs);
  fprintf (stderr, "%*sheap address: %p\n", indent, "",
	   (void*)kernargs->abi.heap_ptr);
  fprintf (stderr, "%*sarena address: %p (%d bytes per workgroup)\n", indent,
	   "", (void*)kernargs->abi.arena_ptr,
	   kernargs->abi.arena_size_per_team);
  fprintf (stderr, "%*sstack address: %p (%d bytes per wavefront)\n", indent,
	   "", (void*)kernargs->abi.stack_ptr,
	   kernargs->abi.stack_size_per_thread);
  fprintf (stderr, "%*sobject: %lu\n", indent, "", dispatch->object);
  fprintf (stderr, "%*sprivate_segment_size: %u\n", indent, "",
	   dispatch->private_segment_size);
  fprintf (stderr, "%*sgroup_segment_size: %u (low-latency pool)\n", indent,
	   "", dispatch->group_segment_size);
  fprintf (stderr, "\n");
}

/* }}}  */
/* {{{ Utility functions  */

/* Cast the thread local storage to gcn_thread.  */

static inline struct gcn_thread *
gcn_thread (void)
{
  return (struct gcn_thread *) GOMP_PLUGIN_acc_thread ();
}

/* Initialize debug and suppress_host_fallback according to the environment.  */

static void
init_environment_variables (void)
{
  if (secure_getenv ("GCN_DEBUG"))
    debug = true;
  else
    debug = false;

  if (secure_getenv ("GCN_SUPPRESS_HOST_FALLBACK"))
    suppress_host_fallback = true;
  else
    suppress_host_fallback = false;

  hsa_runtime_lib = secure_getenv ("HSA_RUNTIME_LIB");
  if (hsa_runtime_lib == NULL)
    hsa_runtime_lib = "libhsa-runtime64.so.1";

  support_cpu_devices = secure_getenv ("GCN_SUPPORT_CPU_DEVICES");

  const char *x = secure_getenv ("GCN_NUM_TEAMS");
  if (!x)
    x = secure_getenv ("GCN_NUM_GANGS");
  if (x)
    override_x_dim = atoi (x);

  const char *z = secure_getenv ("GCN_NUM_THREADS");
  if (!z)
    z = secure_getenv ("GCN_NUM_WORKERS");
  if (z)
    override_z_dim = atoi (z);

  const char *heap = secure_getenv ("GCN_HEAP_SIZE");
  if (heap)
    {
      size_t tmp = atol (heap);
      if (tmp)
	gcn_kernel_heap_size = tmp;
    }

  const char *arena = secure_getenv ("GCN_TEAM_ARENA_SIZE");
  if (arena)
    {
      int tmp = atoi (arena);
      if (tmp)
	team_arena_size = tmp;
    }

  const char *stack = secure_getenv ("GCN_STACK_SIZE");
  if (stack)
    {
      int tmp = atoi (stack);
      if (tmp)
	stack_size = tmp;
    }

  const char *lowlat = secure_getenv ("GOMP_GCN_LOWLAT_POOL");
  if (lowlat)
    lowlat_size = atoi (lowlat);
}
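
/* To illustrate, a run with debugging enabled and a larger per-thread stack
   might be launched as (the program name and values here are arbitrary
   examples):

     GCN_DEBUG=1 GCN_STACK_SIZE=131072 ./my-offload-program

   Each variable above is read once, at plugin initialization, via
   secure_getenv, so changes made after startup have no effect.  */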

/* Return malloc'd string with name of SYMBOL.  */

static char *
get_executable_symbol_name (hsa_executable_symbol_t symbol)
{
  hsa_status_t status;
  char *res;
  uint32_t len;
  const hsa_executable_symbol_info_t info_name_length
    = HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH;

  status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name_length,
						      &len);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not get length of symbol name", status);
      return NULL;
    }

  res = GOMP_PLUGIN_malloc (len + 1);

  const hsa_executable_symbol_info_t info_name
    = HSA_EXECUTABLE_SYMBOL_INFO_NAME;

  status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name, res);

  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not get symbol name", status);
      free (res);
      return NULL;
    }

  res[len] = '\0';

  return res;
}

/* Get the number of GPU Compute Units.  */

static int
get_cu_count (struct agent_info *agent)
{
  uint32_t cu_count;
  hsa_status_t status = hsa_fns.hsa_agent_get_info_fn
    (agent->id, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
  if (status == HSA_STATUS_SUCCESS)
    return cu_count;
  else
    return 64;  /* The usual number for older devices.  */
}

/* Calculate the maximum grid size for OMP threads / OACC workers.
   This depends on the kernel's resource usage levels.  */

static int
limit_worker_threads (int threads)
{
  /* FIXME: Do something more intelligent here.
     GCN can always run 4 threads within a Compute Unit, but
     more than that depends on register usage.  */
  if (threads > 16)
    threads = 16;
  return threads;
}

/* This sets the maximum number of teams to twice the number of GPU Compute
   Units to avoid memory waste and corresponding memory access faults.  */

static int
limit_teams (int teams, struct agent_info *agent)
{
  int max_teams = 2 * get_cu_count (agent);
  if (teams > max_teams)
    teams = max_teams;
  return teams;
}

/* Parse the target attributes INPUT provided by the compiler and return true
   if we should run anything at all.  If INPUT is NULL, fill DEF with default
   values, then store INPUT or DEF into *RESULT.

   This is used for OpenMP only.  */

static bool
parse_target_attributes (void **input,
			 struct GOMP_kernel_launch_attributes *def,
			 struct GOMP_kernel_launch_attributes **result,
			 struct agent_info *agent)
{
  if (!input)
    GOMP_PLUGIN_fatal ("No target arguments provided");

  bool grid_attrs_found = false;
  bool gcn_dims_found = false;
  int gcn_teams = 0;
  int gcn_threads = 0;
  while (*input)
    {
      intptr_t id = (intptr_t) *input++, val;

      if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
	val = (intptr_t) *input++;
      else
	val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;

      val = (val > INT_MAX) ? INT_MAX : val;

      if ((id & GOMP_TARGET_ARG_DEVICE_MASK) == GOMP_DEVICE_GCN
	  && ((id & GOMP_TARGET_ARG_ID_MASK)
	      == GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES))
	{
	  grid_attrs_found = true;
	  break;
	}
      else if ((id & GOMP_TARGET_ARG_DEVICE_MASK)
	       == GOMP_TARGET_ARG_DEVICE_ALL)
	{
	  gcn_dims_found = true;
	  switch (id & GOMP_TARGET_ARG_ID_MASK)
	    {
	    case GOMP_TARGET_ARG_NUM_TEAMS:
	      gcn_teams = limit_teams (val, agent);
	      break;
	    case GOMP_TARGET_ARG_THREAD_LIMIT:
	      gcn_threads = limit_worker_threads (val);
	      break;
	    default:
	      ;
	    }
	}
    }

  if (gcn_dims_found)
    {
      bool gfx900_workaround_p = false;

      if (agent->device_isa == EF_AMDGPU_MACH_AMDGCN_GFX900
	  && gcn_threads == 0 && override_z_dim == 0)
	{
	  gfx900_workaround_p = true;
	  GCN_WARNING ("VEGA BUG WORKAROUND: reducing default number of "
		       "threads to at most 4 per team.\n");
	  GCN_WARNING (" - If this is not a Vega 10 device, please use "
		       "GCN_NUM_THREADS=16\n");
	}

      /* Ideally, when a dimension isn't explicitly specified, we should
	 tune it to run 40 (or 32?) threads per CU with no threads getting
	 queued.  In practice, we tune for peak performance on BabelStream,
	 which for OpenACC is currently 32 threads per CU.  */
      def->ndim = 3;
      if (gcn_teams <= 0 && gcn_threads <= 0)
	{
	  /* Set up a reasonable number of teams and threads.  */
	  gcn_threads = gfx900_workaround_p ? 4 : 16; // 8;
	  def->gdims[0] = get_cu_count (agent); // * (40 / gcn_threads);
	  def->gdims[2] = gcn_threads;
	}
      else if (gcn_teams <= 0 && gcn_threads > 0)
	{
	  /* Auto-scale the number of teams with the number of threads.  */
	  def->gdims[0] = get_cu_count (agent); // * (40 / gcn_threads);
	  def->gdims[2] = gcn_threads;
	}
      else if (gcn_teams > 0 && gcn_threads <= 0)
	{
	  int max_threads = gfx900_workaround_p ? 4 : 16;

	  /* Auto-scale the number of threads with the number of teams.  */
	  def->gdims[0] = gcn_teams;
	  def->gdims[2] = 16; // get_cu_count (agent) * 40 / gcn_teams;
	  if (def->gdims[2] == 0)
	    def->gdims[2] = 1;
	  else if (def->gdims[2] > max_threads)
	    def->gdims[2] = max_threads;
	}
      else
	{
	  def->gdims[0] = gcn_teams;
	  def->gdims[2] = gcn_threads;
	}
      def->gdims[1] = 64; /* Each thread is 64 work items wide.  */
      def->wdims[0] = 1;  /* Single team per work-group.  */
      def->wdims[1] = 64;
      def->wdims[2] = 16;
      *result = def;
      return true;
    }
  else if (!grid_attrs_found)
    {
      def->ndim = 1;
      def->gdims[0] = 1;
      def->gdims[1] = 1;
      def->gdims[2] = 1;
      def->wdims[0] = 1;
      def->wdims[1] = 1;
      def->wdims[2] = 1;
      *result = def;
      GCN_WARNING ("GOMP_OFFLOAD_run called with no launch attributes\n");
      return true;
    }

  struct GOMP_kernel_launch_attributes *kla;
  kla = (struct GOMP_kernel_launch_attributes *) *input;
  *result = kla;
  if (kla->ndim == 0 || kla->ndim > 3)
    GOMP_PLUGIN_fatal ("Invalid number of dimensions (%u)", kla->ndim);

  GCN_DEBUG ("GOMP_OFFLOAD_run called with %u dimensions:\n", kla->ndim);
  unsigned i;
  for (i = 0; i < kla->ndim; i++)
    {
      GCN_DEBUG (" Dimension %u: grid size %u and group size %u\n", i,
		 kla->gdims[i], kla->wdims[i]);
      if (kla->gdims[i] == 0)
	return false;
    }
  return true;
}
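
/* Worked example of the defaults above (illustrative only): on a
   hypothetical device with 60 CUs, with no GCN_NUM_TEAMS/GCN_NUM_THREADS
   overrides and no gfx900 workaround, the OpenMP path above produces

     gdims = {60, 64, 16}   i.e. 60 teams, each 16 threads of 64 lanes,
     wdims = {1, 64, 16}    i.e. one team per work-group,

   for a total grid of 60 * 64 * 16 work-items.  */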

/* Return the group size given the requested GROUP size, GRID size and number
   of grid dimensions NDIM.  */

static uint32_t
get_group_size (uint32_t ndim, uint32_t grid, uint32_t group)
{
  if (group == 0)
    {
      /* TODO: Provide a default via environment or device characteristics.  */
      if (ndim == 1)
	group = 64;
      else if (ndim == 2)
	group = 8;
      else
	group = 4;
    }

  if (group > grid)
    group = grid;
  return group;
}
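
/* For instance, get_group_size (1, 100, 0) yields 64 (the 1-dim default),
   while get_group_size (1, 40, 0) yields 40, since the group size is
   clamped to the grid size.  */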

/* Atomically store pair of uint16_t values (HEADER and REST) to a PACKET.  */

static void
packet_store_release (uint32_t* packet, uint16_t header, uint16_t rest)
{
  __atomic_store_n (packet, header | (rest << 16), __ATOMIC_RELEASE);
}
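
/* A sketch of the intended use (based on the HSA AQL packet format, where
   the 16-bit header and 16-bit setup fields share the packet's first
   32-bit word): writing both halves with a single release store publishes
   the packet to the GPU only after the rest of it has been written, e.g.

     packet_store_release ((uint32_t *) aql_packet, header, setup);

   where aql_packet, header and setup stand in for the caller's
   dispatch-packet fields.  */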

/* A never-called callback for the HSA command queues.  These signal events
   that we don't use, so we trigger an error.

   This "queue" is not to be confused with the async queues, below.  */

static void
hsa_queue_callback (hsa_status_t status,
		    hsa_queue_t *queue __attribute__ ((unused)),
		    void *data __attribute__ ((unused)))
{
  hsa_fatal ("Asynchronous queue error", status);
}

/* }}}  */
/* {{{ HSA initialization  */

/* Populate hsa_fns with the function addresses from libhsa-runtime64.so.  */

static bool
init_hsa_runtime_functions (void)
{
#define DLSYM_FN(function) \
  hsa_fns.function##_fn = dlsym (handle, #function); \
  if (hsa_fns.function##_fn == NULL) \
    return false;
#define DLSYM_OPT_FN(function) \
  hsa_fns.function##_fn = dlsym (handle, #function);
  void *handle = dlopen (hsa_runtime_lib, RTLD_LAZY);
  if (handle == NULL)
    return false;

  DLSYM_FN (hsa_status_string)
  DLSYM_FN (hsa_system_get_info)
  DLSYM_FN (hsa_agent_get_info)
  DLSYM_FN (hsa_init)
  DLSYM_FN (hsa_iterate_agents)
  DLSYM_FN (hsa_region_get_info)
  DLSYM_FN (hsa_queue_create)
  DLSYM_FN (hsa_agent_iterate_regions)
  DLSYM_FN (hsa_executable_destroy)
  DLSYM_FN (hsa_executable_create)
  DLSYM_FN (hsa_executable_global_variable_define)
  DLSYM_FN (hsa_executable_load_code_object)
  DLSYM_FN (hsa_executable_freeze)
  DLSYM_FN (hsa_signal_create)
  DLSYM_FN (hsa_memory_allocate)
  DLSYM_FN (hsa_memory_assign_agent)
  DLSYM_FN (hsa_memory_copy)
  DLSYM_FN (hsa_memory_free)
  DLSYM_FN (hsa_signal_destroy)
  DLSYM_FN (hsa_executable_get_symbol)
  DLSYM_FN (hsa_executable_symbol_get_info)
  DLSYM_FN (hsa_executable_iterate_symbols)
  DLSYM_FN (hsa_queue_add_write_index_release)
  DLSYM_FN (hsa_queue_load_read_index_acquire)
  DLSYM_FN (hsa_signal_wait_acquire)
  DLSYM_FN (hsa_signal_store_relaxed)
  DLSYM_FN (hsa_signal_store_release)
  DLSYM_FN (hsa_signal_load_acquire)
  DLSYM_FN (hsa_queue_destroy)
  DLSYM_FN (hsa_code_object_deserialize)
  DLSYM_OPT_FN (hsa_amd_memory_lock)
  DLSYM_OPT_FN (hsa_amd_memory_unlock)
  DLSYM_OPT_FN (hsa_amd_memory_async_copy_rect)
  return true;
#undef DLSYM_OPT_FN
#undef DLSYM_FN
}

/* Return true if the agent is a GPU and can accept concurrent submissions
   from different threads.  */

static bool
suitable_hsa_agent_p (hsa_agent_t agent)
{
  hsa_device_type_t device_type;
  hsa_status_t status
    = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
				     &device_type);
  if (status != HSA_STATUS_SUCCESS)
    return false;

  switch (device_type)
    {
    case HSA_DEVICE_TYPE_GPU:
      break;
    case HSA_DEVICE_TYPE_CPU:
      if (!support_cpu_devices)
	return false;
      break;
    default:
      return false;
    }

  uint32_t features = 0;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_FEATURE,
					  &features);
  if (status != HSA_STATUS_SUCCESS
      || !(features & HSA_AGENT_FEATURE_KERNEL_DISPATCH))
    return false;
  hsa_queue_type_t queue_type;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_QUEUE_TYPE,
					  &queue_type);
  if (status != HSA_STATUS_SUCCESS
      || (queue_type != HSA_QUEUE_TYPE_MULTI))
    return false;

  return true;
}

/* Callback of hsa_iterate_agents; if AGENT is a GPU device, increment
   agent_count in hsa_context.  */

static hsa_status_t
count_gpu_agents (hsa_agent_t agent, void *data __attribute__ ((unused)))
{
  if (suitable_hsa_agent_p (agent))
    hsa_context.agent_count++;
  return HSA_STATUS_SUCCESS;
}

/* Callback of hsa_iterate_agents; if AGENT is a GPU device, assign the agent
   id to the describing structure in the hsa context.  The index of the
   structure is pointed to by DATA; increment it afterwards.  */

static hsa_status_t
assign_agent_ids (hsa_agent_t agent, void *data)
{
  if (suitable_hsa_agent_p (agent))
    {
      int *agent_index = (int *) data;
      hsa_context.agents[*agent_index].id = agent;
      ++*agent_index;
    }
  return HSA_STATUS_SUCCESS;
}

/* Initialize hsa_context if it has not already been done.
   Return TRUE on success.  */

static bool
init_hsa_context (void)
{
  hsa_status_t status;
  int agent_index = 0;

  if (hsa_context.initialized)
    return true;
  init_environment_variables ();
  if (!init_hsa_runtime_functions ())
    {
      GCN_WARNING ("Run-time could not be dynamically opened\n");
      if (suppress_host_fallback)
	GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
      return false;
    }
  status = hsa_fns.hsa_init_fn ();
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Run-time could not be initialized", status);
  GCN_DEBUG ("HSA run-time initialized for GCN\n");

  if (debug)
    dump_hsa_system_info ();

  status = hsa_fns.hsa_iterate_agents_fn (count_gpu_agents, NULL);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("GCN GPU devices could not be enumerated", status);
  GCN_DEBUG ("There are %i GCN GPU devices.\n", hsa_context.agent_count);

  hsa_context.agents
    = GOMP_PLUGIN_malloc_cleared (hsa_context.agent_count
				  * sizeof (struct agent_info));
  status = hsa_fns.hsa_iterate_agents_fn (assign_agent_ids, &agent_index);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Scanning compute agents failed", status);
  if (agent_index != hsa_context.agent_count)
    {
      GOMP_PLUGIN_error ("Failed to assign IDs to all GCN agents");
      return false;
    }

  if (debug)
    {
      status = hsa_fns.hsa_iterate_agents_fn (dump_hsa_agent_info, NULL);
      if (status != HSA_STATUS_SUCCESS)
	GOMP_PLUGIN_error ("Failed to list all HSA runtime agents");
    }

  uint16_t minor, major;
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MINOR,
					   &minor);
  if (status != HSA_STATUS_SUCCESS)
    GOMP_PLUGIN_error ("Failed to obtain HSA runtime minor version");
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MAJOR,
					   &major);
  if (status != HSA_STATUS_SUCCESS)
    GOMP_PLUGIN_error ("Failed to obtain HSA runtime major version");

  size_t len = sizeof hsa_context.driver_version_s;
  int printed = snprintf (hsa_context.driver_version_s, len,
			  "HSA Runtime %hu.%hu", (unsigned short int)major,
			  (unsigned short int)minor);
  if (printed >= len)
    GCN_WARNING ("HSA runtime version string was truncated. "
		 "Version %hu.%hu is too long.", (unsigned short int)major,
		 (unsigned short int)minor);

  hsa_context.initialized = true;
  return true;
}

/* Verify that hsa_context has already been initialized and return the
   agent_info structure describing device number N.  Return NULL on error.  */

static struct agent_info *
get_agent_info (int n)
{
  if (!hsa_context.initialized)
    {
      GOMP_PLUGIN_error ("Attempt to use uninitialized GCN context.");
      return NULL;
    }
  if (n >= hsa_context.agent_count)
    {
      GOMP_PLUGIN_error ("Request to operate on non-existent GCN device %i", n);
      return NULL;
    }
  if (!hsa_context.agents[n].initialized)
    {
      GOMP_PLUGIN_error ("Attempt to use an uninitialized GCN agent.");
      return NULL;
    }
  return &hsa_context.agents[n];
}

/* Callback of hsa_agent_iterate_regions, via get_*_memory_region functions.

   Selects (breaks at) a suitable region of type KIND.  */

static hsa_status_t
get_memory_region (hsa_region_t region, hsa_region_t *retval,
		   hsa_region_global_flag_t kind)
{
  hsa_status_t status;
  hsa_region_segment_t segment;

  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
					   &segment);
  if (status != HSA_STATUS_SUCCESS)
    return status;
  if (segment != HSA_REGION_SEGMENT_GLOBAL)
    return HSA_STATUS_SUCCESS;

  uint32_t flags;
  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
					   &flags);
  if (status != HSA_STATUS_SUCCESS)
    return status;
  if (flags & kind)
    {
      *retval = region;
      return HSA_STATUS_INFO_BREAK;
    }
  return HSA_STATUS_SUCCESS;
}

/* Callback of hsa_agent_iterate_regions.

   Selects a kernargs memory region.  */

static hsa_status_t
get_kernarg_memory_region (hsa_region_t region, void *data)
{
  return get_memory_region (region, (hsa_region_t *)data,
			    HSA_REGION_GLOBAL_FLAG_KERNARG);
}

/* Callback of hsa_agent_iterate_regions.

   Selects a coarse-grained memory region suitable for the heap and
   offload data.  */

static hsa_status_t
get_data_memory_region (hsa_region_t region, void *data)
{
  return get_memory_region (region, (hsa_region_t *)data,
			    HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED);
}
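
/* A sketch of how these callbacks are expected to be driven (the actual
   agent-initialization code lies outside this excerpt):

     hsa_region_t kernarg_region;
     hsa_status_t status
       = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
					       get_kernarg_memory_region,
					       &kernarg_region);

   Iteration stops with HSA_STATUS_INFO_BREAK once a matching region has
   been stored through the DATA pointer; any other status means no suitable
   region was found.  */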

/* Return the ISA field of the ELF header of a GCN code object IMAGE.  */

static int
elf_gcn_isa_field (Elf64_Ehdr *image)
{
  return image->e_flags & EF_AMDGPU_MACH_MASK;
}

const static char *gcn_gfx803_s = "gfx803";
const static char *gcn_gfx900_s = "gfx900";
const static char *gcn_gfx906_s = "gfx906";
const static char *gcn_gfx908_s = "gfx908";
const static char *gcn_gfx90a_s = "gfx90a";
const static char *gcn_gfx1030_s = "gfx1030";
const static char *gcn_gfx1100_s = "gfx1100";
const static int gcn_isa_name_len = 7;

/* Returns the name that the HSA runtime uses for the ISA or NULL if we do not
   support the ISA.  */

static const char*
isa_hsa_name (int isa)
{
  switch (isa)
    {
    case EF_AMDGPU_MACH_AMDGCN_GFX803:
      return gcn_gfx803_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX900:
      return gcn_gfx900_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX906:
      return gcn_gfx906_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX908:
      return gcn_gfx908_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX90a:
      return gcn_gfx90a_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX1030:
      return gcn_gfx1030_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX1100:
      return gcn_gfx1100_s;
    }
  return NULL;
}

/* Returns the user-facing name that GCC uses to identify the architecture
   (e.g. with -march) or NULL if we do not support the ISA.
   Keep in sync with /gcc/config/gcn/gcn.{c,opt}.  */

static const char*
isa_gcc_name (int isa)
{
  switch (isa)
    {
    case EF_AMDGPU_MACH_AMDGCN_GFX803:
      return "fiji";
    default:
      return isa_hsa_name (isa);
    }
}

/* Returns the code which is used in the GCN object code to identify the ISA
   with the given name (as used by the HSA runtime).  */

static gcn_isa
isa_code (const char *isa)
{
  if (!strncmp (isa, gcn_gfx803_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX803;

  if (!strncmp (isa, gcn_gfx900_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX900;

  if (!strncmp (isa, gcn_gfx906_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX906;

  if (!strncmp (isa, gcn_gfx908_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX908;

  if (!strncmp (isa, gcn_gfx90a_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX90a;

  if (!strncmp (isa, gcn_gfx1030_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX1030;

  if (!strncmp (isa, gcn_gfx1100_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX1100;

  return EF_AMDGPU_MACH_UNSUPPORTED;
}
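
/* For example, isa_code ("gfx90a") returns EF_AMDGPU_MACH_AMDGCN_GFX90a,
   matching the e_flags value that elf_gcn_isa_field extracts from a gfx90a
   code object, while an unrecognized string such as "gfx999" yields
   EF_AMDGPU_MACH_UNSUPPORTED.  */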

/* CDNA2 devices have twice as many VGPRs compared to older devices.  */

static int
max_isa_vgprs (int isa)
{
  switch (isa)
    {
    case EF_AMDGPU_MACH_AMDGCN_GFX803:
    case EF_AMDGPU_MACH_AMDGCN_GFX900:
    case EF_AMDGPU_MACH_AMDGCN_GFX906:
    case EF_AMDGPU_MACH_AMDGCN_GFX908:
      return 256;
    case EF_AMDGPU_MACH_AMDGCN_GFX90a:
      return 512;
    case EF_AMDGPU_MACH_AMDGCN_GFX1030:
      return 512;  /* 512 SIMD32 = 256 wavefrontsize64.  */
    case EF_AMDGPU_MACH_AMDGCN_GFX1100:
      return 1536; /* 1536 SIMD32 = 768 wavefrontsize64.  */
    }
  GOMP_PLUGIN_fatal ("unhandled ISA in max_isa_vgprs");
}
AS
1756/* }}} */
1757/* {{{ Run */
1758
f6fff8a6 1759/* Create or reuse a team arena and stack space.
237957cc
AS
1760
1761 Team arenas are used by OpenMP to avoid calling malloc multiple times
1762 while setting up each team. This is purely a performance optimization.
1763
f6fff8a6
AS
1764 The stack space is used by all kernels. We must allocate it in such a
1765 way that the reverse offload implmentation can access the data.
237957cc 1766
f6fff8a6
AS
1767 Allocating this memory costs performance, so this function will reuse an
1768 existing allocation if a large enough one is idle.
1769 The memory lock is released, but not deallocated, when the kernel exits. */
1770
1771static void
1772configure_ephemeral_memories (struct kernel_info *kernel,
1773 struct kernargs_abi *kernargs, int num_teams,
1774 int num_threads)
237957cc 1775{
f6fff8a6
AS
1776 struct agent_info *agent = kernel->agent;
1777 struct ephemeral_memories_list **next_ptr = &agent->ephemeral_memories_list;
1778 struct ephemeral_memories_list *item;
1779
1780 int actual_arena_size = (kernel->kind == KIND_OPENMP
1781 ? team_arena_size : 0);
1782 int actual_arena_total_size = actual_arena_size * num_teams;
1783 size_t size = (actual_arena_total_size
1784 + num_teams * num_threads * stack_size);
237957cc
AS
1785
1786 for (item = *next_ptr; item; next_ptr = &item->next, item = item->next)
1787 {
f6fff8a6 1788 if (item->size < size)
237957cc
AS
1789 continue;
1790
f6fff8a6
AS
1791 if (pthread_mutex_trylock (&item->in_use) == 0)
1792 break;
237957cc
AS
1793 }
1794
f6fff8a6 1795 if (!item)
237957cc 1796 {
f6fff8a6
AS
1797 GCN_DEBUG ("Creating a new %sstack for %d teams with %d threads"
1798 " (%zd bytes)\n", (actual_arena_size ? "arena and " : ""),
1799 num_teams, num_threads, size);
237957cc 1800
f6fff8a6
AS
1801 if (pthread_mutex_lock (&agent->ephemeral_memories_write_lock))
1802 {
1803 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
1804 return;
1805 }
1806 item = malloc (sizeof (*item));
1807 item->size = size;
1808 item->next = NULL;
1809 *next_ptr = item;
237957cc 1810
f6fff8a6
AS
1811 if (pthread_mutex_init (&item->in_use, NULL))
1812 {
1813 GOMP_PLUGIN_error ("Failed to initialize a GCN memory write mutex");
1814 return;
1815 }
1816 if (pthread_mutex_lock (&item->in_use))
1817 {
1818 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
1819 return;
1820 }
1821 if (pthread_mutex_unlock (&agent->ephemeral_memories_write_lock))
1822 {
1823 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1824 return;
1825 }
1826
1827 hsa_status_t status;
1828 status = hsa_fns.hsa_memory_allocate_fn (agent->data_region, size,
1829 &item->address);
1830 if (status != HSA_STATUS_SUCCESS)
1831 hsa_fatal ("Could not allocate memory for GCN kernel arena", status);
1832 status = hsa_fns.hsa_memory_assign_agent_fn (item->address, agent->id,
1833 HSA_ACCESS_PERMISSION_RW);
1834 if (status != HSA_STATUS_SUCCESS)
1835 hsa_fatal ("Could not assign arena & stack memory to device", status);
1836 }
237957cc 1837
f6fff8a6
AS
1838 kernargs->arena_ptr = (actual_arena_total_size
1839 ? (uint64_t)item->address
1840 : 0);
1841 kernargs->stack_ptr = (uint64_t)item->address + actual_arena_total_size;
1842 kernargs->arena_size_per_team = actual_arena_size;
1843 kernargs->stack_size_per_thread = stack_size;
237957cc
AS
1844}
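
/* For illustration only: with hypothetical values num_teams = 16,
   num_threads = 64, team_arena_size = 64 KiB and stack_size = 32 KiB, an
   OpenMP kernel would request

     size = 16 * 65536 + 16 * 64 * 32768 bytes = 1 MiB + 32 MiB,

   and kernargs->stack_ptr would point 1 MiB past the start of the
   allocation, immediately after the per-team arenas.  */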

/* Mark an ephemeral memory space available for reuse.  */

static void
release_ephemeral_memories (struct agent_info *agent, void *address)
{
  struct ephemeral_memories_list *item;

  for (item = agent->ephemeral_memories_list; item; item = item->next)
    {
      if (item->address == address)
        {
          if (pthread_mutex_unlock (&item->in_use))
            GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
          return;
        }
    }
  GOMP_PLUGIN_error ("Could not find a GCN arena to release.");
}

/* Clean up all the allocated ephemeral memory spaces.  */

static bool
destroy_ephemeral_memories (struct agent_info *agent)
{
  struct ephemeral_memories_list *item, *next;

  for (item = agent->ephemeral_memories_list; item; item = next)
    {
      next = item->next;
      hsa_fns.hsa_memory_free_fn (item->address);
      if (pthread_mutex_destroy (&item->in_use))
        {
          GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex");
          return false;
        }
      free (item);
    }
  agent->ephemeral_memories_list = NULL;

  return true;
}

/* Allocate memory on a specified device.  */

static void *
alloc_by_agent (struct agent_info *agent, size_t size)
{
  GCN_DEBUG ("Allocating %zu bytes on device %d\n", size, agent->device_id);

  void *ptr;
  hsa_status_t status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
                                                        size, &ptr);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not allocate device memory", status);
      return NULL;
    }

  status = hsa_fns.hsa_memory_assign_agent_fn (ptr, agent->id,
                                               HSA_ACCESS_PERMISSION_RW);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not assign data memory to device", status);
      return NULL;
    }

  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  bool profiling_dispatch_p
    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
  if (profiling_dispatch_p)
    {
      acc_prof_info *prof_info = thr->prof_info;
      acc_event_info data_event_info;
      acc_api_info *api_info = thr->api_info;

      prof_info->event_type = acc_ev_alloc;

      data_event_info.data_event.event_type = prof_info->event_type;
      data_event_info.data_event.valid_bytes
        = _ACC_DATA_EVENT_INFO_VALID_BYTES;
      data_event_info.data_event.parent_construct
        = acc_construct_parallel;
      data_event_info.data_event.implicit = 1;
      data_event_info.data_event.tool_info = NULL;
      data_event_info.data_event.var_name = NULL;
      data_event_info.data_event.bytes = size;
      data_event_info.data_event.host_ptr = NULL;
      data_event_info.data_event.device_ptr = (void *) ptr;

      api_info->device_api = acc_device_api_other;

      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
                                            api_info);
    }

  return ptr;
}

/* Create kernel dispatch data structure for given KERNEL, along with
   the necessary device signals and memory allocations.  */

static struct kernel_dispatch *
create_kernel_dispatch (struct kernel_info *kernel, int num_teams,
                        int num_threads)
{
  struct agent_info *agent = kernel->agent;
  struct kernel_dispatch *shadow
    = GOMP_PLUGIN_malloc_cleared (sizeof (struct kernel_dispatch));

  shadow->agent = kernel->agent;
  shadow->object = kernel->object;

  hsa_signal_t sync_signal;
  hsa_status_t status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &sync_signal);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Error creating the GCN sync signal", status);

  shadow->signal = sync_signal.handle;
  shadow->private_segment_size = kernel->private_segment_size;

  if (lowlat_size < 0)
    {
      /* Divide the LDS between the number of running teams.
         Allocate not less than is defined in the kernel metadata.  */
      int teams_per_cu = num_teams / get_cu_count (agent);
      int LDS_per_team = (teams_per_cu ? 65536 / teams_per_cu : 65536);
      shadow->group_segment_size
        = (kernel->group_segment_size > LDS_per_team
           ? kernel->group_segment_size
           : LDS_per_team);
    }
  else if (lowlat_size < GCN_LOWLAT_HEAP+8)
    /* Ensure that there's space for the OpenMP libgomp data.  */
    shadow->group_segment_size = GCN_LOWLAT_HEAP+8;
  else
    shadow->group_segment_size = (lowlat_size > 65536
                                  ? 65536
                                  : lowlat_size);
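
  /* For illustration only: on a hypothetical 40-CU device running 120 teams
     with lowlat_size unset (-1), teams_per_cu is 120 / 40 = 3, so each team
     may use up to 65536 / 3 = 21845 bytes of LDS, unless the kernel
     metadata already demands more than that.  */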

  /* We expect kernels to request a single pointer, explicitly, and the
     rest of struct kernargs, implicitly.  If they request anything else
     then something is wrong.  */
  if (kernel->kernarg_segment_size > 8)
    {
      GOMP_PLUGIN_fatal ("Unexpectedly large kernargs segment requested");
      return NULL;
    }

  status = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region,
                                           sizeof (struct kernargs),
                                           &shadow->kernarg_address);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not allocate memory for GCN kernel arguments", status);
  struct kernargs *kernargs = shadow->kernarg_address;

  /* Zero-initialize the output_data (minimum needed).  */
  kernargs->abi.out_ptr = (int64_t) &kernargs->output_data;
  kernargs->output_data.next_output = 0;
  for (unsigned i = 0;
       i < (sizeof (kernargs->output_data.queue)
            / sizeof (kernargs->output_data.queue[0]));
       i++)
    kernargs->output_data.queue[i].written = 0;
  kernargs->output_data.consumed = 0;

  /* Pass in the heap location.  */
  kernargs->abi.heap_ptr = (int64_t) kernel->module->heap;

  /* Create the ephemeral memory spaces.  */
  configure_ephemeral_memories (kernel, &kernargs->abi, num_teams, num_threads);

  /* Ensure we can recognize unset return values.  */
  kernargs->output_data.return_value = 0xcafe0000;

  return shadow;
}

/* Handle a reverse-offload request from the device: forward the
   target-region descriptor to libgomp on the host.  */

static void
process_reverse_offload (uint64_t fn, uint64_t mapnum, uint64_t hostaddrs,
                         uint64_t sizes, uint64_t kinds, uint64_t dev_num64)
{
  int dev_num = dev_num64;
  GOMP_PLUGIN_target_rev (fn, mapnum, hostaddrs, sizes, kinds, dev_num,
                          NULL);
}

/* Output any data written to console output from the kernel.  It is expected
   that this function is polled during kernel execution.

   We print all entries from the last item printed to the next entry without
   a "written" flag.  If the "final" flag is set then it'll continue right to
   the end.

   The print buffer is circular, but the from and to locations don't wrap when
   the buffer does, so the output limit is UINT_MAX.  The target blocks on
   output when the buffer is full.  */

static void
console_output (struct kernel_info *kernel, struct kernargs *kernargs,
                bool final)
{
  unsigned int limit = (sizeof (kernargs->output_data.queue)
                        / sizeof (kernargs->output_data.queue[0]));

  unsigned int from = __atomic_load_n (&kernargs->output_data.consumed,
                                       __ATOMIC_ACQUIRE);
  unsigned int to = kernargs->output_data.next_output;

  if (from > to)
    {
      /* Overflow.  */
      if (final)
        printf ("GCN print buffer overflowed.\n");
      return;
    }

  unsigned int i;
  for (i = from; i < to; i++)
    {
      struct printf_data *data = &kernargs->output_data.queue[i % limit];

      if (!data->written && !final)
        break;

      switch (data->type)
        {
        case 0: printf ("%.128s%ld\n", data->msg, data->ivalue); break;
        case 1: printf ("%.128s%f\n", data->msg, data->dvalue); break;
        case 2: printf ("%.128s%.128s\n", data->msg, data->text); break;
        case 3: printf ("%.128s%.128s", data->msg, data->text); break;
        case 4:
          process_reverse_offload (data->value_u64[0], data->value_u64[1],
                                   data->value_u64[2], data->value_u64[3],
                                   data->value_u64[4], data->value_u64[5]);
          break;
        default: printf ("GCN print buffer error!\n"); break;
        }
      data->written = 0;
      __atomic_store_n (&kernargs->output_data.consumed, i+1,
                        __ATOMIC_RELEASE);
    }
  fflush (stdout);
}
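
/* For illustration only: the device side of this protocol lives in libgomp's
   GCN target code; sketched here under assumed field names, a writer claims
   a slot by atomically incrementing next_output, fills in the entry, and
   sets "written" last so the loop above never reads a half-filled entry:

     unsigned slot = __atomic_fetch_add (&out->next_output, 1,
                                         __ATOMIC_ACQUIRE);
     while (slot - __atomic_load_n (&out->consumed, __ATOMIC_ACQUIRE)
            >= limit)
       ;  // Buffer full: block until the host consumes an entry.
     struct printf_data *entry = &out->queue[slot % limit];
     entry->type = 0;
     // ... fill in msg and ivalue ...
     __atomic_store_n (&entry->written, 1, __ATOMIC_RELEASE);  */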

/* Release data structure created for a kernel dispatch in SHADOW argument,
   and clean up the signal and memory allocations.  */

static void
release_kernel_dispatch (struct kernel_dispatch *shadow)
{
  GCN_DEBUG ("Released kernel dispatch: %p\n", shadow);

  struct kernargs *kernargs = shadow->kernarg_address;
  void *addr = (void *) kernargs->abi.arena_ptr;
  if (!addr)
    addr = (void *) kernargs->abi.stack_ptr;
  release_ephemeral_memories (shadow->agent, addr);

  hsa_fns.hsa_memory_free_fn (shadow->kernarg_address);

  hsa_signal_t s;
  s.handle = shadow->signal;
  hsa_fns.hsa_signal_destroy_fn (s);

  free (shadow);
}

/* Extract the properties from a kernel binary.  */

static void
init_kernel_properties (struct kernel_info *kernel)
{
  hsa_status_t status;
  struct agent_info *agent = kernel->agent;
  hsa_executable_symbol_t kernel_symbol;
  char *buf = alloca (strlen (kernel->name) + 4);
  sprintf (buf, "%s.kd", kernel->name);
  status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
                                                 buf, agent->id,
                                                 0, &kernel_symbol);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_warn ("Could not find symbol for kernel in the code object", status);
      fprintf (stderr, "not found name: '%s'\n", buf);
      dump_executable_symbols (agent->executable);
      goto failure;
    }
  GCN_DEBUG ("Located kernel %s\n", kernel->name);
  status = hsa_fns.hsa_executable_symbol_get_info_fn
    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel->object);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not extract a kernel object from its symbol", status);
  status = hsa_fns.hsa_executable_symbol_get_info_fn
    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
     &kernel->kernarg_segment_size);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not get info about kernel argument size", status);
  status = hsa_fns.hsa_executable_symbol_get_info_fn
    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
     &kernel->group_segment_size);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not get info about kernel group segment size", status);
  status = hsa_fns.hsa_executable_symbol_get_info_fn
    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
     &kernel->private_segment_size);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not get info about kernel private segment size",
               status);

  /* The kernel type is not known until something tries to launch it.  */
  kernel->kind = KIND_UNKNOWN;

  GCN_DEBUG ("Kernel structure for %s fully initialized with "
             "following segment sizes: \n", kernel->name);
  GCN_DEBUG ("  group_segment_size: %u\n",
             (unsigned) kernel->group_segment_size);
  GCN_DEBUG ("  private_segment_size: %u\n",
             (unsigned) kernel->private_segment_size);
  GCN_DEBUG ("  kernarg_segment_size: %u\n",
             (unsigned) kernel->kernarg_segment_size);
  return;

failure:
  kernel->initialization_failed = true;
}

/* Do all the work that is necessary before running KERNEL for the first time.
   The function assumes the program has been created, finalized and frozen by
   create_and_finalize_hsa_program.  */

static void
init_kernel (struct kernel_info *kernel)
{
  if (pthread_mutex_lock (&kernel->init_mutex))
    GOMP_PLUGIN_fatal ("Could not lock a GCN kernel initialization mutex");
  if (kernel->initialized)
    {
      if (pthread_mutex_unlock (&kernel->init_mutex))
        GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
                           "mutex");

      return;
    }

  init_kernel_properties (kernel);

  if (!kernel->initialization_failed)
    {
      GCN_DEBUG ("\n");

      kernel->initialized = true;
    }
  if (pthread_mutex_unlock (&kernel->init_mutex))
    GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
                       "mutex");
}

/* Run KERNEL on its agent, pass VARS to it as arguments and take
   launch attributes from KLA.

   MODULE_LOCKED indicates that the caller already holds the lock and
   run_kernel need not lock it again.
   If AQ is NULL then agent->sync_queue will be used.  */

static void
run_kernel (struct kernel_info *kernel, void *vars,
            struct GOMP_kernel_launch_attributes *kla,
            struct goacc_asyncqueue *aq, bool module_locked)
{
  struct agent_info *agent = kernel->agent;
  GCN_DEBUG ("SGPRs: %d, VGPRs: %d\n", kernel->description->sgpr_count,
             kernel->description->vpgr_count);

  /* Reduce the number of threads/workers if there are insufficient
     VGPRs available to run the kernels together.  */
  if (kla->ndim == 3 && kernel->description->vpgr_count > 0)
    {
      int max_vgprs = max_isa_vgprs (agent->device_isa);
      int granulated_vgprs = (kernel->description->vpgr_count + 3) & ~3;
      int max_threads = (max_vgprs / granulated_vgprs) * 4;
      if (kla->gdims[2] > max_threads)
        {
          GCN_WARNING ("Too many VGPRs required to support %d threads/workers"
                       " per team/gang - reducing to %d threads/workers.\n",
                       kla->gdims[2], max_threads);
          kla->gdims[2] = max_threads;
        }
    }
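
  /* For illustration only: with a hypothetical kernel needing 70 VGPRs on a
     gfx908 device (256 VGPRs), the count is rounded up to the allocation
     granule, (70 + 3) & ~3 = 72, giving (256 / 72) * 4 = 12 as the maximum
     number of threads/workers per team/gang.  */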

  GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel->agent->device_id,
             (aq ? aq->id : 0));
  GCN_DEBUG ("GCN launch attribs: gdims:[");
  int i;
  for (i = 0; i < kla->ndim; ++i)
    {
      if (i)
        DEBUG_PRINT (", ");
      DEBUG_PRINT ("%u", kla->gdims[i]);
    }
  DEBUG_PRINT ("], normalized gdims:[");
  for (i = 0; i < kla->ndim; ++i)
    {
      if (i)
        DEBUG_PRINT (", ");
      DEBUG_PRINT ("%u", kla->gdims[i] / kla->wdims[i]);
    }
  DEBUG_PRINT ("], wdims:[");
  for (i = 0; i < kla->ndim; ++i)
    {
      if (i)
        DEBUG_PRINT (", ");
      DEBUG_PRINT ("%u", kla->wdims[i]);
    }
  DEBUG_PRINT ("]\n");
  DEBUG_FLUSH ();

  if (!module_locked && pthread_rwlock_rdlock (&agent->module_rwlock))
    GOMP_PLUGIN_fatal ("Unable to read-lock a GCN agent rwlock");

  if (!agent->initialized)
    GOMP_PLUGIN_fatal ("Agent must be initialized");

  if (!kernel->initialized)
    GOMP_PLUGIN_fatal ("Called kernel must be initialized");

  hsa_queue_t *command_q = (aq ? aq->hsa_queue : kernel->agent->sync_queue);

  uint64_t index
    = hsa_fns.hsa_queue_add_write_index_release_fn (command_q, 1);
  GCN_DEBUG ("Got AQL index %llu\n", (long long int) index);

  /* Wait until the queue is not full before writing the packet.  */
  while (index - hsa_fns.hsa_queue_load_read_index_acquire_fn (command_q)
         >= command_q->size)
    ;

  /* Do not allow the dimensions to be overridden when running
     constructors or destructors.  */
  int override_x = kernel->kind == KIND_UNKNOWN ? 0 : override_x_dim;
  int override_z = kernel->kind == KIND_UNKNOWN ? 0 : override_z_dim;

  hsa_kernel_dispatch_packet_t *packet;
  packet = ((hsa_kernel_dispatch_packet_t *) command_q->base_address)
           + index % command_q->size;

  memset (((uint8_t *) packet) + 4, 0, sizeof (*packet) - 4);
  packet->grid_size_x = override_x ? : kla->gdims[0];
  packet->workgroup_size_x = get_group_size (kla->ndim,
                                             packet->grid_size_x,
                                             kla->wdims[0]);

  if (kla->ndim >= 2)
    {
      packet->grid_size_y = kla->gdims[1];
      packet->workgroup_size_y = get_group_size (kla->ndim, kla->gdims[1],
                                                 kla->wdims[1]);
    }
  else
    {
      packet->grid_size_y = 1;
      packet->workgroup_size_y = 1;
    }

  if (kla->ndim == 3)
    {
      packet->grid_size_z = limit_worker_threads (override_z
                                                  ? : kla->gdims[2]);
      packet->workgroup_size_z = get_group_size (kla->ndim,
                                                 packet->grid_size_z,
                                                 kla->wdims[2]);
    }
  else
    {
      packet->grid_size_z = 1;
      packet->workgroup_size_z = 1;
    }

  GCN_DEBUG ("GCN launch actuals: grid:[%u, %u, %u],"
             " normalized grid:[%u, %u, %u], workgroup:[%u, %u, %u]\n",
             packet->grid_size_x, packet->grid_size_y, packet->grid_size_z,
             packet->grid_size_x / packet->workgroup_size_x,
             packet->grid_size_y / packet->workgroup_size_y,
             packet->grid_size_z / packet->workgroup_size_z,
             packet->workgroup_size_x, packet->workgroup_size_y,
             packet->workgroup_size_z);

  struct kernel_dispatch *shadow
    = create_kernel_dispatch (kernel, packet->grid_size_x,
                              packet->grid_size_z);
  shadow->queue = command_q;

  if (debug)
    {
      fprintf (stderr, "\nKernel has following dependencies:\n");
      print_kernel_dispatch (shadow, 2);
    }

  packet->private_segment_size = shadow->private_segment_size;
  packet->group_segment_size = shadow->group_segment_size;
  packet->kernel_object = shadow->object;
  packet->kernarg_address = shadow->kernarg_address;
  hsa_signal_t s;
  s.handle = shadow->signal;
  packet->completion_signal = s;
  hsa_fns.hsa_signal_store_relaxed_fn (s, 1);
  memcpy (shadow->kernarg_address, &vars, sizeof (vars));

  GCN_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");

  uint16_t header;
  header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
  header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
  header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;

  GCN_DEBUG ("Going to dispatch kernel %s on device %d\n", kernel->name,
             agent->device_id);

  packet_store_release ((uint32_t *) packet, header,
                        (uint16_t) kla->ndim
                        << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);

  hsa_fns.hsa_signal_store_release_fn (command_q->doorbell_signal,
                                       index);

  GCN_DEBUG ("Kernel dispatched, waiting for completion\n");

  /* Root signal waits with 1ms timeout.  */
  while (hsa_fns.hsa_signal_wait_acquire_fn (s, HSA_SIGNAL_CONDITION_LT, 1,
                                             1000 * 1000,
                                             HSA_WAIT_STATE_BLOCKED) != 0)
    {
      console_output (kernel, shadow->kernarg_address, false);
    }
  console_output (kernel, shadow->kernarg_address, true);

  struct kernargs *kernargs = shadow->kernarg_address;
  unsigned int return_value = (unsigned int) kernargs->output_data.return_value;

  release_kernel_dispatch (shadow);

  if (!module_locked && pthread_rwlock_unlock (&agent->module_rwlock))
    GOMP_PLUGIN_fatal ("Unable to unlock a GCN agent rwlock");

  unsigned int upper = (return_value & ~0xffff) >> 16;
  if (upper == 0xcafe)
    ; // exit not called, normal termination.
  else if (upper == 0xffff)
    ; // exit called.
  else
    {
      GOMP_PLUGIN_error ("Possible kernel exit value corruption, 2 most"
                         " significant bytes aren't 0xffff or 0xcafe: 0x%x\n",
                         return_value);
      abort ();
    }

  if (upper == 0xffff)
    {
      unsigned int signal = (return_value >> 8) & 0xff;

      if (signal == SIGABRT)
        {
          GCN_WARNING ("GCN Kernel aborted\n");
          abort ();
        }
      else if (signal != 0)
        {
          GCN_WARNING ("GCN Kernel received unknown signal\n");
          abort ();
        }

      GCN_DEBUG ("GCN Kernel exited with value: %d\n", return_value & 0xff);
      exit (return_value & 0xff);
    }
}
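
/* For illustration only: the return-value word checked above encodes, by
   convention, 0xcafe in the upper two bytes when the kernel ran to
   completion, or 0xffffSSVV when the kernel called exit, where SS is the
   signal number (if any) and VV the exit value.  For example, a device-side
   exit (3) produces 0xffff0003: upper = 0xffff, signal = 0, value = 3.  */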

/* }}}  */
/* {{{ Load/Unload  */

/* Initialize KERNEL from D and other parameters.  Return true on success.  */

static bool
init_basic_kernel_info (struct kernel_info *kernel,
                        struct hsa_kernel_description *d,
                        struct agent_info *agent,
                        struct module_info *module)
{
  kernel->agent = agent;
  kernel->module = module;
  kernel->name = d->name;
  kernel->description = d;
  if (pthread_mutex_init (&kernel->init_mutex, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");
      return false;
    }
  return true;
}

/* Check that the GCN ISA of the given image matches the ISA of the agent.  */

static bool
isa_matches_agent (struct agent_info *agent, Elf64_Ehdr *image)
{
  int isa_field = elf_gcn_isa_field (image);
  const char *isa_s = isa_hsa_name (isa_field);
  if (!isa_s)
    {
      hsa_error ("Unsupported ISA in GCN code object.", HSA_STATUS_ERROR);
      return false;
    }

  if (isa_field != agent->device_isa)
    {
      char msg[120];
      const char *agent_isa_s = isa_hsa_name (agent->device_isa);
      const char *agent_isa_gcc_s = isa_gcc_name (agent->device_isa);
      assert (agent_isa_s);
      assert (agent_isa_gcc_s);

      snprintf (msg, sizeof msg,
                "GCN code object ISA '%s' does not match GPU ISA '%s'.\n"
                "Try to recompile with '-foffload-options=-march=%s'.\n",
                isa_s, agent_isa_s, agent_isa_gcc_s);

      hsa_error (msg, HSA_STATUS_ERROR);
      return false;
    }

  return true;
}
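
/* For illustration only: loading a binary compiled for gfx908 on a gfx90a
   device would produce

     GCN code object ISA 'gfx908' does not match GPU ISA 'gfx90a'.
     Try to recompile with '-foffload-options=-march=gfx90a'.  */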

/* Create and finalize the program consisting of all loaded modules.  */

static bool
create_and_finalize_hsa_program (struct agent_info *agent)
{
  hsa_status_t status;
  bool res = true;
  if (pthread_mutex_lock (&agent->prog_mutex))
    {
      GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
      return false;
    }
  if (agent->prog_finalized)
    goto final;

  status
    = hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
                                        HSA_EXECUTABLE_STATE_UNFROZEN,
                                        "", &agent->executable);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not create GCN executable", status);
      goto fail;
    }

  /* Load any GCN modules.  */
  struct module_info *module = agent->module;
  if (module)
    {
      Elf64_Ehdr *image = (Elf64_Ehdr *) module->image_desc->gcn_image->image;

      if (!isa_matches_agent (agent, image))
        goto fail;

      hsa_code_object_t co = { 0 };
      status = hsa_fns.hsa_code_object_deserialize_fn
        (module->image_desc->gcn_image->image,
         module->image_desc->gcn_image->size,
         NULL, &co);
      if (status != HSA_STATUS_SUCCESS)
        {
          hsa_error ("Could not deserialize GCN code object", status);
          goto fail;
        }

      status = hsa_fns.hsa_executable_load_code_object_fn
        (agent->executable, agent->id, co, "");
      if (status != HSA_STATUS_SUCCESS)
        {
          hsa_error ("Could not load GCN code object", status);
          goto fail;
        }

      if (!module->heap)
        {
          status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
                                                   gcn_kernel_heap_size,
                                                   (void **) &module->heap);
          if (status != HSA_STATUS_SUCCESS)
            {
              hsa_error ("Could not allocate memory for GCN heap", status);
              goto fail;
            }

          status = hsa_fns.hsa_memory_assign_agent_fn
            (module->heap, agent->id, HSA_ACCESS_PERMISSION_RW);
          if (status != HSA_STATUS_SUCCESS)
            {
              hsa_error ("Could not assign GCN heap memory to device", status);
              goto fail;
            }

          hsa_fns.hsa_memory_copy_fn (&module->heap->size,
                                      &gcn_kernel_heap_size,
                                      sizeof (gcn_kernel_heap_size));
        }
    }

  if (debug)
    dump_executable_symbols (agent->executable);

  status = hsa_fns.hsa_executable_freeze_fn (agent->executable, "");
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not freeze the GCN executable", status);
      goto fail;
    }

final:
  agent->prog_finalized = true;

  if (pthread_mutex_unlock (&agent->prog_mutex))
    {
      GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
      res = false;
    }

  return res;

fail:
  res = false;
  goto final;
}

/* Free the HSA program in agent and everything associated with it and set
   agent->prog_finalized and the initialized flags of all kernels to false.
   Return TRUE on success.  */

static bool
destroy_hsa_program (struct agent_info *agent)
{
  if (!agent->prog_finalized)
    return true;

  hsa_status_t status;

  GCN_DEBUG ("Destroying the current GCN program.\n");

  status = hsa_fns.hsa_executable_destroy_fn (agent->executable);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Could not destroy GCN executable", status);

  if (agent->module)
    {
      int i;
      for (i = 0; i < agent->module->kernel_count; i++)
        agent->module->kernels[i].initialized = false;

      if (agent->module->heap)
        {
          hsa_fns.hsa_memory_free_fn (agent->module->heap);
          agent->module->heap = NULL;
        }
    }
  agent->prog_finalized = false;
  return true;
}

/* Deinitialize all information associated with MODULE and kernels within
   it.  Return TRUE on success.  */

static bool
destroy_module (struct module_info *module, bool locked)
{
  /* Run destructors before destroying module.  */
  struct GOMP_kernel_launch_attributes kla =
    { 3,
      /* Grid size.  */
      { 1, 64, 1 },
      /* Work-group size.  */
      { 1, 64, 1 }
    };

  if (module->fini_array_func)
    {
      init_kernel (module->fini_array_func);
      run_kernel (module->fini_array_func, NULL, &kla, NULL, locked);
    }
  module->constructors_run_p = false;

  int i;
  for (i = 0; i < module->kernel_count; i++)
    if (pthread_mutex_destroy (&module->kernels[i].init_mutex))
      {
        GOMP_PLUGIN_error ("Failed to destroy a GCN kernel initialization "
                           "mutex");
        return false;
      }

  return true;
}

/* }}}  */
/* {{{ Async  */

/* Execute a single async queue entry: a kernel launch, a host callback, a
   wait for a placeholder, or the signalling of a placeholder.  */

static void
execute_queue_entry (struct goacc_asyncqueue *aq, int index)
{
  struct queue_entry *entry = &aq->queue[index];

  switch (entry->type)
    {
    case KERNEL_LAUNCH:
      if (DEBUG_QUEUES)
        GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d)\n",
                   aq->agent->device_id, aq->id, index);
      run_kernel (entry->u.launch.kernel,
                  entry->u.launch.vars,
                  &entry->u.launch.kla, aq, false);
      if (DEBUG_QUEUES)
        GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d) done\n",
                   aq->agent->device_id, aq->id, index);
      break;

    case CALLBACK:
      if (DEBUG_QUEUES)
        GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d)\n",
                   aq->agent->device_id, aq->id, index);
      entry->u.callback.fn (entry->u.callback.data);
      if (DEBUG_QUEUES)
        GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d) done\n",
                   aq->agent->device_id, aq->id, index);
      break;

    case ASYNC_WAIT:
      {
        /* FIXME: is it safe to access a placeholder that may already have
           been executed?  */
        struct placeholder *placeholderp = entry->u.asyncwait.placeholderp;

        if (DEBUG_QUEUES)
          GCN_DEBUG ("Async thread %d:%d: Executing async wait entry (%d)\n",
                     aq->agent->device_id, aq->id, index);

        pthread_mutex_lock (&placeholderp->mutex);

        while (!placeholderp->executed)
          pthread_cond_wait (&placeholderp->cond, &placeholderp->mutex);

        pthread_mutex_unlock (&placeholderp->mutex);

        if (pthread_cond_destroy (&placeholderp->cond))
          GOMP_PLUGIN_error ("Failed to destroy serialization cond");

        if (pthread_mutex_destroy (&placeholderp->mutex))
          GOMP_PLUGIN_error ("Failed to destroy serialization mutex");

        if (DEBUG_QUEUES)
          GCN_DEBUG ("Async thread %d:%d: Executing async wait "
                     "entry (%d) done\n", aq->agent->device_id, aq->id, index);
      }
      break;

    case ASYNC_PLACEHOLDER:
      pthread_mutex_lock (&entry->u.placeholder.mutex);
      entry->u.placeholder.executed = 1;
      pthread_cond_signal (&entry->u.placeholder.cond);
      pthread_mutex_unlock (&entry->u.placeholder.mutex);
      break;

    default:
      GOMP_PLUGIN_fatal ("Unknown queue element");
    }
}

/* This function is run as a thread to service an async queue in the
   background.  It runs continuously until the stop flag is set.  */

static void *
drain_queue (void *thread_arg)
{
  struct goacc_asyncqueue *aq = thread_arg;

  if (DRAIN_QUEUE_SYNCHRONOUS_P)
    {
      aq->drain_queue_stop = 2;
      return NULL;
    }

  pthread_mutex_lock (&aq->mutex);

  while (true)
    {
      if (aq->drain_queue_stop)
        break;

      if (aq->queue_n > 0)
        {
          pthread_mutex_unlock (&aq->mutex);
          execute_queue_entry (aq, aq->queue_first);

          pthread_mutex_lock (&aq->mutex);
          aq->queue_first = ((aq->queue_first + 1)
                             % ASYNC_QUEUE_SIZE);
          aq->queue_n--;

          if (DEBUG_THREAD_SIGNAL)
            GCN_DEBUG ("Async thread %d:%d: broadcasting queue out update\n",
                       aq->agent->device_id, aq->id);
          pthread_cond_broadcast (&aq->queue_cond_out);
          pthread_mutex_unlock (&aq->mutex);

          if (DEBUG_QUEUES)
            GCN_DEBUG ("Async thread %d:%d: continue\n", aq->agent->device_id,
                       aq->id);
          pthread_mutex_lock (&aq->mutex);
        }
      else
        {
          if (DEBUG_THREAD_SLEEP)
            GCN_DEBUG ("Async thread %d:%d: going to sleep\n",
                       aq->agent->device_id, aq->id);
          pthread_cond_wait (&aq->queue_cond_in, &aq->mutex);
          if (DEBUG_THREAD_SLEEP)
            GCN_DEBUG ("Async thread %d:%d: woke up, rechecking\n",
                       aq->agent->device_id, aq->id);
        }
    }

  aq->drain_queue_stop = 2;
  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("Async thread %d:%d: broadcasting last queue out update\n",
               aq->agent->device_id, aq->id);
  pthread_cond_broadcast (&aq->queue_cond_out);
  pthread_mutex_unlock (&aq->mutex);

  GCN_DEBUG ("Async thread %d:%d: returning\n", aq->agent->device_id, aq->id);
  return NULL;
}

/* This function is used only when DRAIN_QUEUE_SYNCHRONOUS_P is set, which
   is not usually the case.  This is just a debug tool.  */

static void
drain_queue_synchronous (struct goacc_asyncqueue *aq)
{
  pthread_mutex_lock (&aq->mutex);

  while (aq->queue_n > 0)
    {
      execute_queue_entry (aq, aq->queue_first);

      aq->queue_first = ((aq->queue_first + 1)
                         % ASYNC_QUEUE_SIZE);
      aq->queue_n--;
    }

  pthread_mutex_unlock (&aq->mutex);
}

/* Block the current thread until an async queue is writable.  The aq->mutex
   lock should be held on entry, and remains locked on exit.  */

static void
wait_for_queue_nonfull (struct goacc_asyncqueue *aq)
{
  if (aq->queue_n == ASYNC_QUEUE_SIZE)
    {
      /* Queue is full.  Wait for it to not be full.  */
      while (aq->queue_n == ASYNC_QUEUE_SIZE)
        pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
    }
}

/* Request an asynchronous kernel launch on the specified queue.  This
   may block if the queue is full, but returns without waiting for the
   kernel to run.  */

static void
queue_push_launch (struct goacc_asyncqueue *aq, struct kernel_info *kernel,
                   void *vars, struct GOMP_kernel_launch_attributes *kla)
{
  assert (aq->agent == kernel->agent);

  pthread_mutex_lock (&aq->mutex);

  wait_for_queue_nonfull (aq);

  int queue_last = ((aq->queue_first + aq->queue_n)
                    % ASYNC_QUEUE_SIZE);
  if (DEBUG_QUEUES)
    GCN_DEBUG ("queue_push_launch %d:%d: at %i\n", aq->agent->device_id,
               aq->id, queue_last);

  aq->queue[queue_last].type = KERNEL_LAUNCH;
  aq->queue[queue_last].u.launch.kernel = kernel;
  aq->queue[queue_last].u.launch.vars = vars;
  aq->queue[queue_last].u.launch.kla = *kla;

  aq->queue_n++;

  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
               aq->agent->device_id, aq->id);
  pthread_cond_signal (&aq->queue_cond_in);

  pthread_mutex_unlock (&aq->mutex);
}

/* Request an asynchronous callback on the specified queue.  The callback
   function will be called, with the given opaque data, from the appropriate
   async thread, when all previous items on that queue are complete.  */

static void
queue_push_callback (struct goacc_asyncqueue *aq, void (*fn)(void *),
                     void *data)
{
  pthread_mutex_lock (&aq->mutex);

  wait_for_queue_nonfull (aq);

  int queue_last = ((aq->queue_first + aq->queue_n)
                    % ASYNC_QUEUE_SIZE);
  if (DEBUG_QUEUES)
    GCN_DEBUG ("queue_push_callback %d:%d: at %i\n", aq->agent->device_id,
               aq->id, queue_last);

  aq->queue[queue_last].type = CALLBACK;
  aq->queue[queue_last].u.callback.fn = fn;
  aq->queue[queue_last].u.callback.data = data;

  aq->queue_n++;

  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
               aq->agent->device_id, aq->id);
  pthread_cond_signal (&aq->queue_cond_in);

  pthread_mutex_unlock (&aq->mutex);
}

/* Request that a given async thread wait for another thread (unspecified) to
   reach the given placeholder.  The wait will occur when all previous entries
   on the queue are complete.  A placeholder is effectively a kind of signal
   which simply sets a flag when encountered in a queue.  */

static void
queue_push_asyncwait (struct goacc_asyncqueue *aq,
                      struct placeholder *placeholderp)
{
  pthread_mutex_lock (&aq->mutex);

  wait_for_queue_nonfull (aq);

  int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
  if (DEBUG_QUEUES)
    GCN_DEBUG ("queue_push_asyncwait %d:%d: at %i\n", aq->agent->device_id,
               aq->id, queue_last);

  aq->queue[queue_last].type = ASYNC_WAIT;
  aq->queue[queue_last].u.asyncwait.placeholderp = placeholderp;

  aq->queue_n++;

  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
               aq->agent->device_id, aq->id);
  pthread_cond_signal (&aq->queue_cond_in);

  pthread_mutex_unlock (&aq->mutex);
}

/* Add a placeholder into an async queue.  When the async thread reaches the
   placeholder it will set the "executed" flag to true and continue.
   Another thread may be waiting on this thread reaching the placeholder.  */

static struct placeholder *
queue_push_placeholder (struct goacc_asyncqueue *aq)
{
  struct placeholder *placeholderp;

  pthread_mutex_lock (&aq->mutex);

  wait_for_queue_nonfull (aq);

  int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
  if (DEBUG_QUEUES)
    GCN_DEBUG ("queue_push_placeholder %d:%d: at %i\n", aq->agent->device_id,
               aq->id, queue_last);

  aq->queue[queue_last].type = ASYNC_PLACEHOLDER;
  placeholderp = &aq->queue[queue_last].u.placeholder;

  if (pthread_mutex_init (&placeholderp->mutex, NULL))
    {
      pthread_mutex_unlock (&aq->mutex);
      GOMP_PLUGIN_error ("Failed to initialize serialization mutex");
    }

  if (pthread_cond_init (&placeholderp->cond, NULL))
    {
      pthread_mutex_unlock (&aq->mutex);
      GOMP_PLUGIN_error ("Failed to initialize serialization cond");
    }

  placeholderp->executed = 0;

  aq->queue_n++;

  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
               aq->agent->device_id, aq->id);
  pthread_cond_signal (&aq->queue_cond_in);

  pthread_mutex_unlock (&aq->mutex);

  return placeholderp;
}
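
/* For illustration only: placeholders and async waits pair up to serialize
   two queues.  To make AQ1 wait for everything currently enqueued on AQ2, a
   hypothetical caller would do

     struct placeholder *p = queue_push_placeholder (aq2);
     queue_push_asyncwait (aq1, p);

   AQ1's drain thread then sleeps at the ASYNC_WAIT entry until AQ2 reaches
   the placeholder and signals it.  */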

/* Signal an asynchronous thread to terminate, and wait for it to do so.  */

static void
finalize_async_thread (struct goacc_asyncqueue *aq)
{
  pthread_mutex_lock (&aq->mutex);
  if (aq->drain_queue_stop == 2)
    {
      pthread_mutex_unlock (&aq->mutex);
      return;
    }

  aq->drain_queue_stop = 1;

  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("Signalling async thread %d:%d: cond_in\n",
               aq->agent->device_id, aq->id);
  pthread_cond_signal (&aq->queue_cond_in);

  while (aq->drain_queue_stop != 2)
    {
      if (DEBUG_THREAD_SLEEP)
        GCN_DEBUG ("Waiting for async thread %d:%d to finish, putting thread"
                   " to sleep\n", aq->agent->device_id, aq->id);
      pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
      if (DEBUG_THREAD_SLEEP)
        GCN_DEBUG ("Waiting, woke up thread %d:%d.  Rechecking\n",
                   aq->agent->device_id, aq->id);
    }

  GCN_DEBUG ("Done waiting for async thread %d:%d\n", aq->agent->device_id,
             aq->id);
  pthread_mutex_unlock (&aq->mutex);

  int err = pthread_join (aq->thread_drain_queue, NULL);
  if (err != 0)
    GOMP_PLUGIN_fatal ("Join async thread %d:%d: failed: %s",
                       aq->agent->device_id, aq->id, strerror (err));
  GCN_DEBUG ("Joined with async thread %d:%d\n", aq->agent->device_id, aq->id);
}

/* Set up an async queue for OpenMP.  There will be only one.  The
   implementation simply uses an OpenACC async queue.
   FIXME: is this thread-safe if two threads call this function?  */

static void
maybe_init_omp_async (struct agent_info *agent)
{
  if (!agent->omp_async_queue)
    agent->omp_async_queue
      = GOMP_OFFLOAD_openacc_async_construct (agent->device_id);
}

/* A wrapper that works around an issue in the HSA runtime with host-to-device
   copies from read-only pages.  */

static void
hsa_memory_copy_wrapper (void *dst, const void *src, size_t len)
{
  hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, len);

  if (status == HSA_STATUS_SUCCESS)
    return;

  /* It appears that the copy fails if the source data is in a read-only page.
     We can't detect that easily, so try copying the data to a temporary buffer
     and doing the copy again if we got an error above.  */

  GCN_WARNING ("Read-only data transfer bug workaround triggered for "
               "[%p:+%d]\n", (void *) src, (int) len);

  void *src_copy = malloc (len);
  memcpy (src_copy, src, len);
  status = hsa_fns.hsa_memory_copy_fn (dst, (const void *) src_copy, len);
  free (src_copy);
  if (status != HSA_STATUS_SUCCESS)
    GOMP_PLUGIN_error ("memory copy failed");
}

/* Copy data to or from a device.  This is intended for use as an async
   callback event.  */

static void
copy_data (void *data_)
{
  struct copy_data *data = (struct copy_data *) data_;
  GCN_DEBUG ("Async thread %d:%d: Copying %zu bytes from (%p) to (%p)\n",
             data->aq->agent->device_id, data->aq->id, data->len, data->src,
             data->dst);
  hsa_memory_copy_wrapper (data->dst, data->src, data->len);
  free (data);
}

/* Request an asynchronous data copy, to or from a device, on a given queue.
   The event will be registered as a callback.  */

static void
queue_push_copy (struct goacc_asyncqueue *aq, void *dst, const void *src,
                 size_t len)
{
  if (DEBUG_QUEUES)
    GCN_DEBUG ("queue_push_copy %d:%d: %zu bytes from (%p) to (%p)\n",
               aq->agent->device_id, aq->id, len, src, dst);
  struct copy_data *data
    = (struct copy_data *) GOMP_PLUGIN_malloc (sizeof (struct copy_data));
  data->dst = dst;
  data->src = src;
  data->len = len;
  data->aq = aq;
  queue_push_callback (aq, copy_data, data);
}
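
/* For illustration only: because the copy is queued as an ordinary CALLBACK
   entry, it is ordered with respect to kernel launches on the same queue.
   A hypothetical async device-to-host transfer of a result buffer after a
   kernel would be

     queue_push_launch (aq, kernel, devaddrs, &kla);
     queue_push_copy (aq, host_buf, dev_buf, len);

   and host_buf is only safe to read once wait_queue (aq) returns.  */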

/* Return true if the given queue is currently empty.  */

static int
queue_empty (struct goacc_asyncqueue *aq)
{
  pthread_mutex_lock (&aq->mutex);
  int res = aq->queue_n == 0 ? 1 : 0;
  pthread_mutex_unlock (&aq->mutex);

  return res;
}

/* Wait for a given queue to become empty.  This implements an OpenACC wait
   directive.  */

static void
wait_queue (struct goacc_asyncqueue *aq)
{
  if (DRAIN_QUEUE_SYNCHRONOUS_P)
    {
      drain_queue_synchronous (aq);
      return;
    }

  pthread_mutex_lock (&aq->mutex);

  while (aq->queue_n > 0)
    {
      if (DEBUG_THREAD_SLEEP)
        GCN_DEBUG ("waiting for thread %d:%d, putting thread to sleep\n",
                   aq->agent->device_id, aq->id);
      pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
      if (DEBUG_THREAD_SLEEP)
        GCN_DEBUG ("thread %d:%d woke up.  Rechecking\n", aq->agent->device_id,
                   aq->id);
    }

  pthread_mutex_unlock (&aq->mutex);
  GCN_DEBUG ("waiting for thread %d:%d, done\n", aq->agent->device_id, aq->id);
}
3123/* }}} */
3124/* {{{ OpenACC support */
3125
3126/* Execute an OpenACC kernel, synchronously or asynchronously. */
3127
3128static void
f8332e52 3129gcn_exec (struct kernel_info *kernel,
237957cc
AS
3130 void **devaddrs, unsigned *dims, void *targ_mem_desc, bool async,
3131 struct goacc_asyncqueue *aq)
3132{
3133 if (!GOMP_OFFLOAD_can_run (kernel))
3134 GOMP_PLUGIN_fatal ("OpenACC host fallback unimplemented.");
3135
3136 /* If we get here then this must be an OpenACC kernel. */
3137 kernel->kind = KIND_OPENACC;
3138
237957cc
AS
3139 struct hsa_kernel_description *hsa_kernel_desc = NULL;
3140 for (unsigned i = 0; i < kernel->module->image_desc->kernel_count; i++)
3141 {
3142 struct hsa_kernel_description *d
3143 = &kernel->module->image_desc->kernel_infos[i];
3144 if (d->name == kernel->name)
3145 {
3146 hsa_kernel_desc = d;
3147 break;
3148 }
3149 }
3150
3151 /* We may have statically-determined dimensions in
3152 hsa_kernel_desc->oacc_dims[] or dimensions passed to this offload kernel
3153 invocation at runtime in dims[]. We allow static dimensions to take
3154 priority over dynamic dimensions when present (non-zero). */
3155 if (hsa_kernel_desc->oacc_dims[0] > 0)
3156 dims[0] = hsa_kernel_desc->oacc_dims[0];
3157 if (hsa_kernel_desc->oacc_dims[1] > 0)
3158 dims[1] = hsa_kernel_desc->oacc_dims[1];
3159 if (hsa_kernel_desc->oacc_dims[2] > 0)
3160 dims[2] = hsa_kernel_desc->oacc_dims[2];
3161
a78b1ab1
KCY
3162 /* Ideally, when a dimension isn't explicitly specified, we should
3163 tune it to run 40 (or 32?) threads per CU with no threads getting queued.
3164 In practice, we tune for peak performance on BabelStream, which
3165 for OpenACC is currently 32 threads per CU. */
3166 if (dims[0] == 0 && dims[1] == 0)
3167 {
3168 /* If any of the OpenACC dimensions remain 0 then we get to pick a
3169 number. There isn't really a correct answer for this without a clue
3170 about the problem size, so let's do a reasonable number of workers
3171 and gangs. */
237957cc 3172
a78b1ab1
KCY
3173 dims[0] = get_cu_count (kernel->agent) * 4; /* Gangs. */
3174 dims[1] = 8; /* Workers. */
3175 }
3176 else if (dims[0] == 0 && dims[1] > 0)
3177 {
3178 /* Auto-scale the number of gangs with the requested number of workers. */
3179 dims[0] = get_cu_count (kernel->agent) * (32 / dims[1]);
3180 }
3181 else if (dims[0] > 0 && dims[1] == 0)
3182 {
3183 /* Auto-scale the number of workers with the requested number of gangs. */
3184 dims[1] = get_cu_count (kernel->agent) * 32 / dims[0];
3185 if (dims[1] == 0)
3186 dims[1] = 1;
3187 if (dims[1] > 16)
3188 dims[1] = 16;
3189 }
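
  /* For illustration only: on a hypothetical 60-CU device, fully
     unspecified dimensions give 240 gangs of 8 workers; a request for 16
     workers gives 60 * (32 / 16) = 120 gangs; and a request for 480 gangs
     gives 60 * 32 / 480 = 4 workers per gang.  */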

  /* The incoming dimensions are expressed in terms of gangs, workers, and
     vectors.  The HSA dimensions are expressed in terms of "work-items",
     which means multiples of vector lanes.

     The "grid size" specifies the size of the problem space, and the
     "work-group size" specifies how much of that we want a single compute
     unit to chew on at once.

     The three dimensions do not really correspond to hardware, but the
     important thing is that the HSA runtime will launch as many
     work-groups as it takes to process the entire grid, and each
     work-group will contain as many wave-fronts as it takes to process
     the work-items in that group.

     Essentially, as long as we set the Y dimension to 64 (the number of
     vector lanes in hardware), and the Z group size to the maximum (16),
     then we will get the gangs (X) and workers (Z) launched as we expect.

     The reason for the apparent reversal of vector and worker dimension
     order has to do with the way the run-time distributes work-items across
     v1 and v2.  */
  struct GOMP_kernel_launch_attributes kla =
    {3,
     /* Grid size.  */
     {dims[0], 64, dims[1]},
     /* Work-group size.  */
     {1, 64, 16}
    };

  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  acc_prof_info *prof_info = thr->prof_info;
  acc_event_info enqueue_launch_event_info;
  acc_api_info *api_info = thr->api_info;
  bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
  if (profiling_dispatch_p)
    {
      prof_info->event_type = acc_ev_enqueue_launch_start;

      enqueue_launch_event_info.launch_event.event_type
        = prof_info->event_type;
      enqueue_launch_event_info.launch_event.valid_bytes
        = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
      enqueue_launch_event_info.launch_event.parent_construct
        = acc_construct_parallel;
      enqueue_launch_event_info.launch_event.implicit = 1;
      enqueue_launch_event_info.launch_event.tool_info = NULL;
      enqueue_launch_event_info.launch_event.kernel_name
        = (char *) kernel->name;
      enqueue_launch_event_info.launch_event.num_gangs = kla.gdims[0];
      enqueue_launch_event_info.launch_event.num_workers = kla.gdims[2];
      enqueue_launch_event_info.launch_event.vector_length = kla.gdims[1];

      api_info->device_api = acc_device_api_other;

      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
                                            &enqueue_launch_event_info,
                                            api_info);
    }

  if (!async)
    run_kernel (kernel, devaddrs, &kla, NULL, false);
  else
    queue_push_launch (aq, kernel, devaddrs, &kla);

  if (profiling_dispatch_p)
    {
      prof_info->event_type = acc_ev_enqueue_launch_end;
      enqueue_launch_event_info.launch_event.event_type = prof_info->event_type;
      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
                                            &enqueue_launch_event_info,
                                            api_info);
    }
}

/* }}}  */
/* {{{ Generic Plugin API  */

/* Return the name of the accelerator, which is "gcn".  */

const char *
GOMP_OFFLOAD_get_name (void)
{
  return "gcn";
}

/* Return the specific capabilities the HSA accelerator has.  */

unsigned int
GOMP_OFFLOAD_get_caps (void)
{
  /* FIXME: Enable shared memory for APU, but not discrete GPU.  */
  return /*GOMP_OFFLOAD_CAP_SHARED_MEM |*/ GOMP_OFFLOAD_CAP_OPENMP_400
         | GOMP_OFFLOAD_CAP_OPENACC_200;
}

/* Identify as GCN accelerator.  */

int
GOMP_OFFLOAD_get_type (void)
{
  return OFFLOAD_TARGET_TYPE_GCN;
}

/* Return the libgomp version number we're compatible with.  There is
   no requirement for cross-version compatibility.  */

unsigned
GOMP_OFFLOAD_version (void)
{
  return GOMP_VERSION;
}

/* Return the number of GCN devices on the system.  */

int
GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask)
{
  if (!init_hsa_context ())
    return 0;
  /* Return -1 if the omp_requires_mask cannot be fulfilled, but devices
     were present.  */
  if (hsa_context.agent_count > 0
      && ((omp_requires_mask
           & ~(GOMP_REQUIRES_UNIFIED_ADDRESS
               | GOMP_REQUIRES_REVERSE_OFFLOAD)) != 0))
    return -1;
  return hsa_context.agent_count;
}

/* Initialize device (agent) number N so that it can be used for computation.
   Return TRUE on success.  */

bool
GOMP_OFFLOAD_init_device (int n)
{
  if (!init_hsa_context ())
    return false;
  if (n >= hsa_context.agent_count)
    {
      GOMP_PLUGIN_error ("Request to initialize non-existent GCN device %i", n);
      return false;
    }
  struct agent_info *agent = &hsa_context.agents[n];

  if (agent->initialized)
    return true;

  agent->device_id = n;

  if (pthread_rwlock_init (&agent->module_rwlock, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent rwlock");
      return false;
    }
  if (pthread_mutex_init (&agent->prog_mutex, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent program mutex");
      return false;
    }
  if (pthread_mutex_init (&agent->async_queues_mutex, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
      return false;
    }
  if (pthread_mutex_init (&agent->ephemeral_memories_write_lock, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
      return false;
    }
  agent->async_queues = NULL;
  agent->omp_async_queue = NULL;
  agent->ephemeral_memories_list = NULL;

  uint32_t queue_size;
  hsa_status_t status;
  status = hsa_fns.hsa_agent_get_info_fn (agent->id,
                                          HSA_AGENT_INFO_QUEUE_MAX_SIZE,
                                          &queue_size);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error requesting maximum queue size of the GCN agent",
                      status);

  status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_NAME,
                                          &agent->name);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error querying the name of the agent", status);

  agent->device_isa = isa_code (agent->name);
  if (agent->device_isa == EF_AMDGPU_MACH_UNSUPPORTED)
    return hsa_error ("Unknown GCN agent architecture", HSA_STATUS_ERROR);

  status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_VENDOR_NAME,
                                          &agent->vendor_name);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error querying the vendor name of the agent", status);

  status = hsa_fns.hsa_queue_create_fn (agent->id, queue_size,
                                        HSA_QUEUE_TYPE_MULTI,
                                        hsa_queue_callback, NULL, UINT32_MAX,
                                        UINT32_MAX, &agent->sync_queue);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error creating command queue", status);

  agent->kernarg_region.handle = (uint64_t) -1;
  status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
                                                 get_kernarg_memory_region,
                                                 &agent->kernarg_region);
  if (status != HSA_STATUS_SUCCESS
      && status != HSA_STATUS_INFO_BREAK)
    hsa_error ("Scanning memory regions failed", status);
  if (agent->kernarg_region.handle == (uint64_t) -1)
    {
      GOMP_PLUGIN_error ("Could not find suitable memory region for kernel "
                         "arguments");
      return false;
    }
  GCN_DEBUG ("Selected kernel arguments memory region:\n");
  dump_hsa_region (agent->kernarg_region, NULL);

  agent->data_region.handle = (uint64_t) -1;
  status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
                                                 get_data_memory_region,
                                                 &agent->data_region);
  if (status != HSA_STATUS_SUCCESS
      && status != HSA_STATUS_INFO_BREAK)
    hsa_error ("Scanning memory regions failed", status);
  if (agent->data_region.handle == (uint64_t) -1)
    {
      GOMP_PLUGIN_error ("Could not find suitable memory region for device "
                         "data");
      return false;
    }
  GCN_DEBUG ("Selected device data memory region:\n");
  dump_hsa_region (agent->data_region, NULL);

  GCN_DEBUG ("GCN agent %d initialized\n", n);

  agent->initialized = true;
  return true;
}
3430
3431/* Load GCN object-code module described by struct gcn_image_desc in
3432 TARGET_DATA and return references to kernel descriptors in TARGET_TABLE.
0fcc0cf9
TB
3433 If there are any constructors then run them. If not NULL, REV_FN_TABLE will
3434 contain the on-device addresses of the functions for reverse offload. To be
3435 freed by the caller. */

int
GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
                         struct addr_pair **target_table,
                         uint64_t **rev_fn_table,
                         uint64_t *host_ind_fn_table)
{
  if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
                         " (expected %u, received %u)",
                         GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
      return -1;
    }

  struct gcn_image_desc *image_desc = (struct gcn_image_desc *) target_data;
  struct agent_info *agent;
  struct addr_pair *pair;
  struct module_info *module;
  struct kernel_info *kernel;
  int kernel_count = image_desc->kernel_count;
  unsigned ind_func_count = GOMP_VERSION_SUPPORTS_INDIRECT_FUNCS (version)
                            ? image_desc->ind_func_count : 0;
  unsigned var_count = image_desc->global_variable_count;
  /* Currently, "others" is a struct of ICVs.  */
  int other_count = 1;

  agent = get_agent_info (ord);
  if (!agent)
    return -1;

  if (pthread_rwlock_wrlock (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
      return -1;
    }
  if (agent->prog_finalized
      && !destroy_hsa_program (agent))
    return -1;

  GCN_DEBUG ("Encountered %d kernels in an image\n", kernel_count);
  GCN_DEBUG ("Encountered %u indirect functions in an image\n", ind_func_count);
  GCN_DEBUG ("Encountered %u global variables in an image\n", var_count);
  GCN_DEBUG ("Expect %d other variables in an image\n", other_count);
  pair = GOMP_PLUGIN_malloc ((kernel_count + var_count + other_count - 2)
                             * sizeof (struct addr_pair));
  *target_table = pair;
  module = (struct module_info *)
    GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info)
                                + kernel_count * sizeof (struct kernel_info));
  module->image_desc = image_desc;
  module->kernel_count = kernel_count;
  module->heap = NULL;
  module->constructors_run_p = false;

  kernel = &module->kernels[0];

  /* Allocate memory for kernel dependencies.  */
  for (unsigned i = 0; i < kernel_count; i++)
    {
      struct hsa_kernel_description *d = &image_desc->kernel_infos[i];
      if (!init_basic_kernel_info (kernel, d, agent, module))
        return -1;
      if (strcmp (d->name, "_init_array") == 0)
        module->init_array_func = kernel;
      else if (strcmp (d->name, "_fini_array") == 0)
        module->fini_array_func = kernel;
      else
        {
          pair->start = (uintptr_t) kernel;
          pair->end = (uintptr_t) (kernel + 1);
          pair++;
        }
      kernel++;
    }

  agent->module = module;
  if (pthread_rwlock_unlock (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
      return -1;
    }

  if (!create_and_finalize_hsa_program (agent))
    return -1;
  if (var_count > 0)
    {
      hsa_status_t status;
      hsa_executable_symbol_t var_symbol;
      status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
                                                     ".offload_var_table",
                                                     agent->id,
                                                     0, &var_symbol);

      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not find symbol for variable in the code object",
                   status);

      uint64_t var_table_addr;
      status = hsa_fns.hsa_executable_symbol_get_info_fn
        (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
         &var_table_addr);
      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not extract a variable from its symbol", status);

      struct {
        uint64_t addr;
        uint64_t size;
      } var_table[var_count];
      GOMP_OFFLOAD_dev2host (agent->device_id, var_table,
                             (void*)var_table_addr, sizeof (var_table));

      for (unsigned i = 0; i < var_count; i++)
        {
          pair->start = var_table[i].addr;
          pair->end = var_table[i].addr + var_table[i].size;
          GCN_DEBUG ("Found variable at %p with size %lu\n",
                     (void *)var_table[i].addr, var_table[i].size);
          pair++;
        }
    }

  if (ind_func_count > 0)
    {
      hsa_status_t status;

      /* Read indirect function table from image.  */
      hsa_executable_symbol_t ind_funcs_symbol;
      status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
                                                     ".offload_ind_func_table",
                                                     agent->id,
                                                     0, &ind_funcs_symbol);

      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not find .offload_ind_func_table symbol in the "
                   "code object", status);

      uint64_t ind_funcs_table_addr;
      status = hsa_fns.hsa_executable_symbol_get_info_fn
        (ind_funcs_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
         &ind_funcs_table_addr);
      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not extract a variable from its symbol", status);

      uint64_t ind_funcs_table[ind_func_count];
      GOMP_OFFLOAD_dev2host (agent->device_id, ind_funcs_table,
                             (void*) ind_funcs_table_addr,
                             sizeof (ind_funcs_table));

      /* Build host->target address map for indirect functions.  */
      uint64_t ind_fn_map[ind_func_count * 2 + 1];
      for (unsigned i = 0; i < ind_func_count; i++)
        {
          ind_fn_map[i * 2] = host_ind_fn_table[i];
          ind_fn_map[i * 2 + 1] = ind_funcs_table[i];
          GCN_DEBUG ("Indirect function %d: %lx->%lx\n",
                     i, host_ind_fn_table[i], ind_funcs_table[i]);
        }
      ind_fn_map[ind_func_count * 2] = 0;
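
      /* The map just built is a flat, zero-terminated sequence of
         (host address, device address) pairs:
           { host_fn0, dev_fn0, host_fn1, dev_fn1, ..., 0 }
         so the device side can translate host function pointers used in
         indirect calls.  */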

      /* Write the map onto the target.  */
      void *map_target_addr
        = GOMP_OFFLOAD_alloc (agent->device_id, sizeof (ind_fn_map));
      GCN_DEBUG ("Allocated indirect map at %p\n", map_target_addr);

      GOMP_OFFLOAD_host2dev (agent->device_id, map_target_addr,
                             (void*) ind_fn_map,
                             sizeof (ind_fn_map));

      /* Write address of the map onto the target.  */
      hsa_executable_symbol_t symbol;

      status
        = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
                                                XSTRING (GOMP_INDIRECT_ADDR_MAP),
                                                agent->id, 0, &symbol);
      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not find GOMP_INDIRECT_ADDR_MAP in code object",
                   status);

      uint64_t varptr;
      uint32_t varsize;

      status = hsa_fns.hsa_executable_symbol_get_info_fn
        (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
         &varptr);
      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not extract a variable from its symbol", status);
      status = hsa_fns.hsa_executable_symbol_get_info_fn
        (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
         &varsize);
      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not extract a variable size from its symbol",
                   status);

      GCN_DEBUG ("Found GOMP_INDIRECT_ADDR_MAP at %lx with size %d\n",
                 varptr, varsize);

      GOMP_OFFLOAD_host2dev (agent->device_id, (void *) varptr,
                             &map_target_addr,
                             sizeof (map_target_addr));
    }

  GCN_DEBUG ("Looking for variable %s\n", XSTRING (GOMP_ADDITIONAL_ICVS));

  hsa_status_t status;
  hsa_executable_symbol_t var_symbol;
  status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
                                                 XSTRING (GOMP_ADDITIONAL_ICVS),
                                                 agent->id, 0, &var_symbol);
  if (status == HSA_STATUS_SUCCESS)
    {
      uint64_t varptr;
      uint32_t varsize;

      status = hsa_fns.hsa_executable_symbol_get_info_fn
        (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
         &varptr);
      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not extract a variable from its symbol", status);
      status = hsa_fns.hsa_executable_symbol_get_info_fn
        (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
         &varsize);
      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not extract a variable size from its symbol",
                   status);

      pair->start = varptr;
      pair->end = varptr + varsize;
    }
  else
    {
      /* The variable was not in this image.  */
      GCN_DEBUG ("Variable not found in image: %s\n",
                 XSTRING (GOMP_ADDITIONAL_ICVS));
      pair->start = pair->end = 0;
    }

  /* Ensure that constructors are run first.  */
  struct GOMP_kernel_launch_attributes kla =
    { 3,
      /* Grid size.  */
      { 1, 64, 1 },
      /* Work-group size.  */
      { 1, 64, 1 }
    };
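
  /* A grid and work-group size of 1x64x1 launches the constructor kernel on
     a single work-group of 64 work-items, i.e. one GCN wavefront.  */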

  if (module->init_array_func)
    {
      init_kernel (module->init_array_func);
      run_kernel (module->init_array_func, NULL, &kla, NULL, false);
    }
  module->constructors_run_p = true;

  /* Don't report kernels that libgomp need not know about.  */
  if (module->init_array_func)
    kernel_count--;
  if (module->fini_array_func)
    kernel_count--;

  if (rev_fn_table != NULL && kernel_count == 0)
    *rev_fn_table = NULL;
  else if (rev_fn_table != NULL)
    {
      hsa_status_t status;
      hsa_executable_symbol_t var_symbol;
      status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
                                                     ".offload_func_table",
                                                     agent->id, 0, &var_symbol);
      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not find symbol for variable in the code object",
                   status);
      uint64_t fn_table_addr;
      status = hsa_fns.hsa_executable_symbol_get_info_fn
        (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
         &fn_table_addr);
      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not extract a variable from its symbol", status);
      *rev_fn_table = GOMP_PLUGIN_malloc (kernel_count * sizeof (uint64_t));
      GOMP_OFFLOAD_dev2host (agent->device_id, *rev_fn_table,
                             (void*) fn_table_addr,
                             kernel_count * sizeof (uint64_t));
    }

  return kernel_count + var_count + other_count;
}

/* Unload GCN object-code module described by struct gcn_image_desc in
   TARGET_DATA from agent number N.  Return TRUE on success.  */

bool
GOMP_OFFLOAD_unload_image (int n, unsigned version, const void *target_data)
{
  if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
                         " (expected %u, received %u)",
                         GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
      return false;
    }

  struct agent_info *agent;
  agent = get_agent_info (n);
  if (!agent)
    return false;

  if (pthread_rwlock_wrlock (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
      return false;
    }

  if (!agent->module || agent->module->image_desc != target_data)
    {
      GOMP_PLUGIN_error ("Attempt to unload an image that has never been "
                         "loaded before");
      return false;
    }

  if (!destroy_module (agent->module, true))
    return false;
  free (agent->module);
  agent->module = NULL;
  if (!destroy_hsa_program (agent))
    return false;
  if (pthread_rwlock_unlock (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
      return false;
    }
  return true;
}

/* Deinitialize all information and status associated with agent number N.  We
   do not attempt any synchronization, assuming the user and libgomp will not
   attempt deinitialization of a device that is in any way being used at the
   same time.  Return TRUE on success.  */

bool
GOMP_OFFLOAD_fini_device (int n)
{
  struct agent_info *agent = get_agent_info (n);
  if (!agent)
    return false;

  if (!agent->initialized)
    return true;

  if (agent->omp_async_queue)
    {
      GOMP_OFFLOAD_openacc_async_destruct (agent->omp_async_queue);
      agent->omp_async_queue = NULL;
    }

  if (agent->module)
    {
      if (!destroy_module (agent->module, false))
        return false;
      free (agent->module);
      agent->module = NULL;
    }

  if (!destroy_ephemeral_memories (agent))
    return false;

  if (!destroy_hsa_program (agent))
    return false;

  hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (agent->sync_queue);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error destroying command queue", status);

  if (pthread_mutex_destroy (&agent->prog_mutex))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex");
      return false;
    }
  if (pthread_rwlock_destroy (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN agent rwlock");
      return false;
    }

  if (pthread_mutex_destroy (&agent->async_queues_mutex))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN agent queue mutex");
      return false;
    }
  if (pthread_mutex_destroy (&agent->ephemeral_memories_write_lock))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex");
      return false;
    }
  agent->initialized = false;
  return true;
}

/* Return true if the HSA runtime can run function FN_PTR.  */

bool
GOMP_OFFLOAD_can_run (void *fn_ptr)
{
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;

  init_kernel (kernel);
  if (kernel->initialization_failed)
    goto failure;

  return true;

failure:
  if (suppress_host_fallback)
    GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
  GCN_WARNING ("GCN target cannot be launched, doing a host fallback\n");
  return false;
}

/* Allocate memory on device N.  */

void *
GOMP_OFFLOAD_alloc (int n, size_t size)
{
  struct agent_info *agent = get_agent_info (n);
  return alloc_by_agent (agent, size);
}

/* Free memory from device N.  */

bool
GOMP_OFFLOAD_free (int device, void *ptr)
{
  GCN_DEBUG ("Freeing memory on device %d\n", device);

  hsa_status_t status = hsa_fns.hsa_memory_free_fn (ptr);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not free device memory", status);
      return false;
    }

  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  bool profiling_dispatch_p
    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
  if (profiling_dispatch_p)
    {
      acc_prof_info *prof_info = thr->prof_info;
      acc_event_info data_event_info;
      acc_api_info *api_info = thr->api_info;

      prof_info->event_type = acc_ev_free;

      data_event_info.data_event.event_type = prof_info->event_type;
      data_event_info.data_event.valid_bytes
        = _ACC_DATA_EVENT_INFO_VALID_BYTES;
      data_event_info.data_event.parent_construct
        = acc_construct_parallel;
      data_event_info.data_event.implicit = 1;
      data_event_info.data_event.tool_info = NULL;
      data_event_info.data_event.var_name = NULL;
      data_event_info.data_event.bytes = 0;
      data_event_info.data_event.host_ptr = NULL;
      data_event_info.data_event.device_ptr = (void *) ptr;

      api_info->device_api = acc_device_api_other;

      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
                                            api_info);
    }

  return true;
}

/* Copy data from DEVICE to host.  */

bool
GOMP_OFFLOAD_dev2host (int device, void *dst, const void *src, size_t n)
{
  GCN_DEBUG ("Copying %zu bytes from device %d (%p) to host (%p)\n", n, device,
             src, dst);
  hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
  if (status != HSA_STATUS_SUCCESS)
    GOMP_PLUGIN_error ("memory copy failed");
  return true;
}

/* Copy data from host to DEVICE.  */

bool
GOMP_OFFLOAD_host2dev (int device, void *dst, const void *src, size_t n)
{
  GCN_DEBUG ("Copying %zu bytes from host (%p) to device %d (%p)\n", n, src,
             device, dst);
  hsa_memory_copy_wrapper (dst, src, n);
  return true;
}

/* Copy data within DEVICE.  Do the copy asynchronously, if appropriate.  */

bool
GOMP_OFFLOAD_dev2dev (int device, void *dst, const void *src, size_t n)
{
  struct gcn_thread *thread_data = gcn_thread ();

  if (thread_data && !async_synchronous_p (thread_data->async))
    {
      struct agent_info *agent = get_agent_info (device);
      maybe_init_omp_async (agent);
      queue_push_copy (agent->omp_async_queue, dst, src, n);
      return true;
    }

  GCN_DEBUG ("Copying %zu bytes from device %d (%p) to device %d (%p)\n", n,
             device, src, device, dst);
  hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
  if (status != HSA_STATUS_SUCCESS)
    GOMP_PLUGIN_error ("memory copy failed");
  return true;
}

/* Here <quantity>_size refers to <quantity> multiplied by size -- i.e.
   measured in bytes.  So we have:

     dim1_size: number of bytes to copy on innermost dimension ("row")
     dim0_len: number of rows to copy
     dst: base pointer for destination of copy
     dst_offset1_size: innermost row offset (for dest), in bytes
     dst_offset0_len: offset, number of rows (for dest)
     dst_dim1_size: whole-array dest row length, in bytes (pitch)
     src: base pointer for source of copy
     src_offset1_size: innermost row offset (for source), in bytes
     src_offset0_len: offset, number of rows (for source)
     src_dim1_size: whole-array source row length, in bytes (pitch)
*/
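
/* For example (purely illustrative values): copying a 2x3 block of 4-byte
   elements between 10x10 arrays of the same element type, with the block
   starting at row 5, column 5 on both sides, would use dim1_size = 12,
   dim0_len = 2, {src,dst}_dim1_size = 40, {src,dst}_offset0_len = 5 and
   {src,dst}_offset1_size = 20.  */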

int
GOMP_OFFLOAD_memcpy2d (int dst_ord, int src_ord, size_t dim1_size,
                       size_t dim0_len, void *dst, size_t dst_offset1_size,
                       size_t dst_offset0_len, size_t dst_dim1_size,
                       const void *src, size_t src_offset1_size,
                       size_t src_offset0_len, size_t src_dim1_size)
{
  if (!hsa_fns.hsa_amd_memory_lock_fn
      || !hsa_fns.hsa_amd_memory_unlock_fn
      || !hsa_fns.hsa_amd_memory_async_copy_rect_fn)
    return -1;

  /* GCN hardware requires 4-byte alignment for base addresses & pitches.  Bail
     out quietly if we have anything oddly-aligned rather than letting the
     driver raise an error.  */
  if ((((uintptr_t) dst) & 3) != 0 || (((uintptr_t) src) & 3) != 0)
    return -1;

  if ((dst_dim1_size & 3) != 0 || (src_dim1_size & 3) != 0)
    return -1;

  /* Only handle host to device or device to host transfers here.  */
  if ((dst_ord == -1 && src_ord == -1)
      || (dst_ord != -1 && src_ord != -1))
    return -1;

  hsa_amd_copy_direction_t dir
    = (src_ord == -1) ? hsaHostToDevice : hsaDeviceToHost;
  hsa_agent_t copy_agent;

  /* We need to pin (lock) host memory before we start the transfer.  Try to
     lock the minimum size necessary, i.e. using partial first/last rows of the
     whole array.  Something like this:

           rows -->
           ..............
       c | ..#######+++++ <- first row apart from {src,dst}_offset1_size
       o | ++#######+++++ <- whole row
       l | ++#######+++++ <- "
       s v ++#######..... <- last row apart from trailing remainder
           ..............

     We could split very large transfers into several rectangular copies, but
     that is unimplemented for now.  */
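
  /* E.g., with illustrative numbers: a host-to-device copy with
     src_dim1_size = 1024, dim0_len = 4 and dim1_size = 100 pins
     1024 * (4 - 1) + 100 = 3172 bytes (see bounded_size_host below),
     rather than the 4096 bytes of four whole rows.  */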

  size_t bounded_size_host, first_elem_offset_host;
  void *host_ptr;
  if (dir == hsaHostToDevice)
    {
      bounded_size_host = src_dim1_size * (dim0_len - 1) + dim1_size;
      first_elem_offset_host = src_offset0_len * src_dim1_size
                               + src_offset1_size;
      host_ptr = (void *) src;
      struct agent_info *agent = get_agent_info (dst_ord);
      copy_agent = agent->id;
    }
  else
    {
      bounded_size_host = dst_dim1_size * (dim0_len - 1) + dim1_size;
      first_elem_offset_host = dst_offset0_len * dst_dim1_size
                               + dst_offset1_size;
      host_ptr = dst;
      struct agent_info *agent = get_agent_info (src_ord);
      copy_agent = agent->id;
    }

  void *agent_ptr;

  hsa_status_t status
    = hsa_fns.hsa_amd_memory_lock_fn (host_ptr + first_elem_offset_host,
                                      bounded_size_host, NULL, 0, &agent_ptr);
  /* We can't lock the host memory: don't give up though, we might still be
     able to use the slow path in our caller.  So, don't make this an
     error.  */
  if (status != HSA_STATUS_SUCCESS)
    return -1;

  hsa_pitched_ptr_t dstpp, srcpp;
  hsa_dim3_t dst_offsets, src_offsets, ranges;

  int retval = 1;

  hsa_signal_t completion_signal;
  status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &completion_signal);
  if (status != HSA_STATUS_SUCCESS)
    {
      retval = -1;
      goto unlock;
    }

  if (dir == hsaHostToDevice)
    {
      srcpp.base = agent_ptr - first_elem_offset_host;
      dstpp.base = dst;
    }
  else
    {
      srcpp.base = (void *) src;
      dstpp.base = agent_ptr - first_elem_offset_host;
    }

  srcpp.pitch = src_dim1_size;
  srcpp.slice = 0;

  src_offsets.x = src_offset1_size;
  src_offsets.y = src_offset0_len;
  src_offsets.z = 0;

  dstpp.pitch = dst_dim1_size;
  dstpp.slice = 0;

  dst_offsets.x = dst_offset1_size;
  dst_offsets.y = dst_offset0_len;
  dst_offsets.z = 0;

  ranges.x = dim1_size;
  ranges.y = dim0_len;
  ranges.z = 1;
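
  /* In these hsa_dim3_t values, x is measured in bytes, y in rows, and z in
     slices; a 2D copy is a single slice, hence ranges.z = 1.  */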

  status
    = hsa_fns.hsa_amd_memory_async_copy_rect_fn (&dstpp, &dst_offsets, &srcpp,
                                                 &src_offsets, &ranges,
                                                 copy_agent, dir, 0, NULL,
                                                 completion_signal);
  /* If the rectangular copy fails, we might still be able to use the slow
     path.  We need to unlock the host memory though, so don't return
     immediately.  */
  if (status != HSA_STATUS_SUCCESS)
    retval = -1;
  else
    hsa_fns.hsa_signal_wait_acquire_fn (completion_signal,
                                        HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX,
                                        HSA_WAIT_STATE_ACTIVE);

  hsa_fns.hsa_signal_destroy_fn (completion_signal);

unlock:
  status = hsa_fns.hsa_amd_memory_unlock_fn (host_ptr + first_elem_offset_host);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not unlock host memory", status);

  return retval;
}

/* As above, <quantity>_size refers to <quantity> multiplied by size -- i.e.
   measured in bytes.  So we have:

     dim2_size: number of bytes to copy on innermost dimension ("row")
     dim1_len: number of rows per slice to copy
     dim0_len: number of slices to copy
     dst: base pointer for destination of copy
     dst_offset2_size: innermost row offset (for dest), in bytes
     dst_offset1_len: offset, number of rows (for dest)
     dst_offset0_len: offset, number of slices (for dest)
     dst_dim2_size: whole-array dest row length, in bytes (pitch)
     dst_dim1_len: whole-array number of rows in slice (for dest)
     src: base pointer for source of copy
     src_offset2_size: innermost row offset (for source), in bytes
     src_offset1_len: offset, number of rows (for source)
     src_offset0_len: offset, number of slices (for source)
     src_dim2_size: whole-array source row length, in bytes (pitch)
     src_dim1_len: whole-array number of rows in slice (for source)
*/
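
/* Continuing the 2D example above (again with purely illustrative values):
   copying a block of 2 slices x 3 rows x 4 elements of 4 bytes each between
   10x10x10 arrays would use dim2_size = 16, dim1_len = 3, dim0_len = 2,
   {src,dst}_dim2_size = 40 and {src,dst}_dim1_len = 10.  */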

int
GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size,
                       size_t dim1_len, size_t dim0_len, void *dst,
                       size_t dst_offset2_size, size_t dst_offset1_len,
                       size_t dst_offset0_len, size_t dst_dim2_size,
                       size_t dst_dim1_len, const void *src,
                       size_t src_offset2_size, size_t src_offset1_len,
                       size_t src_offset0_len, size_t src_dim2_size,
                       size_t src_dim1_len)
{
  if (!hsa_fns.hsa_amd_memory_lock_fn
      || !hsa_fns.hsa_amd_memory_unlock_fn
      || !hsa_fns.hsa_amd_memory_async_copy_rect_fn)
    return -1;

  /* GCN hardware requires 4-byte alignment for base addresses & pitches.  Bail
     out quietly if we have anything oddly-aligned rather than letting the
     driver raise an error.  */
  if ((((uintptr_t) dst) & 3) != 0 || (((uintptr_t) src) & 3) != 0)
    return -1;

  if ((dst_dim2_size & 3) != 0 || (src_dim2_size & 3) != 0)
    return -1;

  /* Only handle host to device or device to host transfers here.  */
  if ((dst_ord == -1 && src_ord == -1)
      || (dst_ord != -1 && src_ord != -1))
    return -1;

  hsa_amd_copy_direction_t dir
    = (src_ord == -1) ? hsaHostToDevice : hsaDeviceToHost;
  hsa_agent_t copy_agent;

  /* We need to pin (lock) host memory before we start the transfer.  Try to
     lock the minimum size necessary, i.e. using partial first/last slices of
     the whole 3D array.  Something like this:

            slice 0:        slice 1:        slice 2:
            __________      __________      __________
        ^  /+++++++++/  :  /+++++++++/  :  /         /
  column  /+++##++++/|  | /+++##++++/|  | /+++##    /    # = subarray
         / / ##++++/ |  |/+++##++++/ |  |/+++##++++/     + = area to pin
        /_________/  :  /_________/  :  /_________/
             row --->

     We could split very large transfers into several rectangular copies, but
     that is unimplemented for now.  */
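
  /* E.g., with illustrative numbers: for 10-row slices with a 1024-byte
     pitch, copying dim0_len = 2 slices of dim1_len = 3 rows and
     dim2_size = 100 bytes pins 10240 * (2 - 1) + 1024 * (3 - 1) + 100
     = 12388 bytes (see bounded_size_host below).  */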

  size_t bounded_size_host, first_elem_offset_host;
  void *host_ptr;
  if (dir == hsaHostToDevice)
    {
      size_t slice_bytes = src_dim2_size * src_dim1_len;
      bounded_size_host = slice_bytes * (dim0_len - 1)
                          + src_dim2_size * (dim1_len - 1)
                          + dim2_size;
      first_elem_offset_host = src_offset0_len * slice_bytes
                               + src_offset1_len * src_dim2_size
                               + src_offset2_size;
      host_ptr = (void *) src;
      struct agent_info *agent = get_agent_info (dst_ord);
      copy_agent = agent->id;
    }
  else
    {
      size_t slice_bytes = dst_dim2_size * dst_dim1_len;
      bounded_size_host = slice_bytes * (dim0_len - 1)
                          + dst_dim2_size * (dim1_len - 1)
                          + dim2_size;
      first_elem_offset_host = dst_offset0_len * slice_bytes
                               + dst_offset1_len * dst_dim2_size
                               + dst_offset2_size;
      host_ptr = dst;
      struct agent_info *agent = get_agent_info (src_ord);
      copy_agent = agent->id;
    }

  void *agent_ptr;

  hsa_status_t status
    = hsa_fns.hsa_amd_memory_lock_fn (host_ptr + first_elem_offset_host,
                                      bounded_size_host, NULL, 0, &agent_ptr);
  /* We can't lock the host memory: don't give up though, we might still be
     able to use the slow path in our caller (maybe even with iterated memcpy2d
     calls).  So, don't make this an error.  */
  if (status != HSA_STATUS_SUCCESS)
    return -1;

  hsa_pitched_ptr_t dstpp, srcpp;
  hsa_dim3_t dst_offsets, src_offsets, ranges;

  int retval = 1;

  hsa_signal_t completion_signal;
  status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &completion_signal);
  if (status != HSA_STATUS_SUCCESS)
    {
      retval = -1;
      goto unlock;
    }

  if (dir == hsaHostToDevice)
    {
      srcpp.base = agent_ptr - first_elem_offset_host;
      dstpp.base = dst;
    }
  else
    {
      srcpp.base = (void *) src;
      dstpp.base = agent_ptr - first_elem_offset_host;
    }

  /* Pitch is measured in bytes.  */
  srcpp.pitch = src_dim2_size;
  /* Slice is also measured in bytes (i.e. total per-slice).  */
  srcpp.slice = src_dim2_size * src_dim1_len;

  src_offsets.x = src_offset2_size;
  src_offsets.y = src_offset1_len;
  src_offsets.z = src_offset0_len;

  /* As above.  */
  dstpp.pitch = dst_dim2_size;
  dstpp.slice = dst_dim2_size * dst_dim1_len;

  dst_offsets.x = dst_offset2_size;
  dst_offsets.y = dst_offset1_len;
  dst_offsets.z = dst_offset0_len;

  ranges.x = dim2_size;
  ranges.y = dim1_len;
  ranges.z = dim0_len;

  status
    = hsa_fns.hsa_amd_memory_async_copy_rect_fn (&dstpp, &dst_offsets, &srcpp,
                                                 &src_offsets, &ranges,
                                                 copy_agent, dir, 0, NULL,
                                                 completion_signal);
  /* If the rectangular copy fails, we might still be able to use the slow
     path.  We need to unlock the host memory though, so don't return
     immediately.  */
  if (status != HSA_STATUS_SUCCESS)
    retval = -1;
  else
    {
      hsa_signal_value_t sv
        = hsa_fns.hsa_signal_wait_acquire_fn (completion_signal,
                                              HSA_SIGNAL_CONDITION_LT, 1,
                                              UINT64_MAX,
                                              HSA_WAIT_STATE_ACTIVE);
      if (sv < 0)
        {
          GCN_WARNING ("async copy rect failure");
          retval = -1;
        }
    }

  hsa_fns.hsa_signal_destroy_fn (completion_signal);

unlock:
  status = hsa_fns.hsa_amd_memory_unlock_fn (host_ptr + first_elem_offset_host);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not unlock host memory", status);

  return retval;
}

/* }}} */
/* {{{ OpenMP Plugin API  */

/* Run a synchronous OpenMP kernel on DEVICE and pass it an array of pointers
   in VARS as a parameter.  The kernel is identified by FN_PTR which must point
   to a kernel_info structure, and must have previously been loaded to the
   specified device.  */

void
GOMP_OFFLOAD_run (int device, void *fn_ptr, void *vars, void **args)
{
  struct agent_info *agent = get_agent_info (device);
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
  struct GOMP_kernel_launch_attributes def;
  struct GOMP_kernel_launch_attributes *kla;
  assert (agent == kernel->agent);

  /* If we get here then the kernel must be OpenMP.  */
  kernel->kind = KIND_OPENMP;

  if (!parse_target_attributes (args, &def, &kla, agent))
    {
      GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
      return;
    }
  run_kernel (kernel, vars, kla, NULL, false);
}

/* Run an asynchronous OpenMP kernel on DEVICE.  This is similar to
   GOMP_OFFLOAD_run except that the launch is queued and there is a call to
   GOMP_PLUGIN_target_task_completion when it has finished.  */

void
GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
                        void **args, void *async_data)
{
  GCN_DEBUG ("GOMP_OFFLOAD_async_run invoked\n");
  struct agent_info *agent = get_agent_info (device);
  struct kernel_info *kernel = (struct kernel_info *) tgt_fn;
  struct GOMP_kernel_launch_attributes def;
  struct GOMP_kernel_launch_attributes *kla;
  assert (agent == kernel->agent);

  /* If we get here then the kernel must be OpenMP.  */
  kernel->kind = KIND_OPENMP;

  if (!parse_target_attributes (args, &def, &kla, agent))
    {
      GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
      return;
    }

  maybe_init_omp_async (agent);
  queue_push_launch (agent->omp_async_queue, kernel, tgt_vars, kla);
  queue_push_callback (agent->omp_async_queue,
                       GOMP_PLUGIN_target_task_completion, async_data);
}

/* }}} */
/* {{{ OpenACC Plugin API  */

/* Run a synchronous OpenACC kernel.  The device number is inferred from the
   already-loaded KERNEL.  */

void
GOMP_OFFLOAD_openacc_exec (void (*fn_ptr) (void *),
                           size_t mapnum __attribute__((unused)),
                           void **hostaddrs __attribute__((unused)),
                           void **devaddrs, unsigned *dims,
                           void *targ_mem_desc)
{
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;

  gcn_exec (kernel, devaddrs, dims, targ_mem_desc, false, NULL);
}

/* Run an asynchronous OpenACC kernel on the specified queue.  */

void
GOMP_OFFLOAD_openacc_async_exec (void (*fn_ptr) (void *),
                                 size_t mapnum __attribute__((unused)),
                                 void **hostaddrs __attribute__((unused)),
                                 void **devaddrs,
                                 unsigned *dims, void *targ_mem_desc,
                                 struct goacc_asyncqueue *aq)
{
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;

  gcn_exec (kernel, devaddrs, dims, targ_mem_desc, true, aq);
}

/* Create a new asynchronous thread and queue for running future kernels.  */

struct goacc_asyncqueue *
GOMP_OFFLOAD_openacc_async_construct (int device)
{
  struct agent_info *agent = get_agent_info (device);

  pthread_mutex_lock (&agent->async_queues_mutex);

  struct goacc_asyncqueue *aq = GOMP_PLUGIN_malloc (sizeof (*aq));
  aq->agent = get_agent_info (device);
  aq->prev = NULL;
  aq->next = agent->async_queues;
  if (aq->next)
    {
      aq->next->prev = aq;
      aq->id = aq->next->id + 1;
    }
  else
    aq->id = 1;
  agent->async_queues = aq;
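
  /* Queues are kept on a per-agent doubly-linked list, with IDs assigned in
     increasing order of creation; the newest queue becomes the list head.  */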

  aq->queue_first = 0;
  aq->queue_n = 0;
  aq->drain_queue_stop = 0;

  if (pthread_mutex_init (&aq->mutex, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
      return NULL;
    }
  if (pthread_cond_init (&aq->queue_cond_in, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
      return NULL;
    }
  if (pthread_cond_init (&aq->queue_cond_out, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
      return NULL;
    }

  hsa_status_t status = hsa_fns.hsa_queue_create_fn (agent->id,
                                                     ASYNC_QUEUE_SIZE,
                                                     HSA_QUEUE_TYPE_MULTI,
                                                     hsa_queue_callback, NULL,
                                                     UINT32_MAX, UINT32_MAX,
                                                     &aq->hsa_queue);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Error creating command queue", status);

  int err = pthread_create (&aq->thread_drain_queue, NULL, &drain_queue, aq);
  if (err != 0)
    GOMP_PLUGIN_fatal ("GCN asynchronous thread creation failed: %s",
                       strerror (err));
  GCN_DEBUG ("Async thread %d:%d: created\n", aq->agent->device_id,
             aq->id);

  pthread_mutex_unlock (&agent->async_queues_mutex);

  return aq;
}

/* Destroy an existing asynchronous thread and queue.  Waits for any
   currently-running task to complete, but cancels any queued tasks.  */

bool
GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = aq->agent;

  finalize_async_thread (aq);

  pthread_mutex_lock (&agent->async_queues_mutex);

  int err;
  if ((err = pthread_mutex_destroy (&aq->mutex)))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue mutex: %d", err);
      goto fail;
    }
  if (pthread_cond_destroy (&aq->queue_cond_in))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
      goto fail;
    }
  if (pthread_cond_destroy (&aq->queue_cond_out))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
      goto fail;
    }
  hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (aq->hsa_queue);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Error destroying command queue", status);
      goto fail;
    }

  if (aq->prev)
    aq->prev->next = aq->next;
  if (aq->next)
    aq->next->prev = aq->prev;
  if (agent->async_queues == aq)
    agent->async_queues = aq->next;

  GCN_DEBUG ("Async thread %d:%d: destroyed\n", agent->device_id, aq->id);

  free (aq);
  pthread_mutex_unlock (&agent->async_queues_mutex);
  return true;

fail:
  pthread_mutex_unlock (&agent->async_queues_mutex);
  return false;
}

/* Return true if the specified async queue is currently empty.  */

int
GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
{
  return queue_empty (aq);
}

/* Block until the specified queue has executed all its tasks and the
   queue is empty.  */

bool
GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
{
  wait_queue (aq);
  return true;
}

/* Add a serialization point across two async queues.  Any new tasks added to
   AQ2, after this call, will not run until all tasks on AQ1, at the time
   of this call, have completed.  */

bool
GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
                                      struct goacc_asyncqueue *aq2)
{
  /* For serialize, stream aq2 waits for aq1 to complete work that has been
     scheduled to run on it up to this point.  */
  if (aq1 != aq2)
    {
      struct placeholder *placeholderp = queue_push_placeholder (aq1);
      queue_push_asyncwait (aq2, placeholderp);
    }
  return true;
}
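
/* In other words (an informal gloss of the mechanism above): the placeholder
   completes when AQ1 drains down to it, and the asyncwait entry keeps AQ2
   from proceeding past that point until the placeholder has completed.  */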

/* Add an opaque callback to the given async queue.  */

void
GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
                                           void (*fn) (void *), void *data)
{
  queue_push_callback (aq, fn, data);
}

/* Queue up an asynchronous data copy from host to DEVICE.  */

bool
GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
                                     size_t n, struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = get_agent_info (device);
  assert (agent == aq->agent);
  queue_push_copy (aq, dst, src, n);
  return true;
}

/* Queue up an asynchronous data copy from DEVICE to host.  */

bool
GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src,
                                     size_t n, struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = get_agent_info (device);
  assert (agent == aq->agent);
  queue_push_copy (aq, dst, src, n);
  return true;
}
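
/* Return the value of property PROP of the device numbered DEVICE.  */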
union goacc_property_value
GOMP_OFFLOAD_openacc_get_property (int device, enum goacc_property prop)
{
  struct agent_info *agent = get_agent_info (device);

  union goacc_property_value propval = { .val = 0 };

  switch (prop)
    {
    case GOACC_PROPERTY_FREE_MEMORY:
      /* Not supported.  */
      break;
    case GOACC_PROPERTY_MEMORY:
      {
        size_t size;
        hsa_region_t region = agent->data_region;
        hsa_status_t status =
          hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
        if (status == HSA_STATUS_SUCCESS)
          propval.val = size;
        break;
      }
    case GOACC_PROPERTY_NAME:
      propval.ptr = agent->name;
      break;
    case GOACC_PROPERTY_VENDOR:
      propval.ptr = agent->vendor_name;
      break;
    case GOACC_PROPERTY_DRIVER:
      propval.ptr = hsa_context.driver_version_s;
      break;
    }

  return propval;
}

/* Set up plugin-specific thread-local-data (host-side).  */

void *
GOMP_OFFLOAD_openacc_create_thread_data (int ord __attribute__((unused)))
{
  struct gcn_thread *thread_data
    = GOMP_PLUGIN_malloc (sizeof (struct gcn_thread));

  thread_data->async = GOMP_ASYNC_SYNC;

  return (void *) thread_data;
}

/* Clean up plugin-specific thread-local-data.  */

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}

/* }}} */