/* Plugin for AMD GCN execution.

   Copyright (C) 2013-2024 Free Software Foundation, Inc.

   Contributed by Mentor Embedded

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* {{{ Includes and defines */

#include "config.h"
#include "symcat.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <inttypes.h>
#include <stdbool.h>
#include <limits.h>
#include <hsa.h>
#include <hsa_ext_amd.h>
#include <dlfcn.h>
#include <signal.h>
#include "libgomp-plugin.h"
#include "config/gcn/libgomp-gcn.h"  /* For struct output.  */
#include "gomp-constants.h"
#include <elf.h>
#include "oacc-plugin.h"
#include "oacc-int.h"
#include <assert.h>

/* These probably won't be in elf.h for a while.  */
#ifndef R_AMDGPU_NONE
#define R_AMDGPU_NONE		0
#define R_AMDGPU_ABS32_LO	1	/* (S + A) & 0xFFFFFFFF  */
#define R_AMDGPU_ABS32_HI	2	/* (S + A) >> 32  */
#define R_AMDGPU_ABS64		3	/* S + A  */
#define R_AMDGPU_REL32		4	/* S + A - P  */
#define R_AMDGPU_REL64		5	/* S + A - P  */
#define R_AMDGPU_ABS32		6	/* S + A  */
#define R_AMDGPU_GOTPCREL	7	/* G + GOT + A - P  */
#define R_AMDGPU_GOTPCREL32_LO	8	/* (G + GOT + A - P) & 0xFFFFFFFF  */
#define R_AMDGPU_GOTPCREL32_HI	9	/* (G + GOT + A - P) >> 32  */
#define R_AMDGPU_REL32_LO	10	/* (S + A - P) & 0xFFFFFFFF  */
#define R_AMDGPU_REL32_HI	11	/* (S + A - P) >> 32  */
#define R_AMDGPU_RELATIVE64	13	/* B + A  */
#endif
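
/* For illustration only (not part of the upstream source): the LO/HI
   relocation pairs above split a 64-bit value into two 32-bit halves.
   A minimal sketch of how a loader could compute the R_AMDGPU_ABS32_LO
   and R_AMDGPU_ABS32_HI words, where S is the symbol value and A the
   addend:

     static inline uint32_t
     reloc_abs32_lo (uint64_t s, uint64_t a)
     {
       return (uint32_t) ((s + a) & 0xFFFFFFFF);
     }

     static inline uint32_t
     reloc_abs32_hi (uint64_t s, uint64_t a)
     {
       return (uint32_t) ((s + a) >> 32);
     }
*/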

/* GCN specific definitions for asynchronous queues.  */

#define ASYNC_QUEUE_SIZE 64
#define DRAIN_QUEUE_SYNCHRONOUS_P false
#define DEBUG_QUEUES 0
#define DEBUG_THREAD_SLEEP 0
#define DEBUG_THREAD_SIGNAL 0

/* Defaults.  */
#define DEFAULT_GCN_HEAP_SIZE (100*1024*1024)  /* 100MB.  */

/* Secure getenv() which returns NULL if running as SUID/SGID.  */
#ifndef HAVE_SECURE_GETENV
#ifdef HAVE___SECURE_GETENV
#define secure_getenv __secure_getenv
#elif defined (HAVE_UNISTD_H) && defined(HAVE_GETUID) && defined(HAVE_GETEUID) \
  && defined(HAVE_GETGID) && defined(HAVE_GETEGID)

#include <unistd.h>

/* Implementation of secure_getenv() for targets where it is not provided but
   we have at least the means to test real and effective IDs.  */

static char *
secure_getenv (const char *name)
{
  if ((getuid () == geteuid ()) && (getgid () == getegid ()))
    return getenv (name);
  else
    return NULL;
}

#else
#define secure_getenv getenv
#endif
#endif

/* }}} */
/* {{{ Types */

/* GCN-specific implementation of the GOMP_PLUGIN_acc_thread data.  */

struct gcn_thread
{
  /* The thread number from the async clause, or GOMP_ASYNC_SYNC.  */
  int async;
};

/* As the HSA runtime is dlopened, the following structure defines the
   function pointers utilized by the plug-in.  */

struct hsa_runtime_fn_info
{
  /* HSA runtime.  */
  hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
					const char **status_string);
  hsa_status_t (*hsa_system_get_info_fn) (hsa_system_info_t attribute,
					  void *value);
  hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
					 hsa_agent_info_t attribute,
					 void *value);
  hsa_status_t (*hsa_isa_get_info_fn) (hsa_isa_t isa,
				       hsa_isa_info_t attribute,
				       uint32_t index,
				       void *value);
  hsa_status_t (*hsa_init_fn) (void);
  hsa_status_t (*hsa_iterate_agents_fn)
    (hsa_status_t (*callback) (hsa_agent_t agent, void *data), void *data);
  hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
					  hsa_region_info_t attribute,
					  void *value);
  hsa_status_t (*hsa_queue_create_fn)
    (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
     void (*callback) (hsa_status_t status, hsa_queue_t *source, void *data),
     void *data, uint32_t private_segment_size,
     uint32_t group_segment_size, hsa_queue_t **queue);
  hsa_status_t (*hsa_agent_iterate_regions_fn)
    (hsa_agent_t agent,
     hsa_status_t (*callback) (hsa_region_t region, void *data), void *data);
  hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
  hsa_status_t (*hsa_executable_create_fn)
    (hsa_profile_t profile, hsa_executable_state_t executable_state,
     const char *options, hsa_executable_t *executable);
  hsa_status_t (*hsa_executable_global_variable_define_fn)
    (hsa_executable_t executable, const char *variable_name, void *address);
  hsa_status_t (*hsa_executable_load_code_object_fn)
    (hsa_executable_t executable, hsa_agent_t agent,
     hsa_code_object_t code_object, const char *options);
  hsa_status_t (*hsa_executable_freeze_fn) (hsa_executable_t executable,
					    const char *options);
  hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
					uint32_t num_consumers,
					const hsa_agent_t *consumers,
					hsa_signal_t *signal);
  hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
					  void **ptr);
  hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent,
					      hsa_access_permission_t access);
  hsa_status_t (*hsa_memory_copy_fn) (void *dst, const void *src, size_t size);
  hsa_status_t (*hsa_memory_free_fn) (void *ptr);
  hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
  hsa_status_t (*hsa_executable_get_symbol_fn)
    (hsa_executable_t executable, const char *module_name,
     const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
     hsa_executable_symbol_t *symbol);
  hsa_status_t (*hsa_executable_symbol_get_info_fn)
    (hsa_executable_symbol_t executable_symbol,
     hsa_executable_symbol_info_t attribute, void *value);
  hsa_status_t (*hsa_executable_iterate_symbols_fn)
    (hsa_executable_t executable,
     hsa_status_t (*callback) (hsa_executable_t executable,
			       hsa_executable_symbol_t symbol, void *data),
     void *data);
  uint64_t (*hsa_queue_add_write_index_release_fn) (const hsa_queue_t *queue,
						    uint64_t value);
  uint64_t (*hsa_queue_load_read_index_acquire_fn) (const hsa_queue_t *queue);
  void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
				       hsa_signal_value_t value);
  void (*hsa_signal_store_release_fn) (hsa_signal_t signal,
				       hsa_signal_value_t value);
  hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
    (hsa_signal_t signal, hsa_signal_condition_t condition,
     hsa_signal_value_t compare_value, uint64_t timeout_hint,
     hsa_wait_state_t wait_state_hint);
  hsa_signal_value_t (*hsa_signal_load_acquire_fn) (hsa_signal_t signal);
  hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);

  hsa_status_t (*hsa_code_object_deserialize_fn)
    (void *serialized_code_object, size_t serialized_code_object_size,
     const char *options, hsa_code_object_t *code_object);
  hsa_status_t (*hsa_amd_memory_lock_fn)
    (void *host_ptr, size_t size, hsa_agent_t *agents, int num_agent,
     void **agent_ptr);
  hsa_status_t (*hsa_amd_memory_unlock_fn) (void *host_ptr);
  hsa_status_t (*hsa_amd_memory_async_copy_rect_fn)
    (const hsa_pitched_ptr_t *dst, const hsa_dim3_t *dst_offset,
     const hsa_pitched_ptr_t *src, const hsa_dim3_t *src_offset,
     const hsa_dim3_t *range, hsa_agent_t copy_agent,
     hsa_amd_copy_direction_t dir, uint32_t num_dep_signals,
     const hsa_signal_t *dep_signals, hsa_signal_t completion_signal);
};
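
/* For illustration only: each *_fn member above is filled in at run time
   with dlsym, as done by init_hsa_runtime_functions further below.  A
   minimal sketch of the pattern for a single entry point:

     void *handle = dlopen ("libhsa-runtime64.so.1", RTLD_LAZY);
     if (handle)
       hsa_fns.hsa_init_fn
	 = (hsa_status_t (*) (void)) dlsym (handle, "hsa_init");
*/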

/* Structure describing the run-time and grid properties of an HSA kernel
   launch.  This needs to match the format passed to GOMP_OFFLOAD_run.  */

struct GOMP_kernel_launch_attributes
{
  /* Number of dimensions the workload has.  Maximum number is 3.  */
  uint32_t ndim;
  /* Size of the grid in the three respective dimensions.  */
  uint32_t gdims[3];
  /* Size of work-groups in the respective dimensions.  */
  uint32_t wdims[3];
};
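
/* For illustration only: a hypothetical 2-D launch of a 1024x768 grid in
   8x8 work-groups would be described as follows (the unused third
   dimension is left at 1):

     struct GOMP_kernel_launch_attributes kla;
     kla.ndim = 2;
     kla.gdims[0] = 1024;  kla.gdims[1] = 768;  kla.gdims[2] = 1;
     kla.wdims[0] = 8;     kla.wdims[1] = 8;    kla.wdims[2] = 1;
*/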

/* Collection of information needed for a dispatch of a kernel from a
   kernel.  */

struct kernel_dispatch
{
  struct agent_info *agent;
  /* Pointer to a command queue associated with a kernel dispatch agent.  */
  void *queue;
  /* Pointer to a memory space used for kernel arguments passing.  */
  void *kernarg_address;
  /* Kernel object.  */
  uint64_t object;
  /* Synchronization signal used for dispatch synchronization.  */
  uint64_t signal;
  /* Private segment size.  */
  uint32_t private_segment_size;
  /* Group segment size.  */
  uint32_t group_segment_size;
};

/* Structure of the kernargs segment, supporting console output.

   This needs to match the definitions in Newlib, and the expectations
   in libgomp target code.  */

struct kernargs {
  struct kernargs_abi abi;

  /* Output data.  */
  struct output output_data;
};

/* A queue entry for a future asynchronous launch.  */

struct kernel_launch
{
  struct kernel_info *kernel;
  void *vars;
  struct GOMP_kernel_launch_attributes kla;
};

/* A queue entry for a future callback.  */

struct callback
{
  void (*fn)(void *);
  void *data;
};

/* A data struct for the copy_data callback.  */

struct copy_data
{
  void *dst;
  const void *src;
  size_t len;
  struct goacc_asyncqueue *aq;
};

/* A queue entry for a placeholder.  These correspond to a wait event.  */

struct placeholder
{
  int executed;
  pthread_cond_t cond;
  pthread_mutex_t mutex;
};

/* A queue entry for a wait directive.  */

struct asyncwait_info
{
  struct placeholder *placeholderp;
};

/* Encode the type of an entry in an async queue.  */

enum entry_type
{
  KERNEL_LAUNCH,
  CALLBACK,
  ASYNC_WAIT,
  ASYNC_PLACEHOLDER
};

/* An entry in an async queue.  */

struct queue_entry
{
  enum entry_type type;
  union {
    struct kernel_launch launch;
    struct callback callback;
    struct asyncwait_info asyncwait;
    struct placeholder placeholder;
  } u;
};

/* An async queue header.

   OpenMP may create one of these.
   OpenACC may create many.  */

struct goacc_asyncqueue
{
  struct agent_info *agent;
  hsa_queue_t *hsa_queue;

  pthread_t thread_drain_queue;
  pthread_mutex_t mutex;
  pthread_cond_t queue_cond_in;
  pthread_cond_t queue_cond_out;
  struct queue_entry queue[ASYNC_QUEUE_SIZE];
  int queue_first;
  int queue_n;
  int drain_queue_stop;

  int id;
  struct goacc_asyncqueue *prev;
  struct goacc_asyncqueue *next;
};
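
/* For illustration only: QUEUE_FIRST and QUEUE_N implement a ring buffer
   over the fixed-size QUEUE array, guarded by MUTEX and the two condition
   variables.  A sketch of the index arithmetic a producer and the drain
   thread would use (assuming the mutex is held):

     // Producer: claim the slot after the last live entry.
     struct queue_entry *slot
       = &aq->queue[(aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE];
     aq->queue_n++;

     // Consumer: pop the oldest entry.
     struct queue_entry *next = &aq->queue[aq->queue_first];
     aq->queue_first = (aq->queue_first + 1) % ASYNC_QUEUE_SIZE;
     aq->queue_n--;
*/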

/* Mkoffload uses this structure to describe a kernel.

   OpenMP kernel dimensions are passed at runtime.
   OpenACC kernel dimensions are passed at compile time, here.  */

struct hsa_kernel_description
{
  const char *name;
  int oacc_dims[3];  /* Only present for GCN kernels.  */
  int sgpr_count;
  int vpgr_count;
};

/* Mkoffload uses this structure to describe an offload variable.  */

struct global_var_info
{
  const char *name;
  void *address;
};

/* Mkoffload uses this structure to describe all the kernels in a
   loadable module.  These are passed to libgomp via static constructors.  */

struct gcn_image_desc
{
  struct gcn_image {
    size_t size;
    void *image;
  } *gcn_image;
  const unsigned kernel_count;
  struct hsa_kernel_description *kernel_infos;
  const unsigned ind_func_count;
  const unsigned global_variable_count;
};

/* This enum mirrors the corresponding LLVM enum's values for all ISAs that we
   support.
   See https://llvm.org/docs/AMDGPUUsage.html#amdgpu-ef-amdgpu-mach-table  */

typedef enum {
  EF_AMDGPU_MACH_UNSUPPORTED = -1,
  EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a,
  EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c,
  EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f,
  EF_AMDGPU_MACH_AMDGCN_GFX908 = 0x030,
  EF_AMDGPU_MACH_AMDGCN_GFX90a = 0x03f,
  EF_AMDGPU_MACH_AMDGCN_GFX1030 = 0x036,
  EF_AMDGPU_MACH_AMDGCN_GFX1036 = 0x045,
  EF_AMDGPU_MACH_AMDGCN_GFX1100 = 0x041,
  EF_AMDGPU_MACH_AMDGCN_GFX1103 = 0x044
} EF_AMDGPU_MACH;

const static int EF_AMDGPU_MACH_MASK = 0x000000ff;
typedef EF_AMDGPU_MACH gcn_isa;
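
/* For illustration only: the ISA of a code object is encoded in the low
   byte of the ELF header's e_flags field, which elf_gcn_isa_field below
   extracts, e.g.:

     Elf64_Ehdr *image = ...;  // hypothetical loaded GCN code object
     gcn_isa isa = image->e_flags & EF_AMDGPU_MACH_MASK;
     // isa == 0x02c == EF_AMDGPU_MACH_AMDGCN_GFX900 for a gfx900 image.
*/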

/* Description of an HSA GPU agent (device) and the program associated with
   it.  */

struct agent_info
{
  /* The HSA ID of the agent.  Assigned when hsa_context is initialized.  */
  hsa_agent_t id;
  /* The user-visible device number.  */
  int device_id;
  /* Whether the agent has been initialized.  The fields below are usable only
     if it has been.  */
  bool initialized;

  /* The instruction set architecture of the device.  */
  gcn_isa device_isa;
  /* Name of the agent.  */
  char name[64];
  /* Name of the vendor of the agent.  */
  char vendor_name[64];
  /* Command queues of the agent.  */
  hsa_queue_t *sync_queue;
  struct goacc_asyncqueue *async_queues, *omp_async_queue;
  pthread_mutex_t async_queues_mutex;

  /* The HSA memory region from which to allocate kernel arguments.  */
  hsa_region_t kernarg_region;

  /* The HSA memory region from which to allocate device data.  */
  hsa_region_t data_region;

  /* Allocated ephemeral memories (team arena and stack space).  */
  struct ephemeral_memories_list *ephemeral_memories_list;
  pthread_mutex_t ephemeral_memories_write_lock;

  /* Read-write lock that protects kernels which are running or about to be run
     from interference with loading and unloading of images.  Needs to be
     locked for reading while a kernel is being run, and for writing if the
     list of modules is manipulated (and thus the HSA program invalidated).  */
  pthread_rwlock_t module_rwlock;

  /* The module associated with this agent.  */
  struct module_info *module;

  /* Mutex enforcing that only one thread will finalize the HSA program.  A
     thread should have locked agent->module_rwlock for reading before
     acquiring it.  */
  pthread_mutex_t prog_mutex;
  /* Flag whether the HSA program that consists of all the modules has been
     finalized.  */
  bool prog_finalized;
  /* HSA executable - the finalized program that is used to locate kernels.  */
  hsa_executable_t executable;
};

/* Information required to identify, finalize and run any given kernel.  */

enum offload_kind {KIND_UNKNOWN, KIND_OPENMP, KIND_OPENACC};

struct kernel_info
{
  /* Name of the kernel, required to locate it within the GCN object-code
     module.  */
  const char *name;
  /* The specific agent the kernel has been or will be finalized for and run
     on.  */
  struct agent_info *agent;
  /* The specific module in which the kernel is defined.  */
  struct module_info *module;
  /* Information provided by mkoffload associated with the kernel.  */
  struct hsa_kernel_description *description;
  /* Mutex enforcing that at most one thread ever initializes a kernel for
     use.  A thread should have locked agent->module_rwlock for reading before
     acquiring it.  */
  pthread_mutex_t init_mutex;
  /* Flag indicating whether the kernel has been initialized and all fields
     below it contain valid data.  */
  bool initialized;
  /* Flag indicating that the kernel has a problem that blocks its
     execution.  */
  bool initialization_failed;
  /* The object to be put into the dispatch queue.  */
  uint64_t object;
  /* Required size of kernel arguments.  */
  uint32_t kernarg_segment_size;
  /* Required size of group segment.  */
  uint32_t group_segment_size;
  /* Required size of private segment.  */
  uint32_t private_segment_size;
  /* Set up for OpenMP or OpenACC?  */
  enum offload_kind kind;
};

/* Information about a particular GCN module, its image and kernels.  */

struct module_info
{
  /* The description with which the program has registered the image.  */
  struct gcn_image_desc *image_desc;
  /* GCN heap allocation.  */
  struct heap *heap;
  /* Physical boundaries of the loaded module.  */
  Elf64_Addr phys_address_start;
  Elf64_Addr phys_address_end;

  bool constructors_run_p;
  struct kernel_info *init_array_func, *fini_array_func;

  /* Number of kernels in this module.  */
  int kernel_count;
  /* An array of kernel_info structures describing each kernel in this
     module.  */
  struct kernel_info kernels[];
};

/* A linked list of memory arenas allocated on the device.
   These are used by OpenMP, as a means to optimize per-team malloc,
   and for host-accessible stack space.  */

struct ephemeral_memories_list
{
  struct ephemeral_memories_list *next;

  /* The size is determined by the number of teams and threads.  */
  size_t size;
  /* The device address of the allocated memory.  */
  void *address;
  /* A flag to prevent two asynchronous kernels trying to use the same memory.
     The mutex is locked until the kernel exits.  */
  pthread_mutex_t in_use;
};

/* Information about the whole HSA environment and all of its agents.  */

struct hsa_context_info
{
  /* Whether the structure has been initialized.  */
  bool initialized;
  /* Number of usable GPU HSA agents in the system.  */
  int agent_count;
  /* Array of agent_info structures describing the individual HSA agents.  */
  struct agent_info *agents;
  /* Driver version string.  */
  char driver_version_s[30];
};

/* }}} */
/* {{{ Global variables */

/* Information about the whole HSA environment and all of its agents.  */

static struct hsa_context_info hsa_context;

/* HSA runtime functions that are initialized in init_hsa_context.  */

static struct hsa_runtime_fn_info hsa_fns;

/* Heap space, allocated target-side, provided for use of newlib malloc.
   Each module should have its own heap allocated.
   Beware that heap usage increases with OpenMP teams.  See also arenas.  */

static size_t gcn_kernel_heap_size = DEFAULT_GCN_HEAP_SIZE;

/* Ephemeral memory sizes for each kernel launch.  */

static int team_arena_size = DEFAULT_TEAM_ARENA_SIZE;
static int stack_size = DEFAULT_GCN_STACK_SIZE;
static int lowlat_size = -1;

/* Flag to decide whether to print information to stderr about what is going
   on.  Set in init_debug depending on environment variables.  */

static bool debug;

/* Flag to decide if the runtime should suppress a possible fallback to host
   execution.  */

static bool suppress_host_fallback;

/* Path used to locate the HSA runtime shared library that is dlopened
   by this plug-in.  */

static const char *hsa_runtime_lib;

/* Flag to decide if the runtime should also support CPU devices (can be
   a simulator).  */

static bool support_cpu_devices;

/* Runtime dimension overrides.  Zero indicates default.  */

static int override_x_dim = 0;
static int override_z_dim = 0;

/* }}} */
/* {{{ Debug & Diagnostic */

/* Print a message to stderr if GCN_DEBUG value is set to true.  */

#define DEBUG_PRINT(...) \
  do \
  { \
    if (debug) \
      { \
	fprintf (stderr, __VA_ARGS__); \
      } \
  } \
  while (false)

/* Flush stderr if GCN_DEBUG value is set to true.  */

#define DEBUG_FLUSH() \
  do { \
    if (debug) \
      fflush (stderr); \
  } while (false)

/* Print a logging message with PREFIX to stderr if GCN_DEBUG value
   is set to true.  */

#define DEBUG_LOG(prefix, ...) \
  do \
    { \
      DEBUG_PRINT (prefix); \
      DEBUG_PRINT (__VA_ARGS__); \
      DEBUG_FLUSH (); \
    } while (false)

/* Print a debugging message to stderr.  */

#define GCN_DEBUG(...) DEBUG_LOG ("GCN debug: ", __VA_ARGS__)

/* Print a warning message to stderr.  */

#define GCN_WARNING(...) DEBUG_LOG ("GCN warning: ", __VA_ARGS__)

/* Print HSA warning STR with an HSA STATUS code.  */

static void
hsa_warn (const char *str, hsa_status_t status)
{
  if (!debug)
    return;

  const char *hsa_error_msg = "[unknown]";
  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);

  fprintf (stderr, "GCN warning: %s\nRuntime message: %s\n", str,
	   hsa_error_msg);
}

/* Report a fatal error STR together with the HSA error corresponding to STATUS
   and terminate execution of the current process.  */

static void
hsa_fatal (const char *str, hsa_status_t status)
{
  const char *hsa_error_msg = "[unknown]";
  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
  GOMP_PLUGIN_fatal ("GCN fatal error: %s\nRuntime message: %s\n", str,
		     hsa_error_msg);
}

/* Like hsa_fatal, except only report the error message and return FALSE so
   that error processing can propagate outside the plugin.  */

static bool
hsa_error (const char *str, hsa_status_t status)
{
  const char *hsa_error_msg = "[unknown]";
  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
  GOMP_PLUGIN_error ("GCN fatal error: %s\nRuntime message: %s\n", str,
		     hsa_error_msg);
  return false;
}

/* Dump information about the available hardware.  */

static void
dump_hsa_system_info (void)
{
  hsa_status_t status;

  hsa_endianness_t endianness;
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_ENDIANNESS,
					   &endianness);
  if (status == HSA_STATUS_SUCCESS)
    switch (endianness)
      {
      case HSA_ENDIANNESS_LITTLE:
	GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: LITTLE\n");
	break;
      case HSA_ENDIANNESS_BIG:
	GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: BIG\n");
	break;
      default:
	GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: UNKNOWN\n");
      }
  else
    GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: FAILED\n");

  uint8_t extensions[128];
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_EXTENSIONS,
					   &extensions);
  if (status == HSA_STATUS_SUCCESS)
    {
      if (extensions[0] & (1 << HSA_EXTENSION_IMAGES))
	GCN_DEBUG ("HSA_SYSTEM_INFO_EXTENSIONS: IMAGES\n");
    }
  else
    GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n");
}

/* Dump information about the available hardware.  */

static void
dump_machine_model (hsa_machine_model_t machine_model, const char *s)
{
  switch (machine_model)
    {
    case HSA_MACHINE_MODEL_SMALL:
      GCN_DEBUG ("%s: SMALL\n", s);
      break;
    case HSA_MACHINE_MODEL_LARGE:
      GCN_DEBUG ("%s: LARGE\n", s);
      break;
    default:
      GCN_WARNING ("%s: UNKNOWN\n", s);
      break;
    }
}

/* Dump information about the available hardware.  */

static void
dump_profile (hsa_profile_t profile, const char *s)
{
  switch (profile)
    {
    case HSA_PROFILE_FULL:
      GCN_DEBUG ("%s: FULL\n", s);
      break;
    case HSA_PROFILE_BASE:
      GCN_DEBUG ("%s: BASE\n", s);
      break;
    default:
      GCN_WARNING ("%s: UNKNOWN\n", s);
      break;
    }
}

/* Dump information about a device memory region.  */

static hsa_status_t
dump_hsa_region (hsa_region_t region, void *data __attribute__((unused)))
{
  hsa_status_t status;

  hsa_region_segment_t segment;
  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
					   &segment);
  if (status == HSA_STATUS_SUCCESS)
    {
      if (segment == HSA_REGION_SEGMENT_GLOBAL)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GLOBAL\n");
      else if (segment == HSA_REGION_SEGMENT_READONLY)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: READONLY\n");
      else if (segment == HSA_REGION_SEGMENT_PRIVATE)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: PRIVATE\n");
      else if (segment == HSA_REGION_SEGMENT_GROUP)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GROUP\n");
      else
	GCN_WARNING ("HSA_REGION_INFO_SEGMENT: UNKNOWN\n");
    }
  else
    GCN_WARNING ("HSA_REGION_INFO_SEGMENT: FAILED\n");

  if (segment == HSA_REGION_SEGMENT_GLOBAL)
    {
      uint32_t flags;
      status
	= hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
					  &flags);
      if (status == HSA_STATUS_SUCCESS)
	{
	  if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: KERNARG\n");
	  if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED)
	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: FINE_GRAINED\n");
	  if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED)
	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: COARSE_GRAINED\n");
	}
      else
	GCN_WARNING ("HSA_REGION_INFO_GLOBAL_FLAGS: FAILED\n");
    }

  size_t size;
  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_SIZE: %zu\n", size);
  else
    GCN_WARNING ("HSA_REGION_INFO_SIZE: FAILED\n");

  status
    = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_ALLOC_MAX_SIZE,
				      &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_ALLOC_MAX_SIZE: %zu\n", size);
  else
    GCN_WARNING ("HSA_REGION_INFO_ALLOC_MAX_SIZE: FAILED\n");

  bool alloc_allowed;
  status
    = hsa_fns.hsa_region_get_info_fn (region,
				      HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED,
				      &alloc_allowed);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: %u\n", alloc_allowed);
  else
    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: FAILED\n");

  if (status != HSA_STATUS_SUCCESS || !alloc_allowed)
    return HSA_STATUS_SUCCESS;

  status
    = hsa_fns.hsa_region_get_info_fn (region,
				      HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE,
				      &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: %zu\n", size);
  else
    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: FAILED\n");

  size_t align;
  status
    = hsa_fns.hsa_region_get_info_fn (region,
				      HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT,
				      &align);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: %zu\n", align);
  else
    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: FAILED\n");

  return HSA_STATUS_SUCCESS;
}

/* Dump information about all the device memory regions.  */

static void
dump_hsa_regions (hsa_agent_t agent)
{
  hsa_status_t status;
  status = hsa_fns.hsa_agent_iterate_regions_fn (agent,
						 dump_hsa_region,
						 NULL);
  if (status != HSA_STATUS_SUCCESS)
    hsa_error ("Dumping hsa regions failed", status);
}

/* Dump information about the available devices.  */

static hsa_status_t
dump_hsa_agent_info (hsa_agent_t agent, void *data __attribute__((unused)))
{
  hsa_status_t status;

  char buf[64];
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME,
					  &buf);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_NAME: %s\n", buf);
  else
    GCN_WARNING ("HSA_AGENT_INFO_NAME: FAILED\n");

  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_VENDOR_NAME,
					  &buf);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_VENDOR_NAME: %s\n", buf);
  else
    GCN_WARNING ("HSA_AGENT_INFO_VENDOR_NAME: FAILED\n");

  hsa_machine_model_t machine_model;
  status
    = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_MACHINE_MODEL,
				     &machine_model);
  if (status == HSA_STATUS_SUCCESS)
    dump_machine_model (machine_model, "HSA_AGENT_INFO_MACHINE_MODEL");
  else
    GCN_WARNING ("HSA_AGENT_INFO_MACHINE_MODEL: FAILED\n");

  hsa_profile_t profile;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_PROFILE,
					  &profile);
  if (status == HSA_STATUS_SUCCESS)
    dump_profile (profile, "HSA_AGENT_INFO_PROFILE");
  else
    GCN_WARNING ("HSA_AGENT_INFO_PROFILE: FAILED\n");

  hsa_device_type_t device_type;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
					  &device_type);
  if (status == HSA_STATUS_SUCCESS)
    {
      switch (device_type)
	{
	case HSA_DEVICE_TYPE_CPU:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: CPU\n");
	  break;
	case HSA_DEVICE_TYPE_GPU:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: GPU\n");
	  break;
	case HSA_DEVICE_TYPE_DSP:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: DSP\n");
	  break;
	default:
	  GCN_WARNING ("HSA_AGENT_INFO_DEVICE: UNKNOWN\n");
	  break;
	}
    }
  else
    GCN_WARNING ("HSA_AGENT_INFO_DEVICE: FAILED\n");

  uint32_t cu_count;
  status = hsa_fns.hsa_agent_get_info_fn
    (agent, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: %u\n", cu_count);
  else
    GCN_WARNING ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: FAILED\n");

  uint32_t size;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_WAVEFRONT_SIZE,
					  &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WAVEFRONT_SIZE: %u\n", size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WAVEFRONT_SIZE: FAILED\n");

  uint32_t max_dim;
  status = hsa_fns.hsa_agent_get_info_fn (agent,
					  HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
					  &max_dim);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: %u\n", max_dim);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: FAILED\n");

  uint32_t max_size;
  status = hsa_fns.hsa_agent_get_info_fn (agent,
					  HSA_AGENT_INFO_WORKGROUP_MAX_SIZE,
					  &max_size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: %u\n", max_size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: FAILED\n");

  uint32_t grid_max_dim;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_DIM,
					  &grid_max_dim);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_DIM: %u\n", grid_max_dim);
  else
    GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_DIM: FAILED\n");

  uint32_t grid_max_size;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_SIZE,
					  &grid_max_size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_SIZE: %u\n", grid_max_size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_SIZE: FAILED\n");

  dump_hsa_regions (agent);

  return HSA_STATUS_SUCCESS;
}

/* Forward reference.  */

static char *get_executable_symbol_name (hsa_executable_symbol_t symbol);

/* Helper function for dump_executable_symbols.  */

static hsa_status_t
dump_executable_symbol (hsa_executable_t executable,
			hsa_executable_symbol_t symbol,
			void *data __attribute__((unused)))
{
  char *name = get_executable_symbol_name (symbol);

  if (name)
    {
      GCN_DEBUG ("executable symbol: %s\n", name);
      free (name);
    }

  return HSA_STATUS_SUCCESS;
}

/* Dump all global symbols in an executable.  */

static void
dump_executable_symbols (hsa_executable_t executable)
{
  hsa_status_t status;
  status
    = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
						 dump_executable_symbol,
						 NULL);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not dump HSA executable symbols", status);
}

/* Dump kernel DISPATCH data structure and indent it by INDENT spaces.  */

static void
print_kernel_dispatch (struct kernel_dispatch *dispatch, unsigned indent)
{
  struct kernargs *kernargs = (struct kernargs *)dispatch->kernarg_address;

  fprintf (stderr, "%*sthis: %p\n", indent, "", dispatch);
  fprintf (stderr, "%*squeue: %p\n", indent, "", dispatch->queue);
  fprintf (stderr, "%*skernarg_address: %p\n", indent, "", kernargs);
  fprintf (stderr, "%*sheap address: %p\n", indent, "",
	   (void*)kernargs->abi.heap_ptr);
  fprintf (stderr, "%*sarena address: %p (%d bytes per workgroup)\n", indent,
	   "", (void*)kernargs->abi.arena_ptr,
	   kernargs->abi.arena_size_per_team);
  fprintf (stderr, "%*sstack address: %p (%d bytes per wavefront)\n", indent,
	   "", (void*)kernargs->abi.stack_ptr,
	   kernargs->abi.stack_size_per_thread);
  fprintf (stderr, "%*sobject: %lu\n", indent, "", dispatch->object);
  fprintf (stderr, "%*sprivate_segment_size: %u\n", indent, "",
	   dispatch->private_segment_size);
  fprintf (stderr, "%*sgroup_segment_size: %u (low-latency pool)\n", indent,
	   "", dispatch->group_segment_size);
  fprintf (stderr, "\n");
}

/* }}} */
/* {{{ Utility functions */

/* Cast the thread local storage to gcn_thread.  */

static inline struct gcn_thread *
gcn_thread (void)
{
  return (struct gcn_thread *) GOMP_PLUGIN_acc_thread ();
}

/* Initialize debug and suppress_host_fallback according to the environment.  */

static void
init_environment_variables (void)
{
  if (secure_getenv ("GCN_DEBUG"))
    debug = true;
  else
    debug = false;

  if (secure_getenv ("GCN_SUPPRESS_HOST_FALLBACK"))
    suppress_host_fallback = true;
  else
    suppress_host_fallback = false;

  hsa_runtime_lib = secure_getenv ("HSA_RUNTIME_LIB");
  if (hsa_runtime_lib == NULL)
    hsa_runtime_lib = "libhsa-runtime64.so.1";

  support_cpu_devices = secure_getenv ("GCN_SUPPORT_CPU_DEVICES");

  const char *x = secure_getenv ("GCN_NUM_TEAMS");
  if (!x)
    x = secure_getenv ("GCN_NUM_GANGS");
  if (x)
    override_x_dim = atoi (x);

  const char *z = secure_getenv ("GCN_NUM_THREADS");
  if (!z)
    z = secure_getenv ("GCN_NUM_WORKERS");
  if (z)
    override_z_dim = atoi (z);

  const char *heap = secure_getenv ("GCN_HEAP_SIZE");
  if (heap)
    {
      size_t tmp = atol (heap);
      if (tmp)
	gcn_kernel_heap_size = tmp;
    }

  const char *arena = secure_getenv ("GCN_TEAM_ARENA_SIZE");
  if (arena)
    {
      int tmp = atoi (arena);
      if (tmp)
	team_arena_size = tmp;
    }

  const char *stack = secure_getenv ("GCN_STACK_SIZE");
  if (stack)
    {
      int tmp = atoi (stack);
      if (tmp)
	stack_size = tmp;
    }

  const char *lowlat = secure_getenv ("GOMP_GCN_LOWLAT_POOL");
  if (lowlat)
    lowlat_size = atoi (lowlat);
}
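
/* For illustration only: the variables read above come from the environment
   of the offloading process, e.g. from a shell (values are hypothetical):

     GCN_DEBUG=1 GCN_NUM_TEAMS=8 GCN_NUM_THREADS=16 \
     GCN_HEAP_SIZE=268435456 ./my-offload-program
*/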

/* Return malloc'd string with name of SYMBOL.  */

static char *
get_executable_symbol_name (hsa_executable_symbol_t symbol)
{
  hsa_status_t status;
  char *res;
  uint32_t len;
  const hsa_executable_symbol_info_t info_name_length
    = HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH;

  status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name_length,
						      &len);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not get length of symbol name", status);
      return NULL;
    }

  res = GOMP_PLUGIN_malloc (len + 1);

  const hsa_executable_symbol_info_t info_name
    = HSA_EXECUTABLE_SYMBOL_INFO_NAME;

  status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name, res);

  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not get symbol name", status);
      free (res);
      return NULL;
    }

  res[len] = '\0';

  return res;
}

/* Get the number of GPU Compute Units.  */

static int
get_cu_count (struct agent_info *agent)
{
  uint32_t cu_count;
  hsa_status_t status = hsa_fns.hsa_agent_get_info_fn
    (agent->id, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
  if (status == HSA_STATUS_SUCCESS)
    return cu_count;
  else
    return 64;  /* The usual number for older devices.  */
}

/* Calculate the maximum grid size for OMP threads / OACC workers.
   This depends on the kernel's resource usage levels.  */

static int
limit_worker_threads (int threads)
{
  /* FIXME Do something more intelligent here.
     GCN can always run 4 threads within a Compute Unit, but
     more than that depends on register usage.  */
  if (threads > 16)
    threads = 16;
  return threads;
}

/* This sets the maximum number of teams to twice the number of GPU Compute
   Units to avoid memory waste and corresponding memory access faults.  */

static int
limit_teams (int teams, struct agent_info *agent)
{
  int max_teams = 2 * get_cu_count (agent);
  if (teams > max_teams)
    teams = max_teams;
  return teams;
}

/* Parse the target attributes INPUT provided by the compiler and return true
   if we should run anything at all.  If INPUT is NULL, fill DEF with default
   values, then store INPUT or DEF into *RESULT.

   This is used for OpenMP only.  */

static bool
parse_target_attributes (void **input,
			 struct GOMP_kernel_launch_attributes *def,
			 struct GOMP_kernel_launch_attributes **result,
			 struct agent_info *agent)
{
  if (!input)
    GOMP_PLUGIN_fatal ("No target arguments provided");

  bool grid_attrs_found = false;
  bool gcn_dims_found = false;
  int gcn_teams = 0;
  int gcn_threads = 0;
  while (*input)
    {
      intptr_t id = (intptr_t) *input++, val;

      if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
	val = (intptr_t) *input++;
      else
	val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;

      val = (val > INT_MAX) ? INT_MAX : val;

      if ((id & GOMP_TARGET_ARG_DEVICE_MASK) == GOMP_DEVICE_GCN
	  && ((id & GOMP_TARGET_ARG_ID_MASK)
	      == GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES))
	{
	  grid_attrs_found = true;
	  break;
	}
      else if ((id & GOMP_TARGET_ARG_DEVICE_MASK)
	       == GOMP_TARGET_ARG_DEVICE_ALL)
	{
	  gcn_dims_found = true;
	  switch (id & GOMP_TARGET_ARG_ID_MASK)
	    {
	    case GOMP_TARGET_ARG_NUM_TEAMS:
	      gcn_teams = limit_teams (val, agent);
	      break;
	    case GOMP_TARGET_ARG_THREAD_LIMIT:
	      gcn_threads = limit_worker_threads (val);
	      break;
	    default:
	      ;
	    }
	}
    }

  if (gcn_dims_found)
    {
      bool gfx900_workaround_p = false;

      if (agent->device_isa == EF_AMDGPU_MACH_AMDGCN_GFX900
	  && gcn_threads == 0 && override_z_dim == 0)
	{
	  gfx900_workaround_p = true;
	  GCN_WARNING ("VEGA BUG WORKAROUND: reducing default number of "
		       "threads to at most 4 per team.\n");
	  GCN_WARNING (" - If this is not a Vega 10 device, please use "
		       "GCN_NUM_THREADS=16\n");
	}

      /* Ideally, when a dimension isn't explicitly specified, we should
	 tune it to run 40 (or 32?) threads per CU with no threads getting
	 queued.  In practice, we tune for peak performance on BabelStream,
	 which for OpenACC is currently 32 threads per CU.  */
      def->ndim = 3;
      if (gcn_teams <= 0 && gcn_threads <= 0)
	{
	  /* Set up a reasonable number of teams and threads.  */
	  gcn_threads = gfx900_workaround_p ? 4 : 16; // 8;
	  def->gdims[0] = get_cu_count (agent); // * (40 / gcn_threads);
	  def->gdims[2] = gcn_threads;
	}
      else if (gcn_teams <= 0 && gcn_threads > 0)
	{
	  /* Auto-scale the number of teams with the number of threads.  */
	  def->gdims[0] = get_cu_count (agent); // * (40 / gcn_threads);
	  def->gdims[2] = gcn_threads;
	}
      else if (gcn_teams > 0 && gcn_threads <= 0)
	{
	  int max_threads = gfx900_workaround_p ? 4 : 16;

	  /* Auto-scale the number of threads with the number of teams.  */
	  def->gdims[0] = gcn_teams;
	  def->gdims[2] = 16; // get_cu_count (agent) * 40 / gcn_teams;
	  if (def->gdims[2] == 0)
	    def->gdims[2] = 1;
	  else if (def->gdims[2] > max_threads)
	    def->gdims[2] = max_threads;
	}
      else
	{
	  def->gdims[0] = gcn_teams;
	  def->gdims[2] = gcn_threads;
	}
      def->gdims[1] = 64; /* Each thread is 64 work items wide.  */
      def->wdims[0] = 1;  /* Single team per work-group.  */
      def->wdims[1] = 64;
      def->wdims[2] = 16;
      *result = def;
      return true;
    }
  else if (!grid_attrs_found)
    {
      def->ndim = 1;
      def->gdims[0] = 1;
      def->gdims[1] = 1;
      def->gdims[2] = 1;
      def->wdims[0] = 1;
      def->wdims[1] = 1;
      def->wdims[2] = 1;
      *result = def;
      GCN_WARNING ("GOMP_OFFLOAD_run called with no launch attributes\n");
      return true;
    }

  struct GOMP_kernel_launch_attributes *kla;
  kla = (struct GOMP_kernel_launch_attributes *) *input;
  *result = kla;
  if (kla->ndim == 0 || kla->ndim > 3)
    GOMP_PLUGIN_fatal ("Invalid number of dimensions (%u)", kla->ndim);

  GCN_DEBUG ("GOMP_OFFLOAD_run called with %u dimensions:\n", kla->ndim);
  unsigned i;
  for (i = 0; i < kla->ndim; i++)
    {
      GCN_DEBUG (" Dimension %u: grid size %u and group size %u\n", i,
		 kla->gdims[i], kla->wdims[i]);
      if (kla->gdims[i] == 0)
	return false;
    }
  return true;
}
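
/* For illustration only: a sketch of the *INPUT encoding consumed above,
   for a hypothetical "num_teams (4) thread_limit (16)" kernel.  Each
   element carries an ID in its low bits and either an immediate value
   above GOMP_TARGET_ARG_VALUE_SHIFT, or (with
   GOMP_TARGET_ARG_SUBSEQUENT_PARAM set) a value in the next element:

     void *args[] = {
       (void *) (uintptr_t) (GOMP_TARGET_ARG_DEVICE_ALL
			     | GOMP_TARGET_ARG_NUM_TEAMS
			     | (4 << GOMP_TARGET_ARG_VALUE_SHIFT)),
       (void *) (uintptr_t) (GOMP_TARGET_ARG_DEVICE_ALL
			     | GOMP_TARGET_ARG_THREAD_LIMIT
			     | (16 << GOMP_TARGET_ARG_VALUE_SHIFT)),
       NULL  // terminator, as tested by "while (*input)"
     };
*/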

/* Return the group size given the requested GROUP size, GRID size and number
   of grid dimensions NDIM.  */

static uint32_t
get_group_size (uint32_t ndim, uint32_t grid, uint32_t group)
{
  if (group == 0)
    {
      /* TODO: Provide a default via environment or device characteristics.  */
      if (ndim == 1)
	group = 64;
      else if (ndim == 2)
	group = 8;
      else
	group = 4;
    }

  if (group > grid)
    group = grid;
  return group;
}
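
/* For illustration only, two worked instances of get_group_size: a 1-D
   grid of 1000 work items with no explicit group size yields a group of
   64; a 1-D grid of only 32 work items yields 32, since a group may not
   be larger than the grid.  */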

/* Atomically store pair of uint16_t values (HEADER and REST) to a PACKET.  */

static void
packet_store_release (uint32_t* packet, uint16_t header, uint16_t rest)
{
  __atomic_store_n (packet, header | (rest << 16), __ATOMIC_RELEASE);
}
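
/* For illustration only: the first 32 bits of an AQL packet hold the
   16-bit header and (for a kernel dispatch) the 16-bit setup word, and
   must become visible to the device atomically.  A sketch of a typical
   use, assuming PACKET points at a hsa_kernel_dispatch_packet_t and NDIM
   is the grid dimension count:

     uint16_t header
       = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
     header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
     header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
     uint16_t setup = ndim << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
     packet_store_release ((uint32_t *) packet, header, setup);
*/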

/* A never-called callback for the HSA command queues.  These signal events
   that we don't use, so we trigger an error.

   This "queue" is not to be confused with the async queues, below.  */

static void
hsa_queue_callback (hsa_status_t status,
		    hsa_queue_t *queue __attribute__ ((unused)),
		    void *data __attribute__ ((unused)))
{
  hsa_fatal ("Asynchronous queue error", status);
}

/* }}} */
/* {{{ HSA initialization */

/* Populate hsa_fns with the function addresses from libhsa-runtime64.so.  */

static bool
init_hsa_runtime_functions (void)
{
#define DLSYM_FN(function) \
  hsa_fns.function##_fn = dlsym (handle, #function); \
  if (hsa_fns.function##_fn == NULL) \
    GOMP_PLUGIN_fatal ("'%s' is missing '%s'", hsa_runtime_lib, #function);
#define DLSYM_OPT_FN(function) \
  hsa_fns.function##_fn = dlsym (handle, #function);

  void *handle = dlopen (hsa_runtime_lib, RTLD_LAZY);
  if (handle == NULL)
    return false;

  DLSYM_FN (hsa_status_string)
  DLSYM_FN (hsa_system_get_info)
  DLSYM_FN (hsa_agent_get_info)
  DLSYM_FN (hsa_init)
  DLSYM_FN (hsa_iterate_agents)
  DLSYM_FN (hsa_region_get_info)
  DLSYM_FN (hsa_queue_create)
  DLSYM_FN (hsa_agent_iterate_regions)
  DLSYM_FN (hsa_executable_destroy)
  DLSYM_FN (hsa_executable_create)
  DLSYM_FN (hsa_executable_global_variable_define)
  DLSYM_FN (hsa_executable_load_code_object)
  DLSYM_FN (hsa_executable_freeze)
  DLSYM_FN (hsa_signal_create)
  DLSYM_FN (hsa_memory_allocate)
  DLSYM_FN (hsa_memory_assign_agent)
  DLSYM_FN (hsa_memory_copy)
  DLSYM_FN (hsa_memory_free)
  DLSYM_FN (hsa_signal_destroy)
  DLSYM_FN (hsa_executable_get_symbol)
  DLSYM_FN (hsa_executable_symbol_get_info)
  DLSYM_FN (hsa_executable_iterate_symbols)
  DLSYM_FN (hsa_queue_add_write_index_release)
  DLSYM_FN (hsa_queue_load_read_index_acquire)
  DLSYM_FN (hsa_signal_wait_acquire)
  DLSYM_FN (hsa_signal_store_relaxed)
  DLSYM_FN (hsa_signal_store_release)
  DLSYM_FN (hsa_signal_load_acquire)
  DLSYM_FN (hsa_queue_destroy)
  DLSYM_FN (hsa_code_object_deserialize)
  DLSYM_OPT_FN (hsa_amd_memory_lock)
  DLSYM_OPT_FN (hsa_amd_memory_unlock)
  DLSYM_OPT_FN (hsa_amd_memory_async_copy_rect)
  return true;
#undef DLSYM_OPT_FN
#undef DLSYM_FN
}

static gcn_isa isa_code (const char *isa);

/* Return true if the agent is a GPU and can accept concurrent submissions
   from different threads.  */

static bool
suitable_hsa_agent_p (hsa_agent_t agent)
{
  hsa_device_type_t device_type;
  hsa_status_t status
    = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
				     &device_type);
  if (status != HSA_STATUS_SUCCESS)
    return false;

  switch (device_type)
    {
    case HSA_DEVICE_TYPE_GPU:
      {
	char name[64];
	hsa_status_t status
	  = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME, name);
	if (status != HSA_STATUS_SUCCESS
	    || isa_code (name) == EF_AMDGPU_MACH_UNSUPPORTED)
	  {
	    GCN_DEBUG ("Ignoring unsupported agent '%s'\n",
		       status == HSA_STATUS_SUCCESS ? name : "invalid");
	    return false;
	  }
      }
      break;
    case HSA_DEVICE_TYPE_CPU:
      if (!support_cpu_devices)
	return false;
      break;
    default:
      return false;
    }

  uint32_t features = 0;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_FEATURE,
					  &features);
  if (status != HSA_STATUS_SUCCESS
      || !(features & HSA_AGENT_FEATURE_KERNEL_DISPATCH))
    return false;
  hsa_queue_type_t queue_type;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_QUEUE_TYPE,
					  &queue_type);
  if (status != HSA_STATUS_SUCCESS
      || (queue_type != HSA_QUEUE_TYPE_MULTI))
    return false;

  return true;
}

/* Callback of hsa_iterate_agents; if AGENT is a GPU device, increment
   agent_count in hsa_context.  */

static hsa_status_t
count_gpu_agents (hsa_agent_t agent, void *data __attribute__ ((unused)))
{
  if (suitable_hsa_agent_p (agent))
    hsa_context.agent_count++;
  return HSA_STATUS_SUCCESS;
}

/* Callback of hsa_iterate_agents; if AGENT is a GPU device, assign the agent
   id to the describing structure in the hsa context.  The index of the
   structure is pointed to by DATA; increment it afterwards.  */

static hsa_status_t
assign_agent_ids (hsa_agent_t agent, void *data)
{
  if (suitable_hsa_agent_p (agent))
    {
      int *agent_index = (int *) data;
      hsa_context.agents[*agent_index].id = agent;
      ++*agent_index;
    }
  return HSA_STATUS_SUCCESS;
}

/* Initialize hsa_context if it has not already been done.
   Return TRUE on success.  */

static bool
init_hsa_context (void)
{
  hsa_status_t status;
  int agent_index = 0;

  if (hsa_context.initialized)
    return true;
  init_environment_variables ();
  if (!init_hsa_runtime_functions ())
    {
      const char *msg = "Run-time could not be dynamically opened";
      if (suppress_host_fallback)
	GOMP_PLUGIN_fatal ("%s\n", msg);
      else
	GCN_WARNING ("%s\n", msg);
      return false;
    }
  status = hsa_fns.hsa_init_fn ();
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Run-time could not be initialized", status);
  GCN_DEBUG ("HSA run-time initialized for GCN\n");

  if (debug)
    dump_hsa_system_info ();

  status = hsa_fns.hsa_iterate_agents_fn (count_gpu_agents, NULL);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("GCN GPU devices could not be enumerated", status);
  GCN_DEBUG ("There are %i GCN GPU devices.\n", hsa_context.agent_count);

  hsa_context.agents
    = GOMP_PLUGIN_malloc_cleared (hsa_context.agent_count
				  * sizeof (struct agent_info));
  status = hsa_fns.hsa_iterate_agents_fn (assign_agent_ids, &agent_index);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Scanning compute agents failed", status);
  if (agent_index != hsa_context.agent_count)
    {
      GOMP_PLUGIN_error ("Failed to assign IDs to all GCN agents");
      return false;
    }

  if (debug)
    {
      status = hsa_fns.hsa_iterate_agents_fn (dump_hsa_agent_info, NULL);
      if (status != HSA_STATUS_SUCCESS)
	GOMP_PLUGIN_error ("Failed to list all HSA runtime agents");
    }

  uint16_t minor, major;
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MINOR,
					   &minor);
  if (status != HSA_STATUS_SUCCESS)
    GOMP_PLUGIN_error ("Failed to obtain HSA runtime minor version");
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MAJOR,
					   &major);
  if (status != HSA_STATUS_SUCCESS)
    GOMP_PLUGIN_error ("Failed to obtain HSA runtime major version");

  size_t len = sizeof hsa_context.driver_version_s;
  int printed = snprintf (hsa_context.driver_version_s, len,
			  "HSA Runtime %hu.%hu", (unsigned short int)major,
			  (unsigned short int)minor);
  if (printed >= len)
    GCN_WARNING ("HSA runtime version string was truncated. "
		 "Version %hu.%hu is too long.", (unsigned short int)major,
		 (unsigned short int)minor);

  hsa_context.initialized = true;
  return true;
}

/* Verify that hsa_context has already been initialized and return the
   agent_info structure describing device number N.  Return NULL on error.  */

static struct agent_info *
get_agent_info (int n)
{
  if (!hsa_context.initialized)
    {
      GOMP_PLUGIN_error ("Attempt to use uninitialized GCN context.");
      return NULL;
    }
  if (n >= hsa_context.agent_count)
    {
      GOMP_PLUGIN_error ("Request to operate on non-existent GCN device %i", n);
      return NULL;
    }
  if (!hsa_context.agents[n].initialized)
    {
      GOMP_PLUGIN_error ("Attempt to use an uninitialized GCN agent.");
      return NULL;
    }
  return &hsa_context.agents[n];
}

/* Callback of hsa_agent_iterate_regions, via get_*_memory_region functions.

   Selects (breaks at) a suitable region of type KIND.  */

static hsa_status_t
get_memory_region (hsa_region_t region, hsa_region_t *retval,
		   hsa_region_global_flag_t kind)
{
  hsa_status_t status;
  hsa_region_segment_t segment;

  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
					   &segment);
  if (status != HSA_STATUS_SUCCESS)
    return status;
  if (segment != HSA_REGION_SEGMENT_GLOBAL)
    return HSA_STATUS_SUCCESS;

  uint32_t flags;
  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
					   &flags);
  if (status != HSA_STATUS_SUCCESS)
    return status;
  if (flags & kind)
    {
      *retval = region;
      return HSA_STATUS_INFO_BREAK;
    }
  return HSA_STATUS_SUCCESS;
}

/* Callback of hsa_agent_iterate_regions.

   Selects a kernargs memory region.  */

static hsa_status_t
get_kernarg_memory_region (hsa_region_t region, void *data)
{
  return get_memory_region (region, (hsa_region_t *)data,
			    HSA_REGION_GLOBAL_FLAG_KERNARG);
}

/* Callback of hsa_agent_iterate_regions.

   Selects a coarse-grained memory region suitable for the heap and
   offload data.  */

static hsa_status_t
get_data_memory_region (hsa_region_t region, void *data)
{
  return get_memory_region (region, (hsa_region_t *)data,
			    HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED);
}
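
/* For illustration only: these callbacks are used with
   hsa_agent_iterate_regions, where returning HSA_STATUS_INFO_BREAK stops
   the iteration once a suitable region has been found, e.g.:

     hsa_region_t region = { 0 };
     hsa_status_t status
       = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
					       get_kernarg_memory_region,
					       &region);
     // status == HSA_STATUS_INFO_BREAK indicates a region was selected.
*/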

static int
elf_gcn_isa_field (Elf64_Ehdr *image)
{
  return image->e_flags & EF_AMDGPU_MACH_MASK;
}

const static char *gcn_gfx803_s = "gfx803";
const static char *gcn_gfx900_s = "gfx900";
const static char *gcn_gfx906_s = "gfx906";
const static char *gcn_gfx908_s = "gfx908";
const static char *gcn_gfx90a_s = "gfx90a";
const static char *gcn_gfx1030_s = "gfx1030";
const static char *gcn_gfx1036_s = "gfx1036";
const static char *gcn_gfx1100_s = "gfx1100";
const static char *gcn_gfx1103_s = "gfx1103";
const static int gcn_isa_name_len = 7;

/* Returns the name that the HSA runtime uses for the ISA or NULL if we do not
   support the ISA.  */

static const char*
isa_hsa_name (int isa)
{
  switch (isa)
    {
    case EF_AMDGPU_MACH_AMDGCN_GFX803:
      return gcn_gfx803_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX900:
      return gcn_gfx900_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX906:
      return gcn_gfx906_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX908:
      return gcn_gfx908_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX90a:
      return gcn_gfx90a_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX1030:
      return gcn_gfx1030_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX1036:
      return gcn_gfx1036_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX1100:
      return gcn_gfx1100_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX1103:
      return gcn_gfx1103_s;
    }
  return NULL;
}

/* Returns the user-facing name that GCC uses to identify the architecture
   (e.g. with -march) or NULL if we do not support the ISA.
   Keep in sync with /gcc/config/gcn/gcn.{c,opt}.  */

static const char*
isa_gcc_name (int isa)
{
  switch (isa)
    {
    case EF_AMDGPU_MACH_AMDGCN_GFX803:
      return "fiji";
    default:
      return isa_hsa_name (isa);
    }
}
1729
1730/* Returns the code which is used in the GCN object code to identify the ISA with
1731 the given name (as used by the HSA runtime). */
1732
1733static gcn_isa
1734isa_code(const char *isa) {
7d593fd6
FH
1735 if (!strncmp (isa, gcn_gfx803_s, gcn_isa_name_len))
1736 return EF_AMDGPU_MACH_AMDGCN_GFX803;
1737
1738 if (!strncmp (isa, gcn_gfx900_s, gcn_isa_name_len))
1739 return EF_AMDGPU_MACH_AMDGCN_GFX900;
1740
1741 if (!strncmp (isa, gcn_gfx906_s, gcn_isa_name_len))
1742 return EF_AMDGPU_MACH_AMDGCN_GFX906;
1743
3535402e
AS
1744 if (!strncmp (isa, gcn_gfx908_s, gcn_isa_name_len))
1745 return EF_AMDGPU_MACH_AMDGCN_GFX908;
1746
cde52d3a
AS
1747 if (!strncmp (isa, gcn_gfx90a_s, gcn_isa_name_len))
1748 return EF_AMDGPU_MACH_AMDGCN_GFX90a;
1749
c7ec7bd1
AS
1750 if (!strncmp (isa, gcn_gfx1030_s, gcn_isa_name_len))
1751 return EF_AMDGPU_MACH_AMDGCN_GFX1030;
1752
78b56a12
RB
1753 if (!strncmp (isa, gcn_gfx1036_s, gcn_isa_name_len))
1754 return EF_AMDGPU_MACH_AMDGCN_GFX1036;
1755
52a2c659
TB
1756 if (!strncmp (isa, gcn_gfx1100_s, gcn_isa_name_len))
1757 return EF_AMDGPU_MACH_AMDGCN_GFX1100;
1758
1bf18629
AS
1759 if (!strncmp (isa, gcn_gfx1103_s, gcn_isa_name_len))
1760 return EF_AMDGPU_MACH_AMDGCN_GFX1103;
1761
209ed06c 1762 return EF_AMDGPU_MACH_UNSUPPORTED;
7d593fd6
FH
1763}
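/* Illustrative sketch (not part of the plugin): the helpers above compose
   to map an ELF image to the -march string a user would pass to GCC.  */
#if 0
static const char *
example_march_for_image (Elf64_Ehdr *image)
{
  int isa = elf_gcn_isa_field (image);  /* e_flags & EF_AMDGPU_MACH_MASK.  */
  return isa_gcc_name (isa);            /* "fiji" for gfx803, the gfx name
                                           otherwise, or NULL if unknown.  */
}
#endif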
1764
ae0d2c24
AS
1765/* Return the maximum VGPR count for the given ISA.  CDNA2 devices have
 twice as many VGPRs as older devices, and RDNA3 devices more again. */
1766
1767static int
1768max_isa_vgprs (int isa)
1769{
1770 switch (isa)
1771 {
1772 case EF_AMDGPU_MACH_AMDGCN_GFX803:
1773 case EF_AMDGPU_MACH_AMDGCN_GFX900:
1774 case EF_AMDGPU_MACH_AMDGCN_GFX906:
1775 case EF_AMDGPU_MACH_AMDGCN_GFX908:
ae0d2c24
AS
1776 return 256;
1777 case EF_AMDGPU_MACH_AMDGCN_GFX90a:
1778 return 512;
99890e15 1779 case EF_AMDGPU_MACH_AMDGCN_GFX1030:
78b56a12 1780 case EF_AMDGPU_MACH_AMDGCN_GFX1036:
99890e15
AS
1781 return 512; /* 512 SIMD32 = 256 wavefrontsize64. */
1782 case EF_AMDGPU_MACH_AMDGCN_GFX1100:
1bf18629 1783 case EF_AMDGPU_MACH_AMDGCN_GFX1103:
99890e15 1784 return 1536; /* 1536 SIMD32 = 768 wavefrontsize64. */
ae0d2c24
AS
1785 }
1786 GOMP_PLUGIN_fatal ("unhandled ISA in max_isa_vgprs");
1787}
1788
237957cc
AS
1789/* }}} */
1790/* {{{ Run */
1791
f6fff8a6 1792/* Create or reuse a team arena and stack space.
237957cc
AS
1793
1794 Team arenas are used by OpenMP to avoid calling malloc multiple times
1795 while setting up each team. This is purely a performance optimization.
1796
f6fff8a6
AS
1797 The stack space is used by all kernels. We must allocate it in such a
1798 way that the reverse offload implementation can access the data.
237957cc 1799
f6fff8a6
AS
1800 Allocating this memory costs performance, so this function will reuse an
1801 existing allocation if a large enough one is idle.
1802 When the kernel exits, the memory is not deallocated; its in-use lock is
 simply released so the allocation can be reused. */
1803
1804static void
1805configure_ephemeral_memories (struct kernel_info *kernel,
1806 struct kernargs_abi *kernargs, int num_teams,
1807 int num_threads)
237957cc 1808{
f6fff8a6
AS
1809 struct agent_info *agent = kernel->agent;
1810 struct ephemeral_memories_list **next_ptr = &agent->ephemeral_memories_list;
1811 struct ephemeral_memories_list *item;
1812
1813 int actual_arena_size = (kernel->kind == KIND_OPENMP
1814 ? team_arena_size : 0);
1815 int actual_arena_total_size = actual_arena_size * num_teams;
1816 size_t size = (actual_arena_total_size
1817 + num_teams * num_threads * stack_size);
237957cc
AS
1818
1819 for (item = *next_ptr; item; next_ptr = &item->next, item = item->next)
1820 {
f6fff8a6 1821 if (item->size < size)
237957cc
AS
1822 continue;
1823
f6fff8a6
AS
1824 if (pthread_mutex_trylock (&item->in_use) == 0)
1825 break;
237957cc
AS
1826 }
1827
f6fff8a6 1828 if (!item)
237957cc 1829 {
f6fff8a6
AS
1830 GCN_DEBUG ("Creating a new %sstack for %d teams with %d threads"
1831 " (%zd bytes)\n", (actual_arena_size ? "arena and " : ""),
1832 num_teams, num_threads, size);
237957cc 1833
f6fff8a6
AS
1834 if (pthread_mutex_lock (&agent->ephemeral_memories_write_lock))
1835 {
1836 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
1837 return;
1838 }
1839 item = malloc (sizeof (*item));
1840 item->size = size;
1841 item->next = NULL;
1842 *next_ptr = item;
237957cc 1843
f6fff8a6
AS
1844 if (pthread_mutex_init (&item->in_use, NULL))
1845 {
1846 GOMP_PLUGIN_error ("Failed to initialize a GCN memory write mutex");
1847 return;
1848 }
1849 if (pthread_mutex_lock (&item->in_use))
1850 {
1851 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
1852 return;
1853 }
1854 if (pthread_mutex_unlock (&agent->ephemeral_memories_write_lock))
1855 {
1856 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1857 return;
1858 }
1859
1860 hsa_status_t status;
1861 status = hsa_fns.hsa_memory_allocate_fn (agent->data_region, size,
1862 &item->address);
1863 if (status != HSA_STATUS_SUCCESS)
1864 hsa_fatal ("Could not allocate memory for GCN kernel arena", status);
1865 status = hsa_fns.hsa_memory_assign_agent_fn (item->address, agent->id,
1866 HSA_ACCESS_PERMISSION_RW);
1867 if (status != HSA_STATUS_SUCCESS)
1868 hsa_fatal ("Could not assign arena & stack memory to device", status);
1869 }
237957cc 1870
f6fff8a6
AS
1871 kernargs->arena_ptr = (actual_arena_total_size
1872 ? (uint64_t)item->address
1873 : 0);
1874 kernargs->stack_ptr = (uint64_t)item->address + actual_arena_total_size;
1875 kernargs->arena_size_per_team = actual_arena_size;
1876 kernargs->stack_size_per_thread = stack_size;
237957cc
AS
1877}
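/* The reuse scheme above in miniature (illustrative sketch, not compiled):
   an entry is idle exactly when its in_use mutex is unlocked, so
   pthread_mutex_trylock doubles as an atomic "claim" operation and no
   separate free-list bookkeeping is needed.  */
#if 0
static struct ephemeral_memories_list *
example_claim_idle (struct ephemeral_memories_list *list, size_t size)
{
  for (struct ephemeral_memories_list *p = list; p; p = p->next)
    if (p->size >= size && pthread_mutex_trylock (&p->in_use) == 0)
      return p;   /* Claimed; unlocking in_use later releases it.  */
  return NULL;    /* No idle entry is big enough; allocate a new one.  */
}
#endif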
1878
f6fff8a6 1879/* Mark an ephemeral memory space available for reuse. */
237957cc
AS
1880
1881static void
f6fff8a6 1882release_ephemeral_memories (struct agent_info* agent, void *address)
237957cc 1883{
f6fff8a6 1884 struct ephemeral_memories_list *item;
237957cc 1885
f6fff8a6 1886 for (item = agent->ephemeral_memories_list; item; item = item->next)
237957cc 1887 {
f6fff8a6 1888 if (item->address == address)
237957cc
AS
1889 {
1890 if (pthread_mutex_unlock (&item->in_use))
1891 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1892 return;
1893 }
1894 }
1895 GOMP_PLUGIN_error ("Could not find a GCN ephemeral memory space to release.");
1896}
1897
1898/* Clean up all the allocated ephemeral memories (team arenas and stacks). */
1899
1900static bool
f6fff8a6 1901destroy_ephemeral_memories (struct agent_info *agent)
237957cc 1902{
f6fff8a6 1903 struct ephemeral_memories_list *item, *next;
237957cc 1904
f6fff8a6 1905 for (item = agent->ephemeral_memories_list; item; item = next)
237957cc
AS
1906 {
1907 next = item->next;
f6fff8a6 1908 hsa_fns.hsa_memory_free_fn (item->address);
237957cc
AS
1909 if (pthread_mutex_destroy (&item->in_use))
1910 {
f6fff8a6 1911 GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex");
237957cc
AS
1912 return false;
1913 }
1914 free (item);
1915 }
f6fff8a6 1916 agent->ephemeral_memories_list = NULL;
237957cc
AS
1917
1918 return true;
1919}
1920
1921/* Allocate memory on a specified device. */
1922
1923static void *
1924alloc_by_agent (struct agent_info *agent, size_t size)
1925{
1926 GCN_DEBUG ("Allocating %zu bytes on device %d\n", size, agent->device_id);
1927
237957cc
AS
1928 void *ptr;
1929 hsa_status_t status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
1930 size, &ptr);
1931 if (status != HSA_STATUS_SUCCESS)
1932 {
1933 hsa_error ("Could not allocate device memory", status);
1934 return NULL;
1935 }
1936
1937 status = hsa_fns.hsa_memory_assign_agent_fn (ptr, agent->id,
1938 HSA_ACCESS_PERMISSION_RW);
1939 if (status != HSA_STATUS_SUCCESS)
1940 {
1941 hsa_error ("Could not assign data memory to device", status);
1942 return NULL;
1943 }
1944
1945 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1946 bool profiling_dispatch_p
1947 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1948 if (profiling_dispatch_p)
1949 {
1950 acc_prof_info *prof_info = thr->prof_info;
1951 acc_event_info data_event_info;
1952 acc_api_info *api_info = thr->api_info;
1953
1954 prof_info->event_type = acc_ev_alloc;
1955
1956 data_event_info.data_event.event_type = prof_info->event_type;
1957 data_event_info.data_event.valid_bytes
1958 = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1959 data_event_info.data_event.parent_construct
1960 = acc_construct_parallel;
1961 data_event_info.data_event.implicit = 1;
1962 data_event_info.data_event.tool_info = NULL;
1963 data_event_info.data_event.var_name = NULL;
1964 data_event_info.data_event.bytes = size;
1965 data_event_info.data_event.host_ptr = NULL;
1966 data_event_info.data_event.device_ptr = (void *) ptr;
1967
1968 api_info->device_api = acc_device_api_other;
1969
1970 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1971 api_info);
1972 }
1973
1974 return ptr;
1975}
1976
1977/* Create kernel dispatch data structure for given KERNEL, along with
1978 the necessary device signals and memory allocations. */
1979
1980static struct kernel_dispatch *
f6fff8a6
AS
1981create_kernel_dispatch (struct kernel_info *kernel, int num_teams,
1982 int num_threads)
237957cc
AS
1983{
1984 struct agent_info *agent = kernel->agent;
1985 struct kernel_dispatch *shadow
1986 = GOMP_PLUGIN_malloc_cleared (sizeof (struct kernel_dispatch));
1987
1988 shadow->agent = kernel->agent;
1989 shadow->object = kernel->object;
1990
1991 hsa_signal_t sync_signal;
1992 hsa_status_t status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &sync_signal);
1993 if (status != HSA_STATUS_SUCCESS)
1994 hsa_fatal ("Error creating the GCN sync signal", status);
1995
1996 shadow->signal = sync_signal.handle;
1997 shadow->private_segment_size = kernel->private_segment_size;
e7d6c277
AS
1998
1999 if (lowlat_size < 0)
2000 {
2001 /* Divide the LDS between the number of running teams.
2002 Allocate at least as much as the kernel metadata defines. */
2003 int teams_per_cu = num_teams / get_cu_count (agent);
2004 int LDS_per_team = (teams_per_cu ? 65536 / teams_per_cu : 65536);
2005 shadow->group_segment_size
2006 = (kernel->group_segment_size > LDS_per_team
2007 ? kernel->group_segment_size
 2008 : LDS_per_team);
2009 }
2010 else if (lowlat_size < GCN_LOWLAT_HEAP+8)
2011 /* Ensure that there's space for the OpenMP libgomp data. */
2012 shadow->group_segment_size = GCN_LOWLAT_HEAP+8;
2013 else
2014 shadow->group_segment_size = (lowlat_size > 65536
2015 ? 65536
2016 : lowlat_size);
237957cc
AS
2017
2018 /* We expect kernels to request a single pointer, explicitly, and the
2019 rest of struct kernargs, implicitly. If they request anything else
2020 then something is wrong. */
2021 if (kernel->kernarg_segment_size > 8)
2022 {
2023 GOMP_PLUGIN_fatal ("Unexpectedly large kernargs segment requested");
2024 return NULL;
2025 }
2026
2027 status = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region,
2028 sizeof (struct kernargs),
2029 &shadow->kernarg_address);
2030 if (status != HSA_STATUS_SUCCESS)
2031 hsa_fatal ("Could not allocate memory for GCN kernel arguments", status);
2032 struct kernargs *kernargs = shadow->kernarg_address;
2033
2034 /* Zero-initialize the output_data (minimum needed). */
f6fff8a6 2035 kernargs->abi.out_ptr = (int64_t)&kernargs->output_data;
237957cc
AS
2036 kernargs->output_data.next_output = 0;
2037 for (unsigned i = 0;
2038 i < (sizeof (kernargs->output_data.queue)
2039 / sizeof (kernargs->output_data.queue[0]));
2040 i++)
2041 kernargs->output_data.queue[i].written = 0;
2042 kernargs->output_data.consumed = 0;
2043
2044 /* Pass in the heap location. */
f6fff8a6 2045 kernargs->abi.heap_ptr = (int64_t)kernel->module->heap;
237957cc 2046
f6fff8a6
AS
2047 /* Create the ephemeral memory spaces. */
2048 configure_ephemeral_memories (kernel, &kernargs->abi, num_teams, num_threads);
237957cc
AS
2049
2050 /* Ensure we can recognize unset return values. */
2051 kernargs->output_data.return_value = 0xcafe0000;
2052
2053 return shadow;
2054}
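/* For illustration: the kernel's one explicit argument (the VARS pointer)
   is memcpy'd by run_kernel into the first 8 bytes of this allocation, and
   the device-side runtime locates the rest of struct kernargs (the ABI
   block and the output ring buffer) implicitly from the same base
   address.  */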
2055
8c05d8cd 2056static void
6edcb5dc
TB
2057process_reverse_offload (uint64_t fn, uint64_t mapnum, uint64_t hostaddrs,
2058 uint64_t sizes, uint64_t kinds, uint64_t dev_num64)
8c05d8cd
TB
2059{
2060 int dev_num = dev_num64;
6edcb5dc 2061 GOMP_PLUGIN_target_rev (fn, mapnum, hostaddrs, sizes, kinds, dev_num,
130c2f3c 2062 NULL);
8c05d8cd
TB
2063}
2064
237957cc
AS
2065/* Output any data written to console output from the kernel. It is expected
2066 that this function is polled during kernel execution.
2067
2068 We print all entries from the last item printed to the next entry without
2069 a "written" flag. If the "final" flag is set then it'll continue right to
2070 the end.
2071
2072 The print buffer is circular, but the from and to locations don't wrap when
2073 the buffer does, so the output limit is UINT_MAX. The target blocks on
2074 output when the buffer is full. */
2075
2076static void
2077console_output (struct kernel_info *kernel, struct kernargs *kernargs,
2078 bool final)
2079{
2080 unsigned int limit = (sizeof (kernargs->output_data.queue)
2081 / sizeof (kernargs->output_data.queue[0]));
2082
2083 unsigned int from = __atomic_load_n (&kernargs->output_data.consumed,
2084 __ATOMIC_ACQUIRE);
2085 unsigned int to = kernargs->output_data.next_output;
2086
2087 if (from > to)
2088 {
2089 /* Overflow. */
2090 if (final)
2091 printf ("GCN print buffer overflowed.\n");
2092 return;
2093 }
2094
2095 unsigned int i;
2096 for (i = from; i < to; i++)
2097 {
2098 struct printf_data *data = &kernargs->output_data.queue[i%limit];
2099
2100 if (!data->written && !final)
2101 break;
2102
2103 switch (data->type)
2104 {
2105 case 0: printf ("%.128s%ld\n", data->msg, data->ivalue); break;
2106 case 1: printf ("%.128s%f\n", data->msg, data->dvalue); break;
2107 case 2: printf ("%.128s%.128s\n", data->msg, data->text); break;
2108 case 3: printf ("%.128s%.128s", data->msg, data->text); break;
8c05d8cd 2109 case 4:
6edcb5dc
TB
2110 process_reverse_offload (data->value_u64[0], data->value_u64[1],
2111 data->value_u64[2], data->value_u64[3],
2112 data->value_u64[4], data->value_u64[5]);
8c05d8cd 2113 break;
237957cc
AS
2114 default: printf ("GCN print buffer error!\n"); break;
2115 }
2116 data->written = 0;
2117 __atomic_store_n (&kernargs->output_data.consumed, i+1,
2118 __ATOMIC_RELEASE);
2119 }
2120 fflush (stdout);
2121}
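/* Illustrative sketch of the device-side producer that this consumer pairs
   with (the real code lives in libgomp's GCN target support, not here; the
   function name is hypothetical).  The slot counter never wraps, only the
   array index does, and "written" is set last, with release semantics, so
   the host can never observe a half-filled entry.  */
#if 0
static void
example_print_int (struct output *out, const char *msg, int64_t ivalue)
{
  unsigned int limit = sizeof (out->queue) / sizeof (out->queue[0]);
  unsigned int slot = __atomic_fetch_add (&out->next_output, 1,
					  __ATOMIC_RELAXED);
  /* Block while the host consumer is a full buffer behind.  */
  while (slot - __atomic_load_n (&out->consumed, __ATOMIC_ACQUIRE) >= limit)
    ;
  struct printf_data *entry = &out->queue[slot % limit];
  entry->type = 0;				/* Message plus integer.  */
  strncpy (entry->msg, msg, sizeof (entry->msg));
  entry->ivalue = ivalue;
  __atomic_store_n (&entry->written, 1, __ATOMIC_RELEASE);
}
#endif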
2122
2123/* Release data structure created for a kernel dispatch in SHADOW argument,
2124 and clean up the signal and memory allocations. */
2125
2126static void
2127release_kernel_dispatch (struct kernel_dispatch *shadow)
2128{
2129 GCN_DEBUG ("Released kernel dispatch: %p\n", shadow);
2130
2131 struct kernargs *kernargs = shadow->kernarg_address;
f6fff8a6
AS
2132 void *addr = (void *)kernargs->abi.arena_ptr;
2133 if (!addr)
2134 addr = (void *)kernargs->abi.stack_ptr;
2135 release_ephemeral_memories (shadow->agent, addr);
237957cc
AS
2136
2137 hsa_fns.hsa_memory_free_fn (shadow->kernarg_address);
2138
2139 hsa_signal_t s;
2140 s.handle = shadow->signal;
2141 hsa_fns.hsa_signal_destroy_fn (s);
2142
2143 free (shadow);
2144}
2145
2146/* Extract the properties from a kernel binary. */
2147
2148static void
2149init_kernel_properties (struct kernel_info *kernel)
2150{
2151 hsa_status_t status;
2152 struct agent_info *agent = kernel->agent;
2153 hsa_executable_symbol_t kernel_symbol;
f062c3f1
AS
2154 char *buf = alloca (strlen (kernel->name) + 4);
2155 sprintf (buf, "%s.kd", kernel->name);
237957cc 2156 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
f062c3f1 2157 buf, agent->id,
237957cc
AS
2158 0, &kernel_symbol);
2159 if (status != HSA_STATUS_SUCCESS)
2160 {
2161 hsa_warn ("Could not find symbol for kernel in the code object", status);
f062c3f1 2162 fprintf (stderr, "not found name: '%s'\n", buf);
237957cc
AS
2163 dump_executable_symbols (agent->executable);
2164 goto failure;
2165 }
2166 GCN_DEBUG ("Located kernel %s\n", kernel->name);
2167 status = hsa_fns.hsa_executable_symbol_get_info_fn
2168 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel->object);
2169 if (status != HSA_STATUS_SUCCESS)
2170 hsa_fatal ("Could not extract a kernel object from its symbol", status);
2171 status = hsa_fns.hsa_executable_symbol_get_info_fn
2172 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
2173 &kernel->kernarg_segment_size);
2174 if (status != HSA_STATUS_SUCCESS)
2175 hsa_fatal ("Could not get info about kernel argument size", status);
2176 status = hsa_fns.hsa_executable_symbol_get_info_fn
2177 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
2178 &kernel->group_segment_size);
2179 if (status != HSA_STATUS_SUCCESS)
2180 hsa_fatal ("Could not get info about kernel group segment size", status);
2181 status = hsa_fns.hsa_executable_symbol_get_info_fn
2182 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
2183 &kernel->private_segment_size);
2184 if (status != HSA_STATUS_SUCCESS)
2185 hsa_fatal ("Could not get info about kernel private segment size",
2186 status);
2187
2188 /* The kernel type is not known until something tries to launch it. */
2189 kernel->kind = KIND_UNKNOWN;
2190
 2191 GCN_DEBUG ("Kernel structure for %s fully initialized with "
 2192 "the following segment sizes:\n", kernel->name);
2193 GCN_DEBUG (" group_segment_size: %u\n",
2194 (unsigned) kernel->group_segment_size);
2195 GCN_DEBUG (" private_segment_size: %u\n",
2196 (unsigned) kernel->private_segment_size);
2197 GCN_DEBUG (" kernarg_segment_size: %u\n",
2198 (unsigned) kernel->kernarg_segment_size);
2199 return;
2200
2201failure:
2202 kernel->initialization_failed = true;
2203}
2204
2205/* Do all the work that is necessary before running KERNEL for the first time.
2206 The function assumes the program has been created, finalized and frozen by
2207 create_and_finalize_hsa_program. */
2208
2209static void
2210init_kernel (struct kernel_info *kernel)
2211{
2212 if (pthread_mutex_lock (&kernel->init_mutex))
2213 GOMP_PLUGIN_fatal ("Could not lock a GCN kernel initialization mutex");
2214 if (kernel->initialized)
2215 {
2216 if (pthread_mutex_unlock (&kernel->init_mutex))
2217 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2218 "mutex");
2219
2220 return;
2221 }
2222
2223 init_kernel_properties (kernel);
2224
2225 if (!kernel->initialization_failed)
2226 {
2227 GCN_DEBUG ("\n");
2228
2229 kernel->initialized = true;
2230 }
2231 if (pthread_mutex_unlock (&kernel->init_mutex))
2232 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2233 "mutex");
2234}
2235
2236/* Run KERNEL on its agent, pass VARS to it as arguments and take
2237 launch attributes from KLA.
2238
2239 MODULE_LOCKED indicates that the caller already holds the lock and
2240 run_kernel need not lock it again.
2241 If AQ is NULL then agent->sync_queue will be used. */
2242
2243static void
2244run_kernel (struct kernel_info *kernel, void *vars,
2245 struct GOMP_kernel_launch_attributes *kla,
2246 struct goacc_asyncqueue *aq, bool module_locked)
2247{
ae0d2c24 2248 struct agent_info *agent = kernel->agent;
5a28e272
KCY
2249 GCN_DEBUG ("SGPRs: %d, VGPRs: %d\n", kernel->description->sgpr_count,
2250 kernel->description->vpgr_count);
2251
2252 /* Reduce the number of threads/workers if there are insufficient
2253 VGPRs available to run the kernels together. */
2254 if (kla->ndim == 3 && kernel->description->vpgr_count > 0)
2255 {
ae0d2c24 2256 int max_vgprs = max_isa_vgprs (agent->device_isa);
5a28e272 2257 int granulated_vgprs = (kernel->description->vpgr_count + 3) & ~3;
ae0d2c24 2258 int max_threads = (max_vgprs / granulated_vgprs) * 4;
5a28e272
KCY
2259 if (kla->gdims[2] > max_threads)
2260 {
2261 GCN_WARNING ("Too many VGPRs required to support %d threads/workers"
2262 " per team/gang - reducing to %d threads/workers.\n",
2263 kla->gdims[2], max_threads);
2264 kla->gdims[2] = max_threads;
2265 }
2266 }
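      /* Worked example (illustrative): a kernel using 98 VGPRs is
	 granulated up to 100; on gfx90a, where max_isa_vgprs returns 512,
	 that permits (512 / 100) * 4 = 20 threads, so a request for 64
	 workers would be reduced to 20.  */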
2267
237957cc
AS
2268 GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel->agent->device_id,
2269 (aq ? aq->id : 0));
2270 GCN_DEBUG ("GCN launch attribs: gdims:[");
2271 int i;
2272 for (i = 0; i < kla->ndim; ++i)
2273 {
2274 if (i)
2275 DEBUG_PRINT (", ");
2276 DEBUG_PRINT ("%u", kla->gdims[i]);
2277 }
2278 DEBUG_PRINT ("], normalized gdims:[");
2279 for (i = 0; i < kla->ndim; ++i)
2280 {
2281 if (i)
2282 DEBUG_PRINT (", ");
2283 DEBUG_PRINT ("%u", kla->gdims[i] / kla->wdims[i]);
2284 }
2285 DEBUG_PRINT ("], wdims:[");
2286 for (i = 0; i < kla->ndim; ++i)
2287 {
2288 if (i)
2289 DEBUG_PRINT (", ");
2290 DEBUG_PRINT ("%u", kla->wdims[i]);
2291 }
2292 DEBUG_PRINT ("]\n");
2293 DEBUG_FLUSH ();
2294
237957cc
AS
2295 if (!module_locked && pthread_rwlock_rdlock (&agent->module_rwlock))
2296 GOMP_PLUGIN_fatal ("Unable to read-lock a GCN agent rwlock");
2297
2298 if (!agent->initialized)
2299 GOMP_PLUGIN_fatal ("Agent must be initialized");
2300
2301 if (!kernel->initialized)
2302 GOMP_PLUGIN_fatal ("Called kernel must be initialized");
2303
2304 hsa_queue_t *command_q = (aq ? aq->hsa_queue : kernel->agent->sync_queue);
2305
2306 uint64_t index
2307 = hsa_fns.hsa_queue_add_write_index_release_fn (command_q, 1);
2308 GCN_DEBUG ("Got AQL index %llu\n", (long long int) index);
2309
2310 /* Wait until the queue is not full before writing the packet. */
2311 while (index - hsa_fns.hsa_queue_load_read_index_acquire_fn (command_q)
2312 >= command_q->size)
2313 ;
2314
2315 /* Do not allow the dimensions to be overridden when running
2316 constructors or destructors. */
2317 int override_x = kernel->kind == KIND_UNKNOWN ? 0 : override_x_dim;
2318 int override_z = kernel->kind == KIND_UNKNOWN ? 0 : override_z_dim;
2319
2320 hsa_kernel_dispatch_packet_t *packet;
2321 packet = ((hsa_kernel_dispatch_packet_t *) command_q->base_address)
2322 + index % command_q->size;
2323
2324 memset (((uint8_t *) packet) + 4, 0, sizeof (*packet) - 4);
2325 packet->grid_size_x = override_x ? : kla->gdims[0];
2326 packet->workgroup_size_x = get_group_size (kla->ndim,
2327 packet->grid_size_x,
2328 kla->wdims[0]);
2329
2330 if (kla->ndim >= 2)
2331 {
2332 packet->grid_size_y = kla->gdims[1];
2333 packet->workgroup_size_y = get_group_size (kla->ndim, kla->gdims[1],
2334 kla->wdims[1]);
2335 }
2336 else
2337 {
2338 packet->grid_size_y = 1;
2339 packet->workgroup_size_y = 1;
2340 }
2341
2342 if (kla->ndim == 3)
2343 {
2344 packet->grid_size_z = limit_worker_threads (override_z
2345 ? : kla->gdims[2]);
2346 packet->workgroup_size_z = get_group_size (kla->ndim,
2347 packet->grid_size_z,
2348 kla->wdims[2]);
2349 }
2350 else
2351 {
2352 packet->grid_size_z = 1;
2353 packet->workgroup_size_z = 1;
2354 }
2355
2356 GCN_DEBUG ("GCN launch actuals: grid:[%u, %u, %u],"
2357 " normalized grid:[%u, %u, %u], workgroup:[%u, %u, %u]\n",
2358 packet->grid_size_x, packet->grid_size_y, packet->grid_size_z,
2359 packet->grid_size_x / packet->workgroup_size_x,
2360 packet->grid_size_y / packet->workgroup_size_y,
2361 packet->grid_size_z / packet->workgroup_size_z,
2362 packet->workgroup_size_x, packet->workgroup_size_y,
2363 packet->workgroup_size_z);
2364
2365 struct kernel_dispatch *shadow
f6fff8a6
AS
2366 = create_kernel_dispatch (kernel, packet->grid_size_x,
2367 packet->grid_size_z);
237957cc
AS
2368 shadow->queue = command_q;
2369
2370 if (debug)
2371 {
2372 fprintf (stderr, "\nKernel has following dependencies:\n");
2373 print_kernel_dispatch (shadow, 2);
2374 }
2375
e7d6c277
AS
2376 packet->private_segment_size = shadow->private_segment_size;
2377 packet->group_segment_size = shadow->group_segment_size;
2378 packet->kernel_object = shadow->object;
237957cc
AS
2379 packet->kernarg_address = shadow->kernarg_address;
2380 hsa_signal_t s;
2381 s.handle = shadow->signal;
2382 packet->completion_signal = s;
2383 hsa_fns.hsa_signal_store_relaxed_fn (s, 1);
2384 memcpy (shadow->kernarg_address, &vars, sizeof (vars));
2385
2386 GCN_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");
2387
2388 uint16_t header;
2389 header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
2390 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
2391 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
2392
2393 GCN_DEBUG ("Going to dispatch kernel %s on device %d\n", kernel->name,
2394 agent->device_id);
2395
2396 packet_store_release ((uint32_t *) packet, header,
2397 (uint16_t) kla->ndim
2398 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);
2399
2400 hsa_fns.hsa_signal_store_release_fn (command_q->doorbell_signal,
2401 index);
2402
2403 GCN_DEBUG ("Kernel dispatched, waiting for completion\n");
2404
2405 /* Root signal waits with 1ms timeout. */
2406 while (hsa_fns.hsa_signal_wait_acquire_fn (s, HSA_SIGNAL_CONDITION_LT, 1,
2407 1000 * 1000,
2408 HSA_WAIT_STATE_BLOCKED) != 0)
2409 {
2410 console_output (kernel, shadow->kernarg_address, false);
2411 }
2412 console_output (kernel, shadow->kernarg_address, true);
2413
2414 struct kernargs *kernargs = shadow->kernarg_address;
2415 unsigned int return_value = (unsigned int)kernargs->output_data.return_value;
2416
2417 release_kernel_dispatch (shadow);
2418
2419 if (!module_locked && pthread_rwlock_unlock (&agent->module_rwlock))
2420 GOMP_PLUGIN_fatal ("Unable to unlock a GCN agent rwlock");
2421
2422 unsigned int upper = (return_value & ~0xffff) >> 16;
2423 if (upper == 0xcafe)
2424 ; // exit not called, normal termination.
2425 else if (upper == 0xffff)
2426 ; // exit called.
2427 else
2428 {
2429 GOMP_PLUGIN_error ("Possible kernel exit value corruption, 2 most"
2430 " significant bytes aren't 0xffff or 0xcafe: 0x%x\n",
2431 return_value);
2432 abort ();
2433 }
2434
2435 if (upper == 0xffff)
2436 {
2437 unsigned int signal = (return_value >> 8) & 0xff;
2438
2439 if (signal == SIGABRT)
2440 {
2441 GCN_WARNING ("GCN Kernel aborted\n");
2442 abort ();
2443 }
2444 else if (signal != 0)
2445 {
2446 GCN_WARNING ("GCN Kernel received unknown signal\n");
2447 abort ();
2448 }
2449
2450 GCN_DEBUG ("GCN Kernel exited with value: %d\n", return_value & 0xff);
2451 exit (return_value & 0xff);
2452 }
2453}
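/* Worked example of the exit-value encoding above (illustrative): the
   sentinel 0xcafe0000 survives untouched when the kernel never calls exit,
   so upper == 0xcafe and the host continues normally.  A device-side
   exit (42) with no signal yields 0xffff002a: upper == 0xffff, signal byte
   0x00, exit status 0x2a, and the host process exits with code 42.  */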
2454
2455/* }}} */
2456/* {{{ Load/Unload */
2457
2458/* Initialize KERNEL from D and other parameters. Return true on success. */
2459
2460static bool
2461init_basic_kernel_info (struct kernel_info *kernel,
2462 struct hsa_kernel_description *d,
2463 struct agent_info *agent,
2464 struct module_info *module)
2465{
2466 kernel->agent = agent;
2467 kernel->module = module;
2468 kernel->name = d->name;
5a28e272 2469 kernel->description = d;
237957cc
AS
2470 if (pthread_mutex_init (&kernel->init_mutex, NULL))
2471 {
2472 GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");
2473 return false;
2474 }
2475 return true;
2476}
2477
7d593fd6
FH
2478/* Check that the GCN ISA of the given image matches the ISA of the agent. */
2479
2480static bool
2481isa_matches_agent (struct agent_info *agent, Elf64_Ehdr *image)
2482{
2483 int isa_field = elf_gcn_isa_field (image);
2484 const char* isa_s = isa_hsa_name (isa_field);
2485 if (!isa_s)
2486 {
2487 hsa_error ("Unsupported ISA in GCN code object.", HSA_STATUS_ERROR);
2488 return false;
2489 }
2490
2491 if (isa_field != agent->device_isa)
2492 {
2493 char msg[120];
2494 const char *agent_isa_s = isa_hsa_name (agent->device_isa);
2495 const char *agent_isa_gcc_s = isa_gcc_name (agent->device_isa);
2496 assert (agent_isa_s);
2497 assert (agent_isa_gcc_s);
2498
2499 snprintf (msg, sizeof msg,
2500 "GCN code object ISA '%s' does not match GPU ISA '%s'.\n"
4a206161 2501 "Try to recompile with '-foffload-options=-march=%s'.\n",
7d593fd6
FH
2502 isa_s, agent_isa_s, agent_isa_gcc_s);
2503
2504 hsa_error (msg, HSA_STATUS_ERROR);
2505 return false;
2506 }
2507
2508 return true;
2509}
2510
237957cc
AS
2511/* Create and finalize the program consisting of all loaded modules. */
2512
2513static bool
2514create_and_finalize_hsa_program (struct agent_info *agent)
2515{
2516 hsa_status_t status;
237957cc
AS
2517 bool res = true;
2518 if (pthread_mutex_lock (&agent->prog_mutex))
2519 {
2520 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
2521 return false;
2522 }
2523 if (agent->prog_finalized)
2524 goto final;
2525
2526 status
2527 = hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
2528 HSA_EXECUTABLE_STATE_UNFROZEN,
2529 "", &agent->executable);
2530 if (status != HSA_STATUS_SUCCESS)
2531 {
2532 hsa_error ("Could not create GCN executable", status);
2533 goto fail;
2534 }
2535
2536 /* Load any GCN modules. */
2537 struct module_info *module = agent->module;
2538 if (module)
2539 {
2540 Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;
2541
7d593fd6
FH
2542 if (!isa_matches_agent (agent, image))
2543 goto fail;
2544
237957cc
AS
2545 hsa_code_object_t co = { 0 };
2546 status = hsa_fns.hsa_code_object_deserialize_fn
2547 (module->image_desc->gcn_image->image,
2548 module->image_desc->gcn_image->size,
2549 NULL, &co);
2550 if (status != HSA_STATUS_SUCCESS)
2551 {
2552 hsa_error ("Could not deserialize GCN code object", status);
2553 goto fail;
2554 }
2555
2556 status = hsa_fns.hsa_executable_load_code_object_fn
2557 (agent->executable, agent->id, co, "");
2558 if (status != HSA_STATUS_SUCCESS)
2559 {
2560 hsa_error ("Could not load GCN code object", status);
2561 goto fail;
2562 }
2563
2564 if (!module->heap)
2565 {
2566 status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
2567 gcn_kernel_heap_size,
2568 (void**)&module->heap);
2569 if (status != HSA_STATUS_SUCCESS)
2570 {
2571 hsa_error ("Could not allocate memory for GCN heap", status);
2572 goto fail;
2573 }
2574
2575 status = hsa_fns.hsa_memory_assign_agent_fn
2576 (module->heap, agent->id, HSA_ACCESS_PERMISSION_RW);
2577 if (status != HSA_STATUS_SUCCESS)
2578 {
2579 hsa_error ("Could not assign GCN heap memory to device", status);
2580 goto fail;
2581 }
2582
2583 hsa_fns.hsa_memory_copy_fn (&module->heap->size,
2584 &gcn_kernel_heap_size,
2585 sizeof (gcn_kernel_heap_size));
2586 }
2587
2588 }
2589
2590 if (debug)
2591 dump_executable_symbols (agent->executable);
2592
2593 status = hsa_fns.hsa_executable_freeze_fn (agent->executable, "");
2594 if (status != HSA_STATUS_SUCCESS)
2595 {
2596 hsa_error ("Could not freeze the GCN executable", status);
2597 goto fail;
2598 }
2599
237957cc
AS
2600final:
2601 agent->prog_finalized = true;
2602
2603 if (pthread_mutex_unlock (&agent->prog_mutex))
2604 {
2605 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
2606 res = false;
2607 }
2608
2609 return res;
2610
2611fail:
2612 res = false;
2613 goto final;
2614}
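/* To summarize the load path above: hsa_executable_create, then the
   isa_matches_agent check, hsa_code_object_deserialize,
   hsa_executable_load_code_object, a one-time heap allocation assigned to
   the agent, and finally hsa_executable_freeze.  All failure paths go
   through "final", so prog_mutex is always released.  */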
2615
2616/* Free the HSA program in agent and everything associated with it and set
2617 agent->prog_finalized and the initialized flags of all kernels to false.
2618 Return TRUE on success. */
2619
2620static bool
2621destroy_hsa_program (struct agent_info *agent)
2622{
2623 if (!agent->prog_finalized)
2624 return true;
2625
2626 hsa_status_t status;
2627
2628 GCN_DEBUG ("Destroying the current GCN program.\n");
2629
2630 status = hsa_fns.hsa_executable_destroy_fn (agent->executable);
2631 if (status != HSA_STATUS_SUCCESS)
2632 return hsa_error ("Could not destroy GCN executable", status);
2633
2634 if (agent->module)
2635 {
2636 int i;
2637 for (i = 0; i < agent->module->kernel_count; i++)
2638 agent->module->kernels[i].initialized = false;
2639
2640 if (agent->module->heap)
2641 {
2642 hsa_fns.hsa_memory_free_fn (agent->module->heap);
2643 agent->module->heap = NULL;
2644 }
2645 }
2646 agent->prog_finalized = false;
2647 return true;
2648}
2649
2650/* Deinitialize all information associated with MODULE and kernels within
2651 it. Return TRUE on success. */
2652
2653static bool
2654destroy_module (struct module_info *module, bool locked)
2655{
2656 /* Run destructors before destroying module. */
2657 struct GOMP_kernel_launch_attributes kla =
2658 { 3,
2659 /* Grid size. */
2660 { 1, 64, 1 },
2661 /* Work-group size. */
2662 { 1, 64, 1 }
2663 };
2664
2665 if (module->fini_array_func)
2666 {
2667 init_kernel (module->fini_array_func);
2668 run_kernel (module->fini_array_func, NULL, &kla, NULL, locked);
2669 }
2670 module->constructors_run_p = false;
2671
2672 int i;
2673 for (i = 0; i < module->kernel_count; i++)
2674 if (pthread_mutex_destroy (&module->kernels[i].init_mutex))
2675 {
2676 GOMP_PLUGIN_error ("Failed to destroy a GCN kernel initialization "
2677 "mutex");
2678 return false;
2679 }
2680
2681 return true;
2682}
2683
2684/* }}} */
2685/* {{{ Async */
2686
2687/* Execute a single entry of the given async queue, dispatching on its type. */
2688
2689static void
2690execute_queue_entry (struct goacc_asyncqueue *aq, int index)
2691{
2692 struct queue_entry *entry = &aq->queue[index];
2693
2694 switch (entry->type)
2695 {
2696 case KERNEL_LAUNCH:
2697 if (DEBUG_QUEUES)
2698 GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d)\n",
2699 aq->agent->device_id, aq->id, index);
2700 run_kernel (entry->u.launch.kernel,
2701 entry->u.launch.vars,
2702 &entry->u.launch.kla, aq, false);
2703 if (DEBUG_QUEUES)
2704 GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d) done\n",
2705 aq->agent->device_id, aq->id, index);
2706 break;
2707
2708 case CALLBACK:
2709 if (DEBUG_QUEUES)
2710 GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d)\n",
2711 aq->agent->device_id, aq->id, index);
2712 entry->u.callback.fn (entry->u.callback.data);
2713 if (DEBUG_QUEUES)
2714 GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d) done\n",
2715 aq->agent->device_id, aq->id, index);
2716 break;
2717
2718 case ASYNC_WAIT:
2719 {
2720 /* FIXME: is it safe to access a placeholder that may already have
2721 been executed? */
2722 struct placeholder *placeholderp = entry->u.asyncwait.placeholderp;
2723
2724 if (DEBUG_QUEUES)
2725 GCN_DEBUG ("Async thread %d:%d: Executing async wait entry (%d)\n",
2726 aq->agent->device_id, aq->id, index);
2727
2728 pthread_mutex_lock (&placeholderp->mutex);
2729
2730 while (!placeholderp->executed)
2731 pthread_cond_wait (&placeholderp->cond, &placeholderp->mutex);
2732
2733 pthread_mutex_unlock (&placeholderp->mutex);
2734
2735 if (pthread_cond_destroy (&placeholderp->cond))
2736 GOMP_PLUGIN_error ("Failed to destroy serialization cond");
2737
2738 if (pthread_mutex_destroy (&placeholderp->mutex))
2739 GOMP_PLUGIN_error ("Failed to destroy serialization mutex");
2740
2741 if (DEBUG_QUEUES)
2742 GCN_DEBUG ("Async thread %d:%d: Executing async wait "
2743 "entry (%d) done\n", aq->agent->device_id, aq->id, index);
2744 }
2745 break;
2746
2747 case ASYNC_PLACEHOLDER:
2748 pthread_mutex_lock (&entry->u.placeholder.mutex);
2749 entry->u.placeholder.executed = 1;
2750 pthread_cond_signal (&entry->u.placeholder.cond);
2751 pthread_mutex_unlock (&entry->u.placeholder.mutex);
2752 break;
2753
2754 default:
2755 GOMP_PLUGIN_fatal ("Unknown queue element");
2756 }
2757}
2758
2759/* This function is run as a thread to service an async queue in the
2760 background. It runs continuously until the stop flag is set. */
2761
2762static void *
2763drain_queue (void *thread_arg)
2764{
2765 struct goacc_asyncqueue *aq = thread_arg;
2766
2767 if (DRAIN_QUEUE_SYNCHRONOUS_P)
2768 {
2769 aq->drain_queue_stop = 2;
2770 return NULL;
2771 }
2772
2773 pthread_mutex_lock (&aq->mutex);
2774
2775 while (true)
2776 {
2777 if (aq->drain_queue_stop)
2778 break;
2779
2780 if (aq->queue_n > 0)
2781 {
2782 pthread_mutex_unlock (&aq->mutex);
2783 execute_queue_entry (aq, aq->queue_first);
2784
2785 pthread_mutex_lock (&aq->mutex);
2786 aq->queue_first = ((aq->queue_first + 1)
2787 % ASYNC_QUEUE_SIZE);
2788 aq->queue_n--;
2789
2790 if (DEBUG_THREAD_SIGNAL)
2791 GCN_DEBUG ("Async thread %d:%d: broadcasting queue out update\n",
2792 aq->agent->device_id, aq->id);
2793 pthread_cond_broadcast (&aq->queue_cond_out);
2794 pthread_mutex_unlock (&aq->mutex);
2795
2796 if (DEBUG_QUEUES)
2797 GCN_DEBUG ("Async thread %d:%d: continue\n", aq->agent->device_id,
2798 aq->id);
2799 pthread_mutex_lock (&aq->mutex);
2800 }
2801 else
2802 {
2803 if (DEBUG_THREAD_SLEEP)
2804 GCN_DEBUG ("Async thread %d:%d: going to sleep\n",
2805 aq->agent->device_id, aq->id);
2806 pthread_cond_wait (&aq->queue_cond_in, &aq->mutex);
2807 if (DEBUG_THREAD_SLEEP)
2808 GCN_DEBUG ("Async thread %d:%d: woke up, rechecking\n",
2809 aq->agent->device_id, aq->id);
2810 }
2811 }
2812
2813 aq->drain_queue_stop = 2;
2814 if (DEBUG_THREAD_SIGNAL)
2815 GCN_DEBUG ("Async thread %d:%d: broadcasting last queue out update\n",
2816 aq->agent->device_id, aq->id);
2817 pthread_cond_broadcast (&aq->queue_cond_out);
2818 pthread_mutex_unlock (&aq->mutex);
2819
2820 GCN_DEBUG ("Async thread %d:%d: returning\n", aq->agent->device_id, aq->id);
2821 return NULL;
2822}
2823
2824/* This function is used only when DRAIN_QUEUE_SYNCHRONOUS_P is set, which
2825 is not usually the case. This is just a debug tool. */
2826
2827static void
2828drain_queue_synchronous (struct goacc_asyncqueue *aq)
2829{
2830 pthread_mutex_lock (&aq->mutex);
2831
2832 while (aq->queue_n > 0)
2833 {
2834 execute_queue_entry (aq, aq->queue_first);
2835
2836 aq->queue_first = ((aq->queue_first + 1)
2837 % ASYNC_QUEUE_SIZE);
2838 aq->queue_n--;
2839 }
2840
2841 pthread_mutex_unlock (&aq->mutex);
2842}
2843
d88b27da
JB
2844/* Block the current thread until an async queue is writable. The aq->mutex
2845 lock should be held on entry, and remains locked on exit. */
237957cc
AS
2846
2847static void
2848wait_for_queue_nonfull (struct goacc_asyncqueue *aq)
2849{
2850 if (aq->queue_n == ASYNC_QUEUE_SIZE)
2851 {
237957cc
AS
2852 /* Queue is full. Wait for it to not be full. */
2853 while (aq->queue_n == ASYNC_QUEUE_SIZE)
2854 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
237957cc
AS
2855 }
2856}
2857
2858/* Request an asynchronous kernel launch on the specified queue. This
2859 may block if the queue is full, but returns without waiting for the
2860 kernel to run. */
2861
2862static void
2863queue_push_launch (struct goacc_asyncqueue *aq, struct kernel_info *kernel,
2864 void *vars, struct GOMP_kernel_launch_attributes *kla)
2865{
2866 assert (aq->agent == kernel->agent);
2867
237957cc
AS
2868 pthread_mutex_lock (&aq->mutex);
2869
d88b27da
JB
2870 wait_for_queue_nonfull (aq);
2871
237957cc
AS
2872 int queue_last = ((aq->queue_first + aq->queue_n)
2873 % ASYNC_QUEUE_SIZE);
2874 if (DEBUG_QUEUES)
2875 GCN_DEBUG ("queue_push_launch %d:%d: at %i\n", aq->agent->device_id,
2876 aq->id, queue_last);
2877
2878 aq->queue[queue_last].type = KERNEL_LAUNCH;
2879 aq->queue[queue_last].u.launch.kernel = kernel;
2880 aq->queue[queue_last].u.launch.vars = vars;
2881 aq->queue[queue_last].u.launch.kla = *kla;
2882
2883 aq->queue_n++;
2884
2885 if (DEBUG_THREAD_SIGNAL)
2886 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2887 aq->agent->device_id, aq->id);
2888 pthread_cond_signal (&aq->queue_cond_in);
2889
2890 pthread_mutex_unlock (&aq->mutex);
2891}
2892
2893/* Request an asynchronous callback on the specified queue. The callback
2894 function will be called, with the given opaque data, from the appropriate
2895 async thread, when all previous items on that queue are complete. */
2896
2897static void
2898queue_push_callback (struct goacc_asyncqueue *aq, void (*fn)(void *),
2899 void *data)
2900{
237957cc
AS
2901 pthread_mutex_lock (&aq->mutex);
2902
d88b27da
JB
2903 wait_for_queue_nonfull (aq);
2904
237957cc
AS
2905 int queue_last = ((aq->queue_first + aq->queue_n)
2906 % ASYNC_QUEUE_SIZE);
2907 if (DEBUG_QUEUES)
2908 GCN_DEBUG ("queue_push_callback %d:%d: at %i\n", aq->agent->device_id,
2909 aq->id, queue_last);
2910
2911 aq->queue[queue_last].type = CALLBACK;
2912 aq->queue[queue_last].u.callback.fn = fn;
2913 aq->queue[queue_last].u.callback.data = data;
2914
2915 aq->queue_n++;
2916
2917 if (DEBUG_THREAD_SIGNAL)
2918 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2919 aq->agent->device_id, aq->id);
2920 pthread_cond_signal (&aq->queue_cond_in);
2921
2922 pthread_mutex_unlock (&aq->mutex);
2923}
2924
2925/* Request that a given async thread wait for another thread (unspecified) to
2926 reach the given placeholder. The wait will occur when all previous entries
2927 on the queue are complete. A placeholder is effectively a kind of signal
2928 which simply sets a flag when encountered in a queue. */
2929
2930static void
2931queue_push_asyncwait (struct goacc_asyncqueue *aq,
2932 struct placeholder *placeholderp)
2933{
237957cc
AS
2934 pthread_mutex_lock (&aq->mutex);
2935
d88b27da
JB
2936 wait_for_queue_nonfull (aq);
2937
237957cc
AS
2938 int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2939 if (DEBUG_QUEUES)
2940 GCN_DEBUG ("queue_push_asyncwait %d:%d: at %i\n", aq->agent->device_id,
2941 aq->id, queue_last);
2942
2943 aq->queue[queue_last].type = ASYNC_WAIT;
2944 aq->queue[queue_last].u.asyncwait.placeholderp = placeholderp;
2945
2946 aq->queue_n++;
2947
2948 if (DEBUG_THREAD_SIGNAL)
2949 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2950 aq->agent->device_id, aq->id);
2951 pthread_cond_signal (&aq->queue_cond_in);
2952
2953 pthread_mutex_unlock (&aq->mutex);
2954}
2955
2956/* Add a placeholder into an async queue. When the async thread reaches the
2957 placeholder it will set the "executed" flag to true and continue.
2958 Another thread may be waiting for this thread to reach the placeholder. */
2959
2960static struct placeholder *
2961queue_push_placeholder (struct goacc_asyncqueue *aq)
2962{
2963 struct placeholder *placeholderp;
2964
237957cc
AS
2965 pthread_mutex_lock (&aq->mutex);
2966
d88b27da
JB
2967 wait_for_queue_nonfull (aq);
2968
237957cc
AS
2969 int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2970 if (DEBUG_QUEUES)
2971 GCN_DEBUG ("queue_push_placeholder %d:%d: at %i\n", aq->agent->device_id,
2972 aq->id, queue_last);
2973
2974 aq->queue[queue_last].type = ASYNC_PLACEHOLDER;
2975 placeholderp = &aq->queue[queue_last].u.placeholder;
2976
2977 if (pthread_mutex_init (&placeholderp->mutex, NULL))
2978 {
2979 pthread_mutex_unlock (&aq->mutex);
2980 GOMP_PLUGIN_error ("Failed to initialize serialization mutex");
2981 }
2982
2983 if (pthread_cond_init (&placeholderp->cond, NULL))
2984 {
2985 pthread_mutex_unlock (&aq->mutex);
2986 GOMP_PLUGIN_error ("Failed to initialize serialization cond");
2987 }
2988
2989 placeholderp->executed = 0;
2990
2991 aq->queue_n++;
2992
2993 if (DEBUG_THREAD_SIGNAL)
2994 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2995 aq->agent->device_id, aq->id);
2996 pthread_cond_signal (&aq->queue_cond_in);
2997
2998 pthread_mutex_unlock (&aq->mutex);
2999
3000 return placeholderp;
3001}
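/* Illustrative sketch (hypothetical helper, not part of this file): a
   placeholder and an async-wait entry combine to make one queue wait for
   everything currently enqueued on another, which is how cross-queue
   serialization can be built from the primitives above.  */
#if 0
static void
example_serialize_queues (struct goacc_asyncqueue *aq1,
			  struct goacc_asyncqueue *aq2)
{
  struct placeholder *p = queue_push_placeholder (aq1);
  queue_push_asyncwait (aq2, p);
  /* aq2's drain thread now blocks at the ASYNC_WAIT entry until aq1's
     drain thread reaches the placeholder and signals p->cond.  */
}
#endif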
3002
3003/* Signal an asynchronous thread to terminate, and wait for it to do so. */
3004
3005static void
3006finalize_async_thread (struct goacc_asyncqueue *aq)
3007{
3008 pthread_mutex_lock (&aq->mutex);
3009 if (aq->drain_queue_stop == 2)
3010 {
3011 pthread_mutex_unlock (&aq->mutex);
3012 return;
3013 }
3014
3015 aq->drain_queue_stop = 1;
3016
3017 if (DEBUG_THREAD_SIGNAL)
3018 GCN_DEBUG ("Signalling async thread %d:%d: cond_in\n",
3019 aq->agent->device_id, aq->id);
3020 pthread_cond_signal (&aq->queue_cond_in);
3021
3022 while (aq->drain_queue_stop != 2)
3023 {
3024 if (DEBUG_THREAD_SLEEP)
3025 GCN_DEBUG ("Waiting for async thread %d:%d to finish, putting thread"
3026 " to sleep\n", aq->agent->device_id, aq->id);
3027 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
3028 if (DEBUG_THREAD_SLEEP)
3029 GCN_DEBUG ("Waiting, woke up thread %d:%d. Rechecking\n",
3030 aq->agent->device_id, aq->id);
3031 }
3032
3033 GCN_DEBUG ("Done waiting for async thread %d:%d\n", aq->agent->device_id,
3034 aq->id);
3035 pthread_mutex_unlock (&aq->mutex);
3036
3037 int err = pthread_join (aq->thread_drain_queue, NULL);
3038 if (err != 0)
3039 GOMP_PLUGIN_fatal ("Join async thread %d:%d: failed: %s",
3040 aq->agent->device_id, aq->id, strerror (err));
3041 GCN_DEBUG ("Joined with async thread %d:%d\n", aq->agent->device_id, aq->id);
3042}
3043
3044/* Set up an async queue for OpenMP. There will be only one. The
3045 implementation simply uses an OpenACC async queue.
3046 FIXME: is this thread-safe if two threads call this function? */
3047
3048static void
3049maybe_init_omp_async (struct agent_info *agent)
3050{
3051 if (!agent->omp_async_queue)
3052 agent->omp_async_queue
3053 = GOMP_OFFLOAD_openacc_async_construct (agent->device_id);
3054}
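/* One possible answer to the FIXME above (illustrative sketch only, not
   the current implementation): serialize the lazy construction with a
   dedicated mutex so two threads cannot both construct the queue.  */
#if 0
static pthread_mutex_t example_omp_async_lock = PTHREAD_MUTEX_INITIALIZER;

static void
example_maybe_init_omp_async (struct agent_info *agent)
{
  pthread_mutex_lock (&example_omp_async_lock);
  if (!agent->omp_async_queue)
    agent->omp_async_queue
      = GOMP_OFFLOAD_openacc_async_construct (agent->device_id);
  pthread_mutex_unlock (&example_omp_async_lock);
}
#endif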
3055
8d2f4ddf
JB
3056/* A wrapper that works around an issue in the HSA runtime with host-to-device
3057 copies from read-only pages. */
3058
3059static void
3060hsa_memory_copy_wrapper (void *dst, const void *src, size_t len)
3061{
3062 hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, len);
3063
3064 if (status == HSA_STATUS_SUCCESS)
3065 return;
3066
3067 /* It appears that the copy fails if the source data is in a read-only page.
3068 We can't detect that easily, so try copying the data to a temporary buffer
3069 and doing the copy again if we got an error above. */
3070
3071 GCN_WARNING ("Read-only data transfer bug workaround triggered for "
3072 "[%p:+%d]\n", (void *) src, (int) len);
3073
 3074 void *src_copy = GOMP_PLUGIN_malloc (len);
3075 memcpy (src_copy, src, len);
3076 status = hsa_fns.hsa_memory_copy_fn (dst, (const void *) src_copy, len);
3077 free (src_copy);
3078 if (status != HSA_STATUS_SUCCESS)
3079 GOMP_PLUGIN_error ("memory copy failed");
3080}
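/* For example, mapping a const object or string literal to the device
   (data that the linker places in a read-only segment) can hit this path:
   the first hsa_memory_copy attempt fails, and the bounce-buffer retry
   above completes the transfer.  */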
3081
237957cc
AS
3082/* Copy data to or from a device. This is intended for use as an async
3083 callback event. */
3084
3085static void
3086copy_data (void *data_)
3087{
3088 struct copy_data *data = (struct copy_data *)data_;
3089 GCN_DEBUG ("Async thread %d:%d: Copying %zu bytes from (%p) to (%p)\n",
3090 data->aq->agent->device_id, data->aq->id, data->len, data->src,
3091 data->dst);
8d2f4ddf 3092 hsa_memory_copy_wrapper (data->dst, data->src, data->len);
237957cc
AS
3093 free (data);
3094}
3095
237957cc 3096/* Request an asynchronous data copy, to or from a device, on a given queue.
9c41f5b9 3097 The event will be registered as a callback. */
237957cc
AS
3098
3099static void
3100queue_push_copy (struct goacc_asyncqueue *aq, void *dst, const void *src,
9c41f5b9 3101 size_t len)
237957cc
AS
3102{
3103 if (DEBUG_QUEUES)
3104 GCN_DEBUG ("queue_push_copy %d:%d: %zu bytes from (%p) to (%p)\n",
3105 aq->agent->device_id, aq->id, len, src, dst);
3106 struct copy_data *data
3107 = (struct copy_data *)GOMP_PLUGIN_malloc (sizeof (struct copy_data));
3108 data->dst = dst;
3109 data->src = src;
3110 data->len = len;
237957cc
AS
3111 data->aq = aq;
3112 queue_push_callback (aq, copy_data, data);
3113}
3114
3115/* Return true if the given queue is currently empty. */
3116
3117static int
3118queue_empty (struct goacc_asyncqueue *aq)
3119{
3120 pthread_mutex_lock (&aq->mutex);
3121 int res = aq->queue_n == 0 ? 1 : 0;
3122 pthread_mutex_unlock (&aq->mutex);
3123
3124 return res;
3125}
3126
3127/* Wait for a given queue to become empty. This implements an OpenACC wait
3128 directive. */
3129
3130static void
3131wait_queue (struct goacc_asyncqueue *aq)
3132{
3133 if (DRAIN_QUEUE_SYNCHRONOUS_P)
3134 {
3135 drain_queue_synchronous (aq);
3136 return;
3137 }
3138
3139 pthread_mutex_lock (&aq->mutex);
3140
3141 while (aq->queue_n > 0)
3142 {
3143 if (DEBUG_THREAD_SLEEP)
3144 GCN_DEBUG ("waiting for thread %d:%d, putting thread to sleep\n",
3145 aq->agent->device_id, aq->id);
3146 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
3147 if (DEBUG_THREAD_SLEEP)
3148 GCN_DEBUG ("thread %d:%d woke up. Rechecking\n", aq->agent->device_id,
3149 aq->id);
3150 }
3151
3152 pthread_mutex_unlock (&aq->mutex);
3153 GCN_DEBUG ("waiting for thread %d:%d, done\n", aq->agent->device_id, aq->id);
3154}
3155
3156/* }}} */
3157/* {{{ OpenACC support */
3158
3159/* Execute an OpenACC kernel, synchronously or asynchronously. */
3160
3161static void
f8332e52 3162gcn_exec (struct kernel_info *kernel,
237957cc
AS
3163 void **devaddrs, unsigned *dims, void *targ_mem_desc, bool async,
3164 struct goacc_asyncqueue *aq)
3165{
3166 if (!GOMP_OFFLOAD_can_run (kernel))
3167 GOMP_PLUGIN_fatal ("OpenACC host fallback unimplemented.");
3168
3169 /* If we get here then this must be an OpenACC kernel. */
3170 kernel->kind = KIND_OPENACC;
3171
237957cc
AS
3172 struct hsa_kernel_description *hsa_kernel_desc = NULL;
3173 for (unsigned i = 0; i < kernel->module->image_desc->kernel_count; i++)
3174 {
3175 struct hsa_kernel_description *d
3176 = &kernel->module->image_desc->kernel_infos[i];
3177 if (d->name == kernel->name)
3178 {
3179 hsa_kernel_desc = d;
3180 break;
3181 }
3182 }
3183
3184 /* We may have statically-determined dimensions in
3185 hsa_kernel_desc->oacc_dims[] or dimensions passed to this offload kernel
3186 invocation at runtime in dims[]. We allow static dimensions to take
3187 priority over dynamic dimensions when present (non-zero). */
3188 if (hsa_kernel_desc->oacc_dims[0] > 0)
3189 dims[0] = hsa_kernel_desc->oacc_dims[0];
3190 if (hsa_kernel_desc->oacc_dims[1] > 0)
3191 dims[1] = hsa_kernel_desc->oacc_dims[1];
3192 if (hsa_kernel_desc->oacc_dims[2] > 0)
3193 dims[2] = hsa_kernel_desc->oacc_dims[2];
3194
a78b1ab1
KCY
3195 /* Ideally, when a dimension isn't explicitly specified, we should
3196 tune it to run 40 (or 32?) threads per CU with no threads getting queued.
3197 In practice, we tune for peak performance on BabelStream, which
3198 for OpenACC is currently 32 threads per CU. */
3199 if (dims[0] == 0 && dims[1] == 0)
3200 {
3201 /* If any of the OpenACC dimensions remain 0 then we get to pick a
3202 number. There isn't really a correct answer for this without a clue
3203 about the problem size, so let's do a reasonable number of workers
3204 and gangs. */
237957cc 3205
a78b1ab1
KCY
3206 dims[0] = get_cu_count (kernel->agent) * 4; /* Gangs. */
3207 dims[1] = 8; /* Workers. */
3208 }
3209 else if (dims[0] == 0 && dims[1] > 0)
3210 {
3211 /* Auto-scale the number of gangs with the requested number of workers. */
3212 dims[0] = get_cu_count (kernel->agent) * (32 / dims[1]);
3213 }
3214 else if (dims[0] > 0 && dims[1] == 0)
3215 {
3216 /* Auto-scale the number of workers with the requested number of gangs. */
3217 dims[1] = get_cu_count (kernel->agent) * 32 / dims[0];
3218 if (dims[1] == 0)
3219 dims[1] = 1;
3220 if (dims[1] > 16)
3221 dims[1] = 16;
3222 }
237957cc
AS
3223
3224 /* The incoming dimensions are expressed in terms of gangs, workers, and
3225 vectors. The HSA dimensions are expressed in terms of "work-items",
3226 which means multiples of vector lanes.
3227
3228 The "grid size" specifies the size of the problem space, and the
3229 "work-group size" specifies how much of that we want a single compute
3230 unit to chew on at once.
3231
3232 The three dimensions do not really correspond to hardware, but the
3233 important thing is that the HSA runtime will launch as many
3234 work-groups as it takes to process the entire grid, and each
3235 work-group will contain as many wave-fronts as it takes to process
3236 the work-items in that group.
3237
3238 Essentially, as long as we set the Y dimension to 64 (the number of
3239 vector lanes in hardware), and the Z group size to the maximum (16),
3240 then we will get the gangs (X) and workers (Z) launched as we expect.
3241
3242 The reason for the apparent reversal of vector and worker dimension
3243 order is to do with the way the run-time distributes work-items across
3244 v1 and v2. */
3245 struct GOMP_kernel_launch_attributes kla =
3246 {3,
3247 /* Grid size. */
3248 {dims[0], 64, dims[1]},
3249 /* Work-group size. */
3250 {1, 64, 16}
3251 };
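  /* Worked example (illustrative): dims[0] == 160 gangs and dims[1] == 16
     workers give kla.gdims == {160, 64, 16} and kla.wdims == {1, 64, 16},
     i.e. 160 work-groups, each holding 16 wavefronts of 64 lanes.  */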
3252
3253 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
3254 acc_prof_info *prof_info = thr->prof_info;
3255 acc_event_info enqueue_launch_event_info;
3256 acc_api_info *api_info = thr->api_info;
3257 bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
3258 if (profiling_dispatch_p)
3259 {
3260 prof_info->event_type = acc_ev_enqueue_launch_start;
3261
3262 enqueue_launch_event_info.launch_event.event_type
3263 = prof_info->event_type;
3264 enqueue_launch_event_info.launch_event.valid_bytes
3265 = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
3266 enqueue_launch_event_info.launch_event.parent_construct
3267 = acc_construct_parallel;
3268 enqueue_launch_event_info.launch_event.implicit = 1;
3269 enqueue_launch_event_info.launch_event.tool_info = NULL;
3270 enqueue_launch_event_info.launch_event.kernel_name
3271 = (char *) kernel->name;
3272 enqueue_launch_event_info.launch_event.num_gangs = kla.gdims[0];
3273 enqueue_launch_event_info.launch_event.num_workers = kla.gdims[2];
3274 enqueue_launch_event_info.launch_event.vector_length = kla.gdims[1];
3275
3276 api_info->device_api = acc_device_api_other;
3277
3278 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3279 &enqueue_launch_event_info, api_info);
3280 }
3281
3282 if (!async)
f8332e52 3283 run_kernel (kernel, devaddrs, &kla, NULL, false);
237957cc 3284 else
f8332e52 3285 queue_push_launch (aq, kernel, devaddrs, &kla);
237957cc
AS
3286
3287 if (profiling_dispatch_p)
3288 {
3289 prof_info->event_type = acc_ev_enqueue_launch_end;
3290 enqueue_launch_event_info.launch_event.event_type = prof_info->event_type;
3291 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3292 &enqueue_launch_event_info,
3293 api_info);
3294 }
3295}
3296
3297/* }}} */
3298/* {{{ Generic Plugin API */
3299
3300/* Return the name of the accelerator, which is "gcn". */
3301
3302const char *
3303GOMP_OFFLOAD_get_name (void)
3304{
3305 return "gcn";
3306}
3307
3308/* Return the specific capabilities the HSA accelerator has. */
3309
3310unsigned int
3311GOMP_OFFLOAD_get_caps (void)
3312{
3313 /* FIXME: Enable shared memory for APU, but not discrete GPU. */
3314 return /*GOMP_OFFLOAD_CAP_SHARED_MEM |*/ GOMP_OFFLOAD_CAP_OPENMP_400
3315 | GOMP_OFFLOAD_CAP_OPENACC_200;
3316}
3317
3318/* Identify as GCN accelerator. */
3319
3320int
3321GOMP_OFFLOAD_get_type (void)
3322{
3323 return OFFLOAD_TARGET_TYPE_GCN;
3324}
3325
3326/* Return the libgomp version number we're compatible with. There is
3327 no requirement for cross-version compatibility. */
3328
3329unsigned
3330GOMP_OFFLOAD_version (void)
3331{
3332 return GOMP_VERSION;
3333}
3334
3335/* Return the number of GCN devices on the system. */
3336
3337int
683f1184 3338GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask)
237957cc
AS
3339{
3340 if (!init_hsa_context ())
3341 return 0;
683f1184
TB
 3342 /* Return -1 if the omp_requires_mask cannot be fulfilled, but
 3343 devices were present. */
f84fdb13 3344 if (hsa_context.agent_count > 0
f1af7d65
TB
3345 && ((omp_requires_mask
3346 & ~(GOMP_REQUIRES_UNIFIED_ADDRESS
3347 | GOMP_REQUIRES_REVERSE_OFFLOAD)) != 0))
683f1184 3348 return -1;
237957cc
AS
3349 return hsa_context.agent_count;
3350}
3351
3352/* Initialize device (agent) number N so that it can be used for computation.
3353 Return TRUE on success. */
3354
3355bool
3356GOMP_OFFLOAD_init_device (int n)
3357{
3358 if (!init_hsa_context ())
    return false;
  if (n >= hsa_context.agent_count)
    {
      GOMP_PLUGIN_error ("Request to initialize non-existent GCN device %i", n);
      return false;
    }
  struct agent_info *agent = &hsa_context.agents[n];

  if (agent->initialized)
    return true;

  agent->device_id = n;

  if (pthread_rwlock_init (&agent->module_rwlock, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent rwlock");
      return false;
    }
  if (pthread_mutex_init (&agent->prog_mutex, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent program mutex");
      return false;
    }
  if (pthread_mutex_init (&agent->async_queues_mutex, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
      return false;
    }
  if (pthread_mutex_init (&agent->ephemeral_memories_write_lock, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
      return false;
    }
  agent->async_queues = NULL;
  agent->omp_async_queue = NULL;
  agent->ephemeral_memories_list = NULL;

  uint32_t queue_size;
  hsa_status_t status;
  status = hsa_fns.hsa_agent_get_info_fn (agent->id,
                                          HSA_AGENT_INFO_QUEUE_MAX_SIZE,
                                          &queue_size);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error requesting maximum queue size of the GCN agent",
                      status);

  status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_NAME,
                                          &agent->name);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error querying the name of the agent", status);

  agent->device_isa = isa_code (agent->name);
  if (agent->device_isa == EF_AMDGPU_MACH_UNSUPPORTED)
    return hsa_error ("Unknown GCN agent architecture", HSA_STATUS_ERROR);

  status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_VENDOR_NAME,
                                          &agent->vendor_name);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error querying the vendor name of the agent", status);

  status = hsa_fns.hsa_queue_create_fn (agent->id, queue_size,
                                        HSA_QUEUE_TYPE_MULTI,
                                        hsa_queue_callback, NULL, UINT32_MAX,
                                        UINT32_MAX, &agent->sync_queue);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error creating command queue", status);

  agent->kernarg_region.handle = (uint64_t) -1;
  status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
                                                 get_kernarg_memory_region,
                                                 &agent->kernarg_region);
  if (status != HSA_STATUS_SUCCESS
      && status != HSA_STATUS_INFO_BREAK)
    hsa_error ("Scanning memory regions failed", status);
  if (agent->kernarg_region.handle == (uint64_t) -1)
    {
      GOMP_PLUGIN_error ("Could not find suitable memory region for kernel "
                         "arguments");
      return false;
    }
  GCN_DEBUG ("Selected kernel arguments memory region:\n");
  dump_hsa_region (agent->kernarg_region, NULL);

  agent->data_region.handle = (uint64_t) -1;
  status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
                                                 get_data_memory_region,
                                                 &agent->data_region);
  if (status != HSA_STATUS_SUCCESS
      && status != HSA_STATUS_INFO_BREAK)
    hsa_error ("Scanning memory regions failed", status);
  if (agent->data_region.handle == (uint64_t) -1)
    {
      GOMP_PLUGIN_error ("Could not find suitable memory region for device "
                         "data");
      return false;
    }
  GCN_DEBUG ("Selected device data memory region:\n");
  dump_hsa_region (agent->data_region, NULL);

  GCN_DEBUG ("GCN agent %d initialized\n", n);

  agent->initialized = true;
  return true;
}
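
/* A minimal illustrative sketch (not part of the plugin proper): every HSA
   query above goes through the hsa_fns function-pointer table, which wraps
   the dlopen'd HSA runtime.  The same pattern can fetch other agent
   attributes; the helper below, whose name is hypothetical, queries the
   wavefront size of an already-initialized agent.  */

static bool
example_query_wavefront_size (struct agent_info *agent, uint32_t *wf_size)
{
  /* HSA_AGENT_INFO_WAVEFRONT_SIZE is a standard hsa.h attribute; on success
     *WF_SIZE holds the number of lanes per wavefront.  */
  hsa_status_t status
    = hsa_fns.hsa_agent_get_info_fn (agent->id,
                                     HSA_AGENT_INFO_WAVEFRONT_SIZE, wf_size);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error querying the agent wavefront size", status);
  return true;
}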

/* Load GCN object-code module described by struct gcn_image_desc in
   TARGET_DATA and return references to kernel descriptors in TARGET_TABLE.
   If there are any constructors then run them.  If not NULL, REV_FN_TABLE
   will contain the on-device addresses of the functions for reverse offload.
   To be freed by the caller.  */

int
GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
                         struct addr_pair **target_table,
                         uint64_t **rev_fn_table,
                         uint64_t *host_ind_fn_table)
{
  if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
                         " (expected %u, received %u)",
                         GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
      return -1;
    }

  struct gcn_image_desc *image_desc = (struct gcn_image_desc *) target_data;
  struct agent_info *agent;
  struct addr_pair *pair;
  struct module_info *module;
  struct kernel_info *kernel;
  int kernel_count = image_desc->kernel_count;
  unsigned ind_func_count = GOMP_VERSION_SUPPORTS_INDIRECT_FUNCS (version)
                            ? image_desc->ind_func_count : 0;
  unsigned var_count = image_desc->global_variable_count;
  /* Currently, "others" is a struct of ICVs.  */
  int other_count = 1;

  agent = get_agent_info (ord);
  if (!agent)
    return -1;

  if (pthread_rwlock_wrlock (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
      return -1;
    }
  if (agent->prog_finalized
      && !destroy_hsa_program (agent))
    return -1;

  GCN_DEBUG ("Encountered %d kernels in an image\n", kernel_count);
  GCN_DEBUG ("Encountered %d indirect functions in an image\n", ind_func_count);
  GCN_DEBUG ("Encountered %u global variables in an image\n", var_count);
  GCN_DEBUG ("Expect %d other variables in an image\n", other_count);
  pair = GOMP_PLUGIN_malloc ((kernel_count + var_count + other_count - 2)
                             * sizeof (struct addr_pair));
  *target_table = pair;
  module = (struct module_info *)
    GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info)
                                + kernel_count * sizeof (struct kernel_info));
  module->image_desc = image_desc;
  module->kernel_count = kernel_count;
  module->heap = NULL;
  module->constructors_run_p = false;

  kernel = &module->kernels[0];

  /* Allocate memory for kernel dependencies.  */
  for (unsigned i = 0; i < kernel_count; i++)
    {
      struct hsa_kernel_description *d = &image_desc->kernel_infos[i];
      if (!init_basic_kernel_info (kernel, d, agent, module))
        return -1;
      if (strcmp (d->name, "_init_array") == 0)
        module->init_array_func = kernel;
      else if (strcmp (d->name, "_fini_array") == 0)
        module->fini_array_func = kernel;
      else
        {
          pair->start = (uintptr_t) kernel;
          pair->end = (uintptr_t) (kernel + 1);
          pair++;
        }
      kernel++;
    }

  agent->module = module;
  if (pthread_rwlock_unlock (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
      return -1;
    }

  if (!create_and_finalize_hsa_program (agent))
    return -1;

  if (var_count > 0)
    {
      hsa_status_t status;
      hsa_executable_symbol_t var_symbol;
      status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
                                                     ".offload_var_table",
                                                     agent->id,
                                                     0, &var_symbol);

      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not find symbol for variable in the code object",
                   status);

      uint64_t var_table_addr;
      status = hsa_fns.hsa_executable_symbol_get_info_fn
        (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
         &var_table_addr);
      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not extract a variable from its symbol", status);

      struct {
        uint64_t addr;
        uint64_t size;
      } var_table[var_count];
      GOMP_OFFLOAD_dev2host (agent->device_id, var_table,
                             (void *) var_table_addr, sizeof (var_table));

      for (unsigned i = 0; i < var_count; i++)
        {
          pair->start = var_table[i].addr;
          pair->end = var_table[i].addr + var_table[i].size;
          GCN_DEBUG ("Found variable at %p with size %lu\n",
                     (void *) var_table[i].addr, var_table[i].size);
          pair++;
        }
    }

  if (ind_func_count > 0)
    {
      hsa_status_t status;

      /* Read indirect function table from image.  */
      hsa_executable_symbol_t ind_funcs_symbol;
      status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
                                                     ".offload_ind_func_table",
                                                     agent->id,
                                                     0, &ind_funcs_symbol);

      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not find .offload_ind_func_table symbol in the "
                   "code object", status);

      uint64_t ind_funcs_table_addr;
      status = hsa_fns.hsa_executable_symbol_get_info_fn
        (ind_funcs_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
         &ind_funcs_table_addr);
      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not extract a variable from its symbol", status);

      uint64_t ind_funcs_table[ind_func_count];
      GOMP_OFFLOAD_dev2host (agent->device_id, ind_funcs_table,
                             (void *) ind_funcs_table_addr,
                             sizeof (ind_funcs_table));

      /* Build host->target address map for indirect functions.  */
      uint64_t ind_fn_map[ind_func_count * 2 + 1];
      for (unsigned i = 0; i < ind_func_count; i++)
        {
          ind_fn_map[i * 2] = host_ind_fn_table[i];
          ind_fn_map[i * 2 + 1] = ind_funcs_table[i];
          GCN_DEBUG ("Indirect function %d: %lx->%lx\n",
                     i, host_ind_fn_table[i], ind_funcs_table[i]);
        }
      ind_fn_map[ind_func_count * 2] = 0;
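
      /* The finished map is a flat, zero-terminated array of
         (host address, device address) pairs:

           { host_fn0, dev_fn0, host_fn1, dev_fn1, ..., 0 }

         It is copied to the device below so that device code can translate
         a host function pointer passed into a target region into the
         corresponding device function pointer.  */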

      /* Write the map onto the target.  */
      void *map_target_addr
        = GOMP_OFFLOAD_alloc (agent->device_id, sizeof (ind_fn_map));
      GCN_DEBUG ("Allocated indirect map at %p\n", map_target_addr);

      GOMP_OFFLOAD_host2dev (agent->device_id, map_target_addr,
                             (void *) ind_fn_map,
                             sizeof (ind_fn_map));

      /* Write address of the map onto the target.  */
      hsa_executable_symbol_t symbol;

      status
        = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
                                                XSTRING (GOMP_INDIRECT_ADDR_MAP),
                                                agent->id, 0, &symbol);
      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not find GOMP_INDIRECT_ADDR_MAP in code object",
                   status);

      uint64_t varptr;
      uint32_t varsize;

      status = hsa_fns.hsa_executable_symbol_get_info_fn
        (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
         &varptr);
      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not extract a variable from its symbol", status);
      status = hsa_fns.hsa_executable_symbol_get_info_fn
        (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
         &varsize);
      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not extract a variable size from its symbol",
                   status);

      GCN_DEBUG ("Found GOMP_INDIRECT_ADDR_MAP at %lx with size %d\n",
                 varptr, varsize);

      GOMP_OFFLOAD_host2dev (agent->device_id, (void *) varptr,
                             &map_target_addr,
                             sizeof (map_target_addr));
    }

  GCN_DEBUG ("Looking for variable %s\n", XSTRING (GOMP_ADDITIONAL_ICVS));

  hsa_status_t status;
  hsa_executable_symbol_t var_symbol;
  status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
                                                 XSTRING (GOMP_ADDITIONAL_ICVS),
                                                 agent->id, 0, &var_symbol);
  if (status == HSA_STATUS_SUCCESS)
    {
      uint64_t varptr;
      uint32_t varsize;

      status = hsa_fns.hsa_executable_symbol_get_info_fn
        (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
         &varptr);
      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not extract a variable from its symbol", status);
      status = hsa_fns.hsa_executable_symbol_get_info_fn
        (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
         &varsize);
      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not extract a variable size from its symbol",
                   status);

      pair->start = varptr;
      pair->end = varptr + varsize;
    }
  else
    {
      /* The variable was not in this image.  */
      GCN_DEBUG ("Variable not found in image: %s\n",
                 XSTRING (GOMP_ADDITIONAL_ICVS));
      pair->start = pair->end = 0;
    }

  /* Ensure that constructors are run first.  */
  struct GOMP_kernel_launch_attributes kla =
    { 3,
      /* Grid size.  */
      { 1, 64, 1 },
      /* Work-group size.  */
      { 1, 64, 1 }
    };
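
  /* The attributes above request a three-dimensional launch whose grid and
     work-group sizes are both 1x64x1, i.e. a single work-group containing
     one 64-lane GCN wavefront; the init/fini array kernels are not
     parallel, so no more than that is needed.  */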

  if (module->init_array_func)
    {
      init_kernel (module->init_array_func);
      run_kernel (module->init_array_func, NULL, &kla, NULL, false);
    }
  module->constructors_run_p = true;

  /* Don't report kernels that libgomp need not know about.  */
  if (module->init_array_func)
    kernel_count--;
  if (module->fini_array_func)
    kernel_count--;

  if (rev_fn_table != NULL && kernel_count == 0)
    *rev_fn_table = NULL;
  else if (rev_fn_table != NULL)
    {
      hsa_status_t status;
      hsa_executable_symbol_t var_symbol;
      status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
                                                     ".offload_func_table",
                                                     agent->id, 0, &var_symbol);
      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not find symbol for variable in the code object",
                   status);
      uint64_t fn_table_addr;
      status = hsa_fns.hsa_executable_symbol_get_info_fn
        (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
         &fn_table_addr);
      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not extract a variable from its symbol", status);
      *rev_fn_table = GOMP_PLUGIN_malloc (kernel_count * sizeof (uint64_t));
      GOMP_OFFLOAD_dev2host (agent->device_id, *rev_fn_table,
                             (void *) fn_table_addr,
                             kernel_count * sizeof (uint64_t));
    }

  return kernel_count + var_count + other_count;
}
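
/* Note the layout of *TARGET_TABLE as filled in above: one addr_pair per
   reported kernel, then one per global variable, then a single pair for the
   additional-ICVs struct.  The return value, kernel_count + var_count
   + other_count, tells libgomp how many entries are valid.  */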

/* Unload GCN object-code module described by struct gcn_image_desc in
   TARGET_DATA from agent number N.  Return TRUE on success.  */

bool
GOMP_OFFLOAD_unload_image (int n, unsigned version, const void *target_data)
{
  if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
                         " (expected %u, received %u)",
                         GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
      return false;
    }

  struct agent_info *agent;
  agent = get_agent_info (n);
  if (!agent)
    return false;

  if (pthread_rwlock_wrlock (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
      return false;
    }

  if (!agent->module || agent->module->image_desc != target_data)
    {
      GOMP_PLUGIN_error ("Attempt to unload an image that has never been "
                         "loaded before");
      return false;
    }

  if (!destroy_module (agent->module, true))
    return false;
  free (agent->module);
  agent->module = NULL;
  if (!destroy_hsa_program (agent))
    return false;
  if (pthread_rwlock_unlock (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
      return false;
    }
  return true;
}

/* Deinitialize all information and status associated with agent number N.  We
   do not attempt any synchronization, assuming the user and libgomp will not
   attempt deinitialization of a device that is in any way being used at the
   same time.  Return TRUE on success.  */

bool
GOMP_OFFLOAD_fini_device (int n)
{
  struct agent_info *agent = get_agent_info (n);
  if (!agent)
    return false;

  if (!agent->initialized)
    return true;

  if (agent->omp_async_queue)
    {
      GOMP_OFFLOAD_openacc_async_destruct (agent->omp_async_queue);
      agent->omp_async_queue = NULL;
    }

  if (agent->module)
    {
      if (!destroy_module (agent->module, false))
        return false;
      free (agent->module);
      agent->module = NULL;
    }

  if (!destroy_ephemeral_memories (agent))
    return false;

  if (!destroy_hsa_program (agent))
    return false;

  hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (agent->sync_queue);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error destroying command queue", status);

  if (pthread_mutex_destroy (&agent->prog_mutex))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex");
      return false;
    }
  if (pthread_rwlock_destroy (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN agent rwlock");
      return false;
    }

  if (pthread_mutex_destroy (&agent->async_queues_mutex))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN agent queue mutex");
      return false;
    }
  if (pthread_mutex_destroy (&agent->ephemeral_memories_write_lock))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex");
      return false;
    }
  agent->initialized = false;
  return true;
}

/* Return true if the HSA runtime can run function FN_PTR.  */

bool
GOMP_OFFLOAD_can_run (void *fn_ptr)
{
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;

  init_kernel (kernel);
  if (kernel->initialization_failed)
    GOMP_PLUGIN_fatal ("kernel initialization failed");

  return true;
}

/* Allocate memory on device N.  */

void *
GOMP_OFFLOAD_alloc (int n, size_t size)
{
  struct agent_info *agent = get_agent_info (n);
  return alloc_by_agent (agent, size);
}

/* Free memory from device N.  */

bool
GOMP_OFFLOAD_free (int device, void *ptr)
{
  GCN_DEBUG ("Freeing memory on device %d\n", device);

  hsa_status_t status = hsa_fns.hsa_memory_free_fn (ptr);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not free device memory", status);
      return false;
    }

  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  bool profiling_dispatch_p
    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
  if (profiling_dispatch_p)
    {
      acc_prof_info *prof_info = thr->prof_info;
      acc_event_info data_event_info;
      acc_api_info *api_info = thr->api_info;

      prof_info->event_type = acc_ev_free;

      data_event_info.data_event.event_type = prof_info->event_type;
      data_event_info.data_event.valid_bytes
        = _ACC_DATA_EVENT_INFO_VALID_BYTES;
      data_event_info.data_event.parent_construct
        = acc_construct_parallel;
      data_event_info.data_event.implicit = 1;
      data_event_info.data_event.tool_info = NULL;
      data_event_info.data_event.var_name = NULL;
      data_event_info.data_event.bytes = 0;
      data_event_info.data_event.host_ptr = NULL;
      data_event_info.data_event.device_ptr = (void *) ptr;

      api_info->device_api = acc_device_api_other;

      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
                                            api_info);
    }

  return true;
}
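
/* The acc_ev_free event dispatched above reports bytes == 0 and
   host_ptr == NULL: the plugin does not track allocation sizes, so at free
   time only the device pointer is meaningful to the profiling tool.  */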

/* Copy data from DEVICE to host.  */

bool
GOMP_OFFLOAD_dev2host (int device, void *dst, const void *src, size_t n)
{
  GCN_DEBUG ("Copying %zu bytes from device %d (%p) to host (%p)\n", n, device,
             src, dst);
  hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
  if (status != HSA_STATUS_SUCCESS)
    {
      GOMP_PLUGIN_error ("memory copy failed");
      return false;
    }
  return true;
}

/* Copy data from host to DEVICE.  */

bool
GOMP_OFFLOAD_host2dev (int device, void *dst, const void *src, size_t n)
{
  GCN_DEBUG ("Copying %zu bytes from host (%p) to device %d (%p)\n", n, src,
             device, dst);
  hsa_memory_copy_wrapper (dst, src, n);
  return true;
}

/* Copy data within DEVICE.  Do the copy asynchronously, if appropriate.  */

bool
GOMP_OFFLOAD_dev2dev (int device, void *dst, const void *src, size_t n)
{
  struct gcn_thread *thread_data = gcn_thread ();

  if (thread_data && !async_synchronous_p (thread_data->async))
    {
      struct agent_info *agent = get_agent_info (device);
      maybe_init_omp_async (agent);
      queue_push_copy (agent->omp_async_queue, dst, src, n);
      return true;
    }

  GCN_DEBUG ("Copying %zu bytes from device %d (%p) to device %d (%p)\n", n,
             device, src, device, dst);
  hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
  if (status != HSA_STATUS_SUCCESS)
    {
      GOMP_PLUGIN_error ("memory copy failed");
      return false;
    }
  return true;
}

/* Here <quantity>_size refers to <quantity> multiplied by size -- i.e.
   measured in bytes.  So we have:

     dim1_size: number of bytes to copy on innermost dimension ("row")
     dim0_len: number of rows to copy
     dst: base pointer for destination of copy
     dst_offset1_size: innermost row offset (for dest), in bytes
     dst_offset0_len: offset, number of rows (for dest)
     dst_dim1_size: whole-array dest row length, in bytes (pitch)
     src: base pointer for source of copy
     src_offset1_size: innermost row offset (for source), in bytes
     src_offset0_len: offset, number of rows (for source)
     src_dim1_size: whole-array source row length, in bytes (pitch)  */

int
GOMP_OFFLOAD_memcpy2d (int dst_ord, int src_ord, size_t dim1_size,
                       size_t dim0_len, void *dst, size_t dst_offset1_size,
                       size_t dst_offset0_len, size_t dst_dim1_size,
                       const void *src, size_t src_offset1_size,
                       size_t src_offset0_len, size_t src_dim1_size)
{
  if (!hsa_fns.hsa_amd_memory_lock_fn
      || !hsa_fns.hsa_amd_memory_unlock_fn
      || !hsa_fns.hsa_amd_memory_async_copy_rect_fn)
    return -1;

  /* GCN hardware requires 4-byte alignment for base addresses & pitches.  Bail
     out quietly if we have anything oddly-aligned rather than letting the
     driver raise an error.  */
  if ((((uintptr_t) dst) & 3) != 0 || (((uintptr_t) src) & 3) != 0)
    return -1;

  if ((dst_dim1_size & 3) != 0 || (src_dim1_size & 3) != 0)
    return -1;

  /* Only handle host to device or device to host transfers here.  */
  if ((dst_ord == -1 && src_ord == -1)
      || (dst_ord != -1 && src_ord != -1))
    return -1;

  hsa_amd_copy_direction_t dir
    = (src_ord == -1) ? hsaHostToDevice : hsaDeviceToHost;
  hsa_agent_t copy_agent;

  /* We need to pin (lock) host memory before we start the transfer.  Try to
     lock the minimum size necessary, i.e. using partial first/last rows of the
     whole array.  Something like this:

               rows -->
            ..............
        c | ..#######+++++ <- first row apart from {src,dst}_offset1_size
        o | ++#######+++++ <- whole row
        l | ++#######+++++ <- "
        s v ++#######..... <- last row apart from trailing remainder
            ..............

     We could split very large transfers into several rectangular copies, but
     that is unimplemented for now.  */

  size_t bounded_size_host, first_elem_offset_host;
  void *host_ptr;
  if (dir == hsaHostToDevice)
    {
      bounded_size_host = src_dim1_size * (dim0_len - 1) + dim1_size;
      first_elem_offset_host = src_offset0_len * src_dim1_size
                               + src_offset1_size;
      host_ptr = (void *) src;
      struct agent_info *agent = get_agent_info (dst_ord);
      copy_agent = agent->id;
    }
  else
    {
      bounded_size_host = dst_dim1_size * (dim0_len - 1) + dim1_size;
      first_elem_offset_host = dst_offset0_len * dst_dim1_size
                               + dst_offset1_size;
      host_ptr = dst;
      struct agent_info *agent = get_agent_info (src_ord);
      copy_agent = agent->id;
    }

  void *agent_ptr;

  hsa_status_t status
    = hsa_fns.hsa_amd_memory_lock_fn (host_ptr + first_elem_offset_host,
                                      bounded_size_host, NULL, 0, &agent_ptr);
  /* We can't lock the host memory: don't give up though, we might still be
     able to use the slow path in our caller.  So, don't make this an
     error.  */
  if (status != HSA_STATUS_SUCCESS)
    return -1;

  hsa_pitched_ptr_t dstpp, srcpp;
  hsa_dim3_t dst_offsets, src_offsets, ranges;

  int retval = 1;

  hsa_signal_t completion_signal;
  status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &completion_signal);
  if (status != HSA_STATUS_SUCCESS)
    {
      retval = -1;
      goto unlock;
    }

  if (dir == hsaHostToDevice)
    {
      srcpp.base = agent_ptr - first_elem_offset_host;
      dstpp.base = dst;
    }
  else
    {
      srcpp.base = (void *) src;
      dstpp.base = agent_ptr - first_elem_offset_host;
    }

  srcpp.pitch = src_dim1_size;
  srcpp.slice = 0;

  src_offsets.x = src_offset1_size;
  src_offsets.y = src_offset0_len;
  src_offsets.z = 0;

  dstpp.pitch = dst_dim1_size;
  dstpp.slice = 0;

  dst_offsets.x = dst_offset1_size;
  dst_offsets.y = dst_offset0_len;
  dst_offsets.z = 0;

  ranges.x = dim1_size;
  ranges.y = dim0_len;
  ranges.z = 1;

  status
    = hsa_fns.hsa_amd_memory_async_copy_rect_fn (&dstpp, &dst_offsets, &srcpp,
                                                 &src_offsets, &ranges,
                                                 copy_agent, dir, 0, NULL,
                                                 completion_signal);
  /* If the rectangular copy fails, we might still be able to use the slow
     path.  We need to unlock the host memory though, so don't return
     immediately.  */
  if (status != HSA_STATUS_SUCCESS)
    retval = -1;
  else
    hsa_fns.hsa_signal_wait_acquire_fn (completion_signal,
                                        HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX,
                                        HSA_WAIT_STATE_ACTIVE);

  hsa_fns.hsa_signal_destroy_fn (completion_signal);

unlock:
  status = hsa_fns.hsa_amd_memory_unlock_fn (host_ptr + first_elem_offset_host);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not unlock host memory", status);

  return retval;
}
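
/* An illustrative sketch (not part of the plugin): a host-to-device copy of
   a sub-rectangle using GOMP_OFFLOAD_memcpy2d.  The device ordinal 0, the
   helper name and both buffers are assumptions; -1 denotes the host side.
   A return of -1 just means the caller must fall back to the generic
   row-by-row path.  */

static int
example_copy_subrect (void *dev_buf, const void *host_buf)
{
  /* Copy 2 rows of 8 bytes, skipping 1 row and 4 bytes of leading offset on
     both sides, where both arrays have a 16-byte pitch.  All base addresses
     (assumed suitably aligned) and pitches satisfy the 4-byte alignment
     requirement checked above.  */
  return GOMP_OFFLOAD_memcpy2d (0 /* dst_ord: device 0 */,
                                -1 /* src_ord: host */,
                                8 /* dim1_size */, 2 /* dim0_len */,
                                dev_buf, 4, 1, 16,
                                host_buf, 4, 1, 16);
}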

/* As above, <quantity>_size refers to <quantity> multiplied by size -- i.e.
   measured in bytes.  So we have:

     dim2_size: number of bytes to copy on innermost dimension ("row")
     dim1_len: number of rows per slice to copy
     dim0_len: number of slices to copy
     dst: base pointer for destination of copy
     dst_offset2_size: innermost row offset (for dest), in bytes
     dst_offset1_len: offset, number of rows (for dest)
     dst_offset0_len: offset, number of slices (for dest)
     dst_dim2_size: whole-array dest row length, in bytes (pitch)
     dst_dim1_len: whole-array number of rows in slice (for dest)
     src: base pointer for source of copy
     src_offset2_size: innermost row offset (for source), in bytes
     src_offset1_len: offset, number of rows (for source)
     src_offset0_len: offset, number of slices (for source)
     src_dim2_size: whole-array source row length, in bytes (pitch)
     src_dim1_len: whole-array number of rows in slice (for source)  */

int
GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size,
                       size_t dim1_len, size_t dim0_len, void *dst,
                       size_t dst_offset2_size, size_t dst_offset1_len,
                       size_t dst_offset0_len, size_t dst_dim2_size,
                       size_t dst_dim1_len, const void *src,
                       size_t src_offset2_size, size_t src_offset1_len,
                       size_t src_offset0_len, size_t src_dim2_size,
                       size_t src_dim1_len)
{
  if (!hsa_fns.hsa_amd_memory_lock_fn
      || !hsa_fns.hsa_amd_memory_unlock_fn
      || !hsa_fns.hsa_amd_memory_async_copy_rect_fn)
    return -1;

  /* GCN hardware requires 4-byte alignment for base addresses & pitches.  Bail
     out quietly if we have anything oddly-aligned rather than letting the
     driver raise an error.  */
  if ((((uintptr_t) dst) & 3) != 0 || (((uintptr_t) src) & 3) != 0)
    return -1;

  if ((dst_dim2_size & 3) != 0 || (src_dim2_size & 3) != 0)
    return -1;

  /* Only handle host to device or device to host transfers here.  */
  if ((dst_ord == -1 && src_ord == -1)
      || (dst_ord != -1 && src_ord != -1))
    return -1;

  hsa_amd_copy_direction_t dir
    = (src_ord == -1) ? hsaHostToDevice : hsaDeviceToHost;
  hsa_agent_t copy_agent;

  /* We need to pin (lock) host memory before we start the transfer.  Try to
     lock the minimum size necessary, i.e. using partial first/last slices of
     the whole 3D array.  Something like this:

            slice 0:          slice 1:          slice 2:
             __________        __________        __________
          ^ /+++++++++/ :     /+++++++++/ :     /         /
   column  /+++##++++/| |    /+++##++++/| |    /+++##    /    # = subarray
          /+++##++++/ | |   /+++##++++/ | |   /+++##++++/     + = area to pin
         v /_________/  :   /_________/  :    /_________/
               row --->

     We could split very large transfers into several rectangular copies, but
     that is unimplemented for now.  */

  size_t bounded_size_host, first_elem_offset_host;
  void *host_ptr;
  if (dir == hsaHostToDevice)
    {
      size_t slice_bytes = src_dim2_size * src_dim1_len;
      bounded_size_host = slice_bytes * (dim0_len - 1)
                          + src_dim2_size * (dim1_len - 1)
                          + dim2_size;
      first_elem_offset_host = src_offset0_len * slice_bytes
                               + src_offset1_len * src_dim2_size
                               + src_offset2_size;
      host_ptr = (void *) src;
      struct agent_info *agent = get_agent_info (dst_ord);
      copy_agent = agent->id;
    }
  else
    {
      size_t slice_bytes = dst_dim2_size * dst_dim1_len;
      bounded_size_host = slice_bytes * (dim0_len - 1)
                          + dst_dim2_size * (dim1_len - 1)
                          + dim2_size;
      first_elem_offset_host = dst_offset0_len * slice_bytes
                               + dst_offset1_len * dst_dim2_size
                               + dst_offset2_size;
      host_ptr = dst;
      struct agent_info *agent = get_agent_info (src_ord);
      copy_agent = agent->id;
    }
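
  /* Worked example of the arithmetic above, host-to-device: with
     dim0_len == 2, dim1_len == 3, dim2_size == 8, src_dim1_len == 4 and
     src_dim2_size == 16, slice_bytes is 64, so we pin
     64*1 + 16*2 + 8 == 104 bytes, starting at the first element actually
     copied rather than at the start of the enclosing array.  */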

  void *agent_ptr;

  hsa_status_t status
    = hsa_fns.hsa_amd_memory_lock_fn (host_ptr + first_elem_offset_host,
                                      bounded_size_host, NULL, 0, &agent_ptr);
  /* We can't lock the host memory: don't give up though, we might still be
     able to use the slow path in our caller (maybe even with iterated memcpy2d
     calls).  So, don't make this an error.  */
  if (status != HSA_STATUS_SUCCESS)
    return -1;

  hsa_pitched_ptr_t dstpp, srcpp;
  hsa_dim3_t dst_offsets, src_offsets, ranges;

  int retval = 1;

  hsa_signal_t completion_signal;
  status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &completion_signal);
  if (status != HSA_STATUS_SUCCESS)
    {
      retval = -1;
      goto unlock;
    }

  if (dir == hsaHostToDevice)
    {
      srcpp.base = agent_ptr - first_elem_offset_host;
      dstpp.base = dst;
    }
  else
    {
      srcpp.base = (void *) src;
      dstpp.base = agent_ptr - first_elem_offset_host;
    }

  /* Pitch is measured in bytes.  */
  srcpp.pitch = src_dim2_size;
  /* Slice is also measured in bytes (i.e. total per-slice).  */
  srcpp.slice = src_dim2_size * src_dim1_len;

  src_offsets.x = src_offset2_size;
  src_offsets.y = src_offset1_len;
  src_offsets.z = src_offset0_len;

  /* As above.  */
  dstpp.pitch = dst_dim2_size;
  dstpp.slice = dst_dim2_size * dst_dim1_len;

  dst_offsets.x = dst_offset2_size;
  dst_offsets.y = dst_offset1_len;
  dst_offsets.z = dst_offset0_len;

  ranges.x = dim2_size;
  ranges.y = dim1_len;
  ranges.z = dim0_len;

  status
    = hsa_fns.hsa_amd_memory_async_copy_rect_fn (&dstpp, &dst_offsets, &srcpp,
                                                 &src_offsets, &ranges,
                                                 copy_agent, dir, 0, NULL,
                                                 completion_signal);
  /* If the rectangular copy fails, we might still be able to use the slow
     path.  We need to unlock the host memory though, so don't return
     immediately.  */
  if (status != HSA_STATUS_SUCCESS)
    retval = -1;
  else
    {
      hsa_signal_value_t sv
        = hsa_fns.hsa_signal_wait_acquire_fn (completion_signal,
                                              HSA_SIGNAL_CONDITION_LT, 1,
                                              UINT64_MAX,
                                              HSA_WAIT_STATE_ACTIVE);
      if (sv < 0)
        {
          GCN_WARNING ("async copy rect failure");
          retval = -1;
        }
    }

  hsa_fns.hsa_signal_destroy_fn (completion_signal);

unlock:
  status = hsa_fns.hsa_amd_memory_unlock_fn (host_ptr + first_elem_offset_host);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not unlock host memory", status);

  return retval;
}

/* }}} */
/* {{{ OpenMP Plugin API */

/* Run a synchronous OpenMP kernel on DEVICE and pass it an array of pointers
   in VARS as a parameter.  The kernel is identified by FN_PTR which must point
   to a kernel_info structure, and must have previously been loaded to the
   specified device.  */

void
GOMP_OFFLOAD_run (int device, void *fn_ptr, void *vars, void **args)
{
  struct agent_info *agent = get_agent_info (device);
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
  struct GOMP_kernel_launch_attributes def;
  struct GOMP_kernel_launch_attributes *kla;
  assert (agent == kernel->agent);

  /* If we get here then the kernel must be OpenMP.  */
  kernel->kind = KIND_OPENMP;

  if (!parse_target_attributes (args, &def, &kla, agent))
    {
      GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
      return;
    }
  run_kernel (kernel, vars, kla, NULL, false);
}

/* Run an asynchronous OpenMP kernel on DEVICE.  This is similar to
   GOMP_OFFLOAD_run except that the launch is queued and there is a call to
   GOMP_PLUGIN_target_task_completion when it has finished.  */

void
GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
                        void **args, void *async_data)
{
  GCN_DEBUG ("GOMP_OFFLOAD_async_run invoked\n");
  struct agent_info *agent = get_agent_info (device);
  struct kernel_info *kernel = (struct kernel_info *) tgt_fn;
  struct GOMP_kernel_launch_attributes def;
  struct GOMP_kernel_launch_attributes *kla;
  assert (agent == kernel->agent);

  /* If we get here then the kernel must be OpenMP.  */
  kernel->kind = KIND_OPENMP;

  if (!parse_target_attributes (args, &def, &kla, agent))
    {
      GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
      return;
    }

  maybe_init_omp_async (agent);
  queue_push_launch (agent->omp_async_queue, kernel, tgt_vars, kla);
  queue_push_callback (agent->omp_async_queue,
                       GOMP_PLUGIN_target_task_completion, async_data);
}
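
/* Both entries just pushed share one in-order queue, drained by a dedicated
   plugin thread, so GOMP_PLUGIN_target_task_completion (ASYNC_DATA) is only
   called back once the kernel launched immediately before it has finished
   running.  */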

/* }}} */
/* {{{ OpenACC Plugin API */

/* Run a synchronous OpenACC kernel.  The device number is inferred from the
   already-loaded KERNEL.  */

void
GOMP_OFFLOAD_openacc_exec (void (*fn_ptr) (void *),
                           size_t mapnum __attribute__((unused)),
                           void **hostaddrs __attribute__((unused)),
                           void **devaddrs, unsigned *dims,
                           void *targ_mem_desc)
{
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;

  gcn_exec (kernel, devaddrs, dims, targ_mem_desc, false, NULL);
}

/* Run an asynchronous OpenACC kernel on the specified queue.  */

void
GOMP_OFFLOAD_openacc_async_exec (void (*fn_ptr) (void *),
                                 size_t mapnum __attribute__((unused)),
                                 void **hostaddrs __attribute__((unused)),
                                 void **devaddrs,
                                 unsigned *dims, void *targ_mem_desc,
                                 struct goacc_asyncqueue *aq)
{
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;

  gcn_exec (kernel, devaddrs, dims, targ_mem_desc, true, aq);
}
/* Create a new asynchronous thread and queue for running future kernels.  */

struct goacc_asyncqueue *
GOMP_OFFLOAD_openacc_async_construct (int device)
{
  struct agent_info *agent = get_agent_info (device);

  pthread_mutex_lock (&agent->async_queues_mutex);

  struct goacc_asyncqueue *aq = GOMP_PLUGIN_malloc (sizeof (*aq));
  aq->agent = get_agent_info (device);
  aq->prev = NULL;
  aq->next = agent->async_queues;
  if (aq->next)
    {
      aq->next->prev = aq;
      aq->id = aq->next->id + 1;
    }
  else
    aq->id = 1;
  agent->async_queues = aq;

  aq->queue_first = 0;
  aq->queue_n = 0;
  aq->drain_queue_stop = 0;

  if (pthread_mutex_init (&aq->mutex, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
      return NULL;
    }
  if (pthread_cond_init (&aq->queue_cond_in, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
      return NULL;
    }
  if (pthread_cond_init (&aq->queue_cond_out, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
      return NULL;
    }

  hsa_status_t status = hsa_fns.hsa_queue_create_fn (agent->id,
                                                     ASYNC_QUEUE_SIZE,
                                                     HSA_QUEUE_TYPE_MULTI,
                                                     hsa_queue_callback, NULL,
                                                     UINT32_MAX, UINT32_MAX,
                                                     &aq->hsa_queue);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Error creating command queue", status);

  int err = pthread_create (&aq->thread_drain_queue, NULL, &drain_queue, aq);
  if (err != 0)
    GOMP_PLUGIN_fatal ("GCN asynchronous thread creation failed: %s",
                       strerror (err));
  GCN_DEBUG ("Async thread %d:%d: created\n", aq->agent->device_id,
             aq->id);

  pthread_mutex_unlock (&agent->async_queues_mutex);

  return aq;
}
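
/* An illustrative sketch (not part of the plugin): the expected pairing of
   queue construction and destruction.  Device ordinal 0 and the helper name
   are assumptions.  */

static bool
example_queue_roundtrip (void)
{
  struct goacc_asyncqueue *aq = GOMP_OFFLOAD_openacc_async_construct (0);
  if (!aq)
    return false;
  /* ... push copies, kernel launches or callbacks onto AQ here ...  */
  /* Waits for any currently-running task, cancels queued ones, and tears
     down the drain thread, the HSA queue and the list links.  */
  return GOMP_OFFLOAD_openacc_async_destruct (aq);
}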

/* Destroy an existing asynchronous thread and queue.  Waits for any
   currently-running task to complete, but cancels any queued tasks.  */

bool
GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = aq->agent;

  finalize_async_thread (aq);

  pthread_mutex_lock (&agent->async_queues_mutex);

  int err;
  if ((err = pthread_mutex_destroy (&aq->mutex)))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue mutex: %d", err);
      goto fail;
    }
  if (pthread_cond_destroy (&aq->queue_cond_in))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
      goto fail;
    }
  if (pthread_cond_destroy (&aq->queue_cond_out))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
      goto fail;
    }
  hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (aq->hsa_queue);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Error destroying command queue", status);
      goto fail;
    }

  if (aq->prev)
    aq->prev->next = aq->next;
  if (aq->next)
    aq->next->prev = aq->prev;
  if (agent->async_queues == aq)
    agent->async_queues = aq->next;

  GCN_DEBUG ("Async thread %d:%d: destroyed\n", agent->device_id, aq->id);

  free (aq);
  pthread_mutex_unlock (&agent->async_queues_mutex);
  return true;

fail:
  pthread_mutex_unlock (&agent->async_queues_mutex);
  return false;
}

/* Return true if the specified async queue is currently empty.  */

int
GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
{
  return queue_empty (aq);
}

/* Block until the specified queue has executed all its tasks and the
   queue is empty.  */

bool
GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
{
  wait_queue (aq);
  return true;
}

/* Add a serialization point across two async queues.  Any new tasks added to
   AQ2, after this call, will not run until all tasks on AQ1, at the time
   of this call, have completed.  */

bool
GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
                                      struct goacc_asyncqueue *aq2)
{
  /* For serialize, stream aq2 waits for aq1 to complete work that has been
     scheduled to run on it up to this point.  */
  if (aq1 != aq2)
    {
      struct placeholder *placeholderp = queue_push_placeholder (aq1);
      queue_push_asyncwait (aq2, placeholderp);
    }
  return true;
}

/* Add an opaque callback to the given async queue.  */

void
GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
                                           void (*fn) (void *), void *data)
{
  queue_push_callback (aq, fn, data);
}

/* Queue up an asynchronous data copy from host to DEVICE.  */

bool
GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
                                     size_t n, struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = get_agent_info (device);
  assert (agent == aq->agent);
  queue_push_copy (aq, dst, src, n);
  return true;
}

/* Queue up an asynchronous data copy from DEVICE to host.  */

bool
GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src,
                                     size_t n, struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = get_agent_info (device);
  assert (agent == aq->agent);
  queue_push_copy (aq, dst, src, n);
  return true;
}

union goacc_property_value
GOMP_OFFLOAD_openacc_get_property (int device, enum goacc_property prop)
{
  struct agent_info *agent = get_agent_info (device);

  union goacc_property_value propval = { .val = 0 };

  switch (prop)
    {
    case GOACC_PROPERTY_FREE_MEMORY:
      /* Not supported.  */
      break;
    case GOACC_PROPERTY_MEMORY:
      {
        size_t size;
        hsa_region_t region = agent->data_region;
        hsa_status_t status =
          hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
        if (status == HSA_STATUS_SUCCESS)
          propval.val = size;
        break;
      }
    case GOACC_PROPERTY_NAME:
      propval.ptr = agent->name;
      break;
    case GOACC_PROPERTY_VENDOR:
      propval.ptr = agent->vendor_name;
      break;
    case GOACC_PROPERTY_DRIVER:
      propval.ptr = hsa_context.driver_version_s;
      break;
    }

  return propval;
}
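
/* An illustrative sketch (not part of the plugin): querying two properties
   of device 0.  The ordinal and the helper name are assumptions.  */

static void
example_query_properties (void)
{
  union goacc_property_value mem
    = GOMP_OFFLOAD_openacc_get_property (0, GOACC_PROPERTY_MEMORY);
  union goacc_property_value name
    = GOMP_OFFLOAD_openacc_get_property (0, GOACC_PROPERTY_NAME);
  /* mem.val is the size of the data region in bytes (0 if the query
     failed); name.ptr points at the agent's name string.  */
  GCN_DEBUG ("Device 0: %s, %zu bytes\n", (const char *) name.ptr,
             (size_t) mem.val);
}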

/* Set up plugin-specific thread-local-data (host-side).  */

void *
GOMP_OFFLOAD_openacc_create_thread_data (int ord __attribute__((unused)))
{
  struct gcn_thread *thread_data
    = GOMP_PLUGIN_malloc (sizeof (struct gcn_thread));

  thread_data->async = GOMP_ASYNC_SYNC;

  return (void *) thread_data;
}

/* Clean up plugin-specific thread-local-data.  */

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}

/* }}} */