/* Plugin for AMD GCN execution.

   Copyright (C) 2013-2024 Free Software Foundation, Inc.

   Contributed by Mentor Embedded

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* {{{ Includes and defines  */

#include "config.h"
#include "symcat.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <inttypes.h>
#include <stdbool.h>
#include <limits.h>
#include <hsa.h>
#include <hsa_ext_amd.h>
#include <dlfcn.h>
#include <signal.h>
#include "libgomp-plugin.h"
#include "config/gcn/libgomp-gcn.h"  /* For struct output.  */
#include "gomp-constants.h"
#include <elf.h>
#include "oacc-plugin.h"
#include "oacc-int.h"
#include <assert.h>

/* These probably won't be in elf.h for a while.  */
#ifndef R_AMDGPU_NONE
#define R_AMDGPU_NONE		0
#define R_AMDGPU_ABS32_LO	1	/* (S + A) & 0xFFFFFFFF  */
#define R_AMDGPU_ABS32_HI	2	/* (S + A) >> 32  */
#define R_AMDGPU_ABS64		3	/* S + A  */
#define R_AMDGPU_REL32		4	/* S + A - P  */
#define R_AMDGPU_REL64		5	/* S + A - P  */
#define R_AMDGPU_ABS32		6	/* S + A  */
#define R_AMDGPU_GOTPCREL	7	/* G + GOT + A - P  */
#define R_AMDGPU_GOTPCREL32_LO	8	/* (G + GOT + A - P) & 0xFFFFFFFF  */
#define R_AMDGPU_GOTPCREL32_HI	9	/* (G + GOT + A - P) >> 32  */
#define R_AMDGPU_REL32_LO	10	/* (S + A - P) & 0xFFFFFFFF  */
#define R_AMDGPU_REL32_HI	11	/* (S + A - P) >> 32  */
#define R_AMDGPU_RELATIVE64	13	/* B + A  */
#endif
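
/* For illustration only (not compiled into the plugin): how a loader would
   apply the paired 32-bit relocations above.  S is the symbol value and A
   the addend; the LO/HI pair splits a 64-bit absolute address across two
   32-bit fields.  The function and slot names are hypothetical.  */
#if 0
static void
example_apply_abs32_pair (uint32_t *lo_slot, uint32_t *hi_slot,
			  uint64_t s, uint64_t a)
{
  *lo_slot = (uint32_t) ((s + a) & 0xFFFFFFFF);	/* R_AMDGPU_ABS32_LO.  */
  *hi_slot = (uint32_t) ((s + a) >> 32);	/* R_AMDGPU_ABS32_HI.  */
}
#endif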

/* GCN specific definitions for asynchronous queues.  */

#define ASYNC_QUEUE_SIZE 64
#define DRAIN_QUEUE_SYNCHRONOUS_P false
#define DEBUG_QUEUES 0
#define DEBUG_THREAD_SLEEP 0
#define DEBUG_THREAD_SIGNAL 0

/* Defaults.  */
#define DEFAULT_GCN_HEAP_SIZE (100*1024*1024)  /* 100 MB.  */

/* Secure getenv() which returns NULL if running as SUID/SGID.  */
#ifndef HAVE_SECURE_GETENV
#ifdef HAVE___SECURE_GETENV
#define secure_getenv __secure_getenv
#elif defined (HAVE_UNISTD_H) && defined(HAVE_GETUID) && defined(HAVE_GETEUID) \
  && defined(HAVE_GETGID) && defined(HAVE_GETEGID)

#include <unistd.h>

/* Implementation of secure_getenv() for targets where it is not provided but
   we have at least means to test real and effective IDs.  */

static char *
secure_getenv (const char *name)
{
  if ((getuid () == geteuid ()) && (getgid () == getegid ()))
    return getenv (name);
  else
    return NULL;
}

#else
#define secure_getenv getenv
#endif
#endif

/* }}}  */
/* {{{ Types  */

/* GCN-specific implementation of the GOMP_PLUGIN_acc_thread data.  */

struct gcn_thread
{
  /* The thread number from the async clause, or GOMP_ASYNC_SYNC.  */
  int async;
};

/* The HSA runtime is dlopened, so the following structure holds the function
   pointers through which this plug-in calls into it.  */

struct hsa_runtime_fn_info
{
  /* HSA runtime.  */
  hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
					const char **status_string);
  hsa_status_t (*hsa_system_get_info_fn) (hsa_system_info_t attribute,
					  void *value);
  hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
					 hsa_agent_info_t attribute,
					 void *value);
  hsa_status_t (*hsa_isa_get_info_fn) (hsa_isa_t isa,
				       hsa_isa_info_t attribute,
				       uint32_t index,
				       void *value);
  hsa_status_t (*hsa_init_fn) (void);
  hsa_status_t (*hsa_iterate_agents_fn)
    (hsa_status_t (*callback) (hsa_agent_t agent, void *data), void *data);
  hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
					  hsa_region_info_t attribute,
					  void *value);
  hsa_status_t (*hsa_queue_create_fn)
    (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
     void (*callback) (hsa_status_t status, hsa_queue_t *source, void *data),
     void *data, uint32_t private_segment_size,
     uint32_t group_segment_size, hsa_queue_t **queue);
  hsa_status_t (*hsa_agent_iterate_regions_fn)
    (hsa_agent_t agent,
     hsa_status_t (*callback) (hsa_region_t region, void *data), void *data);
  hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
  hsa_status_t (*hsa_executable_create_fn)
    (hsa_profile_t profile, hsa_executable_state_t executable_state,
     const char *options, hsa_executable_t *executable);
  hsa_status_t (*hsa_executable_global_variable_define_fn)
    (hsa_executable_t executable, const char *variable_name, void *address);
  hsa_status_t (*hsa_executable_load_code_object_fn)
    (hsa_executable_t executable, hsa_agent_t agent,
     hsa_code_object_t code_object, const char *options);
  hsa_status_t (*hsa_executable_freeze_fn) (hsa_executable_t executable,
					    const char *options);
  hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
					uint32_t num_consumers,
					const hsa_agent_t *consumers,
					hsa_signal_t *signal);
  hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
					  void **ptr);
  hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent,
					      hsa_access_permission_t access);
  hsa_status_t (*hsa_memory_copy_fn) (void *dst, const void *src, size_t size);
  hsa_status_t (*hsa_memory_free_fn) (void *ptr);
  hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
  hsa_status_t (*hsa_executable_get_symbol_fn)
    (hsa_executable_t executable, const char *module_name,
     const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
     hsa_executable_symbol_t *symbol);
  hsa_status_t (*hsa_executable_symbol_get_info_fn)
    (hsa_executable_symbol_t executable_symbol,
     hsa_executable_symbol_info_t attribute, void *value);
  hsa_status_t (*hsa_executable_iterate_symbols_fn)
    (hsa_executable_t executable,
     hsa_status_t (*callback) (hsa_executable_t executable,
			       hsa_executable_symbol_t symbol, void *data),
     void *data);
  uint64_t (*hsa_queue_add_write_index_release_fn) (const hsa_queue_t *queue,
						    uint64_t value);
  uint64_t (*hsa_queue_load_read_index_acquire_fn) (const hsa_queue_t *queue);
  void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
				       hsa_signal_value_t value);
  void (*hsa_signal_store_release_fn) (hsa_signal_t signal,
				       hsa_signal_value_t value);
  hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
    (hsa_signal_t signal, hsa_signal_condition_t condition,
     hsa_signal_value_t compare_value, uint64_t timeout_hint,
     hsa_wait_state_t wait_state_hint);
  hsa_signal_value_t (*hsa_signal_load_acquire_fn) (hsa_signal_t signal);
  hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);

  hsa_status_t (*hsa_code_object_deserialize_fn)
    (void *serialized_code_object, size_t serialized_code_object_size,
     const char *options, hsa_code_object_t *code_object);
  hsa_status_t (*hsa_amd_memory_lock_fn)
    (void *host_ptr, size_t size, hsa_agent_t *agents, int num_agent,
     void **agent_ptr);
  hsa_status_t (*hsa_amd_memory_unlock_fn) (void *host_ptr);
  hsa_status_t (*hsa_amd_memory_async_copy_rect_fn)
    (const hsa_pitched_ptr_t *dst, const hsa_dim3_t *dst_offset,
     const hsa_pitched_ptr_t *src, const hsa_dim3_t *src_offset,
     const hsa_dim3_t *range, hsa_agent_t copy_agent,
     hsa_amd_copy_direction_t dir, uint32_t num_dep_signals,
     const hsa_signal_t *dep_signals, hsa_signal_t completion_signal);
};

/* Structure describing the run-time and grid properties of an HSA kernel
   launch.  This needs to match the format passed to GOMP_OFFLOAD_run.  */

struct GOMP_kernel_launch_attributes
{
  /* Number of dimensions the workload has.  Maximum number is 3.  */
  uint32_t ndim;
  /* Size of the grid in the three respective dimensions.  */
  uint32_t gdims[3];
  /* Size of work-groups in the respective dimensions.  */
  uint32_t wdims[3];
};
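
/* For illustration only (not compiled into the plugin): a launch of 1024
   teams of 16 threads, each thread 64 work items wide, described with the
   structure above.  The numbers are made up; they mirror the defaults
   chosen in parse_target_attributes below.  */
#if 0
static void
example_launch_attributes (struct GOMP_kernel_launch_attributes *kla)
{
  kla->ndim = 3;
  kla->gdims[0] = 1024;	/* Teams.  */
  kla->gdims[1] = 64;	/* Each thread is 64 work items wide.  */
  kla->gdims[2] = 16;	/* Threads per team.  */
  kla->wdims[0] = 1;	/* Single team per work-group.  */
  kla->wdims[1] = 64;
  kla->wdims[2] = 16;
}
#endif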

/* Collection of information needed to dispatch a kernel.  */

struct kernel_dispatch
{
  struct agent_info *agent;
  /* Pointer to a command queue associated with a kernel dispatch agent.  */
  void *queue;
  /* Pointer to a memory space used for kernel arguments passing.  */
  void *kernarg_address;
  /* Kernel object.  */
  uint64_t object;
  /* Synchronization signal used for dispatch synchronization.  */
  uint64_t signal;
  /* Private segment size.  */
  uint32_t private_segment_size;
  /* Group segment size.  */
  uint32_t group_segment_size;
};

/* Structure of the kernargs segment, supporting console output.

   This needs to match the definitions in Newlib, and the expectations
   in libgomp target code.  */

struct kernargs {
  struct kernargs_abi abi;

  /* Output data.  */
  struct output output_data;
};

/* A queue entry for a future asynchronous launch.  */

struct kernel_launch
{
  struct kernel_info *kernel;
  void *vars;
  struct GOMP_kernel_launch_attributes kla;
};

/* A queue entry for a future callback.  */

struct callback
{
  void (*fn) (void *);
  void *data;
};

/* A data struct for the copy_data callback.  */

struct copy_data
{
  void *dst;
  const void *src;
  size_t len;
  struct goacc_asyncqueue *aq;
};

/* A queue entry for a placeholder.  These correspond to a wait event.  */

struct placeholder
{
  int executed;
  pthread_cond_t cond;
  pthread_mutex_t mutex;
};

/* A queue entry for a wait directive.  */

struct asyncwait_info
{
  struct placeholder *placeholderp;
};

/* Encode the type of an entry in an async queue.  */

enum entry_type
{
  KERNEL_LAUNCH,
  CALLBACK,
  ASYNC_WAIT,
  ASYNC_PLACEHOLDER
};

/* An entry in an async queue.  */

struct queue_entry
{
  enum entry_type type;
  union {
    struct kernel_launch launch;
    struct callback callback;
    struct asyncwait_info asyncwait;
    struct placeholder placeholder;
  } u;
};

/* An async queue header.

   OpenMP may create one of these.
   OpenACC may create many.  */

struct goacc_asyncqueue
{
  struct agent_info *agent;
  hsa_queue_t *hsa_queue;

  pthread_t thread_drain_queue;
  pthread_mutex_t mutex;
  pthread_cond_t queue_cond_in;
  pthread_cond_t queue_cond_out;
  struct queue_entry queue[ASYNC_QUEUE_SIZE];
  int queue_first;
  int queue_n;
  int drain_queue_stop;

  int id;
  struct goacc_asyncqueue *prev;
  struct goacc_asyncqueue *next;
};
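
/* For illustration only (not compiled into the plugin): the ring-buffer
   arithmetic implied by queue_first/queue_n above.  queue_first indexes the
   oldest entry and queue_n counts entries in flight, so the i-th pending
   entry lives at (queue_first + i) % ASYNC_QUEUE_SIZE.  The helper name is
   hypothetical; callers would need to hold aq->mutex.  */
#if 0
static struct queue_entry *
example_nth_pending_entry (struct goacc_asyncqueue *aq, int i)
{
  return &aq->queue[(aq->queue_first + i) % ASYNC_QUEUE_SIZE];
}
#endif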

/* Mkoffload uses this structure to describe a kernel.

   OpenMP kernel dimensions are passed at runtime.
   OpenACC kernel dimensions are passed at compile time, here.  */

struct hsa_kernel_description
{
  const char *name;
  int oacc_dims[3];  /* Only present for GCN kernels.  */
  int sgpr_count;
  int vpgr_count;
};

/* Mkoffload uses this structure to describe an offload variable.  */

struct global_var_info
{
  const char *name;
  void *address;
};

/* Mkoffload uses this structure to describe all the kernels in a
   loadable module.  These are passed to libgomp via static constructors.  */

struct gcn_image_desc
{
  struct gcn_image {
    size_t size;
    void *image;
  } *gcn_image;
  const unsigned kernel_count;
  struct hsa_kernel_description *kernel_infos;
  const unsigned ind_func_count;
  const unsigned global_variable_count;
};

/* This enum mirrors the corresponding LLVM enum's values for all ISAs that we
   support.
   See https://llvm.org/docs/AMDGPUUsage.html#amdgpu-ef-amdgpu-mach-table  */

typedef enum {
  EF_AMDGPU_MACH_UNSUPPORTED = -1,
  EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a,
  EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c,
  EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f,
  EF_AMDGPU_MACH_AMDGCN_GFX908 = 0x030,
  EF_AMDGPU_MACH_AMDGCN_GFX90a = 0x03f,
  EF_AMDGPU_MACH_AMDGCN_GFX1030 = 0x036,
  EF_AMDGPU_MACH_AMDGCN_GFX1100 = 0x041
} EF_AMDGPU_MACH;

const static int EF_AMDGPU_MACH_MASK = 0x000000ff;
typedef EF_AMDGPU_MACH gcn_isa;

/* Description of an HSA GPU agent (device) and the program associated with
   it.  */

struct agent_info
{
  /* The HSA ID of the agent.  Assigned when hsa_context is initialized.  */
  hsa_agent_t id;
  /* The user-visible device number.  */
  int device_id;
  /* Whether the agent has been initialized.  The fields below are usable only
     if it has been.  */
  bool initialized;

  /* The instruction set architecture of the device.  */
  gcn_isa device_isa;
  /* Name of the agent.  */
  char name[64];
  /* Name of the vendor of the agent.  */
  char vendor_name[64];
  /* Command queues of the agent.  */
  hsa_queue_t *sync_queue;
  struct goacc_asyncqueue *async_queues, *omp_async_queue;
  pthread_mutex_t async_queues_mutex;

  /* The HSA memory region from which to allocate kernel arguments.  */
  hsa_region_t kernarg_region;

  /* The HSA memory region from which to allocate device data.  */
  hsa_region_t data_region;

  /* Allocated ephemeral memories (team arena and stack space).  */
  struct ephemeral_memories_list *ephemeral_memories_list;
  pthread_mutex_t ephemeral_memories_write_lock;

  /* Read-write lock that protects kernels which are running or about to be run
     from interference with loading and unloading of images.  Needs to be
     locked for reading while a kernel is being run, and for writing if the
     list of modules is manipulated (and thus the HSA program invalidated).  */
  pthread_rwlock_t module_rwlock;

  /* The module associated with this agent.  */
  struct module_info *module;

  /* Mutex enforcing that only one thread will finalize the HSA program.  A
     thread should have locked agent->module_rwlock for reading before
     acquiring it.  */
  pthread_mutex_t prog_mutex;
  /* Flag whether the HSA program that consists of all the modules has been
     finalized.  */
  bool prog_finalized;
  /* HSA executable - the finalized program that is used to locate kernels.  */
  hsa_executable_t executable;
};

/* Information required to identify, finalize and run any given kernel.  */

enum offload_kind {KIND_UNKNOWN, KIND_OPENMP, KIND_OPENACC};

struct kernel_info
{
  /* Name of the kernel, required to locate it within the GCN object-code
     module.  */
  const char *name;
  /* The specific agent the kernel has been or will be finalized for and run
     on.  */
  struct agent_info *agent;
  /* The specific module in which the kernel is defined.  */
  struct module_info *module;
  /* Information provided by mkoffload associated with the kernel.  */
  struct hsa_kernel_description *description;
  /* Mutex enforcing that at most one thread ever initializes a kernel for
     use.  A thread should have locked agent->module_rwlock for reading before
     acquiring it.  */
  pthread_mutex_t init_mutex;
  /* Flag indicating whether the kernel has been initialized and all fields
     below it contain valid data.  */
  bool initialized;
  /* Flag indicating that the kernel has a problem that blocks its
     execution.  */
  bool initialization_failed;
  /* The object to be put into the dispatch queue.  */
  uint64_t object;
  /* Required size of kernel arguments.  */
  uint32_t kernarg_segment_size;
  /* Required size of group segment.  */
  uint32_t group_segment_size;
  /* Required size of private segment.  */
  uint32_t private_segment_size;
  /* Set up for OpenMP or OpenACC?  */
  enum offload_kind kind;
};

/* Information about a particular GCN module, its image and kernels.  */

struct module_info
{
  /* The description with which the program has registered the image.  */
  struct gcn_image_desc *image_desc;
  /* GCN heap allocation.  */
  struct heap *heap;
  /* Physical boundaries of the loaded module.  */
  Elf64_Addr phys_address_start;
  Elf64_Addr phys_address_end;

  bool constructors_run_p;
  struct kernel_info *init_array_func, *fini_array_func;

  /* Number of kernels in this module.  */
  int kernel_count;
  /* An array of kernel_info structures describing each kernel in this
     module.  */
  struct kernel_info kernels[];
};

/* A linked list of memory arenas allocated on the device.
   These are used by OpenMP, as a means to optimize per-team malloc,
   and for host-accessible stack space.  */

struct ephemeral_memories_list
{
  struct ephemeral_memories_list *next;

  /* The size is determined by the number of teams and threads.  */
  size_t size;
  /* The device address of the allocated memory.  */
  void *address;
  /* A flag to prevent two asynchronous kernels trying to use the same memory.
     The mutex is locked until the kernel exits.  */
  pthread_mutex_t in_use;
};

/* Information about the whole HSA environment and all of its agents.  */

struct hsa_context_info
{
  /* Whether the structure has been initialized.  */
  bool initialized;
  /* Number of usable GPU HSA agents in the system.  */
  int agent_count;
  /* Array of agent_info structures describing the individual HSA agents.  */
  struct agent_info *agents;
  /* Driver version string.  */
  char driver_version_s[30];
};

/* }}}  */
/* {{{ Global variables  */

/* Information about the whole HSA environment and all of its agents.  */

static struct hsa_context_info hsa_context;

/* HSA runtime functions that are initialized in init_hsa_context.  */

static struct hsa_runtime_fn_info hsa_fns;

/* Heap space, allocated target-side, provided for use of newlib malloc.
   Each module should have its own heap allocated.
   Beware that heap usage increases with OpenMP teams.  See also arenas.  */

static size_t gcn_kernel_heap_size = DEFAULT_GCN_HEAP_SIZE;

/* Ephemeral memory sizes for each kernel launch.  */

static int team_arena_size = DEFAULT_TEAM_ARENA_SIZE;
static int stack_size = DEFAULT_GCN_STACK_SIZE;
static int lowlat_size = -1;

/* Flag to decide whether to print information about what is going on to
   stderr.  Set in init_environment_variables depending on environment
   variables.  */

static bool debug;

/* Flag to decide if the runtime should suppress a possible fallback to host
   execution.  */

static bool suppress_host_fallback;

/* Name of the HSA runtime shared library that is dlopened by this
   plug-in.  */

static const char *hsa_runtime_lib;

/* Flag to decide if the runtime should also support CPU devices (can be
   a simulator).  */

static bool support_cpu_devices;

/* Runtime dimension overrides.  Zero indicates default.  */

static int override_x_dim = 0;
static int override_z_dim = 0;

/* }}}  */
/* {{{ Debug & Diagnostic  */

/* Print a message to stderr if GCN_DEBUG value is set to true.  */

#define DEBUG_PRINT(...) \
  do \
  { \
    if (debug) \
      { \
	fprintf (stderr, __VA_ARGS__); \
      } \
  } \
  while (false)

/* Flush stderr if GCN_DEBUG value is set to true.  */

#define DEBUG_FLUSH() \
  do { \
    if (debug) \
      fflush (stderr); \
  } while (false)

/* Print a logging message with PREFIX to stderr if GCN_DEBUG value
   is set to true.  */

#define DEBUG_LOG(prefix, ...) \
  do \
    { \
      DEBUG_PRINT (prefix); \
      DEBUG_PRINT (__VA_ARGS__); \
      DEBUG_FLUSH (); \
    } while (false)

/* Print a debugging message to stderr.  */

#define GCN_DEBUG(...) DEBUG_LOG ("GCN debug: ", __VA_ARGS__)

/* Print a warning message to stderr.  */

#define GCN_WARNING(...) DEBUG_LOG ("GCN warning: ", __VA_ARGS__)
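
/* For illustration only (not compiled into the plugin): typical use of the
   macros above.  Output is produced only when the "debug" flag has been
   enabled via the GCN_DEBUG environment variable.  */
#if 0
static void
example_debug_usage (int device)
{
  GCN_DEBUG ("Initializing device %d\n", device);
  if (device < 0)
    GCN_WARNING ("Negative device number %d\n", device);
}
#endif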

/* Print HSA warning STR with an HSA STATUS code.  */

static void
hsa_warn (const char *str, hsa_status_t status)
{
  if (!debug)
    return;

  const char *hsa_error_msg = "[unknown]";
  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);

  fprintf (stderr, "GCN warning: %s\nRuntime message: %s\n", str,
	   hsa_error_msg);
}

/* Report a fatal error STR together with the HSA error corresponding to STATUS
   and terminate execution of the current process.  */

static void
hsa_fatal (const char *str, hsa_status_t status)
{
  const char *hsa_error_msg = "[unknown]";
  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
  GOMP_PLUGIN_fatal ("GCN fatal error: %s\nRuntime message: %s\n", str,
		     hsa_error_msg);
}

/* Like hsa_fatal, except it only reports the error message and returns FALSE,
   so that error handling can propagate to outside of the plugin.  */

static bool
hsa_error (const char *str, hsa_status_t status)
{
  const char *hsa_error_msg = "[unknown]";
  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
  GOMP_PLUGIN_error ("GCN fatal error: %s\nRuntime message: %s\n", str,
		     hsa_error_msg);
  return false;
}

/* Dump information about the available hardware.  */

static void
dump_hsa_system_info (void)
{
  hsa_status_t status;

  hsa_endianness_t endianness;
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_ENDIANNESS,
					   &endianness);
  if (status == HSA_STATUS_SUCCESS)
    switch (endianness)
      {
      case HSA_ENDIANNESS_LITTLE:
	GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: LITTLE\n");
	break;
      case HSA_ENDIANNESS_BIG:
	GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: BIG\n");
	break;
      default:
	GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: UNKNOWN\n");
      }
  else
    GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: FAILED\n");

  uint8_t extensions[128];
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_EXTENSIONS,
					   &extensions);
  if (status == HSA_STATUS_SUCCESS)
    {
      if (extensions[0] & (1 << HSA_EXTENSION_IMAGES))
	GCN_DEBUG ("HSA_SYSTEM_INFO_EXTENSIONS: IMAGES\n");
    }
  else
    GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n");
}

/* Dump the machine model MACHINE_MODEL, labelled with S.  */

static void
dump_machine_model (hsa_machine_model_t machine_model, const char *s)
{
  switch (machine_model)
    {
    case HSA_MACHINE_MODEL_SMALL:
      GCN_DEBUG ("%s: SMALL\n", s);
      break;
    case HSA_MACHINE_MODEL_LARGE:
      GCN_DEBUG ("%s: LARGE\n", s);
      break;
    default:
      GCN_WARNING ("%s: UNKNOWN\n", s);
      break;
    }
}

/* Dump the profile PROFILE, labelled with S.  */

static void
dump_profile (hsa_profile_t profile, const char *s)
{
  switch (profile)
    {
    case HSA_PROFILE_FULL:
      GCN_DEBUG ("%s: FULL\n", s);
      break;
    case HSA_PROFILE_BASE:
      GCN_DEBUG ("%s: BASE\n", s);
      break;
    default:
      GCN_WARNING ("%s: UNKNOWN\n", s);
      break;
    }
}

/* Dump information about a device memory region.  */

static hsa_status_t
dump_hsa_region (hsa_region_t region, void *data __attribute__((unused)))
{
  hsa_status_t status;

  hsa_region_segment_t segment;
  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
					   &segment);
  if (status == HSA_STATUS_SUCCESS)
    {
      if (segment == HSA_REGION_SEGMENT_GLOBAL)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GLOBAL\n");
      else if (segment == HSA_REGION_SEGMENT_READONLY)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: READONLY\n");
      else if (segment == HSA_REGION_SEGMENT_PRIVATE)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: PRIVATE\n");
      else if (segment == HSA_REGION_SEGMENT_GROUP)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GROUP\n");
      else
	GCN_WARNING ("HSA_REGION_INFO_SEGMENT: UNKNOWN\n");
    }
  else
    GCN_WARNING ("HSA_REGION_INFO_SEGMENT: FAILED\n");

  /* SEGMENT is only valid if the query above succeeded.  */
  if (status == HSA_STATUS_SUCCESS && segment == HSA_REGION_SEGMENT_GLOBAL)
    {
      uint32_t flags;
      status
	= hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
					  &flags);
      if (status == HSA_STATUS_SUCCESS)
	{
	  if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: KERNARG\n");
	  if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED)
	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: FINE_GRAINED\n");
	  if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED)
	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: COARSE_GRAINED\n");
	}
      else
	GCN_WARNING ("HSA_REGION_INFO_GLOBAL_FLAGS: FAILED\n");
    }

  size_t size;
  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_SIZE: %zu\n", size);
  else
    GCN_WARNING ("HSA_REGION_INFO_SIZE: FAILED\n");

  status
    = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_ALLOC_MAX_SIZE,
				      &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_ALLOC_MAX_SIZE: %zu\n", size);
  else
    GCN_WARNING ("HSA_REGION_INFO_ALLOC_MAX_SIZE: FAILED\n");

  bool alloc_allowed;
  status
    = hsa_fns.hsa_region_get_info_fn (region,
				      HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED,
				      &alloc_allowed);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: %u\n", alloc_allowed);
  else
    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: FAILED\n");

  if (status != HSA_STATUS_SUCCESS || !alloc_allowed)
    return HSA_STATUS_SUCCESS;

  status
    = hsa_fns.hsa_region_get_info_fn (region,
				      HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE,
				      &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: %zu\n", size);
  else
    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: FAILED\n");

  size_t align;
  status
    = hsa_fns.hsa_region_get_info_fn (region,
				      HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT,
				      &align);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: %zu\n", align);
  else
    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: FAILED\n");

  return HSA_STATUS_SUCCESS;
}

/* Dump information about all the device memory regions.  */

static void
dump_hsa_regions (hsa_agent_t agent)
{
  hsa_status_t status;
  status = hsa_fns.hsa_agent_iterate_regions_fn (agent,
						 dump_hsa_region,
						 NULL);
  if (status != HSA_STATUS_SUCCESS)
    hsa_error ("Dumping hsa regions failed", status);
}

/* Dump information about the available devices.  */

static hsa_status_t
dump_hsa_agent_info (hsa_agent_t agent, void *data __attribute__((unused)))
{
  hsa_status_t status;

  char buf[64];
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME,
					  &buf);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_NAME: %s\n", buf);
  else
    GCN_WARNING ("HSA_AGENT_INFO_NAME: FAILED\n");

  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_VENDOR_NAME,
					  &buf);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_VENDOR_NAME: %s\n", buf);
  else
    GCN_WARNING ("HSA_AGENT_INFO_VENDOR_NAME: FAILED\n");

  hsa_machine_model_t machine_model;
  status
    = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_MACHINE_MODEL,
				     &machine_model);
  if (status == HSA_STATUS_SUCCESS)
    dump_machine_model (machine_model, "HSA_AGENT_INFO_MACHINE_MODEL");
  else
    GCN_WARNING ("HSA_AGENT_INFO_MACHINE_MODEL: FAILED\n");

  hsa_profile_t profile;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_PROFILE,
					  &profile);
  if (status == HSA_STATUS_SUCCESS)
    dump_profile (profile, "HSA_AGENT_INFO_PROFILE");
  else
    GCN_WARNING ("HSA_AGENT_INFO_PROFILE: FAILED\n");

  hsa_device_type_t device_type;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
					  &device_type);
  if (status == HSA_STATUS_SUCCESS)
    {
      switch (device_type)
	{
	case HSA_DEVICE_TYPE_CPU:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: CPU\n");
	  break;
	case HSA_DEVICE_TYPE_GPU:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: GPU\n");
	  break;
	case HSA_DEVICE_TYPE_DSP:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: DSP\n");
	  break;
	default:
	  GCN_WARNING ("HSA_AGENT_INFO_DEVICE: UNKNOWN\n");
	  break;
	}
    }
  else
    GCN_WARNING ("HSA_AGENT_INFO_DEVICE: FAILED\n");

  uint32_t cu_count;
  status = hsa_fns.hsa_agent_get_info_fn
    (agent, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: %u\n", cu_count);
  else
    GCN_WARNING ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: FAILED\n");

  uint32_t size;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_WAVEFRONT_SIZE,
					  &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WAVEFRONT_SIZE: %u\n", size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WAVEFRONT_SIZE: FAILED\n");

  uint32_t max_dim;
  status = hsa_fns.hsa_agent_get_info_fn (agent,
					  HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
					  &max_dim);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: %u\n", max_dim);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: FAILED\n");

  uint32_t max_size;
  status = hsa_fns.hsa_agent_get_info_fn (agent,
					  HSA_AGENT_INFO_WORKGROUP_MAX_SIZE,
					  &max_size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: %u\n", max_size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: FAILED\n");

  uint32_t grid_max_dim;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_DIM,
					  &grid_max_dim);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_DIM: %u\n", grid_max_dim);
  else
    GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_DIM: FAILED\n");

  uint32_t grid_max_size;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_SIZE,
					  &grid_max_size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_SIZE: %u\n", grid_max_size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_SIZE: FAILED\n");

  dump_hsa_regions (agent);

  return HSA_STATUS_SUCCESS;
}

/* Forward reference.  */

static char *get_executable_symbol_name (hsa_executable_symbol_t symbol);

/* Helper function for dump_executable_symbols.  */

static hsa_status_t
dump_executable_symbol (hsa_executable_t executable,
			hsa_executable_symbol_t symbol,
			void *data __attribute__((unused)))
{
  char *name = get_executable_symbol_name (symbol);

  if (name)
    {
      GCN_DEBUG ("executable symbol: %s\n", name);
      free (name);
    }

  return HSA_STATUS_SUCCESS;
}

/* Dump all global symbols in an executable.  */

static void
dump_executable_symbols (hsa_executable_t executable)
{
  hsa_status_t status;
  status
    = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
						 dump_executable_symbol,
						 NULL);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not dump HSA executable symbols", status);
}

/* Dump kernel DISPATCH data structure and indent it by INDENT spaces.  */

static void
print_kernel_dispatch (struct kernel_dispatch *dispatch, unsigned indent)
{
  struct kernargs *kernargs = (struct kernargs *)dispatch->kernarg_address;

  fprintf (stderr, "%*sthis: %p\n", indent, "", dispatch);
  fprintf (stderr, "%*squeue: %p\n", indent, "", dispatch->queue);
  fprintf (stderr, "%*skernarg_address: %p\n", indent, "", kernargs);
  fprintf (stderr, "%*sheap address: %p\n", indent, "",
	   (void*)kernargs->abi.heap_ptr);
  fprintf (stderr, "%*sarena address: %p (%d bytes per workgroup)\n", indent,
	   "", (void*)kernargs->abi.arena_ptr,
	   kernargs->abi.arena_size_per_team);
  fprintf (stderr, "%*sstack address: %p (%d bytes per wavefront)\n", indent,
	   "", (void*)kernargs->abi.stack_ptr,
	   kernargs->abi.stack_size_per_thread);
  fprintf (stderr, "%*sobject: %lu\n", indent, "", dispatch->object);
  fprintf (stderr, "%*sprivate_segment_size: %u\n", indent, "",
	   dispatch->private_segment_size);
  fprintf (stderr, "%*sgroup_segment_size: %u (low-latency pool)\n", indent,
	   "", dispatch->group_segment_size);
  fprintf (stderr, "\n");
}

/* }}}  */
/* {{{ Utility functions  */

/* Cast the thread local storage to gcn_thread.  */

static inline struct gcn_thread *
gcn_thread (void)
{
  return (struct gcn_thread *) GOMP_PLUGIN_acc_thread ();
}

/* Initialize debug and suppress_host_fallback according to the environment.  */

static void
init_environment_variables (void)
{
  if (secure_getenv ("GCN_DEBUG"))
    debug = true;
  else
    debug = false;

  if (secure_getenv ("GCN_SUPPRESS_HOST_FALLBACK"))
    suppress_host_fallback = true;
  else
    suppress_host_fallback = false;

  hsa_runtime_lib = secure_getenv ("HSA_RUNTIME_LIB");
  if (hsa_runtime_lib == NULL)
    hsa_runtime_lib = "libhsa-runtime64.so.1";

  support_cpu_devices = secure_getenv ("GCN_SUPPORT_CPU_DEVICES");

  const char *x = secure_getenv ("GCN_NUM_TEAMS");
  if (!x)
    x = secure_getenv ("GCN_NUM_GANGS");
  if (x)
    override_x_dim = atoi (x);

  const char *z = secure_getenv ("GCN_NUM_THREADS");
  if (!z)
    z = secure_getenv ("GCN_NUM_WORKERS");
  if (z)
    override_z_dim = atoi (z);

  const char *heap = secure_getenv ("GCN_HEAP_SIZE");
  if (heap)
    {
      size_t tmp = atol (heap);
      if (tmp)
	gcn_kernel_heap_size = tmp;
    }

  const char *arena = secure_getenv ("GCN_TEAM_ARENA_SIZE");
  if (arena)
    {
      int tmp = atoi (arena);
      if (tmp)
	team_arena_size = tmp;
    }

  const char *stack = secure_getenv ("GCN_STACK_SIZE");
  if (stack)
    {
      int tmp = atoi (stack);
      if (tmp)
	stack_size = tmp;
    }

  const char *lowlat = secure_getenv ("GOMP_GCN_LOWLAT_POOL");
  if (lowlat)
    lowlat_size = atoi (lowlat);
}

/* Return a malloc'd string with the name of SYMBOL.  */

static char *
get_executable_symbol_name (hsa_executable_symbol_t symbol)
{
  hsa_status_t status;
  char *res;
  uint32_t len;
  const hsa_executable_symbol_info_t info_name_length
    = HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH;

  status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name_length,
						      &len);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not get length of symbol name", status);
      return NULL;
    }

  res = GOMP_PLUGIN_malloc (len + 1);

  const hsa_executable_symbol_info_t info_name
    = HSA_EXECUTABLE_SYMBOL_INFO_NAME;

  status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name, res);

  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not get symbol name", status);
      free (res);
      return NULL;
    }

  res[len] = '\0';

  return res;
}

/* Get the number of GPU Compute Units.  */

static int
get_cu_count (struct agent_info *agent)
{
  uint32_t cu_count;
  hsa_status_t status = hsa_fns.hsa_agent_get_info_fn
    (agent->id, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
  if (status == HSA_STATUS_SUCCESS)
    return cu_count;
  else
    return 64;  /* The usual number for older devices.  */
}

/* Calculate the maximum grid size for OMP threads / OACC workers.
   This depends on the kernel's resource usage levels.  */

static int
limit_worker_threads (int threads)
{
  /* FIXME Do something more intelligent here.
     GCN can always run 4 threads within a Compute Unit, but
     more than that depends on register usage.  */
  if (threads > 16)
    threads = 16;
  return threads;
}

/* This sets the maximum number of teams to twice the number of GPU Compute
   Units to avoid memory waste and corresponding memory access faults.  */

static int
limit_teams (int teams, struct agent_info *agent)
{
  int max_teams = 2 * get_cu_count (agent);
  if (teams > max_teams)
    teams = max_teams;
  return teams;
}

/* Parse the target attributes INPUT provided by the compiler and return true
   if we should run anything at all.  If INPUT is NULL, fill DEF with default
   values, then store INPUT or DEF into *RESULT.

   This is used for OpenMP only.  */

static bool
parse_target_attributes (void **input,
			 struct GOMP_kernel_launch_attributes *def,
			 struct GOMP_kernel_launch_attributes **result,
			 struct agent_info *agent)
{
  if (!input)
    GOMP_PLUGIN_fatal ("No target arguments provided");

  bool grid_attrs_found = false;
  bool gcn_dims_found = false;
  int gcn_teams = 0;
  int gcn_threads = 0;
  while (*input)
    {
      intptr_t id = (intptr_t) *input++, val;

      if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
	val = (intptr_t) *input++;
      else
	val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;

      val = (val > INT_MAX) ? INT_MAX : val;

      if ((id & GOMP_TARGET_ARG_DEVICE_MASK) == GOMP_DEVICE_GCN
	  && ((id & GOMP_TARGET_ARG_ID_MASK)
	      == GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES))
	{
	  grid_attrs_found = true;
	  break;
	}
      else if ((id & GOMP_TARGET_ARG_DEVICE_MASK)
	       == GOMP_TARGET_ARG_DEVICE_ALL)
	{
	  gcn_dims_found = true;
	  switch (id & GOMP_TARGET_ARG_ID_MASK)
	    {
	    case GOMP_TARGET_ARG_NUM_TEAMS:
	      gcn_teams = limit_teams (val, agent);
	      break;
	    case GOMP_TARGET_ARG_THREAD_LIMIT:
	      gcn_threads = limit_worker_threads (val);
	      break;
	    default:
	      ;
	    }
	}
    }

  if (gcn_dims_found)
    {
      bool gfx900_workaround_p = false;

      if (agent->device_isa == EF_AMDGPU_MACH_AMDGCN_GFX900
	  && gcn_threads == 0 && override_z_dim == 0)
	{
	  gfx900_workaround_p = true;
	  GCN_WARNING ("VEGA BUG WORKAROUND: reducing default number of "
		       "threads to at most 4 per team.\n");
	  GCN_WARNING (" - If this is not a Vega 10 device, please use "
		       "GCN_NUM_THREADS=16\n");
	}

      /* Ideally, when a dimension isn't explicitly specified, we should
	 tune it to run 40 (or 32?) threads per CU with no threads getting
	 queued.  In practice, we tune for peak performance on BabelStream,
	 which for OpenACC is currently 32 threads per CU.  */
      def->ndim = 3;
      if (gcn_teams <= 0 && gcn_threads <= 0)
	{
	  /* Set up a reasonable number of teams and threads.  */
	  gcn_threads = gfx900_workaround_p ? 4 : 16; // 8;
	  def->gdims[0] = get_cu_count (agent); // * (40 / gcn_threads);
	  def->gdims[2] = gcn_threads;
	}
      else if (gcn_teams <= 0 && gcn_threads > 0)
	{
	  /* Auto-scale the number of teams with the number of threads.  */
	  def->gdims[0] = get_cu_count (agent); // * (40 / gcn_threads);
	  def->gdims[2] = gcn_threads;
	}
      else if (gcn_teams > 0 && gcn_threads <= 0)
	{
	  int max_threads = gfx900_workaround_p ? 4 : 16;

	  /* Auto-scale the number of threads with the number of teams.  */
	  def->gdims[0] = gcn_teams;
	  def->gdims[2] = 16; // get_cu_count (agent) * 40 / gcn_teams;
	  if (def->gdims[2] == 0)
	    def->gdims[2] = 1;
	  else if (def->gdims[2] > max_threads)
	    def->gdims[2] = max_threads;
	}
      else
	{
	  def->gdims[0] = gcn_teams;
	  def->gdims[2] = gcn_threads;
	}
      def->gdims[1] = 64; /* Each thread is 64 work items wide.  */
      def->wdims[0] = 1; /* Single team per work-group.  */
      def->wdims[1] = 64;
      def->wdims[2] = 16;
      *result = def;
      return true;
    }
  else if (!grid_attrs_found)
    {
      def->ndim = 1;
      def->gdims[0] = 1;
      def->gdims[1] = 1;
      def->gdims[2] = 1;
      def->wdims[0] = 1;
      def->wdims[1] = 1;
      def->wdims[2] = 1;
      *result = def;
      GCN_WARNING ("GOMP_OFFLOAD_run called with no launch attributes\n");
      return true;
    }

  struct GOMP_kernel_launch_attributes *kla;
  kla = (struct GOMP_kernel_launch_attributes *) *input;
  *result = kla;
  if (kla->ndim == 0 || kla->ndim > 3)
    GOMP_PLUGIN_fatal ("Invalid number of dimensions (%u)", kla->ndim);

  GCN_DEBUG ("GOMP_OFFLOAD_run called with %u dimensions:\n", kla->ndim);
  unsigned i;
  for (i = 0; i < kla->ndim; i++)
    {
      GCN_DEBUG (" Dimension %u: grid size %u and group size %u\n", i,
		 kla->gdims[i], kla->wdims[i]);
      if (kla->gdims[i] == 0)
	return false;
    }
  return true;
}
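
/* For illustration only (not compiled into the plugin): the encoding the
   parser above expects.  Each element either embeds its value in the
   identifier word (shifted by GOMP_TARGET_ARG_VALUE_SHIFT) or, when
   GOMP_TARGET_ARG_SUBSEQUENT_PARAM is set, takes it from the next element.
   The constants are from gomp-constants.h; the values are made up.  */
#if 0
static void
example_target_arguments (void)
{
  void *args[4];
  /* Eight teams, embedded in the identifier word.  */
  args[0] = (void *) (uintptr_t) (GOMP_TARGET_ARG_DEVICE_ALL
				  | GOMP_TARGET_ARG_NUM_TEAMS
				  | (8 << GOMP_TARGET_ARG_VALUE_SHIFT));
  /* Sixteen threads, passed as a subsequent parameter.  */
  args[1] = (void *) (uintptr_t) (GOMP_TARGET_ARG_DEVICE_ALL
				  | GOMP_TARGET_ARG_SUBSEQUENT_PARAM
				  | GOMP_TARGET_ARG_THREAD_LIMIT);
  args[2] = (void *) (uintptr_t) 16;
  args[3] = NULL;  /* Terminator.  */
}
#endif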

/* Return the group size given the requested GROUP size, GRID size and number
   of grid dimensions NDIM.  */

static uint32_t
get_group_size (uint32_t ndim, uint32_t grid, uint32_t group)
{
  if (group == 0)
    {
      /* TODO: Provide a default via environment or device characteristics.  */
      if (ndim == 1)
	group = 64;
      else if (ndim == 2)
	group = 8;
      else
	group = 4;
    }

  if (group > grid)
    group = grid;
  return group;
}
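
/* For illustration only (not compiled into the plugin): worked examples of
   the defaulting and clamping performed by get_group_size.  */
#if 0
static void
example_group_size (void)
{
  assert (get_group_size (1, 1000, 0) == 64);	/* 1-D default.  */
  assert (get_group_size (2, 1000, 0) == 8);	/* 2-D default.  */
  assert (get_group_size (1, 10, 0) == 10);	/* Clamped to the grid.  */
  assert (get_group_size (3, 1000, 32) == 32);	/* Explicit size kept.  */
}
#endif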

/* Atomically store pair of uint16_t values (HEADER and REST) to a PACKET.  */

static void
packet_store_release (uint32_t* packet, uint16_t header, uint16_t rest)
{
  __atomic_store_n (packet, header | (rest << 16), __ATOMIC_RELEASE);
}
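
/* For illustration only (not compiled into the plugin): the store above
   publishes an AQL packet header atomically.  The low 16 bits are the packet
   header and the high 16 bits the setup word (for a kernel dispatch packet,
   its dimension count).  This sketch assumes the standard HSA header enums
   from hsa.h; a 3-D dispatch with system-scope fences might be published
   like this.  */
#if 0
static void
example_publish_dispatch_packet (hsa_kernel_dispatch_packet_t *packet)
{
  uint16_t header = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE)
    | (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE)
    | (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
  uint16_t setup = 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
  packet_store_release ((uint32_t *) packet, header, setup);
}
#endif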

/* A never-called callback for the HSA command queues.  These signal events
   that we don't use, so we trigger an error.

   This "queue" is not to be confused with the async queues, below.  */

static void
hsa_queue_callback (hsa_status_t status,
		    hsa_queue_t *queue __attribute__ ((unused)),
		    void *data __attribute__ ((unused)))
{
  hsa_fatal ("Asynchronous queue error", status);
}

/* }}}  */
/* {{{ HSA initialization  */

/* Populate hsa_fns with the function addresses from libhsa-runtime64.so.  */

static bool
init_hsa_runtime_functions (void)
{
#define DLSYM_FN(function) \
  hsa_fns.function##_fn = dlsym (handle, #function); \
  if (hsa_fns.function##_fn == NULL) \
    return false;
#define DLSYM_OPT_FN(function) \
  hsa_fns.function##_fn = dlsym (handle, #function);
  void *handle = dlopen (hsa_runtime_lib, RTLD_LAZY);
  if (handle == NULL)
    return false;

  DLSYM_FN (hsa_status_string)
  DLSYM_FN (hsa_system_get_info)
  DLSYM_FN (hsa_agent_get_info)
  DLSYM_FN (hsa_init)
  DLSYM_FN (hsa_iterate_agents)
  DLSYM_FN (hsa_region_get_info)
  DLSYM_FN (hsa_queue_create)
  DLSYM_FN (hsa_agent_iterate_regions)
  DLSYM_FN (hsa_executable_destroy)
  DLSYM_FN (hsa_executable_create)
  DLSYM_FN (hsa_executable_global_variable_define)
  DLSYM_FN (hsa_executable_load_code_object)
  DLSYM_FN (hsa_executable_freeze)
  DLSYM_FN (hsa_signal_create)
  DLSYM_FN (hsa_memory_allocate)
  DLSYM_FN (hsa_memory_assign_agent)
  DLSYM_FN (hsa_memory_copy)
  DLSYM_FN (hsa_memory_free)
  DLSYM_FN (hsa_signal_destroy)
  DLSYM_FN (hsa_executable_get_symbol)
  DLSYM_FN (hsa_executable_symbol_get_info)
  DLSYM_FN (hsa_executable_iterate_symbols)
  DLSYM_FN (hsa_queue_add_write_index_release)
  DLSYM_FN (hsa_queue_load_read_index_acquire)
  DLSYM_FN (hsa_signal_wait_acquire)
  DLSYM_FN (hsa_signal_store_relaxed)
  DLSYM_FN (hsa_signal_store_release)
  DLSYM_FN (hsa_signal_load_acquire)
  DLSYM_FN (hsa_queue_destroy)
  DLSYM_FN (hsa_code_object_deserialize)
  DLSYM_OPT_FN (hsa_amd_memory_lock)
  DLSYM_OPT_FN (hsa_amd_memory_unlock)
  DLSYM_OPT_FN (hsa_amd_memory_async_copy_rect)
  return true;
#undef DLSYM_OPT_FN
#undef DLSYM_FN
}
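
/* For illustration only (not compiled into the plugin): what a DLSYM_FN use
   such as "DLSYM_FN (hsa_init)" inside the function above expands to.  */
#if 0
  hsa_fns.hsa_init_fn = dlsym (handle, "hsa_init");
  if (hsa_fns.hsa_init_fn == NULL)
    return false;
#endif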

static gcn_isa isa_code (const char *isa);

/* Return true if the agent is a GPU and can accept concurrent submissions
   from different threads.  */

static bool
suitable_hsa_agent_p (hsa_agent_t agent)
{
  hsa_device_type_t device_type;
  hsa_status_t status
    = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
				     &device_type);
  if (status != HSA_STATUS_SUCCESS)
    return false;

  switch (device_type)
    {
    case HSA_DEVICE_TYPE_GPU:
      {
	char name[64];
	hsa_status_t status
	  = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME, name);
	if (status != HSA_STATUS_SUCCESS
	    || isa_code (name) == EF_AMDGPU_MACH_UNSUPPORTED)
	  {
	    GCN_DEBUG ("Ignoring unsupported agent '%s'\n",
		       status == HSA_STATUS_SUCCESS ? name : "invalid");
	    return false;
	  }
      }
      break;
    case HSA_DEVICE_TYPE_CPU:
      if (!support_cpu_devices)
	return false;
      break;
    default:
      return false;
    }

  uint32_t features = 0;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_FEATURE,
					  &features);
  if (status != HSA_STATUS_SUCCESS
      || !(features & HSA_AGENT_FEATURE_KERNEL_DISPATCH))
    return false;
  hsa_queue_type_t queue_type;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_QUEUE_TYPE,
					  &queue_type);
  if (status != HSA_STATUS_SUCCESS
      || (queue_type != HSA_QUEUE_TYPE_MULTI))
    return false;

  return true;
}

/* Callback of hsa_iterate_agents; if AGENT is a GPU device, increment
   agent_count in hsa_context.  */

static hsa_status_t
count_gpu_agents (hsa_agent_t agent, void *data __attribute__ ((unused)))
{
  if (suitable_hsa_agent_p (agent))
    hsa_context.agent_count++;
  return HSA_STATUS_SUCCESS;
}

/* Callback of hsa_iterate_agents; if AGENT is a GPU device, assign the agent
   id to the describing structure in the hsa context.  The index of the
   structure is pointed to by DATA, increment it afterwards.  */

static hsa_status_t
assign_agent_ids (hsa_agent_t agent, void *data)
{
  if (suitable_hsa_agent_p (agent))
    {
      int *agent_index = (int *) data;
      hsa_context.agents[*agent_index].id = agent;
      ++*agent_index;
    }
  return HSA_STATUS_SUCCESS;
}

/* Initialize hsa_context if it has not already been done.
   Return TRUE on success.  */

static bool
init_hsa_context (void)
{
  hsa_status_t status;
  int agent_index = 0;

  if (hsa_context.initialized)
    return true;
  init_environment_variables ();
  if (!init_hsa_runtime_functions ())
    {
      GCN_WARNING ("Run-time could not be dynamically opened\n");
      if (suppress_host_fallback)
	GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
      return false;
    }
  status = hsa_fns.hsa_init_fn ();
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Run-time could not be initialized", status);
  GCN_DEBUG ("HSA run-time initialized for GCN\n");

  if (debug)
    dump_hsa_system_info ();

  status = hsa_fns.hsa_iterate_agents_fn (count_gpu_agents, NULL);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("GCN GPU devices could not be enumerated", status);
  GCN_DEBUG ("There are %i GCN GPU devices.\n", hsa_context.agent_count);

  hsa_context.agents
    = GOMP_PLUGIN_malloc_cleared (hsa_context.agent_count
				  * sizeof (struct agent_info));
  status = hsa_fns.hsa_iterate_agents_fn (assign_agent_ids, &agent_index);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Scanning compute agents failed", status);
  if (agent_index != hsa_context.agent_count)
    {
      GOMP_PLUGIN_error ("Failed to assign IDs to all GCN agents");
      return false;
    }

  if (debug)
    {
      status = hsa_fns.hsa_iterate_agents_fn (dump_hsa_agent_info, NULL);
      if (status != HSA_STATUS_SUCCESS)
	GOMP_PLUGIN_error ("Failed to list all HSA runtime agents");
    }

  uint16_t minor, major;
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MINOR,
					   &minor);
  if (status != HSA_STATUS_SUCCESS)
    GOMP_PLUGIN_error ("Failed to obtain HSA runtime minor version");
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MAJOR,
					   &major);
  if (status != HSA_STATUS_SUCCESS)
    GOMP_PLUGIN_error ("Failed to obtain HSA runtime major version");

  size_t len = sizeof hsa_context.driver_version_s;
  int printed = snprintf (hsa_context.driver_version_s, len,
			  "HSA Runtime %hu.%hu", (unsigned short int)major,
			  (unsigned short int)minor);
  if (printed >= len)
    GCN_WARNING ("HSA runtime version string was truncated. "
		 "Version %hu.%hu is too long.", (unsigned short int)major,
		 (unsigned short int)minor);

  hsa_context.initialized = true;
  return true;
}

/* Verify that hsa_context has already been initialized and return the
   agent_info structure describing device number N.  Return NULL on error.  */

static struct agent_info *
get_agent_info (int n)
{
  if (!hsa_context.initialized)
    {
      GOMP_PLUGIN_error ("Attempt to use uninitialized GCN context.");
      return NULL;
    }
  if (n >= hsa_context.agent_count)
    {
      GOMP_PLUGIN_error ("Request to operate on non-existent GCN device %i", n);
      return NULL;
    }
  if (!hsa_context.agents[n].initialized)
    {
      GOMP_PLUGIN_error ("Attempt to use an uninitialized GCN agent.");
      return NULL;
    }
  return &hsa_context.agents[n];
}

/* Callback of hsa_agent_iterate_regions, via get_*_memory_region functions.

   Selects (breaks at) a suitable region of type KIND.  */

static hsa_status_t
get_memory_region (hsa_region_t region, hsa_region_t *retval,
		   hsa_region_global_flag_t kind)
{
  hsa_status_t status;
  hsa_region_segment_t segment;

  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
					   &segment);
  if (status != HSA_STATUS_SUCCESS)
    return status;
  if (segment != HSA_REGION_SEGMENT_GLOBAL)
    return HSA_STATUS_SUCCESS;

  uint32_t flags;
  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
					   &flags);
  if (status != HSA_STATUS_SUCCESS)
    return status;
  if (flags & kind)
    {
      *retval = region;
      return HSA_STATUS_INFO_BREAK;
    }
  return HSA_STATUS_SUCCESS;
}

/* Callback of hsa_agent_iterate_regions.

   Selects a kernargs memory region.  */

static hsa_status_t
get_kernarg_memory_region (hsa_region_t region, void *data)
{
  return get_memory_region (region, (hsa_region_t *)data,
			    HSA_REGION_GLOBAL_FLAG_KERNARG);
}

/* Callback of hsa_agent_iterate_regions.

   Selects a coarse-grained memory region suitable for the heap and
   offload data.  */

static hsa_status_t
get_data_memory_region (hsa_region_t region, void *data)
{
  return get_memory_region (region, (hsa_region_t *)data,
			    HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED);
}

/* Return the ISA field from the header flags of an ELF offload IMAGE.  */

static int
elf_gcn_isa_field (Elf64_Ehdr *image)
{
  return image->e_flags & EF_AMDGPU_MACH_MASK;
}

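/* For illustration only (not compiled into the plugin): a GCN offload image
   built for gfx90a carries (e_flags & EF_AMDGPU_MACH_MASK) == 0x03f, i.e.
   EF_AMDGPU_MACH_AMDGCN_GFX90a, which elf_gcn_isa_field extracts.  The
   function name below is hypothetical.  */
#if 0
static bool
example_is_gfx90a (Elf64_Ehdr *image)
{
  return elf_gcn_isa_field (image) == EF_AMDGPU_MACH_AMDGCN_GFX90a;
}
#endif
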
const static char *gcn_gfx803_s = "gfx803";
const static char *gcn_gfx900_s = "gfx900";
const static char *gcn_gfx906_s = "gfx906";
const static char *gcn_gfx908_s = "gfx908";
const static char *gcn_gfx90a_s = "gfx90a";
const static char *gcn_gfx1030_s = "gfx1030";
const static char *gcn_gfx1100_s = "gfx1100";
const static int gcn_isa_name_len = 7;

/* Returns the name that the HSA runtime uses for the ISA or NULL if we do not
   support the ISA.  */

static const char*
isa_hsa_name (int isa)
{
  switch (isa)
    {
    case EF_AMDGPU_MACH_AMDGCN_GFX803:
      return gcn_gfx803_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX900:
      return gcn_gfx900_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX906:
      return gcn_gfx906_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX908:
      return gcn_gfx908_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX90a:
      return gcn_gfx90a_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX1030:
      return gcn_gfx1030_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX1100:
      return gcn_gfx1100_s;
    }
  return NULL;
}

/* Returns the user-facing name that GCC uses to identify the architecture
   (e.g. with -march) or NULL if we do not support the ISA.
   Keep in sync with /gcc/config/gcn/gcn.{c,opt}.  */

static const char*
isa_gcc_name (int isa)
{
  switch (isa)
    {
    case EF_AMDGPU_MACH_AMDGCN_GFX803:
      return "fiji";
    default:
      return isa_hsa_name (isa);
    }
}

/* Returns the code which is used in the GCN object code to identify the ISA
   with the given name (as used by the HSA runtime).  */

static gcn_isa
isa_code (const char *isa)
{
  if (!strncmp (isa, gcn_gfx803_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX803;

  if (!strncmp (isa, gcn_gfx900_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX900;

  if (!strncmp (isa, gcn_gfx906_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX906;

  if (!strncmp (isa, gcn_gfx908_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX908;

  if (!strncmp (isa, gcn_gfx90a_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX90a;

  if (!strncmp (isa, gcn_gfx1030_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX1030;

  if (!strncmp (isa, gcn_gfx1100_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX1100;

  return EF_AMDGPU_MACH_UNSUPPORTED;
}
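
/* For illustration only (not compiled into the plugin): isa_code and
   isa_hsa_name round-trip, and unknown names map to
   EF_AMDGPU_MACH_UNSUPPORTED, which suitable_hsa_agent_p uses to reject
   agents.  The "gfx000" name is made up.  */
#if 0
static void
example_isa_round_trip (void)
{
  assert (isa_code ("gfx90a") == EF_AMDGPU_MACH_AMDGCN_GFX90a);
  assert (strcmp (isa_hsa_name (EF_AMDGPU_MACH_AMDGCN_GFX90a), "gfx90a") == 0);
  assert (isa_code ("gfx000") == EF_AMDGPU_MACH_UNSUPPORTED);
}
#endif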

/* CDNA2 devices have twice as many VGPRs compared to older devices.  */

static int
max_isa_vgprs (int isa)
{
  switch (isa)
    {
    case EF_AMDGPU_MACH_AMDGCN_GFX803:
    case EF_AMDGPU_MACH_AMDGCN_GFX900:
    case EF_AMDGPU_MACH_AMDGCN_GFX906:
    case EF_AMDGPU_MACH_AMDGCN_GFX908:
      return 256;
    case EF_AMDGPU_MACH_AMDGCN_GFX90a:
      return 512;
    case EF_AMDGPU_MACH_AMDGCN_GFX1030:
      return 512;  /* 512 SIMD32 = 256 wavefrontsize64.  */
    case EF_AMDGPU_MACH_AMDGCN_GFX1100:
      return 1536;  /* 1536 SIMD32 = 768 wavefrontsize64.  */
    }
  GOMP_PLUGIN_fatal ("unhandled ISA in max_isa_vgprs");
}


/* }}} */
/* {{{ Run  */

/* Create or reuse a team arena and stack space.

   Team arenas are used by OpenMP to avoid calling malloc multiple times
   while setting up each team.  This is purely a performance optimization.

   The stack space is used by all kernels.  We must allocate it in such a
   way that the reverse offload implementation can access the data.

   Allocating this memory costs performance, so this function will reuse an
   existing allocation if a large enough one is idle.
   The memory lock is released, but not deallocated, when the kernel exits.  */

static void
configure_ephemeral_memories (struct kernel_info *kernel,
                              struct kernargs_abi *kernargs, int num_teams,
                              int num_threads)
{
  struct agent_info *agent = kernel->agent;
  struct ephemeral_memories_list **next_ptr = &agent->ephemeral_memories_list;
  struct ephemeral_memories_list *item;

  int actual_arena_size = (kernel->kind == KIND_OPENMP
                           ? team_arena_size : 0);
  int actual_arena_total_size = actual_arena_size * num_teams;
  size_t size = (actual_arena_total_size
                 + num_teams * num_threads * stack_size);
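
  /* Illustrative sizing example (values assumed, not taken from this file):
     with team_arena_size == 64*1024, stack_size == 32*1024, num_teams == 4
     and num_threads == 16, an OpenMP kernel gets 4*64K of arena followed by
     4*16*32K of stacks, so size == 256 KiB + 2 MiB in a single device
     allocation.  */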

  for (item = *next_ptr; item; next_ptr = &item->next, item = item->next)
    {
      if (item->size < size)
        continue;

      if (pthread_mutex_trylock (&item->in_use) == 0)
        break;
    }

  if (!item)
    {
      GCN_DEBUG ("Creating a new %sstack for %d teams with %d threads"
                 " (%zd bytes)\n", (actual_arena_size ? "arena and " : ""),
                 num_teams, num_threads, size);

      if (pthread_mutex_lock (&agent->ephemeral_memories_write_lock))
        {
          GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
          return;
        }
      item = malloc (sizeof (*item));
      item->size = size;
      item->next = NULL;
      *next_ptr = item;

      if (pthread_mutex_init (&item->in_use, NULL))
        {
          GOMP_PLUGIN_error ("Failed to initialize a GCN memory write mutex");
          return;
        }
      if (pthread_mutex_lock (&item->in_use))
        {
          GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
          return;
        }
      if (pthread_mutex_unlock (&agent->ephemeral_memories_write_lock))
        {
          GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
          return;
        }

      hsa_status_t status;
      status = hsa_fns.hsa_memory_allocate_fn (agent->data_region, size,
                                               &item->address);
      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not allocate memory for GCN kernel arena", status);
      status = hsa_fns.hsa_memory_assign_agent_fn (item->address, agent->id,
                                                   HSA_ACCESS_PERMISSION_RW);
      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not assign arena & stack memory to device", status);
    }

  kernargs->arena_ptr = (actual_arena_total_size
                         ? (uint64_t)item->address
                         : 0);
  kernargs->stack_ptr = (uint64_t)item->address + actual_arena_total_size;
  kernargs->arena_size_per_team = actual_arena_size;
  kernargs->stack_size_per_thread = stack_size;
}

/* Mark an ephemeral memory space available for reuse.  */

static void
release_ephemeral_memories (struct agent_info* agent, void *address)
{
  struct ephemeral_memories_list *item;

  for (item = agent->ephemeral_memories_list; item; item = item->next)
    {
      if (item->address == address)
        {
          if (pthread_mutex_unlock (&item->in_use))
            GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
          return;
        }
    }
  GOMP_PLUGIN_error ("Could not find a GCN arena to release.");
}

/* Clean up all the allocated ephemeral memories (team arenas and stacks).  */

static bool
destroy_ephemeral_memories (struct agent_info *agent)
{
  struct ephemeral_memories_list *item, *next;

  for (item = agent->ephemeral_memories_list; item; item = next)
    {
      next = item->next;
      hsa_fns.hsa_memory_free_fn (item->address);
      if (pthread_mutex_destroy (&item->in_use))
        {
          GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex");
          return false;
        }
      free (item);
    }
  agent->ephemeral_memories_list = NULL;

  return true;
}

/* Allocate memory on a specified device.  */

static void *
alloc_by_agent (struct agent_info *agent, size_t size)
{
  GCN_DEBUG ("Allocating %zu bytes on device %d\n", size, agent->device_id);

  void *ptr;
  hsa_status_t status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
                                                        size, &ptr);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not allocate device memory", status);
      return NULL;
    }

  status = hsa_fns.hsa_memory_assign_agent_fn (ptr, agent->id,
                                               HSA_ACCESS_PERMISSION_RW);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not assign data memory to device", status);
      return NULL;
    }

  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  bool profiling_dispatch_p
    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
  if (profiling_dispatch_p)
    {
      acc_prof_info *prof_info = thr->prof_info;
      acc_event_info data_event_info;
      acc_api_info *api_info = thr->api_info;

      prof_info->event_type = acc_ev_alloc;

      data_event_info.data_event.event_type = prof_info->event_type;
      data_event_info.data_event.valid_bytes
        = _ACC_DATA_EVENT_INFO_VALID_BYTES;
      data_event_info.data_event.parent_construct
        = acc_construct_parallel;
      data_event_info.data_event.implicit = 1;
      data_event_info.data_event.tool_info = NULL;
      data_event_info.data_event.var_name = NULL;
      data_event_info.data_event.bytes = size;
      data_event_info.data_event.host_ptr = NULL;
      data_event_info.data_event.device_ptr = (void *) ptr;

      api_info->device_api = acc_device_api_other;

      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
                                            api_info);
    }

  return ptr;
}

/* Create kernel dispatch data structure for given KERNEL, along with
   the necessary device signals and memory allocations.  */

static struct kernel_dispatch *
create_kernel_dispatch (struct kernel_info *kernel, int num_teams,
                        int num_threads)
{
  struct agent_info *agent = kernel->agent;
  struct kernel_dispatch *shadow
    = GOMP_PLUGIN_malloc_cleared (sizeof (struct kernel_dispatch));

  shadow->agent = kernel->agent;
  shadow->object = kernel->object;

  hsa_signal_t sync_signal;
  hsa_status_t status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &sync_signal);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Error creating the GCN sync signal", status);

  shadow->signal = sync_signal.handle;
  shadow->private_segment_size = kernel->private_segment_size;

  if (lowlat_size < 0)
    {
      /* Divide the LDS between the number of running teams.
         Allocate no less than the kernel metadata requires.  */
      int teams_per_cu = num_teams / get_cu_count (agent);
      int LDS_per_team = (teams_per_cu ? 65536 / teams_per_cu : 65536);
      shadow->group_segment_size
        = (kernel->group_segment_size > LDS_per_team
           ? kernel->group_segment_size
           : LDS_per_team);
    }
  else if (lowlat_size < GCN_LOWLAT_HEAP+8)
    /* Ensure that there's space for the OpenMP libgomp data.  */
    shadow->group_segment_size = GCN_LOWLAT_HEAP+8;
  else
    shadow->group_segment_size = (lowlat_size > 65536
                                  ? 65536
                                  : lowlat_size);
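
  /* Worked example for the lowlat_size < 0 branch (numbers illustrative):
     with 240 teams on a 60-CU device, teams_per_cu == 4, so each team may
     take up to 65536/4 == 16384 bytes of LDS, unless the kernel metadata
     already demands a larger group segment, in which case that wins.  */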

  /* We expect kernels to request a single pointer, explicitly, and the
     rest of struct kernargs, implicitly.  If they request anything else
     then something is wrong.  */
  if (kernel->kernarg_segment_size > 8)
    {
      GOMP_PLUGIN_fatal ("Unexpectedly large kernargs segment requested");
      return NULL;
    }

  status = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region,
                                           sizeof (struct kernargs),
                                           &shadow->kernarg_address);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not allocate memory for GCN kernel arguments", status);
  struct kernargs *kernargs = shadow->kernarg_address;

  /* Zero-initialize the output_data (minimum needed).  */
  kernargs->abi.out_ptr = (int64_t)&kernargs->output_data;
  kernargs->output_data.next_output = 0;
  for (unsigned i = 0;
       i < (sizeof (kernargs->output_data.queue)
            / sizeof (kernargs->output_data.queue[0]));
       i++)
    kernargs->output_data.queue[i].written = 0;
  kernargs->output_data.consumed = 0;

  /* Pass in the heap location.  */
  kernargs->abi.heap_ptr = (int64_t)kernel->module->heap;

  /* Create the ephemeral memory spaces.  */
  configure_ephemeral_memories (kernel, &kernargs->abi, num_teams, num_threads);

  /* Ensure we can recognize unset return values.  */
  kernargs->output_data.return_value = 0xcafe0000;

  return shadow;
}

static void
process_reverse_offload (uint64_t fn, uint64_t mapnum, uint64_t hostaddrs,
                         uint64_t sizes, uint64_t kinds, uint64_t dev_num64)
{
  int dev_num = dev_num64;
  GOMP_PLUGIN_target_rev (fn, mapnum, hostaddrs, sizes, kinds, dev_num,
                          NULL);
}
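
/* The six uint64_t arguments above arrive packed in a type-4 entry of the
   print queue (see "case 4" in console_output below): the device side
   stores the reverse-offload request in value_u64[0..5] and the host
   unpacks and forwards it to libgomp here.  */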

/* Output any data written to console output from the kernel.  It is expected
   that this function is polled during kernel execution.

   We print all entries from the last item printed to the next entry without
   a "written" flag.  If the "final" flag is set then it'll continue right to
   the end.

   The print buffer is circular, but the from and to locations don't wrap when
   the buffer does, so the output limit is UINT_MAX.  The target blocks on
   output when the buffer is full.  */
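
/* To illustrate the index arithmetic below: entry number i always lives at
   queue[i % limit], while the "consumed" and "next_output" counters advance
   monotonically without wrapping, so consumed > next_output can only mean
   the 32-bit counters overflowed; that is reported as a buffer overflow.  */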

static void
console_output (struct kernel_info *kernel, struct kernargs *kernargs,
                bool final)
{
  unsigned int limit = (sizeof (kernargs->output_data.queue)
                        / sizeof (kernargs->output_data.queue[0]));

  unsigned int from = __atomic_load_n (&kernargs->output_data.consumed,
                                       __ATOMIC_ACQUIRE);
  unsigned int to = kernargs->output_data.next_output;

  if (from > to)
    {
      /* Overflow.  */
      if (final)
        printf ("GCN print buffer overflowed.\n");
      return;
    }

  unsigned int i;
  for (i = from; i < to; i++)
    {
      struct printf_data *data = &kernargs->output_data.queue[i%limit];

      if (!data->written && !final)
        break;

      switch (data->type)
        {
        case 0: printf ("%.128s%ld\n", data->msg, data->ivalue); break;
        case 1: printf ("%.128s%f\n", data->msg, data->dvalue); break;
        case 2: printf ("%.128s%.128s\n", data->msg, data->text); break;
        case 3: printf ("%.128s%.128s", data->msg, data->text); break;
        case 4:
          process_reverse_offload (data->value_u64[0], data->value_u64[1],
                                   data->value_u64[2], data->value_u64[3],
                                   data->value_u64[4], data->value_u64[5]);
          break;
        default: printf ("GCN print buffer error!\n"); break;
        }
      data->written = 0;
      __atomic_store_n (&kernargs->output_data.consumed, i+1,
                        __ATOMIC_RELEASE);
    }
  fflush (stdout);
}

/* Release data structure created for a kernel dispatch in SHADOW argument,
   and clean up the signal and memory allocations.  */

static void
release_kernel_dispatch (struct kernel_dispatch *shadow)
{
  GCN_DEBUG ("Released kernel dispatch: %p\n", shadow);

  struct kernargs *kernargs = shadow->kernarg_address;
  void *addr = (void *)kernargs->abi.arena_ptr;
  if (!addr)
    addr = (void *)kernargs->abi.stack_ptr;
  release_ephemeral_memories (shadow->agent, addr);

  hsa_fns.hsa_memory_free_fn (shadow->kernarg_address);

  hsa_signal_t s;
  s.handle = shadow->signal;
  hsa_fns.hsa_signal_destroy_fn (s);

  free (shadow);
}

/* Extract the properties from a kernel binary.  */

static void
init_kernel_properties (struct kernel_info *kernel)
{
  hsa_status_t status;
  struct agent_info *agent = kernel->agent;
  hsa_executable_symbol_t kernel_symbol;
  char *buf = alloca (strlen (kernel->name) + 4);
  sprintf (buf, "%s.kd", kernel->name);
  status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
                                                 buf, agent->id,
                                                 0, &kernel_symbol);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_warn ("Could not find symbol for kernel in the code object", status);
      fprintf (stderr, "not found name: '%s'\n", buf);
      dump_executable_symbols (agent->executable);
      goto failure;
    }
  GCN_DEBUG ("Located kernel %s\n", kernel->name);
  status = hsa_fns.hsa_executable_symbol_get_info_fn
    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel->object);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not extract a kernel object from its symbol", status);
  status = hsa_fns.hsa_executable_symbol_get_info_fn
    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
     &kernel->kernarg_segment_size);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not get info about kernel argument size", status);
  status = hsa_fns.hsa_executable_symbol_get_info_fn
    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
     &kernel->group_segment_size);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not get info about kernel group segment size", status);
  status = hsa_fns.hsa_executable_symbol_get_info_fn
    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
     &kernel->private_segment_size);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not get info about kernel private segment size",
               status);

  /* The kernel type is not known until something tries to launch it.  */
  kernel->kind = KIND_UNKNOWN;

  GCN_DEBUG ("Kernel structure for %s fully initialized with "
             "following segment sizes: \n", kernel->name);
  GCN_DEBUG ("  group_segment_size: %u\n",
             (unsigned) kernel->group_segment_size);
  GCN_DEBUG ("  private_segment_size: %u\n",
             (unsigned) kernel->private_segment_size);
  GCN_DEBUG ("  kernarg_segment_size: %u\n",
             (unsigned) kernel->kernarg_segment_size);
  return;

failure:
  kernel->initialization_failed = true;
}

/* Do all the work that is necessary before running KERNEL for the first time.
   The function assumes the program has been created, finalized and frozen by
   create_and_finalize_hsa_program.  */

static void
init_kernel (struct kernel_info *kernel)
{
  if (pthread_mutex_lock (&kernel->init_mutex))
    GOMP_PLUGIN_fatal ("Could not lock a GCN kernel initialization mutex");
  if (kernel->initialized)
    {
      if (pthread_mutex_unlock (&kernel->init_mutex))
        GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
                           "mutex");

      return;
    }

  init_kernel_properties (kernel);

  if (!kernel->initialization_failed)
    {
      GCN_DEBUG ("\n");

      kernel->initialized = true;
    }
  if (pthread_mutex_unlock (&kernel->init_mutex))
    GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
                       "mutex");
}

/* Run KERNEL on its agent, pass VARS to it as arguments and take
   launch attributes from KLA.

   MODULE_LOCKED indicates that the caller already holds the lock and
   run_kernel need not lock it again.
   If AQ is NULL then agent->sync_queue will be used.  */

static void
run_kernel (struct kernel_info *kernel, void *vars,
            struct GOMP_kernel_launch_attributes *kla,
            struct goacc_asyncqueue *aq, bool module_locked)
{
  struct agent_info *agent = kernel->agent;
  GCN_DEBUG ("SGPRs: %d, VGPRs: %d\n", kernel->description->sgpr_count,
             kernel->description->vpgr_count);

  /* Reduce the number of threads/workers if there are insufficient
     VGPRs available to run the kernels together.  */
  if (kla->ndim == 3 && kernel->description->vpgr_count > 0)
    {
      int max_vgprs = max_isa_vgprs (agent->device_isa);
      int granulated_vgprs = (kernel->description->vpgr_count + 3) & ~3;
      int max_threads = (max_vgprs / granulated_vgprs) * 4;
      if (kla->gdims[2] > max_threads)
        {
          GCN_WARNING ("Too many VGPRs required to support %d threads/workers"
                       " per team/gang - reducing to %d threads/workers.\n",
                       kla->gdims[2], max_threads);
          kla->gdims[2] = max_threads;
        }
    }
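
  /* A note on the arithmetic above (an interpretation, not from the
     source): VGPRs appear to be allocated in granules of four, so
     "(vpgr_count + 3) & ~3" rounds the demand up to the granule, and the
     "* 4" presumably reflects the four SIMD units per compute unit.  For
     example, 70 VGPRs round up to 72; with a 512-register file that gives
     (512/72)*4 == 28 threads/workers.  */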

  GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel->agent->device_id,
             (aq ? aq->id : 0));
  GCN_DEBUG ("GCN launch attribs: gdims:[");
  int i;
  for (i = 0; i < kla->ndim; ++i)
    {
      if (i)
        DEBUG_PRINT (", ");
      DEBUG_PRINT ("%u", kla->gdims[i]);
    }
  DEBUG_PRINT ("], normalized gdims:[");
  for (i = 0; i < kla->ndim; ++i)
    {
      if (i)
        DEBUG_PRINT (", ");
      DEBUG_PRINT ("%u", kla->gdims[i] / kla->wdims[i]);
    }
  DEBUG_PRINT ("], wdims:[");
  for (i = 0; i < kla->ndim; ++i)
    {
      if (i)
        DEBUG_PRINT (", ");
      DEBUG_PRINT ("%u", kla->wdims[i]);
    }
  DEBUG_PRINT ("]\n");
  DEBUG_FLUSH ();

  if (!module_locked && pthread_rwlock_rdlock (&agent->module_rwlock))
    GOMP_PLUGIN_fatal ("Unable to read-lock a GCN agent rwlock");

  if (!agent->initialized)
    GOMP_PLUGIN_fatal ("Agent must be initialized");

  if (!kernel->initialized)
    GOMP_PLUGIN_fatal ("Called kernel must be initialized");

  hsa_queue_t *command_q = (aq ? aq->hsa_queue : kernel->agent->sync_queue);

  uint64_t index
    = hsa_fns.hsa_queue_add_write_index_release_fn (command_q, 1);
  GCN_DEBUG ("Got AQL index %llu\n", (long long int) index);

  /* Wait until the queue is not full before writing the packet.  */
  while (index - hsa_fns.hsa_queue_load_read_index_acquire_fn (command_q)
         >= command_q->size)
    ;

  /* Do not allow the dimensions to be overridden when running
     constructors or destructors.  */
  int override_x = kernel->kind == KIND_UNKNOWN ? 0 : override_x_dim;
  int override_z = kernel->kind == KIND_UNKNOWN ? 0 : override_z_dim;

  hsa_kernel_dispatch_packet_t *packet;
  packet = ((hsa_kernel_dispatch_packet_t *) command_q->base_address)
    + index % command_q->size;

  memset (((uint8_t *) packet) + 4, 0, sizeof (*packet) - 4);
  packet->grid_size_x = override_x ? : kla->gdims[0];
  packet->workgroup_size_x = get_group_size (kla->ndim,
                                             packet->grid_size_x,
                                             kla->wdims[0]);

  if (kla->ndim >= 2)
    {
      packet->grid_size_y = kla->gdims[1];
      packet->workgroup_size_y = get_group_size (kla->ndim, kla->gdims[1],
                                                 kla->wdims[1]);
    }
  else
    {
      packet->grid_size_y = 1;
      packet->workgroup_size_y = 1;
    }

  if (kla->ndim == 3)
    {
      packet->grid_size_z = limit_worker_threads (override_z
                                                  ? : kla->gdims[2]);
      packet->workgroup_size_z = get_group_size (kla->ndim,
                                                 packet->grid_size_z,
                                                 kla->wdims[2]);
    }
  else
    {
      packet->grid_size_z = 1;
      packet->workgroup_size_z = 1;
    }

  GCN_DEBUG ("GCN launch actuals: grid:[%u, %u, %u],"
             " normalized grid:[%u, %u, %u], workgroup:[%u, %u, %u]\n",
             packet->grid_size_x, packet->grid_size_y, packet->grid_size_z,
             packet->grid_size_x / packet->workgroup_size_x,
             packet->grid_size_y / packet->workgroup_size_y,
             packet->grid_size_z / packet->workgroup_size_z,
             packet->workgroup_size_x, packet->workgroup_size_y,
             packet->workgroup_size_z);

  struct kernel_dispatch *shadow
    = create_kernel_dispatch (kernel, packet->grid_size_x,
                              packet->grid_size_z);
  shadow->queue = command_q;

  if (debug)
    {
      fprintf (stderr, "\nKernel has following dependencies:\n");
      print_kernel_dispatch (shadow, 2);
    }

  packet->private_segment_size = shadow->private_segment_size;
  packet->group_segment_size = shadow->group_segment_size;
  packet->kernel_object = shadow->object;
  packet->kernarg_address = shadow->kernarg_address;
  hsa_signal_t s;
  s.handle = shadow->signal;
  packet->completion_signal = s;
  hsa_fns.hsa_signal_store_relaxed_fn (s, 1);
  memcpy (shadow->kernarg_address, &vars, sizeof (vars));

  GCN_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");

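  /* Note that the first four bytes of the packet (the header and setup
     words) were deliberately excluded from the earlier memset: they are
     written last, in a single atomic release store below, because that
     store is what publishes the AQL packet to the device.  */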
  uint16_t header;
  header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
  header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
  header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;

  GCN_DEBUG ("Going to dispatch kernel %s on device %d\n", kernel->name,
             agent->device_id);

  packet_store_release ((uint32_t *) packet, header,
                        (uint16_t) kla->ndim
                        << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);

  hsa_fns.hsa_signal_store_release_fn (command_q->doorbell_signal,
                                       index);

  GCN_DEBUG ("Kernel dispatched, waiting for completion\n");

  /* Root signal waits with 1ms timeout.  */
  while (hsa_fns.hsa_signal_wait_acquire_fn (s, HSA_SIGNAL_CONDITION_LT, 1,
                                             1000 * 1000,
                                             HSA_WAIT_STATE_BLOCKED) != 0)
    {
      console_output (kernel, shadow->kernarg_address, false);
    }
  console_output (kernel, shadow->kernarg_address, true);

  struct kernargs *kernargs = shadow->kernarg_address;
  unsigned int return_value = (unsigned int)kernargs->output_data.return_value;

  release_kernel_dispatch (shadow);

  if (!module_locked && pthread_rwlock_unlock (&agent->module_rwlock))
    GOMP_PLUGIN_fatal ("Unable to unlock a GCN agent rwlock");
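
  /* Decode the kernel's exit status.  The top 16 bits of return_value are
     a tag: 0xcafe is the sentinel planted by create_kernel_dispatch and
     means the kernel never called exit; 0xffff means exit was called, with
     a signal number in bits 8-15 and the exit code in the low byte.  */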
  unsigned int upper = (return_value & ~0xffff) >> 16;
  if (upper == 0xcafe)
    ; // exit not called, normal termination.
  else if (upper == 0xffff)
    ; // exit called.
  else
    {
      GOMP_PLUGIN_error ("Possible kernel exit value corruption, 2 most"
                         " significant bytes aren't 0xffff or 0xcafe: 0x%x\n",
                         return_value);
      abort ();
    }

  if (upper == 0xffff)
    {
      unsigned int signal = (return_value >> 8) & 0xff;

      if (signal == SIGABRT)
        {
          GCN_WARNING ("GCN Kernel aborted\n");
          abort ();
        }
      else if (signal != 0)
        {
          GCN_WARNING ("GCN Kernel received unknown signal\n");
          abort ();
        }

      GCN_DEBUG ("GCN Kernel exited with value: %d\n", return_value & 0xff);
      exit (return_value & 0xff);
    }
}

/* }}} */
/* {{{ Load/Unload  */

/* Initialize KERNEL from D and other parameters.  Return true on success.  */

static bool
init_basic_kernel_info (struct kernel_info *kernel,
                        struct hsa_kernel_description *d,
                        struct agent_info *agent,
                        struct module_info *module)
{
  kernel->agent = agent;
  kernel->module = module;
  kernel->name = d->name;
  kernel->description = d;
  if (pthread_mutex_init (&kernel->init_mutex, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");
      return false;
    }
  return true;
}

/* Check that the GCN ISA of the given image matches the ISA of the agent.  */

static bool
isa_matches_agent (struct agent_info *agent, Elf64_Ehdr *image)
{
  int isa_field = elf_gcn_isa_field (image);
  const char* isa_s = isa_hsa_name (isa_field);
  if (!isa_s)
    {
      hsa_error ("Unsupported ISA in GCN code object.", HSA_STATUS_ERROR);
      return false;
    }

  if (isa_field != agent->device_isa)
    {
      char msg[120];
      const char *agent_isa_s = isa_hsa_name (agent->device_isa);
      const char *agent_isa_gcc_s = isa_gcc_name (agent->device_isa);
      assert (agent_isa_s);
      assert (agent_isa_gcc_s);

      snprintf (msg, sizeof msg,
                "GCN code object ISA '%s' does not match GPU ISA '%s'.\n"
                "Try to recompile with '-foffload-options=-march=%s'.\n",
                isa_s, agent_isa_s, agent_isa_gcc_s);

      hsa_error (msg, HSA_STATUS_ERROR);
      return false;
    }

  return true;
}

/* Create and finalize the program consisting of all loaded modules.  */

static bool
create_and_finalize_hsa_program (struct agent_info *agent)
{
  hsa_status_t status;
  bool res = true;
  if (pthread_mutex_lock (&agent->prog_mutex))
    {
      GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
      return false;
    }
  if (agent->prog_finalized)
    goto final;

  status
    = hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
                                        HSA_EXECUTABLE_STATE_UNFROZEN,
                                        "", &agent->executable);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not create GCN executable", status);
      goto fail;
    }

  /* Load any GCN modules.  */
  struct module_info *module = agent->module;
  if (module)
    {
      Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;

      if (!isa_matches_agent (agent, image))
        goto fail;

      hsa_code_object_t co = { 0 };
      status = hsa_fns.hsa_code_object_deserialize_fn
        (module->image_desc->gcn_image->image,
         module->image_desc->gcn_image->size,
         NULL, &co);
      if (status != HSA_STATUS_SUCCESS)
        {
          hsa_error ("Could not deserialize GCN code object", status);
          goto fail;
        }

      status = hsa_fns.hsa_executable_load_code_object_fn
        (agent->executable, agent->id, co, "");
      if (status != HSA_STATUS_SUCCESS)
        {
          hsa_error ("Could not load GCN code object", status);
          goto fail;
        }

      if (!module->heap)
        {
          status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
                                                   gcn_kernel_heap_size,
                                                   (void**)&module->heap);
          if (status != HSA_STATUS_SUCCESS)
            {
              hsa_error ("Could not allocate memory for GCN heap", status);
              goto fail;
            }

          status = hsa_fns.hsa_memory_assign_agent_fn
            (module->heap, agent->id, HSA_ACCESS_PERMISSION_RW);
          if (status != HSA_STATUS_SUCCESS)
            {
              hsa_error ("Could not assign GCN heap memory to device", status);
              goto fail;
            }

          hsa_fns.hsa_memory_copy_fn (&module->heap->size,
                                      &gcn_kernel_heap_size,
                                      sizeof (gcn_kernel_heap_size));
        }

    }

  if (debug)
    dump_executable_symbols (agent->executable);

  status = hsa_fns.hsa_executable_freeze_fn (agent->executable, "");
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not freeze the GCN executable", status);
      goto fail;
    }

final:
  agent->prog_finalized = true;

  if (pthread_mutex_unlock (&agent->prog_mutex))
    {
      GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
      res = false;
    }

  return res;

fail:
  res = false;
  goto final;
}

/* Free the HSA program in agent and everything associated with it and set
   agent->prog_finalized and the initialized flags of all kernels to false.
   Return TRUE on success.  */

static bool
destroy_hsa_program (struct agent_info *agent)
{
  if (!agent->prog_finalized)
    return true;

  hsa_status_t status;

  GCN_DEBUG ("Destroying the current GCN program.\n");

  status = hsa_fns.hsa_executable_destroy_fn (agent->executable);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Could not destroy GCN executable", status);

  if (agent->module)
    {
      int i;
      for (i = 0; i < agent->module->kernel_count; i++)
        agent->module->kernels[i].initialized = false;

      if (agent->module->heap)
        {
          hsa_fns.hsa_memory_free_fn (agent->module->heap);
          agent->module->heap = NULL;
        }
    }
  agent->prog_finalized = false;
  return true;
}

/* Deinitialize all information associated with MODULE and kernels within
   it.  Return TRUE on success.  */

static bool
destroy_module (struct module_info *module, bool locked)
{
  /* Run destructors before destroying module.  */
  struct GOMP_kernel_launch_attributes kla =
    { 3,
      /* Grid size.  */
      { 1, 64, 1 },
      /* Work-group size.  */
      { 1, 64, 1 }
    };

  if (module->fini_array_func)
    {
      init_kernel (module->fini_array_func);
      run_kernel (module->fini_array_func, NULL, &kla, NULL, locked);
    }
  module->constructors_run_p = false;

  int i;
  for (i = 0; i < module->kernel_count; i++)
    if (pthread_mutex_destroy (&module->kernels[i].init_mutex))
      {
        GOMP_PLUGIN_error ("Failed to destroy a GCN kernel initialization "
                           "mutex");
        return false;
      }

  return true;
}

/* }}} */
/* {{{ Async  */

/* Execute a single entry (kernel launch, callback, async wait, or
   placeholder) from an async queue.  */

static void
execute_queue_entry (struct goacc_asyncqueue *aq, int index)
{
  struct queue_entry *entry = &aq->queue[index];

  switch (entry->type)
    {
    case KERNEL_LAUNCH:
      if (DEBUG_QUEUES)
        GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d)\n",
                   aq->agent->device_id, aq->id, index);
      run_kernel (entry->u.launch.kernel,
                  entry->u.launch.vars,
                  &entry->u.launch.kla, aq, false);
      if (DEBUG_QUEUES)
        GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d) done\n",
                   aq->agent->device_id, aq->id, index);
      break;

    case CALLBACK:
      if (DEBUG_QUEUES)
        GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d)\n",
                   aq->agent->device_id, aq->id, index);
      entry->u.callback.fn (entry->u.callback.data);
      if (DEBUG_QUEUES)
        GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d) done\n",
                   aq->agent->device_id, aq->id, index);
      break;

    case ASYNC_WAIT:
      {
        /* FIXME: is it safe to access a placeholder that may already have
           been executed?  */
        struct placeholder *placeholderp = entry->u.asyncwait.placeholderp;

        if (DEBUG_QUEUES)
          GCN_DEBUG ("Async thread %d:%d: Executing async wait entry (%d)\n",
                     aq->agent->device_id, aq->id, index);

        pthread_mutex_lock (&placeholderp->mutex);

        while (!placeholderp->executed)
          pthread_cond_wait (&placeholderp->cond, &placeholderp->mutex);

        pthread_mutex_unlock (&placeholderp->mutex);

        if (pthread_cond_destroy (&placeholderp->cond))
          GOMP_PLUGIN_error ("Failed to destroy serialization cond");

        if (pthread_mutex_destroy (&placeholderp->mutex))
          GOMP_PLUGIN_error ("Failed to destroy serialization mutex");

        if (DEBUG_QUEUES)
          GCN_DEBUG ("Async thread %d:%d: Executing async wait "
                     "entry (%d) done\n", aq->agent->device_id, aq->id, index);
      }
      break;

    case ASYNC_PLACEHOLDER:
      pthread_mutex_lock (&entry->u.placeholder.mutex);
      entry->u.placeholder.executed = 1;
      pthread_cond_signal (&entry->u.placeholder.cond);
      pthread_mutex_unlock (&entry->u.placeholder.mutex);
      break;

    default:
      GOMP_PLUGIN_fatal ("Unknown queue element");
    }
}

/* This function is run as a thread to service an async queue in the
   background.  It runs continuously until the stop flag is set.  */

static void *
drain_queue (void *thread_arg)
{
  struct goacc_asyncqueue *aq = thread_arg;

  if (DRAIN_QUEUE_SYNCHRONOUS_P)
    {
      aq->drain_queue_stop = 2;
      return NULL;
    }

  pthread_mutex_lock (&aq->mutex);

  while (true)
    {
      if (aq->drain_queue_stop)
        break;

      if (aq->queue_n > 0)
        {
          pthread_mutex_unlock (&aq->mutex);
          execute_queue_entry (aq, aq->queue_first);

          pthread_mutex_lock (&aq->mutex);
          aq->queue_first = ((aq->queue_first + 1)
                             % ASYNC_QUEUE_SIZE);
          aq->queue_n--;

          if (DEBUG_THREAD_SIGNAL)
            GCN_DEBUG ("Async thread %d:%d: broadcasting queue out update\n",
                       aq->agent->device_id, aq->id);
          pthread_cond_broadcast (&aq->queue_cond_out);
          pthread_mutex_unlock (&aq->mutex);

          if (DEBUG_QUEUES)
            GCN_DEBUG ("Async thread %d:%d: continue\n", aq->agent->device_id,
                       aq->id);
          pthread_mutex_lock (&aq->mutex);
        }
      else
        {
          if (DEBUG_THREAD_SLEEP)
            GCN_DEBUG ("Async thread %d:%d: going to sleep\n",
                       aq->agent->device_id, aq->id);
          pthread_cond_wait (&aq->queue_cond_in, &aq->mutex);
          if (DEBUG_THREAD_SLEEP)
            GCN_DEBUG ("Async thread %d:%d: woke up, rechecking\n",
                       aq->agent->device_id, aq->id);
        }
    }

  aq->drain_queue_stop = 2;
  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("Async thread %d:%d: broadcasting last queue out update\n",
               aq->agent->device_id, aq->id);
  pthread_cond_broadcast (&aq->queue_cond_out);
  pthread_mutex_unlock (&aq->mutex);

  GCN_DEBUG ("Async thread %d:%d: returning\n", aq->agent->device_id, aq->id);
  return NULL;
}

/* This function is used only when DRAIN_QUEUE_SYNCHRONOUS_P is set, which
   is not usually the case.  This is just a debug tool.  */

static void
drain_queue_synchronous (struct goacc_asyncqueue *aq)
{
  pthread_mutex_lock (&aq->mutex);

  while (aq->queue_n > 0)
    {
      execute_queue_entry (aq, aq->queue_first);

      aq->queue_first = ((aq->queue_first + 1)
                         % ASYNC_QUEUE_SIZE);
      aq->queue_n--;
    }

  pthread_mutex_unlock (&aq->mutex);
}
/* Block the current thread until an async queue is writable.  The aq->mutex
   lock should be held on entry, and remains locked on exit.  */

static void
wait_for_queue_nonfull (struct goacc_asyncqueue *aq)
{
  if (aq->queue_n == ASYNC_QUEUE_SIZE)
    {
      /* Queue is full.  Wait for it to not be full.  */
      while (aq->queue_n == ASYNC_QUEUE_SIZE)
        pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
    }
}
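
/* Queue layout invariant: aq->queue is used as a ring buffer.  Live entries
   occupy indices queue_first .. queue_first+queue_n-1, each taken modulo
   ASYNC_QUEUE_SIZE, so the producers below write at
   (queue_first + queue_n) % ASYNC_QUEUE_SIZE while the drain thread
   consumes at queue_first.  */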

/* Request an asynchronous kernel launch on the specified queue.  This
   may block if the queue is full, but returns without waiting for the
   kernel to run.  */

static void
queue_push_launch (struct goacc_asyncqueue *aq, struct kernel_info *kernel,
                   void *vars, struct GOMP_kernel_launch_attributes *kla)
{
  assert (aq->agent == kernel->agent);

  pthread_mutex_lock (&aq->mutex);

  wait_for_queue_nonfull (aq);

  int queue_last = ((aq->queue_first + aq->queue_n)
                    % ASYNC_QUEUE_SIZE);
  if (DEBUG_QUEUES)
    GCN_DEBUG ("queue_push_launch %d:%d: at %i\n", aq->agent->device_id,
               aq->id, queue_last);

  aq->queue[queue_last].type = KERNEL_LAUNCH;
  aq->queue[queue_last].u.launch.kernel = kernel;
  aq->queue[queue_last].u.launch.vars = vars;
  aq->queue[queue_last].u.launch.kla = *kla;

  aq->queue_n++;

  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
               aq->agent->device_id, aq->id);
  pthread_cond_signal (&aq->queue_cond_in);

  pthread_mutex_unlock (&aq->mutex);
}

/* Request an asynchronous callback on the specified queue.  The callback
   function will be called, with the given opaque data, from the appropriate
   async thread, when all previous items on that queue are complete.  */

static void
queue_push_callback (struct goacc_asyncqueue *aq, void (*fn)(void *),
                     void *data)
{
  pthread_mutex_lock (&aq->mutex);

  wait_for_queue_nonfull (aq);

  int queue_last = ((aq->queue_first + aq->queue_n)
                    % ASYNC_QUEUE_SIZE);
  if (DEBUG_QUEUES)
    GCN_DEBUG ("queue_push_callback %d:%d: at %i\n", aq->agent->device_id,
               aq->id, queue_last);

  aq->queue[queue_last].type = CALLBACK;
  aq->queue[queue_last].u.callback.fn = fn;
  aq->queue[queue_last].u.callback.data = data;

  aq->queue_n++;

  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
               aq->agent->device_id, aq->id);
  pthread_cond_signal (&aq->queue_cond_in);

  pthread_mutex_unlock (&aq->mutex);
}

/* Request that a given async thread wait for another thread (unspecified) to
   reach the given placeholder.  The wait will occur when all previous entries
   on the queue are complete.  A placeholder is effectively a kind of signal
   which simply sets a flag when encountered in a queue.  */

static void
queue_push_asyncwait (struct goacc_asyncqueue *aq,
                      struct placeholder *placeholderp)
{
  pthread_mutex_lock (&aq->mutex);

  wait_for_queue_nonfull (aq);

  int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
  if (DEBUG_QUEUES)
    GCN_DEBUG ("queue_push_asyncwait %d:%d: at %i\n", aq->agent->device_id,
               aq->id, queue_last);

  aq->queue[queue_last].type = ASYNC_WAIT;
  aq->queue[queue_last].u.asyncwait.placeholderp = placeholderp;

  aq->queue_n++;

  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
               aq->agent->device_id, aq->id);
  pthread_cond_signal (&aq->queue_cond_in);

  pthread_mutex_unlock (&aq->mutex);
}

/* Add a placeholder into an async queue.  When the async thread reaches the
   placeholder it will set the "executed" flag to true and continue.
   Another thread may be waiting on this thread reaching the placeholder.  */

static struct placeholder *
queue_push_placeholder (struct goacc_asyncqueue *aq)
{
  struct placeholder *placeholderp;

  pthread_mutex_lock (&aq->mutex);

  wait_for_queue_nonfull (aq);

  int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
  if (DEBUG_QUEUES)
    GCN_DEBUG ("queue_push_placeholder %d:%d: at %i\n", aq->agent->device_id,
               aq->id, queue_last);

  aq->queue[queue_last].type = ASYNC_PLACEHOLDER;
  placeholderp = &aq->queue[queue_last].u.placeholder;

  if (pthread_mutex_init (&placeholderp->mutex, NULL))
    {
      pthread_mutex_unlock (&aq->mutex);
      GOMP_PLUGIN_error ("Failed to initialize serialization mutex");
    }

  if (pthread_cond_init (&placeholderp->cond, NULL))
    {
      pthread_mutex_unlock (&aq->mutex);
      GOMP_PLUGIN_error ("Failed to initialize serialization cond");
    }

  placeholderp->executed = 0;

  aq->queue_n++;

  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
               aq->agent->device_id, aq->id);
  pthread_cond_signal (&aq->queue_cond_in);

  pthread_mutex_unlock (&aq->mutex);

  return placeholderp;
}

/* Signal an asynchronous thread to terminate, and wait for it to do so.  */

static void
finalize_async_thread (struct goacc_asyncqueue *aq)
{
  pthread_mutex_lock (&aq->mutex);
  if (aq->drain_queue_stop == 2)
    {
      pthread_mutex_unlock (&aq->mutex);
      return;
    }

  aq->drain_queue_stop = 1;

  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("Signalling async thread %d:%d: cond_in\n",
               aq->agent->device_id, aq->id);
  pthread_cond_signal (&aq->queue_cond_in);

  while (aq->drain_queue_stop != 2)
    {
      if (DEBUG_THREAD_SLEEP)
        GCN_DEBUG ("Waiting for async thread %d:%d to finish, putting thread"
                   " to sleep\n", aq->agent->device_id, aq->id);
      pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
      if (DEBUG_THREAD_SLEEP)
        GCN_DEBUG ("Waiting, woke up thread %d:%d. Rechecking\n",
                   aq->agent->device_id, aq->id);
    }

  GCN_DEBUG ("Done waiting for async thread %d:%d\n", aq->agent->device_id,
             aq->id);
  pthread_mutex_unlock (&aq->mutex);

  int err = pthread_join (aq->thread_drain_queue, NULL);
  if (err != 0)
    GOMP_PLUGIN_fatal ("Join async thread %d:%d: failed: %s",
                       aq->agent->device_id, aq->id, strerror (err));
  GCN_DEBUG ("Joined with async thread %d:%d\n", aq->agent->device_id, aq->id);
}

/* Set up an async queue for OpenMP.  There will be only one.  The
   implementation simply uses an OpenACC async queue.
   FIXME: is this thread-safe if two threads call this function?  */

static void
maybe_init_omp_async (struct agent_info *agent)
{
  if (!agent->omp_async_queue)
    agent->omp_async_queue
      = GOMP_OFFLOAD_openacc_async_construct (agent->device_id);
}

/* A wrapper that works around an issue in the HSA runtime with host-to-device
   copies from read-only pages.  */

static void
hsa_memory_copy_wrapper (void *dst, const void *src, size_t len)
{
  hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, len);

  if (status == HSA_STATUS_SUCCESS)
    return;

  /* It appears that the copy fails if the source data is in a read-only page.
     We can't detect that easily, so try copying the data to a temporary buffer
     and doing the copy again if we got an error above.  */

  GCN_WARNING ("Read-only data transfer bug workaround triggered for "
               "[%p:+%d]\n", (void *) src, (int) len);

  void *src_copy = malloc (len);
  memcpy (src_copy, src, len);
  status = hsa_fns.hsa_memory_copy_fn (dst, (const void *) src_copy, len);
  free (src_copy);
  if (status != HSA_STATUS_SUCCESS)
    GOMP_PLUGIN_error ("memory copy failed");
}

/* Copy data to or from a device.  This is intended for use as an async
   callback event.  */

static void
copy_data (void *data_)
{
  struct copy_data *data = (struct copy_data *)data_;
  GCN_DEBUG ("Async thread %d:%d: Copying %zu bytes from (%p) to (%p)\n",
             data->aq->agent->device_id, data->aq->id, data->len, data->src,
             data->dst);
  hsa_memory_copy_wrapper (data->dst, data->src, data->len);
  free (data);
}
/* Request an asynchronous data copy, to or from a device, on a given queue.
   The event will be registered as a callback.  */

static void
queue_push_copy (struct goacc_asyncqueue *aq, void *dst, const void *src,
                 size_t len)
{
  if (DEBUG_QUEUES)
    GCN_DEBUG ("queue_push_copy %d:%d: %zu bytes from (%p) to (%p)\n",
               aq->agent->device_id, aq->id, len, src, dst);
  struct copy_data *data
    = (struct copy_data *)GOMP_PLUGIN_malloc (sizeof (struct copy_data));
  data->dst = dst;
  data->src = src;
  data->len = len;
  data->aq = aq;
  queue_push_callback (aq, copy_data, data);
}

/* Return true if the given queue is currently empty.  */

static int
queue_empty (struct goacc_asyncqueue *aq)
{
  pthread_mutex_lock (&aq->mutex);
  int res = aq->queue_n == 0 ? 1 : 0;
  pthread_mutex_unlock (&aq->mutex);

  return res;
}

/* Wait for a given queue to become empty.  This implements an OpenACC wait
   directive.  */

static void
wait_queue (struct goacc_asyncqueue *aq)
{
  if (DRAIN_QUEUE_SYNCHRONOUS_P)
    {
      drain_queue_synchronous (aq);
      return;
    }

  pthread_mutex_lock (&aq->mutex);

  while (aq->queue_n > 0)
    {
      if (DEBUG_THREAD_SLEEP)
        GCN_DEBUG ("waiting for thread %d:%d, putting thread to sleep\n",
                   aq->agent->device_id, aq->id);
      pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
      if (DEBUG_THREAD_SLEEP)
        GCN_DEBUG ("thread %d:%d woke up. Rechecking\n", aq->agent->device_id,
                   aq->id);
    }

  pthread_mutex_unlock (&aq->mutex);
  GCN_DEBUG ("waiting for thread %d:%d, done\n", aq->agent->device_id, aq->id);
}

/* }}} */
/* {{{ OpenACC support  */

/* Execute an OpenACC kernel, synchronously or asynchronously.  */

static void
gcn_exec (struct kernel_info *kernel,
          void **devaddrs, unsigned *dims, void *targ_mem_desc, bool async,
          struct goacc_asyncqueue *aq)
{
  if (!GOMP_OFFLOAD_can_run (kernel))
    GOMP_PLUGIN_fatal ("OpenACC host fallback unimplemented.");

  /* If we get here then this must be an OpenACC kernel.  */
  kernel->kind = KIND_OPENACC;

  struct hsa_kernel_description *hsa_kernel_desc = NULL;
  for (unsigned i = 0; i < kernel->module->image_desc->kernel_count; i++)
    {
      struct hsa_kernel_description *d
        = &kernel->module->image_desc->kernel_infos[i];
      if (d->name == kernel->name)
        {
          hsa_kernel_desc = d;
          break;
        }
    }

  /* We may have statically-determined dimensions in
     hsa_kernel_desc->oacc_dims[] or dimensions passed to this offload kernel
     invocation at runtime in dims[].  We allow static dimensions to take
     priority over dynamic dimensions when present (non-zero).  */
  if (hsa_kernel_desc->oacc_dims[0] > 0)
    dims[0] = hsa_kernel_desc->oacc_dims[0];
  if (hsa_kernel_desc->oacc_dims[1] > 0)
    dims[1] = hsa_kernel_desc->oacc_dims[1];
  if (hsa_kernel_desc->oacc_dims[2] > 0)
    dims[2] = hsa_kernel_desc->oacc_dims[2];

  /* Ideally, when a dimension isn't explicitly specified, we should
     tune it to run 40 (or 32?) threads per CU with no threads getting queued.
     In practice, we tune for peak performance on BabelStream, which
     for OpenACC is currently 32 threads per CU.  */
  if (dims[0] == 0 && dims[1] == 0)
    {
      /* If any of the OpenACC dimensions remain 0 then we get to pick a
         number.  There isn't really a correct answer for this without a clue
         about the problem size, so let's do a reasonable number of workers
         and gangs.  */

      dims[0] = get_cu_count (kernel->agent) * 4;  /* Gangs.  */
      dims[1] = 8;  /* Workers.  */
    }
  else if (dims[0] == 0 && dims[1] > 0)
    {
      /* Auto-scale the number of gangs with the requested number of workers.  */
      dims[0] = get_cu_count (kernel->agent) * (32 / dims[1]);
    }
  else if (dims[0] > 0 && dims[1] == 0)
    {
      /* Auto-scale the number of workers with the requested number of gangs.  */
      dims[1] = get_cu_count (kernel->agent) * 32 / dims[0];
      if (dims[1] == 0)
        dims[1] = 1;
      if (dims[1] > 16)
        dims[1] = 16;
    }
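
  /* For example (figures illustrative): on a 64-CU device with neither
     dimension given this picks 256 gangs of 8 workers; with 8 workers
     requested it picks 64*(32/8) == 256 gangs; with 256 gangs requested
     it picks 64*32/256 == 8 workers, clamped to the range [1, 16].  */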

  /* The incoming dimensions are expressed in terms of gangs, workers, and
     vectors.  The HSA dimensions are expressed in terms of "work-items",
     which means multiples of vector lanes.

     The "grid size" specifies the size of the problem space, and the
     "work-group size" specifies how much of that we want a single compute
     unit to chew on at once.

     The three dimensions do not really correspond to hardware, but the
     important thing is that the HSA runtime will launch as many
     work-groups as it takes to process the entire grid, and each
     work-group will contain as many wave-fronts as it takes to process
     the work-items in that group.

     Essentially, as long as we set the Y dimension to 64 (the number of
     vector lanes in hardware), and the Z group size to the maximum (16),
     then we will get the gangs (X) and workers (Z) launched as we expect.

     The reason for the apparent reversal of vector and worker dimension
     order is to do with the way the run-time distributes work-items across
     v1 and v2.  */
  struct GOMP_kernel_launch_attributes kla =
    {3,
     /* Grid size.  */
     {dims[0], 64, dims[1]},
     /* Work-group size.  */
     {1, 64, 16}
    };
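
  /* Concretely (illustrative numbers): 256 gangs and 8 workers become an
     HSA grid of 256 x 64 x 8 work-items in work-groups of 1 x 64 x 16:
     one gang per work-group in X, a full 64-lane vector in Y, and up to
     16 workers sharing a work-group in Z.  */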

  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  acc_prof_info *prof_info = thr->prof_info;
  acc_event_info enqueue_launch_event_info;
  acc_api_info *api_info = thr->api_info;
  bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
  if (profiling_dispatch_p)
    {
      prof_info->event_type = acc_ev_enqueue_launch_start;

      enqueue_launch_event_info.launch_event.event_type
        = prof_info->event_type;
      enqueue_launch_event_info.launch_event.valid_bytes
        = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
      enqueue_launch_event_info.launch_event.parent_construct
        = acc_construct_parallel;
      enqueue_launch_event_info.launch_event.implicit = 1;
      enqueue_launch_event_info.launch_event.tool_info = NULL;
      enqueue_launch_event_info.launch_event.kernel_name
        = (char *) kernel->name;
      enqueue_launch_event_info.launch_event.num_gangs = kla.gdims[0];
      enqueue_launch_event_info.launch_event.num_workers = kla.gdims[2];
      enqueue_launch_event_info.launch_event.vector_length = kla.gdims[1];

      api_info->device_api = acc_device_api_other;

      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
                                            &enqueue_launch_event_info,
                                            api_info);
    }

  if (!async)
    run_kernel (kernel, devaddrs, &kla, NULL, false);
  else
    queue_push_launch (aq, kernel, devaddrs, &kla);

  if (profiling_dispatch_p)
    {
      prof_info->event_type = acc_ev_enqueue_launch_end;
      enqueue_launch_event_info.launch_event.event_type = prof_info->event_type;
      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
                                            &enqueue_launch_event_info,
                                            api_info);
    }
}

/* }}} */
/* {{{ Generic Plugin API  */

/* Return the name of the accelerator, which is "gcn".  */

const char *
GOMP_OFFLOAD_get_name (void)
{
  return "gcn";
}

/* Return the specific capabilities the HSA accelerator has.  */

unsigned int
GOMP_OFFLOAD_get_caps (void)
{
  /* FIXME: Enable shared memory for APU, but not discrete GPU.  */
  return /*GOMP_OFFLOAD_CAP_SHARED_MEM |*/ GOMP_OFFLOAD_CAP_OPENMP_400
    | GOMP_OFFLOAD_CAP_OPENACC_200;
}

/* Identify as GCN accelerator.  */

int
GOMP_OFFLOAD_get_type (void)
{
  return OFFLOAD_TARGET_TYPE_GCN;
}

/* Return the libgomp version number we're compatible with.  There is
   no requirement for cross-version compatibility.  */

unsigned
GOMP_OFFLOAD_version (void)
{
  return GOMP_VERSION;
}
/* Return the number of GCN devices on the system.  */

int
GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask)
{
  if (!init_hsa_context ())
    return 0;
  /* Return -1 if the omp_requires_mask cannot be fulfilled, but
     devices were present.  */
  if (hsa_context.agent_count > 0
      && ((omp_requires_mask
           & ~(GOMP_REQUIRES_UNIFIED_ADDRESS
               | GOMP_REQUIRES_REVERSE_OFFLOAD)) != 0))
    return -1;
  return hsa_context.agent_count;
}
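
/* In other words, only the "unified_address" and "reverse_offload"
   requirements can be satisfied here; a program compiled with, for example,
   "#pragma omp requires unified_shared_memory" will make this hook return
   -1, marking the GCN devices as unavailable to it.  */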

/* Initialize device (agent) number N so that it can be used for computation.
   Return TRUE on success.  */

bool
GOMP_OFFLOAD_init_device (int n)
{
  if (!init_hsa_context ())
    return false;
  if (n >= hsa_context.agent_count)
    {
      GOMP_PLUGIN_error ("Request to initialize non-existent GCN device %i", n);
      return false;
    }
  struct agent_info *agent = &hsa_context.agents[n];

  if (agent->initialized)
    return true;

  agent->device_id = n;

  if (pthread_rwlock_init (&agent->module_rwlock, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent rwlock");
      return false;
    }
  if (pthread_mutex_init (&agent->prog_mutex, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent program mutex");
      return false;
    }
  if (pthread_mutex_init (&agent->async_queues_mutex, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
      return false;
    }
  if (pthread_mutex_init (&agent->ephemeral_memories_write_lock, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
      return false;
    }
  agent->async_queues = NULL;
  agent->omp_async_queue = NULL;
  agent->ephemeral_memories_list = NULL;

  uint32_t queue_size;
  hsa_status_t status;
  status = hsa_fns.hsa_agent_get_info_fn (agent->id,
                                          HSA_AGENT_INFO_QUEUE_MAX_SIZE,
                                          &queue_size);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error requesting maximum queue size of the GCN agent",
                      status);

  status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_NAME,
                                          &agent->name);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error querying the name of the agent", status);

  agent->device_isa = isa_code (agent->name);
  if (agent->device_isa == EF_AMDGPU_MACH_UNSUPPORTED)
    return hsa_error ("Unknown GCN agent architecture", HSA_STATUS_ERROR);

  status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_VENDOR_NAME,
                                          &agent->vendor_name);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error querying the vendor name of the agent", status);

  status = hsa_fns.hsa_queue_create_fn (agent->id, queue_size,
                                        HSA_QUEUE_TYPE_MULTI,
                                        hsa_queue_callback, NULL, UINT32_MAX,
                                        UINT32_MAX, &agent->sync_queue);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error creating command queue", status);

  agent->kernarg_region.handle = (uint64_t) -1;
  status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
                                                 get_kernarg_memory_region,
                                                 &agent->kernarg_region);
  if (status != HSA_STATUS_SUCCESS
      && status != HSA_STATUS_INFO_BREAK)
    hsa_error ("Scanning memory regions failed", status);
  if (agent->kernarg_region.handle == (uint64_t) -1)
    {
      GOMP_PLUGIN_error ("Could not find suitable memory region for kernel "
                         "arguments");
      return false;
    }
  GCN_DEBUG ("Selected kernel arguments memory region:\n");
  dump_hsa_region (agent->kernarg_region, NULL);

  agent->data_region.handle = (uint64_t) -1;
  status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
                                                 get_data_memory_region,
                                                 &agent->data_region);
  if (status != HSA_STATUS_SUCCESS
      && status != HSA_STATUS_INFO_BREAK)
    hsa_error ("Scanning memory regions failed", status);
  if (agent->data_region.handle == (uint64_t) -1)
    {
      GOMP_PLUGIN_error ("Could not find suitable memory region for device "
                         "data");
      return false;
    }
  GCN_DEBUG ("Selected device data memory region:\n");
  dump_hsa_region (agent->data_region, NULL);

  GCN_DEBUG ("GCN agent %d initialized\n", n);

  agent->initialized = true;
  return true;
}

/* Load GCN object-code module described by struct gcn_image_desc in
   TARGET_DATA and return references to kernel descriptors in TARGET_TABLE.
   If there are any constructors then run them.  If not NULL, REV_FN_TABLE will
   contain the on-device addresses of the functions for reverse offload.  To be
   freed by the caller.  */

int
GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
                         struct addr_pair **target_table,
                         uint64_t **rev_fn_table,
                         uint64_t *host_ind_fn_table)
237957cc
AS
3456{
3457 if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
3458 {
3459 GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3460 " (expected %u, received %u)",
3461 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
3462 return -1;
3463 }
3464
3465 struct gcn_image_desc *image_desc = (struct gcn_image_desc *) target_data;
3466 struct agent_info *agent;
3467 struct addr_pair *pair;
3468 struct module_info *module;
3469 struct kernel_info *kernel;
3470 int kernel_count = image_desc->kernel_count;
a49c7d31
KCY
3471 unsigned ind_func_count = GOMP_VERSION_SUPPORTS_INDIRECT_FUNCS (version)
3472 ? image_desc->ind_func_count : 0;
237957cc 3473 unsigned var_count = image_desc->global_variable_count;
9f2fca56 3474 /* Currently, "others" is a struct of ICVS. */
0bac793e 3475 int other_count = 1;
237957cc
AS
3476
3477 agent = get_agent_info (ord);
3478 if (!agent)
3479 return -1;
3480
3481 if (pthread_rwlock_wrlock (&agent->module_rwlock))
3482 {
3483 GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3484 return -1;
3485 }
3486 if (agent->prog_finalized
3487 && !destroy_hsa_program (agent))
3488 return -1;
3489
3490 GCN_DEBUG ("Encountered %d kernels in an image\n", kernel_count);
a49c7d31 3491 GCN_DEBUG ("Encountered %d indirect functions in an image\n", ind_func_count);
237957cc 3492 GCN_DEBUG ("Encountered %u global variables in an image\n", var_count);
0bac793e
CLT
3493 GCN_DEBUG ("Expect %d other variables in an image\n", other_count);
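  /* The _init_array and _fini_array pseudo-kernels are consumed internally
     (see below) and get no entry in TARGET_TABLE, which is what the "- 2"
     in the allocation accounts for; GCN images are expected always to
     provide both.  */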
  pair = GOMP_PLUGIN_malloc ((kernel_count + var_count + other_count - 2)
                             * sizeof (struct addr_pair));
  *target_table = pair;
  module = (struct module_info *)
    GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info)
                                + kernel_count * sizeof (struct kernel_info));
  module->image_desc = image_desc;
  module->kernel_count = kernel_count;
  module->heap = NULL;
  module->constructors_run_p = false;

  kernel = &module->kernels[0];

  /* Allocate memory for kernel dependencies.  */
  for (unsigned i = 0; i < kernel_count; i++)
    {
      struct hsa_kernel_description *d = &image_desc->kernel_infos[i];
      if (!init_basic_kernel_info (kernel, d, agent, module))
        return -1;
      if (strcmp (d->name, "_init_array") == 0)
        module->init_array_func = kernel;
      else if (strcmp (d->name, "_fini_array") == 0)
        module->fini_array_func = kernel;
      else
        {
          pair->start = (uintptr_t) kernel;
          pair->end = (uintptr_t) (kernel + 1);
          pair++;
        }
      kernel++;
    }

  agent->module = module;
  if (pthread_rwlock_unlock (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
      return -1;
    }

  if (!create_and_finalize_hsa_program (agent))
    return -1;

  if (var_count > 0)
    {
      hsa_status_t status;
      hsa_executable_symbol_t var_symbol;
      status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
                                                     ".offload_var_table",
                                                     agent->id,
                                                     0, &var_symbol);

      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not find symbol for variable in the code object",
                   status);

      uint64_t var_table_addr;
      status = hsa_fns.hsa_executable_symbol_get_info_fn
        (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
         &var_table_addr);
      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not extract a variable from its symbol", status);

      struct {
        uint64_t addr;
        uint64_t size;
      } var_table[var_count];
      GOMP_OFFLOAD_dev2host (agent->device_id, var_table,
                             (void*)var_table_addr, sizeof (var_table));

      for (unsigned i = 0; i < var_count; i++)
        {
          pair->start = var_table[i].addr;
          pair->end = var_table[i].addr + var_table[i].size;
          GCN_DEBUG ("Found variable at %p with size %lu\n",
                     (void *)var_table[i].addr, var_table[i].size);
          pair++;
        }
    }

  if (ind_func_count > 0)
    {
      hsa_status_t status;

      /* Read indirect function table from image.  */
      hsa_executable_symbol_t ind_funcs_symbol;
      status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
                                                     ".offload_ind_func_table",
                                                     agent->id,
                                                     0, &ind_funcs_symbol);

      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not find .offload_ind_func_table symbol in the "
                   "code object", status);

      uint64_t ind_funcs_table_addr;
      status = hsa_fns.hsa_executable_symbol_get_info_fn
        (ind_funcs_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
         &ind_funcs_table_addr);
      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not extract a variable from its symbol", status);

      uint64_t ind_funcs_table[ind_func_count];
      GOMP_OFFLOAD_dev2host (agent->device_id, ind_funcs_table,
                             (void*) ind_funcs_table_addr,
                             sizeof (ind_funcs_table));

      /* Build host->target address map for indirect functions.  */
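      /* The map is laid out as a flat array of {host address, device
         address} pairs followed by a single zero terminator entry; the
         device-side dispatch code locates it through the
         GOMP_INDIRECT_ADDR_MAP variable written below.  */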
      uint64_t ind_fn_map[ind_func_count * 2 + 1];
      for (unsigned i = 0; i < ind_func_count; i++)
        {
          ind_fn_map[i * 2] = host_ind_fn_table[i];
          ind_fn_map[i * 2 + 1] = ind_funcs_table[i];
          GCN_DEBUG ("Indirect function %u: %lx->%lx\n",
                     i, host_ind_fn_table[i], ind_funcs_table[i]);
        }
      ind_fn_map[ind_func_count * 2] = 0;

      /* Write the map onto the target.  */
      void *map_target_addr
        = GOMP_OFFLOAD_alloc (agent->device_id, sizeof (ind_fn_map));
      GCN_DEBUG ("Allocated indirect map at %p\n", map_target_addr);

      GOMP_OFFLOAD_host2dev (agent->device_id, map_target_addr,
                             (void*) ind_fn_map,
                             sizeof (ind_fn_map));

      /* Write address of the map onto the target.  */
      hsa_executable_symbol_t symbol;

      status
        = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
                                                XSTRING (GOMP_INDIRECT_ADDR_MAP),
                                                agent->id, 0, &symbol);
      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not find GOMP_INDIRECT_ADDR_MAP in code object",
                   status);

      uint64_t varptr;
      uint32_t varsize;

      status = hsa_fns.hsa_executable_symbol_get_info_fn
        (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
         &varptr);
      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not extract a variable from its symbol", status);
      status = hsa_fns.hsa_executable_symbol_get_info_fn
        (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
         &varsize);
      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not extract a variable size from its symbol",
                   status);

      GCN_DEBUG ("Found GOMP_INDIRECT_ADDR_MAP at %lx with size %d\n",
                 varptr, varsize);

      GOMP_OFFLOAD_host2dev (agent->device_id, (void *) varptr,
                             &map_target_addr,
                             sizeof (map_target_addr));
    }

  GCN_DEBUG ("Looking for variable %s\n", XSTRING (GOMP_ADDITIONAL_ICVS));

  hsa_status_t status;
  hsa_executable_symbol_t var_symbol;
  status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
                                                 XSTRING (GOMP_ADDITIONAL_ICVS),
                                                 agent->id, 0, &var_symbol);
  if (status == HSA_STATUS_SUCCESS)
    {
      uint64_t varptr;
      uint32_t varsize;

      status = hsa_fns.hsa_executable_symbol_get_info_fn
        (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
         &varptr);
      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not extract a variable from its symbol", status);
      status = hsa_fns.hsa_executable_symbol_get_info_fn
        (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
         &varsize);
      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not extract a variable size from its symbol",
                   status);

      pair->start = varptr;
      pair->end = varptr + varsize;
    }
  else
    {
      /* The variable was not in this image.  */
      GCN_DEBUG ("Variable not found in image: %s\n",
                 XSTRING (GOMP_ADDITIONAL_ICVS));
      pair->start = pair->end = 0;
    }

  /* Ensure that constructors are run first.  */
  struct GOMP_kernel_launch_attributes kla =
    { 3,
      /* Grid size.  */
      { 1, 64, 1 },
      /* Work-group size.  */
      { 1, 64, 1 }
    };
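  /* (Both shapes above launch a single 64-work-item work-group, i.e. one
     GCN wavefront, which is all the constructor kernels need.)  */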

  if (module->init_array_func)
    {
      init_kernel (module->init_array_func);
      run_kernel (module->init_array_func, NULL, &kla, NULL, false);
    }
  module->constructors_run_p = true;

  /* Don't report kernels that libgomp need not know about.  */
  if (module->init_array_func)
    kernel_count--;
  if (module->fini_array_func)
    kernel_count--;

  if (rev_fn_table != NULL && kernel_count == 0)
    *rev_fn_table = NULL;
  else if (rev_fn_table != NULL)
    {
      hsa_status_t status;
      hsa_executable_symbol_t var_symbol;
      status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
                                                     ".offload_func_table",
                                                     agent->id, 0, &var_symbol);
      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not find symbol for variable in the code object",
                   status);
      uint64_t fn_table_addr;
      status = hsa_fns.hsa_executable_symbol_get_info_fn
        (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
         &fn_table_addr);
      if (status != HSA_STATUS_SUCCESS)
        hsa_fatal ("Could not extract a variable from its symbol", status);
      *rev_fn_table = GOMP_PLUGIN_malloc (kernel_count * sizeof (uint64_t));
      GOMP_OFFLOAD_dev2host (agent->device_id, *rev_fn_table,
                             (void*) fn_table_addr,
                             kernel_count * sizeof (uint64_t));
    }

  return kernel_count + var_count + other_count;
}

/* Unload GCN object-code module described by struct gcn_image_desc in
   TARGET_DATA from agent number N.  Return TRUE on success.  */

bool
GOMP_OFFLOAD_unload_image (int n, unsigned version, const void *target_data)
{
  if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
                         " (expected %u, received %u)",
                         GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
      return false;
    }

  struct agent_info *agent;
  agent = get_agent_info (n);
  if (!agent)
    return false;

  if (pthread_rwlock_wrlock (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
      return false;
    }

  if (!agent->module || agent->module->image_desc != target_data)
    {
      GOMP_PLUGIN_error ("Attempt to unload an image that has never been "
                         "loaded before");
      return false;
    }

  if (!destroy_module (agent->module, true))
    return false;
  free (agent->module);
  agent->module = NULL;
  if (!destroy_hsa_program (agent))
    return false;
  if (pthread_rwlock_unlock (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
      return false;
    }
  return true;
}

/* Deinitialize all information and status associated with agent number N.  We
   do not attempt any synchronization, assuming the user and libgomp will not
   attempt deinitialization of a device that is in any way being used at the
   same time.  Return TRUE on success.  */

bool
GOMP_OFFLOAD_fini_device (int n)
{
  struct agent_info *agent = get_agent_info (n);
  if (!agent)
    return false;

  if (!agent->initialized)
    return true;

  if (agent->omp_async_queue)
    {
      GOMP_OFFLOAD_openacc_async_destruct (agent->omp_async_queue);
      agent->omp_async_queue = NULL;
    }

  if (agent->module)
    {
      if (!destroy_module (agent->module, false))
        return false;
      free (agent->module);
      agent->module = NULL;
    }

  if (!destroy_ephemeral_memories (agent))
    return false;

  if (!destroy_hsa_program (agent))
    return false;

  hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (agent->sync_queue);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error destroying command queue", status);

  if (pthread_mutex_destroy (&agent->prog_mutex))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex");
      return false;
    }
  if (pthread_rwlock_destroy (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN agent rwlock");
      return false;
    }

  if (pthread_mutex_destroy (&agent->async_queues_mutex))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN agent queue mutex");
      return false;
    }
  if (pthread_mutex_destroy (&agent->ephemeral_memories_write_lock))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex");
      return false;
    }
  agent->initialized = false;
  return true;
}

/* Return true if the HSA runtime can run function FN_PTR.  */

bool
GOMP_OFFLOAD_can_run (void *fn_ptr)
{
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;

  init_kernel (kernel);
  if (kernel->initialization_failed)
    goto failure;

  return true;

failure:
  if (suppress_host_fallback)
    GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
  GCN_WARNING ("GCN target cannot be launched, doing a host fallback\n");
  return false;
}

/* Allocate memory on device N.  */

void *
GOMP_OFFLOAD_alloc (int n, size_t size)
{
  struct agent_info *agent = get_agent_info (n);
  return alloc_by_agent (agent, size);
}

/* Free memory from device N.  */

bool
GOMP_OFFLOAD_free (int device, void *ptr)
{
  GCN_DEBUG ("Freeing memory on device %d\n", device);

  hsa_status_t status = hsa_fns.hsa_memory_free_fn (ptr);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not free device memory", status);
      return false;
    }

  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  bool profiling_dispatch_p
    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
  if (profiling_dispatch_p)
    {
      acc_prof_info *prof_info = thr->prof_info;
      acc_event_info data_event_info;
      acc_api_info *api_info = thr->api_info;

      prof_info->event_type = acc_ev_free;

      data_event_info.data_event.event_type = prof_info->event_type;
      data_event_info.data_event.valid_bytes
        = _ACC_DATA_EVENT_INFO_VALID_BYTES;
      data_event_info.data_event.parent_construct
        = acc_construct_parallel;
      data_event_info.data_event.implicit = 1;
      data_event_info.data_event.tool_info = NULL;
      data_event_info.data_event.var_name = NULL;
      data_event_info.data_event.bytes = 0;
      data_event_info.data_event.host_ptr = NULL;
      data_event_info.data_event.device_ptr = (void *) ptr;

      api_info->device_api = acc_device_api_other;

      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
                                            api_info);
    }

  return true;
}

/* Copy data from DEVICE to host.  */

bool
GOMP_OFFLOAD_dev2host (int device, void *dst, const void *src, size_t n)
{
  GCN_DEBUG ("Copying %zu bytes from device %d (%p) to host (%p)\n", n, device,
             src, dst);
  hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
  if (status != HSA_STATUS_SUCCESS)
    {
      GOMP_PLUGIN_error ("memory copy failed");
      return false;
    }
  return true;
}

/* Copy data from host to DEVICE.  */

bool
GOMP_OFFLOAD_host2dev (int device, void *dst, const void *src, size_t n)
{
  GCN_DEBUG ("Copying %zu bytes from host (%p) to device %d (%p)\n", n, src,
             device, dst);
  hsa_memory_copy_wrapper (dst, src, n);
  return true;
}

/* Copy data within DEVICE.  Do the copy asynchronously, if appropriate.  */

bool
GOMP_OFFLOAD_dev2dev (int device, void *dst, const void *src, size_t n)
{
  struct gcn_thread *thread_data = gcn_thread ();

  if (thread_data && !async_synchronous_p (thread_data->async))
    {
      struct agent_info *agent = get_agent_info (device);
      maybe_init_omp_async (agent);
      queue_push_copy (agent->omp_async_queue, dst, src, n);
      return true;
    }

  GCN_DEBUG ("Copying %zu bytes from device %d (%p) to device %d (%p)\n", n,
             device, src, device, dst);
  hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
  if (status != HSA_STATUS_SUCCESS)
    {
      GOMP_PLUGIN_error ("memory copy failed");
      return false;
    }
  return true;
}

/* Here <quantity>_size refers to <quantity> multiplied by size -- i.e.
   measured in bytes.  So we have:

     dim1_size: number of bytes to copy on innermost dimension ("row")
     dim0_len: number of rows to copy
     dst: base pointer for destination of copy
     dst_offset1_size: innermost row offset (for dest), in bytes
     dst_offset0_len: offset, number of rows (for dest)
     dst_dim1_size: whole-array dest row length, in bytes (pitch)
     src: base pointer for source of copy
     src_offset1_size: innermost row offset (for source), in bytes
     src_offset0_len: offset, number of rows (for source)
     src_dim1_size: whole-array source row length, in bytes (pitch)
*/

int
GOMP_OFFLOAD_memcpy2d (int dst_ord, int src_ord, size_t dim1_size,
                       size_t dim0_len, void *dst, size_t dst_offset1_size,
                       size_t dst_offset0_len, size_t dst_dim1_size,
                       const void *src, size_t src_offset1_size,
                       size_t src_offset0_len, size_t src_dim1_size)
{
  if (!hsa_fns.hsa_amd_memory_lock_fn
      || !hsa_fns.hsa_amd_memory_unlock_fn
      || !hsa_fns.hsa_amd_memory_async_copy_rect_fn)
    return -1;

  /* GCN hardware requires 4-byte alignment for base addresses & pitches.  Bail
     out quietly if we have anything oddly-aligned rather than letting the
     driver raise an error.  */
  if ((((uintptr_t) dst) & 3) != 0 || (((uintptr_t) src) & 3) != 0)
    return -1;

  if ((dst_dim1_size & 3) != 0 || (src_dim1_size & 3) != 0)
    return -1;

  /* Only handle host to device or device to host transfers here.  */
  if ((dst_ord == -1 && src_ord == -1)
      || (dst_ord != -1 && src_ord != -1))
    return -1;

  hsa_amd_copy_direction_t dir
    = (src_ord == -1) ? hsaHostToDevice : hsaDeviceToHost;
  hsa_agent_t copy_agent;

  /* We need to pin (lock) host memory before we start the transfer.  Try to
     lock the minimum size necessary, i.e. using partial first/last rows of the
     whole array.  Something like this:

           rows -->
           ..............
       c | ..#######+++++ <- first row apart from {src,dst}_offset1_size
       o | ++#######+++++ <- whole row
       l | ++#######+++++ <-   "
       s v ++#######..... <- last row apart from trailing remainder
           ..............

     We could split very large transfers into several rectangular copies, but
     that is unimplemented for now.  */

  size_t bounded_size_host, first_elem_offset_host;
  void *host_ptr;
  if (dir == hsaHostToDevice)
    {
      bounded_size_host = src_dim1_size * (dim0_len - 1) + dim1_size;
      first_elem_offset_host = src_offset0_len * src_dim1_size
                               + src_offset1_size;
      host_ptr = (void *) src;
      struct agent_info *agent = get_agent_info (dst_ord);
      copy_agent = agent->id;
    }
  else
    {
      bounded_size_host = dst_dim1_size * (dim0_len - 1) + dim1_size;
      first_elem_offset_host = dst_offset0_len * dst_dim1_size
                               + dst_offset1_size;
      host_ptr = dst;
      struct agent_info *agent = get_agent_info (src_ord);
      copy_agent = agent->id;
    }

  void *agent_ptr;

  hsa_status_t status
    = hsa_fns.hsa_amd_memory_lock_fn (host_ptr + first_elem_offset_host,
                                      bounded_size_host, NULL, 0, &agent_ptr);
  /* We can't lock the host memory: don't give up though, we might still be
     able to use the slow path in our caller.  So, don't make this an
     error.  */
  if (status != HSA_STATUS_SUCCESS)
    return -1;

  hsa_pitched_ptr_t dstpp, srcpp;
  hsa_dim3_t dst_offsets, src_offsets, ranges;

  int retval = 1;

  hsa_signal_t completion_signal;
  status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &completion_signal);
  if (status != HSA_STATUS_SUCCESS)
    {
      retval = -1;
      goto unlock;
    }

  if (dir == hsaHostToDevice)
    {
      srcpp.base = agent_ptr - first_elem_offset_host;
      dstpp.base = dst;
    }
  else
    {
      srcpp.base = (void *) src;
      dstpp.base = agent_ptr - first_elem_offset_host;
    }

  srcpp.pitch = src_dim1_size;
  srcpp.slice = 0;

  src_offsets.x = src_offset1_size;
  src_offsets.y = src_offset0_len;
  src_offsets.z = 0;

  dstpp.pitch = dst_dim1_size;
  dstpp.slice = 0;

  dst_offsets.x = dst_offset1_size;
  dst_offsets.y = dst_offset0_len;
  dst_offsets.z = 0;

  ranges.x = dim1_size;
  ranges.y = dim0_len;
  ranges.z = 1;

  status
    = hsa_fns.hsa_amd_memory_async_copy_rect_fn (&dstpp, &dst_offsets, &srcpp,
                                                 &src_offsets, &ranges,
                                                 copy_agent, dir, 0, NULL,
                                                 completion_signal);
  /* If the rectangular copy fails, we might still be able to use the slow
     path.  We need to unlock the host memory though, so don't return
     immediately.  */
  if (status != HSA_STATUS_SUCCESS)
    retval = -1;
  else
    hsa_fns.hsa_signal_wait_acquire_fn (completion_signal,
                                        HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX,
                                        HSA_WAIT_STATE_ACTIVE);

  hsa_fns.hsa_signal_destroy_fn (completion_signal);

unlock:
  status = hsa_fns.hsa_amd_memory_unlock_fn (host_ptr + first_elem_offset_host);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not unlock host memory", status);

  return retval;
}
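
/* A minimal usage sketch of the 2D interface above (not part of the plugin:
   the function name and all sizes below are hypothetical, chosen only to
   illustrate the byte-based parameter convention documented before
   GOMP_OFFLOAD_memcpy2d).  It copies a 4-row, 256-byte-wide sub-rectangle
   of a pitched host array into the top-left corner of a pitched device
   array; src_ord == -1 denotes the host side.  */
#if 0
static void
example_memcpy2d (int dev_ord, void *dev, const char *host)
{
  size_t host_pitch = 1024;   /* Whole-array host row length, in bytes.  */
  size_t dev_pitch = 512;     /* Whole-array device row length, in bytes.  */

  /* Copy host rows 8..11, bytes 64..319 of each row.  Both base pointers
     and both pitches satisfy the 4-byte alignment requirement above.  */
  int ok = GOMP_OFFLOAD_memcpy2d (dev_ord, -1,
                                  256 /* dim1_size */, 4 /* dim0_len */,
                                  dev, 0, 0, dev_pitch,
                                  host, 64, 8, host_pitch);
  if (ok != 1)
    {
      /* The fast path declined (unaligned input, lock failure, ...);
         a real caller would fall back to row-by-row copies.  */
    }
}
#endif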

/* As above, <quantity>_size refers to <quantity> multiplied by size -- i.e.
   measured in bytes.  So we have:

     dim2_size: number of bytes to copy on innermost dimension ("row")
     dim1_len: number of rows per slice to copy
     dim0_len: number of slices to copy
     dst: base pointer for destination of copy
     dst_offset2_size: innermost row offset (for dest), in bytes
     dst_offset1_len: offset, number of rows (for dest)
     dst_offset0_len: offset, number of slices (for dest)
     dst_dim2_size: whole-array dest row length, in bytes (pitch)
     dst_dim1_len: whole-array number of rows in slice (for dest)
     src: base pointer for source of copy
     src_offset2_size: innermost row offset (for source), in bytes
     src_offset1_len: offset, number of rows (for source)
     src_offset0_len: offset, number of slices (for source)
     src_dim2_size: whole-array source row length, in bytes (pitch)
     src_dim1_len: whole-array number of rows in slice (for source)
*/

int
GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size,
                       size_t dim1_len, size_t dim0_len, void *dst,
                       size_t dst_offset2_size, size_t dst_offset1_len,
                       size_t dst_offset0_len, size_t dst_dim2_size,
                       size_t dst_dim1_len, const void *src,
                       size_t src_offset2_size, size_t src_offset1_len,
                       size_t src_offset0_len, size_t src_dim2_size,
                       size_t src_dim1_len)
{
  if (!hsa_fns.hsa_amd_memory_lock_fn
      || !hsa_fns.hsa_amd_memory_unlock_fn
      || !hsa_fns.hsa_amd_memory_async_copy_rect_fn)
    return -1;

  /* GCN hardware requires 4-byte alignment for base addresses & pitches.  Bail
     out quietly if we have anything oddly-aligned rather than letting the
     driver raise an error.  */
  if ((((uintptr_t) dst) & 3) != 0 || (((uintptr_t) src) & 3) != 0)
    return -1;

  if ((dst_dim2_size & 3) != 0 || (src_dim2_size & 3) != 0)
    return -1;

  /* Only handle host to device or device to host transfers here.  */
  if ((dst_ord == -1 && src_ord == -1)
      || (dst_ord != -1 && src_ord != -1))
    return -1;

  hsa_amd_copy_direction_t dir
    = (src_ord == -1) ? hsaHostToDevice : hsaDeviceToHost;
  hsa_agent_t copy_agent;

  /* We need to pin (lock) host memory before we start the transfer.  Try to
     lock the minimum size necessary, i.e. using partial first/last slices of
     the whole 3D array.  Something like this:

          slice 0:          slice 1:          slice 2:
           __________        __________        __________
        ^ /+++++++++/ :     /+++++++++/ :     /         /
  column /+++##++++/| |    /+++##++++/| |    /+++##    /    # = subarray
        / / ##++++/ | |   /+++##++++/ | |   /+++##++++/     + = area to pin
       /_________/  :    /_________/  :    /_________/
                row --->

     We could split very large transfers into several rectangular copies, but
     that is unimplemented for now.  */

  size_t bounded_size_host, first_elem_offset_host;
  void *host_ptr;
  if (dir == hsaHostToDevice)
    {
      size_t slice_bytes = src_dim2_size * src_dim1_len;
      bounded_size_host = slice_bytes * (dim0_len - 1)
                          + src_dim2_size * (dim1_len - 1)
                          + dim2_size;
      first_elem_offset_host = src_offset0_len * slice_bytes
                               + src_offset1_len * src_dim2_size
                               + src_offset2_size;
      host_ptr = (void *) src;
      struct agent_info *agent = get_agent_info (dst_ord);
      copy_agent = agent->id;
    }
  else
    {
      size_t slice_bytes = dst_dim2_size * dst_dim1_len;
      bounded_size_host = slice_bytes * (dim0_len - 1)
                          + dst_dim2_size * (dim1_len - 1)
                          + dim2_size;
      first_elem_offset_host = dst_offset0_len * slice_bytes
                               + dst_offset1_len * dst_dim2_size
                               + dst_offset2_size;
      host_ptr = dst;
      struct agent_info *agent = get_agent_info (src_ord);
      copy_agent = agent->id;
    }

  void *agent_ptr;

  hsa_status_t status
    = hsa_fns.hsa_amd_memory_lock_fn (host_ptr + first_elem_offset_host,
                                      bounded_size_host, NULL, 0, &agent_ptr);
  /* We can't lock the host memory: don't give up though, we might still be
     able to use the slow path in our caller (maybe even with iterated memcpy2d
     calls).  So, don't make this an error.  */
  if (status != HSA_STATUS_SUCCESS)
    return -1;

  hsa_pitched_ptr_t dstpp, srcpp;
  hsa_dim3_t dst_offsets, src_offsets, ranges;

  int retval = 1;

  hsa_signal_t completion_signal;
  status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &completion_signal);
  if (status != HSA_STATUS_SUCCESS)
    {
      retval = -1;
      goto unlock;
    }

  if (dir == hsaHostToDevice)
    {
      srcpp.base = agent_ptr - first_elem_offset_host;
      dstpp.base = dst;
    }
  else
    {
      srcpp.base = (void *) src;
      dstpp.base = agent_ptr - first_elem_offset_host;
    }

  /* Pitch is measured in bytes.  */
  srcpp.pitch = src_dim2_size;
  /* Slice is also measured in bytes (i.e. total per-slice).  */
  srcpp.slice = src_dim2_size * src_dim1_len;

  src_offsets.x = src_offset2_size;
  src_offsets.y = src_offset1_len;
  src_offsets.z = src_offset0_len;

  /* As above.  */
  dstpp.pitch = dst_dim2_size;
  dstpp.slice = dst_dim2_size * dst_dim1_len;

  dst_offsets.x = dst_offset2_size;
  dst_offsets.y = dst_offset1_len;
  dst_offsets.z = dst_offset0_len;

  ranges.x = dim2_size;
  ranges.y = dim1_len;
  ranges.z = dim0_len;

  status
    = hsa_fns.hsa_amd_memory_async_copy_rect_fn (&dstpp, &dst_offsets, &srcpp,
                                                 &src_offsets, &ranges,
                                                 copy_agent, dir, 0, NULL,
                                                 completion_signal);
  /* If the rectangular copy fails, we might still be able to use the slow
     path.  We need to unlock the host memory though, so don't return
     immediately.  */
  if (status != HSA_STATUS_SUCCESS)
    retval = -1;
  else
    {
      hsa_signal_value_t sv
        = hsa_fns.hsa_signal_wait_acquire_fn (completion_signal,
                                              HSA_SIGNAL_CONDITION_LT, 1,
                                              UINT64_MAX,
                                              HSA_WAIT_STATE_ACTIVE);
      if (sv < 0)
        {
          GCN_WARNING ("async copy rect failure");
          retval = -1;
        }
    }

  hsa_fns.hsa_signal_destroy_fn (completion_signal);

unlock:
  status = hsa_fns.hsa_amd_memory_unlock_fn (host_ptr + first_elem_offset_host);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not unlock host memory", status);

  return retval;
}
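
/* Worked example for the pinning bounds above (illustrative numbers only):
   with src_dim2_size = 512, src_dim1_len = 8, dim0_len = 4, dim1_len = 8 and
   dim2_size = 512, slice_bytes is 512 * 8 = 4096, so bounded_size_host is
   4096 * 3 + 512 * 7 + 512 = 16384 bytes -- the span from the first byte
   copied in the first slice to the last byte copied in the last slice.  */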

/* }}} */
/* {{{ OpenMP Plugin API */

/* Run a synchronous OpenMP kernel on DEVICE and pass it an array of pointers
   in VARS as a parameter.  The kernel is identified by FN_PTR which must point
   to a kernel_info structure, and must have previously been loaded to the
   specified device.  */

void
GOMP_OFFLOAD_run (int device, void *fn_ptr, void *vars, void **args)
{
  struct agent_info *agent = get_agent_info (device);
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
  struct GOMP_kernel_launch_attributes def;
  struct GOMP_kernel_launch_attributes *kla;
  assert (agent == kernel->agent);

  /* If we get here then the kernel must be OpenMP.  */
  kernel->kind = KIND_OPENMP;

  if (!parse_target_attributes (args, &def, &kla, agent))
    {
      GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
      return;
    }
  run_kernel (kernel, vars, kla, NULL, false);
}

/* Run an asynchronous OpenMP kernel on DEVICE.  This is similar to
   GOMP_OFFLOAD_run except that the launch is queued and there is a call to
   GOMP_PLUGIN_target_task_completion when it has finished.  */

void
GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
                        void **args, void *async_data)
{
  GCN_DEBUG ("GOMP_OFFLOAD_async_run invoked\n");
  struct agent_info *agent = get_agent_info (device);
  struct kernel_info *kernel = (struct kernel_info *) tgt_fn;
  struct GOMP_kernel_launch_attributes def;
  struct GOMP_kernel_launch_attributes *kla;
  assert (agent == kernel->agent);

  /* If we get here then the kernel must be OpenMP.  */
  kernel->kind = KIND_OPENMP;

  if (!parse_target_attributes (args, &def, &kla, agent))
    {
      GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
      return;
    }

  maybe_init_omp_async (agent);
  queue_push_launch (agent->omp_async_queue, kernel, tgt_vars, kla);
  queue_push_callback (agent->omp_async_queue,
                       GOMP_PLUGIN_target_task_completion, async_data);
}

/* }}} */
/* {{{ OpenACC Plugin API */

/* Run a synchronous OpenACC kernel.  The device number is inferred from the
   already-loaded KERNEL.  */

void
GOMP_OFFLOAD_openacc_exec (void (*fn_ptr) (void *),
                           size_t mapnum __attribute__((unused)),
                           void **hostaddrs __attribute__((unused)),
                           void **devaddrs, unsigned *dims,
                           void *targ_mem_desc)
{
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;

  gcn_exec (kernel, devaddrs, dims, targ_mem_desc, false, NULL);
}

/* Run an asynchronous OpenACC kernel on the specified queue.  */

void
GOMP_OFFLOAD_openacc_async_exec (void (*fn_ptr) (void *),
                                 size_t mapnum __attribute__((unused)),
                                 void **hostaddrs __attribute__((unused)),
                                 void **devaddrs,
                                 unsigned *dims, void *targ_mem_desc,
                                 struct goacc_asyncqueue *aq)
{
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;

  gcn_exec (kernel, devaddrs, dims, targ_mem_desc, true, aq);
}

/* Create a new asynchronous thread and queue for running future kernels.  */

struct goacc_asyncqueue *
GOMP_OFFLOAD_openacc_async_construct (int device)
{
  struct agent_info *agent = get_agent_info (device);

  pthread_mutex_lock (&agent->async_queues_mutex);

  struct goacc_asyncqueue *aq = GOMP_PLUGIN_malloc (sizeof (*aq));
  aq->agent = get_agent_info (device);
  aq->prev = NULL;
  aq->next = agent->async_queues;
  if (aq->next)
    {
      aq->next->prev = aq;
      aq->id = aq->next->id + 1;
    }
  else
    aq->id = 1;
  agent->async_queues = aq;

  aq->queue_first = 0;
  aq->queue_n = 0;
  aq->drain_queue_stop = 0;

  if (pthread_mutex_init (&aq->mutex, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
      return NULL;
    }
  if (pthread_cond_init (&aq->queue_cond_in, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
      return NULL;
    }
  if (pthread_cond_init (&aq->queue_cond_out, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
      return NULL;
    }

  hsa_status_t status = hsa_fns.hsa_queue_create_fn (agent->id,
                                                     ASYNC_QUEUE_SIZE,
                                                     HSA_QUEUE_TYPE_MULTI,
                                                     hsa_queue_callback, NULL,
                                                     UINT32_MAX, UINT32_MAX,
                                                     &aq->hsa_queue);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Error creating command queue", status);

  int err = pthread_create (&aq->thread_drain_queue, NULL, &drain_queue, aq);
  if (err != 0)
    GOMP_PLUGIN_fatal ("GCN asynchronous thread creation failed: %s",
                       strerror (err));
  GCN_DEBUG ("Async thread %d:%d: created\n", aq->agent->device_id,
             aq->id);

  pthread_mutex_unlock (&agent->async_queues_mutex);

  return aq;
}

/* Destroy an existing asynchronous thread and queue.  Waits for any
   currently-running task to complete, but cancels any queued tasks.  */

bool
GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = aq->agent;

  finalize_async_thread (aq);

  pthread_mutex_lock (&agent->async_queues_mutex);

  int err;
  if ((err = pthread_mutex_destroy (&aq->mutex)))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue mutex: %d", err);
      goto fail;
    }
  if (pthread_cond_destroy (&aq->queue_cond_in))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
      goto fail;
    }
  if (pthread_cond_destroy (&aq->queue_cond_out))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
      goto fail;
    }
  hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (aq->hsa_queue);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Error destroying command queue", status);
      goto fail;
    }

  if (aq->prev)
    aq->prev->next = aq->next;
  if (aq->next)
    aq->next->prev = aq->prev;
  if (agent->async_queues == aq)
    agent->async_queues = aq->next;

  GCN_DEBUG ("Async thread %d:%d: destroyed\n", agent->device_id, aq->id);

  free (aq);
  pthread_mutex_unlock (&agent->async_queues_mutex);
  return true;

fail:
  pthread_mutex_unlock (&agent->async_queues_mutex);
  return false;
}

/* Return true if the specified async queue is currently empty.  */

int
GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
{
  return queue_empty (aq);
}

/* Block until the specified queue has executed all its tasks and the
   queue is empty.  */

bool
GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
{
  wait_queue (aq);
  return true;
}

/* Add a serialization point across two async queues.  Any new tasks added to
   AQ2, after this call, will not run until all tasks on AQ1, at the time
   of this call, have completed.  */

bool
GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
                                      struct goacc_asyncqueue *aq2)
{
  /* For serialize, stream aq2 waits for aq1 to complete work that has been
     scheduled to run on it up to this point.  */
  if (aq1 != aq2)
    {
      struct placeholder *placeholderp = queue_push_placeholder (aq1);
      queue_push_asyncwait (aq2, placeholderp);
    }
  return true;
}
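
/* (A sketch of the mechanism, as far as it can be inferred from the queue
   helpers used above: the placeholder entry pushed onto AQ1 completes when
   AQ1's drain thread reaches it, and the asyncwait entry on AQ2 blocks AQ2's
   drain thread until that happens, so AQ2's later tasks cannot overtake
   AQ1's earlier ones.)  */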

/* Add an opaque callback to the given async queue.  */

void
GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
                                           void (*fn) (void *), void *data)
{
  queue_push_callback (aq, fn, data);
}

/* Queue up an asynchronous data copy from host to DEVICE.  */

bool
GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
                                     size_t n, struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = get_agent_info (device);
  assert (agent == aq->agent);
  queue_push_copy (aq, dst, src, n);
  return true;
}

/* Queue up an asynchronous data copy from DEVICE to host.  */

bool
GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src,
                                     size_t n, struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = get_agent_info (device);
  assert (agent == aq->agent);
  queue_push_copy (aq, dst, src, n);
  return true;
}

/* Return the value of the OpenACC device property PROP for the given
   device.  */

union goacc_property_value
GOMP_OFFLOAD_openacc_get_property (int device, enum goacc_property prop)
{
  struct agent_info *agent = get_agent_info (device);

  union goacc_property_value propval = { .val = 0 };

  switch (prop)
    {
    case GOACC_PROPERTY_FREE_MEMORY:
      /* Not supported.  */
      break;
    case GOACC_PROPERTY_MEMORY:
      {
        size_t size;
        hsa_region_t region = agent->data_region;
        hsa_status_t status =
          hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
        if (status == HSA_STATUS_SUCCESS)
          propval.val = size;
        break;
      }
    case GOACC_PROPERTY_NAME:
      propval.ptr = agent->name;
      break;
    case GOACC_PROPERTY_VENDOR:
      propval.ptr = agent->vendor_name;
      break;
    case GOACC_PROPERTY_DRIVER:
      propval.ptr = hsa_context.driver_version_s;
      break;
    }

  return propval;
}
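
/* An illustrative user-level sketch (not part of the plugin): the hook above
   is what ultimately services OpenACC's acc_get_property interface.  The
   device number 0 is an assumption for the example; acc_get_property,
   acc_get_property_string, acc_device_radeon and the property constants are
   part of the OpenACC API.  */
#if 0
#include <openacc.h>
#include <stdio.h>

int
main (void)
{
  /* Query the data-region size and the agent name reported by the plugin.  */
  size_t mem = acc_get_property (0, acc_device_radeon, acc_property_memory);
  const char *name
    = acc_get_property_string (0, acc_device_radeon, acc_property_name);
  printf ("%s: %zu bytes of device data memory\n", name, mem);
  return 0;
}
#endif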

/* Set up plugin-specific thread-local-data (host-side).  */

void *
GOMP_OFFLOAD_openacc_create_thread_data (int ord __attribute__((unused)))
{
  struct gcn_thread *thread_data
    = GOMP_PLUGIN_malloc (sizeof (struct gcn_thread));

  thread_data->async = GOMP_ASYNC_SYNC;

  return (void *) thread_data;
}

/* Clean up plugin-specific thread-local-data.  */

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}

/* }}} */