]> git.ipfire.org Git - thirdparty/gcc.git/blame - libgomp/plugin/plugin-gcn.c
aarch64: disable tests on ilp32 [PR94514]
[thirdparty/gcc.git] / libgomp / plugin / plugin-gcn.c
CommitLineData
237957cc
AS
1/* Plugin for AMD GCN execution.
2
8d9254fc 3 Copyright (C) 2013-2020 Free Software Foundation, Inc.
237957cc
AS
4
5 Contributed by Mentor Embedded
6
7 This file is part of the GNU Offloading and Multi Processing Library
8 (libgomp).
9
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
13 any later version.
14
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
18 more details.
19
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
23
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
28
29/* {{{ Includes and defines */
30
31#include "config.h"
32#include <stdio.h>
33#include <stdlib.h>
34#include <string.h>
35#include <pthread.h>
36#include <inttypes.h>
37#include <stdbool.h>
38#include <limits.h>
39#include <hsa.h>
40#include <dlfcn.h>
41#include <signal.h>
42#include "libgomp-plugin.h"
43#include "gomp-constants.h"
44#include <elf.h>
45#include "oacc-plugin.h"
46#include "oacc-int.h"
47#include <assert.h>
48
49/* Additional definitions not in HSA 1.1.
50 FIXME: this needs to be updated in hsa.h for upstream, but the only source
51 right now is the ROCr source which may cause license issues. */
52#define HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT 0xA002
53
54/* These probably won't be in elf.h for a while. */
55#define R_AMDGPU_NONE 0
56#define R_AMDGPU_ABS32_LO 1 /* (S + A) & 0xFFFFFFFF */
57#define R_AMDGPU_ABS32_HI 2 /* (S + A) >> 32 */
58#define R_AMDGPU_ABS64 3 /* S + A */
59#define R_AMDGPU_REL32 4 /* S + A - P */
60#define R_AMDGPU_REL64 5 /* S + A - P */
61#define R_AMDGPU_ABS32 6 /* S + A */
62#define R_AMDGPU_GOTPCREL 7 /* G + GOT + A - P */
63#define R_AMDGPU_GOTPCREL32_LO 8 /* (G + GOT + A - P) & 0xFFFFFFFF */
64#define R_AMDGPU_GOTPCREL32_HI 9 /* (G + GOT + A - P) >> 32 */
65#define R_AMDGPU_REL32_LO 10 /* (S + A - P) & 0xFFFFFFFF */
66#define R_AMDGPU_REL32_HI 11 /* (S + A - P) >> 32 */
67#define reserved 12
68#define R_AMDGPU_RELATIVE64 13 /* B + A */
69
70/* GCN specific definitions for asynchronous queues. */
71
72#define ASYNC_QUEUE_SIZE 64
73#define DRAIN_QUEUE_SYNCHRONOUS_P false
74#define DEBUG_QUEUES 0
75#define DEBUG_THREAD_SLEEP 0
76#define DEBUG_THREAD_SIGNAL 0
77
78/* Defaults. */
79#define DEFAULT_GCN_HEAP_SIZE (100*1024*1024) /* 100MB. */
80
81/* Secure getenv() which returns NULL if running as SUID/SGID. */
82#ifndef HAVE_SECURE_GETENV
83#ifdef HAVE___SECURE_GETENV
84#define secure_getenv __secure_getenv
85#elif defined (HAVE_UNISTD_H) && defined(HAVE_GETUID) && defined(HAVE_GETEUID) \
86 && defined(HAVE_GETGID) && defined(HAVE_GETEGID)
87
88#include <unistd.h>
89
/* Fallback implementation of secure_getenv() for targets that lack it but
   do provide the means to compare real and effective user/group IDs.
   Refuses to read the environment when the process appears to be running
   with elevated (setuid/setgid) privileges.  */

static char *
secure_getenv (const char *name)
{
  bool uid_ok = getuid () == geteuid ();
  bool gid_ok = getgid () == getegid ();

  return (uid_ok && gid_ok) ? getenv (name) : NULL;
}
101
102#else
103#define secure_getenv getenv
104#endif
105#endif
106
107/* }}} */
108/* {{{ Types */
109
93d90219 110/* GCN-specific implementation of the GOMP_PLUGIN_acc_thread data. */
237957cc
AS
111
112struct gcn_thread
113{
114 /* The thread number from the async clause, or GOMP_ASYNC_SYNC. */
115 int async;
116};
117
118/* As an HSA runtime is dlopened, following structure defines function
119 pointers utilized by the HSA plug-in. */
120
121struct hsa_runtime_fn_info
122{
123 /* HSA runtime. */
124 hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
125 const char **status_string);
126 hsa_status_t (*hsa_system_get_info_fn) (hsa_system_info_t attribute,
127 void *value);
128 hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
129 hsa_agent_info_t attribute,
130 void *value);
131 hsa_status_t (*hsa_isa_get_info_fn)(hsa_isa_t isa,
132 hsa_isa_info_t attribute,
133 uint32_t index,
134 void *value);
135 hsa_status_t (*hsa_init_fn) (void);
136 hsa_status_t (*hsa_iterate_agents_fn)
137 (hsa_status_t (*callback)(hsa_agent_t agent, void *data), void *data);
138 hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
139 hsa_region_info_t attribute,
140 void *value);
141 hsa_status_t (*hsa_queue_create_fn)
142 (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
143 void (*callback)(hsa_status_t status, hsa_queue_t *source, void *data),
144 void *data, uint32_t private_segment_size,
145 uint32_t group_segment_size, hsa_queue_t **queue);
146 hsa_status_t (*hsa_agent_iterate_regions_fn)
147 (hsa_agent_t agent,
148 hsa_status_t (*callback)(hsa_region_t region, void *data), void *data);
149 hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
150 hsa_status_t (*hsa_executable_create_fn)
151 (hsa_profile_t profile, hsa_executable_state_t executable_state,
152 const char *options, hsa_executable_t *executable);
153 hsa_status_t (*hsa_executable_global_variable_define_fn)
154 (hsa_executable_t executable, const char *variable_name, void *address);
155 hsa_status_t (*hsa_executable_load_code_object_fn)
156 (hsa_executable_t executable, hsa_agent_t agent,
157 hsa_code_object_t code_object, const char *options);
158 hsa_status_t (*hsa_executable_freeze_fn)(hsa_executable_t executable,
159 const char *options);
160 hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
161 uint32_t num_consumers,
162 const hsa_agent_t *consumers,
163 hsa_signal_t *signal);
164 hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
165 void **ptr);
166 hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent,
167 hsa_access_permission_t access);
168 hsa_status_t (*hsa_memory_copy_fn)(void *dst, const void *src, size_t size);
169 hsa_status_t (*hsa_memory_free_fn) (void *ptr);
170 hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
171 hsa_status_t (*hsa_executable_get_symbol_fn)
172 (hsa_executable_t executable, const char *module_name,
173 const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
174 hsa_executable_symbol_t *symbol);
175 hsa_status_t (*hsa_executable_symbol_get_info_fn)
176 (hsa_executable_symbol_t executable_symbol,
177 hsa_executable_symbol_info_t attribute, void *value);
178 hsa_status_t (*hsa_executable_iterate_symbols_fn)
179 (hsa_executable_t executable,
180 hsa_status_t (*callback)(hsa_executable_t executable,
181 hsa_executable_symbol_t symbol, void *data),
182 void *data);
183 uint64_t (*hsa_queue_add_write_index_release_fn) (const hsa_queue_t *queue,
184 uint64_t value);
185 uint64_t (*hsa_queue_load_read_index_acquire_fn) (const hsa_queue_t *queue);
186 void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
187 hsa_signal_value_t value);
188 void (*hsa_signal_store_release_fn) (hsa_signal_t signal,
189 hsa_signal_value_t value);
190 hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
191 (hsa_signal_t signal, hsa_signal_condition_t condition,
192 hsa_signal_value_t compare_value, uint64_t timeout_hint,
193 hsa_wait_state_t wait_state_hint);
194 hsa_signal_value_t (*hsa_signal_load_acquire_fn) (hsa_signal_t signal);
195 hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);
196
197 hsa_status_t (*hsa_code_object_deserialize_fn)
198 (void *serialized_code_object, size_t serialized_code_object_size,
199 const char *options, hsa_code_object_t *code_object);
200};
201
202/* Structure describing the run-time and grid properties of an HSA kernel
203 lauch. This needs to match the format passed to GOMP_OFFLOAD_run. */
204
205struct GOMP_kernel_launch_attributes
206{
207 /* Number of dimensions the workload has. Maximum number is 3. */
208 uint32_t ndim;
209 /* Size of the grid in the three respective dimensions. */
210 uint32_t gdims[3];
211 /* Size of work-groups in the respective dimensions. */
212 uint32_t wdims[3];
213};
214
215/* Collection of information needed for a dispatch of a kernel from a
216 kernel. */
217
218struct kernel_dispatch
219{
220 struct agent_info *agent;
221 /* Pointer to a command queue associated with a kernel dispatch agent. */
222 void *queue;
223 /* Pointer to a memory space used for kernel arguments passing. */
224 void *kernarg_address;
225 /* Kernel object. */
226 uint64_t object;
227 /* Synchronization signal used for dispatch synchronization. */
228 uint64_t signal;
229 /* Private segment size. */
230 uint32_t private_segment_size;
231 /* Group segment size. */
232 uint32_t group_segment_size;
233};
234
235/* Structure of the kernargs segment, supporting console output.
236
237 This needs to match the definitions in Newlib, and the expectations
238 in libgomp target code. */
239
240struct kernargs {
241 /* Leave space for the real kernel arguments.
242 OpenACC and OpenMP only use one pointer. */
243 int64_t dummy1;
244 int64_t dummy2;
245
246 /* A pointer to struct output, below, for console output data. */
247 int64_t out_ptr;
248
249 /* A pointer to struct heap, below. */
250 int64_t heap_ptr;
251
252 /* A pointer to an ephemeral memory arena.
253 Only needed for OpenMP. */
254 int64_t arena_ptr;
255
256 /* Output data. */
257 struct output {
258 int return_value;
259 unsigned int next_output;
260 struct printf_data {
261 int written;
262 char msg[128];
263 int type;
264 union {
265 int64_t ivalue;
266 double dvalue;
267 char text[128];
268 };
269 } queue[1024];
270 unsigned int consumed;
271 } output_data;
272};
273
274/* A queue entry for a future asynchronous launch. */
275
276struct kernel_launch
277{
278 struct kernel_info *kernel;
279 void *vars;
280 struct GOMP_kernel_launch_attributes kla;
281};
282
283/* A queue entry for a future callback. */
284
285struct callback
286{
287 void (*fn)(void *);
288 void *data;
289};
290
291/* A data struct for the copy_data callback. */
292
293struct copy_data
294{
295 void *dst;
296 const void *src;
297 size_t len;
298 bool free_src;
299 struct goacc_asyncqueue *aq;
300};
301
302/* A queue entry for a placeholder. These correspond to a wait event. */
303
304struct placeholder
305{
306 int executed;
307 pthread_cond_t cond;
308 pthread_mutex_t mutex;
309};
310
311/* A queue entry for a wait directive. */
312
313struct asyncwait_info
314{
315 struct placeholder *placeholderp;
316};
317
318/* Encode the type of an entry in an async queue. */
319
320enum entry_type
321{
322 KERNEL_LAUNCH,
323 CALLBACK,
324 ASYNC_WAIT,
325 ASYNC_PLACEHOLDER
326};
327
328/* An entry in an async queue. */
329
330struct queue_entry
331{
332 enum entry_type type;
333 union {
334 struct kernel_launch launch;
335 struct callback callback;
336 struct asyncwait_info asyncwait;
337 struct placeholder placeholder;
338 } u;
339};
340
341/* An async queue header.
342
343 OpenMP may create one of these.
344 OpenACC may create many. */
345
346struct goacc_asyncqueue
347{
348 struct agent_info *agent;
349 hsa_queue_t *hsa_queue;
350
351 pthread_t thread_drain_queue;
352 pthread_mutex_t mutex;
353 pthread_cond_t queue_cond_in;
354 pthread_cond_t queue_cond_out;
355 struct queue_entry queue[ASYNC_QUEUE_SIZE];
356 int queue_first;
357 int queue_n;
358 int drain_queue_stop;
359
360 int id;
361 struct goacc_asyncqueue *prev;
362 struct goacc_asyncqueue *next;
363};
364
365/* Mkoffload uses this structure to describe a kernel.
366
367 OpenMP kernel dimensions are passed at runtime.
368 OpenACC kernel dimensions are passed at compile time, here. */
369
370struct hsa_kernel_description
371{
372 const char *name;
373 int oacc_dims[3]; /* Only present for GCN kernels. */
5a28e272
KCY
374 int sgpr_count;
375 int vpgr_count;
237957cc
AS
376};
377
378/* Mkoffload uses this structure to describe an offload variable. */
379
380struct global_var_info
381{
382 const char *name;
383 void *address;
384};
385
386/* Mkoffload uses this structure to describe all the kernels in a
387 loadable module. These are passed the libgomp via static constructors. */
388
389struct gcn_image_desc
390{
391 struct gcn_image {
392 size_t size;
393 void *image;
394 } *gcn_image;
395 const unsigned kernel_count;
396 struct hsa_kernel_description *kernel_infos;
397 const unsigned global_variable_count;
398 struct global_var_info *global_variables;
399};
400
7d593fd6
FH
401/* This enum mirrors the corresponding LLVM enum's values for all ISAs that we
402 support.
403 See https://llvm.org/docs/AMDGPUUsage.html#amdgpu-ef-amdgpu-mach-table */
404
405typedef enum {
7d593fd6
FH
406 EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a,
407 EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c,
408 EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f,
409} EF_AMDGPU_MACH;
410
411const static int EF_AMDGPU_MACH_MASK = 0x000000ff;
412typedef EF_AMDGPU_MACH gcn_isa;
413
237957cc
AS
414/* Description of an HSA GPU agent (device) and the program associated with
415 it. */
416
417struct agent_info
418{
419 /* The HSA ID of the agent. Assigned when hsa_context is initialized. */
420 hsa_agent_t id;
421 /* The user-visible device number. */
422 int device_id;
423 /* Whether the agent has been initialized. The fields below are usable only
424 if it has been. */
425 bool initialized;
7d593fd6
FH
426
427 /* The instruction set architecture of the device. */
428 gcn_isa device_isa;
2e5ea579
FH
429 /* Name of the agent. */
430 char name[64];
431 /* Name of the vendor of the agent. */
432 char vendor_name[64];
237957cc
AS
433 /* Command queues of the agent. */
434 hsa_queue_t *sync_queue;
435 struct goacc_asyncqueue *async_queues, *omp_async_queue;
436 pthread_mutex_t async_queues_mutex;
437
438 /* The HSA memory region from which to allocate kernel arguments. */
439 hsa_region_t kernarg_region;
440
441 /* The HSA memory region from which to allocate device data. */
442 hsa_region_t data_region;
443
444 /* Allocated team arenas. */
445 struct team_arena_list *team_arena_list;
446 pthread_mutex_t team_arena_write_lock;
447
448 /* Read-write lock that protects kernels which are running or about to be run
449 from interference with loading and unloading of images. Needs to be
450 locked for reading while a kernel is being run, and for writing if the
451 list of modules is manipulated (and thus the HSA program invalidated). */
452 pthread_rwlock_t module_rwlock;
453
454 /* The module associated with this kernel. */
455 struct module_info *module;
456
457 /* Mutex enforcing that only one thread will finalize the HSA program. A
458 thread should have locked agent->module_rwlock for reading before
459 acquiring it. */
460 pthread_mutex_t prog_mutex;
461 /* Flag whether the HSA program that consists of all the modules has been
462 finalized. */
463 bool prog_finalized;
464 /* HSA executable - the finalized program that is used to locate kernels. */
465 hsa_executable_t executable;
466};
467
468/* Information required to identify, finalize and run any given kernel. */
469
470enum offload_kind {KIND_UNKNOWN, KIND_OPENMP, KIND_OPENACC};
471
472struct kernel_info
473{
474 /* Name of the kernel, required to locate it within the GCN object-code
475 module. */
476 const char *name;
477 /* The specific agent the kernel has been or will be finalized for and run
478 on. */
479 struct agent_info *agent;
480 /* The specific module where the kernel takes place. */
481 struct module_info *module;
5a28e272
KCY
482 /* Information provided by mkoffload associated with the kernel. */
483 struct hsa_kernel_description *description;
237957cc
AS
484 /* Mutex enforcing that at most once thread ever initializes a kernel for
485 use. A thread should have locked agent->module_rwlock for reading before
486 acquiring it. */
487 pthread_mutex_t init_mutex;
488 /* Flag indicating whether the kernel has been initialized and all fields
489 below it contain valid data. */
490 bool initialized;
491 /* Flag indicating that the kernel has a problem that blocks an execution. */
492 bool initialization_failed;
493 /* The object to be put into the dispatch queue. */
494 uint64_t object;
495 /* Required size of kernel arguments. */
496 uint32_t kernarg_segment_size;
497 /* Required size of group segment. */
498 uint32_t group_segment_size;
499 /* Required size of private segment. */
500 uint32_t private_segment_size;
501 /* Set up for OpenMP or OpenACC? */
502 enum offload_kind kind;
503};
504
505/* Information about a particular GCN module, its image and kernels. */
506
507struct module_info
508{
509 /* The description with which the program has registered the image. */
510 struct gcn_image_desc *image_desc;
511 /* GCN heap allocation. */
512 struct heap *heap;
513 /* Physical boundaries of the loaded module. */
514 Elf64_Addr phys_address_start;
515 Elf64_Addr phys_address_end;
516
517 bool constructors_run_p;
518 struct kernel_info *init_array_func, *fini_array_func;
519
520 /* Number of kernels in this module. */
521 int kernel_count;
522 /* An array of kernel_info structures describing each kernel in this
523 module. */
524 struct kernel_info kernels[];
525};
526
527/* A linked list of memory arenas allocated on the device.
528 These are only used by OpenMP, as a means to optimize per-team malloc. */
529
530struct team_arena_list
531{
532 struct team_arena_list *next;
533
534 /* The number of teams determines the size of the allocation. */
535 int num_teams;
536 /* The device address of the arena itself. */
537 void *arena;
538 /* A flag to prevent two asynchronous kernels trying to use the same arena.
539 The mutex is locked until the kernel exits. */
540 pthread_mutex_t in_use;
541};
542
543/* Information about the whole HSA environment and all of its agents. */
544
545struct hsa_context_info
546{
547 /* Whether the structure has been initialized. */
548 bool initialized;
549 /* Number of usable GPU HSA agents in the system. */
550 int agent_count;
551 /* Array of agent_info structures describing the individual HSA agents. */
552 struct agent_info *agents;
2e5ea579
FH
553 /* Driver version string. */
554 char driver_version_s[30];
237957cc
AS
555};
556
557/* Format of the on-device heap.
558
559 This must match the definition in Newlib and gcn-run. */
560
561struct heap {
562 int64_t size;
563 char data[0];
564};
565
566/* }}} */
567/* {{{ Global variables */
568
569/* Information about the whole HSA environment and all of its agents. */
570
571static struct hsa_context_info hsa_context;
572
573/* HSA runtime functions that are initialized in init_hsa_context. */
574
575static struct hsa_runtime_fn_info hsa_fns;
576
577/* Heap space, allocated target-side, provided for use of newlib malloc.
578 Each module should have it's own heap allocated.
579 Beware that heap usage increases with OpenMP teams. See also arenas. */
580
581static size_t gcn_kernel_heap_size = DEFAULT_GCN_HEAP_SIZE;
582
583/* Flag to decide whether print to stderr information about what is going on.
584 Set in init_debug depending on environment variables. */
585
586static bool debug;
587
588/* Flag to decide if the runtime should suppress a possible fallback to host
589 execution. */
590
591static bool suppress_host_fallback;
592
593/* Flag to locate HSA runtime shared library that is dlopened
594 by this plug-in. */
595
596static const char *hsa_runtime_lib;
597
598/* Flag to decide if the runtime should support also CPU devices (can be
599 a simulator). */
600
601static bool support_cpu_devices;
602
603/* Runtime dimension overrides. Zero indicates default. */
604
605static int override_x_dim = 0;
606static int override_z_dim = 0;
607
608/* }}} */
609/* {{{ Debug & Diagnostic */
610
/* Print a message to stderr if GCN_DEBUG value is set to true.

   The expansion is a single do { } while (false) statement with NO
   trailing semicolon, so that `DEBUG_PRINT (...);` composes safely with
   brace-less if/else at the call site.  (The previous definition ended in
   `while (false);`, which made every use expand to two statements.)  */

#define DEBUG_PRINT(...) \
  do \
    { \
      if (debug) \
	{ \
	  fprintf (stderr, __VA_ARGS__); \
	} \
    } \
  while (false)
622
623/* Flush stderr if GCN_DEBUG value is set to true. */
624
625#define DEBUG_FLUSH() \
626 do { \
627 if (debug) \
628 fflush (stderr); \
629 } while (false)
630
631/* Print a logging message with PREFIX to stderr if GCN_DEBUG value
632 is set to true. */
633
634#define DEBUG_LOG(prefix, ...) \
635 do \
636 { \
637 DEBUG_PRINT (prefix); \
638 DEBUG_PRINT (__VA_ARGS__); \
639 DEBUG_FLUSH (); \
640 } while (false)
641
642/* Print a debugging message to stderr. */
643
644#define GCN_DEBUG(...) DEBUG_LOG ("GCN debug: ", __VA_ARGS__)
645
646/* Print a warning message to stderr. */
647
648#define GCN_WARNING(...) DEBUG_LOG ("GCN warning: ", __VA_ARGS__)
649
650/* Print HSA warning STR with an HSA STATUS code. */
651
652static void
653hsa_warn (const char *str, hsa_status_t status)
654{
655 if (!debug)
656 return;
657
658 const char *hsa_error_msg = "[unknown]";
659 hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
660
661 fprintf (stderr, "GCN warning: %s\nRuntime message: %s\n", str,
662 hsa_error_msg);
663}
664
665/* Report a fatal error STR together with the HSA error corresponding to STATUS
666 and terminate execution of the current process. */
667
668static void
669hsa_fatal (const char *str, hsa_status_t status)
670{
671 const char *hsa_error_msg = "[unknown]";
672 hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
673 GOMP_PLUGIN_fatal ("GCN fatal error: %s\nRuntime message: %s\n", str,
674 hsa_error_msg);
675}
676
677/* Like hsa_fatal, except only report error message, and return FALSE
678 for propagating error processing to outside of plugin. */
679
680static bool
681hsa_error (const char *str, hsa_status_t status)
682{
683 const char *hsa_error_msg = "[unknown]";
684 hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
685 GOMP_PLUGIN_error ("GCN fatal error: %s\nRuntime message: %s\n", str,
686 hsa_error_msg);
687 return false;
688}
689
690/* Dump information about the available hardware. */
691
692static void
693dump_hsa_system_info (void)
694{
695 hsa_status_t status;
696
697 hsa_endianness_t endianness;
698 status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_ENDIANNESS,
699 &endianness);
700 if (status == HSA_STATUS_SUCCESS)
701 switch (endianness)
702 {
703 case HSA_ENDIANNESS_LITTLE:
704 GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: LITTLE\n");
705 break;
706 case HSA_ENDIANNESS_BIG:
707 GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: BIG\n");
708 break;
709 default:
710 GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: UNKNOWN\n");
711 }
712 else
713 GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: FAILED\n");
714
715 uint8_t extensions[128];
716 status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_EXTENSIONS,
717 &extensions);
718 if (status == HSA_STATUS_SUCCESS)
719 {
720 if (extensions[0] & (1 << HSA_EXTENSION_IMAGES))
721 GCN_DEBUG ("HSA_SYSTEM_INFO_EXTENSIONS: IMAGES\n");
722 }
723 else
724 GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n");
725}
726
727/* Dump information about the available hardware. */
728
729static void
730dump_machine_model (hsa_machine_model_t machine_model, const char *s)
731{
732 switch (machine_model)
733 {
734 case HSA_MACHINE_MODEL_SMALL:
735 GCN_DEBUG ("%s: SMALL\n", s);
736 break;
737 case HSA_MACHINE_MODEL_LARGE:
738 GCN_DEBUG ("%s: LARGE\n", s);
739 break;
740 default:
741 GCN_WARNING ("%s: UNKNOWN\n", s);
742 break;
743 }
744}
745
746/* Dump information about the available hardware. */
747
748static void
749dump_profile (hsa_profile_t profile, const char *s)
750{
751 switch (profile)
752 {
753 case HSA_PROFILE_FULL:
754 GCN_DEBUG ("%s: FULL\n", s);
755 break;
756 case HSA_PROFILE_BASE:
757 GCN_DEBUG ("%s: BASE\n", s);
758 break;
759 default:
760 GCN_WARNING ("%s: UNKNOWN\n", s);
761 break;
762 }
763}
764
765/* Dump information about a device memory region. */
766
767static hsa_status_t
768dump_hsa_region (hsa_region_t region, void *data __attribute__((unused)))
769{
770 hsa_status_t status;
771
772 hsa_region_segment_t segment;
773 status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
774 &segment);
775 if (status == HSA_STATUS_SUCCESS)
776 {
777 if (segment == HSA_REGION_SEGMENT_GLOBAL)
778 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GLOBAL\n");
779 else if (segment == HSA_REGION_SEGMENT_READONLY)
780 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: READONLY\n");
781 else if (segment == HSA_REGION_SEGMENT_PRIVATE)
782 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: PRIVATE\n");
783 else if (segment == HSA_REGION_SEGMENT_GROUP)
784 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GROUP\n");
785 else
786 GCN_WARNING ("HSA_REGION_INFO_SEGMENT: UNKNOWN\n");
787 }
788 else
789 GCN_WARNING ("HSA_REGION_INFO_SEGMENT: FAILED\n");
790
791 if (segment == HSA_REGION_SEGMENT_GLOBAL)
792 {
793 uint32_t flags;
794 status
795 = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
796 &flags);
797 if (status == HSA_STATUS_SUCCESS)
798 {
799 if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
800 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: KERNARG\n");
801 if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED)
802 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: FINE_GRAINED\n");
803 if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED)
804 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: COARSE_GRAINED\n");
805 }
806 else
807 GCN_WARNING ("HSA_REGION_INFO_GLOBAL_FLAGS: FAILED\n");
808 }
809
810 size_t size;
811 status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
812 if (status == HSA_STATUS_SUCCESS)
813 GCN_DEBUG ("HSA_REGION_INFO_SIZE: %zu\n", size);
814 else
815 GCN_WARNING ("HSA_REGION_INFO_SIZE: FAILED\n");
816
817 status
818 = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_ALLOC_MAX_SIZE,
819 &size);
820 if (status == HSA_STATUS_SUCCESS)
821 GCN_DEBUG ("HSA_REGION_INFO_ALLOC_MAX_SIZE: %zu\n", size);
822 else
823 GCN_WARNING ("HSA_REGION_INFO_ALLOC_MAX_SIZE: FAILED\n");
824
825 bool alloc_allowed;
826 status
827 = hsa_fns.hsa_region_get_info_fn (region,
828 HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED,
829 &alloc_allowed);
830 if (status == HSA_STATUS_SUCCESS)
831 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: %u\n", alloc_allowed);
832 else
833 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: FAILED\n");
834
835 if (status != HSA_STATUS_SUCCESS || !alloc_allowed)
836 return HSA_STATUS_SUCCESS;
837
838 status
839 = hsa_fns.hsa_region_get_info_fn (region,
840 HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE,
841 &size);
842 if (status == HSA_STATUS_SUCCESS)
843 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: %zu\n", size);
844 else
845 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: FAILED\n");
846
847 size_t align;
848 status
849 = hsa_fns.hsa_region_get_info_fn (region,
850 HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT,
851 &align);
852 if (status == HSA_STATUS_SUCCESS)
853 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: %zu\n", align);
854 else
855 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: FAILED\n");
856
857 return HSA_STATUS_SUCCESS;
858}
859
860/* Dump information about all the device memory regions. */
861
862static void
863dump_hsa_regions (hsa_agent_t agent)
864{
865 hsa_status_t status;
866 status = hsa_fns.hsa_agent_iterate_regions_fn (agent,
867 dump_hsa_region,
868 NULL);
869 if (status != HSA_STATUS_SUCCESS)
870 hsa_error ("Dumping hsa regions failed", status);
871}
872
873/* Dump information about the available devices. */
874
875static hsa_status_t
876dump_hsa_agent_info (hsa_agent_t agent, void *data __attribute__((unused)))
877{
878 hsa_status_t status;
879
880 char buf[64];
881 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME,
882 &buf);
883 if (status == HSA_STATUS_SUCCESS)
884 GCN_DEBUG ("HSA_AGENT_INFO_NAME: %s\n", buf);
885 else
886 GCN_WARNING ("HSA_AGENT_INFO_NAME: FAILED\n");
887
888 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_VENDOR_NAME,
889 &buf);
890 if (status == HSA_STATUS_SUCCESS)
891 GCN_DEBUG ("HSA_AGENT_INFO_VENDOR_NAME: %s\n", buf);
892 else
893 GCN_WARNING ("HSA_AGENT_INFO_VENDOR_NAME: FAILED\n");
894
895 hsa_machine_model_t machine_model;
896 status
897 = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_MACHINE_MODEL,
898 &machine_model);
899 if (status == HSA_STATUS_SUCCESS)
900 dump_machine_model (machine_model, "HSA_AGENT_INFO_MACHINE_MODEL");
901 else
902 GCN_WARNING ("HSA_AGENT_INFO_MACHINE_MODEL: FAILED\n");
903
904 hsa_profile_t profile;
905 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_PROFILE,
906 &profile);
907 if (status == HSA_STATUS_SUCCESS)
908 dump_profile (profile, "HSA_AGENT_INFO_PROFILE");
909 else
910 GCN_WARNING ("HSA_AGENT_INFO_PROFILE: FAILED\n");
911
912 hsa_device_type_t device_type;
913 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
914 &device_type);
915 if (status == HSA_STATUS_SUCCESS)
916 {
917 switch (device_type)
918 {
919 case HSA_DEVICE_TYPE_CPU:
920 GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: CPU\n");
921 break;
922 case HSA_DEVICE_TYPE_GPU:
923 GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: GPU\n");
924 break;
925 case HSA_DEVICE_TYPE_DSP:
926 GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: DSP\n");
927 break;
928 default:
929 GCN_WARNING ("HSA_AGENT_INFO_DEVICE: UNKNOWN\n");
930 break;
931 }
932 }
933 else
934 GCN_WARNING ("HSA_AGENT_INFO_DEVICE: FAILED\n");
935
936 uint32_t cu_count;
937 status = hsa_fns.hsa_agent_get_info_fn
938 (agent, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
939 if (status == HSA_STATUS_SUCCESS)
940 GCN_DEBUG ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: %u\n", cu_count);
941 else
942 GCN_WARNING ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: FAILED\n");
943
944 uint32_t size;
945 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_WAVEFRONT_SIZE,
946 &size);
947 if (status == HSA_STATUS_SUCCESS)
948 GCN_DEBUG ("HSA_AGENT_INFO_WAVEFRONT_SIZE: %u\n", size);
949 else
950 GCN_WARNING ("HSA_AGENT_INFO_WAVEFRONT_SIZE: FAILED\n");
951
952 uint32_t max_dim;
953 status = hsa_fns.hsa_agent_get_info_fn (agent,
954 HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
955 &max_dim);
956 if (status == HSA_STATUS_SUCCESS)
957 GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: %u\n", max_dim);
958 else
959 GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: FAILED\n");
960
961 uint32_t max_size;
962 status = hsa_fns.hsa_agent_get_info_fn (agent,
963 HSA_AGENT_INFO_WORKGROUP_MAX_SIZE,
964 &max_size);
965 if (status == HSA_STATUS_SUCCESS)
966 GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: %u\n", max_size);
967 else
968 GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: FAILED\n");
969
970 uint32_t grid_max_dim;
971 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_DIM,
972 &grid_max_dim);
973 if (status == HSA_STATUS_SUCCESS)
974 GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_DIM: %u\n", grid_max_dim);
975 else
976 GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_DIM: FAILED\n");
977
978 uint32_t grid_max_size;
979 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_SIZE,
980 &grid_max_size);
981 if (status == HSA_STATUS_SUCCESS)
982 GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_SIZE: %u\n", grid_max_size);
983 else
984 GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_SIZE: FAILED\n");
985
986 dump_hsa_regions (agent);
987
988 return HSA_STATUS_SUCCESS;
989}
990
991/* Forward reference. */
992
993static char *get_executable_symbol_name (hsa_executable_symbol_t symbol);
994
995/* Helper function for dump_executable_symbols. */
996
997static hsa_status_t
998dump_executable_symbol (hsa_executable_t executable,
999 hsa_executable_symbol_t symbol,
1000 void *data __attribute__((unused)))
1001{
1002 char *name = get_executable_symbol_name (symbol);
1003
1004 if (name)
1005 {
1006 GCN_DEBUG ("executable symbol: %s\n", name);
1007 free (name);
1008 }
1009
1010 return HSA_STATUS_SUCCESS;
1011}
1012
1013/* Dump all global symbol in an executable. */
1014
1015static void
1016dump_executable_symbols (hsa_executable_t executable)
1017{
1018 hsa_status_t status;
1019 status
1020 = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
1021 dump_executable_symbol,
1022 NULL);
1023 if (status != HSA_STATUS_SUCCESS)
1024 hsa_fatal ("Could not dump HSA executable symbols", status);
1025}
1026
1027/* Dump kernel DISPATCH data structure and indent it by INDENT spaces. */
1028
1029static void
1030print_kernel_dispatch (struct kernel_dispatch *dispatch, unsigned indent)
1031{
1032 struct kernargs *kernargs = (struct kernargs *)dispatch->kernarg_address;
1033
1034 fprintf (stderr, "%*sthis: %p\n", indent, "", dispatch);
1035 fprintf (stderr, "%*squeue: %p\n", indent, "", dispatch->queue);
1036 fprintf (stderr, "%*skernarg_address: %p\n", indent, "", kernargs);
1037 fprintf (stderr, "%*sheap address: %p\n", indent, "",
1038 (void*)kernargs->heap_ptr);
1039 fprintf (stderr, "%*sarena address: %p\n", indent, "",
1040 (void*)kernargs->arena_ptr);
1041 fprintf (stderr, "%*sobject: %lu\n", indent, "", dispatch->object);
1042 fprintf (stderr, "%*sprivate_segment_size: %u\n", indent, "",
1043 dispatch->private_segment_size);
1044 fprintf (stderr, "%*sgroup_segment_size: %u\n", indent, "",
1045 dispatch->group_segment_size);
1046 fprintf (stderr, "\n");
1047}
1048
1049/* }}} */
1050/* {{{ Utility functions */
1051
1052/* Cast the thread local storage to gcn_thread. */
1053
static inline struct gcn_thread *
gcn_thread (void)
{
  /* GOMP_PLUGIN_acc_thread returns the opaque per-thread pointer that was
     registered for this plugin; presumably it is always a struct gcn_thread
     (or NULL before initialization) — the cast relies on that.  */
  return (struct gcn_thread *) GOMP_PLUGIN_acc_thread ();
}
1059
1060/* Initialize debug and suppress_host_fallback according to the environment. */
1061
1062static void
1063init_environment_variables (void)
1064{
1065 if (secure_getenv ("GCN_DEBUG"))
1066 debug = true;
1067 else
1068 debug = false;
1069
1070 if (secure_getenv ("GCN_SUPPRESS_HOST_FALLBACK"))
1071 suppress_host_fallback = true;
1072 else
1073 suppress_host_fallback = false;
1074
1075 hsa_runtime_lib = secure_getenv ("HSA_RUNTIME_LIB");
1076 if (hsa_runtime_lib == NULL)
1077 hsa_runtime_lib = HSA_RUNTIME_LIB "libhsa-runtime64.so";
1078
1079 support_cpu_devices = secure_getenv ("GCN_SUPPORT_CPU_DEVICES");
1080
1081 const char *x = secure_getenv ("GCN_NUM_TEAMS");
1082 if (!x)
1083 x = secure_getenv ("GCN_NUM_GANGS");
1084 if (x)
1085 override_x_dim = atoi (x);
1086
1087 const char *z = secure_getenv ("GCN_NUM_THREADS");
1088 if (!z)
1089 z = secure_getenv ("GCN_NUM_WORKERS");
1090 if (z)
1091 override_z_dim = atoi (z);
1092
1093 const char *heap = secure_getenv ("GCN_HEAP_SIZE");
1094 if (heap)
1095 {
1096 size_t tmp = atol (heap);
1097 if (tmp)
1098 gcn_kernel_heap_size = tmp;
1099 }
1100}
1101
1102/* Return malloc'd string with name of SYMBOL. */
1103
1104static char *
1105get_executable_symbol_name (hsa_executable_symbol_t symbol)
1106{
1107 hsa_status_t status;
1108 char *res;
1109 uint32_t len;
1110 const hsa_executable_symbol_info_t info_name_length
1111 = HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH;
1112
1113 status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name_length,
1114 &len);
1115 if (status != HSA_STATUS_SUCCESS)
1116 {
1117 hsa_error ("Could not get length of symbol name", status);
1118 return NULL;
1119 }
1120
1121 res = GOMP_PLUGIN_malloc (len + 1);
1122
1123 const hsa_executable_symbol_info_t info_name
1124 = HSA_EXECUTABLE_SYMBOL_INFO_NAME;
1125
1126 status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name, res);
1127
1128 if (status != HSA_STATUS_SUCCESS)
1129 {
1130 hsa_error ("Could not get symbol name", status);
1131 free (res);
1132 return NULL;
1133 }
1134
1135 res[len] = '\0';
1136
1137 return res;
1138}
1139
1140/* Helper function for find_executable_symbol. */
1141
1142static hsa_status_t
1143find_executable_symbol_1 (hsa_executable_t executable,
1144 hsa_executable_symbol_t symbol,
1145 void *data)
1146{
1147 hsa_executable_symbol_t *res = (hsa_executable_symbol_t *)data;
1148 *res = symbol;
1149 return HSA_STATUS_INFO_BREAK;
1150}
1151
1152/* Find a global symbol in EXECUTABLE, save to *SYMBOL and return true. If not
1153 found, return false. */
1154
1155static bool
1156find_executable_symbol (hsa_executable_t executable,
1157 hsa_executable_symbol_t *symbol)
1158{
1159 hsa_status_t status;
1160
1161 status
1162 = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
1163 find_executable_symbol_1,
1164 symbol);
1165 if (status != HSA_STATUS_INFO_BREAK)
1166 {
1167 hsa_error ("Could not find executable symbol", status);
1168 return false;
1169 }
1170
1171 return true;
1172}
1173
1174/* Get the number of GPU Compute Units. */
1175
1176static int
1177get_cu_count (struct agent_info *agent)
1178{
1179 uint32_t cu_count;
1180 hsa_status_t status = hsa_fns.hsa_agent_get_info_fn
1181 (agent->id, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
1182 if (status == HSA_STATUS_SUCCESS)
1183 return cu_count;
1184 else
1185 return 64; /* The usual number for older devices. */
1186}
1187
1188/* Calculate the maximum grid size for OMP threads / OACC workers.
1189 This depends on the kernel's resource usage levels. */
1190
static int
limit_worker_threads (int threads)
{
  /* FIXME Do something more intelligent here.
     GCN can always run 4 threads within a Compute Unit, but
     more than that depends on register usage.  */
  const int max_threads = 16;
  return threads > max_threads ? max_threads : threads;
}
1201
1202/* Parse the target attributes INPUT provided by the compiler and return true
1203 if we should run anything all. If INPUT is NULL, fill DEF with default
1204 values, then store INPUT or DEF into *RESULT.
1205
1206 This is used for OpenMP only. */
1207
static bool
parse_target_attributes (void **input,
			 struct GOMP_kernel_launch_attributes *def,
			 struct GOMP_kernel_launch_attributes **result,
			 struct agent_info *agent)
{
  if (!input)
    GOMP_PLUGIN_fatal ("No target arguments provided");

  bool grid_attrs_found = false;
  bool gcn_dims_found = false;
  int gcn_teams = 0;
  int gcn_threads = 0;
  /* Scan the NULL-terminated argument list.  Each entry is either a single
     word with the value embedded above GOMP_TARGET_ARG_VALUE_SHIFT, or a
     two-word (id, value) pair when GOMP_TARGET_ARG_SUBSEQUENT_PARAM is
     set.  */
  while (*input)
    {
      intptr_t id = (intptr_t) *input++, val;

      if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
	val = (intptr_t) *input++;
      else
	val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;

      /* Clamp to int range; the attribute fields are 32-bit.  */
      val = (val > INT_MAX) ? INT_MAX : val;

      /* Explicit HSA launch attributes: the attribute block follows this
	 id word, so stop scanning here (*input is used below).  */
      if ((id & GOMP_TARGET_ARG_DEVICE_MASK) == GOMP_DEVICE_GCN
	  && ((id & GOMP_TARGET_ARG_ID_MASK)
	      == GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES))
	{
	  grid_attrs_found = true;
	  break;
	}
      /* Generic OpenMP dimensions (num_teams / thread_limit).  */
      else if ((id & GOMP_TARGET_ARG_DEVICE_MASK)
	       == GOMP_TARGET_ARG_DEVICE_ALL)
	{
	  gcn_dims_found = true;
	  switch (id & GOMP_TARGET_ARG_ID_MASK)
	    {
	    case GOMP_TARGET_ARG_NUM_TEAMS:
	      gcn_teams = val;
	      break;
	    case GOMP_TARGET_ARG_THREAD_LIMIT:
	      /* Cap the requested thread count to what the hardware can
		 actually run (see limit_worker_threads).  */
	      gcn_threads = limit_worker_threads (val);
	      break;
	    default:
	      ;
	    }
	}
    }

  if (gcn_dims_found)
    {
      /* Vega 10 (gfx900) workaround: default to fewer threads per team
	 unless the user overrode the count explicitly.  */
      if (agent->device_isa == EF_AMDGPU_MACH_AMDGCN_GFX900
	  && gcn_threads == 0 && override_z_dim == 0)
	{
	  gcn_threads = 4;
	  GCN_WARNING ("VEGA BUG WORKAROUND: reducing default number of "
		       "threads to 4 per team.\n");
	  GCN_WARNING (" - If this is not a Vega 10 device, please use "
		       "GCN_NUM_THREADS=16\n");
	}

      def->ndim = 3;
      /* Fiji has 64 CUs, but Vega20 has 60.  */
      def->gdims[0] = (gcn_teams > 0) ? gcn_teams : get_cu_count (agent);
      /* Each thread is 64 work items wide.  */
      def->gdims[1] = 64;
      /* A work group can have 16 wavefronts.  */
      def->gdims[2] = (gcn_threads > 0) ? gcn_threads : 16;
      def->wdims[0] = 1; /* Single team per work-group.  */
      def->wdims[1] = 64;
      def->wdims[2] = 16;
      *result = def;
      return true;
    }
  else if (!grid_attrs_found)
    {
      /* No launch information at all: fall back to a minimal 1x1x1
	 launch.  */
      def->ndim = 1;
      def->gdims[0] = 1;
      def->gdims[1] = 1;
      def->gdims[2] = 1;
      def->wdims[0] = 1;
      def->wdims[1] = 1;
      def->wdims[2] = 1;
      *result = def;
      GCN_WARNING ("GOMP_OFFLOAD_run called with no launch attributes\n");
      return true;
    }

  /* Explicit HSA kernel attributes were found: *input now points at the
     attribute block itself.  */
  struct GOMP_kernel_launch_attributes *kla;
  kla = (struct GOMP_kernel_launch_attributes *) *input;
  *result = kla;
  if (kla->ndim == 0 || kla->ndim > 3)
    GOMP_PLUGIN_fatal ("Invalid number of dimensions (%u)", kla->ndim);

  GCN_DEBUG ("GOMP_OFFLOAD_run called with %u dimensions:\n", kla->ndim);
  unsigned i;
  for (i = 0; i < kla->ndim; i++)
    {
      GCN_DEBUG (" Dimension %u: grid size %u and group size %u\n", i,
		 kla->gdims[i], kla->wdims[i]);
      /* A zero grid dimension means there is nothing to run.  */
      if (kla->gdims[i] == 0)
	return false;
    }
  return true;
}
1313
1314/* Return the group size given the requested GROUP size, GRID size and number
1315 of grid dimensions NDIM. */
1316
static uint32_t
get_group_size (uint32_t ndim, uint32_t grid, uint32_t group)
{
  if (group == 0)
    {
      /* TODO: Provide a default via environment or device characteristics.  */
      switch (ndim)
	{
	case 1:
	  group = 64;
	  break;
	case 2:
	  group = 8;
	  break;
	default:
	  group = 4;
	  break;
	}
    }

  /* The group can never be larger than the grid.  */
  return group > grid ? grid : group;
}
1335
1336/* Atomically store pair of uint16_t values (HEADER and REST) to a PACKET. */
1337
static void
packet_store_release (uint32_t* packet, uint16_t header, uint16_t rest)
{
  /* Combine the two halves first, then publish them as a single release
     store so the device never observes a half-written packet word.  */
  uint32_t word = (uint32_t) header | ((uint32_t) rest << 16);
  __atomic_store_n (packet, word, __ATOMIC_RELEASE);
}
1343
1344/* A never-called callback for the HSA command queues. These signal events
1345 that we don't use, so we trigger an error.
1346
1347 This "queue" is not to be confused with the async queues, below. */
1348
static void
hsa_queue_callback (hsa_status_t status,
		   hsa_queue_t *queue __attribute__ ((unused)),
		   void *data __attribute__ ((unused)))
{
  /* This plugin never uses queue events, so any invocation indicates a
     broken queue; abort with the runtime's status.  */
  hsa_fatal ("Asynchronous queue error", status);
}
1356
1357/* }}} */
1358/* {{{ HSA initialization */
1359
1360/* Populate hsa_fns with the function addresses from libhsa-runtime64.so. */
1361
static bool
init_hsa_runtime_functions (void)
{
  /* Resolve one entry point into hsa_fns; fail the whole initialization if
     it is missing from the runtime library.  NOTE: the macro expands to
     multiple statements, so it must only be used as a full statement.  */
#define DLSYM_FN(function) \
  hsa_fns.function##_fn = dlsym (handle, #function); \
  if (hsa_fns.function##_fn == NULL) \
    return false;
  void *handle = dlopen (hsa_runtime_lib, RTLD_LAZY);
  if (handle == NULL)
    return false;

  DLSYM_FN (hsa_status_string)
  DLSYM_FN (hsa_system_get_info)
  DLSYM_FN (hsa_agent_get_info)
  DLSYM_FN (hsa_init)
  DLSYM_FN (hsa_iterate_agents)
  DLSYM_FN (hsa_region_get_info)
  DLSYM_FN (hsa_queue_create)
  DLSYM_FN (hsa_agent_iterate_regions)
  DLSYM_FN (hsa_executable_destroy)
  DLSYM_FN (hsa_executable_create)
  DLSYM_FN (hsa_executable_global_variable_define)
  DLSYM_FN (hsa_executable_load_code_object)
  DLSYM_FN (hsa_executable_freeze)
  DLSYM_FN (hsa_signal_create)
  DLSYM_FN (hsa_memory_allocate)
  DLSYM_FN (hsa_memory_assign_agent)
  DLSYM_FN (hsa_memory_copy)
  DLSYM_FN (hsa_memory_free)
  DLSYM_FN (hsa_signal_destroy)
  DLSYM_FN (hsa_executable_get_symbol)
  DLSYM_FN (hsa_executable_symbol_get_info)
  DLSYM_FN (hsa_executable_iterate_symbols)
  DLSYM_FN (hsa_queue_add_write_index_release)
  DLSYM_FN (hsa_queue_load_read_index_acquire)
  DLSYM_FN (hsa_signal_wait_acquire)
  DLSYM_FN (hsa_signal_store_relaxed)
  DLSYM_FN (hsa_signal_store_release)
  DLSYM_FN (hsa_signal_load_acquire)
  DLSYM_FN (hsa_queue_destroy)
  DLSYM_FN (hsa_code_object_deserialize)
  /* NOTE(review): HANDLE is intentionally never dlclose'd — the function
     pointers must remain valid for the life of the process.  */
  return true;
#undef DLSYM_FN
}
1406
1407/* Return true if the agent is a GPU and can accept of concurrent submissions
1408 from different threads. */
1409
1410static bool
1411suitable_hsa_agent_p (hsa_agent_t agent)
1412{
1413 hsa_device_type_t device_type;
1414 hsa_status_t status
1415 = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
1416 &device_type);
1417 if (status != HSA_STATUS_SUCCESS)
1418 return false;
1419
1420 switch (device_type)
1421 {
1422 case HSA_DEVICE_TYPE_GPU:
1423 break;
1424 case HSA_DEVICE_TYPE_CPU:
1425 if (!support_cpu_devices)
1426 return false;
1427 break;
1428 default:
1429 return false;
1430 }
1431
1432 uint32_t features = 0;
1433 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_FEATURE,
1434 &features);
1435 if (status != HSA_STATUS_SUCCESS
1436 || !(features & HSA_AGENT_FEATURE_KERNEL_DISPATCH))
1437 return false;
1438 hsa_queue_type_t queue_type;
1439 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_QUEUE_TYPE,
1440 &queue_type);
1441 if (status != HSA_STATUS_SUCCESS
1442 || (queue_type != HSA_QUEUE_TYPE_MULTI))
1443 return false;
1444
1445 return true;
1446}
1447
1448/* Callback of hsa_iterate_agents; if AGENT is a GPU device, increment
1449 agent_count in hsa_context. */
1450
1451static hsa_status_t
1452count_gpu_agents (hsa_agent_t agent, void *data __attribute__ ((unused)))
1453{
1454 if (suitable_hsa_agent_p (agent))
1455 hsa_context.agent_count++;
1456 return HSA_STATUS_SUCCESS;
1457}
1458
1459/* Callback of hsa_iterate_agents; if AGENT is a GPU device, assign the agent
1460 id to the describing structure in the hsa context. The index of the
1461 structure is pointed to by DATA, increment it afterwards. */
1462
1463static hsa_status_t
1464assign_agent_ids (hsa_agent_t agent, void *data)
1465{
1466 if (suitable_hsa_agent_p (agent))
1467 {
1468 int *agent_index = (int *) data;
1469 hsa_context.agents[*agent_index].id = agent;
1470 ++*agent_index;
1471 }
1472 return HSA_STATUS_SUCCESS;
1473}
1474
1475/* Initialize hsa_context if it has not already been done.
1476 Return TRUE on success. */
1477
1478static bool
1479init_hsa_context (void)
1480{
1481 hsa_status_t status;
1482 int agent_index = 0;
1483
1484 if (hsa_context.initialized)
1485 return true;
1486 init_environment_variables ();
1487 if (!init_hsa_runtime_functions ())
1488 {
1489 GCN_WARNING ("Run-time could not be dynamically opened\n");
1490 if (suppress_host_fallback)
1491 GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
1492 return false;
1493 }
1494 status = hsa_fns.hsa_init_fn ();
1495 if (status != HSA_STATUS_SUCCESS)
1496 return hsa_error ("Run-time could not be initialized", status);
1497 GCN_DEBUG ("HSA run-time initialized for GCN\n");
1498
1499 if (debug)
1500 dump_hsa_system_info ();
1501
1502 status = hsa_fns.hsa_iterate_agents_fn (count_gpu_agents, NULL);
1503 if (status != HSA_STATUS_SUCCESS)
1504 return hsa_error ("GCN GPU devices could not be enumerated", status);
1505 GCN_DEBUG ("There are %i GCN GPU devices.\n", hsa_context.agent_count);
1506
1507 hsa_context.agents
1508 = GOMP_PLUGIN_malloc_cleared (hsa_context.agent_count
1509 * sizeof (struct agent_info));
1510 status = hsa_fns.hsa_iterate_agents_fn (assign_agent_ids, &agent_index);
1511 if (agent_index != hsa_context.agent_count)
1512 {
1513 GOMP_PLUGIN_error ("Failed to assign IDs to all GCN agents");
1514 return false;
1515 }
1516
1517 if (debug)
1518 {
1519 status = hsa_fns.hsa_iterate_agents_fn (dump_hsa_agent_info, NULL);
1520 if (status != HSA_STATUS_SUCCESS)
1521 GOMP_PLUGIN_error ("Failed to list all HSA runtime agents");
1522 }
1523
2e5ea579
FH
1524 uint16_t minor, major;
1525 status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MINOR,
1526 &minor);
1527 if (status != HSA_STATUS_SUCCESS)
1528 GOMP_PLUGIN_error ("Failed to obtain HSA runtime minor version");
1529 status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MAJOR,
1530 &major);
1531 if (status != HSA_STATUS_SUCCESS)
1532 GOMP_PLUGIN_error ("Failed to obtain HSA runtime major version");
1533
1534 size_t len = sizeof hsa_context.driver_version_s;
1535 int printed = snprintf (hsa_context.driver_version_s, len,
1536 "HSA Runtime %hu.%hu", (unsigned short int)major,
1537 (unsigned short int)minor);
1538 if (printed >= len)
1539 GCN_WARNING ("HSA runtime version string was truncated."
1540 "Version %hu.%hu is too long.", (unsigned short int)major,
1541 (unsigned short int)minor);
1542
237957cc
AS
1543 hsa_context.initialized = true;
1544 return true;
1545}
1546
1547/* Verify that hsa_context has already been initialized and return the
1548 agent_info structure describing device number N. Return NULL on error. */
1549
1550static struct agent_info *
1551get_agent_info (int n)
1552{
1553 if (!hsa_context.initialized)
1554 {
1555 GOMP_PLUGIN_error ("Attempt to use uninitialized GCN context.");
1556 return NULL;
1557 }
1558 if (n >= hsa_context.agent_count)
1559 {
1560 GOMP_PLUGIN_error ("Request to operate on non-existent GCN device %i", n);
1561 return NULL;
1562 }
1563 if (!hsa_context.agents[n].initialized)
1564 {
1565 GOMP_PLUGIN_error ("Attempt to use an uninitialized GCN agent.");
1566 return NULL;
1567 }
1568 return &hsa_context.agents[n];
1569}
1570
1571/* Callback of hsa_agent_iterate_regions, via get_*_memory_region functions.
1572
1573 Selects (breaks at) a suitable region of type KIND. */
1574
1575static hsa_status_t
1576get_memory_region (hsa_region_t region, hsa_region_t *retval,
1577 hsa_region_global_flag_t kind)
1578{
1579 hsa_status_t status;
1580 hsa_region_segment_t segment;
1581
1582 status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
1583 &segment);
1584 if (status != HSA_STATUS_SUCCESS)
1585 return status;
1586 if (segment != HSA_REGION_SEGMENT_GLOBAL)
1587 return HSA_STATUS_SUCCESS;
1588
1589 uint32_t flags;
1590 status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
1591 &flags);
1592 if (status != HSA_STATUS_SUCCESS)
1593 return status;
1594 if (flags & kind)
1595 {
1596 *retval = region;
1597 return HSA_STATUS_INFO_BREAK;
1598 }
1599 return HSA_STATUS_SUCCESS;
1600}
1601
1602/* Callback of hsa_agent_iterate_regions.
1603
1604 Selects a kernargs memory region. */
1605
static hsa_status_t
get_kernarg_memory_region (hsa_region_t region, void *data)
{
  /* DATA is really an hsa_region_t out-parameter; select the first global
     region advertising the KERNARG flag.  */
  return get_memory_region (region, (hsa_region_t *)data,
			    HSA_REGION_GLOBAL_FLAG_KERNARG);
}
1612
1613/* Callback of hsa_agent_iterate_regions.
1614
1615 Selects a coarse-grained memory region suitable for the heap and
1616 offload data. */
1617
static hsa_status_t
get_data_memory_region (hsa_region_t region, void *data)
{
  /* DATA is really an hsa_region_t out-parameter; select the first global
     region advertising the COARSE_GRAINED flag.  */
  return get_memory_region (region, (hsa_region_t *)data,
			    HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED);
}
1624
7d593fd6
FH
/* Return the EF_AMDGPU_MACH_* ISA code encoded in the e_flags field of the
   ELF header of IMAGE.  */

static int
elf_gcn_isa_field (Elf64_Ehdr *image)
{
  return image->e_flags & EF_AMDGPU_MACH_MASK;
}
1630
7d593fd6
FH
/* Names the HSA runtime uses for the supported ISAs; each is exactly
   gcn_isa_name_len characters long (relied upon by isa_code's strncmp).  */
const static char *gcn_gfx803_s = "gfx803";
const static char *gcn_gfx900_s = "gfx900";
const static char *gcn_gfx906_s = "gfx906";
const static int gcn_isa_name_len = 6;
1635
1636/* Returns the name that the HSA runtime uses for the ISA or NULL if we do not
1637 support the ISA. */
1638
1639static const char*
1640isa_hsa_name (int isa) {
1641 switch(isa)
1642 {
7d593fd6
FH
1643 case EF_AMDGPU_MACH_AMDGCN_GFX803:
1644 return gcn_gfx803_s;
1645 case EF_AMDGPU_MACH_AMDGCN_GFX900:
1646 return gcn_gfx900_s;
1647 case EF_AMDGPU_MACH_AMDGCN_GFX906:
1648 return gcn_gfx906_s;
1649 }
1650 return NULL;
1651}
1652
1653/* Returns the user-facing name that GCC uses to identify the architecture (e.g.
1654 with -march) or NULL if we do not support the ISA.
1655 Keep in sync with /gcc/config/gcn/gcn.{c,opt}. */
1656
1657static const char*
1658isa_gcc_name (int isa) {
1659 switch(isa)
1660 {
7d593fd6
FH
1661 case EF_AMDGPU_MACH_AMDGCN_GFX803:
1662 return "fiji";
1663 default:
1664 return isa_hsa_name (isa);
1665 }
1666}
1667
1668/* Returns the code which is used in the GCN object code to identify the ISA with
1669 the given name (as used by the HSA runtime). */
1670
1671static gcn_isa
1672isa_code(const char *isa) {
7d593fd6
FH
1673 if (!strncmp (isa, gcn_gfx803_s, gcn_isa_name_len))
1674 return EF_AMDGPU_MACH_AMDGCN_GFX803;
1675
1676 if (!strncmp (isa, gcn_gfx900_s, gcn_isa_name_len))
1677 return EF_AMDGPU_MACH_AMDGCN_GFX900;
1678
1679 if (!strncmp (isa, gcn_gfx906_s, gcn_isa_name_len))
1680 return EF_AMDGPU_MACH_AMDGCN_GFX906;
1681
1682 return -1;
1683}
1684
237957cc
AS
1685/* }}} */
1686/* {{{ Run */
1687
1688/* Create or reuse a team arena.
1689
1690 Team arenas are used by OpenMP to avoid calling malloc multiple times
1691 while setting up each team. This is purely a performance optimization.
1692
1693 Allocating an arena also costs performance, albeit on the host side, so
1694 this function will reuse an existing arena if a large enough one is idle.
1695 The arena is released, but not deallocated, when the kernel exits. */
1696
1697static void *
1698get_team_arena (struct agent_info *agent, int num_teams)
1699{
1700 struct team_arena_list **next_ptr = &agent->team_arena_list;
1701 struct team_arena_list *item;
1702
1703 for (item = *next_ptr; item; next_ptr = &item->next, item = item->next)
1704 {
1705 if (item->num_teams < num_teams)
1706 continue;
1707
1708 if (pthread_mutex_trylock (&item->in_use))
1709 continue;
1710
1711 return item->arena;
1712 }
1713
1714 GCN_DEBUG ("Creating a new arena for %d teams\n", num_teams);
1715
1716 if (pthread_mutex_lock (&agent->team_arena_write_lock))
1717 {
1718 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
1719 return false;
1720 }
1721 item = malloc (sizeof (*item));
1722 item->num_teams = num_teams;
1723 item->next = NULL;
1724 *next_ptr = item;
1725
1726 if (pthread_mutex_init (&item->in_use, NULL))
1727 {
1728 GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
1729 return false;
1730 }
1731 if (pthread_mutex_lock (&item->in_use))
1732 {
1733 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
1734 return false;
1735 }
1736 if (pthread_mutex_unlock (&agent->team_arena_write_lock))
1737 {
1738 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1739 return false;
1740 }
1741
1742 const int TEAM_ARENA_SIZE = 64*1024; /* Must match libgomp.h. */
1743 hsa_status_t status;
1744 status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
1745 TEAM_ARENA_SIZE*num_teams,
1746 &item->arena);
1747 if (status != HSA_STATUS_SUCCESS)
1748 hsa_fatal ("Could not allocate memory for GCN kernel arena", status);
1749 status = hsa_fns.hsa_memory_assign_agent_fn (item->arena, agent->id,
1750 HSA_ACCESS_PERMISSION_RW);
1751 if (status != HSA_STATUS_SUCCESS)
1752 hsa_fatal ("Could not assign arena memory to device", status);
1753
1754 return item->arena;
1755}
1756
1757/* Mark a team arena available for reuse. */
1758
1759static void
1760release_team_arena (struct agent_info* agent, void *arena)
1761{
1762 struct team_arena_list *item;
1763
1764 for (item = agent->team_arena_list; item; item = item->next)
1765 {
1766 if (item->arena == arena)
1767 {
1768 if (pthread_mutex_unlock (&item->in_use))
1769 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1770 return;
1771 }
1772 }
1773 GOMP_PLUGIN_error ("Could not find a GCN arena to release.");
1774}
1775
1776/* Clean up all the allocated team arenas. */
1777
1778static bool
1779destroy_team_arenas (struct agent_info *agent)
1780{
1781 struct team_arena_list *item, *next;
1782
1783 for (item = agent->team_arena_list; item; item = next)
1784 {
1785 next = item->next;
1786 hsa_fns.hsa_memory_free_fn (item->arena);
1787 if (pthread_mutex_destroy (&item->in_use))
1788 {
1789 GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex");
1790 return false;
1791 }
1792 free (item);
1793 }
1794 agent->team_arena_list = NULL;
1795
1796 return true;
1797}
1798
1799/* Allocate memory on a specified device. */
1800
static void *
alloc_by_agent (struct agent_info *agent, size_t size)
{
  GCN_DEBUG ("Allocating %zu bytes on device %d\n", size, agent->device_id);

  /* Zero-size allocations are invalid, so in order to return a valid pointer
     we need to pass a valid size.  One source of zero-size allocations is
     kernargs for kernels that have no inputs or outputs (the kernel may
     only use console output, for example).  */
  if (size == 0)
    size = 4;

  void *ptr;
  hsa_status_t status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
							size, &ptr);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not allocate device memory", status);
      return NULL;
    }

  /* Make the allocation accessible to the device.  */
  status = hsa_fns.hsa_memory_assign_agent_fn (ptr, agent->id,
					       HSA_ACCESS_PERMISSION_RW);
  if (status != HSA_STATUS_SUCCESS)
    {
      /* NOTE(review): PTR is not freed on this failure path — looks like a
	 leak; confirm whether a failed assign leaves the block owned.  */
      hsa_error ("Could not assign data memory to device", status);
      return NULL;
    }

  /* If an OpenACC profiling tool is attached to this thread, report the
     allocation as an acc_ev_alloc event.  */
  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  bool profiling_dispatch_p
    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
  if (profiling_dispatch_p)
    {
      acc_prof_info *prof_info = thr->prof_info;
      acc_event_info data_event_info;
      acc_api_info *api_info = thr->api_info;

      prof_info->event_type = acc_ev_alloc;

      data_event_info.data_event.event_type = prof_info->event_type;
      data_event_info.data_event.valid_bytes
	= _ACC_DATA_EVENT_INFO_VALID_BYTES;
      data_event_info.data_event.parent_construct
	= acc_construct_parallel;
      /* Allocation happens as a side effect, not by user request.  */
      data_event_info.data_event.implicit = 1;
      data_event_info.data_event.tool_info = NULL;
      data_event_info.data_event.var_name = NULL;
      data_event_info.data_event.bytes = size;
      data_event_info.data_event.host_ptr = NULL;
      data_event_info.data_event.device_ptr = (void *) ptr;

      api_info->device_api = acc_device_api_other;

      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
					    api_info);
    }

  return ptr;
}
1861
1862/* Create kernel dispatch data structure for given KERNEL, along with
1863 the necessary device signals and memory allocations. */
1864
static struct kernel_dispatch *
create_kernel_dispatch (struct kernel_info *kernel, int num_teams)
{
  struct agent_info *agent = kernel->agent;
  struct kernel_dispatch *shadow
    = GOMP_PLUGIN_malloc_cleared (sizeof (struct kernel_dispatch));

  shadow->agent = kernel->agent;
  shadow->object = kernel->object;

  /* Completion signal: created with initial value 1; presumably the device
     decrements it when the kernel finishes (the wait is elsewhere).  */
  hsa_signal_t sync_signal;
  hsa_status_t status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &sync_signal);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Error creating the GCN sync signal", status);

  shadow->signal = sync_signal.handle;
  shadow->private_segment_size = kernel->private_segment_size;
  shadow->group_segment_size = kernel->group_segment_size;

  /* We expect kernels to request a single pointer, explicitly, and the
     rest of struct kernargs, implicitly.  If they request anything else
     then something is wrong.  */
  if (kernel->kernarg_segment_size > 8)
    {
      GOMP_PLUGIN_fatal ("Unexpectedly large kernargs segment requested");
      return NULL;
    }

  /* Allocate the full kernargs structure from the kernarg region, even
     though the kernel only declared an 8-byte segment.  */
  status = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region,
					   sizeof (struct kernargs),
					   &shadow->kernarg_address);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not allocate memory for GCN kernel arguments", status);
  struct kernargs *kernargs = shadow->kernarg_address;

  /* Zero-initialize the output_data (minimum needed).  */
  kernargs->out_ptr = (int64_t)&kernargs->output_data;
  kernargs->output_data.next_output = 0;
  for (unsigned i = 0;
       i < (sizeof (kernargs->output_data.queue)
	    / sizeof (kernargs->output_data.queue[0]));
       i++)
    kernargs->output_data.queue[i].written = 0;
  kernargs->output_data.consumed = 0;

  /* Pass in the heap location.  */
  kernargs->heap_ptr = (int64_t)kernel->module->heap;

  /* Create an arena (OpenMP team scratch space); OpenACC kernels get
     none.  */
  if (kernel->kind == KIND_OPENMP)
    kernargs->arena_ptr = (int64_t)get_team_arena (agent, num_teams);
  else
    kernargs->arena_ptr = 0;

  /* Ensure we can recognize unset return values.  */
  kernargs->output_data.return_value = 0xcafe0000;

  return shadow;
}
1924
1925/* Output any data written to console output from the kernel. It is expected
1926 that this function is polled during kernel execution.
1927
1928 We print all entries from the last item printed to the next entry without
1929 a "written" flag. If the "final" flag is set then it'll continue right to
1930 the end.
1931
1932 The print buffer is circular, but the from and to locations don't wrap when
1933 the buffer does, so the output limit is UINT_MAX. The target blocks on
1934 output when the buffer is full. */
1935
static void
console_output (struct kernel_info *kernel, struct kernargs *kernargs,
		bool final)
{
  /* Number of slots in the circular queue shared with the device.  */
  unsigned int limit = (sizeof (kernargs->output_data.queue)
			/ sizeof (kernargs->output_data.queue[0]));

  /* CONSUMED is shared with the device: acquire-load here pairs with the
     release-store below so the device sees slots retired in order.  */
  unsigned int from = __atomic_load_n (&kernargs->output_data.consumed,
				       __ATOMIC_ACQUIRE);
  unsigned int to = kernargs->output_data.next_output;

  if (from > to)
    {
      /* Overflow.  */
      if (final)
	printf ("GCN print buffer overflowed.\n");
      return;
    }

  unsigned int i;
  for (i = from; i < to; i++)
    {
      /* The indices count forever; only the slot index wraps.  */
      struct printf_data *data = &kernargs->output_data.queue[i%limit];

      /* Unless this is the final drain, stop at the first entry the device
	 has not finished writing yet.  */
      if (!data->written && !final)
	break;

      /* TYPE selects which union member the device stored.  */
      switch (data->type)
	{
	case 0: printf ("%.128s%ld\n", data->msg, data->ivalue); break;
	case 1: printf ("%.128s%f\n", data->msg, data->dvalue); break;
	case 2: printf ("%.128s%.128s\n", data->msg, data->text); break;
	case 3: printf ("%.128s%.128s", data->msg, data->text); break;
	default: printf ("GCN print buffer error!\n"); break;
	}
      data->written = 0;
      /* Retire the slot so the device may reuse it.  */
      __atomic_store_n (&kernargs->output_data.consumed, i+1,
			__ATOMIC_RELEASE);
    }
  fflush (stdout);
}
1977
1978/* Release data structure created for a kernel dispatch in SHADOW argument,
1979 and clean up the signal and memory allocations. */
1980
1981static void
1982release_kernel_dispatch (struct kernel_dispatch *shadow)
1983{
1984 GCN_DEBUG ("Released kernel dispatch: %p\n", shadow);
1985
1986 struct kernargs *kernargs = shadow->kernarg_address;
1987 void *arena = (void *)kernargs->arena_ptr;
1988 if (arena)
1989 release_team_arena (shadow->agent, arena);
1990
1991 hsa_fns.hsa_memory_free_fn (shadow->kernarg_address);
1992
1993 hsa_signal_t s;
1994 s.handle = shadow->signal;
1995 hsa_fns.hsa_signal_destroy_fn (s);
1996
1997 free (shadow);
1998}
1999
2000/* Extract the properties from a kernel binary. */
2001
static void
init_kernel_properties (struct kernel_info *kernel)
{
  hsa_status_t status;
  struct agent_info *agent = kernel->agent;
  hsa_executable_symbol_t kernel_symbol;
  /* Look the kernel up by name in the frozen executable.  */
  status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
						 kernel->name, agent->id,
						 0, &kernel_symbol);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_warn ("Could not find symbol for kernel in the code object", status);
      fprintf (stderr, "not found name: '%s'\n", kernel->name);
      dump_executable_symbols (agent->executable);
      goto failure;
    }
  GCN_DEBUG ("Located kernel %s\n", kernel->name);
  /* Dispatch handle used in the AQL packet.  */
  status = hsa_fns.hsa_executable_symbol_get_info_fn
    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel->object);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not extract a kernel object from its symbol", status);
  /* Segment sizes the finalizer computed for this kernel.  */
  status = hsa_fns.hsa_executable_symbol_get_info_fn
    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
     &kernel->kernarg_segment_size);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not get info about kernel argument size", status);
  status = hsa_fns.hsa_executable_symbol_get_info_fn
    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
     &kernel->group_segment_size);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not get info about kernel group segment size", status);
  status = hsa_fns.hsa_executable_symbol_get_info_fn
    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
     &kernel->private_segment_size);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not get info about kernel private segment size",
	       status);

  /* The kernel type is not known until something tries to launch it.  */
  kernel->kind = KIND_UNKNOWN;

  GCN_DEBUG ("Kernel structure for %s fully initialized with "
	     "following segment sizes: \n", kernel->name);
  GCN_DEBUG ("  group_segment_size: %u\n",
	     (unsigned) kernel->group_segment_size);
  GCN_DEBUG ("  private_segment_size: %u\n",
	     (unsigned) kernel->private_segment_size);
  GCN_DEBUG ("  kernarg_segment_size: %u\n",
	     (unsigned) kernel->kernarg_segment_size);
  return;

failure:
  /* Leave a sticky marker so init_kernel will not mark it usable.  */
  kernel->initialization_failed = true;
}
2056
2057/* Do all the work that is necessary before running KERNEL for the first time.
2058 The function assumes the program has been created, finalized and frozen by
2059 create_and_finalize_hsa_program. */
2060
2061static void
2062init_kernel (struct kernel_info *kernel)
2063{
2064 if (pthread_mutex_lock (&kernel->init_mutex))
2065 GOMP_PLUGIN_fatal ("Could not lock a GCN kernel initialization mutex");
2066 if (kernel->initialized)
2067 {
2068 if (pthread_mutex_unlock (&kernel->init_mutex))
2069 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2070 "mutex");
2071
2072 return;
2073 }
2074
2075 init_kernel_properties (kernel);
2076
2077 if (!kernel->initialization_failed)
2078 {
2079 GCN_DEBUG ("\n");
2080
2081 kernel->initialized = true;
2082 }
2083 if (pthread_mutex_unlock (&kernel->init_mutex))
2084 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2085 "mutex");
2086}
2087
/* Run KERNEL on its agent, pass VARS to it as arguments and take
   launch attributes from KLA.

   MODULE_LOCKED indicates that the caller already holds the lock and
   run_kernel need not lock it again.
   If AQ is NULL then agent->sync_queue will be used.

   This function blocks until the kernel completes (the completion signal
   drops to zero), relays the kernel's console output as it runs, and
   propagates the kernel's exit status (normal return, exit(), or abort). */

static void
run_kernel (struct kernel_info *kernel, void *vars,
	    struct GOMP_kernel_launch_attributes *kla,
	    struct goacc_asyncqueue *aq, bool module_locked)
{
  GCN_DEBUG ("SGPRs: %d, VGPRs: %d\n", kernel->description->sgpr_count,
	     kernel->description->vpgr_count);

  /* Reduce the number of threads/workers if there are insufficient
     VGPRs available to run the kernels together.  */
  if (kla->ndim == 3 && kernel->description->vpgr_count > 0)
    {
      /* VGPRs are allocated in granules of 4 registers; a SIMD unit has
	 256 VGPRs shared between up to (256 / granule) * 4 lanes'-worth
	 of wavefronts.  NOTE(review): the granule size of 4 appears to be
	 an ISA-specific constant — confirm it holds for all supported
	 GCN variants.  */
      int granulated_vgprs = (kernel->description->vpgr_count + 3) & ~3;
      int max_threads = (256 / granulated_vgprs) * 4;
      if (kla->gdims[2] > max_threads)
	{
	  GCN_WARNING ("Too many VGPRs required to support %d threads/workers"
		       " per team/gang - reducing to %d threads/workers.\n",
		       kla->gdims[2], max_threads);
	  kla->gdims[2] = max_threads;
	}
    }

  GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel->agent->device_id,
	     (aq ? aq->id : 0));
  GCN_DEBUG ("GCN launch attribs: gdims:[");
  int i;
  for (i = 0; i < kla->ndim; ++i)
    {
      if (i)
	DEBUG_PRINT (", ");
      DEBUG_PRINT ("%u", kla->gdims[i]);
    }
  DEBUG_PRINT ("], normalized gdims:[");
  for (i = 0; i < kla->ndim; ++i)
    {
      if (i)
	DEBUG_PRINT (", ");
      DEBUG_PRINT ("%u", kla->gdims[i] / kla->wdims[i]);
    }
  DEBUG_PRINT ("], wdims:[");
  for (i = 0; i < kla->ndim; ++i)
    {
      if (i)
	DEBUG_PRINT (", ");
      DEBUG_PRINT ("%u", kla->wdims[i]);
    }
  DEBUG_PRINT ("]\n");
  DEBUG_FLUSH ();

  struct agent_info *agent = kernel->agent;
  if (!module_locked && pthread_rwlock_rdlock (&agent->module_rwlock))
    GOMP_PLUGIN_fatal ("Unable to read-lock a GCN agent rwlock");

  if (!agent->initialized)
    GOMP_PLUGIN_fatal ("Agent must be initialized");

  if (!kernel->initialized)
    GOMP_PLUGIN_fatal ("Called kernel must be initialized");

  hsa_queue_t *command_q = (aq ? aq->hsa_queue : kernel->agent->sync_queue);

  /* Reserve a packet slot in the AQL queue.  */
  uint64_t index
    = hsa_fns.hsa_queue_add_write_index_release_fn (command_q, 1);
  GCN_DEBUG ("Got AQL index %llu\n", (long long int) index);

  /* Wait until the queue is not full before writing the packet.   */
  while (index - hsa_fns.hsa_queue_load_read_index_acquire_fn (command_q)
	 >= command_q->size)
    ;

  /* Do not allow the dimensions to be overridden when running
     constructors or destructors.  */
  int override_x = kernel->kind == KIND_UNKNOWN ? 0 : override_x_dim;
  int override_z = kernel->kind == KIND_UNKNOWN ? 0 : override_z_dim;

  hsa_kernel_dispatch_packet_t *packet;
  packet = ((hsa_kernel_dispatch_packet_t *) command_q->base_address)
	   + index % command_q->size;

  /* Clear everything except the first 4 bytes: the header and setup
     fields are written last, via packet_store_release, to publish the
     packet to the device atomically.  */
  memset (((uint8_t *) packet) + 4, 0, sizeof (*packet) - 4);
  packet->grid_size_x = override_x ? : kla->gdims[0];
  packet->workgroup_size_x = get_group_size (kla->ndim,
					     packet->grid_size_x,
					     kla->wdims[0]);

  if (kla->ndim >= 2)
    {
      packet->grid_size_y = kla->gdims[1];
      packet->workgroup_size_y = get_group_size (kla->ndim, kla->gdims[1],
						 kla->wdims[1]);
    }
  else
    {
      packet->grid_size_y = 1;
      packet->workgroup_size_y = 1;
    }

  if (kla->ndim == 3)
    {
      packet->grid_size_z = limit_worker_threads (override_z
						  ? : kla->gdims[2]);
      packet->workgroup_size_z = get_group_size (kla->ndim,
						 packet->grid_size_z,
						 kla->wdims[2]);
    }
  else
    {
      packet->grid_size_z = 1;
      packet->workgroup_size_z = 1;
    }

  GCN_DEBUG ("GCN launch actuals: grid:[%u, %u, %u],"
	     " normalized grid:[%u, %u, %u], workgroup:[%u, %u, %u]\n",
	     packet->grid_size_x, packet->grid_size_y, packet->grid_size_z,
	     packet->grid_size_x / packet->workgroup_size_x,
	     packet->grid_size_y / packet->workgroup_size_y,
	     packet->grid_size_z / packet->workgroup_size_z,
	     packet->workgroup_size_x, packet->workgroup_size_y,
	     packet->workgroup_size_z);

  /* Allocate the kernarg memory, completion signal, etc. for this launch.  */
  struct kernel_dispatch *shadow
    = create_kernel_dispatch (kernel, packet->grid_size_x);
  shadow->queue = command_q;

  if (debug)
    {
      fprintf (stderr, "\nKernel has following dependencies:\n");
      print_kernel_dispatch (shadow, 2);
    }

  packet->private_segment_size = kernel->private_segment_size;
  packet->group_segment_size = kernel->group_segment_size;
  packet->kernel_object = kernel->object;
  packet->kernarg_address = shadow->kernarg_address;
  hsa_signal_t s;
  s.handle = shadow->signal;
  packet->completion_signal = s;
  /* The device decrements the signal to 0 when the kernel completes.  */
  hsa_fns.hsa_signal_store_relaxed_fn (s, 1);
  /* The first kernarg is the pointer to the kernel's argument block.  */
  memcpy (shadow->kernarg_address, &vars, sizeof (vars));

  GCN_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");

  uint16_t header;
  header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
  header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
  header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;

  GCN_DEBUG ("Going to dispatch kernel %s on device %d\n", kernel->name,
	     agent->device_id);

  /* Atomically publish the header+setup word; this must happen only after
     every other packet field has been written.  */
  packet_store_release ((uint32_t *) packet, header,
			(uint16_t) kla->ndim
			<< HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);

  /* Ring the doorbell so the packet processor picks up the new packet.  */
  hsa_fns.hsa_signal_store_release_fn (command_q->doorbell_signal,
				       index);

  GCN_DEBUG ("Kernel dispatched, waiting for completion\n");

  /* Root signal waits with 1ms timeout.  */
  while (hsa_fns.hsa_signal_wait_acquire_fn (s, HSA_SIGNAL_CONDITION_LT, 1,
					     1000 * 1000,
					     HSA_WAIT_STATE_BLOCKED) != 0)
    {
      /* Each timeout, drain any console output the kernel has produced.  */
      console_output (kernel, shadow->kernarg_address, false);
    }
  /* Final drain: also emit any partially-filled output buffer.  */
  console_output (kernel, shadow->kernarg_address, true);

  struct kernargs *kernargs = shadow->kernarg_address;
  unsigned int return_value = (unsigned int)kernargs->output_data.return_value;

  release_kernel_dispatch (shadow);

  if (!module_locked && pthread_rwlock_unlock (&agent->module_rwlock))
    GOMP_PLUGIN_fatal ("Unable to unlock a GCN agent rwlock");

  /* The kernel tags its exit status in the top 16 bits: 0xcafe means it
     returned normally, 0xffff means exit() was called (with the signal
     number and exit code in the lower bits).  Anything else indicates the
     return-value slot was corrupted.  */
  unsigned int upper = (return_value & ~0xffff) >> 16;
  if (upper == 0xcafe)
    ; // exit not called, normal termination.
  else if (upper == 0xffff)
    ; // exit called.
  else
    {
      GOMP_PLUGIN_error ("Possible kernel exit value corruption, 2 most"
			 " significant bytes aren't 0xffff or 0xcafe: 0x%x\n",
			 return_value);
      abort ();
    }

  if (upper == 0xffff)
    {
      /* Layout: 0xffff'SSEE' where SS is the signal and EE the exit code.  */
      unsigned int signal = (return_value >> 8) & 0xff;

      if (signal == SIGABRT)
	{
	  GCN_WARNING ("GCN Kernel aborted\n");
	  abort ();
	}
      else if (signal != 0)
	{
	  GCN_WARNING ("GCN Kernel received unknown signal\n");
	  abort ();
	}

      /* Mirror the device-side exit() on the host.  */
      GCN_DEBUG ("GCN Kernel exited with value: %d\n", return_value & 0xff);
      exit (return_value & 0xff);
    }
}
2304
2305/* }}} */
2306/* {{{ Load/Unload */
2307
2308/* Initialize KERNEL from D and other parameters. Return true on success. */
2309
2310static bool
2311init_basic_kernel_info (struct kernel_info *kernel,
2312 struct hsa_kernel_description *d,
2313 struct agent_info *agent,
2314 struct module_info *module)
2315{
2316 kernel->agent = agent;
2317 kernel->module = module;
2318 kernel->name = d->name;
5a28e272 2319 kernel->description = d;
237957cc
AS
2320 if (pthread_mutex_init (&kernel->init_mutex, NULL))
2321 {
2322 GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");
2323 return false;
2324 }
2325 return true;
2326}
2327
2328/* Find the load_offset for MODULE, save to *LOAD_OFFSET, and return true. If
2329 not found, return false. */
2330
2331static bool
2332find_load_offset (Elf64_Addr *load_offset, struct agent_info *agent,
2333 struct module_info *module, Elf64_Ehdr *image,
2334 Elf64_Shdr *sections)
2335{
2336 bool res = false;
2337
2338 hsa_status_t status;
2339
2340 hsa_executable_symbol_t symbol;
2341 if (!find_executable_symbol (agent->executable, &symbol))
2342 return false;
2343
2344 status = hsa_fns.hsa_executable_symbol_get_info_fn
2345 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, load_offset);
2346 if (status != HSA_STATUS_SUCCESS)
2347 {
2348 hsa_error ("Could not extract symbol address", status);
2349 return false;
2350 }
2351
2352 char *symbol_name = get_executable_symbol_name (symbol);
2353 if (symbol_name == NULL)
2354 return false;
2355
2356 /* Find the kernel function in ELF, and calculate actual load offset. */
2357 for (int i = 0; i < image->e_shnum; i++)
2358 if (sections[i].sh_type == SHT_SYMTAB)
2359 {
2360 Elf64_Shdr *strtab = &sections[sections[i].sh_link];
2361 char *strings = (char *)image + strtab->sh_offset;
2362
2363 for (size_t offset = 0;
2364 offset < sections[i].sh_size;
2365 offset += sections[i].sh_entsize)
2366 {
2367 Elf64_Sym *sym = (Elf64_Sym*)((char*)image
2368 + sections[i].sh_offset
2369 + offset);
2370 if (strcmp (symbol_name, strings + sym->st_name) == 0)
2371 {
2372 *load_offset -= sym->st_value;
2373 res = true;
2374 break;
2375 }
2376 }
2377 }
2378
2379 free (symbol_name);
2380 return res;
2381}
2382
7d593fd6
FH
2383/* Check that the GCN ISA of the given image matches the ISA of the agent. */
2384
2385static bool
2386isa_matches_agent (struct agent_info *agent, Elf64_Ehdr *image)
2387{
2388 int isa_field = elf_gcn_isa_field (image);
2389 const char* isa_s = isa_hsa_name (isa_field);
2390 if (!isa_s)
2391 {
2392 hsa_error ("Unsupported ISA in GCN code object.", HSA_STATUS_ERROR);
2393 return false;
2394 }
2395
2396 if (isa_field != agent->device_isa)
2397 {
2398 char msg[120];
2399 const char *agent_isa_s = isa_hsa_name (agent->device_isa);
2400 const char *agent_isa_gcc_s = isa_gcc_name (agent->device_isa);
2401 assert (agent_isa_s);
2402 assert (agent_isa_gcc_s);
2403
2404 snprintf (msg, sizeof msg,
2405 "GCN code object ISA '%s' does not match GPU ISA '%s'.\n"
2406 "Try to recompile with '-foffload=-march=%s'.\n",
2407 isa_s, agent_isa_s, agent_isa_gcc_s);
2408
2409 hsa_error (msg, HSA_STATUS_ERROR);
2410 return false;
2411 }
2412
2413 return true;
2414}
2415
237957cc
AS
/* Create and finalize the program consisting of all loaded modules.

   Serialized on agent->prog_mutex; the first caller does the work and
   subsequent callers see agent->prog_finalized and return immediately.
   Returns true on success (or if already finalized), false on failure.

   The HSA runtime loader cannot process our relocations, so they are
   hidden from it (section type OR'd with 0x80) before deserialization
   and applied by hand afterwards.  */

static bool
create_and_finalize_hsa_program (struct agent_info *agent)
{
  hsa_status_t status;
  int reloc_count = 0;
  bool res = true;
  if (pthread_mutex_lock (&agent->prog_mutex))
    {
      GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
      return false;
    }
  if (agent->prog_finalized)
    goto final;

  status
    = hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
					HSA_EXECUTABLE_STATE_UNFROZEN,
					"", &agent->executable);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not create GCN executable", status);
      goto fail;
    }

  /* Load any GCN modules.  */
  struct module_info *module = agent->module;
  if (module)
    {
      Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;

      if (!isa_matches_agent (agent, image))
	goto fail;

      /* Hide relocations from the HSA runtime loader.
	 Keep a copy of the unmodified section headers to use later.  */
      Elf64_Shdr *image_sections = (Elf64_Shdr *)((char *)image
						  + image->e_shoff);
      for (int i = image->e_shnum - 1; i >= 0; i--)
	{
	  if (image_sections[i].sh_type == SHT_RELA
	      || image_sections[i].sh_type == SHT_REL)
	    /* Change section type to something harmless.  */
	    image_sections[i].sh_type |= 0x80;
	}

      hsa_code_object_t co = { 0 };
      status = hsa_fns.hsa_code_object_deserialize_fn
	(module->image_desc->gcn_image->image,
	 module->image_desc->gcn_image->size,
	 NULL, &co);
      if (status != HSA_STATUS_SUCCESS)
	{
	  hsa_error ("Could not deserialize GCN code object", status);
	  goto fail;
	}

      status = hsa_fns.hsa_executable_load_code_object_fn
	(agent->executable, agent->id, co, "");
      if (status != HSA_STATUS_SUCCESS)
	{
	  hsa_error ("Could not load GCN code object", status);
	  goto fail;
	}

      /* Allocate the device-side heap for this module, once.  */
      if (!module->heap)
	{
	  status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
						   gcn_kernel_heap_size,
						   (void**)&module->heap);
	  if (status != HSA_STATUS_SUCCESS)
	    {
	      hsa_error ("Could not allocate memory for GCN heap", status);
	      goto fail;
	    }

	  status = hsa_fns.hsa_memory_assign_agent_fn
	    (module->heap, agent->id, HSA_ACCESS_PERMISSION_RW);
	  if (status != HSA_STATUS_SUCCESS)
	    {
	      hsa_error ("Could not assign GCN heap memory to device", status);
	      goto fail;
	    }

	  /* Write the heap size into the heap's own size field so the
	     device-side allocator knows its bounds.
	     NOTE(review): return value unchecked — confirm this copy
	     cannot fail here.  */
	  hsa_fns.hsa_memory_copy_fn (&module->heap->size,
				      &gcn_kernel_heap_size,
				      sizeof (gcn_kernel_heap_size));
	}

    }

  if (debug)
    dump_executable_symbols (agent->executable);

  status = hsa_fns.hsa_executable_freeze_fn (agent->executable, "");
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not freeze the GCN executable", status);
      goto fail;
    }

  if (agent->module)
    {
      struct module_info *module = agent->module;
      Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;
      Elf64_Shdr *sections = (Elf64_Shdr *)((char *)image + image->e_shoff);

      /* Difference between the runtime load address and the link-time
	 addresses recorded in the ELF image.  */
      Elf64_Addr load_offset;
      if (!find_load_offset (&load_offset, agent, module, image, sections))
	goto fail;

      /* Record the physical load address range.
	 We need this for data copies later.  */
      Elf64_Phdr *segments = (Elf64_Phdr *)((char*)image + image->e_phoff);
      Elf64_Addr low = ~0, high = 0;
      for (int i = 0; i < image->e_phnum; i++)
	if (segments[i].p_memsz > 0)
	  {
	    if (segments[i].p_paddr < low)
	      low = segments[i].p_paddr;
	    if (segments[i].p_paddr > high)
	      high = segments[i].p_paddr + segments[i].p_memsz - 1;
	  }
      module->phys_address_start = low + load_offset;
      module->phys_address_end = high + load_offset;

      // Find dynamic symbol table
      Elf64_Shdr *dynsym = NULL;
      for (int i = 0; i < image->e_shnum; i++)
	if (sections[i].sh_type == SHT_DYNSYM)
	  {
	    dynsym = &sections[i];
	    break;
	  }

      /* Fix up relocations.  Only the sections we hid from the loader
	 above (type | 0x80) are processed.  Standard ELF notation:
	 S = symbol value, P = place, A = addend, B = base (load offset).  */
      for (int i = 0; i < image->e_shnum; i++)
	{
	  if (sections[i].sh_type == (SHT_RELA | 0x80))
	    for (size_t offset = 0;
		 offset < sections[i].sh_size;
		 offset += sections[i].sh_entsize)
	      {
		Elf64_Rela *reloc = (Elf64_Rela*)((char*)image
						  + sections[i].sh_offset
						  + offset);
		Elf64_Sym *sym =
		  (dynsym
		   ? (Elf64_Sym*)((char*)image
				  + dynsym->sh_offset
				  + (dynsym->sh_entsize
				     * ELF64_R_SYM (reloc->r_info)))
		   : NULL);

		int64_t S = (sym ? sym->st_value : 0);
		int64_t P = reloc->r_offset + load_offset;
		int64_t A = reloc->r_addend;
		int64_t B = load_offset;
		int64_t V, size;
		switch (ELF64_R_TYPE (reloc->r_info))
		  {
		  case R_AMDGPU_ABS32_LO:
		    V = (S + A) & 0xFFFFFFFF;
		    size = 4;
		    break;
		  case R_AMDGPU_ABS32_HI:
		    V = (S + A) >> 32;
		    size = 4;
		    break;
		  case R_AMDGPU_ABS64:
		    V = S + A;
		    size = 8;
		    break;
		  case R_AMDGPU_REL32:
		    V = S + A - P;
		    size = 4;
		    break;
		  case R_AMDGPU_REL64:
		    /* FIXME
		       LLD seems to emit REL64 where the assembler has
		       ABS64.  This is clearly wrong because it's not what the
		       compiler is expecting.  Let's assume, for now, that
		       it's a bug.  In any case, GCN kernels are always self
		       contained and therefore relative relocations will have
		       been resolved already, so this should be a safe
		       workaround.  */
		    V = S + A/* - P*/;
		    size = 8;
		    break;
		  case R_AMDGPU_ABS32:
		    V = S + A;
		    size = 4;
		    break;
		  /* TODO R_AMDGPU_GOTPCREL */
		  /* TODO R_AMDGPU_GOTPCREL32_LO */
		  /* TODO R_AMDGPU_GOTPCREL32_HI */
		  case R_AMDGPU_REL32_LO:
		    V = (S + A - P) & 0xFFFFFFFF;
		    size = 4;
		    break;
		  case R_AMDGPU_REL32_HI:
		    V = (S + A - P) >> 32;
		    size = 4;
		    break;
		  case R_AMDGPU_RELATIVE64:
		    V = B + A;
		    size = 8;
		    break;
		  default:
		    fprintf (stderr, "Error: unsupported relocation type.\n");
		    exit (1);
		  }
		/* Patch the relocated value directly into device memory.  */
		status = hsa_fns.hsa_memory_copy_fn ((void*)P, &V, size);
		if (status != HSA_STATUS_SUCCESS)
		  {
		    hsa_error ("Failed to fix up relocation", status);
		    goto fail;
		  }
		reloc_count++;
	      }
	}
    }

  GCN_DEBUG ("Loaded GCN kernels to device %d (%d relocations)\n",
	     agent->device_id, reloc_count);

final:
  agent->prog_finalized = true;

  if (pthread_mutex_unlock (&agent->prog_mutex))
    {
      GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
      res = false;
    }

  return res;

fail:
  res = false;
  goto final;
}
2658
2659/* Free the HSA program in agent and everything associated with it and set
2660 agent->prog_finalized and the initialized flags of all kernels to false.
2661 Return TRUE on success. */
2662
2663static bool
2664destroy_hsa_program (struct agent_info *agent)
2665{
2666 if (!agent->prog_finalized)
2667 return true;
2668
2669 hsa_status_t status;
2670
2671 GCN_DEBUG ("Destroying the current GCN program.\n");
2672
2673 status = hsa_fns.hsa_executable_destroy_fn (agent->executable);
2674 if (status != HSA_STATUS_SUCCESS)
2675 return hsa_error ("Could not destroy GCN executable", status);
2676
2677 if (agent->module)
2678 {
2679 int i;
2680 for (i = 0; i < agent->module->kernel_count; i++)
2681 agent->module->kernels[i].initialized = false;
2682
2683 if (agent->module->heap)
2684 {
2685 hsa_fns.hsa_memory_free_fn (agent->module->heap);
2686 agent->module->heap = NULL;
2687 }
2688 }
2689 agent->prog_finalized = false;
2690 return true;
2691}
2692
2693/* Deinitialize all information associated with MODULE and kernels within
2694 it. Return TRUE on success. */
2695
2696static bool
2697destroy_module (struct module_info *module, bool locked)
2698{
2699 /* Run destructors before destroying module. */
2700 struct GOMP_kernel_launch_attributes kla =
2701 { 3,
2702 /* Grid size. */
2703 { 1, 64, 1 },
2704 /* Work-group size. */
2705 { 1, 64, 1 }
2706 };
2707
2708 if (module->fini_array_func)
2709 {
2710 init_kernel (module->fini_array_func);
2711 run_kernel (module->fini_array_func, NULL, &kla, NULL, locked);
2712 }
2713 module->constructors_run_p = false;
2714
2715 int i;
2716 for (i = 0; i < module->kernel_count; i++)
2717 if (pthread_mutex_destroy (&module->kernels[i].init_mutex))
2718 {
2719 GOMP_PLUGIN_error ("Failed to destroy a GCN kernel initialization "
2720 "mutex");
2721 return false;
2722 }
2723
2724 return true;
2725}
2726
2727/* }}} */
2728/* {{{ Async */
2729
/* Execute queue entry INDEX of async queue AQ.  Runs on the queue's
   drain thread (or, in debug mode, synchronously on the caller's thread).
   Dispatches on the entry type: kernel launch, host callback, wait for a
   placeholder, or placeholder signalling.  */

static void
execute_queue_entry (struct goacc_asyncqueue *aq, int index)
{
  struct queue_entry *entry = &aq->queue[index];

  switch (entry->type)
    {
    case KERNEL_LAUNCH:
      if (DEBUG_QUEUES)
	GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d)\n",
		   aq->agent->device_id, aq->id, index);
      /* Blocks until the kernel completes.  */
      run_kernel (entry->u.launch.kernel,
		  entry->u.launch.vars,
		  &entry->u.launch.kla, aq, false);
      if (DEBUG_QUEUES)
	GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d) done\n",
		   aq->agent->device_id, aq->id, index);
      break;

    case CALLBACK:
      if (DEBUG_QUEUES)
	GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d)\n",
		   aq->agent->device_id, aq->id, index);
      /* Invoke the user-supplied callback with its opaque data.  */
      entry->u.callback.fn (entry->u.callback.data);
      if (DEBUG_QUEUES)
	GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d) done\n",
		   aq->agent->device_id, aq->id, index);
      break;

    case ASYNC_WAIT:
      {
	/* FIXME: is it safe to access a placeholder that may already have
	   been executed?  */
	struct placeholder *placeholderp = entry->u.asyncwait.placeholderp;

	if (DEBUG_QUEUES)
	  GCN_DEBUG ("Async thread %d:%d: Executing async wait entry (%d)\n",
		     aq->agent->device_id, aq->id, index);

	/* Sleep until another queue's thread marks the placeholder
	   executed (see the ASYNC_PLACEHOLDER case below).  */
	pthread_mutex_lock (&placeholderp->mutex);

	while (!placeholderp->executed)
	  pthread_cond_wait (&placeholderp->cond, &placeholderp->mutex);

	pthread_mutex_unlock (&placeholderp->mutex);

	/* The placeholder's synchronization objects are single-use;
	   tear them down now that the wait is over.  */
	if (pthread_cond_destroy (&placeholderp->cond))
	  GOMP_PLUGIN_error ("Failed to destroy serialization cond");

	if (pthread_mutex_destroy (&placeholderp->mutex))
	  GOMP_PLUGIN_error ("Failed to destroy serialization mutex");

	if (DEBUG_QUEUES)
	  GCN_DEBUG ("Async thread %d:%d: Executing async wait "
		     "entry (%d) done\n", aq->agent->device_id, aq->id, index);
      }
      break;

    case ASYNC_PLACEHOLDER:
      /* Flag this point in the queue as reached and wake any waiter.  */
      pthread_mutex_lock (&entry->u.placeholder.mutex);
      entry->u.placeholder.executed = 1;
      pthread_cond_signal (&entry->u.placeholder.cond);
      pthread_mutex_unlock (&entry->u.placeholder.mutex);
      break;

    default:
      GOMP_PLUGIN_fatal ("Unknown queue element");
    }
}
2801
/* This function is run as a thread to service an async queue in the
   background.  It runs continuously until the stop flag is set.

   THREAD_ARG is the queue's struct goacc_asyncqueue.  Entries are executed
   with aq->mutex dropped (entry execution may block for a long time) and
   the queue indices are only updated, under the lock, after the entry has
   completed.  Sets aq->drain_queue_stop to 2 to acknowledge shutdown.  */

static void *
drain_queue (void *thread_arg)
{
  struct goacc_asyncqueue *aq = thread_arg;

  if (DRAIN_QUEUE_SYNCHRONOUS_P)
    {
      /* Debug mode: entries are run by drain_queue_synchronous instead;
	 report this thread as already stopped and bail out.  */
      aq->drain_queue_stop = 2;
      return NULL;
    }

  pthread_mutex_lock (&aq->mutex);

  while (true)
    {
      if (aq->drain_queue_stop)
	break;

      if (aq->queue_n > 0)
	{
	  /* Drop the lock while the (possibly long-running) entry
	     executes, so producers can keep appending.  */
	  pthread_mutex_unlock (&aq->mutex);
	  execute_queue_entry (aq, aq->queue_first);

	  /* Retire the entry under the lock and wake anyone blocked in
	     wait_for_queue_nonfull or finalize_async_thread.  */
	  pthread_mutex_lock (&aq->mutex);
	  aq->queue_first = ((aq->queue_first + 1)
			     % ASYNC_QUEUE_SIZE);
	  aq->queue_n--;

	  if (DEBUG_THREAD_SIGNAL)
	    GCN_DEBUG ("Async thread %d:%d: broadcasting queue out update\n",
		       aq->agent->device_id, aq->id);
	  pthread_cond_broadcast (&aq->queue_cond_out);
	  pthread_mutex_unlock (&aq->mutex);

	  if (DEBUG_QUEUES)
	    GCN_DEBUG ("Async thread %d:%d: continue\n", aq->agent->device_id,
		       aq->id);
	  pthread_mutex_lock (&aq->mutex);
	}
      else
	{
	  /* Queue empty: sleep until a producer signals cond_in.  */
	  if (DEBUG_THREAD_SLEEP)
	    GCN_DEBUG ("Async thread %d:%d: going to sleep\n",
		       aq->agent->device_id, aq->id);
	  pthread_cond_wait (&aq->queue_cond_in, &aq->mutex);
	  if (DEBUG_THREAD_SLEEP)
	    GCN_DEBUG ("Async thread %d:%d: woke up, rechecking\n",
		       aq->agent->device_id, aq->id);
	}
    }

  /* Acknowledge the stop request and wake the joining thread, which
     waits for drain_queue_stop == 2 in finalize_async_thread.  */
  aq->drain_queue_stop = 2;
  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("Async thread %d:%d: broadcasting last queue out update\n",
	       aq->agent->device_id, aq->id);
  pthread_cond_broadcast (&aq->queue_cond_out);
  pthread_mutex_unlock (&aq->mutex);

  GCN_DEBUG ("Async thread %d:%d: returning\n", aq->agent->device_id, aq->id);
  return NULL;
}
2866
2867/* This function is used only when DRAIN_QUEUE_SYNCHRONOUS_P is set, which
2868 is not usually the case. This is just a debug tool. */
2869
2870static void
2871drain_queue_synchronous (struct goacc_asyncqueue *aq)
2872{
2873 pthread_mutex_lock (&aq->mutex);
2874
2875 while (aq->queue_n > 0)
2876 {
2877 execute_queue_entry (aq, aq->queue_first);
2878
2879 aq->queue_first = ((aq->queue_first + 1)
2880 % ASYNC_QUEUE_SIZE);
2881 aq->queue_n--;
2882 }
2883
2884 pthread_mutex_unlock (&aq->mutex);
2885}
2886
d88b27da
JB
2887/* Block the current thread until an async queue is writable. The aq->mutex
2888 lock should be held on entry, and remains locked on exit. */
237957cc
AS
2889
2890static void
2891wait_for_queue_nonfull (struct goacc_asyncqueue *aq)
2892{
2893 if (aq->queue_n == ASYNC_QUEUE_SIZE)
2894 {
237957cc
AS
2895 /* Queue is full. Wait for it to not be full. */
2896 while (aq->queue_n == ASYNC_QUEUE_SIZE)
2897 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
237957cc
AS
2898 }
2899}
2900
2901/* Request an asynchronous kernel launch on the specified queue. This
2902 may block if the queue is full, but returns without waiting for the
2903 kernel to run. */
2904
2905static void
2906queue_push_launch (struct goacc_asyncqueue *aq, struct kernel_info *kernel,
2907 void *vars, struct GOMP_kernel_launch_attributes *kla)
2908{
2909 assert (aq->agent == kernel->agent);
2910
237957cc
AS
2911 pthread_mutex_lock (&aq->mutex);
2912
d88b27da
JB
2913 wait_for_queue_nonfull (aq);
2914
237957cc
AS
2915 int queue_last = ((aq->queue_first + aq->queue_n)
2916 % ASYNC_QUEUE_SIZE);
2917 if (DEBUG_QUEUES)
2918 GCN_DEBUG ("queue_push_launch %d:%d: at %i\n", aq->agent->device_id,
2919 aq->id, queue_last);
2920
2921 aq->queue[queue_last].type = KERNEL_LAUNCH;
2922 aq->queue[queue_last].u.launch.kernel = kernel;
2923 aq->queue[queue_last].u.launch.vars = vars;
2924 aq->queue[queue_last].u.launch.kla = *kla;
2925
2926 aq->queue_n++;
2927
2928 if (DEBUG_THREAD_SIGNAL)
2929 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2930 aq->agent->device_id, aq->id);
2931 pthread_cond_signal (&aq->queue_cond_in);
2932
2933 pthread_mutex_unlock (&aq->mutex);
2934}
2935
2936/* Request an asynchronous callback on the specified queue. The callback
2937 function will be called, with the given opaque data, from the appropriate
2938 async thread, when all previous items on that queue are complete. */
2939
2940static void
2941queue_push_callback (struct goacc_asyncqueue *aq, void (*fn)(void *),
2942 void *data)
2943{
237957cc
AS
2944 pthread_mutex_lock (&aq->mutex);
2945
d88b27da
JB
2946 wait_for_queue_nonfull (aq);
2947
237957cc
AS
2948 int queue_last = ((aq->queue_first + aq->queue_n)
2949 % ASYNC_QUEUE_SIZE);
2950 if (DEBUG_QUEUES)
2951 GCN_DEBUG ("queue_push_callback %d:%d: at %i\n", aq->agent->device_id,
2952 aq->id, queue_last);
2953
2954 aq->queue[queue_last].type = CALLBACK;
2955 aq->queue[queue_last].u.callback.fn = fn;
2956 aq->queue[queue_last].u.callback.data = data;
2957
2958 aq->queue_n++;
2959
2960 if (DEBUG_THREAD_SIGNAL)
2961 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2962 aq->agent->device_id, aq->id);
2963 pthread_cond_signal (&aq->queue_cond_in);
2964
2965 pthread_mutex_unlock (&aq->mutex);
2966}
2967
2968/* Request that a given async thread wait for another thread (unspecified) to
2969 reach the given placeholder. The wait will occur when all previous entries
2970 on the queue are complete. A placeholder is effectively a kind of signal
2971 which simply sets a flag when encountered in a queue. */
2972
2973static void
2974queue_push_asyncwait (struct goacc_asyncqueue *aq,
2975 struct placeholder *placeholderp)
2976{
237957cc
AS
2977 pthread_mutex_lock (&aq->mutex);
2978
d88b27da
JB
2979 wait_for_queue_nonfull (aq);
2980
237957cc
AS
2981 int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2982 if (DEBUG_QUEUES)
2983 GCN_DEBUG ("queue_push_asyncwait %d:%d: at %i\n", aq->agent->device_id,
2984 aq->id, queue_last);
2985
2986 aq->queue[queue_last].type = ASYNC_WAIT;
2987 aq->queue[queue_last].u.asyncwait.placeholderp = placeholderp;
2988
2989 aq->queue_n++;
2990
2991 if (DEBUG_THREAD_SIGNAL)
2992 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2993 aq->agent->device_id, aq->id);
2994 pthread_cond_signal (&aq->queue_cond_in);
2995
2996 pthread_mutex_unlock (&aq->mutex);
2997}
2998
2999/* Add a placeholder into an async queue. When the async thread reaches the
3000 placeholder it will set the "executed" flag to true and continue.
3001 Another thread may be waiting on this thread reaching the placeholder. */
3002
3003static struct placeholder *
3004queue_push_placeholder (struct goacc_asyncqueue *aq)
3005{
3006 struct placeholder *placeholderp;
3007
237957cc
AS
3008 pthread_mutex_lock (&aq->mutex);
3009
d88b27da
JB
3010 wait_for_queue_nonfull (aq);
3011
237957cc
AS
3012 int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
3013 if (DEBUG_QUEUES)
3014 GCN_DEBUG ("queue_push_placeholder %d:%d: at %i\n", aq->agent->device_id,
3015 aq->id, queue_last);
3016
3017 aq->queue[queue_last].type = ASYNC_PLACEHOLDER;
3018 placeholderp = &aq->queue[queue_last].u.placeholder;
3019
3020 if (pthread_mutex_init (&placeholderp->mutex, NULL))
3021 {
3022 pthread_mutex_unlock (&aq->mutex);
3023 GOMP_PLUGIN_error ("Failed to initialize serialization mutex");
3024 }
3025
3026 if (pthread_cond_init (&placeholderp->cond, NULL))
3027 {
3028 pthread_mutex_unlock (&aq->mutex);
3029 GOMP_PLUGIN_error ("Failed to initialize serialization cond");
3030 }
3031
3032 placeholderp->executed = 0;
3033
3034 aq->queue_n++;
3035
3036 if (DEBUG_THREAD_SIGNAL)
3037 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
3038 aq->agent->device_id, aq->id);
3039 pthread_cond_signal (&aq->queue_cond_in);
3040
3041 pthread_mutex_unlock (&aq->mutex);
3042
3043 return placeholderp;
3044}
3045
/* Signal an asynchronous thread to terminate, and wait for it to do so.
   Idempotent: if the thread has already acknowledged the stop request
   (drain_queue_stop == 2) nothing is done.  The handshake uses the queue's
   cond_in/cond_out condition variables; drain_queue sets the flag to 2
   just before exiting.  */

static void
finalize_async_thread (struct goacc_asyncqueue *aq)
{
  pthread_mutex_lock (&aq->mutex);
  if (aq->drain_queue_stop == 2)
    {
      /* Already stopped (or running in synchronous debug mode).  */
      pthread_mutex_unlock (&aq->mutex);
      return;
    }

  /* Ask the drain thread to stop...  */
  aq->drain_queue_stop = 1;

  /* ...and wake it in case it is sleeping on an empty queue.  */
  if (DEBUG_THREAD_SIGNAL)
    GCN_DEBUG ("Signalling async thread %d:%d: cond_in\n",
	       aq->agent->device_id, aq->id);
  pthread_cond_signal (&aq->queue_cond_in);

  /* Wait for the acknowledgement (flag set to 2 by drain_queue).  */
  while (aq->drain_queue_stop != 2)
    {
      if (DEBUG_THREAD_SLEEP)
	GCN_DEBUG ("Waiting for async thread %d:%d to finish, putting thread"
		   " to sleep\n", aq->agent->device_id, aq->id);
      pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
      if (DEBUG_THREAD_SLEEP)
	GCN_DEBUG ("Waiting, woke up thread %d:%d. Rechecking\n",
		   aq->agent->device_id, aq->id);
    }

  GCN_DEBUG ("Done waiting for async thread %d:%d\n", aq->agent->device_id,
	     aq->id);
  pthread_mutex_unlock (&aq->mutex);

  /* Reap the thread so its resources are released.  */
  int err = pthread_join (aq->thread_drain_queue, NULL);
  if (err != 0)
    GOMP_PLUGIN_fatal ("Join async thread %d:%d: failed: %s",
		       aq->agent->device_id, aq->id, strerror (err));
  GCN_DEBUG ("Joined with async thread %d:%d\n", aq->agent->device_id, aq->id);
}
3086
3087/* Set up an async queue for OpenMP. There will be only one. The
3088 implementation simply uses an OpenACC async queue.
3089 FIXME: is this thread-safe if two threads call this function? */
3090
3091static void
3092maybe_init_omp_async (struct agent_info *agent)
3093{
3094 if (!agent->omp_async_queue)
3095 agent->omp_async_queue
3096 = GOMP_OFFLOAD_openacc_async_construct (agent->device_id);
3097}
3098
8d2f4ddf
JB
3099/* A wrapper that works around an issue in the HSA runtime with host-to-device
3100 copies from read-only pages. */
3101
3102static void
3103hsa_memory_copy_wrapper (void *dst, const void *src, size_t len)
3104{
3105 hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, len);
3106
3107 if (status == HSA_STATUS_SUCCESS)
3108 return;
3109
3110 /* It appears that the copy fails if the source data is in a read-only page.
3111 We can't detect that easily, so try copying the data to a temporary buffer
3112 and doing the copy again if we got an error above. */
3113
3114 GCN_WARNING ("Read-only data transfer bug workaround triggered for "
3115 "[%p:+%d]\n", (void *) src, (int) len);
3116
3117 void *src_copy = malloc (len);
3118 memcpy (src_copy, src, len);
3119 status = hsa_fns.hsa_memory_copy_fn (dst, (const void *) src_copy, len);
3120 free (src_copy);
3121 if (status != HSA_STATUS_SUCCESS)
3122 GOMP_PLUGIN_error ("memory copy failed");
3123}
3124
237957cc
AS
3125/* Copy data to or from a device. This is intended for use as an async
3126 callback event. */
3127
3128static void
3129copy_data (void *data_)
3130{
3131 struct copy_data *data = (struct copy_data *)data_;
3132 GCN_DEBUG ("Async thread %d:%d: Copying %zu bytes from (%p) to (%p)\n",
3133 data->aq->agent->device_id, data->aq->id, data->len, data->src,
3134 data->dst);
8d2f4ddf 3135 hsa_memory_copy_wrapper (data->dst, data->src, data->len);
237957cc
AS
3136 if (data->free_src)
3137 free ((void *) data->src);
3138 free (data);
3139}
3140
/* Free device data.  This is intended for use as an async callback event.  */

static void
gomp_offload_free (void *ptr)
{
  GCN_DEBUG ("Async thread ?:?: Freeing %p\n", ptr);
  GOMP_OFFLOAD_free (0, ptr);
}
3149
3150/* Request an asynchronous data copy, to or from a device, on a given queue.
3151 The event will be registered as a callback. If FREE_SRC is true
3152 then the source data will be freed following the copy. */
3153
3154static void
3155queue_push_copy (struct goacc_asyncqueue *aq, void *dst, const void *src,
3156 size_t len, bool free_src)
3157{
3158 if (DEBUG_QUEUES)
3159 GCN_DEBUG ("queue_push_copy %d:%d: %zu bytes from (%p) to (%p)\n",
3160 aq->agent->device_id, aq->id, len, src, dst);
3161 struct copy_data *data
3162 = (struct copy_data *)GOMP_PLUGIN_malloc (sizeof (struct copy_data));
3163 data->dst = dst;
3164 data->src = src;
3165 data->len = len;
3166 data->free_src = free_src;
3167 data->aq = aq;
3168 queue_push_callback (aq, copy_data, data);
3169}
3170
3171/* Return true if the given queue is currently empty. */
3172
3173static int
3174queue_empty (struct goacc_asyncqueue *aq)
3175{
3176 pthread_mutex_lock (&aq->mutex);
3177 int res = aq->queue_n == 0 ? 1 : 0;
3178 pthread_mutex_unlock (&aq->mutex);
3179
3180 return res;
3181}
3182
3183/* Wait for a given queue to become empty. This implements an OpenACC wait
3184 directive. */
3185
static void
wait_queue (struct goacc_asyncqueue *aq)
{
  /* Compile-time option: drain the queue on the calling thread instead of
     sleeping until the async worker empties it.  */
  if (DRAIN_QUEUE_SYNCHRONOUS_P)
    {
      drain_queue_synchronous (aq);
      return;
    }

  pthread_mutex_lock (&aq->mutex);

  /* Sleep until queue_n (protected by aq->mutex) drops to zero; re-test on
     every wakeup since condvar waits may wake spuriously.  */
  while (aq->queue_n > 0)
    {
      if (DEBUG_THREAD_SLEEP)
	GCN_DEBUG ("waiting for thread %d:%d, putting thread to sleep\n",
		   aq->agent->device_id, aq->id);
      pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
      if (DEBUG_THREAD_SLEEP)
	GCN_DEBUG ("thread %d:%d woke up. Rechecking\n", aq->agent->device_id,
		   aq->id);
    }

  pthread_mutex_unlock (&aq->mutex);
  GCN_DEBUG ("waiting for thread %d:%d, done\n", aq->agent->device_id, aq->id);
}
3211
3212/* }}} */
3213/* {{{ OpenACC support */
3214
3215/* Execute an OpenACC kernel, synchronously or asynchronously. */
3216
static void
gcn_exec (struct kernel_info *kernel, size_t mapnum, void **hostaddrs,
	  void **devaddrs, unsigned *dims, void *targ_mem_desc, bool async,
	  struct goacc_asyncqueue *aq)
{
  if (!GOMP_OFFLOAD_can_run (kernel))
    GOMP_PLUGIN_fatal ("OpenACC host fallback unimplemented.");

  /* If we get here then this must be an OpenACC kernel.  */
  kernel->kind = KIND_OPENACC;

  /* devaddrs must be double-indirect on the target.  */
  void **ind_da = alloc_by_agent (kernel->agent, sizeof (void*) * mapnum);
  for (size_t i = 0; i < mapnum; i++)
    hsa_fns.hsa_memory_copy_fn (&ind_da[i],
				devaddrs[i] ? &devaddrs[i] : &hostaddrs[i],
				sizeof (void *));

  /* Locate this kernel's launch metadata in the loaded image.
     NOTE(review): the pointer-equality test assumes kernel->name aliases
     the kernel_infos entry rather than being a separate copy -- confirm
     against init_basic_kernel_info.  */
  struct hsa_kernel_description *hsa_kernel_desc = NULL;
  for (unsigned i = 0; i < kernel->module->image_desc->kernel_count; i++)
    {
      struct hsa_kernel_description *d
	= &kernel->module->image_desc->kernel_infos[i];
      if (d->name == kernel->name)
	{
	  hsa_kernel_desc = d;
	  break;
	}
    }

  /* We may have statically-determined dimensions in
     hsa_kernel_desc->oacc_dims[] or dimensions passed to this offload kernel
     invocation at runtime in dims[].  We allow static dimensions to take
     priority over dynamic dimensions when present (non-zero).  */
  if (hsa_kernel_desc->oacc_dims[0] > 0)
    dims[0] = hsa_kernel_desc->oacc_dims[0];
  if (hsa_kernel_desc->oacc_dims[1] > 0)
    dims[1] = hsa_kernel_desc->oacc_dims[1];
  if (hsa_kernel_desc->oacc_dims[2] > 0)
    dims[2] = hsa_kernel_desc->oacc_dims[2];

  /* If any of the OpenACC dimensions remain 0 then we get to pick a number.
     There isn't really a correct answer for this without a clue about the
     problem size, so let's do a reasonable number of single-worker gangs.
     64 gangs matches a typical Fiji device.  */

  /* NOTE: Until support for middle-end worker partitioning is merged, use 1
     for the default number of workers.  */
  if (dims[0] == 0) dims[0] = get_cu_count (kernel->agent); /* Gangs.  */
  if (dims[1] == 0) dims[1] = 1; /* Workers.  */

  /* The incoming dimensions are expressed in terms of gangs, workers, and
     vectors.  The HSA dimensions are expressed in terms of "work-items",
     which means multiples of vector lanes.

     The "grid size" specifies the size of the problem space, and the
     "work-group size" specifies how much of that we want a single compute
     unit to chew on at once.

     The three dimensions do not really correspond to hardware, but the
     important thing is that the HSA runtime will launch as many
     work-groups as it takes to process the entire grid, and each
     work-group will contain as many wave-fronts as it takes to process
     the work-items in that group.

     Essentially, as long as we set the Y dimension to 64 (the number of
     vector lanes in hardware), and the Z group size to the maximum (16),
     then we will get the gangs (X) and workers (Z) launched as we expect.

     The reason for the apparent reversal of vector and worker dimension
     order is to do with the way the run-time distributes work-items across
     v1 and v2.  */
  struct GOMP_kernel_launch_attributes kla =
    {3,
     /* Grid size.  */
     {dims[0], 64, dims[1]},
     /* Work-group size.  */
     {1, 64, 16}
    };

  /* Report an acc_ev_enqueue_launch_start event if a profiling callback is
     registered for this thread.  */
  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  acc_prof_info *prof_info = thr->prof_info;
  acc_event_info enqueue_launch_event_info;
  acc_api_info *api_info = thr->api_info;
  bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
  if (profiling_dispatch_p)
    {
      prof_info->event_type = acc_ev_enqueue_launch_start;

      enqueue_launch_event_info.launch_event.event_type
	= prof_info->event_type;
      enqueue_launch_event_info.launch_event.valid_bytes
	= _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
      enqueue_launch_event_info.launch_event.parent_construct
	= acc_construct_parallel;
      enqueue_launch_event_info.launch_event.implicit = 1;
      enqueue_launch_event_info.launch_event.tool_info = NULL;
      enqueue_launch_event_info.launch_event.kernel_name
	= (char *) kernel->name;
      enqueue_launch_event_info.launch_event.num_gangs = kla.gdims[0];
      enqueue_launch_event_info.launch_event.num_workers = kla.gdims[2];
      enqueue_launch_event_info.launch_event.vector_length = kla.gdims[1];

      api_info->device_api = acc_device_api_other;

      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
					    &enqueue_launch_event_info, api_info);
    }

  if (!async)
    {
      /* Synchronous: run now, then free the indirection array.  */
      run_kernel (kernel, ind_da, &kla, NULL, false);
      gomp_offload_free (ind_da);
    }
  else
    {
      /* Asynchronous: queue the launch, then queue a callback that frees
	 the indirection array after the kernel has run.  */
      queue_push_launch (aq, kernel, ind_da, &kla);
      if (DEBUG_QUEUES)
	GCN_DEBUG ("queue_push_callback %d:%d gomp_offload_free, %p\n",
		   aq->agent->device_id, aq->id, ind_da);
      queue_push_callback (aq, gomp_offload_free, ind_da);
    }

  if (profiling_dispatch_p)
    {
      prof_info->event_type = acc_ev_enqueue_launch_end;
      enqueue_launch_event_info.launch_event.event_type = prof_info->event_type;
      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
					    &enqueue_launch_event_info,
					    api_info);
    }
}
3349
3350/* }}} */
3351/* {{{ Generic Plugin API */
3352
/* Return the name of this accelerator plugin: "gcn".  */

const char *
GOMP_OFFLOAD_get_name (void)
{
  static const char plugin_name[] = "gcn";
  return plugin_name;
}
3360
3361/* Return the specific capabilities the HSA accelerator have. */
3362
3363unsigned int
3364GOMP_OFFLOAD_get_caps (void)
3365{
3366 /* FIXME: Enable shared memory for APU, but not discrete GPU. */
3367 return /*GOMP_OFFLOAD_CAP_SHARED_MEM |*/ GOMP_OFFLOAD_CAP_OPENMP_400
3368 | GOMP_OFFLOAD_CAP_OPENACC_200;
3369}
3370
3371/* Identify as GCN accelerator. */
3372
3373int
3374GOMP_OFFLOAD_get_type (void)
3375{
3376 return OFFLOAD_TARGET_TYPE_GCN;
3377}
3378
3379/* Return the libgomp version number we're compatible with. There is
3380 no requirement for cross-version compatibility. */
3381
3382unsigned
3383GOMP_OFFLOAD_version (void)
3384{
3385 return GOMP_VERSION;
3386}
3387
3388/* Return the number of GCN devices on the system. */
3389
3390int
3391GOMP_OFFLOAD_get_num_devices (void)
3392{
3393 if (!init_hsa_context ())
3394 return 0;
3395 return hsa_context.agent_count;
3396}
3397
3398/* Initialize device (agent) number N so that it can be used for computation.
3399 Return TRUE on success. */
3400
3401bool
3402GOMP_OFFLOAD_init_device (int n)
3403{
3404 if (!init_hsa_context ())
3405 return false;
3406 if (n >= hsa_context.agent_count)
3407 {
3408 GOMP_PLUGIN_error ("Request to initialize non-existent GCN device %i", n);
3409 return false;
3410 }
3411 struct agent_info *agent = &hsa_context.agents[n];
3412
3413 if (agent->initialized)
3414 return true;
3415
3416 agent->device_id = n;
3417
3418 if (pthread_rwlock_init (&agent->module_rwlock, NULL))
3419 {
3420 GOMP_PLUGIN_error ("Failed to initialize a GCN agent rwlock");
3421 return false;
3422 }
3423 if (pthread_mutex_init (&agent->prog_mutex, NULL))
3424 {
3425 GOMP_PLUGIN_error ("Failed to initialize a GCN agent program mutex");
3426 return false;
3427 }
3428 if (pthread_mutex_init (&agent->async_queues_mutex, NULL))
3429 {
3430 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
3431 return false;
3432 }
3433 if (pthread_mutex_init (&agent->team_arena_write_lock, NULL))
3434 {
3435 GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
3436 return false;
3437 }
3438 agent->async_queues = NULL;
3439 agent->omp_async_queue = NULL;
3440 agent->team_arena_list = NULL;
3441
3442 uint32_t queue_size;
3443 hsa_status_t status;
3444 status = hsa_fns.hsa_agent_get_info_fn (agent->id,
3445 HSA_AGENT_INFO_QUEUE_MAX_SIZE,
3446 &queue_size);
3447 if (status != HSA_STATUS_SUCCESS)
3448 return hsa_error ("Error requesting maximum queue size of the GCN agent",
3449 status);
3450
237957cc 3451 status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_NAME,
2e5ea579 3452 &agent->name);
237957cc
AS
3453 if (status != HSA_STATUS_SUCCESS)
3454 return hsa_error ("Error querying the name of the agent", status);
7d593fd6 3455
2e5ea579 3456 agent->device_isa = isa_code (agent->name);
7d593fd6 3457 if (agent->device_isa < 0)
2e5ea579
FH
3458 return hsa_error ("Unknown GCN agent architecture", HSA_STATUS_ERROR);
3459
3460 status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_VENDOR_NAME,
3461 &agent->vendor_name);
3462 if (status != HSA_STATUS_SUCCESS)
3463 return hsa_error ("Error querying the vendor name of the agent", status);
237957cc
AS
3464
3465 status = hsa_fns.hsa_queue_create_fn (agent->id, queue_size,
3466 HSA_QUEUE_TYPE_MULTI,
3467 hsa_queue_callback, NULL, UINT32_MAX,
3468 UINT32_MAX, &agent->sync_queue);
3469 if (status != HSA_STATUS_SUCCESS)
3470 return hsa_error ("Error creating command queue", status);
3471
3472 agent->kernarg_region.handle = (uint64_t) -1;
3473 status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
3474 get_kernarg_memory_region,
3475 &agent->kernarg_region);
3476 if (agent->kernarg_region.handle == (uint64_t) -1)
3477 {
3478 GOMP_PLUGIN_error ("Could not find suitable memory region for kernel "
3479 "arguments");
3480 return false;
3481 }
3482 GCN_DEBUG ("Selected kernel arguments memory region:\n");
3483 dump_hsa_region (agent->kernarg_region, NULL);
3484
3485 agent->data_region.handle = (uint64_t) -1;
3486 status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
3487 get_data_memory_region,
3488 &agent->data_region);
3489 if (agent->data_region.handle == (uint64_t) -1)
3490 {
3491 GOMP_PLUGIN_error ("Could not find suitable memory region for device "
3492 "data");
3493 return false;
3494 }
3495 GCN_DEBUG ("Selected device data memory region:\n");
3496 dump_hsa_region (agent->data_region, NULL);
3497
3498 GCN_DEBUG ("GCN agent %d initialized\n", n);
3499
3500 agent->initialized = true;
3501 return true;
3502}
3503
3504/* Load GCN object-code module described by struct gcn_image_desc in
3505 TARGET_DATA and return references to kernel descriptors in TARGET_TABLE.
3506 If there are any constructors then run them. */
3507
int
GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
			 struct addr_pair **target_table)
{
  if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
			 " (expected %u, received %u)",
			 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
      return -1;
    }

  struct gcn_image_desc *image_desc = (struct gcn_image_desc *) target_data;
  struct agent_info *agent;
  struct addr_pair *pair;
  struct module_info *module;
  struct kernel_info *kernel;
  int kernel_count = image_desc->kernel_count;
  unsigned var_count = image_desc->global_variable_count;

  agent = get_agent_info (ord);
  if (!agent)
    return -1;

  if (pthread_rwlock_wrlock (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
      return -1;
    }
  /* Loading a new image invalidates any previously finalized program.  */
  if (agent->prog_finalized
      && !destroy_hsa_program (agent))
    return -1;

  GCN_DEBUG ("Encountered %d kernels in an image\n", kernel_count);
  GCN_DEBUG ("Encountered %u global variables in an image\n", var_count);
  /* NOTE(review): "- 2" reserves no table slots for the _init_array and
     _fini_array pseudo-kernels skipped below; this assumes both are always
     present in the image -- confirm against the GCN mkoffload output.  */
  pair = GOMP_PLUGIN_malloc ((kernel_count + var_count - 2)
			     * sizeof (struct addr_pair));
  *target_table = pair;
  module = (struct module_info *)
    GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info)
				+ kernel_count * sizeof (struct kernel_info));
  module->image_desc = image_desc;
  module->kernel_count = kernel_count;
  module->heap = NULL;
  module->constructors_run_p = false;

  kernel = &module->kernels[0];

  /* Allocate memory for kernel dependencies.  */
  for (unsigned i = 0; i < kernel_count; i++)
    {
      struct hsa_kernel_description *d = &image_desc->kernel_infos[i];
      if (!init_basic_kernel_info (kernel, d, agent, module))
	return -1;
      /* The constructor/destructor pseudo-kernels are remembered but not
	 reported to libgomp via the address table.  */
      if (strcmp (d->name, "_init_array") == 0)
	module->init_array_func = kernel;
      else if (strcmp (d->name, "_fini_array") == 0)
	module->fini_array_func = kernel;
      else
	{
	  pair->start = (uintptr_t) kernel;
	  pair->end = (uintptr_t) (kernel + 1);
	  pair++;
	}
      kernel++;
    }

  agent->module = module;
  if (pthread_rwlock_unlock (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
      return -1;
    }

  if (!create_and_finalize_hsa_program (agent))
    return -1;

  /* Resolve each global variable's device address and size and append
     them to the address table after the kernels.  */
  for (unsigned i = 0; i < var_count; i++)
    {
      struct global_var_info *v = &image_desc->global_variables[i];
      GCN_DEBUG ("Looking for variable %s\n", v->name);

      hsa_status_t status;
      hsa_executable_symbol_t var_symbol;
      status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
						     v->name, agent->id,
						     0, &var_symbol);

      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not find symbol for variable in the code object",
		   status);

      uint64_t var_addr;
      uint32_t var_size;
      status = hsa_fns.hsa_executable_symbol_get_info_fn
	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &var_addr);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not extract a variable from its symbol", status);
      status = hsa_fns.hsa_executable_symbol_get_info_fn
	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &var_size);
      if (status != HSA_STATUS_SUCCESS)
	hsa_fatal ("Could not extract a variable size from its symbol", status);

      pair->start = var_addr;
      pair->end = var_addr + var_size;
      GCN_DEBUG ("Found variable %s at %p with size %u\n", v->name,
		 (void *)var_addr, var_size);
      pair++;
    }

  /* Ensure that constructors are run first.  */
  struct GOMP_kernel_launch_attributes kla =
    { 3,
      /* Grid size.  */
      { 1, 64, 1 },
      /* Work-group size.  */
      { 1, 64, 1 }
    };

  if (module->init_array_func)
    {
      init_kernel (module->init_array_func);
      run_kernel (module->init_array_func, NULL, &kla, NULL, false);
    }
  module->constructors_run_p = true;

  /* Don't report kernels that libgomp need not know about.  */
  if (module->init_array_func)
    kernel_count--;
  if (module->fini_array_func)
    kernel_count--;

  return kernel_count + var_count;
}
3642
3643/* Unload GCN object-code module described by struct gcn_image_desc in
3644 TARGET_DATA from agent number N. Return TRUE on success. */
3645
3646bool
3647GOMP_OFFLOAD_unload_image (int n, unsigned version, const void *target_data)
3648{
3649 if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
3650 {
3651 GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3652 " (expected %u, received %u)",
3653 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
3654 return false;
3655 }
3656
3657 struct agent_info *agent;
3658 agent = get_agent_info (n);
3659 if (!agent)
3660 return false;
3661
3662 if (pthread_rwlock_wrlock (&agent->module_rwlock))
3663 {
3664 GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3665 return false;
3666 }
3667
3668 if (!agent->module || agent->module->image_desc != target_data)
3669 {
3670 GOMP_PLUGIN_error ("Attempt to unload an image that has never been "
3671 "loaded before");
3672 return false;
3673 }
3674
3675 if (!destroy_module (agent->module, true))
3676 return false;
3677 free (agent->module);
3678 agent->module = NULL;
3679 if (!destroy_hsa_program (agent))
3680 return false;
3681 if (pthread_rwlock_unlock (&agent->module_rwlock))
3682 {
3683 GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3684 return false;
3685 }
3686 return true;
3687}
3688
3689/* Deinitialize all information and status associated with agent number N. We
3690 do not attempt any synchronization, assuming the user and libgomp will not
3691 attempt deinitialization of a device that is in any way being used at the
3692 same time. Return TRUE on success. */
3693
3694bool
3695GOMP_OFFLOAD_fini_device (int n)
3696{
3697 struct agent_info *agent = get_agent_info (n);
3698 if (!agent)
3699 return false;
3700
3701 if (!agent->initialized)
3702 return true;
3703
3704 if (agent->omp_async_queue)
3705 {
3706 GOMP_OFFLOAD_openacc_async_destruct (agent->omp_async_queue);
3707 agent->omp_async_queue = NULL;
3708 }
3709
3710 if (agent->module)
3711 {
3712 if (!destroy_module (agent->module, false))
3713 return false;
3714 free (agent->module);
3715 agent->module = NULL;
3716 }
3717
3718 if (!destroy_team_arenas (agent))
3719 return false;
3720
3721 if (!destroy_hsa_program (agent))
3722 return false;
3723
3724 hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (agent->sync_queue);
3725 if (status != HSA_STATUS_SUCCESS)
3726 return hsa_error ("Error destroying command queue", status);
3727
3728 if (pthread_mutex_destroy (&agent->prog_mutex))
3729 {
3730 GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex");
3731 return false;
3732 }
3733 if (pthread_rwlock_destroy (&agent->module_rwlock))
3734 {
3735 GOMP_PLUGIN_error ("Failed to destroy a GCN agent rwlock");
3736 return false;
3737 }
3738
3739 if (pthread_mutex_destroy (&agent->async_queues_mutex))
3740 {
3741 GOMP_PLUGIN_error ("Failed to destroy a GCN agent queue mutex");
3742 return false;
3743 }
3744 if (pthread_mutex_destroy (&agent->team_arena_write_lock))
3745 {
3746 GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex");
3747 return false;
3748 }
3749 agent->initialized = false;
3750 return true;
3751}
3752
3753/* Return true if the HSA runtime can run function FN_PTR. */
3754
3755bool
3756GOMP_OFFLOAD_can_run (void *fn_ptr)
3757{
3758 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3759
3760 init_kernel (kernel);
3761 if (kernel->initialization_failed)
3762 goto failure;
3763
3764 return true;
3765
3766failure:
3767 if (suppress_host_fallback)
3768 GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
3769 GCN_WARNING ("GCN target cannot be launched, doing a host fallback\n");
3770 return false;
3771}
3772
3773/* Allocate memory on device N. */
3774
void *
GOMP_OFFLOAD_alloc (int n, size_t size)
{
  /* Delegate to the per-agent allocator.  */
  return alloc_by_agent (get_agent_info (n), size);
}
3781
3782/* Free memory from device N. */
3783
3784bool
3785GOMP_OFFLOAD_free (int device, void *ptr)
3786{
3787 GCN_DEBUG ("Freeing memory on device %d\n", device);
3788
3789 hsa_status_t status = hsa_fns.hsa_memory_free_fn (ptr);
3790 if (status != HSA_STATUS_SUCCESS)
3791 {
3792 hsa_error ("Could not free device memory", status);
3793 return false;
3794 }
3795
3796 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
3797 bool profiling_dispatch_p
3798 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
3799 if (profiling_dispatch_p)
3800 {
3801 acc_prof_info *prof_info = thr->prof_info;
3802 acc_event_info data_event_info;
3803 acc_api_info *api_info = thr->api_info;
3804
3805 prof_info->event_type = acc_ev_free;
3806
3807 data_event_info.data_event.event_type = prof_info->event_type;
3808 data_event_info.data_event.valid_bytes
3809 = _ACC_DATA_EVENT_INFO_VALID_BYTES;
3810 data_event_info.data_event.parent_construct
3811 = acc_construct_parallel;
3812 data_event_info.data_event.implicit = 1;
3813 data_event_info.data_event.tool_info = NULL;
3814 data_event_info.data_event.var_name = NULL;
3815 data_event_info.data_event.bytes = 0;
3816 data_event_info.data_event.host_ptr = NULL;
3817 data_event_info.data_event.device_ptr = (void *) ptr;
3818
3819 api_info->device_api = acc_device_api_other;
3820
3821 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
3822 api_info);
3823 }
3824
3825 return true;
3826}
3827
3828/* Copy data from DEVICE to host. */
3829
3830bool
3831GOMP_OFFLOAD_dev2host (int device, void *dst, const void *src, size_t n)
3832{
3833 GCN_DEBUG ("Copying %zu bytes from device %d (%p) to host (%p)\n", n, device,
3834 src, dst);
8d2f4ddf
JB
3835 hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
3836 if (status != HSA_STATUS_SUCCESS)
3837 GOMP_PLUGIN_error ("memory copy failed");
237957cc
AS
3838 return true;
3839}
3840
3841/* Copy data from host to DEVICE. */
3842
bool
GOMP_OFFLOAD_host2dev (int device, void *dst, const void *src, size_t n)
{
  GCN_DEBUG ("Copying %zu bytes from host (%p) to device %d (%p)\n", n, src,
	     device, dst);

  /* Use the wrapper, which retries copies from read-only host pages.  */
  hsa_memory_copy_wrapper (dst, src, n);

  return true;
}
3851
3852/* Copy data within DEVICE. Do the copy asynchronously, if appropriate. */
3853
3854bool
3855GOMP_OFFLOAD_dev2dev (int device, void *dst, const void *src, size_t n)
3856{
3857 struct gcn_thread *thread_data = gcn_thread ();
3858
3859 if (thread_data && !async_synchronous_p (thread_data->async))
3860 {
3861 struct agent_info *agent = get_agent_info (device);
3862 maybe_init_omp_async (agent);
3863 queue_push_copy (agent->omp_async_queue, dst, src, n, false);
3864 return true;
3865 }
3866
3867 GCN_DEBUG ("Copying %zu bytes from device %d (%p) to device %d (%p)\n", n,
3868 device, src, device, dst);
8d2f4ddf
JB
3869 hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
3870 if (status != HSA_STATUS_SUCCESS)
3871 GOMP_PLUGIN_error ("memory copy failed");
237957cc
AS
3872 return true;
3873}
3874
3875/* }}} */
3876/* {{{ OpenMP Plugin API */
3877
3878/* Run a synchronous OpenMP kernel on DEVICE and pass it an array of pointers
3879 in VARS as a parameter. The kernel is identified by FN_PTR which must point
3880 to a kernel_info structure, and must have previously been loaded to the
3881 specified device. */
3882
3883void
3884GOMP_OFFLOAD_run (int device, void *fn_ptr, void *vars, void **args)
3885{
3886 struct agent_info *agent = get_agent_info (device);
3887 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3888 struct GOMP_kernel_launch_attributes def;
3889 struct GOMP_kernel_launch_attributes *kla;
3890 assert (agent == kernel->agent);
3891
3892 /* If we get here then the kernel must be OpenMP. */
3893 kernel->kind = KIND_OPENMP;
3894
3895 if (!parse_target_attributes (args, &def, &kla, agent))
3896 {
3897 GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
3898 return;
3899 }
3900 run_kernel (kernel, vars, kla, NULL, false);
3901}
3902
3903/* Run an asynchronous OpenMP kernel on DEVICE. This is similar to
3904 GOMP_OFFLOAD_run except that the launch is queued and there is a call to
3905 GOMP_PLUGIN_target_task_completion when it has finished. */
3906
3907void
3908GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
3909 void **args, void *async_data)
3910{
3911 GCN_DEBUG ("GOMP_OFFLOAD_async_run invoked\n");
3912 struct agent_info *agent = get_agent_info (device);
3913 struct kernel_info *kernel = (struct kernel_info *) tgt_fn;
3914 struct GOMP_kernel_launch_attributes def;
3915 struct GOMP_kernel_launch_attributes *kla;
3916 assert (agent == kernel->agent);
3917
3918 /* If we get here then the kernel must be OpenMP. */
3919 kernel->kind = KIND_OPENMP;
3920
3921 if (!parse_target_attributes (args, &def, &kla, agent))
3922 {
3923 GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
3924 return;
3925 }
3926
3927 maybe_init_omp_async (agent);
3928 queue_push_launch (agent->omp_async_queue, kernel, tgt_vars, kla);
3929 queue_push_callback (agent->omp_async_queue,
3930 GOMP_PLUGIN_target_task_completion, async_data);
3931}
3932
3933/* }}} */
3934/* {{{ OpenACC Plugin API */
3935
3936/* Run a synchronous OpenACC kernel. The device number is inferred from the
3937 already-loaded KERNEL. */
3938
3939void
3940GOMP_OFFLOAD_openacc_exec (void (*fn_ptr) (void *), size_t mapnum,
3941 void **hostaddrs, void **devaddrs, unsigned *dims,
3942 void *targ_mem_desc)
3943{
3944 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3945
3946 gcn_exec (kernel, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, false,
3947 NULL);
3948}
3949
3950/* Run an asynchronous OpenACC kernel on the specified queue. */
3951
3952void
3953GOMP_OFFLOAD_openacc_async_exec (void (*fn_ptr) (void *), size_t mapnum,
3954 void **hostaddrs, void **devaddrs,
3955 unsigned *dims, void *targ_mem_desc,
3956 struct goacc_asyncqueue *aq)
3957{
3958 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3959
3960 gcn_exec (kernel, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, true,
3961 aq);
3962}
3963
3964/* Create a new asynchronous thread and queue for running future kernels. */
3965
3966struct goacc_asyncqueue *
3967GOMP_OFFLOAD_openacc_async_construct (int device)
3968{
3969 struct agent_info *agent = get_agent_info (device);
3970
3971 pthread_mutex_lock (&agent->async_queues_mutex);
3972
3973 struct goacc_asyncqueue *aq = GOMP_PLUGIN_malloc (sizeof (*aq));
3974 aq->agent = get_agent_info (device);
3975 aq->prev = NULL;
3976 aq->next = agent->async_queues;
3977 if (aq->next)
3978 {
3979 aq->next->prev = aq;
3980 aq->id = aq->next->id + 1;
3981 }
3982 else
3983 aq->id = 1;
3984 agent->async_queues = aq;
3985
3986 aq->queue_first = 0;
3987 aq->queue_n = 0;
3988 aq->drain_queue_stop = 0;
3989
3990 if (pthread_mutex_init (&aq->mutex, NULL))
3991 {
3992 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
3993 return false;
3994 }
3995 if (pthread_cond_init (&aq->queue_cond_in, NULL))
3996 {
3997 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
3998 return false;
3999 }
4000 if (pthread_cond_init (&aq->queue_cond_out, NULL))
4001 {
4002 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
4003 return false;
4004 }
4005
4006 hsa_status_t status = hsa_fns.hsa_queue_create_fn (agent->id,
4007 ASYNC_QUEUE_SIZE,
4008 HSA_QUEUE_TYPE_MULTI,
4009 hsa_queue_callback, NULL,
4010 UINT32_MAX, UINT32_MAX,
4011 &aq->hsa_queue);
4012 if (status != HSA_STATUS_SUCCESS)
4013 hsa_fatal ("Error creating command queue", status);
4014
4015 int err = pthread_create (&aq->thread_drain_queue, NULL, &drain_queue, aq);
4016 if (err != 0)
4017 GOMP_PLUGIN_fatal ("GCN asynchronous thread creation failed: %s",
4018 strerror (err));
4019 GCN_DEBUG ("Async thread %d:%d: created\n", aq->agent->device_id,
4020 aq->id);
4021
4022 pthread_mutex_unlock (&agent->async_queues_mutex);
4023
4024 return aq;
4025}
4026
/* Destroy an existing asynchronous thread and queue.  Waits for any
   currently-running task to complete, but cancels any queued tasks.  */
4029
bool
GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = aq->agent;

  /* Stop and join the drain thread before touching the queue's state.  */
  finalize_async_thread (aq);

  pthread_mutex_lock (&agent->async_queues_mutex);

  int err;
  if ((err = pthread_mutex_destroy (&aq->mutex)))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue mutex: %d", err);
      goto fail;
    }
  if (pthread_cond_destroy (&aq->queue_cond_in))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
      goto fail;
    }
  if (pthread_cond_destroy (&aq->queue_cond_out))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
      goto fail;
    }
  hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (aq->hsa_queue);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Error destroying command queue", status);
      goto fail;
    }

  /* Unlink AQ from the agent's doubly-linked list of queues.  */
  if (aq->prev)
    aq->prev->next = aq->next;
  if (aq->next)
    aq->next->prev = aq->prev;
  if (agent->async_queues == aq)
    agent->async_queues = aq->next;

  GCN_DEBUG ("Async thread %d:%d: destroyed\n", agent->device_id, aq->id);

  free (aq);
  pthread_mutex_unlock (&agent->async_queues_mutex);
  return true;

fail:
  /* NOTE(review): on failure AQ remains linked in the agent's list with
     partially-destroyed primitives; callers appear to treat a false return
     as unrecoverable -- confirm.  */
  pthread_mutex_unlock (&agent->async_queues_mutex);
  return false;
}
4079
/* Return true if the specified async queue is currently empty.  */

int
GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
{
  /* Non-blocking query; queue_empty does its own locking.  */
  int is_empty = queue_empty (aq);
  return is_empty;
}
4087
/* Block until the specified queue has executed all its tasks and the
   queue is empty.  */

bool
GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
{
  /* wait_queue blocks until AQ has drained completely.  */
  wait_queue (aq);
  return true;
}
4097
/* Add a serialization point across two async queues.  Any new tasks added to
   AQ2, after this call, will not run until all tasks on AQ1, at the time
   of this call, have completed.  */

bool
GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
				      struct goacc_asyncqueue *aq2)
{
  /* A queue is trivially serialized with itself; nothing to do.  */
  if (aq1 == aq2)
    return true;

  /* Push a placeholder onto AQ1 and make AQ2 wait for it, so that work
     queued on AQ2 from here on runs only after AQ1's current work.  */
  struct placeholder *marker = queue_push_placeholder (aq1);
  queue_push_asyncwait (aq2, marker);
  return true;
}
4115
/* Add an opaque callback to the given async queue.  FN is invoked with DATA
   once all previously-queued tasks on AQ have completed.  */

void
GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
					   void (*fn) (void *), void *data)
{
  queue_push_callback (aq, fn, data);
}
4124
4125/* Queue up an asynchronous data copy from host to DEVICE. */
4126
4127bool
4128GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
4129 size_t n, struct goacc_asyncqueue *aq)
4130{
4131 struct agent_info *agent = get_agent_info (device);
4132 assert (agent == aq->agent);
4133 /* The source data does not necessarily remain live until the deferred
4134 copy happens. Taking a snapshot of the data here avoids reading
4135 uninitialised data later, but means that (a) data is copied twice and
4136 (b) modifications to the copied data between the "spawning" point of
4137 the asynchronous kernel and when it is executed will not be seen.
4138 But, that is probably correct. */
4139 void *src_copy = GOMP_PLUGIN_malloc (n);
4140 memcpy (src_copy, src, n);
4141 queue_push_copy (aq, dst, src_copy, n, true);
4142 return true;
4143}
4144
4145/* Queue up an asynchronous data copy from DEVICE to host. */
4146
4147bool
4148GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src,
4149 size_t n, struct goacc_asyncqueue *aq)
4150{
4151 struct agent_info *agent = get_agent_info (device);
4152 assert (agent == aq->agent);
4153 queue_push_copy (aq, dst, src, n, false);
4154 return true;
4155}
4156
6fc0385c
TS
4157union goacc_property_value
4158GOMP_OFFLOAD_openacc_get_property (int device, enum goacc_property prop)
4159{
2e5ea579
FH
4160 struct agent_info *agent = get_agent_info (device);
4161
4162 union goacc_property_value propval = { .val = 0 };
4163
4164 switch (prop)
4165 {
4166 case GOACC_PROPERTY_FREE_MEMORY:
4167 /* Not supported. */
4168 break;
4169 case GOACC_PROPERTY_MEMORY:
4170 {
4171 size_t size;
4172 hsa_region_t region = agent->data_region;
4173 hsa_status_t status =
4174 hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
4175 if (status == HSA_STATUS_SUCCESS)
4176 propval.val = size;
4177 break;
4178 }
4179 case GOACC_PROPERTY_NAME:
4180 propval.ptr = agent->name;
4181 break;
4182 case GOACC_PROPERTY_VENDOR:
4183 propval.ptr = agent->vendor_name;
4184 break;
4185 case GOACC_PROPERTY_DRIVER:
4186 propval.ptr = hsa_context.driver_version_s;
4187 break;
4188 }
6fc0385c 4189
2e5ea579 4190 return propval;
6fc0385c
TS
4191}
4192
237957cc
AS
4193/* Set up plugin-specific thread-local-data (host-side). */
4194
4195void *
4196GOMP_OFFLOAD_openacc_create_thread_data (int ord __attribute__((unused)))
4197{
4198 struct gcn_thread *thread_data
4199 = GOMP_PLUGIN_malloc (sizeof (struct gcn_thread));
4200
4201 thread_data->async = GOMP_ASYNC_SYNC;
4202
4203 return (void *) thread_data;
4204}
4205
/* Clean up plugin-specific thread-local-data.  DATA was allocated by
   GOMP_OFFLOAD_openacc_create_thread_data; free (NULL) is a no-op.  */

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}
4213
4214/* }}} */