]> git.ipfire.org Git - thirdparty/gcc.git/blame - libgomp/plugin/plugin-gcn.c
Import HSA header files from AMD
[thirdparty/gcc.git] / libgomp / plugin / plugin-gcn.c
CommitLineData
237957cc
AS
1/* Plugin for AMD GCN execution.
2
8d9254fc 3 Copyright (C) 2013-2020 Free Software Foundation, Inc.
237957cc
AS
4
5 Contributed by Mentor Embedded
6
7 This file is part of the GNU Offloading and Multi Processing Library
8 (libgomp).
9
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
13 any later version.
14
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
18 more details.
19
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
23
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
28
29/* {{{ Includes and defines */
30
31#include "config.h"
32#include <stdio.h>
33#include <stdlib.h>
34#include <string.h>
35#include <pthread.h>
36#include <inttypes.h>
37#include <stdbool.h>
38#include <limits.h>
39#include <hsa.h>
85f0a4d9 40#include <hsa_ext_amd.h>
237957cc
AS
41#include <dlfcn.h>
42#include <signal.h>
43#include "libgomp-plugin.h"
44#include "gomp-constants.h"
45#include <elf.h>
46#include "oacc-plugin.h"
47#include "oacc-int.h"
48#include <assert.h>
49
237957cc 50/* These probably won't be in elf.h for a while. */
97981e13 51#ifndef R_AMDGPU_NONE
237957cc
AS
52#define R_AMDGPU_NONE 0
53#define R_AMDGPU_ABS32_LO 1 /* (S + A) & 0xFFFFFFFF */
54#define R_AMDGPU_ABS32_HI 2 /* (S + A) >> 32 */
55#define R_AMDGPU_ABS64 3 /* S + A */
56#define R_AMDGPU_REL32 4 /* S + A - P */
57#define R_AMDGPU_REL64 5 /* S + A - P */
58#define R_AMDGPU_ABS32 6 /* S + A */
59#define R_AMDGPU_GOTPCREL 7 /* G + GOT + A - P */
60#define R_AMDGPU_GOTPCREL32_LO 8 /* (G + GOT + A - P) & 0xFFFFFFFF */
61#define R_AMDGPU_GOTPCREL32_HI 9 /* (G + GOT + A - P) >> 32 */
62#define R_AMDGPU_REL32_LO 10 /* (S + A - P) & 0xFFFFFFFF */
63#define R_AMDGPU_REL32_HI 11 /* (S + A - P) >> 32 */
237957cc 64#define R_AMDGPU_RELATIVE64 13 /* B + A */
97981e13 65#endif
237957cc
AS
66
67/* GCN specific definitions for asynchronous queues. */
68
69#define ASYNC_QUEUE_SIZE 64
70#define DRAIN_QUEUE_SYNCHRONOUS_P false
71#define DEBUG_QUEUES 0
72#define DEBUG_THREAD_SLEEP 0
73#define DEBUG_THREAD_SIGNAL 0
74
75/* Defaults. */
76#define DEFAULT_GCN_HEAP_SIZE (100*1024*1024) /* 100MB. */
77
78/* Secure getenv() which returns NULL if running as SUID/SGID. */
79#ifndef HAVE_SECURE_GETENV
80#ifdef HAVE___SECURE_GETENV
81#define secure_getenv __secure_getenv
82#elif defined (HAVE_UNISTD_H) && defined(HAVE_GETUID) && defined(HAVE_GETEUID) \
83 && defined(HAVE_GETGID) && defined(HAVE_GETEGID)
84
85#include <unistd.h>
86
87/* Implementation of secure_getenv() for targets where it is not provided but
88 we have at least means to test real and effective IDs. */
89
90static char *
91secure_getenv (const char *name)
92{
93 if ((getuid () == geteuid ()) && (getgid () == getegid ()))
94 return getenv (name);
95 else
96 return NULL;
97}
98
99#else
100#define secure_getenv getenv
101#endif
102#endif
103
104/* }}} */
105/* {{{ Types */
106
93d90219 107/* GCN-specific implementation of the GOMP_PLUGIN_acc_thread data. */
237957cc
AS
108
109struct gcn_thread
110{
111 /* The thread number from the async clause, or GOMP_ASYNC_SYNC. */
112 int async;
113};
114
115/* As an HSA runtime is dlopened, following structure defines function
116 pointers utilized by the HSA plug-in. */
117
118struct hsa_runtime_fn_info
119{
120 /* HSA runtime. */
121 hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
122 const char **status_string);
123 hsa_status_t (*hsa_system_get_info_fn) (hsa_system_info_t attribute,
124 void *value);
125 hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
126 hsa_agent_info_t attribute,
127 void *value);
128 hsa_status_t (*hsa_isa_get_info_fn)(hsa_isa_t isa,
129 hsa_isa_info_t attribute,
130 uint32_t index,
131 void *value);
132 hsa_status_t (*hsa_init_fn) (void);
133 hsa_status_t (*hsa_iterate_agents_fn)
134 (hsa_status_t (*callback)(hsa_agent_t agent, void *data), void *data);
135 hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
136 hsa_region_info_t attribute,
137 void *value);
138 hsa_status_t (*hsa_queue_create_fn)
139 (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
140 void (*callback)(hsa_status_t status, hsa_queue_t *source, void *data),
141 void *data, uint32_t private_segment_size,
142 uint32_t group_segment_size, hsa_queue_t **queue);
143 hsa_status_t (*hsa_agent_iterate_regions_fn)
144 (hsa_agent_t agent,
145 hsa_status_t (*callback)(hsa_region_t region, void *data), void *data);
146 hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
147 hsa_status_t (*hsa_executable_create_fn)
148 (hsa_profile_t profile, hsa_executable_state_t executable_state,
149 const char *options, hsa_executable_t *executable);
150 hsa_status_t (*hsa_executable_global_variable_define_fn)
151 (hsa_executable_t executable, const char *variable_name, void *address);
152 hsa_status_t (*hsa_executable_load_code_object_fn)
153 (hsa_executable_t executable, hsa_agent_t agent,
154 hsa_code_object_t code_object, const char *options);
155 hsa_status_t (*hsa_executable_freeze_fn)(hsa_executable_t executable,
156 const char *options);
157 hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
158 uint32_t num_consumers,
159 const hsa_agent_t *consumers,
160 hsa_signal_t *signal);
161 hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
162 void **ptr);
163 hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent,
164 hsa_access_permission_t access);
165 hsa_status_t (*hsa_memory_copy_fn)(void *dst, const void *src, size_t size);
166 hsa_status_t (*hsa_memory_free_fn) (void *ptr);
167 hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
168 hsa_status_t (*hsa_executable_get_symbol_fn)
169 (hsa_executable_t executable, const char *module_name,
170 const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
171 hsa_executable_symbol_t *symbol);
172 hsa_status_t (*hsa_executable_symbol_get_info_fn)
173 (hsa_executable_symbol_t executable_symbol,
174 hsa_executable_symbol_info_t attribute, void *value);
175 hsa_status_t (*hsa_executable_iterate_symbols_fn)
176 (hsa_executable_t executable,
177 hsa_status_t (*callback)(hsa_executable_t executable,
178 hsa_executable_symbol_t symbol, void *data),
179 void *data);
180 uint64_t (*hsa_queue_add_write_index_release_fn) (const hsa_queue_t *queue,
181 uint64_t value);
182 uint64_t (*hsa_queue_load_read_index_acquire_fn) (const hsa_queue_t *queue);
183 void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
184 hsa_signal_value_t value);
185 void (*hsa_signal_store_release_fn) (hsa_signal_t signal,
186 hsa_signal_value_t value);
187 hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
188 (hsa_signal_t signal, hsa_signal_condition_t condition,
189 hsa_signal_value_t compare_value, uint64_t timeout_hint,
190 hsa_wait_state_t wait_state_hint);
191 hsa_signal_value_t (*hsa_signal_load_acquire_fn) (hsa_signal_t signal);
192 hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);
193
194 hsa_status_t (*hsa_code_object_deserialize_fn)
195 (void *serialized_code_object, size_t serialized_code_object_size,
196 const char *options, hsa_code_object_t *code_object);
197};
198
199/* Structure describing the run-time and grid properties of an HSA kernel
200 lauch. This needs to match the format passed to GOMP_OFFLOAD_run. */
201
202struct GOMP_kernel_launch_attributes
203{
204 /* Number of dimensions the workload has. Maximum number is 3. */
205 uint32_t ndim;
206 /* Size of the grid in the three respective dimensions. */
207 uint32_t gdims[3];
208 /* Size of work-groups in the respective dimensions. */
209 uint32_t wdims[3];
210};
211
212/* Collection of information needed for a dispatch of a kernel from a
213 kernel. */
214
215struct kernel_dispatch
216{
217 struct agent_info *agent;
218 /* Pointer to a command queue associated with a kernel dispatch agent. */
219 void *queue;
220 /* Pointer to a memory space used for kernel arguments passing. */
221 void *kernarg_address;
222 /* Kernel object. */
223 uint64_t object;
224 /* Synchronization signal used for dispatch synchronization. */
225 uint64_t signal;
226 /* Private segment size. */
227 uint32_t private_segment_size;
228 /* Group segment size. */
229 uint32_t group_segment_size;
230};
231
232/* Structure of the kernargs segment, supporting console output.
233
234 This needs to match the definitions in Newlib, and the expectations
235 in libgomp target code. */
236
237struct kernargs {
238 /* Leave space for the real kernel arguments.
239 OpenACC and OpenMP only use one pointer. */
240 int64_t dummy1;
241 int64_t dummy2;
242
243 /* A pointer to struct output, below, for console output data. */
244 int64_t out_ptr;
245
246 /* A pointer to struct heap, below. */
247 int64_t heap_ptr;
248
249 /* A pointer to an ephemeral memory arena.
250 Only needed for OpenMP. */
251 int64_t arena_ptr;
252
253 /* Output data. */
254 struct output {
255 int return_value;
256 unsigned int next_output;
257 struct printf_data {
258 int written;
259 char msg[128];
260 int type;
261 union {
262 int64_t ivalue;
263 double dvalue;
264 char text[128];
265 };
266 } queue[1024];
267 unsigned int consumed;
268 } output_data;
269};
270
271/* A queue entry for a future asynchronous launch. */
272
273struct kernel_launch
274{
275 struct kernel_info *kernel;
276 void *vars;
277 struct GOMP_kernel_launch_attributes kla;
278};
279
280/* A queue entry for a future callback. */
281
282struct callback
283{
284 void (*fn)(void *);
285 void *data;
286};
287
288/* A data struct for the copy_data callback. */
289
290struct copy_data
291{
292 void *dst;
293 const void *src;
294 size_t len;
295 bool free_src;
296 struct goacc_asyncqueue *aq;
297};
298
299/* A queue entry for a placeholder. These correspond to a wait event. */
300
301struct placeholder
302{
303 int executed;
304 pthread_cond_t cond;
305 pthread_mutex_t mutex;
306};
307
308/* A queue entry for a wait directive. */
309
310struct asyncwait_info
311{
312 struct placeholder *placeholderp;
313};
314
315/* Encode the type of an entry in an async queue. */
316
317enum entry_type
318{
319 KERNEL_LAUNCH,
320 CALLBACK,
321 ASYNC_WAIT,
322 ASYNC_PLACEHOLDER
323};
324
325/* An entry in an async queue. */
326
327struct queue_entry
328{
329 enum entry_type type;
330 union {
331 struct kernel_launch launch;
332 struct callback callback;
333 struct asyncwait_info asyncwait;
334 struct placeholder placeholder;
335 } u;
336};
337
338/* An async queue header.
339
340 OpenMP may create one of these.
341 OpenACC may create many. */
342
343struct goacc_asyncqueue
344{
345 struct agent_info *agent;
346 hsa_queue_t *hsa_queue;
347
348 pthread_t thread_drain_queue;
349 pthread_mutex_t mutex;
350 pthread_cond_t queue_cond_in;
351 pthread_cond_t queue_cond_out;
352 struct queue_entry queue[ASYNC_QUEUE_SIZE];
353 int queue_first;
354 int queue_n;
355 int drain_queue_stop;
356
357 int id;
358 struct goacc_asyncqueue *prev;
359 struct goacc_asyncqueue *next;
360};
361
362/* Mkoffload uses this structure to describe a kernel.
363
364 OpenMP kernel dimensions are passed at runtime.
365 OpenACC kernel dimensions are passed at compile time, here. */
366
367struct hsa_kernel_description
368{
369 const char *name;
370 int oacc_dims[3]; /* Only present for GCN kernels. */
5a28e272
KCY
371 int sgpr_count;
372 int vpgr_count;
237957cc
AS
373};
374
375/* Mkoffload uses this structure to describe an offload variable. */
376
377struct global_var_info
378{
379 const char *name;
380 void *address;
381};
382
383/* Mkoffload uses this structure to describe all the kernels in a
384 loadable module. These are passed the libgomp via static constructors. */
385
386struct gcn_image_desc
387{
388 struct gcn_image {
389 size_t size;
390 void *image;
391 } *gcn_image;
392 const unsigned kernel_count;
393 struct hsa_kernel_description *kernel_infos;
394 const unsigned global_variable_count;
395 struct global_var_info *global_variables;
396};
397
7d593fd6
FH
398/* This enum mirrors the corresponding LLVM enum's values for all ISAs that we
399 support.
400 See https://llvm.org/docs/AMDGPUUsage.html#amdgpu-ef-amdgpu-mach-table */
401
402typedef enum {
7d593fd6
FH
403 EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a,
404 EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c,
405 EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f,
406} EF_AMDGPU_MACH;
407
408const static int EF_AMDGPU_MACH_MASK = 0x000000ff;
409typedef EF_AMDGPU_MACH gcn_isa;
410
237957cc
AS
411/* Description of an HSA GPU agent (device) and the program associated with
412 it. */
413
414struct agent_info
415{
416 /* The HSA ID of the agent. Assigned when hsa_context is initialized. */
417 hsa_agent_t id;
418 /* The user-visible device number. */
419 int device_id;
420 /* Whether the agent has been initialized. The fields below are usable only
421 if it has been. */
422 bool initialized;
7d593fd6
FH
423
424 /* The instruction set architecture of the device. */
425 gcn_isa device_isa;
2e5ea579
FH
426 /* Name of the agent. */
427 char name[64];
428 /* Name of the vendor of the agent. */
429 char vendor_name[64];
237957cc
AS
430 /* Command queues of the agent. */
431 hsa_queue_t *sync_queue;
432 struct goacc_asyncqueue *async_queues, *omp_async_queue;
433 pthread_mutex_t async_queues_mutex;
434
435 /* The HSA memory region from which to allocate kernel arguments. */
436 hsa_region_t kernarg_region;
437
438 /* The HSA memory region from which to allocate device data. */
439 hsa_region_t data_region;
440
441 /* Allocated team arenas. */
442 struct team_arena_list *team_arena_list;
443 pthread_mutex_t team_arena_write_lock;
444
445 /* Read-write lock that protects kernels which are running or about to be run
446 from interference with loading and unloading of images. Needs to be
447 locked for reading while a kernel is being run, and for writing if the
448 list of modules is manipulated (and thus the HSA program invalidated). */
449 pthread_rwlock_t module_rwlock;
450
451 /* The module associated with this kernel. */
452 struct module_info *module;
453
454 /* Mutex enforcing that only one thread will finalize the HSA program. A
455 thread should have locked agent->module_rwlock for reading before
456 acquiring it. */
457 pthread_mutex_t prog_mutex;
458 /* Flag whether the HSA program that consists of all the modules has been
459 finalized. */
460 bool prog_finalized;
461 /* HSA executable - the finalized program that is used to locate kernels. */
462 hsa_executable_t executable;
463};
464
465/* Information required to identify, finalize and run any given kernel. */
466
467enum offload_kind {KIND_UNKNOWN, KIND_OPENMP, KIND_OPENACC};
468
469struct kernel_info
470{
471 /* Name of the kernel, required to locate it within the GCN object-code
472 module. */
473 const char *name;
474 /* The specific agent the kernel has been or will be finalized for and run
475 on. */
476 struct agent_info *agent;
477 /* The specific module where the kernel takes place. */
478 struct module_info *module;
5a28e272
KCY
479 /* Information provided by mkoffload associated with the kernel. */
480 struct hsa_kernel_description *description;
237957cc
AS
481 /* Mutex enforcing that at most once thread ever initializes a kernel for
482 use. A thread should have locked agent->module_rwlock for reading before
483 acquiring it. */
484 pthread_mutex_t init_mutex;
485 /* Flag indicating whether the kernel has been initialized and all fields
486 below it contain valid data. */
487 bool initialized;
488 /* Flag indicating that the kernel has a problem that blocks an execution. */
489 bool initialization_failed;
490 /* The object to be put into the dispatch queue. */
491 uint64_t object;
492 /* Required size of kernel arguments. */
493 uint32_t kernarg_segment_size;
494 /* Required size of group segment. */
495 uint32_t group_segment_size;
496 /* Required size of private segment. */
497 uint32_t private_segment_size;
498 /* Set up for OpenMP or OpenACC? */
499 enum offload_kind kind;
500};
501
502/* Information about a particular GCN module, its image and kernels. */
503
504struct module_info
505{
506 /* The description with which the program has registered the image. */
507 struct gcn_image_desc *image_desc;
508 /* GCN heap allocation. */
509 struct heap *heap;
510 /* Physical boundaries of the loaded module. */
511 Elf64_Addr phys_address_start;
512 Elf64_Addr phys_address_end;
513
514 bool constructors_run_p;
515 struct kernel_info *init_array_func, *fini_array_func;
516
517 /* Number of kernels in this module. */
518 int kernel_count;
519 /* An array of kernel_info structures describing each kernel in this
520 module. */
521 struct kernel_info kernels[];
522};
523
524/* A linked list of memory arenas allocated on the device.
525 These are only used by OpenMP, as a means to optimize per-team malloc. */
526
527struct team_arena_list
528{
529 struct team_arena_list *next;
530
531 /* The number of teams determines the size of the allocation. */
532 int num_teams;
533 /* The device address of the arena itself. */
534 void *arena;
535 /* A flag to prevent two asynchronous kernels trying to use the same arena.
536 The mutex is locked until the kernel exits. */
537 pthread_mutex_t in_use;
538};
539
540/* Information about the whole HSA environment and all of its agents. */
541
542struct hsa_context_info
543{
544 /* Whether the structure has been initialized. */
545 bool initialized;
546 /* Number of usable GPU HSA agents in the system. */
547 int agent_count;
548 /* Array of agent_info structures describing the individual HSA agents. */
549 struct agent_info *agents;
2e5ea579
FH
550 /* Driver version string. */
551 char driver_version_s[30];
237957cc
AS
552};
553
554/* Format of the on-device heap.
555
556 This must match the definition in Newlib and gcn-run. */
557
558struct heap {
559 int64_t size;
560 char data[0];
561};
562
563/* }}} */
564/* {{{ Global variables */
565
566/* Information about the whole HSA environment and all of its agents. */
567
568static struct hsa_context_info hsa_context;
569
570/* HSA runtime functions that are initialized in init_hsa_context. */
571
572static struct hsa_runtime_fn_info hsa_fns;
573
574/* Heap space, allocated target-side, provided for use of newlib malloc.
575 Each module should have it's own heap allocated.
576 Beware that heap usage increases with OpenMP teams. See also arenas. */
577
578static size_t gcn_kernel_heap_size = DEFAULT_GCN_HEAP_SIZE;
579
580/* Flag to decide whether print to stderr information about what is going on.
581 Set in init_debug depending on environment variables. */
582
583static bool debug;
584
585/* Flag to decide if the runtime should suppress a possible fallback to host
586 execution. */
587
588static bool suppress_host_fallback;
589
590/* Flag to locate HSA runtime shared library that is dlopened
591 by this plug-in. */
592
593static const char *hsa_runtime_lib;
594
595/* Flag to decide if the runtime should support also CPU devices (can be
596 a simulator). */
597
598static bool support_cpu_devices;
599
600/* Runtime dimension overrides. Zero indicates default. */
601
602static int override_x_dim = 0;
603static int override_z_dim = 0;
604
605/* }}} */
606/* {{{ Debug & Diagnostic */
607
608/* Print a message to stderr if GCN_DEBUG value is set to true. */
609
610#define DEBUG_PRINT(...) \
611 do \
612 { \
613 if (debug) \
614 { \
615 fprintf (stderr, __VA_ARGS__); \
616 } \
617 } \
618 while (false);
619
620/* Flush stderr if GCN_DEBUG value is set to true. */
621
622#define DEBUG_FLUSH() \
623 do { \
624 if (debug) \
625 fflush (stderr); \
626 } while (false)
627
628/* Print a logging message with PREFIX to stderr if GCN_DEBUG value
629 is set to true. */
630
631#define DEBUG_LOG(prefix, ...) \
632 do \
633 { \
634 DEBUG_PRINT (prefix); \
635 DEBUG_PRINT (__VA_ARGS__); \
636 DEBUG_FLUSH (); \
637 } while (false)
638
639/* Print a debugging message to stderr. */
640
641#define GCN_DEBUG(...) DEBUG_LOG ("GCN debug: ", __VA_ARGS__)
642
643/* Print a warning message to stderr. */
644
645#define GCN_WARNING(...) DEBUG_LOG ("GCN warning: ", __VA_ARGS__)
646
647/* Print HSA warning STR with an HSA STATUS code. */
648
649static void
650hsa_warn (const char *str, hsa_status_t status)
651{
652 if (!debug)
653 return;
654
655 const char *hsa_error_msg = "[unknown]";
656 hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
657
658 fprintf (stderr, "GCN warning: %s\nRuntime message: %s\n", str,
659 hsa_error_msg);
660}
661
662/* Report a fatal error STR together with the HSA error corresponding to STATUS
663 and terminate execution of the current process. */
664
665static void
666hsa_fatal (const char *str, hsa_status_t status)
667{
668 const char *hsa_error_msg = "[unknown]";
669 hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
670 GOMP_PLUGIN_fatal ("GCN fatal error: %s\nRuntime message: %s\n", str,
671 hsa_error_msg);
672}
673
674/* Like hsa_fatal, except only report error message, and return FALSE
675 for propagating error processing to outside of plugin. */
676
677static bool
678hsa_error (const char *str, hsa_status_t status)
679{
680 const char *hsa_error_msg = "[unknown]";
681 hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
682 GOMP_PLUGIN_error ("GCN fatal error: %s\nRuntime message: %s\n", str,
683 hsa_error_msg);
684 return false;
685}
686
687/* Dump information about the available hardware. */
688
689static void
690dump_hsa_system_info (void)
691{
692 hsa_status_t status;
693
694 hsa_endianness_t endianness;
695 status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_ENDIANNESS,
696 &endianness);
697 if (status == HSA_STATUS_SUCCESS)
698 switch (endianness)
699 {
700 case HSA_ENDIANNESS_LITTLE:
701 GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: LITTLE\n");
702 break;
703 case HSA_ENDIANNESS_BIG:
704 GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: BIG\n");
705 break;
706 default:
707 GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: UNKNOWN\n");
708 }
709 else
710 GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: FAILED\n");
711
712 uint8_t extensions[128];
713 status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_EXTENSIONS,
714 &extensions);
715 if (status == HSA_STATUS_SUCCESS)
716 {
717 if (extensions[0] & (1 << HSA_EXTENSION_IMAGES))
718 GCN_DEBUG ("HSA_SYSTEM_INFO_EXTENSIONS: IMAGES\n");
719 }
720 else
721 GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n");
722}
723
724/* Dump information about the available hardware. */
725
726static void
727dump_machine_model (hsa_machine_model_t machine_model, const char *s)
728{
729 switch (machine_model)
730 {
731 case HSA_MACHINE_MODEL_SMALL:
732 GCN_DEBUG ("%s: SMALL\n", s);
733 break;
734 case HSA_MACHINE_MODEL_LARGE:
735 GCN_DEBUG ("%s: LARGE\n", s);
736 break;
737 default:
738 GCN_WARNING ("%s: UNKNOWN\n", s);
739 break;
740 }
741}
742
743/* Dump information about the available hardware. */
744
745static void
746dump_profile (hsa_profile_t profile, const char *s)
747{
748 switch (profile)
749 {
750 case HSA_PROFILE_FULL:
751 GCN_DEBUG ("%s: FULL\n", s);
752 break;
753 case HSA_PROFILE_BASE:
754 GCN_DEBUG ("%s: BASE\n", s);
755 break;
756 default:
757 GCN_WARNING ("%s: UNKNOWN\n", s);
758 break;
759 }
760}
761
762/* Dump information about a device memory region. */
763
764static hsa_status_t
765dump_hsa_region (hsa_region_t region, void *data __attribute__((unused)))
766{
767 hsa_status_t status;
768
769 hsa_region_segment_t segment;
770 status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
771 &segment);
772 if (status == HSA_STATUS_SUCCESS)
773 {
774 if (segment == HSA_REGION_SEGMENT_GLOBAL)
775 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GLOBAL\n");
776 else if (segment == HSA_REGION_SEGMENT_READONLY)
777 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: READONLY\n");
778 else if (segment == HSA_REGION_SEGMENT_PRIVATE)
779 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: PRIVATE\n");
780 else if (segment == HSA_REGION_SEGMENT_GROUP)
781 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GROUP\n");
782 else
783 GCN_WARNING ("HSA_REGION_INFO_SEGMENT: UNKNOWN\n");
784 }
785 else
786 GCN_WARNING ("HSA_REGION_INFO_SEGMENT: FAILED\n");
787
788 if (segment == HSA_REGION_SEGMENT_GLOBAL)
789 {
790 uint32_t flags;
791 status
792 = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
793 &flags);
794 if (status == HSA_STATUS_SUCCESS)
795 {
796 if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
797 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: KERNARG\n");
798 if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED)
799 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: FINE_GRAINED\n");
800 if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED)
801 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: COARSE_GRAINED\n");
802 }
803 else
804 GCN_WARNING ("HSA_REGION_INFO_GLOBAL_FLAGS: FAILED\n");
805 }
806
807 size_t size;
808 status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
809 if (status == HSA_STATUS_SUCCESS)
810 GCN_DEBUG ("HSA_REGION_INFO_SIZE: %zu\n", size);
811 else
812 GCN_WARNING ("HSA_REGION_INFO_SIZE: FAILED\n");
813
814 status
815 = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_ALLOC_MAX_SIZE,
816 &size);
817 if (status == HSA_STATUS_SUCCESS)
818 GCN_DEBUG ("HSA_REGION_INFO_ALLOC_MAX_SIZE: %zu\n", size);
819 else
820 GCN_WARNING ("HSA_REGION_INFO_ALLOC_MAX_SIZE: FAILED\n");
821
822 bool alloc_allowed;
823 status
824 = hsa_fns.hsa_region_get_info_fn (region,
825 HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED,
826 &alloc_allowed);
827 if (status == HSA_STATUS_SUCCESS)
828 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: %u\n", alloc_allowed);
829 else
830 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: FAILED\n");
831
832 if (status != HSA_STATUS_SUCCESS || !alloc_allowed)
833 return HSA_STATUS_SUCCESS;
834
835 status
836 = hsa_fns.hsa_region_get_info_fn (region,
837 HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE,
838 &size);
839 if (status == HSA_STATUS_SUCCESS)
840 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: %zu\n", size);
841 else
842 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: FAILED\n");
843
844 size_t align;
845 status
846 = hsa_fns.hsa_region_get_info_fn (region,
847 HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT,
848 &align);
849 if (status == HSA_STATUS_SUCCESS)
850 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: %zu\n", align);
851 else
852 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: FAILED\n");
853
854 return HSA_STATUS_SUCCESS;
855}
856
857/* Dump information about all the device memory regions. */
858
859static void
860dump_hsa_regions (hsa_agent_t agent)
861{
862 hsa_status_t status;
863 status = hsa_fns.hsa_agent_iterate_regions_fn (agent,
864 dump_hsa_region,
865 NULL);
866 if (status != HSA_STATUS_SUCCESS)
867 hsa_error ("Dumping hsa regions failed", status);
868}
869
870/* Dump information about the available devices. */
871
872static hsa_status_t
873dump_hsa_agent_info (hsa_agent_t agent, void *data __attribute__((unused)))
874{
875 hsa_status_t status;
876
877 char buf[64];
878 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME,
879 &buf);
880 if (status == HSA_STATUS_SUCCESS)
881 GCN_DEBUG ("HSA_AGENT_INFO_NAME: %s\n", buf);
882 else
883 GCN_WARNING ("HSA_AGENT_INFO_NAME: FAILED\n");
884
885 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_VENDOR_NAME,
886 &buf);
887 if (status == HSA_STATUS_SUCCESS)
888 GCN_DEBUG ("HSA_AGENT_INFO_VENDOR_NAME: %s\n", buf);
889 else
890 GCN_WARNING ("HSA_AGENT_INFO_VENDOR_NAME: FAILED\n");
891
892 hsa_machine_model_t machine_model;
893 status
894 = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_MACHINE_MODEL,
895 &machine_model);
896 if (status == HSA_STATUS_SUCCESS)
897 dump_machine_model (machine_model, "HSA_AGENT_INFO_MACHINE_MODEL");
898 else
899 GCN_WARNING ("HSA_AGENT_INFO_MACHINE_MODEL: FAILED\n");
900
901 hsa_profile_t profile;
902 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_PROFILE,
903 &profile);
904 if (status == HSA_STATUS_SUCCESS)
905 dump_profile (profile, "HSA_AGENT_INFO_PROFILE");
906 else
907 GCN_WARNING ("HSA_AGENT_INFO_PROFILE: FAILED\n");
908
909 hsa_device_type_t device_type;
910 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
911 &device_type);
912 if (status == HSA_STATUS_SUCCESS)
913 {
914 switch (device_type)
915 {
916 case HSA_DEVICE_TYPE_CPU:
917 GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: CPU\n");
918 break;
919 case HSA_DEVICE_TYPE_GPU:
920 GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: GPU\n");
921 break;
922 case HSA_DEVICE_TYPE_DSP:
923 GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: DSP\n");
924 break;
925 default:
926 GCN_WARNING ("HSA_AGENT_INFO_DEVICE: UNKNOWN\n");
927 break;
928 }
929 }
930 else
931 GCN_WARNING ("HSA_AGENT_INFO_DEVICE: FAILED\n");
932
933 uint32_t cu_count;
934 status = hsa_fns.hsa_agent_get_info_fn
935 (agent, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
936 if (status == HSA_STATUS_SUCCESS)
937 GCN_DEBUG ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: %u\n", cu_count);
938 else
939 GCN_WARNING ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: FAILED\n");
940
941 uint32_t size;
942 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_WAVEFRONT_SIZE,
943 &size);
944 if (status == HSA_STATUS_SUCCESS)
945 GCN_DEBUG ("HSA_AGENT_INFO_WAVEFRONT_SIZE: %u\n", size);
946 else
947 GCN_WARNING ("HSA_AGENT_INFO_WAVEFRONT_SIZE: FAILED\n");
948
949 uint32_t max_dim;
950 status = hsa_fns.hsa_agent_get_info_fn (agent,
951 HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
952 &max_dim);
953 if (status == HSA_STATUS_SUCCESS)
954 GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: %u\n", max_dim);
955 else
956 GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: FAILED\n");
957
958 uint32_t max_size;
959 status = hsa_fns.hsa_agent_get_info_fn (agent,
960 HSA_AGENT_INFO_WORKGROUP_MAX_SIZE,
961 &max_size);
962 if (status == HSA_STATUS_SUCCESS)
963 GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: %u\n", max_size);
964 else
965 GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: FAILED\n");
966
967 uint32_t grid_max_dim;
968 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_DIM,
969 &grid_max_dim);
970 if (status == HSA_STATUS_SUCCESS)
971 GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_DIM: %u\n", grid_max_dim);
972 else
973 GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_DIM: FAILED\n");
974
975 uint32_t grid_max_size;
976 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_SIZE,
977 &grid_max_size);
978 if (status == HSA_STATUS_SUCCESS)
979 GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_SIZE: %u\n", grid_max_size);
980 else
981 GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_SIZE: FAILED\n");
982
983 dump_hsa_regions (agent);
984
985 return HSA_STATUS_SUCCESS;
986}
987
988/* Forward reference. */
989
990static char *get_executable_symbol_name (hsa_executable_symbol_t symbol);
991
992/* Helper function for dump_executable_symbols. */
993
994static hsa_status_t
995dump_executable_symbol (hsa_executable_t executable,
996 hsa_executable_symbol_t symbol,
997 void *data __attribute__((unused)))
998{
999 char *name = get_executable_symbol_name (symbol);
1000
1001 if (name)
1002 {
1003 GCN_DEBUG ("executable symbol: %s\n", name);
1004 free (name);
1005 }
1006
1007 return HSA_STATUS_SUCCESS;
1008}
1009
1010/* Dump all global symbol in an executable. */
1011
1012static void
1013dump_executable_symbols (hsa_executable_t executable)
1014{
1015 hsa_status_t status;
1016 status
1017 = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
1018 dump_executable_symbol,
1019 NULL);
1020 if (status != HSA_STATUS_SUCCESS)
1021 hsa_fatal ("Could not dump HSA executable symbols", status);
1022}
1023
1024/* Dump kernel DISPATCH data structure and indent it by INDENT spaces. */
1025
1026static void
1027print_kernel_dispatch (struct kernel_dispatch *dispatch, unsigned indent)
1028{
1029 struct kernargs *kernargs = (struct kernargs *)dispatch->kernarg_address;
1030
1031 fprintf (stderr, "%*sthis: %p\n", indent, "", dispatch);
1032 fprintf (stderr, "%*squeue: %p\n", indent, "", dispatch->queue);
1033 fprintf (stderr, "%*skernarg_address: %p\n", indent, "", kernargs);
1034 fprintf (stderr, "%*sheap address: %p\n", indent, "",
1035 (void*)kernargs->heap_ptr);
1036 fprintf (stderr, "%*sarena address: %p\n", indent, "",
1037 (void*)kernargs->arena_ptr);
1038 fprintf (stderr, "%*sobject: %lu\n", indent, "", dispatch->object);
1039 fprintf (stderr, "%*sprivate_segment_size: %u\n", indent, "",
1040 dispatch->private_segment_size);
1041 fprintf (stderr, "%*sgroup_segment_size: %u\n", indent, "",
1042 dispatch->group_segment_size);
1043 fprintf (stderr, "\n");
1044}
1045
1046/* }}} */
1047/* {{{ Utility functions */
1048
1049/* Cast the thread local storage to gcn_thread. */
1050
1051static inline struct gcn_thread *
1052gcn_thread (void)
1053{
1054 return (struct gcn_thread *) GOMP_PLUGIN_acc_thread ();
1055}
1056
1057/* Initialize debug and suppress_host_fallback according to the environment. */
1058
1059static void
1060init_environment_variables (void)
1061{
1062 if (secure_getenv ("GCN_DEBUG"))
1063 debug = true;
1064 else
1065 debug = false;
1066
1067 if (secure_getenv ("GCN_SUPPRESS_HOST_FALLBACK"))
1068 suppress_host_fallback = true;
1069 else
1070 suppress_host_fallback = false;
1071
1072 hsa_runtime_lib = secure_getenv ("HSA_RUNTIME_LIB");
1073 if (hsa_runtime_lib == NULL)
f062c3f1 1074 hsa_runtime_lib = HSA_RUNTIME_LIB "libhsa-runtime64.so.1";
237957cc
AS
1075
1076 support_cpu_devices = secure_getenv ("GCN_SUPPORT_CPU_DEVICES");
1077
1078 const char *x = secure_getenv ("GCN_NUM_TEAMS");
1079 if (!x)
1080 x = secure_getenv ("GCN_NUM_GANGS");
1081 if (x)
1082 override_x_dim = atoi (x);
1083
1084 const char *z = secure_getenv ("GCN_NUM_THREADS");
1085 if (!z)
1086 z = secure_getenv ("GCN_NUM_WORKERS");
1087 if (z)
1088 override_z_dim = atoi (z);
1089
1090 const char *heap = secure_getenv ("GCN_HEAP_SIZE");
1091 if (heap)
1092 {
1093 size_t tmp = atol (heap);
1094 if (tmp)
1095 gcn_kernel_heap_size = tmp;
1096 }
1097}
1098
1099/* Return malloc'd string with name of SYMBOL. */
1100
1101static char *
1102get_executable_symbol_name (hsa_executable_symbol_t symbol)
1103{
1104 hsa_status_t status;
1105 char *res;
1106 uint32_t len;
1107 const hsa_executable_symbol_info_t info_name_length
1108 = HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH;
1109
1110 status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name_length,
1111 &len);
1112 if (status != HSA_STATUS_SUCCESS)
1113 {
1114 hsa_error ("Could not get length of symbol name", status);
1115 return NULL;
1116 }
1117
1118 res = GOMP_PLUGIN_malloc (len + 1);
1119
1120 const hsa_executable_symbol_info_t info_name
1121 = HSA_EXECUTABLE_SYMBOL_INFO_NAME;
1122
1123 status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name, res);
1124
1125 if (status != HSA_STATUS_SUCCESS)
1126 {
1127 hsa_error ("Could not get symbol name", status);
1128 free (res);
1129 return NULL;
1130 }
1131
1132 res[len] = '\0';
1133
1134 return res;
1135}
1136
237957cc
AS
1137/* Get the number of GPU Compute Units. */
1138
1139static int
1140get_cu_count (struct agent_info *agent)
1141{
1142 uint32_t cu_count;
1143 hsa_status_t status = hsa_fns.hsa_agent_get_info_fn
1144 (agent->id, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
1145 if (status == HSA_STATUS_SUCCESS)
1146 return cu_count;
1147 else
1148 return 64; /* The usual number for older devices. */
1149}
1150
1151/* Calculate the maximum grid size for OMP threads / OACC workers.
1152 This depends on the kernel's resource usage levels. */
1153
1154static int
1155limit_worker_threads (int threads)
1156{
1157 /* FIXME Do something more inteligent here.
1158 GCN can always run 4 threads within a Compute Unit, but
1159 more than that depends on register usage. */
1160 if (threads > 16)
1161 threads = 16;
1162 return threads;
1163}
1164
1165/* Parse the target attributes INPUT provided by the compiler and return true
1166 if we should run anything all. If INPUT is NULL, fill DEF with default
1167 values, then store INPUT or DEF into *RESULT.
1168
1169 This is used for OpenMP only. */
1170
1171static bool
1172parse_target_attributes (void **input,
1173 struct GOMP_kernel_launch_attributes *def,
1174 struct GOMP_kernel_launch_attributes **result,
1175 struct agent_info *agent)
1176{
1177 if (!input)
1178 GOMP_PLUGIN_fatal ("No target arguments provided");
1179
1180 bool grid_attrs_found = false;
1181 bool gcn_dims_found = false;
1182 int gcn_teams = 0;
1183 int gcn_threads = 0;
1184 while (*input)
1185 {
1186 intptr_t id = (intptr_t) *input++, val;
1187
1188 if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
1189 val = (intptr_t) *input++;
1190 else
1191 val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
1192
1193 val = (val > INT_MAX) ? INT_MAX : val;
1194
1195 if ((id & GOMP_TARGET_ARG_DEVICE_MASK) == GOMP_DEVICE_GCN
1196 && ((id & GOMP_TARGET_ARG_ID_MASK)
1197 == GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES))
1198 {
1199 grid_attrs_found = true;
1200 break;
1201 }
14e5e746
AS
1202 else if ((id & GOMP_TARGET_ARG_DEVICE_MASK)
1203 == GOMP_TARGET_ARG_DEVICE_ALL)
237957cc
AS
1204 {
1205 gcn_dims_found = true;
1206 switch (id & GOMP_TARGET_ARG_ID_MASK)
1207 {
1208 case GOMP_TARGET_ARG_NUM_TEAMS:
1209 gcn_teams = val;
1210 break;
1211 case GOMP_TARGET_ARG_THREAD_LIMIT:
1212 gcn_threads = limit_worker_threads (val);
1213 break;
1214 default:
1215 ;
1216 }
1217 }
1218 }
1219
1220 if (gcn_dims_found)
1221 {
7d593fd6
FH
1222 if (agent->device_isa == EF_AMDGPU_MACH_AMDGCN_GFX900
1223 && gcn_threads == 0 && override_z_dim == 0)
237957cc
AS
1224 {
1225 gcn_threads = 4;
1226 GCN_WARNING ("VEGA BUG WORKAROUND: reducing default number of "
1227 "threads to 4 per team.\n");
1228 GCN_WARNING (" - If this is not a Vega 10 device, please use "
1229 "GCN_NUM_THREADS=16\n");
1230 }
1231
1232 def->ndim = 3;
1233 /* Fiji has 64 CUs, but Vega20 has 60. */
1234 def->gdims[0] = (gcn_teams > 0) ? gcn_teams : get_cu_count (agent);
1235 /* Each thread is 64 work items wide. */
1236 def->gdims[1] = 64;
1237 /* A work group can have 16 wavefronts. */
1238 def->gdims[2] = (gcn_threads > 0) ? gcn_threads : 16;
1239 def->wdims[0] = 1; /* Single team per work-group. */
1240 def->wdims[1] = 64;
1241 def->wdims[2] = 16;
1242 *result = def;
1243 return true;
1244 }
1245 else if (!grid_attrs_found)
1246 {
1247 def->ndim = 1;
1248 def->gdims[0] = 1;
1249 def->gdims[1] = 1;
1250 def->gdims[2] = 1;
1251 def->wdims[0] = 1;
1252 def->wdims[1] = 1;
1253 def->wdims[2] = 1;
1254 *result = def;
1255 GCN_WARNING ("GOMP_OFFLOAD_run called with no launch attributes\n");
1256 return true;
1257 }
1258
1259 struct GOMP_kernel_launch_attributes *kla;
1260 kla = (struct GOMP_kernel_launch_attributes *) *input;
1261 *result = kla;
1262 if (kla->ndim == 0 || kla->ndim > 3)
1263 GOMP_PLUGIN_fatal ("Invalid number of dimensions (%u)", kla->ndim);
1264
1265 GCN_DEBUG ("GOMP_OFFLOAD_run called with %u dimensions:\n", kla->ndim);
1266 unsigned i;
1267 for (i = 0; i < kla->ndim; i++)
1268 {
1269 GCN_DEBUG (" Dimension %u: grid size %u and group size %u\n", i,
1270 kla->gdims[i], kla->wdims[i]);
1271 if (kla->gdims[i] == 0)
1272 return false;
1273 }
1274 return true;
1275}
1276
1277/* Return the group size given the requested GROUP size, GRID size and number
1278 of grid dimensions NDIM. */
1279
1280static uint32_t
1281get_group_size (uint32_t ndim, uint32_t grid, uint32_t group)
1282{
1283 if (group == 0)
1284 {
1285 /* TODO: Provide a default via environment or device characteristics. */
1286 if (ndim == 1)
1287 group = 64;
1288 else if (ndim == 2)
1289 group = 8;
1290 else
1291 group = 4;
1292 }
1293
1294 if (group > grid)
1295 group = grid;
1296 return group;
1297}
1298
1299/* Atomically store pair of uint16_t values (HEADER and REST) to a PACKET. */
1300
1301static void
1302packet_store_release (uint32_t* packet, uint16_t header, uint16_t rest)
1303{
1304 __atomic_store_n (packet, header | (rest << 16), __ATOMIC_RELEASE);
1305}
1306
1307/* A never-called callback for the HSA command queues. These signal events
1308 that we don't use, so we trigger an error.
1309
1310 This "queue" is not to be confused with the async queues, below. */
1311
1312static void
1313hsa_queue_callback (hsa_status_t status,
1314 hsa_queue_t *queue __attribute__ ((unused)),
1315 void *data __attribute__ ((unused)))
1316{
1317 hsa_fatal ("Asynchronous queue error", status);
1318}
1319
1320/* }}} */
1321/* {{{ HSA initialization */
1322
1323/* Populate hsa_fns with the function addresses from libhsa-runtime64.so. */
1324
1325static bool
1326init_hsa_runtime_functions (void)
1327{
1328#define DLSYM_FN(function) \
1329 hsa_fns.function##_fn = dlsym (handle, #function); \
1330 if (hsa_fns.function##_fn == NULL) \
1331 return false;
1332 void *handle = dlopen (hsa_runtime_lib, RTLD_LAZY);
1333 if (handle == NULL)
1334 return false;
1335
1336 DLSYM_FN (hsa_status_string)
1337 DLSYM_FN (hsa_system_get_info)
1338 DLSYM_FN (hsa_agent_get_info)
1339 DLSYM_FN (hsa_init)
1340 DLSYM_FN (hsa_iterate_agents)
1341 DLSYM_FN (hsa_region_get_info)
1342 DLSYM_FN (hsa_queue_create)
1343 DLSYM_FN (hsa_agent_iterate_regions)
1344 DLSYM_FN (hsa_executable_destroy)
1345 DLSYM_FN (hsa_executable_create)
1346 DLSYM_FN (hsa_executable_global_variable_define)
1347 DLSYM_FN (hsa_executable_load_code_object)
1348 DLSYM_FN (hsa_executable_freeze)
1349 DLSYM_FN (hsa_signal_create)
1350 DLSYM_FN (hsa_memory_allocate)
1351 DLSYM_FN (hsa_memory_assign_agent)
1352 DLSYM_FN (hsa_memory_copy)
1353 DLSYM_FN (hsa_memory_free)
1354 DLSYM_FN (hsa_signal_destroy)
1355 DLSYM_FN (hsa_executable_get_symbol)
1356 DLSYM_FN (hsa_executable_symbol_get_info)
1357 DLSYM_FN (hsa_executable_iterate_symbols)
1358 DLSYM_FN (hsa_queue_add_write_index_release)
1359 DLSYM_FN (hsa_queue_load_read_index_acquire)
1360 DLSYM_FN (hsa_signal_wait_acquire)
1361 DLSYM_FN (hsa_signal_store_relaxed)
1362 DLSYM_FN (hsa_signal_store_release)
1363 DLSYM_FN (hsa_signal_load_acquire)
1364 DLSYM_FN (hsa_queue_destroy)
1365 DLSYM_FN (hsa_code_object_deserialize)
1366 return true;
1367#undef DLSYM_FN
1368}
1369
1370/* Return true if the agent is a GPU and can accept of concurrent submissions
1371 from different threads. */
1372
1373static bool
1374suitable_hsa_agent_p (hsa_agent_t agent)
1375{
1376 hsa_device_type_t device_type;
1377 hsa_status_t status
1378 = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
1379 &device_type);
1380 if (status != HSA_STATUS_SUCCESS)
1381 return false;
1382
1383 switch (device_type)
1384 {
1385 case HSA_DEVICE_TYPE_GPU:
1386 break;
1387 case HSA_DEVICE_TYPE_CPU:
1388 if (!support_cpu_devices)
1389 return false;
1390 break;
1391 default:
1392 return false;
1393 }
1394
1395 uint32_t features = 0;
1396 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_FEATURE,
1397 &features);
1398 if (status != HSA_STATUS_SUCCESS
1399 || !(features & HSA_AGENT_FEATURE_KERNEL_DISPATCH))
1400 return false;
1401 hsa_queue_type_t queue_type;
1402 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_QUEUE_TYPE,
1403 &queue_type);
1404 if (status != HSA_STATUS_SUCCESS
1405 || (queue_type != HSA_QUEUE_TYPE_MULTI))
1406 return false;
1407
1408 return true;
1409}
1410
1411/* Callback of hsa_iterate_agents; if AGENT is a GPU device, increment
1412 agent_count in hsa_context. */
1413
1414static hsa_status_t
1415count_gpu_agents (hsa_agent_t agent, void *data __attribute__ ((unused)))
1416{
1417 if (suitable_hsa_agent_p (agent))
1418 hsa_context.agent_count++;
1419 return HSA_STATUS_SUCCESS;
1420}
1421
1422/* Callback of hsa_iterate_agents; if AGENT is a GPU device, assign the agent
1423 id to the describing structure in the hsa context. The index of the
1424 structure is pointed to by DATA, increment it afterwards. */
1425
1426static hsa_status_t
1427assign_agent_ids (hsa_agent_t agent, void *data)
1428{
1429 if (suitable_hsa_agent_p (agent))
1430 {
1431 int *agent_index = (int *) data;
1432 hsa_context.agents[*agent_index].id = agent;
1433 ++*agent_index;
1434 }
1435 return HSA_STATUS_SUCCESS;
1436}
1437
1438/* Initialize hsa_context if it has not already been done.
1439 Return TRUE on success. */
1440
1441static bool
1442init_hsa_context (void)
1443{
1444 hsa_status_t status;
1445 int agent_index = 0;
1446
1447 if (hsa_context.initialized)
1448 return true;
1449 init_environment_variables ();
1450 if (!init_hsa_runtime_functions ())
1451 {
1452 GCN_WARNING ("Run-time could not be dynamically opened\n");
1453 if (suppress_host_fallback)
1454 GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
1455 return false;
1456 }
1457 status = hsa_fns.hsa_init_fn ();
1458 if (status != HSA_STATUS_SUCCESS)
1459 return hsa_error ("Run-time could not be initialized", status);
1460 GCN_DEBUG ("HSA run-time initialized for GCN\n");
1461
1462 if (debug)
1463 dump_hsa_system_info ();
1464
1465 status = hsa_fns.hsa_iterate_agents_fn (count_gpu_agents, NULL);
1466 if (status != HSA_STATUS_SUCCESS)
1467 return hsa_error ("GCN GPU devices could not be enumerated", status);
1468 GCN_DEBUG ("There are %i GCN GPU devices.\n", hsa_context.agent_count);
1469
1470 hsa_context.agents
1471 = GOMP_PLUGIN_malloc_cleared (hsa_context.agent_count
1472 * sizeof (struct agent_info));
1473 status = hsa_fns.hsa_iterate_agents_fn (assign_agent_ids, &agent_index);
966de09b
AS
1474 if (status != HSA_STATUS_SUCCESS)
1475 return hsa_error ("Scanning compute agents failed", status);
237957cc
AS
1476 if (agent_index != hsa_context.agent_count)
1477 {
1478 GOMP_PLUGIN_error ("Failed to assign IDs to all GCN agents");
1479 return false;
1480 }
1481
1482 if (debug)
1483 {
1484 status = hsa_fns.hsa_iterate_agents_fn (dump_hsa_agent_info, NULL);
1485 if (status != HSA_STATUS_SUCCESS)
1486 GOMP_PLUGIN_error ("Failed to list all HSA runtime agents");
1487 }
1488
2e5ea579
FH
1489 uint16_t minor, major;
1490 status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MINOR,
1491 &minor);
1492 if (status != HSA_STATUS_SUCCESS)
1493 GOMP_PLUGIN_error ("Failed to obtain HSA runtime minor version");
1494 status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MAJOR,
1495 &major);
1496 if (status != HSA_STATUS_SUCCESS)
1497 GOMP_PLUGIN_error ("Failed to obtain HSA runtime major version");
1498
1499 size_t len = sizeof hsa_context.driver_version_s;
1500 int printed = snprintf (hsa_context.driver_version_s, len,
1501 "HSA Runtime %hu.%hu", (unsigned short int)major,
1502 (unsigned short int)minor);
1503 if (printed >= len)
1504 GCN_WARNING ("HSA runtime version string was truncated."
1505 "Version %hu.%hu is too long.", (unsigned short int)major,
1506 (unsigned short int)minor);
1507
237957cc
AS
1508 hsa_context.initialized = true;
1509 return true;
1510}
1511
1512/* Verify that hsa_context has already been initialized and return the
1513 agent_info structure describing device number N. Return NULL on error. */
1514
1515static struct agent_info *
1516get_agent_info (int n)
1517{
1518 if (!hsa_context.initialized)
1519 {
1520 GOMP_PLUGIN_error ("Attempt to use uninitialized GCN context.");
1521 return NULL;
1522 }
1523 if (n >= hsa_context.agent_count)
1524 {
1525 GOMP_PLUGIN_error ("Request to operate on non-existent GCN device %i", n);
1526 return NULL;
1527 }
1528 if (!hsa_context.agents[n].initialized)
1529 {
1530 GOMP_PLUGIN_error ("Attempt to use an uninitialized GCN agent.");
1531 return NULL;
1532 }
1533 return &hsa_context.agents[n];
1534}
1535
1536/* Callback of hsa_agent_iterate_regions, via get_*_memory_region functions.
1537
1538 Selects (breaks at) a suitable region of type KIND. */
1539
1540static hsa_status_t
1541get_memory_region (hsa_region_t region, hsa_region_t *retval,
1542 hsa_region_global_flag_t kind)
1543{
1544 hsa_status_t status;
1545 hsa_region_segment_t segment;
1546
1547 status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
1548 &segment);
1549 if (status != HSA_STATUS_SUCCESS)
1550 return status;
1551 if (segment != HSA_REGION_SEGMENT_GLOBAL)
1552 return HSA_STATUS_SUCCESS;
1553
1554 uint32_t flags;
1555 status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
1556 &flags);
1557 if (status != HSA_STATUS_SUCCESS)
1558 return status;
1559 if (flags & kind)
1560 {
1561 *retval = region;
1562 return HSA_STATUS_INFO_BREAK;
1563 }
1564 return HSA_STATUS_SUCCESS;
1565}
1566
1567/* Callback of hsa_agent_iterate_regions.
1568
1569 Selects a kernargs memory region. */
1570
1571static hsa_status_t
1572get_kernarg_memory_region (hsa_region_t region, void *data)
1573{
1574 return get_memory_region (region, (hsa_region_t *)data,
1575 HSA_REGION_GLOBAL_FLAG_KERNARG);
1576}
1577
1578/* Callback of hsa_agent_iterate_regions.
1579
1580 Selects a coarse-grained memory region suitable for the heap and
1581 offload data. */
1582
1583static hsa_status_t
1584get_data_memory_region (hsa_region_t region, void *data)
1585{
1586 return get_memory_region (region, (hsa_region_t *)data,
1587 HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED);
1588}
1589
7d593fd6
FH
1590static int
1591elf_gcn_isa_field (Elf64_Ehdr *image)
1592{
1593 return image->e_flags & EF_AMDGPU_MACH_MASK;
1594}
1595
7d593fd6
FH
1596const static char *gcn_gfx803_s = "gfx803";
1597const static char *gcn_gfx900_s = "gfx900";
1598const static char *gcn_gfx906_s = "gfx906";
1599const static int gcn_isa_name_len = 6;
1600
1601/* Returns the name that the HSA runtime uses for the ISA or NULL if we do not
1602 support the ISA. */
1603
1604static const char*
1605isa_hsa_name (int isa) {
1606 switch(isa)
1607 {
7d593fd6
FH
1608 case EF_AMDGPU_MACH_AMDGCN_GFX803:
1609 return gcn_gfx803_s;
1610 case EF_AMDGPU_MACH_AMDGCN_GFX900:
1611 return gcn_gfx900_s;
1612 case EF_AMDGPU_MACH_AMDGCN_GFX906:
1613 return gcn_gfx906_s;
1614 }
1615 return NULL;
1616}
1617
1618/* Returns the user-facing name that GCC uses to identify the architecture (e.g.
1619 with -march) or NULL if we do not support the ISA.
1620 Keep in sync with /gcc/config/gcn/gcn.{c,opt}. */
1621
1622static const char*
1623isa_gcc_name (int isa) {
1624 switch(isa)
1625 {
7d593fd6
FH
1626 case EF_AMDGPU_MACH_AMDGCN_GFX803:
1627 return "fiji";
1628 default:
1629 return isa_hsa_name (isa);
1630 }
1631}
1632
1633/* Returns the code which is used in the GCN object code to identify the ISA with
1634 the given name (as used by the HSA runtime). */
1635
1636static gcn_isa
1637isa_code(const char *isa) {
7d593fd6
FH
1638 if (!strncmp (isa, gcn_gfx803_s, gcn_isa_name_len))
1639 return EF_AMDGPU_MACH_AMDGCN_GFX803;
1640
1641 if (!strncmp (isa, gcn_gfx900_s, gcn_isa_name_len))
1642 return EF_AMDGPU_MACH_AMDGCN_GFX900;
1643
1644 if (!strncmp (isa, gcn_gfx906_s, gcn_isa_name_len))
1645 return EF_AMDGPU_MACH_AMDGCN_GFX906;
1646
1647 return -1;
1648}
1649
237957cc
AS
1650/* }}} */
1651/* {{{ Run */
1652
1653/* Create or reuse a team arena.
1654
1655 Team arenas are used by OpenMP to avoid calling malloc multiple times
1656 while setting up each team. This is purely a performance optimization.
1657
1658 Allocating an arena also costs performance, albeit on the host side, so
1659 this function will reuse an existing arena if a large enough one is idle.
1660 The arena is released, but not deallocated, when the kernel exits. */
1661
1662static void *
1663get_team_arena (struct agent_info *agent, int num_teams)
1664{
1665 struct team_arena_list **next_ptr = &agent->team_arena_list;
1666 struct team_arena_list *item;
1667
1668 for (item = *next_ptr; item; next_ptr = &item->next, item = item->next)
1669 {
1670 if (item->num_teams < num_teams)
1671 continue;
1672
1673 if (pthread_mutex_trylock (&item->in_use))
1674 continue;
1675
1676 return item->arena;
1677 }
1678
1679 GCN_DEBUG ("Creating a new arena for %d teams\n", num_teams);
1680
1681 if (pthread_mutex_lock (&agent->team_arena_write_lock))
1682 {
1683 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
1684 return false;
1685 }
1686 item = malloc (sizeof (*item));
1687 item->num_teams = num_teams;
1688 item->next = NULL;
1689 *next_ptr = item;
1690
1691 if (pthread_mutex_init (&item->in_use, NULL))
1692 {
1693 GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
1694 return false;
1695 }
1696 if (pthread_mutex_lock (&item->in_use))
1697 {
1698 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
1699 return false;
1700 }
1701 if (pthread_mutex_unlock (&agent->team_arena_write_lock))
1702 {
1703 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1704 return false;
1705 }
1706
1707 const int TEAM_ARENA_SIZE = 64*1024; /* Must match libgomp.h. */
1708 hsa_status_t status;
1709 status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
1710 TEAM_ARENA_SIZE*num_teams,
1711 &item->arena);
1712 if (status != HSA_STATUS_SUCCESS)
1713 hsa_fatal ("Could not allocate memory for GCN kernel arena", status);
1714 status = hsa_fns.hsa_memory_assign_agent_fn (item->arena, agent->id,
1715 HSA_ACCESS_PERMISSION_RW);
1716 if (status != HSA_STATUS_SUCCESS)
1717 hsa_fatal ("Could not assign arena memory to device", status);
1718
1719 return item->arena;
1720}
1721
1722/* Mark a team arena available for reuse. */
1723
1724static void
1725release_team_arena (struct agent_info* agent, void *arena)
1726{
1727 struct team_arena_list *item;
1728
1729 for (item = agent->team_arena_list; item; item = item->next)
1730 {
1731 if (item->arena == arena)
1732 {
1733 if (pthread_mutex_unlock (&item->in_use))
1734 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1735 return;
1736 }
1737 }
1738 GOMP_PLUGIN_error ("Could not find a GCN arena to release.");
1739}
1740
1741/* Clean up all the allocated team arenas. */
1742
1743static bool
1744destroy_team_arenas (struct agent_info *agent)
1745{
1746 struct team_arena_list *item, *next;
1747
1748 for (item = agent->team_arena_list; item; item = next)
1749 {
1750 next = item->next;
1751 hsa_fns.hsa_memory_free_fn (item->arena);
1752 if (pthread_mutex_destroy (&item->in_use))
1753 {
1754 GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex");
1755 return false;
1756 }
1757 free (item);
1758 }
1759 agent->team_arena_list = NULL;
1760
1761 return true;
1762}
1763
1764/* Allocate memory on a specified device. */
1765
1766static void *
1767alloc_by_agent (struct agent_info *agent, size_t size)
1768{
1769 GCN_DEBUG ("Allocating %zu bytes on device %d\n", size, agent->device_id);
1770
1771 /* Zero-size allocations are invalid, so in order to return a valid pointer
1772 we need to pass a valid size. One source of zero-size allocations is
1773 kernargs for kernels that have no inputs or outputs (the kernel may
1774 only use console output, for example). */
1775 if (size == 0)
1776 size = 4;
1777
1778 void *ptr;
1779 hsa_status_t status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
1780 size, &ptr);
1781 if (status != HSA_STATUS_SUCCESS)
1782 {
1783 hsa_error ("Could not allocate device memory", status);
1784 return NULL;
1785 }
1786
1787 status = hsa_fns.hsa_memory_assign_agent_fn (ptr, agent->id,
1788 HSA_ACCESS_PERMISSION_RW);
1789 if (status != HSA_STATUS_SUCCESS)
1790 {
1791 hsa_error ("Could not assign data memory to device", status);
1792 return NULL;
1793 }
1794
1795 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1796 bool profiling_dispatch_p
1797 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1798 if (profiling_dispatch_p)
1799 {
1800 acc_prof_info *prof_info = thr->prof_info;
1801 acc_event_info data_event_info;
1802 acc_api_info *api_info = thr->api_info;
1803
1804 prof_info->event_type = acc_ev_alloc;
1805
1806 data_event_info.data_event.event_type = prof_info->event_type;
1807 data_event_info.data_event.valid_bytes
1808 = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1809 data_event_info.data_event.parent_construct
1810 = acc_construct_parallel;
1811 data_event_info.data_event.implicit = 1;
1812 data_event_info.data_event.tool_info = NULL;
1813 data_event_info.data_event.var_name = NULL;
1814 data_event_info.data_event.bytes = size;
1815 data_event_info.data_event.host_ptr = NULL;
1816 data_event_info.data_event.device_ptr = (void *) ptr;
1817
1818 api_info->device_api = acc_device_api_other;
1819
1820 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1821 api_info);
1822 }
1823
1824 return ptr;
1825}
1826
1827/* Create kernel dispatch data structure for given KERNEL, along with
1828 the necessary device signals and memory allocations. */
1829
1830static struct kernel_dispatch *
1831create_kernel_dispatch (struct kernel_info *kernel, int num_teams)
1832{
1833 struct agent_info *agent = kernel->agent;
1834 struct kernel_dispatch *shadow
1835 = GOMP_PLUGIN_malloc_cleared (sizeof (struct kernel_dispatch));
1836
1837 shadow->agent = kernel->agent;
1838 shadow->object = kernel->object;
1839
1840 hsa_signal_t sync_signal;
1841 hsa_status_t status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &sync_signal);
1842 if (status != HSA_STATUS_SUCCESS)
1843 hsa_fatal ("Error creating the GCN sync signal", status);
1844
1845 shadow->signal = sync_signal.handle;
1846 shadow->private_segment_size = kernel->private_segment_size;
1847 shadow->group_segment_size = kernel->group_segment_size;
1848
1849 /* We expect kernels to request a single pointer, explicitly, and the
1850 rest of struct kernargs, implicitly. If they request anything else
1851 then something is wrong. */
1852 if (kernel->kernarg_segment_size > 8)
1853 {
1854 GOMP_PLUGIN_fatal ("Unexpectedly large kernargs segment requested");
1855 return NULL;
1856 }
1857
1858 status = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region,
1859 sizeof (struct kernargs),
1860 &shadow->kernarg_address);
1861 if (status != HSA_STATUS_SUCCESS)
1862 hsa_fatal ("Could not allocate memory for GCN kernel arguments", status);
1863 struct kernargs *kernargs = shadow->kernarg_address;
1864
1865 /* Zero-initialize the output_data (minimum needed). */
1866 kernargs->out_ptr = (int64_t)&kernargs->output_data;
1867 kernargs->output_data.next_output = 0;
1868 for (unsigned i = 0;
1869 i < (sizeof (kernargs->output_data.queue)
1870 / sizeof (kernargs->output_data.queue[0]));
1871 i++)
1872 kernargs->output_data.queue[i].written = 0;
1873 kernargs->output_data.consumed = 0;
1874
1875 /* Pass in the heap location. */
1876 kernargs->heap_ptr = (int64_t)kernel->module->heap;
1877
1878 /* Create an arena. */
1879 if (kernel->kind == KIND_OPENMP)
1880 kernargs->arena_ptr = (int64_t)get_team_arena (agent, num_teams);
1881 else
1882 kernargs->arena_ptr = 0;
1883
1884 /* Ensure we can recognize unset return values. */
1885 kernargs->output_data.return_value = 0xcafe0000;
1886
1887 return shadow;
1888}
1889
1890/* Output any data written to console output from the kernel. It is expected
1891 that this function is polled during kernel execution.
1892
1893 We print all entries from the last item printed to the next entry without
1894 a "written" flag. If the "final" flag is set then it'll continue right to
1895 the end.
1896
1897 The print buffer is circular, but the from and to locations don't wrap when
1898 the buffer does, so the output limit is UINT_MAX. The target blocks on
1899 output when the buffer is full. */
1900
1901static void
1902console_output (struct kernel_info *kernel, struct kernargs *kernargs,
1903 bool final)
1904{
1905 unsigned int limit = (sizeof (kernargs->output_data.queue)
1906 / sizeof (kernargs->output_data.queue[0]));
1907
1908 unsigned int from = __atomic_load_n (&kernargs->output_data.consumed,
1909 __ATOMIC_ACQUIRE);
1910 unsigned int to = kernargs->output_data.next_output;
1911
1912 if (from > to)
1913 {
1914 /* Overflow. */
1915 if (final)
1916 printf ("GCN print buffer overflowed.\n");
1917 return;
1918 }
1919
1920 unsigned int i;
1921 for (i = from; i < to; i++)
1922 {
1923 struct printf_data *data = &kernargs->output_data.queue[i%limit];
1924
1925 if (!data->written && !final)
1926 break;
1927
1928 switch (data->type)
1929 {
1930 case 0: printf ("%.128s%ld\n", data->msg, data->ivalue); break;
1931 case 1: printf ("%.128s%f\n", data->msg, data->dvalue); break;
1932 case 2: printf ("%.128s%.128s\n", data->msg, data->text); break;
1933 case 3: printf ("%.128s%.128s", data->msg, data->text); break;
1934 default: printf ("GCN print buffer error!\n"); break;
1935 }
1936 data->written = 0;
1937 __atomic_store_n (&kernargs->output_data.consumed, i+1,
1938 __ATOMIC_RELEASE);
1939 }
1940 fflush (stdout);
1941}
1942
1943/* Release data structure created for a kernel dispatch in SHADOW argument,
1944 and clean up the signal and memory allocations. */
1945
1946static void
1947release_kernel_dispatch (struct kernel_dispatch *shadow)
1948{
1949 GCN_DEBUG ("Released kernel dispatch: %p\n", shadow);
1950
1951 struct kernargs *kernargs = shadow->kernarg_address;
1952 void *arena = (void *)kernargs->arena_ptr;
1953 if (arena)
1954 release_team_arena (shadow->agent, arena);
1955
1956 hsa_fns.hsa_memory_free_fn (shadow->kernarg_address);
1957
1958 hsa_signal_t s;
1959 s.handle = shadow->signal;
1960 hsa_fns.hsa_signal_destroy_fn (s);
1961
1962 free (shadow);
1963}
1964
1965/* Extract the properties from a kernel binary. */
1966
1967static void
1968init_kernel_properties (struct kernel_info *kernel)
1969{
1970 hsa_status_t status;
1971 struct agent_info *agent = kernel->agent;
1972 hsa_executable_symbol_t kernel_symbol;
f062c3f1
AS
1973 char *buf = alloca (strlen (kernel->name) + 4);
1974 sprintf (buf, "%s.kd", kernel->name);
237957cc 1975 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
f062c3f1 1976 buf, agent->id,
237957cc
AS
1977 0, &kernel_symbol);
1978 if (status != HSA_STATUS_SUCCESS)
1979 {
1980 hsa_warn ("Could not find symbol for kernel in the code object", status);
f062c3f1 1981 fprintf (stderr, "not found name: '%s'\n", buf);
237957cc
AS
1982 dump_executable_symbols (agent->executable);
1983 goto failure;
1984 }
1985 GCN_DEBUG ("Located kernel %s\n", kernel->name);
1986 status = hsa_fns.hsa_executable_symbol_get_info_fn
1987 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel->object);
1988 if (status != HSA_STATUS_SUCCESS)
1989 hsa_fatal ("Could not extract a kernel object from its symbol", status);
1990 status = hsa_fns.hsa_executable_symbol_get_info_fn
1991 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
1992 &kernel->kernarg_segment_size);
1993 if (status != HSA_STATUS_SUCCESS)
1994 hsa_fatal ("Could not get info about kernel argument size", status);
1995 status = hsa_fns.hsa_executable_symbol_get_info_fn
1996 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
1997 &kernel->group_segment_size);
1998 if (status != HSA_STATUS_SUCCESS)
1999 hsa_fatal ("Could not get info about kernel group segment size", status);
2000 status = hsa_fns.hsa_executable_symbol_get_info_fn
2001 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
2002 &kernel->private_segment_size);
2003 if (status != HSA_STATUS_SUCCESS)
2004 hsa_fatal ("Could not get info about kernel private segment size",
2005 status);
2006
2007 /* The kernel type is not known until something tries to launch it. */
2008 kernel->kind = KIND_UNKNOWN;
2009
2010 GCN_DEBUG ("Kernel structure for %s fully initialized with "
2011 "following segment sizes: \n", kernel->name);
2012 GCN_DEBUG (" group_segment_size: %u\n",
2013 (unsigned) kernel->group_segment_size);
2014 GCN_DEBUG (" private_segment_size: %u\n",
2015 (unsigned) kernel->private_segment_size);
2016 GCN_DEBUG (" kernarg_segment_size: %u\n",
2017 (unsigned) kernel->kernarg_segment_size);
2018 return;
2019
2020failure:
2021 kernel->initialization_failed = true;
2022}
2023
2024/* Do all the work that is necessary before running KERNEL for the first time.
2025 The function assumes the program has been created, finalized and frozen by
2026 create_and_finalize_hsa_program. */
2027
2028static void
2029init_kernel (struct kernel_info *kernel)
2030{
2031 if (pthread_mutex_lock (&kernel->init_mutex))
2032 GOMP_PLUGIN_fatal ("Could not lock a GCN kernel initialization mutex");
2033 if (kernel->initialized)
2034 {
2035 if (pthread_mutex_unlock (&kernel->init_mutex))
2036 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2037 "mutex");
2038
2039 return;
2040 }
2041
2042 init_kernel_properties (kernel);
2043
2044 if (!kernel->initialization_failed)
2045 {
2046 GCN_DEBUG ("\n");
2047
2048 kernel->initialized = true;
2049 }
2050 if (pthread_mutex_unlock (&kernel->init_mutex))
2051 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2052 "mutex");
2053}
2054
2055/* Run KERNEL on its agent, pass VARS to it as arguments and take
2056 launch attributes from KLA.
2057
2058 MODULE_LOCKED indicates that the caller already holds the lock and
2059 run_kernel need not lock it again.
2060 If AQ is NULL then agent->sync_queue will be used. */
2061
2062static void
2063run_kernel (struct kernel_info *kernel, void *vars,
2064 struct GOMP_kernel_launch_attributes *kla,
2065 struct goacc_asyncqueue *aq, bool module_locked)
2066{
5a28e272
KCY
2067 GCN_DEBUG ("SGPRs: %d, VGPRs: %d\n", kernel->description->sgpr_count,
2068 kernel->description->vpgr_count);
2069
2070 /* Reduce the number of threads/workers if there are insufficient
2071 VGPRs available to run the kernels together. */
2072 if (kla->ndim == 3 && kernel->description->vpgr_count > 0)
2073 {
2074 int granulated_vgprs = (kernel->description->vpgr_count + 3) & ~3;
2075 int max_threads = (256 / granulated_vgprs) * 4;
2076 if (kla->gdims[2] > max_threads)
2077 {
2078 GCN_WARNING ("Too many VGPRs required to support %d threads/workers"
2079 " per team/gang - reducing to %d threads/workers.\n",
2080 kla->gdims[2], max_threads);
2081 kla->gdims[2] = max_threads;
2082 }
2083 }
2084
237957cc
AS
2085 GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel->agent->device_id,
2086 (aq ? aq->id : 0));
2087 GCN_DEBUG ("GCN launch attribs: gdims:[");
2088 int i;
2089 for (i = 0; i < kla->ndim; ++i)
2090 {
2091 if (i)
2092 DEBUG_PRINT (", ");
2093 DEBUG_PRINT ("%u", kla->gdims[i]);
2094 }
2095 DEBUG_PRINT ("], normalized gdims:[");
2096 for (i = 0; i < kla->ndim; ++i)
2097 {
2098 if (i)
2099 DEBUG_PRINT (", ");
2100 DEBUG_PRINT ("%u", kla->gdims[i] / kla->wdims[i]);
2101 }
2102 DEBUG_PRINT ("], wdims:[");
2103 for (i = 0; i < kla->ndim; ++i)
2104 {
2105 if (i)
2106 DEBUG_PRINT (", ");
2107 DEBUG_PRINT ("%u", kla->wdims[i]);
2108 }
2109 DEBUG_PRINT ("]\n");
2110 DEBUG_FLUSH ();
2111
2112 struct agent_info *agent = kernel->agent;
2113 if (!module_locked && pthread_rwlock_rdlock (&agent->module_rwlock))
2114 GOMP_PLUGIN_fatal ("Unable to read-lock a GCN agent rwlock");
2115
2116 if (!agent->initialized)
2117 GOMP_PLUGIN_fatal ("Agent must be initialized");
2118
2119 if (!kernel->initialized)
2120 GOMP_PLUGIN_fatal ("Called kernel must be initialized");
2121
2122 hsa_queue_t *command_q = (aq ? aq->hsa_queue : kernel->agent->sync_queue);
2123
2124 uint64_t index
2125 = hsa_fns.hsa_queue_add_write_index_release_fn (command_q, 1);
2126 GCN_DEBUG ("Got AQL index %llu\n", (long long int) index);
2127
2128 /* Wait until the queue is not full before writing the packet. */
2129 while (index - hsa_fns.hsa_queue_load_read_index_acquire_fn (command_q)
2130 >= command_q->size)
2131 ;
2132
2133 /* Do not allow the dimensions to be overridden when running
2134 constructors or destructors. */
2135 int override_x = kernel->kind == KIND_UNKNOWN ? 0 : override_x_dim;
2136 int override_z = kernel->kind == KIND_UNKNOWN ? 0 : override_z_dim;
2137
2138 hsa_kernel_dispatch_packet_t *packet;
2139 packet = ((hsa_kernel_dispatch_packet_t *) command_q->base_address)
2140 + index % command_q->size;
2141
2142 memset (((uint8_t *) packet) + 4, 0, sizeof (*packet) - 4);
2143 packet->grid_size_x = override_x ? : kla->gdims[0];
2144 packet->workgroup_size_x = get_group_size (kla->ndim,
2145 packet->grid_size_x,
2146 kla->wdims[0]);
2147
2148 if (kla->ndim >= 2)
2149 {
2150 packet->grid_size_y = kla->gdims[1];
2151 packet->workgroup_size_y = get_group_size (kla->ndim, kla->gdims[1],
2152 kla->wdims[1]);
2153 }
2154 else
2155 {
2156 packet->grid_size_y = 1;
2157 packet->workgroup_size_y = 1;
2158 }
2159
2160 if (kla->ndim == 3)
2161 {
2162 packet->grid_size_z = limit_worker_threads (override_z
2163 ? : kla->gdims[2]);
2164 packet->workgroup_size_z = get_group_size (kla->ndim,
2165 packet->grid_size_z,
2166 kla->wdims[2]);
2167 }
2168 else
2169 {
2170 packet->grid_size_z = 1;
2171 packet->workgroup_size_z = 1;
2172 }
2173
2174 GCN_DEBUG ("GCN launch actuals: grid:[%u, %u, %u],"
2175 " normalized grid:[%u, %u, %u], workgroup:[%u, %u, %u]\n",
2176 packet->grid_size_x, packet->grid_size_y, packet->grid_size_z,
2177 packet->grid_size_x / packet->workgroup_size_x,
2178 packet->grid_size_y / packet->workgroup_size_y,
2179 packet->grid_size_z / packet->workgroup_size_z,
2180 packet->workgroup_size_x, packet->workgroup_size_y,
2181 packet->workgroup_size_z);
2182
2183 struct kernel_dispatch *shadow
2184 = create_kernel_dispatch (kernel, packet->grid_size_x);
2185 shadow->queue = command_q;
2186
2187 if (debug)
2188 {
2189 fprintf (stderr, "\nKernel has following dependencies:\n");
2190 print_kernel_dispatch (shadow, 2);
2191 }
2192
2193 packet->private_segment_size = kernel->private_segment_size;
2194 packet->group_segment_size = kernel->group_segment_size;
2195 packet->kernel_object = kernel->object;
2196 packet->kernarg_address = shadow->kernarg_address;
2197 hsa_signal_t s;
2198 s.handle = shadow->signal;
2199 packet->completion_signal = s;
2200 hsa_fns.hsa_signal_store_relaxed_fn (s, 1);
2201 memcpy (shadow->kernarg_address, &vars, sizeof (vars));
2202
2203 GCN_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");
2204
2205 uint16_t header;
2206 header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
2207 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
2208 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
2209
2210 GCN_DEBUG ("Going to dispatch kernel %s on device %d\n", kernel->name,
2211 agent->device_id);
2212
2213 packet_store_release ((uint32_t *) packet, header,
2214 (uint16_t) kla->ndim
2215 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);
2216
2217 hsa_fns.hsa_signal_store_release_fn (command_q->doorbell_signal,
2218 index);
2219
2220 GCN_DEBUG ("Kernel dispatched, waiting for completion\n");
2221
2222 /* Root signal waits with 1ms timeout. */
2223 while (hsa_fns.hsa_signal_wait_acquire_fn (s, HSA_SIGNAL_CONDITION_LT, 1,
2224 1000 * 1000,
2225 HSA_WAIT_STATE_BLOCKED) != 0)
2226 {
2227 console_output (kernel, shadow->kernarg_address, false);
2228 }
2229 console_output (kernel, shadow->kernarg_address, true);
2230
2231 struct kernargs *kernargs = shadow->kernarg_address;
2232 unsigned int return_value = (unsigned int)kernargs->output_data.return_value;
2233
2234 release_kernel_dispatch (shadow);
2235
2236 if (!module_locked && pthread_rwlock_unlock (&agent->module_rwlock))
2237 GOMP_PLUGIN_fatal ("Unable to unlock a GCN agent rwlock");
2238
2239 unsigned int upper = (return_value & ~0xffff) >> 16;
2240 if (upper == 0xcafe)
2241 ; // exit not called, normal termination.
2242 else if (upper == 0xffff)
2243 ; // exit called.
2244 else
2245 {
2246 GOMP_PLUGIN_error ("Possible kernel exit value corruption, 2 most"
2247 " significant bytes aren't 0xffff or 0xcafe: 0x%x\n",
2248 return_value);
2249 abort ();
2250 }
2251
2252 if (upper == 0xffff)
2253 {
2254 unsigned int signal = (return_value >> 8) & 0xff;
2255
2256 if (signal == SIGABRT)
2257 {
2258 GCN_WARNING ("GCN Kernel aborted\n");
2259 abort ();
2260 }
2261 else if (signal != 0)
2262 {
2263 GCN_WARNING ("GCN Kernel received unknown signal\n");
2264 abort ();
2265 }
2266
2267 GCN_DEBUG ("GCN Kernel exited with value: %d\n", return_value & 0xff);
2268 exit (return_value & 0xff);
2269 }
2270}
2271
2272/* }}} */
2273/* {{{ Load/Unload */
2274
2275/* Initialize KERNEL from D and other parameters. Return true on success. */
2276
2277static bool
2278init_basic_kernel_info (struct kernel_info *kernel,
2279 struct hsa_kernel_description *d,
2280 struct agent_info *agent,
2281 struct module_info *module)
2282{
2283 kernel->agent = agent;
2284 kernel->module = module;
2285 kernel->name = d->name;
5a28e272 2286 kernel->description = d;
237957cc
AS
2287 if (pthread_mutex_init (&kernel->init_mutex, NULL))
2288 {
2289 GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");
2290 return false;
2291 }
2292 return true;
2293}
2294
7d593fd6
FH
2295/* Check that the GCN ISA of the given image matches the ISA of the agent. */
2296
2297static bool
2298isa_matches_agent (struct agent_info *agent, Elf64_Ehdr *image)
2299{
2300 int isa_field = elf_gcn_isa_field (image);
2301 const char* isa_s = isa_hsa_name (isa_field);
2302 if (!isa_s)
2303 {
2304 hsa_error ("Unsupported ISA in GCN code object.", HSA_STATUS_ERROR);
2305 return false;
2306 }
2307
2308 if (isa_field != agent->device_isa)
2309 {
2310 char msg[120];
2311 const char *agent_isa_s = isa_hsa_name (agent->device_isa);
2312 const char *agent_isa_gcc_s = isa_gcc_name (agent->device_isa);
2313 assert (agent_isa_s);
2314 assert (agent_isa_gcc_s);
2315
2316 snprintf (msg, sizeof msg,
2317 "GCN code object ISA '%s' does not match GPU ISA '%s'.\n"
2318 "Try to recompile with '-foffload=-march=%s'.\n",
2319 isa_s, agent_isa_s, agent_isa_gcc_s);
2320
2321 hsa_error (msg, HSA_STATUS_ERROR);
2322 return false;
2323 }
2324
2325 return true;
2326}
2327
237957cc
AS
2328/* Create and finalize the program consisting of all loaded modules. */
2329
2330static bool
2331create_and_finalize_hsa_program (struct agent_info *agent)
2332{
2333 hsa_status_t status;
237957cc
AS
2334 bool res = true;
2335 if (pthread_mutex_lock (&agent->prog_mutex))
2336 {
2337 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
2338 return false;
2339 }
2340 if (agent->prog_finalized)
2341 goto final;
2342
2343 status
2344 = hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
2345 HSA_EXECUTABLE_STATE_UNFROZEN,
2346 "", &agent->executable);
2347 if (status != HSA_STATUS_SUCCESS)
2348 {
2349 hsa_error ("Could not create GCN executable", status);
2350 goto fail;
2351 }
2352
2353 /* Load any GCN modules. */
2354 struct module_info *module = agent->module;
2355 if (module)
2356 {
2357 Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;
2358
7d593fd6
FH
2359 if (!isa_matches_agent (agent, image))
2360 goto fail;
2361
237957cc
AS
2362 hsa_code_object_t co = { 0 };
2363 status = hsa_fns.hsa_code_object_deserialize_fn
2364 (module->image_desc->gcn_image->image,
2365 module->image_desc->gcn_image->size,
2366 NULL, &co);
2367 if (status != HSA_STATUS_SUCCESS)
2368 {
2369 hsa_error ("Could not deserialize GCN code object", status);
2370 goto fail;
2371 }
2372
2373 status = hsa_fns.hsa_executable_load_code_object_fn
2374 (agent->executable, agent->id, co, "");
2375 if (status != HSA_STATUS_SUCCESS)
2376 {
2377 hsa_error ("Could not load GCN code object", status);
2378 goto fail;
2379 }
2380
2381 if (!module->heap)
2382 {
2383 status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
2384 gcn_kernel_heap_size,
2385 (void**)&module->heap);
2386 if (status != HSA_STATUS_SUCCESS)
2387 {
2388 hsa_error ("Could not allocate memory for GCN heap", status);
2389 goto fail;
2390 }
2391
2392 status = hsa_fns.hsa_memory_assign_agent_fn
2393 (module->heap, agent->id, HSA_ACCESS_PERMISSION_RW);
2394 if (status != HSA_STATUS_SUCCESS)
2395 {
2396 hsa_error ("Could not assign GCN heap memory to device", status);
2397 goto fail;
2398 }
2399
2400 hsa_fns.hsa_memory_copy_fn (&module->heap->size,
2401 &gcn_kernel_heap_size,
2402 sizeof (gcn_kernel_heap_size));
2403 }
2404
2405 }
2406
2407 if (debug)
2408 dump_executable_symbols (agent->executable);
2409
2410 status = hsa_fns.hsa_executable_freeze_fn (agent->executable, "");
2411 if (status != HSA_STATUS_SUCCESS)
2412 {
2413 hsa_error ("Could not freeze the GCN executable", status);
2414 goto fail;
2415 }
2416
237957cc
AS
2417final:
2418 agent->prog_finalized = true;
2419
2420 if (pthread_mutex_unlock (&agent->prog_mutex))
2421 {
2422 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
2423 res = false;
2424 }
2425
2426 return res;
2427
2428fail:
2429 res = false;
2430 goto final;
2431}
2432
2433/* Free the HSA program in agent and everything associated with it and set
2434 agent->prog_finalized and the initialized flags of all kernels to false.
2435 Return TRUE on success. */
2436
2437static bool
2438destroy_hsa_program (struct agent_info *agent)
2439{
2440 if (!agent->prog_finalized)
2441 return true;
2442
2443 hsa_status_t status;
2444
2445 GCN_DEBUG ("Destroying the current GCN program.\n");
2446
2447 status = hsa_fns.hsa_executable_destroy_fn (agent->executable);
2448 if (status != HSA_STATUS_SUCCESS)
2449 return hsa_error ("Could not destroy GCN executable", status);
2450
2451 if (agent->module)
2452 {
2453 int i;
2454 for (i = 0; i < agent->module->kernel_count; i++)
2455 agent->module->kernels[i].initialized = false;
2456
2457 if (agent->module->heap)
2458 {
2459 hsa_fns.hsa_memory_free_fn (agent->module->heap);
2460 agent->module->heap = NULL;
2461 }
2462 }
2463 agent->prog_finalized = false;
2464 return true;
2465}
2466
2467/* Deinitialize all information associated with MODULE and kernels within
2468 it. Return TRUE on success. */
2469
2470static bool
2471destroy_module (struct module_info *module, bool locked)
2472{
2473 /* Run destructors before destroying module. */
2474 struct GOMP_kernel_launch_attributes kla =
2475 { 3,
2476 /* Grid size. */
2477 { 1, 64, 1 },
2478 /* Work-group size. */
2479 { 1, 64, 1 }
2480 };
2481
2482 if (module->fini_array_func)
2483 {
2484 init_kernel (module->fini_array_func);
2485 run_kernel (module->fini_array_func, NULL, &kla, NULL, locked);
2486 }
2487 module->constructors_run_p = false;
2488
2489 int i;
2490 for (i = 0; i < module->kernel_count; i++)
2491 if (pthread_mutex_destroy (&module->kernels[i].init_mutex))
2492 {
2493 GOMP_PLUGIN_error ("Failed to destroy a GCN kernel initialization "
2494 "mutex");
2495 return false;
2496 }
2497
2498 return true;
2499}
2500
2501/* }}} */
2502/* {{{ Async */
2503
2504/* Callback of dispatch queues to report errors. */
2505
2506static void
2507execute_queue_entry (struct goacc_asyncqueue *aq, int index)
2508{
2509 struct queue_entry *entry = &aq->queue[index];
2510
2511 switch (entry->type)
2512 {
2513 case KERNEL_LAUNCH:
2514 if (DEBUG_QUEUES)
2515 GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d)\n",
2516 aq->agent->device_id, aq->id, index);
2517 run_kernel (entry->u.launch.kernel,
2518 entry->u.launch.vars,
2519 &entry->u.launch.kla, aq, false);
2520 if (DEBUG_QUEUES)
2521 GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d) done\n",
2522 aq->agent->device_id, aq->id, index);
2523 break;
2524
2525 case CALLBACK:
2526 if (DEBUG_QUEUES)
2527 GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d)\n",
2528 aq->agent->device_id, aq->id, index);
2529 entry->u.callback.fn (entry->u.callback.data);
2530 if (DEBUG_QUEUES)
2531 GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d) done\n",
2532 aq->agent->device_id, aq->id, index);
2533 break;
2534
2535 case ASYNC_WAIT:
2536 {
2537 /* FIXME: is it safe to access a placeholder that may already have
2538 been executed? */
2539 struct placeholder *placeholderp = entry->u.asyncwait.placeholderp;
2540
2541 if (DEBUG_QUEUES)
2542 GCN_DEBUG ("Async thread %d:%d: Executing async wait entry (%d)\n",
2543 aq->agent->device_id, aq->id, index);
2544
2545 pthread_mutex_lock (&placeholderp->mutex);
2546
2547 while (!placeholderp->executed)
2548 pthread_cond_wait (&placeholderp->cond, &placeholderp->mutex);
2549
2550 pthread_mutex_unlock (&placeholderp->mutex);
2551
2552 if (pthread_cond_destroy (&placeholderp->cond))
2553 GOMP_PLUGIN_error ("Failed to destroy serialization cond");
2554
2555 if (pthread_mutex_destroy (&placeholderp->mutex))
2556 GOMP_PLUGIN_error ("Failed to destroy serialization mutex");
2557
2558 if (DEBUG_QUEUES)
2559 GCN_DEBUG ("Async thread %d:%d: Executing async wait "
2560 "entry (%d) done\n", aq->agent->device_id, aq->id, index);
2561 }
2562 break;
2563
2564 case ASYNC_PLACEHOLDER:
2565 pthread_mutex_lock (&entry->u.placeholder.mutex);
2566 entry->u.placeholder.executed = 1;
2567 pthread_cond_signal (&entry->u.placeholder.cond);
2568 pthread_mutex_unlock (&entry->u.placeholder.mutex);
2569 break;
2570
2571 default:
2572 GOMP_PLUGIN_fatal ("Unknown queue element");
2573 }
2574}
2575
2576/* This function is run as a thread to service an async queue in the
2577 background. It runs continuously until the stop flag is set. */
2578
2579static void *
2580drain_queue (void *thread_arg)
2581{
2582 struct goacc_asyncqueue *aq = thread_arg;
2583
2584 if (DRAIN_QUEUE_SYNCHRONOUS_P)
2585 {
2586 aq->drain_queue_stop = 2;
2587 return NULL;
2588 }
2589
2590 pthread_mutex_lock (&aq->mutex);
2591
2592 while (true)
2593 {
2594 if (aq->drain_queue_stop)
2595 break;
2596
2597 if (aq->queue_n > 0)
2598 {
2599 pthread_mutex_unlock (&aq->mutex);
2600 execute_queue_entry (aq, aq->queue_first);
2601
2602 pthread_mutex_lock (&aq->mutex);
2603 aq->queue_first = ((aq->queue_first + 1)
2604 % ASYNC_QUEUE_SIZE);
2605 aq->queue_n--;
2606
2607 if (DEBUG_THREAD_SIGNAL)
2608 GCN_DEBUG ("Async thread %d:%d: broadcasting queue out update\n",
2609 aq->agent->device_id, aq->id);
2610 pthread_cond_broadcast (&aq->queue_cond_out);
2611 pthread_mutex_unlock (&aq->mutex);
2612
2613 if (DEBUG_QUEUES)
2614 GCN_DEBUG ("Async thread %d:%d: continue\n", aq->agent->device_id,
2615 aq->id);
2616 pthread_mutex_lock (&aq->mutex);
2617 }
2618 else
2619 {
2620 if (DEBUG_THREAD_SLEEP)
2621 GCN_DEBUG ("Async thread %d:%d: going to sleep\n",
2622 aq->agent->device_id, aq->id);
2623 pthread_cond_wait (&aq->queue_cond_in, &aq->mutex);
2624 if (DEBUG_THREAD_SLEEP)
2625 GCN_DEBUG ("Async thread %d:%d: woke up, rechecking\n",
2626 aq->agent->device_id, aq->id);
2627 }
2628 }
2629
2630 aq->drain_queue_stop = 2;
2631 if (DEBUG_THREAD_SIGNAL)
2632 GCN_DEBUG ("Async thread %d:%d: broadcasting last queue out update\n",
2633 aq->agent->device_id, aq->id);
2634 pthread_cond_broadcast (&aq->queue_cond_out);
2635 pthread_mutex_unlock (&aq->mutex);
2636
2637 GCN_DEBUG ("Async thread %d:%d: returning\n", aq->agent->device_id, aq->id);
2638 return NULL;
2639}
2640
2641/* This function is used only when DRAIN_QUEUE_SYNCHRONOUS_P is set, which
2642 is not usually the case. This is just a debug tool. */
2643
2644static void
2645drain_queue_synchronous (struct goacc_asyncqueue *aq)
2646{
2647 pthread_mutex_lock (&aq->mutex);
2648
2649 while (aq->queue_n > 0)
2650 {
2651 execute_queue_entry (aq, aq->queue_first);
2652
2653 aq->queue_first = ((aq->queue_first + 1)
2654 % ASYNC_QUEUE_SIZE);
2655 aq->queue_n--;
2656 }
2657
2658 pthread_mutex_unlock (&aq->mutex);
2659}
2660
d88b27da
JB
2661/* Block the current thread until an async queue is writable. The aq->mutex
2662 lock should be held on entry, and remains locked on exit. */
237957cc
AS
2663
2664static void
2665wait_for_queue_nonfull (struct goacc_asyncqueue *aq)
2666{
2667 if (aq->queue_n == ASYNC_QUEUE_SIZE)
2668 {
237957cc
AS
2669 /* Queue is full. Wait for it to not be full. */
2670 while (aq->queue_n == ASYNC_QUEUE_SIZE)
2671 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
237957cc
AS
2672 }
2673}
2674
2675/* Request an asynchronous kernel launch on the specified queue. This
2676 may block if the queue is full, but returns without waiting for the
2677 kernel to run. */
2678
2679static void
2680queue_push_launch (struct goacc_asyncqueue *aq, struct kernel_info *kernel,
2681 void *vars, struct GOMP_kernel_launch_attributes *kla)
2682{
2683 assert (aq->agent == kernel->agent);
2684
237957cc
AS
2685 pthread_mutex_lock (&aq->mutex);
2686
d88b27da
JB
2687 wait_for_queue_nonfull (aq);
2688
237957cc
AS
2689 int queue_last = ((aq->queue_first + aq->queue_n)
2690 % ASYNC_QUEUE_SIZE);
2691 if (DEBUG_QUEUES)
2692 GCN_DEBUG ("queue_push_launch %d:%d: at %i\n", aq->agent->device_id,
2693 aq->id, queue_last);
2694
2695 aq->queue[queue_last].type = KERNEL_LAUNCH;
2696 aq->queue[queue_last].u.launch.kernel = kernel;
2697 aq->queue[queue_last].u.launch.vars = vars;
2698 aq->queue[queue_last].u.launch.kla = *kla;
2699
2700 aq->queue_n++;
2701
2702 if (DEBUG_THREAD_SIGNAL)
2703 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2704 aq->agent->device_id, aq->id);
2705 pthread_cond_signal (&aq->queue_cond_in);
2706
2707 pthread_mutex_unlock (&aq->mutex);
2708}
2709
2710/* Request an asynchronous callback on the specified queue. The callback
2711 function will be called, with the given opaque data, from the appropriate
2712 async thread, when all previous items on that queue are complete. */
2713
2714static void
2715queue_push_callback (struct goacc_asyncqueue *aq, void (*fn)(void *),
2716 void *data)
2717{
237957cc
AS
2718 pthread_mutex_lock (&aq->mutex);
2719
d88b27da
JB
2720 wait_for_queue_nonfull (aq);
2721
237957cc
AS
2722 int queue_last = ((aq->queue_first + aq->queue_n)
2723 % ASYNC_QUEUE_SIZE);
2724 if (DEBUG_QUEUES)
2725 GCN_DEBUG ("queue_push_callback %d:%d: at %i\n", aq->agent->device_id,
2726 aq->id, queue_last);
2727
2728 aq->queue[queue_last].type = CALLBACK;
2729 aq->queue[queue_last].u.callback.fn = fn;
2730 aq->queue[queue_last].u.callback.data = data;
2731
2732 aq->queue_n++;
2733
2734 if (DEBUG_THREAD_SIGNAL)
2735 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2736 aq->agent->device_id, aq->id);
2737 pthread_cond_signal (&aq->queue_cond_in);
2738
2739 pthread_mutex_unlock (&aq->mutex);
2740}
2741
2742/* Request that a given async thread wait for another thread (unspecified) to
2743 reach the given placeholder. The wait will occur when all previous entries
2744 on the queue are complete. A placeholder is effectively a kind of signal
2745 which simply sets a flag when encountered in a queue. */
2746
2747static void
2748queue_push_asyncwait (struct goacc_asyncqueue *aq,
2749 struct placeholder *placeholderp)
2750{
237957cc
AS
2751 pthread_mutex_lock (&aq->mutex);
2752
d88b27da
JB
2753 wait_for_queue_nonfull (aq);
2754
237957cc
AS
2755 int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2756 if (DEBUG_QUEUES)
2757 GCN_DEBUG ("queue_push_asyncwait %d:%d: at %i\n", aq->agent->device_id,
2758 aq->id, queue_last);
2759
2760 aq->queue[queue_last].type = ASYNC_WAIT;
2761 aq->queue[queue_last].u.asyncwait.placeholderp = placeholderp;
2762
2763 aq->queue_n++;
2764
2765 if (DEBUG_THREAD_SIGNAL)
2766 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2767 aq->agent->device_id, aq->id);
2768 pthread_cond_signal (&aq->queue_cond_in);
2769
2770 pthread_mutex_unlock (&aq->mutex);
2771}
2772
2773/* Add a placeholder into an async queue. When the async thread reaches the
2774 placeholder it will set the "executed" flag to true and continue.
2775 Another thread may be waiting on this thread reaching the placeholder. */
2776
2777static struct placeholder *
2778queue_push_placeholder (struct goacc_asyncqueue *aq)
2779{
2780 struct placeholder *placeholderp;
2781
237957cc
AS
2782 pthread_mutex_lock (&aq->mutex);
2783
d88b27da
JB
2784 wait_for_queue_nonfull (aq);
2785
237957cc
AS
2786 int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2787 if (DEBUG_QUEUES)
2788 GCN_DEBUG ("queue_push_placeholder %d:%d: at %i\n", aq->agent->device_id,
2789 aq->id, queue_last);
2790
2791 aq->queue[queue_last].type = ASYNC_PLACEHOLDER;
2792 placeholderp = &aq->queue[queue_last].u.placeholder;
2793
2794 if (pthread_mutex_init (&placeholderp->mutex, NULL))
2795 {
2796 pthread_mutex_unlock (&aq->mutex);
2797 GOMP_PLUGIN_error ("Failed to initialize serialization mutex");
2798 }
2799
2800 if (pthread_cond_init (&placeholderp->cond, NULL))
2801 {
2802 pthread_mutex_unlock (&aq->mutex);
2803 GOMP_PLUGIN_error ("Failed to initialize serialization cond");
2804 }
2805
2806 placeholderp->executed = 0;
2807
2808 aq->queue_n++;
2809
2810 if (DEBUG_THREAD_SIGNAL)
2811 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2812 aq->agent->device_id, aq->id);
2813 pthread_cond_signal (&aq->queue_cond_in);
2814
2815 pthread_mutex_unlock (&aq->mutex);
2816
2817 return placeholderp;
2818}
2819
2820/* Signal an asynchronous thread to terminate, and wait for it to do so. */
2821
2822static void
2823finalize_async_thread (struct goacc_asyncqueue *aq)
2824{
2825 pthread_mutex_lock (&aq->mutex);
2826 if (aq->drain_queue_stop == 2)
2827 {
2828 pthread_mutex_unlock (&aq->mutex);
2829 return;
2830 }
2831
2832 aq->drain_queue_stop = 1;
2833
2834 if (DEBUG_THREAD_SIGNAL)
2835 GCN_DEBUG ("Signalling async thread %d:%d: cond_in\n",
2836 aq->agent->device_id, aq->id);
2837 pthread_cond_signal (&aq->queue_cond_in);
2838
2839 while (aq->drain_queue_stop != 2)
2840 {
2841 if (DEBUG_THREAD_SLEEP)
2842 GCN_DEBUG ("Waiting for async thread %d:%d to finish, putting thread"
2843 " to sleep\n", aq->agent->device_id, aq->id);
2844 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
2845 if (DEBUG_THREAD_SLEEP)
2846 GCN_DEBUG ("Waiting, woke up thread %d:%d. Rechecking\n",
2847 aq->agent->device_id, aq->id);
2848 }
2849
2850 GCN_DEBUG ("Done waiting for async thread %d:%d\n", aq->agent->device_id,
2851 aq->id);
2852 pthread_mutex_unlock (&aq->mutex);
2853
2854 int err = pthread_join (aq->thread_drain_queue, NULL);
2855 if (err != 0)
2856 GOMP_PLUGIN_fatal ("Join async thread %d:%d: failed: %s",
2857 aq->agent->device_id, aq->id, strerror (err));
2858 GCN_DEBUG ("Joined with async thread %d:%d\n", aq->agent->device_id, aq->id);
2859}
2860
2861/* Set up an async queue for OpenMP. There will be only one. The
2862 implementation simply uses an OpenACC async queue.
2863 FIXME: is this thread-safe if two threads call this function? */
2864
2865static void
2866maybe_init_omp_async (struct agent_info *agent)
2867{
2868 if (!agent->omp_async_queue)
2869 agent->omp_async_queue
2870 = GOMP_OFFLOAD_openacc_async_construct (agent->device_id);
2871}
2872
8d2f4ddf
JB
2873/* A wrapper that works around an issue in the HSA runtime with host-to-device
2874 copies from read-only pages. */
2875
2876static void
2877hsa_memory_copy_wrapper (void *dst, const void *src, size_t len)
2878{
2879 hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, len);
2880
2881 if (status == HSA_STATUS_SUCCESS)
2882 return;
2883
2884 /* It appears that the copy fails if the source data is in a read-only page.
2885 We can't detect that easily, so try copying the data to a temporary buffer
2886 and doing the copy again if we got an error above. */
2887
2888 GCN_WARNING ("Read-only data transfer bug workaround triggered for "
2889 "[%p:+%d]\n", (void *) src, (int) len);
2890
2891 void *src_copy = malloc (len);
2892 memcpy (src_copy, src, len);
2893 status = hsa_fns.hsa_memory_copy_fn (dst, (const void *) src_copy, len);
2894 free (src_copy);
2895 if (status != HSA_STATUS_SUCCESS)
2896 GOMP_PLUGIN_error ("memory copy failed");
2897}
2898
237957cc
AS
2899/* Copy data to or from a device. This is intended for use as an async
2900 callback event. */
2901
2902static void
2903copy_data (void *data_)
2904{
2905 struct copy_data *data = (struct copy_data *)data_;
2906 GCN_DEBUG ("Async thread %d:%d: Copying %zu bytes from (%p) to (%p)\n",
2907 data->aq->agent->device_id, data->aq->id, data->len, data->src,
2908 data->dst);
8d2f4ddf 2909 hsa_memory_copy_wrapper (data->dst, data->src, data->len);
237957cc
AS
2910 if (data->free_src)
2911 free ((void *) data->src);
2912 free (data);
2913}
2914
2915/* Free device data. This is intended for use as an async callback event. */
2916
2917static void
2918gomp_offload_free (void *ptr)
2919{
2920 GCN_DEBUG ("Async thread ?:?: Freeing %p\n", ptr);
2921 GOMP_OFFLOAD_free (0, ptr);
2922}
2923
2924/* Request an asynchronous data copy, to or from a device, on a given queue.
2925 The event will be registered as a callback. If FREE_SRC is true
2926 then the source data will be freed following the copy. */
2927
2928static void
2929queue_push_copy (struct goacc_asyncqueue *aq, void *dst, const void *src,
2930 size_t len, bool free_src)
2931{
2932 if (DEBUG_QUEUES)
2933 GCN_DEBUG ("queue_push_copy %d:%d: %zu bytes from (%p) to (%p)\n",
2934 aq->agent->device_id, aq->id, len, src, dst);
2935 struct copy_data *data
2936 = (struct copy_data *)GOMP_PLUGIN_malloc (sizeof (struct copy_data));
2937 data->dst = dst;
2938 data->src = src;
2939 data->len = len;
2940 data->free_src = free_src;
2941 data->aq = aq;
2942 queue_push_callback (aq, copy_data, data);
2943}
2944
2945/* Return true if the given queue is currently empty. */
2946
2947static int
2948queue_empty (struct goacc_asyncqueue *aq)
2949{
2950 pthread_mutex_lock (&aq->mutex);
2951 int res = aq->queue_n == 0 ? 1 : 0;
2952 pthread_mutex_unlock (&aq->mutex);
2953
2954 return res;
2955}
2956
2957/* Wait for a given queue to become empty. This implements an OpenACC wait
2958 directive. */
2959
2960static void
2961wait_queue (struct goacc_asyncqueue *aq)
2962{
2963 if (DRAIN_QUEUE_SYNCHRONOUS_P)
2964 {
2965 drain_queue_synchronous (aq);
2966 return;
2967 }
2968
2969 pthread_mutex_lock (&aq->mutex);
2970
2971 while (aq->queue_n > 0)
2972 {
2973 if (DEBUG_THREAD_SLEEP)
2974 GCN_DEBUG ("waiting for thread %d:%d, putting thread to sleep\n",
2975 aq->agent->device_id, aq->id);
2976 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
2977 if (DEBUG_THREAD_SLEEP)
2978 GCN_DEBUG ("thread %d:%d woke up. Rechecking\n", aq->agent->device_id,
2979 aq->id);
2980 }
2981
2982 pthread_mutex_unlock (&aq->mutex);
2983 GCN_DEBUG ("waiting for thread %d:%d, done\n", aq->agent->device_id, aq->id);
2984}
2985
2986/* }}} */
2987/* {{{ OpenACC support */
2988
2989/* Execute an OpenACC kernel, synchronously or asynchronously. */
2990
2991static void
2992gcn_exec (struct kernel_info *kernel, size_t mapnum, void **hostaddrs,
2993 void **devaddrs, unsigned *dims, void *targ_mem_desc, bool async,
2994 struct goacc_asyncqueue *aq)
2995{
2996 if (!GOMP_OFFLOAD_can_run (kernel))
2997 GOMP_PLUGIN_fatal ("OpenACC host fallback unimplemented.");
2998
2999 /* If we get here then this must be an OpenACC kernel. */
3000 kernel->kind = KIND_OPENACC;
3001
3002 /* devaddrs must be double-indirect on the target. */
3003 void **ind_da = alloc_by_agent (kernel->agent, sizeof (void*) * mapnum);
3004 for (size_t i = 0; i < mapnum; i++)
3005 hsa_fns.hsa_memory_copy_fn (&ind_da[i],
3006 devaddrs[i] ? &devaddrs[i] : &hostaddrs[i],
3007 sizeof (void *));
3008
3009 struct hsa_kernel_description *hsa_kernel_desc = NULL;
3010 for (unsigned i = 0; i < kernel->module->image_desc->kernel_count; i++)
3011 {
3012 struct hsa_kernel_description *d
3013 = &kernel->module->image_desc->kernel_infos[i];
3014 if (d->name == kernel->name)
3015 {
3016 hsa_kernel_desc = d;
3017 break;
3018 }
3019 }
3020
3021 /* We may have statically-determined dimensions in
3022 hsa_kernel_desc->oacc_dims[] or dimensions passed to this offload kernel
3023 invocation at runtime in dims[]. We allow static dimensions to take
3024 priority over dynamic dimensions when present (non-zero). */
3025 if (hsa_kernel_desc->oacc_dims[0] > 0)
3026 dims[0] = hsa_kernel_desc->oacc_dims[0];
3027 if (hsa_kernel_desc->oacc_dims[1] > 0)
3028 dims[1] = hsa_kernel_desc->oacc_dims[1];
3029 if (hsa_kernel_desc->oacc_dims[2] > 0)
3030 dims[2] = hsa_kernel_desc->oacc_dims[2];
3031
3032 /* If any of the OpenACC dimensions remain 0 then we get to pick a number.
3033 There isn't really a correct answer for this without a clue about the
3034 problem size, so let's do a reasonable number of single-worker gangs.
3035 64 gangs matches a typical Fiji device. */
3036
3037 /* NOTE: Until support for middle-end worker partitioning is merged, use 1
3038 for the default number of workers. */
3039 if (dims[0] == 0) dims[0] = get_cu_count (kernel->agent); /* Gangs. */
3040 if (dims[1] == 0) dims[1] = 1; /* Workers. */
3041
3042 /* The incoming dimensions are expressed in terms of gangs, workers, and
3043 vectors. The HSA dimensions are expressed in terms of "work-items",
3044 which means multiples of vector lanes.
3045
3046 The "grid size" specifies the size of the problem space, and the
3047 "work-group size" specifies how much of that we want a single compute
3048 unit to chew on at once.
3049
3050 The three dimensions do not really correspond to hardware, but the
3051 important thing is that the HSA runtime will launch as many
3052 work-groups as it takes to process the entire grid, and each
3053 work-group will contain as many wave-fronts as it takes to process
3054 the work-items in that group.
3055
3056 Essentially, as long as we set the Y dimension to 64 (the number of
3057 vector lanes in hardware), and the Z group size to the maximum (16),
3058 then we will get the gangs (X) and workers (Z) launched as we expect.
3059
3060 The reason for the apparent reversal of vector and worker dimension
3061 order is to do with the way the run-time distributes work-items across
3062 v1 and v2. */
3063 struct GOMP_kernel_launch_attributes kla =
3064 {3,
3065 /* Grid size. */
3066 {dims[0], 64, dims[1]},
3067 /* Work-group size. */
3068 {1, 64, 16}
3069 };
3070
3071 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
3072 acc_prof_info *prof_info = thr->prof_info;
3073 acc_event_info enqueue_launch_event_info;
3074 acc_api_info *api_info = thr->api_info;
3075 bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
3076 if (profiling_dispatch_p)
3077 {
3078 prof_info->event_type = acc_ev_enqueue_launch_start;
3079
3080 enqueue_launch_event_info.launch_event.event_type
3081 = prof_info->event_type;
3082 enqueue_launch_event_info.launch_event.valid_bytes
3083 = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
3084 enqueue_launch_event_info.launch_event.parent_construct
3085 = acc_construct_parallel;
3086 enqueue_launch_event_info.launch_event.implicit = 1;
3087 enqueue_launch_event_info.launch_event.tool_info = NULL;
3088 enqueue_launch_event_info.launch_event.kernel_name
3089 = (char *) kernel->name;
3090 enqueue_launch_event_info.launch_event.num_gangs = kla.gdims[0];
3091 enqueue_launch_event_info.launch_event.num_workers = kla.gdims[2];
3092 enqueue_launch_event_info.launch_event.vector_length = kla.gdims[1];
3093
3094 api_info->device_api = acc_device_api_other;
3095
3096 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3097 &enqueue_launch_event_info, api_info);
3098 }
3099
3100 if (!async)
3101 {
3102 run_kernel (kernel, ind_da, &kla, NULL, false);
3103 gomp_offload_free (ind_da);
3104 }
3105 else
3106 {
3107 queue_push_launch (aq, kernel, ind_da, &kla);
3108 if (DEBUG_QUEUES)
3109 GCN_DEBUG ("queue_push_callback %d:%d gomp_offload_free, %p\n",
3110 aq->agent->device_id, aq->id, ind_da);
3111 queue_push_callback (aq, gomp_offload_free, ind_da);
3112 }
3113
3114 if (profiling_dispatch_p)
3115 {
3116 prof_info->event_type = acc_ev_enqueue_launch_end;
3117 enqueue_launch_event_info.launch_event.event_type = prof_info->event_type;
3118 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3119 &enqueue_launch_event_info,
3120 api_info);
3121 }
3122}
3123
3124/* }}} */
3125/* {{{ Generic Plugin API */
3126
3127/* Return the name of the accelerator, which is "gcn". */
3128
3129const char *
3130GOMP_OFFLOAD_get_name (void)
3131{
3132 return "gcn";
3133}
3134
3135/* Return the specific capabilities the HSA accelerator have. */
3136
3137unsigned int
3138GOMP_OFFLOAD_get_caps (void)
3139{
3140 /* FIXME: Enable shared memory for APU, but not discrete GPU. */
3141 return /*GOMP_OFFLOAD_CAP_SHARED_MEM |*/ GOMP_OFFLOAD_CAP_OPENMP_400
3142 | GOMP_OFFLOAD_CAP_OPENACC_200;
3143}
3144
3145/* Identify as GCN accelerator. */
3146
3147int
3148GOMP_OFFLOAD_get_type (void)
3149{
3150 return OFFLOAD_TARGET_TYPE_GCN;
3151}
3152
3153/* Return the libgomp version number we're compatible with. There is
3154 no requirement for cross-version compatibility. */
3155
3156unsigned
3157GOMP_OFFLOAD_version (void)
3158{
3159 return GOMP_VERSION;
3160}
3161
3162/* Return the number of GCN devices on the system. */
3163
3164int
3165GOMP_OFFLOAD_get_num_devices (void)
3166{
3167 if (!init_hsa_context ())
3168 return 0;
3169 return hsa_context.agent_count;
3170}
3171
3172/* Initialize device (agent) number N so that it can be used for computation.
3173 Return TRUE on success. */
3174
3175bool
3176GOMP_OFFLOAD_init_device (int n)
3177{
3178 if (!init_hsa_context ())
3179 return false;
3180 if (n >= hsa_context.agent_count)
3181 {
3182 GOMP_PLUGIN_error ("Request to initialize non-existent GCN device %i", n);
3183 return false;
3184 }
3185 struct agent_info *agent = &hsa_context.agents[n];
3186
3187 if (agent->initialized)
3188 return true;
3189
3190 agent->device_id = n;
3191
3192 if (pthread_rwlock_init (&agent->module_rwlock, NULL))
3193 {
3194 GOMP_PLUGIN_error ("Failed to initialize a GCN agent rwlock");
3195 return false;
3196 }
3197 if (pthread_mutex_init (&agent->prog_mutex, NULL))
3198 {
3199 GOMP_PLUGIN_error ("Failed to initialize a GCN agent program mutex");
3200 return false;
3201 }
3202 if (pthread_mutex_init (&agent->async_queues_mutex, NULL))
3203 {
3204 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
3205 return false;
3206 }
3207 if (pthread_mutex_init (&agent->team_arena_write_lock, NULL))
3208 {
3209 GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
3210 return false;
3211 }
3212 agent->async_queues = NULL;
3213 agent->omp_async_queue = NULL;
3214 agent->team_arena_list = NULL;
3215
3216 uint32_t queue_size;
3217 hsa_status_t status;
3218 status = hsa_fns.hsa_agent_get_info_fn (agent->id,
3219 HSA_AGENT_INFO_QUEUE_MAX_SIZE,
3220 &queue_size);
3221 if (status != HSA_STATUS_SUCCESS)
3222 return hsa_error ("Error requesting maximum queue size of the GCN agent",
3223 status);
3224
237957cc 3225 status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_NAME,
2e5ea579 3226 &agent->name);
237957cc
AS
3227 if (status != HSA_STATUS_SUCCESS)
3228 return hsa_error ("Error querying the name of the agent", status);
7d593fd6 3229
2e5ea579 3230 agent->device_isa = isa_code (agent->name);
7d593fd6 3231 if (agent->device_isa < 0)
2e5ea579
FH
3232 return hsa_error ("Unknown GCN agent architecture", HSA_STATUS_ERROR);
3233
3234 status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_VENDOR_NAME,
3235 &agent->vendor_name);
3236 if (status != HSA_STATUS_SUCCESS)
3237 return hsa_error ("Error querying the vendor name of the agent", status);
237957cc
AS
3238
3239 status = hsa_fns.hsa_queue_create_fn (agent->id, queue_size,
3240 HSA_QUEUE_TYPE_MULTI,
3241 hsa_queue_callback, NULL, UINT32_MAX,
3242 UINT32_MAX, &agent->sync_queue);
3243 if (status != HSA_STATUS_SUCCESS)
3244 return hsa_error ("Error creating command queue", status);
3245
3246 agent->kernarg_region.handle = (uint64_t) -1;
3247 status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
3248 get_kernarg_memory_region,
3249 &agent->kernarg_region);
966de09b
AS
3250 if (status != HSA_STATUS_SUCCESS
3251 && status != HSA_STATUS_INFO_BREAK)
3252 hsa_error ("Scanning memory regions failed", status);
237957cc
AS
3253 if (agent->kernarg_region.handle == (uint64_t) -1)
3254 {
3255 GOMP_PLUGIN_error ("Could not find suitable memory region for kernel "
3256 "arguments");
3257 return false;
3258 }
3259 GCN_DEBUG ("Selected kernel arguments memory region:\n");
3260 dump_hsa_region (agent->kernarg_region, NULL);
3261
3262 agent->data_region.handle = (uint64_t) -1;
3263 status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
3264 get_data_memory_region,
3265 &agent->data_region);
966de09b
AS
3266 if (status != HSA_STATUS_SUCCESS
3267 && status != HSA_STATUS_INFO_BREAK)
3268 hsa_error ("Scanning memory regions failed", status);
237957cc
AS
3269 if (agent->data_region.handle == (uint64_t) -1)
3270 {
3271 GOMP_PLUGIN_error ("Could not find suitable memory region for device "
3272 "data");
3273 return false;
3274 }
3275 GCN_DEBUG ("Selected device data memory region:\n");
3276 dump_hsa_region (agent->data_region, NULL);
3277
3278 GCN_DEBUG ("GCN agent %d initialized\n", n);
3279
3280 agent->initialized = true;
3281 return true;
3282}
3283
3284/* Load GCN object-code module described by struct gcn_image_desc in
3285 TARGET_DATA and return references to kernel descriptors in TARGET_TABLE.
3286 If there are any constructors then run them. */
3287
3288int
3289GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
3290 struct addr_pair **target_table)
3291{
3292 if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
3293 {
3294 GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3295 " (expected %u, received %u)",
3296 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
3297 return -1;
3298 }
3299
3300 struct gcn_image_desc *image_desc = (struct gcn_image_desc *) target_data;
3301 struct agent_info *agent;
3302 struct addr_pair *pair;
3303 struct module_info *module;
3304 struct kernel_info *kernel;
3305 int kernel_count = image_desc->kernel_count;
3306 unsigned var_count = image_desc->global_variable_count;
3307
3308 agent = get_agent_info (ord);
3309 if (!agent)
3310 return -1;
3311
3312 if (pthread_rwlock_wrlock (&agent->module_rwlock))
3313 {
3314 GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3315 return -1;
3316 }
3317 if (agent->prog_finalized
3318 && !destroy_hsa_program (agent))
3319 return -1;
3320
3321 GCN_DEBUG ("Encountered %d kernels in an image\n", kernel_count);
3322 GCN_DEBUG ("Encountered %u global variables in an image\n", var_count);
3323 pair = GOMP_PLUGIN_malloc ((kernel_count + var_count - 2)
3324 * sizeof (struct addr_pair));
3325 *target_table = pair;
3326 module = (struct module_info *)
3327 GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info)
3328 + kernel_count * sizeof (struct kernel_info));
3329 module->image_desc = image_desc;
3330 module->kernel_count = kernel_count;
3331 module->heap = NULL;
3332 module->constructors_run_p = false;
3333
3334 kernel = &module->kernels[0];
3335
3336 /* Allocate memory for kernel dependencies. */
3337 for (unsigned i = 0; i < kernel_count; i++)
3338 {
3339 struct hsa_kernel_description *d = &image_desc->kernel_infos[i];
3340 if (!init_basic_kernel_info (kernel, d, agent, module))
3341 return -1;
3342 if (strcmp (d->name, "_init_array") == 0)
3343 module->init_array_func = kernel;
3344 else if (strcmp (d->name, "_fini_array") == 0)
3345 module->fini_array_func = kernel;
3346 else
3347 {
3348 pair->start = (uintptr_t) kernel;
3349 pair->end = (uintptr_t) (kernel + 1);
3350 pair++;
3351 }
3352 kernel++;
3353 }
3354
3355 agent->module = module;
3356 if (pthread_rwlock_unlock (&agent->module_rwlock))
3357 {
3358 GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3359 return -1;
3360 }
3361
3362 if (!create_and_finalize_hsa_program (agent))
3363 return -1;
3364
3365 for (unsigned i = 0; i < var_count; i++)
3366 {
3367 struct global_var_info *v = &image_desc->global_variables[i];
3368 GCN_DEBUG ("Looking for variable %s\n", v->name);
3369
3370 hsa_status_t status;
3371 hsa_executable_symbol_t var_symbol;
3372 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3373 v->name, agent->id,
3374 0, &var_symbol);
3375
3376 if (status != HSA_STATUS_SUCCESS)
3377 hsa_fatal ("Could not find symbol for variable in the code object",
3378 status);
3379
3380 uint64_t var_addr;
3381 uint32_t var_size;
3382 status = hsa_fns.hsa_executable_symbol_get_info_fn
3383 (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &var_addr);
3384 if (status != HSA_STATUS_SUCCESS)
3385 hsa_fatal ("Could not extract a variable from its symbol", status);
3386 status = hsa_fns.hsa_executable_symbol_get_info_fn
3387 (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &var_size);
3388 if (status != HSA_STATUS_SUCCESS)
3389 hsa_fatal ("Could not extract a variable size from its symbol", status);
3390
3391 pair->start = var_addr;
3392 pair->end = var_addr + var_size;
3393 GCN_DEBUG ("Found variable %s at %p with size %u\n", v->name,
3394 (void *)var_addr, var_size);
3395 pair++;
3396 }
3397
3398 /* Ensure that constructors are run first. */
3399 struct GOMP_kernel_launch_attributes kla =
3400 { 3,
3401 /* Grid size. */
3402 { 1, 64, 1 },
3403 /* Work-group size. */
3404 { 1, 64, 1 }
3405 };
3406
3407 if (module->init_array_func)
3408 {
3409 init_kernel (module->init_array_func);
3410 run_kernel (module->init_array_func, NULL, &kla, NULL, false);
3411 }
3412 module->constructors_run_p = true;
3413
3414 /* Don't report kernels that libgomp need not know about. */
3415 if (module->init_array_func)
3416 kernel_count--;
3417 if (module->fini_array_func)
3418 kernel_count--;
3419
3420 return kernel_count + var_count;
3421}
3422
3423/* Unload GCN object-code module described by struct gcn_image_desc in
3424 TARGET_DATA from agent number N. Return TRUE on success. */
3425
3426bool
3427GOMP_OFFLOAD_unload_image (int n, unsigned version, const void *target_data)
3428{
3429 if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
3430 {
3431 GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3432 " (expected %u, received %u)",
3433 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
3434 return false;
3435 }
3436
3437 struct agent_info *agent;
3438 agent = get_agent_info (n);
3439 if (!agent)
3440 return false;
3441
3442 if (pthread_rwlock_wrlock (&agent->module_rwlock))
3443 {
3444 GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3445 return false;
3446 }
3447
3448 if (!agent->module || agent->module->image_desc != target_data)
3449 {
3450 GOMP_PLUGIN_error ("Attempt to unload an image that has never been "
3451 "loaded before");
3452 return false;
3453 }
3454
3455 if (!destroy_module (agent->module, true))
3456 return false;
3457 free (agent->module);
3458 agent->module = NULL;
3459 if (!destroy_hsa_program (agent))
3460 return false;
3461 if (pthread_rwlock_unlock (&agent->module_rwlock))
3462 {
3463 GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3464 return false;
3465 }
3466 return true;
3467}
3468
3469/* Deinitialize all information and status associated with agent number N. We
3470 do not attempt any synchronization, assuming the user and libgomp will not
3471 attempt deinitialization of a device that is in any way being used at the
3472 same time. Return TRUE on success. */
3473
3474bool
3475GOMP_OFFLOAD_fini_device (int n)
3476{
3477 struct agent_info *agent = get_agent_info (n);
3478 if (!agent)
3479 return false;
3480
3481 if (!agent->initialized)
3482 return true;
3483
3484 if (agent->omp_async_queue)
3485 {
3486 GOMP_OFFLOAD_openacc_async_destruct (agent->omp_async_queue);
3487 agent->omp_async_queue = NULL;
3488 }
3489
3490 if (agent->module)
3491 {
3492 if (!destroy_module (agent->module, false))
3493 return false;
3494 free (agent->module);
3495 agent->module = NULL;
3496 }
3497
3498 if (!destroy_team_arenas (agent))
3499 return false;
3500
3501 if (!destroy_hsa_program (agent))
3502 return false;
3503
3504 hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (agent->sync_queue);
3505 if (status != HSA_STATUS_SUCCESS)
3506 return hsa_error ("Error destroying command queue", status);
3507
3508 if (pthread_mutex_destroy (&agent->prog_mutex))
3509 {
3510 GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex");
3511 return false;
3512 }
3513 if (pthread_rwlock_destroy (&agent->module_rwlock))
3514 {
3515 GOMP_PLUGIN_error ("Failed to destroy a GCN agent rwlock");
3516 return false;
3517 }
3518
3519 if (pthread_mutex_destroy (&agent->async_queues_mutex))
3520 {
3521 GOMP_PLUGIN_error ("Failed to destroy a GCN agent queue mutex");
3522 return false;
3523 }
3524 if (pthread_mutex_destroy (&agent->team_arena_write_lock))
3525 {
3526 GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex");
3527 return false;
3528 }
3529 agent->initialized = false;
3530 return true;
3531}
3532
3533/* Return true if the HSA runtime can run function FN_PTR. */
3534
3535bool
3536GOMP_OFFLOAD_can_run (void *fn_ptr)
3537{
3538 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3539
3540 init_kernel (kernel);
3541 if (kernel->initialization_failed)
3542 goto failure;
3543
3544 return true;
3545
3546failure:
3547 if (suppress_host_fallback)
3548 GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
3549 GCN_WARNING ("GCN target cannot be launched, doing a host fallback\n");
3550 return false;
3551}
3552
3553/* Allocate memory on device N. */
3554
3555void *
3556GOMP_OFFLOAD_alloc (int n, size_t size)
3557{
3558 struct agent_info *agent = get_agent_info (n);
3559 return alloc_by_agent (agent, size);
3560}
3561
3562/* Free memory from device N. */
3563
3564bool
3565GOMP_OFFLOAD_free (int device, void *ptr)
3566{
3567 GCN_DEBUG ("Freeing memory on device %d\n", device);
3568
3569 hsa_status_t status = hsa_fns.hsa_memory_free_fn (ptr);
3570 if (status != HSA_STATUS_SUCCESS)
3571 {
3572 hsa_error ("Could not free device memory", status);
3573 return false;
3574 }
3575
3576 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
3577 bool profiling_dispatch_p
3578 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
3579 if (profiling_dispatch_p)
3580 {
3581 acc_prof_info *prof_info = thr->prof_info;
3582 acc_event_info data_event_info;
3583 acc_api_info *api_info = thr->api_info;
3584
3585 prof_info->event_type = acc_ev_free;
3586
3587 data_event_info.data_event.event_type = prof_info->event_type;
3588 data_event_info.data_event.valid_bytes
3589 = _ACC_DATA_EVENT_INFO_VALID_BYTES;
3590 data_event_info.data_event.parent_construct
3591 = acc_construct_parallel;
3592 data_event_info.data_event.implicit = 1;
3593 data_event_info.data_event.tool_info = NULL;
3594 data_event_info.data_event.var_name = NULL;
3595 data_event_info.data_event.bytes = 0;
3596 data_event_info.data_event.host_ptr = NULL;
3597 data_event_info.data_event.device_ptr = (void *) ptr;
3598
3599 api_info->device_api = acc_device_api_other;
3600
3601 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
3602 api_info);
3603 }
3604
3605 return true;
3606}
3607
3608/* Copy data from DEVICE to host. */
3609
3610bool
3611GOMP_OFFLOAD_dev2host (int device, void *dst, const void *src, size_t n)
3612{
3613 GCN_DEBUG ("Copying %zu bytes from device %d (%p) to host (%p)\n", n, device,
3614 src, dst);
8d2f4ddf
JB
3615 hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
3616 if (status != HSA_STATUS_SUCCESS)
3617 GOMP_PLUGIN_error ("memory copy failed");
237957cc
AS
3618 return true;
3619}
3620
3621/* Copy data from host to DEVICE. */
3622
3623bool
3624GOMP_OFFLOAD_host2dev (int device, void *dst, const void *src, size_t n)
3625{
3626 GCN_DEBUG ("Copying %zu bytes from host (%p) to device %d (%p)\n", n, src,
3627 device, dst);
8d2f4ddf 3628 hsa_memory_copy_wrapper (dst, src, n);
237957cc
AS
3629 return true;
3630}
3631
3632/* Copy data within DEVICE. Do the copy asynchronously, if appropriate. */
3633
3634bool
3635GOMP_OFFLOAD_dev2dev (int device, void *dst, const void *src, size_t n)
3636{
3637 struct gcn_thread *thread_data = gcn_thread ();
3638
3639 if (thread_data && !async_synchronous_p (thread_data->async))
3640 {
3641 struct agent_info *agent = get_agent_info (device);
3642 maybe_init_omp_async (agent);
3643 queue_push_copy (agent->omp_async_queue, dst, src, n, false);
3644 return true;
3645 }
3646
3647 GCN_DEBUG ("Copying %zu bytes from device %d (%p) to device %d (%p)\n", n,
3648 device, src, device, dst);
8d2f4ddf
JB
3649 hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
3650 if (status != HSA_STATUS_SUCCESS)
3651 GOMP_PLUGIN_error ("memory copy failed");
237957cc
AS
3652 return true;
3653}
3654
3655/* }}} */
3656/* {{{ OpenMP Plugin API */
3657
3658/* Run a synchronous OpenMP kernel on DEVICE and pass it an array of pointers
3659 in VARS as a parameter. The kernel is identified by FN_PTR which must point
3660 to a kernel_info structure, and must have previously been loaded to the
3661 specified device. */
3662
3663void
3664GOMP_OFFLOAD_run (int device, void *fn_ptr, void *vars, void **args)
3665{
3666 struct agent_info *agent = get_agent_info (device);
3667 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3668 struct GOMP_kernel_launch_attributes def;
3669 struct GOMP_kernel_launch_attributes *kla;
3670 assert (agent == kernel->agent);
3671
3672 /* If we get here then the kernel must be OpenMP. */
3673 kernel->kind = KIND_OPENMP;
3674
3675 if (!parse_target_attributes (args, &def, &kla, agent))
3676 {
3677 GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
3678 return;
3679 }
3680 run_kernel (kernel, vars, kla, NULL, false);
3681}
3682
3683/* Run an asynchronous OpenMP kernel on DEVICE. This is similar to
3684 GOMP_OFFLOAD_run except that the launch is queued and there is a call to
3685 GOMP_PLUGIN_target_task_completion when it has finished. */
3686
3687void
3688GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
3689 void **args, void *async_data)
3690{
3691 GCN_DEBUG ("GOMP_OFFLOAD_async_run invoked\n");
3692 struct agent_info *agent = get_agent_info (device);
3693 struct kernel_info *kernel = (struct kernel_info *) tgt_fn;
3694 struct GOMP_kernel_launch_attributes def;
3695 struct GOMP_kernel_launch_attributes *kla;
3696 assert (agent == kernel->agent);
3697
3698 /* If we get here then the kernel must be OpenMP. */
3699 kernel->kind = KIND_OPENMP;
3700
3701 if (!parse_target_attributes (args, &def, &kla, agent))
3702 {
3703 GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
3704 return;
3705 }
3706
3707 maybe_init_omp_async (agent);
3708 queue_push_launch (agent->omp_async_queue, kernel, tgt_vars, kla);
3709 queue_push_callback (agent->omp_async_queue,
3710 GOMP_PLUGIN_target_task_completion, async_data);
3711}
3712
3713/* }}} */
3714/* {{{ OpenACC Plugin API */
3715
3716/* Run a synchronous OpenACC kernel. The device number is inferred from the
3717 already-loaded KERNEL. */
3718
3719void
3720GOMP_OFFLOAD_openacc_exec (void (*fn_ptr) (void *), size_t mapnum,
3721 void **hostaddrs, void **devaddrs, unsigned *dims,
3722 void *targ_mem_desc)
3723{
3724 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3725
3726 gcn_exec (kernel, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, false,
3727 NULL);
3728}
3729
3730/* Run an asynchronous OpenACC kernel on the specified queue. */
3731
3732void
3733GOMP_OFFLOAD_openacc_async_exec (void (*fn_ptr) (void *), size_t mapnum,
3734 void **hostaddrs, void **devaddrs,
3735 unsigned *dims, void *targ_mem_desc,
3736 struct goacc_asyncqueue *aq)
3737{
3738 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3739
3740 gcn_exec (kernel, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, true,
3741 aq);
3742}
3743
3744/* Create a new asynchronous thread and queue for running future kernels. */
3745
3746struct goacc_asyncqueue *
3747GOMP_OFFLOAD_openacc_async_construct (int device)
3748{
3749 struct agent_info *agent = get_agent_info (device);
3750
3751 pthread_mutex_lock (&agent->async_queues_mutex);
3752
3753 struct goacc_asyncqueue *aq = GOMP_PLUGIN_malloc (sizeof (*aq));
3754 aq->agent = get_agent_info (device);
3755 aq->prev = NULL;
3756 aq->next = agent->async_queues;
3757 if (aq->next)
3758 {
3759 aq->next->prev = aq;
3760 aq->id = aq->next->id + 1;
3761 }
3762 else
3763 aq->id = 1;
3764 agent->async_queues = aq;
3765
3766 aq->queue_first = 0;
3767 aq->queue_n = 0;
3768 aq->drain_queue_stop = 0;
3769
3770 if (pthread_mutex_init (&aq->mutex, NULL))
3771 {
3772 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
3773 return false;
3774 }
3775 if (pthread_cond_init (&aq->queue_cond_in, NULL))
3776 {
3777 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
3778 return false;
3779 }
3780 if (pthread_cond_init (&aq->queue_cond_out, NULL))
3781 {
3782 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
3783 return false;
3784 }
3785
3786 hsa_status_t status = hsa_fns.hsa_queue_create_fn (agent->id,
3787 ASYNC_QUEUE_SIZE,
3788 HSA_QUEUE_TYPE_MULTI,
3789 hsa_queue_callback, NULL,
3790 UINT32_MAX, UINT32_MAX,
3791 &aq->hsa_queue);
3792 if (status != HSA_STATUS_SUCCESS)
3793 hsa_fatal ("Error creating command queue", status);
3794
3795 int err = pthread_create (&aq->thread_drain_queue, NULL, &drain_queue, aq);
3796 if (err != 0)
3797 GOMP_PLUGIN_fatal ("GCN asynchronous thread creation failed: %s",
3798 strerror (err));
3799 GCN_DEBUG ("Async thread %d:%d: created\n", aq->agent->device_id,
3800 aq->id);
3801
3802 pthread_mutex_unlock (&agent->async_queues_mutex);
3803
3804 return aq;
3805}
3806
93d90219 3807/* Destroy an existing asynchronous thread and queue. Waits for any
237957cc
AS
3808 currently-running task to complete, but cancels any queued tasks. */
3809
3810bool
3811GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
3812{
3813 struct agent_info *agent = aq->agent;
3814
3815 finalize_async_thread (aq);
3816
3817 pthread_mutex_lock (&agent->async_queues_mutex);
3818
3819 int err;
3820 if ((err = pthread_mutex_destroy (&aq->mutex)))
3821 {
3822 GOMP_PLUGIN_error ("Failed to destroy a GCN async queue mutex: %d", err);
3823 goto fail;
3824 }
3825 if (pthread_cond_destroy (&aq->queue_cond_in))
3826 {
3827 GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
3828 goto fail;
3829 }
3830 if (pthread_cond_destroy (&aq->queue_cond_out))
3831 {
3832 GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
3833 goto fail;
3834 }
3835 hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (aq->hsa_queue);
3836 if (status != HSA_STATUS_SUCCESS)
3837 {
3838 hsa_error ("Error destroying command queue", status);
3839 goto fail;
3840 }
3841
3842 if (aq->prev)
3843 aq->prev->next = aq->next;
3844 if (aq->next)
3845 aq->next->prev = aq->prev;
3846 if (agent->async_queues == aq)
3847 agent->async_queues = aq->next;
3848
3849 GCN_DEBUG ("Async thread %d:%d: destroyed\n", agent->device_id, aq->id);
3850
3851 free (aq);
3852 pthread_mutex_unlock (&agent->async_queues_mutex);
3853 return true;
3854
3855fail:
3856 pthread_mutex_unlock (&agent->async_queues_mutex);
3857 return false;
3858}
3859
3860/* Return true if the specified async queue is currently empty. */
3861
3862int
3863GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
3864{
3865 return queue_empty (aq);
3866}
3867
3868/* Block until the specified queue has executed all its tasks and the
3869 queue is empty. */
3870
3871bool
3872GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
3873{
3874 wait_queue (aq);
3875 return true;
3876}
3877
3878/* Add a serialization point across two async queues. Any new tasks added to
3879 AQ2, after this call, will not run until all tasks on AQ1, at the time
3880 of this call, have completed. */
3881
3882bool
3883GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
3884 struct goacc_asyncqueue *aq2)
3885{
3886 /* For serialize, stream aq2 waits for aq1 to complete work that has been
3887 scheduled to run on it up to this point. */
3888 if (aq1 != aq2)
3889 {
3890 struct placeholder *placeholderp = queue_push_placeholder (aq1);
3891 queue_push_asyncwait (aq2, placeholderp);
3892 }
3893 return true;
3894}
3895
3896/* Add an opaque callback to the given async queue. */
3897
3898void
3899GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
3900 void (*fn) (void *), void *data)
3901{
3902 queue_push_callback (aq, fn, data);
3903}
3904
3905/* Queue up an asynchronous data copy from host to DEVICE. */
3906
3907bool
3908GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
3909 size_t n, struct goacc_asyncqueue *aq)
3910{
3911 struct agent_info *agent = get_agent_info (device);
3912 assert (agent == aq->agent);
3913 /* The source data does not necessarily remain live until the deferred
3914 copy happens. Taking a snapshot of the data here avoids reading
3915 uninitialised data later, but means that (a) data is copied twice and
3916 (b) modifications to the copied data between the "spawning" point of
3917 the asynchronous kernel and when it is executed will not be seen.
3918 But, that is probably correct. */
3919 void *src_copy = GOMP_PLUGIN_malloc (n);
3920 memcpy (src_copy, src, n);
3921 queue_push_copy (aq, dst, src_copy, n, true);
3922 return true;
3923}
3924
3925/* Queue up an asynchronous data copy from DEVICE to host. */
3926
3927bool
3928GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src,
3929 size_t n, struct goacc_asyncqueue *aq)
3930{
3931 struct agent_info *agent = get_agent_info (device);
3932 assert (agent == aq->agent);
3933 queue_push_copy (aq, dst, src, n, false);
3934 return true;
3935}
3936
6fc0385c
TS
3937union goacc_property_value
3938GOMP_OFFLOAD_openacc_get_property (int device, enum goacc_property prop)
3939{
2e5ea579
FH
3940 struct agent_info *agent = get_agent_info (device);
3941
3942 union goacc_property_value propval = { .val = 0 };
3943
3944 switch (prop)
3945 {
3946 case GOACC_PROPERTY_FREE_MEMORY:
3947 /* Not supported. */
3948 break;
3949 case GOACC_PROPERTY_MEMORY:
3950 {
3951 size_t size;
3952 hsa_region_t region = agent->data_region;
3953 hsa_status_t status =
3954 hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
3955 if (status == HSA_STATUS_SUCCESS)
3956 propval.val = size;
3957 break;
3958 }
3959 case GOACC_PROPERTY_NAME:
3960 propval.ptr = agent->name;
3961 break;
3962 case GOACC_PROPERTY_VENDOR:
3963 propval.ptr = agent->vendor_name;
3964 break;
3965 case GOACC_PROPERTY_DRIVER:
3966 propval.ptr = hsa_context.driver_version_s;
3967 break;
3968 }
6fc0385c 3969
2e5ea579 3970 return propval;
6fc0385c
TS
3971}
3972
237957cc
AS
3973/* Set up plugin-specific thread-local-data (host-side). */
3974
3975void *
3976GOMP_OFFLOAD_openacc_create_thread_data (int ord __attribute__((unused)))
3977{
3978 struct gcn_thread *thread_data
3979 = GOMP_PLUGIN_malloc (sizeof (struct gcn_thread));
3980
3981 thread_data->async = GOMP_ASYNC_SYNC;
3982
3983 return (void *) thread_data;
3984}
3985
3986/* Clean up plugin-specific thread-local-data. */
3987
3988void
3989GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
3990{
3991 free (data);
3992}
3993
3994/* }}} */