]> git.ipfire.org Git - thirdparty/gcc.git/blame - libgomp/plugin/plugin-gcn.c
Update cpplib es.po
[thirdparty/gcc.git] / libgomp / plugin / plugin-gcn.c
CommitLineData
237957cc
AS
1/* Plugin for AMD GCN execution.
2
7adcbafe 3 Copyright (C) 2013-2022 Free Software Foundation, Inc.
237957cc
AS
4
5 Contributed by Mentor Embedded
6
7 This file is part of the GNU Offloading and Multi Processing Library
8 (libgomp).
9
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
13 any later version.
14
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
18 more details.
19
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
23
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
28
29/* {{{ Includes and defines */
30
31#include "config.h"
0bac793e 32#include "symcat.h"
237957cc
AS
33#include <stdio.h>
34#include <stdlib.h>
35#include <string.h>
36#include <pthread.h>
37#include <inttypes.h>
38#include <stdbool.h>
39#include <limits.h>
40#include <hsa.h>
85f0a4d9 41#include <hsa_ext_amd.h>
237957cc
AS
42#include <dlfcn.h>
43#include <signal.h>
44#include "libgomp-plugin.h"
45#include "gomp-constants.h"
46#include <elf.h>
47#include "oacc-plugin.h"
48#include "oacc-int.h"
49#include <assert.h>
50
237957cc 51/* These probably won't be in elf.h for a while. */
97981e13 52#ifndef R_AMDGPU_NONE
237957cc
AS
53#define R_AMDGPU_NONE 0
54#define R_AMDGPU_ABS32_LO 1 /* (S + A) & 0xFFFFFFFF */
55#define R_AMDGPU_ABS32_HI 2 /* (S + A) >> 32 */
56#define R_AMDGPU_ABS64 3 /* S + A */
57#define R_AMDGPU_REL32 4 /* S + A - P */
58#define R_AMDGPU_REL64 5 /* S + A - P */
59#define R_AMDGPU_ABS32 6 /* S + A */
60#define R_AMDGPU_GOTPCREL 7 /* G + GOT + A - P */
61#define R_AMDGPU_GOTPCREL32_LO 8 /* (G + GOT + A - P) & 0xFFFFFFFF */
62#define R_AMDGPU_GOTPCREL32_HI 9 /* (G + GOT + A - P) >> 32 */
63#define R_AMDGPU_REL32_LO 10 /* (S + A - P) & 0xFFFFFFFF */
64#define R_AMDGPU_REL32_HI 11 /* (S + A - P) >> 32 */
237957cc 65#define R_AMDGPU_RELATIVE64 13 /* B + A */
97981e13 66#endif
237957cc
AS
67
68/* GCN specific definitions for asynchronous queues. */
69
70#define ASYNC_QUEUE_SIZE 64
71#define DRAIN_QUEUE_SYNCHRONOUS_P false
72#define DEBUG_QUEUES 0
73#define DEBUG_THREAD_SLEEP 0
74#define DEBUG_THREAD_SIGNAL 0
75
76/* Defaults. */
77#define DEFAULT_GCN_HEAP_SIZE (100*1024*1024) /* 100MB. */
78
79/* Secure getenv() which returns NULL if running as SUID/SGID. */
80#ifndef HAVE_SECURE_GETENV
81#ifdef HAVE___SECURE_GETENV
82#define secure_getenv __secure_getenv
83#elif defined (HAVE_UNISTD_H) && defined(HAVE_GETUID) && defined(HAVE_GETEUID) \
84 && defined(HAVE_GETGID) && defined(HAVE_GETEGID)
85
86#include <unistd.h>
87
88/* Implementation of secure_getenv() for targets where it is not provided but
89 we have at least means to test real and effective IDs. */
90
91static char *
92secure_getenv (const char *name)
93{
94 if ((getuid () == geteuid ()) && (getgid () == getegid ()))
95 return getenv (name);
96 else
97 return NULL;
98}
99
100#else
101#define secure_getenv getenv
102#endif
103#endif
104
105/* }}} */
106/* {{{ Types */
107
93d90219 108/* GCN-specific implementation of the GOMP_PLUGIN_acc_thread data. */
237957cc
AS
109
110struct gcn_thread
111{
112 /* The thread number from the async clause, or GOMP_ASYNC_SYNC. */
113 int async;
114};
115
116/* As an HSA runtime is dlopened, following structure defines function
117 pointers utilized by the HSA plug-in. */
118
119struct hsa_runtime_fn_info
120{
121 /* HSA runtime. */
122 hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
123 const char **status_string);
124 hsa_status_t (*hsa_system_get_info_fn) (hsa_system_info_t attribute,
125 void *value);
126 hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
127 hsa_agent_info_t attribute,
128 void *value);
129 hsa_status_t (*hsa_isa_get_info_fn)(hsa_isa_t isa,
130 hsa_isa_info_t attribute,
131 uint32_t index,
132 void *value);
133 hsa_status_t (*hsa_init_fn) (void);
134 hsa_status_t (*hsa_iterate_agents_fn)
135 (hsa_status_t (*callback)(hsa_agent_t agent, void *data), void *data);
136 hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
137 hsa_region_info_t attribute,
138 void *value);
139 hsa_status_t (*hsa_queue_create_fn)
140 (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
141 void (*callback)(hsa_status_t status, hsa_queue_t *source, void *data),
142 void *data, uint32_t private_segment_size,
143 uint32_t group_segment_size, hsa_queue_t **queue);
144 hsa_status_t (*hsa_agent_iterate_regions_fn)
145 (hsa_agent_t agent,
146 hsa_status_t (*callback)(hsa_region_t region, void *data), void *data);
147 hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
148 hsa_status_t (*hsa_executable_create_fn)
149 (hsa_profile_t profile, hsa_executable_state_t executable_state,
150 const char *options, hsa_executable_t *executable);
151 hsa_status_t (*hsa_executable_global_variable_define_fn)
152 (hsa_executable_t executable, const char *variable_name, void *address);
153 hsa_status_t (*hsa_executable_load_code_object_fn)
154 (hsa_executable_t executable, hsa_agent_t agent,
155 hsa_code_object_t code_object, const char *options);
156 hsa_status_t (*hsa_executable_freeze_fn)(hsa_executable_t executable,
157 const char *options);
158 hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
159 uint32_t num_consumers,
160 const hsa_agent_t *consumers,
161 hsa_signal_t *signal);
162 hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
163 void **ptr);
164 hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent,
165 hsa_access_permission_t access);
166 hsa_status_t (*hsa_memory_copy_fn)(void *dst, const void *src, size_t size);
167 hsa_status_t (*hsa_memory_free_fn) (void *ptr);
168 hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
169 hsa_status_t (*hsa_executable_get_symbol_fn)
170 (hsa_executable_t executable, const char *module_name,
171 const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
172 hsa_executable_symbol_t *symbol);
173 hsa_status_t (*hsa_executable_symbol_get_info_fn)
174 (hsa_executable_symbol_t executable_symbol,
175 hsa_executable_symbol_info_t attribute, void *value);
176 hsa_status_t (*hsa_executable_iterate_symbols_fn)
177 (hsa_executable_t executable,
178 hsa_status_t (*callback)(hsa_executable_t executable,
179 hsa_executable_symbol_t symbol, void *data),
180 void *data);
181 uint64_t (*hsa_queue_add_write_index_release_fn) (const hsa_queue_t *queue,
182 uint64_t value);
183 uint64_t (*hsa_queue_load_read_index_acquire_fn) (const hsa_queue_t *queue);
184 void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
185 hsa_signal_value_t value);
186 void (*hsa_signal_store_release_fn) (hsa_signal_t signal,
187 hsa_signal_value_t value);
188 hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
189 (hsa_signal_t signal, hsa_signal_condition_t condition,
190 hsa_signal_value_t compare_value, uint64_t timeout_hint,
191 hsa_wait_state_t wait_state_hint);
192 hsa_signal_value_t (*hsa_signal_load_acquire_fn) (hsa_signal_t signal);
193 hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);
194
195 hsa_status_t (*hsa_code_object_deserialize_fn)
196 (void *serialized_code_object, size_t serialized_code_object_size,
197 const char *options, hsa_code_object_t *code_object);
198};
199
200/* Structure describing the run-time and grid properties of an HSA kernel
201 lauch. This needs to match the format passed to GOMP_OFFLOAD_run. */
202
203struct GOMP_kernel_launch_attributes
204{
205 /* Number of dimensions the workload has. Maximum number is 3. */
206 uint32_t ndim;
207 /* Size of the grid in the three respective dimensions. */
208 uint32_t gdims[3];
209 /* Size of work-groups in the respective dimensions. */
210 uint32_t wdims[3];
211};
212
213/* Collection of information needed for a dispatch of a kernel from a
214 kernel. */
215
216struct kernel_dispatch
217{
218 struct agent_info *agent;
219 /* Pointer to a command queue associated with a kernel dispatch agent. */
220 void *queue;
221 /* Pointer to a memory space used for kernel arguments passing. */
222 void *kernarg_address;
223 /* Kernel object. */
224 uint64_t object;
225 /* Synchronization signal used for dispatch synchronization. */
226 uint64_t signal;
227 /* Private segment size. */
228 uint32_t private_segment_size;
229 /* Group segment size. */
230 uint32_t group_segment_size;
231};
232
233/* Structure of the kernargs segment, supporting console output.
234
235 This needs to match the definitions in Newlib, and the expectations
236 in libgomp target code. */
237
238struct kernargs {
239 /* Leave space for the real kernel arguments.
240 OpenACC and OpenMP only use one pointer. */
241 int64_t dummy1;
242 int64_t dummy2;
243
244 /* A pointer to struct output, below, for console output data. */
245 int64_t out_ptr;
246
247 /* A pointer to struct heap, below. */
248 int64_t heap_ptr;
249
250 /* A pointer to an ephemeral memory arena.
251 Only needed for OpenMP. */
252 int64_t arena_ptr;
253
254 /* Output data. */
255 struct output {
256 int return_value;
257 unsigned int next_output;
258 struct printf_data {
259 int written;
260 char msg[128];
261 int type;
262 union {
263 int64_t ivalue;
264 double dvalue;
265 char text[128];
266 };
267 } queue[1024];
268 unsigned int consumed;
269 } output_data;
270};
271
272/* A queue entry for a future asynchronous launch. */
273
274struct kernel_launch
275{
276 struct kernel_info *kernel;
277 void *vars;
278 struct GOMP_kernel_launch_attributes kla;
279};
280
281/* A queue entry for a future callback. */
282
283struct callback
284{
285 void (*fn)(void *);
286 void *data;
287};
288
289/* A data struct for the copy_data callback. */
290
291struct copy_data
292{
293 void *dst;
294 const void *src;
295 size_t len;
237957cc
AS
296 struct goacc_asyncqueue *aq;
297};
298
299/* A queue entry for a placeholder. These correspond to a wait event. */
300
301struct placeholder
302{
303 int executed;
304 pthread_cond_t cond;
305 pthread_mutex_t mutex;
306};
307
308/* A queue entry for a wait directive. */
309
310struct asyncwait_info
311{
312 struct placeholder *placeholderp;
313};
314
315/* Encode the type of an entry in an async queue. */
316
317enum entry_type
318{
319 KERNEL_LAUNCH,
320 CALLBACK,
321 ASYNC_WAIT,
322 ASYNC_PLACEHOLDER
323};
324
325/* An entry in an async queue. */
326
327struct queue_entry
328{
329 enum entry_type type;
330 union {
331 struct kernel_launch launch;
332 struct callback callback;
333 struct asyncwait_info asyncwait;
334 struct placeholder placeholder;
335 } u;
336};
337
338/* An async queue header.
339
340 OpenMP may create one of these.
341 OpenACC may create many. */
342
343struct goacc_asyncqueue
344{
345 struct agent_info *agent;
346 hsa_queue_t *hsa_queue;
347
348 pthread_t thread_drain_queue;
349 pthread_mutex_t mutex;
350 pthread_cond_t queue_cond_in;
351 pthread_cond_t queue_cond_out;
352 struct queue_entry queue[ASYNC_QUEUE_SIZE];
353 int queue_first;
354 int queue_n;
355 int drain_queue_stop;
356
357 int id;
358 struct goacc_asyncqueue *prev;
359 struct goacc_asyncqueue *next;
360};
361
362/* Mkoffload uses this structure to describe a kernel.
363
364 OpenMP kernel dimensions are passed at runtime.
365 OpenACC kernel dimensions are passed at compile time, here. */
366
367struct hsa_kernel_description
368{
369 const char *name;
370 int oacc_dims[3]; /* Only present for GCN kernels. */
5a28e272
KCY
371 int sgpr_count;
372 int vpgr_count;
237957cc
AS
373};
374
375/* Mkoffload uses this structure to describe an offload variable. */
376
377struct global_var_info
378{
379 const char *name;
380 void *address;
381};
382
383/* Mkoffload uses this structure to describe all the kernels in a
384 loadable module. These are passed the libgomp via static constructors. */
385
386struct gcn_image_desc
387{
388 struct gcn_image {
389 size_t size;
390 void *image;
391 } *gcn_image;
392 const unsigned kernel_count;
393 struct hsa_kernel_description *kernel_infos;
394 const unsigned global_variable_count;
237957cc
AS
395};
396
7d593fd6
FH
397/* This enum mirrors the corresponding LLVM enum's values for all ISAs that we
398 support.
399 See https://llvm.org/docs/AMDGPUUsage.html#amdgpu-ef-amdgpu-mach-table */
400
401typedef enum {
7d593fd6
FH
402 EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a,
403 EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c,
404 EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f,
3535402e 405 EF_AMDGPU_MACH_AMDGCN_GFX908 = 0x030
7d593fd6
FH
406} EF_AMDGPU_MACH;
407
408const static int EF_AMDGPU_MACH_MASK = 0x000000ff;
409typedef EF_AMDGPU_MACH gcn_isa;
410
237957cc
AS
411/* Description of an HSA GPU agent (device) and the program associated with
412 it. */
413
414struct agent_info
415{
416 /* The HSA ID of the agent. Assigned when hsa_context is initialized. */
417 hsa_agent_t id;
418 /* The user-visible device number. */
419 int device_id;
420 /* Whether the agent has been initialized. The fields below are usable only
421 if it has been. */
422 bool initialized;
7d593fd6
FH
423
424 /* The instruction set architecture of the device. */
425 gcn_isa device_isa;
2e5ea579
FH
426 /* Name of the agent. */
427 char name[64];
428 /* Name of the vendor of the agent. */
429 char vendor_name[64];
237957cc
AS
430 /* Command queues of the agent. */
431 hsa_queue_t *sync_queue;
432 struct goacc_asyncqueue *async_queues, *omp_async_queue;
433 pthread_mutex_t async_queues_mutex;
434
435 /* The HSA memory region from which to allocate kernel arguments. */
436 hsa_region_t kernarg_region;
437
438 /* The HSA memory region from which to allocate device data. */
439 hsa_region_t data_region;
440
441 /* Allocated team arenas. */
442 struct team_arena_list *team_arena_list;
443 pthread_mutex_t team_arena_write_lock;
444
445 /* Read-write lock that protects kernels which are running or about to be run
446 from interference with loading and unloading of images. Needs to be
447 locked for reading while a kernel is being run, and for writing if the
448 list of modules is manipulated (and thus the HSA program invalidated). */
449 pthread_rwlock_t module_rwlock;
450
451 /* The module associated with this kernel. */
452 struct module_info *module;
453
454 /* Mutex enforcing that only one thread will finalize the HSA program. A
455 thread should have locked agent->module_rwlock for reading before
456 acquiring it. */
457 pthread_mutex_t prog_mutex;
458 /* Flag whether the HSA program that consists of all the modules has been
459 finalized. */
460 bool prog_finalized;
461 /* HSA executable - the finalized program that is used to locate kernels. */
462 hsa_executable_t executable;
463};
464
465/* Information required to identify, finalize and run any given kernel. */
466
467enum offload_kind {KIND_UNKNOWN, KIND_OPENMP, KIND_OPENACC};
468
469struct kernel_info
470{
471 /* Name of the kernel, required to locate it within the GCN object-code
472 module. */
473 const char *name;
474 /* The specific agent the kernel has been or will be finalized for and run
475 on. */
476 struct agent_info *agent;
477 /* The specific module where the kernel takes place. */
478 struct module_info *module;
5a28e272
KCY
479 /* Information provided by mkoffload associated with the kernel. */
480 struct hsa_kernel_description *description;
237957cc
AS
481 /* Mutex enforcing that at most once thread ever initializes a kernel for
482 use. A thread should have locked agent->module_rwlock for reading before
483 acquiring it. */
484 pthread_mutex_t init_mutex;
485 /* Flag indicating whether the kernel has been initialized and all fields
486 below it contain valid data. */
487 bool initialized;
488 /* Flag indicating that the kernel has a problem that blocks an execution. */
489 bool initialization_failed;
490 /* The object to be put into the dispatch queue. */
491 uint64_t object;
492 /* Required size of kernel arguments. */
493 uint32_t kernarg_segment_size;
494 /* Required size of group segment. */
495 uint32_t group_segment_size;
496 /* Required size of private segment. */
497 uint32_t private_segment_size;
498 /* Set up for OpenMP or OpenACC? */
499 enum offload_kind kind;
500};
501
502/* Information about a particular GCN module, its image and kernels. */
503
504struct module_info
505{
506 /* The description with which the program has registered the image. */
507 struct gcn_image_desc *image_desc;
508 /* GCN heap allocation. */
509 struct heap *heap;
510 /* Physical boundaries of the loaded module. */
511 Elf64_Addr phys_address_start;
512 Elf64_Addr phys_address_end;
513
514 bool constructors_run_p;
515 struct kernel_info *init_array_func, *fini_array_func;
516
517 /* Number of kernels in this module. */
518 int kernel_count;
519 /* An array of kernel_info structures describing each kernel in this
520 module. */
521 struct kernel_info kernels[];
522};
523
524/* A linked list of memory arenas allocated on the device.
525 These are only used by OpenMP, as a means to optimize per-team malloc. */
526
527struct team_arena_list
528{
529 struct team_arena_list *next;
530
531 /* The number of teams determines the size of the allocation. */
532 int num_teams;
533 /* The device address of the arena itself. */
534 void *arena;
535 /* A flag to prevent two asynchronous kernels trying to use the same arena.
536 The mutex is locked until the kernel exits. */
537 pthread_mutex_t in_use;
538};
539
540/* Information about the whole HSA environment and all of its agents. */
541
542struct hsa_context_info
543{
544 /* Whether the structure has been initialized. */
545 bool initialized;
546 /* Number of usable GPU HSA agents in the system. */
547 int agent_count;
548 /* Array of agent_info structures describing the individual HSA agents. */
549 struct agent_info *agents;
2e5ea579
FH
550 /* Driver version string. */
551 char driver_version_s[30];
237957cc
AS
552};
553
554/* Format of the on-device heap.
555
556 This must match the definition in Newlib and gcn-run. */
557
558struct heap {
559 int64_t size;
560 char data[0];
561};
562
563/* }}} */
564/* {{{ Global variables */
565
566/* Information about the whole HSA environment and all of its agents. */
567
568static struct hsa_context_info hsa_context;
569
570/* HSA runtime functions that are initialized in init_hsa_context. */
571
572static struct hsa_runtime_fn_info hsa_fns;
573
574/* Heap space, allocated target-side, provided for use of newlib malloc.
575 Each module should have it's own heap allocated.
576 Beware that heap usage increases with OpenMP teams. See also arenas. */
577
578static size_t gcn_kernel_heap_size = DEFAULT_GCN_HEAP_SIZE;
579
580/* Flag to decide whether print to stderr information about what is going on.
581 Set in init_debug depending on environment variables. */
582
583static bool debug;
584
585/* Flag to decide if the runtime should suppress a possible fallback to host
586 execution. */
587
588static bool suppress_host_fallback;
589
590/* Flag to locate HSA runtime shared library that is dlopened
591 by this plug-in. */
592
593static const char *hsa_runtime_lib;
594
595/* Flag to decide if the runtime should support also CPU devices (can be
596 a simulator). */
597
598static bool support_cpu_devices;
599
600/* Runtime dimension overrides. Zero indicates default. */
601
602static int override_x_dim = 0;
603static int override_z_dim = 0;
604
605/* }}} */
606/* {{{ Debug & Diagnostic */
607
608/* Print a message to stderr if GCN_DEBUG value is set to true. */
609
610#define DEBUG_PRINT(...) \
611 do \
612 { \
613 if (debug) \
614 { \
615 fprintf (stderr, __VA_ARGS__); \
616 } \
617 } \
618 while (false);
619
620/* Flush stderr if GCN_DEBUG value is set to true. */
621
622#define DEBUG_FLUSH() \
623 do { \
624 if (debug) \
625 fflush (stderr); \
626 } while (false)
627
628/* Print a logging message with PREFIX to stderr if GCN_DEBUG value
629 is set to true. */
630
631#define DEBUG_LOG(prefix, ...) \
632 do \
633 { \
634 DEBUG_PRINT (prefix); \
635 DEBUG_PRINT (__VA_ARGS__); \
636 DEBUG_FLUSH (); \
637 } while (false)
638
639/* Print a debugging message to stderr. */
640
641#define GCN_DEBUG(...) DEBUG_LOG ("GCN debug: ", __VA_ARGS__)
642
643/* Print a warning message to stderr. */
644
645#define GCN_WARNING(...) DEBUG_LOG ("GCN warning: ", __VA_ARGS__)
646
647/* Print HSA warning STR with an HSA STATUS code. */
648
649static void
650hsa_warn (const char *str, hsa_status_t status)
651{
652 if (!debug)
653 return;
654
655 const char *hsa_error_msg = "[unknown]";
656 hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
657
658 fprintf (stderr, "GCN warning: %s\nRuntime message: %s\n", str,
659 hsa_error_msg);
660}
661
662/* Report a fatal error STR together with the HSA error corresponding to STATUS
663 and terminate execution of the current process. */
664
665static void
666hsa_fatal (const char *str, hsa_status_t status)
667{
668 const char *hsa_error_msg = "[unknown]";
669 hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
670 GOMP_PLUGIN_fatal ("GCN fatal error: %s\nRuntime message: %s\n", str,
671 hsa_error_msg);
672}
673
674/* Like hsa_fatal, except only report error message, and return FALSE
675 for propagating error processing to outside of plugin. */
676
677static bool
678hsa_error (const char *str, hsa_status_t status)
679{
680 const char *hsa_error_msg = "[unknown]";
681 hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
682 GOMP_PLUGIN_error ("GCN fatal error: %s\nRuntime message: %s\n", str,
683 hsa_error_msg);
684 return false;
685}
686
687/* Dump information about the available hardware. */
688
689static void
690dump_hsa_system_info (void)
691{
692 hsa_status_t status;
693
694 hsa_endianness_t endianness;
695 status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_ENDIANNESS,
696 &endianness);
697 if (status == HSA_STATUS_SUCCESS)
698 switch (endianness)
699 {
700 case HSA_ENDIANNESS_LITTLE:
701 GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: LITTLE\n");
702 break;
703 case HSA_ENDIANNESS_BIG:
704 GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: BIG\n");
705 break;
706 default:
707 GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: UNKNOWN\n");
708 }
709 else
710 GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: FAILED\n");
711
712 uint8_t extensions[128];
713 status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_EXTENSIONS,
714 &extensions);
715 if (status == HSA_STATUS_SUCCESS)
716 {
717 if (extensions[0] & (1 << HSA_EXTENSION_IMAGES))
718 GCN_DEBUG ("HSA_SYSTEM_INFO_EXTENSIONS: IMAGES\n");
719 }
720 else
721 GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n");
722}
723
724/* Dump information about the available hardware. */
725
726static void
727dump_machine_model (hsa_machine_model_t machine_model, const char *s)
728{
729 switch (machine_model)
730 {
731 case HSA_MACHINE_MODEL_SMALL:
732 GCN_DEBUG ("%s: SMALL\n", s);
733 break;
734 case HSA_MACHINE_MODEL_LARGE:
735 GCN_DEBUG ("%s: LARGE\n", s);
736 break;
737 default:
738 GCN_WARNING ("%s: UNKNOWN\n", s);
739 break;
740 }
741}
742
743/* Dump information about the available hardware. */
744
745static void
746dump_profile (hsa_profile_t profile, const char *s)
747{
748 switch (profile)
749 {
750 case HSA_PROFILE_FULL:
751 GCN_DEBUG ("%s: FULL\n", s);
752 break;
753 case HSA_PROFILE_BASE:
754 GCN_DEBUG ("%s: BASE\n", s);
755 break;
756 default:
757 GCN_WARNING ("%s: UNKNOWN\n", s);
758 break;
759 }
760}
761
762/* Dump information about a device memory region. */
763
764static hsa_status_t
765dump_hsa_region (hsa_region_t region, void *data __attribute__((unused)))
766{
767 hsa_status_t status;
768
769 hsa_region_segment_t segment;
770 status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
771 &segment);
772 if (status == HSA_STATUS_SUCCESS)
773 {
774 if (segment == HSA_REGION_SEGMENT_GLOBAL)
775 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GLOBAL\n");
776 else if (segment == HSA_REGION_SEGMENT_READONLY)
777 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: READONLY\n");
778 else if (segment == HSA_REGION_SEGMENT_PRIVATE)
779 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: PRIVATE\n");
780 else if (segment == HSA_REGION_SEGMENT_GROUP)
781 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GROUP\n");
782 else
783 GCN_WARNING ("HSA_REGION_INFO_SEGMENT: UNKNOWN\n");
784 }
785 else
786 GCN_WARNING ("HSA_REGION_INFO_SEGMENT: FAILED\n");
787
788 if (segment == HSA_REGION_SEGMENT_GLOBAL)
789 {
790 uint32_t flags;
791 status
792 = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
793 &flags);
794 if (status == HSA_STATUS_SUCCESS)
795 {
796 if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
797 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: KERNARG\n");
798 if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED)
799 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: FINE_GRAINED\n");
800 if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED)
801 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: COARSE_GRAINED\n");
802 }
803 else
804 GCN_WARNING ("HSA_REGION_INFO_GLOBAL_FLAGS: FAILED\n");
805 }
806
807 size_t size;
808 status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
809 if (status == HSA_STATUS_SUCCESS)
810 GCN_DEBUG ("HSA_REGION_INFO_SIZE: %zu\n", size);
811 else
812 GCN_WARNING ("HSA_REGION_INFO_SIZE: FAILED\n");
813
814 status
815 = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_ALLOC_MAX_SIZE,
816 &size);
817 if (status == HSA_STATUS_SUCCESS)
818 GCN_DEBUG ("HSA_REGION_INFO_ALLOC_MAX_SIZE: %zu\n", size);
819 else
820 GCN_WARNING ("HSA_REGION_INFO_ALLOC_MAX_SIZE: FAILED\n");
821
822 bool alloc_allowed;
823 status
824 = hsa_fns.hsa_region_get_info_fn (region,
825 HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED,
826 &alloc_allowed);
827 if (status == HSA_STATUS_SUCCESS)
828 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: %u\n", alloc_allowed);
829 else
830 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: FAILED\n");
831
832 if (status != HSA_STATUS_SUCCESS || !alloc_allowed)
833 return HSA_STATUS_SUCCESS;
834
835 status
836 = hsa_fns.hsa_region_get_info_fn (region,
837 HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE,
838 &size);
839 if (status == HSA_STATUS_SUCCESS)
840 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: %zu\n", size);
841 else
842 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: FAILED\n");
843
844 size_t align;
845 status
846 = hsa_fns.hsa_region_get_info_fn (region,
847 HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT,
848 &align);
849 if (status == HSA_STATUS_SUCCESS)
850 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: %zu\n", align);
851 else
852 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: FAILED\n");
853
854 return HSA_STATUS_SUCCESS;
855}
856
857/* Dump information about all the device memory regions. */
858
859static void
860dump_hsa_regions (hsa_agent_t agent)
861{
862 hsa_status_t status;
863 status = hsa_fns.hsa_agent_iterate_regions_fn (agent,
864 dump_hsa_region,
865 NULL);
866 if (status != HSA_STATUS_SUCCESS)
867 hsa_error ("Dumping hsa regions failed", status);
868}
869
870/* Dump information about the available devices. */
871
872static hsa_status_t
873dump_hsa_agent_info (hsa_agent_t agent, void *data __attribute__((unused)))
874{
875 hsa_status_t status;
876
877 char buf[64];
878 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME,
879 &buf);
880 if (status == HSA_STATUS_SUCCESS)
881 GCN_DEBUG ("HSA_AGENT_INFO_NAME: %s\n", buf);
882 else
883 GCN_WARNING ("HSA_AGENT_INFO_NAME: FAILED\n");
884
885 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_VENDOR_NAME,
886 &buf);
887 if (status == HSA_STATUS_SUCCESS)
888 GCN_DEBUG ("HSA_AGENT_INFO_VENDOR_NAME: %s\n", buf);
889 else
890 GCN_WARNING ("HSA_AGENT_INFO_VENDOR_NAME: FAILED\n");
891
892 hsa_machine_model_t machine_model;
893 status
894 = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_MACHINE_MODEL,
895 &machine_model);
896 if (status == HSA_STATUS_SUCCESS)
897 dump_machine_model (machine_model, "HSA_AGENT_INFO_MACHINE_MODEL");
898 else
899 GCN_WARNING ("HSA_AGENT_INFO_MACHINE_MODEL: FAILED\n");
900
901 hsa_profile_t profile;
902 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_PROFILE,
903 &profile);
904 if (status == HSA_STATUS_SUCCESS)
905 dump_profile (profile, "HSA_AGENT_INFO_PROFILE");
906 else
907 GCN_WARNING ("HSA_AGENT_INFO_PROFILE: FAILED\n");
908
909 hsa_device_type_t device_type;
910 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
911 &device_type);
912 if (status == HSA_STATUS_SUCCESS)
913 {
914 switch (device_type)
915 {
916 case HSA_DEVICE_TYPE_CPU:
917 GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: CPU\n");
918 break;
919 case HSA_DEVICE_TYPE_GPU:
920 GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: GPU\n");
921 break;
922 case HSA_DEVICE_TYPE_DSP:
923 GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: DSP\n");
924 break;
925 default:
926 GCN_WARNING ("HSA_AGENT_INFO_DEVICE: UNKNOWN\n");
927 break;
928 }
929 }
930 else
931 GCN_WARNING ("HSA_AGENT_INFO_DEVICE: FAILED\n");
932
933 uint32_t cu_count;
934 status = hsa_fns.hsa_agent_get_info_fn
935 (agent, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
936 if (status == HSA_STATUS_SUCCESS)
937 GCN_DEBUG ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: %u\n", cu_count);
938 else
939 GCN_WARNING ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: FAILED\n");
940
941 uint32_t size;
942 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_WAVEFRONT_SIZE,
943 &size);
944 if (status == HSA_STATUS_SUCCESS)
945 GCN_DEBUG ("HSA_AGENT_INFO_WAVEFRONT_SIZE: %u\n", size);
946 else
947 GCN_WARNING ("HSA_AGENT_INFO_WAVEFRONT_SIZE: FAILED\n");
948
949 uint32_t max_dim;
950 status = hsa_fns.hsa_agent_get_info_fn (agent,
951 HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
952 &max_dim);
953 if (status == HSA_STATUS_SUCCESS)
954 GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: %u\n", max_dim);
955 else
956 GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: FAILED\n");
957
958 uint32_t max_size;
959 status = hsa_fns.hsa_agent_get_info_fn (agent,
960 HSA_AGENT_INFO_WORKGROUP_MAX_SIZE,
961 &max_size);
962 if (status == HSA_STATUS_SUCCESS)
963 GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: %u\n", max_size);
964 else
965 GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: FAILED\n");
966
967 uint32_t grid_max_dim;
968 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_DIM,
969 &grid_max_dim);
970 if (status == HSA_STATUS_SUCCESS)
971 GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_DIM: %u\n", grid_max_dim);
972 else
973 GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_DIM: FAILED\n");
974
975 uint32_t grid_max_size;
976 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_SIZE,
977 &grid_max_size);
978 if (status == HSA_STATUS_SUCCESS)
979 GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_SIZE: %u\n", grid_max_size);
980 else
981 GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_SIZE: FAILED\n");
982
983 dump_hsa_regions (agent);
984
985 return HSA_STATUS_SUCCESS;
986}
987
988/* Forward reference. */
989
990static char *get_executable_symbol_name (hsa_executable_symbol_t symbol);
991
992/* Helper function for dump_executable_symbols. */
993
994static hsa_status_t
995dump_executable_symbol (hsa_executable_t executable,
996 hsa_executable_symbol_t symbol,
997 void *data __attribute__((unused)))
998{
999 char *name = get_executable_symbol_name (symbol);
1000
1001 if (name)
1002 {
1003 GCN_DEBUG ("executable symbol: %s\n", name);
1004 free (name);
1005 }
1006
1007 return HSA_STATUS_SUCCESS;
1008}
1009
1010/* Dump all global symbol in an executable. */
1011
1012static void
1013dump_executable_symbols (hsa_executable_t executable)
1014{
1015 hsa_status_t status;
1016 status
1017 = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
1018 dump_executable_symbol,
1019 NULL);
1020 if (status != HSA_STATUS_SUCCESS)
1021 hsa_fatal ("Could not dump HSA executable symbols", status);
1022}
1023
1024/* Dump kernel DISPATCH data structure and indent it by INDENT spaces. */
1025
1026static void
1027print_kernel_dispatch (struct kernel_dispatch *dispatch, unsigned indent)
1028{
1029 struct kernargs *kernargs = (struct kernargs *)dispatch->kernarg_address;
1030
1031 fprintf (stderr, "%*sthis: %p\n", indent, "", dispatch);
1032 fprintf (stderr, "%*squeue: %p\n", indent, "", dispatch->queue);
1033 fprintf (stderr, "%*skernarg_address: %p\n", indent, "", kernargs);
1034 fprintf (stderr, "%*sheap address: %p\n", indent, "",
1035 (void*)kernargs->heap_ptr);
1036 fprintf (stderr, "%*sarena address: %p\n", indent, "",
1037 (void*)kernargs->arena_ptr);
1038 fprintf (stderr, "%*sobject: %lu\n", indent, "", dispatch->object);
1039 fprintf (stderr, "%*sprivate_segment_size: %u\n", indent, "",
1040 dispatch->private_segment_size);
1041 fprintf (stderr, "%*sgroup_segment_size: %u\n", indent, "",
1042 dispatch->group_segment_size);
1043 fprintf (stderr, "\n");
1044}
1045
1046/* }}} */
1047/* {{{ Utility functions */
1048
1049/* Cast the thread local storage to gcn_thread. */
1050
1051static inline struct gcn_thread *
1052gcn_thread (void)
1053{
1054 return (struct gcn_thread *) GOMP_PLUGIN_acc_thread ();
1055}
1056
1057/* Initialize debug and suppress_host_fallback according to the environment. */
1058
1059static void
1060init_environment_variables (void)
1061{
1062 if (secure_getenv ("GCN_DEBUG"))
1063 debug = true;
1064 else
1065 debug = false;
1066
1067 if (secure_getenv ("GCN_SUPPRESS_HOST_FALLBACK"))
1068 suppress_host_fallback = true;
1069 else
1070 suppress_host_fallback = false;
1071
1072 hsa_runtime_lib = secure_getenv ("HSA_RUNTIME_LIB");
1073 if (hsa_runtime_lib == NULL)
7c1e856b 1074 hsa_runtime_lib = "libhsa-runtime64.so.1";
237957cc
AS
1075
1076 support_cpu_devices = secure_getenv ("GCN_SUPPORT_CPU_DEVICES");
1077
1078 const char *x = secure_getenv ("GCN_NUM_TEAMS");
1079 if (!x)
1080 x = secure_getenv ("GCN_NUM_GANGS");
1081 if (x)
1082 override_x_dim = atoi (x);
1083
1084 const char *z = secure_getenv ("GCN_NUM_THREADS");
1085 if (!z)
1086 z = secure_getenv ("GCN_NUM_WORKERS");
1087 if (z)
1088 override_z_dim = atoi (z);
1089
1090 const char *heap = secure_getenv ("GCN_HEAP_SIZE");
1091 if (heap)
1092 {
1093 size_t tmp = atol (heap);
1094 if (tmp)
1095 gcn_kernel_heap_size = tmp;
1096 }
1097}
1098
1099/* Return malloc'd string with name of SYMBOL. */
1100
1101static char *
1102get_executable_symbol_name (hsa_executable_symbol_t symbol)
1103{
1104 hsa_status_t status;
1105 char *res;
1106 uint32_t len;
1107 const hsa_executable_symbol_info_t info_name_length
1108 = HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH;
1109
1110 status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name_length,
1111 &len);
1112 if (status != HSA_STATUS_SUCCESS)
1113 {
1114 hsa_error ("Could not get length of symbol name", status);
1115 return NULL;
1116 }
1117
1118 res = GOMP_PLUGIN_malloc (len + 1);
1119
1120 const hsa_executable_symbol_info_t info_name
1121 = HSA_EXECUTABLE_SYMBOL_INFO_NAME;
1122
1123 status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name, res);
1124
1125 if (status != HSA_STATUS_SUCCESS)
1126 {
1127 hsa_error ("Could not get symbol name", status);
1128 free (res);
1129 return NULL;
1130 }
1131
1132 res[len] = '\0';
1133
1134 return res;
1135}
1136
237957cc
AS
1137/* Get the number of GPU Compute Units. */
1138
1139static int
1140get_cu_count (struct agent_info *agent)
1141{
1142 uint32_t cu_count;
1143 hsa_status_t status = hsa_fns.hsa_agent_get_info_fn
1144 (agent->id, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
1145 if (status == HSA_STATUS_SUCCESS)
1146 return cu_count;
1147 else
1148 return 64; /* The usual number for older devices. */
1149}
1150
1151/* Calculate the maximum grid size for OMP threads / OACC workers.
1152 This depends on the kernel's resource usage levels. */
1153
1154static int
1155limit_worker_threads (int threads)
1156{
1157 /* FIXME Do something more inteligent here.
1158 GCN can always run 4 threads within a Compute Unit, but
1159 more than that depends on register usage. */
1160 if (threads > 16)
1161 threads = 16;
1162 return threads;
1163}
1164
1165/* Parse the target attributes INPUT provided by the compiler and return true
1166 if we should run anything all. If INPUT is NULL, fill DEF with default
1167 values, then store INPUT or DEF into *RESULT.
1168
1169 This is used for OpenMP only. */
1170
1171static bool
1172parse_target_attributes (void **input,
1173 struct GOMP_kernel_launch_attributes *def,
1174 struct GOMP_kernel_launch_attributes **result,
1175 struct agent_info *agent)
1176{
1177 if (!input)
1178 GOMP_PLUGIN_fatal ("No target arguments provided");
1179
1180 bool grid_attrs_found = false;
1181 bool gcn_dims_found = false;
1182 int gcn_teams = 0;
1183 int gcn_threads = 0;
1184 while (*input)
1185 {
1186 intptr_t id = (intptr_t) *input++, val;
1187
1188 if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
1189 val = (intptr_t) *input++;
1190 else
1191 val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
1192
1193 val = (val > INT_MAX) ? INT_MAX : val;
1194
1195 if ((id & GOMP_TARGET_ARG_DEVICE_MASK) == GOMP_DEVICE_GCN
1196 && ((id & GOMP_TARGET_ARG_ID_MASK)
1197 == GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES))
1198 {
1199 grid_attrs_found = true;
1200 break;
1201 }
14e5e746
AS
1202 else if ((id & GOMP_TARGET_ARG_DEVICE_MASK)
1203 == GOMP_TARGET_ARG_DEVICE_ALL)
237957cc
AS
1204 {
1205 gcn_dims_found = true;
1206 switch (id & GOMP_TARGET_ARG_ID_MASK)
1207 {
1208 case GOMP_TARGET_ARG_NUM_TEAMS:
1209 gcn_teams = val;
1210 break;
1211 case GOMP_TARGET_ARG_THREAD_LIMIT:
1212 gcn_threads = limit_worker_threads (val);
1213 break;
1214 default:
1215 ;
1216 }
1217 }
1218 }
1219
1220 if (gcn_dims_found)
1221 {
a78b1ab1
KCY
1222 bool gfx900_workaround_p = false;
1223
7d593fd6
FH
1224 if (agent->device_isa == EF_AMDGPU_MACH_AMDGCN_GFX900
1225 && gcn_threads == 0 && override_z_dim == 0)
237957cc 1226 {
a78b1ab1 1227 gfx900_workaround_p = true;
237957cc 1228 GCN_WARNING ("VEGA BUG WORKAROUND: reducing default number of "
a78b1ab1 1229 "threads to at most 4 per team.\n");
237957cc
AS
1230 GCN_WARNING (" - If this is not a Vega 10 device, please use "
1231 "GCN_NUM_THREADS=16\n");
1232 }
1233
a78b1ab1
KCY
1234 /* Ideally, when a dimension isn't explicitly specified, we should
1235 tune it to run 40 (or 32?) threads per CU with no threads getting queued.
1236 In practice, we tune for peak performance on BabelStream, which
1237 for OpenACC is currently 32 threads per CU. */
237957cc 1238 def->ndim = 3;
a78b1ab1
KCY
1239 if (gcn_teams <= 0 && gcn_threads <= 0)
1240 {
1241 /* Set up a reasonable number of teams and threads. */
1242 gcn_threads = gfx900_workaround_p ? 4 : 16; // 8;
1243 def->gdims[0] = get_cu_count (agent); // * (40 / gcn_threads);
1244 def->gdims[2] = gcn_threads;
1245 }
1246 else if (gcn_teams <= 0 && gcn_threads > 0)
1247 {
1248 /* Auto-scale the number of teams with the number of threads. */
1249 def->gdims[0] = get_cu_count (agent); // * (40 / gcn_threads);
1250 def->gdims[2] = gcn_threads;
1251 }
1252 else if (gcn_teams > 0 && gcn_threads <= 0)
1253 {
1254 int max_threads = gfx900_workaround_p ? 4 : 16;
1255
1256 /* Auto-scale the number of threads with the number of teams. */
1257 def->gdims[0] = gcn_teams;
1258 def->gdims[2] = 16; // get_cu_count (agent) * 40 / gcn_teams;
1259 if (def->gdims[2] == 0)
1260 def->gdims[2] = 1;
1261 else if (def->gdims[2] > max_threads)
1262 def->gdims[2] = max_threads;
1263 }
1264 else
1265 {
1266 def->gdims[0] = gcn_teams;
1267 def->gdims[2] = gcn_threads;
1268 }
1269 def->gdims[1] = 64; /* Each thread is 64 work items wide. */
1270 def->wdims[0] = 1; /* Single team per work-group. */
237957cc
AS
1271 def->wdims[1] = 64;
1272 def->wdims[2] = 16;
1273 *result = def;
1274 return true;
1275 }
1276 else if (!grid_attrs_found)
1277 {
1278 def->ndim = 1;
1279 def->gdims[0] = 1;
1280 def->gdims[1] = 1;
1281 def->gdims[2] = 1;
1282 def->wdims[0] = 1;
1283 def->wdims[1] = 1;
1284 def->wdims[2] = 1;
1285 *result = def;
1286 GCN_WARNING ("GOMP_OFFLOAD_run called with no launch attributes\n");
1287 return true;
1288 }
1289
1290 struct GOMP_kernel_launch_attributes *kla;
1291 kla = (struct GOMP_kernel_launch_attributes *) *input;
1292 *result = kla;
1293 if (kla->ndim == 0 || kla->ndim > 3)
1294 GOMP_PLUGIN_fatal ("Invalid number of dimensions (%u)", kla->ndim);
1295
1296 GCN_DEBUG ("GOMP_OFFLOAD_run called with %u dimensions:\n", kla->ndim);
1297 unsigned i;
1298 for (i = 0; i < kla->ndim; i++)
1299 {
1300 GCN_DEBUG (" Dimension %u: grid size %u and group size %u\n", i,
1301 kla->gdims[i], kla->wdims[i]);
1302 if (kla->gdims[i] == 0)
1303 return false;
1304 }
1305 return true;
1306}
1307
1308/* Return the group size given the requested GROUP size, GRID size and number
1309 of grid dimensions NDIM. */
1310
1311static uint32_t
1312get_group_size (uint32_t ndim, uint32_t grid, uint32_t group)
1313{
1314 if (group == 0)
1315 {
1316 /* TODO: Provide a default via environment or device characteristics. */
1317 if (ndim == 1)
1318 group = 64;
1319 else if (ndim == 2)
1320 group = 8;
1321 else
1322 group = 4;
1323 }
1324
1325 if (group > grid)
1326 group = grid;
1327 return group;
1328}
1329
1330/* Atomically store pair of uint16_t values (HEADER and REST) to a PACKET. */
1331
1332static void
1333packet_store_release (uint32_t* packet, uint16_t header, uint16_t rest)
1334{
1335 __atomic_store_n (packet, header | (rest << 16), __ATOMIC_RELEASE);
1336}
1337
1338/* A never-called callback for the HSA command queues. These signal events
1339 that we don't use, so we trigger an error.
1340
1341 This "queue" is not to be confused with the async queues, below. */
1342
1343static void
1344hsa_queue_callback (hsa_status_t status,
1345 hsa_queue_t *queue __attribute__ ((unused)),
1346 void *data __attribute__ ((unused)))
1347{
1348 hsa_fatal ("Asynchronous queue error", status);
1349}
1350
1351/* }}} */
1352/* {{{ HSA initialization */
1353
1354/* Populate hsa_fns with the function addresses from libhsa-runtime64.so. */
1355
1356static bool
1357init_hsa_runtime_functions (void)
1358{
1359#define DLSYM_FN(function) \
1360 hsa_fns.function##_fn = dlsym (handle, #function); \
1361 if (hsa_fns.function##_fn == NULL) \
1362 return false;
1363 void *handle = dlopen (hsa_runtime_lib, RTLD_LAZY);
1364 if (handle == NULL)
1365 return false;
1366
1367 DLSYM_FN (hsa_status_string)
1368 DLSYM_FN (hsa_system_get_info)
1369 DLSYM_FN (hsa_agent_get_info)
1370 DLSYM_FN (hsa_init)
1371 DLSYM_FN (hsa_iterate_agents)
1372 DLSYM_FN (hsa_region_get_info)
1373 DLSYM_FN (hsa_queue_create)
1374 DLSYM_FN (hsa_agent_iterate_regions)
1375 DLSYM_FN (hsa_executable_destroy)
1376 DLSYM_FN (hsa_executable_create)
1377 DLSYM_FN (hsa_executable_global_variable_define)
1378 DLSYM_FN (hsa_executable_load_code_object)
1379 DLSYM_FN (hsa_executable_freeze)
1380 DLSYM_FN (hsa_signal_create)
1381 DLSYM_FN (hsa_memory_allocate)
1382 DLSYM_FN (hsa_memory_assign_agent)
1383 DLSYM_FN (hsa_memory_copy)
1384 DLSYM_FN (hsa_memory_free)
1385 DLSYM_FN (hsa_signal_destroy)
1386 DLSYM_FN (hsa_executable_get_symbol)
1387 DLSYM_FN (hsa_executable_symbol_get_info)
1388 DLSYM_FN (hsa_executable_iterate_symbols)
1389 DLSYM_FN (hsa_queue_add_write_index_release)
1390 DLSYM_FN (hsa_queue_load_read_index_acquire)
1391 DLSYM_FN (hsa_signal_wait_acquire)
1392 DLSYM_FN (hsa_signal_store_relaxed)
1393 DLSYM_FN (hsa_signal_store_release)
1394 DLSYM_FN (hsa_signal_load_acquire)
1395 DLSYM_FN (hsa_queue_destroy)
1396 DLSYM_FN (hsa_code_object_deserialize)
1397 return true;
1398#undef DLSYM_FN
1399}
1400
1401/* Return true if the agent is a GPU and can accept of concurrent submissions
1402 from different threads. */
1403
1404static bool
1405suitable_hsa_agent_p (hsa_agent_t agent)
1406{
1407 hsa_device_type_t device_type;
1408 hsa_status_t status
1409 = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
1410 &device_type);
1411 if (status != HSA_STATUS_SUCCESS)
1412 return false;
1413
1414 switch (device_type)
1415 {
1416 case HSA_DEVICE_TYPE_GPU:
1417 break;
1418 case HSA_DEVICE_TYPE_CPU:
1419 if (!support_cpu_devices)
1420 return false;
1421 break;
1422 default:
1423 return false;
1424 }
1425
1426 uint32_t features = 0;
1427 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_FEATURE,
1428 &features);
1429 if (status != HSA_STATUS_SUCCESS
1430 || !(features & HSA_AGENT_FEATURE_KERNEL_DISPATCH))
1431 return false;
1432 hsa_queue_type_t queue_type;
1433 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_QUEUE_TYPE,
1434 &queue_type);
1435 if (status != HSA_STATUS_SUCCESS
1436 || (queue_type != HSA_QUEUE_TYPE_MULTI))
1437 return false;
1438
1439 return true;
1440}
1441
1442/* Callback of hsa_iterate_agents; if AGENT is a GPU device, increment
1443 agent_count in hsa_context. */
1444
1445static hsa_status_t
1446count_gpu_agents (hsa_agent_t agent, void *data __attribute__ ((unused)))
1447{
1448 if (suitable_hsa_agent_p (agent))
1449 hsa_context.agent_count++;
1450 return HSA_STATUS_SUCCESS;
1451}
1452
1453/* Callback of hsa_iterate_agents; if AGENT is a GPU device, assign the agent
1454 id to the describing structure in the hsa context. The index of the
1455 structure is pointed to by DATA, increment it afterwards. */
1456
1457static hsa_status_t
1458assign_agent_ids (hsa_agent_t agent, void *data)
1459{
1460 if (suitable_hsa_agent_p (agent))
1461 {
1462 int *agent_index = (int *) data;
1463 hsa_context.agents[*agent_index].id = agent;
1464 ++*agent_index;
1465 }
1466 return HSA_STATUS_SUCCESS;
1467}
1468
1469/* Initialize hsa_context if it has not already been done.
1470 Return TRUE on success. */
1471
1472static bool
1473init_hsa_context (void)
1474{
1475 hsa_status_t status;
1476 int agent_index = 0;
1477
1478 if (hsa_context.initialized)
1479 return true;
1480 init_environment_variables ();
1481 if (!init_hsa_runtime_functions ())
1482 {
1483 GCN_WARNING ("Run-time could not be dynamically opened\n");
1484 if (suppress_host_fallback)
1485 GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
1486 return false;
1487 }
1488 status = hsa_fns.hsa_init_fn ();
1489 if (status != HSA_STATUS_SUCCESS)
1490 return hsa_error ("Run-time could not be initialized", status);
1491 GCN_DEBUG ("HSA run-time initialized for GCN\n");
1492
1493 if (debug)
1494 dump_hsa_system_info ();
1495
1496 status = hsa_fns.hsa_iterate_agents_fn (count_gpu_agents, NULL);
1497 if (status != HSA_STATUS_SUCCESS)
1498 return hsa_error ("GCN GPU devices could not be enumerated", status);
1499 GCN_DEBUG ("There are %i GCN GPU devices.\n", hsa_context.agent_count);
1500
1501 hsa_context.agents
1502 = GOMP_PLUGIN_malloc_cleared (hsa_context.agent_count
1503 * sizeof (struct agent_info));
1504 status = hsa_fns.hsa_iterate_agents_fn (assign_agent_ids, &agent_index);
966de09b
AS
1505 if (status != HSA_STATUS_SUCCESS)
1506 return hsa_error ("Scanning compute agents failed", status);
237957cc
AS
1507 if (agent_index != hsa_context.agent_count)
1508 {
1509 GOMP_PLUGIN_error ("Failed to assign IDs to all GCN agents");
1510 return false;
1511 }
1512
1513 if (debug)
1514 {
1515 status = hsa_fns.hsa_iterate_agents_fn (dump_hsa_agent_info, NULL);
1516 if (status != HSA_STATUS_SUCCESS)
1517 GOMP_PLUGIN_error ("Failed to list all HSA runtime agents");
1518 }
1519
2e5ea579
FH
1520 uint16_t minor, major;
1521 status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MINOR,
1522 &minor);
1523 if (status != HSA_STATUS_SUCCESS)
1524 GOMP_PLUGIN_error ("Failed to obtain HSA runtime minor version");
1525 status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MAJOR,
1526 &major);
1527 if (status != HSA_STATUS_SUCCESS)
1528 GOMP_PLUGIN_error ("Failed to obtain HSA runtime major version");
1529
1530 size_t len = sizeof hsa_context.driver_version_s;
1531 int printed = snprintf (hsa_context.driver_version_s, len,
1532 "HSA Runtime %hu.%hu", (unsigned short int)major,
1533 (unsigned short int)minor);
1534 if (printed >= len)
1535 GCN_WARNING ("HSA runtime version string was truncated."
1536 "Version %hu.%hu is too long.", (unsigned short int)major,
1537 (unsigned short int)minor);
1538
237957cc
AS
1539 hsa_context.initialized = true;
1540 return true;
1541}
1542
1543/* Verify that hsa_context has already been initialized and return the
1544 agent_info structure describing device number N. Return NULL on error. */
1545
1546static struct agent_info *
1547get_agent_info (int n)
1548{
1549 if (!hsa_context.initialized)
1550 {
1551 GOMP_PLUGIN_error ("Attempt to use uninitialized GCN context.");
1552 return NULL;
1553 }
1554 if (n >= hsa_context.agent_count)
1555 {
1556 GOMP_PLUGIN_error ("Request to operate on non-existent GCN device %i", n);
1557 return NULL;
1558 }
1559 if (!hsa_context.agents[n].initialized)
1560 {
1561 GOMP_PLUGIN_error ("Attempt to use an uninitialized GCN agent.");
1562 return NULL;
1563 }
1564 return &hsa_context.agents[n];
1565}
1566
1567/* Callback of hsa_agent_iterate_regions, via get_*_memory_region functions.
1568
1569 Selects (breaks at) a suitable region of type KIND. */
1570
1571static hsa_status_t
1572get_memory_region (hsa_region_t region, hsa_region_t *retval,
1573 hsa_region_global_flag_t kind)
1574{
1575 hsa_status_t status;
1576 hsa_region_segment_t segment;
1577
1578 status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
1579 &segment);
1580 if (status != HSA_STATUS_SUCCESS)
1581 return status;
1582 if (segment != HSA_REGION_SEGMENT_GLOBAL)
1583 return HSA_STATUS_SUCCESS;
1584
1585 uint32_t flags;
1586 status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
1587 &flags);
1588 if (status != HSA_STATUS_SUCCESS)
1589 return status;
1590 if (flags & kind)
1591 {
1592 *retval = region;
1593 return HSA_STATUS_INFO_BREAK;
1594 }
1595 return HSA_STATUS_SUCCESS;
1596}
1597
1598/* Callback of hsa_agent_iterate_regions.
1599
1600 Selects a kernargs memory region. */
1601
1602static hsa_status_t
1603get_kernarg_memory_region (hsa_region_t region, void *data)
1604{
1605 return get_memory_region (region, (hsa_region_t *)data,
1606 HSA_REGION_GLOBAL_FLAG_KERNARG);
1607}
1608
1609/* Callback of hsa_agent_iterate_regions.
1610
1611 Selects a coarse-grained memory region suitable for the heap and
1612 offload data. */
1613
1614static hsa_status_t
1615get_data_memory_region (hsa_region_t region, void *data)
1616{
1617 return get_memory_region (region, (hsa_region_t *)data,
1618 HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED);
1619}
1620
7d593fd6
FH
1621static int
1622elf_gcn_isa_field (Elf64_Ehdr *image)
1623{
1624 return image->e_flags & EF_AMDGPU_MACH_MASK;
1625}
1626
7d593fd6
FH
1627const static char *gcn_gfx803_s = "gfx803";
1628const static char *gcn_gfx900_s = "gfx900";
1629const static char *gcn_gfx906_s = "gfx906";
3535402e 1630const static char *gcn_gfx908_s = "gfx908";
7d593fd6
FH
1631const static int gcn_isa_name_len = 6;
1632
1633/* Returns the name that the HSA runtime uses for the ISA or NULL if we do not
1634 support the ISA. */
1635
1636static const char*
1637isa_hsa_name (int isa) {
1638 switch(isa)
1639 {
7d593fd6
FH
1640 case EF_AMDGPU_MACH_AMDGCN_GFX803:
1641 return gcn_gfx803_s;
1642 case EF_AMDGPU_MACH_AMDGCN_GFX900:
1643 return gcn_gfx900_s;
1644 case EF_AMDGPU_MACH_AMDGCN_GFX906:
1645 return gcn_gfx906_s;
3535402e
AS
1646 case EF_AMDGPU_MACH_AMDGCN_GFX908:
1647 return gcn_gfx908_s;
7d593fd6
FH
1648 }
1649 return NULL;
1650}
1651
1652/* Returns the user-facing name that GCC uses to identify the architecture (e.g.
1653 with -march) or NULL if we do not support the ISA.
1654 Keep in sync with /gcc/config/gcn/gcn.{c,opt}. */
1655
1656static const char*
1657isa_gcc_name (int isa) {
1658 switch(isa)
1659 {
7d593fd6
FH
1660 case EF_AMDGPU_MACH_AMDGCN_GFX803:
1661 return "fiji";
1662 default:
1663 return isa_hsa_name (isa);
1664 }
1665}
1666
1667/* Returns the code which is used in the GCN object code to identify the ISA with
1668 the given name (as used by the HSA runtime). */
1669
1670static gcn_isa
1671isa_code(const char *isa) {
7d593fd6
FH
1672 if (!strncmp (isa, gcn_gfx803_s, gcn_isa_name_len))
1673 return EF_AMDGPU_MACH_AMDGCN_GFX803;
1674
1675 if (!strncmp (isa, gcn_gfx900_s, gcn_isa_name_len))
1676 return EF_AMDGPU_MACH_AMDGCN_GFX900;
1677
1678 if (!strncmp (isa, gcn_gfx906_s, gcn_isa_name_len))
1679 return EF_AMDGPU_MACH_AMDGCN_GFX906;
1680
3535402e
AS
1681 if (!strncmp (isa, gcn_gfx908_s, gcn_isa_name_len))
1682 return EF_AMDGPU_MACH_AMDGCN_GFX908;
1683
7d593fd6
FH
1684 return -1;
1685}
1686
237957cc
AS
1687/* }}} */
1688/* {{{ Run */
1689
1690/* Create or reuse a team arena.
1691
1692 Team arenas are used by OpenMP to avoid calling malloc multiple times
1693 while setting up each team. This is purely a performance optimization.
1694
1695 Allocating an arena also costs performance, albeit on the host side, so
1696 this function will reuse an existing arena if a large enough one is idle.
1697 The arena is released, but not deallocated, when the kernel exits. */
1698
1699static void *
1700get_team_arena (struct agent_info *agent, int num_teams)
1701{
1702 struct team_arena_list **next_ptr = &agent->team_arena_list;
1703 struct team_arena_list *item;
1704
1705 for (item = *next_ptr; item; next_ptr = &item->next, item = item->next)
1706 {
1707 if (item->num_teams < num_teams)
1708 continue;
1709
1710 if (pthread_mutex_trylock (&item->in_use))
1711 continue;
1712
1713 return item->arena;
1714 }
1715
1716 GCN_DEBUG ("Creating a new arena for %d teams\n", num_teams);
1717
1718 if (pthread_mutex_lock (&agent->team_arena_write_lock))
1719 {
1720 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
1721 return false;
1722 }
1723 item = malloc (sizeof (*item));
1724 item->num_teams = num_teams;
1725 item->next = NULL;
1726 *next_ptr = item;
1727
1728 if (pthread_mutex_init (&item->in_use, NULL))
1729 {
1730 GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
1731 return false;
1732 }
1733 if (pthread_mutex_lock (&item->in_use))
1734 {
1735 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
1736 return false;
1737 }
1738 if (pthread_mutex_unlock (&agent->team_arena_write_lock))
1739 {
1740 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1741 return false;
1742 }
1743
1744 const int TEAM_ARENA_SIZE = 64*1024; /* Must match libgomp.h. */
1745 hsa_status_t status;
1746 status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
1747 TEAM_ARENA_SIZE*num_teams,
1748 &item->arena);
1749 if (status != HSA_STATUS_SUCCESS)
1750 hsa_fatal ("Could not allocate memory for GCN kernel arena", status);
1751 status = hsa_fns.hsa_memory_assign_agent_fn (item->arena, agent->id,
1752 HSA_ACCESS_PERMISSION_RW);
1753 if (status != HSA_STATUS_SUCCESS)
1754 hsa_fatal ("Could not assign arena memory to device", status);
1755
1756 return item->arena;
1757}
1758
1759/* Mark a team arena available for reuse. */
1760
1761static void
1762release_team_arena (struct agent_info* agent, void *arena)
1763{
1764 struct team_arena_list *item;
1765
1766 for (item = agent->team_arena_list; item; item = item->next)
1767 {
1768 if (item->arena == arena)
1769 {
1770 if (pthread_mutex_unlock (&item->in_use))
1771 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1772 return;
1773 }
1774 }
1775 GOMP_PLUGIN_error ("Could not find a GCN arena to release.");
1776}
1777
1778/* Clean up all the allocated team arenas. */
1779
1780static bool
1781destroy_team_arenas (struct agent_info *agent)
1782{
1783 struct team_arena_list *item, *next;
1784
1785 for (item = agent->team_arena_list; item; item = next)
1786 {
1787 next = item->next;
1788 hsa_fns.hsa_memory_free_fn (item->arena);
1789 if (pthread_mutex_destroy (&item->in_use))
1790 {
1791 GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex");
1792 return false;
1793 }
1794 free (item);
1795 }
1796 agent->team_arena_list = NULL;
1797
1798 return true;
1799}
1800
1801/* Allocate memory on a specified device. */
1802
1803static void *
1804alloc_by_agent (struct agent_info *agent, size_t size)
1805{
1806 GCN_DEBUG ("Allocating %zu bytes on device %d\n", size, agent->device_id);
1807
1808 /* Zero-size allocations are invalid, so in order to return a valid pointer
1809 we need to pass a valid size. One source of zero-size allocations is
1810 kernargs for kernels that have no inputs or outputs (the kernel may
1811 only use console output, for example). */
1812 if (size == 0)
1813 size = 4;
1814
1815 void *ptr;
1816 hsa_status_t status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
1817 size, &ptr);
1818 if (status != HSA_STATUS_SUCCESS)
1819 {
1820 hsa_error ("Could not allocate device memory", status);
1821 return NULL;
1822 }
1823
1824 status = hsa_fns.hsa_memory_assign_agent_fn (ptr, agent->id,
1825 HSA_ACCESS_PERMISSION_RW);
1826 if (status != HSA_STATUS_SUCCESS)
1827 {
1828 hsa_error ("Could not assign data memory to device", status);
1829 return NULL;
1830 }
1831
1832 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1833 bool profiling_dispatch_p
1834 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1835 if (profiling_dispatch_p)
1836 {
1837 acc_prof_info *prof_info = thr->prof_info;
1838 acc_event_info data_event_info;
1839 acc_api_info *api_info = thr->api_info;
1840
1841 prof_info->event_type = acc_ev_alloc;
1842
1843 data_event_info.data_event.event_type = prof_info->event_type;
1844 data_event_info.data_event.valid_bytes
1845 = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1846 data_event_info.data_event.parent_construct
1847 = acc_construct_parallel;
1848 data_event_info.data_event.implicit = 1;
1849 data_event_info.data_event.tool_info = NULL;
1850 data_event_info.data_event.var_name = NULL;
1851 data_event_info.data_event.bytes = size;
1852 data_event_info.data_event.host_ptr = NULL;
1853 data_event_info.data_event.device_ptr = (void *) ptr;
1854
1855 api_info->device_api = acc_device_api_other;
1856
1857 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1858 api_info);
1859 }
1860
1861 return ptr;
1862}
1863
1864/* Create kernel dispatch data structure for given KERNEL, along with
1865 the necessary device signals and memory allocations. */
1866
1867static struct kernel_dispatch *
1868create_kernel_dispatch (struct kernel_info *kernel, int num_teams)
1869{
1870 struct agent_info *agent = kernel->agent;
1871 struct kernel_dispatch *shadow
1872 = GOMP_PLUGIN_malloc_cleared (sizeof (struct kernel_dispatch));
1873
1874 shadow->agent = kernel->agent;
1875 shadow->object = kernel->object;
1876
1877 hsa_signal_t sync_signal;
1878 hsa_status_t status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &sync_signal);
1879 if (status != HSA_STATUS_SUCCESS)
1880 hsa_fatal ("Error creating the GCN sync signal", status);
1881
1882 shadow->signal = sync_signal.handle;
1883 shadow->private_segment_size = kernel->private_segment_size;
1884 shadow->group_segment_size = kernel->group_segment_size;
1885
1886 /* We expect kernels to request a single pointer, explicitly, and the
1887 rest of struct kernargs, implicitly. If they request anything else
1888 then something is wrong. */
1889 if (kernel->kernarg_segment_size > 8)
1890 {
1891 GOMP_PLUGIN_fatal ("Unexpectedly large kernargs segment requested");
1892 return NULL;
1893 }
1894
1895 status = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region,
1896 sizeof (struct kernargs),
1897 &shadow->kernarg_address);
1898 if (status != HSA_STATUS_SUCCESS)
1899 hsa_fatal ("Could not allocate memory for GCN kernel arguments", status);
1900 struct kernargs *kernargs = shadow->kernarg_address;
1901
1902 /* Zero-initialize the output_data (minimum needed). */
1903 kernargs->out_ptr = (int64_t)&kernargs->output_data;
1904 kernargs->output_data.next_output = 0;
1905 for (unsigned i = 0;
1906 i < (sizeof (kernargs->output_data.queue)
1907 / sizeof (kernargs->output_data.queue[0]));
1908 i++)
1909 kernargs->output_data.queue[i].written = 0;
1910 kernargs->output_data.consumed = 0;
1911
1912 /* Pass in the heap location. */
1913 kernargs->heap_ptr = (int64_t)kernel->module->heap;
1914
1915 /* Create an arena. */
1916 if (kernel->kind == KIND_OPENMP)
1917 kernargs->arena_ptr = (int64_t)get_team_arena (agent, num_teams);
1918 else
1919 kernargs->arena_ptr = 0;
1920
1921 /* Ensure we can recognize unset return values. */
1922 kernargs->output_data.return_value = 0xcafe0000;
1923
1924 return shadow;
1925}
1926
1927/* Output any data written to console output from the kernel. It is expected
1928 that this function is polled during kernel execution.
1929
1930 We print all entries from the last item printed to the next entry without
1931 a "written" flag. If the "final" flag is set then it'll continue right to
1932 the end.
1933
1934 The print buffer is circular, but the from and to locations don't wrap when
1935 the buffer does, so the output limit is UINT_MAX. The target blocks on
1936 output when the buffer is full. */
1937
1938static void
1939console_output (struct kernel_info *kernel, struct kernargs *kernargs,
1940 bool final)
1941{
1942 unsigned int limit = (sizeof (kernargs->output_data.queue)
1943 / sizeof (kernargs->output_data.queue[0]));
1944
1945 unsigned int from = __atomic_load_n (&kernargs->output_data.consumed,
1946 __ATOMIC_ACQUIRE);
1947 unsigned int to = kernargs->output_data.next_output;
1948
1949 if (from > to)
1950 {
1951 /* Overflow. */
1952 if (final)
1953 printf ("GCN print buffer overflowed.\n");
1954 return;
1955 }
1956
1957 unsigned int i;
1958 for (i = from; i < to; i++)
1959 {
1960 struct printf_data *data = &kernargs->output_data.queue[i%limit];
1961
1962 if (!data->written && !final)
1963 break;
1964
1965 switch (data->type)
1966 {
1967 case 0: printf ("%.128s%ld\n", data->msg, data->ivalue); break;
1968 case 1: printf ("%.128s%f\n", data->msg, data->dvalue); break;
1969 case 2: printf ("%.128s%.128s\n", data->msg, data->text); break;
1970 case 3: printf ("%.128s%.128s", data->msg, data->text); break;
1971 default: printf ("GCN print buffer error!\n"); break;
1972 }
1973 data->written = 0;
1974 __atomic_store_n (&kernargs->output_data.consumed, i+1,
1975 __ATOMIC_RELEASE);
1976 }
1977 fflush (stdout);
1978}
1979
1980/* Release data structure created for a kernel dispatch in SHADOW argument,
1981 and clean up the signal and memory allocations. */
1982
1983static void
1984release_kernel_dispatch (struct kernel_dispatch *shadow)
1985{
1986 GCN_DEBUG ("Released kernel dispatch: %p\n", shadow);
1987
1988 struct kernargs *kernargs = shadow->kernarg_address;
1989 void *arena = (void *)kernargs->arena_ptr;
1990 if (arena)
1991 release_team_arena (shadow->agent, arena);
1992
1993 hsa_fns.hsa_memory_free_fn (shadow->kernarg_address);
1994
1995 hsa_signal_t s;
1996 s.handle = shadow->signal;
1997 hsa_fns.hsa_signal_destroy_fn (s);
1998
1999 free (shadow);
2000}
2001
2002/* Extract the properties from a kernel binary. */
2003
2004static void
2005init_kernel_properties (struct kernel_info *kernel)
2006{
2007 hsa_status_t status;
2008 struct agent_info *agent = kernel->agent;
2009 hsa_executable_symbol_t kernel_symbol;
f062c3f1
AS
2010 char *buf = alloca (strlen (kernel->name) + 4);
2011 sprintf (buf, "%s.kd", kernel->name);
237957cc 2012 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
f062c3f1 2013 buf, agent->id,
237957cc
AS
2014 0, &kernel_symbol);
2015 if (status != HSA_STATUS_SUCCESS)
2016 {
2017 hsa_warn ("Could not find symbol for kernel in the code object", status);
f062c3f1 2018 fprintf (stderr, "not found name: '%s'\n", buf);
237957cc
AS
2019 dump_executable_symbols (agent->executable);
2020 goto failure;
2021 }
2022 GCN_DEBUG ("Located kernel %s\n", kernel->name);
2023 status = hsa_fns.hsa_executable_symbol_get_info_fn
2024 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel->object);
2025 if (status != HSA_STATUS_SUCCESS)
2026 hsa_fatal ("Could not extract a kernel object from its symbol", status);
2027 status = hsa_fns.hsa_executable_symbol_get_info_fn
2028 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
2029 &kernel->kernarg_segment_size);
2030 if (status != HSA_STATUS_SUCCESS)
2031 hsa_fatal ("Could not get info about kernel argument size", status);
2032 status = hsa_fns.hsa_executable_symbol_get_info_fn
2033 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
2034 &kernel->group_segment_size);
2035 if (status != HSA_STATUS_SUCCESS)
2036 hsa_fatal ("Could not get info about kernel group segment size", status);
2037 status = hsa_fns.hsa_executable_symbol_get_info_fn
2038 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
2039 &kernel->private_segment_size);
2040 if (status != HSA_STATUS_SUCCESS)
2041 hsa_fatal ("Could not get info about kernel private segment size",
2042 status);
2043
2044 /* The kernel type is not known until something tries to launch it. */
2045 kernel->kind = KIND_UNKNOWN;
2046
2047 GCN_DEBUG ("Kernel structure for %s fully initialized with "
2048 "following segment sizes: \n", kernel->name);
2049 GCN_DEBUG (" group_segment_size: %u\n",
2050 (unsigned) kernel->group_segment_size);
2051 GCN_DEBUG (" private_segment_size: %u\n",
2052 (unsigned) kernel->private_segment_size);
2053 GCN_DEBUG (" kernarg_segment_size: %u\n",
2054 (unsigned) kernel->kernarg_segment_size);
2055 return;
2056
2057failure:
2058 kernel->initialization_failed = true;
2059}
2060
2061/* Do all the work that is necessary before running KERNEL for the first time.
2062 The function assumes the program has been created, finalized and frozen by
2063 create_and_finalize_hsa_program. */
2064
2065static void
2066init_kernel (struct kernel_info *kernel)
2067{
2068 if (pthread_mutex_lock (&kernel->init_mutex))
2069 GOMP_PLUGIN_fatal ("Could not lock a GCN kernel initialization mutex");
2070 if (kernel->initialized)
2071 {
2072 if (pthread_mutex_unlock (&kernel->init_mutex))
2073 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2074 "mutex");
2075
2076 return;
2077 }
2078
2079 init_kernel_properties (kernel);
2080
2081 if (!kernel->initialization_failed)
2082 {
2083 GCN_DEBUG ("\n");
2084
2085 kernel->initialized = true;
2086 }
2087 if (pthread_mutex_unlock (&kernel->init_mutex))
2088 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2089 "mutex");
2090}
2091
2092/* Run KERNEL on its agent, pass VARS to it as arguments and take
2093 launch attributes from KLA.
2094
2095 MODULE_LOCKED indicates that the caller already holds the lock and
2096 run_kernel need not lock it again.
2097 If AQ is NULL then agent->sync_queue will be used. */
2098
2099static void
2100run_kernel (struct kernel_info *kernel, void *vars,
2101 struct GOMP_kernel_launch_attributes *kla,
2102 struct goacc_asyncqueue *aq, bool module_locked)
2103{
5a28e272
KCY
2104 GCN_DEBUG ("SGPRs: %d, VGPRs: %d\n", kernel->description->sgpr_count,
2105 kernel->description->vpgr_count);
2106
2107 /* Reduce the number of threads/workers if there are insufficient
2108 VGPRs available to run the kernels together. */
2109 if (kla->ndim == 3 && kernel->description->vpgr_count > 0)
2110 {
2111 int granulated_vgprs = (kernel->description->vpgr_count + 3) & ~3;
2112 int max_threads = (256 / granulated_vgprs) * 4;
2113 if (kla->gdims[2] > max_threads)
2114 {
2115 GCN_WARNING ("Too many VGPRs required to support %d threads/workers"
2116 " per team/gang - reducing to %d threads/workers.\n",
2117 kla->gdims[2], max_threads);
2118 kla->gdims[2] = max_threads;
2119 }
2120 }
2121
237957cc
AS
2122 GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel->agent->device_id,
2123 (aq ? aq->id : 0));
2124 GCN_DEBUG ("GCN launch attribs: gdims:[");
2125 int i;
2126 for (i = 0; i < kla->ndim; ++i)
2127 {
2128 if (i)
2129 DEBUG_PRINT (", ");
2130 DEBUG_PRINT ("%u", kla->gdims[i]);
2131 }
2132 DEBUG_PRINT ("], normalized gdims:[");
2133 for (i = 0; i < kla->ndim; ++i)
2134 {
2135 if (i)
2136 DEBUG_PRINT (", ");
2137 DEBUG_PRINT ("%u", kla->gdims[i] / kla->wdims[i]);
2138 }
2139 DEBUG_PRINT ("], wdims:[");
2140 for (i = 0; i < kla->ndim; ++i)
2141 {
2142 if (i)
2143 DEBUG_PRINT (", ");
2144 DEBUG_PRINT ("%u", kla->wdims[i]);
2145 }
2146 DEBUG_PRINT ("]\n");
2147 DEBUG_FLUSH ();
2148
2149 struct agent_info *agent = kernel->agent;
2150 if (!module_locked && pthread_rwlock_rdlock (&agent->module_rwlock))
2151 GOMP_PLUGIN_fatal ("Unable to read-lock a GCN agent rwlock");
2152
2153 if (!agent->initialized)
2154 GOMP_PLUGIN_fatal ("Agent must be initialized");
2155
2156 if (!kernel->initialized)
2157 GOMP_PLUGIN_fatal ("Called kernel must be initialized");
2158
2159 hsa_queue_t *command_q = (aq ? aq->hsa_queue : kernel->agent->sync_queue);
2160
2161 uint64_t index
2162 = hsa_fns.hsa_queue_add_write_index_release_fn (command_q, 1);
2163 GCN_DEBUG ("Got AQL index %llu\n", (long long int) index);
2164
2165 /* Wait until the queue is not full before writing the packet. */
2166 while (index - hsa_fns.hsa_queue_load_read_index_acquire_fn (command_q)
2167 >= command_q->size)
2168 ;
2169
2170 /* Do not allow the dimensions to be overridden when running
2171 constructors or destructors. */
2172 int override_x = kernel->kind == KIND_UNKNOWN ? 0 : override_x_dim;
2173 int override_z = kernel->kind == KIND_UNKNOWN ? 0 : override_z_dim;
2174
2175 hsa_kernel_dispatch_packet_t *packet;
2176 packet = ((hsa_kernel_dispatch_packet_t *) command_q->base_address)
2177 + index % command_q->size;
2178
2179 memset (((uint8_t *) packet) + 4, 0, sizeof (*packet) - 4);
2180 packet->grid_size_x = override_x ? : kla->gdims[0];
2181 packet->workgroup_size_x = get_group_size (kla->ndim,
2182 packet->grid_size_x,
2183 kla->wdims[0]);
2184
2185 if (kla->ndim >= 2)
2186 {
2187 packet->grid_size_y = kla->gdims[1];
2188 packet->workgroup_size_y = get_group_size (kla->ndim, kla->gdims[1],
2189 kla->wdims[1]);
2190 }
2191 else
2192 {
2193 packet->grid_size_y = 1;
2194 packet->workgroup_size_y = 1;
2195 }
2196
2197 if (kla->ndim == 3)
2198 {
2199 packet->grid_size_z = limit_worker_threads (override_z
2200 ? : kla->gdims[2]);
2201 packet->workgroup_size_z = get_group_size (kla->ndim,
2202 packet->grid_size_z,
2203 kla->wdims[2]);
2204 }
2205 else
2206 {
2207 packet->grid_size_z = 1;
2208 packet->workgroup_size_z = 1;
2209 }
2210
2211 GCN_DEBUG ("GCN launch actuals: grid:[%u, %u, %u],"
2212 " normalized grid:[%u, %u, %u], workgroup:[%u, %u, %u]\n",
2213 packet->grid_size_x, packet->grid_size_y, packet->grid_size_z,
2214 packet->grid_size_x / packet->workgroup_size_x,
2215 packet->grid_size_y / packet->workgroup_size_y,
2216 packet->grid_size_z / packet->workgroup_size_z,
2217 packet->workgroup_size_x, packet->workgroup_size_y,
2218 packet->workgroup_size_z);
2219
2220 struct kernel_dispatch *shadow
2221 = create_kernel_dispatch (kernel, packet->grid_size_x);
2222 shadow->queue = command_q;
2223
2224 if (debug)
2225 {
2226 fprintf (stderr, "\nKernel has following dependencies:\n");
2227 print_kernel_dispatch (shadow, 2);
2228 }
2229
2230 packet->private_segment_size = kernel->private_segment_size;
2231 packet->group_segment_size = kernel->group_segment_size;
2232 packet->kernel_object = kernel->object;
2233 packet->kernarg_address = shadow->kernarg_address;
2234 hsa_signal_t s;
2235 s.handle = shadow->signal;
2236 packet->completion_signal = s;
2237 hsa_fns.hsa_signal_store_relaxed_fn (s, 1);
2238 memcpy (shadow->kernarg_address, &vars, sizeof (vars));
2239
2240 GCN_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");
2241
2242 uint16_t header;
2243 header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
2244 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
2245 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
2246
2247 GCN_DEBUG ("Going to dispatch kernel %s on device %d\n", kernel->name,
2248 agent->device_id);
2249
2250 packet_store_release ((uint32_t *) packet, header,
2251 (uint16_t) kla->ndim
2252 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);
2253
2254 hsa_fns.hsa_signal_store_release_fn (command_q->doorbell_signal,
2255 index);
2256
2257 GCN_DEBUG ("Kernel dispatched, waiting for completion\n");
2258
2259 /* Root signal waits with 1ms timeout. */
2260 while (hsa_fns.hsa_signal_wait_acquire_fn (s, HSA_SIGNAL_CONDITION_LT, 1,
2261 1000 * 1000,
2262 HSA_WAIT_STATE_BLOCKED) != 0)
2263 {
2264 console_output (kernel, shadow->kernarg_address, false);
2265 }
2266 console_output (kernel, shadow->kernarg_address, true);
2267
2268 struct kernargs *kernargs = shadow->kernarg_address;
2269 unsigned int return_value = (unsigned int)kernargs->output_data.return_value;
2270
2271 release_kernel_dispatch (shadow);
2272
2273 if (!module_locked && pthread_rwlock_unlock (&agent->module_rwlock))
2274 GOMP_PLUGIN_fatal ("Unable to unlock a GCN agent rwlock");
2275
2276 unsigned int upper = (return_value & ~0xffff) >> 16;
2277 if (upper == 0xcafe)
2278 ; // exit not called, normal termination.
2279 else if (upper == 0xffff)
2280 ; // exit called.
2281 else
2282 {
2283 GOMP_PLUGIN_error ("Possible kernel exit value corruption, 2 most"
2284 " significant bytes aren't 0xffff or 0xcafe: 0x%x\n",
2285 return_value);
2286 abort ();
2287 }
2288
2289 if (upper == 0xffff)
2290 {
2291 unsigned int signal = (return_value >> 8) & 0xff;
2292
2293 if (signal == SIGABRT)
2294 {
2295 GCN_WARNING ("GCN Kernel aborted\n");
2296 abort ();
2297 }
2298 else if (signal != 0)
2299 {
2300 GCN_WARNING ("GCN Kernel received unknown signal\n");
2301 abort ();
2302 }
2303
2304 GCN_DEBUG ("GCN Kernel exited with value: %d\n", return_value & 0xff);
2305 exit (return_value & 0xff);
2306 }
2307}
2308
2309/* }}} */
2310/* {{{ Load/Unload */
2311
2312/* Initialize KERNEL from D and other parameters. Return true on success. */
2313
2314static bool
2315init_basic_kernel_info (struct kernel_info *kernel,
2316 struct hsa_kernel_description *d,
2317 struct agent_info *agent,
2318 struct module_info *module)
2319{
2320 kernel->agent = agent;
2321 kernel->module = module;
2322 kernel->name = d->name;
5a28e272 2323 kernel->description = d;
237957cc
AS
2324 if (pthread_mutex_init (&kernel->init_mutex, NULL))
2325 {
2326 GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");
2327 return false;
2328 }
2329 return true;
2330}
2331
7d593fd6
FH
2332/* Check that the GCN ISA of the given image matches the ISA of the agent. */
2333
2334static bool
2335isa_matches_agent (struct agent_info *agent, Elf64_Ehdr *image)
2336{
2337 int isa_field = elf_gcn_isa_field (image);
2338 const char* isa_s = isa_hsa_name (isa_field);
2339 if (!isa_s)
2340 {
2341 hsa_error ("Unsupported ISA in GCN code object.", HSA_STATUS_ERROR);
2342 return false;
2343 }
2344
2345 if (isa_field != agent->device_isa)
2346 {
2347 char msg[120];
2348 const char *agent_isa_s = isa_hsa_name (agent->device_isa);
2349 const char *agent_isa_gcc_s = isa_gcc_name (agent->device_isa);
2350 assert (agent_isa_s);
2351 assert (agent_isa_gcc_s);
2352
2353 snprintf (msg, sizeof msg,
2354 "GCN code object ISA '%s' does not match GPU ISA '%s'.\n"
2355 "Try to recompile with '-foffload=-march=%s'.\n",
2356 isa_s, agent_isa_s, agent_isa_gcc_s);
2357
2358 hsa_error (msg, HSA_STATUS_ERROR);
2359 return false;
2360 }
2361
2362 return true;
2363}
2364
237957cc
AS
2365/* Create and finalize the program consisting of all loaded modules. */
2366
2367static bool
2368create_and_finalize_hsa_program (struct agent_info *agent)
2369{
2370 hsa_status_t status;
237957cc
AS
2371 bool res = true;
2372 if (pthread_mutex_lock (&agent->prog_mutex))
2373 {
2374 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
2375 return false;
2376 }
2377 if (agent->prog_finalized)
2378 goto final;
2379
2380 status
2381 = hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
2382 HSA_EXECUTABLE_STATE_UNFROZEN,
2383 "", &agent->executable);
2384 if (status != HSA_STATUS_SUCCESS)
2385 {
2386 hsa_error ("Could not create GCN executable", status);
2387 goto fail;
2388 }
2389
2390 /* Load any GCN modules. */
2391 struct module_info *module = agent->module;
2392 if (module)
2393 {
2394 Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;
2395
7d593fd6
FH
2396 if (!isa_matches_agent (agent, image))
2397 goto fail;
2398
237957cc
AS
2399 hsa_code_object_t co = { 0 };
2400 status = hsa_fns.hsa_code_object_deserialize_fn
2401 (module->image_desc->gcn_image->image,
2402 module->image_desc->gcn_image->size,
2403 NULL, &co);
2404 if (status != HSA_STATUS_SUCCESS)
2405 {
2406 hsa_error ("Could not deserialize GCN code object", status);
2407 goto fail;
2408 }
2409
2410 status = hsa_fns.hsa_executable_load_code_object_fn
2411 (agent->executable, agent->id, co, "");
2412 if (status != HSA_STATUS_SUCCESS)
2413 {
2414 hsa_error ("Could not load GCN code object", status);
2415 goto fail;
2416 }
2417
2418 if (!module->heap)
2419 {
2420 status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
2421 gcn_kernel_heap_size,
2422 (void**)&module->heap);
2423 if (status != HSA_STATUS_SUCCESS)
2424 {
2425 hsa_error ("Could not allocate memory for GCN heap", status);
2426 goto fail;
2427 }
2428
2429 status = hsa_fns.hsa_memory_assign_agent_fn
2430 (module->heap, agent->id, HSA_ACCESS_PERMISSION_RW);
2431 if (status != HSA_STATUS_SUCCESS)
2432 {
2433 hsa_error ("Could not assign GCN heap memory to device", status);
2434 goto fail;
2435 }
2436
2437 hsa_fns.hsa_memory_copy_fn (&module->heap->size,
2438 &gcn_kernel_heap_size,
2439 sizeof (gcn_kernel_heap_size));
2440 }
2441
2442 }
2443
2444 if (debug)
2445 dump_executable_symbols (agent->executable);
2446
2447 status = hsa_fns.hsa_executable_freeze_fn (agent->executable, "");
2448 if (status != HSA_STATUS_SUCCESS)
2449 {
2450 hsa_error ("Could not freeze the GCN executable", status);
2451 goto fail;
2452 }
2453
237957cc
AS
2454final:
2455 agent->prog_finalized = true;
2456
2457 if (pthread_mutex_unlock (&agent->prog_mutex))
2458 {
2459 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
2460 res = false;
2461 }
2462
2463 return res;
2464
2465fail:
2466 res = false;
2467 goto final;
2468}
2469
2470/* Free the HSA program in agent and everything associated with it and set
2471 agent->prog_finalized and the initialized flags of all kernels to false.
2472 Return TRUE on success. */
2473
2474static bool
2475destroy_hsa_program (struct agent_info *agent)
2476{
2477 if (!agent->prog_finalized)
2478 return true;
2479
2480 hsa_status_t status;
2481
2482 GCN_DEBUG ("Destroying the current GCN program.\n");
2483
2484 status = hsa_fns.hsa_executable_destroy_fn (agent->executable);
2485 if (status != HSA_STATUS_SUCCESS)
2486 return hsa_error ("Could not destroy GCN executable", status);
2487
2488 if (agent->module)
2489 {
2490 int i;
2491 for (i = 0; i < agent->module->kernel_count; i++)
2492 agent->module->kernels[i].initialized = false;
2493
2494 if (agent->module->heap)
2495 {
2496 hsa_fns.hsa_memory_free_fn (agent->module->heap);
2497 agent->module->heap = NULL;
2498 }
2499 }
2500 agent->prog_finalized = false;
2501 return true;
2502}
2503
2504/* Deinitialize all information associated with MODULE and kernels within
2505 it. Return TRUE on success. */
2506
2507static bool
2508destroy_module (struct module_info *module, bool locked)
2509{
2510 /* Run destructors before destroying module. */
2511 struct GOMP_kernel_launch_attributes kla =
2512 { 3,
2513 /* Grid size. */
2514 { 1, 64, 1 },
2515 /* Work-group size. */
2516 { 1, 64, 1 }
2517 };
2518
2519 if (module->fini_array_func)
2520 {
2521 init_kernel (module->fini_array_func);
2522 run_kernel (module->fini_array_func, NULL, &kla, NULL, locked);
2523 }
2524 module->constructors_run_p = false;
2525
2526 int i;
2527 for (i = 0; i < module->kernel_count; i++)
2528 if (pthread_mutex_destroy (&module->kernels[i].init_mutex))
2529 {
2530 GOMP_PLUGIN_error ("Failed to destroy a GCN kernel initialization "
2531 "mutex");
2532 return false;
2533 }
2534
2535 return true;
2536}
2537
2538/* }}} */
2539/* {{{ Async */
2540
2541/* Callback of dispatch queues to report errors. */
2542
2543static void
2544execute_queue_entry (struct goacc_asyncqueue *aq, int index)
2545{
2546 struct queue_entry *entry = &aq->queue[index];
2547
2548 switch (entry->type)
2549 {
2550 case KERNEL_LAUNCH:
2551 if (DEBUG_QUEUES)
2552 GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d)\n",
2553 aq->agent->device_id, aq->id, index);
2554 run_kernel (entry->u.launch.kernel,
2555 entry->u.launch.vars,
2556 &entry->u.launch.kla, aq, false);
2557 if (DEBUG_QUEUES)
2558 GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d) done\n",
2559 aq->agent->device_id, aq->id, index);
2560 break;
2561
2562 case CALLBACK:
2563 if (DEBUG_QUEUES)
2564 GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d)\n",
2565 aq->agent->device_id, aq->id, index);
2566 entry->u.callback.fn (entry->u.callback.data);
2567 if (DEBUG_QUEUES)
2568 GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d) done\n",
2569 aq->agent->device_id, aq->id, index);
2570 break;
2571
2572 case ASYNC_WAIT:
2573 {
2574 /* FIXME: is it safe to access a placeholder that may already have
2575 been executed? */
2576 struct placeholder *placeholderp = entry->u.asyncwait.placeholderp;
2577
2578 if (DEBUG_QUEUES)
2579 GCN_DEBUG ("Async thread %d:%d: Executing async wait entry (%d)\n",
2580 aq->agent->device_id, aq->id, index);
2581
2582 pthread_mutex_lock (&placeholderp->mutex);
2583
2584 while (!placeholderp->executed)
2585 pthread_cond_wait (&placeholderp->cond, &placeholderp->mutex);
2586
2587 pthread_mutex_unlock (&placeholderp->mutex);
2588
2589 if (pthread_cond_destroy (&placeholderp->cond))
2590 GOMP_PLUGIN_error ("Failed to destroy serialization cond");
2591
2592 if (pthread_mutex_destroy (&placeholderp->mutex))
2593 GOMP_PLUGIN_error ("Failed to destroy serialization mutex");
2594
2595 if (DEBUG_QUEUES)
2596 GCN_DEBUG ("Async thread %d:%d: Executing async wait "
2597 "entry (%d) done\n", aq->agent->device_id, aq->id, index);
2598 }
2599 break;
2600
2601 case ASYNC_PLACEHOLDER:
2602 pthread_mutex_lock (&entry->u.placeholder.mutex);
2603 entry->u.placeholder.executed = 1;
2604 pthread_cond_signal (&entry->u.placeholder.cond);
2605 pthread_mutex_unlock (&entry->u.placeholder.mutex);
2606 break;
2607
2608 default:
2609 GOMP_PLUGIN_fatal ("Unknown queue element");
2610 }
2611}
2612
2613/* This function is run as a thread to service an async queue in the
2614 background. It runs continuously until the stop flag is set. */
2615
2616static void *
2617drain_queue (void *thread_arg)
2618{
2619 struct goacc_asyncqueue *aq = thread_arg;
2620
2621 if (DRAIN_QUEUE_SYNCHRONOUS_P)
2622 {
2623 aq->drain_queue_stop = 2;
2624 return NULL;
2625 }
2626
2627 pthread_mutex_lock (&aq->mutex);
2628
2629 while (true)
2630 {
2631 if (aq->drain_queue_stop)
2632 break;
2633
2634 if (aq->queue_n > 0)
2635 {
2636 pthread_mutex_unlock (&aq->mutex);
2637 execute_queue_entry (aq, aq->queue_first);
2638
2639 pthread_mutex_lock (&aq->mutex);
2640 aq->queue_first = ((aq->queue_first + 1)
2641 % ASYNC_QUEUE_SIZE);
2642 aq->queue_n--;
2643
2644 if (DEBUG_THREAD_SIGNAL)
2645 GCN_DEBUG ("Async thread %d:%d: broadcasting queue out update\n",
2646 aq->agent->device_id, aq->id);
2647 pthread_cond_broadcast (&aq->queue_cond_out);
2648 pthread_mutex_unlock (&aq->mutex);
2649
2650 if (DEBUG_QUEUES)
2651 GCN_DEBUG ("Async thread %d:%d: continue\n", aq->agent->device_id,
2652 aq->id);
2653 pthread_mutex_lock (&aq->mutex);
2654 }
2655 else
2656 {
2657 if (DEBUG_THREAD_SLEEP)
2658 GCN_DEBUG ("Async thread %d:%d: going to sleep\n",
2659 aq->agent->device_id, aq->id);
2660 pthread_cond_wait (&aq->queue_cond_in, &aq->mutex);
2661 if (DEBUG_THREAD_SLEEP)
2662 GCN_DEBUG ("Async thread %d:%d: woke up, rechecking\n",
2663 aq->agent->device_id, aq->id);
2664 }
2665 }
2666
2667 aq->drain_queue_stop = 2;
2668 if (DEBUG_THREAD_SIGNAL)
2669 GCN_DEBUG ("Async thread %d:%d: broadcasting last queue out update\n",
2670 aq->agent->device_id, aq->id);
2671 pthread_cond_broadcast (&aq->queue_cond_out);
2672 pthread_mutex_unlock (&aq->mutex);
2673
2674 GCN_DEBUG ("Async thread %d:%d: returning\n", aq->agent->device_id, aq->id);
2675 return NULL;
2676}
2677
2678/* This function is used only when DRAIN_QUEUE_SYNCHRONOUS_P is set, which
2679 is not usually the case. This is just a debug tool. */
2680
2681static void
2682drain_queue_synchronous (struct goacc_asyncqueue *aq)
2683{
2684 pthread_mutex_lock (&aq->mutex);
2685
2686 while (aq->queue_n > 0)
2687 {
2688 execute_queue_entry (aq, aq->queue_first);
2689
2690 aq->queue_first = ((aq->queue_first + 1)
2691 % ASYNC_QUEUE_SIZE);
2692 aq->queue_n--;
2693 }
2694
2695 pthread_mutex_unlock (&aq->mutex);
2696}
2697
d88b27da
JB
2698/* Block the current thread until an async queue is writable. The aq->mutex
2699 lock should be held on entry, and remains locked on exit. */
237957cc
AS
2700
2701static void
2702wait_for_queue_nonfull (struct goacc_asyncqueue *aq)
2703{
2704 if (aq->queue_n == ASYNC_QUEUE_SIZE)
2705 {
237957cc
AS
2706 /* Queue is full. Wait for it to not be full. */
2707 while (aq->queue_n == ASYNC_QUEUE_SIZE)
2708 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
237957cc
AS
2709 }
2710}
2711
2712/* Request an asynchronous kernel launch on the specified queue. This
2713 may block if the queue is full, but returns without waiting for the
2714 kernel to run. */
2715
2716static void
2717queue_push_launch (struct goacc_asyncqueue *aq, struct kernel_info *kernel,
2718 void *vars, struct GOMP_kernel_launch_attributes *kla)
2719{
2720 assert (aq->agent == kernel->agent);
2721
237957cc
AS
2722 pthread_mutex_lock (&aq->mutex);
2723
d88b27da
JB
2724 wait_for_queue_nonfull (aq);
2725
237957cc
AS
2726 int queue_last = ((aq->queue_first + aq->queue_n)
2727 % ASYNC_QUEUE_SIZE);
2728 if (DEBUG_QUEUES)
2729 GCN_DEBUG ("queue_push_launch %d:%d: at %i\n", aq->agent->device_id,
2730 aq->id, queue_last);
2731
2732 aq->queue[queue_last].type = KERNEL_LAUNCH;
2733 aq->queue[queue_last].u.launch.kernel = kernel;
2734 aq->queue[queue_last].u.launch.vars = vars;
2735 aq->queue[queue_last].u.launch.kla = *kla;
2736
2737 aq->queue_n++;
2738
2739 if (DEBUG_THREAD_SIGNAL)
2740 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2741 aq->agent->device_id, aq->id);
2742 pthread_cond_signal (&aq->queue_cond_in);
2743
2744 pthread_mutex_unlock (&aq->mutex);
2745}
2746
2747/* Request an asynchronous callback on the specified queue. The callback
2748 function will be called, with the given opaque data, from the appropriate
2749 async thread, when all previous items on that queue are complete. */
2750
2751static void
2752queue_push_callback (struct goacc_asyncqueue *aq, void (*fn)(void *),
2753 void *data)
2754{
237957cc
AS
2755 pthread_mutex_lock (&aq->mutex);
2756
d88b27da
JB
2757 wait_for_queue_nonfull (aq);
2758
237957cc
AS
2759 int queue_last = ((aq->queue_first + aq->queue_n)
2760 % ASYNC_QUEUE_SIZE);
2761 if (DEBUG_QUEUES)
2762 GCN_DEBUG ("queue_push_callback %d:%d: at %i\n", aq->agent->device_id,
2763 aq->id, queue_last);
2764
2765 aq->queue[queue_last].type = CALLBACK;
2766 aq->queue[queue_last].u.callback.fn = fn;
2767 aq->queue[queue_last].u.callback.data = data;
2768
2769 aq->queue_n++;
2770
2771 if (DEBUG_THREAD_SIGNAL)
2772 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2773 aq->agent->device_id, aq->id);
2774 pthread_cond_signal (&aq->queue_cond_in);
2775
2776 pthread_mutex_unlock (&aq->mutex);
2777}
2778
2779/* Request that a given async thread wait for another thread (unspecified) to
2780 reach the given placeholder. The wait will occur when all previous entries
2781 on the queue are complete. A placeholder is effectively a kind of signal
2782 which simply sets a flag when encountered in a queue. */
2783
2784static void
2785queue_push_asyncwait (struct goacc_asyncqueue *aq,
2786 struct placeholder *placeholderp)
2787{
237957cc
AS
2788 pthread_mutex_lock (&aq->mutex);
2789
d88b27da
JB
2790 wait_for_queue_nonfull (aq);
2791
237957cc
AS
2792 int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2793 if (DEBUG_QUEUES)
2794 GCN_DEBUG ("queue_push_asyncwait %d:%d: at %i\n", aq->agent->device_id,
2795 aq->id, queue_last);
2796
2797 aq->queue[queue_last].type = ASYNC_WAIT;
2798 aq->queue[queue_last].u.asyncwait.placeholderp = placeholderp;
2799
2800 aq->queue_n++;
2801
2802 if (DEBUG_THREAD_SIGNAL)
2803 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2804 aq->agent->device_id, aq->id);
2805 pthread_cond_signal (&aq->queue_cond_in);
2806
2807 pthread_mutex_unlock (&aq->mutex);
2808}
2809
2810/* Add a placeholder into an async queue. When the async thread reaches the
2811 placeholder it will set the "executed" flag to true and continue.
2812 Another thread may be waiting on this thread reaching the placeholder. */
2813
2814static struct placeholder *
2815queue_push_placeholder (struct goacc_asyncqueue *aq)
2816{
2817 struct placeholder *placeholderp;
2818
237957cc
AS
2819 pthread_mutex_lock (&aq->mutex);
2820
d88b27da
JB
2821 wait_for_queue_nonfull (aq);
2822
237957cc
AS
2823 int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2824 if (DEBUG_QUEUES)
2825 GCN_DEBUG ("queue_push_placeholder %d:%d: at %i\n", aq->agent->device_id,
2826 aq->id, queue_last);
2827
2828 aq->queue[queue_last].type = ASYNC_PLACEHOLDER;
2829 placeholderp = &aq->queue[queue_last].u.placeholder;
2830
2831 if (pthread_mutex_init (&placeholderp->mutex, NULL))
2832 {
2833 pthread_mutex_unlock (&aq->mutex);
2834 GOMP_PLUGIN_error ("Failed to initialize serialization mutex");
2835 }
2836
2837 if (pthread_cond_init (&placeholderp->cond, NULL))
2838 {
2839 pthread_mutex_unlock (&aq->mutex);
2840 GOMP_PLUGIN_error ("Failed to initialize serialization cond");
2841 }
2842
2843 placeholderp->executed = 0;
2844
2845 aq->queue_n++;
2846
2847 if (DEBUG_THREAD_SIGNAL)
2848 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2849 aq->agent->device_id, aq->id);
2850 pthread_cond_signal (&aq->queue_cond_in);
2851
2852 pthread_mutex_unlock (&aq->mutex);
2853
2854 return placeholderp;
2855}
2856
2857/* Signal an asynchronous thread to terminate, and wait for it to do so. */
2858
2859static void
2860finalize_async_thread (struct goacc_asyncqueue *aq)
2861{
2862 pthread_mutex_lock (&aq->mutex);
2863 if (aq->drain_queue_stop == 2)
2864 {
2865 pthread_mutex_unlock (&aq->mutex);
2866 return;
2867 }
2868
2869 aq->drain_queue_stop = 1;
2870
2871 if (DEBUG_THREAD_SIGNAL)
2872 GCN_DEBUG ("Signalling async thread %d:%d: cond_in\n",
2873 aq->agent->device_id, aq->id);
2874 pthread_cond_signal (&aq->queue_cond_in);
2875
2876 while (aq->drain_queue_stop != 2)
2877 {
2878 if (DEBUG_THREAD_SLEEP)
2879 GCN_DEBUG ("Waiting for async thread %d:%d to finish, putting thread"
2880 " to sleep\n", aq->agent->device_id, aq->id);
2881 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
2882 if (DEBUG_THREAD_SLEEP)
2883 GCN_DEBUG ("Waiting, woke up thread %d:%d. Rechecking\n",
2884 aq->agent->device_id, aq->id);
2885 }
2886
2887 GCN_DEBUG ("Done waiting for async thread %d:%d\n", aq->agent->device_id,
2888 aq->id);
2889 pthread_mutex_unlock (&aq->mutex);
2890
2891 int err = pthread_join (aq->thread_drain_queue, NULL);
2892 if (err != 0)
2893 GOMP_PLUGIN_fatal ("Join async thread %d:%d: failed: %s",
2894 aq->agent->device_id, aq->id, strerror (err));
2895 GCN_DEBUG ("Joined with async thread %d:%d\n", aq->agent->device_id, aq->id);
2896}
2897
2898/* Set up an async queue for OpenMP. There will be only one. The
2899 implementation simply uses an OpenACC async queue.
2900 FIXME: is this thread-safe if two threads call this function? */
2901
2902static void
2903maybe_init_omp_async (struct agent_info *agent)
2904{
2905 if (!agent->omp_async_queue)
2906 agent->omp_async_queue
2907 = GOMP_OFFLOAD_openacc_async_construct (agent->device_id);
2908}
2909
8d2f4ddf
JB
2910/* A wrapper that works around an issue in the HSA runtime with host-to-device
2911 copies from read-only pages. */
2912
2913static void
2914hsa_memory_copy_wrapper (void *dst, const void *src, size_t len)
2915{
2916 hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, len);
2917
2918 if (status == HSA_STATUS_SUCCESS)
2919 return;
2920
2921 /* It appears that the copy fails if the source data is in a read-only page.
2922 We can't detect that easily, so try copying the data to a temporary buffer
2923 and doing the copy again if we got an error above. */
2924
2925 GCN_WARNING ("Read-only data transfer bug workaround triggered for "
2926 "[%p:+%d]\n", (void *) src, (int) len);
2927
2928 void *src_copy = malloc (len);
2929 memcpy (src_copy, src, len);
2930 status = hsa_fns.hsa_memory_copy_fn (dst, (const void *) src_copy, len);
2931 free (src_copy);
2932 if (status != HSA_STATUS_SUCCESS)
2933 GOMP_PLUGIN_error ("memory copy failed");
2934}
2935
237957cc
AS
2936/* Copy data to or from a device. This is intended for use as an async
2937 callback event. */
2938
2939static void
2940copy_data (void *data_)
2941{
2942 struct copy_data *data = (struct copy_data *)data_;
2943 GCN_DEBUG ("Async thread %d:%d: Copying %zu bytes from (%p) to (%p)\n",
2944 data->aq->agent->device_id, data->aq->id, data->len, data->src,
2945 data->dst);
8d2f4ddf 2946 hsa_memory_copy_wrapper (data->dst, data->src, data->len);
237957cc
AS
2947 free (data);
2948}
2949
2950/* Free device data. This is intended for use as an async callback event. */
2951
2952static void
2953gomp_offload_free (void *ptr)
2954{
2955 GCN_DEBUG ("Async thread ?:?: Freeing %p\n", ptr);
2956 GOMP_OFFLOAD_free (0, ptr);
2957}
2958
2959/* Request an asynchronous data copy, to or from a device, on a given queue.
9c41f5b9 2960 The event will be registered as a callback. */
237957cc
AS
2961
2962static void
2963queue_push_copy (struct goacc_asyncqueue *aq, void *dst, const void *src,
9c41f5b9 2964 size_t len)
237957cc
AS
2965{
2966 if (DEBUG_QUEUES)
2967 GCN_DEBUG ("queue_push_copy %d:%d: %zu bytes from (%p) to (%p)\n",
2968 aq->agent->device_id, aq->id, len, src, dst);
2969 struct copy_data *data
2970 = (struct copy_data *)GOMP_PLUGIN_malloc (sizeof (struct copy_data));
2971 data->dst = dst;
2972 data->src = src;
2973 data->len = len;
237957cc
AS
2974 data->aq = aq;
2975 queue_push_callback (aq, copy_data, data);
2976}
2977
2978/* Return true if the given queue is currently empty. */
2979
2980static int
2981queue_empty (struct goacc_asyncqueue *aq)
2982{
2983 pthread_mutex_lock (&aq->mutex);
2984 int res = aq->queue_n == 0 ? 1 : 0;
2985 pthread_mutex_unlock (&aq->mutex);
2986
2987 return res;
2988}
2989
2990/* Wait for a given queue to become empty. This implements an OpenACC wait
2991 directive. */
2992
2993static void
2994wait_queue (struct goacc_asyncqueue *aq)
2995{
2996 if (DRAIN_QUEUE_SYNCHRONOUS_P)
2997 {
2998 drain_queue_synchronous (aq);
2999 return;
3000 }
3001
3002 pthread_mutex_lock (&aq->mutex);
3003
3004 while (aq->queue_n > 0)
3005 {
3006 if (DEBUG_THREAD_SLEEP)
3007 GCN_DEBUG ("waiting for thread %d:%d, putting thread to sleep\n",
3008 aq->agent->device_id, aq->id);
3009 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
3010 if (DEBUG_THREAD_SLEEP)
3011 GCN_DEBUG ("thread %d:%d woke up. Rechecking\n", aq->agent->device_id,
3012 aq->id);
3013 }
3014
3015 pthread_mutex_unlock (&aq->mutex);
3016 GCN_DEBUG ("waiting for thread %d:%d, done\n", aq->agent->device_id, aq->id);
3017}
3018
3019/* }}} */
3020/* {{{ OpenACC support */
3021
3022/* Execute an OpenACC kernel, synchronously or asynchronously. */
3023
3024static void
3025gcn_exec (struct kernel_info *kernel, size_t mapnum, void **hostaddrs,
3026 void **devaddrs, unsigned *dims, void *targ_mem_desc, bool async,
3027 struct goacc_asyncqueue *aq)
3028{
3029 if (!GOMP_OFFLOAD_can_run (kernel))
3030 GOMP_PLUGIN_fatal ("OpenACC host fallback unimplemented.");
3031
3032 /* If we get here then this must be an OpenACC kernel. */
3033 kernel->kind = KIND_OPENACC;
3034
3035 /* devaddrs must be double-indirect on the target. */
3036 void **ind_da = alloc_by_agent (kernel->agent, sizeof (void*) * mapnum);
3037 for (size_t i = 0; i < mapnum; i++)
3038 hsa_fns.hsa_memory_copy_fn (&ind_da[i],
3039 devaddrs[i] ? &devaddrs[i] : &hostaddrs[i],
3040 sizeof (void *));
3041
3042 struct hsa_kernel_description *hsa_kernel_desc = NULL;
3043 for (unsigned i = 0; i < kernel->module->image_desc->kernel_count; i++)
3044 {
3045 struct hsa_kernel_description *d
3046 = &kernel->module->image_desc->kernel_infos[i];
3047 if (d->name == kernel->name)
3048 {
3049 hsa_kernel_desc = d;
3050 break;
3051 }
3052 }
3053
3054 /* We may have statically-determined dimensions in
3055 hsa_kernel_desc->oacc_dims[] or dimensions passed to this offload kernel
3056 invocation at runtime in dims[]. We allow static dimensions to take
3057 priority over dynamic dimensions when present (non-zero). */
3058 if (hsa_kernel_desc->oacc_dims[0] > 0)
3059 dims[0] = hsa_kernel_desc->oacc_dims[0];
3060 if (hsa_kernel_desc->oacc_dims[1] > 0)
3061 dims[1] = hsa_kernel_desc->oacc_dims[1];
3062 if (hsa_kernel_desc->oacc_dims[2] > 0)
3063 dims[2] = hsa_kernel_desc->oacc_dims[2];
3064
a78b1ab1
KCY
3065 /* Ideally, when a dimension isn't explicitly specified, we should
3066 tune it to run 40 (or 32?) threads per CU with no threads getting queued.
3067 In practice, we tune for peak performance on BabelStream, which
3068 for OpenACC is currently 32 threads per CU. */
3069 if (dims[0] == 0 && dims[1] == 0)
3070 {
3071 /* If any of the OpenACC dimensions remain 0 then we get to pick a
3072 number. There isn't really a correct answer for this without a clue
3073 about the problem size, so let's do a reasonable number of workers
3074 and gangs. */
237957cc 3075
a78b1ab1
KCY
3076 dims[0] = get_cu_count (kernel->agent) * 4; /* Gangs. */
3077 dims[1] = 8; /* Workers. */
3078 }
3079 else if (dims[0] == 0 && dims[1] > 0)
3080 {
3081 /* Auto-scale the number of gangs with the requested number of workers. */
3082 dims[0] = get_cu_count (kernel->agent) * (32 / dims[1]);
3083 }
3084 else if (dims[0] > 0 && dims[1] == 0)
3085 {
3086 /* Auto-scale the number of workers with the requested number of gangs. */
3087 dims[1] = get_cu_count (kernel->agent) * 32 / dims[0];
3088 if (dims[1] == 0)
3089 dims[1] = 1;
3090 if (dims[1] > 16)
3091 dims[1] = 16;
3092 }
237957cc
AS
3093
3094 /* The incoming dimensions are expressed in terms of gangs, workers, and
3095 vectors. The HSA dimensions are expressed in terms of "work-items",
3096 which means multiples of vector lanes.
3097
3098 The "grid size" specifies the size of the problem space, and the
3099 "work-group size" specifies how much of that we want a single compute
3100 unit to chew on at once.
3101
3102 The three dimensions do not really correspond to hardware, but the
3103 important thing is that the HSA runtime will launch as many
3104 work-groups as it takes to process the entire grid, and each
3105 work-group will contain as many wave-fronts as it takes to process
3106 the work-items in that group.
3107
3108 Essentially, as long as we set the Y dimension to 64 (the number of
3109 vector lanes in hardware), and the Z group size to the maximum (16),
3110 then we will get the gangs (X) and workers (Z) launched as we expect.
3111
3112 The reason for the apparent reversal of vector and worker dimension
3113 order is to do with the way the run-time distributes work-items across
3114 v1 and v2. */
3115 struct GOMP_kernel_launch_attributes kla =
3116 {3,
3117 /* Grid size. */
3118 {dims[0], 64, dims[1]},
3119 /* Work-group size. */
3120 {1, 64, 16}
3121 };
3122
3123 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
3124 acc_prof_info *prof_info = thr->prof_info;
3125 acc_event_info enqueue_launch_event_info;
3126 acc_api_info *api_info = thr->api_info;
3127 bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
3128 if (profiling_dispatch_p)
3129 {
3130 prof_info->event_type = acc_ev_enqueue_launch_start;
3131
3132 enqueue_launch_event_info.launch_event.event_type
3133 = prof_info->event_type;
3134 enqueue_launch_event_info.launch_event.valid_bytes
3135 = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
3136 enqueue_launch_event_info.launch_event.parent_construct
3137 = acc_construct_parallel;
3138 enqueue_launch_event_info.launch_event.implicit = 1;
3139 enqueue_launch_event_info.launch_event.tool_info = NULL;
3140 enqueue_launch_event_info.launch_event.kernel_name
3141 = (char *) kernel->name;
3142 enqueue_launch_event_info.launch_event.num_gangs = kla.gdims[0];
3143 enqueue_launch_event_info.launch_event.num_workers = kla.gdims[2];
3144 enqueue_launch_event_info.launch_event.vector_length = kla.gdims[1];
3145
3146 api_info->device_api = acc_device_api_other;
3147
3148 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3149 &enqueue_launch_event_info, api_info);
3150 }
3151
3152 if (!async)
3153 {
3154 run_kernel (kernel, ind_da, &kla, NULL, false);
3155 gomp_offload_free (ind_da);
3156 }
3157 else
3158 {
3159 queue_push_launch (aq, kernel, ind_da, &kla);
3160 if (DEBUG_QUEUES)
3161 GCN_DEBUG ("queue_push_callback %d:%d gomp_offload_free, %p\n",
3162 aq->agent->device_id, aq->id, ind_da);
3163 queue_push_callback (aq, gomp_offload_free, ind_da);
3164 }
3165
3166 if (profiling_dispatch_p)
3167 {
3168 prof_info->event_type = acc_ev_enqueue_launch_end;
3169 enqueue_launch_event_info.launch_event.event_type = prof_info->event_type;
3170 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3171 &enqueue_launch_event_info,
3172 api_info);
3173 }
3174}
3175
3176/* }}} */
3177/* {{{ Generic Plugin API */
3178
3179/* Return the name of the accelerator, which is "gcn". */
3180
3181const char *
3182GOMP_OFFLOAD_get_name (void)
3183{
3184 return "gcn";
3185}
3186
3187/* Return the specific capabilities the HSA accelerator have. */
3188
3189unsigned int
3190GOMP_OFFLOAD_get_caps (void)
3191{
3192 /* FIXME: Enable shared memory for APU, but not discrete GPU. */
3193 return /*GOMP_OFFLOAD_CAP_SHARED_MEM |*/ GOMP_OFFLOAD_CAP_OPENMP_400
3194 | GOMP_OFFLOAD_CAP_OPENACC_200;
3195}
3196
3197/* Identify as GCN accelerator. */
3198
3199int
3200GOMP_OFFLOAD_get_type (void)
3201{
3202 return OFFLOAD_TARGET_TYPE_GCN;
3203}
3204
3205/* Return the libgomp version number we're compatible with. There is
3206 no requirement for cross-version compatibility. */
3207
3208unsigned
3209GOMP_OFFLOAD_version (void)
3210{
3211 return GOMP_VERSION;
3212}
3213
3214/* Return the number of GCN devices on the system. */
3215
3216int
3217GOMP_OFFLOAD_get_num_devices (void)
3218{
3219 if (!init_hsa_context ())
3220 return 0;
3221 return hsa_context.agent_count;
3222}
3223
3224/* Initialize device (agent) number N so that it can be used for computation.
3225 Return TRUE on success. */
3226
3227bool
3228GOMP_OFFLOAD_init_device (int n)
3229{
3230 if (!init_hsa_context ())
3231 return false;
3232 if (n >= hsa_context.agent_count)
3233 {
3234 GOMP_PLUGIN_error ("Request to initialize non-existent GCN device %i", n);
3235 return false;
3236 }
3237 struct agent_info *agent = &hsa_context.agents[n];
3238
3239 if (agent->initialized)
3240 return true;
3241
3242 agent->device_id = n;
3243
3244 if (pthread_rwlock_init (&agent->module_rwlock, NULL))
3245 {
3246 GOMP_PLUGIN_error ("Failed to initialize a GCN agent rwlock");
3247 return false;
3248 }
3249 if (pthread_mutex_init (&agent->prog_mutex, NULL))
3250 {
3251 GOMP_PLUGIN_error ("Failed to initialize a GCN agent program mutex");
3252 return false;
3253 }
3254 if (pthread_mutex_init (&agent->async_queues_mutex, NULL))
3255 {
3256 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
3257 return false;
3258 }
3259 if (pthread_mutex_init (&agent->team_arena_write_lock, NULL))
3260 {
3261 GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
3262 return false;
3263 }
3264 agent->async_queues = NULL;
3265 agent->omp_async_queue = NULL;
3266 agent->team_arena_list = NULL;
3267
3268 uint32_t queue_size;
3269 hsa_status_t status;
3270 status = hsa_fns.hsa_agent_get_info_fn (agent->id,
3271 HSA_AGENT_INFO_QUEUE_MAX_SIZE,
3272 &queue_size);
3273 if (status != HSA_STATUS_SUCCESS)
3274 return hsa_error ("Error requesting maximum queue size of the GCN agent",
3275 status);
3276
237957cc 3277 status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_NAME,
2e5ea579 3278 &agent->name);
237957cc
AS
3279 if (status != HSA_STATUS_SUCCESS)
3280 return hsa_error ("Error querying the name of the agent", status);
7d593fd6 3281
2e5ea579 3282 agent->device_isa = isa_code (agent->name);
7d593fd6 3283 if (agent->device_isa < 0)
2e5ea579
FH
3284 return hsa_error ("Unknown GCN agent architecture", HSA_STATUS_ERROR);
3285
3286 status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_VENDOR_NAME,
3287 &agent->vendor_name);
3288 if (status != HSA_STATUS_SUCCESS)
3289 return hsa_error ("Error querying the vendor name of the agent", status);
237957cc
AS
3290
3291 status = hsa_fns.hsa_queue_create_fn (agent->id, queue_size,
3292 HSA_QUEUE_TYPE_MULTI,
3293 hsa_queue_callback, NULL, UINT32_MAX,
3294 UINT32_MAX, &agent->sync_queue);
3295 if (status != HSA_STATUS_SUCCESS)
3296 return hsa_error ("Error creating command queue", status);
3297
3298 agent->kernarg_region.handle = (uint64_t) -1;
3299 status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
3300 get_kernarg_memory_region,
3301 &agent->kernarg_region);
966de09b
AS
3302 if (status != HSA_STATUS_SUCCESS
3303 && status != HSA_STATUS_INFO_BREAK)
3304 hsa_error ("Scanning memory regions failed", status);
237957cc
AS
3305 if (agent->kernarg_region.handle == (uint64_t) -1)
3306 {
3307 GOMP_PLUGIN_error ("Could not find suitable memory region for kernel "
3308 "arguments");
3309 return false;
3310 }
3311 GCN_DEBUG ("Selected kernel arguments memory region:\n");
3312 dump_hsa_region (agent->kernarg_region, NULL);
3313
3314 agent->data_region.handle = (uint64_t) -1;
3315 status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
3316 get_data_memory_region,
3317 &agent->data_region);
966de09b
AS
3318 if (status != HSA_STATUS_SUCCESS
3319 && status != HSA_STATUS_INFO_BREAK)
3320 hsa_error ("Scanning memory regions failed", status);
237957cc
AS
3321 if (agent->data_region.handle == (uint64_t) -1)
3322 {
3323 GOMP_PLUGIN_error ("Could not find suitable memory region for device "
3324 "data");
3325 return false;
3326 }
3327 GCN_DEBUG ("Selected device data memory region:\n");
3328 dump_hsa_region (agent->data_region, NULL);
3329
3330 GCN_DEBUG ("GCN agent %d initialized\n", n);
3331
3332 agent->initialized = true;
3333 return true;
3334}
3335
3336/* Load GCN object-code module described by struct gcn_image_desc in
3337 TARGET_DATA and return references to kernel descriptors in TARGET_TABLE.
3338 If there are any constructors then run them. */
3339
3340int
3341GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
3342 struct addr_pair **target_table)
3343{
3344 if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
3345 {
3346 GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3347 " (expected %u, received %u)",
3348 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
3349 return -1;
3350 }
3351
3352 struct gcn_image_desc *image_desc = (struct gcn_image_desc *) target_data;
3353 struct agent_info *agent;
3354 struct addr_pair *pair;
3355 struct module_info *module;
3356 struct kernel_info *kernel;
3357 int kernel_count = image_desc->kernel_count;
3358 unsigned var_count = image_desc->global_variable_count;
0bac793e 3359 int other_count = 1;
237957cc
AS
3360
3361 agent = get_agent_info (ord);
3362 if (!agent)
3363 return -1;
3364
3365 if (pthread_rwlock_wrlock (&agent->module_rwlock))
3366 {
3367 GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3368 return -1;
3369 }
3370 if (agent->prog_finalized
3371 && !destroy_hsa_program (agent))
3372 return -1;
3373
3374 GCN_DEBUG ("Encountered %d kernels in an image\n", kernel_count);
3375 GCN_DEBUG ("Encountered %u global variables in an image\n", var_count);
0bac793e
CLT
3376 GCN_DEBUG ("Expect %d other variables in an image\n", other_count);
3377 pair = GOMP_PLUGIN_malloc ((kernel_count + var_count + other_count - 2)
237957cc
AS
3378 * sizeof (struct addr_pair));
3379 *target_table = pair;
3380 module = (struct module_info *)
3381 GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info)
3382 + kernel_count * sizeof (struct kernel_info));
3383 module->image_desc = image_desc;
3384 module->kernel_count = kernel_count;
3385 module->heap = NULL;
3386 module->constructors_run_p = false;
3387
3388 kernel = &module->kernels[0];
3389
3390 /* Allocate memory for kernel dependencies. */
3391 for (unsigned i = 0; i < kernel_count; i++)
3392 {
3393 struct hsa_kernel_description *d = &image_desc->kernel_infos[i];
3394 if (!init_basic_kernel_info (kernel, d, agent, module))
3395 return -1;
3396 if (strcmp (d->name, "_init_array") == 0)
3397 module->init_array_func = kernel;
3398 else if (strcmp (d->name, "_fini_array") == 0)
3399 module->fini_array_func = kernel;
3400 else
3401 {
3402 pair->start = (uintptr_t) kernel;
3403 pair->end = (uintptr_t) (kernel + 1);
3404 pair++;
3405 }
3406 kernel++;
3407 }
3408
3409 agent->module = module;
3410 if (pthread_rwlock_unlock (&agent->module_rwlock))
3411 {
3412 GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3413 return -1;
3414 }
3415
3416 if (!create_and_finalize_hsa_program (agent))
3417 return -1;
3418
4a87a8e4 3419 if (var_count > 0)
237957cc 3420 {
237957cc
AS
3421 hsa_status_t status;
3422 hsa_executable_symbol_t var_symbol;
3423 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
4a87a8e4
AS
3424 ".offload_var_table",
3425 agent->id,
237957cc
AS
3426 0, &var_symbol);
3427
3428 if (status != HSA_STATUS_SUCCESS)
3429 hsa_fatal ("Could not find symbol for variable in the code object",
3430 status);
3431
4a87a8e4 3432 uint64_t var_table_addr;
237957cc 3433 status = hsa_fns.hsa_executable_symbol_get_info_fn
4a87a8e4
AS
3434 (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3435 &var_table_addr);
237957cc
AS
3436 if (status != HSA_STATUS_SUCCESS)
3437 hsa_fatal ("Could not extract a variable from its symbol", status);
237957cc 3438
4a87a8e4
AS
3439 struct {
3440 uint64_t addr;
3441 uint64_t size;
3442 } var_table[var_count];
3443 GOMP_OFFLOAD_dev2host (agent->device_id, var_table,
3444 (void*)var_table_addr, sizeof (var_table));
3445
3446 for (unsigned i = 0; i < var_count; i++)
3447 {
3448 pair->start = var_table[i].addr;
3449 pair->end = var_table[i].addr + var_table[i].size;
3450 GCN_DEBUG ("Found variable at %p with size %lu\n",
3451 (void *)var_table[i].addr, var_table[i].size);
3452 pair++;
3453 }
237957cc
AS
3454 }
3455
fbb59240 3456 GCN_DEBUG ("Looking for variable %s\n", XSTRING (GOMP_DEVICE_NUM_VAR));
0bac793e
CLT
3457
3458 hsa_status_t status;
3459 hsa_executable_symbol_t var_symbol;
3460 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
fbb59240 3461 XSTRING (GOMP_DEVICE_NUM_VAR),
0bac793e
CLT
3462 agent->id, 0, &var_symbol);
3463 if (status == HSA_STATUS_SUCCESS)
3464 {
3465 uint64_t device_num_varptr;
3466 uint32_t device_num_varsize;
3467
3468 status = hsa_fns.hsa_executable_symbol_get_info_fn
3469 (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3470 &device_num_varptr);
3471 if (status != HSA_STATUS_SUCCESS)
3472 hsa_fatal ("Could not extract a variable from its symbol", status);
3473 status = hsa_fns.hsa_executable_symbol_get_info_fn
3474 (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
3475 &device_num_varsize);
3476 if (status != HSA_STATUS_SUCCESS)
3477 hsa_fatal ("Could not extract a variable size from its symbol", status);
3478
3479 pair->start = device_num_varptr;
3480 pair->end = device_num_varptr + device_num_varsize;
3481 }
3482 else
3483 /* The 'GOMP_DEVICE_NUM_VAR' variable was not in this image. */
3484 pair->start = pair->end = 0;
3485 pair++;
3486
237957cc
AS
3487 /* Ensure that constructors are run first. */
3488 struct GOMP_kernel_launch_attributes kla =
3489 { 3,
3490 /* Grid size. */
3491 { 1, 64, 1 },
3492 /* Work-group size. */
3493 { 1, 64, 1 }
3494 };
3495
3496 if (module->init_array_func)
3497 {
3498 init_kernel (module->init_array_func);
3499 run_kernel (module->init_array_func, NULL, &kla, NULL, false);
3500 }
3501 module->constructors_run_p = true;
3502
3503 /* Don't report kernels that libgomp need not know about. */
3504 if (module->init_array_func)
3505 kernel_count--;
3506 if (module->fini_array_func)
3507 kernel_count--;
3508
0bac793e 3509 return kernel_count + var_count + other_count;
237957cc
AS
3510}
3511
3512/* Unload GCN object-code module described by struct gcn_image_desc in
3513 TARGET_DATA from agent number N. Return TRUE on success. */
3514
3515bool
3516GOMP_OFFLOAD_unload_image (int n, unsigned version, const void *target_data)
3517{
3518 if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
3519 {
3520 GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3521 " (expected %u, received %u)",
3522 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
3523 return false;
3524 }
3525
3526 struct agent_info *agent;
3527 agent = get_agent_info (n);
3528 if (!agent)
3529 return false;
3530
3531 if (pthread_rwlock_wrlock (&agent->module_rwlock))
3532 {
3533 GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3534 return false;
3535 }
3536
3537 if (!agent->module || agent->module->image_desc != target_data)
3538 {
3539 GOMP_PLUGIN_error ("Attempt to unload an image that has never been "
3540 "loaded before");
3541 return false;
3542 }
3543
3544 if (!destroy_module (agent->module, true))
3545 return false;
3546 free (agent->module);
3547 agent->module = NULL;
3548 if (!destroy_hsa_program (agent))
3549 return false;
3550 if (pthread_rwlock_unlock (&agent->module_rwlock))
3551 {
3552 GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3553 return false;
3554 }
3555 return true;
3556}
3557
3558/* Deinitialize all information and status associated with agent number N. We
3559 do not attempt any synchronization, assuming the user and libgomp will not
3560 attempt deinitialization of a device that is in any way being used at the
3561 same time. Return TRUE on success. */
3562
3563bool
3564GOMP_OFFLOAD_fini_device (int n)
3565{
3566 struct agent_info *agent = get_agent_info (n);
3567 if (!agent)
3568 return false;
3569
3570 if (!agent->initialized)
3571 return true;
3572
3573 if (agent->omp_async_queue)
3574 {
3575 GOMP_OFFLOAD_openacc_async_destruct (agent->omp_async_queue);
3576 agent->omp_async_queue = NULL;
3577 }
3578
3579 if (agent->module)
3580 {
3581 if (!destroy_module (agent->module, false))
3582 return false;
3583 free (agent->module);
3584 agent->module = NULL;
3585 }
3586
3587 if (!destroy_team_arenas (agent))
3588 return false;
3589
3590 if (!destroy_hsa_program (agent))
3591 return false;
3592
3593 hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (agent->sync_queue);
3594 if (status != HSA_STATUS_SUCCESS)
3595 return hsa_error ("Error destroying command queue", status);
3596
3597 if (pthread_mutex_destroy (&agent->prog_mutex))
3598 {
3599 GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex");
3600 return false;
3601 }
3602 if (pthread_rwlock_destroy (&agent->module_rwlock))
3603 {
3604 GOMP_PLUGIN_error ("Failed to destroy a GCN agent rwlock");
3605 return false;
3606 }
3607
3608 if (pthread_mutex_destroy (&agent->async_queues_mutex))
3609 {
3610 GOMP_PLUGIN_error ("Failed to destroy a GCN agent queue mutex");
3611 return false;
3612 }
3613 if (pthread_mutex_destroy (&agent->team_arena_write_lock))
3614 {
3615 GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex");
3616 return false;
3617 }
3618 agent->initialized = false;
3619 return true;
3620}
3621
3622/* Return true if the HSA runtime can run function FN_PTR. */
3623
3624bool
3625GOMP_OFFLOAD_can_run (void *fn_ptr)
3626{
3627 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3628
3629 init_kernel (kernel);
3630 if (kernel->initialization_failed)
3631 goto failure;
3632
3633 return true;
3634
3635failure:
3636 if (suppress_host_fallback)
3637 GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
3638 GCN_WARNING ("GCN target cannot be launched, doing a host fallback\n");
3639 return false;
3640}
3641
3642/* Allocate memory on device N. */
3643
3644void *
3645GOMP_OFFLOAD_alloc (int n, size_t size)
3646{
3647 struct agent_info *agent = get_agent_info (n);
3648 return alloc_by_agent (agent, size);
3649}
3650
3651/* Free memory from device N. */
3652
3653bool
3654GOMP_OFFLOAD_free (int device, void *ptr)
3655{
3656 GCN_DEBUG ("Freeing memory on device %d\n", device);
3657
3658 hsa_status_t status = hsa_fns.hsa_memory_free_fn (ptr);
3659 if (status != HSA_STATUS_SUCCESS)
3660 {
3661 hsa_error ("Could not free device memory", status);
3662 return false;
3663 }
3664
3665 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
3666 bool profiling_dispatch_p
3667 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
3668 if (profiling_dispatch_p)
3669 {
3670 acc_prof_info *prof_info = thr->prof_info;
3671 acc_event_info data_event_info;
3672 acc_api_info *api_info = thr->api_info;
3673
3674 prof_info->event_type = acc_ev_free;
3675
3676 data_event_info.data_event.event_type = prof_info->event_type;
3677 data_event_info.data_event.valid_bytes
3678 = _ACC_DATA_EVENT_INFO_VALID_BYTES;
3679 data_event_info.data_event.parent_construct
3680 = acc_construct_parallel;
3681 data_event_info.data_event.implicit = 1;
3682 data_event_info.data_event.tool_info = NULL;
3683 data_event_info.data_event.var_name = NULL;
3684 data_event_info.data_event.bytes = 0;
3685 data_event_info.data_event.host_ptr = NULL;
3686 data_event_info.data_event.device_ptr = (void *) ptr;
3687
3688 api_info->device_api = acc_device_api_other;
3689
3690 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
3691 api_info);
3692 }
3693
3694 return true;
3695}
3696
3697/* Copy data from DEVICE to host. */
3698
3699bool
3700GOMP_OFFLOAD_dev2host (int device, void *dst, const void *src, size_t n)
3701{
3702 GCN_DEBUG ("Copying %zu bytes from device %d (%p) to host (%p)\n", n, device,
3703 src, dst);
8d2f4ddf
JB
3704 hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
3705 if (status != HSA_STATUS_SUCCESS)
3706 GOMP_PLUGIN_error ("memory copy failed");
237957cc
AS
3707 return true;
3708}
3709
3710/* Copy data from host to DEVICE. */
3711
3712bool
3713GOMP_OFFLOAD_host2dev (int device, void *dst, const void *src, size_t n)
3714{
3715 GCN_DEBUG ("Copying %zu bytes from host (%p) to device %d (%p)\n", n, src,
3716 device, dst);
8d2f4ddf 3717 hsa_memory_copy_wrapper (dst, src, n);
237957cc
AS
3718 return true;
3719}
3720
3721/* Copy data within DEVICE. Do the copy asynchronously, if appropriate. */
3722
3723bool
3724GOMP_OFFLOAD_dev2dev (int device, void *dst, const void *src, size_t n)
3725{
3726 struct gcn_thread *thread_data = gcn_thread ();
3727
3728 if (thread_data && !async_synchronous_p (thread_data->async))
3729 {
3730 struct agent_info *agent = get_agent_info (device);
3731 maybe_init_omp_async (agent);
9c41f5b9 3732 queue_push_copy (agent->omp_async_queue, dst, src, n);
237957cc
AS
3733 return true;
3734 }
3735
3736 GCN_DEBUG ("Copying %zu bytes from device %d (%p) to device %d (%p)\n", n,
3737 device, src, device, dst);
8d2f4ddf
JB
3738 hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
3739 if (status != HSA_STATUS_SUCCESS)
3740 GOMP_PLUGIN_error ("memory copy failed");
237957cc
AS
3741 return true;
3742}
3743
3744/* }}} */
3745/* {{{ OpenMP Plugin API */
3746
3747/* Run a synchronous OpenMP kernel on DEVICE and pass it an array of pointers
3748 in VARS as a parameter. The kernel is identified by FN_PTR which must point
3749 to a kernel_info structure, and must have previously been loaded to the
3750 specified device. */
3751
3752void
3753GOMP_OFFLOAD_run (int device, void *fn_ptr, void *vars, void **args)
3754{
3755 struct agent_info *agent = get_agent_info (device);
3756 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3757 struct GOMP_kernel_launch_attributes def;
3758 struct GOMP_kernel_launch_attributes *kla;
3759 assert (agent == kernel->agent);
3760
3761 /* If we get here then the kernel must be OpenMP. */
3762 kernel->kind = KIND_OPENMP;
3763
3764 if (!parse_target_attributes (args, &def, &kla, agent))
3765 {
3766 GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
3767 return;
3768 }
3769 run_kernel (kernel, vars, kla, NULL, false);
3770}
3771
3772/* Run an asynchronous OpenMP kernel on DEVICE. This is similar to
3773 GOMP_OFFLOAD_run except that the launch is queued and there is a call to
3774 GOMP_PLUGIN_target_task_completion when it has finished. */
3775
3776void
3777GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
3778 void **args, void *async_data)
3779{
3780 GCN_DEBUG ("GOMP_OFFLOAD_async_run invoked\n");
3781 struct agent_info *agent = get_agent_info (device);
3782 struct kernel_info *kernel = (struct kernel_info *) tgt_fn;
3783 struct GOMP_kernel_launch_attributes def;
3784 struct GOMP_kernel_launch_attributes *kla;
3785 assert (agent == kernel->agent);
3786
3787 /* If we get here then the kernel must be OpenMP. */
3788 kernel->kind = KIND_OPENMP;
3789
3790 if (!parse_target_attributes (args, &def, &kla, agent))
3791 {
3792 GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
3793 return;
3794 }
3795
3796 maybe_init_omp_async (agent);
3797 queue_push_launch (agent->omp_async_queue, kernel, tgt_vars, kla);
3798 queue_push_callback (agent->omp_async_queue,
3799 GOMP_PLUGIN_target_task_completion, async_data);
3800}
3801
3802/* }}} */
3803/* {{{ OpenACC Plugin API */
3804
3805/* Run a synchronous OpenACC kernel. The device number is inferred from the
3806 already-loaded KERNEL. */
3807
3808void
3809GOMP_OFFLOAD_openacc_exec (void (*fn_ptr) (void *), size_t mapnum,
3810 void **hostaddrs, void **devaddrs, unsigned *dims,
3811 void *targ_mem_desc)
3812{
3813 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3814
3815 gcn_exec (kernel, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, false,
3816 NULL);
3817}
3818
3819/* Run an asynchronous OpenACC kernel on the specified queue. */
3820
3821void
3822GOMP_OFFLOAD_openacc_async_exec (void (*fn_ptr) (void *), size_t mapnum,
3823 void **hostaddrs, void **devaddrs,
3824 unsigned *dims, void *targ_mem_desc,
3825 struct goacc_asyncqueue *aq)
3826{
3827 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3828
3829 gcn_exec (kernel, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, true,
3830 aq);
3831}
3832
3833/* Create a new asynchronous thread and queue for running future kernels. */
3834
3835struct goacc_asyncqueue *
3836GOMP_OFFLOAD_openacc_async_construct (int device)
3837{
3838 struct agent_info *agent = get_agent_info (device);
3839
3840 pthread_mutex_lock (&agent->async_queues_mutex);
3841
3842 struct goacc_asyncqueue *aq = GOMP_PLUGIN_malloc (sizeof (*aq));
3843 aq->agent = get_agent_info (device);
3844 aq->prev = NULL;
3845 aq->next = agent->async_queues;
3846 if (aq->next)
3847 {
3848 aq->next->prev = aq;
3849 aq->id = aq->next->id + 1;
3850 }
3851 else
3852 aq->id = 1;
3853 agent->async_queues = aq;
3854
3855 aq->queue_first = 0;
3856 aq->queue_n = 0;
3857 aq->drain_queue_stop = 0;
3858
3859 if (pthread_mutex_init (&aq->mutex, NULL))
3860 {
3861 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
3862 return false;
3863 }
3864 if (pthread_cond_init (&aq->queue_cond_in, NULL))
3865 {
3866 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
3867 return false;
3868 }
3869 if (pthread_cond_init (&aq->queue_cond_out, NULL))
3870 {
3871 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
3872 return false;
3873 }
3874
3875 hsa_status_t status = hsa_fns.hsa_queue_create_fn (agent->id,
3876 ASYNC_QUEUE_SIZE,
3877 HSA_QUEUE_TYPE_MULTI,
3878 hsa_queue_callback, NULL,
3879 UINT32_MAX, UINT32_MAX,
3880 &aq->hsa_queue);
3881 if (status != HSA_STATUS_SUCCESS)
3882 hsa_fatal ("Error creating command queue", status);
3883
3884 int err = pthread_create (&aq->thread_drain_queue, NULL, &drain_queue, aq);
3885 if (err != 0)
3886 GOMP_PLUGIN_fatal ("GCN asynchronous thread creation failed: %s",
3887 strerror (err));
3888 GCN_DEBUG ("Async thread %d:%d: created\n", aq->agent->device_id,
3889 aq->id);
3890
3891 pthread_mutex_unlock (&agent->async_queues_mutex);
3892
3893 return aq;
3894}
3895
93d90219 3896/* Destroy an existing asynchronous thread and queue. Waits for any
237957cc
AS
3897 currently-running task to complete, but cancels any queued tasks. */
3898
3899bool
3900GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
3901{
3902 struct agent_info *agent = aq->agent;
3903
3904 finalize_async_thread (aq);
3905
3906 pthread_mutex_lock (&agent->async_queues_mutex);
3907
3908 int err;
3909 if ((err = pthread_mutex_destroy (&aq->mutex)))
3910 {
3911 GOMP_PLUGIN_error ("Failed to destroy a GCN async queue mutex: %d", err);
3912 goto fail;
3913 }
3914 if (pthread_cond_destroy (&aq->queue_cond_in))
3915 {
3916 GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
3917 goto fail;
3918 }
3919 if (pthread_cond_destroy (&aq->queue_cond_out))
3920 {
3921 GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
3922 goto fail;
3923 }
3924 hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (aq->hsa_queue);
3925 if (status != HSA_STATUS_SUCCESS)
3926 {
3927 hsa_error ("Error destroying command queue", status);
3928 goto fail;
3929 }
3930
3931 if (aq->prev)
3932 aq->prev->next = aq->next;
3933 if (aq->next)
3934 aq->next->prev = aq->prev;
3935 if (agent->async_queues == aq)
3936 agent->async_queues = aq->next;
3937
3938 GCN_DEBUG ("Async thread %d:%d: destroyed\n", agent->device_id, aq->id);
3939
3940 free (aq);
3941 pthread_mutex_unlock (&agent->async_queues_mutex);
3942 return true;
3943
3944fail:
3945 pthread_mutex_unlock (&agent->async_queues_mutex);
3946 return false;
3947}
3948
3949/* Return true if the specified async queue is currently empty. */
3950
3951int
3952GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
3953{
3954 return queue_empty (aq);
3955}
3956
3957/* Block until the specified queue has executed all its tasks and the
3958 queue is empty. */
3959
3960bool
3961GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
3962{
3963 wait_queue (aq);
3964 return true;
3965}
3966
3967/* Add a serialization point across two async queues. Any new tasks added to
3968 AQ2, after this call, will not run until all tasks on AQ1, at the time
3969 of this call, have completed. */
3970
3971bool
3972GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
3973 struct goacc_asyncqueue *aq2)
3974{
3975 /* For serialize, stream aq2 waits for aq1 to complete work that has been
3976 scheduled to run on it up to this point. */
3977 if (aq1 != aq2)
3978 {
3979 struct placeholder *placeholderp = queue_push_placeholder (aq1);
3980 queue_push_asyncwait (aq2, placeholderp);
3981 }
3982 return true;
3983}
3984
3985/* Add an opaque callback to the given async queue. */
3986
3987void
3988GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
3989 void (*fn) (void *), void *data)
3990{
3991 queue_push_callback (aq, fn, data);
3992}
3993
3994/* Queue up an asynchronous data copy from host to DEVICE. */
3995
3996bool
3997GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
3998 size_t n, struct goacc_asyncqueue *aq)
3999{
4000 struct agent_info *agent = get_agent_info (device);
4001 assert (agent == aq->agent);
9c41f5b9 4002 queue_push_copy (aq, dst, src, n);
237957cc
AS
4003 return true;
4004}
4005
4006/* Queue up an asynchronous data copy from DEVICE to host. */
4007
4008bool
4009GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src,
4010 size_t n, struct goacc_asyncqueue *aq)
4011{
4012 struct agent_info *agent = get_agent_info (device);
4013 assert (agent == aq->agent);
9c41f5b9 4014 queue_push_copy (aq, dst, src, n);
237957cc
AS
4015 return true;
4016}
4017
6fc0385c
TS
4018union goacc_property_value
4019GOMP_OFFLOAD_openacc_get_property (int device, enum goacc_property prop)
4020{
2e5ea579
FH
4021 struct agent_info *agent = get_agent_info (device);
4022
4023 union goacc_property_value propval = { .val = 0 };
4024
4025 switch (prop)
4026 {
4027 case GOACC_PROPERTY_FREE_MEMORY:
4028 /* Not supported. */
4029 break;
4030 case GOACC_PROPERTY_MEMORY:
4031 {
4032 size_t size;
4033 hsa_region_t region = agent->data_region;
4034 hsa_status_t status =
4035 hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
4036 if (status == HSA_STATUS_SUCCESS)
4037 propval.val = size;
4038 break;
4039 }
4040 case GOACC_PROPERTY_NAME:
4041 propval.ptr = agent->name;
4042 break;
4043 case GOACC_PROPERTY_VENDOR:
4044 propval.ptr = agent->vendor_name;
4045 break;
4046 case GOACC_PROPERTY_DRIVER:
4047 propval.ptr = hsa_context.driver_version_s;
4048 break;
4049 }
6fc0385c 4050
2e5ea579 4051 return propval;
6fc0385c
TS
4052}
4053
237957cc
AS
4054/* Set up plugin-specific thread-local-data (host-side). */
4055
4056void *
4057GOMP_OFFLOAD_openacc_create_thread_data (int ord __attribute__((unused)))
4058{
4059 struct gcn_thread *thread_data
4060 = GOMP_PLUGIN_malloc (sizeof (struct gcn_thread));
4061
4062 thread_data->async = GOMP_ASYNC_SYNC;
4063
4064 return (void *) thread_data;
4065}
4066
4067/* Clean up plugin-specific thread-local-data. */
4068
4069void
4070GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
4071{
4072 free (data);
4073}
4074
4075/* }}} */