/* Plugin for AMD GCN execution.

   Copyright (C) 2013-2020 Free Software Foundation, Inc.

   Contributed by Mentor Embedded

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* {{{ Includes and defines  */

#include "config.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <inttypes.h>
#include <stdbool.h>
#include <limits.h>
#include <hsa.h>
#include <dlfcn.h>
#include <signal.h>
#include "libgomp-plugin.h"
#include "gomp-constants.h"
#include <elf.h>
#include "oacc-plugin.h"
#include "oacc-int.h"
#include <assert.h>

/* Additional definitions not in HSA 1.1.
   FIXME: this needs to be updated in hsa.h for upstream, but the only source
   right now is the ROCr source which may cause license issues.  */
#define HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT 0xA002

/* These probably won't be in elf.h for a while.  */
#ifndef R_AMDGPU_NONE
#define R_AMDGPU_NONE          0
#define R_AMDGPU_ABS32_LO      1   /* (S + A) & 0xFFFFFFFF  */
#define R_AMDGPU_ABS32_HI      2   /* (S + A) >> 32  */
#define R_AMDGPU_ABS64         3   /* S + A  */
#define R_AMDGPU_REL32         4   /* S + A - P  */
#define R_AMDGPU_REL64         5   /* S + A - P  */
#define R_AMDGPU_ABS32         6   /* S + A  */
#define R_AMDGPU_GOTPCREL      7   /* G + GOT + A - P  */
#define R_AMDGPU_GOTPCREL32_LO 8   /* (G + GOT + A - P) & 0xFFFFFFFF  */
#define R_AMDGPU_GOTPCREL32_HI 9   /* (G + GOT + A - P) >> 32  */
#define R_AMDGPU_REL32_LO      10  /* (S + A - P) & 0xFFFFFFFF  */
#define R_AMDGPU_REL32_HI      11  /* (S + A - P) >> 32  */
#define R_AMDGPU_RELATIVE64    13  /* B + A  */
#endif

/* GCN specific definitions for asynchronous queues.  */

#define ASYNC_QUEUE_SIZE 64
#define DRAIN_QUEUE_SYNCHRONOUS_P false
#define DEBUG_QUEUES 0
#define DEBUG_THREAD_SLEEP 0
#define DEBUG_THREAD_SIGNAL 0

/* Defaults.  */
#define DEFAULT_GCN_HEAP_SIZE (100*1024*1024)  /* 100MB.  */

/* Secure getenv() which returns NULL if running as SUID/SGID.  */
#ifndef HAVE_SECURE_GETENV
#ifdef HAVE___SECURE_GETENV
#define secure_getenv __secure_getenv
#elif defined (HAVE_UNISTD_H) && defined(HAVE_GETUID) && defined(HAVE_GETEUID) \
  && defined(HAVE_GETGID) && defined(HAVE_GETEGID)

#include <unistd.h>

/* Implementation of secure_getenv() for targets where it is not provided but
   we have at least the means to test real and effective IDs.  */

static char *
secure_getenv (const char *name)
{
  if ((getuid () == geteuid ()) && (getgid () == getegid ()))
    return getenv (name);
  else
    return NULL;
}

#else
#define secure_getenv getenv
#endif
#endif

/* }}}  */
/* {{{ Types  */

/* GCN-specific implementation of the GOMP_PLUGIN_acc_thread data.  */

struct gcn_thread
{
  /* The thread number from the async clause, or GOMP_ASYNC_SYNC.  */
  int async;
};

/* As the HSA runtime is dlopened, the following structure defines the
   function pointers through which the plug-in calls into it.  */

struct hsa_runtime_fn_info
{
  /* HSA runtime.  */
  hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
					const char **status_string);
  hsa_status_t (*hsa_system_get_info_fn) (hsa_system_info_t attribute,
					  void *value);
  hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
					 hsa_agent_info_t attribute,
					 void *value);
  hsa_status_t (*hsa_isa_get_info_fn) (hsa_isa_t isa,
				       hsa_isa_info_t attribute,
				       uint32_t index,
				       void *value);
  hsa_status_t (*hsa_init_fn) (void);
  hsa_status_t (*hsa_iterate_agents_fn)
    (hsa_status_t (*callback) (hsa_agent_t agent, void *data), void *data);
  hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
					  hsa_region_info_t attribute,
					  void *value);
  hsa_status_t (*hsa_queue_create_fn)
    (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
     void (*callback) (hsa_status_t status, hsa_queue_t *source, void *data),
     void *data, uint32_t private_segment_size,
     uint32_t group_segment_size, hsa_queue_t **queue);
  hsa_status_t (*hsa_agent_iterate_regions_fn)
    (hsa_agent_t agent,
     hsa_status_t (*callback) (hsa_region_t region, void *data), void *data);
  hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
  hsa_status_t (*hsa_executable_create_fn)
    (hsa_profile_t profile, hsa_executable_state_t executable_state,
     const char *options, hsa_executable_t *executable);
  hsa_status_t (*hsa_executable_global_variable_define_fn)
    (hsa_executable_t executable, const char *variable_name, void *address);
  hsa_status_t (*hsa_executable_load_code_object_fn)
    (hsa_executable_t executable, hsa_agent_t agent,
     hsa_code_object_t code_object, const char *options);
  hsa_status_t (*hsa_executable_freeze_fn) (hsa_executable_t executable,
					    const char *options);
  hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
					uint32_t num_consumers,
					const hsa_agent_t *consumers,
					hsa_signal_t *signal);
  hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
					  void **ptr);
  hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent,
					      hsa_access_permission_t access);
  hsa_status_t (*hsa_memory_copy_fn) (void *dst, const void *src, size_t size);
  hsa_status_t (*hsa_memory_free_fn) (void *ptr);
  hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
  hsa_status_t (*hsa_executable_get_symbol_fn)
    (hsa_executable_t executable, const char *module_name,
     const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
     hsa_executable_symbol_t *symbol);
  hsa_status_t (*hsa_executable_symbol_get_info_fn)
    (hsa_executable_symbol_t executable_symbol,
     hsa_executable_symbol_info_t attribute, void *value);
  hsa_status_t (*hsa_executable_iterate_symbols_fn)
    (hsa_executable_t executable,
     hsa_status_t (*callback) (hsa_executable_t executable,
			       hsa_executable_symbol_t symbol, void *data),
     void *data);
  uint64_t (*hsa_queue_add_write_index_release_fn) (const hsa_queue_t *queue,
						    uint64_t value);
  uint64_t (*hsa_queue_load_read_index_acquire_fn) (const hsa_queue_t *queue);
  void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
				       hsa_signal_value_t value);
  void (*hsa_signal_store_release_fn) (hsa_signal_t signal,
				       hsa_signal_value_t value);
  hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
    (hsa_signal_t signal, hsa_signal_condition_t condition,
     hsa_signal_value_t compare_value, uint64_t timeout_hint,
     hsa_wait_state_t wait_state_hint);
  hsa_signal_value_t (*hsa_signal_load_acquire_fn) (hsa_signal_t signal);
  hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);

  hsa_status_t (*hsa_code_object_deserialize_fn)
    (void *serialized_code_object, size_t serialized_code_object_size,
     const char *options, hsa_code_object_t *code_object);
};
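
/* Illustrative sketch (not part of the original source): once
   init_hsa_runtime_functions, below, has filled the global hsa_fns
   via dlsym, all runtime calls go through these pointers, e.g.

     hsa_status_t status = hsa_fns.hsa_init_fn ();
     if (status != HSA_STATUS_SUCCESS)
       hsa_fatal ("Run-time could not be initialized", status);  */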

/* Structure describing the run-time and grid properties of an HSA kernel
   launch.  This needs to match the format passed to GOMP_OFFLOAD_run.  */

struct GOMP_kernel_launch_attributes
{
  /* Number of dimensions the workload has.  Maximum number is 3.  */
  uint32_t ndim;
  /* Size of the grid in the three respective dimensions.  */
  uint32_t gdims[3];
  /* Size of work-groups in the respective dimensions.  */
  uint32_t wdims[3];
};
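
/* For example (illustrative only), a 2-D launch over a 256x256 grid with
   16x16 work-groups would be described as:

     struct GOMP_kernel_launch_attributes kla;
     kla.ndim = 2;
     kla.gdims[0] = 256;  kla.gdims[1] = 256;  kla.gdims[2] = 1;
     kla.wdims[0] = 16;   kla.wdims[1] = 16;   kla.wdims[2] = 1;  */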

/* Collection of information needed for a dispatch of a kernel from a
   kernel.  */

struct kernel_dispatch
{
  struct agent_info *agent;
  /* Pointer to a command queue associated with a kernel dispatch agent.  */
  void *queue;
  /* Pointer to a memory space used for kernel arguments passing.  */
  void *kernarg_address;
  /* Kernel object.  */
  uint64_t object;
  /* Synchronization signal used for dispatch synchronization.  */
  uint64_t signal;
  /* Private segment size.  */
  uint32_t private_segment_size;
  /* Group segment size.  */
  uint32_t group_segment_size;
};

/* Structure of the kernargs segment, supporting console output.

   This needs to match the definitions in Newlib, and the expectations
   in libgomp target code.  */

struct kernargs {
  /* Leave space for the real kernel arguments.
     OpenACC and OpenMP only use one pointer.  */
  int64_t dummy1;
  int64_t dummy2;

  /* A pointer to struct output, below, for console output data.  */
  int64_t out_ptr;

  /* A pointer to struct heap, below.  */
  int64_t heap_ptr;

  /* A pointer to an ephemeral memory arena.
     Only needed for OpenMP.  */
  int64_t arena_ptr;

  /* Output data.  */
  struct output {
    int return_value;
    unsigned int next_output;
    struct printf_data {
      int written;
      char msg[128];
      int type;
      union {
	int64_t ivalue;
	double dvalue;
	char text[128];
      };
    } queue[1024];
    unsigned int consumed;
  } output_data;
};
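
/* How the console queue is used (a sketch; the device side lives in Newlib,
   so the details there are an assumption): the device reserves a slot by
   atomically incrementing next_output, fills in msg, type and the payload
   union, then sets written = 1.  The host-side console_output, later in
   this file, polls the entries between consumed and next_output and prints
   each one once its written flag appears.  */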

/* A queue entry for a future asynchronous launch.  */

struct kernel_launch
{
  struct kernel_info *kernel;
  void *vars;
  struct GOMP_kernel_launch_attributes kla;
};

/* A queue entry for a future callback.  */

struct callback
{
  void (*fn)(void *);
  void *data;
};

/* A data struct for the copy_data callback.  */

struct copy_data
{
  void *dst;
  const void *src;
  size_t len;
  bool free_src;
  struct goacc_asyncqueue *aq;
};

/* A queue entry for a placeholder.  These correspond to a wait event.  */

struct placeholder
{
  int executed;
  pthread_cond_t cond;
  pthread_mutex_t mutex;
};

/* A queue entry for a wait directive.  */

struct asyncwait_info
{
  struct placeholder *placeholderp;
};

/* Encode the type of an entry in an async queue.  */

enum entry_type
{
  KERNEL_LAUNCH,
  CALLBACK,
  ASYNC_WAIT,
  ASYNC_PLACEHOLDER
};

/* An entry in an async queue.  */

struct queue_entry
{
  enum entry_type type;
  union {
    struct kernel_launch launch;
    struct callback callback;
    struct asyncwait_info asyncwait;
    struct placeholder placeholder;
  } u;
};

/* An async queue header.

   OpenMP may create one of these.
   OpenACC may create many.  */

struct goacc_asyncqueue
{
  struct agent_info *agent;
  hsa_queue_t *hsa_queue;

  pthread_t thread_drain_queue;
  pthread_mutex_t mutex;
  pthread_cond_t queue_cond_in;
  pthread_cond_t queue_cond_out;
  struct queue_entry queue[ASYNC_QUEUE_SIZE];
  int queue_first;
  int queue_n;
  int drain_queue_stop;

  int id;
  struct goacc_asyncqueue *prev;
  struct goacc_asyncqueue *next;
};
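
/* The queue array above is a circular buffer: entries occupy slots
   queue_first .. queue_first+queue_n-1, modulo ASYNC_QUEUE_SIZE.  For
   example (illustrative; the push/pop code appears later in this file),
   the slot for a newly pushed entry is

     struct queue_entry *e
       = &aq->queue[(aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE];  */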

/* Mkoffload uses this structure to describe a kernel.

   OpenMP kernel dimensions are passed at runtime.
   OpenACC kernel dimensions are passed at compile time, here.  */

struct hsa_kernel_description
{
  const char *name;
  int oacc_dims[3];  /* Only present for GCN kernels.  */
  int sgpr_count;
  int vpgr_count;
};

/* Mkoffload uses this structure to describe an offload variable.  */

struct global_var_info
{
  const char *name;
  void *address;
};

/* Mkoffload uses this structure to describe all the kernels in a
   loadable module.  These are passed to libgomp via static constructors.  */

struct gcn_image_desc
{
  struct gcn_image {
    size_t size;
    void *image;
  } *gcn_image;
  const unsigned kernel_count;
  struct hsa_kernel_description *kernel_infos;
  const unsigned global_variable_count;
  struct global_var_info *global_variables;
};

/* This enum mirrors the corresponding LLVM enum's values for all ISAs that we
   support.
   See https://llvm.org/docs/AMDGPUUsage.html#amdgpu-ef-amdgpu-mach-table  */

typedef enum {
  EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a,
  EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c,
  EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f,
} EF_AMDGPU_MACH;

const static int EF_AMDGPU_MACH_MASK = 0x000000ff;
typedef EF_AMDGPU_MACH gcn_isa;

/* Description of an HSA GPU agent (device) and the program associated with
   it.  */

struct agent_info
{
  /* The HSA ID of the agent.  Assigned when hsa_context is initialized.  */
  hsa_agent_t id;
  /* The user-visible device number.  */
  int device_id;
  /* Whether the agent has been initialized.  The fields below are usable only
     if it has been.  */
  bool initialized;

  /* The instruction set architecture of the device.  */
  gcn_isa device_isa;
  /* Name of the agent.  */
  char name[64];
  /* Name of the vendor of the agent.  */
  char vendor_name[64];
  /* Command queues of the agent.  */
  hsa_queue_t *sync_queue;
  struct goacc_asyncqueue *async_queues, *omp_async_queue;
  pthread_mutex_t async_queues_mutex;

  /* The HSA memory region from which to allocate kernel arguments.  */
  hsa_region_t kernarg_region;

  /* The HSA memory region from which to allocate device data.  */
  hsa_region_t data_region;

  /* Allocated team arenas.  */
  struct team_arena_list *team_arena_list;
  pthread_mutex_t team_arena_write_lock;

  /* Read-write lock that protects kernels which are running or about to be run
     from interference with loading and unloading of images.  Needs to be
     locked for reading while a kernel is being run, and for writing if the
     list of modules is manipulated (and thus the HSA program invalidated).  */
  pthread_rwlock_t module_rwlock;

  /* The module associated with this agent.  */
  struct module_info *module;

  /* Mutex enforcing that only one thread will finalize the HSA program.  A
     thread should have locked agent->module_rwlock for reading before
     acquiring it.  */
  pthread_mutex_t prog_mutex;
  /* Flag whether the HSA program that consists of all the modules has been
     finalized.  */
  bool prog_finalized;
  /* HSA executable - the finalized program that is used to locate kernels.  */
  hsa_executable_t executable;
};

/* Information required to identify, finalize and run any given kernel.  */

enum offload_kind {KIND_UNKNOWN, KIND_OPENMP, KIND_OPENACC};

struct kernel_info
{
  /* Name of the kernel, required to locate it within the GCN object-code
     module.  */
  const char *name;
  /* The specific agent the kernel has been or will be finalized for and run
     on.  */
  struct agent_info *agent;
  /* The specific module where the kernel is located.  */
  struct module_info *module;
  /* Information provided by mkoffload associated with the kernel.  */
  struct hsa_kernel_description *description;
  /* Mutex enforcing that at most one thread ever initializes a kernel for
     use.  A thread should have locked agent->module_rwlock for reading before
     acquiring it.  */
  pthread_mutex_t init_mutex;
  /* Flag indicating whether the kernel has been initialized and all fields
     below it contain valid data.  */
  bool initialized;
  /* Flag indicating that the kernel has a problem that blocks an execution.  */
  bool initialization_failed;
  /* The object to be put into the dispatch queue.  */
  uint64_t object;
  /* Required size of kernel arguments.  */
  uint32_t kernarg_segment_size;
  /* Required size of group segment.  */
  uint32_t group_segment_size;
  /* Required size of private segment.  */
  uint32_t private_segment_size;
  /* Set up for OpenMP or OpenACC?  */
  enum offload_kind kind;
};

/* Information about a particular GCN module, its image and kernels.  */

struct module_info
{
  /* The description with which the program has registered the image.  */
  struct gcn_image_desc *image_desc;
  /* GCN heap allocation.  */
  struct heap *heap;
  /* Physical boundaries of the loaded module.  */
  Elf64_Addr phys_address_start;
  Elf64_Addr phys_address_end;

  bool constructors_run_p;
  struct kernel_info *init_array_func, *fini_array_func;

  /* Number of kernels in this module.  */
  int kernel_count;
  /* An array of kernel_info structures describing each kernel in this
     module.  */
  struct kernel_info kernels[];
};

/* A linked list of memory arenas allocated on the device.
   These are only used by OpenMP, as a means to optimize per-team malloc.  */

struct team_arena_list
{
  struct team_arena_list *next;

  /* The number of teams determines the size of the allocation.  */
  int num_teams;
  /* The device address of the arena itself.  */
  void *arena;
  /* A flag to prevent two asynchronous kernels trying to use the same arena.
     The mutex is locked until the kernel exits.  */
  pthread_mutex_t in_use;
};

/* Information about the whole HSA environment and all of its agents.  */

struct hsa_context_info
{
  /* Whether the structure has been initialized.  */
  bool initialized;
  /* Number of usable GPU HSA agents in the system.  */
  int agent_count;
  /* Array of agent_info structures describing the individual HSA agents.  */
  struct agent_info *agents;
  /* Driver version string.  */
  char driver_version_s[30];
};

/* Format of the on-device heap.

   This must match the definition in Newlib and gcn-run.  */

struct heap {
  int64_t size;
  char data[0];
};

/* }}}  */
/* {{{ Global variables  */

/* Information about the whole HSA environment and all of its agents.  */

static struct hsa_context_info hsa_context;

/* HSA runtime functions that are initialized in init_hsa_context.  */

static struct hsa_runtime_fn_info hsa_fns;

/* Heap space, allocated target-side, provided for use of newlib malloc.
   Each module should have its own heap allocated.
   Beware that heap usage increases with OpenMP teams.  See also arenas.  */

static size_t gcn_kernel_heap_size = DEFAULT_GCN_HEAP_SIZE;

/* Flag to decide whether to print information about what is going on to
   stderr.  Set in init_environment_variables depending on environment
   variables.  */

static bool debug;

/* Flag to decide if the runtime should suppress a possible fallback to host
   execution.  */

static bool suppress_host_fallback;

/* Name of the shared library from which the HSA runtime is dlopened
   by this plug-in.  */

static const char *hsa_runtime_lib;

/* Flag to decide if the runtime should also support CPU devices (can be
   a simulator).  */

static bool support_cpu_devices;

/* Runtime dimension overrides.  Zero indicates default.  */

static int override_x_dim = 0;
static int override_z_dim = 0;

/* }}}  */
/* {{{ Debug & Diagnostic  */

/* Print a message to stderr if GCN_DEBUG value is set to true.  */

#define DEBUG_PRINT(...) \
  do \
  { \
    if (debug) \
      { \
	fprintf (stderr, __VA_ARGS__); \
      } \
  } \
  while (false)

/* Flush stderr if GCN_DEBUG value is set to true.  */

#define DEBUG_FLUSH() \
  do { \
    if (debug) \
      fflush (stderr); \
  } while (false)

/* Print a logging message with PREFIX to stderr if GCN_DEBUG value
   is set to true.  */

#define DEBUG_LOG(prefix, ...) \
  do \
    { \
      DEBUG_PRINT (prefix); \
      DEBUG_PRINT (__VA_ARGS__); \
      DEBUG_FLUSH (); \
    } while (false)

/* Print a debugging message to stderr.  */

#define GCN_DEBUG(...) DEBUG_LOG ("GCN debug: ", __VA_ARGS__)

/* Print a warning message to stderr.  */

#define GCN_WARNING(...) DEBUG_LOG ("GCN warning: ", __VA_ARGS__)
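
/* Example usage (illustrative): GCN_DEBUG ("Launching kernel %s\n", name)
   expands to a prefixed fprintf to stderr followed by a flush, all guarded
   by the "debug" flag, so it costs almost nothing when GCN_DEBUG is not
   set in the environment.  */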

/* Print HSA warning STR with an HSA STATUS code.  */

static void
hsa_warn (const char *str, hsa_status_t status)
{
  if (!debug)
    return;

  const char *hsa_error_msg = "[unknown]";
  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);

  fprintf (stderr, "GCN warning: %s\nRuntime message: %s\n", str,
	   hsa_error_msg);
}

/* Report a fatal error STR together with the HSA error corresponding to STATUS
   and terminate execution of the current process.  */

static void
hsa_fatal (const char *str, hsa_status_t status)
{
  const char *hsa_error_msg = "[unknown]";
  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
  GOMP_PLUGIN_fatal ("GCN fatal error: %s\nRuntime message: %s\n", str,
		     hsa_error_msg);
}

/* Like hsa_fatal, except it only reports the error message and returns FALSE,
   propagating error handling to outside of the plugin.  */

static bool
hsa_error (const char *str, hsa_status_t status)
{
  const char *hsa_error_msg = "[unknown]";
  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
  GOMP_PLUGIN_error ("GCN fatal error: %s\nRuntime message: %s\n", str,
		     hsa_error_msg);
  return false;
}

/* Dump information about the available hardware.  */

static void
dump_hsa_system_info (void)
{
  hsa_status_t status;

  hsa_endianness_t endianness;
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_ENDIANNESS,
					   &endianness);
  if (status == HSA_STATUS_SUCCESS)
    switch (endianness)
      {
      case HSA_ENDIANNESS_LITTLE:
	GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: LITTLE\n");
	break;
      case HSA_ENDIANNESS_BIG:
	GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: BIG\n");
	break;
      default:
	GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: UNKNOWN\n");
      }
  else
    GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: FAILED\n");

  uint8_t extensions[128];
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_EXTENSIONS,
					   &extensions);
  if (status == HSA_STATUS_SUCCESS)
    {
      if (extensions[0] & (1 << HSA_EXTENSION_IMAGES))
	GCN_DEBUG ("HSA_SYSTEM_INFO_EXTENSIONS: IMAGES\n");
    }
  else
    GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n");
}

/* Dump information about the available hardware.  */

static void
dump_machine_model (hsa_machine_model_t machine_model, const char *s)
{
  switch (machine_model)
    {
    case HSA_MACHINE_MODEL_SMALL:
      GCN_DEBUG ("%s: SMALL\n", s);
      break;
    case HSA_MACHINE_MODEL_LARGE:
      GCN_DEBUG ("%s: LARGE\n", s);
      break;
    default:
      GCN_WARNING ("%s: UNKNOWN\n", s);
      break;
    }
}

/* Dump information about the available hardware.  */

static void
dump_profile (hsa_profile_t profile, const char *s)
{
  switch (profile)
    {
    case HSA_PROFILE_FULL:
      GCN_DEBUG ("%s: FULL\n", s);
      break;
    case HSA_PROFILE_BASE:
      GCN_DEBUG ("%s: BASE\n", s);
      break;
    default:
      GCN_WARNING ("%s: UNKNOWN\n", s);
      break;
    }
}

/* Dump information about a device memory region.  */

static hsa_status_t
dump_hsa_region (hsa_region_t region, void *data __attribute__((unused)))
{
  hsa_status_t status;

  hsa_region_segment_t segment;
  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
					   &segment);
  if (status == HSA_STATUS_SUCCESS)
    {
      if (segment == HSA_REGION_SEGMENT_GLOBAL)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GLOBAL\n");
      else if (segment == HSA_REGION_SEGMENT_READONLY)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: READONLY\n");
      else if (segment == HSA_REGION_SEGMENT_PRIVATE)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: PRIVATE\n");
      else if (segment == HSA_REGION_SEGMENT_GROUP)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GROUP\n");
      else
	GCN_WARNING ("HSA_REGION_INFO_SEGMENT: UNKNOWN\n");
    }
  else
    GCN_WARNING ("HSA_REGION_INFO_SEGMENT: FAILED\n");

  if (status == HSA_STATUS_SUCCESS && segment == HSA_REGION_SEGMENT_GLOBAL)
    {
      uint32_t flags;
      status
	= hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
					  &flags);
      if (status == HSA_STATUS_SUCCESS)
	{
	  if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: KERNARG\n");
	  if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED)
	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: FINE_GRAINED\n");
	  if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED)
	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: COARSE_GRAINED\n");
	}
      else
	GCN_WARNING ("HSA_REGION_INFO_GLOBAL_FLAGS: FAILED\n");
    }

  size_t size;
  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_SIZE: %zu\n", size);
  else
    GCN_WARNING ("HSA_REGION_INFO_SIZE: FAILED\n");

  status
    = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_ALLOC_MAX_SIZE,
				      &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_ALLOC_MAX_SIZE: %zu\n", size);
  else
    GCN_WARNING ("HSA_REGION_INFO_ALLOC_MAX_SIZE: FAILED\n");

  bool alloc_allowed;
  status
    = hsa_fns.hsa_region_get_info_fn (region,
				      HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED,
				      &alloc_allowed);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: %u\n", alloc_allowed);
  else
    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: FAILED\n");

  if (status != HSA_STATUS_SUCCESS || !alloc_allowed)
    return HSA_STATUS_SUCCESS;

  status
    = hsa_fns.hsa_region_get_info_fn (region,
				      HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE,
				      &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: %zu\n", size);
  else
    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: FAILED\n");

  size_t align;
  status
    = hsa_fns.hsa_region_get_info_fn (region,
				      HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT,
				      &align);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: %zu\n", align);
  else
    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: FAILED\n");

  return HSA_STATUS_SUCCESS;
}

/* Dump information about all the device memory regions.  */

static void
dump_hsa_regions (hsa_agent_t agent)
{
  hsa_status_t status;
  status = hsa_fns.hsa_agent_iterate_regions_fn (agent,
						 dump_hsa_region,
						 NULL);
  if (status != HSA_STATUS_SUCCESS)
    hsa_error ("Dumping hsa regions failed", status);
}

/* Dump information about the available devices.  */

static hsa_status_t
dump_hsa_agent_info (hsa_agent_t agent, void *data __attribute__((unused)))
{
  hsa_status_t status;

  char buf[64];
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME,
					  &buf);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_NAME: %s\n", buf);
  else
    GCN_WARNING ("HSA_AGENT_INFO_NAME: FAILED\n");

  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_VENDOR_NAME,
					  &buf);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_VENDOR_NAME: %s\n", buf);
  else
    GCN_WARNING ("HSA_AGENT_INFO_VENDOR_NAME: FAILED\n");

  hsa_machine_model_t machine_model;
  status
    = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_MACHINE_MODEL,
				     &machine_model);
  if (status == HSA_STATUS_SUCCESS)
    dump_machine_model (machine_model, "HSA_AGENT_INFO_MACHINE_MODEL");
  else
    GCN_WARNING ("HSA_AGENT_INFO_MACHINE_MODEL: FAILED\n");

  hsa_profile_t profile;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_PROFILE,
					  &profile);
  if (status == HSA_STATUS_SUCCESS)
    dump_profile (profile, "HSA_AGENT_INFO_PROFILE");
  else
    GCN_WARNING ("HSA_AGENT_INFO_PROFILE: FAILED\n");

  hsa_device_type_t device_type;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
					  &device_type);
  if (status == HSA_STATUS_SUCCESS)
    {
      switch (device_type)
	{
	case HSA_DEVICE_TYPE_CPU:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: CPU\n");
	  break;
	case HSA_DEVICE_TYPE_GPU:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: GPU\n");
	  break;
	case HSA_DEVICE_TYPE_DSP:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: DSP\n");
	  break;
	default:
	  GCN_WARNING ("HSA_AGENT_INFO_DEVICE: UNKNOWN\n");
	  break;
	}
    }
  else
    GCN_WARNING ("HSA_AGENT_INFO_DEVICE: FAILED\n");

  uint32_t cu_count;
  status = hsa_fns.hsa_agent_get_info_fn
    (agent, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: %u\n", cu_count);
  else
    GCN_WARNING ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: FAILED\n");

  uint32_t size;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_WAVEFRONT_SIZE,
					  &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WAVEFRONT_SIZE: %u\n", size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WAVEFRONT_SIZE: FAILED\n");

  uint32_t max_dim;
  status = hsa_fns.hsa_agent_get_info_fn (agent,
					  HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
					  &max_dim);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: %u\n", max_dim);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: FAILED\n");

  uint32_t max_size;
  status = hsa_fns.hsa_agent_get_info_fn (agent,
					  HSA_AGENT_INFO_WORKGROUP_MAX_SIZE,
					  &max_size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: %u\n", max_size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: FAILED\n");

  uint32_t grid_max_dim;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_DIM,
					  &grid_max_dim);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_DIM: %u\n", grid_max_dim);
  else
    GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_DIM: FAILED\n");

  uint32_t grid_max_size;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_SIZE,
					  &grid_max_size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_SIZE: %u\n", grid_max_size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_SIZE: FAILED\n");

  dump_hsa_regions (agent);

  return HSA_STATUS_SUCCESS;
}

/* Forward reference.  */

static char *get_executable_symbol_name (hsa_executable_symbol_t symbol);

/* Helper function for dump_executable_symbols.  */

static hsa_status_t
dump_executable_symbol (hsa_executable_t executable,
			hsa_executable_symbol_t symbol,
			void *data __attribute__((unused)))
{
  char *name = get_executable_symbol_name (symbol);

  if (name)
    {
      GCN_DEBUG ("executable symbol: %s\n", name);
      free (name);
    }

  return HSA_STATUS_SUCCESS;
}

/* Dump all global symbols in an executable.  */

static void
dump_executable_symbols (hsa_executable_t executable)
{
  hsa_status_t status;
  status
    = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
						 dump_executable_symbol,
						 NULL);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not dump HSA executable symbols", status);
}

/* Dump kernel DISPATCH data structure and indent it by INDENT spaces.  */

static void
print_kernel_dispatch (struct kernel_dispatch *dispatch, unsigned indent)
{
  struct kernargs *kernargs = (struct kernargs *) dispatch->kernarg_address;

  fprintf (stderr, "%*sthis: %p\n", indent, "", dispatch);
  fprintf (stderr, "%*squeue: %p\n", indent, "", dispatch->queue);
  fprintf (stderr, "%*skernarg_address: %p\n", indent, "", kernargs);
  fprintf (stderr, "%*sheap address: %p\n", indent, "",
	   (void *) kernargs->heap_ptr);
  fprintf (stderr, "%*sarena address: %p\n", indent, "",
	   (void *) kernargs->arena_ptr);
  fprintf (stderr, "%*sobject: %lu\n", indent, "", dispatch->object);
  fprintf (stderr, "%*sprivate_segment_size: %u\n", indent, "",
	   dispatch->private_segment_size);
  fprintf (stderr, "%*sgroup_segment_size: %u\n", indent, "",
	   dispatch->group_segment_size);
  fprintf (stderr, "\n");
}

/* }}}  */
/* {{{ Utility functions  */

/* Cast the thread local storage to gcn_thread.  */

static inline struct gcn_thread *
gcn_thread (void)
{
  return (struct gcn_thread *) GOMP_PLUGIN_acc_thread ();
}

/* Initialize debug, suppress_host_fallback and the other option globals
   according to the environment.  */

static void
init_environment_variables (void)
{
  if (secure_getenv ("GCN_DEBUG"))
    debug = true;
  else
    debug = false;

  if (secure_getenv ("GCN_SUPPRESS_HOST_FALLBACK"))
    suppress_host_fallback = true;
  else
    suppress_host_fallback = false;

  hsa_runtime_lib = secure_getenv ("HSA_RUNTIME_LIB");
  if (hsa_runtime_lib == NULL)
    hsa_runtime_lib = HSA_RUNTIME_LIB "libhsa-runtime64.so.1";

  support_cpu_devices = secure_getenv ("GCN_SUPPORT_CPU_DEVICES");

  const char *x = secure_getenv ("GCN_NUM_TEAMS");
  if (!x)
    x = secure_getenv ("GCN_NUM_GANGS");
  if (x)
    override_x_dim = atoi (x);

  const char *z = secure_getenv ("GCN_NUM_THREADS");
  if (!z)
    z = secure_getenv ("GCN_NUM_WORKERS");
  if (z)
    override_z_dim = atoi (z);

  const char *heap = secure_getenv ("GCN_HEAP_SIZE");
  if (heap)
    {
      size_t tmp = atol (heap);
      if (tmp)
	gcn_kernel_heap_size = tmp;
    }
}
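
/* For example (illustrative), running an offloaded program as

     GCN_DEBUG=1 GCN_NUM_TEAMS=4 GCN_HEAP_SIZE=268435456 ./a.out

   enables the debug output above, overrides the X (team) dimension to 4,
   and asks for a 256MB device heap.  */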

/* Return malloc'd string with name of SYMBOL.  */

static char *
get_executable_symbol_name (hsa_executable_symbol_t symbol)
{
  hsa_status_t status;
  char *res;
  uint32_t len;
  const hsa_executable_symbol_info_t info_name_length
    = HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH;

  status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name_length,
						      &len);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not get length of symbol name", status);
      return NULL;
    }

  res = GOMP_PLUGIN_malloc (len + 1);

  const hsa_executable_symbol_info_t info_name
    = HSA_EXECUTABLE_SYMBOL_INFO_NAME;

  status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name, res);

  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not get symbol name", status);
      free (res);
      return NULL;
    }

  res[len] = '\0';

  return res;
}

/* Get the number of GPU Compute Units.  */

static int
get_cu_count (struct agent_info *agent)
{
  uint32_t cu_count;
  hsa_status_t status = hsa_fns.hsa_agent_get_info_fn
    (agent->id, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
  if (status == HSA_STATUS_SUCCESS)
    return cu_count;
  else
    return 64;  /* The usual number for older devices.  */
}

/* Calculate the maximum grid size for OMP threads / OACC workers.
   This depends on the kernel's resource usage levels.  */

static int
limit_worker_threads (int threads)
{
  /* FIXME: Do something more intelligent here.
     GCN can always run 4 threads within a Compute Unit, but
     more than that depends on register usage.  */
  if (threads > 16)
    threads = 16;
  return threads;
}

/* Parse the target attributes INPUT provided by the compiler and return true
   if we should run anything at all.  If no launch attributes are found, fill
   DEF with default values; then store a pointer to the attributes (or to DEF)
   into *RESULT.

   This is used for OpenMP only.  */

static bool
parse_target_attributes (void **input,
			 struct GOMP_kernel_launch_attributes *def,
			 struct GOMP_kernel_launch_attributes **result,
			 struct agent_info *agent)
{
  if (!input)
    GOMP_PLUGIN_fatal ("No target arguments provided");

  bool grid_attrs_found = false;
  bool gcn_dims_found = false;
  int gcn_teams = 0;
  int gcn_threads = 0;
  while (*input)
    {
      intptr_t id = (intptr_t) *input++, val;

      if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
	val = (intptr_t) *input++;
      else
	val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;

      val = (val > INT_MAX) ? INT_MAX : val;

      if ((id & GOMP_TARGET_ARG_DEVICE_MASK) == GOMP_DEVICE_GCN
	  && ((id & GOMP_TARGET_ARG_ID_MASK)
	      == GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES))
	{
	  grid_attrs_found = true;
	  break;
	}
      else if ((id & GOMP_TARGET_ARG_DEVICE_MASK)
	       == GOMP_TARGET_ARG_DEVICE_ALL)
	{
	  gcn_dims_found = true;
	  switch (id & GOMP_TARGET_ARG_ID_MASK)
	    {
	    case GOMP_TARGET_ARG_NUM_TEAMS:
	      gcn_teams = val;
	      break;
	    case GOMP_TARGET_ARG_THREAD_LIMIT:
	      gcn_threads = limit_worker_threads (val);
	      break;
	    default:
	      ;
	    }
	}
    }

  if (gcn_dims_found)
    {
      if (agent->device_isa == EF_AMDGPU_MACH_AMDGCN_GFX900
	  && gcn_threads == 0 && override_z_dim == 0)
	{
	  gcn_threads = 4;
	  GCN_WARNING ("VEGA BUG WORKAROUND: reducing default number of "
		       "threads to 4 per team.\n");
	  GCN_WARNING (" - If this is not a Vega 10 device, please use "
		       "GCN_NUM_THREADS=16\n");
	}

      def->ndim = 3;
      /* Fiji has 64 CUs, but Vega20 has 60.  */
      def->gdims[0] = (gcn_teams > 0) ? gcn_teams : get_cu_count (agent);
      /* Each thread is 64 work items wide.  */
      def->gdims[1] = 64;
      /* A work group can have 16 wavefronts.  */
      def->gdims[2] = (gcn_threads > 0) ? gcn_threads : 16;
      def->wdims[0] = 1;  /* Single team per work-group.  */
      def->wdims[1] = 64;
      def->wdims[2] = 16;
      *result = def;
      return true;
    }
  else if (!grid_attrs_found)
    {
      def->ndim = 1;
      def->gdims[0] = 1;
      def->gdims[1] = 1;
      def->gdims[2] = 1;
      def->wdims[0] = 1;
      def->wdims[1] = 1;
      def->wdims[2] = 1;
      *result = def;
      GCN_WARNING ("GOMP_OFFLOAD_run called with no launch attributes\n");
      return true;
    }

  struct GOMP_kernel_launch_attributes *kla;
  kla = (struct GOMP_kernel_launch_attributes *) *input;
  *result = kla;
  if (kla->ndim == 0 || kla->ndim > 3)
    GOMP_PLUGIN_fatal ("Invalid number of dimensions (%u)", kla->ndim);

  GCN_DEBUG ("GOMP_OFFLOAD_run called with %u dimensions:\n", kla->ndim);
  unsigned i;
  for (i = 0; i < kla->ndim; i++)
    {
      GCN_DEBUG (" Dimension %u: grid size %u and group size %u\n", i,
		 kla->gdims[i], kla->wdims[i]);
      if (kla->gdims[i] == 0)
	return false;
    }
  return true;
}

/* Return the group size given the requested GROUP size, GRID size and number
   of grid dimensions NDIM.  */

static uint32_t
get_group_size (uint32_t ndim, uint32_t grid, uint32_t group)
{
  if (group == 0)
    {
      /* TODO: Provide a default via environment or device characteristics.  */
      if (ndim == 1)
	group = 64;
      else if (ndim == 2)
	group = 8;
      else
	group = 4;
    }

  if (group > grid)
    group = grid;
  return group;
}
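
/* Worked example (illustrative): get_group_size (1, 37, 0) first picks the
   1-D default group size of 64, then clamps it to the grid, so a grid of
   37 work items yields a group size of 37.  */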

/* Atomically store pair of uint16_t values (HEADER and REST) to a PACKET.  */

static void
packet_store_release (uint32_t *packet, uint16_t header, uint16_t rest)
{
  __atomic_store_n (packet, header | (rest << 16), __ATOMIC_RELEASE);
}
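
/* Illustrative use (a sketch of what the dispatch code later in this file
   does; the exact call there may differ):

     packet_store_release ((uint32_t *) packet, header,
			   (uint16_t) kla->ndim
			   << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);

   Publishing the 16-bit header and 16-bit setup field in one atomic release
   store is the point at which the AQL packet becomes visible to the GPU's
   packet processor, so it must happen after all other packet fields have
   been written.  */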

/* A never-called callback for the HSA command queues.  These signal events
   that we don't use, so we trigger an error.

   This "queue" is not to be confused with the async queues, below.  */

static void
hsa_queue_callback (hsa_status_t status,
		    hsa_queue_t *queue __attribute__ ((unused)),
		    void *data __attribute__ ((unused)))
{
  hsa_fatal ("Asynchronous queue error", status);
}

/* }}}  */
/* {{{ HSA initialization  */

/* Populate hsa_fns with the function addresses from libhsa-runtime64.so.  */

static bool
init_hsa_runtime_functions (void)
{
#define DLSYM_FN(function) \
  hsa_fns.function##_fn = dlsym (handle, #function); \
  if (hsa_fns.function##_fn == NULL) \
    return false;
  void *handle = dlopen (hsa_runtime_lib, RTLD_LAZY);
  if (handle == NULL)
    return false;

  DLSYM_FN (hsa_status_string)
  DLSYM_FN (hsa_system_get_info)
  DLSYM_FN (hsa_agent_get_info)
  DLSYM_FN (hsa_init)
  DLSYM_FN (hsa_iterate_agents)
  DLSYM_FN (hsa_region_get_info)
  DLSYM_FN (hsa_queue_create)
  DLSYM_FN (hsa_agent_iterate_regions)
  DLSYM_FN (hsa_executable_destroy)
  DLSYM_FN (hsa_executable_create)
  DLSYM_FN (hsa_executable_global_variable_define)
  DLSYM_FN (hsa_executable_load_code_object)
  DLSYM_FN (hsa_executable_freeze)
  DLSYM_FN (hsa_signal_create)
  DLSYM_FN (hsa_memory_allocate)
  DLSYM_FN (hsa_memory_assign_agent)
  DLSYM_FN (hsa_memory_copy)
  DLSYM_FN (hsa_memory_free)
  DLSYM_FN (hsa_signal_destroy)
  DLSYM_FN (hsa_executable_get_symbol)
  DLSYM_FN (hsa_executable_symbol_get_info)
  DLSYM_FN (hsa_executable_iterate_symbols)
  DLSYM_FN (hsa_queue_add_write_index_release)
  DLSYM_FN (hsa_queue_load_read_index_acquire)
  DLSYM_FN (hsa_signal_wait_acquire)
  DLSYM_FN (hsa_signal_store_relaxed)
  DLSYM_FN (hsa_signal_store_release)
  DLSYM_FN (hsa_signal_load_acquire)
  DLSYM_FN (hsa_queue_destroy)
  DLSYM_FN (hsa_code_object_deserialize)
  return true;
#undef DLSYM_FN
}

/* Return true if the agent is a GPU and can accept concurrent submissions
   from different threads.  */

static bool
suitable_hsa_agent_p (hsa_agent_t agent)
{
  hsa_device_type_t device_type;
  hsa_status_t status
    = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
				     &device_type);
  if (status != HSA_STATUS_SUCCESS)
    return false;

  switch (device_type)
    {
    case HSA_DEVICE_TYPE_GPU:
      break;
    case HSA_DEVICE_TYPE_CPU:
      if (!support_cpu_devices)
	return false;
      break;
    default:
      return false;
    }

  uint32_t features = 0;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_FEATURE,
					  &features);
  if (status != HSA_STATUS_SUCCESS
      || !(features & HSA_AGENT_FEATURE_KERNEL_DISPATCH))
    return false;
  hsa_queue_type_t queue_type;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_QUEUE_TYPE,
					  &queue_type);
  if (status != HSA_STATUS_SUCCESS
      || (queue_type != HSA_QUEUE_TYPE_MULTI))
    return false;

  return true;
}

/* Callback of hsa_iterate_agents; if AGENT is a GPU device, increment
   agent_count in hsa_context.  */

static hsa_status_t
count_gpu_agents (hsa_agent_t agent, void *data __attribute__ ((unused)))
{
  if (suitable_hsa_agent_p (agent))
    hsa_context.agent_count++;
  return HSA_STATUS_SUCCESS;
}

/* Callback of hsa_iterate_agents; if AGENT is a GPU device, assign the agent
   id to the describing structure in the hsa context.  The index of the
   structure is pointed to by DATA; increment it afterwards.  */

static hsa_status_t
assign_agent_ids (hsa_agent_t agent, void *data)
{
  if (suitable_hsa_agent_p (agent))
    {
      int *agent_index = (int *) data;
      hsa_context.agents[*agent_index].id = agent;
      ++*agent_index;
    }
  return HSA_STATUS_SUCCESS;
}

/* Initialize hsa_context if it has not already been done.
   Return TRUE on success.  */

static bool
init_hsa_context (void)
{
  hsa_status_t status;
  int agent_index = 0;

  if (hsa_context.initialized)
    return true;
  init_environment_variables ();
  if (!init_hsa_runtime_functions ())
    {
      GCN_WARNING ("Run-time could not be dynamically opened\n");
      if (suppress_host_fallback)
	GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
      return false;
    }
  status = hsa_fns.hsa_init_fn ();
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Run-time could not be initialized", status);
  GCN_DEBUG ("HSA run-time initialized for GCN\n");

  if (debug)
    dump_hsa_system_info ();

  status = hsa_fns.hsa_iterate_agents_fn (count_gpu_agents, NULL);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("GCN GPU devices could not be enumerated", status);
  GCN_DEBUG ("There are %i GCN GPU devices.\n", hsa_context.agent_count);

  hsa_context.agents
    = GOMP_PLUGIN_malloc_cleared (hsa_context.agent_count
				  * sizeof (struct agent_info));
  status = hsa_fns.hsa_iterate_agents_fn (assign_agent_ids, &agent_index);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Scanning compute agents failed", status);
  if (agent_index != hsa_context.agent_count)
    {
      GOMP_PLUGIN_error ("Failed to assign IDs to all GCN agents");
      return false;
    }

  if (debug)
    {
      status = hsa_fns.hsa_iterate_agents_fn (dump_hsa_agent_info, NULL);
      if (status != HSA_STATUS_SUCCESS)
	GOMP_PLUGIN_error ("Failed to list all HSA runtime agents");
    }

  uint16_t minor, major;
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MINOR,
					   &minor);
  if (status != HSA_STATUS_SUCCESS)
    GOMP_PLUGIN_error ("Failed to obtain HSA runtime minor version");
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MAJOR,
					   &major);
  if (status != HSA_STATUS_SUCCESS)
    GOMP_PLUGIN_error ("Failed to obtain HSA runtime major version");

  size_t len = sizeof hsa_context.driver_version_s;
  int printed = snprintf (hsa_context.driver_version_s, len,
			  "HSA Runtime %hu.%hu", (unsigned short int) major,
			  (unsigned short int) minor);
  if (printed >= len)
    GCN_WARNING ("HSA runtime version string was truncated.  "
		 "Version %hu.%hu is too long.", (unsigned short int) major,
		 (unsigned short int) minor);

  hsa_context.initialized = true;
  return true;
}

/* Verify that hsa_context has already been initialized and return the
   agent_info structure describing device number N.  Return NULL on error.  */

static struct agent_info *
get_agent_info (int n)
{
  if (!hsa_context.initialized)
    {
      GOMP_PLUGIN_error ("Attempt to use uninitialized GCN context.");
      return NULL;
    }
  if (n >= hsa_context.agent_count)
    {
      GOMP_PLUGIN_error ("Request to operate on non-existent GCN device %i", n);
      return NULL;
    }
  if (!hsa_context.agents[n].initialized)
    {
      GOMP_PLUGIN_error ("Attempt to use an uninitialized GCN agent.");
      return NULL;
    }
  return &hsa_context.agents[n];
}

/* Callback of hsa_agent_iterate_regions, via get_*_memory_region functions.

   Selects (breaks at) a suitable region of type KIND.  */

static hsa_status_t
get_memory_region (hsa_region_t region, hsa_region_t *retval,
		   hsa_region_global_flag_t kind)
{
  hsa_status_t status;
  hsa_region_segment_t segment;

  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
					   &segment);
  if (status != HSA_STATUS_SUCCESS)
    return status;
  if (segment != HSA_REGION_SEGMENT_GLOBAL)
    return HSA_STATUS_SUCCESS;

  uint32_t flags;
  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
					   &flags);
  if (status != HSA_STATUS_SUCCESS)
    return status;
  if (flags & kind)
    {
      *retval = region;
      return HSA_STATUS_INFO_BREAK;
    }
  return HSA_STATUS_SUCCESS;
}

/* Callback of hsa_agent_iterate_regions.

   Selects a kernargs memory region.  */

static hsa_status_t
get_kernarg_memory_region (hsa_region_t region, void *data)
{
  return get_memory_region (region, (hsa_region_t *) data,
			    HSA_REGION_GLOBAL_FLAG_KERNARG);
}

/* Callback of hsa_agent_iterate_regions.

   Selects a coarse-grained memory region suitable for the heap and
   offload data.  */

static hsa_status_t
get_data_memory_region (hsa_region_t region, void *data)
{
  return get_memory_region (region, (hsa_region_t *) data,
			    HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED);
}
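
/* Illustrative sketch (an approximation of the device-initialization code
   later in this file): the two callbacks above are driven like so, with
   HSA_STATUS_INFO_BREAK indicating that a suitable region was found:

     hsa_region_t region;
     region.handle = (uint64_t) -1;
     hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
					   get_kernarg_memory_region,
					   &region);
     if (region.handle == (uint64_t) -1)
       GOMP_PLUGIN_error ("Could not find a kernarg memory region");  */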

static int
elf_gcn_isa_field (Elf64_Ehdr *image)
{
  return image->e_flags & EF_AMDGPU_MACH_MASK;
}

const static char *gcn_gfx803_s = "gfx803";
const static char *gcn_gfx900_s = "gfx900";
const static char *gcn_gfx906_s = "gfx906";
const static int gcn_isa_name_len = 6;

/* Returns the name that the HSA runtime uses for the ISA or NULL if we do not
   support the ISA.  */

static const char *
isa_hsa_name (int isa)
{
  switch (isa)
    {
    case EF_AMDGPU_MACH_AMDGCN_GFX803:
      return gcn_gfx803_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX900:
      return gcn_gfx900_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX906:
      return gcn_gfx906_s;
    }
  return NULL;
}

/* Returns the user-facing name that GCC uses to identify the architecture
   (e.g. with -march) or NULL if we do not support the ISA.
   Keep in sync with gcc/config/gcn/gcn.{c,opt}.  */

static const char *
isa_gcc_name (int isa)
{
  switch (isa)
    {
    case EF_AMDGPU_MACH_AMDGCN_GFX803:
      return "fiji";
    default:
      return isa_hsa_name (isa);
    }
}

/* Returns the code which is used in the GCN object code to identify the ISA
   with the given name (as used by the HSA runtime).  */

static gcn_isa
isa_code (const char *isa)
{
  if (!strncmp (isa, gcn_gfx803_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX803;

  if (!strncmp (isa, gcn_gfx900_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX900;

  if (!strncmp (isa, gcn_gfx906_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX906;

  return -1;
}

/* }}}  */
/* {{{ Run  */

/* Create or reuse a team arena.

   Team arenas are used by OpenMP to avoid calling malloc multiple times
   while setting up each team.  This is purely a performance optimization.

   Allocating an arena also costs performance, albeit on the host side, so
   this function will reuse an existing arena if a large enough one is idle.
   The arena is released, but not deallocated, when the kernel exits.  */

static void *
get_team_arena (struct agent_info *agent, int num_teams)
{
  struct team_arena_list **next_ptr = &agent->team_arena_list;
  struct team_arena_list *item;

  for (item = *next_ptr; item; next_ptr = &item->next, item = item->next)
    {
      if (item->num_teams < num_teams)
	continue;

      if (pthread_mutex_trylock (&item->in_use))
	continue;

      return item->arena;
    }

  GCN_DEBUG ("Creating a new arena for %d teams\n", num_teams);

  if (pthread_mutex_lock (&agent->team_arena_write_lock))
    {
      GOMP_PLUGIN_error ("Could not lock a GCN team arena write mutex");
      return NULL;
    }
  item = malloc (sizeof (*item));
  item->num_teams = num_teams;
  item->next = NULL;
  *next_ptr = item;

  if (pthread_mutex_init (&item->in_use, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN team arena mutex");
      return NULL;
    }
  if (pthread_mutex_lock (&item->in_use))
    {
      GOMP_PLUGIN_error ("Could not lock a GCN team arena mutex");
      return NULL;
    }
  if (pthread_mutex_unlock (&agent->team_arena_write_lock))
    {
      GOMP_PLUGIN_error ("Could not unlock a GCN team arena write mutex");
      return NULL;
    }

  const int TEAM_ARENA_SIZE = 64*1024;  /* Must match libgomp.h.  */
  hsa_status_t status;
  status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
					   TEAM_ARENA_SIZE*num_teams,
					   &item->arena);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not allocate memory for GCN kernel arena", status);
  status = hsa_fns.hsa_memory_assign_agent_fn (item->arena, agent->id,
					       HSA_ACCESS_PERMISSION_RW);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not assign arena memory to device", status);

  return item->arena;
}

/* Mark a team arena available for reuse.  */

static void
release_team_arena (struct agent_info *agent, void *arena)
{
  struct team_arena_list *item;

  for (item = agent->team_arena_list; item; item = item->next)
    {
      if (item->arena == arena)
	{
	  if (pthread_mutex_unlock (&item->in_use))
	    GOMP_PLUGIN_error ("Could not unlock a GCN team arena mutex");
	  return;
	}
    }
  GOMP_PLUGIN_error ("Could not find a GCN arena to release.");
}

/* Clean up all the allocated team arenas.  */

static bool
destroy_team_arenas (struct agent_info *agent)
{
  struct team_arena_list *item, *next;

  for (item = agent->team_arena_list; item; item = next)
    {
      next = item->next;
      hsa_fns.hsa_memory_free_fn (item->arena);
      if (pthread_mutex_destroy (&item->in_use))
	{
	  GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex");
	  return false;
	}
      free (item);
    }
  agent->team_arena_list = NULL;

  return true;
}
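
/* Typical arena lifecycle (illustrative): create_kernel_dispatch, below,
   calls get_team_arena before an OpenMP launch; the matching
   release_team_arena runs once the kernel has finished, and
   destroy_team_arenas frees everything when the agent shuts down.  */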
1767
1768 /* Allocate memory on a specified device. */
1769
1770 static void *
1771 alloc_by_agent (struct agent_info *agent, size_t size)
1772 {
1773 GCN_DEBUG ("Allocating %zu bytes on device %d\n", size, agent->device_id);
1774
1775 /* Zero-size allocations are invalid, so in order to return a valid pointer
1776 we need to pass a valid size. One source of zero-size allocations is
1777 kernargs for kernels that have no inputs or outputs (the kernel may
1778 only use console output, for example). */
1779 if (size == 0)
1780 size = 4;
1781
1782 void *ptr;
1783 hsa_status_t status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
1784 size, &ptr);
1785 if (status != HSA_STATUS_SUCCESS)
1786 {
1787 hsa_error ("Could not allocate device memory", status);
1788 return NULL;
1789 }
1790
1791 status = hsa_fns.hsa_memory_assign_agent_fn (ptr, agent->id,
1792 HSA_ACCESS_PERMISSION_RW);
1793 if (status != HSA_STATUS_SUCCESS)
1794 {
1795 hsa_error ("Could not assign data memory to device", status);
1796 return NULL;
1797 }
1798
1799 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1800 bool profiling_dispatch_p
1801 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1802 if (profiling_dispatch_p)
1803 {
1804 acc_prof_info *prof_info = thr->prof_info;
1805 acc_event_info data_event_info;
1806 acc_api_info *api_info = thr->api_info;
1807
1808 prof_info->event_type = acc_ev_alloc;
1809
1810 data_event_info.data_event.event_type = prof_info->event_type;
1811 data_event_info.data_event.valid_bytes
1812 = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1813 data_event_info.data_event.parent_construct
1814 = acc_construct_parallel;
1815 data_event_info.data_event.implicit = 1;
1816 data_event_info.data_event.tool_info = NULL;
1817 data_event_info.data_event.var_name = NULL;
1818 data_event_info.data_event.bytes = size;
1819 data_event_info.data_event.host_ptr = NULL;
1820 data_event_info.data_event.device_ptr = (void *) ptr;
1821
1822 api_info->device_api = acc_device_api_other;
1823
1824 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1825 api_info);
1826 }
1827
1828 return ptr;
1829 }
1830
1831 /* Create kernel dispatch data structure for given KERNEL, along with
1832 the necessary device signals and memory allocations. */
1833
1834 static struct kernel_dispatch *
1835 create_kernel_dispatch (struct kernel_info *kernel, int num_teams)
1836 {
1837 struct agent_info *agent = kernel->agent;
1838 struct kernel_dispatch *shadow
1839 = GOMP_PLUGIN_malloc_cleared (sizeof (struct kernel_dispatch));
1840
1841 shadow->agent = kernel->agent;
1842 shadow->object = kernel->object;
1843
1844 hsa_signal_t sync_signal;
1845 hsa_status_t status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &sync_signal);
1846 if (status != HSA_STATUS_SUCCESS)
1847 hsa_fatal ("Error creating the GCN sync signal", status);
1848
1849 shadow->signal = sync_signal.handle;
1850 shadow->private_segment_size = kernel->private_segment_size;
1851 shadow->group_segment_size = kernel->group_segment_size;
1852
1853 /* We expect kernels to request a single pointer, explicitly, and the
1854 rest of struct kernargs, implicitly. If they request anything else
1855 then something is wrong. */
1856 if (kernel->kernarg_segment_size > 8)
1857 {
1858 GOMP_PLUGIN_fatal ("Unexpectedly large kernargs segment requested");
1859 return NULL;
1860 }
1861
1862 status = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region,
1863 sizeof (struct kernargs),
1864 &shadow->kernarg_address);
1865 if (status != HSA_STATUS_SUCCESS)
1866 hsa_fatal ("Could not allocate memory for GCN kernel arguments", status);
1867 struct kernargs *kernargs = shadow->kernarg_address;
1868
1869 /* Zero-initialize the output_data (minimum needed). */
1870 kernargs->out_ptr = (int64_t)&kernargs->output_data;
1871 kernargs->output_data.next_output = 0;
1872 for (unsigned i = 0;
1873 i < (sizeof (kernargs->output_data.queue)
1874 / sizeof (kernargs->output_data.queue[0]));
1875 i++)
1876 kernargs->output_data.queue[i].written = 0;
1877 kernargs->output_data.consumed = 0;
1878
1879 /* Pass in the heap location. */
1880 kernargs->heap_ptr = (int64_t)kernel->module->heap;
1881
1882 /* Create an arena. */
1883 if (kernel->kind == KIND_OPENMP)
1884 kernargs->arena_ptr = (int64_t)get_team_arena (agent, num_teams);
1885 else
1886 kernargs->arena_ptr = 0;
1887
1888 /* Ensure we can recognize unset return values. */
1889 kernargs->output_data.return_value = 0xcafe0000;
1890
1891 return shadow;
1892 }
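/* Illustrative only (a hypothetical view; the real struct kernargs is
   declared elsewhere in this file): the kernel's single explicit 8-byte
   argument occupies the start of the kernarg allocation, which is where
   run_kernel later copies VARS, and the implicit remainder of struct
   kernargs follows it in the same buffer.  */
#if 0
struct example_kernarg_view
{
  void *vars;	/* The one explicit kernel argument.  */
  /* ... the implicit remainder of struct kernargs follows here ...  */
};
#endif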
1893
1894 /* Output any data the kernel has written to console output. This function
1895 is expected to be called repeatedly (polled) while the kernel executes.
1896
1897 We print all entries from the last item printed up to the first entry
1898 without a "written" flag. If the "final" flag is set then we continue
1899 right to the end.
1900
1901 The print buffer is circular, but the from and to locations don't wrap when
1902 the buffer does, so the output limit is UINT_MAX. The target blocks on
1903 output when the buffer is full. */
1904
1905 static void
1906 console_output (struct kernel_info *kernel, struct kernargs *kernargs,
1907 bool final)
1908 {
1909 unsigned int limit = (sizeof (kernargs->output_data.queue)
1910 / sizeof (kernargs->output_data.queue[0]));
1911
1912 unsigned int from = __atomic_load_n (&kernargs->output_data.consumed,
1913 __ATOMIC_ACQUIRE);
1914 unsigned int to = kernargs->output_data.next_output;
1915
1916 if (from > to)
1917 {
1918 /* Overflow. */
1919 if (final)
1920 printf ("GCN print buffer overflowed.\n");
1921 return;
1922 }
1923
1924 unsigned int i;
1925 for (i = from; i < to; i++)
1926 {
1927 struct printf_data *data = &kernargs->output_data.queue[i%limit];
1928
1929 if (!data->written && !final)
1930 break;
1931
1932 switch (data->type)
1933 {
1934 case 0: printf ("%.128s%ld\n", data->msg, data->ivalue); break;
1935 case 1: printf ("%.128s%f\n", data->msg, data->dvalue); break;
1936 case 2: printf ("%.128s%.128s\n", data->msg, data->text); break;
1937 case 3: printf ("%.128s%.128s", data->msg, data->text); break;
1938 default: printf ("GCN print buffer error!\n"); break;
1939 }
1940 data->written = 0;
1941 __atomic_store_n (&kernargs->output_data.consumed, i+1,
1942 __ATOMIC_RELEASE);
1943 }
1944 fflush (stdout);
1945 }
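/* A minimal sketch of the index scheme used above, under the same
   assumptions: "consumed" and "next_output" only ever increase, and just
   the slot lookup wraps.  The helper name is hypothetical.  */
#if 0
static struct printf_data *
example_slot (struct kernargs *kernargs, unsigned int i)
{
  unsigned int limit = (sizeof (kernargs->output_data.queue)
			/ sizeof (kernargs->output_data.queue[0]));
  /* I runs from "consumed" towards "next_output" without wrapping;
     reducing it modulo the queue length addresses the circular buffer.  */
  return &kernargs->output_data.queue[i % limit];
}
#endif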
1946
1947 /* Release data structure created for a kernel dispatch in SHADOW argument,
1948 and clean up the signal and memory allocations. */
1949
1950 static void
1951 release_kernel_dispatch (struct kernel_dispatch *shadow)
1952 {
1953 GCN_DEBUG ("Released kernel dispatch: %p\n", shadow);
1954
1955 struct kernargs *kernargs = shadow->kernarg_address;
1956 void *arena = (void *)kernargs->arena_ptr;
1957 if (arena)
1958 release_team_arena (shadow->agent, arena);
1959
1960 hsa_fns.hsa_memory_free_fn (shadow->kernarg_address);
1961
1962 hsa_signal_t s;
1963 s.handle = shadow->signal;
1964 hsa_fns.hsa_signal_destroy_fn (s);
1965
1966 free (shadow);
1967 }
1968
1969 /* Extract the properties from a kernel binary. */
1970
1971 static void
1972 init_kernel_properties (struct kernel_info *kernel)
1973 {
1974 hsa_status_t status;
1975 struct agent_info *agent = kernel->agent;
1976 hsa_executable_symbol_t kernel_symbol;
1977 char *buf = alloca (strlen (kernel->name) + 4);
1978 sprintf (buf, "%s.kd", kernel->name);
1979 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
1980 buf, agent->id,
1981 0, &kernel_symbol);
1982 if (status != HSA_STATUS_SUCCESS)
1983 {
1984 hsa_warn ("Could not find symbol for kernel in the code object", status);
1985 fprintf (stderr, "kernel symbol not found: '%s'\n", buf);
1986 dump_executable_symbols (agent->executable);
1987 goto failure;
1988 }
1989 GCN_DEBUG ("Located kernel %s\n", kernel->name);
1990 status = hsa_fns.hsa_executable_symbol_get_info_fn
1991 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel->object);
1992 if (status != HSA_STATUS_SUCCESS)
1993 hsa_fatal ("Could not extract a kernel object from its symbol", status);
1994 status = hsa_fns.hsa_executable_symbol_get_info_fn
1995 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
1996 &kernel->kernarg_segment_size);
1997 if (status != HSA_STATUS_SUCCESS)
1998 hsa_fatal ("Could not get info about kernel argument size", status);
1999 status = hsa_fns.hsa_executable_symbol_get_info_fn
2000 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
2001 &kernel->group_segment_size);
2002 if (status != HSA_STATUS_SUCCESS)
2003 hsa_fatal ("Could not get info about kernel group segment size", status);
2004 status = hsa_fns.hsa_executable_symbol_get_info_fn
2005 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
2006 &kernel->private_segment_size);
2007 if (status != HSA_STATUS_SUCCESS)
2008 hsa_fatal ("Could not get info about kernel private segment size",
2009 status);
2010
2011 /* The kernel type is not known until something tries to launch it. */
2012 kernel->kind = KIND_UNKNOWN;
2013
2014 GCN_DEBUG ("Kernel structure for %s fully initialized with "
2015 "the following segment sizes:\n", kernel->name);
2016 GCN_DEBUG (" group_segment_size: %u\n",
2017 (unsigned) kernel->group_segment_size);
2018 GCN_DEBUG (" private_segment_size: %u\n",
2019 (unsigned) kernel->private_segment_size);
2020 GCN_DEBUG (" kernarg_segment_size: %u\n",
2021 (unsigned) kernel->kernarg_segment_size);
2022 return;
2023
2024 failure:
2025 kernel->initialization_failed = true;
2026 }
2027
2028 /* Do all the work that is necessary before running KERNEL for the first time.
2029 The function assumes the program has been created, finalized and frozen by
2030 create_and_finalize_hsa_program. */
2031
2032 static void
2033 init_kernel (struct kernel_info *kernel)
2034 {
2035 if (pthread_mutex_lock (&kernel->init_mutex))
2036 GOMP_PLUGIN_fatal ("Could not lock a GCN kernel initialization mutex");
2037 if (kernel->initialized)
2038 {
2039 if (pthread_mutex_unlock (&kernel->init_mutex))
2040 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2041 "mutex");
2042
2043 return;
2044 }
2045
2046 init_kernel_properties (kernel);
2047
2048 if (!kernel->initialization_failed)
2049 {
2050 GCN_DEBUG ("\n");
2051
2052 kernel->initialized = true;
2053 }
2054 if (pthread_mutex_unlock (&kernel->init_mutex))
2055 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2056 "mutex");
2057 }
2058
2059 /* Run KERNEL on its agent, pass VARS to it as arguments and take
2060 launch attributes from KLA.
2061
2062 MODULE_LOCKED indicates that the caller already holds the lock and
2063 run_kernel need not lock it again.
2064 If AQ is NULL then agent->sync_queue will be used. */
2065
2066 static void
2067 run_kernel (struct kernel_info *kernel, void *vars,
2068 struct GOMP_kernel_launch_attributes *kla,
2069 struct goacc_asyncqueue *aq, bool module_locked)
2070 {
2071 GCN_DEBUG ("SGPRs: %d, VGPRs: %d\n", kernel->description->sgpr_count,
2072 kernel->description->vpgr_count);
2073
2074 /* Reduce the number of threads/workers if there are insufficient
2075 VGPRs for them all to be resident on a compute unit at once. */
2076 if (kla->ndim == 3 && kernel->description->vpgr_count > 0)
2077 {
2078 int granulated_vgprs = (kernel->description->vpgr_count + 3) & ~3;
2079 int max_threads = (256 / granulated_vgprs) * 4;
2080 if (kla->gdims[2] > max_threads)
2081 {
2082 GCN_WARNING ("Too many VGPRs required to support %d threads/workers"
2083 " per team/gang - reducing to %d threads/workers.\n",
2084 kla->gdims[2], max_threads);
2085 kla->gdims[2] = max_threads;
2086 }
2087 }
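/* Worked example: a kernel needing 41 VGPRs is granulated up to 44,
   giving (256 / 44) * 4 == 20 threads/workers at most per team/gang.  */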
2088
2089 GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel->agent->device_id,
2090 (aq ? aq->id : 0));
2091 GCN_DEBUG ("GCN launch attribs: gdims:[");
2092 int i;
2093 for (i = 0; i < kla->ndim; ++i)
2094 {
2095 if (i)
2096 DEBUG_PRINT (", ");
2097 DEBUG_PRINT ("%u", kla->gdims[i]);
2098 }
2099 DEBUG_PRINT ("], normalized gdims:[");
2100 for (i = 0; i < kla->ndim; ++i)
2101 {
2102 if (i)
2103 DEBUG_PRINT (", ");
2104 DEBUG_PRINT ("%u", kla->gdims[i] / kla->wdims[i]);
2105 }
2106 DEBUG_PRINT ("], wdims:[");
2107 for (i = 0; i < kla->ndim; ++i)
2108 {
2109 if (i)
2110 DEBUG_PRINT (", ");
2111 DEBUG_PRINT ("%u", kla->wdims[i]);
2112 }
2113 DEBUG_PRINT ("]\n");
2114 DEBUG_FLUSH ();
2115
2116 struct agent_info *agent = kernel->agent;
2117 if (!module_locked && pthread_rwlock_rdlock (&agent->module_rwlock))
2118 GOMP_PLUGIN_fatal ("Unable to read-lock a GCN agent rwlock");
2119
2120 if (!agent->initialized)
2121 GOMP_PLUGIN_fatal ("Agent must be initialized");
2122
2123 if (!kernel->initialized)
2124 GOMP_PLUGIN_fatal ("Called kernel must be initialized");
2125
2126 hsa_queue_t *command_q = (aq ? aq->hsa_queue : kernel->agent->sync_queue);
2127
2128 uint64_t index
2129 = hsa_fns.hsa_queue_add_write_index_release_fn (command_q, 1);
2130 GCN_DEBUG ("Got AQL index %llu\n", (unsigned long long) index);
2131
2132 /* Wait until the queue is not full before writing the packet. */
2133 while (index - hsa_fns.hsa_queue_load_read_index_acquire_fn (command_q)
2134 >= command_q->size)
2135 ;
2136
2137 /* Do not allow the dimensions to be overridden when running
2138 constructors or destructors. */
2139 int override_x = kernel->kind == KIND_UNKNOWN ? 0 : override_x_dim;
2140 int override_z = kernel->kind == KIND_UNKNOWN ? 0 : override_z_dim;
2141
2142 hsa_kernel_dispatch_packet_t *packet;
2143 packet = ((hsa_kernel_dispatch_packet_t *) command_q->base_address)
2144 + index % command_q->size;
2145
2146 memset (((uint8_t *) packet) + 4, 0, sizeof (*packet) - 4);
2147 packet->grid_size_x = override_x ? : kla->gdims[0];
2148 packet->workgroup_size_x = get_group_size (kla->ndim,
2149 packet->grid_size_x,
2150 kla->wdims[0]);
2151
2152 if (kla->ndim >= 2)
2153 {
2154 packet->grid_size_y = kla->gdims[1];
2155 packet->workgroup_size_y = get_group_size (kla->ndim, kla->gdims[1],
2156 kla->wdims[1]);
2157 }
2158 else
2159 {
2160 packet->grid_size_y = 1;
2161 packet->workgroup_size_y = 1;
2162 }
2163
2164 if (kla->ndim == 3)
2165 {
2166 packet->grid_size_z = limit_worker_threads (override_z
2167 ? : kla->gdims[2]);
2168 packet->workgroup_size_z = get_group_size (kla->ndim,
2169 packet->grid_size_z,
2170 kla->wdims[2]);
2171 }
2172 else
2173 {
2174 packet->grid_size_z = 1;
2175 packet->workgroup_size_z = 1;
2176 }
2177
2178 GCN_DEBUG ("GCN launch actuals: grid:[%u, %u, %u],"
2179 " normalized grid:[%u, %u, %u], workgroup:[%u, %u, %u]\n",
2180 packet->grid_size_x, packet->grid_size_y, packet->grid_size_z,
2181 packet->grid_size_x / packet->workgroup_size_x,
2182 packet->grid_size_y / packet->workgroup_size_y,
2183 packet->grid_size_z / packet->workgroup_size_z,
2184 packet->workgroup_size_x, packet->workgroup_size_y,
2185 packet->workgroup_size_z);
2186
2187 struct kernel_dispatch *shadow
2188 = create_kernel_dispatch (kernel, packet->grid_size_x);
2189 shadow->queue = command_q;
2190
2191 if (debug)
2192 {
2193 fprintf (stderr, "\nKernel has following dependencies:\n");
2194 print_kernel_dispatch (shadow, 2);
2195 }
2196
2197 packet->private_segment_size = kernel->private_segment_size;
2198 packet->group_segment_size = kernel->group_segment_size;
2199 packet->kernel_object = kernel->object;
2200 packet->kernarg_address = shadow->kernarg_address;
2201 hsa_signal_t s;
2202 s.handle = shadow->signal;
2203 packet->completion_signal = s;
2204 hsa_fns.hsa_signal_store_relaxed_fn (s, 1);
2205 memcpy (shadow->kernarg_address, &vars, sizeof (vars));
2206
2207 GCN_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");
2208
2209 uint16_t header;
2210 header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
2211 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
2212 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
2213
2214 GCN_DEBUG ("Going to dispatch kernel %s on device %d\n", kernel->name,
2215 agent->device_id);
2216
2217 packet_store_release ((uint32_t *) packet, header,
2218 (uint16_t) kla->ndim
2219 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);
2220
2221 hsa_fns.hsa_signal_store_release_fn (command_q->doorbell_signal,
2222 index);
2223
2224 GCN_DEBUG ("Kernel dispatched, waiting for completion\n");
2225
2226 /* Wait for the kernel to complete, with a 1ms timeout so that console output is polled regularly. */
2227 while (hsa_fns.hsa_signal_wait_acquire_fn (s, HSA_SIGNAL_CONDITION_LT, 1,
2228 1000 * 1000,
2229 HSA_WAIT_STATE_BLOCKED) != 0)
2230 {
2231 console_output (kernel, shadow->kernarg_address, false);
2232 }
2233 console_output (kernel, shadow->kernarg_address, true);
2234
2235 struct kernargs *kernargs = shadow->kernarg_address;
2236 unsigned int return_value = (unsigned int)kernargs->output_data.return_value;
2237
2238 release_kernel_dispatch (shadow);
2239
2240 if (!module_locked && pthread_rwlock_unlock (&agent->module_rwlock))
2241 GOMP_PLUGIN_fatal ("Unable to unlock a GCN agent rwlock");
2242
2243 unsigned int upper = (return_value & ~0xffff) >> 16;
2244 if (upper == 0xcafe)
2245 ; // exit not called, normal termination.
2246 else if (upper == 0xffff)
2247 ; // exit called.
2248 else
2249 {
2250 GOMP_PLUGIN_error ("Possible kernel exit value corruption, 2 most"
2251 " significant bytes aren't 0xffff or 0xcafe: 0x%x\n",
2252 return_value);
2253 abort ();
2254 }
2255
2256 if (upper == 0xffff)
2257 {
2258 unsigned int signal = (return_value >> 8) & 0xff;
2259
2260 if (signal == SIGABRT)
2261 {
2262 GCN_WARNING ("GCN Kernel aborted\n");
2263 abort ();
2264 }
2265 else if (signal != 0)
2266 {
2267 GCN_WARNING ("GCN Kernel received unknown signal\n");
2268 abort ();
2269 }
2270
2271 GCN_DEBUG ("GCN Kernel exited with value: %d\n", return_value & 0xff);
2272 exit (return_value & 0xff);
2273 }
2274 }
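/* A sketch of the return-value encoding handled above (the encoding itself
   is defined by the device-side runtime): 0xcafe0000 means the kernel never
   called exit, while 0xffffSSCC packs a signal number SS and an exit code
   CC.  The helper is illustrative only.  */
#if 0
static void
example_decode_return (unsigned int rv, unsigned int *signum,
		       unsigned int *code)
{
  if ((rv >> 16) == 0xcafe)
    *signum = *code = 0;	   /* Normal termination; exit not called.  */
  else if ((rv >> 16) == 0xffff)
    {
      *signum = (rv >> 8) & 0xff;  /* E.g. SIGABRT from an aborted kernel.  */
      *code = rv & 0xff;	   /* The value passed to exit.  */
    }
}
#endif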
2275
2276 /* }}} */
2277 /* {{{ Load/Unload */
2278
2279 /* Initialize KERNEL from D and other parameters. Return true on success. */
2280
2281 static bool
2282 init_basic_kernel_info (struct kernel_info *kernel,
2283 struct hsa_kernel_description *d,
2284 struct agent_info *agent,
2285 struct module_info *module)
2286 {
2287 kernel->agent = agent;
2288 kernel->module = module;
2289 kernel->name = d->name;
2290 kernel->description = d;
2291 if (pthread_mutex_init (&kernel->init_mutex, NULL))
2292 {
2293 GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");
2294 return false;
2295 }
2296 return true;
2297 }
2298
2299 /* Check that the GCN ISA of the given image matches the ISA of the agent. */
2300
2301 static bool
2302 isa_matches_agent (struct agent_info *agent, Elf64_Ehdr *image)
2303 {
2304 int isa_field = elf_gcn_isa_field (image);
2305 const char *isa_s = isa_hsa_name (isa_field);
2306 if (!isa_s)
2307 {
2308 hsa_error ("Unsupported ISA in GCN code object.", HSA_STATUS_ERROR);
2309 return false;
2310 }
2311
2312 if (isa_field != agent->device_isa)
2313 {
2314 char msg[120];
2315 const char *agent_isa_s = isa_hsa_name (agent->device_isa);
2316 const char *agent_isa_gcc_s = isa_gcc_name (agent->device_isa);
2317 assert (agent_isa_s);
2318 assert (agent_isa_gcc_s);
2319
2320 snprintf (msg, sizeof msg,
2321 "GCN code object ISA '%s' does not match GPU ISA '%s'.\n"
2322 "Try to recompile with '-foffload=-march=%s'.\n",
2323 isa_s, agent_isa_s, agent_isa_gcc_s);
2324
2325 hsa_error (msg, HSA_STATUS_ERROR);
2326 return false;
2327 }
2328
2329 return true;
2330 }
2331
2332 /* Create and finalize the program consisting of all loaded modules. */
2333
2334 static bool
2335 create_and_finalize_hsa_program (struct agent_info *agent)
2336 {
2337 hsa_status_t status;
2338 bool res = true;
2339 if (pthread_mutex_lock (&agent->prog_mutex))
2340 {
2341 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
2342 return false;
2343 }
2344 if (agent->prog_finalized)
2345 goto final;
2346
2347 status
2348 = hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
2349 HSA_EXECUTABLE_STATE_UNFROZEN,
2350 "", &agent->executable);
2351 if (status != HSA_STATUS_SUCCESS)
2352 {
2353 hsa_error ("Could not create GCN executable", status);
2354 goto fail;
2355 }
2356
2357 /* Load any GCN modules. */
2358 struct module_info *module = agent->module;
2359 if (module)
2360 {
2361 Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;
2362
2363 if (!isa_matches_agent (agent, image))
2364 goto fail;
2365
2366 hsa_code_object_t co = { 0 };
2367 status = hsa_fns.hsa_code_object_deserialize_fn
2368 (module->image_desc->gcn_image->image,
2369 module->image_desc->gcn_image->size,
2370 NULL, &co);
2371 if (status != HSA_STATUS_SUCCESS)
2372 {
2373 hsa_error ("Could not deserialize GCN code object", status);
2374 goto fail;
2375 }
2376
2377 status = hsa_fns.hsa_executable_load_code_object_fn
2378 (agent->executable, agent->id, co, "");
2379 if (status != HSA_STATUS_SUCCESS)
2380 {
2381 hsa_error ("Could not load GCN code object", status);
2382 goto fail;
2383 }
2384
2385 if (!module->heap)
2386 {
2387 status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
2388 gcn_kernel_heap_size,
2389 (void**)&module->heap);
2390 if (status != HSA_STATUS_SUCCESS)
2391 {
2392 hsa_error ("Could not allocate memory for GCN heap", status);
2393 goto fail;
2394 }
2395
2396 status = hsa_fns.hsa_memory_assign_agent_fn
2397 (module->heap, agent->id, HSA_ACCESS_PERMISSION_RW);
2398 if (status != HSA_STATUS_SUCCESS)
2399 {
2400 hsa_error ("Could not assign GCN heap memory to device", status);
2401 goto fail;
2402 }
2403
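/* Publish the heap size inside the heap object itself, which is
   presumably where the device-side allocator expects to find it.  */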
2404 hsa_fns.hsa_memory_copy_fn (&module->heap->size,
2405 &gcn_kernel_heap_size,
2406 sizeof (gcn_kernel_heap_size));
2407 }
2408
2409 }
2410
2411 if (debug)
2412 dump_executable_symbols (agent->executable);
2413
2414 status = hsa_fns.hsa_executable_freeze_fn (agent->executable, "");
2415 if (status != HSA_STATUS_SUCCESS)
2416 {
2417 hsa_error ("Could not freeze the GCN executable", status);
2418 goto fail;
2419 }
2420
2421 final:
2422 agent->prog_finalized = true;
2423
2424 if (pthread_mutex_unlock (&agent->prog_mutex))
2425 {
2426 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
2427 res = false;
2428 }
2429
2430 return res;
2431
2432 fail:
2433 res = false;
2434 goto final;
2435 }
2436
2437 /* Free the HSA program in AGENT and everything associated with it, then
2438 set agent->prog_finalized and the initialized flags of all kernels to
2439 false. Return TRUE on success. */
2440
2441 static bool
2442 destroy_hsa_program (struct agent_info *agent)
2443 {
2444 if (!agent->prog_finalized)
2445 return true;
2446
2447 hsa_status_t status;
2448
2449 GCN_DEBUG ("Destroying the current GCN program.\n");
2450
2451 status = hsa_fns.hsa_executable_destroy_fn (agent->executable);
2452 if (status != HSA_STATUS_SUCCESS)
2453 return hsa_error ("Could not destroy GCN executable", status);
2454
2455 if (agent->module)
2456 {
2457 int i;
2458 for (i = 0; i < agent->module->kernel_count; i++)
2459 agent->module->kernels[i].initialized = false;
2460
2461 if (agent->module->heap)
2462 {
2463 hsa_fns.hsa_memory_free_fn (agent->module->heap);
2464 agent->module->heap = NULL;
2465 }
2466 }
2467 agent->prog_finalized = false;
2468 return true;
2469 }
2470
2471 /* Deinitialize all information associated with MODULE and kernels within
2472 it. Return TRUE on success. */
2473
2474 static bool
2475 destroy_module (struct module_info *module, bool locked)
2476 {
2477 /* Run destructors before destroying the module. */
2478 struct GOMP_kernel_launch_attributes kla =
2479 { 3,
2480 /* Grid size. */
2481 { 1, 64, 1 },
2482 /* Work-group size. */
2483 { 1, 64, 1 }
2484 };
2485
2486 if (module->fini_array_func)
2487 {
2488 init_kernel (module->fini_array_func);
2489 run_kernel (module->fini_array_func, NULL, &kla, NULL, locked);
2490 }
2491 module->constructors_run_p = false;
2492
2493 int i;
2494 for (i = 0; i < module->kernel_count; i++)
2495 if (pthread_mutex_destroy (&module->kernels[i].init_mutex))
2496 {
2497 GOMP_PLUGIN_error ("Failed to destroy a GCN kernel initialization "
2498 "mutex");
2499 return false;
2500 }
2501
2502 return true;
2503 }
2504
2505 /* }}} */
2506 /* {{{ Async */
2507
2508 /* Execute a single entry (of any type) from an async queue. */
2509
2510 static void
2511 execute_queue_entry (struct goacc_asyncqueue *aq, int index)
2512 {
2513 struct queue_entry *entry = &aq->queue[index];
2514
2515 switch (entry->type)
2516 {
2517 case KERNEL_LAUNCH:
2518 if (DEBUG_QUEUES)
2519 GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d)\n",
2520 aq->agent->device_id, aq->id, index);
2521 run_kernel (entry->u.launch.kernel,
2522 entry->u.launch.vars,
2523 &entry->u.launch.kla, aq, false);
2524 if (DEBUG_QUEUES)
2525 GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d) done\n",
2526 aq->agent->device_id, aq->id, index);
2527 break;
2528
2529 case CALLBACK:
2530 if (DEBUG_QUEUES)
2531 GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d)\n",
2532 aq->agent->device_id, aq->id, index);
2533 entry->u.callback.fn (entry->u.callback.data);
2534 if (DEBUG_QUEUES)
2535 GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d) done\n",
2536 aq->agent->device_id, aq->id, index);
2537 break;
2538
2539 case ASYNC_WAIT:
2540 {
2541 /* FIXME: is it safe to access a placeholder that may already have
2542 been executed? */
2543 struct placeholder *placeholderp = entry->u.asyncwait.placeholderp;
2544
2545 if (DEBUG_QUEUES)
2546 GCN_DEBUG ("Async thread %d:%d: Executing async wait entry (%d)\n",
2547 aq->agent->device_id, aq->id, index);
2548
2549 pthread_mutex_lock (&placeholderp->mutex);
2550
2551 while (!placeholderp->executed)
2552 pthread_cond_wait (&placeholderp->cond, &placeholderp->mutex);
2553
2554 pthread_mutex_unlock (&placeholderp->mutex);
2555
2556 if (pthread_cond_destroy (&placeholderp->cond))
2557 GOMP_PLUGIN_error ("Failed to destroy serialization cond");
2558
2559 if (pthread_mutex_destroy (&placeholderp->mutex))
2560 GOMP_PLUGIN_error ("Failed to destroy serialization mutex");
2561
2562 if (DEBUG_QUEUES)
2563 GCN_DEBUG ("Async thread %d:%d: Executing async wait "
2564 "entry (%d) done\n", aq->agent->device_id, aq->id, index);
2565 }
2566 break;
2567
2568 case ASYNC_PLACEHOLDER:
2569 pthread_mutex_lock (&entry->u.placeholder.mutex);
2570 entry->u.placeholder.executed = 1;
2571 pthread_cond_signal (&entry->u.placeholder.cond);
2572 pthread_mutex_unlock (&entry->u.placeholder.mutex);
2573 break;
2574
2575 default:
2576 GOMP_PLUGIN_fatal ("Unknown queue element");
2577 }
2578 }
2579
2580 /* This function is run as a thread to service an async queue in the
2581 background. It runs continuously until the stop flag is set. */
2582
2583 static void *
2584 drain_queue (void *thread_arg)
2585 {
2586 struct goacc_asyncqueue *aq = thread_arg;
2587
2588 if (DRAIN_QUEUE_SYNCHRONOUS_P)
2589 {
2590 aq->drain_queue_stop = 2;
2591 return NULL;
2592 }
2593
2594 pthread_mutex_lock (&aq->mutex);
2595
2596 while (true)
2597 {
2598 if (aq->drain_queue_stop)
2599 break;
2600
2601 if (aq->queue_n > 0)
2602 {
2603 pthread_mutex_unlock (&aq->mutex);
2604 execute_queue_entry (aq, aq->queue_first);
2605
2606 pthread_mutex_lock (&aq->mutex);
2607 aq->queue_first = ((aq->queue_first + 1)
2608 % ASYNC_QUEUE_SIZE);
2609 aq->queue_n--;
2610
2611 if (DEBUG_THREAD_SIGNAL)
2612 GCN_DEBUG ("Async thread %d:%d: broadcasting queue out update\n",
2613 aq->agent->device_id, aq->id);
2614 pthread_cond_broadcast (&aq->queue_cond_out);
2615 pthread_mutex_unlock (&aq->mutex);
2616
2617 if (DEBUG_QUEUES)
2618 GCN_DEBUG ("Async thread %d:%d: continue\n", aq->agent->device_id,
2619 aq->id);
2620 pthread_mutex_lock (&aq->mutex);
2621 }
2622 else
2623 {
2624 if (DEBUG_THREAD_SLEEP)
2625 GCN_DEBUG ("Async thread %d:%d: going to sleep\n",
2626 aq->agent->device_id, aq->id);
2627 pthread_cond_wait (&aq->queue_cond_in, &aq->mutex);
2628 if (DEBUG_THREAD_SLEEP)
2629 GCN_DEBUG ("Async thread %d:%d: woke up, rechecking\n",
2630 aq->agent->device_id, aq->id);
2631 }
2632 }
2633
2634 aq->drain_queue_stop = 2;
2635 if (DEBUG_THREAD_SIGNAL)
2636 GCN_DEBUG ("Async thread %d:%d: broadcasting last queue out update\n",
2637 aq->agent->device_id, aq->id);
2638 pthread_cond_broadcast (&aq->queue_cond_out);
2639 pthread_mutex_unlock (&aq->mutex);
2640
2641 GCN_DEBUG ("Async thread %d:%d: returning\n", aq->agent->device_id, aq->id);
2642 return NULL;
2643 }
2644
2645 /* This function is used only when DRAIN_QUEUE_SYNCHRONOUS_P is set, which
2646 is not usually the case. This is just a debug tool. */
2647
2648 static void
2649 drain_queue_synchronous (struct goacc_asyncqueue *aq)
2650 {
2651 pthread_mutex_lock (&aq->mutex);
2652
2653 while (aq->queue_n > 0)
2654 {
2655 execute_queue_entry (aq, aq->queue_first);
2656
2657 aq->queue_first = ((aq->queue_first + 1)
2658 % ASYNC_QUEUE_SIZE);
2659 aq->queue_n--;
2660 }
2661
2662 pthread_mutex_unlock (&aq->mutex);
2663 }
2664
2665 /* Block the current thread until an async queue is writable. The aq->mutex
2666 lock should be held on entry, and remains locked on exit. */
2667
2668 static void
2669 wait_for_queue_nonfull (struct goacc_asyncqueue *aq)
2670 {
2671 if (aq->queue_n == ASYNC_QUEUE_SIZE)
2672 {
2673 /* Queue is full. Wait for it to not be full. */
2674 while (aq->queue_n == ASYNC_QUEUE_SIZE)
2675 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
2676 }
2677 }
2678
2679 /* Request an asynchronous kernel launch on the specified queue. This
2680 may block if the queue is full, but returns without waiting for the
2681 kernel to run. */
2682
2683 static void
2684 queue_push_launch (struct goacc_asyncqueue *aq, struct kernel_info *kernel,
2685 void *vars, struct GOMP_kernel_launch_attributes *kla)
2686 {
2687 assert (aq->agent == kernel->agent);
2688
2689 pthread_mutex_lock (&aq->mutex);
2690
2691 wait_for_queue_nonfull (aq);
2692
2693 int queue_last = ((aq->queue_first + aq->queue_n)
2694 % ASYNC_QUEUE_SIZE);
2695 if (DEBUG_QUEUES)
2696 GCN_DEBUG ("queue_push_launch %d:%d: at %i\n", aq->agent->device_id,
2697 aq->id, queue_last);
2698
2699 aq->queue[queue_last].type = KERNEL_LAUNCH;
2700 aq->queue[queue_last].u.launch.kernel = kernel;
2701 aq->queue[queue_last].u.launch.vars = vars;
2702 aq->queue[queue_last].u.launch.kla = *kla;
2703
2704 aq->queue_n++;
2705
2706 if (DEBUG_THREAD_SIGNAL)
2707 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2708 aq->agent->device_id, aq->id);
2709 pthread_cond_signal (&aq->queue_cond_in);
2710
2711 pthread_mutex_unlock (&aq->mutex);
2712 }
2713
2714 /* Request an asynchronous callback on the specified queue. The callback
2715 function will be called, with the given opaque data, from the appropriate
2716 async thread, when all previous items on that queue are complete. */
2717
2718 static void
2719 queue_push_callback (struct goacc_asyncqueue *aq, void (*fn)(void *),
2720 void *data)
2721 {
2722 pthread_mutex_lock (&aq->mutex);
2723
2724 wait_for_queue_nonfull (aq);
2725
2726 int queue_last = ((aq->queue_first + aq->queue_n)
2727 % ASYNC_QUEUE_SIZE);
2728 if (DEBUG_QUEUES)
2729 GCN_DEBUG ("queue_push_callback %d:%d: at %i\n", aq->agent->device_id,
2730 aq->id, queue_last);
2731
2732 aq->queue[queue_last].type = CALLBACK;
2733 aq->queue[queue_last].u.callback.fn = fn;
2734 aq->queue[queue_last].u.callback.data = data;
2735
2736 aq->queue_n++;
2737
2738 if (DEBUG_THREAD_SIGNAL)
2739 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2740 aq->agent->device_id, aq->id);
2741 pthread_cond_signal (&aq->queue_cond_in);
2742
2743 pthread_mutex_unlock (&aq->mutex);
2744 }
2745
2746 /* Request that a given async thread wait for another thread (unspecified) to
2747 reach the given placeholder. The wait will occur when all previous entries
2748 on the queue are complete. A placeholder is effectively a kind of signal
2749 which simply sets a flag when encountered in a queue. */
2750
2751 static void
2752 queue_push_asyncwait (struct goacc_asyncqueue *aq,
2753 struct placeholder *placeholderp)
2754 {
2755 pthread_mutex_lock (&aq->mutex);
2756
2757 wait_for_queue_nonfull (aq);
2758
2759 int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2760 if (DEBUG_QUEUES)
2761 GCN_DEBUG ("queue_push_asyncwait %d:%d: at %i\n", aq->agent->device_id,
2762 aq->id, queue_last);
2763
2764 aq->queue[queue_last].type = ASYNC_WAIT;
2765 aq->queue[queue_last].u.asyncwait.placeholderp = placeholderp;
2766
2767 aq->queue_n++;
2768
2769 if (DEBUG_THREAD_SIGNAL)
2770 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2771 aq->agent->device_id, aq->id);
2772 pthread_cond_signal (&aq->queue_cond_in);
2773
2774 pthread_mutex_unlock (&aq->mutex);
2775 }
2776
2777 /* Add a placeholder into an async queue. When the async thread reaches the
2778 placeholder it will set the "executed" flag to true and continue.
2779 Another thread may be waiting on this thread reaching the placeholder. */
2780
2781 static struct placeholder *
2782 queue_push_placeholder (struct goacc_asyncqueue *aq)
2783 {
2784 struct placeholder *placeholderp;
2785
2786 pthread_mutex_lock (&aq->mutex);
2787
2788 wait_for_queue_nonfull (aq);
2789
2790 int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2791 if (DEBUG_QUEUES)
2792 GCN_DEBUG ("queue_push_placeholder %d:%d: at %i\n", aq->agent->device_id,
2793 aq->id, queue_last);
2794
2795 aq->queue[queue_last].type = ASYNC_PLACEHOLDER;
2796 placeholderp = &aq->queue[queue_last].u.placeholder;
2797
2798 if (pthread_mutex_init (&placeholderp->mutex, NULL))
2799 {
2800 pthread_mutex_unlock (&aq->mutex);
2801 GOMP_PLUGIN_error ("Failed to initialize serialization mutex");
2802 }
2803
2804 if (pthread_cond_init (&placeholderp->cond, NULL))
2805 {
2806 pthread_mutex_unlock (&aq->mutex);
2807 GOMP_PLUGIN_error ("Failed to initialize serialization cond");
2808 }
2809
2810 placeholderp->executed = 0;
2811
2812 aq->queue_n++;
2813
2814 if (DEBUG_THREAD_SIGNAL)
2815 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2816 aq->agent->device_id, aq->id);
2817 pthread_cond_signal (&aq->queue_cond_in);
2818
2819 pthread_mutex_unlock (&aq->mutex);
2820
2821 return placeholderp;
2822 }
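/* Illustrative sketch of how the two primitives above pair up to serialize
   one queue against another; the helper is hypothetical, not plugin API.  */
#if 0
static void
example_serialize (struct goacc_asyncqueue *waiter,
		   struct goacc_asyncqueue *waited_on)
{
  /* Once WAITER reaches this entry it stalls until WAITED_ON has passed
     the placeholder appended here.  */
  struct placeholder *p = queue_push_placeholder (waited_on);
  queue_push_asyncwait (waiter, p);
}
#endif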
2823
2824 /* Signal an asynchronous thread to terminate, and wait for it to do so. */
2825
2826 static void
2827 finalize_async_thread (struct goacc_asyncqueue *aq)
2828 {
2829 pthread_mutex_lock (&aq->mutex);
2830 if (aq->drain_queue_stop == 2)
2831 {
2832 pthread_mutex_unlock (&aq->mutex);
2833 return;
2834 }
2835
2836 aq->drain_queue_stop = 1;
2837
2838 if (DEBUG_THREAD_SIGNAL)
2839 GCN_DEBUG ("Signalling async thread %d:%d: cond_in\n",
2840 aq->agent->device_id, aq->id);
2841 pthread_cond_signal (&aq->queue_cond_in);
2842
2843 while (aq->drain_queue_stop != 2)
2844 {
2845 if (DEBUG_THREAD_SLEEP)
2846 GCN_DEBUG ("Waiting for async thread %d:%d to finish, putting thread"
2847 " to sleep\n", aq->agent->device_id, aq->id);
2848 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
2849 if (DEBUG_THREAD_SLEEP)
2850 GCN_DEBUG ("Waiting, woke up thread %d:%d. Rechecking\n",
2851 aq->agent->device_id, aq->id);
2852 }
2853
2854 GCN_DEBUG ("Done waiting for async thread %d:%d\n", aq->agent->device_id,
2855 aq->id);
2856 pthread_mutex_unlock (&aq->mutex);
2857
2858 int err = pthread_join (aq->thread_drain_queue, NULL);
2859 if (err != 0)
2860 GOMP_PLUGIN_fatal ("Join async thread %d:%d: failed: %s",
2861 aq->agent->device_id, aq->id, strerror (err));
2862 GCN_DEBUG ("Joined with async thread %d:%d\n", aq->agent->device_id, aq->id);
2863 }
2864
2865 /* Set up an async queue for OpenMP. There will be only one. The
2866 implementation simply uses an OpenACC async queue.
2867 FIXME: is this thread-safe if two threads call this function? */
2868
2869 static void
2870 maybe_init_omp_async (struct agent_info *agent)
2871 {
2872 if (!agent->omp_async_queue)
2873 agent->omp_async_queue
2874 = GOMP_OFFLOAD_openacc_async_construct (agent->device_id);
2875 }
2876
2877 /* A wrapper that works around an issue in the HSA runtime with host-to-device
2878 copies from read-only pages. */
2879
2880 static void
2881 hsa_memory_copy_wrapper (void *dst, const void *src, size_t len)
2882 {
2883 hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, len);
2884
2885 if (status == HSA_STATUS_SUCCESS)
2886 return;
2887
2888 /* It appears that the copy fails if the source data is in a read-only page.
2889 We can't detect that easily, so try copying the data to a temporary buffer
2890 and doing the copy again if we got an error above. */
2891
2892 GCN_WARNING ("Read-only data transfer bug workaround triggered for "
2893 "[%p:+%d]\n", (void *) src, (int) len);
2894
2895 void *src_copy = GOMP_PLUGIN_malloc (len);
2896 memcpy (src_copy, src, len);
2897 status = hsa_fns.hsa_memory_copy_fn (dst, (const void *) src_copy, len);
2898 free (src_copy);
2899 if (status != HSA_STATUS_SUCCESS)
2900 GOMP_PLUGIN_error ("memory copy failed");
2901 }
2902
2903 /* Copy data to or from a device. This is intended for use as an async
2904 callback event. */
2905
2906 static void
2907 copy_data (void *data_)
2908 {
2909 struct copy_data *data = (struct copy_data *)data_;
2910 GCN_DEBUG ("Async thread %d:%d: Copying %zu bytes from (%p) to (%p)\n",
2911 data->aq->agent->device_id, data->aq->id, data->len, data->src,
2912 data->dst);
2913 hsa_memory_copy_wrapper (data->dst, data->src, data->len);
2914 if (data->free_src)
2915 free ((void *) data->src);
2916 free (data);
2917 }
2918
2919 /* Free device data. This is intended for use as an async callback event. */
2920
2921 static void
2922 gomp_offload_free (void *ptr)
2923 {
2924 GCN_DEBUG ("Async thread ?:?: Freeing %p\n", ptr);
2925 GOMP_OFFLOAD_free (0, ptr);
2926 }
2927
2928 /* Request an asynchronous data copy, to or from a device, on a given queue.
2929 The event will be registered as a callback. If FREE_SRC is true
2930 then the source data will be freed following the copy. */
2931
2932 static void
2933 queue_push_copy (struct goacc_asyncqueue *aq, void *dst, const void *src,
2934 size_t len, bool free_src)
2935 {
2936 if (DEBUG_QUEUES)
2937 GCN_DEBUG ("queue_push_copy %d:%d: %zu bytes from (%p) to (%p)\n",
2938 aq->agent->device_id, aq->id, len, src, dst);
2939 struct copy_data *data
2940 = (struct copy_data *)GOMP_PLUGIN_malloc (sizeof (struct copy_data));
2941 data->dst = dst;
2942 data->src = src;
2943 data->len = len;
2944 data->free_src = free_src;
2945 data->aq = aq;
2946 queue_push_callback (aq, copy_data, data);
2947 }
2948
2949 /* Return true if the given queue is currently empty. */
2950
2951 static int
2952 queue_empty (struct goacc_asyncqueue *aq)
2953 {
2954 pthread_mutex_lock (&aq->mutex);
2955 int res = aq->queue_n == 0 ? 1 : 0;
2956 pthread_mutex_unlock (&aq->mutex);
2957
2958 return res;
2959 }
2960
2961 /* Wait for a given queue to become empty. This implements an OpenACC wait
2962 directive. */
2963
2964 static void
2965 wait_queue (struct goacc_asyncqueue *aq)
2966 {
2967 if (DRAIN_QUEUE_SYNCHRONOUS_P)
2968 {
2969 drain_queue_synchronous (aq);
2970 return;
2971 }
2972
2973 pthread_mutex_lock (&aq->mutex);
2974
2975 while (aq->queue_n > 0)
2976 {
2977 if (DEBUG_THREAD_SLEEP)
2978 GCN_DEBUG ("waiting for thread %d:%d, putting thread to sleep\n",
2979 aq->agent->device_id, aq->id);
2980 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
2981 if (DEBUG_THREAD_SLEEP)
2982 GCN_DEBUG ("thread %d:%d woke up. Rechecking\n", aq->agent->device_id,
2983 aq->id);
2984 }
2985
2986 pthread_mutex_unlock (&aq->mutex);
2987 GCN_DEBUG ("waiting for thread %d:%d, done\n", aq->agent->device_id, aq->id);
2988 }
2989
2990 /* }}} */
2991 /* {{{ OpenACC support */
2992
2993 /* Execute an OpenACC kernel, synchronously or asynchronously. */
2994
2995 static void
2996 gcn_exec (struct kernel_info *kernel, size_t mapnum, void **hostaddrs,
2997 void **devaddrs, unsigned *dims, void *targ_mem_desc, bool async,
2998 struct goacc_asyncqueue *aq)
2999 {
3000 if (!GOMP_OFFLOAD_can_run (kernel))
3001 GOMP_PLUGIN_fatal ("OpenACC host fallback unimplemented.");
3002
3003 /* If we get here then this must be an OpenACC kernel. */
3004 kernel->kind = KIND_OPENACC;
3005
3006 /* The target expects devaddrs to be double-indirect, so build an on-device array of the pointers. */
3007 void **ind_da = alloc_by_agent (kernel->agent, sizeof (void*) * mapnum);
3008 for (size_t i = 0; i < mapnum; i++)
3009 hsa_fns.hsa_memory_copy_fn (&ind_da[i],
3010 devaddrs[i] ? &devaddrs[i] : &hostaddrs[i],
3011 sizeof (void *));
3012
3013 struct hsa_kernel_description *hsa_kernel_desc = NULL;
3014 for (unsigned i = 0; i < kernel->module->image_desc->kernel_count; i++)
3015 {
3016 struct hsa_kernel_description *d
3017 = &kernel->module->image_desc->kernel_infos[i];
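/* Pointer equality is intentional: kernel->name was set to point at the
   descriptor's own name string by init_basic_kernel_info.  */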
3018 if (d->name == kernel->name)
3019 {
3020 hsa_kernel_desc = d;
3021 break;
3022 }
3023 }
3024
3025 /* We may have statically-determined dimensions in
3026 hsa_kernel_desc->oacc_dims[] or dimensions passed to this offload kernel
3027 invocation at runtime in dims[]. We allow static dimensions to take
3028 priority over dynamic dimensions when present (non-zero). */
3029 if (hsa_kernel_desc->oacc_dims[0] > 0)
3030 dims[0] = hsa_kernel_desc->oacc_dims[0];
3031 if (hsa_kernel_desc->oacc_dims[1] > 0)
3032 dims[1] = hsa_kernel_desc->oacc_dims[1];
3033 if (hsa_kernel_desc->oacc_dims[2] > 0)
3034 dims[2] = hsa_kernel_desc->oacc_dims[2];
3035
3036 /* If any of the OpenACC dimensions remain 0 then we get to pick a number.
3037 There isn't really a correct answer for this without a clue about the
3038 problem size, so let's do a reasonable number of single-worker gangs.
3039 64 gangs matches a typical Fiji device. */
3040
3041 /* NOTE: Until support for middle-end worker partitioning is merged, use 1
3042 for the default number of workers. */
3043 if (dims[0] == 0) dims[0] = get_cu_count (kernel->agent); /* Gangs. */
3044 if (dims[1] == 0) dims[1] = 1; /* Workers. */
3045
3046 /* The incoming dimensions are expressed in terms of gangs, workers, and
3047 vectors. The HSA dimensions are expressed in terms of "work-items",
3048 which means multiples of vector lanes.
3049
3050 The "grid size" specifies the size of the problem space, and the
3051 "work-group size" specifies how much of that we want a single compute
3052 unit to chew on at once.
3053
3054 The three dimensions do not really correspond to hardware, but the
3055 important thing is that the HSA runtime will launch as many
3056 work-groups as it takes to process the entire grid, and each
3057 work-group will contain as many wave-fronts as it takes to process
3058 the work-items in that group.
3059
3060 Essentially, as long as we set the Y dimension to 64 (the number of
3061 vector lanes in hardware), and the Z group size to the maximum (16),
3062 then we will get the gangs (X) and workers (Z) launched as we expect.
3063
3064 The reason for the apparent reversal of vector and worker dimension
3065 order is to do with the way the run-time distributes work-items across
3066 v1 and v2. */
3067 struct GOMP_kernel_launch_attributes kla =
3068 {3,
3069 /* Grid size. */
3070 {dims[0], 64, dims[1]},
3071 /* Work-group size. */
3072 {1, 64, 16}
3073 };
3074
3075 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
3076 acc_prof_info *prof_info = thr->prof_info;
3077 acc_event_info enqueue_launch_event_info;
3078 acc_api_info *api_info = thr->api_info;
3079 bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
3080 if (profiling_dispatch_p)
3081 {
3082 prof_info->event_type = acc_ev_enqueue_launch_start;
3083
3084 enqueue_launch_event_info.launch_event.event_type
3085 = prof_info->event_type;
3086 enqueue_launch_event_info.launch_event.valid_bytes
3087 = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
3088 enqueue_launch_event_info.launch_event.parent_construct
3089 = acc_construct_parallel;
3090 enqueue_launch_event_info.launch_event.implicit = 1;
3091 enqueue_launch_event_info.launch_event.tool_info = NULL;
3092 enqueue_launch_event_info.launch_event.kernel_name
3093 = (char *) kernel->name;
3094 enqueue_launch_event_info.launch_event.num_gangs = kla.gdims[0];
3095 enqueue_launch_event_info.launch_event.num_workers = kla.gdims[2];
3096 enqueue_launch_event_info.launch_event.vector_length = kla.gdims[1];
3097
3098 api_info->device_api = acc_device_api_other;
3099
3100 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3101 &enqueue_launch_event_info, api_info);
3102 }
3103
3104 if (!async)
3105 {
3106 run_kernel (kernel, ind_da, &kla, NULL, false);
3107 gomp_offload_free (ind_da);
3108 }
3109 else
3110 {
3111 queue_push_launch (aq, kernel, ind_da, &kla);
3112 if (DEBUG_QUEUES)
3113 GCN_DEBUG ("queue_push_callback %d:%d gomp_offload_free, %p\n",
3114 aq->agent->device_id, aq->id, ind_da);
3115 queue_push_callback (aq, gomp_offload_free, ind_da);
3116 }
3117
3118 if (profiling_dispatch_p)
3119 {
3120 prof_info->event_type = acc_ev_enqueue_launch_end;
3121 enqueue_launch_event_info.launch_event.event_type = prof_info->event_type;
3122 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3123 &enqueue_launch_event_info,
3124 api_info);
3125 }
3126 }
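/* A worked example of the dimension mapping above: num_gangs(4)
   num_workers(2) yields grid {4, 64, 2} and work-group {1, 64, 16}, and the
   HSA runtime rounds each dimension up to whole work-groups.  The helper is
   illustrative only.  */
#if 0
static unsigned
example_workgroup_count (const unsigned grid[3], const unsigned group[3])
{
  unsigned n = 1;
  for (int i = 0; i < 3; i++)
    n *= (grid[i] + group[i] - 1) / group[i];
  return n;	/* Grid {4,64,2} with group {1,64,16} -> 4 work-groups.  */
}
#endif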
3127
3128 /* }}} */
3129 /* {{{ Generic Plugin API */
3130
3131 /* Return the name of the accelerator, which is "gcn". */
3132
3133 const char *
3134 GOMP_OFFLOAD_get_name (void)
3135 {
3136 return "gcn";
3137 }
3138
3139 /* Return the specific capabilities the HSA accelerator has. */
3140
3141 unsigned int
3142 GOMP_OFFLOAD_get_caps (void)
3143 {
3144 /* FIXME: Enable shared memory for APU, but not discrete GPU. */
3145 return /*GOMP_OFFLOAD_CAP_SHARED_MEM |*/ GOMP_OFFLOAD_CAP_OPENMP_400
3146 | GOMP_OFFLOAD_CAP_OPENACC_200;
3147 }
3148
3149 /* Identify as GCN accelerator. */
3150
3151 int
3152 GOMP_OFFLOAD_get_type (void)
3153 {
3154 return OFFLOAD_TARGET_TYPE_GCN;
3155 }
3156
3157 /* Return the libgomp version number we're compatible with. There is
3158 no requirement for cross-version compatibility. */
3159
3160 unsigned
3161 GOMP_OFFLOAD_version (void)
3162 {
3163 return GOMP_VERSION;
3164 }
3165
3166 /* Return the number of GCN devices on the system. */
3167
3168 int
3169 GOMP_OFFLOAD_get_num_devices (void)
3170 {
3171 if (!init_hsa_context ())
3172 return 0;
3173 return hsa_context.agent_count;
3174 }
3175
3176 /* Initialize device (agent) number N so that it can be used for computation.
3177 Return TRUE on success. */
3178
3179 bool
3180 GOMP_OFFLOAD_init_device (int n)
3181 {
3182 if (!init_hsa_context ())
3183 return false;
3184 if (n >= hsa_context.agent_count)
3185 {
3186 GOMP_PLUGIN_error ("Request to initialize non-existent GCN device %i", n);
3187 return false;
3188 }
3189 struct agent_info *agent = &hsa_context.agents[n];
3190
3191 if (agent->initialized)
3192 return true;
3193
3194 agent->device_id = n;
3195
3196 if (pthread_rwlock_init (&agent->module_rwlock, NULL))
3197 {
3198 GOMP_PLUGIN_error ("Failed to initialize a GCN agent rwlock");
3199 return false;
3200 }
3201 if (pthread_mutex_init (&agent->prog_mutex, NULL))
3202 {
3203 GOMP_PLUGIN_error ("Failed to initialize a GCN agent program mutex");
3204 return false;
3205 }
3206 if (pthread_mutex_init (&agent->async_queues_mutex, NULL))
3207 {
3208 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
3209 return false;
3210 }
3211 if (pthread_mutex_init (&agent->team_arena_write_lock, NULL))
3212 {
3213 GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
3214 return false;
3215 }
3216 agent->async_queues = NULL;
3217 agent->omp_async_queue = NULL;
3218 agent->team_arena_list = NULL;
3219
3220 uint32_t queue_size;
3221 hsa_status_t status;
3222 status = hsa_fns.hsa_agent_get_info_fn (agent->id,
3223 HSA_AGENT_INFO_QUEUE_MAX_SIZE,
3224 &queue_size);
3225 if (status != HSA_STATUS_SUCCESS)
3226 return hsa_error ("Error requesting maximum queue size of the GCN agent",
3227 status);
3228
3229 status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_NAME,
3230 &agent->name);
3231 if (status != HSA_STATUS_SUCCESS)
3232 return hsa_error ("Error querying the name of the agent", status);
3233
3234 agent->device_isa = isa_code (agent->name);
3235 if (agent->device_isa < 0)
3236 return hsa_error ("Unknown GCN agent architecture", HSA_STATUS_ERROR);
3237
3238 status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_VENDOR_NAME,
3239 &agent->vendor_name);
3240 if (status != HSA_STATUS_SUCCESS)
3241 return hsa_error ("Error querying the vendor name of the agent", status);
3242
3243 status = hsa_fns.hsa_queue_create_fn (agent->id, queue_size,
3244 HSA_QUEUE_TYPE_MULTI,
3245 hsa_queue_callback, NULL, UINT32_MAX,
3246 UINT32_MAX, &agent->sync_queue);
3247 if (status != HSA_STATUS_SUCCESS)
3248 return hsa_error ("Error creating command queue", status);
3249
3250 agent->kernarg_region.handle = (uint64_t) -1;
3251 status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
3252 get_kernarg_memory_region,
3253 &agent->kernarg_region);
3254 if (status != HSA_STATUS_SUCCESS
3255 && status != HSA_STATUS_INFO_BREAK)
3256 hsa_error ("Scanning memory regions failed", status);
3257 if (agent->kernarg_region.handle == (uint64_t) -1)
3258 {
3259 GOMP_PLUGIN_error ("Could not find suitable memory region for kernel "
3260 "arguments");
3261 return false;
3262 }
3263 GCN_DEBUG ("Selected kernel arguments memory region:\n");
3264 dump_hsa_region (agent->kernarg_region, NULL);
3265
3266 agent->data_region.handle = (uint64_t) -1;
3267 status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
3268 get_data_memory_region,
3269 &agent->data_region);
3270 if (status != HSA_STATUS_SUCCESS
3271 && status != HSA_STATUS_INFO_BREAK)
3272 hsa_error ("Scanning memory regions failed", status);
3273 if (agent->data_region.handle == (uint64_t) -1)
3274 {
3275 GOMP_PLUGIN_error ("Could not find suitable memory region for device "
3276 "data");
3277 return false;
3278 }
3279 GCN_DEBUG ("Selected device data memory region:\n");
3280 dump_hsa_region (agent->data_region, NULL);
3281
3282 GCN_DEBUG ("GCN agent %d initialized\n", n);
3283
3284 agent->initialized = true;
3285 return true;
3286 }
3287
3288 /* Load GCN object-code module described by struct gcn_image_desc in
3289 TARGET_DATA and return references to kernel descriptors in TARGET_TABLE.
3290 If there are any constructors then run them. */
3291
3292 int
3293 GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
3294 struct addr_pair **target_table)
3295 {
3296 if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
3297 {
3298 GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3299 " (expected %u, received %u)",
3300 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
3301 return -1;
3302 }
3303
3304 struct gcn_image_desc *image_desc = (struct gcn_image_desc *) target_data;
3305 struct agent_info *agent;
3306 struct addr_pair *pair;
3307 struct module_info *module;
3308 struct kernel_info *kernel;
3309 int kernel_count = image_desc->kernel_count;
3310 unsigned var_count = image_desc->global_variable_count;
3311
3312 agent = get_agent_info (ord);
3313 if (!agent)
3314 return -1;
3315
3316 if (pthread_rwlock_wrlock (&agent->module_rwlock))
3317 {
3318 GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3319 return -1;
3320 }
3321 if (agent->prog_finalized
3322 && !destroy_hsa_program (agent))
3323 return -1;
3324
3325 GCN_DEBUG ("Encountered %d kernels in an image\n", kernel_count);
3326 GCN_DEBUG ("Encountered %u global variables in an image\n", var_count);
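/* The "- 2" assumes the image always provides both the "_init_array" and
   "_fini_array" pseudo-kernels, which are deliberately kept out of the
   table built below.  */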
3327 pair = GOMP_PLUGIN_malloc ((kernel_count + var_count - 2)
3328 * sizeof (struct addr_pair));
3329 *target_table = pair;
3330 module = (struct module_info *)
3331 GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info)
3332 + kernel_count * sizeof (struct kernel_info));
3333 module->image_desc = image_desc;
3334 module->kernel_count = kernel_count;
3335 module->heap = NULL;
3336 module->constructors_run_p = false;
3337
3338 kernel = &module->kernels[0];
3339
3340 /* Allocate memory for kernel dependencies. */
3341 for (unsigned i = 0; i < kernel_count; i++)
3342 {
3343 struct hsa_kernel_description *d = &image_desc->kernel_infos[i];
3344 if (!init_basic_kernel_info (kernel, d, agent, module))
3345 return -1;
3346 if (strcmp (d->name, "_init_array") == 0)
3347 module->init_array_func = kernel;
3348 else if (strcmp (d->name, "_fini_array") == 0)
3349 module->fini_array_func = kernel;
3350 else
3351 {
3352 pair->start = (uintptr_t) kernel;
3353 pair->end = (uintptr_t) (kernel + 1);
3354 pair++;
3355 }
3356 kernel++;
3357 }
3358
3359 agent->module = module;
3360 if (pthread_rwlock_unlock (&agent->module_rwlock))
3361 {
3362 GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3363 return -1;
3364 }
3365
3366 if (!create_and_finalize_hsa_program (agent))
3367 return -1;
3368
3369 for (unsigned i = 0; i < var_count; i++)
3370 {
3371 struct global_var_info *v = &image_desc->global_variables[i];
3372 GCN_DEBUG ("Looking for variable %s\n", v->name);
3373
3374 hsa_status_t status;
3375 hsa_executable_symbol_t var_symbol;
3376 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3377 v->name, agent->id,
3378 0, &var_symbol);
3379
3380 if (status != HSA_STATUS_SUCCESS)
3381 hsa_fatal ("Could not find symbol for variable in the code object",
3382 status);
3383
3384 uint64_t var_addr;
3385 uint32_t var_size;
3386 status = hsa_fns.hsa_executable_symbol_get_info_fn
3387 (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &var_addr);
3388 if (status != HSA_STATUS_SUCCESS)
3389 hsa_fatal ("Could not extract a variable from its symbol", status);
3390 status = hsa_fns.hsa_executable_symbol_get_info_fn
3391 (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &var_size);
3392 if (status != HSA_STATUS_SUCCESS)
3393 hsa_fatal ("Could not extract a variable size from its symbol", status);
3394
3395 pair->start = var_addr;
3396 pair->end = var_addr + var_size;
3397 GCN_DEBUG ("Found variable %s at %p with size %u\n", v->name,
3398 (void *)var_addr, var_size);
3399 pair++;
3400 }
3401
3402 /* Run the constructors, if any, before the image is used. */
3403 struct GOMP_kernel_launch_attributes kla =
3404 { 3,
3405 /* Grid size. */
3406 { 1, 64, 1 },
3407 /* Work-group size. */
3408 { 1, 64, 1 }
3409 };
3410
3411 if (module->init_array_func)
3412 {
3413 init_kernel (module->init_array_func);
3414 run_kernel (module->init_array_func, NULL, &kla, NULL, false);
3415 }
3416 module->constructors_run_p = true;
3417
3418 /* Don't report kernels that libgomp need not know about. */
3419 if (module->init_array_func)
3420 kernel_count--;
3421 if (module->fini_array_func)
3422 kernel_count--;
3423
3424 return kernel_count + var_count;
3425 }
3426
3427 /* Unload GCN object-code module described by struct gcn_image_desc in
3428 TARGET_DATA from agent number N. Return TRUE on success. */
3429
3430 bool
3431 GOMP_OFFLOAD_unload_image (int n, unsigned version, const void *target_data)
3432 {
3433 if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
3434 {
3435 GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3436 " (expected %u, received %u)",
3437 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
3438 return false;
3439 }
3440
3441 struct agent_info *agent;
3442 agent = get_agent_info (n);
3443 if (!agent)
3444 return false;
3445
3446 if (pthread_rwlock_wrlock (&agent->module_rwlock))
3447 {
3448 GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3449 return false;
3450 }
3451
3452 if (!agent->module || agent->module->image_desc != target_data)
3453 {
3454 GOMP_PLUGIN_error ("Attempt to unload an image that has never been "
3455 "loaded before");
3456 return false;
3457 }
3458
3459 if (!destroy_module (agent->module, true))
3460 return false;
3461 free (agent->module);
3462 agent->module = NULL;
3463 if (!destroy_hsa_program (agent))
3464 return false;
3465 if (pthread_rwlock_unlock (&agent->module_rwlock))
3466 {
3467 GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3468 return false;
3469 }
3470 return true;
3471 }
3472
3473 /* Deinitialize all information and status associated with agent number N. We
3474 do not attempt any synchronization, assuming the user and libgomp will not
3475 attempt deinitialization of a device that is in any way being used at the
3476 same time. Return TRUE on success. */
3477
3478 bool
3479 GOMP_OFFLOAD_fini_device (int n)
3480 {
3481 struct agent_info *agent = get_agent_info (n);
3482 if (!agent)
3483 return false;
3484
3485 if (!agent->initialized)
3486 return true;
3487
3488 if (agent->omp_async_queue)
3489 {
3490 GOMP_OFFLOAD_openacc_async_destruct (agent->omp_async_queue);
3491 agent->omp_async_queue = NULL;
3492 }
3493
3494 if (agent->module)
3495 {
3496 if (!destroy_module (agent->module, false))
3497 return false;
3498 free (agent->module);
3499 agent->module = NULL;
3500 }
3501
3502 if (!destroy_team_arenas (agent))
3503 return false;
3504
3505 if (!destroy_hsa_program (agent))
3506 return false;
3507
3508 hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (agent->sync_queue);
3509 if (status != HSA_STATUS_SUCCESS)
3510 return hsa_error ("Error destroying command queue", status);
3511
3512 if (pthread_mutex_destroy (&agent->prog_mutex))
3513 {
3514 GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex");
3515 return false;
3516 }
3517 if (pthread_rwlock_destroy (&agent->module_rwlock))
3518 {
3519 GOMP_PLUGIN_error ("Failed to destroy a GCN agent rwlock");
3520 return false;
3521 }
3522
3523 if (pthread_mutex_destroy (&agent->async_queues_mutex))
3524 {
3525 GOMP_PLUGIN_error ("Failed to destroy a GCN agent queue mutex");
3526 return false;
3527 }
3528 if (pthread_mutex_destroy (&agent->team_arena_write_lock))
3529 {
3530 GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex");
3531 return false;
3532 }
3533 agent->initialized = false;
3534 return true;
3535 }
3536
3537 /* Return true if the HSA runtime can run function FN_PTR. */
3538
3539 bool
3540 GOMP_OFFLOAD_can_run (void *fn_ptr)
3541 {
3542 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3543
3544 init_kernel (kernel);
3545 if (kernel->initialization_failed)
3546 goto failure;
3547
3548 return true;
3549
3550 failure:
3551 if (suppress_host_fallback)
3552 GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
3553 GCN_WARNING ("GCN target cannot be launched, doing a host fallback\n");
3554 return false;
3555 }
3556
3557 /* Allocate memory on device N. */
3558
3559 void *
3560 GOMP_OFFLOAD_alloc (int n, size_t size)
3561 {
3562 struct agent_info *agent = get_agent_info (n);
3563 return alloc_by_agent (agent, size);
3564 }

/* Free memory from device N.  */

bool
GOMP_OFFLOAD_free (int device, void *ptr)
{
  GCN_DEBUG ("Freeing memory on device %d\n", device);

  hsa_status_t status = hsa_fns.hsa_memory_free_fn (ptr);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not free device memory", status);
      return false;
    }

  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  bool profiling_dispatch_p
    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
  if (profiling_dispatch_p)
    {
      acc_prof_info *prof_info = thr->prof_info;
      acc_event_info data_event_info;
      acc_api_info *api_info = thr->api_info;

      prof_info->event_type = acc_ev_free;

      data_event_info.data_event.event_type = prof_info->event_type;
      data_event_info.data_event.valid_bytes
	= _ACC_DATA_EVENT_INFO_VALID_BYTES;
      data_event_info.data_event.parent_construct
	= acc_construct_parallel;
      data_event_info.data_event.implicit = 1;
      data_event_info.data_event.tool_info = NULL;
      data_event_info.data_event.var_name = NULL;
      data_event_info.data_event.bytes = 0;
      data_event_info.data_event.host_ptr = NULL;
      data_event_info.data_event.device_ptr = (void *) ptr;

      api_info->device_api = acc_device_api_other;

      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
					    api_info);
    }

  return true;
}
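
/* For instance, a profiling tool that has registered a callback via the
   OpenACC profiling interface (a hedged sketch; "my_free_cb" is a
   hypothetical name) would observe the event dispatched above:

     static void
     my_free_cb (acc_prof_info *pi, acc_event_info *ei, acc_api_info *ai)
     {
       // ei->data_event.event_type == acc_ev_free
       // ei->data_event.device_ptr == the pointer being freed
     }

     acc_prof_register (acc_ev_free, my_free_cb, acc_reg);  */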

/* Copy data from DEVICE to host.  */

bool
GOMP_OFFLOAD_dev2host (int device, void *dst, const void *src, size_t n)
{
  GCN_DEBUG ("Copying %zu bytes from device %d (%p) to host (%p)\n", n, device,
	     src, dst);
  hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
  if (status != HSA_STATUS_SUCCESS)
    {
      GOMP_PLUGIN_error ("memory copy failed");
      return false;
    }
  return true;
}

/* Copy data from host to DEVICE.  */

bool
GOMP_OFFLOAD_host2dev (int device, void *dst, const void *src, size_t n)
{
  GCN_DEBUG ("Copying %zu bytes from host (%p) to device %d (%p)\n", n, src,
	     device, dst);
  hsa_memory_copy_wrapper (dst, src, n);
  return true;
}

/* Copy data within DEVICE.  Do the copy asynchronously, if appropriate.  */

bool
GOMP_OFFLOAD_dev2dev (int device, void *dst, const void *src, size_t n)
{
  struct gcn_thread *thread_data = gcn_thread ();

  if (thread_data && !async_synchronous_p (thread_data->async))
    {
      struct agent_info *agent = get_agent_info (device);
      maybe_init_omp_async (agent);
      queue_push_copy (agent->omp_async_queue, dst, src, n, false);
      return true;
    }

  GCN_DEBUG ("Copying %zu bytes from device %d (%p) to device %d (%p)\n", n,
	     device, src, device, dst);
  hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
  if (status != HSA_STATUS_SUCCESS)
    {
      GOMP_PLUGIN_error ("memory copy failed");
      return false;
    }
  return true;
}
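
/* To illustrate the asynchronous path above (a sketch; the exact async
   value comes from the current thread's OpenACC context): a construct
   such as

     #pragma acc parallel async(2)

   leaves gcn_thread ()->async at 2, for which async_synchronous_p
   returns false, so the copy is pushed onto the agent's asynchronous
   queue rather than performed inline.  */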

/* }}}  */
/* {{{ OpenMP Plugin API  */

/* Run a synchronous OpenMP kernel on DEVICE and pass it an array of pointers
   in VARS as a parameter.  The kernel is identified by FN_PTR which must point
   to a kernel_info structure, and must have previously been loaded to the
   specified device.  */

void
GOMP_OFFLOAD_run (int device, void *fn_ptr, void *vars, void **args)
{
  struct agent_info *agent = get_agent_info (device);
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
  struct GOMP_kernel_launch_attributes def;
  struct GOMP_kernel_launch_attributes *kla;
  assert (agent == kernel->agent);

  /* If we get here then the kernel must be OpenMP.  */
  kernel->kind = KIND_OPENMP;

  if (!parse_target_attributes (args, &def, &kla, agent))
    {
      GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
      return;
    }
  run_kernel (kernel, vars, kla, NULL, false);
}
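
/* The launch attributes in ARGS use libgomp's generic target-argument
   encoding from gomp-constants.h.  Roughly (a sketch with illustrative
   values, not the only possible encoding), a "target teams" region might
   arrive as:

     void *args[] =
       { (void *) (GOMP_TARGET_ARG_DEVICE_ALL | GOMP_TARGET_ARG_NUM_TEAMS
		   | (4 << GOMP_TARGET_ARG_VALUE_SHIFT)),
	 (void *) (GOMP_TARGET_ARG_DEVICE_ALL | GOMP_TARGET_ARG_THREAD_LIMIT
		   | (64 << GOMP_TARGET_ARG_VALUE_SHIFT)),
	 NULL };

   from which parse_target_attributes derives the grid and workgroup
   dimensions used for the dispatch.  */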

/* Run an asynchronous OpenMP kernel on DEVICE.  This is similar to
   GOMP_OFFLOAD_run except that the launch is queued and there is a call to
   GOMP_PLUGIN_target_task_completion when it has finished.  */

void
GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
			void **args, void *async_data)
{
  GCN_DEBUG ("GOMP_OFFLOAD_async_run invoked\n");
  struct agent_info *agent = get_agent_info (device);
  struct kernel_info *kernel = (struct kernel_info *) tgt_fn;
  struct GOMP_kernel_launch_attributes def;
  struct GOMP_kernel_launch_attributes *kla;
  assert (agent == kernel->agent);

  /* If we get here then the kernel must be OpenMP.  */
  kernel->kind = KIND_OPENMP;

  if (!parse_target_attributes (args, &def, &kla, agent))
    {
      GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
      return;
    }

  maybe_init_omp_async (agent);
  queue_push_launch (agent->omp_async_queue, kernel, tgt_vars, kla);
  queue_push_callback (agent->omp_async_queue,
		       GOMP_PLUGIN_target_task_completion, async_data);
}
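
/* Because entries on an async queue drain in FIFO order, pushing the
   completion callback immediately after the launch ensures that
   GOMP_PLUGIN_target_task_completion (async_data) only fires once the
   kernel has finished.  In effect the queue ends up holding:

     [0] kernel launch (tgt_fn, tgt_vars, kla)
     [1] callback: GOMP_PLUGIN_target_task_completion (async_data)

   and libgomp's task layer uses the callback to mark the "nowait"
   target task as done.  */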

/* }}}  */
/* {{{ OpenACC Plugin API  */

/* Run a synchronous OpenACC kernel.  The device number is inferred from the
   already-loaded KERNEL.  */

void
GOMP_OFFLOAD_openacc_exec (void (*fn_ptr) (void *), size_t mapnum,
			   void **hostaddrs, void **devaddrs, unsigned *dims,
			   void *targ_mem_desc)
{
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;

  gcn_exec (kernel, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, false,
	    NULL);
}

/* Run an asynchronous OpenACC kernel on the specified queue.  */

void
GOMP_OFFLOAD_openacc_async_exec (void (*fn_ptr) (void *), size_t mapnum,
				 void **hostaddrs, void **devaddrs,
				 unsigned *dims, void *targ_mem_desc,
				 struct goacc_asyncqueue *aq)
{
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;

  gcn_exec (kernel, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, true,
	    aq);
}

/* Create a new asynchronous thread and queue for running future kernels.
   Return NULL on failure.  */

struct goacc_asyncqueue *
GOMP_OFFLOAD_openacc_async_construct (int device)
{
  struct agent_info *agent = get_agent_info (device);

  pthread_mutex_lock (&agent->async_queues_mutex);

  struct goacc_asyncqueue *aq = GOMP_PLUGIN_malloc (sizeof (*aq));
  aq->agent = get_agent_info (device);
  aq->prev = NULL;
  aq->next = agent->async_queues;
  if (aq->next)
    {
      aq->next->prev = aq;
      aq->id = aq->next->id + 1;
    }
  else
    aq->id = 1;
  agent->async_queues = aq;

  aq->queue_first = 0;
  aq->queue_n = 0;
  aq->drain_queue_stop = 0;

  if (pthread_mutex_init (&aq->mutex, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
      pthread_mutex_unlock (&agent->async_queues_mutex);
      return NULL;
    }
  if (pthread_cond_init (&aq->queue_cond_in, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
      pthread_mutex_unlock (&agent->async_queues_mutex);
      return NULL;
    }
  if (pthread_cond_init (&aq->queue_cond_out, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
      pthread_mutex_unlock (&agent->async_queues_mutex);
      return NULL;
    }

  hsa_status_t status = hsa_fns.hsa_queue_create_fn (agent->id,
						     ASYNC_QUEUE_SIZE,
						     HSA_QUEUE_TYPE_MULTI,
						     hsa_queue_callback, NULL,
						     UINT32_MAX, UINT32_MAX,
						     &aq->hsa_queue);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Error creating command queue", status);

  int err = pthread_create (&aq->thread_drain_queue, NULL, &drain_queue, aq);
  if (err != 0)
    GOMP_PLUGIN_fatal ("GCN asynchronous thread creation failed: %s",
		       strerror (err));
  GCN_DEBUG ("Async thread %d:%d: created\n", aq->agent->device_id,
	     aq->id);

  pthread_mutex_unlock (&agent->async_queues_mutex);

  return aq;
}
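
/* A sketch of the queue lifecycle as driven from above (hypothetical
   caller-side pseudo-code; real callers go through libgomp's
   goacc_asyncqueue machinery rather than calling the plugin directly,
   and "my_cb"/"my_data" are hypothetical names):

     struct goacc_asyncqueue *aq = GOMP_OFFLOAD_openacc_async_construct (0);
     GOMP_OFFLOAD_openacc_async_host2dev (0, d_ptr, h_ptr, size, aq);
     GOMP_OFFLOAD_openacc_async_queue_callback (aq, my_cb, my_data);
     GOMP_OFFLOAD_openacc_async_synchronize (aq);
     GOMP_OFFLOAD_openacc_async_destruct (aq);

   The drain thread created above executes each queued entry in FIFO
   order.  */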

/* Destroy an existing asynchronous thread and queue.  Waits for any
   currently-running task to complete, but cancels any queued tasks.  */

bool
GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = aq->agent;

  finalize_async_thread (aq);

  pthread_mutex_lock (&agent->async_queues_mutex);

  int err;
  if ((err = pthread_mutex_destroy (&aq->mutex)))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue mutex: %d", err);
      goto fail;
    }
  if (pthread_cond_destroy (&aq->queue_cond_in))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
      goto fail;
    }
  if (pthread_cond_destroy (&aq->queue_cond_out))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
      goto fail;
    }
  hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (aq->hsa_queue);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Error destroying command queue", status);
      goto fail;
    }

  if (aq->prev)
    aq->prev->next = aq->next;
  if (aq->next)
    aq->next->prev = aq->prev;
  if (agent->async_queues == aq)
    agent->async_queues = aq->next;

  GCN_DEBUG ("Async thread %d:%d: destroyed\n", agent->device_id, aq->id);

  free (aq);
  pthread_mutex_unlock (&agent->async_queues_mutex);
  return true;

fail:
  pthread_mutex_unlock (&agent->async_queues_mutex);
  return false;
}

/* Return nonzero if the specified async queue is currently empty.  */

int
GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
{
  return queue_empty (aq);
}

/* Block until the specified queue has executed all its tasks and the
   queue is empty.  */

bool
GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
{
  wait_queue (aq);
  return true;
}

/* Add a serialization point across two async queues.  Any new tasks added to
   AQ2, after this call, will not run until all tasks on AQ1, at the time
   of this call, have completed.  */

bool
GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
				      struct goacc_asyncqueue *aq2)
{
  /* For serialize, stream aq2 waits for aq1 to complete work that has been
     scheduled to run on it up to this point.  */
  if (aq1 != aq2)
    {
      struct placeholder *placeholderp = queue_push_placeholder (aq1);
      queue_push_asyncwait (aq2, placeholderp);
    }
  return true;
}
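
/* For example (a sketch of the resulting ordering):

     AQ1: [kernel A] [placeholder P]     - P is signalled once A is done
     AQ2: [asyncwait on P] [kernel B]    - B is pushed after the serialize

   Kernel B cannot start before the asyncwait entry completes, and that
   entry completes only once AQ1 has drained through P, i.e. once A has
   finished.  Work pushed onto AQ1 *after* the serialize call is not
   ordered against B.  */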

/* Add an opaque callback to the given async queue.  */

void
GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
					   void (*fn) (void *), void *data)
{
  queue_push_callback (aq, fn, data);
}

/* Queue up an asynchronous data copy from host to DEVICE.  */

bool
GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
				     size_t n, struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = get_agent_info (device);
  assert (agent == aq->agent);
  /* The source data does not necessarily remain live until the deferred
     copy happens.  Taking a snapshot of the data here avoids reading
     uninitialised data later, but means that (a) data is copied twice and
     (b) modifications to the copied data between the "spawning" point of
     the asynchronous kernel and when it is executed will not be seen.
     But, that is probably correct.  */
  void *src_copy = GOMP_PLUGIN_malloc (n);
  memcpy (src_copy, src, n);
  queue_push_copy (aq, dst, src_copy, n, true);
  return true;
}
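
/* To see why the snapshot matters, consider this hypothetical caller
   (a sketch using the user-level OpenACC API; compute_value is a
   hypothetical helper):

     {
       int tmp = compute_value ();
       acc_memcpy_to_device_async (d_ptr, &tmp, sizeof tmp, 2);
     }   // tmp is dead here, but the queue may not have drained yet

   Without the copy made above, the deferred transfer could read a stale
   or recycled stack slot; with it, the queue owns a stable snapshot,
   and the final "true" argument tells the drain code to free that
   snapshot once the copy has been performed.  */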

/* Queue up an asynchronous data copy from DEVICE to host.  */

bool
GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src,
				     size_t n, struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = get_agent_info (device);
  assert (agent == aq->agent);
  queue_push_copy (aq, dst, src, n, false);
  return true;
}

/* Query a device-specific property for the OpenACC 'acc_get_property'
   interface.  Unsupported properties yield a zero value.  */

union goacc_property_value
GOMP_OFFLOAD_openacc_get_property (int device, enum goacc_property prop)
{
  struct agent_info *agent = get_agent_info (device);

  union goacc_property_value propval = { .val = 0 };

  switch (prop)
    {
    case GOACC_PROPERTY_FREE_MEMORY:
      /* Not supported.  */
      break;
    case GOACC_PROPERTY_MEMORY:
      {
	size_t size;
	hsa_region_t region = agent->data_region;
	hsa_status_t status
	  = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE,
					    &size);
	if (status == HSA_STATUS_SUCCESS)
	  propval.val = size;
	break;
      }
    case GOACC_PROPERTY_NAME:
      propval.ptr = agent->name;
      break;
    case GOACC_PROPERTY_VENDOR:
      propval.ptr = agent->vendor_name;
      break;
    case GOACC_PROPERTY_DRIVER:
      propval.ptr = hsa_context.driver_version_s;
      break;
    }

  return propval;
}
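
/* From user code these map onto the OpenACC 2.6 property API (a sketch,
   assuming acc_device_radeon names the GCN device type in openacc.h):

     size_t bytes = acc_get_property (0, acc_device_radeon,
				      acc_property_memory);
     const char *name
       = acc_get_property_string (0, acc_device_radeon, acc_property_name);

   acc_property_free_memory is accepted but, as noted above, currently
   yields 0 from this plugin.  */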

/* Set up plugin-specific thread-local data (host-side).  */

void *
GOMP_OFFLOAD_openacc_create_thread_data (int ord __attribute__((unused)))
{
  struct gcn_thread *thread_data
    = GOMP_PLUGIN_malloc (sizeof (struct gcn_thread));

  thread_data->async = GOMP_ASYNC_SYNC;

  return (void *) thread_data;
}

/* Clean up plugin-specific thread-local data.  */

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}

/* }}}  */