1 /* Plugin for AMD GCN execution.
2
3 Copyright (C) 2013-2023 Free Software Foundation, Inc.
4
5 Contributed by Mentor Embedded
6
7 This file is part of the GNU Offloading and Multi Processing Library
8 (libgomp).
9
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
13 any later version.
14
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
18 more details.
19
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
23
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
28
29 /* {{{ Includes and defines */
30
31 #include "config.h"
32 #include "symcat.h"
33 #include <stdio.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <pthread.h>
37 #include <inttypes.h>
38 #include <stdbool.h>
39 #include <limits.h>
40 #include <hsa.h>
41 #include <hsa_ext_amd.h>
42 #include <dlfcn.h>
43 #include <signal.h>
44 #include "libgomp-plugin.h"
45 #include "config/gcn/libgomp-gcn.h" /* For struct output. */
46 #include "gomp-constants.h"
47 #include <elf.h>
48 #include "oacc-plugin.h"
49 #include "oacc-int.h"
50 #include <assert.h>
51
52 /* These probably won't be in elf.h for a while. */
53 #ifndef R_AMDGPU_NONE
54 #define R_AMDGPU_NONE 0
55 #define R_AMDGPU_ABS32_LO 1 /* (S + A) & 0xFFFFFFFF */
56 #define R_AMDGPU_ABS32_HI 2 /* (S + A) >> 32 */
57 #define R_AMDGPU_ABS64 3 /* S + A */
58 #define R_AMDGPU_REL32 4 /* S + A - P */
59 #define R_AMDGPU_REL64 5 /* S + A - P */
60 #define R_AMDGPU_ABS32 6 /* S + A */
61 #define R_AMDGPU_GOTPCREL 7 /* G + GOT + A - P */
62 #define R_AMDGPU_GOTPCREL32_LO 8 /* (G + GOT + A - P) & 0xFFFFFFFF */
63 #define R_AMDGPU_GOTPCREL32_HI 9 /* (G + GOT + A - P) >> 32 */
64 #define R_AMDGPU_REL32_LO 10 /* (S + A - P) & 0xFFFFFFFF */
65 #define R_AMDGPU_REL32_HI 11 /* (S + A - P) >> 32 */
66 #define R_AMDGPU_RELATIVE64 13 /* B + A */
67 #endif
68
69 /* GCN specific definitions for asynchronous queues. */
70
71 #define ASYNC_QUEUE_SIZE 64
72 #define DRAIN_QUEUE_SYNCHRONOUS_P false
73 #define DEBUG_QUEUES 0
74 #define DEBUG_THREAD_SLEEP 0
75 #define DEBUG_THREAD_SIGNAL 0
76
77 /* Defaults. */
78 #define DEFAULT_GCN_HEAP_SIZE (100*1024*1024) /* 100MB. */
79
80 /* Secure getenv() which returns NULL if running as SUID/SGID. */
81 #ifndef HAVE_SECURE_GETENV
82 #ifdef HAVE___SECURE_GETENV
83 #define secure_getenv __secure_getenv
84 #elif defined (HAVE_UNISTD_H) && defined(HAVE_GETUID) && defined(HAVE_GETEUID) \
85 && defined(HAVE_GETGID) && defined(HAVE_GETEGID)
86
87 #include <unistd.h>
88
89 /* Implementation of secure_getenv() for targets where it is not provided but
90 we have at least the means to test real and effective IDs. */
91
92 static char *
93 secure_getenv (const char *name)
94 {
95 if ((getuid () == geteuid ()) && (getgid () == getegid ()))
96 return getenv (name);
97 else
98 return NULL;
99 }
100
101 #else
102 #define secure_getenv getenv
103 #endif
104 #endif
105
106 /* }}} */
107 /* {{{ Types */
108
109 /* GCN-specific implementation of the GOMP_PLUGIN_acc_thread data. */
110
111 struct gcn_thread
112 {
113 /* The thread number from the async clause, or GOMP_ASYNC_SYNC. */
114 int async;
115 };
116
117 /* As the HSA runtime is dlopened, the following structure defines the
118 function pointers utilized by the HSA plug-in. */
119
120 struct hsa_runtime_fn_info
121 {
122 /* HSA runtime. */
123 hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
124 const char **status_string);
125 hsa_status_t (*hsa_system_get_info_fn) (hsa_system_info_t attribute,
126 void *value);
127 hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
128 hsa_agent_info_t attribute,
129 void *value);
130 hsa_status_t (*hsa_isa_get_info_fn)(hsa_isa_t isa,
131 hsa_isa_info_t attribute,
132 uint32_t index,
133 void *value);
134 hsa_status_t (*hsa_init_fn) (void);
135 hsa_status_t (*hsa_iterate_agents_fn)
136 (hsa_status_t (*callback)(hsa_agent_t agent, void *data), void *data);
137 hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
138 hsa_region_info_t attribute,
139 void *value);
140 hsa_status_t (*hsa_queue_create_fn)
141 (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
142 void (*callback)(hsa_status_t status, hsa_queue_t *source, void *data),
143 void *data, uint32_t private_segment_size,
144 uint32_t group_segment_size, hsa_queue_t **queue);
145 hsa_status_t (*hsa_agent_iterate_regions_fn)
146 (hsa_agent_t agent,
147 hsa_status_t (*callback)(hsa_region_t region, void *data), void *data);
148 hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
149 hsa_status_t (*hsa_executable_create_fn)
150 (hsa_profile_t profile, hsa_executable_state_t executable_state,
151 const char *options, hsa_executable_t *executable);
152 hsa_status_t (*hsa_executable_global_variable_define_fn)
153 (hsa_executable_t executable, const char *variable_name, void *address);
154 hsa_status_t (*hsa_executable_load_code_object_fn)
155 (hsa_executable_t executable, hsa_agent_t agent,
156 hsa_code_object_t code_object, const char *options);
157 hsa_status_t (*hsa_executable_freeze_fn)(hsa_executable_t executable,
158 const char *options);
159 hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
160 uint32_t num_consumers,
161 const hsa_agent_t *consumers,
162 hsa_signal_t *signal);
163 hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
164 void **ptr);
165 hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent,
166 hsa_access_permission_t access);
167 hsa_status_t (*hsa_memory_copy_fn)(void *dst, const void *src, size_t size);
168 hsa_status_t (*hsa_memory_free_fn) (void *ptr);
169 hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
170 hsa_status_t (*hsa_executable_get_symbol_fn)
171 (hsa_executable_t executable, const char *module_name,
172 const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
173 hsa_executable_symbol_t *symbol);
174 hsa_status_t (*hsa_executable_symbol_get_info_fn)
175 (hsa_executable_symbol_t executable_symbol,
176 hsa_executable_symbol_info_t attribute, void *value);
177 hsa_status_t (*hsa_executable_iterate_symbols_fn)
178 (hsa_executable_t executable,
179 hsa_status_t (*callback)(hsa_executable_t executable,
180 hsa_executable_symbol_t symbol, void *data),
181 void *data);
182 uint64_t (*hsa_queue_add_write_index_release_fn) (const hsa_queue_t *queue,
183 uint64_t value);
184 uint64_t (*hsa_queue_load_read_index_acquire_fn) (const hsa_queue_t *queue);
185 void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
186 hsa_signal_value_t value);
187 void (*hsa_signal_store_release_fn) (hsa_signal_t signal,
188 hsa_signal_value_t value);
189 hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
190 (hsa_signal_t signal, hsa_signal_condition_t condition,
191 hsa_signal_value_t compare_value, uint64_t timeout_hint,
192 hsa_wait_state_t wait_state_hint);
193 hsa_signal_value_t (*hsa_signal_load_acquire_fn) (hsa_signal_t signal);
194 hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);
195
196 hsa_status_t (*hsa_code_object_deserialize_fn)
197 (void *serialized_code_object, size_t serialized_code_object_size,
198 const char *options, hsa_code_object_t *code_object);
199 };
200
201 /* Structure describing the run-time and grid properties of an HSA kernel
202 launch. This needs to match the format passed to GOMP_OFFLOAD_run. */
203
204 struct GOMP_kernel_launch_attributes
205 {
206 /* Number of dimensions the workload has. Maximum number is 3. */
207 uint32_t ndim;
208 /* Size of the grid in the three respective dimensions. */
209 uint32_t gdims[3];
210 /* Size of work-groups in the respective dimensions. */
211 uint32_t wdims[3];
212 };
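
/* Illustrative sketch, not part of the upstream file: how these attributes
   might be populated for a default OpenMP-style launch.  The example_*
   helper and its values are hypothetical; they mirror the defaults chosen
   by parse_target_attributes further below.  */

static void __attribute__ ((unused))
example_fill_launch_attributes (struct GOMP_kernel_launch_attributes *kla)
{
  kla->ndim = 3;
  kla->gdims[0] = 40;	/* Number of teams.  */
  kla->gdims[1] = 64;	/* Each thread is 64 work items wide.  */
  kla->gdims[2] = 16;	/* Threads per team.  */
  kla->wdims[0] = 1;	/* Single team per work-group.  */
  kla->wdims[1] = 64;
  kla->wdims[2] = 16;
}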
213
214 /* Collection of information needed for a dispatch of a kernel from a
215 kernel. */
216
217 struct kernel_dispatch
218 {
219 struct agent_info *agent;
220 /* Pointer to a command queue associated with a kernel dispatch agent. */
221 void *queue;
222 /* Pointer to a memory space used for kernel arguments passing. */
223 void *kernarg_address;
224 /* Kernel object. */
225 uint64_t object;
226 /* Synchronization signal used for dispatch synchronization. */
227 uint64_t signal;
228 /* Private segment size. */
229 uint32_t private_segment_size;
230 /* Group segment size. */
231 uint32_t group_segment_size;
232 };
233
234 /* Structure of the kernargs segment, supporting console output.
235
236 This needs to match the definitions in Newlib, and the expectations
237 in libgomp target code. */
238
239 struct kernargs {
240 /* Leave space for the real kernel arguments.
241 OpenACC and OpenMP only use one pointer. */
242 int64_t dummy1;
243 int64_t dummy2;
244
245 /* A pointer to struct output, below, for console output data. */
246 int64_t out_ptr;
247
248 /* A pointer to struct heap, below. */
249 int64_t heap_ptr;
250
251 /* A pointer to an ephemeral memory arena.
252 Only needed for OpenMP. */
253 int64_t arena_ptr;
254
255 /* Output data. */
256 struct output output_data;
257 };
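
/* Illustrative sketch, not part of the upstream file: minimal host-side
   initialization of a kernargs block, mirroring what create_kernel_dispatch
   and the launch code do later in this file.  The example_* helper is
   hypothetical; VARS is the single argument pointer the kernels expect.  */

static void __attribute__ ((unused))
example_init_kernargs (struct kernargs *kernargs, void *vars, void *heap)
{
  kernargs->dummy1 = (int64_t) vars;	/* The one real kernel argument.  */
  kernargs->out_ptr = (int64_t) &kernargs->output_data;
  kernargs->heap_ptr = (int64_t) heap;	/* For device-side malloc.  */
  kernargs->arena_ptr = 0;		/* No team arena (OpenACC case).  */
  kernargs->output_data.next_output = 0;
  kernargs->output_data.consumed = 0;
}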
258
259 /* A queue entry for a future asynchronous launch. */
260
261 struct kernel_launch
262 {
263 struct kernel_info *kernel;
264 void *vars;
265 struct GOMP_kernel_launch_attributes kla;
266 };
267
268 /* A queue entry for a future callback. */
269
270 struct callback
271 {
272 void (*fn)(void *);
273 void *data;
274 };
275
276 /* A data struct for the copy_data callback. */
277
278 struct copy_data
279 {
280 void *dst;
281 const void *src;
282 size_t len;
283 struct goacc_asyncqueue *aq;
284 };
285
286 /* A queue entry for a placeholder. These correspond to wait events. */
287
288 struct placeholder
289 {
290 int executed;
291 pthread_cond_t cond;
292 pthread_mutex_t mutex;
293 };
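
/* Illustrative sketch, not part of the upstream file: how a placeholder is
   consumed.  A waiting thread blocks on the condition variable until the
   queue-drain thread marks the entry executed; the real implementation
   lives in the async-queue code later in this file.  The example_* helper
   is hypothetical.  */

static void __attribute__ ((unused))
example_wait_for_placeholder (struct placeholder *placeholderp)
{
  pthread_mutex_lock (&placeholderp->mutex);
  while (!placeholderp->executed)
    pthread_cond_wait (&placeholderp->cond, &placeholderp->mutex);
  pthread_mutex_unlock (&placeholderp->mutex);
}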
294
295 /* A queue entry for a wait directive. */
296
297 struct asyncwait_info
298 {
299 struct placeholder *placeholderp;
300 };
301
302 /* Encode the type of an entry in an async queue. */
303
304 enum entry_type
305 {
306 KERNEL_LAUNCH,
307 CALLBACK,
308 ASYNC_WAIT,
309 ASYNC_PLACEHOLDER
310 };
311
312 /* An entry in an async queue. */
313
314 struct queue_entry
315 {
316 enum entry_type type;
317 union {
318 struct kernel_launch launch;
319 struct callback callback;
320 struct asyncwait_info asyncwait;
321 struct placeholder placeholder;
322 } u;
323 };
324
325 /* An async queue header.
326
327 OpenMP may create one of these.
328 OpenACC may create many. */
329
330 struct goacc_asyncqueue
331 {
332 struct agent_info *agent;
333 hsa_queue_t *hsa_queue;
334
335 pthread_t thread_drain_queue;
336 pthread_mutex_t mutex;
337 pthread_cond_t queue_cond_in;
338 pthread_cond_t queue_cond_out;
339 struct queue_entry queue[ASYNC_QUEUE_SIZE];
340 int queue_first;
341 int queue_n;
342 int drain_queue_stop;
343
344 int id;
345 struct goacc_asyncqueue *prev;
346 struct goacc_asyncqueue *next;
347 };
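
/* Illustrative sketch, not part of the upstream file: the producer side of
   the ring buffer above.  An entry is written at the tail under the mutex,
   the drain thread is woken via queue_cond_in, and producers sleep on
   queue_cond_out while the ring is full.  The example_* helper is
   hypothetical; the real push functions appear later in this file.  */

static void __attribute__ ((unused))
example_queue_push (struct goacc_asyncqueue *aq, struct queue_entry *entry)
{
  pthread_mutex_lock (&aq->mutex);
  while (aq->queue_n == ASYNC_QUEUE_SIZE)    /* Ring full; wait for space.  */
    pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
  aq->queue[(aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE] = *entry;
  aq->queue_n++;
  pthread_cond_signal (&aq->queue_cond_in);  /* Wake the drain thread.  */
  pthread_mutex_unlock (&aq->mutex);
}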
348
349 /* Mkoffload uses this structure to describe a kernel.
350
351 OpenMP kernel dimensions are passed at runtime.
352 OpenACC kernel dimensions are passed at compile time, here. */
353
354 struct hsa_kernel_description
355 {
356 const char *name;
357 int oacc_dims[3]; /* Only present for GCN kernels. */
358 int sgpr_count;
359 int vpgr_count;
360 };
361
362 /* Mkoffload uses this structure to describe an offload variable. */
363
364 struct global_var_info
365 {
366 const char *name;
367 void *address;
368 };
369
370 /* Mkoffload uses this structure to describe all the kernels in a
371 loadable module. These are passed to libgomp via static constructors. */
372
373 struct gcn_image_desc
374 {
375 struct gcn_image {
376 size_t size;
377 void *image;
378 } *gcn_image;
379 const unsigned kernel_count;
380 struct hsa_kernel_description *kernel_infos;
381 const unsigned global_variable_count;
382 };
383
384 /* This enum mirrors the corresponding LLVM enum's values for all ISAs that we
385 support.
386 See https://llvm.org/docs/AMDGPUUsage.html#amdgpu-ef-amdgpu-mach-table */
387
388 typedef enum {
389 EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a,
390 EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c,
391 EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f,
392 EF_AMDGPU_MACH_AMDGCN_GFX908 = 0x030,
393 EF_AMDGPU_MACH_AMDGCN_GFX90a = 0x03f
394 } EF_AMDGPU_MACH;
395
396 const static int EF_AMDGPU_MACH_MASK = 0x000000ff;
397 typedef EF_AMDGPU_MACH gcn_isa;
398
399 /* Description of an HSA GPU agent (device) and the program associated with
400 it. */
401
402 struct agent_info
403 {
404 /* The HSA ID of the agent. Assigned when hsa_context is initialized. */
405 hsa_agent_t id;
406 /* The user-visible device number. */
407 int device_id;
408 /* Whether the agent has been initialized. The fields below are usable only
409 if it has been. */
410 bool initialized;
411
412 /* The instruction set architecture of the device. */
413 gcn_isa device_isa;
414 /* Name of the agent. */
415 char name[64];
416 /* Name of the vendor of the agent. */
417 char vendor_name[64];
418 /* Command queues of the agent. */
419 hsa_queue_t *sync_queue;
420 struct goacc_asyncqueue *async_queues, *omp_async_queue;
421 pthread_mutex_t async_queues_mutex;
422
423 /* The HSA memory region from which to allocate kernel arguments. */
424 hsa_region_t kernarg_region;
425
426 /* The HSA memory region from which to allocate device data. */
427 hsa_region_t data_region;
428
429 /* Allocated team arenas. */
430 struct team_arena_list *team_arena_list;
431 pthread_mutex_t team_arena_write_lock;
432
433 /* Read-write lock that protects kernels which are running or about to be run
434 from interference with loading and unloading of images. Needs to be
435 locked for reading while a kernel is being run, and for writing if the
436 list of modules is manipulated (and thus the HSA program invalidated). */
437 pthread_rwlock_t module_rwlock;
438
439 /* The module associated with this agent. */
440 struct module_info *module;
441
442 /* Mutex enforcing that only one thread will finalize the HSA program. A
443 thread should have locked agent->module_rwlock for reading before
444 acquiring it. */
445 pthread_mutex_t prog_mutex;
446 /* Flag whether the HSA program that consists of all the modules has been
447 finalized. */
448 bool prog_finalized;
449 /* HSA executable - the finalized program that is used to locate kernels. */
450 hsa_executable_t executable;
451 };
452
453 /* Information required to identify, finalize and run any given kernel. */
454
455 enum offload_kind {KIND_UNKNOWN, KIND_OPENMP, KIND_OPENACC};
456
457 struct kernel_info
458 {
459 /* Name of the kernel, required to locate it within the GCN object-code
460 module. */
461 const char *name;
462 /* The specific agent the kernel has been or will be finalized for and run
463 on. */
464 struct agent_info *agent;
465 /* The specific module in which the kernel is located. */
466 struct module_info *module;
467 /* Information provided by mkoffload associated with the kernel. */
468 struct hsa_kernel_description *description;
469 /* Mutex enforcing that at most one thread ever initializes a kernel for
470 use. A thread should have locked agent->module_rwlock for reading before
471 acquiring it. */
472 pthread_mutex_t init_mutex;
473 /* Flag indicating whether the kernel has been initialized and all fields
474 below it contain valid data. */
475 bool initialized;
476 /* Flag indicating that the kernel has a problem that blocks its execution. */
477 bool initialization_failed;
478 /* The object to be put into the dispatch queue. */
479 uint64_t object;
480 /* Required size of kernel arguments. */
481 uint32_t kernarg_segment_size;
482 /* Required size of group segment. */
483 uint32_t group_segment_size;
484 /* Required size of private segment. */
485 uint32_t private_segment_size;
486 /* Set up for OpenMP or OpenACC? */
487 enum offload_kind kind;
488 };
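
/* Illustrative sketch, not part of the upstream file: the locking
   discipline described by the comments above.  A thread takes the agent's
   module_rwlock for reading before touching a kernel, then the kernel's
   init_mutex so that one-time initialization happens exactly once.  The
   example_* helper is hypothetical and simplified: the real code keeps the
   read lock held for the whole duration of a kernel run.  */

static bool __attribute__ ((unused))
example_check_kernel_ready (struct kernel_info *kernel)
{
  bool ok = false;
  if (pthread_rwlock_rdlock (&kernel->agent->module_rwlock))
    return false;
  if (pthread_mutex_lock (&kernel->init_mutex) == 0)
    {
      /* The real code performs the one-time initialization here.  */
      ok = kernel->initialized && !kernel->initialization_failed;
      pthread_mutex_unlock (&kernel->init_mutex);
    }
  pthread_rwlock_unlock (&kernel->agent->module_rwlock);
  return ok;
}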
489
490 /* Information about a particular GCN module, its image and kernels. */
491
492 struct module_info
493 {
494 /* The description with which the program has registered the image. */
495 struct gcn_image_desc *image_desc;
496 /* GCN heap allocation. */
497 struct heap *heap;
498 /* Physical boundaries of the loaded module. */
499 Elf64_Addr phys_address_start;
500 Elf64_Addr phys_address_end;
501
502 bool constructors_run_p;
503 struct kernel_info *init_array_func, *fini_array_func;
504
505 /* Number of kernels in this module. */
506 int kernel_count;
507 /* An array of kernel_info structures describing each kernel in this
508 module. */
509 struct kernel_info kernels[];
510 };
511
512 /* A linked list of memory arenas allocated on the device.
513 These are only used by OpenMP, as a means to optimize per-team malloc. */
514
515 struct team_arena_list
516 {
517 struct team_arena_list *next;
518
519 /* The number of teams determines the size of the allocation. */
520 int num_teams;
521 /* The device address of the arena itself. */
522 void *arena;
523 /* A flag to prevent two asynchronous kernels trying to use the same arena.
524 The mutex is locked until the kernel exits. */
525 pthread_mutex_t in_use;
526 };
527
528 /* Information about the whole HSA environment and all of its agents. */
529
530 struct hsa_context_info
531 {
532 /* Whether the structure has been initialized. */
533 bool initialized;
534 /* Number of usable GPU HSA agents in the system. */
535 int agent_count;
536 /* Array of agent_info structures describing the individual HSA agents. */
537 struct agent_info *agents;
538 /* Driver version string. */
539 char driver_version_s[30];
540 };
541
542 /* Format of the on-device heap.
543
544 This must match the definition in Newlib and gcn-run. */
545
546 struct heap {
547 int64_t size;
548 char data[0];
549 };
550
551 /* }}} */
552 /* {{{ Global variables */
553
554 /* Information about the whole HSA environment and all of its agents. */
555
556 static struct hsa_context_info hsa_context;
557
558 /* HSA runtime functions that are initialized in init_hsa_context. */
559
560 static struct hsa_runtime_fn_info hsa_fns;
561
562 /* Heap space, allocated target-side, provided for use of newlib malloc.
563 Each module should have its own heap allocated.
564 Beware that heap usage increases with OpenMP teams. See also arenas. */
565
566 static size_t gcn_kernel_heap_size = DEFAULT_GCN_HEAP_SIZE;
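
/* Illustrative sketch, not part of the upstream file: creating an
   on-device heap in the format of struct heap above.  The allocation must
   include the size header, and because the memory is device-resident the
   size field is written via hsa_memory_copy.  The example_* helper is
   hypothetical; the real heap setup happens during image loading.  */

static __attribute__ ((unused)) struct heap *
example_alloc_device_heap (struct agent_info *agent, int64_t heap_size)
{
  struct heap *heap = NULL;
  hsa_status_t status
    = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
				      sizeof (struct heap) + heap_size,
				      (void **) &heap);
  if (status != HSA_STATUS_SUCCESS)
    return NULL;
  status = hsa_fns.hsa_memory_assign_agent_fn (heap, agent->id,
					       HSA_ACCESS_PERMISSION_RW);
  if (status != HSA_STATUS_SUCCESS)
    return NULL;
  hsa_fns.hsa_memory_copy_fn (&heap->size, &heap_size, sizeof (heap_size));
  return heap;
}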
567
568 /* Flag to decide whether to print information about what is going on to
569 stderr. Set in init_environment_variables depending on the environment. */
570
571 static bool debug;
572
573 /* Flag to decide if the runtime should suppress a possible fallback to host
574 execution. */
575
576 static bool suppress_host_fallback;
577
578 /* Name of the HSA runtime shared library that is dlopened
579 by this plug-in. */
580
581 static const char *hsa_runtime_lib;
582
583 /* Flag to decide if the runtime should also support CPU devices (which can
584 be simulators). */
585
586 static bool support_cpu_devices;
587
588 /* Runtime dimension overrides. Zero indicates default. */
589
590 static int override_x_dim = 0;
591 static int override_z_dim = 0;
592
593 /* }}} */
594 /* {{{ Debug & Diagnostic */
595
596 /* Print a message to stderr if GCN_DEBUG value is set to true. */
597
598 #define DEBUG_PRINT(...) \
599 do \
600 { \
601 if (debug) \
602 { \
603 fprintf (stderr, __VA_ARGS__); \
604 } \
605 } \
606 while (false)
607
608 /* Flush stderr if GCN_DEBUG value is set to true. */
609
610 #define DEBUG_FLUSH() \
611 do { \
612 if (debug) \
613 fflush (stderr); \
614 } while (false)
615
616 /* Print a logging message with PREFIX to stderr if GCN_DEBUG value
617 is set to true. */
618
619 #define DEBUG_LOG(prefix, ...) \
620 do \
621 { \
622 DEBUG_PRINT (prefix); \
623 DEBUG_PRINT (__VA_ARGS__); \
624 DEBUG_FLUSH (); \
625 } while (false)
626
627 /* Print a debugging message to stderr. */
628
629 #define GCN_DEBUG(...) DEBUG_LOG ("GCN debug: ", __VA_ARGS__)
630
631 /* Print a warning message to stderr. */
632
633 #define GCN_WARNING(...) DEBUG_LOG ("GCN warning: ", __VA_ARGS__)
634
635 /* Print HSA warning STR with an HSA STATUS code. */
636
637 static void
638 hsa_warn (const char *str, hsa_status_t status)
639 {
640 if (!debug)
641 return;
642
643 const char *hsa_error_msg = "[unknown]";
644 hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
645
646 fprintf (stderr, "GCN warning: %s\nRuntime message: %s\n", str,
647 hsa_error_msg);
648 }
649
650 /* Report a fatal error STR together with the HSA error corresponding to STATUS
651 and terminate execution of the current process. */
652
653 static void
654 hsa_fatal (const char *str, hsa_status_t status)
655 {
656 const char *hsa_error_msg = "[unknown]";
657 hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
658 GOMP_PLUGIN_fatal ("GCN fatal error: %s\nRuntime message: %s\n", str,
659 hsa_error_msg);
660 }
661
662 /* Like hsa_fatal, except it only reports the error message and returns FALSE,
663 so that error processing can be propagated to outside of the plugin. */
664
665 static bool
666 hsa_error (const char *str, hsa_status_t status)
667 {
668 const char *hsa_error_msg = "[unknown]";
669 hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
670 GOMP_PLUGIN_error ("GCN fatal error: %s\nRuntime message: %s\n", str,
671 hsa_error_msg);
672 return false;
673 }
674
675 /* Dump information about the available hardware. */
676
677 static void
678 dump_hsa_system_info (void)
679 {
680 hsa_status_t status;
681
682 hsa_endianness_t endianness;
683 status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_ENDIANNESS,
684 &endianness);
685 if (status == HSA_STATUS_SUCCESS)
686 switch (endianness)
687 {
688 case HSA_ENDIANNESS_LITTLE:
689 GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: LITTLE\n");
690 break;
691 case HSA_ENDIANNESS_BIG:
692 GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: BIG\n");
693 break;
694 default:
695 GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: UNKNOWN\n");
696 }
697 else
698 GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: FAILED\n");
699
700 uint8_t extensions[128];
701 status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_EXTENSIONS,
702 &extensions);
703 if (status == HSA_STATUS_SUCCESS)
704 {
705 if (extensions[0] & (1 << HSA_EXTENSION_IMAGES))
706 GCN_DEBUG ("HSA_SYSTEM_INFO_EXTENSIONS: IMAGES\n");
707 }
708 else
709 GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n");
710 }
711
712 /* Dump information about the available hardware. */
713
714 static void
715 dump_machine_model (hsa_machine_model_t machine_model, const char *s)
716 {
717 switch (machine_model)
718 {
719 case HSA_MACHINE_MODEL_SMALL:
720 GCN_DEBUG ("%s: SMALL\n", s);
721 break;
722 case HSA_MACHINE_MODEL_LARGE:
723 GCN_DEBUG ("%s: LARGE\n", s);
724 break;
725 default:
726 GCN_WARNING ("%s: UNKNOWN\n", s);
727 break;
728 }
729 }
730
731 /* Dump information about the available hardware. */
732
733 static void
734 dump_profile (hsa_profile_t profile, const char *s)
735 {
736 switch (profile)
737 {
738 case HSA_PROFILE_FULL:
739 GCN_DEBUG ("%s: FULL\n", s);
740 break;
741 case HSA_PROFILE_BASE:
742 GCN_DEBUG ("%s: BASE\n", s);
743 break;
744 default:
745 GCN_WARNING ("%s: UNKNOWN\n", s);
746 break;
747 }
748 }
749
750 /* Dump information about a device memory region. */
751
752 static hsa_status_t
753 dump_hsa_region (hsa_region_t region, void *data __attribute__((unused)))
754 {
755 hsa_status_t status;
756
757 hsa_region_segment_t segment;
758 status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
759 &segment);
760 if (status == HSA_STATUS_SUCCESS)
761 {
762 if (segment == HSA_REGION_SEGMENT_GLOBAL)
763 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GLOBAL\n");
764 else if (segment == HSA_REGION_SEGMENT_READONLY)
765 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: READONLY\n");
766 else if (segment == HSA_REGION_SEGMENT_PRIVATE)
767 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: PRIVATE\n");
768 else if (segment == HSA_REGION_SEGMENT_GROUP)
769 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GROUP\n");
770 else
771 GCN_WARNING ("HSA_REGION_INFO_SEGMENT: UNKNOWN\n");
772 }
773 else
774 GCN_WARNING ("HSA_REGION_INFO_SEGMENT: FAILED\n");
775
776 if (status == HSA_STATUS_SUCCESS && segment == HSA_REGION_SEGMENT_GLOBAL)
777 {
778 uint32_t flags;
779 status
780 = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
781 &flags);
782 if (status == HSA_STATUS_SUCCESS)
783 {
784 if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
785 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: KERNARG\n");
786 if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED)
787 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: FINE_GRAINED\n");
788 if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED)
789 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: COARSE_GRAINED\n");
790 }
791 else
792 GCN_WARNING ("HSA_REGION_INFO_GLOBAL_FLAGS: FAILED\n");
793 }
794
795 size_t size;
796 status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
797 if (status == HSA_STATUS_SUCCESS)
798 GCN_DEBUG ("HSA_REGION_INFO_SIZE: %zu\n", size);
799 else
800 GCN_WARNING ("HSA_REGION_INFO_SIZE: FAILED\n");
801
802 status
803 = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_ALLOC_MAX_SIZE,
804 &size);
805 if (status == HSA_STATUS_SUCCESS)
806 GCN_DEBUG ("HSA_REGION_INFO_ALLOC_MAX_SIZE: %zu\n", size);
807 else
808 GCN_WARNING ("HSA_REGION_INFO_ALLOC_MAX_SIZE: FAILED\n");
809
810 bool alloc_allowed;
811 status
812 = hsa_fns.hsa_region_get_info_fn (region,
813 HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED,
814 &alloc_allowed);
815 if (status == HSA_STATUS_SUCCESS)
816 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: %u\n", alloc_allowed);
817 else
818 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: FAILED\n");
819
820 if (status != HSA_STATUS_SUCCESS || !alloc_allowed)
821 return HSA_STATUS_SUCCESS;
822
823 status
824 = hsa_fns.hsa_region_get_info_fn (region,
825 HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE,
826 &size);
827 if (status == HSA_STATUS_SUCCESS)
828 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: %zu\n", size);
829 else
830 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: FAILED\n");
831
832 size_t align;
833 status
834 = hsa_fns.hsa_region_get_info_fn (region,
835 HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT,
836 &align);
837 if (status == HSA_STATUS_SUCCESS)
838 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: %zu\n", align);
839 else
840 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: FAILED\n");
841
842 return HSA_STATUS_SUCCESS;
843 }
844
845 /* Dump information about all the device memory regions. */
846
847 static void
848 dump_hsa_regions (hsa_agent_t agent)
849 {
850 hsa_status_t status;
851 status = hsa_fns.hsa_agent_iterate_regions_fn (agent,
852 dump_hsa_region,
853 NULL);
854 if (status != HSA_STATUS_SUCCESS)
855 hsa_error ("Dumping hsa regions failed", status);
856 }
857
858 /* Dump information about the available devices. */
859
860 static hsa_status_t
861 dump_hsa_agent_info (hsa_agent_t agent, void *data __attribute__((unused)))
862 {
863 hsa_status_t status;
864
865 char buf[64];
866 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME,
867 &buf);
868 if (status == HSA_STATUS_SUCCESS)
869 GCN_DEBUG ("HSA_AGENT_INFO_NAME: %s\n", buf);
870 else
871 GCN_WARNING ("HSA_AGENT_INFO_NAME: FAILED\n");
872
873 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_VENDOR_NAME,
874 &buf);
875 if (status == HSA_STATUS_SUCCESS)
876 GCN_DEBUG ("HSA_AGENT_INFO_VENDOR_NAME: %s\n", buf);
877 else
878 GCN_WARNING ("HSA_AGENT_INFO_VENDOR_NAME: FAILED\n");
879
880 hsa_machine_model_t machine_model;
881 status
882 = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_MACHINE_MODEL,
883 &machine_model);
884 if (status == HSA_STATUS_SUCCESS)
885 dump_machine_model (machine_model, "HSA_AGENT_INFO_MACHINE_MODEL");
886 else
887 GCN_WARNING ("HSA_AGENT_INFO_MACHINE_MODEL: FAILED\n");
888
889 hsa_profile_t profile;
890 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_PROFILE,
891 &profile);
892 if (status == HSA_STATUS_SUCCESS)
893 dump_profile (profile, "HSA_AGENT_INFO_PROFILE");
894 else
895 GCN_WARNING ("HSA_AGENT_INFO_PROFILE: FAILED\n");
896
897 hsa_device_type_t device_type;
898 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
899 &device_type);
900 if (status == HSA_STATUS_SUCCESS)
901 {
902 switch (device_type)
903 {
904 case HSA_DEVICE_TYPE_CPU:
905 GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: CPU\n");
906 break;
907 case HSA_DEVICE_TYPE_GPU:
908 GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: GPU\n");
909 break;
910 case HSA_DEVICE_TYPE_DSP:
911 GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: DSP\n");
912 break;
913 default:
914 GCN_WARNING ("HSA_AGENT_INFO_DEVICE: UNKNOWN\n");
915 break;
916 }
917 }
918 else
919 GCN_WARNING ("HSA_AGENT_INFO_DEVICE: FAILED\n");
920
921 uint32_t cu_count;
922 status = hsa_fns.hsa_agent_get_info_fn
923 (agent, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
924 if (status == HSA_STATUS_SUCCESS)
925 GCN_DEBUG ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: %u\n", cu_count);
926 else
927 GCN_WARNING ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: FAILED\n");
928
929 uint32_t size;
930 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_WAVEFRONT_SIZE,
931 &size);
932 if (status == HSA_STATUS_SUCCESS)
933 GCN_DEBUG ("HSA_AGENT_INFO_WAVEFRONT_SIZE: %u\n", size);
934 else
935 GCN_WARNING ("HSA_AGENT_INFO_WAVEFRONT_SIZE: FAILED\n");
936
937 uint32_t max_dim;
938 status = hsa_fns.hsa_agent_get_info_fn (agent,
939 HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
940 &max_dim);
941 if (status == HSA_STATUS_SUCCESS)
942 GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: %u\n", max_dim);
943 else
944 GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: FAILED\n");
945
946 uint32_t max_size;
947 status = hsa_fns.hsa_agent_get_info_fn (agent,
948 HSA_AGENT_INFO_WORKGROUP_MAX_SIZE,
949 &max_size);
950 if (status == HSA_STATUS_SUCCESS)
951 GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: %u\n", max_size);
952 else
953 GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: FAILED\n");
954
955 uint32_t grid_max_dim;
956 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_DIM,
957 &grid_max_dim);
958 if (status == HSA_STATUS_SUCCESS)
959 GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_DIM: %u\n", grid_max_dim);
960 else
961 GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_DIM: FAILED\n");
962
963 uint32_t grid_max_size;
964 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_SIZE,
965 &grid_max_size);
966 if (status == HSA_STATUS_SUCCESS)
967 GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_SIZE: %u\n", grid_max_size);
968 else
969 GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_SIZE: FAILED\n");
970
971 dump_hsa_regions (agent);
972
973 return HSA_STATUS_SUCCESS;
974 }
975
976 /* Forward reference. */
977
978 static char *get_executable_symbol_name (hsa_executable_symbol_t symbol);
979
980 /* Helper function for dump_executable_symbols. */
981
982 static hsa_status_t
983 dump_executable_symbol (hsa_executable_t executable,
984 hsa_executable_symbol_t symbol,
985 void *data __attribute__((unused)))
986 {
987 char *name = get_executable_symbol_name (symbol);
988
989 if (name)
990 {
991 GCN_DEBUG ("executable symbol: %s\n", name);
992 free (name);
993 }
994
995 return HSA_STATUS_SUCCESS;
996 }
997
998 /* Dump all global symbols in an executable. */
999
1000 static void
1001 dump_executable_symbols (hsa_executable_t executable)
1002 {
1003 hsa_status_t status;
1004 status
1005 = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
1006 dump_executable_symbol,
1007 NULL);
1008 if (status != HSA_STATUS_SUCCESS)
1009 hsa_fatal ("Could not dump HSA executable symbols", status);
1010 }
1011
1012 /* Dump kernel DISPATCH data structure and indent it by INDENT spaces. */
1013
1014 static void
1015 print_kernel_dispatch (struct kernel_dispatch *dispatch, unsigned indent)
1016 {
1017 struct kernargs *kernargs = (struct kernargs *)dispatch->kernarg_address;
1018
1019 fprintf (stderr, "%*sthis: %p\n", indent, "", dispatch);
1020 fprintf (stderr, "%*squeue: %p\n", indent, "", dispatch->queue);
1021 fprintf (stderr, "%*skernarg_address: %p\n", indent, "", kernargs);
1022 fprintf (stderr, "%*sheap address: %p\n", indent, "",
1023 (void*)kernargs->heap_ptr);
1024 fprintf (stderr, "%*sarena address: %p\n", indent, "",
1025 (void*)kernargs->arena_ptr);
1026 fprintf (stderr, "%*sobject: %lu\n", indent, "", dispatch->object);
1027 fprintf (stderr, "%*sprivate_segment_size: %u\n", indent, "",
1028 dispatch->private_segment_size);
1029 fprintf (stderr, "%*sgroup_segment_size: %u\n", indent, "",
1030 dispatch->group_segment_size);
1031 fprintf (stderr, "\n");
1032 }
1033
1034 /* }}} */
1035 /* {{{ Utility functions */
1036
1037 /* Cast the thread local storage to gcn_thread. */
1038
1039 static inline struct gcn_thread *
1040 gcn_thread (void)
1041 {
1042 return (struct gcn_thread *) GOMP_PLUGIN_acc_thread ();
1043 }
1044
1045 /* Initialize debug and suppress_host_fallback according to the environment. */
1046
1047 static void
1048 init_environment_variables (void)
1049 {
1050 if (secure_getenv ("GCN_DEBUG"))
1051 debug = true;
1052 else
1053 debug = false;
1054
1055 if (secure_getenv ("GCN_SUPPRESS_HOST_FALLBACK"))
1056 suppress_host_fallback = true;
1057 else
1058 suppress_host_fallback = false;
1059
1060 hsa_runtime_lib = secure_getenv ("HSA_RUNTIME_LIB");
1061 if (hsa_runtime_lib == NULL)
1062 hsa_runtime_lib = "libhsa-runtime64.so.1";
1063
1064 support_cpu_devices = secure_getenv ("GCN_SUPPORT_CPU_DEVICES");
1065
1066 const char *x = secure_getenv ("GCN_NUM_TEAMS");
1067 if (!x)
1068 x = secure_getenv ("GCN_NUM_GANGS");
1069 if (x)
1070 override_x_dim = atoi (x);
1071
1072 const char *z = secure_getenv ("GCN_NUM_THREADS");
1073 if (!z)
1074 z = secure_getenv ("GCN_NUM_WORKERS");
1075 if (z)
1076 override_z_dim = atoi (z);
1077
1078 const char *heap = secure_getenv ("GCN_HEAP_SIZE");
1079 if (heap)
1080 {
1081 size_t tmp = atol (heap);
1082 if (tmp)
1083 gcn_kernel_heap_size = tmp;
1084 }
1085 }
1086
1087 /* Return malloc'd string with name of SYMBOL. */
1088
1089 static char *
1090 get_executable_symbol_name (hsa_executable_symbol_t symbol)
1091 {
1092 hsa_status_t status;
1093 char *res;
1094 uint32_t len;
1095 const hsa_executable_symbol_info_t info_name_length
1096 = HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH;
1097
1098 status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name_length,
1099 &len);
1100 if (status != HSA_STATUS_SUCCESS)
1101 {
1102 hsa_error ("Could not get length of symbol name", status);
1103 return NULL;
1104 }
1105
1106 res = GOMP_PLUGIN_malloc (len + 1);
1107
1108 const hsa_executable_symbol_info_t info_name
1109 = HSA_EXECUTABLE_SYMBOL_INFO_NAME;
1110
1111 status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name, res);
1112
1113 if (status != HSA_STATUS_SUCCESS)
1114 {
1115 hsa_error ("Could not get symbol name", status);
1116 free (res);
1117 return NULL;
1118 }
1119
1120 res[len] = '\0';
1121
1122 return res;
1123 }
1124
1125 /* Get the number of GPU Compute Units. */
1126
1127 static int
1128 get_cu_count (struct agent_info *agent)
1129 {
1130 uint32_t cu_count;
1131 hsa_status_t status = hsa_fns.hsa_agent_get_info_fn
1132 (agent->id, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
1133 if (status == HSA_STATUS_SUCCESS)
1134 return cu_count;
1135 else
1136 return 64; /* The usual number for older devices. */
1137 }
1138
1139 /* Calculate the maximum grid size for OMP threads / OACC workers.
1140 This depends on the kernel's resource usage levels. */
1141
1142 static int
1143 limit_worker_threads (int threads)
1144 {
1145 /* FIXME Do something more intelligent here.
1146 GCN can always run 4 threads within a Compute Unit, but
1147 more than that depends on register usage. */
1148 if (threads > 16)
1149 threads = 16;
1150 return threads;
1151 }
1152
1153 /* This sets the maximum number of teams to twice the number of GPU Compute
1154 Units to avoid memory waste and corresponding memory access faults. */
1155
1156 static int
1157 limit_teams (int teams, struct agent_info *agent)
1158 {
1159 int max_teams = 2 * get_cu_count (agent);
1160 if (teams > max_teams)
1161 teams = max_teams;
1162 return teams;
1163 }
1164
1165 /* Parse the target attributes INPUT provided by the compiler and return true
1166 if we should run anything at all. If INPUT lacks grid attributes, fill
1167 DEF with default values and store DEF into *RESULT; else store INPUT there.
1168
1169 This is used for OpenMP only. */
1170
1171 static bool
1172 parse_target_attributes (void **input,
1173 struct GOMP_kernel_launch_attributes *def,
1174 struct GOMP_kernel_launch_attributes **result,
1175 struct agent_info *agent)
1176 {
1177 if (!input)
1178 GOMP_PLUGIN_fatal ("No target arguments provided");
1179
1180 bool grid_attrs_found = false;
1181 bool gcn_dims_found = false;
1182 int gcn_teams = 0;
1183 int gcn_threads = 0;
1184 while (*input)
1185 {
1186 intptr_t id = (intptr_t) *input++, val;
1187
1188 if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
1189 val = (intptr_t) *input++;
1190 else
1191 val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
1192
1193 val = (val > INT_MAX) ? INT_MAX : val;
1194
1195 if ((id & GOMP_TARGET_ARG_DEVICE_MASK) == GOMP_DEVICE_GCN
1196 && ((id & GOMP_TARGET_ARG_ID_MASK)
1197 == GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES))
1198 {
1199 grid_attrs_found = true;
1200 break;
1201 }
1202 else if ((id & GOMP_TARGET_ARG_DEVICE_MASK)
1203 == GOMP_TARGET_ARG_DEVICE_ALL)
1204 {
1205 gcn_dims_found = true;
1206 switch (id & GOMP_TARGET_ARG_ID_MASK)
1207 {
1208 case GOMP_TARGET_ARG_NUM_TEAMS:
1209 gcn_teams = limit_teams (val, agent);
1210 break;
1211 case GOMP_TARGET_ARG_THREAD_LIMIT:
1212 gcn_threads = limit_worker_threads (val);
1213 break;
1214 default:
1215 ;
1216 }
1217 }
1218 }
1219
1220 if (gcn_dims_found)
1221 {
1222 bool gfx900_workaround_p = false;
1223
1224 if (agent->device_isa == EF_AMDGPU_MACH_AMDGCN_GFX900
1225 && gcn_threads == 0 && override_z_dim == 0)
1226 {
1227 gfx900_workaround_p = true;
1228 GCN_WARNING ("VEGA BUG WORKAROUND: reducing default number of "
1229 "threads to at most 4 per team.\n");
1230 GCN_WARNING (" - If this is not a Vega 10 device, please use "
1231 "GCN_NUM_THREADS=16\n");
1232 }
1233
1234 /* Ideally, when a dimension isn't explicitly specified, we should
1235 tune it to run 40 (or 32?) threads per CU with no threads getting queued.
1236 In practice, we tune for peak performance on BabelStream, which
1237 for OpenACC is currently 32 threads per CU. */
1238 def->ndim = 3;
1239 if (gcn_teams <= 0 && gcn_threads <= 0)
1240 {
1241 /* Set up a reasonable number of teams and threads. */
1242 gcn_threads = gfx900_workaround_p ? 4 : 16; // 8;
1243 def->gdims[0] = get_cu_count (agent); // * (40 / gcn_threads);
1244 def->gdims[2] = gcn_threads;
1245 }
1246 else if (gcn_teams <= 0 && gcn_threads > 0)
1247 {
1248 /* Auto-scale the number of teams with the number of threads. */
1249 def->gdims[0] = get_cu_count (agent); // * (40 / gcn_threads);
1250 def->gdims[2] = gcn_threads;
1251 }
1252 else if (gcn_teams > 0 && gcn_threads <= 0)
1253 {
1254 int max_threads = gfx900_workaround_p ? 4 : 16;
1255
1256 /* Auto-scale the number of threads with the number of teams. */
1257 def->gdims[0] = gcn_teams;
1258 def->gdims[2] = 16; // get_cu_count (agent) * 40 / gcn_teams;
1259 if (def->gdims[2] == 0)
1260 def->gdims[2] = 1;
1261 else if (def->gdims[2] > max_threads)
1262 def->gdims[2] = max_threads;
1263 }
1264 else
1265 {
1266 def->gdims[0] = gcn_teams;
1267 def->gdims[2] = gcn_threads;
1268 }
1269 def->gdims[1] = 64; /* Each thread is 64 work items wide. */
1270 def->wdims[0] = 1; /* Single team per work-group. */
1271 def->wdims[1] = 64;
1272 def->wdims[2] = 16;
1273 *result = def;
1274 return true;
1275 }
1276 else if (!grid_attrs_found)
1277 {
1278 def->ndim = 1;
1279 def->gdims[0] = 1;
1280 def->gdims[1] = 1;
1281 def->gdims[2] = 1;
1282 def->wdims[0] = 1;
1283 def->wdims[1] = 1;
1284 def->wdims[2] = 1;
1285 *result = def;
1286 GCN_WARNING ("GOMP_OFFLOAD_run called with no launch attributes\n");
1287 return true;
1288 }
1289
1290 struct GOMP_kernel_launch_attributes *kla;
1291 kla = (struct GOMP_kernel_launch_attributes *) *input;
1292 *result = kla;
1293 if (kla->ndim == 0 || kla->ndim > 3)
1294 GOMP_PLUGIN_fatal ("Invalid number of dimensions (%u)", kla->ndim);
1295
1296 GCN_DEBUG ("GOMP_OFFLOAD_run called with %u dimensions:\n", kla->ndim);
1297 unsigned i;
1298 for (i = 0; i < kla->ndim; i++)
1299 {
1300 GCN_DEBUG (" Dimension %u: grid size %u and group size %u\n", i,
1301 kla->gdims[i], kla->wdims[i]);
1302 if (kla->gdims[i] == 0)
1303 return false;
1304 }
1305 return true;
1306 }
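
/* Illustrative sketch, not part of the upstream file: how GOMP_OFFLOAD_run
   (later in this file) consumes the parser above.  DEF provides storage
   for defaults; KLA ends up pointing either at DEF or into ARGS.  The
   example_* helper is hypothetical.  */

static void __attribute__ ((unused))
example_decode_launch (void **args, struct agent_info *agent)
{
  struct GOMP_kernel_launch_attributes def;
  struct GOMP_kernel_launch_attributes *kla;
  if (!parse_target_attributes (args, &def, &kla, agent))
    return;		/* A grid dimension was zero: nothing to run.  */
  GCN_DEBUG ("Would launch a %u-dimensional grid\n", kla->ndim);
}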
1307
1308 /* Return the group size given the requested GROUP size, GRID size and number
1309 of grid dimensions NDIM. */
1310
1311 static uint32_t
1312 get_group_size (uint32_t ndim, uint32_t grid, uint32_t group)
1313 {
1314 if (group == 0)
1315 {
1316 /* TODO: Provide a default via environment or device characteristics. */
1317 if (ndim == 1)
1318 group = 64;
1319 else if (ndim == 2)
1320 group = 8;
1321 else
1322 group = 4;
1323 }
1324
1325 if (group > grid)
1326 group = grid;
1327 return group;
1328 }
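
/* Illustrative sketch, not part of the upstream file: applying the default
   group sizes to every dimension of a launch.  For example, a 2-D grid of
   1000x6 work items with unspecified group sizes gets 8x6 groups, since a
   group is clamped to the grid size.  The example_* helper is
   hypothetical.  */

static void __attribute__ ((unused))
example_apply_group_sizes (struct GOMP_kernel_launch_attributes *kla)
{
  for (uint32_t i = 0; i < kla->ndim; i++)
    kla->wdims[i] = get_group_size (kla->ndim, kla->gdims[i], kla->wdims[i]);
}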
1329
1330 /* Atomically store pair of uint16_t values (HEADER and REST) to a PACKET. */
1331
1332 static void
1333 packet_store_release (uint32_t* packet, uint16_t header, uint16_t rest)
1334 {
1335 __atomic_store_n (packet, header | (rest << 16), __ATOMIC_RELEASE);
1336 }
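
/* Illustrative sketch, not part of the upstream file, mirroring the launch
   code later in this file: composing the header and setup words of an AQL
   kernel dispatch packet and publishing them atomically.  Storing the
   header last, with release semantics, is what makes the completed packet
   visible to the GPU packet processor.  The example_* helper is
   hypothetical; the constants come from hsa.h.  */

static void __attribute__ ((unused))
example_publish_packet (hsa_kernel_dispatch_packet_t *packet, uint32_t ndim)
{
  uint16_t header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
  header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
  header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
  uint16_t setup
    = (uint16_t) ndim << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
  packet_store_release ((uint32_t *) packet, header, setup);
}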
1337
1338 /* A never-called callback for the HSA command queues. These signal events
1339 that we don't use, so we trigger an error.
1340
1341 This "queue" is not to be confused with the async queues, below. */
1342
1343 static void
1344 hsa_queue_callback (hsa_status_t status,
1345 hsa_queue_t *queue __attribute__ ((unused)),
1346 void *data __attribute__ ((unused)))
1347 {
1348 hsa_fatal ("Asynchronous queue error", status);
1349 }
1350
1351 /* }}} */
1352 /* {{{ HSA initialization */
1353
1354 /* Populate hsa_fns with the function addresses from libhsa-runtime64.so. */
1355
1356 static bool
1357 init_hsa_runtime_functions (void)
1358 {
1359 #define DLSYM_FN(function) \
1360 hsa_fns.function##_fn = dlsym (handle, #function); \
1361 if (hsa_fns.function##_fn == NULL) \
1362 return false;
1363 void *handle = dlopen (hsa_runtime_lib, RTLD_LAZY);
1364 if (handle == NULL)
1365 return false;
1366
1367 DLSYM_FN (hsa_status_string)
1368 DLSYM_FN (hsa_system_get_info)
1369 DLSYM_FN (hsa_agent_get_info)
1370 DLSYM_FN (hsa_init)
1371 DLSYM_FN (hsa_iterate_agents)
1372 DLSYM_FN (hsa_region_get_info)
1373 DLSYM_FN (hsa_queue_create)
1374 DLSYM_FN (hsa_agent_iterate_regions)
1375 DLSYM_FN (hsa_executable_destroy)
1376 DLSYM_FN (hsa_executable_create)
1377 DLSYM_FN (hsa_executable_global_variable_define)
1378 DLSYM_FN (hsa_executable_load_code_object)
1379 DLSYM_FN (hsa_executable_freeze)
1380 DLSYM_FN (hsa_signal_create)
1381 DLSYM_FN (hsa_memory_allocate)
1382 DLSYM_FN (hsa_memory_assign_agent)
1383 DLSYM_FN (hsa_memory_copy)
1384 DLSYM_FN (hsa_memory_free)
1385 DLSYM_FN (hsa_signal_destroy)
1386 DLSYM_FN (hsa_executable_get_symbol)
1387 DLSYM_FN (hsa_executable_symbol_get_info)
1388 DLSYM_FN (hsa_executable_iterate_symbols)
1389 DLSYM_FN (hsa_queue_add_write_index_release)
1390 DLSYM_FN (hsa_queue_load_read_index_acquire)
1391 DLSYM_FN (hsa_signal_wait_acquire)
1392 DLSYM_FN (hsa_signal_store_relaxed)
1393 DLSYM_FN (hsa_signal_store_release)
1394 DLSYM_FN (hsa_signal_load_acquire)
1395 DLSYM_FN (hsa_queue_destroy)
1396 DLSYM_FN (hsa_code_object_deserialize)
1397 return true;
1398 #undef DLSYM_FN
1399 }
1400
1401 /* Return true if the agent is a GPU and can accept concurrent submissions
1402 from different threads. */
1403
1404 static bool
1405 suitable_hsa_agent_p (hsa_agent_t agent)
1406 {
1407 hsa_device_type_t device_type;
1408 hsa_status_t status
1409 = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
1410 &device_type);
1411 if (status != HSA_STATUS_SUCCESS)
1412 return false;
1413
1414 switch (device_type)
1415 {
1416 case HSA_DEVICE_TYPE_GPU:
1417 break;
1418 case HSA_DEVICE_TYPE_CPU:
1419 if (!support_cpu_devices)
1420 return false;
1421 break;
1422 default:
1423 return false;
1424 }
1425
1426 uint32_t features = 0;
1427 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_FEATURE,
1428 &features);
1429 if (status != HSA_STATUS_SUCCESS
1430 || !(features & HSA_AGENT_FEATURE_KERNEL_DISPATCH))
1431 return false;
1432 hsa_queue_type_t queue_type;
1433 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_QUEUE_TYPE,
1434 &queue_type);
1435 if (status != HSA_STATUS_SUCCESS
1436 || (queue_type != HSA_QUEUE_TYPE_MULTI))
1437 return false;
1438
1439 return true;
1440 }
1441
1442 /* Callback of hsa_iterate_agents; if AGENT is a GPU device, increment
1443 agent_count in hsa_context. */
1444
1445 static hsa_status_t
1446 count_gpu_agents (hsa_agent_t agent, void *data __attribute__ ((unused)))
1447 {
1448 if (suitable_hsa_agent_p (agent))
1449 hsa_context.agent_count++;
1450 return HSA_STATUS_SUCCESS;
1451 }
1452
1453 /* Callback of hsa_iterate_agents; if AGENT is a GPU device, assign the agent
1454 id to the describing structure in the hsa context. The index of the
1455 structure is pointed to by DATA and is incremented afterwards. */
1456
1457 static hsa_status_t
1458 assign_agent_ids (hsa_agent_t agent, void *data)
1459 {
1460 if (suitable_hsa_agent_p (agent))
1461 {
1462 int *agent_index = (int *) data;
1463 hsa_context.agents[*agent_index].id = agent;
1464 ++*agent_index;
1465 }
1466 return HSA_STATUS_SUCCESS;
1467 }
1468
1469 /* Initialize hsa_context if it has not already been done.
1470 Return TRUE on success. */
1471
1472 static bool
1473 init_hsa_context (void)
1474 {
1475 hsa_status_t status;
1476 int agent_index = 0;
1477
1478 if (hsa_context.initialized)
1479 return true;
1480 init_environment_variables ();
1481 if (!init_hsa_runtime_functions ())
1482 {
1483 GCN_WARNING ("Run-time could not be dynamically opened\n");
1484 if (suppress_host_fallback)
1485 GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
1486 return false;
1487 }
1488 status = hsa_fns.hsa_init_fn ();
1489 if (status != HSA_STATUS_SUCCESS)
1490 return hsa_error ("Run-time could not be initialized", status);
1491 GCN_DEBUG ("HSA run-time initialized for GCN\n");
1492
1493 if (debug)
1494 dump_hsa_system_info ();
1495
1496 status = hsa_fns.hsa_iterate_agents_fn (count_gpu_agents, NULL);
1497 if (status != HSA_STATUS_SUCCESS)
1498 return hsa_error ("GCN GPU devices could not be enumerated", status);
1499 GCN_DEBUG ("There are %i GCN GPU devices.\n", hsa_context.agent_count);
1500
1501 hsa_context.agents
1502 = GOMP_PLUGIN_malloc_cleared (hsa_context.agent_count
1503 * sizeof (struct agent_info));
1504 status = hsa_fns.hsa_iterate_agents_fn (assign_agent_ids, &agent_index);
1505 if (status != HSA_STATUS_SUCCESS)
1506 return hsa_error ("Scanning compute agents failed", status);
1507 if (agent_index != hsa_context.agent_count)
1508 {
1509 GOMP_PLUGIN_error ("Failed to assign IDs to all GCN agents");
1510 return false;
1511 }
1512
1513 if (debug)
1514 {
1515 status = hsa_fns.hsa_iterate_agents_fn (dump_hsa_agent_info, NULL);
1516 if (status != HSA_STATUS_SUCCESS)
1517 GOMP_PLUGIN_error ("Failed to list all HSA runtime agents");
1518 }
1519
1520 uint16_t minor, major;
1521 status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MINOR,
1522 &minor);
1523 if (status != HSA_STATUS_SUCCESS)
1524 GOMP_PLUGIN_error ("Failed to obtain HSA runtime minor version");
1525 status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MAJOR,
1526 &major);
1527 if (status != HSA_STATUS_SUCCESS)
1528 GOMP_PLUGIN_error ("Failed to obtain HSA runtime major version");
1529
1530 size_t len = sizeof hsa_context.driver_version_s;
1531 int printed = snprintf (hsa_context.driver_version_s, len,
1532 "HSA Runtime %hu.%hu", (unsigned short int)major,
1533 (unsigned short int)minor);
1534 if (printed >= len)
1535 GCN_WARNING ("HSA runtime version string was truncated. "
1536 "Version %hu.%hu is too long.", (unsigned short int)major,
1537 (unsigned short int)minor);
1538
1539 hsa_context.initialized = true;
1540 return true;
1541 }
1542
1543 /* Verify that hsa_context has already been initialized and return the
1544 agent_info structure describing device number N. Return NULL on error. */
1545
1546 static struct agent_info *
1547 get_agent_info (int n)
1548 {
1549 if (!hsa_context.initialized)
1550 {
1551 GOMP_PLUGIN_error ("Attempt to use uninitialized GCN context.");
1552 return NULL;
1553 }
1554 if (n >= hsa_context.agent_count)
1555 {
1556 GOMP_PLUGIN_error ("Request to operate on non-existent GCN device %i", n);
1557 return NULL;
1558 }
1559 if (!hsa_context.agents[n].initialized)
1560 {
1561 GOMP_PLUGIN_error ("Attempt to use an uninitialized GCN agent.");
1562 return NULL;
1563 }
1564 return &hsa_context.agents[n];
1565 }
1566
1567 /* Callback of hsa_agent_iterate_regions, via get_*_memory_region functions.
1568
1569 Selects (breaks at) a suitable region of type KIND. */
1570
1571 static hsa_status_t
1572 get_memory_region (hsa_region_t region, hsa_region_t *retval,
1573 hsa_region_global_flag_t kind)
1574 {
1575 hsa_status_t status;
1576 hsa_region_segment_t segment;
1577
1578 status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
1579 &segment);
1580 if (status != HSA_STATUS_SUCCESS)
1581 return status;
1582 if (segment != HSA_REGION_SEGMENT_GLOBAL)
1583 return HSA_STATUS_SUCCESS;
1584
1585 uint32_t flags;
1586 status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
1587 &flags);
1588 if (status != HSA_STATUS_SUCCESS)
1589 return status;
1590 if (flags & kind)
1591 {
1592 *retval = region;
1593 return HSA_STATUS_INFO_BREAK;
1594 }
1595 return HSA_STATUS_SUCCESS;
1596 }
1597
1598 /* Callback of hsa_agent_iterate_regions.
1599
1600 Selects a kernargs memory region. */
1601
1602 static hsa_status_t
1603 get_kernarg_memory_region (hsa_region_t region, void *data)
1604 {
1605 return get_memory_region (region, (hsa_region_t *)data,
1606 HSA_REGION_GLOBAL_FLAG_KERNARG);
1607 }
1608
1609 /* Callback of hsa_agent_iterate_regions.
1610
1611 Selects a coarse-grained memory region suitable for the heap and
1612 offload data. */
1613
1614 static hsa_status_t
1615 get_data_memory_region (hsa_region_t region, void *data)
1616 {
1617 return get_memory_region (region, (hsa_region_t *)data,
1618 HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED);
1619 }
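
/* Illustrative sketch, not part of the upstream file: how the callbacks
   above are used during device initialization.  HSA_STATUS_INFO_BREAK
   signals that a region was selected, so it is not treated as an error.
   The example_* helper is hypothetical.  */

static bool __attribute__ ((unused))
example_find_kernarg_region (struct agent_info *agent)
{
  agent->kernarg_region.handle = (uint64_t) -1;
  hsa_status_t status
    = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
					    get_kernarg_memory_region,
					    &agent->kernarg_region);
  if (status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK)
    return hsa_error ("Scanning memory regions failed", status);
  return agent->kernarg_region.handle != (uint64_t) -1;
}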
1620
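/* Return the ISA-identifying part of the e_flags field of a GCN ELF image. */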
1621 static int
1622 elf_gcn_isa_field (Elf64_Ehdr *image)
1623 {
1624 return image->e_flags & EF_AMDGPU_MACH_MASK;
1625 }
1626
1627 const static char *gcn_gfx803_s = "gfx803";
1628 const static char *gcn_gfx900_s = "gfx900";
1629 const static char *gcn_gfx906_s = "gfx906";
1630 const static char *gcn_gfx908_s = "gfx908";
1631 const static char *gcn_gfx90a_s = "gfx90a";
1632 const static int gcn_isa_name_len = 6;
1633
1634 /* Returns the name that the HSA runtime uses for the ISA or NULL if we do not
1635 support the ISA. */
1636
1637 static const char*
1638 isa_hsa_name (int isa) {
1639 switch(isa)
1640 {
1641 case EF_AMDGPU_MACH_AMDGCN_GFX803:
1642 return gcn_gfx803_s;
1643 case EF_AMDGPU_MACH_AMDGCN_GFX900:
1644 return gcn_gfx900_s;
1645 case EF_AMDGPU_MACH_AMDGCN_GFX906:
1646 return gcn_gfx906_s;
1647 case EF_AMDGPU_MACH_AMDGCN_GFX908:
1648 return gcn_gfx908_s;
1649 case EF_AMDGPU_MACH_AMDGCN_GFX90a:
1650 return gcn_gfx90a_s;
1651 }
1652 return NULL;
1653 }
1654
1655 /* Returns the user-facing name that GCC uses to identify the architecture (e.g.
1656 with -march) or NULL if we do not support the ISA.
1657 Keep in sync with /gcc/config/gcn/gcn.{c,opt}. */
1658
1659 static const char*
1660 isa_gcc_name (int isa) {
1661 switch(isa)
1662 {
1663 case EF_AMDGPU_MACH_AMDGCN_GFX803:
1664 return "fiji";
1665 default:
1666 return isa_hsa_name (isa);
1667 }
1668 }
1669
1670 /* Returns the code which is used in the GCN object code to identify the ISA with
1671 the given name (as used by the HSA runtime). */
1672
1673 static gcn_isa
1674 isa_code(const char *isa) {
1675 if (!strncmp (isa, gcn_gfx803_s, gcn_isa_name_len))
1676 return EF_AMDGPU_MACH_AMDGCN_GFX803;
1677
1678 if (!strncmp (isa, gcn_gfx900_s, gcn_isa_name_len))
1679 return EF_AMDGPU_MACH_AMDGCN_GFX900;
1680
1681 if (!strncmp (isa, gcn_gfx906_s, gcn_isa_name_len))
1682 return EF_AMDGPU_MACH_AMDGCN_GFX906;
1683
1684 if (!strncmp (isa, gcn_gfx908_s, gcn_isa_name_len))
1685 return EF_AMDGPU_MACH_AMDGCN_GFX908;
1686
1687 if (!strncmp (isa, gcn_gfx90a_s, gcn_isa_name_len))
1688 return EF_AMDGPU_MACH_AMDGCN_GFX90a;
1689
1690 return -1;
1691 }
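
/* Illustrative sketch, not part of the upstream file: using the helpers
   above to check that a code object's ELF ISA field matches the agent it
   is about to be loaded on.  The example_* helper is hypothetical; the
   real check is performed during image loading.  */

static bool __attribute__ ((unused))
example_isa_matches (struct agent_info *agent, Elf64_Ehdr *image)
{
  int isa_field = elf_gcn_isa_field (image);
  if (isa_field == agent->device_isa)
    return true;
  const char *have = isa_hsa_name (isa_field);
  const char *want = isa_hsa_name (agent->device_isa);
  GCN_WARNING ("GCN code object ISA '%s' does not match GPU ISA '%s'\n",
	       have ? have : "unknown", want ? want : "unknown");
  return false;
}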
1692
1693 /* }}} */
1694 /* {{{ Run */
1695
1696 /* Create or reuse a team arena.
1697
1698 Team arenas are used by OpenMP to avoid calling malloc multiple times
1699 while setting up each team. This is purely a performance optimization.
1700
1701 Allocating an arena also costs performance, albeit on the host side, so
1702 this function will reuse an existing arena if a large enough one is idle.
1703 The arena is released, but not deallocated, when the kernel exits. */
1704
1705 static void *
1706 get_team_arena (struct agent_info *agent, int num_teams)
1707 {
1708 struct team_arena_list **next_ptr = &agent->team_arena_list;
1709 struct team_arena_list *item;
1710
1711 for (item = *next_ptr; item; next_ptr = &item->next, item = item->next)
1712 {
1713 if (item->num_teams < num_teams)
1714 continue;
1715
1716 if (pthread_mutex_trylock (&item->in_use))
1717 continue;
1718
1719 return item->arena;
1720 }
1721
1722 GCN_DEBUG ("Creating a new arena for %d teams\n", num_teams);
1723
1724 if (pthread_mutex_lock (&agent->team_arena_write_lock))
1725 {
1726 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
1727 return NULL;
1728 }
1729 item = malloc (sizeof (*item));
1730 item->num_teams = num_teams;
1731 item->next = NULL;
1732 *next_ptr = item;
1733
1734 if (pthread_mutex_init (&item->in_use, NULL))
1735 {
1736 GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
1737 return NULL;
1738 }
1739 if (pthread_mutex_lock (&item->in_use))
1740 {
1741 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
1742 return NULL;
1743 }
1744 if (pthread_mutex_unlock (&agent->team_arena_write_lock))
1745 {
1746 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1747 return NULL;
1748 }
1749
1750 const int TEAM_ARENA_SIZE = 64*1024; /* Must match libgomp.h. */
1751 hsa_status_t status;
1752 status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
1753 TEAM_ARENA_SIZE*num_teams,
1754 &item->arena);
1755 if (status != HSA_STATUS_SUCCESS)
1756 hsa_fatal ("Could not allocate memory for GCN kernel arena", status);
1757 status = hsa_fns.hsa_memory_assign_agent_fn (item->arena, agent->id,
1758 HSA_ACCESS_PERMISSION_RW);
1759 if (status != HSA_STATUS_SUCCESS)
1760 hsa_fatal ("Could not assign arena memory to device", status);
1761
1762 return item->arena;
1763 }
1764
1765 /* Mark a team arena available for reuse. */
1766
1767 static void
1768 release_team_arena (struct agent_info* agent, void *arena)
1769 {
1770 struct team_arena_list *item;
1771
1772 for (item = agent->team_arena_list; item; item = item->next)
1773 {
1774 if (item->arena == arena)
1775 {
1776 if (pthread_mutex_unlock (&item->in_use))
1777 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1778 return;
1779 }
1780 }
1781 GOMP_PLUGIN_error ("Could not find a GCN arena to release.");
1782 }
1783
1784 /* Clean up all the allocated team arenas. */
1785
1786 static bool
1787 destroy_team_arenas (struct agent_info *agent)
1788 {
1789 struct team_arena_list *item, *next;
1790
1791 for (item = agent->team_arena_list; item; item = next)
1792 {
1793 next = item->next;
1794 hsa_fns.hsa_memory_free_fn (item->arena);
1795 if (pthread_mutex_destroy (&item->in_use))
1796 {
1797 GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex");
1798 return false;
1799 }
1800 free (item);
1801 }
1802 agent->team_arena_list = NULL;
1803
1804 return true;
1805 }
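
/* Illustrative sketch, not part of the upstream file: the arena life cycle
   around an OpenMP kernel launch.  The arena is acquired before dispatch,
   its device address travels to the kernel via kernargs->arena_ptr, and
   its in_use mutex is unlocked again once the kernel has finished.  The
   example_* helper is hypothetical.  */

static void __attribute__ ((unused))
example_arena_lifecycle (struct agent_info *agent, struct kernargs *kernargs,
			 int num_teams)
{
  void *arena = get_team_arena (agent, num_teams);
  kernargs->arena_ptr = (int64_t) arena;
  /* ... dispatch the kernel and wait for its completion here ...  */
  release_team_arena (agent, arena);
}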
1806
1807 /* Allocate memory on a specified device. */
1808
1809 static void *
1810 alloc_by_agent (struct agent_info *agent, size_t size)
1811 {
1812 GCN_DEBUG ("Allocating %zu bytes on device %d\n", size, agent->device_id);
1813
1814 /* Zero-size allocations are invalid, so in order to return a valid pointer
1815 we need to pass a valid size. One source of zero-size allocations is
1816 kernargs for kernels that have no inputs or outputs (the kernel may
1817 only use console output, for example). */
1818 if (size == 0)
1819 size = 4;
1820
1821 void *ptr;
1822 hsa_status_t status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
1823 size, &ptr);
1824 if (status != HSA_STATUS_SUCCESS)
1825 {
1826 hsa_error ("Could not allocate device memory", status);
1827 return NULL;
1828 }
1829
1830 status = hsa_fns.hsa_memory_assign_agent_fn (ptr, agent->id,
1831 HSA_ACCESS_PERMISSION_RW);
1832 if (status != HSA_STATUS_SUCCESS)
1833 {
1834 hsa_error ("Could not assign data memory to device", status);
1835 return NULL;
1836 }
1837
1838 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1839 bool profiling_dispatch_p
1840 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1841 if (profiling_dispatch_p)
1842 {
1843 acc_prof_info *prof_info = thr->prof_info;
1844 acc_event_info data_event_info;
1845 acc_api_info *api_info = thr->api_info;
1846
1847 prof_info->event_type = acc_ev_alloc;
1848
1849 data_event_info.data_event.event_type = prof_info->event_type;
1850 data_event_info.data_event.valid_bytes
1851 = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1852 data_event_info.data_event.parent_construct
1853 = acc_construct_parallel;
1854 data_event_info.data_event.implicit = 1;
1855 data_event_info.data_event.tool_info = NULL;
1856 data_event_info.data_event.var_name = NULL;
1857 data_event_info.data_event.bytes = size;
1858 data_event_info.data_event.host_ptr = NULL;
1859 data_event_info.data_event.device_ptr = (void *) ptr;
1860
1861 api_info->device_api = acc_device_api_other;
1862
1863 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1864 api_info);
1865 }
1866
1867 return ptr;
1868 }
1869
1870 /* Create kernel dispatch data structure for given KERNEL, along with
1871 the necessary device signals and memory allocations. */
1872
1873 static struct kernel_dispatch *
1874 create_kernel_dispatch (struct kernel_info *kernel, int num_teams)
1875 {
1876 struct agent_info *agent = kernel->agent;
1877 struct kernel_dispatch *shadow
1878 = GOMP_PLUGIN_malloc_cleared (sizeof (struct kernel_dispatch));
1879
1880 shadow->agent = kernel->agent;
1881 shadow->object = kernel->object;
1882
1883 hsa_signal_t sync_signal;
1884 hsa_status_t status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &sync_signal);
1885 if (status != HSA_STATUS_SUCCESS)
1886 hsa_fatal ("Error creating the GCN sync signal", status);
1887
1888 shadow->signal = sync_signal.handle;
1889 shadow->private_segment_size = kernel->private_segment_size;
1890 shadow->group_segment_size = kernel->group_segment_size;
1891
1892 /* We expect kernels to request a single pointer, explicitly, and the
1893 rest of struct kernargs, implicitly. If they request anything else
1894 then something is wrong. */
1895 if (kernel->kernarg_segment_size > 8)
1896 {
1897 GOMP_PLUGIN_fatal ("Unexpectedly large kernargs segment requested");
1898 return NULL;
1899 }
1900
1901 status = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region,
1902 sizeof (struct kernargs),
1903 &shadow->kernarg_address);
1904 if (status != HSA_STATUS_SUCCESS)
1905 hsa_fatal ("Could not allocate memory for GCN kernel arguments", status);
1906 struct kernargs *kernargs = shadow->kernarg_address;
1907
1908 /* Zero-initialize the output_data (minimum needed). */
1909 kernargs->out_ptr = (int64_t)&kernargs->output_data;
1910 kernargs->output_data.next_output = 0;
1911 for (unsigned i = 0;
1912 i < (sizeof (kernargs->output_data.queue)
1913 / sizeof (kernargs->output_data.queue[0]));
1914 i++)
1915 kernargs->output_data.queue[i].written = 0;
1916 kernargs->output_data.consumed = 0;
1917
1918 /* Pass in the heap location. */
1919 kernargs->heap_ptr = (int64_t)kernel->module->heap;
1920
1921 /* Create an arena. */
1922 if (kernel->kind == KIND_OPENMP)
1923 kernargs->arena_ptr = (int64_t)get_team_arena (agent, num_teams);
1924 else
1925 kernargs->arena_ptr = 0;
1926
1927 /* Ensure we can recognize unset return values. */
1928 kernargs->output_data.return_value = 0xcafe0000;
1929
1930 return shadow;
1931 }
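
/* For reference, a sketch of the layout implied by the accesses above.
   This is illustrative only and the field order is an assumption -- the
   authoritative definition is struct kernargs in
   config/gcn/libgomp-gcn.h, which must match the device-side code.  */
#if 0
struct kernargs_sketch
{
  void *vars;                  /* The single explicit kernel argument.  */
  int64_t out_ptr;             /* Points at output_data, below.  */
  int64_t heap_ptr;            /* Base of the device heap.  */
  int64_t arena_ptr;           /* Team arena for this launch, or zero.  */
  struct output output_data;   /* Console ring buffer and return value.  */
};
#endif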
1932
1933 static void
1934 process_reverse_offload (uint64_t fn, uint64_t mapnum, uint64_t hostaddrs,
1935 uint64_t sizes, uint64_t kinds, uint64_t dev_num64)
1936 {
1937 int dev_num = dev_num64;
1938 GOMP_PLUGIN_target_rev (fn, mapnum, hostaddrs, sizes, kinds, dev_num,
1939 NULL, NULL, NULL);
1940 }
1941
1942 /* Output any data written to console output from the kernel. It is expected
1943 that this function is polled during kernel execution.
1944
1945 We print all entries from the last item printed to the next entry without
1946 a "written" flag. If the "final" flag is set then it'll continue right to
1947 the end.
1948
1949 The print buffer is circular, but the from and to locations don't wrap when
1950 the buffer does, so the output limit is UINT_MAX. The target blocks on
1951 output when the buffer is full. */
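
/* For example, if the queue held 1024 entries, then consumed = 2040 and
   next_output = 2045 would print the entries in slots 2040 % 1024
   through 2044 % 1024, i.e. 1016..1020.  (1024 is illustrative; the
   real length is whatever struct output provides.)  */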
1952
1953 static void
1954 console_output (struct kernel_info *kernel, struct kernargs *kernargs,
1955 bool final)
1956 {
1957 unsigned int limit = (sizeof (kernargs->output_data.queue)
1958 / sizeof (kernargs->output_data.queue[0]));
1959
1960 unsigned int from = __atomic_load_n (&kernargs->output_data.consumed,
1961 __ATOMIC_ACQUIRE);
1962 unsigned int to = kernargs->output_data.next_output;
1963
1964 if (from > to)
1965 {
1966 /* Overflow. */
1967 if (final)
1968 printf ("GCN print buffer overflowed.\n");
1969 return;
1970 }
1971
1972 unsigned int i;
1973 for (i = from; i < to; i++)
1974 {
1975 struct printf_data *data = &kernargs->output_data.queue[i%limit];
1976
1977 if (!data->written && !final)
1978 break;
1979
1980 switch (data->type)
1981 {
1982 case 0: printf ("%.128s%ld\n", data->msg, data->ivalue); break;
1983 case 1: printf ("%.128s%f\n", data->msg, data->dvalue); break;
1984 case 2: printf ("%.128s%.128s\n", data->msg, data->text); break;
1985 case 3: printf ("%.128s%.128s", data->msg, data->text); break;
1986 case 4:
1987 process_reverse_offload (data->value_u64[0], data->value_u64[1],
1988 data->value_u64[2], data->value_u64[3],
1989 data->value_u64[4], data->value_u64[5]);
1990 break;
1991 default: printf ("GCN print buffer error!\n"); break;
1992 }
1993 data->written = 0;
1994 __atomic_store_n (&kernargs->output_data.consumed, i+1,
1995 __ATOMIC_RELEASE);
1996 }
1997 fflush (stdout);
1998 }
1999
2000 /* Release data structure created for a kernel dispatch in SHADOW argument,
2001 and clean up the signal and memory allocations. */
2002
2003 static void
2004 release_kernel_dispatch (struct kernel_dispatch *shadow)
2005 {
2006 GCN_DEBUG ("Released kernel dispatch: %p\n", shadow);
2007
2008 struct kernargs *kernargs = shadow->kernarg_address;
2009 void *arena = (void *)kernargs->arena_ptr;
2010 if (arena)
2011 release_team_arena (shadow->agent, arena);
2012
2013 hsa_fns.hsa_memory_free_fn (shadow->kernarg_address);
2014
2015 hsa_signal_t s;
2016 s.handle = shadow->signal;
2017 hsa_fns.hsa_signal_destroy_fn (s);
2018
2019 free (shadow);
2020 }
2021
2022 /* Extract the properties from a kernel binary. */
2023
2024 static void
2025 init_kernel_properties (struct kernel_info *kernel)
2026 {
2027 hsa_status_t status;
2028 struct agent_info *agent = kernel->agent;
2029 hsa_executable_symbol_t kernel_symbol;
2030 char *buf = alloca (strlen (kernel->name) + 4);
2031 sprintf (buf, "%s.kd", kernel->name);
2032 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
2033 buf, agent->id,
2034 0, &kernel_symbol);
2035 if (status != HSA_STATUS_SUCCESS)
2036 {
2037 hsa_warn ("Could not find symbol for kernel in the code object", status);
2038 fprintf (stderr, "symbol not found: '%s'\n", buf);
2039 dump_executable_symbols (agent->executable);
2040 goto failure;
2041 }
2042 GCN_DEBUG ("Located kernel %s\n", kernel->name);
2043 status = hsa_fns.hsa_executable_symbol_get_info_fn
2044 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel->object);
2045 if (status != HSA_STATUS_SUCCESS)
2046 hsa_fatal ("Could not extract a kernel object from its symbol", status);
2047 status = hsa_fns.hsa_executable_symbol_get_info_fn
2048 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
2049 &kernel->kernarg_segment_size);
2050 if (status != HSA_STATUS_SUCCESS)
2051 hsa_fatal ("Could not get info about kernel argument size", status);
2052 status = hsa_fns.hsa_executable_symbol_get_info_fn
2053 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
2054 &kernel->group_segment_size);
2055 if (status != HSA_STATUS_SUCCESS)
2056 hsa_fatal ("Could not get info about kernel group segment size", status);
2057 status = hsa_fns.hsa_executable_symbol_get_info_fn
2058 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
2059 &kernel->private_segment_size);
2060 if (status != HSA_STATUS_SUCCESS)
2061 hsa_fatal ("Could not get info about kernel private segment size",
2062 status);
2063
2064 /* The kernel type is not known until something tries to launch it. */
2065 kernel->kind = KIND_UNKNOWN;
2066
2067 GCN_DEBUG ("Kernel structure for %s fully initialized with "
2068 "following segment sizes: \n", kernel->name);
2069 GCN_DEBUG (" group_segment_size: %u\n",
2070 (unsigned) kernel->group_segment_size);
2071 GCN_DEBUG (" private_segment_size: %u\n",
2072 (unsigned) kernel->private_segment_size);
2073 GCN_DEBUG (" kernarg_segment_size: %u\n",
2074 (unsigned) kernel->kernarg_segment_size);
2075 return;
2076
2077 failure:
2078 kernel->initialization_failed = true;
2079 }
2080
2081 /* Do all the work that is necessary before running KERNEL for the first time.
2082 The function assumes the program has been created, finalized and frozen by
2083 create_and_finalize_hsa_program. */
2084
2085 static void
2086 init_kernel (struct kernel_info *kernel)
2087 {
2088 if (pthread_mutex_lock (&kernel->init_mutex))
2089 GOMP_PLUGIN_fatal ("Could not lock a GCN kernel initialization mutex");
2090 if (kernel->initialized)
2091 {
2092 if (pthread_mutex_unlock (&kernel->init_mutex))
2093 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2094 "mutex");
2095
2096 return;
2097 }
2098
2099 init_kernel_properties (kernel);
2100
2101 if (!kernel->initialization_failed)
2102 {
2103 GCN_DEBUG ("\n");
2104
2105 kernel->initialized = true;
2106 }
2107 if (pthread_mutex_unlock (&kernel->init_mutex))
2108 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2109 "mutex");
2110 }
2111
2112 /* Run KERNEL on its agent, pass VARS to it as arguments and take
2113 launch attributes from KLA.
2114
2115 MODULE_LOCKED indicates that the caller already holds the lock and
2116 run_kernel need not lock it again.
2117 If AQ is NULL then agent->sync_queue will be used. */
2118
2119 static void
2120 run_kernel (struct kernel_info *kernel, void *vars,
2121 struct GOMP_kernel_launch_attributes *kla,
2122 struct goacc_asyncqueue *aq, bool module_locked)
2123 {
2124 GCN_DEBUG ("SGPRs: %d, VGPRs: %d\n", kernel->description->sgpr_count,
2125 kernel->description->vpgr_count);
2126
2127 /* Reduce the number of threads/workers if there are insufficient
2128 VGPRs available to run the kernels together. */
2129 if (kla->ndim == 3 && kernel->description->vpgr_count > 0)
2130 {
2131 int granulated_vgprs = (kernel->description->vpgr_count + 3) & ~3;
2132 int max_threads = (256 / granulated_vgprs) * 4;
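/* For example, a kernel using 65 VGPRs is granulated to 68; 256/68 = 3
   wavefronts fit on each SIMD unit, and with 4 SIMD units per CU that
   allows at most 12 threads/workers per team.  (The constants assume
   GCN hardware: a 256-register VGPR file per SIMD and 4 SIMDs per CU.)  */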
2133 if (kla->gdims[2] > max_threads)
2134 {
2135 GCN_WARNING ("Too many VGPRs required to support %d threads/workers"
2136 " per team/gang - reducing to %d threads/workers.\n",
2137 kla->gdims[2], max_threads);
2138 kla->gdims[2] = max_threads;
2139 }
2140 }
2141
2142 GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel->agent->device_id,
2143 (aq ? aq->id : 0));
2144 GCN_DEBUG ("GCN launch attribs: gdims:[");
2145 int i;
2146 for (i = 0; i < kla->ndim; ++i)
2147 {
2148 if (i)
2149 DEBUG_PRINT (", ");
2150 DEBUG_PRINT ("%u", kla->gdims[i]);
2151 }
2152 DEBUG_PRINT ("], normalized gdims:[");
2153 for (i = 0; i < kla->ndim; ++i)
2154 {
2155 if (i)
2156 DEBUG_PRINT (", ");
2157 DEBUG_PRINT ("%u", kla->gdims[i] / kla->wdims[i]);
2158 }
2159 DEBUG_PRINT ("], wdims:[");
2160 for (i = 0; i < kla->ndim; ++i)
2161 {
2162 if (i)
2163 DEBUG_PRINT (", ");
2164 DEBUG_PRINT ("%u", kla->wdims[i]);
2165 }
2166 DEBUG_PRINT ("]\n");
2167 DEBUG_FLUSH ();
2168
2169 struct agent_info *agent = kernel->agent;
2170 if (!module_locked && pthread_rwlock_rdlock (&agent->module_rwlock))
2171 GOMP_PLUGIN_fatal ("Unable to read-lock a GCN agent rwlock");
2172
2173 if (!agent->initialized)
2174 GOMP_PLUGIN_fatal ("Agent must be initialized");
2175
2176 if (!kernel->initialized)
2177 GOMP_PLUGIN_fatal ("Called kernel must be initialized");
2178
2179 hsa_queue_t *command_q = (aq ? aq->hsa_queue : kernel->agent->sync_queue);
2180
2181 uint64_t index
2182 = hsa_fns.hsa_queue_add_write_index_release_fn (command_q, 1);
2183 GCN_DEBUG ("Got AQL index %llu\n", (long long int) index);
2184
2185 /* Wait until the queue is not full before writing the packet. */
2186 while (index - hsa_fns.hsa_queue_load_read_index_acquire_fn (command_q)
2187 >= command_q->size)
2188 ;
2189
2190 /* Do not allow the dimensions to be overridden when running
2191 constructors or destructors. */
2192 int override_x = kernel->kind == KIND_UNKNOWN ? 0 : override_x_dim;
2193 int override_z = kernel->kind == KIND_UNKNOWN ? 0 : override_z_dim;
2194
2195 hsa_kernel_dispatch_packet_t *packet;
2196 packet = ((hsa_kernel_dispatch_packet_t *) command_q->base_address)
2197 + index % command_q->size;
2198
2199 memset (((uint8_t *) packet) + 4, 0, sizeof (*packet) - 4);
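/* The first 4 bytes hold the packet header and setup fields; they are
   deliberately left unwritten here and stored last, with release
   semantics, by packet_store_release below -- that store is what makes
   the packet visible to the packet processor.  */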
2200 packet->grid_size_x = override_x ? : kla->gdims[0];
2201 packet->workgroup_size_x = get_group_size (kla->ndim,
2202 packet->grid_size_x,
2203 kla->wdims[0]);
2204
2205 if (kla->ndim >= 2)
2206 {
2207 packet->grid_size_y = kla->gdims[1];
2208 packet->workgroup_size_y = get_group_size (kla->ndim, kla->gdims[1],
2209 kla->wdims[1]);
2210 }
2211 else
2212 {
2213 packet->grid_size_y = 1;
2214 packet->workgroup_size_y = 1;
2215 }
2216
2217 if (kla->ndim == 3)
2218 {
2219 packet->grid_size_z = limit_worker_threads (override_z
2220 ? : kla->gdims[2]);
2221 packet->workgroup_size_z = get_group_size (kla->ndim,
2222 packet->grid_size_z,
2223 kla->wdims[2]);
2224 }
2225 else
2226 {
2227 packet->grid_size_z = 1;
2228 packet->workgroup_size_z = 1;
2229 }
2230
2231 GCN_DEBUG ("GCN launch actuals: grid:[%u, %u, %u],"
2232 " normalized grid:[%u, %u, %u], workgroup:[%u, %u, %u]\n",
2233 packet->grid_size_x, packet->grid_size_y, packet->grid_size_z,
2234 packet->grid_size_x / packet->workgroup_size_x,
2235 packet->grid_size_y / packet->workgroup_size_y,
2236 packet->grid_size_z / packet->workgroup_size_z,
2237 packet->workgroup_size_x, packet->workgroup_size_y,
2238 packet->workgroup_size_z);
2239
2240 struct kernel_dispatch *shadow
2241 = create_kernel_dispatch (kernel, packet->grid_size_x);
2242 shadow->queue = command_q;
2243
2244 if (debug)
2245 {
2246 fprintf (stderr, "\nKernel has following dependencies:\n");
2247 print_kernel_dispatch (shadow, 2);
2248 }
2249
2250 packet->private_segment_size = kernel->private_segment_size;
2251 packet->group_segment_size = kernel->group_segment_size;
2252 packet->kernel_object = kernel->object;
2253 packet->kernarg_address = shadow->kernarg_address;
2254 hsa_signal_t s;
2255 s.handle = shadow->signal;
2256 packet->completion_signal = s;
2257 hsa_fns.hsa_signal_store_relaxed_fn (s, 1);
2258 memcpy (shadow->kernarg_address, &vars, sizeof (vars));
2259
2260 GCN_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");
2261
2262 uint16_t header;
2263 header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
2264 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
2265 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
2266
2267 GCN_DEBUG ("Going to dispatch kernel %s on device %d\n", kernel->name,
2268 agent->device_id);
2269
2270 packet_store_release ((uint32_t *) packet, header,
2271 (uint16_t) kla->ndim
2272 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);
2273
2274 hsa_fns.hsa_signal_store_release_fn (command_q->doorbell_signal,
2275 index);
2276
2277 GCN_DEBUG ("Kernel dispatched, waiting for completion\n");
2278
2279 /* Root signal waits with 1ms timeout. */
2280 while (hsa_fns.hsa_signal_wait_acquire_fn (s, HSA_SIGNAL_CONDITION_LT, 1,
2281 1000 * 1000,
2282 HSA_WAIT_STATE_BLOCKED) != 0)
2283 {
2284 console_output (kernel, shadow->kernarg_address, false);
2285 }
2286 console_output (kernel, shadow->kernarg_address, true);
2287
2288 struct kernargs *kernargs = shadow->kernarg_address;
2289 unsigned int return_value = (unsigned int)kernargs->output_data.return_value;
2290
2291 release_kernel_dispatch (shadow);
2292
2293 if (!module_locked && pthread_rwlock_unlock (&agent->module_rwlock))
2294 GOMP_PLUGIN_fatal ("Unable to unlock a GCN agent rwlock");
2295
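/* The upper 16 bits of the return value identify how the kernel ended:
   0xcafe means the initial value was never overwritten (no exit call);
   0xffff means exit was called, in which case bits 8-15 carry a signal
   number and bits 0-7 the exit code.  */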
2296 unsigned int upper = (return_value & ~0xffff) >> 16;
2297 if (upper == 0xcafe)
2298 ; /* exit not called, normal termination.  */
2299 else if (upper == 0xffff)
2300 ; /* exit called.  */
2301 else
2302 {
2303 GOMP_PLUGIN_error ("Possible kernel exit value corruption, 2 most"
2304 " significant bytes aren't 0xffff or 0xcafe: 0x%x\n",
2305 return_value);
2306 abort ();
2307 }
2308
2309 if (upper == 0xffff)
2310 {
2311 unsigned int signal = (return_value >> 8) & 0xff;
2312
2313 if (signal == SIGABRT)
2314 {
2315 GCN_WARNING ("GCN Kernel aborted\n");
2316 abort ();
2317 }
2318 else if (signal != 0)
2319 {
2320 GCN_WARNING ("GCN Kernel received unknown signal\n");
2321 abort ();
2322 }
2323
2324 GCN_DEBUG ("GCN Kernel exited with value: %d\n", return_value & 0xff);
2325 exit (return_value & 0xff);
2326 }
2327 }
2328
2329 /* }}} */
2330 /* {{{ Load/Unload */
2331
2332 /* Initialize KERNEL from D and other parameters. Return true on success. */
2333
2334 static bool
2335 init_basic_kernel_info (struct kernel_info *kernel,
2336 struct hsa_kernel_description *d,
2337 struct agent_info *agent,
2338 struct module_info *module)
2339 {
2340 kernel->agent = agent;
2341 kernel->module = module;
2342 kernel->name = d->name;
2343 kernel->description = d;
2344 if (pthread_mutex_init (&kernel->init_mutex, NULL))
2345 {
2346 GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");
2347 return false;
2348 }
2349 return true;
2350 }
2351
2352 /* Check that the GCN ISA of the given image matches the ISA of the agent. */
2353
2354 static bool
2355 isa_matches_agent (struct agent_info *agent, Elf64_Ehdr *image)
2356 {
2357 int isa_field = elf_gcn_isa_field (image);
2358 const char* isa_s = isa_hsa_name (isa_field);
2359 if (!isa_s)
2360 {
2361 hsa_error ("Unsupported ISA in GCN code object.", HSA_STATUS_ERROR);
2362 return false;
2363 }
2364
2365 if (isa_field != agent->device_isa)
2366 {
2367 char msg[120];
2368 const char *agent_isa_s = isa_hsa_name (agent->device_isa);
2369 const char *agent_isa_gcc_s = isa_gcc_name (agent->device_isa);
2370 assert (agent_isa_s);
2371 assert (agent_isa_gcc_s);
2372
2373 snprintf (msg, sizeof msg,
2374 "GCN code object ISA '%s' does not match GPU ISA '%s'.\n"
2375 "Try to recompile with '-foffload-options=-march=%s'.\n",
2376 isa_s, agent_isa_s, agent_isa_gcc_s);
2377
2378 hsa_error (msg, HSA_STATUS_ERROR);
2379 return false;
2380 }
2381
2382 return true;
2383 }
2384
2385 /* Create and finalize the program consisting of all loaded modules. */
2386
2387 static bool
2388 create_and_finalize_hsa_program (struct agent_info *agent)
2389 {
2390 hsa_status_t status;
2391 bool res = true;
2392 if (pthread_mutex_lock (&agent->prog_mutex))
2393 {
2394 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
2395 return false;
2396 }
2397 if (agent->prog_finalized)
2398 goto final;
2399
2400 status
2401 = hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
2402 HSA_EXECUTABLE_STATE_UNFROZEN,
2403 "", &agent->executable);
2404 if (status != HSA_STATUS_SUCCESS)
2405 {
2406 hsa_error ("Could not create GCN executable", status);
2407 goto fail;
2408 }
2409
2410 /* Load any GCN modules. */
2411 struct module_info *module = agent->module;
2412 if (module)
2413 {
2414 Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;
2415
2416 if (!isa_matches_agent (agent, image))
2417 goto fail;
2418
2419 hsa_code_object_t co = { 0 };
2420 status = hsa_fns.hsa_code_object_deserialize_fn
2421 (module->image_desc->gcn_image->image,
2422 module->image_desc->gcn_image->size,
2423 NULL, &co);
2424 if (status != HSA_STATUS_SUCCESS)
2425 {
2426 hsa_error ("Could not deserialize GCN code object", status);
2427 goto fail;
2428 }
2429
2430 status = hsa_fns.hsa_executable_load_code_object_fn
2431 (agent->executable, agent->id, co, "");
2432 if (status != HSA_STATUS_SUCCESS)
2433 {
2434 hsa_error ("Could not load GCN code object", status);
2435 goto fail;
2436 }
2437
2438 if (!module->heap)
2439 {
2440 status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
2441 gcn_kernel_heap_size,
2442 (void**)&module->heap);
2443 if (status != HSA_STATUS_SUCCESS)
2444 {
2445 hsa_error ("Could not allocate memory for GCN heap", status);
2446 goto fail;
2447 }
2448
2449 status = hsa_fns.hsa_memory_assign_agent_fn
2450 (module->heap, agent->id, HSA_ACCESS_PERMISSION_RW);
2451 if (status != HSA_STATUS_SUCCESS)
2452 {
2453 hsa_error ("Could not assign GCN heap memory to device", status);
2454 goto fail;
2455 }
2456
2457 hsa_fns.hsa_memory_copy_fn (&module->heap->size,
2458 &gcn_kernel_heap_size,
2459 sizeof (gcn_kernel_heap_size));
2460 }
2461
2462 }
2463
2464 if (debug)
2465 dump_executable_symbols (agent->executable);
2466
2467 status = hsa_fns.hsa_executable_freeze_fn (agent->executable, "");
2468 if (status != HSA_STATUS_SUCCESS)
2469 {
2470 hsa_error ("Could not freeze the GCN executable", status);
2471 goto fail;
2472 }
2473
2474 final:
2475 agent->prog_finalized = true;
2476
2477 if (pthread_mutex_unlock (&agent->prog_mutex))
2478 {
2479 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
2480 res = false;
2481 }
2482
2483 return res;
2484
2485 fail:
2486 res = false;
2487 goto final;
2488 }
2489
2490 /* Free the HSA program in agent and everything associated with it and set
2491 agent->prog_finalized and the initialized flags of all kernels to false.
2492 Return TRUE on success. */
2493
2494 static bool
2495 destroy_hsa_program (struct agent_info *agent)
2496 {
2497 if (!agent->prog_finalized)
2498 return true;
2499
2500 hsa_status_t status;
2501
2502 GCN_DEBUG ("Destroying the current GCN program.\n");
2503
2504 status = hsa_fns.hsa_executable_destroy_fn (agent->executable);
2505 if (status != HSA_STATUS_SUCCESS)
2506 return hsa_error ("Could not destroy GCN executable", status);
2507
2508 if (agent->module)
2509 {
2510 int i;
2511 for (i = 0; i < agent->module->kernel_count; i++)
2512 agent->module->kernels[i].initialized = false;
2513
2514 if (agent->module->heap)
2515 {
2516 hsa_fns.hsa_memory_free_fn (agent->module->heap);
2517 agent->module->heap = NULL;
2518 }
2519 }
2520 agent->prog_finalized = false;
2521 return true;
2522 }
2523
2524 /* Deinitialize all information associated with MODULE and kernels within
2525 it. Return TRUE on success. */
2526
2527 static bool
2528 destroy_module (struct module_info *module, bool locked)
2529 {
2530 /* Run destructors before destroying module. */
2531 struct GOMP_kernel_launch_attributes kla =
2532 { 3,
2533 /* Grid size. */
2534 { 1, 64, 1 },
2535 /* Work-group size. */
2536 { 1, 64, 1 }
2537 };
2538
2539 if (module->fini_array_func)
2540 {
2541 init_kernel (module->fini_array_func);
2542 run_kernel (module->fini_array_func, NULL, &kla, NULL, locked);
2543 }
2544 module->constructors_run_p = false;
2545
2546 int i;
2547 for (i = 0; i < module->kernel_count; i++)
2548 if (pthread_mutex_destroy (&module->kernels[i].init_mutex))
2549 {
2550 GOMP_PLUGIN_error ("Failed to destroy a GCN kernel initialization "
2551 "mutex");
2552 return false;
2553 }
2554
2555 return true;
2556 }
2557
2558 /* }}} */
2559 /* {{{ Async */
2560
2561 /* Execute one entry from an async queue, according to its type. */
2562
2563 static void
2564 execute_queue_entry (struct goacc_asyncqueue *aq, int index)
2565 {
2566 struct queue_entry *entry = &aq->queue[index];
2567
2568 switch (entry->type)
2569 {
2570 case KERNEL_LAUNCH:
2571 if (DEBUG_QUEUES)
2572 GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d)\n",
2573 aq->agent->device_id, aq->id, index);
2574 run_kernel (entry->u.launch.kernel,
2575 entry->u.launch.vars,
2576 &entry->u.launch.kla, aq, false);
2577 if (DEBUG_QUEUES)
2578 GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d) done\n",
2579 aq->agent->device_id, aq->id, index);
2580 break;
2581
2582 case CALLBACK:
2583 if (DEBUG_QUEUES)
2584 GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d)\n",
2585 aq->agent->device_id, aq->id, index);
2586 entry->u.callback.fn (entry->u.callback.data);
2587 if (DEBUG_QUEUES)
2588 GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d) done\n",
2589 aq->agent->device_id, aq->id, index);
2590 break;
2591
2592 case ASYNC_WAIT:
2593 {
2594 /* FIXME: is it safe to access a placeholder that may already have
2595 been executed? */
2596 struct placeholder *placeholderp = entry->u.asyncwait.placeholderp;
2597
2598 if (DEBUG_QUEUES)
2599 GCN_DEBUG ("Async thread %d:%d: Executing async wait entry (%d)\n",
2600 aq->agent->device_id, aq->id, index);
2601
2602 pthread_mutex_lock (&placeholderp->mutex);
2603
2604 while (!placeholderp->executed)
2605 pthread_cond_wait (&placeholderp->cond, &placeholderp->mutex);
2606
2607 pthread_mutex_unlock (&placeholderp->mutex);
2608
2609 if (pthread_cond_destroy (&placeholderp->cond))
2610 GOMP_PLUGIN_error ("Failed to destroy serialization cond");
2611
2612 if (pthread_mutex_destroy (&placeholderp->mutex))
2613 GOMP_PLUGIN_error ("Failed to destroy serialization mutex");
2614
2615 if (DEBUG_QUEUES)
2616 GCN_DEBUG ("Async thread %d:%d: Executing async wait "
2617 "entry (%d) done\n", aq->agent->device_id, aq->id, index);
2618 }
2619 break;
2620
2621 case ASYNC_PLACEHOLDER:
2622 pthread_mutex_lock (&entry->u.placeholder.mutex);
2623 entry->u.placeholder.executed = 1;
2624 pthread_cond_signal (&entry->u.placeholder.cond);
2625 pthread_mutex_unlock (&entry->u.placeholder.mutex);
2626 break;
2627
2628 default:
2629 GOMP_PLUGIN_fatal ("Unknown queue element");
2630 }
2631 }
2632
2633 /* This function is run as a thread to service an async queue in the
2634 background. It runs continuously until the stop flag is set. */
2635
2636 static void *
2637 drain_queue (void *thread_arg)
2638 {
2639 struct goacc_asyncqueue *aq = thread_arg;
2640
2641 if (DRAIN_QUEUE_SYNCHRONOUS_P)
2642 {
2643 aq->drain_queue_stop = 2;
2644 return NULL;
2645 }
2646
2647 pthread_mutex_lock (&aq->mutex);
2648
2649 while (true)
2650 {
2651 if (aq->drain_queue_stop)
2652 break;
2653
2654 if (aq->queue_n > 0)
2655 {
2656 pthread_mutex_unlock (&aq->mutex);
2657 execute_queue_entry (aq, aq->queue_first);
2658
2659 pthread_mutex_lock (&aq->mutex);
2660 aq->queue_first = ((aq->queue_first + 1)
2661 % ASYNC_QUEUE_SIZE);
2662 aq->queue_n--;
2663
2664 if (DEBUG_THREAD_SIGNAL)
2665 GCN_DEBUG ("Async thread %d:%d: broadcasting queue out update\n",
2666 aq->agent->device_id, aq->id);
2667 pthread_cond_broadcast (&aq->queue_cond_out);
2668 pthread_mutex_unlock (&aq->mutex);
2669
2670 if (DEBUG_QUEUES)
2671 GCN_DEBUG ("Async thread %d:%d: continue\n", aq->agent->device_id,
2672 aq->id);
2673 pthread_mutex_lock (&aq->mutex);
2674 }
2675 else
2676 {
2677 if (DEBUG_THREAD_SLEEP)
2678 GCN_DEBUG ("Async thread %d:%d: going to sleep\n",
2679 aq->agent->device_id, aq->id);
2680 pthread_cond_wait (&aq->queue_cond_in, &aq->mutex);
2681 if (DEBUG_THREAD_SLEEP)
2682 GCN_DEBUG ("Async thread %d:%d: woke up, rechecking\n",
2683 aq->agent->device_id, aq->id);
2684 }
2685 }
2686
2687 aq->drain_queue_stop = 2;
2688 if (DEBUG_THREAD_SIGNAL)
2689 GCN_DEBUG ("Async thread %d:%d: broadcasting last queue out update\n",
2690 aq->agent->device_id, aq->id);
2691 pthread_cond_broadcast (&aq->queue_cond_out);
2692 pthread_mutex_unlock (&aq->mutex);
2693
2694 GCN_DEBUG ("Async thread %d:%d: returning\n", aq->agent->device_id, aq->id);
2695 return NULL;
2696 }
2697
2698 /* This function is used only when DRAIN_QUEUE_SYNCHRONOUS_P is set, which
2699 is not usually the case. This is just a debug tool. */
2700
2701 static void
2702 drain_queue_synchronous (struct goacc_asyncqueue *aq)
2703 {
2704 pthread_mutex_lock (&aq->mutex);
2705
2706 while (aq->queue_n > 0)
2707 {
2708 execute_queue_entry (aq, aq->queue_first);
2709
2710 aq->queue_first = ((aq->queue_first + 1)
2711 % ASYNC_QUEUE_SIZE);
2712 aq->queue_n--;
2713 }
2714
2715 pthread_mutex_unlock (&aq->mutex);
2716 }
2717
2718 /* Block the current thread until an async queue is writable. The aq->mutex
2719 lock should be held on entry, and remains locked on exit. */
2720
2721 static void
2722 wait_for_queue_nonfull (struct goacc_asyncqueue *aq)
2723 {
2724 if (aq->queue_n == ASYNC_QUEUE_SIZE)
2725 {
2726 /* Queue is full. Wait for it to not be full. */
2727 while (aq->queue_n == ASYNC_QUEUE_SIZE)
2728 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
2729 }
2730 }
2731
2732 /* Request an asynchronous kernel launch on the specified queue. This
2733 may block if the queue is full, but returns without waiting for the
2734 kernel to run. */
2735
2736 static void
2737 queue_push_launch (struct goacc_asyncqueue *aq, struct kernel_info *kernel,
2738 void *vars, struct GOMP_kernel_launch_attributes *kla)
2739 {
2740 assert (aq->agent == kernel->agent);
2741
2742 pthread_mutex_lock (&aq->mutex);
2743
2744 wait_for_queue_nonfull (aq);
2745
2746 int queue_last = ((aq->queue_first + aq->queue_n)
2747 % ASYNC_QUEUE_SIZE);
2748 if (DEBUG_QUEUES)
2749 GCN_DEBUG ("queue_push_launch %d:%d: at %i\n", aq->agent->device_id,
2750 aq->id, queue_last);
2751
2752 aq->queue[queue_last].type = KERNEL_LAUNCH;
2753 aq->queue[queue_last].u.launch.kernel = kernel;
2754 aq->queue[queue_last].u.launch.vars = vars;
2755 aq->queue[queue_last].u.launch.kla = *kla;
2756
2757 aq->queue_n++;
2758
2759 if (DEBUG_THREAD_SIGNAL)
2760 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2761 aq->agent->device_id, aq->id);
2762 pthread_cond_signal (&aq->queue_cond_in);
2763
2764 pthread_mutex_unlock (&aq->mutex);
2765 }
2766
2767 /* Request an asynchronous callback on the specified queue. The callback
2768 function will be called, with the given opaque data, from the appropriate
2769 async thread, when all previous items on that queue are complete. */
2770
2771 static void
2772 queue_push_callback (struct goacc_asyncqueue *aq, void (*fn)(void *),
2773 void *data)
2774 {
2775 pthread_mutex_lock (&aq->mutex);
2776
2777 wait_for_queue_nonfull (aq);
2778
2779 int queue_last = ((aq->queue_first + aq->queue_n)
2780 % ASYNC_QUEUE_SIZE);
2781 if (DEBUG_QUEUES)
2782 GCN_DEBUG ("queue_push_callback %d:%d: at %i\n", aq->agent->device_id,
2783 aq->id, queue_last);
2784
2785 aq->queue[queue_last].type = CALLBACK;
2786 aq->queue[queue_last].u.callback.fn = fn;
2787 aq->queue[queue_last].u.callback.data = data;
2788
2789 aq->queue_n++;
2790
2791 if (DEBUG_THREAD_SIGNAL)
2792 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2793 aq->agent->device_id, aq->id);
2794 pthread_cond_signal (&aq->queue_cond_in);
2795
2796 pthread_mutex_unlock (&aq->mutex);
2797 }
2798
2799 /* Request that a given async thread wait for another thread (unspecified) to
2800 reach the given placeholder. The wait will occur when all previous entries
2801 on the queue are complete. A placeholder is effectively a kind of signal
2802 which simply sets a flag when encountered in a queue. */
2803
2804 static void
2805 queue_push_asyncwait (struct goacc_asyncqueue *aq,
2806 struct placeholder *placeholderp)
2807 {
2808 pthread_mutex_lock (&aq->mutex);
2809
2810 wait_for_queue_nonfull (aq);
2811
2812 int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2813 if (DEBUG_QUEUES)
2814 GCN_DEBUG ("queue_push_asyncwait %d:%d: at %i\n", aq->agent->device_id,
2815 aq->id, queue_last);
2816
2817 aq->queue[queue_last].type = ASYNC_WAIT;
2818 aq->queue[queue_last].u.asyncwait.placeholderp = placeholderp;
2819
2820 aq->queue_n++;
2821
2822 if (DEBUG_THREAD_SIGNAL)
2823 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2824 aq->agent->device_id, aq->id);
2825 pthread_cond_signal (&aq->queue_cond_in);
2826
2827 pthread_mutex_unlock (&aq->mutex);
2828 }
2829
2830 /* Add a placeholder into an async queue. When the async thread reaches the
2831 placeholder it will set the "executed" flag to true and continue.
2832 Another thread may be waiting on this thread reaching the placeholder. */
2833
2834 static struct placeholder *
2835 queue_push_placeholder (struct goacc_asyncqueue *aq)
2836 {
2837 struct placeholder *placeholderp;
2838
2839 pthread_mutex_lock (&aq->mutex);
2840
2841 wait_for_queue_nonfull (aq);
2842
2843 int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2844 if (DEBUG_QUEUES)
2845 GCN_DEBUG ("queue_push_placeholder %d:%d: at %i\n", aq->agent->device_id,
2846 aq->id, queue_last);
2847
2848 aq->queue[queue_last].type = ASYNC_PLACEHOLDER;
2849 placeholderp = &aq->queue[queue_last].u.placeholder;
2850
2851 if (pthread_mutex_init (&placeholderp->mutex, NULL))
2852 {
2853 pthread_mutex_unlock (&aq->mutex);
2854 GOMP_PLUGIN_fatal ("Failed to initialize serialization mutex");
2855 }
2856
2857 if (pthread_cond_init (&placeholderp->cond, NULL))
2858 {
2859 pthread_mutex_unlock (&aq->mutex);
2860 GOMP_PLUGIN_fatal ("Failed to initialize serialization cond");
2861 }
2862
2863 placeholderp->executed = 0;
2864
2865 aq->queue_n++;
2866
2867 if (DEBUG_THREAD_SIGNAL)
2868 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2869 aq->agent->device_id, aq->id);
2870 pthread_cond_signal (&aq->queue_cond_in);
2871
2872 pthread_mutex_unlock (&aq->mutex);
2873
2874 return placeholderp;
2875 }
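
/* A sketch of how the placeholder and async-wait entries combine to
   serialize one queue against another; "aq1" and "aq2" stand in for two
   queues on the same agent.  */
#if 0
struct placeholder *p = queue_push_placeholder (aq1);
queue_push_asyncwait (aq2, p);
/* aq2 now executes nothing past this point until aq1's async thread
   has reached and signalled the placeholder.  */
#endif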
2876
2877 /* Signal an asynchronous thread to terminate, and wait for it to do so. */
2878
2879 static void
2880 finalize_async_thread (struct goacc_asyncqueue *aq)
2881 {
2882 pthread_mutex_lock (&aq->mutex);
2883 if (aq->drain_queue_stop == 2)
2884 {
2885 pthread_mutex_unlock (&aq->mutex);
2886 return;
2887 }
2888
2889 aq->drain_queue_stop = 1;
2890
2891 if (DEBUG_THREAD_SIGNAL)
2892 GCN_DEBUG ("Signalling async thread %d:%d: cond_in\n",
2893 aq->agent->device_id, aq->id);
2894 pthread_cond_signal (&aq->queue_cond_in);
2895
2896 while (aq->drain_queue_stop != 2)
2897 {
2898 if (DEBUG_THREAD_SLEEP)
2899 GCN_DEBUG ("Waiting for async thread %d:%d to finish, putting thread"
2900 " to sleep\n", aq->agent->device_id, aq->id);
2901 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
2902 if (DEBUG_THREAD_SLEEP)
2903 GCN_DEBUG ("Waiting, woke up thread %d:%d. Rechecking\n",
2904 aq->agent->device_id, aq->id);
2905 }
2906
2907 GCN_DEBUG ("Done waiting for async thread %d:%d\n", aq->agent->device_id,
2908 aq->id);
2909 pthread_mutex_unlock (&aq->mutex);
2910
2911 int err = pthread_join (aq->thread_drain_queue, NULL);
2912 if (err != 0)
2913 GOMP_PLUGIN_fatal ("Join async thread %d:%d: failed: %s",
2914 aq->agent->device_id, aq->id, strerror (err));
2915 GCN_DEBUG ("Joined with async thread %d:%d\n", aq->agent->device_id, aq->id);
2916 }
2917
2918 /* Set up an async queue for OpenMP. There will be only one. The
2919 implementation simply uses an OpenACC async queue.
2920 FIXME: is this thread-safe if two threads call this function? */
2921
2922 static void
2923 maybe_init_omp_async (struct agent_info *agent)
2924 {
2925 if (!agent->omp_async_queue)
2926 agent->omp_async_queue
2927 = GOMP_OFFLOAD_openacc_async_construct (agent->device_id);
2928 }
2929
2930 /* A wrapper that works around an issue in the HSA runtime with host-to-device
2931 copies from read-only pages. */
2932
2933 static void
2934 hsa_memory_copy_wrapper (void *dst, const void *src, size_t len)
2935 {
2936 hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, len);
2937
2938 if (status == HSA_STATUS_SUCCESS)
2939 return;
2940
2941 /* It appears that the copy fails if the source data is in a read-only page.
2942 We can't detect that easily, so try copying the data to a temporary buffer
2943 and doing the copy again if we got an error above. */
2944
2945 GCN_WARNING ("Read-only data transfer bug workaround triggered for "
2946 "[%p:+%d]\n", (void *) src, (int) len);
2947
2948 void *src_copy = GOMP_PLUGIN_malloc (len);
2949 memcpy (src_copy, src, len);
2950 status = hsa_fns.hsa_memory_copy_fn (dst, (const void *) src_copy, len);
2951 free (src_copy);
2952 if (status != HSA_STATUS_SUCCESS)
2953 GOMP_PLUGIN_error ("memory copy failed");
2954 }
2955
2956 /* Copy data to or from a device. This is intended for use as an async
2957 callback event. */
2958
2959 static void
2960 copy_data (void *data_)
2961 {
2962 struct copy_data *data = (struct copy_data *)data_;
2963 GCN_DEBUG ("Async thread %d:%d: Copying %zu bytes from (%p) to (%p)\n",
2964 data->aq->agent->device_id, data->aq->id, data->len, data->src,
2965 data->dst);
2966 hsa_memory_copy_wrapper (data->dst, data->src, data->len);
2967 free (data);
2968 }
2969
2970 /* Free device data. This is intended for use as an async callback event. */
2971
2972 static void
2973 gomp_offload_free (void *ptr)
2974 {
2975 GCN_DEBUG ("Async thread ?:?: Freeing %p\n", ptr);
2976 GOMP_OFFLOAD_free (0, ptr);
2977 }
2978
2979 /* Request an asynchronous data copy, to or from a device, on a given queue.
2980 The event will be registered as a callback. */
2981
2982 static void
2983 queue_push_copy (struct goacc_asyncqueue *aq, void *dst, const void *src,
2984 size_t len)
2985 {
2986 if (DEBUG_QUEUES)
2987 GCN_DEBUG ("queue_push_copy %d:%d: %zu bytes from (%p) to (%p)\n",
2988 aq->agent->device_id, aq->id, len, src, dst);
2989 struct copy_data *data
2990 = (struct copy_data *)GOMP_PLUGIN_malloc (sizeof (struct copy_data));
2991 data->dst = dst;
2992 data->src = src;
2993 data->len = len;
2994 data->aq = aq;
2995 queue_push_callback (aq, copy_data, data);
2996 }
2997
2998 /* Return true if the given queue is currently empty. */
2999
3000 static int
3001 queue_empty (struct goacc_asyncqueue *aq)
3002 {
3003 pthread_mutex_lock (&aq->mutex);
3004 int res = aq->queue_n == 0 ? 1 : 0;
3005 pthread_mutex_unlock (&aq->mutex);
3006
3007 return res;
3008 }
3009
3010 /* Wait for a given queue to become empty. This implements an OpenACC wait
3011 directive. */
3012
3013 static void
3014 wait_queue (struct goacc_asyncqueue *aq)
3015 {
3016 if (DRAIN_QUEUE_SYNCHRONOUS_P)
3017 {
3018 drain_queue_synchronous (aq);
3019 return;
3020 }
3021
3022 pthread_mutex_lock (&aq->mutex);
3023
3024 while (aq->queue_n > 0)
3025 {
3026 if (DEBUG_THREAD_SLEEP)
3027 GCN_DEBUG ("waiting for thread %d:%d, putting thread to sleep\n",
3028 aq->agent->device_id, aq->id);
3029 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
3030 if (DEBUG_THREAD_SLEEP)
3031 GCN_DEBUG ("thread %d:%d woke up. Rechecking\n", aq->agent->device_id,
3032 aq->id);
3033 }
3034
3035 pthread_mutex_unlock (&aq->mutex);
3036 GCN_DEBUG ("waiting for thread %d:%d, done\n", aq->agent->device_id, aq->id);
3037 }
3038
3039 /* }}} */
3040 /* {{{ OpenACC support */
3041
3042 /* Execute an OpenACC kernel, synchronously or asynchronously. */
3043
3044 static void
3045 gcn_exec (struct kernel_info *kernel, size_t mapnum, void **hostaddrs,
3046 void **devaddrs, unsigned *dims, void *targ_mem_desc, bool async,
3047 struct goacc_asyncqueue *aq)
3048 {
3049 if (!GOMP_OFFLOAD_can_run (kernel))
3050 GOMP_PLUGIN_fatal ("OpenACC host fallback unimplemented.");
3051
3052 /* If we get here then this must be an OpenACC kernel. */
3053 kernel->kind = KIND_OPENACC;
3054
3055 /* devaddrs must be double-indirect on the target. */
3056 void **ind_da = alloc_by_agent (kernel->agent, sizeof (void*) * mapnum);
3057 for (size_t i = 0; i < mapnum; i++)
3058 hsa_fns.hsa_memory_copy_fn (&ind_da[i],
3059 devaddrs[i] ? &devaddrs[i] : &hostaddrs[i],
3060 sizeof (void *));
3061
3062 struct hsa_kernel_description *hsa_kernel_desc = NULL;
3063 for (unsigned i = 0; i < kernel->module->image_desc->kernel_count; i++)
3064 {
3065 struct hsa_kernel_description *d
3066 = &kernel->module->image_desc->kernel_infos[i];
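/* Pointer comparison is intentional: kernel->name was set from this
   same kernel_infos[] array in init_basic_kernel_info, so the matching
   descriptor shares the pointer.  */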
3067 if (d->name == kernel->name)
3068 {
3069 hsa_kernel_desc = d;
3070 break;
3071 }
3072 }
3073
3074 /* We may have statically-determined dimensions in
3075 hsa_kernel_desc->oacc_dims[] or dimensions passed to this offload kernel
3076 invocation at runtime in dims[]. We allow static dimensions to take
3077 priority over dynamic dimensions when present (non-zero). */
3078 if (hsa_kernel_desc->oacc_dims[0] > 0)
3079 dims[0] = hsa_kernel_desc->oacc_dims[0];
3080 if (hsa_kernel_desc->oacc_dims[1] > 0)
3081 dims[1] = hsa_kernel_desc->oacc_dims[1];
3082 if (hsa_kernel_desc->oacc_dims[2] > 0)
3083 dims[2] = hsa_kernel_desc->oacc_dims[2];
3084
3085 /* Ideally, when a dimension isn't explicitly specified, we should
3086 tune it to run 40 (or 32?) threads per CU with no threads getting queued.
3087 In practice, we tune for peak performance on BabelStream, which
3088 for OpenACC is currently 32 threads per CU. */
3089 if (dims[0] == 0 && dims[1] == 0)
3090 {
3091 /* If any of the OpenACC dimensions remain 0 then we get to pick a
3092 number. There isn't really a correct answer for this without a clue
3093 about the problem size, so let's do a reasonable number of workers
3094 and gangs. */
3095
3096 dims[0] = get_cu_count (kernel->agent) * 4; /* Gangs. */
3097 dims[1] = 8; /* Workers. */
3098 }
3099 else if (dims[0] == 0 && dims[1] > 0)
3100 {
3101 /* Auto-scale the number of gangs with the requested number of workers. */
3102 dims[0] = get_cu_count (kernel->agent) * (32 / dims[1]);
3103 }
3104 else if (dims[0] > 0 && dims[1] == 0)
3105 {
3106 /* Auto-scale the number of workers with the requested number of gangs. */
3107 dims[1] = get_cu_count (kernel->agent) * 32 / dims[0];
3108 if (dims[1] == 0)
3109 dims[1] = 1;
3110 if (dims[1] > 16)
3111 dims[1] = 16;
3112 }
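
/* Worked example, assuming a hypothetical 40-CU device: with no
   dimensions given we launch 160 gangs of 8 workers; with 16 workers
   requested we launch 40 * (32/16) = 80 gangs; with 64 gangs requested
   we compute 40 * 32 / 64 = 20 workers and clamp that to 16.  */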
3113
3114 /* The incoming dimensions are expressed in terms of gangs, workers, and
3115 vectors. The HSA dimensions are expressed in terms of "work-items",
3116 which means multiples of vector lanes.
3117
3118 The "grid size" specifies the size of the problem space, and the
3119 "work-group size" specifies how much of that we want a single compute
3120 unit to chew on at once.
3121
3122 The three dimensions do not really correspond to hardware, but the
3123 important thing is that the HSA runtime will launch as many
3124 work-groups as it takes to process the entire grid, and each
3125 work-group will contain as many wave-fronts as it takes to process
3126 the work-items in that group.
3127
3128 Essentially, as long as we set the Y dimension to 64 (the number of
3129 vector lanes in hardware), and the Z group size to the maximum (16),
3130 then we will get the gangs (X) and workers (Z) launched as we expect.
3131
3132 The reason for the apparent reversal of vector and worker dimension
3133 order is to do with the way the run-time distributes work-items across
3134 v1 and v2. */
3135 struct GOMP_kernel_launch_attributes kla =
3136 {3,
3137 /* Grid size. */
3138 {dims[0], 64, dims[1]},
3139 /* Work-group size. */
3140 {1, 64, 16}
3141 };
3142
3143 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
3144 acc_prof_info *prof_info = thr->prof_info;
3145 acc_event_info enqueue_launch_event_info;
3146 acc_api_info *api_info = thr->api_info;
3147 bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
3148 if (profiling_dispatch_p)
3149 {
3150 prof_info->event_type = acc_ev_enqueue_launch_start;
3151
3152 enqueue_launch_event_info.launch_event.event_type
3153 = prof_info->event_type;
3154 enqueue_launch_event_info.launch_event.valid_bytes
3155 = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
3156 enqueue_launch_event_info.launch_event.parent_construct
3157 = acc_construct_parallel;
3158 enqueue_launch_event_info.launch_event.implicit = 1;
3159 enqueue_launch_event_info.launch_event.tool_info = NULL;
3160 enqueue_launch_event_info.launch_event.kernel_name
3161 = (char *) kernel->name;
3162 enqueue_launch_event_info.launch_event.num_gangs = kla.gdims[0];
3163 enqueue_launch_event_info.launch_event.num_workers = kla.gdims[2];
3164 enqueue_launch_event_info.launch_event.vector_length = kla.gdims[1];
3165
3166 api_info->device_api = acc_device_api_other;
3167
3168 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3169 &enqueue_launch_event_info, api_info);
3170 }
3171
3172 if (!async)
3173 {
3174 run_kernel (kernel, ind_da, &kla, NULL, false);
3175 gomp_offload_free (ind_da);
3176 }
3177 else
3178 {
3179 queue_push_launch (aq, kernel, ind_da, &kla);
3180 if (DEBUG_QUEUES)
3181 GCN_DEBUG ("queue_push_callback %d:%d gomp_offload_free, %p\n",
3182 aq->agent->device_id, aq->id, ind_da);
3183 queue_push_callback (aq, gomp_offload_free, ind_da);
3184 }
3185
3186 if (profiling_dispatch_p)
3187 {
3188 prof_info->event_type = acc_ev_enqueue_launch_end;
3189 enqueue_launch_event_info.launch_event.event_type = prof_info->event_type;
3190 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3191 &enqueue_launch_event_info,
3192 api_info);
3193 }
3194 }
3195
3196 /* }}} */
3197 /* {{{ Generic Plugin API */
3198
3199 /* Return the name of the accelerator, which is "gcn". */
3200
3201 const char *
3202 GOMP_OFFLOAD_get_name (void)
3203 {
3204 return "gcn";
3205 }
3206
3207 /* Return the specific capabilities the HSA accelerator has. */
3208
3209 unsigned int
3210 GOMP_OFFLOAD_get_caps (void)
3211 {
3212 /* FIXME: Enable shared memory for APU, but not discrete GPU. */
3213 return /*GOMP_OFFLOAD_CAP_SHARED_MEM |*/ GOMP_OFFLOAD_CAP_OPENMP_400
3214 | GOMP_OFFLOAD_CAP_OPENACC_200;
3215 }
3216
3217 /* Identify as GCN accelerator. */
3218
3219 int
3220 GOMP_OFFLOAD_get_type (void)
3221 {
3222 return OFFLOAD_TARGET_TYPE_GCN;
3223 }
3224
3225 /* Return the libgomp version number we're compatible with. There is
3226 no requirement for cross-version compatibility. */
3227
3228 unsigned
3229 GOMP_OFFLOAD_version (void)
3230 {
3231 return GOMP_VERSION;
3232 }
3233
3234 /* Return the number of GCN devices on the system. */
3235
3236 int
3237 GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask)
3238 {
3239 if (!init_hsa_context ())
3240 return 0;
3241 /* Return -1 if the omp_requires_mask cannot be fulfilled, but
3242 devices were present. */
3243 if (hsa_context.agent_count > 0 && omp_requires_mask != 0)
3244 return -1;
3245 return hsa_context.agent_count;
3246 }
3247
3248 /* Initialize device (agent) number N so that it can be used for computation.
3249 Return TRUE on success. */
3250
3251 bool
3252 GOMP_OFFLOAD_init_device (int n)
3253 {
3254 if (!init_hsa_context ())
3255 return false;
3256 if (n >= hsa_context.agent_count)
3257 {
3258 GOMP_PLUGIN_error ("Request to initialize non-existent GCN device %i", n);
3259 return false;
3260 }
3261 struct agent_info *agent = &hsa_context.agents[n];
3262
3263 if (agent->initialized)
3264 return true;
3265
3266 agent->device_id = n;
3267
3268 if (pthread_rwlock_init (&agent->module_rwlock, NULL))
3269 {
3270 GOMP_PLUGIN_error ("Failed to initialize a GCN agent rwlock");
3271 return false;
3272 }
3273 if (pthread_mutex_init (&agent->prog_mutex, NULL))
3274 {
3275 GOMP_PLUGIN_error ("Failed to initialize a GCN agent program mutex");
3276 return false;
3277 }
3278 if (pthread_mutex_init (&agent->async_queues_mutex, NULL))
3279 {
3280 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
3281 return false;
3282 }
3283 if (pthread_mutex_init (&agent->team_arena_write_lock, NULL))
3284 {
3285 GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
3286 return false;
3287 }
3288 agent->async_queues = NULL;
3289 agent->omp_async_queue = NULL;
3290 agent->team_arena_list = NULL;
3291
3292 uint32_t queue_size;
3293 hsa_status_t status;
3294 status = hsa_fns.hsa_agent_get_info_fn (agent->id,
3295 HSA_AGENT_INFO_QUEUE_MAX_SIZE,
3296 &queue_size);
3297 if (status != HSA_STATUS_SUCCESS)
3298 return hsa_error ("Error requesting maximum queue size of the GCN agent",
3299 status);
3300
3301 status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_NAME,
3302 &agent->name);
3303 if (status != HSA_STATUS_SUCCESS)
3304 return hsa_error ("Error querying the name of the agent", status);
3305
3306 agent->device_isa = isa_code (agent->name);
3307 if (agent->device_isa < 0)
3308 return hsa_error ("Unknown GCN agent architecture", HSA_STATUS_ERROR);
3309
3310 status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_VENDOR_NAME,
3311 &agent->vendor_name);
3312 if (status != HSA_STATUS_SUCCESS)
3313 return hsa_error ("Error querying the vendor name of the agent", status);
3314
3315 status = hsa_fns.hsa_queue_create_fn (agent->id, queue_size,
3316 HSA_QUEUE_TYPE_MULTI,
3317 hsa_queue_callback, NULL, UINT32_MAX,
3318 UINT32_MAX, &agent->sync_queue);
3319 if (status != HSA_STATUS_SUCCESS)
3320 return hsa_error ("Error creating command queue", status);
3321
3322 agent->kernarg_region.handle = (uint64_t) -1;
3323 status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
3324 get_kernarg_memory_region,
3325 &agent->kernarg_region);
3326 if (status != HSA_STATUS_SUCCESS
3327 && status != HSA_STATUS_INFO_BREAK)
3328 hsa_error ("Scanning memory regions failed", status);
3329 if (agent->kernarg_region.handle == (uint64_t) -1)
3330 {
3331 GOMP_PLUGIN_error ("Could not find suitable memory region for kernel "
3332 "arguments");
3333 return false;
3334 }
3335 GCN_DEBUG ("Selected kernel arguments memory region:\n");
3336 dump_hsa_region (agent->kernarg_region, NULL);
3337
3338 agent->data_region.handle = (uint64_t) -1;
3339 status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
3340 get_data_memory_region,
3341 &agent->data_region);
3342 if (status != HSA_STATUS_SUCCESS
3343 && status != HSA_STATUS_INFO_BREAK)
3344 hsa_error ("Scanning memory regions failed", status);
3345 if (agent->data_region.handle == (uint64_t) -1)
3346 {
3347 GOMP_PLUGIN_error ("Could not find suitable memory region for device "
3348 "data");
3349 return false;
3350 }
3351 GCN_DEBUG ("Selected device data memory region:\n");
3352 dump_hsa_region (agent->data_region, NULL);
3353
3354 GCN_DEBUG ("GCN agent %d initialized\n", n);
3355
3356 agent->initialized = true;
3357 return true;
3358 }
3359
3360 /* Load GCN object-code module described by struct gcn_image_desc in
3361 TARGET_DATA and return references to kernel descriptors in TARGET_TABLE.
3362 If there are any constructors then run them. If not NULL, REV_FN_TABLE will
3363 contain the on-device addresses of the functions for reverse offload. To be
3364 freed by the caller. */
3365
3366 int
3367 GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
3368 struct addr_pair **target_table,
3369 uint64_t **rev_fn_table)
3370 {
3371 if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
3372 {
3373 GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3374 " (expected %u, received %u)",
3375 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
3376 return -1;
3377 }
3378
3379 struct gcn_image_desc *image_desc = (struct gcn_image_desc *) target_data;
3380 struct agent_info *agent;
3381 struct addr_pair *pair;
3382 struct module_info *module;
3383 struct kernel_info *kernel;
3384 int kernel_count = image_desc->kernel_count;
3385 unsigned var_count = image_desc->global_variable_count;
3386 /* Currently, "others" is a struct of ICVS. */
3387 int other_count = 1;
3388
3389 agent = get_agent_info (ord);
3390 if (!agent)
3391 return -1;
3392
3393 if (pthread_rwlock_wrlock (&agent->module_rwlock))
3394 {
3395 GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3396 return -1;
3397 }
3398 if (agent->prog_finalized
3399 && !destroy_hsa_program (agent))
3400 return -1;
3401
3402 GCN_DEBUG ("Encountered %d kernels in an image\n", kernel_count);
3403 GCN_DEBUG ("Encountered %u global variables in an image\n", var_count);
3404 GCN_DEBUG ("Expect %d other variables in an image\n", other_count);
3405 pair = GOMP_PLUGIN_malloc ((kernel_count + var_count + other_count - 2)
3406 * sizeof (struct addr_pair));
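/* The "- 2" assumes the image contains both an _init_array and a
   _fini_array pseudo-kernel (see below), which are not reported in the
   table.  */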
3407 *target_table = pair;
3408 module = (struct module_info *)
3409 GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info)
3410 + kernel_count * sizeof (struct kernel_info));
3411 module->image_desc = image_desc;
3412 module->kernel_count = kernel_count;
3413 module->heap = NULL;
3414 module->constructors_run_p = false;
3415
3416 kernel = &module->kernels[0];
3417
3418 /* Allocate memory for kernel dependencies. */
3419 for (unsigned i = 0; i < kernel_count; i++)
3420 {
3421 struct hsa_kernel_description *d = &image_desc->kernel_infos[i];
3422 if (!init_basic_kernel_info (kernel, d, agent, module))
3423 return -1;
3424 if (strcmp (d->name, "_init_array") == 0)
3425 module->init_array_func = kernel;
3426 else if (strcmp (d->name, "_fini_array") == 0)
3427 module->fini_array_func = kernel;
3428 else
3429 {
3430 pair->start = (uintptr_t) kernel;
3431 pair->end = (uintptr_t) (kernel + 1);
3432 pair++;
3433 }
3434 kernel++;
3435 }
3436
3437 agent->module = module;
3438 if (pthread_rwlock_unlock (&agent->module_rwlock))
3439 {
3440 GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3441 return -1;
3442 }
3443
3444 if (!create_and_finalize_hsa_program (agent))
3445 return -1;
3446
3447 if (var_count > 0)
3448 {
3449 hsa_status_t status;
3450 hsa_executable_symbol_t var_symbol;
3451 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3452 ".offload_var_table",
3453 agent->id,
3454 0, &var_symbol);
3455
3456 if (status != HSA_STATUS_SUCCESS)
3457 hsa_fatal ("Could not find symbol for variable in the code object",
3458 status);
3459
3460 uint64_t var_table_addr;
3461 status = hsa_fns.hsa_executable_symbol_get_info_fn
3462 (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3463 &var_table_addr);
3464 if (status != HSA_STATUS_SUCCESS)
3465 hsa_fatal ("Could not extract a variable from its symbol", status);
3466
3467 struct {
3468 uint64_t addr;
3469 uint64_t size;
3470 } var_table[var_count];
3471 GOMP_OFFLOAD_dev2host (agent->device_id, var_table,
3472 (void*)var_table_addr, sizeof (var_table));
3473
3474 for (unsigned i = 0; i < var_count; i++)
3475 {
3476 pair->start = var_table[i].addr;
3477 pair->end = var_table[i].addr + var_table[i].size;
3478 GCN_DEBUG ("Found variable at %p with size %lu\n",
3479 (void *)var_table[i].addr, var_table[i].size);
3480 pair++;
3481 }
3482 }
3483
3484 GCN_DEBUG ("Looking for variable %s\n", XSTRING (GOMP_ADDITIONAL_ICVS));
3485
3486 hsa_status_t status;
3487 hsa_executable_symbol_t var_symbol;
3488 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3489 XSTRING (GOMP_ADDITIONAL_ICVS),
3490 agent->id, 0, &var_symbol);
3491 if (status == HSA_STATUS_SUCCESS)
3492 {
3493 uint64_t varptr;
3494 uint32_t varsize;
3495
3496 status = hsa_fns.hsa_executable_symbol_get_info_fn
3497 (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3498 &varptr);
3499 if (status != HSA_STATUS_SUCCESS)
3500 hsa_fatal ("Could not extract a variable from its symbol", status);
3501 status = hsa_fns.hsa_executable_symbol_get_info_fn
3502 (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
3503 &varsize);
3504 if (status != HSA_STATUS_SUCCESS)
3505 hsa_fatal ("Could not extract a variable size from its symbol",
3506 status);
3507
3508 pair->start = varptr;
3509 pair->end = varptr + varsize;
3510 }
3511 else
3512 {
3513 /* The variable was not in this image. */
3514 GCN_DEBUG ("Variable not found in image: %s\n",
3515 XSTRING (GOMP_ADDITIONAL_ICVS));
3516 pair->start = pair->end = 0;
3517 }
3518
3519 /* Ensure that constructors are run first. */
3520 struct GOMP_kernel_launch_attributes kla =
3521 { 3,
3522 /* Grid size. */
3523 { 1, 64, 1 },
3524 /* Work-group size. */
3525 { 1, 64, 1 }
3526 };
3527
3528 if (module->init_array_func)
3529 {
3530 init_kernel (module->init_array_func);
3531 run_kernel (module->init_array_func, NULL, &kla, NULL, false);
3532 }
3533 module->constructors_run_p = true;
3534
3535 /* Don't report kernels that libgomp need not know about. */
3536 if (module->init_array_func)
3537 kernel_count--;
3538 if (module->fini_array_func)
3539 kernel_count--;
3540
3541 if (rev_fn_table != NULL && kernel_count == 0)
3542 *rev_fn_table = NULL;
3543 else if (rev_fn_table != NULL)
3544 {
3545 hsa_status_t status;
3546 hsa_executable_symbol_t var_symbol;
3547 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3548 ".offload_func_table",
3549 agent->id, 0, &var_symbol);
3550 if (status != HSA_STATUS_SUCCESS)
3551 hsa_fatal ("Could not find symbol for variable in the code object",
3552 status);
3553 uint64_t fn_table_addr;
3554 status = hsa_fns.hsa_executable_symbol_get_info_fn
3555 (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3556 &fn_table_addr);
3557 if (status != HSA_STATUS_SUCCESS)
3558 hsa_fatal ("Could not extract a variable from its symbol", status);
3559 *rev_fn_table = GOMP_PLUGIN_malloc (kernel_count * sizeof (uint64_t));
3560 GOMP_OFFLOAD_dev2host (agent->device_id, *rev_fn_table,
3561 (void*) fn_table_addr,
3562 kernel_count * sizeof (uint64_t));
3563 }
3564
3565 return kernel_count + var_count + other_count;
3566 }
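/* A hedged sketch of how the table malloc'd into *REV_FN_TABLE above is
   meant to be consumed by the caller; DEV, VERSION, TARGET_DATA, TABLE and
   IDX are illustrative placeholders, not names defined in this file, and
   the caller is assumed to own (and free) the returned table:

     uint64_t *rev_fns = NULL;
     int n = GOMP_OFFLOAD_load_image (DEV, VERSION, TARGET_DATA, &TABLE,
                                      &rev_fns);
     if (n >= 0 && rev_fns != NULL)
       {
         uint64_t dev_addr = rev_fns[IDX];   device address of the IDX'th
                                             function, for reverse offload
         free (rev_fns);
       }
*/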
3567
3568 /* Unload GCN object-code module described by struct gcn_image_desc in
3569 TARGET_DATA from agent number N. Return TRUE on success. */
3570
3571 bool
3572 GOMP_OFFLOAD_unload_image (int n, unsigned version, const void *target_data)
3573 {
3574 if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
3575 {
3576 GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3577 " (expected %u, received %u)",
3578 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
3579 return false;
3580 }
3581
3582 struct agent_info *agent;
3583 agent = get_agent_info (n);
3584 if (!agent)
3585 return false;
3586
3587 if (pthread_rwlock_wrlock (&agent->module_rwlock))
3588 {
3589 GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3590 return false;
3591 }
3592
3593 if (!agent->module || agent->module->image_desc != target_data)
3594 {
3595 GOMP_PLUGIN_error ("Attempt to unload an image that has never been "
3596 "loaded before");
3597 pthread_rwlock_unlock (&agent->module_rwlock); return false;
3598 }
3599
3600 if (!destroy_module (agent->module, true))
3601 { pthread_rwlock_unlock (&agent->module_rwlock); return false; }
3602 free (agent->module);
3603 agent->module = NULL;
3604 if (!destroy_hsa_program (agent))
3605 { pthread_rwlock_unlock (&agent->module_rwlock); return false; }
3606 if (pthread_rwlock_unlock (&agent->module_rwlock))
3607 {
3608 GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3609 return false;
3610 }
3611 return true;
3612 }
3613
3614 /* Deinitialize all information and status associated with agent number N. We
3615 do not attempt any synchronization, assuming the user and libgomp will not
3616 attempt deinitialization of a device that is in any way being used at the
3617 same time. Return TRUE on success. */
3618
3619 bool
3620 GOMP_OFFLOAD_fini_device (int n)
3621 {
3622 struct agent_info *agent = get_agent_info (n);
3623 if (!agent)
3624 return false;
3625
3626 if (!agent->initialized)
3627 return true;
3628
3629 if (agent->omp_async_queue)
3630 {
3631 GOMP_OFFLOAD_openacc_async_destruct (agent->omp_async_queue);
3632 agent->omp_async_queue = NULL;
3633 }
3634
3635 if (agent->module)
3636 {
3637 if (!destroy_module (agent->module, false))
3638 return false;
3639 free (agent->module);
3640 agent->module = NULL;
3641 }
3642
3643 if (!destroy_team_arenas (agent))
3644 return false;
3645
3646 if (!destroy_hsa_program (agent))
3647 return false;
3648
3649 hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (agent->sync_queue);
3650 if (status != HSA_STATUS_SUCCESS)
3651 return hsa_error ("Error destroying command queue", status);
3652
3653 if (pthread_mutex_destroy (&agent->prog_mutex))
3654 {
3655 GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex");
3656 return false;
3657 }
3658 if (pthread_rwlock_destroy (&agent->module_rwlock))
3659 {
3660 GOMP_PLUGIN_error ("Failed to destroy a GCN agent rwlock");
3661 return false;
3662 }
3663
3664 if (pthread_mutex_destroy (&agent->async_queues_mutex))
3665 {
3666 GOMP_PLUGIN_error ("Failed to destroy a GCN agent queue mutex");
3667 return false;
3668 }
3669 if (pthread_mutex_destroy (&agent->team_arena_write_lock))
3670 {
3671 GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex");
3672 return false;
3673 }
3674 agent->initialized = false;
3675 return true;
3676 }
3677
3678 /* Return true if the HSA runtime can run function FN_PTR. */
3679
3680 bool
3681 GOMP_OFFLOAD_can_run (void *fn_ptr)
3682 {
3683 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3684
3685 init_kernel (kernel);
3686 if (kernel->initialization_failed)
3687 goto failure;
3688
3689 return true;
3690
3691 failure:
3692 if (suppress_host_fallback)
3693 GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
3694 GCN_WARNING ("GCN target cannot be launched, doing a host fallback\n");
3695 return false;
3696 }
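/* A minimal sketch of the intended caller-side contract, under the
   assumption (consistent with the warning above) that a false return
   routes execution to the host version of the offloaded region:

     if (!GOMP_OFFLOAD_can_run (kernel_fn_ptr))
       host_version (args);   hypothetical host fallback path
*/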
3697
3698 /* Allocate memory on device N. */
3699
3700 void *
3701 GOMP_OFFLOAD_alloc (int n, size_t size)
3702 {
3703 struct agent_info *agent = get_agent_info (n);
3704 return alloc_by_agent (agent, size);
3705 }
3706
3707 /* Free memory from device N. */
3708
3709 bool
3710 GOMP_OFFLOAD_free (int device, void *ptr)
3711 {
3712 GCN_DEBUG ("Freeing memory on device %d\n", device);
3713
3714 hsa_status_t status = hsa_fns.hsa_memory_free_fn (ptr);
3715 if (status != HSA_STATUS_SUCCESS)
3716 {
3717 hsa_error ("Could not free device memory", status);
3718 return false;
3719 }
3720
3721 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
3722 bool profiling_dispatch_p
3723 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
3724 if (profiling_dispatch_p)
3725 {
3726 acc_prof_info *prof_info = thr->prof_info;
3727 acc_event_info data_event_info;
3728 acc_api_info *api_info = thr->api_info;
3729
3730 prof_info->event_type = acc_ev_free;
3731
3732 data_event_info.data_event.event_type = prof_info->event_type;
3733 data_event_info.data_event.valid_bytes
3734 = _ACC_DATA_EVENT_INFO_VALID_BYTES;
3735 data_event_info.data_event.parent_construct
3736 = acc_construct_parallel;
3737 data_event_info.data_event.implicit = 1;
3738 data_event_info.data_event.tool_info = NULL;
3739 data_event_info.data_event.var_name = NULL;
3740 data_event_info.data_event.bytes = 0;
3741 data_event_info.data_event.host_ptr = NULL;
3742 data_event_info.data_event.device_ptr = (void *) ptr;
3743
3744 api_info->device_api = acc_device_api_other;
3745
3746 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
3747 api_info);
3748 }
3749
3750 return true;
3751 }
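/* The profiling dispatch above only fires when a tool has registered a
   callback for the event.  A hypothetical tool-side sketch using the
   OpenACC profiling interface (acc_prof.h names; nothing below is defined
   in this plugin):

     #include <acc_prof.h>
     #include <stdio.h>

     static void
     on_free (acc_prof_info *pi, acc_event_info *ei, acc_api_info *ai)
     {
       fprintf (stderr, "acc_ev_free: device_ptr=%p\n",
                ei->data_event.device_ptr);
     }

     acc_prof_register (acc_ev_free, on_free, acc_reg);
*/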
3752
3753 /* Copy data from DEVICE to host. */
3754
3755 bool
3756 GOMP_OFFLOAD_dev2host (int device, void *dst, const void *src, size_t n)
3757 {
3758 GCN_DEBUG ("Copying %zu bytes from device %d (%p) to host (%p)\n", n, device,
3759 src, dst);
3760 hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
3761 if (status != HSA_STATUS_SUCCESS)
3762 return hsa_error ("memory copy failed", status);
3763 return true;
3764 }
3765
3766 /* Copy data from host to DEVICE. */
3767
3768 bool
3769 GOMP_OFFLOAD_host2dev (int device, void *dst, const void *src, size_t n)
3770 {
3771 GCN_DEBUG ("Copying %zu bytes from host (%p) to device %d (%p)\n", n, src,
3772 device, dst);
3773 hsa_memory_copy_wrapper (dst, src, n);
3774 return true;
3775 }
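/* A round-trip sketch combining the entry points above; DEV is a
   placeholder device number, not a name defined in this file:

     int out[4], in[4] = { 1, 2, 3, 4 };
     void *d = GOMP_OFFLOAD_alloc (DEV, sizeof in);
     GOMP_OFFLOAD_host2dev (DEV, d, in, sizeof in);
     GOMP_OFFLOAD_dev2host (DEV, out, d, sizeof out);
     GOMP_OFFLOAD_free (DEV, d);

   after which OUT mirrors IN.  */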
3776
3777 /* Copy data within DEVICE. Do the copy asynchronously, if appropriate. */
3778
3779 bool
3780 GOMP_OFFLOAD_dev2dev (int device, void *dst, const void *src, size_t n)
3781 {
3782 struct gcn_thread *thread_data = gcn_thread ();
3783
3784 if (thread_data && !async_synchronous_p (thread_data->async))
3785 {
3786 struct agent_info *agent = get_agent_info (device);
3787 maybe_init_omp_async (agent);
3788 queue_push_copy (agent->omp_async_queue, dst, src, n);
3789 return true;
3790 }
3791
3792 GCN_DEBUG ("Copying %zu bytes from device %d (%p) to device %d (%p)\n", n,
3793 device, src, device, dst);
3794 hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
3795 if (status != HSA_STATUS_SUCCESS)
3796 return hsa_error ("memory copy failed", status);
3797 return true;
3798 }
3799
3800 /* }}} */
3801 /* {{{ OpenMP Plugin API */
3802
3803 /* Run a synchronous OpenMP kernel on DEVICE and pass it an array of pointers
3804 in VARS as a parameter. The kernel is identified by FN_PTR which must point
3805 to a kernel_info structure, and must have previously been loaded to the
3806 specified device. */
3807
3808 void
3809 GOMP_OFFLOAD_run (int device, void *fn_ptr, void *vars, void **args)
3810 {
3811 struct agent_info *agent = get_agent_info (device);
3812 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3813 struct GOMP_kernel_launch_attributes def;
3814 struct GOMP_kernel_launch_attributes *kla;
3815 assert (agent == kernel->agent);
3816
3817 /* If we get here then the kernel must be OpenMP. */
3818 kernel->kind = KIND_OPENMP;
3819
3820 if (!parse_target_attributes (args, &def, &kla, agent))
3821 {
3822 GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
3823 return;
3824 }
3825 run_kernel (kernel, vars, kla, NULL, false);
3826 }
3827
3828 /* Run an asynchronous OpenMP kernel on DEVICE. This is similar to
3829 GOMP_OFFLOAD_run except that the launch is queued and there is a call to
3830 GOMP_PLUGIN_target_task_completion when it has finished. */
3831
3832 void
3833 GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
3834 void **args, void *async_data)
3835 {
3836 GCN_DEBUG ("GOMP_OFFLOAD_async_run invoked\n");
3837 struct agent_info *agent = get_agent_info (device);
3838 struct kernel_info *kernel = (struct kernel_info *) tgt_fn;
3839 struct GOMP_kernel_launch_attributes def;
3840 struct GOMP_kernel_launch_attributes *kla;
3841 assert (agent == kernel->agent);
3842
3843 /* If we get here then the kernel must be OpenMP. */
3844 kernel->kind = KIND_OPENMP;
3845
3846 if (!parse_target_attributes (args, &def, &kla, agent))
3847 {
3848 GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
3849 return;
3850 }
3851
3852 maybe_init_omp_async (agent);
3853 queue_push_launch (agent->omp_async_queue, kernel, tgt_vars, kla);
3854 queue_push_callback (agent->omp_async_queue,
3855 GOMP_PLUGIN_target_task_completion, async_data);
3856 }
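/* After the two pushes above, the agent's OpenMP async queue holds, in
   order:

     ... | LAUNCH (kernel, tgt_vars, kla)
         | CALLBACK (GOMP_PLUGIN_target_task_completion, async_data)

   so the drain thread cannot signal completion to libgomp before the
   kernel itself has finished.  */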
3857
3858 /* }}} */
3859 /* {{{ OpenACC Plugin API */
3860
3861 /* Run a synchronous OpenACC kernel. The device number is inferred from the
3862 already-loaded KERNEL. */
3863
3864 void
3865 GOMP_OFFLOAD_openacc_exec (void (*fn_ptr) (void *), size_t mapnum,
3866 void **hostaddrs, void **devaddrs, unsigned *dims,
3867 void *targ_mem_desc)
3868 {
3869 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3870
3871 gcn_exec (kernel, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, false,
3872 NULL);
3873 }
3874
3875 /* Run an asynchronous OpenACC kernel on the specified queue. */
3876
3877 void
3878 GOMP_OFFLOAD_openacc_async_exec (void (*fn_ptr) (void *), size_t mapnum,
3879 void **hostaddrs, void **devaddrs,
3880 unsigned *dims, void *targ_mem_desc,
3881 struct goacc_asyncqueue *aq)
3882 {
3883 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3884
3885 gcn_exec (kernel, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, true,
3886 aq);
3887 }
3888
3889 /* Create a new asynchronous thread and queue for running future kernels. */
3890
3891 struct goacc_asyncqueue *
3892 GOMP_OFFLOAD_openacc_async_construct (int device)
3893 {
3894 struct agent_info *agent = get_agent_info (device);
3895
3896 pthread_mutex_lock (&agent->async_queues_mutex);
3897
3898 struct goacc_asyncqueue *aq = GOMP_PLUGIN_malloc (sizeof (*aq));
3899 aq->agent = agent;
3900 aq->prev = NULL;
3901 aq->next = agent->async_queues;
3902 if (aq->next)
3903 {
3904 aq->next->prev = aq;
3905 aq->id = aq->next->id + 1;
3906 }
3907 else
3908 aq->id = 1;
3909 agent->async_queues = aq;
3910
3911 aq->queue_first = 0;
3912 aq->queue_n = 0;
3913 aq->drain_queue_stop = 0;
3914
3915 if (pthread_mutex_init (&aq->mutex, NULL))
3916 {
3917 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
3918 return NULL;
3919 }
3920 if (pthread_cond_init (&aq->queue_cond_in, NULL))
3921 {
3922 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
3923 return NULL;
3924 }
3925 if (pthread_cond_init (&aq->queue_cond_out, NULL))
3926 {
3927 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
3928 return NULL;
3929 }
3930
3931 hsa_status_t status = hsa_fns.hsa_queue_create_fn (agent->id,
3932 ASYNC_QUEUE_SIZE,
3933 HSA_QUEUE_TYPE_MULTI,
3934 hsa_queue_callback, NULL,
3935 UINT32_MAX, UINT32_MAX,
3936 &aq->hsa_queue);
3937 if (status != HSA_STATUS_SUCCESS)
3938 hsa_fatal ("Error creating command queue", status);
3939
3940 int err = pthread_create (&aq->thread_drain_queue, NULL, &drain_queue, aq);
3941 if (err != 0)
3942 GOMP_PLUGIN_fatal ("GCN asynchronous thread creation failed: %s",
3943 strerror (err));
3944 GCN_DEBUG ("Async thread %d:%d: created\n", aq->agent->device_id,
3945 aq->id);
3946
3947 pthread_mutex_unlock (&agent->async_queues_mutex);
3948
3949 return aq;
3950 }
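/* Typical lifecycle of a queue created here (a sketch; DEV, DST, SRC and
   N are placeholders):

     struct goacc_asyncqueue *aq
       = GOMP_OFFLOAD_openacc_async_construct (DEV);
     GOMP_OFFLOAD_openacc_async_host2dev (DEV, DST, SRC, N, aq);
     GOMP_OFFLOAD_openacc_async_synchronize (aq);
     GOMP_OFFLOAD_openacc_async_destruct (aq);
*/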
3951
3952 /* Destroy an existing asynchronous thread and queue. Waits for any
3953 currently-running task to complete, but cancels any queued tasks. */
3954
3955 bool
3956 GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
3957 {
3958 struct agent_info *agent = aq->agent;
3959
3960 finalize_async_thread (aq);
3961
3962 pthread_mutex_lock (&agent->async_queues_mutex);
3963
3964 int err;
3965 if ((err = pthread_mutex_destroy (&aq->mutex)))
3966 {
3967 GOMP_PLUGIN_error ("Failed to destroy a GCN async queue mutex: %d", err);
3968 goto fail;
3969 }
3970 if (pthread_cond_destroy (&aq->queue_cond_in))
3971 {
3972 GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
3973 goto fail;
3974 }
3975 if (pthread_cond_destroy (&aq->queue_cond_out))
3976 {
3977 GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
3978 goto fail;
3979 }
3980 hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (aq->hsa_queue);
3981 if (status != HSA_STATUS_SUCCESS)
3982 {
3983 hsa_error ("Error destroying command queue", status);
3984 goto fail;
3985 }
3986
3987 if (aq->prev)
3988 aq->prev->next = aq->next;
3989 if (aq->next)
3990 aq->next->prev = aq->prev;
3991 if (agent->async_queues == aq)
3992 agent->async_queues = aq->next;
3993
3994 GCN_DEBUG ("Async thread %d:%d: destroyed\n", agent->device_id, aq->id);
3995
3996 free (aq);
3997 pthread_mutex_unlock (&agent->async_queues_mutex);
3998 return true;
3999
4000 fail:
4001 pthread_mutex_unlock (&agent->async_queues_mutex);
4002 return false;
4003 }
4004
4005 /* Return true if the specified async queue is currently empty. */
4006
4007 int
4008 GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
4009 {
4010 return queue_empty (aq);
4011 }
4012
4013 /* Block until the specified queue has executed all its tasks and the
4014 queue is empty. */
4015
4016 bool
4017 GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
4018 {
4019 wait_queue (aq);
4020 return true;
4021 }
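/* The two hooks above are the polling and blocking forms of the same
   query; a hypothetical caller could overlap host work like so (with
   do_host_work a placeholder):

     while (!GOMP_OFFLOAD_openacc_async_test (aq))
       do_host_work ();
*/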
4022
4023 /* Add a serialization point across two async queues. Any new tasks added to
4024 AQ2, after this call, will not run until all tasks on AQ1, at the time
4025 of this call, have completed. */
4026
4027 bool
4028 GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
4029 struct goacc_asyncqueue *aq2)
4030 {
4031 /* For serialize, stream aq2 waits for aq1 to complete work that has been
4032 scheduled to run on it up to this point. */
4033 if (aq1 != aq2)
4034 {
4035 struct placeholder *placeholderp = queue_push_placeholder (aq1);
4036 queue_push_asyncwait (aq2, placeholderp);
4037 }
4038 return true;
4039 }
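/* A sketch of the resulting queue states, with P the placeholder pushed
   to AQ1:

     aq1: | earlier work ... | P |
     aq2: | earlier work ... | WAIT(P) | anything queued after this call |

   AQ2's drain thread blocks on WAIT(P) until AQ1 executes P.  */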
4040
4041 /* Add an opaque callback to the given async queue. */
4042
4043 void
4044 GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
4045 void (*fn) (void *), void *data)
4046 {
4047 queue_push_callback (aq, fn, data);
4048 }
4049
4050 /* Queue up an asynchronous data copy from host to DEVICE. */
4051
4052 bool
4053 GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
4054 size_t n, struct goacc_asyncqueue *aq)
4055 {
4056 struct agent_info *agent = get_agent_info (device);
4057 assert (agent == aq->agent);
4058 queue_push_copy (aq, dst, src, n);
4059 return true;
4060 }
4061
4062 /* Queue up an asynchronous data copy from DEVICE to host. */
4063
4064 bool
4065 GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src,
4066 size_t n, struct goacc_asyncqueue *aq)
4067 {
4068 struct agent_info *agent = get_agent_info (device);
4069 assert (agent == aq->agent);
4070 queue_push_copy (aq, dst, src, n);
4071 return true;
4072 }
4073
4074 union goacc_property_value
4075 GOMP_OFFLOAD_openacc_get_property (int device, enum goacc_property prop)
4076 {
4077 struct agent_info *agent = get_agent_info (device);
4078
4079 union goacc_property_value propval = { .val = 0 };
4080
4081 switch (prop)
4082 {
4083 case GOACC_PROPERTY_FREE_MEMORY:
4084 /* Not supported. */
4085 break;
4086 case GOACC_PROPERTY_MEMORY:
4087 {
4088 size_t size;
4089 hsa_region_t region = agent->data_region;
4090 hsa_status_t status =
4091 hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
4092 if (status == HSA_STATUS_SUCCESS)
4093 propval.val = size;
4094 break;
4095 }
4096 case GOACC_PROPERTY_NAME:
4097 propval.ptr = agent->name;
4098 break;
4099 case GOACC_PROPERTY_VENDOR:
4100 propval.ptr = agent->vendor_name;
4101 break;
4102 case GOACC_PROPERTY_DRIVER:
4103 propval.ptr = hsa_context.driver_version_s;
4104 break;
4105 }
4106
4107 return propval;
4108 }
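/* These values surface through the OpenACC property API; a hedged
   user-side sketch, assuming a single GCN device numbered 0:

     #include <openacc.h>

     size_t mem = acc_get_property (0, acc_device_radeon,
                                    acc_property_memory);
     const char *drv = acc_get_property_string (0, acc_device_radeon,
                                                acc_property_driver);
*/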
4109
4110 /* Set up plugin-specific thread-local data (host side). */
4111
4112 void *
4113 GOMP_OFFLOAD_openacc_create_thread_data (int ord __attribute__((unused)))
4114 {
4115 struct gcn_thread *thread_data
4116 = GOMP_PLUGIN_malloc (sizeof (struct gcn_thread));
4117
4118 thread_data->async = GOMP_ASYNC_SYNC;
4119
4120 return (void *) thread_data;
4121 }
4122
4123 /* Clean up plugin-specific thread-local data. */
4124
4125 void
4126 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
4127 {
4128 free (data);
4129 }
4130
4131 /* }}} */