/* Plugin for AMD GCN execution.

   Copyright (C) 2013-2024 Free Software Foundation, Inc.

   Contributed by Mentor Embedded

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* {{{ Includes and defines  */

#include "config.h"
#include "symcat.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <inttypes.h>
#include <stdbool.h>
#include <limits.h>
#include <hsa.h>
#include <hsa_ext_amd.h>
#include <dlfcn.h>
#include <signal.h>
#include "libgomp-plugin.h"
#include "config/gcn/libgomp-gcn.h"  /* For struct output.  */
#include "gomp-constants.h"
#include <elf.h>
#include "oacc-plugin.h"
#include "oacc-int.h"
#include <assert.h>

/* These probably won't be in elf.h for a while.  */
#ifndef R_AMDGPU_NONE
#define R_AMDGPU_NONE		0
#define R_AMDGPU_ABS32_LO	1	/* (S + A) & 0xFFFFFFFF  */
#define R_AMDGPU_ABS32_HI	2	/* (S + A) >> 32  */
#define R_AMDGPU_ABS64		3	/* S + A  */
#define R_AMDGPU_REL32		4	/* S + A - P  */
#define R_AMDGPU_REL64		5	/* S + A - P  */
#define R_AMDGPU_ABS32		6	/* S + A  */
#define R_AMDGPU_GOTPCREL	7	/* G + GOT + A - P  */
#define R_AMDGPU_GOTPCREL32_LO	8	/* (G + GOT + A - P) & 0xFFFFFFFF  */
#define R_AMDGPU_GOTPCREL32_HI	9	/* (G + GOT + A - P) >> 32  */
#define R_AMDGPU_REL32_LO	10	/* (S + A - P) & 0xFFFFFFFF  */
#define R_AMDGPU_REL32_HI	11	/* (S + A - P) >> 32  */
#define R_AMDGPU_RELATIVE64	13	/* B + A  */
#endif

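/* A minimal sketch (illustrative only, not used by the plugin) of the
   arithmetic behind the *_LO/*_HI pairs above: a 64-bit value S + A is
   patched as two 32-bit halves.  */

static inline void __attribute__ ((unused))
example_split_reloc (uint64_t s, uint64_t a, uint32_t *lo, uint32_t *hi)
{
  *lo = (s + a) & 0xFFFFFFFF;	/* As for R_AMDGPU_ABS32_LO.  */
  *hi = (s + a) >> 32;		/* As for R_AMDGPU_ABS32_HI.  */
}
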
/* GCN specific definitions for asynchronous queues.  */

#define ASYNC_QUEUE_SIZE 64
#define DRAIN_QUEUE_SYNCHRONOUS_P false
#define DEBUG_QUEUES 0
#define DEBUG_THREAD_SLEEP 0
#define DEBUG_THREAD_SIGNAL 0

/* Defaults.  */
#define DEFAULT_GCN_HEAP_SIZE (100*1024*1024)  /* 100 MB.  */

/* Secure getenv() which returns NULL if running as SUID/SGID.  */
#ifndef HAVE_SECURE_GETENV
#ifdef HAVE___SECURE_GETENV
#define secure_getenv __secure_getenv
#elif defined (HAVE_UNISTD_H) && defined (HAVE_GETUID) && defined (HAVE_GETEUID) \
  && defined (HAVE_GETGID) && defined (HAVE_GETEGID)

#include <unistd.h>

/* Implementation of secure_getenv() for targets where it is not provided but
   we at least have the means to test the real and effective IDs.  */

static char *
secure_getenv (const char *name)
{
  if ((getuid () == geteuid ()) && (getgid () == getegid ()))
    return getenv (name);
  else
    return NULL;
}

#else
#define secure_getenv getenv
#endif
#endif

/* }}}  */
/* {{{ Types  */

/* GCN-specific implementation of the GOMP_PLUGIN_acc_thread data.  */

struct gcn_thread
{
  /* The thread number from the async clause, or GOMP_ASYNC_SYNC.  */
  int async;
};

/* As the HSA runtime is dlopened, the following structure defines the
   function pointers through which the plug-in calls into it.  */

struct hsa_runtime_fn_info
{
  /* HSA runtime.  */
  hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
					const char **status_string);
  hsa_status_t (*hsa_system_get_info_fn) (hsa_system_info_t attribute,
					  void *value);
  hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
					 hsa_agent_info_t attribute,
					 void *value);
  hsa_status_t (*hsa_isa_get_info_fn) (hsa_isa_t isa,
				       hsa_isa_info_t attribute,
				       uint32_t index, void *value);
  hsa_status_t (*hsa_init_fn) (void);
  hsa_status_t (*hsa_iterate_agents_fn)
    (hsa_status_t (*callback) (hsa_agent_t agent, void *data), void *data);
  hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
					  hsa_region_info_t attribute,
					  void *value);
  hsa_status_t (*hsa_queue_create_fn)
    (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
     void (*callback) (hsa_status_t status, hsa_queue_t *source, void *data),
     void *data, uint32_t private_segment_size,
     uint32_t group_segment_size, hsa_queue_t **queue);
  hsa_status_t (*hsa_agent_iterate_regions_fn)
    (hsa_agent_t agent,
     hsa_status_t (*callback) (hsa_region_t region, void *data), void *data);
  hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
  hsa_status_t (*hsa_executable_create_fn)
    (hsa_profile_t profile, hsa_executable_state_t executable_state,
     const char *options, hsa_executable_t *executable);
  hsa_status_t (*hsa_executable_global_variable_define_fn)
    (hsa_executable_t executable, const char *variable_name, void *address);
  hsa_status_t (*hsa_executable_load_code_object_fn)
    (hsa_executable_t executable, hsa_agent_t agent,
     hsa_code_object_t code_object, const char *options);
  hsa_status_t (*hsa_executable_freeze_fn) (hsa_executable_t executable,
					    const char *options);
  hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
					uint32_t num_consumers,
					const hsa_agent_t *consumers,
					hsa_signal_t *signal);
  hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
					  void **ptr);
  hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent,
					      hsa_access_permission_t access);
  hsa_status_t (*hsa_memory_copy_fn) (void *dst, const void *src, size_t size);
  hsa_status_t (*hsa_memory_free_fn) (void *ptr);
  hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
  hsa_status_t (*hsa_executable_get_symbol_fn)
    (hsa_executable_t executable, const char *module_name,
     const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
     hsa_executable_symbol_t *symbol);
  hsa_status_t (*hsa_executable_symbol_get_info_fn)
    (hsa_executable_symbol_t executable_symbol,
     hsa_executable_symbol_info_t attribute, void *value);
  hsa_status_t (*hsa_executable_iterate_symbols_fn)
    (hsa_executable_t executable,
     hsa_status_t (*callback) (hsa_executable_t executable,
			       hsa_executable_symbol_t symbol, void *data),
     void *data);
  uint64_t (*hsa_queue_add_write_index_release_fn) (const hsa_queue_t *queue,
						    uint64_t value);
  uint64_t (*hsa_queue_load_read_index_acquire_fn) (const hsa_queue_t *queue);
  void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
				       hsa_signal_value_t value);
  void (*hsa_signal_store_release_fn) (hsa_signal_t signal,
				       hsa_signal_value_t value);
  hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
    (hsa_signal_t signal, hsa_signal_condition_t condition,
     hsa_signal_value_t compare_value, uint64_t timeout_hint,
     hsa_wait_state_t wait_state_hint);
  hsa_signal_value_t (*hsa_signal_load_acquire_fn) (hsa_signal_t signal);
  hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);

  hsa_status_t (*hsa_code_object_deserialize_fn)
    (void *serialized_code_object, size_t serialized_code_object_size,
     const char *options, hsa_code_object_t *code_object);
  hsa_status_t (*hsa_amd_memory_lock_fn)
    (void *host_ptr, size_t size, hsa_agent_t *agents, int num_agent,
     void **agent_ptr);
  hsa_status_t (*hsa_amd_memory_unlock_fn) (void *host_ptr);
  hsa_status_t (*hsa_amd_memory_async_copy_rect_fn)
    (const hsa_pitched_ptr_t *dst, const hsa_dim3_t *dst_offset,
     const hsa_pitched_ptr_t *src, const hsa_dim3_t *src_offset,
     const hsa_dim3_t *range, hsa_agent_t copy_agent,
     hsa_amd_copy_direction_t dir, uint32_t num_dep_signals,
     const hsa_signal_t *dep_signals, hsa_signal_t completion_signal);
};

/* Structure describing the run-time and grid properties of an HSA kernel
   launch.  This needs to match the format passed to GOMP_OFFLOAD_run.  */

struct GOMP_kernel_launch_attributes
{
  /* Number of dimensions the workload has.  Maximum number is 3.  */
  uint32_t ndim;
  /* Size of the grid in the three respective dimensions.  */
  uint32_t gdims[3];
  /* Size of work-groups in the respective dimensions.  */
  uint32_t wdims[3];
};

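/* For illustration only: a kernel launched over a one-dimensional grid of
   256 work items in work-groups of 64 would be described as follows (the
   name example_kla is hypothetical).  */

static const struct GOMP_kernel_launch_attributes example_kla
  __attribute__ ((unused))
  = { .ndim = 1, .gdims = { 256, 1, 1 }, .wdims = { 64, 1, 1 } };
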
/* Collection of information needed for a dispatch of a kernel from a
   kernel.  */

struct kernel_dispatch
{
  struct agent_info *agent;
  /* Pointer to a command queue associated with a kernel dispatch agent.  */
  void *queue;
  /* Pointer to a memory space used for kernel arguments passing.  */
  void *kernarg_address;
  /* Kernel object.  */
  uint64_t object;
  /* Synchronization signal used for dispatch synchronization.  */
  uint64_t signal;
  /* Private segment size.  */
  uint32_t private_segment_size;
  /* Group segment size.  */
  uint32_t group_segment_size;
};

/* Structure of the kernargs segment, supporting console output.

   This needs to match the definitions in Newlib, and the expectations
   in libgomp target code.  */

struct kernargs {
  struct kernargs_abi abi;

  /* Output data.  */
  struct output output_data;
};

/* A queue entry for a future asynchronous launch.  */

struct kernel_launch
{
  struct kernel_info *kernel;
  void *vars;
  struct GOMP_kernel_launch_attributes kla;
};

/* A queue entry for a future callback.  */

struct callback
{
  void (*fn)(void *);
  void *data;
};

/* A data struct for the copy_data callback.  */

struct copy_data
{
  void *dst;
  const void *src;
  size_t len;
  struct goacc_asyncqueue *aq;
};

/* A queue entry for a placeholder.  These correspond to a wait event.  */

struct placeholder
{
  int executed;
  pthread_cond_t cond;
  pthread_mutex_t mutex;
};

/* A queue entry for a wait directive.  */

struct asyncwait_info
{
  struct placeholder *placeholderp;
};

/* Encode the type of an entry in an async queue.  */

enum entry_type
{
  KERNEL_LAUNCH,
  CALLBACK,
  ASYNC_WAIT,
  ASYNC_PLACEHOLDER
};

/* An entry in an async queue.  */

struct queue_entry
{
  enum entry_type type;
  union {
    struct kernel_launch launch;
    struct callback callback;
    struct asyncwait_info asyncwait;
    struct placeholder placeholder;
  } u;
};

/* An async queue header.

   OpenMP may create one of these.
   OpenACC may create many.  */

struct goacc_asyncqueue
{
  struct agent_info *agent;
  hsa_queue_t *hsa_queue;

  pthread_t thread_drain_queue;
  pthread_mutex_t mutex;
  pthread_cond_t queue_cond_in;
  pthread_cond_t queue_cond_out;
  struct queue_entry queue[ASYNC_QUEUE_SIZE];
  int queue_first;
  int queue_n;
  int drain_queue_stop;

  int id;
  struct goacc_asyncqueue *prev;
  struct goacc_asyncqueue *next;
};

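/* The queue member above is used as a fixed-size ring buffer: pending entry
   I of queue_n lives at index (queue_first + I) % ASYNC_QUEUE_SIZE.  A
   minimal sketch (illustrative only; the helper name is hypothetical) of
   that indexing:  */

static inline struct queue_entry * __attribute__ ((unused))
example_queue_slot (struct goacc_asyncqueue *aq, int i)
{
  return &aq->queue[(aq->queue_first + i) % ASYNC_QUEUE_SIZE];
}
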
/* Mkoffload uses this structure to describe a kernel.

   OpenMP kernel dimensions are passed at runtime.
   OpenACC kernel dimensions are passed at compile time, here.  */

struct hsa_kernel_description
{
  const char *name;
  int oacc_dims[3];  /* Only present for GCN kernels.  */
  int sgpr_count;
  int vpgr_count;
};

/* Mkoffload uses this structure to describe an offload variable.  */

struct global_var_info
{
  const char *name;
  void *address;
};

/* Mkoffload uses this structure to describe all the kernels in a
   loadable module.  These are passed to libgomp via static constructors.  */

struct gcn_image_desc
{
  struct gcn_image {
    size_t size;
    void *image;
  } *gcn_image;
  const unsigned kernel_count;
  struct hsa_kernel_description *kernel_infos;
  const unsigned ind_func_count;
  const unsigned global_variable_count;
};

/* This enum mirrors the corresponding LLVM enum's values for all ISAs that we
   support.
   See https://llvm.org/docs/AMDGPUUsage.html#amdgpu-ef-amdgpu-mach-table  */

typedef enum {
  EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a,
  EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c,
  EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f,
  EF_AMDGPU_MACH_AMDGCN_GFX908 = 0x030,
  EF_AMDGPU_MACH_AMDGCN_GFX90a = 0x03f,
  EF_AMDGPU_MACH_AMDGCN_GFX1030 = 0x036,
  EF_AMDGPU_MACH_AMDGCN_GFX1100 = 0x041
} EF_AMDGPU_MACH;

const static int EF_AMDGPU_MACH_MASK = 0x000000ff;
typedef EF_AMDGPU_MACH gcn_isa;

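/* The ISA is encoded in the low byte of an ELF image's e_flags, so masking
   with EF_AMDGPU_MACH_MASK recovers the enum value; e.g. a gfx1100 image
   yields 0x041.  A minimal sketch (illustrative only; elf_gcn_isa_field,
   further below, is the real accessor):  */

static inline gcn_isa __attribute__ ((unused))
example_isa_of_image (const Elf64_Ehdr *image)
{
  return (gcn_isa) (image->e_flags & EF_AMDGPU_MACH_MASK);
}
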
/* Description of an HSA GPU agent (device) and the program associated with
   it.  */

struct agent_info
{
  /* The HSA ID of the agent.  Assigned when hsa_context is initialized.  */
  hsa_agent_t id;
  /* The user-visible device number.  */
  int device_id;
  /* Whether the agent has been initialized.  The fields below are usable only
     if it has been.  */
  bool initialized;

  /* The instruction set architecture of the device.  */
  gcn_isa device_isa;
  /* Name of the agent.  */
  char name[64];
  /* Name of the vendor of the agent.  */
  char vendor_name[64];
  /* Command queues of the agent.  */
  hsa_queue_t *sync_queue;
  struct goacc_asyncqueue *async_queues, *omp_async_queue;
  pthread_mutex_t async_queues_mutex;

  /* The HSA memory region from which to allocate kernel arguments.  */
  hsa_region_t kernarg_region;

  /* The HSA memory region from which to allocate device data.  */
  hsa_region_t data_region;

  /* Allocated ephemeral memories (team arena and stack space).  */
  struct ephemeral_memories_list *ephemeral_memories_list;
  pthread_mutex_t ephemeral_memories_write_lock;

  /* Read-write lock that protects kernels which are running or about to be run
     from interference with loading and unloading of images.  Needs to be
     locked for reading while a kernel is being run, and for writing if the
     list of modules is manipulated (and thus the HSA program invalidated).  */
  pthread_rwlock_t module_rwlock;

  /* The module associated with this agent.  */
  struct module_info *module;

  /* Mutex enforcing that only one thread will finalize the HSA program.  A
     thread should have locked agent->module_rwlock for reading before
     acquiring it.  */
  pthread_mutex_t prog_mutex;
  /* Flag whether the HSA program that consists of all the modules has been
     finalized.  */
  bool prog_finalized;
  /* HSA executable - the finalized program that is used to locate kernels.  */
  hsa_executable_t executable;
};

/* Information required to identify, finalize and run any given kernel.  */

enum offload_kind {KIND_UNKNOWN, KIND_OPENMP, KIND_OPENACC};

struct kernel_info
{
  /* Name of the kernel, required to locate it within the GCN object-code
     module.  */
  const char *name;
  /* The specific agent the kernel has been or will be finalized for and run
     on.  */
  struct agent_info *agent;
  /* The specific module in which the kernel is located.  */
  struct module_info *module;
  /* Information provided by mkoffload associated with the kernel.  */
  struct hsa_kernel_description *description;
  /* Mutex enforcing that at most one thread ever initializes a kernel for
     use.  A thread should have locked agent->module_rwlock for reading before
     acquiring it.  */
  pthread_mutex_t init_mutex;
  /* Flag indicating whether the kernel has been initialized and all fields
     below it contain valid data.  */
  bool initialized;
  /* Flag indicating that the kernel has a problem that blocks its
     execution.  */
  bool initialization_failed;
  /* The object to be put into the dispatch queue.  */
  uint64_t object;
  /* Required size of kernel arguments.  */
  uint32_t kernarg_segment_size;
  /* Required size of group segment.  */
  uint32_t group_segment_size;
  /* Required size of private segment.  */
  uint32_t private_segment_size;
  /* Set up for OpenMP or OpenACC?  */
  enum offload_kind kind;
};

/* Information about a particular GCN module, its image and kernels.  */

struct module_info
{
  /* The description with which the program has registered the image.  */
  struct gcn_image_desc *image_desc;
  /* GCN heap allocation.  */
  struct heap *heap;
  /* Physical boundaries of the loaded module.  */
  Elf64_Addr phys_address_start;
  Elf64_Addr phys_address_end;

  bool constructors_run_p;
  struct kernel_info *init_array_func, *fini_array_func;

  /* Number of kernels in this module.  */
  int kernel_count;
  /* An array of kernel_info structures describing each kernel in this
     module.  */
  struct kernel_info kernels[];
};

/* A linked list of memory arenas allocated on the device.
   These are used by OpenMP, as a means to optimize per-team malloc,
   and for host-accessible stack space.  */

struct ephemeral_memories_list
{
  struct ephemeral_memories_list *next;

  /* The size is determined by the number of teams and threads.  */
  size_t size;
  /* The device address of the allocated memory.  */
  void *address;
  /* A flag to prevent two asynchronous kernels trying to use the same memory.
     The mutex is locked until the kernel exits.  */
  pthread_mutex_t in_use;
};

/* Information about the whole HSA environment and all of its agents.  */

struct hsa_context_info
{
  /* Whether the structure has been initialized.  */
  bool initialized;
  /* Number of usable GPU HSA agents in the system.  */
  int agent_count;
  /* Array of agent_info structures describing the individual HSA agents.  */
  struct agent_info *agents;
  /* Driver version string.  */
  char driver_version_s[30];
};

/* }}}  */
/* {{{ Global variables  */

/* Information about the whole HSA environment and all of its agents.  */

static struct hsa_context_info hsa_context;

/* HSA runtime functions that are initialized in init_hsa_context.  */

static struct hsa_runtime_fn_info hsa_fns;

/* Heap space, allocated target-side, provided for use of newlib malloc.
   Each module should have its own heap allocated.
   Beware that heap usage increases with OpenMP teams.  See also arenas.  */

static size_t gcn_kernel_heap_size = DEFAULT_GCN_HEAP_SIZE;

/* Ephemeral memory sizes for each kernel launch.  */

static int team_arena_size = DEFAULT_TEAM_ARENA_SIZE;
static int stack_size = DEFAULT_GCN_STACK_SIZE;
static int lowlat_size = -1;

/* Flag to decide whether to print information to stderr about what is going
   on.  Set in init_environment_variables depending on environment
   variables.  */

static bool debug;

/* Flag to decide if the runtime should suppress a possible fallback to host
   execution.  */

static bool suppress_host_fallback;

/* Flag to locate the HSA runtime shared library that is dlopened
   by this plug-in.  */

static const char *hsa_runtime_lib;

/* Flag to decide if the runtime should also support CPU devices (can be
   a simulator).  */

static bool support_cpu_devices;

/* Runtime dimension overrides.  Zero indicates default.  */

static int override_x_dim = 0;
static int override_z_dim = 0;

/* }}}  */
/* {{{ Debug & Diagnostic  */

/* Print a message to stderr if GCN_DEBUG value is set to true.  */

#define DEBUG_PRINT(...) \
  do \
    { \
      if (debug) \
	{ \
	  fprintf (stderr, __VA_ARGS__); \
	} \
    } \
  while (false);

/* Flush stderr if GCN_DEBUG value is set to true.  */

#define DEBUG_FLUSH() \
  do { \
    if (debug) \
      fflush (stderr); \
  } while (false)

/* Print a logging message with PREFIX to stderr if GCN_DEBUG value
   is set to true.  */

#define DEBUG_LOG(prefix, ...) \
  do \
    { \
      DEBUG_PRINT (prefix); \
      DEBUG_PRINT (__VA_ARGS__); \
      DEBUG_FLUSH (); \
    } while (false)

/* Print a debugging message to stderr.  */

#define GCN_DEBUG(...) DEBUG_LOG ("GCN debug: ", __VA_ARGS__)

/* Print a warning message to stderr.  */

#define GCN_WARNING(...) DEBUG_LOG ("GCN warning: ", __VA_ARGS__)

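/* Typical usage (illustrative only; the function name is hypothetical):
   both macros take printf-style arguments and produce output only when
   GCN_DEBUG is set in the environment.  */

static void __attribute__ ((unused))
example_debug_usage (int device)
{
  GCN_DEBUG ("Initializing device %d\n", device);
  GCN_WARNING ("Device %d: falling back to defaults\n", device);
}
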
/* Print HSA warning STR with an HSA STATUS code.  */

static void
hsa_warn (const char *str, hsa_status_t status)
{
  if (!debug)
    return;

  const char *hsa_error_msg = "[unknown]";
  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);

  fprintf (stderr, "GCN warning: %s\nRuntime message: %s\n", str,
	   hsa_error_msg);
}

/* Report a fatal error STR together with the HSA error corresponding to STATUS
   and terminate execution of the current process.  */

static void
hsa_fatal (const char *str, hsa_status_t status)
{
  const char *hsa_error_msg = "[unknown]";
  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
  GOMP_PLUGIN_fatal ("GCN fatal error: %s\nRuntime message: %s\n", str,
		     hsa_error_msg);
}

/* Like hsa_fatal, except only report the error message, and return FALSE
   to propagate error processing to outside of the plugin.  */

static bool
hsa_error (const char *str, hsa_status_t status)
{
  const char *hsa_error_msg = "[unknown]";
  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
  GOMP_PLUGIN_error ("GCN fatal error: %s\nRuntime message: %s\n", str,
		     hsa_error_msg);
  return false;
}

/* Dump information about the available hardware.  */

static void
dump_hsa_system_info (void)
{
  hsa_status_t status;

  hsa_endianness_t endianness;
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_ENDIANNESS,
					   &endianness);
  if (status == HSA_STATUS_SUCCESS)
    switch (endianness)
      {
      case HSA_ENDIANNESS_LITTLE:
	GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: LITTLE\n");
	break;
      case HSA_ENDIANNESS_BIG:
	GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: BIG\n");
	break;
      default:
	GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: UNKNOWN\n");
      }
  else
    GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: FAILED\n");

  uint8_t extensions[128];
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_EXTENSIONS,
					   &extensions);
  if (status == HSA_STATUS_SUCCESS)
    {
      if (extensions[0] & (1 << HSA_EXTENSION_IMAGES))
	GCN_DEBUG ("HSA_SYSTEM_INFO_EXTENSIONS: IMAGES\n");
    }
  else
    GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n");
}

/* Dump a machine model MACHINE_MODEL, prefixed by the label S.  */

static void
dump_machine_model (hsa_machine_model_t machine_model, const char *s)
{
  switch (machine_model)
    {
    case HSA_MACHINE_MODEL_SMALL:
      GCN_DEBUG ("%s: SMALL\n", s);
      break;
    case HSA_MACHINE_MODEL_LARGE:
      GCN_DEBUG ("%s: LARGE\n", s);
      break;
    default:
      GCN_WARNING ("%s: UNKNOWN\n", s);
      break;
    }
}

/* Dump a profile PROFILE, prefixed by the label S.  */

static void
dump_profile (hsa_profile_t profile, const char *s)
{
  switch (profile)
    {
    case HSA_PROFILE_FULL:
      GCN_DEBUG ("%s: FULL\n", s);
      break;
    case HSA_PROFILE_BASE:
      GCN_DEBUG ("%s: BASE\n", s);
      break;
    default:
      GCN_WARNING ("%s: UNKNOWN\n", s);
      break;
    }
}

/* Dump information about a device memory region.  */

static hsa_status_t
dump_hsa_region (hsa_region_t region, void *data __attribute__((unused)))
{
  hsa_status_t status;

  hsa_region_segment_t segment;
  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
					   &segment);
  if (status == HSA_STATUS_SUCCESS)
    {
      if (segment == HSA_REGION_SEGMENT_GLOBAL)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GLOBAL\n");
      else if (segment == HSA_REGION_SEGMENT_READONLY)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: READONLY\n");
      else if (segment == HSA_REGION_SEGMENT_PRIVATE)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: PRIVATE\n");
      else if (segment == HSA_REGION_SEGMENT_GROUP)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GROUP\n");
      else
	GCN_WARNING ("HSA_REGION_INFO_SEGMENT: UNKNOWN\n");
    }
  else
    GCN_WARNING ("HSA_REGION_INFO_SEGMENT: FAILED\n");

  /* Only consult SEGMENT if the query above succeeded; it is uninitialized
     otherwise.  */
  if (status == HSA_STATUS_SUCCESS && segment == HSA_REGION_SEGMENT_GLOBAL)
    {
      uint32_t flags;
      status
	= hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
					  &flags);
      if (status == HSA_STATUS_SUCCESS)
	{
	  if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: KERNARG\n");
	  if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED)
	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: FINE_GRAINED\n");
	  if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED)
	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: COARSE_GRAINED\n");
	}
      else
	GCN_WARNING ("HSA_REGION_INFO_GLOBAL_FLAGS: FAILED\n");
    }

  size_t size;
  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_SIZE: %zu\n", size);
  else
    GCN_WARNING ("HSA_REGION_INFO_SIZE: FAILED\n");

  status
    = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_ALLOC_MAX_SIZE,
				      &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_ALLOC_MAX_SIZE: %zu\n", size);
  else
    GCN_WARNING ("HSA_REGION_INFO_ALLOC_MAX_SIZE: FAILED\n");

  bool alloc_allowed;
  status
    = hsa_fns.hsa_region_get_info_fn (region,
				      HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED,
				      &alloc_allowed);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: %u\n", alloc_allowed);
  else
    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: FAILED\n");

  if (status != HSA_STATUS_SUCCESS || !alloc_allowed)
    return HSA_STATUS_SUCCESS;

  status
    = hsa_fns.hsa_region_get_info_fn (region,
				      HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE,
				      &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: %zu\n", size);
  else
    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: FAILED\n");

  size_t align;
  status
    = hsa_fns.hsa_region_get_info_fn (region,
				      HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT,
				      &align);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: %zu\n", align);
  else
    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: FAILED\n");

  return HSA_STATUS_SUCCESS;
}

/* Dump information about all the device memory regions.  */

static void
dump_hsa_regions (hsa_agent_t agent)
{
  hsa_status_t status;
  status = hsa_fns.hsa_agent_iterate_regions_fn (agent,
						 dump_hsa_region,
						 NULL);
  if (status != HSA_STATUS_SUCCESS)
    hsa_error ("Dumping hsa regions failed", status);
}

/* Dump information about the available devices.  */

static hsa_status_t
dump_hsa_agent_info (hsa_agent_t agent, void *data __attribute__((unused)))
{
  hsa_status_t status;

  char buf[64];
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME,
					  &buf);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_NAME: %s\n", buf);
  else
    GCN_WARNING ("HSA_AGENT_INFO_NAME: FAILED\n");

  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_VENDOR_NAME,
					  &buf);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_VENDOR_NAME: %s\n", buf);
  else
    GCN_WARNING ("HSA_AGENT_INFO_VENDOR_NAME: FAILED\n");

  hsa_machine_model_t machine_model;
  status
    = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_MACHINE_MODEL,
				     &machine_model);
  if (status == HSA_STATUS_SUCCESS)
    dump_machine_model (machine_model, "HSA_AGENT_INFO_MACHINE_MODEL");
  else
    GCN_WARNING ("HSA_AGENT_INFO_MACHINE_MODEL: FAILED\n");

  hsa_profile_t profile;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_PROFILE,
					  &profile);
  if (status == HSA_STATUS_SUCCESS)
    dump_profile (profile, "HSA_AGENT_INFO_PROFILE");
  else
    GCN_WARNING ("HSA_AGENT_INFO_PROFILE: FAILED\n");

  hsa_device_type_t device_type;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
					  &device_type);
  if (status == HSA_STATUS_SUCCESS)
    {
      switch (device_type)
	{
	case HSA_DEVICE_TYPE_CPU:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: CPU\n");
	  break;
	case HSA_DEVICE_TYPE_GPU:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: GPU\n");
	  break;
	case HSA_DEVICE_TYPE_DSP:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: DSP\n");
	  break;
	default:
	  GCN_WARNING ("HSA_AGENT_INFO_DEVICE: UNKNOWN\n");
	  break;
	}
    }
  else
    GCN_WARNING ("HSA_AGENT_INFO_DEVICE: FAILED\n");

  uint32_t cu_count;
  status = hsa_fns.hsa_agent_get_info_fn
    (agent, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: %u\n", cu_count);
  else
    GCN_WARNING ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: FAILED\n");

  uint32_t size;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_WAVEFRONT_SIZE,
					  &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WAVEFRONT_SIZE: %u\n", size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WAVEFRONT_SIZE: FAILED\n");

  uint32_t max_dim;
  status = hsa_fns.hsa_agent_get_info_fn (agent,
					  HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
					  &max_dim);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: %u\n", max_dim);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: FAILED\n");

  uint32_t max_size;
  status = hsa_fns.hsa_agent_get_info_fn (agent,
					  HSA_AGENT_INFO_WORKGROUP_MAX_SIZE,
					  &max_size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: %u\n", max_size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: FAILED\n");

  uint32_t grid_max_dim;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_DIM,
					  &grid_max_dim);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_DIM: %u\n", grid_max_dim);
  else
    GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_DIM: FAILED\n");

  uint32_t grid_max_size;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_SIZE,
					  &grid_max_size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_SIZE: %u\n", grid_max_size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_SIZE: FAILED\n");

  dump_hsa_regions (agent);

  return HSA_STATUS_SUCCESS;
}

/* Forward reference.  */

static char *get_executable_symbol_name (hsa_executable_symbol_t symbol);

/* Helper function for dump_executable_symbols.  */

static hsa_status_t
dump_executable_symbol (hsa_executable_t executable,
			hsa_executable_symbol_t symbol,
			void *data __attribute__((unused)))
{
  char *name = get_executable_symbol_name (symbol);

  if (name)
    {
      GCN_DEBUG ("executable symbol: %s\n", name);
      free (name);
    }

  return HSA_STATUS_SUCCESS;
}

/* Dump all global symbols in an executable.  */

static void
dump_executable_symbols (hsa_executable_t executable)
{
  hsa_status_t status;
  status
    = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
						 dump_executable_symbol,
						 NULL);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not dump HSA executable symbols", status);
}

/* Dump kernel DISPATCH data structure and indent it by INDENT spaces.  */

static void
print_kernel_dispatch (struct kernel_dispatch *dispatch, unsigned indent)
{
  struct kernargs *kernargs = (struct kernargs *)dispatch->kernarg_address;

  fprintf (stderr, "%*sthis: %p\n", indent, "", dispatch);
  fprintf (stderr, "%*squeue: %p\n", indent, "", dispatch->queue);
  fprintf (stderr, "%*skernarg_address: %p\n", indent, "", kernargs);
  fprintf (stderr, "%*sheap address: %p\n", indent, "",
	   (void*)kernargs->abi.heap_ptr);
  fprintf (stderr, "%*sarena address: %p (%d bytes per workgroup)\n", indent,
	   "", (void*)kernargs->abi.arena_ptr,
	   kernargs->abi.arena_size_per_team);
  fprintf (stderr, "%*sstack address: %p (%d bytes per wavefront)\n", indent,
	   "", (void*)kernargs->abi.stack_ptr,
	   kernargs->abi.stack_size_per_thread);
  fprintf (stderr, "%*sobject: %lu\n", indent, "", dispatch->object);
  fprintf (stderr, "%*sprivate_segment_size: %u\n", indent, "",
	   dispatch->private_segment_size);
  fprintf (stderr, "%*sgroup_segment_size: %u (low-latency pool)\n", indent,
	   "", dispatch->group_segment_size);
  fprintf (stderr, "\n");
}

/* }}}  */
/* {{{ Utility functions  */

/* Cast the thread local storage to gcn_thread.  */

static inline struct gcn_thread *
gcn_thread (void)
{
  return (struct gcn_thread *) GOMP_PLUGIN_acc_thread ();
}

/* Initialize debug and suppress_host_fallback according to the environment.  */

static void
init_environment_variables (void)
{
  if (secure_getenv ("GCN_DEBUG"))
    debug = true;
  else
    debug = false;

  if (secure_getenv ("GCN_SUPPRESS_HOST_FALLBACK"))
    suppress_host_fallback = true;
  else
    suppress_host_fallback = false;

  hsa_runtime_lib = secure_getenv ("HSA_RUNTIME_LIB");
  if (hsa_runtime_lib == NULL)
    hsa_runtime_lib = "libhsa-runtime64.so.1";

  support_cpu_devices = secure_getenv ("GCN_SUPPORT_CPU_DEVICES");

  const char *x = secure_getenv ("GCN_NUM_TEAMS");
  if (!x)
    x = secure_getenv ("GCN_NUM_GANGS");
  if (x)
    override_x_dim = atoi (x);

  const char *z = secure_getenv ("GCN_NUM_THREADS");
  if (!z)
    z = secure_getenv ("GCN_NUM_WORKERS");
  if (z)
    override_z_dim = atoi (z);

  const char *heap = secure_getenv ("GCN_HEAP_SIZE");
  if (heap)
    {
      size_t tmp = atol (heap);
      if (tmp)
	gcn_kernel_heap_size = tmp;
    }

  const char *arena = secure_getenv ("GCN_TEAM_ARENA_SIZE");
  if (arena)
    {
      int tmp = atoi (arena);
      if (tmp)
	team_arena_size = tmp;
    }

  const char *stack = secure_getenv ("GCN_STACK_SIZE");
  if (stack)
    {
      int tmp = atoi (stack);
      if (tmp)
	stack_size = tmp;
    }

  const char *lowlat = secure_getenv ("GOMP_GCN_LOWLAT_POOL");
  if (lowlat)
    lowlat_size = atoi (lowlat);
}

/* Return malloc'd string with name of SYMBOL.  */

static char *
get_executable_symbol_name (hsa_executable_symbol_t symbol)
{
  hsa_status_t status;
  char *res;
  uint32_t len;
  const hsa_executable_symbol_info_t info_name_length
    = HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH;

  status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name_length,
						      &len);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not get length of symbol name", status);
      return NULL;
    }

  res = GOMP_PLUGIN_malloc (len + 1);

  const hsa_executable_symbol_info_t info_name
    = HSA_EXECUTABLE_SYMBOL_INFO_NAME;

  status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name, res);

  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not get symbol name", status);
      free (res);
      return NULL;
    }

  res[len] = '\0';

  return res;
}

/* Get the number of GPU Compute Units.  */

static int
get_cu_count (struct agent_info *agent)
{
  uint32_t cu_count;
  hsa_status_t status = hsa_fns.hsa_agent_get_info_fn
    (agent->id, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
  if (status == HSA_STATUS_SUCCESS)
    return cu_count;
  else
    return 64;  /* The usual number for older devices.  */
}

/* Calculate the maximum grid size for OMP threads / OACC workers.
   This depends on the kernel's resource usage levels.  */

static int
limit_worker_threads (int threads)
{
  /* FIXME: Do something more intelligent here.
     GCN can always run 4 threads within a Compute Unit, but
     more than that depends on register usage.  */
  if (threads > 16)
    threads = 16;
  return threads;
}

/* This sets the maximum number of teams to twice the number of GPU Compute
   Units to avoid memory waste and corresponding memory access faults.  */

static int
limit_teams (int teams, struct agent_info *agent)
{
  int max_teams = 2 * get_cu_count (agent);
  if (teams > max_teams)
    teams = max_teams;
  return teams;
}

/* Parse the target attributes INPUT provided by the compiler and return true
   if we should run anything at all.  If no launch attributes are found,
   fill DEF with default values and store DEF into *RESULT; otherwise point
   *RESULT at the attributes within INPUT.

   This is used for OpenMP only.  */

static bool
parse_target_attributes (void **input,
			 struct GOMP_kernel_launch_attributes *def,
			 struct GOMP_kernel_launch_attributes **result,
			 struct agent_info *agent)
{
  if (!input)
    GOMP_PLUGIN_fatal ("No target arguments provided");

  bool grid_attrs_found = false;
  bool gcn_dims_found = false;
  int gcn_teams = 0;
  int gcn_threads = 0;
  while (*input)
    {
      intptr_t id = (intptr_t) *input++, val;

      if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
	val = (intptr_t) *input++;
      else
	val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;

      val = (val > INT_MAX) ? INT_MAX : val;

      if ((id & GOMP_TARGET_ARG_DEVICE_MASK) == GOMP_DEVICE_GCN
	  && ((id & GOMP_TARGET_ARG_ID_MASK)
	      == GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES))
	{
	  grid_attrs_found = true;
	  break;
	}
      else if ((id & GOMP_TARGET_ARG_DEVICE_MASK)
	       == GOMP_TARGET_ARG_DEVICE_ALL)
	{
	  gcn_dims_found = true;
	  switch (id & GOMP_TARGET_ARG_ID_MASK)
	    {
	    case GOMP_TARGET_ARG_NUM_TEAMS:
	      gcn_teams = limit_teams (val, agent);
	      break;
	    case GOMP_TARGET_ARG_THREAD_LIMIT:
	      gcn_threads = limit_worker_threads (val);
	      break;
	    default:
	      ;
	    }
	}
    }

  if (gcn_dims_found)
    {
      bool gfx900_workaround_p = false;

      if (agent->device_isa == EF_AMDGPU_MACH_AMDGCN_GFX900
	  && gcn_threads == 0 && override_z_dim == 0)
	{
	  gfx900_workaround_p = true;
	  GCN_WARNING ("VEGA BUG WORKAROUND: reducing default number of "
		       "threads to at most 4 per team.\n");
	  GCN_WARNING (" - If this is not a Vega 10 device, please use "
		       "GCN_NUM_THREADS=16\n");
	}

      /* Ideally, when a dimension isn't explicitly specified, we should
	 tune it to run 40 (or 32?) threads per CU with no threads getting
	 queued.  In practice, we tune for peak performance on BabelStream,
	 which for OpenACC is currently 32 threads per CU.  */
      def->ndim = 3;
      if (gcn_teams <= 0 && gcn_threads <= 0)
	{
	  /* Set up a reasonable number of teams and threads.  */
	  gcn_threads = gfx900_workaround_p ? 4 : 16; // 8;
	  def->gdims[0] = get_cu_count (agent); // * (40 / gcn_threads);
	  def->gdims[2] = gcn_threads;
	}
      else if (gcn_teams <= 0 && gcn_threads > 0)
	{
	  /* Auto-scale the number of teams with the number of threads.  */
	  def->gdims[0] = get_cu_count (agent); // * (40 / gcn_threads);
	  def->gdims[2] = gcn_threads;
	}
      else if (gcn_teams > 0 && gcn_threads <= 0)
	{
	  int max_threads = gfx900_workaround_p ? 4 : 16;

	  /* Auto-scale the number of threads with the number of teams.  */
	  def->gdims[0] = gcn_teams;
	  def->gdims[2] = 16; // get_cu_count (agent) * 40 / gcn_teams;
	  if (def->gdims[2] == 0)
	    def->gdims[2] = 1;
	  else if (def->gdims[2] > max_threads)
	    def->gdims[2] = max_threads;
	}
      else
	{
	  def->gdims[0] = gcn_teams;
	  def->gdims[2] = gcn_threads;
	}
      def->gdims[1] = 64;  /* Each thread is 64 work items wide.  */
      def->wdims[0] = 1;   /* Single team per work-group.  */
      def->wdims[1] = 64;
      def->wdims[2] = 16;
      *result = def;
      return true;
    }
  else if (!grid_attrs_found)
    {
      def->ndim = 1;
      def->gdims[0] = 1;
      def->gdims[1] = 1;
      def->gdims[2] = 1;
      def->wdims[0] = 1;
      def->wdims[1] = 1;
      def->wdims[2] = 1;
      *result = def;
      GCN_WARNING ("GOMP_OFFLOAD_run called with no launch attributes\n");
      return true;
    }

  struct GOMP_kernel_launch_attributes *kla;
  kla = (struct GOMP_kernel_launch_attributes *) *input;
  *result = kla;
  if (kla->ndim == 0 || kla->ndim > 3)
    GOMP_PLUGIN_fatal ("Invalid number of dimensions (%u)", kla->ndim);

  GCN_DEBUG ("GOMP_OFFLOAD_run called with %u dimensions:\n", kla->ndim);
  unsigned i;
  for (i = 0; i < kla->ndim; i++)
    {
      GCN_DEBUG ("  Dimension %u: grid size %u and group size %u\n", i,
		 kla->gdims[i], kla->wdims[i]);
      if (kla->gdims[i] == 0)
	return false;
    }
  return true;
}

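/* A minimal sketch (illustrative only; names prefixed "example_" are
   hypothetical) of the encoding parse_target_attributes consumes: libgomp
   passes a NULL-terminated array in which each id word either embeds its
   value above GOMP_TARGET_ARG_VALUE_SHIFT or flags that the value follows
   in the next element.  */

static void __attribute__ ((unused))
example_encode_target_args (void **args /* space for 4 elements */)
{
  /* All devices: num_teams = 4, value embedded in the id word.  */
  args[0] = (void *) (uintptr_t) (GOMP_TARGET_ARG_DEVICE_ALL
				  | GOMP_TARGET_ARG_NUM_TEAMS
				  | (4 << GOMP_TARGET_ARG_VALUE_SHIFT));
  /* All devices: thread_limit = 16, value in the following element.  */
  args[1] = (void *) (uintptr_t) (GOMP_TARGET_ARG_DEVICE_ALL
				  | GOMP_TARGET_ARG_THREAD_LIMIT
				  | GOMP_TARGET_ARG_SUBSEQUENT_PARAM);
  args[2] = (void *) (uintptr_t) 16;
  args[3] = NULL;
}
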
/* Return the group size given the requested GROUP size, GRID size and number
   of grid dimensions NDIM.  */

static uint32_t
get_group_size (uint32_t ndim, uint32_t grid, uint32_t group)
{
  if (group == 0)
    {
      /* TODO: Provide a default via environment or device characteristics.  */
      if (ndim == 1)
	group = 64;
      else if (ndim == 2)
	group = 8;
      else
	group = 4;
    }

  if (group > grid)
    group = grid;
  return group;
}

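/* For example, get_group_size (1, 32, 0) picks the one-dimensional default
   of 64 and then clamps it to the grid size, returning 32.  */
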
/* Atomically store pair of uint16_t values (HEADER and REST) to a PACKET.  */

static void
packet_store_release (uint32_t* packet, uint16_t header, uint16_t rest)
{
  __atomic_store_n (packet, header | (rest << 16), __ATOMIC_RELEASE);
}

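/* A sketch of the intended use (illustrative only): the first 32 bits of an
   AQL packet hold the 16-bit header and the 16-bit setup word, and both must
   become visible to the packet processor in a single release store, e.g.

     packet_store_release ((uint32_t *) packet, header,
			   (uint16_t) kla->ndim
			   << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);
*/
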
/* A never-called callback for the HSA command queues.  These signal events
   that we don't use, so we trigger an error.

   This "queue" is not to be confused with the async queues, below.  */

static void
hsa_queue_callback (hsa_status_t status,
		    hsa_queue_t *queue __attribute__ ((unused)),
		    void *data __attribute__ ((unused)))
{
  hsa_fatal ("Asynchronous queue error", status);
}

/* }}}  */
/* {{{ HSA initialization  */

/* Populate hsa_fns with the function addresses from libhsa-runtime64.so.  */

static bool
init_hsa_runtime_functions (void)
{
#define DLSYM_FN(function) \
  hsa_fns.function##_fn = dlsym (handle, #function); \
  if (hsa_fns.function##_fn == NULL) \
    return false;
#define DLSYM_OPT_FN(function) \
  hsa_fns.function##_fn = dlsym (handle, #function);
  void *handle = dlopen (hsa_runtime_lib, RTLD_LAZY);
  if (handle == NULL)
    return false;

  DLSYM_FN (hsa_status_string)
  DLSYM_FN (hsa_system_get_info)
  DLSYM_FN (hsa_agent_get_info)
  DLSYM_FN (hsa_init)
  DLSYM_FN (hsa_iterate_agents)
  DLSYM_FN (hsa_region_get_info)
  DLSYM_FN (hsa_queue_create)
  DLSYM_FN (hsa_agent_iterate_regions)
  DLSYM_FN (hsa_executable_destroy)
  DLSYM_FN (hsa_executable_create)
  DLSYM_FN (hsa_executable_global_variable_define)
  DLSYM_FN (hsa_executable_load_code_object)
  DLSYM_FN (hsa_executable_freeze)
  DLSYM_FN (hsa_signal_create)
  DLSYM_FN (hsa_memory_allocate)
  DLSYM_FN (hsa_memory_assign_agent)
  DLSYM_FN (hsa_memory_copy)
  DLSYM_FN (hsa_memory_free)
  DLSYM_FN (hsa_signal_destroy)
  DLSYM_FN (hsa_executable_get_symbol)
  DLSYM_FN (hsa_executable_symbol_get_info)
  DLSYM_FN (hsa_executable_iterate_symbols)
  DLSYM_FN (hsa_queue_add_write_index_release)
  DLSYM_FN (hsa_queue_load_read_index_acquire)
  DLSYM_FN (hsa_signal_wait_acquire)
  DLSYM_FN (hsa_signal_store_relaxed)
  DLSYM_FN (hsa_signal_store_release)
  DLSYM_FN (hsa_signal_load_acquire)
  DLSYM_FN (hsa_queue_destroy)
  DLSYM_FN (hsa_code_object_deserialize)
  DLSYM_OPT_FN (hsa_amd_memory_lock)
  DLSYM_OPT_FN (hsa_amd_memory_unlock)
  DLSYM_OPT_FN (hsa_amd_memory_async_copy_rect)
  return true;
#undef DLSYM_OPT_FN
#undef DLSYM_FN
}

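/* For reference, DLSYM_FN (hsa_init) above expands to roughly:

     hsa_fns.hsa_init_fn = dlsym (handle, "hsa_init");
     if (hsa_fns.hsa_init_fn == NULL)
       return false;

   so every required entry point must resolve, while DLSYM_OPT_FN leaves an
   optional entry point NULL for callers to test before use.  */
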
/* Return true if the agent is a GPU and can accept concurrent submissions
   from different threads.  */

static bool
suitable_hsa_agent_p (hsa_agent_t agent)
{
  hsa_device_type_t device_type;
  hsa_status_t status
    = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
				     &device_type);
  if (status != HSA_STATUS_SUCCESS)
    return false;

  switch (device_type)
    {
    case HSA_DEVICE_TYPE_GPU:
      break;
    case HSA_DEVICE_TYPE_CPU:
      if (!support_cpu_devices)
	return false;
      break;
    default:
      return false;
    }

  uint32_t features = 0;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_FEATURE,
					  &features);
  if (status != HSA_STATUS_SUCCESS
      || !(features & HSA_AGENT_FEATURE_KERNEL_DISPATCH))
    return false;
  hsa_queue_type_t queue_type;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_QUEUE_TYPE,
					  &queue_type);
  if (status != HSA_STATUS_SUCCESS
      || (queue_type != HSA_QUEUE_TYPE_MULTI))
    return false;

  return true;
}

/* Callback of hsa_iterate_agents; if AGENT is a GPU device, increment
   agent_count in hsa_context.  */

static hsa_status_t
count_gpu_agents (hsa_agent_t agent, void *data __attribute__ ((unused)))
{
  if (suitable_hsa_agent_p (agent))
    hsa_context.agent_count++;
  return HSA_STATUS_SUCCESS;
}

/* Callback of hsa_iterate_agents; if AGENT is a GPU device, assign the agent
   id to the describing structure in the hsa context.  The index of the
   structure is pointed to by DATA, increment it afterwards.  */

static hsa_status_t
assign_agent_ids (hsa_agent_t agent, void *data)
{
  if (suitable_hsa_agent_p (agent))
    {
      int *agent_index = (int *) data;
      hsa_context.agents[*agent_index].id = agent;
      ++*agent_index;
    }
  return HSA_STATUS_SUCCESS;
}

/* Initialize hsa_context if it has not already been done.
   Return TRUE on success.  */

static bool
init_hsa_context (void)
{
  hsa_status_t status;
  int agent_index = 0;

  if (hsa_context.initialized)
    return true;
  init_environment_variables ();
  if (!init_hsa_runtime_functions ())
    {
      GCN_WARNING ("Run-time could not be dynamically opened\n");
      if (suppress_host_fallback)
	GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
      return false;
    }
  status = hsa_fns.hsa_init_fn ();
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Run-time could not be initialized", status);
  GCN_DEBUG ("HSA run-time initialized for GCN\n");

  if (debug)
    dump_hsa_system_info ();

  status = hsa_fns.hsa_iterate_agents_fn (count_gpu_agents, NULL);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("GCN GPU devices could not be enumerated", status);
  GCN_DEBUG ("There are %i GCN GPU devices.\n", hsa_context.agent_count);

  hsa_context.agents
    = GOMP_PLUGIN_malloc_cleared (hsa_context.agent_count
				  * sizeof (struct agent_info));
  status = hsa_fns.hsa_iterate_agents_fn (assign_agent_ids, &agent_index);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Scanning compute agents failed", status);
  if (agent_index != hsa_context.agent_count)
    {
      GOMP_PLUGIN_error ("Failed to assign IDs to all GCN agents");
      return false;
    }

  if (debug)
    {
      status = hsa_fns.hsa_iterate_agents_fn (dump_hsa_agent_info, NULL);
      if (status != HSA_STATUS_SUCCESS)
	GOMP_PLUGIN_error ("Failed to list all HSA runtime agents");
    }

  uint16_t minor, major;
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MINOR,
					   &minor);
  if (status != HSA_STATUS_SUCCESS)
    GOMP_PLUGIN_error ("Failed to obtain HSA runtime minor version");
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MAJOR,
					   &major);
  if (status != HSA_STATUS_SUCCESS)
    GOMP_PLUGIN_error ("Failed to obtain HSA runtime major version");

  size_t len = sizeof hsa_context.driver_version_s;
  int printed = snprintf (hsa_context.driver_version_s, len,
			  "HSA Runtime %hu.%hu", (unsigned short int)major,
			  (unsigned short int)minor);
  if (printed >= len)
    GCN_WARNING ("HSA runtime version string was truncated. "
		 "Version %hu.%hu is too long.", (unsigned short int)major,
		 (unsigned short int)minor);

  hsa_context.initialized = true;
  return true;
}

/* Verify that hsa_context has already been initialized and return the
   agent_info structure describing device number N.  Return NULL on error.  */

static struct agent_info *
get_agent_info (int n)
{
  if (!hsa_context.initialized)
    {
      GOMP_PLUGIN_error ("Attempt to use uninitialized GCN context.");
      return NULL;
    }
  if (n >= hsa_context.agent_count)
    {
      GOMP_PLUGIN_error ("Request to operate on non-existent GCN device %i", n);
      return NULL;
    }
  if (!hsa_context.agents[n].initialized)
    {
      GOMP_PLUGIN_error ("Attempt to use an uninitialized GCN agent.");
      return NULL;
    }
  return &hsa_context.agents[n];
}

/* Callback of hsa_agent_iterate_regions, via get_*_memory_region functions.

   Selects (breaks at) a suitable region of type KIND.  */

static hsa_status_t
get_memory_region (hsa_region_t region, hsa_region_t *retval,
		   hsa_region_global_flag_t kind)
{
  hsa_status_t status;
  hsa_region_segment_t segment;

  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
					   &segment);
  if (status != HSA_STATUS_SUCCESS)
    return status;
  if (segment != HSA_REGION_SEGMENT_GLOBAL)
    return HSA_STATUS_SUCCESS;

  uint32_t flags;
  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
					   &flags);
  if (status != HSA_STATUS_SUCCESS)
    return status;
  if (flags & kind)
    {
      *retval = region;
      return HSA_STATUS_INFO_BREAK;
    }
  return HSA_STATUS_SUCCESS;
}

/* Callback of hsa_agent_iterate_regions.

   Selects a kernargs memory region.  */

static hsa_status_t
get_kernarg_memory_region (hsa_region_t region, void *data)
{
  return get_memory_region (region, (hsa_region_t *)data,
			    HSA_REGION_GLOBAL_FLAG_KERNARG);
}

/* Callback of hsa_agent_iterate_regions.

   Selects a coarse-grained memory region suitable for the heap and
   offload data.  */

static hsa_status_t
get_data_memory_region (hsa_region_t region, void *data)
{
  return get_memory_region (region, (hsa_region_t *)data,
			    HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED);
}

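/* A minimal sketch (illustrative only; the function name is hypothetical) of
   how the callbacks above are driven: iteration stops when the callback
   returns HSA_STATUS_INFO_BREAK, leaving the selected region in *REGION.  */

static bool __attribute__ ((unused))
example_find_kernarg_region (hsa_agent_t agent, hsa_region_t *region)
{
  hsa_status_t status
    = hsa_fns.hsa_agent_iterate_regions_fn (agent, get_kernarg_memory_region,
					    region);
  return status == HSA_STATUS_INFO_BREAK;
}
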
/* Return the ISA code encoded in the e_flags of a GCN ELF image.  */

static int
elf_gcn_isa_field (Elf64_Ehdr *image)
{
  return image->e_flags & EF_AMDGPU_MACH_MASK;
}

const static char *gcn_gfx803_s = "gfx803";
const static char *gcn_gfx900_s = "gfx900";
const static char *gcn_gfx906_s = "gfx906";
const static char *gcn_gfx908_s = "gfx908";
const static char *gcn_gfx90a_s = "gfx90a";
const static char *gcn_gfx1030_s = "gfx1030";
const static char *gcn_gfx1100_s = "gfx1100";
const static int gcn_isa_name_len = 7;

/* Returns the name that the HSA runtime uses for the ISA or NULL if we do not
   support the ISA.  */

static const char*
isa_hsa_name (int isa)
{
  switch (isa)
    {
    case EF_AMDGPU_MACH_AMDGCN_GFX803:
      return gcn_gfx803_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX900:
      return gcn_gfx900_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX906:
      return gcn_gfx906_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX908:
      return gcn_gfx908_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX90a:
      return gcn_gfx90a_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX1030:
      return gcn_gfx1030_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX1100:
      return gcn_gfx1100_s;
    }
  return NULL;
}

/* Returns the user-facing name that GCC uses to identify the architecture
   (e.g. with -march) or NULL if we do not support the ISA.
   Keep in sync with /gcc/config/gcn/gcn.{c,opt}.  */

static const char*
isa_gcc_name (int isa)
{
  switch (isa)
    {
    case EF_AMDGPU_MACH_AMDGCN_GFX803:
      return "fiji";
    default:
      return isa_hsa_name (isa);
    }
}

/* Returns the code which is used in the GCN object code to identify the ISA
   with the given name (as used by the HSA runtime).  */

static gcn_isa
isa_code (const char *isa)
{
  if (!strncmp (isa, gcn_gfx803_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX803;

  if (!strncmp (isa, gcn_gfx900_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX900;

  if (!strncmp (isa, gcn_gfx906_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX906;

  if (!strncmp (isa, gcn_gfx908_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX908;

  if (!strncmp (isa, gcn_gfx90a_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX90a;

  if (!strncmp (isa, gcn_gfx1030_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX1030;

  if (!strncmp (isa, gcn_gfx1100_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX1100;

  return -1;
}

/* CDNA2 devices have twice as many VGPRs compared to older devices.  */

static int
max_isa_vgprs (int isa)
{
  switch (isa)
    {
    case EF_AMDGPU_MACH_AMDGCN_GFX803:
    case EF_AMDGPU_MACH_AMDGCN_GFX900:
    case EF_AMDGPU_MACH_AMDGCN_GFX906:
    case EF_AMDGPU_MACH_AMDGCN_GFX908:
    case EF_AMDGPU_MACH_AMDGCN_GFX1030:
    case EF_AMDGPU_MACH_AMDGCN_GFX1100:
      return 256;
    case EF_AMDGPU_MACH_AMDGCN_GFX90a:
      return 512;
    }
  GOMP_PLUGIN_fatal ("unhandled ISA in max_isa_vgprs");
}

/* }}}  */
/* {{{ Run  */

1756 /* Create or reuse a team arena and stack space.
1757
1758 Team arenas are used by OpenMP to avoid calling malloc multiple times
1759 while setting up each team. This is purely a performance optimization.
1760
1761 The stack space is used by all kernels. We must allocate it in such a
1762 way that the reverse offload implmentation can access the data.
1763
1764 Allocating this memory costs performance, so this function will reuse an
1765 existing allocation if a large enough one is idle.
1766 The memory lock is released, but not deallocated, when the kernel exits. */
1767
1768 static void
1769 configure_ephemeral_memories (struct kernel_info *kernel,
1770 struct kernargs_abi *kernargs, int num_teams,
1771 int num_threads)
1772 {
1773 struct agent_info *agent = kernel->agent;
1774 struct ephemeral_memories_list **next_ptr = &agent->ephemeral_memories_list;
1775 struct ephemeral_memories_list *item;
1776
1777 int actual_arena_size = (kernel->kind == KIND_OPENMP
1778 ? team_arena_size : 0);
1779 int actual_arena_total_size = actual_arena_size * num_teams;
1780 size_t size = (actual_arena_total_size
1781 + num_teams * num_threads * stack_size);
1782
1783 for (item = *next_ptr; item; next_ptr = &item->next, item = item->next)
1784 {
1785 if (item->size < size)
1786 continue;
1787
1788 if (pthread_mutex_trylock (&item->in_use) == 0)
1789 break;
1790 }
1791
1792 if (!item)
1793 {
1794 GCN_DEBUG ("Creating a new %sstack for %d teams with %d threads"
1795 " (%zd bytes)\n", (actual_arena_size ? "arena and " : ""),
1796 num_teams, num_threads, size);
1797
1798 if (pthread_mutex_lock (&agent->ephemeral_memories_write_lock))
1799 {
1800 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
1801 return;
1802 }
1803 item = malloc (sizeof (*item));
1804 item->size = size;
1805 item->next = NULL;
1806 *next_ptr = item;
1807
1808 if (pthread_mutex_init (&item->in_use, NULL))
1809 {
1810 GOMP_PLUGIN_error ("Failed to initialize a GCN memory write mutex");
1811 return;
1812 }
1813 if (pthread_mutex_lock (&item->in_use))
1814 {
1815 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
1816 return;
1817 }
1818 if (pthread_mutex_unlock (&agent->ephemeral_memories_write_lock))
1819 {
1820 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1821 return;
1822 }
1823
1824 hsa_status_t status;
1825 status = hsa_fns.hsa_memory_allocate_fn (agent->data_region, size,
1826 &item->address);
1827 if (status != HSA_STATUS_SUCCESS)
1828 hsa_fatal ("Could not allocate memory for GCN kernel arena", status);
1829 status = hsa_fns.hsa_memory_assign_agent_fn (item->address, agent->id,
1830 HSA_ACCESS_PERMISSION_RW);
1831 if (status != HSA_STATUS_SUCCESS)
1832 hsa_fatal ("Could not assign arena & stack memory to device", status);
1833 }
1834
1835 kernargs->arena_ptr = (actual_arena_total_size
1836 ? (uint64_t)item->address
1837 : 0);
1838 kernargs->stack_ptr = (uint64_t)item->address + actual_arena_total_size;
1839 kernargs->arena_size_per_team = actual_arena_size;
1840 kernargs->stack_size_per_thread = stack_size;
1841 }
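/* Illustrative layout only (a reading of the code above, not separate ABI
   documentation): for an OpenMP kernel with num_teams == 4,
   num_threads == 16, team_arena_size == 64KB and stack_size == 1KB, the
   single allocation is

     [team 0 arena][team 1 arena][team 2 arena][team 3 arena][64 stacks]

   with kernargs->arena_ptr at the base and kernargs->stack_ptr at
   base + 4 * 64KB; each of the 4 * 16 threads gets a 1KB stack slice.  */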
1842
1843 /* Mark an ephemeral memory space as available for reuse. */
1844
1845 static void
1846 release_ephemeral_memories (struct agent_info* agent, void *address)
1847 {
1848 struct ephemeral_memories_list *item;
1849
1850 for (item = agent->ephemeral_memories_list; item; item = item->next)
1851 {
1852 if (item->address == address)
1853 {
1854 if (pthread_mutex_unlock (&item->in_use))
1855 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1856 return;
1857 }
1858 }
1859 GOMP_PLUGIN_error ("Could not find a GCN arena to release.");
1860 }
1861
1862 /* Clean up all the allocated team arenas and stack spaces. */
1863
1864 static bool
1865 destroy_ephemeral_memories (struct agent_info *agent)
1866 {
1867 struct ephemeral_memories_list *item, *next;
1868
1869 for (item = agent->ephemeral_memories_list; item; item = next)
1870 {
1871 next = item->next;
1872 hsa_fns.hsa_memory_free_fn (item->address);
1873 if (pthread_mutex_destroy (&item->in_use))
1874 {
1875 GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex");
1876 return false;
1877 }
1878 free (item);
1879 }
1880 agent->ephemeral_memories_list = NULL;
1881
1882 return true;
1883 }
1884
1885 /* Allocate memory on a specified device. */
1886
1887 static void *
1888 alloc_by_agent (struct agent_info *agent, size_t size)
1889 {
1890 GCN_DEBUG ("Allocating %zu bytes on device %d\n", size, agent->device_id);
1891
1892 void *ptr;
1893 hsa_status_t status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
1894 size, &ptr);
1895 if (status != HSA_STATUS_SUCCESS)
1896 {
1897 hsa_error ("Could not allocate device memory", status);
1898 return NULL;
1899 }
1900
1901 status = hsa_fns.hsa_memory_assign_agent_fn (ptr, agent->id,
1902 HSA_ACCESS_PERMISSION_RW);
1903 if (status != HSA_STATUS_SUCCESS)
1904 {
1905 hsa_error ("Could not assign data memory to device", status);
1906 return NULL;
1907 }
1908
1909 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1910 bool profiling_dispatch_p
1911 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1912 if (profiling_dispatch_p)
1913 {
1914 acc_prof_info *prof_info = thr->prof_info;
1915 acc_event_info data_event_info;
1916 acc_api_info *api_info = thr->api_info;
1917
1918 prof_info->event_type = acc_ev_alloc;
1919
1920 data_event_info.data_event.event_type = prof_info->event_type;
1921 data_event_info.data_event.valid_bytes
1922 = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1923 data_event_info.data_event.parent_construct
1924 = acc_construct_parallel;
1925 data_event_info.data_event.implicit = 1;
1926 data_event_info.data_event.tool_info = NULL;
1927 data_event_info.data_event.var_name = NULL;
1928 data_event_info.data_event.bytes = size;
1929 data_event_info.data_event.host_ptr = NULL;
1930 data_event_info.data_event.device_ptr = (void *) ptr;
1931
1932 api_info->device_api = acc_device_api_other;
1933
1934 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1935 api_info);
1936 }
1937
1938 return ptr;
1939 }
1940
1941 /* Create kernel dispatch data structure for given KERNEL, along with
1942 the necessary device signals and memory allocations. */
1943
1944 static struct kernel_dispatch *
1945 create_kernel_dispatch (struct kernel_info *kernel, int num_teams,
1946 int num_threads)
1947 {
1948 struct agent_info *agent = kernel->agent;
1949 struct kernel_dispatch *shadow
1950 = GOMP_PLUGIN_malloc_cleared (sizeof (struct kernel_dispatch));
1951
1952 shadow->agent = kernel->agent;
1953 shadow->object = kernel->object;
1954
1955 hsa_signal_t sync_signal;
1956 hsa_status_t status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &sync_signal);
1957 if (status != HSA_STATUS_SUCCESS)
1958 hsa_fatal ("Error creating the GCN sync signal", status);
1959
1960 shadow->signal = sync_signal.handle;
1961 shadow->private_segment_size = kernel->private_segment_size;
1962
1963 if (lowlat_size < 0)
1964 {
1965 /* Divide the LDS between the number of running teams, but allocate
1966 no less than the kernel metadata requires. */
1967 int teams_per_cu = num_teams / get_cu_count (agent);
1968 int LDS_per_team = (teams_per_cu ? 65536 / teams_per_cu : 65536);
1969 shadow->group_segment_size
1970 = (kernel->group_segment_size > LDS_per_team
1971 ? kernel->group_segment_size
1972 : LDS_per_team);
1973 }
1974 else if (lowlat_size < GCN_LOWLAT_HEAP+8)
1975 /* Ensure that there's space for the OpenMP libgomp data. */
1976 shadow->group_segment_size = GCN_LOWLAT_HEAP+8;
1977 else
1978 shadow->group_segment_size = (lowlat_size > 65536
1979 ? 65536
1980 : lowlat_size);
1981
1982 /* We expect kernels to request a single pointer, explicitly, and the
1983 rest of struct kernargs, implicitly. If they request anything else
1984 then something is wrong. */
1985 if (kernel->kernarg_segment_size > 8)
1986 {
1987 GOMP_PLUGIN_fatal ("Unexpectedly large kernargs segment requested");
1988 return NULL;
1989 }
1990
1991 status = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region,
1992 sizeof (struct kernargs),
1993 &shadow->kernarg_address);
1994 if (status != HSA_STATUS_SUCCESS)
1995 hsa_fatal ("Could not allocate memory for GCN kernel arguments", status);
1996 struct kernargs *kernargs = shadow->kernarg_address;
1997
1998 /* Zero-initialize the output_data (minimum needed). */
1999 kernargs->abi.out_ptr = (int64_t)&kernargs->output_data;
2000 kernargs->output_data.next_output = 0;
2001 for (unsigned i = 0;
2002 i < (sizeof (kernargs->output_data.queue)
2003 / sizeof (kernargs->output_data.queue[0]));
2004 i++)
2005 kernargs->output_data.queue[i].written = 0;
2006 kernargs->output_data.consumed = 0;
2007
2008 /* Pass in the heap location. */
2009 kernargs->abi.heap_ptr = (int64_t)kernel->module->heap;
2010
2011 /* Create the ephemeral memory spaces. */
2012 configure_ephemeral_memories (kernel, &kernargs->abi, num_teams, num_threads);
2013
2014 /* Ensure we can recognize unset return values. */
2015 kernargs->output_data.return_value = 0xcafe0000;
2016
2017 return shadow;
2018 }
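/* For orientation (a sketch based on the code above, not authoritative ABI
   documentation): the kernarg segment is a single struct kernargs; its
   first 8 bytes receive the kernel's one explicit pointer argument (see
   the memcpy of VARS in run_kernel below), while the device code reaches
   the implicit fields (out_ptr, heap_ptr, arena_ptr, stack_ptr, ...) at
   fixed offsets from that same base address.  */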
2019
2020 static void
2021 process_reverse_offload (uint64_t fn, uint64_t mapnum, uint64_t hostaddrs,
2022 uint64_t sizes, uint64_t kinds, uint64_t dev_num64)
2023 {
2024 int dev_num = dev_num64;
2025 GOMP_PLUGIN_target_rev (fn, mapnum, hostaddrs, sizes, kinds, dev_num,
2026 NULL);
2027 }
2028
2029 /* Output any data the kernel has written to the console. It is expected
2030 that this function is polled during kernel execution.
2031
2032 We print all entries from the last item printed to the next entry without
2033 a "written" flag. If the "final" flag is set then it'll continue right to
2034 the end.
2035
2036 The print buffer is circular, but the from and to locations don't wrap when
2037 the buffer does, so the output limit is UINT_MAX. The target blocks on
2038 output when the buffer is full. */
2039
2040 static void
2041 console_output (struct kernel_info *kernel, struct kernargs *kernargs,
2042 bool final)
2043 {
2044 unsigned int limit = (sizeof (kernargs->output_data.queue)
2045 / sizeof (kernargs->output_data.queue[0]));
2046
2047 unsigned int from = __atomic_load_n (&kernargs->output_data.consumed,
2048 __ATOMIC_ACQUIRE);
2049 unsigned int to = kernargs->output_data.next_output;
2050
2051 if (from > to)
2052 {
2053 /* Overflow. */
2054 if (final)
2055 printf ("GCN print buffer overflowed.\n");
2056 return;
2057 }
2058
2059 unsigned int i;
2060 for (i = from; i < to; i++)
2061 {
2062 struct printf_data *data = &kernargs->output_data.queue[i%limit];
2063
2064 if (!data->written && !final)
2065 break;
2066
2067 switch (data->type)
2068 {
2069 case 0: printf ("%.128s%ld\n", data->msg, data->ivalue); break;
2070 case 1: printf ("%.128s%f\n", data->msg, data->dvalue); break;
2071 case 2: printf ("%.128s%.128s\n", data->msg, data->text); break;
2072 case 3: printf ("%.128s%.128s", data->msg, data->text); break;
2073 case 4:
2074 process_reverse_offload (data->value_u64[0], data->value_u64[1],
2075 data->value_u64[2], data->value_u64[3],
2076 data->value_u64[4], data->value_u64[5]);
2077 break;
2078 default: printf ("GCN print buffer error!\n"); break;
2079 }
2080 data->written = 0;
2081 __atomic_store_n (&kernargs->output_data.consumed, i+1,
2082 __ATOMIC_RELEASE);
2083 }
2084 fflush (stdout);
2085 }
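/* Worked example of the index arithmetic above (illustrative values):
   with limit == 1024, consumed == 1030 and next_output == 1033, entries
   1030..1032 are printed from slots 1030 % 1024 == 6 onwards. The "from"
   and "to" counters keep counting past the buffer size and only the slot
   number wraps, so from > to can only mean the 32-bit counters
   themselves overflowed.  */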
2086
2087 /* Release the data structure created for a kernel dispatch in the SHADOW
2088 argument, and clean up its signal and memory allocations. */
2089
2090 static void
2091 release_kernel_dispatch (struct kernel_dispatch *shadow)
2092 {
2093 GCN_DEBUG ("Released kernel dispatch: %p\n", shadow);
2094
2095 struct kernargs *kernargs = shadow->kernarg_address;
2096 void *addr = (void *)kernargs->abi.arena_ptr;
2097 if (!addr)
2098 addr = (void *)kernargs->abi.stack_ptr;
2099 release_ephemeral_memories (shadow->agent, addr);
2100
2101 hsa_fns.hsa_memory_free_fn (shadow->kernarg_address);
2102
2103 hsa_signal_t s;
2104 s.handle = shadow->signal;
2105 hsa_fns.hsa_signal_destroy_fn (s);
2106
2107 free (shadow);
2108 }
2109
2110 /* Extract the properties from a kernel binary. */
2111
2112 static void
2113 init_kernel_properties (struct kernel_info *kernel)
2114 {
2115 hsa_status_t status;
2116 struct agent_info *agent = kernel->agent;
2117 hsa_executable_symbol_t kernel_symbol;
2118 char *buf = alloca (strlen (kernel->name) + 4);
2119 sprintf (buf, "%s.kd", kernel->name);
2120 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
2121 buf, agent->id,
2122 0, &kernel_symbol);
2123 if (status != HSA_STATUS_SUCCESS)
2124 {
2125 hsa_warn ("Could not find symbol for kernel in the code object", status);
2126 fprintf (stderr, "not found name: '%s'\n", buf);
2127 dump_executable_symbols (agent->executable);
2128 goto failure;
2129 }
2130 GCN_DEBUG ("Located kernel %s\n", kernel->name);
2131 status = hsa_fns.hsa_executable_symbol_get_info_fn
2132 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel->object);
2133 if (status != HSA_STATUS_SUCCESS)
2134 hsa_fatal ("Could not extract a kernel object from its symbol", status);
2135 status = hsa_fns.hsa_executable_symbol_get_info_fn
2136 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
2137 &kernel->kernarg_segment_size);
2138 if (status != HSA_STATUS_SUCCESS)
2139 hsa_fatal ("Could not get info about kernel argument size", status);
2140 status = hsa_fns.hsa_executable_symbol_get_info_fn
2141 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
2142 &kernel->group_segment_size);
2143 if (status != HSA_STATUS_SUCCESS)
2144 hsa_fatal ("Could not get info about kernel group segment size", status);
2145 status = hsa_fns.hsa_executable_symbol_get_info_fn
2146 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
2147 &kernel->private_segment_size);
2148 if (status != HSA_STATUS_SUCCESS)
2149 hsa_fatal ("Could not get info about kernel private segment size",
2150 status);
2151
2152 /* The kernel type is not known until something tries to launch it. */
2153 kernel->kind = KIND_UNKNOWN;
2154
2155 GCN_DEBUG ("Kernel structure for %s fully initialized with "
2156 "following segment sizes: \n", kernel->name);
2157 GCN_DEBUG (" group_segment_size: %u\n",
2158 (unsigned) kernel->group_segment_size);
2159 GCN_DEBUG (" private_segment_size: %u\n",
2160 (unsigned) kernel->private_segment_size);
2161 GCN_DEBUG (" kernarg_segment_size: %u\n",
2162 (unsigned) kernel->kernarg_segment_size);
2163 return;
2164
2165 failure:
2166 kernel->initialization_failed = true;
2167 }
2168
2169 /* Do all the work that is necessary before running KERNEL for the first time.
2170 The function assumes the program has been created, finalized and frozen by
2171 create_and_finalize_hsa_program. */
2172
2173 static void
2174 init_kernel (struct kernel_info *kernel)
2175 {
2176 if (pthread_mutex_lock (&kernel->init_mutex))
2177 GOMP_PLUGIN_fatal ("Could not lock a GCN kernel initialization mutex");
2178 if (kernel->initialized)
2179 {
2180 if (pthread_mutex_unlock (&kernel->init_mutex))
2181 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2182 "mutex");
2183
2184 return;
2185 }
2186
2187 init_kernel_properties (kernel);
2188
2189 if (!kernel->initialization_failed)
2190 {
2191 GCN_DEBUG ("\n");
2192
2193 kernel->initialized = true;
2194 }
2195 if (pthread_mutex_unlock (&kernel->init_mutex))
2196 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2197 "mutex");
2198 }
2199
2200 /* Run KERNEL on its agent, pass VARS to it as arguments and take
2201 launch attributes from KLA.
2202
2203 MODULE_LOCKED indicates that the caller already holds the lock and
2204 run_kernel need not lock it again.
2205 If AQ is NULL then agent->sync_queue will be used. */
2206
2207 static void
2208 run_kernel (struct kernel_info *kernel, void *vars,
2209 struct GOMP_kernel_launch_attributes *kla,
2210 struct goacc_asyncqueue *aq, bool module_locked)
2211 {
2212 struct agent_info *agent = kernel->agent;
2213 GCN_DEBUG ("SGPRs: %d, VGPRs: %d\n", kernel->description->sgpr_count,
2214 kernel->description->vpgr_count);
2215
2216 /* Reduce the number of threads/workers if there are insufficient
2217 VGPRs available to run the kernels together. */
2218 if (kla->ndim == 3 && kernel->description->vpgr_count > 0)
2219 {
2220 int max_vgprs = max_isa_vgprs (agent->device_isa);
2221 int granulated_vgprs = (kernel->description->vpgr_count + 3) & ~3;
2222 int max_threads = (max_vgprs / granulated_vgprs) * 4;
2223 if (kla->gdims[2] > max_threads)
2224 {
2225 GCN_WARNING ("Too many VGPRs required to support %d threads/workers"
2226 " per team/gang - reducing to %d threads/workers.\n",
2227 kla->gdims[2], max_threads);
2228 kla->gdims[2] = max_threads;
2229 }
2230 }
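/* Worked example of the clamp above (illustrative numbers only): a
   kernel needing 30 VGPRs is granulated up to 32; with 256 VGPRs
   available that allows (256 / 32) * 4 == 32 threads/workers, so a
   request for 64 workers per team would be reduced to 32.  */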
2231
2232 GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel->agent->device_id,
2233 (aq ? aq->id : 0));
2234 GCN_DEBUG ("GCN launch attribs: gdims:[");
2235 int i;
2236 for (i = 0; i < kla->ndim; ++i)
2237 {
2238 if (i)
2239 DEBUG_PRINT (", ");
2240 DEBUG_PRINT ("%u", kla->gdims[i]);
2241 }
2242 DEBUG_PRINT ("], normalized gdims:[");
2243 for (i = 0; i < kla->ndim; ++i)
2244 {
2245 if (i)
2246 DEBUG_PRINT (", ");
2247 DEBUG_PRINT ("%u", kla->gdims[i] / kla->wdims[i]);
2248 }
2249 DEBUG_PRINT ("], wdims:[");
2250 for (i = 0; i < kla->ndim; ++i)
2251 {
2252 if (i)
2253 DEBUG_PRINT (", ");
2254 DEBUG_PRINT ("%u", kla->wdims[i]);
2255 }
2256 DEBUG_PRINT ("]\n");
2257 DEBUG_FLUSH ();
2258
2259 if (!module_locked && pthread_rwlock_rdlock (&agent->module_rwlock))
2260 GOMP_PLUGIN_fatal ("Unable to read-lock a GCN agent rwlock");
2261
2262 if (!agent->initialized)
2263 GOMP_PLUGIN_fatal ("Agent must be initialized");
2264
2265 if (!kernel->initialized)
2266 GOMP_PLUGIN_fatal ("Called kernel must be initialized");
2267
2268 hsa_queue_t *command_q = (aq ? aq->hsa_queue : kernel->agent->sync_queue);
2269
2270 uint64_t index
2271 = hsa_fns.hsa_queue_add_write_index_release_fn (command_q, 1);
2272 GCN_DEBUG ("Got AQL index %llu\n", (long long int) index);
2273
2274 /* Wait until the queue is not full before writing the packet. */
2275 while (index - hsa_fns.hsa_queue_load_read_index_acquire_fn (command_q)
2276 >= command_q->size)
2277 ;
2278
2279 /* Do not allow the dimensions to be overridden when running
2280 constructors or destructors. */
2281 int override_x = kernel->kind == KIND_UNKNOWN ? 0 : override_x_dim;
2282 int override_z = kernel->kind == KIND_UNKNOWN ? 0 : override_z_dim;
2283
2284 hsa_kernel_dispatch_packet_t *packet;
2285 packet = ((hsa_kernel_dispatch_packet_t *) command_q->base_address)
2286 + index % command_q->size;
2287
2288 memset (((uint8_t *) packet) + 4, 0, sizeof (*packet) - 4);
2289 packet->grid_size_x = override_x ? : kla->gdims[0];
2290 packet->workgroup_size_x = get_group_size (kla->ndim,
2291 packet->grid_size_x,
2292 kla->wdims[0]);
2293
2294 if (kla->ndim >= 2)
2295 {
2296 packet->grid_size_y = kla->gdims[1];
2297 packet->workgroup_size_y = get_group_size (kla->ndim, kla->gdims[1],
2298 kla->wdims[1]);
2299 }
2300 else
2301 {
2302 packet->grid_size_y = 1;
2303 packet->workgroup_size_y = 1;
2304 }
2305
2306 if (kla->ndim == 3)
2307 {
2308 packet->grid_size_z = limit_worker_threads (override_z
2309 ? : kla->gdims[2]);
2310 packet->workgroup_size_z = get_group_size (kla->ndim,
2311 packet->grid_size_z,
2312 kla->wdims[2]);
2313 }
2314 else
2315 {
2316 packet->grid_size_z = 1;
2317 packet->workgroup_size_z = 1;
2318 }
2319
2320 GCN_DEBUG ("GCN launch actuals: grid:[%u, %u, %u],"
2321 " normalized grid:[%u, %u, %u], workgroup:[%u, %u, %u]\n",
2322 packet->grid_size_x, packet->grid_size_y, packet->grid_size_z,
2323 packet->grid_size_x / packet->workgroup_size_x,
2324 packet->grid_size_y / packet->workgroup_size_y,
2325 packet->grid_size_z / packet->workgroup_size_z,
2326 packet->workgroup_size_x, packet->workgroup_size_y,
2327 packet->workgroup_size_z);
2328
2329 struct kernel_dispatch *shadow
2330 = create_kernel_dispatch (kernel, packet->grid_size_x,
2331 packet->grid_size_z);
2332 shadow->queue = command_q;
2333
2334 if (debug)
2335 {
2336 fprintf (stderr, "\nKernel has following dependencies:\n");
2337 print_kernel_dispatch (shadow, 2);
2338 }
2339
2340 packet->private_segment_size = shadow->private_segment_size;
2341 packet->group_segment_size = shadow->group_segment_size;
2342 packet->kernel_object = shadow->object;
2343 packet->kernarg_address = shadow->kernarg_address;
2344 hsa_signal_t s;
2345 s.handle = shadow->signal;
2346 packet->completion_signal = s;
2347 hsa_fns.hsa_signal_store_relaxed_fn (s, 1);
2348 memcpy (shadow->kernarg_address, &vars, sizeof (vars));
2349
2350 GCN_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");
2351
2352 uint16_t header;
2353 header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
2354 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
2355 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
2356
2357 GCN_DEBUG ("Going to dispatch kernel %s on device %d\n", kernel->name,
2358 agent->device_id);
2359
2360 packet_store_release ((uint32_t *) packet, header,
2361 (uint16_t) kla->ndim
2362 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);
2363
2364 hsa_fns.hsa_signal_store_release_fn (command_q->doorbell_signal,
2365 index);
2366
2367 GCN_DEBUG ("Kernel dispatched, waiting for completion\n");
2368
2369 /* Root signal waits with 1ms timeout. */
2370 while (hsa_fns.hsa_signal_wait_acquire_fn (s, HSA_SIGNAL_CONDITION_LT, 1,
2371 1000 * 1000,
2372 HSA_WAIT_STATE_BLOCKED) != 0)
2373 {
2374 console_output (kernel, shadow->kernarg_address, false);
2375 }
2376 console_output (kernel, shadow->kernarg_address, true);
2377
2378 struct kernargs *kernargs = shadow->kernarg_address;
2379 unsigned int return_value = (unsigned int)kernargs->output_data.return_value;
2380
2381 release_kernel_dispatch (shadow);
2382
2383 if (!module_locked && pthread_rwlock_unlock (&agent->module_rwlock))
2384 GOMP_PLUGIN_fatal ("Unable to unlock a GCN agent rwlock");
2385
2386 unsigned int upper = (return_value & ~0xffff) >> 16;
2387 if (upper == 0xcafe)
2388 ; // exit not called, normal termination.
2389 else if (upper == 0xffff)
2390 ; // exit called.
2391 else
2392 {
2393 GOMP_PLUGIN_error ("Possible kernel exit value corruption, 2 most"
2394 " significant bytes aren't 0xffff or 0xcafe: 0x%x\n",
2395 return_value);
2396 abort ();
2397 }
2398
2399 if (upper == 0xffff)
2400 {
2401 unsigned int signal = (return_value >> 8) & 0xff;
2402
2403 if (signal == SIGABRT)
2404 {
2405 GCN_WARNING ("GCN Kernel aborted\n");
2406 abort ();
2407 }
2408 else if (signal != 0)
2409 {
2410 GCN_WARNING ("GCN Kernel received unknown signal\n");
2411 abort ();
2412 }
2413
2414 GCN_DEBUG ("GCN Kernel exited with value: %d\n", return_value & 0xff);
2415 exit (return_value & 0xff);
2416 }
2417 }
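/* Illustrative decoding of the return-value convention checked above:
   0xcafe0000                       exit was never called; normal
                                    termination.
   0xffff0000 | (sig << 8) | code   exit path: bits 8-15 carry a signal
                                    number (SIGABRT aborts the host),
                                    bits 0-7 the exit status, e.g.
                                    0xffff0002 calls exit (2) on the
                                    host.  */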
2418
2419 /* }}} */
2420 /* {{{ Load/Unload */
2421
2422 /* Initialize KERNEL from D and other parameters. Return true on success. */
2423
2424 static bool
2425 init_basic_kernel_info (struct kernel_info *kernel,
2426 struct hsa_kernel_description *d,
2427 struct agent_info *agent,
2428 struct module_info *module)
2429 {
2430 kernel->agent = agent;
2431 kernel->module = module;
2432 kernel->name = d->name;
2433 kernel->description = d;
2434 if (pthread_mutex_init (&kernel->init_mutex, NULL))
2435 {
2436 GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");
2437 return false;
2438 }
2439 return true;
2440 }
2441
2442 /* Check that the GCN ISA of the given image matches the ISA of the agent. */
2443
2444 static bool
2445 isa_matches_agent (struct agent_info *agent, Elf64_Ehdr *image)
2446 {
2447 int isa_field = elf_gcn_isa_field (image);
2448 const char* isa_s = isa_hsa_name (isa_field);
2449 if (!isa_s)
2450 {
2451 hsa_error ("Unsupported ISA in GCN code object.", HSA_STATUS_ERROR);
2452 return false;
2453 }
2454
2455 if (isa_field != agent->device_isa)
2456 {
2457 char msg[120];
2458 const char *agent_isa_s = isa_hsa_name (agent->device_isa);
2459 const char *agent_isa_gcc_s = isa_gcc_name (agent->device_isa);
2460 assert (agent_isa_s);
2461 assert (agent_isa_gcc_s);
2462
2463 snprintf (msg, sizeof msg,
2464 "GCN code object ISA '%s' does not match GPU ISA '%s'.\n"
2465 "Try to recompile with '-foffload-options=-march=%s'.\n",
2466 isa_s, agent_isa_s, agent_isa_gcc_s);
2467
2468 hsa_error (msg, HSA_STATUS_ERROR);
2469 return false;
2470 }
2471
2472 return true;
2473 }
2474
2475 /* Create and finalize the program consisting of all loaded modules. */
2476
2477 static bool
2478 create_and_finalize_hsa_program (struct agent_info *agent)
2479 {
2480 hsa_status_t status;
2481 bool res = true;
2482 if (pthread_mutex_lock (&agent->prog_mutex))
2483 {
2484 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
2485 return false;
2486 }
2487 if (agent->prog_finalized)
2488 goto final;
2489
2490 status
2491 = hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
2492 HSA_EXECUTABLE_STATE_UNFROZEN,
2493 "", &agent->executable);
2494 if (status != HSA_STATUS_SUCCESS)
2495 {
2496 hsa_error ("Could not create GCN executable", status);
2497 goto fail;
2498 }
2499
2500 /* Load any GCN modules. */
2501 struct module_info *module = agent->module;
2502 if (module)
2503 {
2504 Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;
2505
2506 if (!isa_matches_agent (agent, image))
2507 goto fail;
2508
2509 hsa_code_object_t co = { 0 };
2510 status = hsa_fns.hsa_code_object_deserialize_fn
2511 (module->image_desc->gcn_image->image,
2512 module->image_desc->gcn_image->size,
2513 NULL, &co);
2514 if (status != HSA_STATUS_SUCCESS)
2515 {
2516 hsa_error ("Could not deserialize GCN code object", status);
2517 goto fail;
2518 }
2519
2520 status = hsa_fns.hsa_executable_load_code_object_fn
2521 (agent->executable, agent->id, co, "");
2522 if (status != HSA_STATUS_SUCCESS)
2523 {
2524 hsa_error ("Could not load GCN code object", status);
2525 goto fail;
2526 }
2527
2528 if (!module->heap)
2529 {
2530 status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
2531 gcn_kernel_heap_size,
2532 (void**)&module->heap);
2533 if (status != HSA_STATUS_SUCCESS)
2534 {
2535 hsa_error ("Could not allocate memory for GCN heap", status);
2536 goto fail;
2537 }
2538
2539 status = hsa_fns.hsa_memory_assign_agent_fn
2540 (module->heap, agent->id, HSA_ACCESS_PERMISSION_RW);
2541 if (status != HSA_STATUS_SUCCESS)
2542 {
2543 hsa_error ("Could not assign GCN heap memory to device", status);
2544 goto fail;
2545 }
2546
2547 hsa_fns.hsa_memory_copy_fn (&module->heap->size,
2548 &gcn_kernel_heap_size,
2549 sizeof (gcn_kernel_heap_size));
2550 }
2551
2552 }
2553
2554 if (debug)
2555 dump_executable_symbols (agent->executable);
2556
2557 status = hsa_fns.hsa_executable_freeze_fn (agent->executable, "");
2558 if (status != HSA_STATUS_SUCCESS)
2559 {
2560 hsa_error ("Could not freeze the GCN executable", status);
2561 goto fail;
2562 }
2563
2564 final:
2565 agent->prog_finalized = true;
2566
2567 if (pthread_mutex_unlock (&agent->prog_mutex))
2568 {
2569 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
2570 res = false;
2571 }
2572
2573 return res;
2574
2575 fail:
2576 res = false;
2577 goto final;
2578 }
2579
2580 /* Free the HSA program in agent and everything associated with it and set
2581 agent->prog_finalized and the initialized flags of all kernels to false.
2582 Return TRUE on success. */
2583
2584 static bool
2585 destroy_hsa_program (struct agent_info *agent)
2586 {
2587 if (!agent->prog_finalized)
2588 return true;
2589
2590 hsa_status_t status;
2591
2592 GCN_DEBUG ("Destroying the current GCN program.\n");
2593
2594 status = hsa_fns.hsa_executable_destroy_fn (agent->executable);
2595 if (status != HSA_STATUS_SUCCESS)
2596 return hsa_error ("Could not destroy GCN executable", status);
2597
2598 if (agent->module)
2599 {
2600 int i;
2601 for (i = 0; i < agent->module->kernel_count; i++)
2602 agent->module->kernels[i].initialized = false;
2603
2604 if (agent->module->heap)
2605 {
2606 hsa_fns.hsa_memory_free_fn (agent->module->heap);
2607 agent->module->heap = NULL;
2608 }
2609 }
2610 agent->prog_finalized = false;
2611 return true;
2612 }
2613
2614 /* Deinitialize all information associated with MODULE and kernels within
2615 it. Return TRUE on success. */
2616
2617 static bool
2618 destroy_module (struct module_info *module, bool locked)
2619 {
2620 /* Run destructors before destroying module. */
2621 struct GOMP_kernel_launch_attributes kla =
2622 { 3,
2623 /* Grid size. */
2624 { 1, 64, 1 },
2625 /* Work-group size. */
2626 { 1, 64, 1 }
2627 };
2628
2629 if (module->fini_array_func)
2630 {
2631 init_kernel (module->fini_array_func);
2632 run_kernel (module->fini_array_func, NULL, &kla, NULL, locked);
2633 }
2634 module->constructors_run_p = false;
2635
2636 int i;
2637 for (i = 0; i < module->kernel_count; i++)
2638 if (pthread_mutex_destroy (&module->kernels[i].init_mutex))
2639 {
2640 GOMP_PLUGIN_error ("Failed to destroy a GCN kernel initialization "
2641 "mutex");
2642 return false;
2643 }
2644
2645 return true;
2646 }
2647
2648 /* }}} */
2649 /* {{{ Async */
2650
2651 /* Execute the queue entry at INDEX in async queue AQ, according to its type. */
2652
2653 static void
2654 execute_queue_entry (struct goacc_asyncqueue *aq, int index)
2655 {
2656 struct queue_entry *entry = &aq->queue[index];
2657
2658 switch (entry->type)
2659 {
2660 case KERNEL_LAUNCH:
2661 if (DEBUG_QUEUES)
2662 GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d)\n",
2663 aq->agent->device_id, aq->id, index);
2664 run_kernel (entry->u.launch.kernel,
2665 entry->u.launch.vars,
2666 &entry->u.launch.kla, aq, false);
2667 if (DEBUG_QUEUES)
2668 GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d) done\n",
2669 aq->agent->device_id, aq->id, index);
2670 break;
2671
2672 case CALLBACK:
2673 if (DEBUG_QUEUES)
2674 GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d)\n",
2675 aq->agent->device_id, aq->id, index);
2676 entry->u.callback.fn (entry->u.callback.data);
2677 if (DEBUG_QUEUES)
2678 GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d) done\n",
2679 aq->agent->device_id, aq->id, index);
2680 break;
2681
2682 case ASYNC_WAIT:
2683 {
2684 /* FIXME: is it safe to access a placeholder that may already have
2685 been executed? */
2686 struct placeholder *placeholderp = entry->u.asyncwait.placeholderp;
2687
2688 if (DEBUG_QUEUES)
2689 GCN_DEBUG ("Async thread %d:%d: Executing async wait entry (%d)\n",
2690 aq->agent->device_id, aq->id, index);
2691
2692 pthread_mutex_lock (&placeholderp->mutex);
2693
2694 while (!placeholderp->executed)
2695 pthread_cond_wait (&placeholderp->cond, &placeholderp->mutex);
2696
2697 pthread_mutex_unlock (&placeholderp->mutex);
2698
2699 if (pthread_cond_destroy (&placeholderp->cond))
2700 GOMP_PLUGIN_error ("Failed to destroy serialization cond");
2701
2702 if (pthread_mutex_destroy (&placeholderp->mutex))
2703 GOMP_PLUGIN_error ("Failed to destroy serialization mutex");
2704
2705 if (DEBUG_QUEUES)
2706 GCN_DEBUG ("Async thread %d:%d: Executing async wait "
2707 "entry (%d) done\n", aq->agent->device_id, aq->id, index);
2708 }
2709 break;
2710
2711 case ASYNC_PLACEHOLDER:
2712 pthread_mutex_lock (&entry->u.placeholder.mutex);
2713 entry->u.placeholder.executed = 1;
2714 pthread_cond_signal (&entry->u.placeholder.cond);
2715 pthread_mutex_unlock (&entry->u.placeholder.mutex);
2716 break;
2717
2718 default:
2719 GOMP_PLUGIN_fatal ("Unknown queue element");
2720 }
2721 }
2722
2723 /* This function is run as a thread to service an async queue in the
2724 background. It runs continuously until the stop flag is set. */
2725
2726 static void *
2727 drain_queue (void *thread_arg)
2728 {
2729 struct goacc_asyncqueue *aq = thread_arg;
2730
2731 if (DRAIN_QUEUE_SYNCHRONOUS_P)
2732 {
2733 aq->drain_queue_stop = 2;
2734 return NULL;
2735 }
2736
2737 pthread_mutex_lock (&aq->mutex);
2738
2739 while (true)
2740 {
2741 if (aq->drain_queue_stop)
2742 break;
2743
2744 if (aq->queue_n > 0)
2745 {
2746 pthread_mutex_unlock (&aq->mutex);
2747 execute_queue_entry (aq, aq->queue_first);
2748
2749 pthread_mutex_lock (&aq->mutex);
2750 aq->queue_first = ((aq->queue_first + 1)
2751 % ASYNC_QUEUE_SIZE);
2752 aq->queue_n--;
2753
2754 if (DEBUG_THREAD_SIGNAL)
2755 GCN_DEBUG ("Async thread %d:%d: broadcasting queue out update\n",
2756 aq->agent->device_id, aq->id);
2757 pthread_cond_broadcast (&aq->queue_cond_out);
2758 pthread_mutex_unlock (&aq->mutex);
2759
2760 if (DEBUG_QUEUES)
2761 GCN_DEBUG ("Async thread %d:%d: continue\n", aq->agent->device_id,
2762 aq->id);
2763 pthread_mutex_lock (&aq->mutex);
2764 }
2765 else
2766 {
2767 if (DEBUG_THREAD_SLEEP)
2768 GCN_DEBUG ("Async thread %d:%d: going to sleep\n",
2769 aq->agent->device_id, aq->id);
2770 pthread_cond_wait (&aq->queue_cond_in, &aq->mutex);
2771 if (DEBUG_THREAD_SLEEP)
2772 GCN_DEBUG ("Async thread %d:%d: woke up, rechecking\n",
2773 aq->agent->device_id, aq->id);
2774 }
2775 }
2776
2777 aq->drain_queue_stop = 2;
2778 if (DEBUG_THREAD_SIGNAL)
2779 GCN_DEBUG ("Async thread %d:%d: broadcasting last queue out update\n",
2780 aq->agent->device_id, aq->id);
2781 pthread_cond_broadcast (&aq->queue_cond_out);
2782 pthread_mutex_unlock (&aq->mutex);
2783
2784 GCN_DEBUG ("Async thread %d:%d: returning\n", aq->agent->device_id, aq->id);
2785 return NULL;
2786 }
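/* A summary of the queue protocol implemented above and in the
   queue_push_* functions below (a reading of this file, not a general
   HSA mechanism): producers append under aq->mutex and signal
   queue_cond_in; the drain thread pops one entry at a time, executing
   it with the mutex dropped, then broadcasts queue_cond_out so that
   wait_queue and wait_for_queue_nonfull can make progress.  */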
2787
2788 /* This function is used only when DRAIN_QUEUE_SYNCHRONOUS_P is set, which
2789 is not usually the case. This is just a debug tool. */
2790
2791 static void
2792 drain_queue_synchronous (struct goacc_asyncqueue *aq)
2793 {
2794 pthread_mutex_lock (&aq->mutex);
2795
2796 while (aq->queue_n > 0)
2797 {
2798 execute_queue_entry (aq, aq->queue_first);
2799
2800 aq->queue_first = ((aq->queue_first + 1)
2801 % ASYNC_QUEUE_SIZE);
2802 aq->queue_n--;
2803 }
2804
2805 pthread_mutex_unlock (&aq->mutex);
2806 }
2807
2808 /* Block the current thread until an async queue is writable. The aq->mutex
2809 lock should be held on entry, and remains locked on exit. */
2810
2811 static void
2812 wait_for_queue_nonfull (struct goacc_asyncqueue *aq)
2813 {
2814 if (aq->queue_n == ASYNC_QUEUE_SIZE)
2815 {
2816 /* Queue is full. Wait for it to not be full. */
2817 while (aq->queue_n == ASYNC_QUEUE_SIZE)
2818 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
2819 }
2820 }
2821
2822 /* Request an asynchronous kernel launch on the specified queue. This
2823 may block if the queue is full, but returns without waiting for the
2824 kernel to run. */
2825
2826 static void
2827 queue_push_launch (struct goacc_asyncqueue *aq, struct kernel_info *kernel,
2828 void *vars, struct GOMP_kernel_launch_attributes *kla)
2829 {
2830 assert (aq->agent == kernel->agent);
2831
2832 pthread_mutex_lock (&aq->mutex);
2833
2834 wait_for_queue_nonfull (aq);
2835
2836 int queue_last = ((aq->queue_first + aq->queue_n)
2837 % ASYNC_QUEUE_SIZE);
2838 if (DEBUG_QUEUES)
2839 GCN_DEBUG ("queue_push_launch %d:%d: at %i\n", aq->agent->device_id,
2840 aq->id, queue_last);
2841
2842 aq->queue[queue_last].type = KERNEL_LAUNCH;
2843 aq->queue[queue_last].u.launch.kernel = kernel;
2844 aq->queue[queue_last].u.launch.vars = vars;
2845 aq->queue[queue_last].u.launch.kla = *kla;
2846
2847 aq->queue_n++;
2848
2849 if (DEBUG_THREAD_SIGNAL)
2850 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2851 aq->agent->device_id, aq->id);
2852 pthread_cond_signal (&aq->queue_cond_in);
2853
2854 pthread_mutex_unlock (&aq->mutex);
2855 }
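/* Ring-buffer arithmetic shared by all the queue_push_* functions, with
   illustrative values: if queue_first == 62 and queue_n == 3 with
   ASYNC_QUEUE_SIZE == 64, the new entry lands in slot (62 + 3) % 64 == 1;
   the ring wraps while queue_n never exceeds 64.  */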
2856
2857 /* Request an asynchronous callback on the specified queue. The callback
2858 function will be called, with the given opaque data, from the appropriate
2859 async thread, when all previous items on that queue are complete. */
2860
2861 static void
2862 queue_push_callback (struct goacc_asyncqueue *aq, void (*fn)(void *),
2863 void *data)
2864 {
2865 pthread_mutex_lock (&aq->mutex);
2866
2867 wait_for_queue_nonfull (aq);
2868
2869 int queue_last = ((aq->queue_first + aq->queue_n)
2870 % ASYNC_QUEUE_SIZE);
2871 if (DEBUG_QUEUES)
2872 GCN_DEBUG ("queue_push_callback %d:%d: at %i\n", aq->agent->device_id,
2873 aq->id, queue_last);
2874
2875 aq->queue[queue_last].type = CALLBACK;
2876 aq->queue[queue_last].u.callback.fn = fn;
2877 aq->queue[queue_last].u.callback.data = data;
2878
2879 aq->queue_n++;
2880
2881 if (DEBUG_THREAD_SIGNAL)
2882 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2883 aq->agent->device_id, aq->id);
2884 pthread_cond_signal (&aq->queue_cond_in);
2885
2886 pthread_mutex_unlock (&aq->mutex);
2887 }
2888
2889 /* Request that a given async thread wait for another thread (unspecified) to
2890 reach the given placeholder. The wait will occur when all previous entries
2891 on the queue are complete. A placeholder is effectively a kind of signal
2892 which simply sets a flag when encountered in a queue. */
2893
2894 static void
2895 queue_push_asyncwait (struct goacc_asyncqueue *aq,
2896 struct placeholder *placeholderp)
2897 {
2898 pthread_mutex_lock (&aq->mutex);
2899
2900 wait_for_queue_nonfull (aq);
2901
2902 int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2903 if (DEBUG_QUEUES)
2904 GCN_DEBUG ("queue_push_asyncwait %d:%d: at %i\n", aq->agent->device_id,
2905 aq->id, queue_last);
2906
2907 aq->queue[queue_last].type = ASYNC_WAIT;
2908 aq->queue[queue_last].u.asyncwait.placeholderp = placeholderp;
2909
2910 aq->queue_n++;
2911
2912 if (DEBUG_THREAD_SIGNAL)
2913 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2914 aq->agent->device_id, aq->id);
2915 pthread_cond_signal (&aq->queue_cond_in);
2916
2917 pthread_mutex_unlock (&aq->mutex);
2918 }
2919
2920 /* Add a placeholder into an async queue. When the async thread reaches the
2921 placeholder it will set the "executed" flag to true and continue.
2922 Another thread may be waiting on this thread reaching the placeholder. */
2923
2924 static struct placeholder *
2925 queue_push_placeholder (struct goacc_asyncqueue *aq)
2926 {
2927 struct placeholder *placeholderp;
2928
2929 pthread_mutex_lock (&aq->mutex);
2930
2931 wait_for_queue_nonfull (aq);
2932
2933 int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2934 if (DEBUG_QUEUES)
2935 GCN_DEBUG ("queue_push_placeholder %d:%d: at %i\n", aq->agent->device_id,
2936 aq->id, queue_last);
2937
2938 aq->queue[queue_last].type = ASYNC_PLACEHOLDER;
2939 placeholderp = &aq->queue[queue_last].u.placeholder;
2940
2941 if (pthread_mutex_init (&placeholderp->mutex, NULL))
2942 {
2943 pthread_mutex_unlock (&aq->mutex);
2944 GOMP_PLUGIN_error ("Failed to initialize serialization mutex");
2945 }
2946
2947 if (pthread_cond_init (&placeholderp->cond, NULL))
2948 {
2949 pthread_mutex_unlock (&aq->mutex);
2950 GOMP_PLUGIN_error ("Failed to initialize serialization cond");
2951 }
2952
2953 placeholderp->executed = 0;
2954
2955 aq->queue_n++;
2956
2957 if (DEBUG_THREAD_SIGNAL)
2958 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2959 aq->agent->device_id, aq->id);
2960 pthread_cond_signal (&aq->queue_cond_in);
2961
2962 pthread_mutex_unlock (&aq->mutex);
2963
2964 return placeholderp;
2965 }
2966
2967 /* Signal an asynchronous thread to terminate, and wait for it to do so. */
2968
2969 static void
2970 finalize_async_thread (struct goacc_asyncqueue *aq)
2971 {
2972 pthread_mutex_lock (&aq->mutex);
2973 if (aq->drain_queue_stop == 2)
2974 {
2975 pthread_mutex_unlock (&aq->mutex);
2976 return;
2977 }
2978
2979 aq->drain_queue_stop = 1;
2980
2981 if (DEBUG_THREAD_SIGNAL)
2982 GCN_DEBUG ("Signalling async thread %d:%d: cond_in\n",
2983 aq->agent->device_id, aq->id);
2984 pthread_cond_signal (&aq->queue_cond_in);
2985
2986 while (aq->drain_queue_stop != 2)
2987 {
2988 if (DEBUG_THREAD_SLEEP)
2989 GCN_DEBUG ("Waiting for async thread %d:%d to finish, putting thread"
2990 " to sleep\n", aq->agent->device_id, aq->id);
2991 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
2992 if (DEBUG_THREAD_SLEEP)
2993 GCN_DEBUG ("Waiting, woke up thread %d:%d. Rechecking\n",
2994 aq->agent->device_id, aq->id);
2995 }
2996
2997 GCN_DEBUG ("Done waiting for async thread %d:%d\n", aq->agent->device_id,
2998 aq->id);
2999 pthread_mutex_unlock (&aq->mutex);
3000
3001 int err = pthread_join (aq->thread_drain_queue, NULL);
3002 if (err != 0)
3003 GOMP_PLUGIN_fatal ("Join async thread %d:%d: failed: %s",
3004 aq->agent->device_id, aq->id, strerror (err));
3005 GCN_DEBUG ("Joined with async thread %d:%d\n", aq->agent->device_id, aq->id);
3006 }
3007
3008 /* Set up an async queue for OpenMP. There will be only one. The
3009 implementation simply uses an OpenACC async queue.
3010 FIXME: is this thread-safe if two threads call this function? */
3011
3012 static void
3013 maybe_init_omp_async (struct agent_info *agent)
3014 {
3015 if (!agent->omp_async_queue)
3016 agent->omp_async_queue
3017 = GOMP_OFFLOAD_openacc_async_construct (agent->device_id);
3018 }
3019
3020 /* A wrapper that works around an issue in the HSA runtime with host-to-device
3021 copies from read-only pages. */
3022
3023 static void
3024 hsa_memory_copy_wrapper (void *dst, const void *src, size_t len)
3025 {
3026 hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, len);
3027
3028 if (status == HSA_STATUS_SUCCESS)
3029 return;
3030
3031 /* It appears that the copy fails if the source data is in a read-only page.
3032 We can't detect that easily, so try copying the data to a temporary buffer
3033 and doing the copy again if we got an error above. */
3034
3035 GCN_WARNING ("Read-only data transfer bug workaround triggered for "
3036 "[%p:+%d]\n", (void *) src, (int) len);
3037
3038 void *src_copy = malloc (len);
3039 memcpy (src_copy, src, len);
3040 status = hsa_fns.hsa_memory_copy_fn (dst, (const void *) src_copy, len);
3041 free (src_copy);
3042 if (status != HSA_STATUS_SUCCESS)
3043 GOMP_PLUGIN_error ("memory copy failed");
3044 }
3045
3046 /* Copy data to or from a device. This is intended for use as an async
3047 callback event. */
3048
3049 static void
3050 copy_data (void *data_)
3051 {
3052 struct copy_data *data = (struct copy_data *)data_;
3053 GCN_DEBUG ("Async thread %d:%d: Copying %zu bytes from (%p) to (%p)\n",
3054 data->aq->agent->device_id, data->aq->id, data->len, data->src,
3055 data->dst);
3056 hsa_memory_copy_wrapper (data->dst, data->src, data->len);
3057 free (data);
3058 }
3059
3060 /* Request an asynchronous data copy, to or from a device, on a given queue.
3061 The event will be registered as a callback. */
3062
3063 static void
3064 queue_push_copy (struct goacc_asyncqueue *aq, void *dst, const void *src,
3065 size_t len)
3066 {
3067 if (DEBUG_QUEUES)
3068 GCN_DEBUG ("queue_push_copy %d:%d: %zu bytes from (%p) to (%p)\n",
3069 aq->agent->device_id, aq->id, len, src, dst);
3070 struct copy_data *data
3071 = (struct copy_data *)GOMP_PLUGIN_malloc (sizeof (struct copy_data));
3072 data->dst = dst;
3073 data->src = src;
3074 data->len = len;
3075 data->aq = aq;
3076 queue_push_callback (aq, copy_data, data);
3077 }
3078
3079 /* Return true if the given queue is currently empty. */
3080
3081 static int
3082 queue_empty (struct goacc_asyncqueue *aq)
3083 {
3084 pthread_mutex_lock (&aq->mutex);
3085 int res = aq->queue_n == 0 ? 1 : 0;
3086 pthread_mutex_unlock (&aq->mutex);
3087
3088 return res;
3089 }
3090
3091 /* Wait for a given queue to become empty. This implements an OpenACC wait
3092 directive. */
3093
3094 static void
3095 wait_queue (struct goacc_asyncqueue *aq)
3096 {
3097 if (DRAIN_QUEUE_SYNCHRONOUS_P)
3098 {
3099 drain_queue_synchronous (aq);
3100 return;
3101 }
3102
3103 pthread_mutex_lock (&aq->mutex);
3104
3105 while (aq->queue_n > 0)
3106 {
3107 if (DEBUG_THREAD_SLEEP)
3108 GCN_DEBUG ("waiting for thread %d:%d, putting thread to sleep\n",
3109 aq->agent->device_id, aq->id);
3110 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
3111 if (DEBUG_THREAD_SLEEP)
3112 GCN_DEBUG ("thread %d:%d woke up. Rechecking\n", aq->agent->device_id,
3113 aq->id);
3114 }
3115
3116 pthread_mutex_unlock (&aq->mutex);
3117 GCN_DEBUG ("waiting for thread %d:%d, done\n", aq->agent->device_id, aq->id);
3118 }
3119
3120 /* }}} */
3121 /* {{{ OpenACC support */
3122
3123 /* Execute an OpenACC kernel, synchronously or asynchronously. */
3124
3125 static void
3126 gcn_exec (struct kernel_info *kernel,
3127 void **devaddrs, unsigned *dims, void *targ_mem_desc, bool async,
3128 struct goacc_asyncqueue *aq)
3129 {
3130 if (!GOMP_OFFLOAD_can_run (kernel))
3131 GOMP_PLUGIN_fatal ("OpenACC host fallback unimplemented.");
3132
3133 /* If we get here then this must be an OpenACC kernel. */
3134 kernel->kind = KIND_OPENACC;
3135
3136 struct hsa_kernel_description *hsa_kernel_desc = NULL;
3137 for (unsigned i = 0; i < kernel->module->image_desc->kernel_count; i++)
3138 {
3139 struct hsa_kernel_description *d
3140 = &kernel->module->image_desc->kernel_infos[i];
3141 if (d->name == kernel->name)
3142 {
3143 hsa_kernel_desc = d;
3144 break;
3145 }
3146 }
3147
3148 /* We may have statically-determined dimensions in
3149 hsa_kernel_desc->oacc_dims[] or dimensions passed to this offload kernel
3150 invocation at runtime in dims[]. We allow static dimensions to take
3151 priority over dynamic dimensions when present (non-zero). */
3152 if (hsa_kernel_desc->oacc_dims[0] > 0)
3153 dims[0] = hsa_kernel_desc->oacc_dims[0];
3154 if (hsa_kernel_desc->oacc_dims[1] > 0)
3155 dims[1] = hsa_kernel_desc->oacc_dims[1];
3156 if (hsa_kernel_desc->oacc_dims[2] > 0)
3157 dims[2] = hsa_kernel_desc->oacc_dims[2];
3158
3159 /* Ideally, when a dimension isn't explicitly specified, we should
3160 tune it to run 40 (or 32?) threads per CU with no threads getting queued.
3161 In practice, we tune for peak performance on BabelStream, which
3162 for OpenACC is currently 32 threads per CU. */
3163 if (dims[0] == 0 && dims[1] == 0)
3164 {
3165 /* If any of the OpenACC dimensions remain 0 then we get to pick a
3166 number. There isn't really a correct answer for this without a clue
3167 about the problem size, so let's do a reasonable number of workers
3168 and gangs. */
3169
3170 dims[0] = get_cu_count (kernel->agent) * 4; /* Gangs. */
3171 dims[1] = 8; /* Workers. */
3172 }
3173 else if (dims[0] == 0 && dims[1] > 0)
3174 {
3175 /* Auto-scale the number of gangs with the requested number of workers. */
3176 dims[0] = get_cu_count (kernel->agent) * (32 / dims[1]);
3177 }
3178 else if (dims[0] > 0 && dims[1] == 0)
3179 {
3180 /* Auto-scale the number of workers with the requested number of gangs. */
3181 dims[1] = get_cu_count (kernel->agent) * 32 / dims[0];
3182 if (dims[1] == 0)
3183 dims[1] = 1;
3184 if (dims[1] > 16)
3185 dims[1] = 16;
3186 }
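/* Illustrative numbers for the heuristics above (device-dependent; a
   40-CU agent is assumed here): no requested dims gives 160 gangs of
   8 workers; 4 requested workers gives 40 * (32 / 4) == 320 gangs;
   80 requested gangs gives 40 * 32 / 80 == 16 workers, clamped to the
   range 1..16.  */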
3187
3188 /* The incoming dimensions are expressed in terms of gangs, workers, and
3189 vectors. The HSA dimensions are expressed in terms of "work-items",
3190 which means multiples of vector lanes.
3191
3192 The "grid size" specifies the size of the problem space, and the
3193 "work-group size" specifies how much of that we want a single compute
3194 unit to chew on at once.
3195
3196 The three dimensions do not really correspond to hardware, but the
3197 important thing is that the HSA runtime will launch as many
3198 work-groups as it takes to process the entire grid, and each
3199 work-group will contain as many wave-fronts as it takes to process
3200 the work-items in that group.
3201
3202 Essentially, as long as we set the Y dimension to 64 (the number of
3203 vector lanes in hardware), and the Z group size to the maximum (16),
3204 then we will get the gangs (X) and workers (Z) launched as we expect.
3205
3206 The reason for the apparent reversal of vector and worker dimension
3207 order is to do with the way the run-time distributes work-items across
3208 v1 and v2. */
3209 struct GOMP_kernel_launch_attributes kla =
3210 {3,
3211 /* Grid size. */
3212 {dims[0], 64, dims[1]},
3213 /* Work-group size. */
3214 {1, 64, 16}
3215 };
3216
3217 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
3218 acc_prof_info *prof_info = thr->prof_info;
3219 acc_event_info enqueue_launch_event_info;
3220 acc_api_info *api_info = thr->api_info;
3221 bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
3222 if (profiling_dispatch_p)
3223 {
3224 prof_info->event_type = acc_ev_enqueue_launch_start;
3225
3226 enqueue_launch_event_info.launch_event.event_type
3227 = prof_info->event_type;
3228 enqueue_launch_event_info.launch_event.valid_bytes
3229 = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
3230 enqueue_launch_event_info.launch_event.parent_construct
3231 = acc_construct_parallel;
3232 enqueue_launch_event_info.launch_event.implicit = 1;
3233 enqueue_launch_event_info.launch_event.tool_info = NULL;
3234 enqueue_launch_event_info.launch_event.kernel_name
3235 = (char *) kernel->name;
3236 enqueue_launch_event_info.launch_event.num_gangs = kla.gdims[0];
3237 enqueue_launch_event_info.launch_event.num_workers = kla.gdims[2];
3238 enqueue_launch_event_info.launch_event.vector_length = kla.gdims[1];
3239
3240 api_info->device_api = acc_device_api_other;
3241
3242 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3243 &enqueue_launch_event_info, api_info);
3244 }
3245
3246 if (!async)
3247 run_kernel (kernel, devaddrs, &kla, NULL, false);
3248 else
3249 queue_push_launch (aq, kernel, devaddrs, &kla);
3250
3251 if (profiling_dispatch_p)
3252 {
3253 prof_info->event_type = acc_ev_enqueue_launch_end;
3254 enqueue_launch_event_info.launch_event.event_type = prof_info->event_type;
3255 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3256 &enqueue_launch_event_info,
3257 api_info);
3258 }
3259 }
3260
3261 /* }}} */
3262 /* {{{ Generic Plugin API */
3263
3264 /* Return the name of the accelerator, which is "gcn". */
3265
3266 const char *
3267 GOMP_OFFLOAD_get_name (void)
3268 {
3269 return "gcn";
3270 }
3271
3272 /* Return the specific capabilities the HSA accelerator has. */
3273
3274 unsigned int
3275 GOMP_OFFLOAD_get_caps (void)
3276 {
3277 /* FIXME: Enable shared memory for APU, but not discrete GPU. */
3278 return /*GOMP_OFFLOAD_CAP_SHARED_MEM |*/ GOMP_OFFLOAD_CAP_OPENMP_400
3279 | GOMP_OFFLOAD_CAP_OPENACC_200;
3280 }
3281
3282 /* Identify as a GCN accelerator. */
3283
3284 int
3285 GOMP_OFFLOAD_get_type (void)
3286 {
3287 return OFFLOAD_TARGET_TYPE_GCN;
3288 }
3289
3290 /* Return the libgomp version number we're compatible with. There is
3291 no requirement for cross-version compatibility. */
3292
3293 unsigned
3294 GOMP_OFFLOAD_version (void)
3295 {
3296 return GOMP_VERSION;
3297 }
3298
3299 /* Return the number of GCN devices on the system. */
3300
3301 int
3302 GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask)
3303 {
3304 if (!init_hsa_context ())
3305 return 0;
3306 /* Return -1 if the omp_requires_mask cannot be fulfilled, but
3307 devices were present. */
3308 if (hsa_context.agent_count > 0
3309 && ((omp_requires_mask
3310 & ~(GOMP_REQUIRES_UNIFIED_ADDRESS
3311 | GOMP_REQUIRES_REVERSE_OFFLOAD)) != 0))
3312 return -1;
3313 return hsa_context.agent_count;
3314 }
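/* For example (assuming the GOMP_REQUIRES_* flags from gomp-constants.h):
   a program compiled with "omp requires unified_address, reverse_offload"
   is accepted here, whereas any other requirement bit, for instance
   unified_shared_memory, makes this plugin report -1 when GCN agents are
   present.  */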
3315
3316 /* Initialize device (agent) number N so that it can be used for computation.
3317 Return TRUE on success. */
3318
3319 bool
3320 GOMP_OFFLOAD_init_device (int n)
3321 {
3322 if (!init_hsa_context ())
3323 return false;
3324 if (n >= hsa_context.agent_count)
3325 {
3326 GOMP_PLUGIN_error ("Request to initialize non-existent GCN device %i", n);
3327 return false;
3328 }
3329 struct agent_info *agent = &hsa_context.agents[n];
3330
3331 if (agent->initialized)
3332 return true;
3333
3334 agent->device_id = n;
3335
3336 if (pthread_rwlock_init (&agent->module_rwlock, NULL))
3337 {
3338 GOMP_PLUGIN_error ("Failed to initialize a GCN agent rwlock");
3339 return false;
3340 }
3341 if (pthread_mutex_init (&agent->prog_mutex, NULL))
3342 {
3343 GOMP_PLUGIN_error ("Failed to initialize a GCN agent program mutex");
3344 return false;
3345 }
3346 if (pthread_mutex_init (&agent->async_queues_mutex, NULL))
3347 {
3348 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
3349 return false;
3350 }
3351 if (pthread_mutex_init (&agent->ephemeral_memories_write_lock, NULL))
3352 {
3353 GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
3354 return false;
3355 }
3356 agent->async_queues = NULL;
3357 agent->omp_async_queue = NULL;
3358 agent->ephemeral_memories_list = NULL;
3359
3360 uint32_t queue_size;
3361 hsa_status_t status;
3362 status = hsa_fns.hsa_agent_get_info_fn (agent->id,
3363 HSA_AGENT_INFO_QUEUE_MAX_SIZE,
3364 &queue_size);
3365 if (status != HSA_STATUS_SUCCESS)
3366 return hsa_error ("Error requesting maximum queue size of the GCN agent",
3367 status);
3368
3369 status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_NAME,
3370 &agent->name);
3371 if (status != HSA_STATUS_SUCCESS)
3372 return hsa_error ("Error querying the name of the agent", status);
3373
3374 agent->device_isa = isa_code (agent->name);
3375 if (agent->device_isa < 0)
3376 return hsa_error ("Unknown GCN agent architecture", HSA_STATUS_ERROR);
3377
3378 status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_VENDOR_NAME,
3379 &agent->vendor_name);
3380 if (status != HSA_STATUS_SUCCESS)
3381 return hsa_error ("Error querying the vendor name of the agent", status);
3382
3383 status = hsa_fns.hsa_queue_create_fn (agent->id, queue_size,
3384 HSA_QUEUE_TYPE_MULTI,
3385 hsa_queue_callback, NULL, UINT32_MAX,
3386 UINT32_MAX, &agent->sync_queue);
3387 if (status != HSA_STATUS_SUCCESS)
3388 return hsa_error ("Error creating command queue", status);
3389
3390 agent->kernarg_region.handle = (uint64_t) -1;
3391 status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
3392 get_kernarg_memory_region,
3393 &agent->kernarg_region);
3394 if (status != HSA_STATUS_SUCCESS
3395 && status != HSA_STATUS_INFO_BREAK)
3396 hsa_error ("Scanning memory regions failed", status);
3397 if (agent->kernarg_region.handle == (uint64_t) -1)
3398 {
3399 GOMP_PLUGIN_error ("Could not find suitable memory region for kernel "
3400 "arguments");
3401 return false;
3402 }
3403 GCN_DEBUG ("Selected kernel arguments memory region:\n");
3404 dump_hsa_region (agent->kernarg_region, NULL);
3405
3406 agent->data_region.handle = (uint64_t) -1;
3407 status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
3408 get_data_memory_region,
3409 &agent->data_region);
3410 if (status != HSA_STATUS_SUCCESS
3411 && status != HSA_STATUS_INFO_BREAK)
3412 hsa_error ("Scanning memory regions failed", status);
3413 if (agent->data_region.handle == (uint64_t) -1)
3414 {
3415 GOMP_PLUGIN_error ("Could not find suitable memory region for device "
3416 "data");
3417 return false;
3418 }
3419 GCN_DEBUG ("Selected device data memory region:\n");
3420 dump_hsa_region (agent->data_region, NULL);
3421
3422 GCN_DEBUG ("GCN agent %d initialized\n", n);
3423
3424 agent->initialized = true;
3425 return true;
3426 }
3427
3428 /* Load GCN object-code module described by struct gcn_image_desc in
3429 TARGET_DATA and return references to kernel descriptors in TARGET_TABLE.
3430 If there are any constructors then run them. If not NULL, REV_FN_TABLE will
3431 contain the on-device addresses of the functions for reverse offload. To be
3432 freed by the caller. */
3433
3434 int
3435 GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
3436 struct addr_pair **target_table,
3437 uint64_t **rev_fn_table,
3438 uint64_t *host_ind_fn_table)
3439 {
3440 if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
3441 {
3442 GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3443 " (expected %u, received %u)",
3444 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
3445 return -1;
3446 }
3447
3448 struct gcn_image_desc *image_desc = (struct gcn_image_desc *) target_data;
3449 struct agent_info *agent;
3450 struct addr_pair *pair;
3451 struct module_info *module;
3452 struct kernel_info *kernel;
3453 int kernel_count = image_desc->kernel_count;
3454 unsigned ind_func_count = GOMP_VERSION_SUPPORTS_INDIRECT_FUNCS (version)
3455 ? image_desc->ind_func_count : 0;
3456 unsigned var_count = image_desc->global_variable_count;
3457 /* Currently, "others" is a struct of ICVs. */
3458 int other_count = 1;
3459
3460 agent = get_agent_info (ord);
3461 if (!agent)
3462 return -1;
3463
3464 if (pthread_rwlock_wrlock (&agent->module_rwlock))
3465 {
3466 GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3467 return -1;
3468 }
3469 if (agent->prog_finalized
3470 && !destroy_hsa_program (agent))
3471 return -1;
3472
3473 GCN_DEBUG ("Encountered %d kernels in an image\n", kernel_count);
3474 GCN_DEBUG ("Encountered %d indirect functions in an image\n", ind_func_count);
3475 GCN_DEBUG ("Encountered %u global variables in an image\n", var_count);
3476 GCN_DEBUG ("Expect %d other variables in an image\n", other_count);
3477 pair = GOMP_PLUGIN_malloc ((kernel_count + var_count + other_count - 2)
3478 * sizeof (struct addr_pair));
3479 *target_table = pair;
3480 module = (struct module_info *)
3481 GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info)
3482 + kernel_count * sizeof (struct kernel_info));
3483 module->image_desc = image_desc;
3484 module->kernel_count = kernel_count;
3485 module->heap = NULL;
3486 module->constructors_run_p = false;
3487
3488 kernel = &module->kernels[0];
3489
3490 /* Allocate memory for kernel dependencies. */
3491 for (unsigned i = 0; i < kernel_count; i++)
3492 {
3493 struct hsa_kernel_description *d = &image_desc->kernel_infos[i];
3494 if (!init_basic_kernel_info (kernel, d, agent, module))
3495 return -1;
3496 if (strcmp (d->name, "_init_array") == 0)
3497 module->init_array_func = kernel;
3498 else if (strcmp (d->name, "_fini_array") == 0)
3499 module->fini_array_func = kernel;
3500 else
3501 {
3502 pair->start = (uintptr_t) kernel;
3503 pair->end = (uintptr_t) (kernel + 1);
3504 pair++;
3505 }
3506 kernel++;
3507 }
3508
3509 agent->module = module;
3510 if (pthread_rwlock_unlock (&agent->module_rwlock))
3511 {
3512 GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3513 return -1;
3514 }
3515
3516 if (!create_and_finalize_hsa_program (agent))
3517 return -1;
3518
3519 if (var_count > 0)
3520 {
3521 hsa_status_t status;
3522 hsa_executable_symbol_t var_symbol;
3523 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3524 ".offload_var_table",
3525 agent->id,
3526 0, &var_symbol);
3527
3528 if (status != HSA_STATUS_SUCCESS)
3529 hsa_fatal ("Could not find symbol for variable in the code object",
3530 status);
3531
3532 uint64_t var_table_addr;
3533 status = hsa_fns.hsa_executable_symbol_get_info_fn
3534 (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3535 &var_table_addr);
3536 if (status != HSA_STATUS_SUCCESS)
3537 hsa_fatal ("Could not extract a variable from its symbol", status);
3538
3539 struct {
3540 uint64_t addr;
3541 uint64_t size;
3542 } var_table[var_count];
3543 GOMP_OFFLOAD_dev2host (agent->device_id, var_table,
3544 (void*)var_table_addr, sizeof (var_table));
3545
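      /* For illustration: the image's .offload_var_table is a flat array of
	 address/size pairs, so after the copy above var_table might (with
	 made-up addresses) look like

	   { { 0x7f0000001000, 4 },      // a scalar "declare target" int
	     { 0x7f0000001008, 1024 } }  // a 1 kB global array

	 and the loop below re-publishes each entry to libgomp as the
	 half-open range [addr, addr + size).  */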
3546 for (unsigned i = 0; i < var_count; i++)
3547 {
3548 pair->start = var_table[i].addr;
3549 pair->end = var_table[i].addr + var_table[i].size;
3550 GCN_DEBUG ("Found variable at %p with size %lu\n",
3551 (void *)var_table[i].addr, var_table[i].size);
3552 pair++;
3553 }
3554 }
3555
3556 if (ind_func_count > 0)
3557 {
3558 hsa_status_t status;
3559
3560 /* Read indirect function table from image. */
3561 hsa_executable_symbol_t ind_funcs_symbol;
3562 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3563 ".offload_ind_func_table",
3564 agent->id,
3565 0, &ind_funcs_symbol);
3566
3567 if (status != HSA_STATUS_SUCCESS)
3568 hsa_fatal ("Could not find .offload_ind_func_table symbol in the "
3569 "code object", status);
3570
3571 uint64_t ind_funcs_table_addr;
3572 status = hsa_fns.hsa_executable_symbol_get_info_fn
3573 (ind_funcs_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3574 &ind_funcs_table_addr);
3575 if (status != HSA_STATUS_SUCCESS)
3576 hsa_fatal ("Could not extract a variable from its symbol", status);
3577
3578 uint64_t ind_funcs_table[ind_func_count];
3579 GOMP_OFFLOAD_dev2host (agent->device_id, ind_funcs_table,
3580 (void*) ind_funcs_table_addr,
3581 sizeof (ind_funcs_table));
3582
3583 /* Build host->target address map for indirect functions. */
3584 uint64_t ind_fn_map[ind_func_count * 2 + 1];
3585 for (unsigned i = 0; i < ind_func_count; i++)
3586 {
3587 ind_fn_map[i * 2] = host_ind_fn_table[i];
3588 ind_fn_map[i * 2 + 1] = ind_funcs_table[i];
	  GCN_DEBUG ("Indirect function %u: %lx->%lx\n",
		     i, host_ind_fn_table[i], ind_funcs_table[i]);
3591 }
3592 ind_fn_map[ind_func_count * 2] = 0;
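
      /* For illustration: for ind_func_count == 2 the map built above is a
	 flat, zero-terminated array of host/device address pairs,

	   uint64_t ind_fn_map[] = { host_fn0, dev_fn0,
				     host_fn1, dev_fn1,
				     0 };

	 (addresses hypothetical), which device code can scan to translate a
	 host function pointer into the matching device address when an
	 indirect call is made inside a target region.  */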
3593
3594 /* Write the map onto the target. */
3595 void *map_target_addr
3596 = GOMP_OFFLOAD_alloc (agent->device_id, sizeof (ind_fn_map));
3597 GCN_DEBUG ("Allocated indirect map at %p\n", map_target_addr);
3598
3599 GOMP_OFFLOAD_host2dev (agent->device_id, map_target_addr,
3600 (void*) ind_fn_map,
3601 sizeof (ind_fn_map));
3602
3603 /* Write address of the map onto the target. */
3604 hsa_executable_symbol_t symbol;
3605
3606 status
3607 = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3608 XSTRING (GOMP_INDIRECT_ADDR_MAP),
3609 agent->id, 0, &symbol);
3610 if (status != HSA_STATUS_SUCCESS)
3611 hsa_fatal ("Could not find GOMP_INDIRECT_ADDR_MAP in code object",
3612 status);
3613
3614 uint64_t varptr;
3615 uint32_t varsize;
3616
3617 status = hsa_fns.hsa_executable_symbol_get_info_fn
3618 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3619 &varptr);
3620 if (status != HSA_STATUS_SUCCESS)
3621 hsa_fatal ("Could not extract a variable from its symbol", status);
3622 status = hsa_fns.hsa_executable_symbol_get_info_fn
3623 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
3624 &varsize);
3625 if (status != HSA_STATUS_SUCCESS)
3626 hsa_fatal ("Could not extract a variable size from its symbol",
3627 status);
3628
      GCN_DEBUG ("Found GOMP_INDIRECT_ADDR_MAP at %lx with size %u\n",
		 varptr, varsize);
3631
3632 GOMP_OFFLOAD_host2dev (agent->device_id, (void *) varptr,
3633 &map_target_addr,
3634 sizeof (map_target_addr));
3635 }
3636
3637 GCN_DEBUG ("Looking for variable %s\n", XSTRING (GOMP_ADDITIONAL_ICVS));
3638
3639 hsa_status_t status;
3640 hsa_executable_symbol_t var_symbol;
3641 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3642 XSTRING (GOMP_ADDITIONAL_ICVS),
3643 agent->id, 0, &var_symbol);
3644 if (status == HSA_STATUS_SUCCESS)
3645 {
3646 uint64_t varptr;
3647 uint32_t varsize;
3648
3649 status = hsa_fns.hsa_executable_symbol_get_info_fn
3650 (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3651 &varptr);
3652 if (status != HSA_STATUS_SUCCESS)
3653 hsa_fatal ("Could not extract a variable from its symbol", status);
3654 status = hsa_fns.hsa_executable_symbol_get_info_fn
3655 (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
3656 &varsize);
3657 if (status != HSA_STATUS_SUCCESS)
3658 hsa_fatal ("Could not extract a variable size from its symbol",
3659 status);
3660
3661 pair->start = varptr;
3662 pair->end = varptr + varsize;
3663 }
3664 else
3665 {
3666 /* The variable was not in this image. */
3667 GCN_DEBUG ("Variable not found in image: %s\n",
3668 XSTRING (GOMP_ADDITIONAL_ICVS));
3669 pair->start = pair->end = 0;
3670 }
3671
3672 /* Ensure that constructors are run first. */
3673 struct GOMP_kernel_launch_attributes kla =
3674 { 3,
3675 /* Grid size. */
3676 { 1, 64, 1 },
3677 /* Work-group size. */
3678 { 1, 64, 1 }
3679 };
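  /* Note: grid and work-group sizes of { 1, 64, 1 } request a single
     64-work-item work-group, i.e. one GCN wavefront (wave64), so the
     constructor kernel runs once rather than across a full launch grid.  */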
3680
3681 if (module->init_array_func)
3682 {
3683 init_kernel (module->init_array_func);
3684 run_kernel (module->init_array_func, NULL, &kla, NULL, false);
3685 }
3686 module->constructors_run_p = true;
3687
3688 /* Don't report kernels that libgomp need not know about. */
3689 if (module->init_array_func)
3690 kernel_count--;
3691 if (module->fini_array_func)
3692 kernel_count--;
3693
3694 if (rev_fn_table != NULL && kernel_count == 0)
3695 *rev_fn_table = NULL;
3696 else if (rev_fn_table != NULL)
3697 {
3698 hsa_status_t status;
3699 hsa_executable_symbol_t var_symbol;
3700 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3701 ".offload_func_table",
3702 agent->id, 0, &var_symbol);
3703 if (status != HSA_STATUS_SUCCESS)
3704 hsa_fatal ("Could not find symbol for variable in the code object",
3705 status);
3706 uint64_t fn_table_addr;
3707 status = hsa_fns.hsa_executable_symbol_get_info_fn
3708 (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3709 &fn_table_addr);
3710 if (status != HSA_STATUS_SUCCESS)
3711 hsa_fatal ("Could not extract a variable from its symbol", status);
3712 *rev_fn_table = GOMP_PLUGIN_malloc (kernel_count * sizeof (uint64_t));
3713 GOMP_OFFLOAD_dev2host (agent->device_id, *rev_fn_table,
3714 (void*) fn_table_addr,
3715 kernel_count * sizeof (uint64_t));
3716 }
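
  /* For illustration: the .offload_func_table read back above holds one
     on-device function address per remaining kernel, in the same order as
     the host-side function table, which is what lets libgomp match a host
     function up with its device twin when a reverse-offload region runs.  */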
3717
3718 return kernel_count + var_count + other_count;
3719 }
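
/* For illustration (only restating what the code above implements): loading
   an image with three user kernels and two global variables yields

     target_table[0..2]   kernel entries (start = kernel_info *, end = +1)
     target_table[3..4]   variable ranges [addr, addr + size)
     target_table[5]      the GOMP_ADDITIONAL_ICVS block (or {0, 0})

   and a return value of 6; the _init_array/_fini_array helper kernels are
   excluded from the count.  */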
3720
3721 /* Unload GCN object-code module described by struct gcn_image_desc in
3722 TARGET_DATA from agent number N. Return TRUE on success. */
3723
3724 bool
3725 GOMP_OFFLOAD_unload_image (int n, unsigned version, const void *target_data)
3726 {
3727 if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
3728 {
3729 GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3730 " (expected %u, received %u)",
3731 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
3732 return false;
3733 }
3734
3735 struct agent_info *agent;
3736 agent = get_agent_info (n);
3737 if (!agent)
3738 return false;
3739
3740 if (pthread_rwlock_wrlock (&agent->module_rwlock))
3741 {
3742 GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3743 return false;
3744 }
3745
  if (!agent->module || agent->module->image_desc != target_data)
    {
      GOMP_PLUGIN_error ("Attempt to unload an image that was never loaded");
      pthread_rwlock_unlock (&agent->module_rwlock);
      return false;
    }
3752
3753 if (!destroy_module (agent->module, true))
3754 return false;
3755 free (agent->module);
3756 agent->module = NULL;
3757 if (!destroy_hsa_program (agent))
3758 return false;
3759 if (pthread_rwlock_unlock (&agent->module_rwlock))
3760 {
3761 GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3762 return false;
3763 }
3764 return true;
3765 }
3766
3767 /* Deinitialize all information and status associated with agent number N. We
3768 do not attempt any synchronization, assuming the user and libgomp will not
3769 attempt deinitialization of a device that is in any way being used at the
3770 same time. Return TRUE on success. */
3771
3772 bool
3773 GOMP_OFFLOAD_fini_device (int n)
3774 {
3775 struct agent_info *agent = get_agent_info (n);
3776 if (!agent)
3777 return false;
3778
3779 if (!agent->initialized)
3780 return true;
3781
3782 if (agent->omp_async_queue)
3783 {
3784 GOMP_OFFLOAD_openacc_async_destruct (agent->omp_async_queue);
3785 agent->omp_async_queue = NULL;
3786 }
3787
3788 if (agent->module)
3789 {
3790 if (!destroy_module (agent->module, false))
3791 return false;
3792 free (agent->module);
3793 agent->module = NULL;
3794 }
3795
3796 if (!destroy_ephemeral_memories (agent))
3797 return false;
3798
3799 if (!destroy_hsa_program (agent))
3800 return false;
3801
3802 hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (agent->sync_queue);
3803 if (status != HSA_STATUS_SUCCESS)
3804 return hsa_error ("Error destroying command queue", status);
3805
3806 if (pthread_mutex_destroy (&agent->prog_mutex))
3807 {
3808 GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex");
3809 return false;
3810 }
3811 if (pthread_rwlock_destroy (&agent->module_rwlock))
3812 {
3813 GOMP_PLUGIN_error ("Failed to destroy a GCN agent rwlock");
3814 return false;
3815 }
3816
3817 if (pthread_mutex_destroy (&agent->async_queues_mutex))
3818 {
3819 GOMP_PLUGIN_error ("Failed to destroy a GCN agent queue mutex");
3820 return false;
3821 }
3822 if (pthread_mutex_destroy (&agent->ephemeral_memories_write_lock))
3823 {
3824 GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex");
3825 return false;
3826 }
3827 agent->initialized = false;
3828 return true;
3829 }
3830
3831 /* Return true if the HSA runtime can run function FN_PTR. */
3832
3833 bool
3834 GOMP_OFFLOAD_can_run (void *fn_ptr)
3835 {
3836 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3837
3838 init_kernel (kernel);
3839 if (kernel->initialization_failed)
3840 goto failure;
3841
3842 return true;
3843
3844 failure:
3845 if (suppress_host_fallback)
3846 GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
3847 GCN_WARNING ("GCN target cannot be launched, doing a host fallback\n");
3848 return false;
3849 }
3850
3851 /* Allocate memory on device N. */
3852
3853 void *
3854 GOMP_OFFLOAD_alloc (int n, size_t size)
3855 {
3856 struct agent_info *agent = get_agent_info (n);
3857 return alloc_by_agent (agent, size);
3858 }
3859
3860 /* Free memory from device N. */
3861
3862 bool
3863 GOMP_OFFLOAD_free (int device, void *ptr)
3864 {
3865 GCN_DEBUG ("Freeing memory on device %d\n", device);
3866
3867 hsa_status_t status = hsa_fns.hsa_memory_free_fn (ptr);
3868 if (status != HSA_STATUS_SUCCESS)
3869 {
3870 hsa_error ("Could not free device memory", status);
3871 return false;
3872 }
3873
3874 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
3875 bool profiling_dispatch_p
3876 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
3877 if (profiling_dispatch_p)
3878 {
3879 acc_prof_info *prof_info = thr->prof_info;
3880 acc_event_info data_event_info;
3881 acc_api_info *api_info = thr->api_info;
3882
3883 prof_info->event_type = acc_ev_free;
3884
3885 data_event_info.data_event.event_type = prof_info->event_type;
3886 data_event_info.data_event.valid_bytes
3887 = _ACC_DATA_EVENT_INFO_VALID_BYTES;
3888 data_event_info.data_event.parent_construct
3889 = acc_construct_parallel;
3890 data_event_info.data_event.implicit = 1;
3891 data_event_info.data_event.tool_info = NULL;
3892 data_event_info.data_event.var_name = NULL;
3893 data_event_info.data_event.bytes = 0;
3894 data_event_info.data_event.host_ptr = NULL;
3895 data_event_info.data_event.device_ptr = (void *) ptr;
3896
3897 api_info->device_api = acc_device_api_other;
3898
3899 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
3900 api_info);
3901 }
3902
3903 return true;
3904 }
3905
3906 /* Copy data from DEVICE to host. */
3907
3908 bool
3909 GOMP_OFFLOAD_dev2host (int device, void *dst, const void *src, size_t n)
3910 {
3911 GCN_DEBUG ("Copying %zu bytes from device %d (%p) to host (%p)\n", n, device,
3912 src, dst);
  hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("memory copy failed", status);
  return true;
3917 }
3918
3919 /* Copy data from host to DEVICE. */
3920
3921 bool
3922 GOMP_OFFLOAD_host2dev (int device, void *dst, const void *src, size_t n)
3923 {
3924 GCN_DEBUG ("Copying %zu bytes from host (%p) to device %d (%p)\n", n, src,
3925 device, dst);
3926 hsa_memory_copy_wrapper (dst, src, n);
3927 return true;
3928 }
3929
3930 /* Copy data within DEVICE. Do the copy asynchronously, if appropriate. */
3931
3932 bool
3933 GOMP_OFFLOAD_dev2dev (int device, void *dst, const void *src, size_t n)
3934 {
3935 struct gcn_thread *thread_data = gcn_thread ();
3936
3937 if (thread_data && !async_synchronous_p (thread_data->async))
3938 {
3939 struct agent_info *agent = get_agent_info (device);
3940 maybe_init_omp_async (agent);
3941 queue_push_copy (agent->omp_async_queue, dst, src, n);
3942 return true;
3943 }
3944
3945 GCN_DEBUG ("Copying %zu bytes from device %d (%p) to device %d (%p)\n", n,
3946 device, src, device, dst);
  hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("memory copy failed", status);
  return true;
3951 }
3952
3953 /* Here <quantity>_size refers to <quantity> multiplied by size -- i.e.
3954 measured in bytes. So we have:
3955
3956 dim1_size: number of bytes to copy on innermost dimension ("row")
3957 dim0_len: number of rows to copy
3958 dst: base pointer for destination of copy
3959 dst_offset1_size: innermost row offset (for dest), in bytes
3960 dst_offset0_len: offset, number of rows (for dest)
3961 dst_dim1_size: whole-array dest row length, in bytes (pitch)
3962 src: base pointer for source of copy
3963 src_offset1_size: innermost row offset (for source), in bytes
3964 src_offset0_len: offset, number of rows (for source)
3965 src_dim1_size: whole-array source row length, in bytes (pitch)
3966 */
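
/* A worked example with made-up numbers: copying a 3-row window of 16 bytes
   per row out of a 10 x 64-byte host array, starting at row 2, byte 8, into
   the top-left corner of a 5 x 32-byte device array, would be

     GOMP_OFFLOAD_memcpy2d (dev, -1,
			    16, 3,		// dim1_size, dim0_len
			    dst, 0, 0, 32,	// dst, offsets, pitch
			    src, 8, 2, 64);	// src, offsets, pitch

   The host region pinned below then covers 64 * (3 - 1) + 16 = 144 bytes,
   starting at byte 2 * 64 + 8 = 136 of the source array.  */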
3967
3968 int
3969 GOMP_OFFLOAD_memcpy2d (int dst_ord, int src_ord, size_t dim1_size,
3970 size_t dim0_len, void *dst, size_t dst_offset1_size,
3971 size_t dst_offset0_len, size_t dst_dim1_size,
3972 const void *src, size_t src_offset1_size,
3973 size_t src_offset0_len, size_t src_dim1_size)
3974 {
3975 if (!hsa_fns.hsa_amd_memory_lock_fn
3976 || !hsa_fns.hsa_amd_memory_unlock_fn
3977 || !hsa_fns.hsa_amd_memory_async_copy_rect_fn)
3978 return -1;
3979
3980 /* GCN hardware requires 4-byte alignment for base addresses & pitches. Bail
3981 out quietly if we have anything oddly-aligned rather than letting the
3982 driver raise an error. */
3983 if ((((uintptr_t) dst) & 3) != 0 || (((uintptr_t) src) & 3) != 0)
3984 return -1;
3985
3986 if ((dst_dim1_size & 3) != 0 || (src_dim1_size & 3) != 0)
3987 return -1;
3988
3989 /* Only handle host to device or device to host transfers here. */
3990 if ((dst_ord == -1 && src_ord == -1)
3991 || (dst_ord != -1 && src_ord != -1))
3992 return -1;
3993
3994 hsa_amd_copy_direction_t dir
3995 = (src_ord == -1) ? hsaHostToDevice : hsaDeviceToHost;
3996 hsa_agent_t copy_agent;
3997
3998 /* We need to pin (lock) host memory before we start the transfer. Try to
3999 lock the minimum size necessary, i.e. using partial first/last rows of the
4000 whole array. Something like this:
4001
       rows -->
       ..............
   c | ..#######+++++ <- first row apart from {src,dst}_offset1_size
   o | ++#######+++++ <- whole row
   l | ++#######+++++ <- "
   s v ++#######..... <- last row apart from trailing remainder
       ..............
4009
4010 We could split very large transfers into several rectangular copies, but
4011 that is unimplemented for now. */
4012
4013 size_t bounded_size_host, first_elem_offset_host;
4014 void *host_ptr;
4015 if (dir == hsaHostToDevice)
4016 {
4017 bounded_size_host = src_dim1_size * (dim0_len - 1) + dim1_size;
4018 first_elem_offset_host = src_offset0_len * src_dim1_size
4019 + src_offset1_size;
4020 host_ptr = (void *) src;
4021 struct agent_info *agent = get_agent_info (dst_ord);
4022 copy_agent = agent->id;
4023 }
4024 else
4025 {
4026 bounded_size_host = dst_dim1_size * (dim0_len - 1) + dim1_size;
4027 first_elem_offset_host = dst_offset0_len * dst_dim1_size
4028 + dst_offset1_size;
4029 host_ptr = dst;
4030 struct agent_info *agent = get_agent_info (src_ord);
4031 copy_agent = agent->id;
4032 }
4033
4034 void *agent_ptr;
4035
4036 hsa_status_t status
4037 = hsa_fns.hsa_amd_memory_lock_fn (host_ptr + first_elem_offset_host,
4038 bounded_size_host, NULL, 0, &agent_ptr);
4039 /* We can't lock the host memory: don't give up though, we might still be
4040 able to use the slow path in our caller. So, don't make this an
4041 error. */
4042 if (status != HSA_STATUS_SUCCESS)
4043 return -1;
4044
4045 hsa_pitched_ptr_t dstpp, srcpp;
4046 hsa_dim3_t dst_offsets, src_offsets, ranges;
4047
4048 int retval = 1;
4049
4050 hsa_signal_t completion_signal;
4051 status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &completion_signal);
4052 if (status != HSA_STATUS_SUCCESS)
4053 {
4054 retval = -1;
4055 goto unlock;
4056 }
4057
4058 if (dir == hsaHostToDevice)
4059 {
4060 srcpp.base = agent_ptr - first_elem_offset_host;
4061 dstpp.base = dst;
4062 }
4063 else
4064 {
4065 srcpp.base = (void *) src;
4066 dstpp.base = agent_ptr - first_elem_offset_host;
4067 }
4068
4069 srcpp.pitch = src_dim1_size;
4070 srcpp.slice = 0;
4071
4072 src_offsets.x = src_offset1_size;
4073 src_offsets.y = src_offset0_len;
4074 src_offsets.z = 0;
4075
4076 dstpp.pitch = dst_dim1_size;
4077 dstpp.slice = 0;
4078
4079 dst_offsets.x = dst_offset1_size;
4080 dst_offsets.y = dst_offset0_len;
4081 dst_offsets.z = 0;
4082
4083 ranges.x = dim1_size;
4084 ranges.y = dim0_len;
4085 ranges.z = 1;
4086
4087 status
4088 = hsa_fns.hsa_amd_memory_async_copy_rect_fn (&dstpp, &dst_offsets, &srcpp,
4089 &src_offsets, &ranges,
4090 copy_agent, dir, 0, NULL,
4091 completion_signal);
4092 /* If the rectangular copy fails, we might still be able to use the slow
4093 path. We need to unlock the host memory though, so don't return
4094 immediately. */
4095 if (status != HSA_STATUS_SUCCESS)
4096 retval = -1;
  else
    {
      hsa_signal_value_t sv
	= hsa_fns.hsa_signal_wait_acquire_fn (completion_signal,
					      HSA_SIGNAL_CONDITION_LT, 1,
					      UINT64_MAX,
					      HSA_WAIT_STATE_ACTIVE);
      if (sv < 0)
	{
	  GCN_WARNING ("async copy rect failure");
	  retval = -1;
	}
    }
4101
4102 hsa_fns.hsa_signal_destroy_fn (completion_signal);
4103
4104 unlock:
4105 status = hsa_fns.hsa_amd_memory_unlock_fn (host_ptr + first_elem_offset_host);
4106 if (status != HSA_STATUS_SUCCESS)
4107 hsa_fatal ("Could not unlock host memory", status);
4108
4109 return retval;
4110 }
4111
4112 /* As above, <quantity>_size refers to <quantity> multiplied by size -- i.e.
4113 measured in bytes. So we have:
4114
4115 dim2_size: number of bytes to copy on innermost dimension ("row")
4116 dim1_len: number of rows per slice to copy
4117 dim0_len: number of slices to copy
4118 dst: base pointer for destination of copy
4119 dst_offset2_size: innermost row offset (for dest), in bytes
4120 dst_offset1_len: offset, number of rows (for dest)
4121 dst_offset0_len: offset, number of slices (for dest)
4122 dst_dim2_size: whole-array dest row length, in bytes (pitch)
4123 dst_dim1_len: whole-array number of rows in slice (for dest)
4124 src: base pointer for source of copy
4125 src_offset2_size: innermost row offset (for source), in bytes
4126 src_offset1_len: offset, number of rows (for source)
4127 src_offset0_len: offset, number of slices (for source)
4128 src_dim2_size: whole-array source row length, in bytes (pitch)
4129 src_dim1_len: whole-array number of rows in slice (for source)
4130 */
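
/* Again a worked example with made-up numbers: pulling a 2-slice, 3-row,
   16-bytes-per-row window out of a device array of 4 slices of 8 rows of
   64 bytes, starting at slice 1, row 2, byte 4, into the same position in
   an equally-shaped host buffer, would be

     GOMP_OFFLOAD_memcpy3d (-1, dev,
			    16, 3, 2,		  // dim2_size, dim1_len, dim0_len
			    dst, 4, 2, 1, 64, 8,  // dst, offsets, pitch, rows
			    src, 4, 2, 1, 64, 8); // src, offsets, pitch, rows

   Both pitches are 64 bytes and both slice strides 64 * 8 = 512 bytes.  */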
4131
4132 int
4133 GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size,
4134 size_t dim1_len, size_t dim0_len, void *dst,
4135 size_t dst_offset2_size, size_t dst_offset1_len,
4136 size_t dst_offset0_len, size_t dst_dim2_size,
4137 size_t dst_dim1_len, const void *src,
4138 size_t src_offset2_size, size_t src_offset1_len,
4139 size_t src_offset0_len, size_t src_dim2_size,
4140 size_t src_dim1_len)
4141 {
4142 if (!hsa_fns.hsa_amd_memory_lock_fn
4143 || !hsa_fns.hsa_amd_memory_unlock_fn
4144 || !hsa_fns.hsa_amd_memory_async_copy_rect_fn)
4145 return -1;
4146
4147 /* GCN hardware requires 4-byte alignment for base addresses & pitches. Bail
4148 out quietly if we have anything oddly-aligned rather than letting the
4149 driver raise an error. */
4150 if ((((uintptr_t) dst) & 3) != 0 || (((uintptr_t) src) & 3) != 0)
4151 return -1;
4152
4153 if ((dst_dim2_size & 3) != 0 || (src_dim2_size & 3) != 0)
4154 return -1;
4155
4156 /* Only handle host to device or device to host transfers here. */
4157 if ((dst_ord == -1 && src_ord == -1)
4158 || (dst_ord != -1 && src_ord != -1))
4159 return -1;
4160
4161 hsa_amd_copy_direction_t dir
4162 = (src_ord == -1) ? hsaHostToDevice : hsaDeviceToHost;
4163 hsa_agent_t copy_agent;
4164
4165 /* We need to pin (lock) host memory before we start the transfer. Try to
4166 lock the minimum size necessary, i.e. using partial first/last slices of
4167 the whole 3D array. Something like this:
4168
        slice 0:          slice 1:          slice 2:
        __________        __________        __________
     ^ /+++++++++/ :     /+++++++++/ :     /         /
column /+++##++++/|  |  /+++##++++/|  |  /+++##     /     # = subarray
    / /+++##++++/ |  | /+++##++++/ |  | /+++##++++/       + = area to pin
      /_________/ :     /_________/ :    /_________/
           row --->
4176
4177 We could split very large transfers into several rectangular copies, but
4178 that is unimplemented for now. */
4179
4180 size_t bounded_size_host, first_elem_offset_host;
4181 void *host_ptr;
4182 if (dir == hsaHostToDevice)
4183 {
4184 size_t slice_bytes = src_dim2_size * src_dim1_len;
4185 bounded_size_host = slice_bytes * (dim0_len - 1)
4186 + src_dim2_size * (dim1_len - 1)
4187 + dim2_size;
4188 first_elem_offset_host = src_offset0_len * slice_bytes
4189 + src_offset1_len * src_dim2_size
4190 + src_offset2_size;
4191 host_ptr = (void *) src;
4192 struct agent_info *agent = get_agent_info (dst_ord);
4193 copy_agent = agent->id;
4194 }
4195 else
4196 {
4197 size_t slice_bytes = dst_dim2_size * dst_dim1_len;
4198 bounded_size_host = slice_bytes * (dim0_len - 1)
4199 + dst_dim2_size * (dim1_len - 1)
4200 + dim2_size;
4201 first_elem_offset_host = dst_offset0_len * slice_bytes
4202 + dst_offset1_len * dst_dim2_size
4203 + dst_offset2_size;
4204 host_ptr = dst;
4205 struct agent_info *agent = get_agent_info (src_ord);
4206 copy_agent = agent->id;
4207 }
4208
4209 void *agent_ptr;
4210
4211 hsa_status_t status
4212 = hsa_fns.hsa_amd_memory_lock_fn (host_ptr + first_elem_offset_host,
4213 bounded_size_host, NULL, 0, &agent_ptr);
4214 /* We can't lock the host memory: don't give up though, we might still be
4215 able to use the slow path in our caller (maybe even with iterated memcpy2d
4216 calls). So, don't make this an error. */
4217 if (status != HSA_STATUS_SUCCESS)
4218 return -1;
4219
4220 hsa_pitched_ptr_t dstpp, srcpp;
4221 hsa_dim3_t dst_offsets, src_offsets, ranges;
4222
4223 int retval = 1;
4224
4225 hsa_signal_t completion_signal;
4226 status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &completion_signal);
4227 if (status != HSA_STATUS_SUCCESS)
4228 {
4229 retval = -1;
4230 goto unlock;
4231 }
4232
4233 if (dir == hsaHostToDevice)
4234 {
4235 srcpp.base = agent_ptr - first_elem_offset_host;
4236 dstpp.base = dst;
4237 }
4238 else
4239 {
4240 srcpp.base = (void *) src;
4241 dstpp.base = agent_ptr - first_elem_offset_host;
4242 }
4243
4244 /* Pitch is measured in bytes. */
4245 srcpp.pitch = src_dim2_size;
4246 /* Slice is also measured in bytes (i.e. total per-slice). */
4247 srcpp.slice = src_dim2_size * src_dim1_len;
4248
4249 src_offsets.x = src_offset2_size;
4250 src_offsets.y = src_offset1_len;
4251 src_offsets.z = src_offset0_len;
4252
4253 /* As above. */
4254 dstpp.pitch = dst_dim2_size;
4255 dstpp.slice = dst_dim2_size * dst_dim1_len;
4256
4257 dst_offsets.x = dst_offset2_size;
4258 dst_offsets.y = dst_offset1_len;
4259 dst_offsets.z = dst_offset0_len;
4260
4261 ranges.x = dim2_size;
4262 ranges.y = dim1_len;
4263 ranges.z = dim0_len;
4264
4265 status
4266 = hsa_fns.hsa_amd_memory_async_copy_rect_fn (&dstpp, &dst_offsets, &srcpp,
4267 &src_offsets, &ranges,
4268 copy_agent, dir, 0, NULL,
4269 completion_signal);
4270 /* If the rectangular copy fails, we might still be able to use the slow
4271 path. We need to unlock the host memory though, so don't return
4272 immediately. */
4273 if (status != HSA_STATUS_SUCCESS)
4274 retval = -1;
4275 else
4276 {
4277 hsa_signal_value_t sv
4278 = hsa_fns.hsa_signal_wait_acquire_fn (completion_signal,
4279 HSA_SIGNAL_CONDITION_LT, 1,
4280 UINT64_MAX,
4281 HSA_WAIT_STATE_ACTIVE);
4282 if (sv < 0)
4283 {
4284 GCN_WARNING ("async copy rect failure");
4285 retval = -1;
4286 }
4287 }
4288
4289 hsa_fns.hsa_signal_destroy_fn (completion_signal);
4290
4291 unlock:
4292 status = hsa_fns.hsa_amd_memory_unlock_fn (host_ptr + first_elem_offset_host);
4293 if (status != HSA_STATUS_SUCCESS)
4294 hsa_fatal ("Could not unlock host memory", status);
4295
4296 return retval;
4297 }
4298
4299 /* }}} */
4300 /* {{{ OpenMP Plugin API */
4301
4302 /* Run a synchronous OpenMP kernel on DEVICE and pass it an array of pointers
4303 in VARS as a parameter. The kernel is identified by FN_PTR which must point
4304 to a kernel_info structure, and must have previously been loaded to the
4305 specified device. */
4306
4307 void
4308 GOMP_OFFLOAD_run (int device, void *fn_ptr, void *vars, void **args)
4309 {
4310 struct agent_info *agent = get_agent_info (device);
4311 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
4312 struct GOMP_kernel_launch_attributes def;
4313 struct GOMP_kernel_launch_attributes *kla;
4314 assert (agent == kernel->agent);
4315
4316 /* If we get here then the kernel must be OpenMP. */
4317 kernel->kind = KIND_OPENMP;
4318
4319 if (!parse_target_attributes (args, &def, &kla, agent))
4320 {
4321 GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
4322 return;
4323 }
4324 run_kernel (kernel, vars, kla, NULL, false);
4325 }
4326
4327 /* Run an asynchronous OpenMP kernel on DEVICE. This is similar to
4328 GOMP_OFFLOAD_run except that the launch is queued and there is a call to
4329 GOMP_PLUGIN_target_task_completion when it has finished. */
4330
4331 void
4332 GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
4333 void **args, void *async_data)
4334 {
4335 GCN_DEBUG ("GOMP_OFFLOAD_async_run invoked\n");
4336 struct agent_info *agent = get_agent_info (device);
4337 struct kernel_info *kernel = (struct kernel_info *) tgt_fn;
4338 struct GOMP_kernel_launch_attributes def;
4339 struct GOMP_kernel_launch_attributes *kla;
4340 assert (agent == kernel->agent);
4341
4342 /* If we get here then the kernel must be OpenMP. */
4343 kernel->kind = KIND_OPENMP;
4344
4345 if (!parse_target_attributes (args, &def, &kla, agent))
4346 {
4347 GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
4348 return;
4349 }
4350
4351 maybe_init_omp_async (agent);
4352 queue_push_launch (agent->omp_async_queue, kernel, tgt_vars, kla);
4353 queue_push_callback (agent->omp_async_queue,
4354 GOMP_PLUGIN_target_task_completion, async_data);
4355 }
4356
4357 /* }}} */
4358 /* {{{ OpenACC Plugin API */
4359
4360 /* Run a synchronous OpenACC kernel. The device number is inferred from the
4361 already-loaded KERNEL. */
4362
4363 void
4364 GOMP_OFFLOAD_openacc_exec (void (*fn_ptr) (void *),
4365 size_t mapnum __attribute__((unused)),
4366 void **hostaddrs __attribute__((unused)),
4367 void **devaddrs, unsigned *dims,
4368 void *targ_mem_desc)
4369 {
4370 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
4371
4372 gcn_exec (kernel, devaddrs, dims, targ_mem_desc, false, NULL);
4373 }
4374
4375 /* Run an asynchronous OpenACC kernel on the specified queue. */
4376
4377 void
4378 GOMP_OFFLOAD_openacc_async_exec (void (*fn_ptr) (void *),
4379 size_t mapnum __attribute__((unused)),
4380 void **hostaddrs __attribute__((unused)),
4381 void **devaddrs,
4382 unsigned *dims, void *targ_mem_desc,
4383 struct goacc_asyncqueue *aq)
4384 {
4385 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
4386
4387 gcn_exec (kernel, devaddrs, dims, targ_mem_desc, true, aq);
4388 }
4389
4390 /* Create a new asynchronous thread and queue for running future kernels. */
4391
4392 struct goacc_asyncqueue *
4393 GOMP_OFFLOAD_openacc_async_construct (int device)
4394 {
4395 struct agent_info *agent = get_agent_info (device);
4396
4397 pthread_mutex_lock (&agent->async_queues_mutex);
4398
4399 struct goacc_asyncqueue *aq = GOMP_PLUGIN_malloc (sizeof (*aq));
4400 aq->agent = get_agent_info (device);
4401 aq->prev = NULL;
4402 aq->next = agent->async_queues;
4403 if (aq->next)
4404 {
4405 aq->next->prev = aq;
4406 aq->id = aq->next->id + 1;
4407 }
4408 else
4409 aq->id = 1;
4410 agent->async_queues = aq;
4411
4412 aq->queue_first = 0;
4413 aq->queue_n = 0;
4414 aq->drain_queue_stop = 0;
4415
  if (pthread_mutex_init (&aq->mutex, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
      pthread_mutex_unlock (&agent->async_queues_mutex);
      return NULL;
    }
  if (pthread_cond_init (&aq->queue_cond_in, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
      pthread_mutex_unlock (&agent->async_queues_mutex);
      return NULL;
    }
  if (pthread_cond_init (&aq->queue_cond_out, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
      pthread_mutex_unlock (&agent->async_queues_mutex);
      return NULL;
    }
4431
4432 hsa_status_t status = hsa_fns.hsa_queue_create_fn (agent->id,
4433 ASYNC_QUEUE_SIZE,
4434 HSA_QUEUE_TYPE_MULTI,
4435 hsa_queue_callback, NULL,
4436 UINT32_MAX, UINT32_MAX,
4437 &aq->hsa_queue);
4438 if (status != HSA_STATUS_SUCCESS)
4439 hsa_fatal ("Error creating command queue", status);
4440
4441 int err = pthread_create (&aq->thread_drain_queue, NULL, &drain_queue, aq);
4442 if (err != 0)
4443 GOMP_PLUGIN_fatal ("GCN asynchronous thread creation failed: %s",
4444 strerror (err));
4445 GCN_DEBUG ("Async thread %d:%d: created\n", aq->agent->device_id,
4446 aq->id);
4447
4448 pthread_mutex_unlock (&agent->async_queues_mutex);
4449
4450 return aq;
4451 }
4452
4453 /* Destroy an existing asynchronous thread and queue. Waits for any
4454 currently-running task to complete, but cancels any queued tasks. */
4455
4456 bool
4457 GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
4458 {
4459 struct agent_info *agent = aq->agent;
4460
4461 finalize_async_thread (aq);
4462
4463 pthread_mutex_lock (&agent->async_queues_mutex);
4464
4465 int err;
4466 if ((err = pthread_mutex_destroy (&aq->mutex)))
4467 {
4468 GOMP_PLUGIN_error ("Failed to destroy a GCN async queue mutex: %d", err);
4469 goto fail;
4470 }
4471 if (pthread_cond_destroy (&aq->queue_cond_in))
4472 {
4473 GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
4474 goto fail;
4475 }
4476 if (pthread_cond_destroy (&aq->queue_cond_out))
4477 {
4478 GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
4479 goto fail;
4480 }
4481 hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (aq->hsa_queue);
4482 if (status != HSA_STATUS_SUCCESS)
4483 {
4484 hsa_error ("Error destroying command queue", status);
4485 goto fail;
4486 }
4487
4488 if (aq->prev)
4489 aq->prev->next = aq->next;
4490 if (aq->next)
4491 aq->next->prev = aq->prev;
4492 if (agent->async_queues == aq)
4493 agent->async_queues = aq->next;
4494
4495 GCN_DEBUG ("Async thread %d:%d: destroyed\n", agent->device_id, aq->id);
4496
4497 free (aq);
4498 pthread_mutex_unlock (&agent->async_queues_mutex);
4499 return true;
4500
4501 fail:
4502 pthread_mutex_unlock (&agent->async_queues_mutex);
4503 return false;
4504 }
4505
4506 /* Return true if the specified async queue is currently empty. */
4507
4508 int
4509 GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
4510 {
4511 return queue_empty (aq);
4512 }
4513
4514 /* Block until the specified queue has executed all its tasks and the
4515 queue is empty. */
4516
4517 bool
4518 GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
4519 {
4520 wait_queue (aq);
4521 return true;
4522 }
4523
4524 /* Add a serialization point across two async queues. Any new tasks added to
4525 AQ2, after this call, will not run until all tasks on AQ1, at the time
4526 of this call, have completed. */
4527
4528 bool
4529 GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
4530 struct goacc_asyncqueue *aq2)
4531 {
4532 /* For serialize, stream aq2 waits for aq1 to complete work that has been
4533 scheduled to run on it up to this point. */
4534 if (aq1 != aq2)
4535 {
4536 struct placeholder *placeholderp = queue_push_placeholder (aq1);
4537 queue_push_asyncwait (aq2, placeholderp);
4538 }
4539 return true;
4540 }
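
/* For illustration, this is the hook behind OpenACC code such as

     #pragma acc parallel async(1)
     ...
     #pragma acc wait(1) async(2)
     #pragma acc parallel async(2)
     ...

   where the second kernel must not start before the first finishes, but the
   host thread should not block: a placeholder is pushed onto queue 1 and
   queue 2 is made to wait for it before running anything queued later.  */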
4541
4542 /* Add an opaque callback to the given async queue. */
4543
4544 void
4545 GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
4546 void (*fn) (void *), void *data)
4547 {
4548 queue_push_callback (aq, fn, data);
4549 }
4550
4551 /* Queue up an asynchronous data copy from host to DEVICE. */
4552
4553 bool
4554 GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
4555 size_t n, struct goacc_asyncqueue *aq)
4556 {
4557 struct agent_info *agent = get_agent_info (device);
4558 assert (agent == aq->agent);
4559 queue_push_copy (aq, dst, src, n);
4560 return true;
4561 }
4562
4563 /* Queue up an asynchronous data copy from DEVICE to host. */
4564
4565 bool
4566 GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src,
4567 size_t n, struct goacc_asyncqueue *aq)
4568 {
4569 struct agent_info *agent = get_agent_info (device);
4570 assert (agent == aq->agent);
4571 queue_push_copy (aq, dst, src, n);
4572 return true;
4573 }
4574
4575 union goacc_property_value
4576 GOMP_OFFLOAD_openacc_get_property (int device, enum goacc_property prop)
4577 {
4578 struct agent_info *agent = get_agent_info (device);
4579
4580 union goacc_property_value propval = { .val = 0 };
4581
4582 switch (prop)
4583 {
4584 case GOACC_PROPERTY_FREE_MEMORY:
4585 /* Not supported. */
4586 break;
4587 case GOACC_PROPERTY_MEMORY:
4588 {
4589 size_t size;
4590 hsa_region_t region = agent->data_region;
4591 hsa_status_t status =
4592 hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
4593 if (status == HSA_STATUS_SUCCESS)
4594 propval.val = size;
4595 break;
4596 }
4597 case GOACC_PROPERTY_NAME:
4598 propval.ptr = agent->name;
4599 break;
4600 case GOACC_PROPERTY_VENDOR:
4601 propval.ptr = agent->vendor_name;
4602 break;
4603 case GOACC_PROPERTY_DRIVER:
4604 propval.ptr = hsa_context.driver_version_s;
4605 break;
4606 }
4607
4608 return propval;
4609 }
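
/* For illustration, these values surface to user code through the OpenACC
   property API (names as in GCC's openacc.h), e.g.:

     size_t mem = acc_get_property (0, acc_device_radeon,
				    acc_property_memory);
     const char *name
       = acc_get_property_string (0, acc_device_radeon, acc_property_name);

   acc_property_free_memory reports 0 here, as this plugin does not
   implement a free-memory query.  */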
4610
/* Set up plugin-specific thread-local data (host-side).  */
4612
4613 void *
4614 GOMP_OFFLOAD_openacc_create_thread_data (int ord __attribute__((unused)))
4615 {
4616 struct gcn_thread *thread_data
4617 = GOMP_PLUGIN_malloc (sizeof (struct gcn_thread));
4618
4619 thread_data->async = GOMP_ASYNC_SYNC;
4620
4621 return (void *) thread_data;
4622 }
4623
/* Clean up plugin-specific thread-local data.  */
4625
4626 void
4627 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
4628 {
4629 free (data);
4630 }
4631
4632 /* }}} */