1 /* Plugin for AMD GCN execution.
3 Copyright (C) 2013-2020 Free Software Foundation, Inc.
5 Contributed by Mentor Embedded
7 This file is part of the GNU Offloading and Multi Processing Library
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
29 /* {{{ Includes and defines */
42 #include "libgomp-plugin.h"
43 #include "gomp-constants.h"
45 #include "oacc-plugin.h"
49 /* Additional definitions not in HSA 1.1.
50 FIXME: this needs to be updated in hsa.h for upstream, but the only source
51 right now is the ROCr source which may cause license issues. */
52 #define HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT 0xA002
54 /* These probably won't be in elf.h for a while. */
56 #define R_AMDGPU_NONE 0
57 #define R_AMDGPU_ABS32_LO 1 /* (S + A) & 0xFFFFFFFF */
58 #define R_AMDGPU_ABS32_HI 2 /* (S + A) >> 32 */
59 #define R_AMDGPU_ABS64 3 /* S + A */
60 #define R_AMDGPU_REL32 4 /* S + A - P */
61 #define R_AMDGPU_REL64 5 /* S + A - P */
62 #define R_AMDGPU_ABS32 6 /* S + A */
63 #define R_AMDGPU_GOTPCREL 7 /* G + GOT + A - P */
64 #define R_AMDGPU_GOTPCREL32_LO 8 /* (G + GOT + A - P) & 0xFFFFFFFF */
65 #define R_AMDGPU_GOTPCREL32_HI 9 /* (G + GOT + A - P) >> 32 */
66 #define R_AMDGPU_REL32_LO 10 /* (S + A - P) & 0xFFFFFFFF */
67 #define R_AMDGPU_REL32_HI 11 /* (S + A - P) >> 32 */
68 #define R_AMDGPU_RELATIVE64 13 /* B + A */
71 /* GCN specific definitions for asynchronous queues. */
73 #define ASYNC_QUEUE_SIZE 64
74 #define DRAIN_QUEUE_SYNCHRONOUS_P false
75 #define DEBUG_QUEUES 0
76 #define DEBUG_THREAD_SLEEP 0
77 #define DEBUG_THREAD_SIGNAL 0
80 #define DEFAULT_GCN_HEAP_SIZE (100*1024*1024) /* 100MB. */
82 /* Secure getenv() which returns NULL if running as SUID/SGID. */
83 #ifndef HAVE_SECURE_GETENV
84 #ifdef HAVE___SECURE_GETENV
85 #define secure_getenv __secure_getenv
86 #elif defined (HAVE_UNISTD_H) && defined(HAVE_GETUID) && defined(HAVE_GETEUID) \
87 && defined(HAVE_GETGID) && defined(HAVE_GETEGID)
91 /* Implementation of secure_getenv() for targets where it is not provided but
92 we have at least means to test real and effective IDs. */
/* Implementation of secure_getenv() for targets where it is not provided but
   we have at least means to test real and effective IDs.  Returns NULL
   instead of the variable's value when running with elevated privileges
   (SUID/SGID), so environment-driven configuration cannot be abused.  */

static char *
secure_getenv (const char *name)
{
  /* Only honor the environment when real and effective IDs match.  */
  if ((getuid () == geteuid ()) && (getgid () == getegid ()))
    return getenv (name);
  else
    return NULL;
}
104 #define secure_getenv getenv
/* GCN-specific implementation of the GOMP_PLUGIN_acc_thread data.  */

struct gcn_thread
{
  /* The thread number from the async clause, or GOMP_ASYNC_SYNC.  */
  int async;
};
119 /* As an HSA runtime is dlopened, following structure defines function
120 pointers utilized by the HSA plug-in. */
122 struct hsa_runtime_fn_info
125 hsa_status_t (*hsa_status_string_fn
) (hsa_status_t status
,
126 const char **status_string
);
127 hsa_status_t (*hsa_system_get_info_fn
) (hsa_system_info_t attribute
,
129 hsa_status_t (*hsa_agent_get_info_fn
) (hsa_agent_t agent
,
130 hsa_agent_info_t attribute
,
132 hsa_status_t (*hsa_isa_get_info_fn
)(hsa_isa_t isa
,
133 hsa_isa_info_t attribute
,
136 hsa_status_t (*hsa_init_fn
) (void);
137 hsa_status_t (*hsa_iterate_agents_fn
)
138 (hsa_status_t (*callback
)(hsa_agent_t agent
, void *data
), void *data
);
139 hsa_status_t (*hsa_region_get_info_fn
) (hsa_region_t region
,
140 hsa_region_info_t attribute
,
142 hsa_status_t (*hsa_queue_create_fn
)
143 (hsa_agent_t agent
, uint32_t size
, hsa_queue_type_t type
,
144 void (*callback
)(hsa_status_t status
, hsa_queue_t
*source
, void *data
),
145 void *data
, uint32_t private_segment_size
,
146 uint32_t group_segment_size
, hsa_queue_t
**queue
);
147 hsa_status_t (*hsa_agent_iterate_regions_fn
)
149 hsa_status_t (*callback
)(hsa_region_t region
, void *data
), void *data
);
150 hsa_status_t (*hsa_executable_destroy_fn
) (hsa_executable_t executable
);
151 hsa_status_t (*hsa_executable_create_fn
)
152 (hsa_profile_t profile
, hsa_executable_state_t executable_state
,
153 const char *options
, hsa_executable_t
*executable
);
154 hsa_status_t (*hsa_executable_global_variable_define_fn
)
155 (hsa_executable_t executable
, const char *variable_name
, void *address
);
156 hsa_status_t (*hsa_executable_load_code_object_fn
)
157 (hsa_executable_t executable
, hsa_agent_t agent
,
158 hsa_code_object_t code_object
, const char *options
);
159 hsa_status_t (*hsa_executable_freeze_fn
)(hsa_executable_t executable
,
160 const char *options
);
161 hsa_status_t (*hsa_signal_create_fn
) (hsa_signal_value_t initial_value
,
162 uint32_t num_consumers
,
163 const hsa_agent_t
*consumers
,
164 hsa_signal_t
*signal
);
165 hsa_status_t (*hsa_memory_allocate_fn
) (hsa_region_t region
, size_t size
,
167 hsa_status_t (*hsa_memory_assign_agent_fn
) (void *ptr
, hsa_agent_t agent
,
168 hsa_access_permission_t access
);
169 hsa_status_t (*hsa_memory_copy_fn
)(void *dst
, const void *src
, size_t size
);
170 hsa_status_t (*hsa_memory_free_fn
) (void *ptr
);
171 hsa_status_t (*hsa_signal_destroy_fn
) (hsa_signal_t signal
);
172 hsa_status_t (*hsa_executable_get_symbol_fn
)
173 (hsa_executable_t executable
, const char *module_name
,
174 const char *symbol_name
, hsa_agent_t agent
, int32_t call_convention
,
175 hsa_executable_symbol_t
*symbol
);
176 hsa_status_t (*hsa_executable_symbol_get_info_fn
)
177 (hsa_executable_symbol_t executable_symbol
,
178 hsa_executable_symbol_info_t attribute
, void *value
);
179 hsa_status_t (*hsa_executable_iterate_symbols_fn
)
180 (hsa_executable_t executable
,
181 hsa_status_t (*callback
)(hsa_executable_t executable
,
182 hsa_executable_symbol_t symbol
, void *data
),
184 uint64_t (*hsa_queue_add_write_index_release_fn
) (const hsa_queue_t
*queue
,
186 uint64_t (*hsa_queue_load_read_index_acquire_fn
) (const hsa_queue_t
*queue
);
187 void (*hsa_signal_store_relaxed_fn
) (hsa_signal_t signal
,
188 hsa_signal_value_t value
);
189 void (*hsa_signal_store_release_fn
) (hsa_signal_t signal
,
190 hsa_signal_value_t value
);
191 hsa_signal_value_t (*hsa_signal_wait_acquire_fn
)
192 (hsa_signal_t signal
, hsa_signal_condition_t condition
,
193 hsa_signal_value_t compare_value
, uint64_t timeout_hint
,
194 hsa_wait_state_t wait_state_hint
);
195 hsa_signal_value_t (*hsa_signal_load_acquire_fn
) (hsa_signal_t signal
);
196 hsa_status_t (*hsa_queue_destroy_fn
) (hsa_queue_t
*queue
);
198 hsa_status_t (*hsa_code_object_deserialize_fn
)
199 (void *serialized_code_object
, size_t serialized_code_object_size
,
200 const char *options
, hsa_code_object_t
*code_object
);
/* Structure describing the run-time and grid properties of an HSA kernel
   launch.  This needs to match the format passed to GOMP_OFFLOAD_run.  */

struct GOMP_kernel_launch_attributes
{
  /* Number of dimensions the workload has.  Maximum number is 3.  */
  uint32_t ndim;
  /* Size of the grid in the three respective dimensions.  */
  uint32_t gdims[3];
  /* Size of work-groups in the respective dimensions.  */
  uint32_t wdims[3];
};
/* Collection of information needed for a dispatch of a kernel from a
   kernel.  */

struct kernel_dispatch
{
  /* The agent on which the kernel will run.  */
  struct agent_info *agent;
  /* Pointer to a command queue associated with a kernel dispatch agent.  */
  void *queue;
  /* Pointer to a memory space used for kernel arguments passing.  */
  void *kernarg_address;
  /* Kernel object.  */
  uint64_t object;
  /* Synchronization signal used for dispatch synchronization.  */
  uint64_t signal;
  /* Private segment size.  */
  uint32_t private_segment_size;
  /* Group segment size.  */
  uint32_t group_segment_size;
};
/* Structure of the kernargs segment, supporting console output.

   This needs to match the definitions in Newlib, and the expectations
   in libgomp target code.  */

struct kernargs {
  /* Leave space for the real kernel arguments.
     OpenACC and OpenMP only use one pointer.  */
  int64_t dummy1;
  int64_t dummy2;

  /* A pointer to struct output, below, for console output data.  */
  int64_t out_ptr;

  /* A pointer to struct heap, below.  */
  int64_t heap_ptr;

  /* A pointer to an ephemeral memory arena.
     Only needed for OpenMP.  */
  int64_t arena_ptr;

  /* Output data.  */
  struct output {
    int return_value;
    /* Next slot in QUEUE the device will write.  */
    unsigned int next_output;
    struct printf_data {
      int written;
      char msg[128];
      int type;
      union {
	int64_t ivalue;
	double dvalue;
	char text[128];
      };
    } queue[1024];
    /* Next slot in QUEUE the host has consumed.  */
    unsigned int consumed;
  } output_data;
};
275 /* A queue entry for a future asynchronous launch. */
279 struct kernel_info
*kernel
;
281 struct GOMP_kernel_launch_attributes kla
;
284 /* A queue entry for a future callback. */
292 /* A data struct for the copy_data callback. */
300 struct goacc_asyncqueue
*aq
;
303 /* A queue entry for a placeholder. These correspond to a wait event. */
309 pthread_mutex_t mutex
;
312 /* A queue entry for a wait directive. */
314 struct asyncwait_info
316 struct placeholder
*placeholderp
;
319 /* Encode the type of an entry in an async queue. */
329 /* An entry in an async queue. */
333 enum entry_type type
;
335 struct kernel_launch launch
;
336 struct callback callback
;
337 struct asyncwait_info asyncwait
;
338 struct placeholder placeholder
;
342 /* An async queue header.
344 OpenMP may create one of these.
345 OpenACC may create many. */
347 struct goacc_asyncqueue
349 struct agent_info
*agent
;
350 hsa_queue_t
*hsa_queue
;
352 pthread_t thread_drain_queue
;
353 pthread_mutex_t mutex
;
354 pthread_cond_t queue_cond_in
;
355 pthread_cond_t queue_cond_out
;
356 struct queue_entry queue
[ASYNC_QUEUE_SIZE
];
359 int drain_queue_stop
;
362 struct goacc_asyncqueue
*prev
;
363 struct goacc_asyncqueue
*next
;
/* Mkoffload uses this structure to describe a kernel.

   OpenMP kernel dimensions are passed at runtime.
   OpenACC kernel dimensions are passed at compile time, here.  */

struct hsa_kernel_description
{
  const char *name;
  int oacc_dims[3];  /* Only present for GCN kernels.  */
};
/* Mkoffload uses this structure to describe an offload variable.  */

struct global_var_info
{
  const char *name;
  void *address;
};
/* Mkoffload uses this structure to describe all the kernels in a
   loadable module.  These are passed the libgomp via static constructors.  */

struct gcn_image_desc
{
  struct gcn_image {
    size_t size;
    void *image;
  } *gcn_image;
  const unsigned kernel_count;
  struct hsa_kernel_description *kernel_infos;
  const unsigned global_variable_count;
  struct global_var_info *global_variables;
};
/* This enum mirrors the corresponding LLVM enum's values for all ISAs that we
   support.
   See https://llvm.org/docs/AMDGPUUsage.html#amdgpu-ef-amdgpu-mach-table */

typedef enum {
  EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a,
  EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c,
  EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f
} EF_AMDGPU_MACH;

/* Mask extracting the EF_AMDGPU_MACH field from e_flags.  */
static const int EF_AMDGPU_MACH_MASK = 0x000000ff;
typedef EF_AMDGPU_MACH gcn_isa;
415 /* Description of an HSA GPU agent (device) and the program associated with
420 /* The HSA ID of the agent. Assigned when hsa_context is initialized. */
422 /* The user-visible device number. */
424 /* Whether the agent has been initialized. The fields below are usable only
428 /* The instruction set architecture of the device. */
430 /* Name of the agent. */
432 /* Name of the vendor of the agent. */
433 char vendor_name
[64];
434 /* Command queues of the agent. */
435 hsa_queue_t
*sync_queue
;
436 struct goacc_asyncqueue
*async_queues
, *omp_async_queue
;
437 pthread_mutex_t async_queues_mutex
;
439 /* The HSA memory region from which to allocate kernel arguments. */
440 hsa_region_t kernarg_region
;
442 /* The HSA memory region from which to allocate device data. */
443 hsa_region_t data_region
;
445 /* Allocated team arenas. */
446 struct team_arena_list
*team_arena_list
;
447 pthread_mutex_t team_arena_write_lock
;
449 /* Read-write lock that protects kernels which are running or about to be run
450 from interference with loading and unloading of images. Needs to be
451 locked for reading while a kernel is being run, and for writing if the
452 list of modules is manipulated (and thus the HSA program invalidated). */
453 pthread_rwlock_t module_rwlock
;
455 /* The module associated with this kernel. */
456 struct module_info
*module
;
458 /* Mutex enforcing that only one thread will finalize the HSA program. A
459 thread should have locked agent->module_rwlock for reading before
461 pthread_mutex_t prog_mutex
;
462 /* Flag whether the HSA program that consists of all the modules has been
465 /* HSA executable - the finalized program that is used to locate kernels. */
466 hsa_executable_t executable
;
/* Information required to identify, finalize and run any given kernel.  */

enum offload_kind {KIND_UNKNOWN, KIND_OPENMP, KIND_OPENACC};

struct kernel_info
{
  /* Name of the kernel, required to locate it within the GCN object-code
     module.  */
  const char *name;
  /* The specific agent the kernel has been or will be finalized for and run
     on.  */
  struct agent_info *agent;
  /* The specific module where the kernel takes place.  */
  struct module_info *module;
  /* Information provided by mkoffload associated with the kernel.  */
  struct hsa_kernel_description *description;
  /* Mutex enforcing that at most once thread ever initializes a kernel for
     use.  A thread should have locked agent->module_rwlock for reading before
     acquiring it.  */
  pthread_mutex_t init_mutex;
  /* Flag indicating whether the kernel has been initialized and all fields
     below it contain valid data.  */
  bool initialized;
  /* Flag indicating that the kernel has a problem that blocks an execution.  */
  bool initialization_failed;
  /* The object to be put into the dispatch queue.  */
  uint64_t object;
  /* Required size of kernel arguments.  */
  uint32_t kernarg_segment_size;
  /* Required size of group segment.  */
  uint32_t group_segment_size;
  /* Required size of private segment.  */
  uint32_t private_segment_size;
  /* Set up for OpenMP or OpenACC?  */
  enum offload_kind kind;
};
506 /* Information about a particular GCN module, its image and kernels. */
510 /* The description with which the program has registered the image. */
511 struct gcn_image_desc
*image_desc
;
512 /* GCN heap allocation. */
514 /* Physical boundaries of the loaded module. */
515 Elf64_Addr phys_address_start
;
516 Elf64_Addr phys_address_end
;
518 bool constructors_run_p
;
519 struct kernel_info
*init_array_func
, *fini_array_func
;
521 /* Number of kernels in this module. */
523 /* An array of kernel_info structures describing each kernel in this
525 struct kernel_info kernels
[];
/* A linked list of memory arenas allocated on the device.
   These are only used by OpenMP, as a means to optimize per-team malloc.  */

struct team_arena_list
{
  struct team_arena_list *next;

  /* The number of teams determines the size of the allocation.  */
  int num_teams;
  /* The device address of the arena itself.  */
  void *arena;
  /* A flag to prevent two asynchronous kernels trying to use the same arena.
     The mutex is locked until the kernel exits.  */
  pthread_mutex_t in_use;
};
/* Information about the whole HSA environment and all of its agents.  */

struct hsa_context_info
{
  /* Whether the structure has been initialized.  */
  bool initialized;
  /* Number of usable GPU HSA agents in the system.  */
  int agent_count;
  /* Array of agent_info structures describing the individual HSA agents.  */
  struct agent_info *agents;
  /* Driver version string.  */
  char driver_version_s[30];
};
/* Format of the on-device heap.

   This must match the definition in Newlib and gcn-run.  */

struct heap {
  int64_t size;
  char data[];
};
568 /* {{{ Global variables */
570 /* Information about the whole HSA environment and all of its agents. */
572 static struct hsa_context_info hsa_context
;
574 /* HSA runtime functions that are initialized in init_hsa_context. */
576 static struct hsa_runtime_fn_info hsa_fns
;
578 /* Heap space, allocated target-side, provided for use of newlib malloc.
579 Each module should have it's own heap allocated.
580 Beware that heap usage increases with OpenMP teams. See also arenas. */
582 static size_t gcn_kernel_heap_size
= DEFAULT_GCN_HEAP_SIZE
;
/* Flag to decide whether print to stderr information about what is going on.
   Set in init_debug depending on environment variables.  */

static bool debug;

/* Flag to decide if the runtime should suppress a possible fallback to host
   execution.  */

static bool suppress_host_fallback;

/* Flag to locate HSA runtime shared library that is dlopened
   by this plug-in.  */

static const char *hsa_runtime_lib;

/* Flag to decide if the runtime should support also CPU devices (can be
   a simulator).  */

static bool support_cpu_devices;

/* Runtime dimension overrides.  Zero indicates default.  */

static int override_x_dim = 0;
static int override_z_dim = 0;
/* {{{ Debug & Diagnostic  */

/* Print a message to stderr if GCN_DEBUG value is set to true.
   Wrapped in do/while(false) (with no trailing semicolon) so it is safe
   inside unbraced if/else bodies.  */

#define DEBUG_PRINT(...) \
  do \
    { \
      if (debug) \
	{ \
	  fprintf (stderr, __VA_ARGS__); \
	} \
    } \
  while (false)

/* Flush stderr if GCN_DEBUG value is set to true.  */

#define DEBUG_FLUSH()				\
  do {						\
    if (debug)					\
      fflush (stderr);				\
  } while (false)

/* Print a logging message with PREFIX to stderr if GCN_DEBUG value
   is set to true.  */

#define DEBUG_LOG(prefix, ...)			\
  do						\
    {						\
      DEBUG_PRINT (prefix);			\
      DEBUG_PRINT (__VA_ARGS__);		\
      DEBUG_FLUSH ();				\
    } while (false)

/* Print a debugging message to stderr.  */

#define GCN_DEBUG(...) DEBUG_LOG ("GCN debug: ", __VA_ARGS__)

/* Print a warning message to stderr.  */

#define GCN_WARNING(...) DEBUG_LOG ("GCN warning: ", __VA_ARGS__)
651 /* Print HSA warning STR with an HSA STATUS code. */
654 hsa_warn (const char *str
, hsa_status_t status
)
659 const char *hsa_error_msg
= "[unknown]";
660 hsa_fns
.hsa_status_string_fn (status
, &hsa_error_msg
);
662 fprintf (stderr
, "GCN warning: %s\nRuntime message: %s\n", str
,
666 /* Report a fatal error STR together with the HSA error corresponding to STATUS
667 and terminate execution of the current process. */
670 hsa_fatal (const char *str
, hsa_status_t status
)
672 const char *hsa_error_msg
= "[unknown]";
673 hsa_fns
.hsa_status_string_fn (status
, &hsa_error_msg
);
674 GOMP_PLUGIN_fatal ("GCN fatal error: %s\nRuntime message: %s\n", str
,
678 /* Like hsa_fatal, except only report error message, and return FALSE
679 for propagating error processing to outside of plugin. */
682 hsa_error (const char *str
, hsa_status_t status
)
684 const char *hsa_error_msg
= "[unknown]";
685 hsa_fns
.hsa_status_string_fn (status
, &hsa_error_msg
);
686 GOMP_PLUGIN_error ("GCN fatal error: %s\nRuntime message: %s\n", str
,
691 /* Dump information about the available hardware. */
694 dump_hsa_system_info (void)
698 hsa_endianness_t endianness
;
699 status
= hsa_fns
.hsa_system_get_info_fn (HSA_SYSTEM_INFO_ENDIANNESS
,
701 if (status
== HSA_STATUS_SUCCESS
)
704 case HSA_ENDIANNESS_LITTLE
:
705 GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: LITTLE\n");
707 case HSA_ENDIANNESS_BIG
:
708 GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: BIG\n");
711 GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: UNKNOWN\n");
714 GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: FAILED\n");
716 uint8_t extensions
[128];
717 status
= hsa_fns
.hsa_system_get_info_fn (HSA_SYSTEM_INFO_EXTENSIONS
,
719 if (status
== HSA_STATUS_SUCCESS
)
721 if (extensions
[0] & (1 << HSA_EXTENSION_IMAGES
))
722 GCN_DEBUG ("HSA_SYSTEM_INFO_EXTENSIONS: IMAGES\n");
725 GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n");
728 /* Dump information about the available hardware. */
731 dump_machine_model (hsa_machine_model_t machine_model
, const char *s
)
733 switch (machine_model
)
735 case HSA_MACHINE_MODEL_SMALL
:
736 GCN_DEBUG ("%s: SMALL\n", s
);
738 case HSA_MACHINE_MODEL_LARGE
:
739 GCN_DEBUG ("%s: LARGE\n", s
);
742 GCN_WARNING ("%s: UNKNOWN\n", s
);
747 /* Dump information about the available hardware. */
750 dump_profile (hsa_profile_t profile
, const char *s
)
754 case HSA_PROFILE_FULL
:
755 GCN_DEBUG ("%s: FULL\n", s
);
757 case HSA_PROFILE_BASE
:
758 GCN_DEBUG ("%s: BASE\n", s
);
761 GCN_WARNING ("%s: UNKNOWN\n", s
);
766 /* Dump information about a device memory region. */
769 dump_hsa_region (hsa_region_t region
, void *data
__attribute__((unused
)))
773 hsa_region_segment_t segment
;
774 status
= hsa_fns
.hsa_region_get_info_fn (region
, HSA_REGION_INFO_SEGMENT
,
776 if (status
== HSA_STATUS_SUCCESS
)
778 if (segment
== HSA_REGION_SEGMENT_GLOBAL
)
779 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GLOBAL\n");
780 else if (segment
== HSA_REGION_SEGMENT_READONLY
)
781 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: READONLY\n");
782 else if (segment
== HSA_REGION_SEGMENT_PRIVATE
)
783 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: PRIVATE\n");
784 else if (segment
== HSA_REGION_SEGMENT_GROUP
)
785 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GROUP\n");
787 GCN_WARNING ("HSA_REGION_INFO_SEGMENT: UNKNOWN\n");
790 GCN_WARNING ("HSA_REGION_INFO_SEGMENT: FAILED\n");
792 if (segment
== HSA_REGION_SEGMENT_GLOBAL
)
796 = hsa_fns
.hsa_region_get_info_fn (region
, HSA_REGION_INFO_GLOBAL_FLAGS
,
798 if (status
== HSA_STATUS_SUCCESS
)
800 if (flags
& HSA_REGION_GLOBAL_FLAG_KERNARG
)
801 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: KERNARG\n");
802 if (flags
& HSA_REGION_GLOBAL_FLAG_FINE_GRAINED
)
803 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: FINE_GRAINED\n");
804 if (flags
& HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED
)
805 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: COARSE_GRAINED\n");
808 GCN_WARNING ("HSA_REGION_INFO_GLOBAL_FLAGS: FAILED\n");
812 status
= hsa_fns
.hsa_region_get_info_fn (region
, HSA_REGION_INFO_SIZE
, &size
);
813 if (status
== HSA_STATUS_SUCCESS
)
814 GCN_DEBUG ("HSA_REGION_INFO_SIZE: %zu\n", size
);
816 GCN_WARNING ("HSA_REGION_INFO_SIZE: FAILED\n");
819 = hsa_fns
.hsa_region_get_info_fn (region
, HSA_REGION_INFO_ALLOC_MAX_SIZE
,
821 if (status
== HSA_STATUS_SUCCESS
)
822 GCN_DEBUG ("HSA_REGION_INFO_ALLOC_MAX_SIZE: %zu\n", size
);
824 GCN_WARNING ("HSA_REGION_INFO_ALLOC_MAX_SIZE: FAILED\n");
828 = hsa_fns
.hsa_region_get_info_fn (region
,
829 HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED
,
831 if (status
== HSA_STATUS_SUCCESS
)
832 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: %u\n", alloc_allowed
);
834 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: FAILED\n");
836 if (status
!= HSA_STATUS_SUCCESS
|| !alloc_allowed
)
837 return HSA_STATUS_SUCCESS
;
840 = hsa_fns
.hsa_region_get_info_fn (region
,
841 HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE
,
843 if (status
== HSA_STATUS_SUCCESS
)
844 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: %zu\n", size
);
846 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: FAILED\n");
850 = hsa_fns
.hsa_region_get_info_fn (region
,
851 HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT
,
853 if (status
== HSA_STATUS_SUCCESS
)
854 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: %zu\n", align
);
856 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: FAILED\n");
858 return HSA_STATUS_SUCCESS
;
861 /* Dump information about all the device memory regions. */
864 dump_hsa_regions (hsa_agent_t agent
)
867 status
= hsa_fns
.hsa_agent_iterate_regions_fn (agent
,
870 if (status
!= HSA_STATUS_SUCCESS
)
871 hsa_error ("Dumping hsa regions failed", status
);
874 /* Dump information about the available devices. */
877 dump_hsa_agent_info (hsa_agent_t agent
, void *data
__attribute__((unused
)))
882 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_NAME
,
884 if (status
== HSA_STATUS_SUCCESS
)
885 GCN_DEBUG ("HSA_AGENT_INFO_NAME: %s\n", buf
);
887 GCN_WARNING ("HSA_AGENT_INFO_NAME: FAILED\n");
889 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_VENDOR_NAME
,
891 if (status
== HSA_STATUS_SUCCESS
)
892 GCN_DEBUG ("HSA_AGENT_INFO_VENDOR_NAME: %s\n", buf
);
894 GCN_WARNING ("HSA_AGENT_INFO_VENDOR_NAME: FAILED\n");
896 hsa_machine_model_t machine_model
;
898 = hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_MACHINE_MODEL
,
900 if (status
== HSA_STATUS_SUCCESS
)
901 dump_machine_model (machine_model
, "HSA_AGENT_INFO_MACHINE_MODEL");
903 GCN_WARNING ("HSA_AGENT_INFO_MACHINE_MODEL: FAILED\n");
905 hsa_profile_t profile
;
906 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_PROFILE
,
908 if (status
== HSA_STATUS_SUCCESS
)
909 dump_profile (profile
, "HSA_AGENT_INFO_PROFILE");
911 GCN_WARNING ("HSA_AGENT_INFO_PROFILE: FAILED\n");
913 hsa_device_type_t device_type
;
914 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_DEVICE
,
916 if (status
== HSA_STATUS_SUCCESS
)
920 case HSA_DEVICE_TYPE_CPU
:
921 GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: CPU\n");
923 case HSA_DEVICE_TYPE_GPU
:
924 GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: GPU\n");
926 case HSA_DEVICE_TYPE_DSP
:
927 GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: DSP\n");
930 GCN_WARNING ("HSA_AGENT_INFO_DEVICE: UNKNOWN\n");
935 GCN_WARNING ("HSA_AGENT_INFO_DEVICE: FAILED\n");
938 status
= hsa_fns
.hsa_agent_get_info_fn
939 (agent
, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT
, &cu_count
);
940 if (status
== HSA_STATUS_SUCCESS
)
941 GCN_DEBUG ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: %u\n", cu_count
);
943 GCN_WARNING ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: FAILED\n");
946 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_WAVEFRONT_SIZE
,
948 if (status
== HSA_STATUS_SUCCESS
)
949 GCN_DEBUG ("HSA_AGENT_INFO_WAVEFRONT_SIZE: %u\n", size
);
951 GCN_WARNING ("HSA_AGENT_INFO_WAVEFRONT_SIZE: FAILED\n");
954 status
= hsa_fns
.hsa_agent_get_info_fn (agent
,
955 HSA_AGENT_INFO_WORKGROUP_MAX_DIM
,
957 if (status
== HSA_STATUS_SUCCESS
)
958 GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: %u\n", max_dim
);
960 GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: FAILED\n");
963 status
= hsa_fns
.hsa_agent_get_info_fn (agent
,
964 HSA_AGENT_INFO_WORKGROUP_MAX_SIZE
,
966 if (status
== HSA_STATUS_SUCCESS
)
967 GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: %u\n", max_size
);
969 GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: FAILED\n");
971 uint32_t grid_max_dim
;
972 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_GRID_MAX_DIM
,
974 if (status
== HSA_STATUS_SUCCESS
)
975 GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_DIM: %u\n", grid_max_dim
);
977 GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_DIM: FAILED\n");
979 uint32_t grid_max_size
;
980 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_GRID_MAX_SIZE
,
982 if (status
== HSA_STATUS_SUCCESS
)
983 GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_SIZE: %u\n", grid_max_size
);
985 GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_SIZE: FAILED\n");
987 dump_hsa_regions (agent
);
989 return HSA_STATUS_SUCCESS
;
992 /* Forward reference. */
994 static char *get_executable_symbol_name (hsa_executable_symbol_t symbol
);
996 /* Helper function for dump_executable_symbols. */
999 dump_executable_symbol (hsa_executable_t executable
,
1000 hsa_executable_symbol_t symbol
,
1001 void *data
__attribute__((unused
)))
1003 char *name
= get_executable_symbol_name (symbol
);
1007 GCN_DEBUG ("executable symbol: %s\n", name
);
1011 return HSA_STATUS_SUCCESS
;
1014 /* Dump all global symbol in an executable. */
1017 dump_executable_symbols (hsa_executable_t executable
)
1019 hsa_status_t status
;
1021 = hsa_fns
.hsa_executable_iterate_symbols_fn (executable
,
1022 dump_executable_symbol
,
1024 if (status
!= HSA_STATUS_SUCCESS
)
1025 hsa_fatal ("Could not dump HSA executable symbols", status
);
1028 /* Dump kernel DISPATCH data structure and indent it by INDENT spaces. */
1031 print_kernel_dispatch (struct kernel_dispatch
*dispatch
, unsigned indent
)
1033 struct kernargs
*kernargs
= (struct kernargs
*)dispatch
->kernarg_address
;
1035 fprintf (stderr
, "%*sthis: %p\n", indent
, "", dispatch
);
1036 fprintf (stderr
, "%*squeue: %p\n", indent
, "", dispatch
->queue
);
1037 fprintf (stderr
, "%*skernarg_address: %p\n", indent
, "", kernargs
);
1038 fprintf (stderr
, "%*sheap address: %p\n", indent
, "",
1039 (void*)kernargs
->heap_ptr
);
1040 fprintf (stderr
, "%*sarena address: %p\n", indent
, "",
1041 (void*)kernargs
->arena_ptr
);
1042 fprintf (stderr
, "%*sobject: %lu\n", indent
, "", dispatch
->object
);
1043 fprintf (stderr
, "%*sprivate_segment_size: %u\n", indent
, "",
1044 dispatch
->private_segment_size
);
1045 fprintf (stderr
, "%*sgroup_segment_size: %u\n", indent
, "",
1046 dispatch
->group_segment_size
);
1047 fprintf (stderr
, "\n");
1051 /* {{{ Utility functions */
/* Cast the thread local storage to gcn_thread.  */

static inline struct gcn_thread *
gcn_thread (void)
{
  return (struct gcn_thread *) GOMP_PLUGIN_acc_thread ();
}
1061 /* Initialize debug and suppress_host_fallback according to the environment. */
1064 init_environment_variables (void)
1066 if (secure_getenv ("GCN_DEBUG"))
1071 if (secure_getenv ("GCN_SUPPRESS_HOST_FALLBACK"))
1072 suppress_host_fallback
= true;
1074 suppress_host_fallback
= false;
1076 hsa_runtime_lib
= secure_getenv ("HSA_RUNTIME_LIB");
1077 if (hsa_runtime_lib
== NULL
)
1078 hsa_runtime_lib
= HSA_RUNTIME_LIB
"libhsa-runtime64.so.1";
1080 support_cpu_devices
= secure_getenv ("GCN_SUPPORT_CPU_DEVICES");
1082 const char *x
= secure_getenv ("GCN_NUM_TEAMS");
1084 x
= secure_getenv ("GCN_NUM_GANGS");
1086 override_x_dim
= atoi (x
);
1088 const char *z
= secure_getenv ("GCN_NUM_THREADS");
1090 z
= secure_getenv ("GCN_NUM_WORKERS");
1092 override_z_dim
= atoi (z
);
1094 const char *heap
= secure_getenv ("GCN_HEAP_SIZE");
1097 size_t tmp
= atol (heap
);
1099 gcn_kernel_heap_size
= tmp
;
1103 /* Return malloc'd string with name of SYMBOL. */
1106 get_executable_symbol_name (hsa_executable_symbol_t symbol
)
1108 hsa_status_t status
;
1111 const hsa_executable_symbol_info_t info_name_length
1112 = HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH
;
1114 status
= hsa_fns
.hsa_executable_symbol_get_info_fn (symbol
, info_name_length
,
1116 if (status
!= HSA_STATUS_SUCCESS
)
1118 hsa_error ("Could not get length of symbol name", status
);
1122 res
= GOMP_PLUGIN_malloc (len
+ 1);
1124 const hsa_executable_symbol_info_t info_name
1125 = HSA_EXECUTABLE_SYMBOL_INFO_NAME
;
1127 status
= hsa_fns
.hsa_executable_symbol_get_info_fn (symbol
, info_name
, res
);
1129 if (status
!= HSA_STATUS_SUCCESS
)
1131 hsa_error ("Could not get symbol name", status
);
1141 /* Get the number of GPU Compute Units. */
1144 get_cu_count (struct agent_info
*agent
)
1147 hsa_status_t status
= hsa_fns
.hsa_agent_get_info_fn
1148 (agent
->id
, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT
, &cu_count
);
1149 if (status
== HSA_STATUS_SUCCESS
)
1152 return 64; /* The usual number for older devices. */
/* Calculate the maximum grid size for OMP threads / OACC workers.
   This depends on the kernel's resource usage levels.  */

static int
limit_worker_threads (int threads)
{
  /* FIXME Do something more intelligent here.
     GCN can always run 4 threads within a Compute Unit, but
     more than that depends on register usage.  */
  if (threads > 16)
    threads = 16;
  return threads;
}
1169 /* Parse the target attributes INPUT provided by the compiler and return true
1170 if we should run anything all. If INPUT is NULL, fill DEF with default
1171 values, then store INPUT or DEF into *RESULT.
1173 This is used for OpenMP only. */
1176 parse_target_attributes (void **input
,
1177 struct GOMP_kernel_launch_attributes
*def
,
1178 struct GOMP_kernel_launch_attributes
**result
,
1179 struct agent_info
*agent
)
1182 GOMP_PLUGIN_fatal ("No target arguments provided");
1184 bool grid_attrs_found
= false;
1185 bool gcn_dims_found
= false;
1187 int gcn_threads
= 0;
1190 intptr_t id
= (intptr_t) *input
++, val
;
1192 if (id
& GOMP_TARGET_ARG_SUBSEQUENT_PARAM
)
1193 val
= (intptr_t) *input
++;
1195 val
= id
>> GOMP_TARGET_ARG_VALUE_SHIFT
;
1197 val
= (val
> INT_MAX
) ? INT_MAX
: val
;
1199 if ((id
& GOMP_TARGET_ARG_DEVICE_MASK
) == GOMP_DEVICE_GCN
1200 && ((id
& GOMP_TARGET_ARG_ID_MASK
)
1201 == GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES
))
1203 grid_attrs_found
= true;
1206 else if ((id
& GOMP_TARGET_ARG_DEVICE_MASK
)
1207 == GOMP_TARGET_ARG_DEVICE_ALL
)
1209 gcn_dims_found
= true;
1210 switch (id
& GOMP_TARGET_ARG_ID_MASK
)
1212 case GOMP_TARGET_ARG_NUM_TEAMS
:
1215 case GOMP_TARGET_ARG_THREAD_LIMIT
:
1216 gcn_threads
= limit_worker_threads (val
);
1226 if (agent
->device_isa
== EF_AMDGPU_MACH_AMDGCN_GFX900
1227 && gcn_threads
== 0 && override_z_dim
== 0)
1230 GCN_WARNING ("VEGA BUG WORKAROUND: reducing default number of "
1231 "threads to 4 per team.\n");
1232 GCN_WARNING (" - If this is not a Vega 10 device, please use "
1233 "GCN_NUM_THREADS=16\n");
1237 /* Fiji has 64 CUs, but Vega20 has 60. */
1238 def
->gdims
[0] = (gcn_teams
> 0) ? gcn_teams
: get_cu_count (agent
);
1239 /* Each thread is 64 work items wide. */
1241 /* A work group can have 16 wavefronts. */
1242 def
->gdims
[2] = (gcn_threads
> 0) ? gcn_threads
: 16;
1243 def
->wdims
[0] = 1; /* Single team per work-group. */
1249 else if (!grid_attrs_found
)
1259 GCN_WARNING ("GOMP_OFFLOAD_run called with no launch attributes\n");
1263 struct GOMP_kernel_launch_attributes
*kla
;
1264 kla
= (struct GOMP_kernel_launch_attributes
*) *input
;
1266 if (kla
->ndim
== 0 || kla
->ndim
> 3)
1267 GOMP_PLUGIN_fatal ("Invalid number of dimensions (%u)", kla
->ndim
);
1269 GCN_DEBUG ("GOMP_OFFLOAD_run called with %u dimensions:\n", kla
->ndim
);
1271 for (i
= 0; i
< kla
->ndim
; i
++)
1273 GCN_DEBUG (" Dimension %u: grid size %u and group size %u\n", i
,
1274 kla
->gdims
[i
], kla
->wdims
[i
]);
1275 if (kla
->gdims
[i
] == 0)
1281 /* Return the group size given the requested GROUP size, GRID size and number
1282 of grid dimensions NDIM. */
1285 get_group_size (uint32_t ndim
, uint32_t grid
, uint32_t group
)
1289 /* TODO: Provide a default via environment or device characteristics. */
/* Atomically store a pair of uint16_t values (HEADER and REST) to a PACKET.

   In the HSA AQL protocol the packet header must be published last, with
   release semantics, so the device never observes a partially-written
   packet; storing header and setup together as one 32-bit atomic release
   store achieves that.  */

static void
packet_store_release (uint32_t* packet, uint16_t header, uint16_t rest)
{
  /* Cast REST before shifting: a bare "rest << 16" promotes to signed int,
     and left-shifting a value >= 0x8000 into the sign bit is undefined
     behavior.  */
  __atomic_store_n (packet, header | ((uint32_t) rest << 16),
		    __ATOMIC_RELEASE);
}
1311 /* A never-called callback for the HSA command queues. These signal events
1312 that we don't use, so we trigger an error.
1314 This "queue" is not to be confused with the async queues, below. */
1317 hsa_queue_callback (hsa_status_t status
,
1318 hsa_queue_t
*queue
__attribute__ ((unused
)),
1319 void *data
__attribute__ ((unused
)))
1321 hsa_fatal ("Asynchronous queue error", status
);
1325 /* {{{ HSA initialization */
1327 /* Populate hsa_fns with the function addresses from libhsa-runtime64.so. */
1330 init_hsa_runtime_functions (void)
1332 #define DLSYM_FN(function) \
1333 hsa_fns.function##_fn = dlsym (handle, #function); \
1334 if (hsa_fns.function##_fn == NULL) \
1336 void *handle
= dlopen (hsa_runtime_lib
, RTLD_LAZY
);
1340 DLSYM_FN (hsa_status_string
)
1341 DLSYM_FN (hsa_system_get_info
)
1342 DLSYM_FN (hsa_agent_get_info
)
1344 DLSYM_FN (hsa_iterate_agents
)
1345 DLSYM_FN (hsa_region_get_info
)
1346 DLSYM_FN (hsa_queue_create
)
1347 DLSYM_FN (hsa_agent_iterate_regions
)
1348 DLSYM_FN (hsa_executable_destroy
)
1349 DLSYM_FN (hsa_executable_create
)
1350 DLSYM_FN (hsa_executable_global_variable_define
)
1351 DLSYM_FN (hsa_executable_load_code_object
)
1352 DLSYM_FN (hsa_executable_freeze
)
1353 DLSYM_FN (hsa_signal_create
)
1354 DLSYM_FN (hsa_memory_allocate
)
1355 DLSYM_FN (hsa_memory_assign_agent
)
1356 DLSYM_FN (hsa_memory_copy
)
1357 DLSYM_FN (hsa_memory_free
)
1358 DLSYM_FN (hsa_signal_destroy
)
1359 DLSYM_FN (hsa_executable_get_symbol
)
1360 DLSYM_FN (hsa_executable_symbol_get_info
)
1361 DLSYM_FN (hsa_executable_iterate_symbols
)
1362 DLSYM_FN (hsa_queue_add_write_index_release
)
1363 DLSYM_FN (hsa_queue_load_read_index_acquire
)
1364 DLSYM_FN (hsa_signal_wait_acquire
)
1365 DLSYM_FN (hsa_signal_store_relaxed
)
1366 DLSYM_FN (hsa_signal_store_release
)
1367 DLSYM_FN (hsa_signal_load_acquire
)
1368 DLSYM_FN (hsa_queue_destroy
)
1369 DLSYM_FN (hsa_code_object_deserialize
)
1374 /* Return true if the agent is a GPU and can accept of concurrent submissions
1375 from different threads. */
1378 suitable_hsa_agent_p (hsa_agent_t agent
)
1380 hsa_device_type_t device_type
;
1382 = hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_DEVICE
,
1384 if (status
!= HSA_STATUS_SUCCESS
)
1387 switch (device_type
)
1389 case HSA_DEVICE_TYPE_GPU
:
1391 case HSA_DEVICE_TYPE_CPU
:
1392 if (!support_cpu_devices
)
1399 uint32_t features
= 0;
1400 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_FEATURE
,
1402 if (status
!= HSA_STATUS_SUCCESS
1403 || !(features
& HSA_AGENT_FEATURE_KERNEL_DISPATCH
))
1405 hsa_queue_type_t queue_type
;
1406 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_QUEUE_TYPE
,
1408 if (status
!= HSA_STATUS_SUCCESS
1409 || (queue_type
!= HSA_QUEUE_TYPE_MULTI
))
1415 /* Callback of hsa_iterate_agents; if AGENT is a GPU device, increment
1416 agent_count in hsa_context. */
1419 count_gpu_agents (hsa_agent_t agent
, void *data
__attribute__ ((unused
)))
1421 if (suitable_hsa_agent_p (agent
))
1422 hsa_context
.agent_count
++;
1423 return HSA_STATUS_SUCCESS
;
1426 /* Callback of hsa_iterate_agents; if AGENT is a GPU device, assign the agent
1427 id to the describing structure in the hsa context. The index of the
1428 structure is pointed to by DATA, increment it afterwards. */
1431 assign_agent_ids (hsa_agent_t agent
, void *data
)
1433 if (suitable_hsa_agent_p (agent
))
1435 int *agent_index
= (int *) data
;
1436 hsa_context
.agents
[*agent_index
].id
= agent
;
1439 return HSA_STATUS_SUCCESS
;
1442 /* Initialize hsa_context if it has not already been done.
1443 Return TRUE on success. */
1446 init_hsa_context (void)
1448 hsa_status_t status
;
1449 int agent_index
= 0;
1451 if (hsa_context
.initialized
)
1453 init_environment_variables ();
1454 if (!init_hsa_runtime_functions ())
1456 GCN_WARNING ("Run-time could not be dynamically opened\n");
1457 if (suppress_host_fallback
)
1458 GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
1461 status
= hsa_fns
.hsa_init_fn ();
1462 if (status
!= HSA_STATUS_SUCCESS
)
1463 return hsa_error ("Run-time could not be initialized", status
);
1464 GCN_DEBUG ("HSA run-time initialized for GCN\n");
1467 dump_hsa_system_info ();
1469 status
= hsa_fns
.hsa_iterate_agents_fn (count_gpu_agents
, NULL
);
1470 if (status
!= HSA_STATUS_SUCCESS
)
1471 return hsa_error ("GCN GPU devices could not be enumerated", status
);
1472 GCN_DEBUG ("There are %i GCN GPU devices.\n", hsa_context
.agent_count
);
1475 = GOMP_PLUGIN_malloc_cleared (hsa_context
.agent_count
1476 * sizeof (struct agent_info
));
1477 status
= hsa_fns
.hsa_iterate_agents_fn (assign_agent_ids
, &agent_index
);
1478 if (status
!= HSA_STATUS_SUCCESS
)
1479 return hsa_error ("Scanning compute agents failed", status
);
1480 if (agent_index
!= hsa_context
.agent_count
)
1482 GOMP_PLUGIN_error ("Failed to assign IDs to all GCN agents");
1488 status
= hsa_fns
.hsa_iterate_agents_fn (dump_hsa_agent_info
, NULL
);
1489 if (status
!= HSA_STATUS_SUCCESS
)
1490 GOMP_PLUGIN_error ("Failed to list all HSA runtime agents");
1493 uint16_t minor
, major
;
1494 status
= hsa_fns
.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MINOR
,
1496 if (status
!= HSA_STATUS_SUCCESS
)
1497 GOMP_PLUGIN_error ("Failed to obtain HSA runtime minor version");
1498 status
= hsa_fns
.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MAJOR
,
1500 if (status
!= HSA_STATUS_SUCCESS
)
1501 GOMP_PLUGIN_error ("Failed to obtain HSA runtime major version");
1503 size_t len
= sizeof hsa_context
.driver_version_s
;
1504 int printed
= snprintf (hsa_context
.driver_version_s
, len
,
1505 "HSA Runtime %hu.%hu", (unsigned short int)major
,
1506 (unsigned short int)minor
);
1508 GCN_WARNING ("HSA runtime version string was truncated."
1509 "Version %hu.%hu is too long.", (unsigned short int)major
,
1510 (unsigned short int)minor
);
1512 hsa_context
.initialized
= true;
1516 /* Verify that hsa_context has already been initialized and return the
1517 agent_info structure describing device number N. Return NULL on error. */
1519 static struct agent_info
*
1520 get_agent_info (int n
)
1522 if (!hsa_context
.initialized
)
1524 GOMP_PLUGIN_error ("Attempt to use uninitialized GCN context.");
1527 if (n
>= hsa_context
.agent_count
)
1529 GOMP_PLUGIN_error ("Request to operate on non-existent GCN device %i", n
);
1532 if (!hsa_context
.agents
[n
].initialized
)
1534 GOMP_PLUGIN_error ("Attempt to use an uninitialized GCN agent.");
1537 return &hsa_context
.agents
[n
];
1540 /* Callback of hsa_agent_iterate_regions, via get_*_memory_region functions.
1542 Selects (breaks at) a suitable region of type KIND. */
1545 get_memory_region (hsa_region_t region
, hsa_region_t
*retval
,
1546 hsa_region_global_flag_t kind
)
1548 hsa_status_t status
;
1549 hsa_region_segment_t segment
;
1551 status
= hsa_fns
.hsa_region_get_info_fn (region
, HSA_REGION_INFO_SEGMENT
,
1553 if (status
!= HSA_STATUS_SUCCESS
)
1555 if (segment
!= HSA_REGION_SEGMENT_GLOBAL
)
1556 return HSA_STATUS_SUCCESS
;
1559 status
= hsa_fns
.hsa_region_get_info_fn (region
, HSA_REGION_INFO_GLOBAL_FLAGS
,
1561 if (status
!= HSA_STATUS_SUCCESS
)
1566 return HSA_STATUS_INFO_BREAK
;
1568 return HSA_STATUS_SUCCESS
;
1571 /* Callback of hsa_agent_iterate_regions.
1573 Selects a kernargs memory region. */
1576 get_kernarg_memory_region (hsa_region_t region
, void *data
)
1578 return get_memory_region (region
, (hsa_region_t
*)data
,
1579 HSA_REGION_GLOBAL_FLAG_KERNARG
);
1582 /* Callback of hsa_agent_iterate_regions.
1584 Selects a coarse-grained memory region suitable for the heap and
1588 get_data_memory_region (hsa_region_t region
, void *data
)
1590 return get_memory_region (region
, (hsa_region_t
*)data
,
1591 HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED
);
1595 elf_gcn_isa_field (Elf64_Ehdr
*image
)
1597 return image
->e_flags
& EF_AMDGPU_MACH_MASK
;
/* Names the HSA runtime uses for the supported ISAs.  All are the same
   length, which gcn_isa_name_len records for prefix comparisons.  */
static const char *gcn_gfx803_s = "gfx803";
static const char *gcn_gfx900_s = "gfx900";
static const char *gcn_gfx906_s = "gfx906";
static const int gcn_isa_name_len = 6;
1605 /* Returns the name that the HSA runtime uses for the ISA or NULL if we do not
1609 isa_hsa_name (int isa
) {
1612 case EF_AMDGPU_MACH_AMDGCN_GFX803
:
1613 return gcn_gfx803_s
;
1614 case EF_AMDGPU_MACH_AMDGCN_GFX900
:
1615 return gcn_gfx900_s
;
1616 case EF_AMDGPU_MACH_AMDGCN_GFX906
:
1617 return gcn_gfx906_s
;
1622 /* Returns the user-facing name that GCC uses to identify the architecture (e.g.
1623 with -march) or NULL if we do not support the ISA.
1624 Keep in sync with /gcc/config/gcn/gcn.{c,opt}. */
1627 isa_gcc_name (int isa
) {
1630 case EF_AMDGPU_MACH_AMDGCN_GFX803
:
1633 return isa_hsa_name (isa
);
1637 /* Returns the code which is used in the GCN object code to identify the ISA with
1638 the given name (as used by the HSA runtime). */
1641 isa_code(const char *isa
) {
1642 if (!strncmp (isa
, gcn_gfx803_s
, gcn_isa_name_len
))
1643 return EF_AMDGPU_MACH_AMDGCN_GFX803
;
1645 if (!strncmp (isa
, gcn_gfx900_s
, gcn_isa_name_len
))
1646 return EF_AMDGPU_MACH_AMDGCN_GFX900
;
1648 if (!strncmp (isa
, gcn_gfx906_s
, gcn_isa_name_len
))
1649 return EF_AMDGPU_MACH_AMDGCN_GFX906
;
1657 /* Create or reuse a team arena.
1659 Team arenas are used by OpenMP to avoid calling malloc multiple times
1660 while setting up each team. This is purely a performance optimization.
1662 Allocating an arena also costs performance, albeit on the host side, so
1663 this function will reuse an existing arena if a large enough one is idle.
1664 The arena is released, but not deallocated, when the kernel exits. */
1667 get_team_arena (struct agent_info
*agent
, int num_teams
)
1669 struct team_arena_list
**next_ptr
= &agent
->team_arena_list
;
1670 struct team_arena_list
*item
;
1672 for (item
= *next_ptr
; item
; next_ptr
= &item
->next
, item
= item
->next
)
1674 if (item
->num_teams
< num_teams
)
1677 if (pthread_mutex_trylock (&item
->in_use
))
1683 GCN_DEBUG ("Creating a new arena for %d teams\n", num_teams
);
1685 if (pthread_mutex_lock (&agent
->team_arena_write_lock
))
1687 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
1690 item
= malloc (sizeof (*item
));
1691 item
->num_teams
= num_teams
;
1695 if (pthread_mutex_init (&item
->in_use
, NULL
))
1697 GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
1700 if (pthread_mutex_lock (&item
->in_use
))
1702 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
1705 if (pthread_mutex_unlock (&agent
->team_arena_write_lock
))
1707 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1711 const int TEAM_ARENA_SIZE
= 64*1024; /* Must match libgomp.h. */
1712 hsa_status_t status
;
1713 status
= hsa_fns
.hsa_memory_allocate_fn (agent
->data_region
,
1714 TEAM_ARENA_SIZE
*num_teams
,
1716 if (status
!= HSA_STATUS_SUCCESS
)
1717 hsa_fatal ("Could not allocate memory for GCN kernel arena", status
);
1718 status
= hsa_fns
.hsa_memory_assign_agent_fn (item
->arena
, agent
->id
,
1719 HSA_ACCESS_PERMISSION_RW
);
1720 if (status
!= HSA_STATUS_SUCCESS
)
1721 hsa_fatal ("Could not assign arena memory to device", status
);
1726 /* Mark a team arena available for reuse. */
1729 release_team_arena (struct agent_info
* agent
, void *arena
)
1731 struct team_arena_list
*item
;
1733 for (item
= agent
->team_arena_list
; item
; item
= item
->next
)
1735 if (item
->arena
== arena
)
1737 if (pthread_mutex_unlock (&item
->in_use
))
1738 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1742 GOMP_PLUGIN_error ("Could not find a GCN arena to release.");
1745 /* Clean up all the allocated team arenas. */
1748 destroy_team_arenas (struct agent_info
*agent
)
1750 struct team_arena_list
*item
, *next
;
1752 for (item
= agent
->team_arena_list
; item
; item
= next
)
1755 hsa_fns
.hsa_memory_free_fn (item
->arena
);
1756 if (pthread_mutex_destroy (&item
->in_use
))
1758 GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex");
1763 agent
->team_arena_list
= NULL
;
1768 /* Allocate memory on a specified device. */
1771 alloc_by_agent (struct agent_info
*agent
, size_t size
)
1773 GCN_DEBUG ("Allocating %zu bytes on device %d\n", size
, agent
->device_id
);
1775 /* Zero-size allocations are invalid, so in order to return a valid pointer
1776 we need to pass a valid size. One source of zero-size allocations is
1777 kernargs for kernels that have no inputs or outputs (the kernel may
1778 only use console output, for example). */
1783 hsa_status_t status
= hsa_fns
.hsa_memory_allocate_fn (agent
->data_region
,
1785 if (status
!= HSA_STATUS_SUCCESS
)
1787 hsa_error ("Could not allocate device memory", status
);
1791 status
= hsa_fns
.hsa_memory_assign_agent_fn (ptr
, agent
->id
,
1792 HSA_ACCESS_PERMISSION_RW
);
1793 if (status
!= HSA_STATUS_SUCCESS
)
1795 hsa_error ("Could not assign data memory to device", status
);
1799 struct goacc_thread
*thr
= GOMP_PLUGIN_goacc_thread ();
1800 bool profiling_dispatch_p
1801 = __builtin_expect (thr
!= NULL
&& thr
->prof_info
!= NULL
, false);
1802 if (profiling_dispatch_p
)
1804 acc_prof_info
*prof_info
= thr
->prof_info
;
1805 acc_event_info data_event_info
;
1806 acc_api_info
*api_info
= thr
->api_info
;
1808 prof_info
->event_type
= acc_ev_alloc
;
1810 data_event_info
.data_event
.event_type
= prof_info
->event_type
;
1811 data_event_info
.data_event
.valid_bytes
1812 = _ACC_DATA_EVENT_INFO_VALID_BYTES
;
1813 data_event_info
.data_event
.parent_construct
1814 = acc_construct_parallel
;
1815 data_event_info
.data_event
.implicit
= 1;
1816 data_event_info
.data_event
.tool_info
= NULL
;
1817 data_event_info
.data_event
.var_name
= NULL
;
1818 data_event_info
.data_event
.bytes
= size
;
1819 data_event_info
.data_event
.host_ptr
= NULL
;
1820 data_event_info
.data_event
.device_ptr
= (void *) ptr
;
1822 api_info
->device_api
= acc_device_api_other
;
1824 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
, &data_event_info
,
1831 /* Create kernel dispatch data structure for given KERNEL, along with
1832 the necessary device signals and memory allocations. */
1834 static struct kernel_dispatch
*
1835 create_kernel_dispatch (struct kernel_info
*kernel
, int num_teams
)
1837 struct agent_info
*agent
= kernel
->agent
;
1838 struct kernel_dispatch
*shadow
1839 = GOMP_PLUGIN_malloc_cleared (sizeof (struct kernel_dispatch
));
1841 shadow
->agent
= kernel
->agent
;
1842 shadow
->object
= kernel
->object
;
1844 hsa_signal_t sync_signal
;
1845 hsa_status_t status
= hsa_fns
.hsa_signal_create_fn (1, 0, NULL
, &sync_signal
);
1846 if (status
!= HSA_STATUS_SUCCESS
)
1847 hsa_fatal ("Error creating the GCN sync signal", status
);
1849 shadow
->signal
= sync_signal
.handle
;
1850 shadow
->private_segment_size
= kernel
->private_segment_size
;
1851 shadow
->group_segment_size
= kernel
->group_segment_size
;
1853 /* We expect kernels to request a single pointer, explicitly, and the
1854 rest of struct kernargs, implicitly. If they request anything else
1855 then something is wrong. */
1856 if (kernel
->kernarg_segment_size
> 8)
1858 GOMP_PLUGIN_fatal ("Unexpectedly large kernargs segment requested");
1862 status
= hsa_fns
.hsa_memory_allocate_fn (agent
->kernarg_region
,
1863 sizeof (struct kernargs
),
1864 &shadow
->kernarg_address
);
1865 if (status
!= HSA_STATUS_SUCCESS
)
1866 hsa_fatal ("Could not allocate memory for GCN kernel arguments", status
);
1867 struct kernargs
*kernargs
= shadow
->kernarg_address
;
1869 /* Zero-initialize the output_data (minimum needed). */
1870 kernargs
->out_ptr
= (int64_t)&kernargs
->output_data
;
1871 kernargs
->output_data
.next_output
= 0;
1872 for (unsigned i
= 0;
1873 i
< (sizeof (kernargs
->output_data
.queue
)
1874 / sizeof (kernargs
->output_data
.queue
[0]));
1876 kernargs
->output_data
.queue
[i
].written
= 0;
1877 kernargs
->output_data
.consumed
= 0;
1879 /* Pass in the heap location. */
1880 kernargs
->heap_ptr
= (int64_t)kernel
->module
->heap
;
1882 /* Create an arena. */
1883 if (kernel
->kind
== KIND_OPENMP
)
1884 kernargs
->arena_ptr
= (int64_t)get_team_arena (agent
, num_teams
);
1886 kernargs
->arena_ptr
= 0;
1888 /* Ensure we can recognize unset return values. */
1889 kernargs
->output_data
.return_value
= 0xcafe0000;
1894 /* Output any data written to console output from the kernel. It is expected
1895 that this function is polled during kernel execution.
1897 We print all entries from the last item printed to the next entry without
1898 a "written" flag. If the "final" flag is set then it'll continue right to
1901 The print buffer is circular, but the from and to locations don't wrap when
1902 the buffer does, so the output limit is UINT_MAX. The target blocks on
1903 output when the buffer is full. */
1906 console_output (struct kernel_info
*kernel
, struct kernargs
*kernargs
,
1909 unsigned int limit
= (sizeof (kernargs
->output_data
.queue
)
1910 / sizeof (kernargs
->output_data
.queue
[0]));
1912 unsigned int from
= __atomic_load_n (&kernargs
->output_data
.consumed
,
1914 unsigned int to
= kernargs
->output_data
.next_output
;
1920 printf ("GCN print buffer overflowed.\n");
1925 for (i
= from
; i
< to
; i
++)
1927 struct printf_data
*data
= &kernargs
->output_data
.queue
[i
%limit
];
1929 if (!data
->written
&& !final
)
1934 case 0: printf ("%.128s%ld\n", data
->msg
, data
->ivalue
); break;
1935 case 1: printf ("%.128s%f\n", data
->msg
, data
->dvalue
); break;
1936 case 2: printf ("%.128s%.128s\n", data
->msg
, data
->text
); break;
1937 case 3: printf ("%.128s%.128s", data
->msg
, data
->text
); break;
1938 default: printf ("GCN print buffer error!\n"); break;
1941 __atomic_store_n (&kernargs
->output_data
.consumed
, i
+1,
1947 /* Release data structure created for a kernel dispatch in SHADOW argument,
1948 and clean up the signal and memory allocations. */
1951 release_kernel_dispatch (struct kernel_dispatch
*shadow
)
1953 GCN_DEBUG ("Released kernel dispatch: %p\n", shadow
);
1955 struct kernargs
*kernargs
= shadow
->kernarg_address
;
1956 void *arena
= (void *)kernargs
->arena_ptr
;
1958 release_team_arena (shadow
->agent
, arena
);
1960 hsa_fns
.hsa_memory_free_fn (shadow
->kernarg_address
);
1963 s
.handle
= shadow
->signal
;
1964 hsa_fns
.hsa_signal_destroy_fn (s
);
1969 /* Extract the properties from a kernel binary. */
1972 init_kernel_properties (struct kernel_info
*kernel
)
1974 hsa_status_t status
;
1975 struct agent_info
*agent
= kernel
->agent
;
1976 hsa_executable_symbol_t kernel_symbol
;
1977 char *buf
= alloca (strlen (kernel
->name
) + 4);
1978 sprintf (buf
, "%s.kd", kernel
->name
);
1979 status
= hsa_fns
.hsa_executable_get_symbol_fn (agent
->executable
, NULL
,
1982 if (status
!= HSA_STATUS_SUCCESS
)
1984 hsa_warn ("Could not find symbol for kernel in the code object", status
);
1985 fprintf (stderr
, "not found name: '%s'\n", buf
);
1986 dump_executable_symbols (agent
->executable
);
1989 GCN_DEBUG ("Located kernel %s\n", kernel
->name
);
1990 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
1991 (kernel_symbol
, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT
, &kernel
->object
);
1992 if (status
!= HSA_STATUS_SUCCESS
)
1993 hsa_fatal ("Could not extract a kernel object from its symbol", status
);
1994 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
1995 (kernel_symbol
, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE
,
1996 &kernel
->kernarg_segment_size
);
1997 if (status
!= HSA_STATUS_SUCCESS
)
1998 hsa_fatal ("Could not get info about kernel argument size", status
);
1999 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
2000 (kernel_symbol
, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE
,
2001 &kernel
->group_segment_size
);
2002 if (status
!= HSA_STATUS_SUCCESS
)
2003 hsa_fatal ("Could not get info about kernel group segment size", status
);
2004 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
2005 (kernel_symbol
, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE
,
2006 &kernel
->private_segment_size
);
2007 if (status
!= HSA_STATUS_SUCCESS
)
2008 hsa_fatal ("Could not get info about kernel private segment size",
2011 /* The kernel type is not known until something tries to launch it. */
2012 kernel
->kind
= KIND_UNKNOWN
;
2014 GCN_DEBUG ("Kernel structure for %s fully initialized with "
2015 "following segment sizes: \n", kernel
->name
);
2016 GCN_DEBUG (" group_segment_size: %u\n",
2017 (unsigned) kernel
->group_segment_size
);
2018 GCN_DEBUG (" private_segment_size: %u\n",
2019 (unsigned) kernel
->private_segment_size
);
2020 GCN_DEBUG (" kernarg_segment_size: %u\n",
2021 (unsigned) kernel
->kernarg_segment_size
);
2025 kernel
->initialization_failed
= true;
2028 /* Do all the work that is necessary before running KERNEL for the first time.
2029 The function assumes the program has been created, finalized and frozen by
2030 create_and_finalize_hsa_program. */
2033 init_kernel (struct kernel_info
*kernel
)
2035 if (pthread_mutex_lock (&kernel
->init_mutex
))
2036 GOMP_PLUGIN_fatal ("Could not lock a GCN kernel initialization mutex");
2037 if (kernel
->initialized
)
2039 if (pthread_mutex_unlock (&kernel
->init_mutex
))
2040 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2046 init_kernel_properties (kernel
);
2048 if (!kernel
->initialization_failed
)
2052 kernel
->initialized
= true;
2054 if (pthread_mutex_unlock (&kernel
->init_mutex
))
2055 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2059 /* Run KERNEL on its agent, pass VARS to it as arguments and take
2060 launch attributes from KLA.
2062 MODULE_LOCKED indicates that the caller already holds the lock and
2063 run_kernel need not lock it again.
2064 If AQ is NULL then agent->sync_queue will be used. */
2067 run_kernel (struct kernel_info
*kernel
, void *vars
,
2068 struct GOMP_kernel_launch_attributes
*kla
,
2069 struct goacc_asyncqueue
*aq
, bool module_locked
)
2071 GCN_DEBUG ("SGPRs: %d, VGPRs: %d\n", kernel
->description
->sgpr_count
,
2072 kernel
->description
->vpgr_count
);
2074 /* Reduce the number of threads/workers if there are insufficient
2075 VGPRs available to run the kernels together. */
2076 if (kla
->ndim
== 3 && kernel
->description
->vpgr_count
> 0)
2078 int granulated_vgprs
= (kernel
->description
->vpgr_count
+ 3) & ~3;
2079 int max_threads
= (256 / granulated_vgprs
) * 4;
2080 if (kla
->gdims
[2] > max_threads
)
2082 GCN_WARNING ("Too many VGPRs required to support %d threads/workers"
2083 " per team/gang - reducing to %d threads/workers.\n",
2084 kla
->gdims
[2], max_threads
);
2085 kla
->gdims
[2] = max_threads
;
2089 GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel
->agent
->device_id
,
2091 GCN_DEBUG ("GCN launch attribs: gdims:[");
2093 for (i
= 0; i
< kla
->ndim
; ++i
)
2097 DEBUG_PRINT ("%u", kla
->gdims
[i
]);
2099 DEBUG_PRINT ("], normalized gdims:[");
2100 for (i
= 0; i
< kla
->ndim
; ++i
)
2104 DEBUG_PRINT ("%u", kla
->gdims
[i
] / kla
->wdims
[i
]);
2106 DEBUG_PRINT ("], wdims:[");
2107 for (i
= 0; i
< kla
->ndim
; ++i
)
2111 DEBUG_PRINT ("%u", kla
->wdims
[i
]);
2113 DEBUG_PRINT ("]\n");
2116 struct agent_info
*agent
= kernel
->agent
;
2117 if (!module_locked
&& pthread_rwlock_rdlock (&agent
->module_rwlock
))
2118 GOMP_PLUGIN_fatal ("Unable to read-lock a GCN agent rwlock");
2120 if (!agent
->initialized
)
2121 GOMP_PLUGIN_fatal ("Agent must be initialized");
2123 if (!kernel
->initialized
)
2124 GOMP_PLUGIN_fatal ("Called kernel must be initialized");
2126 hsa_queue_t
*command_q
= (aq
? aq
->hsa_queue
: kernel
->agent
->sync_queue
);
2129 = hsa_fns
.hsa_queue_add_write_index_release_fn (command_q
, 1);
2130 GCN_DEBUG ("Got AQL index %llu\n", (long long int) index
);
2132 /* Wait until the queue is not full before writing the packet. */
2133 while (index
- hsa_fns
.hsa_queue_load_read_index_acquire_fn (command_q
)
2137 /* Do not allow the dimensions to be overridden when running
2138 constructors or destructors. */
2139 int override_x
= kernel
->kind
== KIND_UNKNOWN
? 0 : override_x_dim
;
2140 int override_z
= kernel
->kind
== KIND_UNKNOWN
? 0 : override_z_dim
;
2142 hsa_kernel_dispatch_packet_t
*packet
;
2143 packet
= ((hsa_kernel_dispatch_packet_t
*) command_q
->base_address
)
2144 + index
% command_q
->size
;
2146 memset (((uint8_t *) packet
) + 4, 0, sizeof (*packet
) - 4);
2147 packet
->grid_size_x
= override_x
? : kla
->gdims
[0];
2148 packet
->workgroup_size_x
= get_group_size (kla
->ndim
,
2149 packet
->grid_size_x
,
2154 packet
->grid_size_y
= kla
->gdims
[1];
2155 packet
->workgroup_size_y
= get_group_size (kla
->ndim
, kla
->gdims
[1],
2160 packet
->grid_size_y
= 1;
2161 packet
->workgroup_size_y
= 1;
2166 packet
->grid_size_z
= limit_worker_threads (override_z
2168 packet
->workgroup_size_z
= get_group_size (kla
->ndim
,
2169 packet
->grid_size_z
,
2174 packet
->grid_size_z
= 1;
2175 packet
->workgroup_size_z
= 1;
2178 GCN_DEBUG ("GCN launch actuals: grid:[%u, %u, %u],"
2179 " normalized grid:[%u, %u, %u], workgroup:[%u, %u, %u]\n",
2180 packet
->grid_size_x
, packet
->grid_size_y
, packet
->grid_size_z
,
2181 packet
->grid_size_x
/ packet
->workgroup_size_x
,
2182 packet
->grid_size_y
/ packet
->workgroup_size_y
,
2183 packet
->grid_size_z
/ packet
->workgroup_size_z
,
2184 packet
->workgroup_size_x
, packet
->workgroup_size_y
,
2185 packet
->workgroup_size_z
);
2187 struct kernel_dispatch
*shadow
2188 = create_kernel_dispatch (kernel
, packet
->grid_size_x
);
2189 shadow
->queue
= command_q
;
2193 fprintf (stderr
, "\nKernel has following dependencies:\n");
2194 print_kernel_dispatch (shadow
, 2);
2197 packet
->private_segment_size
= kernel
->private_segment_size
;
2198 packet
->group_segment_size
= kernel
->group_segment_size
;
2199 packet
->kernel_object
= kernel
->object
;
2200 packet
->kernarg_address
= shadow
->kernarg_address
;
2202 s
.handle
= shadow
->signal
;
2203 packet
->completion_signal
= s
;
2204 hsa_fns
.hsa_signal_store_relaxed_fn (s
, 1);
2205 memcpy (shadow
->kernarg_address
, &vars
, sizeof (vars
));
2207 GCN_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");
2210 header
= HSA_PACKET_TYPE_KERNEL_DISPATCH
<< HSA_PACKET_HEADER_TYPE
;
2211 header
|= HSA_FENCE_SCOPE_SYSTEM
<< HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE
;
2212 header
|= HSA_FENCE_SCOPE_SYSTEM
<< HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE
;
2214 GCN_DEBUG ("Going to dispatch kernel %s on device %d\n", kernel
->name
,
2217 packet_store_release ((uint32_t *) packet
, header
,
2218 (uint16_t) kla
->ndim
2219 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS
);
2221 hsa_fns
.hsa_signal_store_release_fn (command_q
->doorbell_signal
,
2224 GCN_DEBUG ("Kernel dispatched, waiting for completion\n");
2226 /* Root signal waits with 1ms timeout. */
2227 while (hsa_fns
.hsa_signal_wait_acquire_fn (s
, HSA_SIGNAL_CONDITION_LT
, 1,
2229 HSA_WAIT_STATE_BLOCKED
) != 0)
2231 console_output (kernel
, shadow
->kernarg_address
, false);
2233 console_output (kernel
, shadow
->kernarg_address
, true);
2235 struct kernargs
*kernargs
= shadow
->kernarg_address
;
2236 unsigned int return_value
= (unsigned int)kernargs
->output_data
.return_value
;
2238 release_kernel_dispatch (shadow
);
2240 if (!module_locked
&& pthread_rwlock_unlock (&agent
->module_rwlock
))
2241 GOMP_PLUGIN_fatal ("Unable to unlock a GCN agent rwlock");
2243 unsigned int upper
= (return_value
& ~0xffff) >> 16;
2244 if (upper
== 0xcafe)
2245 ; // exit not called, normal termination.
2246 else if (upper
== 0xffff)
2250 GOMP_PLUGIN_error ("Possible kernel exit value corruption, 2 most"
2251 " significant bytes aren't 0xffff or 0xcafe: 0x%x\n",
2256 if (upper
== 0xffff)
2258 unsigned int signal
= (return_value
>> 8) & 0xff;
2260 if (signal
== SIGABRT
)
2262 GCN_WARNING ("GCN Kernel aborted\n");
2265 else if (signal
!= 0)
2267 GCN_WARNING ("GCN Kernel received unknown signal\n");
2271 GCN_DEBUG ("GCN Kernel exited with value: %d\n", return_value
& 0xff);
2272 exit (return_value
& 0xff);
2277 /* {{{ Load/Unload */
2279 /* Initialize KERNEL from D and other parameters. Return true on success. */
2282 init_basic_kernel_info (struct kernel_info
*kernel
,
2283 struct hsa_kernel_description
*d
,
2284 struct agent_info
*agent
,
2285 struct module_info
*module
)
2287 kernel
->agent
= agent
;
2288 kernel
->module
= module
;
2289 kernel
->name
= d
->name
;
2290 kernel
->description
= d
;
2291 if (pthread_mutex_init (&kernel
->init_mutex
, NULL
))
2293 GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");
2299 /* Check that the GCN ISA of the given image matches the ISA of the agent. */
2302 isa_matches_agent (struct agent_info
*agent
, Elf64_Ehdr
*image
)
2304 int isa_field
= elf_gcn_isa_field (image
);
2305 const char* isa_s
= isa_hsa_name (isa_field
);
2308 hsa_error ("Unsupported ISA in GCN code object.", HSA_STATUS_ERROR
);
2312 if (isa_field
!= agent
->device_isa
)
2315 const char *agent_isa_s
= isa_hsa_name (agent
->device_isa
);
2316 const char *agent_isa_gcc_s
= isa_gcc_name (agent
->device_isa
);
2317 assert (agent_isa_s
);
2318 assert (agent_isa_gcc_s
);
2320 snprintf (msg
, sizeof msg
,
2321 "GCN code object ISA '%s' does not match GPU ISA '%s'.\n"
2322 "Try to recompile with '-foffload=-march=%s'.\n",
2323 isa_s
, agent_isa_s
, agent_isa_gcc_s
);
2325 hsa_error (msg
, HSA_STATUS_ERROR
);
2332 /* Create and finalize the program consisting of all loaded modules. */
2335 create_and_finalize_hsa_program (struct agent_info
*agent
)
2337 hsa_status_t status
;
2339 if (pthread_mutex_lock (&agent
->prog_mutex
))
2341 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
2344 if (agent
->prog_finalized
)
2348 = hsa_fns
.hsa_executable_create_fn (HSA_PROFILE_FULL
,
2349 HSA_EXECUTABLE_STATE_UNFROZEN
,
2350 "", &agent
->executable
);
2351 if (status
!= HSA_STATUS_SUCCESS
)
2353 hsa_error ("Could not create GCN executable", status
);
2357 /* Load any GCN modules. */
2358 struct module_info
*module
= agent
->module
;
2361 Elf64_Ehdr
*image
= (Elf64_Ehdr
*)module
->image_desc
->gcn_image
->image
;
2363 if (!isa_matches_agent (agent
, image
))
2366 hsa_code_object_t co
= { 0 };
2367 status
= hsa_fns
.hsa_code_object_deserialize_fn
2368 (module
->image_desc
->gcn_image
->image
,
2369 module
->image_desc
->gcn_image
->size
,
2371 if (status
!= HSA_STATUS_SUCCESS
)
2373 hsa_error ("Could not deserialize GCN code object", status
);
2377 status
= hsa_fns
.hsa_executable_load_code_object_fn
2378 (agent
->executable
, agent
->id
, co
, "");
2379 if (status
!= HSA_STATUS_SUCCESS
)
2381 hsa_error ("Could not load GCN code object", status
);
2387 status
= hsa_fns
.hsa_memory_allocate_fn (agent
->data_region
,
2388 gcn_kernel_heap_size
,
2389 (void**)&module
->heap
);
2390 if (status
!= HSA_STATUS_SUCCESS
)
2392 hsa_error ("Could not allocate memory for GCN heap", status
);
2396 status
= hsa_fns
.hsa_memory_assign_agent_fn
2397 (module
->heap
, agent
->id
, HSA_ACCESS_PERMISSION_RW
);
2398 if (status
!= HSA_STATUS_SUCCESS
)
2400 hsa_error ("Could not assign GCN heap memory to device", status
);
2404 hsa_fns
.hsa_memory_copy_fn (&module
->heap
->size
,
2405 &gcn_kernel_heap_size
,
2406 sizeof (gcn_kernel_heap_size
));
2412 dump_executable_symbols (agent
->executable
);
2414 status
= hsa_fns
.hsa_executable_freeze_fn (agent
->executable
, "");
2415 if (status
!= HSA_STATUS_SUCCESS
)
2417 hsa_error ("Could not freeze the GCN executable", status
);
2422 agent
->prog_finalized
= true;
2424 if (pthread_mutex_unlock (&agent
->prog_mutex
))
2426 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
2437 /* Free the HSA program in agent and everything associated with it and set
2438 agent->prog_finalized and the initialized flags of all kernels to false.
2439 Return TRUE on success. */
2442 destroy_hsa_program (struct agent_info
*agent
)
2444 if (!agent
->prog_finalized
)
2447 hsa_status_t status
;
2449 GCN_DEBUG ("Destroying the current GCN program.\n");
2451 status
= hsa_fns
.hsa_executable_destroy_fn (agent
->executable
);
2452 if (status
!= HSA_STATUS_SUCCESS
)
2453 return hsa_error ("Could not destroy GCN executable", status
);
2458 for (i
= 0; i
< agent
->module
->kernel_count
; i
++)
2459 agent
->module
->kernels
[i
].initialized
= false;
2461 if (agent
->module
->heap
)
2463 hsa_fns
.hsa_memory_free_fn (agent
->module
->heap
);
2464 agent
->module
->heap
= NULL
;
2467 agent
->prog_finalized
= false;
2471 /* Deinitialize all information associated with MODULE and kernels within
2472 it. Return TRUE on success. */
2475 destroy_module (struct module_info
*module
, bool locked
)
2477 /* Run destructors before destroying module. */
2478 struct GOMP_kernel_launch_attributes kla
=
2482 /* Work-group size. */
2486 if (module
->fini_array_func
)
2488 init_kernel (module
->fini_array_func
);
2489 run_kernel (module
->fini_array_func
, NULL
, &kla
, NULL
, locked
);
2491 module
->constructors_run_p
= false;
2494 for (i
= 0; i
< module
->kernel_count
; i
++)
2495 if (pthread_mutex_destroy (&module
->kernels
[i
].init_mutex
))
2497 GOMP_PLUGIN_error ("Failed to destroy a GCN kernel initialization "
2508 /* Callback of dispatch queues to report errors. */
2511 execute_queue_entry (struct goacc_asyncqueue
*aq
, int index
)
2513 struct queue_entry
*entry
= &aq
->queue
[index
];
2515 switch (entry
->type
)
2519 GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d)\n",
2520 aq
->agent
->device_id
, aq
->id
, index
);
2521 run_kernel (entry
->u
.launch
.kernel
,
2522 entry
->u
.launch
.vars
,
2523 &entry
->u
.launch
.kla
, aq
, false);
2525 GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d) done\n",
2526 aq
->agent
->device_id
, aq
->id
, index
);
2531 GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d)\n",
2532 aq
->agent
->device_id
, aq
->id
, index
);
2533 entry
->u
.callback
.fn (entry
->u
.callback
.data
);
2535 GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d) done\n",
2536 aq
->agent
->device_id
, aq
->id
, index
);
2541 /* FIXME: is it safe to access a placeholder that may already have
2543 struct placeholder
*placeholderp
= entry
->u
.asyncwait
.placeholderp
;
2546 GCN_DEBUG ("Async thread %d:%d: Executing async wait entry (%d)\n",
2547 aq
->agent
->device_id
, aq
->id
, index
);
2549 pthread_mutex_lock (&placeholderp
->mutex
);
2551 while (!placeholderp
->executed
)
2552 pthread_cond_wait (&placeholderp
->cond
, &placeholderp
->mutex
);
2554 pthread_mutex_unlock (&placeholderp
->mutex
);
2556 if (pthread_cond_destroy (&placeholderp
->cond
))
2557 GOMP_PLUGIN_error ("Failed to destroy serialization cond");
2559 if (pthread_mutex_destroy (&placeholderp
->mutex
))
2560 GOMP_PLUGIN_error ("Failed to destroy serialization mutex");
2563 GCN_DEBUG ("Async thread %d:%d: Executing async wait "
2564 "entry (%d) done\n", aq
->agent
->device_id
, aq
->id
, index
);
2568 case ASYNC_PLACEHOLDER
:
2569 pthread_mutex_lock (&entry
->u
.placeholder
.mutex
);
2570 entry
->u
.placeholder
.executed
= 1;
2571 pthread_cond_signal (&entry
->u
.placeholder
.cond
);
2572 pthread_mutex_unlock (&entry
->u
.placeholder
.mutex
);
2576 GOMP_PLUGIN_fatal ("Unknown queue element");
2580 /* This function is run as a thread to service an async queue in the
2581 background. It runs continuously until the stop flag is set. */
2584 drain_queue (void *thread_arg
)
2586 struct goacc_asyncqueue
*aq
= thread_arg
;
2588 if (DRAIN_QUEUE_SYNCHRONOUS_P
)
2590 aq
->drain_queue_stop
= 2;
2594 pthread_mutex_lock (&aq
->mutex
);
2598 if (aq
->drain_queue_stop
)
2601 if (aq
->queue_n
> 0)
2603 pthread_mutex_unlock (&aq
->mutex
);
2604 execute_queue_entry (aq
, aq
->queue_first
);
2606 pthread_mutex_lock (&aq
->mutex
);
2607 aq
->queue_first
= ((aq
->queue_first
+ 1)
2608 % ASYNC_QUEUE_SIZE
);
2611 if (DEBUG_THREAD_SIGNAL
)
2612 GCN_DEBUG ("Async thread %d:%d: broadcasting queue out update\n",
2613 aq
->agent
->device_id
, aq
->id
);
2614 pthread_cond_broadcast (&aq
->queue_cond_out
);
2615 pthread_mutex_unlock (&aq
->mutex
);
2618 GCN_DEBUG ("Async thread %d:%d: continue\n", aq
->agent
->device_id
,
2620 pthread_mutex_lock (&aq
->mutex
);
2624 if (DEBUG_THREAD_SLEEP
)
2625 GCN_DEBUG ("Async thread %d:%d: going to sleep\n",
2626 aq
->agent
->device_id
, aq
->id
);
2627 pthread_cond_wait (&aq
->queue_cond_in
, &aq
->mutex
);
2628 if (DEBUG_THREAD_SLEEP
)
2629 GCN_DEBUG ("Async thread %d:%d: woke up, rechecking\n",
2630 aq
->agent
->device_id
, aq
->id
);
2634 aq
->drain_queue_stop
= 2;
2635 if (DEBUG_THREAD_SIGNAL
)
2636 GCN_DEBUG ("Async thread %d:%d: broadcasting last queue out update\n",
2637 aq
->agent
->device_id
, aq
->id
);
2638 pthread_cond_broadcast (&aq
->queue_cond_out
);
2639 pthread_mutex_unlock (&aq
->mutex
);
2641 GCN_DEBUG ("Async thread %d:%d: returning\n", aq
->agent
->device_id
, aq
->id
);
2645 /* This function is used only when DRAIN_QUEUE_SYNCHRONOUS_P is set, which
2646 is not usually the case. This is just a debug tool. */
2649 drain_queue_synchronous (struct goacc_asyncqueue
*aq
)
2651 pthread_mutex_lock (&aq
->mutex
);
2653 while (aq
->queue_n
> 0)
2655 execute_queue_entry (aq
, aq
->queue_first
);
2657 aq
->queue_first
= ((aq
->queue_first
+ 1)
2658 % ASYNC_QUEUE_SIZE
);
2662 pthread_mutex_unlock (&aq
->mutex
);
2665 /* Block the current thread until an async queue is writable. The aq->mutex
2666 lock should be held on entry, and remains locked on exit. */
2669 wait_for_queue_nonfull (struct goacc_asyncqueue
*aq
)
2671 if (aq
->queue_n
== ASYNC_QUEUE_SIZE
)
2673 /* Queue is full. Wait for it to not be full. */
2674 while (aq
->queue_n
== ASYNC_QUEUE_SIZE
)
2675 pthread_cond_wait (&aq
->queue_cond_out
, &aq
->mutex
);
2679 /* Request an asynchronous kernel launch on the specified queue. This
2680 may block if the queue is full, but returns without waiting for the
2684 queue_push_launch (struct goacc_asyncqueue
*aq
, struct kernel_info
*kernel
,
2685 void *vars
, struct GOMP_kernel_launch_attributes
*kla
)
2687 assert (aq
->agent
== kernel
->agent
);
2689 pthread_mutex_lock (&aq
->mutex
);
2691 wait_for_queue_nonfull (aq
);
2693 int queue_last
= ((aq
->queue_first
+ aq
->queue_n
)
2694 % ASYNC_QUEUE_SIZE
);
2696 GCN_DEBUG ("queue_push_launch %d:%d: at %i\n", aq
->agent
->device_id
,
2697 aq
->id
, queue_last
);
2699 aq
->queue
[queue_last
].type
= KERNEL_LAUNCH
;
2700 aq
->queue
[queue_last
].u
.launch
.kernel
= kernel
;
2701 aq
->queue
[queue_last
].u
.launch
.vars
= vars
;
2702 aq
->queue
[queue_last
].u
.launch
.kla
= *kla
;
2706 if (DEBUG_THREAD_SIGNAL
)
2707 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2708 aq
->agent
->device_id
, aq
->id
);
2709 pthread_cond_signal (&aq
->queue_cond_in
);
2711 pthread_mutex_unlock (&aq
->mutex
);
2714 /* Request an asynchronous callback on the specified queue. The callback
2715 function will be called, with the given opaque data, from the appropriate
2716 async thread, when all previous items on that queue are complete. */
2719 queue_push_callback (struct goacc_asyncqueue
*aq
, void (*fn
)(void *),
2722 pthread_mutex_lock (&aq
->mutex
);
2724 wait_for_queue_nonfull (aq
);
2726 int queue_last
= ((aq
->queue_first
+ aq
->queue_n
)
2727 % ASYNC_QUEUE_SIZE
);
2729 GCN_DEBUG ("queue_push_callback %d:%d: at %i\n", aq
->agent
->device_id
,
2730 aq
->id
, queue_last
);
2732 aq
->queue
[queue_last
].type
= CALLBACK
;
2733 aq
->queue
[queue_last
].u
.callback
.fn
= fn
;
2734 aq
->queue
[queue_last
].u
.callback
.data
= data
;
2738 if (DEBUG_THREAD_SIGNAL
)
2739 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2740 aq
->agent
->device_id
, aq
->id
);
2741 pthread_cond_signal (&aq
->queue_cond_in
);
2743 pthread_mutex_unlock (&aq
->mutex
);
2746 /* Request that a given async thread wait for another thread (unspecified) to
2747 reach the given placeholder. The wait will occur when all previous entries
2748 on the queue are complete. A placeholder is effectively a kind of signal
2749 which simply sets a flag when encountered in a queue. */
2752 queue_push_asyncwait (struct goacc_asyncqueue
*aq
,
2753 struct placeholder
*placeholderp
)
2755 pthread_mutex_lock (&aq
->mutex
);
2757 wait_for_queue_nonfull (aq
);
2759 int queue_last
= ((aq
->queue_first
+ aq
->queue_n
) % ASYNC_QUEUE_SIZE
);
2761 GCN_DEBUG ("queue_push_asyncwait %d:%d: at %i\n", aq
->agent
->device_id
,
2762 aq
->id
, queue_last
);
2764 aq
->queue
[queue_last
].type
= ASYNC_WAIT
;
2765 aq
->queue
[queue_last
].u
.asyncwait
.placeholderp
= placeholderp
;
2769 if (DEBUG_THREAD_SIGNAL
)
2770 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2771 aq
->agent
->device_id
, aq
->id
);
2772 pthread_cond_signal (&aq
->queue_cond_in
);
2774 pthread_mutex_unlock (&aq
->mutex
);
2777 /* Add a placeholder into an async queue. When the async thread reaches the
2778 placeholder it will set the "executed" flag to true and continue.
2779 Another thread may be waiting on this thread reaching the placeholder. */
2781 static struct placeholder
*
2782 queue_push_placeholder (struct goacc_asyncqueue
*aq
)
2784 struct placeholder
*placeholderp
;
2786 pthread_mutex_lock (&aq
->mutex
);
2788 wait_for_queue_nonfull (aq
);
2790 int queue_last
= ((aq
->queue_first
+ aq
->queue_n
) % ASYNC_QUEUE_SIZE
);
2792 GCN_DEBUG ("queue_push_placeholder %d:%d: at %i\n", aq
->agent
->device_id
,
2793 aq
->id
, queue_last
);
2795 aq
->queue
[queue_last
].type
= ASYNC_PLACEHOLDER
;
2796 placeholderp
= &aq
->queue
[queue_last
].u
.placeholder
;
2798 if (pthread_mutex_init (&placeholderp
->mutex
, NULL
))
2800 pthread_mutex_unlock (&aq
->mutex
);
2801 GOMP_PLUGIN_error ("Failed to initialize serialization mutex");
2804 if (pthread_cond_init (&placeholderp
->cond
, NULL
))
2806 pthread_mutex_unlock (&aq
->mutex
);
2807 GOMP_PLUGIN_error ("Failed to initialize serialization cond");
2810 placeholderp
->executed
= 0;
2814 if (DEBUG_THREAD_SIGNAL
)
2815 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2816 aq
->agent
->device_id
, aq
->id
);
2817 pthread_cond_signal (&aq
->queue_cond_in
);
2819 pthread_mutex_unlock (&aq
->mutex
);
2821 return placeholderp
;
2824 /* Signal an asynchronous thread to terminate, and wait for it to do so. */
2827 finalize_async_thread (struct goacc_asyncqueue
*aq
)
2829 pthread_mutex_lock (&aq
->mutex
);
2830 if (aq
->drain_queue_stop
== 2)
2832 pthread_mutex_unlock (&aq
->mutex
);
2836 aq
->drain_queue_stop
= 1;
2838 if (DEBUG_THREAD_SIGNAL
)
2839 GCN_DEBUG ("Signalling async thread %d:%d: cond_in\n",
2840 aq
->agent
->device_id
, aq
->id
);
2841 pthread_cond_signal (&aq
->queue_cond_in
);
2843 while (aq
->drain_queue_stop
!= 2)
2845 if (DEBUG_THREAD_SLEEP
)
2846 GCN_DEBUG ("Waiting for async thread %d:%d to finish, putting thread"
2847 " to sleep\n", aq
->agent
->device_id
, aq
->id
);
2848 pthread_cond_wait (&aq
->queue_cond_out
, &aq
->mutex
);
2849 if (DEBUG_THREAD_SLEEP
)
2850 GCN_DEBUG ("Waiting, woke up thread %d:%d. Rechecking\n",
2851 aq
->agent
->device_id
, aq
->id
);
2854 GCN_DEBUG ("Done waiting for async thread %d:%d\n", aq
->agent
->device_id
,
2856 pthread_mutex_unlock (&aq
->mutex
);
2858 int err
= pthread_join (aq
->thread_drain_queue
, NULL
);
2860 GOMP_PLUGIN_fatal ("Join async thread %d:%d: failed: %s",
2861 aq
->agent
->device_id
, aq
->id
, strerror (err
));
2862 GCN_DEBUG ("Joined with async thread %d:%d\n", aq
->agent
->device_id
, aq
->id
);
2865 /* Set up an async queue for OpenMP. There will be only one. The
2866 implementation simply uses an OpenACC async queue.
2867 FIXME: is this thread-safe if two threads call this function? */
2870 maybe_init_omp_async (struct agent_info
*agent
)
2872 if (!agent
->omp_async_queue
)
2873 agent
->omp_async_queue
2874 = GOMP_OFFLOAD_openacc_async_construct (agent
->device_id
);
2877 /* A wrapper that works around an issue in the HSA runtime with host-to-device
2878 copies from read-only pages. */
2881 hsa_memory_copy_wrapper (void *dst
, const void *src
, size_t len
)
2883 hsa_status_t status
= hsa_fns
.hsa_memory_copy_fn (dst
, src
, len
);
2885 if (status
== HSA_STATUS_SUCCESS
)
2888 /* It appears that the copy fails if the source data is in a read-only page.
2889 We can't detect that easily, so try copying the data to a temporary buffer
2890 and doing the copy again if we got an error above. */
2892 GCN_WARNING ("Read-only data transfer bug workaround triggered for "
2893 "[%p:+%d]\n", (void *) src
, (int) len
);
2895 void *src_copy
= malloc (len
);
2896 memcpy (src_copy
, src
, len
);
2897 status
= hsa_fns
.hsa_memory_copy_fn (dst
, (const void *) src_copy
, len
);
2899 if (status
!= HSA_STATUS_SUCCESS
)
2900 GOMP_PLUGIN_error ("memory copy failed");
2903 /* Copy data to or from a device. This is intended for use as an async
2907 copy_data (void *data_
)
2909 struct copy_data
*data
= (struct copy_data
*)data_
;
2910 GCN_DEBUG ("Async thread %d:%d: Copying %zu bytes from (%p) to (%p)\n",
2911 data
->aq
->agent
->device_id
, data
->aq
->id
, data
->len
, data
->src
,
2913 hsa_memory_copy_wrapper (data
->dst
, data
->src
, data
->len
);
2915 free ((void *) data
->src
);
2919 /* Free device data. This is intended for use as an async callback event. */
2922 gomp_offload_free (void *ptr
)
2924 GCN_DEBUG ("Async thread ?:?: Freeing %p\n", ptr
);
2925 GOMP_OFFLOAD_free (0, ptr
);
2928 /* Request an asynchronous data copy, to or from a device, on a given queue.
2929 The event will be registered as a callback. If FREE_SRC is true
2930 then the source data will be freed following the copy. */
2933 queue_push_copy (struct goacc_asyncqueue
*aq
, void *dst
, const void *src
,
2934 size_t len
, bool free_src
)
2937 GCN_DEBUG ("queue_push_copy %d:%d: %zu bytes from (%p) to (%p)\n",
2938 aq
->agent
->device_id
, aq
->id
, len
, src
, dst
);
2939 struct copy_data
*data
2940 = (struct copy_data
*)GOMP_PLUGIN_malloc (sizeof (struct copy_data
));
2944 data
->free_src
= free_src
;
2946 queue_push_callback (aq
, copy_data
, data
);
2949 /* Return true if the given queue is currently empty. */
2952 queue_empty (struct goacc_asyncqueue
*aq
)
2954 pthread_mutex_lock (&aq
->mutex
);
2955 int res
= aq
->queue_n
== 0 ? 1 : 0;
2956 pthread_mutex_unlock (&aq
->mutex
);
2961 /* Wait for a given queue to become empty. This implements an OpenACC wait
2965 wait_queue (struct goacc_asyncqueue
*aq
)
2967 if (DRAIN_QUEUE_SYNCHRONOUS_P
)
2969 drain_queue_synchronous (aq
);
2973 pthread_mutex_lock (&aq
->mutex
);
2975 while (aq
->queue_n
> 0)
2977 if (DEBUG_THREAD_SLEEP
)
2978 GCN_DEBUG ("waiting for thread %d:%d, putting thread to sleep\n",
2979 aq
->agent
->device_id
, aq
->id
);
2980 pthread_cond_wait (&aq
->queue_cond_out
, &aq
->mutex
);
2981 if (DEBUG_THREAD_SLEEP
)
2982 GCN_DEBUG ("thread %d:%d woke up. Rechecking\n", aq
->agent
->device_id
,
2986 pthread_mutex_unlock (&aq
->mutex
);
2987 GCN_DEBUG ("waiting for thread %d:%d, done\n", aq
->agent
->device_id
, aq
->id
);
2991 /* {{{ OpenACC support */
2993 /* Execute an OpenACC kernel, synchronously or asynchronously. */
2996 gcn_exec (struct kernel_info
*kernel
, size_t mapnum
, void **hostaddrs
,
2997 void **devaddrs
, unsigned *dims
, void *targ_mem_desc
, bool async
,
2998 struct goacc_asyncqueue
*aq
)
3000 if (!GOMP_OFFLOAD_can_run (kernel
))
3001 GOMP_PLUGIN_fatal ("OpenACC host fallback unimplemented.");
3003 /* If we get here then this must be an OpenACC kernel. */
3004 kernel
->kind
= KIND_OPENACC
;
3006 /* devaddrs must be double-indirect on the target. */
3007 void **ind_da
= alloc_by_agent (kernel
->agent
, sizeof (void*) * mapnum
);
3008 for (size_t i
= 0; i
< mapnum
; i
++)
3009 hsa_fns
.hsa_memory_copy_fn (&ind_da
[i
],
3010 devaddrs
[i
] ? &devaddrs
[i
] : &hostaddrs
[i
],
3013 struct hsa_kernel_description
*hsa_kernel_desc
= NULL
;
3014 for (unsigned i
= 0; i
< kernel
->module
->image_desc
->kernel_count
; i
++)
3016 struct hsa_kernel_description
*d
3017 = &kernel
->module
->image_desc
->kernel_infos
[i
];
3018 if (d
->name
== kernel
->name
)
3020 hsa_kernel_desc
= d
;
3025 /* We may have statically-determined dimensions in
3026 hsa_kernel_desc->oacc_dims[] or dimensions passed to this offload kernel
3027 invocation at runtime in dims[]. We allow static dimensions to take
3028 priority over dynamic dimensions when present (non-zero). */
3029 if (hsa_kernel_desc
->oacc_dims
[0] > 0)
3030 dims
[0] = hsa_kernel_desc
->oacc_dims
[0];
3031 if (hsa_kernel_desc
->oacc_dims
[1] > 0)
3032 dims
[1] = hsa_kernel_desc
->oacc_dims
[1];
3033 if (hsa_kernel_desc
->oacc_dims
[2] > 0)
3034 dims
[2] = hsa_kernel_desc
->oacc_dims
[2];
3036 /* If any of the OpenACC dimensions remain 0 then we get to pick a number.
3037 There isn't really a correct answer for this without a clue about the
3038 problem size, so let's do a reasonable number of single-worker gangs.
3039 64 gangs matches a typical Fiji device. */
3041 /* NOTE: Until support for middle-end worker partitioning is merged, use 1
3042 for the default number of workers. */
3043 if (dims
[0] == 0) dims
[0] = get_cu_count (kernel
->agent
); /* Gangs. */
3044 if (dims
[1] == 0) dims
[1] = 1; /* Workers. */
3046 /* The incoming dimensions are expressed in terms of gangs, workers, and
3047 vectors. The HSA dimensions are expressed in terms of "work-items",
3048 which means multiples of vector lanes.
3050 The "grid size" specifies the size of the problem space, and the
3051 "work-group size" specifies how much of that we want a single compute
3052 unit to chew on at once.
3054 The three dimensions do not really correspond to hardware, but the
3055 important thing is that the HSA runtime will launch as many
3056 work-groups as it takes to process the entire grid, and each
3057 work-group will contain as many wave-fronts as it takes to process
3058 the work-items in that group.
3060 Essentially, as long as we set the Y dimension to 64 (the number of
3061 vector lanes in hardware), and the Z group size to the maximum (16),
3062 then we will get the gangs (X) and workers (Z) launched as we expect.
3064 The reason for the apparent reversal of vector and worker dimension
3065 order is to do with the way the run-time distributes work-items across
3067 struct GOMP_kernel_launch_attributes kla
=
3070 {dims
[0], 64, dims
[1]},
3071 /* Work-group size. */
3075 struct goacc_thread
*thr
= GOMP_PLUGIN_goacc_thread ();
3076 acc_prof_info
*prof_info
= thr
->prof_info
;
3077 acc_event_info enqueue_launch_event_info
;
3078 acc_api_info
*api_info
= thr
->api_info
;
3079 bool profiling_dispatch_p
= __builtin_expect (prof_info
!= NULL
, false);
3080 if (profiling_dispatch_p
)
3082 prof_info
->event_type
= acc_ev_enqueue_launch_start
;
3084 enqueue_launch_event_info
.launch_event
.event_type
3085 = prof_info
->event_type
;
3086 enqueue_launch_event_info
.launch_event
.valid_bytes
3087 = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES
;
3088 enqueue_launch_event_info
.launch_event
.parent_construct
3089 = acc_construct_parallel
;
3090 enqueue_launch_event_info
.launch_event
.implicit
= 1;
3091 enqueue_launch_event_info
.launch_event
.tool_info
= NULL
;
3092 enqueue_launch_event_info
.launch_event
.kernel_name
3093 = (char *) kernel
->name
;
3094 enqueue_launch_event_info
.launch_event
.num_gangs
= kla
.gdims
[0];
3095 enqueue_launch_event_info
.launch_event
.num_workers
= kla
.gdims
[2];
3096 enqueue_launch_event_info
.launch_event
.vector_length
= kla
.gdims
[1];
3098 api_info
->device_api
= acc_device_api_other
;
3100 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
,
3101 &enqueue_launch_event_info
, api_info
);
3106 run_kernel (kernel
, ind_da
, &kla
, NULL
, false);
3107 gomp_offload_free (ind_da
);
3111 queue_push_launch (aq
, kernel
, ind_da
, &kla
);
3113 GCN_DEBUG ("queue_push_callback %d:%d gomp_offload_free, %p\n",
3114 aq
->agent
->device_id
, aq
->id
, ind_da
);
3115 queue_push_callback (aq
, gomp_offload_free
, ind_da
);
3118 if (profiling_dispatch_p
)
3120 prof_info
->event_type
= acc_ev_enqueue_launch_end
;
3121 enqueue_launch_event_info
.launch_event
.event_type
= prof_info
->event_type
;
3122 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
,
3123 &enqueue_launch_event_info
,
3129 /* {{{ Generic Plugin API */
3131 /* Return the name of the accelerator, which is "gcn". */
3134 GOMP_OFFLOAD_get_name (void)
3139 /* Return the specific capabilities the HSA accelerator have. */
3142 GOMP_OFFLOAD_get_caps (void)
3144 /* FIXME: Enable shared memory for APU, but not discrete GPU. */
3145 return /*GOMP_OFFLOAD_CAP_SHARED_MEM |*/ GOMP_OFFLOAD_CAP_OPENMP_400
3146 | GOMP_OFFLOAD_CAP_OPENACC_200
;
3149 /* Identify as GCN accelerator. */
3152 GOMP_OFFLOAD_get_type (void)
3154 return OFFLOAD_TARGET_TYPE_GCN
;
3157 /* Return the libgomp version number we're compatible with. There is
3158 no requirement for cross-version compatibility. */
3161 GOMP_OFFLOAD_version (void)
3163 return GOMP_VERSION
;
3166 /* Return the number of GCN devices on the system. */
3169 GOMP_OFFLOAD_get_num_devices (void)
3171 if (!init_hsa_context ())
3173 return hsa_context
.agent_count
;
3176 /* Initialize device (agent) number N so that it can be used for computation.
3177 Return TRUE on success. */
3180 GOMP_OFFLOAD_init_device (int n
)
3182 if (!init_hsa_context ())
3184 if (n
>= hsa_context
.agent_count
)
3186 GOMP_PLUGIN_error ("Request to initialize non-existent GCN device %i", n
);
3189 struct agent_info
*agent
= &hsa_context
.agents
[n
];
3191 if (agent
->initialized
)
3194 agent
->device_id
= n
;
3196 if (pthread_rwlock_init (&agent
->module_rwlock
, NULL
))
3198 GOMP_PLUGIN_error ("Failed to initialize a GCN agent rwlock");
3201 if (pthread_mutex_init (&agent
->prog_mutex
, NULL
))
3203 GOMP_PLUGIN_error ("Failed to initialize a GCN agent program mutex");
3206 if (pthread_mutex_init (&agent
->async_queues_mutex
, NULL
))
3208 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
3211 if (pthread_mutex_init (&agent
->team_arena_write_lock
, NULL
))
3213 GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
3216 agent
->async_queues
= NULL
;
3217 agent
->omp_async_queue
= NULL
;
3218 agent
->team_arena_list
= NULL
;
3220 uint32_t queue_size
;
3221 hsa_status_t status
;
3222 status
= hsa_fns
.hsa_agent_get_info_fn (agent
->id
,
3223 HSA_AGENT_INFO_QUEUE_MAX_SIZE
,
3225 if (status
!= HSA_STATUS_SUCCESS
)
3226 return hsa_error ("Error requesting maximum queue size of the GCN agent",
3229 status
= hsa_fns
.hsa_agent_get_info_fn (agent
->id
, HSA_AGENT_INFO_NAME
,
3231 if (status
!= HSA_STATUS_SUCCESS
)
3232 return hsa_error ("Error querying the name of the agent", status
);
3234 agent
->device_isa
= isa_code (agent
->name
);
3235 if (agent
->device_isa
< 0)
3236 return hsa_error ("Unknown GCN agent architecture", HSA_STATUS_ERROR
);
3238 status
= hsa_fns
.hsa_agent_get_info_fn (agent
->id
, HSA_AGENT_INFO_VENDOR_NAME
,
3239 &agent
->vendor_name
);
3240 if (status
!= HSA_STATUS_SUCCESS
)
3241 return hsa_error ("Error querying the vendor name of the agent", status
);
3243 status
= hsa_fns
.hsa_queue_create_fn (agent
->id
, queue_size
,
3244 HSA_QUEUE_TYPE_MULTI
,
3245 hsa_queue_callback
, NULL
, UINT32_MAX
,
3246 UINT32_MAX
, &agent
->sync_queue
);
3247 if (status
!= HSA_STATUS_SUCCESS
)
3248 return hsa_error ("Error creating command queue", status
);
3250 agent
->kernarg_region
.handle
= (uint64_t) -1;
3251 status
= hsa_fns
.hsa_agent_iterate_regions_fn (agent
->id
,
3252 get_kernarg_memory_region
,
3253 &agent
->kernarg_region
);
3254 if (status
!= HSA_STATUS_SUCCESS
3255 && status
!= HSA_STATUS_INFO_BREAK
)
3256 hsa_error ("Scanning memory regions failed", status
);
3257 if (agent
->kernarg_region
.handle
== (uint64_t) -1)
3259 GOMP_PLUGIN_error ("Could not find suitable memory region for kernel "
3263 GCN_DEBUG ("Selected kernel arguments memory region:\n");
3264 dump_hsa_region (agent
->kernarg_region
, NULL
);
3266 agent
->data_region
.handle
= (uint64_t) -1;
3267 status
= hsa_fns
.hsa_agent_iterate_regions_fn (agent
->id
,
3268 get_data_memory_region
,
3269 &agent
->data_region
);
3270 if (status
!= HSA_STATUS_SUCCESS
3271 && status
!= HSA_STATUS_INFO_BREAK
)
3272 hsa_error ("Scanning memory regions failed", status
);
3273 if (agent
->data_region
.handle
== (uint64_t) -1)
3275 GOMP_PLUGIN_error ("Could not find suitable memory region for device "
3279 GCN_DEBUG ("Selected device data memory region:\n");
3280 dump_hsa_region (agent
->data_region
, NULL
);
3282 GCN_DEBUG ("GCN agent %d initialized\n", n
);
3284 agent
->initialized
= true;
3288 /* Load GCN object-code module described by struct gcn_image_desc in
3289 TARGET_DATA and return references to kernel descriptors in TARGET_TABLE.
3290 If there are any constructors then run them. */
3293 GOMP_OFFLOAD_load_image (int ord
, unsigned version
, const void *target_data
,
3294 struct addr_pair
**target_table
)
3296 if (GOMP_VERSION_DEV (version
) != GOMP_VERSION_GCN
)
3298 GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3299 " (expected %u, received %u)",
3300 GOMP_VERSION_GCN
, GOMP_VERSION_DEV (version
));
3304 struct gcn_image_desc
*image_desc
= (struct gcn_image_desc
*) target_data
;
3305 struct agent_info
*agent
;
3306 struct addr_pair
*pair
;
3307 struct module_info
*module
;
3308 struct kernel_info
*kernel
;
3309 int kernel_count
= image_desc
->kernel_count
;
3310 unsigned var_count
= image_desc
->global_variable_count
;
3312 agent
= get_agent_info (ord
);
3316 if (pthread_rwlock_wrlock (&agent
->module_rwlock
))
3318 GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3321 if (agent
->prog_finalized
3322 && !destroy_hsa_program (agent
))
3325 GCN_DEBUG ("Encountered %d kernels in an image\n", kernel_count
);
3326 GCN_DEBUG ("Encountered %u global variables in an image\n", var_count
);
3327 pair
= GOMP_PLUGIN_malloc ((kernel_count
+ var_count
- 2)
3328 * sizeof (struct addr_pair
));
3329 *target_table
= pair
;
3330 module
= (struct module_info
*)
3331 GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info
)
3332 + kernel_count
* sizeof (struct kernel_info
));
3333 module
->image_desc
= image_desc
;
3334 module
->kernel_count
= kernel_count
;
3335 module
->heap
= NULL
;
3336 module
->constructors_run_p
= false;
3338 kernel
= &module
->kernels
[0];
3340 /* Allocate memory for kernel dependencies. */
3341 for (unsigned i
= 0; i
< kernel_count
; i
++)
3343 struct hsa_kernel_description
*d
= &image_desc
->kernel_infos
[i
];
3344 if (!init_basic_kernel_info (kernel
, d
, agent
, module
))
3346 if (strcmp (d
->name
, "_init_array") == 0)
3347 module
->init_array_func
= kernel
;
3348 else if (strcmp (d
->name
, "_fini_array") == 0)
3349 module
->fini_array_func
= kernel
;
3352 pair
->start
= (uintptr_t) kernel
;
3353 pair
->end
= (uintptr_t) (kernel
+ 1);
3359 agent
->module
= module
;
3360 if (pthread_rwlock_unlock (&agent
->module_rwlock
))
3362 GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3366 if (!create_and_finalize_hsa_program (agent
))
3369 for (unsigned i
= 0; i
< var_count
; i
++)
3371 struct global_var_info
*v
= &image_desc
->global_variables
[i
];
3372 GCN_DEBUG ("Looking for variable %s\n", v
->name
);
3374 hsa_status_t status
;
3375 hsa_executable_symbol_t var_symbol
;
3376 status
= hsa_fns
.hsa_executable_get_symbol_fn (agent
->executable
, NULL
,
3380 if (status
!= HSA_STATUS_SUCCESS
)
3381 hsa_fatal ("Could not find symbol for variable in the code object",
3386 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
3387 (var_symbol
, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS
, &var_addr
);
3388 if (status
!= HSA_STATUS_SUCCESS
)
3389 hsa_fatal ("Could not extract a variable from its symbol", status
);
3390 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
3391 (var_symbol
, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE
, &var_size
);
3392 if (status
!= HSA_STATUS_SUCCESS
)
3393 hsa_fatal ("Could not extract a variable size from its symbol", status
);
3395 pair
->start
= var_addr
;
3396 pair
->end
= var_addr
+ var_size
;
3397 GCN_DEBUG ("Found variable %s at %p with size %u\n", v
->name
,
3398 (void *)var_addr
, var_size
);
3402 /* Ensure that constructors are run first. */
3403 struct GOMP_kernel_launch_attributes kla
=
3407 /* Work-group size. */
3411 if (module
->init_array_func
)
3413 init_kernel (module
->init_array_func
);
3414 run_kernel (module
->init_array_func
, NULL
, &kla
, NULL
, false);
3416 module
->constructors_run_p
= true;
3418 /* Don't report kernels that libgomp need not know about. */
3419 if (module
->init_array_func
)
3421 if (module
->fini_array_func
)
3424 return kernel_count
+ var_count
;
3427 /* Unload GCN object-code module described by struct gcn_image_desc in
3428 TARGET_DATA from agent number N. Return TRUE on success. */
3431 GOMP_OFFLOAD_unload_image (int n
, unsigned version
, const void *target_data
)
3433 if (GOMP_VERSION_DEV (version
) != GOMP_VERSION_GCN
)
3435 GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3436 " (expected %u, received %u)",
3437 GOMP_VERSION_GCN
, GOMP_VERSION_DEV (version
));
3441 struct agent_info
*agent
;
3442 agent
= get_agent_info (n
);
3446 if (pthread_rwlock_wrlock (&agent
->module_rwlock
))
3448 GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3452 if (!agent
->module
|| agent
->module
->image_desc
!= target_data
)
3454 GOMP_PLUGIN_error ("Attempt to unload an image that has never been "
3459 if (!destroy_module (agent
->module
, true))
3461 free (agent
->module
);
3462 agent
->module
= NULL
;
3463 if (!destroy_hsa_program (agent
))
3465 if (pthread_rwlock_unlock (&agent
->module_rwlock
))
3467 GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3473 /* Deinitialize all information and status associated with agent number N. We
3474 do not attempt any synchronization, assuming the user and libgomp will not
3475 attempt deinitialization of a device that is in any way being used at the
3476 same time. Return TRUE on success. */
3479 GOMP_OFFLOAD_fini_device (int n
)
3481 struct agent_info
*agent
= get_agent_info (n
);
3485 if (!agent
->initialized
)
3488 if (agent
->omp_async_queue
)
3490 GOMP_OFFLOAD_openacc_async_destruct (agent
->omp_async_queue
);
3491 agent
->omp_async_queue
= NULL
;
3496 if (!destroy_module (agent
->module
, false))
3498 free (agent
->module
);
3499 agent
->module
= NULL
;
3502 if (!destroy_team_arenas (agent
))
3505 if (!destroy_hsa_program (agent
))
3508 hsa_status_t status
= hsa_fns
.hsa_queue_destroy_fn (agent
->sync_queue
);
3509 if (status
!= HSA_STATUS_SUCCESS
)
3510 return hsa_error ("Error destroying command queue", status
);
3512 if (pthread_mutex_destroy (&agent
->prog_mutex
))
3514 GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex");
3517 if (pthread_rwlock_destroy (&agent
->module_rwlock
))
3519 GOMP_PLUGIN_error ("Failed to destroy a GCN agent rwlock");
3523 if (pthread_mutex_destroy (&agent
->async_queues_mutex
))
3525 GOMP_PLUGIN_error ("Failed to destroy a GCN agent queue mutex");
3528 if (pthread_mutex_destroy (&agent
->team_arena_write_lock
))
3530 GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex");
3533 agent
->initialized
= false;
3537 /* Return true if the HSA runtime can run function FN_PTR. */
3540 GOMP_OFFLOAD_can_run (void *fn_ptr
)
3542 struct kernel_info
*kernel
= (struct kernel_info
*) fn_ptr
;
3544 init_kernel (kernel
);
3545 if (kernel
->initialization_failed
)
3551 if (suppress_host_fallback
)
3552 GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
3553 GCN_WARNING ("GCN target cannot be launched, doing a host fallback\n");
3557 /* Allocate memory on device N. */
void *
GOMP_OFFLOAD_alloc (int n, size_t size)
{
  /* Delegate to the per-agent allocator.  */
  struct agent_info *agent = get_agent_info (n);
  return alloc_by_agent (agent, size);
}
3566 /* Free memory from device N. */
3569 GOMP_OFFLOAD_free (int device
, void *ptr
)
3571 GCN_DEBUG ("Freeing memory on device %d\n", device
);
3573 hsa_status_t status
= hsa_fns
.hsa_memory_free_fn (ptr
);
3574 if (status
!= HSA_STATUS_SUCCESS
)
3576 hsa_error ("Could not free device memory", status
);
3580 struct goacc_thread
*thr
= GOMP_PLUGIN_goacc_thread ();
3581 bool profiling_dispatch_p
3582 = __builtin_expect (thr
!= NULL
&& thr
->prof_info
!= NULL
, false);
3583 if (profiling_dispatch_p
)
3585 acc_prof_info
*prof_info
= thr
->prof_info
;
3586 acc_event_info data_event_info
;
3587 acc_api_info
*api_info
= thr
->api_info
;
3589 prof_info
->event_type
= acc_ev_free
;
3591 data_event_info
.data_event
.event_type
= prof_info
->event_type
;
3592 data_event_info
.data_event
.valid_bytes
3593 = _ACC_DATA_EVENT_INFO_VALID_BYTES
;
3594 data_event_info
.data_event
.parent_construct
3595 = acc_construct_parallel
;
3596 data_event_info
.data_event
.implicit
= 1;
3597 data_event_info
.data_event
.tool_info
= NULL
;
3598 data_event_info
.data_event
.var_name
= NULL
;
3599 data_event_info
.data_event
.bytes
= 0;
3600 data_event_info
.data_event
.host_ptr
= NULL
;
3601 data_event_info
.data_event
.device_ptr
= (void *) ptr
;
3603 api_info
->device_api
= acc_device_api_other
;
3605 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
, &data_event_info
,
3612 /* Copy data from DEVICE to host. */
3615 GOMP_OFFLOAD_dev2host (int device
, void *dst
, const void *src
, size_t n
)
3617 GCN_DEBUG ("Copying %zu bytes from device %d (%p) to host (%p)\n", n
, device
,
3619 hsa_status_t status
= hsa_fns
.hsa_memory_copy_fn (dst
, src
, n
);
3620 if (status
!= HSA_STATUS_SUCCESS
)
3621 GOMP_PLUGIN_error ("memory copy failed");
3625 /* Copy data from host to DEVICE. */
bool
GOMP_OFFLOAD_host2dev (int device, void *dst, const void *src, size_t n)
{
  GCN_DEBUG ("Copying %zu bytes from host (%p) to device %d (%p)\n", n, src,
	     device, dst);
  /* The wrapper handles HSA quirks with copies from certain host pages and
     reports failures itself.  */
  hsa_memory_copy_wrapper (dst, src, n);

  return true;
}
3636 /* Copy data within DEVICE. Do the copy asynchronously, if appropriate. */
3639 GOMP_OFFLOAD_dev2dev (int device
, void *dst
, const void *src
, size_t n
)
3641 struct gcn_thread
*thread_data
= gcn_thread ();
3643 if (thread_data
&& !async_synchronous_p (thread_data
->async
))
3645 struct agent_info
*agent
= get_agent_info (device
);
3646 maybe_init_omp_async (agent
);
3647 queue_push_copy (agent
->omp_async_queue
, dst
, src
, n
, false);
3651 GCN_DEBUG ("Copying %zu bytes from device %d (%p) to device %d (%p)\n", n
,
3652 device
, src
, device
, dst
);
3653 hsa_status_t status
= hsa_fns
.hsa_memory_copy_fn (dst
, src
, n
);
3654 if (status
!= HSA_STATUS_SUCCESS
)
3655 GOMP_PLUGIN_error ("memory copy failed");
3660 /* {{{ OpenMP Plugin API */
3662 /* Run a synchronous OpenMP kernel on DEVICE and pass it an array of pointers
3663 in VARS as a parameter. The kernel is identified by FN_PTR which must point
3664 to a kernel_info structure, and must have previously been loaded to the
3665 specified device. */
3668 GOMP_OFFLOAD_run (int device
, void *fn_ptr
, void *vars
, void **args
)
3670 struct agent_info
*agent
= get_agent_info (device
);
3671 struct kernel_info
*kernel
= (struct kernel_info
*) fn_ptr
;
3672 struct GOMP_kernel_launch_attributes def
;
3673 struct GOMP_kernel_launch_attributes
*kla
;
3674 assert (agent
== kernel
->agent
);
3676 /* If we get here then the kernel must be OpenMP. */
3677 kernel
->kind
= KIND_OPENMP
;
3679 if (!parse_target_attributes (args
, &def
, &kla
, agent
))
3681 GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
3684 run_kernel (kernel
, vars
, kla
, NULL
, false);
3687 /* Run an asynchronous OpenMP kernel on DEVICE. This is similar to
3688 GOMP_OFFLOAD_run except that the launch is queued and there is a call to
3689 GOMP_PLUGIN_target_task_completion when it has finished. */
3692 GOMP_OFFLOAD_async_run (int device
, void *tgt_fn
, void *tgt_vars
,
3693 void **args
, void *async_data
)
3695 GCN_DEBUG ("GOMP_OFFLOAD_async_run invoked\n");
3696 struct agent_info
*agent
= get_agent_info (device
);
3697 struct kernel_info
*kernel
= (struct kernel_info
*) tgt_fn
;
3698 struct GOMP_kernel_launch_attributes def
;
3699 struct GOMP_kernel_launch_attributes
*kla
;
3700 assert (agent
== kernel
->agent
);
3702 /* If we get here then the kernel must be OpenMP. */
3703 kernel
->kind
= KIND_OPENMP
;
3705 if (!parse_target_attributes (args
, &def
, &kla
, agent
))
3707 GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
3711 maybe_init_omp_async (agent
);
3712 queue_push_launch (agent
->omp_async_queue
, kernel
, tgt_vars
, kla
);
3713 queue_push_callback (agent
->omp_async_queue
,
3714 GOMP_PLUGIN_target_task_completion
, async_data
);
3718 /* {{{ OpenACC Plugin API */
3720 /* Run a synchronous OpenACC kernel. The device number is inferred from the
3721 already-loaded KERNEL. */
3724 GOMP_OFFLOAD_openacc_exec (void (*fn_ptr
) (void *), size_t mapnum
,
3725 void **hostaddrs
, void **devaddrs
, unsigned *dims
,
3726 void *targ_mem_desc
)
3728 struct kernel_info
*kernel
= (struct kernel_info
*) fn_ptr
;
3730 gcn_exec (kernel
, mapnum
, hostaddrs
, devaddrs
, dims
, targ_mem_desc
, false,
3734 /* Run an asynchronous OpenACC kernel on the specified queue. */
3737 GOMP_OFFLOAD_openacc_async_exec (void (*fn_ptr
) (void *), size_t mapnum
,
3738 void **hostaddrs
, void **devaddrs
,
3739 unsigned *dims
, void *targ_mem_desc
,
3740 struct goacc_asyncqueue
*aq
)
3742 struct kernel_info
*kernel
= (struct kernel_info
*) fn_ptr
;
3744 gcn_exec (kernel
, mapnum
, hostaddrs
, devaddrs
, dims
, targ_mem_desc
, true,
3748 /* Create a new asynchronous thread and queue for running future kernels. */
3750 struct goacc_asyncqueue
*
3751 GOMP_OFFLOAD_openacc_async_construct (int device
)
3753 struct agent_info
*agent
= get_agent_info (device
);
3755 pthread_mutex_lock (&agent
->async_queues_mutex
);
3757 struct goacc_asyncqueue
*aq
= GOMP_PLUGIN_malloc (sizeof (*aq
));
3758 aq
->agent
= get_agent_info (device
);
3760 aq
->next
= agent
->async_queues
;
3763 aq
->next
->prev
= aq
;
3764 aq
->id
= aq
->next
->id
+ 1;
3768 agent
->async_queues
= aq
;
3770 aq
->queue_first
= 0;
3772 aq
->drain_queue_stop
= 0;
3774 if (pthread_mutex_init (&aq
->mutex
, NULL
))
3776 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
3779 if (pthread_cond_init (&aq
->queue_cond_in
, NULL
))
3781 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
3784 if (pthread_cond_init (&aq
->queue_cond_out
, NULL
))
3786 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
3790 hsa_status_t status
= hsa_fns
.hsa_queue_create_fn (agent
->id
,
3792 HSA_QUEUE_TYPE_MULTI
,
3793 hsa_queue_callback
, NULL
,
3794 UINT32_MAX
, UINT32_MAX
,
3796 if (status
!= HSA_STATUS_SUCCESS
)
3797 hsa_fatal ("Error creating command queue", status
);
3799 int err
= pthread_create (&aq
->thread_drain_queue
, NULL
, &drain_queue
, aq
);
3801 GOMP_PLUGIN_fatal ("GCN asynchronous thread creation failed: %s",
3803 GCN_DEBUG ("Async thread %d:%d: created\n", aq
->agent
->device_id
,
3806 pthread_mutex_unlock (&agent
->async_queues_mutex
);
3811 /* Destroy an existing asynchronous thread and queue. Waits for any
3812 currently-running task to complete, but cancels any queued tasks. */
3815 GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue
*aq
)
3817 struct agent_info
*agent
= aq
->agent
;
3819 finalize_async_thread (aq
);
3821 pthread_mutex_lock (&agent
->async_queues_mutex
);
3824 if ((err
= pthread_mutex_destroy (&aq
->mutex
)))
3826 GOMP_PLUGIN_error ("Failed to destroy a GCN async queue mutex: %d", err
);
3829 if (pthread_cond_destroy (&aq
->queue_cond_in
))
3831 GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
3834 if (pthread_cond_destroy (&aq
->queue_cond_out
))
3836 GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
3839 hsa_status_t status
= hsa_fns
.hsa_queue_destroy_fn (aq
->hsa_queue
);
3840 if (status
!= HSA_STATUS_SUCCESS
)
3842 hsa_error ("Error destroying command queue", status
);
3847 aq
->prev
->next
= aq
->next
;
3849 aq
->next
->prev
= aq
->prev
;
3850 if (agent
->async_queues
== aq
)
3851 agent
->async_queues
= aq
->next
;
3853 GCN_DEBUG ("Async thread %d:%d: destroyed\n", agent
->device_id
, aq
->id
);
3856 pthread_mutex_unlock (&agent
->async_queues_mutex
);
3860 pthread_mutex_unlock (&agent
->async_queues_mutex
);
3864 /* Return true if the specified async queue is currently empty. */
int
GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
{
  /* Non-zero when no tasks remain queued.  */
  return queue_empty (aq);
}
3872 /* Block until the specified queue has executed all its tasks and the
bool
GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
{
  /* Block until every task queued so far has drained.  */
  wait_queue (aq);
  return true;
}
3882 /* Add a serialization point across two async queues. Any new tasks added to
3883 AQ2, after this call, will not run until all tasks on AQ1, at the time
3884 of this call, have completed. */
bool
GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
				      struct goacc_asyncqueue *aq2)
{
  /* For serialize, stream aq2 waits for aq1 to complete work that has been
     scheduled to run on it up to this point.  */
  if (aq1 != aq2)
    {
      struct placeholder *placeholderp = queue_push_placeholder (aq1);
      queue_push_asyncwait (aq2, placeholderp);
    }
  return true;
}
3900 /* Add an opaque callback to the given async queue. */
void
GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
					   void (*fn) (void *), void *data)
{
  /* FN (DATA) will run once everything ahead of it in AQ has completed.  */
  queue_push_callback (aq, fn, data);
}
3909 /* Queue up an asynchronous data copy from host to DEVICE. */
3912 GOMP_OFFLOAD_openacc_async_host2dev (int device
, void *dst
, const void *src
,
3913 size_t n
, struct goacc_asyncqueue
*aq
)
3915 struct agent_info
*agent
= get_agent_info (device
);
3916 assert (agent
== aq
->agent
);
3917 /* The source data does not necessarily remain live until the deferred
3918 copy happens. Taking a snapshot of the data here avoids reading
3919 uninitialised data later, but means that (a) data is copied twice and
3920 (b) modifications to the copied data between the "spawning" point of
3921 the asynchronous kernel and when it is executed will not be seen.
3922 But, that is probably correct. */
3923 void *src_copy
= GOMP_PLUGIN_malloc (n
);
3924 memcpy (src_copy
, src
, n
);
3925 queue_push_copy (aq
, dst
, src_copy
, n
, true);
3929 /* Queue up an asynchronous data copy from DEVICE to host. */
3932 GOMP_OFFLOAD_openacc_async_dev2host (int device
, void *dst
, const void *src
,
3933 size_t n
, struct goacc_asyncqueue
*aq
)
3935 struct agent_info
*agent
= get_agent_info (device
);
3936 assert (agent
== aq
->agent
);
3937 queue_push_copy (aq
, dst
, src
, n
, false);
3941 union goacc_property_value
3942 GOMP_OFFLOAD_openacc_get_property (int device
, enum goacc_property prop
)
3944 struct agent_info
*agent
= get_agent_info (device
);
3946 union goacc_property_value propval
= { .val
= 0 };
3950 case GOACC_PROPERTY_FREE_MEMORY
:
3951 /* Not supported. */
3953 case GOACC_PROPERTY_MEMORY
:
3956 hsa_region_t region
= agent
->data_region
;
3957 hsa_status_t status
=
3958 hsa_fns
.hsa_region_get_info_fn (region
, HSA_REGION_INFO_SIZE
, &size
);
3959 if (status
== HSA_STATUS_SUCCESS
)
3963 case GOACC_PROPERTY_NAME
:
3964 propval
.ptr
= agent
->name
;
3966 case GOACC_PROPERTY_VENDOR
:
3967 propval
.ptr
= agent
->vendor_name
;
3969 case GOACC_PROPERTY_DRIVER
:
3970 propval
.ptr
= hsa_context
.driver_version_s
;
3977 /* Set up plugin-specific thread-local-data (host-side). */
3980 GOMP_OFFLOAD_openacc_create_thread_data (int ord
__attribute__((unused
)))
3982 struct gcn_thread
*thread_data
3983 = GOMP_PLUGIN_malloc (sizeof (struct gcn_thread
));
3985 thread_data
->async
= GOMP_ASYNC_SYNC
;
3987 return (void *) thread_data
;
3990 /* Clean up plugin-specific thread-local-data. */
3993 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data
)