1 /* Plugin for AMD GCN execution.
3 Copyright (C) 2013-2020 Free Software Foundation, Inc.
5 Contributed by Mentor Embedded
7 This file is part of the GNU Offloading and Multi Processing Library
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
29 /* {{{ Includes and defines */
42 #include "libgomp-plugin.h"
43 #include "gomp-constants.h"
45 #include "oacc-plugin.h"
49 /* Additional definitions not in HSA 1.1.
50 FIXME: this needs to be updated in hsa.h for upstream, but the only source
51 right now is the ROCr source which may cause license issues. */
52 #define HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT 0xA002
54 /* These probably won't be in elf.h for a while. */
55 #define R_AMDGPU_NONE 0
56 #define R_AMDGPU_ABS32_LO 1 /* (S + A) & 0xFFFFFFFF */
57 #define R_AMDGPU_ABS32_HI 2 /* (S + A) >> 32 */
58 #define R_AMDGPU_ABS64 3 /* S + A */
59 #define R_AMDGPU_REL32 4 /* S + A - P */
60 #define R_AMDGPU_REL64 5 /* S + A - P */
61 #define R_AMDGPU_ABS32 6 /* S + A */
62 #define R_AMDGPU_GOTPCREL 7 /* G + GOT + A - P */
63 #define R_AMDGPU_GOTPCREL32_LO 8 /* (G + GOT + A - P) & 0xFFFFFFFF */
64 #define R_AMDGPU_GOTPCREL32_HI 9 /* (G + GOT + A - P) >> 32 */
65 #define R_AMDGPU_REL32_LO 10 /* (S + A - P) & 0xFFFFFFFF */
66 #define R_AMDGPU_REL32_HI 11 /* (S + A - P) >> 32 */
68 #define R_AMDGPU_RELATIVE64 13 /* B + A */
70 /* GCN specific definitions for asynchronous queues. */
72 #define ASYNC_QUEUE_SIZE 64
73 #define DRAIN_QUEUE_SYNCHRONOUS_P false
74 #define DEBUG_QUEUES 0
75 #define DEBUG_THREAD_SLEEP 0
76 #define DEBUG_THREAD_SIGNAL 0
79 #define DEFAULT_GCN_HEAP_SIZE (100*1024*1024) /* 100MB. */
81 /* Secure getenv() which returns NULL if running as SUID/SGID. */
82 #ifndef HAVE_SECURE_GETENV
83 #ifdef HAVE___SECURE_GETENV
84 #define secure_getenv __secure_getenv
85 #elif defined (HAVE_UNISTD_H) && defined(HAVE_GETUID) && defined(HAVE_GETEUID) \
86 && defined(HAVE_GETGID) && defined(HAVE_GETEGID)
90 /* Implementation of secure_getenv() for targets where it is not provided but
91 we have at least means to test real and effective IDs. */
static char *
secure_getenv (const char *name)
{
  /* Trust the environment only when the real and effective user and group
     IDs match, i.e. when we are not running SUID/SGID.  */
  if ((getuid () == geteuid ()) && (getgid () == getegid ()))
    return getenv (name);
  else
    return NULL;
}
103 #define secure_getenv getenv
110 /* GCN-specific implementation of the GOMP_PLUGIN_acc_thread data. */
114 /* The thread number from the async clause, or GOMP_ASYNC_SYNC. */
118 /* As an HSA runtime is dlopened, following structure defines function
119 pointers utilized by the HSA plug-in. */
121 struct hsa_runtime_fn_info
124 hsa_status_t (*hsa_status_string_fn
) (hsa_status_t status
,
125 const char **status_string
);
126 hsa_status_t (*hsa_system_get_info_fn
) (hsa_system_info_t attribute
,
128 hsa_status_t (*hsa_agent_get_info_fn
) (hsa_agent_t agent
,
129 hsa_agent_info_t attribute
,
131 hsa_status_t (*hsa_isa_get_info_fn
)(hsa_isa_t isa
,
132 hsa_isa_info_t attribute
,
135 hsa_status_t (*hsa_init_fn
) (void);
136 hsa_status_t (*hsa_iterate_agents_fn
)
137 (hsa_status_t (*callback
)(hsa_agent_t agent
, void *data
), void *data
);
138 hsa_status_t (*hsa_region_get_info_fn
) (hsa_region_t region
,
139 hsa_region_info_t attribute
,
141 hsa_status_t (*hsa_queue_create_fn
)
142 (hsa_agent_t agent
, uint32_t size
, hsa_queue_type_t type
,
143 void (*callback
)(hsa_status_t status
, hsa_queue_t
*source
, void *data
),
144 void *data
, uint32_t private_segment_size
,
145 uint32_t group_segment_size
, hsa_queue_t
**queue
);
146 hsa_status_t (*hsa_agent_iterate_regions_fn
)
148 hsa_status_t (*callback
)(hsa_region_t region
, void *data
), void *data
);
149 hsa_status_t (*hsa_executable_destroy_fn
) (hsa_executable_t executable
);
150 hsa_status_t (*hsa_executable_create_fn
)
151 (hsa_profile_t profile
, hsa_executable_state_t executable_state
,
152 const char *options
, hsa_executable_t
*executable
);
153 hsa_status_t (*hsa_executable_global_variable_define_fn
)
154 (hsa_executable_t executable
, const char *variable_name
, void *address
);
155 hsa_status_t (*hsa_executable_load_code_object_fn
)
156 (hsa_executable_t executable
, hsa_agent_t agent
,
157 hsa_code_object_t code_object
, const char *options
);
158 hsa_status_t (*hsa_executable_freeze_fn
)(hsa_executable_t executable
,
159 const char *options
);
160 hsa_status_t (*hsa_signal_create_fn
) (hsa_signal_value_t initial_value
,
161 uint32_t num_consumers
,
162 const hsa_agent_t
*consumers
,
163 hsa_signal_t
*signal
);
164 hsa_status_t (*hsa_memory_allocate_fn
) (hsa_region_t region
, size_t size
,
166 hsa_status_t (*hsa_memory_assign_agent_fn
) (void *ptr
, hsa_agent_t agent
,
167 hsa_access_permission_t access
);
168 hsa_status_t (*hsa_memory_copy_fn
)(void *dst
, const void *src
, size_t size
);
169 hsa_status_t (*hsa_memory_free_fn
) (void *ptr
);
170 hsa_status_t (*hsa_signal_destroy_fn
) (hsa_signal_t signal
);
171 hsa_status_t (*hsa_executable_get_symbol_fn
)
172 (hsa_executable_t executable
, const char *module_name
,
173 const char *symbol_name
, hsa_agent_t agent
, int32_t call_convention
,
174 hsa_executable_symbol_t
*symbol
);
175 hsa_status_t (*hsa_executable_symbol_get_info_fn
)
176 (hsa_executable_symbol_t executable_symbol
,
177 hsa_executable_symbol_info_t attribute
, void *value
);
178 hsa_status_t (*hsa_executable_iterate_symbols_fn
)
179 (hsa_executable_t executable
,
180 hsa_status_t (*callback
)(hsa_executable_t executable
,
181 hsa_executable_symbol_t symbol
, void *data
),
183 uint64_t (*hsa_queue_add_write_index_release_fn
) (const hsa_queue_t
*queue
,
185 uint64_t (*hsa_queue_load_read_index_acquire_fn
) (const hsa_queue_t
*queue
);
186 void (*hsa_signal_store_relaxed_fn
) (hsa_signal_t signal
,
187 hsa_signal_value_t value
);
188 void (*hsa_signal_store_release_fn
) (hsa_signal_t signal
,
189 hsa_signal_value_t value
);
190 hsa_signal_value_t (*hsa_signal_wait_acquire_fn
)
191 (hsa_signal_t signal
, hsa_signal_condition_t condition
,
192 hsa_signal_value_t compare_value
, uint64_t timeout_hint
,
193 hsa_wait_state_t wait_state_hint
);
194 hsa_signal_value_t (*hsa_signal_load_acquire_fn
) (hsa_signal_t signal
);
195 hsa_status_t (*hsa_queue_destroy_fn
) (hsa_queue_t
*queue
);
197 hsa_status_t (*hsa_code_object_deserialize_fn
)
198 (void *serialized_code_object
, size_t serialized_code_object_size
,
199 const char *options
, hsa_code_object_t
*code_object
);
202 /* Structure describing the run-time and grid properties of an HSA kernel
   launch.  This needs to match the format passed to GOMP_OFFLOAD_run.  */
205 struct GOMP_kernel_launch_attributes
207 /* Number of dimensions the workload has. Maximum number is 3. */
209 /* Size of the grid in the three respective dimensions. */
211 /* Size of work-groups in the respective dimensions. */
215 /* Collection of information needed for a dispatch of a kernel from a
218 struct kernel_dispatch
220 struct agent_info
*agent
;
221 /* Pointer to a command queue associated with a kernel dispatch agent. */
223 /* Pointer to a memory space used for kernel arguments passing. */
224 void *kernarg_address
;
227 /* Synchronization signal used for dispatch synchronization. */
229 /* Private segment size. */
230 uint32_t private_segment_size
;
231 /* Group segment size. */
232 uint32_t group_segment_size
;
235 /* Structure of the kernargs segment, supporting console output.
237 This needs to match the definitions in Newlib, and the expectations
238 in libgomp target code. */
241 /* Leave space for the real kernel arguments.
242 OpenACC and OpenMP only use one pointer. */
246 /* A pointer to struct output, below, for console output data. */
249 /* A pointer to struct heap, below. */
252 /* A pointer to an ephemeral memory arena.
253 Only needed for OpenMP. */
259 unsigned int next_output
;
270 unsigned int consumed
;
274 /* A queue entry for a future asynchronous launch. */
278 struct kernel_info
*kernel
;
280 struct GOMP_kernel_launch_attributes kla
;
283 /* A queue entry for a future callback. */
291 /* A data struct for the copy_data callback. */
299 struct goacc_asyncqueue
*aq
;
302 /* A queue entry for a placeholder. These correspond to a wait event. */
308 pthread_mutex_t mutex
;
311 /* A queue entry for a wait directive. */
313 struct asyncwait_info
315 struct placeholder
*placeholderp
;
318 /* Encode the type of an entry in an async queue. */
328 /* An entry in an async queue. */
332 enum entry_type type
;
334 struct kernel_launch launch
;
335 struct callback callback
;
336 struct asyncwait_info asyncwait
;
337 struct placeholder placeholder
;
341 /* An async queue header.
343 OpenMP may create one of these.
344 OpenACC may create many. */
346 struct goacc_asyncqueue
348 struct agent_info
*agent
;
349 hsa_queue_t
*hsa_queue
;
351 pthread_t thread_drain_queue
;
352 pthread_mutex_t mutex
;
353 pthread_cond_t queue_cond_in
;
354 pthread_cond_t queue_cond_out
;
355 struct queue_entry queue
[ASYNC_QUEUE_SIZE
];
358 int drain_queue_stop
;
361 struct goacc_asyncqueue
*prev
;
362 struct goacc_asyncqueue
*next
;
365 /* Mkoffload uses this structure to describe a kernel.
367 OpenMP kernel dimensions are passed at runtime.
368 OpenACC kernel dimensions are passed at compile time, here. */
370 struct hsa_kernel_description
373 int oacc_dims
[3]; /* Only present for GCN kernels. */
376 /* Mkoffload uses this structure to describe an offload variable. */
378 struct global_var_info
384 /* Mkoffload uses this structure to describe all the kernels in a
   loadable module.  These are passed to libgomp via static constructors.  */
387 struct gcn_image_desc
393 const unsigned kernel_count
;
394 struct hsa_kernel_description
*kernel_infos
;
395 const unsigned global_variable_count
;
396 struct global_var_info
*global_variables
;
399 /* Description of an HSA GPU agent (device) and the program associated with
404 /* The HSA ID of the agent. Assigned when hsa_context is initialized. */
406 /* The user-visible device number. */
408 /* Whether the agent has been initialized. The fields below are usable only
411 /* Precomputed check for problem architectures. */
414 /* Command queues of the agent. */
415 hsa_queue_t
*sync_queue
;
416 struct goacc_asyncqueue
*async_queues
, *omp_async_queue
;
417 pthread_mutex_t async_queues_mutex
;
419 /* The HSA memory region from which to allocate kernel arguments. */
420 hsa_region_t kernarg_region
;
422 /* The HSA memory region from which to allocate device data. */
423 hsa_region_t data_region
;
425 /* Allocated team arenas. */
426 struct team_arena_list
*team_arena_list
;
427 pthread_mutex_t team_arena_write_lock
;
429 /* Read-write lock that protects kernels which are running or about to be run
430 from interference with loading and unloading of images. Needs to be
431 locked for reading while a kernel is being run, and for writing if the
432 list of modules is manipulated (and thus the HSA program invalidated). */
433 pthread_rwlock_t module_rwlock
;
435 /* The module associated with this kernel. */
436 struct module_info
*module
;
438 /* Mutex enforcing that only one thread will finalize the HSA program. A
439 thread should have locked agent->module_rwlock for reading before
441 pthread_mutex_t prog_mutex
;
442 /* Flag whether the HSA program that consists of all the modules has been
445 /* HSA executable - the finalized program that is used to locate kernels. */
446 hsa_executable_t executable
;
449 /* Information required to identify, finalize and run any given kernel. */
451 enum offload_kind
{KIND_UNKNOWN
, KIND_OPENMP
, KIND_OPENACC
};
455 /* Name of the kernel, required to locate it within the GCN object-code
458 /* The specific agent the kernel has been or will be finalized for and run
460 struct agent_info
*agent
;
461 /* The specific module where the kernel takes place. */
462 struct module_info
*module
;
463 /* Mutex enforcing that at most once thread ever initializes a kernel for
464 use. A thread should have locked agent->module_rwlock for reading before
466 pthread_mutex_t init_mutex
;
467 /* Flag indicating whether the kernel has been initialized and all fields
468 below it contain valid data. */
470 /* Flag indicating that the kernel has a problem that blocks an execution. */
471 bool initialization_failed
;
472 /* The object to be put into the dispatch queue. */
474 /* Required size of kernel arguments. */
475 uint32_t kernarg_segment_size
;
476 /* Required size of group segment. */
477 uint32_t group_segment_size
;
478 /* Required size of private segment. */
479 uint32_t private_segment_size
;
480 /* Set up for OpenMP or OpenACC? */
481 enum offload_kind kind
;
484 /* Information about a particular GCN module, its image and kernels. */
488 /* The description with which the program has registered the image. */
489 struct gcn_image_desc
*image_desc
;
490 /* GCN heap allocation. */
492 /* Physical boundaries of the loaded module. */
493 Elf64_Addr phys_address_start
;
494 Elf64_Addr phys_address_end
;
496 bool constructors_run_p
;
497 struct kernel_info
*init_array_func
, *fini_array_func
;
499 /* Number of kernels in this module. */
501 /* An array of kernel_info structures describing each kernel in this
503 struct kernel_info kernels
[];
506 /* A linked list of memory arenas allocated on the device.
507 These are only used by OpenMP, as a means to optimize per-team malloc. */
509 struct team_arena_list
511 struct team_arena_list
*next
;
513 /* The number of teams determines the size of the allocation. */
515 /* The device address of the arena itself. */
517 /* A flag to prevent two asynchronous kernels trying to use the same arena.
518 The mutex is locked until the kernel exits. */
519 pthread_mutex_t in_use
;
522 /* Information about the whole HSA environment and all of its agents. */
524 struct hsa_context_info
526 /* Whether the structure has been initialized. */
528 /* Number of usable GPU HSA agents in the system. */
530 /* Array of agent_info structures describing the individual HSA agents. */
531 struct agent_info
*agents
;
534 /* Format of the on-device heap.
536 This must match the definition in Newlib and gcn-run. */
544 /* {{{ Global variables */
546 /* Information about the whole HSA environment and all of its agents. */
548 static struct hsa_context_info hsa_context
;
550 /* HSA runtime functions that are initialized in init_hsa_context. */
552 static struct hsa_runtime_fn_info hsa_fns
;
554 /* Heap space, allocated target-side, provided for use of newlib malloc.
   Each module should have its own heap allocated.
556 Beware that heap usage increases with OpenMP teams. See also arenas. */
558 static size_t gcn_kernel_heap_size
= DEFAULT_GCN_HEAP_SIZE
;
560 /* Flag to decide whether print to stderr information about what is going on.
561 Set in init_debug depending on environment variables. */
565 /* Flag to decide if the runtime should suppress a possible fallback to host
568 static bool suppress_host_fallback
;
570 /* Flag to locate HSA runtime shared library that is dlopened
573 static const char *hsa_runtime_lib
;
575 /* Flag to decide if the runtime should support also CPU devices (can be
578 static bool support_cpu_devices
;
580 /* Runtime dimension overrides. Zero indicates default. */
582 static int override_x_dim
= 0;
583 static int override_z_dim
= 0;
586 /* {{{ Debug & Diagnostic */
588 /* Print a message to stderr if GCN_DEBUG value is set to true. */
590 #define DEBUG_PRINT(...) \
595 fprintf (stderr, __VA_ARGS__); \
600 /* Flush stderr if GCN_DEBUG value is set to true. */
602 #define DEBUG_FLUSH() \
608 /* Print a logging message with PREFIX to stderr if GCN_DEBUG value
611 #define DEBUG_LOG(prefix, ...) \
614 DEBUG_PRINT (prefix); \
615 DEBUG_PRINT (__VA_ARGS__); \
619 /* Print a debugging message to stderr. */
621 #define GCN_DEBUG(...) DEBUG_LOG ("GCN debug: ", __VA_ARGS__)
623 /* Print a warning message to stderr. */
625 #define GCN_WARNING(...) DEBUG_LOG ("GCN warning: ", __VA_ARGS__)
627 /* Print HSA warning STR with an HSA STATUS code. */
630 hsa_warn (const char *str
, hsa_status_t status
)
635 const char *hsa_error_msg
= "[unknown]";
636 hsa_fns
.hsa_status_string_fn (status
, &hsa_error_msg
);
638 fprintf (stderr
, "GCN warning: %s\nRuntime message: %s\n", str
,
642 /* Report a fatal error STR together with the HSA error corresponding to STATUS
643 and terminate execution of the current process. */
646 hsa_fatal (const char *str
, hsa_status_t status
)
648 const char *hsa_error_msg
= "[unknown]";
649 hsa_fns
.hsa_status_string_fn (status
, &hsa_error_msg
);
650 GOMP_PLUGIN_fatal ("GCN fatal error: %s\nRuntime message: %s\n", str
,
654 /* Like hsa_fatal, except only report error message, and return FALSE
655 for propagating error processing to outside of plugin. */
658 hsa_error (const char *str
, hsa_status_t status
)
660 const char *hsa_error_msg
= "[unknown]";
661 hsa_fns
.hsa_status_string_fn (status
, &hsa_error_msg
);
662 GOMP_PLUGIN_error ("GCN fatal error: %s\nRuntime message: %s\n", str
,
667 /* Dump information about the available hardware. */
670 dump_hsa_system_info (void)
674 hsa_endianness_t endianness
;
675 status
= hsa_fns
.hsa_system_get_info_fn (HSA_SYSTEM_INFO_ENDIANNESS
,
677 if (status
== HSA_STATUS_SUCCESS
)
680 case HSA_ENDIANNESS_LITTLE
:
681 GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: LITTLE\n");
683 case HSA_ENDIANNESS_BIG
:
684 GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: BIG\n");
687 GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: UNKNOWN\n");
690 GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: FAILED\n");
692 uint8_t extensions
[128];
693 status
= hsa_fns
.hsa_system_get_info_fn (HSA_SYSTEM_INFO_EXTENSIONS
,
695 if (status
== HSA_STATUS_SUCCESS
)
697 if (extensions
[0] & (1 << HSA_EXTENSION_IMAGES
))
698 GCN_DEBUG ("HSA_SYSTEM_INFO_EXTENSIONS: IMAGES\n");
701 GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n");
704 /* Dump information about the available hardware. */
707 dump_machine_model (hsa_machine_model_t machine_model
, const char *s
)
709 switch (machine_model
)
711 case HSA_MACHINE_MODEL_SMALL
:
712 GCN_DEBUG ("%s: SMALL\n", s
);
714 case HSA_MACHINE_MODEL_LARGE
:
715 GCN_DEBUG ("%s: LARGE\n", s
);
718 GCN_WARNING ("%s: UNKNOWN\n", s
);
723 /* Dump information about the available hardware. */
726 dump_profile (hsa_profile_t profile
, const char *s
)
730 case HSA_PROFILE_FULL
:
731 GCN_DEBUG ("%s: FULL\n", s
);
733 case HSA_PROFILE_BASE
:
734 GCN_DEBUG ("%s: BASE\n", s
);
737 GCN_WARNING ("%s: UNKNOWN\n", s
);
742 /* Dump information about a device memory region. */
745 dump_hsa_region (hsa_region_t region
, void *data
__attribute__((unused
)))
749 hsa_region_segment_t segment
;
750 status
= hsa_fns
.hsa_region_get_info_fn (region
, HSA_REGION_INFO_SEGMENT
,
752 if (status
== HSA_STATUS_SUCCESS
)
754 if (segment
== HSA_REGION_SEGMENT_GLOBAL
)
755 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GLOBAL\n");
756 else if (segment
== HSA_REGION_SEGMENT_READONLY
)
757 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: READONLY\n");
758 else if (segment
== HSA_REGION_SEGMENT_PRIVATE
)
759 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: PRIVATE\n");
760 else if (segment
== HSA_REGION_SEGMENT_GROUP
)
761 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GROUP\n");
763 GCN_WARNING ("HSA_REGION_INFO_SEGMENT: UNKNOWN\n");
766 GCN_WARNING ("HSA_REGION_INFO_SEGMENT: FAILED\n");
768 if (segment
== HSA_REGION_SEGMENT_GLOBAL
)
772 = hsa_fns
.hsa_region_get_info_fn (region
, HSA_REGION_INFO_GLOBAL_FLAGS
,
774 if (status
== HSA_STATUS_SUCCESS
)
776 if (flags
& HSA_REGION_GLOBAL_FLAG_KERNARG
)
777 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: KERNARG\n");
778 if (flags
& HSA_REGION_GLOBAL_FLAG_FINE_GRAINED
)
779 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: FINE_GRAINED\n");
780 if (flags
& HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED
)
781 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: COARSE_GRAINED\n");
784 GCN_WARNING ("HSA_REGION_INFO_GLOBAL_FLAGS: FAILED\n");
788 status
= hsa_fns
.hsa_region_get_info_fn (region
, HSA_REGION_INFO_SIZE
, &size
);
789 if (status
== HSA_STATUS_SUCCESS
)
790 GCN_DEBUG ("HSA_REGION_INFO_SIZE: %zu\n", size
);
792 GCN_WARNING ("HSA_REGION_INFO_SIZE: FAILED\n");
795 = hsa_fns
.hsa_region_get_info_fn (region
, HSA_REGION_INFO_ALLOC_MAX_SIZE
,
797 if (status
== HSA_STATUS_SUCCESS
)
798 GCN_DEBUG ("HSA_REGION_INFO_ALLOC_MAX_SIZE: %zu\n", size
);
800 GCN_WARNING ("HSA_REGION_INFO_ALLOC_MAX_SIZE: FAILED\n");
804 = hsa_fns
.hsa_region_get_info_fn (region
,
805 HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED
,
807 if (status
== HSA_STATUS_SUCCESS
)
808 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: %u\n", alloc_allowed
);
810 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: FAILED\n");
812 if (status
!= HSA_STATUS_SUCCESS
|| !alloc_allowed
)
813 return HSA_STATUS_SUCCESS
;
816 = hsa_fns
.hsa_region_get_info_fn (region
,
817 HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE
,
819 if (status
== HSA_STATUS_SUCCESS
)
820 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: %zu\n", size
);
822 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: FAILED\n");
826 = hsa_fns
.hsa_region_get_info_fn (region
,
827 HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT
,
829 if (status
== HSA_STATUS_SUCCESS
)
830 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: %zu\n", align
);
832 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: FAILED\n");
834 return HSA_STATUS_SUCCESS
;
837 /* Dump information about all the device memory regions. */
840 dump_hsa_regions (hsa_agent_t agent
)
843 status
= hsa_fns
.hsa_agent_iterate_regions_fn (agent
,
846 if (status
!= HSA_STATUS_SUCCESS
)
847 hsa_error ("Dumping hsa regions failed", status
);
850 /* Dump information about the available devices. */
853 dump_hsa_agent_info (hsa_agent_t agent
, void *data
__attribute__((unused
)))
858 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_NAME
,
860 if (status
== HSA_STATUS_SUCCESS
)
861 GCN_DEBUG ("HSA_AGENT_INFO_NAME: %s\n", buf
);
863 GCN_WARNING ("HSA_AGENT_INFO_NAME: FAILED\n");
865 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_VENDOR_NAME
,
867 if (status
== HSA_STATUS_SUCCESS
)
868 GCN_DEBUG ("HSA_AGENT_INFO_VENDOR_NAME: %s\n", buf
);
870 GCN_WARNING ("HSA_AGENT_INFO_VENDOR_NAME: FAILED\n");
872 hsa_machine_model_t machine_model
;
874 = hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_MACHINE_MODEL
,
876 if (status
== HSA_STATUS_SUCCESS
)
877 dump_machine_model (machine_model
, "HSA_AGENT_INFO_MACHINE_MODEL");
879 GCN_WARNING ("HSA_AGENT_INFO_MACHINE_MODEL: FAILED\n");
881 hsa_profile_t profile
;
882 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_PROFILE
,
884 if (status
== HSA_STATUS_SUCCESS
)
885 dump_profile (profile
, "HSA_AGENT_INFO_PROFILE");
887 GCN_WARNING ("HSA_AGENT_INFO_PROFILE: FAILED\n");
889 hsa_device_type_t device_type
;
890 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_DEVICE
,
892 if (status
== HSA_STATUS_SUCCESS
)
896 case HSA_DEVICE_TYPE_CPU
:
897 GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: CPU\n");
899 case HSA_DEVICE_TYPE_GPU
:
900 GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: GPU\n");
902 case HSA_DEVICE_TYPE_DSP
:
903 GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: DSP\n");
906 GCN_WARNING ("HSA_AGENT_INFO_DEVICE: UNKNOWN\n");
911 GCN_WARNING ("HSA_AGENT_INFO_DEVICE: FAILED\n");
914 status
= hsa_fns
.hsa_agent_get_info_fn
915 (agent
, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT
, &cu_count
);
916 if (status
== HSA_STATUS_SUCCESS
)
917 GCN_DEBUG ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: %u\n", cu_count
);
919 GCN_WARNING ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: FAILED\n");
922 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_WAVEFRONT_SIZE
,
924 if (status
== HSA_STATUS_SUCCESS
)
925 GCN_DEBUG ("HSA_AGENT_INFO_WAVEFRONT_SIZE: %u\n", size
);
927 GCN_WARNING ("HSA_AGENT_INFO_WAVEFRONT_SIZE: FAILED\n");
930 status
= hsa_fns
.hsa_agent_get_info_fn (agent
,
931 HSA_AGENT_INFO_WORKGROUP_MAX_DIM
,
933 if (status
== HSA_STATUS_SUCCESS
)
934 GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: %u\n", max_dim
);
936 GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: FAILED\n");
939 status
= hsa_fns
.hsa_agent_get_info_fn (agent
,
940 HSA_AGENT_INFO_WORKGROUP_MAX_SIZE
,
942 if (status
== HSA_STATUS_SUCCESS
)
943 GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: %u\n", max_size
);
945 GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: FAILED\n");
947 uint32_t grid_max_dim
;
948 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_GRID_MAX_DIM
,
950 if (status
== HSA_STATUS_SUCCESS
)
951 GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_DIM: %u\n", grid_max_dim
);
953 GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_DIM: FAILED\n");
955 uint32_t grid_max_size
;
956 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_GRID_MAX_SIZE
,
958 if (status
== HSA_STATUS_SUCCESS
)
959 GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_SIZE: %u\n", grid_max_size
);
961 GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_SIZE: FAILED\n");
963 dump_hsa_regions (agent
);
965 return HSA_STATUS_SUCCESS
;
968 /* Forward reference. */
970 static char *get_executable_symbol_name (hsa_executable_symbol_t symbol
);
972 /* Helper function for dump_executable_symbols. */
975 dump_executable_symbol (hsa_executable_t executable
,
976 hsa_executable_symbol_t symbol
,
977 void *data
__attribute__((unused
)))
979 char *name
= get_executable_symbol_name (symbol
);
983 GCN_DEBUG ("executable symbol: %s\n", name
);
987 return HSA_STATUS_SUCCESS
;
990 /* Dump all global symbol in an executable. */
993 dump_executable_symbols (hsa_executable_t executable
)
997 = hsa_fns
.hsa_executable_iterate_symbols_fn (executable
,
998 dump_executable_symbol
,
1000 if (status
!= HSA_STATUS_SUCCESS
)
1001 hsa_fatal ("Could not dump HSA executable symbols", status
);
1004 /* Dump kernel DISPATCH data structure and indent it by INDENT spaces. */
1007 print_kernel_dispatch (struct kernel_dispatch
*dispatch
, unsigned indent
)
1009 struct kernargs
*kernargs
= (struct kernargs
*)dispatch
->kernarg_address
;
1011 fprintf (stderr
, "%*sthis: %p\n", indent
, "", dispatch
);
1012 fprintf (stderr
, "%*squeue: %p\n", indent
, "", dispatch
->queue
);
1013 fprintf (stderr
, "%*skernarg_address: %p\n", indent
, "", kernargs
);
1014 fprintf (stderr
, "%*sheap address: %p\n", indent
, "",
1015 (void*)kernargs
->heap_ptr
);
1016 fprintf (stderr
, "%*sarena address: %p\n", indent
, "",
1017 (void*)kernargs
->arena_ptr
);
1018 fprintf (stderr
, "%*sobject: %lu\n", indent
, "", dispatch
->object
);
1019 fprintf (stderr
, "%*sprivate_segment_size: %u\n", indent
, "",
1020 dispatch
->private_segment_size
);
1021 fprintf (stderr
, "%*sgroup_segment_size: %u\n", indent
, "",
1022 dispatch
->group_segment_size
);
1023 fprintf (stderr
, "\n");
1027 /* {{{ Utility functions */
1029 /* Cast the thread local storage to gcn_thread. */
static inline struct gcn_thread *
gcn_thread (void)
{
  /* The plugin stores its per-thread data in the OpenACC thread slot.  */
  return (struct gcn_thread *) GOMP_PLUGIN_acc_thread ();
}
1037 /* Initialize debug and suppress_host_fallback according to the environment. */
1040 init_environment_variables (void)
1042 if (secure_getenv ("GCN_DEBUG"))
1047 if (secure_getenv ("GCN_SUPPRESS_HOST_FALLBACK"))
1048 suppress_host_fallback
= true;
1050 suppress_host_fallback
= false;
1052 hsa_runtime_lib
= secure_getenv ("HSA_RUNTIME_LIB");
1053 if (hsa_runtime_lib
== NULL
)
1054 hsa_runtime_lib
= HSA_RUNTIME_LIB
"libhsa-runtime64.so";
1056 support_cpu_devices
= secure_getenv ("GCN_SUPPORT_CPU_DEVICES");
1058 const char *x
= secure_getenv ("GCN_NUM_TEAMS");
1060 x
= secure_getenv ("GCN_NUM_GANGS");
1062 override_x_dim
= atoi (x
);
1064 const char *z
= secure_getenv ("GCN_NUM_THREADS");
1066 z
= secure_getenv ("GCN_NUM_WORKERS");
1068 override_z_dim
= atoi (z
);
1070 const char *heap
= secure_getenv ("GCN_HEAP_SIZE");
1073 size_t tmp
= atol (heap
);
1075 gcn_kernel_heap_size
= tmp
;
1079 /* Return malloc'd string with name of SYMBOL. */
1082 get_executable_symbol_name (hsa_executable_symbol_t symbol
)
1084 hsa_status_t status
;
1087 const hsa_executable_symbol_info_t info_name_length
1088 = HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH
;
1090 status
= hsa_fns
.hsa_executable_symbol_get_info_fn (symbol
, info_name_length
,
1092 if (status
!= HSA_STATUS_SUCCESS
)
1094 hsa_error ("Could not get length of symbol name", status
);
1098 res
= GOMP_PLUGIN_malloc (len
+ 1);
1100 const hsa_executable_symbol_info_t info_name
1101 = HSA_EXECUTABLE_SYMBOL_INFO_NAME
;
1103 status
= hsa_fns
.hsa_executable_symbol_get_info_fn (symbol
, info_name
, res
);
1105 if (status
!= HSA_STATUS_SUCCESS
)
1107 hsa_error ("Could not get symbol name", status
);
1117 /* Helper function for find_executable_symbol. */
1120 find_executable_symbol_1 (hsa_executable_t executable
,
1121 hsa_executable_symbol_t symbol
,
1124 hsa_executable_symbol_t
*res
= (hsa_executable_symbol_t
*)data
;
1126 return HSA_STATUS_INFO_BREAK
;
1129 /* Find a global symbol in EXECUTABLE, save to *SYMBOL and return true. If not
1130 found, return false. */
1133 find_executable_symbol (hsa_executable_t executable
,
1134 hsa_executable_symbol_t
*symbol
)
1136 hsa_status_t status
;
1139 = hsa_fns
.hsa_executable_iterate_symbols_fn (executable
,
1140 find_executable_symbol_1
,
1142 if (status
!= HSA_STATUS_INFO_BREAK
)
1144 hsa_error ("Could not find executable symbol", status
);
1151 /* Get the number of GPU Compute Units. */
1154 get_cu_count (struct agent_info
*agent
)
1157 hsa_status_t status
= hsa_fns
.hsa_agent_get_info_fn
1158 (agent
->id
, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT
, &cu_count
);
1159 if (status
== HSA_STATUS_SUCCESS
)
1162 return 64; /* The usual number for older devices. */
1165 /* Calculate the maximum grid size for OMP threads / OACC workers.
1166 This depends on the kernel's resource usage levels. */
static int
limit_worker_threads (int threads)
{
  /* FIXME: Do something more intelligent here.
     GCN can always run 4 threads within a Compute Unit, but
     more than that depends on register usage.  */
  if (threads > 16)
    threads = 16;
  return threads;
}
1179 /* Parse the target attributes INPUT provided by the compiler and return true
1180 if we should run anything all. If INPUT is NULL, fill DEF with default
1181 values, then store INPUT or DEF into *RESULT.
1183 This is used for OpenMP only. */
1186 parse_target_attributes (void **input
,
1187 struct GOMP_kernel_launch_attributes
*def
,
1188 struct GOMP_kernel_launch_attributes
**result
,
1189 struct agent_info
*agent
)
1192 GOMP_PLUGIN_fatal ("No target arguments provided");
1194 bool grid_attrs_found
= false;
1195 bool gcn_dims_found
= false;
1197 int gcn_threads
= 0;
1200 intptr_t id
= (intptr_t) *input
++, val
;
1202 if (id
& GOMP_TARGET_ARG_SUBSEQUENT_PARAM
)
1203 val
= (intptr_t) *input
++;
1205 val
= id
>> GOMP_TARGET_ARG_VALUE_SHIFT
;
1207 val
= (val
> INT_MAX
) ? INT_MAX
: val
;
1209 if ((id
& GOMP_TARGET_ARG_DEVICE_MASK
) == GOMP_DEVICE_GCN
1210 && ((id
& GOMP_TARGET_ARG_ID_MASK
)
1211 == GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES
))
1213 grid_attrs_found
= true;
1216 else if ((id
& GOMP_TARGET_ARG_DEVICE_ALL
) == GOMP_TARGET_ARG_DEVICE_ALL
)
1218 gcn_dims_found
= true;
1219 switch (id
& GOMP_TARGET_ARG_ID_MASK
)
1221 case GOMP_TARGET_ARG_NUM_TEAMS
:
1224 case GOMP_TARGET_ARG_THREAD_LIMIT
:
1225 gcn_threads
= limit_worker_threads (val
);
1235 if (agent
->gfx900_p
&& gcn_threads
== 0 && override_z_dim
== 0)
1238 GCN_WARNING ("VEGA BUG WORKAROUND: reducing default number of "
1239 "threads to 4 per team.\n");
1240 GCN_WARNING (" - If this is not a Vega 10 device, please use "
1241 "GCN_NUM_THREADS=16\n");
1245 /* Fiji has 64 CUs, but Vega20 has 60. */
1246 def
->gdims
[0] = (gcn_teams
> 0) ? gcn_teams
: get_cu_count (agent
);
1247 /* Each thread is 64 work items wide. */
1249 /* A work group can have 16 wavefronts. */
1250 def
->gdims
[2] = (gcn_threads
> 0) ? gcn_threads
: 16;
1251 def
->wdims
[0] = 1; /* Single team per work-group. */
1257 else if (!grid_attrs_found
)
1267 GCN_WARNING ("GOMP_OFFLOAD_run called with no launch attributes\n");
1271 struct GOMP_kernel_launch_attributes
*kla
;
1272 kla
= (struct GOMP_kernel_launch_attributes
*) *input
;
1274 if (kla
->ndim
== 0 || kla
->ndim
> 3)
1275 GOMP_PLUGIN_fatal ("Invalid number of dimensions (%u)", kla
->ndim
);
1277 GCN_DEBUG ("GOMP_OFFLOAD_run called with %u dimensions:\n", kla
->ndim
);
1279 for (i
= 0; i
< kla
->ndim
; i
++)
1281 GCN_DEBUG (" Dimension %u: grid size %u and group size %u\n", i
,
1282 kla
->gdims
[i
], kla
->wdims
[i
]);
1283 if (kla
->gdims
[i
] == 0)
1289 /* Return the group size given the requested GROUP size, GRID size and number
1290 of grid dimensions NDIM. */
1293 get_group_size (uint32_t ndim
, uint32_t grid
, uint32_t group
)
1297 /* TODO: Provide a default via environment or device characteristics. */
/* Atomically store a pair of uint16_t values (HEADER and REST) to a PACKET,
   with release semantics so all prior writes to the packet body are visible
   to the GPU before the header becomes valid.

   Note: REST must be widened to uint32_t BEFORE shifting.  A uint16_t is
   promoted to (signed) int, so for values >= 0x8000 the expression
   `rest << 16` would shift into the sign bit — undefined behavior per
   C11 6.5.7.  The explicit casts keep the arithmetic unsigned.  */

static void
packet_store_release (uint32_t* packet, uint16_t header, uint16_t rest)
{
  __atomic_store_n (packet, (uint32_t) header | ((uint32_t) rest << 16),
		    __ATOMIC_RELEASE);
}
1319 /* A never-called callback for the HSA command queues. These signal events
1320 that we don't use, so we trigger an error.
1322 This "queue" is not to be confused with the async queues, below. */
1325 hsa_queue_callback (hsa_status_t status
,
1326 hsa_queue_t
*queue
__attribute__ ((unused
)),
1327 void *data
__attribute__ ((unused
)))
1329 hsa_fatal ("Asynchronous queue error", status
);
1333 /* {{{ HSA initialization */
1335 /* Populate hsa_fns with the function addresses from libhsa-runtime64.so. */
1338 init_hsa_runtime_functions (void)
1340 #define DLSYM_FN(function) \
1341 hsa_fns.function##_fn = dlsym (handle, #function); \
1342 if (hsa_fns.function##_fn == NULL) \
1344 void *handle
= dlopen (hsa_runtime_lib
, RTLD_LAZY
);
1348 DLSYM_FN (hsa_status_string
)
1349 DLSYM_FN (hsa_system_get_info
)
1350 DLSYM_FN (hsa_agent_get_info
)
1352 DLSYM_FN (hsa_iterate_agents
)
1353 DLSYM_FN (hsa_region_get_info
)
1354 DLSYM_FN (hsa_queue_create
)
1355 DLSYM_FN (hsa_agent_iterate_regions
)
1356 DLSYM_FN (hsa_executable_destroy
)
1357 DLSYM_FN (hsa_executable_create
)
1358 DLSYM_FN (hsa_executable_global_variable_define
)
1359 DLSYM_FN (hsa_executable_load_code_object
)
1360 DLSYM_FN (hsa_executable_freeze
)
1361 DLSYM_FN (hsa_signal_create
)
1362 DLSYM_FN (hsa_memory_allocate
)
1363 DLSYM_FN (hsa_memory_assign_agent
)
1364 DLSYM_FN (hsa_memory_copy
)
1365 DLSYM_FN (hsa_memory_free
)
1366 DLSYM_FN (hsa_signal_destroy
)
1367 DLSYM_FN (hsa_executable_get_symbol
)
1368 DLSYM_FN (hsa_executable_symbol_get_info
)
1369 DLSYM_FN (hsa_executable_iterate_symbols
)
1370 DLSYM_FN (hsa_queue_add_write_index_release
)
1371 DLSYM_FN (hsa_queue_load_read_index_acquire
)
1372 DLSYM_FN (hsa_signal_wait_acquire
)
1373 DLSYM_FN (hsa_signal_store_relaxed
)
1374 DLSYM_FN (hsa_signal_store_release
)
1375 DLSYM_FN (hsa_signal_load_acquire
)
1376 DLSYM_FN (hsa_queue_destroy
)
1377 DLSYM_FN (hsa_code_object_deserialize
)
1382 /* Return true if the agent is a GPU and can accept of concurrent submissions
1383 from different threads. */
1386 suitable_hsa_agent_p (hsa_agent_t agent
)
1388 hsa_device_type_t device_type
;
1390 = hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_DEVICE
,
1392 if (status
!= HSA_STATUS_SUCCESS
)
1395 switch (device_type
)
1397 case HSA_DEVICE_TYPE_GPU
:
1399 case HSA_DEVICE_TYPE_CPU
:
1400 if (!support_cpu_devices
)
1407 uint32_t features
= 0;
1408 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_FEATURE
,
1410 if (status
!= HSA_STATUS_SUCCESS
1411 || !(features
& HSA_AGENT_FEATURE_KERNEL_DISPATCH
))
1413 hsa_queue_type_t queue_type
;
1414 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_QUEUE_TYPE
,
1416 if (status
!= HSA_STATUS_SUCCESS
1417 || (queue_type
!= HSA_QUEUE_TYPE_MULTI
))
1423 /* Callback of hsa_iterate_agents; if AGENT is a GPU device, increment
1424 agent_count in hsa_context. */
1427 count_gpu_agents (hsa_agent_t agent
, void *data
__attribute__ ((unused
)))
1429 if (suitable_hsa_agent_p (agent
))
1430 hsa_context
.agent_count
++;
1431 return HSA_STATUS_SUCCESS
;
1434 /* Callback of hsa_iterate_agents; if AGENT is a GPU device, assign the agent
1435 id to the describing structure in the hsa context. The index of the
1436 structure is pointed to by DATA, increment it afterwards. */
1439 assign_agent_ids (hsa_agent_t agent
, void *data
)
1441 if (suitable_hsa_agent_p (agent
))
1443 int *agent_index
= (int *) data
;
1444 hsa_context
.agents
[*agent_index
].id
= agent
;
1447 return HSA_STATUS_SUCCESS
;
1450 /* Initialize hsa_context if it has not already been done.
1451 Return TRUE on success. */
1454 init_hsa_context (void)
1456 hsa_status_t status
;
1457 int agent_index
= 0;
1459 if (hsa_context
.initialized
)
1461 init_environment_variables ();
1462 if (!init_hsa_runtime_functions ())
1464 GCN_WARNING ("Run-time could not be dynamically opened\n");
1465 if (suppress_host_fallback
)
1466 GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
1469 status
= hsa_fns
.hsa_init_fn ();
1470 if (status
!= HSA_STATUS_SUCCESS
)
1471 return hsa_error ("Run-time could not be initialized", status
);
1472 GCN_DEBUG ("HSA run-time initialized for GCN\n");
1475 dump_hsa_system_info ();
1477 status
= hsa_fns
.hsa_iterate_agents_fn (count_gpu_agents
, NULL
);
1478 if (status
!= HSA_STATUS_SUCCESS
)
1479 return hsa_error ("GCN GPU devices could not be enumerated", status
);
1480 GCN_DEBUG ("There are %i GCN GPU devices.\n", hsa_context
.agent_count
);
1483 = GOMP_PLUGIN_malloc_cleared (hsa_context
.agent_count
1484 * sizeof (struct agent_info
));
1485 status
= hsa_fns
.hsa_iterate_agents_fn (assign_agent_ids
, &agent_index
);
1486 if (agent_index
!= hsa_context
.agent_count
)
1488 GOMP_PLUGIN_error ("Failed to assign IDs to all GCN agents");
1494 status
= hsa_fns
.hsa_iterate_agents_fn (dump_hsa_agent_info
, NULL
);
1495 if (status
!= HSA_STATUS_SUCCESS
)
1496 GOMP_PLUGIN_error ("Failed to list all HSA runtime agents");
1499 hsa_context
.initialized
= true;
1503 /* Verify that hsa_context has already been initialized and return the
1504 agent_info structure describing device number N. Return NULL on error. */
1506 static struct agent_info
*
1507 get_agent_info (int n
)
1509 if (!hsa_context
.initialized
)
1511 GOMP_PLUGIN_error ("Attempt to use uninitialized GCN context.");
1514 if (n
>= hsa_context
.agent_count
)
1516 GOMP_PLUGIN_error ("Request to operate on non-existent GCN device %i", n
);
1519 if (!hsa_context
.agents
[n
].initialized
)
1521 GOMP_PLUGIN_error ("Attempt to use an uninitialized GCN agent.");
1524 return &hsa_context
.agents
[n
];
1527 /* Callback of hsa_agent_iterate_regions, via get_*_memory_region functions.
1529 Selects (breaks at) a suitable region of type KIND. */
1532 get_memory_region (hsa_region_t region
, hsa_region_t
*retval
,
1533 hsa_region_global_flag_t kind
)
1535 hsa_status_t status
;
1536 hsa_region_segment_t segment
;
1538 status
= hsa_fns
.hsa_region_get_info_fn (region
, HSA_REGION_INFO_SEGMENT
,
1540 if (status
!= HSA_STATUS_SUCCESS
)
1542 if (segment
!= HSA_REGION_SEGMENT_GLOBAL
)
1543 return HSA_STATUS_SUCCESS
;
1546 status
= hsa_fns
.hsa_region_get_info_fn (region
, HSA_REGION_INFO_GLOBAL_FLAGS
,
1548 if (status
!= HSA_STATUS_SUCCESS
)
1553 return HSA_STATUS_INFO_BREAK
;
1555 return HSA_STATUS_SUCCESS
;
1558 /* Callback of hsa_agent_iterate_regions.
1560 Selects a kernargs memory region. */
1563 get_kernarg_memory_region (hsa_region_t region
, void *data
)
1565 return get_memory_region (region
, (hsa_region_t
*)data
,
1566 HSA_REGION_GLOBAL_FLAG_KERNARG
);
1569 /* Callback of hsa_agent_iterate_regions.
1571 Selects a coarse-grained memory region suitable for the heap and
1575 get_data_memory_region (hsa_region_t region
, void *data
)
1577 return get_memory_region (region
, (hsa_region_t
*)data
,
1578 HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED
);
1584 /* Create or reuse a team arena.
1586 Team arenas are used by OpenMP to avoid calling malloc multiple times
1587 while setting up each team. This is purely a performance optimization.
1589 Allocating an arena also costs performance, albeit on the host side, so
1590 this function will reuse an existing arena if a large enough one is idle.
1591 The arena is released, but not deallocated, when the kernel exits. */
1594 get_team_arena (struct agent_info
*agent
, int num_teams
)
1596 struct team_arena_list
**next_ptr
= &agent
->team_arena_list
;
1597 struct team_arena_list
*item
;
1599 for (item
= *next_ptr
; item
; next_ptr
= &item
->next
, item
= item
->next
)
1601 if (item
->num_teams
< num_teams
)
1604 if (pthread_mutex_trylock (&item
->in_use
))
1610 GCN_DEBUG ("Creating a new arena for %d teams\n", num_teams
);
1612 if (pthread_mutex_lock (&agent
->team_arena_write_lock
))
1614 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
1617 item
= malloc (sizeof (*item
));
1618 item
->num_teams
= num_teams
;
1622 if (pthread_mutex_init (&item
->in_use
, NULL
))
1624 GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
1627 if (pthread_mutex_lock (&item
->in_use
))
1629 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
1632 if (pthread_mutex_unlock (&agent
->team_arena_write_lock
))
1634 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1638 const int TEAM_ARENA_SIZE
= 64*1024; /* Must match libgomp.h. */
1639 hsa_status_t status
;
1640 status
= hsa_fns
.hsa_memory_allocate_fn (agent
->data_region
,
1641 TEAM_ARENA_SIZE
*num_teams
,
1643 if (status
!= HSA_STATUS_SUCCESS
)
1644 hsa_fatal ("Could not allocate memory for GCN kernel arena", status
);
1645 status
= hsa_fns
.hsa_memory_assign_agent_fn (item
->arena
, agent
->id
,
1646 HSA_ACCESS_PERMISSION_RW
);
1647 if (status
!= HSA_STATUS_SUCCESS
)
1648 hsa_fatal ("Could not assign arena memory to device", status
);
1653 /* Mark a team arena available for reuse. */
1656 release_team_arena (struct agent_info
* agent
, void *arena
)
1658 struct team_arena_list
*item
;
1660 for (item
= agent
->team_arena_list
; item
; item
= item
->next
)
1662 if (item
->arena
== arena
)
1664 if (pthread_mutex_unlock (&item
->in_use
))
1665 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1669 GOMP_PLUGIN_error ("Could not find a GCN arena to release.");
1672 /* Clean up all the allocated team arenas. */
1675 destroy_team_arenas (struct agent_info
*agent
)
1677 struct team_arena_list
*item
, *next
;
1679 for (item
= agent
->team_arena_list
; item
; item
= next
)
1682 hsa_fns
.hsa_memory_free_fn (item
->arena
);
1683 if (pthread_mutex_destroy (&item
->in_use
))
1685 GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex");
1690 agent
->team_arena_list
= NULL
;
1695 /* Allocate memory on a specified device. */
1698 alloc_by_agent (struct agent_info
*agent
, size_t size
)
1700 GCN_DEBUG ("Allocating %zu bytes on device %d\n", size
, agent
->device_id
);
1702 /* Zero-size allocations are invalid, so in order to return a valid pointer
1703 we need to pass a valid size. One source of zero-size allocations is
1704 kernargs for kernels that have no inputs or outputs (the kernel may
1705 only use console output, for example). */
1710 hsa_status_t status
= hsa_fns
.hsa_memory_allocate_fn (agent
->data_region
,
1712 if (status
!= HSA_STATUS_SUCCESS
)
1714 hsa_error ("Could not allocate device memory", status
);
1718 status
= hsa_fns
.hsa_memory_assign_agent_fn (ptr
, agent
->id
,
1719 HSA_ACCESS_PERMISSION_RW
);
1720 if (status
!= HSA_STATUS_SUCCESS
)
1722 hsa_error ("Could not assign data memory to device", status
);
1726 struct goacc_thread
*thr
= GOMP_PLUGIN_goacc_thread ();
1727 bool profiling_dispatch_p
1728 = __builtin_expect (thr
!= NULL
&& thr
->prof_info
!= NULL
, false);
1729 if (profiling_dispatch_p
)
1731 acc_prof_info
*prof_info
= thr
->prof_info
;
1732 acc_event_info data_event_info
;
1733 acc_api_info
*api_info
= thr
->api_info
;
1735 prof_info
->event_type
= acc_ev_alloc
;
1737 data_event_info
.data_event
.event_type
= prof_info
->event_type
;
1738 data_event_info
.data_event
.valid_bytes
1739 = _ACC_DATA_EVENT_INFO_VALID_BYTES
;
1740 data_event_info
.data_event
.parent_construct
1741 = acc_construct_parallel
;
1742 data_event_info
.data_event
.implicit
= 1;
1743 data_event_info
.data_event
.tool_info
= NULL
;
1744 data_event_info
.data_event
.var_name
= NULL
;
1745 data_event_info
.data_event
.bytes
= size
;
1746 data_event_info
.data_event
.host_ptr
= NULL
;
1747 data_event_info
.data_event
.device_ptr
= (void *) ptr
;
1749 api_info
->device_api
= acc_device_api_other
;
1751 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
, &data_event_info
,
1758 /* Create kernel dispatch data structure for given KERNEL, along with
1759 the necessary device signals and memory allocations. */
1761 static struct kernel_dispatch
*
1762 create_kernel_dispatch (struct kernel_info
*kernel
, int num_teams
)
1764 struct agent_info
*agent
= kernel
->agent
;
1765 struct kernel_dispatch
*shadow
1766 = GOMP_PLUGIN_malloc_cleared (sizeof (struct kernel_dispatch
));
1768 shadow
->agent
= kernel
->agent
;
1769 shadow
->object
= kernel
->object
;
1771 hsa_signal_t sync_signal
;
1772 hsa_status_t status
= hsa_fns
.hsa_signal_create_fn (1, 0, NULL
, &sync_signal
);
1773 if (status
!= HSA_STATUS_SUCCESS
)
1774 hsa_fatal ("Error creating the GCN sync signal", status
);
1776 shadow
->signal
= sync_signal
.handle
;
1777 shadow
->private_segment_size
= kernel
->private_segment_size
;
1778 shadow
->group_segment_size
= kernel
->group_segment_size
;
1780 /* We expect kernels to request a single pointer, explicitly, and the
1781 rest of struct kernargs, implicitly. If they request anything else
1782 then something is wrong. */
1783 if (kernel
->kernarg_segment_size
> 8)
1785 GOMP_PLUGIN_fatal ("Unexpectedly large kernargs segment requested");
1789 status
= hsa_fns
.hsa_memory_allocate_fn (agent
->kernarg_region
,
1790 sizeof (struct kernargs
),
1791 &shadow
->kernarg_address
);
1792 if (status
!= HSA_STATUS_SUCCESS
)
1793 hsa_fatal ("Could not allocate memory for GCN kernel arguments", status
);
1794 struct kernargs
*kernargs
= shadow
->kernarg_address
;
1796 /* Zero-initialize the output_data (minimum needed). */
1797 kernargs
->out_ptr
= (int64_t)&kernargs
->output_data
;
1798 kernargs
->output_data
.next_output
= 0;
1799 for (unsigned i
= 0;
1800 i
< (sizeof (kernargs
->output_data
.queue
)
1801 / sizeof (kernargs
->output_data
.queue
[0]));
1803 kernargs
->output_data
.queue
[i
].written
= 0;
1804 kernargs
->output_data
.consumed
= 0;
1806 /* Pass in the heap location. */
1807 kernargs
->heap_ptr
= (int64_t)kernel
->module
->heap
;
1809 /* Create an arena. */
1810 if (kernel
->kind
== KIND_OPENMP
)
1811 kernargs
->arena_ptr
= (int64_t)get_team_arena (agent
, num_teams
);
1813 kernargs
->arena_ptr
= 0;
1815 /* Ensure we can recognize unset return values. */
1816 kernargs
->output_data
.return_value
= 0xcafe0000;
1821 /* Output any data written to console output from the kernel. It is expected
1822 that this function is polled during kernel execution.
1824 We print all entries from the last item printed to the next entry without
1825 a "written" flag. If the "final" flag is set then it'll continue right to
1828 The print buffer is circular, but the from and to locations don't wrap when
1829 the buffer does, so the output limit is UINT_MAX. The target blocks on
1830 output when the buffer is full. */
1833 console_output (struct kernel_info
*kernel
, struct kernargs
*kernargs
,
1836 unsigned int limit
= (sizeof (kernargs
->output_data
.queue
)
1837 / sizeof (kernargs
->output_data
.queue
[0]));
1839 unsigned int from
= __atomic_load_n (&kernargs
->output_data
.consumed
,
1841 unsigned int to
= kernargs
->output_data
.next_output
;
1847 printf ("GCN print buffer overflowed.\n");
1852 for (i
= from
; i
< to
; i
++)
1854 struct printf_data
*data
= &kernargs
->output_data
.queue
[i
%limit
];
1856 if (!data
->written
&& !final
)
1861 case 0: printf ("%.128s%ld\n", data
->msg
, data
->ivalue
); break;
1862 case 1: printf ("%.128s%f\n", data
->msg
, data
->dvalue
); break;
1863 case 2: printf ("%.128s%.128s\n", data
->msg
, data
->text
); break;
1864 case 3: printf ("%.128s%.128s", data
->msg
, data
->text
); break;
1865 default: printf ("GCN print buffer error!\n"); break;
1868 __atomic_store_n (&kernargs
->output_data
.consumed
, i
+1,
1874 /* Release data structure created for a kernel dispatch in SHADOW argument,
1875 and clean up the signal and memory allocations. */
1878 release_kernel_dispatch (struct kernel_dispatch
*shadow
)
1880 GCN_DEBUG ("Released kernel dispatch: %p\n", shadow
);
1882 struct kernargs
*kernargs
= shadow
->kernarg_address
;
1883 void *arena
= (void *)kernargs
->arena_ptr
;
1885 release_team_arena (shadow
->agent
, arena
);
1887 hsa_fns
.hsa_memory_free_fn (shadow
->kernarg_address
);
1890 s
.handle
= shadow
->signal
;
1891 hsa_fns
.hsa_signal_destroy_fn (s
);
1896 /* Extract the properties from a kernel binary. */
1899 init_kernel_properties (struct kernel_info
*kernel
)
1901 hsa_status_t status
;
1902 struct agent_info
*agent
= kernel
->agent
;
1903 hsa_executable_symbol_t kernel_symbol
;
1904 status
= hsa_fns
.hsa_executable_get_symbol_fn (agent
->executable
, NULL
,
1905 kernel
->name
, agent
->id
,
1907 if (status
!= HSA_STATUS_SUCCESS
)
1909 hsa_warn ("Could not find symbol for kernel in the code object", status
);
1910 fprintf (stderr
, "not found name: '%s'\n", kernel
->name
);
1911 dump_executable_symbols (agent
->executable
);
1914 GCN_DEBUG ("Located kernel %s\n", kernel
->name
);
1915 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
1916 (kernel_symbol
, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT
, &kernel
->object
);
1917 if (status
!= HSA_STATUS_SUCCESS
)
1918 hsa_fatal ("Could not extract a kernel object from its symbol", status
);
1919 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
1920 (kernel_symbol
, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE
,
1921 &kernel
->kernarg_segment_size
);
1922 if (status
!= HSA_STATUS_SUCCESS
)
1923 hsa_fatal ("Could not get info about kernel argument size", status
);
1924 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
1925 (kernel_symbol
, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE
,
1926 &kernel
->group_segment_size
);
1927 if (status
!= HSA_STATUS_SUCCESS
)
1928 hsa_fatal ("Could not get info about kernel group segment size", status
);
1929 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
1930 (kernel_symbol
, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE
,
1931 &kernel
->private_segment_size
);
1932 if (status
!= HSA_STATUS_SUCCESS
)
1933 hsa_fatal ("Could not get info about kernel private segment size",
1936 /* The kernel type is not known until something tries to launch it. */
1937 kernel
->kind
= KIND_UNKNOWN
;
1939 GCN_DEBUG ("Kernel structure for %s fully initialized with "
1940 "following segment sizes: \n", kernel
->name
);
1941 GCN_DEBUG (" group_segment_size: %u\n",
1942 (unsigned) kernel
->group_segment_size
);
1943 GCN_DEBUG (" private_segment_size: %u\n",
1944 (unsigned) kernel
->private_segment_size
);
1945 GCN_DEBUG (" kernarg_segment_size: %u\n",
1946 (unsigned) kernel
->kernarg_segment_size
);
1950 kernel
->initialization_failed
= true;
1953 /* Do all the work that is necessary before running KERNEL for the first time.
1954 The function assumes the program has been created, finalized and frozen by
1955 create_and_finalize_hsa_program. */
1958 init_kernel (struct kernel_info
*kernel
)
1960 if (pthread_mutex_lock (&kernel
->init_mutex
))
1961 GOMP_PLUGIN_fatal ("Could not lock a GCN kernel initialization mutex");
1962 if (kernel
->initialized
)
1964 if (pthread_mutex_unlock (&kernel
->init_mutex
))
1965 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
1971 init_kernel_properties (kernel
);
1973 if (!kernel
->initialization_failed
)
1977 kernel
->initialized
= true;
1979 if (pthread_mutex_unlock (&kernel
->init_mutex
))
1980 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
1984 /* Run KERNEL on its agent, pass VARS to it as arguments and take
1985 launch attributes from KLA.
1987 MODULE_LOCKED indicates that the caller already holds the lock and
1988 run_kernel need not lock it again.
1989 If AQ is NULL then agent->sync_queue will be used. */
1992 run_kernel (struct kernel_info
*kernel
, void *vars
,
1993 struct GOMP_kernel_launch_attributes
*kla
,
1994 struct goacc_asyncqueue
*aq
, bool module_locked
)
1996 GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel
->agent
->device_id
,
1998 GCN_DEBUG ("GCN launch attribs: gdims:[");
2000 for (i
= 0; i
< kla
->ndim
; ++i
)
2004 DEBUG_PRINT ("%u", kla
->gdims
[i
]);
2006 DEBUG_PRINT ("], normalized gdims:[");
2007 for (i
= 0; i
< kla
->ndim
; ++i
)
2011 DEBUG_PRINT ("%u", kla
->gdims
[i
] / kla
->wdims
[i
]);
2013 DEBUG_PRINT ("], wdims:[");
2014 for (i
= 0; i
< kla
->ndim
; ++i
)
2018 DEBUG_PRINT ("%u", kla
->wdims
[i
]);
2020 DEBUG_PRINT ("]\n");
2023 struct agent_info
*agent
= kernel
->agent
;
2024 if (!module_locked
&& pthread_rwlock_rdlock (&agent
->module_rwlock
))
2025 GOMP_PLUGIN_fatal ("Unable to read-lock a GCN agent rwlock");
2027 if (!agent
->initialized
)
2028 GOMP_PLUGIN_fatal ("Agent must be initialized");
2030 if (!kernel
->initialized
)
2031 GOMP_PLUGIN_fatal ("Called kernel must be initialized");
2033 hsa_queue_t
*command_q
= (aq
? aq
->hsa_queue
: kernel
->agent
->sync_queue
);
2036 = hsa_fns
.hsa_queue_add_write_index_release_fn (command_q
, 1);
2037 GCN_DEBUG ("Got AQL index %llu\n", (long long int) index
);
2039 /* Wait until the queue is not full before writing the packet. */
2040 while (index
- hsa_fns
.hsa_queue_load_read_index_acquire_fn (command_q
)
2044 /* Do not allow the dimensions to be overridden when running
2045 constructors or destructors. */
2046 int override_x
= kernel
->kind
== KIND_UNKNOWN
? 0 : override_x_dim
;
2047 int override_z
= kernel
->kind
== KIND_UNKNOWN
? 0 : override_z_dim
;
2049 hsa_kernel_dispatch_packet_t
*packet
;
2050 packet
= ((hsa_kernel_dispatch_packet_t
*) command_q
->base_address
)
2051 + index
% command_q
->size
;
2053 memset (((uint8_t *) packet
) + 4, 0, sizeof (*packet
) - 4);
2054 packet
->grid_size_x
= override_x
? : kla
->gdims
[0];
2055 packet
->workgroup_size_x
= get_group_size (kla
->ndim
,
2056 packet
->grid_size_x
,
2061 packet
->grid_size_y
= kla
->gdims
[1];
2062 packet
->workgroup_size_y
= get_group_size (kla
->ndim
, kla
->gdims
[1],
2067 packet
->grid_size_y
= 1;
2068 packet
->workgroup_size_y
= 1;
2073 packet
->grid_size_z
= limit_worker_threads (override_z
2075 packet
->workgroup_size_z
= get_group_size (kla
->ndim
,
2076 packet
->grid_size_z
,
2081 packet
->grid_size_z
= 1;
2082 packet
->workgroup_size_z
= 1;
2085 GCN_DEBUG ("GCN launch actuals: grid:[%u, %u, %u],"
2086 " normalized grid:[%u, %u, %u], workgroup:[%u, %u, %u]\n",
2087 packet
->grid_size_x
, packet
->grid_size_y
, packet
->grid_size_z
,
2088 packet
->grid_size_x
/ packet
->workgroup_size_x
,
2089 packet
->grid_size_y
/ packet
->workgroup_size_y
,
2090 packet
->grid_size_z
/ packet
->workgroup_size_z
,
2091 packet
->workgroup_size_x
, packet
->workgroup_size_y
,
2092 packet
->workgroup_size_z
);
2094 struct kernel_dispatch
*shadow
2095 = create_kernel_dispatch (kernel
, packet
->grid_size_x
);
2096 shadow
->queue
= command_q
;
2100 fprintf (stderr
, "\nKernel has following dependencies:\n");
2101 print_kernel_dispatch (shadow
, 2);
2104 packet
->private_segment_size
= kernel
->private_segment_size
;
2105 packet
->group_segment_size
= kernel
->group_segment_size
;
2106 packet
->kernel_object
= kernel
->object
;
2107 packet
->kernarg_address
= shadow
->kernarg_address
;
2109 s
.handle
= shadow
->signal
;
2110 packet
->completion_signal
= s
;
2111 hsa_fns
.hsa_signal_store_relaxed_fn (s
, 1);
2112 memcpy (shadow
->kernarg_address
, &vars
, sizeof (vars
));
2114 GCN_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");
2117 header
= HSA_PACKET_TYPE_KERNEL_DISPATCH
<< HSA_PACKET_HEADER_TYPE
;
2118 header
|= HSA_FENCE_SCOPE_SYSTEM
<< HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE
;
2119 header
|= HSA_FENCE_SCOPE_SYSTEM
<< HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE
;
2121 GCN_DEBUG ("Going to dispatch kernel %s on device %d\n", kernel
->name
,
2124 packet_store_release ((uint32_t *) packet
, header
,
2125 (uint16_t) kla
->ndim
2126 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS
);
2128 hsa_fns
.hsa_signal_store_release_fn (command_q
->doorbell_signal
,
2131 GCN_DEBUG ("Kernel dispatched, waiting for completion\n");
2133 /* Root signal waits with 1ms timeout. */
2134 while (hsa_fns
.hsa_signal_wait_acquire_fn (s
, HSA_SIGNAL_CONDITION_LT
, 1,
2136 HSA_WAIT_STATE_BLOCKED
) != 0)
2138 console_output (kernel
, shadow
->kernarg_address
, false);
2140 console_output (kernel
, shadow
->kernarg_address
, true);
2142 struct kernargs
*kernargs
= shadow
->kernarg_address
;
2143 unsigned int return_value
= (unsigned int)kernargs
->output_data
.return_value
;
2145 release_kernel_dispatch (shadow
);
2147 if (!module_locked
&& pthread_rwlock_unlock (&agent
->module_rwlock
))
2148 GOMP_PLUGIN_fatal ("Unable to unlock a GCN agent rwlock");
2150 unsigned int upper
= (return_value
& ~0xffff) >> 16;
2151 if (upper
== 0xcafe)
2152 ; // exit not called, normal termination.
2153 else if (upper
== 0xffff)
2157 GOMP_PLUGIN_error ("Possible kernel exit value corruption, 2 most"
2158 " significant bytes aren't 0xffff or 0xcafe: 0x%x\n",
2163 if (upper
== 0xffff)
2165 unsigned int signal
= (return_value
>> 8) & 0xff;
2167 if (signal
== SIGABRT
)
2169 GCN_WARNING ("GCN Kernel aborted\n");
2172 else if (signal
!= 0)
2174 GCN_WARNING ("GCN Kernel received unknown signal\n");
2178 GCN_DEBUG ("GCN Kernel exited with value: %d\n", return_value
& 0xff);
2179 exit (return_value
& 0xff);
2184 /* {{{ Load/Unload */
2186 /* Initialize KERNEL from D and other parameters. Return true on success. */
2189 init_basic_kernel_info (struct kernel_info
*kernel
,
2190 struct hsa_kernel_description
*d
,
2191 struct agent_info
*agent
,
2192 struct module_info
*module
)
2194 kernel
->agent
= agent
;
2195 kernel
->module
= module
;
2196 kernel
->name
= d
->name
;
2197 if (pthread_mutex_init (&kernel
->init_mutex
, NULL
))
2199 GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");
2205 /* Find the load_offset for MODULE, save to *LOAD_OFFSET, and return true. If
2206 not found, return false. */
2209 find_load_offset (Elf64_Addr
*load_offset
, struct agent_info
*agent
,
2210 struct module_info
*module
, Elf64_Ehdr
*image
,
2211 Elf64_Shdr
*sections
)
2215 hsa_status_t status
;
2217 hsa_executable_symbol_t symbol
;
2218 if (!find_executable_symbol (agent
->executable
, &symbol
))
2221 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
2222 (symbol
, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS
, load_offset
);
2223 if (status
!= HSA_STATUS_SUCCESS
)
2225 hsa_error ("Could not extract symbol address", status
);
2229 char *symbol_name
= get_executable_symbol_name (symbol
);
2230 if (symbol_name
== NULL
)
2233 /* Find the kernel function in ELF, and calculate actual load offset. */
2234 for (int i
= 0; i
< image
->e_shnum
; i
++)
2235 if (sections
[i
].sh_type
== SHT_SYMTAB
)
2237 Elf64_Shdr
*strtab
= §ions
[sections
[i
].sh_link
];
2238 char *strings
= (char *)image
+ strtab
->sh_offset
;
2240 for (size_t offset
= 0;
2241 offset
< sections
[i
].sh_size
;
2242 offset
+= sections
[i
].sh_entsize
)
2244 Elf64_Sym
*sym
= (Elf64_Sym
*)((char*)image
2245 + sections
[i
].sh_offset
2247 if (strcmp (symbol_name
, strings
+ sym
->st_name
) == 0)
2249 *load_offset
-= sym
->st_value
;
2260 /* Create and finalize the program consisting of all loaded modules. */
2263 create_and_finalize_hsa_program (struct agent_info
*agent
)
2265 hsa_status_t status
;
2266 int reloc_count
= 0;
2268 if (pthread_mutex_lock (&agent
->prog_mutex
))
2270 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
2273 if (agent
->prog_finalized
)
2277 = hsa_fns
.hsa_executable_create_fn (HSA_PROFILE_FULL
,
2278 HSA_EXECUTABLE_STATE_UNFROZEN
,
2279 "", &agent
->executable
);
2280 if (status
!= HSA_STATUS_SUCCESS
)
2282 hsa_error ("Could not create GCN executable", status
);
2286 /* Load any GCN modules. */
2287 struct module_info
*module
= agent
->module
;
2290 Elf64_Ehdr
*image
= (Elf64_Ehdr
*)module
->image_desc
->gcn_image
->image
;
2292 /* Hide relocations from the HSA runtime loader.
2293 Keep a copy of the unmodified section headers to use later. */
2294 Elf64_Shdr
*image_sections
= (Elf64_Shdr
*)((char *)image
2296 for (int i
= image
->e_shnum
- 1; i
>= 0; i
--)
2298 if (image_sections
[i
].sh_type
== SHT_RELA
2299 || image_sections
[i
].sh_type
== SHT_REL
)
2300 /* Change section type to something harmless. */
2301 image_sections
[i
].sh_type
|= 0x80;
2304 hsa_code_object_t co
= { 0 };
2305 status
= hsa_fns
.hsa_code_object_deserialize_fn
2306 (module
->image_desc
->gcn_image
->image
,
2307 module
->image_desc
->gcn_image
->size
,
2309 if (status
!= HSA_STATUS_SUCCESS
)
2311 hsa_error ("Could not deserialize GCN code object", status
);
2315 status
= hsa_fns
.hsa_executable_load_code_object_fn
2316 (agent
->executable
, agent
->id
, co
, "");
2317 if (status
!= HSA_STATUS_SUCCESS
)
2319 hsa_error ("Could not load GCN code object", status
);
2325 status
= hsa_fns
.hsa_memory_allocate_fn (agent
->data_region
,
2326 gcn_kernel_heap_size
,
2327 (void**)&module
->heap
);
2328 if (status
!= HSA_STATUS_SUCCESS
)
2330 hsa_error ("Could not allocate memory for GCN heap", status
);
2334 status
= hsa_fns
.hsa_memory_assign_agent_fn
2335 (module
->heap
, agent
->id
, HSA_ACCESS_PERMISSION_RW
);
2336 if (status
!= HSA_STATUS_SUCCESS
)
2338 hsa_error ("Could not assign GCN heap memory to device", status
);
2342 hsa_fns
.hsa_memory_copy_fn (&module
->heap
->size
,
2343 &gcn_kernel_heap_size
,
2344 sizeof (gcn_kernel_heap_size
));
2350 dump_executable_symbols (agent
->executable
);
2352 status
= hsa_fns
.hsa_executable_freeze_fn (agent
->executable
, "");
2353 if (status
!= HSA_STATUS_SUCCESS
)
2355 hsa_error ("Could not freeze the GCN executable", status
);
2361 struct module_info
*module
= agent
->module
;
2362 Elf64_Ehdr
*image
= (Elf64_Ehdr
*)module
->image_desc
->gcn_image
->image
;
2363 Elf64_Shdr
*sections
= (Elf64_Shdr
*)((char *)image
+ image
->e_shoff
);
2365 Elf64_Addr load_offset
;
2366 if (!find_load_offset (&load_offset
, agent
, module
, image
, sections
))
2369 /* Record the physical load address range.
2370 We need this for data copies later. */
2371 Elf64_Phdr
*segments
= (Elf64_Phdr
*)((char*)image
+ image
->e_phoff
);
2372 Elf64_Addr low
= ~0, high
= 0;
2373 for (int i
= 0; i
< image
->e_phnum
; i
++)
2374 if (segments
[i
].p_memsz
> 0)
2376 if (segments
[i
].p_paddr
< low
)
2377 low
= segments
[i
].p_paddr
;
2378 if (segments
[i
].p_paddr
> high
)
2379 high
= segments
[i
].p_paddr
+ segments
[i
].p_memsz
- 1;
2381 module
->phys_address_start
= low
+ load_offset
;
2382 module
->phys_address_end
= high
+ load_offset
;
2384 // Find dynamic symbol table
2385 Elf64_Shdr
*dynsym
= NULL
;
2386 for (int i
= 0; i
< image
->e_shnum
; i
++)
2387 if (sections
[i
].sh_type
== SHT_DYNSYM
)
2389 dynsym
= §ions
[i
];
2393 /* Fix up relocations. */
2394 for (int i
= 0; i
< image
->e_shnum
; i
++)
2396 if (sections
[i
].sh_type
== (SHT_RELA
| 0x80))
2397 for (size_t offset
= 0;
2398 offset
< sections
[i
].sh_size
;
2399 offset
+= sections
[i
].sh_entsize
)
2401 Elf64_Rela
*reloc
= (Elf64_Rela
*)((char*)image
2402 + sections
[i
].sh_offset
2406 ? (Elf64_Sym
*)((char*)image
2408 + (dynsym
->sh_entsize
2409 * ELF64_R_SYM (reloc
->r_info
)))
2412 int64_t S
= (sym
? sym
->st_value
: 0);
2413 int64_t P
= reloc
->r_offset
+ load_offset
;
2414 int64_t A
= reloc
->r_addend
;
2415 int64_t B
= load_offset
;
2417 switch (ELF64_R_TYPE (reloc
->r_info
))
2419 case R_AMDGPU_ABS32_LO
:
2420 V
= (S
+ A
) & 0xFFFFFFFF;
2423 case R_AMDGPU_ABS32_HI
:
2427 case R_AMDGPU_ABS64
:
2431 case R_AMDGPU_REL32
:
2435 case R_AMDGPU_REL64
:
2437 LLD seems to emit REL64 where the the assembler has
2438 ABS64. This is clearly wrong because it's not what the
2439 compiler is expecting. Let's assume, for now, that
2440 it's a bug. In any case, GCN kernels are always self
2441 contained and therefore relative relocations will have
2442 been resolved already, so this should be a safe
2447 case R_AMDGPU_ABS32
:
2451 /* TODO R_AMDGPU_GOTPCREL */
2452 /* TODO R_AMDGPU_GOTPCREL32_LO */
2453 /* TODO R_AMDGPU_GOTPCREL32_HI */
2454 case R_AMDGPU_REL32_LO
:
2455 V
= (S
+ A
- P
) & 0xFFFFFFFF;
2458 case R_AMDGPU_REL32_HI
:
2459 V
= (S
+ A
- P
) >> 32;
2462 case R_AMDGPU_RELATIVE64
:
2467 fprintf (stderr
, "Error: unsupported relocation type.\n");
2470 status
= hsa_fns
.hsa_memory_copy_fn ((void*)P
, &V
, size
);
2471 if (status
!= HSA_STATUS_SUCCESS
)
2473 hsa_error ("Failed to fix up relocation", status
);
2481 GCN_DEBUG ("Loaded GCN kernels to device %d (%d relocations)\n",
2482 agent
->device_id
, reloc_count
);
2485 agent
->prog_finalized
= true;
2487 if (pthread_mutex_unlock (&agent
->prog_mutex
))
2489 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
2500 /* Free the HSA program in agent and everything associated with it and set
2501 agent->prog_finalized and the initialized flags of all kernels to false.
2502 Return TRUE on success. */
2505 destroy_hsa_program (struct agent_info
*agent
)
2507 if (!agent
->prog_finalized
)
2510 hsa_status_t status
;
2512 GCN_DEBUG ("Destroying the current GCN program.\n");
2514 status
= hsa_fns
.hsa_executable_destroy_fn (agent
->executable
);
2515 if (status
!= HSA_STATUS_SUCCESS
)
2516 return hsa_error ("Could not destroy GCN executable", status
);
2521 for (i
= 0; i
< agent
->module
->kernel_count
; i
++)
2522 agent
->module
->kernels
[i
].initialized
= false;
2524 if (agent
->module
->heap
)
2526 hsa_fns
.hsa_memory_free_fn (agent
->module
->heap
);
2527 agent
->module
->heap
= NULL
;
2530 agent
->prog_finalized
= false;
2534 /* Deinitialize all information associated with MODULE and kernels within
2535 it. Return TRUE on success. */
2538 destroy_module (struct module_info
*module
, bool locked
)
2540 /* Run destructors before destroying module. */
2541 struct GOMP_kernel_launch_attributes kla
=
2545 /* Work-group size. */
2549 if (module
->fini_array_func
)
2551 init_kernel (module
->fini_array_func
);
2552 run_kernel (module
->fini_array_func
, NULL
, &kla
, NULL
, locked
);
2554 module
->constructors_run_p
= false;
2557 for (i
= 0; i
< module
->kernel_count
; i
++)
2558 if (pthread_mutex_destroy (&module
->kernels
[i
].init_mutex
))
2560 GOMP_PLUGIN_error ("Failed to destroy a GCN kernel initialization "
2571 /* Callback of dispatch queues to report errors. */
2574 execute_queue_entry (struct goacc_asyncqueue
*aq
, int index
)
2576 struct queue_entry
*entry
= &aq
->queue
[index
];
2578 switch (entry
->type
)
2582 GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d)\n",
2583 aq
->agent
->device_id
, aq
->id
, index
);
2584 run_kernel (entry
->u
.launch
.kernel
,
2585 entry
->u
.launch
.vars
,
2586 &entry
->u
.launch
.kla
, aq
, false);
2588 GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d) done\n",
2589 aq
->agent
->device_id
, aq
->id
, index
);
2594 GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d)\n",
2595 aq
->agent
->device_id
, aq
->id
, index
);
2596 entry
->u
.callback
.fn (entry
->u
.callback
.data
);
2598 GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d) done\n",
2599 aq
->agent
->device_id
, aq
->id
, index
);
2604 /* FIXME: is it safe to access a placeholder that may already have
2606 struct placeholder
*placeholderp
= entry
->u
.asyncwait
.placeholderp
;
2609 GCN_DEBUG ("Async thread %d:%d: Executing async wait entry (%d)\n",
2610 aq
->agent
->device_id
, aq
->id
, index
);
2612 pthread_mutex_lock (&placeholderp
->mutex
);
2614 while (!placeholderp
->executed
)
2615 pthread_cond_wait (&placeholderp
->cond
, &placeholderp
->mutex
);
2617 pthread_mutex_unlock (&placeholderp
->mutex
);
2619 if (pthread_cond_destroy (&placeholderp
->cond
))
2620 GOMP_PLUGIN_error ("Failed to destroy serialization cond");
2622 if (pthread_mutex_destroy (&placeholderp
->mutex
))
2623 GOMP_PLUGIN_error ("Failed to destroy serialization mutex");
2626 GCN_DEBUG ("Async thread %d:%d: Executing async wait "
2627 "entry (%d) done\n", aq
->agent
->device_id
, aq
->id
, index
);
2631 case ASYNC_PLACEHOLDER
:
2632 pthread_mutex_lock (&entry
->u
.placeholder
.mutex
);
2633 entry
->u
.placeholder
.executed
= 1;
2634 pthread_cond_signal (&entry
->u
.placeholder
.cond
);
2635 pthread_mutex_unlock (&entry
->u
.placeholder
.mutex
);
2639 GOMP_PLUGIN_fatal ("Unknown queue element");
2643 /* This function is run as a thread to service an async queue in the
2644 background. It runs continuously until the stop flag is set. */
2647 drain_queue (void *thread_arg
)
2649 struct goacc_asyncqueue
*aq
= thread_arg
;
2651 if (DRAIN_QUEUE_SYNCHRONOUS_P
)
2653 aq
->drain_queue_stop
= 2;
2657 pthread_mutex_lock (&aq
->mutex
);
2661 if (aq
->drain_queue_stop
)
2664 if (aq
->queue_n
> 0)
2666 pthread_mutex_unlock (&aq
->mutex
);
2667 execute_queue_entry (aq
, aq
->queue_first
);
2669 pthread_mutex_lock (&aq
->mutex
);
2670 aq
->queue_first
= ((aq
->queue_first
+ 1)
2671 % ASYNC_QUEUE_SIZE
);
2674 if (DEBUG_THREAD_SIGNAL
)
2675 GCN_DEBUG ("Async thread %d:%d: broadcasting queue out update\n",
2676 aq
->agent
->device_id
, aq
->id
);
2677 pthread_cond_broadcast (&aq
->queue_cond_out
);
2678 pthread_mutex_unlock (&aq
->mutex
);
2681 GCN_DEBUG ("Async thread %d:%d: continue\n", aq
->agent
->device_id
,
2683 pthread_mutex_lock (&aq
->mutex
);
2687 if (DEBUG_THREAD_SLEEP
)
2688 GCN_DEBUG ("Async thread %d:%d: going to sleep\n",
2689 aq
->agent
->device_id
, aq
->id
);
2690 pthread_cond_wait (&aq
->queue_cond_in
, &aq
->mutex
);
2691 if (DEBUG_THREAD_SLEEP
)
2692 GCN_DEBUG ("Async thread %d:%d: woke up, rechecking\n",
2693 aq
->agent
->device_id
, aq
->id
);
2697 aq
->drain_queue_stop
= 2;
2698 if (DEBUG_THREAD_SIGNAL
)
2699 GCN_DEBUG ("Async thread %d:%d: broadcasting last queue out update\n",
2700 aq
->agent
->device_id
, aq
->id
);
2701 pthread_cond_broadcast (&aq
->queue_cond_out
);
2702 pthread_mutex_unlock (&aq
->mutex
);
2704 GCN_DEBUG ("Async thread %d:%d: returning\n", aq
->agent
->device_id
, aq
->id
);
2708 /* This function is used only when DRAIN_QUEUE_SYNCHRONOUS_P is set, which
2709 is not usually the case. This is just a debug tool. */
2712 drain_queue_synchronous (struct goacc_asyncqueue
*aq
)
2714 pthread_mutex_lock (&aq
->mutex
);
2716 while (aq
->queue_n
> 0)
2718 execute_queue_entry (aq
, aq
->queue_first
);
2720 aq
->queue_first
= ((aq
->queue_first
+ 1)
2721 % ASYNC_QUEUE_SIZE
);
2725 pthread_mutex_unlock (&aq
->mutex
);
2728 /* Block the current thread until an async queue is writable. The aq->mutex
2729 lock should be held on entry, and remains locked on exit. */
2732 wait_for_queue_nonfull (struct goacc_asyncqueue
*aq
)
2734 if (aq
->queue_n
== ASYNC_QUEUE_SIZE
)
2736 /* Queue is full. Wait for it to not be full. */
2737 while (aq
->queue_n
== ASYNC_QUEUE_SIZE
)
2738 pthread_cond_wait (&aq
->queue_cond_out
, &aq
->mutex
);
2742 /* Request an asynchronous kernel launch on the specified queue. This
2743 may block if the queue is full, but returns without waiting for the
2747 queue_push_launch (struct goacc_asyncqueue
*aq
, struct kernel_info
*kernel
,
2748 void *vars
, struct GOMP_kernel_launch_attributes
*kla
)
2750 assert (aq
->agent
== kernel
->agent
);
2752 pthread_mutex_lock (&aq
->mutex
);
2754 wait_for_queue_nonfull (aq
);
2756 int queue_last
= ((aq
->queue_first
+ aq
->queue_n
)
2757 % ASYNC_QUEUE_SIZE
);
2759 GCN_DEBUG ("queue_push_launch %d:%d: at %i\n", aq
->agent
->device_id
,
2760 aq
->id
, queue_last
);
2762 aq
->queue
[queue_last
].type
= KERNEL_LAUNCH
;
2763 aq
->queue
[queue_last
].u
.launch
.kernel
= kernel
;
2764 aq
->queue
[queue_last
].u
.launch
.vars
= vars
;
2765 aq
->queue
[queue_last
].u
.launch
.kla
= *kla
;
2769 if (DEBUG_THREAD_SIGNAL
)
2770 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2771 aq
->agent
->device_id
, aq
->id
);
2772 pthread_cond_signal (&aq
->queue_cond_in
);
2774 pthread_mutex_unlock (&aq
->mutex
);
2777 /* Request an asynchronous callback on the specified queue. The callback
2778 function will be called, with the given opaque data, from the appropriate
2779 async thread, when all previous items on that queue are complete. */
2782 queue_push_callback (struct goacc_asyncqueue
*aq
, void (*fn
)(void *),
2785 pthread_mutex_lock (&aq
->mutex
);
2787 wait_for_queue_nonfull (aq
);
2789 int queue_last
= ((aq
->queue_first
+ aq
->queue_n
)
2790 % ASYNC_QUEUE_SIZE
);
2792 GCN_DEBUG ("queue_push_callback %d:%d: at %i\n", aq
->agent
->device_id
,
2793 aq
->id
, queue_last
);
2795 aq
->queue
[queue_last
].type
= CALLBACK
;
2796 aq
->queue
[queue_last
].u
.callback
.fn
= fn
;
2797 aq
->queue
[queue_last
].u
.callback
.data
= data
;
2801 if (DEBUG_THREAD_SIGNAL
)
2802 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2803 aq
->agent
->device_id
, aq
->id
);
2804 pthread_cond_signal (&aq
->queue_cond_in
);
2806 pthread_mutex_unlock (&aq
->mutex
);
2809 /* Request that a given async thread wait for another thread (unspecified) to
2810 reach the given placeholder. The wait will occur when all previous entries
2811 on the queue are complete. A placeholder is effectively a kind of signal
2812 which simply sets a flag when encountered in a queue. */
2815 queue_push_asyncwait (struct goacc_asyncqueue
*aq
,
2816 struct placeholder
*placeholderp
)
2818 pthread_mutex_lock (&aq
->mutex
);
2820 wait_for_queue_nonfull (aq
);
2822 int queue_last
= ((aq
->queue_first
+ aq
->queue_n
) % ASYNC_QUEUE_SIZE
);
2824 GCN_DEBUG ("queue_push_asyncwait %d:%d: at %i\n", aq
->agent
->device_id
,
2825 aq
->id
, queue_last
);
2827 aq
->queue
[queue_last
].type
= ASYNC_WAIT
;
2828 aq
->queue
[queue_last
].u
.asyncwait
.placeholderp
= placeholderp
;
2832 if (DEBUG_THREAD_SIGNAL
)
2833 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2834 aq
->agent
->device_id
, aq
->id
);
2835 pthread_cond_signal (&aq
->queue_cond_in
);
2837 pthread_mutex_unlock (&aq
->mutex
);
2840 /* Add a placeholder into an async queue. When the async thread reaches the
2841 placeholder it will set the "executed" flag to true and continue.
2842 Another thread may be waiting on this thread reaching the placeholder. */
2844 static struct placeholder
*
2845 queue_push_placeholder (struct goacc_asyncqueue
*aq
)
2847 struct placeholder
*placeholderp
;
2849 pthread_mutex_lock (&aq
->mutex
);
2851 wait_for_queue_nonfull (aq
);
2853 int queue_last
= ((aq
->queue_first
+ aq
->queue_n
) % ASYNC_QUEUE_SIZE
);
2855 GCN_DEBUG ("queue_push_placeholder %d:%d: at %i\n", aq
->agent
->device_id
,
2856 aq
->id
, queue_last
);
2858 aq
->queue
[queue_last
].type
= ASYNC_PLACEHOLDER
;
2859 placeholderp
= &aq
->queue
[queue_last
].u
.placeholder
;
2861 if (pthread_mutex_init (&placeholderp
->mutex
, NULL
))
2863 pthread_mutex_unlock (&aq
->mutex
);
2864 GOMP_PLUGIN_error ("Failed to initialize serialization mutex");
2867 if (pthread_cond_init (&placeholderp
->cond
, NULL
))
2869 pthread_mutex_unlock (&aq
->mutex
);
2870 GOMP_PLUGIN_error ("Failed to initialize serialization cond");
2873 placeholderp
->executed
= 0;
2877 if (DEBUG_THREAD_SIGNAL
)
2878 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2879 aq
->agent
->device_id
, aq
->id
);
2880 pthread_cond_signal (&aq
->queue_cond_in
);
2882 pthread_mutex_unlock (&aq
->mutex
);
2884 return placeholderp
;
2887 /* Signal an asynchronous thread to terminate, and wait for it to do so. */
2890 finalize_async_thread (struct goacc_asyncqueue
*aq
)
2892 pthread_mutex_lock (&aq
->mutex
);
2893 if (aq
->drain_queue_stop
== 2)
2895 pthread_mutex_unlock (&aq
->mutex
);
2899 aq
->drain_queue_stop
= 1;
2901 if (DEBUG_THREAD_SIGNAL
)
2902 GCN_DEBUG ("Signalling async thread %d:%d: cond_in\n",
2903 aq
->agent
->device_id
, aq
->id
);
2904 pthread_cond_signal (&aq
->queue_cond_in
);
2906 while (aq
->drain_queue_stop
!= 2)
2908 if (DEBUG_THREAD_SLEEP
)
2909 GCN_DEBUG ("Waiting for async thread %d:%d to finish, putting thread"
2910 " to sleep\n", aq
->agent
->device_id
, aq
->id
);
2911 pthread_cond_wait (&aq
->queue_cond_out
, &aq
->mutex
);
2912 if (DEBUG_THREAD_SLEEP
)
2913 GCN_DEBUG ("Waiting, woke up thread %d:%d. Rechecking\n",
2914 aq
->agent
->device_id
, aq
->id
);
2917 GCN_DEBUG ("Done waiting for async thread %d:%d\n", aq
->agent
->device_id
,
2919 pthread_mutex_unlock (&aq
->mutex
);
2921 int err
= pthread_join (aq
->thread_drain_queue
, NULL
);
2923 GOMP_PLUGIN_fatal ("Join async thread %d:%d: failed: %s",
2924 aq
->agent
->device_id
, aq
->id
, strerror (err
));
2925 GCN_DEBUG ("Joined with async thread %d:%d\n", aq
->agent
->device_id
, aq
->id
);
2928 /* Set up an async queue for OpenMP. There will be only one. The
2929 implementation simply uses an OpenACC async queue.
2930 FIXME: is this thread-safe if two threads call this function? */
2933 maybe_init_omp_async (struct agent_info
*agent
)
2935 if (!agent
->omp_async_queue
)
2936 agent
->omp_async_queue
2937 = GOMP_OFFLOAD_openacc_async_construct (agent
->device_id
);
2940 /* A wrapper that works around an issue in the HSA runtime with host-to-device
2941 copies from read-only pages. */
2944 hsa_memory_copy_wrapper (void *dst
, const void *src
, size_t len
)
2946 hsa_status_t status
= hsa_fns
.hsa_memory_copy_fn (dst
, src
, len
);
2948 if (status
== HSA_STATUS_SUCCESS
)
2951 /* It appears that the copy fails if the source data is in a read-only page.
2952 We can't detect that easily, so try copying the data to a temporary buffer
2953 and doing the copy again if we got an error above. */
2955 GCN_WARNING ("Read-only data transfer bug workaround triggered for "
2956 "[%p:+%d]\n", (void *) src
, (int) len
);
2958 void *src_copy
= malloc (len
);
2959 memcpy (src_copy
, src
, len
);
2960 status
= hsa_fns
.hsa_memory_copy_fn (dst
, (const void *) src_copy
, len
);
2962 if (status
!= HSA_STATUS_SUCCESS
)
2963 GOMP_PLUGIN_error ("memory copy failed");
2966 /* Copy data to or from a device. This is intended for use as an async
2970 copy_data (void *data_
)
2972 struct copy_data
*data
= (struct copy_data
*)data_
;
2973 GCN_DEBUG ("Async thread %d:%d: Copying %zu bytes from (%p) to (%p)\n",
2974 data
->aq
->agent
->device_id
, data
->aq
->id
, data
->len
, data
->src
,
2976 hsa_memory_copy_wrapper (data
->dst
, data
->src
, data
->len
);
2978 free ((void *) data
->src
);
/* Free device data.  This is intended for use as an async callback event.  */

static void
gomp_offload_free (void *ptr)
{
  GCN_DEBUG ("Async thread ?:?: Freeing %p\n", ptr);
  /* The device ordinal is unused by GOMP_OFFLOAD_free, so pass 0.  */
  GOMP_OFFLOAD_free (0, ptr);
}
2991 /* Request an asynchronous data copy, to or from a device, on a given queue.
2992 The event will be registered as a callback. If FREE_SRC is true
2993 then the source data will be freed following the copy. */
2996 queue_push_copy (struct goacc_asyncqueue
*aq
, void *dst
, const void *src
,
2997 size_t len
, bool free_src
)
3000 GCN_DEBUG ("queue_push_copy %d:%d: %zu bytes from (%p) to (%p)\n",
3001 aq
->agent
->device_id
, aq
->id
, len
, src
, dst
);
3002 struct copy_data
*data
3003 = (struct copy_data
*)GOMP_PLUGIN_malloc (sizeof (struct copy_data
));
3007 data
->free_src
= free_src
;
3009 queue_push_callback (aq
, copy_data
, data
);
3012 /* Return true if the given queue is currently empty. */
3015 queue_empty (struct goacc_asyncqueue
*aq
)
3017 pthread_mutex_lock (&aq
->mutex
);
3018 int res
= aq
->queue_n
== 0 ? 1 : 0;
3019 pthread_mutex_unlock (&aq
->mutex
);
3024 /* Wait for a given queue to become empty. This implements an OpenACC wait
3028 wait_queue (struct goacc_asyncqueue
*aq
)
3030 if (DRAIN_QUEUE_SYNCHRONOUS_P
)
3032 drain_queue_synchronous (aq
);
3036 pthread_mutex_lock (&aq
->mutex
);
3038 while (aq
->queue_n
> 0)
3040 if (DEBUG_THREAD_SLEEP
)
3041 GCN_DEBUG ("waiting for thread %d:%d, putting thread to sleep\n",
3042 aq
->agent
->device_id
, aq
->id
);
3043 pthread_cond_wait (&aq
->queue_cond_out
, &aq
->mutex
);
3044 if (DEBUG_THREAD_SLEEP
)
3045 GCN_DEBUG ("thread %d:%d woke up. Rechecking\n", aq
->agent
->device_id
,
3049 pthread_mutex_unlock (&aq
->mutex
);
3050 GCN_DEBUG ("waiting for thread %d:%d, done\n", aq
->agent
->device_id
, aq
->id
);
3054 /* {{{ OpenACC support */
3056 /* Execute an OpenACC kernel, synchronously or asynchronously. */
3059 gcn_exec (struct kernel_info
*kernel
, size_t mapnum
, void **hostaddrs
,
3060 void **devaddrs
, unsigned *dims
, void *targ_mem_desc
, bool async
,
3061 struct goacc_asyncqueue
*aq
)
3063 if (!GOMP_OFFLOAD_can_run (kernel
))
3064 GOMP_PLUGIN_fatal ("OpenACC host fallback unimplemented.");
3066 /* If we get here then this must be an OpenACC kernel. */
3067 kernel
->kind
= KIND_OPENACC
;
3069 /* devaddrs must be double-indirect on the target. */
3070 void **ind_da
= alloc_by_agent (kernel
->agent
, sizeof (void*) * mapnum
);
3071 for (size_t i
= 0; i
< mapnum
; i
++)
3072 hsa_fns
.hsa_memory_copy_fn (&ind_da
[i
],
3073 devaddrs
[i
] ? &devaddrs
[i
] : &hostaddrs
[i
],
3076 struct hsa_kernel_description
*hsa_kernel_desc
= NULL
;
3077 for (unsigned i
= 0; i
< kernel
->module
->image_desc
->kernel_count
; i
++)
3079 struct hsa_kernel_description
*d
3080 = &kernel
->module
->image_desc
->kernel_infos
[i
];
3081 if (d
->name
== kernel
->name
)
3083 hsa_kernel_desc
= d
;
3088 /* We may have statically-determined dimensions in
3089 hsa_kernel_desc->oacc_dims[] or dimensions passed to this offload kernel
3090 invocation at runtime in dims[]. We allow static dimensions to take
3091 priority over dynamic dimensions when present (non-zero). */
3092 if (hsa_kernel_desc
->oacc_dims
[0] > 0)
3093 dims
[0] = hsa_kernel_desc
->oacc_dims
[0];
3094 if (hsa_kernel_desc
->oacc_dims
[1] > 0)
3095 dims
[1] = hsa_kernel_desc
->oacc_dims
[1];
3096 if (hsa_kernel_desc
->oacc_dims
[2] > 0)
3097 dims
[2] = hsa_kernel_desc
->oacc_dims
[2];
3099 /* If any of the OpenACC dimensions remain 0 then we get to pick a number.
3100 There isn't really a correct answer for this without a clue about the
3101 problem size, so let's do a reasonable number of single-worker gangs.
3102 64 gangs matches a typical Fiji device. */
3104 /* NOTE: Until support for middle-end worker partitioning is merged, use 1
3105 for the default number of workers. */
3106 if (dims
[0] == 0) dims
[0] = get_cu_count (kernel
->agent
); /* Gangs. */
3107 if (dims
[1] == 0) dims
[1] = 1; /* Workers. */
3109 /* The incoming dimensions are expressed in terms of gangs, workers, and
3110 vectors. The HSA dimensions are expressed in terms of "work-items",
3111 which means multiples of vector lanes.
3113 The "grid size" specifies the size of the problem space, and the
3114 "work-group size" specifies how much of that we want a single compute
3115 unit to chew on at once.
3117 The three dimensions do not really correspond to hardware, but the
3118 important thing is that the HSA runtime will launch as many
3119 work-groups as it takes to process the entire grid, and each
3120 work-group will contain as many wave-fronts as it takes to process
3121 the work-items in that group.
3123 Essentially, as long as we set the Y dimension to 64 (the number of
3124 vector lanes in hardware), and the Z group size to the maximum (16),
3125 then we will get the gangs (X) and workers (Z) launched as we expect.
3127 The reason for the apparent reversal of vector and worker dimension
3128 order is to do with the way the run-time distributes work-items across
3130 struct GOMP_kernel_launch_attributes kla
=
3133 {dims
[0], 64, dims
[1]},
3134 /* Work-group size. */
3138 struct goacc_thread
*thr
= GOMP_PLUGIN_goacc_thread ();
3139 acc_prof_info
*prof_info
= thr
->prof_info
;
3140 acc_event_info enqueue_launch_event_info
;
3141 acc_api_info
*api_info
= thr
->api_info
;
3142 bool profiling_dispatch_p
= __builtin_expect (prof_info
!= NULL
, false);
3143 if (profiling_dispatch_p
)
3145 prof_info
->event_type
= acc_ev_enqueue_launch_start
;
3147 enqueue_launch_event_info
.launch_event
.event_type
3148 = prof_info
->event_type
;
3149 enqueue_launch_event_info
.launch_event
.valid_bytes
3150 = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES
;
3151 enqueue_launch_event_info
.launch_event
.parent_construct
3152 = acc_construct_parallel
;
3153 enqueue_launch_event_info
.launch_event
.implicit
= 1;
3154 enqueue_launch_event_info
.launch_event
.tool_info
= NULL
;
3155 enqueue_launch_event_info
.launch_event
.kernel_name
3156 = (char *) kernel
->name
;
3157 enqueue_launch_event_info
.launch_event
.num_gangs
= kla
.gdims
[0];
3158 enqueue_launch_event_info
.launch_event
.num_workers
= kla
.gdims
[2];
3159 enqueue_launch_event_info
.launch_event
.vector_length
= kla
.gdims
[1];
3161 api_info
->device_api
= acc_device_api_other
;
3163 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
,
3164 &enqueue_launch_event_info
, api_info
);
3169 run_kernel (kernel
, ind_da
, &kla
, NULL
, false);
3170 gomp_offload_free (ind_da
);
3174 queue_push_launch (aq
, kernel
, ind_da
, &kla
);
3176 GCN_DEBUG ("queue_push_callback %d:%d gomp_offload_free, %p\n",
3177 aq
->agent
->device_id
, aq
->id
, ind_da
);
3178 queue_push_callback (aq
, gomp_offload_free
, ind_da
);
3181 if (profiling_dispatch_p
)
3183 prof_info
->event_type
= acc_ev_enqueue_launch_end
;
3184 enqueue_launch_event_info
.launch_event
.event_type
= prof_info
->event_type
;
3185 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
,
3186 &enqueue_launch_event_info
,
3192 /* {{{ Generic Plugin API */
/* Return the name of the accelerator, which is "gcn".  */

const char *
GOMP_OFFLOAD_get_name (void)
{
  return "gcn";
}
3202 /* Return the specific capabilities the HSA accelerator have. */
3205 GOMP_OFFLOAD_get_caps (void)
3207 /* FIXME: Enable shared memory for APU, but not discrete GPU. */
3208 return /*GOMP_OFFLOAD_CAP_SHARED_MEM |*/ GOMP_OFFLOAD_CAP_OPENMP_400
3209 | GOMP_OFFLOAD_CAP_OPENACC_200
;
3212 /* Identify as GCN accelerator. */
3215 GOMP_OFFLOAD_get_type (void)
3217 return OFFLOAD_TARGET_TYPE_GCN
;
3220 /* Return the libgomp version number we're compatible with. There is
3221 no requirement for cross-version compatibility. */
3224 GOMP_OFFLOAD_version (void)
3226 return GOMP_VERSION
;
3229 /* Return the number of GCN devices on the system. */
3232 GOMP_OFFLOAD_get_num_devices (void)
3234 if (!init_hsa_context ())
3236 return hsa_context
.agent_count
;
3239 /* Initialize device (agent) number N so that it can be used for computation.
3240 Return TRUE on success. */
3243 GOMP_OFFLOAD_init_device (int n
)
3245 if (!init_hsa_context ())
3247 if (n
>= hsa_context
.agent_count
)
3249 GOMP_PLUGIN_error ("Request to initialize non-existent GCN device %i", n
);
3252 struct agent_info
*agent
= &hsa_context
.agents
[n
];
3254 if (agent
->initialized
)
3257 agent
->device_id
= n
;
3259 if (pthread_rwlock_init (&agent
->module_rwlock
, NULL
))
3261 GOMP_PLUGIN_error ("Failed to initialize a GCN agent rwlock");
3264 if (pthread_mutex_init (&agent
->prog_mutex
, NULL
))
3266 GOMP_PLUGIN_error ("Failed to initialize a GCN agent program mutex");
3269 if (pthread_mutex_init (&agent
->async_queues_mutex
, NULL
))
3271 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
3274 if (pthread_mutex_init (&agent
->team_arena_write_lock
, NULL
))
3276 GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
3279 agent
->async_queues
= NULL
;
3280 agent
->omp_async_queue
= NULL
;
3281 agent
->team_arena_list
= NULL
;
3283 uint32_t queue_size
;
3284 hsa_status_t status
;
3285 status
= hsa_fns
.hsa_agent_get_info_fn (agent
->id
,
3286 HSA_AGENT_INFO_QUEUE_MAX_SIZE
,
3288 if (status
!= HSA_STATUS_SUCCESS
)
3289 return hsa_error ("Error requesting maximum queue size of the GCN agent",
3293 status
= hsa_fns
.hsa_agent_get_info_fn (agent
->id
, HSA_AGENT_INFO_NAME
,
3295 if (status
!= HSA_STATUS_SUCCESS
)
3296 return hsa_error ("Error querying the name of the agent", status
);
3297 agent
->gfx900_p
= (strncmp (buf
, "gfx900", 6) == 0);
3299 status
= hsa_fns
.hsa_queue_create_fn (agent
->id
, queue_size
,
3300 HSA_QUEUE_TYPE_MULTI
,
3301 hsa_queue_callback
, NULL
, UINT32_MAX
,
3302 UINT32_MAX
, &agent
->sync_queue
);
3303 if (status
!= HSA_STATUS_SUCCESS
)
3304 return hsa_error ("Error creating command queue", status
);
3306 agent
->kernarg_region
.handle
= (uint64_t) -1;
3307 status
= hsa_fns
.hsa_agent_iterate_regions_fn (agent
->id
,
3308 get_kernarg_memory_region
,
3309 &agent
->kernarg_region
);
3310 if (agent
->kernarg_region
.handle
== (uint64_t) -1)
3312 GOMP_PLUGIN_error ("Could not find suitable memory region for kernel "
3316 GCN_DEBUG ("Selected kernel arguments memory region:\n");
3317 dump_hsa_region (agent
->kernarg_region
, NULL
);
3319 agent
->data_region
.handle
= (uint64_t) -1;
3320 status
= hsa_fns
.hsa_agent_iterate_regions_fn (agent
->id
,
3321 get_data_memory_region
,
3322 &agent
->data_region
);
3323 if (agent
->data_region
.handle
== (uint64_t) -1)
3325 GOMP_PLUGIN_error ("Could not find suitable memory region for device "
3329 GCN_DEBUG ("Selected device data memory region:\n");
3330 dump_hsa_region (agent
->data_region
, NULL
);
3332 GCN_DEBUG ("GCN agent %d initialized\n", n
);
3334 agent
->initialized
= true;
3338 /* Load GCN object-code module described by struct gcn_image_desc in
3339 TARGET_DATA and return references to kernel descriptors in TARGET_TABLE.
3340 If there are any constructors then run them. */
3343 GOMP_OFFLOAD_load_image (int ord
, unsigned version
, const void *target_data
,
3344 struct addr_pair
**target_table
)
3346 if (GOMP_VERSION_DEV (version
) != GOMP_VERSION_GCN
)
3348 GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3349 " (expected %u, received %u)",
3350 GOMP_VERSION_GCN
, GOMP_VERSION_DEV (version
));
3354 struct gcn_image_desc
*image_desc
= (struct gcn_image_desc
*) target_data
;
3355 struct agent_info
*agent
;
3356 struct addr_pair
*pair
;
3357 struct module_info
*module
;
3358 struct kernel_info
*kernel
;
3359 int kernel_count
= image_desc
->kernel_count
;
3360 unsigned var_count
= image_desc
->global_variable_count
;
3362 agent
= get_agent_info (ord
);
3366 if (pthread_rwlock_wrlock (&agent
->module_rwlock
))
3368 GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3371 if (agent
->prog_finalized
3372 && !destroy_hsa_program (agent
))
3375 GCN_DEBUG ("Encountered %d kernels in an image\n", kernel_count
);
3376 GCN_DEBUG ("Encountered %u global variables in an image\n", var_count
);
3377 pair
= GOMP_PLUGIN_malloc ((kernel_count
+ var_count
- 2)
3378 * sizeof (struct addr_pair
));
3379 *target_table
= pair
;
3380 module
= (struct module_info
*)
3381 GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info
)
3382 + kernel_count
* sizeof (struct kernel_info
));
3383 module
->image_desc
= image_desc
;
3384 module
->kernel_count
= kernel_count
;
3385 module
->heap
= NULL
;
3386 module
->constructors_run_p
= false;
3388 kernel
= &module
->kernels
[0];
3390 /* Allocate memory for kernel dependencies. */
3391 for (unsigned i
= 0; i
< kernel_count
; i
++)
3393 struct hsa_kernel_description
*d
= &image_desc
->kernel_infos
[i
];
3394 if (!init_basic_kernel_info (kernel
, d
, agent
, module
))
3396 if (strcmp (d
->name
, "_init_array") == 0)
3397 module
->init_array_func
= kernel
;
3398 else if (strcmp (d
->name
, "_fini_array") == 0)
3399 module
->fini_array_func
= kernel
;
3402 pair
->start
= (uintptr_t) kernel
;
3403 pair
->end
= (uintptr_t) (kernel
+ 1);
3409 agent
->module
= module
;
3410 if (pthread_rwlock_unlock (&agent
->module_rwlock
))
3412 GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3416 if (!create_and_finalize_hsa_program (agent
))
3419 for (unsigned i
= 0; i
< var_count
; i
++)
3421 struct global_var_info
*v
= &image_desc
->global_variables
[i
];
3422 GCN_DEBUG ("Looking for variable %s\n", v
->name
);
3424 hsa_status_t status
;
3425 hsa_executable_symbol_t var_symbol
;
3426 status
= hsa_fns
.hsa_executable_get_symbol_fn (agent
->executable
, NULL
,
3430 if (status
!= HSA_STATUS_SUCCESS
)
3431 hsa_fatal ("Could not find symbol for variable in the code object",
3436 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
3437 (var_symbol
, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS
, &var_addr
);
3438 if (status
!= HSA_STATUS_SUCCESS
)
3439 hsa_fatal ("Could not extract a variable from its symbol", status
);
3440 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
3441 (var_symbol
, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE
, &var_size
);
3442 if (status
!= HSA_STATUS_SUCCESS
)
3443 hsa_fatal ("Could not extract a variable size from its symbol", status
);
3445 pair
->start
= var_addr
;
3446 pair
->end
= var_addr
+ var_size
;
3447 GCN_DEBUG ("Found variable %s at %p with size %u\n", v
->name
,
3448 (void *)var_addr
, var_size
);
3452 /* Ensure that constructors are run first. */
3453 struct GOMP_kernel_launch_attributes kla
=
3457 /* Work-group size. */
3461 if (module
->init_array_func
)
3463 init_kernel (module
->init_array_func
);
3464 run_kernel (module
->init_array_func
, NULL
, &kla
, NULL
, false);
3466 module
->constructors_run_p
= true;
3468 /* Don't report kernels that libgomp need not know about. */
3469 if (module
->init_array_func
)
3471 if (module
->fini_array_func
)
3474 return kernel_count
+ var_count
;
3477 /* Unload GCN object-code module described by struct gcn_image_desc in
3478 TARGET_DATA from agent number N. Return TRUE on success. */
3481 GOMP_OFFLOAD_unload_image (int n
, unsigned version
, const void *target_data
)
3483 if (GOMP_VERSION_DEV (version
) != GOMP_VERSION_GCN
)
3485 GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3486 " (expected %u, received %u)",
3487 GOMP_VERSION_GCN
, GOMP_VERSION_DEV (version
));
3491 struct agent_info
*agent
;
3492 agent
= get_agent_info (n
);
3496 if (pthread_rwlock_wrlock (&agent
->module_rwlock
))
3498 GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3502 if (!agent
->module
|| agent
->module
->image_desc
!= target_data
)
3504 GOMP_PLUGIN_error ("Attempt to unload an image that has never been "
3509 if (!destroy_module (agent
->module
, true))
3511 free (agent
->module
);
3512 agent
->module
= NULL
;
3513 if (!destroy_hsa_program (agent
))
3515 if (pthread_rwlock_unlock (&agent
->module_rwlock
))
3517 GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
/* Deinitialize all information and status associated with agent number N.  We
   do not attempt any synchronization, assuming the user and libgomp will not
   attempt deinitialization of a device that is in any way being used at the
   same time.  Return TRUE on success.  */

bool
GOMP_OFFLOAD_fini_device (int n)
{
  struct agent_info *agent = get_agent_info (n);
  if (!agent)
    return false;

  /* Nothing to tear down if the device was never initialized.  */
  if (!agent->initialized)
    return true;

  /* The OpenMP async queue shares the OpenACC queue implementation, so it
     is destroyed through the OpenACC destructor.  */
  if (agent->omp_async_queue)
    {
      GOMP_OFFLOAD_openacc_async_destruct (agent->omp_async_queue);
      agent->omp_async_queue = NULL;
    }

  if (agent->module)
    {
      if (!destroy_module (agent->module, false))
	return false;
      free (agent->module);
      agent->module = NULL;
    }

  if (!destroy_team_arenas (agent))
    return false;

  if (!destroy_hsa_program (agent))
    return false;

  hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (agent->sync_queue);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Error destroying command queue", status);

  /* Destroy the host-side locks last, after everything they protect has
     been released.  */
  if (pthread_mutex_destroy (&agent->prog_mutex))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex");
      return false;
    }
  if (pthread_rwlock_destroy (&agent->module_rwlock))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN agent rwlock");
      return false;
    }

  if (pthread_mutex_destroy (&agent->async_queues_mutex))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN agent queue mutex");
      return false;
    }

  if (pthread_mutex_destroy (&agent->team_arena_write_lock))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex");
      return false;
    }
  agent->initialized = false;
  return true;
}
/* Return true if the HSA runtime can run function FN_PTR.  */

bool
GOMP_OFFLOAD_can_run (void *fn_ptr)
{
  /* FN_PTR actually points at a kernel_info descriptor, not host code.  */
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;

  init_kernel (kernel);
  if (kernel->initialization_failed)
    goto failure;

  return true;

failure:
  /* Returning false makes libgomp fall back to running the region on the
     host, unless the user has disabled that.  */
  if (suppress_host_fallback)
    GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
  GCN_WARNING ("GCN target cannot be launched, doing a host fallback\n");
  return false;
}
/* Allocate SIZE bytes of memory on device N.  Returns the device pointer,
   or NULL on failure.  */

void *
GOMP_OFFLOAD_alloc (int n, size_t size)
{
  /* Look up the per-device record and delegate the actual HSA allocation
     to the shared helper.  */
  return alloc_by_agent (get_agent_info (n), size);
}
/* Free memory from device N.  Returns TRUE on success, FALSE if the HSA
   runtime rejected the free.  */

bool
GOMP_OFFLOAD_free (int device, void *ptr)
{
  GCN_DEBUG ("Freeing memory on device %d\n", device);

  hsa_status_t status = hsa_fns.hsa_memory_free_fn (ptr);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not free device memory", status);
      return false;
    }

  /* If an OpenACC profiling callback is active on this host thread,
     report the free as an acc_ev_free event.  */
  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  bool profiling_dispatch_p
    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
  if (profiling_dispatch_p)
    {
      acc_prof_info *prof_info = thr->prof_info;
      acc_event_info data_event_info;
      acc_api_info *api_info = thr->api_info;

      prof_info->event_type = acc_ev_free;

      data_event_info.data_event.event_type = prof_info->event_type;
      data_event_info.data_event.valid_bytes
	= _ACC_DATA_EVENT_INFO_VALID_BYTES;
      data_event_info.data_event.parent_construct
	= acc_construct_parallel;
      data_event_info.data_event.implicit = 1;
      data_event_info.data_event.tool_info = NULL;
      data_event_info.data_event.var_name = NULL;
      /* Only the device pointer is known here; there is no host
	 counterpart or byte count for a plain device free.  */
      data_event_info.data_event.bytes = 0;
      data_event_info.data_event.host_ptr = NULL;
      data_event_info.data_event.device_ptr = (void *) ptr;

      api_info->device_api = acc_device_api_other;

      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
					    api_info);
    }

  return true;
}
3662 /* Copy data from DEVICE to host. */
3665 GOMP_OFFLOAD_dev2host (int device
, void *dst
, const void *src
, size_t n
)
3667 GCN_DEBUG ("Copying %zu bytes from device %d (%p) to host (%p)\n", n
, device
,
3669 hsa_status_t status
= hsa_fns
.hsa_memory_copy_fn (dst
, src
, n
);
3670 if (status
!= HSA_STATUS_SUCCESS
)
3671 GOMP_PLUGIN_error ("memory copy failed");
/* Copy N bytes from host address SRC to DEVICE address DST.  */

bool
GOMP_OFFLOAD_host2dev (int device, void *dst, const void *src, size_t n)
{
  GCN_DEBUG ("Copying %zu bytes from host (%p) to device %d (%p)\n", n, src,
	     device, dst);

  /* NOTE(review): the wrapper's result is discarded and TRUE is returned
     unconditionally -- presumably the wrapper reports any HSA error itself;
     confirm it cannot fail silently.  */
  hsa_memory_copy_wrapper (dst, src, n);

  return true;
}
3686 /* Copy data within DEVICE. Do the copy asynchronously, if appropriate. */
3689 GOMP_OFFLOAD_dev2dev (int device
, void *dst
, const void *src
, size_t n
)
3691 struct gcn_thread
*thread_data
= gcn_thread ();
3693 if (thread_data
&& !async_synchronous_p (thread_data
->async
))
3695 struct agent_info
*agent
= get_agent_info (device
);
3696 maybe_init_omp_async (agent
);
3697 queue_push_copy (agent
->omp_async_queue
, dst
, src
, n
, false);
3701 GCN_DEBUG ("Copying %zu bytes from device %d (%p) to device %d (%p)\n", n
,
3702 device
, src
, device
, dst
);
3703 hsa_status_t status
= hsa_fns
.hsa_memory_copy_fn (dst
, src
, n
);
3704 if (status
!= HSA_STATUS_SUCCESS
)
3705 GOMP_PLUGIN_error ("memory copy failed");
/* {{{ OpenMP Plugin API  */

/* Run a synchronous OpenMP kernel on DEVICE and pass it an array of pointers
   in VARS as a parameter.  The kernel is identified by FN_PTR which must point
   to a kernel_info structure, and must have previously been loaded to the
   specified device.  */

void
GOMP_OFFLOAD_run (int device, void *fn_ptr, void *vars, void **args)
{
  struct agent_info *agent = get_agent_info (device);
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
  struct GOMP_kernel_launch_attributes def;
  struct GOMP_kernel_launch_attributes *kla;
  /* The kernel descriptor must have been loaded onto this very device.  */
  assert (agent == kernel->agent);

  /* If we get here then the kernel must be OpenMP.  */
  kernel->kind = KIND_OPENMP;

  /* ARGS may carry launch geometry; KLA ends up pointing either into ARGS
     or at the local default DEF.  */
  if (!parse_target_attributes (args, &def, &kla, agent))
    {
      GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
      return;
    }
  run_kernel (kernel, vars, kla, NULL, false);
}
/* Run an asynchronous OpenMP kernel on DEVICE.  This is similar to
   GOMP_OFFLOAD_run except that the launch is queued and there is a call to
   GOMP_PLUGIN_target_task_completion when it has finished.  */

void
GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
			void **args, void *async_data)
{
  GCN_DEBUG ("GOMP_OFFLOAD_async_run invoked\n");
  struct agent_info *agent = get_agent_info (device);
  struct kernel_info *kernel = (struct kernel_info *) tgt_fn;
  struct GOMP_kernel_launch_attributes def;
  struct GOMP_kernel_launch_attributes *kla;
  /* The kernel descriptor must have been loaded onto this very device.  */
  assert (agent == kernel->agent);

  /* If we get here then the kernel must be OpenMP.  */
  kernel->kind = KIND_OPENMP;

  if (!parse_target_attributes (args, &def, &kla, agent))
    {
      GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
      return;
    }

  /* Enqueue the launch followed by the completion callback, so libgomp is
     notified once the kernel has finished.  */
  maybe_init_omp_async (agent);
  queue_push_launch (agent->omp_async_queue, kernel, tgt_vars, kla);
  queue_push_callback (agent->omp_async_queue,
		       GOMP_PLUGIN_target_task_completion, async_data);
}

/* }}}  */
3768 /* {{{ OpenACC Plugin API */
3770 /* Run a synchronous OpenACC kernel. The device number is inferred from the
3771 already-loaded KERNEL. */
3774 GOMP_OFFLOAD_openacc_exec (void (*fn_ptr
) (void *), size_t mapnum
,
3775 void **hostaddrs
, void **devaddrs
, unsigned *dims
,
3776 void *targ_mem_desc
)
3778 struct kernel_info
*kernel
= (struct kernel_info
*) fn_ptr
;
3780 gcn_exec (kernel
, mapnum
, hostaddrs
, devaddrs
, dims
, targ_mem_desc
, false,
3784 /* Run an asynchronous OpenACC kernel on the specified queue. */
3787 GOMP_OFFLOAD_openacc_async_exec (void (*fn_ptr
) (void *), size_t mapnum
,
3788 void **hostaddrs
, void **devaddrs
,
3789 unsigned *dims
, void *targ_mem_desc
,
3790 struct goacc_asyncqueue
*aq
)
3792 struct kernel_info
*kernel
= (struct kernel_info
*) fn_ptr
;
3794 gcn_exec (kernel
, mapnum
, hostaddrs
, devaddrs
, dims
, targ_mem_desc
, true,
3798 /* Create a new asynchronous thread and queue for running future kernels. */
3800 struct goacc_asyncqueue
*
3801 GOMP_OFFLOAD_openacc_async_construct (int device
)
3803 struct agent_info
*agent
= get_agent_info (device
);
3805 pthread_mutex_lock (&agent
->async_queues_mutex
);
3807 struct goacc_asyncqueue
*aq
= GOMP_PLUGIN_malloc (sizeof (*aq
));
3808 aq
->agent
= get_agent_info (device
);
3810 aq
->next
= agent
->async_queues
;
3813 aq
->next
->prev
= aq
;
3814 aq
->id
= aq
->next
->id
+ 1;
3818 agent
->async_queues
= aq
;
3820 aq
->queue_first
= 0;
3822 aq
->drain_queue_stop
= 0;
3824 if (pthread_mutex_init (&aq
->mutex
, NULL
))
3826 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
3829 if (pthread_cond_init (&aq
->queue_cond_in
, NULL
))
3831 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
3834 if (pthread_cond_init (&aq
->queue_cond_out
, NULL
))
3836 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
3840 hsa_status_t status
= hsa_fns
.hsa_queue_create_fn (agent
->id
,
3842 HSA_QUEUE_TYPE_MULTI
,
3843 hsa_queue_callback
, NULL
,
3844 UINT32_MAX
, UINT32_MAX
,
3846 if (status
!= HSA_STATUS_SUCCESS
)
3847 hsa_fatal ("Error creating command queue", status
);
3849 int err
= pthread_create (&aq
->thread_drain_queue
, NULL
, &drain_queue
, aq
);
3851 GOMP_PLUGIN_fatal ("GCN asynchronous thread creation failed: %s",
3853 GCN_DEBUG ("Async thread %d:%d: created\n", aq
->agent
->device_id
,
3856 pthread_mutex_unlock (&agent
->async_queues_mutex
);
/* Destroy an existing asynchronous thread and queue.  Waits for any
   currently-running task to complete, but cancels any queued tasks.
   Returns TRUE on success.  */

bool
GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = aq->agent;

  /* Stop and join the drain thread before tearing anything down.  */
  finalize_async_thread (aq);

  pthread_mutex_lock (&agent->async_queues_mutex);

  int err;
  if ((err = pthread_mutex_destroy (&aq->mutex)))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue mutex: %d", err);
      goto fail;
    }
  if (pthread_cond_destroy (&aq->queue_cond_in))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
      goto fail;
    }
  if (pthread_cond_destroy (&aq->queue_cond_out))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
      goto fail;
    }
  hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (aq->hsa_queue);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Error destroying command queue", status);
      goto fail;
    }

  /* Unlink AQ from the agent's doubly-linked queue list.  */
  if (aq->prev)
    aq->prev->next = aq->next;
  if (aq->next)
    aq->next->prev = aq->prev;
  if (agent->async_queues == aq)
    agent->async_queues = aq->next;

  GCN_DEBUG ("Async thread %d:%d: destroyed\n", agent->device_id, aq->id);

  free (aq);
  pthread_mutex_unlock (&agent->async_queues_mutex);
  return true;

fail:
  /* On failure AQ is deliberately left in place (partially destroyed).  */
  pthread_mutex_unlock (&agent->async_queues_mutex);
  return false;
}
/* Return true (non-zero) if the specified async queue is currently empty.  */

int
GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
{
  int drained = queue_empty (aq);
  return drained;
}
/* Block until the specified queue has executed all its tasks and the
   queue is empty.  */

bool
GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
{
  /* wait_queue blocks until the drain thread reports the queue empty.  */
  wait_queue (aq);
  return true;
}
/* Add a serialization point across two async queues.  Any new tasks added to
   AQ2, after this call, will not run until all tasks on AQ1, at the time
   of this call, have completed.  */

bool
GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
				      struct goacc_asyncqueue *aq2)
{
  /* For serialize, stream aq2 waits for aq1 to complete work that has been
     scheduled to run on it up to this point.  */
  if (aq1 != aq2)
    {
      /* The placeholder completes when AQ1 reaches this point; AQ2 blocks
	 on it before running anything queued afterwards.  */
      struct placeholder *placeholderp = queue_push_placeholder (aq1);
      queue_push_asyncwait (aq2, placeholderp);
    }
  return true;
}
/* Add an opaque callback to the given async queue.  The queue's drain
   thread will invoke FN (DATA) once all earlier entries have completed.  */

void
GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
					   void (*fn) (void *), void *data)
{
  queue_push_callback (aq, fn, data);
}
/* Queue up an asynchronous data copy from host to DEVICE.  */

bool
GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
				     size_t n, struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = get_agent_info (device);
  assert (agent == aq->agent);
  /* The source data does not necessarily remain live until the deferred
     copy happens.  Taking a snapshot of the data here avoids reading
     uninitialised data later, but means that (a) data is copied twice and
     (b) modifications to the copied data between the "spawning" point of
     the asynchronous kernel and when it is executed will not be seen.
     But, that is probably correct.  */
  void *src_copy = GOMP_PLUGIN_malloc (n);
  memcpy (src_copy, src, n);
  /* NOTE(review): the final TRUE presumably tells the queue it owns
     SRC_COPY and must free it after the copy -- confirm against
     queue_push_copy.  */
  queue_push_copy (aq, dst, src_copy, n, true);
  return true;
}
/* Queue up an asynchronous data copy from DEVICE to host.  No snapshot is
   needed here: SRC is device memory managed by the plugin, so it remains
   valid until the deferred copy runs.  */

bool
GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src,
				     size_t n, struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = get_agent_info (device);
  assert (agent == aq->agent);
  queue_push_copy (aq, dst, src, n, false);
  return true;
}
/* Query device properties for the OpenACC acc_get_property API.  PROP is
   currently ignored; only the device number is validated.  */

union goacc_property_value
GOMP_OFFLOAD_openacc_get_property (int device, enum goacc_property prop)
{
  /* Stub. Check device and return default value for unsupported properties.  */
  /* TODO: Implement this function.  */
  get_agent_info (device);

  union goacc_property_value nullval = { .val = 0 };

  return nullval;
}
4002 /* Set up plugin-specific thread-local-data (host-side). */
4005 GOMP_OFFLOAD_openacc_create_thread_data (int ord
__attribute__((unused
)))
4007 struct gcn_thread
*thread_data
4008 = GOMP_PLUGIN_malloc (sizeof (struct gcn_thread
));
4010 thread_data
->async
= GOMP_ASYNC_SYNC
;
4012 return (void *) thread_data
;
4015 /* Clean up plugin-specific thread-local-data. */
4018 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data
)