1 /* Plugin for AMD GCN execution.
3 Copyright (C) 2013-2023 Free Software Foundation, Inc.
5 Contributed by Mentor Embedded
7 This file is part of the GNU Offloading and Multi Processing Library
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
29 /* {{{ Includes and defines */
41 #include <hsa_ext_amd.h>
44 #include "libgomp-plugin.h"
45 #include "config/gcn/libgomp-gcn.h" /* For struct output. */
46 #include "gomp-constants.h"
48 #include "oacc-plugin.h"
52 /* These probably won't be in elf.h for a while. */
54 #define R_AMDGPU_NONE 0
55 #define R_AMDGPU_ABS32_LO 1 /* (S + A) & 0xFFFFFFFF */
56 #define R_AMDGPU_ABS32_HI 2 /* (S + A) >> 32 */
57 #define R_AMDGPU_ABS64 3 /* S + A */
58 #define R_AMDGPU_REL32 4 /* S + A - P */
59 #define R_AMDGPU_REL64 5 /* S + A - P */
60 #define R_AMDGPU_ABS32 6 /* S + A */
61 #define R_AMDGPU_GOTPCREL 7 /* G + GOT + A - P */
62 #define R_AMDGPU_GOTPCREL32_LO 8 /* (G + GOT + A - P) & 0xFFFFFFFF */
63 #define R_AMDGPU_GOTPCREL32_HI 9 /* (G + GOT + A - P) >> 32 */
64 #define R_AMDGPU_REL32_LO 10 /* (S + A - P) & 0xFFFFFFFF */
65 #define R_AMDGPU_REL32_HI 11 /* (S + A - P) >> 32 */
66 #define R_AMDGPU_RELATIVE64 13 /* B + A */
69 /* GCN specific definitions for asynchronous queues. */
71 #define ASYNC_QUEUE_SIZE 64
72 #define DRAIN_QUEUE_SYNCHRONOUS_P false
73 #define DEBUG_QUEUES 0
74 #define DEBUG_THREAD_SLEEP 0
75 #define DEBUG_THREAD_SIGNAL 0
78 #define DEFAULT_GCN_HEAP_SIZE (100*1024*1024) /* 100MB. */
80 /* Secure getenv() which returns NULL if running as SUID/SGID. */
81 #ifndef HAVE_SECURE_GETENV
82 #ifdef HAVE___SECURE_GETENV
83 #define secure_getenv __secure_getenv
84 #elif defined (HAVE_UNISTD_H) && defined(HAVE_GETUID) && defined(HAVE_GETEUID) \
85 && defined(HAVE_GETGID) && defined(HAVE_GETEGID)
89 /* Implementation of secure_getenv() for targets where it is not provided but
90 we have at least means to test real and effective IDs. */
93 secure_getenv (const char *name
)
95 if ((getuid () == geteuid ()) && (getgid () == getegid ()))
102 #define secure_getenv getenv
109 /* GCN-specific implementation of the GOMP_PLUGIN_acc_thread data. */
113 /* The thread number from the async clause, or GOMP_ASYNC_SYNC. */
117 /* As an HSA runtime is dlopened, following structure defines function
118 pointers utilized by the HSA plug-in. */
120 struct hsa_runtime_fn_info
123 hsa_status_t (*hsa_status_string_fn
) (hsa_status_t status
,
124 const char **status_string
);
125 hsa_status_t (*hsa_system_get_info_fn
) (hsa_system_info_t attribute
,
127 hsa_status_t (*hsa_agent_get_info_fn
) (hsa_agent_t agent
,
128 hsa_agent_info_t attribute
,
130 hsa_status_t (*hsa_isa_get_info_fn
)(hsa_isa_t isa
,
131 hsa_isa_info_t attribute
,
134 hsa_status_t (*hsa_init_fn
) (void);
135 hsa_status_t (*hsa_iterate_agents_fn
)
136 (hsa_status_t (*callback
)(hsa_agent_t agent
, void *data
), void *data
);
137 hsa_status_t (*hsa_region_get_info_fn
) (hsa_region_t region
,
138 hsa_region_info_t attribute
,
140 hsa_status_t (*hsa_queue_create_fn
)
141 (hsa_agent_t agent
, uint32_t size
, hsa_queue_type_t type
,
142 void (*callback
)(hsa_status_t status
, hsa_queue_t
*source
, void *data
),
143 void *data
, uint32_t private_segment_size
,
144 uint32_t group_segment_size
, hsa_queue_t
**queue
);
145 hsa_status_t (*hsa_agent_iterate_regions_fn
)
147 hsa_status_t (*callback
)(hsa_region_t region
, void *data
), void *data
);
148 hsa_status_t (*hsa_executable_destroy_fn
) (hsa_executable_t executable
);
149 hsa_status_t (*hsa_executable_create_fn
)
150 (hsa_profile_t profile
, hsa_executable_state_t executable_state
,
151 const char *options
, hsa_executable_t
*executable
);
152 hsa_status_t (*hsa_executable_global_variable_define_fn
)
153 (hsa_executable_t executable
, const char *variable_name
, void *address
);
154 hsa_status_t (*hsa_executable_load_code_object_fn
)
155 (hsa_executable_t executable
, hsa_agent_t agent
,
156 hsa_code_object_t code_object
, const char *options
);
157 hsa_status_t (*hsa_executable_freeze_fn
)(hsa_executable_t executable
,
158 const char *options
);
159 hsa_status_t (*hsa_signal_create_fn
) (hsa_signal_value_t initial_value
,
160 uint32_t num_consumers
,
161 const hsa_agent_t
*consumers
,
162 hsa_signal_t
*signal
);
163 hsa_status_t (*hsa_memory_allocate_fn
) (hsa_region_t region
, size_t size
,
165 hsa_status_t (*hsa_memory_assign_agent_fn
) (void *ptr
, hsa_agent_t agent
,
166 hsa_access_permission_t access
);
167 hsa_status_t (*hsa_memory_copy_fn
)(void *dst
, const void *src
, size_t size
);
168 hsa_status_t (*hsa_memory_free_fn
) (void *ptr
);
169 hsa_status_t (*hsa_signal_destroy_fn
) (hsa_signal_t signal
);
170 hsa_status_t (*hsa_executable_get_symbol_fn
)
171 (hsa_executable_t executable
, const char *module_name
,
172 const char *symbol_name
, hsa_agent_t agent
, int32_t call_convention
,
173 hsa_executable_symbol_t
*symbol
);
174 hsa_status_t (*hsa_executable_symbol_get_info_fn
)
175 (hsa_executable_symbol_t executable_symbol
,
176 hsa_executable_symbol_info_t attribute
, void *value
);
177 hsa_status_t (*hsa_executable_iterate_symbols_fn
)
178 (hsa_executable_t executable
,
179 hsa_status_t (*callback
)(hsa_executable_t executable
,
180 hsa_executable_symbol_t symbol
, void *data
),
182 uint64_t (*hsa_queue_add_write_index_release_fn
) (const hsa_queue_t
*queue
,
184 uint64_t (*hsa_queue_load_read_index_acquire_fn
) (const hsa_queue_t
*queue
);
185 void (*hsa_signal_store_relaxed_fn
) (hsa_signal_t signal
,
186 hsa_signal_value_t value
);
187 void (*hsa_signal_store_release_fn
) (hsa_signal_t signal
,
188 hsa_signal_value_t value
);
189 hsa_signal_value_t (*hsa_signal_wait_acquire_fn
)
190 (hsa_signal_t signal
, hsa_signal_condition_t condition
,
191 hsa_signal_value_t compare_value
, uint64_t timeout_hint
,
192 hsa_wait_state_t wait_state_hint
);
193 hsa_signal_value_t (*hsa_signal_load_acquire_fn
) (hsa_signal_t signal
);
194 hsa_status_t (*hsa_queue_destroy_fn
) (hsa_queue_t
*queue
);
196 hsa_status_t (*hsa_code_object_deserialize_fn
)
197 (void *serialized_code_object
, size_t serialized_code_object_size
,
198 const char *options
, hsa_code_object_t
*code_object
);
201 /* Structure describing the run-time and grid properties of an HSA kernel
202 launch. This needs to match the format passed to GOMP_OFFLOAD_run. */
204 struct GOMP_kernel_launch_attributes
206 /* Number of dimensions the workload has. Maximum number is 3. */
208 /* Size of the grid in the three respective dimensions. */
210 /* Size of work-groups in the respective dimensions. */
214 /* Collection of information needed for a dispatch of a kernel from a
217 struct kernel_dispatch
219 struct agent_info
*agent
;
220 /* Pointer to a command queue associated with a kernel dispatch agent. */
222 /* Pointer to a memory space used for kernel arguments passing. */
223 void *kernarg_address
;
226 /* Synchronization signal used for dispatch synchronization. */
228 /* Private segment size. */
229 uint32_t private_segment_size
;
230 /* Group segment size. */
231 uint32_t group_segment_size
;
234 /* Structure of the kernargs segment, supporting console output.
236 This needs to match the definitions in Newlib, and the expectations
237 in libgomp target code. */
240 /* Leave space for the real kernel arguments.
241 OpenACC and OpenMP only use one pointer. */
245 /* A pointer to struct output, below, for console output data. */
248 /* A pointer to struct heap, below. */
251 /* A pointer to an ephemeral memory arena.
252 Only needed for OpenMP. */
256 struct output output_data
;
259 /* A queue entry for a future asynchronous launch. */
263 struct kernel_info
*kernel
;
265 struct GOMP_kernel_launch_attributes kla
;
268 /* A queue entry for a future callback. */
276 /* A data struct for the copy_data callback. */
283 struct goacc_asyncqueue
*aq
;
286 /* A queue entry for a placeholder. These correspond to a wait event. */
292 pthread_mutex_t mutex
;
295 /* A queue entry for a wait directive. */
297 struct asyncwait_info
299 struct placeholder
*placeholderp
;
302 /* Encode the type of an entry in an async queue. */
312 /* An entry in an async queue. */
316 enum entry_type type
;
318 struct kernel_launch launch
;
319 struct callback callback
;
320 struct asyncwait_info asyncwait
;
321 struct placeholder placeholder
;
325 /* An async queue header.
327 OpenMP may create one of these.
328 OpenACC may create many. */
330 struct goacc_asyncqueue
332 struct agent_info
*agent
;
333 hsa_queue_t
*hsa_queue
;
335 pthread_t thread_drain_queue
;
336 pthread_mutex_t mutex
;
337 pthread_cond_t queue_cond_in
;
338 pthread_cond_t queue_cond_out
;
339 struct queue_entry queue
[ASYNC_QUEUE_SIZE
];
342 int drain_queue_stop
;
345 struct goacc_asyncqueue
*prev
;
346 struct goacc_asyncqueue
*next
;
349 /* Mkoffload uses this structure to describe a kernel.
351 OpenMP kernel dimensions are passed at runtime.
352 OpenACC kernel dimensions are passed at compile time, here. */
354 struct hsa_kernel_description
357 int oacc_dims
[3]; /* Only present for GCN kernels. */
362 /* Mkoffload uses this structure to describe an offload variable. */
364 struct global_var_info
370 /* Mkoffload uses this structure to describe all the kernels in a
371 loadable module. These are passed to libgomp via static constructors. */
373 struct gcn_image_desc
379 const unsigned kernel_count
;
380 struct hsa_kernel_description
*kernel_infos
;
381 const unsigned global_variable_count
;
384 /* This enum mirrors the corresponding LLVM enum's values for all ISAs that we
386 See https://llvm.org/docs/AMDGPUUsage.html#amdgpu-ef-amdgpu-mach-table */
389 EF_AMDGPU_MACH_AMDGCN_GFX803
= 0x02a,
390 EF_AMDGPU_MACH_AMDGCN_GFX900
= 0x02c,
391 EF_AMDGPU_MACH_AMDGCN_GFX906
= 0x02f,
392 EF_AMDGPU_MACH_AMDGCN_GFX908
= 0x030,
393 EF_AMDGPU_MACH_AMDGCN_GFX90a
= 0x03f
396 const static int EF_AMDGPU_MACH_MASK
= 0x000000ff;
397 typedef EF_AMDGPU_MACH gcn_isa
;
399 /* Description of an HSA GPU agent (device) and the program associated with
404 /* The HSA ID of the agent. Assigned when hsa_context is initialized. */
406 /* The user-visible device number. */
408 /* Whether the agent has been initialized. The fields below are usable only
412 /* The instruction set architecture of the device. */
414 /* Name of the agent. */
416 /* Name of the vendor of the agent. */
417 char vendor_name
[64];
418 /* Command queues of the agent. */
419 hsa_queue_t
*sync_queue
;
420 struct goacc_asyncqueue
*async_queues
, *omp_async_queue
;
421 pthread_mutex_t async_queues_mutex
;
423 /* The HSA memory region from which to allocate kernel arguments. */
424 hsa_region_t kernarg_region
;
426 /* The HSA memory region from which to allocate device data. */
427 hsa_region_t data_region
;
429 /* Allocated team arenas. */
430 struct team_arena_list
*team_arena_list
;
431 pthread_mutex_t team_arena_write_lock
;
433 /* Read-write lock that protects kernels which are running or about to be run
434 from interference with loading and unloading of images. Needs to be
435 locked for reading while a kernel is being run, and for writing if the
436 list of modules is manipulated (and thus the HSA program invalidated). */
437 pthread_rwlock_t module_rwlock
;
439 /* The module associated with this kernel. */
440 struct module_info
*module
;
442 /* Mutex enforcing that only one thread will finalize the HSA program. A
443 thread should have locked agent->module_rwlock for reading before
445 pthread_mutex_t prog_mutex
;
446 /* Flag whether the HSA program that consists of all the modules has been
449 /* HSA executable - the finalized program that is used to locate kernels. */
450 hsa_executable_t executable
;
453 /* Information required to identify, finalize and run any given kernel. */
455 enum offload_kind
{KIND_UNKNOWN
, KIND_OPENMP
, KIND_OPENACC
};
459 /* Name of the kernel, required to locate it within the GCN object-code
462 /* The specific agent the kernel has been or will be finalized for and run
464 struct agent_info
*agent
;
465 /* The specific module where the kernel takes place. */
466 struct module_info
*module
;
467 /* Information provided by mkoffload associated with the kernel. */
468 struct hsa_kernel_description
*description
;
469 /* Mutex enforcing that at most one thread ever initializes a kernel for
470 use. A thread should have locked agent->module_rwlock for reading before
472 pthread_mutex_t init_mutex
;
473 /* Flag indicating whether the kernel has been initialized and all fields
474 below it contain valid data. */
476 /* Flag indicating that the kernel has a problem that blocks an execution. */
477 bool initialization_failed
;
478 /* The object to be put into the dispatch queue. */
480 /* Required size of kernel arguments. */
481 uint32_t kernarg_segment_size
;
482 /* Required size of group segment. */
483 uint32_t group_segment_size
;
484 /* Required size of private segment. */
485 uint32_t private_segment_size
;
486 /* Set up for OpenMP or OpenACC? */
487 enum offload_kind kind
;
490 /* Information about a particular GCN module, its image and kernels. */
494 /* The description with which the program has registered the image. */
495 struct gcn_image_desc
*image_desc
;
496 /* GCN heap allocation. */
498 /* Physical boundaries of the loaded module. */
499 Elf64_Addr phys_address_start
;
500 Elf64_Addr phys_address_end
;
502 bool constructors_run_p
;
503 struct kernel_info
*init_array_func
, *fini_array_func
;
505 /* Number of kernels in this module. */
507 /* An array of kernel_info structures describing each kernel in this
509 struct kernel_info kernels
[];
512 /* A linked list of memory arenas allocated on the device.
513 These are only used by OpenMP, as a means to optimize per-team malloc. */
515 struct team_arena_list
517 struct team_arena_list
*next
;
519 /* The number of teams determines the size of the allocation. */
521 /* The device address of the arena itself. */
523 /* A flag to prevent two asynchronous kernels trying to use the same arena.
524 The mutex is locked until the kernel exits. */
525 pthread_mutex_t in_use
;
528 /* Information about the whole HSA environment and all of its agents. */
530 struct hsa_context_info
532 /* Whether the structure has been initialized. */
534 /* Number of usable GPU HSA agents in the system. */
536 /* Array of agent_info structures describing the individual HSA agents. */
537 struct agent_info
*agents
;
538 /* Driver version string. */
539 char driver_version_s
[30];
542 /* Format of the on-device heap.
544 This must match the definition in Newlib and gcn-run. */
552 /* {{{ Global variables */
554 /* Information about the whole HSA environment and all of its agents. */
556 static struct hsa_context_info hsa_context
;
558 /* HSA runtime functions that are initialized in init_hsa_context. */
560 static struct hsa_runtime_fn_info hsa_fns
;
562 /* Heap space, allocated target-side, provided for use of newlib malloc.
563 Each module should have its own heap allocated.
564 Beware that heap usage increases with OpenMP teams. See also arenas. */
566 static size_t gcn_kernel_heap_size
= DEFAULT_GCN_HEAP_SIZE
;
568 /* Flag to decide whether to print to stderr information about what is going on.
569 Set in init_debug depending on environment variables. */
573 /* Flag to decide if the runtime should suppress a possible fallback to host
576 static bool suppress_host_fallback
;
578 /* Flag to locate HSA runtime shared library that is dlopened
581 static const char *hsa_runtime_lib
;
583 /* Flag to decide if the runtime should support also CPU devices (can be
586 static bool support_cpu_devices
;
588 /* Runtime dimension overrides. Zero indicates default. */
590 static int override_x_dim
= 0;
591 static int override_z_dim
= 0;
594 /* {{{ Debug & Diagnostic */
596 /* Print a message to stderr if GCN_DEBUG value is set to true. */
598 #define DEBUG_PRINT(...) \
603 fprintf (stderr, __VA_ARGS__); \
608 /* Flush stderr if GCN_DEBUG value is set to true. */
610 #define DEBUG_FLUSH() \
616 /* Print a logging message with PREFIX to stderr if GCN_DEBUG value
619 #define DEBUG_LOG(prefix, ...) \
622 DEBUG_PRINT (prefix); \
623 DEBUG_PRINT (__VA_ARGS__); \
627 /* Print a debugging message to stderr. */
629 #define GCN_DEBUG(...) DEBUG_LOG ("GCN debug: ", __VA_ARGS__)
631 /* Print a warning message to stderr. */
633 #define GCN_WARNING(...) DEBUG_LOG ("GCN warning: ", __VA_ARGS__)
635 /* Print HSA warning STR with an HSA STATUS code. */
638 hsa_warn (const char *str
, hsa_status_t status
)
643 const char *hsa_error_msg
= "[unknown]";
644 hsa_fns
.hsa_status_string_fn (status
, &hsa_error_msg
);
646 fprintf (stderr
, "GCN warning: %s\nRuntime message: %s\n", str
,
650 /* Report a fatal error STR together with the HSA error corresponding to STATUS
651 and terminate execution of the current process. */
654 hsa_fatal (const char *str
, hsa_status_t status
)
656 const char *hsa_error_msg
= "[unknown]";
657 hsa_fns
.hsa_status_string_fn (status
, &hsa_error_msg
);
658 GOMP_PLUGIN_fatal ("GCN fatal error: %s\nRuntime message: %s\n", str
,
662 /* Like hsa_fatal, except only report error message, and return FALSE
663 for propagating error processing to outside of plugin. */
666 hsa_error (const char *str
, hsa_status_t status
)
668 const char *hsa_error_msg
= "[unknown]";
669 hsa_fns
.hsa_status_string_fn (status
, &hsa_error_msg
);
670 GOMP_PLUGIN_error ("GCN fatal error: %s\nRuntime message: %s\n", str
,
675 /* Dump information about the available hardware. */
678 dump_hsa_system_info (void)
682 hsa_endianness_t endianness
;
683 status
= hsa_fns
.hsa_system_get_info_fn (HSA_SYSTEM_INFO_ENDIANNESS
,
685 if (status
== HSA_STATUS_SUCCESS
)
688 case HSA_ENDIANNESS_LITTLE
:
689 GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: LITTLE\n");
691 case HSA_ENDIANNESS_BIG
:
692 GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: BIG\n");
695 GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: UNKNOWN\n");
698 GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: FAILED\n");
700 uint8_t extensions
[128];
701 status
= hsa_fns
.hsa_system_get_info_fn (HSA_SYSTEM_INFO_EXTENSIONS
,
703 if (status
== HSA_STATUS_SUCCESS
)
705 if (extensions
[0] & (1 << HSA_EXTENSION_IMAGES
))
706 GCN_DEBUG ("HSA_SYSTEM_INFO_EXTENSIONS: IMAGES\n");
709 GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n");
712 /* Dump information about the available hardware. */
715 dump_machine_model (hsa_machine_model_t machine_model
, const char *s
)
717 switch (machine_model
)
719 case HSA_MACHINE_MODEL_SMALL
:
720 GCN_DEBUG ("%s: SMALL\n", s
);
722 case HSA_MACHINE_MODEL_LARGE
:
723 GCN_DEBUG ("%s: LARGE\n", s
);
726 GCN_WARNING ("%s: UNKNOWN\n", s
);
731 /* Dump information about the available hardware. */
734 dump_profile (hsa_profile_t profile
, const char *s
)
738 case HSA_PROFILE_FULL
:
739 GCN_DEBUG ("%s: FULL\n", s
);
741 case HSA_PROFILE_BASE
:
742 GCN_DEBUG ("%s: BASE\n", s
);
745 GCN_WARNING ("%s: UNKNOWN\n", s
);
750 /* Dump information about a device memory region. */
753 dump_hsa_region (hsa_region_t region
, void *data
__attribute__((unused
)))
757 hsa_region_segment_t segment
;
758 status
= hsa_fns
.hsa_region_get_info_fn (region
, HSA_REGION_INFO_SEGMENT
,
760 if (status
== HSA_STATUS_SUCCESS
)
762 if (segment
== HSA_REGION_SEGMENT_GLOBAL
)
763 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GLOBAL\n");
764 else if (segment
== HSA_REGION_SEGMENT_READONLY
)
765 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: READONLY\n");
766 else if (segment
== HSA_REGION_SEGMENT_PRIVATE
)
767 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: PRIVATE\n");
768 else if (segment
== HSA_REGION_SEGMENT_GROUP
)
769 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GROUP\n");
771 GCN_WARNING ("HSA_REGION_INFO_SEGMENT: UNKNOWN\n");
774 GCN_WARNING ("HSA_REGION_INFO_SEGMENT: FAILED\n");
776 if (segment
== HSA_REGION_SEGMENT_GLOBAL
)
780 = hsa_fns
.hsa_region_get_info_fn (region
, HSA_REGION_INFO_GLOBAL_FLAGS
,
782 if (status
== HSA_STATUS_SUCCESS
)
784 if (flags
& HSA_REGION_GLOBAL_FLAG_KERNARG
)
785 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: KERNARG\n");
786 if (flags
& HSA_REGION_GLOBAL_FLAG_FINE_GRAINED
)
787 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: FINE_GRAINED\n");
788 if (flags
& HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED
)
789 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: COARSE_GRAINED\n");
792 GCN_WARNING ("HSA_REGION_INFO_GLOBAL_FLAGS: FAILED\n");
796 status
= hsa_fns
.hsa_region_get_info_fn (region
, HSA_REGION_INFO_SIZE
, &size
);
797 if (status
== HSA_STATUS_SUCCESS
)
798 GCN_DEBUG ("HSA_REGION_INFO_SIZE: %zu\n", size
);
800 GCN_WARNING ("HSA_REGION_INFO_SIZE: FAILED\n");
803 = hsa_fns
.hsa_region_get_info_fn (region
, HSA_REGION_INFO_ALLOC_MAX_SIZE
,
805 if (status
== HSA_STATUS_SUCCESS
)
806 GCN_DEBUG ("HSA_REGION_INFO_ALLOC_MAX_SIZE: %zu\n", size
);
808 GCN_WARNING ("HSA_REGION_INFO_ALLOC_MAX_SIZE: FAILED\n");
812 = hsa_fns
.hsa_region_get_info_fn (region
,
813 HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED
,
815 if (status
== HSA_STATUS_SUCCESS
)
816 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: %u\n", alloc_allowed
);
818 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: FAILED\n");
820 if (status
!= HSA_STATUS_SUCCESS
|| !alloc_allowed
)
821 return HSA_STATUS_SUCCESS
;
824 = hsa_fns
.hsa_region_get_info_fn (region
,
825 HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE
,
827 if (status
== HSA_STATUS_SUCCESS
)
828 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: %zu\n", size
);
830 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: FAILED\n");
834 = hsa_fns
.hsa_region_get_info_fn (region
,
835 HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT
,
837 if (status
== HSA_STATUS_SUCCESS
)
838 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: %zu\n", align
);
840 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: FAILED\n");
842 return HSA_STATUS_SUCCESS
;
845 /* Dump information about all the device memory regions. */
848 dump_hsa_regions (hsa_agent_t agent
)
851 status
= hsa_fns
.hsa_agent_iterate_regions_fn (agent
,
854 if (status
!= HSA_STATUS_SUCCESS
)
855 hsa_error ("Dumping hsa regions failed", status
);
858 /* Dump information about the available devices. */
861 dump_hsa_agent_info (hsa_agent_t agent
, void *data
__attribute__((unused
)))
866 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_NAME
,
868 if (status
== HSA_STATUS_SUCCESS
)
869 GCN_DEBUG ("HSA_AGENT_INFO_NAME: %s\n", buf
);
871 GCN_WARNING ("HSA_AGENT_INFO_NAME: FAILED\n");
873 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_VENDOR_NAME
,
875 if (status
== HSA_STATUS_SUCCESS
)
876 GCN_DEBUG ("HSA_AGENT_INFO_VENDOR_NAME: %s\n", buf
);
878 GCN_WARNING ("HSA_AGENT_INFO_VENDOR_NAME: FAILED\n");
880 hsa_machine_model_t machine_model
;
882 = hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_MACHINE_MODEL
,
884 if (status
== HSA_STATUS_SUCCESS
)
885 dump_machine_model (machine_model
, "HSA_AGENT_INFO_MACHINE_MODEL");
887 GCN_WARNING ("HSA_AGENT_INFO_MACHINE_MODEL: FAILED\n");
889 hsa_profile_t profile
;
890 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_PROFILE
,
892 if (status
== HSA_STATUS_SUCCESS
)
893 dump_profile (profile
, "HSA_AGENT_INFO_PROFILE");
895 GCN_WARNING ("HSA_AGENT_INFO_PROFILE: FAILED\n");
897 hsa_device_type_t device_type
;
898 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_DEVICE
,
900 if (status
== HSA_STATUS_SUCCESS
)
904 case HSA_DEVICE_TYPE_CPU
:
905 GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: CPU\n");
907 case HSA_DEVICE_TYPE_GPU
:
908 GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: GPU\n");
910 case HSA_DEVICE_TYPE_DSP
:
911 GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: DSP\n");
914 GCN_WARNING ("HSA_AGENT_INFO_DEVICE: UNKNOWN\n");
919 GCN_WARNING ("HSA_AGENT_INFO_DEVICE: FAILED\n");
922 status
= hsa_fns
.hsa_agent_get_info_fn
923 (agent
, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT
, &cu_count
);
924 if (status
== HSA_STATUS_SUCCESS
)
925 GCN_DEBUG ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: %u\n", cu_count
);
927 GCN_WARNING ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: FAILED\n");
930 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_WAVEFRONT_SIZE
,
932 if (status
== HSA_STATUS_SUCCESS
)
933 GCN_DEBUG ("HSA_AGENT_INFO_WAVEFRONT_SIZE: %u\n", size
);
935 GCN_WARNING ("HSA_AGENT_INFO_WAVEFRONT_SIZE: FAILED\n");
938 status
= hsa_fns
.hsa_agent_get_info_fn (agent
,
939 HSA_AGENT_INFO_WORKGROUP_MAX_DIM
,
941 if (status
== HSA_STATUS_SUCCESS
)
942 GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: %u\n", max_dim
);
944 GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: FAILED\n");
947 status
= hsa_fns
.hsa_agent_get_info_fn (agent
,
948 HSA_AGENT_INFO_WORKGROUP_MAX_SIZE
,
950 if (status
== HSA_STATUS_SUCCESS
)
951 GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: %u\n", max_size
);
953 GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: FAILED\n");
955 uint32_t grid_max_dim
;
956 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_GRID_MAX_DIM
,
958 if (status
== HSA_STATUS_SUCCESS
)
959 GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_DIM: %u\n", grid_max_dim
);
961 GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_DIM: FAILED\n");
963 uint32_t grid_max_size
;
964 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_GRID_MAX_SIZE
,
966 if (status
== HSA_STATUS_SUCCESS
)
967 GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_SIZE: %u\n", grid_max_size
);
969 GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_SIZE: FAILED\n");
971 dump_hsa_regions (agent
);
973 return HSA_STATUS_SUCCESS
;
976 /* Forward reference. */
978 static char *get_executable_symbol_name (hsa_executable_symbol_t symbol
);
980 /* Helper function for dump_executable_symbols. */
983 dump_executable_symbol (hsa_executable_t executable
,
984 hsa_executable_symbol_t symbol
,
985 void *data
__attribute__((unused
)))
987 char *name
= get_executable_symbol_name (symbol
);
991 GCN_DEBUG ("executable symbol: %s\n", name
);
995 return HSA_STATUS_SUCCESS
;
998 /* Dump all global symbol in an executable. */
1001 dump_executable_symbols (hsa_executable_t executable
)
1003 hsa_status_t status
;
1005 = hsa_fns
.hsa_executable_iterate_symbols_fn (executable
,
1006 dump_executable_symbol
,
1008 if (status
!= HSA_STATUS_SUCCESS
)
1009 hsa_fatal ("Could not dump HSA executable symbols", status
);
1012 /* Dump kernel DISPATCH data structure and indent it by INDENT spaces. */
1015 print_kernel_dispatch (struct kernel_dispatch
*dispatch
, unsigned indent
)
1017 struct kernargs
*kernargs
= (struct kernargs
*)dispatch
->kernarg_address
;
1019 fprintf (stderr
, "%*sthis: %p\n", indent
, "", dispatch
);
1020 fprintf (stderr
, "%*squeue: %p\n", indent
, "", dispatch
->queue
);
1021 fprintf (stderr
, "%*skernarg_address: %p\n", indent
, "", kernargs
);
1022 fprintf (stderr
, "%*sheap address: %p\n", indent
, "",
1023 (void*)kernargs
->heap_ptr
);
1024 fprintf (stderr
, "%*sarena address: %p\n", indent
, "",
1025 (void*)kernargs
->arena_ptr
);
1026 fprintf (stderr
, "%*sobject: %lu\n", indent
, "", dispatch
->object
);
1027 fprintf (stderr
, "%*sprivate_segment_size: %u\n", indent
, "",
1028 dispatch
->private_segment_size
);
1029 fprintf (stderr
, "%*sgroup_segment_size: %u\n", indent
, "",
1030 dispatch
->group_segment_size
);
1031 fprintf (stderr
, "\n");
1035 /* {{{ Utility functions */
1037 /* Cast the thread local storage to gcn_thread. */
1039 static inline struct gcn_thread
*
1042 return (struct gcn_thread
*) GOMP_PLUGIN_acc_thread ();
1045 /* Initialize debug and suppress_host_fallback according to the environment. */
1048 init_environment_variables (void)
1050 if (secure_getenv ("GCN_DEBUG"))
1055 if (secure_getenv ("GCN_SUPPRESS_HOST_FALLBACK"))
1056 suppress_host_fallback
= true;
1058 suppress_host_fallback
= false;
1060 hsa_runtime_lib
= secure_getenv ("HSA_RUNTIME_LIB");
1061 if (hsa_runtime_lib
== NULL
)
1062 hsa_runtime_lib
= "libhsa-runtime64.so.1";
1064 support_cpu_devices
= secure_getenv ("GCN_SUPPORT_CPU_DEVICES");
1066 const char *x
= secure_getenv ("GCN_NUM_TEAMS");
1068 x
= secure_getenv ("GCN_NUM_GANGS");
1070 override_x_dim
= atoi (x
);
1072 const char *z
= secure_getenv ("GCN_NUM_THREADS");
1074 z
= secure_getenv ("GCN_NUM_WORKERS");
1076 override_z_dim
= atoi (z
);
1078 const char *heap
= secure_getenv ("GCN_HEAP_SIZE");
1081 size_t tmp
= atol (heap
);
1083 gcn_kernel_heap_size
= tmp
;
1087 /* Return malloc'd string with name of SYMBOL. */
1090 get_executable_symbol_name (hsa_executable_symbol_t symbol
)
1092 hsa_status_t status
;
1095 const hsa_executable_symbol_info_t info_name_length
1096 = HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH
;
1098 status
= hsa_fns
.hsa_executable_symbol_get_info_fn (symbol
, info_name_length
,
1100 if (status
!= HSA_STATUS_SUCCESS
)
1102 hsa_error ("Could not get length of symbol name", status
);
1106 res
= GOMP_PLUGIN_malloc (len
+ 1);
1108 const hsa_executable_symbol_info_t info_name
1109 = HSA_EXECUTABLE_SYMBOL_INFO_NAME
;
1111 status
= hsa_fns
.hsa_executable_symbol_get_info_fn (symbol
, info_name
, res
);
1113 if (status
!= HSA_STATUS_SUCCESS
)
1115 hsa_error ("Could not get symbol name", status
);
1125 /* Get the number of GPU Compute Units. */
1128 get_cu_count (struct agent_info
*agent
)
1131 hsa_status_t status
= hsa_fns
.hsa_agent_get_info_fn
1132 (agent
->id
, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT
, &cu_count
);
1133 if (status
== HSA_STATUS_SUCCESS
)
1136 return 64; /* The usual number for older devices. */
1139 /* Calculate the maximum grid size for OMP threads / OACC workers.
1140 This depends on the kernel's resource usage levels. */
1143 limit_worker_threads (int threads
)
1145 /* FIXME Do something more intelligent here.
1146 GCN can always run 4 threads within a Compute Unit, but
1147 more than that depends on register usage. */
1153 /* This sets the maximum number of teams to twice the number of GPU Compute
1154 Units to avoid memory waste and corresponding memory access faults. */
1157 limit_teams (int teams
, struct agent_info
*agent
)
1159 int max_teams
= 2 * get_cu_count (agent
);
1160 if (teams
> max_teams
)
1165 /* Parse the target attributes INPUT provided by the compiler and return true
1166 if we should run anything at all. If INPUT is NULL, fill DEF with default
1167 values, then store INPUT or DEF into *RESULT.
1169 This is used for OpenMP only. */
1172 parse_target_attributes (void **input
,
1173 struct GOMP_kernel_launch_attributes
*def
,
1174 struct GOMP_kernel_launch_attributes
**result
,
1175 struct agent_info
*agent
)
1178 GOMP_PLUGIN_fatal ("No target arguments provided");
1180 bool grid_attrs_found
= false;
1181 bool gcn_dims_found
= false;
1183 int gcn_threads
= 0;
1186 intptr_t id
= (intptr_t) *input
++, val
;
1188 if (id
& GOMP_TARGET_ARG_SUBSEQUENT_PARAM
)
1189 val
= (intptr_t) *input
++;
1191 val
= id
>> GOMP_TARGET_ARG_VALUE_SHIFT
;
1193 val
= (val
> INT_MAX
) ? INT_MAX
: val
;
1195 if ((id
& GOMP_TARGET_ARG_DEVICE_MASK
) == GOMP_DEVICE_GCN
1196 && ((id
& GOMP_TARGET_ARG_ID_MASK
)
1197 == GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES
))
1199 grid_attrs_found
= true;
1202 else if ((id
& GOMP_TARGET_ARG_DEVICE_MASK
)
1203 == GOMP_TARGET_ARG_DEVICE_ALL
)
1205 gcn_dims_found
= true;
1206 switch (id
& GOMP_TARGET_ARG_ID_MASK
)
1208 case GOMP_TARGET_ARG_NUM_TEAMS
:
1209 gcn_teams
= limit_teams (val
, agent
);
1211 case GOMP_TARGET_ARG_THREAD_LIMIT
:
1212 gcn_threads
= limit_worker_threads (val
);
1222 bool gfx900_workaround_p
= false;
1224 if (agent
->device_isa
== EF_AMDGPU_MACH_AMDGCN_GFX900
1225 && gcn_threads
== 0 && override_z_dim
== 0)
1227 gfx900_workaround_p
= true;
1228 GCN_WARNING ("VEGA BUG WORKAROUND: reducing default number of "
1229 "threads to at most 4 per team.\n");
1230 GCN_WARNING (" - If this is not a Vega 10 device, please use "
1231 "GCN_NUM_THREADS=16\n");
1234 /* Ideally, when a dimension isn't explicitly specified, we should
1235 tune it to run 40 (or 32?) threads per CU with no threads getting queued.
1236 In practice, we tune for peak performance on BabelStream, which
1237 for OpenACC is currently 32 threads per CU. */
1239 if (gcn_teams
<= 0 && gcn_threads
<= 0)
1241 /* Set up a reasonable number of teams and threads. */
1242 gcn_threads
= gfx900_workaround_p
? 4 : 16; // 8;
1243 def
->gdims
[0] = get_cu_count (agent
); // * (40 / gcn_threads);
1244 def
->gdims
[2] = gcn_threads
;
1246 else if (gcn_teams
<= 0 && gcn_threads
> 0)
1248 /* Auto-scale the number of teams with the number of threads. */
1249 def
->gdims
[0] = get_cu_count (agent
); // * (40 / gcn_threads);
1250 def
->gdims
[2] = gcn_threads
;
1252 else if (gcn_teams
> 0 && gcn_threads
<= 0)
1254 int max_threads
= gfx900_workaround_p
? 4 : 16;
1256 /* Auto-scale the number of threads with the number of teams. */
1257 def
->gdims
[0] = gcn_teams
;
1258 def
->gdims
[2] = 16; // get_cu_count (agent) * 40 / gcn_teams;
1259 if (def
->gdims
[2] == 0)
1261 else if (def
->gdims
[2] > max_threads
)
1262 def
->gdims
[2] = max_threads
;
1266 def
->gdims
[0] = gcn_teams
;
1267 def
->gdims
[2] = gcn_threads
;
1269 def
->gdims
[1] = 64; /* Each thread is 64 work items wide. */
1270 def
->wdims
[0] = 1; /* Single team per work-group. */
1276 else if (!grid_attrs_found
)
1286 GCN_WARNING ("GOMP_OFFLOAD_run called with no launch attributes\n");
1290 struct GOMP_kernel_launch_attributes
*kla
;
1291 kla
= (struct GOMP_kernel_launch_attributes
*) *input
;
1293 if (kla
->ndim
== 0 || kla
->ndim
> 3)
1294 GOMP_PLUGIN_fatal ("Invalid number of dimensions (%u)", kla
->ndim
);
1296 GCN_DEBUG ("GOMP_OFFLOAD_run called with %u dimensions:\n", kla
->ndim
);
1298 for (i
= 0; i
< kla
->ndim
; i
++)
1300 GCN_DEBUG (" Dimension %u: grid size %u and group size %u\n", i
,
1301 kla
->gdims
[i
], kla
->wdims
[i
]);
1302 if (kla
->gdims
[i
] == 0)
/* Return the group size given the requested GROUP size, GRID size and number
   of grid dimensions NDIM.  A GROUP of zero selects a per-dimension default;
   the result is never larger than GRID.  */

static uint32_t
get_group_size (uint32_t ndim, uint32_t grid, uint32_t group)
{
  if (group == 0)
    {
      /* TODO: Provide a default via environment or device characteristics.  */
      if (ndim == 1)
	group = 64;
      else if (ndim == 2)
	group = 4;
      else
	group = 1;
    }
  if (group > grid)
    group = grid;
  return group;
}
/* Atomically store pair of uint16_t values (HEADER and REST) to a PACKET.
   The release ordering makes the rest of the packet visible to the device
   before the header that marks it ready.  */

static void
packet_store_release (uint32_t* packet, uint16_t header, uint16_t rest)
{
  uint32_t word = (uint32_t) header | ((uint32_t) rest << 16);
  __atomic_store_n (packet, word, __ATOMIC_RELEASE);
}
1338 /* A never-called callback for the HSA command queues. These signal events
1339 that we don't use, so we trigger an error.
1341 This "queue" is not to be confused with the async queues, below. */
1344 hsa_queue_callback (hsa_status_t status
,
1345 hsa_queue_t
*queue
__attribute__ ((unused
)),
1346 void *data
__attribute__ ((unused
)))
1348 hsa_fatal ("Asynchronous queue error", status
);
1352 /* {{{ HSA initialization */
1354 /* Populate hsa_fns with the function addresses from libhsa-runtime64.so. */
1357 init_hsa_runtime_functions (void)
1359 #define DLSYM_FN(function) \
1360 hsa_fns.function##_fn = dlsym (handle, #function); \
1361 if (hsa_fns.function##_fn == NULL) \
1363 void *handle
= dlopen (hsa_runtime_lib
, RTLD_LAZY
);
1367 DLSYM_FN (hsa_status_string
)
1368 DLSYM_FN (hsa_system_get_info
)
1369 DLSYM_FN (hsa_agent_get_info
)
1371 DLSYM_FN (hsa_iterate_agents
)
1372 DLSYM_FN (hsa_region_get_info
)
1373 DLSYM_FN (hsa_queue_create
)
1374 DLSYM_FN (hsa_agent_iterate_regions
)
1375 DLSYM_FN (hsa_executable_destroy
)
1376 DLSYM_FN (hsa_executable_create
)
1377 DLSYM_FN (hsa_executable_global_variable_define
)
1378 DLSYM_FN (hsa_executable_load_code_object
)
1379 DLSYM_FN (hsa_executable_freeze
)
1380 DLSYM_FN (hsa_signal_create
)
1381 DLSYM_FN (hsa_memory_allocate
)
1382 DLSYM_FN (hsa_memory_assign_agent
)
1383 DLSYM_FN (hsa_memory_copy
)
1384 DLSYM_FN (hsa_memory_free
)
1385 DLSYM_FN (hsa_signal_destroy
)
1386 DLSYM_FN (hsa_executable_get_symbol
)
1387 DLSYM_FN (hsa_executable_symbol_get_info
)
1388 DLSYM_FN (hsa_executable_iterate_symbols
)
1389 DLSYM_FN (hsa_queue_add_write_index_release
)
1390 DLSYM_FN (hsa_queue_load_read_index_acquire
)
1391 DLSYM_FN (hsa_signal_wait_acquire
)
1392 DLSYM_FN (hsa_signal_store_relaxed
)
1393 DLSYM_FN (hsa_signal_store_release
)
1394 DLSYM_FN (hsa_signal_load_acquire
)
1395 DLSYM_FN (hsa_queue_destroy
)
1396 DLSYM_FN (hsa_code_object_deserialize
)
1401 /* Return true if the agent is a GPU and can accept of concurrent submissions
1402 from different threads. */
1405 suitable_hsa_agent_p (hsa_agent_t agent
)
1407 hsa_device_type_t device_type
;
1409 = hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_DEVICE
,
1411 if (status
!= HSA_STATUS_SUCCESS
)
1414 switch (device_type
)
1416 case HSA_DEVICE_TYPE_GPU
:
1418 case HSA_DEVICE_TYPE_CPU
:
1419 if (!support_cpu_devices
)
1426 uint32_t features
= 0;
1427 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_FEATURE
,
1429 if (status
!= HSA_STATUS_SUCCESS
1430 || !(features
& HSA_AGENT_FEATURE_KERNEL_DISPATCH
))
1432 hsa_queue_type_t queue_type
;
1433 status
= hsa_fns
.hsa_agent_get_info_fn (agent
, HSA_AGENT_INFO_QUEUE_TYPE
,
1435 if (status
!= HSA_STATUS_SUCCESS
1436 || (queue_type
!= HSA_QUEUE_TYPE_MULTI
))
1442 /* Callback of hsa_iterate_agents; if AGENT is a GPU device, increment
1443 agent_count in hsa_context. */
1446 count_gpu_agents (hsa_agent_t agent
, void *data
__attribute__ ((unused
)))
1448 if (suitable_hsa_agent_p (agent
))
1449 hsa_context
.agent_count
++;
1450 return HSA_STATUS_SUCCESS
;
1453 /* Callback of hsa_iterate_agents; if AGENT is a GPU device, assign the agent
1454 id to the describing structure in the hsa context. The index of the
1455 structure is pointed to by DATA, increment it afterwards. */
1458 assign_agent_ids (hsa_agent_t agent
, void *data
)
1460 if (suitable_hsa_agent_p (agent
))
1462 int *agent_index
= (int *) data
;
1463 hsa_context
.agents
[*agent_index
].id
= agent
;
1466 return HSA_STATUS_SUCCESS
;
1469 /* Initialize hsa_context if it has not already been done.
1470 Return TRUE on success. */
1473 init_hsa_context (void)
1475 hsa_status_t status
;
1476 int agent_index
= 0;
1478 if (hsa_context
.initialized
)
1480 init_environment_variables ();
1481 if (!init_hsa_runtime_functions ())
1483 GCN_WARNING ("Run-time could not be dynamically opened\n");
1484 if (suppress_host_fallback
)
1485 GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
1488 status
= hsa_fns
.hsa_init_fn ();
1489 if (status
!= HSA_STATUS_SUCCESS
)
1490 return hsa_error ("Run-time could not be initialized", status
);
1491 GCN_DEBUG ("HSA run-time initialized for GCN\n");
1494 dump_hsa_system_info ();
1496 status
= hsa_fns
.hsa_iterate_agents_fn (count_gpu_agents
, NULL
);
1497 if (status
!= HSA_STATUS_SUCCESS
)
1498 return hsa_error ("GCN GPU devices could not be enumerated", status
);
1499 GCN_DEBUG ("There are %i GCN GPU devices.\n", hsa_context
.agent_count
);
1502 = GOMP_PLUGIN_malloc_cleared (hsa_context
.agent_count
1503 * sizeof (struct agent_info
));
1504 status
= hsa_fns
.hsa_iterate_agents_fn (assign_agent_ids
, &agent_index
);
1505 if (status
!= HSA_STATUS_SUCCESS
)
1506 return hsa_error ("Scanning compute agents failed", status
);
1507 if (agent_index
!= hsa_context
.agent_count
)
1509 GOMP_PLUGIN_error ("Failed to assign IDs to all GCN agents");
1515 status
= hsa_fns
.hsa_iterate_agents_fn (dump_hsa_agent_info
, NULL
);
1516 if (status
!= HSA_STATUS_SUCCESS
)
1517 GOMP_PLUGIN_error ("Failed to list all HSA runtime agents");
1520 uint16_t minor
, major
;
1521 status
= hsa_fns
.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MINOR
,
1523 if (status
!= HSA_STATUS_SUCCESS
)
1524 GOMP_PLUGIN_error ("Failed to obtain HSA runtime minor version");
1525 status
= hsa_fns
.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MAJOR
,
1527 if (status
!= HSA_STATUS_SUCCESS
)
1528 GOMP_PLUGIN_error ("Failed to obtain HSA runtime major version");
1530 size_t len
= sizeof hsa_context
.driver_version_s
;
1531 int printed
= snprintf (hsa_context
.driver_version_s
, len
,
1532 "HSA Runtime %hu.%hu", (unsigned short int)major
,
1533 (unsigned short int)minor
);
1535 GCN_WARNING ("HSA runtime version string was truncated."
1536 "Version %hu.%hu is too long.", (unsigned short int)major
,
1537 (unsigned short int)minor
);
1539 hsa_context
.initialized
= true;
1543 /* Verify that hsa_context has already been initialized and return the
1544 agent_info structure describing device number N. Return NULL on error. */
1546 static struct agent_info
*
1547 get_agent_info (int n
)
1549 if (!hsa_context
.initialized
)
1551 GOMP_PLUGIN_error ("Attempt to use uninitialized GCN context.");
1554 if (n
>= hsa_context
.agent_count
)
1556 GOMP_PLUGIN_error ("Request to operate on non-existent GCN device %i", n
);
1559 if (!hsa_context
.agents
[n
].initialized
)
1561 GOMP_PLUGIN_error ("Attempt to use an uninitialized GCN agent.");
1564 return &hsa_context
.agents
[n
];
1567 /* Callback of hsa_agent_iterate_regions, via get_*_memory_region functions.
1569 Selects (breaks at) a suitable region of type KIND. */
1572 get_memory_region (hsa_region_t region
, hsa_region_t
*retval
,
1573 hsa_region_global_flag_t kind
)
1575 hsa_status_t status
;
1576 hsa_region_segment_t segment
;
1578 status
= hsa_fns
.hsa_region_get_info_fn (region
, HSA_REGION_INFO_SEGMENT
,
1580 if (status
!= HSA_STATUS_SUCCESS
)
1582 if (segment
!= HSA_REGION_SEGMENT_GLOBAL
)
1583 return HSA_STATUS_SUCCESS
;
1586 status
= hsa_fns
.hsa_region_get_info_fn (region
, HSA_REGION_INFO_GLOBAL_FLAGS
,
1588 if (status
!= HSA_STATUS_SUCCESS
)
1593 return HSA_STATUS_INFO_BREAK
;
1595 return HSA_STATUS_SUCCESS
;
1598 /* Callback of hsa_agent_iterate_regions.
1600 Selects a kernargs memory region. */
1603 get_kernarg_memory_region (hsa_region_t region
, void *data
)
1605 return get_memory_region (region
, (hsa_region_t
*)data
,
1606 HSA_REGION_GLOBAL_FLAG_KERNARG
);
1609 /* Callback of hsa_agent_iterate_regions.
1611 Selects a coarse-grained memory region suitable for the heap and
1615 get_data_memory_region (hsa_region_t region
, void *data
)
1617 return get_memory_region (region
, (hsa_region_t
*)data
,
1618 HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED
);
1622 elf_gcn_isa_field (Elf64_Ehdr
*image
)
1624 return image
->e_flags
& EF_AMDGPU_MACH_MASK
;
/* HSA/GCC names of the supported ISAs.  All names share the same length,
   recorded in gcn_isa_name_len, so prefix comparisons suffice.
   ("static const" is the conventional specifier order.)  */
static const char *gcn_gfx803_s = "gfx803";
static const char *gcn_gfx900_s = "gfx900";
static const char *gcn_gfx906_s = "gfx906";
static const char *gcn_gfx908_s = "gfx908";
static const char *gcn_gfx90a_s = "gfx90a";
static const int gcn_isa_name_len = 6;
1634 /* Returns the name that the HSA runtime uses for the ISA or NULL if we do not
1638 isa_hsa_name (int isa
) {
1641 case EF_AMDGPU_MACH_AMDGCN_GFX803
:
1642 return gcn_gfx803_s
;
1643 case EF_AMDGPU_MACH_AMDGCN_GFX900
:
1644 return gcn_gfx900_s
;
1645 case EF_AMDGPU_MACH_AMDGCN_GFX906
:
1646 return gcn_gfx906_s
;
1647 case EF_AMDGPU_MACH_AMDGCN_GFX908
:
1648 return gcn_gfx908_s
;
1649 case EF_AMDGPU_MACH_AMDGCN_GFX90a
:
1650 return gcn_gfx90a_s
;
1655 /* Returns the user-facing name that GCC uses to identify the architecture (e.g.
1656 with -march) or NULL if we do not support the ISA.
1657 Keep in sync with /gcc/config/gcn/gcn.{c,opt}. */
1660 isa_gcc_name (int isa
) {
1663 case EF_AMDGPU_MACH_AMDGCN_GFX803
:
1666 return isa_hsa_name (isa
);
1670 /* Returns the code which is used in the GCN object code to identify the ISA with
1671 the given name (as used by the HSA runtime). */
1674 isa_code(const char *isa
) {
1675 if (!strncmp (isa
, gcn_gfx803_s
, gcn_isa_name_len
))
1676 return EF_AMDGPU_MACH_AMDGCN_GFX803
;
1678 if (!strncmp (isa
, gcn_gfx900_s
, gcn_isa_name_len
))
1679 return EF_AMDGPU_MACH_AMDGCN_GFX900
;
1681 if (!strncmp (isa
, gcn_gfx906_s
, gcn_isa_name_len
))
1682 return EF_AMDGPU_MACH_AMDGCN_GFX906
;
1684 if (!strncmp (isa
, gcn_gfx908_s
, gcn_isa_name_len
))
1685 return EF_AMDGPU_MACH_AMDGCN_GFX908
;
1687 if (!strncmp (isa
, gcn_gfx90a_s
, gcn_isa_name_len
))
1688 return EF_AMDGPU_MACH_AMDGCN_GFX90a
;
1696 /* Create or reuse a team arena.
1698 Team arenas are used by OpenMP to avoid calling malloc multiple times
1699 while setting up each team. This is purely a performance optimization.
1701 Allocating an arena also costs performance, albeit on the host side, so
1702 this function will reuse an existing arena if a large enough one is idle.
1703 The arena is released, but not deallocated, when the kernel exits. */
1706 get_team_arena (struct agent_info
*agent
, int num_teams
)
1708 struct team_arena_list
**next_ptr
= &agent
->team_arena_list
;
1709 struct team_arena_list
*item
;
1711 for (item
= *next_ptr
; item
; next_ptr
= &item
->next
, item
= item
->next
)
1713 if (item
->num_teams
< num_teams
)
1716 if (pthread_mutex_trylock (&item
->in_use
))
1722 GCN_DEBUG ("Creating a new arena for %d teams\n", num_teams
);
1724 if (pthread_mutex_lock (&agent
->team_arena_write_lock
))
1726 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
1729 item
= malloc (sizeof (*item
));
1730 item
->num_teams
= num_teams
;
1734 if (pthread_mutex_init (&item
->in_use
, NULL
))
1736 GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
1739 if (pthread_mutex_lock (&item
->in_use
))
1741 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
1744 if (pthread_mutex_unlock (&agent
->team_arena_write_lock
))
1746 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1750 const int TEAM_ARENA_SIZE
= 64*1024; /* Must match libgomp.h. */
1751 hsa_status_t status
;
1752 status
= hsa_fns
.hsa_memory_allocate_fn (agent
->data_region
,
1753 TEAM_ARENA_SIZE
*num_teams
,
1755 if (status
!= HSA_STATUS_SUCCESS
)
1756 hsa_fatal ("Could not allocate memory for GCN kernel arena", status
);
1757 status
= hsa_fns
.hsa_memory_assign_agent_fn (item
->arena
, agent
->id
,
1758 HSA_ACCESS_PERMISSION_RW
);
1759 if (status
!= HSA_STATUS_SUCCESS
)
1760 hsa_fatal ("Could not assign arena memory to device", status
);
1765 /* Mark a team arena available for reuse. */
1768 release_team_arena (struct agent_info
* agent
, void *arena
)
1770 struct team_arena_list
*item
;
1772 for (item
= agent
->team_arena_list
; item
; item
= item
->next
)
1774 if (item
->arena
== arena
)
1776 if (pthread_mutex_unlock (&item
->in_use
))
1777 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1781 GOMP_PLUGIN_error ("Could not find a GCN arena to release.");
1784 /* Clean up all the allocated team arenas. */
1787 destroy_team_arenas (struct agent_info
*agent
)
1789 struct team_arena_list
*item
, *next
;
1791 for (item
= agent
->team_arena_list
; item
; item
= next
)
1794 hsa_fns
.hsa_memory_free_fn (item
->arena
);
1795 if (pthread_mutex_destroy (&item
->in_use
))
1797 GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex");
1802 agent
->team_arena_list
= NULL
;
1807 /* Allocate memory on a specified device. */
1810 alloc_by_agent (struct agent_info
*agent
, size_t size
)
1812 GCN_DEBUG ("Allocating %zu bytes on device %d\n", size
, agent
->device_id
);
1814 /* Zero-size allocations are invalid, so in order to return a valid pointer
1815 we need to pass a valid size. One source of zero-size allocations is
1816 kernargs for kernels that have no inputs or outputs (the kernel may
1817 only use console output, for example). */
1822 hsa_status_t status
= hsa_fns
.hsa_memory_allocate_fn (agent
->data_region
,
1824 if (status
!= HSA_STATUS_SUCCESS
)
1826 hsa_error ("Could not allocate device memory", status
);
1830 status
= hsa_fns
.hsa_memory_assign_agent_fn (ptr
, agent
->id
,
1831 HSA_ACCESS_PERMISSION_RW
);
1832 if (status
!= HSA_STATUS_SUCCESS
)
1834 hsa_error ("Could not assign data memory to device", status
);
1838 struct goacc_thread
*thr
= GOMP_PLUGIN_goacc_thread ();
1839 bool profiling_dispatch_p
1840 = __builtin_expect (thr
!= NULL
&& thr
->prof_info
!= NULL
, false);
1841 if (profiling_dispatch_p
)
1843 acc_prof_info
*prof_info
= thr
->prof_info
;
1844 acc_event_info data_event_info
;
1845 acc_api_info
*api_info
= thr
->api_info
;
1847 prof_info
->event_type
= acc_ev_alloc
;
1849 data_event_info
.data_event
.event_type
= prof_info
->event_type
;
1850 data_event_info
.data_event
.valid_bytes
1851 = _ACC_DATA_EVENT_INFO_VALID_BYTES
;
1852 data_event_info
.data_event
.parent_construct
1853 = acc_construct_parallel
;
1854 data_event_info
.data_event
.implicit
= 1;
1855 data_event_info
.data_event
.tool_info
= NULL
;
1856 data_event_info
.data_event
.var_name
= NULL
;
1857 data_event_info
.data_event
.bytes
= size
;
1858 data_event_info
.data_event
.host_ptr
= NULL
;
1859 data_event_info
.data_event
.device_ptr
= (void *) ptr
;
1861 api_info
->device_api
= acc_device_api_other
;
1863 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
, &data_event_info
,
1870 /* Create kernel dispatch data structure for given KERNEL, along with
1871 the necessary device signals and memory allocations. */
1873 static struct kernel_dispatch
*
1874 create_kernel_dispatch (struct kernel_info
*kernel
, int num_teams
)
1876 struct agent_info
*agent
= kernel
->agent
;
1877 struct kernel_dispatch
*shadow
1878 = GOMP_PLUGIN_malloc_cleared (sizeof (struct kernel_dispatch
));
1880 shadow
->agent
= kernel
->agent
;
1881 shadow
->object
= kernel
->object
;
1883 hsa_signal_t sync_signal
;
1884 hsa_status_t status
= hsa_fns
.hsa_signal_create_fn (1, 0, NULL
, &sync_signal
);
1885 if (status
!= HSA_STATUS_SUCCESS
)
1886 hsa_fatal ("Error creating the GCN sync signal", status
);
1888 shadow
->signal
= sync_signal
.handle
;
1889 shadow
->private_segment_size
= kernel
->private_segment_size
;
1890 shadow
->group_segment_size
= kernel
->group_segment_size
;
1892 /* We expect kernels to request a single pointer, explicitly, and the
1893 rest of struct kernargs, implicitly. If they request anything else
1894 then something is wrong. */
1895 if (kernel
->kernarg_segment_size
> 8)
1897 GOMP_PLUGIN_fatal ("Unexpectedly large kernargs segment requested");
1901 status
= hsa_fns
.hsa_memory_allocate_fn (agent
->kernarg_region
,
1902 sizeof (struct kernargs
),
1903 &shadow
->kernarg_address
);
1904 if (status
!= HSA_STATUS_SUCCESS
)
1905 hsa_fatal ("Could not allocate memory for GCN kernel arguments", status
);
1906 struct kernargs
*kernargs
= shadow
->kernarg_address
;
1908 /* Zero-initialize the output_data (minimum needed). */
1909 kernargs
->out_ptr
= (int64_t)&kernargs
->output_data
;
1910 kernargs
->output_data
.next_output
= 0;
1911 for (unsigned i
= 0;
1912 i
< (sizeof (kernargs
->output_data
.queue
)
1913 / sizeof (kernargs
->output_data
.queue
[0]));
1915 kernargs
->output_data
.queue
[i
].written
= 0;
1916 kernargs
->output_data
.consumed
= 0;
1918 /* Pass in the heap location. */
1919 kernargs
->heap_ptr
= (int64_t)kernel
->module
->heap
;
1921 /* Create an arena. */
1922 if (kernel
->kind
== KIND_OPENMP
)
1923 kernargs
->arena_ptr
= (int64_t)get_team_arena (agent
, num_teams
);
1925 kernargs
->arena_ptr
= 0;
1927 /* Ensure we can recognize unset return values. */
1928 kernargs
->output_data
.return_value
= 0xcafe0000;
1934 process_reverse_offload (uint64_t fn
, uint64_t mapnum
, uint64_t hostaddrs
,
1935 uint64_t sizes
, uint64_t kinds
, uint64_t dev_num64
)
1937 int dev_num
= dev_num64
;
1938 GOMP_PLUGIN_target_rev (fn
, mapnum
, hostaddrs
, sizes
, kinds
, dev_num
,
1942 /* Output any data written to console output from the kernel. It is expected
1943 that this function is polled during kernel execution.
1945 We print all entries from the last item printed to the next entry without
1946 a "written" flag. If the "final" flag is set then it'll continue right to
1949 The print buffer is circular, but the from and to locations don't wrap when
1950 the buffer does, so the output limit is UINT_MAX. The target blocks on
1951 output when the buffer is full. */
1954 console_output (struct kernel_info
*kernel
, struct kernargs
*kernargs
,
1957 unsigned int limit
= (sizeof (kernargs
->output_data
.queue
)
1958 / sizeof (kernargs
->output_data
.queue
[0]));
1960 unsigned int from
= __atomic_load_n (&kernargs
->output_data
.consumed
,
1962 unsigned int to
= kernargs
->output_data
.next_output
;
1968 printf ("GCN print buffer overflowed.\n");
1973 for (i
= from
; i
< to
; i
++)
1975 struct printf_data
*data
= &kernargs
->output_data
.queue
[i
%limit
];
1977 if (!data
->written
&& !final
)
1982 case 0: printf ("%.128s%ld\n", data
->msg
, data
->ivalue
); break;
1983 case 1: printf ("%.128s%f\n", data
->msg
, data
->dvalue
); break;
1984 case 2: printf ("%.128s%.128s\n", data
->msg
, data
->text
); break;
1985 case 3: printf ("%.128s%.128s", data
->msg
, data
->text
); break;
1987 process_reverse_offload (data
->value_u64
[0], data
->value_u64
[1],
1988 data
->value_u64
[2], data
->value_u64
[3],
1989 data
->value_u64
[4], data
->value_u64
[5]);
1991 default: printf ("GCN print buffer error!\n"); break;
1994 __atomic_store_n (&kernargs
->output_data
.consumed
, i
+1,
2000 /* Release data structure created for a kernel dispatch in SHADOW argument,
2001 and clean up the signal and memory allocations. */
2004 release_kernel_dispatch (struct kernel_dispatch
*shadow
)
2006 GCN_DEBUG ("Released kernel dispatch: %p\n", shadow
);
2008 struct kernargs
*kernargs
= shadow
->kernarg_address
;
2009 void *arena
= (void *)kernargs
->arena_ptr
;
2011 release_team_arena (shadow
->agent
, arena
);
2013 hsa_fns
.hsa_memory_free_fn (shadow
->kernarg_address
);
2016 s
.handle
= shadow
->signal
;
2017 hsa_fns
.hsa_signal_destroy_fn (s
);
2022 /* Extract the properties from a kernel binary. */
2025 init_kernel_properties (struct kernel_info
*kernel
)
2027 hsa_status_t status
;
2028 struct agent_info
*agent
= kernel
->agent
;
2029 hsa_executable_symbol_t kernel_symbol
;
2030 char *buf
= alloca (strlen (kernel
->name
) + 4);
2031 sprintf (buf
, "%s.kd", kernel
->name
);
2032 status
= hsa_fns
.hsa_executable_get_symbol_fn (agent
->executable
, NULL
,
2035 if (status
!= HSA_STATUS_SUCCESS
)
2037 hsa_warn ("Could not find symbol for kernel in the code object", status
);
2038 fprintf (stderr
, "not found name: '%s'\n", buf
);
2039 dump_executable_symbols (agent
->executable
);
2042 GCN_DEBUG ("Located kernel %s\n", kernel
->name
);
2043 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
2044 (kernel_symbol
, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT
, &kernel
->object
);
2045 if (status
!= HSA_STATUS_SUCCESS
)
2046 hsa_fatal ("Could not extract a kernel object from its symbol", status
);
2047 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
2048 (kernel_symbol
, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE
,
2049 &kernel
->kernarg_segment_size
);
2050 if (status
!= HSA_STATUS_SUCCESS
)
2051 hsa_fatal ("Could not get info about kernel argument size", status
);
2052 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
2053 (kernel_symbol
, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE
,
2054 &kernel
->group_segment_size
);
2055 if (status
!= HSA_STATUS_SUCCESS
)
2056 hsa_fatal ("Could not get info about kernel group segment size", status
);
2057 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
2058 (kernel_symbol
, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE
,
2059 &kernel
->private_segment_size
);
2060 if (status
!= HSA_STATUS_SUCCESS
)
2061 hsa_fatal ("Could not get info about kernel private segment size",
2064 /* The kernel type is not known until something tries to launch it. */
2065 kernel
->kind
= KIND_UNKNOWN
;
2067 GCN_DEBUG ("Kernel structure for %s fully initialized with "
2068 "following segment sizes: \n", kernel
->name
);
2069 GCN_DEBUG (" group_segment_size: %u\n",
2070 (unsigned) kernel
->group_segment_size
);
2071 GCN_DEBUG (" private_segment_size: %u\n",
2072 (unsigned) kernel
->private_segment_size
);
2073 GCN_DEBUG (" kernarg_segment_size: %u\n",
2074 (unsigned) kernel
->kernarg_segment_size
);
2078 kernel
->initialization_failed
= true;
2081 /* Do all the work that is necessary before running KERNEL for the first time.
2082 The function assumes the program has been created, finalized and frozen by
2083 create_and_finalize_hsa_program. */
2086 init_kernel (struct kernel_info
*kernel
)
2088 if (pthread_mutex_lock (&kernel
->init_mutex
))
2089 GOMP_PLUGIN_fatal ("Could not lock a GCN kernel initialization mutex");
2090 if (kernel
->initialized
)
2092 if (pthread_mutex_unlock (&kernel
->init_mutex
))
2093 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2099 init_kernel_properties (kernel
);
2101 if (!kernel
->initialization_failed
)
2105 kernel
->initialized
= true;
2107 if (pthread_mutex_unlock (&kernel
->init_mutex
))
2108 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2112 /* Run KERNEL on its agent, pass VARS to it as arguments and take
2113 launch attributes from KLA.
2115 MODULE_LOCKED indicates that the caller already holds the lock and
2116 run_kernel need not lock it again.
2117 If AQ is NULL then agent->sync_queue will be used. */
2120 run_kernel (struct kernel_info
*kernel
, void *vars
,
2121 struct GOMP_kernel_launch_attributes
*kla
,
2122 struct goacc_asyncqueue
*aq
, bool module_locked
)
2124 GCN_DEBUG ("SGPRs: %d, VGPRs: %d\n", kernel
->description
->sgpr_count
,
2125 kernel
->description
->vpgr_count
);
2127 /* Reduce the number of threads/workers if there are insufficient
2128 VGPRs available to run the kernels together. */
2129 if (kla
->ndim
== 3 && kernel
->description
->vpgr_count
> 0)
2131 int granulated_vgprs
= (kernel
->description
->vpgr_count
+ 3) & ~3;
2132 int max_threads
= (256 / granulated_vgprs
) * 4;
2133 if (kla
->gdims
[2] > max_threads
)
2135 GCN_WARNING ("Too many VGPRs required to support %d threads/workers"
2136 " per team/gang - reducing to %d threads/workers.\n",
2137 kla
->gdims
[2], max_threads
);
2138 kla
->gdims
[2] = max_threads
;
2142 GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel
->agent
->device_id
,
2144 GCN_DEBUG ("GCN launch attribs: gdims:[");
2146 for (i
= 0; i
< kla
->ndim
; ++i
)
2150 DEBUG_PRINT ("%u", kla
->gdims
[i
]);
2152 DEBUG_PRINT ("], normalized gdims:[");
2153 for (i
= 0; i
< kla
->ndim
; ++i
)
2157 DEBUG_PRINT ("%u", kla
->gdims
[i
] / kla
->wdims
[i
]);
2159 DEBUG_PRINT ("], wdims:[");
2160 for (i
= 0; i
< kla
->ndim
; ++i
)
2164 DEBUG_PRINT ("%u", kla
->wdims
[i
]);
2166 DEBUG_PRINT ("]\n");
2169 struct agent_info
*agent
= kernel
->agent
;
2170 if (!module_locked
&& pthread_rwlock_rdlock (&agent
->module_rwlock
))
2171 GOMP_PLUGIN_fatal ("Unable to read-lock a GCN agent rwlock");
2173 if (!agent
->initialized
)
2174 GOMP_PLUGIN_fatal ("Agent must be initialized");
2176 if (!kernel
->initialized
)
2177 GOMP_PLUGIN_fatal ("Called kernel must be initialized");
2179 hsa_queue_t
*command_q
= (aq
? aq
->hsa_queue
: kernel
->agent
->sync_queue
);
2182 = hsa_fns
.hsa_queue_add_write_index_release_fn (command_q
, 1);
2183 GCN_DEBUG ("Got AQL index %llu\n", (long long int) index
);
2185 /* Wait until the queue is not full before writing the packet. */
2186 while (index
- hsa_fns
.hsa_queue_load_read_index_acquire_fn (command_q
)
2190 /* Do not allow the dimensions to be overridden when running
2191 constructors or destructors. */
2192 int override_x
= kernel
->kind
== KIND_UNKNOWN
? 0 : override_x_dim
;
2193 int override_z
= kernel
->kind
== KIND_UNKNOWN
? 0 : override_z_dim
;
2195 hsa_kernel_dispatch_packet_t
*packet
;
2196 packet
= ((hsa_kernel_dispatch_packet_t
*) command_q
->base_address
)
2197 + index
% command_q
->size
;
2199 memset (((uint8_t *) packet
) + 4, 0, sizeof (*packet
) - 4);
2200 packet
->grid_size_x
= override_x
? : kla
->gdims
[0];
2201 packet
->workgroup_size_x
= get_group_size (kla
->ndim
,
2202 packet
->grid_size_x
,
2207 packet
->grid_size_y
= kla
->gdims
[1];
2208 packet
->workgroup_size_y
= get_group_size (kla
->ndim
, kla
->gdims
[1],
2213 packet
->grid_size_y
= 1;
2214 packet
->workgroup_size_y
= 1;
2219 packet
->grid_size_z
= limit_worker_threads (override_z
2221 packet
->workgroup_size_z
= get_group_size (kla
->ndim
,
2222 packet
->grid_size_z
,
2227 packet
->grid_size_z
= 1;
2228 packet
->workgroup_size_z
= 1;
2231 GCN_DEBUG ("GCN launch actuals: grid:[%u, %u, %u],"
2232 " normalized grid:[%u, %u, %u], workgroup:[%u, %u, %u]\n",
2233 packet
->grid_size_x
, packet
->grid_size_y
, packet
->grid_size_z
,
2234 packet
->grid_size_x
/ packet
->workgroup_size_x
,
2235 packet
->grid_size_y
/ packet
->workgroup_size_y
,
2236 packet
->grid_size_z
/ packet
->workgroup_size_z
,
2237 packet
->workgroup_size_x
, packet
->workgroup_size_y
,
2238 packet
->workgroup_size_z
);
2240 struct kernel_dispatch
*shadow
2241 = create_kernel_dispatch (kernel
, packet
->grid_size_x
);
2242 shadow
->queue
= command_q
;
2246 fprintf (stderr
, "\nKernel has following dependencies:\n");
2247 print_kernel_dispatch (shadow
, 2);
2250 packet
->private_segment_size
= kernel
->private_segment_size
;
2251 packet
->group_segment_size
= kernel
->group_segment_size
;
2252 packet
->kernel_object
= kernel
->object
;
2253 packet
->kernarg_address
= shadow
->kernarg_address
;
2255 s
.handle
= shadow
->signal
;
2256 packet
->completion_signal
= s
;
2257 hsa_fns
.hsa_signal_store_relaxed_fn (s
, 1);
2258 memcpy (shadow
->kernarg_address
, &vars
, sizeof (vars
));
2260 GCN_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");
2263 header
= HSA_PACKET_TYPE_KERNEL_DISPATCH
<< HSA_PACKET_HEADER_TYPE
;
2264 header
|= HSA_FENCE_SCOPE_SYSTEM
<< HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE
;
2265 header
|= HSA_FENCE_SCOPE_SYSTEM
<< HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE
;
2267 GCN_DEBUG ("Going to dispatch kernel %s on device %d\n", kernel
->name
,
2270 packet_store_release ((uint32_t *) packet
, header
,
2271 (uint16_t) kla
->ndim
2272 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS
);
2274 hsa_fns
.hsa_signal_store_release_fn (command_q
->doorbell_signal
,
2277 GCN_DEBUG ("Kernel dispatched, waiting for completion\n");
2279 /* Root signal waits with 1ms timeout. */
2280 while (hsa_fns
.hsa_signal_wait_acquire_fn (s
, HSA_SIGNAL_CONDITION_LT
, 1,
2282 HSA_WAIT_STATE_BLOCKED
) != 0)
2284 console_output (kernel
, shadow
->kernarg_address
, false);
2286 console_output (kernel
, shadow
->kernarg_address
, true);
2288 struct kernargs
*kernargs
= shadow
->kernarg_address
;
2289 unsigned int return_value
= (unsigned int)kernargs
->output_data
.return_value
;
2291 release_kernel_dispatch (shadow
);
2293 if (!module_locked
&& pthread_rwlock_unlock (&agent
->module_rwlock
))
2294 GOMP_PLUGIN_fatal ("Unable to unlock a GCN agent rwlock");
2296 unsigned int upper
= (return_value
& ~0xffff) >> 16;
2297 if (upper
== 0xcafe)
2298 ; // exit not called, normal termination.
2299 else if (upper
== 0xffff)
2303 GOMP_PLUGIN_error ("Possible kernel exit value corruption, 2 most"
2304 " significant bytes aren't 0xffff or 0xcafe: 0x%x\n",
2309 if (upper
== 0xffff)
2311 unsigned int signal
= (return_value
>> 8) & 0xff;
2313 if (signal
== SIGABRT
)
2315 GCN_WARNING ("GCN Kernel aborted\n");
2318 else if (signal
!= 0)
2320 GCN_WARNING ("GCN Kernel received unknown signal\n");
2324 GCN_DEBUG ("GCN Kernel exited with value: %d\n", return_value
& 0xff);
2325 exit (return_value
& 0xff);
2330 /* {{{ Load/Unload */
2332 /* Initialize KERNEL from D and other parameters. Return true on success. */
2335 init_basic_kernel_info (struct kernel_info
*kernel
,
2336 struct hsa_kernel_description
*d
,
2337 struct agent_info
*agent
,
2338 struct module_info
*module
)
2340 kernel
->agent
= agent
;
2341 kernel
->module
= module
;
2342 kernel
->name
= d
->name
;
2343 kernel
->description
= d
;
2344 if (pthread_mutex_init (&kernel
->init_mutex
, NULL
))
2346 GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");
2352 /* Check that the GCN ISA of the given image matches the ISA of the agent. */
2355 isa_matches_agent (struct agent_info
*agent
, Elf64_Ehdr
*image
)
2357 int isa_field
= elf_gcn_isa_field (image
);
2358 const char* isa_s
= isa_hsa_name (isa_field
);
2361 hsa_error ("Unsupported ISA in GCN code object.", HSA_STATUS_ERROR
);
2365 if (isa_field
!= agent
->device_isa
)
2368 const char *agent_isa_s
= isa_hsa_name (agent
->device_isa
);
2369 const char *agent_isa_gcc_s
= isa_gcc_name (agent
->device_isa
);
2370 assert (agent_isa_s
);
2371 assert (agent_isa_gcc_s
);
2373 snprintf (msg
, sizeof msg
,
2374 "GCN code object ISA '%s' does not match GPU ISA '%s'.\n"
2375 "Try to recompile with '-foffload-options=-march=%s'.\n",
2376 isa_s
, agent_isa_s
, agent_isa_gcc_s
);
2378 hsa_error (msg
, HSA_STATUS_ERROR
);
2385 /* Create and finalize the program consisting of all loaded modules. */
2388 create_and_finalize_hsa_program (struct agent_info
*agent
)
2390 hsa_status_t status
;
2392 if (pthread_mutex_lock (&agent
->prog_mutex
))
2394 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
2397 if (agent
->prog_finalized
)
2401 = hsa_fns
.hsa_executable_create_fn (HSA_PROFILE_FULL
,
2402 HSA_EXECUTABLE_STATE_UNFROZEN
,
2403 "", &agent
->executable
);
2404 if (status
!= HSA_STATUS_SUCCESS
)
2406 hsa_error ("Could not create GCN executable", status
);
2410 /* Load any GCN modules. */
2411 struct module_info
*module
= agent
->module
;
2414 Elf64_Ehdr
*image
= (Elf64_Ehdr
*)module
->image_desc
->gcn_image
->image
;
2416 if (!isa_matches_agent (agent
, image
))
2419 hsa_code_object_t co
= { 0 };
2420 status
= hsa_fns
.hsa_code_object_deserialize_fn
2421 (module
->image_desc
->gcn_image
->image
,
2422 module
->image_desc
->gcn_image
->size
,
2424 if (status
!= HSA_STATUS_SUCCESS
)
2426 hsa_error ("Could not deserialize GCN code object", status
);
2430 status
= hsa_fns
.hsa_executable_load_code_object_fn
2431 (agent
->executable
, agent
->id
, co
, "");
2432 if (status
!= HSA_STATUS_SUCCESS
)
2434 hsa_error ("Could not load GCN code object", status
);
2440 status
= hsa_fns
.hsa_memory_allocate_fn (agent
->data_region
,
2441 gcn_kernel_heap_size
,
2442 (void**)&module
->heap
);
2443 if (status
!= HSA_STATUS_SUCCESS
)
2445 hsa_error ("Could not allocate memory for GCN heap", status
);
2449 status
= hsa_fns
.hsa_memory_assign_agent_fn
2450 (module
->heap
, agent
->id
, HSA_ACCESS_PERMISSION_RW
);
2451 if (status
!= HSA_STATUS_SUCCESS
)
2453 hsa_error ("Could not assign GCN heap memory to device", status
);
2457 hsa_fns
.hsa_memory_copy_fn (&module
->heap
->size
,
2458 &gcn_kernel_heap_size
,
2459 sizeof (gcn_kernel_heap_size
));
2465 dump_executable_symbols (agent
->executable
);
2467 status
= hsa_fns
.hsa_executable_freeze_fn (agent
->executable
, "");
2468 if (status
!= HSA_STATUS_SUCCESS
)
2470 hsa_error ("Could not freeze the GCN executable", status
);
2475 agent
->prog_finalized
= true;
2477 if (pthread_mutex_unlock (&agent
->prog_mutex
))
2479 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
2490 /* Free the HSA program in agent and everything associated with it and set
2491 agent->prog_finalized and the initialized flags of all kernels to false.
2492 Return TRUE on success. */
2495 destroy_hsa_program (struct agent_info
*agent
)
2497 if (!agent
->prog_finalized
)
2500 hsa_status_t status
;
2502 GCN_DEBUG ("Destroying the current GCN program.\n");
2504 status
= hsa_fns
.hsa_executable_destroy_fn (agent
->executable
);
2505 if (status
!= HSA_STATUS_SUCCESS
)
2506 return hsa_error ("Could not destroy GCN executable", status
);
2511 for (i
= 0; i
< agent
->module
->kernel_count
; i
++)
2512 agent
->module
->kernels
[i
].initialized
= false;
2514 if (agent
->module
->heap
)
2516 hsa_fns
.hsa_memory_free_fn (agent
->module
->heap
);
2517 agent
->module
->heap
= NULL
;
2520 agent
->prog_finalized
= false;
2524 /* Deinitialize all information associated with MODULE and kernels within
2525 it. Return TRUE on success. */
2528 destroy_module (struct module_info
*module
, bool locked
)
2530 /* Run destructors before destroying module. */
2531 struct GOMP_kernel_launch_attributes kla
=
2535 /* Work-group size. */
2539 if (module
->fini_array_func
)
2541 init_kernel (module
->fini_array_func
);
2542 run_kernel (module
->fini_array_func
, NULL
, &kla
, NULL
, locked
);
2544 module
->constructors_run_p
= false;
2547 for (i
= 0; i
< module
->kernel_count
; i
++)
2548 if (pthread_mutex_destroy (&module
->kernels
[i
].init_mutex
))
2550 GOMP_PLUGIN_error ("Failed to destroy a GCN kernel initialization "
/* Execute a single queue entry of any type (kernel launch, callback,
   async wait, or placeholder).  */
2564 execute_queue_entry (struct goacc_asyncqueue
*aq
, int index
)
2566 struct queue_entry
*entry
= &aq
->queue
[index
];
2568 switch (entry
->type
)
2572 GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d)\n",
2573 aq
->agent
->device_id
, aq
->id
, index
);
2574 run_kernel (entry
->u
.launch
.kernel
,
2575 entry
->u
.launch
.vars
,
2576 &entry
->u
.launch
.kla
, aq
, false);
2578 GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d) done\n",
2579 aq
->agent
->device_id
, aq
->id
, index
);
2584 GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d)\n",
2585 aq
->agent
->device_id
, aq
->id
, index
);
2586 entry
->u
.callback
.fn (entry
->u
.callback
.data
);
2588 GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d) done\n",
2589 aq
->agent
->device_id
, aq
->id
, index
);
2594 /* FIXME: is it safe to access a placeholder that may already have
2596 struct placeholder
*placeholderp
= entry
->u
.asyncwait
.placeholderp
;
2599 GCN_DEBUG ("Async thread %d:%d: Executing async wait entry (%d)\n",
2600 aq
->agent
->device_id
, aq
->id
, index
);
2602 pthread_mutex_lock (&placeholderp
->mutex
);
2604 while (!placeholderp
->executed
)
2605 pthread_cond_wait (&placeholderp
->cond
, &placeholderp
->mutex
);
2607 pthread_mutex_unlock (&placeholderp
->mutex
);
2609 if (pthread_cond_destroy (&placeholderp
->cond
))
2610 GOMP_PLUGIN_error ("Failed to destroy serialization cond");
2612 if (pthread_mutex_destroy (&placeholderp
->mutex
))
2613 GOMP_PLUGIN_error ("Failed to destroy serialization mutex");
2616 GCN_DEBUG ("Async thread %d:%d: Executing async wait "
2617 "entry (%d) done\n", aq
->agent
->device_id
, aq
->id
, index
);
2621 case ASYNC_PLACEHOLDER
:
2622 pthread_mutex_lock (&entry
->u
.placeholder
.mutex
);
2623 entry
->u
.placeholder
.executed
= 1;
2624 pthread_cond_signal (&entry
->u
.placeholder
.cond
);
2625 pthread_mutex_unlock (&entry
->u
.placeholder
.mutex
);
2629 GOMP_PLUGIN_fatal ("Unknown queue element");
2633 /* This function is run as a thread to service an async queue in the
2634 background. It runs continuously until the stop flag is set. */
2637 drain_queue (void *thread_arg
)
2639 struct goacc_asyncqueue
*aq
= thread_arg
;
2641 if (DRAIN_QUEUE_SYNCHRONOUS_P
)
2643 aq
->drain_queue_stop
= 2;
2647 pthread_mutex_lock (&aq
->mutex
);
2651 if (aq
->drain_queue_stop
)
2654 if (aq
->queue_n
> 0)
2656 pthread_mutex_unlock (&aq
->mutex
);
2657 execute_queue_entry (aq
, aq
->queue_first
);
2659 pthread_mutex_lock (&aq
->mutex
);
2660 aq
->queue_first
= ((aq
->queue_first
+ 1)
2661 % ASYNC_QUEUE_SIZE
);
2664 if (DEBUG_THREAD_SIGNAL
)
2665 GCN_DEBUG ("Async thread %d:%d: broadcasting queue out update\n",
2666 aq
->agent
->device_id
, aq
->id
);
2667 pthread_cond_broadcast (&aq
->queue_cond_out
);
2668 pthread_mutex_unlock (&aq
->mutex
);
2671 GCN_DEBUG ("Async thread %d:%d: continue\n", aq
->agent
->device_id
,
2673 pthread_mutex_lock (&aq
->mutex
);
2677 if (DEBUG_THREAD_SLEEP
)
2678 GCN_DEBUG ("Async thread %d:%d: going to sleep\n",
2679 aq
->agent
->device_id
, aq
->id
);
2680 pthread_cond_wait (&aq
->queue_cond_in
, &aq
->mutex
);
2681 if (DEBUG_THREAD_SLEEP
)
2682 GCN_DEBUG ("Async thread %d:%d: woke up, rechecking\n",
2683 aq
->agent
->device_id
, aq
->id
);
2687 aq
->drain_queue_stop
= 2;
2688 if (DEBUG_THREAD_SIGNAL
)
2689 GCN_DEBUG ("Async thread %d:%d: broadcasting last queue out update\n",
2690 aq
->agent
->device_id
, aq
->id
);
2691 pthread_cond_broadcast (&aq
->queue_cond_out
);
2692 pthread_mutex_unlock (&aq
->mutex
);
2694 GCN_DEBUG ("Async thread %d:%d: returning\n", aq
->agent
->device_id
, aq
->id
);
2698 /* This function is used only when DRAIN_QUEUE_SYNCHRONOUS_P is set, which
2699 is not usually the case. This is just a debug tool. */
2702 drain_queue_synchronous (struct goacc_asyncqueue
*aq
)
2704 pthread_mutex_lock (&aq
->mutex
);
2706 while (aq
->queue_n
> 0)
2708 execute_queue_entry (aq
, aq
->queue_first
);
2710 aq
->queue_first
= ((aq
->queue_first
+ 1)
2711 % ASYNC_QUEUE_SIZE
);
2715 pthread_mutex_unlock (&aq
->mutex
);
2718 /* Block the current thread until an async queue is writable. The aq->mutex
2719 lock should be held on entry, and remains locked on exit. */
2722 wait_for_queue_nonfull (struct goacc_asyncqueue
*aq
)
2724 if (aq
->queue_n
== ASYNC_QUEUE_SIZE
)
2726 /* Queue is full. Wait for it to not be full. */
2727 while (aq
->queue_n
== ASYNC_QUEUE_SIZE
)
2728 pthread_cond_wait (&aq
->queue_cond_out
, &aq
->mutex
);
2732 /* Request an asynchronous kernel launch on the specified queue. This
2733 may block if the queue is full, but returns without waiting for the
2737 queue_push_launch (struct goacc_asyncqueue
*aq
, struct kernel_info
*kernel
,
2738 void *vars
, struct GOMP_kernel_launch_attributes
*kla
)
2740 assert (aq
->agent
== kernel
->agent
);
2742 pthread_mutex_lock (&aq
->mutex
);
2744 wait_for_queue_nonfull (aq
);
2746 int queue_last
= ((aq
->queue_first
+ aq
->queue_n
)
2747 % ASYNC_QUEUE_SIZE
);
2749 GCN_DEBUG ("queue_push_launch %d:%d: at %i\n", aq
->agent
->device_id
,
2750 aq
->id
, queue_last
);
2752 aq
->queue
[queue_last
].type
= KERNEL_LAUNCH
;
2753 aq
->queue
[queue_last
].u
.launch
.kernel
= kernel
;
2754 aq
->queue
[queue_last
].u
.launch
.vars
= vars
;
2755 aq
->queue
[queue_last
].u
.launch
.kla
= *kla
;
2759 if (DEBUG_THREAD_SIGNAL
)
2760 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2761 aq
->agent
->device_id
, aq
->id
);
2762 pthread_cond_signal (&aq
->queue_cond_in
);
2764 pthread_mutex_unlock (&aq
->mutex
);
2767 /* Request an asynchronous callback on the specified queue. The callback
2768 function will be called, with the given opaque data, from the appropriate
2769 async thread, when all previous items on that queue are complete. */
2772 queue_push_callback (struct goacc_asyncqueue
*aq
, void (*fn
)(void *),
2775 pthread_mutex_lock (&aq
->mutex
);
2777 wait_for_queue_nonfull (aq
);
2779 int queue_last
= ((aq
->queue_first
+ aq
->queue_n
)
2780 % ASYNC_QUEUE_SIZE
);
2782 GCN_DEBUG ("queue_push_callback %d:%d: at %i\n", aq
->agent
->device_id
,
2783 aq
->id
, queue_last
);
2785 aq
->queue
[queue_last
].type
= CALLBACK
;
2786 aq
->queue
[queue_last
].u
.callback
.fn
= fn
;
2787 aq
->queue
[queue_last
].u
.callback
.data
= data
;
2791 if (DEBUG_THREAD_SIGNAL
)
2792 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2793 aq
->agent
->device_id
, aq
->id
);
2794 pthread_cond_signal (&aq
->queue_cond_in
);
2796 pthread_mutex_unlock (&aq
->mutex
);
2799 /* Request that a given async thread wait for another thread (unspecified) to
2800 reach the given placeholder. The wait will occur when all previous entries
2801 on the queue are complete. A placeholder is effectively a kind of signal
2802 which simply sets a flag when encountered in a queue. */
2805 queue_push_asyncwait (struct goacc_asyncqueue
*aq
,
2806 struct placeholder
*placeholderp
)
2808 pthread_mutex_lock (&aq
->mutex
);
2810 wait_for_queue_nonfull (aq
);
2812 int queue_last
= ((aq
->queue_first
+ aq
->queue_n
) % ASYNC_QUEUE_SIZE
);
2814 GCN_DEBUG ("queue_push_asyncwait %d:%d: at %i\n", aq
->agent
->device_id
,
2815 aq
->id
, queue_last
);
2817 aq
->queue
[queue_last
].type
= ASYNC_WAIT
;
2818 aq
->queue
[queue_last
].u
.asyncwait
.placeholderp
= placeholderp
;
2822 if (DEBUG_THREAD_SIGNAL
)
2823 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2824 aq
->agent
->device_id
, aq
->id
);
2825 pthread_cond_signal (&aq
->queue_cond_in
);
2827 pthread_mutex_unlock (&aq
->mutex
);
2830 /* Add a placeholder into an async queue. When the async thread reaches the
2831 placeholder it will set the "executed" flag to true and continue.
2832 Another thread may be waiting on this thread reaching the placeholder. */
2834 static struct placeholder
*
2835 queue_push_placeholder (struct goacc_asyncqueue
*aq
)
2837 struct placeholder
*placeholderp
;
2839 pthread_mutex_lock (&aq
->mutex
);
2841 wait_for_queue_nonfull (aq
);
2843 int queue_last
= ((aq
->queue_first
+ aq
->queue_n
) % ASYNC_QUEUE_SIZE
);
2845 GCN_DEBUG ("queue_push_placeholder %d:%d: at %i\n", aq
->agent
->device_id
,
2846 aq
->id
, queue_last
);
2848 aq
->queue
[queue_last
].type
= ASYNC_PLACEHOLDER
;
2849 placeholderp
= &aq
->queue
[queue_last
].u
.placeholder
;
2851 if (pthread_mutex_init (&placeholderp
->mutex
, NULL
))
2853 pthread_mutex_unlock (&aq
->mutex
);
2854 GOMP_PLUGIN_error ("Failed to initialize serialization mutex");
2857 if (pthread_cond_init (&placeholderp
->cond
, NULL
))
2859 pthread_mutex_unlock (&aq
->mutex
);
2860 GOMP_PLUGIN_error ("Failed to initialize serialization cond");
2863 placeholderp
->executed
= 0;
2867 if (DEBUG_THREAD_SIGNAL
)
2868 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2869 aq
->agent
->device_id
, aq
->id
);
2870 pthread_cond_signal (&aq
->queue_cond_in
);
2872 pthread_mutex_unlock (&aq
->mutex
);
2874 return placeholderp
;
2877 /* Signal an asynchronous thread to terminate, and wait for it to do so. */
2880 finalize_async_thread (struct goacc_asyncqueue
*aq
)
2882 pthread_mutex_lock (&aq
->mutex
);
2883 if (aq
->drain_queue_stop
== 2)
2885 pthread_mutex_unlock (&aq
->mutex
);
2889 aq
->drain_queue_stop
= 1;
2891 if (DEBUG_THREAD_SIGNAL
)
2892 GCN_DEBUG ("Signalling async thread %d:%d: cond_in\n",
2893 aq
->agent
->device_id
, aq
->id
);
2894 pthread_cond_signal (&aq
->queue_cond_in
);
2896 while (aq
->drain_queue_stop
!= 2)
2898 if (DEBUG_THREAD_SLEEP
)
2899 GCN_DEBUG ("Waiting for async thread %d:%d to finish, putting thread"
2900 " to sleep\n", aq
->agent
->device_id
, aq
->id
);
2901 pthread_cond_wait (&aq
->queue_cond_out
, &aq
->mutex
);
2902 if (DEBUG_THREAD_SLEEP
)
2903 GCN_DEBUG ("Waiting, woke up thread %d:%d. Rechecking\n",
2904 aq
->agent
->device_id
, aq
->id
);
2907 GCN_DEBUG ("Done waiting for async thread %d:%d\n", aq
->agent
->device_id
,
2909 pthread_mutex_unlock (&aq
->mutex
);
2911 int err
= pthread_join (aq
->thread_drain_queue
, NULL
);
2913 GOMP_PLUGIN_fatal ("Join async thread %d:%d: failed: %s",
2914 aq
->agent
->device_id
, aq
->id
, strerror (err
));
2915 GCN_DEBUG ("Joined with async thread %d:%d\n", aq
->agent
->device_id
, aq
->id
);
2918 /* Set up an async queue for OpenMP. There will be only one. The
2919 implementation simply uses an OpenACC async queue.
2920 FIXME: is this thread-safe if two threads call this function? */
2923 maybe_init_omp_async (struct agent_info
*agent
)
2925 if (!agent
->omp_async_queue
)
2926 agent
->omp_async_queue
2927 = GOMP_OFFLOAD_openacc_async_construct (agent
->device_id
);
2930 /* A wrapper that works around an issue in the HSA runtime with host-to-device
2931 copies from read-only pages. */
2934 hsa_memory_copy_wrapper (void *dst
, const void *src
, size_t len
)
2936 hsa_status_t status
= hsa_fns
.hsa_memory_copy_fn (dst
, src
, len
);
2938 if (status
== HSA_STATUS_SUCCESS
)
2941 /* It appears that the copy fails if the source data is in a read-only page.
2942 We can't detect that easily, so try copying the data to a temporary buffer
2943 and doing the copy again if we got an error above. */
2945 GCN_WARNING ("Read-only data transfer bug workaround triggered for "
2946 "[%p:+%d]\n", (void *) src
, (int) len
);
2948 void *src_copy
= malloc (len
);
2949 memcpy (src_copy
, src
, len
);
2950 status
= hsa_fns
.hsa_memory_copy_fn (dst
, (const void *) src_copy
, len
);
2952 if (status
!= HSA_STATUS_SUCCESS
)
2953 GOMP_PLUGIN_error ("memory copy failed");
2956 /* Copy data to or from a device. This is intended for use as an async
2960 copy_data (void *data_
)
2962 struct copy_data
*data
= (struct copy_data
*)data_
;
2963 GCN_DEBUG ("Async thread %d:%d: Copying %zu bytes from (%p) to (%p)\n",
2964 data
->aq
->agent
->device_id
, data
->aq
->id
, data
->len
, data
->src
,
2966 hsa_memory_copy_wrapper (data
->dst
, data
->src
, data
->len
);
/* Free device data.  This is intended for use as an async callback event.  */

static void
gomp_offload_free (void *ptr)
{
  GCN_DEBUG ("Async thread ?:?: Freeing %p\n", ptr);
  GOMP_OFFLOAD_free (0, ptr);
}
2979 /* Request an asynchronous data copy, to or from a device, on a given queue.
2980 The event will be registered as a callback. */
2983 queue_push_copy (struct goacc_asyncqueue
*aq
, void *dst
, const void *src
,
2987 GCN_DEBUG ("queue_push_copy %d:%d: %zu bytes from (%p) to (%p)\n",
2988 aq
->agent
->device_id
, aq
->id
, len
, src
, dst
);
2989 struct copy_data
*data
2990 = (struct copy_data
*)GOMP_PLUGIN_malloc (sizeof (struct copy_data
));
2995 queue_push_callback (aq
, copy_data
, data
);
2998 /* Return true if the given queue is currently empty. */
3001 queue_empty (struct goacc_asyncqueue
*aq
)
3003 pthread_mutex_lock (&aq
->mutex
);
3004 int res
= aq
->queue_n
== 0 ? 1 : 0;
3005 pthread_mutex_unlock (&aq
->mutex
);
3010 /* Wait for a given queue to become empty. This implements an OpenACC wait
3014 wait_queue (struct goacc_asyncqueue
*aq
)
3016 if (DRAIN_QUEUE_SYNCHRONOUS_P
)
3018 drain_queue_synchronous (aq
);
3022 pthread_mutex_lock (&aq
->mutex
);
3024 while (aq
->queue_n
> 0)
3026 if (DEBUG_THREAD_SLEEP
)
3027 GCN_DEBUG ("waiting for thread %d:%d, putting thread to sleep\n",
3028 aq
->agent
->device_id
, aq
->id
);
3029 pthread_cond_wait (&aq
->queue_cond_out
, &aq
->mutex
);
3030 if (DEBUG_THREAD_SLEEP
)
3031 GCN_DEBUG ("thread %d:%d woke up. Rechecking\n", aq
->agent
->device_id
,
3035 pthread_mutex_unlock (&aq
->mutex
);
3036 GCN_DEBUG ("waiting for thread %d:%d, done\n", aq
->agent
->device_id
, aq
->id
);
3040 /* {{{ OpenACC support */
3042 /* Execute an OpenACC kernel, synchronously or asynchronously. */
3045 gcn_exec (struct kernel_info
*kernel
, size_t mapnum
, void **hostaddrs
,
3046 void **devaddrs
, unsigned *dims
, void *targ_mem_desc
, bool async
,
3047 struct goacc_asyncqueue
*aq
)
3049 if (!GOMP_OFFLOAD_can_run (kernel
))
3050 GOMP_PLUGIN_fatal ("OpenACC host fallback unimplemented.");
3052 /* If we get here then this must be an OpenACC kernel. */
3053 kernel
->kind
= KIND_OPENACC
;
3055 /* devaddrs must be double-indirect on the target. */
3056 void **ind_da
= alloc_by_agent (kernel
->agent
, sizeof (void*) * mapnum
);
3057 for (size_t i
= 0; i
< mapnum
; i
++)
3058 hsa_fns
.hsa_memory_copy_fn (&ind_da
[i
],
3059 devaddrs
[i
] ? &devaddrs
[i
] : &hostaddrs
[i
],
3062 struct hsa_kernel_description
*hsa_kernel_desc
= NULL
;
3063 for (unsigned i
= 0; i
< kernel
->module
->image_desc
->kernel_count
; i
++)
3065 struct hsa_kernel_description
*d
3066 = &kernel
->module
->image_desc
->kernel_infos
[i
];
3067 if (d
->name
== kernel
->name
)
3069 hsa_kernel_desc
= d
;
3074 /* We may have statically-determined dimensions in
3075 hsa_kernel_desc->oacc_dims[] or dimensions passed to this offload kernel
3076 invocation at runtime in dims[]. We allow static dimensions to take
3077 priority over dynamic dimensions when present (non-zero). */
3078 if (hsa_kernel_desc
->oacc_dims
[0] > 0)
3079 dims
[0] = hsa_kernel_desc
->oacc_dims
[0];
3080 if (hsa_kernel_desc
->oacc_dims
[1] > 0)
3081 dims
[1] = hsa_kernel_desc
->oacc_dims
[1];
3082 if (hsa_kernel_desc
->oacc_dims
[2] > 0)
3083 dims
[2] = hsa_kernel_desc
->oacc_dims
[2];
3085 /* Ideally, when a dimension isn't explicitly specified, we should
3086 tune it to run 40 (or 32?) threads per CU with no threads getting queued.
3087 In practice, we tune for peak performance on BabelStream, which
3088 for OpenACC is currently 32 threads per CU. */
3089 if (dims
[0] == 0 && dims
[1] == 0)
3091 /* If any of the OpenACC dimensions remain 0 then we get to pick a
3092 number. There isn't really a correct answer for this without a clue
3093 about the problem size, so let's do a reasonable number of workers
3096 dims
[0] = get_cu_count (kernel
->agent
) * 4; /* Gangs. */
3097 dims
[1] = 8; /* Workers. */
3099 else if (dims
[0] == 0 && dims
[1] > 0)
3101 /* Auto-scale the number of gangs with the requested number of workers. */
3102 dims
[0] = get_cu_count (kernel
->agent
) * (32 / dims
[1]);
3104 else if (dims
[0] > 0 && dims
[1] == 0)
3106 /* Auto-scale the number of workers with the requested number of gangs. */
3107 dims
[1] = get_cu_count (kernel
->agent
) * 32 / dims
[0];
3114 /* The incoming dimensions are expressed in terms of gangs, workers, and
3115 vectors. The HSA dimensions are expressed in terms of "work-items",
3116 which means multiples of vector lanes.
3118 The "grid size" specifies the size of the problem space, and the
3119 "work-group size" specifies how much of that we want a single compute
3120 unit to chew on at once.
3122 The three dimensions do not really correspond to hardware, but the
3123 important thing is that the HSA runtime will launch as many
3124 work-groups as it takes to process the entire grid, and each
3125 work-group will contain as many wave-fronts as it takes to process
3126 the work-items in that group.
3128 Essentially, as long as we set the Y dimension to 64 (the number of
3129 vector lanes in hardware), and the Z group size to the maximum (16),
3130 then we will get the gangs (X) and workers (Z) launched as we expect.
3132 The reason for the apparent reversal of vector and worker dimension
3133 order is to do with the way the run-time distributes work-items across
3135 struct GOMP_kernel_launch_attributes kla
=
3138 {dims
[0], 64, dims
[1]},
3139 /* Work-group size. */
3143 struct goacc_thread
*thr
= GOMP_PLUGIN_goacc_thread ();
3144 acc_prof_info
*prof_info
= thr
->prof_info
;
3145 acc_event_info enqueue_launch_event_info
;
3146 acc_api_info
*api_info
= thr
->api_info
;
3147 bool profiling_dispatch_p
= __builtin_expect (prof_info
!= NULL
, false);
3148 if (profiling_dispatch_p
)
3150 prof_info
->event_type
= acc_ev_enqueue_launch_start
;
3152 enqueue_launch_event_info
.launch_event
.event_type
3153 = prof_info
->event_type
;
3154 enqueue_launch_event_info
.launch_event
.valid_bytes
3155 = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES
;
3156 enqueue_launch_event_info
.launch_event
.parent_construct
3157 = acc_construct_parallel
;
3158 enqueue_launch_event_info
.launch_event
.implicit
= 1;
3159 enqueue_launch_event_info
.launch_event
.tool_info
= NULL
;
3160 enqueue_launch_event_info
.launch_event
.kernel_name
3161 = (char *) kernel
->name
;
3162 enqueue_launch_event_info
.launch_event
.num_gangs
= kla
.gdims
[0];
3163 enqueue_launch_event_info
.launch_event
.num_workers
= kla
.gdims
[2];
3164 enqueue_launch_event_info
.launch_event
.vector_length
= kla
.gdims
[1];
3166 api_info
->device_api
= acc_device_api_other
;
3168 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
,
3169 &enqueue_launch_event_info
, api_info
);
3174 run_kernel (kernel
, ind_da
, &kla
, NULL
, false);
3175 gomp_offload_free (ind_da
);
3179 queue_push_launch (aq
, kernel
, ind_da
, &kla
);
3181 GCN_DEBUG ("queue_push_callback %d:%d gomp_offload_free, %p\n",
3182 aq
->agent
->device_id
, aq
->id
, ind_da
);
3183 queue_push_callback (aq
, gomp_offload_free
, ind_da
);
3186 if (profiling_dispatch_p
)
3188 prof_info
->event_type
= acc_ev_enqueue_launch_end
;
3189 enqueue_launch_event_info
.launch_event
.event_type
= prof_info
->event_type
;
3190 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
,
3191 &enqueue_launch_event_info
,
3197 /* {{{ Generic Plugin API */
/* Return the name of the accelerator, which is "gcn".  */

const char *
GOMP_OFFLOAD_get_name (void)
{
  return "gcn";
}
3207 /* Return the specific capabilities the HSA accelerator have. */
3210 GOMP_OFFLOAD_get_caps (void)
3212 /* FIXME: Enable shared memory for APU, but not discrete GPU. */
3213 return /*GOMP_OFFLOAD_CAP_SHARED_MEM |*/ GOMP_OFFLOAD_CAP_OPENMP_400
3214 | GOMP_OFFLOAD_CAP_OPENACC_200
;
3217 /* Identify as GCN accelerator. */
3220 GOMP_OFFLOAD_get_type (void)
3222 return OFFLOAD_TARGET_TYPE_GCN
;
3225 /* Return the libgomp version number we're compatible with. There is
3226 no requirement for cross-version compatibility. */
3229 GOMP_OFFLOAD_version (void)
3231 return GOMP_VERSION
;
3234 /* Return the number of GCN devices on the system. */
3237 GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask
)
3239 if (!init_hsa_context ())
3241 /* Return -1 if no omp_requires_mask cannot be fulfilled but
3242 devices were present. */
3243 if (hsa_context
.agent_count
> 0 && omp_requires_mask
!= 0)
3245 return hsa_context
.agent_count
;
3248 /* Initialize device (agent) number N so that it can be used for computation.
3249 Return TRUE on success. */
3252 GOMP_OFFLOAD_init_device (int n
)
3254 if (!init_hsa_context ())
3256 if (n
>= hsa_context
.agent_count
)
3258 GOMP_PLUGIN_error ("Request to initialize non-existent GCN device %i", n
);
3261 struct agent_info
*agent
= &hsa_context
.agents
[n
];
3263 if (agent
->initialized
)
3266 agent
->device_id
= n
;
3268 if (pthread_rwlock_init (&agent
->module_rwlock
, NULL
))
3270 GOMP_PLUGIN_error ("Failed to initialize a GCN agent rwlock");
3273 if (pthread_mutex_init (&agent
->prog_mutex
, NULL
))
3275 GOMP_PLUGIN_error ("Failed to initialize a GCN agent program mutex");
3278 if (pthread_mutex_init (&agent
->async_queues_mutex
, NULL
))
3280 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
3283 if (pthread_mutex_init (&agent
->team_arena_write_lock
, NULL
))
3285 GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
3288 agent
->async_queues
= NULL
;
3289 agent
->omp_async_queue
= NULL
;
3290 agent
->team_arena_list
= NULL
;
3292 uint32_t queue_size
;
3293 hsa_status_t status
;
3294 status
= hsa_fns
.hsa_agent_get_info_fn (agent
->id
,
3295 HSA_AGENT_INFO_QUEUE_MAX_SIZE
,
3297 if (status
!= HSA_STATUS_SUCCESS
)
3298 return hsa_error ("Error requesting maximum queue size of the GCN agent",
3301 status
= hsa_fns
.hsa_agent_get_info_fn (agent
->id
, HSA_AGENT_INFO_NAME
,
3303 if (status
!= HSA_STATUS_SUCCESS
)
3304 return hsa_error ("Error querying the name of the agent", status
);
3306 agent
->device_isa
= isa_code (agent
->name
);
3307 if (agent
->device_isa
< 0)
3308 return hsa_error ("Unknown GCN agent architecture", HSA_STATUS_ERROR
);
3310 status
= hsa_fns
.hsa_agent_get_info_fn (agent
->id
, HSA_AGENT_INFO_VENDOR_NAME
,
3311 &agent
->vendor_name
);
3312 if (status
!= HSA_STATUS_SUCCESS
)
3313 return hsa_error ("Error querying the vendor name of the agent", status
);
3315 status
= hsa_fns
.hsa_queue_create_fn (agent
->id
, queue_size
,
3316 HSA_QUEUE_TYPE_MULTI
,
3317 hsa_queue_callback
, NULL
, UINT32_MAX
,
3318 UINT32_MAX
, &agent
->sync_queue
);
3319 if (status
!= HSA_STATUS_SUCCESS
)
3320 return hsa_error ("Error creating command queue", status
);
3322 agent
->kernarg_region
.handle
= (uint64_t) -1;
3323 status
= hsa_fns
.hsa_agent_iterate_regions_fn (agent
->id
,
3324 get_kernarg_memory_region
,
3325 &agent
->kernarg_region
);
3326 if (status
!= HSA_STATUS_SUCCESS
3327 && status
!= HSA_STATUS_INFO_BREAK
)
3328 hsa_error ("Scanning memory regions failed", status
);
3329 if (agent
->kernarg_region
.handle
== (uint64_t) -1)
3331 GOMP_PLUGIN_error ("Could not find suitable memory region for kernel "
3335 GCN_DEBUG ("Selected kernel arguments memory region:\n");
3336 dump_hsa_region (agent
->kernarg_region
, NULL
);
3338 agent
->data_region
.handle
= (uint64_t) -1;
3339 status
= hsa_fns
.hsa_agent_iterate_regions_fn (agent
->id
,
3340 get_data_memory_region
,
3341 &agent
->data_region
);
3342 if (status
!= HSA_STATUS_SUCCESS
3343 && status
!= HSA_STATUS_INFO_BREAK
)
3344 hsa_error ("Scanning memory regions failed", status
);
3345 if (agent
->data_region
.handle
== (uint64_t) -1)
3347 GOMP_PLUGIN_error ("Could not find suitable memory region for device "
3351 GCN_DEBUG ("Selected device data memory region:\n");
3352 dump_hsa_region (agent
->data_region
, NULL
);
3354 GCN_DEBUG ("GCN agent %d initialized\n", n
);
3356 agent
->initialized
= true;
3360 /* Load GCN object-code module described by struct gcn_image_desc in
3361 TARGET_DATA and return references to kernel descriptors in TARGET_TABLE.
3362 If there are any constructors then run them. If not NULL, REV_FN_TABLE will
3363 contain the on-device addresses of the functions for reverse offload. To be
3364 freed by the caller. */
3367 GOMP_OFFLOAD_load_image (int ord
, unsigned version
, const void *target_data
,
3368 struct addr_pair
**target_table
,
3369 uint64_t **rev_fn_table
)
3371 if (GOMP_VERSION_DEV (version
) != GOMP_VERSION_GCN
)
3373 GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3374 " (expected %u, received %u)",
3375 GOMP_VERSION_GCN
, GOMP_VERSION_DEV (version
));
3379 struct gcn_image_desc
*image_desc
= (struct gcn_image_desc
*) target_data
;
3380 struct agent_info
*agent
;
3381 struct addr_pair
*pair
;
3382 struct module_info
*module
;
3383 struct kernel_info
*kernel
;
3384 int kernel_count
= image_desc
->kernel_count
;
3385 unsigned var_count
= image_desc
->global_variable_count
;
3386 /* Currently, "others" is a struct of ICVS. */
3387 int other_count
= 1;
3389 agent
= get_agent_info (ord
);
3393 if (pthread_rwlock_wrlock (&agent
->module_rwlock
))
3395 GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3398 if (agent
->prog_finalized
3399 && !destroy_hsa_program (agent
))
3402 GCN_DEBUG ("Encountered %d kernels in an image\n", kernel_count
);
3403 GCN_DEBUG ("Encountered %u global variables in an image\n", var_count
);
3404 GCN_DEBUG ("Expect %d other variables in an image\n", other_count
);
3405 pair
= GOMP_PLUGIN_malloc ((kernel_count
+ var_count
+ other_count
- 2)
3406 * sizeof (struct addr_pair
));
3407 *target_table
= pair
;
3408 module
= (struct module_info
*)
3409 GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info
)
3410 + kernel_count
* sizeof (struct kernel_info
));
3411 module
->image_desc
= image_desc
;
3412 module
->kernel_count
= kernel_count
;
3413 module
->heap
= NULL
;
3414 module
->constructors_run_p
= false;
3416 kernel
= &module
->kernels
[0];
3418 /* Allocate memory for kernel dependencies. */
3419 for (unsigned i
= 0; i
< kernel_count
; i
++)
3421 struct hsa_kernel_description
*d
= &image_desc
->kernel_infos
[i
];
3422 if (!init_basic_kernel_info (kernel
, d
, agent
, module
))
3424 if (strcmp (d
->name
, "_init_array") == 0)
3425 module
->init_array_func
= kernel
;
3426 else if (strcmp (d
->name
, "_fini_array") == 0)
3427 module
->fini_array_func
= kernel
;
3430 pair
->start
= (uintptr_t) kernel
;
3431 pair
->end
= (uintptr_t) (kernel
+ 1);
3437 agent
->module
= module
;
3438 if (pthread_rwlock_unlock (&agent
->module_rwlock
))
3440 GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3444 if (!create_and_finalize_hsa_program (agent
))
3449 hsa_status_t status
;
3450 hsa_executable_symbol_t var_symbol
;
3451 status
= hsa_fns
.hsa_executable_get_symbol_fn (agent
->executable
, NULL
,
3452 ".offload_var_table",
3456 if (status
!= HSA_STATUS_SUCCESS
)
3457 hsa_fatal ("Could not find symbol for variable in the code object",
3460 uint64_t var_table_addr
;
3461 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
3462 (var_symbol
, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS
,
3464 if (status
!= HSA_STATUS_SUCCESS
)
3465 hsa_fatal ("Could not extract a variable from its symbol", status
);
3470 } var_table
[var_count
];
3471 GOMP_OFFLOAD_dev2host (agent
->device_id
, var_table
,
3472 (void*)var_table_addr
, sizeof (var_table
));
3474 for (unsigned i
= 0; i
< var_count
; i
++)
3476 pair
->start
= var_table
[i
].addr
;
3477 pair
->end
= var_table
[i
].addr
+ var_table
[i
].size
;
3478 GCN_DEBUG ("Found variable at %p with size %lu\n",
3479 (void *)var_table
[i
].addr
, var_table
[i
].size
);
3484 GCN_DEBUG ("Looking for variable %s\n", XSTRING (GOMP_ADDITIONAL_ICVS
));
3486 hsa_status_t status
;
3487 hsa_executable_symbol_t var_symbol
;
3488 status
= hsa_fns
.hsa_executable_get_symbol_fn (agent
->executable
, NULL
,
3489 XSTRING (GOMP_ADDITIONAL_ICVS
),
3490 agent
->id
, 0, &var_symbol
);
3491 if (status
== HSA_STATUS_SUCCESS
)
3496 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
3497 (var_symbol
, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS
,
3499 if (status
!= HSA_STATUS_SUCCESS
)
3500 hsa_fatal ("Could not extract a variable from its symbol", status
);
3501 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
3502 (var_symbol
, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE
,
3504 if (status
!= HSA_STATUS_SUCCESS
)
3505 hsa_fatal ("Could not extract a variable size from its symbol",
3508 pair
->start
= varptr
;
3509 pair
->end
= varptr
+ varsize
;
3513 /* The variable was not in this image. */
3514 GCN_DEBUG ("Variable not found in image: %s\n",
3515 XSTRING (GOMP_ADDITIONAL_ICVS
));
3516 pair
->start
= pair
->end
= 0;
3519 /* Ensure that constructors are run first. */
3520 struct GOMP_kernel_launch_attributes kla
=
3524 /* Work-group size. */
3528 if (module
->init_array_func
)
3530 init_kernel (module
->init_array_func
);
3531 run_kernel (module
->init_array_func
, NULL
, &kla
, NULL
, false);
3533 module
->constructors_run_p
= true;
3535 /* Don't report kernels that libgomp need not know about. */
3536 if (module
->init_array_func
)
3538 if (module
->fini_array_func
)
3541 if (rev_fn_table
!= NULL
&& kernel_count
== 0)
3542 *rev_fn_table
= NULL
;
3543 else if (rev_fn_table
!= NULL
)
3545 hsa_status_t status
;
3546 hsa_executable_symbol_t var_symbol
;
3547 status
= hsa_fns
.hsa_executable_get_symbol_fn (agent
->executable
, NULL
,
3548 ".offload_func_table",
3549 agent
->id
, 0, &var_symbol
);
3550 if (status
!= HSA_STATUS_SUCCESS
)
3551 hsa_fatal ("Could not find symbol for variable in the code object",
3553 uint64_t fn_table_addr
;
3554 status
= hsa_fns
.hsa_executable_symbol_get_info_fn
3555 (var_symbol
, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS
,
3557 if (status
!= HSA_STATUS_SUCCESS
)
3558 hsa_fatal ("Could not extract a variable from its symbol", status
);
3559 *rev_fn_table
= GOMP_PLUGIN_malloc (kernel_count
* sizeof (uint64_t));
3560 GOMP_OFFLOAD_dev2host (agent
->device_id
, *rev_fn_table
,
3561 (void*) fn_table_addr
,
3562 kernel_count
* sizeof (uint64_t));
3565 return kernel_count
+ var_count
+ other_count
;
3568 /* Unload GCN object-code module described by struct gcn_image_desc in
3569 TARGET_DATA from agent number N. Return TRUE on success. */
3572 GOMP_OFFLOAD_unload_image (int n
, unsigned version
, const void *target_data
)
3574 if (GOMP_VERSION_DEV (version
) != GOMP_VERSION_GCN
)
3576 GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3577 " (expected %u, received %u)",
3578 GOMP_VERSION_GCN
, GOMP_VERSION_DEV (version
));
3582 struct agent_info
*agent
;
3583 agent
= get_agent_info (n
);
3587 if (pthread_rwlock_wrlock (&agent
->module_rwlock
))
3589 GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3593 if (!agent
->module
|| agent
->module
->image_desc
!= target_data
)
3595 GOMP_PLUGIN_error ("Attempt to unload an image that has never been "
3600 if (!destroy_module (agent
->module
, true))
3602 free (agent
->module
);
3603 agent
->module
= NULL
;
3604 if (!destroy_hsa_program (agent
))
3606 if (pthread_rwlock_unlock (&agent
->module_rwlock
))
3608 GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3614 /* Deinitialize all information and status associated with agent number N. We
3615 do not attempt any synchronization, assuming the user and libgomp will not
3616 attempt deinitialization of a device that is in any way being used at the
3617 same time. Return TRUE on success. */
3620 GOMP_OFFLOAD_fini_device (int n
)
3622 struct agent_info
*agent
= get_agent_info (n
);
3626 if (!agent
->initialized
)
3629 if (agent
->omp_async_queue
)
3631 GOMP_OFFLOAD_openacc_async_destruct (agent
->omp_async_queue
);
3632 agent
->omp_async_queue
= NULL
;
3637 if (!destroy_module (agent
->module
, false))
3639 free (agent
->module
);
3640 agent
->module
= NULL
;
3643 if (!destroy_team_arenas (agent
))
3646 if (!destroy_hsa_program (agent
))
3649 hsa_status_t status
= hsa_fns
.hsa_queue_destroy_fn (agent
->sync_queue
);
3650 if (status
!= HSA_STATUS_SUCCESS
)
3651 return hsa_error ("Error destroying command queue", status
);
3653 if (pthread_mutex_destroy (&agent
->prog_mutex
))
3655 GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex");
3658 if (pthread_rwlock_destroy (&agent
->module_rwlock
))
3660 GOMP_PLUGIN_error ("Failed to destroy a GCN agent rwlock");
3664 if (pthread_mutex_destroy (&agent
->async_queues_mutex
))
3666 GOMP_PLUGIN_error ("Failed to destroy a GCN agent queue mutex");
3669 if (pthread_mutex_destroy (&agent
->team_arena_write_lock
))
3671 GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex");
3674 agent
->initialized
= false;
3678 /* Return true if the HSA runtime can run function FN_PTR. */
3681 GOMP_OFFLOAD_can_run (void *fn_ptr
)
3683 struct kernel_info
*kernel
= (struct kernel_info
*) fn_ptr
;
3685 init_kernel (kernel
);
3686 if (kernel
->initialization_failed
)
3692 if (suppress_host_fallback
)
3693 GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
3694 GCN_WARNING ("GCN target cannot be launched, doing a host fallback\n");
/* Allocate SIZE bytes of device memory on device N; delegate to the
   per-agent allocator.  */

void *
GOMP_OFFLOAD_alloc (int n, size_t size)
{
  struct agent_info *agent = get_agent_info (n);
  return alloc_by_agent (agent, size);
}
3707 /* Free memory from device N. */
3710 GOMP_OFFLOAD_free (int device
, void *ptr
)
3712 GCN_DEBUG ("Freeing memory on device %d\n", device
);
3714 hsa_status_t status
= hsa_fns
.hsa_memory_free_fn (ptr
);
3715 if (status
!= HSA_STATUS_SUCCESS
)
3717 hsa_error ("Could not free device memory", status
);
3721 struct goacc_thread
*thr
= GOMP_PLUGIN_goacc_thread ();
3722 bool profiling_dispatch_p
3723 = __builtin_expect (thr
!= NULL
&& thr
->prof_info
!= NULL
, false);
3724 if (profiling_dispatch_p
)
3726 acc_prof_info
*prof_info
= thr
->prof_info
;
3727 acc_event_info data_event_info
;
3728 acc_api_info
*api_info
= thr
->api_info
;
3730 prof_info
->event_type
= acc_ev_free
;
3732 data_event_info
.data_event
.event_type
= prof_info
->event_type
;
3733 data_event_info
.data_event
.valid_bytes
3734 = _ACC_DATA_EVENT_INFO_VALID_BYTES
;
3735 data_event_info
.data_event
.parent_construct
3736 = acc_construct_parallel
;
3737 data_event_info
.data_event
.implicit
= 1;
3738 data_event_info
.data_event
.tool_info
= NULL
;
3739 data_event_info
.data_event
.var_name
= NULL
;
3740 data_event_info
.data_event
.bytes
= 0;
3741 data_event_info
.data_event
.host_ptr
= NULL
;
3742 data_event_info
.data_event
.device_ptr
= (void *) ptr
;
3744 api_info
->device_api
= acc_device_api_other
;
3746 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
, &data_event_info
,
3753 /* Copy data from DEVICE to host. */
3756 GOMP_OFFLOAD_dev2host (int device
, void *dst
, const void *src
, size_t n
)
3758 GCN_DEBUG ("Copying %zu bytes from device %d (%p) to host (%p)\n", n
, device
,
3760 hsa_status_t status
= hsa_fns
.hsa_memory_copy_fn (dst
, src
, n
);
3761 if (status
!= HSA_STATUS_SUCCESS
)
3762 GOMP_PLUGIN_error ("memory copy failed");
/* Copy N bytes from host address SRC to DEVICE address DST.  The wrapper
   reports errors itself.  */

bool
GOMP_OFFLOAD_host2dev (int device, void *dst, const void *src, size_t n)
{
  GCN_DEBUG ("Copying %zu bytes from host (%p) to device %d (%p)\n", n, src,
	     device, dst);
  return hsa_memory_copy_wrapper (dst, src, n);
}
3777 /* Copy data within DEVICE. Do the copy asynchronously, if appropriate. */
3780 GOMP_OFFLOAD_dev2dev (int device
, void *dst
, const void *src
, size_t n
)
3782 struct gcn_thread
*thread_data
= gcn_thread ();
3784 if (thread_data
&& !async_synchronous_p (thread_data
->async
))
3786 struct agent_info
*agent
= get_agent_info (device
);
3787 maybe_init_omp_async (agent
);
3788 queue_push_copy (agent
->omp_async_queue
, dst
, src
, n
);
3792 GCN_DEBUG ("Copying %zu bytes from device %d (%p) to device %d (%p)\n", n
,
3793 device
, src
, device
, dst
);
3794 hsa_status_t status
= hsa_fns
.hsa_memory_copy_fn (dst
, src
, n
);
3795 if (status
!= HSA_STATUS_SUCCESS
)
3796 GOMP_PLUGIN_error ("memory copy failed");
3801 /* {{{ OpenMP Plugin API */
3803 /* Run a synchronous OpenMP kernel on DEVICE and pass it an array of pointers
3804 in VARS as a parameter. The kernel is identified by FN_PTR which must point
3805 to a kernel_info structure, and must have previously been loaded to the
3806 specified device. */
3809 GOMP_OFFLOAD_run (int device
, void *fn_ptr
, void *vars
, void **args
)
3811 struct agent_info
*agent
= get_agent_info (device
);
3812 struct kernel_info
*kernel
= (struct kernel_info
*) fn_ptr
;
3813 struct GOMP_kernel_launch_attributes def
;
3814 struct GOMP_kernel_launch_attributes
*kla
;
3815 assert (agent
== kernel
->agent
);
3817 /* If we get here then the kernel must be OpenMP. */
3818 kernel
->kind
= KIND_OPENMP
;
3820 if (!parse_target_attributes (args
, &def
, &kla
, agent
))
3822 GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
3825 run_kernel (kernel
, vars
, kla
, NULL
, false);
3828 /* Run an asynchronous OpenMP kernel on DEVICE. This is similar to
3829 GOMP_OFFLOAD_run except that the launch is queued and there is a call to
3830 GOMP_PLUGIN_target_task_completion when it has finished. */
3833 GOMP_OFFLOAD_async_run (int device
, void *tgt_fn
, void *tgt_vars
,
3834 void **args
, void *async_data
)
3836 GCN_DEBUG ("GOMP_OFFLOAD_async_run invoked\n");
3837 struct agent_info
*agent
= get_agent_info (device
);
3838 struct kernel_info
*kernel
= (struct kernel_info
*) tgt_fn
;
3839 struct GOMP_kernel_launch_attributes def
;
3840 struct GOMP_kernel_launch_attributes
*kla
;
3841 assert (agent
== kernel
->agent
);
3843 /* If we get here then the kernel must be OpenMP. */
3844 kernel
->kind
= KIND_OPENMP
;
3846 if (!parse_target_attributes (args
, &def
, &kla
, agent
))
3848 GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
3852 maybe_init_omp_async (agent
);
3853 queue_push_launch (agent
->omp_async_queue
, kernel
, tgt_vars
, kla
);
3854 queue_push_callback (agent
->omp_async_queue
,
3855 GOMP_PLUGIN_target_task_completion
, async_data
);
3859 /* {{{ OpenACC Plugin API */
3861 /* Run a synchronous OpenACC kernel. The device number is inferred from the
3862 already-loaded KERNEL. */
3865 GOMP_OFFLOAD_openacc_exec (void (*fn_ptr
) (void *), size_t mapnum
,
3866 void **hostaddrs
, void **devaddrs
, unsigned *dims
,
3867 void *targ_mem_desc
)
3869 struct kernel_info
*kernel
= (struct kernel_info
*) fn_ptr
;
3871 gcn_exec (kernel
, mapnum
, hostaddrs
, devaddrs
, dims
, targ_mem_desc
, false,
3875 /* Run an asynchronous OpenACC kernel on the specified queue. */
3878 GOMP_OFFLOAD_openacc_async_exec (void (*fn_ptr
) (void *), size_t mapnum
,
3879 void **hostaddrs
, void **devaddrs
,
3880 unsigned *dims
, void *targ_mem_desc
,
3881 struct goacc_asyncqueue
*aq
)
3883 struct kernel_info
*kernel
= (struct kernel_info
*) fn_ptr
;
3885 gcn_exec (kernel
, mapnum
, hostaddrs
, devaddrs
, dims
, targ_mem_desc
, true,
3889 /* Create a new asynchronous thread and queue for running future kernels. */
3891 struct goacc_asyncqueue
*
3892 GOMP_OFFLOAD_openacc_async_construct (int device
)
3894 struct agent_info
*agent
= get_agent_info (device
);
3896 pthread_mutex_lock (&agent
->async_queues_mutex
);
3898 struct goacc_asyncqueue
*aq
= GOMP_PLUGIN_malloc (sizeof (*aq
));
3899 aq
->agent
= get_agent_info (device
);
3901 aq
->next
= agent
->async_queues
;
3904 aq
->next
->prev
= aq
;
3905 aq
->id
= aq
->next
->id
+ 1;
3909 agent
->async_queues
= aq
;
3911 aq
->queue_first
= 0;
3913 aq
->drain_queue_stop
= 0;
3915 if (pthread_mutex_init (&aq
->mutex
, NULL
))
3917 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
3920 if (pthread_cond_init (&aq
->queue_cond_in
, NULL
))
3922 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
3925 if (pthread_cond_init (&aq
->queue_cond_out
, NULL
))
3927 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
3931 hsa_status_t status
= hsa_fns
.hsa_queue_create_fn (agent
->id
,
3933 HSA_QUEUE_TYPE_MULTI
,
3934 hsa_queue_callback
, NULL
,
3935 UINT32_MAX
, UINT32_MAX
,
3937 if (status
!= HSA_STATUS_SUCCESS
)
3938 hsa_fatal ("Error creating command queue", status
);
3940 int err
= pthread_create (&aq
->thread_drain_queue
, NULL
, &drain_queue
, aq
);
3942 GOMP_PLUGIN_fatal ("GCN asynchronous thread creation failed: %s",
3944 GCN_DEBUG ("Async thread %d:%d: created\n", aq
->agent
->device_id
,
3947 pthread_mutex_unlock (&agent
->async_queues_mutex
);
3952 /* Destroy an existing asynchronous thread and queue. Waits for any
3953 currently-running task to complete, but cancels any queued tasks. */
3956 GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue
*aq
)
3958 struct agent_info
*agent
= aq
->agent
;
3960 finalize_async_thread (aq
);
3962 pthread_mutex_lock (&agent
->async_queues_mutex
);
3965 if ((err
= pthread_mutex_destroy (&aq
->mutex
)))
3967 GOMP_PLUGIN_error ("Failed to destroy a GCN async queue mutex: %d", err
);
3970 if (pthread_cond_destroy (&aq
->queue_cond_in
))
3972 GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
3975 if (pthread_cond_destroy (&aq
->queue_cond_out
))
3977 GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
3980 hsa_status_t status
= hsa_fns
.hsa_queue_destroy_fn (aq
->hsa_queue
);
3981 if (status
!= HSA_STATUS_SUCCESS
)
3983 hsa_error ("Error destroying command queue", status
);
3988 aq
->prev
->next
= aq
->next
;
3990 aq
->next
->prev
= aq
->prev
;
3991 if (agent
->async_queues
== aq
)
3992 agent
->async_queues
= aq
->next
;
3994 GCN_DEBUG ("Async thread %d:%d: destroyed\n", agent
->device_id
, aq
->id
);
3997 pthread_mutex_unlock (&agent
->async_queues_mutex
);
4001 pthread_mutex_unlock (&agent
->async_queues_mutex
);
/* Return true if the specified async queue is currently empty.  */

int
GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
{
  return queue_empty (aq);
}
/* Block until the specified queue has executed all its tasks and the
   queue is empty.  */

bool
GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
{
  wait_queue (aq);
  return true;
}
/* Add a serialization point across two async queues.  Any new tasks added to
   AQ2, after this call, will not run until all tasks on AQ1, at the time
   of this call, have completed.  */

bool
GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
				      struct goacc_asyncqueue *aq2)
{
  /* For serialize, stream aq2 waits for aq1 to complete work that has been
     scheduled to run on it up to this point.  */
  if (aq1 != aq2)
    {
      struct placeholder *placeholderp = queue_push_placeholder (aq1);
      queue_push_asyncwait (aq2, placeholderp);
    }
  return true;
}
/* Add an opaque callback FN (DATA) to the given async queue AQ.  */

void
GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
					   void (*fn) (void *), void *data)
{
  queue_push_callback (aq, fn, data);
}
4050 /* Queue up an asynchronous data copy from host to DEVICE. */
4053 GOMP_OFFLOAD_openacc_async_host2dev (int device
, void *dst
, const void *src
,
4054 size_t n
, struct goacc_asyncqueue
*aq
)
4056 struct agent_info
*agent
= get_agent_info (device
);
4057 assert (agent
== aq
->agent
);
4058 queue_push_copy (aq
, dst
, src
, n
);
4062 /* Queue up an asynchronous data copy from DEVICE to host. */
4065 GOMP_OFFLOAD_openacc_async_dev2host (int device
, void *dst
, const void *src
,
4066 size_t n
, struct goacc_asyncqueue
*aq
)
4068 struct agent_info
*agent
= get_agent_info (device
);
4069 assert (agent
== aq
->agent
);
4070 queue_push_copy (aq
, dst
, src
, n
);
4074 union goacc_property_value
4075 GOMP_OFFLOAD_openacc_get_property (int device
, enum goacc_property prop
)
4077 struct agent_info
*agent
= get_agent_info (device
);
4079 union goacc_property_value propval
= { .val
= 0 };
4083 case GOACC_PROPERTY_FREE_MEMORY
:
4084 /* Not supported. */
4086 case GOACC_PROPERTY_MEMORY
:
4089 hsa_region_t region
= agent
->data_region
;
4090 hsa_status_t status
=
4091 hsa_fns
.hsa_region_get_info_fn (region
, HSA_REGION_INFO_SIZE
, &size
);
4092 if (status
== HSA_STATUS_SUCCESS
)
4096 case GOACC_PROPERTY_NAME
:
4097 propval
.ptr
= agent
->name
;
4099 case GOACC_PROPERTY_VENDOR
:
4100 propval
.ptr
= agent
->vendor_name
;
4102 case GOACC_PROPERTY_DRIVER
:
4103 propval
.ptr
= hsa_context
.driver_version_s
;
4110 /* Set up plugin-specific thread-local-data (host-side). */
4113 GOMP_OFFLOAD_openacc_create_thread_data (int ord
__attribute__((unused
)))
4115 struct gcn_thread
*thread_data
4116 = GOMP_PLUGIN_malloc (sizeof (struct gcn_thread
));
4118 thread_data
->async
= GOMP_ASYNC_SYNC
;
4120 return (void *) thread_data
;
4123 /* Clean up plugin-specific thread-local-data. */
4126 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data
)