libgomp/plugin/plugin-gcn.c

   1 /* Plugin for AMD GCN execution.
   2
   3    Copyright (C) 2013-2024 Free Software Foundation, Inc.
   4
   5    Contributed by Mentor Embedded
   6
   7    This file is part of the GNU Offloading and Multi Processing Library
   8    (libgomp).
   9
  10    Libgomp is free software; you can redistribute it and/or modify it
  11    under the terms of the GNU General Public License as published by
  12    the Free Software Foundation; either version 3, or (at your option)
  13    any later version.
  14
  15    Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
  16    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  17    FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  18    more details.
  19
  20    Under Section 7 of GPL version 3, you are granted additional
  21    permissions described in the GCC Runtime Library Exception, version
  22    3.1, as published by the Free Software Foundation.
  23
  24    You should have received a copy of the GNU General Public License and
  25    a copy of the GCC Runtime Library Exception along with this program;
  26    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  27    <http://www.gnu.org/licenses/>.  */
  28
  29 /* {{{ Includes and defines  */
  30
  31 #include "config.h"
  32 #include "symcat.h"
  33 #include <stdio.h>
  34 #include <stdlib.h>
  35 #include <string.h>
  36 #include <pthread.h>
  37 #include <inttypes.h>
  38 #include <stdbool.h>
  39 #include <limits.h>
  40 #include <hsa.h>
  41 #include <hsa_ext_amd.h>
  42 #include <dlfcn.h>
  43 #include <signal.h>
  44 #include "libgomp-plugin.h"
  45 #include "config/gcn/libgomp-gcn.h"  /* For struct output.  */
  46 #include "gomp-constants.h"
  47 #include <elf.h>
  48 #include "oacc-plugin.h"
  49 #include "oacc-int.h"
  50 #include <assert.h>
  51
  52 /* These probably won't be in elf.h for a while.  */
  53 #ifndef R_AMDGPU_NONE
  54 #define R_AMDGPU_NONE           0
  55 #define R_AMDGPU_ABS32_LO       1       /* (S + A) & 0xFFFFFFFF  */
  56 #define R_AMDGPU_ABS32_HI       2       /* (S + A) >> 32  */
  57 #define R_AMDGPU_ABS64          3       /* S + A  */
  58 #define R_AMDGPU_REL32          4       /* S + A - P  */
  59 #define R_AMDGPU_REL64          5       /* S + A - P  */
  60 #define R_AMDGPU_ABS32          6       /* S + A  */
  61 #define R_AMDGPU_GOTPCREL       7       /* G + GOT + A - P  */
  62 #define R_AMDGPU_GOTPCREL32_LO  8       /* (G + GOT + A - P) & 0xFFFFFFFF  */
  63 #define R_AMDGPU_GOTPCREL32_HI  9       /* (G + GOT + A - P) >> 32  */
  64 #define R_AMDGPU_REL32_LO       10      /* (S + A - P) & 0xFFFFFFFF  */
  65 #define R_AMDGPU_REL32_HI       11      /* (S + A - P) >> 32  */
  66 #define R_AMDGPU_RELATIVE64     13      /* B + A  */
  67 #endif
  68
  69 /* GCN specific definitions for asynchronous queues.  */
  70
  71 #define ASYNC_QUEUE_SIZE 64
  72 #define DRAIN_QUEUE_SYNCHRONOUS_P false
  73 #define DEBUG_QUEUES 0
  74 #define DEBUG_THREAD_SLEEP 0
  75 #define DEBUG_THREAD_SIGNAL 0
  76
  77 /* Defaults.  */
  78 #define DEFAULT_GCN_HEAP_SIZE (100*1024*1024)  /* 100MB.  */
  79
  80 /* Secure getenv() which returns NULL if running as SUID/SGID.  */
  81 #ifndef HAVE_SECURE_GETENV
  82 #ifdef HAVE___SECURE_GETENV
  83 #define secure_getenv __secure_getenv
  84 #elif defined (HAVE_UNISTD_H) && defined(HAVE_GETUID) && defined(HAVE_GETEUID) \
  85   && defined(HAVE_GETGID) && defined(HAVE_GETEGID)
  86
  87 #include <unistd.h>
  88
  89 /* Implementation of secure_getenv() for targets where it is not provided but
  90    we have at least means to test real and effective IDs. */
  91
  92 static char *
  93 secure_getenv (const char *name)
  94 {
  95   if ((getuid () == geteuid ()) && (getgid () == getegid ()))
  96     return getenv (name);
  97   else
  98     return NULL;
  99 }
 100
 101 #else
 102 #define secure_getenv getenv
 103 #endif
 104 #endif
 105
 106 /* }}}  */
 107 /* {{{ Types  */
 108
 109 /* GCN-specific implementation of the GOMP_PLUGIN_acc_thread data.  */
 110
 111 struct gcn_thread
 112 {
 113   /* The thread number from the async clause, or GOMP_ASYNC_SYNC.  */
 114   int async;
 115 };
 116
 117 /* As an HSA runtime is dlopened, following structure defines function
 118    pointers utilized by the HSA plug-in.  */
 119
 120 struct hsa_runtime_fn_info
 121 {
 122   /* HSA runtime.  */
 123   hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
 124                                         const char **status_string);
 125   hsa_status_t (*hsa_system_get_info_fn) (hsa_system_info_t attribute,
 126                                           void *value);
 127   hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
 128                                          hsa_agent_info_t attribute,
 129                                          void *value);
 130   hsa_status_t (*hsa_isa_get_info_fn)(hsa_isa_t isa,
 131                                       hsa_isa_info_t attribute,
 132                                       uint32_t index,
 133                                       void *value);
 134   hsa_status_t (*hsa_init_fn) (void);
 135   hsa_status_t (*hsa_iterate_agents_fn)
 136     (hsa_status_t (*callback)(hsa_agent_t agent, void *data), void *data);
 137   hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
 138                                           hsa_region_info_t attribute,
 139                                           void *value);
 140   hsa_status_t (*hsa_queue_create_fn)
 141     (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
 142      void (*callback)(hsa_status_t status, hsa_queue_t *source, void *data),
 143      void *data, uint32_t private_segment_size,
 144      uint32_t group_segment_size, hsa_queue_t **queue);
 145   hsa_status_t (*hsa_agent_iterate_regions_fn)
 146     (hsa_agent_t agent,
 147      hsa_status_t (*callback)(hsa_region_t region, void *data), void *data);
 148   hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
 149   hsa_status_t (*hsa_executable_create_fn)
 150     (hsa_profile_t profile, hsa_executable_state_t executable_state,
 151      const char *options, hsa_executable_t *executable);
 152   hsa_status_t (*hsa_executable_global_variable_define_fn)
 153     (hsa_executable_t executable, const char *variable_name, void *address);
 154   hsa_status_t (*hsa_executable_load_code_object_fn)
 155     (hsa_executable_t executable, hsa_agent_t agent,
 156      hsa_code_object_t code_object, const char *options);
 157   hsa_status_t (*hsa_executable_freeze_fn)(hsa_executable_t executable,
 158                                            const char *options);
 159   hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
 160                                         uint32_t num_consumers,
 161                                         const hsa_agent_t *consumers,
 162                                         hsa_signal_t *signal);
 163   hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
 164                                           void **ptr);
 165   hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent,
 166                                               hsa_access_permission_t access);
 167   hsa_status_t (*hsa_memory_copy_fn)(void *dst, const void *src, size_t size);
 168   hsa_status_t (*hsa_memory_free_fn) (void *ptr);
 169   hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
 170   hsa_status_t (*hsa_executable_get_symbol_fn)
 171     (hsa_executable_t executable, const char *module_name,
 172      const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
 173      hsa_executable_symbol_t *symbol);
 174   hsa_status_t (*hsa_executable_symbol_get_info_fn)
 175     (hsa_executable_symbol_t executable_symbol,
 176      hsa_executable_symbol_info_t attribute, void *value);
 177   hsa_status_t (*hsa_executable_iterate_symbols_fn)
 178     (hsa_executable_t executable,
 179      hsa_status_t (*callback)(hsa_executable_t executable,
 180                               hsa_executable_symbol_t symbol, void *data),
 181      void *data);
 182   uint64_t (*hsa_queue_add_write_index_release_fn) (const hsa_queue_t *queue,
 183                                                     uint64_t value);
 184   uint64_t (*hsa_queue_load_read_index_acquire_fn) (const hsa_queue_t *queue);
 185   void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
 186                                        hsa_signal_value_t value);
 187   void (*hsa_signal_store_release_fn) (hsa_signal_t signal,
 188                                        hsa_signal_value_t value);
 189   hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
 190     (hsa_signal_t signal, hsa_signal_condition_t condition,
 191      hsa_signal_value_t compare_value, uint64_t timeout_hint,
 192      hsa_wait_state_t wait_state_hint);
 193   hsa_signal_value_t (*hsa_signal_load_acquire_fn) (hsa_signal_t signal);
 194   hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);
 195
 196   hsa_status_t (*hsa_code_object_deserialize_fn)
 197     (void *serialized_code_object, size_t serialized_code_object_size,
 198      const char *options, hsa_code_object_t *code_object);
 199   hsa_status_t (*hsa_amd_memory_lock_fn)
 200     (void *host_ptr, size_t size, hsa_agent_t *agents, int num_agent,
 201      void **agent_ptr);
 202   hsa_status_t (*hsa_amd_memory_unlock_fn) (void *host_ptr);
 203   hsa_status_t (*hsa_amd_memory_async_copy_rect_fn)
 204     (const hsa_pitched_ptr_t *dst, const hsa_dim3_t *dst_offset,
 205      const hsa_pitched_ptr_t *src, const hsa_dim3_t *src_offset,
 206      const hsa_dim3_t *range, hsa_agent_t copy_agent,
 207      hsa_amd_copy_direction_t dir, uint32_t num_dep_signals,
 208      const hsa_signal_t *dep_signals, hsa_signal_t completion_signal);
 209 };
 210
 211 /* Structure describing the run-time and grid properties of an HSA kernel
 212    lauch.  This needs to match the format passed to GOMP_OFFLOAD_run.  */
 213
 214 struct GOMP_kernel_launch_attributes
 215 {
 216   /* Number of dimensions the workload has.  Maximum number is 3.  */
 217   uint32_t ndim;
 218   /* Size of the grid in the three respective dimensions.  */
 219   uint32_t gdims[3];
 220   /* Size of work-groups in the respective dimensions.  */
 221   uint32_t wdims[3];
 222 };
 223
 224 /* Collection of information needed for a dispatch of a kernel from a
 225    kernel.  */
 226
 227 struct kernel_dispatch
 228 {
 229   struct agent_info *agent;
 230   /* Pointer to a command queue associated with a kernel dispatch agent.  */
 231   void *queue;
 232   /* Pointer to a memory space used for kernel arguments passing.  */
 233   void *kernarg_address;
 234   /* Kernel object.  */
 235   uint64_t object;
 236   /* Synchronization signal used for dispatch synchronization.  */
 237   uint64_t signal;
 238   /* Private segment size.  */
 239   uint32_t private_segment_size;
 240   /* Group segment size.  */
 241   uint32_t group_segment_size;
 242 };
 243
 244 /* Structure of the kernargs segment, supporting console output.
 245
 246    This needs to match the definitions in Newlib, and the expectations
 247    in libgomp target code.  */
 248
 249 struct kernargs {
 250   struct kernargs_abi abi;
 251
 252   /* Output data.  */
 253   struct output output_data;
 254 };
 255
 256 /* A queue entry for a future asynchronous launch.  */
 257
 258 struct kernel_launch
 259 {
 260   struct kernel_info *kernel;
 261   void *vars;
 262   struct GOMP_kernel_launch_attributes kla;
 263 };
 264
 265 /* A queue entry for a future callback.  */
 266
 267 struct callback
 268 {
 269   void (*fn)(void *);
 270   void *data;
 271 };
 272
 273 /* A data struct for the copy_data callback.  */
 274
 275 struct copy_data
 276 {
 277   void *dst;
 278   const void *src;
 279   size_t len;
 280   struct goacc_asyncqueue *aq;
 281 };
 282
 283 /* A queue entry for a placeholder.  These correspond to a wait event.  */
 284
 285 struct placeholder
 286 {
 287   int executed;
 288   pthread_cond_t cond;
 289   pthread_mutex_t mutex;
 290 };
 291
 292 /* A queue entry for a wait directive.  */
 293
 294 struct asyncwait_info
 295 {
 296   struct placeholder *placeholderp;
 297 };
 298
 299 /* Encode the type of an entry in an async queue.  */
 300
 301 enum entry_type
 302 {
 303   KERNEL_LAUNCH,
 304   CALLBACK,
 305   ASYNC_WAIT,
 306   ASYNC_PLACEHOLDER
 307 };
 308
 309 /* An entry in an async queue.  */
 310
 311 struct queue_entry
 312 {
 313   enum entry_type type;
 314   union {
 315     struct kernel_launch launch;
 316     struct callback callback;
 317     struct asyncwait_info asyncwait;
 318     struct placeholder placeholder;
 319   } u;
 320 };
 321
 322 /* An async queue header.
 323
 324    OpenMP may create one of these.
 325    OpenACC may create many.  */
 326
 327 struct goacc_asyncqueue
 328 {
 329   struct agent_info *agent;
 330   hsa_queue_t *hsa_queue;
 331
 332   pthread_t thread_drain_queue;
 333   pthread_mutex_t mutex;
 334   pthread_cond_t queue_cond_in;
 335   pthread_cond_t queue_cond_out;
 336   struct queue_entry queue[ASYNC_QUEUE_SIZE];
 337   int queue_first;
 338   int queue_n;
 339   int drain_queue_stop;
 340
 341   int id;
 342   struct goacc_asyncqueue *prev;
 343   struct goacc_asyncqueue *next;
 344 };
 345
 346 /* Mkoffload uses this structure to describe a kernel.
 347
 348    OpenMP kernel dimensions are passed at runtime.
 349    OpenACC kernel dimensions are passed at compile time, here.  */
 350
 351 struct hsa_kernel_description
 352 {
 353   const char *name;
 354   int oacc_dims[3];  /* Only present for GCN kernels.  */
 355   int sgpr_count;
 356   int vpgr_count;
 357 };
 358
 359 /* Mkoffload uses this structure to describe an offload variable.  */
 360
 361 struct global_var_info
 362 {
 363   const char *name;
 364   void *address;
 365 };
 366
 367 /* Mkoffload uses this structure to describe all the kernels in a
 368    loadable module.  These are passed the libgomp via static constructors.  */
 369
 370 struct gcn_image_desc
 371 {
 372   struct gcn_image {
 373     size_t size;
 374     void *image;
 375   } *gcn_image;
 376   const unsigned kernel_count;
 377   struct hsa_kernel_description *kernel_infos;
 378   const unsigned ind_func_count;
 379   const unsigned global_variable_count;
 380 };
 381
 382 /* This enum mirrors the corresponding LLVM enum's values for all ISAs that we
 383    support.
 384    See https://llvm.org/docs/AMDGPUUsage.html#amdgpu-ef-amdgpu-mach-table */
 385
 386 typedef enum {
 387   EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a,
 388   EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c,
 389   EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f,
 390   EF_AMDGPU_MACH_AMDGCN_GFX908 = 0x030,
 391   EF_AMDGPU_MACH_AMDGCN_GFX90a = 0x03f,
 392   EF_AMDGPU_MACH_AMDGCN_GFX1030 = 0x036,
 393   EF_AMDGPU_MACH_AMDGCN_GFX1100 = 0x041
 394 } EF_AMDGPU_MACH;
 395
 396 const static int EF_AMDGPU_MACH_MASK = 0x000000ff;
 397 typedef EF_AMDGPU_MACH gcn_isa;
 398
 399 /* Description of an HSA GPU agent (device) and the program associated with
 400    it.  */
 401
 402 struct agent_info
 403 {
 404   /* The HSA ID of the agent.  Assigned when hsa_context is initialized.  */
 405   hsa_agent_t id;
 406   /* The user-visible device number.  */
 407   int device_id;
 408   /* Whether the agent has been initialized.  The fields below are usable only
 409      if it has been.  */
 410   bool initialized;
 411
 412   /* The instruction set architecture of the device. */
 413   gcn_isa device_isa;
 414   /* Name of the agent. */
 415   char name[64];
 416   /* Name of the vendor of the agent. */
 417   char vendor_name[64];
 418   /* Command queues of the agent.  */
 419   hsa_queue_t *sync_queue;
 420   struct goacc_asyncqueue *async_queues, *omp_async_queue;
 421   pthread_mutex_t async_queues_mutex;
 422
 423   /* The HSA memory region from which to allocate kernel arguments.  */
 424   hsa_region_t kernarg_region;
 425
 426   /* The HSA memory region from which to allocate device data.  */
 427   hsa_region_t data_region;
 428
 429   /* Allocated ephemeral memories (team arena and stack space).  */
 430   struct ephemeral_memories_list *ephemeral_memories_list;
 431   pthread_mutex_t ephemeral_memories_write_lock;
 432
 433   /* Read-write lock that protects kernels which are running or about to be run
 434      from interference with loading and unloading of images.  Needs to be
 435      locked for reading while a kernel is being run, and for writing if the
 436      list of modules is manipulated (and thus the HSA program invalidated).  */
 437   pthread_rwlock_t module_rwlock;
 438
 439   /* The module associated with this kernel.  */
 440   struct module_info *module;
 441
 442   /* Mutex enforcing that only one thread will finalize the HSA program.  A
 443      thread should have locked agent->module_rwlock for reading before
 444      acquiring it.  */
 445   pthread_mutex_t prog_mutex;
 446   /* Flag whether the HSA program that consists of all the modules has been
 447      finalized.  */
 448   bool prog_finalized;
 449   /* HSA executable - the finalized program that is used to locate kernels.  */
 450   hsa_executable_t executable;
 451 };
 452
 453 /* Information required to identify, finalize and run any given kernel.  */
 454
 455 enum offload_kind {KIND_UNKNOWN, KIND_OPENMP, KIND_OPENACC};
 456
 457 struct kernel_info
 458 {
 459   /* Name of the kernel, required to locate it within the GCN object-code
 460      module.  */
 461   const char *name;
 462   /* The specific agent the kernel has been or will be finalized for and run
 463      on.  */
 464   struct agent_info *agent;
 465   /* The specific module where the kernel takes place.  */
 466   struct module_info *module;
 467   /* Information provided by mkoffload associated with the kernel.  */
 468   struct hsa_kernel_description *description;
 469   /* Mutex enforcing that at most once thread ever initializes a kernel for
 470      use.  A thread should have locked agent->module_rwlock for reading before
 471      acquiring it.  */
 472   pthread_mutex_t init_mutex;
 473   /* Flag indicating whether the kernel has been initialized and all fields
 474      below it contain valid data.  */
 475   bool initialized;
 476   /* Flag indicating that the kernel has a problem that blocks an execution.  */
 477   bool initialization_failed;
 478   /* The object to be put into the dispatch queue.  */
 479   uint64_t object;
 480   /* Required size of kernel arguments.  */
 481   uint32_t kernarg_segment_size;
 482   /* Required size of group segment.  */
 483   uint32_t group_segment_size;
 484   /* Required size of private segment.  */
 485   uint32_t private_segment_size;
 486   /* Set up for OpenMP or OpenACC?  */
 487   enum offload_kind kind;
 488 };
 489
 490 /* Information about a particular GCN module, its image and kernels.  */
 491
 492 struct module_info
 493 {
 494   /* The description with which the program has registered the image.  */
 495   struct gcn_image_desc *image_desc;
 496   /* GCN heap allocation.  */
 497   struct heap *heap;
 498   /* Physical boundaries of the loaded module.  */
 499   Elf64_Addr phys_address_start;
 500   Elf64_Addr phys_address_end;
 501
 502   bool constructors_run_p;
 503   struct kernel_info *init_array_func, *fini_array_func;
 504
 505   /* Number of kernels in this module.  */
 506   int kernel_count;
 507   /* An array of kernel_info structures describing each kernel in this
 508      module.  */
 509   struct kernel_info kernels[];
 510 };
 511
 512 /* A linked list of memory arenas allocated on the device.
 513    These are used by OpenMP, as a means to optimize per-team malloc,
 514    and for host-accessible stack space.  */
 515
 516 struct ephemeral_memories_list
 517 {
 518   struct ephemeral_memories_list *next;
 519
 520   /* The size is determined by the number of teams and threads.  */
 521   size_t size;
 522   /* The device address allocated memory.  */
 523   void *address;
 524   /* A flag to prevent two asynchronous kernels trying to use the same memory.
 525      The mutex is locked until the kernel exits.  */
 526   pthread_mutex_t in_use;
 527 };
 528
 529 /* Information about the whole HSA environment and all of its agents.  */
 530
 531 struct hsa_context_info
 532 {
 533   /* Whether the structure has been initialized.  */
 534   bool initialized;
 535   /* Number of usable GPU HSA agents in the system.  */
 536   int agent_count;
 537   /* Array of agent_info structures describing the individual HSA agents.  */
 538   struct agent_info *agents;
 539   /* Driver version string. */
 540   char driver_version_s[30];
 541 };
 542
 543 /* }}}  */
 544 /* {{{ Global variables  */
 545
 546 /* Information about the whole HSA environment and all of its agents.  */
 547
 548 static struct hsa_context_info hsa_context;
 549
 550 /* HSA runtime functions that are initialized in init_hsa_context.  */
 551
 552 static struct hsa_runtime_fn_info hsa_fns;
 553
 554 /* Heap space, allocated target-side, provided for use of newlib malloc.
 555    Each module should have it's own heap allocated.
 556    Beware that heap usage increases with OpenMP teams.  See also arenas.  */
 557
 558 static size_t gcn_kernel_heap_size = DEFAULT_GCN_HEAP_SIZE;
 559
 560 /* Ephemeral memory sizes for each kernel launch.  */
 561
 562 static int team_arena_size = DEFAULT_TEAM_ARENA_SIZE;
 563 static int stack_size = DEFAULT_GCN_STACK_SIZE;
 564 static int lowlat_size = -1;
 565
 566 /* Flag to decide whether print to stderr information about what is going on.
 567    Set in init_debug depending on environment variables.  */
 568
 569 static bool debug;
 570
 571 /* Flag to decide if the runtime should suppress a possible fallback to host
 572    execution.  */
 573
 574 static bool suppress_host_fallback;
 575
 576 /* Flag to locate HSA runtime shared library that is dlopened
 577    by this plug-in.  */
 578
 579 static const char *hsa_runtime_lib;
 580
 581 /* Flag to decide if the runtime should support also CPU devices (can be
 582    a simulator).  */
 583
 584 static bool support_cpu_devices;
 585
 586 /* Runtime dimension overrides.  Zero indicates default.  */
 587
 588 static int override_x_dim = 0;
 589 static int override_z_dim = 0;
 590
 591 /* }}}  */
 592 /* {{{ Debug & Diagnostic  */
 593
 594 /* Print a message to stderr if GCN_DEBUG value is set to true.  */
 595
 596 #define DEBUG_PRINT(...) \
 597   do \
 598   { \
 599     if (debug) \
 600       { \
 601         fprintf (stderr, __VA_ARGS__); \
 602       } \
 603   } \
 604   while (false);
 605
 606 /* Flush stderr if GCN_DEBUG value is set to true.  */
 607
 608 #define DEBUG_FLUSH()                           \
 609   do {                                          \
 610     if (debug)                                  \
 611       fflush (stderr);                          \
 612   } while (false)
 613
 614 /* Print a logging message with PREFIX to stderr if GCN_DEBUG value
 615    is set to true.  */
 616
 617 #define DEBUG_LOG(prefix, ...)                  \
 618   do                                            \
 619     {                                           \
 620       DEBUG_PRINT (prefix);                     \
 621       DEBUG_PRINT (__VA_ARGS__);                        \
 622       DEBUG_FLUSH ();                           \
 623     } while (false)
 624
 625 /* Print a debugging message to stderr.  */
 626
 627 #define GCN_DEBUG(...) DEBUG_LOG ("GCN debug: ", __VA_ARGS__)
 628
 629 /* Print a warning message to stderr.  */
 630
 631 #define GCN_WARNING(...) DEBUG_LOG ("GCN warning: ", __VA_ARGS__)
 632
 633 /* Print HSA warning STR with an HSA STATUS code.  */
 634
 635 static void
 636 hsa_warn (const char *str, hsa_status_t status)
 637 {
 638   if (!debug)
 639     return;
 640
 641   const char *hsa_error_msg = "[unknown]";
 642   hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
 643
 644   fprintf (stderr, "GCN warning: %s\nRuntime message: %s\n", str,
 645            hsa_error_msg);
 646 }
 647
 648 /* Report a fatal error STR together with the HSA error corresponding to STATUS
 649    and terminate execution of the current process.  */
 650
 651 static void
 652 hsa_fatal (const char *str, hsa_status_t status)
 653 {
 654   const char *hsa_error_msg = "[unknown]";
 655   hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
 656   GOMP_PLUGIN_fatal ("GCN fatal error: %s\nRuntime message: %s\n", str,
 657                      hsa_error_msg);
 658 }
 659
 660 /* Like hsa_fatal, except only report error message, and return FALSE
 661    for propagating error processing to outside of plugin.  */
 662
 663 static bool
 664 hsa_error (const char *str, hsa_status_t status)
 665 {
 666   const char *hsa_error_msg = "[unknown]";
 667   hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
 668   GOMP_PLUGIN_error ("GCN fatal error: %s\nRuntime message: %s\n", str,
 669                      hsa_error_msg);
 670   return false;
 671 }
 672
 673 /* Dump information about the available hardware.  */
 674
 675 static void
 676 dump_hsa_system_info (void)
 677 {
 678   hsa_status_t status;
 679
 680   hsa_endianness_t endianness;
 681   status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_ENDIANNESS,
 682                                            &endianness);
 683   if (status == HSA_STATUS_SUCCESS)
 684     switch (endianness)
 685       {
 686       case HSA_ENDIANNESS_LITTLE:
 687         GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: LITTLE\n");
 688         break;
 689       case HSA_ENDIANNESS_BIG:
 690         GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: BIG\n");
 691         break;
 692       default:
 693         GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: UNKNOWN\n");
 694       }
 695   else
 696     GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: FAILED\n");
 697
 698   uint8_t extensions[128];
 699   status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_EXTENSIONS,
 700                                            &extensions);
 701   if (status == HSA_STATUS_SUCCESS)
 702     {
 703       if (extensions[0] & (1 << HSA_EXTENSION_IMAGES))
 704         GCN_DEBUG ("HSA_SYSTEM_INFO_EXTENSIONS: IMAGES\n");
 705     }
 706   else
 707     GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n");
 708 }
 709
 710 /* Dump information about the available hardware.  */
 711
 712 static void
 713 dump_machine_model (hsa_machine_model_t machine_model, const char *s)
 714 {
 715   switch (machine_model)
 716     {
 717     case HSA_MACHINE_MODEL_SMALL:
 718       GCN_DEBUG ("%s: SMALL\n", s);
 719       break;
 720     case HSA_MACHINE_MODEL_LARGE:
 721       GCN_DEBUG ("%s: LARGE\n", s);
 722       break;
 723     default:
 724       GCN_WARNING ("%s: UNKNOWN\n", s);
 725       break;
 726     }
 727 }
 728
 729 /* Dump information about the available hardware.  */
 730
 731 static void
 732 dump_profile (hsa_profile_t profile, const char *s)
 733 {
 734   switch (profile)
 735     {
 736     case HSA_PROFILE_FULL:
 737       GCN_DEBUG ("%s: FULL\n", s);
 738       break;
 739     case HSA_PROFILE_BASE:
 740       GCN_DEBUG ("%s: BASE\n", s);
 741       break;
 742     default:
 743       GCN_WARNING ("%s: UNKNOWN\n", s);
 744       break;
 745     }
 746 }
 747
 748 /* Dump information about a device memory region.  */
 749
 750 static hsa_status_t
 751 dump_hsa_region (hsa_region_t region, void *data __attribute__((unused)))
 752 {
 753   hsa_status_t status;
 754
 755   hsa_region_segment_t segment;
 756   status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
 757                                            &segment);
 758   if (status == HSA_STATUS_SUCCESS)
 759     {
 760       if (segment == HSA_REGION_SEGMENT_GLOBAL)
 761         GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GLOBAL\n");
 762       else if (segment == HSA_REGION_SEGMENT_READONLY)
 763         GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: READONLY\n");
 764       else if (segment == HSA_REGION_SEGMENT_PRIVATE)
 765         GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: PRIVATE\n");
 766       else if (segment == HSA_REGION_SEGMENT_GROUP)
 767         GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GROUP\n");
 768       else
 769         GCN_WARNING ("HSA_REGION_INFO_SEGMENT: UNKNOWN\n");
 770     }
 771   else
 772     GCN_WARNING ("HSA_REGION_INFO_SEGMENT: FAILED\n");
 773
 774   if (segment == HSA_REGION_SEGMENT_GLOBAL)
 775     {
 776       uint32_t flags;
 777       status
 778         = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
 779                                           &flags);
 780       if (status == HSA_STATUS_SUCCESS)
 781         {
 782           if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
 783             GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: KERNARG\n");
 784           if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED)
 785             GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: FINE_GRAINED\n");
 786           if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED)
 787             GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: COARSE_GRAINED\n");
 788         }
 789       else
 790         GCN_WARNING ("HSA_REGION_INFO_GLOBAL_FLAGS: FAILED\n");
 791     }
 792
 793   size_t size;
 794   status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
 795   if (status == HSA_STATUS_SUCCESS)
 796     GCN_DEBUG ("HSA_REGION_INFO_SIZE: %zu\n", size);
 797   else
 798     GCN_WARNING ("HSA_REGION_INFO_SIZE: FAILED\n");
 799
 800   status
 801     = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_ALLOC_MAX_SIZE,
 802                                       &size);
 803   if (status == HSA_STATUS_SUCCESS)
 804     GCN_DEBUG ("HSA_REGION_INFO_ALLOC_MAX_SIZE: %zu\n", size);
 805   else
 806     GCN_WARNING ("HSA_REGION_INFO_ALLOC_MAX_SIZE: FAILED\n");
 807
 808   bool alloc_allowed;
 809   status
 810     = hsa_fns.hsa_region_get_info_fn (region,
 811                                       HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED,
 812                                       &alloc_allowed);
 813   if (status == HSA_STATUS_SUCCESS)
 814     GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: %u\n", alloc_allowed);
 815   else
 816     GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: FAILED\n");
 817
 818   if (status != HSA_STATUS_SUCCESS || !alloc_allowed)
 819     return HSA_STATUS_SUCCESS;
 820
 821   status
 822     = hsa_fns.hsa_region_get_info_fn (region,
 823                                       HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE,
 824                                       &size);
 825   if (status == HSA_STATUS_SUCCESS)
 826     GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: %zu\n", size);
 827   else
 828     GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: FAILED\n");
 829
 830   size_t align;
 831   status
 832     = hsa_fns.hsa_region_get_info_fn (region,
 833                                       HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT,
 834                                       &align);
 835   if (status == HSA_STATUS_SUCCESS)
 836     GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: %zu\n", align);
 837   else
 838     GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: FAILED\n");
 839
 840   return HSA_STATUS_SUCCESS;
 841 }
 842
 843 /* Dump information about all the device memory regions.  */
 844
 845 static void
 846 dump_hsa_regions (hsa_agent_t agent)
 847 {
 848   hsa_status_t status;
 849   status = hsa_fns.hsa_agent_iterate_regions_fn (agent,
 850                                                  dump_hsa_region,
 851                                                  NULL);
 852   if (status != HSA_STATUS_SUCCESS)
 853     hsa_error ("Dumping hsa regions failed", status);
 854 }
 855
 856 /* Dump information about the available devices.  */
 857
 858 static hsa_status_t
 859 dump_hsa_agent_info (hsa_agent_t agent, void *data __attribute__((unused)))
 860 {
 861   hsa_status_t status;
 862
 863   char buf[64];
 864   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME,
 865                                           &buf);
 866   if (status == HSA_STATUS_SUCCESS)
 867     GCN_DEBUG ("HSA_AGENT_INFO_NAME: %s\n", buf);
 868   else
 869     GCN_WARNING ("HSA_AGENT_INFO_NAME: FAILED\n");
 870
 871   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_VENDOR_NAME,
 872                                           &buf);
 873   if (status == HSA_STATUS_SUCCESS)
 874     GCN_DEBUG ("HSA_AGENT_INFO_VENDOR_NAME: %s\n", buf);
 875   else
 876     GCN_WARNING ("HSA_AGENT_INFO_VENDOR_NAME: FAILED\n");
 877
 878   hsa_machine_model_t machine_model;
 879   status
 880     = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_MACHINE_MODEL,
 881                                      &machine_model);
 882   if (status == HSA_STATUS_SUCCESS)
 883     dump_machine_model (machine_model, "HSA_AGENT_INFO_MACHINE_MODEL");
 884   else
 885     GCN_WARNING ("HSA_AGENT_INFO_MACHINE_MODEL: FAILED\n");
 886
 887   hsa_profile_t profile;
 888   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_PROFILE,
 889                                           &profile);
 890   if (status == HSA_STATUS_SUCCESS)
 891     dump_profile (profile, "HSA_AGENT_INFO_PROFILE");
 892   else
 893     GCN_WARNING ("HSA_AGENT_INFO_PROFILE: FAILED\n");
 894
 895   hsa_device_type_t device_type;
 896   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
 897                                           &device_type);
 898   if (status == HSA_STATUS_SUCCESS)
 899     {
 900       switch (device_type)
 901         {
 902         case HSA_DEVICE_TYPE_CPU:
 903           GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: CPU\n");
 904           break;
 905         case HSA_DEVICE_TYPE_GPU:
 906           GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: GPU\n");
 907           break;
 908         case HSA_DEVICE_TYPE_DSP:
 909           GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: DSP\n");
 910           break;
 911         default:
 912           GCN_WARNING ("HSA_AGENT_INFO_DEVICE: UNKNOWN\n");
 913           break;
 914         }
 915     }
 916   else
 917     GCN_WARNING ("HSA_AGENT_INFO_DEVICE: FAILED\n");
 918
 919   uint32_t cu_count;
 920   status = hsa_fns.hsa_agent_get_info_fn
 921     (agent, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
 922   if (status == HSA_STATUS_SUCCESS)
 923     GCN_DEBUG ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: %u\n", cu_count);
 924   else
 925     GCN_WARNING ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: FAILED\n");
 926
 927   uint32_t size;
 928   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_WAVEFRONT_SIZE,
 929                                           &size);
 930   if (status == HSA_STATUS_SUCCESS)
 931     GCN_DEBUG ("HSA_AGENT_INFO_WAVEFRONT_SIZE: %u\n", size);
 932   else
 933     GCN_WARNING ("HSA_AGENT_INFO_WAVEFRONT_SIZE: FAILED\n");
 934
 935   uint32_t max_dim;
 936   status = hsa_fns.hsa_agent_get_info_fn (agent,
 937                                           HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
 938                                           &max_dim);
 939   if (status == HSA_STATUS_SUCCESS)
 940     GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: %u\n", max_dim);
 941   else
 942     GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: FAILED\n");
 943
 944   uint32_t max_size;
 945   status = hsa_fns.hsa_agent_get_info_fn (agent,
 946                                           HSA_AGENT_INFO_WORKGROUP_MAX_SIZE,
 947                                           &max_size);
 948   if (status == HSA_STATUS_SUCCESS)
 949     GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: %u\n", max_size);
 950   else
 951     GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: FAILED\n");
 952
 953   uint32_t grid_max_dim;
 954   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_DIM,
 955                                           &grid_max_dim);
 956   if (status == HSA_STATUS_SUCCESS)
 957     GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_DIM: %u\n", grid_max_dim);
 958   else
 959     GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_DIM: FAILED\n");
 960
 961   uint32_t grid_max_size;
 962   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_SIZE,
 963                                           &grid_max_size);
 964   if (status == HSA_STATUS_SUCCESS)
 965     GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_SIZE: %u\n", grid_max_size);
 966   else
 967     GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_SIZE: FAILED\n");
 968
 969   dump_hsa_regions (agent);
 970
 971   return HSA_STATUS_SUCCESS;
 972 }
 973
 974 /* Forward reference.  */
 975
 976 static char *get_executable_symbol_name (hsa_executable_symbol_t symbol);
 977
 978 /* Helper function for dump_executable_symbols.  */
 979
 980 static hsa_status_t
 981 dump_executable_symbol (hsa_executable_t executable,
 982                         hsa_executable_symbol_t symbol,
 983                         void *data __attribute__((unused)))
 984 {
 985   char *name = get_executable_symbol_name (symbol);
 986
 987   if (name)
 988     {
 989       GCN_DEBUG ("executable symbol: %s\n", name);
 990       free (name);
 991     }
 992
 993   return HSA_STATUS_SUCCESS;
 994 }
 995
 996 /* Dump all global symbol in an executable.  */
 997
 998 static void
 999 dump_executable_symbols (hsa_executable_t executable)
1000 {
1001   hsa_status_t status;
1002   status
1003     = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
1004                                                  dump_executable_symbol,
1005                                                  NULL);
1006   if (status != HSA_STATUS_SUCCESS)
1007     hsa_fatal ("Could not dump HSA executable symbols", status);
1008 }
1009
1010 /* Dump kernel DISPATCH data structure and indent it by INDENT spaces.  */
1011
1012 static void
1013 print_kernel_dispatch (struct kernel_dispatch *dispatch, unsigned indent)
1014 {
1015   struct kernargs *kernargs = (struct kernargs *)dispatch->kernarg_address;
1016
1017   fprintf (stderr, "%*sthis: %p\n", indent, "", dispatch);
1018   fprintf (stderr, "%*squeue: %p\n", indent, "", dispatch->queue);
1019   fprintf (stderr, "%*skernarg_address: %p\n", indent, "", kernargs);
1020   fprintf (stderr, "%*sheap address: %p\n", indent, "",
1021            (void*)kernargs->abi.heap_ptr);
1022   fprintf (stderr, "%*sarena address: %p (%d bytes per workgroup)\n", indent,
1023            "", (void*)kernargs->abi.arena_ptr,
1024            kernargs->abi.arena_size_per_team);
1025   fprintf (stderr, "%*sstack address: %p (%d bytes per wavefront)\n", indent,
1026            "", (void*)kernargs->abi.stack_ptr,
1027            kernargs->abi.stack_size_per_thread);
1028   fprintf (stderr, "%*sobject: %lu\n", indent, "", dispatch->object);
1029   fprintf (stderr, "%*sprivate_segment_size: %u\n", indent, "",
1030            dispatch->private_segment_size);
1031   fprintf (stderr, "%*sgroup_segment_size: %u (low-latency pool)\n", indent,
1032            "", dispatch->group_segment_size);
1033   fprintf (stderr, "\n");
1034 }
1035
1036 /* }}}  */
1037 /* {{{ Utility functions  */
1038
1039 /* Cast the thread local storage to gcn_thread.  */
1040
1041 static inline struct gcn_thread *
1042 gcn_thread (void)
1043 {
1044   return (struct gcn_thread *) GOMP_PLUGIN_acc_thread ();
1045 }
1046
1047 /* Initialize debug and suppress_host_fallback according to the environment.  */
1048
1049 static void
1050 init_environment_variables (void)
1051 {
1052   if (secure_getenv ("GCN_DEBUG"))
1053     debug = true;
1054   else
1055     debug = false;
1056
1057   if (secure_getenv ("GCN_SUPPRESS_HOST_FALLBACK"))
1058     suppress_host_fallback = true;
1059   else
1060     suppress_host_fallback = false;
1061
1062   hsa_runtime_lib = secure_getenv ("HSA_RUNTIME_LIB");
1063   if (hsa_runtime_lib == NULL)
1064     hsa_runtime_lib = "libhsa-runtime64.so.1";
1065
1066   support_cpu_devices = secure_getenv ("GCN_SUPPORT_CPU_DEVICES");
1067
1068   const char *x = secure_getenv ("GCN_NUM_TEAMS");
1069   if (!x)
1070     x = secure_getenv ("GCN_NUM_GANGS");
1071   if (x)
1072     override_x_dim = atoi (x);
1073
1074   const char *z = secure_getenv ("GCN_NUM_THREADS");
1075   if (!z)
1076     z = secure_getenv ("GCN_NUM_WORKERS");
1077   if (z)
1078     override_z_dim = atoi (z);
1079
1080   const char *heap = secure_getenv ("GCN_HEAP_SIZE");
1081   if (heap)
1082     {
1083       size_t tmp = atol (heap);
1084       if (tmp)
1085         gcn_kernel_heap_size = tmp;
1086     }
1087
1088   const char *arena = secure_getenv ("GCN_TEAM_ARENA_SIZE");
1089   if (arena)
1090     {
1091       int tmp = atoi (arena);
1092       if (tmp)
1093         team_arena_size = tmp;;
1094     }
1095
1096   const char *stack = secure_getenv ("GCN_STACK_SIZE");
1097   if (stack)
1098     {
1099       int tmp = atoi (stack);
1100       if (tmp)
1101         stack_size = tmp;;
1102     }
1103
1104   const char *lowlat = secure_getenv ("GOMP_GCN_LOWLAT_POOL");
1105   if (lowlat)
1106     lowlat_size = atoi (lowlat);
1107 }
1108
1109 /* Return malloc'd string with name of SYMBOL.  */
1110
1111 static char *
1112 get_executable_symbol_name (hsa_executable_symbol_t symbol)
1113 {
1114   hsa_status_t status;
1115   char *res;
1116   uint32_t len;
1117   const hsa_executable_symbol_info_t info_name_length
1118     = HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH;
1119
1120   status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name_length,
1121                                                       &len);
1122   if (status != HSA_STATUS_SUCCESS)
1123     {
1124       hsa_error ("Could not get length of symbol name", status);
1125       return NULL;
1126     }
1127
1128   res = GOMP_PLUGIN_malloc (len + 1);
1129
1130   const hsa_executable_symbol_info_t info_name
1131     = HSA_EXECUTABLE_SYMBOL_INFO_NAME;
1132
1133   status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name, res);
1134
1135   if (status != HSA_STATUS_SUCCESS)
1136     {
1137       hsa_error ("Could not get symbol name", status);
1138       free (res);
1139       return NULL;
1140     }
1141
1142   res[len] = '\0';
1143
1144   return res;
1145 }
1146
1147 /* Get the number of GPU Compute Units.  */
1148
1149 static int
1150 get_cu_count (struct agent_info *agent)
1151 {
1152   uint32_t cu_count;
1153   hsa_status_t status = hsa_fns.hsa_agent_get_info_fn
1154     (agent->id, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
1155   if (status == HSA_STATUS_SUCCESS)
1156     return cu_count;
1157   else
1158     return 64;  /* The usual number for older devices.  */
1159 }
1160
1161 /* Calculate the maximum grid size for OMP threads / OACC workers.
1162    This depends on the kernel's resource usage levels.  */
1163
1164 static int
1165 limit_worker_threads (int threads)
1166 {
1167   /* FIXME Do something more inteligent here.
1168      GCN can always run 4 threads within a Compute Unit, but
1169      more than that depends on register usage.  */
1170   if (threads > 16)
1171     threads = 16;
1172   return threads;
1173 }
1174
1175 /* This sets the maximum number of teams to twice the number of GPU Compute
1176    Units to avoid memory waste and corresponding memory access faults.  */
1177
1178 static int
1179 limit_teams (int teams, struct agent_info *agent)
1180 {
1181   int max_teams = 2 * get_cu_count (agent);
1182   if (teams > max_teams)
1183     teams = max_teams;
1184   return teams;
1185 }
1186
1187 /* Parse the target attributes INPUT provided by the compiler and return true
1188    if we should run anything all.  If INPUT is NULL, fill DEF with default
1189    values, then store INPUT or DEF into *RESULT.
1190
1191    This is used for OpenMP only.  */
1192
1193 static bool
1194 parse_target_attributes (void **input,
1195                          struct GOMP_kernel_launch_attributes *def,
1196                          struct GOMP_kernel_launch_attributes **result,
1197                          struct agent_info *agent)
1198 {
1199   if (!input)
1200     GOMP_PLUGIN_fatal ("No target arguments provided");
1201
1202   bool grid_attrs_found = false;
1203   bool gcn_dims_found = false;
1204   int gcn_teams = 0;
1205   int gcn_threads = 0;
1206   while (*input)
1207     {
1208       intptr_t id = (intptr_t) *input++, val;
1209
1210       if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
1211         val = (intptr_t) *input++;
1212       else
1213         val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
1214
1215       val = (val > INT_MAX) ? INT_MAX : val;
1216
1217       if ((id & GOMP_TARGET_ARG_DEVICE_MASK) == GOMP_DEVICE_GCN
1218           && ((id & GOMP_TARGET_ARG_ID_MASK)
1219               == GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES))
1220         {
1221           grid_attrs_found = true;
1222           break;
1223         }
1224       else if ((id & GOMP_TARGET_ARG_DEVICE_MASK)
1225                == GOMP_TARGET_ARG_DEVICE_ALL)
1226         {
1227           gcn_dims_found = true;
1228           switch (id & GOMP_TARGET_ARG_ID_MASK)
1229             {
1230             case GOMP_TARGET_ARG_NUM_TEAMS:
1231               gcn_teams = limit_teams (val, agent);
1232               break;
1233             case GOMP_TARGET_ARG_THREAD_LIMIT:
1234               gcn_threads = limit_worker_threads (val);
1235               break;
1236             default:
1237               ;
1238             }
1239         }
1240     }
1241
1242   if (gcn_dims_found)
1243     {
1244       bool gfx900_workaround_p = false;
1245
1246       if (agent->device_isa == EF_AMDGPU_MACH_AMDGCN_GFX900
1247           && gcn_threads == 0 && override_z_dim == 0)
1248         {
1249           gfx900_workaround_p = true;
1250           GCN_WARNING ("VEGA BUG WORKAROUND: reducing default number of "
1251                        "threads to at most 4 per team.\n");
1252           GCN_WARNING (" - If this is not a Vega 10 device, please use "
1253                        "GCN_NUM_THREADS=16\n");
1254         }
1255
1256       /* Ideally, when a dimension isn't explicitly specified, we should
1257          tune it to run 40 (or 32?) threads per CU with no threads getting queued.
1258          In practice, we tune for peak performance on BabelStream, which
1259          for OpenACC is currently 32 threads per CU.  */
1260       def->ndim = 3;
1261       if (gcn_teams <= 0 && gcn_threads <= 0)
1262         {
1263           /* Set up a reasonable number of teams and threads.  */
1264           gcn_threads = gfx900_workaround_p ? 4 : 16; // 8;
1265           def->gdims[0] = get_cu_count (agent); // * (40 / gcn_threads);
1266           def->gdims[2] = gcn_threads;
1267         }
1268       else if (gcn_teams <= 0 && gcn_threads > 0)
1269         {
1270           /* Auto-scale the number of teams with the number of threads.  */
1271           def->gdims[0] = get_cu_count (agent); // * (40 / gcn_threads);
1272           def->gdims[2] = gcn_threads;
1273         }
1274       else if (gcn_teams > 0 && gcn_threads <= 0)
1275         {
1276           int max_threads = gfx900_workaround_p ? 4 : 16;
1277
1278           /* Auto-scale the number of threads with the number of teams.  */
1279           def->gdims[0] = gcn_teams;
1280           def->gdims[2] = 16; // get_cu_count (agent) * 40 / gcn_teams;
1281           if (def->gdims[2] == 0)
1282             def->gdims[2] = 1;
1283           else if (def->gdims[2] > max_threads)
1284             def->gdims[2] = max_threads;
1285         }
1286       else
1287         {
1288           def->gdims[0] = gcn_teams;
1289           def->gdims[2] = gcn_threads;
1290         }
1291       def->gdims[1] = 64; /* Each thread is 64 work items wide.  */
1292       def->wdims[0] = 1;  /* Single team per work-group.  */
1293       def->wdims[1] = 64;
1294       def->wdims[2] = 16;
1295       *result = def;
1296       return true;
1297     }
1298   else if (!grid_attrs_found)
1299     {
1300       def->ndim = 1;
1301       def->gdims[0] = 1;
1302       def->gdims[1] = 1;
1303       def->gdims[2] = 1;
1304       def->wdims[0] = 1;
1305       def->wdims[1] = 1;
1306       def->wdims[2] = 1;
1307       *result = def;
1308       GCN_WARNING ("GOMP_OFFLOAD_run called with no launch attributes\n");
1309       return true;
1310     }
1311
1312   struct GOMP_kernel_launch_attributes *kla;
1313   kla = (struct GOMP_kernel_launch_attributes *) *input;
1314   *result = kla;
1315   if (kla->ndim == 0 || kla->ndim > 3)
1316     GOMP_PLUGIN_fatal ("Invalid number of dimensions (%u)", kla->ndim);
1317
1318   GCN_DEBUG ("GOMP_OFFLOAD_run called with %u dimensions:\n", kla->ndim);
1319   unsigned i;
1320   for (i = 0; i < kla->ndim; i++)
1321     {
1322       GCN_DEBUG ("  Dimension %u: grid size %u and group size %u\n", i,
1323                  kla->gdims[i], kla->wdims[i]);
1324       if (kla->gdims[i] == 0)
1325         return false;
1326     }
1327   return true;
1328 }
1329
1330 /* Return the group size given the requested GROUP size, GRID size and number
1331    of grid dimensions NDIM.  */
1332
1333 static uint32_t
1334 get_group_size (uint32_t ndim, uint32_t grid, uint32_t group)
1335 {
1336   if (group == 0)
1337     {
1338       /* TODO: Provide a default via environment or device characteristics.  */
1339       if (ndim == 1)
1340         group = 64;
1341       else if (ndim == 2)
1342         group = 8;
1343       else
1344         group = 4;
1345     }
1346
1347   if (group > grid)
1348     group = grid;
1349   return group;
1350 }
1351
1352 /* Atomically store pair of uint16_t values (HEADER and REST) to a PACKET.  */
1353
1354 static void
1355 packet_store_release (uint32_t* packet, uint16_t header, uint16_t rest)
1356 {
1357   __atomic_store_n (packet, header | (rest << 16), __ATOMIC_RELEASE);
1358 }
1359
1360 /* A never-called callback for the HSA command queues.  These signal events
1361    that we don't use, so we trigger an error.
1362
1363    This "queue" is not to be confused with the async queues, below.  */
1364
1365 static void
1366 hsa_queue_callback (hsa_status_t status,
1367                 hsa_queue_t *queue __attribute__ ((unused)),
1368                 void *data __attribute__ ((unused)))
1369 {
1370   hsa_fatal ("Asynchronous queue error", status);
1371 }
1372
1373 /* }}}  */
1374 /* {{{ HSA initialization  */
1375
1376 /* Populate hsa_fns with the function addresses from libhsa-runtime64.so.  */
1377
1378 static bool
1379 init_hsa_runtime_functions (void)
1380 {
1381 #define DLSYM_FN(function) \
1382   hsa_fns.function##_fn = dlsym (handle, #function); \
1383   if (hsa_fns.function##_fn == NULL) \
1384     return false;
1385 #define DLSYM_OPT_FN(function) \
1386   hsa_fns.function##_fn = dlsym (handle, #function);
1387   void *handle = dlopen (hsa_runtime_lib, RTLD_LAZY);
1388   if (handle == NULL)
1389     return false;
1390
1391   DLSYM_FN (hsa_status_string)
1392   DLSYM_FN (hsa_system_get_info)
1393   DLSYM_FN (hsa_agent_get_info)
1394   DLSYM_FN (hsa_init)
1395   DLSYM_FN (hsa_iterate_agents)
1396   DLSYM_FN (hsa_region_get_info)
1397   DLSYM_FN (hsa_queue_create)
1398   DLSYM_FN (hsa_agent_iterate_regions)
1399   DLSYM_FN (hsa_executable_destroy)
1400   DLSYM_FN (hsa_executable_create)
1401   DLSYM_FN (hsa_executable_global_variable_define)
1402   DLSYM_FN (hsa_executable_load_code_object)
1403   DLSYM_FN (hsa_executable_freeze)
1404   DLSYM_FN (hsa_signal_create)
1405   DLSYM_FN (hsa_memory_allocate)
1406   DLSYM_FN (hsa_memory_assign_agent)
1407   DLSYM_FN (hsa_memory_copy)
1408   DLSYM_FN (hsa_memory_free)
1409   DLSYM_FN (hsa_signal_destroy)
1410   DLSYM_FN (hsa_executable_get_symbol)
1411   DLSYM_FN (hsa_executable_symbol_get_info)
1412   DLSYM_FN (hsa_executable_iterate_symbols)
1413   DLSYM_FN (hsa_queue_add_write_index_release)
1414   DLSYM_FN (hsa_queue_load_read_index_acquire)
1415   DLSYM_FN (hsa_signal_wait_acquire)
1416   DLSYM_FN (hsa_signal_store_relaxed)
1417   DLSYM_FN (hsa_signal_store_release)
1418   DLSYM_FN (hsa_signal_load_acquire)
1419   DLSYM_FN (hsa_queue_destroy)
1420   DLSYM_FN (hsa_code_object_deserialize)
1421   DLSYM_OPT_FN (hsa_amd_memory_lock)
1422   DLSYM_OPT_FN (hsa_amd_memory_unlock)
1423   DLSYM_OPT_FN (hsa_amd_memory_async_copy_rect)
1424   return true;
1425 #undef DLSYM_OPT_FN
1426 #undef DLSYM_FN
1427 }
1428
1429 /* Return true if the agent is a GPU and can accept of concurrent submissions
1430    from different threads.  */
1431
1432 static bool
1433 suitable_hsa_agent_p (hsa_agent_t agent)
1434 {
1435   hsa_device_type_t device_type;
1436   hsa_status_t status
1437     = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
1438                                      &device_type);
1439   if (status != HSA_STATUS_SUCCESS)
1440     return false;
1441
1442   switch (device_type)
1443     {
1444     case HSA_DEVICE_TYPE_GPU:
1445       break;
1446     case HSA_DEVICE_TYPE_CPU:
1447       if (!support_cpu_devices)
1448         return false;
1449       break;
1450     default:
1451       return false;
1452     }
1453
1454   uint32_t features = 0;
1455   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_FEATURE,
1456                                           &features);
1457   if (status != HSA_STATUS_SUCCESS
1458       || !(features & HSA_AGENT_FEATURE_KERNEL_DISPATCH))
1459     return false;
1460   hsa_queue_type_t queue_type;
1461   status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_QUEUE_TYPE,
1462                                           &queue_type);
1463   if (status != HSA_STATUS_SUCCESS
1464       || (queue_type != HSA_QUEUE_TYPE_MULTI))
1465     return false;
1466
1467   return true;
1468 }
1469
1470 /* Callback of hsa_iterate_agents; if AGENT is a GPU device, increment
1471    agent_count in hsa_context.  */
1472
1473 static hsa_status_t
1474 count_gpu_agents (hsa_agent_t agent, void *data __attribute__ ((unused)))
1475 {
1476   if (suitable_hsa_agent_p (agent))
1477     hsa_context.agent_count++;
1478   return HSA_STATUS_SUCCESS;
1479 }
1480
1481 /* Callback of hsa_iterate_agents; if AGENT is a GPU device, assign the agent
1482    id to the describing structure in the hsa context.  The index of the
1483    structure is pointed to by DATA, increment it afterwards.  */
1484
1485 static hsa_status_t
1486 assign_agent_ids (hsa_agent_t agent, void *data)
1487 {
1488   if (suitable_hsa_agent_p (agent))
1489     {
1490       int *agent_index = (int *) data;
1491       hsa_context.agents[*agent_index].id = agent;
1492       ++*agent_index;
1493     }
1494   return HSA_STATUS_SUCCESS;
1495 }
1496
1497 /* Initialize hsa_context if it has not already been done.
1498    Return TRUE on success.  */
1499
1500 static bool
1501 init_hsa_context (void)
1502 {
1503   hsa_status_t status;
1504   int agent_index = 0;
1505
1506   if (hsa_context.initialized)
1507     return true;
1508   init_environment_variables ();
1509   if (!init_hsa_runtime_functions ())
1510     {
1511       GCN_WARNING ("Run-time could not be dynamically opened\n");
1512       if (suppress_host_fallback)
1513         GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
1514       return false;
1515     }
1516   status = hsa_fns.hsa_init_fn ();
1517   if (status != HSA_STATUS_SUCCESS)
1518     return hsa_error ("Run-time could not be initialized", status);
1519   GCN_DEBUG ("HSA run-time initialized for GCN\n");
1520
1521   if (debug)
1522     dump_hsa_system_info ();
1523
1524   status = hsa_fns.hsa_iterate_agents_fn (count_gpu_agents, NULL);
1525   if (status != HSA_STATUS_SUCCESS)
1526     return hsa_error ("GCN GPU devices could not be enumerated", status);
1527   GCN_DEBUG ("There are %i GCN GPU devices.\n", hsa_context.agent_count);
1528
1529   hsa_context.agents
1530     = GOMP_PLUGIN_malloc_cleared (hsa_context.agent_count
1531                                   * sizeof (struct agent_info));
1532   status = hsa_fns.hsa_iterate_agents_fn (assign_agent_ids, &agent_index);
1533   if (status != HSA_STATUS_SUCCESS)
1534     return hsa_error ("Scanning compute agents failed", status);
1535   if (agent_index != hsa_context.agent_count)
1536     {
1537       GOMP_PLUGIN_error ("Failed to assign IDs to all GCN agents");
1538       return false;
1539     }
1540
1541   if (debug)
1542     {
1543       status = hsa_fns.hsa_iterate_agents_fn (dump_hsa_agent_info, NULL);
1544       if (status != HSA_STATUS_SUCCESS)
1545         GOMP_PLUGIN_error ("Failed to list all HSA runtime agents");
1546     }
1547
1548   uint16_t minor, major;
1549   status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MINOR,
1550                                            &minor);
1551   if (status != HSA_STATUS_SUCCESS)
1552     GOMP_PLUGIN_error ("Failed to obtain HSA runtime minor version");
1553   status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MAJOR,
1554                                            &major);
1555   if (status != HSA_STATUS_SUCCESS)
1556     GOMP_PLUGIN_error ("Failed to obtain HSA runtime major version");
1557
1558   size_t len = sizeof hsa_context.driver_version_s;
1559   int printed = snprintf (hsa_context.driver_version_s, len,
1560                           "HSA Runtime %hu.%hu", (unsigned short int)major,
1561                           (unsigned short int)minor);
1562   if (printed >= len)
1563     GCN_WARNING ("HSA runtime version string was truncated."
1564                  "Version %hu.%hu is too long.", (unsigned short int)major,
1565                  (unsigned short int)minor);
1566
1567   hsa_context.initialized = true;
1568   return true;
1569 }
1570
1571 /* Verify that hsa_context has already been initialized and return the
1572    agent_info structure describing device number N.  Return NULL on error.  */
1573
1574 static struct agent_info *
1575 get_agent_info (int n)
1576 {
1577   if (!hsa_context.initialized)
1578     {
1579       GOMP_PLUGIN_error ("Attempt to use uninitialized GCN context.");
1580       return NULL;
1581     }
1582   if (n >= hsa_context.agent_count)
1583     {
1584       GOMP_PLUGIN_error ("Request to operate on non-existent GCN device %i", n);
1585       return NULL;
1586     }
1587   if (!hsa_context.agents[n].initialized)
1588     {
1589       GOMP_PLUGIN_error ("Attempt to use an uninitialized GCN agent.");
1590       return NULL;
1591     }
1592   return &hsa_context.agents[n];
1593 }
1594
1595 /* Callback of hsa_agent_iterate_regions, via get_*_memory_region functions.
1596
1597    Selects (breaks at) a suitable region of type KIND.  */
1598
1599 static hsa_status_t
1600 get_memory_region (hsa_region_t region, hsa_region_t *retval,
1601                    hsa_region_global_flag_t kind)
1602 {
1603   hsa_status_t status;
1604   hsa_region_segment_t segment;
1605
1606   status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
1607                                            &segment);
1608   if (status != HSA_STATUS_SUCCESS)
1609     return status;
1610   if (segment != HSA_REGION_SEGMENT_GLOBAL)
1611     return HSA_STATUS_SUCCESS;
1612
1613   uint32_t flags;
1614   status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
1615                                            &flags);
1616   if (status != HSA_STATUS_SUCCESS)
1617     return status;
1618   if (flags & kind)
1619     {
1620       *retval = region;
1621       return HSA_STATUS_INFO_BREAK;
1622     }
1623   return HSA_STATUS_SUCCESS;
1624 }
1625
1626 /* Callback of hsa_agent_iterate_regions.
1627
1628    Selects a kernargs memory region.  */
1629
1630 static hsa_status_t
1631 get_kernarg_memory_region (hsa_region_t region, void *data)
1632 {
1633   return get_memory_region (region, (hsa_region_t *)data,
1634                             HSA_REGION_GLOBAL_FLAG_KERNARG);
1635 }
1636
1637 /* Callback of hsa_agent_iterate_regions.
1638
1639    Selects a coarse-grained memory region suitable for the heap and
1640    offload data.  */
1641
1642 static hsa_status_t
1643 get_data_memory_region (hsa_region_t region, void *data)
1644 {
1645   return get_memory_region (region, (hsa_region_t *)data,
1646                             HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED);
1647 }
1648
1649 static int
1650 elf_gcn_isa_field (Elf64_Ehdr *image)
1651 {
1652   return image->e_flags & EF_AMDGPU_MACH_MASK;
1653 }
1654
1655 const static char *gcn_gfx803_s = "gfx803";
1656 const static char *gcn_gfx900_s = "gfx900";
1657 const static char *gcn_gfx906_s = "gfx906";
1658 const static char *gcn_gfx908_s = "gfx908";
1659 const static char *gcn_gfx90a_s = "gfx90a";
1660 const static char *gcn_gfx1030_s = "gfx1030";
1661 const static char *gcn_gfx1100_s = "gfx1100";
1662 const static int gcn_isa_name_len = 7;
1663
1664 /* Returns the name that the HSA runtime uses for the ISA or NULL if we do not
1665    support the ISA. */
1666
1667 static const char*
1668 isa_hsa_name (int isa) {
1669   switch(isa)
1670     {
1671     case EF_AMDGPU_MACH_AMDGCN_GFX803:
1672       return gcn_gfx803_s;
1673     case EF_AMDGPU_MACH_AMDGCN_GFX900:
1674       return gcn_gfx900_s;
1675     case EF_AMDGPU_MACH_AMDGCN_GFX906:
1676       return gcn_gfx906_s;
1677     case EF_AMDGPU_MACH_AMDGCN_GFX908:
1678       return gcn_gfx908_s;
1679     case EF_AMDGPU_MACH_AMDGCN_GFX90a:
1680       return gcn_gfx90a_s;
1681     case EF_AMDGPU_MACH_AMDGCN_GFX1030:
1682       return gcn_gfx1030_s;
1683     case EF_AMDGPU_MACH_AMDGCN_GFX1100:
1684       return gcn_gfx1100_s;
1685     }
1686   return NULL;
1687 }
1688
1689 /* Returns the user-facing name that GCC uses to identify the architecture (e.g.
1690    with -march) or NULL if we do not support the ISA.
1691    Keep in sync with /gcc/config/gcn/gcn.{c,opt}.  */
1692
1693 static const char*
1694 isa_gcc_name (int isa) {
1695   switch(isa)
1696     {
1697     case EF_AMDGPU_MACH_AMDGCN_GFX803:
1698       return "fiji";
1699     default:
1700       return isa_hsa_name (isa);
1701     }
1702 }
1703
1704 /* Returns the code which is used in the GCN object code to identify the ISA with
1705    the given name (as used by the HSA runtime).  */
1706
1707 static gcn_isa
1708 isa_code(const char *isa) {
1709   if (!strncmp (isa, gcn_gfx803_s, gcn_isa_name_len))
1710     return EF_AMDGPU_MACH_AMDGCN_GFX803;
1711
1712   if (!strncmp (isa, gcn_gfx900_s, gcn_isa_name_len))
1713     return EF_AMDGPU_MACH_AMDGCN_GFX900;
1714
1715   if (!strncmp (isa, gcn_gfx906_s, gcn_isa_name_len))
1716     return EF_AMDGPU_MACH_AMDGCN_GFX906;
1717
1718   if (!strncmp (isa, gcn_gfx908_s, gcn_isa_name_len))
1719     return EF_AMDGPU_MACH_AMDGCN_GFX908;
1720
1721   if (!strncmp (isa, gcn_gfx90a_s, gcn_isa_name_len))
1722     return EF_AMDGPU_MACH_AMDGCN_GFX90a;
1723
1724   if (!strncmp (isa, gcn_gfx1030_s, gcn_isa_name_len))
1725     return EF_AMDGPU_MACH_AMDGCN_GFX1030;
1726
1727   if (!strncmp (isa, gcn_gfx1100_s, gcn_isa_name_len))
1728     return EF_AMDGPU_MACH_AMDGCN_GFX1100;
1729
1730   return -1;
1731 }
1732
1733 /* CDNA2 devices have twice as many VGPRs compared to older devices.  */
1734
1735 static int
1736 max_isa_vgprs (int isa)
1737 {
1738   switch (isa)
1739     {
1740     case EF_AMDGPU_MACH_AMDGCN_GFX803:
1741     case EF_AMDGPU_MACH_AMDGCN_GFX900:
1742     case EF_AMDGPU_MACH_AMDGCN_GFX906:
1743     case EF_AMDGPU_MACH_AMDGCN_GFX908:
1744     case EF_AMDGPU_MACH_AMDGCN_GFX1030:
1745     case EF_AMDGPU_MACH_AMDGCN_GFX1100:
1746       return 256;
1747     case EF_AMDGPU_MACH_AMDGCN_GFX90a:
1748       return 512;
1749     }
1750   GOMP_PLUGIN_fatal ("unhandled ISA in max_isa_vgprs");
1751 }
1752
1753 /* }}}  */
1754 /* {{{ Run  */
1755
1756 /* Create or reuse a team arena and stack space.
1757
1758    Team arenas are used by OpenMP to avoid calling malloc multiple times
1759    while setting up each team.  This is purely a performance optimization.
1760
1761    The stack space is used by all kernels.  We must allocate it in such a
1762    way that the reverse offload implmentation can access the data.
1763
1764    Allocating this memory costs performance, so this function will reuse an
1765    existing allocation if a large enough one is idle.
1766    The memory lock is released, but not deallocated, when the kernel exits.  */
1767
1768 static void
1769 configure_ephemeral_memories (struct kernel_info *kernel,
1770                               struct kernargs_abi *kernargs, int num_teams,
1771                               int num_threads)
1772 {
1773   struct agent_info *agent = kernel->agent;
1774   struct ephemeral_memories_list **next_ptr = &agent->ephemeral_memories_list;
1775   struct ephemeral_memories_list *item;
1776
1777   int actual_arena_size = (kernel->kind == KIND_OPENMP
1778                            ? team_arena_size : 0);
1779   int actual_arena_total_size = actual_arena_size * num_teams;
1780   size_t size = (actual_arena_total_size
1781                  + num_teams * num_threads * stack_size);
1782
1783   for (item = *next_ptr; item; next_ptr = &item->next, item = item->next)
1784     {
1785       if (item->size < size)
1786         continue;
1787
1788       if (pthread_mutex_trylock (&item->in_use) == 0)
1789         break;
1790     }
1791
1792   if (!item)
1793     {
1794       GCN_DEBUG ("Creating a new %sstack for %d teams with %d threads"
1795                  " (%zd bytes)\n", (actual_arena_size ? "arena and " : ""),
1796                  num_teams, num_threads, size);
1797
1798       if (pthread_mutex_lock (&agent->ephemeral_memories_write_lock))
1799         {
1800           GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
1801           return;
1802         }
1803       item = malloc (sizeof (*item));
1804       item->size = size;
1805       item->next = NULL;
1806       *next_ptr = item;
1807
1808       if (pthread_mutex_init (&item->in_use, NULL))
1809         {
1810           GOMP_PLUGIN_error ("Failed to initialize a GCN memory write mutex");
1811           return;
1812         }
1813       if (pthread_mutex_lock (&item->in_use))
1814         {
1815           GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
1816           return;
1817         }
1818       if (pthread_mutex_unlock (&agent->ephemeral_memories_write_lock))
1819         {
1820           GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1821           return;
1822         }
1823
1824       hsa_status_t status;
1825       status = hsa_fns.hsa_memory_allocate_fn (agent->data_region, size,
1826                                                &item->address);
1827       if (status != HSA_STATUS_SUCCESS)
1828         hsa_fatal ("Could not allocate memory for GCN kernel arena", status);
1829       status = hsa_fns.hsa_memory_assign_agent_fn (item->address, agent->id,
1830                                                    HSA_ACCESS_PERMISSION_RW);
1831       if (status != HSA_STATUS_SUCCESS)
1832         hsa_fatal ("Could not assign arena & stack memory to device", status);
1833     }
1834
1835   kernargs->arena_ptr = (actual_arena_total_size
1836                          ? (uint64_t)item->address
1837                          : 0);
1838   kernargs->stack_ptr = (uint64_t)item->address + actual_arena_total_size;
1839   kernargs->arena_size_per_team = actual_arena_size;
1840   kernargs->stack_size_per_thread = stack_size;
1841 }
1842
1843 /* Mark an ephemeral memory space available for reuse.  */
1844
1845 static void
1846 release_ephemeral_memories (struct agent_info* agent, void *address)
1847 {
1848   struct ephemeral_memories_list *item;
1849
1850   for (item = agent->ephemeral_memories_list; item; item = item->next)
1851     {
1852       if (item->address == address)
1853         {
1854           if (pthread_mutex_unlock (&item->in_use))
1855             GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1856           return;
1857         }
1858     }
1859   GOMP_PLUGIN_error ("Could not find a GCN arena to release.");
1860 }
1861
1862 /* Clean up all the allocated team arenas.  */
1863
1864 static bool
1865 destroy_ephemeral_memories (struct agent_info *agent)
1866 {
1867   struct ephemeral_memories_list *item, *next;
1868
1869   for (item = agent->ephemeral_memories_list; item; item = next)
1870     {
1871       next = item->next;
1872       hsa_fns.hsa_memory_free_fn (item->address);
1873       if (pthread_mutex_destroy (&item->in_use))
1874         {
1875           GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex");
1876           return false;
1877         }
1878       free (item);
1879     }
1880   agent->ephemeral_memories_list = NULL;
1881
1882   return true;
1883 }
1884
1885 /* Allocate memory on a specified device.  */
1886
1887 static void *
1888 alloc_by_agent (struct agent_info *agent, size_t size)
1889 {
1890   GCN_DEBUG ("Allocating %zu bytes on device %d\n", size, agent->device_id);
1891
1892   void *ptr;
1893   hsa_status_t status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
1894                                                         size, &ptr);
1895   if (status != HSA_STATUS_SUCCESS)
1896     {
1897       hsa_error ("Could not allocate device memory", status);
1898       return NULL;
1899     }
1900
1901   status = hsa_fns.hsa_memory_assign_agent_fn (ptr, agent->id,
1902                                                HSA_ACCESS_PERMISSION_RW);
1903   if (status != HSA_STATUS_SUCCESS)
1904     {
1905       hsa_error ("Could not assign data memory to device", status);
1906       return NULL;
1907     }
1908
1909   struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1910   bool profiling_dispatch_p
1911     = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1912   if (profiling_dispatch_p)
1913     {
1914       acc_prof_info *prof_info = thr->prof_info;
1915       acc_event_info data_event_info;
1916       acc_api_info *api_info = thr->api_info;
1917
1918       prof_info->event_type = acc_ev_alloc;
1919
1920       data_event_info.data_event.event_type = prof_info->event_type;
1921       data_event_info.data_event.valid_bytes
1922         = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1923       data_event_info.data_event.parent_construct
1924         = acc_construct_parallel;
1925       data_event_info.data_event.implicit = 1;
1926       data_event_info.data_event.tool_info = NULL;
1927       data_event_info.data_event.var_name = NULL;
1928       data_event_info.data_event.bytes = size;
1929       data_event_info.data_event.host_ptr = NULL;
1930       data_event_info.data_event.device_ptr = (void *) ptr;
1931
1932       api_info->device_api = acc_device_api_other;
1933
1934       GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1935                                             api_info);
1936     }
1937
1938   return ptr;
1939 }
1940
1941 /* Create kernel dispatch data structure for given KERNEL, along with
1942    the necessary device signals and memory allocations.  */
1943
1944 static struct kernel_dispatch *
1945 create_kernel_dispatch (struct kernel_info *kernel, int num_teams,
1946                         int num_threads)
1947 {
1948   struct agent_info *agent = kernel->agent;
1949   struct kernel_dispatch *shadow
1950     = GOMP_PLUGIN_malloc_cleared (sizeof (struct kernel_dispatch));
1951
1952   shadow->agent = kernel->agent;
1953   shadow->object = kernel->object;
1954
1955   hsa_signal_t sync_signal;
1956   hsa_status_t status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &sync_signal);
1957   if (status != HSA_STATUS_SUCCESS)
1958     hsa_fatal ("Error creating the GCN sync signal", status);
1959
1960   shadow->signal = sync_signal.handle;
1961   shadow->private_segment_size = kernel->private_segment_size;
1962
1963   if (lowlat_size < 0)
1964     {
1965       /* Divide the LDS between the number of running teams.
1966          Allocate not less than is defined in the kernel metadata.  */
1967       int teams_per_cu = num_teams / get_cu_count (agent);
1968       int LDS_per_team = (teams_per_cu ? 65536 / teams_per_cu : 65536);
1969       shadow->group_segment_size
1970         = (kernel->group_segment_size > LDS_per_team
1971            ? kernel->group_segment_size
1972            : LDS_per_team);;
1973     }
1974   else if (lowlat_size < GCN_LOWLAT_HEAP+8)
1975     /* Ensure that there's space for the OpenMP libgomp data.  */
1976     shadow->group_segment_size = GCN_LOWLAT_HEAP+8;
1977   else
1978     shadow->group_segment_size = (lowlat_size > 65536
1979                                   ? 65536
1980                                   : lowlat_size);
1981
1982   /* We expect kernels to request a single pointer, explicitly, and the
1983      rest of struct kernargs, implicitly.  If they request anything else
1984      then something is wrong.  */
1985   if (kernel->kernarg_segment_size > 8)
1986     {
1987       GOMP_PLUGIN_fatal ("Unexpectedly large kernargs segment requested");
1988       return NULL;
1989     }
1990
1991   status = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region,
1992                                            sizeof (struct kernargs),
1993                                            &shadow->kernarg_address);
1994   if (status != HSA_STATUS_SUCCESS)
1995     hsa_fatal ("Could not allocate memory for GCN kernel arguments", status);
1996   struct kernargs *kernargs = shadow->kernarg_address;
1997
1998   /* Zero-initialize the output_data (minimum needed).  */
1999   kernargs->abi.out_ptr = (int64_t)&kernargs->output_data;
2000   kernargs->output_data.next_output = 0;
2001   for (unsigned i = 0;
2002        i < (sizeof (kernargs->output_data.queue)
2003             / sizeof (kernargs->output_data.queue[0]));
2004        i++)
2005     kernargs->output_data.queue[i].written = 0;
2006   kernargs->output_data.consumed = 0;
2007
2008   /* Pass in the heap location.  */
2009   kernargs->abi.heap_ptr = (int64_t)kernel->module->heap;
2010
2011   /* Create the ephemeral memory spaces.  */
2012   configure_ephemeral_memories (kernel, &kernargs->abi, num_teams, num_threads);
2013
2014   /* Ensure we can recognize unset return values.  */
2015   kernargs->output_data.return_value = 0xcafe0000;
2016
2017   return shadow;
2018 }
2019
2020 static void
2021 process_reverse_offload (uint64_t fn, uint64_t mapnum, uint64_t hostaddrs,
2022                          uint64_t sizes, uint64_t kinds, uint64_t dev_num64)
2023 {
2024   int dev_num = dev_num64;
2025   GOMP_PLUGIN_target_rev (fn, mapnum, hostaddrs, sizes, kinds, dev_num,
2026                           NULL);
2027 }
2028
2029 /* Output any data written to console output from the kernel.  It is expected
2030    that this function is polled during kernel execution.
2031
2032    We print all entries from the last item printed to the next entry without
2033    a "written" flag.  If the "final" flag is set then it'll continue right to
2034    the end.
2035
2036    The print buffer is circular, but the from and to locations don't wrap when
2037    the buffer does, so the output limit is UINT_MAX.  The target blocks on
2038    output when the buffer is full.  */
2039
2040 static void
2041 console_output (struct kernel_info *kernel, struct kernargs *kernargs,
2042                 bool final)
2043 {
2044   unsigned int limit = (sizeof (kernargs->output_data.queue)
2045                         / sizeof (kernargs->output_data.queue[0]));
2046
2047   unsigned int from = __atomic_load_n (&kernargs->output_data.consumed,
2048                                        __ATOMIC_ACQUIRE);
2049   unsigned int to = kernargs->output_data.next_output;
2050
2051   if (from > to)
2052     {
2053       /* Overflow.  */
2054       if (final)
2055         printf ("GCN print buffer overflowed.\n");
2056       return;
2057     }
2058
2059   unsigned int i;
2060   for (i = from; i < to; i++)
2061     {
2062       struct printf_data *data = &kernargs->output_data.queue[i%limit];
2063
2064       if (!data->written && !final)
2065         break;
2066
2067       switch (data->type)
2068         {
2069         case 0: printf ("%.128s%ld\n", data->msg, data->ivalue); break;
2070         case 1: printf ("%.128s%f\n", data->msg, data->dvalue); break;
2071         case 2: printf ("%.128s%.128s\n", data->msg, data->text); break;
2072         case 3: printf ("%.128s%.128s", data->msg, data->text); break;
2073         case 4:
2074           process_reverse_offload (data->value_u64[0], data->value_u64[1],
2075                                    data->value_u64[2], data->value_u64[3],
2076                                    data->value_u64[4], data->value_u64[5]);
2077           break;
2078         default: printf ("GCN print buffer error!\n"); break;
2079         }
2080       data->written = 0;
2081       __atomic_store_n (&kernargs->output_data.consumed, i+1,
2082                         __ATOMIC_RELEASE);
2083     }
2084   fflush (stdout);
2085 }
2086
2087 /* Release data structure created for a kernel dispatch in SHADOW argument,
2088    and clean up the signal and memory allocations.  */
2089
2090 static void
2091 release_kernel_dispatch (struct kernel_dispatch *shadow)
2092 {
2093   GCN_DEBUG ("Released kernel dispatch: %p\n", shadow);
2094
2095   struct kernargs *kernargs = shadow->kernarg_address;
2096   void *addr = (void *)kernargs->abi.arena_ptr;
2097   if (!addr)
2098     addr = (void *)kernargs->abi.stack_ptr;
2099   release_ephemeral_memories (shadow->agent, addr);
2100
2101   hsa_fns.hsa_memory_free_fn (shadow->kernarg_address);
2102
2103   hsa_signal_t s;
2104   s.handle = shadow->signal;
2105   hsa_fns.hsa_signal_destroy_fn (s);
2106
2107   free (shadow);
2108 }
2109
2110 /* Extract the properties from a kernel binary.  */
2111
2112 static void
2113 init_kernel_properties (struct kernel_info *kernel)
2114 {
2115   hsa_status_t status;
2116   struct agent_info *agent = kernel->agent;
2117   hsa_executable_symbol_t kernel_symbol;
2118   char *buf = alloca (strlen (kernel->name) + 4);
2119   sprintf (buf, "%s.kd", kernel->name);
2120   status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
2121                                                  buf, agent->id,
2122                                                  0, &kernel_symbol);
2123   if (status != HSA_STATUS_SUCCESS)
2124     {
2125       hsa_warn ("Could not find symbol for kernel in the code object", status);
2126       fprintf (stderr, "not found name: '%s'\n", buf);
2127       dump_executable_symbols (agent->executable);
2128       goto failure;
2129     }
2130   GCN_DEBUG ("Located kernel %s\n", kernel->name);
2131   status = hsa_fns.hsa_executable_symbol_get_info_fn
2132     (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel->object);
2133   if (status != HSA_STATUS_SUCCESS)
2134     hsa_fatal ("Could not extract a kernel object from its symbol", status);
2135   status = hsa_fns.hsa_executable_symbol_get_info_fn
2136     (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
2137      &kernel->kernarg_segment_size);
2138   if (status != HSA_STATUS_SUCCESS)
2139     hsa_fatal ("Could not get info about kernel argument size", status);
2140   status = hsa_fns.hsa_executable_symbol_get_info_fn
2141     (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
2142      &kernel->group_segment_size);
2143   if (status != HSA_STATUS_SUCCESS)
2144     hsa_fatal ("Could not get info about kernel group segment size", status);
2145   status = hsa_fns.hsa_executable_symbol_get_info_fn
2146     (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
2147      &kernel->private_segment_size);
2148   if (status != HSA_STATUS_SUCCESS)
2149     hsa_fatal ("Could not get info about kernel private segment size",
2150                status);
2151
2152   /* The kernel type is not known until something tries to launch it.  */
2153   kernel->kind = KIND_UNKNOWN;
2154
2155   GCN_DEBUG ("Kernel structure for %s fully initialized with "
2156              "following segment sizes: \n", kernel->name);
2157   GCN_DEBUG ("  group_segment_size: %u\n",
2158              (unsigned) kernel->group_segment_size);
2159   GCN_DEBUG ("  private_segment_size: %u\n",
2160              (unsigned) kernel->private_segment_size);
2161   GCN_DEBUG ("  kernarg_segment_size: %u\n",
2162              (unsigned) kernel->kernarg_segment_size);
2163   return;
2164
2165 failure:
2166   kernel->initialization_failed = true;
2167 }
2168
2169 /* Do all the work that is necessary before running KERNEL for the first time.
2170    The function assumes the program has been created, finalized and frozen by
2171    create_and_finalize_hsa_program.  */
2172
2173 static void
2174 init_kernel (struct kernel_info *kernel)
2175 {
2176   if (pthread_mutex_lock (&kernel->init_mutex))
2177     GOMP_PLUGIN_fatal ("Could not lock a GCN kernel initialization mutex");
2178   if (kernel->initialized)
2179     {
2180       if (pthread_mutex_unlock (&kernel->init_mutex))
2181         GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2182                            "mutex");
2183
2184       return;
2185     }
2186
2187   init_kernel_properties (kernel);
2188
2189   if (!kernel->initialization_failed)
2190     {
2191       GCN_DEBUG ("\n");
2192
2193       kernel->initialized = true;
2194     }
2195   if (pthread_mutex_unlock (&kernel->init_mutex))
2196     GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2197                        "mutex");
2198 }
2199
2200 /* Run KERNEL on its agent, pass VARS to it as arguments and take
2201    launch attributes from KLA.
2202
2203    MODULE_LOCKED indicates that the caller already holds the lock and
2204    run_kernel need not lock it again.
2205    If AQ is NULL then agent->sync_queue will be used.  */
2206
2207 static void
2208 run_kernel (struct kernel_info *kernel, void *vars,
2209             struct GOMP_kernel_launch_attributes *kla,
2210             struct goacc_asyncqueue *aq, bool module_locked)
2211 {
2212   struct agent_info *agent = kernel->agent;
2213   GCN_DEBUG ("SGPRs: %d, VGPRs: %d\n", kernel->description->sgpr_count,
2214              kernel->description->vpgr_count);
2215
2216   /* Reduce the number of threads/workers if there are insufficient
2217      VGPRs available to run the kernels together.  */
2218   if (kla->ndim == 3 && kernel->description->vpgr_count > 0)
2219     {
2220       int max_vgprs = max_isa_vgprs (agent->device_isa);
2221       int granulated_vgprs = (kernel->description->vpgr_count + 3) & ~3;
2222       int max_threads = (max_vgprs / granulated_vgprs) * 4;
2223       if (kla->gdims[2] > max_threads)
2224         {
2225           GCN_WARNING ("Too many VGPRs required to support %d threads/workers"
2226                        " per team/gang - reducing to %d threads/workers.\n",
2227                        kla->gdims[2], max_threads);
2228           kla->gdims[2] = max_threads;
2229         }
2230     }
2231
2232   GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel->agent->device_id,
2233              (aq ? aq->id : 0));
2234   GCN_DEBUG ("GCN launch attribs: gdims:[");
2235   int i;
2236   for (i = 0; i < kla->ndim; ++i)
2237     {
2238       if (i)
2239         DEBUG_PRINT (", ");
2240       DEBUG_PRINT ("%u", kla->gdims[i]);
2241     }
2242   DEBUG_PRINT ("], normalized gdims:[");
2243   for (i = 0; i < kla->ndim; ++i)
2244     {
2245       if (i)
2246         DEBUG_PRINT (", ");
2247       DEBUG_PRINT ("%u", kla->gdims[i] / kla->wdims[i]);
2248     }
2249   DEBUG_PRINT ("], wdims:[");
2250   for (i = 0; i < kla->ndim; ++i)
2251     {
2252       if (i)
2253         DEBUG_PRINT (", ");
2254       DEBUG_PRINT ("%u", kla->wdims[i]);
2255     }
2256   DEBUG_PRINT ("]\n");
2257   DEBUG_FLUSH ();
2258
2259   if (!module_locked && pthread_rwlock_rdlock (&agent->module_rwlock))
2260     GOMP_PLUGIN_fatal ("Unable to read-lock a GCN agent rwlock");
2261
2262   if (!agent->initialized)
2263     GOMP_PLUGIN_fatal ("Agent must be initialized");
2264
2265   if (!kernel->initialized)
2266     GOMP_PLUGIN_fatal ("Called kernel must be initialized");
2267
2268   hsa_queue_t *command_q = (aq ? aq->hsa_queue : kernel->agent->sync_queue);
2269
2270   uint64_t index
2271     = hsa_fns.hsa_queue_add_write_index_release_fn (command_q, 1);
2272   GCN_DEBUG ("Got AQL index %llu\n", (long long int) index);
2273
2274   /* Wait until the queue is not full before writing the packet.   */
2275   while (index - hsa_fns.hsa_queue_load_read_index_acquire_fn (command_q)
2276          >= command_q->size)
2277     ;
2278
2279   /* Do not allow the dimensions to be overridden when running
2280      constructors or destructors.  */
2281   int override_x = kernel->kind == KIND_UNKNOWN ? 0 : override_x_dim;
2282   int override_z = kernel->kind == KIND_UNKNOWN ? 0 : override_z_dim;
2283
2284   hsa_kernel_dispatch_packet_t *packet;
2285   packet = ((hsa_kernel_dispatch_packet_t *) command_q->base_address)
2286            + index % command_q->size;
2287
2288   memset (((uint8_t *) packet) + 4, 0, sizeof (*packet) - 4);
2289   packet->grid_size_x = override_x ? : kla->gdims[0];
2290   packet->workgroup_size_x = get_group_size (kla->ndim,
2291                                              packet->grid_size_x,
2292                                              kla->wdims[0]);
2293
2294   if (kla->ndim >= 2)
2295     {
2296       packet->grid_size_y = kla->gdims[1];
2297       packet->workgroup_size_y = get_group_size (kla->ndim, kla->gdims[1],
2298                                                  kla->wdims[1]);
2299     }
2300   else
2301     {
2302       packet->grid_size_y = 1;
2303       packet->workgroup_size_y = 1;
2304     }
2305
2306   if (kla->ndim == 3)
2307     {
2308       packet->grid_size_z = limit_worker_threads (override_z
2309                                                   ? : kla->gdims[2]);
2310       packet->workgroup_size_z = get_group_size (kla->ndim,
2311                                                  packet->grid_size_z,
2312                                                  kla->wdims[2]);
2313     }
2314   else
2315     {
2316       packet->grid_size_z = 1;
2317       packet->workgroup_size_z = 1;
2318     }
2319
2320   GCN_DEBUG ("GCN launch actuals: grid:[%u, %u, %u],"
2321              " normalized grid:[%u, %u, %u], workgroup:[%u, %u, %u]\n",
2322              packet->grid_size_x, packet->grid_size_y, packet->grid_size_z,
2323              packet->grid_size_x / packet->workgroup_size_x,
2324              packet->grid_size_y / packet->workgroup_size_y,
2325              packet->grid_size_z / packet->workgroup_size_z,
2326              packet->workgroup_size_x, packet->workgroup_size_y,
2327              packet->workgroup_size_z);
2328
2329   struct kernel_dispatch *shadow
2330     = create_kernel_dispatch (kernel, packet->grid_size_x,
2331                               packet->grid_size_z);
2332   shadow->queue = command_q;
2333
2334   if (debug)
2335     {
2336       fprintf (stderr, "\nKernel has following dependencies:\n");
2337       print_kernel_dispatch (shadow, 2);
2338     }
2339
2340   packet->private_segment_size = shadow->private_segment_size;
2341   packet->group_segment_size = shadow->group_segment_size;
2342   packet->kernel_object = shadow->object;
2343   packet->kernarg_address = shadow->kernarg_address;
2344   hsa_signal_t s;
2345   s.handle = shadow->signal;
2346   packet->completion_signal = s;
2347   hsa_fns.hsa_signal_store_relaxed_fn (s, 1);
2348   memcpy (shadow->kernarg_address, &vars, sizeof (vars));
2349
2350   GCN_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");
2351
2352   uint16_t header;
2353   header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
2354   header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
2355   header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
2356
2357   GCN_DEBUG ("Going to dispatch kernel %s on device %d\n", kernel->name,
2358              agent->device_id);
2359
2360   packet_store_release ((uint32_t *) packet, header,
2361                         (uint16_t) kla->ndim
2362                         << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);
2363
2364   hsa_fns.hsa_signal_store_release_fn (command_q->doorbell_signal,
2365                                        index);
2366
2367   GCN_DEBUG ("Kernel dispatched, waiting for completion\n");
2368
2369   /* Root signal waits with 1ms timeout.  */
2370   while (hsa_fns.hsa_signal_wait_acquire_fn (s, HSA_SIGNAL_CONDITION_LT, 1,
2371                                              1000 * 1000,
2372                                              HSA_WAIT_STATE_BLOCKED) != 0)
2373     {
2374       console_output (kernel, shadow->kernarg_address, false);
2375     }
2376   console_output (kernel, shadow->kernarg_address, true);
2377
2378   struct kernargs *kernargs = shadow->kernarg_address;
2379   unsigned int return_value = (unsigned int)kernargs->output_data.return_value;
2380
2381   release_kernel_dispatch (shadow);
2382
2383   if (!module_locked && pthread_rwlock_unlock (&agent->module_rwlock))
2384     GOMP_PLUGIN_fatal ("Unable to unlock a GCN agent rwlock");
2385
2386   unsigned int upper = (return_value & ~0xffff) >> 16;
2387   if (upper == 0xcafe)
2388     ; // exit not called, normal termination.
2389   else if (upper == 0xffff)
2390     ; // exit called.
2391   else
2392     {
2393       GOMP_PLUGIN_error ("Possible kernel exit value corruption, 2 most"
2394                          " significant bytes aren't 0xffff or 0xcafe: 0x%x\n",
2395                          return_value);
2396       abort ();
2397     }
2398
2399   if (upper == 0xffff)
2400     {
2401       unsigned int signal = (return_value >> 8) & 0xff;
2402
2403       if (signal == SIGABRT)
2404         {
2405           GCN_WARNING ("GCN Kernel aborted\n");
2406           abort ();
2407         }
2408       else if (signal != 0)
2409         {
2410           GCN_WARNING ("GCN Kernel received unknown signal\n");
2411           abort ();
2412         }
2413
2414       GCN_DEBUG ("GCN Kernel exited with value: %d\n", return_value & 0xff);
2415       exit (return_value & 0xff);
2416     }
2417 }
2418
2419 /* }}}  */
2420 /* {{{ Load/Unload  */
2421
2422 /* Initialize KERNEL from D and other parameters.  Return true on success. */
2423
2424 static bool
2425 init_basic_kernel_info (struct kernel_info *kernel,
2426                         struct hsa_kernel_description *d,
2427                         struct agent_info *agent,
2428                         struct module_info *module)
2429 {
2430   kernel->agent = agent;
2431   kernel->module = module;
2432   kernel->name = d->name;
2433   kernel->description = d;
2434   if (pthread_mutex_init (&kernel->init_mutex, NULL))
2435     {
2436       GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");
2437       return false;
2438     }
2439   return true;
2440 }
2441
2442 /* Check that the GCN ISA of the given image matches the ISA of the agent. */
2443
2444 static bool
2445 isa_matches_agent (struct agent_info *agent, Elf64_Ehdr *image)
2446 {
2447   int isa_field = elf_gcn_isa_field (image);
2448   const char* isa_s = isa_hsa_name (isa_field);
2449   if (!isa_s)
2450     {
2451       hsa_error ("Unsupported ISA in GCN code object.", HSA_STATUS_ERROR);
2452       return false;
2453     }
2454
2455   if (isa_field != agent->device_isa)
2456     {
2457       char msg[120];
2458       const char *agent_isa_s = isa_hsa_name (agent->device_isa);
2459       const char *agent_isa_gcc_s = isa_gcc_name (agent->device_isa);
2460       assert (agent_isa_s);
2461       assert (agent_isa_gcc_s);
2462
2463       snprintf (msg, sizeof msg,
2464                 "GCN code object ISA '%s' does not match GPU ISA '%s'.\n"
2465                 "Try to recompile with '-foffload-options=-march=%s'.\n",
2466                 isa_s, agent_isa_s, agent_isa_gcc_s);
2467
2468       hsa_error (msg, HSA_STATUS_ERROR);
2469       return false;
2470     }
2471
2472   return true;
2473 }
2474
2475 /* Create and finalize the program consisting of all loaded modules.  */
2476
2477 static bool
2478 create_and_finalize_hsa_program (struct agent_info *agent)
2479 {
2480   hsa_status_t status;
2481   bool res = true;
2482   if (pthread_mutex_lock (&agent->prog_mutex))
2483     {
2484       GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
2485       return false;
2486     }
2487   if (agent->prog_finalized)
2488     goto final;
2489
2490   status
2491     = hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
2492                                         HSA_EXECUTABLE_STATE_UNFROZEN,
2493                                         "", &agent->executable);
2494   if (status != HSA_STATUS_SUCCESS)
2495     {
2496       hsa_error ("Could not create GCN executable", status);
2497       goto fail;
2498     }
2499
2500   /* Load any GCN modules.  */
2501   struct module_info *module = agent->module;
2502   if (module)
2503     {
2504       Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;
2505
2506       if (!isa_matches_agent (agent, image))
2507         goto fail;
2508
2509       hsa_code_object_t co = { 0 };
2510       status = hsa_fns.hsa_code_object_deserialize_fn
2511         (module->image_desc->gcn_image->image,
2512          module->image_desc->gcn_image->size,
2513          NULL, &co);
2514       if (status != HSA_STATUS_SUCCESS)
2515         {
2516           hsa_error ("Could not deserialize GCN code object", status);
2517           goto fail;
2518         }
2519
2520       status = hsa_fns.hsa_executable_load_code_object_fn
2521         (agent->executable, agent->id, co, "");
2522       if (status != HSA_STATUS_SUCCESS)
2523         {
2524           hsa_error ("Could not load GCN code object", status);
2525           goto fail;
2526         }
2527
2528       if (!module->heap)
2529         {
2530           status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
2531                                                    gcn_kernel_heap_size,
2532                                                    (void**)&module->heap);
2533           if (status != HSA_STATUS_SUCCESS)
2534             {
2535               hsa_error ("Could not allocate memory for GCN heap", status);
2536               goto fail;
2537             }
2538
2539           status = hsa_fns.hsa_memory_assign_agent_fn
2540                         (module->heap, agent->id, HSA_ACCESS_PERMISSION_RW);
2541           if (status != HSA_STATUS_SUCCESS)
2542             {
2543               hsa_error ("Could not assign GCN heap memory to device", status);
2544               goto fail;
2545             }
2546
2547           hsa_fns.hsa_memory_copy_fn (&module->heap->size,
2548                                       &gcn_kernel_heap_size,
2549                                       sizeof (gcn_kernel_heap_size));
2550         }
2551
2552     }
2553
2554   if (debug)
2555     dump_executable_symbols (agent->executable);
2556
2557   status = hsa_fns.hsa_executable_freeze_fn (agent->executable, "");
2558   if (status != HSA_STATUS_SUCCESS)
2559     {
2560       hsa_error ("Could not freeze the GCN executable", status);
2561       goto fail;
2562     }
2563
2564 final:
2565   agent->prog_finalized = true;
2566
2567   if (pthread_mutex_unlock (&agent->prog_mutex))
2568     {
2569       GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
2570       res = false;
2571     }
2572
2573   return res;
2574
2575 fail:
2576   res = false;
2577   goto final;
2578 }
2579
2580 /* Free the HSA program in agent and everything associated with it and set
2581    agent->prog_finalized and the initialized flags of all kernels to false.
2582    Return TRUE on success.  */
2583
2584 static bool
2585 destroy_hsa_program (struct agent_info *agent)
2586 {
2587   if (!agent->prog_finalized)
2588     return true;
2589
2590   hsa_status_t status;
2591
2592   GCN_DEBUG ("Destroying the current GCN program.\n");
2593
2594   status = hsa_fns.hsa_executable_destroy_fn (agent->executable);
2595   if (status != HSA_STATUS_SUCCESS)
2596     return hsa_error ("Could not destroy GCN executable", status);
2597
2598   if (agent->module)
2599     {
2600       int i;
2601       for (i = 0; i < agent->module->kernel_count; i++)
2602         agent->module->kernels[i].initialized = false;
2603
2604       if (agent->module->heap)
2605         {
2606           hsa_fns.hsa_memory_free_fn (agent->module->heap);
2607           agent->module->heap = NULL;
2608         }
2609     }
2610   agent->prog_finalized = false;
2611   return true;
2612 }
2613
2614 /* Deinitialize all information associated with MODULE and kernels within
2615    it.  Return TRUE on success.  */
2616
2617 static bool
2618 destroy_module (struct module_info *module, bool locked)
2619 {
2620   /* Run destructors before destroying module.  */
2621   struct GOMP_kernel_launch_attributes kla =
2622     { 3,
2623       /* Grid size.  */
2624       { 1, 64, 1 },
2625       /* Work-group size.  */
2626       { 1, 64, 1 }
2627     };
2628
2629   if (module->fini_array_func)
2630     {
2631       init_kernel (module->fini_array_func);
2632       run_kernel (module->fini_array_func, NULL, &kla, NULL, locked);
2633     }
2634   module->constructors_run_p = false;
2635
2636   int i;
2637   for (i = 0; i < module->kernel_count; i++)
2638     if (pthread_mutex_destroy (&module->kernels[i].init_mutex))
2639       {
2640         GOMP_PLUGIN_error ("Failed to destroy a GCN kernel initialization "
2641                            "mutex");
2642         return false;
2643       }
2644
2645   return true;
2646 }
2647
2648 /* }}}  */
2649 /* {{{ Async  */
2650
2651 /* Callback of dispatch queues to report errors.  */
2652
2653 static void
2654 execute_queue_entry (struct goacc_asyncqueue *aq, int index)
2655 {
2656   struct queue_entry *entry = &aq->queue[index];
2657
2658   switch (entry->type)
2659     {
2660     case KERNEL_LAUNCH:
2661       if (DEBUG_QUEUES)
2662         GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d)\n",
2663                    aq->agent->device_id, aq->id, index);
2664       run_kernel (entry->u.launch.kernel,
2665                   entry->u.launch.vars,
2666                   &entry->u.launch.kla, aq, false);
2667       if (DEBUG_QUEUES)
2668         GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d) done\n",
2669                    aq->agent->device_id, aq->id, index);
2670       break;
2671
2672     case CALLBACK:
2673       if (DEBUG_QUEUES)
2674         GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d)\n",
2675                    aq->agent->device_id, aq->id, index);
2676       entry->u.callback.fn (entry->u.callback.data);
2677       if (DEBUG_QUEUES)
2678         GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d) done\n",
2679                    aq->agent->device_id, aq->id, index);
2680       break;
2681
2682     case ASYNC_WAIT:
2683       {
2684         /* FIXME: is it safe to access a placeholder that may already have
2685            been executed?  */
2686         struct placeholder *placeholderp = entry->u.asyncwait.placeholderp;
2687
2688         if (DEBUG_QUEUES)
2689           GCN_DEBUG ("Async thread %d:%d: Executing async wait entry (%d)\n",
2690                      aq->agent->device_id, aq->id, index);
2691
2692         pthread_mutex_lock (&placeholderp->mutex);
2693
2694         while (!placeholderp->executed)
2695           pthread_cond_wait (&placeholderp->cond, &placeholderp->mutex);
2696
2697         pthread_mutex_unlock (&placeholderp->mutex);
2698
2699         if (pthread_cond_destroy (&placeholderp->cond))
2700           GOMP_PLUGIN_error ("Failed to destroy serialization cond");
2701
2702         if (pthread_mutex_destroy (&placeholderp->mutex))
2703           GOMP_PLUGIN_error ("Failed to destroy serialization mutex");
2704
2705         if (DEBUG_QUEUES)
2706           GCN_DEBUG ("Async thread %d:%d: Executing async wait "
2707                      "entry (%d) done\n", aq->agent->device_id, aq->id, index);
2708       }
2709       break;
2710
2711     case ASYNC_PLACEHOLDER:
2712       pthread_mutex_lock (&entry->u.placeholder.mutex);
2713       entry->u.placeholder.executed = 1;
2714       pthread_cond_signal (&entry->u.placeholder.cond);
2715       pthread_mutex_unlock (&entry->u.placeholder.mutex);
2716       break;
2717
2718     default:
2719       GOMP_PLUGIN_fatal ("Unknown queue element");
2720     }
2721 }
2722
2723 /* This function is run as a thread to service an async queue in the
2724    background.  It runs continuously until the stop flag is set.  */
2725
2726 static void *
2727 drain_queue (void *thread_arg)
2728 {
2729   struct goacc_asyncqueue *aq = thread_arg;
2730
2731   if (DRAIN_QUEUE_SYNCHRONOUS_P)
2732     {
2733       aq->drain_queue_stop = 2;
2734       return NULL;
2735     }
2736
2737   pthread_mutex_lock (&aq->mutex);
2738
2739   while (true)
2740     {
2741       if (aq->drain_queue_stop)
2742         break;
2743
2744       if (aq->queue_n > 0)
2745         {
2746           pthread_mutex_unlock (&aq->mutex);
2747           execute_queue_entry (aq, aq->queue_first);
2748
2749           pthread_mutex_lock (&aq->mutex);
2750           aq->queue_first = ((aq->queue_first + 1)
2751                              % ASYNC_QUEUE_SIZE);
2752           aq->queue_n--;
2753
2754           if (DEBUG_THREAD_SIGNAL)
2755             GCN_DEBUG ("Async thread %d:%d: broadcasting queue out update\n",
2756                        aq->agent->device_id, aq->id);
2757           pthread_cond_broadcast (&aq->queue_cond_out);
2758           pthread_mutex_unlock (&aq->mutex);
2759
2760           if (DEBUG_QUEUES)
2761             GCN_DEBUG ("Async thread %d:%d: continue\n", aq->agent->device_id,
2762                        aq->id);
2763           pthread_mutex_lock (&aq->mutex);
2764         }
2765       else
2766         {
2767           if (DEBUG_THREAD_SLEEP)
2768             GCN_DEBUG ("Async thread %d:%d: going to sleep\n",
2769                        aq->agent->device_id, aq->id);
2770           pthread_cond_wait (&aq->queue_cond_in, &aq->mutex);
2771           if (DEBUG_THREAD_SLEEP)
2772             GCN_DEBUG ("Async thread %d:%d: woke up, rechecking\n",
2773                        aq->agent->device_id, aq->id);
2774         }
2775     }
2776
2777   aq->drain_queue_stop = 2;
2778   if (DEBUG_THREAD_SIGNAL)
2779     GCN_DEBUG ("Async thread %d:%d: broadcasting last queue out update\n",
2780                aq->agent->device_id, aq->id);
2781   pthread_cond_broadcast (&aq->queue_cond_out);
2782   pthread_mutex_unlock (&aq->mutex);
2783
2784   GCN_DEBUG ("Async thread %d:%d: returning\n", aq->agent->device_id, aq->id);
2785   return NULL;
2786 }
2787
2788 /* This function is used only when DRAIN_QUEUE_SYNCHRONOUS_P is set, which
2789    is not usually the case.  This is just a debug tool.  */
2790
2791 static void
2792 drain_queue_synchronous (struct goacc_asyncqueue *aq)
2793 {
2794   pthread_mutex_lock (&aq->mutex);
2795
2796   while (aq->queue_n > 0)
2797     {
2798       execute_queue_entry (aq, aq->queue_first);
2799
2800       aq->queue_first = ((aq->queue_first + 1)
2801                          % ASYNC_QUEUE_SIZE);
2802       aq->queue_n--;
2803     }
2804
2805   pthread_mutex_unlock (&aq->mutex);
2806 }
2807
2808 /* Block the current thread until an async queue is writable.  The aq->mutex
2809    lock should be held on entry, and remains locked on exit.  */
2810
2811 static void
2812 wait_for_queue_nonfull (struct goacc_asyncqueue *aq)
2813 {
2814   if (aq->queue_n == ASYNC_QUEUE_SIZE)
2815     {
2816       /* Queue is full.  Wait for it to not be full.  */
2817       while (aq->queue_n == ASYNC_QUEUE_SIZE)
2818         pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
2819     }
2820 }
2821
2822 /* Request an asynchronous kernel launch on the specified queue.  This
2823    may block if the queue is full, but returns without waiting for the
2824    kernel to run.  */
2825
2826 static void
2827 queue_push_launch (struct goacc_asyncqueue *aq, struct kernel_info *kernel,
2828                    void *vars, struct GOMP_kernel_launch_attributes *kla)
2829 {
2830   assert (aq->agent == kernel->agent);
2831
2832   pthread_mutex_lock (&aq->mutex);
2833
2834   wait_for_queue_nonfull (aq);
2835
2836   int queue_last = ((aq->queue_first + aq->queue_n)
2837                     % ASYNC_QUEUE_SIZE);
2838   if (DEBUG_QUEUES)
2839     GCN_DEBUG ("queue_push_launch %d:%d: at %i\n", aq->agent->device_id,
2840                aq->id, queue_last);
2841
2842   aq->queue[queue_last].type = KERNEL_LAUNCH;
2843   aq->queue[queue_last].u.launch.kernel = kernel;
2844   aq->queue[queue_last].u.launch.vars = vars;
2845   aq->queue[queue_last].u.launch.kla = *kla;
2846
2847   aq->queue_n++;
2848
2849   if (DEBUG_THREAD_SIGNAL)
2850     GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2851                aq->agent->device_id, aq->id);
2852   pthread_cond_signal (&aq->queue_cond_in);
2853
2854   pthread_mutex_unlock (&aq->mutex);
2855 }
2856
2857 /* Request an asynchronous callback on the specified queue.  The callback
2858    function will be called, with the given opaque data, from the appropriate
2859    async thread, when all previous items on that queue are complete.  */
2860
2861 static void
2862 queue_push_callback (struct goacc_asyncqueue *aq, void (*fn)(void *),
2863                      void *data)
2864 {
2865   pthread_mutex_lock (&aq->mutex);
2866
2867   wait_for_queue_nonfull (aq);
2868
2869   int queue_last = ((aq->queue_first + aq->queue_n)
2870                     % ASYNC_QUEUE_SIZE);
2871   if (DEBUG_QUEUES)
2872     GCN_DEBUG ("queue_push_callback %d:%d: at %i\n", aq->agent->device_id,
2873                aq->id, queue_last);
2874
2875   aq->queue[queue_last].type = CALLBACK;
2876   aq->queue[queue_last].u.callback.fn = fn;
2877   aq->queue[queue_last].u.callback.data = data;
2878
2879   aq->queue_n++;
2880
2881   if (DEBUG_THREAD_SIGNAL)
2882     GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2883                aq->agent->device_id, aq->id);
2884   pthread_cond_signal (&aq->queue_cond_in);
2885
2886   pthread_mutex_unlock (&aq->mutex);
2887 }
2888
2889 /* Request that a given async thread wait for another thread (unspecified) to
2890    reach the given placeholder.  The wait will occur when all previous entries
2891    on the queue are complete.  A placeholder is effectively a kind of signal
2892    which simply sets a flag when encountered in a queue.  */
2893
2894 static void
2895 queue_push_asyncwait (struct goacc_asyncqueue *aq,
2896                       struct placeholder *placeholderp)
2897 {
2898   pthread_mutex_lock (&aq->mutex);
2899
2900   wait_for_queue_nonfull (aq);
2901
2902   int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2903   if (DEBUG_QUEUES)
2904     GCN_DEBUG ("queue_push_asyncwait %d:%d: at %i\n", aq->agent->device_id,
2905                aq->id, queue_last);
2906
2907   aq->queue[queue_last].type = ASYNC_WAIT;
2908   aq->queue[queue_last].u.asyncwait.placeholderp = placeholderp;
2909
2910   aq->queue_n++;
2911
2912   if (DEBUG_THREAD_SIGNAL)
2913     GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2914                aq->agent->device_id, aq->id);
2915   pthread_cond_signal (&aq->queue_cond_in);
2916
2917   pthread_mutex_unlock (&aq->mutex);
2918 }
2919
2920 /* Add a placeholder into an async queue.  When the async thread reaches the
2921    placeholder it will set the "executed" flag to true and continue.
2922    Another thread may be waiting on this thread reaching the placeholder.  */
2923
2924 static struct placeholder *
2925 queue_push_placeholder (struct goacc_asyncqueue *aq)
2926 {
2927   struct placeholder *placeholderp;
2928
2929   pthread_mutex_lock (&aq->mutex);
2930
2931   wait_for_queue_nonfull (aq);
2932
2933   int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2934   if (DEBUG_QUEUES)
2935     GCN_DEBUG ("queue_push_placeholder %d:%d: at %i\n", aq->agent->device_id,
2936                aq->id, queue_last);
2937
2938   aq->queue[queue_last].type = ASYNC_PLACEHOLDER;
2939   placeholderp = &aq->queue[queue_last].u.placeholder;
2940
2941   if (pthread_mutex_init (&placeholderp->mutex, NULL))
2942     {
2943       pthread_mutex_unlock (&aq->mutex);
2944       GOMP_PLUGIN_error ("Failed to initialize serialization mutex");
2945     }
2946
2947   if (pthread_cond_init (&placeholderp->cond, NULL))
2948     {
2949       pthread_mutex_unlock (&aq->mutex);
2950       GOMP_PLUGIN_error ("Failed to initialize serialization cond");
2951     }
2952
2953   placeholderp->executed = 0;
2954
2955   aq->queue_n++;
2956
2957   if (DEBUG_THREAD_SIGNAL)
2958     GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2959                aq->agent->device_id, aq->id);
2960   pthread_cond_signal (&aq->queue_cond_in);
2961
2962   pthread_mutex_unlock (&aq->mutex);
2963
2964   return placeholderp;
2965 }
2966
2967 /* Signal an asynchronous thread to terminate, and wait for it to do so.  */
2968
2969 static void
2970 finalize_async_thread (struct goacc_asyncqueue *aq)
2971 {
2972   pthread_mutex_lock (&aq->mutex);
2973   if (aq->drain_queue_stop == 2)
2974     {
2975       pthread_mutex_unlock (&aq->mutex);
2976       return;
2977     }
2978
2979   aq->drain_queue_stop = 1;
2980
2981   if (DEBUG_THREAD_SIGNAL)
2982     GCN_DEBUG ("Signalling async thread %d:%d: cond_in\n",
2983                aq->agent->device_id, aq->id);
2984   pthread_cond_signal (&aq->queue_cond_in);
2985
2986   while (aq->drain_queue_stop != 2)
2987     {
2988       if (DEBUG_THREAD_SLEEP)
2989         GCN_DEBUG ("Waiting for async thread %d:%d to finish, putting thread"
2990                    " to sleep\n", aq->agent->device_id, aq->id);
2991       pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
2992       if (DEBUG_THREAD_SLEEP)
2993         GCN_DEBUG ("Waiting, woke up thread %d:%d.  Rechecking\n",
2994                    aq->agent->device_id, aq->id);
2995     }
2996
2997   GCN_DEBUG ("Done waiting for async thread %d:%d\n", aq->agent->device_id,
2998              aq->id);
2999   pthread_mutex_unlock (&aq->mutex);
3000
3001   int err = pthread_join (aq->thread_drain_queue, NULL);
3002   if (err != 0)
3003     GOMP_PLUGIN_fatal ("Join async thread %d:%d: failed: %s",
3004                        aq->agent->device_id, aq->id, strerror (err));
3005   GCN_DEBUG ("Joined with async thread %d:%d\n", aq->agent->device_id, aq->id);
3006 }
3007
3008 /* Set up an async queue for OpenMP.  There will be only one.  The
3009    implementation simply uses an OpenACC async queue.
3010    FIXME: is this thread-safe if two threads call this function?  */
3011
3012 static void
3013 maybe_init_omp_async (struct agent_info *agent)
3014 {
3015   if (!agent->omp_async_queue)
3016     agent->omp_async_queue
3017       = GOMP_OFFLOAD_openacc_async_construct (agent->device_id);
3018 }
3019
3020 /* A wrapper that works around an issue in the HSA runtime with host-to-device
3021    copies from read-only pages.  */
3022
3023 static void
3024 hsa_memory_copy_wrapper (void *dst, const void *src, size_t len)
3025 {
3026   hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, len);
3027
3028   if (status == HSA_STATUS_SUCCESS)
3029     return;
3030
3031   /* It appears that the copy fails if the source data is in a read-only page.
3032      We can't detect that easily, so try copying the data to a temporary buffer
3033      and doing the copy again if we got an error above.  */
3034
3035   GCN_WARNING ("Read-only data transfer bug workaround triggered for "
3036                "[%p:+%d]\n", (void *) src, (int) len);
3037
3038   void *src_copy = malloc (len);
3039   memcpy (src_copy, src, len);
3040   status = hsa_fns.hsa_memory_copy_fn (dst, (const void *) src_copy, len);
3041   free (src_copy);
3042   if (status != HSA_STATUS_SUCCESS)
3043     GOMP_PLUGIN_error ("memory copy failed");
3044 }
3045
3046 /* Copy data to or from a device.  This is intended for use as an async
3047    callback event.  */
3048
3049 static void
3050 copy_data (void *data_)
3051 {
3052   struct copy_data *data = (struct copy_data *)data_;
3053   GCN_DEBUG ("Async thread %d:%d: Copying %zu bytes from (%p) to (%p)\n",
3054              data->aq->agent->device_id, data->aq->id, data->len, data->src,
3055              data->dst);
3056   hsa_memory_copy_wrapper (data->dst, data->src, data->len);
3057   free (data);
3058 }
3059
3060 /* Request an asynchronous data copy, to or from a device, on a given queue.
3061    The event will be registered as a callback.  */
3062
3063 static void
3064 queue_push_copy (struct goacc_asyncqueue *aq, void *dst, const void *src,
3065                  size_t len)
3066 {
3067   if (DEBUG_QUEUES)
3068     GCN_DEBUG ("queue_push_copy %d:%d: %zu bytes from (%p) to (%p)\n",
3069                aq->agent->device_id, aq->id, len, src, dst);
3070   struct copy_data *data
3071     = (struct copy_data *)GOMP_PLUGIN_malloc (sizeof (struct copy_data));
3072   data->dst = dst;
3073   data->src = src;
3074   data->len = len;
3075   data->aq = aq;
3076   queue_push_callback (aq, copy_data, data);
3077 }
3078
3079 /* Return true if the given queue is currently empty.  */
3080
3081 static int
3082 queue_empty (struct goacc_asyncqueue *aq)
3083 {
3084   pthread_mutex_lock (&aq->mutex);
3085   int res = aq->queue_n == 0 ? 1 : 0;
3086   pthread_mutex_unlock (&aq->mutex);
3087
3088   return res;
3089 }
3090
3091 /* Wait for a given queue to become empty.  This implements an OpenACC wait
3092    directive.  */
3093
3094 static void
3095 wait_queue (struct goacc_asyncqueue *aq)
3096 {
3097   if (DRAIN_QUEUE_SYNCHRONOUS_P)
3098     {
3099       drain_queue_synchronous (aq);
3100       return;
3101     }
3102
3103   pthread_mutex_lock (&aq->mutex);
3104
3105   while (aq->queue_n > 0)
3106     {
3107       if (DEBUG_THREAD_SLEEP)
3108         GCN_DEBUG ("waiting for thread %d:%d, putting thread to sleep\n",
3109                    aq->agent->device_id, aq->id);
3110       pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
3111       if (DEBUG_THREAD_SLEEP)
3112         GCN_DEBUG ("thread %d:%d woke up.  Rechecking\n", aq->agent->device_id,
3113                    aq->id);
3114     }
3115
3116   pthread_mutex_unlock (&aq->mutex);
3117   GCN_DEBUG ("waiting for thread %d:%d, done\n", aq->agent->device_id, aq->id);
3118 }
3119
3120 /* }}}  */
3121 /* {{{ OpenACC support  */
3122
3123 /* Execute an OpenACC kernel, synchronously or asynchronously.  */
3124
3125 static void
3126 gcn_exec (struct kernel_info *kernel,
3127           void **devaddrs, unsigned *dims, void *targ_mem_desc, bool async,
3128           struct goacc_asyncqueue *aq)
3129 {
3130   if (!GOMP_OFFLOAD_can_run (kernel))
3131     GOMP_PLUGIN_fatal ("OpenACC host fallback unimplemented.");
3132
3133   /* If we get here then this must be an OpenACC kernel.  */
3134   kernel->kind = KIND_OPENACC;
3135
3136   struct hsa_kernel_description *hsa_kernel_desc = NULL;
3137   for (unsigned i = 0; i < kernel->module->image_desc->kernel_count; i++)
3138     {
3139       struct hsa_kernel_description *d
3140         = &kernel->module->image_desc->kernel_infos[i];
3141       if (d->name == kernel->name)
3142         {
3143           hsa_kernel_desc = d;
3144           break;
3145         }
3146     }
3147
3148   /* We may have statically-determined dimensions in
3149      hsa_kernel_desc->oacc_dims[] or dimensions passed to this offload kernel
3150      invocation at runtime in dims[].  We allow static dimensions to take
3151      priority over dynamic dimensions when present (non-zero).  */
3152   if (hsa_kernel_desc->oacc_dims[0] > 0)
3153     dims[0] = hsa_kernel_desc->oacc_dims[0];
3154   if (hsa_kernel_desc->oacc_dims[1] > 0)
3155     dims[1] = hsa_kernel_desc->oacc_dims[1];
3156   if (hsa_kernel_desc->oacc_dims[2] > 0)
3157     dims[2] = hsa_kernel_desc->oacc_dims[2];
3158
3159   /* Ideally, when a dimension isn't explicitly specified, we should
3160      tune it to run 40 (or 32?) threads per CU with no threads getting queued.
3161      In practice, we tune for peak performance on BabelStream, which
3162      for OpenACC is currently 32 threads per CU.  */
3163   if (dims[0] == 0 && dims[1] == 0)
3164     {
3165       /* If any of the OpenACC dimensions remain 0 then we get to pick a
3166          number.  There isn't really a correct answer for this without a clue
3167          about the problem size, so let's do a reasonable number of workers
3168          and gangs.  */
3169
3170       dims[0] = get_cu_count (kernel->agent) * 4; /* Gangs.  */
3171       dims[1] = 8; /* Workers.  */
3172     }
3173   else if (dims[0] == 0 && dims[1] > 0)
3174     {
3175       /* Auto-scale the number of gangs with the requested number of workers.  */
3176       dims[0] = get_cu_count (kernel->agent) * (32 / dims[1]);
3177     }
3178   else if (dims[0] > 0 && dims[1] == 0)
3179     {
3180       /* Auto-scale the number of workers with the requested number of gangs.  */
3181       dims[1] = get_cu_count (kernel->agent) * 32 / dims[0];
3182       if (dims[1] == 0)
3183         dims[1] = 1;
3184       if (dims[1] > 16)
3185         dims[1] = 16;
3186     }
3187
3188   /* The incoming dimensions are expressed in terms of gangs, workers, and
3189      vectors.  The HSA dimensions are expressed in terms of "work-items",
3190      which means multiples of vector lanes.
3191
3192      The "grid size" specifies the size of the problem space, and the
3193      "work-group size" specifies how much of that we want a single compute
3194      unit to chew on at once.
3195
3196      The three dimensions do not really correspond to hardware, but the
3197      important thing is that the HSA runtime will launch as many
3198      work-groups as it takes to process the entire grid, and each
3199      work-group will contain as many wave-fronts as it takes to process
3200      the work-items in that group.
3201
3202      Essentially, as long as we set the Y dimension to 64 (the number of
3203      vector lanes in hardware), and the Z group size to the maximum (16),
3204      then we will get the gangs (X) and workers (Z) launched as we expect.
3205
3206      The reason for the apparent reversal of vector and worker dimension
3207      order is to do with the way the run-time distributes work-items across
3208      v1 and v2.  */
3209   struct GOMP_kernel_launch_attributes kla =
3210     {3,
3211      /* Grid size.  */
3212      {dims[0], 64, dims[1]},
3213      /* Work-group size.  */
3214      {1,       64, 16}
3215     };
3216
3217   struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
3218   acc_prof_info *prof_info = thr->prof_info;
3219   acc_event_info enqueue_launch_event_info;
3220   acc_api_info *api_info = thr->api_info;
3221   bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
3222   if (profiling_dispatch_p)
3223     {
3224       prof_info->event_type = acc_ev_enqueue_launch_start;
3225
3226       enqueue_launch_event_info.launch_event.event_type
3227         = prof_info->event_type;
3228       enqueue_launch_event_info.launch_event.valid_bytes
3229         = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
3230       enqueue_launch_event_info.launch_event.parent_construct
3231         = acc_construct_parallel;
3232       enqueue_launch_event_info.launch_event.implicit = 1;
3233       enqueue_launch_event_info.launch_event.tool_info = NULL;
3234       enqueue_launch_event_info.launch_event.kernel_name
3235         = (char *) kernel->name;
3236       enqueue_launch_event_info.launch_event.num_gangs = kla.gdims[0];
3237       enqueue_launch_event_info.launch_event.num_workers = kla.gdims[2];
3238       enqueue_launch_event_info.launch_event.vector_length = kla.gdims[1];
3239
3240       api_info->device_api = acc_device_api_other;
3241
3242       GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3243         &enqueue_launch_event_info, api_info);
3244     }
3245
3246   if (!async)
3247     run_kernel (kernel, devaddrs, &kla, NULL, false);
3248   else
3249     queue_push_launch (aq, kernel, devaddrs, &kla);
3250
3251   if (profiling_dispatch_p)
3252     {
3253       prof_info->event_type = acc_ev_enqueue_launch_end;
3254       enqueue_launch_event_info.launch_event.event_type = prof_info->event_type;
3255       GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3256                                             &enqueue_launch_event_info,
3257                                             api_info);
3258     }
3259 }
3260
3261 /* }}}  */
3262 /* {{{ Generic Plugin API  */
3263
3264 /* Return the name of the accelerator, which is "gcn".  */
3265
3266 const char *
3267 GOMP_OFFLOAD_get_name (void)
3268 {
3269   return "gcn";
3270 }
3271
3272 /* Return the specific capabilities the HSA accelerator have.  */
3273
3274 unsigned int
3275 GOMP_OFFLOAD_get_caps (void)
3276 {
3277   /* FIXME: Enable shared memory for APU, but not discrete GPU.  */
3278   return /*GOMP_OFFLOAD_CAP_SHARED_MEM |*/ GOMP_OFFLOAD_CAP_OPENMP_400
3279             | GOMP_OFFLOAD_CAP_OPENACC_200;
3280 }
3281
3282 /* Identify as GCN accelerator.  */
3283
3284 int
3285 GOMP_OFFLOAD_get_type (void)
3286 {
3287   return OFFLOAD_TARGET_TYPE_GCN;
3288 }
3289
3290 /* Return the libgomp version number we're compatible with.  There is
3291    no requirement for cross-version compatibility.  */
3292
3293 unsigned
3294 GOMP_OFFLOAD_version (void)
3295 {
3296   return GOMP_VERSION;
3297 }
3298
3299 /* Return the number of GCN devices on the system.  */
3300
3301 int
3302 GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask)
3303 {
3304   if (!init_hsa_context ())
3305     return 0;
3306   /* Return -1 if no omp_requires_mask cannot be fulfilled but
3307      devices were present.  */
3308   if (hsa_context.agent_count > 0
3309       && ((omp_requires_mask
3310            & ~(GOMP_REQUIRES_UNIFIED_ADDRESS
3311                | GOMP_REQUIRES_REVERSE_OFFLOAD)) != 0))
3312     return -1;
3313   return hsa_context.agent_count;
3314 }
3315
3316 /* Initialize device (agent) number N so that it can be used for computation.
3317    Return TRUE on success.  */
3318
3319 bool
3320 GOMP_OFFLOAD_init_device (int n)
3321 {
3322   if (!init_hsa_context ())
3323     return false;
3324   if (n >= hsa_context.agent_count)
3325     {
3326       GOMP_PLUGIN_error ("Request to initialize non-existent GCN device %i", n);
3327       return false;
3328     }
3329   struct agent_info *agent = &hsa_context.agents[n];
3330
3331   if (agent->initialized)
3332     return true;
3333
3334   agent->device_id = n;
3335
3336   if (pthread_rwlock_init (&agent->module_rwlock, NULL))
3337     {
3338       GOMP_PLUGIN_error ("Failed to initialize a GCN agent rwlock");
3339       return false;
3340     }
3341   if (pthread_mutex_init (&agent->prog_mutex, NULL))
3342     {
3343       GOMP_PLUGIN_error ("Failed to initialize a GCN agent program mutex");
3344       return false;
3345     }
3346   if (pthread_mutex_init (&agent->async_queues_mutex, NULL))
3347     {
3348       GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
3349       return false;
3350     }
3351   if (pthread_mutex_init (&agent->ephemeral_memories_write_lock, NULL))
3352     {
3353       GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
3354       return false;
3355     }
3356   agent->async_queues = NULL;
3357   agent->omp_async_queue = NULL;
3358   agent->ephemeral_memories_list = NULL;
3359
3360   uint32_t queue_size;
3361   hsa_status_t status;
3362   status = hsa_fns.hsa_agent_get_info_fn (agent->id,
3363                                           HSA_AGENT_INFO_QUEUE_MAX_SIZE,
3364                                           &queue_size);
3365   if (status != HSA_STATUS_SUCCESS)
3366     return hsa_error ("Error requesting maximum queue size of the GCN agent",
3367                       status);
3368
3369   status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_NAME,
3370                                           &agent->name);
3371   if (status != HSA_STATUS_SUCCESS)
3372     return hsa_error ("Error querying the name of the agent", status);
3373
3374   agent->device_isa = isa_code (agent->name);
3375   if (agent->device_isa < 0)
3376     return hsa_error ("Unknown GCN agent architecture", HSA_STATUS_ERROR);
3377
3378   status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_VENDOR_NAME,
3379                                           &agent->vendor_name);
3380   if (status != HSA_STATUS_SUCCESS)
3381     return hsa_error ("Error querying the vendor name of the agent", status);
3382
3383   status = hsa_fns.hsa_queue_create_fn (agent->id, queue_size,
3384                                         HSA_QUEUE_TYPE_MULTI,
3385                                         hsa_queue_callback, NULL, UINT32_MAX,
3386                                         UINT32_MAX, &agent->sync_queue);
3387   if (status != HSA_STATUS_SUCCESS)
3388     return hsa_error ("Error creating command queue", status);
3389
3390   agent->kernarg_region.handle = (uint64_t) -1;
3391   status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
3392                                                  get_kernarg_memory_region,
3393                                                  &agent->kernarg_region);
3394   if (status != HSA_STATUS_SUCCESS
3395       && status != HSA_STATUS_INFO_BREAK)
3396     hsa_error ("Scanning memory regions failed", status);
3397   if (agent->kernarg_region.handle == (uint64_t) -1)
3398     {
3399       GOMP_PLUGIN_error ("Could not find suitable memory region for kernel "
3400                          "arguments");
3401       return false;
3402     }
3403   GCN_DEBUG ("Selected kernel arguments memory region:\n");
3404   dump_hsa_region (agent->kernarg_region, NULL);
3405
3406   agent->data_region.handle = (uint64_t) -1;
3407   status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
3408                                                  get_data_memory_region,
3409                                                  &agent->data_region);
3410   if (status != HSA_STATUS_SUCCESS
3411       && status != HSA_STATUS_INFO_BREAK)
3412     hsa_error ("Scanning memory regions failed", status);
3413   if (agent->data_region.handle == (uint64_t) -1)
3414     {
3415       GOMP_PLUGIN_error ("Could not find suitable memory region for device "
3416                          "data");
3417       return false;
3418     }
3419   GCN_DEBUG ("Selected device data memory region:\n");
3420   dump_hsa_region (agent->data_region, NULL);
3421
3422   GCN_DEBUG ("GCN agent %d initialized\n", n);
3423
3424   agent->initialized = true;
3425   return true;
3426 }
3427
3428 /* Load GCN object-code module described by struct gcn_image_desc in
3429    TARGET_DATA and return references to kernel descriptors in TARGET_TABLE.
3430    If there are any constructors then run them.  If not NULL, REV_FN_TABLE will
3431    contain the on-device addresses of the functions for reverse offload.  To be
3432    freed by the caller.  */
3433
3434 int
3435 GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
3436                          struct addr_pair **target_table,
3437                          uint64_t **rev_fn_table,
3438                          uint64_t *host_ind_fn_table)
3439 {
3440   if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
3441     {
3442       GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3443                          " (expected %u, received %u)",
3444                          GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
3445       return -1;
3446     }
3447
3448   struct gcn_image_desc *image_desc = (struct gcn_image_desc *) target_data;
3449   struct agent_info *agent;
3450   struct addr_pair *pair;
3451   struct module_info *module;
3452   struct kernel_info *kernel;
3453   int kernel_count = image_desc->kernel_count;
3454   unsigned ind_func_count = GOMP_VERSION_SUPPORTS_INDIRECT_FUNCS (version)
3455                               ? image_desc->ind_func_count : 0;
3456   unsigned var_count = image_desc->global_variable_count;
3457   /* Currently, "others" is a struct of ICVS.  */
3458   int other_count = 1;
3459
3460   agent = get_agent_info (ord);
3461   if (!agent)
3462     return -1;
3463
3464   if (pthread_rwlock_wrlock (&agent->module_rwlock))
3465     {
3466       GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3467       return -1;
3468     }
3469   if (agent->prog_finalized
3470       && !destroy_hsa_program (agent))
3471     return -1;
3472
3473   GCN_DEBUG ("Encountered %d kernels in an image\n", kernel_count);
3474   GCN_DEBUG ("Encountered %d indirect functions in an image\n", ind_func_count);
3475   GCN_DEBUG ("Encountered %u global variables in an image\n", var_count);
3476   GCN_DEBUG ("Expect %d other variables in an image\n", other_count);
3477   pair = GOMP_PLUGIN_malloc ((kernel_count + var_count + other_count - 2)
3478                              * sizeof (struct addr_pair));
3479   *target_table = pair;
3480   module = (struct module_info *)
3481     GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info)
3482                                 + kernel_count * sizeof (struct kernel_info));
3483   module->image_desc = image_desc;
3484   module->kernel_count = kernel_count;
3485   module->heap = NULL;
3486   module->constructors_run_p = false;
3487
3488   kernel = &module->kernels[0];
3489
3490   /* Allocate memory for kernel dependencies.  */
3491   for (unsigned i = 0; i < kernel_count; i++)
3492     {
3493       struct hsa_kernel_description *d = &image_desc->kernel_infos[i];
3494       if (!init_basic_kernel_info (kernel, d, agent, module))
3495         return -1;
3496       if (strcmp (d->name, "_init_array") == 0)
3497         module->init_array_func = kernel;
3498       else if (strcmp (d->name, "_fini_array") == 0)
3499         module->fini_array_func = kernel;
3500       else
3501         {
3502           pair->start = (uintptr_t) kernel;
3503           pair->end = (uintptr_t) (kernel + 1);
3504           pair++;
3505         }
3506       kernel++;
3507     }
3508
3509   agent->module = module;
3510   if (pthread_rwlock_unlock (&agent->module_rwlock))
3511     {
3512       GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3513       return -1;
3514     }
3515
3516   if (!create_and_finalize_hsa_program (agent))
3517     return -1;
3518
3519   if (var_count > 0)
3520     {
3521       hsa_status_t status;
3522       hsa_executable_symbol_t var_symbol;
3523       status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3524                                                      ".offload_var_table",
3525                                                      agent->id,
3526                                                      0, &var_symbol);
3527
3528       if (status != HSA_STATUS_SUCCESS)
3529         hsa_fatal ("Could not find symbol for variable in the code object",
3530                    status);
3531
3532       uint64_t var_table_addr;
3533       status = hsa_fns.hsa_executable_symbol_get_info_fn
3534         (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3535          &var_table_addr);
3536       if (status != HSA_STATUS_SUCCESS)
3537         hsa_fatal ("Could not extract a variable from its symbol", status);
3538
3539       struct {
3540         uint64_t addr;
3541         uint64_t size;
3542       } var_table[var_count];
3543       GOMP_OFFLOAD_dev2host (agent->device_id, var_table,
3544                              (void*)var_table_addr, sizeof (var_table));
3545
3546       for (unsigned i = 0; i < var_count; i++)
3547         {
3548           pair->start = var_table[i].addr;
3549           pair->end = var_table[i].addr + var_table[i].size;
3550           GCN_DEBUG ("Found variable at %p with size %lu\n",
3551                      (void *)var_table[i].addr, var_table[i].size);
3552           pair++;
3553         }
3554     }
3555
3556   if (ind_func_count > 0)
3557     {
3558       hsa_status_t status;
3559
3560       /* Read indirect function table from image.  */
3561       hsa_executable_symbol_t ind_funcs_symbol;
3562       status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3563                                                      ".offload_ind_func_table",
3564                                                      agent->id,
3565                                                      0, &ind_funcs_symbol);
3566
3567       if (status != HSA_STATUS_SUCCESS)
3568         hsa_fatal ("Could not find .offload_ind_func_table symbol in the "
3569                    "code object", status);
3570
3571       uint64_t ind_funcs_table_addr;
3572       status = hsa_fns.hsa_executable_symbol_get_info_fn
3573         (ind_funcs_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3574          &ind_funcs_table_addr);
3575       if (status != HSA_STATUS_SUCCESS)
3576         hsa_fatal ("Could not extract a variable from its symbol", status);
3577
3578       uint64_t ind_funcs_table[ind_func_count];
3579       GOMP_OFFLOAD_dev2host (agent->device_id, ind_funcs_table,
3580                              (void*) ind_funcs_table_addr,
3581                              sizeof (ind_funcs_table));
3582
3583       /* Build host->target address map for indirect functions.  */
3584       uint64_t ind_fn_map[ind_func_count * 2 + 1];
3585       for (unsigned i = 0; i < ind_func_count; i++)
3586         {
3587           ind_fn_map[i * 2] = host_ind_fn_table[i];
3588           ind_fn_map[i * 2 + 1] = ind_funcs_table[i];
3589           GCN_DEBUG ("Indirect function %d: %lx->%lx\n",
3590                      i, host_ind_fn_table[i], ind_funcs_table[i]);
3591         }
3592       ind_fn_map[ind_func_count * 2] = 0;
3593
3594       /* Write the map onto the target.  */
3595       void *map_target_addr
3596         = GOMP_OFFLOAD_alloc (agent->device_id, sizeof (ind_fn_map));
3597       GCN_DEBUG ("Allocated indirect map at %p\n", map_target_addr);
3598
3599       GOMP_OFFLOAD_host2dev (agent->device_id, map_target_addr,
3600                              (void*) ind_fn_map,
3601                              sizeof (ind_fn_map));
3602
3603       /* Write address of the map onto the target.  */
3604       hsa_executable_symbol_t symbol;
3605
3606       status
3607         = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3608                                                 XSTRING (GOMP_INDIRECT_ADDR_MAP),
3609                                                 agent->id, 0, &symbol);
3610       if (status != HSA_STATUS_SUCCESS)
3611         hsa_fatal ("Could not find GOMP_INDIRECT_ADDR_MAP in code object",
3612                    status);
3613
3614       uint64_t varptr;
3615       uint32_t varsize;
3616
3617       status = hsa_fns.hsa_executable_symbol_get_info_fn
3618         (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3619          &varptr);
3620       if (status != HSA_STATUS_SUCCESS)
3621         hsa_fatal ("Could not extract a variable from its symbol", status);
3622       status = hsa_fns.hsa_executable_symbol_get_info_fn
3623         (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
3624         &varsize);
3625       if (status != HSA_STATUS_SUCCESS)
3626         hsa_fatal ("Could not extract a variable size from its symbol",
3627                    status);
3628
3629       GCN_DEBUG ("Found GOMP_INDIRECT_ADDR_MAP at %lx with size %d\n",
3630                  varptr, varsize);
3631
3632       GOMP_OFFLOAD_host2dev (agent->device_id, (void *) varptr,
3633                              &map_target_addr,
3634                              sizeof (map_target_addr));
3635     }
3636
3637   GCN_DEBUG ("Looking for variable %s\n", XSTRING (GOMP_ADDITIONAL_ICVS));
3638
3639   hsa_status_t status;
3640   hsa_executable_symbol_t var_symbol;
3641   status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3642                                                  XSTRING (GOMP_ADDITIONAL_ICVS),
3643                                                  agent->id, 0, &var_symbol);
3644   if (status == HSA_STATUS_SUCCESS)
3645     {
3646       uint64_t varptr;
3647       uint32_t varsize;
3648
3649       status = hsa_fns.hsa_executable_symbol_get_info_fn
3650         (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3651          &varptr);
3652       if (status != HSA_STATUS_SUCCESS)
3653         hsa_fatal ("Could not extract a variable from its symbol", status);
3654       status = hsa_fns.hsa_executable_symbol_get_info_fn
3655         (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
3656          &varsize);
3657       if (status != HSA_STATUS_SUCCESS)
3658         hsa_fatal ("Could not extract a variable size from its symbol",
3659                    status);
3660
3661       pair->start = varptr;
3662       pair->end = varptr + varsize;
3663     }
3664   else
3665     {
3666       /* The variable was not in this image.  */
3667       GCN_DEBUG ("Variable not found in image: %s\n",
3668                  XSTRING (GOMP_ADDITIONAL_ICVS));
3669       pair->start = pair->end = 0;
3670     }
3671
3672   /* Ensure that constructors are run first.  */
3673   struct GOMP_kernel_launch_attributes kla =
3674     { 3,
3675       /* Grid size.  */
3676       { 1, 64, 1 },
3677       /* Work-group size.  */
3678       { 1, 64, 1 }
3679     };
3680
3681   if (module->init_array_func)
3682     {
3683       init_kernel (module->init_array_func);
3684       run_kernel (module->init_array_func, NULL, &kla, NULL, false);
3685     }
3686   module->constructors_run_p = true;
3687
3688   /* Don't report kernels that libgomp need not know about.  */
3689   if (module->init_array_func)
3690     kernel_count--;
3691   if (module->fini_array_func)
3692     kernel_count--;
3693
3694   if (rev_fn_table != NULL && kernel_count == 0)
3695     *rev_fn_table = NULL;
3696   else if (rev_fn_table != NULL)
3697     {
3698       hsa_status_t status;
3699       hsa_executable_symbol_t var_symbol;
3700       status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3701                                                      ".offload_func_table",
3702                                                      agent->id, 0, &var_symbol);
3703       if (status != HSA_STATUS_SUCCESS)
3704         hsa_fatal ("Could not find symbol for variable in the code object",
3705                    status);
3706       uint64_t fn_table_addr;
3707       status = hsa_fns.hsa_executable_symbol_get_info_fn
3708         (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3709          &fn_table_addr);
3710       if (status != HSA_STATUS_SUCCESS)
3711         hsa_fatal ("Could not extract a variable from its symbol", status);
3712       *rev_fn_table = GOMP_PLUGIN_malloc (kernel_count * sizeof (uint64_t));
3713       GOMP_OFFLOAD_dev2host (agent->device_id, *rev_fn_table,
3714                              (void*) fn_table_addr,
3715                              kernel_count * sizeof (uint64_t));
3716     }
3717
3718   return kernel_count + var_count + other_count;
3719 }
3720
3721 /* Unload GCN object-code module described by struct gcn_image_desc in
3722    TARGET_DATA from agent number N.  Return TRUE on success.  */
3723
3724 bool
3725 GOMP_OFFLOAD_unload_image (int n, unsigned version, const void *target_data)
3726 {
3727   if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
3728     {
3729       GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3730                          " (expected %u, received %u)",
3731                          GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
3732       return false;
3733     }
3734
3735   struct agent_info *agent;
3736   agent = get_agent_info (n);
3737   if (!agent)
3738     return false;
3739
3740   if (pthread_rwlock_wrlock (&agent->module_rwlock))
3741     {
3742       GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3743       return false;
3744     }
3745
3746   if (!agent->module || agent->module->image_desc != target_data)
3747     {
3748       GOMP_PLUGIN_error ("Attempt to unload an image that has never been "
3749                          "loaded before");
3750       return false;
3751     }
3752
3753   if (!destroy_module (agent->module, true))
3754     return false;
3755   free (agent->module);
3756   agent->module = NULL;
3757   if (!destroy_hsa_program (agent))
3758     return false;
3759   if (pthread_rwlock_unlock (&agent->module_rwlock))
3760     {
3761       GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3762       return false;
3763     }
3764   return true;
3765 }
3766
3767 /* Deinitialize all information and status associated with agent number N.  We
3768    do not attempt any synchronization, assuming the user and libgomp will not
3769    attempt deinitialization of a device that is in any way being used at the
3770    same time.  Return TRUE on success.  */
3771
3772 bool
3773 GOMP_OFFLOAD_fini_device (int n)
3774 {
3775   struct agent_info *agent = get_agent_info (n);
3776   if (!agent)
3777     return false;
3778
3779   if (!agent->initialized)
3780     return true;
3781
3782   if (agent->omp_async_queue)
3783     {
3784       GOMP_OFFLOAD_openacc_async_destruct (agent->omp_async_queue);
3785       agent->omp_async_queue = NULL;
3786     }
3787
3788   if (agent->module)
3789     {
3790       if (!destroy_module (agent->module, false))
3791         return false;
3792       free (agent->module);
3793       agent->module = NULL;
3794     }
3795
3796   if (!destroy_ephemeral_memories (agent))
3797     return false;
3798
3799   if (!destroy_hsa_program (agent))
3800     return false;
3801
3802   hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (agent->sync_queue);
3803   if (status != HSA_STATUS_SUCCESS)
3804     return hsa_error ("Error destroying command queue", status);
3805
3806   if (pthread_mutex_destroy (&agent->prog_mutex))
3807     {
3808       GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex");
3809       return false;
3810     }
3811   if (pthread_rwlock_destroy (&agent->module_rwlock))
3812     {
3813       GOMP_PLUGIN_error ("Failed to destroy a GCN agent rwlock");
3814       return false;
3815     }
3816
3817   if (pthread_mutex_destroy (&agent->async_queues_mutex))
3818     {
3819       GOMP_PLUGIN_error ("Failed to destroy a GCN agent queue mutex");
3820       return false;
3821     }
3822   if (pthread_mutex_destroy (&agent->ephemeral_memories_write_lock))
3823     {
3824       GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex");
3825       return false;
3826     }
3827   agent->initialized = false;
3828   return true;
3829 }
3830
3831 /* Return true if the HSA runtime can run function FN_PTR.  */
3832
3833 bool
3834 GOMP_OFFLOAD_can_run (void *fn_ptr)
3835 {
3836   struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3837
3838   init_kernel (kernel);
3839   if (kernel->initialization_failed)
3840     goto failure;
3841
3842   return true;
3843
3844 failure:
3845   if (suppress_host_fallback)
3846     GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
3847   GCN_WARNING ("GCN target cannot be launched, doing a host fallback\n");
3848   return false;
3849 }
3850
3851 /* Allocate memory on device N.  */
3852
3853 void *
3854 GOMP_OFFLOAD_alloc (int n, size_t size)
3855 {
3856   struct agent_info *agent = get_agent_info (n);
3857   return alloc_by_agent (agent, size);
3858 }
3859
3860 /* Free memory from device N.  */
3861
3862 bool
3863 GOMP_OFFLOAD_free (int device, void *ptr)
3864 {
3865   GCN_DEBUG ("Freeing memory on device %d\n", device);
3866
3867   hsa_status_t status = hsa_fns.hsa_memory_free_fn (ptr);
3868   if (status != HSA_STATUS_SUCCESS)
3869     {
3870       hsa_error ("Could not free device memory", status);
3871       return false;
3872     }
3873
3874   struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
3875   bool profiling_dispatch_p
3876     = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
3877   if (profiling_dispatch_p)
3878     {
3879       acc_prof_info *prof_info = thr->prof_info;
3880       acc_event_info data_event_info;
3881       acc_api_info *api_info = thr->api_info;
3882
3883       prof_info->event_type = acc_ev_free;
3884
3885       data_event_info.data_event.event_type = prof_info->event_type;
3886       data_event_info.data_event.valid_bytes
3887         = _ACC_DATA_EVENT_INFO_VALID_BYTES;
3888       data_event_info.data_event.parent_construct
3889         = acc_construct_parallel;
3890       data_event_info.data_event.implicit = 1;
3891       data_event_info.data_event.tool_info = NULL;
3892       data_event_info.data_event.var_name = NULL;
3893       data_event_info.data_event.bytes = 0;
3894       data_event_info.data_event.host_ptr = NULL;
3895       data_event_info.data_event.device_ptr = (void *) ptr;
3896
3897       api_info->device_api = acc_device_api_other;
3898
3899       GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
3900                                             api_info);
3901     }
3902
3903   return true;
3904 }
3905
3906 /* Copy data from DEVICE to host.  */
3907
3908 bool
3909 GOMP_OFFLOAD_dev2host (int device, void *dst, const void *src, size_t n)
3910 {
3911   GCN_DEBUG ("Copying %zu bytes from device %d (%p) to host (%p)\n", n, device,
3912              src, dst);
3913   hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
3914   if (status != HSA_STATUS_SUCCESS)
3915     GOMP_PLUGIN_error ("memory copy failed");
3916   return true;
3917 }
3918
3919 /* Copy data from host to DEVICE.  */
3920
3921 bool
3922 GOMP_OFFLOAD_host2dev (int device, void *dst, const void *src, size_t n)
3923 {
3924   GCN_DEBUG ("Copying %zu bytes from host (%p) to device %d (%p)\n", n, src,
3925              device, dst);
3926   hsa_memory_copy_wrapper (dst, src, n);
3927   return true;
3928 }
3929
3930 /* Copy data within DEVICE.  Do the copy asynchronously, if appropriate.  */
3931
3932 bool
3933 GOMP_OFFLOAD_dev2dev (int device, void *dst, const void *src, size_t n)
3934 {
3935   struct gcn_thread *thread_data = gcn_thread ();
3936
3937   if (thread_data && !async_synchronous_p (thread_data->async))
3938     {
3939       struct agent_info *agent = get_agent_info (device);
3940       maybe_init_omp_async (agent);
3941       queue_push_copy (agent->omp_async_queue, dst, src, n);
3942       return true;
3943     }
3944
3945   GCN_DEBUG ("Copying %zu bytes from device %d (%p) to device %d (%p)\n", n,
3946              device, src, device, dst);
3947   hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
3948   if (status != HSA_STATUS_SUCCESS)
3949     GOMP_PLUGIN_error ("memory copy failed");
3950   return true;
3951 }
3952
3953 /* Here <quantity>_size refers to <quantity> multiplied by size -- i.e.
3954    measured in bytes.  So we have:
3955
3956    dim1_size: number of bytes to copy on innermost dimension ("row")
3957    dim0_len: number of rows to copy
3958    dst: base pointer for destination of copy
3959    dst_offset1_size: innermost row offset (for dest), in bytes
3960    dst_offset0_len: offset, number of rows (for dest)
3961    dst_dim1_size: whole-array dest row length, in bytes (pitch)
3962    src: base pointer for source of copy
3963    src_offset1_size: innermost row offset (for source), in bytes
3964    src_offset0_len: offset, number of rows (for source)
3965    src_dim1_size: whole-array source row length, in bytes (pitch)
3966 */
3967
3968 int
3969 GOMP_OFFLOAD_memcpy2d (int dst_ord, int src_ord, size_t dim1_size,
3970                        size_t dim0_len, void *dst, size_t dst_offset1_size,
3971                        size_t dst_offset0_len, size_t dst_dim1_size,
3972                        const void *src, size_t src_offset1_size,
3973                        size_t src_offset0_len, size_t src_dim1_size)
3974 {
3975   if (!hsa_fns.hsa_amd_memory_lock_fn
3976       || !hsa_fns.hsa_amd_memory_unlock_fn
3977       || !hsa_fns.hsa_amd_memory_async_copy_rect_fn)
3978     return -1;
3979
3980   /* GCN hardware requires 4-byte alignment for base addresses & pitches.  Bail
3981      out quietly if we have anything oddly-aligned rather than letting the
3982      driver raise an error.  */
3983   if ((((uintptr_t) dst) & 3) != 0 || (((uintptr_t) src) & 3) != 0)
3984     return -1;
3985
3986   if ((dst_dim1_size & 3) != 0 || (src_dim1_size & 3) != 0)
3987     return -1;
3988
3989   /* Only handle host to device or device to host transfers here.  */
3990   if ((dst_ord == -1 && src_ord == -1)
3991       || (dst_ord != -1 && src_ord != -1))
3992     return -1;
3993
3994   hsa_amd_copy_direction_t dir
3995     = (src_ord == -1) ? hsaHostToDevice : hsaDeviceToHost;
3996   hsa_agent_t copy_agent;
3997
3998   /* We need to pin (lock) host memory before we start the transfer.  Try to
3999      lock the minimum size necessary, i.e. using partial first/last rows of the
4000      whole array.  Something like this:
4001
4002          rows -->
4003          ..............
4004      c | ..#######+++++ <- first row apart from {src,dst}_offset1_size
4005      o | ++#######+++++ <- whole row
4006      l | ++#######+++++ <-     "
4007      s v ++#######..... <- last row apart from trailing remainder
4008          ..............
4009
4010      We could split very large transfers into several rectangular copies, but
4011      that is unimplemented for now.  */
4012
4013   size_t bounded_size_host, first_elem_offset_host;
4014   void *host_ptr;
4015   if (dir == hsaHostToDevice)
4016     {
4017       bounded_size_host = src_dim1_size * (dim0_len - 1) + dim1_size;
4018       first_elem_offset_host = src_offset0_len * src_dim1_size
4019                                + src_offset1_size;
4020       host_ptr = (void *) src;
4021       struct agent_info *agent = get_agent_info (dst_ord);
4022       copy_agent = agent->id;
4023     }
4024   else
4025     {
4026       bounded_size_host = dst_dim1_size * (dim0_len - 1) + dim1_size;
4027       first_elem_offset_host = dst_offset0_len * dst_dim1_size
4028                                + dst_offset1_size;
4029       host_ptr = dst;
4030       struct agent_info *agent = get_agent_info (src_ord);
4031       copy_agent = agent->id;
4032     }
4033
4034   void *agent_ptr;
4035
4036   hsa_status_t status
4037     = hsa_fns.hsa_amd_memory_lock_fn (host_ptr + first_elem_offset_host,
4038                                       bounded_size_host, NULL, 0, &agent_ptr);
4039   /* We can't lock the host memory: don't give up though, we might still be
4040      able to use the slow path in our caller.  So, don't make this an
4041      error.  */
4042   if (status != HSA_STATUS_SUCCESS)
4043     return -1;
4044
4045   hsa_pitched_ptr_t dstpp, srcpp;
4046   hsa_dim3_t dst_offsets, src_offsets, ranges;
4047
4048   int retval = 1;
4049
4050   hsa_signal_t completion_signal;
4051   status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &completion_signal);
4052   if (status != HSA_STATUS_SUCCESS)
4053     {
4054       retval = -1;
4055       goto unlock;
4056     }
4057
4058   if (dir == hsaHostToDevice)
4059     {
4060       srcpp.base = agent_ptr - first_elem_offset_host;
4061       dstpp.base = dst;
4062     }
4063   else
4064     {
4065       srcpp.base = (void *) src;
4066       dstpp.base = agent_ptr - first_elem_offset_host;
4067     }
4068
4069   srcpp.pitch = src_dim1_size;
4070   srcpp.slice = 0;
4071
4072   src_offsets.x = src_offset1_size;
4073   src_offsets.y = src_offset0_len;
4074   src_offsets.z = 0;
4075
4076   dstpp.pitch = dst_dim1_size;
4077   dstpp.slice = 0;
4078
4079   dst_offsets.x = dst_offset1_size;
4080   dst_offsets.y = dst_offset0_len;
4081   dst_offsets.z = 0;
4082
4083   ranges.x = dim1_size;
4084   ranges.y = dim0_len;
4085   ranges.z = 1;
4086
4087   status
4088     = hsa_fns.hsa_amd_memory_async_copy_rect_fn (&dstpp, &dst_offsets, &srcpp,
4089                                                  &src_offsets, &ranges,
4090                                                  copy_agent, dir, 0, NULL,
4091                                                  completion_signal);
4092   /* If the rectangular copy fails, we might still be able to use the slow
4093      path.  We need to unlock the host memory though, so don't return
4094      immediately.  */
4095   if (status != HSA_STATUS_SUCCESS)
4096     retval = -1;
4097   else
4098     hsa_fns.hsa_signal_wait_acquire_fn (completion_signal,
4099                                         HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX,
4100                                         HSA_WAIT_STATE_ACTIVE);
4101
4102   hsa_fns.hsa_signal_destroy_fn (completion_signal);
4103
4104 unlock:
4105   status = hsa_fns.hsa_amd_memory_unlock_fn (host_ptr + first_elem_offset_host);
4106   if (status != HSA_STATUS_SUCCESS)
4107     hsa_fatal ("Could not unlock host memory", status);
4108
4109   return retval;
4110 }
4111
4112 /* As above, <quantity>_size refers to <quantity> multiplied by size -- i.e.
4113    measured in bytes.  So we have:
4114
4115    dim2_size: number of bytes to copy on innermost dimension ("row")
4116    dim1_len: number of rows per slice to copy
4117    dim0_len: number of slices to copy
4118    dst: base pointer for destination of copy
4119    dst_offset2_size: innermost row offset (for dest), in bytes
4120    dst_offset1_len: offset, number of rows (for dest)
4121    dst_offset0_len: offset, number of slices (for dest)
4122    dst_dim2_size: whole-array dest row length, in bytes (pitch)
4123    dst_dim1_len: whole-array number of rows in slice (for dest)
4124    src: base pointer for source of copy
4125    src_offset2_size: innermost row offset (for source), in bytes
4126    src_offset1_len: offset, number of rows (for source)
4127    src_offset0_len: offset, number of slices (for source)
4128    src_dim2_size: whole-array source row length, in bytes (pitch)
4129    src_dim1_len: whole-array number of rows in slice (for source)
4130 */
4131
4132 int
4133 GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size,
4134                        size_t dim1_len, size_t dim0_len, void *dst,
4135                        size_t dst_offset2_size, size_t dst_offset1_len,
4136                        size_t dst_offset0_len, size_t dst_dim2_size,
4137                        size_t dst_dim1_len, const void *src,
4138                        size_t src_offset2_size, size_t src_offset1_len,
4139                        size_t src_offset0_len, size_t src_dim2_size,
4140                        size_t src_dim1_len)
4141 {
4142   if (!hsa_fns.hsa_amd_memory_lock_fn
4143       || !hsa_fns.hsa_amd_memory_unlock_fn
4144       || !hsa_fns.hsa_amd_memory_async_copy_rect_fn)
4145     return -1;
4146
4147   /* GCN hardware requires 4-byte alignment for base addresses & pitches.  Bail
4148      out quietly if we have anything oddly-aligned rather than letting the
4149      driver raise an error.  */
4150   if ((((uintptr_t) dst) & 3) != 0 || (((uintptr_t) src) & 3) != 0)
4151     return -1;
4152
4153   if ((dst_dim2_size & 3) != 0 || (src_dim2_size & 3) != 0)
4154     return -1;
4155
4156   /* Only handle host to device or device to host transfers here.  */
4157   if ((dst_ord == -1 && src_ord == -1)
4158       || (dst_ord != -1 && src_ord != -1))
4159     return -1;
4160
4161   hsa_amd_copy_direction_t dir
4162     = (src_ord == -1) ? hsaHostToDevice : hsaDeviceToHost;
4163   hsa_agent_t copy_agent;
4164
4165   /* We need to pin (lock) host memory before we start the transfer.  Try to
4166      lock the minimum size necessary, i.e. using partial first/last slices of
4167      the whole 3D array.  Something like this:
4168
4169          slice 0:          slice 1:          slice 2:
4170              __________        __________        __________
4171        ^    /+++++++++/    :  /+++++++++/    :  /         /
4172     column /+++##++++/|    | /+++##++++/|    | /+++##    /  # = subarray
4173      /    /   ##++++/ |    |/+++##++++/ |    |/+++##++++/   + = area to pin
4174          /_________/  :    /_________/  :    /_________/
4175         row --->
4176
4177      We could split very large transfers into several rectangular copies, but
4178      that is unimplemented for now.  */
4179
4180   size_t bounded_size_host, first_elem_offset_host;
4181   void *host_ptr;
4182   if (dir == hsaHostToDevice)
4183     {
4184       size_t slice_bytes = src_dim2_size * src_dim1_len;
4185       bounded_size_host = slice_bytes * (dim0_len - 1)
4186                           + src_dim2_size * (dim1_len - 1)
4187                           + dim2_size;
4188       first_elem_offset_host = src_offset0_len * slice_bytes
4189                                + src_offset1_len * src_dim2_size
4190                                + src_offset2_size;
4191       host_ptr = (void *) src;
4192       struct agent_info *agent = get_agent_info (dst_ord);
4193       copy_agent = agent->id;
4194     }
4195   else
4196     {
4197       size_t slice_bytes = dst_dim2_size * dst_dim1_len;
4198       bounded_size_host = slice_bytes * (dim0_len - 1)
4199                           + dst_dim2_size * (dim1_len - 1)
4200                           + dim2_size;
4201       first_elem_offset_host = dst_offset0_len * slice_bytes
4202                                + dst_offset1_len * dst_dim2_size
4203                                + dst_offset2_size;
4204       host_ptr = dst;
4205       struct agent_info *agent = get_agent_info (src_ord);
4206       copy_agent = agent->id;
4207     }
4208
4209   void *agent_ptr;
4210
4211   hsa_status_t status
4212     = hsa_fns.hsa_amd_memory_lock_fn (host_ptr + first_elem_offset_host,
4213                                       bounded_size_host, NULL, 0, &agent_ptr);
4214   /* We can't lock the host memory: don't give up though, we might still be
4215      able to use the slow path in our caller (maybe even with iterated memcpy2d
4216      calls).  So, don't make this an error.  */
4217   if (status != HSA_STATUS_SUCCESS)
4218     return -1;
4219
4220   hsa_pitched_ptr_t dstpp, srcpp;
4221   hsa_dim3_t dst_offsets, src_offsets, ranges;
4222
4223   int retval = 1;
4224
4225   hsa_signal_t completion_signal;
4226   status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &completion_signal);
4227   if (status != HSA_STATUS_SUCCESS)
4228     {
4229       retval = -1;
4230       goto unlock;
4231     }
4232
4233   if (dir == hsaHostToDevice)
4234     {
4235       srcpp.base = agent_ptr - first_elem_offset_host;
4236       dstpp.base = dst;
4237     }
4238   else
4239     {
4240       srcpp.base = (void *) src;
4241       dstpp.base = agent_ptr - first_elem_offset_host;
4242     }
4243
4244   /* Pitch is measured in bytes.  */
4245   srcpp.pitch = src_dim2_size;
4246   /* Slice is also measured in bytes (i.e. total per-slice).  */
4247   srcpp.slice = src_dim2_size * src_dim1_len;
4248
4249   src_offsets.x = src_offset2_size;
4250   src_offsets.y = src_offset1_len;
4251   src_offsets.z = src_offset0_len;
4252
4253   /* As above.  */
4254   dstpp.pitch = dst_dim2_size;
4255   dstpp.slice = dst_dim2_size * dst_dim1_len;
4256
4257   dst_offsets.x = dst_offset2_size;
4258   dst_offsets.y = dst_offset1_len;
4259   dst_offsets.z = dst_offset0_len;
4260
4261   ranges.x = dim2_size;
4262   ranges.y = dim1_len;
4263   ranges.z = dim0_len;
4264
4265   status
4266     = hsa_fns.hsa_amd_memory_async_copy_rect_fn (&dstpp, &dst_offsets, &srcpp,
4267                                                  &src_offsets, &ranges,
4268                                                  copy_agent, dir, 0, NULL,
4269                                                  completion_signal);
4270   /* If the rectangular copy fails, we might still be able to use the slow
4271      path.  We need to unlock the host memory though, so don't return
4272      immediately.  */
4273   if (status != HSA_STATUS_SUCCESS)
4274     retval = -1;
4275   else
4276     {
4277       hsa_signal_value_t sv
4278         = hsa_fns.hsa_signal_wait_acquire_fn (completion_signal,
4279                                               HSA_SIGNAL_CONDITION_LT, 1,
4280                                               UINT64_MAX,
4281                                               HSA_WAIT_STATE_ACTIVE);
4282       if (sv < 0)
4283         {
4284           GCN_WARNING ("async copy rect failure");
4285           retval = -1;
4286         }
4287     }
4288
4289   hsa_fns.hsa_signal_destroy_fn (completion_signal);
4290
4291 unlock:
4292   status = hsa_fns.hsa_amd_memory_unlock_fn (host_ptr + first_elem_offset_host);
4293   if (status != HSA_STATUS_SUCCESS)
4294     hsa_fatal ("Could not unlock host memory", status);
4295
4296   return retval;
4297 }
4298
4299 /* }}}  */
4300 /* {{{ OpenMP Plugin API  */
4301
4302 /* Run a synchronous OpenMP kernel on DEVICE and pass it an array of pointers
4303    in VARS as a parameter.  The kernel is identified by FN_PTR which must point
4304    to a kernel_info structure, and must have previously been loaded to the
4305    specified device.  */
4306
4307 void
4308 GOMP_OFFLOAD_run (int device, void *fn_ptr, void *vars, void **args)
4309 {
4310   struct agent_info *agent = get_agent_info (device);
4311   struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
4312   struct GOMP_kernel_launch_attributes def;
4313   struct GOMP_kernel_launch_attributes *kla;
4314   assert (agent == kernel->agent);
4315
4316   /* If we get here then the kernel must be OpenMP.  */
4317   kernel->kind = KIND_OPENMP;
4318
4319   if (!parse_target_attributes (args, &def, &kla, agent))
4320     {
4321       GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
4322       return;
4323     }
4324   run_kernel (kernel, vars, kla, NULL, false);
4325 }
4326
4327 /* Run an asynchronous OpenMP kernel on DEVICE.  This is similar to
4328    GOMP_OFFLOAD_run except that the launch is queued and there is a call to
4329    GOMP_PLUGIN_target_task_completion when it has finished.  */
4330
4331 void
4332 GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
4333                         void **args, void *async_data)
4334 {
4335   GCN_DEBUG ("GOMP_OFFLOAD_async_run invoked\n");
4336   struct agent_info *agent = get_agent_info (device);
4337   struct kernel_info *kernel = (struct kernel_info *) tgt_fn;
4338   struct GOMP_kernel_launch_attributes def;
4339   struct GOMP_kernel_launch_attributes *kla;
4340   assert (agent == kernel->agent);
4341
4342   /* If we get here then the kernel must be OpenMP.  */
4343   kernel->kind = KIND_OPENMP;
4344
4345   if (!parse_target_attributes (args, &def, &kla, agent))
4346     {
4347       GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
4348       return;
4349     }
4350
4351   maybe_init_omp_async (agent);
4352   queue_push_launch (agent->omp_async_queue, kernel, tgt_vars, kla);
4353   queue_push_callback (agent->omp_async_queue,
4354                        GOMP_PLUGIN_target_task_completion, async_data);
4355 }
4356
4357 /* }}} */
4358 /* {{{ OpenACC Plugin API  */
4359
4360 /* Run a synchronous OpenACC kernel.  The device number is inferred from the
4361    already-loaded KERNEL.  */
4362
4363 void
4364 GOMP_OFFLOAD_openacc_exec (void (*fn_ptr) (void *),
4365                            size_t mapnum __attribute__((unused)),
4366                            void **hostaddrs __attribute__((unused)),
4367                            void **devaddrs, unsigned *dims,
4368                            void *targ_mem_desc)
4369 {
4370   struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
4371
4372   gcn_exec (kernel, devaddrs, dims, targ_mem_desc, false, NULL);
4373 }
4374
4375 /* Run an asynchronous OpenACC kernel on the specified queue.  */
4376
4377 void
4378 GOMP_OFFLOAD_openacc_async_exec (void (*fn_ptr) (void *),
4379                                  size_t mapnum __attribute__((unused)),
4380                                  void **hostaddrs __attribute__((unused)),
4381                                  void **devaddrs,
4382                                  unsigned *dims, void *targ_mem_desc,
4383                                  struct goacc_asyncqueue *aq)
4384 {
4385   struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
4386
4387   gcn_exec (kernel, devaddrs, dims, targ_mem_desc, true, aq);
4388 }
4389
4390 /* Create a new asynchronous thread and queue for running future kernels.  */
4391
4392 struct goacc_asyncqueue *
4393 GOMP_OFFLOAD_openacc_async_construct (int device)
4394 {
4395   struct agent_info *agent = get_agent_info (device);
4396
4397   pthread_mutex_lock (&agent->async_queues_mutex);
4398
4399   struct goacc_asyncqueue *aq = GOMP_PLUGIN_malloc (sizeof (*aq));
4400   aq->agent = get_agent_info (device);
4401   aq->prev = NULL;
4402   aq->next = agent->async_queues;
4403   if (aq->next)
4404     {
4405       aq->next->prev = aq;
4406       aq->id = aq->next->id + 1;
4407     }
4408   else
4409     aq->id = 1;
4410   agent->async_queues = aq;
4411
4412   aq->queue_first = 0;
4413   aq->queue_n = 0;
4414   aq->drain_queue_stop = 0;
4415
4416   if (pthread_mutex_init (&aq->mutex, NULL))
4417     {
4418       GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
4419       return false;
4420     }
4421   if (pthread_cond_init (&aq->queue_cond_in, NULL))
4422     {
4423       GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
4424       return false;
4425     }
4426   if (pthread_cond_init (&aq->queue_cond_out, NULL))
4427     {
4428       GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
4429       return false;
4430     }
4431
4432   hsa_status_t status = hsa_fns.hsa_queue_create_fn (agent->id,
4433                                                      ASYNC_QUEUE_SIZE,
4434                                                      HSA_QUEUE_TYPE_MULTI,
4435                                                      hsa_queue_callback, NULL,
4436                                                      UINT32_MAX, UINT32_MAX,
4437                                                      &aq->hsa_queue);
4438   if (status != HSA_STATUS_SUCCESS)
4439     hsa_fatal ("Error creating command queue", status);
4440
4441   int err = pthread_create (&aq->thread_drain_queue, NULL, &drain_queue, aq);
4442   if (err != 0)
4443     GOMP_PLUGIN_fatal ("GCN asynchronous thread creation failed: %s",
4444                        strerror (err));
4445   GCN_DEBUG ("Async thread %d:%d: created\n", aq->agent->device_id,
4446              aq->id);
4447
4448   pthread_mutex_unlock (&agent->async_queues_mutex);
4449
4450   return aq;
4451 }
4452
4453 /* Destroy an existing asynchronous thread and queue.  Waits for any
4454    currently-running task to complete, but cancels any queued tasks.  */
4455
4456 bool
4457 GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
4458 {
4459   struct agent_info *agent = aq->agent;
4460
4461   finalize_async_thread (aq);
4462
4463   pthread_mutex_lock (&agent->async_queues_mutex);
4464
4465   int err;
4466   if ((err = pthread_mutex_destroy (&aq->mutex)))
4467     {
4468       GOMP_PLUGIN_error ("Failed to destroy a GCN async queue mutex: %d", err);
4469       goto fail;
4470     }
4471   if (pthread_cond_destroy (&aq->queue_cond_in))
4472     {
4473       GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
4474       goto fail;
4475     }
4476   if (pthread_cond_destroy (&aq->queue_cond_out))
4477     {
4478       GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
4479       goto fail;
4480     }
4481   hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (aq->hsa_queue);
4482   if (status != HSA_STATUS_SUCCESS)
4483     {
4484       hsa_error ("Error destroying command queue", status);
4485       goto fail;
4486     }
4487
4488   if (aq->prev)
4489     aq->prev->next = aq->next;
4490   if (aq->next)
4491     aq->next->prev = aq->prev;
4492   if (agent->async_queues == aq)
4493     agent->async_queues = aq->next;
4494
4495   GCN_DEBUG ("Async thread %d:%d: destroyed\n", agent->device_id, aq->id);
4496
4497   free (aq);
4498   pthread_mutex_unlock (&agent->async_queues_mutex);
4499   return true;
4500
4501 fail:
4502   pthread_mutex_unlock (&agent->async_queues_mutex);
4503   return false;
4504 }
4505
4506 /* Return true if the specified async queue is currently empty.  */
4507
4508 int
4509 GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
4510 {
4511   return queue_empty (aq);
4512 }
4513
4514 /* Block until the specified queue has executed all its tasks and the
4515    queue is empty.  */
4516
4517 bool
4518 GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
4519 {
4520   wait_queue (aq);
4521   return true;
4522 }
4523
4524 /* Add a serialization point across two async queues. Any new tasks added to
4525    AQ2, after this call, will not run until all tasks on AQ1, at the time
4526    of this call, have completed.  */
4527
4528 bool
4529 GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
4530                                       struct goacc_asyncqueue *aq2)
4531 {
4532   /* For serialize, stream aq2 waits for aq1 to complete work that has been
4533      scheduled to run on it up to this point.  */
4534   if (aq1 != aq2)
4535     {
4536       struct placeholder *placeholderp = queue_push_placeholder (aq1);
4537       queue_push_asyncwait (aq2, placeholderp);
4538     }
4539   return true;
4540 }
4541
4542 /* Add an opaque callback to the given async queue.  */
4543
4544 void
4545 GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
4546                                            void (*fn) (void *), void *data)
4547 {
4548   queue_push_callback (aq, fn, data);
4549 }
4550
4551 /* Queue up an asynchronous data copy from host to DEVICE.  */
4552
4553 bool
4554 GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
4555                                      size_t n, struct goacc_asyncqueue *aq)
4556 {
4557   struct agent_info *agent = get_agent_info (device);
4558   assert (agent == aq->agent);
4559   queue_push_copy (aq, dst, src, n);
4560   return true;
4561 }
4562
4563 /* Queue up an asynchronous data copy from DEVICE to host.  */
4564
4565 bool
4566 GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src,
4567                                      size_t n, struct goacc_asyncqueue *aq)
4568 {
4569   struct agent_info *agent = get_agent_info (device);
4570   assert (agent == aq->agent);
4571   queue_push_copy (aq, dst, src, n);
4572   return true;
4573 }
4574
4575 union goacc_property_value
4576 GOMP_OFFLOAD_openacc_get_property (int device, enum goacc_property prop)
4577 {
4578   struct agent_info *agent = get_agent_info (device);
4579
4580   union goacc_property_value propval = { .val = 0 };
4581
4582   switch (prop)
4583     {
4584     case GOACC_PROPERTY_FREE_MEMORY:
4585       /* Not supported. */
4586       break;
4587     case GOACC_PROPERTY_MEMORY:
4588       {
4589         size_t size;
4590         hsa_region_t region = agent->data_region;
4591         hsa_status_t status =
4592           hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
4593         if (status == HSA_STATUS_SUCCESS)
4594           propval.val = size;
4595         break;
4596       }
4597     case GOACC_PROPERTY_NAME:
4598       propval.ptr = agent->name;
4599       break;
4600     case GOACC_PROPERTY_VENDOR:
4601       propval.ptr = agent->vendor_name;
4602       break;
4603     case GOACC_PROPERTY_DRIVER:
4604       propval.ptr = hsa_context.driver_version_s;
4605       break;
4606     }
4607
4608   return propval;
4609 }
4610
4611 /* Set up plugin-specific thread-local-data (host-side).  */
4612
4613 void *
4614 GOMP_OFFLOAD_openacc_create_thread_data (int ord __attribute__((unused)))
4615 {
4616   struct gcn_thread *thread_data
4617     = GOMP_PLUGIN_malloc (sizeof (struct gcn_thread));
4618
4619   thread_data->async = GOMP_ASYNC_SYNC;
4620
4621   return (void *) thread_data;
4622 }
4623
4624 /* Clean up plugin-specific thread-local-data.  */
4625
4626 void
4627 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
4628 {
4629   free (data);
4630 }
4631
4632 /* }}} */