1 /* Plugin for AMD GCN execution.
2
3 Copyright (C) 2013-2023 Free Software Foundation, Inc.
4
5 Contributed by Mentor Embedded
6
7 This file is part of the GNU Offloading and Multi Processing Library
8 (libgomp).
9
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
13 any later version.
14
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
18 more details.
19
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
23
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
28
29 /* {{{ Includes and defines */
30
31 #include "config.h"
32 #include "symcat.h"
33 #include <stdio.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <pthread.h>
37 #include <inttypes.h>
38 #include <stdbool.h>
39 #include <limits.h>
40 #include <hsa.h>
41 #include <hsa_ext_amd.h>
42 #include <dlfcn.h>
43 #include <signal.h>
44 #include "libgomp-plugin.h"
45 #include "config/gcn/libgomp-gcn.h" /* For struct output. */
46 #include "gomp-constants.h"
47 #include <elf.h>
48 #include "oacc-plugin.h"
49 #include "oacc-int.h"
50 #include <assert.h>
51
52 /* These probably won't be in elf.h for a while. */
53 #ifndef R_AMDGPU_NONE
54 #define R_AMDGPU_NONE 0
55 #define R_AMDGPU_ABS32_LO 1 /* (S + A) & 0xFFFFFFFF */
56 #define R_AMDGPU_ABS32_HI 2 /* (S + A) >> 32 */
57 #define R_AMDGPU_ABS64 3 /* S + A */
58 #define R_AMDGPU_REL32 4 /* S + A - P */
59 #define R_AMDGPU_REL64 5 /* S + A - P */
60 #define R_AMDGPU_ABS32 6 /* S + A */
61 #define R_AMDGPU_GOTPCREL 7 /* G + GOT + A - P */
62 #define R_AMDGPU_GOTPCREL32_LO 8 /* (G + GOT + A - P) & 0xFFFFFFFF */
63 #define R_AMDGPU_GOTPCREL32_HI 9 /* (G + GOT + A - P) >> 32 */
64 #define R_AMDGPU_REL32_LO 10 /* (S + A - P) & 0xFFFFFFFF */
65 #define R_AMDGPU_REL32_HI 11 /* (S + A - P) >> 32 */
66 #define R_AMDGPU_RELATIVE64 13 /* B + A */
67 #endif
68
69 /* GCN specific definitions for asynchronous queues. */
70
71 #define ASYNC_QUEUE_SIZE 64
72 #define DRAIN_QUEUE_SYNCHRONOUS_P false
73 #define DEBUG_QUEUES 0
74 #define DEBUG_THREAD_SLEEP 0
75 #define DEBUG_THREAD_SIGNAL 0
76
77 /* Defaults. */
78 #define DEFAULT_GCN_HEAP_SIZE (100*1024*1024) /* 100MB. */
79
80 /* Secure getenv() which returns NULL if running as SUID/SGID. */
81 #ifndef HAVE_SECURE_GETENV
82 #ifdef HAVE___SECURE_GETENV
83 #define secure_getenv __secure_getenv
84 #elif defined (HAVE_UNISTD_H) && defined(HAVE_GETUID) && defined(HAVE_GETEUID) \
85 && defined(HAVE_GETGID) && defined(HAVE_GETEGID)
86
87 #include <unistd.h>
88
89 /* Implementation of secure_getenv() for targets where it is not provided but
90 we have at least the means to test real and effective IDs. */
91
92 static char *
93 secure_getenv (const char *name)
94 {
95 if ((getuid () == geteuid ()) && (getgid () == getegid ()))
96 return getenv (name);
97 else
98 return NULL;
99 }
100
101 #else
102 #define secure_getenv getenv
103 #endif
104 #endif
105
106 /* }}} */
107 /* {{{ Types */
108
109 /* GCN-specific implementation of the GOMP_PLUGIN_acc_thread data. */
110
111 struct gcn_thread
112 {
113 /* The thread number from the async clause, or GOMP_ASYNC_SYNC. */
114 int async;
115 };
116
117 /* As the HSA runtime is dlopened, the following structure defines the
118 function pointers utilized by the HSA plug-in. */
119
120 struct hsa_runtime_fn_info
121 {
122 /* HSA runtime. */
123 hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
124 const char **status_string);
125 hsa_status_t (*hsa_system_get_info_fn) (hsa_system_info_t attribute,
126 void *value);
127 hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
128 hsa_agent_info_t attribute,
129 void *value);
130 hsa_status_t (*hsa_isa_get_info_fn)(hsa_isa_t isa,
131 hsa_isa_info_t attribute,
132 uint32_t index,
133 void *value);
134 hsa_status_t (*hsa_init_fn) (void);
135 hsa_status_t (*hsa_iterate_agents_fn)
136 (hsa_status_t (*callback)(hsa_agent_t agent, void *data), void *data);
137 hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
138 hsa_region_info_t attribute,
139 void *value);
140 hsa_status_t (*hsa_queue_create_fn)
141 (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
142 void (*callback)(hsa_status_t status, hsa_queue_t *source, void *data),
143 void *data, uint32_t private_segment_size,
144 uint32_t group_segment_size, hsa_queue_t **queue);
145 hsa_status_t (*hsa_agent_iterate_regions_fn)
146 (hsa_agent_t agent,
147 hsa_status_t (*callback)(hsa_region_t region, void *data), void *data);
148 hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
149 hsa_status_t (*hsa_executable_create_fn)
150 (hsa_profile_t profile, hsa_executable_state_t executable_state,
151 const char *options, hsa_executable_t *executable);
152 hsa_status_t (*hsa_executable_global_variable_define_fn)
153 (hsa_executable_t executable, const char *variable_name, void *address);
154 hsa_status_t (*hsa_executable_load_code_object_fn)
155 (hsa_executable_t executable, hsa_agent_t agent,
156 hsa_code_object_t code_object, const char *options);
157 hsa_status_t (*hsa_executable_freeze_fn)(hsa_executable_t executable,
158 const char *options);
159 hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
160 uint32_t num_consumers,
161 const hsa_agent_t *consumers,
162 hsa_signal_t *signal);
163 hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
164 void **ptr);
165 hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent,
166 hsa_access_permission_t access);
167 hsa_status_t (*hsa_memory_copy_fn)(void *dst, const void *src, size_t size);
168 hsa_status_t (*hsa_memory_free_fn) (void *ptr);
169 hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
170 hsa_status_t (*hsa_executable_get_symbol_fn)
171 (hsa_executable_t executable, const char *module_name,
172 const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
173 hsa_executable_symbol_t *symbol);
174 hsa_status_t (*hsa_executable_symbol_get_info_fn)
175 (hsa_executable_symbol_t executable_symbol,
176 hsa_executable_symbol_info_t attribute, void *value);
177 hsa_status_t (*hsa_executable_iterate_symbols_fn)
178 (hsa_executable_t executable,
179 hsa_status_t (*callback)(hsa_executable_t executable,
180 hsa_executable_symbol_t symbol, void *data),
181 void *data);
182 uint64_t (*hsa_queue_add_write_index_release_fn) (const hsa_queue_t *queue,
183 uint64_t value);
184 uint64_t (*hsa_queue_load_read_index_acquire_fn) (const hsa_queue_t *queue);
185 void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
186 hsa_signal_value_t value);
187 void (*hsa_signal_store_release_fn) (hsa_signal_t signal,
188 hsa_signal_value_t value);
189 hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
190 (hsa_signal_t signal, hsa_signal_condition_t condition,
191 hsa_signal_value_t compare_value, uint64_t timeout_hint,
192 hsa_wait_state_t wait_state_hint);
193 hsa_signal_value_t (*hsa_signal_load_acquire_fn) (hsa_signal_t signal);
194 hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);
195
196 hsa_status_t (*hsa_code_object_deserialize_fn)
197 (void *serialized_code_object, size_t serialized_code_object_size,
198 const char *options, hsa_code_object_t *code_object);
199 };
200
201 /* Structure describing the run-time and grid properties of an HSA kernel
202 launch. This needs to match the format passed to GOMP_OFFLOAD_run. */
203
204 struct GOMP_kernel_launch_attributes
205 {
206 /* Number of dimensions the workload has. Maximum number is 3. */
207 uint32_t ndim;
208 /* Size of the grid in the three respective dimensions. */
209 uint32_t gdims[3];
210 /* Size of work-groups in the respective dimensions. */
211 uint32_t wdims[3];
212 };
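
/* Illustrative sketch, not part of the upstream file: how these attributes
   might be populated for a default OpenMP-style launch.  The example_*
   helper and its values are hypothetical; they mirror the defaults chosen
   by parse_target_attributes further below.  */

static void __attribute__ ((unused))
example_fill_launch_attributes (struct GOMP_kernel_launch_attributes *kla)
{
  kla->ndim = 3;
  kla->gdims[0] = 40;	/* Number of teams.  */
  kla->gdims[1] = 64;	/* Each thread is 64 work items wide.  */
  kla->gdims[2] = 16;	/* Threads per team.  */
  kla->wdims[0] = 1;	/* Single team per work-group.  */
  kla->wdims[1] = 64;
  kla->wdims[2] = 16;
}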
213
214 /* Collection of information needed for a dispatch of a kernel from a
215 kernel. */
216
217 struct kernel_dispatch
218 {
219 struct agent_info *agent;
220 /* Pointer to a command queue associated with a kernel dispatch agent. */
221 void *queue;
222 /* Pointer to a memory space used for kernel arguments passing. */
223 void *kernarg_address;
224 /* Kernel object. */
225 uint64_t object;
226 /* Synchronization signal used for dispatch synchronization. */
227 uint64_t signal;
228 /* Private segment size. */
229 uint32_t private_segment_size;
230 /* Group segment size. */
231 uint32_t group_segment_size;
232 };
233
234 /* Structure of the kernargs segment, supporting console output.
235
236 This needs to match the definitions in Newlib, and the expectations
237 in libgomp target code. */
238
239 struct kernargs {
240 /* Leave space for the real kernel arguments.
241 OpenACC and OpenMP only use one pointer. */
242 int64_t dummy1;
243 int64_t dummy2;
244
245 /* A pointer to struct output, below, for console output data. */
246 int64_t out_ptr;
247
248 /* A pointer to struct heap, below. */
249 int64_t heap_ptr;
250
251 /* A pointer to an ephemeral memory arena.
252 Only needed for OpenMP. */
253 int64_t arena_ptr;
254
255 /* Output data. */
256 struct output output_data;
257 };
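
/* Illustrative sketch, not part of the upstream file: minimal host-side
   initialization of a kernargs block, mirroring what create_kernel_dispatch
   and the launch code do later in this file.  The example_* helper is
   hypothetical; VARS is the single argument pointer the kernels expect.  */

static void __attribute__ ((unused))
example_init_kernargs (struct kernargs *kernargs, void *vars, void *heap)
{
  kernargs->dummy1 = (int64_t) vars;	/* The one real kernel argument.  */
  kernargs->out_ptr = (int64_t) &kernargs->output_data;
  kernargs->heap_ptr = (int64_t) heap;	/* For device-side malloc.  */
  kernargs->arena_ptr = 0;		/* No team arena (OpenACC case).  */
  kernargs->output_data.next_output = 0;
  kernargs->output_data.consumed = 0;
}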
258
259 /* A queue entry for a future asynchronous launch. */
260
261 struct kernel_launch
262 {
263 struct kernel_info *kernel;
264 void *vars;
265 struct GOMP_kernel_launch_attributes kla;
266 };
267
268 /* A queue entry for a future callback. */
269
270 struct callback
271 {
272 void (*fn)(void *);
273 void *data;
274 };
275
276 /* A data struct for the copy_data callback. */
277
278 struct copy_data
279 {
280 void *dst;
281 const void *src;
282 size_t len;
283 struct goacc_asyncqueue *aq;
284 };
285
286 /* A queue entry for a placeholder. These correspond to wait events. */
287
288 struct placeholder
289 {
290 int executed;
291 pthread_cond_t cond;
292 pthread_mutex_t mutex;
293 };
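
/* Illustrative sketch, not part of the upstream file: how a placeholder is
   consumed.  A waiting thread blocks on the condition variable until the
   queue-drain thread marks the entry executed; the real implementation
   lives in the async-queue code later in this file.  The example_* helper
   is hypothetical.  */

static void __attribute__ ((unused))
example_wait_for_placeholder (struct placeholder *placeholderp)
{
  pthread_mutex_lock (&placeholderp->mutex);
  while (!placeholderp->executed)
    pthread_cond_wait (&placeholderp->cond, &placeholderp->mutex);
  pthread_mutex_unlock (&placeholderp->mutex);
}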
294
295 /* A queue entry for a wait directive. */
296
297 struct asyncwait_info
298 {
299 struct placeholder *placeholderp;
300 };
301
302 /* Encode the type of an entry in an async queue. */
303
304 enum entry_type
305 {
306 KERNEL_LAUNCH,
307 CALLBACK,
308 ASYNC_WAIT,
309 ASYNC_PLACEHOLDER
310 };
311
312 /* An entry in an async queue. */
313
314 struct queue_entry
315 {
316 enum entry_type type;
317 union {
318 struct kernel_launch launch;
319 struct callback callback;
320 struct asyncwait_info asyncwait;
321 struct placeholder placeholder;
322 } u;
323 };
324
325 /* An async queue header.
326
327 OpenMP may create one of these.
328 OpenACC may create many. */
329
330 struct goacc_asyncqueue
331 {
332 struct agent_info *agent;
333 hsa_queue_t *hsa_queue;
334
335 pthread_t thread_drain_queue;
336 pthread_mutex_t mutex;
337 pthread_cond_t queue_cond_in;
338 pthread_cond_t queue_cond_out;
339 struct queue_entry queue[ASYNC_QUEUE_SIZE];
340 int queue_first;
341 int queue_n;
342 int drain_queue_stop;
343
344 int id;
345 struct goacc_asyncqueue *prev;
346 struct goacc_asyncqueue *next;
347 };
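
/* Illustrative sketch, not part of the upstream file: the producer side of
   the ring buffer above.  An entry is written at the tail under the mutex,
   the drain thread is woken via queue_cond_in, and producers sleep on
   queue_cond_out while the ring is full.  The example_* helper is
   hypothetical; the real push functions appear later in this file.  */

static void __attribute__ ((unused))
example_queue_push (struct goacc_asyncqueue *aq, struct queue_entry *entry)
{
  pthread_mutex_lock (&aq->mutex);
  while (aq->queue_n == ASYNC_QUEUE_SIZE)    /* Ring full; wait for space.  */
    pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
  aq->queue[(aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE] = *entry;
  aq->queue_n++;
  pthread_cond_signal (&aq->queue_cond_in);  /* Wake the drain thread.  */
  pthread_mutex_unlock (&aq->mutex);
}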
348
349 /* Mkoffload uses this structure to describe a kernel.
350
351 OpenMP kernel dimensions are passed at runtime.
352 OpenACC kernel dimensions are passed at compile time, here. */
353
354 struct hsa_kernel_description
355 {
356 const char *name;
357 int oacc_dims[3]; /* Only present for GCN kernels. */
358 int sgpr_count;
359 int vpgr_count;
360 };
361
362 /* Mkoffload uses this structure to describe an offload variable. */
363
364 struct global_var_info
365 {
366 const char *name;
367 void *address;
368 };
369
370 /* Mkoffload uses this structure to describe all the kernels in a
371 loadable module. These are passed to libgomp via static constructors. */
372
373 struct gcn_image_desc
374 {
375 struct gcn_image {
376 size_t size;
377 void *image;
378 } *gcn_image;
379 const unsigned kernel_count;
380 struct hsa_kernel_description *kernel_infos;
381 const unsigned global_variable_count;
382 };
383
384 /* This enum mirrors the corresponding LLVM enum's values for all ISAs that we
385 support.
386 See https://llvm.org/docs/AMDGPUUsage.html#amdgpu-ef-amdgpu-mach-table */
387
388 typedef enum {
389 EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a,
390 EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c,
391 EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f,
392 EF_AMDGPU_MACH_AMDGCN_GFX908 = 0x030,
393 EF_AMDGPU_MACH_AMDGCN_GFX90a = 0x03f
394 } EF_AMDGPU_MACH;
395
396 const static int EF_AMDGPU_MACH_MASK = 0x000000ff;
397 typedef EF_AMDGPU_MACH gcn_isa;
398
399 /* Description of an HSA GPU agent (device) and the program associated with
400 it. */
401
402 struct agent_info
403 {
404 /* The HSA ID of the agent. Assigned when hsa_context is initialized. */
405 hsa_agent_t id;
406 /* The user-visible device number. */
407 int device_id;
408 /* Whether the agent has been initialized. The fields below are usable only
409 if it has been. */
410 bool initialized;
411
412 /* The instruction set architecture of the device. */
413 gcn_isa device_isa;
414 /* Name of the agent. */
415 char name[64];
416 /* Name of the vendor of the agent. */
417 char vendor_name[64];
418 /* Command queues of the agent. */
419 hsa_queue_t *sync_queue;
420 struct goacc_asyncqueue *async_queues, *omp_async_queue;
421 pthread_mutex_t async_queues_mutex;
422
423 /* The HSA memory region from which to allocate kernel arguments. */
424 hsa_region_t kernarg_region;
425
426 /* The HSA memory region from which to allocate device data. */
427 hsa_region_t data_region;
428
429 /* Allocated team arenas. */
430 struct team_arena_list *team_arena_list;
431 pthread_mutex_t team_arena_write_lock;
432
433 /* Read-write lock that protects kernels which are running or about to be run
434 from interference with loading and unloading of images. Needs to be
435 locked for reading while a kernel is being run, and for writing if the
436 list of modules is manipulated (and thus the HSA program invalidated). */
437 pthread_rwlock_t module_rwlock;
438
439 /* The module associated with this agent. */
440 struct module_info *module;
441
442 /* Mutex enforcing that only one thread will finalize the HSA program. A
443 thread should have locked agent->module_rwlock for reading before
444 acquiring it. */
445 pthread_mutex_t prog_mutex;
446 /* Flag whether the HSA program that consists of all the modules has been
447 finalized. */
448 bool prog_finalized;
449 /* HSA executable - the finalized program that is used to locate kernels. */
450 hsa_executable_t executable;
451 };
452
453 /* Information required to identify, finalize and run any given kernel. */
454
455 enum offload_kind {KIND_UNKNOWN, KIND_OPENMP, KIND_OPENACC};
456
457 struct kernel_info
458 {
459 /* Name of the kernel, required to locate it within the GCN object-code
460 module. */
461 const char *name;
462 /* The specific agent the kernel has been or will be finalized for and run
463 on. */
464 struct agent_info *agent;
465 /* The specific module in which the kernel is located. */
466 struct module_info *module;
467 /* Information provided by mkoffload associated with the kernel. */
468 struct hsa_kernel_description *description;
469 /* Mutex enforcing that at most one thread ever initializes a kernel for
470 use. A thread should have locked agent->module_rwlock for reading before
471 acquiring it. */
472 pthread_mutex_t init_mutex;
473 /* Flag indicating whether the kernel has been initialized and all fields
474 below it contain valid data. */
475 bool initialized;
476 /* Flag indicating that the kernel has a problem that blocks its execution. */
477 bool initialization_failed;
478 /* The object to be put into the dispatch queue. */
479 uint64_t object;
480 /* Required size of kernel arguments. */
481 uint32_t kernarg_segment_size;
482 /* Required size of group segment. */
483 uint32_t group_segment_size;
484 /* Required size of private segment. */
485 uint32_t private_segment_size;
486 /* Set up for OpenMP or OpenACC? */
487 enum offload_kind kind;
488 };
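
/* Illustrative sketch, not part of the upstream file: the locking
   discipline described by the comments above.  A thread takes the agent's
   module_rwlock for reading before touching a kernel, then the kernel's
   init_mutex so that one-time initialization happens exactly once.  The
   example_* helper is hypothetical and simplified: the real code keeps the
   read lock held for the whole duration of a kernel run.  */

static bool __attribute__ ((unused))
example_check_kernel_ready (struct kernel_info *kernel)
{
  bool ok = false;
  if (pthread_rwlock_rdlock (&kernel->agent->module_rwlock))
    return false;
  if (pthread_mutex_lock (&kernel->init_mutex) == 0)
    {
      /* The real code performs the one-time initialization here.  */
      ok = kernel->initialized && !kernel->initialization_failed;
      pthread_mutex_unlock (&kernel->init_mutex);
    }
  pthread_rwlock_unlock (&kernel->agent->module_rwlock);
  return ok;
}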
489
490 /* Information about a particular GCN module, its image and kernels. */
491
492 struct module_info
493 {
494 /* The description with which the program has registered the image. */
495 struct gcn_image_desc *image_desc;
496 /* GCN heap allocation. */
497 struct heap *heap;
498 /* Physical boundaries of the loaded module. */
499 Elf64_Addr phys_address_start;
500 Elf64_Addr phys_address_end;
501
502 bool constructors_run_p;
503 struct kernel_info *init_array_func, *fini_array_func;
504
505 /* Number of kernels in this module. */
506 int kernel_count;
507 /* An array of kernel_info structures describing each kernel in this
508 module. */
509 struct kernel_info kernels[];
510 };
511
512 /* A linked list of memory arenas allocated on the device.
513 These are only used by OpenMP, as a means to optimize per-team malloc. */
514
515 struct team_arena_list
516 {
517 struct team_arena_list *next;
518
519 /* The number of teams determines the size of the allocation. */
520 int num_teams;
521 /* The device address of the arena itself. */
522 void *arena;
523 /* A flag to prevent two asynchronous kernels trying to use the same arena.
524 The mutex is locked until the kernel exits. */
525 pthread_mutex_t in_use;
526 };
527
528 /* Information about the whole HSA environment and all of its agents. */
529
530 struct hsa_context_info
531 {
532 /* Whether the structure has been initialized. */
533 bool initialized;
534 /* Number of usable GPU HSA agents in the system. */
535 int agent_count;
536 /* Array of agent_info structures describing the individual HSA agents. */
537 struct agent_info *agents;
538 /* Driver version string. */
539 char driver_version_s[30];
540 };
541
542 /* Format of the on-device heap.
543
544 This must match the definition in Newlib and gcn-run. */
545
546 struct heap {
547 int64_t size;
548 char data[0];
549 };
550
551 /* }}} */
552 /* {{{ Global variables */
553
554 /* Information about the whole HSA environment and all of its agents. */
555
556 static struct hsa_context_info hsa_context;
557
558 /* HSA runtime functions that are initialized in init_hsa_context. */
559
560 static struct hsa_runtime_fn_info hsa_fns;
561
562 /* Heap space, allocated target-side, provided for use of newlib malloc.
563 Each module should have its own heap allocated.
564 Beware that heap usage increases with OpenMP teams. See also arenas. */
565
566 static size_t gcn_kernel_heap_size = DEFAULT_GCN_HEAP_SIZE;
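
/* Illustrative sketch, not part of the upstream file: creating an
   on-device heap in the format of struct heap above.  The allocation must
   include the size header, and because the memory is device-resident the
   size field is written via hsa_memory_copy.  The example_* helper is
   hypothetical; the real heap setup happens during image loading.  */

static __attribute__ ((unused)) struct heap *
example_alloc_device_heap (struct agent_info *agent, int64_t heap_size)
{
  struct heap *heap = NULL;
  hsa_status_t status
    = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
				      sizeof (struct heap) + heap_size,
				      (void **) &heap);
  if (status != HSA_STATUS_SUCCESS)
    return NULL;
  status = hsa_fns.hsa_memory_assign_agent_fn (heap, agent->id,
					       HSA_ACCESS_PERMISSION_RW);
  if (status != HSA_STATUS_SUCCESS)
    return NULL;
  hsa_fns.hsa_memory_copy_fn (&heap->size, &heap_size, sizeof (heap_size));
  return heap;
}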
567
568 /* Flag to decide whether to print information about what is going on to
569 stderr. Set in init_environment_variables depending on the environment. */
570
571 static bool debug;
572
573 /* Flag to decide if the runtime should suppress a possible fallback to host
574 execution. */
575
576 static bool suppress_host_fallback;
577
578 /* Name of the HSA runtime shared library that is dlopened
579 by this plug-in. */
580
581 static const char *hsa_runtime_lib;
582
583 /* Flag to decide if the runtime should also support CPU devices (which can
584 be simulators). */
585
586 static bool support_cpu_devices;
587
588 /* Runtime dimension overrides. Zero indicates default. */
589
590 static int override_x_dim = 0;
591 static int override_z_dim = 0;
592
593 /* }}} */
594 /* {{{ Debug & Diagnostic */
595
596 /* Print a message to stderr if GCN_DEBUG value is set to true. */
597
598 #define DEBUG_PRINT(...) \
599 do \
600 { \
601 if (debug) \
602 { \
603 fprintf (stderr, __VA_ARGS__); \
604 } \
605 } \
606 while (false)
607
608 /* Flush stderr if GCN_DEBUG value is set to true. */
609
610 #define DEBUG_FLUSH() \
611 do { \
612 if (debug) \
613 fflush (stderr); \
614 } while (false)
615
616 /* Print a logging message with PREFIX to stderr if GCN_DEBUG value
617 is set to true. */
618
619 #define DEBUG_LOG(prefix, ...) \
620 do \
621 { \
622 DEBUG_PRINT (prefix); \
623 DEBUG_PRINT (__VA_ARGS__); \
624 DEBUG_FLUSH (); \
625 } while (false)
626
627 /* Print a debugging message to stderr. */
628
629 #define GCN_DEBUG(...) DEBUG_LOG ("GCN debug: ", __VA_ARGS__)
630
631 /* Print a warning message to stderr. */
632
633 #define GCN_WARNING(...) DEBUG_LOG ("GCN warning: ", __VA_ARGS__)
634
635 /* Print HSA warning STR with an HSA STATUS code. */
636
637 static void
638 hsa_warn (const char *str, hsa_status_t status)
639 {
640 if (!debug)
641 return;
642
643 const char *hsa_error_msg = "[unknown]";
644 hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
645
646 fprintf (stderr, "GCN warning: %s\nRuntime message: %s\n", str,
647 hsa_error_msg);
648 }
649
650 /* Report a fatal error STR together with the HSA error corresponding to STATUS
651 and terminate execution of the current process. */
652
653 static void
654 hsa_fatal (const char *str, hsa_status_t status)
655 {
656 const char *hsa_error_msg = "[unknown]";
657 hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
658 GOMP_PLUGIN_fatal ("GCN fatal error: %s\nRuntime message: %s\n", str,
659 hsa_error_msg);
660 }
661
662 /* Like hsa_fatal, except it only reports the error message and returns FALSE,
663 so that error processing can be propagated to outside of the plugin. */
664
665 static bool
666 hsa_error (const char *str, hsa_status_t status)
667 {
668 const char *hsa_error_msg = "[unknown]";
669 hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
670 GOMP_PLUGIN_error ("GCN fatal error: %s\nRuntime message: %s\n", str,
671 hsa_error_msg);
672 return false;
673 }
674
675 /* Dump information about the available hardware. */
676
677 static void
678 dump_hsa_system_info (void)
679 {
680 hsa_status_t status;
681
682 hsa_endianness_t endianness;
683 status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_ENDIANNESS,
684 &endianness);
685 if (status == HSA_STATUS_SUCCESS)
686 switch (endianness)
687 {
688 case HSA_ENDIANNESS_LITTLE:
689 GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: LITTLE\n");
690 break;
691 case HSA_ENDIANNESS_BIG:
692 GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: BIG\n");
693 break;
694 default:
695 GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: UNKNOWN\n");
696 }
697 else
698 GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: FAILED\n");
699
700 uint8_t extensions[128];
701 status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_EXTENSIONS,
702 &extensions);
703 if (status == HSA_STATUS_SUCCESS)
704 {
705 if (extensions[0] & (1 << HSA_EXTENSION_IMAGES))
706 GCN_DEBUG ("HSA_SYSTEM_INFO_EXTENSIONS: IMAGES\n");
707 }
708 else
709 GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n");
710 }
711
712 /* Dump information about the available hardware. */
713
714 static void
715 dump_machine_model (hsa_machine_model_t machine_model, const char *s)
716 {
717 switch (machine_model)
718 {
719 case HSA_MACHINE_MODEL_SMALL:
720 GCN_DEBUG ("%s: SMALL\n", s);
721 break;
722 case HSA_MACHINE_MODEL_LARGE:
723 GCN_DEBUG ("%s: LARGE\n", s);
724 break;
725 default:
726 GCN_WARNING ("%s: UNKNOWN\n", s);
727 break;
728 }
729 }
730
731 /* Dump information about the available hardware. */
732
733 static void
734 dump_profile (hsa_profile_t profile, const char *s)
735 {
736 switch (profile)
737 {
738 case HSA_PROFILE_FULL:
739 GCN_DEBUG ("%s: FULL\n", s);
740 break;
741 case HSA_PROFILE_BASE:
742 GCN_DEBUG ("%s: BASE\n", s);
743 break;
744 default:
745 GCN_WARNING ("%s: UNKNOWN\n", s);
746 break;
747 }
748 }
749
750 /* Dump information about a device memory region. */
751
752 static hsa_status_t
753 dump_hsa_region (hsa_region_t region, void *data __attribute__((unused)))
754 {
755 hsa_status_t status;
756
757 hsa_region_segment_t segment;
758 status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
759 &segment);
760 if (status == HSA_STATUS_SUCCESS)
761 {
762 if (segment == HSA_REGION_SEGMENT_GLOBAL)
763 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GLOBAL\n");
764 else if (segment == HSA_REGION_SEGMENT_READONLY)
765 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: READONLY\n");
766 else if (segment == HSA_REGION_SEGMENT_PRIVATE)
767 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: PRIVATE\n");
768 else if (segment == HSA_REGION_SEGMENT_GROUP)
769 GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GROUP\n");
770 else
771 GCN_WARNING ("HSA_REGION_INFO_SEGMENT: UNKNOWN\n");
772 }
773 else
774 GCN_WARNING ("HSA_REGION_INFO_SEGMENT: FAILED\n");
775
776 if (status == HSA_STATUS_SUCCESS && segment == HSA_REGION_SEGMENT_GLOBAL)
777 {
778 uint32_t flags;
779 status
780 = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
781 &flags);
782 if (status == HSA_STATUS_SUCCESS)
783 {
784 if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
785 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: KERNARG\n");
786 if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED)
787 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: FINE_GRAINED\n");
788 if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED)
789 GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: COARSE_GRAINED\n");
790 }
791 else
792 GCN_WARNING ("HSA_REGION_INFO_GLOBAL_FLAGS: FAILED\n");
793 }
794
795 size_t size;
796 status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
797 if (status == HSA_STATUS_SUCCESS)
798 GCN_DEBUG ("HSA_REGION_INFO_SIZE: %zu\n", size);
799 else
800 GCN_WARNING ("HSA_REGION_INFO_SIZE: FAILED\n");
801
802 status
803 = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_ALLOC_MAX_SIZE,
804 &size);
805 if (status == HSA_STATUS_SUCCESS)
806 GCN_DEBUG ("HSA_REGION_INFO_ALLOC_MAX_SIZE: %zu\n", size);
807 else
808 GCN_WARNING ("HSA_REGION_INFO_ALLOC_MAX_SIZE: FAILED\n");
809
810 bool alloc_allowed;
811 status
812 = hsa_fns.hsa_region_get_info_fn (region,
813 HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED,
814 &alloc_allowed);
815 if (status == HSA_STATUS_SUCCESS)
816 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: %u\n", alloc_allowed);
817 else
818 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: FAILED\n");
819
820 if (status != HSA_STATUS_SUCCESS || !alloc_allowed)
821 return HSA_STATUS_SUCCESS;
822
823 status
824 = hsa_fns.hsa_region_get_info_fn (region,
825 HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE,
826 &size);
827 if (status == HSA_STATUS_SUCCESS)
828 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: %zu\n", size);
829 else
830 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: FAILED\n");
831
832 size_t align;
833 status
834 = hsa_fns.hsa_region_get_info_fn (region,
835 HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT,
836 &align);
837 if (status == HSA_STATUS_SUCCESS)
838 GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: %zu\n", align);
839 else
840 GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: FAILED\n");
841
842 return HSA_STATUS_SUCCESS;
843 }
844
845 /* Dump information about all the device memory regions. */
846
847 static void
848 dump_hsa_regions (hsa_agent_t agent)
849 {
850 hsa_status_t status;
851 status = hsa_fns.hsa_agent_iterate_regions_fn (agent,
852 dump_hsa_region,
853 NULL);
854 if (status != HSA_STATUS_SUCCESS)
855 hsa_error ("Dumping hsa regions failed", status);
856 }
857
858 /* Dump information about the available devices. */
859
860 static hsa_status_t
861 dump_hsa_agent_info (hsa_agent_t agent, void *data __attribute__((unused)))
862 {
863 hsa_status_t status;
864
865 char buf[64];
866 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME,
867 &buf);
868 if (status == HSA_STATUS_SUCCESS)
869 GCN_DEBUG ("HSA_AGENT_INFO_NAME: %s\n", buf);
870 else
871 GCN_WARNING ("HSA_AGENT_INFO_NAME: FAILED\n");
872
873 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_VENDOR_NAME,
874 &buf);
875 if (status == HSA_STATUS_SUCCESS)
876 GCN_DEBUG ("HSA_AGENT_INFO_VENDOR_NAME: %s\n", buf);
877 else
878 GCN_WARNING ("HSA_AGENT_INFO_VENDOR_NAME: FAILED\n");
879
880 hsa_machine_model_t machine_model;
881 status
882 = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_MACHINE_MODEL,
883 &machine_model);
884 if (status == HSA_STATUS_SUCCESS)
885 dump_machine_model (machine_model, "HSA_AGENT_INFO_MACHINE_MODEL");
886 else
887 GCN_WARNING ("HSA_AGENT_INFO_MACHINE_MODEL: FAILED\n");
888
889 hsa_profile_t profile;
890 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_PROFILE,
891 &profile);
892 if (status == HSA_STATUS_SUCCESS)
893 dump_profile (profile, "HSA_AGENT_INFO_PROFILE");
894 else
895 GCN_WARNING ("HSA_AGENT_INFO_PROFILE: FAILED\n");
896
897 hsa_device_type_t device_type;
898 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
899 &device_type);
900 if (status == HSA_STATUS_SUCCESS)
901 {
902 switch (device_type)
903 {
904 case HSA_DEVICE_TYPE_CPU:
905 GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: CPU\n");
906 break;
907 case HSA_DEVICE_TYPE_GPU:
908 GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: GPU\n");
909 break;
910 case HSA_DEVICE_TYPE_DSP:
911 GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: DSP\n");
912 break;
913 default:
914 GCN_WARNING ("HSA_AGENT_INFO_DEVICE: UNKNOWN\n");
915 break;
916 }
917 }
918 else
919 GCN_WARNING ("HSA_AGENT_INFO_DEVICE: FAILED\n");
920
921 uint32_t cu_count;
922 status = hsa_fns.hsa_agent_get_info_fn
923 (agent, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
924 if (status == HSA_STATUS_SUCCESS)
925 GCN_DEBUG ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: %u\n", cu_count);
926 else
927 GCN_WARNING ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: FAILED\n");
928
929 uint32_t size;
930 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_WAVEFRONT_SIZE,
931 &size);
932 if (status == HSA_STATUS_SUCCESS)
933 GCN_DEBUG ("HSA_AGENT_INFO_WAVEFRONT_SIZE: %u\n", size);
934 else
935 GCN_WARNING ("HSA_AGENT_INFO_WAVEFRONT_SIZE: FAILED\n");
936
937 uint32_t max_dim;
938 status = hsa_fns.hsa_agent_get_info_fn (agent,
939 HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
940 &max_dim);
941 if (status == HSA_STATUS_SUCCESS)
942 GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: %u\n", max_dim);
943 else
944 GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: FAILED\n");
945
946 uint32_t max_size;
947 status = hsa_fns.hsa_agent_get_info_fn (agent,
948 HSA_AGENT_INFO_WORKGROUP_MAX_SIZE,
949 &max_size);
950 if (status == HSA_STATUS_SUCCESS)
951 GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: %u\n", max_size);
952 else
953 GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: FAILED\n");
954
955 uint32_t grid_max_dim;
956 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_DIM,
957 &grid_max_dim);
958 if (status == HSA_STATUS_SUCCESS)
959 GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_DIM: %u\n", grid_max_dim);
960 else
961 GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_DIM: FAILED\n");
962
963 uint32_t grid_max_size;
964 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_SIZE,
965 &grid_max_size);
966 if (status == HSA_STATUS_SUCCESS)
967 GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_SIZE: %u\n", grid_max_size);
968 else
969 GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_SIZE: FAILED\n");
970
971 dump_hsa_regions (agent);
972
973 return HSA_STATUS_SUCCESS;
974 }
975
976 /* Forward reference. */
977
978 static char *get_executable_symbol_name (hsa_executable_symbol_t symbol);
979
980 /* Helper function for dump_executable_symbols. */
981
982 static hsa_status_t
983 dump_executable_symbol (hsa_executable_t executable,
984 hsa_executable_symbol_t symbol,
985 void *data __attribute__((unused)))
986 {
987 char *name = get_executable_symbol_name (symbol);
988
989 if (name)
990 {
991 GCN_DEBUG ("executable symbol: %s\n", name);
992 free (name);
993 }
994
995 return HSA_STATUS_SUCCESS;
996 }
997
998 /* Dump all global symbols in an executable. */
999
1000 static void
1001 dump_executable_symbols (hsa_executable_t executable)
1002 {
1003 hsa_status_t status;
1004 status
1005 = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
1006 dump_executable_symbol,
1007 NULL);
1008 if (status != HSA_STATUS_SUCCESS)
1009 hsa_fatal ("Could not dump HSA executable symbols", status);
1010 }
1011
1012 /* Dump kernel DISPATCH data structure and indent it by INDENT spaces. */
1013
1014 static void
1015 print_kernel_dispatch (struct kernel_dispatch *dispatch, unsigned indent)
1016 {
1017 struct kernargs *kernargs = (struct kernargs *)dispatch->kernarg_address;
1018
1019 fprintf (stderr, "%*sthis: %p\n", indent, "", dispatch);
1020 fprintf (stderr, "%*squeue: %p\n", indent, "", dispatch->queue);
1021 fprintf (stderr, "%*skernarg_address: %p\n", indent, "", kernargs);
1022 fprintf (stderr, "%*sheap address: %p\n", indent, "",
1023 (void*)kernargs->heap_ptr);
1024 fprintf (stderr, "%*sarena address: %p\n", indent, "",
1025 (void*)kernargs->arena_ptr);
1026 fprintf (stderr, "%*sobject: %lu\n", indent, "", dispatch->object);
1027 fprintf (stderr, "%*sprivate_segment_size: %u\n", indent, "",
1028 dispatch->private_segment_size);
1029 fprintf (stderr, "%*sgroup_segment_size: %u\n", indent, "",
1030 dispatch->group_segment_size);
1031 fprintf (stderr, "\n");
1032 }
1033
1034 /* }}} */
1035 /* {{{ Utility functions */
1036
1037 /* Cast the thread local storage to gcn_thread. */
1038
1039 static inline struct gcn_thread *
1040 gcn_thread (void)
1041 {
1042 return (struct gcn_thread *) GOMP_PLUGIN_acc_thread ();
1043 }
1044
1045 /* Initialize debug and suppress_host_fallback according to the environment. */
1046
1047 static void
1048 init_environment_variables (void)
1049 {
1050 if (secure_getenv ("GCN_DEBUG"))
1051 debug = true;
1052 else
1053 debug = false;
1054
1055 if (secure_getenv ("GCN_SUPPRESS_HOST_FALLBACK"))
1056 suppress_host_fallback = true;
1057 else
1058 suppress_host_fallback = false;
1059
1060 hsa_runtime_lib = secure_getenv ("HSA_RUNTIME_LIB");
1061 if (hsa_runtime_lib == NULL)
1062 hsa_runtime_lib = "libhsa-runtime64.so.1";
1063
1064 support_cpu_devices = secure_getenv ("GCN_SUPPORT_CPU_DEVICES");
1065
1066 const char *x = secure_getenv ("GCN_NUM_TEAMS");
1067 if (!x)
1068 x = secure_getenv ("GCN_NUM_GANGS");
1069 if (x)
1070 override_x_dim = atoi (x);
1071
1072 const char *z = secure_getenv ("GCN_NUM_THREADS");
1073 if (!z)
1074 z = secure_getenv ("GCN_NUM_WORKERS");
1075 if (z)
1076 override_z_dim = atoi (z);
1077
1078 const char *heap = secure_getenv ("GCN_HEAP_SIZE");
1079 if (heap)
1080 {
1081 size_t tmp = atol (heap);
1082 if (tmp)
1083 gcn_kernel_heap_size = tmp;
1084 }
1085 }
1086
1087 /* Return malloc'd string with name of SYMBOL. */
1088
1089 static char *
1090 get_executable_symbol_name (hsa_executable_symbol_t symbol)
1091 {
1092 hsa_status_t status;
1093 char *res;
1094 uint32_t len;
1095 const hsa_executable_symbol_info_t info_name_length
1096 = HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH;
1097
1098 status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name_length,
1099 &len);
1100 if (status != HSA_STATUS_SUCCESS)
1101 {
1102 hsa_error ("Could not get length of symbol name", status);
1103 return NULL;
1104 }
1105
1106 res = GOMP_PLUGIN_malloc (len + 1);
1107
1108 const hsa_executable_symbol_info_t info_name
1109 = HSA_EXECUTABLE_SYMBOL_INFO_NAME;
1110
1111 status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name, res);
1112
1113 if (status != HSA_STATUS_SUCCESS)
1114 {
1115 hsa_error ("Could not get symbol name", status);
1116 free (res);
1117 return NULL;
1118 }
1119
1120 res[len] = '\0';
1121
1122 return res;
1123 }
1124
1125 /* Get the number of GPU Compute Units. */
1126
1127 static int
1128 get_cu_count (struct agent_info *agent)
1129 {
1130 uint32_t cu_count;
1131 hsa_status_t status = hsa_fns.hsa_agent_get_info_fn
1132 (agent->id, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
1133 if (status == HSA_STATUS_SUCCESS)
1134 return cu_count;
1135 else
1136 return 64; /* The usual number for older devices. */
1137 }
1138
1139 /* Calculate the maximum grid size for OMP threads / OACC workers.
1140 This depends on the kernel's resource usage levels. */
1141
1142 static int
1143 limit_worker_threads (int threads)
1144 {
1145 /* FIXME Do something more intelligent here.
1146 GCN can always run 4 threads within a Compute Unit, but
1147 more than that depends on register usage. */
1148 if (threads > 16)
1149 threads = 16;
1150 return threads;
1151 }
1152
1153 /* This sets the maximum number of teams to twice the number of GPU Compute
1154 Units to avoid memory waste and corresponding memory access faults. */
1155
1156 static int
1157 limit_teams (int teams, struct agent_info *agent)
1158 {
1159 int max_teams = 2 * get_cu_count (agent);
1160 if (teams > max_teams)
1161 teams = max_teams;
1162 return teams;
1163 }
1164
1165 /* Parse the target attributes INPUT provided by the compiler and return true
1166 if we should run anything at all. If INPUT lacks grid attributes, fill
1167 DEF with default values and store DEF into *RESULT; else store INPUT there.
1168
1169 This is used for OpenMP only. */
1170
1171 static bool
1172 parse_target_attributes (void **input,
1173 struct GOMP_kernel_launch_attributes *def,
1174 struct GOMP_kernel_launch_attributes **result,
1175 struct agent_info *agent)
1176 {
1177 if (!input)
1178 GOMP_PLUGIN_fatal ("No target arguments provided");
1179
1180 bool grid_attrs_found = false;
1181 bool gcn_dims_found = false;
1182 int gcn_teams = 0;
1183 int gcn_threads = 0;
1184 while (*input)
1185 {
1186 intptr_t id = (intptr_t) *input++, val;
1187
1188 if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
1189 val = (intptr_t) *input++;
1190 else
1191 val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
1192
1193 val = (val > INT_MAX) ? INT_MAX : val;
1194
1195 if ((id & GOMP_TARGET_ARG_DEVICE_MASK) == GOMP_DEVICE_GCN
1196 && ((id & GOMP_TARGET_ARG_ID_MASK)
1197 == GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES))
1198 {
1199 grid_attrs_found = true;
1200 break;
1201 }
1202 else if ((id & GOMP_TARGET_ARG_DEVICE_MASK)
1203 == GOMP_TARGET_ARG_DEVICE_ALL)
1204 {
1205 gcn_dims_found = true;
1206 switch (id & GOMP_TARGET_ARG_ID_MASK)
1207 {
1208 case GOMP_TARGET_ARG_NUM_TEAMS:
1209 gcn_teams = limit_teams (val, agent);
1210 break;
1211 case GOMP_TARGET_ARG_THREAD_LIMIT:
1212 gcn_threads = limit_worker_threads (val);
1213 break;
1214 default:
1215 ;
1216 }
1217 }
1218 }
1219
1220 if (gcn_dims_found)
1221 {
1222 bool gfx900_workaround_p = false;
1223
1224 if (agent->device_isa == EF_AMDGPU_MACH_AMDGCN_GFX900
1225 && gcn_threads == 0 && override_z_dim == 0)
1226 {
1227 gfx900_workaround_p = true;
1228 GCN_WARNING ("VEGA BUG WORKAROUND: reducing default number of "
1229 "threads to at most 4 per team.\n");
1230 GCN_WARNING (" - If this is not a Vega 10 device, please use "
1231 "GCN_NUM_THREADS=16\n");
1232 }
1233
1234 /* Ideally, when a dimension isn't explicitly specified, we should
1235 tune it to run 40 (or 32?) threads per CU with no threads getting queued.
1236 In practice, we tune for peak performance on BabelStream, which
1237 for OpenACC is currently 32 threads per CU. */
1238 def->ndim = 3;
1239 if (gcn_teams <= 0 && gcn_threads <= 0)
1240 {
1241 /* Set up a reasonable number of teams and threads. */
1242 gcn_threads = gfx900_workaround_p ? 4 : 16; // 8;
1243 def->gdims[0] = get_cu_count (agent); // * (40 / gcn_threads);
1244 def->gdims[2] = gcn_threads;
1245 }
1246 else if (gcn_teams <= 0 && gcn_threads > 0)
1247 {
1248 /* Auto-scale the number of teams with the number of threads. */
1249 def->gdims[0] = get_cu_count (agent); // * (40 / gcn_threads);
1250 def->gdims[2] = gcn_threads;
1251 }
1252 else if (gcn_teams > 0 && gcn_threads <= 0)
1253 {
1254 int max_threads = gfx900_workaround_p ? 4 : 16;
1255
1256 /* Auto-scale the number of threads with the number of teams. */
1257 def->gdims[0] = gcn_teams;
1258 def->gdims[2] = 16; // get_cu_count (agent) * 40 / gcn_teams;
1259 if (def->gdims[2] == 0)
1260 def->gdims[2] = 1;
1261 else if (def->gdims[2] > max_threads)
1262 def->gdims[2] = max_threads;
1263 }
1264 else
1265 {
1266 def->gdims[0] = gcn_teams;
1267 def->gdims[2] = gcn_threads;
1268 }
1269 def->gdims[1] = 64; /* Each thread is 64 work items wide. */
1270 def->wdims[0] = 1; /* Single team per work-group. */
1271 def->wdims[1] = 64;
1272 def->wdims[2] = 16;
1273 *result = def;
1274 return true;
1275 }
1276 else if (!grid_attrs_found)
1277 {
1278 def->ndim = 1;
1279 def->gdims[0] = 1;
1280 def->gdims[1] = 1;
1281 def->gdims[2] = 1;
1282 def->wdims[0] = 1;
1283 def->wdims[1] = 1;
1284 def->wdims[2] = 1;
1285 *result = def;
1286 GCN_WARNING ("GOMP_OFFLOAD_run called with no launch attributes\n");
1287 return true;
1288 }
1289
1290 struct GOMP_kernel_launch_attributes *kla;
1291 kla = (struct GOMP_kernel_launch_attributes *) *input;
1292 *result = kla;
1293 if (kla->ndim == 0 || kla->ndim > 3)
1294 GOMP_PLUGIN_fatal ("Invalid number of dimensions (%u)", kla->ndim);
1295
1296 GCN_DEBUG ("GOMP_OFFLOAD_run called with %u dimensions:\n", kla->ndim);
1297 unsigned i;
1298 for (i = 0; i < kla->ndim; i++)
1299 {
1300 GCN_DEBUG (" Dimension %u: grid size %u and group size %u\n", i,
1301 kla->gdims[i], kla->wdims[i]);
1302 if (kla->gdims[i] == 0)
1303 return false;
1304 }
1305 return true;
1306 }
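
/* Illustrative sketch, not part of the upstream file: how GOMP_OFFLOAD_run
   (later in this file) consumes the parser above.  DEF provides storage
   for defaults; KLA ends up pointing either at DEF or into ARGS.  The
   example_* helper is hypothetical.  */

static void __attribute__ ((unused))
example_decode_launch (void **args, struct agent_info *agent)
{
  struct GOMP_kernel_launch_attributes def;
  struct GOMP_kernel_launch_attributes *kla;
  if (!parse_target_attributes (args, &def, &kla, agent))
    return;		/* A grid dimension was zero: nothing to run.  */
  GCN_DEBUG ("Would launch a %u-dimensional grid\n", kla->ndim);
}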
1307
1308 /* Return the group size given the requested GROUP size, GRID size and number
1309 of grid dimensions NDIM. */
1310
1311 static uint32_t
1312 get_group_size (uint32_t ndim, uint32_t grid, uint32_t group)
1313 {
1314 if (group == 0)
1315 {
1316 /* TODO: Provide a default via environment or device characteristics. */
1317 if (ndim == 1)
1318 group = 64;
1319 else if (ndim == 2)
1320 group = 8;
1321 else
1322 group = 4;
1323 }
1324
1325 if (group > grid)
1326 group = grid;
1327 return group;
1328 }
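
/* Illustrative sketch, not part of the upstream file: applying the default
   group sizes to every dimension of a launch.  For example, a 2-D grid of
   1000x6 work items with unspecified group sizes gets 8x6 groups, since a
   group is clamped to the grid size.  The example_* helper is
   hypothetical.  */

static void __attribute__ ((unused))
example_apply_group_sizes (struct GOMP_kernel_launch_attributes *kla)
{
  for (uint32_t i = 0; i < kla->ndim; i++)
    kla->wdims[i] = get_group_size (kla->ndim, kla->gdims[i], kla->wdims[i]);
}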
1329
1330 /* Atomically store pair of uint16_t values (HEADER and REST) to a PACKET. */
1331
1332 static void
1333 packet_store_release (uint32_t* packet, uint16_t header, uint16_t rest)
1334 {
1335 __atomic_store_n (packet, header | (rest << 16), __ATOMIC_RELEASE);
1336 }
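
/* Illustrative sketch, not part of the upstream file, mirroring the launch
   code later in this file: composing the header and setup words of an AQL
   kernel dispatch packet and publishing them atomically.  Storing the
   header last, with release semantics, is what makes the completed packet
   visible to the GPU packet processor.  The example_* helper is
   hypothetical; the constants come from hsa.h.  */

static void __attribute__ ((unused))
example_publish_packet (hsa_kernel_dispatch_packet_t *packet, uint32_t ndim)
{
  uint16_t header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
  header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
  header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
  uint16_t setup
    = (uint16_t) ndim << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
  packet_store_release ((uint32_t *) packet, header, setup);
}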
1337
1338 /* A never-called callback for the HSA command queues. These signal events
1339 that we don't use, so we trigger an error.
1340
1341 This "queue" is not to be confused with the async queues, below. */
1342
1343 static void
1344 hsa_queue_callback (hsa_status_t status,
1345 hsa_queue_t *queue __attribute__ ((unused)),
1346 void *data __attribute__ ((unused)))
1347 {
1348 hsa_fatal ("Asynchronous queue error", status);
1349 }
1350
1351 /* }}} */
1352 /* {{{ HSA initialization */
1353
1354 /* Populate hsa_fns with the function addresses from libhsa-runtime64.so. */
1355
1356 static bool
1357 init_hsa_runtime_functions (void)
1358 {
1359 #define DLSYM_FN(function) \
1360 hsa_fns.function##_fn = dlsym (handle, #function); \
1361 if (hsa_fns.function##_fn == NULL) \
1362 return false;
1363 void *handle = dlopen (hsa_runtime_lib, RTLD_LAZY);
1364 if (handle == NULL)
1365 return false;
1366
1367 DLSYM_FN (hsa_status_string)
1368 DLSYM_FN (hsa_system_get_info)
1369 DLSYM_FN (hsa_agent_get_info)
1370 DLSYM_FN (hsa_init)
1371 DLSYM_FN (hsa_iterate_agents)
1372 DLSYM_FN (hsa_region_get_info)
1373 DLSYM_FN (hsa_queue_create)
1374 DLSYM_FN (hsa_agent_iterate_regions)
1375 DLSYM_FN (hsa_executable_destroy)
1376 DLSYM_FN (hsa_executable_create)
1377 DLSYM_FN (hsa_executable_global_variable_define)
1378 DLSYM_FN (hsa_executable_load_code_object)
1379 DLSYM_FN (hsa_executable_freeze)
1380 DLSYM_FN (hsa_signal_create)
1381 DLSYM_FN (hsa_memory_allocate)
1382 DLSYM_FN (hsa_memory_assign_agent)
1383 DLSYM_FN (hsa_memory_copy)
1384 DLSYM_FN (hsa_memory_free)
1385 DLSYM_FN (hsa_signal_destroy)
1386 DLSYM_FN (hsa_executable_get_symbol)
1387 DLSYM_FN (hsa_executable_symbol_get_info)
1388 DLSYM_FN (hsa_executable_iterate_symbols)
1389 DLSYM_FN (hsa_queue_add_write_index_release)
1390 DLSYM_FN (hsa_queue_load_read_index_acquire)
1391 DLSYM_FN (hsa_signal_wait_acquire)
1392 DLSYM_FN (hsa_signal_store_relaxed)
1393 DLSYM_FN (hsa_signal_store_release)
1394 DLSYM_FN (hsa_signal_load_acquire)
1395 DLSYM_FN (hsa_queue_destroy)
1396 DLSYM_FN (hsa_code_object_deserialize)
1397 return true;
1398 #undef DLSYM_FN
1399 }
1400
1401 /* Return true if the agent is a GPU and can accept concurrent submissions
1402 from different threads. */
1403
1404 static bool
1405 suitable_hsa_agent_p (hsa_agent_t agent)
1406 {
1407 hsa_device_type_t device_type;
1408 hsa_status_t status
1409 = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
1410 &device_type);
1411 if (status != HSA_STATUS_SUCCESS)
1412 return false;
1413
1414 switch (device_type)
1415 {
1416 case HSA_DEVICE_TYPE_GPU:
1417 break;
1418 case HSA_DEVICE_TYPE_CPU:
1419 if (!support_cpu_devices)
1420 return false;
1421 break;
1422 default:
1423 return false;
1424 }
1425
1426 uint32_t features = 0;
1427 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_FEATURE,
1428 &features);
1429 if (status != HSA_STATUS_SUCCESS
1430 || !(features & HSA_AGENT_FEATURE_KERNEL_DISPATCH))
1431 return false;
1432 hsa_queue_type_t queue_type;
1433 status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_QUEUE_TYPE,
1434 &queue_type);
1435 if (status != HSA_STATUS_SUCCESS
1436 || (queue_type != HSA_QUEUE_TYPE_MULTI))
1437 return false;
1438
1439 return true;
1440 }
1441
1442 /* Callback of hsa_iterate_agents; if AGENT is a GPU device, increment
1443 agent_count in hsa_context. */
1444
1445 static hsa_status_t
1446 count_gpu_agents (hsa_agent_t agent, void *data __attribute__ ((unused)))
1447 {
1448 if (suitable_hsa_agent_p (agent))
1449 hsa_context.agent_count++;
1450 return HSA_STATUS_SUCCESS;
1451 }
1452
1453 /* Callback of hsa_iterate_agents; if AGENT is a GPU device, assign the agent
1454 id to the describing structure in the hsa context. The index of the
1455 structure is pointed to by DATA and is incremented afterwards. */
1456
1457 static hsa_status_t
1458 assign_agent_ids (hsa_agent_t agent, void *data)
1459 {
1460 if (suitable_hsa_agent_p (agent))
1461 {
1462 int *agent_index = (int *) data;
1463 hsa_context.agents[*agent_index].id = agent;
1464 ++*agent_index;
1465 }
1466 return HSA_STATUS_SUCCESS;
1467 }
1468
1469 /* Initialize hsa_context if it has not already been done.
1470 Return TRUE on success. */
1471
1472 static bool
1473 init_hsa_context (void)
1474 {
1475 hsa_status_t status;
1476 int agent_index = 0;
1477
1478 if (hsa_context.initialized)
1479 return true;
1480 init_environment_variables ();
1481 if (!init_hsa_runtime_functions ())
1482 {
1483 GCN_WARNING ("Run-time could not be dynamically opened\n");
1484 if (suppress_host_fallback)
1485 GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
1486 return false;
1487 }
1488 status = hsa_fns.hsa_init_fn ();
1489 if (status != HSA_STATUS_SUCCESS)
1490 return hsa_error ("Run-time could not be initialized", status);
1491 GCN_DEBUG ("HSA run-time initialized for GCN\n");
1492
1493 if (debug)
1494 dump_hsa_system_info ();
1495
1496 status = hsa_fns.hsa_iterate_agents_fn (count_gpu_agents, NULL);
1497 if (status != HSA_STATUS_SUCCESS)
1498 return hsa_error ("GCN GPU devices could not be enumerated", status);
1499 GCN_DEBUG ("There are %i GCN GPU devices.\n", hsa_context.agent_count);
1500
1501 hsa_context.agents
1502 = GOMP_PLUGIN_malloc_cleared (hsa_context.agent_count
1503 * sizeof (struct agent_info));
1504 status = hsa_fns.hsa_iterate_agents_fn (assign_agent_ids, &agent_index);
1505 if (status != HSA_STATUS_SUCCESS)
1506 return hsa_error ("Scanning compute agents failed", status);
1507 if (agent_index != hsa_context.agent_count)
1508 {
1509 GOMP_PLUGIN_error ("Failed to assign IDs to all GCN agents");
1510 return false;
1511 }
1512
1513 if (debug)
1514 {
1515 status = hsa_fns.hsa_iterate_agents_fn (dump_hsa_agent_info, NULL);
1516 if (status != HSA_STATUS_SUCCESS)
1517 GOMP_PLUGIN_error ("Failed to list all HSA runtime agents");
1518 }
1519
1520 uint16_t minor, major;
1521 status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MINOR,
1522 &minor);
1523 if (status != HSA_STATUS_SUCCESS)
1524 GOMP_PLUGIN_error ("Failed to obtain HSA runtime minor version");
1525 status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MAJOR,
1526 &major);
1527 if (status != HSA_STATUS_SUCCESS)
1528 GOMP_PLUGIN_error ("Failed to obtain HSA runtime major version");
1529
1530 size_t len = sizeof hsa_context.driver_version_s;
1531 int printed = snprintf (hsa_context.driver_version_s, len,
1532 "HSA Runtime %hu.%hu", (unsigned short int)major,
1533 (unsigned short int)minor);
1534 if (printed >= len)
1535 GCN_WARNING ("HSA runtime version string was truncated. "
1536 "Version %hu.%hu is too long.", (unsigned short int)major,
1537 (unsigned short int)minor);
1538
1539 hsa_context.initialized = true;
1540 return true;
1541 }
1542
1543 /* Verify that hsa_context has already been initialized and return the
1544 agent_info structure describing device number N. Return NULL on error. */
1545
1546 static struct agent_info *
1547 get_agent_info (int n)
1548 {
1549 if (!hsa_context.initialized)
1550 {
1551 GOMP_PLUGIN_error ("Attempt to use uninitialized GCN context.");
1552 return NULL;
1553 }
1554 if (n >= hsa_context.agent_count)
1555 {
1556 GOMP_PLUGIN_error ("Request to operate on non-existent GCN device %i", n);
1557 return NULL;
1558 }
1559 if (!hsa_context.agents[n].initialized)
1560 {
1561 GOMP_PLUGIN_error ("Attempt to use an uninitialized GCN agent.");
1562 return NULL;
1563 }
1564 return &hsa_context.agents[n];
1565 }
1566
1567 /* Callback of hsa_agent_iterate_regions, via get_*_memory_region functions.
1568
1569 Selects (breaks at) a suitable region of type KIND. */
1570
1571 static hsa_status_t
1572 get_memory_region (hsa_region_t region, hsa_region_t *retval,
1573 hsa_region_global_flag_t kind)
1574 {
1575 hsa_status_t status;
1576 hsa_region_segment_t segment;
1577
1578 status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
1579 &segment);
1580 if (status != HSA_STATUS_SUCCESS)
1581 return status;
1582 if (segment != HSA_REGION_SEGMENT_GLOBAL)
1583 return HSA_STATUS_SUCCESS;
1584
1585 uint32_t flags;
1586 status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
1587 &flags);
1588 if (status != HSA_STATUS_SUCCESS)
1589 return status;
1590 if (flags & kind)
1591 {
1592 *retval = region;
1593 return HSA_STATUS_INFO_BREAK;
1594 }
1595 return HSA_STATUS_SUCCESS;
1596 }
1597
1598 /* Callback of hsa_agent_iterate_regions.
1599
1600 Selects a kernargs memory region. */
1601
1602 static hsa_status_t
1603 get_kernarg_memory_region (hsa_region_t region, void *data)
1604 {
1605 return get_memory_region (region, (hsa_region_t *)data,
1606 HSA_REGION_GLOBAL_FLAG_KERNARG);
1607 }
1608
1609 /* Callback of hsa_agent_iterate_regions.
1610
1611 Selects a coarse-grained memory region suitable for the heap and
1612 offload data. */
1613
1614 static hsa_status_t
1615 get_data_memory_region (hsa_region_t region, void *data)
1616 {
1617 return get_memory_region (region, (hsa_region_t *)data,
1618 HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED);
1619 }
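
/* Illustrative sketch, not part of the upstream file: how the callbacks
   above are used during device initialization.  HSA_STATUS_INFO_BREAK
   signals that a region was selected, so it is not treated as an error.
   The example_* helper is hypothetical.  */

static bool __attribute__ ((unused))
example_find_kernarg_region (struct agent_info *agent)
{
  agent->kernarg_region.handle = (uint64_t) -1;
  hsa_status_t status
    = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
					    get_kernarg_memory_region,
					    &agent->kernarg_region);
  if (status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK)
    return hsa_error ("Scanning memory regions failed", status);
  return agent->kernarg_region.handle != (uint64_t) -1;
}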
1620
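/* Return the ISA-identifying part of the e_flags field of a GCN ELF image. */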
1621 static int
1622 elf_gcn_isa_field (Elf64_Ehdr *image)
1623 {
1624 return image->e_flags & EF_AMDGPU_MACH_MASK;
1625 }
1626
1627 const static char *gcn_gfx803_s = "gfx803";
1628 const static char *gcn_gfx900_s = "gfx900";
1629 const static char *gcn_gfx906_s = "gfx906";
1630 const static char *gcn_gfx908_s = "gfx908";
1631 const static char *gcn_gfx90a_s = "gfx90a";
1632 const static int gcn_isa_name_len = 6;
1633
1634 /* Returns the name that the HSA runtime uses for the ISA or NULL if we do not
1635 support the ISA. */
1636
1637 static const char*
1638 isa_hsa_name (int isa) {
1639 switch(isa)
1640 {
1641 case EF_AMDGPU_MACH_AMDGCN_GFX803:
1642 return gcn_gfx803_s;
1643 case EF_AMDGPU_MACH_AMDGCN_GFX900:
1644 return gcn_gfx900_s;
1645 case EF_AMDGPU_MACH_AMDGCN_GFX906:
1646 return gcn_gfx906_s;
1647 case EF_AMDGPU_MACH_AMDGCN_GFX908:
1648 return gcn_gfx908_s;
1649 case EF_AMDGPU_MACH_AMDGCN_GFX90a:
1650 return gcn_gfx90a_s;
1651 }
1652 return NULL;
1653 }
1654
1655 /* Returns the user-facing name that GCC uses to identify the architecture (e.g.
1656 with -march) or NULL if we do not support the ISA.
1657 Keep in sync with /gcc/config/gcn/gcn.{c,opt}. */
1658
1659 static const char*
1660 isa_gcc_name (int isa) {
1661 switch(isa)
1662 {
1663 case EF_AMDGPU_MACH_AMDGCN_GFX803:
1664 return "fiji";
1665 default:
1666 return isa_hsa_name (isa);
1667 }
1668 }
1669
1670 /* Returns the code which is used in the GCN object code to identify the ISA with
1671 the given name (as used by the HSA runtime). */
1672
1673 static gcn_isa
1674 isa_code(const char *isa) {
1675 if (!strncmp (isa, gcn_gfx803_s, gcn_isa_name_len))
1676 return EF_AMDGPU_MACH_AMDGCN_GFX803;
1677
1678 if (!strncmp (isa, gcn_gfx900_s, gcn_isa_name_len))
1679 return EF_AMDGPU_MACH_AMDGCN_GFX900;
1680
1681 if (!strncmp (isa, gcn_gfx906_s, gcn_isa_name_len))
1682 return EF_AMDGPU_MACH_AMDGCN_GFX906;
1683
1684 if (!strncmp (isa, gcn_gfx908_s, gcn_isa_name_len))
1685 return EF_AMDGPU_MACH_AMDGCN_GFX908;
1686
1687 if (!strncmp (isa, gcn_gfx90a_s, gcn_isa_name_len))
1688 return EF_AMDGPU_MACH_AMDGCN_GFX90a;
1689
1690 return -1;
1691 }
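
/* Illustrative sketch, not part of the upstream file: using the helpers
   above to check that a code object's ELF ISA field matches the agent it
   is about to be loaded on.  The example_* helper is hypothetical; the
   real check is performed during image loading.  */

static bool __attribute__ ((unused))
example_isa_matches (struct agent_info *agent, Elf64_Ehdr *image)
{
  int isa_field = elf_gcn_isa_field (image);
  if (isa_field == agent->device_isa)
    return true;
  const char *have = isa_hsa_name (isa_field);
  const char *want = isa_hsa_name (agent->device_isa);
  GCN_WARNING ("GCN code object ISA '%s' does not match GPU ISA '%s'\n",
	       have ? have : "unknown", want ? want : "unknown");
  return false;
}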
1692
1693 /* }}} */
1694 /* {{{ Run */
1695
1696 /* Create or reuse a team arena.
1697
1698 Team arenas are used by OpenMP to avoid calling malloc multiple times
1699 while setting up each team. This is purely a performance optimization.
1700
1701 Allocating an arena also costs performance, albeit on the host side, so
1702 this function will reuse an existing arena if a large enough one is idle.
1703 The arena is released, but not deallocated, when the kernel exits. */
1704
1705 static void *
1706 get_team_arena (struct agent_info *agent, int num_teams)
1707 {
1708 struct team_arena_list **next_ptr = &agent->team_arena_list;
1709 struct team_arena_list *item;
1710
1711 for (item = *next_ptr; item; next_ptr = &item->next, item = item->next)
1712 {
1713 if (item->num_teams < num_teams)
1714 continue;
1715
1716 if (pthread_mutex_trylock (&item->in_use))
1717 continue;
1718
1719 return item->arena;
1720 }
1721
1722 GCN_DEBUG ("Creating a new arena for %d teams\n", num_teams);
1723
1724 if (pthread_mutex_lock (&agent->team_arena_write_lock))
1725 {
1726 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
1727 return NULL;
1728 }
1729 item = malloc (sizeof (*item));
1730 item->num_teams = num_teams;
1731 item->next = NULL;
1732 *next_ptr = item;
1733
1734 if (pthread_mutex_init (&item->in_use, NULL))
1735 {
1736 GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
1737 return NULL;
1738 }
1739 if (pthread_mutex_lock (&item->in_use))
1740 {
1741 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
1742 return NULL;
1743 }
1744 if (pthread_mutex_unlock (&agent->team_arena_write_lock))
1745 {
1746 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1747 return NULL;
1748 }
1749
1750 const int TEAM_ARENA_SIZE = 64*1024; /* Must match libgomp.h. */
1751 hsa_status_t status;
1752 status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
1753 TEAM_ARENA_SIZE*num_teams,
1754 &item->arena);
1755 if (status != HSA_STATUS_SUCCESS)
1756 hsa_fatal ("Could not allocate memory for GCN kernel arena", status);
1757 status = hsa_fns.hsa_memory_assign_agent_fn (item->arena, agent->id,
1758 HSA_ACCESS_PERMISSION_RW);
1759 if (status != HSA_STATUS_SUCCESS)
1760 hsa_fatal ("Could not assign arena memory to device", status);
1761
1762 return item->arena;
1763 }
1764
1765 /* Mark a team arena available for reuse. */
1766
1767 static void
1768 release_team_arena (struct agent_info* agent, void *arena)
1769 {
1770 struct team_arena_list *item;
1771
1772 for (item = agent->team_arena_list; item; item = item->next)
1773 {
1774 if (item->arena == arena)
1775 {
1776 if (pthread_mutex_unlock (&item->in_use))
1777 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
1778 return;
1779 }
1780 }
1781 GOMP_PLUGIN_error ("Could not find a GCN arena to release.");
1782 }
1783
1784 /* Clean up all the allocated team arenas. */
1785
1786 static bool
1787 destroy_team_arenas (struct agent_info *agent)
1788 {
1789 struct team_arena_list *item, *next;
1790
1791 for (item = agent->team_arena_list; item; item = next)
1792 {
1793 next = item->next;
1794 hsa_fns.hsa_memory_free_fn (item->arena);
1795 if (pthread_mutex_destroy (&item->in_use))
1796 {
1797 GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex");
1798 return false;
1799 }
1800 free (item);
1801 }
1802 agent->team_arena_list = NULL;
1803
1804 return true;
1805 }
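
/* Illustrative sketch, not part of the upstream file: the arena life cycle
   around an OpenMP kernel launch.  The arena is acquired before dispatch,
   its device address travels to the kernel via kernargs->arena_ptr, and
   its in_use mutex is unlocked again once the kernel has finished.  The
   example_* helper is hypothetical.  */

static void __attribute__ ((unused))
example_arena_lifecycle (struct agent_info *agent, struct kernargs *kernargs,
			 int num_teams)
{
  void *arena = get_team_arena (agent, num_teams);
  kernargs->arena_ptr = (int64_t) arena;
  /* ... dispatch the kernel and wait for its completion here ...  */
  release_team_arena (agent, arena);
}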
1806
1807 /* Allocate memory on a specified device. */
1808
1809 static void *
1810 alloc_by_agent (struct agent_info *agent, size_t size)
1811 {
1812 GCN_DEBUG ("Allocating %zu bytes on device %d\n", size, agent->device_id);
1813
1814 /* Zero-size allocations are invalid, so in order to return a valid pointer
1815 we need to pass a valid size. One source of zero-size allocations is
1816 kernargs for kernels that have no inputs or outputs (the kernel may
1817 only use console output, for example). */
1818 if (size == 0)
1819 size = 4;
1820
1821 void *ptr;
1822 hsa_status_t status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
1823 size, &ptr);
1824 if (status != HSA_STATUS_SUCCESS)
1825 {
1826 hsa_error ("Could not allocate device memory", status);
1827 return NULL;
1828 }
1829
1830 status = hsa_fns.hsa_memory_assign_agent_fn (ptr, agent->id,
1831 HSA_ACCESS_PERMISSION_RW);
1832 if (status != HSA_STATUS_SUCCESS)
1833 {
1834 hsa_error ("Could not assign data memory to device", status);
1835 return NULL;
1836 }
1837
1838 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1839 bool profiling_dispatch_p
1840 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1841 if (profiling_dispatch_p)
1842 {
1843 acc_prof_info *prof_info = thr->prof_info;
1844 acc_event_info data_event_info;
1845 acc_api_info *api_info = thr->api_info;
1846
1847 prof_info->event_type = acc_ev_alloc;
1848
1849 data_event_info.data_event.event_type = prof_info->event_type;
1850 data_event_info.data_event.valid_bytes
1851 = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1852 data_event_info.data_event.parent_construct
1853 = acc_construct_parallel;
1854 data_event_info.data_event.implicit = 1;
1855 data_event_info.data_event.tool_info = NULL;
1856 data_event_info.data_event.var_name = NULL;
1857 data_event_info.data_event.bytes = size;
1858 data_event_info.data_event.host_ptr = NULL;
1859 data_event_info.data_event.device_ptr = (void *) ptr;
1860
1861 api_info->device_api = acc_device_api_other;
1862
1863 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1864 api_info);
1865 }
1866
1867 return ptr;
1868 }
1869
1870 /* Create kernel dispatch data structure for given KERNEL, along with
1871 the necessary device signals and memory allocations. */
1872
1873 static struct kernel_dispatch *
1874 create_kernel_dispatch (struct kernel_info *kernel, int num_teams)
1875 {
1876 struct agent_info *agent = kernel->agent;
1877 struct kernel_dispatch *shadow
1878 = GOMP_PLUGIN_malloc_cleared (sizeof (struct kernel_dispatch));
1879
1880 shadow->agent = kernel->agent;
1881 shadow->object = kernel->object;
1882
1883 hsa_signal_t sync_signal;
1884 hsa_status_t status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &sync_signal);
1885 if (status != HSA_STATUS_SUCCESS)
1886 hsa_fatal ("Error creating the GCN sync signal", status);
1887
1888 shadow->signal = sync_signal.handle;
1889 shadow->private_segment_size = kernel->private_segment_size;
1890 shadow->group_segment_size = kernel->group_segment_size;
1891
1892 /* We expect kernels to request a single pointer, explicitly, and the
1893 rest of struct kernargs, implicitly. If they request anything else
1894 then something is wrong. */
1895 if (kernel->kernarg_segment_size > 8)
1896 {
1897 GOMP_PLUGIN_fatal ("Unexpectedly large kernargs segment requested");
1898 return NULL;
1899 }
1900
1901 status = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region,
1902 sizeof (struct kernargs),
1903 &shadow->kernarg_address);
1904 if (status != HSA_STATUS_SUCCESS)
1905 hsa_fatal ("Could not allocate memory for GCN kernel arguments", status);
1906 struct kernargs *kernargs = shadow->kernarg_address;
1907
1908 /* Zero-initialize the output_data (minimum needed). */
1909 kernargs->out_ptr = (int64_t)&kernargs->output_data;
1910 kernargs->output_data.next_output = 0;
1911 for (unsigned i = 0;
1912 i < (sizeof (kernargs->output_data.queue)
1913 / sizeof (kernargs->output_data.queue[0]));
1914 i++)
1915 kernargs->output_data.queue[i].written = 0;
1916 kernargs->output_data.consumed = 0;
1917
1918 /* Pass in the heap location. */
1919 kernargs->heap_ptr = (int64_t)kernel->module->heap;
1920
1921 /* Create an arena. */
1922 if (kernel->kind == KIND_OPENMP)
1923 kernargs->arena_ptr = (int64_t)get_team_arena (agent, num_teams);
1924 else
1925 kernargs->arena_ptr = 0;
1926
1927 /* Ensure we can recognize unset return values. */
1928 kernargs->output_data.return_value = 0xcafe0000;
1929
1930 return shadow;
1931 }
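
/* For reference, a sketch of the layout implied by the accesses above.
   This is illustrative only and the field order is an assumption -- the
   authoritative definition is struct kernargs in
   config/gcn/libgomp-gcn.h, which must match the device-side code.  */
#if 0
struct kernargs_sketch
{
  void *vars;                  /* The single explicit kernel argument.  */
  int64_t out_ptr;             /* Points at output_data, below.  */
  int64_t heap_ptr;            /* Base of the device heap.  */
  int64_t arena_ptr;           /* Team arena for this launch, or zero.  */
  struct output output_data;   /* Console ring buffer and return value.  */
};
#endif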
1932
1933 static void
1934 process_reverse_offload (uint64_t fn, uint64_t mapnum, uint64_t hostaddrs,
1935 uint64_t sizes, uint64_t kinds, uint64_t dev_num64)
1936 {
1937 int dev_num = dev_num64;
1938 GOMP_PLUGIN_target_rev (fn, mapnum, hostaddrs, sizes, kinds, dev_num,
1939 NULL, NULL, NULL);
1940 }
1941
1942 /* Output any data written to console output from the kernel. It is expected
1943 that this function is polled during kernel execution.
1944
1945 We print all entries from the last item printed to the next entry without
1946 a "written" flag. If the "final" flag is set then it'll continue right to
1947 the end.
1948
1949 The print buffer is circular, but the from and to locations don't wrap when
1950 the buffer does, so the output limit is UINT_MAX. The target blocks on
1951 output when the buffer is full. */
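
/* For example, if the queue held 1024 entries, then consumed = 2040 and
   next_output = 2045 would print the entries in slots 2040 % 1024
   through 2044 % 1024, i.e. 1016..1020.  (1024 is illustrative; the
   real length is whatever struct output provides.)  */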
1952
1953 static void
1954 console_output (struct kernel_info *kernel, struct kernargs *kernargs,
1955 bool final)
1956 {
1957 unsigned int limit = (sizeof (kernargs->output_data.queue)
1958 / sizeof (kernargs->output_data.queue[0]));
1959
1960 unsigned int from = __atomic_load_n (&kernargs->output_data.consumed,
1961 __ATOMIC_ACQUIRE);
1962 unsigned int to = kernargs->output_data.next_output;
1963
1964 if (from > to)
1965 {
1966 /* Overflow. */
1967 if (final)
1968 printf ("GCN print buffer overflowed.\n");
1969 return;
1970 }
1971
1972 unsigned int i;
1973 for (i = from; i < to; i++)
1974 {
1975 struct printf_data *data = &kernargs->output_data.queue[i%limit];
1976
1977 if (!data->written && !final)
1978 break;
1979
1980 switch (data->type)
1981 {
1982 case 0: printf ("%.128s%ld\n", data->msg, data->ivalue); break;
1983 case 1: printf ("%.128s%f\n", data->msg, data->dvalue); break;
1984 case 2: printf ("%.128s%.128s\n", data->msg, data->text); break;
1985 case 3: printf ("%.128s%.128s", data->msg, data->text); break;
1986 case 4:
1987 process_reverse_offload (data->value_u64[0], data->value_u64[1],
1988 data->value_u64[2], data->value_u64[3],
1989 data->value_u64[4], data->value_u64[5]);
1990 break;
1991 default: printf ("GCN print buffer error!\n"); break;
1992 }
1993 data->written = 0;
1994 __atomic_store_n (&kernargs->output_data.consumed, i+1,
1995 __ATOMIC_RELEASE);
1996 }
1997 fflush (stdout);
1998 }
1999
2000 /* Release data structure created for a kernel dispatch in SHADOW argument,
2001 and clean up the signal and memory allocations. */
2002
2003 static void
2004 release_kernel_dispatch (struct kernel_dispatch *shadow)
2005 {
2006 GCN_DEBUG ("Released kernel dispatch: %p\n", shadow);
2007
2008 struct kernargs *kernargs = shadow->kernarg_address;
2009 void *arena = (void *)kernargs->arena_ptr;
2010 if (arena)
2011 release_team_arena (shadow->agent, arena);
2012
2013 hsa_fns.hsa_memory_free_fn (shadow->kernarg_address);
2014
2015 hsa_signal_t s;
2016 s.handle = shadow->signal;
2017 hsa_fns.hsa_signal_destroy_fn (s);
2018
2019 free (shadow);
2020 }
2021
2022 /* Extract the properties from a kernel binary. */
2023
2024 static void
2025 init_kernel_properties (struct kernel_info *kernel)
2026 {
2027 hsa_status_t status;
2028 struct agent_info *agent = kernel->agent;
2029 hsa_executable_symbol_t kernel_symbol;
2030 char *buf = alloca (strlen (kernel->name) + 4);
2031 sprintf (buf, "%s.kd", kernel->name);
2032 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
2033 buf, agent->id,
2034 0, &kernel_symbol);
2035 if (status != HSA_STATUS_SUCCESS)
2036 {
2037 hsa_warn ("Could not find symbol for kernel in the code object", status);
2038 fprintf (stderr, "symbol not found: '%s'\n", buf);
2039 dump_executable_symbols (agent->executable);
2040 goto failure;
2041 }
2042 GCN_DEBUG ("Located kernel %s\n", kernel->name);
2043 status = hsa_fns.hsa_executable_symbol_get_info_fn
2044 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel->object);
2045 if (status != HSA_STATUS_SUCCESS)
2046 hsa_fatal ("Could not extract a kernel object from its symbol", status);
2047 status = hsa_fns.hsa_executable_symbol_get_info_fn
2048 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
2049 &kernel->kernarg_segment_size);
2050 if (status != HSA_STATUS_SUCCESS)
2051 hsa_fatal ("Could not get info about kernel argument size", status);
2052 status = hsa_fns.hsa_executable_symbol_get_info_fn
2053 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
2054 &kernel->group_segment_size);
2055 if (status != HSA_STATUS_SUCCESS)
2056 hsa_fatal ("Could not get info about kernel group segment size", status);
2057 status = hsa_fns.hsa_executable_symbol_get_info_fn
2058 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
2059 &kernel->private_segment_size);
2060 if (status != HSA_STATUS_SUCCESS)
2061 hsa_fatal ("Could not get info about kernel private segment size",
2062 status);
2063
2064 /* The kernel type is not known until something tries to launch it. */
2065 kernel->kind = KIND_UNKNOWN;
2066
2067 GCN_DEBUG ("Kernel structure for %s fully initialized with "
2068 "following segment sizes: \n", kernel->name);
2069 GCN_DEBUG (" group_segment_size: %u\n",
2070 (unsigned) kernel->group_segment_size);
2071 GCN_DEBUG (" private_segment_size: %u\n",
2072 (unsigned) kernel->private_segment_size);
2073 GCN_DEBUG (" kernarg_segment_size: %u\n",
2074 (unsigned) kernel->kernarg_segment_size);
2075 return;
2076
2077 failure:
2078 kernel->initialization_failed = true;
2079 }
2080
2081 /* Do all the work that is necessary before running KERNEL for the first time.
2082 The function assumes the program has been created, finalized and frozen by
2083 create_and_finalize_hsa_program. */
2084
2085 static void
2086 init_kernel (struct kernel_info *kernel)
2087 {
2088 if (pthread_mutex_lock (&kernel->init_mutex))
2089 GOMP_PLUGIN_fatal ("Could not lock a GCN kernel initialization mutex");
2090 if (kernel->initialized)
2091 {
2092 if (pthread_mutex_unlock (&kernel->init_mutex))
2093 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2094 "mutex");
2095
2096 return;
2097 }
2098
2099 init_kernel_properties (kernel);
2100
2101 if (!kernel->initialization_failed)
2102 {
2103 GCN_DEBUG ("\n");
2104
2105 kernel->initialized = true;
2106 }
2107 if (pthread_mutex_unlock (&kernel->init_mutex))
2108 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2109 "mutex");
2110 }
2111
2112 /* Run KERNEL on its agent, pass VARS to it as arguments and take
2113 launch attributes from KLA.
2114
2115 MODULE_LOCKED indicates that the caller already holds the lock and
2116 run_kernel need not lock it again.
2117 If AQ is NULL then agent->sync_queue will be used. */
2118
2119 static void
2120 run_kernel (struct kernel_info *kernel, void *vars,
2121 struct GOMP_kernel_launch_attributes *kla,
2122 struct goacc_asyncqueue *aq, bool module_locked)
2123 {
2124 GCN_DEBUG ("SGPRs: %d, VGPRs: %d\n", kernel->description->sgpr_count,
2125 kernel->description->vpgr_count);
2126
2127 /* Reduce the number of threads/workers if there are insufficient
2128 VGPRs available to run the kernels together. */
2129 if (kla->ndim == 3 && kernel->description->vpgr_count > 0)
2130 {
2131 int granulated_vgprs = (kernel->description->vpgr_count + 3) & ~3;
2132 int max_threads = (256 / granulated_vgprs) * 4;
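/* For example, a kernel using 65 VGPRs is granulated to 68; 256/68 = 3
   wavefronts fit on each SIMD unit, and with 4 SIMD units per CU that
   allows at most 12 threads/workers per team.  (The constants assume
   GCN hardware: a 256-register VGPR file per SIMD and 4 SIMDs per CU.)  */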
2133 if (kla->gdims[2] > max_threads)
2134 {
2135 GCN_WARNING ("Too many VGPRs required to support %d threads/workers"
2136 " per team/gang - reducing to %d threads/workers.\n",
2137 kla->gdims[2], max_threads);
2138 kla->gdims[2] = max_threads;
2139 }
2140 }
2141
2142 GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel->agent->device_id,
2143 (aq ? aq->id : 0));
2144 GCN_DEBUG ("GCN launch attribs: gdims:[");
2145 int i;
2146 for (i = 0; i < kla->ndim; ++i)
2147 {
2148 if (i)
2149 DEBUG_PRINT (", ");
2150 DEBUG_PRINT ("%u", kla->gdims[i]);
2151 }
2152 DEBUG_PRINT ("], normalized gdims:[");
2153 for (i = 0; i < kla->ndim; ++i)
2154 {
2155 if (i)
2156 DEBUG_PRINT (", ");
2157 DEBUG_PRINT ("%u", kla->gdims[i] / kla->wdims[i]);
2158 }
2159 DEBUG_PRINT ("], wdims:[");
2160 for (i = 0; i < kla->ndim; ++i)
2161 {
2162 if (i)
2163 DEBUG_PRINT (", ");
2164 DEBUG_PRINT ("%u", kla->wdims[i]);
2165 }
2166 DEBUG_PRINT ("]\n");
2167 DEBUG_FLUSH ();
2168
2169 struct agent_info *agent = kernel->agent;
2170 if (!module_locked && pthread_rwlock_rdlock (&agent->module_rwlock))
2171 GOMP_PLUGIN_fatal ("Unable to read-lock a GCN agent rwlock");
2172
2173 if (!agent->initialized)
2174 GOMP_PLUGIN_fatal ("Agent must be initialized");
2175
2176 if (!kernel->initialized)
2177 GOMP_PLUGIN_fatal ("Called kernel must be initialized");
2178
2179 hsa_queue_t *command_q = (aq ? aq->hsa_queue : kernel->agent->sync_queue);
2180
2181 uint64_t index
2182 = hsa_fns.hsa_queue_add_write_index_release_fn (command_q, 1);
2183 GCN_DEBUG ("Got AQL index %llu\n", (long long int) index);
2184
2185 /* Wait until the queue is not full before writing the packet. */
2186 while (index - hsa_fns.hsa_queue_load_read_index_acquire_fn (command_q)
2187 >= command_q->size)
2188 ;
2189
2190 /* Do not allow the dimensions to be overridden when running
2191 constructors or destructors. */
2192 int override_x = kernel->kind == KIND_UNKNOWN ? 0 : override_x_dim;
2193 int override_z = kernel->kind == KIND_UNKNOWN ? 0 : override_z_dim;
2194
2195 hsa_kernel_dispatch_packet_t *packet;
2196 packet = ((hsa_kernel_dispatch_packet_t *) command_q->base_address)
2197 + index % command_q->size;
2198
2199 memset (((uint8_t *) packet) + 4, 0, sizeof (*packet) - 4);
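/* The first 4 bytes hold the packet header and setup fields; they are
   deliberately left unwritten here and stored last, with release
   semantics, by packet_store_release below -- that store is what makes
   the packet visible to the packet processor.  */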
2200 packet->grid_size_x = override_x ? : kla->gdims[0];
2201 packet->workgroup_size_x = get_group_size (kla->ndim,
2202 packet->grid_size_x,
2203 kla->wdims[0]);
2204
2205 if (kla->ndim >= 2)
2206 {
2207 packet->grid_size_y = kla->gdims[1];
2208 packet->workgroup_size_y = get_group_size (kla->ndim, kla->gdims[1],
2209 kla->wdims[1]);
2210 }
2211 else
2212 {
2213 packet->grid_size_y = 1;
2214 packet->workgroup_size_y = 1;
2215 }
2216
2217 if (kla->ndim == 3)
2218 {
2219 packet->grid_size_z = limit_worker_threads (override_z
2220 ? : kla->gdims[2]);
2221 packet->workgroup_size_z = get_group_size (kla->ndim,
2222 packet->grid_size_z,
2223 kla->wdims[2]);
2224 }
2225 else
2226 {
2227 packet->grid_size_z = 1;
2228 packet->workgroup_size_z = 1;
2229 }
2230
2231 GCN_DEBUG ("GCN launch actuals: grid:[%u, %u, %u],"
2232 " normalized grid:[%u, %u, %u], workgroup:[%u, %u, %u]\n",
2233 packet->grid_size_x, packet->grid_size_y, packet->grid_size_z,
2234 packet->grid_size_x / packet->workgroup_size_x,
2235 packet->grid_size_y / packet->workgroup_size_y,
2236 packet->grid_size_z / packet->workgroup_size_z,
2237 packet->workgroup_size_x, packet->workgroup_size_y,
2238 packet->workgroup_size_z);
2239
2240 struct kernel_dispatch *shadow
2241 = create_kernel_dispatch (kernel, packet->grid_size_x);
2242 shadow->queue = command_q;
2243
2244 if (debug)
2245 {
2246 fprintf (stderr, "\nKernel has following dependencies:\n");
2247 print_kernel_dispatch (shadow, 2);
2248 }
2249
2250 packet->private_segment_size = kernel->private_segment_size;
2251 packet->group_segment_size = kernel->group_segment_size;
2252 packet->kernel_object = kernel->object;
2253 packet->kernarg_address = shadow->kernarg_address;
2254 hsa_signal_t s;
2255 s.handle = shadow->signal;
2256 packet->completion_signal = s;
2257 hsa_fns.hsa_signal_store_relaxed_fn (s, 1);
2258 memcpy (shadow->kernarg_address, &vars, sizeof (vars));
2259
2260 GCN_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");
2261
2262 uint16_t header;
2263 header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
2264 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
2265 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
2266
2267 GCN_DEBUG ("Going to dispatch kernel %s on device %d\n", kernel->name,
2268 agent->device_id);
2269
2270 packet_store_release ((uint32_t *) packet, header,
2271 (uint16_t) kla->ndim
2272 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);
2273
2274 hsa_fns.hsa_signal_store_release_fn (command_q->doorbell_signal,
2275 index);
2276
2277 GCN_DEBUG ("Kernel dispatched, waiting for completion\n");
2278
2279 /* Root signal waits with 1ms timeout. */
2280 while (hsa_fns.hsa_signal_wait_acquire_fn (s, HSA_SIGNAL_CONDITION_LT, 1,
2281 1000 * 1000,
2282 HSA_WAIT_STATE_BLOCKED) != 0)
2283 {
2284 console_output (kernel, shadow->kernarg_address, false);
2285 }
2286 console_output (kernel, shadow->kernarg_address, true);
2287
2288 struct kernargs *kernargs = shadow->kernarg_address;
2289 unsigned int return_value = (unsigned int)kernargs->output_data.return_value;
2290
2291 release_kernel_dispatch (shadow);
2292
2293 if (!module_locked && pthread_rwlock_unlock (&agent->module_rwlock))
2294 GOMP_PLUGIN_fatal ("Unable to unlock a GCN agent rwlock");
2295
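/* The upper 16 bits of the return value identify how the kernel ended:
   0xcafe means the initial value was never overwritten (no exit call);
   0xffff means exit was called, in which case bits 8-15 carry a signal
   number and bits 0-7 the exit code.  */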
2296 unsigned int upper = (return_value & ~0xffff) >> 16;
2297 if (upper == 0xcafe)
2298 ; /* exit not called, normal termination.  */
2299 else if (upper == 0xffff)
2300 ; /* exit called.  */
2301 else
2302 {
2303 GOMP_PLUGIN_error ("Possible kernel exit value corruption, 2 most"
2304 " significant bytes aren't 0xffff or 0xcafe: 0x%x\n",
2305 return_value);
2306 abort ();
2307 }
2308
2309 if (upper == 0xffff)
2310 {
2311 unsigned int signal = (return_value >> 8) & 0xff;
2312
2313 if (signal == SIGABRT)
2314 {
2315 GCN_WARNING ("GCN Kernel aborted\n");
2316 abort ();
2317 }
2318 else if (signal != 0)
2319 {
2320 GCN_WARNING ("GCN Kernel received unknown signal\n");
2321 abort ();
2322 }
2323
2324 GCN_DEBUG ("GCN Kernel exited with value: %d\n", return_value & 0xff);
2325 exit (return_value & 0xff);
2326 }
2327 }
2328
2329 /* }}} */
2330 /* {{{ Load/Unload */
2331
2332 /* Initialize KERNEL from D and other parameters. Return true on success. */
2333
2334 static bool
2335 init_basic_kernel_info (struct kernel_info *kernel,
2336 struct hsa_kernel_description *d,
2337 struct agent_info *agent,
2338 struct module_info *module)
2339 {
2340 kernel->agent = agent;
2341 kernel->module = module;
2342 kernel->name = d->name;
2343 kernel->description = d;
2344 if (pthread_mutex_init (&kernel->init_mutex, NULL))
2345 {
2346 GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");
2347 return false;
2348 }
2349 return true;
2350 }
2351
2352 /* Check that the GCN ISA of the given image matches the ISA of the agent. */
2353
2354 static bool
2355 isa_matches_agent (struct agent_info *agent, Elf64_Ehdr *image)
2356 {
2357 int isa_field = elf_gcn_isa_field (image);
2358 const char* isa_s = isa_hsa_name (isa_field);
2359 if (!isa_s)
2360 {
2361 hsa_error ("Unsupported ISA in GCN code object.", HSA_STATUS_ERROR);
2362 return false;
2363 }
2364
2365 if (isa_field != agent->device_isa)
2366 {
2367 char msg[120];
2368 const char *agent_isa_s = isa_hsa_name (agent->device_isa);
2369 const char *agent_isa_gcc_s = isa_gcc_name (agent->device_isa);
2370 assert (agent_isa_s);
2371 assert (agent_isa_gcc_s);
2372
2373 snprintf (msg, sizeof msg,
2374 "GCN code object ISA '%s' does not match GPU ISA '%s'.\n"
2375 "Try to recompile with '-foffload-options=-march=%s'.\n",
2376 isa_s, agent_isa_s, agent_isa_gcc_s);
2377
2378 hsa_error (msg, HSA_STATUS_ERROR);
2379 return false;
2380 }
2381
2382 return true;
2383 }
2384
2385 /* Create and finalize the program consisting of all loaded modules. */
2386
2387 static bool
2388 create_and_finalize_hsa_program (struct agent_info *agent)
2389 {
2390 hsa_status_t status;
2391 bool res = true;
2392 if (pthread_mutex_lock (&agent->prog_mutex))
2393 {
2394 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
2395 return false;
2396 }
2397 if (agent->prog_finalized)
2398 goto final;
2399
2400 status
2401 = hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
2402 HSA_EXECUTABLE_STATE_UNFROZEN,
2403 "", &agent->executable);
2404 if (status != HSA_STATUS_SUCCESS)
2405 {
2406 hsa_error ("Could not create GCN executable", status);
2407 goto fail;
2408 }
2409
2410 /* Load any GCN modules. */
2411 struct module_info *module = agent->module;
2412 if (module)
2413 {
2414 Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;
2415
2416 if (!isa_matches_agent (agent, image))
2417 goto fail;
2418
2419 hsa_code_object_t co = { 0 };
2420 status = hsa_fns.hsa_code_object_deserialize_fn
2421 (module->image_desc->gcn_image->image,
2422 module->image_desc->gcn_image->size,
2423 NULL, &co);
2424 if (status != HSA_STATUS_SUCCESS)
2425 {
2426 hsa_error ("Could not deserialize GCN code object", status);
2427 goto fail;
2428 }
2429
2430 status = hsa_fns.hsa_executable_load_code_object_fn
2431 (agent->executable, agent->id, co, "");
2432 if (status != HSA_STATUS_SUCCESS)
2433 {
2434 hsa_error ("Could not load GCN code object", status);
2435 goto fail;
2436 }
2437
2438 if (!module->heap)
2439 {
2440 status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
2441 gcn_kernel_heap_size,
2442 (void**)&module->heap);
2443 if (status != HSA_STATUS_SUCCESS)
2444 {
2445 hsa_error ("Could not allocate memory for GCN heap", status);
2446 goto fail;
2447 }
2448
2449 status = hsa_fns.hsa_memory_assign_agent_fn
2450 (module->heap, agent->id, HSA_ACCESS_PERMISSION_RW);
2451 if (status != HSA_STATUS_SUCCESS)
2452 {
2453 hsa_error ("Could not assign GCN heap memory to device", status);
2454 goto fail;
2455 }
2456
2457 hsa_fns.hsa_memory_copy_fn (&module->heap->size,
2458 &gcn_kernel_heap_size,
2459 sizeof (gcn_kernel_heap_size));
2460 }
2461
2462 }
2463
2464 if (debug)
2465 dump_executable_symbols (agent->executable);
2466
2467 status = hsa_fns.hsa_executable_freeze_fn (agent->executable, "");
2468 if (status != HSA_STATUS_SUCCESS)
2469 {
2470 hsa_error ("Could not freeze the GCN executable", status);
2471 goto fail;
2472 }
2473
2474 final:
2475 agent->prog_finalized = true;
2476
2477 if (pthread_mutex_unlock (&agent->prog_mutex))
2478 {
2479 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
2480 res = false;
2481 }
2482
2483 return res;
2484
2485 fail:
2486 res = false;
2487 goto final;
2488 }
2489
2490 /* Free the HSA program in agent and everything associated with it and set
2491 agent->prog_finalized and the initialized flags of all kernels to false.
2492 Return TRUE on success. */
2493
2494 static bool
2495 destroy_hsa_program (struct agent_info *agent)
2496 {
2497 if (!agent->prog_finalized)
2498 return true;
2499
2500 hsa_status_t status;
2501
2502 GCN_DEBUG ("Destroying the current GCN program.\n");
2503
2504 status = hsa_fns.hsa_executable_destroy_fn (agent->executable);
2505 if (status != HSA_STATUS_SUCCESS)
2506 return hsa_error ("Could not destroy GCN executable", status);
2507
2508 if (agent->module)
2509 {
2510 int i;
2511 for (i = 0; i < agent->module->kernel_count; i++)
2512 agent->module->kernels[i].initialized = false;
2513
2514 if (agent->module->heap)
2515 {
2516 hsa_fns.hsa_memory_free_fn (agent->module->heap);
2517 agent->module->heap = NULL;
2518 }
2519 }
2520 agent->prog_finalized = false;
2521 return true;
2522 }
2523
2524 /* Deinitialize all information associated with MODULE and kernels within
2525 it. Return TRUE on success. */
2526
2527 static bool
2528 destroy_module (struct module_info *module, bool locked)
2529 {
2530 /* Run destructors before destroying module. */
2531 struct GOMP_kernel_launch_attributes kla =
2532 { 3,
2533 /* Grid size. */
2534 { 1, 64, 1 },
2535 /* Work-group size. */
2536 { 1, 64, 1 }
2537 };
2538
2539 if (module->fini_array_func)
2540 {
2541 init_kernel (module->fini_array_func);
2542 run_kernel (module->fini_array_func, NULL, &kla, NULL, locked);
2543 }
2544 module->constructors_run_p = false;
2545
2546 int i;
2547 for (i = 0; i < module->kernel_count; i++)
2548 if (pthread_mutex_destroy (&module->kernels[i].init_mutex))
2549 {
2550 GOMP_PLUGIN_error ("Failed to destroy a GCN kernel initialization "
2551 "mutex");
2552 return false;
2553 }
2554
2555 return true;
2556 }
2557
2558 /* }}} */
2559 /* {{{ Async */
2560
2561 /* Execute one entry from an async queue, according to its type. */
2562
2563 static void
2564 execute_queue_entry (struct goacc_asyncqueue *aq, int index)
2565 {
2566 struct queue_entry *entry = &aq->queue[index];
2567
2568 switch (entry->type)
2569 {
2570 case KERNEL_LAUNCH:
2571 if (DEBUG_QUEUES)
2572 GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d)\n",
2573 aq->agent->device_id, aq->id, index);
2574 run_kernel (entry->u.launch.kernel,
2575 entry->u.launch.vars,
2576 &entry->u.launch.kla, aq, false);
2577 if (DEBUG_QUEUES)
2578 GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d) done\n",
2579 aq->agent->device_id, aq->id, index);
2580 break;
2581
2582 case CALLBACK:
2583 if (DEBUG_QUEUES)
2584 GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d)\n",
2585 aq->agent->device_id, aq->id, index);
2586 entry->u.callback.fn (entry->u.callback.data);
2587 if (DEBUG_QUEUES)
2588 GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d) done\n",
2589 aq->agent->device_id, aq->id, index);
2590 break;
2591
2592 case ASYNC_WAIT:
2593 {
2594 /* FIXME: is it safe to access a placeholder that may already have
2595 been executed? */
2596 struct placeholder *placeholderp = entry->u.asyncwait.placeholderp;
2597
2598 if (DEBUG_QUEUES)
2599 GCN_DEBUG ("Async thread %d:%d: Executing async wait entry (%d)\n",
2600 aq->agent->device_id, aq->id, index);
2601
2602 pthread_mutex_lock (&placeholderp->mutex);
2603
2604 while (!placeholderp->executed)
2605 pthread_cond_wait (&placeholderp->cond, &placeholderp->mutex);
2606
2607 pthread_mutex_unlock (&placeholderp->mutex);
2608
2609 if (pthread_cond_destroy (&placeholderp->cond))
2610 GOMP_PLUGIN_error ("Failed to destroy serialization cond");
2611
2612 if (pthread_mutex_destroy (&placeholderp->mutex))
2613 GOMP_PLUGIN_error ("Failed to destroy serialization mutex");
2614
2615 if (DEBUG_QUEUES)
2616 GCN_DEBUG ("Async thread %d:%d: Executing async wait "
2617 "entry (%d) done\n", aq->agent->device_id, aq->id, index);
2618 }
2619 break;
2620
2621 case ASYNC_PLACEHOLDER:
2622 pthread_mutex_lock (&entry->u.placeholder.mutex);
2623 entry->u.placeholder.executed = 1;
2624 pthread_cond_signal (&entry->u.placeholder.cond);
2625 pthread_mutex_unlock (&entry->u.placeholder.mutex);
2626 break;
2627
2628 default:
2629 GOMP_PLUGIN_fatal ("Unknown queue element");
2630 }
2631 }
2632
2633 /* This function is run as a thread to service an async queue in the
2634 background. It runs continuously until the stop flag is set. */
2635
2636 static void *
2637 drain_queue (void *thread_arg)
2638 {
2639 struct goacc_asyncqueue *aq = thread_arg;
2640
2641 if (DRAIN_QUEUE_SYNCHRONOUS_P)
2642 {
2643 aq->drain_queue_stop = 2;
2644 return NULL;
2645 }
2646
2647 pthread_mutex_lock (&aq->mutex);
2648
2649 while (true)
2650 {
2651 if (aq->drain_queue_stop)
2652 break;
2653
2654 if (aq->queue_n > 0)
2655 {
2656 pthread_mutex_unlock (&aq->mutex);
2657 execute_queue_entry (aq, aq->queue_first);
2658
2659 pthread_mutex_lock (&aq->mutex);
2660 aq->queue_first = ((aq->queue_first + 1)
2661 % ASYNC_QUEUE_SIZE);
2662 aq->queue_n--;
2663
2664 if (DEBUG_THREAD_SIGNAL)
2665 GCN_DEBUG ("Async thread %d:%d: broadcasting queue out update\n",
2666 aq->agent->device_id, aq->id);
2667 pthread_cond_broadcast (&aq->queue_cond_out);
2668 pthread_mutex_unlock (&aq->mutex);
2669
2670 if (DEBUG_QUEUES)
2671 GCN_DEBUG ("Async thread %d:%d: continue\n", aq->agent->device_id,
2672 aq->id);
2673 pthread_mutex_lock (&aq->mutex);
2674 }
2675 else
2676 {
2677 if (DEBUG_THREAD_SLEEP)
2678 GCN_DEBUG ("Async thread %d:%d: going to sleep\n",
2679 aq->agent->device_id, aq->id);
2680 pthread_cond_wait (&aq->queue_cond_in, &aq->mutex);
2681 if (DEBUG_THREAD_SLEEP)
2682 GCN_DEBUG ("Async thread %d:%d: woke up, rechecking\n",
2683 aq->agent->device_id, aq->id);
2684 }
2685 }
2686
2687 aq->drain_queue_stop = 2;
2688 if (DEBUG_THREAD_SIGNAL)
2689 GCN_DEBUG ("Async thread %d:%d: broadcasting last queue out update\n",
2690 aq->agent->device_id, aq->id);
2691 pthread_cond_broadcast (&aq->queue_cond_out);
2692 pthread_mutex_unlock (&aq->mutex);
2693
2694 GCN_DEBUG ("Async thread %d:%d: returning\n", aq->agent->device_id, aq->id);
2695 return NULL;
2696 }
2697
2698 /* This function is used only when DRAIN_QUEUE_SYNCHRONOUS_P is set, which
2699 is not usually the case. This is just a debug tool. */
2700
2701 static void
2702 drain_queue_synchronous (struct goacc_asyncqueue *aq)
2703 {
2704 pthread_mutex_lock (&aq->mutex);
2705
2706 while (aq->queue_n > 0)
2707 {
2708 execute_queue_entry (aq, aq->queue_first);
2709
2710 aq->queue_first = ((aq->queue_first + 1)
2711 % ASYNC_QUEUE_SIZE);
2712 aq->queue_n--;
2713 }
2714
2715 pthread_mutex_unlock (&aq->mutex);
2716 }
2717
2718 /* Block the current thread until an async queue is writable. The aq->mutex
2719 lock should be held on entry, and remains locked on exit. */
2720
2721 static void
2722 wait_for_queue_nonfull (struct goacc_asyncqueue *aq)
2723 {
2724 if (aq->queue_n == ASYNC_QUEUE_SIZE)
2725 {
2726 /* Queue is full. Wait for it to not be full. */
2727 while (aq->queue_n == ASYNC_QUEUE_SIZE)
2728 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
2729 }
2730 }
2731
2732 /* Request an asynchronous kernel launch on the specified queue. This
2733 may block if the queue is full, but returns without waiting for the
2734 kernel to run. */
2735
2736 static void
2737 queue_push_launch (struct goacc_asyncqueue *aq, struct kernel_info *kernel,
2738 void *vars, struct GOMP_kernel_launch_attributes *kla)
2739 {
2740 assert (aq->agent == kernel->agent);
2741
2742 pthread_mutex_lock (&aq->mutex);
2743
2744 wait_for_queue_nonfull (aq);
2745
2746 int queue_last = ((aq->queue_first + aq->queue_n)
2747 % ASYNC_QUEUE_SIZE);
2748 if (DEBUG_QUEUES)
2749 GCN_DEBUG ("queue_push_launch %d:%d: at %i\n", aq->agent->device_id,
2750 aq->id, queue_last);
2751
2752 aq->queue[queue_last].type = KERNEL_LAUNCH;
2753 aq->queue[queue_last].u.launch.kernel = kernel;
2754 aq->queue[queue_last].u.launch.vars = vars;
2755 aq->queue[queue_last].u.launch.kla = *kla;
2756
2757 aq->queue_n++;
2758
2759 if (DEBUG_THREAD_SIGNAL)
2760 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2761 aq->agent->device_id, aq->id);
2762 pthread_cond_signal (&aq->queue_cond_in);
2763
2764 pthread_mutex_unlock (&aq->mutex);
2765 }
2766
2767 /* Request an asynchronous callback on the specified queue. The callback
2768 function will be called, with the given opaque data, from the appropriate
2769 async thread, when all previous items on that queue are complete. */
2770
2771 static void
2772 queue_push_callback (struct goacc_asyncqueue *aq, void (*fn)(void *),
2773 void *data)
2774 {
2775 pthread_mutex_lock (&aq->mutex);
2776
2777 wait_for_queue_nonfull (aq);
2778
2779 int queue_last = ((aq->queue_first + aq->queue_n)
2780 % ASYNC_QUEUE_SIZE);
2781 if (DEBUG_QUEUES)
2782 GCN_DEBUG ("queue_push_callback %d:%d: at %i\n", aq->agent->device_id,
2783 aq->id, queue_last);
2784
2785 aq->queue[queue_last].type = CALLBACK;
2786 aq->queue[queue_last].u.callback.fn = fn;
2787 aq->queue[queue_last].u.callback.data = data;
2788
2789 aq->queue_n++;
2790
2791 if (DEBUG_THREAD_SIGNAL)
2792 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2793 aq->agent->device_id, aq->id);
2794 pthread_cond_signal (&aq->queue_cond_in);
2795
2796 pthread_mutex_unlock (&aq->mutex);
2797 }
2798
2799 /* Request that a given async thread wait for another thread (unspecified) to
2800 reach the given placeholder. The wait will occur when all previous entries
2801 on the queue are complete. A placeholder is effectively a kind of signal
2802 which simply sets a flag when encountered in a queue. */
2803
2804 static void
2805 queue_push_asyncwait (struct goacc_asyncqueue *aq,
2806 struct placeholder *placeholderp)
2807 {
2808 pthread_mutex_lock (&aq->mutex);
2809
2810 wait_for_queue_nonfull (aq);
2811
2812 int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2813 if (DEBUG_QUEUES)
2814 GCN_DEBUG ("queue_push_asyncwait %d:%d: at %i\n", aq->agent->device_id,
2815 aq->id, queue_last);
2816
2817 aq->queue[queue_last].type = ASYNC_WAIT;
2818 aq->queue[queue_last].u.asyncwait.placeholderp = placeholderp;
2819
2820 aq->queue_n++;
2821
2822 if (DEBUG_THREAD_SIGNAL)
2823 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2824 aq->agent->device_id, aq->id);
2825 pthread_cond_signal (&aq->queue_cond_in);
2826
2827 pthread_mutex_unlock (&aq->mutex);
2828 }
2829
2830 /* Add a placeholder into an async queue. When the async thread reaches the
2831 placeholder it will set the "executed" flag to true and continue.
2832 Another thread may be waiting on this thread reaching the placeholder. */
2833
2834 static struct placeholder *
2835 queue_push_placeholder (struct goacc_asyncqueue *aq)
2836 {
2837 struct placeholder *placeholderp;
2838
2839 pthread_mutex_lock (&aq->mutex);
2840
2841 wait_for_queue_nonfull (aq);
2842
2843 int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2844 if (DEBUG_QUEUES)
2845 GCN_DEBUG ("queue_push_placeholder %d:%d: at %i\n", aq->agent->device_id,
2846 aq->id, queue_last);
2847
2848 aq->queue[queue_last].type = ASYNC_PLACEHOLDER;
2849 placeholderp = &aq->queue[queue_last].u.placeholder;
2850
2851 if (pthread_mutex_init (&placeholderp->mutex, NULL))
2852 {
2853 pthread_mutex_unlock (&aq->mutex);
2854 GOMP_PLUGIN_fatal ("Failed to initialize serialization mutex");
2855 }
2856
2857 if (pthread_cond_init (&placeholderp->cond, NULL))
2858 {
2859 pthread_mutex_unlock (&aq->mutex);
2860 GOMP_PLUGIN_fatal ("Failed to initialize serialization cond");
2861 }
2862
2863 placeholderp->executed = 0;
2864
2865 aq->queue_n++;
2866
2867 if (DEBUG_THREAD_SIGNAL)
2868 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2869 aq->agent->device_id, aq->id);
2870 pthread_cond_signal (&aq->queue_cond_in);
2871
2872 pthread_mutex_unlock (&aq->mutex);
2873
2874 return placeholderp;
2875 }
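
/* A sketch of how the placeholder and async-wait entries combine to
   serialize one queue against another; "aq1" and "aq2" stand in for two
   queues on the same agent.  */
#if 0
struct placeholder *p = queue_push_placeholder (aq1);
queue_push_asyncwait (aq2, p);
/* aq2 now executes nothing past this point until aq1's async thread
   has reached and signalled the placeholder.  */
#endif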
2876
2877 /* Signal an asynchronous thread to terminate, and wait for it to do so. */
2878
2879 static void
2880 finalize_async_thread (struct goacc_asyncqueue *aq)
2881 {
2882 pthread_mutex_lock (&aq->mutex);
2883 if (aq->drain_queue_stop == 2)
2884 {
2885 pthread_mutex_unlock (&aq->mutex);
2886 return;
2887 }
2888
2889 aq->drain_queue_stop = 1;
2890
2891 if (DEBUG_THREAD_SIGNAL)
2892 GCN_DEBUG ("Signalling async thread %d:%d: cond_in\n",
2893 aq->agent->device_id, aq->id);
2894 pthread_cond_signal (&aq->queue_cond_in);
2895
2896 while (aq->drain_queue_stop != 2)
2897 {
2898 if (DEBUG_THREAD_SLEEP)
2899 GCN_DEBUG ("Waiting for async thread %d:%d to finish, putting thread"
2900 " to sleep\n", aq->agent->device_id, aq->id);
2901 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
2902 if (DEBUG_THREAD_SLEEP)
2903 GCN_DEBUG ("Waiting, woke up thread %d:%d. Rechecking\n",
2904 aq->agent->device_id, aq->id);
2905 }
2906
2907 GCN_DEBUG ("Done waiting for async thread %d:%d\n", aq->agent->device_id,
2908 aq->id);
2909 pthread_mutex_unlock (&aq->mutex);
2910
2911 int err = pthread_join (aq->thread_drain_queue, NULL);
2912 if (err != 0)
2913 GOMP_PLUGIN_fatal ("Join async thread %d:%d: failed: %s",
2914 aq->agent->device_id, aq->id, strerror (err));
2915 GCN_DEBUG ("Joined with async thread %d:%d\n", aq->agent->device_id, aq->id);
2916 }
2917
2918 /* Set up an async queue for OpenMP. There will be only one. The
2919 implementation simply uses an OpenACC async queue.
2920 FIXME: is this thread-safe if two threads call this function? */
2921
2922 static void
2923 maybe_init_omp_async (struct agent_info *agent)
2924 {
2925 if (!agent->omp_async_queue)
2926 agent->omp_async_queue
2927 = GOMP_OFFLOAD_openacc_async_construct (agent->device_id);
2928 }
2929
2930 /* A wrapper that works around an issue in the HSA runtime with host-to-device
2931 copies from read-only pages. */
2932
2933 static void
2934 hsa_memory_copy_wrapper (void *dst, const void *src, size_t len)
2935 {
2936 hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, len);
2937
2938 if (status == HSA_STATUS_SUCCESS)
2939 return;
2940
2941 /* It appears that the copy fails if the source data is in a read-only page.
2942 We can't detect that easily, so try copying the data to a temporary buffer
2943 and doing the copy again if we got an error above. */
2944
2945 GCN_WARNING ("Read-only data transfer bug workaround triggered for "
2946 "[%p:+%d]\n", (void *) src, (int) len);
2947
2948 void *src_copy = GOMP_PLUGIN_malloc (len);
2949 memcpy (src_copy, src, len);
2950 status = hsa_fns.hsa_memory_copy_fn (dst, (const void *) src_copy, len);
2951 free (src_copy);
2952 if (status != HSA_STATUS_SUCCESS)
2953 GOMP_PLUGIN_error ("memory copy failed");
2954 }
2955
2956 /* Copy data to or from a device. This is intended for use as an async
2957 callback event. */
2958
2959 static void
2960 copy_data (void *data_)
2961 {
2962 struct copy_data *data = (struct copy_data *)data_;
2963 GCN_DEBUG ("Async thread %d:%d: Copying %zu bytes from (%p) to (%p)\n",
2964 data->aq->agent->device_id, data->aq->id, data->len, data->src,
2965 data->dst);
2966 hsa_memory_copy_wrapper (data->dst, data->src, data->len);
2967 free (data);
2968 }
2969
2970 /* Free device data. This is intended for use as an async callback event. */
2971
2972 static void
2973 gomp_offload_free (void *ptr)
2974 {
2975 GCN_DEBUG ("Async thread ?:?: Freeing %p\n", ptr);
2976 GOMP_OFFLOAD_free (0, ptr);
2977 }
2978
2979 /* Request an asynchronous data copy, to or from a device, on a given queue.
2980 The event will be registered as a callback. */
2981
2982 static void
2983 queue_push_copy (struct goacc_asyncqueue *aq, void *dst, const void *src,
2984 size_t len)
2985 {
2986 if (DEBUG_QUEUES)
2987 GCN_DEBUG ("queue_push_copy %d:%d: %zu bytes from (%p) to (%p)\n",
2988 aq->agent->device_id, aq->id, len, src, dst);
2989 struct copy_data *data
2990 = (struct copy_data *)GOMP_PLUGIN_malloc (sizeof (struct copy_data));
2991 data->dst = dst;
2992 data->src = src;
2993 data->len = len;
2994 data->aq = aq;
2995 queue_push_callback (aq, copy_data, data);
2996 }
2997
2998 /* Return true if the given queue is currently empty. */
2999
3000 static int
3001 queue_empty (struct goacc_asyncqueue *aq)
3002 {
3003 pthread_mutex_lock (&aq->mutex);
3004 int res = aq->queue_n == 0 ? 1 : 0;
3005 pthread_mutex_unlock (&aq->mutex);
3006
3007 return res;
3008 }
3009
3010 /* Wait for a given queue to become empty. This implements an OpenACC wait
3011 directive. */
3012
3013 static void
3014 wait_queue (struct goacc_asyncqueue *aq)
3015 {
3016 if (DRAIN_QUEUE_SYNCHRONOUS_P)
3017 {
3018 drain_queue_synchronous (aq);
3019 return;
3020 }
3021
3022 pthread_mutex_lock (&aq->mutex);
3023
3024 while (aq->queue_n > 0)
3025 {
3026 if (DEBUG_THREAD_SLEEP)
3027 GCN_DEBUG ("waiting for thread %d:%d, putting thread to sleep\n",
3028 aq->agent->device_id, aq->id);
3029 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
3030 if (DEBUG_THREAD_SLEEP)
3031 GCN_DEBUG ("thread %d:%d woke up. Rechecking\n", aq->agent->device_id,
3032 aq->id);
3033 }
3034
3035 pthread_mutex_unlock (&aq->mutex);
3036 GCN_DEBUG ("waiting for thread %d:%d, done\n", aq->agent->device_id, aq->id);
3037 }
3038
3039 /* }}} */
3040 /* {{{ OpenACC support */
3041
3042 /* Execute an OpenACC kernel, synchronously or asynchronously. */
3043
3044 static void
3045 gcn_exec (struct kernel_info *kernel, size_t mapnum, void **hostaddrs,
3046 void **devaddrs, unsigned *dims, void *targ_mem_desc, bool async,
3047 struct goacc_asyncqueue *aq)
3048 {
3049 if (!GOMP_OFFLOAD_can_run (kernel))
3050 GOMP_PLUGIN_fatal ("OpenACC host fallback unimplemented.");
3051
3052 /* If we get here then this must be an OpenACC kernel. */
3053 kernel->kind = KIND_OPENACC;
3054
3055 /* devaddrs must be double-indirect on the target. */
3056 void **ind_da = alloc_by_agent (kernel->agent, sizeof (void*) * mapnum);
3057 for (size_t i = 0; i < mapnum; i++)
3058 hsa_fns.hsa_memory_copy_fn (&ind_da[i],
3059 devaddrs[i] ? &devaddrs[i] : &hostaddrs[i],
3060 sizeof (void *));
3061
3062 struct hsa_kernel_description *hsa_kernel_desc = NULL;
3063 for (unsigned i = 0; i < kernel->module->image_desc->kernel_count; i++)
3064 {
3065 struct hsa_kernel_description *d
3066 = &kernel->module->image_desc->kernel_infos[i];
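/* Pointer comparison is intentional: kernel->name was set from this
   same kernel_infos[] array in init_basic_kernel_info, so the matching
   descriptor shares the pointer.  */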
3067 if (d->name == kernel->name)
3068 {
3069 hsa_kernel_desc = d;
3070 break;
3071 }
3072 }
3073
3074 /* We may have statically-determined dimensions in
3075 hsa_kernel_desc->oacc_dims[] or dimensions passed to this offload kernel
3076 invocation at runtime in dims[]. We allow static dimensions to take
3077 priority over dynamic dimensions when present (non-zero). */
3078 if (hsa_kernel_desc->oacc_dims[0] > 0)
3079 dims[0] = hsa_kernel_desc->oacc_dims[0];
3080 if (hsa_kernel_desc->oacc_dims[1] > 0)
3081 dims[1] = hsa_kernel_desc->oacc_dims[1];
3082 if (hsa_kernel_desc->oacc_dims[2] > 0)
3083 dims[2] = hsa_kernel_desc->oacc_dims[2];
3084
3085 /* Ideally, when a dimension isn't explicitly specified, we should
3086 tune it to run 40 (or 32?) threads per CU with no threads getting queued.
3087 In practice, we tune for peak performance on BabelStream, which
3088 for OpenACC is currently 32 threads per CU. */
3089 if (dims[0] == 0 && dims[1] == 0)
3090 {
3091 /* If any of the OpenACC dimensions remain 0 then we get to pick a
3092 number. There isn't really a correct answer for this without a clue
3093 about the problem size, so let's do a reasonable number of workers
3094 and gangs. */
3095
3096 dims[0] = get_cu_count (kernel->agent) * 4; /* Gangs. */
3097 dims[1] = 8; /* Workers. */
3098 }
3099 else if (dims[0] == 0 && dims[1] > 0)
3100 {
3101 /* Auto-scale the number of gangs with the requested number of workers. */
3102 dims[0] = get_cu_count (kernel->agent) * (32 / dims[1]);
3103 }
3104 else if (dims[0] > 0 && dims[1] == 0)
3105 {
3106 /* Auto-scale the number of workers with the requested number of gangs. */
3107 dims[1] = get_cu_count (kernel->agent) * 32 / dims[0];
3108 if (dims[1] == 0)
3109 dims[1] = 1;
3110 if (dims[1] > 16)
3111 dims[1] = 16;
3112 }
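
/* Worked example, assuming a hypothetical 40-CU device: with no
   dimensions given we launch 160 gangs of 8 workers; with 16 workers
   requested we launch 40 * (32/16) = 80 gangs; with 64 gangs requested
   we compute 40 * 32 / 64 = 20 workers and clamp that to 16.  */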
3113
3114 /* The incoming dimensions are expressed in terms of gangs, workers, and
3115 vectors. The HSA dimensions are expressed in terms of "work-items",
3116 which means multiples of vector lanes.
3117
3118 The "grid size" specifies the size of the problem space, and the
3119 "work-group size" specifies how much of that we want a single compute
3120 unit to chew on at once.
3121
3122 The three dimensions do not really correspond to hardware, but the
3123 important thing is that the HSA runtime will launch as many
3124 work-groups as it takes to process the entire grid, and each
3125 work-group will contain as many wave-fronts as it takes to process
3126 the work-items in that group.
3127
3128 Essentially, as long as we set the Y dimension to 64 (the number of
3129 vector lanes in hardware), and the Z group size to the maximum (16),
3130 then we will get the gangs (X) and workers (Z) launched as we expect.
3131
3132 The reason for the apparent reversal of vector and worker dimension
3133 order is to do with the way the run-time distributes work-items across
3134 v1 and v2. */
3135 struct GOMP_kernel_launch_attributes kla =
3136 {3,
3137 /* Grid size. */
3138 {dims[0], 64, dims[1]},
3139 /* Work-group size. */
3140 {1, 64, 16}
3141 };
3142
3143 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
3144 acc_prof_info *prof_info = thr->prof_info;
3145 acc_event_info enqueue_launch_event_info;
3146 acc_api_info *api_info = thr->api_info;
3147 bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
3148 if (profiling_dispatch_p)
3149 {
3150 prof_info->event_type = acc_ev_enqueue_launch_start;
3151
3152 enqueue_launch_event_info.launch_event.event_type
3153 = prof_info->event_type;
3154 enqueue_launch_event_info.launch_event.valid_bytes
3155 = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
3156 enqueue_launch_event_info.launch_event.parent_construct
3157 = acc_construct_parallel;
3158 enqueue_launch_event_info.launch_event.implicit = 1;
3159 enqueue_launch_event_info.launch_event.tool_info = NULL;
3160 enqueue_launch_event_info.launch_event.kernel_name
3161 = (char *) kernel->name;
3162 enqueue_launch_event_info.launch_event.num_gangs = kla.gdims[0];
3163 enqueue_launch_event_info.launch_event.num_workers = kla.gdims[2];
3164 enqueue_launch_event_info.launch_event.vector_length = kla.gdims[1];
3165
3166 api_info->device_api = acc_device_api_other;
3167
3168 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3169 &enqueue_launch_event_info, api_info);
3170 }
3171
3172 if (!async)
3173 {
3174 run_kernel (kernel, ind_da, &kla, NULL, false);
3175 gomp_offload_free (ind_da);
3176 }
3177 else
3178 {
3179 queue_push_launch (aq, kernel, ind_da, &kla);
3180 if (DEBUG_QUEUES)
3181 GCN_DEBUG ("queue_push_callback %d:%d gomp_offload_free, %p\n",
3182 aq->agent->device_id, aq->id, ind_da);
3183 queue_push_callback (aq, gomp_offload_free, ind_da);
3184 }
3185
3186 if (profiling_dispatch_p)
3187 {
3188 prof_info->event_type = acc_ev_enqueue_launch_end;
3189 enqueue_launch_event_info.launch_event.event_type = prof_info->event_type;
3190 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3191 &enqueue_launch_event_info,
3192 api_info);
3193 }
3194 }
3195
3196 /* }}} */
3197 /* {{{ Generic Plugin API */
3198
3199 /* Return the name of the accelerator, which is "gcn". */
3200
3201 const char *
3202 GOMP_OFFLOAD_get_name (void)
3203 {
3204 return "gcn";
3205 }
3206
3207 /* Return the specific capabilities the HSA accelerator has. */
3208
3209 unsigned int
3210 GOMP_OFFLOAD_get_caps (void)
3211 {
3212 /* FIXME: Enable shared memory for APU, but not discrete GPU. */
3213 return /*GOMP_OFFLOAD_CAP_SHARED_MEM |*/ GOMP_OFFLOAD_CAP_OPENMP_400
3214 | GOMP_OFFLOAD_CAP_OPENACC_200;
3215 }
3216
3217 /* Identify as GCN accelerator. */
3218
3219 int
3220 GOMP_OFFLOAD_get_type (void)
3221 {
3222 return OFFLOAD_TARGET_TYPE_GCN;
3223 }
3224
3225 /* Return the libgomp version number we're compatible with. There is
3226 no requirement for cross-version compatibility. */
3227
3228 unsigned
3229 GOMP_OFFLOAD_version (void)
3230 {
3231 return GOMP_VERSION;
3232 }
3233
3234 /* Return the number of GCN devices on the system. */
3235
3236 int
3237 GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask)
3238 {
3239 if (!init_hsa_context ())
3240 return 0;
3241 /* Return -1 if the omp_requires_mask cannot be fulfilled, but
3242 devices were present. */
3243 if (hsa_context.agent_count > 0 && omp_requires_mask != 0)
3244 return -1;
3245 return hsa_context.agent_count;
3246 }
3247
3248 /* Initialize device (agent) number N so that it can be used for computation.
3249 Return TRUE on success. */
3250
3251 bool
3252 GOMP_OFFLOAD_init_device (int n)
3253 {
3254 if (!init_hsa_context ())
3255 return false;
3256 if (n >= hsa_context.agent_count)
3257 {
3258 GOMP_PLUGIN_error ("Request to initialize non-existent GCN device %i", n);
3259 return false;
3260 }
3261 struct agent_info *agent = &hsa_context.agents[n];
3262
3263 if (agent->initialized)
3264 return true;
3265
3266 agent->device_id = n;
3267
3268 if (pthread_rwlock_init (&agent->module_rwlock, NULL))
3269 {
3270 GOMP_PLUGIN_error ("Failed to initialize a GCN agent rwlock");
3271 return false;
3272 }
3273 if (pthread_mutex_init (&agent->prog_mutex, NULL))
3274 {
3275 GOMP_PLUGIN_error ("Failed to initialize a GCN agent program mutex");
3276 return false;
3277 }
3278 if (pthread_mutex_init (&agent->async_queues_mutex, NULL))
3279 {
3280 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
3281 return false;
3282 }
3283 if (pthread_mutex_init (&agent->team_arena_write_lock, NULL))
3284 {
3285 GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
3286 return false;
3287 }
3288 agent->async_queues = NULL;
3289 agent->omp_async_queue = NULL;
3290 agent->team_arena_list = NULL;
3291
3292 uint32_t queue_size;
3293 hsa_status_t status;
3294 status = hsa_fns.hsa_agent_get_info_fn (agent->id,
3295 HSA_AGENT_INFO_QUEUE_MAX_SIZE,
3296 &queue_size);
3297 if (status != HSA_STATUS_SUCCESS)
3298 return hsa_error ("Error requesting maximum queue size of the GCN agent",
3299 status);
3300
3301 status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_NAME,
3302 &agent->name);
3303 if (status != HSA_STATUS_SUCCESS)
3304 return hsa_error ("Error querying the name of the agent", status);
3305
3306 agent->device_isa = isa_code (agent->name);
3307 if (agent->device_isa < 0)
3308 return hsa_error ("Unknown GCN agent architecture", HSA_STATUS_ERROR);
3309
3310 status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_VENDOR_NAME,
3311 &agent->vendor_name);
3312 if (status != HSA_STATUS_SUCCESS)
3313 return hsa_error ("Error querying the vendor name of the agent", status);
3314
3315 status = hsa_fns.hsa_queue_create_fn (agent->id, queue_size,
3316 HSA_QUEUE_TYPE_MULTI,
3317 hsa_queue_callback, NULL, UINT32_MAX,
3318 UINT32_MAX, &agent->sync_queue);
3319 if (status != HSA_STATUS_SUCCESS)
3320 return hsa_error ("Error creating command queue", status);
3321
3322 agent->kernarg_region.handle = (uint64_t) -1;
3323 status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
3324 get_kernarg_memory_region,
3325 &agent->kernarg_region);
3326 if (status != HSA_STATUS_SUCCESS
3327 && status != HSA_STATUS_INFO_BREAK)
3328 hsa_error ("Scanning memory regions failed", status);
3329 if (agent->kernarg_region.handle == (uint64_t) -1)
3330 {
3331 GOMP_PLUGIN_error ("Could not find suitable memory region for kernel "
3332 "arguments");
3333 return false;
3334 }
3335 GCN_DEBUG ("Selected kernel arguments memory region:\n");
3336 dump_hsa_region (agent->kernarg_region, NULL);
3337
3338 agent->data_region.handle = (uint64_t) -1;
3339 status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
3340 get_data_memory_region,
3341 &agent->data_region);
3342 if (status != HSA_STATUS_SUCCESS
3343 && status != HSA_STATUS_INFO_BREAK)
3344 hsa_error ("Scanning memory regions failed", status);
3345 if (agent->data_region.handle == (uint64_t) -1)
3346 {
3347 GOMP_PLUGIN_error ("Could not find suitable memory region for device "
3348 "data");
3349 return false;
3350 }
3351 GCN_DEBUG ("Selected device data memory region:\n");
3352 dump_hsa_region (agent->data_region, NULL);
3353
3354 GCN_DEBUG ("GCN agent %d initialized\n", n);
3355
3356 agent->initialized = true;
3357 return true;
3358 }
3359
3360 /* Load GCN object-code module described by struct gcn_image_desc in
3361 TARGET_DATA and return references to kernel descriptors in TARGET_TABLE.
3362 If there are any constructors then run them. If not NULL, REV_FN_TABLE will
3363 contain the on-device addresses of the functions for reverse offload. To be
3364 freed by the caller. */
3365
3366 int
3367 GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
3368 struct addr_pair **target_table,
3369 uint64_t **rev_fn_table)
3370 {
3371 if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
3372 {
3373 GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3374 " (expected %u, received %u)",
3375 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
3376 return -1;
3377 }
3378
3379 struct gcn_image_desc *image_desc = (struct gcn_image_desc *) target_data;
3380 struct agent_info *agent;
3381 struct addr_pair *pair;
3382 struct module_info *module;
3383 struct kernel_info *kernel;
3384 int kernel_count = image_desc->kernel_count;
3385 unsigned var_count = image_desc->global_variable_count;
3386 /* Currently, "others" is a struct of ICVS. */
3387 int other_count = 1;
3388
3389 agent = get_agent_info (ord);
3390 if (!agent)
3391 return -1;
3392
3393 if (pthread_rwlock_wrlock (&agent->module_rwlock))
3394 {
3395 GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3396 return -1;
3397 }
3398 if (agent->prog_finalized
3399 && !destroy_hsa_program (agent))
3400 return -1;
3401
3402 GCN_DEBUG ("Encountered %d kernels in an image\n", kernel_count);
3403 GCN_DEBUG ("Encountered %u global variables in an image\n", var_count);
3404 GCN_DEBUG ("Expect %d other variables in an image\n", other_count);
3405 pair = GOMP_PLUGIN_malloc ((kernel_count + var_count + other_count - 2)
3406 * sizeof (struct addr_pair));
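/* The "- 2" assumes the image contains both an _init_array and a
   _fini_array pseudo-kernel (see below), which are not reported in the
   table.  */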
3407 *target_table = pair;
3408 module = (struct module_info *)
3409 GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info)
3410 + kernel_count * sizeof (struct kernel_info));
3411 module->image_desc = image_desc;
3412 module->kernel_count = kernel_count;
3413 module->heap = NULL;
3414 module->constructors_run_p = false;
3415
3416 kernel = &module->kernels[0];
3417
3418 /* Allocate memory for kernel dependencies. */
3419 for (unsigned i = 0; i < kernel_count; i++)
3420 {
3421 struct hsa_kernel_description *d = &image_desc->kernel_infos[i];
3422 if (!init_basic_kernel_info (kernel, d, agent, module))
3423 return -1;
3424 if (strcmp (d->name, "_init_array") == 0)
3425 module->init_array_func = kernel;
3426 else if (strcmp (d->name, "_fini_array") == 0)
3427 module->fini_array_func = kernel;
3428 else
3429 {
3430 pair->start = (uintptr_t) kernel;
3431 pair->end = (uintptr_t) (kernel + 1);
3432 pair++;
3433 }
3434 kernel++;
3435 }
3436
3437 agent->module = module;
3438 if (pthread_rwlock_unlock (&agent->module_rwlock))
3439 {
3440 GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3441 return -1;
3442 }
3443
3444 if (!create_and_finalize_hsa_program (agent))
3445 return -1;
3446
3447 if (var_count > 0)
3448 {
3449 hsa_status_t status;
3450 hsa_executable_symbol_t var_symbol;
3451 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3452 ".offload_var_table",
3453 agent->id,
3454 0, &var_symbol);
3455
3456 if (status != HSA_STATUS_SUCCESS)
3457 hsa_fatal ("Could not find symbol for variable in the code object",
3458 status);
3459
3460 uint64_t var_table_addr;
3461 status = hsa_fns.hsa_executable_symbol_get_info_fn
3462 (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3463 &var_table_addr);
3464 if (status != HSA_STATUS_SUCCESS)
3465 hsa_fatal ("Could not extract a variable from its symbol", status);
3466
3467 struct {
3468 uint64_t addr;
3469 uint64_t size;
3470 } var_table[var_count];
3471 GOMP_OFFLOAD_dev2host (agent->device_id, var_table,
3472 (void*)var_table_addr, sizeof (var_table));
3473
3474 for (unsigned i = 0; i < var_count; i++)
3475 {
3476 pair->start = var_table[i].addr;
3477 pair->end = var_table[i].addr + var_table[i].size;
3478 GCN_DEBUG ("Found variable at %p with size %lu\n",
3479 (void *)var_table[i].addr, var_table[i].size);
3480 pair++;
3481 }
3482 }
3483
3484 GCN_DEBUG ("Looking for variable %s\n", XSTRING (GOMP_ADDITIONAL_ICVS));
3485
3486 hsa_status_t status;
3487 hsa_executable_symbol_t var_symbol;
3488 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3489 XSTRING (GOMP_ADDITIONAL_ICVS),
3490 agent->id, 0, &var_symbol);
3491 if (status == HSA_STATUS_SUCCESS)
3492 {
3493 uint64_t varptr;
3494 uint32_t varsize;
3495
3496 status = hsa_fns.hsa_executable_symbol_get_info_fn
3497 (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3498 &varptr);
3499 if (status != HSA_STATUS_SUCCESS)
3500 hsa_fatal ("Could not extract a variable from its symbol", status);
3501 status = hsa_fns.hsa_executable_symbol_get_info_fn
3502 (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE,
3503 &varsize);
3504 if (status != HSA_STATUS_SUCCESS)
3505 hsa_fatal ("Could not extract a variable size from its symbol",
3506 status);
3507
3508 pair->start = varptr;
3509 pair->end = varptr + varsize;
3510 }
3511 else
3512 {
3513 /* The variable was not in this image. */
3514 GCN_DEBUG ("Variable not found in image: %s\n",
3515 XSTRING (GOMP_ADDITIONAL_ICVS));
3516 pair->start = pair->end = 0;
3517 }
3518
3519 /* Ensure that constructors are run first. */
3520 struct GOMP_kernel_launch_attributes kla =
3521 { 3,
3522 /* Grid size. */
3523 { 1, 64, 1 },
3524 /* Work-group size. */
3525 { 1, 64, 1 }
3526 };
3527
3528 if (module->init_array_func)
3529 {
3530 init_kernel (module->init_array_func);
3531 run_kernel (module->init_array_func, NULL, &kla, NULL, false);
3532 }
3533 module->constructors_run_p = true;
3534
3535 /* Don't report kernels that libgomp need not know about. */
3536 if (module->init_array_func)
3537 kernel_count--;
3538 if (module->fini_array_func)
3539 kernel_count--;
3540
3541 if (rev_fn_table != NULL && kernel_count == 0)
3542 *rev_fn_table = NULL;
3543 else if (rev_fn_table != NULL)
3544 {
3545 hsa_status_t status;
3546 hsa_executable_symbol_t var_symbol;
3547 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3548 ".offload_func_table",
3549 agent->id, 0, &var_symbol);
3550 if (status != HSA_STATUS_SUCCESS)
3551 hsa_fatal ("Could not find symbol for variable in the code object",
3552 status);
3553 uint64_t fn_table_addr;
3554 status = hsa_fns.hsa_executable_symbol_get_info_fn
3555 (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
3556 &fn_table_addr);
3557 if (status != HSA_STATUS_SUCCESS)
3558 hsa_fatal ("Could not extract a variable from its symbol", status);
3559 *rev_fn_table = GOMP_PLUGIN_malloc (kernel_count * sizeof (uint64_t));
3560 GOMP_OFFLOAD_dev2host (agent->device_id, *rev_fn_table,
3561 (void*) fn_table_addr,
3562 kernel_count * sizeof (uint64_t));
3563 }
3564
3565 return kernel_count + var_count + other_count;
3566 }
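/* A hedged sketch of how the table malloc'd into *REV_FN_TABLE above is
   meant to be consumed by the caller; DEV, VERSION, TARGET_DATA, TABLE and
   IDX are illustrative placeholders, not names defined in this file, and
   the caller is assumed to own (and free) the returned table:

     uint64_t *rev_fns = NULL;
     int n = GOMP_OFFLOAD_load_image (DEV, VERSION, TARGET_DATA, &TABLE,
                                      &rev_fns);
     if (n >= 0 && rev_fns != NULL)
       {
         uint64_t dev_addr = rev_fns[IDX];   device address of the IDX'th
                                             function, for reverse offload
         free (rev_fns);
       }
*/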
3567
3568 /* Unload GCN object-code module described by struct gcn_image_desc in
3569 TARGET_DATA from agent number N. Return TRUE on success. */
3570
3571 bool
3572 GOMP_OFFLOAD_unload_image (int n, unsigned version, const void *target_data)
3573 {
3574 if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
3575 {
3576 GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3577 " (expected %u, received %u)",
3578 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
3579 return false;
3580 }
3581
3582 struct agent_info *agent;
3583 agent = get_agent_info (n);
3584 if (!agent)
3585 return false;
3586
3587 if (pthread_rwlock_wrlock (&agent->module_rwlock))
3588 {
3589 GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3590 return false;
3591 }
3592
3593 if (!agent->module || agent->module->image_desc != target_data)
3594 {
3595 GOMP_PLUGIN_error ("Attempt to unload an image that has never been "
3596 "loaded before");
3597 pthread_rwlock_unlock (&agent->module_rwlock); return false;
3598 }
3599
3600 if (!destroy_module (agent->module, true))
3601 { pthread_rwlock_unlock (&agent->module_rwlock); return false; }
3602 free (agent->module);
3603 agent->module = NULL;
3604 if (!destroy_hsa_program (agent))
3605 { pthread_rwlock_unlock (&agent->module_rwlock); return false; }
3606 if (pthread_rwlock_unlock (&agent->module_rwlock))
3607 {
3608 GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3609 return false;
3610 }
3611 return true;
3612 }
3613
3614 /* Deinitialize all information and status associated with agent number N. We
3615 do not attempt any synchronization, assuming the user and libgomp will not
3616 attempt deinitialization of a device that is in any way being used at the
3617 same time. Return TRUE on success. */
3618
3619 bool
3620 GOMP_OFFLOAD_fini_device (int n)
3621 {
3622 struct agent_info *agent = get_agent_info (n);
3623 if (!agent)
3624 return false;
3625
3626 if (!agent->initialized)
3627 return true;
3628
3629 if (agent->omp_async_queue)
3630 {
3631 GOMP_OFFLOAD_openacc_async_destruct (agent->omp_async_queue);
3632 agent->omp_async_queue = NULL;
3633 }
3634
3635 if (agent->module)
3636 {
3637 if (!destroy_module (agent->module, false))
3638 return false;
3639 free (agent->module);
3640 agent->module = NULL;
3641 }
3642
3643 if (!destroy_team_arenas (agent))
3644 return false;
3645
3646 if (!destroy_hsa_program (agent))
3647 return false;
3648
3649 hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (agent->sync_queue);
3650 if (status != HSA_STATUS_SUCCESS)
3651 return hsa_error ("Error destroying command queue", status);
3652
3653 if (pthread_mutex_destroy (&agent->prog_mutex))
3654 {
3655 GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex");
3656 return false;
3657 }
3658 if (pthread_rwlock_destroy (&agent->module_rwlock))
3659 {
3660 GOMP_PLUGIN_error ("Failed to destroy a GCN agent rwlock");
3661 return false;
3662 }
3663
3664 if (pthread_mutex_destroy (&agent->async_queues_mutex))
3665 {
3666 GOMP_PLUGIN_error ("Failed to destroy a GCN agent queue mutex");
3667 return false;
3668 }
3669 if (pthread_mutex_destroy (&agent->team_arena_write_lock))
3670 {
3671 GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex");
3672 return false;
3673 }
3674 agent->initialized = false;
3675 return true;
3676 }
3677
3678 /* Return true if the HSA runtime can run function FN_PTR. */
3679
3680 bool
3681 GOMP_OFFLOAD_can_run (void *fn_ptr)
3682 {
3683 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3684
3685 init_kernel (kernel);
3686 if (kernel->initialization_failed)
3687 goto failure;
3688
3689 return true;
3690
3691 failure:
3692 if (suppress_host_fallback)
3693 GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
3694 GCN_WARNING ("GCN target cannot be launched, doing a host fallback\n");
3695 return false;
3696 }
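/* A minimal sketch of the intended caller-side contract, under the
   assumption (consistent with the warning above) that a false return
   routes execution to the host version of the offloaded region:

     if (!GOMP_OFFLOAD_can_run (kernel_fn_ptr))
       host_version (args);   hypothetical host fallback path
*/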
3697
3698 /* Allocate memory on device N. */
3699
3700 void *
3701 GOMP_OFFLOAD_alloc (int n, size_t size)
3702 {
3703 struct agent_info *agent = get_agent_info (n);
3704 return alloc_by_agent (agent, size);
3705 }
3706
3707 /* Free memory from device N. */
3708
3709 bool
3710 GOMP_OFFLOAD_free (int device, void *ptr)
3711 {
3712 GCN_DEBUG ("Freeing memory on device %d\n", device);
3713
3714 hsa_status_t status = hsa_fns.hsa_memory_free_fn (ptr);
3715 if (status != HSA_STATUS_SUCCESS)
3716 {
3717 hsa_error ("Could not free device memory", status);
3718 return false;
3719 }
3720
3721 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
3722 bool profiling_dispatch_p
3723 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
3724 if (profiling_dispatch_p)
3725 {
3726 acc_prof_info *prof_info = thr->prof_info;
3727 acc_event_info data_event_info;
3728 acc_api_info *api_info = thr->api_info;
3729
3730 prof_info->event_type = acc_ev_free;
3731
3732 data_event_info.data_event.event_type = prof_info->event_type;
3733 data_event_info.data_event.valid_bytes
3734 = _ACC_DATA_EVENT_INFO_VALID_BYTES;
3735 data_event_info.data_event.parent_construct
3736 = acc_construct_parallel;
3737 data_event_info.data_event.implicit = 1;
3738 data_event_info.data_event.tool_info = NULL;
3739 data_event_info.data_event.var_name = NULL;
3740 data_event_info.data_event.bytes = 0;
3741 data_event_info.data_event.host_ptr = NULL;
3742 data_event_info.data_event.device_ptr = (void *) ptr;
3743
3744 api_info->device_api = acc_device_api_other;
3745
3746 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
3747 api_info);
3748 }
3749
3750 return true;
3751 }
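/* The profiling dispatch above only fires when a tool has registered a
   callback for the event.  A hypothetical tool-side sketch using the
   OpenACC profiling interface (acc_prof.h names; nothing below is defined
   in this plugin):

     #include <acc_prof.h>
     #include <stdio.h>

     static void
     on_free (acc_prof_info *pi, acc_event_info *ei, acc_api_info *ai)
     {
       fprintf (stderr, "acc_ev_free: device_ptr=%p\n",
                ei->data_event.device_ptr);
     }

     acc_prof_register (acc_ev_free, on_free, acc_reg);
*/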
3752
3753 /* Copy data from DEVICE to host. */
3754
3755 bool
3756 GOMP_OFFLOAD_dev2host (int device, void *dst, const void *src, size_t n)
3757 {
3758 GCN_DEBUG ("Copying %zu bytes from device %d (%p) to host (%p)\n", n, device,
3759 src, dst);
3760 hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
3761 if (status != HSA_STATUS_SUCCESS)
3762 return hsa_error ("memory copy failed", status);
3763 return true;
3764 }
3765
3766 /* Copy data from host to DEVICE. */
3767
3768 bool
3769 GOMP_OFFLOAD_host2dev (int device, void *dst, const void *src, size_t n)
3770 {
3771 GCN_DEBUG ("Copying %zu bytes from host (%p) to device %d (%p)\n", n, src,
3772 device, dst);
3773 hsa_memory_copy_wrapper (dst, src, n);
3774 return true;
3775 }
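/* A round-trip sketch combining the entry points above; DEV is a
   placeholder device number, not a name defined in this file:

     int out[4], in[4] = { 1, 2, 3, 4 };
     void *d = GOMP_OFFLOAD_alloc (DEV, sizeof in);
     GOMP_OFFLOAD_host2dev (DEV, d, in, sizeof in);
     GOMP_OFFLOAD_dev2host (DEV, out, d, sizeof out);
     GOMP_OFFLOAD_free (DEV, d);

   after which OUT mirrors IN.  */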
3776
3777 /* Copy data within DEVICE. Do the copy asynchronously, if appropriate. */
3778
3779 bool
3780 GOMP_OFFLOAD_dev2dev (int device, void *dst, const void *src, size_t n)
3781 {
3782 struct gcn_thread *thread_data = gcn_thread ();
3783
3784 if (thread_data && !async_synchronous_p (thread_data->async))
3785 {
3786 struct agent_info *agent = get_agent_info (device);
3787 maybe_init_omp_async (agent);
3788 queue_push_copy (agent->omp_async_queue, dst, src, n);
3789 return true;
3790 }
3791
3792 GCN_DEBUG ("Copying %zu bytes from device %d (%p) to device %d (%p)\n", n,
3793 device, src, device, dst);
3794 hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
3795 if (status != HSA_STATUS_SUCCESS)
3796 return hsa_error ("memory copy failed", status);
3797 return true;
3798 }
3799
3800 /* }}} */
3801 /* {{{ OpenMP Plugin API */
3802
3803 /* Run a synchronous OpenMP kernel on DEVICE and pass it an array of pointers
3804 in VARS as a parameter. The kernel is identified by FN_PTR which must point
3805 to a kernel_info structure, and must have previously been loaded to the
3806 specified device. */
3807
3808 void
3809 GOMP_OFFLOAD_run (int device, void *fn_ptr, void *vars, void **args)
3810 {
3811 struct agent_info *agent = get_agent_info (device);
3812 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3813 struct GOMP_kernel_launch_attributes def;
3814 struct GOMP_kernel_launch_attributes *kla;
3815 assert (agent == kernel->agent);
3816
3817 /* If we get here then the kernel must be OpenMP. */
3818 kernel->kind = KIND_OPENMP;
3819
3820 if (!parse_target_attributes (args, &def, &kla, agent))
3821 {
3822 GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
3823 return;
3824 }
3825 run_kernel (kernel, vars, kla, NULL, false);
3826 }
3827
3828 /* Run an asynchronous OpenMP kernel on DEVICE. This is similar to
3829 GOMP_OFFLOAD_run except that the launch is queued and there is a call to
3830 GOMP_PLUGIN_target_task_completion when it has finished. */
3831
3832 void
3833 GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
3834 void **args, void *async_data)
3835 {
3836 GCN_DEBUG ("GOMP_OFFLOAD_async_run invoked\n");
3837 struct agent_info *agent = get_agent_info (device);
3838 struct kernel_info *kernel = (struct kernel_info *) tgt_fn;
3839 struct GOMP_kernel_launch_attributes def;
3840 struct GOMP_kernel_launch_attributes *kla;
3841 assert (agent == kernel->agent);
3842
3843 /* If we get here then the kernel must be OpenMP. */
3844 kernel->kind = KIND_OPENMP;
3845
3846 if (!parse_target_attributes (args, &def, &kla, agent))
3847 {
3848 GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
3849 return;
3850 }
3851
3852 maybe_init_omp_async (agent);
3853 queue_push_launch (agent->omp_async_queue, kernel, tgt_vars, kla);
3854 queue_push_callback (agent->omp_async_queue,
3855 GOMP_PLUGIN_target_task_completion, async_data);
3856 }
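/* After the two pushes above, the agent's OpenMP async queue holds, in
   order:

     ... | LAUNCH (kernel, tgt_vars, kla)
         | CALLBACK (GOMP_PLUGIN_target_task_completion, async_data)

   so the drain thread cannot signal completion to libgomp before the
   kernel itself has finished.  */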
3857
3858 /* }}} */
3859 /* {{{ OpenACC Plugin API */
3860
3861 /* Run a synchronous OpenACC kernel. The device number is inferred from the
3862 already-loaded KERNEL. */
3863
3864 void
3865 GOMP_OFFLOAD_openacc_exec (void (*fn_ptr) (void *), size_t mapnum,
3866 void **hostaddrs, void **devaddrs, unsigned *dims,
3867 void *targ_mem_desc)
3868 {
3869 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3870
3871 gcn_exec (kernel, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, false,
3872 NULL);
3873 }
3874
3875 /* Run an asynchronous OpenACC kernel on the specified queue. */
3876
3877 void
3878 GOMP_OFFLOAD_openacc_async_exec (void (*fn_ptr) (void *), size_t mapnum,
3879 void **hostaddrs, void **devaddrs,
3880 unsigned *dims, void *targ_mem_desc,
3881 struct goacc_asyncqueue *aq)
3882 {
3883 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3884
3885 gcn_exec (kernel, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, true,
3886 aq);
3887 }
3888
3889 /* Create a new asynchronous thread and queue for running future kernels. */
3890
3891 struct goacc_asyncqueue *
3892 GOMP_OFFLOAD_openacc_async_construct (int device)
3893 {
3894 struct agent_info *agent = get_agent_info (device);
3895
3896 pthread_mutex_lock (&agent->async_queues_mutex);
3897
3898 struct goacc_asyncqueue *aq = GOMP_PLUGIN_malloc (sizeof (*aq));
3899 aq->agent = agent;
3900 aq->prev = NULL;
3901 aq->next = agent->async_queues;
3902 if (aq->next)
3903 {
3904 aq->next->prev = aq;
3905 aq->id = aq->next->id + 1;
3906 }
3907 else
3908 aq->id = 1;
3909 agent->async_queues = aq;
3910
3911 aq->queue_first = 0;
3912 aq->queue_n = 0;
3913 aq->drain_queue_stop = 0;
3914
3915 if (pthread_mutex_init (&aq->mutex, NULL))
3916 {
3917 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
3918 return NULL;
3919 }
3920 if (pthread_cond_init (&aq->queue_cond_in, NULL))
3921 {
3922 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
3923 return NULL;
3924 }
3925 if (pthread_cond_init (&aq->queue_cond_out, NULL))
3926 {
3927 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
3928 return NULL;
3929 }
3930
3931 hsa_status_t status = hsa_fns.hsa_queue_create_fn (agent->id,
3932 ASYNC_QUEUE_SIZE,
3933 HSA_QUEUE_TYPE_MULTI,
3934 hsa_queue_callback, NULL,
3935 UINT32_MAX, UINT32_MAX,
3936 &aq->hsa_queue);
3937 if (status != HSA_STATUS_SUCCESS)
3938 hsa_fatal ("Error creating command queue", status);
3939
3940 int err = pthread_create (&aq->thread_drain_queue, NULL, &drain_queue, aq);
3941 if (err != 0)
3942 GOMP_PLUGIN_fatal ("GCN asynchronous thread creation failed: %s",
3943 strerror (err));
3944 GCN_DEBUG ("Async thread %d:%d: created\n", aq->agent->device_id,
3945 aq->id);
3946
3947 pthread_mutex_unlock (&agent->async_queues_mutex);
3948
3949 return aq;
3950 }
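/* Typical lifecycle of a queue created here (a sketch; DEV, DST, SRC and
   N are placeholders):

     struct goacc_asyncqueue *aq
       = GOMP_OFFLOAD_openacc_async_construct (DEV);
     GOMP_OFFLOAD_openacc_async_host2dev (DEV, DST, SRC, N, aq);
     GOMP_OFFLOAD_openacc_async_synchronize (aq);
     GOMP_OFFLOAD_openacc_async_destruct (aq);
*/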
3951
3952 /* Destroy an existing asynchronous thread and queue. Waits for any
3953 currently-running task to complete, but cancels any queued tasks. */
3954
3955 bool
3956 GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
3957 {
3958 struct agent_info *agent = aq->agent;
3959
3960 finalize_async_thread (aq);
3961
3962 pthread_mutex_lock (&agent->async_queues_mutex);
3963
3964 int err;
3965 if ((err = pthread_mutex_destroy (&aq->mutex)))
3966 {
3967 GOMP_PLUGIN_error ("Failed to destroy a GCN async queue mutex: %d", err);
3968 goto fail;
3969 }
3970 if (pthread_cond_destroy (&aq->queue_cond_in))
3971 {
3972 GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
3973 goto fail;
3974 }
3975 if (pthread_cond_destroy (&aq->queue_cond_out))
3976 {
3977 GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
3978 goto fail;
3979 }
3980 hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (aq->hsa_queue);
3981 if (status != HSA_STATUS_SUCCESS)
3982 {
3983 hsa_error ("Error destroying command queue", status);
3984 goto fail;
3985 }
3986
3987 if (aq->prev)
3988 aq->prev->next = aq->next;
3989 if (aq->next)
3990 aq->next->prev = aq->prev;
3991 if (agent->async_queues == aq)
3992 agent->async_queues = aq->next;
3993
3994 GCN_DEBUG ("Async thread %d:%d: destroyed\n", agent->device_id, aq->id);
3995
3996 free (aq);
3997 pthread_mutex_unlock (&agent->async_queues_mutex);
3998 return true;
3999
4000 fail:
4001 pthread_mutex_unlock (&agent->async_queues_mutex);
4002 return false;
4003 }
4004
4005 /* Return true if the specified async queue is currently empty. */
4006
4007 int
4008 GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
4009 {
4010 return queue_empty (aq);
4011 }
4012
4013 /* Block until the specified queue has executed all its tasks and the
4014 queue is empty. */
4015
4016 bool
4017 GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
4018 {
4019 wait_queue (aq);
4020 return true;
4021 }
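/* The two hooks above are the polling and blocking forms of the same
   query; a hypothetical caller could overlap host work like so (with
   do_host_work a placeholder):

     while (!GOMP_OFFLOAD_openacc_async_test (aq))
       do_host_work ();
*/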
4022
4023 /* Add a serialization point across two async queues. Any new tasks added to
4024 AQ2, after this call, will not run until all tasks on AQ1, at the time
4025 of this call, have completed. */
4026
4027 bool
4028 GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
4029 struct goacc_asyncqueue *aq2)
4030 {
4031 /* For serialize, stream aq2 waits for aq1 to complete work that has been
4032 scheduled to run on it up to this point. */
4033 if (aq1 != aq2)
4034 {
4035 struct placeholder *placeholderp = queue_push_placeholder (aq1);
4036 queue_push_asyncwait (aq2, placeholderp);
4037 }
4038 return true;
4039 }
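/* A sketch of the resulting queue states, with P the placeholder pushed
   to AQ1:

     aq1: | earlier work ... | P |
     aq2: | earlier work ... | WAIT(P) | anything queued after this call |

   AQ2's drain thread blocks on WAIT(P) until AQ1 executes P.  */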
4040
4041 /* Add an opaque callback to the given async queue. */
4042
4043 void
4044 GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
4045 void (*fn) (void *), void *data)
4046 {
4047 queue_push_callback (aq, fn, data);
4048 }
4049
4050 /* Queue up an asynchronous data copy from host to DEVICE. */
4051
4052 bool
4053 GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
4054 size_t n, struct goacc_asyncqueue *aq)
4055 {
4056 struct agent_info *agent = get_agent_info (device);
4057 assert (agent == aq->agent);
4058 queue_push_copy (aq, dst, src, n);
4059 return true;
4060 }
4061
4062 /* Queue up an asynchronous data copy from DEVICE to host. */
4063
4064 bool
4065 GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src,
4066 size_t n, struct goacc_asyncqueue *aq)
4067 {
4068 struct agent_info *agent = get_agent_info (device);
4069 assert (agent == aq->agent);
4070 queue_push_copy (aq, dst, src, n);
4071 return true;
4072 }
4073
4074 union goacc_property_value
4075 GOMP_OFFLOAD_openacc_get_property (int device, enum goacc_property prop)
4076 {
4077 struct agent_info *agent = get_agent_info (device);
4078
4079 union goacc_property_value propval = { .val = 0 };
4080
4081 switch (prop)
4082 {
4083 case GOACC_PROPERTY_FREE_MEMORY:
4084 /* Not supported. */
4085 break;
4086 case GOACC_PROPERTY_MEMORY:
4087 {
4088 size_t size;
4089 hsa_region_t region = agent->data_region;
4090 hsa_status_t status =
4091 hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
4092 if (status == HSA_STATUS_SUCCESS)
4093 propval.val = size;
4094 break;
4095 }
4096 case GOACC_PROPERTY_NAME:
4097 propval.ptr = agent->name;
4098 break;
4099 case GOACC_PROPERTY_VENDOR:
4100 propval.ptr = agent->vendor_name;
4101 break;
4102 case GOACC_PROPERTY_DRIVER:
4103 propval.ptr = hsa_context.driver_version_s;
4104 break;
4105 }
4106
4107 return propval;
4108 }
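/* These values surface through the OpenACC property API; a hedged
   user-side sketch, assuming a single GCN device numbered 0:

     #include <openacc.h>

     size_t mem = acc_get_property (0, acc_device_radeon,
                                    acc_property_memory);
     const char *drv = acc_get_property_string (0, acc_device_radeon,
                                                acc_property_driver);
*/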
4109
4110 /* Set up plugin-specific thread-local data (host side). */
4111
4112 void *
4113 GOMP_OFFLOAD_openacc_create_thread_data (int ord __attribute__((unused)))
4114 {
4115 struct gcn_thread *thread_data
4116 = GOMP_PLUGIN_malloc (sizeof (struct gcn_thread));
4117
4118 thread_data->async = GOMP_ASYNC_SYNC;
4119
4120 return (void *) thread_data;
4121 }
4122
4123 /* Clean up plugin-specific thread-local data. */
4124
4125 void
4126 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
4127 {
4128 free (data);
4129 }
4130
4131 /* }}} */