/* Plugin for AMD GCN execution.

   Copyright (C) 2013-2020 Free Software Foundation, Inc.

   Contributed by Mentor Embedded

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* {{{ Includes and defines  */

#include "config.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <inttypes.h>
#include <stdbool.h>
#include <limits.h>
#include <hsa.h>
#include <dlfcn.h>
#include <signal.h>
#include "libgomp-plugin.h"
#include "gomp-constants.h"
#include <elf.h>
#include "oacc-plugin.h"
#include "oacc-int.h"
#include <assert.h>

/* Additional definitions not in HSA 1.1.
   FIXME: this needs to be updated in hsa.h for upstream, but the only source
   right now is the ROCr source which may cause license issues.  */
#define HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT 0xA002

/* These probably won't be in elf.h for a while.  */
#ifndef R_AMDGPU_NONE
#define R_AMDGPU_NONE          0
#define R_AMDGPU_ABS32_LO      1   /* (S + A) & 0xFFFFFFFF  */
#define R_AMDGPU_ABS32_HI      2   /* (S + A) >> 32  */
#define R_AMDGPU_ABS64         3   /* S + A  */
#define R_AMDGPU_REL32         4   /* S + A - P  */
#define R_AMDGPU_REL64         5   /* S + A - P  */
#define R_AMDGPU_ABS32         6   /* S + A  */
#define R_AMDGPU_GOTPCREL      7   /* G + GOT + A - P  */
#define R_AMDGPU_GOTPCREL32_LO 8   /* (G + GOT + A - P) & 0xFFFFFFFF  */
#define R_AMDGPU_GOTPCREL32_HI 9   /* (G + GOT + A - P) >> 32  */
#define R_AMDGPU_REL32_LO      10  /* (S + A - P) & 0xFFFFFFFF  */
#define R_AMDGPU_REL32_HI      11  /* (S + A - P) >> 32  */
#define R_AMDGPU_RELATIVE64    13  /* B + A  */
#endif

/* GCN specific definitions for asynchronous queues.  */

#define ASYNC_QUEUE_SIZE 64
#define DRAIN_QUEUE_SYNCHRONOUS_P false
#define DEBUG_QUEUES 0
#define DEBUG_THREAD_SLEEP 0
#define DEBUG_THREAD_SIGNAL 0

/* Defaults.  */
#define DEFAULT_GCN_HEAP_SIZE (100*1024*1024)  /* 100MB.  */

/* Secure getenv() which returns NULL if running as SUID/SGID.  */
#ifndef HAVE_SECURE_GETENV
#ifdef HAVE___SECURE_GETENV
#define secure_getenv __secure_getenv
#elif defined (HAVE_UNISTD_H) && defined(HAVE_GETUID) && defined(HAVE_GETEUID) \
  && defined(HAVE_GETGID) && defined(HAVE_GETEGID)

#include <unistd.h>

/* Implementation of secure_getenv() for targets where it is not provided but
   we have at least the means to test real and effective IDs.  */

static char *
secure_getenv (const char *name)
{
  if ((getuid () == geteuid ()) && (getgid () == getegid ()))
    return getenv (name);
  else
    return NULL;
}

#else
#define secure_getenv getenv
#endif
#endif

/* }}}  */
/* {{{ Types  */

/* GCN-specific implementation of the GOMP_PLUGIN_acc_thread data.  */

struct gcn_thread
{
  /* The thread number from the async clause, or GOMP_ASYNC_SYNC.  */
  int async;
};

/* As the HSA runtime is dlopened, the following structure defines the
   function pointers through which the plug-in calls into it.  */

struct hsa_runtime_fn_info
{
  /* HSA runtime.  */
  hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
					const char **status_string);
  hsa_status_t (*hsa_system_get_info_fn) (hsa_system_info_t attribute,
					  void *value);
  hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
					 hsa_agent_info_t attribute,
					 void *value);
  hsa_status_t (*hsa_isa_get_info_fn) (hsa_isa_t isa,
				       hsa_isa_info_t attribute,
				       uint32_t index,
				       void *value);
  hsa_status_t (*hsa_init_fn) (void);
  hsa_status_t (*hsa_iterate_agents_fn)
    (hsa_status_t (*callback) (hsa_agent_t agent, void *data), void *data);
  hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
					  hsa_region_info_t attribute,
					  void *value);
  hsa_status_t (*hsa_queue_create_fn)
    (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
     void (*callback) (hsa_status_t status, hsa_queue_t *source, void *data),
     void *data, uint32_t private_segment_size,
     uint32_t group_segment_size, hsa_queue_t **queue);
  hsa_status_t (*hsa_agent_iterate_regions_fn)
    (hsa_agent_t agent,
     hsa_status_t (*callback) (hsa_region_t region, void *data), void *data);
  hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
  hsa_status_t (*hsa_executable_create_fn)
    (hsa_profile_t profile, hsa_executable_state_t executable_state,
     const char *options, hsa_executable_t *executable);
  hsa_status_t (*hsa_executable_global_variable_define_fn)
    (hsa_executable_t executable, const char *variable_name, void *address);
  hsa_status_t (*hsa_executable_load_code_object_fn)
    (hsa_executable_t executable, hsa_agent_t agent,
     hsa_code_object_t code_object, const char *options);
  hsa_status_t (*hsa_executable_freeze_fn) (hsa_executable_t executable,
					    const char *options);
  hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
					uint32_t num_consumers,
					const hsa_agent_t *consumers,
					hsa_signal_t *signal);
  hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
					  void **ptr);
  hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent,
					      hsa_access_permission_t access);
  hsa_status_t (*hsa_memory_copy_fn) (void *dst, const void *src, size_t size);
  hsa_status_t (*hsa_memory_free_fn) (void *ptr);
  hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
  hsa_status_t (*hsa_executable_get_symbol_fn)
    (hsa_executable_t executable, const char *module_name,
     const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
     hsa_executable_symbol_t *symbol);
  hsa_status_t (*hsa_executable_symbol_get_info_fn)
    (hsa_executable_symbol_t executable_symbol,
     hsa_executable_symbol_info_t attribute, void *value);
  hsa_status_t (*hsa_executable_iterate_symbols_fn)
    (hsa_executable_t executable,
     hsa_status_t (*callback) (hsa_executable_t executable,
			       hsa_executable_symbol_t symbol, void *data),
     void *data);
  uint64_t (*hsa_queue_add_write_index_release_fn) (const hsa_queue_t *queue,
						    uint64_t value);
  uint64_t (*hsa_queue_load_read_index_acquire_fn) (const hsa_queue_t *queue);
  void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
				       hsa_signal_value_t value);
  void (*hsa_signal_store_release_fn) (hsa_signal_t signal,
				       hsa_signal_value_t value);
  hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
    (hsa_signal_t signal, hsa_signal_condition_t condition,
     hsa_signal_value_t compare_value, uint64_t timeout_hint,
     hsa_wait_state_t wait_state_hint);
  hsa_signal_value_t (*hsa_signal_load_acquire_fn) (hsa_signal_t signal);
  hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);

  hsa_status_t (*hsa_code_object_deserialize_fn)
    (void *serialized_code_object, size_t serialized_code_object_size,
     const char *options, hsa_code_object_t *code_object);
};
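
/* Illustrative sketch (not part of the original source): once
   init_hsa_runtime_functions, below, has filled the global hsa_fns
   via dlsym, all runtime calls go through these pointers, e.g.

     hsa_status_t status = hsa_fns.hsa_init_fn ();
     if (status != HSA_STATUS_SUCCESS)
       hsa_fatal ("Run-time could not be initialized", status);  */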

/* Structure describing the run-time and grid properties of an HSA kernel
   launch.  This needs to match the format passed to GOMP_OFFLOAD_run.  */

struct GOMP_kernel_launch_attributes
{
  /* Number of dimensions the workload has.  Maximum number is 3.  */
  uint32_t ndim;
  /* Size of the grid in the three respective dimensions.  */
  uint32_t gdims[3];
  /* Size of work-groups in the respective dimensions.  */
  uint32_t wdims[3];
};
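
/* For example (illustrative only), a 2-D launch over a 256x256 grid with
   16x16 work-groups would be described as:

     struct GOMP_kernel_launch_attributes kla;
     kla.ndim = 2;
     kla.gdims[0] = 256;  kla.gdims[1] = 256;  kla.gdims[2] = 1;
     kla.wdims[0] = 16;   kla.wdims[1] = 16;   kla.wdims[2] = 1;  */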

/* Collection of information needed for a dispatch of a kernel from a
   kernel.  */

struct kernel_dispatch
{
  struct agent_info *agent;
  /* Pointer to a command queue associated with a kernel dispatch agent.  */
  void *queue;
  /* Pointer to a memory space used for kernel arguments passing.  */
  void *kernarg_address;
  /* Kernel object.  */
  uint64_t object;
  /* Synchronization signal used for dispatch synchronization.  */
  uint64_t signal;
  /* Private segment size.  */
  uint32_t private_segment_size;
  /* Group segment size.  */
  uint32_t group_segment_size;
};

/* Structure of the kernargs segment, supporting console output.

   This needs to match the definitions in Newlib, and the expectations
   in libgomp target code.  */

struct kernargs {
  /* Leave space for the real kernel arguments.
     OpenACC and OpenMP only use one pointer.  */
  int64_t dummy1;
  int64_t dummy2;

  /* A pointer to struct output, below, for console output data.  */
  int64_t out_ptr;

  /* A pointer to struct heap, below.  */
  int64_t heap_ptr;

  /* A pointer to an ephemeral memory arena.
     Only needed for OpenMP.  */
  int64_t arena_ptr;

  /* Output data.  */
  struct output {
    int return_value;
    unsigned int next_output;
    struct printf_data {
      int written;
      char msg[128];
      int type;
      union {
	int64_t ivalue;
	double dvalue;
	char text[128];
      };
    } queue[1024];
    unsigned int consumed;
  } output_data;
};
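
/* How the console queue is used (a sketch; the device side lives in Newlib,
   so the details there are an assumption): the device reserves a slot by
   atomically incrementing next_output, fills in msg, type and the payload
   union, then sets written = 1.  The host-side console_output, later in
   this file, polls the entries between consumed and next_output and prints
   each one once its written flag appears.  */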

/* A queue entry for a future asynchronous launch.  */

struct kernel_launch
{
  struct kernel_info *kernel;
  void *vars;
  struct GOMP_kernel_launch_attributes kla;
};

/* A queue entry for a future callback.  */

struct callback
{
  void (*fn)(void *);
  void *data;
};

/* A data struct for the copy_data callback.  */

struct copy_data
{
  void *dst;
  const void *src;
  size_t len;
  bool free_src;
  struct goacc_asyncqueue *aq;
};

/* A queue entry for a placeholder.  These correspond to a wait event.  */

struct placeholder
{
  int executed;
  pthread_cond_t cond;
  pthread_mutex_t mutex;
};

/* A queue entry for a wait directive.  */

struct asyncwait_info
{
  struct placeholder *placeholderp;
};

/* Encode the type of an entry in an async queue.  */

enum entry_type
{
  KERNEL_LAUNCH,
  CALLBACK,
  ASYNC_WAIT,
  ASYNC_PLACEHOLDER
};

/* An entry in an async queue.  */

struct queue_entry
{
  enum entry_type type;
  union {
    struct kernel_launch launch;
    struct callback callback;
    struct asyncwait_info asyncwait;
    struct placeholder placeholder;
  } u;
};

/* An async queue header.

   OpenMP may create one of these.
   OpenACC may create many.  */

struct goacc_asyncqueue
{
  struct agent_info *agent;
  hsa_queue_t *hsa_queue;

  pthread_t thread_drain_queue;
  pthread_mutex_t mutex;
  pthread_cond_t queue_cond_in;
  pthread_cond_t queue_cond_out;
  struct queue_entry queue[ASYNC_QUEUE_SIZE];
  int queue_first;
  int queue_n;
  int drain_queue_stop;

  int id;
  struct goacc_asyncqueue *prev;
  struct goacc_asyncqueue *next;
};
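
/* The queue array above is a circular buffer: entries occupy slots
   queue_first .. queue_first+queue_n-1, modulo ASYNC_QUEUE_SIZE.  For
   example (illustrative; the push/pop code appears later in this file),
   the slot for a newly pushed entry is

     struct queue_entry *e
       = &aq->queue[(aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE];  */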

/* Mkoffload uses this structure to describe a kernel.

   OpenMP kernel dimensions are passed at runtime.
   OpenACC kernel dimensions are passed at compile time, here.  */

struct hsa_kernel_description
{
  const char *name;
  int oacc_dims[3];  /* Only present for GCN kernels.  */
  int sgpr_count;
  int vpgr_count;
};

/* Mkoffload uses this structure to describe an offload variable.  */

struct global_var_info
{
  const char *name;
  void *address;
};

/* Mkoffload uses this structure to describe all the kernels in a
   loadable module.  These are passed to libgomp via static constructors.  */

struct gcn_image_desc
{
  struct gcn_image {
    size_t size;
    void *image;
  } *gcn_image;
  const unsigned kernel_count;
  struct hsa_kernel_description *kernel_infos;
  const unsigned global_variable_count;
  struct global_var_info *global_variables;
};

/* This enum mirrors the corresponding LLVM enum's values for all ISAs that we
   support.
   See https://llvm.org/docs/AMDGPUUsage.html#amdgpu-ef-amdgpu-mach-table  */

typedef enum {
  EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a,
  EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c,
  EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f,
} EF_AMDGPU_MACH;

const static int EF_AMDGPU_MACH_MASK = 0x000000ff;
typedef EF_AMDGPU_MACH gcn_isa;

/* Description of an HSA GPU agent (device) and the program associated with
   it.  */

struct agent_info
{
  /* The HSA ID of the agent.  Assigned when hsa_context is initialized.  */
  hsa_agent_t id;
  /* The user-visible device number.  */
  int device_id;
  /* Whether the agent has been initialized.  The fields below are usable only
     if it has been.  */
  bool initialized;

  /* The instruction set architecture of the device.  */
  gcn_isa device_isa;
  /* Name of the agent.  */
  char name[64];
  /* Name of the vendor of the agent.  */
  char vendor_name[64];
  /* Command queues of the agent.  */
  hsa_queue_t *sync_queue;
  struct goacc_asyncqueue *async_queues, *omp_async_queue;
  pthread_mutex_t async_queues_mutex;

  /* The HSA memory region from which to allocate kernel arguments.  */
  hsa_region_t kernarg_region;

  /* The HSA memory region from which to allocate device data.  */
  hsa_region_t data_region;

  /* Allocated team arenas.  */
  struct team_arena_list *team_arena_list;
  pthread_mutex_t team_arena_write_lock;

  /* Read-write lock that protects kernels which are running or about to be run
     from interference with loading and unloading of images.  Needs to be
     locked for reading while a kernel is being run, and for writing if the
     list of modules is manipulated (and thus the HSA program invalidated).  */
  pthread_rwlock_t module_rwlock;

  /* The module associated with this agent.  */
  struct module_info *module;

  /* Mutex enforcing that only one thread will finalize the HSA program.  A
     thread should have locked agent->module_rwlock for reading before
     acquiring it.  */
  pthread_mutex_t prog_mutex;
  /* Flag whether the HSA program that consists of all the modules has been
     finalized.  */
  bool prog_finalized;
  /* HSA executable - the finalized program that is used to locate kernels.  */
  hsa_executable_t executable;
};

/* Information required to identify, finalize and run any given kernel.  */

enum offload_kind {KIND_UNKNOWN, KIND_OPENMP, KIND_OPENACC};

struct kernel_info
{
  /* Name of the kernel, required to locate it within the GCN object-code
     module.  */
  const char *name;
  /* The specific agent the kernel has been or will be finalized for and run
     on.  */
  struct agent_info *agent;
  /* The specific module where the kernel is located.  */
  struct module_info *module;
  /* Information provided by mkoffload associated with the kernel.  */
  struct hsa_kernel_description *description;
  /* Mutex enforcing that at most one thread ever initializes a kernel for
     use.  A thread should have locked agent->module_rwlock for reading before
     acquiring it.  */
  pthread_mutex_t init_mutex;
  /* Flag indicating whether the kernel has been initialized and all fields
     below it contain valid data.  */
  bool initialized;
  /* Flag indicating that the kernel has a problem that blocks an execution.  */
  bool initialization_failed;
  /* The object to be put into the dispatch queue.  */
  uint64_t object;
  /* Required size of kernel arguments.  */
  uint32_t kernarg_segment_size;
  /* Required size of group segment.  */
  uint32_t group_segment_size;
  /* Required size of private segment.  */
  uint32_t private_segment_size;
  /* Set up for OpenMP or OpenACC?  */
  enum offload_kind kind;
};

/* Information about a particular GCN module, its image and kernels.  */

struct module_info
{
  /* The description with which the program has registered the image.  */
  struct gcn_image_desc *image_desc;
  /* GCN heap allocation.  */
  struct heap *heap;
  /* Physical boundaries of the loaded module.  */
  Elf64_Addr phys_address_start;
  Elf64_Addr phys_address_end;

  bool constructors_run_p;
  struct kernel_info *init_array_func, *fini_array_func;

  /* Number of kernels in this module.  */
  int kernel_count;
  /* An array of kernel_info structures describing each kernel in this
     module.  */
  struct kernel_info kernels[];
};

/* A linked list of memory arenas allocated on the device.
   These are only used by OpenMP, as a means to optimize per-team malloc.  */

struct team_arena_list
{
  struct team_arena_list *next;

  /* The number of teams determines the size of the allocation.  */
  int num_teams;
  /* The device address of the arena itself.  */
  void *arena;
  /* A flag to prevent two asynchronous kernels trying to use the same arena.
     The mutex is locked until the kernel exits.  */
  pthread_mutex_t in_use;
};

/* Information about the whole HSA environment and all of its agents.  */

struct hsa_context_info
{
  /* Whether the structure has been initialized.  */
  bool initialized;
  /* Number of usable GPU HSA agents in the system.  */
  int agent_count;
  /* Array of agent_info structures describing the individual HSA agents.  */
  struct agent_info *agents;
  /* Driver version string.  */
  char driver_version_s[30];
};

/* Format of the on-device heap.

   This must match the definition in Newlib and gcn-run.  */

struct heap {
  int64_t size;
  char data[0];
};

/* }}}  */
/* {{{ Global variables  */

/* Information about the whole HSA environment and all of its agents.  */

static struct hsa_context_info hsa_context;

/* HSA runtime functions that are initialized in init_hsa_context.  */

static struct hsa_runtime_fn_info hsa_fns;

/* Heap space, allocated target-side, provided for use of newlib malloc.
   Each module should have its own heap allocated.
   Beware that heap usage increases with OpenMP teams.  See also arenas.  */

static size_t gcn_kernel_heap_size = DEFAULT_GCN_HEAP_SIZE;

/* Flag to decide whether to print information about what is going on to
   stderr.  Set in init_environment_variables depending on environment
   variables.  */

static bool debug;

/* Flag to decide if the runtime should suppress a possible fallback to host
   execution.  */

static bool suppress_host_fallback;

/* Name of the shared library from which the HSA runtime is dlopened
   by this plug-in.  */

static const char *hsa_runtime_lib;

/* Flag to decide if the runtime should also support CPU devices (can be
   a simulator).  */

static bool support_cpu_devices;

/* Runtime dimension overrides.  Zero indicates default.  */

static int override_x_dim = 0;
static int override_z_dim = 0;

/* }}}  */
/* {{{ Debug & Diagnostic  */

/* Print a message to stderr if GCN_DEBUG value is set to true.  */

#define DEBUG_PRINT(...) \
  do \
  { \
    if (debug) \
      { \
	fprintf (stderr, __VA_ARGS__); \
      } \
  } \
  while (false)

/* Flush stderr if GCN_DEBUG value is set to true.  */

#define DEBUG_FLUSH() \
  do { \
    if (debug) \
      fflush (stderr); \
  } while (false)

/* Print a logging message with PREFIX to stderr if GCN_DEBUG value
   is set to true.  */

#define DEBUG_LOG(prefix, ...) \
  do \
    { \
      DEBUG_PRINT (prefix); \
      DEBUG_PRINT (__VA_ARGS__); \
      DEBUG_FLUSH (); \
    } while (false)

/* Print a debugging message to stderr.  */

#define GCN_DEBUG(...) DEBUG_LOG ("GCN debug: ", __VA_ARGS__)

/* Print a warning message to stderr.  */

#define GCN_WARNING(...) DEBUG_LOG ("GCN warning: ", __VA_ARGS__)
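
/* Example usage (illustrative): GCN_DEBUG ("Launching kernel %s\n", name)
   expands to a prefixed fprintf to stderr followed by a flush, all guarded
   by the "debug" flag, so it costs almost nothing when GCN_DEBUG is not
   set in the environment.  */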

/* Print HSA warning STR with an HSA STATUS code.  */

static void
hsa_warn (const char *str, hsa_status_t status)
{
  if (!debug)
    return;

  const char *hsa_error_msg = "[unknown]";
  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);

  fprintf (stderr, "GCN warning: %s\nRuntime message: %s\n", str,
	   hsa_error_msg);
}

/* Report a fatal error STR together with the HSA error corresponding to STATUS
   and terminate execution of the current process.  */

static void
hsa_fatal (const char *str, hsa_status_t status)
{
  const char *hsa_error_msg = "[unknown]";
  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
  GOMP_PLUGIN_fatal ("GCN fatal error: %s\nRuntime message: %s\n", str,
		     hsa_error_msg);
}

/* Like hsa_fatal, except it only reports the error message and returns FALSE,
   propagating error handling to outside of the plugin.  */

static bool
hsa_error (const char *str, hsa_status_t status)
{
  const char *hsa_error_msg = "[unknown]";
  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
  GOMP_PLUGIN_error ("GCN fatal error: %s\nRuntime message: %s\n", str,
		     hsa_error_msg);
  return false;
}

/* Dump information about the available hardware.  */

static void
dump_hsa_system_info (void)
{
  hsa_status_t status;

  hsa_endianness_t endianness;
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_ENDIANNESS,
					   &endianness);
  if (status == HSA_STATUS_SUCCESS)
    switch (endianness)
      {
      case HSA_ENDIANNESS_LITTLE:
	GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: LITTLE\n");
	break;
      case HSA_ENDIANNESS_BIG:
	GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: BIG\n");
	break;
      default:
	GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: UNKNOWN\n");
      }
  else
    GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: FAILED\n");

  uint8_t extensions[128];
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_EXTENSIONS,
					   &extensions);
  if (status == HSA_STATUS_SUCCESS)
    {
      if (extensions[0] & (1 << HSA_EXTENSION_IMAGES))
	GCN_DEBUG ("HSA_SYSTEM_INFO_EXTENSIONS: IMAGES\n");
    }
  else
    GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n");
}

/* Dump information about the available hardware.  */

static void
dump_machine_model (hsa_machine_model_t machine_model, const char *s)
{
  switch (machine_model)
    {
    case HSA_MACHINE_MODEL_SMALL:
      GCN_DEBUG ("%s: SMALL\n", s);
      break;
    case HSA_MACHINE_MODEL_LARGE:
      GCN_DEBUG ("%s: LARGE\n", s);
      break;
    default:
      GCN_WARNING ("%s: UNKNOWN\n", s);
      break;
    }
}

/* Dump information about the available hardware.  */

static void
dump_profile (hsa_profile_t profile, const char *s)
{
  switch (profile)
    {
    case HSA_PROFILE_FULL:
      GCN_DEBUG ("%s: FULL\n", s);
      break;
    case HSA_PROFILE_BASE:
      GCN_DEBUG ("%s: BASE\n", s);
      break;
    default:
      GCN_WARNING ("%s: UNKNOWN\n", s);
      break;
    }
}

/* Dump information about a device memory region.  */

static hsa_status_t
dump_hsa_region (hsa_region_t region, void *data __attribute__((unused)))
{
  hsa_status_t status;

  hsa_region_segment_t segment;
  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
					   &segment);
  if (status == HSA_STATUS_SUCCESS)
    {
      if (segment == HSA_REGION_SEGMENT_GLOBAL)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GLOBAL\n");
      else if (segment == HSA_REGION_SEGMENT_READONLY)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: READONLY\n");
      else if (segment == HSA_REGION_SEGMENT_PRIVATE)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: PRIVATE\n");
      else if (segment == HSA_REGION_SEGMENT_GROUP)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GROUP\n");
      else
	GCN_WARNING ("HSA_REGION_INFO_SEGMENT: UNKNOWN\n");
    }
  else
    GCN_WARNING ("HSA_REGION_INFO_SEGMENT: FAILED\n");

  if (status == HSA_STATUS_SUCCESS && segment == HSA_REGION_SEGMENT_GLOBAL)
    {
      uint32_t flags;
      status
	= hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
					  &flags);
      if (status == HSA_STATUS_SUCCESS)
	{
	  if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: KERNARG\n");
	  if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED)
	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: FINE_GRAINED\n");
	  if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED)
	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: COARSE_GRAINED\n");
	}
      else
	GCN_WARNING ("HSA_REGION_INFO_GLOBAL_FLAGS: FAILED\n");
    }

  size_t size;
  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_SIZE: %zu\n", size);
  else
    GCN_WARNING ("HSA_REGION_INFO_SIZE: FAILED\n");

  status
    = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_ALLOC_MAX_SIZE,
				      &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_ALLOC_MAX_SIZE: %zu\n", size);
  else
    GCN_WARNING ("HSA_REGION_INFO_ALLOC_MAX_SIZE: FAILED\n");

  bool alloc_allowed;
  status
    = hsa_fns.hsa_region_get_info_fn (region,
				      HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED,
				      &alloc_allowed);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: %u\n", alloc_allowed);
  else
    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: FAILED\n");

  if (status != HSA_STATUS_SUCCESS || !alloc_allowed)
    return HSA_STATUS_SUCCESS;

  status
    = hsa_fns.hsa_region_get_info_fn (region,
				      HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE,
				      &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: %zu\n", size);
  else
    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: FAILED\n");

  size_t align;
  status
    = hsa_fns.hsa_region_get_info_fn (region,
				      HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT,
				      &align);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: %zu\n", align);
  else
    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: FAILED\n");

  return HSA_STATUS_SUCCESS;
}

/* Dump information about all the device memory regions.  */

static void
dump_hsa_regions (hsa_agent_t agent)
{
  hsa_status_t status;
  status = hsa_fns.hsa_agent_iterate_regions_fn (agent,
						 dump_hsa_region,
						 NULL);
  if (status != HSA_STATUS_SUCCESS)
    hsa_error ("Dumping hsa regions failed", status);
}

/* Dump information about the available devices.  */

static hsa_status_t
dump_hsa_agent_info (hsa_agent_t agent, void *data __attribute__((unused)))
{
  hsa_status_t status;

  char buf[64];
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME,
					  &buf);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_NAME: %s\n", buf);
  else
    GCN_WARNING ("HSA_AGENT_INFO_NAME: FAILED\n");

  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_VENDOR_NAME,
					  &buf);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_VENDOR_NAME: %s\n", buf);
  else
    GCN_WARNING ("HSA_AGENT_INFO_VENDOR_NAME: FAILED\n");

  hsa_machine_model_t machine_model;
  status
    = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_MACHINE_MODEL,
				     &machine_model);
  if (status == HSA_STATUS_SUCCESS)
    dump_machine_model (machine_model, "HSA_AGENT_INFO_MACHINE_MODEL");
  else
    GCN_WARNING ("HSA_AGENT_INFO_MACHINE_MODEL: FAILED\n");

  hsa_profile_t profile;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_PROFILE,
					  &profile);
  if (status == HSA_STATUS_SUCCESS)
    dump_profile (profile, "HSA_AGENT_INFO_PROFILE");
  else
    GCN_WARNING ("HSA_AGENT_INFO_PROFILE: FAILED\n");

  hsa_device_type_t device_type;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
					  &device_type);
  if (status == HSA_STATUS_SUCCESS)
    {
      switch (device_type)
	{
	case HSA_DEVICE_TYPE_CPU:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: CPU\n");
	  break;
	case HSA_DEVICE_TYPE_GPU:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: GPU\n");
	  break;
	case HSA_DEVICE_TYPE_DSP:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: DSP\n");
	  break;
	default:
	  GCN_WARNING ("HSA_AGENT_INFO_DEVICE: UNKNOWN\n");
	  break;
	}
    }
  else
    GCN_WARNING ("HSA_AGENT_INFO_DEVICE: FAILED\n");

  uint32_t cu_count;
  status = hsa_fns.hsa_agent_get_info_fn
    (agent, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: %u\n", cu_count);
  else
    GCN_WARNING ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: FAILED\n");

  uint32_t size;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_WAVEFRONT_SIZE,
					  &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WAVEFRONT_SIZE: %u\n", size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WAVEFRONT_SIZE: FAILED\n");

  uint32_t max_dim;
  status = hsa_fns.hsa_agent_get_info_fn (agent,
					  HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
					  &max_dim);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: %u\n", max_dim);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: FAILED\n");

  uint32_t max_size;
  status = hsa_fns.hsa_agent_get_info_fn (agent,
					  HSA_AGENT_INFO_WORKGROUP_MAX_SIZE,
					  &max_size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: %u\n", max_size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: FAILED\n");

  uint32_t grid_max_dim;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_DIM,
					  &grid_max_dim);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_DIM: %u\n", grid_max_dim);
  else
    GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_DIM: FAILED\n");

  uint32_t grid_max_size;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_SIZE,
					  &grid_max_size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_SIZE: %u\n", grid_max_size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_SIZE: FAILED\n");

  dump_hsa_regions (agent);

  return HSA_STATUS_SUCCESS;
}

/* Forward reference.  */

static char *get_executable_symbol_name (hsa_executable_symbol_t symbol);

/* Helper function for dump_executable_symbols.  */

static hsa_status_t
dump_executable_symbol (hsa_executable_t executable,
			hsa_executable_symbol_t symbol,
			void *data __attribute__((unused)))
{
  char *name = get_executable_symbol_name (symbol);

  if (name)
    {
      GCN_DEBUG ("executable symbol: %s\n", name);
      free (name);
    }

  return HSA_STATUS_SUCCESS;
}

/* Dump all global symbols in an executable.  */

static void
dump_executable_symbols (hsa_executable_t executable)
{
  hsa_status_t status;
  status
    = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
						 dump_executable_symbol,
						 NULL);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not dump HSA executable symbols", status);
}

/* Dump kernel DISPATCH data structure and indent it by INDENT spaces.  */

static void
print_kernel_dispatch (struct kernel_dispatch *dispatch, unsigned indent)
{
  struct kernargs *kernargs = (struct kernargs *) dispatch->kernarg_address;

  fprintf (stderr, "%*sthis: %p\n", indent, "", dispatch);
  fprintf (stderr, "%*squeue: %p\n", indent, "", dispatch->queue);
  fprintf (stderr, "%*skernarg_address: %p\n", indent, "", kernargs);
  fprintf (stderr, "%*sheap address: %p\n", indent, "",
	   (void *) kernargs->heap_ptr);
  fprintf (stderr, "%*sarena address: %p\n", indent, "",
	   (void *) kernargs->arena_ptr);
  fprintf (stderr, "%*sobject: %lu\n", indent, "", dispatch->object);
  fprintf (stderr, "%*sprivate_segment_size: %u\n", indent, "",
	   dispatch->private_segment_size);
  fprintf (stderr, "%*sgroup_segment_size: %u\n", indent, "",
	   dispatch->group_segment_size);
  fprintf (stderr, "\n");
}

/* }}}  */
/* {{{ Utility functions  */

/* Cast the thread local storage to gcn_thread.  */

static inline struct gcn_thread *
gcn_thread (void)
{
  return (struct gcn_thread *) GOMP_PLUGIN_acc_thread ();
}

/* Initialize debug, suppress_host_fallback and the other option globals
   according to the environment.  */

static void
init_environment_variables (void)
{
  if (secure_getenv ("GCN_DEBUG"))
    debug = true;
  else
    debug = false;

  if (secure_getenv ("GCN_SUPPRESS_HOST_FALLBACK"))
    suppress_host_fallback = true;
  else
    suppress_host_fallback = false;

  hsa_runtime_lib = secure_getenv ("HSA_RUNTIME_LIB");
  if (hsa_runtime_lib == NULL)
    hsa_runtime_lib = HSA_RUNTIME_LIB "libhsa-runtime64.so.1";

  support_cpu_devices = secure_getenv ("GCN_SUPPORT_CPU_DEVICES");

  const char *x = secure_getenv ("GCN_NUM_TEAMS");
  if (!x)
    x = secure_getenv ("GCN_NUM_GANGS");
  if (x)
    override_x_dim = atoi (x);

  const char *z = secure_getenv ("GCN_NUM_THREADS");
  if (!z)
    z = secure_getenv ("GCN_NUM_WORKERS");
  if (z)
    override_z_dim = atoi (z);

  const char *heap = secure_getenv ("GCN_HEAP_SIZE");
  if (heap)
    {
      size_t tmp = atol (heap);
      if (tmp)
	gcn_kernel_heap_size = tmp;
    }
}
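
/* For example (illustrative), running an offloaded program as

     GCN_DEBUG=1 GCN_NUM_TEAMS=4 GCN_HEAP_SIZE=268435456 ./a.out

   enables the debug output above, overrides the X (team) dimension to 4,
   and asks for a 256MB device heap.  */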

/* Return malloc'd string with name of SYMBOL.  */

static char *
get_executable_symbol_name (hsa_executable_symbol_t symbol)
{
  hsa_status_t status;
  char *res;
  uint32_t len;
  const hsa_executable_symbol_info_t info_name_length
    = HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH;

  status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name_length,
						      &len);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not get length of symbol name", status);
      return NULL;
    }

  res = GOMP_PLUGIN_malloc (len + 1);

  const hsa_executable_symbol_info_t info_name
    = HSA_EXECUTABLE_SYMBOL_INFO_NAME;

  status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name, res);

  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not get symbol name", status);
      free (res);
      return NULL;
    }

  res[len] = '\0';

  return res;
}

/* Get the number of GPU Compute Units.  */

static int
get_cu_count (struct agent_info *agent)
{
  uint32_t cu_count;
  hsa_status_t status = hsa_fns.hsa_agent_get_info_fn
    (agent->id, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
  if (status == HSA_STATUS_SUCCESS)
    return cu_count;
  else
    return 64;  /* The usual number for older devices.  */
}

/* Calculate the maximum grid size for OMP threads / OACC workers.
   This depends on the kernel's resource usage levels.  */

static int
limit_worker_threads (int threads)
{
  /* FIXME: Do something more intelligent here.
     GCN can always run 4 threads within a Compute Unit, but
     more than that depends on register usage.  */
  if (threads > 16)
    threads = 16;
  return threads;
}

/* Parse the target attributes INPUT provided by the compiler and return true
   if we should run anything at all.  If no launch attributes are found, fill
   DEF with default values; then store a pointer to the attributes (or to DEF)
   into *RESULT.

   This is used for OpenMP only.  */

static bool
parse_target_attributes (void **input,
			 struct GOMP_kernel_launch_attributes *def,
			 struct GOMP_kernel_launch_attributes **result,
			 struct agent_info *agent)
{
  if (!input)
    GOMP_PLUGIN_fatal ("No target arguments provided");

  bool grid_attrs_found = false;
  bool gcn_dims_found = false;
  int gcn_teams = 0;
  int gcn_threads = 0;
  while (*input)
    {
      intptr_t id = (intptr_t) *input++, val;

      if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
	val = (intptr_t) *input++;
      else
	val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;

      val = (val > INT_MAX) ? INT_MAX : val;

      if ((id & GOMP_TARGET_ARG_DEVICE_MASK) == GOMP_DEVICE_GCN
	  && ((id & GOMP_TARGET_ARG_ID_MASK)
	      == GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES))
	{
	  grid_attrs_found = true;
	  break;
	}
      else if ((id & GOMP_TARGET_ARG_DEVICE_MASK)
	       == GOMP_TARGET_ARG_DEVICE_ALL)
	{
	  gcn_dims_found = true;
	  switch (id & GOMP_TARGET_ARG_ID_MASK)
	    {
	    case GOMP_TARGET_ARG_NUM_TEAMS:
	      gcn_teams = val;
	      break;
	    case GOMP_TARGET_ARG_THREAD_LIMIT:
	      gcn_threads = limit_worker_threads (val);
	      break;
	    default:
	      ;
	    }
	}
    }

  if (gcn_dims_found)
    {
      if (agent->device_isa == EF_AMDGPU_MACH_AMDGCN_GFX900
	  && gcn_threads == 0 && override_z_dim == 0)
	{
	  gcn_threads = 4;
	  GCN_WARNING ("VEGA BUG WORKAROUND: reducing default number of "
		       "threads to 4 per team.\n");
	  GCN_WARNING (" - If this is not a Vega 10 device, please use "
		       "GCN_NUM_THREADS=16\n");
	}

      def->ndim = 3;
      /* Fiji has 64 CUs, but Vega20 has 60.  */
      def->gdims[0] = (gcn_teams > 0) ? gcn_teams : get_cu_count (agent);
      /* Each thread is 64 work items wide.  */
      def->gdims[1] = 64;
      /* A work group can have 16 wavefronts.  */
      def->gdims[2] = (gcn_threads > 0) ? gcn_threads : 16;
      def->wdims[0] = 1;  /* Single team per work-group.  */
      def->wdims[1] = 64;
      def->wdims[2] = 16;
      *result = def;
      return true;
    }
  else if (!grid_attrs_found)
    {
      def->ndim = 1;
      def->gdims[0] = 1;
      def->gdims[1] = 1;
      def->gdims[2] = 1;
      def->wdims[0] = 1;
      def->wdims[1] = 1;
      def->wdims[2] = 1;
      *result = def;
      GCN_WARNING ("GOMP_OFFLOAD_run called with no launch attributes\n");
      return true;
    }

  struct GOMP_kernel_launch_attributes *kla;
  kla = (struct GOMP_kernel_launch_attributes *) *input;
  *result = kla;
  if (kla->ndim == 0 || kla->ndim > 3)
    GOMP_PLUGIN_fatal ("Invalid number of dimensions (%u)", kla->ndim);

  GCN_DEBUG ("GOMP_OFFLOAD_run called with %u dimensions:\n", kla->ndim);
  unsigned i;
  for (i = 0; i < kla->ndim; i++)
    {
      GCN_DEBUG (" Dimension %u: grid size %u and group size %u\n", i,
		 kla->gdims[i], kla->wdims[i]);
      if (kla->gdims[i] == 0)
	return false;
    }
  return true;
}

/* Return the group size given the requested GROUP size, GRID size and number
   of grid dimensions NDIM.  */

static uint32_t
get_group_size (uint32_t ndim, uint32_t grid, uint32_t group)
{
  if (group == 0)
    {
      /* TODO: Provide a default via environment or device characteristics.  */
      if (ndim == 1)
	group = 64;
      else if (ndim == 2)
	group = 8;
      else
	group = 4;
    }

  if (group > grid)
    group = grid;
  return group;
}
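
/* Worked example (illustrative): get_group_size (1, 37, 0) first picks the
   1-D default group size of 64, then clamps it to the grid, so a grid of
   37 work items yields a group size of 37.  */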

/* Atomically store pair of uint16_t values (HEADER and REST) to a PACKET.  */

static void
packet_store_release (uint32_t *packet, uint16_t header, uint16_t rest)
{
  __atomic_store_n (packet, header | (rest << 16), __ATOMIC_RELEASE);
}
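
/* Illustrative use (a sketch of what the dispatch code later in this file
   does; the exact call there may differ):

     packet_store_release ((uint32_t *) packet, header,
			   (uint16_t) kla->ndim
			   << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);

   Publishing the 16-bit header and 16-bit setup field in one atomic release
   store is the point at which the AQL packet becomes visible to the GPU's
   packet processor, so it must happen after all other packet fields have
   been written.  */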

/* A never-called callback for the HSA command queues.  These signal events
   that we don't use, so we trigger an error.

   This "queue" is not to be confused with the async queues, below.  */

static void
hsa_queue_callback (hsa_status_t status,
		    hsa_queue_t *queue __attribute__ ((unused)),
		    void *data __attribute__ ((unused)))
{
  hsa_fatal ("Asynchronous queue error", status);
}

/* }}}  */
/* {{{ HSA initialization  */

/* Populate hsa_fns with the function addresses from libhsa-runtime64.so.  */

static bool
init_hsa_runtime_functions (void)
{
#define DLSYM_FN(function) \
  hsa_fns.function##_fn = dlsym (handle, #function); \
  if (hsa_fns.function##_fn == NULL) \
    return false;
  void *handle = dlopen (hsa_runtime_lib, RTLD_LAZY);
  if (handle == NULL)
    return false;

  DLSYM_FN (hsa_status_string)
  DLSYM_FN (hsa_system_get_info)
  DLSYM_FN (hsa_agent_get_info)
  DLSYM_FN (hsa_init)
  DLSYM_FN (hsa_iterate_agents)
  DLSYM_FN (hsa_region_get_info)
  DLSYM_FN (hsa_queue_create)
  DLSYM_FN (hsa_agent_iterate_regions)
  DLSYM_FN (hsa_executable_destroy)
  DLSYM_FN (hsa_executable_create)
  DLSYM_FN (hsa_executable_global_variable_define)
  DLSYM_FN (hsa_executable_load_code_object)
  DLSYM_FN (hsa_executable_freeze)
  DLSYM_FN (hsa_signal_create)
  DLSYM_FN (hsa_memory_allocate)
  DLSYM_FN (hsa_memory_assign_agent)
  DLSYM_FN (hsa_memory_copy)
  DLSYM_FN (hsa_memory_free)
  DLSYM_FN (hsa_signal_destroy)
  DLSYM_FN (hsa_executable_get_symbol)
  DLSYM_FN (hsa_executable_symbol_get_info)
  DLSYM_FN (hsa_executable_iterate_symbols)
  DLSYM_FN (hsa_queue_add_write_index_release)
  DLSYM_FN (hsa_queue_load_read_index_acquire)
  DLSYM_FN (hsa_signal_wait_acquire)
  DLSYM_FN (hsa_signal_store_relaxed)
  DLSYM_FN (hsa_signal_store_release)
  DLSYM_FN (hsa_signal_load_acquire)
  DLSYM_FN (hsa_queue_destroy)
  DLSYM_FN (hsa_code_object_deserialize)
  return true;
#undef DLSYM_FN
}

/* Return true if the agent is a GPU and can accept concurrent submissions
   from different threads.  */

static bool
suitable_hsa_agent_p (hsa_agent_t agent)
{
  hsa_device_type_t device_type;
  hsa_status_t status
    = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
				     &device_type);
  if (status != HSA_STATUS_SUCCESS)
    return false;

  switch (device_type)
    {
    case HSA_DEVICE_TYPE_GPU:
      break;
    case HSA_DEVICE_TYPE_CPU:
      if (!support_cpu_devices)
	return false;
      break;
    default:
      return false;
    }

  uint32_t features = 0;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_FEATURE,
					  &features);
  if (status != HSA_STATUS_SUCCESS
      || !(features & HSA_AGENT_FEATURE_KERNEL_DISPATCH))
    return false;
  hsa_queue_type_t queue_type;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_QUEUE_TYPE,
					  &queue_type);
  if (status != HSA_STATUS_SUCCESS
      || (queue_type != HSA_QUEUE_TYPE_MULTI))
    return false;

  return true;
}

/* Callback of hsa_iterate_agents; if AGENT is a GPU device, increment
   agent_count in hsa_context.  */

static hsa_status_t
count_gpu_agents (hsa_agent_t agent, void *data __attribute__ ((unused)))
{
  if (suitable_hsa_agent_p (agent))
    hsa_context.agent_count++;
  return HSA_STATUS_SUCCESS;
}

/* Callback of hsa_iterate_agents; if AGENT is a GPU device, assign the agent
   id to the describing structure in the hsa context.  The index of the
   structure is pointed to by DATA; increment it afterwards.  */

static hsa_status_t
assign_agent_ids (hsa_agent_t agent, void *data)
{
  if (suitable_hsa_agent_p (agent))
    {
      int *agent_index = (int *) data;
      hsa_context.agents[*agent_index].id = agent;
      ++*agent_index;
    }
  return HSA_STATUS_SUCCESS;
}

/* Initialize hsa_context if it has not already been done.
   Return TRUE on success.  */

static bool
init_hsa_context (void)
{
  hsa_status_t status;
  int agent_index = 0;

  if (hsa_context.initialized)
    return true;
  init_environment_variables ();
  if (!init_hsa_runtime_functions ())
    {
      GCN_WARNING ("Run-time could not be dynamically opened\n");
      if (suppress_host_fallback)
	GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
      return false;
    }
  status = hsa_fns.hsa_init_fn ();
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Run-time could not be initialized", status);
  GCN_DEBUG ("HSA run-time initialized for GCN\n");

  if (debug)
    dump_hsa_system_info ();

  status = hsa_fns.hsa_iterate_agents_fn (count_gpu_agents, NULL);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("GCN GPU devices could not be enumerated", status);
  GCN_DEBUG ("There are %i GCN GPU devices.\n", hsa_context.agent_count);

  hsa_context.agents
    = GOMP_PLUGIN_malloc_cleared (hsa_context.agent_count
				  * sizeof (struct agent_info));
  status = hsa_fns.hsa_iterate_agents_fn (assign_agent_ids, &agent_index);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Scanning compute agents failed", status);
  if (agent_index != hsa_context.agent_count)
    {
      GOMP_PLUGIN_error ("Failed to assign IDs to all GCN agents");
      return false;
    }

  if (debug)
    {
      status = hsa_fns.hsa_iterate_agents_fn (dump_hsa_agent_info, NULL);
      if (status != HSA_STATUS_SUCCESS)
	GOMP_PLUGIN_error ("Failed to list all HSA runtime agents");
    }

  uint16_t minor, major;
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MINOR,
					   &minor);
  if (status != HSA_STATUS_SUCCESS)
    GOMP_PLUGIN_error ("Failed to obtain HSA runtime minor version");
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_VERSION_MAJOR,
					   &major);
  if (status != HSA_STATUS_SUCCESS)
    GOMP_PLUGIN_error ("Failed to obtain HSA runtime major version");

  size_t len = sizeof hsa_context.driver_version_s;
  int printed = snprintf (hsa_context.driver_version_s, len,
			  "HSA Runtime %hu.%hu", (unsigned short int) major,
			  (unsigned short int) minor);
  if (printed >= len)
    GCN_WARNING ("HSA runtime version string was truncated.  "
		 "Version %hu.%hu is too long.", (unsigned short int) major,
		 (unsigned short int) minor);

  hsa_context.initialized = true;
  return true;
}

/* Verify that hsa_context has already been initialized and return the
   agent_info structure describing device number N.  Return NULL on error.  */

static struct agent_info *
get_agent_info (int n)
{
  if (!hsa_context.initialized)
    {
      GOMP_PLUGIN_error ("Attempt to use uninitialized GCN context.");
      return NULL;
    }
  if (n >= hsa_context.agent_count)
    {
      GOMP_PLUGIN_error ("Request to operate on non-existent GCN device %i", n);
      return NULL;
    }
  if (!hsa_context.agents[n].initialized)
    {
      GOMP_PLUGIN_error ("Attempt to use an uninitialized GCN agent.");
      return NULL;
    }
  return &hsa_context.agents[n];
}

/* Callback of hsa_agent_iterate_regions, via get_*_memory_region functions.

   Selects (breaks at) a suitable region of type KIND.  */

static hsa_status_t
get_memory_region (hsa_region_t region, hsa_region_t *retval,
		   hsa_region_global_flag_t kind)
{
  hsa_status_t status;
  hsa_region_segment_t segment;

  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
					   &segment);
  if (status != HSA_STATUS_SUCCESS)
    return status;
  if (segment != HSA_REGION_SEGMENT_GLOBAL)
    return HSA_STATUS_SUCCESS;

  uint32_t flags;
  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
					   &flags);
  if (status != HSA_STATUS_SUCCESS)
    return status;
  if (flags & kind)
    {
      *retval = region;
      return HSA_STATUS_INFO_BREAK;
    }
  return HSA_STATUS_SUCCESS;
}

/* Callback of hsa_agent_iterate_regions.

   Selects a kernargs memory region.  */

static hsa_status_t
get_kernarg_memory_region (hsa_region_t region, void *data)
{
  return get_memory_region (region, (hsa_region_t *) data,
			    HSA_REGION_GLOBAL_FLAG_KERNARG);
}

/* Callback of hsa_agent_iterate_regions.

   Selects a coarse-grained memory region suitable for the heap and
   offload data.  */

static hsa_status_t
get_data_memory_region (hsa_region_t region, void *data)
{
  return get_memory_region (region, (hsa_region_t *) data,
			    HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED);
}
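
/* Illustrative sketch (an approximation of the device-initialization code
   later in this file): the two callbacks above are driven like so, with
   HSA_STATUS_INFO_BREAK indicating that a suitable region was found:

     hsa_region_t region;
     region.handle = (uint64_t) -1;
     hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
					   get_kernarg_memory_region,
					   &region);
     if (region.handle == (uint64_t) -1)
       GOMP_PLUGIN_error ("Could not find a kernarg memory region");  */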

static int
elf_gcn_isa_field (Elf64_Ehdr *image)
{
  return image->e_flags & EF_AMDGPU_MACH_MASK;
}

const static char *gcn_gfx803_s = "gfx803";
const static char *gcn_gfx900_s = "gfx900";
const static char *gcn_gfx906_s = "gfx906";
const static int gcn_isa_name_len = 6;

/* Returns the name that the HSA runtime uses for the ISA or NULL if we do not
   support the ISA.  */

static const char *
isa_hsa_name (int isa)
{
  switch (isa)
    {
    case EF_AMDGPU_MACH_AMDGCN_GFX803:
      return gcn_gfx803_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX900:
      return gcn_gfx900_s;
    case EF_AMDGPU_MACH_AMDGCN_GFX906:
      return gcn_gfx906_s;
    }
  return NULL;
}

/* Returns the user-facing name that GCC uses to identify the architecture
   (e.g. with -march) or NULL if we do not support the ISA.
   Keep in sync with gcc/config/gcn/gcn.{c,opt}.  */

static const char *
isa_gcc_name (int isa)
{
  switch (isa)
    {
    case EF_AMDGPU_MACH_AMDGCN_GFX803:
      return "fiji";
    default:
      return isa_hsa_name (isa);
    }
}

/* Returns the code which is used in the GCN object code to identify the ISA
   with the given name (as used by the HSA runtime).  */

static gcn_isa
isa_code (const char *isa)
{
  if (!strncmp (isa, gcn_gfx803_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX803;

  if (!strncmp (isa, gcn_gfx900_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX900;

  if (!strncmp (isa, gcn_gfx906_s, gcn_isa_name_len))
    return EF_AMDGPU_MACH_AMDGCN_GFX906;

  return -1;
}

/* }}}  */
/* {{{ Run  */

/* Create or reuse a team arena.

   Team arenas are used by OpenMP to avoid calling malloc multiple times
   while setting up each team.  This is purely a performance optimization.

   Allocating an arena also costs performance, albeit on the host side, so
   this function will reuse an existing arena if a large enough one is idle.
   The arena is released, but not deallocated, when the kernel exits.  */

static void *
get_team_arena (struct agent_info *agent, int num_teams)
{
  struct team_arena_list **next_ptr = &agent->team_arena_list;
  struct team_arena_list *item;

  for (item = *next_ptr; item; next_ptr = &item->next, item = item->next)
    {
      if (item->num_teams < num_teams)
	continue;

      if (pthread_mutex_trylock (&item->in_use))
	continue;

      return item->arena;
    }

  GCN_DEBUG ("Creating a new arena for %d teams\n", num_teams);

  if (pthread_mutex_lock (&agent->team_arena_write_lock))
    {
      GOMP_PLUGIN_error ("Could not lock a GCN team arena write mutex");
      return NULL;
    }
  item = malloc (sizeof (*item));
  item->num_teams = num_teams;
  item->next = NULL;
  *next_ptr = item;

  if (pthread_mutex_init (&item->in_use, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN team arena mutex");
      return NULL;
    }
  if (pthread_mutex_lock (&item->in_use))
    {
      GOMP_PLUGIN_error ("Could not lock a GCN team arena mutex");
      return NULL;
    }
  if (pthread_mutex_unlock (&agent->team_arena_write_lock))
    {
      GOMP_PLUGIN_error ("Could not unlock a GCN team arena write mutex");
      return NULL;
    }

  const int TEAM_ARENA_SIZE = 64*1024;  /* Must match libgomp.h.  */
  hsa_status_t status;
  status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
					   TEAM_ARENA_SIZE*num_teams,
					   &item->arena);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not allocate memory for GCN kernel arena", status);
  status = hsa_fns.hsa_memory_assign_agent_fn (item->arena, agent->id,
					       HSA_ACCESS_PERMISSION_RW);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not assign arena memory to device", status);

  return item->arena;
}

/* Mark a team arena available for reuse.  */

static void
release_team_arena (struct agent_info *agent, void *arena)
{
  struct team_arena_list *item;

  for (item = agent->team_arena_list; item; item = item->next)
    {
      if (item->arena == arena)
	{
	  if (pthread_mutex_unlock (&item->in_use))
	    GOMP_PLUGIN_error ("Could not unlock a GCN team arena mutex");
	  return;
	}
    }
  GOMP_PLUGIN_error ("Could not find a GCN arena to release.");
}

/* Clean up all the allocated team arenas.  */

static bool
destroy_team_arenas (struct agent_info *agent)
{
  struct team_arena_list *item, *next;

  for (item = agent->team_arena_list; item; item = next)
    {
      next = item->next;
      hsa_fns.hsa_memory_free_fn (item->arena);
      if (pthread_mutex_destroy (&item->in_use))
	{
	  GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex");
	  return false;
	}
      free (item);
    }
  agent->team_arena_list = NULL;

  return true;
}
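
/* Typical arena lifecycle (illustrative): create_kernel_dispatch, below,
   calls get_team_arena before an OpenMP launch; the matching
   release_team_arena runs once the kernel has finished, and
   destroy_team_arenas frees everything when the agent shuts down.  */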
1767
1768 /* Allocate memory on a specified device. */
1769
1770 static void *
1771 alloc_by_agent (struct agent_info *agent, size_t size)
1772 {
1773 GCN_DEBUG ("Allocating %zu bytes on device %d\n", size, agent->device_id);
1774
1775 /* Zero-size allocations are invalid, so in order to return a valid pointer
1776 we need to pass a valid size. One source of zero-size allocations is
1777 kernargs for kernels that have no inputs or outputs (the kernel may
1778 only use console output, for example). */
1779 if (size == 0)
1780 size = 4;
1781
1782 void *ptr;
1783 hsa_status_t status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
1784 size, &ptr);
1785 if (status != HSA_STATUS_SUCCESS)
1786 {
1787 hsa_error ("Could not allocate device memory", status);
1788 return NULL;
1789 }
1790
1791 status = hsa_fns.hsa_memory_assign_agent_fn (ptr, agent->id,
1792 HSA_ACCESS_PERMISSION_RW);
1793 if (status != HSA_STATUS_SUCCESS)
1794 {
1795 hsa_error ("Could not assign data memory to device", status);
1796 return NULL;
1797 }
1798
1799 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1800 bool profiling_dispatch_p
1801 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1802 if (profiling_dispatch_p)
1803 {
1804 acc_prof_info *prof_info = thr->prof_info;
1805 acc_event_info data_event_info;
1806 acc_api_info *api_info = thr->api_info;
1807
1808 prof_info->event_type = acc_ev_alloc;
1809
1810 data_event_info.data_event.event_type = prof_info->event_type;
1811 data_event_info.data_event.valid_bytes
1812 = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1813 data_event_info.data_event.parent_construct
1814 = acc_construct_parallel;
1815 data_event_info.data_event.implicit = 1;
1816 data_event_info.data_event.tool_info = NULL;
1817 data_event_info.data_event.var_name = NULL;
1818 data_event_info.data_event.bytes = size;
1819 data_event_info.data_event.host_ptr = NULL;
1820 data_event_info.data_event.device_ptr = (void *) ptr;
1821
1822 api_info->device_api = acc_device_api_other;
1823
1824 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1825 api_info);
1826 }
1827
1828 return ptr;
1829 }
1830
1831 /* Create kernel dispatch data structure for given KERNEL, along with
1832 the necessary device signals and memory allocations. */
1833
1834 static struct kernel_dispatch *
1835 create_kernel_dispatch (struct kernel_info *kernel, int num_teams)
1836 {
1837 struct agent_info *agent = kernel->agent;
1838 struct kernel_dispatch *shadow
1839 = GOMP_PLUGIN_malloc_cleared (sizeof (struct kernel_dispatch));
1840
1841 shadow->agent = kernel->agent;
1842 shadow->object = kernel->object;
1843
1844 hsa_signal_t sync_signal;
1845 hsa_status_t status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &sync_signal);
1846 if (status != HSA_STATUS_SUCCESS)
1847 hsa_fatal ("Error creating the GCN sync signal", status);
1848
1849 shadow->signal = sync_signal.handle;
1850 shadow->private_segment_size = kernel->private_segment_size;
1851 shadow->group_segment_size = kernel->group_segment_size;
1852
1853 /* We expect kernels to request a single pointer, explicitly, and the
1854 rest of struct kernargs, implicitly. If they request anything else
1855 then something is wrong. */
1856 if (kernel->kernarg_segment_size > 8)
1857 {
1858 GOMP_PLUGIN_fatal ("Unexpectedly large kernargs segment requested");
1859 return NULL;
1860 }
1861
1862 status = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region,
1863 sizeof (struct kernargs),
1864 &shadow->kernarg_address);
1865 if (status != HSA_STATUS_SUCCESS)
1866 hsa_fatal ("Could not allocate memory for GCN kernel arguments", status);
1867 struct kernargs *kernargs = shadow->kernarg_address;
1868
1869 /* Zero-initialize the output_data (minimum needed). */
1870 kernargs->out_ptr = (int64_t)&kernargs->output_data;
1871 kernargs->output_data.next_output = 0;
1872 for (unsigned i = 0;
1873 i < (sizeof (kernargs->output_data.queue)
1874 / sizeof (kernargs->output_data.queue[0]));
1875 i++)
1876 kernargs->output_data.queue[i].written = 0;
1877 kernargs->output_data.consumed = 0;
1878
1879 /* Pass in the heap location. */
1880 kernargs->heap_ptr = (int64_t)kernel->module->heap;
1881
1882 /* Create an arena. */
1883 if (kernel->kind == KIND_OPENMP)
1884 kernargs->arena_ptr = (int64_t)get_team_arena (agent, num_teams);
1885 else
1886 kernargs->arena_ptr = 0;
1887
1888 /* Ensure we can recognize unset return values. */
1889 kernargs->output_data.return_value = 0xcafe0000;
1890
1891 return shadow;
1892 }
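/* Illustrative only (a hypothetical view; the real struct kernargs is
   declared elsewhere in this file): the kernel's single explicit 8-byte
   argument occupies the start of the kernarg allocation, which is where
   run_kernel later copies VARS, and the implicit remainder of struct
   kernargs follows it in the same buffer.  */
#if 0
struct example_kernarg_view
{
  void *vars;	/* The one explicit kernel argument.  */
  /* ... the implicit remainder of struct kernargs follows here ...  */
};
#endif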
1893
1894 /* Output any data the kernel has written to console output. This function
1895 is expected to be called repeatedly (polled) while the kernel executes.
1896
1897 We print all entries from the last item printed up to the first entry
1898 without a "written" flag. If the "final" flag is set then we continue
1899 right to the end.
1900
1901 The print buffer is circular, but the from and to locations don't wrap when
1902 the buffer does, so the output limit is UINT_MAX. The target blocks on
1903 output when the buffer is full. */
1904
1905 static void
1906 console_output (struct kernel_info *kernel, struct kernargs *kernargs,
1907 bool final)
1908 {
1909 unsigned int limit = (sizeof (kernargs->output_data.queue)
1910 / sizeof (kernargs->output_data.queue[0]));
1911
1912 unsigned int from = __atomic_load_n (&kernargs->output_data.consumed,
1913 __ATOMIC_ACQUIRE);
1914 unsigned int to = kernargs->output_data.next_output;
1915
1916 if (from > to)
1917 {
1918 /* Overflow. */
1919 if (final)
1920 printf ("GCN print buffer overflowed.\n");
1921 return;
1922 }
1923
1924 unsigned int i;
1925 for (i = from; i < to; i++)
1926 {
1927 struct printf_data *data = &kernargs->output_data.queue[i%limit];
1928
1929 if (!data->written && !final)
1930 break;
1931
1932 switch (data->type)
1933 {
1934 case 0: printf ("%.128s%ld\n", data->msg, data->ivalue); break;
1935 case 1: printf ("%.128s%f\n", data->msg, data->dvalue); break;
1936 case 2: printf ("%.128s%.128s\n", data->msg, data->text); break;
1937 case 3: printf ("%.128s%.128s", data->msg, data->text); break;
1938 default: printf ("GCN print buffer error!\n"); break;
1939 }
1940 data->written = 0;
1941 __atomic_store_n (&kernargs->output_data.consumed, i+1,
1942 __ATOMIC_RELEASE);
1943 }
1944 fflush (stdout);
1945 }
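/* A minimal sketch of the index scheme used above, under the same
   assumptions: "consumed" and "next_output" only ever increase, and just
   the slot lookup wraps.  The helper name is hypothetical.  */
#if 0
static struct printf_data *
example_slot (struct kernargs *kernargs, unsigned int i)
{
  unsigned int limit = (sizeof (kernargs->output_data.queue)
			/ sizeof (kernargs->output_data.queue[0]));
  /* I runs from "consumed" towards "next_output" without wrapping;
     reducing it modulo the queue length addresses the circular buffer.  */
  return &kernargs->output_data.queue[i % limit];
}
#endif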
1946
1947 /* Release data structure created for a kernel dispatch in SHADOW argument,
1948 and clean up the signal and memory allocations. */
1949
1950 static void
1951 release_kernel_dispatch (struct kernel_dispatch *shadow)
1952 {
1953 GCN_DEBUG ("Released kernel dispatch: %p\n", shadow);
1954
1955 struct kernargs *kernargs = shadow->kernarg_address;
1956 void *arena = (void *)kernargs->arena_ptr;
1957 if (arena)
1958 release_team_arena (shadow->agent, arena);
1959
1960 hsa_fns.hsa_memory_free_fn (shadow->kernarg_address);
1961
1962 hsa_signal_t s;
1963 s.handle = shadow->signal;
1964 hsa_fns.hsa_signal_destroy_fn (s);
1965
1966 free (shadow);
1967 }
1968
1969 /* Extract the properties from a kernel binary. */
1970
1971 static void
1972 init_kernel_properties (struct kernel_info *kernel)
1973 {
1974 hsa_status_t status;
1975 struct agent_info *agent = kernel->agent;
1976 hsa_executable_symbol_t kernel_symbol;
1977 char *buf = alloca (strlen (kernel->name) + 4);
1978 sprintf (buf, "%s.kd", kernel->name);
1979 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
1980 buf, agent->id,
1981 0, &kernel_symbol);
1982 if (status != HSA_STATUS_SUCCESS)
1983 {
1984 hsa_warn ("Could not find symbol for kernel in the code object", status);
1985 fprintf (stderr, "kernel symbol not found: '%s'\n", buf);
1986 dump_executable_symbols (agent->executable);
1987 goto failure;
1988 }
1989 GCN_DEBUG ("Located kernel %s\n", kernel->name);
1990 status = hsa_fns.hsa_executable_symbol_get_info_fn
1991 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel->object);
1992 if (status != HSA_STATUS_SUCCESS)
1993 hsa_fatal ("Could not extract a kernel object from its symbol", status);
1994 status = hsa_fns.hsa_executable_symbol_get_info_fn
1995 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
1996 &kernel->kernarg_segment_size);
1997 if (status != HSA_STATUS_SUCCESS)
1998 hsa_fatal ("Could not get info about kernel argument size", status);
1999 status = hsa_fns.hsa_executable_symbol_get_info_fn
2000 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
2001 &kernel->group_segment_size);
2002 if (status != HSA_STATUS_SUCCESS)
2003 hsa_fatal ("Could not get info about kernel group segment size", status);
2004 status = hsa_fns.hsa_executable_symbol_get_info_fn
2005 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
2006 &kernel->private_segment_size);
2007 if (status != HSA_STATUS_SUCCESS)
2008 hsa_fatal ("Could not get info about kernel private segment size",
2009 status);
2010
2011 /* The kernel type is not known until something tries to launch it. */
2012 kernel->kind = KIND_UNKNOWN;
2013
2014 GCN_DEBUG ("Kernel structure for %s fully initialized with "
2015 "the following segment sizes:\n", kernel->name);
2016 GCN_DEBUG (" group_segment_size: %u\n",
2017 (unsigned) kernel->group_segment_size);
2018 GCN_DEBUG (" private_segment_size: %u\n",
2019 (unsigned) kernel->private_segment_size);
2020 GCN_DEBUG (" kernarg_segment_size: %u\n",
2021 (unsigned) kernel->kernarg_segment_size);
2022 return;
2023
2024 failure:
2025 kernel->initialization_failed = true;
2026 }
2027
2028 /* Do all the work that is necessary before running KERNEL for the first time.
2029 The function assumes the program has been created, finalized and frozen by
2030 create_and_finalize_hsa_program. */
2031
2032 static void
2033 init_kernel (struct kernel_info *kernel)
2034 {
2035 if (pthread_mutex_lock (&kernel->init_mutex))
2036 GOMP_PLUGIN_fatal ("Could not lock a GCN kernel initialization mutex");
2037 if (kernel->initialized)
2038 {
2039 if (pthread_mutex_unlock (&kernel->init_mutex))
2040 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2041 "mutex");
2042
2043 return;
2044 }
2045
2046 init_kernel_properties (kernel);
2047
2048 if (!kernel->initialization_failed)
2049 {
2050 GCN_DEBUG ("\n");
2051
2052 kernel->initialized = true;
2053 }
2054 if (pthread_mutex_unlock (&kernel->init_mutex))
2055 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
2056 "mutex");
2057 }
2058
2059 /* Run KERNEL on its agent, pass VARS to it as arguments and take
2060 launch attributes from KLA.
2061
2062 MODULE_LOCKED indicates that the caller already holds the lock and
2063 run_kernel need not lock it again.
2064 If AQ is NULL then agent->sync_queue will be used. */
2065
2066 static void
2067 run_kernel (struct kernel_info *kernel, void *vars,
2068 struct GOMP_kernel_launch_attributes *kla,
2069 struct goacc_asyncqueue *aq, bool module_locked)
2070 {
2071 GCN_DEBUG ("SGPRs: %d, VGPRs: %d\n", kernel->description->sgpr_count,
2072 kernel->description->vpgr_count);
2073
2074 /* Reduce the number of threads/workers if there are insufficient
2075 VGPRs for them all to be resident on a compute unit at once. */
2076 if (kla->ndim == 3 && kernel->description->vpgr_count > 0)
2077 {
2078 int granulated_vgprs = (kernel->description->vpgr_count + 3) & ~3;
2079 int max_threads = (256 / granulated_vgprs) * 4;
2080 if (kla->gdims[2] > max_threads)
2081 {
2082 GCN_WARNING ("Too many VGPRs required to support %d threads/workers"
2083 " per team/gang - reducing to %d threads/workers.\n",
2084 kla->gdims[2], max_threads);
2085 kla->gdims[2] = max_threads;
2086 }
2087 }
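/* Worked example: a kernel needing 41 VGPRs is granulated up to 44,
   giving (256 / 44) * 4 == 20 threads/workers at most per team/gang.  */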
2088
2089 GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel->agent->device_id,
2090 (aq ? aq->id : 0));
2091 GCN_DEBUG ("GCN launch attribs: gdims:[");
2092 int i;
2093 for (i = 0; i < kla->ndim; ++i)
2094 {
2095 if (i)
2096 DEBUG_PRINT (", ");
2097 DEBUG_PRINT ("%u", kla->gdims[i]);
2098 }
2099 DEBUG_PRINT ("], normalized gdims:[");
2100 for (i = 0; i < kla->ndim; ++i)
2101 {
2102 if (i)
2103 DEBUG_PRINT (", ");
2104 DEBUG_PRINT ("%u", kla->gdims[i] / kla->wdims[i]);
2105 }
2106 DEBUG_PRINT ("], wdims:[");
2107 for (i = 0; i < kla->ndim; ++i)
2108 {
2109 if (i)
2110 DEBUG_PRINT (", ");
2111 DEBUG_PRINT ("%u", kla->wdims[i]);
2112 }
2113 DEBUG_PRINT ("]\n");
2114 DEBUG_FLUSH ();
2115
2116 struct agent_info *agent = kernel->agent;
2117 if (!module_locked && pthread_rwlock_rdlock (&agent->module_rwlock))
2118 GOMP_PLUGIN_fatal ("Unable to read-lock a GCN agent rwlock");
2119
2120 if (!agent->initialized)
2121 GOMP_PLUGIN_fatal ("Agent must be initialized");
2122
2123 if (!kernel->initialized)
2124 GOMP_PLUGIN_fatal ("Called kernel must be initialized");
2125
2126 hsa_queue_t *command_q = (aq ? aq->hsa_queue : kernel->agent->sync_queue);
2127
2128 uint64_t index
2129 = hsa_fns.hsa_queue_add_write_index_release_fn (command_q, 1);
2130 GCN_DEBUG ("Got AQL index %llu\n", (unsigned long long) index);
2131
2132 /* Wait until the queue is not full before writing the packet. */
2133 while (index - hsa_fns.hsa_queue_load_read_index_acquire_fn (command_q)
2134 >= command_q->size)
2135 ;
2136
2137 /* Do not allow the dimensions to be overridden when running
2138 constructors or destructors. */
2139 int override_x = kernel->kind == KIND_UNKNOWN ? 0 : override_x_dim;
2140 int override_z = kernel->kind == KIND_UNKNOWN ? 0 : override_z_dim;
2141
2142 hsa_kernel_dispatch_packet_t *packet;
2143 packet = ((hsa_kernel_dispatch_packet_t *) command_q->base_address)
2144 + index % command_q->size;
2145
2146 memset (((uint8_t *) packet) + 4, 0, sizeof (*packet) - 4);
2147 packet->grid_size_x = override_x ? : kla->gdims[0];
2148 packet->workgroup_size_x = get_group_size (kla->ndim,
2149 packet->grid_size_x,
2150 kla->wdims[0]);
2151
2152 if (kla->ndim >= 2)
2153 {
2154 packet->grid_size_y = kla->gdims[1];
2155 packet->workgroup_size_y = get_group_size (kla->ndim, kla->gdims[1],
2156 kla->wdims[1]);
2157 }
2158 else
2159 {
2160 packet->grid_size_y = 1;
2161 packet->workgroup_size_y = 1;
2162 }
2163
2164 if (kla->ndim == 3)
2165 {
2166 packet->grid_size_z = limit_worker_threads (override_z
2167 ? : kla->gdims[2]);
2168 packet->workgroup_size_z = get_group_size (kla->ndim,
2169 packet->grid_size_z,
2170 kla->wdims[2]);
2171 }
2172 else
2173 {
2174 packet->grid_size_z = 1;
2175 packet->workgroup_size_z = 1;
2176 }
2177
2178 GCN_DEBUG ("GCN launch actuals: grid:[%u, %u, %u],"
2179 " normalized grid:[%u, %u, %u], workgroup:[%u, %u, %u]\n",
2180 packet->grid_size_x, packet->grid_size_y, packet->grid_size_z,
2181 packet->grid_size_x / packet->workgroup_size_x,
2182 packet->grid_size_y / packet->workgroup_size_y,
2183 packet->grid_size_z / packet->workgroup_size_z,
2184 packet->workgroup_size_x, packet->workgroup_size_y,
2185 packet->workgroup_size_z);
2186
2187 struct kernel_dispatch *shadow
2188 = create_kernel_dispatch (kernel, packet->grid_size_x);
2189 shadow->queue = command_q;
2190
2191 if (debug)
2192 {
2193 fprintf (stderr, "\nKernel has following dependencies:\n");
2194 print_kernel_dispatch (shadow, 2);
2195 }
2196
2197 packet->private_segment_size = kernel->private_segment_size;
2198 packet->group_segment_size = kernel->group_segment_size;
2199 packet->kernel_object = kernel->object;
2200 packet->kernarg_address = shadow->kernarg_address;
2201 hsa_signal_t s;
2202 s.handle = shadow->signal;
2203 packet->completion_signal = s;
2204 hsa_fns.hsa_signal_store_relaxed_fn (s, 1);
2205 memcpy (shadow->kernarg_address, &vars, sizeof (vars));
2206
2207 GCN_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");
2208
2209 uint16_t header;
2210 header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
2211 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
2212 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
2213
2214 GCN_DEBUG ("Going to dispatch kernel %s on device %d\n", kernel->name,
2215 agent->device_id);
2216
2217 packet_store_release ((uint32_t *) packet, header,
2218 (uint16_t) kla->ndim
2219 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);
2220
2221 hsa_fns.hsa_signal_store_release_fn (command_q->doorbell_signal,
2222 index);
2223
2224 GCN_DEBUG ("Kernel dispatched, waiting for completion\n");
2225
2226 /* Wait for the kernel to complete, with a 1ms timeout so that console output is polled regularly. */
2227 while (hsa_fns.hsa_signal_wait_acquire_fn (s, HSA_SIGNAL_CONDITION_LT, 1,
2228 1000 * 1000,
2229 HSA_WAIT_STATE_BLOCKED) != 0)
2230 {
2231 console_output (kernel, shadow->kernarg_address, false);
2232 }
2233 console_output (kernel, shadow->kernarg_address, true);
2234
2235 struct kernargs *kernargs = shadow->kernarg_address;
2236 unsigned int return_value = (unsigned int)kernargs->output_data.return_value;
2237
2238 release_kernel_dispatch (shadow);
2239
2240 if (!module_locked && pthread_rwlock_unlock (&agent->module_rwlock))
2241 GOMP_PLUGIN_fatal ("Unable to unlock a GCN agent rwlock");
2242
2243 unsigned int upper = (return_value & ~0xffff) >> 16;
2244 if (upper == 0xcafe)
2245 ; // exit not called, normal termination.
2246 else if (upper == 0xffff)
2247 ; // exit called.
2248 else
2249 {
2250 GOMP_PLUGIN_error ("Possible kernel exit value corruption, 2 most"
2251 " significant bytes aren't 0xffff or 0xcafe: 0x%x\n",
2252 return_value);
2253 abort ();
2254 }
2255
2256 if (upper == 0xffff)
2257 {
2258 unsigned int signal = (return_value >> 8) & 0xff;
2259
2260 if (signal == SIGABRT)
2261 {
2262 GCN_WARNING ("GCN Kernel aborted\n");
2263 abort ();
2264 }
2265 else if (signal != 0)
2266 {
2267 GCN_WARNING ("GCN Kernel received unknown signal\n");
2268 abort ();
2269 }
2270
2271 GCN_DEBUG ("GCN Kernel exited with value: %d\n", return_value & 0xff);
2272 exit (return_value & 0xff);
2273 }
2274 }
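/* A sketch of the return-value encoding handled above (the encoding itself
   is defined by the device-side runtime): 0xcafe0000 means the kernel never
   called exit, while 0xffffSSCC packs a signal number SS and an exit code
   CC.  The helper is illustrative only.  */
#if 0
static void
example_decode_return (unsigned int rv, unsigned int *signum,
		       unsigned int *code)
{
  if ((rv >> 16) == 0xcafe)
    *signum = *code = 0;	   /* Normal termination; exit not called.  */
  else if ((rv >> 16) == 0xffff)
    {
      *signum = (rv >> 8) & 0xff;  /* E.g. SIGABRT from an aborted kernel.  */
      *code = rv & 0xff;	   /* The value passed to exit.  */
    }
}
#endif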
2275
2276 /* }}} */
2277 /* {{{ Load/Unload */
2278
2279 /* Initialize KERNEL from D and other parameters. Return true on success. */
2280
2281 static bool
2282 init_basic_kernel_info (struct kernel_info *kernel,
2283 struct hsa_kernel_description *d,
2284 struct agent_info *agent,
2285 struct module_info *module)
2286 {
2287 kernel->agent = agent;
2288 kernel->module = module;
2289 kernel->name = d->name;
2290 kernel->description = d;
2291 if (pthread_mutex_init (&kernel->init_mutex, NULL))
2292 {
2293 GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");
2294 return false;
2295 }
2296 return true;
2297 }
2298
2299 /* Check that the GCN ISA of the given image matches the ISA of the agent. */
2300
2301 static bool
2302 isa_matches_agent (struct agent_info *agent, Elf64_Ehdr *image)
2303 {
2304 int isa_field = elf_gcn_isa_field (image);
2305 const char *isa_s = isa_hsa_name (isa_field);
2306 if (!isa_s)
2307 {
2308 hsa_error ("Unsupported ISA in GCN code object.", HSA_STATUS_ERROR);
2309 return false;
2310 }
2311
2312 if (isa_field != agent->device_isa)
2313 {
2314 char msg[120];
2315 const char *agent_isa_s = isa_hsa_name (agent->device_isa);
2316 const char *agent_isa_gcc_s = isa_gcc_name (agent->device_isa);
2317 assert (agent_isa_s);
2318 assert (agent_isa_gcc_s);
2319
2320 snprintf (msg, sizeof msg,
2321 "GCN code object ISA '%s' does not match GPU ISA '%s'.\n"
2322 "Try to recompile with '-foffload=-march=%s'.\n",
2323 isa_s, agent_isa_s, agent_isa_gcc_s);
2324
2325 hsa_error (msg, HSA_STATUS_ERROR);
2326 return false;
2327 }
2328
2329 return true;
2330 }
2331
2332 /* Create and finalize the program consisting of all loaded modules. */
2333
2334 static bool
2335 create_and_finalize_hsa_program (struct agent_info *agent)
2336 {
2337 hsa_status_t status;
2338 bool res = true;
2339 if (pthread_mutex_lock (&agent->prog_mutex))
2340 {
2341 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
2342 return false;
2343 }
2344 if (agent->prog_finalized)
2345 goto final;
2346
2347 status
2348 = hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
2349 HSA_EXECUTABLE_STATE_UNFROZEN,
2350 "", &agent->executable);
2351 if (status != HSA_STATUS_SUCCESS)
2352 {
2353 hsa_error ("Could not create GCN executable", status);
2354 goto fail;
2355 }
2356
2357 /* Load any GCN modules. */
2358 struct module_info *module = agent->module;
2359 if (module)
2360 {
2361 Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;
2362
2363 if (!isa_matches_agent (agent, image))
2364 goto fail;
2365
2366 hsa_code_object_t co = { 0 };
2367 status = hsa_fns.hsa_code_object_deserialize_fn
2368 (module->image_desc->gcn_image->image,
2369 module->image_desc->gcn_image->size,
2370 NULL, &co);
2371 if (status != HSA_STATUS_SUCCESS)
2372 {
2373 hsa_error ("Could not deserialize GCN code object", status);
2374 goto fail;
2375 }
2376
2377 status = hsa_fns.hsa_executable_load_code_object_fn
2378 (agent->executable, agent->id, co, "");
2379 if (status != HSA_STATUS_SUCCESS)
2380 {
2381 hsa_error ("Could not load GCN code object", status);
2382 goto fail;
2383 }
2384
2385 if (!module->heap)
2386 {
2387 status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
2388 gcn_kernel_heap_size,
2389 (void**)&module->heap);
2390 if (status != HSA_STATUS_SUCCESS)
2391 {
2392 hsa_error ("Could not allocate memory for GCN heap", status);
2393 goto fail;
2394 }
2395
2396 status = hsa_fns.hsa_memory_assign_agent_fn
2397 (module->heap, agent->id, HSA_ACCESS_PERMISSION_RW);
2398 if (status != HSA_STATUS_SUCCESS)
2399 {
2400 hsa_error ("Could not assign GCN heap memory to device", status);
2401 goto fail;
2402 }
2403
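/* Publish the heap size inside the heap object itself, which is
   presumably where the device-side allocator expects to find it.  */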
2404 hsa_fns.hsa_memory_copy_fn (&module->heap->size,
2405 &gcn_kernel_heap_size,
2406 sizeof (gcn_kernel_heap_size));
2407 }
2408
2409 }
2410
2411 if (debug)
2412 dump_executable_symbols (agent->executable);
2413
2414 status = hsa_fns.hsa_executable_freeze_fn (agent->executable, "");
2415 if (status != HSA_STATUS_SUCCESS)
2416 {
2417 hsa_error ("Could not freeze the GCN executable", status);
2418 goto fail;
2419 }
2420
2421 final:
2422 agent->prog_finalized = true;
2423
2424 if (pthread_mutex_unlock (&agent->prog_mutex))
2425 {
2426 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
2427 res = false;
2428 }
2429
2430 return res;
2431
2432 fail:
2433 res = false;
2434 goto final;
2435 }
2436
2437 /* Free the HSA program in AGENT and everything associated with it, then
2438 set agent->prog_finalized and the initialized flags of all kernels to
2439 false. Return TRUE on success. */
2440
2441 static bool
2442 destroy_hsa_program (struct agent_info *agent)
2443 {
2444 if (!agent->prog_finalized)
2445 return true;
2446
2447 hsa_status_t status;
2448
2449 GCN_DEBUG ("Destroying the current GCN program.\n");
2450
2451 status = hsa_fns.hsa_executable_destroy_fn (agent->executable);
2452 if (status != HSA_STATUS_SUCCESS)
2453 return hsa_error ("Could not destroy GCN executable", status);
2454
2455 if (agent->module)
2456 {
2457 int i;
2458 for (i = 0; i < agent->module->kernel_count; i++)
2459 agent->module->kernels[i].initialized = false;
2460
2461 if (agent->module->heap)
2462 {
2463 hsa_fns.hsa_memory_free_fn (agent->module->heap);
2464 agent->module->heap = NULL;
2465 }
2466 }
2467 agent->prog_finalized = false;
2468 return true;
2469 }
2470
2471 /* Deinitialize all information associated with MODULE and kernels within
2472 it. Return TRUE on success. */
2473
2474 static bool
2475 destroy_module (struct module_info *module, bool locked)
2476 {
2477 /* Run destructors before destroying the module. */
2478 struct GOMP_kernel_launch_attributes kla =
2479 { 3,
2480 /* Grid size. */
2481 { 1, 64, 1 },
2482 /* Work-group size. */
2483 { 1, 64, 1 }
2484 };
2485
2486 if (module->fini_array_func)
2487 {
2488 init_kernel (module->fini_array_func);
2489 run_kernel (module->fini_array_func, NULL, &kla, NULL, locked);
2490 }
2491 module->constructors_run_p = false;
2492
2493 int i;
2494 for (i = 0; i < module->kernel_count; i++)
2495 if (pthread_mutex_destroy (&module->kernels[i].init_mutex))
2496 {
2497 GOMP_PLUGIN_error ("Failed to destroy a GCN kernel initialization "
2498 "mutex");
2499 return false;
2500 }
2501
2502 return true;
2503 }
2504
2505 /* }}} */
2506 /* {{{ Async */
2507
2508 /* Execute a single entry (of any type) from an async queue. */
2509
2510 static void
2511 execute_queue_entry (struct goacc_asyncqueue *aq, int index)
2512 {
2513 struct queue_entry *entry = &aq->queue[index];
2514
2515 switch (entry->type)
2516 {
2517 case KERNEL_LAUNCH:
2518 if (DEBUG_QUEUES)
2519 GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d)\n",
2520 aq->agent->device_id, aq->id, index);
2521 run_kernel (entry->u.launch.kernel,
2522 entry->u.launch.vars,
2523 &entry->u.launch.kla, aq, false);
2524 if (DEBUG_QUEUES)
2525 GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d) done\n",
2526 aq->agent->device_id, aq->id, index);
2527 break;
2528
2529 case CALLBACK:
2530 if (DEBUG_QUEUES)
2531 GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d)\n",
2532 aq->agent->device_id, aq->id, index);
2533 entry->u.callback.fn (entry->u.callback.data);
2534 if (DEBUG_QUEUES)
2535 GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d) done\n",
2536 aq->agent->device_id, aq->id, index);
2537 break;
2538
2539 case ASYNC_WAIT:
2540 {
2541 /* FIXME: is it safe to access a placeholder that may already have
2542 been executed? */
2543 struct placeholder *placeholderp = entry->u.asyncwait.placeholderp;
2544
2545 if (DEBUG_QUEUES)
2546 GCN_DEBUG ("Async thread %d:%d: Executing async wait entry (%d)\n",
2547 aq->agent->device_id, aq->id, index);
2548
2549 pthread_mutex_lock (&placeholderp->mutex);
2550
2551 while (!placeholderp->executed)
2552 pthread_cond_wait (&placeholderp->cond, &placeholderp->mutex);
2553
2554 pthread_mutex_unlock (&placeholderp->mutex);
2555
2556 if (pthread_cond_destroy (&placeholderp->cond))
2557 GOMP_PLUGIN_error ("Failed to destroy serialization cond");
2558
2559 if (pthread_mutex_destroy (&placeholderp->mutex))
2560 GOMP_PLUGIN_error ("Failed to destroy serialization mutex");
2561
2562 if (DEBUG_QUEUES)
2563 GCN_DEBUG ("Async thread %d:%d: Executing async wait "
2564 "entry (%d) done\n", aq->agent->device_id, aq->id, index);
2565 }
2566 break;
2567
2568 case ASYNC_PLACEHOLDER:
2569 pthread_mutex_lock (&entry->u.placeholder.mutex);
2570 entry->u.placeholder.executed = 1;
2571 pthread_cond_signal (&entry->u.placeholder.cond);
2572 pthread_mutex_unlock (&entry->u.placeholder.mutex);
2573 break;
2574
2575 default:
2576 GOMP_PLUGIN_fatal ("Unknown queue element");
2577 }
2578 }
2579
2580 /* This function is run as a thread to service an async queue in the
2581 background. It runs continuously until the stop flag is set. */
2582
2583 static void *
2584 drain_queue (void *thread_arg)
2585 {
2586 struct goacc_asyncqueue *aq = thread_arg;
2587
2588 if (DRAIN_QUEUE_SYNCHRONOUS_P)
2589 {
2590 aq->drain_queue_stop = 2;
2591 return NULL;
2592 }
2593
2594 pthread_mutex_lock (&aq->mutex);
2595
2596 while (true)
2597 {
2598 if (aq->drain_queue_stop)
2599 break;
2600
2601 if (aq->queue_n > 0)
2602 {
2603 pthread_mutex_unlock (&aq->mutex);
2604 execute_queue_entry (aq, aq->queue_first);
2605
2606 pthread_mutex_lock (&aq->mutex);
2607 aq->queue_first = ((aq->queue_first + 1)
2608 % ASYNC_QUEUE_SIZE);
2609 aq->queue_n--;
2610
2611 if (DEBUG_THREAD_SIGNAL)
2612 GCN_DEBUG ("Async thread %d:%d: broadcasting queue out update\n",
2613 aq->agent->device_id, aq->id);
2614 pthread_cond_broadcast (&aq->queue_cond_out);
2615 pthread_mutex_unlock (&aq->mutex);
2616
2617 if (DEBUG_QUEUES)
2618 GCN_DEBUG ("Async thread %d:%d: continue\n", aq->agent->device_id,
2619 aq->id);
2620 pthread_mutex_lock (&aq->mutex);
2621 }
2622 else
2623 {
2624 if (DEBUG_THREAD_SLEEP)
2625 GCN_DEBUG ("Async thread %d:%d: going to sleep\n",
2626 aq->agent->device_id, aq->id);
2627 pthread_cond_wait (&aq->queue_cond_in, &aq->mutex);
2628 if (DEBUG_THREAD_SLEEP)
2629 GCN_DEBUG ("Async thread %d:%d: woke up, rechecking\n",
2630 aq->agent->device_id, aq->id);
2631 }
2632 }
2633
2634 aq->drain_queue_stop = 2;
2635 if (DEBUG_THREAD_SIGNAL)
2636 GCN_DEBUG ("Async thread %d:%d: broadcasting last queue out update\n",
2637 aq->agent->device_id, aq->id);
2638 pthread_cond_broadcast (&aq->queue_cond_out);
2639 pthread_mutex_unlock (&aq->mutex);
2640
2641 GCN_DEBUG ("Async thread %d:%d: returning\n", aq->agent->device_id, aq->id);
2642 return NULL;
2643 }
2644
2645 /* This function is used only when DRAIN_QUEUE_SYNCHRONOUS_P is set, which
2646 is not usually the case. This is just a debug tool. */
2647
2648 static void
2649 drain_queue_synchronous (struct goacc_asyncqueue *aq)
2650 {
2651 pthread_mutex_lock (&aq->mutex);
2652
2653 while (aq->queue_n > 0)
2654 {
2655 execute_queue_entry (aq, aq->queue_first);
2656
2657 aq->queue_first = ((aq->queue_first + 1)
2658 % ASYNC_QUEUE_SIZE);
2659 aq->queue_n--;
2660 }
2661
2662 pthread_mutex_unlock (&aq->mutex);
2663 }
2664
2665 /* Block the current thread until an async queue is writable. The aq->mutex
2666 lock should be held on entry, and remains locked on exit. */
2667
2668 static void
2669 wait_for_queue_nonfull (struct goacc_asyncqueue *aq)
2670 {
2671 if (aq->queue_n == ASYNC_QUEUE_SIZE)
2672 {
2673 /* Queue is full. Wait for it to not be full. */
2674 while (aq->queue_n == ASYNC_QUEUE_SIZE)
2675 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
2676 }
2677 }
2678
2679 /* Request an asynchronous kernel launch on the specified queue. This
2680 may block if the queue is full, but returns without waiting for the
2681 kernel to run. */
2682
2683 static void
2684 queue_push_launch (struct goacc_asyncqueue *aq, struct kernel_info *kernel,
2685 void *vars, struct GOMP_kernel_launch_attributes *kla)
2686 {
2687 assert (aq->agent == kernel->agent);
2688
2689 pthread_mutex_lock (&aq->mutex);
2690
2691 wait_for_queue_nonfull (aq);
2692
2693 int queue_last = ((aq->queue_first + aq->queue_n)
2694 % ASYNC_QUEUE_SIZE);
2695 if (DEBUG_QUEUES)
2696 GCN_DEBUG ("queue_push_launch %d:%d: at %i\n", aq->agent->device_id,
2697 aq->id, queue_last);
2698
2699 aq->queue[queue_last].type = KERNEL_LAUNCH;
2700 aq->queue[queue_last].u.launch.kernel = kernel;
2701 aq->queue[queue_last].u.launch.vars = vars;
2702 aq->queue[queue_last].u.launch.kla = *kla;
2703
2704 aq->queue_n++;
2705
2706 if (DEBUG_THREAD_SIGNAL)
2707 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2708 aq->agent->device_id, aq->id);
2709 pthread_cond_signal (&aq->queue_cond_in);
2710
2711 pthread_mutex_unlock (&aq->mutex);
2712 }
2713
2714 /* Request an asynchronous callback on the specified queue. The callback
2715 function will be called, with the given opaque data, from the appropriate
2716 async thread, when all previous items on that queue are complete. */
2717
2718 static void
2719 queue_push_callback (struct goacc_asyncqueue *aq, void (*fn)(void *),
2720 void *data)
2721 {
2722 pthread_mutex_lock (&aq->mutex);
2723
2724 wait_for_queue_nonfull (aq);
2725
2726 int queue_last = ((aq->queue_first + aq->queue_n)
2727 % ASYNC_QUEUE_SIZE);
2728 if (DEBUG_QUEUES)
2729 GCN_DEBUG ("queue_push_callback %d:%d: at %i\n", aq->agent->device_id,
2730 aq->id, queue_last);
2731
2732 aq->queue[queue_last].type = CALLBACK;
2733 aq->queue[queue_last].u.callback.fn = fn;
2734 aq->queue[queue_last].u.callback.data = data;
2735
2736 aq->queue_n++;
2737
2738 if (DEBUG_THREAD_SIGNAL)
2739 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2740 aq->agent->device_id, aq->id);
2741 pthread_cond_signal (&aq->queue_cond_in);
2742
2743 pthread_mutex_unlock (&aq->mutex);
2744 }
2745
2746 /* Request that a given async thread wait for another thread (unspecified) to
2747 reach the given placeholder. The wait will occur when all previous entries
2748 on the queue are complete. A placeholder is effectively a kind of signal
2749 which simply sets a flag when encountered in a queue. */
2750
2751 static void
2752 queue_push_asyncwait (struct goacc_asyncqueue *aq,
2753 struct placeholder *placeholderp)
2754 {
2755 pthread_mutex_lock (&aq->mutex);
2756
2757 wait_for_queue_nonfull (aq);
2758
2759 int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2760 if (DEBUG_QUEUES)
2761 GCN_DEBUG ("queue_push_asyncwait %d:%d: at %i\n", aq->agent->device_id,
2762 aq->id, queue_last);
2763
2764 aq->queue[queue_last].type = ASYNC_WAIT;
2765 aq->queue[queue_last].u.asyncwait.placeholderp = placeholderp;
2766
2767 aq->queue_n++;
2768
2769 if (DEBUG_THREAD_SIGNAL)
2770 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2771 aq->agent->device_id, aq->id);
2772 pthread_cond_signal (&aq->queue_cond_in);
2773
2774 pthread_mutex_unlock (&aq->mutex);
2775 }
2776
2777 /* Add a placeholder into an async queue. When the async thread reaches the
2778 placeholder it will set the "executed" flag to true and continue.
2779 Another thread may be waiting on this thread reaching the placeholder. */
2780
2781 static struct placeholder *
2782 queue_push_placeholder (struct goacc_asyncqueue *aq)
2783 {
2784 struct placeholder *placeholderp;
2785
2786 pthread_mutex_lock (&aq->mutex);
2787
2788 wait_for_queue_nonfull (aq);
2789
2790 int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2791 if (DEBUG_QUEUES)
2792 GCN_DEBUG ("queue_push_placeholder %d:%d: at %i\n", aq->agent->device_id,
2793 aq->id, queue_last);
2794
2795 aq->queue[queue_last].type = ASYNC_PLACEHOLDER;
2796 placeholderp = &aq->queue[queue_last].u.placeholder;
2797
2798 if (pthread_mutex_init (&placeholderp->mutex, NULL))
2799 {
2800 pthread_mutex_unlock (&aq->mutex);
2801 GOMP_PLUGIN_error ("Failed to initialize serialization mutex");
2802 }
2803
2804 if (pthread_cond_init (&placeholderp->cond, NULL))
2805 {
2806 pthread_mutex_unlock (&aq->mutex);
2807 GOMP_PLUGIN_error ("Failed to initialize serialization cond");
2808 }
2809
2810 placeholderp->executed = 0;
2811
2812 aq->queue_n++;
2813
2814 if (DEBUG_THREAD_SIGNAL)
2815 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2816 aq->agent->device_id, aq->id);
2817 pthread_cond_signal (&aq->queue_cond_in);
2818
2819 pthread_mutex_unlock (&aq->mutex);
2820
2821 return placeholderp;
2822 }
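/* Illustrative sketch of how the two primitives above pair up to serialize
   one queue against another; the helper is hypothetical, not plugin API.  */
#if 0
static void
example_serialize (struct goacc_asyncqueue *waiter,
		   struct goacc_asyncqueue *waited_on)
{
  /* Once WAITER reaches this entry it stalls until WAITED_ON has passed
     the placeholder appended here.  */
  struct placeholder *p = queue_push_placeholder (waited_on);
  queue_push_asyncwait (waiter, p);
}
#endif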
2823
2824 /* Signal an asynchronous thread to terminate, and wait for it to do so. */
2825
2826 static void
2827 finalize_async_thread (struct goacc_asyncqueue *aq)
2828 {
2829 pthread_mutex_lock (&aq->mutex);
2830 if (aq->drain_queue_stop == 2)
2831 {
2832 pthread_mutex_unlock (&aq->mutex);
2833 return;
2834 }
2835
2836 aq->drain_queue_stop = 1;
2837
2838 if (DEBUG_THREAD_SIGNAL)
2839 GCN_DEBUG ("Signalling async thread %d:%d: cond_in\n",
2840 aq->agent->device_id, aq->id);
2841 pthread_cond_signal (&aq->queue_cond_in);
2842
2843 while (aq->drain_queue_stop != 2)
2844 {
2845 if (DEBUG_THREAD_SLEEP)
2846 GCN_DEBUG ("Waiting for async thread %d:%d to finish, putting thread"
2847 " to sleep\n", aq->agent->device_id, aq->id);
2848 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
2849 if (DEBUG_THREAD_SLEEP)
2850 GCN_DEBUG ("Waiting, woke up thread %d:%d. Rechecking\n",
2851 aq->agent->device_id, aq->id);
2852 }
2853
2854 GCN_DEBUG ("Done waiting for async thread %d:%d\n", aq->agent->device_id,
2855 aq->id);
2856 pthread_mutex_unlock (&aq->mutex);
2857
2858 int err = pthread_join (aq->thread_drain_queue, NULL);
2859 if (err != 0)
2860 GOMP_PLUGIN_fatal ("Join async thread %d:%d: failed: %s",
2861 aq->agent->device_id, aq->id, strerror (err));
2862 GCN_DEBUG ("Joined with async thread %d:%d\n", aq->agent->device_id, aq->id);
2863 }
2864
2865 /* Set up an async queue for OpenMP. There will be only one. The
2866 implementation simply uses an OpenACC async queue.
2867 FIXME: is this thread-safe if two threads call this function? */
2868
2869 static void
2870 maybe_init_omp_async (struct agent_info *agent)
2871 {
2872 if (!agent->omp_async_queue)
2873 agent->omp_async_queue
2874 = GOMP_OFFLOAD_openacc_async_construct (agent->device_id);
2875 }
2876
2877 /* A wrapper that works around an issue in the HSA runtime with host-to-device
2878 copies from read-only pages. */
2879
2880 static void
2881 hsa_memory_copy_wrapper (void *dst, const void *src, size_t len)
2882 {
2883 hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, len);
2884
2885 if (status == HSA_STATUS_SUCCESS)
2886 return;
2887
2888 /* It appears that the copy fails if the source data is in a read-only page.
2889 We can't detect that easily, so try copying the data to a temporary buffer
2890 and doing the copy again if we got an error above. */
2891
2892 GCN_WARNING ("Read-only data transfer bug workaround triggered for "
2893 "[%p:+%d]\n", (void *) src, (int) len);
2894
2895 void *src_copy = GOMP_PLUGIN_malloc (len);
2896 memcpy (src_copy, src, len);
2897 status = hsa_fns.hsa_memory_copy_fn (dst, (const void *) src_copy, len);
2898 free (src_copy);
2899 if (status != HSA_STATUS_SUCCESS)
2900 GOMP_PLUGIN_error ("memory copy failed");
2901 }
2902
2903 /* Copy data to or from a device. This is intended for use as an async
2904 callback event. */
2905
2906 static void
2907 copy_data (void *data_)
2908 {
2909 struct copy_data *data = (struct copy_data *)data_;
2910 GCN_DEBUG ("Async thread %d:%d: Copying %zu bytes from (%p) to (%p)\n",
2911 data->aq->agent->device_id, data->aq->id, data->len, data->src,
2912 data->dst);
2913 hsa_memory_copy_wrapper (data->dst, data->src, data->len);
2914 if (data->free_src)
2915 free ((void *) data->src);
2916 free (data);
2917 }
2918
2919 /* Free device data. This is intended for use as an async callback event. */
2920
2921 static void
2922 gomp_offload_free (void *ptr)
2923 {
2924 GCN_DEBUG ("Async thread ?:?: Freeing %p\n", ptr);
2925 GOMP_OFFLOAD_free (0, ptr);
2926 }
2927
2928 /* Request an asynchronous data copy, to or from a device, on a given queue.
2929 The event will be registered as a callback. If FREE_SRC is true
2930 then the source data will be freed following the copy. */
2931
2932 static void
2933 queue_push_copy (struct goacc_asyncqueue *aq, void *dst, const void *src,
2934 size_t len, bool free_src)
2935 {
2936 if (DEBUG_QUEUES)
2937 GCN_DEBUG ("queue_push_copy %d:%d: %zu bytes from (%p) to (%p)\n",
2938 aq->agent->device_id, aq->id, len, src, dst);
2939 struct copy_data *data
2940 = (struct copy_data *)GOMP_PLUGIN_malloc (sizeof (struct copy_data));
2941 data->dst = dst;
2942 data->src = src;
2943 data->len = len;
2944 data->free_src = free_src;
2945 data->aq = aq;
2946 queue_push_callback (aq, copy_data, data);
2947 }
2948
2949 /* Return true if the given queue is currently empty. */
2950
2951 static int
2952 queue_empty (struct goacc_asyncqueue *aq)
2953 {
2954 pthread_mutex_lock (&aq->mutex);
2955 int res = aq->queue_n == 0 ? 1 : 0;
2956 pthread_mutex_unlock (&aq->mutex);
2957
2958 return res;
2959 }
2960
2961 /* Wait for a given queue to become empty. This implements an OpenACC wait
2962 directive. */
2963
2964 static void
2965 wait_queue (struct goacc_asyncqueue *aq)
2966 {
2967 if (DRAIN_QUEUE_SYNCHRONOUS_P)
2968 {
2969 drain_queue_synchronous (aq);
2970 return;
2971 }
2972
2973 pthread_mutex_lock (&aq->mutex);
2974
2975 while (aq->queue_n > 0)
2976 {
2977 if (DEBUG_THREAD_SLEEP)
2978 GCN_DEBUG ("waiting for thread %d:%d, putting thread to sleep\n",
2979 aq->agent->device_id, aq->id);
2980 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
2981 if (DEBUG_THREAD_SLEEP)
2982 GCN_DEBUG ("thread %d:%d woke up. Rechecking\n", aq->agent->device_id,
2983 aq->id);
2984 }
2985
2986 pthread_mutex_unlock (&aq->mutex);
2987 GCN_DEBUG ("waiting for thread %d:%d, done\n", aq->agent->device_id, aq->id);
2988 }
2989
2990 /* }}} */
2991 /* {{{ OpenACC support */
2992
2993 /* Execute an OpenACC kernel, synchronously or asynchronously. */
2994
2995 static void
2996 gcn_exec (struct kernel_info *kernel, size_t mapnum, void **hostaddrs,
2997 void **devaddrs, unsigned *dims, void *targ_mem_desc, bool async,
2998 struct goacc_asyncqueue *aq)
2999 {
3000 if (!GOMP_OFFLOAD_can_run (kernel))
3001 GOMP_PLUGIN_fatal ("OpenACC host fallback unimplemented.");
3002
3003 /* If we get here then this must be an OpenACC kernel. */
3004 kernel->kind = KIND_OPENACC;
3005
3006 /* The target expects devaddrs to be double-indirect, so build an on-device array of the pointers. */
3007 void **ind_da = alloc_by_agent (kernel->agent, sizeof (void*) * mapnum);
3008 for (size_t i = 0; i < mapnum; i++)
3009 hsa_fns.hsa_memory_copy_fn (&ind_da[i],
3010 devaddrs[i] ? &devaddrs[i] : &hostaddrs[i],
3011 sizeof (void *));
3012
3013 struct hsa_kernel_description *hsa_kernel_desc = NULL;
3014 for (unsigned i = 0; i < kernel->module->image_desc->kernel_count; i++)
3015 {
3016 struct hsa_kernel_description *d
3017 = &kernel->module->image_desc->kernel_infos[i];
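/* Pointer equality is intentional: kernel->name was set to point at the
   descriptor's own name string by init_basic_kernel_info.  */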
3018 if (d->name == kernel->name)
3019 {
3020 hsa_kernel_desc = d;
3021 break;
3022 }
3023 }
3024
3025 /* We may have statically-determined dimensions in
3026 hsa_kernel_desc->oacc_dims[] or dimensions passed to this offload kernel
3027 invocation at runtime in dims[]. We allow static dimensions to take
3028 priority over dynamic dimensions when present (non-zero). */
3029 if (hsa_kernel_desc->oacc_dims[0] > 0)
3030 dims[0] = hsa_kernel_desc->oacc_dims[0];
3031 if (hsa_kernel_desc->oacc_dims[1] > 0)
3032 dims[1] = hsa_kernel_desc->oacc_dims[1];
3033 if (hsa_kernel_desc->oacc_dims[2] > 0)
3034 dims[2] = hsa_kernel_desc->oacc_dims[2];
3035
3036 /* If any of the OpenACC dimensions remain 0 then we get to pick a number.
3037 There isn't really a correct answer for this without a clue about the
3038 problem size, so let's do a reasonable number of single-worker gangs.
3039 64 gangs matches a typical Fiji device. */
3040
3041 /* NOTE: Until support for middle-end worker partitioning is merged, use 1
3042 for the default number of workers. */
3043 if (dims[0] == 0) dims[0] = get_cu_count (kernel->agent); /* Gangs. */
3044 if (dims[1] == 0) dims[1] = 1; /* Workers. */
3045
3046 /* The incoming dimensions are expressed in terms of gangs, workers, and
3047 vectors. The HSA dimensions are expressed in terms of "work-items",
3048 which means multiples of vector lanes.
3049
3050 The "grid size" specifies the size of the problem space, and the
3051 "work-group size" specifies how much of that we want a single compute
3052 unit to chew on at once.
3053
3054 The three dimensions do not really correspond to hardware, but the
3055 important thing is that the HSA runtime will launch as many
3056 work-groups as it takes to process the entire grid, and each
3057 work-group will contain as many wave-fronts as it takes to process
3058 the work-items in that group.
3059
3060 Essentially, as long as we set the Y dimension to 64 (the number of
3061 vector lanes in hardware), and the Z group size to the maximum (16),
3062 then we will get the gangs (X) and workers (Z) launched as we expect.
3063
3064 The reason for the apparent reversal of vector and worker dimension
3065 order is to do with the way the run-time distributes work-items across
3066 v1 and v2. */
3067 struct GOMP_kernel_launch_attributes kla =
3068 {3,
3069 /* Grid size. */
3070 {dims[0], 64, dims[1]},
3071 /* Work-group size. */
3072 {1, 64, 16}
3073 };
3074
3075 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
3076 acc_prof_info *prof_info = thr->prof_info;
3077 acc_event_info enqueue_launch_event_info;
3078 acc_api_info *api_info = thr->api_info;
3079 bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
3080 if (profiling_dispatch_p)
3081 {
3082 prof_info->event_type = acc_ev_enqueue_launch_start;
3083
3084 enqueue_launch_event_info.launch_event.event_type
3085 = prof_info->event_type;
3086 enqueue_launch_event_info.launch_event.valid_bytes
3087 = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
3088 enqueue_launch_event_info.launch_event.parent_construct
3089 = acc_construct_parallel;
3090 enqueue_launch_event_info.launch_event.implicit = 1;
3091 enqueue_launch_event_info.launch_event.tool_info = NULL;
3092 enqueue_launch_event_info.launch_event.kernel_name
3093 = (char *) kernel->name;
3094 enqueue_launch_event_info.launch_event.num_gangs = kla.gdims[0];
3095 enqueue_launch_event_info.launch_event.num_workers = kla.gdims[2];
3096 enqueue_launch_event_info.launch_event.vector_length = kla.gdims[1];
3097
3098 api_info->device_api = acc_device_api_other;
3099
3100 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3101 &enqueue_launch_event_info, api_info);
3102 }
3103
3104 if (!async)
3105 {
3106 run_kernel (kernel, ind_da, &kla, NULL, false);
3107 gomp_offload_free (ind_da);
3108 }
3109 else
3110 {
3111 queue_push_launch (aq, kernel, ind_da, &kla);
3112 if (DEBUG_QUEUES)
3113 GCN_DEBUG ("queue_push_callback %d:%d gomp_offload_free, %p\n",
3114 aq->agent->device_id, aq->id, ind_da);
3115 queue_push_callback (aq, gomp_offload_free, ind_da);
3116 }
3117
3118 if (profiling_dispatch_p)
3119 {
3120 prof_info->event_type = acc_ev_enqueue_launch_end;
3121 enqueue_launch_event_info.launch_event.event_type = prof_info->event_type;
3122 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3123 &enqueue_launch_event_info,
3124 api_info);
3125 }
3126 }
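/* A worked example of the dimension mapping above: num_gangs(4)
   num_workers(2) yields grid {4, 64, 2} and work-group {1, 64, 16}, and the
   HSA runtime rounds each dimension up to whole work-groups.  The helper is
   illustrative only.  */
#if 0
static unsigned
example_workgroup_count (const unsigned grid[3], const unsigned group[3])
{
  unsigned n = 1;
  for (int i = 0; i < 3; i++)
    n *= (grid[i] + group[i] - 1) / group[i];
  return n;	/* Grid {4,64,2} with group {1,64,16} -> 4 work-groups.  */
}
#endif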
3127
3128 /* }}} */
3129 /* {{{ Generic Plugin API */
3130
3131 /* Return the name of the accelerator, which is "gcn". */
3132
3133 const char *
3134 GOMP_OFFLOAD_get_name (void)
3135 {
3136 return "gcn";
3137 }
3138
3139 /* Return the specific capabilities the HSA accelerator has. */
3140
3141 unsigned int
3142 GOMP_OFFLOAD_get_caps (void)
3143 {
3144 /* FIXME: Enable shared memory for APU, but not discrete GPU. */
3145 return /*GOMP_OFFLOAD_CAP_SHARED_MEM |*/ GOMP_OFFLOAD_CAP_OPENMP_400
3146 | GOMP_OFFLOAD_CAP_OPENACC_200;
3147 }
3148
3149 /* Identify as GCN accelerator. */
3150
3151 int
3152 GOMP_OFFLOAD_get_type (void)
3153 {
3154 return OFFLOAD_TARGET_TYPE_GCN;
3155 }
3156
3157 /* Return the libgomp version number we're compatible with. There is
3158 no requirement for cross-version compatibility. */
3159
3160 unsigned
3161 GOMP_OFFLOAD_version (void)
3162 {
3163 return GOMP_VERSION;
3164 }
3165
3166 /* Return the number of GCN devices on the system. */
3167
3168 int
3169 GOMP_OFFLOAD_get_num_devices (void)
3170 {
3171 if (!init_hsa_context ())
3172 return 0;
3173 return hsa_context.agent_count;
3174 }
3175
3176 /* Initialize device (agent) number N so that it can be used for computation.
3177 Return TRUE on success. */
3178
3179 bool
3180 GOMP_OFFLOAD_init_device (int n)
3181 {
3182 if (!init_hsa_context ())
3183 return false;
3184 if (n >= hsa_context.agent_count)
3185 {
3186 GOMP_PLUGIN_error ("Request to initialize non-existent GCN device %i", n);
3187 return false;
3188 }
3189 struct agent_info *agent = &hsa_context.agents[n];
3190
3191 if (agent->initialized)
3192 return true;
3193
3194 agent->device_id = n;
3195
3196 if (pthread_rwlock_init (&agent->module_rwlock, NULL))
3197 {
3198 GOMP_PLUGIN_error ("Failed to initialize a GCN agent rwlock");
3199 return false;
3200 }
3201 if (pthread_mutex_init (&agent->prog_mutex, NULL))
3202 {
3203 GOMP_PLUGIN_error ("Failed to initialize a GCN agent program mutex");
3204 return false;
3205 }
3206 if (pthread_mutex_init (&agent->async_queues_mutex, NULL))
3207 {
3208 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
3209 return false;
3210 }
3211 if (pthread_mutex_init (&agent->team_arena_write_lock, NULL))
3212 {
3213 GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
3214 return false;
3215 }
3216 agent->async_queues = NULL;
3217 agent->omp_async_queue = NULL;
3218 agent->team_arena_list = NULL;
3219
3220 uint32_t queue_size;
3221 hsa_status_t status;
3222 status = hsa_fns.hsa_agent_get_info_fn (agent->id,
3223 HSA_AGENT_INFO_QUEUE_MAX_SIZE,
3224 &queue_size);
3225 if (status != HSA_STATUS_SUCCESS)
3226 return hsa_error ("Error requesting maximum queue size of the GCN agent",
3227 status);
3228
3229 status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_NAME,
3230 &agent->name);
3231 if (status != HSA_STATUS_SUCCESS)
3232 return hsa_error ("Error querying the name of the agent", status);
3233
3234 agent->device_isa = isa_code (agent->name);
3235 if (agent->device_isa < 0)
3236 return hsa_error ("Unknown GCN agent architecture", HSA_STATUS_ERROR);
3237
3238 status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_VENDOR_NAME,
3239 &agent->vendor_name);
3240 if (status != HSA_STATUS_SUCCESS)
3241 return hsa_error ("Error querying the vendor name of the agent", status);
3242
3243 status = hsa_fns.hsa_queue_create_fn (agent->id, queue_size,
3244 HSA_QUEUE_TYPE_MULTI,
3245 hsa_queue_callback, NULL, UINT32_MAX,
3246 UINT32_MAX, &agent->sync_queue);
3247 if (status != HSA_STATUS_SUCCESS)
3248 return hsa_error ("Error creating command queue", status);
3249
3250 agent->kernarg_region.handle = (uint64_t) -1;
3251 status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
3252 get_kernarg_memory_region,
3253 &agent->kernarg_region);
3254 if (status != HSA_STATUS_SUCCESS
3255 && status != HSA_STATUS_INFO_BREAK)
3256 hsa_error ("Scanning memory regions failed", status);
3257 if (agent->kernarg_region.handle == (uint64_t) -1)
3258 {
3259 GOMP_PLUGIN_error ("Could not find suitable memory region for kernel "
3260 "arguments");
3261 return false;
3262 }
3263 GCN_DEBUG ("Selected kernel arguments memory region:\n");
3264 dump_hsa_region (agent->kernarg_region, NULL);
3265
3266 agent->data_region.handle = (uint64_t) -1;
3267 status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
3268 get_data_memory_region,
3269 &agent->data_region);
3270 if (status != HSA_STATUS_SUCCESS
3271 && status != HSA_STATUS_INFO_BREAK)
3272 hsa_error ("Scanning memory regions failed", status);
3273 if (agent->data_region.handle == (uint64_t) -1)
3274 {
3275 GOMP_PLUGIN_error ("Could not find suitable memory region for device "
3276 "data");
3277 return false;
3278 }
3279 GCN_DEBUG ("Selected device data memory region:\n");
3280 dump_hsa_region (agent->data_region, NULL);
3281
3282 GCN_DEBUG ("GCN agent %d initialized\n", n);
3283
3284 agent->initialized = true;
3285 return true;
3286 }
3287
3288 /* Load GCN object-code module described by struct gcn_image_desc in
3289 TARGET_DATA and return references to kernel descriptors in TARGET_TABLE.
3290 If there are any constructors then run them. */
3291
3292 int
3293 GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
3294 struct addr_pair **target_table)
3295 {
3296 if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
3297 {
3298 GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3299 " (expected %u, received %u)",
3300 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
3301 return -1;
3302 }
3303
3304 struct gcn_image_desc *image_desc = (struct gcn_image_desc *) target_data;
3305 struct agent_info *agent;
3306 struct addr_pair *pair;
3307 struct module_info *module;
3308 struct kernel_info *kernel;
3309 int kernel_count = image_desc->kernel_count;
3310 unsigned var_count = image_desc->global_variable_count;
3311
3312 agent = get_agent_info (ord);
3313 if (!agent)
3314 return -1;
3315
3316 if (pthread_rwlock_wrlock (&agent->module_rwlock))
3317 {
3318 GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3319 return -1;
3320 }
3321 if (agent->prog_finalized
3322 && !destroy_hsa_program (agent))
3323 return -1;
3324
3325 GCN_DEBUG ("Encountered %d kernels in an image\n", kernel_count);
3326 GCN_DEBUG ("Encountered %u global variables in an image\n", var_count);
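/* The "- 2" assumes the image always provides both the "_init_array" and
   "_fini_array" pseudo-kernels, which are deliberately kept out of the
   table built below.  */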
3327 pair = GOMP_PLUGIN_malloc ((kernel_count + var_count - 2)
3328 * sizeof (struct addr_pair));
3329 *target_table = pair;
3330 module = (struct module_info *)
3331 GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info)
3332 + kernel_count * sizeof (struct kernel_info));
3333 module->image_desc = image_desc;
3334 module->kernel_count = kernel_count;
3335 module->heap = NULL;
3336 module->constructors_run_p = false;
3337
3338 kernel = &module->kernels[0];
3339
3340 /* Allocate memory for kernel dependencies. */
3341 for (unsigned i = 0; i < kernel_count; i++)
3342 {
3343 struct hsa_kernel_description *d = &image_desc->kernel_infos[i];
3344 if (!init_basic_kernel_info (kernel, d, agent, module))
3345 return -1;
3346 if (strcmp (d->name, "_init_array") == 0)
3347 module->init_array_func = kernel;
3348 else if (strcmp (d->name, "_fini_array") == 0)
3349 module->fini_array_func = kernel;
3350 else
3351 {
3352 pair->start = (uintptr_t) kernel;
3353 pair->end = (uintptr_t) (kernel + 1);
3354 pair++;
3355 }
3356 kernel++;
3357 }
3358
3359 agent->module = module;
3360 if (pthread_rwlock_unlock (&agent->module_rwlock))
3361 {
3362 GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3363 return -1;
3364 }
3365
3366 if (!create_and_finalize_hsa_program (agent))
3367 return -1;
3368
3369 for (unsigned i = 0; i < var_count; i++)
3370 {
3371 struct global_var_info *v = &image_desc->global_variables[i];
3372 GCN_DEBUG ("Looking for variable %s\n", v->name);
3373
3374 hsa_status_t status;
3375 hsa_executable_symbol_t var_symbol;
3376 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3377 v->name, agent->id,
3378 0, &var_symbol);
3379
3380 if (status != HSA_STATUS_SUCCESS)
3381 hsa_fatal ("Could not find symbol for variable in the code object",
3382 status);
3383
3384 uint64_t var_addr;
3385 uint32_t var_size;
3386 status = hsa_fns.hsa_executable_symbol_get_info_fn
3387 (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &var_addr);
3388 if (status != HSA_STATUS_SUCCESS)
3389 hsa_fatal ("Could not extract a variable from its symbol", status);
3390 status = hsa_fns.hsa_executable_symbol_get_info_fn
3391 (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &var_size);
3392 if (status != HSA_STATUS_SUCCESS)
3393 hsa_fatal ("Could not extract a variable size from its symbol", status);
3394
3395 pair->start = var_addr;
3396 pair->end = var_addr + var_size;
3397 GCN_DEBUG ("Found variable %s at %p with size %u\n", v->name,
3398 (void *)var_addr, var_size);
3399 pair++;
3400 }
3401
3402 /* Run the constructors, if any, before the image is used. */
3403 struct GOMP_kernel_launch_attributes kla =
3404 { 3,
3405 /* Grid size. */
3406 { 1, 64, 1 },
3407 /* Work-group size. */
3408 { 1, 64, 1 }
3409 };
3410
3411 if (module->init_array_func)
3412 {
3413 init_kernel (module->init_array_func);
3414 run_kernel (module->init_array_func, NULL, &kla, NULL, false);
3415 }
3416 module->constructors_run_p = true;
3417
3418 /* Don't report kernels that libgomp need not know about. */
3419 if (module->init_array_func)
3420 kernel_count--;
3421 if (module->fini_array_func)
3422 kernel_count--;
3423
3424 return kernel_count + var_count;
3425 }
3426
3427 /* Unload GCN object-code module described by struct gcn_image_desc in
3428 TARGET_DATA from agent number N. Return TRUE on success. */
3429
3430 bool
3431 GOMP_OFFLOAD_unload_image (int n, unsigned version, const void *target_data)
3432 {
3433 if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
3434 {
3435 GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3436 " (expected %u, received %u)",
3437 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
3438 return false;
3439 }
3440
3441 struct agent_info *agent;
3442 agent = get_agent_info (n);
3443 if (!agent)
3444 return false;
3445
3446 if (pthread_rwlock_wrlock (&agent->module_rwlock))
3447 {
3448 GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3449 return false;
3450 }
3451
3452 if (!agent->module || agent->module->image_desc != target_data)
3453 {
3454 GOMP_PLUGIN_error ("Attempt to unload an image that has never been "
3455 "loaded before");
3456 return false;
3457 }
3458
3459 if (!destroy_module (agent->module, true))
3460 return false;
3461 free (agent->module);
3462 agent->module = NULL;
3463 if (!destroy_hsa_program (agent))
3464 return false;
3465 if (pthread_rwlock_unlock (&agent->module_rwlock))
3466 {
3467 GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3468 return false;
3469 }
3470 return true;
3471 }
3472
3473 /* Deinitialize all information and status associated with agent number N. We
3474 do not attempt any synchronization, assuming the user and libgomp will not
3475 attempt deinitialization of a device that is in any way being used at the
3476 same time. Return TRUE on success. */
3477
3478 bool
3479 GOMP_OFFLOAD_fini_device (int n)
3480 {
3481 struct agent_info *agent = get_agent_info (n);
3482 if (!agent)
3483 return false;
3484
3485 if (!agent->initialized)
3486 return true;
3487
3488 if (agent->omp_async_queue)
3489 {
3490 GOMP_OFFLOAD_openacc_async_destruct (agent->omp_async_queue);
3491 agent->omp_async_queue = NULL;
3492 }
3493
3494 if (agent->module)
3495 {
3496 if (!destroy_module (agent->module, false))
3497 return false;
3498 free (agent->module);
3499 agent->module = NULL;
3500 }
3501
3502 if (!destroy_team_arenas (agent))
3503 return false;
3504
3505 if (!destroy_hsa_program (agent))
3506 return false;
3507
3508 hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (agent->sync_queue);
3509 if (status != HSA_STATUS_SUCCESS)
3510 return hsa_error ("Error destroying command queue", status);
3511
3512 if (pthread_mutex_destroy (&agent->prog_mutex))
3513 {
3514 GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex");
3515 return false;
3516 }
3517 if (pthread_rwlock_destroy (&agent->module_rwlock))
3518 {
3519 GOMP_PLUGIN_error ("Failed to destroy a GCN agent rwlock");
3520 return false;
3521 }
3522
3523 if (pthread_mutex_destroy (&agent->async_queues_mutex))
3524 {
3525 GOMP_PLUGIN_error ("Failed to destroy a GCN agent queue mutex");
3526 return false;
3527 }
3528 if (pthread_mutex_destroy (&agent->team_arena_write_lock))
3529 {
3530 GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex");
3531 return false;
3532 }
3533 agent->initialized = false;
3534 return true;
3535 }
3536
3537 /* Return true if the HSA runtime can run function FN_PTR. */
3538
3539 bool
3540 GOMP_OFFLOAD_can_run (void *fn_ptr)
3541 {
3542 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3543
3544 init_kernel (kernel);
3545 if (kernel->initialization_failed)
3546 goto failure;
3547
3548 return true;
3549
3550 failure:
3551 if (suppress_host_fallback)
3552 GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
3553 GCN_WARNING ("GCN target cannot be launched, doing a host fallback\n");
3554 return false;
3555 }
3556
3557 /* Allocate memory on device N. */
3558
3559 void *
3560 GOMP_OFFLOAD_alloc (int n, size_t size)
3561 {
3562 struct agent_info *agent = get_agent_info (n);
3563 return alloc_by_agent (agent, size);
3564 }

/* Free memory from device N.  */

bool
GOMP_OFFLOAD_free (int device, void *ptr)
{
  GCN_DEBUG ("Freeing memory on device %d\n", device);

  hsa_status_t status = hsa_fns.hsa_memory_free_fn (ptr);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not free device memory", status);
      return false;
    }

  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  bool profiling_dispatch_p
    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
  if (profiling_dispatch_p)
    {
      acc_prof_info *prof_info = thr->prof_info;
      acc_event_info data_event_info;
      acc_api_info *api_info = thr->api_info;

      prof_info->event_type = acc_ev_free;

      data_event_info.data_event.event_type = prof_info->event_type;
      data_event_info.data_event.valid_bytes
	= _ACC_DATA_EVENT_INFO_VALID_BYTES;
      data_event_info.data_event.parent_construct
	= acc_construct_parallel;
      data_event_info.data_event.implicit = 1;
      data_event_info.data_event.tool_info = NULL;
      data_event_info.data_event.var_name = NULL;
      data_event_info.data_event.bytes = 0;
      data_event_info.data_event.host_ptr = NULL;
      data_event_info.data_event.device_ptr = (void *) ptr;

      api_info->device_api = acc_device_api_other;

      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
					    api_info);
    }

  return true;
}
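
/* For instance, a profiling tool that has registered a callback via the
   OpenACC profiling interface (a hedged sketch; "my_free_cb" is a
   hypothetical name) would observe the event dispatched above:

     static void
     my_free_cb (acc_prof_info *pi, acc_event_info *ei, acc_api_info *ai)
     {
       // ei->data_event.event_type == acc_ev_free
       // ei->data_event.device_ptr == the pointer being freed
     }

     acc_prof_register (acc_ev_free, my_free_cb, acc_reg);  */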

/* Copy data from DEVICE to host.  */

bool
GOMP_OFFLOAD_dev2host (int device, void *dst, const void *src, size_t n)
{
  GCN_DEBUG ("Copying %zu bytes from device %d (%p) to host (%p)\n", n, device,
	     src, dst);
  hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
  if (status != HSA_STATUS_SUCCESS)
    {
      GOMP_PLUGIN_error ("memory copy failed");
      return false;
    }
  return true;
}

/* Copy data from host to DEVICE.  */

bool
GOMP_OFFLOAD_host2dev (int device, void *dst, const void *src, size_t n)
{
  GCN_DEBUG ("Copying %zu bytes from host (%p) to device %d (%p)\n", n, src,
	     device, dst);
  hsa_memory_copy_wrapper (dst, src, n);
  return true;
}

/* Copy data within DEVICE.  Do the copy asynchronously, if appropriate.  */

bool
GOMP_OFFLOAD_dev2dev (int device, void *dst, const void *src, size_t n)
{
  struct gcn_thread *thread_data = gcn_thread ();

  if (thread_data && !async_synchronous_p (thread_data->async))
    {
      struct agent_info *agent = get_agent_info (device);
      maybe_init_omp_async (agent);
      queue_push_copy (agent->omp_async_queue, dst, src, n, false);
      return true;
    }

  GCN_DEBUG ("Copying %zu bytes from device %d (%p) to device %d (%p)\n", n,
	     device, src, device, dst);
  hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
  if (status != HSA_STATUS_SUCCESS)
    {
      GOMP_PLUGIN_error ("memory copy failed");
      return false;
    }
  return true;
}
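
/* To illustrate the asynchronous path above (a sketch; the exact async
   value comes from the current thread's OpenACC context): a construct
   such as

     #pragma acc parallel async(2)

   leaves gcn_thread ()->async at 2, for which async_synchronous_p
   returns false, so the copy is pushed onto the agent's asynchronous
   queue rather than performed inline.  */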

/* }}}  */
/* {{{ OpenMP Plugin API  */

/* Run a synchronous OpenMP kernel on DEVICE and pass it an array of pointers
   in VARS as a parameter.  The kernel is identified by FN_PTR which must point
   to a kernel_info structure, and must have previously been loaded to the
   specified device.  */

void
GOMP_OFFLOAD_run (int device, void *fn_ptr, void *vars, void **args)
{
  struct agent_info *agent = get_agent_info (device);
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
  struct GOMP_kernel_launch_attributes def;
  struct GOMP_kernel_launch_attributes *kla;
  assert (agent == kernel->agent);

  /* If we get here then the kernel must be OpenMP.  */
  kernel->kind = KIND_OPENMP;

  if (!parse_target_attributes (args, &def, &kla, agent))
    {
      GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
      return;
    }
  run_kernel (kernel, vars, kla, NULL, false);
}
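
/* The launch attributes in ARGS use libgomp's generic target-argument
   encoding from gomp-constants.h.  Roughly (a sketch with illustrative
   values, not the only possible encoding), a "target teams" region might
   arrive as:

     void *args[] =
       { (void *) (GOMP_TARGET_ARG_DEVICE_ALL | GOMP_TARGET_ARG_NUM_TEAMS
		   | (4 << GOMP_TARGET_ARG_VALUE_SHIFT)),
	 (void *) (GOMP_TARGET_ARG_DEVICE_ALL | GOMP_TARGET_ARG_THREAD_LIMIT
		   | (64 << GOMP_TARGET_ARG_VALUE_SHIFT)),
	 NULL };

   from which parse_target_attributes derives the grid and workgroup
   dimensions used for the dispatch.  */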

/* Run an asynchronous OpenMP kernel on DEVICE.  This is similar to
   GOMP_OFFLOAD_run except that the launch is queued and there is a call to
   GOMP_PLUGIN_target_task_completion when it has finished.  */

void
GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
			void **args, void *async_data)
{
  GCN_DEBUG ("GOMP_OFFLOAD_async_run invoked\n");
  struct agent_info *agent = get_agent_info (device);
  struct kernel_info *kernel = (struct kernel_info *) tgt_fn;
  struct GOMP_kernel_launch_attributes def;
  struct GOMP_kernel_launch_attributes *kla;
  assert (agent == kernel->agent);

  /* If we get here then the kernel must be OpenMP.  */
  kernel->kind = KIND_OPENMP;

  if (!parse_target_attributes (args, &def, &kla, agent))
    {
      GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
      return;
    }

  maybe_init_omp_async (agent);
  queue_push_launch (agent->omp_async_queue, kernel, tgt_vars, kla);
  queue_push_callback (agent->omp_async_queue,
		       GOMP_PLUGIN_target_task_completion, async_data);
}
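
/* Because entries on an async queue drain in FIFO order, pushing the
   completion callback immediately after the launch ensures that
   GOMP_PLUGIN_target_task_completion (async_data) only fires once the
   kernel has finished.  In effect the queue ends up holding:

     [0] kernel launch (tgt_fn, tgt_vars, kla)
     [1] callback: GOMP_PLUGIN_target_task_completion (async_data)

   and libgomp's task layer uses the callback to mark the "nowait"
   target task as done.  */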

/* }}}  */
/* {{{ OpenACC Plugin API  */

/* Run a synchronous OpenACC kernel.  The device number is inferred from the
   already-loaded KERNEL.  */

void
GOMP_OFFLOAD_openacc_exec (void (*fn_ptr) (void *), size_t mapnum,
			   void **hostaddrs, void **devaddrs, unsigned *dims,
			   void *targ_mem_desc)
{
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;

  gcn_exec (kernel, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, false,
	    NULL);
}

/* Run an asynchronous OpenACC kernel on the specified queue.  */

void
GOMP_OFFLOAD_openacc_async_exec (void (*fn_ptr) (void *), size_t mapnum,
				 void **hostaddrs, void **devaddrs,
				 unsigned *dims, void *targ_mem_desc,
				 struct goacc_asyncqueue *aq)
{
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;

  gcn_exec (kernel, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, true,
	    aq);
}

/* Create a new asynchronous thread and queue for running future kernels.
   Return NULL on failure.  */

struct goacc_asyncqueue *
GOMP_OFFLOAD_openacc_async_construct (int device)
{
  struct agent_info *agent = get_agent_info (device);

  pthread_mutex_lock (&agent->async_queues_mutex);

  struct goacc_asyncqueue *aq = GOMP_PLUGIN_malloc (sizeof (*aq));
  aq->agent = get_agent_info (device);
  aq->prev = NULL;
  aq->next = agent->async_queues;
  if (aq->next)
    {
      aq->next->prev = aq;
      aq->id = aq->next->id + 1;
    }
  else
    aq->id = 1;
  agent->async_queues = aq;

  aq->queue_first = 0;
  aq->queue_n = 0;
  aq->drain_queue_stop = 0;

  if (pthread_mutex_init (&aq->mutex, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
      pthread_mutex_unlock (&agent->async_queues_mutex);
      return NULL;
    }
  if (pthread_cond_init (&aq->queue_cond_in, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
      pthread_mutex_unlock (&agent->async_queues_mutex);
      return NULL;
    }
  if (pthread_cond_init (&aq->queue_cond_out, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
      pthread_mutex_unlock (&agent->async_queues_mutex);
      return NULL;
    }

  hsa_status_t status = hsa_fns.hsa_queue_create_fn (agent->id,
						     ASYNC_QUEUE_SIZE,
						     HSA_QUEUE_TYPE_MULTI,
						     hsa_queue_callback, NULL,
						     UINT32_MAX, UINT32_MAX,
						     &aq->hsa_queue);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Error creating command queue", status);

  int err = pthread_create (&aq->thread_drain_queue, NULL, &drain_queue, aq);
  if (err != 0)
    GOMP_PLUGIN_fatal ("GCN asynchronous thread creation failed: %s",
		       strerror (err));
  GCN_DEBUG ("Async thread %d:%d: created\n", aq->agent->device_id,
	     aq->id);

  pthread_mutex_unlock (&agent->async_queues_mutex);

  return aq;
}
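
/* A sketch of the queue lifecycle as driven from above (hypothetical
   caller-side pseudo-code; real callers go through libgomp's
   goacc_asyncqueue machinery rather than calling the plugin directly,
   and "my_cb"/"my_data" are hypothetical names):

     struct goacc_asyncqueue *aq = GOMP_OFFLOAD_openacc_async_construct (0);
     GOMP_OFFLOAD_openacc_async_host2dev (0, d_ptr, h_ptr, size, aq);
     GOMP_OFFLOAD_openacc_async_queue_callback (aq, my_cb, my_data);
     GOMP_OFFLOAD_openacc_async_synchronize (aq);
     GOMP_OFFLOAD_openacc_async_destruct (aq);

   The drain thread created above executes each queued entry in FIFO
   order.  */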

/* Destroy an existing asynchronous thread and queue.  Waits for any
   currently-running task to complete, but cancels any queued tasks.  */

bool
GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = aq->agent;

  finalize_async_thread (aq);

  pthread_mutex_lock (&agent->async_queues_mutex);

  int err;
  if ((err = pthread_mutex_destroy (&aq->mutex)))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue mutex: %d", err);
      goto fail;
    }
  if (pthread_cond_destroy (&aq->queue_cond_in))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
      goto fail;
    }
  if (pthread_cond_destroy (&aq->queue_cond_out))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
      goto fail;
    }
  hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (aq->hsa_queue);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Error destroying command queue", status);
      goto fail;
    }

  if (aq->prev)
    aq->prev->next = aq->next;
  if (aq->next)
    aq->next->prev = aq->prev;
  if (agent->async_queues == aq)
    agent->async_queues = aq->next;

  GCN_DEBUG ("Async thread %d:%d: destroyed\n", agent->device_id, aq->id);

  free (aq);
  pthread_mutex_unlock (&agent->async_queues_mutex);
  return true;

fail:
  pthread_mutex_unlock (&agent->async_queues_mutex);
  return false;
}

/* Return nonzero if the specified async queue is currently empty.  */

int
GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
{
  return queue_empty (aq);
}

/* Block until the specified queue has executed all its tasks and the
   queue is empty.  */

bool
GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
{
  wait_queue (aq);
  return true;
}

/* Add a serialization point across two async queues.  Any new tasks added to
   AQ2, after this call, will not run until all tasks on AQ1, at the time
   of this call, have completed.  */

bool
GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
				      struct goacc_asyncqueue *aq2)
{
  /* For serialize, stream aq2 waits for aq1 to complete work that has been
     scheduled to run on it up to this point.  */
  if (aq1 != aq2)
    {
      struct placeholder *placeholderp = queue_push_placeholder (aq1);
      queue_push_asyncwait (aq2, placeholderp);
    }
  return true;
}
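
/* For example (a sketch of the resulting ordering):

     AQ1: [kernel A] [placeholder P]     - P is signalled once A is done
     AQ2: [asyncwait on P] [kernel B]    - B is pushed after the serialize

   Kernel B cannot start before the asyncwait entry completes, and that
   entry completes only once AQ1 has drained through P, i.e. once A has
   finished.  Work pushed onto AQ1 *after* the serialize call is not
   ordered against B.  */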

/* Add an opaque callback to the given async queue.  */

void
GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
					   void (*fn) (void *), void *data)
{
  queue_push_callback (aq, fn, data);
}

/* Queue up an asynchronous data copy from host to DEVICE.  */

bool
GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
				     size_t n, struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = get_agent_info (device);
  assert (agent == aq->agent);
  /* The source data does not necessarily remain live until the deferred
     copy happens.  Taking a snapshot of the data here avoids reading
     uninitialised data later, but means that (a) data is copied twice and
     (b) modifications to the copied data between the "spawning" point of
     the asynchronous kernel and when it is executed will not be seen.
     But, that is probably correct.  */
  void *src_copy = GOMP_PLUGIN_malloc (n);
  memcpy (src_copy, src, n);
  queue_push_copy (aq, dst, src_copy, n, true);
  return true;
}
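
/* To see why the snapshot matters, consider this hypothetical caller
   (a sketch using the user-level OpenACC API; compute_value is a
   hypothetical helper):

     {
       int tmp = compute_value ();
       acc_memcpy_to_device_async (d_ptr, &tmp, sizeof tmp, 2);
     }   // tmp is dead here, but the queue may not have drained yet

   Without the copy made above, the deferred transfer could read a stale
   or recycled stack slot; with it, the queue owns a stable snapshot,
   and the final "true" argument tells the drain code to free that
   snapshot once the copy has been performed.  */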

/* Queue up an asynchronous data copy from DEVICE to host.  */

bool
GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src,
				     size_t n, struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = get_agent_info (device);
  assert (agent == aq->agent);
  queue_push_copy (aq, dst, src, n, false);
  return true;
}

/* Query a device-specific property for the OpenACC 'acc_get_property'
   interface.  Unsupported properties yield a zero value.  */

union goacc_property_value
GOMP_OFFLOAD_openacc_get_property (int device, enum goacc_property prop)
{
  struct agent_info *agent = get_agent_info (device);

  union goacc_property_value propval = { .val = 0 };

  switch (prop)
    {
    case GOACC_PROPERTY_FREE_MEMORY:
      /* Not supported.  */
      break;
    case GOACC_PROPERTY_MEMORY:
      {
	size_t size;
	hsa_region_t region = agent->data_region;
	hsa_status_t status
	  = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE,
					    &size);
	if (status == HSA_STATUS_SUCCESS)
	  propval.val = size;
	break;
      }
    case GOACC_PROPERTY_NAME:
      propval.ptr = agent->name;
      break;
    case GOACC_PROPERTY_VENDOR:
      propval.ptr = agent->vendor_name;
      break;
    case GOACC_PROPERTY_DRIVER:
      propval.ptr = hsa_context.driver_version_s;
      break;
    }

  return propval;
}
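
/* From user code these map onto the OpenACC 2.6 property API (a sketch,
   assuming acc_device_radeon names the GCN device type in openacc.h):

     size_t bytes = acc_get_property (0, acc_device_radeon,
				      acc_property_memory);
     const char *name
       = acc_get_property_string (0, acc_device_radeon, acc_property_name);

   acc_property_free_memory is accepted but, as noted above, currently
   yields 0 from this plugin.  */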

/* Set up plugin-specific thread-local data (host-side).  */

void *
GOMP_OFFLOAD_openacc_create_thread_data (int ord __attribute__((unused)))
{
  struct gcn_thread *thread_data
    = GOMP_PLUGIN_malloc (sizeof (struct gcn_thread));

  thread_data->async = GOMP_ASYNC_SYNC;

  return (void *) thread_data;
}

/* Clean up plugin-specific thread-local data.  */

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}

/* }}}  */