/* Plugin for AMD GCN execution.

   Copyright (C) 2013-2020 Free Software Foundation, Inc.

   Contributed by Mentor Embedded

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* {{{ Includes and defines  */

#include "config.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <inttypes.h>
#include <stdbool.h>
#include <limits.h>
#include <hsa.h>
#include <dlfcn.h>
#include <signal.h>
#include "libgomp-plugin.h"
#include "gomp-constants.h"
#include <elf.h>
#include "oacc-plugin.h"
#include "oacc-int.h"
#include <assert.h>

/* Additional definitions not in HSA 1.1.
   FIXME: this needs to be updated in hsa.h for upstream, but the only source
   right now is the ROCr source which may cause license issues.  */
#define HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT 0xA002

/* These probably won't be in elf.h for a while.  */
#define R_AMDGPU_NONE           0
#define R_AMDGPU_ABS32_LO       1   /* (S + A) & 0xFFFFFFFF  */
#define R_AMDGPU_ABS32_HI       2   /* (S + A) >> 32  */
#define R_AMDGPU_ABS64          3   /* S + A  */
#define R_AMDGPU_REL32          4   /* S + A - P  */
#define R_AMDGPU_REL64          5   /* S + A - P  */
#define R_AMDGPU_ABS32          6   /* S + A  */
#define R_AMDGPU_GOTPCREL       7   /* G + GOT + A - P  */
#define R_AMDGPU_GOTPCREL32_LO  8   /* (G + GOT + A - P) & 0xFFFFFFFF  */
#define R_AMDGPU_GOTPCREL32_HI  9   /* (G + GOT + A - P) >> 32  */
#define R_AMDGPU_REL32_LO       10  /* (S + A - P) & 0xFFFFFFFF  */
#define R_AMDGPU_REL32_HI       11  /* (S + A - P) >> 32  */
/* 12 is reserved.  */
#define R_AMDGPU_RELATIVE64     13  /* B + A  */
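
/* For illustration (editor's sketch, not part of the plugin): applying one
   of these relocations means patching the image at address P with a value
   computed from the symbol value S and the addend A, following the formulas
   above.  A hypothetical applier for the two simplest cases:

     static void
     apply_reloc (uint32_t type, char *P, uint64_t S, int64_t A)
     {
       switch (type)
         {
         case R_AMDGPU_ABS64:
           *(uint64_t *) P = S + A;
           break;
         case R_AMDGPU_ABS32_LO:
           *(uint32_t *) P = (S + A) & 0xFFFFFFFF;
           break;
         default:
           break;  // The other types follow their formulas similarly.
         }
     }
*/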

/* GCN specific definitions for asynchronous queues.  */

#define ASYNC_QUEUE_SIZE 64
#define DRAIN_QUEUE_SYNCHRONOUS_P false
#define DEBUG_QUEUES 0
#define DEBUG_THREAD_SLEEP 0
#define DEBUG_THREAD_SIGNAL 0

/* Defaults.  */
#define DEFAULT_GCN_HEAP_SIZE (100*1024*1024)  /* 100MB.  */

/* Secure getenv() which returns NULL if running as SUID/SGID.  */
#ifndef HAVE_SECURE_GETENV
#ifdef HAVE___SECURE_GETENV
#define secure_getenv __secure_getenv
#elif defined (HAVE_UNISTD_H) && defined(HAVE_GETUID) && defined(HAVE_GETEUID) \
  && defined(HAVE_GETGID) && defined(HAVE_GETEGID)

#include <unistd.h>

/* Implementation of secure_getenv() for targets where it is not provided but
   we at least have the means to test real and effective IDs.  */

static char *
secure_getenv (const char *name)
{
  if ((getuid () == geteuid ()) && (getgid () == getegid ()))
    return getenv (name);
  else
    return NULL;
}

#else
#define secure_getenv getenv
#endif
#endif

/* }}}  */
/* {{{ Types  */

/* GCN-specific implementation of the GOMP_PLUGIN_acc_thread data.  */

struct gcn_thread
{
  /* The thread number from the async clause, or GOMP_ASYNC_SYNC.  */
  int async;
};

/* As the HSA runtime is dlopened, the following structure defines the
   function pointers utilized by the plug-in.  */

struct hsa_runtime_fn_info
{
  /* HSA runtime.  */
  hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
					const char **status_string);
  hsa_status_t (*hsa_system_get_info_fn) (hsa_system_info_t attribute,
					  void *value);
  hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
					 hsa_agent_info_t attribute,
					 void *value);
  hsa_status_t (*hsa_isa_get_info_fn) (hsa_isa_t isa,
				       hsa_isa_info_t attribute,
				       uint32_t index,
				       void *value);
  hsa_status_t (*hsa_init_fn) (void);
  hsa_status_t (*hsa_iterate_agents_fn)
    (hsa_status_t (*callback) (hsa_agent_t agent, void *data), void *data);
  hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
					  hsa_region_info_t attribute,
					  void *value);
  hsa_status_t (*hsa_queue_create_fn)
    (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
     void (*callback) (hsa_status_t status, hsa_queue_t *source, void *data),
     void *data, uint32_t private_segment_size,
     uint32_t group_segment_size, hsa_queue_t **queue);
  hsa_status_t (*hsa_agent_iterate_regions_fn)
    (hsa_agent_t agent,
     hsa_status_t (*callback) (hsa_region_t region, void *data), void *data);
  hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
  hsa_status_t (*hsa_executable_create_fn)
    (hsa_profile_t profile, hsa_executable_state_t executable_state,
     const char *options, hsa_executable_t *executable);
  hsa_status_t (*hsa_executable_global_variable_define_fn)
    (hsa_executable_t executable, const char *variable_name, void *address);
  hsa_status_t (*hsa_executable_load_code_object_fn)
    (hsa_executable_t executable, hsa_agent_t agent,
     hsa_code_object_t code_object, const char *options);
  hsa_status_t (*hsa_executable_freeze_fn) (hsa_executable_t executable,
					    const char *options);
  hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
					uint32_t num_consumers,
					const hsa_agent_t *consumers,
					hsa_signal_t *signal);
  hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
					  void **ptr);
  hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent,
					      hsa_access_permission_t access);
  hsa_status_t (*hsa_memory_copy_fn) (void *dst, const void *src, size_t size);
  hsa_status_t (*hsa_memory_free_fn) (void *ptr);
  hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
  hsa_status_t (*hsa_executable_get_symbol_fn)
    (hsa_executable_t executable, const char *module_name,
     const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
     hsa_executable_symbol_t *symbol);
  hsa_status_t (*hsa_executable_symbol_get_info_fn)
    (hsa_executable_symbol_t executable_symbol,
     hsa_executable_symbol_info_t attribute, void *value);
  hsa_status_t (*hsa_executable_iterate_symbols_fn)
    (hsa_executable_t executable,
     hsa_status_t (*callback) (hsa_executable_t executable,
			       hsa_executable_symbol_t symbol, void *data),
     void *data);
  uint64_t (*hsa_queue_add_write_index_release_fn) (const hsa_queue_t *queue,
						    uint64_t value);
  uint64_t (*hsa_queue_load_read_index_acquire_fn) (const hsa_queue_t *queue);
  void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
				       hsa_signal_value_t value);
  void (*hsa_signal_store_release_fn) (hsa_signal_t signal,
				       hsa_signal_value_t value);
  hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
    (hsa_signal_t signal, hsa_signal_condition_t condition,
     hsa_signal_value_t compare_value, uint64_t timeout_hint,
     hsa_wait_state_t wait_state_hint);
  hsa_signal_value_t (*hsa_signal_load_acquire_fn) (hsa_signal_t signal);
  hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);

  hsa_status_t (*hsa_code_object_deserialize_fn)
    (void *serialized_code_object, size_t serialized_code_object_size,
     const char *options, hsa_code_object_t *code_object);
};

/* Structure describing the run-time and grid properties of an HSA kernel
   launch.  This needs to match the format passed to GOMP_OFFLOAD_run.  */

struct GOMP_kernel_launch_attributes
{
  /* Number of dimensions the workload has.  Maximum number is 3.  */
  uint32_t ndim;
  /* Size of the grid in the three respective dimensions.  */
  uint32_t gdims[3];
  /* Size of work-groups in the respective dimensions.  */
  uint32_t wdims[3];
};
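
/* A minimal sketch (editor's example, using hypothetical values) of how a
   caller could describe a two-dimensional launch of 1024x8 work items in
   64x8 work-groups; the structure reaches GOMP_OFFLOAD_run via the target
   arguments mechanism parsed in parse_target_attributes, below:

     struct GOMP_kernel_launch_attributes kla;
     kla.ndim = 2;
     kla.gdims[0] = 1024;  // Grid size in dimension 0.
     kla.gdims[1] = 8;
     kla.wdims[0] = 64;    // Work-group size in dimension 0.
     kla.wdims[1] = 8;
*/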

/* Collection of information needed for a dispatch of a kernel.  */


struct kernel_dispatch
{
  struct agent_info *agent;
  /* Pointer to a command queue associated with a kernel dispatch agent.  */
  void *queue;
  /* Pointer to a memory space used for passing kernel arguments.  */
  void *kernarg_address;
  /* Kernel object.  */
  uint64_t object;
  /* Synchronization signal used for dispatch synchronization.  */
  uint64_t signal;
  /* Private segment size.  */
  uint32_t private_segment_size;
  /* Group segment size.  */
  uint32_t group_segment_size;
};

/* Structure of the kernargs segment, supporting console output.

   This needs to match the definitions in Newlib, and the expectations
   in libgomp target code.  */

struct kernargs {
  /* Leave space for the real kernel arguments.
     OpenACC and OpenMP only use one pointer.  */
  int64_t dummy1;
  int64_t dummy2;

  /* A pointer to struct output, below, for console output data.  */
  int64_t out_ptr;

  /* A pointer to struct heap, below.  */
  int64_t heap_ptr;

  /* A pointer to an ephemeral memory arena.
     Only needed for OpenMP.  */
  int64_t arena_ptr;

  /* Output data.  */
  struct output {
    int return_value;
    unsigned int next_output;
    struct printf_data {
      int written;
      char msg[128];
      int type;
      union {
	int64_t ivalue;
	double dvalue;
	char text[128];
      };
    } queue[1024];
    unsigned int consumed;
  } output_data;
};
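
/* For illustration only (editor's sketch; the real producer lives in the
   device-side Newlib code): a kernel appends a console message by claiming
   the next slot in the circular queue and setting "written" last, which is
   the ordering that console_output, below, relies upon:

     struct kernargs *ka = ...;  // Hypothetical pointer to the kernargs.
     struct output *out = (struct output *) ka->out_ptr;
     unsigned int slot
       = __atomic_fetch_add (&out->next_output, 1, __ATOMIC_ACQUIRE);
     struct printf_data *entry = &out->queue[slot % 1024];
     entry->type = 2;  // Message plus text payload.
     // ... fill in entry->msg and entry->text ...
     __atomic_store_n (&entry->written, 1, __ATOMIC_RELEASE);
*/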

/* A queue entry for a future asynchronous launch.  */

struct kernel_launch
{
  struct kernel_info *kernel;
  void *vars;
  struct GOMP_kernel_launch_attributes kla;
};

/* A queue entry for a future callback.  */

struct callback
{
  void (*fn) (void *);
  void *data;
};

/* A data struct for the copy_data callback.  */

struct copy_data
{
  void *dst;
  const void *src;
  size_t len;
  bool free_src;
  struct goacc_asyncqueue *aq;
};

/* A queue entry for a placeholder.  Each corresponds to a wait event.  */

struct placeholder
{
  int executed;
  pthread_cond_t cond;
  pthread_mutex_t mutex;
};

/* A queue entry for a wait directive.  */

struct asyncwait_info
{
  struct placeholder *placeholderp;
};

/* Encode the type of an entry in an async queue.  */

enum entry_type
{
  KERNEL_LAUNCH,
  CALLBACK,
  ASYNC_WAIT,
  ASYNC_PLACEHOLDER
};

/* An entry in an async queue.  */

struct queue_entry
{
  enum entry_type type;
  union {
    struct kernel_launch launch;
    struct callback callback;
    struct asyncwait_info asyncwait;
    struct placeholder placeholder;
  } u;
};

/* An async queue header.

   OpenMP may create one of these.
   OpenACC may create many.  */

struct goacc_asyncqueue
{
  struct agent_info *agent;
  hsa_queue_t *hsa_queue;

  pthread_t thread_drain_queue;
  pthread_mutex_t mutex;
  pthread_cond_t queue_cond_in;
  pthread_cond_t queue_cond_out;
  struct queue_entry queue[ASYNC_QUEUE_SIZE];
  int queue_first;
  int queue_n;
  int drain_queue_stop;

  int id;
  struct goacc_asyncqueue *prev;
  struct goacc_asyncqueue *next;
};
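
/* The queue array is used as a ring buffer (a sketch, for illustration):
   entries are appended at (queue_first + queue_n) % ASYNC_QUEUE_SIZE and
   consumed from queue_first, so a producer does, in essence:

     // With aq->mutex held, and after waiting on queue_cond_out while
     // aq->queue_n == ASYNC_QUEUE_SIZE:
     struct queue_entry *entry
       = &aq->queue[(aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE];
     entry->type = KERNEL_LAUNCH;
     // ... fill in entry->u ...
     aq->queue_n++;
     pthread_cond_signal (&aq->queue_cond_in);
*/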

/* Mkoffload uses this structure to describe a kernel.

   OpenMP kernel dimensions are passed at runtime.
   OpenACC kernel dimensions are passed at compile time, here.  */

struct hsa_kernel_description
{
  const char *name;
  int oacc_dims[3];  /* Only present for GCN kernels.  */
};

/* Mkoffload uses this structure to describe an offload variable.  */

struct global_var_info
{
  const char *name;
  void *address;
};

/* Mkoffload uses this structure to describe all the kernels in a
   loadable module.  These are passed to libgomp via static constructors.  */

struct gcn_image_desc
{
  struct gcn_image {
    size_t size;
    void *image;
  } *gcn_image;
  const unsigned kernel_count;
  struct hsa_kernel_description *kernel_infos;
  const unsigned global_variable_count;
  struct global_var_info *global_variables;
};

/* Description of an HSA GPU agent (device) and the program associated with
   it.  */

struct agent_info
{
  /* The HSA ID of the agent.  Assigned when hsa_context is initialized.  */
  hsa_agent_t id;
  /* The user-visible device number.  */
  int device_id;
  /* Whether the agent has been initialized.  The fields below are usable only
     if it has been.  */
  bool initialized;
  /* Precomputed check for problem architectures.  */
  bool gfx900_p;

  /* Command queues of the agent.  */
  hsa_queue_t *sync_queue;
  struct goacc_asyncqueue *async_queues, *omp_async_queue;
  pthread_mutex_t async_queues_mutex;

  /* The HSA memory region from which to allocate kernel arguments.  */
  hsa_region_t kernarg_region;

  /* The HSA memory region from which to allocate device data.  */
  hsa_region_t data_region;

  /* Allocated team arenas.  */
  struct team_arena_list *team_arena_list;
  pthread_mutex_t team_arena_write_lock;

  /* Read-write lock that protects kernels which are running or about to be run
     from interference with loading and unloading of images.  Needs to be
     locked for reading while a kernel is being run, and for writing if the
     list of modules is manipulated (and thus the HSA program invalidated).  */
  pthread_rwlock_t module_rwlock;

  /* The module associated with this agent.  */
  struct module_info *module;

  /* Mutex enforcing that only one thread will finalize the HSA program.  A
     thread should have locked agent->module_rwlock for reading before
     acquiring it.  */
  pthread_mutex_t prog_mutex;
  /* Flag whether the HSA program that consists of all the modules has been
     finalized.  */
  bool prog_finalized;
  /* HSA executable - the finalized program that is used to locate kernels.  */
  hsa_executable_t executable;
};

/* Information required to identify, finalize and run any given kernel.  */

enum offload_kind {KIND_UNKNOWN, KIND_OPENMP, KIND_OPENACC};

struct kernel_info
{
  /* Name of the kernel, required to locate it within the GCN object-code
     module.  */
  const char *name;
  /* The specific agent the kernel has been or will be finalized for and run
     on.  */
  struct agent_info *agent;
  /* The specific module in which the kernel resides.  */
  struct module_info *module;
  /* Mutex enforcing that at most one thread ever initializes a kernel for
     use.  A thread should have locked agent->module_rwlock for reading before
     acquiring it.  */
  pthread_mutex_t init_mutex;
  /* Flag indicating whether the kernel has been initialized and all fields
     below it contain valid data.  */
  bool initialized;
  /* Flag indicating that the kernel has a problem that blocks its
     execution.  */
  bool initialization_failed;
  /* The object to be put into the dispatch queue.  */
  uint64_t object;
  /* Required size of kernel arguments.  */
  uint32_t kernarg_segment_size;
  /* Required size of group segment.  */
  uint32_t group_segment_size;
  /* Required size of private segment.  */
  uint32_t private_segment_size;
  /* Set up for OpenMP or OpenACC?  */
  enum offload_kind kind;
};

/* Information about a particular GCN module, its image and kernels.  */

struct module_info
{
  /* The description with which the program has registered the image.  */
  struct gcn_image_desc *image_desc;
  /* GCN heap allocation.  */
  struct heap *heap;
  /* Physical boundaries of the loaded module.  */
  Elf64_Addr phys_address_start;
  Elf64_Addr phys_address_end;

  bool constructors_run_p;
  struct kernel_info *init_array_func, *fini_array_func;

  /* Number of kernels in this module.  */
  int kernel_count;
  /* An array of kernel_info structures describing each kernel in this
     module.  */
  struct kernel_info kernels[];
};

/* A linked list of memory arenas allocated on the device.
   These are only used by OpenMP, as a means to optimize per-team malloc.  */

struct team_arena_list
{
  struct team_arena_list *next;

  /* The number of teams determines the size of the allocation.  */
  int num_teams;
  /* The device address of the arena itself.  */
  void *arena;
  /* A flag to prevent two asynchronous kernels from trying to use the same
     arena.  The mutex is locked until the kernel exits.  */
  pthread_mutex_t in_use;
};

/* Information about the whole HSA environment and all of its agents.  */

struct hsa_context_info
{
  /* Whether the structure has been initialized.  */
  bool initialized;
  /* Number of usable GPU HSA agents in the system.  */
  int agent_count;
  /* Array of agent_info structures describing the individual HSA agents.  */
  struct agent_info *agents;
};

/* Format of the on-device heap.

   This must match the definition in Newlib and gcn-run.  */

struct heap {
  int64_t size;
  char data[0];
};
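
/* For illustration (editor's sketch, assuming a coarse-grained data region
   selected by get_data_memory_region, below): the heap is a single device
   allocation whose first 8 bytes hold its usable size, so initializing one
   amounts to something like:

     struct heap *heap;
     size_t size = gcn_kernel_heap_size;
     hsa_fns.hsa_memory_allocate_fn (data_region, size + sizeof (struct heap),
                                     (void **) &heap);
     int64_t size_value = size;
     // Device memory may not be host-writable, so copy the size in.
     hsa_fns.hsa_memory_copy_fn (&heap->size, &size_value, sizeof (int64_t));
*/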

/* }}}  */
/* {{{ Global variables  */

/* Information about the whole HSA environment and all of its agents.  */

static struct hsa_context_info hsa_context;

/* HSA runtime functions that are initialized in init_hsa_context.  */

static struct hsa_runtime_fn_info hsa_fns;

/* Heap space, allocated target-side, provided for use of newlib malloc.
   Each module should have its own heap allocated.
   Beware that heap usage increases with OpenMP teams.  See also arenas.  */

static size_t gcn_kernel_heap_size = DEFAULT_GCN_HEAP_SIZE;

/* Flag to decide whether to print information about what is going on to
   stderr.  Set in init_debug depending on environment variables.  */

static bool debug;

/* Flag to decide if the runtime should suppress a possible fallback to host
   execution.  */

static bool suppress_host_fallback;

/* Location of the HSA runtime shared library that is dlopened by this
   plug-in.  */

static const char *hsa_runtime_lib;

/* Flag to decide if the runtime should also support CPU devices (can be
   a simulator).  */

static bool support_cpu_devices;

/* Runtime dimension overrides.  Zero indicates default.  */

static int override_x_dim = 0;
static int override_z_dim = 0;

/* }}}  */
/* {{{ Debug & Diagnostic  */

/* Print a message to stderr if GCN_DEBUG is set in the environment.  */

#define DEBUG_PRINT(...) \
  do \
  { \
    if (debug) \
      { \
	fprintf (stderr, __VA_ARGS__); \
      } \
  } \
  while (false)

/* Flush stderr if GCN_DEBUG is set in the environment.  */

#define DEBUG_FLUSH() \
  do { \
    if (debug) \
      fflush (stderr); \
  } while (false)

/* Print a logging message with PREFIX to stderr if GCN_DEBUG is set in the
   environment.  */

#define DEBUG_LOG(prefix, ...) \
  do \
    { \
      DEBUG_PRINT (prefix); \
      DEBUG_PRINT (__VA_ARGS__); \
      DEBUG_FLUSH (); \
    } while (false)

/* Print a debugging message to stderr.  */

#define GCN_DEBUG(...) DEBUG_LOG ("GCN debug: ", __VA_ARGS__)

/* Print a warning message to stderr.  */

#define GCN_WARNING(...) DEBUG_LOG ("GCN warning: ", __VA_ARGS__)
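
/* Usage is printf-style; for example (illustrative only):

     GCN_DEBUG ("Launching kernel %s on device %d\n", name, device);
     GCN_WARNING ("Unrecognized environment variable %s ignored\n", var);

   Both expand to nothing observable unless GCN_DEBUG is set in the
   environment (see init_environment_variables, below).  */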

/* Print HSA warning STR with an HSA STATUS code.  */

static void
hsa_warn (const char *str, hsa_status_t status)
{
  if (!debug)
    return;

  const char *hsa_error_msg = "[unknown]";
  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);

  fprintf (stderr, "GCN warning: %s\nRuntime message: %s\n", str,
	   hsa_error_msg);
}

/* Report a fatal error STR together with the HSA error corresponding to STATUS
   and terminate execution of the current process.  */

static void
hsa_fatal (const char *str, hsa_status_t status)
{
  const char *hsa_error_msg = "[unknown]";
  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
  GOMP_PLUGIN_fatal ("GCN fatal error: %s\nRuntime message: %s\n", str,
		     hsa_error_msg);
}

/* Like hsa_fatal, except only report the error message and return FALSE,
   propagating error handling to the caller outside the plugin.  */

static bool
hsa_error (const char *str, hsa_status_t status)
{
  const char *hsa_error_msg = "[unknown]";
  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
  GOMP_PLUGIN_error ("GCN fatal error: %s\nRuntime message: %s\n", str,
		     hsa_error_msg);
  return false;
}

/* Dump information about the available hardware.  */

static void
dump_hsa_system_info (void)
{
  hsa_status_t status;

  hsa_endianness_t endianness;
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_ENDIANNESS,
					   &endianness);
  if (status == HSA_STATUS_SUCCESS)
    switch (endianness)
      {
      case HSA_ENDIANNESS_LITTLE:
	GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: LITTLE\n");
	break;
      case HSA_ENDIANNESS_BIG:
	GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: BIG\n");
	break;
      default:
	GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: UNKNOWN\n");
      }
  else
    GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: FAILED\n");

  uint8_t extensions[128];
  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_EXTENSIONS,
					   &extensions);
  if (status == HSA_STATUS_SUCCESS)
    {
      if (extensions[0] & (1 << HSA_EXTENSION_IMAGES))
	GCN_DEBUG ("HSA_SYSTEM_INFO_EXTENSIONS: IMAGES\n");
    }
  else
    GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n");
}

/* Dump MACHINE_MODEL, prefixed with the label S.  */

static void
dump_machine_model (hsa_machine_model_t machine_model, const char *s)
{
  switch (machine_model)
    {
    case HSA_MACHINE_MODEL_SMALL:
      GCN_DEBUG ("%s: SMALL\n", s);
      break;
    case HSA_MACHINE_MODEL_LARGE:
      GCN_DEBUG ("%s: LARGE\n", s);
      break;
    default:
      GCN_WARNING ("%s: UNKNOWN\n", s);
      break;
    }
}

/* Dump PROFILE, prefixed with the label S.  */

static void
dump_profile (hsa_profile_t profile, const char *s)
{
  switch (profile)
    {
    case HSA_PROFILE_FULL:
      GCN_DEBUG ("%s: FULL\n", s);
      break;
    case HSA_PROFILE_BASE:
      GCN_DEBUG ("%s: BASE\n", s);
      break;
    default:
      GCN_WARNING ("%s: UNKNOWN\n", s);
      break;
    }
}

/* Dump information about a device memory region.  */

static hsa_status_t
dump_hsa_region (hsa_region_t region, void *data __attribute__((unused)))
{
  hsa_status_t status;

  hsa_region_segment_t segment;
  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
					   &segment);
  if (status == HSA_STATUS_SUCCESS)
    {
      if (segment == HSA_REGION_SEGMENT_GLOBAL)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GLOBAL\n");
      else if (segment == HSA_REGION_SEGMENT_READONLY)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: READONLY\n");
      else if (segment == HSA_REGION_SEGMENT_PRIVATE)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: PRIVATE\n");
      else if (segment == HSA_REGION_SEGMENT_GROUP)
	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GROUP\n");
      else
	GCN_WARNING ("HSA_REGION_INFO_SEGMENT: UNKNOWN\n");
    }
  else
    GCN_WARNING ("HSA_REGION_INFO_SEGMENT: FAILED\n");

  if (status == HSA_STATUS_SUCCESS && segment == HSA_REGION_SEGMENT_GLOBAL)
    {
      uint32_t flags;
      status
	= hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
					  &flags);
      if (status == HSA_STATUS_SUCCESS)
	{
	  if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: KERNARG\n");
	  if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED)
	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: FINE_GRAINED\n");
	  if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED)
	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: COARSE_GRAINED\n");
	}
      else
	GCN_WARNING ("HSA_REGION_INFO_GLOBAL_FLAGS: FAILED\n");
    }

  size_t size;
  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_SIZE: %zu\n", size);
  else
    GCN_WARNING ("HSA_REGION_INFO_SIZE: FAILED\n");

  status
    = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_ALLOC_MAX_SIZE,
				      &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_ALLOC_MAX_SIZE: %zu\n", size);
  else
    GCN_WARNING ("HSA_REGION_INFO_ALLOC_MAX_SIZE: FAILED\n");

  bool alloc_allowed;
  status
    = hsa_fns.hsa_region_get_info_fn (region,
				      HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED,
				      &alloc_allowed);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: %u\n", alloc_allowed);
  else
    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: FAILED\n");

  if (status != HSA_STATUS_SUCCESS || !alloc_allowed)
    return HSA_STATUS_SUCCESS;

  status
    = hsa_fns.hsa_region_get_info_fn (region,
				      HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE,
				      &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: %zu\n", size);
  else
    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: FAILED\n");

  size_t align;
  status
    = hsa_fns.hsa_region_get_info_fn (region,
				      HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT,
				      &align);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: %zu\n", align);
  else
    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: FAILED\n");

  return HSA_STATUS_SUCCESS;
}

/* Dump information about all the device memory regions.  */

static void
dump_hsa_regions (hsa_agent_t agent)
{
  hsa_status_t status;
  status = hsa_fns.hsa_agent_iterate_regions_fn (agent,
						 dump_hsa_region,
						 NULL);
  if (status != HSA_STATUS_SUCCESS)
    hsa_error ("Dumping hsa regions failed", status);
}

/* Dump information about the available devices.  */

static hsa_status_t
dump_hsa_agent_info (hsa_agent_t agent, void *data __attribute__((unused)))
{
  hsa_status_t status;

  char buf[64];
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME,
					  &buf);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_NAME: %s\n", buf);
  else
    GCN_WARNING ("HSA_AGENT_INFO_NAME: FAILED\n");

  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_VENDOR_NAME,
					  &buf);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_VENDOR_NAME: %s\n", buf);
  else
    GCN_WARNING ("HSA_AGENT_INFO_VENDOR_NAME: FAILED\n");

  hsa_machine_model_t machine_model;
  status
    = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_MACHINE_MODEL,
				     &machine_model);
  if (status == HSA_STATUS_SUCCESS)
    dump_machine_model (machine_model, "HSA_AGENT_INFO_MACHINE_MODEL");
  else
    GCN_WARNING ("HSA_AGENT_INFO_MACHINE_MODEL: FAILED\n");

  hsa_profile_t profile;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_PROFILE,
					  &profile);
  if (status == HSA_STATUS_SUCCESS)
    dump_profile (profile, "HSA_AGENT_INFO_PROFILE");
  else
    GCN_WARNING ("HSA_AGENT_INFO_PROFILE: FAILED\n");

  hsa_device_type_t device_type;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
					  &device_type);
  if (status == HSA_STATUS_SUCCESS)
    {
      switch (device_type)
	{
	case HSA_DEVICE_TYPE_CPU:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: CPU\n");
	  break;
	case HSA_DEVICE_TYPE_GPU:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: GPU\n");
	  break;
	case HSA_DEVICE_TYPE_DSP:
	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: DSP\n");
	  break;
	default:
	  GCN_WARNING ("HSA_AGENT_INFO_DEVICE: UNKNOWN\n");
	  break;
	}
    }
  else
    GCN_WARNING ("HSA_AGENT_INFO_DEVICE: FAILED\n");

  uint32_t cu_count;
  status = hsa_fns.hsa_agent_get_info_fn
    (agent, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: %u\n", cu_count);
  else
    GCN_WARNING ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: FAILED\n");

  uint32_t size;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_WAVEFRONT_SIZE,
					  &size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WAVEFRONT_SIZE: %u\n", size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WAVEFRONT_SIZE: FAILED\n");

  uint32_t max_dim;
  status = hsa_fns.hsa_agent_get_info_fn (agent,
					  HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
					  &max_dim);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: %u\n", max_dim);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: FAILED\n");

  uint32_t max_size;
  status = hsa_fns.hsa_agent_get_info_fn (agent,
					  HSA_AGENT_INFO_WORKGROUP_MAX_SIZE,
					  &max_size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: %u\n", max_size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: FAILED\n");

  uint32_t grid_max_dim;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_DIM,
					  &grid_max_dim);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_DIM: %u\n", grid_max_dim);
  else
    GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_DIM: FAILED\n");

  uint32_t grid_max_size;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_SIZE,
					  &grid_max_size);
  if (status == HSA_STATUS_SUCCESS)
    GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_SIZE: %u\n", grid_max_size);
  else
    GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_SIZE: FAILED\n");

  dump_hsa_regions (agent);

  return HSA_STATUS_SUCCESS;
}

/* Forward reference.  */

static char *get_executable_symbol_name (hsa_executable_symbol_t symbol);

/* Helper function for dump_executable_symbols.  */

static hsa_status_t
dump_executable_symbol (hsa_executable_t executable,
			hsa_executable_symbol_t symbol,
			void *data __attribute__((unused)))
{
  char *name = get_executable_symbol_name (symbol);

  if (name)
    {
      GCN_DEBUG ("executable symbol: %s\n", name);
      free (name);
    }

  return HSA_STATUS_SUCCESS;
}

/* Dump all global symbols in an executable.  */

static void
dump_executable_symbols (hsa_executable_t executable)
{
  hsa_status_t status;
  status
    = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
						 dump_executable_symbol,
						 NULL);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not dump HSA executable symbols", status);
}

/* Dump kernel DISPATCH data structure and indent it by INDENT spaces.  */

static void
print_kernel_dispatch (struct kernel_dispatch *dispatch, unsigned indent)
{
  struct kernargs *kernargs = (struct kernargs *)dispatch->kernarg_address;

  fprintf (stderr, "%*sthis: %p\n", indent, "", dispatch);
  fprintf (stderr, "%*squeue: %p\n", indent, "", dispatch->queue);
  fprintf (stderr, "%*skernarg_address: %p\n", indent, "", kernargs);
  fprintf (stderr, "%*sheap address: %p\n", indent, "",
	   (void*)kernargs->heap_ptr);
  fprintf (stderr, "%*sarena address: %p\n", indent, "",
	   (void*)kernargs->arena_ptr);
  fprintf (stderr, "%*sobject: %lu\n", indent, "", dispatch->object);
  fprintf (stderr, "%*sprivate_segment_size: %u\n", indent, "",
	   dispatch->private_segment_size);
  fprintf (stderr, "%*sgroup_segment_size: %u\n", indent, "",
	   dispatch->group_segment_size);
  fprintf (stderr, "\n");
}

/* }}}  */
/* {{{ Utility functions  */

/* Cast the thread local storage to gcn_thread.  */

static inline struct gcn_thread *
gcn_thread (void)
{
  return (struct gcn_thread *) GOMP_PLUGIN_acc_thread ();
}

/* Initialize debug and suppress_host_fallback according to the environment,
   along with the other environment-controlled options.  */

static void
init_environment_variables (void)
{
  if (secure_getenv ("GCN_DEBUG"))
    debug = true;
  else
    debug = false;

  if (secure_getenv ("GCN_SUPPRESS_HOST_FALLBACK"))
    suppress_host_fallback = true;
  else
    suppress_host_fallback = false;

  hsa_runtime_lib = secure_getenv ("HSA_RUNTIME_LIB");
  if (hsa_runtime_lib == NULL)
    hsa_runtime_lib = HSA_RUNTIME_LIB "libhsa-runtime64.so";

  support_cpu_devices = secure_getenv ("GCN_SUPPORT_CPU_DEVICES");

  const char *x = secure_getenv ("GCN_NUM_TEAMS");
  if (!x)
    x = secure_getenv ("GCN_NUM_GANGS");
  if (x)
    override_x_dim = atoi (x);

  const char *z = secure_getenv ("GCN_NUM_THREADS");
  if (!z)
    z = secure_getenv ("GCN_NUM_WORKERS");
  if (z)
    override_z_dim = atoi (z);

  const char *heap = secure_getenv ("GCN_HEAP_SIZE");
  if (heap)
    {
      size_t tmp = atol (heap);
      if (tmp)
	gcn_kernel_heap_size = tmp;
    }
}

/* Return malloc'd string with name of SYMBOL.  */

static char *
get_executable_symbol_name (hsa_executable_symbol_t symbol)
{
  hsa_status_t status;
  char *res;
  uint32_t len;
  const hsa_executable_symbol_info_t info_name_length
    = HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH;

  status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name_length,
						      &len);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not get length of symbol name", status);
      return NULL;
    }

  res = GOMP_PLUGIN_malloc (len + 1);

  const hsa_executable_symbol_info_t info_name
    = HSA_EXECUTABLE_SYMBOL_INFO_NAME;

  status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name, res);

  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not get symbol name", status);
      free (res);
      return NULL;
    }

  res[len] = '\0';

  return res;
}

/* Helper function for find_executable_symbol.  */

static hsa_status_t
find_executable_symbol_1 (hsa_executable_t executable,
			  hsa_executable_symbol_t symbol,
			  void *data)
{
  hsa_executable_symbol_t *res = (hsa_executable_symbol_t *)data;
  *res = symbol;
  return HSA_STATUS_INFO_BREAK;
}

/* Find a global symbol in EXECUTABLE, save to *SYMBOL and return true.  If not
   found, return false.  */

static bool
find_executable_symbol (hsa_executable_t executable,
			hsa_executable_symbol_t *symbol)
{
  hsa_status_t status;

  status
    = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
						 find_executable_symbol_1,
						 symbol);
  if (status != HSA_STATUS_INFO_BREAK)
    {
      hsa_error ("Could not find executable symbol", status);
      return false;
    }

  return true;
}

/* Get the number of GPU Compute Units.  */

static int
get_cu_count (struct agent_info *agent)
{
  uint32_t cu_count;
  hsa_status_t status = hsa_fns.hsa_agent_get_info_fn
    (agent->id, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
  if (status == HSA_STATUS_SUCCESS)
    return cu_count;
  else
    return 64;  /* The usual number for older devices.  */
}

/* Calculate the maximum grid size for OMP threads / OACC workers.
   This depends on the kernel's resource usage levels.  */

static int
limit_worker_threads (int threads)
{
  /* FIXME Do something more intelligent here.
     GCN can always run 4 threads within a Compute Unit, but
     more than that depends on register usage.  */
  if (threads > 16)
    threads = 16;
  return threads;
}

/* Parse the target attributes INPUT provided by the compiler and return true
   if we should run anything at all.  If INPUT is NULL, fill DEF with default
   values, then store INPUT or DEF into *RESULT.

   This is used for OpenMP only.  */

static bool
parse_target_attributes (void **input,
			 struct GOMP_kernel_launch_attributes *def,
			 struct GOMP_kernel_launch_attributes **result,
			 struct agent_info *agent)
{
  if (!input)
    GOMP_PLUGIN_fatal ("No target arguments provided");

  bool grid_attrs_found = false;
  bool gcn_dims_found = false;
  int gcn_teams = 0;
  int gcn_threads = 0;
  while (*input)
    {
      intptr_t id = (intptr_t) *input++, val;

      if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
	val = (intptr_t) *input++;
      else
	val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;

      val = (val > INT_MAX) ? INT_MAX : val;

      if ((id & GOMP_TARGET_ARG_DEVICE_MASK) == GOMP_DEVICE_GCN
	  && ((id & GOMP_TARGET_ARG_ID_MASK)
	      == GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES))
	{
	  grid_attrs_found = true;
	  break;
	}
      else if ((id & GOMP_TARGET_ARG_DEVICE_ALL) == GOMP_TARGET_ARG_DEVICE_ALL)
	{
	  gcn_dims_found = true;
	  switch (id & GOMP_TARGET_ARG_ID_MASK)
	    {
	    case GOMP_TARGET_ARG_NUM_TEAMS:
	      gcn_teams = val;
	      break;
	    case GOMP_TARGET_ARG_THREAD_LIMIT:
	      gcn_threads = limit_worker_threads (val);
	      break;
	    default:
	      ;
	    }
	}
    }

  if (gcn_dims_found)
    {
      if (agent->gfx900_p && gcn_threads == 0 && override_z_dim == 0)
	{
	  gcn_threads = 4;
	  GCN_WARNING ("VEGA BUG WORKAROUND: reducing default number of "
		       "threads to 4 per team.\n");
	  GCN_WARNING (" - If this is not a Vega 10 device, please use "
		       "GCN_NUM_THREADS=16\n");
	}

      def->ndim = 3;
      /* Fiji has 64 CUs, but Vega20 has 60.  */
      def->gdims[0] = (gcn_teams > 0) ? gcn_teams : get_cu_count (agent);
      /* Each thread is 64 work items wide.  */
      def->gdims[1] = 64;
      /* A work group can have 16 wavefronts.  */
      def->gdims[2] = (gcn_threads > 0) ? gcn_threads : 16;
      def->wdims[0] = 1;  /* Single team per work-group.  */
      def->wdims[1] = 64;
      def->wdims[2] = 16;
      *result = def;
      return true;
    }
  else if (!grid_attrs_found)
    {
      def->ndim = 1;
      def->gdims[0] = 1;
      def->gdims[1] = 1;
      def->gdims[2] = 1;
      def->wdims[0] = 1;
      def->wdims[1] = 1;
      def->wdims[2] = 1;
      *result = def;
      GCN_WARNING ("GOMP_OFFLOAD_run called with no launch attributes\n");
      return true;
    }

  struct GOMP_kernel_launch_attributes *kla;
  kla = (struct GOMP_kernel_launch_attributes *) *input;
  *result = kla;
  if (kla->ndim == 0 || kla->ndim > 3)
    GOMP_PLUGIN_fatal ("Invalid number of dimensions (%u)", kla->ndim);

  GCN_DEBUG ("GOMP_OFFLOAD_run called with %u dimensions:\n", kla->ndim);
  unsigned i;
  for (i = 0; i < kla->ndim; i++)
    {
      GCN_DEBUG (" Dimension %u: grid size %u and group size %u\n", i,
		 kla->gdims[i], kla->wdims[i]);
      if (kla->gdims[i] == 0)
	return false;
    }
  return true;
}
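
/* To make the default OpenMP geometry concrete (editor's example, assuming a
   64-CU device and no environment overrides): gdims = {64, 64, 16} and
   wdims = {1, 64, 16} give one work-group per team, each 64 lanes wide and
   16 wavefronts deep:

     int teams = 64;               // gdims[0]: one team per Compute Unit.
     int total = teams * 64 * 16;  // 65536 work items in total.
     int per_group = 1 * 64 * 16;  // 1024 work items per work-group.
*/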

/* Return the group size given the requested GROUP size, GRID size and number
   of grid dimensions NDIM.  */

static uint32_t
get_group_size (uint32_t ndim, uint32_t grid, uint32_t group)
{
  if (group == 0)
    {
      /* TODO: Provide a default via environment or device characteristics.  */
      if (ndim == 1)
	group = 64;
      else if (ndim == 2)
	group = 8;
      else
	group = 4;
    }

  if (group > grid)
    group = grid;
  return group;
}
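
/* For example (illustrative): a one-dimensional launch of 1000 work items
   with no explicit group size gets the default of 64, and a request that
   exceeds the grid is clamped:

     get_group_size (1, 1000, 0);  // returns 64: the 1-D default.
     get_group_size (2, 4, 0);     // returns 4: default 8, clamped to grid.
     get_group_size (1, 32, 128);  // returns 32: group clamped to grid.
*/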

/* Atomically store pair of uint16_t values (HEADER and REST) to a PACKET.  */

static void
packet_store_release (uint32_t* packet, uint16_t header, uint16_t rest)
{
  __atomic_store_n (packet, header | (rest << 16), __ATOMIC_RELEASE);
}
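
/* A sketch of how this is used when dispatching (the field and enum names
   are from the standard HSA headers; the real call site appears later in
   this file): the first 32 bits of an AQL packet hold the 16-bit header and
   the 16-bit setup word, and must become visible atomically once the rest
   of the packet has been written:

     uint16_t header = HSA_PACKET_TYPE_KERNEL_DISPATCH
		       << HSA_PACKET_HEADER_TYPE;
     header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
     header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
     uint16_t setup = (uint16_t) kla->ndim
		      << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
     packet_store_release ((uint32_t *) packet, header, setup);
*/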

/* A never-called callback for the HSA command queues.  These signal events
   that we don't use, so we trigger an error.

   This "queue" is not to be confused with the async queues, below.  */

static void
hsa_queue_callback (hsa_status_t status,
		    hsa_queue_t *queue __attribute__ ((unused)),
		    void *data __attribute__ ((unused)))
{
  hsa_fatal ("Asynchronous queue error", status);
}

/* }}}  */
/* {{{ HSA initialization  */

/* Populate hsa_fns with the function addresses from libhsa-runtime64.so.  */

static bool
init_hsa_runtime_functions (void)
{
#define DLSYM_FN(function) \
  hsa_fns.function##_fn = dlsym (handle, #function); \
  if (hsa_fns.function##_fn == NULL) \
    return false;
  void *handle = dlopen (hsa_runtime_lib, RTLD_LAZY);
  if (handle == NULL)
    return false;

  DLSYM_FN (hsa_status_string)
  DLSYM_FN (hsa_system_get_info)
  DLSYM_FN (hsa_agent_get_info)
  DLSYM_FN (hsa_isa_get_info)
  DLSYM_FN (hsa_init)
  DLSYM_FN (hsa_iterate_agents)
  DLSYM_FN (hsa_region_get_info)
  DLSYM_FN (hsa_queue_create)
  DLSYM_FN (hsa_agent_iterate_regions)
  DLSYM_FN (hsa_executable_destroy)
  DLSYM_FN (hsa_executable_create)
  DLSYM_FN (hsa_executable_global_variable_define)
  DLSYM_FN (hsa_executable_load_code_object)
  DLSYM_FN (hsa_executable_freeze)
  DLSYM_FN (hsa_signal_create)
  DLSYM_FN (hsa_memory_allocate)
  DLSYM_FN (hsa_memory_assign_agent)
  DLSYM_FN (hsa_memory_copy)
  DLSYM_FN (hsa_memory_free)
  DLSYM_FN (hsa_signal_destroy)
  DLSYM_FN (hsa_executable_get_symbol)
  DLSYM_FN (hsa_executable_symbol_get_info)
  DLSYM_FN (hsa_executable_iterate_symbols)
  DLSYM_FN (hsa_queue_add_write_index_release)
  DLSYM_FN (hsa_queue_load_read_index_acquire)
  DLSYM_FN (hsa_signal_wait_acquire)
  DLSYM_FN (hsa_signal_store_relaxed)
  DLSYM_FN (hsa_signal_store_release)
  DLSYM_FN (hsa_signal_load_acquire)
  DLSYM_FN (hsa_queue_destroy)
  DLSYM_FN (hsa_code_object_deserialize)
  return true;
#undef DLSYM_FN
}
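
/* For clarity (illustrative): each DLSYM_FN (foo) above expands to

     hsa_fns.foo_fn = dlsym (handle, "foo");
     if (hsa_fns.foo_fn == NULL)
       return false;

   so a single missing symbol in the runtime library makes initialization
   fail, and the caller falls back to host execution (unless suppressed).  */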

/* Return true if the agent is a GPU and can accept concurrent submissions
   from different threads.  */

static bool
suitable_hsa_agent_p (hsa_agent_t agent)
{
  hsa_device_type_t device_type;
  hsa_status_t status
    = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
				     &device_type);
  if (status != HSA_STATUS_SUCCESS)
    return false;

  switch (device_type)
    {
    case HSA_DEVICE_TYPE_GPU:
      break;
    case HSA_DEVICE_TYPE_CPU:
      if (!support_cpu_devices)
	return false;
      break;
    default:
      return false;
    }

  uint32_t features = 0;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_FEATURE,
					  &features);
  if (status != HSA_STATUS_SUCCESS
      || !(features & HSA_AGENT_FEATURE_KERNEL_DISPATCH))
    return false;
  hsa_queue_type_t queue_type;
  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_QUEUE_TYPE,
					  &queue_type);
  if (status != HSA_STATUS_SUCCESS
      || (queue_type != HSA_QUEUE_TYPE_MULTI))
    return false;

  return true;
}

/* Callback of hsa_iterate_agents; if AGENT is a GPU device, increment
   agent_count in hsa_context.  */

static hsa_status_t
count_gpu_agents (hsa_agent_t agent, void *data __attribute__ ((unused)))
{
  if (suitable_hsa_agent_p (agent))
    hsa_context.agent_count++;
  return HSA_STATUS_SUCCESS;
}

/* Callback of hsa_iterate_agents; if AGENT is a GPU device, assign the agent
   id to the describing structure in the hsa context.  DATA points to the
   index of that structure; increment it afterwards.  */

static hsa_status_t
assign_agent_ids (hsa_agent_t agent, void *data)
{
  if (suitable_hsa_agent_p (agent))
    {
      int *agent_index = (int *) data;
      hsa_context.agents[*agent_index].id = agent;
      ++*agent_index;
    }
  return HSA_STATUS_SUCCESS;
}

/* Initialize hsa_context if it has not already been done.
   Return TRUE on success.  */

static bool
init_hsa_context (void)
{
  hsa_status_t status;
  int agent_index = 0;

  if (hsa_context.initialized)
    return true;
  init_environment_variables ();
  if (!init_hsa_runtime_functions ())
    {
      GCN_WARNING ("Run-time could not be dynamically opened\n");
      if (suppress_host_fallback)
	GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
      return false;
    }
  status = hsa_fns.hsa_init_fn ();
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("Run-time could not be initialized", status);
  GCN_DEBUG ("HSA run-time initialized for GCN\n");

  if (debug)
    dump_hsa_system_info ();

  status = hsa_fns.hsa_iterate_agents_fn (count_gpu_agents, NULL);
  if (status != HSA_STATUS_SUCCESS)
    return hsa_error ("GCN GPU devices could not be enumerated", status);
  GCN_DEBUG ("There are %i GCN GPU devices.\n", hsa_context.agent_count);

  hsa_context.agents
    = GOMP_PLUGIN_malloc_cleared (hsa_context.agent_count
				  * sizeof (struct agent_info));
  status = hsa_fns.hsa_iterate_agents_fn (assign_agent_ids, &agent_index);
  if (agent_index != hsa_context.agent_count)
    {
      GOMP_PLUGIN_error ("Failed to assign IDs to all GCN agents");
      return false;
    }

  if (debug)
    {
      status = hsa_fns.hsa_iterate_agents_fn (dump_hsa_agent_info, NULL);
      if (status != HSA_STATUS_SUCCESS)
	GOMP_PLUGIN_error ("Failed to list all HSA runtime agents");
    }

  hsa_context.initialized = true;
  return true;
}

/* Verify that hsa_context has already been initialized and return the
   agent_info structure describing device number N.  Return NULL on error.  */

static struct agent_info *
get_agent_info (int n)
{
  if (!hsa_context.initialized)
    {
      GOMP_PLUGIN_error ("Attempt to use uninitialized GCN context.");
      return NULL;
    }
  if (n >= hsa_context.agent_count)
    {
      GOMP_PLUGIN_error ("Request to operate on non-existent GCN device %i", n);
      return NULL;
    }
  if (!hsa_context.agents[n].initialized)
    {
      GOMP_PLUGIN_error ("Attempt to use an uninitialized GCN agent.");
      return NULL;
    }
  return &hsa_context.agents[n];
}

/* Callback of hsa_agent_iterate_regions, via get_*_memory_region functions.

   Selects (breaks at) a suitable region of type KIND.  */

static hsa_status_t
get_memory_region (hsa_region_t region, hsa_region_t *retval,
		   hsa_region_global_flag_t kind)
{
  hsa_status_t status;
  hsa_region_segment_t segment;

  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
					   &segment);
  if (status != HSA_STATUS_SUCCESS)
    return status;
  if (segment != HSA_REGION_SEGMENT_GLOBAL)
    return HSA_STATUS_SUCCESS;

  uint32_t flags;
  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
					   &flags);
  if (status != HSA_STATUS_SUCCESS)
    return status;
  if (flags & kind)
    {
      *retval = region;
      return HSA_STATUS_INFO_BREAK;
    }
  return HSA_STATUS_SUCCESS;
}

/* Callback of hsa_agent_iterate_regions.

   Selects a kernargs memory region.  */

static hsa_status_t
get_kernarg_memory_region (hsa_region_t region, void *data)
{
  return get_memory_region (region, (hsa_region_t *)data,
			    HSA_REGION_GLOBAL_FLAG_KERNARG);
}

/* Callback of hsa_agent_iterate_regions.

   Selects a coarse-grained memory region suitable for the heap and
   offload data.  */

static hsa_status_t
get_data_memory_region (hsa_region_t region, void *data)
{
  return get_memory_region (region, (hsa_region_t *)data,
			    HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED);
}
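
/* For illustration (editor's sketch; the real calls appear in the device
   initialization code): the callbacks above are passed to
   hsa_agent_iterate_regions, and HSA_STATUS_INFO_BREAK indicates that a
   suitable region was found and stored through the data pointer:

     hsa_region_t kernarg_region;
     hsa_status_t status
       = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
					       get_kernarg_memory_region,
					       &kernarg_region);
     if (status != HSA_STATUS_INFO_BREAK)
       hsa_error ("No suitable kernarg region found", status);  // Sketch.
*/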

/* }}}  */
/* {{{ Run  */

/* Create or reuse a team arena.

   Team arenas are used by OpenMP to avoid calling malloc multiple times
   while setting up each team.  This is purely a performance optimization.

   Allocating an arena also costs performance, albeit on the host side, so
   this function will reuse an existing arena if a large enough one is idle.
   The arena is released, but not deallocated, when the kernel exits.  */

static void *
get_team_arena (struct agent_info *agent, int num_teams)
{
  struct team_arena_list **next_ptr = &agent->team_arena_list;
  struct team_arena_list *item;

  for (item = *next_ptr; item; next_ptr = &item->next, item = item->next)
    {
      if (item->num_teams < num_teams)
	continue;

      if (pthread_mutex_trylock (&item->in_use))
	continue;

      return item->arena;
    }

  GCN_DEBUG ("Creating a new arena for %d teams\n", num_teams);

  if (pthread_mutex_lock (&agent->team_arena_write_lock))
    {
      GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
      return NULL;
    }
  item = malloc (sizeof (*item));
  item->num_teams = num_teams;
  item->next = NULL;
  *next_ptr = item;

  if (pthread_mutex_init (&item->in_use, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
      return NULL;
    }
  if (pthread_mutex_lock (&item->in_use))
    {
      GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
      return NULL;
    }
  if (pthread_mutex_unlock (&agent->team_arena_write_lock))
    {
      GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
      return NULL;
    }

  const int TEAM_ARENA_SIZE = 64*1024;  /* Must match libgomp.h.  */
  hsa_status_t status;
  status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
					   TEAM_ARENA_SIZE*num_teams,
					   &item->arena);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not allocate memory for GCN kernel arena", status);
  status = hsa_fns.hsa_memory_assign_agent_fn (item->arena, agent->id,
					       HSA_ACCESS_PERMISSION_RW);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not assign arena memory to device", status);

  return item->arena;
}

/* Mark a team arena available for reuse.  */

static void
release_team_arena (struct agent_info* agent, void *arena)
{
  struct team_arena_list *item;

  for (item = agent->team_arena_list; item; item = item->next)
    {
      if (item->arena == arena)
	{
	  if (pthread_mutex_unlock (&item->in_use))
	    GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
	  return;
	}
    }
  GOMP_PLUGIN_error ("Could not find a GCN arena to release.");
}

/* Clean up all the allocated team arenas.  */

static bool
destroy_team_arenas (struct agent_info *agent)
{
  struct team_arena_list *item, *next;

  for (item = agent->team_arena_list; item; item = next)
    {
      next = item->next;
      hsa_fns.hsa_memory_free_fn (item->arena);
      if (pthread_mutex_destroy (&item->in_use))
	{
	  GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex");
	  return false;
	}
      free (item);
    }
  agent->team_arena_list = NULL;

  return true;
}
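
/* Putting the three functions together (editor's sketch): a kernel launch
   with NUM_TEAMS teams borrows an arena before the dispatch and returns it
   when the kernel has finished; destroy_team_arenas only runs when the
   device is finalized:

     void *arena = get_team_arena (agent, num_teams);
     // ... store the pointer in kernargs->arena_ptr and run the kernel ...
     release_team_arena (agent, arena);
*/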

/* Allocate memory on a specified device.  */

static void *
alloc_by_agent (struct agent_info *agent, size_t size)
{
  GCN_DEBUG ("Allocating %zu bytes on device %d\n", size, agent->device_id);

  /* Zero-size allocations are invalid, so in order to return a valid pointer
     we need to pass a valid size.  One source of zero-size allocations is
     kernargs for kernels that have no inputs or outputs (the kernel may
     only use console output, for example).  */
  if (size == 0)
    size = 4;

  void *ptr;
  hsa_status_t status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
							size, &ptr);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not allocate device memory", status);
      return NULL;
    }

  status = hsa_fns.hsa_memory_assign_agent_fn (ptr, agent->id,
					       HSA_ACCESS_PERMISSION_RW);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not assign data memory to device", status);
      return NULL;
    }

  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  bool profiling_dispatch_p
    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
  if (profiling_dispatch_p)
    {
      acc_prof_info *prof_info = thr->prof_info;
      acc_event_info data_event_info;
      acc_api_info *api_info = thr->api_info;

      prof_info->event_type = acc_ev_alloc;

      data_event_info.data_event.event_type = prof_info->event_type;
      data_event_info.data_event.valid_bytes
	= _ACC_DATA_EVENT_INFO_VALID_BYTES;
      data_event_info.data_event.parent_construct
	= acc_construct_parallel;
      data_event_info.data_event.implicit = 1;
      data_event_info.data_event.tool_info = NULL;
      data_event_info.data_event.var_name = NULL;
      data_event_info.data_event.bytes = size;
      data_event_info.data_event.host_ptr = NULL;
      data_event_info.data_event.device_ptr = (void *) ptr;

      api_info->device_api = acc_device_api_other;

      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
					    api_info);
    }

  return ptr;
}

/* Create kernel dispatch data structure for given KERNEL, along with
   the necessary device signals and memory allocations.  */

static struct kernel_dispatch *
create_kernel_dispatch (struct kernel_info *kernel, int num_teams)
{
  struct agent_info *agent = kernel->agent;
  struct kernel_dispatch *shadow
    = GOMP_PLUGIN_malloc_cleared (sizeof (struct kernel_dispatch));

  shadow->agent = kernel->agent;
  shadow->object = kernel->object;

  hsa_signal_t sync_signal;
  hsa_status_t status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &sync_signal);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Error creating the GCN sync signal", status);

  shadow->signal = sync_signal.handle;
  shadow->private_segment_size = kernel->private_segment_size;
  shadow->group_segment_size = kernel->group_segment_size;

  /* We expect kernels to request a single pointer, explicitly, and the
     rest of struct kernargs, implicitly.  If they request anything else
     then something is wrong.  */
  if (kernel->kernarg_segment_size > 8)
    {
      GOMP_PLUGIN_fatal ("Unexpectedly large kernargs segment requested");
      return NULL;
    }

  status = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region,
					   sizeof (struct kernargs),
					   &shadow->kernarg_address);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Could not allocate memory for GCN kernel arguments", status);
  struct kernargs *kernargs = shadow->kernarg_address;

  /* Zero-initialize the output_data (minimum needed).  */
  kernargs->out_ptr = (int64_t)&kernargs->output_data;
  kernargs->output_data.next_output = 0;
  for (unsigned i = 0;
       i < (sizeof (kernargs->output_data.queue)
	    / sizeof (kernargs->output_data.queue[0]));
       i++)
    kernargs->output_data.queue[i].written = 0;
  kernargs->output_data.consumed = 0;

  /* Pass in the heap location.  */
  kernargs->heap_ptr = (int64_t)kernel->module->heap;

  /* Create an arena.  */
  if (kernel->kind == KIND_OPENMP)
    kernargs->arena_ptr = (int64_t)get_team_arena (agent, num_teams);
  else
    kernargs->arena_ptr = 0;

  /* Ensure we can recognize unset return values.  */
  kernargs->output_data.return_value = 0xcafe0000;

  return shadow;
}
1820
1821 /* Output any data written to console output from the kernel. It is expected
1822 that this function is polled during kernel execution.
1823
1824 We print all entries from the last item printed to the next entry without
1825 a "written" flag. If the "final" flag is set then it'll continue right to
1826 the end.
1827
1828 The print buffer is circular, but the from and to locations don't wrap when
1829 the buffer does, so the output limit is UINT_MAX. The target blocks on
1830 output when the buffer is full. */
1831
1832 static void
1833 console_output (struct kernel_info *kernel, struct kernargs *kernargs,
1834 bool final)
1835 {
1836 unsigned int limit = (sizeof (kernargs->output_data.queue)
1837 / sizeof (kernargs->output_data.queue[0]));
1838
1839 unsigned int from = __atomic_load_n (&kernargs->output_data.consumed,
1840 __ATOMIC_ACQUIRE);
1841 unsigned int to = kernargs->output_data.next_output;
1842
1843 if (from > to)
1844 {
1845 /* Overflow. */
1846 if (final)
1847 printf ("GCN print buffer overflowed.\n");
1848 return;
1849 }
1850
1851 unsigned int i;
1852 for (i = from; i < to; i++)
1853 {
1854 struct printf_data *data = &kernargs->output_data.queue[i%limit];
1855
1856 if (!data->written && !final)
1857 break;
1858
1859 switch (data->type)
1860 {
1861 case 0: printf ("%.128s%ld\n", data->msg, data->ivalue); break;
1862 case 1: printf ("%.128s%f\n", data->msg, data->dvalue); break;
1863 case 2: printf ("%.128s%.128s\n", data->msg, data->text); break;
1864 case 3: printf ("%.128s%.128s", data->msg, data->text); break;
1865 default: printf ("GCN print buffer error!\n"); break;
1866 }
1867 data->written = 0;
1868 __atomic_store_n (&kernargs->output_data.consumed, i+1,
1869 __ATOMIC_RELEASE);
1870 }
1871 fflush (stdout);
1872 }
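/* For reference, the device-side producer presumably mirrors the consumer
   above (a sketch only; the real code lives in the GCN target runtime,
   not in this file):

     // Reserve the next slot in the circular buffer.
     unsigned int slot = __atomic_fetch_add (&output_data->next_output, 1,
                                             __ATOMIC_ACQUIRE);
     struct printf_data *entry = &output_data->queue[slot % limit];

     // Block while the host has not yet consumed this slot.
     while (slot - output_data->consumed >= limit)
       ;

     entry->type = 0;                 // an integer entry, say
     // ... fill in entry->msg and entry->ivalue ...
     __atomic_store_n (&entry->written, 1, __ATOMIC_RELEASE);

   The host loop above prints entries in order and advances "consumed",
   which unblocks a device writer waiting on a full buffer.  */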
1873
1874 /* Release data structure created for a kernel dispatch in SHADOW argument,
1875 and clean up the signal and memory allocations. */
1876
1877 static void
1878 release_kernel_dispatch (struct kernel_dispatch *shadow)
1879 {
1880 GCN_DEBUG ("Released kernel dispatch: %p\n", shadow);
1881
1882 struct kernargs *kernargs = shadow->kernarg_address;
1883 void *arena = (void *)kernargs->arena_ptr;
1884 if (arena)
1885 release_team_arena (shadow->agent, arena);
1886
1887 hsa_fns.hsa_memory_free_fn (shadow->kernarg_address);
1888
1889 hsa_signal_t s;
1890 s.handle = shadow->signal;
1891 hsa_fns.hsa_signal_destroy_fn (s);
1892
1893 free (shadow);
1894 }
1895
1896 /* Extract the properties from a kernel binary. */
1897
1898 static void
1899 init_kernel_properties (struct kernel_info *kernel)
1900 {
1901 hsa_status_t status;
1902 struct agent_info *agent = kernel->agent;
1903 hsa_executable_symbol_t kernel_symbol;
1904 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
1905 kernel->name, agent->id,
1906 0, &kernel_symbol);
1907 if (status != HSA_STATUS_SUCCESS)
1908 {
1909 hsa_warn ("Could not find symbol for kernel in the code object", status);
1910 fprintf (stderr, "not found name: '%s'\n", kernel->name);
1911 dump_executable_symbols (agent->executable);
1912 goto failure;
1913 }
1914 GCN_DEBUG ("Located kernel %s\n", kernel->name);
1915 status = hsa_fns.hsa_executable_symbol_get_info_fn
1916 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel->object);
1917 if (status != HSA_STATUS_SUCCESS)
1918 hsa_fatal ("Could not extract a kernel object from its symbol", status);
1919 status = hsa_fns.hsa_executable_symbol_get_info_fn
1920 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
1921 &kernel->kernarg_segment_size);
1922 if (status != HSA_STATUS_SUCCESS)
1923 hsa_fatal ("Could not get info about kernel argument size", status);
1924 status = hsa_fns.hsa_executable_symbol_get_info_fn
1925 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
1926 &kernel->group_segment_size);
1927 if (status != HSA_STATUS_SUCCESS)
1928 hsa_fatal ("Could not get info about kernel group segment size", status);
1929 status = hsa_fns.hsa_executable_symbol_get_info_fn
1930 (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
1931 &kernel->private_segment_size);
1932 if (status != HSA_STATUS_SUCCESS)
1933 hsa_fatal ("Could not get info about kernel private segment size",
1934 status);
1935
1936 /* The kernel type is not known until something tries to launch it. */
1937 kernel->kind = KIND_UNKNOWN;
1938
1939 GCN_DEBUG ("Kernel structure for %s fully initialized with "
1940 "following segment sizes: \n", kernel->name);
1941 GCN_DEBUG (" group_segment_size: %u\n",
1942 (unsigned) kernel->group_segment_size);
1943 GCN_DEBUG (" private_segment_size: %u\n",
1944 (unsigned) kernel->private_segment_size);
1945 GCN_DEBUG (" kernarg_segment_size: %u\n",
1946 (unsigned) kernel->kernarg_segment_size);
1947 return;
1948
1949 failure:
1950 kernel->initialization_failed = true;
1951 }
1952
1953 /* Do all the work that is necessary before running KERNEL for the first time.
1954 The function assumes the program has been created, finalized and frozen by
1955 create_and_finalize_hsa_program. */
1956
1957 static void
1958 init_kernel (struct kernel_info *kernel)
1959 {
1960 if (pthread_mutex_lock (&kernel->init_mutex))
1961 GOMP_PLUGIN_fatal ("Could not lock a GCN kernel initialization mutex");
1962 if (kernel->initialized)
1963 {
1964 if (pthread_mutex_unlock (&kernel->init_mutex))
1965 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
1966 "mutex");
1967
1968 return;
1969 }
1970
1971 init_kernel_properties (kernel);
1972
1973 if (!kernel->initialization_failed)
1974 {
1975 GCN_DEBUG ("\n");
1976
1977 kernel->initialized = true;
1978 }
1979 if (pthread_mutex_unlock (&kernel->init_mutex))
1980 GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
1981 "mutex");
1982 }
1983
1984 /* Run KERNEL on its agent, pass VARS to it as arguments and take
1985 launch attributes from KLA.
1986
1987 MODULE_LOCKED indicates that the caller already holds the lock and
1988 run_kernel need not lock it again.
1989 If AQ is NULL then agent->sync_queue will be used. */
1990
1991 static void
1992 run_kernel (struct kernel_info *kernel, void *vars,
1993 struct GOMP_kernel_launch_attributes *kla,
1994 struct goacc_asyncqueue *aq, bool module_locked)
1995 {
1996 GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel->agent->device_id,
1997 (aq ? aq->id : 0));
1998 GCN_DEBUG ("GCN launch attribs: gdims:[");
1999 int i;
2000 for (i = 0; i < kla->ndim; ++i)
2001 {
2002 if (i)
2003 DEBUG_PRINT (", ");
2004 DEBUG_PRINT ("%u", kla->gdims[i]);
2005 }
2006 DEBUG_PRINT ("], normalized gdims:[");
2007 for (i = 0; i < kla->ndim; ++i)
2008 {
2009 if (i)
2010 DEBUG_PRINT (", ");
2011 DEBUG_PRINT ("%u", kla->gdims[i] / kla->wdims[i]);
2012 }
2013 DEBUG_PRINT ("], wdims:[");
2014 for (i = 0; i < kla->ndim; ++i)
2015 {
2016 if (i)
2017 DEBUG_PRINT (", ");
2018 DEBUG_PRINT ("%u", kla->wdims[i]);
2019 }
2020 DEBUG_PRINT ("]\n");
2021 DEBUG_FLUSH ();
2022
2023 struct agent_info *agent = kernel->agent;
2024 if (!module_locked && pthread_rwlock_rdlock (&agent->module_rwlock))
2025 GOMP_PLUGIN_fatal ("Unable to read-lock a GCN agent rwlock");
2026
2027 if (!agent->initialized)
2028 GOMP_PLUGIN_fatal ("Agent must be initialized");
2029
2030 if (!kernel->initialized)
2031 GOMP_PLUGIN_fatal ("Called kernel must be initialized");
2032
2033 hsa_queue_t *command_q = (aq ? aq->hsa_queue : kernel->agent->sync_queue);
2034
2035 uint64_t index
2036 = hsa_fns.hsa_queue_add_write_index_release_fn (command_q, 1);
2037 GCN_DEBUG ("Got AQL index %llu\n", (long long int) index);
2038
2039 /* Wait until the queue is not full before writing the packet. */
2040 while (index - hsa_fns.hsa_queue_load_read_index_acquire_fn (command_q)
2041 >= command_q->size)
2042 ;
2043
2044 /* Do not allow the dimensions to be overridden when running
2045 constructors or destructors. */
2046 int override_x = kernel->kind == KIND_UNKNOWN ? 0 : override_x_dim;
2047 int override_z = kernel->kind == KIND_UNKNOWN ? 0 : override_z_dim;
2048
2049 hsa_kernel_dispatch_packet_t *packet;
2050 packet = ((hsa_kernel_dispatch_packet_t *) command_q->base_address)
2051 + index % command_q->size;
2052
2053 memset (((uint8_t *) packet) + 4, 0, sizeof (*packet) - 4);
2054 packet->grid_size_x = override_x ? : kla->gdims[0];
2055 packet->workgroup_size_x = get_group_size (kla->ndim,
2056 packet->grid_size_x,
2057 kla->wdims[0]);
2058
2059 if (kla->ndim >= 2)
2060 {
2061 packet->grid_size_y = kla->gdims[1];
2062 packet->workgroup_size_y = get_group_size (kla->ndim, kla->gdims[1],
2063 kla->wdims[1]);
2064 }
2065 else
2066 {
2067 packet->grid_size_y = 1;
2068 packet->workgroup_size_y = 1;
2069 }
2070
2071 if (kla->ndim == 3)
2072 {
2073 packet->grid_size_z = limit_worker_threads (override_z
2074 ? : kla->gdims[2]);
2075 packet->workgroup_size_z = get_group_size (kla->ndim,
2076 packet->grid_size_z,
2077 kla->wdims[2]);
2078 }
2079 else
2080 {
2081 packet->grid_size_z = 1;
2082 packet->workgroup_size_z = 1;
2083 }
2084
2085 GCN_DEBUG ("GCN launch actuals: grid:[%u, %u, %u],"
2086 " normalized grid:[%u, %u, %u], workgroup:[%u, %u, %u]\n",
2087 packet->grid_size_x, packet->grid_size_y, packet->grid_size_z,
2088 packet->grid_size_x / packet->workgroup_size_x,
2089 packet->grid_size_y / packet->workgroup_size_y,
2090 packet->grid_size_z / packet->workgroup_size_z,
2091 packet->workgroup_size_x, packet->workgroup_size_y,
2092 packet->workgroup_size_z);
2093
2094 struct kernel_dispatch *shadow
2095 = create_kernel_dispatch (kernel, packet->grid_size_x);
2096 shadow->queue = command_q;
2097
2098 if (debug)
2099 {
2100 fprintf (stderr, "\nKernel has following dependencies:\n");
2101 print_kernel_dispatch (shadow, 2);
2102 }
2103
2104 packet->private_segment_size = kernel->private_segment_size;
2105 packet->group_segment_size = kernel->group_segment_size;
2106 packet->kernel_object = kernel->object;
2107 packet->kernarg_address = shadow->kernarg_address;
2108 hsa_signal_t s;
2109 s.handle = shadow->signal;
2110 packet->completion_signal = s;
2111 hsa_fns.hsa_signal_store_relaxed_fn (s, 1);
2112 memcpy (shadow->kernarg_address, &vars, sizeof (vars));
2113
2114 GCN_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");
2115
2116 uint16_t header;
2117 header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
2118 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
2119 header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
2120
2121 GCN_DEBUG ("Going to dispatch kernel %s on device %d\n", kernel->name,
2122 agent->device_id);
2123
2124 packet_store_release ((uint32_t *) packet, header,
2125 (uint16_t) kla->ndim
2126 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);
2127
2128 hsa_fns.hsa_signal_store_release_fn (command_q->doorbell_signal,
2129 index);
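  /* packet_store_release, defined earlier in this file, presumably
     publishes the header and setup words as a single atomic 32-bit
     release store, along the lines of:

       __atomic_store_n (packet, header | (rest << 16), __ATOMIC_RELEASE);

     so the packet processor can never observe a half-written header.
     Ringing the doorbell then tells the device that the packet at INDEX
     is ready to execute.  */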
2130
2131 GCN_DEBUG ("Kernel dispatched, waiting for completion\n");
2132
2133 /* Root signal waits with 1ms timeout. */
2134 while (hsa_fns.hsa_signal_wait_acquire_fn (s, HSA_SIGNAL_CONDITION_LT, 1,
2135 1000 * 1000,
2136 HSA_WAIT_STATE_BLOCKED) != 0)
2137 {
2138 console_output (kernel, shadow->kernarg_address, false);
2139 }
2140 console_output (kernel, shadow->kernarg_address, true);
2141
2142 struct kernargs *kernargs = shadow->kernarg_address;
2143 unsigned int return_value = (unsigned int)kernargs->output_data.return_value;
2144
2145 release_kernel_dispatch (shadow);
2146
2147 if (!module_locked && pthread_rwlock_unlock (&agent->module_rwlock))
2148 GOMP_PLUGIN_fatal ("Unable to unlock a GCN agent rwlock");
2149
2150 unsigned int upper = (return_value & ~0xffff) >> 16;
2151   if (upper == 0xcafe)
2152     ; /* Exit not called; normal termination.  */
2153   else if (upper == 0xffff)
2154     ; /* Exit called.  */
2155 else
2156 {
2157 GOMP_PLUGIN_error ("Possible kernel exit value corruption, 2 most"
2158 " significant bytes aren't 0xffff or 0xcafe: 0x%x\n",
2159 return_value);
2160 abort ();
2161 }
2162
2163 if (upper == 0xffff)
2164 {
2165 unsigned int signal = (return_value >> 8) & 0xff;
2166
2167 if (signal == SIGABRT)
2168 {
2169 GCN_WARNING ("GCN Kernel aborted\n");
2170 abort ();
2171 }
2172 else if (signal != 0)
2173 {
2174 GCN_WARNING ("GCN Kernel received unknown signal\n");
2175 abort ();
2176 }
2177
2178 GCN_DEBUG ("GCN Kernel exited with value: %d\n", return_value & 0xff);
2179 exit (return_value & 0xff);
2180 }
2181 }
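/* A worked example of the return-value encoding decoded above (hedged;
   the encoding itself is produced by the device-side runtime): a kernel
   that calls exit (3) presumably stores 0xffff0003, giving upper == 0xffff,
   signal == 0, and a host exit status of 3; a device-side abort () would
   store 0xffff0600 (SIGABRT == 6 on Linux), which the code above turns
   into a host abort ().  A kernel that never calls exit leaves the
   sentinel 0xcafe0000 written by create_kernel_dispatch untouched.  */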
2182
2183 /* }}} */
2184 /* {{{ Load/Unload */
2185
2186 /* Initialize KERNEL from D and other parameters. Return true on success. */
2187
2188 static bool
2189 init_basic_kernel_info (struct kernel_info *kernel,
2190 struct hsa_kernel_description *d,
2191 struct agent_info *agent,
2192 struct module_info *module)
2193 {
2194 kernel->agent = agent;
2195 kernel->module = module;
2196 kernel->name = d->name;
2197 if (pthread_mutex_init (&kernel->init_mutex, NULL))
2198 {
2199 GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");
2200 return false;
2201 }
2202 return true;
2203 }
2204
2205 /* Find the load_offset for MODULE, save to *LOAD_OFFSET, and return true. If
2206 not found, return false. */
2207
2208 static bool
2209 find_load_offset (Elf64_Addr *load_offset, struct agent_info *agent,
2210 struct module_info *module, Elf64_Ehdr *image,
2211 Elf64_Shdr *sections)
2212 {
2213 bool res = false;
2214
2215 hsa_status_t status;
2216
2217 hsa_executable_symbol_t symbol;
2218 if (!find_executable_symbol (agent->executable, &symbol))
2219 return false;
2220
2221 status = hsa_fns.hsa_executable_symbol_get_info_fn
2222 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, load_offset);
2223 if (status != HSA_STATUS_SUCCESS)
2224 {
2225 hsa_error ("Could not extract symbol address", status);
2226 return false;
2227 }
2228
2229 char *symbol_name = get_executable_symbol_name (symbol);
2230 if (symbol_name == NULL)
2231 return false;
2232
2233 /* Find the kernel function in ELF, and calculate actual load offset. */
2234 for (int i = 0; i < image->e_shnum; i++)
2235 if (sections[i].sh_type == SHT_SYMTAB)
2236 {
2237 Elf64_Shdr *strtab = &sections[sections[i].sh_link];
2238 char *strings = (char *)image + strtab->sh_offset;
2239
2240 for (size_t offset = 0;
2241 offset < sections[i].sh_size;
2242 offset += sections[i].sh_entsize)
2243 {
2244 Elf64_Sym *sym = (Elf64_Sym*)((char*)image
2245 + sections[i].sh_offset
2246 + offset);
2247 if (strcmp (symbol_name, strings + sym->st_name) == 0)
2248 {
2249 *load_offset -= sym->st_value;
2250 res = true;
2251 break;
2252 }
2253 }
2254 }
2255
2256 free (symbol_name);
2257 return res;
2258 }
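/* Example: if the runtime placed the chosen symbol at device address
   0x7f0000001000 and its ELF st_value is 0x1000, then *LOAD_OFFSET ends
   up as 0x7f0000000000; adding that offset to any link-time address in
   the image yields the corresponding device address.  */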
2259
2260 /* Create and finalize the program consisting of all loaded modules. */
2261
2262 static bool
2263 create_and_finalize_hsa_program (struct agent_info *agent)
2264 {
2265 hsa_status_t status;
2266 int reloc_count = 0;
2267 bool res = true;
2268 if (pthread_mutex_lock (&agent->prog_mutex))
2269 {
2270 GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
2271 return false;
2272 }
2273 if (agent->prog_finalized)
2274 goto final;
2275
2276 status
2277 = hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
2278 HSA_EXECUTABLE_STATE_UNFROZEN,
2279 "", &agent->executable);
2280 if (status != HSA_STATUS_SUCCESS)
2281 {
2282 hsa_error ("Could not create GCN executable", status);
2283 goto fail;
2284 }
2285
2286 /* Load any GCN modules. */
2287 struct module_info *module = agent->module;
2288 if (module)
2289 {
2290 Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;
2291
2292       /* Hide relocations from the HSA runtime loader by giving the sections
2293 	 a harmless type; the fix-up loop below matches SHT_RELA | 0x80.  */
2294 Elf64_Shdr *image_sections = (Elf64_Shdr *)((char *)image
2295 + image->e_shoff);
2296 for (int i = image->e_shnum - 1; i >= 0; i--)
2297 {
2298 if (image_sections[i].sh_type == SHT_RELA
2299 || image_sections[i].sh_type == SHT_REL)
2300 /* Change section type to something harmless. */
2301 image_sections[i].sh_type |= 0x80;
2302 }
2303
2304 hsa_code_object_t co = { 0 };
2305 status = hsa_fns.hsa_code_object_deserialize_fn
2306 (module->image_desc->gcn_image->image,
2307 module->image_desc->gcn_image->size,
2308 NULL, &co);
2309 if (status != HSA_STATUS_SUCCESS)
2310 {
2311 hsa_error ("Could not deserialize GCN code object", status);
2312 goto fail;
2313 }
2314
2315 status = hsa_fns.hsa_executable_load_code_object_fn
2316 (agent->executable, agent->id, co, "");
2317 if (status != HSA_STATUS_SUCCESS)
2318 {
2319 hsa_error ("Could not load GCN code object", status);
2320 goto fail;
2321 }
2322
2323 if (!module->heap)
2324 {
2325 status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
2326 gcn_kernel_heap_size,
2327 (void**)&module->heap);
2328 if (status != HSA_STATUS_SUCCESS)
2329 {
2330 hsa_error ("Could not allocate memory for GCN heap", status);
2331 goto fail;
2332 }
2333
2334 status = hsa_fns.hsa_memory_assign_agent_fn
2335 (module->heap, agent->id, HSA_ACCESS_PERMISSION_RW);
2336 if (status != HSA_STATUS_SUCCESS)
2337 {
2338 hsa_error ("Could not assign GCN heap memory to device", status);
2339 goto fail;
2340 }
2341
2342 hsa_fns.hsa_memory_copy_fn (&module->heap->size,
2343 &gcn_kernel_heap_size,
2344 sizeof (gcn_kernel_heap_size));
2345 }
2346
2347 }
2348
2349 if (debug)
2350 dump_executable_symbols (agent->executable);
2351
2352 status = hsa_fns.hsa_executable_freeze_fn (agent->executable, "");
2353 if (status != HSA_STATUS_SUCCESS)
2354 {
2355 hsa_error ("Could not freeze the GCN executable", status);
2356 goto fail;
2357 }
2358
2359 if (agent->module)
2360 {
2361 struct module_info *module = agent->module;
2362 Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;
2363 Elf64_Shdr *sections = (Elf64_Shdr *)((char *)image + image->e_shoff);
2364
2365 Elf64_Addr load_offset;
2366 if (!find_load_offset (&load_offset, agent, module, image, sections))
2367 goto fail;
2368
2369 /* Record the physical load address range.
2370 We need this for data copies later. */
2371 Elf64_Phdr *segments = (Elf64_Phdr *)((char*)image + image->e_phoff);
2372 Elf64_Addr low = ~0, high = 0;
2373 for (int i = 0; i < image->e_phnum; i++)
2374 if (segments[i].p_memsz > 0)
2375 {
2376 if (segments[i].p_paddr < low)
2377 low = segments[i].p_paddr;
2378 if (segments[i].p_paddr > high)
2379 high = segments[i].p_paddr + segments[i].p_memsz - 1;
2380 }
2381 module->phys_address_start = low + load_offset;
2382 module->phys_address_end = high + load_offset;
2383
2384       /* Find the dynamic symbol table.  */
2385 Elf64_Shdr *dynsym = NULL;
2386 for (int i = 0; i < image->e_shnum; i++)
2387 if (sections[i].sh_type == SHT_DYNSYM)
2388 {
2389 dynsym = &sections[i];
2390 break;
2391 }
2392
2393 /* Fix up relocations. */
2394 for (int i = 0; i < image->e_shnum; i++)
2395 {
2396 if (sections[i].sh_type == (SHT_RELA | 0x80))
2397 for (size_t offset = 0;
2398 offset < sections[i].sh_size;
2399 offset += sections[i].sh_entsize)
2400 {
2401 Elf64_Rela *reloc = (Elf64_Rela*)((char*)image
2402 + sections[i].sh_offset
2403 + offset);
2404 Elf64_Sym *sym =
2405 (dynsym
2406 ? (Elf64_Sym*)((char*)image
2407 + dynsym->sh_offset
2408 + (dynsym->sh_entsize
2409 * ELF64_R_SYM (reloc->r_info)))
2410 : NULL);
2411
2412 int64_t S = (sym ? sym->st_value : 0);
2413 int64_t P = reloc->r_offset + load_offset;
2414 int64_t A = reloc->r_addend;
2415 int64_t B = load_offset;
2416 int64_t V, size;
2417 switch (ELF64_R_TYPE (reloc->r_info))
2418 {
2419 case R_AMDGPU_ABS32_LO:
2420 V = (S + A) & 0xFFFFFFFF;
2421 size = 4;
2422 break;
2423 case R_AMDGPU_ABS32_HI:
2424 V = (S + A) >> 32;
2425 size = 4;
2426 break;
2427 case R_AMDGPU_ABS64:
2428 V = S + A;
2429 size = 8;
2430 break;
2431 case R_AMDGPU_REL32:
2432 V = S + A - P;
2433 size = 4;
2434 break;
2435 case R_AMDGPU_REL64:
2436 /* FIXME
2437 		     LLD seems to emit REL64 where the assembler has
2438 		     ABS64.  This is clearly wrong because it's not what the
2439 		     compiler is expecting.  Let's assume, for now, that
2440 		     it's a bug.  In any case, GCN kernels are always
2441 		     self-contained and therefore relative relocations will have
2442 been resolved already, so this should be a safe
2443 workaround. */
2444 V = S + A/* - P*/;
2445 size = 8;
2446 break;
2447 case R_AMDGPU_ABS32:
2448 V = S + A;
2449 size = 4;
2450 break;
2451 /* TODO R_AMDGPU_GOTPCREL */
2452 /* TODO R_AMDGPU_GOTPCREL32_LO */
2453 /* TODO R_AMDGPU_GOTPCREL32_HI */
2454 case R_AMDGPU_REL32_LO:
2455 V = (S + A - P) & 0xFFFFFFFF;
2456 size = 4;
2457 break;
2458 case R_AMDGPU_REL32_HI:
2459 V = (S + A - P) >> 32;
2460 size = 4;
2461 break;
2462 case R_AMDGPU_RELATIVE64:
2463 V = B + A;
2464 size = 8;
2465 break;
2466 default:
2467 fprintf (stderr, "Error: unsupported relocation type.\n");
2468 exit (1);
2469 }
2470 status = hsa_fns.hsa_memory_copy_fn ((void*)P, &V, size);
2471 if (status != HSA_STATUS_SUCCESS)
2472 {
2473 hsa_error ("Failed to fix up relocation", status);
2474 goto fail;
2475 }
2476 reloc_count++;
2477 }
2478 }
2479 }
2480
2481 GCN_DEBUG ("Loaded GCN kernels to device %d (%d relocations)\n",
2482 agent->device_id, reloc_count);
2483
2484 final:
2485 agent->prog_finalized = true;
2486
2487 if (pthread_mutex_unlock (&agent->prog_mutex))
2488 {
2489 GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
2490 res = false;
2491 }
2492
2493 return res;
2494
2495 fail:
2496 res = false;
2497 goto final;
2498 }
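/* A worked example for the fix-up loop above: an R_AMDGPU_ABS64 entry
   with S = 0x2000 (sym->st_value) and A = 8 (r_addend) patches the
   8 bytes at P = r_offset + load_offset with V = S + A = 0x2008, via
   hsa_memory_copy.  */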
2499
2500 /* Free the HSA program in agent and everything associated with it and set
2501 agent->prog_finalized and the initialized flags of all kernels to false.
2502 Return TRUE on success. */
2503
2504 static bool
2505 destroy_hsa_program (struct agent_info *agent)
2506 {
2507 if (!agent->prog_finalized)
2508 return true;
2509
2510 hsa_status_t status;
2511
2512 GCN_DEBUG ("Destroying the current GCN program.\n");
2513
2514 status = hsa_fns.hsa_executable_destroy_fn (agent->executable);
2515 if (status != HSA_STATUS_SUCCESS)
2516 return hsa_error ("Could not destroy GCN executable", status);
2517
2518 if (agent->module)
2519 {
2520 int i;
2521 for (i = 0; i < agent->module->kernel_count; i++)
2522 agent->module->kernels[i].initialized = false;
2523
2524 if (agent->module->heap)
2525 {
2526 hsa_fns.hsa_memory_free_fn (agent->module->heap);
2527 agent->module->heap = NULL;
2528 }
2529 }
2530 agent->prog_finalized = false;
2531 return true;
2532 }
2533
2534 /* Deinitialize all information associated with MODULE and kernels within
2535 it. Return TRUE on success. */
2536
2537 static bool
2538 destroy_module (struct module_info *module, bool locked)
2539 {
2540 /* Run destructors before destroying module. */
2541 struct GOMP_kernel_launch_attributes kla =
2542 { 3,
2543 /* Grid size. */
2544 { 1, 64, 1 },
2545 /* Work-group size. */
2546 { 1, 64, 1 }
2547 };
2548
2549 if (module->fini_array_func)
2550 {
2551 init_kernel (module->fini_array_func);
2552 run_kernel (module->fini_array_func, NULL, &kla, NULL, locked);
2553 }
2554 module->constructors_run_p = false;
2555
2556 int i;
2557 for (i = 0; i < module->kernel_count; i++)
2558 if (pthread_mutex_destroy (&module->kernels[i].init_mutex))
2559 {
2560 GOMP_PLUGIN_error ("Failed to destroy a GCN kernel initialization "
2561 "mutex");
2562 return false;
2563 }
2564
2565 return true;
2566 }
2567
2568 /* }}} */
2569 /* {{{ Async */
2570
2571 /* Execute a single entry, INDEX, from the async queue AQ.  */
2572
2573 static void
2574 execute_queue_entry (struct goacc_asyncqueue *aq, int index)
2575 {
2576 struct queue_entry *entry = &aq->queue[index];
2577
2578 switch (entry->type)
2579 {
2580 case KERNEL_LAUNCH:
2581 if (DEBUG_QUEUES)
2582 GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d)\n",
2583 aq->agent->device_id, aq->id, index);
2584 run_kernel (entry->u.launch.kernel,
2585 entry->u.launch.vars,
2586 &entry->u.launch.kla, aq, false);
2587 if (DEBUG_QUEUES)
2588 GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d) done\n",
2589 aq->agent->device_id, aq->id, index);
2590 break;
2591
2592 case CALLBACK:
2593 if (DEBUG_QUEUES)
2594 GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d)\n",
2595 aq->agent->device_id, aq->id, index);
2596 entry->u.callback.fn (entry->u.callback.data);
2597 if (DEBUG_QUEUES)
2598 GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d) done\n",
2599 aq->agent->device_id, aq->id, index);
2600 break;
2601
2602 case ASYNC_WAIT:
2603 {
2604 /* FIXME: is it safe to access a placeholder that may already have
2605 been executed? */
2606 struct placeholder *placeholderp = entry->u.asyncwait.placeholderp;
2607
2608 if (DEBUG_QUEUES)
2609 GCN_DEBUG ("Async thread %d:%d: Executing async wait entry (%d)\n",
2610 aq->agent->device_id, aq->id, index);
2611
2612 pthread_mutex_lock (&placeholderp->mutex);
2613
2614 while (!placeholderp->executed)
2615 pthread_cond_wait (&placeholderp->cond, &placeholderp->mutex);
2616
2617 pthread_mutex_unlock (&placeholderp->mutex);
2618
2619 if (pthread_cond_destroy (&placeholderp->cond))
2620 GOMP_PLUGIN_error ("Failed to destroy serialization cond");
2621
2622 if (pthread_mutex_destroy (&placeholderp->mutex))
2623 GOMP_PLUGIN_error ("Failed to destroy serialization mutex");
2624
2625 if (DEBUG_QUEUES)
2626 GCN_DEBUG ("Async thread %d:%d: Executing async wait "
2627 "entry (%d) done\n", aq->agent->device_id, aq->id, index);
2628 }
2629 break;
2630
2631 case ASYNC_PLACEHOLDER:
2632 pthread_mutex_lock (&entry->u.placeholder.mutex);
2633 entry->u.placeholder.executed = 1;
2634 pthread_cond_signal (&entry->u.placeholder.cond);
2635 pthread_mutex_unlock (&entry->u.placeholder.mutex);
2636 break;
2637
2638 default:
2639 GOMP_PLUGIN_fatal ("Unknown queue element");
2640 }
2641 }
2642
2643 /* This function is run as a thread to service an async queue in the
2644 background. It runs continuously until the stop flag is set. */
2645
2646 static void *
2647 drain_queue (void *thread_arg)
2648 {
2649 struct goacc_asyncqueue *aq = thread_arg;
2650
2651 if (DRAIN_QUEUE_SYNCHRONOUS_P)
2652 {
2653 aq->drain_queue_stop = 2;
2654 return NULL;
2655 }
2656
2657 pthread_mutex_lock (&aq->mutex);
2658
2659 while (true)
2660 {
2661 if (aq->drain_queue_stop)
2662 break;
2663
2664 if (aq->queue_n > 0)
2665 {
2666 pthread_mutex_unlock (&aq->mutex);
2667 execute_queue_entry (aq, aq->queue_first);
2668
2669 pthread_mutex_lock (&aq->mutex);
2670 aq->queue_first = ((aq->queue_first + 1)
2671 % ASYNC_QUEUE_SIZE);
2672 aq->queue_n--;
2673
2674 if (DEBUG_THREAD_SIGNAL)
2675 GCN_DEBUG ("Async thread %d:%d: broadcasting queue out update\n",
2676 aq->agent->device_id, aq->id);
2677 pthread_cond_broadcast (&aq->queue_cond_out);
2678 pthread_mutex_unlock (&aq->mutex);
2679
2680 if (DEBUG_QUEUES)
2681 GCN_DEBUG ("Async thread %d:%d: continue\n", aq->agent->device_id,
2682 aq->id);
2683 pthread_mutex_lock (&aq->mutex);
2684 }
2685 else
2686 {
2687 if (DEBUG_THREAD_SLEEP)
2688 GCN_DEBUG ("Async thread %d:%d: going to sleep\n",
2689 aq->agent->device_id, aq->id);
2690 pthread_cond_wait (&aq->queue_cond_in, &aq->mutex);
2691 if (DEBUG_THREAD_SLEEP)
2692 GCN_DEBUG ("Async thread %d:%d: woke up, rechecking\n",
2693 aq->agent->device_id, aq->id);
2694 }
2695 }
2696
2697 aq->drain_queue_stop = 2;
2698 if (DEBUG_THREAD_SIGNAL)
2699 GCN_DEBUG ("Async thread %d:%d: broadcasting last queue out update\n",
2700 aq->agent->device_id, aq->id);
2701 pthread_cond_broadcast (&aq->queue_cond_out);
2702 pthread_mutex_unlock (&aq->mutex);
2703
2704 GCN_DEBUG ("Async thread %d:%d: returning\n", aq->agent->device_id, aq->id);
2705 return NULL;
2706 }
2707
2708 /* This function is used only when DRAIN_QUEUE_SYNCHRONOUS_P is set, which
2709 is not usually the case. This is just a debug tool. */
2710
2711 static void
2712 drain_queue_synchronous (struct goacc_asyncqueue *aq)
2713 {
2714 pthread_mutex_lock (&aq->mutex);
2715
2716 while (aq->queue_n > 0)
2717 {
2718 execute_queue_entry (aq, aq->queue_first);
2719
2720 aq->queue_first = ((aq->queue_first + 1)
2721 % ASYNC_QUEUE_SIZE);
2722 aq->queue_n--;
2723 }
2724
2725 pthread_mutex_unlock (&aq->mutex);
2726 }
2727
2728 /* Block the current thread until an async queue is writable. The aq->mutex
2729 lock should be held on entry, and remains locked on exit. */
2730
2731 static void
2732 wait_for_queue_nonfull (struct goacc_asyncqueue *aq)
2733 {
2734 if (aq->queue_n == ASYNC_QUEUE_SIZE)
2735 {
2736 /* Queue is full. Wait for it to not be full. */
2737 while (aq->queue_n == ASYNC_QUEUE_SIZE)
2738 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
2739 }
2740 }
2741
2742 /* Request an asynchronous kernel launch on the specified queue. This
2743 may block if the queue is full, but returns without waiting for the
2744 kernel to run. */
2745
2746 static void
2747 queue_push_launch (struct goacc_asyncqueue *aq, struct kernel_info *kernel,
2748 void *vars, struct GOMP_kernel_launch_attributes *kla)
2749 {
2750 assert (aq->agent == kernel->agent);
2751
2752 pthread_mutex_lock (&aq->mutex);
2753
2754 wait_for_queue_nonfull (aq);
2755
2756 int queue_last = ((aq->queue_first + aq->queue_n)
2757 % ASYNC_QUEUE_SIZE);
2758 if (DEBUG_QUEUES)
2759 GCN_DEBUG ("queue_push_launch %d:%d: at %i\n", aq->agent->device_id,
2760 aq->id, queue_last);
2761
2762 aq->queue[queue_last].type = KERNEL_LAUNCH;
2763 aq->queue[queue_last].u.launch.kernel = kernel;
2764 aq->queue[queue_last].u.launch.vars = vars;
2765 aq->queue[queue_last].u.launch.kla = *kla;
2766
2767 aq->queue_n++;
2768
2769 if (DEBUG_THREAD_SIGNAL)
2770 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2771 aq->agent->device_id, aq->id);
2772 pthread_cond_signal (&aq->queue_cond_in);
2773
2774 pthread_mutex_unlock (&aq->mutex);
2775 }
2776
2777 /* Request an asynchronous callback on the specified queue. The callback
2778 function will be called, with the given opaque data, from the appropriate
2779 async thread, when all previous items on that queue are complete. */
2780
2781 static void
2782 queue_push_callback (struct goacc_asyncqueue *aq, void (*fn)(void *),
2783 void *data)
2784 {
2785 pthread_mutex_lock (&aq->mutex);
2786
2787 wait_for_queue_nonfull (aq);
2788
2789 int queue_last = ((aq->queue_first + aq->queue_n)
2790 % ASYNC_QUEUE_SIZE);
2791 if (DEBUG_QUEUES)
2792 GCN_DEBUG ("queue_push_callback %d:%d: at %i\n", aq->agent->device_id,
2793 aq->id, queue_last);
2794
2795 aq->queue[queue_last].type = CALLBACK;
2796 aq->queue[queue_last].u.callback.fn = fn;
2797 aq->queue[queue_last].u.callback.data = data;
2798
2799 aq->queue_n++;
2800
2801 if (DEBUG_THREAD_SIGNAL)
2802 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2803 aq->agent->device_id, aq->id);
2804 pthread_cond_signal (&aq->queue_cond_in);
2805
2806 pthread_mutex_unlock (&aq->mutex);
2807 }
2808
2809 /* Request that a given async thread wait for another thread (unspecified) to
2810 reach the given placeholder. The wait will occur when all previous entries
2811 on the queue are complete. A placeholder is effectively a kind of signal
2812 which simply sets a flag when encountered in a queue. */
2813
2814 static void
2815 queue_push_asyncwait (struct goacc_asyncqueue *aq,
2816 struct placeholder *placeholderp)
2817 {
2818 pthread_mutex_lock (&aq->mutex);
2819
2820 wait_for_queue_nonfull (aq);
2821
2822 int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2823 if (DEBUG_QUEUES)
2824 GCN_DEBUG ("queue_push_asyncwait %d:%d: at %i\n", aq->agent->device_id,
2825 aq->id, queue_last);
2826
2827 aq->queue[queue_last].type = ASYNC_WAIT;
2828 aq->queue[queue_last].u.asyncwait.placeholderp = placeholderp;
2829
2830 aq->queue_n++;
2831
2832 if (DEBUG_THREAD_SIGNAL)
2833 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2834 aq->agent->device_id, aq->id);
2835 pthread_cond_signal (&aq->queue_cond_in);
2836
2837 pthread_mutex_unlock (&aq->mutex);
2838 }
2839
2840 /* Add a placeholder into an async queue. When the async thread reaches the
2841 placeholder it will set the "executed" flag to true and continue.
2842 Another thread may be waiting on this thread reaching the placeholder. */
2843
2844 static struct placeholder *
2845 queue_push_placeholder (struct goacc_asyncqueue *aq)
2846 {
2847 struct placeholder *placeholderp;
2848
2849 pthread_mutex_lock (&aq->mutex);
2850
2851 wait_for_queue_nonfull (aq);
2852
2853 int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
2854 if (DEBUG_QUEUES)
2855 GCN_DEBUG ("queue_push_placeholder %d:%d: at %i\n", aq->agent->device_id,
2856 aq->id, queue_last);
2857
2858 aq->queue[queue_last].type = ASYNC_PLACEHOLDER;
2859 placeholderp = &aq->queue[queue_last].u.placeholder;
2860
2861 if (pthread_mutex_init (&placeholderp->mutex, NULL))
2862 {
2863 pthread_mutex_unlock (&aq->mutex);
2864 GOMP_PLUGIN_error ("Failed to initialize serialization mutex");
2865 }
2866
2867 if (pthread_cond_init (&placeholderp->cond, NULL))
2868 {
2869 pthread_mutex_unlock (&aq->mutex);
2870 GOMP_PLUGIN_error ("Failed to initialize serialization cond");
2871 }
2872
2873 placeholderp->executed = 0;
2874
2875 aq->queue_n++;
2876
2877 if (DEBUG_THREAD_SIGNAL)
2878 GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
2879 aq->agent->device_id, aq->id);
2880 pthread_cond_signal (&aq->queue_cond_in);
2881
2882 pthread_mutex_unlock (&aq->mutex);
2883
2884 return placeholderp;
2885 }
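/* These two primitives combine to serialize one queue behind another,
   which is presumably how the OpenACC async-serialize entry point uses
   them (a sketch with hypothetical queue variables aq1 and aq2):

     // Make aq2 wait until aq1 has drained everything queued so far.
     struct placeholder *p = queue_push_placeholder (aq1);
     queue_push_asyncwait (aq2, p);

   aq1's drain thread sets p->executed when it reaches the placeholder;
   until then aq2's drain thread blocks inside execute_queue_entry.  */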
2886
2887 /* Signal an asynchronous thread to terminate, and wait for it to do so. */
2888
2889 static void
2890 finalize_async_thread (struct goacc_asyncqueue *aq)
2891 {
2892 pthread_mutex_lock (&aq->mutex);
2893 if (aq->drain_queue_stop == 2)
2894 {
2895 pthread_mutex_unlock (&aq->mutex);
2896 return;
2897 }
2898
2899 aq->drain_queue_stop = 1;
2900
2901 if (DEBUG_THREAD_SIGNAL)
2902 GCN_DEBUG ("Signalling async thread %d:%d: cond_in\n",
2903 aq->agent->device_id, aq->id);
2904 pthread_cond_signal (&aq->queue_cond_in);
2905
2906 while (aq->drain_queue_stop != 2)
2907 {
2908 if (DEBUG_THREAD_SLEEP)
2909 GCN_DEBUG ("Waiting for async thread %d:%d to finish, putting thread"
2910 " to sleep\n", aq->agent->device_id, aq->id);
2911 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
2912 if (DEBUG_THREAD_SLEEP)
2913 GCN_DEBUG ("Waiting, woke up thread %d:%d. Rechecking\n",
2914 aq->agent->device_id, aq->id);
2915 }
2916
2917 GCN_DEBUG ("Done waiting for async thread %d:%d\n", aq->agent->device_id,
2918 aq->id);
2919 pthread_mutex_unlock (&aq->mutex);
2920
2921 int err = pthread_join (aq->thread_drain_queue, NULL);
2922 if (err != 0)
2923 GOMP_PLUGIN_fatal ("Join async thread %d:%d: failed: %s",
2924 aq->agent->device_id, aq->id, strerror (err));
2925 GCN_DEBUG ("Joined with async thread %d:%d\n", aq->agent->device_id, aq->id);
2926 }
2927
2928 /* Set up an async queue for OpenMP. There will be only one. The
2929 implementation simply uses an OpenACC async queue.
2930 FIXME: is this thread-safe if two threads call this function? */
2931
2932 static void
2933 maybe_init_omp_async (struct agent_info *agent)
2934 {
2935 if (!agent->omp_async_queue)
2936 agent->omp_async_queue
2937 = GOMP_OFFLOAD_openacc_async_construct (agent->device_id);
2938 }
2939
2940 /* A wrapper that works around an issue in the HSA runtime with host-to-device
2941 copies from read-only pages. */
2942
2943 static void
2944 hsa_memory_copy_wrapper (void *dst, const void *src, size_t len)
2945 {
2946 hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, len);
2947
2948 if (status == HSA_STATUS_SUCCESS)
2949 return;
2950
2951 /* It appears that the copy fails if the source data is in a read-only page.
2952 We can't detect that easily, so try copying the data to a temporary buffer
2953 and doing the copy again if we got an error above. */
2954
2955 GCN_WARNING ("Read-only data transfer bug workaround triggered for "
2956 "[%p:+%d]\n", (void *) src, (int) len);
2957
2958 void *src_copy = malloc (len);
2959 memcpy (src_copy, src, len);
2960 status = hsa_fns.hsa_memory_copy_fn (dst, (const void *) src_copy, len);
2961 free (src_copy);
2962 if (status != HSA_STATUS_SUCCESS)
2963 GOMP_PLUGIN_error ("memory copy failed");
2964 }
2965
2966 /* Copy data to or from a device. This is intended for use as an async
2967 callback event. */
2968
2969 static void
2970 copy_data (void *data_)
2971 {
2972 struct copy_data *data = (struct copy_data *)data_;
2973 GCN_DEBUG ("Async thread %d:%d: Copying %zu bytes from (%p) to (%p)\n",
2974 data->aq->agent->device_id, data->aq->id, data->len, data->src,
2975 data->dst);
2976 hsa_memory_copy_wrapper (data->dst, data->src, data->len);
2977 if (data->free_src)
2978 free ((void *) data->src);
2979 free (data);
2980 }
2981
2982 /* Free device data. This is intended for use as an async callback event. */
2983
2984 static void
2985 gomp_offload_free (void *ptr)
2986 {
2987 GCN_DEBUG ("Async thread ?:?: Freeing %p\n", ptr);
2988 GOMP_OFFLOAD_free (0, ptr);
2989 }
2990
2991 /* Request an asynchronous data copy, to or from a device, on a given queue.
2992 The event will be registered as a callback. If FREE_SRC is true
2993 then the source data will be freed following the copy. */
2994
2995 static void
2996 queue_push_copy (struct goacc_asyncqueue *aq, void *dst, const void *src,
2997 size_t len, bool free_src)
2998 {
2999 if (DEBUG_QUEUES)
3000 GCN_DEBUG ("queue_push_copy %d:%d: %zu bytes from (%p) to (%p)\n",
3001 aq->agent->device_id, aq->id, len, src, dst);
3002 struct copy_data *data
3003 = (struct copy_data *)GOMP_PLUGIN_malloc (sizeof (struct copy_data));
3004 data->dst = dst;
3005 data->src = src;
3006 data->len = len;
3007 data->free_src = free_src;
3008 data->aq = aq;
3009 queue_push_callback (aq, copy_data, data);
3010 }
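/* Usage sketch (hypothetical variables aq, dev_dst, src and len): an
   asynchronous host-to-device update where a staging buffer should be
   released once the copy has run:

     void *staging = GOMP_PLUGIN_malloc (len);
     memcpy (staging, src, len);
     queue_push_copy (aq, dev_dst, staging, len, true);

   The final argument makes copy_data free the staging buffer after the
   transfer completes.  */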
3011
3012 /* Return true if the given queue is currently empty. */
3013
3014 static int
3015 queue_empty (struct goacc_asyncqueue *aq)
3016 {
3017 pthread_mutex_lock (&aq->mutex);
3018 int res = aq->queue_n == 0 ? 1 : 0;
3019 pthread_mutex_unlock (&aq->mutex);
3020
3021 return res;
3022 }
3023
3024 /* Wait for a given queue to become empty. This implements an OpenACC wait
3025 directive. */
3026
3027 static void
3028 wait_queue (struct goacc_asyncqueue *aq)
3029 {
3030 if (DRAIN_QUEUE_SYNCHRONOUS_P)
3031 {
3032 drain_queue_synchronous (aq);
3033 return;
3034 }
3035
3036 pthread_mutex_lock (&aq->mutex);
3037
3038 while (aq->queue_n > 0)
3039 {
3040 if (DEBUG_THREAD_SLEEP)
3041 GCN_DEBUG ("waiting for thread %d:%d, putting thread to sleep\n",
3042 aq->agent->device_id, aq->id);
3043 pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
3044 if (DEBUG_THREAD_SLEEP)
3045 GCN_DEBUG ("thread %d:%d woke up. Rechecking\n", aq->agent->device_id,
3046 aq->id);
3047 }
3048
3049 pthread_mutex_unlock (&aq->mutex);
3050 GCN_DEBUG ("waiting for thread %d:%d, done\n", aq->agent->device_id, aq->id);
3051 }
3052
3053 /* }}} */
3054 /* {{{ OpenACC support */
3055
3056 /* Execute an OpenACC kernel, synchronously or asynchronously. */
3057
3058 static void
3059 gcn_exec (struct kernel_info *kernel, size_t mapnum, void **hostaddrs,
3060 void **devaddrs, unsigned *dims, void *targ_mem_desc, bool async,
3061 struct goacc_asyncqueue *aq)
3062 {
3063 if (!GOMP_OFFLOAD_can_run (kernel))
3064 GOMP_PLUGIN_fatal ("OpenACC host fallback unimplemented.");
3065
3066 /* If we get here then this must be an OpenACC kernel. */
3067 kernel->kind = KIND_OPENACC;
3068
3069 /* devaddrs must be double-indirect on the target. */
3070 void **ind_da = alloc_by_agent (kernel->agent, sizeof (void*) * mapnum);
3071 for (size_t i = 0; i < mapnum; i++)
3072 hsa_fns.hsa_memory_copy_fn (&ind_da[i],
3073 devaddrs[i] ? &devaddrs[i] : &hostaddrs[i],
3074 sizeof (void *));
3075
3076 struct hsa_kernel_description *hsa_kernel_desc = NULL;
3077 for (unsigned i = 0; i < kernel->module->image_desc->kernel_count; i++)
3078 {
3079 struct hsa_kernel_description *d
3080 = &kernel->module->image_desc->kernel_infos[i];
3081 if (d->name == kernel->name)
3082 {
3083 hsa_kernel_desc = d;
3084 break;
3085 }
3086 }
3087
3088 /* We may have statically-determined dimensions in
3089 hsa_kernel_desc->oacc_dims[] or dimensions passed to this offload kernel
3090 invocation at runtime in dims[]. We allow static dimensions to take
3091 priority over dynamic dimensions when present (non-zero). */
3092 if (hsa_kernel_desc->oacc_dims[0] > 0)
3093 dims[0] = hsa_kernel_desc->oacc_dims[0];
3094 if (hsa_kernel_desc->oacc_dims[1] > 0)
3095 dims[1] = hsa_kernel_desc->oacc_dims[1];
3096 if (hsa_kernel_desc->oacc_dims[2] > 0)
3097 dims[2] = hsa_kernel_desc->oacc_dims[2];
3098
3099 /* If any of the OpenACC dimensions remain 0 then we get to pick a number.
3100 There isn't really a correct answer for this without a clue about the
3101 problem size, so let's do a reasonable number of single-worker gangs.
3102 64 gangs matches a typical Fiji device. */
3103
3104 /* NOTE: Until support for middle-end worker partitioning is merged, use 1
3105 for the default number of workers. */
3106 if (dims[0] == 0) dims[0] = get_cu_count (kernel->agent); /* Gangs. */
3107 if (dims[1] == 0) dims[1] = 1; /* Workers. */
3108
3109 /* The incoming dimensions are expressed in terms of gangs, workers, and
3110 vectors. The HSA dimensions are expressed in terms of "work-items",
3111 which means multiples of vector lanes.
3112
3113 The "grid size" specifies the size of the problem space, and the
3114 "work-group size" specifies how much of that we want a single compute
3115 unit to chew on at once.
3116
3117 The three dimensions do not really correspond to hardware, but the
3118 important thing is that the HSA runtime will launch as many
3119 work-groups as it takes to process the entire grid, and each
3120 work-group will contain as many wave-fronts as it takes to process
3121 the work-items in that group.
3122
3123 Essentially, as long as we set the Y dimension to 64 (the number of
3124 vector lanes in hardware), and the Z group size to the maximum (16),
3125 then we will get the gangs (X) and workers (Z) launched as we expect.
3126
3127 The reason for the apparent reversal of vector and worker dimension
3128 order is to do with the way the run-time distributes work-items across
3129 v1 and v2. */
3130 struct GOMP_kernel_launch_attributes kla =
3131 {3,
3132 /* Grid size. */
3133 {dims[0], 64, dims[1]},
3134 /* Work-group size. */
3135 {1, 64, 16}
3136 };
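  /* Worked example: dims = {4 gangs, 16 workers} yields a grid of
     4 x 64 x 16 work-items in work-groups of 1 x 64 x 16, so the HSA
     runtime launches four work-groups (one per gang), each containing
     sixteen 64-lane wave-fronts (one per worker).  */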
3137
3138 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
3139 acc_prof_info *prof_info = thr->prof_info;
3140 acc_event_info enqueue_launch_event_info;
3141 acc_api_info *api_info = thr->api_info;
3142 bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
3143 if (profiling_dispatch_p)
3144 {
3145 prof_info->event_type = acc_ev_enqueue_launch_start;
3146
3147 enqueue_launch_event_info.launch_event.event_type
3148 = prof_info->event_type;
3149 enqueue_launch_event_info.launch_event.valid_bytes
3150 = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
3151 enqueue_launch_event_info.launch_event.parent_construct
3152 = acc_construct_parallel;
3153 enqueue_launch_event_info.launch_event.implicit = 1;
3154 enqueue_launch_event_info.launch_event.tool_info = NULL;
3155 enqueue_launch_event_info.launch_event.kernel_name
3156 = (char *) kernel->name;
3157 enqueue_launch_event_info.launch_event.num_gangs = kla.gdims[0];
3158 enqueue_launch_event_info.launch_event.num_workers = kla.gdims[2];
3159 enqueue_launch_event_info.launch_event.vector_length = kla.gdims[1];
3160
3161 api_info->device_api = acc_device_api_other;
3162
3163 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3164 &enqueue_launch_event_info, api_info);
3165 }
3166
3167 if (!async)
3168 {
3169 run_kernel (kernel, ind_da, &kla, NULL, false);
3170 gomp_offload_free (ind_da);
3171 }
3172 else
3173 {
3174 queue_push_launch (aq, kernel, ind_da, &kla);
3175 if (DEBUG_QUEUES)
3176 GCN_DEBUG ("queue_push_callback %d:%d gomp_offload_free, %p\n",
3177 aq->agent->device_id, aq->id, ind_da);
3178 queue_push_callback (aq, gomp_offload_free, ind_da);
3179 }
3180
3181 if (profiling_dispatch_p)
3182 {
3183 prof_info->event_type = acc_ev_enqueue_launch_end;
3184 enqueue_launch_event_info.launch_event.event_type = prof_info->event_type;
3185 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
3186 &enqueue_launch_event_info,
3187 api_info);
3188 }
3189 }
3190
3191 /* }}} */
3192 /* {{{ Generic Plugin API */
3193
3194 /* Return the name of the accelerator, which is "gcn". */
3195
3196 const char *
3197 GOMP_OFFLOAD_get_name (void)
3198 {
3199 return "gcn";
3200 }
3201
3202 /* Return the specific capabilities the HSA accelerator has.  */
3203
3204 unsigned int
3205 GOMP_OFFLOAD_get_caps (void)
3206 {
3207 /* FIXME: Enable shared memory for APU, but not discrete GPU. */
3208 return /*GOMP_OFFLOAD_CAP_SHARED_MEM |*/ GOMP_OFFLOAD_CAP_OPENMP_400
3209 | GOMP_OFFLOAD_CAP_OPENACC_200;
3210 }
3211
3212 /* Identify as GCN accelerator. */
3213
3214 int
3215 GOMP_OFFLOAD_get_type (void)
3216 {
3217 return OFFLOAD_TARGET_TYPE_GCN;
3218 }
3219
3220 /* Return the libgomp version number we're compatible with. There is
3221 no requirement for cross-version compatibility. */
3222
3223 unsigned
3224 GOMP_OFFLOAD_version (void)
3225 {
3226 return GOMP_VERSION;
3227 }
3228
3229 /* Return the number of GCN devices on the system. */
3230
3231 int
3232 GOMP_OFFLOAD_get_num_devices (void)
3233 {
3234 if (!init_hsa_context ())
3235 return 0;
3236 return hsa_context.agent_count;
3237 }
3238
3239 /* Initialize device (agent) number N so that it can be used for computation.
3240 Return TRUE on success. */
3241
3242 bool
3243 GOMP_OFFLOAD_init_device (int n)
3244 {
3245 if (!init_hsa_context ())
3246 return false;
3247 if (n >= hsa_context.agent_count)
3248 {
3249 GOMP_PLUGIN_error ("Request to initialize non-existent GCN device %i", n);
3250 return false;
3251 }
3252 struct agent_info *agent = &hsa_context.agents[n];
3253
3254 if (agent->initialized)
3255 return true;
3256
3257 agent->device_id = n;
3258
3259 if (pthread_rwlock_init (&agent->module_rwlock, NULL))
3260 {
3261 GOMP_PLUGIN_error ("Failed to initialize a GCN agent rwlock");
3262 return false;
3263 }
3264 if (pthread_mutex_init (&agent->prog_mutex, NULL))
3265 {
3266 GOMP_PLUGIN_error ("Failed to initialize a GCN agent program mutex");
3267 return false;
3268 }
3269 if (pthread_mutex_init (&agent->async_queues_mutex, NULL))
3270 {
3271 GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
3272 return false;
3273 }
3274 if (pthread_mutex_init (&agent->team_arena_write_lock, NULL))
3275 {
3276 GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
3277 return false;
3278 }
3279 agent->async_queues = NULL;
3280 agent->omp_async_queue = NULL;
3281 agent->team_arena_list = NULL;
3282
3283 uint32_t queue_size;
3284 hsa_status_t status;
3285 status = hsa_fns.hsa_agent_get_info_fn (agent->id,
3286 HSA_AGENT_INFO_QUEUE_MAX_SIZE,
3287 &queue_size);
3288 if (status != HSA_STATUS_SUCCESS)
3289 return hsa_error ("Error requesting maximum queue size of the GCN agent",
3290 status);
3291
3292 char buf[64];
3293 status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_NAME,
3294 &buf);
3295 if (status != HSA_STATUS_SUCCESS)
3296 return hsa_error ("Error querying the name of the agent", status);
3297 agent->gfx900_p = (strncmp (buf, "gfx900", 6) == 0);
3298
3299 status = hsa_fns.hsa_queue_create_fn (agent->id, queue_size,
3300 HSA_QUEUE_TYPE_MULTI,
3301 hsa_queue_callback, NULL, UINT32_MAX,
3302 UINT32_MAX, &agent->sync_queue);
3303 if (status != HSA_STATUS_SUCCESS)
3304 return hsa_error ("Error creating command queue", status);
3305
3306 agent->kernarg_region.handle = (uint64_t) -1;
3307 status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
3308 get_kernarg_memory_region,
3309 &agent->kernarg_region);
3310 if (agent->kernarg_region.handle == (uint64_t) -1)
3311 {
3312 GOMP_PLUGIN_error ("Could not find suitable memory region for kernel "
3313 "arguments");
3314 return false;
3315 }
3316 GCN_DEBUG ("Selected kernel arguments memory region:\n");
3317 dump_hsa_region (agent->kernarg_region, NULL);
3318
3319 agent->data_region.handle = (uint64_t) -1;
3320 status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
3321 get_data_memory_region,
3322 &agent->data_region);
3323 if (agent->data_region.handle == (uint64_t) -1)
3324 {
3325 GOMP_PLUGIN_error ("Could not find suitable memory region for device "
3326 "data");
3327 return false;
3328 }
3329 GCN_DEBUG ("Selected device data memory region:\n");
3330 dump_hsa_region (agent->data_region, NULL);
3331
3332 GCN_DEBUG ("GCN agent %d initialized\n", n);
3333
3334 agent->initialized = true;
3335 return true;
3336 }
3337
3338 /* Load GCN object-code module described by struct gcn_image_desc in
3339 TARGET_DATA and return references to kernel descriptors in TARGET_TABLE.
3340 If there are any constructors then run them. */
3341
3342 int
3343 GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
3344 struct addr_pair **target_table)
3345 {
3346 if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
3347 {
3348 GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3349 " (expected %u, received %u)",
3350 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
3351 return -1;
3352 }
3353
3354 struct gcn_image_desc *image_desc = (struct gcn_image_desc *) target_data;
3355 struct agent_info *agent;
3356 struct addr_pair *pair;
3357 struct module_info *module;
3358 struct kernel_info *kernel;
3359 int kernel_count = image_desc->kernel_count;
3360 unsigned var_count = image_desc->global_variable_count;
3361
3362 agent = get_agent_info (ord);
3363 if (!agent)
3364 return -1;
3365
3366 if (pthread_rwlock_wrlock (&agent->module_rwlock))
3367 {
3368 GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3369 return -1;
3370 }
3371 if (agent->prog_finalized
3372 && !destroy_hsa_program (agent))
3373 return -1;
3374
3375 GCN_DEBUG ("Encountered %d kernels in an image\n", kernel_count);
3376 GCN_DEBUG ("Encountered %u global variables in an image\n", var_count);
3377   pair = GOMP_PLUGIN_malloc ((kernel_count + var_count - 2)
3378 			     * sizeof (struct addr_pair)); /* -2: init/fini.  */
3379 *target_table = pair;
3380 module = (struct module_info *)
3381 GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info)
3382 + kernel_count * sizeof (struct kernel_info));
3383 module->image_desc = image_desc;
3384 module->kernel_count = kernel_count;
3385 module->heap = NULL;
3386 module->constructors_run_p = false;
3387
3388 kernel = &module->kernels[0];
3389
3390 /* Allocate memory for kernel dependencies. */
3391 for (unsigned i = 0; i < kernel_count; i++)
3392 {
3393 struct hsa_kernel_description *d = &image_desc->kernel_infos[i];
3394 if (!init_basic_kernel_info (kernel, d, agent, module))
3395 return -1;
3396 if (strcmp (d->name, "_init_array") == 0)
3397 module->init_array_func = kernel;
3398 else if (strcmp (d->name, "_fini_array") == 0)
3399 module->fini_array_func = kernel;
3400 else
3401 {
3402 pair->start = (uintptr_t) kernel;
3403 pair->end = (uintptr_t) (kernel + 1);
3404 pair++;
3405 }
3406 kernel++;
3407 }
3408
3409 agent->module = module;
3410 if (pthread_rwlock_unlock (&agent->module_rwlock))
3411 {
3412 GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3413 return -1;
3414 }
3415
3416 if (!create_and_finalize_hsa_program (agent))
3417 return -1;
3418
3419 for (unsigned i = 0; i < var_count; i++)
3420 {
3421 struct global_var_info *v = &image_desc->global_variables[i];
3422 GCN_DEBUG ("Looking for variable %s\n", v->name);
3423
3424 hsa_status_t status;
3425 hsa_executable_symbol_t var_symbol;
3426 status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
3427 v->name, agent->id,
3428 0, &var_symbol);
3429
3430 if (status != HSA_STATUS_SUCCESS)
3431 hsa_fatal ("Could not find symbol for variable in the code object",
3432 status);
3433
3434 uint64_t var_addr;
3435 uint32_t var_size;
3436 status = hsa_fns.hsa_executable_symbol_get_info_fn
3437 (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &var_addr);
3438 if (status != HSA_STATUS_SUCCESS)
3439 hsa_fatal ("Could not extract a variable from its symbol", status);
3440 status = hsa_fns.hsa_executable_symbol_get_info_fn
3441 (var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &var_size);
3442 if (status != HSA_STATUS_SUCCESS)
3443 hsa_fatal ("Could not extract a variable size from its symbol", status);
3444
3445 pair->start = var_addr;
3446 pair->end = var_addr + var_size;
3447 GCN_DEBUG ("Found variable %s at %p with size %u\n", v->name,
3448 (void *)var_addr, var_size);
3449 pair++;
3450 }
3451
3452 /* Ensure that constructors are run first. */
3453 struct GOMP_kernel_launch_attributes kla =
3454 { 3,
3455 /* Grid size. */
3456 { 1, 64, 1 },
3457 /* Work-group size. */
3458 { 1, 64, 1 }
3459 };
3460
3461 if (module->init_array_func)
3462 {
3463 init_kernel (module->init_array_func);
3464 run_kernel (module->init_array_func, NULL, &kla, NULL, false);
3465 }
3466 module->constructors_run_p = true;
3467
3468 /* Don't report kernels that libgomp need not know about. */
3469 if (module->init_array_func)
3470 kernel_count--;
3471 if (module->fini_array_func)
3472 kernel_count--;
3473
3474 return kernel_count + var_count;
3475 }
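/* For example, an image containing kernels K1 and K2, the _init_array
   and _fini_array pseudo-kernels, and one global variable V produces a
   three-entry table:

     table[0] = { &K1, &K1 + 1 }   // kernel entries span the kernel_info
     table[1] = { &K2, &K2 + 1 }
     table[2] = { V_addr, V_addr + V_size }

   and the function returns 3; the pseudo-kernels are run here but never
   reported to libgomp.  */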
3476
3477 /* Unload GCN object-code module described by struct gcn_image_desc in
3478 TARGET_DATA from agent number N. Return TRUE on success. */
3479
3480 bool
3481 GOMP_OFFLOAD_unload_image (int n, unsigned version, const void *target_data)
3482 {
3483 if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
3484 {
3485 GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
3486 " (expected %u, received %u)",
3487 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
3488 return false;
3489 }
3490
3491 struct agent_info *agent;
3492 agent = get_agent_info (n);
3493 if (!agent)
3494 return false;
3495
3496 if (pthread_rwlock_wrlock (&agent->module_rwlock))
3497 {
3498 GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
3499 return false;
3500 }
3501
3502 if (!agent->module || agent->module->image_desc != target_data)
3503 {
3504 GOMP_PLUGIN_error ("Attempt to unload an image that has never been "
3505 "loaded before");
3506 return false;
3507 }
3508
3509 if (!destroy_module (agent->module, true))
3510 return false;
3511 free (agent->module);
3512 agent->module = NULL;
3513 if (!destroy_hsa_program (agent))
3514 return false;
3515 if (pthread_rwlock_unlock (&agent->module_rwlock))
3516 {
3517 GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
3518 return false;
3519 }
3520 return true;
3521 }
3522
3523 /* Deinitialize all information and status associated with agent number N. We
3524 do not attempt any synchronization, assuming the user and libgomp will not
3525 attempt deinitialization of a device that is in any way being used at the
3526 same time. Return TRUE on success. */
3527
3528 bool
3529 GOMP_OFFLOAD_fini_device (int n)
3530 {
3531 struct agent_info *agent = get_agent_info (n);
3532 if (!agent)
3533 return false;
3534
3535 if (!agent->initialized)
3536 return true;
3537
3538 if (agent->omp_async_queue)
3539 {
3540 GOMP_OFFLOAD_openacc_async_destruct (agent->omp_async_queue);
3541 agent->omp_async_queue = NULL;
3542 }
3543
3544 if (agent->module)
3545 {
3546 if (!destroy_module (agent->module, false))
3547 return false;
3548 free (agent->module);
3549 agent->module = NULL;
3550 }
3551
3552 if (!destroy_team_arenas (agent))
3553 return false;
3554
3555 if (!destroy_hsa_program (agent))
3556 return false;
3557
3558 hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (agent->sync_queue);
3559 if (status != HSA_STATUS_SUCCESS)
3560 return hsa_error ("Error destroying command queue", status);
3561
3562 if (pthread_mutex_destroy (&agent->prog_mutex))
3563 {
3564 GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex");
3565 return false;
3566 }
3567 if (pthread_rwlock_destroy (&agent->module_rwlock))
3568 {
3569 GOMP_PLUGIN_error ("Failed to destroy a GCN agent rwlock");
3570 return false;
3571 }
3572
3573 if (pthread_mutex_destroy (&agent->async_queues_mutex))
3574 {
3575 GOMP_PLUGIN_error ("Failed to destroy a GCN agent queue mutex");
3576 return false;
3577 }
3578 if (pthread_mutex_destroy (&agent->team_arena_write_lock))
3579 {
3580 GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex");
3581 return false;
3582 }
3583 agent->initialized = false;
3584 return true;
3585 }
3586
3587 /* Return true if the HSA runtime can run function FN_PTR. */
3588
3589 bool
3590 GOMP_OFFLOAD_can_run (void *fn_ptr)
3591 {
3592 struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
3593
3594 init_kernel (kernel);
3595 if (kernel->initialization_failed)
3596 goto failure;
3597
3598 return true;
3599
3600 failure:
3601 if (suppress_host_fallback)
3602 GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
3603 GCN_WARNING ("GCN target cannot be launched, doing a host fallback\n");
3604 return false;
3605 }
3606
3607 /* Allocate memory on device N. */
3608
3609 void *
3610 GOMP_OFFLOAD_alloc (int n, size_t size)
3611 {
3612 struct agent_info *agent = get_agent_info (n);
3613 return alloc_by_agent (agent, size);
3614 }

/* Free memory from DEVICE.  */

bool
GOMP_OFFLOAD_free (int device, void *ptr)
{
  GCN_DEBUG ("Freeing memory on device %d\n", device);

  hsa_status_t status = hsa_fns.hsa_memory_free_fn (ptr);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Could not free device memory", status);
      return false;
    }

  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  bool profiling_dispatch_p
    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
  if (profiling_dispatch_p)
    {
      acc_prof_info *prof_info = thr->prof_info;
      acc_event_info data_event_info;
      acc_api_info *api_info = thr->api_info;

      prof_info->event_type = acc_ev_free;

      data_event_info.data_event.event_type = prof_info->event_type;
      data_event_info.data_event.valid_bytes
        = _ACC_DATA_EVENT_INFO_VALID_BYTES;
      data_event_info.data_event.parent_construct
        = acc_construct_parallel;
      data_event_info.data_event.implicit = 1;
      data_event_info.data_event.tool_info = NULL;
      data_event_info.data_event.var_name = NULL;
      data_event_info.data_event.bytes = 0;
      data_event_info.data_event.host_ptr = NULL;
      data_event_info.data_event.device_ptr = (void *) ptr;

      api_info->device_api = acc_device_api_other;

      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
                                            api_info);
    }

  return true;
}
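
/* As an illustration (not part of the plugin), the acc_ev_free events
   dispatched above can be observed by a host-side tool via the standard
   OpenACC profiling interface; the callback name "on_free" is made up for
   this sketch:

     static void
     on_free (acc_prof_info *pi, acc_event_info *ei, acc_api_info *ai)
     {
       fprintf (stderr, "freed %p on device %d\n",
                ei->data_event.device_ptr, pi->device_number);
     }

     ...
     acc_prof_register (acc_ev_free, on_free, acc_reg);  */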

/* Copy data from DEVICE to host.  */

bool
GOMP_OFFLOAD_dev2host (int device, void *dst, const void *src, size_t n)
{
  GCN_DEBUG ("Copying %zu bytes from device %d (%p) to host (%p)\n", n, device,
             src, dst);
  hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
  if (status != HSA_STATUS_SUCCESS)
    {
      GOMP_PLUGIN_error ("memory copy failed");
      return false;
    }
  return true;
}

/* Copy data from host to DEVICE.  */

bool
GOMP_OFFLOAD_host2dev (int device, void *dst, const void *src, size_t n)
{
  GCN_DEBUG ("Copying %zu bytes from host (%p) to device %d (%p)\n", n, src,
             device, dst);
  hsa_memory_copy_wrapper (dst, src, n);
  return true;
}

/* Copy data within DEVICE.  Do the copy asynchronously, if appropriate.  */

bool
GOMP_OFFLOAD_dev2dev (int device, void *dst, const void *src, size_t n)
{
  struct gcn_thread *thread_data = gcn_thread ();

  if (thread_data && !async_synchronous_p (thread_data->async))
    {
      struct agent_info *agent = get_agent_info (device);
      maybe_init_omp_async (agent);
      queue_push_copy (agent->omp_async_queue, dst, src, n, false);
      return true;
    }

  GCN_DEBUG ("Copying %zu bytes from device %d (%p) to device %d (%p)\n", n,
             device, src, device, dst);
  hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
  if (status != HSA_STATUS_SUCCESS)
    {
      GOMP_PLUGIN_error ("memory copy failed");
      return false;
    }
  return true;
}
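
/* For reference, a device-to-device copy like the one above is reached via
   standard OpenMP host code along these lines (SIZE and DEV are
   placeholders for this sketch):

     void *a = omp_target_alloc (size, dev);
     void *b = omp_target_alloc (size, dev);
     omp_target_memcpy (b, a, size, 0, 0, dev, dev);

   Whether the copy is queued or performed immediately depends on the
   calling thread's async setting, as tested above.  */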

/* }}}  */
/* {{{ OpenMP Plugin API  */

/* Run a synchronous OpenMP kernel on DEVICE and pass it an array of pointers
   in VARS as a parameter.  The kernel is identified by FN_PTR, which must
   point to a kernel_info structure for a kernel that has previously been
   loaded to the specified device.  */

void
GOMP_OFFLOAD_run (int device, void *fn_ptr, void *vars, void **args)
{
  struct agent_info *agent = get_agent_info (device);
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
  struct GOMP_kernel_launch_attributes def;
  struct GOMP_kernel_launch_attributes *kla;
  assert (agent == kernel->agent);

  /* If we get here then the kernel must be OpenMP.  */
  kernel->kind = KIND_OPENMP;

  if (!parse_target_attributes (args, &def, &kla, agent))
    {
      GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
      return;
    }
  run_kernel (kernel, vars, kla, NULL, false);
}
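
/* For reference, parse_target_attributes consumes ARGS as a NULL-terminated
   list encoded per gomp-constants.h; a simplified decoding sketch (not the
   plugin's actual parser) looks like:

     for (void **p = args; *p; p++)
       {
         intptr_t id = (intptr_t) *p, value;
         if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
           value = (intptr_t) *++p;
         else
           value = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
         switch (id & GOMP_TARGET_ARG_ID_MASK)
           {
           case GOMP_TARGET_ARG_NUM_TEAMS:
             ... VALUE is the requested number of teams ...
           case GOMP_TARGET_ARG_THREAD_LIMIT:
             ... VALUE is the requested thread limit ...
           }
       }  */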

/* Run an asynchronous OpenMP kernel on DEVICE.  This is similar to
   GOMP_OFFLOAD_run except that the launch is queued and there is a call to
   GOMP_PLUGIN_target_task_completion when it has finished.  */

void
GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
                        void **args, void *async_data)
{
  GCN_DEBUG ("GOMP_OFFLOAD_async_run invoked\n");
  struct agent_info *agent = get_agent_info (device);
  struct kernel_info *kernel = (struct kernel_info *) tgt_fn;
  struct GOMP_kernel_launch_attributes def;
  struct GOMP_kernel_launch_attributes *kla;
  assert (agent == kernel->agent);

  /* If we get here then the kernel must be OpenMP.  */
  kernel->kind = KIND_OPENMP;

  if (!parse_target_attributes (args, &def, &kla, agent))
    {
      GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
      return;
    }

  maybe_init_omp_async (agent);
  queue_push_launch (agent->omp_async_queue, kernel, tgt_vars, kla);
  queue_push_callback (agent->omp_async_queue,
                       GOMP_PLUGIN_target_task_completion, async_data);
}
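
/* The two queue entries pushed above rely on the FIFO ordering of the
   async queue: the callback entry cannot start until the launch entry has
   completed, so GOMP_PLUGIN_target_task_completion fires only after the
   kernel has finished.  The synchronous equivalent would be:

     run_kernel (kernel, tgt_vars, kla, NULL, false);
     GOMP_PLUGIN_target_task_completion (async_data);  */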

/* }}} */
/* {{{ OpenACC Plugin API  */

/* Run a synchronous OpenACC kernel.  The device number is inferred from the
   already-loaded KERNEL.  */

void
GOMP_OFFLOAD_openacc_exec (void (*fn_ptr) (void *), size_t mapnum,
                           void **hostaddrs, void **devaddrs, unsigned *dims,
                           void *targ_mem_desc)
{
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;

  gcn_exec (kernel, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, false,
            NULL);
}

/* Run an asynchronous OpenACC kernel on the specified queue.  */

void
GOMP_OFFLOAD_openacc_async_exec (void (*fn_ptr) (void *), size_t mapnum,
                                 void **hostaddrs, void **devaddrs,
                                 unsigned *dims, void *targ_mem_desc,
                                 struct goacc_asyncqueue *aq)
{
  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;

  gcn_exec (kernel, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, true,
            aq);
}

/* Create a new asynchronous thread and queue for running future kernels.  */

struct goacc_asyncqueue *
GOMP_OFFLOAD_openacc_async_construct (int device)
{
  struct agent_info *agent = get_agent_info (device);

  pthread_mutex_lock (&agent->async_queues_mutex);

  struct goacc_asyncqueue *aq = GOMP_PLUGIN_malloc (sizeof (*aq));
  aq->agent = get_agent_info (device);
  aq->prev = NULL;
  aq->next = agent->async_queues;
  if (aq->next)
    {
      aq->next->prev = aq;
      aq->id = aq->next->id + 1;
    }
  else
    aq->id = 1;
  agent->async_queues = aq;

  aq->queue_first = 0;
  aq->queue_n = 0;
  aq->drain_queue_stop = 0;

  if (pthread_mutex_init (&aq->mutex, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
      pthread_mutex_unlock (&agent->async_queues_mutex);
      return NULL;
    }
  if (pthread_cond_init (&aq->queue_cond_in, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
      pthread_mutex_unlock (&agent->async_queues_mutex);
      return NULL;
    }
  if (pthread_cond_init (&aq->queue_cond_out, NULL))
    {
      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue cond");
      pthread_mutex_unlock (&agent->async_queues_mutex);
      return NULL;
    }

  hsa_status_t status = hsa_fns.hsa_queue_create_fn (agent->id,
                                                     ASYNC_QUEUE_SIZE,
                                                     HSA_QUEUE_TYPE_MULTI,
                                                     hsa_queue_callback, NULL,
                                                     UINT32_MAX, UINT32_MAX,
                                                     &aq->hsa_queue);
  if (status != HSA_STATUS_SUCCESS)
    hsa_fatal ("Error creating command queue", status);

  int err = pthread_create (&aq->thread_drain_queue, NULL, &drain_queue, aq);
  if (err != 0)
    GOMP_PLUGIN_fatal ("GCN asynchronous thread creation failed: %s",
                       strerror (err));
  GCN_DEBUG ("Async thread %d:%d: created\n", aq->agent->device_id,
             aq->id);

  pthread_mutex_unlock (&agent->async_queues_mutex);

  return aq;
}
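
/* For reference, libgomp drives a queue created above through a lifecycle
   along these lines (device 0 is an arbitrary example; FN and DATA are
   placeholders):

     struct goacc_asyncqueue *aq = GOMP_OFFLOAD_openacc_async_construct (0);
     GOMP_OFFLOAD_openacc_async_queue_callback (aq, fn, data);
     if (!GOMP_OFFLOAD_openacc_async_test (aq))
       GOMP_OFFLOAD_openacc_async_synchronize (aq);
     GOMP_OFFLOAD_openacc_async_destruct (aq);  */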

/* Destroy an existing asynchronous thread and queue.  Waits for any
   currently-running task to complete, but cancels any queued tasks.  */

bool
GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = aq->agent;

  finalize_async_thread (aq);

  pthread_mutex_lock (&agent->async_queues_mutex);

  int err;
  if ((err = pthread_mutex_destroy (&aq->mutex)))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue mutex: %d", err);
      goto fail;
    }
  if (pthread_cond_destroy (&aq->queue_cond_in))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
      goto fail;
    }
  if (pthread_cond_destroy (&aq->queue_cond_out))
    {
      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
      goto fail;
    }
  hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (aq->hsa_queue);
  if (status != HSA_STATUS_SUCCESS)
    {
      hsa_error ("Error destroying command queue", status);
      goto fail;
    }

  if (aq->prev)
    aq->prev->next = aq->next;
  if (aq->next)
    aq->next->prev = aq->prev;
  if (agent->async_queues == aq)
    agent->async_queues = aq->next;

  GCN_DEBUG ("Async thread %d:%d: destroyed\n", agent->device_id, aq->id);

  free (aq);
  pthread_mutex_unlock (&agent->async_queues_mutex);
  return true;

fail:
  pthread_mutex_unlock (&agent->async_queues_mutex);
  return false;
}

/* Return true if the specified async queue is currently empty.  */

int
GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
{
  return queue_empty (aq);
}

/* Block until the specified queue has executed all its tasks and the
   queue is empty.  */

bool
GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
{
  wait_queue (aq);
  return true;
}

/* Add a serialization point across two async queues.  Any new tasks added to
   AQ2 after this call will not run until all tasks that were on AQ1 at the
   time of this call have completed.  */

bool
GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
                                      struct goacc_asyncqueue *aq2)
{
  /* For serialize, stream aq2 waits for aq1 to complete work that has been
     scheduled to run on it up to this point.  */
  if (aq1 != aq2)
    {
      struct placeholder *placeholderp = queue_push_placeholder (aq1);
      queue_push_asyncwait (aq2, placeholderp);
    }
  return true;
}
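
/* For example, for the OpenACC directive

     #pragma acc wait (1) async (2)

   libgomp arranges (roughly; see oacc-async.c for the real dispatch) for
   this hook to be called with AQ1 being queue 1 and AQ2 being queue 2, so
   that work later enqueued on queue 2 waits for everything queue 1 held at
   the time of the wait.  */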

/* Add an opaque callback to the given async queue.  */

void
GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
                                           void (*fn) (void *), void *data)
{
  queue_push_callback (aq, fn, data);
}

/* Queue up an asynchronous data copy from host to DEVICE.  */

bool
GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
                                     size_t n, struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = get_agent_info (device);
  assert (agent == aq->agent);
  /* The source data does not necessarily remain live until the deferred
     copy happens.  Taking a snapshot of the data here avoids reading
     uninitialised data later, but means that (a) the data is copied twice
     and (b) modifications made to the source between the enqueuing of the
     asynchronous copy and its execution will not be seen.  Both are
     acceptable: an asynchronous copy may read its source at any point
     between being enqueued and completing.  */
  void *src_copy = GOMP_PLUGIN_malloc (n);
  memcpy (src_copy, src, n);
  queue_push_copy (aq, dst, src_copy, n, true);
  return true;
}
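
/* The snapshot above matters because callers may pass transient host
   storage here; a sketch of the hazard it avoids (BUF, D_PTR, DEV and AQ
   are placeholders):

     {
       char buf[256];
       prepare (buf);
       GOMP_OFFLOAD_openacc_async_host2dev (dev, d_ptr, buf, 256, aq);
     }
     ... buf is dead here, but the queued copy may not have run yet ...

   Without the memcpy into SRC_COPY, the deferred copy could read freed or
   reused stack memory.  */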

/* Queue up an asynchronous data copy from DEVICE to host.  */

bool
GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src,
                                     size_t n, struct goacc_asyncqueue *aq)
{
  struct agent_info *agent = get_agent_info (device);
  assert (agent == aq->agent);
  queue_push_copy (aq, dst, src, n, false);
  return true;
}

/* Return the value of property PROP of device number DEVICE.  Currently a
   stub: the device number is validated, and the default (zero) value is
   returned for every property.  */

union goacc_property_value
GOMP_OFFLOAD_openacc_get_property (int device, enum goacc_property prop)
{
  /* TODO: Implement this function for real.  */
  get_agent_info (device);

  union goacc_property_value nullval = { .val = 0 };
  return nullval;
}
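
/* A fuller implementation might serve at least the name and memory queries
   from data the plugin already has.  A sketch, assuming the GOACC_PROPERTY_*
   codes from gomp-constants.h; "agent->name" is a hypothetical field that
   would cache HSA_AGENT_INFO_NAME at device-initialization time, and the
   plugin's heap size is only a stand-in for the real device memory:

     union goacc_property_value propval = { .val = 0 };
     struct agent_info *agent = get_agent_info (device);
     switch (prop)
       {
       case GOACC_PROPERTY_NAME:
         propval.ptr = agent->name;
         break;
       case GOACC_PROPERTY_MEMORY:
         propval.val = gcn_kernel_heap_size;
         break;
       default:
         break;
       }
     return propval;  */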

/* Set up plugin-specific thread-local data (host side).  */

void *
GOMP_OFFLOAD_openacc_create_thread_data (int ord __attribute__((unused)))
{
  struct gcn_thread *thread_data
    = GOMP_PLUGIN_malloc (sizeof (struct gcn_thread));

  thread_data->async = GOMP_ASYNC_SYNC;

  return (void *) thread_data;
}

/* Clean up plugin-specific thread-local data.  */

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}

/* }}} */