1 /* Plugin for NVPTX execution.
2
3 Copyright (C) 2013-2022 Free Software Foundation, Inc.
4
5 Contributed by Mentor Embedded.
6
7 This file is part of the GNU Offloading and Multi Processing Library
8 (libgomp).
9
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
13 any later version.
14
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
18 more details.
19
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
23
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
28
29 /* Nvidia PTX-specific parts of OpenACC support.  The cuda driver
30 library appears to hold some implicit state, but the documentation
31 is not clear as to what that state might be, or how one might
32 propagate it from one thread to another.  */
33
34 #define _GNU_SOURCE
35 #include "openacc.h"
36 #include "config.h"
37 #include "symcat.h"
38 #include "libgomp-plugin.h"
39 #include "oacc-plugin.h"
40 #include "gomp-constants.h"
41 #include "oacc-int.h"
42
43 /* For struct rev_offload + GOMP_REV_OFFLOAD_VAR. */
44 #include "config/nvptx/libgomp-nvptx.h"
45
46 #include <pthread.h>
47 #ifndef PLUGIN_NVPTX_INCLUDE_SYSTEM_CUDA_H
48 # include "cuda/cuda.h"
49 #else
50 # include <cuda.h>
51 #endif
52 #include <stdbool.h>
53 #include <limits.h>
54 #include <string.h>
55 #include <stdio.h>
56 #include <unistd.h>
57 #include <assert.h>
58 #include <errno.h>
59 #include <stdlib.h>
60
61 /* An arbitrary fixed limit (128MB) for the size of the OpenMP soft stacks
62 block to cache between kernel invocations. For soft-stacks blocks bigger
63 than this, we will free the block before attempting another GPU memory
64 allocation (i.e. in GOMP_OFFLOAD_alloc). Otherwise, if an allocation fails,
65 we will free the cached soft-stacks block anyway and then retry the
66 allocation. If that fails too, we lose. */
67
68 #define SOFTSTACK_CACHE_LIMIT 134217728
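/* 134217728 == 128 * 1024 * 1024, i.e. the 128MB mentioned above.  */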
69
70 #if CUDA_VERSION < 6000
71 extern CUresult cuGetErrorString (CUresult, const char **);
72 #define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
73 #endif
74
75 #if CUDA_VERSION >= 6050
76 #undef cuLinkCreate
77 #undef cuLinkAddData
78 CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
79 const char *, unsigned, CUjit_option *, void **);
80 CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
81 #undef cuMemHostRegister
82 CUresult cuMemHostRegister (void *, size_t, unsigned int);
83 #else
84 typedef size_t (*CUoccupancyB2DSize)(int);
85 CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
86 const char *, unsigned, CUjit_option *, void **);
87 CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
88 CUresult cuMemHostRegister_v2 (void *, size_t, unsigned int);
89 CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
90 CUoccupancyB2DSize, size_t, int);
91 #endif
92
93 #define DO_PRAGMA(x) _Pragma (#x)
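/* The DO_PRAGMA indirection ensures that a macro argument (for example the
   function name passed to CUDA_ONE_CALL_MAYBE_NULL below) is macro-expanded
   before being stringized for _Pragma.  */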
94
95 #ifndef PLUGIN_NVPTX_LINK_LIBCUDA
96 # include <dlfcn.h>
97
98 struct cuda_lib_s {
99
100 # define CUDA_ONE_CALL(call) \
101 __typeof (call) *call;
102 # define CUDA_ONE_CALL_MAYBE_NULL(call) \
103 CUDA_ONE_CALL (call)
104 #include "cuda-lib.def"
105 # undef CUDA_ONE_CALL
106 # undef CUDA_ONE_CALL_MAYBE_NULL
107
108 } cuda_lib;
109
110 /* -1 if init_cuda_lib has not been called yet, false
111 if it has been and failed, true if it has been and succeeded. */
112 static signed char cuda_lib_inited = -1;
113
114 /* Dynamically load the CUDA driver library (libcuda) and initialize its
115 function pointers; return false if unsuccessful, true if successful.  */
116 static bool
117 init_cuda_lib (void)
118 {
119 if (cuda_lib_inited != -1)
120 return cuda_lib_inited;
121 const char *cuda_runtime_lib = "libcuda.so.1";
122 void *h = dlopen (cuda_runtime_lib, RTLD_LAZY);
123 cuda_lib_inited = false;
124 if (h == NULL)
125 return false;
126
127 # define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false)
128 # define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true)
129 # define CUDA_ONE_CALL_1(call, allow_null) \
130 cuda_lib.call = dlsym (h, #call); \
131 if (!allow_null && cuda_lib.call == NULL) \
132 return false;
133 #include "cuda-lib.def"
134 # undef CUDA_ONE_CALL
135 # undef CUDA_ONE_CALL_1
136 # undef CUDA_ONE_CALL_MAYBE_NULL
137
138 cuda_lib_inited = true;
139 return true;
140 }
141 # define CUDA_CALL_PREFIX cuda_lib.
142 #else
143
144 # define CUDA_ONE_CALL(call)
145 # define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call)
146 #include "cuda-lib.def"
147 #undef CUDA_ONE_CALL_MAYBE_NULL
148 #undef CUDA_ONE_CALL
149
150 # define CUDA_CALL_PREFIX
151 # define init_cuda_lib() true
152 #endif
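/* To illustrate the X-macro scheme above, assume "cuda-lib.def" contains an
   entry such as 'CUDA_ONE_CALL (cuMemAlloc)' (a sketch for the reader; the
   exact entries live in that file).  In the dlopen path it expands to a
   member of 'struct cuda_lib_s':

     __typeof (cuMemAlloc) *cuMemAlloc;

   and, inside 'init_cuda_lib', to:

     cuda_lib.cuMemAlloc = dlsym (h, "cuMemAlloc");
     if (!false && cuda_lib.cuMemAlloc == NULL)
       return false;

   so that 'CUDA_CALL_PREFIX cuMemAlloc (...)' becomes
   'cuda_lib.cuMemAlloc (...)'.  In the direct-link path the same entry
   expands to nothing (MAYBE_NULL entries get a '#pragma weak'), and
   CUDA_CALL_PREFIX is empty, so calls go straight to the linked libcuda.  */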
153
154 #include "secure_getenv.h"
155
156 #undef MIN
157 #undef MAX
158 #define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
159 #define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
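/* Note that MIN and MAX evaluate their arguments more than once, so they
   must only be used with side-effect-free operands.  */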
160
161 /* Convenience macros for the frequently used CUDA library call and
162 error handling sequence as well as CUDA library calls that
163 do the error checking themselves or don't do it at all. */
164
165 #define CUDA_CALL_ERET(ERET, FN, ...) \
166 do { \
167 unsigned __r \
168 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
169 if (__r != CUDA_SUCCESS) \
170 { \
171 GOMP_PLUGIN_error (#FN " error: %s", \
172 cuda_error (__r)); \
173 return ERET; \
174 } \
175 } while (0)
176
177 #define CUDA_CALL(FN, ...) \
178 CUDA_CALL_ERET (false, FN, __VA_ARGS__)
179
180 #define CUDA_CALL_ASSERT(FN, ...) \
181 do { \
182 unsigned __r \
183 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
184 if (__r != CUDA_SUCCESS) \
185 { \
186 GOMP_PLUGIN_fatal (#FN " error: %s", \
187 cuda_error (__r)); \
188 } \
189 } while (0)
190
191 #define CUDA_CALL_NOCHECK(FN, ...) \
192 CUDA_CALL_PREFIX FN (__VA_ARGS__)
193
194 #define CUDA_CALL_EXISTS(FN) \
195 CUDA_CALL_PREFIX FN
196
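/* Illustrative usage of the macros above (a sketch, not code taken from this
   file; 'd' and 'size' stand for a CUdeviceptr and a size_t in scope):

     CUDA_CALL (cuMemAlloc, &d, size);        // report error, return false
     CUDA_CALL_ERET (NULL, cuMemAlloc, &d, size);   // ditto, but return NULL
     CUDA_CALL_ASSERT (cuMemFree, d);         // fatal error on failure
     CUresult r = CUDA_CALL_NOCHECK (cuMemFree, d); // caller checks R itself
     if (CUDA_CALL_EXISTS (cuGetErrorString)) // optional entry point present?
       ...

   All of them dispatch through CUDA_CALL_PREFIX, i.e. either through the
   dlopen'd 'cuda_lib' function-pointer table or directly to the linked
   libcuda.  */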
197 static const char *
198 cuda_error (CUresult r)
199 {
200 const char *fallback = "unknown cuda error";
201 const char *desc;
202
203 if (!CUDA_CALL_EXISTS (cuGetErrorString))
204 return fallback;
205
206 r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
207 if (r == CUDA_SUCCESS)
208 return desc;
209
210 return fallback;
211 }
212
213 /* Version of CUDA supported by the driver, in the same MAJOR.MINOR format used by
214 Nvidia, such as in the 'deviceQuery' program (Nvidia's CUDA samples).  */
215 static char cuda_driver_version_s[30];
216
217 static unsigned int instantiated_devices = 0;
218 static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
219
220 /* NVPTX/CUDA specific definition of asynchronous queues. */
221 struct goacc_asyncqueue
222 {
223 CUstream cuda_stream;
224 pthread_mutex_t page_locked_host_unregister_blocks_lock;
225 struct ptx_free_block *page_locked_host_unregister_blocks;
226 };
227
228 struct nvptx_callback
229 {
230 void (*fn) (void *);
231 void *ptr;
232 struct goacc_asyncqueue *aq;
233 struct nvptx_callback *next;
234 };
235
236 /* Thread-specific data for PTX. */
237
238 struct nvptx_thread
239 {
240 /* We currently have this embedded inside the plugin because libgomp manages
241 devices through integer target_ids. This might be better if using an
242 opaque target-specific pointer directly from gomp_device_descr. */
243 struct ptx_device *ptx_dev;
244 };
245
246 /* Target data function launch information. */
247
248 struct targ_fn_launch
249 {
250 const char *fn;
251 unsigned short dim[GOMP_DIM_MAX];
252 };
253
254 /* Target PTX object information. */
255
256 struct targ_ptx_obj
257 {
258 const char *code;
259 size_t size;
260 };
261
262 /* Target data image information. */
263
264 typedef struct nvptx_tdata
265 {
266 const struct targ_ptx_obj *ptx_objs;
267 unsigned ptx_num;
268
269 const char *const *var_names;
270 unsigned var_num;
271
272 const struct targ_fn_launch *fn_descs;
273 unsigned fn_num;
274 } nvptx_tdata_t;
275
276 /* Descriptor of a loaded function. */
277
278 struct targ_fn_descriptor
279 {
280 CUfunction fn;
281 const struct targ_fn_launch *launch;
282 int regs_per_thread;
283 int max_threads_per_block;
284 };
285
286 /* A loaded PTX image. */
287 struct ptx_image_data
288 {
289 const void *target_data;
290 CUmodule module;
291
292 struct targ_fn_descriptor *fns; /* Array of functions. */
293
294 struct ptx_image_data *next;
295 };
296
297 struct ptx_free_block
298 {
299 void *ptr;
300 struct ptx_free_block *next;
301 };
302
303 struct ptx_device
304 {
305 CUcontext ctx;
306 bool ctx_shared;
307 CUdevice dev;
308
309 int ord;
310 bool overlap;
311 bool map;
312 bool concur;
313 bool mkern;
314 int mode;
315 int clock_khz;
316 int num_sms;
317 int regs_per_block;
318 int regs_per_sm;
319 int warp_size;
320 int max_threads_per_block;
321 int max_threads_per_multiprocessor;
322 bool read_only_host_register_supported;
323 int default_dims[GOMP_DIM_MAX];
324 int compute_major, compute_minor;
325
326 /* Length as used by the CUDA Runtime API ('struct cudaDeviceProp'). */
327 char name[256];
328
329 struct ptx_image_data *images; /* Images loaded on device. */
330 pthread_mutex_t image_lock; /* Lock for above list. */
331
332 struct ptx_free_block *free_blocks;
333 pthread_mutex_t free_blocks_lock;
334
335 /* OpenMP stacks, cached between kernel invocations. */
336 struct
337 {
338 CUdeviceptr ptr;
339 size_t size;
340 pthread_mutex_t lock;
341 } omp_stacks;
342
343 struct rev_offload *rev_data;
344 struct ptx_device *next;
345 };
346
347 static struct ptx_device **ptx_devices;
348
349 static struct ptx_free_block *free_host_blocks = NULL;
350 static pthread_mutex_t free_host_blocks_lock = PTHREAD_MUTEX_INITIALIZER;
351
352 static bool
353 nvptx_run_deferred_page_locked_host_free (void)
354 {
355 GOMP_PLUGIN_debug (0, "%s\n",
356 __FUNCTION__);
357
358 pthread_mutex_lock (&free_host_blocks_lock);
359 struct ptx_free_block *b = free_host_blocks;
360 free_host_blocks = NULL;
361 pthread_mutex_unlock (&free_host_blocks_lock);
362
363 while (b)
364 {
365 GOMP_PLUGIN_debug (0, " b=%p: cuMemFreeHost(b->ptr=%p)\n",
366 b, b->ptr);
367
368 struct ptx_free_block *b_next = b->next;
369 CUDA_CALL (cuMemFreeHost, b->ptr);
370 free (b);
371 b = b_next;
372 }
373 return true;
374 }
375
376 /* OpenMP kernels reserve a small amount of ".shared" space for use by
377 omp_alloc. The size is configured using GOMP_NVPTX_LOWLAT_POOL, but the
378 default is set here. */
379 static unsigned lowlat_pool_size = 8*1024;
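/* For example, running with 'GOMP_NVPTX_LOWLAT_POOL=16384' in the environment
   (parsed in GOMP_OFFLOAD_init_device below) doubles the default 8 KiB to
   16 KiB of ".shared" space reserved for omp_alloc; 16384 is just an
   illustration, any decimal value that fits in 'unsigned int' is accepted.  */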
380
381 static bool nvptx_do_global_cdtors (CUmodule, struct ptx_device *,
382 const char *);
383 static size_t nvptx_stacks_size ();
384 static void *nvptx_stacks_acquire (struct ptx_device *, size_t, int);
385
386 static inline struct nvptx_thread *
387 nvptx_thread (void)
388 {
389 return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
390 }
391
392 /* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK
393 should be locked on entry and remains locked on exit. */
394
395 static bool
396 nvptx_init (void)
397 {
398 int ndevs;
399
400 if (instantiated_devices != 0)
401 return true;
402
403 if (!init_cuda_lib ())
404 return false;
405
406 CUDA_CALL (cuInit, 0);
407
408 int cuda_driver_version;
409 CUDA_CALL_ERET (false, cuDriverGetVersion, &cuda_driver_version);
410 snprintf (cuda_driver_version_s, sizeof cuda_driver_version_s,
411 "CUDA Driver %u.%u",
412 cuda_driver_version / 1000, cuda_driver_version % 1000 / 10);
413
414 CUDA_CALL (cuDeviceGetCount, &ndevs);
415 ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
416 * ndevs);
417
418 return true;
419 }
420
421 /* Select the N'th PTX device for the current host thread.  The device must
422 have been opened before calling this function.  */
423
424 static bool
425 nvptx_attach_host_thread_to_device (int n)
426 {
427 CUdevice dev;
428 CUresult r;
429 struct ptx_device *ptx_dev;
430 CUcontext thd_ctx;
431
432 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
433 if (r == CUDA_ERROR_NOT_PERMITTED)
434 {
435 /* Assume we're in a CUDA callback, just return true. */
436 return true;
437 }
438 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
439 {
440 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
441 return false;
442 }
443
444 if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
445 return true;
446 else
447 {
448 CUcontext old_ctx;
449
450 ptx_dev = ptx_devices[n];
451 if (!ptx_dev)
452 {
453 GOMP_PLUGIN_error ("device %d not found", n);
454 return false;
455 }
456
457 CUDA_CALL (cuCtxGetCurrent, &thd_ctx);
458
459 /* We don't necessarily have a current context (e.g. if it has been
460 destroyed).  Pop it if we do though.  */
461 if (thd_ctx != NULL)
462 CUDA_CALL (cuCtxPopCurrent, &old_ctx);
463
464 CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
465 }
466 return true;
467 }
468
469 static struct ptx_device *
470 nvptx_open_device (int n)
471 {
472 struct ptx_device *ptx_dev;
473 CUdevice dev, ctx_dev;
474 CUresult r;
475 int pi;
476
477 CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);
478
479 ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));
480
481 ptx_dev->ord = n;
482 ptx_dev->dev = dev;
483 ptx_dev->ctx_shared = false;
484
485 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
486 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
487 {
488 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
489 return NULL;
490 }
491
492 if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
493 {
494 /* The current host thread has an active context for a different device.
495 Detach it. */
496 CUcontext old_ctx;
497 CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
498 }
499
500 CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);
501
502 if (!ptx_dev->ctx)
503 CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
504 else
505 ptx_dev->ctx_shared = true;
506
507 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
508 &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
509 ptx_dev->overlap = pi;
510
511 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
512 &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
513 ptx_dev->map = pi;
514
515 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
516 &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
517 ptx_dev->concur = pi;
518
519 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
520 &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
521 ptx_dev->mode = pi;
522
523 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
524 &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
525 ptx_dev->mkern = pi;
526
527 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
528 &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
529 ptx_dev->clock_khz = pi;
530
531 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
532 &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
533 ptx_dev->num_sms = pi;
534
535 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
536 &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
537 ptx_dev->regs_per_block = pi;
538
539 /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only
540 in CUDA 6.0 and newer. */
541 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
542 CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
543 dev);
544 /* Fallback: use limit of registers per block, which is usually equal. */
545 if (r == CUDA_ERROR_INVALID_VALUE)
546 pi = ptx_dev->regs_per_block;
547 else if (r != CUDA_SUCCESS)
548 {
549 GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
550 return NULL;
551 }
552 ptx_dev->regs_per_sm = pi;
553
554 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
555 &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
556 if (pi != 32)
557 {
558 GOMP_PLUGIN_error ("Only warp size 32 is supported");
559 return NULL;
560 }
561 ptx_dev->warp_size = pi;
562
563 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
564 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
565 ptx_dev->max_threads_per_block = pi;
566
567 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
568 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
569 ptx_dev->max_threads_per_multiprocessor = pi;
570
571 /* Required below for reverse offload as implemented, but with compute
572 capability >= 2.0 and 64-bit device processes, this should universally
573 be the case; hence, an assert.  */
574 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
575 CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev);
576 assert (r == CUDA_SUCCESS && pi);
577
578 /* This is a CUDA 11.1 feature. */
579 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
580 CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED,
581 dev);
582 if (r == CUDA_ERROR_INVALID_VALUE)
583 pi = false;
584 else if (r != CUDA_SUCCESS)
585 {
586 GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
587 return NULL;
588 }
589 ptx_dev->read_only_host_register_supported = pi;
590
591 for (int i = 0; i != GOMP_DIM_MAX; i++)
592 ptx_dev->default_dims[i] = 0;
593
594 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
595 CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev);
596 ptx_dev->compute_major = pi;
597
598 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
599 CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev);
600 ptx_dev->compute_minor = pi;
601
602 CUDA_CALL_ERET (NULL, cuDeviceGetName, ptx_dev->name, sizeof ptx_dev->name,
603 dev);
604
605 ptx_dev->images = NULL;
606 pthread_mutex_init (&ptx_dev->image_lock, NULL);
607
608 ptx_dev->free_blocks = NULL;
609 pthread_mutex_init (&ptx_dev->free_blocks_lock, NULL);
610
611 ptx_dev->omp_stacks.ptr = 0;
612 ptx_dev->omp_stacks.size = 0;
613 pthread_mutex_init (&ptx_dev->omp_stacks.lock, NULL);
614
615 ptx_dev->rev_data = NULL;
616
617 return ptx_dev;
618 }
619
620 static bool
621 nvptx_close_device (struct ptx_device *ptx_dev)
622 {
623 if (!ptx_dev)
624 return true;
625
626 bool ret = true;
627
628 for (struct ptx_image_data *image = ptx_dev->images;
629 image != NULL;
630 image = image->next)
631 {
632 if (!nvptx_do_global_cdtors (image->module, ptx_dev,
633 "__do_global_dtors__entry"))
634 ret = false;
635 }
636
637 for (struct ptx_free_block *b = ptx_dev->free_blocks; b;)
638 {
639 struct ptx_free_block *b_next = b->next;
640 CUDA_CALL (cuMemFree, (CUdeviceptr) b->ptr);
641 free (b);
642 b = b_next;
643 }
644
645 pthread_mutex_destroy (&ptx_dev->free_blocks_lock);
646 pthread_mutex_destroy (&ptx_dev->image_lock);
647
648 pthread_mutex_destroy (&ptx_dev->omp_stacks.lock);
649
650 if (ptx_dev->omp_stacks.ptr)
651 CUDA_CALL (cuMemFree, ptx_dev->omp_stacks.ptr);
652
653 if (!ptx_dev->ctx_shared)
654 CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
655
656 free (ptx_dev);
657
658 return ret;
659 }
660
661 static int
662 nvptx_get_num_devices (void)
663 {
664 int n;
665
666 /* This function will be called before the plugin has been initialized in
667 order to enumerate available devices, but CUDA API routines can't be used
668 until cuInit has been called. Just call it now (but don't yet do any
669 further initialization). */
670 if (instantiated_devices == 0)
671 {
672 if (!init_cuda_lib ())
673 return 0;
674 CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
675 /* This is not an error: e.g. we may have CUDA libraries installed but
676 no devices available. */
677 if (r != CUDA_SUCCESS)
678 {
679 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
680 cuda_error (r));
681 return 0;
682 }
683 }
684
685 CUDA_CALL_ERET (-1, cuDeviceGetCount, &n);
686 return n;
687 }
688
689 static void
690 notify_var (const char *var_name, const char *env_var)
691 {
692 if (env_var == NULL)
693 GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name);
694 else
695 GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var);
696 }
697
698 static void
699 process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o)
700 {
701 const char *var_name = "GOMP_NVPTX_JIT";
702 const char *env_var = secure_getenv (var_name);
703 notify_var (var_name, env_var);
704
705 if (env_var == NULL)
706 return;
707
708 const char *c = env_var;
709 while (*c != '\0')
710 {
711 while (*c == ' ')
712 c++;
713
714 if (c[0] == '-' && c[1] == 'O'
715 && '0' <= c[2] && c[2] <= '4'
716 && (c[3] == '\0' || c[3] == ' '))
717 {
718 *gomp_nvptx_o = c[2] - '0';
719 c += 3;
720 continue;
721 }
722
723 GOMP_PLUGIN_error ("Error parsing %s", var_name);
724 break;
725 }
726 }
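/* For example, 'GOMP_NVPTX_JIT=-O2' makes the parser above set *gomp_nvptx_o
   to 2, which link_ptx below forwards to the PTX JIT via
   CU_JIT_OPTIMIZATION_LEVEL.  Only '-O0' through '-O4' are recognized;
   anything else is reported as a parse error.  */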
727
728 static bool
729 link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
730 unsigned num_objs)
731 {
732 CUjit_option opts[7];
733 void *optvals[7];
734 float elapsed = 0.0;
735 char elog[1024];
736 char ilog[16384];
737 CUlinkState linkstate;
738 CUresult r;
739 void *linkout;
740 size_t linkoutsize __attribute__ ((unused));
741
742 opts[0] = CU_JIT_WALL_TIME;
743 optvals[0] = &elapsed;
744
745 opts[1] = CU_JIT_INFO_LOG_BUFFER;
746 optvals[1] = &ilog[0];
747
748 opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
749 optvals[2] = (void *) sizeof ilog;
750
751 opts[3] = CU_JIT_ERROR_LOG_BUFFER;
752 optvals[3] = &elog[0];
753
754 opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
755 optvals[4] = (void *) sizeof elog;
756
757 opts[5] = CU_JIT_LOG_VERBOSE;
758 optvals[5] = (void *) 1;
759
760 static intptr_t gomp_nvptx_o = -1;
761
762 static bool init_done = false;
763 if (!init_done)
764 {
765 process_GOMP_NVPTX_JIT (&gomp_nvptx_o);
766 init_done = true;
767 }
768
769 int nopts = 6;
770 if (gomp_nvptx_o != -1)
771 {
772 opts[nopts] = CU_JIT_OPTIMIZATION_LEVEL;
773 optvals[nopts] = (void *) gomp_nvptx_o;
774 nopts++;
775 }
776
777 if (CUDA_CALL_EXISTS (cuLinkCreate_v2))
778 CUDA_CALL (cuLinkCreate_v2, nopts, opts, optvals, &linkstate);
779 else
780 CUDA_CALL (cuLinkCreate, nopts, opts, optvals, &linkstate);
781
782 for (; num_objs--; ptx_objs++)
783 {
784 /* cuLinkAddData's 'data' argument erroneously omits the const
785 qualifier. */
786 GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
787 if (CUDA_CALL_EXISTS (cuLinkAddData_v2))
788 r = CUDA_CALL_NOCHECK (cuLinkAddData_v2, linkstate, CU_JIT_INPUT_PTX,
789 (char *) ptx_objs->code, ptx_objs->size,
790 0, 0, 0, 0);
791 else
792 r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
793 (char *) ptx_objs->code, ptx_objs->size,
794 0, 0, 0, 0);
795 if (r != CUDA_SUCCESS)
796 {
797 GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
798 GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
799 cuda_error (r));
800 return false;
801 }
802 }
803
804 GOMP_PLUGIN_debug (0, "Linking\n");
805 r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);
806
807 GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
808 GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
809
810 if (r != CUDA_SUCCESS)
811 {
812 GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
813 GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
814 return false;
815 }
816
817 CUDA_CALL (cuModuleLoadData, module, linkout);
818 CUDA_CALL (cuLinkDestroy, linkstate);
819 return true;
820 }
821
822 static void
823 nvptx_exec (void (*fn), unsigned *dims, void *targ_mem_desc,
824 CUdeviceptr dp, CUstream stream)
825 {
826 struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
827 CUfunction function;
828 int i;
829 void *kargs[1];
830 struct nvptx_thread *nvthd = nvptx_thread ();
831 int warp_size = nvthd->ptx_dev->warp_size;
832
833 function = targ_fn->fn;
834
835 /* Initialize the launch dimensions. Typically this is constant,
836 provided by the device compiler, but we must permit runtime
837 values. */
838 int seen_zero = 0;
839 for (i = 0; i != GOMP_DIM_MAX; i++)
840 {
841 if (targ_fn->launch->dim[i])
842 dims[i] = targ_fn->launch->dim[i];
843 if (!dims[i])
844 seen_zero = 1;
845 }
846
847 if (seen_zero)
848 {
849 pthread_mutex_lock (&ptx_dev_lock);
850
851 static int gomp_openacc_dims[GOMP_DIM_MAX];
852 if (!gomp_openacc_dims[0])
853 {
854 /* See if the user provided GOMP_OPENACC_DIM environment
855 variable to specify runtime defaults. */
856 for (int i = 0; i < GOMP_DIM_MAX; ++i)
857 gomp_openacc_dims[i] = GOMP_PLUGIN_acc_default_dim (i);
858 }
859
860 if (!nvthd->ptx_dev->default_dims[0])
861 {
862 int default_dims[GOMP_DIM_MAX];
863 for (int i = 0; i < GOMP_DIM_MAX; ++i)
864 default_dims[i] = gomp_openacc_dims[i];
865
866 int gang, worker, vector;
867 {
868 int block_size = nvthd->ptx_dev->max_threads_per_block;
869 int cpu_size = nvthd->ptx_dev->max_threads_per_multiprocessor;
870 int dev_size = nvthd->ptx_dev->num_sms;
871 GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
872 " dev_size=%d, cpu_size=%d\n",
873 warp_size, block_size, dev_size, cpu_size);
874
875 gang = (cpu_size / block_size) * dev_size;
876 worker = block_size / warp_size;
877 vector = warp_size;
878 }
879
880 /* There is no upper bound on the gang size. The best size
881 matches the hardware configuration. Logical gangs are
882 scheduled onto physical hardware. To maximize usage, we
883 should guess a large number. */
884 if (default_dims[GOMP_DIM_GANG] < 1)
885 default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
886 /* The worker size must not exceed the hardware. */
887 if (default_dims[GOMP_DIM_WORKER] < 1
888 || (default_dims[GOMP_DIM_WORKER] > worker && gang))
889 default_dims[GOMP_DIM_WORKER] = worker;
890 /* The vector size must exactly match the hardware. */
891 if (default_dims[GOMP_DIM_VECTOR] < 1
892 || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
893 default_dims[GOMP_DIM_VECTOR] = vector;
894
895 GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
896 default_dims[GOMP_DIM_GANG],
897 default_dims[GOMP_DIM_WORKER],
898 default_dims[GOMP_DIM_VECTOR]);
899
900 for (i = 0; i != GOMP_DIM_MAX; i++)
901 nvthd->ptx_dev->default_dims[i] = default_dims[i];
902 }
903 pthread_mutex_unlock (&ptx_dev_lock);
904
905 {
906 bool default_dim_p[GOMP_DIM_MAX];
907 for (i = 0; i != GOMP_DIM_MAX; i++)
908 default_dim_p[i] = !dims[i];
909
910 if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize))
911 {
912 for (i = 0; i != GOMP_DIM_MAX; i++)
913 if (default_dim_p[i])
914 dims[i] = nvthd->ptx_dev->default_dims[i];
915
916 if (default_dim_p[GOMP_DIM_VECTOR])
917 dims[GOMP_DIM_VECTOR]
918 = MIN (dims[GOMP_DIM_VECTOR],
919 (targ_fn->max_threads_per_block / warp_size
920 * warp_size));
921
922 if (default_dim_p[GOMP_DIM_WORKER])
923 dims[GOMP_DIM_WORKER]
924 = MIN (dims[GOMP_DIM_WORKER],
925 targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
926 }
927 else
928 {
929 /* Handle the case that the compiler allows the runtime to choose
930 the vector-length conservatively, by ignoring
931 gomp_openacc_dims[GOMP_DIM_VECTOR]. TODO: actually handle
932 it. */
933 int vectors = 0;
934 /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that
935 gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
936 exceed targ_fn->max_threads_per_block.  */
937 int workers = gomp_openacc_dims[GOMP_DIM_WORKER];
938 int gangs = gomp_openacc_dims[GOMP_DIM_GANG];
939 int grids, blocks;
940
941 CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
942 &blocks, function, NULL, 0,
943 dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
944 GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
945 "grid = %d, block = %d\n", grids, blocks);
946
947 /* Keep the num_gangs proportional to the block size.  In
948 the case where a block size is limited by shared-memory
949 or the register file capacity, the runtime will not
950 excessively over-assign gangs to the multiprocessor
951 units if their state is going to be swapped out even
952 more than necessary.  The constant factor 2 is there to
953 prevent threads from idling when there is insufficient
954 work for them.  */
955 if (gangs == 0)
956 gangs = 2 * grids * (blocks / warp_size);
957
958 if (vectors == 0)
959 vectors = warp_size;
960
961 if (workers == 0)
962 {
963 int actual_vectors = (default_dim_p[GOMP_DIM_VECTOR]
964 ? vectors
965 : dims[GOMP_DIM_VECTOR]);
966 workers = blocks / actual_vectors;
967 workers = MAX (workers, 1);
968 /* If we need a per-worker barrier ... . */
969 if (actual_vectors > 32)
970 /* Don't use more barriers than available. */
971 workers = MIN (workers, 15);
972 }
973
974 for (i = 0; i != GOMP_DIM_MAX; i++)
975 if (default_dim_p[i])
976 switch (i)
977 {
978 case GOMP_DIM_GANG: dims[i] = gangs; break;
979 case GOMP_DIM_WORKER: dims[i] = workers; break;
980 case GOMP_DIM_VECTOR: dims[i] = vectors; break;
981 default: GOMP_PLUGIN_fatal ("invalid dim");
982 }
983 }
984 }
985 }
986
987 /* Check if the accelerator has sufficient hardware resources to
988 launch the offloaded kernel. */
989 if (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]
990 > targ_fn->max_threads_per_block)
991 {
992 const char *msg
993 = ("The Nvidia accelerator has insufficient resources to launch '%s'"
994 " with num_workers = %d and vector_length = %d"
995 "; "
996 "recompile the program with 'num_workers = x and vector_length = y'"
997 " on that offloaded region or '-fopenacc-dim=:x:y' where"
998 " x * y <= %d"
999 ".\n");
1000 GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
1001 dims[GOMP_DIM_VECTOR], targ_fn->max_threads_per_block);
1002 }
1003
1004 /* Check if the accelerator has sufficient barrier resources to
1005 launch the offloaded kernel. */
1006 if (dims[GOMP_DIM_WORKER] > 15 && dims[GOMP_DIM_VECTOR] > 32)
1007 {
1008 const char *msg
1009 = ("The Nvidia accelerator has insufficient barrier resources to launch"
1010 " '%s' with num_workers = %d and vector_length = %d"
1011 "; "
1012 "recompile the program with 'num_workers = x' on that offloaded"
1013 " region or '-fopenacc-dim=:x:' where x <= 15"
1014 "; "
1015 "or, recompile the program with 'vector_length = 32' on that"
1016 " offloaded region or '-fopenacc-dim=::32'"
1017 ".\n");
1018 GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
1019 dims[GOMP_DIM_VECTOR]);
1020 }
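/* For example, the check above rejects num_workers = 16 combined with
   vector_length = 64: each of the 16 workers would need its own barrier,
   but only 15 such barriers are usable (cf. the MIN (workers, 15) clamp
   earlier).  */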
1021
1022 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
1023 " gangs=%u, workers=%u, vectors=%u\n",
1024 __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
1025 dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);
1026
1027 // OpenACC CUDA
1028 //
1029 // num_gangs nctaid.x
1030 // num_workers ntid.y
1031 // vector length ntid.x
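// For instance (a worked example only): num_gangs=1024, num_workers=8,
// vector_length=32 reaches the cuLaunchKernel call below as gridDimX=1024,
// blockDimX=32, blockDimY=8, i.e. 1024 CUDA blocks of 256 threads each.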
1032
1033 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1034 acc_prof_info *prof_info = thr->prof_info;
1035 acc_event_info enqueue_launch_event_info;
1036 acc_api_info *api_info = thr->api_info;
1037 bool profiling_p = __builtin_expect (prof_info != NULL, false);
1038 if (profiling_p)
1039 {
1040 prof_info->event_type = acc_ev_enqueue_launch_start;
1041
1042 enqueue_launch_event_info.launch_event.event_type
1043 = prof_info->event_type;
1044 enqueue_launch_event_info.launch_event.valid_bytes
1045 = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
1046 enqueue_launch_event_info.launch_event.parent_construct
1047 = acc_construct_parallel;
1048 enqueue_launch_event_info.launch_event.implicit = 1;
1049 enqueue_launch_event_info.launch_event.tool_info = NULL;
1050 enqueue_launch_event_info.launch_event.kernel_name = targ_fn->launch->fn;
1051 enqueue_launch_event_info.launch_event.num_gangs
1052 = dims[GOMP_DIM_GANG];
1053 enqueue_launch_event_info.launch_event.num_workers
1054 = dims[GOMP_DIM_WORKER];
1055 enqueue_launch_event_info.launch_event.vector_length
1056 = dims[GOMP_DIM_VECTOR];
1057
1058 api_info->device_api = acc_device_api_cuda;
1059
1060 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
1061 api_info);
1062 }
1063
1064 /* Per 'nvptx_goacc_validate_dims'. */
1065 assert (dims[GOMP_DIM_VECTOR] % warp_size == 0);
1066
1067 kargs[0] = &dp;
1068 CUDA_CALL_ASSERT (cuLaunchKernel, function,
1069 dims[GOMP_DIM_GANG], 1, 1,
1070 dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
1071 0, stream, kargs, 0);
1072
1073 if (profiling_p)
1074 {
1075 prof_info->event_type = acc_ev_enqueue_launch_end;
1076 enqueue_launch_event_info.launch_event.event_type
1077 = prof_info->event_type;
1078 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
1079 api_info);
1080 }
1081
1082 GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__,
1083 targ_fn->launch->fn);
1084 }
1085
1086 void * openacc_get_current_cuda_context (void);
1087
1088 static void
1089 goacc_profiling_acc_ev_alloc (struct goacc_thread *thr, void *dp, size_t s)
1090 {
1091 acc_prof_info *prof_info = thr->prof_info;
1092 acc_event_info data_event_info;
1093 acc_api_info *api_info = thr->api_info;
1094
1095 prof_info->event_type = acc_ev_alloc;
1096
1097 data_event_info.data_event.event_type = prof_info->event_type;
1098 data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1099 data_event_info.data_event.parent_construct = acc_construct_parallel;
1100 data_event_info.data_event.implicit = 1;
1101 data_event_info.data_event.tool_info = NULL;
1102 data_event_info.data_event.var_name = NULL;
1103 data_event_info.data_event.bytes = s;
1104 data_event_info.data_event.host_ptr = NULL;
1105 data_event_info.data_event.device_ptr = dp;
1106
1107 api_info->device_api = acc_device_api_cuda;
1108
1109 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
1110 }
1111
1112 /* Free the cached soft-stacks block if it is above the SOFTSTACK_CACHE_LIMIT
1113 size threshold, or if FORCE is true. */
1114
1115 static void
1116 nvptx_stacks_free (struct ptx_device *ptx_dev, bool force)
1117 {
1118 pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
1119 if (ptx_dev->omp_stacks.ptr
1120 && (force || ptx_dev->omp_stacks.size > SOFTSTACK_CACHE_LIMIT))
1121 {
1122 CUresult r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
1123 if (r != CUDA_SUCCESS)
1124 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
1125 ptx_dev->omp_stacks.ptr = 0;
1126 ptx_dev->omp_stacks.size = 0;
1127 }
1128 pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
1129 }
1130
1131 static void *
1132 nvptx_alloc (size_t s, bool suppress_errors, bool usm)
1133 {
1134 CUdeviceptr d;
1135
1136 CUresult r = (usm ? CUDA_CALL_NOCHECK (cuMemAllocManaged, &d, s,
1137 CU_MEM_ATTACH_GLOBAL)
1138 : CUDA_CALL_NOCHECK (cuMemAlloc, &d, s));
1139 if (suppress_errors && r == CUDA_ERROR_OUT_OF_MEMORY)
1140 return NULL;
1141 else if (r != CUDA_SUCCESS)
1142 {
1143 GOMP_PLUGIN_error ("nvptx_alloc error: %s", cuda_error (r));
1144 return NULL;
1145 }
1146
1147 /* NOTE: We only do profiling stuff if the memory allocation succeeds. */
1148 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1149 bool profiling_p
1150 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1151 if (profiling_p)
1152 goacc_profiling_acc_ev_alloc (thr, (void *) d, s);
1153
1154 return (void *) d;
1155 }
1156
1157 static void
1158 goacc_profiling_acc_ev_free (struct goacc_thread *thr, void *p)
1159 {
1160 acc_prof_info *prof_info = thr->prof_info;
1161 acc_event_info data_event_info;
1162 acc_api_info *api_info = thr->api_info;
1163
1164 prof_info->event_type = acc_ev_free;
1165
1166 data_event_info.data_event.event_type = prof_info->event_type;
1167 data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1168 data_event_info.data_event.parent_construct = acc_construct_parallel;
1169 data_event_info.data_event.implicit = 1;
1170 data_event_info.data_event.tool_info = NULL;
1171 data_event_info.data_event.var_name = NULL;
1172 data_event_info.data_event.bytes = -1;
1173 data_event_info.data_event.host_ptr = NULL;
1174 data_event_info.data_event.device_ptr = p;
1175
1176 api_info->device_api = acc_device_api_cuda;
1177
1178 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
1179 }
1180
1181 static bool
1182 nvptx_free (void *p, struct ptx_device *ptx_dev)
1183 {
1184 CUdeviceptr pb;
1185 size_t ps;
1186
1187 CUresult r = CUDA_CALL_NOCHECK (cuMemGetAddressRange, &pb, &ps,
1188 (CUdeviceptr) p);
1189 if (r == CUDA_ERROR_NOT_PERMITTED)
1190 {
1191 /* We assume that this error indicates we are in a CUDA callback context,
1192 where no CUDA calls are allowed (see the cuStreamAddCallback
1193 documentation for a description).  Arrange to free this piece of device
1194 memory later.  */
1195 struct ptx_free_block *n
1196 = GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block));
1197 n->ptr = p;
1198 pthread_mutex_lock (&ptx_dev->free_blocks_lock);
1199 n->next = ptx_dev->free_blocks;
1200 ptx_dev->free_blocks = n;
1201 pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
1202 return true;
1203 }
1204 else if (r != CUDA_SUCCESS)
1205 {
1206 GOMP_PLUGIN_error ("cuMemGetAddressRange error: %s", cuda_error (r));
1207 return false;
1208 }
1209 if ((CUdeviceptr) p != pb)
1210 {
1211 GOMP_PLUGIN_error ("invalid device address");
1212 return false;
1213 }
1214
1215 CUDA_CALL (cuMemFree, (CUdeviceptr) p);
1216 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1217 bool profiling_p
1218 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1219 if (profiling_p)
1220 goacc_profiling_acc_ev_free (thr, p);
1221
1222 return true;
1223 }
1224
1225 static void *
1226 nvptx_get_current_cuda_device (void)
1227 {
1228 struct nvptx_thread *nvthd = nvptx_thread ();
1229
1230 if (!nvthd || !nvthd->ptx_dev)
1231 return NULL;
1232
1233 return &nvthd->ptx_dev->dev;
1234 }
1235
1236 static void *
1237 nvptx_get_current_cuda_context (void)
1238 {
1239 struct nvptx_thread *nvthd = nvptx_thread ();
1240
1241 if (!nvthd || !nvthd->ptx_dev)
1242 return NULL;
1243
1244 return nvthd->ptx_dev->ctx;
1245 }
1246
1247 /* Plugin entry points. */
1248
1249 const char *
1250 GOMP_OFFLOAD_get_name (void)
1251 {
1252 return "nvptx";
1253 }
1254
1255 unsigned int
1256 GOMP_OFFLOAD_get_caps (void)
1257 {
1258 return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
1259 }
1260
1261 int
1262 GOMP_OFFLOAD_get_type (void)
1263 {
1264 return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
1265 }
1266
1267 int
1268 GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask)
1269 {
1270 int num_devices = nvptx_get_num_devices ();
1271 /* Return -1 if the omp_requires_mask cannot be fulfilled even though
1272 devices are present.  Unified-shared address: see comment in
1273 nvptx_open_device for CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING.  */
1274 if (num_devices > 0
1275 && ((omp_requires_mask
1276 & ~(GOMP_REQUIRES_UNIFIED_ADDRESS
1277 | GOMP_REQUIRES_UNIFIED_SHARED_MEMORY
1278 | GOMP_REQUIRES_REVERSE_OFFLOAD)) != 0))
1279 return -1;
1280 return num_devices;
1281 }
1282
1283 bool
1284 GOMP_OFFLOAD_init_device (int n)
1285 {
1286 struct ptx_device *dev;
1287
1288 pthread_mutex_lock (&ptx_dev_lock);
1289
1290 if (!nvptx_init () || ptx_devices[n] != NULL)
1291 {
1292 pthread_mutex_unlock (&ptx_dev_lock);
1293 return false;
1294 }
1295
1296 dev = nvptx_open_device (n);
1297 if (dev)
1298 {
1299 ptx_devices[n] = dev;
1300 instantiated_devices++;
1301 }
1302
1303 const char *var_name = "GOMP_NVPTX_LOWLAT_POOL";
1304 const char *env_var = secure_getenv (var_name);
1305 notify_var (var_name, env_var);
1306
1307 if (env_var != NULL)
1308 {
1309 char *endptr;
1310 unsigned long val = strtoul (env_var, &endptr, 10);
1311 if (endptr == NULL || *endptr != '\0'
1312 || errno == ERANGE || errno == EINVAL
1313 || val > UINT_MAX)
1314 GOMP_PLUGIN_error ("Error parsing %s", var_name);
1315 else
1316 lowlat_pool_size = val;
1317 }
1318
1319 pthread_mutex_unlock (&ptx_dev_lock);
1320
1321 return dev != NULL;
1322 }
1323
1324 bool
1325 GOMP_OFFLOAD_fini_device (int n)
1326 {
1327 /* This isn't related to this specific 'ptx_devices[n]', but is a convenient
1328 place to clean up. */
1329 if (!nvptx_run_deferred_page_locked_host_free ())
1330 return false;
1331
1332 pthread_mutex_lock (&ptx_dev_lock);
1333
1334 if (ptx_devices[n] != NULL)
1335 {
1336 if (!nvptx_attach_host_thread_to_device (n)
1337 || !nvptx_close_device (ptx_devices[n]))
1338 {
1339 pthread_mutex_unlock (&ptx_dev_lock);
1340 return false;
1341 }
1342 ptx_devices[n] = NULL;
1343 instantiated_devices--;
1344 }
1345
1346 if (instantiated_devices == 0)
1347 {
1348 free (ptx_devices);
1349 ptx_devices = NULL;
1350 }
1351
1352 pthread_mutex_unlock (&ptx_dev_lock);
1353 return true;
1354 }
1355
1356 /* Return the libgomp version number we're compatible with. There is
1357 no requirement for cross-version compatibility. */
1358
1359 unsigned
1360 GOMP_OFFLOAD_version (void)
1361 {
1362 return GOMP_VERSION;
1363 }
1364
1365 /* Initialize __nvptx_clocktick, if present in MODULE. */
1366
1367 static void
1368 nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
1369 {
1370 CUdeviceptr dptr;
1371 CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
1372 module, "__nvptx_clocktick");
1373 if (r == CUDA_ERROR_NOT_FOUND)
1374 return;
1375 if (r != CUDA_SUCCESS)
1376 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1377 double __nvptx_clocktick = 1e-3 / dev->clock_khz;
1378 r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
1379 sizeof (__nvptx_clocktick));
1380 if (r != CUDA_SUCCESS)
1381 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
1382 }
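/* For example, a device reporting clock_khz = 1500000 (1.5 GHz) gets
   __nvptx_clocktick = 1e-3 / 1500000 = ~6.7e-10, i.e. roughly 0.67 ns per
   clock tick.  */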
1383
1384 /* Invoke MODULE's global constructors/destructors. */
1385
1386 static bool
1387 nvptx_do_global_cdtors (CUmodule module, struct ptx_device *ptx_dev,
1388 const char *funcname)
1389 {
1390 bool ret = true;
1391 char *funcname_mgomp = NULL;
1392 CUresult r;
1393 CUfunction funcptr;
1394 r = CUDA_CALL_NOCHECK (cuModuleGetFunction,
1395 &funcptr, module, funcname);
1396 GOMP_PLUGIN_debug (0, "cuModuleGetFunction (%s): %s\n",
1397 funcname, cuda_error (r));
1398 if (r == CUDA_ERROR_NOT_FOUND)
1399 {
1400 /* Try '[funcname]__mgomp'. */
1401
1402 size_t funcname_len = strlen (funcname);
1403 const char *mgomp_suffix = "__mgomp";
1404 size_t mgomp_suffix_len = strlen (mgomp_suffix);
1405 funcname_mgomp
1406 = GOMP_PLUGIN_malloc (funcname_len + mgomp_suffix_len + 1);
1407 memcpy (funcname_mgomp, funcname, funcname_len);
1408 memcpy (funcname_mgomp + funcname_len,
1409 mgomp_suffix, mgomp_suffix_len + 1);
1410 funcname = funcname_mgomp;
1411
1412 r = CUDA_CALL_NOCHECK (cuModuleGetFunction,
1413 &funcptr, module, funcname);
1414 GOMP_PLUGIN_debug (0, "cuModuleGetFunction (%s): %s\n",
1415 funcname, cuda_error (r));
1416 }
1417 if (r == CUDA_ERROR_NOT_FOUND)
1418 ;
1419 else if (r != CUDA_SUCCESS)
1420 {
1421 GOMP_PLUGIN_error ("cuModuleGetFunction (%s) error: %s",
1422 funcname, cuda_error (r));
1423 ret = false;
1424 }
1425 else
1426 {
1427 /* If necessary, set up soft stack. */
1428 void *nvptx_stacks_0;
1429 void *kargs[1];
1430 if (funcname_mgomp)
1431 {
1432 size_t stack_size = nvptx_stacks_size ();
1433 pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
1434 nvptx_stacks_0 = nvptx_stacks_acquire (ptx_dev, stack_size, 1);
1435 nvptx_stacks_0 += stack_size;
1436 kargs[0] = &nvptx_stacks_0;
1437 }
1438 r = CUDA_CALL_NOCHECK (cuLaunchKernel,
1439 funcptr,
1440 1, 1, 1, 1, 1, 1,
1441 /* sharedMemBytes */ 0,
1442 /* hStream */ NULL,
1443 /* kernelParams */ funcname_mgomp ? kargs : NULL,
1444 /* extra */ NULL);
1445 if (r != CUDA_SUCCESS)
1446 {
1447 GOMP_PLUGIN_error ("cuLaunchKernel (%s) error: %s",
1448 funcname, cuda_error (r));
1449 ret = false;
1450 }
1451
1452 r = CUDA_CALL_NOCHECK (cuStreamSynchronize,
1453 NULL);
1454 if (r != CUDA_SUCCESS)
1455 {
1456 GOMP_PLUGIN_error ("cuStreamSynchronize (%s) error: %s",
1457 funcname, cuda_error (r));
1458 ret = false;
1459 }
1460
1461 if (funcname_mgomp)
1462 pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
1463 }
1464
1465 if (funcname_mgomp)
1466 free (funcname_mgomp);
1467
1468 return ret;
1469 }
1470
1471 /* Load the (partial) program described by TARGET_DATA to device
1472 number ORD. Allocate and return TARGET_TABLE. If not NULL, REV_FN_TABLE
1473 will contain the on-device addresses of the functions for reverse offload.
1474 To be freed by the caller. */
1475
1476 int
1477 GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
1478 struct addr_pair **target_table,
1479 uint64_t **rev_fn_table)
1480 {
1481 CUmodule module;
1482 const char *const *var_names;
1483 const struct targ_fn_launch *fn_descs;
1484 unsigned int fn_entries, var_entries, other_entries, i, j;
1485 struct targ_fn_descriptor *targ_fns;
1486 struct addr_pair *targ_tbl;
1487 const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
1488 struct ptx_image_data *new_image;
1489 struct ptx_device *dev;
1490
1491 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1492 {
1493 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1494 " (expected %u, received %u)",
1495 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1496 return -1;
1497 }
1498
1499 if (!nvptx_attach_host_thread_to_device (ord)
1500 || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
1501 return -1;
1502
1503 dev = ptx_devices[ord];
1504
1505 /* The mkoffload utility emits a struct of pointers/integers at the
1506 start of each offload image. The array of kernel names and the
1507 function addresses form a one-to-one correspondence.  */
1508
1509 var_entries = img_header->var_num;
1510 var_names = img_header->var_names;
1511 fn_entries = img_header->fn_num;
1512 fn_descs = img_header->fn_descs;
1513
1514 /* Currently, other_entries contains only the struct of ICVs. */
1515 other_entries = 1;
1516
1517 targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
1518 * (fn_entries + var_entries + other_entries));
1519 targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
1520 * fn_entries);
1521
1522 *target_table = targ_tbl;
1523
1524 new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
1525 new_image->target_data = target_data;
1526 new_image->module = module;
1527 new_image->fns = targ_fns;
1528
1529 pthread_mutex_lock (&dev->image_lock);
1530 new_image->next = dev->images;
1531 dev->images = new_image;
1532 pthread_mutex_unlock (&dev->image_lock);
1533
1534 for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
1535 {
1536 CUfunction function;
1537 int nregs, mthrs;
1538
1539 CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
1540 fn_descs[i].fn);
1541 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
1542 CU_FUNC_ATTRIBUTE_NUM_REGS, function);
1543 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
1544 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);
1545
1546 targ_fns->fn = function;
1547 targ_fns->launch = &fn_descs[i];
1548 targ_fns->regs_per_thread = nregs;
1549 targ_fns->max_threads_per_block = mthrs;
1550
1551 targ_tbl->start = (uintptr_t) targ_fns;
1552 targ_tbl->end = targ_tbl->start + 1;
1553 }
1554
1555 for (j = 0; j < var_entries; j++, targ_tbl++)
1556 {
1557 CUdeviceptr var;
1558 size_t bytes;
1559
1560 CUDA_CALL_ERET (-1, cuModuleGetGlobal,
1561 &var, &bytes, module, var_names[j]);
1562
1563 targ_tbl->start = (uintptr_t) var;
1564 targ_tbl->end = targ_tbl->start + bytes;
1565 }
1566
1567 CUdeviceptr varptr;
1568 size_t varsize;
1569 CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &varptr, &varsize,
1570 module, XSTRING (GOMP_ADDITIONAL_ICVS));
1571
1572 if (r == CUDA_SUCCESS)
1573 {
1574 targ_tbl->start = (uintptr_t) varptr;
1575 targ_tbl->end = (uintptr_t) (varptr + varsize);
1576 }
1577 else
1578 /* The variable was not in this image. */
1579 targ_tbl->start = targ_tbl->end = 0;
1580
1581 if (rev_fn_table && fn_entries == 0)
1582 *rev_fn_table = NULL;
1583 else if (rev_fn_table)
1584 {
1585 CUdeviceptr var;
1586 size_t bytes;
1587 unsigned int i;
1588 r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &var, &bytes, module,
1589 "$offload_func_table");
1590 if (r != CUDA_SUCCESS)
1591 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1592 assert (bytes == sizeof (uint64_t) * fn_entries);
1593 *rev_fn_table = GOMP_PLUGIN_malloc (sizeof (uint64_t) * fn_entries);
1594 r = CUDA_CALL_NOCHECK (cuMemcpyDtoH, *rev_fn_table, var, bytes);
1595 if (r != CUDA_SUCCESS)
1596 GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
1597 /* Free if only NULL entries. */
1598 for (i = 0; i < fn_entries; ++i)
1599 if ((*rev_fn_table)[i] != 0)
1600 break;
1601 if (i == fn_entries)
1602 {
1603 free (*rev_fn_table);
1604 *rev_fn_table = NULL;
1605 }
1606 }
1607
1608 if (rev_fn_table && *rev_fn_table && dev->rev_data == NULL)
1609 {
1610 /* Get the on-device GOMP_REV_OFFLOAD_VAR variable.  It should be
1611 available, but it might not be.  One reason could be: if the user code
1612 has 'omp target device(ancestor:1)' in pure hostcode, GOMP_target_ext
1613 is not called on the device and, hence, it and GOMP_REV_OFFLOAD_VAR
1614 are not linked in. */
1615 CUdeviceptr device_rev_offload_var;
1616 size_t device_rev_offload_size;
1617 CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal,
1618 &device_rev_offload_var,
1619 &device_rev_offload_size, module,
1620 XSTRING (GOMP_REV_OFFLOAD_VAR));
1621 if (r != CUDA_SUCCESS)
1622 {
1623 free (*rev_fn_table);
1624 *rev_fn_table = NULL;
1625 }
1626 else
1627 {
1628 /* cuMemHostAlloc memory is accessible on the device, if
1629 unified-shared address is supported; this is assumed - see comment
1630 in nvptx_open_device for CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING. */
1631 CUDA_CALL_ASSERT (cuMemHostAlloc, (void **) &dev->rev_data,
1632 sizeof (*dev->rev_data), CU_MEMHOSTALLOC_DEVICEMAP);
1633 CUdeviceptr dp = (CUdeviceptr) dev->rev_data;
1634 r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, device_rev_offload_var, &dp,
1635 sizeof (dp));
1636 if (r != CUDA_SUCCESS)
1637 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
1638 }
1639 }
1640
1641 nvptx_set_clocktick (module, dev);
1642
1643 if (!nvptx_do_global_cdtors (module, dev, "__do_global_ctors__entry"))
1644 return -1;
1645
1646 return fn_entries + var_entries + other_entries;
1647 }
1648
1649 /* Unload the program described by TARGET_DATA from device ORD, freeing the
1650 function descriptors allocated by GOMP_OFFLOAD_load_image.  */
1651
1652 bool
1653 GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
1654 {
1655 struct ptx_image_data *image, **prev_p;
1656 struct ptx_device *dev = ptx_devices[ord];
1657
1658 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1659 {
1660 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1661 " (expected %u, received %u)",
1662 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1663 return false;
1664 }
1665
1666 bool ret = true;
1667 pthread_mutex_lock (&dev->image_lock);
1668 for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
1669 if (image->target_data == target_data)
1670 {
1671 if (!nvptx_do_global_cdtors (image->module, dev,
1672 "__do_global_dtors__entry"))
1673 ret = false;
1674
1675 *prev_p = image->next;
1676 if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
1677 ret = false;
1678 free (image->fns);
1679 free (image);
1680 break;
1681 }
1682 pthread_mutex_unlock (&dev->image_lock);
1683 return ret;
1684 }
1685
1686 static void *
1687 GOMP_OFFLOAD_alloc_1 (int ord, size_t size, bool usm)
1688 {
1689 if (!nvptx_attach_host_thread_to_device (ord))
1690 return NULL;
1691
1692 struct ptx_device *ptx_dev = ptx_devices[ord];
1693 struct ptx_free_block *blocks, *tmp;
1694
1695 pthread_mutex_lock (&ptx_dev->free_blocks_lock);
1696 blocks = ptx_dev->free_blocks;
1697 ptx_dev->free_blocks = NULL;
1698 pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
1699
1700 while (blocks)
1701 {
1702 tmp = blocks->next;
1703 nvptx_free (blocks->ptr, ptx_dev);
1704 free (blocks);
1705 blocks = tmp;
1706 }
1707
1708 void *d = nvptx_alloc (size, true, usm);
1709 if (d)
1710 return d;
1711 else
1712 {
1713 /* Memory allocation failed. Try freeing the stacks block, and
1714 retrying. */
1715 nvptx_stacks_free (ptx_dev, true);
1716 return nvptx_alloc (size, false, usm);
1717 }
1718 }
1719
1720 void *
1721 GOMP_OFFLOAD_alloc (int ord, size_t size)
1722 {
1723 return GOMP_OFFLOAD_alloc_1 (ord, size, false);
1724 }
1725
1726 void *
1727 GOMP_OFFLOAD_usm_alloc (int ord, size_t size)
1728 {
1729 return GOMP_OFFLOAD_alloc_1 (ord, size, true);
1730 }
1731
1732 bool
1733 GOMP_OFFLOAD_free (int ord, void *ptr)
1734 {
1735 return (nvptx_attach_host_thread_to_device (ord)
1736 && nvptx_free (ptr, ptx_devices[ord]));
1737 }
1738
1739 bool
1740 GOMP_OFFLOAD_usm_free (int ord, void *ptr)
1741 {
1742 return GOMP_OFFLOAD_free (ord, ptr);
1743 }
1744
1745 bool
1746 GOMP_OFFLOAD_is_usm_ptr (void *ptr)
1747 {
1748 bool managed = false;
1749 /* cuPointerGetAttribute has three possible outcomes here:
1750 CUDA_ERROR_INVALID_VALUE - not a CUDA-allocated pointer;
1751 CUDA_SUCCESS, managed:false - CUDA-allocated, but not USM;
1752 CUDA_SUCCESS, managed:true - USM.  */
1753 CUDA_CALL_NOCHECK (cuPointerGetAttribute, &managed,
1754 CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)ptr);
1755 return managed;
1756 }
1757
1758
1759 bool
1760 GOMP_OFFLOAD_page_locked_host_alloc (void **ptr, size_t size)
1761 {
1762 GOMP_PLUGIN_debug (0, "nvptx %s: ptr=%p, size=%llu\n",
1763 __FUNCTION__, ptr, (unsigned long long) size);
1764
1765 /* TODO: Maybe running the deferred 'cuMemFreeHost's here is not the best
1766 idea, given that we don't know what context we're called from? (See
1767 'GOMP_OFFLOAD_run' reverse offload handling.) But, where to do it? */
1768 if (!nvptx_run_deferred_page_locked_host_free ())
1769 return false;
1770
1771 CUresult r;
1772
1773 unsigned int flags = 0;
1774 /* Given 'CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING', we don't need
1775 'flags |= CU_MEMHOSTALLOC_PORTABLE;' here. */
1776 r = CUDA_CALL_NOCHECK (cuMemHostAlloc, ptr, size, flags);
1777 if (r == CUDA_ERROR_OUT_OF_MEMORY)
1778 *ptr = NULL;
1779 else if (r != CUDA_SUCCESS)
1780 {
1781 GOMP_PLUGIN_error ("cuMemHostAlloc error: %s", cuda_error (r));
1782 return false;
1783 }
1784 GOMP_PLUGIN_debug (0, " -> *ptr=%p\n",
1785 *ptr);
1786 return true;
1787 }
1788
1789 static void
1790 nvptx_page_locked_host_free_callback (CUstream stream, CUresult r, void *ptr)
1791 {
1792 GOMP_PLUGIN_debug (0, "%s: stream=%p, r=%u, ptr=%p\n",
1793 __FUNCTION__, stream, (unsigned) r, ptr);
1794
1795 if (r != CUDA_SUCCESS)
1796 GOMP_PLUGIN_error ("%s error: %s", __FUNCTION__, cuda_error (r));
1797
1798 /* We can't now call 'cuMemFreeHost': we're in a CUDA stream context,
1799 where we "must not make any CUDA API calls".
1800 And, in particular in an OpenMP 'target' reverse offload context,
1801 this may even dead-lock?! */
1802 /* See 'nvptx_free'. */
1803 struct ptx_free_block *n
1804 = GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block));
1805 GOMP_PLUGIN_debug (0, " defer; n=%p\n", n);
1806 n->ptr = ptr;
1807 pthread_mutex_lock (&free_host_blocks_lock);
1808 n->next = free_host_blocks;
1809 free_host_blocks = n;
1810 pthread_mutex_unlock (&free_host_blocks_lock);
1811 }
1812
1813 bool
1814 GOMP_OFFLOAD_page_locked_host_free (void *ptr, struct goacc_asyncqueue *aq)
1815 {
1816 GOMP_PLUGIN_debug (0, "nvptx %s: ptr=%p, aq=%p\n",
1817 __FUNCTION__, ptr, aq);
1818
1819 if (aq)
1820 {
1821 GOMP_PLUGIN_debug (0, " aq <-"
1822 " nvptx_page_locked_host_free_callback(ptr)\n");
1823 CUDA_CALL (cuStreamAddCallback, aq->cuda_stream,
1824 nvptx_page_locked_host_free_callback, ptr, 0);
1825 }
1826 else
1827 CUDA_CALL (cuMemFreeHost, ptr);
1828 return true;
1829 }
1830
1831 static int
1832 nvptx_page_locked_host_p (const void *ptr, size_t size)
1833 {
1834 GOMP_PLUGIN_debug (0, "%s: ptr=%p, size=%llu\n",
1835 __FUNCTION__, ptr, (unsigned long long) size);
1836
1837 int ret;
1838
1839 CUresult r;
1840
1841 /* Apparently, there exists no CUDA call to query 'PTR + [0, SIZE)'. Instead
1842 of invoking 'cuMemHostGetFlags' SIZE times, we deem it sufficient to only
1843 query the base PTR. */
1844 unsigned int flags;
1845 void *ptr_noconst = (void *) ptr;
1846 r = CUDA_CALL_NOCHECK (cuMemHostGetFlags, &flags, ptr_noconst);
1847 (void) flags;
1848 if (r == CUDA_SUCCESS)
1849 ret = 1;
1850 else if (r == CUDA_ERROR_INVALID_VALUE)
1851 ret = 0;
1852 else
1853 {
1854 GOMP_PLUGIN_error ("cuMemHostGetFlags error: %s", cuda_error (r));
1855 ret = -1;
1856 }
1857 GOMP_PLUGIN_debug (0, " -> %d (with r = %u)\n",
1858 ret, (unsigned) r);
1859 return ret;
1860 }
1861
1862 int
1863 GOMP_OFFLOAD_page_locked_host_register (int ord,
1864 void *ptr, size_t size, int kind)
1865 {
1866 bool try_read_only;
1867 /* Magic number: if the actual mapping kind is unknown... */
1868 if (kind == -1)
1869 /* ..., allow for trying read-only registration here. */
1870 try_read_only = true;
1871 else
1872 try_read_only = !GOMP_MAP_COPY_FROM_P (kind);
1873 GOMP_PLUGIN_debug (0, "nvptx %s: ord=%d, ptr=%p, size=%llu,"
1874 " kind=%d (try_read_only=%d)\n",
1875 __FUNCTION__, ord, ptr, (unsigned long long) size,
1876 kind, try_read_only);
1877 assert (size != 0);
1878
1879 if (!nvptx_attach_host_thread_to_device (ord))
1880 return -1;
1881 struct ptx_device *ptx_dev = ptx_devices[ord];
1882
1883 int ret = -1;
1884
1885 CUresult r;
1886
1887 unsigned int flags = 0;
1888 /* Given 'CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING', we don't need
1889 'flags |= CU_MEMHOSTREGISTER_PORTABLE;' here. */
1890 cuMemHostRegister:
1891 if (CUDA_CALL_EXISTS (cuMemHostRegister_v2))
1892 r = CUDA_CALL_NOCHECK (cuMemHostRegister_v2, ptr, size, flags);
1893 else
1894 r = CUDA_CALL_NOCHECK (cuMemHostRegister, ptr, size, flags);
1895 if (r == CUDA_SUCCESS)
1896 ret = 1;
1897 else if (r == CUDA_ERROR_INVALID_VALUE)
1898 {
1899       /* For 'cuMemHostAlloc' (via user code, for example) followed by
1900	  'cuMemHostRegister' (via 'always_pinned_mode', for example), we don't
1901	  get 'CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED' but
1902	  'CUDA_ERROR_INVALID_VALUE'. */
1903 if (nvptx_page_locked_host_p (ptr, size))
1904 /* Accept the case that the region already is page-locked. */
1905 ret = 0;
1906 /* Depending on certain circumstances (see 'cuMemHostRegister'
1907 documentation), for example, 'const' data that is placed in section
1908 '.rodata' may need 'flags |= CU_MEMHOSTREGISTER_READ_ONLY;', to avoid
1909 'CUDA_ERROR_INVALID_VALUE'. If running into that, we now apply/re-try
1910 lazily instead of actively setting it above, to avoid the following
1911 problem. Supposedly/observably (but, not documented), if part of a
1912 memory page has been registered without 'CU_MEMHOSTREGISTER_READ_ONLY'
1913 and we then try to register another part with
1914 'CU_MEMHOSTREGISTER_READ_ONLY', we'll get 'CUDA_ERROR_INVALID_VALUE'.
1915 In that case, we can solve the issue by re-trying with
1916 'CU_MEMHOSTREGISTER_READ_ONLY' masked out. However, if part of a
1917 memory page has been registered with 'CU_MEMHOSTREGISTER_READ_ONLY'
1918 and we then try to register another part without
1919 'CU_MEMHOSTREGISTER_READ_ONLY', that latter part apparently inherits
1920 the former's 'CU_MEMHOSTREGISTER_READ_ONLY' (and any device to host
1921 copy then fails). We can't easily resolve that situation
1922 retroactively, that is, we can't easily re-register the first
1923 'CU_MEMHOSTREGISTER_READ_ONLY' part without that flag. */
1924 else if (!(flags & CU_MEMHOSTREGISTER_READ_ONLY)
1925 && try_read_only
1926 && ptx_dev->read_only_host_register_supported)
1927 {
1928 GOMP_PLUGIN_debug (0, " flags |= CU_MEMHOSTREGISTER_READ_ONLY;\n");
1929 flags |= CU_MEMHOSTREGISTER_READ_ONLY;
1930 goto cuMemHostRegister;
1931 }
1932 /* We ought to use 'CU_MEMHOSTREGISTER_READ_ONLY', but it's not
1933 available. */
1934 else if (try_read_only
1935 && !ptx_dev->read_only_host_register_supported)
1936 {
1937 assert (!(flags & CU_MEMHOSTREGISTER_READ_ONLY));
1938 GOMP_PLUGIN_debug (0, " punt;"
1939 " CU_MEMHOSTREGISTER_READ_ONLY not available\n");
1940	  /* Accept this (legacy) case; we can't (easily) register this
1941	     region of host memory as page-locked. */
1942 ret = 0;
1943 }
1944 }
1945 else if (r == CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED)
1946 {
1947 /* 'cuMemHostRegister' (via the user code, for example) followed by
1948 another (potentially partially overlapping) 'cuMemHostRegister'
1949 (via 'always_pinned_mode', for example). */
1950 /* Accept this case in good faith; do not verify further. */
1951 ret = 0;
1952 }
1953 if (ret == -1)
1954 GOMP_PLUGIN_error ("cuMemHostRegister error: %s", cuda_error (r));
1955 GOMP_PLUGIN_debug (0, " -> %d (with r = %u)\n",
1956 ret, (unsigned) r);
1957 return ret;
1958 }
1959
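/* CUDA stream callback used by 'GOMP_OFFLOAD_page_locked_host_unregister':
   as no CUDA API calls may be made from within a stream callback, the actual
   'cuMemHostUnregister' of PTR (passed in B_ together with AQ) is deferred by
   queuing it onto AQ's 'page_locked_host_unregister_blocks'.  */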
1960 static void
1961 nvptx_page_locked_host_unregister_callback (CUstream stream, CUresult r,
1962 void *b_)
1963 {
1964 void **b = b_;
1965 struct goacc_asyncqueue *aq = b[0];
1966 void *ptr = b[1];
1967 GOMP_PLUGIN_debug (0, "%s: stream=%p, r=%u, b_=%p (aq=%p, ptr=%p)\n",
1968 __FUNCTION__, stream, (unsigned) r, b_, aq, ptr);
1969
1970 free (b_);
1971
1972 if (r != CUDA_SUCCESS)
1973 GOMP_PLUGIN_error ("%s error: %s", __FUNCTION__, cuda_error (r));
1974
1975 /* We can't now call 'cuMemHostUnregister': we're in a CUDA stream context,
1976 where we "must not make any CUDA API calls". */
1977 /* See 'nvptx_free'. */
1978 struct ptx_free_block *n
1979 = GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block));
1980 GOMP_PLUGIN_debug (0, " defer; n=%p\n", n);
1981 n->ptr = ptr;
1982 pthread_mutex_lock (&aq->page_locked_host_unregister_blocks_lock);
1983 n->next = aq->page_locked_host_unregister_blocks;
1984 aq->page_locked_host_unregister_blocks = n;
1985 pthread_mutex_unlock (&aq->page_locked_host_unregister_blocks_lock);
1986 }
1987
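/* Unregister the page-locked host memory region 'PTR + [0, SIZE)'.  With a
   non-NULL asyncqueue AQ, the unregistration is deferred via a stream
   callback, as operations still in flight on AQ may benefit from the
   registration.  */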
1988 bool
1989 GOMP_OFFLOAD_page_locked_host_unregister (void *ptr, size_t size,
1990 struct goacc_asyncqueue *aq)
1991 {
1992 GOMP_PLUGIN_debug (0, "nvptx %s: ptr=%p, size=%llu, aq=%p\n",
1993 __FUNCTION__, ptr, (unsigned long long) size, aq);
1994 assert (size != 0);
1995
1996 if (aq)
1997 {
1998 /* We don't unregister right away, as in-flight operations may still
1999 benefit from the registration. */
2000 void **b = GOMP_PLUGIN_malloc (2 * sizeof (*b));
2001 b[0] = aq;
2002 b[1] = ptr;
2003 GOMP_PLUGIN_debug (0, " aq <-"
2004 " nvptx_page_locked_host_unregister_callback(b=%p)\n",
2005 b);
2006 CUDA_CALL (cuStreamAddCallback, aq->cuda_stream,
2007 nvptx_page_locked_host_unregister_callback, b, 0);
2008 }
2009 else
2010 CUDA_CALL (cuMemHostUnregister, ptr);
2011 return true;
2012 }
2013
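/* Return whether the host memory region 'PTR + [0, SIZE)' is page-locked,
   from the perspective of device ORD: 1 if so, 0 if not, -1 upon error.  */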
2014 int
2015 GOMP_OFFLOAD_page_locked_host_p (int ord, const void *ptr, size_t size)
2016 {
2017 GOMP_PLUGIN_debug (0, "nvptx %s: ord=%d, ptr=%p, size=%llu\n",
2018 __FUNCTION__, ord, ptr, (unsigned long long) size);
2019
2020 if (!nvptx_attach_host_thread_to_device (ord))
2021 return -1;
2022
2023 return nvptx_page_locked_host_p (ptr, size);
2024 }
2025
2026
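/* Execute an OpenACC offloaded region: launch FN with launch geometry DIMS,
   passing DEVADDRS as the kernel argument, and wait for completion.  */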
2027 void
2028 GOMP_OFFLOAD_openacc_exec (void (*fn) (void *),
2029 size_t mapnum __attribute__((unused)),
2030 void **hostaddrs __attribute__((unused)),
2031 void **devaddrs,
2032 unsigned *dims, void *targ_mem_desc)
2033 {
2034 GOMP_PLUGIN_debug (0, "nvptx %s\n", __FUNCTION__);
2035
2036 CUdeviceptr dp = (CUdeviceptr) devaddrs;
2037 nvptx_exec (fn, dims, targ_mem_desc, dp, NULL);
2038
2039 CUresult r = CUDA_CALL_NOCHECK (cuStreamSynchronize, NULL);
2040 const char *maybe_abort_msg = "(perhaps abort was called)";
2041 if (r == CUDA_ERROR_LAUNCH_FAILED)
2042 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
2043 maybe_abort_msg);
2044 else if (r != CUDA_SUCCESS)
2045 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
2046 }
2047
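/* Like 'GOMP_OFFLOAD_openacc_exec', but enqueue the launch on AQ's CUDA
   stream and don't wait for completion.  */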
2048 void
2049 GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *),
2050 size_t mapnum __attribute__((unused)),
2051 void **hostaddrs __attribute__((unused)),
2052 void **devaddrs,
2053 unsigned *dims, void *targ_mem_desc,
2054 struct goacc_asyncqueue *aq)
2055 {
2056 GOMP_PLUGIN_debug (0, "nvptx %s\n", __FUNCTION__);
2057
2058 CUdeviceptr dp = (CUdeviceptr) devaddrs;
2059 nvptx_exec (fn, dims, targ_mem_desc, dp, aq->cuda_stream);
2060 }
2061
2062 void *
2063 GOMP_OFFLOAD_openacc_create_thread_data (int ord)
2064 {
2065 struct ptx_device *ptx_dev;
2066 struct nvptx_thread *nvthd
2067 = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
2068 CUcontext thd_ctx;
2069
2070 ptx_dev = ptx_devices[ord];
2071
2072 assert (ptx_dev);
2073
2074 CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);
2075
2076 assert (ptx_dev->ctx);
2077
2078 if (!thd_ctx)
2079 CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);
2080
2081 nvthd->ptx_dev = ptx_dev;
2082
2083 return (void *) nvthd;
2084 }
2085
2086 void
2087 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
2088 {
2089 free (data);
2090 }
2091
2092 void *
2093 GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
2094 {
2095 return nvptx_get_current_cuda_device ();
2096 }
2097
2098 void *
2099 GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
2100 {
2101 return nvptx_get_current_cuda_context ();
2102 }
2103
2104 /* This returns a CUstream. */
2105 void *
2106 GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *aq)
2107 {
2108 return (void *) aq->cuda_stream;
2109 }
2110
2111 /* This takes a CUstream. */
2112 int
2113 GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream)
2114 {
2115 if (aq->cuda_stream)
2116 {
2117 CUDA_CALL_ASSERT (cuStreamSynchronize, aq->cuda_stream);
2118 CUDA_CALL_ASSERT (cuStreamDestroy, aq->cuda_stream);
2119 }
2120
2121 aq->cuda_stream = (CUstream) stream;
2122 return 1;
2123 }
2124
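/* Construct a 'goacc_asyncqueue' around a new CUDA stream created with
   FLAGS.  Return NULL upon error.  */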
2125 static struct goacc_asyncqueue *
2126 nvptx_goacc_asyncqueue_construct (unsigned int flags)
2127 {
2128 GOMP_PLUGIN_debug (0, "%s: flags=%u\n",
2129 __FUNCTION__, flags);
2130
2131 CUstream stream = NULL;
2132 CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, flags);
2133
2134 struct goacc_asyncqueue *aq
2135 = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue));
2136 aq->cuda_stream = stream;
2137 pthread_mutex_init (&aq->page_locked_host_unregister_blocks_lock, NULL);
2138 aq->page_locked_host_unregister_blocks = NULL;
2139 GOMP_PLUGIN_debug (0, " -> aq=%p (with cuda_stream=%p)\n",
2140 aq, aq->cuda_stream);
2141 return aq;
2142 }
2143
2144 struct goacc_asyncqueue *
2145 GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
2146 {
2147 return nvptx_goacc_asyncqueue_construct (CU_STREAM_DEFAULT);
2148 }
2149
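/* Destruct the asyncqueue AQ: destroy its CUDA stream and free AQ itself.
   It's an error if deferred page-locked host unregistrations are still
   pending at this point.  */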
2150 static bool
2151 nvptx_goacc_asyncqueue_destruct (struct goacc_asyncqueue *aq)
2152 {
2153 GOMP_PLUGIN_debug (0, "nvptx %s: aq=%p\n",
2154 __FUNCTION__, aq);
2155
2156 CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream);
2157
2158 bool ret = true;
2159 pthread_mutex_lock (&aq->page_locked_host_unregister_blocks_lock);
2160 if (aq->page_locked_host_unregister_blocks != NULL)
2161 {
2162 GOMP_PLUGIN_error ("aq->page_locked_host_unregister_blocks not empty");
2163 ret = false;
2164 }
2165 pthread_mutex_unlock (&aq->page_locked_host_unregister_blocks_lock);
2166 pthread_mutex_destroy (&aq->page_locked_host_unregister_blocks_lock);
2167
2168 free (aq);
2169
2170 return ret;
2171 }
2172
2173 bool
2174 GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
2175 {
2176 return nvptx_goacc_asyncqueue_destruct (aq);
2177 }
2178
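/* Run the 'cuMemHostUnregister' calls deferred on AQ (see
   'nvptx_page_locked_host_unregister_callback').  Return false upon error.  */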
2179 static bool
2180 nvptx_run_deferred_page_locked_host_unregister (struct goacc_asyncqueue *aq)
2181 {
2182 GOMP_PLUGIN_debug (0, "%s: aq=%p\n",
2183 __FUNCTION__, aq);
2184
2185 bool ret = true;
2186 pthread_mutex_lock (&aq->page_locked_host_unregister_blocks_lock);
2187 for (struct ptx_free_block *b = aq->page_locked_host_unregister_blocks; b;)
2188 {
2189 GOMP_PLUGIN_debug (0, " b=%p: cuMemHostUnregister(b->ptr=%p)\n",
2190 b, b->ptr);
2191
2192 struct ptx_free_block *b_next = b->next;
2193 CUresult r = CUDA_CALL_NOCHECK (cuMemHostUnregister, b->ptr);
2194 if (r != CUDA_SUCCESS)
2195 {
2196 GOMP_PLUGIN_error ("cuMemHostUnregister error: %s", cuda_error (r));
2197 ret = false;
2198 }
2199 free (b);
2200 b = b_next;
2201 }
2202 aq->page_locked_host_unregister_blocks = NULL;
2203 pthread_mutex_unlock (&aq->page_locked_host_unregister_blocks_lock);
2204 return ret;
2205 }
2206
2207 int
2208 GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
2209 {
2210 GOMP_PLUGIN_debug (0, "nvptx %s: aq=%p\n",
2211 __FUNCTION__, aq);
2212
2213 CUresult r = CUDA_CALL_NOCHECK (cuStreamQuery, aq->cuda_stream);
2214 if (r == CUDA_SUCCESS)
2215 {
2216 /* As a user may expect that they don't need to 'wait' if
2217 'acc_async_test' returns 'true', clean up here, too. */
2218 if (!nvptx_run_deferred_page_locked_host_unregister (aq))
2219 return -1;
2220
2221 return 1;
2222 }
2223 if (r == CUDA_ERROR_NOT_READY)
2224 return 0;
2225
2226 GOMP_PLUGIN_error ("cuStreamQuery error: %s", cuda_error (r));
2227 return -1;
2228 }
2229
2230 static bool
2231 nvptx_goacc_asyncqueue_synchronize (struct goacc_asyncqueue *aq)
2232 {
2233 GOMP_PLUGIN_debug (0, "%s: aq=%p\n",
2234 __FUNCTION__, aq);
2235
2236 CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream);
2237
2238 /* This is called from a user code (non-stream) context, and upon returning,
2239      we must have given up any page-locked memory registrations, so unregister
2240 any pending ones now. */
2241 if (!nvptx_run_deferred_page_locked_host_unregister (aq))
2242 return false;
2243
2244 return true;
2245 }
2246
2247 bool
2248 GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
2249 {
2250 return nvptx_goacc_asyncqueue_synchronize (aq);
2251 }
2252
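/* CUDA stream callback used by 'GOMP_OFFLOAD_openacc_async_serialize': move
   any deferred page-locked host unregistrations from asyncqueue AQ1 to AQ2
   (both passed in B_).  */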
2253 static void
2254 nvptx_move_page_locked_host_unregister_blocks_aq1_aq2_callback
2255 (CUstream stream, CUresult r, void *b_)
2256 {
2257 void **b = b_;
2258 struct goacc_asyncqueue *aq1 = b[0];
2259 struct goacc_asyncqueue *aq2 = b[1];
2260 GOMP_PLUGIN_debug (0, "%s: stream=%p, r=%u, b_=%p (aq1=%p, aq2=%p)\n",
2261 __FUNCTION__, stream, (unsigned) r, b_, aq1, aq2);
2262
2263 free (b_);
2264
2265 if (r != CUDA_SUCCESS)
2266 GOMP_PLUGIN_error ("%s error: %s", __FUNCTION__, cuda_error (r));
2267
2268 pthread_mutex_lock (&aq1->page_locked_host_unregister_blocks_lock);
2269 if (aq1->page_locked_host_unregister_blocks)
2270 {
2271 pthread_mutex_lock (&aq2->page_locked_host_unregister_blocks_lock);
2272 GOMP_PLUGIN_debug (0, " page_locked_host_unregister_blocks:"
2273 " aq1 -> aq2\n");
2274 if (aq2->page_locked_host_unregister_blocks == NULL)
2275 aq2->page_locked_host_unregister_blocks
2276 = aq1->page_locked_host_unregister_blocks;
2277 else
2278 {
2279 struct ptx_free_block *b = aq2->page_locked_host_unregister_blocks;
2280 while (b->next != NULL)
2281 b = b->next;
2282 b->next = aq1->page_locked_host_unregister_blocks;
2283 }
2284 pthread_mutex_unlock (&aq2->page_locked_host_unregister_blocks_lock);
2285 aq1->page_locked_host_unregister_blocks = NULL;
2286 }
2287 pthread_mutex_unlock (&aq1->page_locked_host_unregister_blocks_lock);
2288 }
2289
2290 bool
2291 GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
2292 struct goacc_asyncqueue *aq2)
2293 {
2294 GOMP_PLUGIN_debug (0, "nvptx %s: aq1=%p, aq2=%p\n",
2295 __FUNCTION__, aq1, aq2);
2296
2297 if (aq1 != aq2)
2298 {
2299 void **b = GOMP_PLUGIN_malloc (2 * sizeof (*b));
2300 b[0] = aq1;
2301 b[1] = aq2;
2302 /* Enqueue on 'aq1': move 'page_locked_host_unregister_blocks' of 'aq1'
2303 to 'aq2'. */
2304 GOMP_PLUGIN_debug (0, " aq1 <-"
2305 " nvptx_move_page_locked_host_unregister_blocks_aq1_aq2_callback"
2306 "(b=%p)\n", b);
2307 CUDA_CALL (cuStreamAddCallback, aq1->cuda_stream,
2308 nvptx_move_page_locked_host_unregister_blocks_aq1_aq2_callback,
2309 b, 0);
2310 }
2311
2312 CUevent e;
2313 CUDA_CALL_ERET (false, cuEventCreate, &e, CU_EVENT_DISABLE_TIMING);
2314 CUDA_CALL_ERET (false, cuEventRecord, e, aq1->cuda_stream);
2315 CUDA_CALL_ERET (false, cuStreamWaitEvent, aq2->cuda_stream, e, 0);
2316
2317 return true;
2318 }
2319
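/* CUDA stream callback wrapper: abort upon error, otherwise invoke the user
   callback recorded in PTR (a 'struct nvptx_callback') and free it.  */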
2320 static void
2321 cuda_callback_wrapper (CUstream stream, CUresult res, void *ptr)
2322 {
2323 if (res != CUDA_SUCCESS)
2324 GOMP_PLUGIN_fatal ("%s error: %s", __FUNCTION__, cuda_error (res));
2325 struct nvptx_callback *cb = (struct nvptx_callback *) ptr;
2326 cb->fn (cb->ptr);
2327 free (ptr);
2328 }
2329
2330 void
2331 GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
2332 void (*callback_fn)(void *),
2333 void *userptr)
2334 {
2335 struct nvptx_callback *b = GOMP_PLUGIN_malloc (sizeof (*b));
2336 b->fn = callback_fn;
2337 b->ptr = userptr;
2338 b->aq = aq;
2339 CUDA_CALL_ASSERT (cuStreamAddCallback, aq->cuda_stream,
2340 cuda_callback_wrapper, (void *) b, 0);
2341 }
2342
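/* Sanity-check a host/device memory copy of S bytes between host address H
   and device address D before doing it; return false (after reporting an
   error) for obviously invalid arguments.  */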
2343 static bool
2344 cuda_memcpy_sanity_check (const void *h, const void *d, size_t s)
2345 {
2346 CUdeviceptr pb;
2347 size_t ps;
2348 if (!s)
2349 return true;
2350 if (!d)
2351 {
2352 GOMP_PLUGIN_error ("invalid device address");
2353 return false;
2354 }
2355 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
2356 if (!pb)
2357 {
2358 GOMP_PLUGIN_error ("invalid device address");
2359 return false;
2360 }
2361 if (!h)
2362 {
2363 GOMP_PLUGIN_error ("invalid host address");
2364 return false;
2365 }
2366 if (d == h)
2367 {
2368 GOMP_PLUGIN_error ("invalid host or device address");
2369 return false;
2370 }
2371 if ((void *)(d + s) > (void *)(pb + ps))
2372 {
2373 GOMP_PLUGIN_error ("invalid size");
2374 return false;
2375 }
2376 return true;
2377 }
2378
2379 bool
2380 GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
2381 {
2382 if (!nvptx_attach_host_thread_to_device (ord)
2383 || !cuda_memcpy_sanity_check (src, dst, n))
2384 return false;
2385 CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) dst, src, n);
2386 return true;
2387 }
2388
2389 bool
2390 GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
2391 {
2392 if (!nvptx_attach_host_thread_to_device (ord)
2393 || !cuda_memcpy_sanity_check (dst, src, n))
2394 return false;
2395 CUDA_CALL (cuMemcpyDtoH, dst, (CUdeviceptr) src, n);
2396 return true;
2397 }
2398
2399 bool
2400 GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
2401 {
2402 CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, NULL);
2403 return true;
2404 }
2405
2406 bool
2407 GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src,
2408 size_t n, struct goacc_asyncqueue *aq)
2409 {
2410 if (!nvptx_attach_host_thread_to_device (ord)
2411 || !cuda_memcpy_sanity_check (src, dst, n))
2412 return false;
2413 CUDA_CALL (cuMemcpyHtoDAsync, (CUdeviceptr) dst, src, n, aq->cuda_stream);
2414 return true;
2415 }
2416
2417 bool
2418 GOMP_OFFLOAD_openacc_async_dev2host (int ord, void *dst, const void *src,
2419 size_t n, struct goacc_asyncqueue *aq)
2420 {
2421 if (!nvptx_attach_host_thread_to_device (ord)
2422 || !cuda_memcpy_sanity_check (dst, src, n))
2423 return false;
2424 CUDA_CALL (cuMemcpyDtoHAsync, dst, (CUdeviceptr) src, n, aq->cuda_stream);
2425 return true;
2426 }
2427
2428 union goacc_property_value
2429 GOMP_OFFLOAD_openacc_get_property (int n, enum goacc_property prop)
2430 {
2431 union goacc_property_value propval = { .val = 0 };
2432
2433 pthread_mutex_lock (&ptx_dev_lock);
2434
2435 if (n >= nvptx_get_num_devices () || n < 0 || ptx_devices[n] == NULL)
2436 {
2437 pthread_mutex_unlock (&ptx_dev_lock);
2438 return propval;
2439 }
2440
2441 struct ptx_device *ptx_dev = ptx_devices[n];
2442 switch (prop)
2443 {
2444 case GOACC_PROPERTY_MEMORY:
2445 {
2446 size_t total_mem;
2447
2448 CUDA_CALL_ERET (propval, cuDeviceTotalMem, &total_mem, ptx_dev->dev);
2449 propval.val = total_mem;
2450 }
2451 break;
2452 case GOACC_PROPERTY_FREE_MEMORY:
2453 {
2454 size_t total_mem;
2455 size_t free_mem;
2456 CUdevice ctxdev;
2457
2458 CUDA_CALL_ERET (propval, cuCtxGetDevice, &ctxdev);
2459 if (ptx_dev->dev == ctxdev)
2460 CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
2461 else if (ptx_dev->ctx)
2462 {
2463 CUcontext old_ctx;
2464
2465 CUDA_CALL_ERET (propval, cuCtxPushCurrent, ptx_dev->ctx);
2466 CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
2467 CUDA_CALL_ASSERT (cuCtxPopCurrent, &old_ctx);
2468 }
2469 else
2470 {
2471 CUcontext new_ctx;
2472
2473 CUDA_CALL_ERET (propval, cuCtxCreate, &new_ctx, CU_CTX_SCHED_AUTO,
2474 ptx_dev->dev);
2475 CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
2476 CUDA_CALL_ASSERT (cuCtxDestroy, new_ctx);
2477 }
2478 propval.val = free_mem;
2479 }
2480 break;
2481 case GOACC_PROPERTY_NAME:
2482 propval.ptr = ptx_dev->name;
2483 break;
2484 case GOACC_PROPERTY_VENDOR:
2485 propval.ptr = "Nvidia";
2486 break;
2487 case GOACC_PROPERTY_DRIVER:
2488 propval.ptr = cuda_driver_version_s;
2489 break;
2490 default:
2491 break;
2492 }
2493
2494 pthread_mutex_unlock (&ptx_dev_lock);
2495 return propval;
2496 }
2497
2498 /* Adjust launch dimensions: pick good values for number of blocks and warps
2499    and ensure that the number of warps does not exceed CUDA limits or GCC's
2500 own limits. */
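/* For illustration, with hypothetical numbers: a kernel with
   'regs_per_thread == 32' run with '*threads_p == 8' warps gives
   'regs_per_block == 32 * 32 * 8 == 8192', so a device with
   'regs_per_sm == 65536' and 'num_sms == 80' yields
   'max_blocks == 65536 / 8192 * 80 == 640'.  */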
2501
2502 static void
2503 nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
2504 struct ptx_device *ptx_dev,
2505 int *teams_p, int *threads_p)
2506 {
2507 int max_warps_block = fn->max_threads_per_block / 32;
2508   /* A maximum of 32 warps per block is an implementation limit in the NVPTX
2509      backend and libgcc; it matches the documented limit of all GPUs as of 2015. */
2510 if (max_warps_block > 32)
2511 max_warps_block = 32;
2512 if (*threads_p <= 0)
2513 *threads_p = 8;
2514 if (*threads_p > max_warps_block)
2515 *threads_p = max_warps_block;
2516
2517 int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
2518 /* This is an estimate of how many blocks the device can host simultaneously.
2519      The actual limit, which may be lower, can be queried with the "occupancy
2520      control" driver interface (available since CUDA 6.0). */
2521 int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
2522 if (*teams_p <= 0 || *teams_p > max_blocks)
2523 *teams_p = max_blocks;
2524 }
2525
2526 /* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
2527 target regions. */
2528
2529 static size_t
2530 nvptx_stacks_size ()
2531 {
2532 return 128 * 1024;
2533 }
2534
2535 /* Return contiguous storage for NUM stacks, each SIZE bytes. The lock for
2536 the storage should be held on entry, and remains held on exit. */
2537
2538 static void *
2539 nvptx_stacks_acquire (struct ptx_device *ptx_dev, size_t size, int num)
2540 {
2541 if (ptx_dev->omp_stacks.ptr && ptx_dev->omp_stacks.size >= size * num)
2542 return (void *) ptx_dev->omp_stacks.ptr;
2543
2544 /* Free the old, too-small stacks. */
2545 if (ptx_dev->omp_stacks.ptr)
2546 {
2547 CUresult r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
2548 if (r != CUDA_SUCCESS)
2549 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s\n", cuda_error (r));
2550 r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
2551 if (r != CUDA_SUCCESS)
2552 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
2553 }
2554
2555 /* Make new and bigger stacks, and remember where we put them and how big
2556 they are. */
2557 CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &ptx_dev->omp_stacks.ptr,
2558 size * num);
2559 if (r != CUDA_SUCCESS)
2560 GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
2561
2562 ptx_dev->omp_stacks.size = size * num;
2563
2564 return (void *) ptx_dev->omp_stacks.ptr;
2565 }
2566
2567
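/* Run the offloaded OpenMP target region TGT_FN on device ORD with argument
   block TGT_VARS, processing the launch parameters in ARGS.  If reverse
   offload is enabled, poll 'ptx_dev->rev_data' during kernel execution and
   service host-side calls via 'GOMP_PLUGIN_target_rev'.  */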
2568 void
2569 GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
2570 {
2571 struct targ_fn_descriptor *tgt_fn_desc
2572 = (struct targ_fn_descriptor *) tgt_fn;
2573 CUfunction function = tgt_fn_desc->fn;
2574 const struct targ_fn_launch *launch = tgt_fn_desc->launch;
2575 const char *fn_name = launch->fn;
2576 CUresult r;
2577 struct ptx_device *ptx_dev = ptx_devices[ord];
2578 const char *maybe_abort_msg = "(perhaps abort was called)";
2579 int teams = 0, threads = 0;
2580
2581 if (!args)
2582 GOMP_PLUGIN_fatal ("No target arguments provided");
2583 while (*args)
2584 {
2585 intptr_t id = (intptr_t) *args++, val;
2586 if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
2587 val = (intptr_t) *args++;
2588 else
2589 val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
2590 if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
2591 continue;
2592 val = val > INT_MAX ? INT_MAX : val;
2593 id &= GOMP_TARGET_ARG_ID_MASK;
2594 if (id == GOMP_TARGET_ARG_NUM_TEAMS)
2595 teams = val;
2596 else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
2597 threads = val;
2598 }
2599 nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
2600
2601 bool reverse_offload = ptx_dev->rev_data != NULL;
2602 struct goacc_asyncqueue *reverse_offload_aq = NULL;
2603 if (reverse_offload)
2604 {
2605 reverse_offload_aq
2606 = nvptx_goacc_asyncqueue_construct (CU_STREAM_NON_BLOCKING);
2607 if (!reverse_offload_aq)
2608 exit (EXIT_FAILURE);
2609 }
2610
2611 size_t stack_size = nvptx_stacks_size ();
2612
2613 pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
2614 void *stacks = nvptx_stacks_acquire (ptx_dev, stack_size, teams * threads);
2615 void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
2616 size_t fn_args_size = sizeof fn_args;
2617 void *config[] = {
2618 CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
2619 CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
2620 CU_LAUNCH_PARAM_END
2621 };
2622 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
2623 " [(teams: %u), 1, 1] [(lanes: 32), (threads: %u), 1]\n",
2624 __FUNCTION__, fn_name, teams, threads);
2625 r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
2626 32, threads, 1, lowlat_pool_size, NULL, NULL, config);
2627 if (r != CUDA_SUCCESS)
2628 GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
2629 if (reverse_offload)
2630 while (true)
2631 {
2632 r = CUDA_CALL_NOCHECK (cuStreamQuery, NULL);
2633 if (r == CUDA_SUCCESS)
2634 break;
2635 if (r == CUDA_ERROR_LAUNCH_FAILED)
2636 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s %s\n", cuda_error (r),
2637 maybe_abort_msg);
2638 else if (r != CUDA_ERROR_NOT_READY)
2639 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
2640
2641 if (__atomic_load_n (&ptx_dev->rev_data->fn, __ATOMIC_ACQUIRE) != 0)
2642 {
2643 struct rev_offload *rev_data = ptx_dev->rev_data;
2644 GOMP_PLUGIN_target_rev (rev_data->fn, rev_data->mapnum,
2645 rev_data->addrs, rev_data->sizes,
2646 rev_data->kinds, rev_data->dev_num,
2647 reverse_offload_aq);
2648 if (!nvptx_goacc_asyncqueue_synchronize (reverse_offload_aq))
2649 exit (EXIT_FAILURE);
2650 __atomic_store_n (&rev_data->fn, 0, __ATOMIC_RELEASE);
2651
2652 /* Clean up here; otherwise we may run into the situation that
2653 a following reverse offload does
2654 'GOMP_OFFLOAD_page_locked_host_alloc', and that then runs the
2655 deferred 'cuMemFreeHost's -- which may dead-lock?!
2656		 TODO: This may need more consideration for the case that
2657 different host threads do reverse offload? We could move
2658 'free_host_blocks' into 'aq' (which is separate per reverse
2659 offload) instead of global, like
2660 'page_locked_host_unregister_blocks', but that doesn't seem the
2661 right thing for OpenACC 'async' generally? */
2662 if (!nvptx_run_deferred_page_locked_host_free ())
2663 exit (EXIT_FAILURE);
2664 }
2665 usleep (1);
2666 }
2667 else
2668 r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
2669 if (r == CUDA_ERROR_LAUNCH_FAILED)
2670 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
2671 maybe_abort_msg);
2672 else if (r != CUDA_SUCCESS)
2673 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
2674
2675 pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
2676
2677 if (reverse_offload)
2678 {
2679 if (!nvptx_goacc_asyncqueue_destruct (reverse_offload_aq))
2680 exit (EXIT_FAILURE);
2681 }
2682 }
2683
2684 /* TODO: Implement GOMP_OFFLOAD_async_run. */
2685
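/* Match the requested ISA against the device's compute capability: accept
   "sm_<major><minor>" if the device provides at least that capability.  For
   example, 'CHECK_ISA (7, 0);' roughly expands to:

     if (((device->compute_major == 7 && device->compute_minor >= 0)
	  || device->compute_major > 7)
	 && strcmp (isa, "sm_70") == 0)
       return true;  */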
2686 #define CHECK_ISA(major, minor) \
2687 if (((device->compute_major == major && device->compute_minor >= minor) \
2688 || device->compute_major > major) \
2689 && strcmp (isa, "sm_"#major#minor) == 0) \
2690 return true
2691
2692 bool
2693 GOMP_OFFLOAD_evaluate_device (int device_num, const char *kind,
2694 const char *arch, const char *isa)
2695 {
2696 if (kind && strcmp (kind, "gpu") != 0)
2697 return false;
2698 if (arch && strcmp (arch, "nvptx") != 0)
2699 return false;
2700 if (!isa)
2701 return true;
2702
2703 struct ptx_device *device = ptx_devices[device_num];
2704
2705 CHECK_ISA (3, 0);
2706 CHECK_ISA (3, 5);
2707 CHECK_ISA (3, 7);
2708 CHECK_ISA (5, 0);
2709 CHECK_ISA (5, 2);
2710 CHECK_ISA (5, 3);
2711 CHECK_ISA (6, 0);
2712 CHECK_ISA (6, 1);
2713 CHECK_ISA (6, 2);
2714 CHECK_ISA (7, 0);
2715 CHECK_ISA (7, 2);
2716 CHECK_ISA (7, 5);
2717 CHECK_ISA (8, 0);
2718 CHECK_ISA (8, 6);
2719
2720 return false;
2721 }