1 /* Plugin for NVPTX execution.
3 Copyright (C) 2013-2020 Free Software Foundation, Inc.
5 Contributed by Mentor Embedded.
7 This file is part of the GNU Offloading and Multi Processing Library
Libgomp is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
more details.
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
29 /* Nvidia PTX-specific parts of OpenACC support. The cuda driver
30 library appears to hold some implicit state, but the documentation
31 is not clear as to what that state might be. Or how one might
32 propagate it from one thread to another. */
37 #include "libgomp-plugin.h"
38 #include "oacc-plugin.h"
39 #include "gomp-constants.h"
52 #if CUDA_VERSION < 6000
53 extern CUresult
cuGetErrorString (CUresult
, const char **);
54 #define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
57 #if CUDA_VERSION >= 6050
60 CUresult
cuLinkAddData (CUlinkState
, CUjitInputType
, void *, size_t,
61 const char *, unsigned, CUjit_option
*, void **);
62 CUresult
cuLinkCreate (unsigned, CUjit_option
*, void **, CUlinkState
*);
64 typedef size_t (*CUoccupancyB2DSize
)(int);
65 CUresult
cuLinkAddData_v2 (CUlinkState
, CUjitInputType
, void *, size_t,
66 const char *, unsigned, CUjit_option
*, void **);
67 CUresult
cuLinkCreate_v2 (unsigned, CUjit_option
*, void **, CUlinkState
*);
68 CUresult
cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction
,
69 CUoccupancyB2DSize
, size_t, int);
72 #define DO_PRAGMA(x) _Pragma (#x)
74 #if PLUGIN_NVPTX_DYNAMIC
79 # define CUDA_ONE_CALL(call) \
80 __typeof (call) *call;
81 # define CUDA_ONE_CALL_MAYBE_NULL(call) \
83 #include "cuda-lib.def"
85 # undef CUDA_ONE_CALL_MAYBE_NULL
89 /* -1 if init_cuda_lib has not been called yet, false
90 if it has been and failed, true if it has been and succeeded. */
91 static signed char cuda_lib_inited
= -1;
93 /* Dynamically load the CUDA runtime library and initialize function
94 pointers, return false if unsuccessful, true if successful. */
98 if (cuda_lib_inited
!= -1)
99 return cuda_lib_inited
;
100 const char *cuda_runtime_lib
= "libcuda.so.1";
101 void *h
= dlopen (cuda_runtime_lib
, RTLD_LAZY
);
102 cuda_lib_inited
= false;
106 # define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false)
107 # define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true)
108 # define CUDA_ONE_CALL_1(call, allow_null) \
109 cuda_lib.call = dlsym (h, #call); \
110 if (!allow_null && cuda_lib.call == NULL) \
112 #include "cuda-lib.def"
113 # undef CUDA_ONE_CALL
114 # undef CUDA_ONE_CALL_1
115 # undef CUDA_ONE_CALL_MAYBE_NULL
117 cuda_lib_inited
= true;
120 # define CUDA_CALL_PREFIX cuda_lib.
123 # define CUDA_ONE_CALL(call)
124 # define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call)
125 #include "cuda-lib.def"
126 #undef CUDA_ONE_CALL_MAYBE_NULL
129 # define CUDA_CALL_PREFIX
130 # define init_cuda_lib() true
133 #include "secure_getenv.h"
137 #define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
138 #define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
140 /* Convenience macros for the frequently used CUDA library call and
141 error handling sequence as well as CUDA library calls that
142 do the error checking themselves or don't do it at all. */
144 #define CUDA_CALL_ERET(ERET, FN, ...) \
147 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
148 if (__r != CUDA_SUCCESS) \
150 GOMP_PLUGIN_error (#FN " error: %s", \
156 #define CUDA_CALL(FN, ...) \
157 CUDA_CALL_ERET (false, FN, __VA_ARGS__)
159 #define CUDA_CALL_ASSERT(FN, ...) \
162 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
163 if (__r != CUDA_SUCCESS) \
165 GOMP_PLUGIN_fatal (#FN " error: %s", \
170 #define CUDA_CALL_NOCHECK(FN, ...) \
171 CUDA_CALL_PREFIX FN (__VA_ARGS__)
173 #define CUDA_CALL_EXISTS(FN) \
177 cuda_error (CUresult r
)
179 const char *fallback
= "unknown cuda error";
182 if (!CUDA_CALL_EXISTS (cuGetErrorString
))
185 r
= CUDA_CALL_NOCHECK (cuGetErrorString
, r
, &desc
);
186 if (r
== CUDA_SUCCESS
)
192 /* Version of the CUDA Toolkit in the same MAJOR.MINOR format that is used by
193 Nvidia, such as in the 'deviceQuery' program (Nvidia's CUDA samples). */
194 static char cuda_driver_version_s
[30];
196 static unsigned int instantiated_devices
= 0;
197 static pthread_mutex_t ptx_dev_lock
= PTHREAD_MUTEX_INITIALIZER
;
199 /* NVPTX/CUDA specific definition of asynchronous queues. */
200 struct goacc_asyncqueue
202 CUstream cuda_stream
;
205 struct nvptx_callback
209 struct goacc_asyncqueue
*aq
;
210 struct nvptx_callback
*next
;
213 /* Thread-specific data for PTX. */
217 /* We currently have this embedded inside the plugin because libgomp manages
218 devices through integer target_ids. This might be better if using an
219 opaque target-specific pointer directly from gomp_device_descr. */
220 struct ptx_device
*ptx_dev
;
223 /* Target data function launch information. */
225 struct targ_fn_launch
228 unsigned short dim
[GOMP_DIM_MAX
];
231 /* Target PTX object information. */
239 /* Target data image information. */
241 typedef struct nvptx_tdata
243 const struct targ_ptx_obj
*ptx_objs
;
246 const char *const *var_names
;
249 const struct targ_fn_launch
*fn_descs
;
253 /* Descriptor of a loaded function. */
255 struct targ_fn_descriptor
258 const struct targ_fn_launch
*launch
;
260 int max_threads_per_block
;
263 /* A loaded PTX image. */
264 struct ptx_image_data
266 const void *target_data
;
269 struct targ_fn_descriptor
*fns
; /* Array of functions. */
271 struct ptx_image_data
*next
;
274 struct ptx_free_block
277 struct ptx_free_block
*next
;
297 int max_threads_per_block
;
298 int max_threads_per_multiprocessor
;
299 int default_dims
[GOMP_DIM_MAX
];
301 /* Length as used by the CUDA Runtime API ('struct cudaDeviceProp'). */
304 struct ptx_image_data
*images
; /* Images loaded on device. */
305 pthread_mutex_t image_lock
; /* Lock for above list. */
307 struct ptx_free_block
*free_blocks
;
308 pthread_mutex_t free_blocks_lock
;
310 struct ptx_device
*next
;
313 static struct ptx_device
**ptx_devices
;
/* Return the per-thread nvptx state kept by the OpenACC runtime for the
   current host thread (may be NULL outside an OpenACC context).  */

static inline struct nvptx_thread *
nvptx_thread (void)
{
  return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
}
321 /* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK
322 should be locked on entry and remains locked on exit. */
329 if (instantiated_devices
!= 0)
332 if (!init_cuda_lib ())
335 CUDA_CALL (cuInit
, 0);
337 int cuda_driver_version
;
338 CUDA_CALL_ERET (NULL
, cuDriverGetVersion
, &cuda_driver_version
);
339 snprintf (cuda_driver_version_s
, sizeof cuda_driver_version_s
,
341 cuda_driver_version
/ 1000, cuda_driver_version
% 1000 / 10);
343 CUDA_CALL (cuDeviceGetCount
, &ndevs
);
344 ptx_devices
= GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device
*)
350 /* Select the N'th PTX device for the current host thread. The device must
351 have been previously opened before calling this function. */
354 nvptx_attach_host_thread_to_device (int n
)
358 struct ptx_device
*ptx_dev
;
361 r
= CUDA_CALL_NOCHECK (cuCtxGetDevice
, &dev
);
362 if (r
== CUDA_ERROR_NOT_PERMITTED
)
364 /* Assume we're in a CUDA callback, just return true. */
367 if (r
!= CUDA_SUCCESS
&& r
!= CUDA_ERROR_INVALID_CONTEXT
)
369 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r
));
373 if (r
!= CUDA_ERROR_INVALID_CONTEXT
&& dev
== n
)
379 ptx_dev
= ptx_devices
[n
];
382 GOMP_PLUGIN_error ("device %d not found", n
);
386 CUDA_CALL (cuCtxGetCurrent
, &thd_ctx
);
388 /* We don't necessarily have a current context (e.g. if it has been
389 destroyed. Pop it if we do though. */
391 CUDA_CALL (cuCtxPopCurrent
, &old_ctx
);
393 CUDA_CALL (cuCtxPushCurrent
, ptx_dev
->ctx
);
398 static struct ptx_device
*
399 nvptx_open_device (int n
)
401 struct ptx_device
*ptx_dev
;
402 CUdevice dev
, ctx_dev
;
404 int async_engines
, pi
;
406 CUDA_CALL_ERET (NULL
, cuDeviceGet
, &dev
, n
);
408 ptx_dev
= GOMP_PLUGIN_malloc (sizeof (struct ptx_device
));
412 ptx_dev
->ctx_shared
= false;
414 r
= CUDA_CALL_NOCHECK (cuCtxGetDevice
, &ctx_dev
);
415 if (r
!= CUDA_SUCCESS
&& r
!= CUDA_ERROR_INVALID_CONTEXT
)
417 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r
));
421 if (r
!= CUDA_ERROR_INVALID_CONTEXT
&& ctx_dev
!= dev
)
423 /* The current host thread has an active context for a different device.
426 CUDA_CALL_ERET (NULL
, cuCtxPopCurrent
, &old_ctx
);
429 CUDA_CALL_ERET (NULL
, cuCtxGetCurrent
, &ptx_dev
->ctx
);
432 CUDA_CALL_ERET (NULL
, cuCtxCreate
, &ptx_dev
->ctx
, CU_CTX_SCHED_AUTO
, dev
);
434 ptx_dev
->ctx_shared
= true;
436 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
437 &pi
, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP
, dev
);
438 ptx_dev
->overlap
= pi
;
440 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
441 &pi
, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY
, dev
);
444 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
445 &pi
, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS
, dev
);
446 ptx_dev
->concur
= pi
;
448 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
449 &pi
, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE
, dev
);
452 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
453 &pi
, CU_DEVICE_ATTRIBUTE_INTEGRATED
, dev
);
456 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
457 &pi
, CU_DEVICE_ATTRIBUTE_CLOCK_RATE
, dev
);
458 ptx_dev
->clock_khz
= pi
;
460 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
461 &pi
, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT
, dev
);
462 ptx_dev
->num_sms
= pi
;
464 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
465 &pi
, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK
, dev
);
466 ptx_dev
->regs_per_block
= pi
;
468 /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only
469 in CUDA 6.0 and newer. */
470 r
= CUDA_CALL_NOCHECK (cuDeviceGetAttribute
, &pi
,
471 CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR
,
473 /* Fallback: use limit of registers per block, which is usually equal. */
474 if (r
== CUDA_ERROR_INVALID_VALUE
)
475 pi
= ptx_dev
->regs_per_block
;
476 else if (r
!= CUDA_SUCCESS
)
478 GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r
));
481 ptx_dev
->regs_per_sm
= pi
;
483 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
484 &pi
, CU_DEVICE_ATTRIBUTE_WARP_SIZE
, dev
);
487 GOMP_PLUGIN_error ("Only warp size 32 is supported");
490 ptx_dev
->warp_size
= pi
;
492 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
, &pi
,
493 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK
, dev
);
494 ptx_dev
->max_threads_per_block
= pi
;
496 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
, &pi
,
497 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR
, dev
);
498 ptx_dev
->max_threads_per_multiprocessor
= pi
;
500 r
= CUDA_CALL_NOCHECK (cuDeviceGetAttribute
, &async_engines
,
501 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT
, dev
);
502 if (r
!= CUDA_SUCCESS
)
505 for (int i
= 0; i
!= GOMP_DIM_MAX
; i
++)
506 ptx_dev
->default_dims
[i
] = 0;
508 CUDA_CALL_ERET (NULL
, cuDeviceGetName
, ptx_dev
->name
, sizeof ptx_dev
->name
,
511 ptx_dev
->images
= NULL
;
512 pthread_mutex_init (&ptx_dev
->image_lock
, NULL
);
514 ptx_dev
->free_blocks
= NULL
;
515 pthread_mutex_init (&ptx_dev
->free_blocks_lock
, NULL
);
521 nvptx_close_device (struct ptx_device
*ptx_dev
)
526 for (struct ptx_free_block
*b
= ptx_dev
->free_blocks
; b
;)
528 struct ptx_free_block
*b_next
= b
->next
;
529 CUDA_CALL (cuMemFree
, (CUdeviceptr
) b
->ptr
);
534 pthread_mutex_destroy (&ptx_dev
->free_blocks_lock
);
535 pthread_mutex_destroy (&ptx_dev
->image_lock
);
537 if (!ptx_dev
->ctx_shared
)
538 CUDA_CALL (cuCtxDestroy
, ptx_dev
->ctx
);
545 nvptx_get_num_devices (void)
549 /* PR libgomp/65099: Currently, we only support offloading in 64-bit
551 if (sizeof (void *) != 8)
553 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading;"
554 " only 64-bit configurations are supported\n");
558 /* This function will be called before the plugin has been initialized in
559 order to enumerate available devices, but CUDA API routines can't be used
560 until cuInit has been called. Just call it now (but don't yet do any
561 further initialization). */
562 if (instantiated_devices
== 0)
564 if (!init_cuda_lib ())
566 CUresult r
= CUDA_CALL_NOCHECK (cuInit
, 0);
567 /* This is not an error: e.g. we may have CUDA libraries installed but
568 no devices available. */
569 if (r
!= CUDA_SUCCESS
)
571 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
577 CUDA_CALL_ERET (-1, cuDeviceGetCount
, &n
);
582 notify_var (const char *var_name
, const char *env_var
)
585 GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name
);
587 GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name
, env_var
);
591 process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o
)
593 const char *var_name
= "GOMP_NVPTX_JIT";
594 const char *env_var
= secure_getenv (var_name
);
595 notify_var (var_name
, env_var
);
600 const char *c
= env_var
;
606 if (c
[0] == '-' && c
[1] == 'O'
607 && '0' <= c
[2] && c
[2] <= '4'
608 && (c
[3] == '\0' || c
[3] == ' '))
610 *gomp_nvptx_o
= c
[2] - '0';
615 GOMP_PLUGIN_error ("Error parsing %s", var_name
);
621 link_ptx (CUmodule
*module
, const struct targ_ptx_obj
*ptx_objs
,
624 CUjit_option opts
[7];
629 CUlinkState linkstate
;
632 size_t linkoutsize
__attribute__ ((unused
));
634 opts
[0] = CU_JIT_WALL_TIME
;
635 optvals
[0] = &elapsed
;
637 opts
[1] = CU_JIT_INFO_LOG_BUFFER
;
638 optvals
[1] = &ilog
[0];
640 opts
[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES
;
641 optvals
[2] = (void *) sizeof ilog
;
643 opts
[3] = CU_JIT_ERROR_LOG_BUFFER
;
644 optvals
[3] = &elog
[0];
646 opts
[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES
;
647 optvals
[4] = (void *) sizeof elog
;
649 opts
[5] = CU_JIT_LOG_VERBOSE
;
650 optvals
[5] = (void *) 1;
652 static intptr_t gomp_nvptx_o
= -1;
654 static bool init_done
= false;
657 process_GOMP_NVPTX_JIT (&gomp_nvptx_o
);
662 if (gomp_nvptx_o
!= -1)
664 opts
[nopts
] = CU_JIT_OPTIMIZATION_LEVEL
;
665 optvals
[nopts
] = (void *) gomp_nvptx_o
;
669 if (CUDA_CALL_EXISTS (cuLinkCreate_v2
))
670 CUDA_CALL (cuLinkCreate_v2
, nopts
, opts
, optvals
, &linkstate
);
672 CUDA_CALL (cuLinkCreate
, nopts
, opts
, optvals
, &linkstate
);
674 for (; num_objs
--; ptx_objs
++)
676 /* cuLinkAddData's 'data' argument erroneously omits the const
678 GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs
->code
);
679 if (CUDA_CALL_EXISTS (cuLinkAddData_v2
))
680 r
= CUDA_CALL_NOCHECK (cuLinkAddData_v2
, linkstate
, CU_JIT_INPUT_PTX
,
681 (char *) ptx_objs
->code
, ptx_objs
->size
,
684 r
= CUDA_CALL_NOCHECK (cuLinkAddData
, linkstate
, CU_JIT_INPUT_PTX
,
685 (char *) ptx_objs
->code
, ptx_objs
->size
,
687 if (r
!= CUDA_SUCCESS
)
689 GOMP_PLUGIN_error ("Link error log %s\n", &elog
[0]);
690 GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
696 GOMP_PLUGIN_debug (0, "Linking\n");
697 r
= CUDA_CALL_NOCHECK (cuLinkComplete
, linkstate
, &linkout
, &linkoutsize
);
699 GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed
);
700 GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog
[0]);
702 if (r
!= CUDA_SUCCESS
)
704 GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r
));
708 CUDA_CALL (cuModuleLoadData
, module
, linkout
);
709 CUDA_CALL (cuLinkDestroy
, linkstate
);
714 nvptx_exec (void (*fn
), size_t mapnum
, void **hostaddrs
, void **devaddrs
,
715 unsigned *dims
, void *targ_mem_desc
,
716 CUdeviceptr dp
, CUstream stream
)
718 struct targ_fn_descriptor
*targ_fn
= (struct targ_fn_descriptor
*) fn
;
722 struct nvptx_thread
*nvthd
= nvptx_thread ();
723 int warp_size
= nvthd
->ptx_dev
->warp_size
;
725 function
= targ_fn
->fn
;
727 /* Initialize the launch dimensions. Typically this is constant,
728 provided by the device compiler, but we must permit runtime
731 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
733 if (targ_fn
->launch
->dim
[i
])
734 dims
[i
] = targ_fn
->launch
->dim
[i
];
741 pthread_mutex_lock (&ptx_dev_lock
);
743 static int gomp_openacc_dims
[GOMP_DIM_MAX
];
744 if (!gomp_openacc_dims
[0])
746 /* See if the user provided GOMP_OPENACC_DIM environment
747 variable to specify runtime defaults. */
748 for (int i
= 0; i
< GOMP_DIM_MAX
; ++i
)
749 gomp_openacc_dims
[i
] = GOMP_PLUGIN_acc_default_dim (i
);
752 if (!nvthd
->ptx_dev
->default_dims
[0])
754 int default_dims
[GOMP_DIM_MAX
];
755 for (int i
= 0; i
< GOMP_DIM_MAX
; ++i
)
756 default_dims
[i
] = gomp_openacc_dims
[i
];
758 int gang
, worker
, vector
;
760 int block_size
= nvthd
->ptx_dev
->max_threads_per_block
;
761 int cpu_size
= nvthd
->ptx_dev
->max_threads_per_multiprocessor
;
762 int dev_size
= nvthd
->ptx_dev
->num_sms
;
763 GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
764 " dev_size=%d, cpu_size=%d\n",
765 warp_size
, block_size
, dev_size
, cpu_size
);
767 gang
= (cpu_size
/ block_size
) * dev_size
;
768 worker
= block_size
/ warp_size
;
772 /* There is no upper bound on the gang size. The best size
773 matches the hardware configuration. Logical gangs are
774 scheduled onto physical hardware. To maximize usage, we
775 should guess a large number. */
776 if (default_dims
[GOMP_DIM_GANG
] < 1)
777 default_dims
[GOMP_DIM_GANG
] = gang
? gang
: 1024;
778 /* The worker size must not exceed the hardware. */
779 if (default_dims
[GOMP_DIM_WORKER
] < 1
780 || (default_dims
[GOMP_DIM_WORKER
] > worker
&& gang
))
781 default_dims
[GOMP_DIM_WORKER
] = worker
;
782 /* The vector size must exactly match the hardware. */
783 if (default_dims
[GOMP_DIM_VECTOR
] < 1
784 || (default_dims
[GOMP_DIM_VECTOR
] != vector
&& gang
))
785 default_dims
[GOMP_DIM_VECTOR
] = vector
;
787 GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
788 default_dims
[GOMP_DIM_GANG
],
789 default_dims
[GOMP_DIM_WORKER
],
790 default_dims
[GOMP_DIM_VECTOR
]);
792 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
793 nvthd
->ptx_dev
->default_dims
[i
] = default_dims
[i
];
795 pthread_mutex_unlock (&ptx_dev_lock
);
798 bool default_dim_p
[GOMP_DIM_MAX
];
799 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
800 default_dim_p
[i
] = !dims
[i
];
802 if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize
))
804 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
805 if (default_dim_p
[i
])
806 dims
[i
] = nvthd
->ptx_dev
->default_dims
[i
];
808 if (default_dim_p
[GOMP_DIM_VECTOR
])
809 dims
[GOMP_DIM_VECTOR
]
810 = MIN (dims
[GOMP_DIM_VECTOR
],
811 (targ_fn
->max_threads_per_block
/ warp_size
814 if (default_dim_p
[GOMP_DIM_WORKER
])
815 dims
[GOMP_DIM_WORKER
]
816 = MIN (dims
[GOMP_DIM_WORKER
],
817 targ_fn
->max_threads_per_block
/ dims
[GOMP_DIM_VECTOR
]);
821 /* Handle the case that the compiler allows the runtime to choose
822 the vector-length conservatively, by ignoring
823 gomp_openacc_dims[GOMP_DIM_VECTOR]. TODO: actually handle
826 /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that that
827 gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
828 exceed targ_fn->max_threads_per_block. */
829 int workers
= gomp_openacc_dims
[GOMP_DIM_WORKER
];
830 int gangs
= gomp_openacc_dims
[GOMP_DIM_GANG
];
833 CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize
, &grids
,
834 &blocks
, function
, NULL
, 0,
835 dims
[GOMP_DIM_WORKER
] * dims
[GOMP_DIM_VECTOR
]);
836 GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
837 "grid = %d, block = %d\n", grids
, blocks
);
839 /* Keep the num_gangs proportional to the block size. In
840 the case were a block size is limited by shared-memory
841 or the register file capacity, the runtime will not
842 excessively over assign gangs to the multiprocessor
843 units if their state is going to be swapped out even
844 more than necessary. The constant factor 2 is there to
845 prevent threads from idling when there is insufficient
848 gangs
= 2 * grids
* (blocks
/ warp_size
);
855 int actual_vectors
= (default_dim_p
[GOMP_DIM_VECTOR
]
857 : dims
[GOMP_DIM_VECTOR
]);
858 workers
= blocks
/ actual_vectors
;
859 workers
= MAX (workers
, 1);
860 /* If we need a per-worker barrier ... . */
861 if (actual_vectors
> 32)
862 /* Don't use more barriers than available. */
863 workers
= MIN (workers
, 15);
866 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
867 if (default_dim_p
[i
])
870 case GOMP_DIM_GANG
: dims
[i
] = gangs
; break;
871 case GOMP_DIM_WORKER
: dims
[i
] = workers
; break;
872 case GOMP_DIM_VECTOR
: dims
[i
] = vectors
; break;
873 default: GOMP_PLUGIN_fatal ("invalid dim");
879 /* Check if the accelerator has sufficient hardware resources to
880 launch the offloaded kernel. */
881 if (dims
[GOMP_DIM_WORKER
] * dims
[GOMP_DIM_VECTOR
]
882 > targ_fn
->max_threads_per_block
)
885 = ("The Nvidia accelerator has insufficient resources to launch '%s'"
886 " with num_workers = %d and vector_length = %d"
888 "recompile the program with 'num_workers = x and vector_length = y'"
889 " on that offloaded region or '-fopenacc-dim=:x:y' where"
892 GOMP_PLUGIN_fatal (msg
, targ_fn
->launch
->fn
, dims
[GOMP_DIM_WORKER
],
893 dims
[GOMP_DIM_VECTOR
], targ_fn
->max_threads_per_block
);
896 /* Check if the accelerator has sufficient barrier resources to
897 launch the offloaded kernel. */
898 if (dims
[GOMP_DIM_WORKER
] > 15 && dims
[GOMP_DIM_VECTOR
] > 32)
901 = ("The Nvidia accelerator has insufficient barrier resources to launch"
902 " '%s' with num_workers = %d and vector_length = %d"
904 "recompile the program with 'num_workers = x' on that offloaded"
905 " region or '-fopenacc-dim=:x:' where x <= 15"
907 "or, recompile the program with 'vector_length = 32' on that"
908 " offloaded region or '-fopenacc-dim=::32'"
910 GOMP_PLUGIN_fatal (msg
, targ_fn
->launch
->fn
, dims
[GOMP_DIM_WORKER
],
911 dims
[GOMP_DIM_VECTOR
]);
914 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
915 " gangs=%u, workers=%u, vectors=%u\n",
916 __FUNCTION__
, targ_fn
->launch
->fn
, dims
[GOMP_DIM_GANG
],
917 dims
[GOMP_DIM_WORKER
], dims
[GOMP_DIM_VECTOR
]);
921 // num_gangs nctaid.x
922 // num_workers ntid.y
923 // vector length ntid.x
925 struct goacc_thread
*thr
= GOMP_PLUGIN_goacc_thread ();
926 acc_prof_info
*prof_info
= thr
->prof_info
;
927 acc_event_info enqueue_launch_event_info
;
928 acc_api_info
*api_info
= thr
->api_info
;
929 bool profiling_p
= __builtin_expect (prof_info
!= NULL
, false);
932 prof_info
->event_type
= acc_ev_enqueue_launch_start
;
934 enqueue_launch_event_info
.launch_event
.event_type
935 = prof_info
->event_type
;
936 enqueue_launch_event_info
.launch_event
.valid_bytes
937 = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES
;
938 enqueue_launch_event_info
.launch_event
.parent_construct
939 = acc_construct_parallel
;
940 enqueue_launch_event_info
.launch_event
.implicit
= 1;
941 enqueue_launch_event_info
.launch_event
.tool_info
= NULL
;
942 enqueue_launch_event_info
.launch_event
.kernel_name
= targ_fn
->launch
->fn
;
943 enqueue_launch_event_info
.launch_event
.num_gangs
944 = dims
[GOMP_DIM_GANG
];
945 enqueue_launch_event_info
.launch_event
.num_workers
946 = dims
[GOMP_DIM_WORKER
];
947 enqueue_launch_event_info
.launch_event
.vector_length
948 = dims
[GOMP_DIM_VECTOR
];
950 api_info
->device_api
= acc_device_api_cuda
;
952 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
, &enqueue_launch_event_info
,
957 CUDA_CALL_ASSERT (cuLaunchKernel
, function
,
958 dims
[GOMP_DIM_GANG
], 1, 1,
959 dims
[GOMP_DIM_VECTOR
], dims
[GOMP_DIM_WORKER
], 1,
960 0, stream
, kargs
, 0);
964 prof_info
->event_type
= acc_ev_enqueue_launch_end
;
965 enqueue_launch_event_info
.launch_event
.event_type
966 = prof_info
->event_type
;
967 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
, &enqueue_launch_event_info
,
971 GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__
,
972 targ_fn
->launch
->fn
);
975 void * openacc_get_current_cuda_context (void);
978 goacc_profiling_acc_ev_alloc (struct goacc_thread
*thr
, void *dp
, size_t s
)
980 acc_prof_info
*prof_info
= thr
->prof_info
;
981 acc_event_info data_event_info
;
982 acc_api_info
*api_info
= thr
->api_info
;
984 prof_info
->event_type
= acc_ev_alloc
;
986 data_event_info
.data_event
.event_type
= prof_info
->event_type
;
987 data_event_info
.data_event
.valid_bytes
= _ACC_DATA_EVENT_INFO_VALID_BYTES
;
988 data_event_info
.data_event
.parent_construct
= acc_construct_parallel
;
989 data_event_info
.data_event
.implicit
= 1;
990 data_event_info
.data_event
.tool_info
= NULL
;
991 data_event_info
.data_event
.var_name
= NULL
;
992 data_event_info
.data_event
.bytes
= s
;
993 data_event_info
.data_event
.host_ptr
= NULL
;
994 data_event_info
.data_event
.device_ptr
= dp
;
996 api_info
->device_api
= acc_device_api_cuda
;
998 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
, &data_event_info
, api_info
);
1002 nvptx_alloc (size_t s
)
1006 CUDA_CALL_ERET (NULL
, cuMemAlloc
, &d
, s
);
1007 struct goacc_thread
*thr
= GOMP_PLUGIN_goacc_thread ();
1009 = __builtin_expect (thr
!= NULL
&& thr
->prof_info
!= NULL
, false);
1011 goacc_profiling_acc_ev_alloc (thr
, (void *) d
, s
);
1017 goacc_profiling_acc_ev_free (struct goacc_thread
*thr
, void *p
)
1019 acc_prof_info
*prof_info
= thr
->prof_info
;
1020 acc_event_info data_event_info
;
1021 acc_api_info
*api_info
= thr
->api_info
;
1023 prof_info
->event_type
= acc_ev_free
;
1025 data_event_info
.data_event
.event_type
= prof_info
->event_type
;
1026 data_event_info
.data_event
.valid_bytes
= _ACC_DATA_EVENT_INFO_VALID_BYTES
;
1027 data_event_info
.data_event
.parent_construct
= acc_construct_parallel
;
1028 data_event_info
.data_event
.implicit
= 1;
1029 data_event_info
.data_event
.tool_info
= NULL
;
1030 data_event_info
.data_event
.var_name
= NULL
;
1031 data_event_info
.data_event
.bytes
= -1;
1032 data_event_info
.data_event
.host_ptr
= NULL
;
1033 data_event_info
.data_event
.device_ptr
= p
;
1035 api_info
->device_api
= acc_device_api_cuda
;
1037 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
, &data_event_info
, api_info
);
1041 nvptx_free (void *p
, struct ptx_device
*ptx_dev
)
1043 /* Assume callback context if this is null. */
1044 if (GOMP_PLUGIN_acc_thread () == NULL
)
1046 struct ptx_free_block
*n
1047 = GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block
));
1049 pthread_mutex_lock (&ptx_dev
->free_blocks_lock
);
1050 n
->next
= ptx_dev
->free_blocks
;
1051 ptx_dev
->free_blocks
= n
;
1052 pthread_mutex_unlock (&ptx_dev
->free_blocks_lock
);
1059 CUDA_CALL (cuMemGetAddressRange
, &pb
, &ps
, (CUdeviceptr
) p
);
1060 if ((CUdeviceptr
) p
!= pb
)
1062 GOMP_PLUGIN_error ("invalid device address");
1066 CUDA_CALL (cuMemFree
, (CUdeviceptr
) p
);
1067 struct goacc_thread
*thr
= GOMP_PLUGIN_goacc_thread ();
1069 = __builtin_expect (thr
!= NULL
&& thr
->prof_info
!= NULL
, false);
1071 goacc_profiling_acc_ev_free (thr
, p
);
1077 nvptx_get_current_cuda_device (void)
1079 struct nvptx_thread
*nvthd
= nvptx_thread ();
1081 if (!nvthd
|| !nvthd
->ptx_dev
)
1084 return &nvthd
->ptx_dev
->dev
;
1088 nvptx_get_current_cuda_context (void)
1090 struct nvptx_thread
*nvthd
= nvptx_thread ();
1092 if (!nvthd
|| !nvthd
->ptx_dev
)
1095 return nvthd
->ptx_dev
->ctx
;
1098 /* Plugin entry points. */
/* Plugin entry point: the target name this plugin offloads to.  */
const char *
GOMP_OFFLOAD_get_name (void)
{
  return "nvptx";
}
1107 GOMP_OFFLOAD_get_caps (void)
1109 return GOMP_OFFLOAD_CAP_OPENACC_200
| GOMP_OFFLOAD_CAP_OPENMP_400
;
1113 GOMP_OFFLOAD_get_type (void)
1115 return OFFLOAD_TARGET_TYPE_NVIDIA_PTX
;
/* Plugin entry point: number of available devices.  */
int
GOMP_OFFLOAD_get_num_devices (void)
{
  return nvptx_get_num_devices ();
}
1124 union gomp_device_property_value
1125 GOMP_OFFLOAD_get_property (int n
, int prop
)
1127 union gomp_device_property_value propval
= { .val
= 0 };
1129 pthread_mutex_lock (&ptx_dev_lock
);
1131 if (n
>= nvptx_get_num_devices () || n
< 0 || ptx_devices
[n
] == NULL
)
1133 pthread_mutex_unlock (&ptx_dev_lock
);
1137 struct ptx_device
*ptx_dev
= ptx_devices
[n
];
1140 case GOMP_DEVICE_PROPERTY_MEMORY
:
1144 CUDA_CALL_ERET (propval
, cuDeviceTotalMem
, &total_mem
, ptx_dev
->dev
);
1145 propval
.val
= total_mem
;
1148 case GOMP_DEVICE_PROPERTY_FREE_MEMORY
:
1154 CUDA_CALL_ERET (propval
, cuCtxGetDevice
, &ctxdev
);
1155 if (ptx_dev
->dev
== ctxdev
)
1156 CUDA_CALL_ERET (propval
, cuMemGetInfo
, &free_mem
, &total_mem
);
1157 else if (ptx_dev
->ctx
)
1161 CUDA_CALL_ERET (propval
, cuCtxPushCurrent
, ptx_dev
->ctx
);
1162 CUDA_CALL_ERET (propval
, cuMemGetInfo
, &free_mem
, &total_mem
);
1163 CUDA_CALL_ASSERT (cuCtxPopCurrent
, &old_ctx
);
1169 CUDA_CALL_ERET (propval
, cuCtxCreate
, &new_ctx
, CU_CTX_SCHED_AUTO
,
1171 CUDA_CALL_ERET (propval
, cuMemGetInfo
, &free_mem
, &total_mem
);
1172 CUDA_CALL_ASSERT (cuCtxDestroy
, new_ctx
);
1174 propval
.val
= free_mem
;
1177 case GOMP_DEVICE_PROPERTY_NAME
:
1178 propval
.ptr
= ptx_dev
->name
;
1180 case GOMP_DEVICE_PROPERTY_VENDOR
:
1181 propval
.ptr
= "Nvidia";
1183 case GOMP_DEVICE_PROPERTY_DRIVER
:
1184 propval
.ptr
= cuda_driver_version_s
;
1188 pthread_mutex_unlock (&ptx_dev_lock
);
1193 GOMP_OFFLOAD_init_device (int n
)
1195 struct ptx_device
*dev
;
1197 pthread_mutex_lock (&ptx_dev_lock
);
1199 if (!nvptx_init () || ptx_devices
[n
] != NULL
)
1201 pthread_mutex_unlock (&ptx_dev_lock
);
1205 dev
= nvptx_open_device (n
);
1208 ptx_devices
[n
] = dev
;
1209 instantiated_devices
++;
1212 pthread_mutex_unlock (&ptx_dev_lock
);
1218 GOMP_OFFLOAD_fini_device (int n
)
1220 pthread_mutex_lock (&ptx_dev_lock
);
1222 if (ptx_devices
[n
] != NULL
)
1224 if (!nvptx_attach_host_thread_to_device (n
)
1225 || !nvptx_close_device (ptx_devices
[n
]))
1227 pthread_mutex_unlock (&ptx_dev_lock
);
1230 ptx_devices
[n
] = NULL
;
1231 instantiated_devices
--;
1234 if (instantiated_devices
== 0)
1240 pthread_mutex_unlock (&ptx_dev_lock
);
1244 /* Return the libgomp version number we're compatible with. There is
1245 no requirement for cross-version compatibility. */
1248 GOMP_OFFLOAD_version (void)
1250 return GOMP_VERSION
;
1253 /* Initialize __nvptx_clocktick, if present in MODULE. */
1256 nvptx_set_clocktick (CUmodule module
, struct ptx_device
*dev
)
1259 CUresult r
= CUDA_CALL_NOCHECK (cuModuleGetGlobal
, &dptr
, NULL
,
1260 module
, "__nvptx_clocktick");
1261 if (r
== CUDA_ERROR_NOT_FOUND
)
1263 if (r
!= CUDA_SUCCESS
)
1264 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r
));
1265 double __nvptx_clocktick
= 1e-3 / dev
->clock_khz
;
1266 r
= CUDA_CALL_NOCHECK (cuMemcpyHtoD
, dptr
, &__nvptx_clocktick
,
1267 sizeof (__nvptx_clocktick
));
1268 if (r
!= CUDA_SUCCESS
)
1269 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r
));
1272 /* Load the (partial) program described by TARGET_DATA to device
1273 number ORD. Allocate and return TARGET_TABLE. */
1276 GOMP_OFFLOAD_load_image (int ord
, unsigned version
, const void *target_data
,
1277 struct addr_pair
**target_table
)
1280 const char *const *var_names
;
1281 const struct targ_fn_launch
*fn_descs
;
1282 unsigned int fn_entries
, var_entries
, i
, j
;
1283 struct targ_fn_descriptor
*targ_fns
;
1284 struct addr_pair
*targ_tbl
;
1285 const nvptx_tdata_t
*img_header
= (const nvptx_tdata_t
*) target_data
;
1286 struct ptx_image_data
*new_image
;
1287 struct ptx_device
*dev
;
1289 if (GOMP_VERSION_DEV (version
) > GOMP_VERSION_NVIDIA_PTX
)
1291 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1292 " (expected %u, received %u)",
1293 GOMP_VERSION_NVIDIA_PTX
, GOMP_VERSION_DEV (version
));
1297 if (!nvptx_attach_host_thread_to_device (ord
)
1298 || !link_ptx (&module
, img_header
->ptx_objs
, img_header
->ptx_num
))
1301 dev
= ptx_devices
[ord
];
1303 /* The mkoffload utility emits a struct of pointers/integers at the
1304 start of each offload image. The array of kernel names and the
1305 functions addresses form a one-to-one correspondence. */
1307 var_entries
= img_header
->var_num
;
1308 var_names
= img_header
->var_names
;
1309 fn_entries
= img_header
->fn_num
;
1310 fn_descs
= img_header
->fn_descs
;
1312 targ_tbl
= GOMP_PLUGIN_malloc (sizeof (struct addr_pair
)
1313 * (fn_entries
+ var_entries
));
1314 targ_fns
= GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor
)
1317 *target_table
= targ_tbl
;
1319 new_image
= GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data
));
1320 new_image
->target_data
= target_data
;
1321 new_image
->module
= module
;
1322 new_image
->fns
= targ_fns
;
1324 pthread_mutex_lock (&dev
->image_lock
);
1325 new_image
->next
= dev
->images
;
1326 dev
->images
= new_image
;
1327 pthread_mutex_unlock (&dev
->image_lock
);
1329 for (i
= 0; i
< fn_entries
; i
++, targ_fns
++, targ_tbl
++)
1331 CUfunction function
;
1334 CUDA_CALL_ERET (-1, cuModuleGetFunction
, &function
, module
,
1336 CUDA_CALL_ERET (-1, cuFuncGetAttribute
, &nregs
,
1337 CU_FUNC_ATTRIBUTE_NUM_REGS
, function
);
1338 CUDA_CALL_ERET (-1, cuFuncGetAttribute
, &mthrs
,
1339 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK
, function
);
1341 targ_fns
->fn
= function
;
1342 targ_fns
->launch
= &fn_descs
[i
];
1343 targ_fns
->regs_per_thread
= nregs
;
1344 targ_fns
->max_threads_per_block
= mthrs
;
1346 targ_tbl
->start
= (uintptr_t) targ_fns
;
1347 targ_tbl
->end
= targ_tbl
->start
+ 1;
1350 for (j
= 0; j
< var_entries
; j
++, targ_tbl
++)
1355 CUDA_CALL_ERET (-1, cuModuleGetGlobal
,
1356 &var
, &bytes
, module
, var_names
[j
]);
1358 targ_tbl
->start
= (uintptr_t) var
;
1359 targ_tbl
->end
= targ_tbl
->start
+ bytes
;
1362 nvptx_set_clocktick (module
, dev
);
1364 return fn_entries
+ var_entries
;
1367 /* Unload the program described by TARGET_DATA. DEV_DATA is the
1368 function descriptors allocated by G_O_load_image. */
1371 GOMP_OFFLOAD_unload_image (int ord
, unsigned version
, const void *target_data
)
1373 struct ptx_image_data
*image
, **prev_p
;
1374 struct ptx_device
*dev
= ptx_devices
[ord
];
1376 if (GOMP_VERSION_DEV (version
) > GOMP_VERSION_NVIDIA_PTX
)
1378 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1379 " (expected %u, received %u)",
1380 GOMP_VERSION_NVIDIA_PTX
, GOMP_VERSION_DEV (version
));
1385 pthread_mutex_lock (&dev
->image_lock
);
1386 for (prev_p
= &dev
->images
; (image
= *prev_p
) != 0; prev_p
= &image
->next
)
1387 if (image
->target_data
== target_data
)
1389 *prev_p
= image
->next
;
1390 if (CUDA_CALL_NOCHECK (cuModuleUnload
, image
->module
) != CUDA_SUCCESS
)
1396 pthread_mutex_unlock (&dev
->image_lock
);
1401 GOMP_OFFLOAD_alloc (int ord
, size_t size
)
1403 if (!nvptx_attach_host_thread_to_device (ord
))
1406 struct ptx_device
*ptx_dev
= ptx_devices
[ord
];
1407 struct ptx_free_block
*blocks
, *tmp
;
1409 pthread_mutex_lock (&ptx_dev
->free_blocks_lock
);
1410 blocks
= ptx_dev
->free_blocks
;
1411 ptx_dev
->free_blocks
= NULL
;
1412 pthread_mutex_unlock (&ptx_dev
->free_blocks_lock
);
1417 nvptx_free (blocks
->ptr
, ptx_dev
);
1422 return nvptx_alloc (size
);
1426 GOMP_OFFLOAD_free (int ord
, void *ptr
)
1428 return (nvptx_attach_host_thread_to_device (ord
)
1429 && nvptx_free (ptr
, ptx_devices
[ord
]));
1433 GOMP_OFFLOAD_openacc_exec (void (*fn
) (void *), size_t mapnum
,
1434 void **hostaddrs
, void **devaddrs
,
1435 unsigned *dims
, void *targ_mem_desc
)
1437 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__
);
1439 struct goacc_thread
*thr
= GOMP_PLUGIN_goacc_thread ();
1440 acc_prof_info
*prof_info
= thr
->prof_info
;
1441 acc_event_info data_event_info
;
1442 acc_api_info
*api_info
= thr
->api_info
;
1443 bool profiling_p
= __builtin_expect (prof_info
!= NULL
, false);
1450 size_t s
= mapnum
* sizeof (void *);
1452 for (int i
= 0; i
< mapnum
; i
++)
1453 hp
[i
] = (devaddrs
[i
] ? devaddrs
[i
] : hostaddrs
[i
]);
1454 CUDA_CALL_ASSERT (cuMemAlloc
, &dp
, s
);
1456 goacc_profiling_acc_ev_alloc (thr
, (void *) dp
, s
);
1459 /* Copy the (device) pointers to arguments to the device (dp and hp might in
1460 fact have the same value on a unified-memory system). */
1465 prof_info
->event_type
= acc_ev_enqueue_upload_start
;
1467 data_event_info
.data_event
.event_type
= prof_info
->event_type
;
1468 data_event_info
.data_event
.valid_bytes
1469 = _ACC_DATA_EVENT_INFO_VALID_BYTES
;
1470 data_event_info
.data_event
.parent_construct
1471 = acc_construct_parallel
;
1472 data_event_info
.data_event
.implicit
= 1; /* Always implicit. */
1473 data_event_info
.data_event
.tool_info
= NULL
;
1474 data_event_info
.data_event
.var_name
= NULL
;
1475 data_event_info
.data_event
.bytes
= mapnum
* sizeof (void *);
1476 data_event_info
.data_event
.host_ptr
= hp
;
1477 data_event_info
.data_event
.device_ptr
= (const void *) dp
;
1479 api_info
->device_api
= acc_device_api_cuda
;
1481 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
, &data_event_info
,
1484 CUDA_CALL_ASSERT (cuMemcpyHtoD
, dp
, (void *) hp
,
1485 mapnum
* sizeof (void *));
1488 prof_info
->event_type
= acc_ev_enqueue_upload_end
;
1489 data_event_info
.data_event
.event_type
= prof_info
->event_type
;
1490 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
, &data_event_info
,
1495 nvptx_exec (fn
, mapnum
, hostaddrs
, devaddrs
, dims
, targ_mem_desc
,
1498 CUresult r
= CUDA_CALL_NOCHECK (cuStreamSynchronize
, NULL
);
1499 const char *maybe_abort_msg
= "(perhaps abort was called)";
1500 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
1501 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r
),
1503 else if (r
!= CUDA_SUCCESS
)
1504 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r
));
1506 CUDA_CALL_ASSERT (cuMemFree
, dp
);
1508 goacc_profiling_acc_ev_free (thr
, (void *) dp
);
/* Stream-callback helper: release the argument block built by
   GOMP_OFFLOAD_openacc_async_exec.  PTR is a two-slot header followed
   by the pointer array: slot 0 holds the device buffer, slot 1 the
   owning ptx_device.  */

static void
cuda_free_argmem (void *ptr)
{
  void **block = (void **) ptr;
  nvptx_free (block[0], (struct ptx_device *) block[1]);
  free (block);
}
1520 GOMP_OFFLOAD_openacc_async_exec (void (*fn
) (void *), size_t mapnum
,
1521 void **hostaddrs
, void **devaddrs
,
1522 unsigned *dims
, void *targ_mem_desc
,
1523 struct goacc_asyncqueue
*aq
)
1525 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__
);
1527 struct goacc_thread
*thr
= GOMP_PLUGIN_goacc_thread ();
1528 acc_prof_info
*prof_info
= thr
->prof_info
;
1529 acc_event_info data_event_info
;
1530 acc_api_info
*api_info
= thr
->api_info
;
1531 bool profiling_p
= __builtin_expect (prof_info
!= NULL
, false);
1535 void **block
= NULL
;
1539 size_t s
= mapnum
* sizeof (void *);
1540 block
= (void **) GOMP_PLUGIN_malloc (2 * sizeof (void *) + s
);
1542 for (int i
= 0; i
< mapnum
; i
++)
1543 hp
[i
] = (devaddrs
[i
] ? devaddrs
[i
] : hostaddrs
[i
]);
1544 CUDA_CALL_ASSERT (cuMemAlloc
, &dp
, s
);
1546 goacc_profiling_acc_ev_alloc (thr
, (void *) dp
, s
);
1549 /* Copy the (device) pointers to arguments to the device (dp and hp might in
1550 fact have the same value on a unified-memory system). */
1555 prof_info
->event_type
= acc_ev_enqueue_upload_start
;
1557 data_event_info
.data_event
.event_type
= prof_info
->event_type
;
1558 data_event_info
.data_event
.valid_bytes
1559 = _ACC_DATA_EVENT_INFO_VALID_BYTES
;
1560 data_event_info
.data_event
.parent_construct
1561 = acc_construct_parallel
;
1562 data_event_info
.data_event
.implicit
= 1; /* Always implicit. */
1563 data_event_info
.data_event
.tool_info
= NULL
;
1564 data_event_info
.data_event
.var_name
= NULL
;
1565 data_event_info
.data_event
.bytes
= mapnum
* sizeof (void *);
1566 data_event_info
.data_event
.host_ptr
= hp
;
1567 data_event_info
.data_event
.device_ptr
= (const void *) dp
;
1569 api_info
->device_api
= acc_device_api_cuda
;
1571 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
, &data_event_info
,
1575 CUDA_CALL_ASSERT (cuMemcpyHtoDAsync
, dp
, (void *) hp
,
1576 mapnum
* sizeof (void *), aq
->cuda_stream
);
1577 block
[0] = (void *) dp
;
1579 struct nvptx_thread
*nvthd
=
1580 (struct nvptx_thread
*) GOMP_PLUGIN_acc_thread ();
1581 block
[1] = (void *) nvthd
->ptx_dev
;
1585 prof_info
->event_type
= acc_ev_enqueue_upload_end
;
1586 data_event_info
.data_event
.event_type
= prof_info
->event_type
;
1587 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
, &data_event_info
,
1592 nvptx_exec (fn
, mapnum
, hostaddrs
, devaddrs
, dims
, targ_mem_desc
,
1593 dp
, aq
->cuda_stream
);
1596 GOMP_OFFLOAD_openacc_async_queue_callback (aq
, cuda_free_argmem
, block
);
1600 GOMP_OFFLOAD_openacc_create_thread_data (int ord
)
1602 struct ptx_device
*ptx_dev
;
1603 struct nvptx_thread
*nvthd
1604 = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread
));
1607 ptx_dev
= ptx_devices
[ord
];
1611 CUDA_CALL_ASSERT (cuCtxGetCurrent
, &thd_ctx
);
1613 assert (ptx_dev
->ctx
);
1616 CUDA_CALL_ASSERT (cuCtxPushCurrent
, ptx_dev
->ctx
);
1618 nvthd
->ptx_dev
= ptx_dev
;
1620 return (void *) nvthd
;
/* Release per-thread data created by G_O_openacc_create_thread_data.
   NOTE(review): body reconstructed — the visible source only shows the
   signature; confirm it does nothing beyond freeing DATA.  */

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}
/* Return the current CUDA device (as an opaque pointer) for the
   acc_get_current_cuda_device interface.  */

void *
GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
{
  return nvptx_get_current_cuda_device ();
}
/* Return the current CUDA context (as an opaque pointer) for the
   acc_get_current_cuda_context interface.  */

void *
GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
{
  return nvptx_get_current_cuda_context ();
}
1641 /* This returns a CUstream. */
1643 GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue
*aq
)
1645 return (void *) aq
->cuda_stream
;
1648 /* This takes a CUstream. */
1650 GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue
*aq
, void *stream
)
1652 if (aq
->cuda_stream
)
1654 CUDA_CALL_ASSERT (cuStreamSynchronize
, aq
->cuda_stream
);
1655 CUDA_CALL_ASSERT (cuStreamDestroy
, aq
->cuda_stream
);
1658 aq
->cuda_stream
= (CUstream
) stream
;
1662 struct goacc_asyncqueue
*
1663 GOMP_OFFLOAD_openacc_async_construct (int device
__attribute__((unused
)))
1665 CUstream stream
= NULL
;
1666 CUDA_CALL_ERET (NULL
, cuStreamCreate
, &stream
, CU_STREAM_DEFAULT
);
1668 struct goacc_asyncqueue
*aq
1669 = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue
));
1670 aq
->cuda_stream
= stream
;
1675 GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue
*aq
)
1677 CUDA_CALL_ERET (false, cuStreamDestroy
, aq
->cuda_stream
);
1683 GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue
*aq
)
1685 CUresult r
= CUDA_CALL_NOCHECK (cuStreamQuery
, aq
->cuda_stream
);
1686 if (r
== CUDA_SUCCESS
)
1688 if (r
== CUDA_ERROR_NOT_READY
)
1691 GOMP_PLUGIN_error ("cuStreamQuery error: %s", cuda_error (r
));
1696 GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue
*aq
)
1698 CUDA_CALL_ERET (false, cuStreamSynchronize
, aq
->cuda_stream
);
1703 GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue
*aq1
,
1704 struct goacc_asyncqueue
*aq2
)
1707 CUDA_CALL_ERET (false, cuEventCreate
, &e
, CU_EVENT_DISABLE_TIMING
);
1708 CUDA_CALL_ERET (false, cuEventRecord
, e
, aq1
->cuda_stream
);
1709 CUDA_CALL_ERET (false, cuStreamWaitEvent
, aq2
->cuda_stream
, e
, 0);
1714 cuda_callback_wrapper (CUstream stream
, CUresult res
, void *ptr
)
1716 if (res
!= CUDA_SUCCESS
)
1717 GOMP_PLUGIN_fatal ("%s error: %s", __FUNCTION__
, cuda_error (res
));
1718 struct nvptx_callback
*cb
= (struct nvptx_callback
*) ptr
;
1724 GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue
*aq
,
1725 void (*callback_fn
)(void *),
1728 struct nvptx_callback
*b
= GOMP_PLUGIN_malloc (sizeof (*b
));
1729 b
->fn
= callback_fn
;
1732 CUDA_CALL_ASSERT (cuStreamAddCallback
, aq
->cuda_stream
,
1733 cuda_callback_wrapper
, (void *) b
, 0);
1737 cuda_memcpy_sanity_check (const void *h
, const void *d
, size_t s
)
1745 GOMP_PLUGIN_error ("invalid device address");
1748 CUDA_CALL (cuMemGetAddressRange
, &pb
, &ps
, (CUdeviceptr
) d
);
1751 GOMP_PLUGIN_error ("invalid device address");
1756 GOMP_PLUGIN_error ("invalid host address");
1761 GOMP_PLUGIN_error ("invalid host or device address");
1764 if ((void *)(d
+ s
) > (void *)(pb
+ ps
))
1766 GOMP_PLUGIN_error ("invalid size");
1773 GOMP_OFFLOAD_host2dev (int ord
, void *dst
, const void *src
, size_t n
)
1775 if (!nvptx_attach_host_thread_to_device (ord
)
1776 || !cuda_memcpy_sanity_check (src
, dst
, n
))
1778 CUDA_CALL (cuMemcpyHtoD
, (CUdeviceptr
) dst
, src
, n
);
1783 GOMP_OFFLOAD_dev2host (int ord
, void *dst
, const void *src
, size_t n
)
1785 if (!nvptx_attach_host_thread_to_device (ord
)
1786 || !cuda_memcpy_sanity_check (dst
, src
, n
))
1788 CUDA_CALL (cuMemcpyDtoH
, dst
, (CUdeviceptr
) src
, n
);
1793 GOMP_OFFLOAD_dev2dev (int ord
, void *dst
, const void *src
, size_t n
)
1795 CUDA_CALL (cuMemcpyDtoDAsync
, (CUdeviceptr
) dst
, (CUdeviceptr
) src
, n
, NULL
);
1800 GOMP_OFFLOAD_openacc_async_host2dev (int ord
, void *dst
, const void *src
,
1801 size_t n
, struct goacc_asyncqueue
*aq
)
1803 if (!nvptx_attach_host_thread_to_device (ord
)
1804 || !cuda_memcpy_sanity_check (src
, dst
, n
))
1806 CUDA_CALL (cuMemcpyHtoDAsync
, (CUdeviceptr
) dst
, src
, n
, aq
->cuda_stream
);
1811 GOMP_OFFLOAD_openacc_async_dev2host (int ord
, void *dst
, const void *src
,
1812 size_t n
, struct goacc_asyncqueue
*aq
)
1814 if (!nvptx_attach_host_thread_to_device (ord
)
1815 || !cuda_memcpy_sanity_check (dst
, src
, n
))
1817 CUDA_CALL (cuMemcpyDtoHAsync
, dst
, (CUdeviceptr
) src
, n
, aq
->cuda_stream
);
1821 /* Adjust launch dimensions: pick good values for number of blocks and warps
1822 and ensure that number of warps does not exceed CUDA limits as well as GCC's
1826 nvptx_adjust_launch_bounds (struct targ_fn_descriptor
*fn
,
1827 struct ptx_device
*ptx_dev
,
1828 int *teams_p
, int *threads_p
)
1830 int max_warps_block
= fn
->max_threads_per_block
/ 32;
1831 /* Maximum 32 warps per block is an implementation limit in NVPTX backend
1832 and libgcc, which matches documented limit of all GPUs as of 2015. */
1833 if (max_warps_block
> 32)
1834 max_warps_block
= 32;
1835 if (*threads_p
<= 0)
1837 if (*threads_p
> max_warps_block
)
1838 *threads_p
= max_warps_block
;
1840 int regs_per_block
= fn
->regs_per_thread
* 32 * *threads_p
;
1841 /* This is an estimate of how many blocks the device can host simultaneously.
1842 Actual limit, which may be lower, can be queried with "occupancy control"
1843 driver interface (since CUDA 6.0). */
1844 int max_blocks
= ptx_dev
->regs_per_sm
/ regs_per_block
* ptx_dev
->num_sms
;
1845 if (*teams_p
<= 0 || *teams_p
> max_blocks
)
1846 *teams_p
= max_blocks
;
/* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
   kernel launch.
   NOTE(review): body reconstructed — the visible source only shows the
   signature; confirm the 128 KiB default against the canonical source.  */

static size_t
nvptx_stacks_size ()
{
  return 128 * 1024;
}
1858 /* Return contiguous storage for NUM stacks, each SIZE bytes. */
1861 nvptx_stacks_alloc (size_t size
, int num
)
1864 CUresult r
= CUDA_CALL_NOCHECK (cuMemAlloc
, &stacks
, size
* num
);
1865 if (r
!= CUDA_SUCCESS
)
1866 GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r
));
1867 return (void *) stacks
;
1870 /* Release storage previously allocated by nvptx_stacks_alloc. */
1873 nvptx_stacks_free (void *p
, int num
)
1875 CUresult r
= CUDA_CALL_NOCHECK (cuMemFree
, (CUdeviceptr
) p
);
1876 if (r
!= CUDA_SUCCESS
)
1877 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r
));
1881 GOMP_OFFLOAD_run (int ord
, void *tgt_fn
, void *tgt_vars
, void **args
)
1883 CUfunction function
= ((struct targ_fn_descriptor
*) tgt_fn
)->fn
;
1885 struct ptx_device
*ptx_dev
= ptx_devices
[ord
];
1886 const char *maybe_abort_msg
= "(perhaps abort was called)";
1887 int teams
= 0, threads
= 0;
1890 GOMP_PLUGIN_fatal ("No target arguments provided");
1893 intptr_t id
= (intptr_t) *args
++, val
;
1894 if (id
& GOMP_TARGET_ARG_SUBSEQUENT_PARAM
)
1895 val
= (intptr_t) *args
++;
1897 val
= id
>> GOMP_TARGET_ARG_VALUE_SHIFT
;
1898 if ((id
& GOMP_TARGET_ARG_DEVICE_MASK
) != GOMP_TARGET_ARG_DEVICE_ALL
)
1900 val
= val
> INT_MAX
? INT_MAX
: val
;
1901 id
&= GOMP_TARGET_ARG_ID_MASK
;
1902 if (id
== GOMP_TARGET_ARG_NUM_TEAMS
)
1904 else if (id
== GOMP_TARGET_ARG_THREAD_LIMIT
)
1907 nvptx_adjust_launch_bounds (tgt_fn
, ptx_dev
, &teams
, &threads
);
1909 size_t stack_size
= nvptx_stacks_size ();
1910 void *stacks
= nvptx_stacks_alloc (stack_size
, teams
* threads
);
1911 void *fn_args
[] = {tgt_vars
, stacks
, (void *) stack_size
};
1912 size_t fn_args_size
= sizeof fn_args
;
1914 CU_LAUNCH_PARAM_BUFFER_POINTER
, fn_args
,
1915 CU_LAUNCH_PARAM_BUFFER_SIZE
, &fn_args_size
,
1918 r
= CUDA_CALL_NOCHECK (cuLaunchKernel
, function
, teams
, 1, 1,
1919 32, threads
, 1, 0, NULL
, NULL
, config
);
1920 if (r
!= CUDA_SUCCESS
)
1921 GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r
));
1923 r
= CUDA_CALL_NOCHECK (cuCtxSynchronize
, );
1924 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
1925 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r
),
1927 else if (r
!= CUDA_SUCCESS
)
1928 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r
));
1929 nvptx_stacks_free (stacks
, teams
* threads
);
1933 GOMP_OFFLOAD_async_run (int ord
, void *tgt_fn
, void *tgt_vars
, void **args
,
1936 GOMP_PLUGIN_fatal ("GOMP_OFFLOAD_async_run unimplemented");