1 /* Plugin for NVPTX execution.
2
3 Copyright (C) 2013-2019 Free Software Foundation, Inc.
4
5 Contributed by Mentor Embedded.
6
7 This file is part of the GNU Offloading and Multi Processing Library
8 (libgomp).
9
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
13 any later version.
14
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
18 more details.
19
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
23
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
28
29 /* Nvidia PTX-specific parts of OpenACC support. The CUDA driver
30 library appears to hold some implicit state, but the documentation
31 is not clear as to what that state might be, or how one might
32 propagate it from one thread to another. */
33
34 #define _GNU_SOURCE
35 #include "openacc.h"
36 #include "config.h"
37 #include "libgomp-plugin.h"
38 #include "oacc-plugin.h"
39 #include "gomp-constants.h"
40
41 #include <pthread.h>
42 #include <cuda.h>
43 #include <stdbool.h>
44 #include <stdint.h>
45 #include <limits.h>
46 #include <string.h>
47 #include <stdio.h>
48 #include <unistd.h>
49 #include <assert.h>
50 #include <errno.h>
51
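/* Compatibility declarations for CUDA driver API entry points and constants
that are not provided uniformly by all cuda.h versions. */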
52 #if CUDA_VERSION < 6000
53 extern CUresult cuGetErrorString (CUresult, const char **);
54 #define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
55 #endif
56
57 #if CUDA_VERSION >= 6050
58 #undef cuLinkCreate
59 #undef cuLinkAddData
60 CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
61 const char *, unsigned, CUjit_option *, void **);
62 CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
63 #else
64 typedef size_t (*CUoccupancyB2DSize)(int);
65 CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
66 const char *, unsigned, CUjit_option *, void **);
67 CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
68 CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
69 CUoccupancyB2DSize, size_t, int);
70 #endif
71
72 #define DO_PRAGMA(x) _Pragma (#x)
73
74 #if PLUGIN_NVPTX_DYNAMIC
75 # include <dlfcn.h>
76
77 struct cuda_lib_s {
78
79 # define CUDA_ONE_CALL(call) \
80 __typeof (call) *call;
81 # define CUDA_ONE_CALL_MAYBE_NULL(call) \
82 CUDA_ONE_CALL (call)
83 #include "cuda-lib.def"
84 # undef CUDA_ONE_CALL
85 # undef CUDA_ONE_CALL_MAYBE_NULL
86
87 } cuda_lib;
88
89 /* -1 if init_cuda_lib has not been called yet, false
90 if it has been and failed, true if it has been and succeeded. */
91 static signed char cuda_lib_inited = -1;
92
93 /* Dynamically load the CUDA driver library (libcuda) and initialize the
94 function pointers; return false if unsuccessful, true if successful. */
95 static bool
96 init_cuda_lib (void)
97 {
98 if (cuda_lib_inited != -1)
99 return cuda_lib_inited;
100 const char *cuda_runtime_lib = "libcuda.so.1";
101 void *h = dlopen (cuda_runtime_lib, RTLD_LAZY);
102 cuda_lib_inited = false;
103 if (h == NULL)
104 return false;
105
106 # define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false)
107 # define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true)
108 # define CUDA_ONE_CALL_1(call, allow_null) \
109 cuda_lib.call = dlsym (h, #call); \
110 if (!allow_null && cuda_lib.call == NULL) \
111 return false;
112 #include "cuda-lib.def"
113 # undef CUDA_ONE_CALL
114 # undef CUDA_ONE_CALL_1
115 # undef CUDA_ONE_CALL_MAYBE_NULL
116
117 cuda_lib_inited = true;
118 return true;
119 }
120 # define CUDA_CALL_PREFIX cuda_lib.
121 #else
122
123 # define CUDA_ONE_CALL(call)
124 # define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call)
125 #include "cuda-lib.def"
126 #undef CUDA_ONE_CALL_MAYBE_NULL
127 #undef CUDA_ONE_CALL
128
129 # define CUDA_CALL_PREFIX
130 # define init_cuda_lib() true
131 #endif
132
133 #include "secure_getenv.h"
134
135 #undef MIN
136 #undef MAX
137 #define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
138 #define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
139
140 /* Convenience macros for the frequently used sequence of a CUDA library
141 call followed by error handling, as well as for CUDA library calls whose
142 callers do the error checking themselves or don't need it at all. */
143
144 #define CUDA_CALL_ERET(ERET, FN, ...) \
145 do { \
146 unsigned __r \
147 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
148 if (__r != CUDA_SUCCESS) \
149 { \
150 GOMP_PLUGIN_error (#FN " error: %s", \
151 cuda_error (__r)); \
152 return ERET; \
153 } \
154 } while (0)
155
156 #define CUDA_CALL(FN, ...) \
157 CUDA_CALL_ERET (false, FN, __VA_ARGS__)
158
159 #define CUDA_CALL_ASSERT(FN, ...) \
160 do { \
161 unsigned __r \
162 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
163 if (__r != CUDA_SUCCESS) \
164 { \
165 GOMP_PLUGIN_fatal (#FN " error: %s", \
166 cuda_error (__r)); \
167 } \
168 } while (0)
169
170 #define CUDA_CALL_NOCHECK(FN, ...) \
171 CUDA_CALL_PREFIX FN (__VA_ARGS__)
172
173 #define CUDA_CALL_EXISTS(FN) \
174 CUDA_CALL_PREFIX FN
175
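/* Return a descriptive string for CUDA error R, or a generic fallback if
cuGetErrorString is unavailable or itself fails. */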
176 static const char *
177 cuda_error (CUresult r)
178 {
179 const char *fallback = "unknown cuda error";
180 const char *desc;
181
182 if (!CUDA_CALL_EXISTS (cuGetErrorString))
183 return fallback;
184
185 r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
186 if (r == CUDA_SUCCESS)
187 return desc;
188
189 return fallback;
190 }
191
192 static unsigned int instantiated_devices = 0;
193 static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
194
195 /* NVPTX/CUDA specific definition of asynchronous queues. */
196 struct goacc_asyncqueue
197 {
198 CUstream cuda_stream;
199 };
200
201 struct nvptx_callback
202 {
203 void (*fn) (void *);
204 void *ptr;
205 struct goacc_asyncqueue *aq;
206 struct nvptx_callback *next;
207 };
208
209 /* Thread-specific data for PTX. */
210
211 struct nvptx_thread
212 {
213 /* We currently have this embedded inside the plugin because libgomp manages
214 devices through integer target_ids. This might be better if using an
215 opaque target-specific pointer directly from gomp_device_descr. */
216 struct ptx_device *ptx_dev;
217 };
218
219 /* Target data function launch information. */
220
221 struct targ_fn_launch
222 {
223 const char *fn;
224 unsigned short dim[GOMP_DIM_MAX];
225 };
226
227 /* Target PTX object information. */
228
229 struct targ_ptx_obj
230 {
231 const char *code;
232 size_t size;
233 };
234
235 /* Target data image information. */
236
237 typedef struct nvptx_tdata
238 {
239 const struct targ_ptx_obj *ptx_objs;
240 unsigned ptx_num;
241
242 const char *const *var_names;
243 unsigned var_num;
244
245 const struct targ_fn_launch *fn_descs;
246 unsigned fn_num;
247 } nvptx_tdata_t;
248
249 /* Descriptor of a loaded function. */
250
251 struct targ_fn_descriptor
252 {
253 CUfunction fn;
254 const struct targ_fn_launch *launch;
255 int regs_per_thread;
256 int max_threads_per_block;
257 };
258
259 /* A loaded PTX image. */
260 struct ptx_image_data
261 {
262 const void *target_data;
263 CUmodule module;
264
265 struct targ_fn_descriptor *fns; /* Array of functions. */
266
267 struct ptx_image_data *next;
268 };
269
270 struct ptx_free_block
271 {
272 void *ptr;
273 struct ptx_free_block *next;
274 };
275
276 struct ptx_device
277 {
278 CUcontext ctx;
279 bool ctx_shared;
280 CUdevice dev;
281
282 int ord;
283 bool overlap;
284 bool map;
285 bool concur;
286 bool mkern;
287 int mode;
288 int clock_khz;
289 int num_sms;
290 int regs_per_block;
291 int regs_per_sm;
292 int warp_size;
293 int max_threads_per_block;
294 int max_threads_per_multiprocessor;
295 int default_dims[GOMP_DIM_MAX];
296
297 struct ptx_image_data *images; /* Images loaded on device. */
298 pthread_mutex_t image_lock; /* Lock for above list. */
299
300 struct ptx_free_block *free_blocks;
301 pthread_mutex_t free_blocks_lock;
302
303 struct ptx_device *next;
304 };
305
306 static struct ptx_device **ptx_devices;
307
308 static inline struct nvptx_thread *
309 nvptx_thread (void)
310 {
311 return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
312 }
313
314 /* Initialize CUDA and the PTX device array. Return TRUE on success, else
315 FALSE. PTX_DEV_LOCK should be locked on entry and remains locked on exit. */
316
317 static bool
318 nvptx_init (void)
319 {
320 int ndevs;
321
322 if (instantiated_devices != 0)
323 return true;
324
325 if (!init_cuda_lib ())
326 return false;
327
328 CUDA_CALL (cuInit, 0);
329
330 CUDA_CALL (cuDeviceGetCount, &ndevs);
331 ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
332 * ndevs);
333 return true;
334 }
335
336 /* Select the N'th PTX device for the current host thread. The device must
337 have been opened before calling this function. */
338
339 static bool
340 nvptx_attach_host_thread_to_device (int n)
341 {
342 CUdevice dev;
343 CUresult r;
344 struct ptx_device *ptx_dev;
345 CUcontext thd_ctx;
346
347 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
348 if (r == CUDA_ERROR_NOT_PERMITTED)
349 {
350 /* Assume we're in a CUDA callback, just return true. */
351 return true;
352 }
353 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
354 {
355 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
356 return false;
357 }
358
359 if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
360 return true;
361 else
362 {
363 CUcontext old_ctx;
364
365 ptx_dev = ptx_devices[n];
366 if (!ptx_dev)
367 {
368 GOMP_PLUGIN_error ("device %d not found", n);
369 return false;
370 }
371
372 CUDA_CALL (cuCtxGetCurrent, &thd_ctx);
373
374 /* We don't necessarily have a current context (e.g. if it has been
375 destroyed). Pop it if we do, though. */
376 if (thd_ctx != NULL)
377 CUDA_CALL (cuCtxPopCurrent, &old_ctx);
378
379 CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
380 }
381 return true;
382 }
383
384 static struct ptx_device *
385 nvptx_open_device (int n)
386 {
387 struct ptx_device *ptx_dev;
388 CUdevice dev, ctx_dev;
389 CUresult r;
390 int async_engines, pi;
391
392 CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);
393
394 ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));
395
396 ptx_dev->ord = n;
397 ptx_dev->dev = dev;
398 ptx_dev->ctx_shared = false;
399
400 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
401 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
402 {
403 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
404 return NULL;
405 }
406
407 if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
408 {
409 /* The current host thread has an active context for a different device.
410 Detach it. */
411 CUcontext old_ctx;
412 CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
413 }
414
415 CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);
416
417 if (!ptx_dev->ctx)
418 CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
419 else
420 ptx_dev->ctx_shared = true;
421
422 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
423 &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
424 ptx_dev->overlap = pi;
425
426 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
427 &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
428 ptx_dev->map = pi;
429
430 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
431 &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
432 ptx_dev->concur = pi;
433
434 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
435 &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
436 ptx_dev->mode = pi;
437
438 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
439 &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
440 ptx_dev->mkern = pi;
441
442 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
443 &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
444 ptx_dev->clock_khz = pi;
445
446 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
447 &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
448 ptx_dev->num_sms = pi;
449
450 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
451 &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
452 ptx_dev->regs_per_block = pi;
453
454 /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only
455 in CUDA 6.0 and newer. */
456 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
457 CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
458 dev);
459 /* Fallback: use the limit of registers per block, which is usually equal. */
460 if (r == CUDA_ERROR_INVALID_VALUE)
461 pi = ptx_dev->regs_per_block;
462 else if (r != CUDA_SUCCESS)
463 {
464 GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
465 return NULL;
466 }
467 ptx_dev->regs_per_sm = pi;
468
469 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
470 &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
471 if (pi != 32)
472 {
473 GOMP_PLUGIN_error ("Only warp size 32 is supported");
474 return NULL;
475 }
476 ptx_dev->warp_size = pi;
477
478 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
479 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
480 ptx_dev->max_threads_per_block = pi;
481
482 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
483 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
484 ptx_dev->max_threads_per_multiprocessor = pi;
485
486 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines,
487 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
488 if (r != CUDA_SUCCESS)
489 async_engines = 1;
490
491 for (int i = 0; i != GOMP_DIM_MAX; i++)
492 ptx_dev->default_dims[i] = 0;
493
494 ptx_dev->images = NULL;
495 pthread_mutex_init (&ptx_dev->image_lock, NULL);
496
497 ptx_dev->free_blocks = NULL;
498 pthread_mutex_init (&ptx_dev->free_blocks_lock, NULL);
499
500 return ptx_dev;
501 }
502
503 static bool
504 nvptx_close_device (struct ptx_device *ptx_dev)
505 {
506 if (!ptx_dev)
507 return true;
508
509 for (struct ptx_free_block *b = ptx_dev->free_blocks; b;)
510 {
511 struct ptx_free_block *b_next = b->next;
512 CUDA_CALL (cuMemFree, (CUdeviceptr) b->ptr);
513 free (b);
514 b = b_next;
515 }
516
517 pthread_mutex_destroy (&ptx_dev->free_blocks_lock);
518 pthread_mutex_destroy (&ptx_dev->image_lock);
519
520 if (!ptx_dev->ctx_shared)
521 CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
522
523 free (ptx_dev);
524 return true;
525 }
526
527 static int
528 nvptx_get_num_devices (void)
529 {
530 int n;
531
532 /* PR libgomp/65099: Currently, we only support offloading in 64-bit
533 configurations. */
534 if (sizeof (void *) != 8)
535 {
536 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading;"
537 " only 64-bit configurations are supported\n");
538 return 0;
539 }
540
541 /* This function will be called before the plugin has been initialized in
542 order to enumerate available devices, but CUDA API routines can't be used
543 until cuInit has been called. Just call it now (but don't yet do any
544 further initialization). */
545 if (instantiated_devices == 0)
546 {
547 if (!init_cuda_lib ())
548 return 0;
549 CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
550 /* This is not an error: e.g. we may have CUDA libraries installed but
551 no devices available. */
552 if (r != CUDA_SUCCESS)
553 {
554 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
555 cuda_error (r));
556 return 0;
557 }
558 }
559
560 CUDA_CALL_ERET (-1, cuDeviceGetCount, &n);
561 return n;
562 }
563
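/* Report the value of environment variable VAR_NAME (ENV_VAR, which may be
NULL) through the libgomp debug output. */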
564 static void
565 notify_var (const char *var_name, const char *env_var)
566 {
567 if (env_var == NULL)
568 GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name);
569 else
570 GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var);
571 }
572
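/* Parse the GOMP_NVPTX_JIT environment variable; the only recognized setting
is '-O<n>' with n in 0..4, which is stored in *GOMP_NVPTX_O. */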
573 static void
574 process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o)
575 {
576 const char *var_name = "GOMP_NVPTX_JIT";
577 const char *env_var = secure_getenv (var_name);
578 notify_var (var_name, env_var);
579
580 if (env_var == NULL)
581 return;
582
583 const char *c = env_var;
584 while (*c != '\0')
585 {
586 while (*c == ' ')
587 c++;
588
589 if (c[0] == '-' && c[1] == 'O'
590 && '0' <= c[2] && c[2] <= '4'
591 && (c[3] == '\0' || c[3] == ' '))
592 {
593 *gomp_nvptx_o = c[2] - '0';
594 c += 3;
595 continue;
596 }
597
598 GOMP_PLUGIN_error ("Error parsing %s", var_name);
599 break;
600 }
601 }
602
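/* JIT-link the NUM_OBJS PTX objects in PTX_OBJS and load the result as a CUDA
module returned in *MODULE. Return true on success, false on failure. */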
603 static bool
604 link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
605 unsigned num_objs)
606 {
607 CUjit_option opts[7];
608 void *optvals[7];
609 float elapsed = 0.0;
610 char elog[1024];
611 char ilog[16384];
612 CUlinkState linkstate;
613 CUresult r;
614 void *linkout;
615 size_t linkoutsize __attribute__ ((unused));
616
617 opts[0] = CU_JIT_WALL_TIME;
618 optvals[0] = &elapsed;
619
620 opts[1] = CU_JIT_INFO_LOG_BUFFER;
621 optvals[1] = &ilog[0];
622
623 opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
624 optvals[2] = (void *) sizeof ilog;
625
626 opts[3] = CU_JIT_ERROR_LOG_BUFFER;
627 optvals[3] = &elog[0];
628
629 opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
630 optvals[4] = (void *) sizeof elog;
631
632 opts[5] = CU_JIT_LOG_VERBOSE;
633 optvals[5] = (void *) 1;
634
635 static intptr_t gomp_nvptx_o = -1;
636
637 static bool init_done = false;
638 if (!init_done)
639 {
640 process_GOMP_NVPTX_JIT (&gomp_nvptx_o);
641 init_done = true;
642 }
643
644 int nopts = 6;
645 if (gomp_nvptx_o != -1)
646 {
647 opts[nopts] = CU_JIT_OPTIMIZATION_LEVEL;
648 optvals[nopts] = (void *) gomp_nvptx_o;
649 nopts++;
650 }
651
652 if (CUDA_CALL_EXISTS (cuLinkCreate_v2))
653 CUDA_CALL (cuLinkCreate_v2, nopts, opts, optvals, &linkstate);
654 else
655 CUDA_CALL (cuLinkCreate, nopts, opts, optvals, &linkstate);
656
657 for (; num_objs--; ptx_objs++)
658 {
659 /* cuLinkAddData's 'data' argument erroneously omits the const
660 qualifier. */
661 GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
662 if (CUDA_CALL_EXISTS (cuLinkAddData_v2))
663 r = CUDA_CALL_NOCHECK (cuLinkAddData_v2, linkstate, CU_JIT_INPUT_PTX,
664 (char *) ptx_objs->code, ptx_objs->size,
665 0, 0, 0, 0);
666 else
667 r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
668 (char *) ptx_objs->code, ptx_objs->size,
669 0, 0, 0, 0);
670 if (r != CUDA_SUCCESS)
671 {
672 GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
673 GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
674 cuda_error (r));
675 return false;
676 }
677 }
678
679 GOMP_PLUGIN_debug (0, "Linking\n");
680 r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);
681
682 GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
683 GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
684
685 if (r != CUDA_SUCCESS)
686 {
687 GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
688 return false;
689 }
690
691 CUDA_CALL (cuModuleLoadData, module, linkout);
692 CUDA_CALL (cuLinkDestroy, linkstate);
693 return true;
694 }
695
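/* Launch the offloaded OpenACC region FN on STREAM with launch dimensions
DIMS; DP is the device copy of the argument pointers. Dimensions left zero
are filled in from the device defaults or, if available, the CUDA occupancy
calculator. */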
696 static void
697 nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
698 unsigned *dims, void *targ_mem_desc,
699 CUdeviceptr dp, CUstream stream)
700 {
701 struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
702 CUfunction function;
703 int i;
704 void *kargs[1];
705 struct nvptx_thread *nvthd = nvptx_thread ();
706 int warp_size = nvthd->ptx_dev->warp_size;
707
708 function = targ_fn->fn;
709
710 /* Initialize the launch dimensions. Typically this is constant,
711 provided by the device compiler, but we must permit runtime
712 values. */
713 int seen_zero = 0;
714 for (i = 0; i != GOMP_DIM_MAX; i++)
715 {
716 if (targ_fn->launch->dim[i])
717 dims[i] = targ_fn->launch->dim[i];
718 if (!dims[i])
719 seen_zero = 1;
720 }
721
722 if (seen_zero)
723 {
724 pthread_mutex_lock (&ptx_dev_lock);
725
726 static int gomp_openacc_dims[GOMP_DIM_MAX];
727 if (!gomp_openacc_dims[0])
728 {
729 /* See if the user provided GOMP_OPENACC_DIM environment
730 variable to specify runtime defaults. */
731 for (int i = 0; i < GOMP_DIM_MAX; ++i)
732 gomp_openacc_dims[i] = GOMP_PLUGIN_acc_default_dim (i);
733 }
734
735 if (!nvthd->ptx_dev->default_dims[0])
736 {
737 int default_dims[GOMP_DIM_MAX];
738 for (int i = 0; i < GOMP_DIM_MAX; ++i)
739 default_dims[i] = gomp_openacc_dims[i];
740
741 int gang, worker, vector;
742 {
743 int block_size = nvthd->ptx_dev->max_threads_per_block;
744 int cpu_size = nvthd->ptx_dev->max_threads_per_multiprocessor;
745 int dev_size = nvthd->ptx_dev->num_sms;
746 GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
747 " dev_size=%d, cpu_size=%d\n",
748 warp_size, block_size, dev_size, cpu_size);
749
750 gang = (cpu_size / block_size) * dev_size;
751 worker = block_size / warp_size;
752 vector = warp_size;
753 }
754
755 /* There is no upper bound on the gang size. The best size
756 matches the hardware configuration. Logical gangs are
757 scheduled onto physical hardware. To maximize usage, we
758 should guess a large number. */
759 if (default_dims[GOMP_DIM_GANG] < 1)
760 default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
761 /* The worker size must not exceed the hardware. */
762 if (default_dims[GOMP_DIM_WORKER] < 1
763 || (default_dims[GOMP_DIM_WORKER] > worker && gang))
764 default_dims[GOMP_DIM_WORKER] = worker;
765 /* The vector size must exactly match the hardware. */
766 if (default_dims[GOMP_DIM_VECTOR] < 1
767 || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
768 default_dims[GOMP_DIM_VECTOR] = vector;
769
770 GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
771 default_dims[GOMP_DIM_GANG],
772 default_dims[GOMP_DIM_WORKER],
773 default_dims[GOMP_DIM_VECTOR]);
774
775 for (i = 0; i != GOMP_DIM_MAX; i++)
776 nvthd->ptx_dev->default_dims[i] = default_dims[i];
777 }
778 pthread_mutex_unlock (&ptx_dev_lock);
779
780 {
781 bool default_dim_p[GOMP_DIM_MAX];
782 for (i = 0; i != GOMP_DIM_MAX; i++)
783 default_dim_p[i] = !dims[i];
784
785 if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize))
786 {
787 for (i = 0; i != GOMP_DIM_MAX; i++)
788 if (default_dim_p[i])
789 dims[i] = nvthd->ptx_dev->default_dims[i];
790
791 if (default_dim_p[GOMP_DIM_VECTOR])
792 dims[GOMP_DIM_VECTOR]
793 = MIN (dims[GOMP_DIM_VECTOR],
794 (targ_fn->max_threads_per_block / warp_size
795 * warp_size));
796
797 if (default_dim_p[GOMP_DIM_WORKER])
798 dims[GOMP_DIM_WORKER]
799 = MIN (dims[GOMP_DIM_WORKER],
800 targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
801 }
802 else
803 {
804 /* Handle the case that the compiler allows the runtime to choose
805 the vector-length conservatively, by ignoring
806 gomp_openacc_dims[GOMP_DIM_VECTOR]. TODO: actually handle
807 it. */
808 int vectors = 0;
809 /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that
810 gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
811 exceed targ_fn->max_threads_per_block. */
812 int workers = gomp_openacc_dims[GOMP_DIM_WORKER];
813 int gangs = gomp_openacc_dims[GOMP_DIM_GANG];
814 int grids, blocks;
815
816 CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
817 &blocks, function, NULL, 0,
818 dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
819 GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
820 "grid = %d, block = %d\n", grids, blocks);
821
822 /* Keep num_gangs proportional to the block size. In the
823 case where the block size is limited by shared-memory or
824 register file capacity, the runtime will then not
825 excessively over-assign gangs to the multiprocessor
826 units, whose state would otherwise be swapped out even
827 more than necessary. The constant factor 2 is there to
828 prevent threads from idling when there is insufficient
829 work for them. */
830 if (gangs == 0)
831 gangs = 2 * grids * (blocks / warp_size);
832
833 if (vectors == 0)
834 vectors = warp_size;
835
836 if (workers == 0)
837 {
838 int actual_vectors = (default_dim_p[GOMP_DIM_VECTOR]
839 ? vectors
840 : dims[GOMP_DIM_VECTOR]);
841 workers = blocks / actual_vectors;
842 workers = MAX (workers, 1);
843 /* If we need a per-worker barrier ... . */
844 if (actual_vectors > 32)
845 /* Don't use more barriers than available. */
846 workers = MIN (workers, 15);
847 }
848
849 for (i = 0; i != GOMP_DIM_MAX; i++)
850 if (default_dim_p[i])
851 switch (i)
852 {
853 case GOMP_DIM_GANG: dims[i] = gangs; break;
854 case GOMP_DIM_WORKER: dims[i] = workers; break;
855 case GOMP_DIM_VECTOR: dims[i] = vectors; break;
856 default: GOMP_PLUGIN_fatal ("invalid dim");
857 }
858 }
859 }
860 }
861
862 /* Check if the accelerator has sufficient hardware resources to
863 launch the offloaded kernel. */
864 if (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]
865 > targ_fn->max_threads_per_block)
866 {
867 const char *msg
868 = ("The Nvidia accelerator has insufficient resources to launch '%s'"
869 " with num_workers = %d and vector_length = %d"
870 "; "
871 "recompile the program with 'num_workers = x and vector_length = y'"
872 " on that offloaded region or '-fopenacc-dim=:x:y' where"
873 " x * y <= %d"
874 ".\n");
875 GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
876 dims[GOMP_DIM_VECTOR], targ_fn->max_threads_per_block);
877 }
878
879 /* Check if the accelerator has sufficient barrier resources to
880 launch the offloaded kernel. */
881 if (dims[GOMP_DIM_WORKER] > 15 && dims[GOMP_DIM_VECTOR] > 32)
882 {
883 const char *msg
884 = ("The Nvidia accelerator has insufficient barrier resources to launch"
885 " '%s' with num_workers = %d and vector_length = %d"
886 "; "
887 "recompile the program with 'num_workers = x' on that offloaded"
888 " region or '-fopenacc-dim=:x:' where x <= 15"
889 "; "
890 "or, recompile the program with 'vector_length = 32' on that"
891 " offloaded region or '-fopenacc-dim=::32'"
892 ".\n");
893 GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
894 dims[GOMP_DIM_VECTOR]);
895 }
896
897 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
898 " gangs=%u, workers=%u, vectors=%u\n",
899 __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
900 dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);
901
902 // OpenACC        CUDA
903 //
904 // num_gangs      nctaid.x
905 // num_workers    ntid.y
906 // vector length  ntid.x
907 kargs[0] = &dp;
908 CUDA_CALL_ASSERT (cuLaunchKernel, function,
909 dims[GOMP_DIM_GANG], 1, 1,
910 dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
911 0, stream, kargs, 0);
912
913 GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__,
914 targ_fn->launch->fn);
915 }
916
917 void * openacc_get_current_cuda_context (void);
918
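/* Allocate S bytes of device memory; return its address, or NULL on
failure. */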
919 static void *
920 nvptx_alloc (size_t s)
921 {
922 CUdeviceptr d;
923
924 CUDA_CALL_ERET (NULL, cuMemAlloc, &d, s);
925 return (void *) d;
926 }
927
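/* Free the device memory block P. When there is no OpenACC thread (i.e. we
are in a CUDA callback context), the block is instead queued on PTX_DEV's
free list and released later from GOMP_OFFLOAD_alloc. */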
928 static bool
929 nvptx_free (void *p, struct ptx_device *ptx_dev)
930 {
931 /* Assume callback context if this is null. */
932 if (GOMP_PLUGIN_acc_thread () == NULL)
933 {
934 struct ptx_free_block *n
935 = GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block));
936 n->ptr = p;
937 pthread_mutex_lock (&ptx_dev->free_blocks_lock);
938 n->next = ptx_dev->free_blocks;
939 ptx_dev->free_blocks = n;
940 pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
941 return true;
942 }
943
944 CUdeviceptr pb;
945 size_t ps;
946
947 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) p);
948 if ((CUdeviceptr) p != pb)
949 {
950 GOMP_PLUGIN_error ("invalid device address");
951 return false;
952 }
953
954 CUDA_CALL (cuMemFree, (CUdeviceptr) p);
955 return true;
956 }
957
958 static void *
959 nvptx_get_current_cuda_device (void)
960 {
961 struct nvptx_thread *nvthd = nvptx_thread ();
962
963 if (!nvthd || !nvthd->ptx_dev)
964 return NULL;
965
966 return &nvthd->ptx_dev->dev;
967 }
968
969 static void *
970 nvptx_get_current_cuda_context (void)
971 {
972 struct nvptx_thread *nvthd = nvptx_thread ();
973
974 if (!nvthd || !nvthd->ptx_dev)
975 return NULL;
976
977 return nvthd->ptx_dev->ctx;
978 }
979
980 /* Plugin entry points. */
981
982 const char *
983 GOMP_OFFLOAD_get_name (void)
984 {
985 return "nvptx";
986 }
987
988 unsigned int
989 GOMP_OFFLOAD_get_caps (void)
990 {
991 return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
992 }
993
994 int
995 GOMP_OFFLOAD_get_type (void)
996 {
997 return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
998 }
999
1000 int
1001 GOMP_OFFLOAD_get_num_devices (void)
1002 {
1003 return nvptx_get_num_devices ();
1004 }
1005
1006 bool
1007 GOMP_OFFLOAD_init_device (int n)
1008 {
1009 struct ptx_device *dev;
1010
1011 pthread_mutex_lock (&ptx_dev_lock);
1012
1013 if (!nvptx_init () || ptx_devices[n] != NULL)
1014 {
1015 pthread_mutex_unlock (&ptx_dev_lock);
1016 return false;
1017 }
1018
1019 dev = nvptx_open_device (n);
1020 if (dev)
1021 {
1022 ptx_devices[n] = dev;
1023 instantiated_devices++;
1024 }
1025
1026 pthread_mutex_unlock (&ptx_dev_lock);
1027
1028 return dev != NULL;
1029 }
1030
1031 bool
1032 GOMP_OFFLOAD_fini_device (int n)
1033 {
1034 pthread_mutex_lock (&ptx_dev_lock);
1035
1036 if (ptx_devices[n] != NULL)
1037 {
1038 if (!nvptx_attach_host_thread_to_device (n)
1039 || !nvptx_close_device (ptx_devices[n]))
1040 {
1041 pthread_mutex_unlock (&ptx_dev_lock);
1042 return false;
1043 }
1044 ptx_devices[n] = NULL;
1045 instantiated_devices--;
1046 }
1047
1048 if (instantiated_devices == 0)
1049 {
1050 free (ptx_devices);
1051 ptx_devices = NULL;
1052 }
1053
1054 pthread_mutex_unlock (&ptx_dev_lock);
1055 return true;
1056 }
1057
1058 /* Return the libgomp version number we're compatible with. There is
1059 no requirement for cross-version compatibility. */
1060
1061 unsigned
1062 GOMP_OFFLOAD_version (void)
1063 {
1064 return GOMP_VERSION;
1065 }
1066
1067 /* Initialize __nvptx_clocktick, if present in MODULE. */
1068
1069 static void
1070 nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
1071 {
1072 CUdeviceptr dptr;
1073 CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
1074 module, "__nvptx_clocktick");
1075 if (r == CUDA_ERROR_NOT_FOUND)
1076 return;
1077 if (r != CUDA_SUCCESS)
1078 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1079 double __nvptx_clocktick = 1e-3 / dev->clock_khz;
1080 r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
1081 sizeof (__nvptx_clocktick));
1082 if (r != CUDA_SUCCESS)
1083 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
1084 }
1085
1086 /* Load the (partial) program described by TARGET_DATA to device
1087 number ORD. Allocate and return TARGET_TABLE. */
1088
1089 int
1090 GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
1091 struct addr_pair **target_table)
1092 {
1093 CUmodule module;
1094 const char *const *var_names;
1095 const struct targ_fn_launch *fn_descs;
1096 unsigned int fn_entries, var_entries, i, j;
1097 struct targ_fn_descriptor *targ_fns;
1098 struct addr_pair *targ_tbl;
1099 const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
1100 struct ptx_image_data *new_image;
1101 struct ptx_device *dev;
1102
1103 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1104 {
1105 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1106 " (expected %u, received %u)",
1107 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1108 return -1;
1109 }
1110
1111 if (!nvptx_attach_host_thread_to_device (ord)
1112 || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
1113 return -1;
1114
1115 dev = ptx_devices[ord];
1116
1117 /* The mkoffload utility emits a struct of pointers/integers at the
1118 start of each offload image. The array of kernel names and the
1119 array of function addresses form a one-to-one correspondence. */
1120
1121 var_entries = img_header->var_num;
1122 var_names = img_header->var_names;
1123 fn_entries = img_header->fn_num;
1124 fn_descs = img_header->fn_descs;
1125
1126 targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
1127 * (fn_entries + var_entries));
1128 targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
1129 * fn_entries);
1130
1131 *target_table = targ_tbl;
1132
1133 new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
1134 new_image->target_data = target_data;
1135 new_image->module = module;
1136 new_image->fns = targ_fns;
1137
1138 pthread_mutex_lock (&dev->image_lock);
1139 new_image->next = dev->images;
1140 dev->images = new_image;
1141 pthread_mutex_unlock (&dev->image_lock);
1142
1143 for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
1144 {
1145 CUfunction function;
1146 int nregs, mthrs;
1147
1148 CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
1149 fn_descs[i].fn);
1150 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
1151 CU_FUNC_ATTRIBUTE_NUM_REGS, function);
1152 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
1153 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);
1154
1155 targ_fns->fn = function;
1156 targ_fns->launch = &fn_descs[i];
1157 targ_fns->regs_per_thread = nregs;
1158 targ_fns->max_threads_per_block = mthrs;
1159
1160 targ_tbl->start = (uintptr_t) targ_fns;
1161 targ_tbl->end = targ_tbl->start + 1;
1162 }
1163
1164 for (j = 0; j < var_entries; j++, targ_tbl++)
1165 {
1166 CUdeviceptr var;
1167 size_t bytes;
1168
1169 CUDA_CALL_ERET (-1, cuModuleGetGlobal,
1170 &var, &bytes, module, var_names[j]);
1171
1172 targ_tbl->start = (uintptr_t) var;
1173 targ_tbl->end = targ_tbl->start + bytes;
1174 }
1175
1176 nvptx_set_clocktick (module, dev);
1177
1178 return fn_entries + var_entries;
1179 }
1180
1181 /* Unload the program described by TARGET_DATA, freeing the function
1182 descriptors allocated by GOMP_OFFLOAD_load_image. */
1183
1184 bool
1185 GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
1186 {
1187 struct ptx_image_data *image, **prev_p;
1188 struct ptx_device *dev = ptx_devices[ord];
1189
1190 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1191 {
1192 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1193 " (expected %u, received %u)",
1194 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1195 return false;
1196 }
1197
1198 bool ret = true;
1199 pthread_mutex_lock (&dev->image_lock);
1200 for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
1201 if (image->target_data == target_data)
1202 {
1203 *prev_p = image->next;
1204 if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
1205 ret = false;
1206 free (image->fns);
1207 free (image);
1208 break;
1209 }
1210 pthread_mutex_unlock (&dev->image_lock);
1211 return ret;
1212 }
1213
1214 void *
1215 GOMP_OFFLOAD_alloc (int ord, size_t size)
1216 {
1217 if (!nvptx_attach_host_thread_to_device (ord))
1218 return NULL;
1219
1220 struct ptx_device *ptx_dev = ptx_devices[ord];
1221 struct ptx_free_block *blocks, *tmp;
1222
1223 pthread_mutex_lock (&ptx_dev->free_blocks_lock);
1224 blocks = ptx_dev->free_blocks;
1225 ptx_dev->free_blocks = NULL;
1226 pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
1227
1228 while (blocks)
1229 {
1230 tmp = blocks->next;
1231 nvptx_free (blocks->ptr, ptx_dev);
1232 free (blocks);
1233 blocks = tmp;
1234 }
1235
1236 return nvptx_alloc (size);
1237 }
1238
1239 bool
1240 GOMP_OFFLOAD_free (int ord, void *ptr)
1241 {
1242 return (nvptx_attach_host_thread_to_device (ord)
1243 && nvptx_free (ptr, ptx_devices[ord]));
1244 }
1245
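/* Execute an OpenACC offload region synchronously: marshal the MAPNUM
argument pointers to the device, launch FN via nvptx_exec on the default
stream, and wait for completion. */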
1246 void
1247 GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
1248 void **hostaddrs, void **devaddrs,
1249 unsigned *dims, void *targ_mem_desc)
1250 {
1251 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
1252
1253 void **hp = NULL;
1254 CUdeviceptr dp = 0;
1255
1256 if (mapnum > 0)
1257 {
1258 hp = alloca (mapnum * sizeof (void *));
1259 for (int i = 0; i < mapnum; i++)
1260 hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
1261 CUDA_CALL_ASSERT (cuMemAlloc, &dp, mapnum * sizeof (void *));
1262 }
1263
1264 /* Copy the (device) pointers to arguments to the device (dp and hp might in
1265 fact have the same value on a unified-memory system). */
1266 if (mapnum > 0)
1267 CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, (void *) hp,
1268 mapnum * sizeof (void *));
1269
1270 nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
1271 dp, NULL);
1272
1273 CUresult r = CUDA_CALL_NOCHECK (cuStreamSynchronize, NULL);
1274 const char *maybe_abort_msg = "(perhaps abort was called)";
1275 if (r == CUDA_ERROR_LAUNCH_FAILED)
1276 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
1277 maybe_abort_msg);
1278 else if (r != CUDA_SUCCESS)
1279 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
1280 CUDA_CALL_ASSERT (cuMemFree, dp);
1281 }
1282
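/* Stream callback used by GOMP_OFFLOAD_openacc_async_exec to release the
argument block once the asynchronous launch no longer needs it. */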
1283 static void
1284 cuda_free_argmem (void *ptr)
1285 {
1286 void **block = (void **) ptr;
1287 nvptx_free (block[0], (struct ptx_device *) block[1]);
1288 free (block);
1289 }
1290
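/* Asynchronous variant of GOMP_OFFLOAD_openacc_exec: the argument pointers
are copied and the kernel launched on AQ's CUDA stream, with cleanup of the
argument block deferred to a stream callback. */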
1291 void
1292 GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *), size_t mapnum,
1293 void **hostaddrs, void **devaddrs,
1294 unsigned *dims, void *targ_mem_desc,
1295 struct goacc_asyncqueue *aq)
1296 {
1297 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
1298
1299 void **hp = NULL;
1300 CUdeviceptr dp = 0;
1301 void **block = NULL;
1302
1303 if (mapnum > 0)
1304 {
1305 block = (void **) GOMP_PLUGIN_malloc ((mapnum + 2) * sizeof (void *));
1306 hp = block + 2;
1307 for (int i = 0; i < mapnum; i++)
1308 hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
1309 CUDA_CALL_ASSERT (cuMemAlloc, &dp, mapnum * sizeof (void *));
1310 }
1311
1312 /* Copy the (device) pointers to arguments to the device (dp and hp might in
1313 fact have the same value on a unified-memory system). */
1314 if (mapnum > 0)
1315 {
1316 CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, dp, (void *) hp,
1317 mapnum * sizeof (void *), aq->cuda_stream);
1318 block[0] = (void *) dp;
1319
1320 struct nvptx_thread *nvthd =
1321 (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
1322 block[1] = (void *) nvthd->ptx_dev;
1323 }
1324 nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
1325 dp, aq->cuda_stream);
1326
1327 if (mapnum > 0)
1328 GOMP_OFFLOAD_openacc_async_queue_callback (aq, cuda_free_argmem, block);
1329 }
1330
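/* Create the per-thread OpenACC data for device ORD, pushing the device's
CUDA context if the host thread does not have one yet. */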
1331 void *
1332 GOMP_OFFLOAD_openacc_create_thread_data (int ord)
1333 {
1334 struct ptx_device *ptx_dev;
1335 struct nvptx_thread *nvthd
1336 = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
1337 CUcontext thd_ctx;
1338
1339 ptx_dev = ptx_devices[ord];
1340
1341 assert (ptx_dev);
1342
1343 CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);
1344
1345 assert (ptx_dev->ctx);
1346
1347 if (!thd_ctx)
1348 CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);
1349
1350 nvthd->ptx_dev = ptx_dev;
1351
1352 return (void *) nvthd;
1353 }
1354
1355 void
1356 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
1357 {
1358 free (data);
1359 }
1360
1361 void *
1362 GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
1363 {
1364 return nvptx_get_current_cuda_device ();
1365 }
1366
1367 void *
1368 GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
1369 {
1370 return nvptx_get_current_cuda_context ();
1371 }
1372
1373 /* This returns a CUstream. */
1374 void *
1375 GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *aq)
1376 {
1377 return (void *) aq->cuda_stream;
1378 }
1379
1380 /* This takes a CUstream. */
1381 int
1382 GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream)
1383 {
1384 if (aq->cuda_stream)
1385 {
1386 CUDA_CALL_ASSERT (cuStreamSynchronize, aq->cuda_stream);
1387 CUDA_CALL_ASSERT (cuStreamDestroy, aq->cuda_stream);
1388 }
1389
1390 aq->cuda_stream = (CUstream) stream;
1391 return 1;
1392 }
1393
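/* Construct a new asynchronous queue backed by its own CUDA stream. */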
1394 struct goacc_asyncqueue *
1395 GOMP_OFFLOAD_openacc_async_construct (void)
1396 {
1397 CUstream stream = NULL;
1398 CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, CU_STREAM_DEFAULT);
1399
1400 struct goacc_asyncqueue *aq
1401 = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue));
1402 aq->cuda_stream = stream;
1403 return aq;
1404 }
1405
1406 bool
1407 GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
1408 {
1409 CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream);
1410 free (aq);
1411 return true;
1412 }
1413
1414 int
1415 GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
1416 {
1417 CUresult r = CUDA_CALL_NOCHECK (cuStreamQuery, aq->cuda_stream);
1418 if (r == CUDA_SUCCESS)
1419 return 1;
1420 if (r == CUDA_ERROR_NOT_READY)
1421 return 0;
1422
1423 GOMP_PLUGIN_error ("cuStreamQuery error: %s", cuda_error (r));
1424 return -1;
1425 }
1426
1427 bool
1428 GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
1429 {
1430 CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream);
1431 return true;
1432 }
1433
1434 bool
1435 GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
1436 struct goacc_asyncqueue *aq2)
1437 {
1438 CUevent e;
1439 CUDA_CALL_ERET (false, cuEventCreate, &e, CU_EVENT_DISABLE_TIMING);
1440 CUDA_CALL_ERET (false, cuEventRecord, e, aq1->cuda_stream);
1441 CUDA_CALL_ERET (false, cuStreamWaitEvent, aq2->cuda_stream, e, 0);
1442 return true;
1443 }
1444
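/* Trampoline registered with cuStreamAddCallback: check the stream status,
invoke the user callback, and free its descriptor. */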
1445 static void
1446 cuda_callback_wrapper (CUstream stream, CUresult res, void *ptr)
1447 {
1448 if (res != CUDA_SUCCESS)
1449 GOMP_PLUGIN_fatal ("%s error: %s", __FUNCTION__, cuda_error (res));
1450 struct nvptx_callback *cb = (struct nvptx_callback *) ptr;
1451 cb->fn (cb->ptr);
1452 free (ptr);
1453 }
1454
1455 void
1456 GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
1457 void (*callback_fn)(void *),
1458 void *userptr)
1459 {
1460 struct nvptx_callback *b = GOMP_PLUGIN_malloc (sizeof (*b));
1461 b->fn = callback_fn;
1462 b->ptr = userptr;
1463 b->aq = aq;
1464 CUDA_CALL_ASSERT (cuStreamAddCallback, aq->cuda_stream,
1465 cuda_callback_wrapper, (void *) b, 0);
1466 }
1467
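/* Validate host address H, device address D, and size S before a memory
copy: both addresses must be non-NULL and distinct, and D must lie within a
device allocation large enough to hold S bytes. A zero size is always
accepted. */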
1468 static bool
1469 cuda_memcpy_sanity_check (const void *h, const void *d, size_t s)
1470 {
1471 CUdeviceptr pb;
1472 size_t ps;
1473 if (!s)
1474 return true;
1475 if (!d)
1476 {
1477 GOMP_PLUGIN_error ("invalid device address");
1478 return false;
1479 }
1480 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
1481 if (!pb)
1482 {
1483 GOMP_PLUGIN_error ("invalid device address");
1484 return false;
1485 }
1486 if (!h)
1487 {
1488 GOMP_PLUGIN_error ("invalid host address");
1489 return false;
1490 }
1491 if (d == h)
1492 {
1493 GOMP_PLUGIN_error ("invalid host or device address");
1494 return false;
1495 }
1496 if ((void *)(d + s) > (void *)(pb + ps))
1497 {
1498 GOMP_PLUGIN_error ("invalid size");
1499 return false;
1500 }
1501 return true;
1502 }
1503
1504 bool
1505 GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
1506 {
1507 if (!nvptx_attach_host_thread_to_device (ord)
1508 || !cuda_memcpy_sanity_check (src, dst, n))
1509 return false;
1510 CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) dst, src, n);
1511 return true;
1512 }
1513
1514 bool
1515 GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
1516 {
1517 if (!nvptx_attach_host_thread_to_device (ord)
1518 || !cuda_memcpy_sanity_check (dst, src, n))
1519 return false;
1520 CUDA_CALL (cuMemcpyDtoH, dst, (CUdeviceptr) src, n);
1521 return true;
1522 }
1523
1524 bool
1525 GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
1526 {
1527 CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, NULL);
1528 return true;
1529 }
1530
1531 bool
1532 GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src,
1533 size_t n, struct goacc_asyncqueue *aq)
1534 {
1535 if (!nvptx_attach_host_thread_to_device (ord)
1536 || !cuda_memcpy_sanity_check (src, dst, n))
1537 return false;
1538 CUDA_CALL (cuMemcpyHtoDAsync, (CUdeviceptr) dst, src, n, aq->cuda_stream);
1539 return true;
1540 }
1541
1542 bool
1543 GOMP_OFFLOAD_openacc_async_dev2host (int ord, void *dst, const void *src,
1544 size_t n, struct goacc_asyncqueue *aq)
1545 {
1546 if (!nvptx_attach_host_thread_to_device (ord)
1547 || !cuda_memcpy_sanity_check (dst, src, n))
1548 return false;
1549 CUDA_CALL (cuMemcpyDtoHAsync, dst, (CUdeviceptr) src, n, aq->cuda_stream);
1550 return true;
1551 }
1552
1553 /* Adjust launch dimensions: pick good values for the number of blocks and
1554 warps and ensure that the number of warps exceeds neither the CUDA limits
1555 nor GCC's own limits. */
1556
1557 static void
1558 nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
1559 struct ptx_device *ptx_dev,
1560 int *teams_p, int *threads_p)
1561 {
1562 int max_warps_block = fn->max_threads_per_block / 32;
1563 /* A maximum of 32 warps per block is an implementation limit in the NVPTX
1564 backend and libgcc, matching the documented limit of all GPUs as of 2015. */
1565 if (max_warps_block > 32)
1566 max_warps_block = 32;
1567 if (*threads_p <= 0)
1568 *threads_p = 8;
1569 if (*threads_p > max_warps_block)
1570 *threads_p = max_warps_block;
1571
1572 int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
1573 /* This is an estimate of how many blocks the device can host simultaneously.
1574 The actual limit, which may be lower, can be queried via the "occupancy
1575 control" driver interface (available since CUDA 6.0). */
1576 int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
1577 if (*teams_p <= 0 || *teams_p > max_blocks)
1578 *teams_p = max_blocks;
1579 }
1580
1581 /* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
1582 target regions. */
1583
1584 static size_t
1585 nvptx_stacks_size ()
1586 {
1587 return 128 * 1024;
1588 }
1589
1590 /* Return contiguous storage for NUM stacks, each SIZE bytes. */
1591
1592 static void *
1593 nvptx_stacks_alloc (size_t size, int num)
1594 {
1595 CUdeviceptr stacks;
1596 CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &stacks, size * num);
1597 if (r != CUDA_SUCCESS)
1598 GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
1599 return (void *) stacks;
1600 }
1601
1602 /* Release storage previously allocated by nvptx_stacks_alloc. */
1603
1604 static void
1605 nvptx_stacks_free (void *p, int num)
1606 {
1607 CUresult r = CUDA_CALL_NOCHECK (cuMemFree, (CUdeviceptr) p);
1608 if (r != CUDA_SUCCESS)
1609 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
1610 }
1611
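/* Run the OpenMP target region TGT_FN on device ORD with argument block
TGT_VARS. ARGS carries the requested num_teams and thread_limit, which are
adjusted to the device's launch bounds; a per-warp soft-stack area is
allocated for the launch and freed after the kernel has completed. */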
1612 void
1613 GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
1614 {
1615 CUfunction function = ((struct targ_fn_descriptor *) tgt_fn)->fn;
1616 CUresult r;
1617 struct ptx_device *ptx_dev = ptx_devices[ord];
1618 const char *maybe_abort_msg = "(perhaps abort was called)";
1619 int teams = 0, threads = 0;
1620
1621 if (!args)
1622 GOMP_PLUGIN_fatal ("No target arguments provided");
1623 while (*args)
1624 {
1625 intptr_t id = (intptr_t) *args++, val;
1626 if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
1627 val = (intptr_t) *args++;
1628 else
1629 val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
1630 if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
1631 continue;
1632 val = val > INT_MAX ? INT_MAX : val;
1633 id &= GOMP_TARGET_ARG_ID_MASK;
1634 if (id == GOMP_TARGET_ARG_NUM_TEAMS)
1635 teams = val;
1636 else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
1637 threads = val;
1638 }
1639 nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
1640
1641 size_t stack_size = nvptx_stacks_size ();
1642 void *stacks = nvptx_stacks_alloc (stack_size, teams * threads);
1643 void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
1644 size_t fn_args_size = sizeof fn_args;
1645 void *config[] = {
1646 CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
1647 CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
1648 CU_LAUNCH_PARAM_END
1649 };
1650 r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
1651 32, threads, 1, 0, NULL, NULL, config);
1652 if (r != CUDA_SUCCESS)
1653 GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
1654
1655 r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
1656 if (r == CUDA_ERROR_LAUNCH_FAILED)
1657 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
1658 maybe_abort_msg);
1659 else if (r != CUDA_SUCCESS)
1660 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
1661 nvptx_stacks_free (stacks, teams * threads);
1662 }
1663
1664 void
1665 GOMP_OFFLOAD_async_run (int ord, void *tgt_fn, void *tgt_vars, void **args,
1666 void *async_data)
1667 {
1668 GOMP_PLUGIN_fatal ("GOMP_OFFLOAD_async_run unimplemented");
1669 }