/* Plugin for NVPTX execution.

   Copyright (C) 2013-2020 Free Software Foundation, Inc.

   Contributed by Mentor Embedded.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Nvidia PTX-specific parts of OpenACC support.  The cuda driver
   library appears to hold some implicit state, but the documentation
   is not clear as to what that state might be.  Or how one might
   propagate it from one thread to another.  */

#define _GNU_SOURCE
#include "openacc.h"
#include "config.h"
#include "libgomp-plugin.h"
#include "oacc-plugin.h"
#include "gomp-constants.h"
#include "oacc-int.h"

#include <pthread.h>
#include <cuda.h>
#include <stdbool.h>
#include <limits.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include <assert.h>
#include <errno.h>

#if CUDA_VERSION < 6000
extern CUresult cuGetErrorString (CUresult, const char **);
#define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
#endif

#if CUDA_VERSION >= 6050
#undef cuLinkCreate
#undef cuLinkAddData
CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
			const char *, unsigned, CUjit_option *, void **);
CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
#else
typedef size_t (*CUoccupancyB2DSize)(int);
CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
			   const char *, unsigned, CUjit_option *, void **);
CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
					  CUoccupancyB2DSize, size_t, int);
#endif

#define DO_PRAGMA(x) _Pragma (#x)

#if PLUGIN_NVPTX_DYNAMIC
# include <dlfcn.h>

struct cuda_lib_s {

# define CUDA_ONE_CALL(call)			\
  __typeof (call) *call;
# define CUDA_ONE_CALL_MAYBE_NULL(call)		\
  CUDA_ONE_CALL (call)
#include "cuda-lib.def"
# undef CUDA_ONE_CALL
# undef CUDA_ONE_CALL_MAYBE_NULL

} cuda_lib;

/* -1 if init_cuda_lib has not been called yet, false
   if it has been and failed, true if it has been and succeeded.  */
static signed char cuda_lib_inited = -1;

/* Dynamically load the CUDA runtime library and initialize function
   pointers, return false if unsuccessful, true if successful.  */
static bool
init_cuda_lib (void)
{
  if (cuda_lib_inited != -1)
    return cuda_lib_inited;
  const char *cuda_runtime_lib = "libcuda.so.1";
  void *h = dlopen (cuda_runtime_lib, RTLD_LAZY);
  cuda_lib_inited = false;
  if (h == NULL)
    return false;

# define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false)
# define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true)
# define CUDA_ONE_CALL_1(call, allow_null)	\
  cuda_lib.call = dlsym (h, #call);		\
  if (!allow_null && cuda_lib.call == NULL)	\
    return false;
#include "cuda-lib.def"
# undef CUDA_ONE_CALL
# undef CUDA_ONE_CALL_1
# undef CUDA_ONE_CALL_MAYBE_NULL

  cuda_lib_inited = true;
  return true;
}
# define CUDA_CALL_PREFIX cuda_lib.
#else

# define CUDA_ONE_CALL(call)
# define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call)
#include "cuda-lib.def"
#undef CUDA_ONE_CALL_MAYBE_NULL
#undef CUDA_ONE_CALL

# define CUDA_CALL_PREFIX
# define init_cuda_lib() true
#endif
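
/* The 'cuda-lib.def' X-macro list above drives both modes: when built with
   PLUGIN_NVPTX_DYNAMIC, each CUDA entry point becomes a function pointer
   looked up via dlsym, otherwise the entry points are referenced directly
   (with the "maybe null" ones declared weak).  As an illustrative sketch
   (not part of the plugin), a hypothetical one-entry list such as
   'CUDA_ONE_CALL (cuMemAlloc)' would expand in the dynamic case roughly to:

     struct cuda_lib_s {
       __typeof (cuMemAlloc) *cuMemAlloc;
     } cuda_lib;

   and calls are then routed through CUDA_CALL_PREFIX, i.e.
   'cuda_lib.cuMemAlloc (...)'.  */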

#include "secure_getenv.h"

#undef MIN
#undef MAX
#define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
#define MAX(X,Y) ((X) > (Y) ? (X) : (Y))

/* Convenience macros for the frequently used CUDA library call and
   error handling sequence as well as CUDA library calls that
   do the error checking themselves or don't do it at all.  */

#define CUDA_CALL_ERET(ERET, FN, ...)		\
  do {						\
    unsigned __r				\
      = CUDA_CALL_PREFIX FN (__VA_ARGS__);	\
    if (__r != CUDA_SUCCESS)			\
      {						\
	GOMP_PLUGIN_error (#FN " error: %s",	\
			   cuda_error (__r));	\
	return ERET;				\
      }						\
  } while (0)

#define CUDA_CALL(FN, ...)			\
  CUDA_CALL_ERET (false, FN, __VA_ARGS__)

#define CUDA_CALL_ASSERT(FN, ...)		\
  do {						\
    unsigned __r				\
      = CUDA_CALL_PREFIX FN (__VA_ARGS__);	\
    if (__r != CUDA_SUCCESS)			\
      {						\
	GOMP_PLUGIN_fatal (#FN " error: %s",	\
			   cuda_error (__r));	\
      }						\
  } while (0)

#define CUDA_CALL_NOCHECK(FN, ...)		\
  CUDA_CALL_PREFIX FN (__VA_ARGS__)

#define CUDA_CALL_EXISTS(FN)			\
  CUDA_CALL_PREFIX FN

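/* Usage sketch for the macros above (illustrative only; 'example_alloc' is a
   hypothetical helper, not part of the plugin).  CUDA_CALL_ERET/CUDA_CALL
   report failures via GOMP_PLUGIN_error and return early, CUDA_CALL_ASSERT
   aborts via GOMP_PLUGIN_fatal, and CUDA_CALL_NOCHECK leaves the CUresult to
   the caller.  */
#if 0
static bool
example_alloc (CUdeviceptr *d, size_t size)
{
  CUDA_CALL (cuMemAlloc, d, size);	/* Returns false on failure.  */
  return true;
}
#endif
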
static const char *
cuda_error (CUresult r)
{
  const char *fallback = "unknown cuda error";
  const char *desc;

  if (!CUDA_CALL_EXISTS (cuGetErrorString))
    return fallback;

  r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
  if (r == CUDA_SUCCESS)
    return desc;

  return fallback;
}

/* Version of the CUDA Toolkit in the same MAJOR.MINOR format that is used by
   Nvidia, such as in the 'deviceQuery' program (Nvidia's CUDA samples).  */
static char cuda_driver_version_s[30];

static unsigned int instantiated_devices = 0;
static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;

/* NVPTX/CUDA specific definition of asynchronous queues.  */
struct goacc_asyncqueue
{
  CUstream cuda_stream;
};

struct nvptx_callback
{
  void (*fn) (void *);
  void *ptr;
  struct goacc_asyncqueue *aq;
  struct nvptx_callback *next;
};

/* Thread-specific data for PTX.  */

struct nvptx_thread
{
  /* We currently have this embedded inside the plugin because libgomp manages
     devices through integer target_ids.  This might be better if using an
     opaque target-specific pointer directly from gomp_device_descr.  */
  struct ptx_device *ptx_dev;
};

/* Target data function launch information.  */

struct targ_fn_launch
{
  const char *fn;
  unsigned short dim[GOMP_DIM_MAX];
};

/* Target PTX object information.  */

struct targ_ptx_obj
{
  const char *code;
  size_t size;
};

/* Target data image information.  */

typedef struct nvptx_tdata
{
  const struct targ_ptx_obj *ptx_objs;
  unsigned ptx_num;

  const char *const *var_names;
  unsigned var_num;

  const struct targ_fn_launch *fn_descs;
  unsigned fn_num;
} nvptx_tdata_t;

/* Descriptor of a loaded function.  */

struct targ_fn_descriptor
{
  CUfunction fn;
  const struct targ_fn_launch *launch;
  int regs_per_thread;
  int max_threads_per_block;
};

/* A loaded PTX image.  */
struct ptx_image_data
{
  const void *target_data;
  CUmodule module;

  struct targ_fn_descriptor *fns;  /* Array of functions.  */

  struct ptx_image_data *next;
};

struct ptx_free_block
{
  void *ptr;
  struct ptx_free_block *next;
};

struct ptx_device
{
  CUcontext ctx;
  bool ctx_shared;
  CUdevice dev;

  int ord;
  bool overlap;
  bool map;
  bool concur;
  bool mkern;
  int mode;
  int clock_khz;
  int num_sms;
  int regs_per_block;
  int regs_per_sm;
  int warp_size;
  int max_threads_per_block;
  int max_threads_per_multiprocessor;
  int default_dims[GOMP_DIM_MAX];

  /* Length as used by the CUDA Runtime API ('struct cudaDeviceProp').  */
  char name[256];

  struct ptx_image_data *images;  /* Images loaded on device.  */
  pthread_mutex_t image_lock;     /* Lock for above list.  */

  struct ptx_free_block *free_blocks;
  pthread_mutex_t free_blocks_lock;

  struct ptx_device *next;
};

static struct ptx_device **ptx_devices;

static inline struct nvptx_thread *
nvptx_thread (void)
{
  return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
}

/* Initialize the device.  Return TRUE on success, else FALSE.  PTX_DEV_LOCK
   should be locked on entry and remains locked on exit.  */

static bool
nvptx_init (void)
{
  int ndevs;

  if (instantiated_devices != 0)
    return true;

  if (!init_cuda_lib ())
    return false;

  CUDA_CALL (cuInit, 0);

  int cuda_driver_version;
  CUDA_CALL_ERET (NULL, cuDriverGetVersion, &cuda_driver_version);
  snprintf (cuda_driver_version_s, sizeof cuda_driver_version_s,
	    "CUDA Driver %u.%u",
	    cuda_driver_version / 1000, cuda_driver_version % 1000 / 10);

  CUDA_CALL (cuDeviceGetCount, &ndevs);
  ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
					    * ndevs);

  return true;
}

/* Select the N'th PTX device for the current host thread.  The device must
   have been previously opened before calling this function.  */

static bool
nvptx_attach_host_thread_to_device (int n)
{
  CUdevice dev;
  CUresult r;
  struct ptx_device *ptx_dev;
  CUcontext thd_ctx;

  r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
  if (r == CUDA_ERROR_NOT_PERMITTED)
    {
      /* Assume we're in a CUDA callback, just return true.  */
      return true;
    }
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    {
      GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
      return false;
    }

  if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
    return true;
  else
    {
      CUcontext old_ctx;

      ptx_dev = ptx_devices[n];
      if (!ptx_dev)
	{
	  GOMP_PLUGIN_error ("device %d not found", n);
	  return false;
	}

      CUDA_CALL (cuCtxGetCurrent, &thd_ctx);

      /* We don't necessarily have a current context (e.g. if it has been
	 destroyed).  Pop it if we do though.  */
      if (thd_ctx != NULL)
	CUDA_CALL (cuCtxPopCurrent, &old_ctx);

      CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
    }
  return true;
}

static struct ptx_device *
nvptx_open_device (int n)
{
  struct ptx_device *ptx_dev;
  CUdevice dev, ctx_dev;
  CUresult r;
  int async_engines, pi;

  CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);

  ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));

  ptx_dev->ord = n;
  ptx_dev->dev = dev;
  ptx_dev->ctx_shared = false;

  r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    {
      GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
      return NULL;
    }

  if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
    {
      /* The current host thread has an active context for a different device.
	 Detach it.  */
      CUcontext old_ctx;
      CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
    }

  CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);

  if (!ptx_dev->ctx)
    CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
  else
    ptx_dev->ctx_shared = true;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
  ptx_dev->overlap = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
  ptx_dev->map = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
  ptx_dev->concur = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
  ptx_dev->mode = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
  ptx_dev->mkern = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
  ptx_dev->clock_khz = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
  ptx_dev->num_sms = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
  ptx_dev->regs_per_block = pi;

  /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only
     in CUDA 6.0 and newer.  */
  r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
			 CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
			 dev);
  /* Fallback: use limit of registers per block, which is usually equal.  */
  if (r == CUDA_ERROR_INVALID_VALUE)
    pi = ptx_dev->regs_per_block;
  else if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
      return NULL;
    }
  ptx_dev->regs_per_sm = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
  if (pi != 32)
    {
      GOMP_PLUGIN_error ("Only warp size 32 is supported");
      return NULL;
    }
  ptx_dev->warp_size = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
		  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
  ptx_dev->max_threads_per_block = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
		  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
  ptx_dev->max_threads_per_multiprocessor = pi;

  r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines,
			 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
  if (r != CUDA_SUCCESS)
    async_engines = 1;

  for (int i = 0; i != GOMP_DIM_MAX; i++)
    ptx_dev->default_dims[i] = 0;

  CUDA_CALL_ERET (NULL, cuDeviceGetName, ptx_dev->name, sizeof ptx_dev->name,
		  dev);

  ptx_dev->images = NULL;
  pthread_mutex_init (&ptx_dev->image_lock, NULL);

  ptx_dev->free_blocks = NULL;
  pthread_mutex_init (&ptx_dev->free_blocks_lock, NULL);

  return ptx_dev;
}

static bool
nvptx_close_device (struct ptx_device *ptx_dev)
{
  if (!ptx_dev)
    return true;

  for (struct ptx_free_block *b = ptx_dev->free_blocks; b;)
    {
      struct ptx_free_block *b_next = b->next;
      CUDA_CALL (cuMemFree, (CUdeviceptr) b->ptr);
      free (b);
      b = b_next;
    }

  pthread_mutex_destroy (&ptx_dev->free_blocks_lock);
  pthread_mutex_destroy (&ptx_dev->image_lock);

  if (!ptx_dev->ctx_shared)
    CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);

  free (ptx_dev);
  return true;
}

static int
nvptx_get_num_devices (void)
{
  int n;

  /* PR libgomp/65099: Currently, we only support offloading in 64-bit
     configurations.  */
  if (sizeof (void *) != 8)
    {
      GOMP_PLUGIN_debug (0, "Disabling nvptx offloading;"
			 " only 64-bit configurations are supported\n");
      return 0;
    }

  /* This function will be called before the plugin has been initialized in
     order to enumerate available devices, but CUDA API routines can't be used
     until cuInit has been called.  Just call it now (but don't yet do any
     further initialization).  */
  if (instantiated_devices == 0)
    {
      if (!init_cuda_lib ())
	return 0;
      CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
      /* This is not an error: e.g. we may have CUDA libraries installed but
	 no devices available.  */
      if (r != CUDA_SUCCESS)
	{
	  GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
			     cuda_error (r));
	  return 0;
	}
    }

  CUDA_CALL_ERET (-1, cuDeviceGetCount, &n);
  return n;
}

static void
notify_var (const char *var_name, const char *env_var)
{
  if (env_var == NULL)
    GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name);
  else
    GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var);
}

static void
process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o)
{
  const char *var_name = "GOMP_NVPTX_JIT";
  const char *env_var = secure_getenv (var_name);
  notify_var (var_name, env_var);

  if (env_var == NULL)
    return;

  const char *c = env_var;
  while (*c != '\0')
    {
      while (*c == ' ')
	c++;

      if (c[0] == '-' && c[1] == 'O'
	  && '0' <= c[2] && c[2] <= '4'
	  && (c[3] == '\0' || c[3] == ' '))
	{
	  *gomp_nvptx_o = c[2] - '0';
	  c += 3;
	  continue;
	}

      GOMP_PLUGIN_error ("Error parsing %s", var_name);
      break;
    }
}

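/* For example (illustrative): GOMP_NVPTX_JIT=-O2 in the environment makes the
   parser above store 2 in *gomp_nvptx_o, which link_ptx below forwards to the
   CUDA JIT as CU_JIT_OPTIMIZATION_LEVEL.  Anything other than '-O0'..'-O4' is
   diagnosed via GOMP_PLUGIN_error and parsing stops.  */
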
static bool
link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
	  unsigned num_objs)
{
  CUjit_option opts[7];
  void *optvals[7];
  float elapsed = 0.0;
  char elog[1024];
  char ilog[16384];
  CUlinkState linkstate;
  CUresult r;
  void *linkout;
  size_t linkoutsize __attribute__ ((unused));

  opts[0] = CU_JIT_WALL_TIME;
  optvals[0] = &elapsed;

  opts[1] = CU_JIT_INFO_LOG_BUFFER;
  optvals[1] = &ilog[0];

  opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
  optvals[2] = (void *) sizeof ilog;

  opts[3] = CU_JIT_ERROR_LOG_BUFFER;
  optvals[3] = &elog[0];

  opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
  optvals[4] = (void *) sizeof elog;

  opts[5] = CU_JIT_LOG_VERBOSE;
  optvals[5] = (void *) 1;

  static intptr_t gomp_nvptx_o = -1;

  static bool init_done = false;
  if (!init_done)
    {
      process_GOMP_NVPTX_JIT (&gomp_nvptx_o);
      init_done = true;
    }

  int nopts = 6;
  if (gomp_nvptx_o != -1)
    {
      opts[nopts] = CU_JIT_OPTIMIZATION_LEVEL;
      optvals[nopts] = (void *) gomp_nvptx_o;
      nopts++;
    }

  if (CUDA_CALL_EXISTS (cuLinkCreate_v2))
    CUDA_CALL (cuLinkCreate_v2, nopts, opts, optvals, &linkstate);
  else
    CUDA_CALL (cuLinkCreate, nopts, opts, optvals, &linkstate);

  for (; num_objs--; ptx_objs++)
    {
      /* cuLinkAddData's 'data' argument erroneously omits the const
	 qualifier.  */
      GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
      if (CUDA_CALL_EXISTS (cuLinkAddData_v2))
	r = CUDA_CALL_NOCHECK (cuLinkAddData_v2, linkstate, CU_JIT_INPUT_PTX,
			       (char *) ptx_objs->code, ptx_objs->size,
			       0, 0, 0, 0);
      else
	r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
			       (char *) ptx_objs->code, ptx_objs->size,
			       0, 0, 0, 0);
      if (r != CUDA_SUCCESS)
	{
	  GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
	  GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
			     cuda_error (r));
	  return false;
	}
    }

  GOMP_PLUGIN_debug (0, "Linking\n");
  r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);

  GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
  GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);

  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
      return false;
    }

  CUDA_CALL (cuModuleLoadData, module, linkout);
  CUDA_CALL (cuLinkDestroy, linkstate);
  return true;
}

static void
nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
	    unsigned *dims, void *targ_mem_desc,
	    CUdeviceptr dp, CUstream stream)
{
  struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
  CUfunction function;
  int i;
  void *kargs[1];
  struct nvptx_thread *nvthd = nvptx_thread ();
  int warp_size = nvthd->ptx_dev->warp_size;

  function = targ_fn->fn;

  /* Initialize the launch dimensions.  Typically this is constant,
     provided by the device compiler, but we must permit runtime
     values.  */
  int seen_zero = 0;
  for (i = 0; i != GOMP_DIM_MAX; i++)
    {
      if (targ_fn->launch->dim[i])
	dims[i] = targ_fn->launch->dim[i];
      if (!dims[i])
	seen_zero = 1;
    }

  if (seen_zero)
    {
      pthread_mutex_lock (&ptx_dev_lock);

      static int gomp_openacc_dims[GOMP_DIM_MAX];
      if (!gomp_openacc_dims[0])
	{
	  /* See if the user provided GOMP_OPENACC_DIM environment
	     variable to specify runtime defaults.  */
	  for (int i = 0; i < GOMP_DIM_MAX; ++i)
	    gomp_openacc_dims[i] = GOMP_PLUGIN_acc_default_dim (i);
	}

      if (!nvthd->ptx_dev->default_dims[0])
	{
	  int default_dims[GOMP_DIM_MAX];
	  for (int i = 0; i < GOMP_DIM_MAX; ++i)
	    default_dims[i] = gomp_openacc_dims[i];

	  int gang, worker, vector;
	  {
	    int block_size = nvthd->ptx_dev->max_threads_per_block;
	    int cpu_size = nvthd->ptx_dev->max_threads_per_multiprocessor;
	    int dev_size = nvthd->ptx_dev->num_sms;
	    GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
			       " dev_size=%d, cpu_size=%d\n",
			       warp_size, block_size, dev_size, cpu_size);

	    gang = (cpu_size / block_size) * dev_size;
	    worker = block_size / warp_size;
	    vector = warp_size;
	  }
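	  /* Illustrative numbers (hypothetical device): with
	     block_size = 1024, cpu_size = 2048, dev_size = 80 and
	     warp_size = 32, this yields gang = (2048 / 1024) * 80 = 160,
	     worker = 1024 / 32 = 32, vector = 32.  */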

	  /* There is no upper bound on the gang size.  The best size
	     matches the hardware configuration.  Logical gangs are
	     scheduled onto physical hardware.  To maximize usage, we
	     should guess a large number.  */
	  if (default_dims[GOMP_DIM_GANG] < 1)
	    default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
	  /* The worker size must not exceed the hardware.  */
	  if (default_dims[GOMP_DIM_WORKER] < 1
	      || (default_dims[GOMP_DIM_WORKER] > worker && gang))
	    default_dims[GOMP_DIM_WORKER] = worker;
	  /* The vector size must exactly match the hardware.  */
	  if (default_dims[GOMP_DIM_VECTOR] < 1
	      || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
	    default_dims[GOMP_DIM_VECTOR] = vector;

	  GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
			     default_dims[GOMP_DIM_GANG],
			     default_dims[GOMP_DIM_WORKER],
			     default_dims[GOMP_DIM_VECTOR]);

	  for (i = 0; i != GOMP_DIM_MAX; i++)
	    nvthd->ptx_dev->default_dims[i] = default_dims[i];
	}
      pthread_mutex_unlock (&ptx_dev_lock);

      {
	bool default_dim_p[GOMP_DIM_MAX];
	for (i = 0; i != GOMP_DIM_MAX; i++)
	  default_dim_p[i] = !dims[i];

	if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize))
	  {
	    for (i = 0; i != GOMP_DIM_MAX; i++)
	      if (default_dim_p[i])
		dims[i] = nvthd->ptx_dev->default_dims[i];

	    if (default_dim_p[GOMP_DIM_VECTOR])
	      dims[GOMP_DIM_VECTOR]
		= MIN (dims[GOMP_DIM_VECTOR],
		       (targ_fn->max_threads_per_block / warp_size
			* warp_size));

	    if (default_dim_p[GOMP_DIM_WORKER])
	      dims[GOMP_DIM_WORKER]
		= MIN (dims[GOMP_DIM_WORKER],
		       targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
	  }
	else
	  {
	    /* Handle the case that the compiler allows the runtime to choose
	       the vector-length conservatively, by ignoring
	       gomp_openacc_dims[GOMP_DIM_VECTOR].  TODO: actually handle
	       it.  */
	    int vectors = 0;
	    /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that
	       gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
	       exceed targ_fn->max_threads_per_block.  */
	    int workers = gomp_openacc_dims[GOMP_DIM_WORKER];
	    int gangs = gomp_openacc_dims[GOMP_DIM_GANG];
	    int grids, blocks;

	    CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
			      &blocks, function, NULL, 0,
			      dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
	    GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
			       "grid = %d, block = %d\n", grids, blocks);

	    /* Keep the num_gangs proportional to the block size.  In
	       the case where a block size is limited by shared-memory
	       or the register file capacity, the runtime will not
	       excessively over-assign gangs to the multiprocessor
	       units if their state is going to be swapped out even
	       more than necessary.  The constant factor 2 is there to
	       prevent threads from idling when there is insufficient
	       work for them.  */
	    if (gangs == 0)
	      gangs = 2 * grids * (blocks / warp_size);

	    if (vectors == 0)
	      vectors = warp_size;

	    if (workers == 0)
	      {
		int actual_vectors = (default_dim_p[GOMP_DIM_VECTOR]
				      ? vectors
				      : dims[GOMP_DIM_VECTOR]);
		workers = blocks / actual_vectors;
		workers = MAX (workers, 1);
		/* If we need a per-worker barrier ... .  */
		if (actual_vectors > 32)
		  /* Don't use more barriers than available.  */
		  workers = MIN (workers, 15);
	      }

	    for (i = 0; i != GOMP_DIM_MAX; i++)
	      if (default_dim_p[i])
		switch (i)
		  {
		  case GOMP_DIM_GANG: dims[i] = gangs; break;
		  case GOMP_DIM_WORKER: dims[i] = workers; break;
		  case GOMP_DIM_VECTOR: dims[i] = vectors; break;
		  default: GOMP_PLUGIN_fatal ("invalid dim");
		  }
	  }
      }
    }

  /* Check if the accelerator has sufficient hardware resources to
     launch the offloaded kernel.  */
  if (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]
      > targ_fn->max_threads_per_block)
    {
      const char *msg
	= ("The Nvidia accelerator has insufficient resources to launch '%s'"
	   " with num_workers = %d and vector_length = %d"
	   "; "
	   "recompile the program with 'num_workers = x and vector_length = y'"
	   " on that offloaded region or '-fopenacc-dim=:x:y' where"
	   " x * y <= %d"
	   ".\n");
      GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
			 dims[GOMP_DIM_VECTOR], targ_fn->max_threads_per_block);
    }

  /* Check if the accelerator has sufficient barrier resources to
     launch the offloaded kernel.  */
  if (dims[GOMP_DIM_WORKER] > 15 && dims[GOMP_DIM_VECTOR] > 32)
    {
      const char *msg
	= ("The Nvidia accelerator has insufficient barrier resources to launch"
	   " '%s' with num_workers = %d and vector_length = %d"
	   "; "
	   "recompile the program with 'num_workers = x' on that offloaded"
	   " region or '-fopenacc-dim=:x:' where x <= 15"
	   "; "
	   "or, recompile the program with 'vector_length = 32' on that"
	   " offloaded region or '-fopenacc-dim=::32'"
	   ".\n");
      GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
			 dims[GOMP_DIM_VECTOR]);
    }

  GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
		     " gangs=%u, workers=%u, vectors=%u\n",
		     __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
		     dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);

  // OpenACC         CUDA
  //
  // num_gangs       nctaid.x
  // num_workers     ntid.y
  // vector length   ntid.x
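  //
  // (i.e. the cuLaunchKernel call below passes num_gangs as the grid
  // x-dimension, and vector length/num_workers as the block x/y-dimensions.)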

  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  acc_prof_info *prof_info = thr->prof_info;
  acc_event_info enqueue_launch_event_info;
  acc_api_info *api_info = thr->api_info;
  bool profiling_p = __builtin_expect (prof_info != NULL, false);
  if (profiling_p)
    {
      prof_info->event_type = acc_ev_enqueue_launch_start;

      enqueue_launch_event_info.launch_event.event_type
	= prof_info->event_type;
      enqueue_launch_event_info.launch_event.valid_bytes
	= _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
      enqueue_launch_event_info.launch_event.parent_construct
	= acc_construct_parallel;
      enqueue_launch_event_info.launch_event.implicit = 1;
      enqueue_launch_event_info.launch_event.tool_info = NULL;
      enqueue_launch_event_info.launch_event.kernel_name = targ_fn->launch->fn;
      enqueue_launch_event_info.launch_event.num_gangs
	= dims[GOMP_DIM_GANG];
      enqueue_launch_event_info.launch_event.num_workers
	= dims[GOMP_DIM_WORKER];
      enqueue_launch_event_info.launch_event.vector_length
	= dims[GOMP_DIM_VECTOR];

      api_info->device_api = acc_device_api_cuda;

      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
					    api_info);
    }

  kargs[0] = &dp;
  CUDA_CALL_ASSERT (cuLaunchKernel, function,
		    dims[GOMP_DIM_GANG], 1, 1,
		    dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
		    0, stream, kargs, 0);

  if (profiling_p)
    {
      prof_info->event_type = acc_ev_enqueue_launch_end;
      enqueue_launch_event_info.launch_event.event_type
	= prof_info->event_type;
      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
					    api_info);
    }

  GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__,
		     targ_fn->launch->fn);
}

void * openacc_get_current_cuda_context (void);

static void
goacc_profiling_acc_ev_alloc (struct goacc_thread *thr, void *dp, size_t s)
{
  acc_prof_info *prof_info = thr->prof_info;
  acc_event_info data_event_info;
  acc_api_info *api_info = thr->api_info;

  prof_info->event_type = acc_ev_alloc;

  data_event_info.data_event.event_type = prof_info->event_type;
  data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
  data_event_info.data_event.parent_construct = acc_construct_parallel;
  data_event_info.data_event.implicit = 1;
  data_event_info.data_event.tool_info = NULL;
  data_event_info.data_event.var_name = NULL;
  data_event_info.data_event.bytes = s;
  data_event_info.data_event.host_ptr = NULL;
  data_event_info.data_event.device_ptr = dp;

  api_info->device_api = acc_device_api_cuda;

  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
}

static void *
nvptx_alloc (size_t s)
{
  CUdeviceptr d;

  CUDA_CALL_ERET (NULL, cuMemAlloc, &d, s);
  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  bool profiling_p
    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
  if (profiling_p)
    goacc_profiling_acc_ev_alloc (thr, (void *) d, s);

  return (void *) d;
}

static void
goacc_profiling_acc_ev_free (struct goacc_thread *thr, void *p)
{
  acc_prof_info *prof_info = thr->prof_info;
  acc_event_info data_event_info;
  acc_api_info *api_info = thr->api_info;

  prof_info->event_type = acc_ev_free;

  data_event_info.data_event.event_type = prof_info->event_type;
  data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
  data_event_info.data_event.parent_construct = acc_construct_parallel;
  data_event_info.data_event.implicit = 1;
  data_event_info.data_event.tool_info = NULL;
  data_event_info.data_event.var_name = NULL;
  data_event_info.data_event.bytes = -1;
  data_event_info.data_event.host_ptr = NULL;
  data_event_info.data_event.device_ptr = p;

  api_info->device_api = acc_device_api_cuda;

  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
}

static bool
nvptx_free (void *p, struct ptx_device *ptx_dev)
{
  /* Assume callback context if this is null.  */
  if (GOMP_PLUGIN_acc_thread () == NULL)
    {
      struct ptx_free_block *n
	= GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block));
      n->ptr = p;
      pthread_mutex_lock (&ptx_dev->free_blocks_lock);
      n->next = ptx_dev->free_blocks;
      ptx_dev->free_blocks = n;
      pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
      return true;
    }

  CUdeviceptr pb;
  size_t ps;

  CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) p);
  if ((CUdeviceptr) p != pb)
    {
      GOMP_PLUGIN_error ("invalid device address");
      return false;
    }

  CUDA_CALL (cuMemFree, (CUdeviceptr) p);
  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  bool profiling_p
    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
  if (profiling_p)
    goacc_profiling_acc_ev_free (thr, p);

  return true;
}

static void *
nvptx_get_current_cuda_device (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return &nvthd->ptx_dev->dev;
}

static void *
nvptx_get_current_cuda_context (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return nvthd->ptx_dev->ctx;
}

/* Plugin entry points.  */

const char *
GOMP_OFFLOAD_get_name (void)
{
  return "nvptx";
}

unsigned int
GOMP_OFFLOAD_get_caps (void)
{
  return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
}

int
GOMP_OFFLOAD_get_type (void)
{
  return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
}

int
GOMP_OFFLOAD_get_num_devices (void)
{
  return nvptx_get_num_devices ();
}

bool
GOMP_OFFLOAD_init_device (int n)
{
  struct ptx_device *dev;

  pthread_mutex_lock (&ptx_dev_lock);

  if (!nvptx_init () || ptx_devices[n] != NULL)
    {
      pthread_mutex_unlock (&ptx_dev_lock);
      return false;
    }

  dev = nvptx_open_device (n);
  if (dev)
    {
      ptx_devices[n] = dev;
      instantiated_devices++;
    }

  pthread_mutex_unlock (&ptx_dev_lock);

  return dev != NULL;
}

bool
GOMP_OFFLOAD_fini_device (int n)
{
  pthread_mutex_lock (&ptx_dev_lock);

  if (ptx_devices[n] != NULL)
    {
      if (!nvptx_attach_host_thread_to_device (n)
	  || !nvptx_close_device (ptx_devices[n]))
	{
	  pthread_mutex_unlock (&ptx_dev_lock);
	  return false;
	}
      ptx_devices[n] = NULL;
      instantiated_devices--;
    }

  if (instantiated_devices == 0)
    {
      free (ptx_devices);
      ptx_devices = NULL;
    }

  pthread_mutex_unlock (&ptx_dev_lock);
  return true;
}

/* Return the libgomp version number we're compatible with.  There is
   no requirement for cross-version compatibility.  */

unsigned
GOMP_OFFLOAD_version (void)
{
  return GOMP_VERSION;
}

/* Initialize __nvptx_clocktick, if present in MODULE.  */

static void
nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
{
  CUdeviceptr dptr;
  CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
				  module, "__nvptx_clocktick");
  if (r == CUDA_ERROR_NOT_FOUND)
    return;
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
  double __nvptx_clocktick = 1e-3 / dev->clock_khz;
  r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
			 sizeof (__nvptx_clocktick));
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
}

/* Load the (partial) program described by TARGET_DATA to device
   number ORD.  Allocate and return TARGET_TABLE.  */

int
GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
			 struct addr_pair **target_table)
{
  CUmodule module;
  const char *const *var_names;
  const struct targ_fn_launch *fn_descs;
  unsigned int fn_entries, var_entries, i, j;
  struct targ_fn_descriptor *targ_fns;
  struct addr_pair *targ_tbl;
  const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
  struct ptx_image_data *new_image;
  struct ptx_device *dev;

  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
			 " (expected %u, received %u)",
			 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
      return -1;
    }

  if (!nvptx_attach_host_thread_to_device (ord)
      || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
    return -1;

  dev = ptx_devices[ord];

  /* The mkoffload utility emits a struct of pointers/integers at the
     start of each offload image.  The array of kernel names and the
     function addresses form a one-to-one correspondence.  */
41dbbb37 1238
a4cb876d
NS
1239 var_entries = img_header->var_num;
1240 var_names = img_header->var_names;
1241 fn_entries = img_header->fn_num;
3e32ee19 1242 fn_descs = img_header->fn_descs;
41dbbb37 1243
f3e9a059
NS
1244 targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
1245 * (fn_entries + var_entries));
41dbbb37
TS
1246 targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
1247 * fn_entries);
1248
f3e9a059
NS
1249 *target_table = targ_tbl;
1250
1251 new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
1252 new_image->target_data = target_data;
1253 new_image->module = module;
1254 new_image->fns = targ_fns;
1255
1256 pthread_mutex_lock (&dev->image_lock);
1257 new_image->next = dev->images;
1258 dev->images = new_image;
1259 pthread_mutex_unlock (&dev->image_lock);
1260
1261 for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
41dbbb37
TS
1262 {
1263 CUfunction function;
6103184e 1264 int nregs, mthrs;
41dbbb37 1265
6ce13072
CLT
1266 CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
1267 fn_descs[i].fn);
6103184e
AM
1268 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
1269 CU_FUNC_ATTRIBUTE_NUM_REGS, function);
1270 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
1271 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);
41dbbb37 1272
f3e9a059 1273 targ_fns->fn = function;
3e32ee19 1274 targ_fns->launch = &fn_descs[i];
6103184e
AM
1275 targ_fns->regs_per_thread = nregs;
1276 targ_fns->max_threads_per_block = mthrs;
41dbbb37 1277
f3e9a059
NS
1278 targ_tbl->start = (uintptr_t) targ_fns;
1279 targ_tbl->end = targ_tbl->start + 1;
41dbbb37
TS
1280 }
1281
f3e9a059 1282 for (j = 0; j < var_entries; j++, targ_tbl++)
d93bdab5
JB
1283 {
1284 CUdeviceptr var;
1285 size_t bytes;
1286
6ce13072
CLT
1287 CUDA_CALL_ERET (-1, cuModuleGetGlobal,
1288 &var, &bytes, module, var_names[j]);
d93bdab5 1289
f3e9a059
NS
1290 targ_tbl->start = (uintptr_t) var;
1291 targ_tbl->end = targ_tbl->start + bytes;
d93bdab5
JB
1292 }
1293
6103184e
AM
1294 nvptx_set_clocktick (module, dev);
1295
f3e9a059 1296 return fn_entries + var_entries;
d93bdab5
JB
1297}
1298
f3e9a059
NS
1299/* Unload the program described by TARGET_DATA. DEV_DATA is the
1300 function descriptors allocated by G_O_load_image. */
1301
6ce13072 1302bool
2a21ff19 1303GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
d93bdab5 1304{
f3e9a059
NS
1305 struct ptx_image_data *image, **prev_p;
1306 struct ptx_device *dev = ptx_devices[ord];
1307
2a21ff19 1308 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
6ce13072
CLT
1309 {
1310 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1311 " (expected %u, received %u)",
1312 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1313 return false;
1314 }
1315
1316 bool ret = true;
f3e9a059
NS
1317 pthread_mutex_lock (&dev->image_lock);
1318 for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
1319 if (image->target_data == target_data)
1320 {
1321 *prev_p = image->next;
2393d337 1322 if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
6ce13072 1323 ret = false;
f3e9a059
NS
1324 free (image->fns);
1325 free (image);
1326 break;
1327 }
1328 pthread_mutex_unlock (&dev->image_lock);
6ce13072 1329 return ret;
41dbbb37
TS
1330}
1331
1332void *
d93bdab5 1333GOMP_OFFLOAD_alloc (int ord, size_t size)
41dbbb37 1334{
6ce13072
CLT
1335 if (!nvptx_attach_host_thread_to_device (ord))
1336 return NULL;
41dbbb37 1337
1f4c5b9b
CLT
1338 struct ptx_device *ptx_dev = ptx_devices[ord];
1339 struct ptx_free_block *blocks, *tmp;
41dbbb37 1340
1f4c5b9b
CLT
1341 pthread_mutex_lock (&ptx_dev->free_blocks_lock);
1342 blocks = ptx_dev->free_blocks;
1343 ptx_dev->free_blocks = NULL;
1344 pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
41dbbb37 1345
1f4c5b9b
CLT
1346 while (blocks)
1347 {
1348 tmp = blocks->next;
1349 nvptx_free (blocks->ptr, ptx_dev);
1350 free (blocks);
1351 blocks = tmp;
1352 }
1353
1354 return nvptx_alloc (size);
41dbbb37
TS
1355}
1356
6103184e 1357bool
1f4c5b9b 1358GOMP_OFFLOAD_free (int ord, void *ptr)
6103184e 1359{
1f4c5b9b
CLT
1360 return (nvptx_attach_host_thread_to_device (ord)
1361 && nvptx_free (ptr, ptx_devices[ord]));
6103184e
AM
1362}
1363
41dbbb37 1364void
345a8c17
TS
1365GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
1366 void **hostaddrs, void **devaddrs,
1f4c5b9b 1367 unsigned *dims, void *targ_mem_desc)
41dbbb37 1368{
1f4c5b9b 1369 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
41dbbb37 1370
5fae049d
TS
1371 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1372 acc_prof_info *prof_info = thr->prof_info;
1373 acc_event_info data_event_info;
1374 acc_api_info *api_info = thr->api_info;
1375 bool profiling_p = __builtin_expect (prof_info != NULL, false);
1376
1f4c5b9b
CLT
1377 void **hp = NULL;
1378 CUdeviceptr dp = 0;
41dbbb37 1379
1f4c5b9b
CLT
1380 if (mapnum > 0)
1381 {
5fae049d
TS
1382 size_t s = mapnum * sizeof (void *);
1383 hp = alloca (s);
1f4c5b9b
CLT
1384 for (int i = 0; i < mapnum; i++)
1385 hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
5fae049d
TS
1386 CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
1387 if (profiling_p)
1388 goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
1f4c5b9b 1389 }
41dbbb37 1390
1f4c5b9b
CLT
1391 /* Copy the (device) pointers to arguments to the device (dp and hp might in
1392 fact have the same value on a unified-memory system). */
1393 if (mapnum > 0)
5fae049d
TS
1394 {
1395 if (profiling_p)
1396 {
1397 prof_info->event_type = acc_ev_enqueue_upload_start;
1398
1399 data_event_info.data_event.event_type = prof_info->event_type;
1400 data_event_info.data_event.valid_bytes
1401 = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1402 data_event_info.data_event.parent_construct
1403 = acc_construct_parallel;
1404 data_event_info.data_event.implicit = 1; /* Always implicit. */
1405 data_event_info.data_event.tool_info = NULL;
1406 data_event_info.data_event.var_name = NULL;
1407 data_event_info.data_event.bytes = mapnum * sizeof (void *);
1408 data_event_info.data_event.host_ptr = hp;
1409 data_event_info.data_event.device_ptr = (const void *) dp;
1410
1411 api_info->device_api = acc_device_api_cuda;
1412
1413 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1414 api_info);
1415 }
1416 CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, (void *) hp,
1417 mapnum * sizeof (void *));
1418 if (profiling_p)
1419 {
1420 prof_info->event_type = acc_ev_enqueue_upload_end;
1421 data_event_info.data_event.event_type = prof_info->event_type;
1422 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1423 api_info);
1424 }
1425 }
41dbbb37 1426
1f4c5b9b
CLT
1427 nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
1428 dp, NULL);
41dbbb37 1429
1f4c5b9b
CLT
1430 CUresult r = CUDA_CALL_NOCHECK (cuStreamSynchronize, NULL);
1431 const char *maybe_abort_msg = "(perhaps abort was called)";
1432 if (r == CUDA_ERROR_LAUNCH_FAILED)
1433 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
1434 maybe_abort_msg);
1435 else if (r != CUDA_SUCCESS)
1436 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
5fae049d 1437
1f4c5b9b 1438 CUDA_CALL_ASSERT (cuMemFree, dp);
5fae049d
TS
1439 if (profiling_p)
1440 goacc_profiling_acc_ev_free (thr, (void *) dp);
41dbbb37
TS
1441}
1442
1f4c5b9b
CLT
1443static void
1444cuda_free_argmem (void *ptr)
41dbbb37 1445{
1f4c5b9b
CLT
1446 void **block = (void **) ptr;
1447 nvptx_free (block[0], (struct ptx_device *) block[1]);
1448 free (block);
41dbbb37
TS
1449}
1450
1451void
1f4c5b9b
CLT
1452GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *), size_t mapnum,
1453 void **hostaddrs, void **devaddrs,
1454 unsigned *dims, void *targ_mem_desc,
1455 struct goacc_asyncqueue *aq)
41dbbb37 1456{
1f4c5b9b 1457 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
41dbbb37 1458
5fae049d
TS
1459 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1460 acc_prof_info *prof_info = thr->prof_info;
1461 acc_event_info data_event_info;
1462 acc_api_info *api_info = thr->api_info;
1463 bool profiling_p = __builtin_expect (prof_info != NULL, false);
1464
1f4c5b9b
CLT
1465 void **hp = NULL;
1466 CUdeviceptr dp = 0;
1467 void **block = NULL;
41dbbb37 1468
1f4c5b9b
CLT
1469 if (mapnum > 0)
1470 {
5fae049d
TS
1471 size_t s = mapnum * sizeof (void *);
1472 block = (void **) GOMP_PLUGIN_malloc (2 * sizeof (void *) + s);
1f4c5b9b
CLT
1473 hp = block + 2;
1474 for (int i = 0; i < mapnum; i++)
1475 hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
5fae049d
TS
1476 CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
1477 if (profiling_p)
1478 goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
1f4c5b9b
CLT
1479 }
1480
1481 /* Copy the (device) pointers to arguments to the device (dp and hp might in
1482 fact have the same value on a unified-memory system). */
1483 if (mapnum > 0)
1484 {
5fae049d
TS
1485 if (profiling_p)
1486 {
1487 prof_info->event_type = acc_ev_enqueue_upload_start;
1488
1489 data_event_info.data_event.event_type = prof_info->event_type;
1490 data_event_info.data_event.valid_bytes
1491 = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1492 data_event_info.data_event.parent_construct
1493 = acc_construct_parallel;
1494 data_event_info.data_event.implicit = 1; /* Always implicit. */
1495 data_event_info.data_event.tool_info = NULL;
1496 data_event_info.data_event.var_name = NULL;
1497 data_event_info.data_event.bytes = mapnum * sizeof (void *);
1498 data_event_info.data_event.host_ptr = hp;
1499 data_event_info.data_event.device_ptr = (const void *) dp;
1500
1501 api_info->device_api = acc_device_api_cuda;
1502
1503 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1504 api_info);
1505 }
1506
1f4c5b9b
CLT
1507 CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, dp, (void *) hp,
1508 mapnum * sizeof (void *), aq->cuda_stream);
1509 block[0] = (void *) dp;
1510
1511 struct nvptx_thread *nvthd =
1512 (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
1513 block[1] = (void *) nvthd->ptx_dev;
5fae049d
TS
1514
1515 if (profiling_p)
1516 {
1517 prof_info->event_type = acc_ev_enqueue_upload_end;
1518 data_event_info.data_event.event_type = prof_info->event_type;
1519 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1520 api_info);
1521 }
1f4c5b9b 1522 }
5fae049d 1523
1f4c5b9b
CLT
1524 nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
1525 dp, aq->cuda_stream);
1526
1527 if (mapnum > 0)
1528 GOMP_OFFLOAD_openacc_async_queue_callback (aq, cuda_free_argmem, block);
41dbbb37
TS
1529}
1530
1531void *
d93bdab5 1532GOMP_OFFLOAD_openacc_create_thread_data (int ord)
41dbbb37 1533{
d93bdab5 1534 struct ptx_device *ptx_dev;
41dbbb37
TS
1535 struct nvptx_thread *nvthd
1536 = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
41dbbb37
TS
1537 CUcontext thd_ctx;
1538
d93bdab5
JB
1539 ptx_dev = ptx_devices[ord];
1540
1541 assert (ptx_dev);
1542
6ce13072 1543 CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);
41dbbb37
TS
1544
1545 assert (ptx_dev->ctx);
1546
1547 if (!thd_ctx)
6ce13072 1548 CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);
41dbbb37 1549
41dbbb37
TS
1550 nvthd->ptx_dev = ptx_dev;
1551
1552 return (void *) nvthd;
1553}
1554
1555void
1556GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
1557{
1558 free (data);
1559}
1560
1561void *
345a8c17 1562GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
41dbbb37
TS
1563{
1564 return nvptx_get_current_cuda_device ();
1565}
1566
1567void *
345a8c17 1568GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
41dbbb37
TS
1569{
1570 return nvptx_get_current_cuda_context ();
1571}
1572
1f4c5b9b 1573/* This returns a CUstream. */
41dbbb37 1574void *
1f4c5b9b
CLT
1575GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *aq)
1576{
1577 return (void *) aq->cuda_stream;
1578}
1579
1580/* This takes a CUstream. */
1581int
1582GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream)
1583{
1584 if (aq->cuda_stream)
1585 {
1586 CUDA_CALL_ASSERT (cuStreamSynchronize, aq->cuda_stream);
1587 CUDA_CALL_ASSERT (cuStreamDestroy, aq->cuda_stream);
1588 }
1589
1590 aq->cuda_stream = (CUstream) stream;
1591 return 1;
1592}
1593
1594struct goacc_asyncqueue *
d2903ce0 1595GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
41dbbb37 1596{
1f4c5b9b
CLT
1597 CUstream stream = NULL;
1598 CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, CU_STREAM_DEFAULT);
1599
1600 struct goacc_asyncqueue *aq
1601 = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue));
1602 aq->cuda_stream = stream;
1603 return aq;
41dbbb37
TS
1604}
1605
1f4c5b9b
CLT
1606bool
1607GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
1608{
1609 CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream);
1610 free (aq);
1611 return true;
1612}
41dbbb37
TS
1613
1614int
1f4c5b9b 1615GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
41dbbb37 1616{
1f4c5b9b
CLT
1617 CUresult r = CUDA_CALL_NOCHECK (cuStreamQuery, aq->cuda_stream);
1618 if (r == CUDA_SUCCESS)
1619 return 1;
1620 if (r == CUDA_ERROR_NOT_READY)
1621 return 0;
1622
1623 GOMP_PLUGIN_error ("cuStreamQuery error: %s", cuda_error (r));
1624 return -1;
1625}
1626
1627bool
1628GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
1629{
1630 CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream);
1631 return true;
1632}
1633
1634bool
1635GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
1636 struct goacc_asyncqueue *aq2)
1637{
1638 CUevent e;
1639 CUDA_CALL_ERET (false, cuEventCreate, &e, CU_EVENT_DISABLE_TIMING);
1640 CUDA_CALL_ERET (false, cuEventRecord, e, aq1->cuda_stream);
1641 CUDA_CALL_ERET (false, cuStreamWaitEvent, aq2->cuda_stream, e, 0);
1642 return true;
1643}
1644
1645static void
1646cuda_callback_wrapper (CUstream stream, CUresult res, void *ptr)
1647{
1648 if (res != CUDA_SUCCESS)
1649 GOMP_PLUGIN_fatal ("%s error: %s", __FUNCTION__, cuda_error (res));
1650 struct nvptx_callback *cb = (struct nvptx_callback *) ptr;
1651 cb->fn (cb->ptr);
1652 free (ptr);
1653}
1654
1655void
1656GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
1657 void (*callback_fn)(void *),
1658 void *userptr)
1659{
1660 struct nvptx_callback *b = GOMP_PLUGIN_malloc (sizeof (*b));
1661 b->fn = callback_fn;
1662 b->ptr = userptr;
1663 b->aq = aq;
1664 CUDA_CALL_ASSERT (cuStreamAddCallback, aq->cuda_stream,
1665 cuda_callback_wrapper, (void *) b, 0);
1666}
1667
1668static bool
1669cuda_memcpy_sanity_check (const void *h, const void *d, size_t s)
1670{
1671 CUdeviceptr pb;
1672 size_t ps;
1673 if (!s)
1674 return true;
1675 if (!d)
1676 {
1677 GOMP_PLUGIN_error ("invalid device address");
1678 return false;
1679 }
1680 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
1681 if (!pb)
1682 {
1683 GOMP_PLUGIN_error ("invalid device address");
1684 return false;
1685 }
1686 if (!h)
1687 {
1688 GOMP_PLUGIN_error ("invalid host address");
1689 return false;
1690 }
1691 if (d == h)
1692 {
1693 GOMP_PLUGIN_error ("invalid host or device address");
1694 return false;
1695 }
1696 if ((void *)(d + s) > (void *)(pb + ps))
1697 {
1698 GOMP_PLUGIN_error ("invalid size");
1699 return false;
1700 }
1701 return true;
1702}
1703
1704bool
1705GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
1706{
1707 if (!nvptx_attach_host_thread_to_device (ord)
1708 || !cuda_memcpy_sanity_check (src, dst, n))
1709 return false;
1710 CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) dst, src, n);
1711 return true;
1712}
1713
1714bool
1715GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
1716{
1717 if (!nvptx_attach_host_thread_to_device (ord)
1718 || !cuda_memcpy_sanity_check (dst, src, n))
1719 return false;
1720 CUDA_CALL (cuMemcpyDtoH, dst, (CUdeviceptr) src, n);
1721 return true;
1722}
1723
1724bool
1725GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
1726{
1727 CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, NULL);
1728 return true;
1729}
1730
1731bool
1732GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src,
1733 size_t n, struct goacc_asyncqueue *aq)
1734{
1735 if (!nvptx_attach_host_thread_to_device (ord)
1736 || !cuda_memcpy_sanity_check (src, dst, n))
1737 return false;
1738 CUDA_CALL (cuMemcpyHtoDAsync, (CUdeviceptr) dst, src, n, aq->cuda_stream);
1739 return true;
1740}
1741
1742bool
1743GOMP_OFFLOAD_openacc_async_dev2host (int ord, void *dst, const void *src,
1744 size_t n, struct goacc_asyncqueue *aq)
1745{
1746 if (!nvptx_attach_host_thread_to_device (ord)
1747 || !cuda_memcpy_sanity_check (dst, src, n))
1748 return false;
1749 CUDA_CALL (cuMemcpyDtoHAsync, dst, (CUdeviceptr) src, n, aq->cuda_stream);
1750 return true;
41dbbb37 1751}
6103184e 1752
6fc0385c
TS
1753union goacc_property_value
1754GOMP_OFFLOAD_openacc_get_property (int n, enum goacc_property prop)
1755{
1756 union goacc_property_value propval = { .val = 0 };
1757
1758 pthread_mutex_lock (&ptx_dev_lock);
1759
1760 if (n >= nvptx_get_num_devices () || n < 0 || ptx_devices[n] == NULL)
1761 {
1762 pthread_mutex_unlock (&ptx_dev_lock);
1763 return propval;
1764 }
1765
1766 struct ptx_device *ptx_dev = ptx_devices[n];
1767 switch (prop)
1768 {
1769 case GOACC_PROPERTY_MEMORY:
1770 {
1771 size_t total_mem;
1772
1773 CUDA_CALL_ERET (propval, cuDeviceTotalMem, &total_mem, ptx_dev->dev);
1774 propval.val = total_mem;
1775 }
1776 break;
1777 case GOACC_PROPERTY_FREE_MEMORY:
1778 {
1779 size_t total_mem;
1780 size_t free_mem;
1781 CUdevice ctxdev;
1782
1783 CUDA_CALL_ERET (propval, cuCtxGetDevice, &ctxdev);
1784 if (ptx_dev->dev == ctxdev)
1785 CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
1786 else if (ptx_dev->ctx)
1787 {
1788 CUcontext old_ctx;
1789
1790 CUDA_CALL_ERET (propval, cuCtxPushCurrent, ptx_dev->ctx);
1791 CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
1792 CUDA_CALL_ASSERT (cuCtxPopCurrent, &old_ctx);
1793 }
1794 else
1795 {
1796 CUcontext new_ctx;
1797
1798 CUDA_CALL_ERET (propval, cuCtxCreate, &new_ctx, CU_CTX_SCHED_AUTO,
1799 ptx_dev->dev);
1800 CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
1801 CUDA_CALL_ASSERT (cuCtxDestroy, new_ctx);
1802 }
1803 propval.val = free_mem;
1804 }
1805 break;
1806 case GOACC_PROPERTY_NAME:
1807 propval.ptr = ptx_dev->name;
1808 break;
1809 case GOACC_PROPERTY_VENDOR:
1810 propval.ptr = "Nvidia";
1811 break;
1812 case GOACC_PROPERTY_DRIVER:
1813 propval.ptr = cuda_driver_version_s;
1814 break;
1815 default:
1816 break;
1817 }
1818
1819 pthread_mutex_unlock (&ptx_dev_lock);
1820 return propval;
1821}
1822
6103184e
AM
1823/* Adjust launch dimensions: pick good values for number of blocks and warps
1824 and ensure that number of warps does not exceed CUDA limits as well as GCC's
1825 own limits. */
1826
1827static void
1828nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
1829 struct ptx_device *ptx_dev,
1830 int *teams_p, int *threads_p)
1831{
1832 int max_warps_block = fn->max_threads_per_block / 32;
1833 /* Maximum 32 warps per block is an implementation limit in NVPTX backend
1834 and libgcc, which matches documented limit of all GPUs as of 2015. */
1835 if (max_warps_block > 32)
1836 max_warps_block = 32;
1837 if (*threads_p <= 0)
1838 *threads_p = 8;
1839 if (*threads_p > max_warps_block)
1840 *threads_p = max_warps_block;
1841
1842 int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
1843 /* This is an estimate of how many blocks the device can host simultaneously.
1844 Actual limit, which may be lower, can be queried with "occupancy control"
1845 driver interface (since CUDA 6.0). */
1846 int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
1847 if (*teams_p <= 0 || *teams_p > max_blocks)
1848 *teams_p = max_blocks;
1849}
1850
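/* Worked example with illustrative (hypothetical) numbers: a kernel compiled
   to regs_per_thread = 32 with *threads_p = 8 warps needs
   32 * 32 * 8 = 8192 registers per block; on a device with
   regs_per_sm = 65536 and num_sms = 80 that allows
   65536 / 8192 * 80 = 640 resident blocks, which caps *teams_p.  */
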
/* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
   target regions.  */

static size_t
nvptx_stacks_size ()
{
  return 128 * 1024;
}
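
/* Note (informal): GOMP_OFFLOAD_run below allocates one such stack per
   launched warp, i.e. teams * threads * 128 KiB of device memory in total.  */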

/* Return contiguous storage for NUM stacks, each SIZE bytes.  */

static void *
nvptx_stacks_alloc (size_t size, int num)
{
  CUdeviceptr stacks;
  CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &stacks, size * num);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
  return (void *) stacks;
}

/* Release storage previously allocated by nvptx_stacks_alloc.  */

static void
nvptx_stacks_free (void *p, int num)
{
  CUresult r = CUDA_CALL_NOCHECK (cuMemFree, (CUdeviceptr) p);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
}

void
GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
{
  CUfunction function = ((struct targ_fn_descriptor *) tgt_fn)->fn;
  CUresult r;
  struct ptx_device *ptx_dev = ptx_devices[ord];
  const char *maybe_abort_msg = "(perhaps abort was called)";
  int teams = 0, threads = 0;

  if (!args)
    GOMP_PLUGIN_fatal ("No target arguments provided");
  while (*args)
    {
      intptr_t id = (intptr_t) *args++, val;
      if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
	val = (intptr_t) *args++;
      else
	val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
      if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
	continue;
      val = val > INT_MAX ? INT_MAX : val;
      id &= GOMP_TARGET_ARG_ID_MASK;
      if (id == GOMP_TARGET_ARG_NUM_TEAMS)
	teams = val;
      else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
	threads = val;
    }
  nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);

  size_t stack_size = nvptx_stacks_size ();
  void *stacks = nvptx_stacks_alloc (stack_size, teams * threads);
  void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
  size_t fn_args_size = sizeof fn_args;
  void *config[] = {
    CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
    CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
    CU_LAUNCH_PARAM_END
  };
  r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
			 32, threads, 1, 0, NULL, NULL, config);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));

  r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
		       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
  nvptx_stacks_free (stacks, teams * threads);
}

/* TODO: Implement GOMP_OFFLOAD_async_run.  */