/* Plugin for NVPTX execution.

   Copyright (C) 2013-2022 Free Software Foundation, Inc.

   Contributed by Mentor Embedded.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Nvidia PTX-specific parts of OpenACC support.  The cuda driver
   library appears to hold some implicit state, but the documentation
   is not clear as to what that state might be.  Or how one might
   propagate it from one thread to another.  */

#define _GNU_SOURCE
#include "openacc.h"
#include "config.h"
#include "symcat.h"
#include "libgomp-plugin.h"
#include "oacc-plugin.h"
#include "gomp-constants.h"
#include "oacc-int.h"

/* For struct rev_offload + GOMP_REV_OFFLOAD_VAR.  */
#include "config/nvptx/libgomp-nvptx.h"

#include <pthread.h>
#ifndef PLUGIN_NVPTX_INCLUDE_SYSTEM_CUDA_H
# include "cuda/cuda.h"
#else
# include <cuda.h>
#endif
#include <stdbool.h>
#include <limits.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include <assert.h>
#include <errno.h>

/* An arbitrary fixed limit (128MB) for the size of the OpenMP soft stacks
   block to cache between kernel invocations.  For soft-stacks blocks bigger
   than this, we will free the block before attempting another GPU memory
   allocation (i.e. in GOMP_OFFLOAD_alloc).  Otherwise, if an allocation fails,
   we will free the cached soft-stacks block anyway then retry the
   allocation.  If that fails too, we lose.  */

#define SOFTSTACK_CACHE_LIMIT 134217728

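/* Note: 134217728 bytes is exactly 128 * 1024 * 1024, i.e. the 128MB limit
   described above.  */
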
#if CUDA_VERSION < 6000
extern CUresult cuGetErrorString (CUresult, const char **);
#define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
#endif

#if CUDA_VERSION >= 6050
#undef cuLinkCreate
#undef cuLinkAddData
CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
			const char *, unsigned, CUjit_option *, void **);
CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
#else
typedef size_t (*CUoccupancyB2DSize)(int);
CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
			   const char *, unsigned, CUjit_option *, void **);
CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
					  CUoccupancyB2DSize, size_t, int);
#endif

#define DO_PRAGMA(x) _Pragma (#x)

#ifndef PLUGIN_NVPTX_LINK_LIBCUDA
# include <dlfcn.h>

struct cuda_lib_s {

# define CUDA_ONE_CALL(call)			\
  __typeof (call) *call;
# define CUDA_ONE_CALL_MAYBE_NULL(call)		\
  CUDA_ONE_CALL (call)
#include "cuda-lib.def"
# undef CUDA_ONE_CALL
# undef CUDA_ONE_CALL_MAYBE_NULL

} cuda_lib;

/* -1 if init_cuda_lib has not been called yet, false
   if it has been and failed, true if it has been and succeeded.  */
static signed char cuda_lib_inited = -1;

/* Dynamically load the CUDA runtime library and initialize function
   pointers, return false if unsuccessful, true if successful.  */
static bool
init_cuda_lib (void)
{
  if (cuda_lib_inited != -1)
    return cuda_lib_inited;
  const char *cuda_runtime_lib = "libcuda.so.1";
  void *h = dlopen (cuda_runtime_lib, RTLD_LAZY);
  cuda_lib_inited = false;
  if (h == NULL)
    return false;

# define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false)
# define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true)
# define CUDA_ONE_CALL_1(call, allow_null)	\
  cuda_lib.call = dlsym (h, #call);		\
  if (!allow_null && cuda_lib.call == NULL)	\
    return false;
#include "cuda-lib.def"
# undef CUDA_ONE_CALL
# undef CUDA_ONE_CALL_1
# undef CUDA_ONE_CALL_MAYBE_NULL

  cuda_lib_inited = true;
  return true;
}
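
/* Illustration (not part of the cuda-lib.def machinery itself): for a symbol
   such as cuMemAlloc listed in cuda-lib.def, CUDA_ONE_CALL expands inside
   init_cuda_lib to roughly

     cuda_lib.cuMemAlloc = dlsym (h, "cuMemAlloc");
     if (!false && cuda_lib.cuMemAlloc == NULL)
       return false;

   so every required entry point must resolve, while CUDA_ONE_CALL_MAYBE_NULL
   entries may legitimately remain NULL.  */
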
# define CUDA_CALL_PREFIX cuda_lib.
#else

# define CUDA_ONE_CALL(call)
# define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call)
#include "cuda-lib.def"
#undef CUDA_ONE_CALL_MAYBE_NULL
#undef CUDA_ONE_CALL

# define CUDA_CALL_PREFIX
# define init_cuda_lib() true
#endif

#include "secure_getenv.h"

#undef MIN
#undef MAX
#define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
#define MAX(X,Y) ((X) > (Y) ? (X) : (Y))

/* Convenience macros for the frequently used CUDA library call and
   error handling sequence as well as CUDA library calls that
   do the error checking themselves or don't do it at all.  */

#define CUDA_CALL_ERET(ERET, FN, ...)		\
  do {						\
    unsigned __r				\
      = CUDA_CALL_PREFIX FN (__VA_ARGS__);	\
    if (__r != CUDA_SUCCESS)			\
      {						\
	GOMP_PLUGIN_error (#FN " error: %s",	\
			   cuda_error (__r));	\
	return ERET;				\
      }						\
  } while (0)

#define CUDA_CALL(FN, ...)			\
  CUDA_CALL_ERET (false, FN, __VA_ARGS__)

#define CUDA_CALL_ASSERT(FN, ...)		\
  do {						\
    unsigned __r				\
      = CUDA_CALL_PREFIX FN (__VA_ARGS__);	\
    if (__r != CUDA_SUCCESS)			\
      {						\
	GOMP_PLUGIN_fatal (#FN " error: %s",	\
			   cuda_error (__r));	\
      }						\
  } while (0)

#define CUDA_CALL_NOCHECK(FN, ...)		\
  CUDA_CALL_PREFIX FN (__VA_ARGS__)

#define CUDA_CALL_EXISTS(FN)			\
  CUDA_CALL_PREFIX FN

JJ
193static const char *
194cuda_error (CUresult r)
195{
cedd9bd0 196 const char *fallback = "unknown cuda error";
2393d337
JJ
197 const char *desc;
198
cedd9bd0
TV
199 if (!CUDA_CALL_EXISTS (cuGetErrorString))
200 return fallback;
201
2393d337 202 r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
cedd9bd0
TV
203 if (r == CUDA_SUCCESS)
204 return desc;
2393d337 205
cedd9bd0 206 return fallback;
2393d337
JJ
207}
208
6c84c8bf
MR
209/* Version of the CUDA Toolkit in the same MAJOR.MINOR format that is used by
210 Nvidia, such as in the 'deviceQuery' program (Nvidia's CUDA samples). */
211static char cuda_driver_version_s[30];
212
d93bdab5
JB
213static unsigned int instantiated_devices = 0;
214static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
41dbbb37 215
1f4c5b9b
CLT
216/* NVPTX/CUDA specific definition of asynchronous queues. */
217struct goacc_asyncqueue
2049befd 218{
1f4c5b9b 219 CUstream cuda_stream;
2049befd
CP
220};
221
1f4c5b9b 222struct nvptx_callback
41dbbb37 223{
1f4c5b9b
CLT
224 void (*fn) (void *);
225 void *ptr;
226 struct goacc_asyncqueue *aq;
227 struct nvptx_callback *next;
41dbbb37
TS
228};
229
230/* Thread-specific data for PTX. */
231
232struct nvptx_thread
233{
1f4c5b9b
CLT
234 /* We currently have this embedded inside the plugin because libgomp manages
235 devices through integer target_ids. This might be better if using an
236 opaque target-specific pointer directly from gomp_device_descr. */
41dbbb37
TS
237 struct ptx_device *ptx_dev;
238};
239
3e32ee19
NS
240/* Target data function launch information. */
241
242struct targ_fn_launch
243{
244 const char *fn;
cc3cd79b 245 unsigned short dim[GOMP_DIM_MAX];
3e32ee19
NS
246};
247
cc3cd79b
NS
248/* Target PTX object information. */
249
250struct targ_ptx_obj
251{
252 const char *code;
253 size_t size;
254};
255
256/* Target data image information. */
257
258typedef struct nvptx_tdata
259{
260 const struct targ_ptx_obj *ptx_objs;
261 unsigned ptx_num;
262
263 const char *const *var_names;
264 unsigned var_num;
265
266 const struct targ_fn_launch *fn_descs;
267 unsigned fn_num;
268} nvptx_tdata_t;
269
f3e9a059
NS
270/* Descriptor of a loaded function. */
271
272struct targ_fn_descriptor
273{
274 CUfunction fn;
3e32ee19 275 const struct targ_fn_launch *launch;
6103184e
AM
276 int regs_per_thread;
277 int max_threads_per_block;
f3e9a059
NS
278};
279
280/* A loaded PTX image. */
281struct ptx_image_data
282{
283 const void *target_data;
284 CUmodule module;
285
286 struct targ_fn_descriptor *fns; /* Array of functions. */
287
288 struct ptx_image_data *next;
289};
290
1f4c5b9b
CLT
291struct ptx_free_block
292{
293 void *ptr;
294 struct ptx_free_block *next;
295};
296
41dbbb37
TS
297struct ptx_device
298{
299 CUcontext ctx;
300 bool ctx_shared;
301 CUdevice dev;
1f4c5b9b 302
41dbbb37
TS
303 int ord;
304 bool overlap;
305 bool map;
306 bool concur;
41dbbb37 307 bool mkern;
6c84c8bf 308 int mode;
6103184e
AM
309 int clock_khz;
310 int num_sms;
311 int regs_per_block;
312 int regs_per_sm;
0c6c2f5f
CP
313 int warp_size;
314 int max_threads_per_block;
315 int max_threads_per_multiprocessor;
0b210c43 316 int default_dims[GOMP_DIM_MAX];
41dbbb37 317
6c84c8bf
MR
318 /* Length as used by the CUDA Runtime API ('struct cudaDeviceProp'). */
319 char name[256];
320
f3e9a059
NS
321 struct ptx_image_data *images; /* Images loaded on device. */
322 pthread_mutex_t image_lock; /* Lock for above list. */
41dbbb37 323
1f4c5b9b
CLT
324 struct ptx_free_block *free_blocks;
325 pthread_mutex_t free_blocks_lock;
41dbbb37 326
6b577a17
JB
327 /* OpenMP stacks, cached between kernel invocations. */
328 struct
329 {
330 CUdeviceptr ptr;
331 size_t size;
332 pthread_mutex_t lock;
333 } omp_stacks;
334
131d18e9 335 struct rev_offload *rev_data;
1f4c5b9b 336 struct ptx_device *next;
41dbbb37
TS
337};
338
d93bdab5
JB
339static struct ptx_device **ptx_devices;
340
41dbbb37
TS
341static inline struct nvptx_thread *
342nvptx_thread (void)
343{
344 return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
345}
346
d93bdab5
JB
347/* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK
348 should be locked on entry and remains locked on exit. */
f3e9a059 349
d93bdab5 350static bool
41dbbb37
TS
351nvptx_init (void)
352{
d93bdab5 353 int ndevs;
41dbbb37 354
d93bdab5
JB
355 if (instantiated_devices != 0)
356 return true;
41dbbb37 357
2393d337
JJ
358 if (!init_cuda_lib ())
359 return false;
360
361 CUDA_CALL (cuInit, 0);
362
6c84c8bf
MR
363 int cuda_driver_version;
364 CUDA_CALL_ERET (NULL, cuDriverGetVersion, &cuda_driver_version);
365 snprintf (cuda_driver_version_s, sizeof cuda_driver_version_s,
366 "CUDA Driver %u.%u",
367 cuda_driver_version / 1000, cuda_driver_version % 1000 / 10);
368
6ce13072 369 CUDA_CALL (cuDeviceGetCount, &ndevs);
d93bdab5
JB
370 ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
371 * ndevs);
6c84c8bf 372
d93bdab5 373 return true;
41dbbb37
TS
374}
375
d93bdab5
JB
376/* Select the N'th PTX device for the current host thread. The device must
377 have been previously opened before calling this function. */
378
6ce13072 379static bool
d93bdab5 380nvptx_attach_host_thread_to_device (int n)
41dbbb37 381{
d93bdab5
JB
382 CUdevice dev;
383 CUresult r;
384 struct ptx_device *ptx_dev;
385 CUcontext thd_ctx;
386
2393d337 387 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
1f4c5b9b
CLT
388 if (r == CUDA_ERROR_NOT_PERMITTED)
389 {
390 /* Assume we're in a CUDA callback, just return true. */
391 return true;
392 }
d93bdab5 393 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
6ce13072
CLT
394 {
395 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
396 return false;
397 }
d93bdab5
JB
398
399 if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
6ce13072 400 return true;
d93bdab5
JB
401 else
402 {
403 CUcontext old_ctx;
404
405 ptx_dev = ptx_devices[n];
6ce13072
CLT
406 if (!ptx_dev)
407 {
408 GOMP_PLUGIN_error ("device %d not found", n);
409 return false;
410 }
d93bdab5 411
6ce13072 412 CUDA_CALL (cuCtxGetCurrent, &thd_ctx);
d93bdab5
JB
413
414 /* We don't necessarily have a current context (e.g. if it has been
415 destroyed. Pop it if we do though. */
416 if (thd_ctx != NULL)
6ce13072 417 CUDA_CALL (cuCtxPopCurrent, &old_ctx);
d93bdab5 418
6ce13072 419 CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
d93bdab5 420 }
6ce13072 421 return true;
41dbbb37
TS
422}
423
d93bdab5 424static struct ptx_device *
41dbbb37
TS
425nvptx_open_device (int n)
426{
427 struct ptx_device *ptx_dev;
d93bdab5 428 CUdevice dev, ctx_dev;
41dbbb37 429 CUresult r;
131d18e9 430 int pi;
41dbbb37 431
6ce13072 432 CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);
41dbbb37
TS
433
434 ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));
435
436 ptx_dev->ord = n;
437 ptx_dev->dev = dev;
438 ptx_dev->ctx_shared = false;
439
2393d337 440 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
d93bdab5 441 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
6ce13072
CLT
442 {
443 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
444 return NULL;
445 }
d93bdab5
JB
446
447 if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
448 {
449 /* The current host thread has an active context for a different device.
450 Detach it. */
451 CUcontext old_ctx;
6ce13072 452 CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
d93bdab5
JB
453 }
454
6ce13072 455 CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);
41dbbb37
TS
456
457 if (!ptx_dev->ctx)
6ce13072 458 CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
41dbbb37
TS
459 else
460 ptx_dev->ctx_shared = true;
461
6ce13072
CLT
462 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
463 &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
41dbbb37
TS
464 ptx_dev->overlap = pi;
465
6ce13072
CLT
466 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
467 &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
41dbbb37
TS
468 ptx_dev->map = pi;
469
6ce13072
CLT
470 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
471 &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
41dbbb37
TS
472 ptx_dev->concur = pi;
473
6ce13072
CLT
474 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
475 &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
41dbbb37
TS
476 ptx_dev->mode = pi;
477
6ce13072
CLT
478 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
479 &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
41dbbb37
TS
480 ptx_dev->mkern = pi;
481
6103184e
AM
482 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
483 &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
484 ptx_dev->clock_khz = pi;
485
2393d337 486 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
6103184e
AM
487 &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
488 ptx_dev->num_sms = pi;
489
490 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
491 &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
492 ptx_dev->regs_per_block = pi;
493
b113af95 494 /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only
6103184e 495 in CUDA 6.0 and newer. */
b113af95
TV
496 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
497 CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
498 dev);
6103184e
AM
499 /* Fallback: use limit of registers per block, which is usually equal. */
500 if (r == CUDA_ERROR_INVALID_VALUE)
501 pi = ptx_dev->regs_per_block;
502 else if (r != CUDA_SUCCESS)
503 {
504 GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
505 return NULL;
506 }
507 ptx_dev->regs_per_sm = pi;
508
509 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
510 &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
511 if (pi != 32)
512 {
513 GOMP_PLUGIN_error ("Only warp size 32 is supported");
514 return NULL;
515 }
0c6c2f5f
CP
516 ptx_dev->warp_size = pi;
517
518 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
519 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
520 ptx_dev->max_threads_per_block = pi;
521
522 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
523 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
524 ptx_dev->max_threads_per_multiprocessor = pi;
6103184e 525
131d18e9
TB
526 /* Required below for reverse offload as implemented, but with compute
527 capability >= 2.0 and 64bit device processes, this should be universally be
528 the case; hence, an assert. */
529 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
530 CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev);
531 assert (r == CUDA_SUCCESS && pi);
41dbbb37 532
0b210c43
TV
533 for (int i = 0; i != GOMP_DIM_MAX; i++)
534 ptx_dev->default_dims[i] = 0;
535
6c84c8bf
MR
536 CUDA_CALL_ERET (NULL, cuDeviceGetName, ptx_dev->name, sizeof ptx_dev->name,
537 dev);
538
f3e9a059
NS
539 ptx_dev->images = NULL;
540 pthread_mutex_init (&ptx_dev->image_lock, NULL);
541
1f4c5b9b
CLT
542 ptx_dev->free_blocks = NULL;
543 pthread_mutex_init (&ptx_dev->free_blocks_lock, NULL);
41dbbb37 544
6b577a17
JB
545 ptx_dev->omp_stacks.ptr = 0;
546 ptx_dev->omp_stacks.size = 0;
547 pthread_mutex_init (&ptx_dev->omp_stacks.lock, NULL);
548
d93bdab5 549 return ptx_dev;
41dbbb37
TS
550}
551
6ce13072 552static bool
d93bdab5 553nvptx_close_device (struct ptx_device *ptx_dev)
41dbbb37 554{
41dbbb37 555 if (!ptx_dev)
6ce13072 556 return true;
41dbbb37 557
1f4c5b9b
CLT
558 for (struct ptx_free_block *b = ptx_dev->free_blocks; b;)
559 {
560 struct ptx_free_block *b_next = b->next;
561 CUDA_CALL (cuMemFree, (CUdeviceptr) b->ptr);
562 free (b);
563 b = b_next;
564 }
565
566 pthread_mutex_destroy (&ptx_dev->free_blocks_lock);
f3e9a059 567 pthread_mutex_destroy (&ptx_dev->image_lock);
41dbbb37 568
6b577a17
JB
569 pthread_mutex_destroy (&ptx_dev->omp_stacks.lock);
570
571 if (ptx_dev->omp_stacks.ptr)
572 CUDA_CALL (cuMemFree, ptx_dev->omp_stacks.ptr);
573
41dbbb37 574 if (!ptx_dev->ctx_shared)
6ce13072 575 CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
41dbbb37
TS
576
577 free (ptx_dev);
6ce13072 578 return true;
41dbbb37
TS
579}
580
581static int
582nvptx_get_num_devices (void)
583{
584 int n;
41dbbb37
TS
585
586 /* This function will be called before the plugin has been initialized in
587 order to enumerate available devices, but CUDA API routines can't be used
588 until cuInit has been called. Just call it now (but don't yet do any
589 further initialization). */
d93bdab5 590 if (instantiated_devices == 0)
c8319826 591 {
2393d337
JJ
592 if (!init_cuda_lib ())
593 return 0;
594 CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
c8319826
JB
595 /* This is not an error: e.g. we may have CUDA libraries installed but
596 no devices available. */
597 if (r != CUDA_SUCCESS)
78672bd8
TS
598 {
599 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
600 cuda_error (r));
601 return 0;
602 }
c8319826 603 }
41dbbb37 604
6ce13072 605 CUDA_CALL_ERET (-1, cuDeviceGetCount, &n);
41dbbb37
TS
606 return n;
607}
608
dfb15f6b
TV
609static void
610notify_var (const char *var_name, const char *env_var)
611{
612 if (env_var == NULL)
613 GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name);
614 else
615 GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var);
616}
41dbbb37 617
df36a3d3
TV
618static void
619process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o)
620{
621 const char *var_name = "GOMP_NVPTX_JIT";
622 const char *env_var = secure_getenv (var_name);
623 notify_var (var_name, env_var);
624
625 if (env_var == NULL)
626 return;
627
628 const char *c = env_var;
629 while (*c != '\0')
630 {
631 while (*c == ' ')
632 c++;
633
634 if (c[0] == '-' && c[1] == 'O'
635 && '0' <= c[2] && c[2] <= '4'
636 && (c[3] == '\0' || c[3] == ' '))
637 {
638 *gomp_nvptx_o = c[2] - '0';
639 c += 3;
640 continue;
641 }
642
643 GOMP_PLUGIN_error ("Error parsing %s", var_name);
644 break;
645 }
646}
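
/* For example, running with GOMP_NVPTX_JIT=-O2 in the environment stores 2 in
   *GOMP_NVPTX_O, and link_ptx below then passes it to the CUDA JIT as
   CU_JIT_OPTIMIZATION_LEVEL; any other text in the variable is diagnosed via
   GOMP_PLUGIN_error.  */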

static bool
link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
	  unsigned num_objs)
{
  CUjit_option opts[7];
  void *optvals[7];
  float elapsed = 0.0;
  char elog[1024];
  char ilog[16384];
  CUlinkState linkstate;
  CUresult r;
  void *linkout;
  size_t linkoutsize __attribute__ ((unused));

  opts[0] = CU_JIT_WALL_TIME;
  optvals[0] = &elapsed;

  opts[1] = CU_JIT_INFO_LOG_BUFFER;
  optvals[1] = &ilog[0];

  opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
  optvals[2] = (void *) sizeof ilog;

  opts[3] = CU_JIT_ERROR_LOG_BUFFER;
  optvals[3] = &elog[0];

  opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
  optvals[4] = (void *) sizeof elog;

  opts[5] = CU_JIT_LOG_VERBOSE;
  optvals[5] = (void *) 1;

  static intptr_t gomp_nvptx_o = -1;

  static bool init_done = false;
  if (!init_done)
    {
      process_GOMP_NVPTX_JIT (&gomp_nvptx_o);
      init_done = true;
    }

  int nopts = 6;
  if (gomp_nvptx_o != -1)
    {
      opts[nopts] = CU_JIT_OPTIMIZATION_LEVEL;
      optvals[nopts] = (void *) gomp_nvptx_o;
      nopts++;
    }

  if (CUDA_CALL_EXISTS (cuLinkCreate_v2))
    CUDA_CALL (cuLinkCreate_v2, nopts, opts, optvals, &linkstate);
  else
    CUDA_CALL (cuLinkCreate, nopts, opts, optvals, &linkstate);

  for (; num_objs--; ptx_objs++)
    {
      /* cuLinkAddData's 'data' argument erroneously omits the const
	 qualifier.  */
      GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
      if (CUDA_CALL_EXISTS (cuLinkAddData_v2))
	r = CUDA_CALL_NOCHECK (cuLinkAddData_v2, linkstate, CU_JIT_INPUT_PTX,
			       (char *) ptx_objs->code, ptx_objs->size,
			       0, 0, 0, 0);
      else
	r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
			       (char *) ptx_objs->code, ptx_objs->size,
			       0, 0, 0, 0);
      if (r != CUDA_SUCCESS)
	{
	  GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
	  GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
			     cuda_error (r));
	  return false;
	}
    }

  GOMP_PLUGIN_debug (0, "Linking\n");
  r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);

  GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
  GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);

  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
      return false;
    }

  CUDA_CALL (cuModuleLoadData, module, linkout);
  CUDA_CALL (cuLinkDestroy, linkstate);
  return true;
}

static void
nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
	    unsigned *dims, void *targ_mem_desc,
	    CUdeviceptr dp, CUstream stream)
{
  struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
  CUfunction function;
  int i;
  void *kargs[1];
  struct nvptx_thread *nvthd = nvptx_thread ();
  int warp_size = nvthd->ptx_dev->warp_size;

  function = targ_fn->fn;

  /* Initialize the launch dimensions.  Typically this is constant,
     provided by the device compiler, but we must permit runtime
     values.  */
  int seen_zero = 0;
  for (i = 0; i != GOMP_DIM_MAX; i++)
    {
      if (targ_fn->launch->dim[i])
	dims[i] = targ_fn->launch->dim[i];
      if (!dims[i])
	seen_zero = 1;
    }

  if (seen_zero)
    {
      pthread_mutex_lock (&ptx_dev_lock);

      static int gomp_openacc_dims[GOMP_DIM_MAX];
      if (!gomp_openacc_dims[0])
	{
	  /* See if the user provided GOMP_OPENACC_DIM environment
	     variable to specify runtime defaults.  */
	  for (int i = 0; i < GOMP_DIM_MAX; ++i)
	    gomp_openacc_dims[i] = GOMP_PLUGIN_acc_default_dim (i);
	}

      if (!nvthd->ptx_dev->default_dims[0])
	{
	  int default_dims[GOMP_DIM_MAX];
	  for (int i = 0; i < GOMP_DIM_MAX; ++i)
	    default_dims[i] = gomp_openacc_dims[i];

	  int gang, worker, vector;
	  {
	    int block_size = nvthd->ptx_dev->max_threads_per_block;
	    int cpu_size = nvthd->ptx_dev->max_threads_per_multiprocessor;
	    int dev_size = nvthd->ptx_dev->num_sms;
	    GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
			       " dev_size=%d, cpu_size=%d\n",
			       warp_size, block_size, dev_size, cpu_size);

	    gang = (cpu_size / block_size) * dev_size;
	    worker = block_size / warp_size;
	    vector = warp_size;
	  }
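	  /* Illustrative numbers only: on a device with warp_size == 32,
	     block_size == 1024, cpu_size == 2048 and dev_size == 80 SMs, the
	     formulas above give gang = (2048 / 1024) * 80 = 160,
	     worker = 1024 / 32 = 32 and vector = 32.  */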

	  /* There is no upper bound on the gang size.  The best size
	     matches the hardware configuration.  Logical gangs are
	     scheduled onto physical hardware.  To maximize usage, we
	     should guess a large number.  */
	  if (default_dims[GOMP_DIM_GANG] < 1)
	    default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
	  /* The worker size must not exceed the hardware.  */
	  if (default_dims[GOMP_DIM_WORKER] < 1
	      || (default_dims[GOMP_DIM_WORKER] > worker && gang))
	    default_dims[GOMP_DIM_WORKER] = worker;
	  /* The vector size must exactly match the hardware.  */
	  if (default_dims[GOMP_DIM_VECTOR] < 1
	      || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
	    default_dims[GOMP_DIM_VECTOR] = vector;

	  GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
			     default_dims[GOMP_DIM_GANG],
			     default_dims[GOMP_DIM_WORKER],
			     default_dims[GOMP_DIM_VECTOR]);

	  for (i = 0; i != GOMP_DIM_MAX; i++)
	    nvthd->ptx_dev->default_dims[i] = default_dims[i];
	}
      pthread_mutex_unlock (&ptx_dev_lock);

      {
	bool default_dim_p[GOMP_DIM_MAX];
	for (i = 0; i != GOMP_DIM_MAX; i++)
	  default_dim_p[i] = !dims[i];

	if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize))
	  {
	    for (i = 0; i != GOMP_DIM_MAX; i++)
	      if (default_dim_p[i])
		dims[i] = nvthd->ptx_dev->default_dims[i];

	    if (default_dim_p[GOMP_DIM_VECTOR])
	      dims[GOMP_DIM_VECTOR]
		= MIN (dims[GOMP_DIM_VECTOR],
		       (targ_fn->max_threads_per_block / warp_size
			* warp_size));

	    if (default_dim_p[GOMP_DIM_WORKER])
	      dims[GOMP_DIM_WORKER]
		= MIN (dims[GOMP_DIM_WORKER],
		       targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
	  }
	else
	  {
	    /* Handle the case that the compiler allows the runtime to choose
	       the vector-length conservatively, by ignoring
	       gomp_openacc_dims[GOMP_DIM_VECTOR].  TODO: actually handle
	       it.  */
	    int vectors = 0;
	    /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that
	       gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
	       exceed targ_fn->max_threads_per_block.  */
	    int workers = gomp_openacc_dims[GOMP_DIM_WORKER];
	    int gangs = gomp_openacc_dims[GOMP_DIM_GANG];
	    int grids, blocks;

	    CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
			      &blocks, function, NULL, 0,
			      dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
	    GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
			       "grid = %d, block = %d\n", grids, blocks);

	    /* Keep the num_gangs proportional to the block size.  In
	       the case where a block size is limited by shared-memory
	       or the register file capacity, the runtime will not
	       excessively over-assign gangs to the multiprocessor
	       units if their state is going to be swapped out even
	       more than necessary.  The constant factor 2 is there to
	       prevent threads from idling when there is insufficient
	       work for them.  */
	    if (gangs == 0)
	      gangs = 2 * grids * (blocks / warp_size);

	    if (vectors == 0)
	      vectors = warp_size;

	    if (workers == 0)
	      {
		int actual_vectors = (default_dim_p[GOMP_DIM_VECTOR]
				      ? vectors
				      : dims[GOMP_DIM_VECTOR]);
		workers = blocks / actual_vectors;
		workers = MAX (workers, 1);
		/* If we need a per-worker barrier ... .  */
		if (actual_vectors > 32)
		  /* Don't use more barriers than available.  */
		  workers = MIN (workers, 15);
	      }

	    for (i = 0; i != GOMP_DIM_MAX; i++)
	      if (default_dim_p[i])
		switch (i)
		  {
		  case GOMP_DIM_GANG: dims[i] = gangs; break;
		  case GOMP_DIM_WORKER: dims[i] = workers; break;
		  case GOMP_DIM_VECTOR: dims[i] = vectors; break;
		  default: GOMP_PLUGIN_fatal ("invalid dim");
		  }
	  }
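	/* Illustrative numbers only: if the occupancy query suggested
	   grids == 160 and blocks == 256 with warp_size == 32, the
	   heuristics above give gangs = 2 * 160 * (256 / 32) = 2560,
	   vectors = 32 and workers = 256 / 32 = 8.  */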
      }
    }

  /* Check if the accelerator has sufficient hardware resources to
     launch the offloaded kernel.  */
  if (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]
      > targ_fn->max_threads_per_block)
    {
      const char *msg
	= ("The Nvidia accelerator has insufficient resources to launch '%s'"
	   " with num_workers = %d and vector_length = %d"
	   "; "
	   "recompile the program with 'num_workers = x and vector_length = y'"
	   " on that offloaded region or '-fopenacc-dim=:x:y' where"
	   " x * y <= %d"
	   ".\n");
      GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
			 dims[GOMP_DIM_VECTOR], targ_fn->max_threads_per_block);
    }

  /* Check if the accelerator has sufficient barrier resources to
     launch the offloaded kernel.  */
  if (dims[GOMP_DIM_WORKER] > 15 && dims[GOMP_DIM_VECTOR] > 32)
    {
      const char *msg
	= ("The Nvidia accelerator has insufficient barrier resources to launch"
	   " '%s' with num_workers = %d and vector_length = %d"
	   "; "
	   "recompile the program with 'num_workers = x' on that offloaded"
	   " region or '-fopenacc-dim=:x:' where x <= 15"
	   "; "
	   "or, recompile the program with 'vector_length = 32' on that"
	   " offloaded region or '-fopenacc-dim=::32'"
	   ".\n");
      GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
			 dims[GOMP_DIM_VECTOR]);
    }

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
		     " gangs=%u, workers=%u, vectors=%u\n",
		     __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
		     dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);

  // OpenACC		CUDA
  //
  // num_gangs		nctaid.x
  // num_workers	ntid.y
  // vector length	ntid.x

  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  acc_prof_info *prof_info = thr->prof_info;
  acc_event_info enqueue_launch_event_info;
  acc_api_info *api_info = thr->api_info;
  bool profiling_p = __builtin_expect (prof_info != NULL, false);
  if (profiling_p)
    {
      prof_info->event_type = acc_ev_enqueue_launch_start;

      enqueue_launch_event_info.launch_event.event_type
	= prof_info->event_type;
      enqueue_launch_event_info.launch_event.valid_bytes
	= _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
      enqueue_launch_event_info.launch_event.parent_construct
	= acc_construct_parallel;
      enqueue_launch_event_info.launch_event.implicit = 1;
      enqueue_launch_event_info.launch_event.tool_info = NULL;
      enqueue_launch_event_info.launch_event.kernel_name = targ_fn->launch->fn;
      enqueue_launch_event_info.launch_event.num_gangs
	= dims[GOMP_DIM_GANG];
      enqueue_launch_event_info.launch_event.num_workers
	= dims[GOMP_DIM_WORKER];
      enqueue_launch_event_info.launch_event.vector_length
	= dims[GOMP_DIM_VECTOR];

      api_info->device_api = acc_device_api_cuda;

      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
					    &enqueue_launch_event_info,
					    api_info);
    }

  kargs[0] = &dp;
  CUDA_CALL_ASSERT (cuLaunchKernel, function,
		    dims[GOMP_DIM_GANG], 1, 1,
		    dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
		    0, stream, kargs, 0);

  if (profiling_p)
    {
      prof_info->event_type = acc_ev_enqueue_launch_end;
      enqueue_launch_event_info.launch_event.event_type
	= prof_info->event_type;
      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
					    &enqueue_launch_event_info,
					    api_info);
    }

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
		     targ_fn->launch->fn);
}

void * openacc_get_current_cuda_context (void);

static void
goacc_profiling_acc_ev_alloc (struct goacc_thread *thr, void *dp, size_t s)
{
  acc_prof_info *prof_info = thr->prof_info;
  acc_event_info data_event_info;
  acc_api_info *api_info = thr->api_info;

  prof_info->event_type = acc_ev_alloc;

  data_event_info.data_event.event_type = prof_info->event_type;
  data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
  data_event_info.data_event.parent_construct = acc_construct_parallel;
  data_event_info.data_event.implicit = 1;
  data_event_info.data_event.tool_info = NULL;
  data_event_info.data_event.var_name = NULL;
  data_event_info.data_event.bytes = s;
  data_event_info.data_event.host_ptr = NULL;
  data_event_info.data_event.device_ptr = dp;

  api_info->device_api = acc_device_api_cuda;

  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
}

/* Free the cached soft-stacks block if it is above the SOFTSTACK_CACHE_LIMIT
   size threshold, or if FORCE is true.  */

static void
nvptx_stacks_free (struct ptx_device *ptx_dev, bool force)
{
  pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
  if (ptx_dev->omp_stacks.ptr
      && (force || ptx_dev->omp_stacks.size > SOFTSTACK_CACHE_LIMIT))
    {
      CUresult r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
      ptx_dev->omp_stacks.ptr = 0;
      ptx_dev->omp_stacks.size = 0;
    }
  pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
}

static void *
nvptx_alloc (size_t s, bool suppress_errors)
{
  CUdeviceptr d;

  CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &d, s);
  if (suppress_errors && r == CUDA_ERROR_OUT_OF_MEMORY)
    return NULL;
  else if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("nvptx_alloc error: %s", cuda_error (r));
      return NULL;
    }

  /* NOTE: We only do profiling stuff if the memory allocation succeeds.  */
  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  bool profiling_p
    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
  if (profiling_p)
    goacc_profiling_acc_ev_alloc (thr, (void *) d, s);

  return (void *) d;
}

static void
goacc_profiling_acc_ev_free (struct goacc_thread *thr, void *p)
{
  acc_prof_info *prof_info = thr->prof_info;
  acc_event_info data_event_info;
  acc_api_info *api_info = thr->api_info;

  prof_info->event_type = acc_ev_free;

  data_event_info.data_event.event_type = prof_info->event_type;
  data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
  data_event_info.data_event.parent_construct = acc_construct_parallel;
  data_event_info.data_event.implicit = 1;
  data_event_info.data_event.tool_info = NULL;
  data_event_info.data_event.var_name = NULL;
  data_event_info.data_event.bytes = -1;
  data_event_info.data_event.host_ptr = NULL;
  data_event_info.data_event.device_ptr = p;

  api_info->device_api = acc_device_api_cuda;

  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
}

static bool
nvptx_free (void *p, struct ptx_device *ptx_dev)
{
  CUdeviceptr pb;
  size_t ps;

  CUresult r = CUDA_CALL_NOCHECK (cuMemGetAddressRange, &pb, &ps,
				  (CUdeviceptr) p);
  if (r == CUDA_ERROR_NOT_PERMITTED)
    {
      /* We assume that this error indicates we are in a CUDA callback
	 context, where no CUDA calls are allowed (see cuStreamAddCallback
	 documentation for description).  Arrange to free this piece of
	 device memory later.  */
      struct ptx_free_block *n
	= GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block));
      n->ptr = p;
      pthread_mutex_lock (&ptx_dev->free_blocks_lock);
      n->next = ptx_dev->free_blocks;
      ptx_dev->free_blocks = n;
      pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
      return true;
    }
  else if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("cuMemGetAddressRange error: %s", cuda_error (r));
      return false;
    }
  if ((CUdeviceptr) p != pb)
    {
      GOMP_PLUGIN_error ("invalid device address");
      return false;
    }

  CUDA_CALL (cuMemFree, (CUdeviceptr) p);
  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  bool profiling_p
    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
  if (profiling_p)
    goacc_profiling_acc_ev_free (thr, p);

  return true;
}

static void *
nvptx_get_current_cuda_device (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return &nvthd->ptx_dev->dev;
}

static void *
nvptx_get_current_cuda_context (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return nvthd->ptx_dev->ctx;
}

/* Plugin entry points.  */

const char *
GOMP_OFFLOAD_get_name (void)
{
  return "nvptx";
}

unsigned int
GOMP_OFFLOAD_get_caps (void)
{
  return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
}

int
GOMP_OFFLOAD_get_type (void)
{
  return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
}

int
GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask)
{
  int num_devices = nvptx_get_num_devices ();
  /* Return -1 if the omp_requires_mask cannot be fulfilled but
     devices were present.  Unified-shared address: see comment in
     nvptx_open_device for CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING.  */
  if (num_devices > 0
      && (omp_requires_mask & ~GOMP_REQUIRES_UNIFIED_ADDRESS) != 0)
    return -1;
  return num_devices;
}

bool
GOMP_OFFLOAD_init_device (int n)
{
  struct ptx_device *dev;

  pthread_mutex_lock (&ptx_dev_lock);

  if (!nvptx_init () || ptx_devices[n] != NULL)
    {
      pthread_mutex_unlock (&ptx_dev_lock);
      return false;
    }

  dev = nvptx_open_device (n);
  if (dev)
    {
      ptx_devices[n] = dev;
      instantiated_devices++;
    }

  pthread_mutex_unlock (&ptx_dev_lock);

  return dev != NULL;
}

bool
GOMP_OFFLOAD_fini_device (int n)
{
  pthread_mutex_lock (&ptx_dev_lock);

  if (ptx_devices[n] != NULL)
    {
      if (!nvptx_attach_host_thread_to_device (n)
	  || !nvptx_close_device (ptx_devices[n]))
	{
	  pthread_mutex_unlock (&ptx_dev_lock);
	  return false;
	}
      ptx_devices[n] = NULL;
      instantiated_devices--;
    }

  if (instantiated_devices == 0)
    {
      free (ptx_devices);
      ptx_devices = NULL;
    }

  pthread_mutex_unlock (&ptx_dev_lock);
  return true;
}

/* Return the libgomp version number we're compatible with.  There is
   no requirement for cross-version compatibility.  */

unsigned
GOMP_OFFLOAD_version (void)
{
  return GOMP_VERSION;
}

/* Initialize __nvptx_clocktick, if present in MODULE.  */

static void
nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
{
  CUdeviceptr dptr;
  CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
				  module, "__nvptx_clocktick");
  if (r == CUDA_ERROR_NOT_FOUND)
    return;
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
  double __nvptx_clocktick = 1e-3 / dev->clock_khz;
  r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
			 sizeof (__nvptx_clocktick));
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
}

/* Load the (partial) program described by TARGET_DATA to device
   number ORD.  Allocate and return TARGET_TABLE.  If not NULL, REV_FN_TABLE
   will contain the on-device addresses of the functions for reverse offload.
   To be freed by the caller.  */

int
GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
			 struct addr_pair **target_table,
			 uint64_t **rev_fn_table)
{
  CUmodule module;
  const char *const *var_names;
  const struct targ_fn_launch *fn_descs;
  unsigned int fn_entries, var_entries, other_entries, i, j;
  struct targ_fn_descriptor *targ_fns;
  struct addr_pair *targ_tbl;
  const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
  struct ptx_image_data *new_image;
  struct ptx_device *dev;

  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
			 " (expected %u, received %u)",
			 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
      return -1;
    }

  if (!nvptx_attach_host_thread_to_device (ord)
      || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
    return -1;

  dev = ptx_devices[ord];

  /* The mkoffload utility emits a struct of pointers/integers at the
     start of each offload image.  The array of kernel names and the
     function addresses form a one-to-one correspondence.  */

  var_entries = img_header->var_num;
  var_names = img_header->var_names;
  fn_entries = img_header->fn_num;
  fn_descs = img_header->fn_descs;

  /* Currently, other_entries contains only the struct of ICVs.  */
  other_entries = 1;

  targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
				 * (fn_entries + var_entries + other_entries));
  targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
				 * fn_entries);

  *target_table = targ_tbl;

  new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
  new_image->target_data = target_data;
  new_image->module = module;
  new_image->fns = targ_fns;

  pthread_mutex_lock (&dev->image_lock);
  new_image->next = dev->images;
  dev->images = new_image;
  pthread_mutex_unlock (&dev->image_lock);

  for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
    {
      CUfunction function;
      int nregs, mthrs;

      CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
		      fn_descs[i].fn);
      CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
		      CU_FUNC_ATTRIBUTE_NUM_REGS, function);
      CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
		      CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);

      targ_fns->fn = function;
      targ_fns->launch = &fn_descs[i];
      targ_fns->regs_per_thread = nregs;
      targ_fns->max_threads_per_block = mthrs;

      targ_tbl->start = (uintptr_t) targ_fns;
      targ_tbl->end = targ_tbl->start + 1;
    }

  for (j = 0; j < var_entries; j++, targ_tbl++)
    {
      CUdeviceptr var;
      size_t bytes;

      CUDA_CALL_ERET (-1, cuModuleGetGlobal,
		      &var, &bytes, module, var_names[j]);

      targ_tbl->start = (uintptr_t) var;
      targ_tbl->end = targ_tbl->start + bytes;
    }

  CUdeviceptr varptr;
  size_t varsize;
  CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &varptr, &varsize,
				  module, XSTRING (GOMP_ADDITIONAL_ICVS));

  if (r == CUDA_SUCCESS)
    {
      targ_tbl->start = (uintptr_t) varptr;
      targ_tbl->end = (uintptr_t) (varptr + varsize);
    }
  else
    /* The variable was not in this image.  */
    targ_tbl->start = targ_tbl->end = 0;

  if (rev_fn_table && fn_entries == 0)
    *rev_fn_table = NULL;
  else if (rev_fn_table)
    {
      CUdeviceptr var;
      size_t bytes, i;
      r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &var, &bytes, module,
			     "$offload_func_table");
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
      assert (bytes == sizeof (uint64_t) * fn_entries);
      *rev_fn_table = GOMP_PLUGIN_malloc (sizeof (uint64_t) * fn_entries);
      r = CUDA_CALL_NOCHECK (cuMemcpyDtoH, *rev_fn_table, var, bytes);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
      /* Free if only NULL entries.  */
      for (i = 0; i < fn_entries; ++i)
	if ((*rev_fn_table)[i] != 0)
	  break;
      if (i == fn_entries)
	{
	  free (*rev_fn_table);
	  *rev_fn_table = NULL;
	}
    }

  if (rev_fn_table && *rev_fn_table && dev->rev_data == NULL)
    {
      /* cuMemHostAlloc memory is accessible on the device, if unified-shared
	 address is supported; this is assumed - see comment in
	 nvptx_open_device for CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING.  */
      CUDA_CALL_ASSERT (cuMemHostAlloc, (void **) &dev->rev_data,
			sizeof (*dev->rev_data), CU_MEMHOSTALLOC_DEVICEMAP);
      CUdeviceptr dp = (CUdeviceptr) dev->rev_data;
      CUdeviceptr device_rev_offload_var;
      size_t device_rev_offload_size;
      CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal,
				      &device_rev_offload_var,
				      &device_rev_offload_size, module,
				      XSTRING (GOMP_REV_OFFLOAD_VAR));
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuModuleGetGlobal error - GOMP_REV_OFFLOAD_VAR: %s",
			   cuda_error (r));
      r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, device_rev_offload_var, &dp,
			     sizeof (dp));
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
    }

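  /* The preceding block (entered only when a reverse-offload function table
     exists) leaves dev->rev_data pointing at page-locked host memory that the
     device can also access, courtesy of CU_MEMHOSTALLOC_DEVICEMAP, and the
     device-side GOMP_REV_OFFLOAD_VAR variable now holds its address, so host
     and device share this block for reverse-offload communication.  */
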
  nvptx_set_clocktick (module, dev);

  return fn_entries + var_entries + other_entries;
}

/* Unload the program described by TARGET_DATA.  DEV_DATA is the
   function descriptors allocated by G_O_load_image.  */

bool
GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
{
  struct ptx_image_data *image, **prev_p;
  struct ptx_device *dev = ptx_devices[ord];

  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
			 " (expected %u, received %u)",
			 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
      return false;
    }

  bool ret = true;
  pthread_mutex_lock (&dev->image_lock);
  for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
    if (image->target_data == target_data)
      {
	*prev_p = image->next;
	if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
	  ret = false;
	free (image->fns);
	free (image);
	break;
      }
  pthread_mutex_unlock (&dev->image_lock);
  return ret;
}

d93bdab5 1473GOMP_OFFLOAD_alloc (int ord, size_t size)
41dbbb37 1474{
6ce13072
CLT
1475 if (!nvptx_attach_host_thread_to_device (ord))
1476 return NULL;
41dbbb37 1477
1f4c5b9b
CLT
1478 struct ptx_device *ptx_dev = ptx_devices[ord];
1479 struct ptx_free_block *blocks, *tmp;
41dbbb37 1480
1f4c5b9b
CLT
1481 pthread_mutex_lock (&ptx_dev->free_blocks_lock);
1482 blocks = ptx_dev->free_blocks;
1483 ptx_dev->free_blocks = NULL;
1484 pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
41dbbb37 1485
6b577a17
JB
1486 nvptx_stacks_free (ptx_dev, false);
1487
1f4c5b9b
CLT
1488 while (blocks)
1489 {
1490 tmp = blocks->next;
1491 nvptx_free (blocks->ptr, ptx_dev);
1492 free (blocks);
1493 blocks = tmp;
1494 }
1495
6b577a17
JB
1496 void *d = nvptx_alloc (size, true);
1497 if (d)
1498 return d;
1499 else
1500 {
1501 /* Memory allocation failed. Try freeing the stacks block, and
1502 retrying. */
1503 nvptx_stacks_free (ptx_dev, true);
1504 return nvptx_alloc (size, false);
1505 }
41dbbb37
TS
1506}
1507
6103184e 1508bool
1f4c5b9b 1509GOMP_OFFLOAD_free (int ord, void *ptr)
6103184e 1510{
1f4c5b9b
CLT
1511 return (nvptx_attach_host_thread_to_device (ord)
1512 && nvptx_free (ptr, ptx_devices[ord]));
6103184e
AM
1513}
1514
41dbbb37 1515void
345a8c17
TS
1516GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
1517 void **hostaddrs, void **devaddrs,
1f4c5b9b 1518 unsigned *dims, void *targ_mem_desc)
41dbbb37 1519{
1f4c5b9b 1520 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
41dbbb37 1521
5fae049d
TS
1522 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1523 acc_prof_info *prof_info = thr->prof_info;
1524 acc_event_info data_event_info;
1525 acc_api_info *api_info = thr->api_info;
1526 bool profiling_p = __builtin_expect (prof_info != NULL, false);
1527
1f4c5b9b
CLT
1528 void **hp = NULL;
1529 CUdeviceptr dp = 0;
41dbbb37 1530
1f4c5b9b
CLT
1531 if (mapnum > 0)
1532 {
5fae049d
TS
1533 size_t s = mapnum * sizeof (void *);
1534 hp = alloca (s);
1f4c5b9b
CLT
1535 for (int i = 0; i < mapnum; i++)
1536 hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
5fae049d
TS
1537 CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
1538 if (profiling_p)
1539 goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
1f4c5b9b 1540 }
41dbbb37 1541
1f4c5b9b
CLT
1542 /* Copy the (device) pointers to arguments to the device (dp and hp might in
1543 fact have the same value on a unified-memory system). */
1544 if (mapnum > 0)
5fae049d
TS
1545 {
1546 if (profiling_p)
1547 {
1548 prof_info->event_type = acc_ev_enqueue_upload_start;
1549
1550 data_event_info.data_event.event_type = prof_info->event_type;
1551 data_event_info.data_event.valid_bytes
1552 = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1553 data_event_info.data_event.parent_construct
1554 = acc_construct_parallel;
1555 data_event_info.data_event.implicit = 1; /* Always implicit. */
1556 data_event_info.data_event.tool_info = NULL;
1557 data_event_info.data_event.var_name = NULL;
1558 data_event_info.data_event.bytes = mapnum * sizeof (void *);
1559 data_event_info.data_event.host_ptr = hp;
1560 data_event_info.data_event.device_ptr = (const void *) dp;
1561
1562 api_info->device_api = acc_device_api_cuda;
1563
1564 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1565 api_info);
1566 }
1567 CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, (void *) hp,
1568 mapnum * sizeof (void *));
1569 if (profiling_p)
1570 {
1571 prof_info->event_type = acc_ev_enqueue_upload_end;
1572 data_event_info.data_event.event_type = prof_info->event_type;
1573 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1574 api_info);
1575 }
1576 }
41dbbb37 1577
1f4c5b9b
CLT
1578 nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
1579 dp, NULL);
41dbbb37 1580
1f4c5b9b
CLT
1581 CUresult r = CUDA_CALL_NOCHECK (cuStreamSynchronize, NULL);
1582 const char *maybe_abort_msg = "(perhaps abort was called)";
1583 if (r == CUDA_ERROR_LAUNCH_FAILED)
1584 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
1585 maybe_abort_msg);
1586 else if (r != CUDA_SUCCESS)
1587 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
5fae049d 1588
1f4c5b9b 1589 CUDA_CALL_ASSERT (cuMemFree, dp);
5fae049d
TS
1590 if (profiling_p)
1591 goacc_profiling_acc_ev_free (thr, (void *) dp);
41dbbb37
TS
1592}
1593
1f4c5b9b
CLT
1594static void
1595cuda_free_argmem (void *ptr)
41dbbb37 1596{
1f4c5b9b
CLT
1597 void **block = (void **) ptr;
1598 nvptx_free (block[0], (struct ptx_device *) block[1]);
1599 free (block);
41dbbb37
TS
1600}
1601
1602void
1f4c5b9b
CLT
1603GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *), size_t mapnum,
1604 void **hostaddrs, void **devaddrs,
1605 unsigned *dims, void *targ_mem_desc,
1606 struct goacc_asyncqueue *aq)
41dbbb37 1607{
1f4c5b9b 1608 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
41dbbb37 1609
5fae049d
TS
1610 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1611 acc_prof_info *prof_info = thr->prof_info;
1612 acc_event_info data_event_info;
1613 acc_api_info *api_info = thr->api_info;
1614 bool profiling_p = __builtin_expect (prof_info != NULL, false);
1615
1f4c5b9b
CLT
1616 void **hp = NULL;
1617 CUdeviceptr dp = 0;
1618 void **block = NULL;
41dbbb37 1619
1f4c5b9b
CLT
1620 if (mapnum > 0)
1621 {
5fae049d
TS
1622 size_t s = mapnum * sizeof (void *);
1623 block = (void **) GOMP_PLUGIN_malloc (2 * sizeof (void *) + s);
1f4c5b9b
CLT
1624 hp = block + 2;
1625 for (int i = 0; i < mapnum; i++)
1626 hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
5fae049d
TS
1627 CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
1628 if (profiling_p)
1629 goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
1f4c5b9b
CLT
1630 }
1631
1632 /* Copy the (device) pointers to arguments to the device (dp and hp might in
1633 fact have the same value on a unified-memory system). */
1634 if (mapnum > 0)
1635 {
5fae049d
TS
1636 if (profiling_p)
1637 {
1638 prof_info->event_type = acc_ev_enqueue_upload_start;
1639
1640 data_event_info.data_event.event_type = prof_info->event_type;
1641 data_event_info.data_event.valid_bytes
1642 = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1643 data_event_info.data_event.parent_construct
1644 = acc_construct_parallel;
1645 data_event_info.data_event.implicit = 1; /* Always implicit. */
1646 data_event_info.data_event.tool_info = NULL;
1647 data_event_info.data_event.var_name = NULL;
1648 data_event_info.data_event.bytes = mapnum * sizeof (void *);
1649 data_event_info.data_event.host_ptr = hp;
1650 data_event_info.data_event.device_ptr = (const void *) dp;
1651
1652 api_info->device_api = acc_device_api_cuda;
1653
1654 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1655 api_info);
1656 }
1657
1f4c5b9b
CLT
1658 CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, dp, (void *) hp,
1659 mapnum * sizeof (void *), aq->cuda_stream);
1660 block[0] = (void *) dp;
1661
1662 struct nvptx_thread *nvthd =
1663 (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
1664 block[1] = (void *) nvthd->ptx_dev;
5fae049d
TS
1665
1666 if (profiling_p)
1667 {
1668 prof_info->event_type = acc_ev_enqueue_upload_end;
1669 data_event_info.data_event.event_type = prof_info->event_type;
1670 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1671 api_info);
1672 }
1f4c5b9b 1673 }
5fae049d 1674
1f4c5b9b
CLT
1675 nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
1676 dp, aq->cuda_stream);
1677
1678 if (mapnum > 0)
1679 GOMP_OFFLOAD_openacc_async_queue_callback (aq, cuda_free_argmem, block);
1680}
1681
1682void *
1683GOMP_OFFLOAD_openacc_create_thread_data (int ord)
1684{
1685 struct ptx_device *ptx_dev;
1686 struct nvptx_thread *nvthd
1687 = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
1688 CUcontext thd_ctx;
1689
1690 ptx_dev = ptx_devices[ord];
1691
1692 assert (ptx_dev);
1693
1694 CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);
1695
1696 assert (ptx_dev->ctx);
1697
1698 if (!thd_ctx)
1699 CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);
1700
1701 nvthd->ptx_dev = ptx_dev;
1702
1703 return (void *) nvthd;
1704}
1705
1706void
1707GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
1708{
1709 free (data);
1710}
1711
1712void *
1713GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
1714{
1715 return nvptx_get_current_cuda_device ();
1716}
1717
1718void *
1719GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
1720{
1721 return nvptx_get_current_cuda_context ();
1722}
1723
1724/* This returns a CUstream. */
1725void *
1726GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *aq)
1727{
1728 return (void *) aq->cuda_stream;
1729}
1730
1731/* This takes a CUstream. */
1732int
1733GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream)
1734{
1735 if (aq->cuda_stream)
1736 {
1737 CUDA_CALL_ASSERT (cuStreamSynchronize, aq->cuda_stream);
1738 CUDA_CALL_ASSERT (cuStreamDestroy, aq->cuda_stream);
1739 }
1740
1741 aq->cuda_stream = (CUstream) stream;
1742 return 1;
1743}
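These two hooks correspond to the OpenACC interoperability routines acc_get_cuda_stream and acc_set_cuda_stream declared in openacc.h. A minimal user-level sketch follows (illustrative only, not part of this file; the async queue numbers and the function name are arbitrary, and the OpenACC runtime is assumed to have initialized the device already):

#include <openacc.h>
#include <cuda.h>

void
example_stream_interop (void)
{
  /* Retrieve the CUstream that the runtime uses for async queue 1.  */
  CUstream s = (CUstream) acc_get_cuda_stream (1);
  (void) s;

  /* Or hand the runtime a stream created directly with the driver API,
     to be used for async queue 2 from now on.  */
  CUstream mine;
  if (cuStreamCreate (&mine, CU_STREAM_DEFAULT) == CUDA_SUCCESS)
    acc_set_cuda_stream (2, (void *) mine);
}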
1744
1745struct goacc_asyncqueue *
1746GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
1747{
1748 CUstream stream = NULL;
1749 CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, CU_STREAM_DEFAULT);
1750
1751 struct goacc_asyncqueue *aq
1752 = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue));
1753 aq->cuda_stream = stream;
1754 return aq;
1755}
1756
1757bool
1758GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
1759{
1760 CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream);
1761 free (aq);
1762 return true;
1763}
1764
1765int
1766GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
1767{
1768 CUresult r = CUDA_CALL_NOCHECK (cuStreamQuery, aq->cuda_stream);
1769 if (r == CUDA_SUCCESS)
1770 return 1;
1771 if (r == CUDA_ERROR_NOT_READY)
1772 return 0;
1773
1774 GOMP_PLUGIN_error ("cuStreamQuery error: %s", cuda_error (r));
1775 return -1;
1776}
1777
1778bool
1779GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
1780{
1781 CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream);
1782 return true;
1783}
1784
1785bool
1786GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
1787 struct goacc_asyncqueue *aq2)
1788{
1789 CUevent e;
1790 CUDA_CALL_ERET (false, cuEventCreate, &e, CU_EVENT_DISABLE_TIMING);
1791 CUDA_CALL_ERET (false, cuEventRecord, e, aq1->cuda_stream);
1792 CUDA_CALL_ERET (false, cuStreamWaitEvent, aq2->cuda_stream, e, 0);
1793 return true;
1794}
1795
1796static void
1797cuda_callback_wrapper (CUstream stream, CUresult res, void *ptr)
1798{
1799 if (res != CUDA_SUCCESS)
1800 GOMP_PLUGIN_fatal ("%s error: %s", __FUNCTION__, cuda_error (res));
1801 struct nvptx_callback *cb = (struct nvptx_callback *) ptr;
1802 cb->fn (cb->ptr);
1803 free (ptr);
1804}
1805
1806void
1807GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
1808 void (*callback_fn)(void *),
1809 void *userptr)
1810{
1811 struct nvptx_callback *b = GOMP_PLUGIN_malloc (sizeof (*b));
1812 b->fn = callback_fn;
1813 b->ptr = userptr;
1814 b->aq = aq;
1815 CUDA_CALL_ASSERT (cuStreamAddCallback, aq->cuda_stream,
1816 cuda_callback_wrapper, (void *) b, 0);
1817}
1818
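/* Validate a prospective host/device transfer of S bytes: H must be a
   non-NULL host address, and D a non-NULL device address distinct from H
   that lies within a device allocation (as reported by
   cuMemGetAddressRange) large enough to hold S bytes.  A zero-byte
   transfer is always accepted.  */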
1819static bool
1820cuda_memcpy_sanity_check (const void *h, const void *d, size_t s)
1821{
1822 CUdeviceptr pb;
1823 size_t ps;
1824 if (!s)
1825 return true;
1826 if (!d)
1827 {
1828 GOMP_PLUGIN_error ("invalid device address");
1829 return false;
1830 }
1831 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
1832 if (!pb)
1833 {
1834 GOMP_PLUGIN_error ("invalid device address");
1835 return false;
1836 }
1837 if (!h)
1838 {
1839 GOMP_PLUGIN_error ("invalid host address");
1840 return false;
1841 }
1842 if (d == h)
1843 {
1844 GOMP_PLUGIN_error ("invalid host or device address");
1845 return false;
1846 }
1847 if ((void *)(d + s) > (void *)(pb + ps))
1848 {
1849 GOMP_PLUGIN_error ("invalid size");
1850 return false;
1851 }
1852 return true;
1853}
1854
1855bool
1856GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
1857{
1858 if (!nvptx_attach_host_thread_to_device (ord)
1859 || !cuda_memcpy_sanity_check (src, dst, n))
1860 return false;
1861 CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) dst, src, n);
1862 return true;
1863}
1864
1865bool
1866GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
1867{
1868 if (!nvptx_attach_host_thread_to_device (ord)
1869 || !cuda_memcpy_sanity_check (dst, src, n))
1870 return false;
1871 CUDA_CALL (cuMemcpyDtoH, dst, (CUdeviceptr) src, n);
1872 return true;
1873}
1874
1875bool
1876GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
1877{
1878 CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, NULL);
1879 return true;
1880}
1881
1882bool
1883GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src,
1884 size_t n, struct goacc_asyncqueue *aq)
1885{
1886 if (!nvptx_attach_host_thread_to_device (ord)
1887 || !cuda_memcpy_sanity_check (src, dst, n))
1888 return false;
1889 CUDA_CALL (cuMemcpyHtoDAsync, (CUdeviceptr) dst, src, n, aq->cuda_stream);
1890 return true;
1891}
1892
1893bool
1894GOMP_OFFLOAD_openacc_async_dev2host (int ord, void *dst, const void *src,
1895 size_t n, struct goacc_asyncqueue *aq)
1896{
1897 if (!nvptx_attach_host_thread_to_device (ord)
1898 || !cuda_memcpy_sanity_check (dst, src, n))
1899 return false;
1900 CUDA_CALL (cuMemcpyDtoHAsync, dst, (CUdeviceptr) src, n, aq->cuda_stream);
1901 return true;
1902}
1903
1904union goacc_property_value
1905GOMP_OFFLOAD_openacc_get_property (int n, enum goacc_property prop)
1906{
1907 union goacc_property_value propval = { .val = 0 };
1908
1909 pthread_mutex_lock (&ptx_dev_lock);
1910
1911 if (n >= nvptx_get_num_devices () || n < 0 || ptx_devices[n] == NULL)
1912 {
1913 pthread_mutex_unlock (&ptx_dev_lock);
1914 return propval;
1915 }
1916
1917 struct ptx_device *ptx_dev = ptx_devices[n];
1918 switch (prop)
1919 {
1920 case GOACC_PROPERTY_MEMORY:
1921 {
1922 size_t total_mem;
1923
1924 CUDA_CALL_ERET (propval, cuDeviceTotalMem, &total_mem, ptx_dev->dev);
1925 propval.val = total_mem;
1926 }
1927 break;
1928 case GOACC_PROPERTY_FREE_MEMORY:
1929 {
1930 size_t total_mem;
1931 size_t free_mem;
1932 CUdevice ctxdev;
1933
1934 CUDA_CALL_ERET (propval, cuCtxGetDevice, &ctxdev);
1935 if (ptx_dev->dev == ctxdev)
1936 CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
1937 else if (ptx_dev->ctx)
1938 {
1939 CUcontext old_ctx;
1940
1941 CUDA_CALL_ERET (propval, cuCtxPushCurrent, ptx_dev->ctx);
1942 CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
1943 CUDA_CALL_ASSERT (cuCtxPopCurrent, &old_ctx);
1944 }
1945 else
1946 {
1947 CUcontext new_ctx;
1948
1949 CUDA_CALL_ERET (propval, cuCtxCreate, &new_ctx, CU_CTX_SCHED_AUTO,
1950 ptx_dev->dev);
1951 CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
1952 CUDA_CALL_ASSERT (cuCtxDestroy, new_ctx);
1953 }
1954 propval.val = free_mem;
1955 }
1956 break;
1957 case GOACC_PROPERTY_NAME:
1958 propval.ptr = ptx_dev->name;
1959 break;
1960 case GOACC_PROPERTY_VENDOR:
1961 propval.ptr = "Nvidia";
1962 break;
1963 case GOACC_PROPERTY_DRIVER:
1964 propval.ptr = cuda_driver_version_s;
1965 break;
1966 default:
1967 break;
1968 }
1969
1970 pthread_mutex_unlock (&ptx_dev_lock);
1971 return propval;
1972}
1973
1974/* Adjust launch dimensions: pick good values for the number of blocks and
1975 warps, and ensure that the number of warps exceeds neither the CUDA limits
1976 nor GCC's own limits. */
1977
1978static void
1979nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
1980 struct ptx_device *ptx_dev,
1981 int *teams_p, int *threads_p)
1982{
1983 int max_warps_block = fn->max_threads_per_block / 32;
1984 /* A maximum of 32 warps per block is an implementation limit in the NVPTX
1985 backend and libgcc; it matches the documented limit of all GPUs as of 2015. */
1986 if (max_warps_block > 32)
1987 max_warps_block = 32;
1988 if (*threads_p <= 0)
1989 *threads_p = 8;
1990 if (*threads_p > max_warps_block)
1991 *threads_p = max_warps_block;
1992
1993 int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
1994 /* This is an estimate of how many blocks the device can host simultaneously.
1995 The actual limit, which may be lower, can be queried with the "occupancy
1996 control" driver interface (since CUDA 6.0); see the sketch below. */
1997 int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
1998 if (*teams_p <= 0 || *teams_p > max_blocks)
1999 *teams_p = max_blocks;
2000}
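As the comment above notes, the register-based estimate could be cross-checked against the CUDA occupancy API. A minimal sketch, assuming the plugin may call cuOccupancyMaxActiveBlocksPerMultiprocessor directly; the helper name below is hypothetical and not part of this file:

/* Illustrative only: ask the driver how many blocks of THREADS warps each
   (THREADS * 32 threads) can be resident per SM for FUNCTION, and scale by
   the SM count.  Returns 0 on failure so a caller could fall back to the
   register-based estimate in nvptx_adjust_launch_bounds.  */
static int
nvptx_max_blocks_by_occupancy (struct ptx_device *ptx_dev,
			       CUfunction function, int threads)
{
  int blocks_per_sm = 0;
  if (cuOccupancyMaxActiveBlocksPerMultiprocessor (&blocks_per_sm, function,
						   threads * 32, 0)
      != CUDA_SUCCESS)
    return 0;
  return blocks_per_sm * ptx_dev->num_sms;
}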
2001
2002/* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
2003 target regions. */
2004
2005static size_t
2006nvptx_stacks_size ()
2007{
2008 return 128 * 1024;
2009}
2010
2011/* Return contiguous storage for NUM stacks, each SIZE bytes. The lock for
2012 the storage should be held on entry, and remains held on exit. */
2013
2014static void *
2015nvptx_stacks_acquire (struct ptx_device *ptx_dev, size_t size, int num)
2016{
2017 if (ptx_dev->omp_stacks.ptr && ptx_dev->omp_stacks.size >= size * num)
2018 return (void *) ptx_dev->omp_stacks.ptr;
2019
2020 /* Free the old, too-small stacks. */
2021 if (ptx_dev->omp_stacks.ptr)
2022 {
2023 CUresult r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
2024 if (r != CUDA_SUCCESS)
2025 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s\n", cuda_error (r));
2026 r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
2027 if (r != CUDA_SUCCESS)
2028 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
2029 }
2030
2031 /* Make new and bigger stacks, and remember where we put them and how big
2032 they are. */
2033 CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &ptx_dev->omp_stacks.ptr,
2034 size * num);
2035 if (r != CUDA_SUCCESS)
2036 GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
2037
2038 ptx_dev->omp_stacks.size = size * num;
2039
2040 return (void *) ptx_dev->omp_stacks.ptr;
2041}
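To give a sense of scale, an illustrative worked example (not part of the original file):

/* A launch of 16 teams with 8 warps each calls
   nvptx_stacks_acquire (ptx_dev, 128 * 1024, 16 * 8), i.e. it reserves
   16 * 8 * 128 KiB = 16 MiB of device memory for soft stacks; the block
   then stays cached for subsequent kernel launches of the same or smaller
   size.  */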
2042
2043
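/* Copy helpers handed to GOMP_PLUGIN_target_rev by GOMP_OFFLOAD_run below
   for reverse offload: each issues an asynchronous copy on the dedicated
   copy stream and then waits for it to complete.  */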
2044void
2045rev_off_dev_to_host_cpy (void *dest, const void *src, size_t size,
2046 CUstream stream)
2047{
2048 CUDA_CALL_ASSERT (cuMemcpyDtoHAsync, dest, (CUdeviceptr) src, size, stream);
2049 CUDA_CALL_ASSERT (cuStreamSynchronize, stream);
2050}
2051
2052void
2053rev_off_host_to_dev_cpy (void *dest, const void *src, size_t size,
2054 CUstream stream)
2055{
2056 CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, (CUdeviceptr) dest, src, size, stream);
2057 CUDA_CALL_ASSERT (cuStreamSynchronize, stream);
2058}
2059
2060void
2061GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
2062{
2063 struct targ_fn_descriptor *tgt_fn_desc
2064 = (struct targ_fn_descriptor *) tgt_fn;
2065 CUfunction function = tgt_fn_desc->fn;
2066 const struct targ_fn_launch *launch = tgt_fn_desc->launch;
2067 const char *fn_name = launch->fn;
2068 CUresult r;
2069 struct ptx_device *ptx_dev = ptx_devices[ord];
2070 const char *maybe_abort_msg = "(perhaps abort was called)";
2071 int teams = 0, threads = 0;
2072
2073 if (!args)
2074 GOMP_PLUGIN_fatal ("No target arguments provided");
2075 while (*args)
2076 {
2077 intptr_t id = (intptr_t) *args++, val;
2078 if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
2079 val = (intptr_t) *args++;
2080 else
2081 val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
2082 if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
2083 continue;
2084 val = val > INT_MAX ? INT_MAX : val;
2085 id &= GOMP_TARGET_ARG_ID_MASK;
2086 if (id == GOMP_TARGET_ARG_NUM_TEAMS)
2087 teams = val;
2088 else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
2089 threads = val;
2090 }
2091 nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
2092
2093 size_t stack_size = nvptx_stacks_size ();
2094 bool reverse_offload = ptx_dev->rev_data != NULL;
2095 CUstream copy_stream = NULL;
2096
2097 pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
2098 void *stacks = nvptx_stacks_acquire (ptx_dev, stack_size, teams * threads);
2099 void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
2100 size_t fn_args_size = sizeof fn_args;
2101 void *config[] = {
2102 CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
2103 CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
2104 CU_LAUNCH_PARAM_END
2105 };
2106 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
2107 " [(teams: %u), 1, 1] [(lanes: 32), (threads: %u), 1]\n",
2108 __FUNCTION__, fn_name, teams, threads);
2109 if (reverse_offload)
2110 CUDA_CALL_ASSERT (cuStreamCreate, &copy_stream, CU_STREAM_NON_BLOCKING);
2111 r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
2112 32, threads, 1, 0, NULL, NULL, config);
2113 if (r != CUDA_SUCCESS)
2114 GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
2115 if (reverse_offload)
2116 while (true)
2117 {
2118 r = CUDA_CALL_NOCHECK (cuStreamQuery, NULL);
2119 if (r == CUDA_SUCCESS)
2120 break;
2121 if (r == CUDA_ERROR_LAUNCH_FAILED)
2122 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s %s\n", cuda_error (r),
2123 maybe_abort_msg);
2124 else if (r != CUDA_ERROR_NOT_READY)
2125 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
2126
2127 if (__atomic_load_n (&ptx_dev->rev_data->fn, __ATOMIC_ACQUIRE) != 0)
2128 {
2129 struct rev_offload *rev_data = ptx_dev->rev_data;
2130 GOMP_PLUGIN_target_rev (rev_data->fn, rev_data->mapnum,
2131 rev_data->addrs, rev_data->sizes,
2132 rev_data->kinds, rev_data->dev_num,
2133 rev_off_dev_to_host_cpy,
2134 rev_off_host_to_dev_cpy, copy_stream);
2135 CUDA_CALL_ASSERT (cuStreamSynchronize, copy_stream);
2136 __atomic_store_n (&rev_data->fn, 0, __ATOMIC_RELEASE);
2137 }
2138 usleep (1);
2139 }
2140 else
2141 r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
2142 if (reverse_offload)
2143 CUDA_CALL_ASSERT (cuStreamDestroy, copy_stream);
2144 if (r == CUDA_ERROR_LAUNCH_FAILED)
2145 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
2146 maybe_abort_msg);
2147 else if (r != CUDA_SUCCESS)
2148 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
2149
2150 pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
2151}
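For reference, a hypothetical argument array matching the decoding loop at the top of GOMP_OFFLOAD_run above; the macro names come from gomp-constants.h, the numeric values are arbitrary, and small values are assumed to fit the packed in-word form (larger ones would use GOMP_TARGET_ARG_SUBSEQUENT_PARAM followed by a separate value element):

#include <stdint.h>

/* Illustrative only: request num_teams = 4 and thread_limit = 64 for all
   devices; the trailing NULL terminates the while (*args) loop.  */
static void *example_target_args[] = {
  (void *) (uintptr_t) (GOMP_TARGET_ARG_DEVICE_ALL | GOMP_TARGET_ARG_NUM_TEAMS
			| (4 << GOMP_TARGET_ARG_VALUE_SHIFT)),
  (void *) (uintptr_t) (GOMP_TARGET_ARG_DEVICE_ALL
			| GOMP_TARGET_ARG_THREAD_LIMIT
			| (64 << GOMP_TARGET_ARG_VALUE_SHIFT)),
  NULL
};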
2152
2153/* TODO: Implement GOMP_OFFLOAD_async_run. */