/* Plugin for NVPTX execution.

   Copyright (C) 2013-2022 Free Software Foundation, Inc.

   Contributed by Mentor Embedded.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Nvidia PTX-specific parts of OpenACC support.  The cuda driver
   library appears to hold some implicit state, but the documentation
   is not clear as to what that state might be, or how one might
   propagate it from one thread to another.  */

#define _GNU_SOURCE
#include "openacc.h"
#include "config.h"
#include "symcat.h"
#include "libgomp-plugin.h"
#include "oacc-plugin.h"
#include "gomp-constants.h"
#include "oacc-int.h"

#include <pthread.h>
#ifndef PLUGIN_NVPTX_INCLUDE_SYSTEM_CUDA_H
# include "cuda/cuda.h"
#else
# include <cuda.h>
#endif
#include <stdbool.h>
#include <limits.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include <assert.h>
#include <errno.h>

/* An arbitrary fixed limit (128MB) for the size of the OpenMP soft stacks
   block to cache between kernel invocations.  For soft-stacks blocks bigger
   than this, we will free the block before attempting another GPU memory
   allocation (i.e. in GOMP_OFFLOAD_alloc).  Otherwise, if an allocation fails,
   we will free the cached soft-stacks block anyway then retry the
   allocation.  If that fails too, we lose.  */

#define SOFTSTACK_CACHE_LIMIT 134217728
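/* I.e. 128 * 1024 * 1024 bytes, the 128MB limit described above.  */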

#if CUDA_VERSION < 6000
extern CUresult cuGetErrorString (CUresult, const char **);
#define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
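/* A note for clarity: 82 is the numeric value this attribute enumerator has
   in CUDA 6.0 and newer headers; supplying it manually lets the
   cuDeviceGetAttribute query in nvptx_open_device below compile against
   older headers as well.  */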
#endif

#if CUDA_VERSION >= 6050
#undef cuLinkCreate
#undef cuLinkAddData
CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
			const char *, unsigned, CUjit_option *, void **);
CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
#else
typedef size_t (*CUoccupancyB2DSize)(int);
CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
			   const char *, unsigned, CUjit_option *, void **);
CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
CUresult cuOccupancyMaxPotentialBlockSize (int *, int *, CUfunction,
					   CUoccupancyB2DSize, size_t, int);
#endif

#define DO_PRAGMA(x) _Pragma (#x)

#ifndef PLUGIN_NVPTX_LINK_LIBCUDA
# include <dlfcn.h>

struct cuda_lib_s {

# define CUDA_ONE_CALL(call)			\
  __typeof (call) *call;
# define CUDA_ONE_CALL_MAYBE_NULL(call)		\
  CUDA_ONE_CALL (call)
#include "cuda-lib.def"
# undef CUDA_ONE_CALL
# undef CUDA_ONE_CALL_MAYBE_NULL

} cuda_lib;

/* -1 if init_cuda_lib has not been called yet, false
   if it has been and failed, true if it has been and succeeded.  */
static signed char cuda_lib_inited = -1;

/* Dynamically load the CUDA driver library (libcuda) and initialize function
   pointers; return false if unsuccessful, true if successful.  */
static bool
init_cuda_lib (void)
{
  if (cuda_lib_inited != -1)
    return cuda_lib_inited;
  const char *cuda_runtime_lib = "libcuda.so.1";
  void *h = dlopen (cuda_runtime_lib, RTLD_LAZY);
  cuda_lib_inited = false;
  if (h == NULL)
    return false;

# define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false)
# define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true)
# define CUDA_ONE_CALL_1(call, allow_null)	\
  cuda_lib.call = dlsym (h, #call);		\
  if (!allow_null && cuda_lib.call == NULL)	\
    return false;
#include "cuda-lib.def"
# undef CUDA_ONE_CALL
# undef CUDA_ONE_CALL_1
# undef CUDA_ONE_CALL_MAYBE_NULL

  cuda_lib_inited = true;
  return true;
}
# define CUDA_CALL_PREFIX cuda_lib.
#else

# define CUDA_ONE_CALL(call)
# define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call)
#include "cuda-lib.def"
#undef CUDA_ONE_CALL_MAYBE_NULL
#undef CUDA_ONE_CALL

# define CUDA_CALL_PREFIX
# define init_cuda_lib() true
#endif
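
/* An illustration of the X-macro machinery above: assuming "cuda-lib.def"
   contains an entry "CUDA_ONE_CALL (cuMemAlloc)" (the name is an example),
   the dlopen path expands it inside struct cuda_lib_s to the field
   "__typeof (cuMemAlloc) *cuMemAlloc;" and inside init_cuda_lib to
   "cuda_lib.cuMemAlloc = dlsym (h, "cuMemAlloc");" followed by a NULL check,
   while the direct-link path turns CUDA_ONE_CALL_MAYBE_NULL entries into
   weak symbols so that CUDA_CALL_EXISTS can test for them at run time.  */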

#include "secure_getenv.h"

#undef MIN
#undef MAX
#define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
#define MAX(X,Y) ((X) > (Y) ? (X) : (Y))

/* Convenience macros for the frequently used CUDA library call and
   error handling sequence as well as CUDA library calls that
   do the error checking themselves or don't do it at all.  */

#define CUDA_CALL_ERET(ERET, FN, ...)		\
  do {						\
    unsigned __r				\
      = CUDA_CALL_PREFIX FN (__VA_ARGS__);	\
    if (__r != CUDA_SUCCESS)			\
      {						\
	GOMP_PLUGIN_error (#FN " error: %s",	\
			   cuda_error (__r));	\
	return ERET;				\
      }						\
  } while (0)

#define CUDA_CALL(FN, ...)			\
  CUDA_CALL_ERET (false, FN, __VA_ARGS__)

#define CUDA_CALL_ASSERT(FN, ...)		\
  do {						\
    unsigned __r				\
      = CUDA_CALL_PREFIX FN (__VA_ARGS__);	\
    if (__r != CUDA_SUCCESS)			\
      {						\
	GOMP_PLUGIN_fatal (#FN " error: %s",	\
			   cuda_error (__r));	\
      }						\
  } while (0)

#define CUDA_CALL_NOCHECK(FN, ...)		\
  CUDA_CALL_PREFIX FN (__VA_ARGS__)

#define CUDA_CALL_EXISTS(FN)			\
  CUDA_CALL_PREFIX FN

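/* Illustrative usage sketch (a hypothetical helper, not part of this
   plugin's interface):

     static bool
     example_get_warp_size (CUdevice dev, int *ws)
     {
       CUDA_CALL (cuDeviceGetAttribute, ws,
		  CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
       return true;
     }

   CUDA_CALL reports a failure via GOMP_PLUGIN_error and makes the enclosing
   function return false; CUDA_CALL_ERET does the same with an arbitrary
   error return value, CUDA_CALL_ASSERT aborts via GOMP_PLUGIN_fatal, and
   CUDA_CALL_NOCHECK leaves checking of the returned CUresult to the
   caller.  */
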
static const char *
cuda_error (CUresult r)
{
  const char *fallback = "unknown cuda error";
  const char *desc;

  if (!CUDA_CALL_EXISTS (cuGetErrorString))
    return fallback;

  r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
  if (r == CUDA_SUCCESS)
    return desc;

  return fallback;
}

/* Version of the CUDA Toolkit in the same MAJOR.MINOR format that is used by
   Nvidia, such as in the 'deviceQuery' program (Nvidia's CUDA samples).  */
static char cuda_driver_version_s[30];

static unsigned int instantiated_devices = 0;
static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;

/* NVPTX/CUDA specific definition of asynchronous queues.  */
struct goacc_asyncqueue
{
  CUstream cuda_stream;
};

struct nvptx_callback
{
  void (*fn) (void *);
  void *ptr;
  struct goacc_asyncqueue *aq;
  struct nvptx_callback *next;
};

/* Thread-specific data for PTX.  */

struct nvptx_thread
{
  /* We currently have this embedded inside the plugin because libgomp manages
     devices through integer target_ids.  This might be better if using an
     opaque target-specific pointer directly from gomp_device_descr.  */
  struct ptx_device *ptx_dev;
};

/* Target data function launch information.  */

struct targ_fn_launch
{
  const char *fn;
  unsigned short dim[GOMP_DIM_MAX];
};

/* Target PTX object information.  */

struct targ_ptx_obj
{
  const char *code;
  size_t size;
};

/* Target data image information.  */

typedef struct nvptx_tdata
{
  const struct targ_ptx_obj *ptx_objs;
  unsigned ptx_num;

  const char *const *var_names;
  unsigned var_num;

  const struct targ_fn_launch *fn_descs;
  unsigned fn_num;
} nvptx_tdata_t;

/* Descriptor of a loaded function.  */

struct targ_fn_descriptor
{
  CUfunction fn;
  const struct targ_fn_launch *launch;
  int regs_per_thread;
  int max_threads_per_block;
};

/* A loaded PTX image.  */
struct ptx_image_data
{
  const void *target_data;
  CUmodule module;

  struct targ_fn_descriptor *fns;  /* Array of functions.  */

  struct ptx_image_data *next;
};

struct ptx_free_block
{
  void *ptr;
  struct ptx_free_block *next;
};

struct ptx_device
{
  CUcontext ctx;
  bool ctx_shared;
  CUdevice dev;

  int ord;
  bool overlap;
  bool map;
  bool concur;
  bool mkern;
  int mode;
  int clock_khz;
  int num_sms;
  int regs_per_block;
  int regs_per_sm;
  int warp_size;
  int max_threads_per_block;
  int max_threads_per_multiprocessor;
  int default_dims[GOMP_DIM_MAX];

  /* Length as used by the CUDA Runtime API ('struct cudaDeviceProp').  */
  char name[256];

  struct ptx_image_data *images;  /* Images loaded on device.  */
  pthread_mutex_t image_lock;     /* Lock for above list.  */

  struct ptx_free_block *free_blocks;
  pthread_mutex_t free_blocks_lock;

  /* OpenMP stacks, cached between kernel invocations.  */
  struct
  {
    CUdeviceptr ptr;
    size_t size;
    pthread_mutex_t lock;
  } omp_stacks;

  struct ptx_device *next;
};

static struct ptx_device **ptx_devices;

static inline struct nvptx_thread *
nvptx_thread (void)
{
  return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
}

/* Initialize the device.  Return TRUE on success, else FALSE.  PTX_DEV_LOCK
   should be locked on entry and remains locked on exit.  */

static bool
nvptx_init (void)
{
  int ndevs;

  if (instantiated_devices != 0)
    return true;

  if (!init_cuda_lib ())
    return false;

  CUDA_CALL (cuInit, 0);

  int cuda_driver_version;
  CUDA_CALL_ERET (NULL, cuDriverGetVersion, &cuda_driver_version);
  snprintf (cuda_driver_version_s, sizeof cuda_driver_version_s,
	    "CUDA Driver %u.%u",
	    cuda_driver_version / 1000, cuda_driver_version % 1000 / 10);

  CUDA_CALL (cuDeviceGetCount, &ndevs);
  ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
					    * ndevs);

  return true;
}

/* Select the N'th PTX device for the current host thread.  The device must
   have been previously opened before calling this function.  */

static bool
nvptx_attach_host_thread_to_device (int n)
{
  CUdevice dev;
  CUresult r;
  struct ptx_device *ptx_dev;
  CUcontext thd_ctx;

  r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
  if (r == CUDA_ERROR_NOT_PERMITTED)
    {
      /* Assume we're in a CUDA callback, just return true.  */
      return true;
    }
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    {
      GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
      return false;
    }

  if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
    return true;
  else
    {
      CUcontext old_ctx;

      ptx_dev = ptx_devices[n];
      if (!ptx_dev)
	{
	  GOMP_PLUGIN_error ("device %d not found", n);
	  return false;
	}

      CUDA_CALL (cuCtxGetCurrent, &thd_ctx);

      /* We don't necessarily have a current context (e.g. if it has been
	 destroyed).  Pop it if we do though.  */
      if (thd_ctx != NULL)
	CUDA_CALL (cuCtxPopCurrent, &old_ctx);

      CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
    }
  return true;
}

static struct ptx_device *
nvptx_open_device (int n)
{
  struct ptx_device *ptx_dev;
  CUdevice dev, ctx_dev;
  CUresult r;
  int async_engines, pi;

  CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);

  ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));

  ptx_dev->ord = n;
  ptx_dev->dev = dev;
  ptx_dev->ctx_shared = false;

  r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    {
      GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
      return NULL;
    }

  if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
    {
      /* The current host thread has an active context for a different device.
	 Detach it.  */
      CUcontext old_ctx;
      CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
    }

  CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);

  if (!ptx_dev->ctx)
    CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
  else
    ptx_dev->ctx_shared = true;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
  ptx_dev->overlap = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
  ptx_dev->map = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
  ptx_dev->concur = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
  ptx_dev->mode = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
  ptx_dev->mkern = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
  ptx_dev->clock_khz = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
  ptx_dev->num_sms = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
  ptx_dev->regs_per_block = pi;

  /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only
     in CUDA 6.0 and newer.  */
  r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
			 CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
			 dev);
  /* Fallback: use limit of registers per block, which is usually equal.  */
  if (r == CUDA_ERROR_INVALID_VALUE)
    pi = ptx_dev->regs_per_block;
  else if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
      return NULL;
    }
  ptx_dev->regs_per_sm = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
  if (pi != 32)
    {
      GOMP_PLUGIN_error ("Only warp size 32 is supported");
      return NULL;
    }
  ptx_dev->warp_size = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
		  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
  ptx_dev->max_threads_per_block = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
		  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
  ptx_dev->max_threads_per_multiprocessor = pi;

  r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines,
			 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
  if (r != CUDA_SUCCESS)
    async_engines = 1;

  for (int i = 0; i != GOMP_DIM_MAX; i++)
    ptx_dev->default_dims[i] = 0;

  CUDA_CALL_ERET (NULL, cuDeviceGetName, ptx_dev->name, sizeof ptx_dev->name,
		  dev);

  ptx_dev->images = NULL;
  pthread_mutex_init (&ptx_dev->image_lock, NULL);

  ptx_dev->free_blocks = NULL;
  pthread_mutex_init (&ptx_dev->free_blocks_lock, NULL);

  ptx_dev->omp_stacks.ptr = 0;
  ptx_dev->omp_stacks.size = 0;
  pthread_mutex_init (&ptx_dev->omp_stacks.lock, NULL);

  return ptx_dev;
}

static bool
nvptx_close_device (struct ptx_device *ptx_dev)
{
  if (!ptx_dev)
    return true;

  for (struct ptx_free_block *b = ptx_dev->free_blocks; b;)
    {
      struct ptx_free_block *b_next = b->next;
      CUDA_CALL (cuMemFree, (CUdeviceptr) b->ptr);
      free (b);
      b = b_next;
    }

  pthread_mutex_destroy (&ptx_dev->free_blocks_lock);
  pthread_mutex_destroy (&ptx_dev->image_lock);

  pthread_mutex_destroy (&ptx_dev->omp_stacks.lock);

  if (ptx_dev->omp_stacks.ptr)
    CUDA_CALL (cuMemFree, ptx_dev->omp_stacks.ptr);

  if (!ptx_dev->ctx_shared)
    CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);

  free (ptx_dev);
  return true;
}

static int
nvptx_get_num_devices (void)
{
  int n;

  /* This function will be called before the plugin has been initialized in
     order to enumerate available devices, but CUDA API routines can't be used
     until cuInit has been called.  Just call it now (but don't yet do any
     further initialization).  */
  if (instantiated_devices == 0)
    {
      if (!init_cuda_lib ())
	return 0;
      CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
      /* This is not an error: e.g. we may have CUDA libraries installed but
	 no devices available.  */
      if (r != CUDA_SUCCESS)
	{
	  GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
			     cuda_error (r));
	  return 0;
	}
    }

  CUDA_CALL_ERET (-1, cuDeviceGetCount, &n);
  return n;
}

static void
notify_var (const char *var_name, const char *env_var)
{
  if (env_var == NULL)
    GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name);
  else
    GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var);
}

static void
process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o)
{
  const char *var_name = "GOMP_NVPTX_JIT";
  const char *env_var = secure_getenv (var_name);
  notify_var (var_name, env_var);

  if (env_var == NULL)
    return;

  const char *c = env_var;
  while (*c != '\0')
    {
      while (*c == ' ')
	c++;

      if (c[0] == '-' && c[1] == 'O'
	  && '0' <= c[2] && c[2] <= '4'
	  && (c[3] == '\0' || c[3] == ' '))
	{
	  *gomp_nvptx_o = c[2] - '0';
	  c += 3;
	  continue;
	}

      GOMP_PLUGIN_error ("Error parsing %s", var_name);
      break;
    }
}
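
/* For example, "GOMP_NVPTX_JIT=-O2" in the environment selects PTX JIT
   optimization level 2 for the cuLinkCreate/cuLinkAddData calls below;
   options are space-separated, and anything other than -O0 .. -O4 is
   diagnosed.  */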

static bool
link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
	  unsigned num_objs)
{
  CUjit_option opts[7];
  void *optvals[7];
  float elapsed = 0.0;
  char elog[1024];
  char ilog[16384];
  CUlinkState linkstate;
  CUresult r;
  void *linkout;
  size_t linkoutsize __attribute__ ((unused));

  opts[0] = CU_JIT_WALL_TIME;
  optvals[0] = &elapsed;

  opts[1] = CU_JIT_INFO_LOG_BUFFER;
  optvals[1] = &ilog[0];

  opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
  optvals[2] = (void *) sizeof ilog;

  opts[3] = CU_JIT_ERROR_LOG_BUFFER;
  optvals[3] = &elog[0];

  opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
  optvals[4] = (void *) sizeof elog;

  opts[5] = CU_JIT_LOG_VERBOSE;
  optvals[5] = (void *) 1;

  static intptr_t gomp_nvptx_o = -1;

  static bool init_done = false;
  if (!init_done)
    {
      process_GOMP_NVPTX_JIT (&gomp_nvptx_o);
      init_done = true;
    }

  int nopts = 6;
  if (gomp_nvptx_o != -1)
    {
      opts[nopts] = CU_JIT_OPTIMIZATION_LEVEL;
      optvals[nopts] = (void *) gomp_nvptx_o;
      nopts++;
    }

  if (CUDA_CALL_EXISTS (cuLinkCreate_v2))
    CUDA_CALL (cuLinkCreate_v2, nopts, opts, optvals, &linkstate);
  else
    CUDA_CALL (cuLinkCreate, nopts, opts, optvals, &linkstate);

  for (; num_objs--; ptx_objs++)
    {
      /* cuLinkAddData's 'data' argument erroneously omits the const
	 qualifier.  */
      GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
      if (CUDA_CALL_EXISTS (cuLinkAddData_v2))
	r = CUDA_CALL_NOCHECK (cuLinkAddData_v2, linkstate, CU_JIT_INPUT_PTX,
			       (char *) ptx_objs->code, ptx_objs->size,
			       0, 0, 0, 0);
      else
	r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
			       (char *) ptx_objs->code, ptx_objs->size,
			       0, 0, 0, 0);
      if (r != CUDA_SUCCESS)
	{
	  GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
	  GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
			     cuda_error (r));
	  return false;
	}
    }

  GOMP_PLUGIN_debug (0, "Linking\n");
  r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);

  GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
  GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);

  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
      return false;
    }

  CUDA_CALL (cuModuleLoadData, module, linkout);
  CUDA_CALL (cuLinkDestroy, linkstate);
  return true;
}

static void
nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
	    unsigned *dims, void *targ_mem_desc,
	    CUdeviceptr dp, CUstream stream)
{
  struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
  CUfunction function;
  int i;
  void *kargs[1];
  struct nvptx_thread *nvthd = nvptx_thread ();
  int warp_size = nvthd->ptx_dev->warp_size;

  function = targ_fn->fn;

  /* Initialize the launch dimensions.  Typically this is constant,
     provided by the device compiler, but we must permit runtime
     values.  */
  int seen_zero = 0;
  for (i = 0; i != GOMP_DIM_MAX; i++)
    {
      if (targ_fn->launch->dim[i])
	dims[i] = targ_fn->launch->dim[i];
      if (!dims[i])
	seen_zero = 1;
    }

  if (seen_zero)
    {
      pthread_mutex_lock (&ptx_dev_lock);

      static int gomp_openacc_dims[GOMP_DIM_MAX];
      if (!gomp_openacc_dims[0])
	{
	  /* See if the user provided GOMP_OPENACC_DIM environment
	     variable to specify runtime defaults.  */
	  for (int i = 0; i < GOMP_DIM_MAX; ++i)
	    gomp_openacc_dims[i] = GOMP_PLUGIN_acc_default_dim (i);
	}

      if (!nvthd->ptx_dev->default_dims[0])
	{
	  int default_dims[GOMP_DIM_MAX];
	  for (int i = 0; i < GOMP_DIM_MAX; ++i)
	    default_dims[i] = gomp_openacc_dims[i];

	  int gang, worker, vector;
	  {
	    int block_size = nvthd->ptx_dev->max_threads_per_block;
	    int cpu_size = nvthd->ptx_dev->max_threads_per_multiprocessor;
	    int dev_size = nvthd->ptx_dev->num_sms;
	    GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
			       " dev_size=%d, cpu_size=%d\n",
			       warp_size, block_size, dev_size, cpu_size);

	    gang = (cpu_size / block_size) * dev_size;
	    worker = block_size / warp_size;
	    vector = warp_size;
	  }
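
	  /* A worked example with illustrative numbers: for a device with
	     warp_size = 32, max_threads_per_block = 1024,
	     max_threads_per_multiprocessor = 2048 and num_sms = 80, the
	     block above yields gang = (2048 / 1024) * 80 = 160,
	     worker = 1024 / 32 = 32 and vector = 32.  */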

	  /* There is no upper bound on the gang size.  The best size
	     matches the hardware configuration.  Logical gangs are
	     scheduled onto physical hardware.  To maximize usage, we
	     should guess a large number.  */
	  if (default_dims[GOMP_DIM_GANG] < 1)
	    default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
	  /* The worker size must not exceed the hardware.  */
	  if (default_dims[GOMP_DIM_WORKER] < 1
	      || (default_dims[GOMP_DIM_WORKER] > worker && gang))
	    default_dims[GOMP_DIM_WORKER] = worker;
	  /* The vector size must exactly match the hardware.  */
	  if (default_dims[GOMP_DIM_VECTOR] < 1
	      || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
	    default_dims[GOMP_DIM_VECTOR] = vector;

	  GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
			     default_dims[GOMP_DIM_GANG],
			     default_dims[GOMP_DIM_WORKER],
			     default_dims[GOMP_DIM_VECTOR]);

	  for (i = 0; i != GOMP_DIM_MAX; i++)
	    nvthd->ptx_dev->default_dims[i] = default_dims[i];
	}
      pthread_mutex_unlock (&ptx_dev_lock);

      {
	bool default_dim_p[GOMP_DIM_MAX];
	for (i = 0; i != GOMP_DIM_MAX; i++)
	  default_dim_p[i] = !dims[i];

	if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize))
	  {
	    for (i = 0; i != GOMP_DIM_MAX; i++)
	      if (default_dim_p[i])
		dims[i] = nvthd->ptx_dev->default_dims[i];

	    if (default_dim_p[GOMP_DIM_VECTOR])
	      dims[GOMP_DIM_VECTOR]
		= MIN (dims[GOMP_DIM_VECTOR],
		       (targ_fn->max_threads_per_block / warp_size
			* warp_size));

	    if (default_dim_p[GOMP_DIM_WORKER])
	      dims[GOMP_DIM_WORKER]
		= MIN (dims[GOMP_DIM_WORKER],
		       targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
	  }
	else
	  {
	    /* Handle the case that the compiler allows the runtime to choose
	       the vector-length conservatively, by ignoring
	       gomp_openacc_dims[GOMP_DIM_VECTOR].  TODO: actually handle
	       it.  */
	    int vectors = 0;
	    /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that
	       gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
	       exceed targ_fn->max_threads_per_block.  */
	    int workers = gomp_openacc_dims[GOMP_DIM_WORKER];
	    int gangs = gomp_openacc_dims[GOMP_DIM_GANG];
	    int grids, blocks;

	    CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
			      &blocks, function, NULL, 0,
			      dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
	    GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
			       "grid = %d, block = %d\n", grids, blocks);

	    /* Keep num_gangs proportional to the block size.  In
	       the case where a block size is limited by shared-memory
	       or register-file capacity, the runtime will not
	       excessively over-assign gangs to the multiprocessor
	       units if their state is going to be swapped out even
	       more than necessary.  The constant factor 2 is there to
	       prevent threads from idling when there is insufficient
	       work for them.  */
	    if (gangs == 0)
	      gangs = 2 * grids * (blocks / warp_size);

	    if (vectors == 0)
	      vectors = warp_size;

	    if (workers == 0)
	      {
		int actual_vectors = (default_dim_p[GOMP_DIM_VECTOR]
				      ? vectors
				      : dims[GOMP_DIM_VECTOR]);
		workers = blocks / actual_vectors;
		workers = MAX (workers, 1);
		/* If we need a per-worker barrier ...  */
		if (actual_vectors > 32)
		  /* ... don't use more barriers than available.  */
		  workers = MIN (workers, 15);
	      }

	    for (i = 0; i != GOMP_DIM_MAX; i++)
	      if (default_dim_p[i])
		switch (i)
		  {
		  case GOMP_DIM_GANG: dims[i] = gangs; break;
		  case GOMP_DIM_WORKER: dims[i] = workers; break;
		  case GOMP_DIM_VECTOR: dims[i] = vectors; break;
		  default: GOMP_PLUGIN_fatal ("invalid dim");
		  }
	  }
      }
    }
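
  /* Continuing the illustrative numbers above: had
     cuOccupancyMaxPotentialBlockSize reported grids = 160 and blocks = 1024
     with a warp size of 32, the heuristic would choose
     gangs = 2 * 160 * (1024 / 32) = 10240, vectors = 32 and
     workers = 1024 / 32 = 32.  */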

  /* Check if the accelerator has sufficient hardware resources to
     launch the offloaded kernel.  */
  if (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]
      > targ_fn->max_threads_per_block)
    {
      const char *msg
	= ("The Nvidia accelerator has insufficient resources to launch '%s'"
	   " with num_workers = %d and vector_length = %d"
	   "; "
	   "recompile the program with 'num_workers = x and vector_length = y'"
	   " on that offloaded region or '-fopenacc-dim=:x:y' where"
	   " x * y <= %d"
	   ".\n");
      GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
			 dims[GOMP_DIM_VECTOR], targ_fn->max_threads_per_block);
    }

  /* Check if the accelerator has sufficient barrier resources to
     launch the offloaded kernel.  */
  if (dims[GOMP_DIM_WORKER] > 15 && dims[GOMP_DIM_VECTOR] > 32)
    {
      const char *msg
	= ("The Nvidia accelerator has insufficient barrier resources to launch"
	   " '%s' with num_workers = %d and vector_length = %d"
	   "; "
	   "recompile the program with 'num_workers = x' on that offloaded"
	   " region or '-fopenacc-dim=:x:' where x <= 15"
	   "; "
	   "or, recompile the program with 'vector_length = 32' on that"
	   " offloaded region or '-fopenacc-dim=::32'"
	   ".\n");
      GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
			 dims[GOMP_DIM_VECTOR]);
    }

  GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
		     " gangs=%u, workers=%u, vectors=%u\n",
		     __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
		     dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);

  // OpenACC        CUDA
  //
  // num_gangs      nctaid.x
  // num_workers    ntid.y
  // vector length  ntid.x
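  //
  // I.e. the cuLaunchKernel call below passes the gang count as the grid's
  // x dimension, the vector length as the block's x dimension and the
  // worker count as the block's y dimension.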

  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  acc_prof_info *prof_info = thr->prof_info;
  acc_event_info enqueue_launch_event_info;
  acc_api_info *api_info = thr->api_info;
  bool profiling_p = __builtin_expect (prof_info != NULL, false);
  if (profiling_p)
    {
      prof_info->event_type = acc_ev_enqueue_launch_start;

      enqueue_launch_event_info.launch_event.event_type
	= prof_info->event_type;
      enqueue_launch_event_info.launch_event.valid_bytes
	= _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
      enqueue_launch_event_info.launch_event.parent_construct
	= acc_construct_parallel;
      enqueue_launch_event_info.launch_event.implicit = 1;
      enqueue_launch_event_info.launch_event.tool_info = NULL;
      enqueue_launch_event_info.launch_event.kernel_name = targ_fn->launch->fn;
      enqueue_launch_event_info.launch_event.num_gangs
	= dims[GOMP_DIM_GANG];
      enqueue_launch_event_info.launch_event.num_workers
	= dims[GOMP_DIM_WORKER];
      enqueue_launch_event_info.launch_event.vector_length
	= dims[GOMP_DIM_VECTOR];

      api_info->device_api = acc_device_api_cuda;

      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
					    &enqueue_launch_event_info,
					    api_info);
    }

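  /* The kernel receives a single argument: DP, the device pointer to an
     array holding the (device) addresses of all MAPNUM mapped objects, as
     prepared by the callers of nvptx_exec.  */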
  kargs[0] = &dp;
  CUDA_CALL_ASSERT (cuLaunchKernel, function,
		    dims[GOMP_DIM_GANG], 1, 1,
		    dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
		    0, stream, kargs, 0);

  if (profiling_p)
    {
      prof_info->event_type = acc_ev_enqueue_launch_end;
      enqueue_launch_event_info.launch_event.event_type
	= prof_info->event_type;
      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
					    &enqueue_launch_event_info,
					    api_info);
    }

  GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__,
		     targ_fn->launch->fn);
}

void * openacc_get_current_cuda_context (void);

static void
goacc_profiling_acc_ev_alloc (struct goacc_thread *thr, void *dp, size_t s)
{
  acc_prof_info *prof_info = thr->prof_info;
  acc_event_info data_event_info;
  acc_api_info *api_info = thr->api_info;

  prof_info->event_type = acc_ev_alloc;

  data_event_info.data_event.event_type = prof_info->event_type;
  data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
  data_event_info.data_event.parent_construct = acc_construct_parallel;
  data_event_info.data_event.implicit = 1;
  data_event_info.data_event.tool_info = NULL;
  data_event_info.data_event.var_name = NULL;
  data_event_info.data_event.bytes = s;
  data_event_info.data_event.host_ptr = NULL;
  data_event_info.data_event.device_ptr = dp;

  api_info->device_api = acc_device_api_cuda;

  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
}

/* Free the cached soft-stacks block if it is above the SOFTSTACK_CACHE_LIMIT
   size threshold, or if FORCE is true.  */

static void
nvptx_stacks_free (struct ptx_device *ptx_dev, bool force)
{
  pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
  if (ptx_dev->omp_stacks.ptr
      && (force || ptx_dev->omp_stacks.size > SOFTSTACK_CACHE_LIMIT))
    {
      CUresult r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
      ptx_dev->omp_stacks.ptr = 0;
      ptx_dev->omp_stacks.size = 0;
    }
  pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
}

static void *
nvptx_alloc (size_t s, bool suppress_errors)
{
  CUdeviceptr d;

  CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &d, s);
  if (suppress_errors && r == CUDA_ERROR_OUT_OF_MEMORY)
    return NULL;
  else if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("nvptx_alloc error: %s", cuda_error (r));
      return NULL;
    }

  /* NOTE: We only do profiling stuff if the memory allocation succeeds.  */
  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  bool profiling_p
    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
  if (profiling_p)
    goacc_profiling_acc_ev_alloc (thr, (void *) d, s);

  return (void *) d;
}

static void
goacc_profiling_acc_ev_free (struct goacc_thread *thr, void *p)
{
  acc_prof_info *prof_info = thr->prof_info;
  acc_event_info data_event_info;
  acc_api_info *api_info = thr->api_info;

  prof_info->event_type = acc_ev_free;

  data_event_info.data_event.event_type = prof_info->event_type;
  data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
  data_event_info.data_event.parent_construct = acc_construct_parallel;
  data_event_info.data_event.implicit = 1;
  data_event_info.data_event.tool_info = NULL;
  data_event_info.data_event.var_name = NULL;
  data_event_info.data_event.bytes = -1;
  data_event_info.data_event.host_ptr = NULL;
  data_event_info.data_event.device_ptr = p;

  api_info->device_api = acc_device_api_cuda;

  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
}

static bool
nvptx_free (void *p, struct ptx_device *ptx_dev)
{
  CUdeviceptr pb;
  size_t ps;

  CUresult r = CUDA_CALL_NOCHECK (cuMemGetAddressRange, &pb, &ps,
				  (CUdeviceptr) p);
  if (r == CUDA_ERROR_NOT_PERMITTED)
    {
      /* We assume that this error indicates we are in a CUDA callback
	 context, where no CUDA calls are allowed (see the cuStreamAddCallback
	 documentation for details).  Arrange to free this piece of device
	 memory later.  */
      struct ptx_free_block *n
	= GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block));
      n->ptr = p;
      pthread_mutex_lock (&ptx_dev->free_blocks_lock);
      n->next = ptx_dev->free_blocks;
      ptx_dev->free_blocks = n;
      pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
      return true;
    }
  else if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("cuMemGetAddressRange error: %s", cuda_error (r));
      return false;
    }
  if ((CUdeviceptr) p != pb)
    {
      GOMP_PLUGIN_error ("invalid device address");
      return false;
    }

  CUDA_CALL (cuMemFree, (CUdeviceptr) p);
  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  bool profiling_p
    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
  if (profiling_p)
    goacc_profiling_acc_ev_free (thr, p);

  return true;
}

static void *
nvptx_get_current_cuda_device (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return &nvthd->ptx_dev->dev;
}

static void *
nvptx_get_current_cuda_context (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return nvthd->ptx_dev->ctx;
}

/* Plugin entry points.  */

const char *
GOMP_OFFLOAD_get_name (void)
{
  return "nvptx";
}

unsigned int
GOMP_OFFLOAD_get_caps (void)
{
  return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
}

int
GOMP_OFFLOAD_get_type (void)
{
  return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
}

int
GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask)
{
  int num_devices = nvptx_get_num_devices ();
  /* Return -1 if the omp_requires_mask cannot be fulfilled, but
     devices were present.  */
  if (num_devices > 0 && omp_requires_mask != 0)
    return -1;
  return num_devices;
}

bool
GOMP_OFFLOAD_init_device (int n)
{
  struct ptx_device *dev;

  pthread_mutex_lock (&ptx_dev_lock);

  if (!nvptx_init () || ptx_devices[n] != NULL)
    {
      pthread_mutex_unlock (&ptx_dev_lock);
      return false;
    }

  dev = nvptx_open_device (n);
  if (dev)
    {
      ptx_devices[n] = dev;
      instantiated_devices++;
    }

  pthread_mutex_unlock (&ptx_dev_lock);

  return dev != NULL;
}

bool
GOMP_OFFLOAD_fini_device (int n)
{
  pthread_mutex_lock (&ptx_dev_lock);

  if (ptx_devices[n] != NULL)
    {
      if (!nvptx_attach_host_thread_to_device (n)
	  || !nvptx_close_device (ptx_devices[n]))
	{
	  pthread_mutex_unlock (&ptx_dev_lock);
	  return false;
	}
      ptx_devices[n] = NULL;
      instantiated_devices--;
    }

  if (instantiated_devices == 0)
    {
      free (ptx_devices);
      ptx_devices = NULL;
    }

  pthread_mutex_unlock (&ptx_dev_lock);
  return true;
}

/* Return the libgomp version number we're compatible with.  There is
   no requirement for cross-version compatibility.  */

unsigned
GOMP_OFFLOAD_version (void)
{
  return GOMP_VERSION;
}

/* Initialize __nvptx_clocktick, if present in MODULE.  */

static void
nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
{
  CUdeviceptr dptr;
  CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
				  module, "__nvptx_clocktick");
  if (r == CUDA_ERROR_NOT_FOUND)
    return;
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
  double __nvptx_clocktick = 1e-3 / dev->clock_khz;
  r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
			 sizeof (__nvptx_clocktick));
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
}

/* Load the (partial) program described by TARGET_DATA to device
   number ORD.  Allocate and return TARGET_TABLE.  If not NULL, REV_FN_TABLE
   will contain the on-device addresses of the functions for reverse offload.
   To be freed by the caller.  */

int
GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
			 struct addr_pair **target_table,
			 uint64_t **rev_fn_table)
{
  CUmodule module;
  const char *const *var_names;
  const struct targ_fn_launch *fn_descs;
  unsigned int fn_entries, var_entries, other_entries, i, j;
  struct targ_fn_descriptor *targ_fns;
  struct addr_pair *targ_tbl;
  const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
  struct ptx_image_data *new_image;
  struct ptx_device *dev;

  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
			 " (expected %u, received %u)",
			 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
      return -1;
    }

  if (!nvptx_attach_host_thread_to_device (ord)
      || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
    return -1;

  dev = ptx_devices[ord];

  /* The mkoffload utility emits a struct of pointers/integers at the
     start of each offload image.  The array of kernel names and the
     function addresses form a one-to-one correspondence.  */

  var_entries = img_header->var_num;
  var_names = img_header->var_names;
  fn_entries = img_header->fn_num;
  fn_descs = img_header->fn_descs;

  /* Currently, other_entries contains only the struct of ICVs.  */
  other_entries = 1;

  targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
				 * (fn_entries + var_entries + other_entries));
  targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
				 * fn_entries);

  *target_table = targ_tbl;

  new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
  new_image->target_data = target_data;
  new_image->module = module;
  new_image->fns = targ_fns;

  pthread_mutex_lock (&dev->image_lock);
  new_image->next = dev->images;
  dev->images = new_image;
  pthread_mutex_unlock (&dev->image_lock);

  for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
    {
      CUfunction function;
      int nregs, mthrs;

      CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
		      fn_descs[i].fn);
      CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
		      CU_FUNC_ATTRIBUTE_NUM_REGS, function);
      CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
		      CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);

      targ_fns->fn = function;
      targ_fns->launch = &fn_descs[i];
      targ_fns->regs_per_thread = nregs;
      targ_fns->max_threads_per_block = mthrs;

      targ_tbl->start = (uintptr_t) targ_fns;
      targ_tbl->end = targ_tbl->start + 1;
    }

  for (j = 0; j < var_entries; j++, targ_tbl++)
    {
      CUdeviceptr var;
      size_t bytes;

      CUDA_CALL_ERET (-1, cuModuleGetGlobal,
		      &var, &bytes, module, var_names[j]);

      targ_tbl->start = (uintptr_t) var;
      targ_tbl->end = targ_tbl->start + bytes;
    }

  CUdeviceptr varptr;
  size_t varsize;
  CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &varptr, &varsize,
				  module, XSTRING (GOMP_ADDITIONAL_ICVS));

  if (r == CUDA_SUCCESS)
    {
      targ_tbl->start = (uintptr_t) varptr;
      targ_tbl->end = (uintptr_t) (varptr + varsize);
    }
  else
    /* The variable was not in this image.  */
    targ_tbl->start = targ_tbl->end = 0;

  if (rev_fn_table && fn_entries == 0)
    *rev_fn_table = NULL;
  else if (rev_fn_table)
    {
      CUdeviceptr var;
      size_t bytes;
      r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &var, &bytes, module,
			     "$offload_func_table");
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
      assert (bytes == sizeof (uint64_t) * fn_entries);
      *rev_fn_table = GOMP_PLUGIN_malloc (sizeof (uint64_t) * fn_entries);
      r = CUDA_CALL_NOCHECK (cuMemcpyDtoH, *rev_fn_table, var, bytes);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
    }

  nvptx_set_clocktick (module, dev);

  return fn_entries + var_entries + other_entries;
}

/* Unload the program described by TARGET_DATA.  DEV_DATA is the
   function descriptors allocated by G_O_load_image.  */

bool
GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
{
  struct ptx_image_data *image, **prev_p;
  struct ptx_device *dev = ptx_devices[ord];

  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
			 " (expected %u, received %u)",
			 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
      return false;
    }

  bool ret = true;
  pthread_mutex_lock (&dev->image_lock);
  for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
    if (image->target_data == target_data)
      {
	*prev_p = image->next;
	if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
	  ret = false;
	free (image->fns);
	free (image);
	break;
      }
  pthread_mutex_unlock (&dev->image_lock);
  return ret;
}

void *
GOMP_OFFLOAD_alloc (int ord, size_t size)
{
  if (!nvptx_attach_host_thread_to_device (ord))
    return NULL;

  struct ptx_device *ptx_dev = ptx_devices[ord];
  struct ptx_free_block *blocks, *tmp;

  pthread_mutex_lock (&ptx_dev->free_blocks_lock);
  blocks = ptx_dev->free_blocks;
  ptx_dev->free_blocks = NULL;
  pthread_mutex_unlock (&ptx_dev->free_blocks_lock);

  nvptx_stacks_free (ptx_dev, false);

  while (blocks)
    {
      tmp = blocks->next;
      nvptx_free (blocks->ptr, ptx_dev);
      free (blocks);
      blocks = tmp;
    }

  void *d = nvptx_alloc (size, true);
  if (d)
    return d;
  else
    {
      /* Memory allocation failed.  Try freeing the stacks block, and
	 retrying.  */
      nvptx_stacks_free (ptx_dev, true);
      return nvptx_alloc (size, false);
    }
}

bool
GOMP_OFFLOAD_free (int ord, void *ptr)
{
  return (nvptx_attach_host_thread_to_device (ord)
	  && nvptx_free (ptr, ptx_devices[ord]));
}

void
GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
			   void **hostaddrs, void **devaddrs,
			   unsigned *dims, void *targ_mem_desc)
{
  GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);

  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  acc_prof_info *prof_info = thr->prof_info;
  acc_event_info data_event_info;
  acc_api_info *api_info = thr->api_info;
  bool profiling_p = __builtin_expect (prof_info != NULL, false);

  void **hp = NULL;
  CUdeviceptr dp = 0;

  if (mapnum > 0)
    {
      size_t s = mapnum * sizeof (void *);
      hp = alloca (s);
      for (int i = 0; i < mapnum; i++)
	hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
      CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
      if (profiling_p)
	goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
    }

  /* Copy the (device) pointers to arguments to the device (dp and hp might in
     fact have the same value on a unified-memory system).  */
  if (mapnum > 0)
    {
      if (profiling_p)
	{
	  prof_info->event_type = acc_ev_enqueue_upload_start;

	  data_event_info.data_event.event_type = prof_info->event_type;
	  data_event_info.data_event.valid_bytes
	    = _ACC_DATA_EVENT_INFO_VALID_BYTES;
	  data_event_info.data_event.parent_construct
	    = acc_construct_parallel;
	  data_event_info.data_event.implicit = 1; /* Always implicit.  */
	  data_event_info.data_event.tool_info = NULL;
	  data_event_info.data_event.var_name = NULL;
	  data_event_info.data_event.bytes = mapnum * sizeof (void *);
	  data_event_info.data_event.host_ptr = hp;
	  data_event_info.data_event.device_ptr = (const void *) dp;

	  api_info->device_api = acc_device_api_cuda;

	  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
						api_info);
	}
      CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, (void *) hp,
			mapnum * sizeof (void *));
      if (profiling_p)
	{
	  prof_info->event_type = acc_ev_enqueue_upload_end;
	  data_event_info.data_event.event_type = prof_info->event_type;
	  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
						api_info);
	}
    }

  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
	      dp, NULL);

  CUresult r = CUDA_CALL_NOCHECK (cuStreamSynchronize, NULL);
  const char *maybe_abort_msg = "(perhaps abort was called)";
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
		       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));

  CUDA_CALL_ASSERT (cuMemFree, dp);
  if (profiling_p)
    goacc_profiling_acc_ev_free (thr, (void *) dp);
}

static void
cuda_free_argmem (void *ptr)
{
  void **block = (void **) ptr;
  nvptx_free (block[0], (struct ptx_device *) block[1]);
  free (block);
}

void
GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *), size_t mapnum,
				 void **hostaddrs, void **devaddrs,
				 unsigned *dims, void *targ_mem_desc,
				 struct goacc_asyncqueue *aq)
{
  GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);

  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  acc_prof_info *prof_info = thr->prof_info;
  acc_event_info data_event_info;
  acc_api_info *api_info = thr->api_info;
  bool profiling_p = __builtin_expect (prof_info != NULL, false);

  void **hp = NULL;
  CUdeviceptr dp = 0;
  void **block = NULL;

  if (mapnum > 0)
    {
      size_t s = mapnum * sizeof (void *);
      block = (void **) GOMP_PLUGIN_malloc (2 * sizeof (void *) + s);
      hp = block + 2;
      for (int i = 0; i < mapnum; i++)
	hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
      CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
      if (profiling_p)
	goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
    }

  /* Copy the (device) pointers to arguments to the device (dp and hp might in
     fact have the same value on a unified-memory system).  */
  if (mapnum > 0)
    {
      if (profiling_p)
	{
	  prof_info->event_type = acc_ev_enqueue_upload_start;

	  data_event_info.data_event.event_type = prof_info->event_type;
	  data_event_info.data_event.valid_bytes
	    = _ACC_DATA_EVENT_INFO_VALID_BYTES;
	  data_event_info.data_event.parent_construct
	    = acc_construct_parallel;
	  data_event_info.data_event.implicit = 1; /* Always implicit.  */
	  data_event_info.data_event.tool_info = NULL;
	  data_event_info.data_event.var_name = NULL;
	  data_event_info.data_event.bytes = mapnum * sizeof (void *);
	  data_event_info.data_event.host_ptr = hp;
	  data_event_info.data_event.device_ptr = (const void *) dp;

	  api_info->device_api = acc_device_api_cuda;

	  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
						api_info);
	}

      CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, dp, (void *) hp,
			mapnum * sizeof (void *), aq->cuda_stream);
      block[0] = (void *) dp;

      struct nvptx_thread *nvthd =
	(struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
      block[1] = (void *) nvthd->ptx_dev;

      if (profiling_p)
	{
	  prof_info->event_type = acc_ev_enqueue_upload_end;
	  data_event_info.data_event.event_type = prof_info->event_type;
	  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
						api_info);
	}
    }

  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
	      dp, aq->cuda_stream);

  if (mapnum > 0)
    GOMP_OFFLOAD_openacc_async_queue_callback (aq, cuda_free_argmem, block);
}
1642
1643void *
d93bdab5 1644GOMP_OFFLOAD_openacc_create_thread_data (int ord)
41dbbb37 1645{
d93bdab5 1646 struct ptx_device *ptx_dev;
41dbbb37
TS
1647 struct nvptx_thread *nvthd
1648 = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
41dbbb37
TS
1649 CUcontext thd_ctx;
1650
d93bdab5
JB
1651 ptx_dev = ptx_devices[ord];
1652
1653 assert (ptx_dev);
1654
6ce13072 1655 CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);
41dbbb37
TS
1656
1657 assert (ptx_dev->ctx);
1658
1659 if (!thd_ctx)
6ce13072 1660 CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);
41dbbb37 1661
41dbbb37
TS
1662 nvthd->ptx_dev = ptx_dev;
1663
1664 return (void *) nvthd;
1665}
1666
1667void
1668GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
1669{
1670 free (data);
1671}
1672
1673void *
345a8c17 1674GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
41dbbb37
TS
1675{
1676 return nvptx_get_current_cuda_device ();
1677}
1678
1679void *
345a8c17 1680GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
41dbbb37
TS
1681{
1682 return nvptx_get_current_cuda_context ();
1683}
1684
1f4c5b9b 1685/* This returns a CUstream. */
41dbbb37 1686void *
1f4c5b9b
CLT
1687GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *aq)
1688{
1689 return (void *) aq->cuda_stream;
1690}
1691
1692/* This takes a CUstream. */
1693int
1694GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream)
1695{
1696 if (aq->cuda_stream)
1697 {
1698 CUDA_CALL_ASSERT (cuStreamSynchronize, aq->cuda_stream);
1699 CUDA_CALL_ASSERT (cuStreamDestroy, aq->cuda_stream);
1700 }
1701
1702 aq->cuda_stream = (CUstream) stream;
1703 return 1;
1704}

struct goacc_asyncqueue *
GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
{
  CUstream stream = NULL;
  CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, CU_STREAM_DEFAULT);

  struct goacc_asyncqueue *aq
    = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue));
  aq->cuda_stream = stream;
  return aq;
}

bool
GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
{
  CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream);
  free (aq);
  return true;
}

int
GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
{
  CUresult r = CUDA_CALL_NOCHECK (cuStreamQuery, aq->cuda_stream);
  if (r == CUDA_SUCCESS)
    return 1;
  if (r == CUDA_ERROR_NOT_READY)
    return 0;

  GOMP_PLUGIN_error ("cuStreamQuery error: %s", cuda_error (r));
  return -1;
}
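
/* Illustrative sketch, not part of the plugin: the tri-state result above
   (1 = drained, 0 = still busy, -1 = error) supports a non-blocking poll
   loop such as this hypothetical helper.  */

static void __attribute__((unused))
example_poll_queue (struct goacc_asyncqueue *aq)
{
  int status;
  while ((status = GOMP_OFFLOAD_openacc_async_test (aq)) == 0)
    ; /* Queue still busy; a real caller would overlap other work here.  */
  if (status < 0)
    GOMP_PLUGIN_fatal ("async queue is in an error state");
}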

bool
GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
{
  CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream);
  return true;
}

bool
GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
                                      struct goacc_asyncqueue *aq2)
{
  CUevent e;
  CUDA_CALL_ERET (false, cuEventCreate, &e, CU_EVENT_DISABLE_TIMING);
  CUDA_CALL_ERET (false, cuEventRecord, e, aq1->cuda_stream);
  CUDA_CALL_ERET (false, cuStreamWaitEvent, aq2->cuda_stream, e, 0);
  return true;
}

static void
cuda_callback_wrapper (CUstream stream, CUresult res, void *ptr)
{
  if (res != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("%s error: %s", __FUNCTION__, cuda_error (res));
  struct nvptx_callback *cb = (struct nvptx_callback *) ptr;
  cb->fn (cb->ptr);
  free (ptr);
}

void
GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
                                           void (*callback_fn)(void *),
                                           void *userptr)
{
  struct nvptx_callback *b = GOMP_PLUGIN_malloc (sizeof (*b));
  b->fn = callback_fn;
  b->ptr = userptr;
  b->aq = aq;
  CUDA_CALL_ASSERT (cuStreamAddCallback, aq->cuda_stream,
                    cuda_callback_wrapper, (void *) b, 0);
}
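
/* Illustrative sketch, not part of the plugin: enqueueing a host-side cleanup
   that runs only after all work currently in the queue has finished.  The
   staging-buffer scenario is hypothetical.  Note that CUDA forbids making
   CUDA API calls from inside such a callback.  */

static void __attribute__((unused))
example_deferred_free (struct goacc_asyncqueue *aq, void *staging_buffer)
{
  /* "free" runs on a driver thread once prior operations in AQ finish.  */
  GOMP_OFFLOAD_openacc_async_queue_callback (aq, free, staging_buffer);
}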

static bool
cuda_memcpy_sanity_check (const void *h, const void *d, size_t s)
{
  CUdeviceptr pb;
  size_t ps;
  if (!s)
    return true;
  if (!d)
    {
      GOMP_PLUGIN_error ("invalid device address");
      return false;
    }
  CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
  if (!pb)
    {
      GOMP_PLUGIN_error ("invalid device address");
      return false;
    }
  if (!h)
    {
      GOMP_PLUGIN_error ("invalid host address");
      return false;
    }
  if (d == h)
    {
      GOMP_PLUGIN_error ("invalid host or device address");
      return false;
    }
  if ((void *)(d + s) > (void *)(pb + ps))
    {
      GOMP_PLUGIN_error ("invalid size");
      return false;
    }
  return true;
}

bool
GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
{
  if (!nvptx_attach_host_thread_to_device (ord)
      || !cuda_memcpy_sanity_check (src, dst, n))
    return false;
  CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) dst, src, n);
  return true;
}

bool
GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
{
  if (!nvptx_attach_host_thread_to_device (ord)
      || !cuda_memcpy_sanity_check (dst, src, n))
    return false;
  CUDA_CALL (cuMemcpyDtoH, dst, (CUdeviceptr) src, n);
  return true;
}

bool
GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
{
  CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, NULL);
  return true;
}

bool
GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src,
                                     size_t n, struct goacc_asyncqueue *aq)
{
  if (!nvptx_attach_host_thread_to_device (ord)
      || !cuda_memcpy_sanity_check (src, dst, n))
    return false;
  CUDA_CALL (cuMemcpyHtoDAsync, (CUdeviceptr) dst, src, n, aq->cuda_stream);
  return true;
}

bool
GOMP_OFFLOAD_openacc_async_dev2host (int ord, void *dst, const void *src,
                                     size_t n, struct goacc_asyncqueue *aq)
{
  if (!nvptx_attach_host_thread_to_device (ord)
      || !cuda_memcpy_sanity_check (dst, src, n))
    return false;
  CUDA_CALL (cuMemcpyDtoHAsync, dst, (CUdeviceptr) src, n, aq->cuda_stream);
  return true;
}

union goacc_property_value
GOMP_OFFLOAD_openacc_get_property (int n, enum goacc_property prop)
{
  union goacc_property_value propval = { .val = 0 };

  pthread_mutex_lock (&ptx_dev_lock);

  if (n >= nvptx_get_num_devices () || n < 0 || ptx_devices[n] == NULL)
    {
      pthread_mutex_unlock (&ptx_dev_lock);
      return propval;
    }

  struct ptx_device *ptx_dev = ptx_devices[n];
  switch (prop)
    {
    case GOACC_PROPERTY_MEMORY:
      {
        size_t total_mem;

        CUDA_CALL_ERET (propval, cuDeviceTotalMem, &total_mem, ptx_dev->dev);
        propval.val = total_mem;
      }
      break;
    case GOACC_PROPERTY_FREE_MEMORY:
      {
        size_t total_mem;
        size_t free_mem;
        CUdevice ctxdev;

        CUDA_CALL_ERET (propval, cuCtxGetDevice, &ctxdev);
        if (ptx_dev->dev == ctxdev)
          CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
        else if (ptx_dev->ctx)
          {
            CUcontext old_ctx;

            CUDA_CALL_ERET (propval, cuCtxPushCurrent, ptx_dev->ctx);
            CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
            CUDA_CALL_ASSERT (cuCtxPopCurrent, &old_ctx);
          }
        else
          {
            CUcontext new_ctx;

            CUDA_CALL_ERET (propval, cuCtxCreate, &new_ctx, CU_CTX_SCHED_AUTO,
                            ptx_dev->dev);
            CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
            CUDA_CALL_ASSERT (cuCtxDestroy, new_ctx);
          }
        propval.val = free_mem;
      }
      break;
    case GOACC_PROPERTY_NAME:
      propval.ptr = ptx_dev->name;
      break;
    case GOACC_PROPERTY_VENDOR:
      propval.ptr = "Nvidia";
      break;
    case GOACC_PROPERTY_DRIVER:
      propval.ptr = cuda_driver_version_s;
      break;
    default:
      break;
    }

  pthread_mutex_unlock (&ptx_dev_lock);
  return propval;
}
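
/* Illustrative sketch, not part of the plugin: user code reaches the switch
   above through the OpenACC 2.6 property API declared in openacc.h; the
   device number 0 is an arbitrary example.  */

static void __attribute__((unused))
example_query_properties (void)
{
  size_t mem = acc_get_property (0, acc_device_nvidia, acc_property_memory);
  const char *name
    = acc_get_property_string (0, acc_device_nvidia, acc_property_name);
  GOMP_PLUGIN_debug (0, "device 0: %s, %zu bytes\n", name, mem);
}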

/* Adjust launch dimensions: pick good values for the number of blocks and
   warps, and ensure that the number of warps does not exceed CUDA limits or
   GCC's own limits.  */

static void
nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
                            struct ptx_device *ptx_dev,
                            int *teams_p, int *threads_p)
{
  int max_warps_block = fn->max_threads_per_block / 32;
  /* A maximum of 32 warps per block is an implementation limit in the NVPTX
     backend and libgcc, matching the documented limit of all GPUs as of
     2015.  */
  if (max_warps_block > 32)
    max_warps_block = 32;
  if (*threads_p <= 0)
    *threads_p = 8;
  if (*threads_p > max_warps_block)
    *threads_p = max_warps_block;

  int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
  /* This is an estimate of how many blocks the device can host
     simultaneously.  The actual limit, which may be lower, can be queried
     via the "occupancy control" driver interface (available since
     CUDA 6.0).  */
  int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
  if (*teams_p <= 0 || *teams_p > max_blocks)
    *teams_p = max_blocks;
}
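
/* Worked example of the estimate above, with hypothetical numbers: a kernel
   using 32 registers per thread, launched with 8 warps (256 threads), needs
   32 * 32 * 8 = 8192 registers per block.  On a device with 65536 registers
   per SM and 80 SMs, that allows 65536 / 8192 * 80 = 640 resident blocks,
   which becomes the default (or the cap) for *TEAMS_P.  */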

/* Return the size of per-warp stacks (see gcc -msoft-stack) to use for
   OpenMP target regions.  */

static size_t
nvptx_stacks_size (void)
{
  return 128 * 1024;
}

/* Return contiguous storage for NUM stacks, each of SIZE bytes.  The lock
   for the storage must be held on entry, and remains held on exit.  */

static void *
nvptx_stacks_acquire (struct ptx_device *ptx_dev, size_t size, int num)
{
  if (ptx_dev->omp_stacks.ptr && ptx_dev->omp_stacks.size >= size * num)
    return (void *) ptx_dev->omp_stacks.ptr;

  /* Free the old, too-small stacks.  */
  if (ptx_dev->omp_stacks.ptr)
    {
      CUresult r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s\n", cuda_error (r));
      r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
    }

  /* Make new and bigger stacks, and remember where we put them and how big
     they are.  */
  CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &ptx_dev->omp_stacks.ptr,
                                  size * num);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));

  ptx_dev->omp_stacks.size = size * num;

  return (void *) ptx_dev->omp_stacks.ptr;
}

void
GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
{
  struct targ_fn_descriptor *tgt_fn_desc
    = (struct targ_fn_descriptor *) tgt_fn;
  CUfunction function = tgt_fn_desc->fn;
  const struct targ_fn_launch *launch = tgt_fn_desc->launch;
  const char *fn_name = launch->fn;
  CUresult r;
  struct ptx_device *ptx_dev = ptx_devices[ord];
  const char *maybe_abort_msg = "(perhaps abort was called)";
  int teams = 0, threads = 0;

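  /* The ARGS array is a NULL-terminated list of tagged values (see
     gomp-constants.h).  For illustration, a hypothetical encoding of
     "num_teams 16, thread_limit 4" for all devices, with both values small
     enough to be stored inline above GOMP_TARGET_ARG_VALUE_SHIFT, would be:

       void *args[] = {
         (void *) ((16 << GOMP_TARGET_ARG_VALUE_SHIFT)
                   | GOMP_TARGET_ARG_NUM_TEAMS),
         (void *) ((4 << GOMP_TARGET_ARG_VALUE_SHIFT)
                   | GOMP_TARGET_ARG_THREAD_LIMIT),
         NULL
       };

     Values too large to store inline are passed via
     GOMP_TARGET_ARG_SUBSEQUENT_PARAM in the next array slot, as decoded
     below.  */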
  if (!args)
    GOMP_PLUGIN_fatal ("No target arguments provided");
  while (*args)
    {
      intptr_t id = (intptr_t) *args++, val;
      if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
        val = (intptr_t) *args++;
      else
        val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
      if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
        continue;
      val = val > INT_MAX ? INT_MAX : val;
      id &= GOMP_TARGET_ARG_ID_MASK;
      if (id == GOMP_TARGET_ARG_NUM_TEAMS)
        teams = val;
      else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
        threads = val;
    }
  nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);

  size_t stack_size = nvptx_stacks_size ();

  pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
  void *stacks = nvptx_stacks_acquire (ptx_dev, stack_size, teams * threads);
  void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
  size_t fn_args_size = sizeof fn_args;
  void *config[] = {
    CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
    CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
    CU_LAUNCH_PARAM_END
  };
  GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
                     " [(teams: %u), 1, 1] [(lanes: 32), (threads: %u), 1]\n",
                     __FUNCTION__, fn_name, teams, threads);
  r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
                         32, threads, 1, 0, NULL, NULL, config);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));

  r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
                       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));

  pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
}
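
/* Illustrative sketch, not part of the plugin: the CU_LAUNCH_PARAM_* "extra"
   mechanism above packs all kernel arguments into a single blob.  An
   equivalent launch via the more common kernelParams array, assuming the
   same three formal parameters, would look like this.  */

static void __attribute__((unused))
example_launch_with_kernel_params (CUfunction function, int teams, int threads,
                                   void *tgt_vars, void *stacks,
                                   size_t stack_size)
{
  /* Each entry points at the storage of one kernel argument.  */
  void *params[] = {&tgt_vars, &stacks, &stack_size};
  CUDA_CALL_ASSERT (cuLaunchKernel, function, teams, 1, 1, 32, threads, 1,
                    0, NULL, params, NULL);
}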

/* TODO: Implement GOMP_OFFLOAD_async_run.  */