/* Plugin for NVPTX execution.

   Copyright (C) 2013-2020 Free Software Foundation, Inc.

   Contributed by Mentor Embedded.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Nvidia PTX-specific parts of OpenACC support.  The cuda driver
   library appears to hold some implicit state, but the documentation
   is not clear as to what that state might be.  Or how one might
   propagate it from one thread to another.  */

#define _GNU_SOURCE
#include "openacc.h"
#include "config.h"
#include "libgomp-plugin.h"
#include "oacc-plugin.h"
#include "gomp-constants.h"
#include "oacc-int.h"

#include <pthread.h>
#include <cuda.h>
#include <stdbool.h>
#include <limits.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include <assert.h>
#include <errno.h>

#if CUDA_VERSION < 6000
extern CUresult cuGetErrorString (CUresult, const char **);
#define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
#endif

#if CUDA_VERSION >= 6050
#undef cuLinkCreate
#undef cuLinkAddData
CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
			const char *, unsigned, CUjit_option *, void **);
CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
#else
typedef size_t (*CUoccupancyB2DSize)(int);
CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
			   const char *, unsigned, CUjit_option *, void **);
CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
					  CUoccupancyB2DSize, size_t, int);
#endif

#define DO_PRAGMA(x) _Pragma (#x)

#if PLUGIN_NVPTX_DYNAMIC
# include <dlfcn.h>

struct cuda_lib_s {

# define CUDA_ONE_CALL(call)			\
  __typeof (call) *call;
# define CUDA_ONE_CALL_MAYBE_NULL(call)		\
  CUDA_ONE_CALL (call)
#include "cuda-lib.def"
# undef CUDA_ONE_CALL
# undef CUDA_ONE_CALL_MAYBE_NULL

} cuda_lib;

/* -1 if init_cuda_lib has not been called yet, false
   if it has been and failed, true if it has been and succeeded.  */
static signed char cuda_lib_inited = -1;

/* Dynamically load the CUDA runtime library and initialize function
   pointers, return false if unsuccessful, true if successful.  */
static bool
init_cuda_lib (void)
{
  if (cuda_lib_inited != -1)
    return cuda_lib_inited;
  const char *cuda_runtime_lib = "libcuda.so.1";
  void *h = dlopen (cuda_runtime_lib, RTLD_LAZY);
  cuda_lib_inited = false;
  if (h == NULL)
    return false;

# define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false)
# define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true)
# define CUDA_ONE_CALL_1(call, allow_null)	\
  cuda_lib.call = dlsym (h, #call);		\
  if (!allow_null && cuda_lib.call == NULL)	\
    return false;
#include "cuda-lib.def"
# undef CUDA_ONE_CALL
# undef CUDA_ONE_CALL_1
# undef CUDA_ONE_CALL_MAYBE_NULL

  cuda_lib_inited = true;
  return true;
}
# define CUDA_CALL_PREFIX cuda_lib.
#else

# define CUDA_ONE_CALL(call)
# define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call)
#include "cuda-lib.def"
#undef CUDA_ONE_CALL_MAYBE_NULL
#undef CUDA_ONE_CALL

# define CUDA_CALL_PREFIX
# define init_cuda_lib() true
#endif
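
/* Illustration (a sketch, not compiled here): for a cuda-lib.def entry such
   as 'CUDA_ONE_CALL (cuMemAlloc)', the dynamic path above declares a member
   '__typeof (cuMemAlloc) *cuMemAlloc;' in cuda_lib, and init_cuda_lib
   resolves it with dlsym, so CUDA_CALL_PREFIX turns 'cuMemAlloc (...)' into
   'cuda_lib.cuMemAlloc (...)'.  In the static path, CUDA_ONE_CALL_MAYBE_NULL
   entries instead become weak symbols whose presence can later be tested
   with CUDA_CALL_EXISTS.  */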

#include "secure_getenv.h"

#undef MIN
#undef MAX
#define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
#define MAX(X,Y) ((X) > (Y) ? (X) : (Y))

/* Convenience macros for the frequently used CUDA library call and
   error handling sequence as well as CUDA library calls that
   do the error checking themselves or don't do it at all.  */

#define CUDA_CALL_ERET(ERET, FN, ...)		\
  do {						\
    unsigned __r				\
      = CUDA_CALL_PREFIX FN (__VA_ARGS__);	\
    if (__r != CUDA_SUCCESS)			\
      {						\
	GOMP_PLUGIN_error (#FN " error: %s",	\
			   cuda_error (__r));	\
	return ERET;				\
      }						\
  } while (0)

#define CUDA_CALL(FN, ...)			\
  CUDA_CALL_ERET (false, FN, __VA_ARGS__)

#define CUDA_CALL_ASSERT(FN, ...)		\
  do {						\
    unsigned __r				\
      = CUDA_CALL_PREFIX FN (__VA_ARGS__);	\
    if (__r != CUDA_SUCCESS)			\
      {						\
	GOMP_PLUGIN_fatal (#FN " error: %s",	\
			   cuda_error (__r));	\
      }						\
  } while (0)

#define CUDA_CALL_NOCHECK(FN, ...)		\
  CUDA_CALL_PREFIX FN (__VA_ARGS__)

#define CUDA_CALL_EXISTS(FN)			\
  CUDA_CALL_PREFIX FN
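
/* Usage sketch for the macros above (a hypothetical helper, not part of
   the plugin): in a function returning 'bool',

     static bool
     example_push_context (CUcontext ctx)
     {
       CUDA_CALL (cuCtxPushCurrent, ctx);  // returns false on error
       return true;
     }

   CUDA_CALL_ERET generalizes the value returned on error, CUDA_CALL_ASSERT
   calls GOMP_PLUGIN_fatal instead of returning, and CUDA_CALL_NOCHECK
   leaves handling of the CUresult entirely to the caller.  */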

static const char *
cuda_error (CUresult r)
{
  const char *fallback = "unknown cuda error";
  const char *desc;

  if (!CUDA_CALL_EXISTS (cuGetErrorString))
    return fallback;

  r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
  if (r == CUDA_SUCCESS)
    return desc;

  return fallback;
}

/* Version of the CUDA Toolkit in the same MAJOR.MINOR format that is used by
   Nvidia, such as in the 'deviceQuery' program (Nvidia's CUDA samples).  */
static char cuda_driver_version_s[30];

static unsigned int instantiated_devices = 0;
static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;

/* NVPTX/CUDA specific definition of asynchronous queues.  */
struct goacc_asyncqueue
{
  CUstream cuda_stream;
};

struct nvptx_callback
{
  void (*fn) (void *);
  void *ptr;
  struct goacc_asyncqueue *aq;
  struct nvptx_callback *next;
};

/* Thread-specific data for PTX.  */

struct nvptx_thread
{
  /* We currently have this embedded inside the plugin because libgomp manages
     devices through integer target_ids.  This might be better if using an
     opaque target-specific pointer directly from gomp_device_descr.  */
  struct ptx_device *ptx_dev;
};

/* Target data function launch information.  */

struct targ_fn_launch
{
  const char *fn;
  unsigned short dim[GOMP_DIM_MAX];
};

/* Target PTX object information.  */

struct targ_ptx_obj
{
  const char *code;
  size_t size;
};

/* Target data image information.  */

typedef struct nvptx_tdata
{
  const struct targ_ptx_obj *ptx_objs;
  unsigned ptx_num;

  const char *const *var_names;
  unsigned var_num;

  const struct targ_fn_launch *fn_descs;
  unsigned fn_num;
} nvptx_tdata_t;

/* Descriptor of a loaded function.  */

struct targ_fn_descriptor
{
  CUfunction fn;
  const struct targ_fn_launch *launch;
  int regs_per_thread;
  int max_threads_per_block;
};

/* A loaded PTX image.  */
struct ptx_image_data
{
  const void *target_data;
  CUmodule module;

  struct targ_fn_descriptor *fns;  /* Array of functions.  */

  struct ptx_image_data *next;
};

struct ptx_free_block
{
  void *ptr;
  struct ptx_free_block *next;
};

struct ptx_device
{
  CUcontext ctx;
  bool ctx_shared;
  CUdevice dev;

  int ord;
  bool overlap;
  bool map;
  bool concur;
  bool mkern;
  int mode;
  int clock_khz;
  int num_sms;
  int regs_per_block;
  int regs_per_sm;
  int warp_size;
  int max_threads_per_block;
  int max_threads_per_multiprocessor;
  int default_dims[GOMP_DIM_MAX];

  /* Length as used by the CUDA Runtime API ('struct cudaDeviceProp').  */
  char name[256];

  struct ptx_image_data *images;  /* Images loaded on device.  */
  pthread_mutex_t image_lock;     /* Lock for above list.  */

  struct ptx_free_block *free_blocks;
  pthread_mutex_t free_blocks_lock;

  struct ptx_device *next;
};

static struct ptx_device **ptx_devices;

static inline struct nvptx_thread *
nvptx_thread (void)
{
  return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
}

/* Initialize the device.  Return TRUE on success, else FALSE.  PTX_DEV_LOCK
   should be locked on entry and remains locked on exit.  */

static bool
nvptx_init (void)
{
  int ndevs;

  if (instantiated_devices != 0)
    return true;

  if (!init_cuda_lib ())
    return false;

  CUDA_CALL (cuInit, 0);

  int cuda_driver_version;
  CUDA_CALL_ERET (NULL, cuDriverGetVersion, &cuda_driver_version);
  snprintf (cuda_driver_version_s, sizeof cuda_driver_version_s,
	    "CUDA Driver %u.%u",
	    cuda_driver_version / 1000, cuda_driver_version % 1000 / 10);

  CUDA_CALL (cuDeviceGetCount, &ndevs);
  ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
					    * ndevs);

  return true;
}

/* Select the N'th PTX device for the current host thread.  The device must
   have been previously opened before calling this function.  */

static bool
nvptx_attach_host_thread_to_device (int n)
{
  CUdevice dev;
  CUresult r;
  struct ptx_device *ptx_dev;
  CUcontext thd_ctx;

  r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
  if (r == CUDA_ERROR_NOT_PERMITTED)
    {
      /* Assume we're in a CUDA callback, just return true.  */
      return true;
    }
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    {
      GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
      return false;
    }

  if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
    return true;
  else
    {
      CUcontext old_ctx;

      ptx_dev = ptx_devices[n];
      if (!ptx_dev)
	{
	  GOMP_PLUGIN_error ("device %d not found", n);
	  return false;
	}

      CUDA_CALL (cuCtxGetCurrent, &thd_ctx);

      /* We don't necessarily have a current context (e.g. if it has been
	 destroyed).  Pop it if we do though.  */
      if (thd_ctx != NULL)
	CUDA_CALL (cuCtxPopCurrent, &old_ctx);

      CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
    }
  return true;
}

static struct ptx_device *
nvptx_open_device (int n)
{
  struct ptx_device *ptx_dev;
  CUdevice dev, ctx_dev;
  CUresult r;
  int async_engines, pi;

  CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);

  ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));

  ptx_dev->ord = n;
  ptx_dev->dev = dev;
  ptx_dev->ctx_shared = false;

  r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    {
      GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
      return NULL;
    }

  if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
    {
      /* The current host thread has an active context for a different device.
	 Detach it.  */
      CUcontext old_ctx;
      CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
    }

  CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);

  if (!ptx_dev->ctx)
    CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
  else
    ptx_dev->ctx_shared = true;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
  ptx_dev->overlap = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
  ptx_dev->map = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
  ptx_dev->concur = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
  ptx_dev->mode = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
  ptx_dev->mkern = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
  ptx_dev->clock_khz = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
  ptx_dev->num_sms = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
  ptx_dev->regs_per_block = pi;

  /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only
     in CUDA 6.0 and newer.  */
  r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
			 CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
			 dev);
  /* Fallback: use limit of registers per block, which is usually equal.  */
  if (r == CUDA_ERROR_INVALID_VALUE)
    pi = ptx_dev->regs_per_block;
  else if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
      return NULL;
    }
  ptx_dev->regs_per_sm = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
  if (pi != 32)
    {
      GOMP_PLUGIN_error ("Only warp size 32 is supported");
      return NULL;
    }
  ptx_dev->warp_size = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
		  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
  ptx_dev->max_threads_per_block = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
		  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
  ptx_dev->max_threads_per_multiprocessor = pi;

  r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines,
			 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
  if (r != CUDA_SUCCESS)
    async_engines = 1;

  for (int i = 0; i != GOMP_DIM_MAX; i++)
    ptx_dev->default_dims[i] = 0;

  CUDA_CALL_ERET (NULL, cuDeviceGetName, ptx_dev->name, sizeof ptx_dev->name,
		  dev);

  ptx_dev->images = NULL;
  pthread_mutex_init (&ptx_dev->image_lock, NULL);

  ptx_dev->free_blocks = NULL;
  pthread_mutex_init (&ptx_dev->free_blocks_lock, NULL);

  return ptx_dev;
}

static bool
nvptx_close_device (struct ptx_device *ptx_dev)
{
  if (!ptx_dev)
    return true;

  for (struct ptx_free_block *b = ptx_dev->free_blocks; b;)
    {
      struct ptx_free_block *b_next = b->next;
      CUDA_CALL (cuMemFree, (CUdeviceptr) b->ptr);
      free (b);
      b = b_next;
    }

  pthread_mutex_destroy (&ptx_dev->free_blocks_lock);
  pthread_mutex_destroy (&ptx_dev->image_lock);

  if (!ptx_dev->ctx_shared)
    CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);

  free (ptx_dev);
  return true;
}

static int
nvptx_get_num_devices (void)
{
  int n;

  /* PR libgomp/65099: Currently, we only support offloading in 64-bit
     configurations.  */
  if (sizeof (void *) != 8)
    {
      GOMP_PLUGIN_debug (0, "Disabling nvptx offloading;"
			 " only 64-bit configurations are supported\n");
      return 0;
    }

  /* This function will be called before the plugin has been initialized in
     order to enumerate available devices, but CUDA API routines can't be used
     until cuInit has been called.  Just call it now (but don't yet do any
     further initialization).  */
  if (instantiated_devices == 0)
    {
      if (!init_cuda_lib ())
	return 0;
      CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
      /* This is not an error: e.g. we may have CUDA libraries installed but
	 no devices available.  */
      if (r != CUDA_SUCCESS)
	{
	  GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
			     cuda_error (r));
	  return 0;
	}
    }

  CUDA_CALL_ERET (-1, cuDeviceGetCount, &n);
  return n;
}

static void
notify_var (const char *var_name, const char *env_var)
{
  if (env_var == NULL)
    GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name);
  else
    GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var);
}

static void
process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o)
{
  const char *var_name = "GOMP_NVPTX_JIT";
  const char *env_var = secure_getenv (var_name);
  notify_var (var_name, env_var);

  if (env_var == NULL)
    return;

  const char *c = env_var;
  while (*c != '\0')
    {
      while (*c == ' ')
	c++;

      if (c[0] == '-' && c[1] == 'O'
	  && '0' <= c[2] && c[2] <= '4'
	  && (c[3] == '\0' || c[3] == ' '))
	{
	  *gomp_nvptx_o = c[2] - '0';
	  c += 3;
	  continue;
	}

      GOMP_PLUGIN_error ("Error parsing %s", var_name);
      break;
    }
}

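/* For example, running an offloaded program with 'GOMP_NVPTX_JIT=-O2' in
   the environment makes link_ptx below pass CU_JIT_OPTIMIZATION_LEVEL 2 to
   the CUDA JIT; only the values -O0 through -O4, separated by spaces, are
   recognized.  */
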
static bool
link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
	  unsigned num_objs)
{
  CUjit_option opts[7];
  void *optvals[7];
  float elapsed = 0.0;
  char elog[1024];
  char ilog[16384];
  CUlinkState linkstate;
  CUresult r;
  void *linkout;
  size_t linkoutsize __attribute__ ((unused));

  opts[0] = CU_JIT_WALL_TIME;
  optvals[0] = &elapsed;

  opts[1] = CU_JIT_INFO_LOG_BUFFER;
  optvals[1] = &ilog[0];

  opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
  optvals[2] = (void *) sizeof ilog;

  opts[3] = CU_JIT_ERROR_LOG_BUFFER;
  optvals[3] = &elog[0];

  opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
  optvals[4] = (void *) sizeof elog;

  opts[5] = CU_JIT_LOG_VERBOSE;
  optvals[5] = (void *) 1;

  static intptr_t gomp_nvptx_o = -1;

  static bool init_done = false;
  if (!init_done)
    {
      process_GOMP_NVPTX_JIT (&gomp_nvptx_o);
      init_done = true;
    }

  int nopts = 6;
  if (gomp_nvptx_o != -1)
    {
      opts[nopts] = CU_JIT_OPTIMIZATION_LEVEL;
      optvals[nopts] = (void *) gomp_nvptx_o;
      nopts++;
    }

  if (CUDA_CALL_EXISTS (cuLinkCreate_v2))
    CUDA_CALL (cuLinkCreate_v2, nopts, opts, optvals, &linkstate);
  else
    CUDA_CALL (cuLinkCreate, nopts, opts, optvals, &linkstate);

  for (; num_objs--; ptx_objs++)
    {
      /* cuLinkAddData's 'data' argument erroneously omits the const
	 qualifier.  */
      GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
      if (CUDA_CALL_EXISTS (cuLinkAddData_v2))
	r = CUDA_CALL_NOCHECK (cuLinkAddData_v2, linkstate, CU_JIT_INPUT_PTX,
			       (char *) ptx_objs->code, ptx_objs->size,
			       0, 0, 0, 0);
      else
	r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
			       (char *) ptx_objs->code, ptx_objs->size,
			       0, 0, 0, 0);
      if (r != CUDA_SUCCESS)
	{
	  GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
	  GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
			     cuda_error (r));
	  return false;
	}
    }

  GOMP_PLUGIN_debug (0, "Linking\n");
  r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);

  GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
  GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);

  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
      return false;
    }

  CUDA_CALL (cuModuleLoadData, module, linkout);
  CUDA_CALL (cuLinkDestroy, linkstate);
  return true;
}

static void
nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
	    unsigned *dims, void *targ_mem_desc,
	    CUdeviceptr dp, CUstream stream)
{
  struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
  CUfunction function;
  int i;
  void *kargs[1];
  struct nvptx_thread *nvthd = nvptx_thread ();
  int warp_size = nvthd->ptx_dev->warp_size;

  function = targ_fn->fn;

  /* Initialize the launch dimensions.  Typically this is constant,
     provided by the device compiler, but we must permit runtime
     values.  */
  int seen_zero = 0;
  for (i = 0; i != GOMP_DIM_MAX; i++)
    {
      if (targ_fn->launch->dim[i])
	dims[i] = targ_fn->launch->dim[i];
      if (!dims[i])
	seen_zero = 1;
    }

  if (seen_zero)
    {
      pthread_mutex_lock (&ptx_dev_lock);

      static int gomp_openacc_dims[GOMP_DIM_MAX];
      if (!gomp_openacc_dims[0])
	{
	  /* See if the user provided GOMP_OPENACC_DIM environment
	     variable to specify runtime defaults.  */
	  for (int i = 0; i < GOMP_DIM_MAX; ++i)
	    gomp_openacc_dims[i] = GOMP_PLUGIN_acc_default_dim (i);
	}

      if (!nvthd->ptx_dev->default_dims[0])
	{
	  int default_dims[GOMP_DIM_MAX];
	  for (int i = 0; i < GOMP_DIM_MAX; ++i)
	    default_dims[i] = gomp_openacc_dims[i];

	  int gang, worker, vector;
	  {
	    int block_size = nvthd->ptx_dev->max_threads_per_block;
	    int cpu_size = nvthd->ptx_dev->max_threads_per_multiprocessor;
	    int dev_size = nvthd->ptx_dev->num_sms;
	    GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
			       " dev_size=%d, cpu_size=%d\n",
			       warp_size, block_size, dev_size, cpu_size);

	    gang = (cpu_size / block_size) * dev_size;
	    worker = block_size / warp_size;
	    vector = warp_size;
	  }

	  /* There is no upper bound on the gang size.  The best size
	     matches the hardware configuration.  Logical gangs are
	     scheduled onto physical hardware.  To maximize usage, we
	     should guess a large number.  */
	  if (default_dims[GOMP_DIM_GANG] < 1)
	    default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
	  /* The worker size must not exceed the hardware.  */
	  if (default_dims[GOMP_DIM_WORKER] < 1
	      || (default_dims[GOMP_DIM_WORKER] > worker && gang))
	    default_dims[GOMP_DIM_WORKER] = worker;
	  /* The vector size must exactly match the hardware.  */
	  if (default_dims[GOMP_DIM_VECTOR] < 1
	      || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
	    default_dims[GOMP_DIM_VECTOR] = vector;

	  GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
			     default_dims[GOMP_DIM_GANG],
			     default_dims[GOMP_DIM_WORKER],
			     default_dims[GOMP_DIM_VECTOR]);

	  for (i = 0; i != GOMP_DIM_MAX; i++)
	    nvthd->ptx_dev->default_dims[i] = default_dims[i];
	}
      pthread_mutex_unlock (&ptx_dev_lock);

      {
	bool default_dim_p[GOMP_DIM_MAX];
	for (i = 0; i != GOMP_DIM_MAX; i++)
	  default_dim_p[i] = !dims[i];

	if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize))
	  {
	    for (i = 0; i != GOMP_DIM_MAX; i++)
	      if (default_dim_p[i])
		dims[i] = nvthd->ptx_dev->default_dims[i];

	    if (default_dim_p[GOMP_DIM_VECTOR])
	      dims[GOMP_DIM_VECTOR]
		= MIN (dims[GOMP_DIM_VECTOR],
		       (targ_fn->max_threads_per_block / warp_size
			* warp_size));

	    if (default_dim_p[GOMP_DIM_WORKER])
	      dims[GOMP_DIM_WORKER]
		= MIN (dims[GOMP_DIM_WORKER],
		       targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
	  }
	else
	  {
	    /* Handle the case that the compiler allows the runtime to choose
	       the vector-length conservatively, by ignoring
	       gomp_openacc_dims[GOMP_DIM_VECTOR].  TODO: actually handle
	       it.  */
	    int vectors = 0;
	    /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that
	       gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
	       exceed targ_fn->max_threads_per_block.  */
	    int workers = gomp_openacc_dims[GOMP_DIM_WORKER];
	    int gangs = gomp_openacc_dims[GOMP_DIM_GANG];
	    int grids, blocks;

	    CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
			      &blocks, function, NULL, 0,
			      dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
	    GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
			       "grid = %d, block = %d\n", grids, blocks);

	    /* Keep the num_gangs proportional to the block size.  In
	       the case where a block size is limited by shared-memory
	       or the register file capacity, the runtime will not
	       excessively over assign gangs to the multiprocessor
	       units if their state is going to be swapped out even
	       more than necessary.  The constant factor 2 is there to
	       prevent threads from idling when there is insufficient
	       work for them.  */
	    if (gangs == 0)
	      gangs = 2 * grids * (blocks / warp_size);

	    if (vectors == 0)
	      vectors = warp_size;

	    if (workers == 0)
	      {
		int actual_vectors = (default_dim_p[GOMP_DIM_VECTOR]
				      ? vectors
				      : dims[GOMP_DIM_VECTOR]);
		workers = blocks / actual_vectors;
		workers = MAX (workers, 1);
		/* If we need a per-worker barrier ... .  */
		if (actual_vectors > 32)
		  /* Don't use more barriers than available.  */
		  workers = MIN (workers, 15);
	      }

	    for (i = 0; i != GOMP_DIM_MAX; i++)
	      if (default_dim_p[i])
		switch (i)
		  {
		  case GOMP_DIM_GANG: dims[i] = gangs; break;
		  case GOMP_DIM_WORKER: dims[i] = workers; break;
		  case GOMP_DIM_VECTOR: dims[i] = vectors; break;
		  default: GOMP_PLUGIN_fatal ("invalid dim");
		  }
	  }
      }
    }

  /* Check if the accelerator has sufficient hardware resources to
     launch the offloaded kernel.  */
  if (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]
      > targ_fn->max_threads_per_block)
    {
      const char *msg
	= ("The Nvidia accelerator has insufficient resources to launch '%s'"
	   " with num_workers = %d and vector_length = %d"
	   "; "
	   "recompile the program with 'num_workers = x and vector_length = y'"
	   " on that offloaded region or '-fopenacc-dim=:x:y' where"
	   " x * y <= %d"
	   ".\n");
      GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
			 dims[GOMP_DIM_VECTOR], targ_fn->max_threads_per_block);
    }

  /* Check if the accelerator has sufficient barrier resources to
     launch the offloaded kernel.  */
  if (dims[GOMP_DIM_WORKER] > 15 && dims[GOMP_DIM_VECTOR] > 32)
    {
      const char *msg
	= ("The Nvidia accelerator has insufficient barrier resources to launch"
	   " '%s' with num_workers = %d and vector_length = %d"
	   "; "
	   "recompile the program with 'num_workers = x' on that offloaded"
	   " region or '-fopenacc-dim=:x:' where x <= 15"
	   "; "
	   "or, recompile the program with 'vector_length = 32' on that"
	   " offloaded region or '-fopenacc-dim=::32'"
	   ".\n");
      GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
			 dims[GOMP_DIM_VECTOR]);
    }

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
		     " gangs=%u, workers=%u, vectors=%u\n",
		     __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
		     dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);

  // OpenACC		CUDA
  //
  // num_gangs		nctaid.x
  // num_workers	ntid.y
  // vector length	ntid.x

  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  acc_prof_info *prof_info = thr->prof_info;
  acc_event_info enqueue_launch_event_info;
  acc_api_info *api_info = thr->api_info;
  bool profiling_p = __builtin_expect (prof_info != NULL, false);
  if (profiling_p)
    {
      prof_info->event_type = acc_ev_enqueue_launch_start;

      enqueue_launch_event_info.launch_event.event_type
	= prof_info->event_type;
      enqueue_launch_event_info.launch_event.valid_bytes
	= _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
      enqueue_launch_event_info.launch_event.parent_construct
	= acc_construct_parallel;
      enqueue_launch_event_info.launch_event.implicit = 1;
      enqueue_launch_event_info.launch_event.tool_info = NULL;
      enqueue_launch_event_info.launch_event.kernel_name = targ_fn->launch->fn;
      enqueue_launch_event_info.launch_event.num_gangs
	= dims[GOMP_DIM_GANG];
      enqueue_launch_event_info.launch_event.num_workers
	= dims[GOMP_DIM_WORKER];
      enqueue_launch_event_info.launch_event.vector_length
	= dims[GOMP_DIM_VECTOR];

      api_info->device_api = acc_device_api_cuda;

      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
					    api_info);
    }

  kargs[0] = &dp;
  CUDA_CALL_ASSERT (cuLaunchKernel, function,
		    dims[GOMP_DIM_GANG], 1, 1,
		    dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
		    0, stream, kargs, 0);

  if (profiling_p)
    {
      prof_info->event_type = acc_ev_enqueue_launch_end;
      enqueue_launch_event_info.launch_event.event_type
	= prof_info->event_type;
      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
					    api_info);
    }

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
		     targ_fn->launch->fn);
}

void * openacc_get_current_cuda_context (void);

static void
goacc_profiling_acc_ev_alloc (struct goacc_thread *thr, void *dp, size_t s)
{
  acc_prof_info *prof_info = thr->prof_info;
  acc_event_info data_event_info;
  acc_api_info *api_info = thr->api_info;

  prof_info->event_type = acc_ev_alloc;

  data_event_info.data_event.event_type = prof_info->event_type;
  data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
  data_event_info.data_event.parent_construct = acc_construct_parallel;
  data_event_info.data_event.implicit = 1;
  data_event_info.data_event.tool_info = NULL;
  data_event_info.data_event.var_name = NULL;
  data_event_info.data_event.bytes = s;
  data_event_info.data_event.host_ptr = NULL;
  data_event_info.data_event.device_ptr = dp;

  api_info->device_api = acc_device_api_cuda;

  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
}

static void *
nvptx_alloc (size_t s)
{
  CUdeviceptr d;

  CUDA_CALL_ERET (NULL, cuMemAlloc, &d, s);
  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  bool profiling_p
    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
  if (profiling_p)
    goacc_profiling_acc_ev_alloc (thr, (void *) d, s);

  return (void *) d;
}

static void
goacc_profiling_acc_ev_free (struct goacc_thread *thr, void *p)
{
  acc_prof_info *prof_info = thr->prof_info;
  acc_event_info data_event_info;
  acc_api_info *api_info = thr->api_info;

  prof_info->event_type = acc_ev_free;

  data_event_info.data_event.event_type = prof_info->event_type;
  data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
  data_event_info.data_event.parent_construct = acc_construct_parallel;
  data_event_info.data_event.implicit = 1;
  data_event_info.data_event.tool_info = NULL;
  data_event_info.data_event.var_name = NULL;
  data_event_info.data_event.bytes = -1;
  data_event_info.data_event.host_ptr = NULL;
  data_event_info.data_event.device_ptr = p;

  api_info->device_api = acc_device_api_cuda;

  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
}

static bool
nvptx_free (void *p, struct ptx_device *ptx_dev)
{
  /* Assume callback context if this is null.  */
  if (GOMP_PLUGIN_acc_thread () == NULL)
    {
      struct ptx_free_block *n
	= GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block));
      n->ptr = p;
      pthread_mutex_lock (&ptx_dev->free_blocks_lock);
      n->next = ptx_dev->free_blocks;
      ptx_dev->free_blocks = n;
      pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
      return true;
    }

  CUdeviceptr pb;
  size_t ps;

  CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) p);
  if ((CUdeviceptr) p != pb)
    {
      GOMP_PLUGIN_error ("invalid device address");
      return false;
    }

  CUDA_CALL (cuMemFree, (CUdeviceptr) p);
  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  bool profiling_p
    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
  if (profiling_p)
    goacc_profiling_acc_ev_free (thr, p);

  return true;
}

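/* Note on the deferred-free path above: CUDA forbids making driver-API
   calls such as cuMemFree from within a stream callback, which is why
   nvptx_free merely queues the block on free_blocks when no accelerator
   thread is current; GOMP_OFFLOAD_alloc drains that list on the next
   allocation for the same device.  */
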
static void *
nvptx_get_current_cuda_device (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return &nvthd->ptx_dev->dev;
}

static void *
nvptx_get_current_cuda_context (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return nvthd->ptx_dev->ctx;
}

/* Plugin entry points.  */

const char *
GOMP_OFFLOAD_get_name (void)
{
  return "nvptx";
}

unsigned int
GOMP_OFFLOAD_get_caps (void)
{
  return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
}

int
GOMP_OFFLOAD_get_type (void)
{
  return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
}

int
GOMP_OFFLOAD_get_num_devices (void)
{
  return nvptx_get_num_devices ();
}

union gomp_device_property_value
GOMP_OFFLOAD_get_property (int n, int prop)
{
  union gomp_device_property_value propval = { .val = 0 };

  pthread_mutex_lock (&ptx_dev_lock);

  if (n >= nvptx_get_num_devices () || n < 0 || ptx_devices[n] == NULL)
    {
      pthread_mutex_unlock (&ptx_dev_lock);
      return propval;
    }

  struct ptx_device *ptx_dev = ptx_devices[n];
  switch (prop)
    {
    case GOMP_DEVICE_PROPERTY_MEMORY:
      {
	size_t total_mem;

	CUDA_CALL_ERET (propval, cuDeviceTotalMem, &total_mem, ptx_dev->dev);
	propval.val = total_mem;
      }
      break;
    case GOMP_DEVICE_PROPERTY_FREE_MEMORY:
      {
	size_t total_mem;
	size_t free_mem;
	CUdevice ctxdev;

	CUDA_CALL_ERET (propval, cuCtxGetDevice, &ctxdev);
	if (ptx_dev->dev == ctxdev)
	  CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
	else if (ptx_dev->ctx)
	  {
	    CUcontext old_ctx;

	    CUDA_CALL_ERET (propval, cuCtxPushCurrent, ptx_dev->ctx);
	    CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
	    CUDA_CALL_ASSERT (cuCtxPopCurrent, &old_ctx);
	  }
	else
	  {
	    CUcontext new_ctx;

	    CUDA_CALL_ERET (propval, cuCtxCreate, &new_ctx, CU_CTX_SCHED_AUTO,
			    ptx_dev->dev);
	    CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
	    CUDA_CALL_ASSERT (cuCtxDestroy, new_ctx);
	  }
	propval.val = free_mem;
      }
      break;
    case GOMP_DEVICE_PROPERTY_NAME:
      propval.ptr = ptx_dev->name;
      break;
    case GOMP_DEVICE_PROPERTY_VENDOR:
      propval.ptr = "Nvidia";
      break;
    case GOMP_DEVICE_PROPERTY_DRIVER:
      propval.ptr = cuda_driver_version_s;
      break;
    }

  pthread_mutex_unlock (&ptx_dev_lock);
  return propval;
}

bool
GOMP_OFFLOAD_init_device (int n)
{
  struct ptx_device *dev;

  pthread_mutex_lock (&ptx_dev_lock);

  if (!nvptx_init () || ptx_devices[n] != NULL)
    {
      pthread_mutex_unlock (&ptx_dev_lock);
      return false;
    }

  dev = nvptx_open_device (n);
  if (dev)
    {
      ptx_devices[n] = dev;
      instantiated_devices++;
    }

  pthread_mutex_unlock (&ptx_dev_lock);

  return dev != NULL;
}

bool
GOMP_OFFLOAD_fini_device (int n)
{
  pthread_mutex_lock (&ptx_dev_lock);

  if (ptx_devices[n] != NULL)
    {
      if (!nvptx_attach_host_thread_to_device (n)
	  || !nvptx_close_device (ptx_devices[n]))
	{
	  pthread_mutex_unlock (&ptx_dev_lock);
	  return false;
	}
      ptx_devices[n] = NULL;
      instantiated_devices--;
    }

  if (instantiated_devices == 0)
    {
      free (ptx_devices);
      ptx_devices = NULL;
    }

  pthread_mutex_unlock (&ptx_dev_lock);
  return true;
}

/* Return the libgomp version number we're compatible with.  There is
   no requirement for cross-version compatibility.  */

unsigned
GOMP_OFFLOAD_version (void)
{
  return GOMP_VERSION;
}

/* Initialize __nvptx_clocktick, if present in MODULE.  */

static void
nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
{
  CUdeviceptr dptr;
  CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
				  module, "__nvptx_clocktick");
  if (r == CUDA_ERROR_NOT_FOUND)
    return;
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
  double __nvptx_clocktick = 1e-3 / dev->clock_khz;
  r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
			 sizeof (__nvptx_clocktick));
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
}

/* Load the (partial) program described by TARGET_DATA to device
   number ORD.  Allocate and return TARGET_TABLE.  */

int
GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
			 struct addr_pair **target_table)
{
  CUmodule module;
  const char *const *var_names;
  const struct targ_fn_launch *fn_descs;
  unsigned int fn_entries, var_entries, i, j;
  struct targ_fn_descriptor *targ_fns;
  struct addr_pair *targ_tbl;
  const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
  struct ptx_image_data *new_image;
  struct ptx_device *dev;

  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
			 " (expected %u, received %u)",
			 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
      return -1;
    }

  if (!nvptx_attach_host_thread_to_device (ord)
      || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
    return -1;

  dev = ptx_devices[ord];

  /* The mkoffload utility emits a struct of pointers/integers at the
     start of each offload image.  The array of kernel names and the
     function addresses form a one-to-one correspondence.  */

  var_entries = img_header->var_num;
  var_names = img_header->var_names;
  fn_entries = img_header->fn_num;
  fn_descs = img_header->fn_descs;

  targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
				 * (fn_entries + var_entries));
  targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
				 * fn_entries);

  *target_table = targ_tbl;

  new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
  new_image->target_data = target_data;
  new_image->module = module;
  new_image->fns = targ_fns;

  pthread_mutex_lock (&dev->image_lock);
  new_image->next = dev->images;
  dev->images = new_image;
  pthread_mutex_unlock (&dev->image_lock);

  for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
    {
      CUfunction function;
      int nregs, mthrs;

      CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
		      fn_descs[i].fn);
      CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
		      CU_FUNC_ATTRIBUTE_NUM_REGS, function);
      CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
		      CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);

      targ_fns->fn = function;
      targ_fns->launch = &fn_descs[i];
      targ_fns->regs_per_thread = nregs;
      targ_fns->max_threads_per_block = mthrs;

      targ_tbl->start = (uintptr_t) targ_fns;
      targ_tbl->end = targ_tbl->start + 1;
    }

  for (j = 0; j < var_entries; j++, targ_tbl++)
    {
      CUdeviceptr var;
      size_t bytes;

      CUDA_CALL_ERET (-1, cuModuleGetGlobal,
		      &var, &bytes, module, var_names[j]);

      targ_tbl->start = (uintptr_t) var;
      targ_tbl->end = targ_tbl->start + bytes;
    }

  nvptx_set_clocktick (module, dev);

  return fn_entries + var_entries;
}

/* Unload the program described by TARGET_DATA.  DEV_DATA is the
   function descriptors allocated by G_O_load_image.  */

bool
GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
{
  struct ptx_image_data *image, **prev_p;
  struct ptx_device *dev = ptx_devices[ord];

  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
			 " (expected %u, received %u)",
			 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
      return false;
    }

  bool ret = true;
  pthread_mutex_lock (&dev->image_lock);
  for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
    if (image->target_data == target_data)
      {
	*prev_p = image->next;
	if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
	  ret = false;
	free (image->fns);
	free (image);
	break;
      }
  pthread_mutex_unlock (&dev->image_lock);
  return ret;
}

void *
GOMP_OFFLOAD_alloc (int ord, size_t size)
{
  if (!nvptx_attach_host_thread_to_device (ord))
    return NULL;

  struct ptx_device *ptx_dev = ptx_devices[ord];
  struct ptx_free_block *blocks, *tmp;

  pthread_mutex_lock (&ptx_dev->free_blocks_lock);
  blocks = ptx_dev->free_blocks;
  ptx_dev->free_blocks = NULL;
  pthread_mutex_unlock (&ptx_dev->free_blocks_lock);

  while (blocks)
    {
      tmp = blocks->next;
      nvptx_free (blocks->ptr, ptx_dev);
      free (blocks);
      blocks = tmp;
    }

  return nvptx_alloc (size);
}

bool
GOMP_OFFLOAD_free (int ord, void *ptr)
{
  return (nvptx_attach_host_thread_to_device (ord)
	  && nvptx_free (ptr, ptx_devices[ord]));
}

void
GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
			   void **hostaddrs, void **devaddrs,
			   unsigned *dims, void *targ_mem_desc)
{
  GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);

  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  acc_prof_info *prof_info = thr->prof_info;
  acc_event_info data_event_info;
  acc_api_info *api_info = thr->api_info;
  bool profiling_p = __builtin_expect (prof_info != NULL, false);

  void **hp = NULL;
  CUdeviceptr dp = 0;

  if (mapnum > 0)
    {
      size_t s = mapnum * sizeof (void *);
      hp = alloca (s);
      for (int i = 0; i < mapnum; i++)
	hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
      CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
      if (profiling_p)
	goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
    }

  /* Copy the (device) pointers to arguments to the device (dp and hp might in
     fact have the same value on a unified-memory system).  */
  if (mapnum > 0)
    {
      if (profiling_p)
	{
	  prof_info->event_type = acc_ev_enqueue_upload_start;

	  data_event_info.data_event.event_type = prof_info->event_type;
	  data_event_info.data_event.valid_bytes
	    = _ACC_DATA_EVENT_INFO_VALID_BYTES;
	  data_event_info.data_event.parent_construct
	    = acc_construct_parallel;
	  data_event_info.data_event.implicit = 1; /* Always implicit.  */
	  data_event_info.data_event.tool_info = NULL;
	  data_event_info.data_event.var_name = NULL;
	  data_event_info.data_event.bytes = mapnum * sizeof (void *);
	  data_event_info.data_event.host_ptr = hp;
	  data_event_info.data_event.device_ptr = (const void *) dp;

	  api_info->device_api = acc_device_api_cuda;

	  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
						api_info);
	}
      CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, (void *) hp,
			mapnum * sizeof (void *));
      if (profiling_p)
	{
	  prof_info->event_type = acc_ev_enqueue_upload_end;
	  data_event_info.data_event.event_type = prof_info->event_type;
	  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
						api_info);
	}
    }

  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
	      dp, NULL);

  CUresult r = CUDA_CALL_NOCHECK (cuStreamSynchronize, NULL);
  const char *maybe_abort_msg = "(perhaps abort was called)";
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
		       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));

  CUDA_CALL_ASSERT (cuMemFree, dp);
  if (profiling_p)
    goacc_profiling_acc_ev_free (thr, (void *) dp);
}

static void
cuda_free_argmem (void *ptr)
{
  void **block = (void **) ptr;
  nvptx_free (block[0], (struct ptx_device *) block[1]);
  free (block);
}

void
GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *), size_t mapnum,
				 void **hostaddrs, void **devaddrs,
				 unsigned *dims, void *targ_mem_desc,
				 struct goacc_asyncqueue *aq)
{
  GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);

  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  acc_prof_info *prof_info = thr->prof_info;
  acc_event_info data_event_info;
  acc_api_info *api_info = thr->api_info;
  bool profiling_p = __builtin_expect (prof_info != NULL, false);

  void **hp = NULL;
  CUdeviceptr dp = 0;
  void **block = NULL;

  if (mapnum > 0)
    {
      size_t s = mapnum * sizeof (void *);
      block = (void **) GOMP_PLUGIN_malloc (2 * sizeof (void *) + s);
      hp = block + 2;
      for (int i = 0; i < mapnum; i++)
	hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
      CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
      if (profiling_p)
	goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
    }

  /* Copy the (device) pointers to arguments to the device (dp and hp might in
     fact have the same value on a unified-memory system).  */
  if (mapnum > 0)
    {
      if (profiling_p)
	{
	  prof_info->event_type = acc_ev_enqueue_upload_start;

	  data_event_info.data_event.event_type = prof_info->event_type;
	  data_event_info.data_event.valid_bytes
	    = _ACC_DATA_EVENT_INFO_VALID_BYTES;
	  data_event_info.data_event.parent_construct
	    = acc_construct_parallel;
	  data_event_info.data_event.implicit = 1; /* Always implicit.  */
	  data_event_info.data_event.tool_info = NULL;
	  data_event_info.data_event.var_name = NULL;
	  data_event_info.data_event.bytes = mapnum * sizeof (void *);
	  data_event_info.data_event.host_ptr = hp;
	  data_event_info.data_event.device_ptr = (const void *) dp;

	  api_info->device_api = acc_device_api_cuda;

	  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
						api_info);
	}

      CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, dp, (void *) hp,
			mapnum * sizeof (void *), aq->cuda_stream);
      block[0] = (void *) dp;

      struct nvptx_thread *nvthd =
	(struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
      block[1] = (void *) nvthd->ptx_dev;

      if (profiling_p)
	{
	  prof_info->event_type = acc_ev_enqueue_upload_end;
	  data_event_info.data_event.event_type = prof_info->event_type;
	  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
						api_info);
	}
    }

  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
	      dp, aq->cuda_stream);

  if (mapnum > 0)
    GOMP_OFFLOAD_openacc_async_queue_callback (aq, cuda_free_argmem, block);
}

void *
GOMP_OFFLOAD_openacc_create_thread_data (int ord)
{
  struct ptx_device *ptx_dev;
  struct nvptx_thread *nvthd
    = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
  CUcontext thd_ctx;

  ptx_dev = ptx_devices[ord];

  assert (ptx_dev);

  CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);

  assert (ptx_dev->ctx);

  if (!thd_ctx)
    CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);

  nvthd->ptx_dev = ptx_dev;

  return (void *) nvthd;
}

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}

void *
GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
{
  return nvptx_get_current_cuda_device ();
}

void *
GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
{
  return nvptx_get_current_cuda_context ();
}

/* This returns a CUstream.  */
void *
GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *aq)
{
  return (void *) aq->cuda_stream;
}

/* This takes a CUstream.  */
int
GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream)
{
  if (aq->cuda_stream)
    {
      CUDA_CALL_ASSERT (cuStreamSynchronize, aq->cuda_stream);
      CUDA_CALL_ASSERT (cuStreamDestroy, aq->cuda_stream);
    }

  aq->cuda_stream = (CUstream) stream;
  return 1;
}

struct goacc_asyncqueue *
GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
{
  CUstream stream = NULL;
  CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, CU_STREAM_DEFAULT);

  struct goacc_asyncqueue *aq
    = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue));
  aq->cuda_stream = stream;
  return aq;
}

bool
GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
{
  CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream);
  free (aq);
  return true;
}

int
GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
{
  CUresult r = CUDA_CALL_NOCHECK (cuStreamQuery, aq->cuda_stream);
  if (r == CUDA_SUCCESS)
    return 1;
  if (r == CUDA_ERROR_NOT_READY)
    return 0;

  GOMP_PLUGIN_error ("cuStreamQuery error: %s", cuda_error (r));
  return -1;
}

bool
GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
{
  CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream);
  return true;
}

bool
GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
				      struct goacc_asyncqueue *aq2)
{
  CUevent e;
  CUDA_CALL_ERET (false, cuEventCreate, &e, CU_EVENT_DISABLE_TIMING);
  CUDA_CALL_ERET (false, cuEventRecord, e, aq1->cuda_stream);
  CUDA_CALL_ERET (false, cuStreamWaitEvent, aq2->cuda_stream, e, 0);
  return true;
}

static void
cuda_callback_wrapper (CUstream stream, CUresult res, void *ptr)
{
  if (res != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("%s error: %s", __FUNCTION__, cuda_error (res));
  struct nvptx_callback *cb = (struct nvptx_callback *) ptr;
  cb->fn (cb->ptr);
  free (ptr);
}

void
GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
					   void (*callback_fn)(void *),
					   void *userptr)
{
  struct nvptx_callback *b = GOMP_PLUGIN_malloc (sizeof (*b));
  b->fn = callback_fn;
  b->ptr = userptr;
  b->aq = aq;
  CUDA_CALL_ASSERT (cuStreamAddCallback, aq->cuda_stream,
		    cuda_callback_wrapper, (void *) b, 0);
}

static bool
cuda_memcpy_sanity_check (const void *h, const void *d, size_t s)
{
  CUdeviceptr pb;
  size_t ps;
  if (!s)
    return true;
  if (!d)
    {
      GOMP_PLUGIN_error ("invalid device address");
      return false;
    }
  CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
  if (!pb)
    {
      GOMP_PLUGIN_error ("invalid device address");
      return false;
    }
  if (!h)
    {
      GOMP_PLUGIN_error ("invalid host address");
      return false;
    }
  if (d == h)
    {
      GOMP_PLUGIN_error ("invalid host or device address");
      return false;
    }
  if ((void *)(d + s) > (void *)(pb + ps))
    {
      GOMP_PLUGIN_error ("invalid size");
      return false;
    }
  return true;
}

bool
GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
{
  if (!nvptx_attach_host_thread_to_device (ord)
      || !cuda_memcpy_sanity_check (src, dst, n))
    return false;
  CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) dst, src, n);
  return true;
}

bool
GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
{
  if (!nvptx_attach_host_thread_to_device (ord)
      || !cuda_memcpy_sanity_check (dst, src, n))
    return false;
  CUDA_CALL (cuMemcpyDtoH, dst, (CUdeviceptr) src, n);
  return true;
}

bool
GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
{
  CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, NULL);
  return true;
}

bool
GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src,
				     size_t n, struct goacc_asyncqueue *aq)
{
  if (!nvptx_attach_host_thread_to_device (ord)
      || !cuda_memcpy_sanity_check (src, dst, n))
    return false;
  CUDA_CALL (cuMemcpyHtoDAsync, (CUdeviceptr) dst, src, n, aq->cuda_stream);
  return true;
}

bool
GOMP_OFFLOAD_openacc_async_dev2host (int ord, void *dst, const void *src,
				     size_t n, struct goacc_asyncqueue *aq)
{
  if (!nvptx_attach_host_thread_to_device (ord)
      || !cuda_memcpy_sanity_check (dst, src, n))
    return false;
  CUDA_CALL (cuMemcpyDtoHAsync, dst, (CUdeviceptr) src, n, aq->cuda_stream);
  return true;
}

/* Adjust launch dimensions: pick good values for number of blocks and warps
   and ensure that number of warps does not exceed CUDA limits as well as GCC's
   own limits.  */

static void
nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
			    struct ptx_device *ptx_dev,
			    int *teams_p, int *threads_p)
{
  int max_warps_block = fn->max_threads_per_block / 32;
  /* Maximum 32 warps per block is an implementation limit in NVPTX backend
     and libgcc, which matches documented limit of all GPUs as of 2015.  */
  if (max_warps_block > 32)
    max_warps_block = 32;
  if (*threads_p <= 0)
    *threads_p = 8;
  if (*threads_p > max_warps_block)
    *threads_p = max_warps_block;

  int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
  /* This is an estimate of how many blocks the device can host simultaneously.
     Actual limit, which may be lower, can be queried with "occupancy control"
     driver interface (since CUDA 6.0).  */
  int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
  if (*teams_p <= 0 || *teams_p > max_blocks)
    *teams_p = max_blocks;
}

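/* Worked example for the bounds above (hypothetical numbers): a kernel
   compiled to 32 registers per thread, launched with the default 8 warps,
   consumes 32 * 32 * 8 = 8192 registers per block; on a device with 65536
   registers per SM and 20 SMs this yields 65536 / 8192 * 20 = 160 as the
   default number of teams.  */
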
/* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
   target regions.  */

static size_t
nvptx_stacks_size ()
{
  return 128 * 1024;
}

/* Return contiguous storage for NUM stacks, each SIZE bytes.  */

static void *
nvptx_stacks_alloc (size_t size, int num)
{
  CUdeviceptr stacks;
  CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &stacks, size * num);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
  return (void *) stacks;
}

/* Release storage previously allocated by nvptx_stacks_alloc.  */

static void
nvptx_stacks_free (void *p, int num)
{
  CUresult r = CUDA_CALL_NOCHECK (cuMemFree, (CUdeviceptr) p);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
}

void
GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
{
  CUfunction function = ((struct targ_fn_descriptor *) tgt_fn)->fn;
  CUresult r;
  struct ptx_device *ptx_dev = ptx_devices[ord];
  const char *maybe_abort_msg = "(perhaps abort was called)";
  int teams = 0, threads = 0;

  if (!args)
    GOMP_PLUGIN_fatal ("No target arguments provided");
  while (*args)
    {
      intptr_t id = (intptr_t) *args++, val;
      if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
	val = (intptr_t) *args++;
      else
	val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
      if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
	continue;
      val = val > INT_MAX ? INT_MAX : val;
      id &= GOMP_TARGET_ARG_ID_MASK;
      if (id == GOMP_TARGET_ARG_NUM_TEAMS)
	teams = val;
      else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
	threads = val;
    }
  nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);

  size_t stack_size = nvptx_stacks_size ();
  void *stacks = nvptx_stacks_alloc (stack_size, teams * threads);
  void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
  size_t fn_args_size = sizeof fn_args;
  void *config[] = {
    CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
    CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
    CU_LAUNCH_PARAM_END
  };
  r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
			 32, threads, 1, 0, NULL, NULL, config);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));

  r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
		       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
  nvptx_stacks_free (stacks, teams * threads);
}

void
GOMP_OFFLOAD_async_run (int ord, void *tgt_fn, void *tgt_vars, void **args,
			void *async_data)
{
  GOMP_PLUGIN_fatal ("GOMP_OFFLOAD_async_run unimplemented");
}