]> git.ipfire.org Git - thirdparty/gcc.git/blame - libgomp/plugin/plugin-nvptx.c
Fortran: Accept again tab as alternative to space as separator [PR114304]
[thirdparty/gcc.git] / libgomp / plugin / plugin-nvptx.c
CommitLineData
41dbbb37
TS
1/* Plugin for NVPTX execution.
2
a945c346 3 Copyright (C) 2013-2024 Free Software Foundation, Inc.
41dbbb37
TS
4
5 Contributed by Mentor Embedded.
6
7 This file is part of the GNU Offloading and Multi Processing Library
8 (libgomp).
9
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
13 any later version.
14
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
18 more details.
19
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
23
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
28
29/* Nvidia PTX-specific parts of OpenACC support. The cuda driver
30 library appears to hold some implicit state, but the documentation
31 is not clear as to what that state might be. Or how one might
32 propagate it from one thread to another. */
33
df36a3d3 34#define _GNU_SOURCE
41dbbb37
TS
35#include "openacc.h"
36#include "config.h"
0bac793e 37#include "symcat.h"
41dbbb37 38#include "libgomp-plugin.h"
41dbbb37 39#include "oacc-plugin.h"
2a21ff19 40#include "gomp-constants.h"
5fae049d 41#include "oacc-int.h"
41dbbb37 42
131d18e9
TB
43/* For struct rev_offload + GOMP_REV_OFFLOAD_VAR. */
44#include "config/nvptx/libgomp-nvptx.h"
45
41dbbb37 46#include <pthread.h>
cd644ce8 47#ifndef PLUGIN_NVPTX_INCLUDE_SYSTEM_CUDA_H
5e431ae4
TS
48# include "cuda/cuda.h"
49#else
50# include <cuda.h>
51#endif
41dbbb37 52#include <stdbool.h>
6103184e 53#include <limits.h>
41dbbb37
TS
54#include <string.h>
55#include <stdio.h>
41dbbb37
TS
56#include <unistd.h>
57#include <assert.h>
6668eb45 58#include <errno.h>
130c2f3c 59#include <stdlib.h>
41dbbb37 60
6b577a17
JB
61/* An arbitrary fixed limit (128MB) for the size of the OpenMP soft stacks
62 block to cache between kernel invocations. For soft-stacks blocks bigger
63 than this, we will free the block before attempting another GPU memory
64 allocation (i.e. in GOMP_OFFLOAD_alloc). Otherwise, if an allocation fails,
65 we will free the cached soft-stacks block anyway then retry the
66 allocation. If that fails too, we lose. */
67
68#define SOFTSTACK_CACHE_LIMIT 134217728
69
94767dac
TV
70#if CUDA_VERSION < 6000
71extern CUresult cuGetErrorString (CUresult, const char **);
b113af95 72#define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
94767dac
TV
73#endif
74
8e09a12f
TV
75#if CUDA_VERSION >= 6050
76#undef cuLinkCreate
77#undef cuLinkAddData
78CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
79 const char *, unsigned, CUjit_option *, void **);
80CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
81#else
bd9b3d3d 82typedef size_t (*CUoccupancyB2DSize)(int);
8e09a12f
TV
83CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
84 const char *, unsigned, CUjit_option *, void **);
85CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
bd9b3d3d
CP
86CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
87 CUoccupancyB2DSize, size_t, int);
8e09a12f
TV
88#endif
89
02150de8
TV
90#define DO_PRAGMA(x) _Pragma (#x)
91
cd644ce8 92#ifndef PLUGIN_NVPTX_LINK_LIBCUDA
2393d337
JJ
93# include <dlfcn.h>
94
2393d337 95struct cuda_lib_s {
9e28b107
TV
96
97# define CUDA_ONE_CALL(call) \
98 __typeof (call) *call;
02150de8
TV
99# define CUDA_ONE_CALL_MAYBE_NULL(call) \
100 CUDA_ONE_CALL (call)
8c6310a2 101#include "cuda-lib.def"
9e28b107 102# undef CUDA_ONE_CALL
02150de8 103# undef CUDA_ONE_CALL_MAYBE_NULL
9e28b107 104
2393d337
JJ
105} cuda_lib;
106
107/* -1 if init_cuda_lib has not been called yet, false
108 if it has been and failed, true if it has been and succeeded. */
19929ba9 109static signed char cuda_lib_inited = -1;
2393d337
JJ
110
111/* Dynamically load the CUDA runtime library and initialize function
112 pointers, return false if unsuccessful, true if successful. */
113static bool
114init_cuda_lib (void)
115{
116 if (cuda_lib_inited != -1)
117 return cuda_lib_inited;
118 const char *cuda_runtime_lib = "libcuda.so.1";
119 void *h = dlopen (cuda_runtime_lib, RTLD_LAZY);
120 cuda_lib_inited = false;
121 if (h == NULL)
122 return false;
9e28b107 123
02150de8
TV
124# define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false)
125# define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true)
126# define CUDA_ONE_CALL_1(call, allow_null) \
2393d337 127 cuda_lib.call = dlsym (h, #call); \
02150de8 128 if (!allow_null && cuda_lib.call == NULL) \
ab70addf 129 GOMP_PLUGIN_fatal ("'%s' is missing '%s'", cuda_runtime_lib, #call);
8c6310a2 130#include "cuda-lib.def"
9e28b107
TV
131# undef CUDA_ONE_CALL
132# undef CUDA_ONE_CALL_1
02150de8 133# undef CUDA_ONE_CALL_MAYBE_NULL
9e28b107 134
2393d337
JJ
135 cuda_lib_inited = true;
136 return true;
41dbbb37 137}
2393d337
JJ
138# define CUDA_CALL_PREFIX cuda_lib.
139#else
02150de8
TV
140
141# define CUDA_ONE_CALL(call)
142# define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call)
143#include "cuda-lib.def"
144#undef CUDA_ONE_CALL_MAYBE_NULL
145#undef CUDA_ONE_CALL
146
2393d337
JJ
147# define CUDA_CALL_PREFIX
148# define init_cuda_lib() true
149#endif
41dbbb37 150
df36a3d3
TV
151#include "secure_getenv.h"
152
4cdfee3f
TV
153#undef MIN
154#undef MAX
155#define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
156#define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
157
6ce13072 158/* Convenience macros for the frequently used CUDA library call and
2393d337
JJ
159 error handling sequence as well as CUDA library calls that
160 do the error checking themselves or don't do it at all. */
6ce13072
CLT
161
162#define CUDA_CALL_ERET(ERET, FN, ...) \
163 do { \
2393d337
JJ
164 unsigned __r \
165 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
6ce13072
CLT
166 if (__r != CUDA_SUCCESS) \
167 { \
168 GOMP_PLUGIN_error (#FN " error: %s", \
169 cuda_error (__r)); \
170 return ERET; \
171 } \
172 } while (0)
173
174#define CUDA_CALL(FN, ...) \
2393d337 175 CUDA_CALL_ERET (false, FN, __VA_ARGS__)
6ce13072
CLT
176
177#define CUDA_CALL_ASSERT(FN, ...) \
178 do { \
2393d337
JJ
179 unsigned __r \
180 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
6ce13072
CLT
181 if (__r != CUDA_SUCCESS) \
182 { \
183 GOMP_PLUGIN_fatal (#FN " error: %s", \
184 cuda_error (__r)); \
185 } \
186 } while (0)
187
2393d337
JJ
188#define CUDA_CALL_NOCHECK(FN, ...) \
189 CUDA_CALL_PREFIX FN (__VA_ARGS__)
190
02150de8
TV
191#define CUDA_CALL_EXISTS(FN) \
192 CUDA_CALL_PREFIX FN
193
2393d337
JJ
194static const char *
195cuda_error (CUresult r)
196{
cedd9bd0 197 const char *fallback = "unknown cuda error";
2393d337
JJ
198 const char *desc;
199
cedd9bd0
TV
200 if (!CUDA_CALL_EXISTS (cuGetErrorString))
201 return fallback;
202
2393d337 203 r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
cedd9bd0
TV
204 if (r == CUDA_SUCCESS)
205 return desc;
2393d337 206
cedd9bd0 207 return fallback;
2393d337
JJ
208}
209
6c84c8bf
MR
210/* Version of the CUDA Toolkit in the same MAJOR.MINOR format that is used by
211 Nvidia, such as in the 'deviceQuery' program (Nvidia's CUDA samples). */
212static char cuda_driver_version_s[30];
213
d93bdab5
JB
214static unsigned int instantiated_devices = 0;
215static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
41dbbb37 216
1f4c5b9b
CLT
217/* NVPTX/CUDA specific definition of asynchronous queues. */
218struct goacc_asyncqueue
2049befd 219{
1f4c5b9b 220 CUstream cuda_stream;
2049befd
CP
221};
222
1f4c5b9b 223struct nvptx_callback
41dbbb37 224{
1f4c5b9b
CLT
225 void (*fn) (void *);
226 void *ptr;
227 struct goacc_asyncqueue *aq;
228 struct nvptx_callback *next;
41dbbb37
TS
229};
230
231/* Thread-specific data for PTX. */
232
233struct nvptx_thread
234{
1f4c5b9b
CLT
235 /* We currently have this embedded inside the plugin because libgomp manages
236 devices through integer target_ids. This might be better if using an
237 opaque target-specific pointer directly from gomp_device_descr. */
41dbbb37
TS
238 struct ptx_device *ptx_dev;
239};
240
3e32ee19
NS
241/* Target data function launch information. */
242
243struct targ_fn_launch
244{
245 const char *fn;
cc3cd79b 246 unsigned short dim[GOMP_DIM_MAX];
3e32ee19
NS
247};
248
cc3cd79b
NS
249/* Target PTX object information. */
250
251struct targ_ptx_obj
252{
253 const char *code;
254 size_t size;
255};
256
257/* Target data image information. */
258
259typedef struct nvptx_tdata
260{
261 const struct targ_ptx_obj *ptx_objs;
262 unsigned ptx_num;
263
264 const char *const *var_names;
265 unsigned var_num;
266
267 const struct targ_fn_launch *fn_descs;
268 unsigned fn_num;
a49c7d31
KCY
269
270 unsigned ind_fn_num;
cc3cd79b
NS
271} nvptx_tdata_t;
272
f3e9a059
NS
273/* Descriptor of a loaded function. */
274
275struct targ_fn_descriptor
276{
277 CUfunction fn;
3e32ee19 278 const struct targ_fn_launch *launch;
6103184e
AM
279 int regs_per_thread;
280 int max_threads_per_block;
f3e9a059
NS
281};
282
283/* A loaded PTX image. */
284struct ptx_image_data
285{
286 const void *target_data;
287 CUmodule module;
288
289 struct targ_fn_descriptor *fns; /* Array of functions. */
290
291 struct ptx_image_data *next;
292};
293
1f4c5b9b
CLT
294struct ptx_free_block
295{
296 void *ptr;
297 struct ptx_free_block *next;
298};
299
41dbbb37
TS
300struct ptx_device
301{
302 CUcontext ctx;
303 bool ctx_shared;
304 CUdevice dev;
1f4c5b9b 305
41dbbb37
TS
306 int ord;
307 bool overlap;
308 bool map;
309 bool concur;
41dbbb37 310 bool mkern;
6c84c8bf 311 int mode;
6103184e
AM
312 int clock_khz;
313 int num_sms;
314 int regs_per_block;
315 int regs_per_sm;
0c6c2f5f
CP
316 int warp_size;
317 int max_threads_per_block;
318 int max_threads_per_multiprocessor;
0b210c43 319 int default_dims[GOMP_DIM_MAX];
41dbbb37 320
6c84c8bf
MR
321 /* Length as used by the CUDA Runtime API ('struct cudaDeviceProp'). */
322 char name[256];
323
f3e9a059
NS
324 struct ptx_image_data *images; /* Images loaded on device. */
325 pthread_mutex_t image_lock; /* Lock for above list. */
41dbbb37 326
1f4c5b9b
CLT
327 struct ptx_free_block *free_blocks;
328 pthread_mutex_t free_blocks_lock;
41dbbb37 329
6b577a17
JB
330 /* OpenMP stacks, cached between kernel invocations. */
331 struct
332 {
333 CUdeviceptr ptr;
334 size_t size;
335 pthread_mutex_t lock;
336 } omp_stacks;
337
131d18e9 338 struct rev_offload *rev_data;
1f4c5b9b 339 struct ptx_device *next;
41dbbb37
TS
340};
341
d93bdab5
JB
342static struct ptx_device **ptx_devices;
343
30486fab
AS
344/* OpenMP kernels reserve a small amount of ".shared" space for use by
345 omp_alloc. The size is configured using GOMP_NVPTX_LOWLAT_POOL, but the
346 default is set here. */
347static unsigned lowlat_pool_size = 8 * 1024;
348
41dbbb37
TS
349static inline struct nvptx_thread *
350nvptx_thread (void)
351{
352 return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
353}
354
d93bdab5
JB
355/* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK
356 should be locked on entry and remains locked on exit. */
f3e9a059 357
d93bdab5 358static bool
41dbbb37
TS
359nvptx_init (void)
360{
d93bdab5 361 int ndevs;
41dbbb37 362
d93bdab5
JB
363 if (instantiated_devices != 0)
364 return true;
41dbbb37 365
2393d337
JJ
366 if (!init_cuda_lib ())
367 return false;
368
369 CUDA_CALL (cuInit, 0);
370
6c84c8bf
MR
371 int cuda_driver_version;
372 CUDA_CALL_ERET (NULL, cuDriverGetVersion, &cuda_driver_version);
373 snprintf (cuda_driver_version_s, sizeof cuda_driver_version_s,
374 "CUDA Driver %u.%u",
375 cuda_driver_version / 1000, cuda_driver_version % 1000 / 10);
376
6ce13072 377 CUDA_CALL (cuDeviceGetCount, &ndevs);
d93bdab5
JB
378 ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
379 * ndevs);
6c84c8bf 380
d93bdab5 381 return true;
41dbbb37
TS
382}
383
d93bdab5
JB
384/* Select the N'th PTX device for the current host thread. The device must
385 have been previously opened before calling this function. */
386
6ce13072 387static bool
d93bdab5 388nvptx_attach_host_thread_to_device (int n)
41dbbb37 389{
d93bdab5
JB
390 CUdevice dev;
391 CUresult r;
392 struct ptx_device *ptx_dev;
393 CUcontext thd_ctx;
394
2393d337 395 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
1f4c5b9b
CLT
396 if (r == CUDA_ERROR_NOT_PERMITTED)
397 {
398 /* Assume we're in a CUDA callback, just return true. */
399 return true;
400 }
d93bdab5 401 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
6ce13072
CLT
402 {
403 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
404 return false;
405 }
d93bdab5
JB
406
407 if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
6ce13072 408 return true;
d93bdab5
JB
409 else
410 {
411 CUcontext old_ctx;
412
413 ptx_dev = ptx_devices[n];
6ce13072
CLT
414 if (!ptx_dev)
415 {
416 GOMP_PLUGIN_error ("device %d not found", n);
417 return false;
418 }
d93bdab5 419
6ce13072 420 CUDA_CALL (cuCtxGetCurrent, &thd_ctx);
d93bdab5
JB
421
422 /* We don't necessarily have a current context (e.g. if it has been
423 destroyed. Pop it if we do though. */
424 if (thd_ctx != NULL)
6ce13072 425 CUDA_CALL (cuCtxPopCurrent, &old_ctx);
d93bdab5 426
6ce13072 427 CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
d93bdab5 428 }
6ce13072 429 return true;
41dbbb37
TS
430}
431
d93bdab5 432static struct ptx_device *
41dbbb37
TS
433nvptx_open_device (int n)
434{
435 struct ptx_device *ptx_dev;
d93bdab5 436 CUdevice dev, ctx_dev;
41dbbb37 437 CUresult r;
131d18e9 438 int pi;
41dbbb37 439
6ce13072 440 CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);
41dbbb37
TS
441
442 ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));
443
444 ptx_dev->ord = n;
445 ptx_dev->dev = dev;
446 ptx_dev->ctx_shared = false;
447
2393d337 448 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
d93bdab5 449 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
6ce13072
CLT
450 {
451 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
452 return NULL;
453 }
d93bdab5
JB
454
455 if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
456 {
457 /* The current host thread has an active context for a different device.
458 Detach it. */
459 CUcontext old_ctx;
6ce13072 460 CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
d93bdab5
JB
461 }
462
6ce13072 463 CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);
41dbbb37
TS
464
465 if (!ptx_dev->ctx)
6ce13072 466 CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
41dbbb37
TS
467 else
468 ptx_dev->ctx_shared = true;
469
6ce13072
CLT
470 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
471 &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
41dbbb37
TS
472 ptx_dev->overlap = pi;
473
6ce13072
CLT
474 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
475 &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
41dbbb37
TS
476 ptx_dev->map = pi;
477
6ce13072
CLT
478 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
479 &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
41dbbb37
TS
480 ptx_dev->concur = pi;
481
6ce13072
CLT
482 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
483 &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
41dbbb37
TS
484 ptx_dev->mode = pi;
485
6ce13072
CLT
486 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
487 &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
41dbbb37
TS
488 ptx_dev->mkern = pi;
489
6103184e
AM
490 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
491 &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
492 ptx_dev->clock_khz = pi;
493
2393d337 494 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
6103184e
AM
495 &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
496 ptx_dev->num_sms = pi;
497
498 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
499 &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
500 ptx_dev->regs_per_block = pi;
501
b113af95 502 /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only
6103184e 503 in CUDA 6.0 and newer. */
b113af95
TV
504 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
505 CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
506 dev);
6103184e
AM
507 /* Fallback: use limit of registers per block, which is usually equal. */
508 if (r == CUDA_ERROR_INVALID_VALUE)
509 pi = ptx_dev->regs_per_block;
510 else if (r != CUDA_SUCCESS)
511 {
512 GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
513 return NULL;
514 }
515 ptx_dev->regs_per_sm = pi;
516
517 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
518 &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
519 if (pi != 32)
520 {
521 GOMP_PLUGIN_error ("Only warp size 32 is supported");
522 return NULL;
523 }
0c6c2f5f
CP
524 ptx_dev->warp_size = pi;
525
526 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
527 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
528 ptx_dev->max_threads_per_block = pi;
529
530 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
531 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
532 ptx_dev->max_threads_per_multiprocessor = pi;
6103184e 533
131d18e9
TB
534 /* Required below for reverse offload as implemented, but with compute
535 capability >= 2.0 and 64bit device processes, this should be universally be
536 the case; hence, an assert. */
537 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
538 CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev);
539 assert (r == CUDA_SUCCESS && pi);
41dbbb37 540
0b210c43
TV
541 for (int i = 0; i != GOMP_DIM_MAX; i++)
542 ptx_dev->default_dims[i] = 0;
543
6c84c8bf
MR
544 CUDA_CALL_ERET (NULL, cuDeviceGetName, ptx_dev->name, sizeof ptx_dev->name,
545 dev);
546
f3e9a059
NS
547 ptx_dev->images = NULL;
548 pthread_mutex_init (&ptx_dev->image_lock, NULL);
549
1f4c5b9b
CLT
550 ptx_dev->free_blocks = NULL;
551 pthread_mutex_init (&ptx_dev->free_blocks_lock, NULL);
41dbbb37 552
6b577a17
JB
553 ptx_dev->omp_stacks.ptr = 0;
554 ptx_dev->omp_stacks.size = 0;
555 pthread_mutex_init (&ptx_dev->omp_stacks.lock, NULL);
556
20553883
TS
557 ptx_dev->rev_data = NULL;
558
d93bdab5 559 return ptx_dev;
41dbbb37
TS
560}
561
6ce13072 562static bool
d93bdab5 563nvptx_close_device (struct ptx_device *ptx_dev)
41dbbb37 564{
41dbbb37 565 if (!ptx_dev)
6ce13072 566 return true;
41dbbb37 567
1f4c5b9b
CLT
568 for (struct ptx_free_block *b = ptx_dev->free_blocks; b;)
569 {
570 struct ptx_free_block *b_next = b->next;
571 CUDA_CALL (cuMemFree, (CUdeviceptr) b->ptr);
572 free (b);
573 b = b_next;
574 }
575
576 pthread_mutex_destroy (&ptx_dev->free_blocks_lock);
f3e9a059 577 pthread_mutex_destroy (&ptx_dev->image_lock);
41dbbb37 578
6b577a17
JB
579 pthread_mutex_destroy (&ptx_dev->omp_stacks.lock);
580
581 if (ptx_dev->omp_stacks.ptr)
582 CUDA_CALL (cuMemFree, ptx_dev->omp_stacks.ptr);
583
41dbbb37 584 if (!ptx_dev->ctx_shared)
6ce13072 585 CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
41dbbb37
TS
586
587 free (ptx_dev);
6ce13072 588 return true;
41dbbb37
TS
589}
590
591static int
592nvptx_get_num_devices (void)
593{
594 int n;
41dbbb37
TS
595
596 /* This function will be called before the plugin has been initialized in
597 order to enumerate available devices, but CUDA API routines can't be used
598 until cuInit has been called. Just call it now (but don't yet do any
599 further initialization). */
d93bdab5 600 if (instantiated_devices == 0)
c8319826 601 {
2393d337
JJ
602 if (!init_cuda_lib ())
603 return 0;
604 CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
c8319826
JB
605 /* This is not an error: e.g. we may have CUDA libraries installed but
606 no devices available. */
607 if (r != CUDA_SUCCESS)
78672bd8
TS
608 {
609 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
610 cuda_error (r));
611 return 0;
612 }
c8319826 613 }
41dbbb37 614
37078f24 615 CUDA_CALL_ASSERT (cuDeviceGetCount, &n);
41dbbb37
TS
616 return n;
617}
618
dfb15f6b
TV
619static void
620notify_var (const char *var_name, const char *env_var)
621{
622 if (env_var == NULL)
623 GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name);
624 else
625 GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var);
626}
41dbbb37 627
df36a3d3
TV
628static void
629process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o)
630{
631 const char *var_name = "GOMP_NVPTX_JIT";
632 const char *env_var = secure_getenv (var_name);
633 notify_var (var_name, env_var);
634
635 if (env_var == NULL)
636 return;
637
638 const char *c = env_var;
639 while (*c != '\0')
640 {
641 while (*c == ' ')
642 c++;
643
644 if (c[0] == '-' && c[1] == 'O'
645 && '0' <= c[2] && c[2] <= '4'
646 && (c[3] == '\0' || c[3] == ' '))
647 {
648 *gomp_nvptx_o = c[2] - '0';
649 c += 3;
650 continue;
651 }
652
653 GOMP_PLUGIN_error ("Error parsing %s", var_name);
654 break;
655 }
656}
657
6ce13072 658static bool
cc3cd79b
NS
659link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
660 unsigned num_objs)
41dbbb37 661{
df36a3d3
TV
662 CUjit_option opts[7];
663 void *optvals[7];
41dbbb37 664 float elapsed = 0.0;
6103184e
AM
665 char elog[1024];
666 char ilog[16384];
41dbbb37
TS
667 CUlinkState linkstate;
668 CUresult r;
669 void *linkout;
670 size_t linkoutsize __attribute__ ((unused));
671
41dbbb37
TS
672 opts[0] = CU_JIT_WALL_TIME;
673 optvals[0] = &elapsed;
674
675 opts[1] = CU_JIT_INFO_LOG_BUFFER;
676 optvals[1] = &ilog[0];
677
678 opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
6103184e 679 optvals[2] = (void *) sizeof ilog;
41dbbb37
TS
680
681 opts[3] = CU_JIT_ERROR_LOG_BUFFER;
682 optvals[3] = &elog[0];
683
684 opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
6103184e 685 optvals[4] = (void *) sizeof elog;
41dbbb37
TS
686
687 opts[5] = CU_JIT_LOG_VERBOSE;
688 optvals[5] = (void *) 1;
689
df36a3d3
TV
690 static intptr_t gomp_nvptx_o = -1;
691
692 static bool init_done = false;
693 if (!init_done)
694 {
695 process_GOMP_NVPTX_JIT (&gomp_nvptx_o);
696 init_done = true;
697 }
698
699 int nopts = 6;
700 if (gomp_nvptx_o != -1)
701 {
702 opts[nopts] = CU_JIT_OPTIMIZATION_LEVEL;
703 optvals[nopts] = (void *) gomp_nvptx_o;
704 nopts++;
705 }
706
8e09a12f
TV
707 if (CUDA_CALL_EXISTS (cuLinkCreate_v2))
708 CUDA_CALL (cuLinkCreate_v2, nopts, opts, optvals, &linkstate);
709 else
710 CUDA_CALL (cuLinkCreate, nopts, opts, optvals, &linkstate);
41dbbb37 711
cc3cd79b 712 for (; num_objs--; ptx_objs++)
41dbbb37 713 {
cc3cd79b
NS
714 /* cuLinkAddData's 'data' argument erroneously omits the const
715 qualifier. */
716 GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
8e09a12f
TV
717 if (CUDA_CALL_EXISTS (cuLinkAddData_v2))
718 r = CUDA_CALL_NOCHECK (cuLinkAddData_v2, linkstate, CU_JIT_INPUT_PTX,
719 (char *) ptx_objs->code, ptx_objs->size,
720 0, 0, 0, 0);
721 else
722 r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
723 (char *) ptx_objs->code, ptx_objs->size,
724 0, 0, 0, 0);
cc3cd79b
NS
725 if (r != CUDA_SUCCESS)
726 {
727 GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
6ce13072 728 GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
cc3cd79b 729 cuda_error (r));
6ce13072 730 return false;
cc3cd79b 731 }
41dbbb37
TS
732 }
733
cc3cd79b 734 GOMP_PLUGIN_debug (0, "Linking\n");
2393d337 735 r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);
41dbbb37
TS
736
737 GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
738 GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
739
cc3cd79b 740 if (r != CUDA_SUCCESS)
6ce13072 741 {
c0e9cee2 742 GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
6ce13072
CLT
743 GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
744 return false;
745 }
cc3cd79b 746
6ce13072
CLT
747 CUDA_CALL (cuModuleLoadData, module, linkout);
748 CUDA_CALL (cuLinkDestroy, linkstate);
749 return true;
41dbbb37
TS
750}
751
e70ab10d 752static void
f8332e52 753nvptx_exec (void (*fn), unsigned *dims, void *targ_mem_desc,
1f4c5b9b 754 CUdeviceptr dp, CUstream stream)
41dbbb37
TS
755{
756 struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
757 CUfunction function;
41dbbb37 758 int i;
41dbbb37 759 void *kargs[1];
41dbbb37 760 struct nvptx_thread *nvthd = nvptx_thread ();
4cdfee3f 761 int warp_size = nvthd->ptx_dev->warp_size;
41dbbb37
TS
762
763 function = targ_fn->fn;
764
3e32ee19
NS
765 /* Initialize the launch dimensions. Typically this is constant,
766 provided by the device compiler, but we must permit runtime
767 values. */
f99c3557
TS
768 int seen_zero = 0;
769 for (i = 0; i != GOMP_DIM_MAX; i++)
770 {
771 if (targ_fn->launch->dim[i])
772 dims[i] = targ_fn->launch->dim[i];
773 if (!dims[i])
774 seen_zero = 1;
775 }
776
777 if (seen_zero)
778 {
6668eb45 779 pthread_mutex_lock (&ptx_dev_lock);
0b210c43
TV
780
781 static int gomp_openacc_dims[GOMP_DIM_MAX];
782 if (!gomp_openacc_dims[0])
783 {
784 /* See if the user provided GOMP_OPENACC_DIM environment
785 variable to specify runtime defaults. */
786 for (int i = 0; i < GOMP_DIM_MAX; ++i)
787 gomp_openacc_dims[i] = GOMP_PLUGIN_acc_default_dim (i);
788 }
789
790 if (!nvthd->ptx_dev->default_dims[0])
6668eb45 791 {
0b210c43 792 int default_dims[GOMP_DIM_MAX];
ec00d3fa 793 for (int i = 0; i < GOMP_DIM_MAX; ++i)
0b210c43 794 default_dims[i] = gomp_openacc_dims[i];
6668eb45 795
0c6c2f5f
CP
796 int gang, worker, vector;
797 {
0c6c2f5f
CP
798 int block_size = nvthd->ptx_dev->max_threads_per_block;
799 int cpu_size = nvthd->ptx_dev->max_threads_per_multiprocessor;
800 int dev_size = nvthd->ptx_dev->num_sms;
801 GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
802 " dev_size=%d, cpu_size=%d\n",
803 warp_size, block_size, dev_size, cpu_size);
804
805 gang = (cpu_size / block_size) * dev_size;
806 worker = block_size / warp_size;
807 vector = warp_size;
808 }
6668eb45
CP
809
810 /* There is no upper bound on the gang size. The best size
811 matches the hardware configuration. Logical gangs are
812 scheduled onto physical hardware. To maximize usage, we
813 should guess a large number. */
814 if (default_dims[GOMP_DIM_GANG] < 1)
815 default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
816 /* The worker size must not exceed the hardware. */
817 if (default_dims[GOMP_DIM_WORKER] < 1
818 || (default_dims[GOMP_DIM_WORKER] > worker && gang))
819 default_dims[GOMP_DIM_WORKER] = worker;
820 /* The vector size must exactly match the hardware. */
821 if (default_dims[GOMP_DIM_VECTOR] < 1
822 || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
823 default_dims[GOMP_DIM_VECTOR] = vector;
824
825 GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
826 default_dims[GOMP_DIM_GANG],
827 default_dims[GOMP_DIM_WORKER],
828 default_dims[GOMP_DIM_VECTOR]);
0b210c43
TV
829
830 for (i = 0; i != GOMP_DIM_MAX; i++)
831 nvthd->ptx_dev->default_dims[i] = default_dims[i];
6668eb45
CP
832 }
833 pthread_mutex_unlock (&ptx_dev_lock);
834
4cdfee3f
TV
835 {
836 bool default_dim_p[GOMP_DIM_MAX];
837 for (i = 0; i != GOMP_DIM_MAX; i++)
bd9b3d3d
CP
838 default_dim_p[i] = !dims[i];
839
840 if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize))
4cdfee3f 841 {
bd9b3d3d
CP
842 for (i = 0; i != GOMP_DIM_MAX; i++)
843 if (default_dim_p[i])
844 dims[i] = nvthd->ptx_dev->default_dims[i];
845
846 if (default_dim_p[GOMP_DIM_VECTOR])
847 dims[GOMP_DIM_VECTOR]
848 = MIN (dims[GOMP_DIM_VECTOR],
849 (targ_fn->max_threads_per_block / warp_size
850 * warp_size));
851
852 if (default_dim_p[GOMP_DIM_WORKER])
853 dims[GOMP_DIM_WORKER]
854 = MIN (dims[GOMP_DIM_WORKER],
855 targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
4cdfee3f 856 }
bd9b3d3d
CP
857 else
858 {
859 /* Handle the case that the compiler allows the runtime to choose
860 the vector-length conservatively, by ignoring
861 gomp_openacc_dims[GOMP_DIM_VECTOR]. TODO: actually handle
862 it. */
863 int vectors = 0;
864 /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that that
865 gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
866 exceed targ_fn->max_threads_per_block. */
867 int workers = gomp_openacc_dims[GOMP_DIM_WORKER];
868 int gangs = gomp_openacc_dims[GOMP_DIM_GANG];
869 int grids, blocks;
870
871 CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
872 &blocks, function, NULL, 0,
873 dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
874 GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
875 "grid = %d, block = %d\n", grids, blocks);
876
877 /* Keep the num_gangs proportional to the block size. In
878 the case were a block size is limited by shared-memory
879 or the register file capacity, the runtime will not
880 excessively over assign gangs to the multiprocessor
881 units if their state is going to be swapped out even
882 more than necessary. The constant factor 2 is there to
883 prevent threads from idling when there is insufficient
884 work for them. */
885 if (gangs == 0)
886 gangs = 2 * grids * (blocks / warp_size);
887
888 if (vectors == 0)
889 vectors = warp_size;
890
891 if (workers == 0)
892 {
893 int actual_vectors = (default_dim_p[GOMP_DIM_VECTOR]
894 ? vectors
895 : dims[GOMP_DIM_VECTOR]);
896 workers = blocks / actual_vectors;
2c372e81 897 workers = MAX (workers, 1);
052aaace
TV
898 /* If we need a per-worker barrier ... . */
899 if (actual_vectors > 32)
900 /* Don't use more barriers than available. */
901 workers = MIN (workers, 15);
bd9b3d3d 902 }
4cdfee3f 903
bd9b3d3d
CP
904 for (i = 0; i != GOMP_DIM_MAX; i++)
905 if (default_dim_p[i])
906 switch (i)
907 {
908 case GOMP_DIM_GANG: dims[i] = gangs; break;
909 case GOMP_DIM_WORKER: dims[i] = workers; break;
910 case GOMP_DIM_VECTOR: dims[i] = vectors; break;
911 default: GOMP_PLUGIN_fatal ("invalid dim");
912 }
913 }
4cdfee3f 914 }
f99c3557 915 }
3e32ee19 916
88a4654d
CP
917 /* Check if the accelerator has sufficient hardware resources to
918 launch the offloaded kernel. */
919 if (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]
920 > targ_fn->max_threads_per_block)
921 {
52d22ece
TV
922 const char *msg
923 = ("The Nvidia accelerator has insufficient resources to launch '%s'"
924 " with num_workers = %d and vector_length = %d"
925 "; "
926 "recompile the program with 'num_workers = x and vector_length = y'"
927 " on that offloaded region or '-fopenacc-dim=:x:y' where"
928 " x * y <= %d"
929 ".\n");
930 GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
931 dims[GOMP_DIM_VECTOR], targ_fn->max_threads_per_block);
88a4654d
CP
932 }
933
052aaace
TV
934 /* Check if the accelerator has sufficient barrier resources to
935 launch the offloaded kernel. */
936 if (dims[GOMP_DIM_WORKER] > 15 && dims[GOMP_DIM_VECTOR] > 32)
937 {
938 const char *msg
939 = ("The Nvidia accelerator has insufficient barrier resources to launch"
940 " '%s' with num_workers = %d and vector_length = %d"
941 "; "
942 "recompile the program with 'num_workers = x' on that offloaded"
943 " region or '-fopenacc-dim=:x:' where x <= 15"
944 "; "
945 "or, recompile the program with 'vector_length = 32' on that"
2c2ff168 946 " offloaded region or '-fopenacc-dim=::32'"
052aaace
TV
947 ".\n");
948 GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
949 dims[GOMP_DIM_VECTOR]);
950 }
951
3e32ee19
NS
952 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
953 " gangs=%u, workers=%u, vectors=%u\n",
6668eb45
CP
954 __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
955 dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);
41dbbb37
TS
956
957 // OpenACC CUDA
958 //
3e32ee19
NS
959 // num_gangs nctaid.x
960 // num_workers ntid.y
961 // vector length ntid.x
5fae049d
TS
962
963 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
964 acc_prof_info *prof_info = thr->prof_info;
965 acc_event_info enqueue_launch_event_info;
966 acc_api_info *api_info = thr->api_info;
967 bool profiling_p = __builtin_expect (prof_info != NULL, false);
968 if (profiling_p)
969 {
970 prof_info->event_type = acc_ev_enqueue_launch_start;
971
972 enqueue_launch_event_info.launch_event.event_type
973 = prof_info->event_type;
974 enqueue_launch_event_info.launch_event.valid_bytes
975 = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
976 enqueue_launch_event_info.launch_event.parent_construct
977 = acc_construct_parallel;
978 enqueue_launch_event_info.launch_event.implicit = 1;
979 enqueue_launch_event_info.launch_event.tool_info = NULL;
980 enqueue_launch_event_info.launch_event.kernel_name = targ_fn->launch->fn;
981 enqueue_launch_event_info.launch_event.num_gangs
982 = dims[GOMP_DIM_GANG];
983 enqueue_launch_event_info.launch_event.num_workers
984 = dims[GOMP_DIM_WORKER];
985 enqueue_launch_event_info.launch_event.vector_length
986 = dims[GOMP_DIM_VECTOR];
987
988 api_info->device_api = acc_device_api_cuda;
989
990 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
991 api_info);
992 }
993
41dbbb37 994 kargs[0] = &dp;
6ce13072
CLT
995 CUDA_CALL_ASSERT (cuLaunchKernel, function,
996 dims[GOMP_DIM_GANG], 1, 1,
997 dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
1f4c5b9b 998 0, stream, kargs, 0);
41dbbb37 999
5fae049d
TS
1000 if (profiling_p)
1001 {
1002 prof_info->event_type = acc_ev_enqueue_launch_end;
1003 enqueue_launch_event_info.launch_event.event_type
1004 = prof_info->event_type;
1005 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
1006 api_info);
1007 }
1008
41dbbb37 1009 GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__,
3e32ee19 1010 targ_fn->launch->fn);
41dbbb37
TS
1011}
1012
1013void * openacc_get_current_cuda_context (void);
1014
5fae049d
TS
1015static void
1016goacc_profiling_acc_ev_alloc (struct goacc_thread *thr, void *dp, size_t s)
1017{
1018 acc_prof_info *prof_info = thr->prof_info;
1019 acc_event_info data_event_info;
1020 acc_api_info *api_info = thr->api_info;
1021
1022 prof_info->event_type = acc_ev_alloc;
1023
1024 data_event_info.data_event.event_type = prof_info->event_type;
1025 data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1026 data_event_info.data_event.parent_construct = acc_construct_parallel;
1027 data_event_info.data_event.implicit = 1;
1028 data_event_info.data_event.tool_info = NULL;
1029 data_event_info.data_event.var_name = NULL;
1030 data_event_info.data_event.bytes = s;
1031 data_event_info.data_event.host_ptr = NULL;
1032 data_event_info.data_event.device_ptr = dp;
1033
1034 api_info->device_api = acc_device_api_cuda;
1035
1036 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
1037}
1038
6b577a17
JB
1039/* Free the cached soft-stacks block if it is above the SOFTSTACK_CACHE_LIMIT
1040 size threshold, or if FORCE is true. */
1041
1042static void
1043nvptx_stacks_free (struct ptx_device *ptx_dev, bool force)
1044{
1045 pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
1046 if (ptx_dev->omp_stacks.ptr
1047 && (force || ptx_dev->omp_stacks.size > SOFTSTACK_CACHE_LIMIT))
1048 {
1049 CUresult r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
1050 if (r != CUDA_SUCCESS)
1051 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
1052 ptx_dev->omp_stacks.ptr = 0;
1053 ptx_dev->omp_stacks.size = 0;
1054 }
1055 pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
1056}
1057
41dbbb37 1058static void *
6b577a17 1059nvptx_alloc (size_t s, bool suppress_errors)
41dbbb37
TS
1060{
1061 CUdeviceptr d;
41dbbb37 1062
6b577a17
JB
1063 CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &d, s);
1064 if (suppress_errors && r == CUDA_ERROR_OUT_OF_MEMORY)
1065 return NULL;
1066 else if (r != CUDA_SUCCESS)
1067 {
1068 GOMP_PLUGIN_error ("nvptx_alloc error: %s", cuda_error (r));
1069 return NULL;
1070 }
1071
1072 /* NOTE: We only do profiling stuff if the memory allocation succeeds. */
5fae049d
TS
1073 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1074 bool profiling_p
1075 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1076 if (profiling_p)
1077 goacc_profiling_acc_ev_alloc (thr, (void *) d, s);
1078
6ce13072 1079 return (void *) d;
41dbbb37
TS
1080}
1081
5fae049d
TS
1082static void
1083goacc_profiling_acc_ev_free (struct goacc_thread *thr, void *p)
1084{
1085 acc_prof_info *prof_info = thr->prof_info;
1086 acc_event_info data_event_info;
1087 acc_api_info *api_info = thr->api_info;
1088
1089 prof_info->event_type = acc_ev_free;
1090
1091 data_event_info.data_event.event_type = prof_info->event_type;
1092 data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1093 data_event_info.data_event.parent_construct = acc_construct_parallel;
1094 data_event_info.data_event.implicit = 1;
1095 data_event_info.data_event.tool_info = NULL;
1096 data_event_info.data_event.var_name = NULL;
1097 data_event_info.data_event.bytes = -1;
1098 data_event_info.data_event.host_ptr = NULL;
1099 data_event_info.data_event.device_ptr = p;
1100
1101 api_info->device_api = acc_device_api_cuda;
1102
1103 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
1104}
1105
6ce13072 1106static bool
1f4c5b9b 1107nvptx_free (void *p, struct ptx_device *ptx_dev)
41dbbb37 1108{
f9b98328
CLT
1109 CUdeviceptr pb;
1110 size_t ps;
1111
1112 CUresult r = CUDA_CALL_NOCHECK (cuMemGetAddressRange, &pb, &ps,
1113 (CUdeviceptr) p);
1114 if (r == CUDA_ERROR_NOT_PERMITTED)
1f4c5b9b 1115 {
f9b98328
CLT
1116 /* We assume that this error indicates we are in a CUDA callback context,
1117 where all CUDA calls are not allowed (see cuStreamAddCallback
1118 documentation for description). Arrange to free this piece of device
1119 memory later. */
1f4c5b9b
CLT
1120 struct ptx_free_block *n
1121 = GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block));
1122 n->ptr = p;
1123 pthread_mutex_lock (&ptx_dev->free_blocks_lock);
1124 n->next = ptx_dev->free_blocks;
1125 ptx_dev->free_blocks = n;
1126 pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
1127 return true;
1128 }
f9b98328
CLT
1129 else if (r != CUDA_SUCCESS)
1130 {
1131 GOMP_PLUGIN_error ("cuMemGetAddressRange error: %s", cuda_error (r));
1132 return false;
1133 }
6ce13072
CLT
1134 if ((CUdeviceptr) p != pb)
1135 {
1136 GOMP_PLUGIN_error ("invalid device address");
1137 return false;
1138 }
41dbbb37 1139
6ce13072 1140 CUDA_CALL (cuMemFree, (CUdeviceptr) p);
5fae049d
TS
1141 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1142 bool profiling_p
1143 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1144 if (profiling_p)
1145 goacc_profiling_acc_ev_free (thr, p);
1146
6ce13072 1147 return true;
41dbbb37
TS
1148}
1149
1f4c5b9b
CLT
1150static void *
1151nvptx_get_current_cuda_device (void)
41dbbb37 1152{
41dbbb37
TS
1153 struct nvptx_thread *nvthd = nvptx_thread ();
1154
1f4c5b9b
CLT
1155 if (!nvthd || !nvthd->ptx_dev)
1156 return NULL;
41dbbb37 1157
1f4c5b9b 1158 return &nvthd->ptx_dev->dev;
41dbbb37
TS
1159}
1160
1f4c5b9b
CLT
1161static void *
1162nvptx_get_current_cuda_context (void)
41dbbb37
TS
1163{
1164 struct nvptx_thread *nvthd = nvptx_thread ();
1165
1166 if (!nvthd || !nvthd->ptx_dev)
1167 return NULL;
1168
1169 return nvthd->ptx_dev->ctx;
1170}
1171
41dbbb37
TS
1172/* Plugin entry points. */
1173
1174const char *
1175GOMP_OFFLOAD_get_name (void)
1176{
1177 return "nvptx";
1178}
1179
1180unsigned int
1181GOMP_OFFLOAD_get_caps (void)
1182{
6103184e 1183 return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
41dbbb37
TS
1184}
1185
1186int
1187GOMP_OFFLOAD_get_type (void)
1188{
1189 return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
1190}
1191
1192int
683f1184 1193GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask)
41dbbb37 1194{
683f1184
TB
1195 int num_devices = nvptx_get_num_devices ();
1196 /* Return -1 if no omp_requires_mask cannot be fulfilled but
131d18e9
TB
1197 devices were present. Unified-shared address: see comment in
1198 nvptx_open_device for CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING. */
1199 if (num_devices > 0
ea4b23d9
TB
1200 && ((omp_requires_mask
1201 & ~(GOMP_REQUIRES_UNIFIED_ADDRESS
1202 | GOMP_REQUIRES_REVERSE_OFFLOAD)) != 0))
683f1184
TB
1203 return -1;
1204 return num_devices;
41dbbb37
TS
1205}
1206
6ce13072 1207bool
d93bdab5 1208GOMP_OFFLOAD_init_device (int n)
41dbbb37 1209{
6ce13072
CLT
1210 struct ptx_device *dev;
1211
d93bdab5 1212 pthread_mutex_lock (&ptx_dev_lock);
41dbbb37 1213
d93bdab5
JB
1214 if (!nvptx_init () || ptx_devices[n] != NULL)
1215 {
1216 pthread_mutex_unlock (&ptx_dev_lock);
6ce13072 1217 return false;
d93bdab5
JB
1218 }
1219
6ce13072
CLT
1220 dev = nvptx_open_device (n);
1221 if (dev)
1222 {
1223 ptx_devices[n] = dev;
1224 instantiated_devices++;
1225 }
d93bdab5 1226
30486fab
AS
1227 const char *var_name = "GOMP_NVPTX_LOWLAT_POOL";
1228 const char *env_var = secure_getenv (var_name);
1229 notify_var (var_name, env_var);
1230
1231 if (env_var != NULL)
1232 {
1233 char *endptr;
1234 unsigned long val = strtoul (env_var, &endptr, 10);
1235 if (endptr == NULL || *endptr != '\0'
1236 || errno == ERANGE || errno == EINVAL
1237 || val > UINT_MAX)
1238 GOMP_PLUGIN_error ("Error parsing %s", var_name);
1239 else
1240 lowlat_pool_size = val;
1241 }
1242
d93bdab5 1243 pthread_mutex_unlock (&ptx_dev_lock);
6ce13072
CLT
1244
1245 return dev != NULL;
41dbbb37
TS
1246}
1247
6ce13072 1248bool
d93bdab5 1249GOMP_OFFLOAD_fini_device (int n)
41dbbb37 1250{
d93bdab5
JB
1251 pthread_mutex_lock (&ptx_dev_lock);
1252
1253 if (ptx_devices[n] != NULL)
1254 {
6ce13072
CLT
1255 if (!nvptx_attach_host_thread_to_device (n)
1256 || !nvptx_close_device (ptx_devices[n]))
1257 {
1258 pthread_mutex_unlock (&ptx_dev_lock);
1259 return false;
1260 }
d93bdab5
JB
1261 ptx_devices[n] = NULL;
1262 instantiated_devices--;
1263 }
1264
738c56d4
TV
1265 if (instantiated_devices == 0)
1266 {
1267 free (ptx_devices);
1268 ptx_devices = NULL;
1269 }
1270
d93bdab5 1271 pthread_mutex_unlock (&ptx_dev_lock);
6ce13072 1272 return true;
41dbbb37
TS
1273}
1274
2a21ff19
NS
1275/* Return the libgomp version number we're compatible with. There is
1276 no requirement for cross-version compatibility. */
1277
1278unsigned
1279GOMP_OFFLOAD_version (void)
1280{
1281 return GOMP_VERSION;
1282}
1283
6103184e
AM
1284/* Initialize __nvptx_clocktick, if present in MODULE. */
1285
1286static void
1287nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
1288{
1289 CUdeviceptr dptr;
2393d337
JJ
1290 CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
1291 module, "__nvptx_clocktick");
6103184e
AM
1292 if (r == CUDA_ERROR_NOT_FOUND)
1293 return;
1294 if (r != CUDA_SUCCESS)
1295 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1296 double __nvptx_clocktick = 1e-3 / dev->clock_khz;
2393d337
JJ
1297 r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
1298 sizeof (__nvptx_clocktick));
6103184e
AM
1299 if (r != CUDA_SUCCESS)
1300 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
1301}
1302
f3e9a059 1303/* Load the (partial) program described by TARGET_DATA to device
0fcc0cf9
TB
1304 number ORD. Allocate and return TARGET_TABLE. If not NULL, REV_FN_TABLE
1305 will contain the on-device addresses of the functions for reverse offload.
1306 To be freed by the caller. */
f3e9a059 1307
41dbbb37 1308int
2a21ff19 1309GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
0fcc0cf9 1310 struct addr_pair **target_table,
a49c7d31
KCY
1311 uint64_t **rev_fn_table,
1312 uint64_t *host_ind_fn_table)
41dbbb37
TS
1313{
1314 CUmodule module;
3e32ee19
NS
1315 const char *const *var_names;
1316 const struct targ_fn_launch *fn_descs;
a49c7d31 1317 unsigned int fn_entries, var_entries, ind_fn_entries, other_entries, i, j;
41dbbb37 1318 struct targ_fn_descriptor *targ_fns;
f3e9a059 1319 struct addr_pair *targ_tbl;
afb2d80b 1320 const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
d93bdab5 1321 struct ptx_image_data *new_image;
f3e9a059 1322 struct ptx_device *dev;
41dbbb37 1323
2a21ff19 1324 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
6ce13072
CLT
1325 {
1326 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1327 " (expected %u, received %u)",
1328 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1329 return -1;
1330 }
41dbbb37 1331
6ce13072
CLT
1332 if (!nvptx_attach_host_thread_to_device (ord)
1333 || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
1334 return -1;
d93bdab5 1335
6ce13072 1336 dev = ptx_devices[ord];
41dbbb37 1337
a4cb876d
NS
1338 /* The mkoffload utility emits a struct of pointers/integers at the
1339 start of each offload image. The array of kernel names and the
1340 functions addresses form a one-to-one correspondence. */
41dbbb37 1341
a4cb876d
NS
1342 var_entries = img_header->var_num;
1343 var_names = img_header->var_names;
1344 fn_entries = img_header->fn_num;
3e32ee19 1345 fn_descs = img_header->fn_descs;
a49c7d31
KCY
1346 ind_fn_entries = GOMP_VERSION_SUPPORTS_INDIRECT_FUNCS (version)
1347 ? img_header->ind_fn_num : 0;
41dbbb37 1348
9f2fca56 1349 /* Currently, other_entries contains only the struct of ICVs. */
0bac793e
CLT
1350 other_entries = 1;
1351
f3e9a059 1352 targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
0bac793e 1353 * (fn_entries + var_entries + other_entries));
41dbbb37
TS
1354 targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
1355 * fn_entries);
1356
f3e9a059
NS
1357 *target_table = targ_tbl;
1358
1359 new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
1360 new_image->target_data = target_data;
1361 new_image->module = module;
1362 new_image->fns = targ_fns;
1363
1364 pthread_mutex_lock (&dev->image_lock);
1365 new_image->next = dev->images;
1366 dev->images = new_image;
1367 pthread_mutex_unlock (&dev->image_lock);
1368
1369 for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
41dbbb37
TS
1370 {
1371 CUfunction function;
6103184e 1372 int nregs, mthrs;
41dbbb37 1373
6ce13072
CLT
1374 CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
1375 fn_descs[i].fn);
6103184e
AM
1376 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
1377 CU_FUNC_ATTRIBUTE_NUM_REGS, function);
1378 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
1379 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);
41dbbb37 1380
f3e9a059 1381 targ_fns->fn = function;
3e32ee19 1382 targ_fns->launch = &fn_descs[i];
6103184e
AM
1383 targ_fns->regs_per_thread = nregs;
1384 targ_fns->max_threads_per_block = mthrs;
41dbbb37 1385
f3e9a059
NS
1386 targ_tbl->start = (uintptr_t) targ_fns;
1387 targ_tbl->end = targ_tbl->start + 1;
41dbbb37
TS
1388 }
1389
f3e9a059 1390 for (j = 0; j < var_entries; j++, targ_tbl++)
d93bdab5
JB
1391 {
1392 CUdeviceptr var;
1393 size_t bytes;
1394
6ce13072
CLT
1395 CUDA_CALL_ERET (-1, cuModuleGetGlobal,
1396 &var, &bytes, module, var_names[j]);
d93bdab5 1397
f3e9a059
NS
1398 targ_tbl->start = (uintptr_t) var;
1399 targ_tbl->end = targ_tbl->start + bytes;
d93bdab5
JB
1400 }
1401
a49c7d31
KCY
1402 if (ind_fn_entries > 0)
1403 {
1404 CUdeviceptr var;
1405 size_t bytes;
1406
1407 /* Read indirect function table from image. */
1408 CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &var, &bytes, module,
1409 "$offload_ind_func_table");
1410 if (r != CUDA_SUCCESS)
1411 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1412 assert (bytes == sizeof (uint64_t) * ind_fn_entries);
1413
1414 uint64_t ind_fn_table[ind_fn_entries];
1415 r = CUDA_CALL_NOCHECK (cuMemcpyDtoH, ind_fn_table, var, bytes);
1416 if (r != CUDA_SUCCESS)
1417 GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
1418
1419 /* Build host->target address map for indirect functions. */
1420 uint64_t ind_fn_map[ind_fn_entries * 2 + 1];
1421 for (unsigned k = 0; k < ind_fn_entries; k++)
1422 {
1423 ind_fn_map[k * 2] = host_ind_fn_table[k];
1424 ind_fn_map[k * 2 + 1] = ind_fn_table[k];
1425 GOMP_PLUGIN_debug (0, "Indirect function %d: %lx->%lx\n",
1426 k, host_ind_fn_table[k], ind_fn_table[k]);
1427 }
1428 ind_fn_map[ind_fn_entries * 2] = 0;
1429
1430 /* Write the map onto the target. */
1431 void *map_target_addr
1432 = GOMP_OFFLOAD_alloc (ord, sizeof (ind_fn_map));
1433 GOMP_PLUGIN_debug (0, "Allocated indirect map at %p\n", map_target_addr);
1434
1435 GOMP_OFFLOAD_host2dev (ord, map_target_addr,
1436 (void*) ind_fn_map,
1437 sizeof (ind_fn_map));
1438
1439 /* Write address of the map onto the target. */
1440 CUdeviceptr varptr;
1441 size_t varsize;
1442 r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &varptr, &varsize,
1443 module, XSTRING (GOMP_INDIRECT_ADDR_MAP));
1444 if (r != CUDA_SUCCESS)
1445 GOMP_PLUGIN_fatal ("Indirect map variable not found in image: %s",
1446 cuda_error (r));
1447
1448 GOMP_PLUGIN_debug (0,
1449 "Indirect map variable found at %llx with size %ld\n",
1450 varptr, varsize);
1451
1452 GOMP_OFFLOAD_host2dev (ord, (void *) varptr, &map_target_addr,
1453 sizeof (map_target_addr));
1454 }
1455
9f2fca56
MV
1456 CUdeviceptr varptr;
1457 size_t varsize;
1458 CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &varptr, &varsize,
1459 module, XSTRING (GOMP_ADDITIONAL_ICVS));
1460
0bac793e
CLT
1461 if (r == CUDA_SUCCESS)
1462 {
9f2fca56
MV
1463 targ_tbl->start = (uintptr_t) varptr;
1464 targ_tbl->end = (uintptr_t) (varptr + varsize);
0bac793e
CLT
1465 }
1466 else
9f2fca56 1467 /* The variable was not in this image. */
0bac793e 1468 targ_tbl->start = targ_tbl->end = 0;
0bac793e 1469
50be486d
TB
1470 if (rev_fn_table && fn_entries == 0)
1471 *rev_fn_table = NULL;
1472 else if (rev_fn_table)
1473 {
1474 CUdeviceptr var;
9f9d128f
TB
1475 size_t bytes;
1476 unsigned int i;
50be486d
TB
1477 r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &var, &bytes, module,
1478 "$offload_func_table");
1479 if (r != CUDA_SUCCESS)
1480 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1481 assert (bytes == sizeof (uint64_t) * fn_entries);
1482 *rev_fn_table = GOMP_PLUGIN_malloc (sizeof (uint64_t) * fn_entries);
1483 r = CUDA_CALL_NOCHECK (cuMemcpyDtoH, *rev_fn_table, var, bytes);
1484 if (r != CUDA_SUCCESS)
1485 GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
131d18e9
TB
1486 /* Free if only NULL entries. */
1487 for (i = 0; i < fn_entries; ++i)
1488 if ((*rev_fn_table)[i] != 0)
1489 break;
1490 if (i == fn_entries)
1491 {
1492 free (*rev_fn_table);
1493 *rev_fn_table = NULL;
1494 }
1495 }
1496
1497 if (rev_fn_table && *rev_fn_table && dev->rev_data == NULL)
1498 {
9f9d128f
TB
1499 /* Get the on-device GOMP_REV_OFFLOAD_VAR variable. It should be
1500 available but it might be not. One reason could be: if the user code
1501 has 'omp target device(ancestor:1)' in pure hostcode, GOMP_target_ext
1502 is not called on the device and, hence, it and GOMP_REV_OFFLOAD_VAR
1503 are not linked in. */
131d18e9
TB
1504 CUdeviceptr device_rev_offload_var;
1505 size_t device_rev_offload_size;
1506 CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal,
1507 &device_rev_offload_var,
1508 &device_rev_offload_size, module,
1509 XSTRING (GOMP_REV_OFFLOAD_VAR));
1510 if (r != CUDA_SUCCESS)
9f9d128f
TB
1511 {
1512 free (*rev_fn_table);
1513 *rev_fn_table = NULL;
1514 }
1515 else
1516 {
1517 /* cuMemHostAlloc memory is accessible on the device, if
1518 unified-shared address is supported; this is assumed - see comment
1519 in nvptx_open_device for CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING. */
1520 CUDA_CALL_ASSERT (cuMemHostAlloc, (void **) &dev->rev_data,
1521 sizeof (*dev->rev_data), CU_MEMHOSTALLOC_DEVICEMAP);
1522 CUdeviceptr dp = (CUdeviceptr) dev->rev_data;
1523 r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, device_rev_offload_var, &dp,
1524 sizeof (dp));
1525 if (r != CUDA_SUCCESS)
1526 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
1527 }
50be486d
TB
1528 }
1529
6103184e
AM
1530 nvptx_set_clocktick (module, dev);
1531
0bac793e 1532 return fn_entries + var_entries + other_entries;
d93bdab5
JB
1533}
1534
f3e9a059
NS
1535/* Unload the program described by TARGET_DATA. DEV_DATA is the
1536 function descriptors allocated by G_O_load_image. */
1537
6ce13072 1538bool
2a21ff19 1539GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
d93bdab5 1540{
f3e9a059
NS
1541 struct ptx_image_data *image, **prev_p;
1542 struct ptx_device *dev = ptx_devices[ord];
1543
2a21ff19 1544 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
6ce13072
CLT
1545 {
1546 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1547 " (expected %u, received %u)",
1548 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1549 return false;
1550 }
1551
1552 bool ret = true;
f3e9a059
NS
1553 pthread_mutex_lock (&dev->image_lock);
1554 for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
1555 if (image->target_data == target_data)
1556 {
1557 *prev_p = image->next;
2393d337 1558 if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
6ce13072 1559 ret = false;
f3e9a059
NS
1560 free (image->fns);
1561 free (image);
1562 break;
1563 }
1564 pthread_mutex_unlock (&dev->image_lock);
6ce13072 1565 return ret;
41dbbb37
TS
1566}
1567
1568void *
d93bdab5 1569GOMP_OFFLOAD_alloc (int ord, size_t size)
41dbbb37 1570{
6ce13072
CLT
1571 if (!nvptx_attach_host_thread_to_device (ord))
1572 return NULL;
41dbbb37 1573
1f4c5b9b
CLT
1574 struct ptx_device *ptx_dev = ptx_devices[ord];
1575 struct ptx_free_block *blocks, *tmp;
41dbbb37 1576
1f4c5b9b
CLT
1577 pthread_mutex_lock (&ptx_dev->free_blocks_lock);
1578 blocks = ptx_dev->free_blocks;
1579 ptx_dev->free_blocks = NULL;
1580 pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
41dbbb37 1581
6b577a17
JB
1582 nvptx_stacks_free (ptx_dev, false);
1583
1f4c5b9b
CLT
1584 while (blocks)
1585 {
1586 tmp = blocks->next;
1587 nvptx_free (blocks->ptr, ptx_dev);
1588 free (blocks);
1589 blocks = tmp;
1590 }
1591
6b577a17
JB
1592 void *d = nvptx_alloc (size, true);
1593 if (d)
1594 return d;
1595 else
1596 {
1597 /* Memory allocation failed. Try freeing the stacks block, and
1598 retrying. */
1599 nvptx_stacks_free (ptx_dev, true);
1600 return nvptx_alloc (size, false);
1601 }
41dbbb37
TS
1602}
1603
6103184e 1604bool
1f4c5b9b 1605GOMP_OFFLOAD_free (int ord, void *ptr)
6103184e 1606{
1f4c5b9b
CLT
1607 return (nvptx_attach_host_thread_to_device (ord)
1608 && nvptx_free (ptr, ptx_devices[ord]));
6103184e
AM
1609}
1610
41dbbb37 1611void
f8332e52
TS
1612GOMP_OFFLOAD_openacc_exec (void (*fn) (void *),
1613 size_t mapnum __attribute__((unused)),
199867d0
TS
1614 void **hostaddrs __attribute__((unused)),
1615 void **devaddrs,
1f4c5b9b 1616 unsigned *dims, void *targ_mem_desc)
41dbbb37 1617{
f8332e52 1618 GOMP_PLUGIN_debug (0, "nvptx %s\n", __FUNCTION__);
41dbbb37 1619
f8332e52
TS
1620 CUdeviceptr dp = (CUdeviceptr) devaddrs;
1621 nvptx_exec (fn, dims, targ_mem_desc, dp, NULL);
41dbbb37 1622
1f4c5b9b
CLT
1623 CUresult r = CUDA_CALL_NOCHECK (cuStreamSynchronize, NULL);
1624 const char *maybe_abort_msg = "(perhaps abort was called)";
1625 if (r == CUDA_ERROR_LAUNCH_FAILED)
1626 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
1627 maybe_abort_msg);
1628 else if (r != CUDA_SUCCESS)
1629 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
41dbbb37
TS
1630}
1631
1632void
f8332e52
TS
1633GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *),
1634 size_t mapnum __attribute__((unused)),
199867d0
TS
1635 void **hostaddrs __attribute__((unused)),
1636 void **devaddrs,
1f4c5b9b
CLT
1637 unsigned *dims, void *targ_mem_desc,
1638 struct goacc_asyncqueue *aq)
41dbbb37 1639{
f8332e52 1640 GOMP_PLUGIN_debug (0, "nvptx %s\n", __FUNCTION__);
1f4c5b9b 1641
f8332e52
TS
1642 CUdeviceptr dp = (CUdeviceptr) devaddrs;
1643 nvptx_exec (fn, dims, targ_mem_desc, dp, aq->cuda_stream);
41dbbb37
TS
1644}
1645
1646void *
d93bdab5 1647GOMP_OFFLOAD_openacc_create_thread_data (int ord)
41dbbb37 1648{
d93bdab5 1649 struct ptx_device *ptx_dev;
41dbbb37
TS
1650 struct nvptx_thread *nvthd
1651 = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
41dbbb37
TS
1652 CUcontext thd_ctx;
1653
d93bdab5
JB
1654 ptx_dev = ptx_devices[ord];
1655
1656 assert (ptx_dev);
1657
6ce13072 1658 CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);
41dbbb37
TS
1659
1660 assert (ptx_dev->ctx);
1661
1662 if (!thd_ctx)
6ce13072 1663 CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);
41dbbb37 1664
41dbbb37
TS
1665 nvthd->ptx_dev = ptx_dev;
1666
1667 return (void *) nvthd;
1668}
1669
1670void
1671GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
1672{
1673 free (data);
1674}

void *
GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
{
  return nvptx_get_current_cuda_device ();
}

void *
GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
{
  return nvptx_get_current_cuda_context ();
}

/* This returns a CUstream.  */
void *
GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *aq)
{
  return (void *) aq->cuda_stream;
}

/* This takes a CUstream.  */
int
GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream)
{
  if (aq->cuda_stream)
    {
      CUDA_CALL_ASSERT (cuStreamSynchronize, aq->cuda_stream);
      CUDA_CALL_ASSERT (cuStreamDestroy, aq->cuda_stream);
    }

  aq->cuda_stream = (CUstream) stream;
  return 1;
}
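
/* The four entry points above back the OpenACC/CUDA interoperability
   routines (acc_get_current_cuda_device, acc_get_current_cuda_context,
   acc_get_cuda_stream, acc_set_cuda_stream).  A minimal usage sketch from
   user code, assuming the CUDA driver API is available alongside the
   OpenACC program, might look like:

     #include <openacc.h>
     #include <cuda.h>

     CUstream s = (CUstream) acc_get_cuda_stream (1);
     // ... enqueue work on S with the CUDA driver API ...
     acc_set_cuda_stream (1, s);

   Note that GOMP_OFFLOAD_openacc_cuda_set_stream above synchronizes and
   destroys any stream previously associated with the queue before adopting
   the new one.  */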

static struct goacc_asyncqueue *
nvptx_goacc_asyncqueue_construct (unsigned int flags)
{
  CUstream stream = NULL;
  CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, flags);

  struct goacc_asyncqueue *aq
    = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue));
  aq->cuda_stream = stream;
  return aq;
}

struct goacc_asyncqueue *
GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
{
  return nvptx_goacc_asyncqueue_construct (CU_STREAM_DEFAULT);
}

static bool
nvptx_goacc_asyncqueue_destruct (struct goacc_asyncqueue *aq)
{
  CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream);
  free (aq);
  return true;
}

bool
GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
{
  return nvptx_goacc_asyncqueue_destruct (aq);
}

int
GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
{
  CUresult r = CUDA_CALL_NOCHECK (cuStreamQuery, aq->cuda_stream);
  if (r == CUDA_SUCCESS)
    return 1;
  if (r == CUDA_ERROR_NOT_READY)
    return 0;

  GOMP_PLUGIN_error ("cuStreamQuery error: %s", cuda_error (r));
  return -1;
}

static bool
nvptx_goacc_asyncqueue_synchronize (struct goacc_asyncqueue *aq)
{
  CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream);
  return true;
}

bool
GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
{
  return nvptx_goacc_asyncqueue_synchronize (aq);
}

bool
GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
				      struct goacc_asyncqueue *aq2)
{
  CUevent e;
  CUDA_CALL_ERET (false, cuEventCreate, &e, CU_EVENT_DISABLE_TIMING);
  CUDA_CALL_ERET (false, cuEventRecord, e, aq1->cuda_stream);
  CUDA_CALL_ERET (false, cuStreamWaitEvent, aq2->cuda_stream, e, 0);
  return true;
}

static void
cuda_callback_wrapper (CUstream stream, CUresult res, void *ptr)
{
  if (res != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("%s error: %s", __FUNCTION__, cuda_error (res));
  struct nvptx_callback *cb = (struct nvptx_callback *) ptr;
  cb->fn (cb->ptr);
  free (ptr);
}

void
GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
					   void (*callback_fn)(void *),
					   void *userptr)
{
  struct nvptx_callback *b = GOMP_PLUGIN_malloc (sizeof (*b));
  b->fn = callback_fn;
  b->ptr = userptr;
  b->aq = aq;
  CUDA_CALL_ASSERT (cuStreamAddCallback, aq->cuda_stream,
		    cuda_callback_wrapper, (void *) b, 0);
}
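
/* cuStreamAddCallback runs cuda_callback_wrapper on a CUDA driver thread
   once all work previously enqueued on the stream has completed; CUDA does
   not allow the callback itself to issue CUDA API calls.  A hedged usage
   sketch, with a hypothetical helper free_host_buffer, might be:

     static void
     free_host_buffer (void *p)
     {
       free (p);
     }

     // After queueing an asynchronous device-to-host copy of BUF on AQ:
     GOMP_OFFLOAD_openacc_async_queue_callback (aq, free_host_buffer, buf);

   so the host buffer is released only after the copy has drained.  */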

static bool
cuda_memcpy_sanity_check (const void *h, const void *d, size_t s)
{
  CUdeviceptr pb;
  size_t ps;
  if (!s)
    return true;
  if (!d)
    {
      GOMP_PLUGIN_error ("invalid device address");
      return false;
    }
  CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
  if (!pb)
    {
      GOMP_PLUGIN_error ("invalid device address");
      return false;
    }
  if (!h)
    {
      GOMP_PLUGIN_error ("invalid host address");
      return false;
    }
  if (d == h)
    {
      GOMP_PLUGIN_error ("invalid host or device address");
      return false;
    }
  if ((void *)(d + s) > (void *)(pb + ps))
    {
      GOMP_PLUGIN_error ("invalid size");
      return false;
    }
  return true;
}
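
/* Worked example for the bounds check above: if cuMemGetAddressRange
   reports that the allocation containing D starts at PB = 0x7000 with
   PS = 4096 bytes, then a copy with D = 0x7800 and S = 2048 is accepted
   (0x7800 + 2048 equals PB + PS exactly), whereas S = 2049 would run one
   byte past the allocation and is rejected as "invalid size".  The
   addresses here are illustrative only.  */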

bool
GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
{
  if (!nvptx_attach_host_thread_to_device (ord)
      || !cuda_memcpy_sanity_check (src, dst, n))
    return false;
  CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) dst, src, n);
  return true;
}

bool
GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
{
  if (!nvptx_attach_host_thread_to_device (ord)
      || !cuda_memcpy_sanity_check (dst, src, n))
    return false;
  CUDA_CALL (cuMemcpyDtoH, dst, (CUdeviceptr) src, n);
  return true;
}

bool
GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
{
  CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, NULL);
  return true;
}

int
GOMP_OFFLOAD_memcpy2d (int dst_ord, int src_ord, size_t dim1_size,
		       size_t dim0_len, void *dst, size_t dst_offset1_size,
		       size_t dst_offset0_len, size_t dst_dim1_size,
		       const void *src, size_t src_offset1_size,
		       size_t src_offset0_len, size_t src_dim1_size)
{
  if (!nvptx_attach_host_thread_to_device (src_ord != -1 ? src_ord : dst_ord))
    return false;

  /* TODO: Consider using CU_MEMORYTYPE_UNIFIED if supported.  */

  CUDA_MEMCPY2D data;

  memset (&data, 0, sizeof (data));
  data.WidthInBytes = dim1_size;
  data.Height = dim0_len;

  if (dst_ord == -1)
    {
      data.dstMemoryType = CU_MEMORYTYPE_HOST;
      data.dstHost = dst;
    }
  else
    {
      data.dstMemoryType = CU_MEMORYTYPE_DEVICE;
      data.dstDevice = (CUdeviceptr) dst;
    }
  data.dstPitch = dst_dim1_size;
  data.dstXInBytes = dst_offset1_size;
  data.dstY = dst_offset0_len;

  if (src_ord == -1)
    {
      data.srcMemoryType = CU_MEMORYTYPE_HOST;
      data.srcHost = src;
    }
  else
    {
      data.srcMemoryType = CU_MEMORYTYPE_DEVICE;
      data.srcDevice = (CUdeviceptr) src;
    }
  data.srcPitch = src_dim1_size;
  data.srcXInBytes = src_offset1_size;
  data.srcY = src_offset0_len;

  if (data.srcXInBytes != 0 || data.srcY != 0)
    {
      /* Adjust origin to the actual array data, else the CUDA 2D memory
	 copy API calls below may fail to validate source/dest pointers
	 correctly (especially for Fortran where the "virtual origin" of an
	 array is often outside the stored data).  */
      if (src_ord == -1)
	data.srcHost = (const void *) ((const char *) data.srcHost
				       + data.srcY * data.srcPitch
				       + data.srcXInBytes);
      else
	data.srcDevice += data.srcY * data.srcPitch + data.srcXInBytes;
      data.srcXInBytes = 0;
      data.srcY = 0;
    }

  if (data.dstXInBytes != 0 || data.dstY != 0)
    {
      /* As above.  */
      if (dst_ord == -1)
	data.dstHost = (void *) ((char *) data.dstHost
				 + data.dstY * data.dstPitch
				 + data.dstXInBytes);
      else
	data.dstDevice += data.dstY * data.dstPitch + data.dstXInBytes;
      data.dstXInBytes = 0;
      data.dstY = 0;
    }

  CUresult res = CUDA_CALL_NOCHECK (cuMemcpy2D, &data);
  if (res == CUDA_ERROR_INVALID_VALUE)
    /* If the pitch is larger than CU_DEVICE_ATTRIBUTE_MAX_PITCH, or for
       device-to-device copies of (some) memory not allocated by
       cuMemAllocPitch, cuMemcpy2D fails with an error; retry with the
       slower cuMemcpy2DUnaligned.  */
    CUDA_CALL (cuMemcpy2DUnaligned, &data);
  else if (res != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("cuMemcpy2D error: %s", cuda_error (res));
      return false;
    }
  return true;
}
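
/* Worked example for the parameter mapping above (values illustrative):
   copying a 3 x 4 element interior block of doubles out of a 10 x 16
   source array, starting at element [2][5], corresponds to

     dim1_size        = 4 * sizeof (double)   -- bytes per row copied
     dim0_len         = 3                     -- number of rows
     src_dim1_size    = 16 * sizeof (double)  -- source pitch (full row)
     src_offset1_size = 5 * sizeof (double)   -- byte offset within a row
     src_offset0_len  = 2                     -- row offset

   and likewise for the destination; the origin adjustment above then folds
   the offsets into the base address so that cuMemcpy2D only sees in-bounds
   pointers.  */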

int
GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size,
		       size_t dim1_len, size_t dim0_len, void *dst,
		       size_t dst_offset2_size, size_t dst_offset1_len,
		       size_t dst_offset0_len, size_t dst_dim2_size,
		       size_t dst_dim1_len, const void *src,
		       size_t src_offset2_size, size_t src_offset1_len,
		       size_t src_offset0_len, size_t src_dim2_size,
		       size_t src_dim1_len)
{
  if (!nvptx_attach_host_thread_to_device (src_ord != -1 ? src_ord : dst_ord))
    return false;

  /* TODO: Consider using CU_MEMORYTYPE_UNIFIED if supported.  */

  CUDA_MEMCPY3D data;

  memset (&data, 0, sizeof (data));
  data.WidthInBytes = dim2_size;
  data.Height = dim1_len;
  data.Depth = dim0_len;

  if (dst_ord == -1)
    {
      data.dstMemoryType = CU_MEMORYTYPE_HOST;
      data.dstHost = dst;
    }
  else
    {
      data.dstMemoryType = CU_MEMORYTYPE_DEVICE;
      data.dstDevice = (CUdeviceptr) dst;
    }
  data.dstPitch = dst_dim2_size;
  data.dstHeight = dst_dim1_len;
  data.dstXInBytes = dst_offset2_size;
  data.dstY = dst_offset1_len;
  data.dstZ = dst_offset0_len;

  if (src_ord == -1)
    {
      data.srcMemoryType = CU_MEMORYTYPE_HOST;
      data.srcHost = src;
    }
  else
    {
      data.srcMemoryType = CU_MEMORYTYPE_DEVICE;
      data.srcDevice = (CUdeviceptr) src;
    }
  data.srcPitch = src_dim2_size;
  data.srcHeight = src_dim1_len;
  data.srcXInBytes = src_offset2_size;
  data.srcY = src_offset1_len;
  data.srcZ = src_offset0_len;

  if (data.srcXInBytes != 0 || data.srcY != 0 || data.srcZ != 0)
    {
      /* Adjust origin to the actual array data, else the CUDA 3D memory
	 copy API call below may fail to validate source/dest pointers
	 correctly (especially for Fortran where the "virtual origin" of an
	 array is often outside the stored data).  */
      if (src_ord == -1)
	data.srcHost
	  = (const void *) ((const char *) data.srcHost
			    + (data.srcZ * data.srcHeight + data.srcY)
			      * data.srcPitch
			    + data.srcXInBytes);
      else
	data.srcDevice
	  += (data.srcZ * data.srcHeight + data.srcY) * data.srcPitch
	     + data.srcXInBytes;
      data.srcXInBytes = 0;
      data.srcY = 0;
      data.srcZ = 0;
    }

  if (data.dstXInBytes != 0 || data.dstY != 0 || data.dstZ != 0)
    {
      /* As above.  */
      if (dst_ord == -1)
	data.dstHost = (void *) ((char *) data.dstHost
				 + (data.dstZ * data.dstHeight + data.dstY)
				   * data.dstPitch
				 + data.dstXInBytes);
      else
	data.dstDevice
	  += (data.dstZ * data.dstHeight + data.dstY) * data.dstPitch
	     + data.dstXInBytes;
      data.dstXInBytes = 0;
      data.dstY = 0;
      data.dstZ = 0;
    }

  CUDA_CALL (cuMemcpy3D, &data);
  return true;
}

bool
GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src,
				     size_t n, struct goacc_asyncqueue *aq)
{
  if (!nvptx_attach_host_thread_to_device (ord)
      || !cuda_memcpy_sanity_check (src, dst, n))
    return false;
  CUDA_CALL (cuMemcpyHtoDAsync, (CUdeviceptr) dst, src, n, aq->cuda_stream);
  return true;
}

bool
GOMP_OFFLOAD_openacc_async_dev2host (int ord, void *dst, const void *src,
				     size_t n, struct goacc_asyncqueue *aq)
{
  if (!nvptx_attach_host_thread_to_device (ord)
      || !cuda_memcpy_sanity_check (dst, src, n))
    return false;
  CUDA_CALL (cuMemcpyDtoHAsync, dst, (CUdeviceptr) src, n, aq->cuda_stream);
  return true;
}

union goacc_property_value
GOMP_OFFLOAD_openacc_get_property (int n, enum goacc_property prop)
{
  union goacc_property_value propval = { .val = 0 };

  pthread_mutex_lock (&ptx_dev_lock);

  if (n >= nvptx_get_num_devices () || n < 0 || ptx_devices[n] == NULL)
    {
      pthread_mutex_unlock (&ptx_dev_lock);
      return propval;
    }

  struct ptx_device *ptx_dev = ptx_devices[n];
  switch (prop)
    {
    case GOACC_PROPERTY_MEMORY:
      {
	size_t total_mem;

	CUDA_CALL_ERET (propval, cuDeviceTotalMem, &total_mem, ptx_dev->dev);
	propval.val = total_mem;
      }
      break;
    case GOACC_PROPERTY_FREE_MEMORY:
      {
	size_t total_mem;
	size_t free_mem;
	CUdevice ctxdev;

	CUDA_CALL_ERET (propval, cuCtxGetDevice, &ctxdev);
	if (ptx_dev->dev == ctxdev)
	  CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
	else if (ptx_dev->ctx)
	  {
	    CUcontext old_ctx;

	    CUDA_CALL_ERET (propval, cuCtxPushCurrent, ptx_dev->ctx);
	    CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
	    CUDA_CALL_ASSERT (cuCtxPopCurrent, &old_ctx);
	  }
	else
	  {
	    CUcontext new_ctx;

	    CUDA_CALL_ERET (propval, cuCtxCreate, &new_ctx, CU_CTX_SCHED_AUTO,
			    ptx_dev->dev);
	    CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
	    CUDA_CALL_ASSERT (cuCtxDestroy, new_ctx);
	  }
	propval.val = free_mem;
      }
      break;
    case GOACC_PROPERTY_NAME:
      propval.ptr = ptx_dev->name;
      break;
    case GOACC_PROPERTY_VENDOR:
      propval.ptr = "Nvidia";
      break;
    case GOACC_PROPERTY_DRIVER:
      propval.ptr = cuda_driver_version_s;
      break;
    default:
      break;
    }

  pthread_mutex_unlock (&ptx_dev_lock);
  return propval;
}
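
/* This hook backs the OpenACC 2.6 acc_get_property interface.  A hedged
   usage sketch from user code might be:

     #include <openacc.h>

     size_t bytes = acc_get_property (0, acc_device_nvidia,
				      acc_property_memory);
     const char *name = acc_get_property_string (0, acc_device_nvidia,
						 acc_property_name);

   which for device 0 ends up in the GOACC_PROPERTY_MEMORY and
   GOACC_PROPERTY_NAME cases above.  */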

/* Adjust launch dimensions: pick good values for the number of blocks and
   warps, and ensure that the number of warps does not exceed CUDA limits or
   GCC's own limits.  */

static void
nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
			    struct ptx_device *ptx_dev,
			    int *teams_p, int *threads_p)
{
  int max_warps_block = fn->max_threads_per_block / 32;
  /* A maximum of 32 warps per block is an implementation limit in the NVPTX
     backend and libgcc, which matches the documented limit of all GPUs as of
     2015.  */
  if (max_warps_block > 32)
    max_warps_block = 32;
  if (*threads_p <= 0)
    *threads_p = 8;
  if (*threads_p > max_warps_block)
    *threads_p = max_warps_block;

  int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
  /* This is an estimate of how many blocks the device can host
     simultaneously.  The actual limit, which may be lower, can be queried
     with the "occupancy control" driver interface (since CUDA 6.0).  */
  int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
  if (*teams_p <= 0 || *teams_p > max_blocks)
    *teams_p = max_blocks;
}
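
/* A sketch of the "occupancy control" alternative mentioned above
   (cuOccupancyMaxActiveBlocksPerMultiprocessor, available in the driver API
   since CUDA 6.5).  This is not what the code above does, only an
   illustration of how the register-based estimate could be replaced:

     int blocks_per_sm = 0;
     CUresult r
       = CUDA_CALL_NOCHECK (cuOccupancyMaxActiveBlocksPerMultiprocessor,
			    &blocks_per_sm, fn->fn, 32 * *threads_p, 0);
     if (r == CUDA_SUCCESS && blocks_per_sm > 0)
       max_blocks = blocks_per_sm * ptx_dev->num_sms;

   where 32 * *threads_p is the block size in threads and the final 0 is the
   dynamic shared-memory size.  */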

/* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
   target regions.  */

static size_t
nvptx_stacks_size ()
{
  return 128 * 1024;
}
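
/* For example, a launch shaped by nvptx_adjust_launch_bounds as 40 teams of
   8 warps each needs 40 * 8 * 128 KiB = 40 MiB of soft-stack storage from
   nvptx_stacks_acquire below; the numbers are purely illustrative.  */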

/* Return contiguous storage for NUM stacks, each SIZE bytes.  The lock for
   the storage should be held on entry, and remains held on exit.  */

static void *
nvptx_stacks_acquire (struct ptx_device *ptx_dev, size_t size, int num)
{
  if (ptx_dev->omp_stacks.ptr && ptx_dev->omp_stacks.size >= size * num)
    return (void *) ptx_dev->omp_stacks.ptr;

  /* Free the old, too-small stacks.  */
  if (ptx_dev->omp_stacks.ptr)
    {
      CUresult r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s\n", cuda_error (r));
      r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
    }

  /* Make new and bigger stacks, and remember where we put them and how big
     they are.  */
  CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &ptx_dev->omp_stacks.ptr,
				  size * num);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));

  ptx_dev->omp_stacks.size = size * num;

  return (void *) ptx_dev->omp_stacks.ptr;
}


void
GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
{
  struct targ_fn_descriptor *tgt_fn_desc
    = (struct targ_fn_descriptor *) tgt_fn;
  CUfunction function = tgt_fn_desc->fn;
  const struct targ_fn_launch *launch = tgt_fn_desc->launch;
  const char *fn_name = launch->fn;
  CUresult r;
  struct ptx_device *ptx_dev = ptx_devices[ord];
  const char *maybe_abort_msg = "(perhaps abort was called)";
  int teams = 0, threads = 0;

  if (!args)
    GOMP_PLUGIN_fatal ("No target arguments provided");
  while (*args)
    {
      intptr_t id = (intptr_t) *args++, val;
      if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
	val = (intptr_t) *args++;
      else
	val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
      if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
	continue;
      val = val > INT_MAX ? INT_MAX : val;
      id &= GOMP_TARGET_ARG_ID_MASK;
      if (id == GOMP_TARGET_ARG_NUM_TEAMS)
	teams = val;
      else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
	threads = val;
    }
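
  /* Each element of ARGS packs a device selector and an argument ID into an
     intptr_t, with the value either inline in the high bits (above
     GOMP_TARGET_ARG_VALUE_SHIFT) or, when GOMP_TARGET_ARG_SUBSEQUENT_PARAM
     is set, in the following element.  For instance, libgomp passes the
     num_teams clause as an element whose ID is GOMP_TARGET_ARG_NUM_TEAMS;
     the loop above extracts its value into TEAMS, and a value of 0 lets the
     adjustment below choose the block count.  */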
  nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);

  bool reverse_offload = ptx_dev->rev_data != NULL;
  struct goacc_asyncqueue *reverse_offload_aq = NULL;
  if (reverse_offload)
    {
      reverse_offload_aq
	= nvptx_goacc_asyncqueue_construct (CU_STREAM_NON_BLOCKING);
      if (!reverse_offload_aq)
	exit (EXIT_FAILURE);
    }

  size_t stack_size = nvptx_stacks_size ();

  pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
  void *stacks = nvptx_stacks_acquire (ptx_dev, stack_size, teams * threads);
  void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
  size_t fn_args_size = sizeof fn_args;
  void *config[] = {
    CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
    CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
    CU_LAUNCH_PARAM_END
  };
  GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
		     " [(teams: %u), 1, 1] [(lanes: 32), (threads: %u), 1]\n",
		     __FUNCTION__, fn_name, teams, threads);
  r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
			 32, threads, 1, lowlat_pool_size, NULL, NULL, config);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
  if (reverse_offload)
    while (true)
      {
	r = CUDA_CALL_NOCHECK (cuStreamQuery, NULL);
	if (r == CUDA_SUCCESS)
	  break;
	if (r == CUDA_ERROR_LAUNCH_FAILED)
	  GOMP_PLUGIN_fatal ("cuStreamQuery error: %s %s\n", cuda_error (r),
			     maybe_abort_msg);
	else if (r != CUDA_ERROR_NOT_READY)
	  GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));

	if (__atomic_load_n (&ptx_dev->rev_data->fn, __ATOMIC_ACQUIRE) != 0)
	  {
	    struct rev_offload *rev_data = ptx_dev->rev_data;
	    GOMP_PLUGIN_target_rev (rev_data->fn, rev_data->mapnum,
				    rev_data->addrs, rev_data->sizes,
				    rev_data->kinds, rev_data->dev_num,
				    reverse_offload_aq);
	    if (!nvptx_goacc_asyncqueue_synchronize (reverse_offload_aq))
	      exit (EXIT_FAILURE);
	    __atomic_store_n (&rev_data->fn, 0, __ATOMIC_RELEASE);
	  }
	usleep (1);
      }
  else
    r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
		       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));

  pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);

  if (reverse_offload)
    {
      if (!nvptx_goacc_asyncqueue_destruct (reverse_offload_aq))
	exit (EXIT_FAILURE);
    }
}

/* TODO: Implement GOMP_OFFLOAD_async_run.  */