]> git.ipfire.org Git - thirdparty/gcc.git/blame - libgomp/plugin/plugin-nvptx.c
GCN, nvptx: Errors during device probing are fatal
[thirdparty/gcc.git] / libgomp / plugin / plugin-nvptx.c
CommitLineData
41dbbb37
TS
1/* Plugin for NVPTX execution.
2
a945c346 3 Copyright (C) 2013-2024 Free Software Foundation, Inc.
41dbbb37
TS
4
5 Contributed by Mentor Embedded.
6
7 This file is part of the GNU Offloading and Multi Processing Library
8 (libgomp).
9
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
13 any later version.
14
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
18 more details.
19
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
23
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
28
29/* Nvidia PTX-specific parts of OpenACC support. The cuda driver
30 library appears to hold some implicit state, but the documentation
31 is not clear as to what that state might be. Or how one might
32 propagate it from one thread to another. */
33
df36a3d3 34#define _GNU_SOURCE
41dbbb37
TS
35#include "openacc.h"
36#include "config.h"
0bac793e 37#include "symcat.h"
41dbbb37 38#include "libgomp-plugin.h"
41dbbb37 39#include "oacc-plugin.h"
2a21ff19 40#include "gomp-constants.h"
5fae049d 41#include "oacc-int.h"
41dbbb37 42
131d18e9
TB
43/* For struct rev_offload + GOMP_REV_OFFLOAD_VAR. */
44#include "config/nvptx/libgomp-nvptx.h"
45
41dbbb37 46#include <pthread.h>
cd644ce8 47#ifndef PLUGIN_NVPTX_INCLUDE_SYSTEM_CUDA_H
5e431ae4
TS
48# include "cuda/cuda.h"
49#else
50# include <cuda.h>
51#endif
41dbbb37 52#include <stdbool.h>
6103184e 53#include <limits.h>
41dbbb37
TS
54#include <string.h>
55#include <stdio.h>
41dbbb37
TS
56#include <unistd.h>
57#include <assert.h>
6668eb45 58#include <errno.h>
130c2f3c 59#include <stdlib.h>
41dbbb37 60
6b577a17
JB
61/* An arbitrary fixed limit (128MB) for the size of the OpenMP soft stacks
62 block to cache between kernel invocations. For soft-stacks blocks bigger
63 than this, we will free the block before attempting another GPU memory
64 allocation (i.e. in GOMP_OFFLOAD_alloc). Otherwise, if an allocation fails,
65 we will free the cached soft-stacks block anyway then retry the
66 allocation. If that fails too, we lose. */
67
68#define SOFTSTACK_CACHE_LIMIT 134217728
69
94767dac
TV
70#if CUDA_VERSION < 6000
71extern CUresult cuGetErrorString (CUresult, const char **);
b113af95 72#define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
94767dac
TV
73#endif
74
8e09a12f
TV
75#if CUDA_VERSION >= 6050
76#undef cuLinkCreate
77#undef cuLinkAddData
78CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
79 const char *, unsigned, CUjit_option *, void **);
80CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
81#else
bd9b3d3d 82typedef size_t (*CUoccupancyB2DSize)(int);
8e09a12f
TV
83CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
84 const char *, unsigned, CUjit_option *, void **);
85CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
bd9b3d3d
CP
86CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
87 CUoccupancyB2DSize, size_t, int);
8e09a12f
TV
88#endif
89
02150de8
TV
90#define DO_PRAGMA(x) _Pragma (#x)
91
cd644ce8 92#ifndef PLUGIN_NVPTX_LINK_LIBCUDA
2393d337
JJ
93# include <dlfcn.h>
94
2393d337 95struct cuda_lib_s {
9e28b107
TV
96
97# define CUDA_ONE_CALL(call) \
98 __typeof (call) *call;
02150de8
TV
99# define CUDA_ONE_CALL_MAYBE_NULL(call) \
100 CUDA_ONE_CALL (call)
8c6310a2 101#include "cuda-lib.def"
9e28b107 102# undef CUDA_ONE_CALL
02150de8 103# undef CUDA_ONE_CALL_MAYBE_NULL
9e28b107 104
2393d337
JJ
105} cuda_lib;
106
107/* -1 if init_cuda_lib has not been called yet, false
108 if it has been and failed, true if it has been and succeeded. */
19929ba9 109static signed char cuda_lib_inited = -1;
2393d337
JJ
110
111/* Dynamically load the CUDA runtime library and initialize function
112 pointers, return false if unsuccessful, true if successful. */
113static bool
114init_cuda_lib (void)
115{
116 if (cuda_lib_inited != -1)
117 return cuda_lib_inited;
118 const char *cuda_runtime_lib = "libcuda.so.1";
119 void *h = dlopen (cuda_runtime_lib, RTLD_LAZY);
120 cuda_lib_inited = false;
121 if (h == NULL)
122 return false;
9e28b107 123
02150de8
TV
124# define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false)
125# define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true)
126# define CUDA_ONE_CALL_1(call, allow_null) \
2393d337 127 cuda_lib.call = dlsym (h, #call); \
02150de8 128 if (!allow_null && cuda_lib.call == NULL) \
ab70addf 129 GOMP_PLUGIN_fatal ("'%s' is missing '%s'", cuda_runtime_lib, #call);
8c6310a2 130#include "cuda-lib.def"
9e28b107
TV
131# undef CUDA_ONE_CALL
132# undef CUDA_ONE_CALL_1
02150de8 133# undef CUDA_ONE_CALL_MAYBE_NULL
9e28b107 134
2393d337
JJ
135 cuda_lib_inited = true;
136 return true;
41dbbb37 137}
2393d337
JJ
138# define CUDA_CALL_PREFIX cuda_lib.
139#else
02150de8
TV
140
141# define CUDA_ONE_CALL(call)
142# define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call)
143#include "cuda-lib.def"
144#undef CUDA_ONE_CALL_MAYBE_NULL
145#undef CUDA_ONE_CALL
146
2393d337
JJ
147# define CUDA_CALL_PREFIX
148# define init_cuda_lib() true
149#endif
41dbbb37 150
df36a3d3
TV
151#include "secure_getenv.h"
152
4cdfee3f
TV
153#undef MIN
154#undef MAX
155#define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
156#define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
157
6ce13072 158/* Convenience macros for the frequently used CUDA library call and
2393d337
JJ
159 error handling sequence as well as CUDA library calls that
160 do the error checking themselves or don't do it at all. */
6ce13072
CLT
161
162#define CUDA_CALL_ERET(ERET, FN, ...) \
163 do { \
2393d337
JJ
164 unsigned __r \
165 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
6ce13072
CLT
166 if (__r != CUDA_SUCCESS) \
167 { \
168 GOMP_PLUGIN_error (#FN " error: %s", \
169 cuda_error (__r)); \
170 return ERET; \
171 } \
172 } while (0)
173
174#define CUDA_CALL(FN, ...) \
2393d337 175 CUDA_CALL_ERET (false, FN, __VA_ARGS__)
6ce13072
CLT
176
177#define CUDA_CALL_ASSERT(FN, ...) \
178 do { \
2393d337
JJ
179 unsigned __r \
180 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
6ce13072
CLT
181 if (__r != CUDA_SUCCESS) \
182 { \
183 GOMP_PLUGIN_fatal (#FN " error: %s", \
184 cuda_error (__r)); \
185 } \
186 } while (0)
187
2393d337
JJ
188#define CUDA_CALL_NOCHECK(FN, ...) \
189 CUDA_CALL_PREFIX FN (__VA_ARGS__)
190
02150de8
TV
191#define CUDA_CALL_EXISTS(FN) \
192 CUDA_CALL_PREFIX FN
193
2393d337
JJ
194static const char *
195cuda_error (CUresult r)
196{
cedd9bd0 197 const char *fallback = "unknown cuda error";
2393d337
JJ
198 const char *desc;
199
cedd9bd0
TV
200 if (!CUDA_CALL_EXISTS (cuGetErrorString))
201 return fallback;
202
2393d337 203 r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
cedd9bd0
TV
204 if (r == CUDA_SUCCESS)
205 return desc;
2393d337 206
cedd9bd0 207 return fallback;
2393d337
JJ
208}
209
6c84c8bf
MR
210/* Version of the CUDA Toolkit in the same MAJOR.MINOR format that is used by
211 Nvidia, such as in the 'deviceQuery' program (Nvidia's CUDA samples). */
212static char cuda_driver_version_s[30];
213
d93bdab5
JB
214static unsigned int instantiated_devices = 0;
215static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
41dbbb37 216
1f4c5b9b
CLT
217/* NVPTX/CUDA specific definition of asynchronous queues. */
218struct goacc_asyncqueue
2049befd 219{
1f4c5b9b 220 CUstream cuda_stream;
2049befd
CP
221};
222
1f4c5b9b 223struct nvptx_callback
41dbbb37 224{
1f4c5b9b
CLT
225 void (*fn) (void *);
226 void *ptr;
227 struct goacc_asyncqueue *aq;
228 struct nvptx_callback *next;
41dbbb37
TS
229};
230
231/* Thread-specific data for PTX. */
232
233struct nvptx_thread
234{
1f4c5b9b
CLT
235 /* We currently have this embedded inside the plugin because libgomp manages
236 devices through integer target_ids. This might be better if using an
237 opaque target-specific pointer directly from gomp_device_descr. */
41dbbb37
TS
238 struct ptx_device *ptx_dev;
239};
240
3e32ee19
NS
241/* Target data function launch information. */
242
243struct targ_fn_launch
244{
245 const char *fn;
cc3cd79b 246 unsigned short dim[GOMP_DIM_MAX];
3e32ee19
NS
247};
248
cc3cd79b
NS
249/* Target PTX object information. */
250
251struct targ_ptx_obj
252{
253 const char *code;
254 size_t size;
255};
256
257/* Target data image information. */
258
259typedef struct nvptx_tdata
260{
261 const struct targ_ptx_obj *ptx_objs;
262 unsigned ptx_num;
263
264 const char *const *var_names;
265 unsigned var_num;
266
267 const struct targ_fn_launch *fn_descs;
268 unsigned fn_num;
a49c7d31
KCY
269
270 unsigned ind_fn_num;
cc3cd79b
NS
271} nvptx_tdata_t;
272
f3e9a059
NS
273/* Descriptor of a loaded function. */
274
275struct targ_fn_descriptor
276{
277 CUfunction fn;
3e32ee19 278 const struct targ_fn_launch *launch;
6103184e
AM
279 int regs_per_thread;
280 int max_threads_per_block;
f3e9a059
NS
281};
282
283/* A loaded PTX image. */
284struct ptx_image_data
285{
286 const void *target_data;
287 CUmodule module;
288
289 struct targ_fn_descriptor *fns; /* Array of functions. */
290
291 struct ptx_image_data *next;
292};
293
1f4c5b9b
CLT
294struct ptx_free_block
295{
296 void *ptr;
297 struct ptx_free_block *next;
298};
299
41dbbb37
TS
300struct ptx_device
301{
302 CUcontext ctx;
303 bool ctx_shared;
304 CUdevice dev;
1f4c5b9b 305
41dbbb37
TS
306 int ord;
307 bool overlap;
308 bool map;
309 bool concur;
41dbbb37 310 bool mkern;
6c84c8bf 311 int mode;
6103184e
AM
312 int clock_khz;
313 int num_sms;
314 int regs_per_block;
315 int regs_per_sm;
0c6c2f5f
CP
316 int warp_size;
317 int max_threads_per_block;
318 int max_threads_per_multiprocessor;
0b210c43 319 int default_dims[GOMP_DIM_MAX];
41dbbb37 320
6c84c8bf
MR
321 /* Length as used by the CUDA Runtime API ('struct cudaDeviceProp'). */
322 char name[256];
323
f3e9a059
NS
324 struct ptx_image_data *images; /* Images loaded on device. */
325 pthread_mutex_t image_lock; /* Lock for above list. */
41dbbb37 326
1f4c5b9b
CLT
327 struct ptx_free_block *free_blocks;
328 pthread_mutex_t free_blocks_lock;
41dbbb37 329
6b577a17
JB
330 /* OpenMP stacks, cached between kernel invocations. */
331 struct
332 {
333 CUdeviceptr ptr;
334 size_t size;
335 pthread_mutex_t lock;
336 } omp_stacks;
337
131d18e9 338 struct rev_offload *rev_data;
1f4c5b9b 339 struct ptx_device *next;
41dbbb37
TS
340};
341
d93bdab5
JB
342static struct ptx_device **ptx_devices;
343
30486fab
AS
344/* OpenMP kernels reserve a small amount of ".shared" space for use by
345 omp_alloc. The size is configured using GOMP_NVPTX_LOWLAT_POOL, but the
346 default is set here. */
347static unsigned lowlat_pool_size = 8 * 1024;
348
41dbbb37
TS
349static inline struct nvptx_thread *
350nvptx_thread (void)
351{
352 return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
353}
354
d93bdab5
JB
355/* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK
356 should be locked on entry and remains locked on exit. */
f3e9a059 357
d93bdab5 358static bool
41dbbb37
TS
359nvptx_init (void)
360{
d93bdab5 361 int ndevs;
41dbbb37 362
d93bdab5
JB
363 if (instantiated_devices != 0)
364 return true;
41dbbb37 365
2393d337
JJ
366 if (!init_cuda_lib ())
367 return false;
368
369 CUDA_CALL (cuInit, 0);
370
6c84c8bf
MR
371 int cuda_driver_version;
372 CUDA_CALL_ERET (NULL, cuDriverGetVersion, &cuda_driver_version);
373 snprintf (cuda_driver_version_s, sizeof cuda_driver_version_s,
374 "CUDA Driver %u.%u",
375 cuda_driver_version / 1000, cuda_driver_version % 1000 / 10);
376
6ce13072 377 CUDA_CALL (cuDeviceGetCount, &ndevs);
d93bdab5
JB
378 ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
379 * ndevs);
6c84c8bf 380
d93bdab5 381 return true;
41dbbb37
TS
382}
383
d93bdab5
JB
384/* Select the N'th PTX device for the current host thread. The device must
385 have been previously opened before calling this function. */
386
6ce13072 387static bool
d93bdab5 388nvptx_attach_host_thread_to_device (int n)
41dbbb37 389{
d93bdab5
JB
390 CUdevice dev;
391 CUresult r;
392 struct ptx_device *ptx_dev;
393 CUcontext thd_ctx;
394
2393d337 395 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
1f4c5b9b
CLT
396 if (r == CUDA_ERROR_NOT_PERMITTED)
397 {
398 /* Assume we're in a CUDA callback, just return true. */
399 return true;
400 }
d93bdab5 401 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
6ce13072
CLT
402 {
403 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
404 return false;
405 }
d93bdab5
JB
406
407 if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
6ce13072 408 return true;
d93bdab5
JB
409 else
410 {
411 CUcontext old_ctx;
412
413 ptx_dev = ptx_devices[n];
6ce13072
CLT
414 if (!ptx_dev)
415 {
416 GOMP_PLUGIN_error ("device %d not found", n);
417 return false;
418 }
d93bdab5 419
6ce13072 420 CUDA_CALL (cuCtxGetCurrent, &thd_ctx);
d93bdab5
JB
421
422 /* We don't necessarily have a current context (e.g. if it has been
423 destroyed. Pop it if we do though. */
424 if (thd_ctx != NULL)
6ce13072 425 CUDA_CALL (cuCtxPopCurrent, &old_ctx);
d93bdab5 426
6ce13072 427 CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
d93bdab5 428 }
6ce13072 429 return true;
41dbbb37
TS
430}
431
d93bdab5 432static struct ptx_device *
41dbbb37
TS
433nvptx_open_device (int n)
434{
435 struct ptx_device *ptx_dev;
d93bdab5 436 CUdevice dev, ctx_dev;
41dbbb37 437 CUresult r;
131d18e9 438 int pi;
41dbbb37 439
6ce13072 440 CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);
41dbbb37
TS
441
442 ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));
443
444 ptx_dev->ord = n;
445 ptx_dev->dev = dev;
446 ptx_dev->ctx_shared = false;
447
2393d337 448 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
d93bdab5 449 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
6ce13072
CLT
450 {
451 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
452 return NULL;
453 }
d93bdab5
JB
454
455 if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
456 {
457 /* The current host thread has an active context for a different device.
458 Detach it. */
459 CUcontext old_ctx;
6ce13072 460 CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
d93bdab5
JB
461 }
462
6ce13072 463 CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);
41dbbb37
TS
464
465 if (!ptx_dev->ctx)
6ce13072 466 CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
41dbbb37
TS
467 else
468 ptx_dev->ctx_shared = true;
469
6ce13072
CLT
470 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
471 &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
41dbbb37
TS
472 ptx_dev->overlap = pi;
473
6ce13072
CLT
474 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
475 &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
41dbbb37
TS
476 ptx_dev->map = pi;
477
6ce13072
CLT
478 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
479 &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
41dbbb37
TS
480 ptx_dev->concur = pi;
481
6ce13072
CLT
482 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
483 &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
41dbbb37
TS
484 ptx_dev->mode = pi;
485
6ce13072
CLT
486 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
487 &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
41dbbb37
TS
488 ptx_dev->mkern = pi;
489
6103184e
AM
490 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
491 &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
492 ptx_dev->clock_khz = pi;
493
2393d337 494 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
6103184e
AM
495 &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
496 ptx_dev->num_sms = pi;
497
498 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
499 &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
500 ptx_dev->regs_per_block = pi;
501
b113af95 502 /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only
6103184e 503 in CUDA 6.0 and newer. */
b113af95
TV
504 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
505 CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
506 dev);
6103184e
AM
507 /* Fallback: use limit of registers per block, which is usually equal. */
508 if (r == CUDA_ERROR_INVALID_VALUE)
509 pi = ptx_dev->regs_per_block;
510 else if (r != CUDA_SUCCESS)
511 {
512 GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
513 return NULL;
514 }
515 ptx_dev->regs_per_sm = pi;
516
517 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
518 &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
519 if (pi != 32)
520 {
521 GOMP_PLUGIN_error ("Only warp size 32 is supported");
522 return NULL;
523 }
0c6c2f5f
CP
524 ptx_dev->warp_size = pi;
525
526 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
527 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
528 ptx_dev->max_threads_per_block = pi;
529
530 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
531 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
532 ptx_dev->max_threads_per_multiprocessor = pi;
6103184e 533
131d18e9
TB
534 /* Required below for reverse offload as implemented, but with compute
535 capability >= 2.0 and 64bit device processes, this should be universally be
536 the case; hence, an assert. */
537 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
538 CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev);
539 assert (r == CUDA_SUCCESS && pi);
41dbbb37 540
0b210c43
TV
541 for (int i = 0; i != GOMP_DIM_MAX; i++)
542 ptx_dev->default_dims[i] = 0;
543
6c84c8bf
MR
544 CUDA_CALL_ERET (NULL, cuDeviceGetName, ptx_dev->name, sizeof ptx_dev->name,
545 dev);
546
f3e9a059
NS
547 ptx_dev->images = NULL;
548 pthread_mutex_init (&ptx_dev->image_lock, NULL);
549
1f4c5b9b
CLT
550 ptx_dev->free_blocks = NULL;
551 pthread_mutex_init (&ptx_dev->free_blocks_lock, NULL);
41dbbb37 552
6b577a17
JB
553 ptx_dev->omp_stacks.ptr = 0;
554 ptx_dev->omp_stacks.size = 0;
555 pthread_mutex_init (&ptx_dev->omp_stacks.lock, NULL);
556
20553883
TS
557 ptx_dev->rev_data = NULL;
558
d93bdab5 559 return ptx_dev;
41dbbb37
TS
560}
561
6ce13072 562static bool
d93bdab5 563nvptx_close_device (struct ptx_device *ptx_dev)
41dbbb37 564{
41dbbb37 565 if (!ptx_dev)
6ce13072 566 return true;
41dbbb37 567
1f4c5b9b
CLT
568 for (struct ptx_free_block *b = ptx_dev->free_blocks; b;)
569 {
570 struct ptx_free_block *b_next = b->next;
571 CUDA_CALL (cuMemFree, (CUdeviceptr) b->ptr);
572 free (b);
573 b = b_next;
574 }
575
576 pthread_mutex_destroy (&ptx_dev->free_blocks_lock);
f3e9a059 577 pthread_mutex_destroy (&ptx_dev->image_lock);
41dbbb37 578
6b577a17
JB
579 pthread_mutex_destroy (&ptx_dev->omp_stacks.lock);
580
581 if (ptx_dev->omp_stacks.ptr)
582 CUDA_CALL (cuMemFree, ptx_dev->omp_stacks.ptr);
583
41dbbb37 584 if (!ptx_dev->ctx_shared)
6ce13072 585 CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
41dbbb37
TS
586
587 free (ptx_dev);
6ce13072 588 return true;
41dbbb37
TS
589}
590
591static int
592nvptx_get_num_devices (void)
593{
594 int n;
41dbbb37
TS
595
596 /* This function will be called before the plugin has been initialized in
597 order to enumerate available devices, but CUDA API routines can't be used
598 until cuInit has been called. Just call it now (but don't yet do any
599 further initialization). */
d93bdab5 600 if (instantiated_devices == 0)
c8319826 601 {
2393d337
JJ
602 if (!init_cuda_lib ())
603 return 0;
604 CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
c8319826
JB
605 /* This is not an error: e.g. we may have CUDA libraries installed but
606 no devices available. */
a02d7f0e 607 if (r == CUDA_ERROR_NO_DEVICE)
78672bd8
TS
608 {
609 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
610 cuda_error (r));
611 return 0;
612 }
a02d7f0e
TS
613 else if (r != CUDA_SUCCESS)
614 GOMP_PLUGIN_fatal ("cuInit error: %s", cuda_error (r));
c8319826 615 }
41dbbb37 616
37078f24 617 CUDA_CALL_ASSERT (cuDeviceGetCount, &n);
41dbbb37
TS
618 return n;
619}
620
dfb15f6b
TV
621static void
622notify_var (const char *var_name, const char *env_var)
623{
624 if (env_var == NULL)
625 GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name);
626 else
627 GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var);
628}
41dbbb37 629
df36a3d3
TV
630static void
631process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o)
632{
633 const char *var_name = "GOMP_NVPTX_JIT";
634 const char *env_var = secure_getenv (var_name);
635 notify_var (var_name, env_var);
636
637 if (env_var == NULL)
638 return;
639
640 const char *c = env_var;
641 while (*c != '\0')
642 {
643 while (*c == ' ')
644 c++;
645
646 if (c[0] == '-' && c[1] == 'O'
647 && '0' <= c[2] && c[2] <= '4'
648 && (c[3] == '\0' || c[3] == ' '))
649 {
650 *gomp_nvptx_o = c[2] - '0';
651 c += 3;
652 continue;
653 }
654
655 GOMP_PLUGIN_error ("Error parsing %s", var_name);
656 break;
657 }
658}
659
6ce13072 660static bool
cc3cd79b
NS
661link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
662 unsigned num_objs)
41dbbb37 663{
df36a3d3
TV
664 CUjit_option opts[7];
665 void *optvals[7];
41dbbb37 666 float elapsed = 0.0;
6103184e
AM
667 char elog[1024];
668 char ilog[16384];
41dbbb37
TS
669 CUlinkState linkstate;
670 CUresult r;
671 void *linkout;
672 size_t linkoutsize __attribute__ ((unused));
673
41dbbb37
TS
674 opts[0] = CU_JIT_WALL_TIME;
675 optvals[0] = &elapsed;
676
677 opts[1] = CU_JIT_INFO_LOG_BUFFER;
678 optvals[1] = &ilog[0];
679
680 opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
6103184e 681 optvals[2] = (void *) sizeof ilog;
41dbbb37
TS
682
683 opts[3] = CU_JIT_ERROR_LOG_BUFFER;
684 optvals[3] = &elog[0];
685
686 opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
6103184e 687 optvals[4] = (void *) sizeof elog;
41dbbb37
TS
688
689 opts[5] = CU_JIT_LOG_VERBOSE;
690 optvals[5] = (void *) 1;
691
df36a3d3
TV
692 static intptr_t gomp_nvptx_o = -1;
693
694 static bool init_done = false;
695 if (!init_done)
696 {
697 process_GOMP_NVPTX_JIT (&gomp_nvptx_o);
698 init_done = true;
699 }
700
701 int nopts = 6;
702 if (gomp_nvptx_o != -1)
703 {
704 opts[nopts] = CU_JIT_OPTIMIZATION_LEVEL;
705 optvals[nopts] = (void *) gomp_nvptx_o;
706 nopts++;
707 }
708
8e09a12f
TV
709 if (CUDA_CALL_EXISTS (cuLinkCreate_v2))
710 CUDA_CALL (cuLinkCreate_v2, nopts, opts, optvals, &linkstate);
711 else
712 CUDA_CALL (cuLinkCreate, nopts, opts, optvals, &linkstate);
41dbbb37 713
cc3cd79b 714 for (; num_objs--; ptx_objs++)
41dbbb37 715 {
cc3cd79b
NS
716 /* cuLinkAddData's 'data' argument erroneously omits the const
717 qualifier. */
718 GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
8e09a12f
TV
719 if (CUDA_CALL_EXISTS (cuLinkAddData_v2))
720 r = CUDA_CALL_NOCHECK (cuLinkAddData_v2, linkstate, CU_JIT_INPUT_PTX,
721 (char *) ptx_objs->code, ptx_objs->size,
722 0, 0, 0, 0);
723 else
724 r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
725 (char *) ptx_objs->code, ptx_objs->size,
726 0, 0, 0, 0);
cc3cd79b
NS
727 if (r != CUDA_SUCCESS)
728 {
729 GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
6ce13072 730 GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
cc3cd79b 731 cuda_error (r));
6ce13072 732 return false;
cc3cd79b 733 }
41dbbb37
TS
734 }
735
cc3cd79b 736 GOMP_PLUGIN_debug (0, "Linking\n");
2393d337 737 r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);
41dbbb37
TS
738
739 GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
740 GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
741
cc3cd79b 742 if (r != CUDA_SUCCESS)
6ce13072 743 {
c0e9cee2 744 GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
6ce13072
CLT
745 GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
746 return false;
747 }
cc3cd79b 748
6ce13072
CLT
749 CUDA_CALL (cuModuleLoadData, module, linkout);
750 CUDA_CALL (cuLinkDestroy, linkstate);
751 return true;
41dbbb37
TS
752}
753
e70ab10d 754static void
f8332e52 755nvptx_exec (void (*fn), unsigned *dims, void *targ_mem_desc,
1f4c5b9b 756 CUdeviceptr dp, CUstream stream)
41dbbb37
TS
757{
758 struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
759 CUfunction function;
41dbbb37 760 int i;
41dbbb37 761 void *kargs[1];
41dbbb37 762 struct nvptx_thread *nvthd = nvptx_thread ();
4cdfee3f 763 int warp_size = nvthd->ptx_dev->warp_size;
41dbbb37
TS
764
765 function = targ_fn->fn;
766
3e32ee19
NS
767 /* Initialize the launch dimensions. Typically this is constant,
768 provided by the device compiler, but we must permit runtime
769 values. */
f99c3557
TS
770 int seen_zero = 0;
771 for (i = 0; i != GOMP_DIM_MAX; i++)
772 {
773 if (targ_fn->launch->dim[i])
774 dims[i] = targ_fn->launch->dim[i];
775 if (!dims[i])
776 seen_zero = 1;
777 }
778
779 if (seen_zero)
780 {
6668eb45 781 pthread_mutex_lock (&ptx_dev_lock);
0b210c43
TV
782
783 static int gomp_openacc_dims[GOMP_DIM_MAX];
784 if (!gomp_openacc_dims[0])
785 {
786 /* See if the user provided GOMP_OPENACC_DIM environment
787 variable to specify runtime defaults. */
788 for (int i = 0; i < GOMP_DIM_MAX; ++i)
789 gomp_openacc_dims[i] = GOMP_PLUGIN_acc_default_dim (i);
790 }
791
792 if (!nvthd->ptx_dev->default_dims[0])
6668eb45 793 {
0b210c43 794 int default_dims[GOMP_DIM_MAX];
ec00d3fa 795 for (int i = 0; i < GOMP_DIM_MAX; ++i)
0b210c43 796 default_dims[i] = gomp_openacc_dims[i];
6668eb45 797
0c6c2f5f
CP
798 int gang, worker, vector;
799 {
0c6c2f5f
CP
800 int block_size = nvthd->ptx_dev->max_threads_per_block;
801 int cpu_size = nvthd->ptx_dev->max_threads_per_multiprocessor;
802 int dev_size = nvthd->ptx_dev->num_sms;
803 GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
804 " dev_size=%d, cpu_size=%d\n",
805 warp_size, block_size, dev_size, cpu_size);
806
807 gang = (cpu_size / block_size) * dev_size;
808 worker = block_size / warp_size;
809 vector = warp_size;
810 }
6668eb45
CP
811
812 /* There is no upper bound on the gang size. The best size
813 matches the hardware configuration. Logical gangs are
814 scheduled onto physical hardware. To maximize usage, we
815 should guess a large number. */
816 if (default_dims[GOMP_DIM_GANG] < 1)
817 default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
818 /* The worker size must not exceed the hardware. */
819 if (default_dims[GOMP_DIM_WORKER] < 1
820 || (default_dims[GOMP_DIM_WORKER] > worker && gang))
821 default_dims[GOMP_DIM_WORKER] = worker;
822 /* The vector size must exactly match the hardware. */
823 if (default_dims[GOMP_DIM_VECTOR] < 1
824 || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
825 default_dims[GOMP_DIM_VECTOR] = vector;
826
827 GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
828 default_dims[GOMP_DIM_GANG],
829 default_dims[GOMP_DIM_WORKER],
830 default_dims[GOMP_DIM_VECTOR]);
0b210c43
TV
831
832 for (i = 0; i != GOMP_DIM_MAX; i++)
833 nvthd->ptx_dev->default_dims[i] = default_dims[i];
6668eb45
CP
834 }
835 pthread_mutex_unlock (&ptx_dev_lock);
836
4cdfee3f
TV
837 {
838 bool default_dim_p[GOMP_DIM_MAX];
839 for (i = 0; i != GOMP_DIM_MAX; i++)
bd9b3d3d
CP
840 default_dim_p[i] = !dims[i];
841
842 if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize))
4cdfee3f 843 {
bd9b3d3d
CP
844 for (i = 0; i != GOMP_DIM_MAX; i++)
845 if (default_dim_p[i])
846 dims[i] = nvthd->ptx_dev->default_dims[i];
847
848 if (default_dim_p[GOMP_DIM_VECTOR])
849 dims[GOMP_DIM_VECTOR]
850 = MIN (dims[GOMP_DIM_VECTOR],
851 (targ_fn->max_threads_per_block / warp_size
852 * warp_size));
853
854 if (default_dim_p[GOMP_DIM_WORKER])
855 dims[GOMP_DIM_WORKER]
856 = MIN (dims[GOMP_DIM_WORKER],
857 targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
4cdfee3f 858 }
bd9b3d3d
CP
859 else
860 {
861 /* Handle the case that the compiler allows the runtime to choose
862 the vector-length conservatively, by ignoring
863 gomp_openacc_dims[GOMP_DIM_VECTOR]. TODO: actually handle
864 it. */
865 int vectors = 0;
866 /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that that
867 gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
868 exceed targ_fn->max_threads_per_block. */
869 int workers = gomp_openacc_dims[GOMP_DIM_WORKER];
870 int gangs = gomp_openacc_dims[GOMP_DIM_GANG];
871 int grids, blocks;
872
873 CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
874 &blocks, function, NULL, 0,
875 dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
876 GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
877 "grid = %d, block = %d\n", grids, blocks);
878
879 /* Keep the num_gangs proportional to the block size. In
880 the case were a block size is limited by shared-memory
881 or the register file capacity, the runtime will not
882 excessively over assign gangs to the multiprocessor
883 units if their state is going to be swapped out even
884 more than necessary. The constant factor 2 is there to
885 prevent threads from idling when there is insufficient
886 work for them. */
887 if (gangs == 0)
888 gangs = 2 * grids * (blocks / warp_size);
889
890 if (vectors == 0)
891 vectors = warp_size;
892
893 if (workers == 0)
894 {
895 int actual_vectors = (default_dim_p[GOMP_DIM_VECTOR]
896 ? vectors
897 : dims[GOMP_DIM_VECTOR]);
898 workers = blocks / actual_vectors;
2c372e81 899 workers = MAX (workers, 1);
052aaace
TV
900 /* If we need a per-worker barrier ... . */
901 if (actual_vectors > 32)
902 /* Don't use more barriers than available. */
903 workers = MIN (workers, 15);
bd9b3d3d 904 }
4cdfee3f 905
bd9b3d3d
CP
906 for (i = 0; i != GOMP_DIM_MAX; i++)
907 if (default_dim_p[i])
908 switch (i)
909 {
910 case GOMP_DIM_GANG: dims[i] = gangs; break;
911 case GOMP_DIM_WORKER: dims[i] = workers; break;
912 case GOMP_DIM_VECTOR: dims[i] = vectors; break;
913 default: GOMP_PLUGIN_fatal ("invalid dim");
914 }
915 }
4cdfee3f 916 }
f99c3557 917 }
3e32ee19 918
88a4654d
CP
919 /* Check if the accelerator has sufficient hardware resources to
920 launch the offloaded kernel. */
921 if (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]
922 > targ_fn->max_threads_per_block)
923 {
52d22ece
TV
924 const char *msg
925 = ("The Nvidia accelerator has insufficient resources to launch '%s'"
926 " with num_workers = %d and vector_length = %d"
927 "; "
928 "recompile the program with 'num_workers = x and vector_length = y'"
929 " on that offloaded region or '-fopenacc-dim=:x:y' where"
930 " x * y <= %d"
931 ".\n");
932 GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
933 dims[GOMP_DIM_VECTOR], targ_fn->max_threads_per_block);
88a4654d
CP
934 }
935
052aaace
TV
936 /* Check if the accelerator has sufficient barrier resources to
937 launch the offloaded kernel. */
938 if (dims[GOMP_DIM_WORKER] > 15 && dims[GOMP_DIM_VECTOR] > 32)
939 {
940 const char *msg
941 = ("The Nvidia accelerator has insufficient barrier resources to launch"
942 " '%s' with num_workers = %d and vector_length = %d"
943 "; "
944 "recompile the program with 'num_workers = x' on that offloaded"
945 " region or '-fopenacc-dim=:x:' where x <= 15"
946 "; "
947 "or, recompile the program with 'vector_length = 32' on that"
2c2ff168 948 " offloaded region or '-fopenacc-dim=::32'"
052aaace
TV
949 ".\n");
950 GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
951 dims[GOMP_DIM_VECTOR]);
952 }
953
3e32ee19
NS
954 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
955 " gangs=%u, workers=%u, vectors=%u\n",
6668eb45
CP
956 __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
957 dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);
41dbbb37
TS
958
959 // OpenACC CUDA
960 //
3e32ee19
NS
961 // num_gangs nctaid.x
962 // num_workers ntid.y
963 // vector length ntid.x
5fae049d
TS
964
965 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
966 acc_prof_info *prof_info = thr->prof_info;
967 acc_event_info enqueue_launch_event_info;
968 acc_api_info *api_info = thr->api_info;
969 bool profiling_p = __builtin_expect (prof_info != NULL, false);
970 if (profiling_p)
971 {
972 prof_info->event_type = acc_ev_enqueue_launch_start;
973
974 enqueue_launch_event_info.launch_event.event_type
975 = prof_info->event_type;
976 enqueue_launch_event_info.launch_event.valid_bytes
977 = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
978 enqueue_launch_event_info.launch_event.parent_construct
979 = acc_construct_parallel;
980 enqueue_launch_event_info.launch_event.implicit = 1;
981 enqueue_launch_event_info.launch_event.tool_info = NULL;
982 enqueue_launch_event_info.launch_event.kernel_name = targ_fn->launch->fn;
983 enqueue_launch_event_info.launch_event.num_gangs
984 = dims[GOMP_DIM_GANG];
985 enqueue_launch_event_info.launch_event.num_workers
986 = dims[GOMP_DIM_WORKER];
987 enqueue_launch_event_info.launch_event.vector_length
988 = dims[GOMP_DIM_VECTOR];
989
990 api_info->device_api = acc_device_api_cuda;
991
992 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
993 api_info);
994 }
995
41dbbb37 996 kargs[0] = &dp;
6ce13072
CLT
997 CUDA_CALL_ASSERT (cuLaunchKernel, function,
998 dims[GOMP_DIM_GANG], 1, 1,
999 dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
1f4c5b9b 1000 0, stream, kargs, 0);
41dbbb37 1001
5fae049d
TS
1002 if (profiling_p)
1003 {
1004 prof_info->event_type = acc_ev_enqueue_launch_end;
1005 enqueue_launch_event_info.launch_event.event_type
1006 = prof_info->event_type;
1007 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
1008 api_info);
1009 }
1010
41dbbb37 1011 GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__,
3e32ee19 1012 targ_fn->launch->fn);
41dbbb37
TS
1013}
1014
1015void * openacc_get_current_cuda_context (void);
1016
5fae049d
TS
1017static void
1018goacc_profiling_acc_ev_alloc (struct goacc_thread *thr, void *dp, size_t s)
1019{
1020 acc_prof_info *prof_info = thr->prof_info;
1021 acc_event_info data_event_info;
1022 acc_api_info *api_info = thr->api_info;
1023
1024 prof_info->event_type = acc_ev_alloc;
1025
1026 data_event_info.data_event.event_type = prof_info->event_type;
1027 data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1028 data_event_info.data_event.parent_construct = acc_construct_parallel;
1029 data_event_info.data_event.implicit = 1;
1030 data_event_info.data_event.tool_info = NULL;
1031 data_event_info.data_event.var_name = NULL;
1032 data_event_info.data_event.bytes = s;
1033 data_event_info.data_event.host_ptr = NULL;
1034 data_event_info.data_event.device_ptr = dp;
1035
1036 api_info->device_api = acc_device_api_cuda;
1037
1038 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
1039}
1040
6b577a17
JB
1041/* Free the cached soft-stacks block if it is above the SOFTSTACK_CACHE_LIMIT
1042 size threshold, or if FORCE is true. */
1043
1044static void
1045nvptx_stacks_free (struct ptx_device *ptx_dev, bool force)
1046{
1047 pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
1048 if (ptx_dev->omp_stacks.ptr
1049 && (force || ptx_dev->omp_stacks.size > SOFTSTACK_CACHE_LIMIT))
1050 {
1051 CUresult r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
1052 if (r != CUDA_SUCCESS)
1053 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
1054 ptx_dev->omp_stacks.ptr = 0;
1055 ptx_dev->omp_stacks.size = 0;
1056 }
1057 pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
1058}
1059
41dbbb37 1060static void *
6b577a17 1061nvptx_alloc (size_t s, bool suppress_errors)
41dbbb37
TS
1062{
1063 CUdeviceptr d;
41dbbb37 1064
6b577a17
JB
1065 CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &d, s);
1066 if (suppress_errors && r == CUDA_ERROR_OUT_OF_MEMORY)
1067 return NULL;
1068 else if (r != CUDA_SUCCESS)
1069 {
1070 GOMP_PLUGIN_error ("nvptx_alloc error: %s", cuda_error (r));
1071 return NULL;
1072 }
1073
1074 /* NOTE: We only do profiling stuff if the memory allocation succeeds. */
5fae049d
TS
1075 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1076 bool profiling_p
1077 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1078 if (profiling_p)
1079 goacc_profiling_acc_ev_alloc (thr, (void *) d, s);
1080
6ce13072 1081 return (void *) d;
41dbbb37
TS
1082}
1083
5fae049d
TS
1084static void
1085goacc_profiling_acc_ev_free (struct goacc_thread *thr, void *p)
1086{
1087 acc_prof_info *prof_info = thr->prof_info;
1088 acc_event_info data_event_info;
1089 acc_api_info *api_info = thr->api_info;
1090
1091 prof_info->event_type = acc_ev_free;
1092
1093 data_event_info.data_event.event_type = prof_info->event_type;
1094 data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1095 data_event_info.data_event.parent_construct = acc_construct_parallel;
1096 data_event_info.data_event.implicit = 1;
1097 data_event_info.data_event.tool_info = NULL;
1098 data_event_info.data_event.var_name = NULL;
1099 data_event_info.data_event.bytes = -1;
1100 data_event_info.data_event.host_ptr = NULL;
1101 data_event_info.data_event.device_ptr = p;
1102
1103 api_info->device_api = acc_device_api_cuda;
1104
1105 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
1106}
1107
6ce13072 1108static bool
1f4c5b9b 1109nvptx_free (void *p, struct ptx_device *ptx_dev)
41dbbb37 1110{
f9b98328
CLT
1111 CUdeviceptr pb;
1112 size_t ps;
1113
1114 CUresult r = CUDA_CALL_NOCHECK (cuMemGetAddressRange, &pb, &ps,
1115 (CUdeviceptr) p);
1116 if (r == CUDA_ERROR_NOT_PERMITTED)
1f4c5b9b 1117 {
f9b98328
CLT
1118 /* We assume that this error indicates we are in a CUDA callback context,
1119 where all CUDA calls are not allowed (see cuStreamAddCallback
1120 documentation for description). Arrange to free this piece of device
1121 memory later. */
1f4c5b9b
CLT
1122 struct ptx_free_block *n
1123 = GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block));
1124 n->ptr = p;
1125 pthread_mutex_lock (&ptx_dev->free_blocks_lock);
1126 n->next = ptx_dev->free_blocks;
1127 ptx_dev->free_blocks = n;
1128 pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
1129 return true;
1130 }
f9b98328
CLT
1131 else if (r != CUDA_SUCCESS)
1132 {
1133 GOMP_PLUGIN_error ("cuMemGetAddressRange error: %s", cuda_error (r));
1134 return false;
1135 }
6ce13072
CLT
1136 if ((CUdeviceptr) p != pb)
1137 {
1138 GOMP_PLUGIN_error ("invalid device address");
1139 return false;
1140 }
41dbbb37 1141
6ce13072 1142 CUDA_CALL (cuMemFree, (CUdeviceptr) p);
5fae049d
TS
1143 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1144 bool profiling_p
1145 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1146 if (profiling_p)
1147 goacc_profiling_acc_ev_free (thr, p);
1148
6ce13072 1149 return true;
41dbbb37
TS
1150}
1151
1f4c5b9b
CLT
1152static void *
1153nvptx_get_current_cuda_device (void)
41dbbb37 1154{
41dbbb37
TS
1155 struct nvptx_thread *nvthd = nvptx_thread ();
1156
1f4c5b9b
CLT
1157 if (!nvthd || !nvthd->ptx_dev)
1158 return NULL;
41dbbb37 1159
1f4c5b9b 1160 return &nvthd->ptx_dev->dev;
41dbbb37
TS
1161}
1162
1f4c5b9b
CLT
1163static void *
1164nvptx_get_current_cuda_context (void)
41dbbb37
TS
1165{
1166 struct nvptx_thread *nvthd = nvptx_thread ();
1167
1168 if (!nvthd || !nvthd->ptx_dev)
1169 return NULL;
1170
1171 return nvthd->ptx_dev->ctx;
1172}
1173
41dbbb37
TS
1174/* Plugin entry points. */
1175
1176const char *
1177GOMP_OFFLOAD_get_name (void)
1178{
1179 return "nvptx";
1180}
1181
1182unsigned int
1183GOMP_OFFLOAD_get_caps (void)
1184{
6103184e 1185 return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
41dbbb37
TS
1186}
1187
1188int
1189GOMP_OFFLOAD_get_type (void)
1190{
1191 return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
1192}
1193
1194int
683f1184 1195GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask)
41dbbb37 1196{
683f1184
TB
1197 int num_devices = nvptx_get_num_devices ();
1198 /* Return -1 if no omp_requires_mask cannot be fulfilled but
131d18e9
TB
1199 devices were present. Unified-shared address: see comment in
1200 nvptx_open_device for CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING. */
1201 if (num_devices > 0
ea4b23d9
TB
1202 && ((omp_requires_mask
1203 & ~(GOMP_REQUIRES_UNIFIED_ADDRESS
1204 | GOMP_REQUIRES_REVERSE_OFFLOAD)) != 0))
683f1184
TB
1205 return -1;
1206 return num_devices;
41dbbb37
TS
1207}
1208
6ce13072 1209bool
d93bdab5 1210GOMP_OFFLOAD_init_device (int n)
41dbbb37 1211{
6ce13072
CLT
1212 struct ptx_device *dev;
1213
d93bdab5 1214 pthread_mutex_lock (&ptx_dev_lock);
41dbbb37 1215
d93bdab5
JB
1216 if (!nvptx_init () || ptx_devices[n] != NULL)
1217 {
1218 pthread_mutex_unlock (&ptx_dev_lock);
6ce13072 1219 return false;
d93bdab5
JB
1220 }
1221
6ce13072
CLT
1222 dev = nvptx_open_device (n);
1223 if (dev)
1224 {
1225 ptx_devices[n] = dev;
1226 instantiated_devices++;
1227 }
d93bdab5 1228
30486fab
AS
1229 const char *var_name = "GOMP_NVPTX_LOWLAT_POOL";
1230 const char *env_var = secure_getenv (var_name);
1231 notify_var (var_name, env_var);
1232
1233 if (env_var != NULL)
1234 {
1235 char *endptr;
1236 unsigned long val = strtoul (env_var, &endptr, 10);
1237 if (endptr == NULL || *endptr != '\0'
1238 || errno == ERANGE || errno == EINVAL
1239 || val > UINT_MAX)
1240 GOMP_PLUGIN_error ("Error parsing %s", var_name);
1241 else
1242 lowlat_pool_size = val;
1243 }
1244
d93bdab5 1245 pthread_mutex_unlock (&ptx_dev_lock);
6ce13072
CLT
1246
1247 return dev != NULL;
41dbbb37
TS
1248}
1249
6ce13072 1250bool
d93bdab5 1251GOMP_OFFLOAD_fini_device (int n)
41dbbb37 1252{
d93bdab5
JB
1253 pthread_mutex_lock (&ptx_dev_lock);
1254
1255 if (ptx_devices[n] != NULL)
1256 {
6ce13072
CLT
1257 if (!nvptx_attach_host_thread_to_device (n)
1258 || !nvptx_close_device (ptx_devices[n]))
1259 {
1260 pthread_mutex_unlock (&ptx_dev_lock);
1261 return false;
1262 }
d93bdab5
JB
1263 ptx_devices[n] = NULL;
1264 instantiated_devices--;
1265 }
1266
738c56d4
TV
1267 if (instantiated_devices == 0)
1268 {
1269 free (ptx_devices);
1270 ptx_devices = NULL;
1271 }
1272
d93bdab5 1273 pthread_mutex_unlock (&ptx_dev_lock);
6ce13072 1274 return true;
41dbbb37
TS
1275}
1276
2a21ff19
NS
1277/* Return the libgomp version number we're compatible with. There is
1278 no requirement for cross-version compatibility. */
1279
1280unsigned
1281GOMP_OFFLOAD_version (void)
1282{
1283 return GOMP_VERSION;
1284}
1285
6103184e
AM
1286/* Initialize __nvptx_clocktick, if present in MODULE. */
1287
1288static void
1289nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
1290{
1291 CUdeviceptr dptr;
2393d337
JJ
1292 CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
1293 module, "__nvptx_clocktick");
6103184e
AM
1294 if (r == CUDA_ERROR_NOT_FOUND)
1295 return;
1296 if (r != CUDA_SUCCESS)
1297 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1298 double __nvptx_clocktick = 1e-3 / dev->clock_khz;
2393d337
JJ
1299 r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
1300 sizeof (__nvptx_clocktick));
6103184e
AM
1301 if (r != CUDA_SUCCESS)
1302 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
1303}
1304
f3e9a059 1305/* Load the (partial) program described by TARGET_DATA to device
0fcc0cf9
TB
1306 number ORD. Allocate and return TARGET_TABLE. If not NULL, REV_FN_TABLE
1307 will contain the on-device addresses of the functions for reverse offload.
1308 To be freed by the caller. */
f3e9a059 1309
41dbbb37 1310int
2a21ff19 1311GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
0fcc0cf9 1312 struct addr_pair **target_table,
a49c7d31
KCY
1313 uint64_t **rev_fn_table,
1314 uint64_t *host_ind_fn_table)
41dbbb37
TS
1315{
1316 CUmodule module;
3e32ee19
NS
1317 const char *const *var_names;
1318 const struct targ_fn_launch *fn_descs;
a49c7d31 1319 unsigned int fn_entries, var_entries, ind_fn_entries, other_entries, i, j;
41dbbb37 1320 struct targ_fn_descriptor *targ_fns;
f3e9a059 1321 struct addr_pair *targ_tbl;
afb2d80b 1322 const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
d93bdab5 1323 struct ptx_image_data *new_image;
f3e9a059 1324 struct ptx_device *dev;
41dbbb37 1325
2a21ff19 1326 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
6ce13072
CLT
1327 {
1328 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1329 " (expected %u, received %u)",
1330 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1331 return -1;
1332 }
41dbbb37 1333
6ce13072
CLT
1334 if (!nvptx_attach_host_thread_to_device (ord)
1335 || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
1336 return -1;
d93bdab5 1337
6ce13072 1338 dev = ptx_devices[ord];
41dbbb37 1339
a4cb876d
NS
1340 /* The mkoffload utility emits a struct of pointers/integers at the
1341 start of each offload image. The array of kernel names and the
1342 functions addresses form a one-to-one correspondence. */
41dbbb37 1343
a4cb876d
NS
1344 var_entries = img_header->var_num;
1345 var_names = img_header->var_names;
1346 fn_entries = img_header->fn_num;
3e32ee19 1347 fn_descs = img_header->fn_descs;
a49c7d31
KCY
1348 ind_fn_entries = GOMP_VERSION_SUPPORTS_INDIRECT_FUNCS (version)
1349 ? img_header->ind_fn_num : 0;
41dbbb37 1350
9f2fca56 1351 /* Currently, other_entries contains only the struct of ICVs. */
0bac793e
CLT
1352 other_entries = 1;
1353
f3e9a059 1354 targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
0bac793e 1355 * (fn_entries + var_entries + other_entries));
41dbbb37
TS
1356 targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
1357 * fn_entries);
1358
f3e9a059
NS
1359 *target_table = targ_tbl;
1360
1361 new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
1362 new_image->target_data = target_data;
1363 new_image->module = module;
1364 new_image->fns = targ_fns;
1365
1366 pthread_mutex_lock (&dev->image_lock);
1367 new_image->next = dev->images;
1368 dev->images = new_image;
1369 pthread_mutex_unlock (&dev->image_lock);
1370
1371 for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
41dbbb37
TS
1372 {
1373 CUfunction function;
6103184e 1374 int nregs, mthrs;
41dbbb37 1375
6ce13072
CLT
1376 CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
1377 fn_descs[i].fn);
6103184e
AM
1378 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
1379 CU_FUNC_ATTRIBUTE_NUM_REGS, function);
1380 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
1381 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);
41dbbb37 1382
f3e9a059 1383 targ_fns->fn = function;
3e32ee19 1384 targ_fns->launch = &fn_descs[i];
6103184e
AM
1385 targ_fns->regs_per_thread = nregs;
1386 targ_fns->max_threads_per_block = mthrs;
41dbbb37 1387
f3e9a059
NS
1388 targ_tbl->start = (uintptr_t) targ_fns;
1389 targ_tbl->end = targ_tbl->start + 1;
41dbbb37
TS
1390 }
1391
f3e9a059 1392 for (j = 0; j < var_entries; j++, targ_tbl++)
d93bdab5
JB
1393 {
1394 CUdeviceptr var;
1395 size_t bytes;
1396
6ce13072
CLT
1397 CUDA_CALL_ERET (-1, cuModuleGetGlobal,
1398 &var, &bytes, module, var_names[j]);
d93bdab5 1399
f3e9a059
NS
1400 targ_tbl->start = (uintptr_t) var;
1401 targ_tbl->end = targ_tbl->start + bytes;
d93bdab5
JB
1402 }
1403
a49c7d31
KCY
1404 if (ind_fn_entries > 0)
1405 {
1406 CUdeviceptr var;
1407 size_t bytes;
1408
1409 /* Read indirect function table from image. */
1410 CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &var, &bytes, module,
1411 "$offload_ind_func_table");
1412 if (r != CUDA_SUCCESS)
1413 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1414 assert (bytes == sizeof (uint64_t) * ind_fn_entries);
1415
1416 uint64_t ind_fn_table[ind_fn_entries];
1417 r = CUDA_CALL_NOCHECK (cuMemcpyDtoH, ind_fn_table, var, bytes);
1418 if (r != CUDA_SUCCESS)
1419 GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
1420
1421 /* Build host->target address map for indirect functions. */
1422 uint64_t ind_fn_map[ind_fn_entries * 2 + 1];
1423 for (unsigned k = 0; k < ind_fn_entries; k++)
1424 {
1425 ind_fn_map[k * 2] = host_ind_fn_table[k];
1426 ind_fn_map[k * 2 + 1] = ind_fn_table[k];
1427 GOMP_PLUGIN_debug (0, "Indirect function %d: %lx->%lx\n",
1428 k, host_ind_fn_table[k], ind_fn_table[k]);
1429 }
1430 ind_fn_map[ind_fn_entries * 2] = 0;
1431
1432 /* Write the map onto the target. */
1433 void *map_target_addr
1434 = GOMP_OFFLOAD_alloc (ord, sizeof (ind_fn_map));
1435 GOMP_PLUGIN_debug (0, "Allocated indirect map at %p\n", map_target_addr);
1436
1437 GOMP_OFFLOAD_host2dev (ord, map_target_addr,
1438 (void*) ind_fn_map,
1439 sizeof (ind_fn_map));
1440
1441 /* Write address of the map onto the target. */
1442 CUdeviceptr varptr;
1443 size_t varsize;
1444 r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &varptr, &varsize,
1445 module, XSTRING (GOMP_INDIRECT_ADDR_MAP));
1446 if (r != CUDA_SUCCESS)
1447 GOMP_PLUGIN_fatal ("Indirect map variable not found in image: %s",
1448 cuda_error (r));
1449
1450 GOMP_PLUGIN_debug (0,
1451 "Indirect map variable found at %llx with size %ld\n",
1452 varptr, varsize);
1453
1454 GOMP_OFFLOAD_host2dev (ord, (void *) varptr, &map_target_addr,
1455 sizeof (map_target_addr));
1456 }
1457
9f2fca56
MV
1458 CUdeviceptr varptr;
1459 size_t varsize;
1460 CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &varptr, &varsize,
1461 module, XSTRING (GOMP_ADDITIONAL_ICVS));
1462
0bac793e
CLT
1463 if (r == CUDA_SUCCESS)
1464 {
9f2fca56
MV
1465 targ_tbl->start = (uintptr_t) varptr;
1466 targ_tbl->end = (uintptr_t) (varptr + varsize);
0bac793e
CLT
1467 }
1468 else
9f2fca56 1469 /* The variable was not in this image. */
0bac793e 1470 targ_tbl->start = targ_tbl->end = 0;
0bac793e 1471
50be486d
TB
1472 if (rev_fn_table && fn_entries == 0)
1473 *rev_fn_table = NULL;
1474 else if (rev_fn_table)
1475 {
1476 CUdeviceptr var;
9f9d128f
TB
1477 size_t bytes;
1478 unsigned int i;
50be486d
TB
1479 r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &var, &bytes, module,
1480 "$offload_func_table");
1481 if (r != CUDA_SUCCESS)
1482 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1483 assert (bytes == sizeof (uint64_t) * fn_entries);
1484 *rev_fn_table = GOMP_PLUGIN_malloc (sizeof (uint64_t) * fn_entries);
1485 r = CUDA_CALL_NOCHECK (cuMemcpyDtoH, *rev_fn_table, var, bytes);
1486 if (r != CUDA_SUCCESS)
1487 GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
131d18e9
TB
1488 /* Free if only NULL entries. */
1489 for (i = 0; i < fn_entries; ++i)
1490 if ((*rev_fn_table)[i] != 0)
1491 break;
1492 if (i == fn_entries)
1493 {
1494 free (*rev_fn_table);
1495 *rev_fn_table = NULL;
1496 }
1497 }
1498
1499 if (rev_fn_table && *rev_fn_table && dev->rev_data == NULL)
1500 {
9f9d128f
TB
1501 /* Get the on-device GOMP_REV_OFFLOAD_VAR variable. It should be
1502 available but it might be not. One reason could be: if the user code
1503 has 'omp target device(ancestor:1)' in pure hostcode, GOMP_target_ext
1504 is not called on the device and, hence, it and GOMP_REV_OFFLOAD_VAR
1505 are not linked in. */
131d18e9
TB
1506 CUdeviceptr device_rev_offload_var;
1507 size_t device_rev_offload_size;
1508 CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal,
1509 &device_rev_offload_var,
1510 &device_rev_offload_size, module,
1511 XSTRING (GOMP_REV_OFFLOAD_VAR));
1512 if (r != CUDA_SUCCESS)
9f9d128f
TB
1513 {
1514 free (*rev_fn_table);
1515 *rev_fn_table = NULL;
1516 }
1517 else
1518 {
1519 /* cuMemHostAlloc memory is accessible on the device, if
1520 unified-shared address is supported; this is assumed - see comment
1521 in nvptx_open_device for CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING. */
1522 CUDA_CALL_ASSERT (cuMemHostAlloc, (void **) &dev->rev_data,
1523 sizeof (*dev->rev_data), CU_MEMHOSTALLOC_DEVICEMAP);
1524 CUdeviceptr dp = (CUdeviceptr) dev->rev_data;
1525 r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, device_rev_offload_var, &dp,
1526 sizeof (dp));
1527 if (r != CUDA_SUCCESS)
1528 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
1529 }
50be486d
TB
1530 }
1531
6103184e
AM
1532 nvptx_set_clocktick (module, dev);
1533
0bac793e 1534 return fn_entries + var_entries + other_entries;
d93bdab5
JB
1535}
1536
f3e9a059
NS
1537/* Unload the program described by TARGET_DATA. DEV_DATA is the
1538 function descriptors allocated by G_O_load_image. */
1539
6ce13072 1540bool
2a21ff19 1541GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
d93bdab5 1542{
f3e9a059
NS
1543 struct ptx_image_data *image, **prev_p;
1544 struct ptx_device *dev = ptx_devices[ord];
1545
2a21ff19 1546 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
6ce13072
CLT
1547 {
1548 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1549 " (expected %u, received %u)",
1550 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1551 return false;
1552 }
1553
1554 bool ret = true;
f3e9a059
NS
1555 pthread_mutex_lock (&dev->image_lock);
1556 for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
1557 if (image->target_data == target_data)
1558 {
1559 *prev_p = image->next;
2393d337 1560 if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
6ce13072 1561 ret = false;
f3e9a059
NS
1562 free (image->fns);
1563 free (image);
1564 break;
1565 }
1566 pthread_mutex_unlock (&dev->image_lock);
6ce13072 1567 return ret;
41dbbb37
TS
1568}
1569
1570void *
d93bdab5 1571GOMP_OFFLOAD_alloc (int ord, size_t size)
41dbbb37 1572{
6ce13072
CLT
1573 if (!nvptx_attach_host_thread_to_device (ord))
1574 return NULL;
41dbbb37 1575
1f4c5b9b
CLT
1576 struct ptx_device *ptx_dev = ptx_devices[ord];
1577 struct ptx_free_block *blocks, *tmp;
41dbbb37 1578
1f4c5b9b
CLT
1579 pthread_mutex_lock (&ptx_dev->free_blocks_lock);
1580 blocks = ptx_dev->free_blocks;
1581 ptx_dev->free_blocks = NULL;
1582 pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
41dbbb37 1583
6b577a17
JB
1584 nvptx_stacks_free (ptx_dev, false);
1585
1f4c5b9b
CLT
1586 while (blocks)
1587 {
1588 tmp = blocks->next;
1589 nvptx_free (blocks->ptr, ptx_dev);
1590 free (blocks);
1591 blocks = tmp;
1592 }
1593
6b577a17
JB
1594 void *d = nvptx_alloc (size, true);
1595 if (d)
1596 return d;
1597 else
1598 {
1599 /* Memory allocation failed. Try freeing the stacks block, and
1600 retrying. */
1601 nvptx_stacks_free (ptx_dev, true);
1602 return nvptx_alloc (size, false);
1603 }
41dbbb37
TS
1604}
1605
6103184e 1606bool
1f4c5b9b 1607GOMP_OFFLOAD_free (int ord, void *ptr)
6103184e 1608{
1f4c5b9b
CLT
1609 return (nvptx_attach_host_thread_to_device (ord)
1610 && nvptx_free (ptr, ptx_devices[ord]));
6103184e
AM
1611}
1612
41dbbb37 1613void
f8332e52
TS
1614GOMP_OFFLOAD_openacc_exec (void (*fn) (void *),
1615 size_t mapnum __attribute__((unused)),
199867d0
TS
1616 void **hostaddrs __attribute__((unused)),
1617 void **devaddrs,
1f4c5b9b 1618 unsigned *dims, void *targ_mem_desc)
41dbbb37 1619{
f8332e52 1620 GOMP_PLUGIN_debug (0, "nvptx %s\n", __FUNCTION__);
41dbbb37 1621
f8332e52
TS
1622 CUdeviceptr dp = (CUdeviceptr) devaddrs;
1623 nvptx_exec (fn, dims, targ_mem_desc, dp, NULL);
41dbbb37 1624
1f4c5b9b
CLT
1625 CUresult r = CUDA_CALL_NOCHECK (cuStreamSynchronize, NULL);
1626 const char *maybe_abort_msg = "(perhaps abort was called)";
1627 if (r == CUDA_ERROR_LAUNCH_FAILED)
1628 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
1629 maybe_abort_msg);
1630 else if (r != CUDA_SUCCESS)
1631 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
41dbbb37
TS
1632}
1633
1634void
f8332e52
TS
1635GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *),
1636 size_t mapnum __attribute__((unused)),
199867d0
TS
1637 void **hostaddrs __attribute__((unused)),
1638 void **devaddrs,
1f4c5b9b
CLT
1639 unsigned *dims, void *targ_mem_desc,
1640 struct goacc_asyncqueue *aq)
41dbbb37 1641{
f8332e52 1642 GOMP_PLUGIN_debug (0, "nvptx %s\n", __FUNCTION__);
1f4c5b9b 1643
f8332e52
TS
1644 CUdeviceptr dp = (CUdeviceptr) devaddrs;
1645 nvptx_exec (fn, dims, targ_mem_desc, dp, aq->cuda_stream);
41dbbb37
TS
1646}
1647
1648void *
d93bdab5 1649GOMP_OFFLOAD_openacc_create_thread_data (int ord)
41dbbb37 1650{
d93bdab5 1651 struct ptx_device *ptx_dev;
41dbbb37
TS
1652 struct nvptx_thread *nvthd
1653 = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
41dbbb37
TS
1654 CUcontext thd_ctx;
1655
d93bdab5
JB
1656 ptx_dev = ptx_devices[ord];
1657
1658 assert (ptx_dev);
1659
6ce13072 1660 CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);
41dbbb37
TS
1661
1662 assert (ptx_dev->ctx);
1663
1664 if (!thd_ctx)
6ce13072 1665 CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);
41dbbb37 1666
41dbbb37
TS
1667 nvthd->ptx_dev = ptx_dev;
1668
1669 return (void *) nvthd;
1670}
1671
1672void
1673GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
1674{
1675 free (data);
1676}
1677
1678void *
345a8c17 1679GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
41dbbb37
TS
1680{
1681 return nvptx_get_current_cuda_device ();
1682}
1683
1684void *
345a8c17 1685GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
41dbbb37
TS
1686{
1687 return nvptx_get_current_cuda_context ();
1688}
1689
1f4c5b9b 1690/* This returns a CUstream. */
41dbbb37 1691void *
1f4c5b9b
CLT
1692GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *aq)
1693{
1694 return (void *) aq->cuda_stream;
1695}
1696
1697/* This takes a CUstream. */
1698int
1699GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream)
1700{
1701 if (aq->cuda_stream)
1702 {
1703 CUDA_CALL_ASSERT (cuStreamSynchronize, aq->cuda_stream);
1704 CUDA_CALL_ASSERT (cuStreamDestroy, aq->cuda_stream);
1705 }
1706
1707 aq->cuda_stream = (CUstream) stream;
1708 return 1;
1709}
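/* Sketch of how an application hands its own CUstream to the OpenACC
   runtime; acc_set_cuda_stream eventually reaches the plugin hook above,
   which synchronizes and destroys the queue's previous stream.  This
   assumes the runtime has already been initialized (e.g. by acc_init) so
   that a CUDA context is current; error checking is omitted.  */

#include <openacc.h>
#include <cuda.h>

static void
install_private_stream (int async)
{
  CUstream s;
  cuStreamCreate (&s, CU_STREAM_DEFAULT);
  /* From now on, work queued on ASYNC is issued on S, so it can be
     interleaved with the application's own CUDA work on that stream.  */
  acc_set_cuda_stream (async, (void *) s);
}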
1710
1711static struct goacc_asyncqueue *
1712nvptx_goacc_asyncqueue_construct (unsigned int flags)
41dbbb37 1713{
1f4c5b9b 1714 CUstream stream = NULL;
130c2f3c 1715 CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, flags);
1716
1717 struct goacc_asyncqueue *aq
1718 = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue));
1719 aq->cuda_stream = stream;
1720 return aq;
1721}
1722
1723struct goacc_asyncqueue *
1724GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
1725{
1726 return nvptx_goacc_asyncqueue_construct (CU_STREAM_DEFAULT);
1727}
1728
1729static bool
1730nvptx_goacc_asyncqueue_destruct (struct goacc_asyncqueue *aq)
1731{
1732 CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream);
1733 free (aq);
1734 return true;
1735}
1736
1737bool
1738GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
1739{
1740 return nvptx_goacc_asyncqueue_destruct (aq);
1741}
1742
41dbbb37 1743int
1f4c5b9b 1744GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
41dbbb37 1745{
1746 CUresult r = CUDA_CALL_NOCHECK (cuStreamQuery, aq->cuda_stream);
1747 if (r == CUDA_SUCCESS)
1748 return 1;
1749 if (r == CUDA_ERROR_NOT_READY)
1750 return 0;
1751
1752 GOMP_PLUGIN_error ("cuStreamQuery error: %s", cuda_error (r));
1753 return -1;
1754}
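/* Sketch of the user-visible polling that this hook implements: in an
   OpenACC program, acc_async_test returns nonzero once everything queued
   on the given async queue has completed, without blocking the host.  */

#include <openacc.h>

static void
poll_until_done (int async)
{
  while (!acc_async_test (async))
    ; /* The host could do useful work here instead of spinning.  */
}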
1755
1756static bool
1757nvptx_goacc_asyncqueue_synchronize (struct goacc_asyncqueue *aq)
1758{
1759 CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream);
1760 return true;
1761}
1762
1763bool
1764GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
1765{
1766 return nvptx_goacc_asyncqueue_synchronize (aq);
1767}
1768
1769bool
1770GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
1771 struct goacc_asyncqueue *aq2)
1772{
1773 CUevent e;
1774 CUDA_CALL_ERET (false, cuEventCreate, &e, CU_EVENT_DISABLE_TIMING);
1775 CUDA_CALL_ERET (false, cuEventRecord, e, aq1->cuda_stream);
1776 CUDA_CALL_ERET (false, cuStreamWaitEvent, aq2->cuda_stream, e, 0);
1777 return true;
1778}
1779
1780static void
1781cuda_callback_wrapper (CUstream stream, CUresult res, void *ptr)
1782{
1783 if (res != CUDA_SUCCESS)
1784 GOMP_PLUGIN_fatal ("%s error: %s", __FUNCTION__, cuda_error (res));
1785 struct nvptx_callback *cb = (struct nvptx_callback *) ptr;
1786 cb->fn (cb->ptr);
1787 free (ptr);
1788}
1789
1790void
1791GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
1792 void (*callback_fn)(void *),
1793 void *userptr)
1794{
1795 struct nvptx_callback *b = GOMP_PLUGIN_malloc (sizeof (*b));
1796 b->fn = callback_fn;
1797 b->ptr = userptr;
1798 b->aq = aq;
1799 CUDA_CALL_ASSERT (cuStreamAddCallback, aq->cuda_stream,
1800 cuda_callback_wrapper, (void *) b, 0);
1801}
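/* A minimal sketch of the same host-callback pattern with the raw driver
   API: the callback fires on a driver-internal thread once all work queued
   on the stream before cuStreamAddCallback has finished, and, per the CUDA
   documentation, it must not make CUDA API calls itself.  The message text
   is purely illustrative.  */

#include <cuda.h>
#include <stdio.h>

static void CUDA_CB
note_drained (CUstream stream, CUresult status, void *user)
{
  (void) stream;
  if (status == CUDA_SUCCESS)
    printf ("%s: stream drained\n", (const char *) user);
}

static void
watch_stream (CUstream s)
{
  /* The final argument (flags) must currently be 0.  */
  cuStreamAddCallback (s, note_drained, (void *) "example", 0);
}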
1802
1803static bool
1804cuda_memcpy_sanity_check (const void *h, const void *d, size_t s)
1805{
1806 CUdeviceptr pb;
1807 size_t ps;
1808 if (!s)
1809 return true;
1810 if (!d)
1811 {
1812 GOMP_PLUGIN_error ("invalid device address");
1813 return false;
1814 }
1815 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
1816 if (!pb)
1817 {
1818 GOMP_PLUGIN_error ("invalid device address");
1819 return false;
1820 }
1821 if (!h)
1822 {
1823 GOMP_PLUGIN_error ("invalid host address");
1824 return false;
1825 }
1826 if (d == h)
1827 {
1828 GOMP_PLUGIN_error ("invalid host or device address");
1829 return false;
1830 }
1831 if ((void *)(d + s) > (void *)(pb + ps))
1832 {
1833 GOMP_PLUGIN_error ("invalid size");
1834 return false;
1835 }
1836 return true;
1837}
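/* Sketch of the cuMemGetAddressRange check above in isolation: given any
   address inside a device allocation, the driver reports the base address
   and size of the containing allocation, so "offset + length <= size" is
   enough to validate a proposed transfer.  */

#include <cuda.h>
#include <stdbool.h>
#include <stddef.h>

static bool
transfer_fits_allocation (CUdeviceptr p, size_t len)
{
  CUdeviceptr base;
  size_t size;

  if (cuMemGetAddressRange (&base, &size, p) != CUDA_SUCCESS)
    return false;              /* Not a valid device address.  */
  return (size_t) (p - base) + len <= size;
}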
1838
1839bool
1840GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
1841{
1842 if (!nvptx_attach_host_thread_to_device (ord)
1843 || !cuda_memcpy_sanity_check (src, dst, n))
1844 return false;
1845 CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) dst, src, n);
1846 return true;
1847}
1848
1849bool
1850GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
1851{
1852 if (!nvptx_attach_host_thread_to_device (ord)
1853 || !cuda_memcpy_sanity_check (dst, src, n))
1854 return false;
1855 CUDA_CALL (cuMemcpyDtoH, dst, (CUdeviceptr) src, n);
1856 return true;
1857}
1858
1859bool
1860GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
1861{
1862 CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, NULL);
1863 return true;
1864}
1865
1866int
1867GOMP_OFFLOAD_memcpy2d (int dst_ord, int src_ord, size_t dim1_size,
1868 size_t dim0_len, void *dst, size_t dst_offset1_size,
1869 size_t dst_offset0_len, size_t dst_dim1_size,
1870 const void *src, size_t src_offset1_size,
1871 size_t src_offset0_len, size_t src_dim1_size)
1872{
1873 if (!nvptx_attach_host_thread_to_device (src_ord != -1 ? src_ord : dst_ord))
1874 return false;
1875
1876 /* TODO: Consider using CU_MEMORYTYPE_UNIFIED if supported. */
1877
1878 CUDA_MEMCPY2D data;
1879
1880 memset (&data, 0, sizeof (data));
1881 data.WidthInBytes = dim1_size;
1882 data.Height = dim0_len;
1883
1884 if (dst_ord == -1)
1885 {
1886 data.dstMemoryType = CU_MEMORYTYPE_HOST;
1887 data.dstHost = dst;
1888 }
1889 else
1890 {
1891 data.dstMemoryType = CU_MEMORYTYPE_DEVICE;
1892 data.dstDevice = (CUdeviceptr) dst;
1893 }
1894 data.dstPitch = dst_dim1_size;
1895 data.dstXInBytes = dst_offset1_size;
1896 data.dstY = dst_offset0_len;
1897
1898 if (src_ord == -1)
1899 {
1900 data.srcMemoryType = CU_MEMORYTYPE_HOST;
1901 data.srcHost = src;
1902 }
1903 else
1904 {
1905 data.srcMemoryType = CU_MEMORYTYPE_DEVICE;
1906 data.srcDevice = (CUdeviceptr) src;
1907 }
1908 data.srcPitch = src_dim1_size;
1909 data.srcXInBytes = src_offset1_size;
1910 data.srcY = src_offset0_len;
1911
1912 if (data.srcXInBytes != 0 || data.srcY != 0)
1913 {
1914 /* Adjust origin to the actual array data, else the CUDA 2D memory
1915 copy API calls below may fail to validate source/dest pointers
1916 correctly (especially for Fortran where the "virtual origin" of an
1917 array is often outside the stored data). */
1918 if (src_ord == -1)
1919 data.srcHost = (const void *) ((const char *) data.srcHost
1920 + data.srcY * data.srcPitch
1921 + data.srcXInBytes);
1922 else
1923 data.srcDevice += data.srcY * data.srcPitch + data.srcXInBytes;
1924 data.srcXInBytes = 0;
1925 data.srcY = 0;
1926 }
1927
1928 if (data.dstXInBytes != 0 || data.dstY != 0)
1929 {
1930 /* As above. */
1931 if (dst_ord == -1)
1932 data.dstHost = (void *) ((char *) data.dstHost
1933 + data.dstY * data.dstPitch
1934 + data.dstXInBytes);
1935 else
1936 data.dstDevice += data.dstY * data.dstPitch + data.dstXInBytes;
1937 data.dstXInBytes = 0;
1938 data.dstY = 0;
1939 }
1940
1941 CUresult res = CUDA_CALL_NOCHECK (cuMemcpy2D, &data);
1942 if (res == CUDA_ERROR_INVALID_VALUE)
1943 /* If pitch > CU_DEVICE_ATTRIBUTE_MAX_PITCH, or for device-to-device
1944 copies of (some) memory not allocated by cuMemAllocPitch, cuMemcpy2D
1945 fails with an error; fall back to the slower cuMemcpy2DUnaligned. */
1946 CUDA_CALL (cuMemcpy2DUnaligned, &data);
1947 else if (res != CUDA_SUCCESS)
1948 {
1949 GOMP_PLUGIN_error ("cuMemcpy2D error: %s", cuda_error (res));
1950 return false;
1951 }
1952 return true;
1953}
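/* A self-contained sketch of driving cuMemcpy2D directly, showing how the
   descriptor fields set up above fit together: copy a 16-byte x 4-row
   sub-rectangle out of a pitched host array into a tightly packed device
   buffer.  All sizes are illustrative; failures simply return nonzero.  */

#include <cuda.h>
#include <string.h>

#define CHECK(call) do { if ((call) != CUDA_SUCCESS) return 1; } while (0)

int
main (void)
{
  enum { SRC_PITCH = 64, WIDTH = 16, HEIGHT = 4 };
  char host[8 * SRC_PITCH];
  memset (host, 0x5a, sizeof host);

  CUdevice dev;
  CUcontext ctx;
  CUdeviceptr d;
  CHECK (cuInit (0));
  CHECK (cuDeviceGet (&dev, 0));
  CHECK (cuCtxCreate (&ctx, CU_CTX_SCHED_AUTO, dev));
  CHECK (cuMemAlloc (&d, WIDTH * HEIGHT));

  CUDA_MEMCPY2D c;
  memset (&c, 0, sizeof c);
  c.WidthInBytes = WIDTH;          /* Bytes per row actually copied.  */
  c.Height = HEIGHT;               /* Number of rows copied.  */
  c.srcMemoryType = CU_MEMORYTYPE_HOST;
  c.srcHost = host;
  c.srcPitch = SRC_PITCH;          /* Bytes between consecutive source rows.  */
  c.srcXInBytes = 8;               /* Start 8 bytes into each row...  */
  c.srcY = 2;                      /* ...and two rows down.  */
  c.dstMemoryType = CU_MEMORYTYPE_DEVICE;
  c.dstDevice = d;
  c.dstPitch = WIDTH;              /* Destination rows are packed.  */
  CHECK (cuMemcpy2D (&c));

  cuMemFree (d);
  cuCtxDestroy (ctx);
  return 0;
}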
1954
1955int
1956GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size,
1957 size_t dim1_len, size_t dim0_len, void *dst,
1958 size_t dst_offset2_size, size_t dst_offset1_len,
1959 size_t dst_offset0_len, size_t dst_dim2_size,
1960 size_t dst_dim1_len, const void *src,
1961 size_t src_offset2_size, size_t src_offset1_len,
1962 size_t src_offset0_len, size_t src_dim2_size,
1963 size_t src_dim1_len)
1964{
1965 if (!nvptx_attach_host_thread_to_device (src_ord != -1 ? src_ord : dst_ord))
1966 return false;
1967
1968 /* TODO: Consider using CU_MEMORYTYPE_UNIFIED if supported. */
1969
1970 CUDA_MEMCPY3D data;
1971
1972 memset (&data, 0, sizeof (data));
1973 data.WidthInBytes = dim2_size;
1974 data.Height = dim1_len;
1975 data.Depth = dim0_len;
1976
1977 if (dst_ord == -1)
1978 {
1979 data.dstMemoryType = CU_MEMORYTYPE_HOST;
1980 data.dstHost = dst;
1981 }
1982 else
1983 {
1984 data.dstMemoryType = CU_MEMORYTYPE_DEVICE;
1985 data.dstDevice = (CUdeviceptr) dst;
1986 }
1987 data.dstPitch = dst_dim2_size;
1988 data.dstHeight = dst_dim1_len;
1989 data.dstXInBytes = dst_offset2_size;
1990 data.dstY = dst_offset1_len;
1991 data.dstZ = dst_offset0_len;
1992
1993 if (src_ord == -1)
1994 {
1995 data.srcMemoryType = CU_MEMORYTYPE_HOST;
1996 data.srcHost = src;
1997 }
1998 else
1999 {
2000 data.srcMemoryType = CU_MEMORYTYPE_DEVICE;
2001 data.srcDevice = (CUdeviceptr) src;
2002 }
2003 data.srcPitch = src_dim2_size;
2004 data.srcHeight = src_dim1_len;
2005 data.srcXInBytes = src_offset2_size;
2006 data.srcY = src_offset1_len;
2007 data.srcZ = src_offset0_len;
2008
2009 if (data.srcXInBytes != 0 || data.srcY != 0 || data.srcZ != 0)
2010 {
2011 /* Adjust origin to the actual array data, else the CUDA 3D memory
2012 copy API call below may fail to validate source/dest pointers
2013 correctly (especially for Fortran where the "virtual origin" of an
2014 array is often outside the stored data). */
2015 if (src_ord == -1)
2016 data.srcHost
2017 = (const void *) ((const char *) data.srcHost
2018 + (data.srcZ * data.srcHeight + data.srcY)
2019 * data.srcPitch
2020 + data.srcXInBytes);
2021 else
2022 data.srcDevice
2023 += (data.srcZ * data.srcHeight + data.srcY) * data.srcPitch
2024 + data.srcXInBytes;
2025 data.srcXInBytes = 0;
2026 data.srcY = 0;
2027 data.srcZ = 0;
2028 }
2029
2030 if (data.dstXInBytes != 0 || data.dstY != 0 || data.dstZ != 0)
2031 {
2032 /* As above. */
2033 if (dst_ord == -1)
2034 data.dstHost = (void *) ((char *) data.dstHost
2035 + (data.dstZ * data.dstHeight + data.dstY)
2036 * data.dstPitch
2037 + data.dstXInBytes);
2038 else
2039 data.dstDevice
2040 += (data.dstZ * data.dstHeight + data.dstY) * data.dstPitch
2041 + data.dstXInBytes;
2042 data.dstXInBytes = 0;
2043 data.dstY = 0;
2044 data.dstZ = 0;
2045 }
2046
2047 CUDA_CALL (cuMemcpy3D, &data);
2048 return true;
2049}
2050
2051bool
2052GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src,
2053 size_t n, struct goacc_asyncqueue *aq)
2054{
2055 if (!nvptx_attach_host_thread_to_device (ord)
2056 || !cuda_memcpy_sanity_check (src, dst, n))
2057 return false;
2058 CUDA_CALL (cuMemcpyHtoDAsync, (CUdeviceptr) dst, src, n, aq->cuda_stream);
2059 return true;
2060}
2061
2062bool
2063GOMP_OFFLOAD_openacc_async_dev2host (int ord, void *dst, const void *src,
2064 size_t n, struct goacc_asyncqueue *aq)
2065{
2066 if (!nvptx_attach_host_thread_to_device (ord)
2067 || !cuda_memcpy_sanity_check (dst, src, n))
2068 return false;
2069 CUDA_CALL (cuMemcpyDtoHAsync, dst, (CUdeviceptr) src, n, aq->cuda_stream);
2070 return true;
41dbbb37 2071}
2072
2073union goacc_property_value
2074GOMP_OFFLOAD_openacc_get_property (int n, enum goacc_property prop)
2075{
2076 union goacc_property_value propval = { .val = 0 };
2077
2078 pthread_mutex_lock (&ptx_dev_lock);
2079
2080 if (n >= nvptx_get_num_devices () || n < 0 || ptx_devices[n] == NULL)
2081 {
2082 pthread_mutex_unlock (&ptx_dev_lock);
2083 return propval;
2084 }
2085
2086 struct ptx_device *ptx_dev = ptx_devices[n];
2087 switch (prop)
2088 {
2089 case GOACC_PROPERTY_MEMORY:
2090 {
2091 size_t total_mem;
2092
2093 CUDA_CALL_ERET (propval, cuDeviceTotalMem, &total_mem, ptx_dev->dev);
2094 propval.val = total_mem;
2095 }
2096 break;
2097 case GOACC_PROPERTY_FREE_MEMORY:
2098 {
2099 size_t total_mem;
2100 size_t free_mem;
2101 CUdevice ctxdev;
2102
2103 CUDA_CALL_ERET (propval, cuCtxGetDevice, &ctxdev);
2104 if (ptx_dev->dev == ctxdev)
2105 CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
2106 else if (ptx_dev->ctx)
2107 {
2108 CUcontext old_ctx;
2109
2110 CUDA_CALL_ERET (propval, cuCtxPushCurrent, ptx_dev->ctx);
2111 CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
2112 CUDA_CALL_ASSERT (cuCtxPopCurrent, &old_ctx);
2113 }
2114 else
2115 {
2116 CUcontext new_ctx;
2117
2118 CUDA_CALL_ERET (propval, cuCtxCreate, &new_ctx, CU_CTX_SCHED_AUTO,
2119 ptx_dev->dev);
2120 CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
2121 CUDA_CALL_ASSERT (cuCtxDestroy, new_ctx);
2122 }
2123 propval.val = free_mem;
2124 }
2125 break;
2126 case GOACC_PROPERTY_NAME:
2127 propval.ptr = ptx_dev->name;
2128 break;
2129 case GOACC_PROPERTY_VENDOR:
2130 propval.ptr = "Nvidia";
2131 break;
2132 case GOACC_PROPERTY_DRIVER:
2133 propval.ptr = cuda_driver_version_s;
2134 break;
2135 default:
2136 break;
2137 }
2138
2139 pthread_mutex_unlock (&ptx_dev_lock);
2140 return propval;
2141}
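/* Sketch of the user-visible side of this hook: the OpenACC 2.6 property
   API, which libgomp routes to GOMP_OFFLOAD_openacc_get_property.  Device
   number 0 and the nvidia device type are illustrative.  */

#include <openacc.h>
#include <stdio.h>

static void
print_device_summary (void)
{
  size_t total = acc_get_property (0, acc_device_nvidia, acc_property_memory);
  size_t avail = acc_get_property (0, acc_device_nvidia,
                                   acc_property_free_memory);
  const char *name
    = acc_get_property_string (0, acc_device_nvidia, acc_property_name);

  printf ("%s: %zu of %zu bytes free\n", name, avail, total);
}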
2142
2143 /* Adjust launch dimensions: pick good values for the number of blocks and
2144 warps, and ensure that the number of warps exceeds neither the CUDA limits
2145 nor GCC's own limits. */
2146
2147static void
2148nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
2149 struct ptx_device *ptx_dev,
2150 int *teams_p, int *threads_p)
2151{
2152 int max_warps_block = fn->max_threads_per_block / 32;
 2153 /* A maximum of 32 warps per block is an implementation limit in the NVPTX
 2154 backend and libgcc, and matches the documented limit of all GPUs as of 2015. */
2155 if (max_warps_block > 32)
2156 max_warps_block = 32;
2157 if (*threads_p <= 0)
2158 *threads_p = 8;
2159 if (*threads_p > max_warps_block)
2160 *threads_p = max_warps_block;
2161
2162 int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
 2163 /* This is an estimate of how many blocks the device can host simultaneously.
 2164 The actual limit, which may be lower, can be queried through the "occupancy
 2165 control" driver interface (available since CUDA 6.0). */
2166 int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
2167 if (*teams_p <= 0 || *teams_p > max_blocks)
2168 *teams_p = max_blocks;
2169}
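/* Worked example of the estimate above, with hypothetical numbers: a kernel
   compiled to use 32 registers per thread, launched with 8 warps per block
   on a device with 64K registers per SM and 20 SMs.  */

#include <stdio.h>

int
main (void)
{
  int regs_per_thread = 32;     /* From the kernel's launch descriptor.  */
  int warps_per_block = 8;      /* The *threads_p value chosen above.  */
  int regs_per_sm = 65536;      /* Device attribute.  */
  int num_sms = 20;             /* Device attribute.  */

  int regs_per_block = regs_per_thread * 32 * warps_per_block;  /* 8192 */
  int max_blocks = regs_per_sm / regs_per_block * num_sms;      /* 8 * 20 */

  printf ("estimated resident blocks: %d\n", max_blocks);       /* 160 */
  return 0;
}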
2170
2171/* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
2172 target regions. */
2173
2174static size_t
2175nvptx_stacks_size ()
2176{
2177 return 128 * 1024;
2178}
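/* Worked example of the footprint this implies: for an (illustrative)
   launch of 160 teams x 8 warps, GOMP_OFFLOAD_run below requests
   160 * 8 = 1280 per-warp stacks, i.e. 1280 * 128 KiB = 160 MiB of device
   memory, which nvptx_stacks_acquire allocates and caches as one block.  */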
2179
2180/* Return contiguous storage for NUM stacks, each SIZE bytes. The lock for
2181 the storage should be held on entry, and remains held on exit. */
2182
2183static void *
6b577a17 2184nvptx_stacks_acquire (struct ptx_device *ptx_dev, size_t size, int num)
6103184e 2185{
2186 if (ptx_dev->omp_stacks.ptr && ptx_dev->omp_stacks.size >= size * num)
2187 return (void *) ptx_dev->omp_stacks.ptr;
2188
2189 /* Free the old, too-small stacks. */
2190 if (ptx_dev->omp_stacks.ptr)
2191 {
2192 CUresult r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
2193 if (r != CUDA_SUCCESS)
2194 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s\n", cuda_error (r));
2195 r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
2196 if (r != CUDA_SUCCESS)
2197 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
2198 }
2199
2200 /* Make new and bigger stacks, and remember where we put them and how big
2201 they are. */
2202 CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &ptx_dev->omp_stacks.ptr,
2203 size * num);
2204 if (r != CUDA_SUCCESS)
2205 GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
2206
6b577a17 2207 ptx_dev->omp_stacks.size = size * num;
2208
6b577a17 2209 return (void *) ptx_dev->omp_stacks.ptr;
2210}
2211
2212
2213void
2214GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
2215{
2216 struct targ_fn_descriptor *tgt_fn_desc
2217 = (struct targ_fn_descriptor *) tgt_fn;
2218 CUfunction function = tgt_fn_desc->fn;
2219 const struct targ_fn_launch *launch = tgt_fn_desc->launch;
2220 const char *fn_name = launch->fn;
2221 CUresult r;
2222 struct ptx_device *ptx_dev = ptx_devices[ord];
2223 const char *maybe_abort_msg = "(perhaps abort was called)";
2224 int teams = 0, threads = 0;
2225
2226 if (!args)
2227 GOMP_PLUGIN_fatal ("No target arguments provided");
2228 while (*args)
2229 {
2230 intptr_t id = (intptr_t) *args++, val;
2231 if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
2232 val = (intptr_t) *args++;
2233 else
2234 val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
2235 if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
2236 continue;
2237 val = val > INT_MAX ? INT_MAX : val;
2238 id &= GOMP_TARGET_ARG_ID_MASK;
2239 if (id == GOMP_TARGET_ARG_NUM_TEAMS)
2240 teams = val;
2241 else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
2242 threads = val;
2243 }
2244 nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
2245
131d18e9 2246 bool reverse_offload = ptx_dev->rev_data != NULL;
2247 struct goacc_asyncqueue *reverse_offload_aq = NULL;
2248 if (reverse_offload)
2249 {
2250 reverse_offload_aq
2251 = nvptx_goacc_asyncqueue_construct (CU_STREAM_NON_BLOCKING);
2252 if (!reverse_offload_aq)
2253 exit (EXIT_FAILURE);
2254 }
2255
2256 size_t stack_size = nvptx_stacks_size ();
2257
2258 pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
2259 void *stacks = nvptx_stacks_acquire (ptx_dev, stack_size, teams * threads);
2260 void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
2261 size_t fn_args_size = sizeof fn_args;
2262 void *config[] = {
2263 CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
2264 CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
2265 CU_LAUNCH_PARAM_END
2266 };
2267 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
2268 " [(teams: %u), 1, 1] [(lanes: 32), (threads: %u), 1]\n",
2269 __FUNCTION__, fn_name, teams, threads);
2393d337 2270 r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
30486fab 2271 32, threads, 1, lowlat_pool_size, NULL, NULL, config);
2272 if (r != CUDA_SUCCESS)
2273 GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
2274 if (reverse_offload)
2275 while (true)
2276 {
2277 r = CUDA_CALL_NOCHECK (cuStreamQuery, NULL);
2278 if (r == CUDA_SUCCESS)
2279 break;
2280 if (r == CUDA_ERROR_LAUNCH_FAILED)
2281 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s %s\n", cuda_error (r),
2282 maybe_abort_msg);
2283 else if (r != CUDA_ERROR_NOT_READY)
2284 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
2285
2286 if (__atomic_load_n (&ptx_dev->rev_data->fn, __ATOMIC_ACQUIRE) != 0)
2287 {
2288 struct rev_offload *rev_data = ptx_dev->rev_data;
2289 GOMP_PLUGIN_target_rev (rev_data->fn, rev_data->mapnum,
2290 rev_data->addrs, rev_data->sizes,
2291 rev_data->kinds, rev_data->dev_num,
2292 reverse_offload_aq);
2293 if (!nvptx_goacc_asyncqueue_synchronize (reverse_offload_aq))
2294 exit (EXIT_FAILURE);
2295 __atomic_store_n (&rev_data->fn, 0, __ATOMIC_RELEASE);
2296 }
2297 usleep (1);
2298 }
2299 else
2300 r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
2301 if (r == CUDA_ERROR_LAUNCH_FAILED)
2302 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
2303 maybe_abort_msg);
2304 else if (r != CUDA_SUCCESS)
2305 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
2306
2307 pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
2308
2309 if (reverse_offload)
2310 {
2311 if (!nvptx_goacc_asyncqueue_destruct (reverse_offload_aq))
2312 exit (EXIT_FAILURE);
2313 }
2314}
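/* Sketch of the argument-passing convention used in the launch above: the
   kernel parameters are packed into one flat buffer that is handed to
   cuLaunchKernel through its "extra" argument rather than kernelParams.
   FUNCTION is assumed to be a CUfunction obtained via cuModuleGetFunction;
   shared memory size and error checking are omitted for brevity.  */

#include <cuda.h>
#include <stddef.h>

static void
launch_packed (CUfunction function, void *vars, void *stacks,
               size_t stack_size, int teams, int warps_per_team)
{
  void *fn_args[] = { vars, stacks, (void *) stack_size };
  size_t fn_args_size = sizeof fn_args;
  void *config[] = {
    CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
    CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
    CU_LAUNCH_PARAM_END
  };

  /* Grid: TEAMS x 1 x 1 blocks; block: 32 lanes x WARPS_PER_TEAM warps.  */
  cuLaunchKernel (function, teams, 1, 1, 32, warps_per_team, 1,
                  0 /* sharedMemBytes */, NULL /* default stream */,
                  NULL /* kernelParams */, config);
}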
2315
001ab12e 2316/* TODO: Implement GOMP_OFFLOAD_async_run. */