1 /* Plugin for NVPTX execution.
2
3 Copyright (C) 2013-2022 Free Software Foundation, Inc.
4
5 Contributed by Mentor Embedded.
6
7 This file is part of the GNU Offloading and Multi Processing Library
8 (libgomp).
9
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
13 any later version.
14
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
18 more details.
19
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
23
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
28
29 /* Nvidia PTX-specific parts of OpenACC support.  The cuda driver
30 library appears to hold some implicit state, but the documentation
31 is not clear as to what that state might be, or how one might
32 propagate it from one thread to another.  */
33
34 #define _GNU_SOURCE
35 #include "openacc.h"
36 #include "config.h"
37 #include "symcat.h"
38 #include "libgomp-plugin.h"
39 #include "oacc-plugin.h"
40 #include "gomp-constants.h"
41 #include "oacc-int.h"
42
43 /* For struct rev_offload + GOMP_REV_OFFLOAD_VAR. */
44 #include "config/nvptx/libgomp-nvptx.h"
45
46 #include <pthread.h>
47 #ifndef PLUGIN_NVPTX_INCLUDE_SYSTEM_CUDA_H
48 # include "cuda/cuda.h"
49 #else
50 # include <cuda.h>
51 #endif
52 #include <stdbool.h>
53 #include <limits.h>
54 #include <string.h>
55 #include <stdio.h>
56 #include <unistd.h>
57 #include <assert.h>
58 #include <errno.h>
59 #include <stdlib.h>
60
61 /* An arbitrary fixed limit (128MB) for the size of the OpenMP soft stacks
62 block to cache between kernel invocations. For soft-stacks blocks bigger
63 than this, we will free the block before attempting another GPU memory
64 allocation (i.e. in GOMP_OFFLOAD_alloc). Otherwise, if an allocation fails,
65 we will free the cached soft-stacks block anyway and then retry the
66 allocation. If that fails too, we lose. */
67
68 #define SOFTSTACK_CACHE_LIMIT 134217728
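/* 134217728 == 128 * 1024 * 1024, i.e. the 128MB mentioned above.  */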
69
70 #if CUDA_VERSION < 6000
71 extern CUresult cuGetErrorString (CUresult, const char **);
72 #define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
73 #endif
74
75 #if CUDA_VERSION >= 6050
76 #undef cuLinkCreate
77 #undef cuLinkAddData
78 CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
79 const char *, unsigned, CUjit_option *, void **);
80 CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
81 #undef cuMemHostRegister
82 CUresult cuMemHostRegister (void *, size_t, unsigned int);
83 #else
84 typedef size_t (*CUoccupancyB2DSize)(int);
85 CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
86 const char *, unsigned, CUjit_option *, void **);
87 CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
88 CUresult cuMemHostRegister_v2 (void *, size_t, unsigned int);
89 CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
90 CUoccupancyB2DSize, size_t, int);
91 #endif
92
93 #define DO_PRAGMA(x) _Pragma (#x)
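/* The DO_PRAGMA indirection ensures that a macro argument (for example the
   function name passed to CUDA_ONE_CALL_MAYBE_NULL below) is macro-expanded
   before being stringized for _Pragma.  */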
94
95 #ifndef PLUGIN_NVPTX_LINK_LIBCUDA
96 # include <dlfcn.h>
97
98 struct cuda_lib_s {
99
100 # define CUDA_ONE_CALL(call) \
101 __typeof (call) *call;
102 # define CUDA_ONE_CALL_MAYBE_NULL(call) \
103 CUDA_ONE_CALL (call)
104 #include "cuda-lib.def"
105 # undef CUDA_ONE_CALL
106 # undef CUDA_ONE_CALL_MAYBE_NULL
107
108 } cuda_lib;
109
110 /* -1 if init_cuda_lib has not been called yet, false
111 if it has been and failed, true if it has been and succeeded. */
112 static signed char cuda_lib_inited = -1;
113
114 /* Dynamically load the CUDA driver library (libcuda) and initialize its
115 function pointers; return false if unsuccessful, true if successful.  */
116 static bool
117 init_cuda_lib (void)
118 {
119 if (cuda_lib_inited != -1)
120 return cuda_lib_inited;
121 const char *cuda_runtime_lib = "libcuda.so.1";
122 void *h = dlopen (cuda_runtime_lib, RTLD_LAZY);
123 cuda_lib_inited = false;
124 if (h == NULL)
125 return false;
126
127 # define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false)
128 # define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true)
129 # define CUDA_ONE_CALL_1(call, allow_null) \
130 cuda_lib.call = dlsym (h, #call); \
131 if (!allow_null && cuda_lib.call == NULL) \
132 return false;
133 #include "cuda-lib.def"
134 # undef CUDA_ONE_CALL
135 # undef CUDA_ONE_CALL_1
136 # undef CUDA_ONE_CALL_MAYBE_NULL
137
138 cuda_lib_inited = true;
139 return true;
140 }
141 # define CUDA_CALL_PREFIX cuda_lib.
142 #else
143
144 # define CUDA_ONE_CALL(call)
145 # define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call)
146 #include "cuda-lib.def"
147 #undef CUDA_ONE_CALL_MAYBE_NULL
148 #undef CUDA_ONE_CALL
149
150 # define CUDA_CALL_PREFIX
151 # define init_cuda_lib() true
152 #endif
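/* To illustrate the X-macro scheme above, assume "cuda-lib.def" contains an
   entry such as 'CUDA_ONE_CALL (cuMemAlloc)' (a sketch for the reader; the
   exact entries live in that file).  In the dlopen path it expands to a
   member of 'struct cuda_lib_s':

     __typeof (cuMemAlloc) *cuMemAlloc;

   and, inside 'init_cuda_lib', to:

     cuda_lib.cuMemAlloc = dlsym (h, "cuMemAlloc");
     if (!false && cuda_lib.cuMemAlloc == NULL)
       return false;

   so that 'CUDA_CALL_PREFIX cuMemAlloc (...)' becomes
   'cuda_lib.cuMemAlloc (...)'.  In the direct-link path the same entry
   expands to nothing (MAYBE_NULL entries get a '#pragma weak'), and
   CUDA_CALL_PREFIX is empty, so calls go straight to the linked libcuda.  */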
153
154 #include "secure_getenv.h"
155
156 #undef MIN
157 #undef MAX
158 #define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
159 #define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
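/* Note that MIN and MAX evaluate their arguments more than once, so they
   must only be used with side-effect-free operands.  */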
160
161 /* Convenience macros for the frequently used CUDA library call and
162 error handling sequence as well as CUDA library calls that
163 do the error checking themselves or don't do it at all. */
164
165 #define CUDA_CALL_ERET(ERET, FN, ...) \
166 do { \
167 unsigned __r \
168 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
169 if (__r != CUDA_SUCCESS) \
170 { \
171 GOMP_PLUGIN_error (#FN " error: %s", \
172 cuda_error (__r)); \
173 return ERET; \
174 } \
175 } while (0)
176
177 #define CUDA_CALL(FN, ...) \
178 CUDA_CALL_ERET (false, FN, __VA_ARGS__)
179
180 #define CUDA_CALL_ASSERT(FN, ...) \
181 do { \
182 unsigned __r \
183 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
184 if (__r != CUDA_SUCCESS) \
185 { \
186 GOMP_PLUGIN_fatal (#FN " error: %s", \
187 cuda_error (__r)); \
188 } \
189 } while (0)
190
191 #define CUDA_CALL_NOCHECK(FN, ...) \
192 CUDA_CALL_PREFIX FN (__VA_ARGS__)
193
194 #define CUDA_CALL_EXISTS(FN) \
195 CUDA_CALL_PREFIX FN
196
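/* Illustrative usage of the macros above (a sketch, not code taken from this
   file; 'd' and 'size' stand for a CUdeviceptr and a size_t in scope):

     CUDA_CALL (cuMemAlloc, &d, size);        // report error, return false
     CUDA_CALL_ERET (NULL, cuMemAlloc, &d, size);   // ditto, but return NULL
     CUDA_CALL_ASSERT (cuMemFree, d);         // fatal error on failure
     CUresult r = CUDA_CALL_NOCHECK (cuMemFree, d); // caller checks R itself
     if (CUDA_CALL_EXISTS (cuGetErrorString)) // optional entry point present?
       ...

   All of them dispatch through CUDA_CALL_PREFIX, i.e. either through the
   dlopen'd 'cuda_lib' function-pointer table or directly to the linked
   libcuda.  */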
197 static const char *
198 cuda_error (CUresult r)
199 {
200 const char *fallback = "unknown cuda error";
201 const char *desc;
202
203 if (!CUDA_CALL_EXISTS (cuGetErrorString))
204 return fallback;
205
206 r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
207 if (r == CUDA_SUCCESS)
208 return desc;
209
210 return fallback;
211 }
212
213 /* Version of CUDA supported by the driver, in the same MAJOR.MINOR format used by
214 Nvidia, such as in the 'deviceQuery' program (Nvidia's CUDA samples).  */
215 static char cuda_driver_version_s[30];
216
217 static unsigned int instantiated_devices = 0;
218 static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
219
220 /* NVPTX/CUDA specific definition of asynchronous queues. */
221 struct goacc_asyncqueue
222 {
223 CUstream cuda_stream;
224 pthread_mutex_t page_locked_host_unregister_blocks_lock;
225 struct ptx_free_block *page_locked_host_unregister_blocks;
226 };
227
228 struct nvptx_callback
229 {
230 void (*fn) (void *);
231 void *ptr;
232 struct goacc_asyncqueue *aq;
233 struct nvptx_callback *next;
234 };
235
236 /* Thread-specific data for PTX. */
237
238 struct nvptx_thread
239 {
240 /* We currently have this embedded inside the plugin because libgomp manages
241 devices through integer target_ids. This might be better if using an
242 opaque target-specific pointer directly from gomp_device_descr. */
243 struct ptx_device *ptx_dev;
244 };
245
246 /* Target data function launch information. */
247
248 struct targ_fn_launch
249 {
250 const char *fn;
251 unsigned short dim[GOMP_DIM_MAX];
252 };
253
254 /* Target PTX object information. */
255
256 struct targ_ptx_obj
257 {
258 const char *code;
259 size_t size;
260 };
261
262 /* Target data image information. */
263
264 typedef struct nvptx_tdata
265 {
266 const struct targ_ptx_obj *ptx_objs;
267 unsigned ptx_num;
268
269 const char *const *var_names;
270 unsigned var_num;
271
272 const struct targ_fn_launch *fn_descs;
273 unsigned fn_num;
274 } nvptx_tdata_t;
275
276 /* Descriptor of a loaded function. */
277
278 struct targ_fn_descriptor
279 {
280 CUfunction fn;
281 const struct targ_fn_launch *launch;
282 int regs_per_thread;
283 int max_threads_per_block;
284 };
285
286 /* A loaded PTX image. */
287 struct ptx_image_data
288 {
289 const void *target_data;
290 CUmodule module;
291
292 struct targ_fn_descriptor *fns; /* Array of functions. */
293
294 struct ptx_image_data *next;
295 };
296
297 struct ptx_free_block
298 {
299 void *ptr;
300 struct ptx_free_block *next;
301 };
302
303 struct ptx_device
304 {
305 CUcontext ctx;
306 bool ctx_shared;
307 CUdevice dev;
308
309 int ord;
310 bool overlap;
311 bool map;
312 bool concur;
313 bool mkern;
314 int mode;
315 int clock_khz;
316 int num_sms;
317 int regs_per_block;
318 int regs_per_sm;
319 int warp_size;
320 int max_threads_per_block;
321 int max_threads_per_multiprocessor;
322 bool read_only_host_register_supported;
323 int default_dims[GOMP_DIM_MAX];
324 int compute_major, compute_minor;
325
326 /* Length as used by the CUDA Runtime API ('struct cudaDeviceProp'). */
327 char name[256];
328
329 struct ptx_image_data *images; /* Images loaded on device. */
330 pthread_mutex_t image_lock; /* Lock for above list. */
331
332 struct ptx_free_block *free_blocks;
333 pthread_mutex_t free_blocks_lock;
334
335 /* OpenMP stacks, cached between kernel invocations. */
336 struct
337 {
338 CUdeviceptr ptr;
339 size_t size;
340 pthread_mutex_t lock;
341 } omp_stacks;
342
343 struct rev_offload *rev_data;
344 struct ptx_device *next;
345 };
346
347 static struct ptx_device **ptx_devices;
348
349 static struct ptx_free_block *free_host_blocks = NULL;
350 static pthread_mutex_t free_host_blocks_lock = PTHREAD_MUTEX_INITIALIZER;
351
352 static bool
353 nvptx_run_deferred_page_locked_host_free (void)
354 {
355 GOMP_PLUGIN_debug (0, "%s\n",
356 __FUNCTION__);
357
358 pthread_mutex_lock (&free_host_blocks_lock);
359 struct ptx_free_block *b = free_host_blocks;
360 free_host_blocks = NULL;
361 pthread_mutex_unlock (&free_host_blocks_lock);
362
363 while (b)
364 {
365 GOMP_PLUGIN_debug (0, " b=%p: cuMemFreeHost(b->ptr=%p)\n",
366 b, b->ptr);
367
368 struct ptx_free_block *b_next = b->next;
369 CUDA_CALL (cuMemFreeHost, b->ptr);
370 free (b);
371 b = b_next;
372 }
373 return true;
374 }
375
376 /* OpenMP kernels reserve a small amount of ".shared" space for use by
377 omp_alloc. The size is configured using GOMP_NVPTX_LOWLAT_POOL, but the
378 default is set here. */
379 static unsigned lowlat_pool_size = 8*1024;
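/* For example, running with 'GOMP_NVPTX_LOWLAT_POOL=16384' in the environment
   (parsed in GOMP_OFFLOAD_init_device below) doubles the default 8 KiB to
   16 KiB of ".shared" space reserved for omp_alloc; 16384 is just an
   illustration, any decimal value that fits in 'unsigned int' is accepted.  */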
380
381 static bool nvptx_do_global_cdtors (CUmodule, struct ptx_device *,
382 const char *);
383 static size_t nvptx_stacks_size ();
384 static void *nvptx_stacks_acquire (struct ptx_device *, size_t, int);
385
386 static inline struct nvptx_thread *
387 nvptx_thread (void)
388 {
389 return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
390 }
391
392 /* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK
393 should be locked on entry and remains locked on exit. */
394
395 static bool
396 nvptx_init (void)
397 {
398 int ndevs;
399
400 if (instantiated_devices != 0)
401 return true;
402
403 if (!init_cuda_lib ())
404 return false;
405
406 CUDA_CALL (cuInit, 0);
407
408 int cuda_driver_version;
409 CUDA_CALL_ERET (false, cuDriverGetVersion, &cuda_driver_version);
410 snprintf (cuda_driver_version_s, sizeof cuda_driver_version_s,
411 "CUDA Driver %u.%u",
412 cuda_driver_version / 1000, cuda_driver_version % 1000 / 10);
413
414 CUDA_CALL (cuDeviceGetCount, &ndevs);
415 ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
416 * ndevs);
417
418 return true;
419 }
420
421 /* Select the N'th PTX device for the current host thread.  The device must
422 have been opened before calling this function.  */
423
424 static bool
425 nvptx_attach_host_thread_to_device (int n)
426 {
427 CUdevice dev;
428 CUresult r;
429 struct ptx_device *ptx_dev;
430 CUcontext thd_ctx;
431
432 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
433 if (r == CUDA_ERROR_NOT_PERMITTED)
434 {
435 /* Assume we're in a CUDA callback, just return true. */
436 return true;
437 }
438 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
439 {
440 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
441 return false;
442 }
443
444 if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
445 return true;
446 else
447 {
448 CUcontext old_ctx;
449
450 ptx_dev = ptx_devices[n];
451 if (!ptx_dev)
452 {
453 GOMP_PLUGIN_error ("device %d not found", n);
454 return false;
455 }
456
457 CUDA_CALL (cuCtxGetCurrent, &thd_ctx);
458
459 /* We don't necessarily have a current context (e.g. if it has been
460 destroyed).  Pop it if we do though.  */
461 if (thd_ctx != NULL)
462 CUDA_CALL (cuCtxPopCurrent, &old_ctx);
463
464 CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
465 }
466 return true;
467 }
468
469 static struct ptx_device *
470 nvptx_open_device (int n)
471 {
472 struct ptx_device *ptx_dev;
473 CUdevice dev, ctx_dev;
474 CUresult r;
475 int pi;
476
477 CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);
478
479 ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));
480
481 ptx_dev->ord = n;
482 ptx_dev->dev = dev;
483 ptx_dev->ctx_shared = false;
484
485 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
486 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
487 {
488 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
489 return NULL;
490 }
491
492 if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
493 {
494 /* The current host thread has an active context for a different device.
495 Detach it. */
496 CUcontext old_ctx;
497 CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
498 }
499
500 CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);
501
502 if (!ptx_dev->ctx)
503 CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
504 else
505 ptx_dev->ctx_shared = true;
506
507 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
508 &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
509 ptx_dev->overlap = pi;
510
511 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
512 &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
513 ptx_dev->map = pi;
514
515 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
516 &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
517 ptx_dev->concur = pi;
518
519 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
520 &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
521 ptx_dev->mode = pi;
522
523 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
524 &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
525 ptx_dev->mkern = pi;
526
527 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
528 &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
529 ptx_dev->clock_khz = pi;
530
531 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
532 &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
533 ptx_dev->num_sms = pi;
534
535 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
536 &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
537 ptx_dev->regs_per_block = pi;
538
539 /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only
540 in CUDA 6.0 and newer. */
541 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
542 CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
543 dev);
544 /* Fallback: use limit of registers per block, which is usually equal. */
545 if (r == CUDA_ERROR_INVALID_VALUE)
546 pi = ptx_dev->regs_per_block;
547 else if (r != CUDA_SUCCESS)
548 {
549 GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
550 return NULL;
551 }
552 ptx_dev->regs_per_sm = pi;
553
554 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
555 &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
556 if (pi != 32)
557 {
558 GOMP_PLUGIN_error ("Only warp size 32 is supported");
559 return NULL;
560 }
561 ptx_dev->warp_size = pi;
562
563 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
564 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
565 ptx_dev->max_threads_per_block = pi;
566
567 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
568 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
569 ptx_dev->max_threads_per_multiprocessor = pi;
570
571 /* Required below for reverse offload as implemented, but with compute
572 capability >= 2.0 and 64-bit device processes, this should universally
573 be the case; hence, an assert.  */
574 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
575 CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev);
576 assert (r == CUDA_SUCCESS && pi);
577
578 /* This is a CUDA 11.1 feature. */
579 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
580 CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED,
581 dev);
582 if (r == CUDA_ERROR_INVALID_VALUE)
583 pi = false;
584 else if (r != CUDA_SUCCESS)
585 {
586 GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
587 return NULL;
588 }
589 ptx_dev->read_only_host_register_supported = pi;
590
591 for (int i = 0; i != GOMP_DIM_MAX; i++)
592 ptx_dev->default_dims[i] = 0;
593
594 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
595 CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev);
596 ptx_dev->compute_major = pi;
597
598 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
599 CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev);
600 ptx_dev->compute_minor = pi;
601
602 CUDA_CALL_ERET (NULL, cuDeviceGetName, ptx_dev->name, sizeof ptx_dev->name,
603 dev);
604
605 ptx_dev->images = NULL;
606 pthread_mutex_init (&ptx_dev->image_lock, NULL);
607
608 ptx_dev->free_blocks = NULL;
609 pthread_mutex_init (&ptx_dev->free_blocks_lock, NULL);
610
611 ptx_dev->omp_stacks.ptr = 0;
612 ptx_dev->omp_stacks.size = 0;
613 pthread_mutex_init (&ptx_dev->omp_stacks.lock, NULL);
614
615 ptx_dev->rev_data = NULL;
616
617 return ptx_dev;
618 }
619
620 static bool
621 nvptx_close_device (struct ptx_device *ptx_dev)
622 {
623 if (!ptx_dev)
624 return true;
625
626 bool ret = true;
627
628 for (struct ptx_image_data *image = ptx_dev->images;
629 image != NULL;
630 image = image->next)
631 {
632 if (!nvptx_do_global_cdtors (image->module, ptx_dev,
633 "__do_global_dtors__entry"))
634 ret = false;
635 }
636
637 for (struct ptx_free_block *b = ptx_dev->free_blocks; b;)
638 {
639 struct ptx_free_block *b_next = b->next;
640 CUDA_CALL (cuMemFree, (CUdeviceptr) b->ptr);
641 free (b);
642 b = b_next;
643 }
644
645 pthread_mutex_destroy (&ptx_dev->free_blocks_lock);
646 pthread_mutex_destroy (&ptx_dev->image_lock);
647
648 pthread_mutex_destroy (&ptx_dev->omp_stacks.lock);
649
650 if (ptx_dev->omp_stacks.ptr)
651 CUDA_CALL (cuMemFree, ptx_dev->omp_stacks.ptr);
652
653 if (!ptx_dev->ctx_shared)
654 CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
655
656 free (ptx_dev);
657
658 return ret;
659 }
660
661 static int
662 nvptx_get_num_devices (void)
663 {
664 int n;
665
666 /* This function will be called before the plugin has been initialized in
667 order to enumerate available devices, but CUDA API routines can't be used
668 until cuInit has been called. Just call it now (but don't yet do any
669 further initialization). */
670 if (instantiated_devices == 0)
671 {
672 if (!init_cuda_lib ())
673 return 0;
674 CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
675 /* This is not an error: e.g. we may have CUDA libraries installed but
676 no devices available. */
677 if (r != CUDA_SUCCESS)
678 {
679 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
680 cuda_error (r));
681 return 0;
682 }
683 }
684
685 CUDA_CALL_ERET (-1, cuDeviceGetCount, &n);
686 return n;
687 }
688
689 static void
690 notify_var (const char *var_name, const char *env_var)
691 {
692 if (env_var == NULL)
693 GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name);
694 else
695 GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var);
696 }
697
698 static void
699 process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o)
700 {
701 const char *var_name = "GOMP_NVPTX_JIT";
702 const char *env_var = secure_getenv (var_name);
703 notify_var (var_name, env_var);
704
705 if (env_var == NULL)
706 return;
707
708 const char *c = env_var;
709 while (*c != '\0')
710 {
711 while (*c == ' ')
712 c++;
713
714 if (c[0] == '-' && c[1] == 'O'
715 && '0' <= c[2] && c[2] <= '4'
716 && (c[3] == '\0' || c[3] == ' '))
717 {
718 *gomp_nvptx_o = c[2] - '0';
719 c += 3;
720 continue;
721 }
722
723 GOMP_PLUGIN_error ("Error parsing %s", var_name);
724 break;
725 }
726 }
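/* For example, 'GOMP_NVPTX_JIT=-O2' makes the parser above set *gomp_nvptx_o
   to 2, which link_ptx below forwards to the PTX JIT via
   CU_JIT_OPTIMIZATION_LEVEL.  Only '-O0' through '-O4' are recognized;
   anything else is reported as a parse error.  */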
727
728 static bool
729 link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
730 unsigned num_objs)
731 {
732 CUjit_option opts[7];
733 void *optvals[7];
734 float elapsed = 0.0;
735 char elog[1024];
736 char ilog[16384];
737 CUlinkState linkstate;
738 CUresult r;
739 void *linkout;
740 size_t linkoutsize __attribute__ ((unused));
741
742 opts[0] = CU_JIT_WALL_TIME;
743 optvals[0] = &elapsed;
744
745 opts[1] = CU_JIT_INFO_LOG_BUFFER;
746 optvals[1] = &ilog[0];
747
748 opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
749 optvals[2] = (void *) sizeof ilog;
750
751 opts[3] = CU_JIT_ERROR_LOG_BUFFER;
752 optvals[3] = &elog[0];
753
754 opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
755 optvals[4] = (void *) sizeof elog;
756
757 opts[5] = CU_JIT_LOG_VERBOSE;
758 optvals[5] = (void *) 1;
759
760 static intptr_t gomp_nvptx_o = -1;
761
762 static bool init_done = false;
763 if (!init_done)
764 {
765 process_GOMP_NVPTX_JIT (&gomp_nvptx_o);
766 init_done = true;
767 }
768
769 int nopts = 6;
770 if (gomp_nvptx_o != -1)
771 {
772 opts[nopts] = CU_JIT_OPTIMIZATION_LEVEL;
773 optvals[nopts] = (void *) gomp_nvptx_o;
774 nopts++;
775 }
776
777 if (CUDA_CALL_EXISTS (cuLinkCreate_v2))
778 CUDA_CALL (cuLinkCreate_v2, nopts, opts, optvals, &linkstate);
779 else
780 CUDA_CALL (cuLinkCreate, nopts, opts, optvals, &linkstate);
781
782 for (; num_objs--; ptx_objs++)
783 {
784 /* cuLinkAddData's 'data' argument erroneously omits the const
785 qualifier. */
786 GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
787 if (CUDA_CALL_EXISTS (cuLinkAddData_v2))
788 r = CUDA_CALL_NOCHECK (cuLinkAddData_v2, linkstate, CU_JIT_INPUT_PTX,
789 (char *) ptx_objs->code, ptx_objs->size,
790 0, 0, 0, 0);
791 else
792 r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
793 (char *) ptx_objs->code, ptx_objs->size,
794 0, 0, 0, 0);
795 if (r != CUDA_SUCCESS)
796 {
797 GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
798 GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
799 cuda_error (r));
800 return false;
801 }
802 }
803
804 GOMP_PLUGIN_debug (0, "Linking\n");
805 r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);
806
807 GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
808 GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
809
810 if (r != CUDA_SUCCESS)
811 {
812 GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
813 GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
814 return false;
815 }
816
817 CUDA_CALL (cuModuleLoadData, module, linkout);
818 CUDA_CALL (cuLinkDestroy, linkstate);
819 return true;
820 }
821
822 static void
823 nvptx_exec (void (*fn), unsigned *dims, void *targ_mem_desc,
824 CUdeviceptr dp, CUstream stream)
825 {
826 struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
827 CUfunction function;
828 int i;
829 void *kargs[1];
830 struct nvptx_thread *nvthd = nvptx_thread ();
831 int warp_size = nvthd->ptx_dev->warp_size;
832
833 function = targ_fn->fn;
834
835 /* Initialize the launch dimensions. Typically this is constant,
836 provided by the device compiler, but we must permit runtime
837 values. */
838 int seen_zero = 0;
839 for (i = 0; i != GOMP_DIM_MAX; i++)
840 {
841 if (targ_fn->launch->dim[i])
842 dims[i] = targ_fn->launch->dim[i];
843 if (!dims[i])
844 seen_zero = 1;
845 }
846
847 if (seen_zero)
848 {
849 pthread_mutex_lock (&ptx_dev_lock);
850
851 static int gomp_openacc_dims[GOMP_DIM_MAX];
852 if (!gomp_openacc_dims[0])
853 {
854 /* See if the user provided GOMP_OPENACC_DIM environment
855 variable to specify runtime defaults. */
856 for (int i = 0; i < GOMP_DIM_MAX; ++i)
857 gomp_openacc_dims[i] = GOMP_PLUGIN_acc_default_dim (i);
858 }
859
860 if (!nvthd->ptx_dev->default_dims[0])
861 {
862 int default_dims[GOMP_DIM_MAX];
863 for (int i = 0; i < GOMP_DIM_MAX; ++i)
864 default_dims[i] = gomp_openacc_dims[i];
865
866 int gang, worker, vector;
867 {
868 int block_size = nvthd->ptx_dev->max_threads_per_block;
869 int cpu_size = nvthd->ptx_dev->max_threads_per_multiprocessor;
870 int dev_size = nvthd->ptx_dev->num_sms;
871 GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
872 " dev_size=%d, cpu_size=%d\n",
873 warp_size, block_size, dev_size, cpu_size);
874
875 gang = (cpu_size / block_size) * dev_size;
876 worker = block_size / warp_size;
877 vector = warp_size;
878 }
879
880 /* There is no upper bound on the gang size. The best size
881 matches the hardware configuration. Logical gangs are
882 scheduled onto physical hardware. To maximize usage, we
883 should guess a large number. */
884 if (default_dims[GOMP_DIM_GANG] < 1)
885 default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
886 /* The worker size must not exceed the hardware. */
887 if (default_dims[GOMP_DIM_WORKER] < 1
888 || (default_dims[GOMP_DIM_WORKER] > worker && gang))
889 default_dims[GOMP_DIM_WORKER] = worker;
890 /* The vector size must exactly match the hardware. */
891 if (default_dims[GOMP_DIM_VECTOR] < 1
892 || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
893 default_dims[GOMP_DIM_VECTOR] = vector;
894
895 GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
896 default_dims[GOMP_DIM_GANG],
897 default_dims[GOMP_DIM_WORKER],
898 default_dims[GOMP_DIM_VECTOR]);
899
900 for (i = 0; i != GOMP_DIM_MAX; i++)
901 nvthd->ptx_dev->default_dims[i] = default_dims[i];
902 }
903 pthread_mutex_unlock (&ptx_dev_lock);
904
905 {
906 bool default_dim_p[GOMP_DIM_MAX];
907 for (i = 0; i != GOMP_DIM_MAX; i++)
908 default_dim_p[i] = !dims[i];
909
910 if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize))
911 {
912 for (i = 0; i != GOMP_DIM_MAX; i++)
913 if (default_dim_p[i])
914 dims[i] = nvthd->ptx_dev->default_dims[i];
915
916 if (default_dim_p[GOMP_DIM_VECTOR])
917 dims[GOMP_DIM_VECTOR]
918 = MIN (dims[GOMP_DIM_VECTOR],
919 (targ_fn->max_threads_per_block / warp_size
920 * warp_size));
921
922 if (default_dim_p[GOMP_DIM_WORKER])
923 dims[GOMP_DIM_WORKER]
924 = MIN (dims[GOMP_DIM_WORKER],
925 targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
926 }
927 else
928 {
929 /* Handle the case that the compiler allows the runtime to choose
930 the vector-length conservatively, by ignoring
931 gomp_openacc_dims[GOMP_DIM_VECTOR]. TODO: actually handle
932 it. */
933 int vectors = 0;
934 /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that
935 gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
936 exceed targ_fn->max_threads_per_block.  */
937 int workers = gomp_openacc_dims[GOMP_DIM_WORKER];
938 int gangs = gomp_openacc_dims[GOMP_DIM_GANG];
939 int grids, blocks;
940
941 CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
942 &blocks, function, NULL, 0,
943 dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
944 GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
945 "grid = %d, block = %d\n", grids, blocks);
946
947 /* Keep the num_gangs proportional to the block size.  In
948 the case where a block size is limited by shared-memory
949 or the register file capacity, the runtime will not
950 excessively over-assign gangs to the multiprocessor
951 units if their state is going to be swapped out even
952 more than necessary.  The constant factor 2 is there to
953 prevent threads from idling when there is insufficient
954 work for them.  */
955 if (gangs == 0)
956 gangs = 2 * grids * (blocks / warp_size);
957
958 if (vectors == 0)
959 vectors = warp_size;
960
961 if (workers == 0)
962 {
963 int actual_vectors = (default_dim_p[GOMP_DIM_VECTOR]
964 ? vectors
965 : dims[GOMP_DIM_VECTOR]);
966 workers = blocks / actual_vectors;
967 workers = MAX (workers, 1);
968 /* If we need a per-worker barrier ... . */
969 if (actual_vectors > 32)
970 /* Don't use more barriers than available. */
971 workers = MIN (workers, 15);
972 }
973
974 for (i = 0; i != GOMP_DIM_MAX; i++)
975 if (default_dim_p[i])
976 switch (i)
977 {
978 case GOMP_DIM_GANG: dims[i] = gangs; break;
979 case GOMP_DIM_WORKER: dims[i] = workers; break;
980 case GOMP_DIM_VECTOR: dims[i] = vectors; break;
981 default: GOMP_PLUGIN_fatal ("invalid dim");
982 }
983 }
984 }
985 }
986
987 /* Check if the accelerator has sufficient hardware resources to
988 launch the offloaded kernel. */
989 if (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]
990 > targ_fn->max_threads_per_block)
991 {
992 const char *msg
993 = ("The Nvidia accelerator has insufficient resources to launch '%s'"
994 " with num_workers = %d and vector_length = %d"
995 "; "
996 "recompile the program with 'num_workers = x and vector_length = y'"
997 " on that offloaded region or '-fopenacc-dim=:x:y' where"
998 " x * y <= %d"
999 ".\n");
1000 GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
1001 dims[GOMP_DIM_VECTOR], targ_fn->max_threads_per_block);
1002 }
1003
1004 /* Check if the accelerator has sufficient barrier resources to
1005 launch the offloaded kernel. */
1006 if (dims[GOMP_DIM_WORKER] > 15 && dims[GOMP_DIM_VECTOR] > 32)
1007 {
1008 const char *msg
1009 = ("The Nvidia accelerator has insufficient barrier resources to launch"
1010 " '%s' with num_workers = %d and vector_length = %d"
1011 "; "
1012 "recompile the program with 'num_workers = x' on that offloaded"
1013 " region or '-fopenacc-dim=:x:' where x <= 15"
1014 "; "
1015 "or, recompile the program with 'vector_length = 32' on that"
1016 " offloaded region or '-fopenacc-dim=::32'"
1017 ".\n");
1018 GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
1019 dims[GOMP_DIM_VECTOR]);
1020 }
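/* For example, the check above rejects num_workers = 16 combined with
   vector_length = 64: each of the 16 workers would need its own barrier,
   but only 15 such barriers are usable (cf. the MIN (workers, 15) clamp
   earlier).  */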
1021
1022 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
1023 " gangs=%u, workers=%u, vectors=%u\n",
1024 __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
1025 dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);
1026
1027 // OpenACC CUDA
1028 //
1029 // num_gangs nctaid.x
1030 // num_workers ntid.y
1031 // vector length ntid.x
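// For instance (a worked example only): num_gangs=1024, num_workers=8,
// vector_length=32 reaches the cuLaunchKernel call below as gridDimX=1024,
// blockDimX=32, blockDimY=8, i.e. 1024 CUDA blocks of 256 threads each.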
1032
1033 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1034 acc_prof_info *prof_info = thr->prof_info;
1035 acc_event_info enqueue_launch_event_info;
1036 acc_api_info *api_info = thr->api_info;
1037 bool profiling_p = __builtin_expect (prof_info != NULL, false);
1038 if (profiling_p)
1039 {
1040 prof_info->event_type = acc_ev_enqueue_launch_start;
1041
1042 enqueue_launch_event_info.launch_event.event_type
1043 = prof_info->event_type;
1044 enqueue_launch_event_info.launch_event.valid_bytes
1045 = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
1046 enqueue_launch_event_info.launch_event.parent_construct
1047 = acc_construct_parallel;
1048 enqueue_launch_event_info.launch_event.implicit = 1;
1049 enqueue_launch_event_info.launch_event.tool_info = NULL;
1050 enqueue_launch_event_info.launch_event.kernel_name = targ_fn->launch->fn;
1051 enqueue_launch_event_info.launch_event.num_gangs
1052 = dims[GOMP_DIM_GANG];
1053 enqueue_launch_event_info.launch_event.num_workers
1054 = dims[GOMP_DIM_WORKER];
1055 enqueue_launch_event_info.launch_event.vector_length
1056 = dims[GOMP_DIM_VECTOR];
1057
1058 api_info->device_api = acc_device_api_cuda;
1059
1060 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
1061 api_info);
1062 }
1063
1064 /* Per 'nvptx_goacc_validate_dims'. */
1065 assert (dims[GOMP_DIM_VECTOR] % warp_size == 0);
1066
1067 kargs[0] = &dp;
1068 CUDA_CALL_ASSERT (cuLaunchKernel, function,
1069 dims[GOMP_DIM_GANG], 1, 1,
1070 dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
1071 0, stream, kargs, 0);
1072
1073 if (profiling_p)
1074 {
1075 prof_info->event_type = acc_ev_enqueue_launch_end;
1076 enqueue_launch_event_info.launch_event.event_type
1077 = prof_info->event_type;
1078 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
1079 api_info);
1080 }
1081
1082 GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__,
1083 targ_fn->launch->fn);
1084 }
1085
1086 void * openacc_get_current_cuda_context (void);
1087
1088 static void
1089 goacc_profiling_acc_ev_alloc (struct goacc_thread *thr, void *dp, size_t s)
1090 {
1091 acc_prof_info *prof_info = thr->prof_info;
1092 acc_event_info data_event_info;
1093 acc_api_info *api_info = thr->api_info;
1094
1095 prof_info->event_type = acc_ev_alloc;
1096
1097 data_event_info.data_event.event_type = prof_info->event_type;
1098 data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1099 data_event_info.data_event.parent_construct = acc_construct_parallel;
1100 data_event_info.data_event.implicit = 1;
1101 data_event_info.data_event.tool_info = NULL;
1102 data_event_info.data_event.var_name = NULL;
1103 data_event_info.data_event.bytes = s;
1104 data_event_info.data_event.host_ptr = NULL;
1105 data_event_info.data_event.device_ptr = dp;
1106
1107 api_info->device_api = acc_device_api_cuda;
1108
1109 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
1110 }
1111
1112 /* Free the cached soft-stacks block if it is above the SOFTSTACK_CACHE_LIMIT
1113 size threshold, or if FORCE is true. */
1114
1115 static void
1116 nvptx_stacks_free (struct ptx_device *ptx_dev, bool force)
1117 {
1118 pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
1119 if (ptx_dev->omp_stacks.ptr
1120 && (force || ptx_dev->omp_stacks.size > SOFTSTACK_CACHE_LIMIT))
1121 {
1122 CUresult r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
1123 if (r != CUDA_SUCCESS)
1124 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
1125 ptx_dev->omp_stacks.ptr = 0;
1126 ptx_dev->omp_stacks.size = 0;
1127 }
1128 pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
1129 }
1130
1131 static void *
1132 nvptx_alloc (size_t s, bool suppress_errors, bool usm)
1133 {
1134 CUdeviceptr d;
1135
1136 CUresult r = (usm ? CUDA_CALL_NOCHECK (cuMemAllocManaged, &d, s,
1137 CU_MEM_ATTACH_GLOBAL)
1138 : CUDA_CALL_NOCHECK (cuMemAlloc, &d, s));
1139 if (suppress_errors && r == CUDA_ERROR_OUT_OF_MEMORY)
1140 return NULL;
1141 else if (r != CUDA_SUCCESS)
1142 {
1143 GOMP_PLUGIN_error ("nvptx_alloc error: %s", cuda_error (r));
1144 return NULL;
1145 }
1146
1147 /* NOTE: We only do profiling stuff if the memory allocation succeeds. */
1148 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1149 bool profiling_p
1150 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1151 if (profiling_p)
1152 goacc_profiling_acc_ev_alloc (thr, (void *) d, s);
1153
1154 return (void *) d;
1155 }
1156
1157 static void
1158 goacc_profiling_acc_ev_free (struct goacc_thread *thr, void *p)
1159 {
1160 acc_prof_info *prof_info = thr->prof_info;
1161 acc_event_info data_event_info;
1162 acc_api_info *api_info = thr->api_info;
1163
1164 prof_info->event_type = acc_ev_free;
1165
1166 data_event_info.data_event.event_type = prof_info->event_type;
1167 data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1168 data_event_info.data_event.parent_construct = acc_construct_parallel;
1169 data_event_info.data_event.implicit = 1;
1170 data_event_info.data_event.tool_info = NULL;
1171 data_event_info.data_event.var_name = NULL;
1172 data_event_info.data_event.bytes = -1;
1173 data_event_info.data_event.host_ptr = NULL;
1174 data_event_info.data_event.device_ptr = p;
1175
1176 api_info->device_api = acc_device_api_cuda;
1177
1178 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
1179 }
1180
1181 static bool
1182 nvptx_free (void *p, struct ptx_device *ptx_dev)
1183 {
1184 CUdeviceptr pb;
1185 size_t ps;
1186
1187 CUresult r = CUDA_CALL_NOCHECK (cuMemGetAddressRange, &pb, &ps,
1188 (CUdeviceptr) p);
1189 if (r == CUDA_ERROR_NOT_PERMITTED)
1190 {
1191 /* We assume that this error indicates we are in a CUDA callback context,
1192 where no CUDA calls are allowed (see the cuStreamAddCallback
1193 documentation for a description).  Arrange to free this piece of device
1194 memory later.  */
1195 struct ptx_free_block *n
1196 = GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block));
1197 n->ptr = p;
1198 pthread_mutex_lock (&ptx_dev->free_blocks_lock);
1199 n->next = ptx_dev->free_blocks;
1200 ptx_dev->free_blocks = n;
1201 pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
1202 return true;
1203 }
1204 else if (r != CUDA_SUCCESS)
1205 {
1206 GOMP_PLUGIN_error ("cuMemGetAddressRange error: %s", cuda_error (r));
1207 return false;
1208 }
1209 if ((CUdeviceptr) p != pb)
1210 {
1211 GOMP_PLUGIN_error ("invalid device address");
1212 return false;
1213 }
1214
1215 CUDA_CALL (cuMemFree, (CUdeviceptr) p);
1216 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1217 bool profiling_p
1218 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1219 if (profiling_p)
1220 goacc_profiling_acc_ev_free (thr, p);
1221
1222 return true;
1223 }
1224
1225 static void *
1226 nvptx_get_current_cuda_device (void)
1227 {
1228 struct nvptx_thread *nvthd = nvptx_thread ();
1229
1230 if (!nvthd || !nvthd->ptx_dev)
1231 return NULL;
1232
1233 return &nvthd->ptx_dev->dev;
1234 }
1235
1236 static void *
1237 nvptx_get_current_cuda_context (void)
1238 {
1239 struct nvptx_thread *nvthd = nvptx_thread ();
1240
1241 if (!nvthd || !nvthd->ptx_dev)
1242 return NULL;
1243
1244 return nvthd->ptx_dev->ctx;
1245 }
1246
1247 /* Plugin entry points. */
1248
1249 const char *
1250 GOMP_OFFLOAD_get_name (void)
1251 {
1252 return "nvptx";
1253 }
1254
1255 unsigned int
1256 GOMP_OFFLOAD_get_caps (void)
1257 {
1258 return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
1259 }
1260
1261 int
1262 GOMP_OFFLOAD_get_type (void)
1263 {
1264 return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
1265 }
1266
1267 int
1268 GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask)
1269 {
1270 int num_devices = nvptx_get_num_devices ();
1271 /* Return -1 if the omp_requires_mask cannot be fulfilled even though
1272 devices are present.  Unified-shared address: see comment in
1273 nvptx_open_device for CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING.  */
1274 if (num_devices > 0
1275 && ((omp_requires_mask
1276 & ~(GOMP_REQUIRES_UNIFIED_ADDRESS
1277 | GOMP_REQUIRES_UNIFIED_SHARED_MEMORY
1278 | GOMP_REQUIRES_REVERSE_OFFLOAD)) != 0))
1279 return -1;
1280 return num_devices;
1281 }
1282
1283 bool
1284 GOMP_OFFLOAD_init_device (int n)
1285 {
1286 struct ptx_device *dev;
1287
1288 pthread_mutex_lock (&ptx_dev_lock);
1289
1290 if (!nvptx_init () || ptx_devices[n] != NULL)
1291 {
1292 pthread_mutex_unlock (&ptx_dev_lock);
1293 return false;
1294 }
1295
1296 dev = nvptx_open_device (n);
1297 if (dev)
1298 {
1299 ptx_devices[n] = dev;
1300 instantiated_devices++;
1301 }
1302
1303 const char *var_name = "GOMP_NVPTX_LOWLAT_POOL";
1304 const char *env_var = secure_getenv (var_name);
1305 notify_var (var_name, env_var);
1306
1307 if (env_var != NULL)
1308 {
1309 char *endptr;
1310 unsigned long val = strtoul (env_var, &endptr, 10);
1311 if (endptr == NULL || *endptr != '\0'
1312 || errno == ERANGE || errno == EINVAL
1313 || val > UINT_MAX)
1314 GOMP_PLUGIN_error ("Error parsing %s", var_name);
1315 else
1316 lowlat_pool_size = val;
1317 }
1318
1319 pthread_mutex_unlock (&ptx_dev_lock);
1320
1321 return dev != NULL;
1322 }
1323
1324 bool
1325 GOMP_OFFLOAD_fini_device (int n)
1326 {
1327 /* This isn't related to this specific 'ptx_devices[n]', but is a convenient
1328 place to clean up. */
1329 if (!nvptx_run_deferred_page_locked_host_free ())
1330 return false;
1331
1332 pthread_mutex_lock (&ptx_dev_lock);
1333
1334 if (ptx_devices[n] != NULL)
1335 {
1336 if (!nvptx_attach_host_thread_to_device (n)
1337 || !nvptx_close_device (ptx_devices[n]))
1338 {
1339 pthread_mutex_unlock (&ptx_dev_lock);
1340 return false;
1341 }
1342 ptx_devices[n] = NULL;
1343 instantiated_devices--;
1344 }
1345
1346 if (instantiated_devices == 0)
1347 {
1348 free (ptx_devices);
1349 ptx_devices = NULL;
1350 }
1351
1352 pthread_mutex_unlock (&ptx_dev_lock);
1353 return true;
1354 }
1355
1356 /* Return the libgomp version number we're compatible with. There is
1357 no requirement for cross-version compatibility. */
1358
1359 unsigned
1360 GOMP_OFFLOAD_version (void)
1361 {
1362 return GOMP_VERSION;
1363 }
1364
1365 /* Initialize __nvptx_clocktick, if present in MODULE. */
1366
1367 static void
1368 nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
1369 {
1370 CUdeviceptr dptr;
1371 CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
1372 module, "__nvptx_clocktick");
1373 if (r == CUDA_ERROR_NOT_FOUND)
1374 return;
1375 if (r != CUDA_SUCCESS)
1376 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1377 double __nvptx_clocktick = 1e-3 / dev->clock_khz;
1378 r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
1379 sizeof (__nvptx_clocktick));
1380 if (r != CUDA_SUCCESS)
1381 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
1382 }
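/* For example, a device reporting clock_khz = 1500000 (1.5 GHz) gets
   __nvptx_clocktick = 1e-3 / 1500000 = ~6.7e-10, i.e. roughly 0.67 ns per
   clock tick.  */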
1383
1384 /* Invoke MODULE's global constructors/destructors. */
1385
1386 static bool
1387 nvptx_do_global_cdtors (CUmodule module, struct ptx_device *ptx_dev,
1388 const char *funcname)
1389 {
1390 bool ret = true;
1391 char *funcname_mgomp = NULL;
1392 CUresult r;
1393 CUfunction funcptr;
1394 r = CUDA_CALL_NOCHECK (cuModuleGetFunction,
1395 &funcptr, module, funcname);
1396 GOMP_PLUGIN_debug (0, "cuModuleGetFunction (%s): %s\n",
1397 funcname, cuda_error (r));
1398 if (r == CUDA_ERROR_NOT_FOUND)
1399 {
1400 /* Try '[funcname]__mgomp'. */
1401
1402 size_t funcname_len = strlen (funcname);
1403 const char *mgomp_suffix = "__mgomp";
1404 size_t mgomp_suffix_len = strlen (mgomp_suffix);
1405 funcname_mgomp
1406 = GOMP_PLUGIN_malloc (funcname_len + mgomp_suffix_len + 1);
1407 memcpy (funcname_mgomp, funcname, funcname_len);
1408 memcpy (funcname_mgomp + funcname_len,
1409 mgomp_suffix, mgomp_suffix_len + 1);
1410 funcname = funcname_mgomp;
1411
1412 r = CUDA_CALL_NOCHECK (cuModuleGetFunction,
1413 &funcptr, module, funcname);
1414 GOMP_PLUGIN_debug (0, "cuModuleGetFunction (%s): %s\n",
1415 funcname, cuda_error (r));
1416 }
1417 if (r == CUDA_ERROR_NOT_FOUND)
1418 ;
1419 else if (r != CUDA_SUCCESS)
1420 {
1421 GOMP_PLUGIN_error ("cuModuleGetFunction (%s) error: %s",
1422 funcname, cuda_error (r));
1423 ret = false;
1424 }
1425 else
1426 {
1427 /* If necessary, set up soft stack. */
1428 void *nvptx_stacks_0;
1429 void *kargs[1];
1430 if (funcname_mgomp)
1431 {
1432 size_t stack_size = nvptx_stacks_size ();
1433 pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
1434 nvptx_stacks_0 = nvptx_stacks_acquire (ptx_dev, stack_size, 1);
1435 nvptx_stacks_0 += stack_size;
1436 kargs[0] = &nvptx_stacks_0;
1437 }
1438 r = CUDA_CALL_NOCHECK (cuLaunchKernel,
1439 funcptr,
1440 1, 1, 1, 1, 1, 1,
1441 /* sharedMemBytes */ 0,
1442 /* hStream */ NULL,
1443 /* kernelParams */ funcname_mgomp ? kargs : NULL,
1444 /* extra */ NULL);
1445 if (r != CUDA_SUCCESS)
1446 {
1447 GOMP_PLUGIN_error ("cuLaunchKernel (%s) error: %s",
1448 funcname, cuda_error (r));
1449 ret = false;
1450 }
1451
1452 r = CUDA_CALL_NOCHECK (cuStreamSynchronize,
1453 NULL);
1454 if (r != CUDA_SUCCESS)
1455 {
1456 GOMP_PLUGIN_error ("cuStreamSynchronize (%s) error: %s",
1457 funcname, cuda_error (r));
1458 ret = false;
1459 }
1460
1461 if (funcname_mgomp)
1462 pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
1463 }
1464
1465 if (funcname_mgomp)
1466 free (funcname_mgomp);
1467
1468 return ret;
1469 }
1470
1471 /* Load the (partial) program described by TARGET_DATA to device
1472 number ORD. Allocate and return TARGET_TABLE. If not NULL, REV_FN_TABLE
1473 will contain the on-device addresses of the functions for reverse offload.
1474 To be freed by the caller. */
1475
1476 int
1477 GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
1478 struct addr_pair **target_table,
1479 uint64_t **rev_fn_table)
1480 {
1481 CUmodule module;
1482 const char *const *var_names;
1483 const struct targ_fn_launch *fn_descs;
1484 unsigned int fn_entries, var_entries, other_entries, i, j;
1485 struct targ_fn_descriptor *targ_fns;
1486 struct addr_pair *targ_tbl;
1487 const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
1488 struct ptx_image_data *new_image;
1489 struct ptx_device *dev;
1490
1491 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1492 {
1493 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1494 " (expected %u, received %u)",
1495 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1496 return -1;
1497 }
1498
1499 if (!nvptx_attach_host_thread_to_device (ord)
1500 || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
1501 return -1;
1502
1503 dev = ptx_devices[ord];
1504
1505 /* The mkoffload utility emits a struct of pointers/integers at the
1506 start of each offload image. The array of kernel names and the
1507 function addresses form a one-to-one correspondence.  */
1508
1509 var_entries = img_header->var_num;
1510 var_names = img_header->var_names;
1511 fn_entries = img_header->fn_num;
1512 fn_descs = img_header->fn_descs;
1513
1514 /* Currently, other_entries contains only the struct of ICVs. */
1515 other_entries = 1;
1516
1517 targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
1518 * (fn_entries + var_entries + other_entries));
1519 targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
1520 * fn_entries);
1521
1522 *target_table = targ_tbl;
1523
1524 new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
1525 new_image->target_data = target_data;
1526 new_image->module = module;
1527 new_image->fns = targ_fns;
1528
1529 pthread_mutex_lock (&dev->image_lock);
1530 new_image->next = dev->images;
1531 dev->images = new_image;
1532 pthread_mutex_unlock (&dev->image_lock);
1533
1534 for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
1535 {
1536 CUfunction function;
1537 int nregs, mthrs;
1538
1539 CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
1540 fn_descs[i].fn);
1541 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
1542 CU_FUNC_ATTRIBUTE_NUM_REGS, function);
1543 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
1544 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);
1545
1546 targ_fns->fn = function;
1547 targ_fns->launch = &fn_descs[i];
1548 targ_fns->regs_per_thread = nregs;
1549 targ_fns->max_threads_per_block = mthrs;
1550
1551 targ_tbl->start = (uintptr_t) targ_fns;
1552 targ_tbl->end = targ_tbl->start + 1;
1553 }
1554
1555 for (j = 0; j < var_entries; j++, targ_tbl++)
1556 {
1557 CUdeviceptr var;
1558 size_t bytes;
1559
1560 CUDA_CALL_ERET (-1, cuModuleGetGlobal,
1561 &var, &bytes, module, var_names[j]);
1562
1563 targ_tbl->start = (uintptr_t) var;
1564 targ_tbl->end = targ_tbl->start + bytes;
1565 }
1566
1567 CUdeviceptr varptr;
1568 size_t varsize;
1569 CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &varptr, &varsize,
1570 module, XSTRING (GOMP_ADDITIONAL_ICVS));
1571
1572 if (r == CUDA_SUCCESS)
1573 {
1574 targ_tbl->start = (uintptr_t) varptr;
1575 targ_tbl->end = (uintptr_t) (varptr + varsize);
1576 }
1577 else
1578 /* The variable was not in this image. */
1579 targ_tbl->start = targ_tbl->end = 0;
1580
1581 if (rev_fn_table && fn_entries == 0)
1582 *rev_fn_table = NULL;
1583 else if (rev_fn_table)
1584 {
1585 CUdeviceptr var;
1586 size_t bytes;
1587 unsigned int i;
1588 r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &var, &bytes, module,
1589 "$offload_func_table");
1590 if (r != CUDA_SUCCESS)
1591 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1592 assert (bytes == sizeof (uint64_t) * fn_entries);
1593 *rev_fn_table = GOMP_PLUGIN_malloc (sizeof (uint64_t) * fn_entries);
1594 r = CUDA_CALL_NOCHECK (cuMemcpyDtoH, *rev_fn_table, var, bytes);
1595 if (r != CUDA_SUCCESS)
1596 GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
1597 /* Free if only NULL entries. */
1598 for (i = 0; i < fn_entries; ++i)
1599 if ((*rev_fn_table)[i] != 0)
1600 break;
1601 if (i == fn_entries)
1602 {
1603 free (*rev_fn_table);
1604 *rev_fn_table = NULL;
1605 }
1606 }
1607
1608 if (rev_fn_table && *rev_fn_table && dev->rev_data == NULL)
1609 {
1610 /* Get the on-device GOMP_REV_OFFLOAD_VAR variable.  It should be
1611 available, but it might not be.  One reason could be: if the user code
1612 has 'omp target device(ancestor:1)' in pure hostcode, GOMP_target_ext
1613 is not called on the device and, hence, it and GOMP_REV_OFFLOAD_VAR
1614 are not linked in. */
1615 CUdeviceptr device_rev_offload_var;
1616 size_t device_rev_offload_size;
1617 CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal,
1618 &device_rev_offload_var,
1619 &device_rev_offload_size, module,
1620 XSTRING (GOMP_REV_OFFLOAD_VAR));
1621 if (r != CUDA_SUCCESS)
1622 {
1623 free (*rev_fn_table);
1624 *rev_fn_table = NULL;
1625 }
1626 else
1627 {
1628 /* cuMemHostAlloc memory is accessible on the device, if
1629 unified-shared address is supported; this is assumed - see comment
1630 in nvptx_open_device for CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING. */
1631 CUDA_CALL_ASSERT (cuMemHostAlloc, (void **) &dev->rev_data,
1632 sizeof (*dev->rev_data), CU_MEMHOSTALLOC_DEVICEMAP);
1633 CUdeviceptr dp = (CUdeviceptr) dev->rev_data;
1634 r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, device_rev_offload_var, &dp,
1635 sizeof (dp));
1636 if (r != CUDA_SUCCESS)
1637 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
1638 }
1639 }
1640
1641 nvptx_set_clocktick (module, dev);
1642
1643 if (!nvptx_do_global_cdtors (module, dev, "__do_global_ctors__entry"))
1644 return -1;
1645
1646 return fn_entries + var_entries + other_entries;
1647 }
1648
1649 /* Unload the program described by TARGET_DATA from device ORD, freeing the
1650 function descriptors allocated by GOMP_OFFLOAD_load_image.  */
1651
1652 bool
1653 GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
1654 {
1655 struct ptx_image_data *image, **prev_p;
1656 struct ptx_device *dev = ptx_devices[ord];
1657
1658 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1659 {
1660 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1661 " (expected %u, received %u)",
1662 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1663 return false;
1664 }
1665
1666 bool ret = true;
1667 pthread_mutex_lock (&dev->image_lock);
1668 for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
1669 if (image->target_data == target_data)
1670 {
1671 if (!nvptx_do_global_cdtors (image->module, dev,
1672 "__do_global_dtors__entry"))
1673 ret = false;
1674
1675 *prev_p = image->next;
1676 if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
1677 ret = false;
1678 free (image->fns);
1679 free (image);
1680 break;
1681 }
1682 pthread_mutex_unlock (&dev->image_lock);
1683 return ret;
1684 }
1685
1686 static void *
1687 GOMP_OFFLOAD_alloc_1 (int ord, size_t size, bool usm)
1688 {
1689 if (!nvptx_attach_host_thread_to_device (ord))
1690 return NULL;
1691
1692 struct ptx_device *ptx_dev = ptx_devices[ord];
1693 struct ptx_free_block *blocks, *tmp;
1694
1695 pthread_mutex_lock (&ptx_dev->free_blocks_lock);
1696 blocks = ptx_dev->free_blocks;
1697 ptx_dev->free_blocks = NULL;
1698 pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
1699
1700 while (blocks)
1701 {
1702 tmp = blocks->next;
1703 nvptx_free (blocks->ptr, ptx_dev);
1704 free (blocks);
1705 blocks = tmp;
1706 }
1707
1708 void *d = nvptx_alloc (size, true, usm);
1709 if (d)
1710 return d;
1711 else
1712 {
1713 /* Memory allocation failed. Try freeing the stacks block, and
1714 retrying. */
1715 nvptx_stacks_free (ptx_dev, true);
1716 return nvptx_alloc (size, false, usm);
1717 }
1718 }
1719
1720 void *
1721 GOMP_OFFLOAD_alloc (int ord, size_t size)
1722 {
1723 return GOMP_OFFLOAD_alloc_1 (ord, size, false);
1724 }
1725
1726 void *
1727 GOMP_OFFLOAD_usm_alloc (int ord, size_t size)
1728 {
1729 return GOMP_OFFLOAD_alloc_1 (ord, size, true);
1730 }
1731
1732 bool
1733 GOMP_OFFLOAD_free (int ord, void *ptr)
1734 {
1735 return (nvptx_attach_host_thread_to_device (ord)
1736 && nvptx_free (ptr, ptx_devices[ord]));
1737 }
1738
1739 bool
1740 GOMP_OFFLOAD_usm_free (int ord, void *ptr)
1741 {
1742 return GOMP_OFFLOAD_free (ord, ptr);
1743 }
1744
1745 bool
1746 GOMP_OFFLOAD_is_usm_ptr (void *ptr)
1747 {
1748 bool managed = false;
1749 /* cuPointerGetAttribute has three possible outcomes here:
1750 CUDA_ERROR_INVALID_VALUE - not a CUDA-allocated pointer;
1751 CUDA_SUCCESS, managed:false - CUDA-allocated, but not USM;
1752 CUDA_SUCCESS, managed:true - USM.  */
1753 CUDA_CALL_NOCHECK (cuPointerGetAttribute, &managed,
1754 CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)ptr);
1755 return managed;
1756 }
1757
1758
1759 bool
1760 GOMP_OFFLOAD_page_locked_host_alloc (void **ptr, size_t size)
1761 {
1762 GOMP_PLUGIN_debug (0, "nvptx %s: ptr=%p, size=%llu\n",
1763 __FUNCTION__, ptr, (unsigned long long) size);
1764
1765 /* TODO: Maybe running the deferred 'cuMemFreeHost's here is not the best
1766 idea, given that we don't know what context we're called from? (See
1767 'GOMP_OFFLOAD_run' reverse offload handling.) But, where to do it? */
1768 if (!nvptx_run_deferred_page_locked_host_free ())
1769 return false;
1770
1771 CUresult r;
1772
1773 unsigned int flags = 0;
1774 /* Given 'CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING', we don't need
1775 'flags |= CU_MEMHOSTALLOC_PORTABLE;' here. */
1776 r = CUDA_CALL_NOCHECK (cuMemHostAlloc, ptr, size, flags);
1777 if (r == CUDA_ERROR_OUT_OF_MEMORY)
1778 *ptr = NULL;
1779 else if (r != CUDA_SUCCESS)
1780 {
1781 GOMP_PLUGIN_error ("cuMemHostAlloc error: %s", cuda_error (r));
1782 return false;
1783 }
1784 GOMP_PLUGIN_debug (0, " -> *ptr=%p\n",
1785 *ptr);
1786 return true;
1787 }
1788
1789 static void
1790 nvptx_page_locked_host_free_callback (CUstream stream, CUresult r, void *ptr)
1791 {
1792 GOMP_PLUGIN_debug (0, "%s: stream=%p, r=%u, ptr=%p\n",
1793 __FUNCTION__, stream, (unsigned) r, ptr);
1794
1795 if (r != CUDA_SUCCESS)
1796 GOMP_PLUGIN_error ("%s error: %s", __FUNCTION__, cuda_error (r));
1797
1798 /* We can't now call 'cuMemFreeHost': we're in a CUDA stream context,
1799 where we "must not make any CUDA API calls".
1800 And, in particular in an OpenMP 'target' reverse offload context,
1801 this may even dead-lock?! */
1802 /* See 'nvptx_free'. */
1803 struct ptx_free_block *n
1804 = GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block));
1805 GOMP_PLUGIN_debug (0, " defer; n=%p\n", n);
1806 n->ptr = ptr;
1807 pthread_mutex_lock (&free_host_blocks_lock);
1808 n->next = free_host_blocks;
1809 free_host_blocks = n;
1810 pthread_mutex_unlock (&free_host_blocks_lock);
1811 }
1812
1813 bool
1814 GOMP_OFFLOAD_page_locked_host_free (void *ptr, struct goacc_asyncqueue *aq)
1815 {
1816 GOMP_PLUGIN_debug (0, "nvptx %s: ptr=%p, aq=%p\n",
1817 __FUNCTION__, ptr, aq);
1818
1819 if (aq)
1820 {
1821 GOMP_PLUGIN_debug (0, " aq <-"
1822 " nvptx_page_locked_host_free_callback(ptr)\n");
1823 CUDA_CALL (cuStreamAddCallback, aq->cuda_stream,
1824 nvptx_page_locked_host_free_callback, ptr, 0);
1825 }
1826 else
1827 CUDA_CALL (cuMemFreeHost, ptr);
1828 return true;
1829 }
1830
1831 static int
1832 nvptx_page_locked_host_p (const void *ptr, size_t size)
1833 {
1834 GOMP_PLUGIN_debug (0, "%s: ptr=%p, size=%llu\n",
1835 __FUNCTION__, ptr, (unsigned long long) size);
1836
1837 int ret;
1838
1839 CUresult r;
1840
1841 /* Apparently, there exists no CUDA call to query 'PTR + [0, SIZE)'. Instead
1842 of invoking 'cuMemHostGetFlags' SIZE times, we deem it sufficient to only
1843 query the base PTR. */
1844 unsigned int flags;
1845 void *ptr_noconst = (void *) ptr;
1846 r = CUDA_CALL_NOCHECK (cuMemHostGetFlags, &flags, ptr_noconst);
1847 (void) flags;
1848 if (r == CUDA_SUCCESS)
1849 ret = 1;
1850 else if (r == CUDA_ERROR_INVALID_VALUE)
1851 ret = 0;
1852 else
1853 {
1854 GOMP_PLUGIN_error ("cuMemHostGetFlags error: %s", cuda_error (r));
1855 ret = -1;
1856 }
1857 GOMP_PLUGIN_debug (0, " -> %d (with r = %u)\n",
1858 ret, (unsigned) r);
1859 return ret;
1860 }
1861
1862 int
1863 GOMP_OFFLOAD_page_locked_host_register (int ord,
1864 void *ptr, size_t size, int kind)
1865 {
1866 bool try_read_only;
1867 /* Magic number: if the actual mapping kind is unknown... */
1868 if (kind == -1)
1869 /* ..., allow for trying read-only registration here. */
1870 try_read_only = true;
1871 else
1872 try_read_only = !GOMP_MAP_COPY_FROM_P (kind);
1873 GOMP_PLUGIN_debug (0, "nvptx %s: ord=%d, ptr=%p, size=%llu,"
1874 " kind=%d (try_read_only=%d)\n",
1875 __FUNCTION__, ord, ptr, (unsigned long long) size,
1876 kind, try_read_only);
1877 assert (size != 0);
1878
1879 if (!nvptx_attach_host_thread_to_device (ord))
1880 return -1;
1881 struct ptx_device *ptx_dev = ptx_devices[ord];
1882
1883 int ret = -1;
1884
1885 CUresult r;
1886
1887 unsigned int flags = 0;
1888 /* Given 'CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING', we don't need
1889 'flags |= CU_MEMHOSTREGISTER_PORTABLE;' here. */
1890 cuMemHostRegister:
1891 if (CUDA_CALL_EXISTS (cuMemHostRegister_v2))
1892 r = CUDA_CALL_NOCHECK (cuMemHostRegister_v2, ptr, size, flags);
1893 else
1894 r = CUDA_CALL_NOCHECK (cuMemHostRegister, ptr, size, flags);
1895 if (r == CUDA_SUCCESS)
1896 ret = 1;
1897 else if (r == CUDA_ERROR_INVALID_VALUE)
1898 {
1899       /* For 'cuMemHostAlloc' (via user code, for example) followed by
1900	  'cuMemHostRegister' (via 'always_pinned_mode', for example), we don't
1901	  get 'CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED' but
1902	  'CUDA_ERROR_INVALID_VALUE'. */
1903 if (nvptx_page_locked_host_p (ptr, size))
1904 /* Accept the case that the region already is page-locked. */
1905 ret = 0;
1906 /* Depending on certain circumstances (see 'cuMemHostRegister'
1907 documentation), for example, 'const' data that is placed in section
1908 '.rodata' may need 'flags |= CU_MEMHOSTREGISTER_READ_ONLY;', to avoid
1909 'CUDA_ERROR_INVALID_VALUE'. If running into that, we now apply/re-try
1910 lazily instead of actively setting it above, to avoid the following
1911 problem. Supposedly/observably (but, not documented), if part of a
1912 memory page has been registered without 'CU_MEMHOSTREGISTER_READ_ONLY'
1913 and we then try to register another part with
1914 'CU_MEMHOSTREGISTER_READ_ONLY', we'll get 'CUDA_ERROR_INVALID_VALUE'.
1915 In that case, we can solve the issue by re-trying with
1916 'CU_MEMHOSTREGISTER_READ_ONLY' masked out. However, if part of a
1917 memory page has been registered with 'CU_MEMHOSTREGISTER_READ_ONLY'
1918 and we then try to register another part without
1919 'CU_MEMHOSTREGISTER_READ_ONLY', that latter part apparently inherits
1920 the former's 'CU_MEMHOSTREGISTER_READ_ONLY' (and any device to host
1921 copy then fails). We can't easily resolve that situation
1922 retroactively, that is, we can't easily re-register the first
1923 'CU_MEMHOSTREGISTER_READ_ONLY' part without that flag. */
1924 else if (!(flags & CU_MEMHOSTREGISTER_READ_ONLY)
1925 && try_read_only
1926 && ptx_dev->read_only_host_register_supported)
1927 {
1928 GOMP_PLUGIN_debug (0, " flags |= CU_MEMHOSTREGISTER_READ_ONLY;\n");
1929 flags |= CU_MEMHOSTREGISTER_READ_ONLY;
1930 goto cuMemHostRegister;
1931 }
1932 /* We ought to use 'CU_MEMHOSTREGISTER_READ_ONLY', but it's not
1933 available. */
1934 else if (try_read_only
1935 && !ptx_dev->read_only_host_register_supported)
1936 {
1937 assert (!(flags & CU_MEMHOSTREGISTER_READ_ONLY));
1938 GOMP_PLUGIN_debug (0, " punt;"
1939 " CU_MEMHOSTREGISTER_READ_ONLY not available\n");
1940	  /* Accept this (legacy) case; we can't (easily) register this
1941	     region of host memory as page-locked. */
1942 ret = 0;
1943 }
1944 }
1945 else if (r == CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED)
1946 {
1947 /* 'cuMemHostRegister' (via the user code, for example) followed by
1948 another (potentially partially overlapping) 'cuMemHostRegister'
1949 (via 'always_pinned_mode', for example). */
1950 /* Accept this case in good faith; do not verify further. */
1951 ret = 0;
1952 }
1953 if (ret == -1)
1954 GOMP_PLUGIN_error ("cuMemHostRegister error: %s", cuda_error (r));
1955 GOMP_PLUGIN_debug (0, " -> %d (with r = %u)\n",
1956 ret, (unsigned) r);
1957 return ret;
1958 }
1959
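/* CUDA stream callback used by 'GOMP_OFFLOAD_page_locked_host_unregister':
   as no CUDA API calls may be made from within a stream callback, the actual
   'cuMemHostUnregister' of PTR (passed in B_ together with AQ) is deferred by
   queuing it onto AQ's 'page_locked_host_unregister_blocks'.  */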
1960 static void
1961 nvptx_page_locked_host_unregister_callback (CUstream stream, CUresult r,
1962 void *b_)
1963 {
1964 void **b = b_;
1965 struct goacc_asyncqueue *aq = b[0];
1966 void *ptr = b[1];
1967 GOMP_PLUGIN_debug (0, "%s: stream=%p, r=%u, b_=%p (aq=%p, ptr=%p)\n",
1968 __FUNCTION__, stream, (unsigned) r, b_, aq, ptr);
1969
1970 free (b_);
1971
1972 if (r != CUDA_SUCCESS)
1973 GOMP_PLUGIN_error ("%s error: %s", __FUNCTION__, cuda_error (r));
1974
1975 /* We can't now call 'cuMemHostUnregister': we're in a CUDA stream context,
1976 where we "must not make any CUDA API calls". */
1977 /* See 'nvptx_free'. */
1978 struct ptx_free_block *n
1979 = GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block));
1980 GOMP_PLUGIN_debug (0, " defer; n=%p\n", n);
1981 n->ptr = ptr;
1982 pthread_mutex_lock (&aq->page_locked_host_unregister_blocks_lock);
1983 n->next = aq->page_locked_host_unregister_blocks;
1984 aq->page_locked_host_unregister_blocks = n;
1985 pthread_mutex_unlock (&aq->page_locked_host_unregister_blocks_lock);
1986 }
1987
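/* Unregister the page-locked host memory region 'PTR + [0, SIZE)'.  With a
   non-NULL asyncqueue AQ, the unregistration is deferred via a stream
   callback, as operations still in flight on AQ may benefit from the
   registration.  */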
1988 bool
1989 GOMP_OFFLOAD_page_locked_host_unregister (void *ptr, size_t size,
1990 struct goacc_asyncqueue *aq)
1991 {
1992 GOMP_PLUGIN_debug (0, "nvptx %s: ptr=%p, size=%llu, aq=%p\n",
1993 __FUNCTION__, ptr, (unsigned long long) size, aq);
1994 assert (size != 0);
1995
1996 if (aq)
1997 {
1998 /* We don't unregister right away, as in-flight operations may still
1999 benefit from the registration. */
2000 void **b = GOMP_PLUGIN_malloc (2 * sizeof (*b));
2001 b[0] = aq;
2002 b[1] = ptr;
2003 GOMP_PLUGIN_debug (0, " aq <-"
2004 " nvptx_page_locked_host_unregister_callback(b=%p)\n",
2005 b);
2006 CUDA_CALL (cuStreamAddCallback, aq->cuda_stream,
2007 nvptx_page_locked_host_unregister_callback, b, 0);
2008 }
2009 else
2010 CUDA_CALL (cuMemHostUnregister, ptr);
2011 return true;
2012 }
2013
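/* Return whether the host memory region 'PTR + [0, SIZE)' is page-locked,
   from the perspective of device ORD: 1 if so, 0 if not, -1 upon error.  */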
2014 int
2015 GOMP_OFFLOAD_page_locked_host_p (int ord, const void *ptr, size_t size)
2016 {
2017 GOMP_PLUGIN_debug (0, "nvptx %s: ord=%d, ptr=%p, size=%llu\n",
2018 __FUNCTION__, ord, ptr, (unsigned long long) size);
2019
2020 if (!nvptx_attach_host_thread_to_device (ord))
2021 return -1;
2022
2023 return nvptx_page_locked_host_p (ptr, size);
2024 }
2025
2026
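/* Execute an OpenACC offloaded region: launch FN with launch geometry DIMS,
   passing DEVADDRS as the kernel argument, and wait for completion.  */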
2027 void
2028 GOMP_OFFLOAD_openacc_exec (void (*fn) (void *),
2029 size_t mapnum __attribute__((unused)),
2030 void **hostaddrs __attribute__((unused)),
2031 void **devaddrs,
2032 unsigned *dims, void *targ_mem_desc)
2033 {
2034 GOMP_PLUGIN_debug (0, "nvptx %s\n", __FUNCTION__);
2035
2036 CUdeviceptr dp = (CUdeviceptr) devaddrs;
2037 nvptx_exec (fn, dims, targ_mem_desc, dp, NULL);
2038
2039 CUresult r = CUDA_CALL_NOCHECK (cuStreamSynchronize, NULL);
2040 const char *maybe_abort_msg = "(perhaps abort was called)";
2041 if (r == CUDA_ERROR_LAUNCH_FAILED)
2042 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
2043 maybe_abort_msg);
2044 else if (r != CUDA_SUCCESS)
2045 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
2046 }
2047
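/* Like 'GOMP_OFFLOAD_openacc_exec', but enqueue the launch on AQ's CUDA
   stream and don't wait for completion.  */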
2048 void
2049 GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *),
2050 size_t mapnum __attribute__((unused)),
2051 void **hostaddrs __attribute__((unused)),
2052 void **devaddrs,
2053 unsigned *dims, void *targ_mem_desc,
2054 struct goacc_asyncqueue *aq)
2055 {
2056 GOMP_PLUGIN_debug (0, "nvptx %s\n", __FUNCTION__);
2057
2058 CUdeviceptr dp = (CUdeviceptr) devaddrs;
2059 nvptx_exec (fn, dims, targ_mem_desc, dp, aq->cuda_stream);
2060 }
2061
2062 void *
2063 GOMP_OFFLOAD_openacc_create_thread_data (int ord)
2064 {
2065 struct ptx_device *ptx_dev;
2066 struct nvptx_thread *nvthd
2067 = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
2068 CUcontext thd_ctx;
2069
2070 ptx_dev = ptx_devices[ord];
2071
2072 assert (ptx_dev);
2073
2074 CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);
2075
2076 assert (ptx_dev->ctx);
2077
2078 if (!thd_ctx)
2079 CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);
2080
2081 nvthd->ptx_dev = ptx_dev;
2082
2083 return (void *) nvthd;
2084 }
2085
2086 void
2087 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
2088 {
2089 free (data);
2090 }
2091
2092 void *
2093 GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
2094 {
2095 return nvptx_get_current_cuda_device ();
2096 }
2097
2098 void *
2099 GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
2100 {
2101 return nvptx_get_current_cuda_context ();
2102 }
2103
2104 /* This returns a CUstream. */
2105 void *
2106 GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *aq)
2107 {
2108 return (void *) aq->cuda_stream;
2109 }
2110
2111 /* This takes a CUstream. */
2112 int
2113 GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream)
2114 {
2115 if (aq->cuda_stream)
2116 {
2117 CUDA_CALL_ASSERT (cuStreamSynchronize, aq->cuda_stream);
2118 CUDA_CALL_ASSERT (cuStreamDestroy, aq->cuda_stream);
2119 }
2120
2121 aq->cuda_stream = (CUstream) stream;
2122 return 1;
2123 }
2124
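/* Construct a 'goacc_asyncqueue' around a new CUDA stream created with
   FLAGS.  Return NULL upon error.  */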
2125 static struct goacc_asyncqueue *
2126 nvptx_goacc_asyncqueue_construct (unsigned int flags)
2127 {
2128 GOMP_PLUGIN_debug (0, "%s: flags=%u\n",
2129 __FUNCTION__, flags);
2130
2131 CUstream stream = NULL;
2132 CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, flags);
2133
2134 struct goacc_asyncqueue *aq
2135 = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue));
2136 aq->cuda_stream = stream;
2137 pthread_mutex_init (&aq->page_locked_host_unregister_blocks_lock, NULL);
2138 aq->page_locked_host_unregister_blocks = NULL;
2139 GOMP_PLUGIN_debug (0, " -> aq=%p (with cuda_stream=%p)\n",
2140 aq, aq->cuda_stream);
2141 return aq;
2142 }
2143
2144 struct goacc_asyncqueue *
2145 GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
2146 {
2147 return nvptx_goacc_asyncqueue_construct (CU_STREAM_DEFAULT);
2148 }
2149
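/* Destruct the asyncqueue AQ: destroy its CUDA stream and free AQ itself.
   It's an error if deferred page-locked host unregistrations are still
   pending at this point.  */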
2150 static bool
2151 nvptx_goacc_asyncqueue_destruct (struct goacc_asyncqueue *aq)
2152 {
2153 GOMP_PLUGIN_debug (0, "nvptx %s: aq=%p\n",
2154 __FUNCTION__, aq);
2155
2156 CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream);
2157
2158 bool ret = true;
2159 pthread_mutex_lock (&aq->page_locked_host_unregister_blocks_lock);
2160 if (aq->page_locked_host_unregister_blocks != NULL)
2161 {
2162 GOMP_PLUGIN_error ("aq->page_locked_host_unregister_blocks not empty");
2163 ret = false;
2164 }
2165 pthread_mutex_unlock (&aq->page_locked_host_unregister_blocks_lock);
2166 pthread_mutex_destroy (&aq->page_locked_host_unregister_blocks_lock);
2167
2168 free (aq);
2169
2170 return ret;
2171 }
2172
2173 bool
2174 GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
2175 {
2176 return nvptx_goacc_asyncqueue_destruct (aq);
2177 }
2178
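/* Run the 'cuMemHostUnregister' calls deferred on AQ (see
   'nvptx_page_locked_host_unregister_callback').  Return false upon error.  */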
2179 static bool
2180 nvptx_run_deferred_page_locked_host_unregister (struct goacc_asyncqueue *aq)
2181 {
2182 GOMP_PLUGIN_debug (0, "%s: aq=%p\n",
2183 __FUNCTION__, aq);
2184
2185 bool ret = true;
2186 pthread_mutex_lock (&aq->page_locked_host_unregister_blocks_lock);
2187 for (struct ptx_free_block *b = aq->page_locked_host_unregister_blocks; b;)
2188 {
2189 GOMP_PLUGIN_debug (0, " b=%p: cuMemHostUnregister(b->ptr=%p)\n",
2190 b, b->ptr);
2191
2192 struct ptx_free_block *b_next = b->next;
2193 CUresult r = CUDA_CALL_NOCHECK (cuMemHostUnregister, b->ptr);
2194 if (r != CUDA_SUCCESS)
2195 {
2196 GOMP_PLUGIN_error ("cuMemHostUnregister error: %s", cuda_error (r));
2197 ret = false;
2198 }
2199 free (b);
2200 b = b_next;
2201 }
2202 aq->page_locked_host_unregister_blocks = NULL;
2203 pthread_mutex_unlock (&aq->page_locked_host_unregister_blocks_lock);
2204 return ret;
2205 }
2206
2207 int
2208 GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
2209 {
2210 GOMP_PLUGIN_debug (0, "nvptx %s: aq=%p\n",
2211 __FUNCTION__, aq);
2212
2213 CUresult r = CUDA_CALL_NOCHECK (cuStreamQuery, aq->cuda_stream);
2214 if (r == CUDA_SUCCESS)
2215 {
2216 /* As a user may expect that they don't need to 'wait' if
2217 'acc_async_test' returns 'true', clean up here, too. */
2218 if (!nvptx_run_deferred_page_locked_host_unregister (aq))
2219 return -1;
2220
2221 return 1;
2222 }
2223 if (r == CUDA_ERROR_NOT_READY)
2224 return 0;
2225
2226 GOMP_PLUGIN_error ("cuStreamQuery error: %s", cuda_error (r));
2227 return -1;
2228 }
2229
2230 static bool
2231 nvptx_goacc_asyncqueue_synchronize (struct goacc_asyncqueue *aq)
2232 {
2233 GOMP_PLUGIN_debug (0, "%s: aq=%p\n",
2234 __FUNCTION__, aq);
2235
2236 CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream);
2237
2238 /* This is called from a user code (non-stream) context, and upon returning,
2239      we must have given up any page-locked memory registrations, so unregister
2240 any pending ones now. */
2241 if (!nvptx_run_deferred_page_locked_host_unregister (aq))
2242 return false;
2243
2244 return true;
2245 }
2246
2247 bool
2248 GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
2249 {
2250 return nvptx_goacc_asyncqueue_synchronize (aq);
2251 }
2252
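/* CUDA stream callback used by 'GOMP_OFFLOAD_openacc_async_serialize': move
   any deferred page-locked host unregistrations from asyncqueue AQ1 to AQ2
   (both passed in B_).  */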
2253 static void
2254 nvptx_move_page_locked_host_unregister_blocks_aq1_aq2_callback
2255 (CUstream stream, CUresult r, void *b_)
2256 {
2257 void **b = b_;
2258 struct goacc_asyncqueue *aq1 = b[0];
2259 struct goacc_asyncqueue *aq2 = b[1];
2260 GOMP_PLUGIN_debug (0, "%s: stream=%p, r=%u, b_=%p (aq1=%p, aq2=%p)\n",
2261 __FUNCTION__, stream, (unsigned) r, b_, aq1, aq2);
2262
2263 free (b_);
2264
2265 if (r != CUDA_SUCCESS)
2266 GOMP_PLUGIN_error ("%s error: %s", __FUNCTION__, cuda_error (r));
2267
2268 pthread_mutex_lock (&aq1->page_locked_host_unregister_blocks_lock);
2269 if (aq1->page_locked_host_unregister_blocks)
2270 {
2271 pthread_mutex_lock (&aq2->page_locked_host_unregister_blocks_lock);
2272 GOMP_PLUGIN_debug (0, " page_locked_host_unregister_blocks:"
2273 " aq1 -> aq2\n");
2274 if (aq2->page_locked_host_unregister_blocks == NULL)
2275 aq2->page_locked_host_unregister_blocks
2276 = aq1->page_locked_host_unregister_blocks;
2277 else
2278 {
2279 struct ptx_free_block *b = aq2->page_locked_host_unregister_blocks;
2280 while (b->next != NULL)
2281 b = b->next;
2282 b->next = aq1->page_locked_host_unregister_blocks;
2283 }
2284 pthread_mutex_unlock (&aq2->page_locked_host_unregister_blocks_lock);
2285 aq1->page_locked_host_unregister_blocks = NULL;
2286 }
2287 pthread_mutex_unlock (&aq1->page_locked_host_unregister_blocks_lock);
2288 }
2289
2290 bool
2291 GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
2292 struct goacc_asyncqueue *aq2)
2293 {
2294 GOMP_PLUGIN_debug (0, "nvptx %s: aq1=%p, aq2=%p\n",
2295 __FUNCTION__, aq1, aq2);
2296
2297 if (aq1 != aq2)
2298 {
2299 void **b = GOMP_PLUGIN_malloc (2 * sizeof (*b));
2300 b[0] = aq1;
2301 b[1] = aq2;
2302 /* Enqueue on 'aq1': move 'page_locked_host_unregister_blocks' of 'aq1'
2303 to 'aq2'. */
2304 GOMP_PLUGIN_debug (0, " aq1 <-"
2305 " nvptx_move_page_locked_host_unregister_blocks_aq1_aq2_callback"
2306 "(b=%p)\n", b);
2307 CUDA_CALL (cuStreamAddCallback, aq1->cuda_stream,
2308 nvptx_move_page_locked_host_unregister_blocks_aq1_aq2_callback,
2309 b, 0);
2310 }
2311
2312 CUevent e;
2313 CUDA_CALL_ERET (false, cuEventCreate, &e, CU_EVENT_DISABLE_TIMING);
2314 CUDA_CALL_ERET (false, cuEventRecord, e, aq1->cuda_stream);
2315 CUDA_CALL_ERET (false, cuStreamWaitEvent, aq2->cuda_stream, e, 0);
2316
2317 return true;
2318 }
2319
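/* CUDA stream callback wrapper: abort upon error, otherwise invoke the user
   callback recorded in PTR (a 'struct nvptx_callback') and free it.  */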
2320 static void
2321 cuda_callback_wrapper (CUstream stream, CUresult res, void *ptr)
2322 {
2323 if (res != CUDA_SUCCESS)
2324 GOMP_PLUGIN_fatal ("%s error: %s", __FUNCTION__, cuda_error (res));
2325 struct nvptx_callback *cb = (struct nvptx_callback *) ptr;
2326 cb->fn (cb->ptr);
2327 free (ptr);
2328 }
2329
2330 void
2331 GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
2332 void (*callback_fn)(void *),
2333 void *userptr)
2334 {
2335 struct nvptx_callback *b = GOMP_PLUGIN_malloc (sizeof (*b));
2336 b->fn = callback_fn;
2337 b->ptr = userptr;
2338 b->aq = aq;
2339 CUDA_CALL_ASSERT (cuStreamAddCallback, aq->cuda_stream,
2340 cuda_callback_wrapper, (void *) b, 0);
2341 }
2342
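/* Sanity-check a host/device memory copy of S bytes between host address H
   and device address D before doing it; return false (after reporting an
   error) for obviously invalid arguments.  */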
2343 static bool
2344 cuda_memcpy_sanity_check (const void *h, const void *d, size_t s)
2345 {
2346 CUdeviceptr pb;
2347 size_t ps;
2348 if (!s)
2349 return true;
2350 if (!d)
2351 {
2352 GOMP_PLUGIN_error ("invalid device address");
2353 return false;
2354 }
2355 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
2356 if (!pb)
2357 {
2358 GOMP_PLUGIN_error ("invalid device address");
2359 return false;
2360 }
2361 if (!h)
2362 {
2363 GOMP_PLUGIN_error ("invalid host address");
2364 return false;
2365 }
2366 if (d == h)
2367 {
2368 GOMP_PLUGIN_error ("invalid host or device address");
2369 return false;
2370 }
2371 if ((void *)(d + s) > (void *)(pb + ps))
2372 {
2373 GOMP_PLUGIN_error ("invalid size");
2374 return false;
2375 }
2376 return true;
2377 }
2378
2379 bool
2380 GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
2381 {
2382 if (!nvptx_attach_host_thread_to_device (ord)
2383 || !cuda_memcpy_sanity_check (src, dst, n))
2384 return false;
2385 CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) dst, src, n);
2386 return true;
2387 }
2388
2389 bool
2390 GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
2391 {
2392 if (!nvptx_attach_host_thread_to_device (ord)
2393 || !cuda_memcpy_sanity_check (dst, src, n))
2394 return false;
2395 CUDA_CALL (cuMemcpyDtoH, dst, (CUdeviceptr) src, n);
2396 return true;
2397 }
2398
2399 bool
2400 GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
2401 {
2402 CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, NULL);
2403 return true;
2404 }
2405
2406 bool
2407 GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src,
2408 size_t n, struct goacc_asyncqueue *aq)
2409 {
2410 if (!nvptx_attach_host_thread_to_device (ord)
2411 || !cuda_memcpy_sanity_check (src, dst, n))
2412 return false;
2413 CUDA_CALL (cuMemcpyHtoDAsync, (CUdeviceptr) dst, src, n, aq->cuda_stream);
2414 return true;
2415 }
2416
2417 bool
2418 GOMP_OFFLOAD_openacc_async_dev2host (int ord, void *dst, const void *src,
2419 size_t n, struct goacc_asyncqueue *aq)
2420 {
2421 if (!nvptx_attach_host_thread_to_device (ord)
2422 || !cuda_memcpy_sanity_check (dst, src, n))
2423 return false;
2424 CUDA_CALL (cuMemcpyDtoHAsync, dst, (CUdeviceptr) src, n, aq->cuda_stream);
2425 return true;
2426 }
2427
2428 union goacc_property_value
2429 GOMP_OFFLOAD_openacc_get_property (int n, enum goacc_property prop)
2430 {
2431 union goacc_property_value propval = { .val = 0 };
2432
2433 pthread_mutex_lock (&ptx_dev_lock);
2434
2435 if (n >= nvptx_get_num_devices () || n < 0 || ptx_devices[n] == NULL)
2436 {
2437 pthread_mutex_unlock (&ptx_dev_lock);
2438 return propval;
2439 }
2440
2441 struct ptx_device *ptx_dev = ptx_devices[n];
2442 switch (prop)
2443 {
2444 case GOACC_PROPERTY_MEMORY:
2445 {
2446 size_t total_mem;
2447
2448 CUDA_CALL_ERET (propval, cuDeviceTotalMem, &total_mem, ptx_dev->dev);
2449 propval.val = total_mem;
2450 }
2451 break;
2452 case GOACC_PROPERTY_FREE_MEMORY:
2453 {
2454 size_t total_mem;
2455 size_t free_mem;
2456 CUdevice ctxdev;
2457
2458 CUDA_CALL_ERET (propval, cuCtxGetDevice, &ctxdev);
2459 if (ptx_dev->dev == ctxdev)
2460 CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
2461 else if (ptx_dev->ctx)
2462 {
2463 CUcontext old_ctx;
2464
2465 CUDA_CALL_ERET (propval, cuCtxPushCurrent, ptx_dev->ctx);
2466 CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
2467 CUDA_CALL_ASSERT (cuCtxPopCurrent, &old_ctx);
2468 }
2469 else
2470 {
2471 CUcontext new_ctx;
2472
2473 CUDA_CALL_ERET (propval, cuCtxCreate, &new_ctx, CU_CTX_SCHED_AUTO,
2474 ptx_dev->dev);
2475 CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
2476 CUDA_CALL_ASSERT (cuCtxDestroy, new_ctx);
2477 }
2478 propval.val = free_mem;
2479 }
2480 break;
2481 case GOACC_PROPERTY_NAME:
2482 propval.ptr = ptx_dev->name;
2483 break;
2484 case GOACC_PROPERTY_VENDOR:
2485 propval.ptr = "Nvidia";
2486 break;
2487 case GOACC_PROPERTY_DRIVER:
2488 propval.ptr = cuda_driver_version_s;
2489 break;
2490 default:
2491 break;
2492 }
2493
2494 pthread_mutex_unlock (&ptx_dev_lock);
2495 return propval;
2496 }
2497
2498 /* Adjust launch dimensions: pick good values for number of blocks and warps
2499    and ensure that the number of warps does not exceed CUDA limits or GCC's
2500 own limits. */
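/* For illustration, with hypothetical numbers: a kernel with
   'regs_per_thread == 32' run with '*threads_p == 8' warps gives
   'regs_per_block == 32 * 32 * 8 == 8192', so a device with
   'regs_per_sm == 65536' and 'num_sms == 80' yields
   'max_blocks == 65536 / 8192 * 80 == 640'.  */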
2501
2502 static void
2503 nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
2504 struct ptx_device *ptx_dev,
2505 int *teams_p, int *threads_p)
2506 {
2507 int max_warps_block = fn->max_threads_per_block / 32;
2508   /* A maximum of 32 warps per block is an implementation limit in the NVPTX
2509      backend and libgcc; it matches the documented limit of all GPUs as of 2015. */
2510 if (max_warps_block > 32)
2511 max_warps_block = 32;
2512 if (*threads_p <= 0)
2513 *threads_p = 8;
2514 if (*threads_p > max_warps_block)
2515 *threads_p = max_warps_block;
2516
2517 int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
2518 /* This is an estimate of how many blocks the device can host simultaneously.
2519      The actual limit, which may be lower, can be queried with the "occupancy
2520      control" driver interface (available since CUDA 6.0). */
2521 int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
2522 if (*teams_p <= 0 || *teams_p > max_blocks)
2523 *teams_p = max_blocks;
2524 }
2525
2526 /* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
2527 target regions. */
2528
2529 static size_t
2530 nvptx_stacks_size ()
2531 {
2532 return 128 * 1024;
2533 }
2534
2535 /* Return contiguous storage for NUM stacks, each SIZE bytes. The lock for
2536 the storage should be held on entry, and remains held on exit. */
2537
2538 static void *
2539 nvptx_stacks_acquire (struct ptx_device *ptx_dev, size_t size, int num)
2540 {
2541 if (ptx_dev->omp_stacks.ptr && ptx_dev->omp_stacks.size >= size * num)
2542 return (void *) ptx_dev->omp_stacks.ptr;
2543
2544 /* Free the old, too-small stacks. */
2545 if (ptx_dev->omp_stacks.ptr)
2546 {
2547 CUresult r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
2548 if (r != CUDA_SUCCESS)
2549 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s\n", cuda_error (r));
2550 r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
2551 if (r != CUDA_SUCCESS)
2552 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
2553 }
2554
2555 /* Make new and bigger stacks, and remember where we put them and how big
2556 they are. */
2557 CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &ptx_dev->omp_stacks.ptr,
2558 size * num);
2559 if (r != CUDA_SUCCESS)
2560 GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
2561
2562 ptx_dev->omp_stacks.size = size * num;
2563
2564 return (void *) ptx_dev->omp_stacks.ptr;
2565 }
2566
2567
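/* Run the offloaded OpenMP target region TGT_FN on device ORD with argument
   block TGT_VARS, processing the launch parameters in ARGS.  If reverse
   offload is enabled, poll 'ptx_dev->rev_data' during kernel execution and
   service host-side calls via 'GOMP_PLUGIN_target_rev'.  */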
2568 void
2569 GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
2570 {
2571 struct targ_fn_descriptor *tgt_fn_desc
2572 = (struct targ_fn_descriptor *) tgt_fn;
2573 CUfunction function = tgt_fn_desc->fn;
2574 const struct targ_fn_launch *launch = tgt_fn_desc->launch;
2575 const char *fn_name = launch->fn;
2576 CUresult r;
2577 struct ptx_device *ptx_dev = ptx_devices[ord];
2578 const char *maybe_abort_msg = "(perhaps abort was called)";
2579 int teams = 0, threads = 0;
2580
2581 if (!args)
2582 GOMP_PLUGIN_fatal ("No target arguments provided");
2583 while (*args)
2584 {
2585 intptr_t id = (intptr_t) *args++, val;
2586 if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
2587 val = (intptr_t) *args++;
2588 else
2589 val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
2590 if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
2591 continue;
2592 val = val > INT_MAX ? INT_MAX : val;
2593 id &= GOMP_TARGET_ARG_ID_MASK;
2594 if (id == GOMP_TARGET_ARG_NUM_TEAMS)
2595 teams = val;
2596 else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
2597 threads = val;
2598 }
2599 nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
2600
2601 bool reverse_offload = ptx_dev->rev_data != NULL;
2602 struct goacc_asyncqueue *reverse_offload_aq = NULL;
2603 if (reverse_offload)
2604 {
2605 reverse_offload_aq
2606 = nvptx_goacc_asyncqueue_construct (CU_STREAM_NON_BLOCKING);
2607 if (!reverse_offload_aq)
2608 exit (EXIT_FAILURE);
2609 }
2610
2611 size_t stack_size = nvptx_stacks_size ();
2612
2613 pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
2614 void *stacks = nvptx_stacks_acquire (ptx_dev, stack_size, teams * threads);
2615 void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
2616 size_t fn_args_size = sizeof fn_args;
2617 void *config[] = {
2618 CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
2619 CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
2620 CU_LAUNCH_PARAM_END
2621 };
2622 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
2623 " [(teams: %u), 1, 1] [(lanes: 32), (threads: %u), 1]\n",
2624 __FUNCTION__, fn_name, teams, threads);
2625 r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
2626 32, threads, 1, lowlat_pool_size, NULL, NULL, config);
2627 if (r != CUDA_SUCCESS)
2628 GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
2629 if (reverse_offload)
2630 while (true)
2631 {
2632 r = CUDA_CALL_NOCHECK (cuStreamQuery, NULL);
2633 if (r == CUDA_SUCCESS)
2634 break;
2635 if (r == CUDA_ERROR_LAUNCH_FAILED)
2636 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s %s\n", cuda_error (r),
2637 maybe_abort_msg);
2638 else if (r != CUDA_ERROR_NOT_READY)
2639 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
2640
2641 if (__atomic_load_n (&ptx_dev->rev_data->fn, __ATOMIC_ACQUIRE) != 0)
2642 {
2643 struct rev_offload *rev_data = ptx_dev->rev_data;
2644 GOMP_PLUGIN_target_rev (rev_data->fn, rev_data->mapnum,
2645 rev_data->addrs, rev_data->sizes,
2646 rev_data->kinds, rev_data->dev_num,
2647 reverse_offload_aq);
2648 if (!nvptx_goacc_asyncqueue_synchronize (reverse_offload_aq))
2649 exit (EXIT_FAILURE);
2650 __atomic_store_n (&rev_data->fn, 0, __ATOMIC_RELEASE);
2651
2652 /* Clean up here; otherwise we may run into the situation that
2653 a following reverse offload does
2654 'GOMP_OFFLOAD_page_locked_host_alloc', and that then runs the
2655 deferred 'cuMemFreeHost's -- which may dead-lock?!
2656		 TODO: This may need more consideration for the case that
2657 different host threads do reverse offload? We could move
2658 'free_host_blocks' into 'aq' (which is separate per reverse
2659 offload) instead of global, like
2660 'page_locked_host_unregister_blocks', but that doesn't seem the
2661 right thing for OpenACC 'async' generally? */
2662 if (!nvptx_run_deferred_page_locked_host_free ())
2663 exit (EXIT_FAILURE);
2664 }
2665 usleep (1);
2666 }
2667 else
2668 r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
2669 if (r == CUDA_ERROR_LAUNCH_FAILED)
2670 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
2671 maybe_abort_msg);
2672 else if (r != CUDA_SUCCESS)
2673 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
2674
2675 pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
2676
2677 if (reverse_offload)
2678 {
2679 if (!nvptx_goacc_asyncqueue_destruct (reverse_offload_aq))
2680 exit (EXIT_FAILURE);
2681 }
2682 }
2683
2684 /* TODO: Implement GOMP_OFFLOAD_async_run. */
2685
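/* Match the requested ISA against the device's compute capability: accept
   "sm_<major><minor>" if the device provides at least that capability.  For
   example, 'CHECK_ISA (7, 0);' roughly expands to:

     if (((device->compute_major == 7 && device->compute_minor >= 0)
	  || device->compute_major > 7)
	 && strcmp (isa, "sm_70") == 0)
       return true;  */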
2686 #define CHECK_ISA(major, minor) \
2687 if (((device->compute_major == major && device->compute_minor >= minor) \
2688 || device->compute_major > major) \
2689 && strcmp (isa, "sm_"#major#minor) == 0) \
2690 return true
2691
2692 bool
2693 GOMP_OFFLOAD_evaluate_device (int device_num, const char *kind,
2694 const char *arch, const char *isa)
2695 {
2696 if (kind && strcmp (kind, "gpu") != 0)
2697 return false;
2698 if (arch && strcmp (arch, "nvptx") != 0)
2699 return false;
2700 if (!isa)
2701 return true;
2702
2703 struct ptx_device *device = ptx_devices[device_num];
2704
2705 CHECK_ISA (3, 0);
2706 CHECK_ISA (3, 5);
2707 CHECK_ISA (3, 7);
2708 CHECK_ISA (5, 0);
2709 CHECK_ISA (5, 2);
2710 CHECK_ISA (5, 3);
2711 CHECK_ISA (6, 0);
2712 CHECK_ISA (6, 1);
2713 CHECK_ISA (6, 2);
2714 CHECK_ISA (7, 0);
2715 CHECK_ISA (7, 2);
2716 CHECK_ISA (7, 5);
2717 CHECK_ISA (8, 0);
2718 CHECK_ISA (8, 6);
2719
2720 return false;
2721 }