/* Plugin for NVPTX execution.

   Copyright (C) 2013-2022 Free Software Foundation, Inc.

   Contributed by Mentor Embedded.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */
/* Nvidia PTX-specific parts of OpenACC support.  The cuda driver
   library appears to hold some implicit state, but the documentation
   is not clear as to what that state might be.  Or how one might
   propagate it from one thread to another.  */
#include "libgomp-plugin.h"
#include "oacc-plugin.h"
#include "gomp-constants.h"

/* For struct rev_offload + GOMP_REV_OFFLOAD_VAR.  */
#include "config/nvptx/libgomp-nvptx.h"

#include <pthread.h>
#ifndef PLUGIN_NVPTX_INCLUDE_SYSTEM_CUDA_H
# include "cuda/cuda.h"
#else
# include <cuda.h>
#endif
#include <stdbool.h>
#include <limits.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include <assert.h>
#include <errno.h>
/* An arbitrary fixed limit (128MB) for the size of the OpenMP soft stacks
   block to cache between kernel invocations.  For soft-stacks blocks bigger
   than this, we will free the block before attempting another GPU memory
   allocation (i.e. in GOMP_OFFLOAD_alloc).  Otherwise, if an allocation fails,
   we will free the cached soft-stacks block anyway then retry the
   allocation.  If that fails too, we lose.  */

#define SOFTSTACK_CACHE_LIMIT 134217728
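
/* For reference: 134217728 == 128 * 1024 * 1024, i.e. exactly the 128MB
   mentioned in the comment above.  */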
#if CUDA_VERSION < 6000
extern CUresult cuGetErrorString (CUresult, const char **);
#define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
#endif
#if CUDA_VERSION >= 6050
#undef cuLinkCreate
#undef cuLinkAddData
CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
			const char *, unsigned, CUjit_option *, void **);
CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
#undef cuMemHostRegister
CUresult cuMemHostRegister (void *, size_t, unsigned int);
#else
typedef size_t (*CUoccupancyB2DSize)(int);
CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
			   const char *, unsigned, CUjit_option *, void **);
CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
CUresult cuMemHostRegister_v2 (void *, size_t, unsigned int);
CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
					  CUoccupancyB2DSize, size_t, int);
#endif
#define DO_PRAGMA(x) _Pragma (#x)
#ifndef PLUGIN_NVPTX_LINK_LIBCUDA
/* Do not require CUDA to be installed at link time or at run time: dlopen
   the CUDA driver library and look up the required entry points.  */
# include <dlfcn.h>

struct cuda_lib_s {

# define CUDA_ONE_CALL(call)			\
  __typeof (call) *call;
# define CUDA_ONE_CALL_MAYBE_NULL(call)		\
  CUDA_ONE_CALL (call)
#include "cuda-lib.def"
# undef CUDA_ONE_CALL
# undef CUDA_ONE_CALL_MAYBE_NULL

} cuda_lib;

/* -1 if init_cuda_lib has not been called yet, false
   if it has been and failed, true if it has been and succeeded.  */
static signed char cuda_lib_inited = -1;

/* Dynamically load the CUDA runtime library and initialize function
   pointers, return false if unsuccessful, true if successful.  */
static bool
init_cuda_lib (void)
{
  if (cuda_lib_inited != -1)
    return cuda_lib_inited;
  const char *cuda_runtime_lib = "libcuda.so.1";
  void *h = dlopen (cuda_runtime_lib, RTLD_LAZY);
  cuda_lib_inited = false;
  if (h == NULL)
    return false;

# define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false)
# define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true)
# define CUDA_ONE_CALL_1(call, allow_null)	\
  cuda_lib.call = dlsym (h, #call);		\
  if (!allow_null && cuda_lib.call == NULL)	\
    return false;
#include "cuda-lib.def"
# undef CUDA_ONE_CALL
# undef CUDA_ONE_CALL_1
# undef CUDA_ONE_CALL_MAYBE_NULL

  cuda_lib_inited = true;
  return true;
}
# define CUDA_CALL_PREFIX cuda_lib.
#else

# define CUDA_ONE_CALL(call)
# define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call)
#include "cuda-lib.def"
#undef CUDA_ONE_CALL_MAYBE_NULL
#undef CUDA_ONE_CALL

# define CUDA_CALL_PREFIX
# define init_cuda_lib() true
#endif
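
/* Illustrative sketch (not part of the original source): for a 'cuda-lib.def'
   entry such as 'CUDA_ONE_CALL (cuMemAlloc)', the dlopen path above expands
   roughly to

     struct cuda_lib_s {
       __typeof (cuMemAlloc) *cuMemAlloc;
       ...
     } cuda_lib;

   and 'init_cuda_lib' then performs, per entry,

     cuda_lib.cuMemAlloc = dlsym (h, "cuMemAlloc");
     if (cuda_lib.cuMemAlloc == NULL)
       return false;

   so that 'CUDA_CALL_PREFIX cuMemAlloc (...)' becomes an indirect call
   through the function-pointer table.  */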
#include "secure_getenv.h"

#define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
#define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
/* Convenience macros for the frequently used CUDA library call and
   error handling sequence as well as CUDA library calls that
   do the error checking themselves or don't do it at all.  */

#define CUDA_CALL_ERET(ERET, FN, ...)		\
  do {						\
    unsigned __r				\
      = CUDA_CALL_PREFIX FN (__VA_ARGS__);	\
    if (__r != CUDA_SUCCESS)			\
      {						\
	GOMP_PLUGIN_error (#FN " error: %s",	\
			   cuda_error (__r));	\
	return ERET;				\
      }						\
  } while (0)

#define CUDA_CALL(FN, ...)			\
  CUDA_CALL_ERET (false, FN, __VA_ARGS__)

#define CUDA_CALL_ASSERT(FN, ...)		\
  do {						\
    unsigned __r				\
      = CUDA_CALL_PREFIX FN (__VA_ARGS__);	\
    if (__r != CUDA_SUCCESS)			\
      {						\
	GOMP_PLUGIN_fatal (#FN " error: %s",	\
			   cuda_error (__r));	\
      }						\
  } while (0)

#define CUDA_CALL_NOCHECK(FN, ...)		\
  CUDA_CALL_PREFIX FN (__VA_ARGS__)

#define CUDA_CALL_EXISTS(FN)			\
  CUDA_CALL_PREFIX FN
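
/* Illustrative usage sketch (not part of the original source): a hypothetical
   helper demonstrating the macros above.  'CUDA_CALL' reports failure via
   'GOMP_PLUGIN_error' and makes the enclosing function return 'false';
   'CUDA_CALL_NOCHECK' leaves error handling to the caller.  */
#if 0
static bool
example_alloc (CUdeviceptr *d, size_t size)
{
  /* On failure, this reports the error and returns 'false'.  */
  CUDA_CALL (cuMemAlloc, d, size);
  return true;
}
#endif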
static const char *
cuda_error (CUresult r)
{
  const char *fallback = "unknown cuda error";
  const char *desc;

  if (!CUDA_CALL_EXISTS (cuGetErrorString))
    return fallback;

  r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
  if (r == CUDA_SUCCESS)
    fallback = desc;

  return fallback;
}
/* Version of the CUDA Toolkit in the same MAJOR.MINOR format that is used by
   Nvidia, such as in the 'deviceQuery' program (Nvidia's CUDA samples).  */
static char cuda_driver_version_s[30];

static unsigned int instantiated_devices = 0;
static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
/* NVPTX/CUDA specific definition of asynchronous queues.  */
struct goacc_asyncqueue
{
  CUstream cuda_stream;

  pthread_mutex_t page_locked_host_unregister_blocks_lock;
  struct ptx_free_block *page_locked_host_unregister_blocks;
};

struct nvptx_callback
{
  void (*fn) (void *);
  void *ptr;
  struct goacc_asyncqueue *aq;
  struct nvptx_callback *next;
};
/* Thread-specific data for PTX.  */

struct nvptx_thread
{
  /* We currently have this embedded inside the plugin because libgomp manages
     devices through integer target_ids.  This might be better if using an
     opaque target-specific pointer directly from gomp_device_descr.  */
  struct ptx_device *ptx_dev;
};
/* Target data function launch information.  */

struct targ_fn_launch
{
  /* The function's PTX name.  */
  const char *fn;

  /* The launch geometry.  */
  unsigned short dim[GOMP_DIM_MAX];
};
/* Target PTX object information.  */

struct targ_ptx_obj
{
  const char *code;
  size_t size;
};

/* Target data image information.  */

typedef struct nvptx_tdata
{
  const struct targ_ptx_obj *ptx_objs;
  unsigned ptx_num;

  const char *const *var_names;
  unsigned var_num;

  const struct targ_fn_launch *fn_descs;
  unsigned fn_num;
} nvptx_tdata_t;
/* Descriptor of a loaded function.  */

struct targ_fn_descriptor
{
  CUfunction fn;
  const struct targ_fn_launch *launch;
  int regs_per_thread;
  int max_threads_per_block;
};
/* A loaded PTX image.  */
struct ptx_image_data
{
  const void *target_data;
  CUmodule module;

  struct targ_fn_descriptor *fns;  /* Array of functions.  */

  struct ptx_image_data *next;
};
struct ptx_free_block
{
  void *ptr;
  struct ptx_free_block *next;
};
struct ptx_device
{
  CUcontext ctx;
  bool ctx_shared;
  CUdevice dev;

  int ord;
  bool overlap;
  bool map;
  bool concur;
  bool mkern;
  int mode;
  int clock_khz;
  int num_sms;
  int regs_per_block;
  int regs_per_sm;
  int warp_size;
  int max_threads_per_block;
  int max_threads_per_multiprocessor;
  bool read_only_host_register_supported;
  int default_dims[GOMP_DIM_MAX];
  int compute_major, compute_minor;

  /* Length as used by the CUDA Runtime API ('struct cudaDeviceProp').  */
  char name[256];

  struct ptx_image_data *images;  /* Images loaded on device.  */
  pthread_mutex_t image_lock;     /* Lock for above list.  */

  struct ptx_free_block *free_blocks;
  pthread_mutex_t free_blocks_lock;

  /* OpenMP stacks, cached between kernel invocations.  */
  struct
    {
      CUdeviceptr ptr;
      size_t size;
      pthread_mutex_t lock;
    } omp_stacks;

  struct rev_offload *rev_data;
  struct ptx_device *next;
};
static struct ptx_device **ptx_devices;

static struct ptx_free_block *free_host_blocks = NULL;
static pthread_mutex_t free_host_blocks_lock = PTHREAD_MUTEX_INITIALIZER;
static bool
nvptx_run_deferred_page_locked_host_free (void)
{
  GOMP_PLUGIN_debug (0, "%s\n",
		     __FUNCTION__);

  pthread_mutex_lock (&free_host_blocks_lock);
  struct ptx_free_block *b = free_host_blocks;
  free_host_blocks = NULL;
  pthread_mutex_unlock (&free_host_blocks_lock);

  while (b)
    {
      GOMP_PLUGIN_debug (0, "  b=%p: cuMemFreeHost(b->ptr=%p)\n",
			 b, b->ptr);
      struct ptx_free_block *b_next = b->next;
      CUDA_CALL (cuMemFreeHost, b->ptr);
      free (b);
      b = b_next;
    }

  return true;
}
/* OpenMP kernels reserve a small amount of ".shared" space for use by
   omp_alloc.  The size is configured using GOMP_NVPTX_LOWLAT_POOL, but the
   default is set here.  */
static unsigned lowlat_pool_size = 8 * 1024;
static bool nvptx_do_global_cdtors (CUmodule, struct ptx_device *,
				    const char *);
static size_t nvptx_stacks_size (void);
static void *nvptx_stacks_acquire (struct ptx_device *, size_t, int);

static inline struct nvptx_thread *
nvptx_thread (void)
{
  return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
}
/* Initialize the device.  Return TRUE on success, else FALSE.  PTX_DEV_LOCK
   should be locked on entry and remains locked on exit.  */

static bool
nvptx_init (void)
{
  int ndevs;

  if (instantiated_devices != 0)
    return true;

  if (!init_cuda_lib ())
    return false;

  CUDA_CALL (cuInit, 0);

  int cuda_driver_version;
  CUDA_CALL_ERET (NULL, cuDriverGetVersion, &cuda_driver_version);
  snprintf (cuda_driver_version_s, sizeof cuda_driver_version_s,
	    "CUDA Driver %u.%u",
	    cuda_driver_version / 1000, cuda_driver_version % 1000 / 10);

  CUDA_CALL (cuDeviceGetCount, &ndevs);
  ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
					    * ndevs);

  return true;
}
/* Select the N'th PTX device for the current host thread.  The device must
   have been previously opened before calling this function.  */

static bool
nvptx_attach_host_thread_to_device (int n)
{
  CUdevice dev;
  CUresult r;
  struct ptx_device *ptx_dev;
  CUcontext thd_ctx;

  r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
  if (r == CUDA_ERROR_NOT_PERMITTED)
    {
      /* Assume we're in a CUDA callback, just return true.  */
      return true;
    }
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    {
      GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
      return false;
    }

  if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
    return true;
  else
    {
      CUcontext old_ctx;

      ptx_dev = ptx_devices[n];
      if (!ptx_dev)
	{
	  GOMP_PLUGIN_error ("device %d not found", n);
	  return false;
	}

      CUDA_CALL (cuCtxGetCurrent, &thd_ctx);

      /* We don't necessarily have a current context (e.g. if it has been
	 destroyed).  Pop it if we do though.  */
      if (thd_ctx != NULL)
	CUDA_CALL (cuCtxPopCurrent, &old_ctx);

      CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
    }
  return true;
}
static struct ptx_device *
nvptx_open_device (int n)
{
  struct ptx_device *ptx_dev;
  CUdevice dev, ctx_dev;
  CUresult r;
  int pi;

  CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);

  ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));

  ptx_dev->ord = n;
  ptx_dev->dev = dev;
  ptx_dev->ctx_shared = false;

  r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    {
      GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
      return NULL;
    }

  if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
    {
      /* The current host thread has an active context for a different device.
	 Detach it.  */
      CUcontext old_ctx;
      CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
    }

  CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);

  if (!ptx_dev->ctx)
    CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
  else
    ptx_dev->ctx_shared = true;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
  ptx_dev->overlap = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
  ptx_dev->map = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
  ptx_dev->concur = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
  ptx_dev->mode = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
  ptx_dev->mkern = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
  ptx_dev->clock_khz = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
  ptx_dev->num_sms = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
  ptx_dev->regs_per_block = pi;

  /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only
     in CUDA 6.0 and newer.  */
  r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
			 CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
			 dev);
  /* Fallback: use limit of registers per block, which is usually equal.  */
  if (r == CUDA_ERROR_INVALID_VALUE)
    pi = ptx_dev->regs_per_block;
  else if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
      return NULL;
    }
  ptx_dev->regs_per_sm = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
		  &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
  if (pi != 32)
    {
      GOMP_PLUGIN_error ("Only warp size 32 is supported");
      return NULL;
    }
  ptx_dev->warp_size = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
		  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
  ptx_dev->max_threads_per_block = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
		  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
  ptx_dev->max_threads_per_multiprocessor = pi;

  /* Required below for reverse offload as implemented, but with compute
     capability >= 2.0 and 64bit device processes, this should universally be
     the case; hence, an assert.  */
  r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
			 CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev);
  assert (r == CUDA_SUCCESS && pi);

  /* This is a CUDA 11.1 feature.  */
  r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
			 CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED,
			 dev);
  if (r == CUDA_ERROR_INVALID_VALUE)
    pi = false;
  else if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
      return NULL;
    }
  ptx_dev->read_only_host_register_supported = pi;

  for (int i = 0; i != GOMP_DIM_MAX; i++)
    ptx_dev->default_dims[i] = 0;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
		  CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev);
  ptx_dev->compute_major = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
		  CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev);
  ptx_dev->compute_minor = pi;

  CUDA_CALL_ERET (NULL, cuDeviceGetName, ptx_dev->name, sizeof ptx_dev->name,
		  dev);

  ptx_dev->images = NULL;
  pthread_mutex_init (&ptx_dev->image_lock, NULL);

  ptx_dev->free_blocks = NULL;
  pthread_mutex_init (&ptx_dev->free_blocks_lock, NULL);

  ptx_dev->omp_stacks.ptr = 0;
  ptx_dev->omp_stacks.size = 0;
  pthread_mutex_init (&ptx_dev->omp_stacks.lock, NULL);

  ptx_dev->rev_data = NULL;

  return ptx_dev;
}
static bool
nvptx_close_device (struct ptx_device *ptx_dev)
{
  if (!ptx_dev)
    return true;

  bool ret = true;

  for (struct ptx_image_data *image = ptx_dev->images;
       image != NULL;
       image = image->next)
    if (!nvptx_do_global_cdtors (image->module, ptx_dev,
				 "__do_global_dtors__entry"))
      ret = false;

  for (struct ptx_free_block *b = ptx_dev->free_blocks; b;)
    {
      struct ptx_free_block *b_next = b->next;
      CUDA_CALL (cuMemFree, (CUdeviceptr) b->ptr);
      free (b);
      b = b_next;
    }

  pthread_mutex_destroy (&ptx_dev->free_blocks_lock);
  pthread_mutex_destroy (&ptx_dev->image_lock);

  pthread_mutex_destroy (&ptx_dev->omp_stacks.lock);

  if (ptx_dev->omp_stacks.ptr)
    CUDA_CALL (cuMemFree, ptx_dev->omp_stacks.ptr);

  if (!ptx_dev->ctx_shared)
    CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);

  free (ptx_dev);

  return ret;
}
static int
nvptx_get_num_devices (void)
{
  int n;

  /* This function will be called before the plugin has been initialized in
     order to enumerate available devices, but CUDA API routines can't be used
     until cuInit has been called.  Just call it now (but don't yet do any
     further initialization).  */
  if (instantiated_devices == 0)
    {
      if (!init_cuda_lib ())
	return 0;
      CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
      /* This is not an error: e.g. we may have CUDA libraries installed but
	 no devices available.  */
      if (r != CUDA_SUCCESS)
	{
	  GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
			     cuda_error (r));
	  return 0;
	}
    }

  CUDA_CALL_ERET (-1, cuDeviceGetCount, &n);
  return n;
}
static void
notify_var (const char *var_name, const char *env_var)
{
  if (env_var == NULL)
    GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name);
  else
    GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var);
}
static void
process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o)
{
  const char *var_name = "GOMP_NVPTX_JIT";
  const char *env_var = secure_getenv (var_name);
  notify_var (var_name, env_var);

  if (env_var == NULL)
    return;

  const char *c = env_var;
  while (*c != '\0')
    {
      while (*c == ' ')
	c++;

      if (c[0] == '-' && c[1] == 'O'
	  && '0' <= c[2] && c[2] <= '4'
	  && (c[3] == '\0' || c[3] == ' '))
	{
	  *gomp_nvptx_o = c[2] - '0';
	  c += 3;
	  continue;
	}

      GOMP_PLUGIN_error ("Error parsing %s", var_name);
      break;
    }
}
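
/* For example (illustrative): with 'GOMP_NVPTX_JIT=-O2' in the environment,
   the parser above sets *GOMP_NVPTX_O to 2; 'link_ptx' below then passes
   that value to the PTX JIT via 'CU_JIT_OPTIMIZATION_LEVEL'.  */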
static bool
link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
	  unsigned num_objs)
{
  CUjit_option opts[7];
  void *optvals[7];
  float elapsed = 0.0;
  char elog[1024];
  char ilog[16384];
  CUlinkState linkstate;
  CUresult r;
  void *linkout;
  size_t linkoutsize __attribute__ ((unused));

  opts[0] = CU_JIT_WALL_TIME;
  optvals[0] = &elapsed;

  opts[1] = CU_JIT_INFO_LOG_BUFFER;
  optvals[1] = &ilog[0];

  opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
  optvals[2] = (void *) sizeof ilog;

  opts[3] = CU_JIT_ERROR_LOG_BUFFER;
  optvals[3] = &elog[0];

  opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
  optvals[4] = (void *) sizeof elog;

  opts[5] = CU_JIT_LOG_VERBOSE;
  optvals[5] = (void *) 1;

  static intptr_t gomp_nvptx_o = -1;

  static bool init_done = false;
  if (!init_done)
    {
      process_GOMP_NVPTX_JIT (&gomp_nvptx_o);
      init_done = true;
    }

  int nopts = 6;
  if (gomp_nvptx_o != -1)
    {
      opts[nopts] = CU_JIT_OPTIMIZATION_LEVEL;
      optvals[nopts] = (void *) gomp_nvptx_o;
      nopts++;
    }

  if (CUDA_CALL_EXISTS (cuLinkCreate_v2))
    CUDA_CALL (cuLinkCreate_v2, nopts, opts, optvals, &linkstate);
  else
    CUDA_CALL (cuLinkCreate, nopts, opts, optvals, &linkstate);

  for (; num_objs--; ptx_objs++)
    {
      /* cuLinkAddData's 'data' argument erroneously omits the const
	 qualifier.  */
      GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
      if (CUDA_CALL_EXISTS (cuLinkAddData_v2))
	r = CUDA_CALL_NOCHECK (cuLinkAddData_v2, linkstate, CU_JIT_INPUT_PTX,
			       (char *) ptx_objs->code, ptx_objs->size,
			       0, 0, 0, 0);
      else
	r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
			       (char *) ptx_objs->code, ptx_objs->size,
			       0, 0, 0, 0);
      if (r != CUDA_SUCCESS)
	{
	  GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
	  GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
			     cuda_error (r));
	  return false;
	}
    }

  GOMP_PLUGIN_debug (0, "Linking\n");
  r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);

  GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
  GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);

  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
      GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
      return false;
    }

  CUDA_CALL (cuModuleLoadData, module, linkout);
  CUDA_CALL (cuLinkDestroy, linkstate);
  return true;
}
static void
nvptx_exec (void (*fn), unsigned *dims, void *targ_mem_desc,
	    CUdeviceptr dp, CUstream stream)
{
  struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
  CUfunction function;
  int i;
  void *kargs[1];
  struct nvptx_thread *nvthd = nvptx_thread ();
  int warp_size = nvthd->ptx_dev->warp_size;

  function = targ_fn->fn;

  /* Initialize the launch dimensions.  Typically this is constant,
     provided by the device compiler, but we must permit runtime
     values.  */
  int seen_zero = 0;
  for (i = 0; i != GOMP_DIM_MAX; i++)
    {
      if (targ_fn->launch->dim[i])
	dims[i] = targ_fn->launch->dim[i];
      if (!dims[i])
	seen_zero = 1;
    }

  if (seen_zero)
    {
      pthread_mutex_lock (&ptx_dev_lock);

      static int gomp_openacc_dims[GOMP_DIM_MAX];
      if (!gomp_openacc_dims[0])
	{
	  /* See if the user provided GOMP_OPENACC_DIM environment
	     variable to specify runtime defaults.  */
	  for (int i = 0; i < GOMP_DIM_MAX; ++i)
	    gomp_openacc_dims[i] = GOMP_PLUGIN_acc_default_dim (i);
	}

      if (!nvthd->ptx_dev->default_dims[0])
	{
	  int default_dims[GOMP_DIM_MAX];
	  for (int i = 0; i < GOMP_DIM_MAX; ++i)
	    default_dims[i] = gomp_openacc_dims[i];

	  int gang, worker, vector;
	  {
	    int block_size = nvthd->ptx_dev->max_threads_per_block;
	    int cpu_size = nvthd->ptx_dev->max_threads_per_multiprocessor;
	    int dev_size = nvthd->ptx_dev->num_sms;
	    GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
			       " dev_size=%d, cpu_size=%d\n",
			       warp_size, block_size, dev_size, cpu_size);

	    gang = (cpu_size / block_size) * dev_size;
	    worker = block_size / warp_size;
	    vector = warp_size;
	  }

	  /* There is no upper bound on the gang size.  The best size
	     matches the hardware configuration.  Logical gangs are
	     scheduled onto physical hardware.  To maximize usage, we
	     should guess a large number.  */
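	  /* Worked example (illustrative, for a hypothetical device with
	     max_threads_per_multiprocessor = 2048, max_threads_per_block
	     = 1024, 80 SMs, and warp size 32): the computation above yields
	     gang = (2048 / 1024) * 80 = 160, worker = 1024 / 32 = 32, and
	     vector = warp_size = 32.  */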
	  if (default_dims[GOMP_DIM_GANG] < 1)
	    default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
	  /* The worker size must not exceed the hardware.  */
	  if (default_dims[GOMP_DIM_WORKER] < 1
	      || (default_dims[GOMP_DIM_WORKER] > worker && gang))
	    default_dims[GOMP_DIM_WORKER] = worker;
	  /* The vector size must exactly match the hardware.  */
	  if (default_dims[GOMP_DIM_VECTOR] < 1
	      || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
	    default_dims[GOMP_DIM_VECTOR] = vector;

	  GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
			     default_dims[GOMP_DIM_GANG],
			     default_dims[GOMP_DIM_WORKER],
			     default_dims[GOMP_DIM_VECTOR]);

	  for (i = 0; i != GOMP_DIM_MAX; i++)
	    nvthd->ptx_dev->default_dims[i] = default_dims[i];
	}
      pthread_mutex_unlock (&ptx_dev_lock);

      {
	bool default_dim_p[GOMP_DIM_MAX];
	for (i = 0; i != GOMP_DIM_MAX; i++)
	  default_dim_p[i] = !dims[i];

	if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize))
	  {
	    for (i = 0; i != GOMP_DIM_MAX; i++)
	      if (default_dim_p[i])
		dims[i] = nvthd->ptx_dev->default_dims[i];

	    if (default_dim_p[GOMP_DIM_VECTOR])
	      dims[GOMP_DIM_VECTOR]
		= MIN (dims[GOMP_DIM_VECTOR],
		       (targ_fn->max_threads_per_block / warp_size
			* warp_size));

	    if (default_dim_p[GOMP_DIM_WORKER])
	      dims[GOMP_DIM_WORKER]
		= MIN (dims[GOMP_DIM_WORKER],
		       targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
	  }
	else
	  {
	    /* Handle the case that the compiler allows the runtime to choose
	       the vector-length conservatively, by ignoring
	       gomp_openacc_dims[GOMP_DIM_VECTOR].  TODO: actually handle
	       vector-length in a non-conservative manner.  */
	    /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that
	       gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
	       exceed targ_fn->max_threads_per_block.  */
	    int vectors = 0;
	    int workers = gomp_openacc_dims[GOMP_DIM_WORKER];
	    int gangs = gomp_openacc_dims[GOMP_DIM_GANG];
	    int grids, blocks;

	    CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
			      &blocks, function, NULL, 0,
			      dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
	    GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
			       "grid = %d, block = %d\n", grids, blocks);

	    /* Keep the num_gangs proportional to the block size.  In
	       the case where a block size is limited by shared-memory
	       or the register file capacity, the runtime will not
	       excessively over assign gangs to the multiprocessor
	       units if their state is going to be swapped out even
	       more than necessary.  The constant factor 2 is there to
	       prevent threads from idling when there is insufficient
	       parallelism.  */
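	    /* Worked example (illustrative): if
	       cuOccupancyMaxPotentialBlockSize reports grids = 40 and
	       blocks = 1024 with warp_size = 32, the computation below
	       yields gangs = 2 * 40 * (1024 / 32) = 2560.  */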
	    if (gangs == 0)
	      gangs = 2 * grids * (blocks / warp_size);

	    if (vectors == 0)
	      vectors = warp_size;

	    if (workers == 0)
	      {
		int actual_vectors = (default_dim_p[GOMP_DIM_VECTOR]
				      ? vectors
				      : dims[GOMP_DIM_VECTOR]);
		workers = blocks / actual_vectors;
		workers = MAX (workers, 1);
		/* If we need a per-worker barrier ... .  */
		if (actual_vectors > 32)
		  /* Don't use more barriers than available.  */
		  workers = MIN (workers, 15);
	      }

	    for (i = 0; i != GOMP_DIM_MAX; i++)
	      if (default_dim_p[i])
		switch (i)
		  {
		  case GOMP_DIM_GANG: dims[i] = gangs; break;
		  case GOMP_DIM_WORKER: dims[i] = workers; break;
		  case GOMP_DIM_VECTOR: dims[i] = vectors; break;
		  default: GOMP_PLUGIN_fatal ("invalid dim");
		  }
	  }
      }
    }

  /* Check if the accelerator has sufficient hardware resources to
     launch the offloaded kernel.  */
  if (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]
      > targ_fn->max_threads_per_block)
    {
      const char *msg
	= ("The Nvidia accelerator has insufficient resources to launch '%s'"
	   " with num_workers = %d and vector_length = %d"
	   "; "
	   "recompile the program with 'num_workers = x and vector_length = y'"
	   " on that offloaded region or '-fopenacc-dim=:x:y' where"
	   " x * y <= %d.\n");
      GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
			 dims[GOMP_DIM_VECTOR],
			 targ_fn->max_threads_per_block);
    }

  /* Check if the accelerator has sufficient barrier resources to
     launch the offloaded kernel.  */
  if (dims[GOMP_DIM_WORKER] > 15 && dims[GOMP_DIM_VECTOR] > 32)
    {
      const char *msg
	= ("The Nvidia accelerator has insufficient barrier resources to"
	   " launch '%s' with num_workers = %d and vector_length = %d"
	   "; "
	   "recompile the program with 'num_workers = x' on that offloaded"
	   " region or '-fopenacc-dim=:x:' where x <= 15"
	   "; "
	   "or, recompile the program with 'vector_length = 32' on that"
	   " offloaded region or '-fopenacc-dim=::32'.\n");
      GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
			 dims[GOMP_DIM_VECTOR]);
    }

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
		     " gangs=%u, workers=%u, vectors=%u\n",
		     __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
		     dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);

  // OpenACC		CUDA
  //
  // num_gangs		nctaid.x
  // num_workers	ntid.y
  // vector length	ntid.x

  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  acc_prof_info *prof_info = thr->prof_info;
  acc_event_info enqueue_launch_event_info;
  acc_api_info *api_info = thr->api_info;
  bool profiling_p = __builtin_expect (prof_info != NULL, false);
  if (profiling_p)
    {
      prof_info->event_type = acc_ev_enqueue_launch_start;

      enqueue_launch_event_info.launch_event.event_type
	= prof_info->event_type;
      enqueue_launch_event_info.launch_event.valid_bytes
	= _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
      enqueue_launch_event_info.launch_event.parent_construct
	= acc_construct_parallel;
      enqueue_launch_event_info.launch_event.implicit = 1;
      enqueue_launch_event_info.launch_event.tool_info = NULL;
      enqueue_launch_event_info.launch_event.kernel_name
	= targ_fn->launch->fn;
      enqueue_launch_event_info.launch_event.num_gangs
	= dims[GOMP_DIM_GANG];
      enqueue_launch_event_info.launch_event.num_workers
	= dims[GOMP_DIM_WORKER];
      enqueue_launch_event_info.launch_event.vector_length
	= dims[GOMP_DIM_VECTOR];

      api_info->device_api = acc_device_api_cuda;

      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
					    &enqueue_launch_event_info,
					    api_info);
    }

  kargs[0] = &dp;

  /* Per 'nvptx_goacc_validate_dims'.  */
  assert (dims[GOMP_DIM_VECTOR] % warp_size == 0);

  CUDA_CALL_ASSERT (cuLaunchKernel, function,
		    dims[GOMP_DIM_GANG], 1, 1,
		    dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
		    0, stream, kargs, 0);

  if (profiling_p)
    {
      prof_info->event_type = acc_ev_enqueue_launch_end;
      enqueue_launch_event_info.launch_event.event_type
	= prof_info->event_type;
      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
					    &enqueue_launch_event_info,
					    api_info);
    }

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
		     targ_fn->launch->fn);
}
void *openacc_get_current_cuda_context (void);
static void
goacc_profiling_acc_ev_alloc (struct goacc_thread *thr, void *dp, size_t s)
{
  acc_prof_info *prof_info = thr->prof_info;
  acc_event_info data_event_info;
  acc_api_info *api_info = thr->api_info;

  prof_info->event_type = acc_ev_alloc;

  data_event_info.data_event.event_type = prof_info->event_type;
  data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
  data_event_info.data_event.parent_construct = acc_construct_parallel;
  data_event_info.data_event.implicit = 1;
  data_event_info.data_event.tool_info = NULL;
  data_event_info.data_event.var_name = NULL;
  data_event_info.data_event.bytes = s;
  data_event_info.data_event.host_ptr = NULL;
  data_event_info.data_event.device_ptr = dp;

  api_info->device_api = acc_device_api_cuda;

  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
}
/* Free the cached soft-stacks block if it is above the SOFTSTACK_CACHE_LIMIT
   size threshold, or if FORCE is true.  */

static void
nvptx_stacks_free (struct ptx_device *ptx_dev, bool force)
{
  pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
  if (ptx_dev->omp_stacks.ptr
      && (force || ptx_dev->omp_stacks.size > SOFTSTACK_CACHE_LIMIT))
    {
      CUresult r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
      ptx_dev->omp_stacks.ptr = 0;
      ptx_dev->omp_stacks.size = 0;
    }
  pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
}
static void *
nvptx_alloc (size_t s, bool suppress_errors, bool usm)
{
  CUdeviceptr d;

  CUresult r = (usm ? CUDA_CALL_NOCHECK (cuMemAllocManaged, &d, s,
					 CU_MEM_ATTACH_GLOBAL)
		: CUDA_CALL_NOCHECK (cuMemAlloc, &d, s));
  if (suppress_errors && r == CUDA_ERROR_OUT_OF_MEMORY)
    return NULL;
  else if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("nvptx_alloc error: %s", cuda_error (r));
      return NULL;
    }

  /* NOTE: We only do profiling stuff if the memory allocation succeeds.  */
  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  bool profiling_p
    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
  if (profiling_p)
    goacc_profiling_acc_ev_alloc (thr, (void *) d, s);

  return (void *) d;
}
static void
goacc_profiling_acc_ev_free (struct goacc_thread *thr, void *p)
{
  acc_prof_info *prof_info = thr->prof_info;
  acc_event_info data_event_info;
  acc_api_info *api_info = thr->api_info;

  prof_info->event_type = acc_ev_free;

  data_event_info.data_event.event_type = prof_info->event_type;
  data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
  data_event_info.data_event.parent_construct = acc_construct_parallel;
  data_event_info.data_event.implicit = 1;
  data_event_info.data_event.tool_info = NULL;
  data_event_info.data_event.var_name = NULL;
  data_event_info.data_event.bytes = -1;
  data_event_info.data_event.host_ptr = NULL;
  data_event_info.data_event.device_ptr = p;

  api_info->device_api = acc_device_api_cuda;

  GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
}
static bool
nvptx_free (void *p, struct ptx_device *ptx_dev)
{
  CUdeviceptr pb;
  size_t ps;

  CUresult r = CUDA_CALL_NOCHECK (cuMemGetAddressRange, &pb, &ps,
				  (CUdeviceptr) p);
  if (r == CUDA_ERROR_NOT_PERMITTED)
    {
      /* We assume that this error indicates we are in a CUDA callback context,
	 where all CUDA calls are not allowed (see cuStreamAddCallback
	 documentation for description).  Arrange to free this piece of device
	 memory later.  */
      struct ptx_free_block *n
	= GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block));
      n->ptr = p;
      pthread_mutex_lock (&ptx_dev->free_blocks_lock);
      n->next = ptx_dev->free_blocks;
      ptx_dev->free_blocks = n;
      pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
      return true;
    }
  else if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("cuMemGetAddressRange error: %s", cuda_error (r));
      return false;
    }
  if ((CUdeviceptr) p != pb)
    {
      GOMP_PLUGIN_error ("invalid device address");
      return false;
    }

  CUDA_CALL (cuMemFree, (CUdeviceptr) p);
  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
  bool profiling_p
    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
  if (profiling_p)
    goacc_profiling_acc_ev_free (thr, p);

  return true;
}
static void *
nvptx_get_current_cuda_device (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return &nvthd->ptx_dev->dev;
}
static void *
nvptx_get_current_cuda_context (void)
{
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!nvthd || !nvthd->ptx_dev)
    return NULL;

  return nvthd->ptx_dev->ctx;
}
/* Plugin entry points.  */

const char *
GOMP_OFFLOAD_get_name (void)
{
  return "nvptx";
}

unsigned int
GOMP_OFFLOAD_get_caps (void)
{
  return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
}

int
GOMP_OFFLOAD_get_type (void)
{
  return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
}
int
GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask)
{
  int num_devices = nvptx_get_num_devices ();
  /* Return -1 if the omp_requires_mask cannot be fulfilled but
     devices were present.  Unified-shared address: see comment in
     nvptx_open_device for CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING.  */
  if (num_devices > 0
      && ((omp_requires_mask
	   & ~(GOMP_REQUIRES_UNIFIED_ADDRESS
	       | GOMP_REQUIRES_UNIFIED_SHARED_MEMORY
	       | GOMP_REQUIRES_REVERSE_OFFLOAD)) != 0))
    return -1;
  return num_devices;
}
bool
GOMP_OFFLOAD_init_device (int n)
{
  struct ptx_device *dev;

  pthread_mutex_lock (&ptx_dev_lock);

  if (!nvptx_init () || ptx_devices[n] != NULL)
    {
      pthread_mutex_unlock (&ptx_dev_lock);
      return false;
    }

  dev = nvptx_open_device (n);
  if (dev)
    {
      ptx_devices[n] = dev;
      instantiated_devices++;
    }

  const char *var_name = "GOMP_NVPTX_LOWLAT_POOL";
  const char *env_var = secure_getenv (var_name);
  notify_var (var_name, env_var);

  if (env_var != NULL)
    {
      char *endptr;
      errno = 0;
      unsigned long val = strtoul (env_var, &endptr, 10);
      if (endptr == NULL || *endptr != '\0'
	  || errno == ERANGE || errno == EINVAL
	  || val > UINT_MAX)
	GOMP_PLUGIN_error ("Error parsing %s", var_name);
      else
	lowlat_pool_size = val;
    }

  pthread_mutex_unlock (&ptx_dev_lock);

  return dev != NULL;
}
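
/* For example (illustrative): setting 'GOMP_NVPTX_LOWLAT_POOL=65536' in the
   environment raises the low-latency omp_alloc pool from the 8 KiB default
   (8 * 1024, see 'lowlat_pool_size' above) to 64 KiB.  */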
bool
GOMP_OFFLOAD_fini_device (int n)
{
  /* This isn't related to this specific 'ptx_devices[n]', but is a convenient
     place to clean up.  */
  if (!nvptx_run_deferred_page_locked_host_free ())
    return false;

  pthread_mutex_lock (&ptx_dev_lock);

  if (ptx_devices[n] != NULL)
    {
      if (!nvptx_attach_host_thread_to_device (n)
	  || !nvptx_close_device (ptx_devices[n]))
	{
	  pthread_mutex_unlock (&ptx_dev_lock);
	  return false;
	}
      ptx_devices[n] = NULL;
      instantiated_devices--;
    }

  if (instantiated_devices == 0)
    {
      free (ptx_devices);
      ptx_devices = NULL;
    }

  pthread_mutex_unlock (&ptx_dev_lock);
  return true;
}
/* Return the libgomp version number we're compatible with.  There is
   no requirement for cross-version compatibility.  */

unsigned
GOMP_OFFLOAD_version (void)
{
  return GOMP_VERSION;
}
/* Initialize __nvptx_clocktick, if present in MODULE.  */

static void
nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
{
  CUdeviceptr dptr;
  CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
				  module, "__nvptx_clocktick");
  if (r == CUDA_ERROR_NOT_FOUND)
    return;
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
  double __nvptx_clocktick = 1e-3 / dev->clock_khz;
  r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
			 sizeof (__nvptx_clocktick));
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
}
/* Invoke MODULE's global constructors/destructors.  */

static bool
nvptx_do_global_cdtors (CUmodule module, struct ptx_device *ptx_dev,
			const char *funcname)
{
  bool ret = true;
  char *funcname_mgomp = NULL;
  CUresult r;
  CUfunction funcptr;
  r = CUDA_CALL_NOCHECK (cuModuleGetFunction,
			 &funcptr, module, funcname);
  GOMP_PLUGIN_debug (0, "cuModuleGetFunction (%s): %s\n",
		     funcname, cuda_error (r));
  if (r == CUDA_ERROR_NOT_FOUND)
    {
      /* Try '[funcname]__mgomp'.  */

      size_t funcname_len = strlen (funcname);
      const char *mgomp_suffix = "__mgomp";
      size_t mgomp_suffix_len = strlen (mgomp_suffix);
      funcname_mgomp
	= GOMP_PLUGIN_malloc (funcname_len + mgomp_suffix_len + 1);
      memcpy (funcname_mgomp, funcname, funcname_len);
      memcpy (funcname_mgomp + funcname_len,
	      mgomp_suffix, mgomp_suffix_len + 1);
      funcname = funcname_mgomp;

      r = CUDA_CALL_NOCHECK (cuModuleGetFunction,
			     &funcptr, module, funcname);
      GOMP_PLUGIN_debug (0, "cuModuleGetFunction (%s): %s\n",
			 funcname, cuda_error (r));
    }
  if (r == CUDA_ERROR_NOT_FOUND)
    ;
  else if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("cuModuleGetFunction (%s) error: %s",
			 funcname, cuda_error (r));
      ret = false;
    }
  else
    {
      /* If necessary, set up soft stack.  */
      void *nvptx_stacks_0;
      void *kargs[1];
      if (funcname_mgomp)
	{
	  size_t stack_size = nvptx_stacks_size ();
	  pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
	  nvptx_stacks_0 = nvptx_stacks_acquire (ptx_dev, stack_size, 1);
	  nvptx_stacks_0 += stack_size;
	  kargs[0] = &nvptx_stacks_0;
	}
      r = CUDA_CALL_NOCHECK (cuLaunchKernel,
			     funcptr,
			     1, 1, 1, 1, 1, 1,
			     /* sharedMemBytes */ 0,
			     /* hStream */ NULL,
			     /* kernelParams */ funcname_mgomp ? kargs : NULL,
			     /* extra */ NULL);
      if (r != CUDA_SUCCESS)
	{
	  GOMP_PLUGIN_error ("cuLaunchKernel (%s) error: %s",
			     funcname, cuda_error (r));
	  ret = false;
	}

      r = CUDA_CALL_NOCHECK (cuStreamSynchronize,
			     NULL);
      if (r != CUDA_SUCCESS)
	{
	  GOMP_PLUGIN_error ("cuStreamSynchronize (%s) error: %s",
			     funcname, cuda_error (r));
	  ret = false;
	}

      if (funcname_mgomp)
	pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
    }

  free (funcname_mgomp);

  return ret;
}
/* Load the (partial) program described by TARGET_DATA to device
   number ORD.  Allocate and return TARGET_TABLE.  If not NULL, REV_FN_TABLE
   will contain the on-device addresses of the functions for reverse offload.
   To be freed by the caller.  */

int
GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
			 struct addr_pair **target_table,
			 uint64_t **rev_fn_table)
{
  CUmodule module;
  const char *const *var_names;
  const struct targ_fn_launch *fn_descs;
  unsigned int fn_entries, var_entries, other_entries, i, j;
  struct targ_fn_descriptor *targ_fns;
  struct addr_pair *targ_tbl;
  const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
  struct ptx_image_data *new_image;
  struct ptx_device *dev;

  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
			 " (expected %u, received %u)",
			 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
      return -1;
    }

  if (!nvptx_attach_host_thread_to_device (ord)
      || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
    return -1;

  dev = ptx_devices[ord];

  /* The mkoffload utility emits a struct of pointers/integers at the
     start of each offload image.  The array of kernel names and the
     functions addresses form a one-to-one correspondence.  */

  var_entries = img_header->var_num;
  var_names = img_header->var_names;
  fn_entries = img_header->fn_num;
  fn_descs = img_header->fn_descs;

  /* Currently, other_entries contains only the struct of ICVs.  */
  other_entries = 1;

  targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
				 * (fn_entries + var_entries + other_entries));
  targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
				 * fn_entries);

  *target_table = targ_tbl;

  new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
  new_image->target_data = target_data;
  new_image->module = module;
  new_image->fns = targ_fns;

  pthread_mutex_lock (&dev->image_lock);
  new_image->next = dev->images;
  dev->images = new_image;
  pthread_mutex_unlock (&dev->image_lock);

  for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
    {
      CUfunction function;
      int nregs, mthrs;

      CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
		      fn_descs[i].fn);
      CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
		      CU_FUNC_ATTRIBUTE_NUM_REGS, function);
      CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
		      CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);

      targ_fns->fn = function;
      targ_fns->launch = &fn_descs[i];
      targ_fns->regs_per_thread = nregs;
      targ_fns->max_threads_per_block = mthrs;

      targ_tbl->start = (uintptr_t) targ_fns;
      targ_tbl->end = targ_tbl->start + 1;
    }

  for (j = 0; j < var_entries; j++, targ_tbl++)
    {
      CUdeviceptr var;
      size_t bytes;

      CUDA_CALL_ERET (-1, cuModuleGetGlobal,
		      &var, &bytes, module, var_names[j]);

      targ_tbl->start = (uintptr_t) var;
      targ_tbl->end = targ_tbl->start + bytes;
    }

  {
    CUdeviceptr varptr;
    size_t varsize;
    CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &varptr, &varsize,
				    module, XSTRING (GOMP_ADDITIONAL_ICVS));

    if (r == CUDA_SUCCESS)
      {
	targ_tbl->start = (uintptr_t) varptr;
	targ_tbl->end = (uintptr_t) (varptr + varsize);
      }
    else
      /* The variable was not in this image.  */
      targ_tbl->start = targ_tbl->end = 0;
  }

  if (rev_fn_table && fn_entries == 0)
    *rev_fn_table = NULL;
  else if (rev_fn_table)
    {
      CUdeviceptr var;
      size_t bytes;
      CUresult r;
      r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &var, &bytes, module,
			     "$offload_func_table");
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
      assert (bytes == sizeof (uint64_t) * fn_entries);
      *rev_fn_table = GOMP_PLUGIN_malloc (sizeof (uint64_t) * fn_entries);
      r = CUDA_CALL_NOCHECK (cuMemcpyDtoH, *rev_fn_table, var, bytes);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
      /* Free if only NULL entries.  */
      for (i = 0; i < fn_entries; ++i)
	if ((*rev_fn_table)[i] != 0)
	  break;
      if (i == fn_entries)
	{
	  free (*rev_fn_table);
	  *rev_fn_table = NULL;
	}
    }

  if (rev_fn_table && *rev_fn_table && dev->rev_data == NULL)
    {
      /* Get the on-device GOMP_REV_OFFLOAD_VAR variable.  It should be
	 available but it might be not.  One reason could be: if the user code
	 has 'omp target device(ancestor:1)' in pure hostcode, GOMP_target_ext
	 is not called on the device and, hence, it and GOMP_REV_OFFLOAD_VAR
	 are not linked in.  */
      CUdeviceptr device_rev_offload_var;
      size_t device_rev_offload_size;
      CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal,
				      &device_rev_offload_var,
				      &device_rev_offload_size, module,
				      XSTRING (GOMP_REV_OFFLOAD_VAR));
      if (r != CUDA_SUCCESS)
	{
	  free (*rev_fn_table);
	  *rev_fn_table = NULL;
	}
      else
	{
	  /* cuMemHostAlloc memory is accessible on the device, if
	     unified-shared address is supported; this is assumed - see comment
	     in nvptx_open_device for
	     CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING.  */
	  CUDA_CALL_ASSERT (cuMemHostAlloc, (void **) &dev->rev_data,
			    sizeof (*dev->rev_data),
			    CU_MEMHOSTALLOC_DEVICEMAP);
	  CUdeviceptr dp = (CUdeviceptr) dev->rev_data;
	  r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, device_rev_offload_var, &dp,
				 sizeof (dp));
	  if (r != CUDA_SUCCESS)
	    GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
	}
    }

  nvptx_set_clocktick (module, dev);

  if (!nvptx_do_global_cdtors (module, dev, "__do_global_ctors__entry"))
    return -1;

  return fn_entries + var_entries + other_entries;
}
/* Unload the program described by TARGET_DATA.  DEV_DATA is the
   function descriptors allocated by G_O_load_image.  */

bool
GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
{
  struct ptx_image_data *image, **prev_p;
  struct ptx_device *dev = ptx_devices[ord];

  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
			 " (expected %u, received %u)",
			 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
      return false;
    }

  bool ret = true;
  pthread_mutex_lock (&dev->image_lock);
  for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
    if (image->target_data == target_data)
      {
	if (!nvptx_do_global_cdtors (image->module, dev,
				     "__do_global_dtors__entry"))
	  ret = false;

	*prev_p = image->next;
	if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
	  ret = false;
	free (image->fns);
	free (image);
	break;
      }
  pthread_mutex_unlock (&dev->image_lock);
  return ret;
}
static void *
GOMP_OFFLOAD_alloc_1 (int ord, size_t size, bool usm)
{
  if (!nvptx_attach_host_thread_to_device (ord))
    return NULL;

  struct ptx_device *ptx_dev = ptx_devices[ord];
  struct ptx_free_block *blocks, *tmp;

  pthread_mutex_lock (&ptx_dev->free_blocks_lock);
  blocks = ptx_dev->free_blocks;
  ptx_dev->free_blocks = NULL;
  pthread_mutex_unlock (&ptx_dev->free_blocks_lock);

  nvptx_stacks_free (ptx_dev, false);

  while (blocks)
    {
      tmp = blocks->next;
      nvptx_free (blocks->ptr, ptx_dev);
      free (blocks);
      blocks = tmp;
    }

  void *d = nvptx_alloc (size, true, usm);
  if (d)
    return d;
  else
    {
      /* Memory allocation failed.  Try freeing the stacks block, and
	 retrying.  */
      nvptx_stacks_free (ptx_dev, true);
      return nvptx_alloc (size, false, usm);
    }
}
void *
GOMP_OFFLOAD_alloc (int ord, size_t size)
{
  return GOMP_OFFLOAD_alloc_1 (ord, size, false);
}

void *
GOMP_OFFLOAD_usm_alloc (int ord, size_t size)
{
  return GOMP_OFFLOAD_alloc_1 (ord, size, true);
}

bool
GOMP_OFFLOAD_free (int ord, void *ptr)
{
  return (nvptx_attach_host_thread_to_device (ord)
	  && nvptx_free (ptr, ptx_devices[ord]));
}

bool
GOMP_OFFLOAD_usm_free (int ord, void *ptr)
{
  return GOMP_OFFLOAD_free (ord, ptr);
}
bool
GOMP_OFFLOAD_is_usm_ptr (void *ptr)
{
  bool managed = false;
  /* This returns 3 outcomes ...
     CUDA_ERROR_INVALID_VALUE - Not a Cuda allocated pointer.
     CUDA_SUCCESS, managed:false - Cuda allocated, but not USM.
     CUDA_SUCCESS, managed:true - USM.  */
  CUDA_CALL_NOCHECK (cuPointerGetAttribute, &managed,
		     CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) ptr);
  return managed;
}
bool
GOMP_OFFLOAD_page_locked_host_alloc (void **ptr, size_t size)
{
  GOMP_PLUGIN_debug (0, "nvptx %s: ptr=%p, size=%llu\n",
		     __FUNCTION__, ptr, (unsigned long long) size);

  /* TODO: Maybe running the deferred 'cuMemFreeHost's here is not the best
     idea, given that we don't know what context we're called from?  (See
     'GOMP_OFFLOAD_run' reverse offload handling.)  But, where to do it?  */
  if (!nvptx_run_deferred_page_locked_host_free ())
    return false;

  CUresult r;

  unsigned int flags = 0;
  /* Given 'CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING', we don't need
     'flags |= CU_MEMHOSTALLOC_PORTABLE;' here.  */
  r = CUDA_CALL_NOCHECK (cuMemHostAlloc, ptr, size, flags);
  if (r == CUDA_ERROR_OUT_OF_MEMORY)
    *ptr = NULL;
  else if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("cuMemHostAlloc error: %s", cuda_error (r));
      return false;
    }
  GOMP_PLUGIN_debug (0, "  -> *ptr=%p\n",
		     *ptr);
  return true;
}
static void
nvptx_page_locked_host_free_callback (CUstream stream, CUresult r, void *ptr)
{
  GOMP_PLUGIN_debug (0, "%s: stream=%p, r=%u, ptr=%p\n",
		     __FUNCTION__, stream, (unsigned) r, ptr);

  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_error ("%s error: %s", __FUNCTION__, cuda_error (r));

  /* We can't now call 'cuMemFreeHost': we're in a CUDA stream context,
     where we "must not make any CUDA API calls".
     And, in particular in an OpenMP 'target' reverse offload context,
     this may even dead-lock?!  */
  /* See 'nvptx_free'.  */
  struct ptx_free_block *n
    = GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block));
  GOMP_PLUGIN_debug (0, "  defer; n=%p\n", n);
  n->ptr = ptr;
  pthread_mutex_lock (&free_host_blocks_lock);
  n->next = free_host_blocks;
  free_host_blocks = n;
  pthread_mutex_unlock (&free_host_blocks_lock);
}
bool
GOMP_OFFLOAD_page_locked_host_free (void *ptr, struct goacc_asyncqueue *aq)
{
  GOMP_PLUGIN_debug (0, "nvptx %s: ptr=%p, aq=%p\n",
		     __FUNCTION__, ptr, aq);

  if (aq)
    {
      GOMP_PLUGIN_debug (0, "  aq <-"
			 " nvptx_page_locked_host_free_callback(ptr)\n");
      CUDA_CALL (cuStreamAddCallback, aq->cuda_stream,
		 nvptx_page_locked_host_free_callback, ptr, 0);
    }
  else
    CUDA_CALL (cuMemFreeHost, ptr);
  return true;
}
static int
nvptx_page_locked_host_p (const void *ptr, size_t size)
{
  GOMP_PLUGIN_debug (0, "%s: ptr=%p, size=%llu\n",
		     __FUNCTION__, ptr, (unsigned long long) size);

  int page_locked_host_p = false;
  unsigned int flags;
  CUresult r;
  /* Apparently, there exists no CUDA call to query 'PTR + [0, SIZE)'.  Instead
     of invoking 'cuMemHostGetFlags' SIZE times, we deem it sufficient to only
     query the base PTR.  */
  void *ptr_noconst = (void *) ptr;
  r = CUDA_CALL_NOCHECK (cuMemHostGetFlags, &flags, ptr_noconst);
  if (r == CUDA_SUCCESS)
    page_locked_host_p = true;
  else if (r == CUDA_ERROR_INVALID_VALUE)
    ;
  else
    {
      GOMP_PLUGIN_error ("cuMemHostGetFlags error: %s", cuda_error (r));
      page_locked_host_p = -1;
    }
  GOMP_PLUGIN_debug (0, "  -> %d (with r = %u)\n",
		     page_locked_host_p, (unsigned) r);
  return page_locked_host_p;
}
bool
GOMP_OFFLOAD_page_locked_host_register (int ord,
					void *ptr, size_t size, int kind)
{
  bool try_read_only;
  /* Magic number: if the actual mapping kind is unknown...  */
  if (kind == -1)
    /* ..., allow for trying read-only registration here.  */
    try_read_only = true;
  else
    try_read_only = !GOMP_MAP_COPY_FROM_P (kind);
  GOMP_PLUGIN_debug (0, "nvptx %s: ord=%d, ptr=%p, size=%llu,"
		     " kind=%d (try_read_only=%d)\n",
		     __FUNCTION__, ord, ptr, (unsigned long long) size,
		     kind, try_read_only);

  if (!nvptx_attach_host_thread_to_device (ord))
    return false;
  struct ptx_device *ptx_dev = ptx_devices[ord];

  CUresult r;

  unsigned int flags = 0;
  /* Given 'CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING', we don't need
     'flags |= CU_MEMHOSTREGISTER_PORTABLE;' here.  */
 cuMemHostRegister:
  if (CUDA_CALL_EXISTS (cuMemHostRegister_v2))
    r = CUDA_CALL_NOCHECK (cuMemHostRegister_v2, ptr, size, flags);
  else
    r = CUDA_CALL_NOCHECK (cuMemHostRegister, ptr, size, flags);
  if (r == CUDA_SUCCESS)
    ;
  else if (r == CUDA_ERROR_INVALID_VALUE)
    {
      /* For example, for 'cuMemHostAlloc' (via the user code, for example)
	 followed by 'cuMemHostRegister' (via 'always_pinned_mode', for
	 example), we don't get 'CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED' but
	 'CUDA_ERROR_INVALID_VALUE'.  */
      if (nvptx_page_locked_host_p (ptr, size))
	/* Accept the case that the region already is page-locked.  */
	;
      /* Depending on certain circumstances (see 'cuMemHostRegister'
	 documentation), for example, 'const' data that is placed in section
	 '.rodata' may need 'flags |= CU_MEMHOSTREGISTER_READ_ONLY;', to avoid
	 'CUDA_ERROR_INVALID_VALUE'.  If running into that, we now apply/re-try
	 lazily instead of actively setting it above, to avoid the following
	 problem.  Supposedly/observably (but, not documented), if part of a
	 memory page has been registered without 'CU_MEMHOSTREGISTER_READ_ONLY'
	 and we then try to register another part with
	 'CU_MEMHOSTREGISTER_READ_ONLY', we'll get 'CUDA_ERROR_INVALID_VALUE'.
	 In that case, we can solve the issue by re-trying with
	 'CU_MEMHOSTREGISTER_READ_ONLY' masked out.  However, if part of a
	 memory page has been registered with 'CU_MEMHOSTREGISTER_READ_ONLY'
	 and we then try to register another part without
	 'CU_MEMHOSTREGISTER_READ_ONLY', that latter part apparently inherits
	 the former's 'CU_MEMHOSTREGISTER_READ_ONLY' (and any device to host
	 copy then fails).  We can't easily resolve that situation
	 retroactively, that is, we can't easily re-register the first
	 'CU_MEMHOSTREGISTER_READ_ONLY' part without that flag.  */
      else if (!(flags & CU_MEMHOSTREGISTER_READ_ONLY)
	       && try_read_only
	       && ptx_dev->read_only_host_register_supported)
	{
	  GOMP_PLUGIN_debug (0, "  flags |= CU_MEMHOSTREGISTER_READ_ONLY;\n");
	  flags |= CU_MEMHOSTREGISTER_READ_ONLY;
	  goto cuMemHostRegister;
	}
      /* We ought to use 'CU_MEMHOSTREGISTER_READ_ONLY', but it's not
	 available.  */
      else if (try_read_only
	       && !ptx_dev->read_only_host_register_supported)
	{
	  assert (!(flags & CU_MEMHOSTREGISTER_READ_ONLY));
	  GOMP_PLUGIN_debug (0, "  punt;"
			     " CU_MEMHOSTREGISTER_READ_ONLY not available\n");
	  /* Accept this (legacy) case; we can't (easily) register page-locked
	     this region of host memory.  */
	}
      else
	{
	  GOMP_PLUGIN_error ("cuMemHostRegister error: %s", cuda_error (r));
	  return false;
	}
    }
  else if (r == CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED)
    {
      /* 'cuMemHostRegister' (via the user code, for example) followed by
	 another (potentially partially overlapping) 'cuMemHostRegister'
	 (via 'always_pinned_mode', for example).  */
      /* Accept this case in good faith; do not verify further.  */
    }
  else
    {
      GOMP_PLUGIN_error ("cuMemHostRegister error: %s", cuda_error (r));
      return false;
    }
  GOMP_PLUGIN_debug (0, "  -> %d (with r = %u)\n",
		     true, (unsigned) r);
  return true;
}
static void
nvptx_page_locked_host_unregister_callback (CUstream stream, CUresult r,
					    void *b_)
{
  void **b = b_;
  struct goacc_asyncqueue *aq = b[0];
  void *ptr = b[1];
  GOMP_PLUGIN_debug (0, "%s: stream=%p, r=%u, b_=%p (aq=%p, ptr=%p)\n",
		     __FUNCTION__, stream, (unsigned) r, b_, aq, ptr);
  free (b);

  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_error ("%s error: %s", __FUNCTION__, cuda_error (r));

  /* We can't now call 'cuMemHostUnregister': we're in a CUDA stream context,
     where we "must not make any CUDA API calls".  */
  /* See 'nvptx_free'.  */
  struct ptx_free_block *n
    = GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block));
  GOMP_PLUGIN_debug (0, "  defer; n=%p\n", n);
  n->ptr = ptr;
  pthread_mutex_lock (&aq->page_locked_host_unregister_blocks_lock);
  n->next = aq->page_locked_host_unregister_blocks;
  aq->page_locked_host_unregister_blocks = n;
  pthread_mutex_unlock (&aq->page_locked_host_unregister_blocks_lock);
}
, size_t size
,
1990 struct goacc_asyncqueue
*aq
)
1992 GOMP_PLUGIN_debug (0, "nvptx %s: ptr=%p, size=%llu, aq=%p\n",
1993 __FUNCTION__
, ptr
, (unsigned long long) size
, aq
);
1998 /* We don't unregister right away, as in-flight operations may still
1999 benefit from the registration. */
2000 void **b
= GOMP_PLUGIN_malloc (2 * sizeof (*b
));
2003 GOMP_PLUGIN_debug (0, " aq <-"
2004 " nvptx_page_locked_host_unregister_callback(b=%p)\n",
2006 CUDA_CALL (cuStreamAddCallback
, aq
->cuda_stream
,
2007 nvptx_page_locked_host_unregister_callback
, b
, 0);
2010 CUDA_CALL (cuMemHostUnregister
, ptr
);
int
GOMP_OFFLOAD_page_locked_host_p (int ord, const void *ptr, size_t size)
{
  GOMP_PLUGIN_debug (0, "nvptx %s: ord=%d, ptr=%p, size=%llu\n",
		     __FUNCTION__, ord, ptr, (unsigned long long) size);

  if (!nvptx_attach_host_thread_to_device (ord))
    return -1;

  return nvptx_page_locked_host_p (ptr, size);
}
void
GOMP_OFFLOAD_openacc_exec (void (*fn) (void *),
			   size_t mapnum __attribute__((unused)),
			   void **hostaddrs __attribute__((unused)),
			   void **devaddrs,
			   unsigned *dims, void *targ_mem_desc)
{
  GOMP_PLUGIN_debug (0, "nvptx %s\n", __FUNCTION__);

  CUdeviceptr dp = (CUdeviceptr) devaddrs;
  nvptx_exec (fn, dims, targ_mem_desc, dp, NULL);

  CUresult r = CUDA_CALL_NOCHECK (cuStreamSynchronize, NULL);
  const char *maybe_abort_msg = "(perhaps abort was called)";
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
		       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
}
void
GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *),
				 size_t mapnum __attribute__((unused)),
				 void **hostaddrs __attribute__((unused)),
				 void **devaddrs,
				 unsigned *dims, void *targ_mem_desc,
				 struct goacc_asyncqueue *aq)
{
  GOMP_PLUGIN_debug (0, "nvptx %s\n", __FUNCTION__);

  CUdeviceptr dp = (CUdeviceptr) devaddrs;
  nvptx_exec (fn, dims, targ_mem_desc, dp, aq->cuda_stream);
}
void *
GOMP_OFFLOAD_openacc_create_thread_data (int ord)
{
  struct ptx_device *ptx_dev;
  struct nvptx_thread *nvthd
    = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
  CUcontext thd_ctx;

  ptx_dev = ptx_devices[ord];

  assert (ptx_dev);

  CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);

  assert (ptx_dev->ctx);

  if (!thd_ctx)
    CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);

  nvthd->ptx_dev = ptx_dev;

  return (void *) nvthd;
}

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}
void *
GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
{
  return nvptx_get_current_cuda_device ();
}

void *
GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
{
  return nvptx_get_current_cuda_context ();
}
/* This returns a CUstream.  */
void *
GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *aq)
{
  return (void *) aq->cuda_stream;
}
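
/* Illustrative interop sketch (not part of the original source): user code
   typically reaches this hook through the OpenACC runtime API, e.g.

     #include <openacc.h>
     CUstream s = (CUstream) acc_get_cuda_stream (async);

   which libgomp routes to GOMP_OFFLOAD_openacc_cuda_get_stream for the
   queue backing 'async'.  */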
/* This takes a CUstream.  */
int
GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq,
				      void *stream)
{
  if (aq->cuda_stream)
    {
      CUDA_CALL_ASSERT (cuStreamSynchronize, aq->cuda_stream);
      CUDA_CALL_ASSERT (cuStreamDestroy, aq->cuda_stream);
    }

  aq->cuda_stream = (CUstream) stream;
  return 1;
}
static struct goacc_asyncqueue *
nvptx_goacc_asyncqueue_construct (unsigned int flags)
{
  GOMP_PLUGIN_debug (0, "%s: flags=%u\n",
		     __FUNCTION__, flags);

  CUstream stream = NULL;
  CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, flags);

  struct goacc_asyncqueue *aq
    = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue));
  aq->cuda_stream = stream;
  pthread_mutex_init (&aq->page_locked_host_unregister_blocks_lock, NULL);
  aq->page_locked_host_unregister_blocks = NULL;
  GOMP_PLUGIN_debug (0, "  -> aq=%p (with cuda_stream=%p)\n",
		     aq, aq->cuda_stream);
  return aq;
}
struct goacc_asyncqueue *
GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
{
  return nvptx_goacc_asyncqueue_construct (CU_STREAM_DEFAULT);
}
static bool
nvptx_goacc_asyncqueue_destruct (struct goacc_asyncqueue *aq)
{
  GOMP_PLUGIN_debug (0, "nvptx %s: aq=%p\n",
		     __FUNCTION__, aq);

  CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream);

  bool ret = true;
  pthread_mutex_lock (&aq->page_locked_host_unregister_blocks_lock);
  if (aq->page_locked_host_unregister_blocks != NULL)
    {
      GOMP_PLUGIN_error ("aq->page_locked_host_unregister_blocks not empty");
      ret = false;
    }
  pthread_mutex_unlock (&aq->page_locked_host_unregister_blocks_lock);
  pthread_mutex_destroy (&aq->page_locked_host_unregister_blocks_lock);

  free (aq);
  return ret;
}

bool
GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
{
  return nvptx_goacc_asyncqueue_destruct (aq);
}

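/* Minimal internal usage sketch (kept out of compilation): the
   construct/destruct pair above is also used plugin-internally, e.g. for the
   temporary reverse-offload queue in GOMP_OFFLOAD_run further below.  */
#if 0
bool
example_queue_lifecycle (void)
{
  struct goacc_asyncqueue *aq
    = nvptx_goacc_asyncqueue_construct (CU_STREAM_NON_BLOCKING);
  if (!aq)
    return false;
  /* ... enqueue work on aq->cuda_stream ...  */
  return nvptx_goacc_asyncqueue_destruct (aq);
}
#endif
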
static bool
nvptx_run_deferred_page_locked_host_unregister (struct goacc_asyncqueue *aq)
{
  GOMP_PLUGIN_debug (0, "%s: aq=%p\n",
		     __FUNCTION__, aq);

  bool ret = true;
  pthread_mutex_lock (&aq->page_locked_host_unregister_blocks_lock);
  for (struct ptx_free_block *b = aq->page_locked_host_unregister_blocks; b;)
    {
      GOMP_PLUGIN_debug (0, "  b=%p: cuMemHostUnregister(b->ptr=%p)\n",
			 b, b->ptr);
      struct ptx_free_block *b_next = b->next;
      CUresult r = CUDA_CALL_NOCHECK (cuMemHostUnregister, b->ptr);
      if (r != CUDA_SUCCESS)
	{
	  GOMP_PLUGIN_error ("cuMemHostUnregister error: %s", cuda_error (r));
	  ret = false;
	}
      free (b);
      b = b_next;
    }
  aq->page_locked_host_unregister_blocks = NULL;
  pthread_mutex_unlock (&aq->page_locked_host_unregister_blocks_lock);
  return ret;
}

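/* Hedged sketch (kept out of compilation) of the producer side of the list
   consumed above; the actual producer code lives outside this excerpt, so
   'example_defer_host_unregister' is hypothetical, though the field names
   follow 'struct goacc_asyncqueue' and 'struct ptx_free_block' as used here.
   A host block whose 'cuMemHostUnregister' must be deferred until the stream
   has finished using it would be queued like this:  */
#if 0
void
example_defer_host_unregister (struct goacc_asyncqueue *aq, void *ptr)
{
  struct ptx_free_block *b = GOMP_PLUGIN_malloc (sizeof (*b));
  b->ptr = ptr;
  /* Push onto the queue's list, under its lock.  */
  pthread_mutex_lock (&aq->page_locked_host_unregister_blocks_lock);
  b->next = aq->page_locked_host_unregister_blocks;
  aq->page_locked_host_unregister_blocks = b;
  pthread_mutex_unlock (&aq->page_locked_host_unregister_blocks_lock);
}
#endif
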
int
GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
{
  GOMP_PLUGIN_debug (0, "nvptx %s: aq=%p\n",
		     __FUNCTION__, aq);

  CUresult r = CUDA_CALL_NOCHECK (cuStreamQuery, aq->cuda_stream);
  if (r == CUDA_SUCCESS)
    {
      /* As a user may expect that they don't need to 'wait' if
	 'acc_async_test' returns 'true', clean up here, too.  */
      if (!nvptx_run_deferred_page_locked_host_unregister (aq))
	return -1;
      return 1;
    }
  if (r == CUDA_ERROR_NOT_READY)
    return 0;

  GOMP_PLUGIN_error ("cuStreamQuery error: %s", cuda_error (r));
  return -1;
}

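/* Illustrative sketch (kept out of compilation): user code reaches the hook
   above via 'acc_async_test' from <openacc.h>, relying on the guarantee that
   a 'true' result has already performed the deferred cleanup.  The 'async'
   value 1 is an arbitrary example.  */
#if 0
#include <openacc.h>

void
example_poll_async (void)
{
  while (!acc_async_test (1))	/* 'async (1)' work still running...  */
    ;				/* ... do other host work meanwhile.  */
}
#endif
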
static bool
nvptx_goacc_asyncqueue_synchronize (struct goacc_asyncqueue *aq)
{
  GOMP_PLUGIN_debug (0, "%s: aq=%p\n",
		     __FUNCTION__, aq);

  CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream);

  /* This is called from a user code (non-stream) context, and upon returning,
     we must have given up on any page-locked memory registrations, so
     unregister any pending ones now.  */
  if (!nvptx_run_deferred_page_locked_host_unregister (aq))
    return false;

  return true;
}

bool
GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
{
  return nvptx_goacc_asyncqueue_synchronize (aq);
}

static void
nvptx_move_page_locked_host_unregister_blocks_aq1_aq2_callback
  (CUstream stream, CUresult r, void *b_)
{
  void **b = b_;
  struct goacc_asyncqueue *aq1 = b[0];
  struct goacc_asyncqueue *aq2 = b[1];
  GOMP_PLUGIN_debug (0, "%s: stream=%p, r=%u, b_=%p (aq1=%p, aq2=%p)\n",
		     __FUNCTION__, stream, (unsigned) r, b_, aq1, aq2);
  free (b_);

  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_error ("%s error: %s", __FUNCTION__, cuda_error (r));

  pthread_mutex_lock (&aq1->page_locked_host_unregister_blocks_lock);
  if (aq1->page_locked_host_unregister_blocks)
    {
      pthread_mutex_lock (&aq2->page_locked_host_unregister_blocks_lock);
      GOMP_PLUGIN_debug (0, "  page_locked_host_unregister_blocks:"
			 " aq1=%p -> aq2=%p\n", aq1, aq2);
      /* Append 'aq1's blocks to 'aq2's list.  */
      if (aq2->page_locked_host_unregister_blocks == NULL)
	aq2->page_locked_host_unregister_blocks
	  = aq1->page_locked_host_unregister_blocks;
      else
	{
	  struct ptx_free_block *b = aq2->page_locked_host_unregister_blocks;
	  while (b->next != NULL)
	    b = b->next;
	  b->next = aq1->page_locked_host_unregister_blocks;
	}
      pthread_mutex_unlock (&aq2->page_locked_host_unregister_blocks_lock);
      aq1->page_locked_host_unregister_blocks = NULL;
    }
  pthread_mutex_unlock (&aq1->page_locked_host_unregister_blocks_lock);
}

bool
GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
				      struct goacc_asyncqueue *aq2)
{
  GOMP_PLUGIN_debug (0, "nvptx %s: aq1=%p, aq2=%p\n",
		     __FUNCTION__, aq1, aq2);

  void **b = GOMP_PLUGIN_malloc (2 * sizeof (*b));
  b[0] = aq1;
  b[1] = aq2;
  /* Enqueue on 'aq1': move 'page_locked_host_unregister_blocks' of 'aq1'
     over to 'aq2'.  */
  GOMP_PLUGIN_debug (0, "  aq1 <-"
		     " nvptx_move_page_locked_host_unregister_blocks_aq1_aq2_callback"
		     "\n");
  CUDA_CALL (cuStreamAddCallback, aq1->cuda_stream,
	     nvptx_move_page_locked_host_unregister_blocks_aq1_aq2_callback,
	     b, 0);

  CUevent e;
  CUDA_CALL_ERET (false, cuEventCreate, &e, CU_EVENT_DISABLE_TIMING);
  CUDA_CALL_ERET (false, cuEventRecord, e, aq1->cuda_stream);
  CUDA_CALL_ERET (false, cuStreamWaitEvent, aq2->cuda_stream, e, 0);

  return true;
}

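/* The generic driver-API pattern used above, in isolation (kept out of
   compilation): make STREAM2 wait for all work enqueued on STREAM1 so far,
   without blocking the host.  */
#if 0
#include <cuda.h>

void
example_serialize_streams (CUstream stream1, CUstream stream2)
{
  CUevent e;
  cuEventCreate (&e, CU_EVENT_DISABLE_TIMING);	/* No timing needed.  */
  cuEventRecord (e, stream1);	 /* Snapshot the current tail of stream1.  */
  cuStreamWaitEvent (stream2, e, 0);  /* stream2 blocks until 'e' fires.  */
  cuEventDestroy (e);	/* The event may be released right away.  */
}
#endif
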
static void
cuda_callback_wrapper (CUstream stream, CUresult res, void *ptr)
{
  if (res != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("%s error: %s", __FUNCTION__, cuda_error (res));
  struct nvptx_callback *cb = (struct nvptx_callback *) ptr;
  cb->fn (cb->ptr);
  free (ptr);
}

void
GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
					   void (*callback_fn)(void *),
					   void *userptr)
{
  struct nvptx_callback *b = GOMP_PLUGIN_malloc (sizeof (*b));
  b->fn = callback_fn;
  b->ptr = userptr;
  b->aq = aq;
  CUDA_CALL_ASSERT (cuStreamAddCallback, aq->cuda_stream,
		    cuda_callback_wrapper, (void *) b, 0);
}

static bool
cuda_memcpy_sanity_check (const void *h, const void *d, size_t s)
{
  CUdeviceptr pb;
  size_t ps;
  if (!s)
    return true;
  if (!d)
    {
      GOMP_PLUGIN_error ("invalid device address");
      return false;
    }
  CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
  if (!pb)
    {
      GOMP_PLUGIN_error ("invalid device address");
      return false;
    }
  if (!h)
    {
      GOMP_PLUGIN_error ("invalid host address");
      return false;
    }
  if (d == h)
    {
      GOMP_PLUGIN_error ("invalid host or device address");
      return false;
    }
  if ((void *)(d + s) > (void *)(pb + ps))
    {
      GOMP_PLUGIN_error ("invalid size");
      return false;
    }
  return true;
}

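/* For example, given a 100-byte allocation obtained via cuMemAlloc,
   'cuMemGetAddressRange' reports that whole block for any address inside it,
   so a copy of 60 bytes starting 50 bytes in fails the final check above:
   'd + s' (base + 50 + 60) exceeds 'pb + ps' (base + 100).  */
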
bool
GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
{
  if (!nvptx_attach_host_thread_to_device (ord)
      || !cuda_memcpy_sanity_check (src, dst, n))
    return false;
  CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) dst, src, n);
  return true;
}

bool
GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
{
  if (!nvptx_attach_host_thread_to_device (ord)
      || !cuda_memcpy_sanity_check (dst, src, n))
    return false;
  CUDA_CALL (cuMemcpyDtoH, dst, (CUdeviceptr) src, n);
  return true;
}

bool
GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
{
  CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, NULL);
  return true;
}

bool
GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src,
				     size_t n, struct goacc_asyncqueue *aq)
{
  if (!nvptx_attach_host_thread_to_device (ord)
      || !cuda_memcpy_sanity_check (src, dst, n))
    return false;
  CUDA_CALL (cuMemcpyHtoDAsync, (CUdeviceptr) dst, src, n, aq->cuda_stream);
  return true;
}

bool
GOMP_OFFLOAD_openacc_async_dev2host (int ord, void *dst, const void *src,
				     size_t n, struct goacc_asyncqueue *aq)
{
  if (!nvptx_attach_host_thread_to_device (ord)
      || !cuda_memcpy_sanity_check (dst, src, n))
    return false;
  CUDA_CALL (cuMemcpyDtoHAsync, dst, (CUdeviceptr) src, n, aq->cuda_stream);
  return true;
}

union goacc_property_value
GOMP_OFFLOAD_openacc_get_property (int n, enum goacc_property prop)
{
  union goacc_property_value propval = { .val = 0 };

  pthread_mutex_lock (&ptx_dev_lock);

  if (n >= nvptx_get_num_devices () || n < 0 || ptx_devices[n] == NULL)
    {
      pthread_mutex_unlock (&ptx_dev_lock);
      return propval;
    }

  struct ptx_device *ptx_dev = ptx_devices[n];
  switch (prop)
    {
    case GOACC_PROPERTY_MEMORY:
      {
	size_t total_mem;

	CUDA_CALL_ERET (propval, cuDeviceTotalMem, &total_mem, ptx_dev->dev);
	propval.val = total_mem;
      }
      break;
    case GOACC_PROPERTY_FREE_MEMORY:
      {
	size_t total_mem;
	size_t free_mem;
	CUdevice ctxdev;

	CUDA_CALL_ERET (propval, cuCtxGetDevice, &ctxdev);
	if (ptx_dev->dev == ctxdev)
	  CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
	else if (ptx_dev->ctx)
	  {
	    CUcontext old_ctx;

	    CUDA_CALL_ERET (propval, cuCtxPushCurrent, ptx_dev->ctx);
	    CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
	    CUDA_CALL_ASSERT (cuCtxPopCurrent, &old_ctx);
	  }
	else
	  {
	    CUcontext new_ctx;

	    CUDA_CALL_ERET (propval, cuCtxCreate, &new_ctx, CU_CTX_SCHED_AUTO,
			    ptx_dev->dev);
	    CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
	    CUDA_CALL_ASSERT (cuCtxDestroy, new_ctx);
	  }
	propval.val = free_mem;
      }
      break;
    case GOACC_PROPERTY_NAME:
      propval.ptr = ptx_dev->name;
      break;
    case GOACC_PROPERTY_VENDOR:
      propval.ptr = "Nvidia";
      break;
    case GOACC_PROPERTY_DRIVER:
      propval.ptr = cuda_driver_version_s;
      break;
    default:
      break;
    }

  pthread_mutex_unlock (&ptx_dev_lock);
  return propval;
}

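/* Illustrative sketch (kept out of compilation): these properties surface to
   user code via the OpenACC 'acc_get_property' / 'acc_get_property_string'
   routines from <openacc.h>:  */
#if 0
#include <openacc.h>
#include <stdio.h>

void
example_query_properties (void)
{
  size_t mem = acc_get_property (0, acc_device_nvidia, acc_property_memory);
  const char *name
    = acc_get_property_string (0, acc_device_nvidia, acc_property_name);
  printf ("%s: %zu bytes\n", name, mem);
}
#endif
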
/* Adjust launch dimensions: pick good values for number of blocks and warps
   and ensure that number of warps does not exceed CUDA limits as well as GCC's
   own limits.  */

static void
nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
			    struct ptx_device *ptx_dev,
			    int *teams_p, int *threads_p)
{
  int max_warps_block = fn->max_threads_per_block / 32;
  /* Maximum 32 warps per block is an implementation limit in NVPTX backend
     and libgcc, which matches documented limit of all GPUs as of 2015.  */
  if (max_warps_block > 32)
    max_warps_block = 32;
  if (*threads_p <= 0)
    *threads_p = 8;
  if (*threads_p > max_warps_block)
    *threads_p = max_warps_block;

  int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
  /* This is an estimate of how many blocks the device can host simultaneously.
     Actual limit, which may be lower, can be queried with "occupancy control"
     driver interface (since CUDA 6.0).  */
  int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
  if (*teams_p <= 0 || *teams_p > max_blocks)
    *teams_p = max_blocks;
}

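/* Worked example with hypothetical numbers: for a kernel compiled with
   max_threads_per_block = 1024 and regs_per_thread = 32, on a device with
   regs_per_sm = 65536 and num_sms = 80: max_warps_block = 1024 / 32 = 32
   (right at the cap); a defaulted *threads_p = 8 stays below that;
   regs_per_block = 32 * 32 * 8 = 8192; and so an unset *teams_p becomes
   max_blocks = 65536 / 8192 * 80 = 640.  */
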
/* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
   target regions.  */

static size_t
nvptx_stacks_size ()
{
  return 128 * 1024;
}

/* Return contiguous storage for NUM stacks, each SIZE bytes.  The lock for
   the storage should be held on entry, and remains held on exit.  */

static void *
nvptx_stacks_acquire (struct ptx_device *ptx_dev, size_t size, int num)
{
  if (ptx_dev->omp_stacks.ptr && ptx_dev->omp_stacks.size >= size * num)
    return (void *) ptx_dev->omp_stacks.ptr;

  /* Free the old, too-small stacks.  */
  if (ptx_dev->omp_stacks.ptr)
    {
      CUresult r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s\n", cuda_error (r));
      r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
    }

  /* Make new and bigger stacks, and remember where we put them and how big
     they are.  */
  CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &ptx_dev->omp_stacks.ptr,
				  size * num);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));

  ptx_dev->omp_stacks.size = size * num;

  return (void *) ptx_dev->omp_stacks.ptr;
}

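/* Note that the cached stacks block is shared by all OpenMP kernel launches
   on this device; this is why GOMP_OFFLOAD_run below keeps 'omp_stacks.lock'
   held from acquisition until after the kernel has completed.  */
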
void
GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
{
  struct targ_fn_descriptor *tgt_fn_desc
    = (struct targ_fn_descriptor *) tgt_fn;
  CUfunction function = tgt_fn_desc->fn;
  const struct targ_fn_launch *launch = tgt_fn_desc->launch;
  const char *fn_name = launch->fn;
  CUresult r;
  struct ptx_device *ptx_dev = ptx_devices[ord];
  const char *maybe_abort_msg = "(perhaps abort was called)";
  int teams = 0, threads = 0;

  if (!args)
    GOMP_PLUGIN_fatal ("No target arguments provided");
  while (*args)
    {
      intptr_t id = (intptr_t) *args++, val;
      if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
	val = (intptr_t) *args++;
      else
	val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
      if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
	continue;
      val = val > INT_MAX ? INT_MAX : val;
      id &= GOMP_TARGET_ARG_ID_MASK;
      if (id == GOMP_TARGET_ARG_NUM_TEAMS)
	teams = val;
      else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
	threads = val;
    }
  nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);

  bool reverse_offload = ptx_dev->rev_data != NULL;
  struct goacc_asyncqueue *reverse_offload_aq = NULL;
  if (reverse_offload)
    {
      reverse_offload_aq
	= nvptx_goacc_asyncqueue_construct (CU_STREAM_NON_BLOCKING);
      if (!reverse_offload_aq)
	exit (EXIT_FAILURE);
    }

  size_t stack_size = nvptx_stacks_size ();

  pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
  void *stacks = nvptx_stacks_acquire (ptx_dev, stack_size, teams * threads);
  void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
  size_t fn_args_size = sizeof fn_args;
  void *config[] = {
    CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
    CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
    CU_LAUNCH_PARAM_END
  };
  GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
		     " [(teams: %u), 1, 1] [(lanes: 32), (threads: %u), 1]\n",
		     __FUNCTION__, fn_name, teams, threads);
  r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
			 32, threads, 1, lowlat_pool_size, NULL, NULL, config);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
  if (reverse_offload)
    while (true)
      {
	r = CUDA_CALL_NOCHECK (cuStreamQuery, NULL);
	if (r == CUDA_SUCCESS)
	  break;
	if (r == CUDA_ERROR_LAUNCH_FAILED)
	  GOMP_PLUGIN_fatal ("cuStreamQuery error: %s %s\n", cuda_error (r),
			     maybe_abort_msg);
	else if (r != CUDA_ERROR_NOT_READY)
	  GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));

	if (__atomic_load_n (&ptx_dev->rev_data->fn, __ATOMIC_ACQUIRE) != 0)
	  {
	    struct rev_offload *rev_data = ptx_dev->rev_data;
	    GOMP_PLUGIN_target_rev (rev_data->fn, rev_data->mapnum,
				    rev_data->addrs, rev_data->sizes,
				    rev_data->kinds, rev_data->dev_num,
				    reverse_offload_aq);
	    if (!nvptx_goacc_asyncqueue_synchronize (reverse_offload_aq))
	      exit (EXIT_FAILURE);
	    __atomic_store_n (&rev_data->fn, 0, __ATOMIC_RELEASE);

	    /* Clean up here; otherwise we may run into the situation that
	       a following reverse offload does
	       'GOMP_OFFLOAD_page_locked_host_alloc', and that then runs the
	       deferred 'cuMemFreeHost's -- which may deadlock?!
	       TODO: This may need more consideration for the case that
	       different host threads do reverse offload?  We could move
	       'free_host_blocks' into 'aq' (which is separate per reverse
	       offload) instead of global, like
	       'page_locked_host_unregister_blocks', but that doesn't seem the
	       right thing for OpenACC 'async' generally?  */
	    if (!nvptx_run_deferred_page_locked_host_free ())
	      exit (EXIT_FAILURE);
	  }
	usleep (1);
      }
  else
    r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
		       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));

  pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);

  if (reverse_offload)
    if (!nvptx_goacc_asyncqueue_destruct (reverse_offload_aq))
      exit (EXIT_FAILURE);
}

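/* Hedged summary of the reverse-offload handshake polled above (the device
   side lives outside this file): the device fills in 'rev_data' and lastly
   publishes 'rev_data->fn' with release semantics; the host observes it with
   acquire semantics, runs the host function via GOMP_PLUGIN_target_rev, and
   finally stores 0 back to 'fn' to signal completion, which the device-side
   wait loop is spinning on.  */
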
/* TODO: Implement GOMP_OFFLOAD_async_run.  */

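/* For example, CHECK_ISA (7, 0) below expands to a test that returns 1 when
   the requested ISA string is "sm_70" and the device's compute capability is
   at least 7.0.  */
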
#define CHECK_ISA(major, minor) \
  if (((device->compute_major == major && device->compute_minor >= minor) \
       || device->compute_major > major) \
      && strcmp (isa, "sm_"#major#minor) == 0) \
    return 1

int
GOMP_OFFLOAD_evaluate_device (int device_num, const char *kind,
			      const char *arch, const char *isa)
{
  if (kind && strcmp (kind, "gpu") != 0)
    return 0;
  if (arch && strcmp (arch, "nvptx") != 0)
    return 0;
  if (!isa)
    return 1;

  struct ptx_device *device = ptx_devices[device_num];