1 /* Plugin for NVPTX execution.
3 Copyright (C) 2013-2020 Free Software Foundation, Inc.
5 Contributed by Mentor Embedded.
7 This file is part of the GNU Offloading and Multi Processing Library
Libgomp is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
more details.
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
29 /* Nvidia PTX-specific parts of OpenACC support. The cuda driver
30 library appears to hold some implicit state, but the documentation
31 is not clear as to what that state might be. Or how one might
32 propagate it from one thread to another. */
37 #include "libgomp-plugin.h"
38 #include "oacc-plugin.h"
39 #include "gomp-constants.h"
52 #if CUDA_VERSION < 6000
53 extern CUresult
cuGetErrorString (CUresult
, const char **);
54 #define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
57 #if CUDA_VERSION >= 6050
60 CUresult
cuLinkAddData (CUlinkState
, CUjitInputType
, void *, size_t,
61 const char *, unsigned, CUjit_option
*, void **);
62 CUresult
cuLinkCreate (unsigned, CUjit_option
*, void **, CUlinkState
*);
64 typedef size_t (*CUoccupancyB2DSize
)(int);
65 CUresult
cuLinkAddData_v2 (CUlinkState
, CUjitInputType
, void *, size_t,
66 const char *, unsigned, CUjit_option
*, void **);
67 CUresult
cuLinkCreate_v2 (unsigned, CUjit_option
*, void **, CUlinkState
*);
68 CUresult
cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction
,
69 CUoccupancyB2DSize
, size_t, int);
72 #define DO_PRAGMA(x) _Pragma (#x)
74 #if PLUGIN_NVPTX_DYNAMIC
79 # define CUDA_ONE_CALL(call) \
80 __typeof (call) *call;
81 # define CUDA_ONE_CALL_MAYBE_NULL(call) \
83 #include "cuda-lib.def"
85 # undef CUDA_ONE_CALL_MAYBE_NULL
89 /* -1 if init_cuda_lib has not been called yet, false
90 if it has been and failed, true if it has been and succeeded. */
91 static signed char cuda_lib_inited
= -1;
93 /* Dynamically load the CUDA runtime library and initialize function
94 pointers, return false if unsuccessful, true if successful. */
98 if (cuda_lib_inited
!= -1)
99 return cuda_lib_inited
;
100 const char *cuda_runtime_lib
= "libcuda.so.1";
101 void *h
= dlopen (cuda_runtime_lib
, RTLD_LAZY
);
102 cuda_lib_inited
= false;
106 # define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false)
107 # define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true)
108 # define CUDA_ONE_CALL_1(call, allow_null) \
109 cuda_lib.call = dlsym (h, #call); \
110 if (!allow_null && cuda_lib.call == NULL) \
112 #include "cuda-lib.def"
113 # undef CUDA_ONE_CALL
114 # undef CUDA_ONE_CALL_1
115 # undef CUDA_ONE_CALL_MAYBE_NULL
117 cuda_lib_inited
= true;
120 # define CUDA_CALL_PREFIX cuda_lib.
123 # define CUDA_ONE_CALL(call)
124 # define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call)
125 #include "cuda-lib.def"
126 #undef CUDA_ONE_CALL_MAYBE_NULL
129 # define CUDA_CALL_PREFIX
130 # define init_cuda_lib() true
133 #include "secure_getenv.h"
137 #define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
138 #define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
140 /* Convenience macros for the frequently used CUDA library call and
141 error handling sequence as well as CUDA library calls that
142 do the error checking themselves or don't do it at all. */
144 #define CUDA_CALL_ERET(ERET, FN, ...) \
147 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
148 if (__r != CUDA_SUCCESS) \
150 GOMP_PLUGIN_error (#FN " error: %s", \
156 #define CUDA_CALL(FN, ...) \
157 CUDA_CALL_ERET (false, FN, __VA_ARGS__)
159 #define CUDA_CALL_ASSERT(FN, ...) \
162 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
163 if (__r != CUDA_SUCCESS) \
165 GOMP_PLUGIN_fatal (#FN " error: %s", \
170 #define CUDA_CALL_NOCHECK(FN, ...) \
171 CUDA_CALL_PREFIX FN (__VA_ARGS__)
173 #define CUDA_CALL_EXISTS(FN) \
177 cuda_error (CUresult r
)
179 const char *fallback
= "unknown cuda error";
182 if (!CUDA_CALL_EXISTS (cuGetErrorString
))
185 r
= CUDA_CALL_NOCHECK (cuGetErrorString
, r
, &desc
);
186 if (r
== CUDA_SUCCESS
)
192 /* Version of the CUDA Toolkit in the same MAJOR.MINOR format that is used by
193 Nvidia, such as in the 'deviceQuery' program (Nvidia's CUDA samples). */
194 static char cuda_driver_version_s
[30];
196 static unsigned int instantiated_devices
= 0;
197 static pthread_mutex_t ptx_dev_lock
= PTHREAD_MUTEX_INITIALIZER
;
199 /* NVPTX/CUDA specific definition of asynchronous queues. */
200 struct goacc_asyncqueue
202 CUstream cuda_stream
;
205 struct nvptx_callback
209 struct goacc_asyncqueue
*aq
;
210 struct nvptx_callback
*next
;
213 /* Thread-specific data for PTX. */
217 /* We currently have this embedded inside the plugin because libgomp manages
218 devices through integer target_ids. This might be better if using an
219 opaque target-specific pointer directly from gomp_device_descr. */
220 struct ptx_device
*ptx_dev
;
223 /* Target data function launch information. */
225 struct targ_fn_launch
228 unsigned short dim
[GOMP_DIM_MAX
];
231 /* Target PTX object information. */
239 /* Target data image information. */
241 typedef struct nvptx_tdata
243 const struct targ_ptx_obj
*ptx_objs
;
246 const char *const *var_names
;
249 const struct targ_fn_launch
*fn_descs
;
253 /* Descriptor of a loaded function. */
255 struct targ_fn_descriptor
258 const struct targ_fn_launch
*launch
;
260 int max_threads_per_block
;
263 /* A loaded PTX image. */
264 struct ptx_image_data
266 const void *target_data
;
269 struct targ_fn_descriptor
*fns
; /* Array of functions. */
271 struct ptx_image_data
*next
;
274 struct ptx_free_block
277 struct ptx_free_block
*next
;
297 int max_threads_per_block
;
298 int max_threads_per_multiprocessor
;
299 int default_dims
[GOMP_DIM_MAX
];
301 /* Length as used by the CUDA Runtime API ('struct cudaDeviceProp'). */
304 struct ptx_image_data
*images
; /* Images loaded on device. */
305 pthread_mutex_t image_lock
; /* Lock for above list. */
307 struct ptx_free_block
*free_blocks
;
308 pthread_mutex_t free_blocks_lock
;
310 struct ptx_device
*next
;
313 static struct ptx_device
**ptx_devices
;
/* Return the per-thread nvptx state kept by the OpenACC runtime for the
   current host thread (may be NULL outside an OpenACC context).  */

static inline struct nvptx_thread *
nvptx_thread (void)
{
  return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
}
321 /* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK
322 should be locked on entry and remains locked on exit. */
329 if (instantiated_devices
!= 0)
332 if (!init_cuda_lib ())
335 CUDA_CALL (cuInit
, 0);
337 int cuda_driver_version
;
338 CUDA_CALL_ERET (NULL
, cuDriverGetVersion
, &cuda_driver_version
);
339 snprintf (cuda_driver_version_s
, sizeof cuda_driver_version_s
,
341 cuda_driver_version
/ 1000, cuda_driver_version
% 1000 / 10);
343 CUDA_CALL (cuDeviceGetCount
, &ndevs
);
344 ptx_devices
= GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device
*)
350 /* Select the N'th PTX device for the current host thread. The device must
351 have been previously opened before calling this function. */
354 nvptx_attach_host_thread_to_device (int n
)
358 struct ptx_device
*ptx_dev
;
361 r
= CUDA_CALL_NOCHECK (cuCtxGetDevice
, &dev
);
362 if (r
== CUDA_ERROR_NOT_PERMITTED
)
364 /* Assume we're in a CUDA callback, just return true. */
367 if (r
!= CUDA_SUCCESS
&& r
!= CUDA_ERROR_INVALID_CONTEXT
)
369 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r
));
373 if (r
!= CUDA_ERROR_INVALID_CONTEXT
&& dev
== n
)
379 ptx_dev
= ptx_devices
[n
];
382 GOMP_PLUGIN_error ("device %d not found", n
);
386 CUDA_CALL (cuCtxGetCurrent
, &thd_ctx
);
388 /* We don't necessarily have a current context (e.g. if it has been
389 destroyed. Pop it if we do though. */
391 CUDA_CALL (cuCtxPopCurrent
, &old_ctx
);
393 CUDA_CALL (cuCtxPushCurrent
, ptx_dev
->ctx
);
398 static struct ptx_device
*
399 nvptx_open_device (int n
)
401 struct ptx_device
*ptx_dev
;
402 CUdevice dev
, ctx_dev
;
404 int async_engines
, pi
;
406 CUDA_CALL_ERET (NULL
, cuDeviceGet
, &dev
, n
);
408 ptx_dev
= GOMP_PLUGIN_malloc (sizeof (struct ptx_device
));
412 ptx_dev
->ctx_shared
= false;
414 r
= CUDA_CALL_NOCHECK (cuCtxGetDevice
, &ctx_dev
);
415 if (r
!= CUDA_SUCCESS
&& r
!= CUDA_ERROR_INVALID_CONTEXT
)
417 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r
));
421 if (r
!= CUDA_ERROR_INVALID_CONTEXT
&& ctx_dev
!= dev
)
423 /* The current host thread has an active context for a different device.
426 CUDA_CALL_ERET (NULL
, cuCtxPopCurrent
, &old_ctx
);
429 CUDA_CALL_ERET (NULL
, cuCtxGetCurrent
, &ptx_dev
->ctx
);
432 CUDA_CALL_ERET (NULL
, cuCtxCreate
, &ptx_dev
->ctx
, CU_CTX_SCHED_AUTO
, dev
);
434 ptx_dev
->ctx_shared
= true;
436 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
437 &pi
, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP
, dev
);
438 ptx_dev
->overlap
= pi
;
440 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
441 &pi
, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY
, dev
);
444 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
445 &pi
, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS
, dev
);
446 ptx_dev
->concur
= pi
;
448 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
449 &pi
, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE
, dev
);
452 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
453 &pi
, CU_DEVICE_ATTRIBUTE_INTEGRATED
, dev
);
456 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
457 &pi
, CU_DEVICE_ATTRIBUTE_CLOCK_RATE
, dev
);
458 ptx_dev
->clock_khz
= pi
;
460 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
461 &pi
, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT
, dev
);
462 ptx_dev
->num_sms
= pi
;
464 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
465 &pi
, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK
, dev
);
466 ptx_dev
->regs_per_block
= pi
;
468 /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only
469 in CUDA 6.0 and newer. */
470 r
= CUDA_CALL_NOCHECK (cuDeviceGetAttribute
, &pi
,
471 CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR
,
473 /* Fallback: use limit of registers per block, which is usually equal. */
474 if (r
== CUDA_ERROR_INVALID_VALUE
)
475 pi
= ptx_dev
->regs_per_block
;
476 else if (r
!= CUDA_SUCCESS
)
478 GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r
));
481 ptx_dev
->regs_per_sm
= pi
;
483 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
,
484 &pi
, CU_DEVICE_ATTRIBUTE_WARP_SIZE
, dev
);
487 GOMP_PLUGIN_error ("Only warp size 32 is supported");
490 ptx_dev
->warp_size
= pi
;
492 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
, &pi
,
493 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK
, dev
);
494 ptx_dev
->max_threads_per_block
= pi
;
496 CUDA_CALL_ERET (NULL
, cuDeviceGetAttribute
, &pi
,
497 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR
, dev
);
498 ptx_dev
->max_threads_per_multiprocessor
= pi
;
500 r
= CUDA_CALL_NOCHECK (cuDeviceGetAttribute
, &async_engines
,
501 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT
, dev
);
502 if (r
!= CUDA_SUCCESS
)
505 for (int i
= 0; i
!= GOMP_DIM_MAX
; i
++)
506 ptx_dev
->default_dims
[i
] = 0;
508 CUDA_CALL_ERET (NULL
, cuDeviceGetName
, ptx_dev
->name
, sizeof ptx_dev
->name
,
511 ptx_dev
->images
= NULL
;
512 pthread_mutex_init (&ptx_dev
->image_lock
, NULL
);
514 ptx_dev
->free_blocks
= NULL
;
515 pthread_mutex_init (&ptx_dev
->free_blocks_lock
, NULL
);
521 nvptx_close_device (struct ptx_device
*ptx_dev
)
526 for (struct ptx_free_block
*b
= ptx_dev
->free_blocks
; b
;)
528 struct ptx_free_block
*b_next
= b
->next
;
529 CUDA_CALL (cuMemFree
, (CUdeviceptr
) b
->ptr
);
534 pthread_mutex_destroy (&ptx_dev
->free_blocks_lock
);
535 pthread_mutex_destroy (&ptx_dev
->image_lock
);
537 if (!ptx_dev
->ctx_shared
)
538 CUDA_CALL (cuCtxDestroy
, ptx_dev
->ctx
);
545 nvptx_get_num_devices (void)
549 /* PR libgomp/65099: Currently, we only support offloading in 64-bit
551 if (sizeof (void *) != 8)
553 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading;"
554 " only 64-bit configurations are supported\n");
558 /* This function will be called before the plugin has been initialized in
559 order to enumerate available devices, but CUDA API routines can't be used
560 until cuInit has been called. Just call it now (but don't yet do any
561 further initialization). */
562 if (instantiated_devices
== 0)
564 if (!init_cuda_lib ())
566 CUresult r
= CUDA_CALL_NOCHECK (cuInit
, 0);
567 /* This is not an error: e.g. we may have CUDA libraries installed but
568 no devices available. */
569 if (r
!= CUDA_SUCCESS
)
571 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
577 CUDA_CALL_ERET (-1, cuDeviceGetCount
, &n
);
582 notify_var (const char *var_name
, const char *env_var
)
585 GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name
);
587 GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name
, env_var
);
591 process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o
)
593 const char *var_name
= "GOMP_NVPTX_JIT";
594 const char *env_var
= secure_getenv (var_name
);
595 notify_var (var_name
, env_var
);
600 const char *c
= env_var
;
606 if (c
[0] == '-' && c
[1] == 'O'
607 && '0' <= c
[2] && c
[2] <= '4'
608 && (c
[3] == '\0' || c
[3] == ' '))
610 *gomp_nvptx_o
= c
[2] - '0';
615 GOMP_PLUGIN_error ("Error parsing %s", var_name
);
621 link_ptx (CUmodule
*module
, const struct targ_ptx_obj
*ptx_objs
,
624 CUjit_option opts
[7];
629 CUlinkState linkstate
;
632 size_t linkoutsize
__attribute__ ((unused
));
634 opts
[0] = CU_JIT_WALL_TIME
;
635 optvals
[0] = &elapsed
;
637 opts
[1] = CU_JIT_INFO_LOG_BUFFER
;
638 optvals
[1] = &ilog
[0];
640 opts
[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES
;
641 optvals
[2] = (void *) sizeof ilog
;
643 opts
[3] = CU_JIT_ERROR_LOG_BUFFER
;
644 optvals
[3] = &elog
[0];
646 opts
[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES
;
647 optvals
[4] = (void *) sizeof elog
;
649 opts
[5] = CU_JIT_LOG_VERBOSE
;
650 optvals
[5] = (void *) 1;
652 static intptr_t gomp_nvptx_o
= -1;
654 static bool init_done
= false;
657 process_GOMP_NVPTX_JIT (&gomp_nvptx_o
);
662 if (gomp_nvptx_o
!= -1)
664 opts
[nopts
] = CU_JIT_OPTIMIZATION_LEVEL
;
665 optvals
[nopts
] = (void *) gomp_nvptx_o
;
669 if (CUDA_CALL_EXISTS (cuLinkCreate_v2
))
670 CUDA_CALL (cuLinkCreate_v2
, nopts
, opts
, optvals
, &linkstate
);
672 CUDA_CALL (cuLinkCreate
, nopts
, opts
, optvals
, &linkstate
);
674 for (; num_objs
--; ptx_objs
++)
676 /* cuLinkAddData's 'data' argument erroneously omits the const
678 GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs
->code
);
679 if (CUDA_CALL_EXISTS (cuLinkAddData_v2
))
680 r
= CUDA_CALL_NOCHECK (cuLinkAddData_v2
, linkstate
, CU_JIT_INPUT_PTX
,
681 (char *) ptx_objs
->code
, ptx_objs
->size
,
684 r
= CUDA_CALL_NOCHECK (cuLinkAddData
, linkstate
, CU_JIT_INPUT_PTX
,
685 (char *) ptx_objs
->code
, ptx_objs
->size
,
687 if (r
!= CUDA_SUCCESS
)
689 GOMP_PLUGIN_error ("Link error log %s\n", &elog
[0]);
690 GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
696 GOMP_PLUGIN_debug (0, "Linking\n");
697 r
= CUDA_CALL_NOCHECK (cuLinkComplete
, linkstate
, &linkout
, &linkoutsize
);
699 GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed
);
700 GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog
[0]);
702 if (r
!= CUDA_SUCCESS
)
704 GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r
));
708 CUDA_CALL (cuModuleLoadData
, module
, linkout
);
709 CUDA_CALL (cuLinkDestroy
, linkstate
);
714 nvptx_exec (void (*fn
), size_t mapnum
, void **hostaddrs
, void **devaddrs
,
715 unsigned *dims
, void *targ_mem_desc
,
716 CUdeviceptr dp
, CUstream stream
)
718 struct targ_fn_descriptor
*targ_fn
= (struct targ_fn_descriptor
*) fn
;
722 struct nvptx_thread
*nvthd
= nvptx_thread ();
723 int warp_size
= nvthd
->ptx_dev
->warp_size
;
725 function
= targ_fn
->fn
;
727 /* Initialize the launch dimensions. Typically this is constant,
728 provided by the device compiler, but we must permit runtime
731 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
733 if (targ_fn
->launch
->dim
[i
])
734 dims
[i
] = targ_fn
->launch
->dim
[i
];
741 pthread_mutex_lock (&ptx_dev_lock
);
743 static int gomp_openacc_dims
[GOMP_DIM_MAX
];
744 if (!gomp_openacc_dims
[0])
746 /* See if the user provided GOMP_OPENACC_DIM environment
747 variable to specify runtime defaults. */
748 for (int i
= 0; i
< GOMP_DIM_MAX
; ++i
)
749 gomp_openacc_dims
[i
] = GOMP_PLUGIN_acc_default_dim (i
);
752 if (!nvthd
->ptx_dev
->default_dims
[0])
754 int default_dims
[GOMP_DIM_MAX
];
755 for (int i
= 0; i
< GOMP_DIM_MAX
; ++i
)
756 default_dims
[i
] = gomp_openacc_dims
[i
];
758 int gang
, worker
, vector
;
760 int block_size
= nvthd
->ptx_dev
->max_threads_per_block
;
761 int cpu_size
= nvthd
->ptx_dev
->max_threads_per_multiprocessor
;
762 int dev_size
= nvthd
->ptx_dev
->num_sms
;
763 GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
764 " dev_size=%d, cpu_size=%d\n",
765 warp_size
, block_size
, dev_size
, cpu_size
);
767 gang
= (cpu_size
/ block_size
) * dev_size
;
768 worker
= block_size
/ warp_size
;
772 /* There is no upper bound on the gang size. The best size
773 matches the hardware configuration. Logical gangs are
774 scheduled onto physical hardware. To maximize usage, we
775 should guess a large number. */
776 if (default_dims
[GOMP_DIM_GANG
] < 1)
777 default_dims
[GOMP_DIM_GANG
] = gang
? gang
: 1024;
778 /* The worker size must not exceed the hardware. */
779 if (default_dims
[GOMP_DIM_WORKER
] < 1
780 || (default_dims
[GOMP_DIM_WORKER
] > worker
&& gang
))
781 default_dims
[GOMP_DIM_WORKER
] = worker
;
782 /* The vector size must exactly match the hardware. */
783 if (default_dims
[GOMP_DIM_VECTOR
] < 1
784 || (default_dims
[GOMP_DIM_VECTOR
] != vector
&& gang
))
785 default_dims
[GOMP_DIM_VECTOR
] = vector
;
787 GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
788 default_dims
[GOMP_DIM_GANG
],
789 default_dims
[GOMP_DIM_WORKER
],
790 default_dims
[GOMP_DIM_VECTOR
]);
792 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
793 nvthd
->ptx_dev
->default_dims
[i
] = default_dims
[i
];
795 pthread_mutex_unlock (&ptx_dev_lock
);
798 bool default_dim_p
[GOMP_DIM_MAX
];
799 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
800 default_dim_p
[i
] = !dims
[i
];
802 if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize
))
804 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
805 if (default_dim_p
[i
])
806 dims
[i
] = nvthd
->ptx_dev
->default_dims
[i
];
808 if (default_dim_p
[GOMP_DIM_VECTOR
])
809 dims
[GOMP_DIM_VECTOR
]
810 = MIN (dims
[GOMP_DIM_VECTOR
],
811 (targ_fn
->max_threads_per_block
/ warp_size
814 if (default_dim_p
[GOMP_DIM_WORKER
])
815 dims
[GOMP_DIM_WORKER
]
816 = MIN (dims
[GOMP_DIM_WORKER
],
817 targ_fn
->max_threads_per_block
/ dims
[GOMP_DIM_VECTOR
]);
821 /* Handle the case that the compiler allows the runtime to choose
822 the vector-length conservatively, by ignoring
823 gomp_openacc_dims[GOMP_DIM_VECTOR]. TODO: actually handle
826 /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that that
827 gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
828 exceed targ_fn->max_threads_per_block. */
829 int workers
= gomp_openacc_dims
[GOMP_DIM_WORKER
];
830 int gangs
= gomp_openacc_dims
[GOMP_DIM_GANG
];
833 CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize
, &grids
,
834 &blocks
, function
, NULL
, 0,
835 dims
[GOMP_DIM_WORKER
] * dims
[GOMP_DIM_VECTOR
]);
836 GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
837 "grid = %d, block = %d\n", grids
, blocks
);
839 /* Keep the num_gangs proportional to the block size. In
840 the case were a block size is limited by shared-memory
841 or the register file capacity, the runtime will not
842 excessively over assign gangs to the multiprocessor
843 units if their state is going to be swapped out even
844 more than necessary. The constant factor 2 is there to
845 prevent threads from idling when there is insufficient
848 gangs
= 2 * grids
* (blocks
/ warp_size
);
855 int actual_vectors
= (default_dim_p
[GOMP_DIM_VECTOR
]
857 : dims
[GOMP_DIM_VECTOR
]);
858 workers
= blocks
/ actual_vectors
;
859 workers
= MAX (workers
, 1);
860 /* If we need a per-worker barrier ... . */
861 if (actual_vectors
> 32)
862 /* Don't use more barriers than available. */
863 workers
= MIN (workers
, 15);
866 for (i
= 0; i
!= GOMP_DIM_MAX
; i
++)
867 if (default_dim_p
[i
])
870 case GOMP_DIM_GANG
: dims
[i
] = gangs
; break;
871 case GOMP_DIM_WORKER
: dims
[i
] = workers
; break;
872 case GOMP_DIM_VECTOR
: dims
[i
] = vectors
; break;
873 default: GOMP_PLUGIN_fatal ("invalid dim");
879 /* Check if the accelerator has sufficient hardware resources to
880 launch the offloaded kernel. */
881 if (dims
[GOMP_DIM_WORKER
] * dims
[GOMP_DIM_VECTOR
]
882 > targ_fn
->max_threads_per_block
)
885 = ("The Nvidia accelerator has insufficient resources to launch '%s'"
886 " with num_workers = %d and vector_length = %d"
888 "recompile the program with 'num_workers = x and vector_length = y'"
889 " on that offloaded region or '-fopenacc-dim=:x:y' where"
892 GOMP_PLUGIN_fatal (msg
, targ_fn
->launch
->fn
, dims
[GOMP_DIM_WORKER
],
893 dims
[GOMP_DIM_VECTOR
], targ_fn
->max_threads_per_block
);
896 /* Check if the accelerator has sufficient barrier resources to
897 launch the offloaded kernel. */
898 if (dims
[GOMP_DIM_WORKER
] > 15 && dims
[GOMP_DIM_VECTOR
] > 32)
901 = ("The Nvidia accelerator has insufficient barrier resources to launch"
902 " '%s' with num_workers = %d and vector_length = %d"
904 "recompile the program with 'num_workers = x' on that offloaded"
905 " region or '-fopenacc-dim=:x:' where x <= 15"
907 "or, recompile the program with 'vector_length = 32' on that"
908 " offloaded region or '-fopenacc-dim=::32'"
910 GOMP_PLUGIN_fatal (msg
, targ_fn
->launch
->fn
, dims
[GOMP_DIM_WORKER
],
911 dims
[GOMP_DIM_VECTOR
]);
914 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
915 " gangs=%u, workers=%u, vectors=%u\n",
916 __FUNCTION__
, targ_fn
->launch
->fn
, dims
[GOMP_DIM_GANG
],
917 dims
[GOMP_DIM_WORKER
], dims
[GOMP_DIM_VECTOR
]);
921 // num_gangs nctaid.x
922 // num_workers ntid.y
923 // vector length ntid.x
925 struct goacc_thread
*thr
= GOMP_PLUGIN_goacc_thread ();
926 acc_prof_info
*prof_info
= thr
->prof_info
;
927 acc_event_info enqueue_launch_event_info
;
928 acc_api_info
*api_info
= thr
->api_info
;
929 bool profiling_p
= __builtin_expect (prof_info
!= NULL
, false);
932 prof_info
->event_type
= acc_ev_enqueue_launch_start
;
934 enqueue_launch_event_info
.launch_event
.event_type
935 = prof_info
->event_type
;
936 enqueue_launch_event_info
.launch_event
.valid_bytes
937 = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES
;
938 enqueue_launch_event_info
.launch_event
.parent_construct
939 = acc_construct_parallel
;
940 enqueue_launch_event_info
.launch_event
.implicit
= 1;
941 enqueue_launch_event_info
.launch_event
.tool_info
= NULL
;
942 enqueue_launch_event_info
.launch_event
.kernel_name
= targ_fn
->launch
->fn
;
943 enqueue_launch_event_info
.launch_event
.num_gangs
944 = dims
[GOMP_DIM_GANG
];
945 enqueue_launch_event_info
.launch_event
.num_workers
946 = dims
[GOMP_DIM_WORKER
];
947 enqueue_launch_event_info
.launch_event
.vector_length
948 = dims
[GOMP_DIM_VECTOR
];
950 api_info
->device_api
= acc_device_api_cuda
;
952 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
, &enqueue_launch_event_info
,
957 CUDA_CALL_ASSERT (cuLaunchKernel
, function
,
958 dims
[GOMP_DIM_GANG
], 1, 1,
959 dims
[GOMP_DIM_VECTOR
], dims
[GOMP_DIM_WORKER
], 1,
960 0, stream
, kargs
, 0);
964 prof_info
->event_type
= acc_ev_enqueue_launch_end
;
965 enqueue_launch_event_info
.launch_event
.event_type
966 = prof_info
->event_type
;
967 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
, &enqueue_launch_event_info
,
971 GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__
,
972 targ_fn
->launch
->fn
);
975 void * openacc_get_current_cuda_context (void);
978 goacc_profiling_acc_ev_alloc (struct goacc_thread
*thr
, void *dp
, size_t s
)
980 acc_prof_info
*prof_info
= thr
->prof_info
;
981 acc_event_info data_event_info
;
982 acc_api_info
*api_info
= thr
->api_info
;
984 prof_info
->event_type
= acc_ev_alloc
;
986 data_event_info
.data_event
.event_type
= prof_info
->event_type
;
987 data_event_info
.data_event
.valid_bytes
= _ACC_DATA_EVENT_INFO_VALID_BYTES
;
988 data_event_info
.data_event
.parent_construct
= acc_construct_parallel
;
989 data_event_info
.data_event
.implicit
= 1;
990 data_event_info
.data_event
.tool_info
= NULL
;
991 data_event_info
.data_event
.var_name
= NULL
;
992 data_event_info
.data_event
.bytes
= s
;
993 data_event_info
.data_event
.host_ptr
= NULL
;
994 data_event_info
.data_event
.device_ptr
= dp
;
996 api_info
->device_api
= acc_device_api_cuda
;
998 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
, &data_event_info
, api_info
);
1002 nvptx_alloc (size_t s
)
1006 CUDA_CALL_ERET (NULL
, cuMemAlloc
, &d
, s
);
1007 struct goacc_thread
*thr
= GOMP_PLUGIN_goacc_thread ();
1009 = __builtin_expect (thr
!= NULL
&& thr
->prof_info
!= NULL
, false);
1011 goacc_profiling_acc_ev_alloc (thr
, (void *) d
, s
);
1017 goacc_profiling_acc_ev_free (struct goacc_thread
*thr
, void *p
)
1019 acc_prof_info
*prof_info
= thr
->prof_info
;
1020 acc_event_info data_event_info
;
1021 acc_api_info
*api_info
= thr
->api_info
;
1023 prof_info
->event_type
= acc_ev_free
;
1025 data_event_info
.data_event
.event_type
= prof_info
->event_type
;
1026 data_event_info
.data_event
.valid_bytes
= _ACC_DATA_EVENT_INFO_VALID_BYTES
;
1027 data_event_info
.data_event
.parent_construct
= acc_construct_parallel
;
1028 data_event_info
.data_event
.implicit
= 1;
1029 data_event_info
.data_event
.tool_info
= NULL
;
1030 data_event_info
.data_event
.var_name
= NULL
;
1031 data_event_info
.data_event
.bytes
= -1;
1032 data_event_info
.data_event
.host_ptr
= NULL
;
1033 data_event_info
.data_event
.device_ptr
= p
;
1035 api_info
->device_api
= acc_device_api_cuda
;
1037 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
, &data_event_info
, api_info
);
1041 nvptx_free (void *p
, struct ptx_device
*ptx_dev
)
1043 /* Assume callback context if this is null. */
1044 if (GOMP_PLUGIN_acc_thread () == NULL
)
1046 struct ptx_free_block
*n
1047 = GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block
));
1049 pthread_mutex_lock (&ptx_dev
->free_blocks_lock
);
1050 n
->next
= ptx_dev
->free_blocks
;
1051 ptx_dev
->free_blocks
= n
;
1052 pthread_mutex_unlock (&ptx_dev
->free_blocks_lock
);
1059 CUDA_CALL (cuMemGetAddressRange
, &pb
, &ps
, (CUdeviceptr
) p
);
1060 if ((CUdeviceptr
) p
!= pb
)
1062 GOMP_PLUGIN_error ("invalid device address");
1066 CUDA_CALL (cuMemFree
, (CUdeviceptr
) p
);
1067 struct goacc_thread
*thr
= GOMP_PLUGIN_goacc_thread ();
1069 = __builtin_expect (thr
!= NULL
&& thr
->prof_info
!= NULL
, false);
1071 goacc_profiling_acc_ev_free (thr
, p
);
1077 nvptx_get_current_cuda_device (void)
1079 struct nvptx_thread
*nvthd
= nvptx_thread ();
1081 if (!nvthd
|| !nvthd
->ptx_dev
)
1084 return &nvthd
->ptx_dev
->dev
;
1088 nvptx_get_current_cuda_context (void)
1090 struct nvptx_thread
*nvthd
= nvptx_thread ();
1092 if (!nvthd
|| !nvthd
->ptx_dev
)
1095 return nvthd
->ptx_dev
->ctx
;
1098 /* Plugin entry points. */
/* Plugin entry point: the target name this plugin offloads to.  */
const char *
GOMP_OFFLOAD_get_name (void)
{
  return "nvptx";
}
1107 GOMP_OFFLOAD_get_caps (void)
1109 return GOMP_OFFLOAD_CAP_OPENACC_200
| GOMP_OFFLOAD_CAP_OPENMP_400
;
1113 GOMP_OFFLOAD_get_type (void)
1115 return OFFLOAD_TARGET_TYPE_NVIDIA_PTX
;
/* Plugin entry point: number of available devices.  */
int
GOMP_OFFLOAD_get_num_devices (void)
{
  return nvptx_get_num_devices ();
}
1124 union gomp_device_property_value
1125 GOMP_OFFLOAD_get_property (int n
, int prop
)
1127 union gomp_device_property_value propval
= { .val
= 0 };
1129 pthread_mutex_lock (&ptx_dev_lock
);
1131 if (n
>= nvptx_get_num_devices () || n
< 0 || ptx_devices
[n
] == NULL
)
1133 pthread_mutex_unlock (&ptx_dev_lock
);
1137 struct ptx_device
*ptx_dev
= ptx_devices
[n
];
1140 case GOMP_DEVICE_PROPERTY_MEMORY
:
1144 CUDA_CALL_ERET (propval
, cuDeviceTotalMem
, &total_mem
, ptx_dev
->dev
);
1145 propval
.val
= total_mem
;
1148 case GOMP_DEVICE_PROPERTY_FREE_MEMORY
:
1154 CUDA_CALL_ERET (propval
, cuCtxGetDevice
, &ctxdev
);
1155 if (ptx_dev
->dev
== ctxdev
)
1156 CUDA_CALL_ERET (propval
, cuMemGetInfo
, &free_mem
, &total_mem
);
1157 else if (ptx_dev
->ctx
)
1161 CUDA_CALL_ERET (propval
, cuCtxPushCurrent
, ptx_dev
->ctx
);
1162 CUDA_CALL_ERET (propval
, cuMemGetInfo
, &free_mem
, &total_mem
);
1163 CUDA_CALL_ASSERT (cuCtxPopCurrent
, &old_ctx
);
1169 CUDA_CALL_ERET (propval
, cuCtxCreate
, &new_ctx
, CU_CTX_SCHED_AUTO
,
1171 CUDA_CALL_ERET (propval
, cuMemGetInfo
, &free_mem
, &total_mem
);
1172 CUDA_CALL_ASSERT (cuCtxDestroy
, new_ctx
);
1174 propval
.val
= free_mem
;
1177 case GOMP_DEVICE_PROPERTY_NAME
:
1178 propval
.ptr
= ptx_dev
->name
;
1180 case GOMP_DEVICE_PROPERTY_VENDOR
:
1181 propval
.ptr
= "Nvidia";
1183 case GOMP_DEVICE_PROPERTY_DRIVER
:
1184 propval
.ptr
= cuda_driver_version_s
;
1188 pthread_mutex_unlock (&ptx_dev_lock
);
1193 GOMP_OFFLOAD_init_device (int n
)
1195 struct ptx_device
*dev
;
1197 pthread_mutex_lock (&ptx_dev_lock
);
1199 if (!nvptx_init () || ptx_devices
[n
] != NULL
)
1201 pthread_mutex_unlock (&ptx_dev_lock
);
1205 dev
= nvptx_open_device (n
);
1208 ptx_devices
[n
] = dev
;
1209 instantiated_devices
++;
1212 pthread_mutex_unlock (&ptx_dev_lock
);
1218 GOMP_OFFLOAD_fini_device (int n
)
1220 pthread_mutex_lock (&ptx_dev_lock
);
1222 if (ptx_devices
[n
] != NULL
)
1224 if (!nvptx_attach_host_thread_to_device (n
)
1225 || !nvptx_close_device (ptx_devices
[n
]))
1227 pthread_mutex_unlock (&ptx_dev_lock
);
1230 ptx_devices
[n
] = NULL
;
1231 instantiated_devices
--;
1234 if (instantiated_devices
== 0)
1240 pthread_mutex_unlock (&ptx_dev_lock
);
1244 /* Return the libgomp version number we're compatible with. There is
1245 no requirement for cross-version compatibility. */
1248 GOMP_OFFLOAD_version (void)
1250 return GOMP_VERSION
;
1253 /* Initialize __nvptx_clocktick, if present in MODULE. */
1256 nvptx_set_clocktick (CUmodule module
, struct ptx_device
*dev
)
1259 CUresult r
= CUDA_CALL_NOCHECK (cuModuleGetGlobal
, &dptr
, NULL
,
1260 module
, "__nvptx_clocktick");
1261 if (r
== CUDA_ERROR_NOT_FOUND
)
1263 if (r
!= CUDA_SUCCESS
)
1264 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r
));
1265 double __nvptx_clocktick
= 1e-3 / dev
->clock_khz
;
1266 r
= CUDA_CALL_NOCHECK (cuMemcpyHtoD
, dptr
, &__nvptx_clocktick
,
1267 sizeof (__nvptx_clocktick
));
1268 if (r
!= CUDA_SUCCESS
)
1269 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r
));
1272 /* Load the (partial) program described by TARGET_DATA to device
1273 number ORD. Allocate and return TARGET_TABLE. */
1276 GOMP_OFFLOAD_load_image (int ord
, unsigned version
, const void *target_data
,
1277 struct addr_pair
**target_table
)
1280 const char *const *var_names
;
1281 const struct targ_fn_launch
*fn_descs
;
1282 unsigned int fn_entries
, var_entries
, i
, j
;
1283 struct targ_fn_descriptor
*targ_fns
;
1284 struct addr_pair
*targ_tbl
;
1285 const nvptx_tdata_t
*img_header
= (const nvptx_tdata_t
*) target_data
;
1286 struct ptx_image_data
*new_image
;
1287 struct ptx_device
*dev
;
1289 if (GOMP_VERSION_DEV (version
) > GOMP_VERSION_NVIDIA_PTX
)
1291 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1292 " (expected %u, received %u)",
1293 GOMP_VERSION_NVIDIA_PTX
, GOMP_VERSION_DEV (version
));
1297 if (!nvptx_attach_host_thread_to_device (ord
)
1298 || !link_ptx (&module
, img_header
->ptx_objs
, img_header
->ptx_num
))
1301 dev
= ptx_devices
[ord
];
1303 /* The mkoffload utility emits a struct of pointers/integers at the
1304 start of each offload image. The array of kernel names and the
1305 functions addresses form a one-to-one correspondence. */
1307 var_entries
= img_header
->var_num
;
1308 var_names
= img_header
->var_names
;
1309 fn_entries
= img_header
->fn_num
;
1310 fn_descs
= img_header
->fn_descs
;
1312 targ_tbl
= GOMP_PLUGIN_malloc (sizeof (struct addr_pair
)
1313 * (fn_entries
+ var_entries
));
1314 targ_fns
= GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor
)
1317 *target_table
= targ_tbl
;
1319 new_image
= GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data
));
1320 new_image
->target_data
= target_data
;
1321 new_image
->module
= module
;
1322 new_image
->fns
= targ_fns
;
1324 pthread_mutex_lock (&dev
->image_lock
);
1325 new_image
->next
= dev
->images
;
1326 dev
->images
= new_image
;
1327 pthread_mutex_unlock (&dev
->image_lock
);
1329 for (i
= 0; i
< fn_entries
; i
++, targ_fns
++, targ_tbl
++)
1331 CUfunction function
;
1334 CUDA_CALL_ERET (-1, cuModuleGetFunction
, &function
, module
,
1336 CUDA_CALL_ERET (-1, cuFuncGetAttribute
, &nregs
,
1337 CU_FUNC_ATTRIBUTE_NUM_REGS
, function
);
1338 CUDA_CALL_ERET (-1, cuFuncGetAttribute
, &mthrs
,
1339 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK
, function
);
1341 targ_fns
->fn
= function
;
1342 targ_fns
->launch
= &fn_descs
[i
];
1343 targ_fns
->regs_per_thread
= nregs
;
1344 targ_fns
->max_threads_per_block
= mthrs
;
1346 targ_tbl
->start
= (uintptr_t) targ_fns
;
1347 targ_tbl
->end
= targ_tbl
->start
+ 1;
1350 for (j
= 0; j
< var_entries
; j
++, targ_tbl
++)
1355 CUDA_CALL_ERET (-1, cuModuleGetGlobal
,
1356 &var
, &bytes
, module
, var_names
[j
]);
1358 targ_tbl
->start
= (uintptr_t) var
;
1359 targ_tbl
->end
= targ_tbl
->start
+ bytes
;
1362 nvptx_set_clocktick (module
, dev
);
1364 return fn_entries
+ var_entries
;
1367 /* Unload the program described by TARGET_DATA. DEV_DATA is the
1368 function descriptors allocated by G_O_load_image. */
1371 GOMP_OFFLOAD_unload_image (int ord
, unsigned version
, const void *target_data
)
1373 struct ptx_image_data
*image
, **prev_p
;
1374 struct ptx_device
*dev
= ptx_devices
[ord
];
1376 if (GOMP_VERSION_DEV (version
) > GOMP_VERSION_NVIDIA_PTX
)
1378 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1379 " (expected %u, received %u)",
1380 GOMP_VERSION_NVIDIA_PTX
, GOMP_VERSION_DEV (version
));
1385 pthread_mutex_lock (&dev
->image_lock
);
1386 for (prev_p
= &dev
->images
; (image
= *prev_p
) != 0; prev_p
= &image
->next
)
1387 if (image
->target_data
== target_data
)
1389 *prev_p
= image
->next
;
1390 if (CUDA_CALL_NOCHECK (cuModuleUnload
, image
->module
) != CUDA_SUCCESS
)
1396 pthread_mutex_unlock (&dev
->image_lock
);
1401 GOMP_OFFLOAD_alloc (int ord
, size_t size
)
1403 if (!nvptx_attach_host_thread_to_device (ord
))
1406 struct ptx_device
*ptx_dev
= ptx_devices
[ord
];
1407 struct ptx_free_block
*blocks
, *tmp
;
1409 pthread_mutex_lock (&ptx_dev
->free_blocks_lock
);
1410 blocks
= ptx_dev
->free_blocks
;
1411 ptx_dev
->free_blocks
= NULL
;
1412 pthread_mutex_unlock (&ptx_dev
->free_blocks_lock
);
1417 nvptx_free (blocks
->ptr
, ptx_dev
);
1422 return nvptx_alloc (size
);
1426 GOMP_OFFLOAD_free (int ord
, void *ptr
)
1428 return (nvptx_attach_host_thread_to_device (ord
)
1429 && nvptx_free (ptr
, ptx_devices
[ord
]));
1433 GOMP_OFFLOAD_openacc_exec (void (*fn
) (void *), size_t mapnum
,
1434 void **hostaddrs
, void **devaddrs
,
1435 unsigned *dims
, void *targ_mem_desc
)
1437 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__
);
1439 struct goacc_thread
*thr
= GOMP_PLUGIN_goacc_thread ();
1440 acc_prof_info
*prof_info
= thr
->prof_info
;
1441 acc_event_info data_event_info
;
1442 acc_api_info
*api_info
= thr
->api_info
;
1443 bool profiling_p
= __builtin_expect (prof_info
!= NULL
, false);
1450 size_t s
= mapnum
* sizeof (void *);
1452 for (int i
= 0; i
< mapnum
; i
++)
1453 hp
[i
] = (devaddrs
[i
] ? devaddrs
[i
] : hostaddrs
[i
]);
1454 CUDA_CALL_ASSERT (cuMemAlloc
, &dp
, s
);
1456 goacc_profiling_acc_ev_alloc (thr
, (void *) dp
, s
);
1459 /* Copy the (device) pointers to arguments to the device (dp and hp might in
1460 fact have the same value on a unified-memory system). */
1465 prof_info
->event_type
= acc_ev_enqueue_upload_start
;
1467 data_event_info
.data_event
.event_type
= prof_info
->event_type
;
1468 data_event_info
.data_event
.valid_bytes
1469 = _ACC_DATA_EVENT_INFO_VALID_BYTES
;
1470 data_event_info
.data_event
.parent_construct
1471 = acc_construct_parallel
;
1472 data_event_info
.data_event
.implicit
= 1; /* Always implicit. */
1473 data_event_info
.data_event
.tool_info
= NULL
;
1474 data_event_info
.data_event
.var_name
= NULL
;
1475 data_event_info
.data_event
.bytes
= mapnum
* sizeof (void *);
1476 data_event_info
.data_event
.host_ptr
= hp
;
1477 data_event_info
.data_event
.device_ptr
= (const void *) dp
;
1479 api_info
->device_api
= acc_device_api_cuda
;
1481 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
, &data_event_info
,
1484 CUDA_CALL_ASSERT (cuMemcpyHtoD
, dp
, (void *) hp
,
1485 mapnum
* sizeof (void *));
1488 prof_info
->event_type
= acc_ev_enqueue_upload_end
;
1489 data_event_info
.data_event
.event_type
= prof_info
->event_type
;
1490 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
, &data_event_info
,
1495 nvptx_exec (fn
, mapnum
, hostaddrs
, devaddrs
, dims
, targ_mem_desc
,
1498 CUresult r
= CUDA_CALL_NOCHECK (cuStreamSynchronize
, NULL
);
1499 const char *maybe_abort_msg
= "(perhaps abort was called)";
1500 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
1501 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r
),
1503 else if (r
!= CUDA_SUCCESS
)
1504 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r
));
1506 CUDA_CALL_ASSERT (cuMemFree
, dp
);
1508 goacc_profiling_acc_ev_free (thr
, (void *) dp
);
/* Stream-callback helper: release the argument block built by
   GOMP_OFFLOAD_openacc_async_exec.  PTR is a two-slot header followed
   by the pointer array: slot 0 holds the device buffer, slot 1 the
   owning ptx_device.  */

static void
cuda_free_argmem (void *ptr)
{
  void **block = (void **) ptr;
  nvptx_free (block[0], (struct ptx_device *) block[1]);
  free (block);
}
1520 GOMP_OFFLOAD_openacc_async_exec (void (*fn
) (void *), size_t mapnum
,
1521 void **hostaddrs
, void **devaddrs
,
1522 unsigned *dims
, void *targ_mem_desc
,
1523 struct goacc_asyncqueue
*aq
)
1525 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__
);
1527 struct goacc_thread
*thr
= GOMP_PLUGIN_goacc_thread ();
1528 acc_prof_info
*prof_info
= thr
->prof_info
;
1529 acc_event_info data_event_info
;
1530 acc_api_info
*api_info
= thr
->api_info
;
1531 bool profiling_p
= __builtin_expect (prof_info
!= NULL
, false);
1535 void **block
= NULL
;
1539 size_t s
= mapnum
* sizeof (void *);
1540 block
= (void **) GOMP_PLUGIN_malloc (2 * sizeof (void *) + s
);
1542 for (int i
= 0; i
< mapnum
; i
++)
1543 hp
[i
] = (devaddrs
[i
] ? devaddrs
[i
] : hostaddrs
[i
]);
1544 CUDA_CALL_ASSERT (cuMemAlloc
, &dp
, s
);
1546 goacc_profiling_acc_ev_alloc (thr
, (void *) dp
, s
);
1549 /* Copy the (device) pointers to arguments to the device (dp and hp might in
1550 fact have the same value on a unified-memory system). */
1555 prof_info
->event_type
= acc_ev_enqueue_upload_start
;
1557 data_event_info
.data_event
.event_type
= prof_info
->event_type
;
1558 data_event_info
.data_event
.valid_bytes
1559 = _ACC_DATA_EVENT_INFO_VALID_BYTES
;
1560 data_event_info
.data_event
.parent_construct
1561 = acc_construct_parallel
;
1562 data_event_info
.data_event
.implicit
= 1; /* Always implicit. */
1563 data_event_info
.data_event
.tool_info
= NULL
;
1564 data_event_info
.data_event
.var_name
= NULL
;
1565 data_event_info
.data_event
.bytes
= mapnum
* sizeof (void *);
1566 data_event_info
.data_event
.host_ptr
= hp
;
1567 data_event_info
.data_event
.device_ptr
= (const void *) dp
;
1569 api_info
->device_api
= acc_device_api_cuda
;
1571 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
, &data_event_info
,
1575 CUDA_CALL_ASSERT (cuMemcpyHtoDAsync
, dp
, (void *) hp
,
1576 mapnum
* sizeof (void *), aq
->cuda_stream
);
1577 block
[0] = (void *) dp
;
1579 struct nvptx_thread
*nvthd
=
1580 (struct nvptx_thread
*) GOMP_PLUGIN_acc_thread ();
1581 block
[1] = (void *) nvthd
->ptx_dev
;
1585 prof_info
->event_type
= acc_ev_enqueue_upload_end
;
1586 data_event_info
.data_event
.event_type
= prof_info
->event_type
;
1587 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info
, &data_event_info
,
1592 nvptx_exec (fn
, mapnum
, hostaddrs
, devaddrs
, dims
, targ_mem_desc
,
1593 dp
, aq
->cuda_stream
);
1596 GOMP_OFFLOAD_openacc_async_queue_callback (aq
, cuda_free_argmem
, block
);
1600 GOMP_OFFLOAD_openacc_create_thread_data (int ord
)
1602 struct ptx_device
*ptx_dev
;
1603 struct nvptx_thread
*nvthd
1604 = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread
));
1607 ptx_dev
= ptx_devices
[ord
];
1611 CUDA_CALL_ASSERT (cuCtxGetCurrent
, &thd_ctx
);
1613 assert (ptx_dev
->ctx
);
1616 CUDA_CALL_ASSERT (cuCtxPushCurrent
, ptx_dev
->ctx
);
1618 nvthd
->ptx_dev
= ptx_dev
;
1620 return (void *) nvthd
;
/* Release per-thread data created by G_O_openacc_create_thread_data.
   NOTE(review): body reconstructed — the visible source only shows the
   signature; confirm it does nothing beyond freeing DATA.  */

void
GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
{
  free (data);
}
/* Return the current CUDA device (as an opaque pointer) for the
   acc_get_current_cuda_device interface.  */

void *
GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
{
  return nvptx_get_current_cuda_device ();
}
/* Return the current CUDA context (as an opaque pointer) for the
   acc_get_current_cuda_context interface.  */

void *
GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
{
  return nvptx_get_current_cuda_context ();
}
1641 /* This returns a CUstream. */
1643 GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue
*aq
)
1645 return (void *) aq
->cuda_stream
;
1648 /* This takes a CUstream. */
1650 GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue
*aq
, void *stream
)
1652 if (aq
->cuda_stream
)
1654 CUDA_CALL_ASSERT (cuStreamSynchronize
, aq
->cuda_stream
);
1655 CUDA_CALL_ASSERT (cuStreamDestroy
, aq
->cuda_stream
);
1658 aq
->cuda_stream
= (CUstream
) stream
;
1662 struct goacc_asyncqueue
*
1663 GOMP_OFFLOAD_openacc_async_construct (int device
__attribute__((unused
)))
1665 CUstream stream
= NULL
;
1666 CUDA_CALL_ERET (NULL
, cuStreamCreate
, &stream
, CU_STREAM_DEFAULT
);
1668 struct goacc_asyncqueue
*aq
1669 = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue
));
1670 aq
->cuda_stream
= stream
;
1675 GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue
*aq
)
1677 CUDA_CALL_ERET (false, cuStreamDestroy
, aq
->cuda_stream
);
1683 GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue
*aq
)
1685 CUresult r
= CUDA_CALL_NOCHECK (cuStreamQuery
, aq
->cuda_stream
);
1686 if (r
== CUDA_SUCCESS
)
1688 if (r
== CUDA_ERROR_NOT_READY
)
1691 GOMP_PLUGIN_error ("cuStreamQuery error: %s", cuda_error (r
));
1696 GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue
*aq
)
1698 CUDA_CALL_ERET (false, cuStreamSynchronize
, aq
->cuda_stream
);
1703 GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue
*aq1
,
1704 struct goacc_asyncqueue
*aq2
)
1707 CUDA_CALL_ERET (false, cuEventCreate
, &e
, CU_EVENT_DISABLE_TIMING
);
1708 CUDA_CALL_ERET (false, cuEventRecord
, e
, aq1
->cuda_stream
);
1709 CUDA_CALL_ERET (false, cuStreamWaitEvent
, aq2
->cuda_stream
, e
, 0);
1714 cuda_callback_wrapper (CUstream stream
, CUresult res
, void *ptr
)
1716 if (res
!= CUDA_SUCCESS
)
1717 GOMP_PLUGIN_fatal ("%s error: %s", __FUNCTION__
, cuda_error (res
));
1718 struct nvptx_callback
*cb
= (struct nvptx_callback
*) ptr
;
1724 GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue
*aq
,
1725 void (*callback_fn
)(void *),
1728 struct nvptx_callback
*b
= GOMP_PLUGIN_malloc (sizeof (*b
));
1729 b
->fn
= callback_fn
;
1732 CUDA_CALL_ASSERT (cuStreamAddCallback
, aq
->cuda_stream
,
1733 cuda_callback_wrapper
, (void *) b
, 0);
1737 cuda_memcpy_sanity_check (const void *h
, const void *d
, size_t s
)
1745 GOMP_PLUGIN_error ("invalid device address");
1748 CUDA_CALL (cuMemGetAddressRange
, &pb
, &ps
, (CUdeviceptr
) d
);
1751 GOMP_PLUGIN_error ("invalid device address");
1756 GOMP_PLUGIN_error ("invalid host address");
1761 GOMP_PLUGIN_error ("invalid host or device address");
1764 if ((void *)(d
+ s
) > (void *)(pb
+ ps
))
1766 GOMP_PLUGIN_error ("invalid size");
1773 GOMP_OFFLOAD_host2dev (int ord
, void *dst
, const void *src
, size_t n
)
1775 if (!nvptx_attach_host_thread_to_device (ord
)
1776 || !cuda_memcpy_sanity_check (src
, dst
, n
))
1778 CUDA_CALL (cuMemcpyHtoD
, (CUdeviceptr
) dst
, src
, n
);
1783 GOMP_OFFLOAD_dev2host (int ord
, void *dst
, const void *src
, size_t n
)
1785 if (!nvptx_attach_host_thread_to_device (ord
)
1786 || !cuda_memcpy_sanity_check (dst
, src
, n
))
1788 CUDA_CALL (cuMemcpyDtoH
, dst
, (CUdeviceptr
) src
, n
);
1793 GOMP_OFFLOAD_dev2dev (int ord
, void *dst
, const void *src
, size_t n
)
1795 CUDA_CALL (cuMemcpyDtoDAsync
, (CUdeviceptr
) dst
, (CUdeviceptr
) src
, n
, NULL
);
1800 GOMP_OFFLOAD_openacc_async_host2dev (int ord
, void *dst
, const void *src
,
1801 size_t n
, struct goacc_asyncqueue
*aq
)
1803 if (!nvptx_attach_host_thread_to_device (ord
)
1804 || !cuda_memcpy_sanity_check (src
, dst
, n
))
1806 CUDA_CALL (cuMemcpyHtoDAsync
, (CUdeviceptr
) dst
, src
, n
, aq
->cuda_stream
);
1811 GOMP_OFFLOAD_openacc_async_dev2host (int ord
, void *dst
, const void *src
,
1812 size_t n
, struct goacc_asyncqueue
*aq
)
1814 if (!nvptx_attach_host_thread_to_device (ord
)
1815 || !cuda_memcpy_sanity_check (dst
, src
, n
))
1817 CUDA_CALL (cuMemcpyDtoHAsync
, dst
, (CUdeviceptr
) src
, n
, aq
->cuda_stream
);
1821 /* Adjust launch dimensions: pick good values for number of blocks and warps
1822 and ensure that number of warps does not exceed CUDA limits as well as GCC's
1826 nvptx_adjust_launch_bounds (struct targ_fn_descriptor
*fn
,
1827 struct ptx_device
*ptx_dev
,
1828 int *teams_p
, int *threads_p
)
1830 int max_warps_block
= fn
->max_threads_per_block
/ 32;
1831 /* Maximum 32 warps per block is an implementation limit in NVPTX backend
1832 and libgcc, which matches documented limit of all GPUs as of 2015. */
1833 if (max_warps_block
> 32)
1834 max_warps_block
= 32;
1835 if (*threads_p
<= 0)
1837 if (*threads_p
> max_warps_block
)
1838 *threads_p
= max_warps_block
;
1840 int regs_per_block
= fn
->regs_per_thread
* 32 * *threads_p
;
1841 /* This is an estimate of how many blocks the device can host simultaneously.
1842 Actual limit, which may be lower, can be queried with "occupancy control"
1843 driver interface (since CUDA 6.0). */
1844 int max_blocks
= ptx_dev
->regs_per_sm
/ regs_per_block
* ptx_dev
->num_sms
;
1845 if (*teams_p
<= 0 || *teams_p
> max_blocks
)
1846 *teams_p
= max_blocks
;
/* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
   kernel launch.
   NOTE(review): body reconstructed — the visible source only shows the
   signature; confirm the 128 KiB default against the canonical source.  */

static size_t
nvptx_stacks_size ()
{
  return 128 * 1024;
}
1858 /* Return contiguous storage for NUM stacks, each SIZE bytes. */
1861 nvptx_stacks_alloc (size_t size
, int num
)
1864 CUresult r
= CUDA_CALL_NOCHECK (cuMemAlloc
, &stacks
, size
* num
);
1865 if (r
!= CUDA_SUCCESS
)
1866 GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r
));
1867 return (void *) stacks
;
1870 /* Release storage previously allocated by nvptx_stacks_alloc. */
1873 nvptx_stacks_free (void *p
, int num
)
1875 CUresult r
= CUDA_CALL_NOCHECK (cuMemFree
, (CUdeviceptr
) p
);
1876 if (r
!= CUDA_SUCCESS
)
1877 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r
));
1881 GOMP_OFFLOAD_run (int ord
, void *tgt_fn
, void *tgt_vars
, void **args
)
1883 CUfunction function
= ((struct targ_fn_descriptor
*) tgt_fn
)->fn
;
1885 struct ptx_device
*ptx_dev
= ptx_devices
[ord
];
1886 const char *maybe_abort_msg
= "(perhaps abort was called)";
1887 int teams
= 0, threads
= 0;
1890 GOMP_PLUGIN_fatal ("No target arguments provided");
1893 intptr_t id
= (intptr_t) *args
++, val
;
1894 if (id
& GOMP_TARGET_ARG_SUBSEQUENT_PARAM
)
1895 val
= (intptr_t) *args
++;
1897 val
= id
>> GOMP_TARGET_ARG_VALUE_SHIFT
;
1898 if ((id
& GOMP_TARGET_ARG_DEVICE_MASK
) != GOMP_TARGET_ARG_DEVICE_ALL
)
1900 val
= val
> INT_MAX
? INT_MAX
: val
;
1901 id
&= GOMP_TARGET_ARG_ID_MASK
;
1902 if (id
== GOMP_TARGET_ARG_NUM_TEAMS
)
1904 else if (id
== GOMP_TARGET_ARG_THREAD_LIMIT
)
1907 nvptx_adjust_launch_bounds (tgt_fn
, ptx_dev
, &teams
, &threads
);
1909 size_t stack_size
= nvptx_stacks_size ();
1910 void *stacks
= nvptx_stacks_alloc (stack_size
, teams
* threads
);
1911 void *fn_args
[] = {tgt_vars
, stacks
, (void *) stack_size
};
1912 size_t fn_args_size
= sizeof fn_args
;
1914 CU_LAUNCH_PARAM_BUFFER_POINTER
, fn_args
,
1915 CU_LAUNCH_PARAM_BUFFER_SIZE
, &fn_args_size
,
1918 r
= CUDA_CALL_NOCHECK (cuLaunchKernel
, function
, teams
, 1, 1,
1919 32, threads
, 1, 0, NULL
, NULL
, config
);
1920 if (r
!= CUDA_SUCCESS
)
1921 GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r
));
1923 r
= CUDA_CALL_NOCHECK (cuCtxSynchronize
, );
1924 if (r
== CUDA_ERROR_LAUNCH_FAILED
)
1925 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r
),
1927 else if (r
!= CUDA_SUCCESS
)
1928 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r
));
1929 nvptx_stacks_free (stacks
, teams
* threads
);
1933 GOMP_OFFLOAD_async_run (int ord
, void *tgt_fn
, void *tgt_vars
, void **args
,
1936 GOMP_PLUGIN_fatal ("GOMP_OFFLOAD_async_run unimplemented");