1 /* Plugin for NVPTX execution.
2
3 Copyright (C) 2013-2020 Free Software Foundation, Inc.
4
5 Contributed by Mentor Embedded.
6
7 This file is part of the GNU Offloading and Multi Processing Library
8 (libgomp).
9
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
13 any later version.
14
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
18 more details.
19
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
23
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
28
29 /* Nvidia PTX-specific parts of OpenACC support. The CUDA driver
30 library appears to hold some implicit state, but the documentation
31 is not clear as to what that state might be, or how one might
32 propagate it from one thread to another. */
33
34 #define _GNU_SOURCE
35 #include "openacc.h"
36 #include "config.h"
37 #include "libgomp-plugin.h"
38 #include "oacc-plugin.h"
39 #include "gomp-constants.h"
40 #include "oacc-int.h"
41
42 #include <pthread.h>
43 #include <cuda.h>
44 #include <stdbool.h>
45 #include <limits.h>
46 #include <string.h>
47 #include <stdio.h>
48 #include <unistd.h>
49 #include <assert.h>
50 #include <errno.h>
51
52 #if CUDA_VERSION < 6000
53 extern CUresult cuGetErrorString (CUresult, const char **);
54 #define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
55 #endif
56
57 #if CUDA_VERSION >= 6050
58 #undef cuLinkCreate
59 #undef cuLinkAddData
60 CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
61 const char *, unsigned, CUjit_option *, void **);
62 CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
63 #else
64 typedef size_t (*CUoccupancyB2DSize)(int);
65 CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
66 const char *, unsigned, CUjit_option *, void **);
67 CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
68 CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
69 CUoccupancyB2DSize, size_t, int);
70 #endif
71
72 #define DO_PRAGMA(x) _Pragma (#x)
73
74 #if PLUGIN_NVPTX_DYNAMIC
75 # include <dlfcn.h>
76
77 struct cuda_lib_s {
78
79 # define CUDA_ONE_CALL(call) \
80 __typeof (call) *call;
81 # define CUDA_ONE_CALL_MAYBE_NULL(call) \
82 CUDA_ONE_CALL (call)
83 #include "cuda-lib.def"
84 # undef CUDA_ONE_CALL
85 # undef CUDA_ONE_CALL_MAYBE_NULL
86
87 } cuda_lib;
88
89 /* -1 if init_cuda_lib has not been called yet, false
90 if it has been and failed, true if it has been and succeeded. */
91 static signed char cuda_lib_inited = -1;
92
93 /* Dynamically load the CUDA driver library (libcuda.so.1) and initialize
94 function pointers; return false if unsuccessful, true if successful. */
95 static bool
96 init_cuda_lib (void)
97 {
98 if (cuda_lib_inited != -1)
99 return cuda_lib_inited;
100 const char *cuda_runtime_lib = "libcuda.so.1";
101 void *h = dlopen (cuda_runtime_lib, RTLD_LAZY);
102 cuda_lib_inited = false;
103 if (h == NULL)
104 return false;
105
106 # define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false)
107 # define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true)
108 # define CUDA_ONE_CALL_1(call, allow_null) \
109 cuda_lib.call = dlsym (h, #call); \
110 if (!allow_null && cuda_lib.call == NULL) \
111 return false;
112 #include "cuda-lib.def"
113 # undef CUDA_ONE_CALL
114 # undef CUDA_ONE_CALL_1
115 # undef CUDA_ONE_CALL_MAYBE_NULL
116
117 cuda_lib_inited = true;
118 return true;
119 }
120 # define CUDA_CALL_PREFIX cuda_lib.
121 #else
122
123 # define CUDA_ONE_CALL(call)
124 # define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call)
125 #include "cuda-lib.def"
126 #undef CUDA_ONE_CALL_MAYBE_NULL
127 #undef CUDA_ONE_CALL
128
129 # define CUDA_CALL_PREFIX
130 # define init_cuda_lib() true
131 #endif
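/* Illustrative sketch (not part of the build): assuming "cuda-lib.def"
   contains an entry such as CUDA_ONE_CALL (cuMemAlloc), the
   PLUGIN_NVPTX_DYNAMIC machinery above expands roughly to

     struct cuda_lib_s {
       __typeof (cuMemAlloc) *cuMemAlloc;   // one function pointer per entry
       // ... further members, one per cuda-lib.def entry ...
     } cuda_lib;

     // in init_cuda_lib ():
     cuda_lib.cuMemAlloc = dlsym (h, "cuMemAlloc");
     if (cuda_lib.cuMemAlloc == NULL)
       return false;

   so that CUDA_CALL_PREFIX routes every call through cuda_lib.<fn>.
   In the non-dynamic build the same calls bind directly to libcuda, with
   CUDA_ONE_CALL_MAYBE_NULL entries declared as weak symbols instead.  */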
132
133 #include "secure_getenv.h"
134
135 #undef MIN
136 #undef MAX
137 #define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
138 #define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
139
140 /* Convenience macros for the frequently used sequence of a CUDA library
141 call followed by error handling, as well as for CUDA library calls that
142 do the error checking themselves or don't do it at all. */
143
144 #define CUDA_CALL_ERET(ERET, FN, ...) \
145 do { \
146 unsigned __r \
147 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
148 if (__r != CUDA_SUCCESS) \
149 { \
150 GOMP_PLUGIN_error (#FN " error: %s", \
151 cuda_error (__r)); \
152 return ERET; \
153 } \
154 } while (0)
155
156 #define CUDA_CALL(FN, ...) \
157 CUDA_CALL_ERET (false, FN, __VA_ARGS__)
158
159 #define CUDA_CALL_ASSERT(FN, ...) \
160 do { \
161 unsigned __r \
162 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
163 if (__r != CUDA_SUCCESS) \
164 { \
165 GOMP_PLUGIN_fatal (#FN " error: %s", \
166 cuda_error (__r)); \
167 } \
168 } while (0)
169
170 #define CUDA_CALL_NOCHECK(FN, ...) \
171 CUDA_CALL_PREFIX FN (__VA_ARGS__)
172
173 #define CUDA_CALL_EXISTS(FN) \
174 CUDA_CALL_PREFIX FN
175
176 static const char *
177 cuda_error (CUresult r)
178 {
179 const char *fallback = "unknown cuda error";
180 const char *desc;
181
182 if (!CUDA_CALL_EXISTS (cuGetErrorString))
183 return fallback;
184
185 r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
186 if (r == CUDA_SUCCESS)
187 return desc;
188
189 return fallback;
190 }
191
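/* Usage sketch (hypothetical helper, not part of this file): the macros
   above cover the common calling styles.

     static bool
     example_query_warp_size (CUdevice dev, int *warp_size)
     {
       // Propagates failure to the caller as 'false'.
       CUDA_CALL (cuDeviceGetAttribute, warp_size,
                  CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
       return true;
     }

     // Return a custom error value instead of 'false':
     //   CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);
     // Abort the process on failure:
     //   CUDA_CALL_ASSERT (cuCtxPushCurrent, ctx);
     // Inspect the CUresult manually:
     //   CUresult r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);

   CUDA_CALL_EXISTS is only meaningful for functions listed with
   CUDA_ONE_CALL_MAYBE_NULL, e.g. cuGetErrorString as used in cuda_error
   above.  */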
192 /* Version of the CUDA Toolkit in the same MAJOR.MINOR format that is used by
193 Nvidia, such as in the 'deviceQuery' program (Nvidia's CUDA samples). */
194 static char cuda_driver_version_s[30];
195
196 static unsigned int instantiated_devices = 0;
197 static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
198
199 /* NVPTX/CUDA specific definition of asynchronous queues. */
200 struct goacc_asyncqueue
201 {
202 CUstream cuda_stream;
203 };
204
205 struct nvptx_callback
206 {
207 void (*fn) (void *);
208 void *ptr;
209 struct goacc_asyncqueue *aq;
210 struct nvptx_callback *next;
211 };
212
213 /* Thread-specific data for PTX. */
214
215 struct nvptx_thread
216 {
217 /* We currently have this embedded inside the plugin because libgomp manages
218 devices through integer target_ids. This might be better handled via an
219 opaque target-specific pointer held directly in gomp_device_descr. */
220 struct ptx_device *ptx_dev;
221 };
222
223 /* Target data function launch information. */
224
225 struct targ_fn_launch
226 {
227 const char *fn;
228 unsigned short dim[GOMP_DIM_MAX];
229 };
230
231 /* Target PTX object information. */
232
233 struct targ_ptx_obj
234 {
235 const char *code;
236 size_t size;
237 };
238
239 /* Target data image information. */
240
241 typedef struct nvptx_tdata
242 {
243 const struct targ_ptx_obj *ptx_objs;
244 unsigned ptx_num;
245
246 const char *const *var_names;
247 unsigned var_num;
248
249 const struct targ_fn_launch *fn_descs;
250 unsigned fn_num;
251 } nvptx_tdata_t;
252
253 /* Descriptor of a loaded function. */
254
255 struct targ_fn_descriptor
256 {
257 CUfunction fn;
258 const struct targ_fn_launch *launch;
259 int regs_per_thread;
260 int max_threads_per_block;
261 };
262
263 /* A loaded PTX image. */
264 struct ptx_image_data
265 {
266 const void *target_data;
267 CUmodule module;
268
269 struct targ_fn_descriptor *fns; /* Array of functions. */
270
271 struct ptx_image_data *next;
272 };
273
274 struct ptx_free_block
275 {
276 void *ptr;
277 struct ptx_free_block *next;
278 };
279
280 struct ptx_device
281 {
282 CUcontext ctx;
283 bool ctx_shared;
284 CUdevice dev;
285
286 int ord;
287 bool overlap;
288 bool map;
289 bool concur;
290 bool mkern;
291 int mode;
292 int clock_khz;
293 int num_sms;
294 int regs_per_block;
295 int regs_per_sm;
296 int warp_size;
297 int max_threads_per_block;
298 int max_threads_per_multiprocessor;
299 int default_dims[GOMP_DIM_MAX];
300
301 /* Length as used by the CUDA Runtime API ('struct cudaDeviceProp'). */
302 char name[256];
303
304 struct ptx_image_data *images; /* Images loaded on device. */
305 pthread_mutex_t image_lock; /* Lock for above list. */
306
307 struct ptx_free_block *free_blocks;
308 pthread_mutex_t free_blocks_lock;
309
310 struct ptx_device *next;
311 };
312
313 static struct ptx_device **ptx_devices;
314
315 static inline struct nvptx_thread *
316 nvptx_thread (void)
317 {
318 return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
319 }
320
321 /* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK
322 should be locked on entry and remains locked on exit. */
323
324 static bool
325 nvptx_init (void)
326 {
327 int ndevs;
328
329 if (instantiated_devices != 0)
330 return true;
331
332 if (!init_cuda_lib ())
333 return false;
334
335 CUDA_CALL (cuInit, 0);
336
337 int cuda_driver_version;
338 CUDA_CALL_ERET (NULL, cuDriverGetVersion, &cuda_driver_version);
339 snprintf (cuda_driver_version_s, sizeof cuda_driver_version_s,
340 "CUDA Driver %u.%u",
341 cuda_driver_version / 1000, cuda_driver_version % 1000 / 10);
342
343 CUDA_CALL (cuDeviceGetCount, &ndevs);
344 ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
345 * ndevs);
346
347 return true;
348 }
349
350 /* Select the N'th PTX device for the current host thread. The device must
351 have been opened before calling this function. */
352
353 static bool
354 nvptx_attach_host_thread_to_device (int n)
355 {
356 CUdevice dev;
357 CUresult r;
358 struct ptx_device *ptx_dev;
359 CUcontext thd_ctx;
360
361 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
362 if (r == CUDA_ERROR_NOT_PERMITTED)
363 {
364 /* Assume we're in a CUDA callback; just return true. */
365 return true;
366 }
367 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
368 {
369 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
370 return false;
371 }
372
373 if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
374 return true;
375 else
376 {
377 CUcontext old_ctx;
378
379 ptx_dev = ptx_devices[n];
380 if (!ptx_dev)
381 {
382 GOMP_PLUGIN_error ("device %d not found", n);
383 return false;
384 }
385
386 CUDA_CALL (cuCtxGetCurrent, &thd_ctx);
387
388 /* We don't necessarily have a current context (e.g. if it has been
389 destroyed). Pop it if we do, though. */
390 if (thd_ctx != NULL)
391 CUDA_CALL (cuCtxPopCurrent, &old_ctx);
392
393 CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
394 }
395 return true;
396 }
397
398 static struct ptx_device *
399 nvptx_open_device (int n)
400 {
401 struct ptx_device *ptx_dev;
402 CUdevice dev, ctx_dev;
403 CUresult r;
404 int async_engines, pi;
405
406 CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);
407
408 ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));
409
410 ptx_dev->ord = n;
411 ptx_dev->dev = dev;
412 ptx_dev->ctx_shared = false;
413
414 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
415 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
416 {
417 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
418 return NULL;
419 }
420
421 if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
422 {
423 /* The current host thread has an active context for a different device.
424 Detach it. */
425 CUcontext old_ctx;
426 CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
427 }
428
429 CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);
430
431 if (!ptx_dev->ctx)
432 CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
433 else
434 ptx_dev->ctx_shared = true;
435
436 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
437 &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
438 ptx_dev->overlap = pi;
439
440 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
441 &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
442 ptx_dev->map = pi;
443
444 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
445 &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
446 ptx_dev->concur = pi;
447
448 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
449 &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
450 ptx_dev->mode = pi;
451
452 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
453 &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
454 ptx_dev->mkern = pi;
455
456 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
457 &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
458 ptx_dev->clock_khz = pi;
459
460 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
461 &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
462 ptx_dev->num_sms = pi;
463
464 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
465 &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
466 ptx_dev->regs_per_block = pi;
467
468 /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only
469 in CUDA 6.0 and newer. */
470 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
471 CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
472 dev);
473 /* Fallback: use limit of registers per block, which is usually equal. */
474 if (r == CUDA_ERROR_INVALID_VALUE)
475 pi = ptx_dev->regs_per_block;
476 else if (r != CUDA_SUCCESS)
477 {
478 GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
479 return NULL;
480 }
481 ptx_dev->regs_per_sm = pi;
482
483 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
484 &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
485 if (pi != 32)
486 {
487 GOMP_PLUGIN_error ("Only warp size 32 is supported");
488 return NULL;
489 }
490 ptx_dev->warp_size = pi;
491
492 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
493 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
494 ptx_dev->max_threads_per_block = pi;
495
496 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
497 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
498 ptx_dev->max_threads_per_multiprocessor = pi;
499
500 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines,
501 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
502 if (r != CUDA_SUCCESS)
503 async_engines = 1;
504
505 for (int i = 0; i != GOMP_DIM_MAX; i++)
506 ptx_dev->default_dims[i] = 0;
507
508 CUDA_CALL_ERET (NULL, cuDeviceGetName, ptx_dev->name, sizeof ptx_dev->name,
509 dev);
510
511 ptx_dev->images = NULL;
512 pthread_mutex_init (&ptx_dev->image_lock, NULL);
513
514 ptx_dev->free_blocks = NULL;
515 pthread_mutex_init (&ptx_dev->free_blocks_lock, NULL);
516
517 return ptx_dev;
518 }
519
520 static bool
521 nvptx_close_device (struct ptx_device *ptx_dev)
522 {
523 if (!ptx_dev)
524 return true;
525
526 for (struct ptx_free_block *b = ptx_dev->free_blocks; b;)
527 {
528 struct ptx_free_block *b_next = b->next;
529 CUDA_CALL (cuMemFree, (CUdeviceptr) b->ptr);
530 free (b);
531 b = b_next;
532 }
533
534 pthread_mutex_destroy (&ptx_dev->free_blocks_lock);
535 pthread_mutex_destroy (&ptx_dev->image_lock);
536
537 if (!ptx_dev->ctx_shared)
538 CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
539
540 free (ptx_dev);
541 return true;
542 }
543
544 static int
545 nvptx_get_num_devices (void)
546 {
547 int n;
548
549 /* PR libgomp/65099: Currently, we only support offloading in 64-bit
550 configurations. */
551 if (sizeof (void *) != 8)
552 {
553 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading;"
554 " only 64-bit configurations are supported\n");
555 return 0;
556 }
557
558 /* This function will be called before the plugin has been initialized in
559 order to enumerate available devices, but CUDA API routines can't be used
560 until cuInit has been called. Just call it now (but don't yet do any
561 further initialization). */
562 if (instantiated_devices == 0)
563 {
564 if (!init_cuda_lib ())
565 return 0;
566 CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
567 /* This is not an error: e.g. we may have CUDA libraries installed but
568 no devices available. */
569 if (r != CUDA_SUCCESS)
570 {
571 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
572 cuda_error (r));
573 return 0;
574 }
575 }
576
577 CUDA_CALL_ERET (-1, cuDeviceGetCount, &n);
578 return n;
579 }
580
581 static void
582 notify_var (const char *var_name, const char *env_var)
583 {
584 if (env_var == NULL)
585 GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name);
586 else
587 GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var);
588 }
589
590 static void
591 process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o)
592 {
593 const char *var_name = "GOMP_NVPTX_JIT";
594 const char *env_var = secure_getenv (var_name);
595 notify_var (var_name, env_var);
596
597 if (env_var == NULL)
598 return;
599
600 const char *c = env_var;
601 while (*c != '\0')
602 {
603 while (*c == ' ')
604 c++;
605
606 if (c[0] == '-' && c[1] == 'O'
607 && '0' <= c[2] && c[2] <= '4'
608 && (c[3] == '\0' || c[3] == ' '))
609 {
610 *gomp_nvptx_o = c[2] - '0';
611 c += 3;
612 continue;
613 }
614
615 GOMP_PLUGIN_error ("Error parsing %s", var_name);
616 break;
617 }
618 }
619
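/* Example (hypothetical invocation): the only option currently recognized
   is a space-separated list of -O<n> flags with 0 <= n <= 4, e.g.

     GOMP_NVPTX_JIT=-O3 ./a.out

   which makes link_ptx below pass CU_JIT_OPTIMIZATION_LEVEL = 3 to the
   CUDA JIT when the PTX objects are linked.  */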
620 static bool
621 link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
622 unsigned num_objs)
623 {
624 CUjit_option opts[7];
625 void *optvals[7];
626 float elapsed = 0.0;
627 char elog[1024];
628 char ilog[16384];
629 CUlinkState linkstate;
630 CUresult r;
631 void *linkout;
632 size_t linkoutsize __attribute__ ((unused));
633
634 opts[0] = CU_JIT_WALL_TIME;
635 optvals[0] = &elapsed;
636
637 opts[1] = CU_JIT_INFO_LOG_BUFFER;
638 optvals[1] = &ilog[0];
639
640 opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
641 optvals[2] = (void *) sizeof ilog;
642
643 opts[3] = CU_JIT_ERROR_LOG_BUFFER;
644 optvals[3] = &elog[0];
645
646 opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
647 optvals[4] = (void *) sizeof elog;
648
649 opts[5] = CU_JIT_LOG_VERBOSE;
650 optvals[5] = (void *) 1;
651
652 static intptr_t gomp_nvptx_o = -1;
653
654 static bool init_done = false;
655 if (!init_done)
656 {
657 process_GOMP_NVPTX_JIT (&gomp_nvptx_o);
658 init_done = true;
659 }
660
661 int nopts = 6;
662 if (gomp_nvptx_o != -1)
663 {
664 opts[nopts] = CU_JIT_OPTIMIZATION_LEVEL;
665 optvals[nopts] = (void *) gomp_nvptx_o;
666 nopts++;
667 }
668
669 if (CUDA_CALL_EXISTS (cuLinkCreate_v2))
670 CUDA_CALL (cuLinkCreate_v2, nopts, opts, optvals, &linkstate);
671 else
672 CUDA_CALL (cuLinkCreate, nopts, opts, optvals, &linkstate);
673
674 for (; num_objs--; ptx_objs++)
675 {
676 /* cuLinkAddData's 'data' argument erroneously omits the const
677 qualifier. */
678 GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
679 if (CUDA_CALL_EXISTS (cuLinkAddData_v2))
680 r = CUDA_CALL_NOCHECK (cuLinkAddData_v2, linkstate, CU_JIT_INPUT_PTX,
681 (char *) ptx_objs->code, ptx_objs->size,
682 0, 0, 0, 0);
683 else
684 r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
685 (char *) ptx_objs->code, ptx_objs->size,
686 0, 0, 0, 0);
687 if (r != CUDA_SUCCESS)
688 {
689 GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
690 GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
691 cuda_error (r));
692 return false;
693 }
694 }
695
696 GOMP_PLUGIN_debug (0, "Linking\n");
697 r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);
698
699 GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
700 GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
701
702 if (r != CUDA_SUCCESS)
703 {
704 GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
705 return false;
706 }
707
708 CUDA_CALL (cuModuleLoadData, module, linkout);
709 CUDA_CALL (cuLinkDestroy, linkstate);
710 return true;
711 }
712
713 static void
714 nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
715 unsigned *dims, void *targ_mem_desc,
716 CUdeviceptr dp, CUstream stream)
717 {
718 struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
719 CUfunction function;
720 int i;
721 void *kargs[1];
722 struct nvptx_thread *nvthd = nvptx_thread ();
723 int warp_size = nvthd->ptx_dev->warp_size;
724
725 function = targ_fn->fn;
726
727 /* Initialize the launch dimensions. Typically this is constant,
728 provided by the device compiler, but we must permit runtime
729 values. */
730 int seen_zero = 0;
731 for (i = 0; i != GOMP_DIM_MAX; i++)
732 {
733 if (targ_fn->launch->dim[i])
734 dims[i] = targ_fn->launch->dim[i];
735 if (!dims[i])
736 seen_zero = 1;
737 }
738
739 if (seen_zero)
740 {
741 pthread_mutex_lock (&ptx_dev_lock);
742
743 static int gomp_openacc_dims[GOMP_DIM_MAX];
744 if (!gomp_openacc_dims[0])
745 {
746 /* See if the user provided the GOMP_OPENACC_DIM environment
747 variable to specify runtime defaults. */
748 for (int i = 0; i < GOMP_DIM_MAX; ++i)
749 gomp_openacc_dims[i] = GOMP_PLUGIN_acc_default_dim (i);
750 }
751
752 if (!nvthd->ptx_dev->default_dims[0])
753 {
754 int default_dims[GOMP_DIM_MAX];
755 for (int i = 0; i < GOMP_DIM_MAX; ++i)
756 default_dims[i] = gomp_openacc_dims[i];
757
758 int gang, worker, vector;
759 {
760 int block_size = nvthd->ptx_dev->max_threads_per_block;
761 int cpu_size = nvthd->ptx_dev->max_threads_per_multiprocessor;
762 int dev_size = nvthd->ptx_dev->num_sms;
763 GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
764 " dev_size=%d, cpu_size=%d\n",
765 warp_size, block_size, dev_size, cpu_size);
766
767 gang = (cpu_size / block_size) * dev_size;
768 worker = block_size / warp_size;
769 vector = warp_size;
770 }
771
772 /* There is no upper bound on the gang size. The best size
773 matches the hardware configuration. Logical gangs are
774 scheduled onto physical hardware. To maximize usage, we
775 should guess a large number. */
776 if (default_dims[GOMP_DIM_GANG] < 1)
777 default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
778 /* The worker size must not exceed the hardware. */
779 if (default_dims[GOMP_DIM_WORKER] < 1
780 || (default_dims[GOMP_DIM_WORKER] > worker && gang))
781 default_dims[GOMP_DIM_WORKER] = worker;
782 /* The vector size must exactly match the hardware. */
783 if (default_dims[GOMP_DIM_VECTOR] < 1
784 || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
785 default_dims[GOMP_DIM_VECTOR] = vector;
786
787 GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
788 default_dims[GOMP_DIM_GANG],
789 default_dims[GOMP_DIM_WORKER],
790 default_dims[GOMP_DIM_VECTOR]);
791
792 for (i = 0; i != GOMP_DIM_MAX; i++)
793 nvthd->ptx_dev->default_dims[i] = default_dims[i];
794 }
795 pthread_mutex_unlock (&ptx_dev_lock);
796
797 {
798 bool default_dim_p[GOMP_DIM_MAX];
799 for (i = 0; i != GOMP_DIM_MAX; i++)
800 default_dim_p[i] = !dims[i];
801
802 if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize))
803 {
804 for (i = 0; i != GOMP_DIM_MAX; i++)
805 if (default_dim_p[i])
806 dims[i] = nvthd->ptx_dev->default_dims[i];
807
808 if (default_dim_p[GOMP_DIM_VECTOR])
809 dims[GOMP_DIM_VECTOR]
810 = MIN (dims[GOMP_DIM_VECTOR],
811 (targ_fn->max_threads_per_block / warp_size
812 * warp_size));
813
814 if (default_dim_p[GOMP_DIM_WORKER])
815 dims[GOMP_DIM_WORKER]
816 = MIN (dims[GOMP_DIM_WORKER],
817 targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
818 }
819 else
820 {
821 /* Handle the case that the compiler allows the runtime to choose
822 the vector-length conservatively, by ignoring
823 gomp_openacc_dims[GOMP_DIM_VECTOR]. TODO: actually handle
824 it. */
825 int vectors = 0;
826 /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that
827 gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
828 exceed targ_fn->max_threads_per_block. */
829 int workers = gomp_openacc_dims[GOMP_DIM_WORKER];
830 int gangs = gomp_openacc_dims[GOMP_DIM_GANG];
831 int grids, blocks;
832
833 CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
834 &blocks, function, NULL, 0,
835 dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
836 GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
837 "grid = %d, block = %d\n", grids, blocks);
838
839 /* Keep the num_gangs proportional to the block size. In the
840 case where a block size is limited by shared-memory or
841 register-file capacity, this keeps the runtime from
842 excessively over-assigning gangs to the multiprocessor
843 units, whose state would otherwise be swapped out even more
844 than necessary. The constant factor 2 is there to prevent
845 threads from idling when there is insufficient work for
846 them. */
847 if (gangs == 0)
848 gangs = 2 * grids * (blocks / warp_size);
849
850 if (vectors == 0)
851 vectors = warp_size;
852
853 if (workers == 0)
854 {
855 int actual_vectors = (default_dim_p[GOMP_DIM_VECTOR]
856 ? vectors
857 : dims[GOMP_DIM_VECTOR]);
858 workers = blocks / actual_vectors;
859 workers = MAX (workers, 1);
860 /* If we need a per-worker barrier ... . */
861 if (actual_vectors > 32)
862 /* Don't use more barriers than available. */
863 workers = MIN (workers, 15);
864 }
865
866 for (i = 0; i != GOMP_DIM_MAX; i++)
867 if (default_dim_p[i])
868 switch (i)
869 {
870 case GOMP_DIM_GANG: dims[i] = gangs; break;
871 case GOMP_DIM_WORKER: dims[i] = workers; break;
872 case GOMP_DIM_VECTOR: dims[i] = vectors; break;
873 default: GOMP_PLUGIN_fatal ("invalid dim");
874 }
875 }
876 }
877 }
878
879 /* Check if the accelerator has sufficient hardware resources to
880 launch the offloaded kernel. */
881 if (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]
882 > targ_fn->max_threads_per_block)
883 {
884 const char *msg
885 = ("The Nvidia accelerator has insufficient resources to launch '%s'"
886 " with num_workers = %d and vector_length = %d"
887 "; "
888 "recompile the program with 'num_workers = x and vector_length = y'"
889 " on that offloaded region or '-fopenacc-dim=:x:y' where"
890 " x * y <= %d"
891 ".\n");
892 GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
893 dims[GOMP_DIM_VECTOR], targ_fn->max_threads_per_block);
894 }
895
896 /* Check if the accelerator has sufficient barrier resources to
897 launch the offloaded kernel. */
898 if (dims[GOMP_DIM_WORKER] > 15 && dims[GOMP_DIM_VECTOR] > 32)
899 {
900 const char *msg
901 = ("The Nvidia accelerator has insufficient barrier resources to launch"
902 " '%s' with num_workers = %d and vector_length = %d"
903 "; "
904 "recompile the program with 'num_workers = x' on that offloaded"
905 " region or '-fopenacc-dim=:x:' where x <= 15"
906 "; "
907 "or, recompile the program with 'vector_length = 32' on that"
908 " offloaded region or '-fopenacc-dim=::32'"
909 ".\n");
910 GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
911 dims[GOMP_DIM_VECTOR]);
912 }
913
914 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
915 " gangs=%u, workers=%u, vectors=%u\n",
916 __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
917 dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);
918
919 // OpenACC CUDA
920 //
921 // num_gangs nctaid.x
922 // num_workers ntid.y
923 // vector length ntid.x
924
925 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
926 acc_prof_info *prof_info = thr->prof_info;
927 acc_event_info enqueue_launch_event_info;
928 acc_api_info *api_info = thr->api_info;
929 bool profiling_p = __builtin_expect (prof_info != NULL, false);
930 if (profiling_p)
931 {
932 prof_info->event_type = acc_ev_enqueue_launch_start;
933
934 enqueue_launch_event_info.launch_event.event_type
935 = prof_info->event_type;
936 enqueue_launch_event_info.launch_event.valid_bytes
937 = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
938 enqueue_launch_event_info.launch_event.parent_construct
939 = acc_construct_parallel;
940 enqueue_launch_event_info.launch_event.implicit = 1;
941 enqueue_launch_event_info.launch_event.tool_info = NULL;
942 enqueue_launch_event_info.launch_event.kernel_name = targ_fn->launch->fn;
943 enqueue_launch_event_info.launch_event.num_gangs
944 = dims[GOMP_DIM_GANG];
945 enqueue_launch_event_info.launch_event.num_workers
946 = dims[GOMP_DIM_WORKER];
947 enqueue_launch_event_info.launch_event.vector_length
948 = dims[GOMP_DIM_VECTOR];
949
950 api_info->device_api = acc_device_api_cuda;
951
952 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
953 api_info);
954 }
955
956 kargs[0] = &dp;
957 CUDA_CALL_ASSERT (cuLaunchKernel, function,
958 dims[GOMP_DIM_GANG], 1, 1,
959 dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
960 0, stream, kargs, 0);
961
962 if (profiling_p)
963 {
964 prof_info->event_type = acc_ev_enqueue_launch_end;
965 enqueue_launch_event_info.launch_event.event_type
966 = prof_info->event_type;
967 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
968 api_info);
969 }
970
971 GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__,
972 targ_fn->launch->fn);
973 }
974
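/* Worked example of the OpenACC -> CUDA mapping noted in nvptx_exec above
   (illustrative numbers only): with num_gangs = 1024, num_workers = 8 and
   vector_length = 32, the cuLaunchKernel call uses

     grid  = (1024, 1, 1)    // gangs   -> nctaid.x
     block = (32, 8, 1)      // vectors -> ntid.x, workers -> ntid.y

   i.e. 32 * 8 = 256 threads per block, which must not exceed the kernel's
   max_threads_per_block as checked before the launch.  */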
975 void * openacc_get_current_cuda_context (void);
976
977 static void
978 goacc_profiling_acc_ev_alloc (struct goacc_thread *thr, void *dp, size_t s)
979 {
980 acc_prof_info *prof_info = thr->prof_info;
981 acc_event_info data_event_info;
982 acc_api_info *api_info = thr->api_info;
983
984 prof_info->event_type = acc_ev_alloc;
985
986 data_event_info.data_event.event_type = prof_info->event_type;
987 data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
988 data_event_info.data_event.parent_construct = acc_construct_parallel;
989 data_event_info.data_event.implicit = 1;
990 data_event_info.data_event.tool_info = NULL;
991 data_event_info.data_event.var_name = NULL;
992 data_event_info.data_event.bytes = s;
993 data_event_info.data_event.host_ptr = NULL;
994 data_event_info.data_event.device_ptr = dp;
995
996 api_info->device_api = acc_device_api_cuda;
997
998 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
999 }
1000
1001 static void *
1002 nvptx_alloc (size_t s)
1003 {
1004 CUdeviceptr d;
1005
1006 CUDA_CALL_ERET (NULL, cuMemAlloc, &d, s);
1007 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1008 bool profiling_p
1009 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1010 if (profiling_p)
1011 goacc_profiling_acc_ev_alloc (thr, (void *) d, s);
1012
1013 return (void *) d;
1014 }
1015
1016 static void
1017 goacc_profiling_acc_ev_free (struct goacc_thread *thr, void *p)
1018 {
1019 acc_prof_info *prof_info = thr->prof_info;
1020 acc_event_info data_event_info;
1021 acc_api_info *api_info = thr->api_info;
1022
1023 prof_info->event_type = acc_ev_free;
1024
1025 data_event_info.data_event.event_type = prof_info->event_type;
1026 data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1027 data_event_info.data_event.parent_construct = acc_construct_parallel;
1028 data_event_info.data_event.implicit = 1;
1029 data_event_info.data_event.tool_info = NULL;
1030 data_event_info.data_event.var_name = NULL;
1031 data_event_info.data_event.bytes = -1;
1032 data_event_info.data_event.host_ptr = NULL;
1033 data_event_info.data_event.device_ptr = p;
1034
1035 api_info->device_api = acc_device_api_cuda;
1036
1037 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
1038 }
1039
1040 static bool
1041 nvptx_free (void *p, struct ptx_device *ptx_dev)
1042 {
1043 /* Assume callback context if this is null. */
1044 if (GOMP_PLUGIN_acc_thread () == NULL)
1045 {
1046 struct ptx_free_block *n
1047 = GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block));
1048 n->ptr = p;
1049 pthread_mutex_lock (&ptx_dev->free_blocks_lock);
1050 n->next = ptx_dev->free_blocks;
1051 ptx_dev->free_blocks = n;
1052 pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
1053 return true;
1054 }
1055
1056 CUdeviceptr pb;
1057 size_t ps;
1058
1059 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) p);
1060 if ((CUdeviceptr) p != pb)
1061 {
1062 GOMP_PLUGIN_error ("invalid device address");
1063 return false;
1064 }
1065
1066 CUDA_CALL (cuMemFree, (CUdeviceptr) p);
1067 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1068 bool profiling_p
1069 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1070 if (profiling_p)
1071 goacc_profiling_acc_ev_free (thr, p);
1072
1073 return true;
1074 }
1075
1076 static void *
1077 nvptx_get_current_cuda_device (void)
1078 {
1079 struct nvptx_thread *nvthd = nvptx_thread ();
1080
1081 if (!nvthd || !nvthd->ptx_dev)
1082 return NULL;
1083
1084 return &nvthd->ptx_dev->dev;
1085 }
1086
1087 static void *
1088 nvptx_get_current_cuda_context (void)
1089 {
1090 struct nvptx_thread *nvthd = nvptx_thread ();
1091
1092 if (!nvthd || !nvthd->ptx_dev)
1093 return NULL;
1094
1095 return nvthd->ptx_dev->ctx;
1096 }
1097
1098 /* Plugin entry points. */
1099
1100 const char *
1101 GOMP_OFFLOAD_get_name (void)
1102 {
1103 return "nvptx";
1104 }
1105
1106 unsigned int
1107 GOMP_OFFLOAD_get_caps (void)
1108 {
1109 return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
1110 }
1111
1112 int
1113 GOMP_OFFLOAD_get_type (void)
1114 {
1115 return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
1116 }
1117
1118 int
1119 GOMP_OFFLOAD_get_num_devices (void)
1120 {
1121 return nvptx_get_num_devices ();
1122 }
1123
1124 union gomp_device_property_value
1125 GOMP_OFFLOAD_get_property (int n, int prop)
1126 {
1127 union gomp_device_property_value propval = { .val = 0 };
1128
1129 pthread_mutex_lock (&ptx_dev_lock);
1130
1131 if (n >= nvptx_get_num_devices () || n < 0 || ptx_devices[n] == NULL)
1132 {
1133 pthread_mutex_unlock (&ptx_dev_lock);
1134 return propval;
1135 }
1136
1137 struct ptx_device *ptx_dev = ptx_devices[n];
1138 switch (prop)
1139 {
1140 case GOMP_DEVICE_PROPERTY_MEMORY:
1141 {
1142 size_t total_mem;
1143
1144 CUDA_CALL_ERET (propval, cuDeviceTotalMem, &total_mem, ptx_dev->dev);
1145 propval.val = total_mem;
1146 }
1147 break;
1148 case GOMP_DEVICE_PROPERTY_FREE_MEMORY:
1149 {
1150 size_t total_mem;
1151 size_t free_mem;
1152 CUdevice ctxdev;
1153
1154 CUDA_CALL_ERET (propval, cuCtxGetDevice, &ctxdev);
1155 if (ptx_dev->dev == ctxdev)
1156 CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
1157 else if (ptx_dev->ctx)
1158 {
1159 CUcontext old_ctx;
1160
1161 CUDA_CALL_ERET (propval, cuCtxPushCurrent, ptx_dev->ctx);
1162 CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
1163 CUDA_CALL_ASSERT (cuCtxPopCurrent, &old_ctx);
1164 }
1165 else
1166 {
1167 CUcontext new_ctx;
1168
1169 CUDA_CALL_ERET (propval, cuCtxCreate, &new_ctx, CU_CTX_SCHED_AUTO,
1170 ptx_dev->dev);
1171 CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
1172 CUDA_CALL_ASSERT (cuCtxDestroy, new_ctx);
1173 }
1174 propval.val = free_mem;
1175 }
1176 break;
1177 case GOMP_DEVICE_PROPERTY_NAME:
1178 propval.ptr = ptx_dev->name;
1179 break;
1180 case GOMP_DEVICE_PROPERTY_VENDOR:
1181 propval.ptr = "Nvidia";
1182 break;
1183 case GOMP_DEVICE_PROPERTY_DRIVER:
1184 propval.ptr = cuda_driver_version_s;
1185 break;
1186 }
1187
1188 pthread_mutex_unlock (&ptx_dev_lock);
1189 return propval;
1190 }
1191
1192 bool
1193 GOMP_OFFLOAD_init_device (int n)
1194 {
1195 struct ptx_device *dev;
1196
1197 pthread_mutex_lock (&ptx_dev_lock);
1198
1199 if (!nvptx_init () || ptx_devices[n] != NULL)
1200 {
1201 pthread_mutex_unlock (&ptx_dev_lock);
1202 return false;
1203 }
1204
1205 dev = nvptx_open_device (n);
1206 if (dev)
1207 {
1208 ptx_devices[n] = dev;
1209 instantiated_devices++;
1210 }
1211
1212 pthread_mutex_unlock (&ptx_dev_lock);
1213
1214 return dev != NULL;
1215 }
1216
1217 bool
1218 GOMP_OFFLOAD_fini_device (int n)
1219 {
1220 pthread_mutex_lock (&ptx_dev_lock);
1221
1222 if (ptx_devices[n] != NULL)
1223 {
1224 if (!nvptx_attach_host_thread_to_device (n)
1225 || !nvptx_close_device (ptx_devices[n]))
1226 {
1227 pthread_mutex_unlock (&ptx_dev_lock);
1228 return false;
1229 }
1230 ptx_devices[n] = NULL;
1231 instantiated_devices--;
1232 }
1233
1234 if (instantiated_devices == 0)
1235 {
1236 free (ptx_devices);
1237 ptx_devices = NULL;
1238 }
1239
1240 pthread_mutex_unlock (&ptx_dev_lock);
1241 return true;
1242 }
1243
1244 /* Return the libgomp version number we're compatible with. There is
1245 no requirement for cross-version compatibility. */
1246
1247 unsigned
1248 GOMP_OFFLOAD_version (void)
1249 {
1250 return GOMP_VERSION;
1251 }
1252
1253 /* Initialize __nvptx_clocktick, if present in MODULE. */
1254
1255 static void
1256 nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
1257 {
1258 CUdeviceptr dptr;
1259 CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
1260 module, "__nvptx_clocktick");
1261 if (r == CUDA_ERROR_NOT_FOUND)
1262 return;
1263 if (r != CUDA_SUCCESS)
1264 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1265 double __nvptx_clocktick = 1e-3 / dev->clock_khz;
1266 r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
1267 sizeof (__nvptx_clocktick));
1268 if (r != CUDA_SUCCESS)
1269 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
1270 }
1271
1272 /* Load the (partial) program described by TARGET_DATA to device
1273 number ORD. Allocate and return TARGET_TABLE. */
1274
1275 int
1276 GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
1277 struct addr_pair **target_table)
1278 {
1279 CUmodule module;
1280 const char *const *var_names;
1281 const struct targ_fn_launch *fn_descs;
1282 unsigned int fn_entries, var_entries, i, j;
1283 struct targ_fn_descriptor *targ_fns;
1284 struct addr_pair *targ_tbl;
1285 const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
1286 struct ptx_image_data *new_image;
1287 struct ptx_device *dev;
1288
1289 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1290 {
1291 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1292 " (expected %u, received %u)",
1293 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1294 return -1;
1295 }
1296
1297 if (!nvptx_attach_host_thread_to_device (ord)
1298 || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
1299 return -1;
1300
1301 dev = ptx_devices[ord];
1302
1303 /* The mkoffload utility emits a struct of pointers/integers at the
1304 start of each offload image. The array of kernel names and the
1305 function addresses form a one-to-one correspondence. */
1306
1307 var_entries = img_header->var_num;
1308 var_names = img_header->var_names;
1309 fn_entries = img_header->fn_num;
1310 fn_descs = img_header->fn_descs;
1311
1312 targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
1313 * (fn_entries + var_entries));
1314 targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
1315 * fn_entries);
1316
1317 *target_table = targ_tbl;
1318
1319 new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
1320 new_image->target_data = target_data;
1321 new_image->module = module;
1322 new_image->fns = targ_fns;
1323
1324 pthread_mutex_lock (&dev->image_lock);
1325 new_image->next = dev->images;
1326 dev->images = new_image;
1327 pthread_mutex_unlock (&dev->image_lock);
1328
1329 for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
1330 {
1331 CUfunction function;
1332 int nregs, mthrs;
1333
1334 CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
1335 fn_descs[i].fn);
1336 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
1337 CU_FUNC_ATTRIBUTE_NUM_REGS, function);
1338 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
1339 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);
1340
1341 targ_fns->fn = function;
1342 targ_fns->launch = &fn_descs[i];
1343 targ_fns->regs_per_thread = nregs;
1344 targ_fns->max_threads_per_block = mthrs;
1345
1346 targ_tbl->start = (uintptr_t) targ_fns;
1347 targ_tbl->end = targ_tbl->start + 1;
1348 }
1349
1350 for (j = 0; j < var_entries; j++, targ_tbl++)
1351 {
1352 CUdeviceptr var;
1353 size_t bytes;
1354
1355 CUDA_CALL_ERET (-1, cuModuleGetGlobal,
1356 &var, &bytes, module, var_names[j]);
1357
1358 targ_tbl->start = (uintptr_t) var;
1359 targ_tbl->end = targ_tbl->start + bytes;
1360 }
1361
1362 nvptx_set_clocktick (module, dev);
1363
1364 return fn_entries + var_entries;
1365 }
1366
1367 /* Unload the program described by TARGET_DATA. DEV_DATA is the array of
1368 function descriptors allocated by GOMP_OFFLOAD_load_image. */
1369
1370 bool
1371 GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
1372 {
1373 struct ptx_image_data *image, **prev_p;
1374 struct ptx_device *dev = ptx_devices[ord];
1375
1376 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1377 {
1378 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1379 " (expected %u, received %u)",
1380 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1381 return false;
1382 }
1383
1384 bool ret = true;
1385 pthread_mutex_lock (&dev->image_lock);
1386 for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
1387 if (image->target_data == target_data)
1388 {
1389 *prev_p = image->next;
1390 if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
1391 ret = false;
1392 free (image->fns);
1393 free (image);
1394 break;
1395 }
1396 pthread_mutex_unlock (&dev->image_lock);
1397 return ret;
1398 }
1399
1400 void *
1401 GOMP_OFFLOAD_alloc (int ord, size_t size)
1402 {
1403 if (!nvptx_attach_host_thread_to_device (ord))
1404 return NULL;
1405
1406 struct ptx_device *ptx_dev = ptx_devices[ord];
1407 struct ptx_free_block *blocks, *tmp;
1408
1409 pthread_mutex_lock (&ptx_dev->free_blocks_lock);
1410 blocks = ptx_dev->free_blocks;
1411 ptx_dev->free_blocks = NULL;
1412 pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
1413
1414 while (blocks)
1415 {
1416 tmp = blocks->next;
1417 nvptx_free (blocks->ptr, ptx_dev);
1418 free (blocks);
1419 blocks = tmp;
1420 }
1421
1422 return nvptx_alloc (size);
1423 }
1424
1425 bool
1426 GOMP_OFFLOAD_free (int ord, void *ptr)
1427 {
1428 return (nvptx_attach_host_thread_to_device (ord)
1429 && nvptx_free (ptr, ptx_devices[ord]));
1430 }
1431
1432 void
1433 GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
1434 void **hostaddrs, void **devaddrs,
1435 unsigned *dims, void *targ_mem_desc)
1436 {
1437 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
1438
1439 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1440 acc_prof_info *prof_info = thr->prof_info;
1441 acc_event_info data_event_info;
1442 acc_api_info *api_info = thr->api_info;
1443 bool profiling_p = __builtin_expect (prof_info != NULL, false);
1444
1445 void **hp = NULL;
1446 CUdeviceptr dp = 0;
1447
1448 if (mapnum > 0)
1449 {
1450 size_t s = mapnum * sizeof (void *);
1451 hp = alloca (s);
1452 for (int i = 0; i < mapnum; i++)
1453 hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
1454 CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
1455 if (profiling_p)
1456 goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
1457 }
1458
1459 /* Copy the (device) pointers to arguments to the device (dp and hp might in
1460 fact have the same value on a unified-memory system). */
1461 if (mapnum > 0)
1462 {
1463 if (profiling_p)
1464 {
1465 prof_info->event_type = acc_ev_enqueue_upload_start;
1466
1467 data_event_info.data_event.event_type = prof_info->event_type;
1468 data_event_info.data_event.valid_bytes
1469 = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1470 data_event_info.data_event.parent_construct
1471 = acc_construct_parallel;
1472 data_event_info.data_event.implicit = 1; /* Always implicit. */
1473 data_event_info.data_event.tool_info = NULL;
1474 data_event_info.data_event.var_name = NULL;
1475 data_event_info.data_event.bytes = mapnum * sizeof (void *);
1476 data_event_info.data_event.host_ptr = hp;
1477 data_event_info.data_event.device_ptr = (const void *) dp;
1478
1479 api_info->device_api = acc_device_api_cuda;
1480
1481 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1482 api_info);
1483 }
1484 CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, (void *) hp,
1485 mapnum * sizeof (void *));
1486 if (profiling_p)
1487 {
1488 prof_info->event_type = acc_ev_enqueue_upload_end;
1489 data_event_info.data_event.event_type = prof_info->event_type;
1490 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1491 api_info);
1492 }
1493 }
1494
1495 nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
1496 dp, NULL);
1497
1498 CUresult r = CUDA_CALL_NOCHECK (cuStreamSynchronize, NULL);
1499 const char *maybe_abort_msg = "(perhaps abort was called)";
1500 if (r == CUDA_ERROR_LAUNCH_FAILED)
1501 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
1502 maybe_abort_msg);
1503 else if (r != CUDA_SUCCESS)
1504 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
1505
1506 CUDA_CALL_ASSERT (cuMemFree, dp);
1507 if (profiling_p)
1508 goacc_profiling_acc_ev_free (thr, (void *) dp);
1509 }
1510
1511 static void
1512 cuda_free_argmem (void *ptr)
1513 {
1514 void **block = (void **) ptr;
1515 nvptx_free (block[0], (struct ptx_device *) block[1]);
1516 free (block);
1517 }
1518
1519 void
1520 GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *), size_t mapnum,
1521 void **hostaddrs, void **devaddrs,
1522 unsigned *dims, void *targ_mem_desc,
1523 struct goacc_asyncqueue *aq)
1524 {
1525 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
1526
1527 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1528 acc_prof_info *prof_info = thr->prof_info;
1529 acc_event_info data_event_info;
1530 acc_api_info *api_info = thr->api_info;
1531 bool profiling_p = __builtin_expect (prof_info != NULL, false);
1532
1533 void **hp = NULL;
1534 CUdeviceptr dp = 0;
1535 void **block = NULL;
1536
1537 if (mapnum > 0)
1538 {
1539 size_t s = mapnum * sizeof (void *);
1540 block = (void **) GOMP_PLUGIN_malloc (2 * sizeof (void *) + s);
1541 hp = block + 2;
1542 for (int i = 0; i < mapnum; i++)
1543 hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
1544 CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
1545 if (profiling_p)
1546 goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
1547 }
1548
1549 /* Copy the (device) pointers to arguments to the device (dp and hp might in
1550 fact have the same value on a unified-memory system). */
1551 if (mapnum > 0)
1552 {
1553 if (profiling_p)
1554 {
1555 prof_info->event_type = acc_ev_enqueue_upload_start;
1556
1557 data_event_info.data_event.event_type = prof_info->event_type;
1558 data_event_info.data_event.valid_bytes
1559 = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1560 data_event_info.data_event.parent_construct
1561 = acc_construct_parallel;
1562 data_event_info.data_event.implicit = 1; /* Always implicit. */
1563 data_event_info.data_event.tool_info = NULL;
1564 data_event_info.data_event.var_name = NULL;
1565 data_event_info.data_event.bytes = mapnum * sizeof (void *);
1566 data_event_info.data_event.host_ptr = hp;
1567 data_event_info.data_event.device_ptr = (const void *) dp;
1568
1569 api_info->device_api = acc_device_api_cuda;
1570
1571 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1572 api_info);
1573 }
1574
1575 CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, dp, (void *) hp,
1576 mapnum * sizeof (void *), aq->cuda_stream);
1577 block[0] = (void *) dp;
1578
1579 struct nvptx_thread *nvthd =
1580 (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
1581 block[1] = (void *) nvthd->ptx_dev;
1582
1583 if (profiling_p)
1584 {
1585 prof_info->event_type = acc_ev_enqueue_upload_end;
1586 data_event_info.data_event.event_type = prof_info->event_type;
1587 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1588 api_info);
1589 }
1590 }
1591
1592 nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
1593 dp, aq->cuda_stream);
1594
1595 if (mapnum > 0)
1596 GOMP_OFFLOAD_openacc_async_queue_callback (aq, cuda_free_argmem, block);
1597 }
1598
1599 void *
1600 GOMP_OFFLOAD_openacc_create_thread_data (int ord)
1601 {
1602 struct ptx_device *ptx_dev;
1603 struct nvptx_thread *nvthd
1604 = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
1605 CUcontext thd_ctx;
1606
1607 ptx_dev = ptx_devices[ord];
1608
1609 assert (ptx_dev);
1610
1611 CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);
1612
1613 assert (ptx_dev->ctx);
1614
1615 if (!thd_ctx)
1616 CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);
1617
1618 nvthd->ptx_dev = ptx_dev;
1619
1620 return (void *) nvthd;
1621 }
1622
1623 void
1624 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
1625 {
1626 free (data);
1627 }
1628
1629 void *
1630 GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
1631 {
1632 return nvptx_get_current_cuda_device ();
1633 }
1634
1635 void *
1636 GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
1637 {
1638 return nvptx_get_current_cuda_context ();
1639 }
1640
1641 /* This returns a CUstream. */
1642 void *
1643 GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *aq)
1644 {
1645 return (void *) aq->cuda_stream;
1646 }
1647
1648 /* This takes a CUstream. */
1649 int
1650 GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream)
1651 {
1652 if (aq->cuda_stream)
1653 {
1654 CUDA_CALL_ASSERT (cuStreamSynchronize, aq->cuda_stream);
1655 CUDA_CALL_ASSERT (cuStreamDestroy, aq->cuda_stream);
1656 }
1657
1658 aq->cuda_stream = (CUstream) stream;
1659 return 1;
1660 }
1661
1662 struct goacc_asyncqueue *
1663 GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
1664 {
1665 CUstream stream = NULL;
1666 CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, CU_STREAM_DEFAULT);
1667
1668 struct goacc_asyncqueue *aq
1669 = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue));
1670 aq->cuda_stream = stream;
1671 return aq;
1672 }
1673
1674 bool
1675 GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
1676 {
1677 CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream);
1678 free (aq);
1679 return true;
1680 }
1681
1682 int
1683 GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
1684 {
1685 CUresult r = CUDA_CALL_NOCHECK (cuStreamQuery, aq->cuda_stream);
1686 if (r == CUDA_SUCCESS)
1687 return 1;
1688 if (r == CUDA_ERROR_NOT_READY)
1689 return 0;
1690
1691 GOMP_PLUGIN_error ("cuStreamQuery error: %s", cuda_error (r));
1692 return -1;
1693 }
1694
1695 bool
1696 GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
1697 {
1698 CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream);
1699 return true;
1700 }
1701
1702 bool
1703 GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
1704 struct goacc_asyncqueue *aq2)
1705 {
1706 CUevent e;
1707 CUDA_CALL_ERET (false, cuEventCreate, &e, CU_EVENT_DISABLE_TIMING);
1708 CUDA_CALL_ERET (false, cuEventRecord, e, aq1->cuda_stream);
1709 CUDA_CALL_ERET (false, cuStreamWaitEvent, aq2->cuda_stream, e, 0);
1710 return true;
1711 }
1712
1713 static void
1714 cuda_callback_wrapper (CUstream stream, CUresult res, void *ptr)
1715 {
1716 if (res != CUDA_SUCCESS)
1717 GOMP_PLUGIN_fatal ("%s error: %s", __FUNCTION__, cuda_error (res));
1718 struct nvptx_callback *cb = (struct nvptx_callback *) ptr;
1719 cb->fn (cb->ptr);
1720 free (ptr);
1721 }
1722
1723 void
1724 GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
1725 void (*callback_fn)(void *),
1726 void *userptr)
1727 {
1728 struct nvptx_callback *b = GOMP_PLUGIN_malloc (sizeof (*b));
1729 b->fn = callback_fn;
1730 b->ptr = userptr;
1731 b->aq = aq;
1732 CUDA_CALL_ASSERT (cuStreamAddCallback, aq->cuda_stream,
1733 cuda_callback_wrapper, (void *) b, 0);
1734 }
1735
1736 static bool
1737 cuda_memcpy_sanity_check (const void *h, const void *d, size_t s)
1738 {
1739 CUdeviceptr pb;
1740 size_t ps;
1741 if (!s)
1742 return true;
1743 if (!d)
1744 {
1745 GOMP_PLUGIN_error ("invalid device address");
1746 return false;
1747 }
1748 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
1749 if (!pb)
1750 {
1751 GOMP_PLUGIN_error ("invalid device address");
1752 return false;
1753 }
1754 if (!h)
1755 {
1756 GOMP_PLUGIN_error ("invalid host address");
1757 return false;
1758 }
1759 if (d == h)
1760 {
1761 GOMP_PLUGIN_error ("invalid host or device address");
1762 return false;
1763 }
1764 if ((void *)(d + s) > (void *)(pb + ps))
1765 {
1766 GOMP_PLUGIN_error ("invalid size");
1767 return false;
1768 }
1769 return true;
1770 }
1771
1772 bool
1773 GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
1774 {
1775 if (!nvptx_attach_host_thread_to_device (ord)
1776 || !cuda_memcpy_sanity_check (src, dst, n))
1777 return false;
1778 CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) dst, src, n);
1779 return true;
1780 }
1781
1782 bool
1783 GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
1784 {
1785 if (!nvptx_attach_host_thread_to_device (ord)
1786 || !cuda_memcpy_sanity_check (dst, src, n))
1787 return false;
1788 CUDA_CALL (cuMemcpyDtoH, dst, (CUdeviceptr) src, n);
1789 return true;
1790 }
1791
1792 bool
1793 GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
1794 {
1795 CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, NULL);
1796 return true;
1797 }
1798
1799 bool
1800 GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src,
1801 size_t n, struct goacc_asyncqueue *aq)
1802 {
1803 if (!nvptx_attach_host_thread_to_device (ord)
1804 || !cuda_memcpy_sanity_check (src, dst, n))
1805 return false;
1806 CUDA_CALL (cuMemcpyHtoDAsync, (CUdeviceptr) dst, src, n, aq->cuda_stream);
1807 return true;
1808 }
1809
1810 bool
1811 GOMP_OFFLOAD_openacc_async_dev2host (int ord, void *dst, const void *src,
1812 size_t n, struct goacc_asyncqueue *aq)
1813 {
1814 if (!nvptx_attach_host_thread_to_device (ord)
1815 || !cuda_memcpy_sanity_check (dst, src, n))
1816 return false;
1817 CUDA_CALL (cuMemcpyDtoHAsync, dst, (CUdeviceptr) src, n, aq->cuda_stream);
1818 return true;
1819 }
1820
1821 /* Adjust launch dimensions: pick good values for number of blocks and warps
1822 and ensure that the number of warps does not exceed CUDA limits or GCC's
1823 own limits. */
1824
1825 static void
1826 nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
1827 struct ptx_device *ptx_dev,
1828 int *teams_p, int *threads_p)
1829 {
1830 int max_warps_block = fn->max_threads_per_block / 32;
1831 /* A maximum of 32 warps per block is an implementation limit in the NVPTX
1832 backend and libgcc, and matches the documented limit of all GPUs as of 2015. */
1833 if (max_warps_block > 32)
1834 max_warps_block = 32;
1835 if (*threads_p <= 0)
1836 *threads_p = 8;
1837 if (*threads_p > max_warps_block)
1838 *threads_p = max_warps_block;
1839
1840 int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
1841 /* This is an estimate of how many blocks the device can host simultaneously.
1842 The actual limit, which may be lower, can be queried with the "occupancy
1843 control" driver interface (available since CUDA 6.0). */
1844 int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
1845 if (*teams_p <= 0 || *teams_p > max_blocks)
1846 *teams_p = max_blocks;
1847 }
1848
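/* Worked example (illustrative numbers only): for a kernel with
   regs_per_thread = 32 and max_threads_per_block = 1024, max_warps_block
   is capped at 32 and *threads_p defaults to 8, giving
   regs_per_block = 32 * 32 * 8 = 8192 registers. On a hypothetical device
   with regs_per_sm = 65536 and num_sms = 20, the estimate is
   max_blocks = (65536 / 8192) * 20 = 160 teams.  */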
1849 /* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
1850 target regions. */
1851
1852 static size_t
1853 nvptx_stacks_size ()
1854 {
1855 return 128 * 1024;
1856 }
1857
1858 /* Return contiguous storage for NUM stacks, each SIZE bytes. */
1859
1860 static void *
1861 nvptx_stacks_alloc (size_t size, int num)
1862 {
1863 CUdeviceptr stacks;
1864 CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &stacks, size * num);
1865 if (r != CUDA_SUCCESS)
1866 GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
1867 return (void *) stacks;
1868 }
1869
1870 /* Release storage previously allocated by nvptx_stacks_alloc. */
1871
1872 static void
1873 nvptx_stacks_free (void *p, int num)
1874 {
1875 CUresult r = CUDA_CALL_NOCHECK (cuMemFree, (CUdeviceptr) p);
1876 if (r != CUDA_SUCCESS)
1877 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
1878 }
1879
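/* Sizing note (illustrative numbers only): GOMP_OFFLOAD_run below allocates
   one 128 KiB soft stack per (team, warp) pair, e.g. teams = 160 and
   threads = 8 warps per team give

     160 * 8 * 128 KiB = 160 MiB

   of device memory for the duration of the kernel, released again by
   nvptx_stacks_free after cuCtxSynchronize.  */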
1880 void
1881 GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
1882 {
1883 CUfunction function = ((struct targ_fn_descriptor *) tgt_fn)->fn;
1884 CUresult r;
1885 struct ptx_device *ptx_dev = ptx_devices[ord];
1886 const char *maybe_abort_msg = "(perhaps abort was called)";
1887 int teams = 0, threads = 0;
1888
1889 if (!args)
1890 GOMP_PLUGIN_fatal ("No target arguments provided");
1891 while (*args)
1892 {
1893 intptr_t id = (intptr_t) *args++, val;
1894 if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
1895 val = (intptr_t) *args++;
1896 else
1897 val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
1898 if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
1899 continue;
1900 val = val > INT_MAX ? INT_MAX : val;
1901 id &= GOMP_TARGET_ARG_ID_MASK;
1902 if (id == GOMP_TARGET_ARG_NUM_TEAMS)
1903 teams = val;
1904 else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
1905 threads = val;
1906 }
1907 nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
1908
1909 size_t stack_size = nvptx_stacks_size ();
1910 void *stacks = nvptx_stacks_alloc (stack_size, teams * threads);
1911 void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
1912 size_t fn_args_size = sizeof fn_args;
1913 void *config[] = {
1914 CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
1915 CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
1916 CU_LAUNCH_PARAM_END
1917 };
1918 r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
1919 32, threads, 1, 0, NULL, NULL, config);
1920 if (r != CUDA_SUCCESS)
1921 GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
1922
1923 r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
1924 if (r == CUDA_ERROR_LAUNCH_FAILED)
1925 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
1926 maybe_abort_msg);
1927 else if (r != CUDA_SUCCESS)
1928 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
1929 nvptx_stacks_free (stacks, teams * threads);
1930 }
1931
1932 void
1933 GOMP_OFFLOAD_async_run (int ord, void *tgt_fn, void *tgt_vars, void **args,
1934 void *async_data)
1935 {
1936 GOMP_PLUGIN_fatal ("GOMP_OFFLOAD_async_run unimplemented");
1937 }