1 /* Plugin for NVPTX execution.
2
3 Copyright (C) 2013-2022 Free Software Foundation, Inc.
4
5 Contributed by Mentor Embedded.
6
7 This file is part of the GNU Offloading and Multi Processing Library
8 (libgomp).
9
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
13 any later version.
14
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
18 more details.
19
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
23
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
28
29 /* Nvidia PTX-specific parts of OpenACC support.  The CUDA driver
30    library appears to hold some implicit state, but the documentation
31    is not clear as to what that state might be, or how one might
32    propagate it from one thread to another.  */
33
34 #define _GNU_SOURCE
35 #include "openacc.h"
36 #include "config.h"
37 #include "symcat.h"
38 #include "libgomp-plugin.h"
39 #include "oacc-plugin.h"
40 #include "gomp-constants.h"
41 #include "oacc-int.h"
42
43 #include <pthread.h>
44 #include <cuda.h>
45 #include <stdbool.h>
46 #include <limits.h>
47 #include <string.h>
48 #include <stdio.h>
49 #include <unistd.h>
50 #include <assert.h>
51 #include <errno.h>
52
53 /* An arbitrary fixed limit (128MB) for the size of the OpenMP soft stacks
54 block to cache between kernel invocations. For soft-stacks blocks bigger
55 than this, we will free the block before attempting another GPU memory
56 allocation (i.e. in GOMP_OFFLOAD_alloc). Otherwise, if an allocation fails,
57 we will free the cached soft-stacks block anyway then retry the
58 allocation. If that fails too, we lose. */
59
60 #define SOFTSTACK_CACHE_LIMIT 134217728
61
62 #if CUDA_VERSION < 6000
63 extern CUresult cuGetErrorString (CUresult, const char **);
64 #define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
65 #endif
66
67 #if CUDA_VERSION >= 6050
68 #undef cuLinkCreate
69 #undef cuLinkAddData
70 CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
71 const char *, unsigned, CUjit_option *, void **);
72 CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
73 #else
74 typedef size_t (*CUoccupancyB2DSize)(int);
75 CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
76 const char *, unsigned, CUjit_option *, void **);
77 CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
78 CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
79 CUoccupancyB2DSize, size_t, int);
80 #endif
81
82 #define DO_PRAGMA(x) _Pragma (#x)
83
84 #if PLUGIN_NVPTX_DYNAMIC
85 # include <dlfcn.h>
86
87 struct cuda_lib_s {
88
89 # define CUDA_ONE_CALL(call) \
90 __typeof (call) *call;
91 # define CUDA_ONE_CALL_MAYBE_NULL(call) \
92 CUDA_ONE_CALL (call)
93 #include "cuda-lib.def"
94 # undef CUDA_ONE_CALL
95 # undef CUDA_ONE_CALL_MAYBE_NULL
96
97 } cuda_lib;
98
99 /* -1 if init_cuda_lib has not been called yet, false
100 if it has been and failed, true if it has been and succeeded. */
101 static signed char cuda_lib_inited = -1;
102
103 /* Dynamically load the CUDA driver library and initialize function
104    pointers; return false if unsuccessful, true if successful.  */
105 static bool
106 init_cuda_lib (void)
107 {
108 if (cuda_lib_inited != -1)
109 return cuda_lib_inited;
110 const char *cuda_runtime_lib = "libcuda.so.1";
111 void *h = dlopen (cuda_runtime_lib, RTLD_LAZY);
112 cuda_lib_inited = false;
113 if (h == NULL)
114 return false;
115
116 # define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false)
117 # define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true)
118 # define CUDA_ONE_CALL_1(call, allow_null) \
119 cuda_lib.call = dlsym (h, #call); \
120 if (!allow_null && cuda_lib.call == NULL) \
121 return false;
122 #include "cuda-lib.def"
123 # undef CUDA_ONE_CALL
124 # undef CUDA_ONE_CALL_1
125 # undef CUDA_ONE_CALL_MAYBE_NULL
126
127 cuda_lib_inited = true;
128 return true;
129 }
130 # define CUDA_CALL_PREFIX cuda_lib.
131 #else
132
133 # define CUDA_ONE_CALL(call)
134 # define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call)
135 #include "cuda-lib.def"
136 #undef CUDA_ONE_CALL_MAYBE_NULL
137 #undef CUDA_ONE_CALL
138
139 # define CUDA_CALL_PREFIX
140 # define init_cuda_lib() true
141 #endif
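
/* In either configuration, optional entry points declared with
   CUDA_ONE_CALL_MAYBE_NULL can be probed at run time through
   CUDA_CALL_EXISTS below: in the dynamic build the corresponding
   'cuda_lib' pointer is simply left NULL when dlsym fails, and in the
   static build the symbol is declared weak, so its address is NULL when
   the installed libcuda does not provide it.  As a rough sketch, a call
   written as

     CUDA_CALL_PREFIX cuMemAlloc (&d, s);

   thus resolves either to 'cuda_lib.cuMemAlloc (&d, s)' or to a direct
   'cuMemAlloc (&d, s)'.  */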
142
143 #include "secure_getenv.h"
144
145 #undef MIN
146 #undef MAX
147 #define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
148 #define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
149
150 /* Convenience macros for the frequently used CUDA library call and
151    error handling sequence, as well as for CUDA library calls where the
152    caller does the error checking itself or skips it entirely.  */
153
154 #define CUDA_CALL_ERET(ERET, FN, ...) \
155 do { \
156 unsigned __r \
157 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
158 if (__r != CUDA_SUCCESS) \
159 { \
160 GOMP_PLUGIN_error (#FN " error: %s", \
161 cuda_error (__r)); \
162 return ERET; \
163 } \
164 } while (0)
165
166 #define CUDA_CALL(FN, ...) \
167 CUDA_CALL_ERET (false, FN, __VA_ARGS__)
168
169 #define CUDA_CALL_ASSERT(FN, ...) \
170 do { \
171 unsigned __r \
172 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
173 if (__r != CUDA_SUCCESS) \
174 { \
175 GOMP_PLUGIN_fatal (#FN " error: %s", \
176 cuda_error (__r)); \
177 } \
178 } while (0)
179
180 #define CUDA_CALL_NOCHECK(FN, ...) \
181 CUDA_CALL_PREFIX FN (__VA_ARGS__)
182
183 #define CUDA_CALL_EXISTS(FN) \
184 CUDA_CALL_PREFIX FN
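
/* For illustration, typical uses of these macros in this file look like

     CUDA_CALL (cuMemAlloc, &d, s);             // returns false on error
     CUDA_CALL_ERET (NULL, cuMemAlloc, &d, s);  // returns NULL on error
     CUDA_CALL_ASSERT (cuMemFree, d);           // fatal error on failure
     r = CUDA_CALL_NOCHECK (cuMemFree, d);      // caller inspects R

   while CUDA_CALL_EXISTS (cuGetErrorString) tests whether an optional
   entry point is available before it is called.  */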
185
186 static const char *
187 cuda_error (CUresult r)
188 {
189 const char *fallback = "unknown cuda error";
190 const char *desc;
191
192 if (!CUDA_CALL_EXISTS (cuGetErrorString))
193 return fallback;
194
195 r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
196 if (r == CUDA_SUCCESS)
197 return desc;
198
199 return fallback;
200 }
201
202 /* Version of the CUDA Toolkit in the same MAJOR.MINOR format that is used by
203 Nvidia, such as in the 'deviceQuery' program (Nvidia's CUDA samples). */
204 static char cuda_driver_version_s[30];
205
206 static unsigned int instantiated_devices = 0;
207 static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
208
209 /* NVPTX/CUDA specific definition of asynchronous queues. */
210 struct goacc_asyncqueue
211 {
212 CUstream cuda_stream;
213 };
214
215 struct nvptx_callback
216 {
217 void (*fn) (void *);
218 void *ptr;
219 struct goacc_asyncqueue *aq;
220 struct nvptx_callback *next;
221 };
222
223 /* Thread-specific data for PTX. */
224
225 struct nvptx_thread
226 {
227 /* We currently have this embedded inside the plugin because libgomp manages
228 devices through integer target_ids. This might be better if using an
229 opaque target-specific pointer directly from gomp_device_descr. */
230 struct ptx_device *ptx_dev;
231 };
232
233 /* Target data function launch information. */
234
235 struct targ_fn_launch
236 {
237 const char *fn;
238 unsigned short dim[GOMP_DIM_MAX];
239 };
240
241 /* Target PTX object information. */
242
243 struct targ_ptx_obj
244 {
245 const char *code;
246 size_t size;
247 };
248
249 /* Target data image information. */
250
251 typedef struct nvptx_tdata
252 {
253 const struct targ_ptx_obj *ptx_objs;
254 unsigned ptx_num;
255
256 const char *const *var_names;
257 unsigned var_num;
258
259 const struct targ_fn_launch *fn_descs;
260 unsigned fn_num;
261 } nvptx_tdata_t;
262
263 /* Descriptor of a loaded function. */
264
265 struct targ_fn_descriptor
266 {
267 CUfunction fn;
268 const struct targ_fn_launch *launch;
269 int regs_per_thread;
270 int max_threads_per_block;
271 };
272
273 /* A loaded PTX image. */
274 struct ptx_image_data
275 {
276 const void *target_data;
277 CUmodule module;
278
279 struct targ_fn_descriptor *fns; /* Array of functions. */
280
281 struct ptx_image_data *next;
282 };
283
284 struct ptx_free_block
285 {
286 void *ptr;
287 struct ptx_free_block *next;
288 };
289
290 struct ptx_device
291 {
292 CUcontext ctx;
293 bool ctx_shared;
294 CUdevice dev;
295
296 int ord;
297 bool overlap;
298 bool map;
299 bool concur;
300 bool mkern;
301 int mode;
302 int clock_khz;
303 int num_sms;
304 int regs_per_block;
305 int regs_per_sm;
306 int warp_size;
307 int max_threads_per_block;
308 int max_threads_per_multiprocessor;
309 int default_dims[GOMP_DIM_MAX];
310
311 /* Length as used by the CUDA Runtime API ('struct cudaDeviceProp'). */
312 char name[256];
313
314 struct ptx_image_data *images; /* Images loaded on device. */
315 pthread_mutex_t image_lock; /* Lock for above list. */
316
317 struct ptx_free_block *free_blocks;
318 pthread_mutex_t free_blocks_lock;
319
320 /* OpenMP stacks, cached between kernel invocations. */
321 struct
322 {
323 CUdeviceptr ptr;
324 size_t size;
325 pthread_mutex_t lock;
326 } omp_stacks;
327
328 struct ptx_device *next;
329 };
330
331 static struct ptx_device **ptx_devices;
332
333 static inline struct nvptx_thread *
334 nvptx_thread (void)
335 {
336 return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
337 }
338
339 /* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK
340 should be locked on entry and remains locked on exit. */
341
342 static bool
343 nvptx_init (void)
344 {
345 int ndevs;
346
347 if (instantiated_devices != 0)
348 return true;
349
350 if (!init_cuda_lib ())
351 return false;
352
353 CUDA_CALL (cuInit, 0);
354
355 int cuda_driver_version;
356 CUDA_CALL_ERET (NULL, cuDriverGetVersion, &cuda_driver_version);
357 snprintf (cuda_driver_version_s, sizeof cuda_driver_version_s,
358 "CUDA Driver %u.%u",
359 cuda_driver_version / 1000, cuda_driver_version % 1000 / 10);
360
361 CUDA_CALL (cuDeviceGetCount, &ndevs);
362 ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
363 * ndevs);
364
365 return true;
366 }
367
368 /* Select the N'th PTX device for the current host thread.  The device
369    must have been opened before calling this function.  */
370
371 static bool
372 nvptx_attach_host_thread_to_device (int n)
373 {
374 CUdevice dev;
375 CUresult r;
376 struct ptx_device *ptx_dev;
377 CUcontext thd_ctx;
378
379 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
380 if (r == CUDA_ERROR_NOT_PERMITTED)
381 {
382 /* Assume we're in a CUDA callback, just return true. */
383 return true;
384 }
385 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
386 {
387 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
388 return false;
389 }
390
391 if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
392 return true;
393 else
394 {
395 CUcontext old_ctx;
396
397 ptx_dev = ptx_devices[n];
398 if (!ptx_dev)
399 {
400 GOMP_PLUGIN_error ("device %d not found", n);
401 return false;
402 }
403
404 CUDA_CALL (cuCtxGetCurrent, &thd_ctx);
405
406 /* We don't necessarily have a current context (e.g. if it has been
407 destroyed).  Pop it if we do, though.  */
408 if (thd_ctx != NULL)
409 CUDA_CALL (cuCtxPopCurrent, &old_ctx);
410
411 CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
412 }
413 return true;
414 }
415
416 static struct ptx_device *
417 nvptx_open_device (int n)
418 {
419 struct ptx_device *ptx_dev;
420 CUdevice dev, ctx_dev;
421 CUresult r;
422 int async_engines, pi;
423
424 CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);
425
426 ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));
427
428 ptx_dev->ord = n;
429 ptx_dev->dev = dev;
430 ptx_dev->ctx_shared = false;
431
432 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
433 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
434 {
435 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
436 return NULL;
437 }
438
439 if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
440 {
441 /* The current host thread has an active context for a different device.
442 Detach it. */
443 CUcontext old_ctx;
444 CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
445 }
446
447 CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);
448
449 if (!ptx_dev->ctx)
450 CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
451 else
452 ptx_dev->ctx_shared = true;
453
454 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
455 &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
456 ptx_dev->overlap = pi;
457
458 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
459 &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
460 ptx_dev->map = pi;
461
462 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
463 &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
464 ptx_dev->concur = pi;
465
466 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
467 &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
468 ptx_dev->mode = pi;
469
470 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
471 &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
472 ptx_dev->mkern = pi;
473
474 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
475 &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
476 ptx_dev->clock_khz = pi;
477
478 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
479 &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
480 ptx_dev->num_sms = pi;
481
482 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
483 &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
484 ptx_dev->regs_per_block = pi;
485
486 /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only
487 in CUDA 6.0 and newer. */
488 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
489 CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
490 dev);
491 /* Fallback: use limit of registers per block, which is usually equal. */
492 if (r == CUDA_ERROR_INVALID_VALUE)
493 pi = ptx_dev->regs_per_block;
494 else if (r != CUDA_SUCCESS)
495 {
496 GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
497 return NULL;
498 }
499 ptx_dev->regs_per_sm = pi;
500
501 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
502 &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
503 if (pi != 32)
504 {
505 GOMP_PLUGIN_error ("Only warp size 32 is supported");
506 return NULL;
507 }
508 ptx_dev->warp_size = pi;
509
510 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
511 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
512 ptx_dev->max_threads_per_block = pi;
513
514 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
515 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
516 ptx_dev->max_threads_per_multiprocessor = pi;
517
518 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines,
519 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
520 if (r != CUDA_SUCCESS)
521 async_engines = 1;
522
523 for (int i = 0; i != GOMP_DIM_MAX; i++)
524 ptx_dev->default_dims[i] = 0;
525
526 CUDA_CALL_ERET (NULL, cuDeviceGetName, ptx_dev->name, sizeof ptx_dev->name,
527 dev);
528
529 ptx_dev->images = NULL;
530 pthread_mutex_init (&ptx_dev->image_lock, NULL);
531
532 ptx_dev->free_blocks = NULL;
533 pthread_mutex_init (&ptx_dev->free_blocks_lock, NULL);
534
535 ptx_dev->omp_stacks.ptr = 0;
536 ptx_dev->omp_stacks.size = 0;
537 pthread_mutex_init (&ptx_dev->omp_stacks.lock, NULL);
538
539 return ptx_dev;
540 }
541
542 static bool
543 nvptx_close_device (struct ptx_device *ptx_dev)
544 {
545 if (!ptx_dev)
546 return true;
547
548 for (struct ptx_free_block *b = ptx_dev->free_blocks; b;)
549 {
550 struct ptx_free_block *b_next = b->next;
551 CUDA_CALL (cuMemFree, (CUdeviceptr) b->ptr);
552 free (b);
553 b = b_next;
554 }
555
556 pthread_mutex_destroy (&ptx_dev->free_blocks_lock);
557 pthread_mutex_destroy (&ptx_dev->image_lock);
558
559 pthread_mutex_destroy (&ptx_dev->omp_stacks.lock);
560
561 if (ptx_dev->omp_stacks.ptr)
562 CUDA_CALL (cuMemFree, ptx_dev->omp_stacks.ptr);
563
564 if (!ptx_dev->ctx_shared)
565 CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
566
567 free (ptx_dev);
568 return true;
569 }
570
571 static int
572 nvptx_get_num_devices (void)
573 {
574 int n;
575
576 /* This function will be called before the plugin has been initialized in
577 order to enumerate available devices, but CUDA API routines can't be used
578 until cuInit has been called. Just call it now (but don't yet do any
579 further initialization). */
580 if (instantiated_devices == 0)
581 {
582 if (!init_cuda_lib ())
583 return 0;
584 CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
585 /* This is not an error: e.g. we may have CUDA libraries installed but
586 no devices available. */
587 if (r != CUDA_SUCCESS)
588 {
589 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
590 cuda_error (r));
591 return 0;
592 }
593 }
594
595 CUDA_CALL_ERET (-1, cuDeviceGetCount, &n);
596 return n;
597 }
598
599 static void
600 notify_var (const char *var_name, const char *env_var)
601 {
602 if (env_var == NULL)
603 GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name);
604 else
605 GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var);
606 }
607
608 static void
609 process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o)
610 {
611 const char *var_name = "GOMP_NVPTX_JIT";
612 const char *env_var = secure_getenv (var_name);
613 notify_var (var_name, env_var);
614
615 if (env_var == NULL)
616 return;
617
618 const char *c = env_var;
619 while (*c != '\0')
620 {
621 while (*c == ' ')
622 c++;
623
624 if (c[0] == '-' && c[1] == 'O'
625 && '0' <= c[2] && c[2] <= '4'
626 && (c[3] == '\0' || c[3] == ' '))
627 {
628 *gomp_nvptx_o = c[2] - '0';
629 c += 3;
630 continue;
631 }
632
633 GOMP_PLUGIN_error ("Error parsing %s", var_name);
634 break;
635 }
636 }
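
/* Example: running an offloaded program as, say,

     GOMP_NVPTX_JIT=-O2 ./a.out

   makes the loop above store 2 in *GOMP_NVPTX_O, which link_ptx below
   then hands to the PTX JIT as CU_JIT_OPTIMIZATION_LEVEL.  Several
   space-separated "-O<n>" options may be given; the last one wins.  */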
637
638 static bool
639 link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
640 unsigned num_objs)
641 {
642 CUjit_option opts[7];
643 void *optvals[7];
644 float elapsed = 0.0;
645 char elog[1024];
646 char ilog[16384];
647 CUlinkState linkstate;
648 CUresult r;
649 void *linkout;
650 size_t linkoutsize __attribute__ ((unused));
651
652 opts[0] = CU_JIT_WALL_TIME;
653 optvals[0] = &elapsed;
654
655 opts[1] = CU_JIT_INFO_LOG_BUFFER;
656 optvals[1] = &ilog[0];
657
658 opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
659 optvals[2] = (void *) sizeof ilog;
660
661 opts[3] = CU_JIT_ERROR_LOG_BUFFER;
662 optvals[3] = &elog[0];
663
664 opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
665 optvals[4] = (void *) sizeof elog;
666
667 opts[5] = CU_JIT_LOG_VERBOSE;
668 optvals[5] = (void *) 1;
669
670 static intptr_t gomp_nvptx_o = -1;
671
672 static bool init_done = false;
673 if (!init_done)
674 {
675 process_GOMP_NVPTX_JIT (&gomp_nvptx_o);
676 init_done = true;
677 }
678
679 int nopts = 6;
680 if (gomp_nvptx_o != -1)
681 {
682 opts[nopts] = CU_JIT_OPTIMIZATION_LEVEL;
683 optvals[nopts] = (void *) gomp_nvptx_o;
684 nopts++;
685 }
686
687 if (CUDA_CALL_EXISTS (cuLinkCreate_v2))
688 CUDA_CALL (cuLinkCreate_v2, nopts, opts, optvals, &linkstate);
689 else
690 CUDA_CALL (cuLinkCreate, nopts, opts, optvals, &linkstate);
691
692 for (; num_objs--; ptx_objs++)
693 {
694 /* cuLinkAddData's 'data' argument erroneously omits the const
695 qualifier. */
696 GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
697 if (CUDA_CALL_EXISTS (cuLinkAddData_v2))
698 r = CUDA_CALL_NOCHECK (cuLinkAddData_v2, linkstate, CU_JIT_INPUT_PTX,
699 (char *) ptx_objs->code, ptx_objs->size,
700 0, 0, 0, 0);
701 else
702 r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
703 (char *) ptx_objs->code, ptx_objs->size,
704 0, 0, 0, 0);
705 if (r != CUDA_SUCCESS)
706 {
707 GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
708 GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
709 cuda_error (r));
710 return false;
711 }
712 }
713
714 GOMP_PLUGIN_debug (0, "Linking\n");
715 r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);
716
717 GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
718 GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
719
720 if (r != CUDA_SUCCESS)
721 {
722 GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
723 GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
724 return false;
725 }
726
727 CUDA_CALL (cuModuleLoadData, module, linkout);
728 CUDA_CALL (cuLinkDestroy, linkstate);
729 return true;
730 }
731
732 static void
733 nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
734 unsigned *dims, void *targ_mem_desc,
735 CUdeviceptr dp, CUstream stream)
736 {
737 struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
738 CUfunction function;
739 int i;
740 void *kargs[1];
741 struct nvptx_thread *nvthd = nvptx_thread ();
742 int warp_size = nvthd->ptx_dev->warp_size;
743
744 function = targ_fn->fn;
745
746 /* Initialize the launch dimensions. Typically this is constant,
747 provided by the device compiler, but we must permit runtime
748 values. */
749 int seen_zero = 0;
750 for (i = 0; i != GOMP_DIM_MAX; i++)
751 {
752 if (targ_fn->launch->dim[i])
753 dims[i] = targ_fn->launch->dim[i];
754 if (!dims[i])
755 seen_zero = 1;
756 }
757
758 if (seen_zero)
759 {
760 pthread_mutex_lock (&ptx_dev_lock);
761
762 static int gomp_openacc_dims[GOMP_DIM_MAX];
763 if (!gomp_openacc_dims[0])
764 {
765 /* See if the user provided the GOMP_OPENACC_DIM environment
766 variable to specify runtime defaults.  */
767 for (int i = 0; i < GOMP_DIM_MAX; ++i)
768 gomp_openacc_dims[i] = GOMP_PLUGIN_acc_default_dim (i);
769 }
770
771 if (!nvthd->ptx_dev->default_dims[0])
772 {
773 int default_dims[GOMP_DIM_MAX];
774 for (int i = 0; i < GOMP_DIM_MAX; ++i)
775 default_dims[i] = gomp_openacc_dims[i];
776
777 int gang, worker, vector;
778 {
779 int block_size = nvthd->ptx_dev->max_threads_per_block;
780 int cpu_size = nvthd->ptx_dev->max_threads_per_multiprocessor;
781 int dev_size = nvthd->ptx_dev->num_sms;
782 GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
783 " dev_size=%d, cpu_size=%d\n",
784 warp_size, block_size, dev_size, cpu_size);
785
786 gang = (cpu_size / block_size) * dev_size;
787 worker = block_size / warp_size;
788 vector = warp_size;
789 }
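
/* As a hypothetical example: a device with max_threads_per_block = 1024,
   max_threads_per_multiprocessor = 2048, 80 SMs and warp_size = 32 yields
   gang = (2048 / 1024) * 80 = 160, worker = 1024 / 32 = 32 and
   vector = 32.  */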
790
791 /* There is no upper bound on the gang size. The best size
792 matches the hardware configuration. Logical gangs are
793 scheduled onto physical hardware. To maximize usage, we
794 should guess a large number. */
795 if (default_dims[GOMP_DIM_GANG] < 1)
796 default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
797 /* The worker size must not exceed the hardware. */
798 if (default_dims[GOMP_DIM_WORKER] < 1
799 || (default_dims[GOMP_DIM_WORKER] > worker && gang))
800 default_dims[GOMP_DIM_WORKER] = worker;
801 /* The vector size must exactly match the hardware. */
802 if (default_dims[GOMP_DIM_VECTOR] < 1
803 || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
804 default_dims[GOMP_DIM_VECTOR] = vector;
805
806 GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
807 default_dims[GOMP_DIM_GANG],
808 default_dims[GOMP_DIM_WORKER],
809 default_dims[GOMP_DIM_VECTOR]);
810
811 for (i = 0; i != GOMP_DIM_MAX; i++)
812 nvthd->ptx_dev->default_dims[i] = default_dims[i];
813 }
814 pthread_mutex_unlock (&ptx_dev_lock);
815
816 {
817 bool default_dim_p[GOMP_DIM_MAX];
818 for (i = 0; i != GOMP_DIM_MAX; i++)
819 default_dim_p[i] = !dims[i];
820
821 if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize))
822 {
823 for (i = 0; i != GOMP_DIM_MAX; i++)
824 if (default_dim_p[i])
825 dims[i] = nvthd->ptx_dev->default_dims[i];
826
827 if (default_dim_p[GOMP_DIM_VECTOR])
828 dims[GOMP_DIM_VECTOR]
829 = MIN (dims[GOMP_DIM_VECTOR],
830 (targ_fn->max_threads_per_block / warp_size
831 * warp_size));
832
833 if (default_dim_p[GOMP_DIM_WORKER])
834 dims[GOMP_DIM_WORKER]
835 = MIN (dims[GOMP_DIM_WORKER],
836 targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
837 }
838 else
839 {
840 /* Handle the case that the compiler allows the runtime to choose
841 the vector-length conservatively, by ignoring
842 gomp_openacc_dims[GOMP_DIM_VECTOR]. TODO: actually handle
843 it. */
844 int vectors = 0;
845 /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that
846 gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
847 exceed targ_fn->max_threads_per_block.  */
848 int workers = gomp_openacc_dims[GOMP_DIM_WORKER];
849 int gangs = gomp_openacc_dims[GOMP_DIM_GANG];
850 int grids, blocks;
851
852 CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
853 &blocks, function, NULL, 0,
854 dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
855 GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
856 "grid = %d, block = %d\n", grids, blocks);
857
858 /* Keep num_gangs proportional to the block size.  In the case
859 where the block size is limited by shared memory or the
860 register file capacity, this keeps the runtime from
861 excessively over-assigning gangs to the multiprocessor
862 units, whose state would otherwise be swapped out even
863 more than necessary.  The constant factor 2 is there to
864 prevent threads from idling when there is insufficient
865 work for them.  */
866 if (gangs == 0)
867 gangs = 2 * grids * (blocks / warp_size);
868
869 if (vectors == 0)
870 vectors = warp_size;
871
872 if (workers == 0)
873 {
874 int actual_vectors = (default_dim_p[GOMP_DIM_VECTOR]
875 ? vectors
876 : dims[GOMP_DIM_VECTOR]);
877 workers = blocks / actual_vectors;
878 workers = MAX (workers, 1);
879 /* If we need a per-worker barrier ... . */
880 if (actual_vectors > 32)
881 /* Don't use more barriers than available. */
882 workers = MIN (workers, 15);
883 }
884
885 for (i = 0; i != GOMP_DIM_MAX; i++)
886 if (default_dim_p[i])
887 switch (i)
888 {
889 case GOMP_DIM_GANG: dims[i] = gangs; break;
890 case GOMP_DIM_WORKER: dims[i] = workers; break;
891 case GOMP_DIM_VECTOR: dims[i] = vectors; break;
892 default: GOMP_PLUGIN_fatal ("invalid dim");
893 }
894 }
895 }
896 }
897
898 /* Check if the accelerator has sufficient hardware resources to
899 launch the offloaded kernel. */
900 if (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]
901 > targ_fn->max_threads_per_block)
902 {
903 const char *msg
904 = ("The Nvidia accelerator has insufficient resources to launch '%s'"
905 " with num_workers = %d and vector_length = %d"
906 "; "
907 "recompile the program with 'num_workers = x and vector_length = y'"
908 " on that offloaded region or '-fopenacc-dim=:x:y' where"
909 " x * y <= %d"
910 ".\n");
911 GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
912 dims[GOMP_DIM_VECTOR], targ_fn->max_threads_per_block);
913 }
914
915 /* Check if the accelerator has sufficient barrier resources to
916 launch the offloaded kernel. */
917 if (dims[GOMP_DIM_WORKER] > 15 && dims[GOMP_DIM_VECTOR] > 32)
918 {
919 const char *msg
920 = ("The Nvidia accelerator has insufficient barrier resources to launch"
921 " '%s' with num_workers = %d and vector_length = %d"
922 "; "
923 "recompile the program with 'num_workers = x' on that offloaded"
924 " region or '-fopenacc-dim=:x:' where x <= 15"
925 "; "
926 "or, recompile the program with 'vector_length = 32' on that"
927 " offloaded region or '-fopenacc-dim=::32'"
928 ".\n");
929 GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
930 dims[GOMP_DIM_VECTOR]);
931 }
932
933 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
934 " gangs=%u, workers=%u, vectors=%u\n",
935 __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
936 dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);
937
938 // OpenACC CUDA
939 //
940 // num_gangs nctaid.x
941 // num_workers ntid.y
942 // vector length ntid.x
943
944 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
945 acc_prof_info *prof_info = thr->prof_info;
946 acc_event_info enqueue_launch_event_info;
947 acc_api_info *api_info = thr->api_info;
948 bool profiling_p = __builtin_expect (prof_info != NULL, false);
949 if (profiling_p)
950 {
951 prof_info->event_type = acc_ev_enqueue_launch_start;
952
953 enqueue_launch_event_info.launch_event.event_type
954 = prof_info->event_type;
955 enqueue_launch_event_info.launch_event.valid_bytes
956 = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
957 enqueue_launch_event_info.launch_event.parent_construct
958 = acc_construct_parallel;
959 enqueue_launch_event_info.launch_event.implicit = 1;
960 enqueue_launch_event_info.launch_event.tool_info = NULL;
961 enqueue_launch_event_info.launch_event.kernel_name = targ_fn->launch->fn;
962 enqueue_launch_event_info.launch_event.num_gangs
963 = dims[GOMP_DIM_GANG];
964 enqueue_launch_event_info.launch_event.num_workers
965 = dims[GOMP_DIM_WORKER];
966 enqueue_launch_event_info.launch_event.vector_length
967 = dims[GOMP_DIM_VECTOR];
968
969 api_info->device_api = acc_device_api_cuda;
970
971 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
972 api_info);
973 }
974
975 kargs[0] = &dp;
976 CUDA_CALL_ASSERT (cuLaunchKernel, function,
977 dims[GOMP_DIM_GANG], 1, 1,
978 dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
979 0, stream, kargs, 0);
980
981 if (profiling_p)
982 {
983 prof_info->event_type = acc_ev_enqueue_launch_end;
984 enqueue_launch_event_info.launch_event.event_type
985 = prof_info->event_type;
986 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
987 api_info);
988 }
989
990 GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__,
991 targ_fn->launch->fn);
992 }
993
994 void * openacc_get_current_cuda_context (void);
995
996 static void
997 goacc_profiling_acc_ev_alloc (struct goacc_thread *thr, void *dp, size_t s)
998 {
999 acc_prof_info *prof_info = thr->prof_info;
1000 acc_event_info data_event_info;
1001 acc_api_info *api_info = thr->api_info;
1002
1003 prof_info->event_type = acc_ev_alloc;
1004
1005 data_event_info.data_event.event_type = prof_info->event_type;
1006 data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1007 data_event_info.data_event.parent_construct = acc_construct_parallel;
1008 data_event_info.data_event.implicit = 1;
1009 data_event_info.data_event.tool_info = NULL;
1010 data_event_info.data_event.var_name = NULL;
1011 data_event_info.data_event.bytes = s;
1012 data_event_info.data_event.host_ptr = NULL;
1013 data_event_info.data_event.device_ptr = dp;
1014
1015 api_info->device_api = acc_device_api_cuda;
1016
1017 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
1018 }
1019
1020 /* Free the cached soft-stacks block if it is above the SOFTSTACK_CACHE_LIMIT
1021 size threshold, or if FORCE is true. */
1022
1023 static void
1024 nvptx_stacks_free (struct ptx_device *ptx_dev, bool force)
1025 {
1026 pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
1027 if (ptx_dev->omp_stacks.ptr
1028 && (force || ptx_dev->omp_stacks.size > SOFTSTACK_CACHE_LIMIT))
1029 {
1030 CUresult r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
1031 if (r != CUDA_SUCCESS)
1032 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
1033 ptx_dev->omp_stacks.ptr = 0;
1034 ptx_dev->omp_stacks.size = 0;
1035 }
1036 pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
1037 }
1038
1039 static void *
1040 nvptx_alloc (size_t s, bool suppress_errors)
1041 {
1042 CUdeviceptr d;
1043
1044 CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &d, s);
1045 if (suppress_errors && r == CUDA_ERROR_OUT_OF_MEMORY)
1046 return NULL;
1047 else if (r != CUDA_SUCCESS)
1048 {
1049 GOMP_PLUGIN_error ("nvptx_alloc error: %s", cuda_error (r));
1050 return NULL;
1051 }
1052
1053 /* NOTE: We only do profiling stuff if the memory allocation succeeds. */
1054 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1055 bool profiling_p
1056 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1057 if (profiling_p)
1058 goacc_profiling_acc_ev_alloc (thr, (void *) d, s);
1059
1060 return (void *) d;
1061 }
1062
1063 static void
1064 goacc_profiling_acc_ev_free (struct goacc_thread *thr, void *p)
1065 {
1066 acc_prof_info *prof_info = thr->prof_info;
1067 acc_event_info data_event_info;
1068 acc_api_info *api_info = thr->api_info;
1069
1070 prof_info->event_type = acc_ev_free;
1071
1072 data_event_info.data_event.event_type = prof_info->event_type;
1073 data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1074 data_event_info.data_event.parent_construct = acc_construct_parallel;
1075 data_event_info.data_event.implicit = 1;
1076 data_event_info.data_event.tool_info = NULL;
1077 data_event_info.data_event.var_name = NULL;
1078 data_event_info.data_event.bytes = -1;
1079 data_event_info.data_event.host_ptr = NULL;
1080 data_event_info.data_event.device_ptr = p;
1081
1082 api_info->device_api = acc_device_api_cuda;
1083
1084 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
1085 }
1086
1087 static bool
1088 nvptx_free (void *p, struct ptx_device *ptx_dev)
1089 {
1090 CUdeviceptr pb;
1091 size_t ps;
1092
1093 CUresult r = CUDA_CALL_NOCHECK (cuMemGetAddressRange, &pb, &ps,
1094 (CUdeviceptr) p);
1095 if (r == CUDA_ERROR_NOT_PERMITTED)
1096 {
1097 /* We assume that this error indicates we are in a CUDA callback
1098 context, where no CUDA calls are allowed (see the cuStreamAddCallback
1099 documentation for details).  Arrange to free this piece of device
1100 memory later.  */
1101 struct ptx_free_block *n
1102 = GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block));
1103 n->ptr = p;
1104 pthread_mutex_lock (&ptx_dev->free_blocks_lock);
1105 n->next = ptx_dev->free_blocks;
1106 ptx_dev->free_blocks = n;
1107 pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
1108 return true;
1109 }
1110 else if (r != CUDA_SUCCESS)
1111 {
1112 GOMP_PLUGIN_error ("cuMemGetAddressRange error: %s", cuda_error (r));
1113 return false;
1114 }
1115 if ((CUdeviceptr) p != pb)
1116 {
1117 GOMP_PLUGIN_error ("invalid device address");
1118 return false;
1119 }
1120
1121 CUDA_CALL (cuMemFree, (CUdeviceptr) p);
1122 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1123 bool profiling_p
1124 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1125 if (profiling_p)
1126 goacc_profiling_acc_ev_free (thr, p);
1127
1128 return true;
1129 }
1130
1131 static void *
1132 nvptx_get_current_cuda_device (void)
1133 {
1134 struct nvptx_thread *nvthd = nvptx_thread ();
1135
1136 if (!nvthd || !nvthd->ptx_dev)
1137 return NULL;
1138
1139 return &nvthd->ptx_dev->dev;
1140 }
1141
1142 static void *
1143 nvptx_get_current_cuda_context (void)
1144 {
1145 struct nvptx_thread *nvthd = nvptx_thread ();
1146
1147 if (!nvthd || !nvthd->ptx_dev)
1148 return NULL;
1149
1150 return nvthd->ptx_dev->ctx;
1151 }
1152
1153 /* Plugin entry points. */
1154
1155 const char *
1156 GOMP_OFFLOAD_get_name (void)
1157 {
1158 return "nvptx";
1159 }
1160
1161 unsigned int
1162 GOMP_OFFLOAD_get_caps (void)
1163 {
1164 return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
1165 }
1166
1167 int
1168 GOMP_OFFLOAD_get_type (void)
1169 {
1170 return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
1171 }
1172
1173 int
1174 GOMP_OFFLOAD_get_num_devices (void)
1175 {
1176 return nvptx_get_num_devices ();
1177 }
1178
1179 bool
1180 GOMP_OFFLOAD_init_device (int n)
1181 {
1182 struct ptx_device *dev;
1183
1184 pthread_mutex_lock (&ptx_dev_lock);
1185
1186 if (!nvptx_init () || ptx_devices[n] != NULL)
1187 {
1188 pthread_mutex_unlock (&ptx_dev_lock);
1189 return false;
1190 }
1191
1192 dev = nvptx_open_device (n);
1193 if (dev)
1194 {
1195 ptx_devices[n] = dev;
1196 instantiated_devices++;
1197 }
1198
1199 pthread_mutex_unlock (&ptx_dev_lock);
1200
1201 return dev != NULL;
1202 }
1203
1204 bool
1205 GOMP_OFFLOAD_fini_device (int n)
1206 {
1207 pthread_mutex_lock (&ptx_dev_lock);
1208
1209 if (ptx_devices[n] != NULL)
1210 {
1211 if (!nvptx_attach_host_thread_to_device (n)
1212 || !nvptx_close_device (ptx_devices[n]))
1213 {
1214 pthread_mutex_unlock (&ptx_dev_lock);
1215 return false;
1216 }
1217 ptx_devices[n] = NULL;
1218 instantiated_devices--;
1219 }
1220
1221 if (instantiated_devices == 0)
1222 {
1223 free (ptx_devices);
1224 ptx_devices = NULL;
1225 }
1226
1227 pthread_mutex_unlock (&ptx_dev_lock);
1228 return true;
1229 }
1230
1231 /* Return the libgomp version number we're compatible with. There is
1232 no requirement for cross-version compatibility. */
1233
1234 unsigned
1235 GOMP_OFFLOAD_version (void)
1236 {
1237 return GOMP_VERSION;
1238 }
1239
1240 /* Initialize __nvptx_clocktick, if present in MODULE. */
1241
1242 static void
1243 nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
1244 {
1245 CUdeviceptr dptr;
1246 CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
1247 module, "__nvptx_clocktick");
1248 if (r == CUDA_ERROR_NOT_FOUND)
1249 return;
1250 if (r != CUDA_SUCCESS)
1251 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1252 double __nvptx_clocktick = 1e-3 / dev->clock_khz;
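/* E.g. a clock rate of 1000000 kHz (1 GHz) gives 1e-3 / 1000000 = 1e-9
   seconds per tick.  */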
1253 r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
1254 sizeof (__nvptx_clocktick));
1255 if (r != CUDA_SUCCESS)
1256 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
1257 }
1258
1259 /* Load the (partial) program described by TARGET_DATA to device
1260 number ORD. Allocate and return TARGET_TABLE. */
1261
1262 int
1263 GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
1264 struct addr_pair **target_table)
1265 {
1266 CUmodule module;
1267 const char *const *var_names;
1268 const struct targ_fn_launch *fn_descs;
1269 unsigned int fn_entries, var_entries, other_entries, i, j;
1270 struct targ_fn_descriptor *targ_fns;
1271 struct addr_pair *targ_tbl;
1272 const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
1273 struct ptx_image_data *new_image;
1274 struct ptx_device *dev;
1275
1276 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1277 {
1278 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1279 " (expected %u, received %u)",
1280 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1281 return -1;
1282 }
1283
1284 if (!nvptx_attach_host_thread_to_device (ord)
1285 || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
1286 return -1;
1287
1288 dev = ptx_devices[ord];
1289
1290 /* The mkoffload utility emits a struct of pointers/integers at the
1291 start of each offload image.  The array of kernel names and the
1292 function addresses form a one-to-one correspondence.  */
1293
1294 var_entries = img_header->var_num;
1295 var_names = img_header->var_names;
1296 fn_entries = img_header->fn_num;
1297 fn_descs = img_header->fn_descs;
1298
1299 /* Currently, the only other entry kind is 'device number'. */
1300 other_entries = 1;
1301
1302 targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
1303 * (fn_entries + var_entries + other_entries));
1304 targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
1305 * fn_entries);
1306
1307 *target_table = targ_tbl;
1308
1309 new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
1310 new_image->target_data = target_data;
1311 new_image->module = module;
1312 new_image->fns = targ_fns;
1313
1314 pthread_mutex_lock (&dev->image_lock);
1315 new_image->next = dev->images;
1316 dev->images = new_image;
1317 pthread_mutex_unlock (&dev->image_lock);
1318
1319 for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
1320 {
1321 CUfunction function;
1322 int nregs, mthrs;
1323
1324 CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
1325 fn_descs[i].fn);
1326 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
1327 CU_FUNC_ATTRIBUTE_NUM_REGS, function);
1328 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
1329 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);
1330
1331 targ_fns->fn = function;
1332 targ_fns->launch = &fn_descs[i];
1333 targ_fns->regs_per_thread = nregs;
1334 targ_fns->max_threads_per_block = mthrs;
1335
1336 targ_tbl->start = (uintptr_t) targ_fns;
1337 targ_tbl->end = targ_tbl->start + 1;
1338 }
1339
1340 for (j = 0; j < var_entries; j++, targ_tbl++)
1341 {
1342 CUdeviceptr var;
1343 size_t bytes;
1344
1345 CUDA_CALL_ERET (-1, cuModuleGetGlobal,
1346 &var, &bytes, module, var_names[j]);
1347
1348 targ_tbl->start = (uintptr_t) var;
1349 targ_tbl->end = targ_tbl->start + bytes;
1350 }
1351
1352 CUdeviceptr device_num_varptr;
1353 size_t device_num_varsize;
1354 CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &device_num_varptr,
1355 &device_num_varsize, module,
1356 STRINGX (GOMP_DEVICE_NUM_VAR));
1357 if (r == CUDA_SUCCESS)
1358 {
1359 targ_tbl->start = (uintptr_t) device_num_varptr;
1360 targ_tbl->end = (uintptr_t) (device_num_varptr + device_num_varsize);
1361 }
1362 else
1363 /* The 'GOMP_DEVICE_NUM_VAR' variable was not in this image. */
1364 targ_tbl->start = targ_tbl->end = 0;
1365 targ_tbl++;
1366
1367 nvptx_set_clocktick (module, dev);
1368
1369 return fn_entries + var_entries + other_entries;
1370 }
1371
1372 /* Unload the program described by TARGET_DATA. DEV_DATA is the
1373 function descriptors allocated by G_O_load_image. */
1374
1375 bool
1376 GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
1377 {
1378 struct ptx_image_data *image, **prev_p;
1379 struct ptx_device *dev = ptx_devices[ord];
1380
1381 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1382 {
1383 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1384 " (expected %u, received %u)",
1385 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1386 return false;
1387 }
1388
1389 bool ret = true;
1390 pthread_mutex_lock (&dev->image_lock);
1391 for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
1392 if (image->target_data == target_data)
1393 {
1394 *prev_p = image->next;
1395 if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
1396 ret = false;
1397 free (image->fns);
1398 free (image);
1399 break;
1400 }
1401 pthread_mutex_unlock (&dev->image_lock);
1402 return ret;
1403 }
1404
1405 void *
1406 GOMP_OFFLOAD_alloc (int ord, size_t size)
1407 {
1408 if (!nvptx_attach_host_thread_to_device (ord))
1409 return NULL;
1410
1411 struct ptx_device *ptx_dev = ptx_devices[ord];
1412 struct ptx_free_block *blocks, *tmp;
1413
1414 pthread_mutex_lock (&ptx_dev->free_blocks_lock);
1415 blocks = ptx_dev->free_blocks;
1416 ptx_dev->free_blocks = NULL;
1417 pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
1418
1419 nvptx_stacks_free (ptx_dev, false);
1420
1421 while (blocks)
1422 {
1423 tmp = blocks->next;
1424 nvptx_free (blocks->ptr, ptx_dev);
1425 free (blocks);
1426 blocks = tmp;
1427 }
1428
1429 void *d = nvptx_alloc (size, true);
1430 if (d)
1431 return d;
1432 else
1433 {
1434 /* Memory allocation failed. Try freeing the stacks block, and
1435 retrying. */
1436 nvptx_stacks_free (ptx_dev, true);
1437 return nvptx_alloc (size, false);
1438 }
1439 }
1440
1441 bool
1442 GOMP_OFFLOAD_free (int ord, void *ptr)
1443 {
1444 return (nvptx_attach_host_thread_to_device (ord)
1445 && nvptx_free (ptr, ptx_devices[ord]));
1446 }
1447
1448 void
1449 GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
1450 void **hostaddrs, void **devaddrs,
1451 unsigned *dims, void *targ_mem_desc)
1452 {
1453 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
1454
1455 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1456 acc_prof_info *prof_info = thr->prof_info;
1457 acc_event_info data_event_info;
1458 acc_api_info *api_info = thr->api_info;
1459 bool profiling_p = __builtin_expect (prof_info != NULL, false);
1460
1461 void **hp = NULL;
1462 CUdeviceptr dp = 0;
1463
1464 if (mapnum > 0)
1465 {
1466 size_t s = mapnum * sizeof (void *);
1467 hp = alloca (s);
1468 for (int i = 0; i < mapnum; i++)
1469 hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
1470 CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
1471 if (profiling_p)
1472 goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
1473 }
1474
1475 /* Copy the (device) pointers to arguments to the device (dp and hp might in
1476 fact have the same value on a unified-memory system). */
1477 if (mapnum > 0)
1478 {
1479 if (profiling_p)
1480 {
1481 prof_info->event_type = acc_ev_enqueue_upload_start;
1482
1483 data_event_info.data_event.event_type = prof_info->event_type;
1484 data_event_info.data_event.valid_bytes
1485 = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1486 data_event_info.data_event.parent_construct
1487 = acc_construct_parallel;
1488 data_event_info.data_event.implicit = 1; /* Always implicit. */
1489 data_event_info.data_event.tool_info = NULL;
1490 data_event_info.data_event.var_name = NULL;
1491 data_event_info.data_event.bytes = mapnum * sizeof (void *);
1492 data_event_info.data_event.host_ptr = hp;
1493 data_event_info.data_event.device_ptr = (const void *) dp;
1494
1495 api_info->device_api = acc_device_api_cuda;
1496
1497 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1498 api_info);
1499 }
1500 CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, (void *) hp,
1501 mapnum * sizeof (void *));
1502 if (profiling_p)
1503 {
1504 prof_info->event_type = acc_ev_enqueue_upload_end;
1505 data_event_info.data_event.event_type = prof_info->event_type;
1506 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1507 api_info);
1508 }
1509 }
1510
1511 nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
1512 dp, NULL);
1513
1514 CUresult r = CUDA_CALL_NOCHECK (cuStreamSynchronize, NULL);
1515 const char *maybe_abort_msg = "(perhaps abort was called)";
1516 if (r == CUDA_ERROR_LAUNCH_FAILED)
1517 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
1518 maybe_abort_msg);
1519 else if (r != CUDA_SUCCESS)
1520 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
1521
1522 CUDA_CALL_ASSERT (cuMemFree, dp);
1523 if (profiling_p)
1524 goacc_profiling_acc_ev_free (thr, (void *) dp);
1525 }
1526
1527 static void
1528 cuda_free_argmem (void *ptr)
1529 {
1530 void **block = (void **) ptr;
1531 nvptx_free (block[0], (struct ptx_device *) block[1]);
1532 free (block);
1533 }
1534
1535 void
1536 GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *), size_t mapnum,
1537 void **hostaddrs, void **devaddrs,
1538 unsigned *dims, void *targ_mem_desc,
1539 struct goacc_asyncqueue *aq)
1540 {
1541 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
1542
1543 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1544 acc_prof_info *prof_info = thr->prof_info;
1545 acc_event_info data_event_info;
1546 acc_api_info *api_info = thr->api_info;
1547 bool profiling_p = __builtin_expect (prof_info != NULL, false);
1548
1549 void **hp = NULL;
1550 CUdeviceptr dp = 0;
1551 void **block = NULL;
1552
1553 if (mapnum > 0)
1554 {
1555 size_t s = mapnum * sizeof (void *);
1556 block = (void **) GOMP_PLUGIN_malloc (2 * sizeof (void *) + s);
1557 hp = block + 2;
1558 for (int i = 0; i < mapnum; i++)
1559 hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
1560 CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
1561 if (profiling_p)
1562 goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
1563 }
1564
1565 /* Copy the (device) pointers to arguments to the device (dp and hp might in
1566 fact have the same value on a unified-memory system). */
1567 if (mapnum > 0)
1568 {
1569 if (profiling_p)
1570 {
1571 prof_info->event_type = acc_ev_enqueue_upload_start;
1572
1573 data_event_info.data_event.event_type = prof_info->event_type;
1574 data_event_info.data_event.valid_bytes
1575 = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1576 data_event_info.data_event.parent_construct
1577 = acc_construct_parallel;
1578 data_event_info.data_event.implicit = 1; /* Always implicit. */
1579 data_event_info.data_event.tool_info = NULL;
1580 data_event_info.data_event.var_name = NULL;
1581 data_event_info.data_event.bytes = mapnum * sizeof (void *);
1582 data_event_info.data_event.host_ptr = hp;
1583 data_event_info.data_event.device_ptr = (const void *) dp;
1584
1585 api_info->device_api = acc_device_api_cuda;
1586
1587 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1588 api_info);
1589 }
1590
1591 CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, dp, (void *) hp,
1592 mapnum * sizeof (void *), aq->cuda_stream);
1593 block[0] = (void *) dp;
1594
1595 struct nvptx_thread *nvthd =
1596 (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
1597 block[1] = (void *) nvthd->ptx_dev;
1598
1599 if (profiling_p)
1600 {
1601 prof_info->event_type = acc_ev_enqueue_upload_end;
1602 data_event_info.data_event.event_type = prof_info->event_type;
1603 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1604 api_info);
1605 }
1606 }
1607
1608 nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
1609 dp, aq->cuda_stream);
1610
1611 if (mapnum > 0)
1612 GOMP_OFFLOAD_openacc_async_queue_callback (aq, cuda_free_argmem, block);
1613 }
1614
1615 void *
1616 GOMP_OFFLOAD_openacc_create_thread_data (int ord)
1617 {
1618 struct ptx_device *ptx_dev;
1619 struct nvptx_thread *nvthd
1620 = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
1621 CUcontext thd_ctx;
1622
1623 ptx_dev = ptx_devices[ord];
1624
1625 assert (ptx_dev);
1626
1627 CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);
1628
1629 assert (ptx_dev->ctx);
1630
1631 if (!thd_ctx)
1632 CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);
1633
1634 nvthd->ptx_dev = ptx_dev;
1635
1636 return (void *) nvthd;
1637 }
1638
1639 void
1640 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
1641 {
1642 free (data);
1643 }
1644
1645 void *
1646 GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
1647 {
1648 return nvptx_get_current_cuda_device ();
1649 }
1650
1651 void *
1652 GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
1653 {
1654 return nvptx_get_current_cuda_context ();
1655 }
1656
1657 /* This returns a CUstream. */
1658 void *
1659 GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *aq)
1660 {
1661 return (void *) aq->cuda_stream;
1662 }
1663
1664 /* This takes a CUstream. */
1665 int
1666 GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream)
1667 {
1668 if (aq->cuda_stream)
1669 {
1670 CUDA_CALL_ASSERT (cuStreamSynchronize, aq->cuda_stream);
1671 CUDA_CALL_ASSERT (cuStreamDestroy, aq->cuda_stream);
1672 }
1673
1674 aq->cuda_stream = (CUstream) stream;
1675 return 1;
1676 }
1677
1678 struct goacc_asyncqueue *
1679 GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
1680 {
1681 CUstream stream = NULL;
1682 CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, CU_STREAM_DEFAULT);
1683
1684 struct goacc_asyncqueue *aq
1685 = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue));
1686 aq->cuda_stream = stream;
1687 return aq;
1688 }
1689
1690 bool
1691 GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
1692 {
1693 CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream);
1694 free (aq);
1695 return true;
1696 }
1697
1698 int
1699 GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
1700 {
1701 CUresult r = CUDA_CALL_NOCHECK (cuStreamQuery, aq->cuda_stream);
1702 if (r == CUDA_SUCCESS)
1703 return 1;
1704 if (r == CUDA_ERROR_NOT_READY)
1705 return 0;
1706
1707 GOMP_PLUGIN_error ("cuStreamQuery error: %s", cuda_error (r));
1708 return -1;
1709 }
1710
1711 bool
1712 GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
1713 {
1714 CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream);
1715 return true;
1716 }
1717
1718 bool
1719 GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
1720 struct goacc_asyncqueue *aq2)
1721 {
1722 CUevent e;
1723 CUDA_CALL_ERET (false, cuEventCreate, &e, CU_EVENT_DISABLE_TIMING);
1724 CUDA_CALL_ERET (false, cuEventRecord, e, aq1->cuda_stream);
1725 CUDA_CALL_ERET (false, cuStreamWaitEvent, aq2->cuda_stream, e, 0);
1726 return true;
1727 }
1728
1729 static void
1730 cuda_callback_wrapper (CUstream stream, CUresult res, void *ptr)
1731 {
1732 if (res != CUDA_SUCCESS)
1733 GOMP_PLUGIN_fatal ("%s error: %s", __FUNCTION__, cuda_error (res));
1734 struct nvptx_callback *cb = (struct nvptx_callback *) ptr;
1735 cb->fn (cb->ptr);
1736 free (ptr);
1737 }
1738
1739 void
1740 GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
1741 void (*callback_fn)(void *),
1742 void *userptr)
1743 {
1744 struct nvptx_callback *b = GOMP_PLUGIN_malloc (sizeof (*b));
1745 b->fn = callback_fn;
1746 b->ptr = userptr;
1747 b->aq = aq;
1748 CUDA_CALL_ASSERT (cuStreamAddCallback, aq->cuda_stream,
1749 cuda_callback_wrapper, (void *) b, 0);
1750 }
1751
1752 static bool
1753 cuda_memcpy_sanity_check (const void *h, const void *d, size_t s)
1754 {
1755 CUdeviceptr pb;
1756 size_t ps;
1757 if (!s)
1758 return true;
1759 if (!d)
1760 {
1761 GOMP_PLUGIN_error ("invalid device address");
1762 return false;
1763 }
1764 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
1765 if (!pb)
1766 {
1767 GOMP_PLUGIN_error ("invalid device address");
1768 return false;
1769 }
1770 if (!h)
1771 {
1772 GOMP_PLUGIN_error ("invalid host address");
1773 return false;
1774 }
1775 if (d == h)
1776 {
1777 GOMP_PLUGIN_error ("invalid host or device address");
1778 return false;
1779 }
1780 if ((void *)(d + s) > (void *)(pb + ps))
1781 {
1782 GOMP_PLUGIN_error ("invalid size");
1783 return false;
1784 }
1785 return true;
1786 }
1787
1788 bool
1789 GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
1790 {
1791 if (!nvptx_attach_host_thread_to_device (ord)
1792 || !cuda_memcpy_sanity_check (src, dst, n))
1793 return false;
1794 CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) dst, src, n);
1795 return true;
1796 }
1797
1798 bool
1799 GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
1800 {
1801 if (!nvptx_attach_host_thread_to_device (ord)
1802 || !cuda_memcpy_sanity_check (dst, src, n))
1803 return false;
1804 CUDA_CALL (cuMemcpyDtoH, dst, (CUdeviceptr) src, n);
1805 return true;
1806 }
1807
1808 bool
1809 GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
1810 {
1811 CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, NULL);
1812 return true;
1813 }
1814
1815 bool
1816 GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src,
1817 size_t n, struct goacc_asyncqueue *aq)
1818 {
1819 if (!nvptx_attach_host_thread_to_device (ord)
1820 || !cuda_memcpy_sanity_check (src, dst, n))
1821 return false;
1822 CUDA_CALL (cuMemcpyHtoDAsync, (CUdeviceptr) dst, src, n, aq->cuda_stream);
1823 return true;
1824 }
1825
1826 bool
1827 GOMP_OFFLOAD_openacc_async_dev2host (int ord, void *dst, const void *src,
1828 size_t n, struct goacc_asyncqueue *aq)
1829 {
1830 if (!nvptx_attach_host_thread_to_device (ord)
1831 || !cuda_memcpy_sanity_check (dst, src, n))
1832 return false;
1833 CUDA_CALL (cuMemcpyDtoHAsync, dst, (CUdeviceptr) src, n, aq->cuda_stream);
1834 return true;
1835 }
1836
1837 union goacc_property_value
1838 GOMP_OFFLOAD_openacc_get_property (int n, enum goacc_property prop)
1839 {
1840 union goacc_property_value propval = { .val = 0 };
1841
1842 pthread_mutex_lock (&ptx_dev_lock);
1843
1844 if (n >= nvptx_get_num_devices () || n < 0 || ptx_devices[n] == NULL)
1845 {
1846 pthread_mutex_unlock (&ptx_dev_lock);
1847 return propval;
1848 }
1849
1850 struct ptx_device *ptx_dev = ptx_devices[n];
1851 switch (prop)
1852 {
1853 case GOACC_PROPERTY_MEMORY:
1854 {
1855 size_t total_mem;
1856
1857 CUDA_CALL_ERET (propval, cuDeviceTotalMem, &total_mem, ptx_dev->dev);
1858 propval.val = total_mem;
1859 }
1860 break;
1861 case GOACC_PROPERTY_FREE_MEMORY:
1862 {
1863 size_t total_mem;
1864 size_t free_mem;
1865 CUdevice ctxdev;
1866
1867 CUDA_CALL_ERET (propval, cuCtxGetDevice, &ctxdev);
1868 if (ptx_dev->dev == ctxdev)
1869 CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
1870 else if (ptx_dev->ctx)
1871 {
1872 CUcontext old_ctx;
1873
1874 CUDA_CALL_ERET (propval, cuCtxPushCurrent, ptx_dev->ctx);
1875 CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
1876 CUDA_CALL_ASSERT (cuCtxPopCurrent, &old_ctx);
1877 }
1878 else
1879 {
1880 CUcontext new_ctx;
1881
1882 CUDA_CALL_ERET (propval, cuCtxCreate, &new_ctx, CU_CTX_SCHED_AUTO,
1883 ptx_dev->dev);
1884 CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
1885 CUDA_CALL_ASSERT (cuCtxDestroy, new_ctx);
1886 }
1887 propval.val = free_mem;
1888 }
1889 break;
1890 case GOACC_PROPERTY_NAME:
1891 propval.ptr = ptx_dev->name;
1892 break;
1893 case GOACC_PROPERTY_VENDOR:
1894 propval.ptr = "Nvidia";
1895 break;
1896 case GOACC_PROPERTY_DRIVER:
1897 propval.ptr = cuda_driver_version_s;
1898 break;
1899 default:
1900 break;
1901 }
1902
1903 pthread_mutex_unlock (&ptx_dev_lock);
1904 return propval;
1905 }
1906
1907 /* Adjust launch dimensions: pick good values for the number of blocks and
1908 warps, and ensure that the number of warps exceeds neither the CUDA limits
1909 nor GCC's own limits.  */
1910
1911 static void
1912 nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
1913 struct ptx_device *ptx_dev,
1914 int *teams_p, int *threads_p)
1915 {
1916 int max_warps_block = fn->max_threads_per_block / 32;
1917 /* A maximum of 32 warps per block is an implementation limit in the NVPTX
1918 backend and libgcc, which matches the documented limit of all GPUs as of 2015.  */
1919 if (max_warps_block > 32)
1920 max_warps_block = 32;
1921 if (*threads_p <= 0)
1922 *threads_p = 8;
1923 if (*threads_p > max_warps_block)
1924 *threads_p = max_warps_block;
1925
1926 int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
1927 /* This is an estimate of how many blocks the device can host simultaneously.
1928 The actual limit, which may be lower, can be queried with the "occupancy
1929 control" driver interface (available since CUDA 6.0).  */
1930 int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
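/* Sketch with assumed numbers: regs_per_thread = 32 and *threads_p = 8 give
   regs_per_block = 32 * 32 * 8 = 8192; with regs_per_sm = 65536 and 80 SMs
   this estimates max_blocks = (65536 / 8192) * 80 = 640.  */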
1931 if (*teams_p <= 0 || *teams_p > max_blocks)
1932 *teams_p = max_blocks;
1933 }
1934
1935 /* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
1936 target regions. */
1937
1938 static size_t
1939 nvptx_stacks_size ()
1940 {
1941 return 128 * 1024;
1942 }
1943
1944 /* Return contiguous storage for NUM stacks, each SIZE bytes. The lock for
1945 the storage should be held on entry, and remains held on exit. */
1946
1947 static void *
1948 nvptx_stacks_acquire (struct ptx_device *ptx_dev, size_t size, int num)
1949 {
1950 if (ptx_dev->omp_stacks.ptr && ptx_dev->omp_stacks.size >= size * num)
1951 return (void *) ptx_dev->omp_stacks.ptr;
1952
1953 /* Free the old, too-small stacks. */
1954 if (ptx_dev->omp_stacks.ptr)
1955 {
1956 CUresult r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
1957 if (r != CUDA_SUCCESS)
1958 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s\n", cuda_error (r));
1959 r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
1960 if (r != CUDA_SUCCESS)
1961 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
1962 }
1963
1964 /* Make new and bigger stacks, and remember where we put them and how big
1965 they are. */
1966 CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &ptx_dev->omp_stacks.ptr,
1967 size * num);
1968 if (r != CUDA_SUCCESS)
1969 GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
1970
1971 ptx_dev->omp_stacks.size = size * num;
1972
1973 return (void *) ptx_dev->omp_stacks.ptr;
1974 }
1975
1976 void
1977 GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
1978 {
1979 struct targ_fn_descriptor *tgt_fn_desc
1980 = (struct targ_fn_descriptor *) tgt_fn;
1981 CUfunction function = tgt_fn_desc->fn;
1982 const struct targ_fn_launch *launch = tgt_fn_desc->launch;
1983 const char *fn_name = launch->fn;
1984 CUresult r;
1985 struct ptx_device *ptx_dev = ptx_devices[ord];
1986 const char *maybe_abort_msg = "(perhaps abort was called)";
1987 int teams = 0, threads = 0;
1988
1989 if (!args)
1990 GOMP_PLUGIN_fatal ("No target arguments provided");
1991 while (*args)
1992 {
1993 intptr_t id = (intptr_t) *args++, val;
1994 if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
1995 val = (intptr_t) *args++;
1996 else
1997 val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
1998 if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
1999 continue;
2000 val = val > INT_MAX ? INT_MAX : val;
2001 id &= GOMP_TARGET_ARG_ID_MASK;
2002 if (id == GOMP_TARGET_ARG_NUM_TEAMS)
2003 teams = val;
2004 else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
2005 threads = val;
2006 }
2007 nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
2008
2009 size_t stack_size = nvptx_stacks_size ();
2010
2011 pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
2012 void *stacks = nvptx_stacks_acquire (ptx_dev, stack_size, teams * threads);
2013 void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
2014 size_t fn_args_size = sizeof fn_args;
2015 void *config[] = {
2016 CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
2017 CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
2018 CU_LAUNCH_PARAM_END
2019 };
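/* The kernel arguments are passed as a single packed buffer via the
   CU_LAUNCH_PARAM_BUFFER_POINTER / CU_LAUNCH_PARAM_BUFFER_SIZE 'extra'
   mechanism of cuLaunchKernel (its kernelParams argument is NULL below),
   so the device entry point receives tgt_vars, the stacks base pointer
   and the per-warp stack size.  */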
2020 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
2021 " [(teams: %u), 1, 1] [(lanes: 32), (threads: %u), 1]\n",
2022 __FUNCTION__, fn_name, teams, threads);
2023 r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
2024 32, threads, 1, 0, NULL, NULL, config);
2025 if (r != CUDA_SUCCESS)
2026 GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
2027
2028 r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
2029 if (r == CUDA_ERROR_LAUNCH_FAILED)
2030 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
2031 maybe_abort_msg);
2032 else if (r != CUDA_SUCCESS)
2033 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
2034
2035 pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
2036 }
2037
2038 /* TODO: Implement GOMP_OFFLOAD_async_run. */