1 /* Plugin for NVPTX execution.
2
3 Copyright (C) 2013-2019 Free Software Foundation, Inc.
4
5 Contributed by Mentor Embedded.
6
7 This file is part of the GNU Offloading and Multi Processing Library
8 (libgomp).
9
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
13 any later version.
14
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
18 more details.
19
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
23
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
28
29 /* Nvidia PTX-specific parts of OpenACC support. The CUDA driver
30 library appears to hold some implicit state, but the documentation
31 is not clear as to what that state might be, or how one might
32 propagate it from one thread to another. */
33
34 #define _GNU_SOURCE
35 #include "openacc.h"
36 #include "config.h"
37 #include "libgomp-plugin.h"
38 #include "oacc-plugin.h"
39 #include "gomp-constants.h"
40
41 #include <pthread.h>
42 #include <cuda.h>
43 #include <stdbool.h>
44 #include <stdint.h>
45 #include <limits.h>
46 #include <string.h>
47 #include <stdio.h>
48 #include <unistd.h>
49 #include <assert.h>
50 #include <errno.h>
51
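/* Compatibility declarations for CUDA driver API entry points and constants
that are not provided uniformly by all cuda.h versions. */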
52 #if CUDA_VERSION < 6000
53 extern CUresult cuGetErrorString (CUresult, const char **);
54 #define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
55 #endif
56
57 #if CUDA_VERSION >= 6050
58 #undef cuLinkCreate
59 #undef cuLinkAddData
60 CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
61 const char *, unsigned, CUjit_option *, void **);
62 CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
63 #else
64 typedef size_t (*CUoccupancyB2DSize)(int);
65 CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
66 const char *, unsigned, CUjit_option *, void **);
67 CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
68 CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
69 CUoccupancyB2DSize, size_t, int);
70 #endif
71
72 #define DO_PRAGMA(x) _Pragma (#x)
73
74 #if PLUGIN_NVPTX_DYNAMIC
75 # include <dlfcn.h>
76
77 struct cuda_lib_s {
78
79 # define CUDA_ONE_CALL(call) \
80 __typeof (call) *call;
81 # define CUDA_ONE_CALL_MAYBE_NULL(call) \
82 CUDA_ONE_CALL (call)
83 #include "cuda-lib.def"
84 # undef CUDA_ONE_CALL
85 # undef CUDA_ONE_CALL_MAYBE_NULL
86
87 } cuda_lib;
88
89 /* -1 if init_cuda_lib has not been called yet, false
90 if it has been and failed, true if it has been and succeeded. */
91 static signed char cuda_lib_inited = -1;
92
93 /* Dynamically load the CUDA driver library (libcuda) and initialize the
94 function pointers; return false if unsuccessful, true if successful. */
95 static bool
96 init_cuda_lib (void)
97 {
98 if (cuda_lib_inited != -1)
99 return cuda_lib_inited;
100 const char *cuda_runtime_lib = "libcuda.so.1";
101 void *h = dlopen (cuda_runtime_lib, RTLD_LAZY);
102 cuda_lib_inited = false;
103 if (h == NULL)
104 return false;
105
106 # define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false)
107 # define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true)
108 # define CUDA_ONE_CALL_1(call, allow_null) \
109 cuda_lib.call = dlsym (h, #call); \
110 if (!allow_null && cuda_lib.call == NULL) \
111 return false;
112 #include "cuda-lib.def"
113 # undef CUDA_ONE_CALL
114 # undef CUDA_ONE_CALL_1
115 # undef CUDA_ONE_CALL_MAYBE_NULL
116
117 cuda_lib_inited = true;
118 return true;
119 }
120 # define CUDA_CALL_PREFIX cuda_lib.
121 #else
122
123 # define CUDA_ONE_CALL(call)
124 # define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call)
125 #include "cuda-lib.def"
126 #undef CUDA_ONE_CALL_MAYBE_NULL
127 #undef CUDA_ONE_CALL
128
129 # define CUDA_CALL_PREFIX
130 # define init_cuda_lib() true
131 #endif
132
133 #include "secure_getenv.h"
134
135 #undef MIN
136 #undef MAX
137 #define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
138 #define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
139
140 /* Convenience macros for the frequently used sequence of a CUDA library
141 call followed by error handling, as well as for CUDA library calls whose
142 callers do the error checking themselves or don't need it at all. */
143
144 #define CUDA_CALL_ERET(ERET, FN, ...) \
145 do { \
146 unsigned __r \
147 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
148 if (__r != CUDA_SUCCESS) \
149 { \
150 GOMP_PLUGIN_error (#FN " error: %s", \
151 cuda_error (__r)); \
152 return ERET; \
153 } \
154 } while (0)
155
156 #define CUDA_CALL(FN, ...) \
157 CUDA_CALL_ERET (false, FN, __VA_ARGS__)
158
159 #define CUDA_CALL_ASSERT(FN, ...) \
160 do { \
161 unsigned __r \
162 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
163 if (__r != CUDA_SUCCESS) \
164 { \
165 GOMP_PLUGIN_fatal (#FN " error: %s", \
166 cuda_error (__r)); \
167 } \
168 } while (0)
169
170 #define CUDA_CALL_NOCHECK(FN, ...) \
171 CUDA_CALL_PREFIX FN (__VA_ARGS__)
172
173 #define CUDA_CALL_EXISTS(FN) \
174 CUDA_CALL_PREFIX FN
175
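/* Return a descriptive string for CUDA error R, or a generic fallback if
cuGetErrorString is unavailable or itself fails. */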
176 static const char *
177 cuda_error (CUresult r)
178 {
179 const char *fallback = "unknown cuda error";
180 const char *desc;
181
182 if (!CUDA_CALL_EXISTS (cuGetErrorString))
183 return fallback;
184
185 r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
186 if (r == CUDA_SUCCESS)
187 return desc;
188
189 return fallback;
190 }
191
192 static unsigned int instantiated_devices = 0;
193 static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
194
195 /* NVPTX/CUDA specific definition of asynchronous queues. */
196 struct goacc_asyncqueue
197 {
198 CUstream cuda_stream;
199 };
200
201 struct nvptx_callback
202 {
203 void (*fn) (void *);
204 void *ptr;
205 struct goacc_asyncqueue *aq;
206 struct nvptx_callback *next;
207 };
208
209 /* Thread-specific data for PTX. */
210
211 struct nvptx_thread
212 {
213 /* We currently have this embedded inside the plugin because libgomp manages
214 devices through integer target_ids. This might be better if using an
215 opaque target-specific pointer directly from gomp_device_descr. */
216 struct ptx_device *ptx_dev;
217 };
218
219 /* Target data function launch information. */
220
221 struct targ_fn_launch
222 {
223 const char *fn;
224 unsigned short dim[GOMP_DIM_MAX];
225 };
226
227 /* Target PTX object information. */
228
229 struct targ_ptx_obj
230 {
231 const char *code;
232 size_t size;
233 };
234
235 /* Target data image information. */
236
237 typedef struct nvptx_tdata
238 {
239 const struct targ_ptx_obj *ptx_objs;
240 unsigned ptx_num;
241
242 const char *const *var_names;
243 unsigned var_num;
244
245 const struct targ_fn_launch *fn_descs;
246 unsigned fn_num;
247 } nvptx_tdata_t;
248
249 /* Descriptor of a loaded function. */
250
251 struct targ_fn_descriptor
252 {
253 CUfunction fn;
254 const struct targ_fn_launch *launch;
255 int regs_per_thread;
256 int max_threads_per_block;
257 };
258
259 /* A loaded PTX image. */
260 struct ptx_image_data
261 {
262 const void *target_data;
263 CUmodule module;
264
265 struct targ_fn_descriptor *fns; /* Array of functions. */
266
267 struct ptx_image_data *next;
268 };
269
270 struct ptx_free_block
271 {
272 void *ptr;
273 struct ptx_free_block *next;
274 };
275
276 struct ptx_device
277 {
278 CUcontext ctx;
279 bool ctx_shared;
280 CUdevice dev;
281
282 int ord;
283 bool overlap;
284 bool map;
285 bool concur;
286 bool mkern;
287 int mode;
288 int clock_khz;
289 int num_sms;
290 int regs_per_block;
291 int regs_per_sm;
292 int warp_size;
293 int max_threads_per_block;
294 int max_threads_per_multiprocessor;
295 int default_dims[GOMP_DIM_MAX];
296
297 struct ptx_image_data *images; /* Images loaded on device. */
298 pthread_mutex_t image_lock; /* Lock for above list. */
299
300 struct ptx_free_block *free_blocks;
301 pthread_mutex_t free_blocks_lock;
302
303 struct ptx_device *next;
304 };
305
306 static struct ptx_device **ptx_devices;
307
308 static inline struct nvptx_thread *
309 nvptx_thread (void)
310 {
311 return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
312 }
313
314 /* Initialize CUDA and the PTX device array. Return TRUE on success, else
315 FALSE. PTX_DEV_LOCK should be locked on entry and remains locked on exit. */
316
317 static bool
318 nvptx_init (void)
319 {
320 int ndevs;
321
322 if (instantiated_devices != 0)
323 return true;
324
325 if (!init_cuda_lib ())
326 return false;
327
328 CUDA_CALL (cuInit, 0);
329
330 CUDA_CALL (cuDeviceGetCount, &ndevs);
331 ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
332 * ndevs);
333 return true;
334 }
335
336 /* Select the N'th PTX device for the current host thread. The device must
337 have been opened before calling this function. */
338
339 static bool
340 nvptx_attach_host_thread_to_device (int n)
341 {
342 CUdevice dev;
343 CUresult r;
344 struct ptx_device *ptx_dev;
345 CUcontext thd_ctx;
346
347 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
348 if (r == CUDA_ERROR_NOT_PERMITTED)
349 {
350 /* Assume we're in a CUDA callback, just return true. */
351 return true;
352 }
353 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
354 {
355 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
356 return false;
357 }
358
359 if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
360 return true;
361 else
362 {
363 CUcontext old_ctx;
364
365 ptx_dev = ptx_devices[n];
366 if (!ptx_dev)
367 {
368 GOMP_PLUGIN_error ("device %d not found", n);
369 return false;
370 }
371
372 CUDA_CALL (cuCtxGetCurrent, &thd_ctx);
373
374 /* We don't necessarily have a current context (e.g. if it has been
375 destroyed). Pop it if we do, though. */
376 if (thd_ctx != NULL)
377 CUDA_CALL (cuCtxPopCurrent, &old_ctx);
378
379 CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
380 }
381 return true;
382 }
383
384 static struct ptx_device *
385 nvptx_open_device (int n)
386 {
387 struct ptx_device *ptx_dev;
388 CUdevice dev, ctx_dev;
389 CUresult r;
390 int async_engines, pi;
391
392 CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);
393
394 ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));
395
396 ptx_dev->ord = n;
397 ptx_dev->dev = dev;
398 ptx_dev->ctx_shared = false;
399
400 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
401 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
402 {
403 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
404 return NULL;
405 }
406
407 if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
408 {
409 /* The current host thread has an active context for a different device.
410 Detach it. */
411 CUcontext old_ctx;
412 CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
413 }
414
415 CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);
416
417 if (!ptx_dev->ctx)
418 CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
419 else
420 ptx_dev->ctx_shared = true;
421
422 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
423 &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
424 ptx_dev->overlap = pi;
425
426 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
427 &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
428 ptx_dev->map = pi;
429
430 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
431 &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
432 ptx_dev->concur = pi;
433
434 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
435 &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
436 ptx_dev->mode = pi;
437
438 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
439 &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
440 ptx_dev->mkern = pi;
441
442 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
443 &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
444 ptx_dev->clock_khz = pi;
445
446 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
447 &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
448 ptx_dev->num_sms = pi;
449
450 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
451 &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
452 ptx_dev->regs_per_block = pi;
453
454 /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only
455 in CUDA 6.0 and newer. */
456 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
457 CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
458 dev);
459 /* Fallback: use the limit of registers per block, which is usually equal. */
460 if (r == CUDA_ERROR_INVALID_VALUE)
461 pi = ptx_dev->regs_per_block;
462 else if (r != CUDA_SUCCESS)
463 {
464 GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
465 return NULL;
466 }
467 ptx_dev->regs_per_sm = pi;
468
469 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
470 &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
471 if (pi != 32)
472 {
473 GOMP_PLUGIN_error ("Only warp size 32 is supported");
474 return NULL;
475 }
476 ptx_dev->warp_size = pi;
477
478 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
479 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
480 ptx_dev->max_threads_per_block = pi;
481
482 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
483 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
484 ptx_dev->max_threads_per_multiprocessor = pi;
485
486 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines,
487 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
488 if (r != CUDA_SUCCESS)
489 async_engines = 1;
490
491 for (int i = 0; i != GOMP_DIM_MAX; i++)
492 ptx_dev->default_dims[i] = 0;
493
494 ptx_dev->images = NULL;
495 pthread_mutex_init (&ptx_dev->image_lock, NULL);
496
497 ptx_dev->free_blocks = NULL;
498 pthread_mutex_init (&ptx_dev->free_blocks_lock, NULL);
499
500 return ptx_dev;
501 }
502
503 static bool
504 nvptx_close_device (struct ptx_device *ptx_dev)
505 {
506 if (!ptx_dev)
507 return true;
508
509 for (struct ptx_free_block *b = ptx_dev->free_blocks; b;)
510 {
511 struct ptx_free_block *b_next = b->next;
512 CUDA_CALL (cuMemFree, (CUdeviceptr) b->ptr);
513 free (b);
514 b = b_next;
515 }
516
517 pthread_mutex_destroy (&ptx_dev->free_blocks_lock);
518 pthread_mutex_destroy (&ptx_dev->image_lock);
519
520 if (!ptx_dev->ctx_shared)
521 CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
522
523 free (ptx_dev);
524 return true;
525 }
526
527 static int
528 nvptx_get_num_devices (void)
529 {
530 int n;
531
532 /* PR libgomp/65099: Currently, we only support offloading in 64-bit
533 configurations. */
534 if (sizeof (void *) != 8)
535 {
536 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading;"
537 " only 64-bit configurations are supported\n");
538 return 0;
539 }
540
541 /* This function will be called before the plugin has been initialized in
542 order to enumerate available devices, but CUDA API routines can't be used
543 until cuInit has been called. Just call it now (but don't yet do any
544 further initialization). */
545 if (instantiated_devices == 0)
546 {
547 if (!init_cuda_lib ())
548 return 0;
549 CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
550 /* This is not an error: e.g. we may have CUDA libraries installed but
551 no devices available. */
552 if (r != CUDA_SUCCESS)
553 {
554 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
555 cuda_error (r));
556 return 0;
557 }
558 }
559
560 CUDA_CALL_ERET (-1, cuDeviceGetCount, &n);
561 return n;
562 }
563
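/* Report the value of environment variable VAR_NAME (ENV_VAR, which may be
NULL) through the libgomp debug output. */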
564 static void
565 notify_var (const char *var_name, const char *env_var)
566 {
567 if (env_var == NULL)
568 GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name);
569 else
570 GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var);
571 }
572
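/* Parse the GOMP_NVPTX_JIT environment variable; the only recognized setting
is '-O<n>' with n in 0..4, which is stored in *GOMP_NVPTX_O. */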
573 static void
574 process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o)
575 {
576 const char *var_name = "GOMP_NVPTX_JIT";
577 const char *env_var = secure_getenv (var_name);
578 notify_var (var_name, env_var);
579
580 if (env_var == NULL)
581 return;
582
583 const char *c = env_var;
584 while (*c != '\0')
585 {
586 while (*c == ' ')
587 c++;
588
589 if (c[0] == '-' && c[1] == 'O'
590 && '0' <= c[2] && c[2] <= '4'
591 && (c[3] == '\0' || c[3] == ' '))
592 {
593 *gomp_nvptx_o = c[2] - '0';
594 c += 3;
595 continue;
596 }
597
598 GOMP_PLUGIN_error ("Error parsing %s", var_name);
599 break;
600 }
601 }
602
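/* JIT-link the NUM_OBJS PTX objects in PTX_OBJS and load the result as a CUDA
module returned in *MODULE. Return true on success, false on failure. */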
603 static bool
604 link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
605 unsigned num_objs)
606 {
607 CUjit_option opts[7];
608 void *optvals[7];
609 float elapsed = 0.0;
610 char elog[1024];
611 char ilog[16384];
612 CUlinkState linkstate;
613 CUresult r;
614 void *linkout;
615 size_t linkoutsize __attribute__ ((unused));
616
617 opts[0] = CU_JIT_WALL_TIME;
618 optvals[0] = &elapsed;
619
620 opts[1] = CU_JIT_INFO_LOG_BUFFER;
621 optvals[1] = &ilog[0];
622
623 opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
624 optvals[2] = (void *) sizeof ilog;
625
626 opts[3] = CU_JIT_ERROR_LOG_BUFFER;
627 optvals[3] = &elog[0];
628
629 opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
630 optvals[4] = (void *) sizeof elog;
631
632 opts[5] = CU_JIT_LOG_VERBOSE;
633 optvals[5] = (void *) 1;
634
635 static intptr_t gomp_nvptx_o = -1;
636
637 static bool init_done = false;
638 if (!init_done)
639 {
640 process_GOMP_NVPTX_JIT (&gomp_nvptx_o);
641 init_done = true;
642 }
643
644 int nopts = 6;
645 if (gomp_nvptx_o != -1)
646 {
647 opts[nopts] = CU_JIT_OPTIMIZATION_LEVEL;
648 optvals[nopts] = (void *) gomp_nvptx_o;
649 nopts++;
650 }
651
652 if (CUDA_CALL_EXISTS (cuLinkCreate_v2))
653 CUDA_CALL (cuLinkCreate_v2, nopts, opts, optvals, &linkstate);
654 else
655 CUDA_CALL (cuLinkCreate, nopts, opts, optvals, &linkstate);
656
657 for (; num_objs--; ptx_objs++)
658 {
659 /* cuLinkAddData's 'data' argument erroneously omits the const
660 qualifier. */
661 GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
662 if (CUDA_CALL_EXISTS (cuLinkAddData_v2))
663 r = CUDA_CALL_NOCHECK (cuLinkAddData_v2, linkstate, CU_JIT_INPUT_PTX,
664 (char *) ptx_objs->code, ptx_objs->size,
665 0, 0, 0, 0);
666 else
667 r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
668 (char *) ptx_objs->code, ptx_objs->size,
669 0, 0, 0, 0);
670 if (r != CUDA_SUCCESS)
671 {
672 GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
673 GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
674 cuda_error (r));
675 return false;
676 }
677 }
678
679 GOMP_PLUGIN_debug (0, "Linking\n");
680 r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);
681
682 GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
683 GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
684
685 if (r != CUDA_SUCCESS)
686 {
687 GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
688 return false;
689 }
690
691 CUDA_CALL (cuModuleLoadData, module, linkout);
692 CUDA_CALL (cuLinkDestroy, linkstate);
693 return true;
694 }
695
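/* Launch the offloaded OpenACC region FN on STREAM with launch dimensions
DIMS; DP is the device copy of the argument pointers. Dimensions left zero
are filled in from the device defaults or, if available, the CUDA occupancy
calculator. */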
696 static void
697 nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
698 unsigned *dims, void *targ_mem_desc,
699 CUdeviceptr dp, CUstream stream)
700 {
701 struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
702 CUfunction function;
703 int i;
704 void *kargs[1];
705 struct nvptx_thread *nvthd = nvptx_thread ();
706 int warp_size = nvthd->ptx_dev->warp_size;
707
708 function = targ_fn->fn;
709
710 /* Initialize the launch dimensions. Typically this is constant,
711 provided by the device compiler, but we must permit runtime
712 values. */
713 int seen_zero = 0;
714 for (i = 0; i != GOMP_DIM_MAX; i++)
715 {
716 if (targ_fn->launch->dim[i])
717 dims[i] = targ_fn->launch->dim[i];
718 if (!dims[i])
719 seen_zero = 1;
720 }
721
722 if (seen_zero)
723 {
724 pthread_mutex_lock (&ptx_dev_lock);
725
726 static int gomp_openacc_dims[GOMP_DIM_MAX];
727 if (!gomp_openacc_dims[0])
728 {
729 /* See if the user provided GOMP_OPENACC_DIM environment
730 variable to specify runtime defaults. */
731 for (int i = 0; i < GOMP_DIM_MAX; ++i)
732 gomp_openacc_dims[i] = GOMP_PLUGIN_acc_default_dim (i);
733 }
734
735 if (!nvthd->ptx_dev->default_dims[0])
736 {
737 int default_dims[GOMP_DIM_MAX];
738 for (int i = 0; i < GOMP_DIM_MAX; ++i)
739 default_dims[i] = gomp_openacc_dims[i];
740
741 int gang, worker, vector;
742 {
743 int block_size = nvthd->ptx_dev->max_threads_per_block;
744 int cpu_size = nvthd->ptx_dev->max_threads_per_multiprocessor;
745 int dev_size = nvthd->ptx_dev->num_sms;
746 GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
747 " dev_size=%d, cpu_size=%d\n",
748 warp_size, block_size, dev_size, cpu_size);
749
750 gang = (cpu_size / block_size) * dev_size;
751 worker = block_size / warp_size;
752 vector = warp_size;
753 }
754
755 /* There is no upper bound on the gang size. The best size
756 matches the hardware configuration. Logical gangs are
757 scheduled onto physical hardware. To maximize usage, we
758 should guess a large number. */
759 if (default_dims[GOMP_DIM_GANG] < 1)
760 default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
761 /* The worker size must not exceed the hardware. */
762 if (default_dims[GOMP_DIM_WORKER] < 1
763 || (default_dims[GOMP_DIM_WORKER] > worker && gang))
764 default_dims[GOMP_DIM_WORKER] = worker;
765 /* The vector size must exactly match the hardware. */
766 if (default_dims[GOMP_DIM_VECTOR] < 1
767 || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
768 default_dims[GOMP_DIM_VECTOR] = vector;
769
770 GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
771 default_dims[GOMP_DIM_GANG],
772 default_dims[GOMP_DIM_WORKER],
773 default_dims[GOMP_DIM_VECTOR]);
774
775 for (i = 0; i != GOMP_DIM_MAX; i++)
776 nvthd->ptx_dev->default_dims[i] = default_dims[i];
777 }
778 pthread_mutex_unlock (&ptx_dev_lock);
779
780 {
781 bool default_dim_p[GOMP_DIM_MAX];
782 for (i = 0; i != GOMP_DIM_MAX; i++)
783 default_dim_p[i] = !dims[i];
784
785 if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize))
786 {
787 for (i = 0; i != GOMP_DIM_MAX; i++)
788 if (default_dim_p[i])
789 dims[i] = nvthd->ptx_dev->default_dims[i];
790
791 if (default_dim_p[GOMP_DIM_VECTOR])
792 dims[GOMP_DIM_VECTOR]
793 = MIN (dims[GOMP_DIM_VECTOR],
794 (targ_fn->max_threads_per_block / warp_size
795 * warp_size));
796
797 if (default_dim_p[GOMP_DIM_WORKER])
798 dims[GOMP_DIM_WORKER]
799 = MIN (dims[GOMP_DIM_WORKER],
800 targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
801 }
802 else
803 {
804 /* Handle the case that the compiler allows the runtime to choose
805 the vector-length conservatively, by ignoring
806 gomp_openacc_dims[GOMP_DIM_VECTOR]. TODO: actually handle
807 it. */
808 int vectors = 0;
809 /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that
810 gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
811 exceed targ_fn->max_threads_per_block. */
812 int workers = gomp_openacc_dims[GOMP_DIM_WORKER];
813 int gangs = gomp_openacc_dims[GOMP_DIM_GANG];
814 int grids, blocks;
815
816 CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
817 &blocks, function, NULL, 0,
818 dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
819 GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
820 "grid = %d, block = %d\n", grids, blocks);
821
822 /* Keep num_gangs proportional to the block size. In the
823 case where the block size is limited by shared-memory or
824 register file capacity, the runtime will then not
825 excessively over-assign gangs to the multiprocessor
826 units, whose state would otherwise be swapped out even
827 more than necessary. The constant factor 2 is there to
828 prevent threads from idling when there is insufficient
829 work for them. */
830 if (gangs == 0)
831 gangs = 2 * grids * (blocks / warp_size);
832
833 if (vectors == 0)
834 vectors = warp_size;
835
836 if (workers == 0)
837 {
838 int actual_vectors = (default_dim_p[GOMP_DIM_VECTOR]
839 ? vectors
840 : dims[GOMP_DIM_VECTOR]);
841 workers = blocks / actual_vectors;
842 workers = MAX (workers, 1);
843 /* If we need a per-worker barrier ... . */
844 if (actual_vectors > 32)
845 /* Don't use more barriers than available. */
846 workers = MIN (workers, 15);
847 }
848
849 for (i = 0; i != GOMP_DIM_MAX; i++)
850 if (default_dim_p[i])
851 switch (i)
852 {
853 case GOMP_DIM_GANG: dims[i] = gangs; break;
854 case GOMP_DIM_WORKER: dims[i] = workers; break;
855 case GOMP_DIM_VECTOR: dims[i] = vectors; break;
856 default: GOMP_PLUGIN_fatal ("invalid dim");
857 }
858 }
859 }
860 }
861
862 /* Check if the accelerator has sufficient hardware resources to
863 launch the offloaded kernel. */
864 if (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]
865 > targ_fn->max_threads_per_block)
866 {
867 const char *msg
868 = ("The Nvidia accelerator has insufficient resources to launch '%s'"
869 " with num_workers = %d and vector_length = %d"
870 "; "
871 "recompile the program with 'num_workers = x and vector_length = y'"
872 " on that offloaded region or '-fopenacc-dim=:x:y' where"
873 " x * y <= %d"
874 ".\n");
875 GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
876 dims[GOMP_DIM_VECTOR], targ_fn->max_threads_per_block);
877 }
878
879 /* Check if the accelerator has sufficient barrier resources to
880 launch the offloaded kernel. */
881 if (dims[GOMP_DIM_WORKER] > 15 && dims[GOMP_DIM_VECTOR] > 32)
882 {
883 const char *msg
884 = ("The Nvidia accelerator has insufficient barrier resources to launch"
885 " '%s' with num_workers = %d and vector_length = %d"
886 "; "
887 "recompile the program with 'num_workers = x' on that offloaded"
888 " region or '-fopenacc-dim=:x:' where x <= 15"
889 "; "
890 "or, recompile the program with 'vector_length = 32' on that"
891 " offloaded region or '-fopenacc-dim=::32'"
892 ".\n");
893 GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
894 dims[GOMP_DIM_VECTOR]);
895 }
896
897 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
898 " gangs=%u, workers=%u, vectors=%u\n",
899 __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
900 dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);
901
902 // OpenACC        CUDA
903 //
904 // num_gangs      nctaid.x
905 // num_workers    ntid.y
906 // vector length  ntid.x
907 kargs[0] = &dp;
908 CUDA_CALL_ASSERT (cuLaunchKernel, function,
909 dims[GOMP_DIM_GANG], 1, 1,
910 dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
911 0, stream, kargs, 0);
912
913 GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__,
914 targ_fn->launch->fn);
915 }
916
917 void * openacc_get_current_cuda_context (void);
918
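/* Allocate S bytes of device memory; return its address, or NULL on
failure. */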
919 static void *
920 nvptx_alloc (size_t s)
921 {
922 CUdeviceptr d;
923
924 CUDA_CALL_ERET (NULL, cuMemAlloc, &d, s);
925 return (void *) d;
926 }
927
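/* Free the device memory block P. When there is no OpenACC thread (i.e. we
are in a CUDA callback context), the block is instead queued on PTX_DEV's
free list and released later from GOMP_OFFLOAD_alloc. */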
928 static bool
929 nvptx_free (void *p, struct ptx_device *ptx_dev)
930 {
931 /* Assume callback context if this is null. */
932 if (GOMP_PLUGIN_acc_thread () == NULL)
933 {
934 struct ptx_free_block *n
935 = GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block));
936 n->ptr = p;
937 pthread_mutex_lock (&ptx_dev->free_blocks_lock);
938 n->next = ptx_dev->free_blocks;
939 ptx_dev->free_blocks = n;
940 pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
941 return true;
942 }
943
944 CUdeviceptr pb;
945 size_t ps;
946
947 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) p);
948 if ((CUdeviceptr) p != pb)
949 {
950 GOMP_PLUGIN_error ("invalid device address");
951 return false;
952 }
953
954 CUDA_CALL (cuMemFree, (CUdeviceptr) p);
955 return true;
956 }
957
958 static void *
959 nvptx_get_current_cuda_device (void)
960 {
961 struct nvptx_thread *nvthd = nvptx_thread ();
962
963 if (!nvthd || !nvthd->ptx_dev)
964 return NULL;
965
966 return &nvthd->ptx_dev->dev;
967 }
968
969 static void *
970 nvptx_get_current_cuda_context (void)
971 {
972 struct nvptx_thread *nvthd = nvptx_thread ();
973
974 if (!nvthd || !nvthd->ptx_dev)
975 return NULL;
976
977 return nvthd->ptx_dev->ctx;
978 }
979
980 /* Plugin entry points. */
981
982 const char *
983 GOMP_OFFLOAD_get_name (void)
984 {
985 return "nvptx";
986 }
987
988 unsigned int
989 GOMP_OFFLOAD_get_caps (void)
990 {
991 return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
992 }
993
994 int
995 GOMP_OFFLOAD_get_type (void)
996 {
997 return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
998 }
999
1000 int
1001 GOMP_OFFLOAD_get_num_devices (void)
1002 {
1003 return nvptx_get_num_devices ();
1004 }
1005
1006 bool
1007 GOMP_OFFLOAD_init_device (int n)
1008 {
1009 struct ptx_device *dev;
1010
1011 pthread_mutex_lock (&ptx_dev_lock);
1012
1013 if (!nvptx_init () || ptx_devices[n] != NULL)
1014 {
1015 pthread_mutex_unlock (&ptx_dev_lock);
1016 return false;
1017 }
1018
1019 dev = nvptx_open_device (n);
1020 if (dev)
1021 {
1022 ptx_devices[n] = dev;
1023 instantiated_devices++;
1024 }
1025
1026 pthread_mutex_unlock (&ptx_dev_lock);
1027
1028 return dev != NULL;
1029 }
1030
1031 bool
1032 GOMP_OFFLOAD_fini_device (int n)
1033 {
1034 pthread_mutex_lock (&ptx_dev_lock);
1035
1036 if (ptx_devices[n] != NULL)
1037 {
1038 if (!nvptx_attach_host_thread_to_device (n)
1039 || !nvptx_close_device (ptx_devices[n]))
1040 {
1041 pthread_mutex_unlock (&ptx_dev_lock);
1042 return false;
1043 }
1044 ptx_devices[n] = NULL;
1045 instantiated_devices--;
1046 }
1047
1048 if (instantiated_devices == 0)
1049 {
1050 free (ptx_devices);
1051 ptx_devices = NULL;
1052 }
1053
1054 pthread_mutex_unlock (&ptx_dev_lock);
1055 return true;
1056 }
1057
1058 /* Return the libgomp version number we're compatible with. There is
1059 no requirement for cross-version compatibility. */
1060
1061 unsigned
1062 GOMP_OFFLOAD_version (void)
1063 {
1064 return GOMP_VERSION;
1065 }
1066
1067 /* Initialize __nvptx_clocktick, if present in MODULE. */
1068
1069 static void
1070 nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
1071 {
1072 CUdeviceptr dptr;
1073 CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
1074 module, "__nvptx_clocktick");
1075 if (r == CUDA_ERROR_NOT_FOUND)
1076 return;
1077 if (r != CUDA_SUCCESS)
1078 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1079 double __nvptx_clocktick = 1e-3 / dev->clock_khz;
1080 r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
1081 sizeof (__nvptx_clocktick));
1082 if (r != CUDA_SUCCESS)
1083 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
1084 }
1085
1086 /* Load the (partial) program described by TARGET_DATA to device
1087 number ORD. Allocate and return TARGET_TABLE. */
1088
1089 int
1090 GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
1091 struct addr_pair **target_table)
1092 {
1093 CUmodule module;
1094 const char *const *var_names;
1095 const struct targ_fn_launch *fn_descs;
1096 unsigned int fn_entries, var_entries, i, j;
1097 struct targ_fn_descriptor *targ_fns;
1098 struct addr_pair *targ_tbl;
1099 const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
1100 struct ptx_image_data *new_image;
1101 struct ptx_device *dev;
1102
1103 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1104 {
1105 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1106 " (expected %u, received %u)",
1107 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1108 return -1;
1109 }
1110
1111 if (!nvptx_attach_host_thread_to_device (ord)
1112 || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
1113 return -1;
1114
1115 dev = ptx_devices[ord];
1116
1117 /* The mkoffload utility emits a struct of pointers/integers at the
1118 start of each offload image. The array of kernel names and the
1119 array of function addresses form a one-to-one correspondence. */
1120
1121 var_entries = img_header->var_num;
1122 var_names = img_header->var_names;
1123 fn_entries = img_header->fn_num;
1124 fn_descs = img_header->fn_descs;
1125
1126 targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
1127 * (fn_entries + var_entries));
1128 targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
1129 * fn_entries);
1130
1131 *target_table = targ_tbl;
1132
1133 new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
1134 new_image->target_data = target_data;
1135 new_image->module = module;
1136 new_image->fns = targ_fns;
1137
1138 pthread_mutex_lock (&dev->image_lock);
1139 new_image->next = dev->images;
1140 dev->images = new_image;
1141 pthread_mutex_unlock (&dev->image_lock);
1142
1143 for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
1144 {
1145 CUfunction function;
1146 int nregs, mthrs;
1147
1148 CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
1149 fn_descs[i].fn);
1150 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
1151 CU_FUNC_ATTRIBUTE_NUM_REGS, function);
1152 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
1153 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);
1154
1155 targ_fns->fn = function;
1156 targ_fns->launch = &fn_descs[i];
1157 targ_fns->regs_per_thread = nregs;
1158 targ_fns->max_threads_per_block = mthrs;
1159
1160 targ_tbl->start = (uintptr_t) targ_fns;
1161 targ_tbl->end = targ_tbl->start + 1;
1162 }
1163
1164 for (j = 0; j < var_entries; j++, targ_tbl++)
1165 {
1166 CUdeviceptr var;
1167 size_t bytes;
1168
1169 CUDA_CALL_ERET (-1, cuModuleGetGlobal,
1170 &var, &bytes, module, var_names[j]);
1171
1172 targ_tbl->start = (uintptr_t) var;
1173 targ_tbl->end = targ_tbl->start + bytes;
1174 }
1175
1176 nvptx_set_clocktick (module, dev);
1177
1178 return fn_entries + var_entries;
1179 }
1180
1181 /* Unload the program described by TARGET_DATA, freeing the function
1182 descriptors allocated by GOMP_OFFLOAD_load_image. */
1183
1184 bool
1185 GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
1186 {
1187 struct ptx_image_data *image, **prev_p;
1188 struct ptx_device *dev = ptx_devices[ord];
1189
1190 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1191 {
1192 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1193 " (expected %u, received %u)",
1194 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1195 return false;
1196 }
1197
1198 bool ret = true;
1199 pthread_mutex_lock (&dev->image_lock);
1200 for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
1201 if (image->target_data == target_data)
1202 {
1203 *prev_p = image->next;
1204 if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
1205 ret = false;
1206 free (image->fns);
1207 free (image);
1208 break;
1209 }
1210 pthread_mutex_unlock (&dev->image_lock);
1211 return ret;
1212 }
1213
1214 void *
1215 GOMP_OFFLOAD_alloc (int ord, size_t size)
1216 {
1217 if (!nvptx_attach_host_thread_to_device (ord))
1218 return NULL;
1219
1220 struct ptx_device *ptx_dev = ptx_devices[ord];
1221 struct ptx_free_block *blocks, *tmp;
1222
1223 pthread_mutex_lock (&ptx_dev->free_blocks_lock);
1224 blocks = ptx_dev->free_blocks;
1225 ptx_dev->free_blocks = NULL;
1226 pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
1227
1228 while (blocks)
1229 {
1230 tmp = blocks->next;
1231 nvptx_free (blocks->ptr, ptx_dev);
1232 free (blocks);
1233 blocks = tmp;
1234 }
1235
1236 return nvptx_alloc (size);
1237 }
1238
1239 bool
1240 GOMP_OFFLOAD_free (int ord, void *ptr)
1241 {
1242 return (nvptx_attach_host_thread_to_device (ord)
1243 && nvptx_free (ptr, ptx_devices[ord]));
1244 }
1245
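/* Execute an OpenACC offload region synchronously: marshal the MAPNUM
argument pointers to the device, launch FN via nvptx_exec on the default
stream, and wait for completion. */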
1246 void
1247 GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
1248 void **hostaddrs, void **devaddrs,
1249 unsigned *dims, void *targ_mem_desc)
1250 {
1251 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
1252
1253 void **hp = NULL;
1254 CUdeviceptr dp = 0;
1255
1256 if (mapnum > 0)
1257 {
1258 hp = alloca (mapnum * sizeof (void *));
1259 for (int i = 0; i < mapnum; i++)
1260 hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
1261 CUDA_CALL_ASSERT (cuMemAlloc, &dp, mapnum * sizeof (void *));
1262 }
1263
1264 /* Copy the (device) pointers to arguments to the device (dp and hp might in
1265 fact have the same value on a unified-memory system). */
1266 if (mapnum > 0)
1267 CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, (void *) hp,
1268 mapnum * sizeof (void *));
1269
1270 nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
1271 dp, NULL);
1272
1273 CUresult r = CUDA_CALL_NOCHECK (cuStreamSynchronize, NULL);
1274 const char *maybe_abort_msg = "(perhaps abort was called)";
1275 if (r == CUDA_ERROR_LAUNCH_FAILED)
1276 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
1277 maybe_abort_msg);
1278 else if (r != CUDA_SUCCESS)
1279 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
1280 CUDA_CALL_ASSERT (cuMemFree, dp);
1281 }
1282
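/* Stream callback used by GOMP_OFFLOAD_openacc_async_exec to release the
argument block once the asynchronous launch no longer needs it. */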
1283 static void
1284 cuda_free_argmem (void *ptr)
1285 {
1286 void **block = (void **) ptr;
1287 nvptx_free (block[0], (struct ptx_device *) block[1]);
1288 free (block);
1289 }
1290
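/* Asynchronous variant of GOMP_OFFLOAD_openacc_exec: the argument pointers
are copied and the kernel launched on AQ's CUDA stream, with cleanup of the
argument block deferred to a stream callback. */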
1291 void
1292 GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *), size_t mapnum,
1293 void **hostaddrs, void **devaddrs,
1294 unsigned *dims, void *targ_mem_desc,
1295 struct goacc_asyncqueue *aq)
1296 {
1297 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
1298
1299 void **hp = NULL;
1300 CUdeviceptr dp = 0;
1301 void **block = NULL;
1302
1303 if (mapnum > 0)
1304 {
1305 block = (void **) GOMP_PLUGIN_malloc ((mapnum + 2) * sizeof (void *));
1306 hp = block + 2;
1307 for (int i = 0; i < mapnum; i++)
1308 hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
1309 CUDA_CALL_ASSERT (cuMemAlloc, &dp, mapnum * sizeof (void *));
1310 }
1311
1312 /* Copy the (device) pointers to arguments to the device (dp and hp might in
1313 fact have the same value on a unified-memory system). */
1314 if (mapnum > 0)
1315 {
1316 CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, dp, (void *) hp,
1317 mapnum * sizeof (void *), aq->cuda_stream);
1318 block[0] = (void *) dp;
1319
1320 struct nvptx_thread *nvthd =
1321 (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
1322 block[1] = (void *) nvthd->ptx_dev;
1323 }
1324 nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
1325 dp, aq->cuda_stream);
1326
1327 if (mapnum > 0)
1328 GOMP_OFFLOAD_openacc_async_queue_callback (aq, cuda_free_argmem, block);
1329 }
1330
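/* Create the per-thread OpenACC data for device ORD, pushing the device's
CUDA context if the host thread does not have one yet. */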
1331 void *
1332 GOMP_OFFLOAD_openacc_create_thread_data (int ord)
1333 {
1334 struct ptx_device *ptx_dev;
1335 struct nvptx_thread *nvthd
1336 = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
1337 CUcontext thd_ctx;
1338
1339 ptx_dev = ptx_devices[ord];
1340
1341 assert (ptx_dev);
1342
1343 CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);
1344
1345 assert (ptx_dev->ctx);
1346
1347 if (!thd_ctx)
1348 CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);
1349
1350 nvthd->ptx_dev = ptx_dev;
1351
1352 return (void *) nvthd;
1353 }
1354
1355 void
1356 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
1357 {
1358 free (data);
1359 }
1360
1361 void *
1362 GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
1363 {
1364 return nvptx_get_current_cuda_device ();
1365 }
1366
1367 void *
1368 GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
1369 {
1370 return nvptx_get_current_cuda_context ();
1371 }
1372
1373 /* This returns a CUstream. */
1374 void *
1375 GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *aq)
1376 {
1377 return (void *) aq->cuda_stream;
1378 }
1379
1380 /* This takes a CUstream. */
1381 int
1382 GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream)
1383 {
1384 if (aq->cuda_stream)
1385 {
1386 CUDA_CALL_ASSERT (cuStreamSynchronize, aq->cuda_stream);
1387 CUDA_CALL_ASSERT (cuStreamDestroy, aq->cuda_stream);
1388 }
1389
1390 aq->cuda_stream = (CUstream) stream;
1391 return 1;
1392 }
1393
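/* Construct a new asynchronous queue backed by its own CUDA stream. */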
1394 struct goacc_asyncqueue *
1395 GOMP_OFFLOAD_openacc_async_construct (void)
1396 {
1397 CUstream stream = NULL;
1398 CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, CU_STREAM_DEFAULT);
1399
1400 struct goacc_asyncqueue *aq
1401 = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue));
1402 aq->cuda_stream = stream;
1403 return aq;
1404 }
1405
1406 bool
1407 GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
1408 {
1409 CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream);
1410 free (aq);
1411 return true;
1412 }
1413
1414 int
1415 GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
1416 {
1417 CUresult r = CUDA_CALL_NOCHECK (cuStreamQuery, aq->cuda_stream);
1418 if (r == CUDA_SUCCESS)
1419 return 1;
1420 if (r == CUDA_ERROR_NOT_READY)
1421 return 0;
1422
1423 GOMP_PLUGIN_error ("cuStreamQuery error: %s", cuda_error (r));
1424 return -1;
1425 }
1426
1427 bool
1428 GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
1429 {
1430 CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream);
1431 return true;
1432 }
1433
1434 bool
1435 GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
1436 struct goacc_asyncqueue *aq2)
1437 {
1438 CUevent e;
1439 CUDA_CALL_ERET (false, cuEventCreate, &e, CU_EVENT_DISABLE_TIMING);
1440 CUDA_CALL_ERET (false, cuEventRecord, e, aq1->cuda_stream);
1441 CUDA_CALL_ERET (false, cuStreamWaitEvent, aq2->cuda_stream, e, 0);
1442 return true;
1443 }
1444
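/* Trampoline registered with cuStreamAddCallback: check the stream status,
invoke the user callback, and free its descriptor. */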
1445 static void
1446 cuda_callback_wrapper (CUstream stream, CUresult res, void *ptr)
1447 {
1448 if (res != CUDA_SUCCESS)
1449 GOMP_PLUGIN_fatal ("%s error: %s", __FUNCTION__, cuda_error (res));
1450 struct nvptx_callback *cb = (struct nvptx_callback *) ptr;
1451 cb->fn (cb->ptr);
1452 free (ptr);
1453 }
1454
1455 void
1456 GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
1457 void (*callback_fn)(void *),
1458 void *userptr)
1459 {
1460 struct nvptx_callback *b = GOMP_PLUGIN_malloc (sizeof (*b));
1461 b->fn = callback_fn;
1462 b->ptr = userptr;
1463 b->aq = aq;
1464 CUDA_CALL_ASSERT (cuStreamAddCallback, aq->cuda_stream,
1465 cuda_callback_wrapper, (void *) b, 0);
1466 }
1467
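/* Validate host address H, device address D, and size S before a memory
copy: both addresses must be non-NULL and distinct, and D must lie within a
device allocation large enough to hold S bytes. A zero size is always
accepted. */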
1468 static bool
1469 cuda_memcpy_sanity_check (const void *h, const void *d, size_t s)
1470 {
1471 CUdeviceptr pb;
1472 size_t ps;
1473 if (!s)
1474 return true;
1475 if (!d)
1476 {
1477 GOMP_PLUGIN_error ("invalid device address");
1478 return false;
1479 }
1480 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
1481 if (!pb)
1482 {
1483 GOMP_PLUGIN_error ("invalid device address");
1484 return false;
1485 }
1486 if (!h)
1487 {
1488 GOMP_PLUGIN_error ("invalid host address");
1489 return false;
1490 }
1491 if (d == h)
1492 {
1493 GOMP_PLUGIN_error ("invalid host or device address");
1494 return false;
1495 }
1496 if ((void *)(d + s) > (void *)(pb + ps))
1497 {
1498 GOMP_PLUGIN_error ("invalid size");
1499 return false;
1500 }
1501 return true;
1502 }
1503
1504 bool
1505 GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
1506 {
1507 if (!nvptx_attach_host_thread_to_device (ord)
1508 || !cuda_memcpy_sanity_check (src, dst, n))
1509 return false;
1510 CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) dst, src, n);
1511 return true;
1512 }
1513
1514 bool
1515 GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
1516 {
1517 if (!nvptx_attach_host_thread_to_device (ord)
1518 || !cuda_memcpy_sanity_check (dst, src, n))
1519 return false;
1520 CUDA_CALL (cuMemcpyDtoH, dst, (CUdeviceptr) src, n);
1521 return true;
1522 }
1523
1524 bool
1525 GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
1526 {
1527 CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, NULL);
1528 return true;
1529 }
1530
1531 bool
1532 GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src,
1533 size_t n, struct goacc_asyncqueue *aq)
1534 {
1535 if (!nvptx_attach_host_thread_to_device (ord)
1536 || !cuda_memcpy_sanity_check (src, dst, n))
1537 return false;
1538 CUDA_CALL (cuMemcpyHtoDAsync, (CUdeviceptr) dst, src, n, aq->cuda_stream);
1539 return true;
1540 }
1541
1542 bool
1543 GOMP_OFFLOAD_openacc_async_dev2host (int ord, void *dst, const void *src,
1544 size_t n, struct goacc_asyncqueue *aq)
1545 {
1546 if (!nvptx_attach_host_thread_to_device (ord)
1547 || !cuda_memcpy_sanity_check (dst, src, n))
1548 return false;
1549 CUDA_CALL (cuMemcpyDtoHAsync, dst, (CUdeviceptr) src, n, aq->cuda_stream);
1550 return true;
1551 }
1552
1553 /* Adjust launch dimensions: pick good values for the number of blocks and
1554 warps and ensure that the number of warps exceeds neither the CUDA limits
1555 nor GCC's own limits. */
1556
1557 static void
1558 nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
1559 struct ptx_device *ptx_dev,
1560 int *teams_p, int *threads_p)
1561 {
1562 int max_warps_block = fn->max_threads_per_block / 32;
1563 /* A maximum of 32 warps per block is an implementation limit in the NVPTX
1564 backend and libgcc, matching the documented limit of all GPUs as of 2015. */
1565 if (max_warps_block > 32)
1566 max_warps_block = 32;
1567 if (*threads_p <= 0)
1568 *threads_p = 8;
1569 if (*threads_p > max_warps_block)
1570 *threads_p = max_warps_block;
1571
1572 int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
1573 /* This is an estimate of how many blocks the device can host simultaneously.
1574 The actual limit, which may be lower, can be queried via the "occupancy
1575 control" driver interface (available since CUDA 6.0). */
1576 int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
1577 if (*teams_p <= 0 || *teams_p > max_blocks)
1578 *teams_p = max_blocks;
1579 }
1580
1581 /* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
1582 target regions. */
1583
1584 static size_t
1585 nvptx_stacks_size ()
1586 {
1587 return 128 * 1024;
1588 }
1589
1590 /* Return contiguous storage for NUM stacks, each SIZE bytes. */
1591
1592 static void *
1593 nvptx_stacks_alloc (size_t size, int num)
1594 {
1595 CUdeviceptr stacks;
1596 CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &stacks, size * num);
1597 if (r != CUDA_SUCCESS)
1598 GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
1599 return (void *) stacks;
1600 }
1601
1602 /* Release storage previously allocated by nvptx_stacks_alloc. */
1603
1604 static void
1605 nvptx_stacks_free (void *p, int num)
1606 {
1607 CUresult r = CUDA_CALL_NOCHECK (cuMemFree, (CUdeviceptr) p);
1608 if (r != CUDA_SUCCESS)
1609 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
1610 }
1611
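/* Run the OpenMP target region TGT_FN on device ORD with argument block
TGT_VARS. ARGS carries the requested num_teams and thread_limit, which are
adjusted to the device's launch bounds; a per-warp soft-stack area is
allocated for the launch and freed after the kernel has completed. */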
1612 void
1613 GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
1614 {
1615 CUfunction function = ((struct targ_fn_descriptor *) tgt_fn)->fn;
1616 CUresult r;
1617 struct ptx_device *ptx_dev = ptx_devices[ord];
1618 const char *maybe_abort_msg = "(perhaps abort was called)";
1619 int teams = 0, threads = 0;
1620
1621 if (!args)
1622 GOMP_PLUGIN_fatal ("No target arguments provided");
1623 while (*args)
1624 {
1625 intptr_t id = (intptr_t) *args++, val;
1626 if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
1627 val = (intptr_t) *args++;
1628 else
1629 val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
1630 if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
1631 continue;
1632 val = val > INT_MAX ? INT_MAX : val;
1633 id &= GOMP_TARGET_ARG_ID_MASK;
1634 if (id == GOMP_TARGET_ARG_NUM_TEAMS)
1635 teams = val;
1636 else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
1637 threads = val;
1638 }
1639 nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
1640
1641 size_t stack_size = nvptx_stacks_size ();
1642 void *stacks = nvptx_stacks_alloc (stack_size, teams * threads);
1643 void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
1644 size_t fn_args_size = sizeof fn_args;
1645 void *config[] = {
1646 CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
1647 CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
1648 CU_LAUNCH_PARAM_END
1649 };
1650 r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
1651 32, threads, 1, 0, NULL, NULL, config);
1652 if (r != CUDA_SUCCESS)
1653 GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
1654
1655 r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
1656 if (r == CUDA_ERROR_LAUNCH_FAILED)
1657 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
1658 maybe_abort_msg);
1659 else if (r != CUDA_SUCCESS)
1660 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
1661 nvptx_stacks_free (stacks, teams * threads);
1662 }
1663
1664 void
1665 GOMP_OFFLOAD_async_run (int ord, void *tgt_fn, void *tgt_vars, void **args,
1666 void *async_data)
1667 {
1668 GOMP_PLUGIN_fatal ("GOMP_OFFLOAD_async_run unimplemented");
1669 }