struct goacc_asyncqueue *);
extern bool GOMP_OFFLOAD_openacc_async_host2dev (int, void *, const void *, size_t,
struct goacc_asyncqueue *);
+extern bool GOMP_OFFLOAD_openacc_async_dev2dev (int, void *, const void *, size_t,
+ struct goacc_asyncqueue *);
extern void *GOMP_OFFLOAD_openacc_cuda_get_current_device (void);
extern void *GOMP_OFFLOAD_openacc_cuda_get_current_context (void);
extern void *GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *);
__typeof (GOMP_OFFLOAD_openacc_async_exec) *exec_func;
__typeof (GOMP_OFFLOAD_openacc_async_dev2host) *dev2host_func;
__typeof (GOMP_OFFLOAD_openacc_async_host2dev) *host2dev_func;
+ __typeof (GOMP_OFFLOAD_openacc_async_dev2dev) *dev2dev_func;
} async;
__typeof (GOMP_OFFLOAD_openacc_get_property) *get_property_func;
extern void gomp_copy_dev2host (struct gomp_device_descr *,
struct goacc_asyncqueue *, void *, const void *,
size_t);
+extern void gomp_copy_dev2dev (struct gomp_device_descr *,
+ struct goacc_asyncqueue *, void *, const void *,
+ size_t);
extern uintptr_t gomp_map_val (struct target_mem_desc *, void **, size_t);
extern bool gomp_attach_pointer (struct gomp_device_descr *,
struct goacc_asyncqueue *, splay_tree,
acc_get_property_string_h_;
} OACC_2.5.1;
+OACC_2.6.1 {
+ global:
+ acc_memcpy_device;
+ acc_memcpy_device_async;
+} OACC_2.6;
+
GOACC_2.0 {
global:
GOACC_data_end;
present on device.
* acc_memcpy_to_device:: Copy host memory to device memory.
* acc_memcpy_from_device:: Copy device memory to host memory.
+* acc_memcpy_device:: Copy memory within a device.
* acc_attach:: Let device pointer point to device-pointer target.
* acc_detach:: Let device pointer point to host-pointer target.
+@node acc_memcpy_device
+@section @code{acc_memcpy_device} -- Copy memory within a device.
+@table @asis
+@item @emph{Description}
+This function copies device memory from one memory location to another
+on the current device. It copies @var{bytes} bytes of data from the device
+address, specified by @var{data_dev_src}, to the device address
+@var{data_dev_dest}. The @code{_async} version performs the transfer
+asnychronously using the queue associated with @var{async_arg}.
+
+@item @emph{C/C++}:
+@multitable @columnfractions .20 .80
+@item @emph{Prototype}: @tab @code{void acc_memcpy_device(d_void* data_dev_dest,}
+@item @tab @code{d_void* data_dev_src, size_t bytes);}
+@item @emph{Prototype}: @tab @code{void acc_memcpy_device_async(d_void* data_dev_dest,}
+@item @tab @code{d_void* data_dev_src, size_t bytes, int async_arg);}
+@end multitable
+
+@item @emph{Fortran}:
+@multitable @columnfractions .20 .80
+@item @emph{Interface}: @tab @code{subroutine acc_memcpy_device(data_dev_dest, &}
+@item @tab @code{data_dev_src, bytes)}
+@item @emph{Interface}: @tab @code{subroutine acc_memcpy_device_async(data_dev_dest, &}
+@item @tab @code{data_dev_src, bytes, async_arg)}
+@item @tab @code{type(c_ptr), value :: data_dev_dest}
+@item @tab @code{type(c_ptr), value :: data_dev_src}
+@item @tab @code{integer(c_size_t), value :: bytes}
+@item @tab @code{integer(acc_handle_kind), value :: async_arg}
+@end multitable
+
+@item @emph{Reference}:
+@uref{https://www.openacc.org, OpenACC specification v2.6}, section
+3.2.33. @uref{https://www.openacc.org, OpenACC specification v3.3}, section
+3.2.28.
+@end table
+
+
+
@node acc_attach
@section @code{acc_attach} -- Let device pointer point to device-pointer target.
@table @asis
}
static void
-memcpy_tofrom_device (bool from, void *d, void *h, size_t s, int async,
- const char *libfnname)
+memcpy_tofrom_device (bool dev_to, bool dev_from, void *dst, void *src,
+ size_t s, int async, const char *libfnname)
{
/* No need to call lazy open here, as the device pointer must have
been obtained from a routine that did that. */
struct goacc_thread *thr = goacc_thread ();
assert (thr && thr->dev);
+ if (s == 0)
+ return;
if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
{
- if (from)
- memmove (h, d, s);
- else
- memmove (d, h, s);
+ if (src == dst)
+ return;
+ memcpy (dst, src, s);
return;
}
}
goacc_aq aq = get_goacc_asyncqueue (async);
- if (from)
- gomp_copy_dev2host (thr->dev, aq, h, d, s);
+ if (dev_to && dev_from)
+ {
+ if (dst != src)
+ gomp_copy_dev2dev (thr->dev, aq, dst, src, s);
+ }
+ else if (dev_from)
+ gomp_copy_dev2host (thr->dev, aq, dst, src, s);
else
- gomp_copy_host2dev (thr->dev, aq, d, h, s, false, /* TODO: cbuf? */ NULL);
+ gomp_copy_host2dev (thr->dev, aq, dst, src, s, false, /* TODO: cbuf? */ NULL);
if (profiling_p)
{
void
acc_memcpy_to_device (void *d, void *h, size_t s)
{
- memcpy_tofrom_device (false, d, h, s, acc_async_sync, __FUNCTION__);
+ memcpy_tofrom_device (true, false, d, h, s, acc_async_sync, __FUNCTION__);
}
void
acc_memcpy_to_device_async (void *d, void *h, size_t s, int async)
{
- memcpy_tofrom_device (false, d, h, s, async, __FUNCTION__);
+ memcpy_tofrom_device (true, false, d, h, s, async, __FUNCTION__);
}
void
acc_memcpy_from_device (void *h, void *d, size_t s)
{
- memcpy_tofrom_device (true, d, h, s, acc_async_sync, __FUNCTION__);
+ memcpy_tofrom_device (false, true, h, d, s, acc_async_sync, __FUNCTION__);
}
void
acc_memcpy_from_device_async (void *h, void *d, size_t s, int async)
{
- memcpy_tofrom_device (true, d, h, s, async, __FUNCTION__);
+ memcpy_tofrom_device (false, true, h, d, s, async, __FUNCTION__);
+}
+
+void
+acc_memcpy_device (void *dst, void *src, size_t s)
+{
+ memcpy_tofrom_device (true, true, dst, src, s, acc_async_sync, __FUNCTION__);
+}
+
+void
+acc_memcpy_device_async (void *dst, void *src, size_t s, int async)
+{
+ memcpy_tofrom_device (true, true, dst, src, s, async, __FUNCTION__);
}
/* Return the device pointer that corresponds to host data H. Or NULL
public :: acc_copyout_finalize, acc_delete_finalize
public :: acc_memcpy_to_device, acc_memcpy_to_device_async
public :: acc_memcpy_from_device, acc_memcpy_from_device_async
+ public :: acc_memcpy_device, acc_memcpy_device_async
integer, parameter :: openacc_version = 201711
end subroutine
end interface
+ interface
+ subroutine acc_memcpy_device (data_dev_dest, data_dev_src, bytes) bind(C)
+ use iso_c_binding, only: c_ptr, c_size_t
+ type(c_ptr), value :: data_dev_dest
+ type(c_ptr), value :: data_dev_src
+ integer(c_size_t), value :: bytes
+ end subroutine
+ end interface
+
+ interface
+ subroutine acc_memcpy_device_async (data_dev_dest, data_dev_src, &
+ bytes, async_arg) bind(C)
+ use iso_c_binding, only: c_ptr, c_size_t
+ import :: acc_handle_kind
+ type(c_ptr), value :: data_dev_dest
+ type(c_ptr), value :: data_dev_src
+ integer(c_size_t), value :: bytes
+ integer(acc_handle_kind), value :: async_arg
+ end subroutine
+ end interface
+
interface acc_copyin_async
procedure :: acc_copyin_async_32_h
procedure :: acc_copyin_async_64_h
int acc_is_present (void *, size_t) __GOACC_NOTHROW;
void acc_memcpy_to_device (void *, void *, size_t) __GOACC_NOTHROW;
void acc_memcpy_from_device (void *, void *, size_t) __GOACC_NOTHROW;
+void acc_memcpy_device (void *, void *, size_t) __GOACC_NOTHROW;
void acc_attach (void **) __GOACC_NOTHROW;
void acc_attach_async (void **, int) __GOACC_NOTHROW;
void acc_detach (void **) __GOACC_NOTHROW;
void acc_detach_finalize (void **) __GOACC_NOTHROW;
void acc_detach_finalize_async (void **, int) __GOACC_NOTHROW;
-/* Async functions, specified in OpenACC 2.5. */
+/* Async functions, specified in OpenACC 2.5, acc_memcpy_device in 2.6. */
void acc_copyin_async (void *, size_t, int) __GOACC_NOTHROW;
void acc_create_async (void *, size_t, int) __GOACC_NOTHROW;
void acc_copyout_async (void *, size_t, int) __GOACC_NOTHROW;
void acc_update_self_async (void *, size_t, int) __GOACC_NOTHROW;
void acc_memcpy_to_device_async (void *, void *, size_t, int) __GOACC_NOTHROW;
void acc_memcpy_from_device_async (void *, void *, size_t, int) __GOACC_NOTHROW;
+void acc_memcpy_device_async (void *, void *, size_t, int) __GOACC_NOTHROW;
/* CUDA-specific routines. */
void *acc_get_current_cuda_device (void) __GOACC_NOTHROW;
end subroutine
end interface
+ interface
+ subroutine acc_memcpy_device(data_dev_dest, data_dev_src, &
+ & bytes) bind(C)
+ use iso_c_binding, only: c_ptr, c_size_t
+ type(c_ptr), value :: data_dev_dest
+ type(c_ptr), value :: data_dev_src
+ integer(c_size_t), value :: bytes
+ end subroutine
+ end interface
+
+ interface
+ subroutine acc_memcpy_device_async(data_dev_dest, &
+ & data_dev_src, bytes, &
+ & async_arg) bind(C)
+ use iso_c_binding, only: c_ptr, c_size_t
+ import :: acc_handle_kind
+ type(c_ptr), value :: data_dev_dest
+ type(c_ptr), value :: data_dev_src
+ integer(c_size_t), value :: bytes
+ integer(acc_handle_kind), value :: async_arg
+ end subroutine
+ end interface
+
+
interface acc_copyin_async
subroutine acc_copyin_async_32_h (a, len, async)
use iso_c_binding, only: c_int32_t
queue_push_callback (aq, fn, data);
}
-/* Queue up an asynchronous data copy from host to DEVICE. */
+/* Queue up an asynchronous data copy from host to DEVICE.
+ (Also handles dev2host and dev2dev.) */
bool
GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src,
size_t n, struct goacc_asyncqueue *aq)
{
- struct agent_info *agent = get_agent_info (device);
- assert (agent == aq->agent);
- queue_push_copy (aq, dst, src, n);
- return true;
+ return GOMP_OFFLOAD_openacc_async_host2dev (device, dst, src, n, aq);
+}
+
+/* Queue up an asynchronous data copy from DEVICE to DEVICE. */
+
+bool
+GOMP_OFFLOAD_openacc_async_dev2dev (int device, void *dst, const void *src,
+ size_t n, struct goacc_asyncqueue *aq)
+{
+ return GOMP_OFFLOAD_openacc_async_host2dev (device, dst, src, n, aq);
}
union goacc_property_value
cuda_callback_wrapper, (void *) b, 0);
}
+static bool
+cuda_memcpy_dev_sanity_check (const void *d1, const void *d2, size_t s)
+{
+ CUdeviceptr pb1, pb2;
+ size_t ps1, ps2;
+ if (!s)
+ return true;
+ if (!d1 || !d2)
+ {
+ GOMP_PLUGIN_error ("invalid device address");
+ return false;
+ }
+ CUDA_CALL (cuMemGetAddressRange, &pb1, &ps1, (CUdeviceptr) d1);
+ CUDA_CALL (cuMemGetAddressRange, &pb2, &ps2, (CUdeviceptr) d2);
+ if (!pb1 || !pb2)
+ {
+ GOMP_PLUGIN_error ("invalid device address");
+ return false;
+ }
+ if ((void *)(d1 + s) > (void *)(pb1 + ps1)
+ || (void *)(d2 + s) > (void *)(pb2 + ps2))
+ {
+ GOMP_PLUGIN_error ("invalid size");
+ return false;
+ }
+ return true;
+}
+
static bool
cuda_memcpy_sanity_check (const void *h, const void *d, size_t s)
{
bool
GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
{
+ if (!nvptx_attach_host_thread_to_device (ord)
+ || !cuda_memcpy_dev_sanity_check (dst, src, n))
+ return false;
CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, NULL);
return true;
}
return true;
}
+bool
+GOMP_OFFLOAD_openacc_async_dev2dev (int ord, void *dst, const void *src,
+ size_t n, struct goacc_asyncqueue *aq)
+{
+ if (!nvptx_attach_host_thread_to_device (ord)
+ || !cuda_memcpy_dev_sanity_check (dst, src, n))
+ return false;
+ CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n,
+ aq->cuda_stream);
+ return true;
+}
+
union goacc_property_value
GOMP_OFFLOAD_openacc_get_property (int n, enum goacc_property prop)
{
gomp_device_copy (devicep, devicep->dev2host_func, "host", h, "dev", d, sz);
}
+attribute_hidden void
+gomp_copy_dev2dev (struct gomp_device_descr *devicep,
+ struct goacc_asyncqueue *aq,
+ void *dst, const void *src, size_t sz)
+{
+ if (__builtin_expect (aq != NULL, 0))
+ goacc_device_copy_async (devicep, devicep->openacc.async.dev2dev_func,
+ "dev", dst, "dev", src, NULL, sz, aq);
+ else
+ gomp_device_copy (devicep, devicep->dev2dev_func, "dev", dst,
+ "dev", src, sz);
+}
+
static void
gomp_free_device_memory (struct gomp_device_descr *devicep, void *devptr)
{
|| !DLSYM_OPT (openacc.async.exec, openacc_async_exec)
|| !DLSYM_OPT (openacc.async.dev2host, openacc_async_dev2host)
|| !DLSYM_OPT (openacc.async.host2dev, openacc_async_host2dev)
+ || !DLSYM_OPT (openacc.async.dev2dev, openacc_async_dev2dev)
|| !DLSYM_OPT (openacc.get_property, openacc_get_property))
{
/* Require all the OpenACC handlers if we have
--- /dev/null
+/* { dg-prune-output "using .vector_length \\(32\\)" } */
+
+/* PR libgomp/93226 */
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <openacc.h>
+
+enum { N = 1024 };
+
+static int D[N];
+#pragma acc declare device_resident(D)
+
+#pragma acc routine
+intptr_t init_d()
+{
+ for (int i = 0; i < N; i++)
+ D[i] = 27*i;
+ return (intptr_t) &D[0];
+}
+
+int
+main ()
+{
+ int *a, *b, *e;
+ void *d_a, *d_b, *d_c, *d_d, *d_e, *d_f;
+ intptr_t intptr;
+ bool fail = false;
+
+ a = (int *) malloc (N*sizeof (int));
+ b = (int *) malloc (N*sizeof (int));
+ e = (int *) malloc (N*sizeof (int));
+ d_c = acc_malloc (N*sizeof (int));
+ d_f = acc_malloc (N*sizeof (int));
+
+ memset (e, 0xff, N*sizeof (int));
+ d_e = acc_copyin (e, N*sizeof (int));
+
+ #pragma acc serial copyout(intptr)
+ intptr = init_d ();
+ d_d = (void*) intptr;
+ acc_memcpy_device (d_c, d_d, N*sizeof (int));
+
+ #pragma acc serial copy(fail) deviceptr(d_c) firstprivate(intptr)
+ {
+ int *cc = (int *) d_c;
+ int *dd = (int *) intptr;
+ for (int i = 0; i < N; i++)
+ if (dd[i] != 27*i || cc[i] != 27*i)
+ {
+ fail = true;
+ __builtin_abort ();
+ }
+ }
+ if (fail) __builtin_abort ();
+
+ for (int i = 0; i < N; i++)
+ a[i] = 11*i;
+ for (int i = 0; i < N; i++)
+ b[i] = 31*i;
+
+ d_a = acc_copyin (a, N*sizeof (int));
+ acc_copyin_async (b, N*sizeof (int), acc_async_noval);
+
+ #pragma acc parallel deviceptr(d_c) async
+ {
+ int *cc = (int *) d_c;
+ #pragma acc loop
+ for (int i = 0; i < N; i++)
+ cc[i] = -17*i;
+ }
+
+ acc_memcpy_device_async (d_d, d_a, N*sizeof (int), acc_async_noval);
+ acc_memcpy_device_async (d_f, d_c, N*sizeof (int), acc_async_noval);
+ acc_wait (acc_async_noval);
+ d_b = acc_deviceptr (b);
+ acc_memcpy_device_async (d_e, d_b, N*sizeof (int), acc_async_noval);
+ acc_wait (acc_async_noval);
+
+ #pragma acc serial deviceptr(d_d, d_e, d_f) copy(fail)
+ {
+ int *dd = (int *) d_d;
+ int *ee = (int *) d_e;
+ int *ff = (int *) d_f;
+ for (int i = 0; i < N; i++)
+ if (dd[i] != 11*i
+ || ee[i] != 31*i
+ || ff[i] != -17*i)
+ {
+ fail = true;
+ __builtin_abort ();
+ }
+ }
+ if (fail) __builtin_abort ();
+}
--- /dev/null
+! { dg-prune-output "using .vector_length \\(32\\)" }
+
+! PR libgomp/93226 */
+
+module m
+ use iso_c_binding
+ use openacc
+ implicit none (external, type)
+
+ integer, parameter :: N = 1024
+
+ integer :: D(N)
+ !$acc declare device_resident(D)
+
+contains
+
+ integer(c_intptr_t) function init_d()
+ !$acc routine
+ integer :: i
+ do i = 1, N
+ D(i) = 27*i
+ end do
+ init_d = loc(D)
+ end
+end module
+
+program main
+ use m
+ implicit none (external, type)
+
+ integer, allocatable, target :: a(:), b(:), e(:)
+ type(c_ptr) :: d_a, d_b, d_c, d_d, d_e, d_f
+ integer(c_intptr_t) intptr
+ integer :: i
+ logical fail
+
+ fail = .false.
+
+ allocate(a(N), b(N), e(N))
+ d_c = acc_malloc (N*c_sizeof (i))
+ d_f = acc_malloc (N*c_sizeof (i))
+
+ e = huge(e)
+ call acc_copyin (e, N*c_sizeof (i));
+ d_e = acc_deviceptr (e);
+
+ !$acc serial copyout(intptr)
+ intptr = init_d ()
+ !$acc end serial
+ d_d = transfer(intptr, d_d)
+ call acc_memcpy_device (d_c, d_d, N*c_sizeof (i))
+
+ !$acc serial copy(fail) copy(a) deviceptr(d_c, d_d) firstprivate(intptr)
+ block
+ integer, pointer :: cc(:), dd(:)
+ call c_f_pointer (d_c, cc, [N])
+ call c_f_pointer (d_d, dd, [N])
+ a = cc
+ do i = 1, N
+ if (dd(i) /= 27*i .or. cc(i) /= 27*i) then
+ fail = .true.
+ stop 1
+ end if
+ end do
+ end block
+ !$acc end serial
+ if (fail) error stop 1
+
+ do i = 1, N
+ a(i) = 11*i
+ b(i) = 31*i
+ end do
+
+ call acc_copyin (a, N*c_sizeof (i))
+ d_a = acc_deviceptr (a)
+ call acc_copyin_async (b, N*c_sizeof (i), acc_async_noval)
+
+ !$acc parallel deviceptr(d_c) private(i) async
+ block
+ integer, pointer :: cc(:)
+ call c_f_pointer (d_c, cc, [N])
+ !$acc loop
+ do i = 1, N
+ cc(i) = -17*i
+ end do
+ end block
+ !$acc end parallel
+
+ call acc_memcpy_device_async (d_d, d_a, N*c_sizeof (i), acc_async_noval)
+ call acc_memcpy_device_async (d_f, d_c, N*c_sizeof (i), acc_async_noval)
+ call acc_wait (acc_async_noval)
+ d_b = acc_deviceptr (b)
+ call acc_memcpy_device_async (d_e, d_b, N*c_sizeof (i), acc_async_noval)
+ call acc_wait (acc_async_noval)
+
+ !$acc serial deviceptr(d_d, d_e, d_f) private(i) copy(fail)
+ block
+ integer, pointer :: dd(:), ee(:), ff(:)
+ call c_f_pointer (d_d, dd, [N])
+ call c_f_pointer (d_e, ee, [N])
+ call c_f_pointer (d_f, ff, [N])
+ do i = 1, N
+ if (dd(i) /= 11*i &
+ .or. ee(i) /= 31*i &
+ .or. ff(i) /= -17*i) then
+ fail = .true.
+ stop 2
+ end if
+ end do
+ end block
+ !$acc end serial
+ if (fail) error stop 2
+end