const size_t *dst_dimensions,
const size_t *src_dimensions,
struct gomp_device_descr *dst_devicep,
- struct gomp_device_descr *src_devicep)
+ struct gomp_device_descr *src_devicep,
+ size_t *tmp_size, void **tmp)
{
size_t dst_slice = element_size;
size_t src_slice = element_size;
|| __builtin_mul_overflow (element_size, dst_offsets[0], &dst_off)
|| __builtin_mul_overflow (element_size, src_offsets[0], &src_off))
return EINVAL;
- if (dst_devicep == NULL && src_devicep == NULL)
- {
- memcpy ((char *) dst + dst_off, (const char *) src + src_off,
- length);
- ret = 1;
- }
- else if (src_devicep == NULL)
- ret = dst_devicep->host2dev_func (dst_devicep->target_id,
+ if (src_devicep != NULL && src_devicep == dst_devicep)
+ ret = src_devicep->dev2dev_func (src_devicep->target_id,
+ (char *) dst + dst_off,
+ (const char *) src + src_off,
+ length);
+ else if (src_devicep != NULL
+ && (dst_devicep == NULL
+ || (dst_devicep->capabilities
+ & GOMP_OFFLOAD_CAP_SHARED_MEM)))
+ ret = src_devicep->dev2host_func (src_devicep->target_id,
(char *) dst + dst_off,
(const char *) src + src_off,
length);
- else if (dst_devicep == NULL)
- ret = src_devicep->dev2host_func (src_devicep->target_id,
+ else if (dst_devicep != NULL
+ && (src_devicep == NULL
+ || (src_devicep->capabilities
+ & GOMP_OFFLOAD_CAP_SHARED_MEM)))
+ ret = dst_devicep->host2dev_func (dst_devicep->target_id,
(char *) dst + dst_off,
(const char *) src + src_off,
length);
+ else if (dst_devicep == NULL && src_devicep == NULL)
+ {
+ memcpy ((char *) dst + dst_off, (const char *) src + src_off,
+ length);
+ ret = 1;
+ }
else if (src_devicep == dst_devicep)
ret = src_devicep->dev2dev_func (src_devicep->target_id,
(char *) dst + dst_off,
(const char *) src + src_off,
length);
else
- ret = 0;
+ {
+ if (*tmp_size == 0)
+ {
+ *tmp_size = length;
+ *tmp = malloc (length);
+ if (*tmp == NULL)
+ return ENOMEM;
+ }
+ else if (*tmp_size < length)
+ {
+ *tmp_size = length;
+ *tmp = realloc (*tmp, length);
+ if (*tmp == NULL)
+ return ENOMEM;
+ }
+ ret = src_devicep->dev2host_func (src_devicep->target_id, *tmp,
+ (const char *) src + src_off,
+ length);
+ if (ret == 1)
+ ret = dst_devicep->host2dev_func (dst_devicep->target_id,
+ (char *) dst + dst_off, *tmp,
+ length);
+ }
return ret ? 0 : EINVAL;
}
- /* FIXME: it would be nice to have some plugin function to handle
- num_dims == 2 and num_dims == 3 more efficiently. Larger ones can
- be handled in the generic recursion below, and for host-host it
- should be used even for any num_dims >= 2. */
+ /* host->device, device->host and same-device device->device. */
+ if (num_dims == 2
+ && ((src_devicep
+ && src_devicep == dst_devicep
+ && src_devicep->memcpy2d_func)
+ || (!src_devicep != !dst_devicep
+ && ((src_devicep && src_devicep->memcpy2d_func)
+ || (dst_devicep && dst_devicep->memcpy2d_func)))))
+ {
+ size_t vol_sz1, dst_sz1, src_sz1, dst_off_sz1, src_off_sz1;
+ int dst_id = dst_devicep ? dst_devicep->target_id : -1;
+ int src_id = src_devicep ? src_devicep->target_id : -1;
+ struct gomp_device_descr *devp = dst_devicep ? dst_devicep : src_devicep;
+
+ if (__builtin_mul_overflow (volume[1], element_size, &vol_sz1)
+ || __builtin_mul_overflow (dst_dimensions[1], element_size, &dst_sz1)
+ || __builtin_mul_overflow (src_dimensions[1], element_size, &src_sz1)
+ || __builtin_mul_overflow (dst_offsets[1], element_size, &dst_off_sz1)
+ || __builtin_mul_overflow (src_offsets[1], element_size,
+ &src_off_sz1))
+ return EINVAL;
+ ret = devp->memcpy2d_func (dst_id, src_id, vol_sz1, volume[0],
+ dst, dst_off_sz1, dst_offsets[0], dst_sz1,
+ src, src_off_sz1, src_offsets[0], src_sz1);
+ if (ret != -1)
+ return ret ? 0 : EINVAL;
+ }
+ else if (num_dims == 3
+ && ((src_devicep
+ && src_devicep == dst_devicep
+ && src_devicep->memcpy3d_func)
+ || (!src_devicep != !dst_devicep
+ && ((src_devicep && src_devicep->memcpy3d_func)
+ || (dst_devicep && dst_devicep->memcpy3d_func)))))
+ {
+ size_t vol_sz2, dst_sz2, src_sz2, dst_off_sz2, src_off_sz2;
+ int dst_id = dst_devicep ? dst_devicep->target_id : -1;
+ int src_id = src_devicep ? src_devicep->target_id : -1;
+ struct gomp_device_descr *devp = dst_devicep ? dst_devicep : src_devicep;
+
+ if (__builtin_mul_overflow (volume[2], element_size, &vol_sz2)
+ || __builtin_mul_overflow (dst_dimensions[2], element_size, &dst_sz2)
+ || __builtin_mul_overflow (src_dimensions[2], element_size, &src_sz2)
+ || __builtin_mul_overflow (dst_offsets[2], element_size, &dst_off_sz2)
+ || __builtin_mul_overflow (src_offsets[2], element_size,
+ &src_off_sz2))
+ return EINVAL;
+ ret = devp->memcpy3d_func (dst_id, src_id, vol_sz2, volume[1], volume[0],
+ dst, dst_off_sz2, dst_offsets[1],
+ dst_offsets[0], dst_sz2, dst_dimensions[1],
+ src, src_off_sz2, src_offsets[1],
+ src_offsets[0], src_sz2, src_dimensions[1]);
+ if (ret != -1)
+ return ret ? 0 : EINVAL;
+ }
for (i = 1; i < num_dims; i++)
if (__builtin_mul_overflow (dst_slice, dst_dimensions[i], &dst_slice)
volume + 1, dst_offsets + 1,
src_offsets + 1, dst_dimensions + 1,
src_dimensions + 1, dst_devicep,
- src_devicep);
+ src_devicep, tmp_size, tmp);
if (ret)
return ret;
dst_off += dst_slice;
if (ret)
return ret;
- if (*src_devicep != NULL && *dst_devicep != NULL && *src_devicep != *dst_devicep)
- return EINVAL;
-
return 0;
}
struct gomp_device_descr *dst_devicep,
struct gomp_device_descr *src_devicep)
{
- if (src_devicep)
+ size_t tmp_size = 0;
+ void *tmp = NULL;
+ bool lock_src;
+ bool lock_dst;
+
+ lock_src = (src_devicep
+ && (!dst_devicep
+ || src_devicep == dst_devicep
+ || !(src_devicep->capabilities
+ & GOMP_OFFLOAD_CAP_SHARED_MEM)));
+ lock_dst = (dst_devicep
+ && (!lock_src
+ || (src_devicep != dst_devicep
+ && !(dst_devicep->capabilities
+ & GOMP_OFFLOAD_CAP_SHARED_MEM))));
+ if (lock_src)
gomp_mutex_lock (&src_devicep->lock);
- else if (dst_devicep)
+ if (lock_dst)
gomp_mutex_lock (&dst_devicep->lock);
int ret = omp_target_memcpy_rect_worker (dst, src, element_size, num_dims,
volume, dst_offsets, src_offsets,
dst_dimensions, src_dimensions,
- dst_devicep, src_devicep);
- if (src_devicep)
+ dst_devicep, src_devicep,
+ &tmp_size, &tmp);
+ if (lock_src)
gomp_mutex_unlock (&src_devicep->lock);
- else if (dst_devicep)
+ if (lock_dst)
gomp_mutex_unlock (&dst_devicep->lock);
+ if (tmp)
+ free (tmp);
return ret;
}
DLSYM (free);
DLSYM (dev2host);
DLSYM (host2dev);
+ DLSYM (memcpy2d);
+ DLSYM (memcpy3d);
device->capabilities = device->get_caps_func ();
if (device->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
{
--- /dev/null
+program main
+ use iso_c_binding
+ use omp_lib
+ implicit none (type, external)
+
+ integer(c_size_t), parameter :: sizeof_int = 4
+ integer, parameter :: sk = c_size_t
+ logical, allocatable :: isshared(:)
+ integer, allocatable :: maxdim(:,:)
+ integer :: ndev
+
+ ndev = omp_get_num_devices()
+ call init_isshared
+ call init_maxdim
+
+ call one
+ call two
+ call three
+ call four
+
+ deallocate(isshared, maxdim)
+contains
+
+ subroutine init_maxdim
+ integer :: dev, dev2, r
+ integer(c_size_t), parameter :: nl = 0
+
+ allocate(maxdim(0:ndev,0:ndev))
+ do dev = 0, ndev
+ do dev2 = 0, ndev
+ r = omp_target_memcpy_rect (c_null_ptr, c_null_ptr, nl, &
+ num_dims=1_c_int, volume=[nl], &
+ dst_offsets=[nl], src_offsets=[nl], &
+ dst_dimensions=[nl], src_dimensions=[nl], &
+ dst_device_num=dev, src_device_num=omp_initial_device)
+ if (r < 3) stop 1 ! OpenMP requirement
+ if (r < huge(0_c_int)) stop 2 ! GCC implementation
+ maxdim(dev2,dev) = r
+ end do
+ end do
+ end subroutine
+
+ subroutine init_isshared
+ integer :: dev
+ logical :: dev_isshared
+
+ allocate(isshared(0:ndev))
+ do dev = 0, ndev
+ dev_isshared = .false.
+ !$omp target device(dev) map(to: dev_isshared)
+ dev_isshared = .true.
+ !$omp end target
+ isshared(dev) = dev_isshared
+ end do
+ end subroutine
+
+
+ subroutine one
+ integer(c_size_t), parameter :: N1 = 30
+ integer, target :: host_data(N1)
+ type(c_ptr) :: dev_cptr(0:ndev), cptr, tmp_cptr
+ integer :: dev, dev2, i, r
+
+ do dev = 0, ndev
+ dev_cptr(dev) = omp_target_alloc (N1*sizeof_int, dev)
+ if (.not. c_associated (dev_cptr(dev))) stop 11
+ end do
+
+ do i = 1, N1
+ host_data(i) = i
+ end do
+
+ ! copy full array host -> all devices + check value + set per-device value
+ do dev = 0, ndev
+ r = omp_target_memcpy_rect (dev_cptr(dev), c_loc(host_data), sizeof_int, &
+ num_dims=1_c_int, volume=[N1], &
+ dst_offsets=[0_sk], src_offsets=[0_sk], &
+ dst_dimensions=[N1], src_dimensions=[N1], &
+ dst_device_num=dev, src_device_num=omp_initial_device)
+ if (r /= 0) stop 12
+ cptr = dev_cptr(dev)
+ !$omp target device(dev) is_device_ptr(cptr)
+ block
+ integer, pointer, contiguous :: fptr(:)
+ call c_f_pointer(cptr, fptr, [N1])
+ do i = 1, N1
+ if (fptr(i) /= i) stop 13
+ fptr(i) = i*100 + 10000 * (dev+3)
+ end do
+ end block
+ end do
+
+ ! Test strided data - forth and back - same array sizes
+ do dev = 0, ndev
+ do dev2 = 0, ndev
+ tmp_cptr = omp_target_alloc (N1*sizeof_int, dev)
+ if (.not. c_associated (tmp_cptr)) stop 14
+
+ !$omp target device(dev) is_device_ptr(tmp_cptr)
+ block
+ integer, pointer, contiguous :: fptr(:)
+ call c_f_pointer(tmp_cptr, fptr, [N1])
+ do i = 1, N1
+ fptr(i) = i*100 + 10000*(dev+1)
+ end do
+ end block
+
+ if (N1-17 > N1 - max(12,13)) stop 18
+ r = omp_target_memcpy_rect (dev_cptr(dev2), tmp_cptr, sizeof_int, &
+ num_dims=1_c_int, volume=[N1-17], &
+ dst_offsets=[12_sk], src_offsets=[13_sk], &
+ dst_dimensions=[N1], src_dimensions=[N1], &
+ dst_device_num=dev2, src_device_num=dev)
+ if (r /= 0) stop 15
+
+ cptr = dev_cptr(dev2)
+ !$omp target device(dev2) is_device_ptr(cptr)
+ block
+ logical :: checked(N1)
+ integer, pointer, contiguous :: fptr(:)
+ call c_f_pointer(cptr, fptr, [N1])
+ checked = .false.
+ do i = 1, N1-17
+ if (fptr(i+12) /= (i+13)*100 + 10000 * (dev+1)) stop 16
+ checked(i+12) = .true.
+ end do
+ ! original device value
+ do i = 1, N1
+ if (.not. checked(i)) then
+ if (fptr(i) /= i*100 + 10000 * (dev2+3)) stop 17
+ end if
+ end do
+ end block
+ call omp_target_free (tmp_cptr, dev)
+ end do
+
+ ! reset to original value
+ do dev2 = 0, ndev
+ cptr = dev_cptr(dev2)
+ !$omp target device(dev2) is_device_ptr(cptr)
+ block
+ integer, pointer, contiguous :: fptr(:)
+ call c_f_pointer(cptr, fptr, [N1])
+ do i = 1, N1
+ fptr(i) = i*100 + 10000 * (dev2+3)
+ end do
+ end block
+ end do
+ end do
+
+ do dev = 0, ndev
+ call omp_target_free (dev_cptr(dev), dev)
+ end do
+ end subroutine
+
+
+ subroutine two
+ integer(c_size_t), parameter :: N = 10, M = 30
+ integer, target :: host_data(N,M)
+ type(c_ptr) :: dev_cptr(0:ndev), cptr, tmp_cptr
+ integer :: dev, dev2, i, j, r
+
+ do dev = 0, ndev
+ dev_cptr(dev) = omp_target_alloc (N*M*sizeof_int, dev)
+ if (.not. c_associated (dev_cptr(dev))) stop 21
+ end do
+
+ do i = 1, M
+ do j = 1, N
+ host_data(j,i) = i*100 + j
+ end do
+ end do
+
+ ! copy full array host -> all devices + check value + set per-device value
+ do dev = 0, ndev
+ r = omp_target_memcpy_rect (dev_cptr(dev), c_loc(host_data), sizeof_int, &
+ num_dims=2_c_int, volume=[M, N], &
+ dst_offsets=[0_sk, 0_sk], src_offsets=[0_sk, 0_sk], &
+ dst_dimensions=[M, N], src_dimensions=[M,N], &
+ dst_device_num=dev, src_device_num=omp_initial_device)
+ if (r /= 0) stop 22
+ cptr = dev_cptr(dev)
+ !$omp target device(dev) is_device_ptr(cptr)
+ block
+ integer, pointer, contiguous :: fptr(:,:)
+ call c_f_pointer(cptr, fptr, [N,M])
+ do i = 1, M
+ do j = 1, N
+ if (fptr(j,i) /= i*100 + j) stop 23
+ fptr(j,i) = i*100 + j + 1000 * dev
+ end do
+ end do
+ end block
+ end do
+
+ ! Test strided data - forth and back - same array sizes
+ do dev = 0, ndev
+ do dev2 = 0, ndev
+ tmp_cptr = omp_target_alloc (N*M*sizeof_int, dev)
+ if (.not. c_associated (tmp_cptr)) stop 24
+
+ !$omp target device(dev) is_device_ptr(tmp_cptr)
+ block
+ integer, pointer, contiguous :: fptr(:,:)
+ call c_f_pointer(tmp_cptr, fptr, [N,M])
+ do i = 1, M
+ do j = 1, N
+ fptr(j,i) = i*100 + j + 100000 * (dev+1)
+ end do
+ end do
+ end block
+
+ if (M-14 > M - max(5,2) &
+ .or. N-3 > N - max(2,1)) stop 28
+ r = omp_target_memcpy_rect (dev_cptr(dev2), tmp_cptr, sizeof_int, &
+ num_dims=2_c_int, volume=[M-14, N-3], &
+ dst_offsets=[5_sk, 3_sk], src_offsets=[2_sk, 1_sk], &
+ dst_dimensions=[M, N], src_dimensions=[M,N], &
+ dst_device_num=dev2, src_device_num=dev)
+ if (r /= 0) stop 25
+
+ cptr = dev_cptr(dev2)
+ !$omp target device(dev2) is_device_ptr(cptr)
+ block
+ logical :: checked(N,M)
+ integer, pointer, contiguous :: fptr(:,:)
+ call c_f_pointer(cptr, fptr, [N,M])
+ checked = .false.
+ do i = 1, M-14
+ do j = 1, N-3
+ if (fptr(j+3, i+5) /= (i+2)*100 + (j+1) + 100000 * (dev+1)) stop 26
+ checked(j+3, i+5) = .true.
+ end do
+ end do
+ ! original device value
+ do i = 1, M
+ do j = 1, N
+ if (.not. checked(j,i)) then
+ if (fptr(j,i) /= i*100 + j + 1000 * dev2) stop 27
+ end if
+ end do
+ end do
+ end block
+ call omp_target_free (tmp_cptr, dev)
+ end do
+
+ ! reset to original value
+ do dev2 = 0, ndev
+ cptr = dev_cptr(dev2)
+ !$omp target device(dev2) is_device_ptr(cptr)
+ block
+ integer, pointer, contiguous :: fptr(:,:)
+ call c_f_pointer(cptr, fptr, [N,M])
+ do i = 1, M
+ do j = 1, N
+ fptr(j,i) = i*100 + j + 1000 * dev2
+ end do
+ end do
+ end block
+ end do
+ end do
+
+ do dev = 0, ndev
+ call omp_target_free (dev_cptr(dev), dev)
+ end do
+ end subroutine
+
+
+ subroutine three
+ integer(c_size_t), parameter :: N1 = 10, N2 = 30, N3 = 15
+ integer, target :: host_data(N3,N2,N1)
+ type(c_ptr) :: dev_cptr(0:ndev), cptr, tmp_cptr
+ integer :: dev, dev2, i, j, k, r
+
+ do dev = 0, ndev
+ dev_cptr(dev) = omp_target_alloc (N1*N2*N3*sizeof_int, dev)
+ if (.not. c_associated (dev_cptr(dev))) stop 31
+ end do
+
+ do i = 1, N1
+ do j = 1, N2
+ do k = 1, N3
+ host_data(k, j,i) = i*1000 + 100*j + k
+ end do
+ end do
+ end do
+
+ ! copy full array host -> all devices + check value + set per-device value
+ do dev = 0, ndev
+ r = omp_target_memcpy_rect (dev_cptr(dev), c_loc(host_data), sizeof_int, &
+ num_dims=3_c_int, volume=[N1, N2, N3], &
+ dst_offsets=[0_sk, 0_sk, 0_sk], src_offsets=[0_sk, 0_sk, 0_sk], &
+ dst_dimensions=[N1, N2, N3], src_dimensions=[N1, N2, N3], &
+ dst_device_num=dev, src_device_num=omp_initial_device)
+ if (r /= 0) stop 32
+ cptr = dev_cptr(dev)
+ !$omp target device(dev) is_device_ptr(cptr)
+ block
+ integer, pointer, contiguous :: fptr(:,:,:)
+ call c_f_pointer(cptr, fptr, [N3,N2,N1])
+ do i = 1, N1
+ do j = 1, N2
+ do k = 1, N3
+ if (fptr(k, j,i) /= i*1000 + 100*j + k) stop 33
+ fptr(k,j,i) = i*1000 + 100*j + k + 1000 * dev
+ end do
+ end do
+ end do
+ end block
+ end do
+
+ ! Test strided data - forth and back - same array sizes
+ do dev = 0, ndev
+ do dev2 = 0, ndev
+ tmp_cptr = omp_target_alloc (N1*N2*N3*sizeof_int, dev)
+ if (.not. c_associated (tmp_cptr)) stop 34
+
+ !$omp target device(dev) is_device_ptr(tmp_cptr)
+ block
+ integer, pointer, contiguous :: fptr(:,:,:)
+ call c_f_pointer(tmp_cptr, fptr, [N3,N2,N1])
+ do i = 1, N1
+ do j = 1, N2
+ do k = 1, N3
+ fptr(k,j,i) = i*1000 + 100*j + k + 100000 * (dev+1)
+ end do
+ end do
+ end do
+ end block
+
+ if (N1-5 > N1 - max(5,2) &
+ .or. N2-13 > N2 - max(3,1) &
+ .or. N3-5 > N3 - max(2,4)) stop 38
+ r = omp_target_memcpy_rect (dev_cptr(dev2), tmp_cptr, sizeof_int, &
+ num_dims=3_c_int, volume=[N1-5, N2-13,N3-5], &
+ dst_offsets=[5_sk, 3_sk,2_sk], src_offsets=[2_sk, 1_sk,4_sk], &
+ dst_dimensions=[N1,N2,N3], src_dimensions=[N1,N2,N3], &
+ dst_device_num=dev2, src_device_num=dev)
+ if (r /= 0) stop 35
+
+ cptr = dev_cptr(dev2)
+ !$omp target device(dev2) is_device_ptr(cptr)
+ block
+ logical :: checked(N3,N2,N1)
+ integer, pointer, contiguous :: fptr(:,:,:)
+ call c_f_pointer(cptr, fptr, [N3,N2,N1])
+ checked = .false.
+ do i = 1, N1-5
+ do j = 1, N2-13
+ do k = 1, N3-5
+ if (fptr(k+2, j+3, i+5) /= (i+2)*1000 + 100*(j+1) + (k+4) + 100000 * (dev+1)) stop 36
+ checked(k+2, j+3, i+5) = .true.
+ end do
+ end do
+ end do
+ ! original device value
+ do i = 1, N1
+ do j = 1, N2
+ do k = 1, N3
+ if (.not. checked(k,j,i)) then
+ if (fptr(k,j,i) /= i*1000 + 100*j + k + 1000 * dev2) stop 37
+ end if
+ end do
+ end do
+ end do
+ end block
+ call omp_target_free (tmp_cptr, dev)
+ end do
+
+ ! reset to original value
+ do dev2 = 0, ndev
+ cptr = dev_cptr(dev2)
+ !$omp target device(dev2) is_device_ptr(cptr)
+ block
+ integer, pointer, contiguous :: fptr(:,:,:)
+ call c_f_pointer(cptr, fptr, [N3,N2,N1])
+ do i = 1, N1
+ do j = 1, N2
+ do k = 1, N3
+ fptr(k,j,i) = i*1000 + 100*j + k + 1000 * dev2
+ end do
+ end do
+ end do
+ end block
+ end do
+ end do
+
+ do dev = 0, ndev
+ call omp_target_free (dev_cptr(dev), dev)
+ end do
+ end subroutine
+
+
+ subroutine four
+ integer(c_size_t), parameter :: N1 = 10, N2 = 30, N3 = 15, N4 = 25
+ integer, target :: host_data(N4, N3,N2,N1)
+ type(c_ptr) :: dev_cptr(0:ndev), cptr, tmp_cptr
+ integer :: dev, dev2, i, j, k, ll, r
+
+ do dev = 0, ndev
+ dev_cptr(dev) = omp_target_alloc (N1*N2*N3*N4*sizeof_int, dev)
+ if (.not. c_associated (dev_cptr(dev))) stop 41
+ end do
+
+ do i = 1, N1
+ do j = 1, N2
+ do k = 1, N3
+ do ll = 1, N4
+ host_data(ll, k, j,i) = i*1000 + 100*j + k*10 + ll
+ end do
+ end do
+ end do
+ end do
+
+ ! copy full array host -> all devices + check value + set per-device value
+ do dev = 0, ndev
+ r = omp_target_memcpy_rect (dev_cptr(dev), c_loc(host_data), sizeof_int, &
+ num_dims=4_c_int, volume=[N1, N2, N3, N4], &
+ dst_offsets=[0_sk, 0_sk, 0_sk, 0_sk], src_offsets=[0_sk, 0_sk, 0_sk, 0_sk], &
+ dst_dimensions=[N1, N2, N3, N4], src_dimensions=[N1, N2, N3, N4], &
+ dst_device_num=dev, src_device_num=omp_initial_device)
+ if (r /= 0) stop 42
+ cptr = dev_cptr(dev)
+ !$omp target device(dev) is_device_ptr(cptr)
+ block
+ integer, pointer, contiguous :: fptr(:,:,:,:)
+ call c_f_pointer(cptr, fptr, [N4,N3,N2,N1])
+ do i = 1, N1
+ do j = 1, N2
+ do k = 1, N3
+ do ll = 1, N4
+ if (fptr(ll, k, j,i) /= i*1000 + 100*j + k*10 + ll) stop 43
+ fptr(ll,k,j,i) = i*1000 + 100*j + k*10 + ll + 1000 * dev
+ end do
+ end do
+ end do
+ end do
+ end block
+ end do
+
+ ! Test strided data - forth and back - same array sizes
+ do dev = 0, ndev
+ do dev2 = 0, ndev
+ tmp_cptr = omp_target_alloc (N1*N2*N3*N4*sizeof_int, dev)
+ if (.not. c_associated (tmp_cptr)) stop 44
+
+ !$omp target device(dev) is_device_ptr(tmp_cptr)
+ block
+ integer, pointer, contiguous :: fptr(:,:,:,:)
+ call c_f_pointer(tmp_cptr, fptr, [N4,N3,N2,N1])
+ do i = 1, N1
+ do j = 1, N2
+ do k = 1, N3
+ do ll = 1, N4
+ fptr(ll,k,j,i) = i*1000 + 100*j + k*10 + ll + 100000 * (dev+1)
+ end do
+ end do
+ end do
+ end do
+ end block
+
+ if (N1-5 > N1 - max(5,2) &
+ .or. N2-13 > N2 - max(3,1) &
+ .or. N3-5 > N3 - max(2,4) &
+ .or. N4-11 > N4 - max(7,5)) stop 48
+ r = omp_target_memcpy_rect (dev_cptr(dev2), tmp_cptr, sizeof_int, &
+ num_dims=4_c_int, volume=[N1-5, N2-13,N3-5,N4-11], &
+ dst_offsets=[5_sk, 3_sk,2_sk,7_sk], src_offsets=[2_sk, 1_sk,4_sk,5_sk], &
+ dst_dimensions=[N1,N2,N3,N4], src_dimensions=[N1,N2,N3,N4], &
+ dst_device_num=dev2, src_device_num=dev)
+ if (r /= 0) stop 45
+
+ cptr = dev_cptr(dev2)
+ !$omp target device(dev2) is_device_ptr(cptr)
+ block
+ logical, allocatable :: checked(:,:,:,:) ! allocatble to reduce stack size
+ integer, pointer, contiguous :: fptr(:,:,:,:)
+ call c_f_pointer(cptr, fptr, [N4,N3,N2,N1])
+ allocate (checked(N4,N3,N2,N1), source=.false.)
+ do i = 1, N1-5
+ do j = 1, N2-13
+ do k = 1, N3-5
+ do ll = 1, N4-11
+ if (fptr(ll+7, k+2, j+3, i+5) /= (i+2)*1000 + 100*(j+1) + (k+4)*10 + ll+5 + 100000 * (dev+1)) stop 46
+ checked(ll+7, k+2, j+3, i+5) = .true.
+ end do
+ end do
+ end do
+ end do
+ ! original device value
+ do i = 1, N1
+ do j = 1, N2
+ do k = 1, N3
+ do ll = 1, N4
+ if (.not. checked(ll,k,j,i)) then
+ if (fptr(ll,k,j,i) /= i*1000 + 100*j + k*10 + ll + 1000 * dev2) stop 47
+ end if
+ end do
+ end do
+ end do
+ end do
+ deallocate (checked)
+ end block
+ call omp_target_free (tmp_cptr, dev)
+ end do
+
+ ! reset to original value
+ do dev2 = 0, ndev
+ cptr = dev_cptr(dev2)
+ !$omp target device(dev2) is_device_ptr(cptr)
+ block
+ integer, pointer, contiguous :: fptr(:,:,:,:)
+ call c_f_pointer(cptr, fptr, [N4,N3,N2,N1])
+ do i = 1, N1
+ do j = 1, N2
+ do k = 1, N3
+ do ll = 1, N4
+ fptr(ll,k,j,i) = i*1000 + 100*j + k*10 + ll + 1000 * dev2
+ end do
+ end do
+ end do
+ end do
+ end block
+ end do
+ end do
+
+ do dev = 0, ndev
+ call omp_target_free (dev_cptr(dev), dev)
+ end do
+ end subroutine
+end program