data.srcXInBytes = src_offset1_size;
data.srcY = src_offset0_len;
+ if (data.srcXInBytes != 0 || data.srcY != 0)
+ {
+ /* Adjust origin to the actual array data, else the CUDA 2D memory
+ copy API calls below may fail to validate source/dest pointers
+ correctly (especially for Fortran where the "virtual origin" of an
+ array is often outside the stored data). */
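+ /* The byte offset of the (XInBytes, Y) origin is Y * Pitch + XInBytes;
+    the origin fields are cleared below once folded into the base address.  */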
+ if (src_ord == -1)
+ data.srcHost = (const void *) ((const char *) data.srcHost
+ + data.srcY * data.srcPitch
+ + data.srcXInBytes);
+ else
+ data.srcDevice += data.srcY * data.srcPitch + data.srcXInBytes;
+ data.srcXInBytes = 0;
+ data.srcY = 0;
+ }
+
+ if (data.dstXInBytes != 0 || data.dstY != 0)
+ {
+ /* As above. */
+ if (dst_ord == -1)
+ data.dstHost = (void *) ((char *) data.dstHost
+ + data.dstY * data.dstPitch
+ + data.dstXInBytes);
+ else
+ data.dstDevice += data.dstY * data.dstPitch + data.dstXInBytes;
+ data.dstXInBytes = 0;
+ data.dstY = 0;
+ }
+
CUresult res = CUDA_CALL_NOCHECK (cuMemcpy2D, &data);
if (res == CUDA_ERROR_INVALID_VALUE)
/* If pitch > CU_DEVICE_ATTRIBUTE_MAX_PITCH or for device-to-device
data.srcY = src_offset1_len;
data.srcZ = src_offset0_len;
+ if (data.srcXInBytes != 0 || data.srcY != 0 || data.srcZ != 0)
+ {
+ /* Adjust origin to the actual array data, else the CUDA 3D memory
+ copy API call below may fail to validate source/dest pointers
+ correctly (especially for Fortran where the "virtual origin" of an
+ array is often outside the stored data). */
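+ /* The byte offset of the (XInBytes, Y, Z) origin is
+    (Z * Height + Y) * Pitch + XInBytes, where Pitch is the row pitch in
+    bytes and Height is the number of rows per 2D slice.  */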
+ if (src_ord == -1)
+ data.srcHost
+ = (const void *) ((const char *) data.srcHost
+ + (data.srcZ * data.srcHeight + data.srcY)
+ * data.srcPitch
+ + data.srcXInBytes);
+ else
+ data.srcDevice
+ += (data.srcZ * data.srcHeight + data.srcY) * data.srcPitch
+ + data.srcXInBytes;
+ data.srcXInBytes = 0;
+ data.srcY = 0;
+ data.srcZ = 0;
+ }
+
+ if (data.dstXInBytes != 0 || data.dstY != 0 || data.dstZ != 0)
+ {
+ /* As above. */
+ if (dst_ord == -1)
+ data.dstHost = (void *) ((char *) data.dstHost
+ + (data.dstZ * data.dstHeight + data.dstY)
+ * data.dstPitch
+ + data.dstXInBytes);
+ else
+ data.dstDevice
+ += (data.dstZ * data.dstHeight + data.dstY) * data.dstPitch
+ + data.dstXInBytes;
+ data.dstXInBytes = 0;
+ data.dstY = 0;
+ data.dstZ = 0;
+ }
+
CUDA_CALL (cuMemcpy3D, &data);
return true;
}
--- /dev/null
+/* { dg-do run } */
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <assert.h>
+#include <omp.h>
+
+/* Treat the array as N rows of M columns, stored row-major: element
+   (row j, column i) is arr2d[j * M + i].  */
+#define N 1024
+#define M 2048
+
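+/* row_offset/row_length select a range of columns within each row;
+   col_offset/col_length select a range of rows.  Together they describe the
+   rectangle that is updated on the device and copied back.  */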
+#define row_offset 256
+#define row_length 512
+#define col_offset 128
+#define col_length 384
+
+int
+main ()
+{
+ int *arr2d = (int *) calloc (N * M, sizeof (int));
+ uintptr_t dstptr;
+ int hostdev = omp_get_initial_device ();
+ int targdev;
+
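+ /* Map only rows col_offset .. col_offset+col_length-1, so the device copy
+    does not start at arr2d[0].  */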
+#pragma omp target enter data map(to: arr2d[col_offset*M:col_length*M])
+
+#pragma omp target map(from: targdev, dstptr) \
+ map(present, tofrom: arr2d[col_offset*M:col_length*M])
+ {
+ for (int j = col_offset; j < col_offset + col_length; j++)
+ for (int i = row_offset; i < row_offset + row_length; i++)
+ arr2d[j * M + i]++;
+ targdev = omp_get_device_num ();
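+ /* Inside the target region arr2d refers to the corresponding device
+    pointer; since only part of the array is mapped, this "virtual origin"
+    can lie outside the device allocation.  */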
+ dstptr = (uintptr_t) arr2d;
+ }
+
+ /* Copy the updated rectangular block back from the device to the host.  */
+ {
+ size_t volume[2] = { col_length, row_length };
+ size_t offsets[2] = { col_offset, row_offset };
+ size_t dimensions[2] = { N, M };
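+ /* Arguments: dst, src, element size, number of dimensions, volume
+    (elements per dimension), dst offsets, src offsets, dst dimensions,
+    src dimensions, dst device, src device.  */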
+ omp_target_memcpy_rect ((void *) arr2d, (const void *) dstptr,
+ sizeof (int), 2, &volume[0], &offsets[0],
+ &offsets[0], &dimensions[0], &dimensions[0],
+ hostdev, targdev);
+ }
+
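+ /* Use "release" since the rectangle has already been copied back above;
+    no device-to-host transfer is wanted here.  */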
+#pragma omp target exit data map(release: arr2d[col_offset*M:col_length*M])
+
+ for (int j = 0; j < N; j++)
+ for (int i = 0; i < M; i++)
+ if (i >= row_offset && i < row_offset + row_length
+ && j >= col_offset && j < col_offset + col_length)
+ assert (arr2d[j * M + i] == 1);
+ else
+ assert (arr2d[j * M + i] == 0);
+
+ free (arr2d);
+
+ return 0;
+}