CU_MEMORYTYPE_UNIFIED = 0x04
} CUmemorytype;
+typedef enum {
+ CU_POINTER_ATTRIBUTE_CONTEXT = 0x01,
+ CU_POINTER_ATTRIBUTE_MEMORY_TYPE = 0x02,
+ CU_POINTER_ATTRIBUTE_DEVICE_POINTER = 0x03,
+ CU_POINTER_ATTRIBUTE_HOST_POINTER = 0x04
+} CUpointer_attribute;
+
typedef struct {
size_t srcXInBytes, srcY;
CUmemorytype srcMemoryType;
CUresult cuModuleLoad (CUmodule *, const char *);
CUresult cuModuleLoadData (CUmodule *, const void *);
CUresult cuModuleUnload (CUmodule);
+CUresult cuPointerGetAttribute (void *, CUpointer_attribute, CUdeviceptr);
CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
CUoccupancyB2DSize, size_t, int);
typedef void (*CUstreamCallback)(CUstream, CUresult, void *);
extern bool GOMP_OFFLOAD_free (int, void *);
extern void *GOMP_OFFLOAD_managed_alloc (int, size_t);
extern bool GOMP_OFFLOAD_managed_free (int, void *);
+extern int GOMP_OFFLOAD_is_accessible_ptr (int, const void *, size_t);
extern bool GOMP_OFFLOAD_page_locked_host_alloc (void **, size_t);
extern bool GOMP_OFFLOAD_page_locked_host_free (void *);
extern bool GOMP_OFFLOAD_dev2host (int, void *, const void *, size_t);
__typeof (GOMP_OFFLOAD_free) *free_func;
__typeof (GOMP_OFFLOAD_managed_alloc) *managed_alloc_func;
__typeof (GOMP_OFFLOAD_managed_free) *managed_free_func;
+ __typeof (GOMP_OFFLOAD_is_accessible_ptr) *is_accessible_ptr_func;
__typeof (GOMP_OFFLOAD_page_locked_host_alloc) *page_locked_host_alloc_func;
__typeof (GOMP_OFFLOAD_page_locked_host_free) *page_locked_host_free_func;
__typeof (GOMP_OFFLOAD_dev2host) *dev2host_func;
@var{device_num}. If so, it returns a nonzero value and otherwise zero.
The address given by @var{ptr} is interpreted to be in the address space of
-the device and @var{size} must be positive.
+the device and @var{size} must be positive.  @code{NULL} pointers and
+zero-length ranges always return zero.
Note that GCC's current implementation assumes that @var{ptr} is a valid host
-pointer. Therefore, all addresses given by @var{ptr} are assumed to be
-accessible on the initial device. And, to err on the safe side, this memory
-is only available on a non-host device that can access all host memory
-([uniform] shared memory access).
+pointer.  Therefore, all non-@code{NULL} addresses given by @var{ptr} are
+assumed to be accessible on the initial device.  The address is only reported
+as accessible on non-host devices if this is @emph{known} to be the case, or
+if the device reports that all memory is accessible (i.e., [unified] shared
+memory access).  If the runtime is uncertain, it may report accessible memory
+as inaccessible.
+For a memory range to be reported accessible, the whole range must be known to
+be accessible.
Running this routine in a @code{target} region except on the initial device
is not supported.
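+
+For example, the following sketch prints @code{1} only if the runtime can
+prove that the host variable is reachable from the default device; on a
+non-host device without [unified] shared memory it may print @code{0} even
+for valid host memory:
+
+@smallexample
+#include <omp.h>
+#include <stdio.h>
+
+int
+main ()
+@{
+  int x;
+  int dev = omp_get_default_device ();
+  printf ("%d\n", omp_target_is_accessible (&x, sizeof (x), dev));
+  return 0;
+@}
+@end smallexample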
CUDA_ONE_CALL (cuModuleLoadData)
CUDA_ONE_CALL (cuModuleUnload)
CUDA_ONE_CALL_MAYBE_NULL (cuOccupancyMaxPotentialBlockSize)
+CUDA_ONE_CALL (cuPointerGetAttribute)
CUDA_ONE_CALL (cuStreamAddCallback)
CUDA_ONE_CALL (cuStreamCreate)
CUDA_ONE_CALL (cuStreamDestroy)
hsa_status_t (*hsa_amd_svm_attributes_set_fn)
(void* ptr, size_t size, hsa_amd_svm_attribute_pair_t* attribute_list,
size_t attribute_count);
+ hsa_status_t (*hsa_amd_svm_attributes_get_fn)
+ (void* ptr, size_t size, hsa_amd_svm_attribute_pair_t* attribute_list,
+ size_t attribute_count);
+ hsa_status_t (*hsa_amd_pointer_info_fn)
+ (const void *, hsa_amd_pointer_info_t *, void *(*)(size_t),
+ uint32_t *, hsa_agent_t **);
};
/* As an HSA runtime is dlopened, following structure defines function
DLSYM_OPT_FN (hsa_amd_memory_unlock)
DLSYM_OPT_FN (hsa_amd_memory_async_copy_rect)
DLSYM_OPT_FN (hsa_amd_svm_attributes_set)
+ DLSYM_OPT_FN (hsa_amd_svm_attributes_get)
+ DLSYM_OPT_FN (hsa_amd_pointer_info)
return true;
#undef DLSYM_OPT_FN
#undef DLSYM_FN
return true;
}
+enum accessible {
+ UNKNOWN,
+ INACCESSIBLE,
+ ACCESSIBLE
+};
+
+/* Is a host memory address accessible on the given device?
+ Returns UNKNOWN if the memory isn't registered, or if it isn't a valid host
+ pointer. */
+
+static enum accessible
+host_memory_is_accessible (hsa_agent_t agent, const void *ptr, size_t size)
+{
+ if (!hsa_fns.hsa_amd_svm_attributes_get_fn)
+ return UNKNOWN;
+
+  /* The HSA API doesn't report an aggregate result for a whole range, so we
+     query once for each page the range straddles.  */
+ const void *p = ptr;
+ size_t remaining = size;
+ do
+ {
+      /* Note: the access query result is returned in the attribute field.  */
+ struct hsa_amd_svm_attribute_pair_s attr = {
+ HSA_AMD_SVM_ATTRIB_ACCESS_QUERY, agent.handle
+ };
+ hsa_status_t status = hsa_fns.hsa_amd_svm_attributes_get_fn ((void*)p,
+ remaining,
+ &attr, 1);
+ if (status != HSA_STATUS_SUCCESS)
+ /* This happens when the memory isn't registered with ROCr at all. */
+ return UNKNOWN;
+
+ switch (attr.attribute)
+ {
+ case HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE:
+ case HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE:
+ break;
+ case HSA_AMD_SVM_ATTRIB_AGENT_NO_ACCESS:
+ default:
+ return INACCESSIBLE;
+ }
+
+      /* Advance to the start of the next 4 KiB page and recompute how much
+	 of the range remains.  */
+      p = (const void *) (((uintptr_t) p + 4096) & ~(uintptr_t) 0xfff);
+      remaining = size - ((uintptr_t) p - (uintptr_t) ptr);
+ } while (p < ptr + size);
+
+ /* All pages were accessible. */
+ return ACCESSIBLE;
+}
+
+/* Is a device memory address accessible on the given device?
+ Returns UNKNOWN if it isn't a valid device address. Returns INACCESSIBLE if
+ the pointer is valid, but not the whole range, or if it refers to the wrong
+ device. */
+
+static enum accessible
+device_memory_is_accessible (hsa_agent_t agent, const void *ptr, size_t size)
+{
+ if (!hsa_fns.hsa_amd_pointer_info_fn)
+ return UNKNOWN;
+
+  hsa_amd_pointer_info_t info;
+  uint32_t nagents = 0;
+  hsa_agent_t *agents = NULL;
+  info.size = sizeof (hsa_amd_pointer_info_t);
+
+  /* Pass malloc as the allocator so that the accessible-agents list is
+     actually returned; with a NULL allocator the HSA API omits it.  */
+  hsa_status_t status = hsa_fns.hsa_amd_pointer_info_fn (ptr, &info, malloc,
+							  &nagents, &agents);
+  if (status != HSA_STATUS_SUCCESS)
+    return UNKNOWN;
+
+  enum accessible result;
+  if (info.type == HSA_EXT_POINTER_TYPE_UNKNOWN)
+    result = UNKNOWN;
+  else if (agent.handle == info.agentOwner.handle)
+    result = (info.sizeInBytes >= size ? ACCESSIBLE : INACCESSIBLE);
+  else
+    {
+      result = INACCESSIBLE;
+      for (uint32_t i = 0; i < nagents; i++)
+	if (agent.handle == agents[i].handle)
+	  {
+	    result = (info.sizeInBytes >= size ? ACCESSIBLE : INACCESSIBLE);
+	    break;
+	  }
+    }
+
+  free (agents);
+  return result;
+}
+
+/* Backend implementation for omp_target_is_accessible. */
+
+int
+GOMP_OFFLOAD_is_accessible_ptr (int device, const void *ptr, size_t size)
+{
+  if (!init_hsa_context (false)
+      || device < 0 || device >= hsa_context.agent_count)
+    return 0;
+
+  struct agent_info *agent = get_agent_info (device);
+  if (!agent)
+    return 0;
+
+ enum accessible result;
+ result = host_memory_is_accessible (agent->id, ptr, size);
+ if (result == UNKNOWN)
+ result = device_memory_is_accessible (agent->id, ptr, size);
+ return result == ACCESSIBLE;
+}
+
/* }}} */
/* {{{ OpenACC Plugin API */
static struct ptx_device **ptx_devices;
+static bool using_usm = false;
+
/* "Native" GPU thread stack size. */
static unsigned native_gpu_thread_stack_size = 0;
if (num_devices > 0
&& (omp_requires_mask
& (GOMP_REQUIRES_UNIFIED_SHARED_MEMORY | GOMP_REQUIRES_SELF_MAPS)))
- for (int dev = 0; dev < num_devices; dev++)
- {
- int pi;
- CUresult r;
- r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
- CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS, dev);
- if (r != CUDA_SUCCESS || pi == 0)
- return -1;
- }
+ {
+ for (int dev = 0; dev < num_devices; dev++)
+ {
+ int pi;
+ CUresult r;
+ r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
+ CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS,
+ dev);
+ if (r != CUDA_SUCCESS || pi == 0)
+ return -1;
+ }
+
+ using_usm = true;
+ }
return num_devices;
}
return GOMP_OFFLOAD_free (ord, ptr);
}
+int
+GOMP_OFFLOAD_is_accessible_ptr (int ord, const void *ptr, size_t size)
+{
+{
+ /* USM implies access. */
+ if (using_usm)
+ return 1;
+
+ struct ptx_device *ptx_dev = ptx_devices[ord];
+ CUcontext old_ctx;
+  CUDA_CALL_ERET (0, cuCtxPushCurrent, ptx_dev->ctx);
+
+  /* The CUDA API does not permit testing a whole range, so we test each
+     4 KiB page within the range.  If any page is inaccessible, the whole
+     range is reported inaccessible.  */
+ const void *p = ptr;
+ int result = 1; /* All pages accessible. */
+ do
+ {
+ CUmemorytype mem_type;
+ CUresult res = CUDA_CALL_NOCHECK (cuPointerGetAttribute, &mem_type,
+ CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
+ (CUdeviceptr)p);
+      if (res != CUDA_SUCCESS)
+	{
+	  /* Memory is not registered, and therefore not accessible.  Leave
+	     the loop now; MEM_TYPE is not written on failure.  */
+	  result = 0;
+	  break;
+	}
+
+ switch (mem_type)
+ {
+ case CU_MEMORYTYPE_HOST:
+ case CU_MEMORYTYPE_UNIFIED:
+ case CU_MEMORYTYPE_DEVICE:
+ break;
+ case CU_MEMORYTYPE_ARRAY:
+ default:
+ result = 0; /* This page isn't accessible. */
+ }
+
+      /* Advance to the start of the next 4 KiB page.  */
+      p = (const void *) (((uintptr_t) p + 4096) & ~(uintptr_t) 0xfff);
+ } while (result && p < ptr + size);
+
+ CUDA_CALL_ASSERT (cuCtxPopCurrent, &old_ctx);
+ return result;
+}
+
bool
GOMP_OFFLOAD_page_locked_host_alloc (void **ptr, size_t size)
{
int
omp_target_is_accessible (const void *ptr, size_t size, int device_num)
{
+ if (ptr == NULL || size == 0)
+ return false;
+
if (device_num == omp_default_device)
device_num = gomp_get_default_device ();
if (devicep == NULL)
return false;
- /* TODO: Unified shared memory must be handled when available. */
+  /* Device-specific accessibility, e.g. managed memory.
+     is_accessible_ptr may, in future, report more than simply true or false,
+     but we can assume that positive responses mean accessible, and
+     zero/negative responses mean inaccessible.  */
+ if (devicep->is_accessible_ptr_func)
+ return (devicep->is_accessible_ptr_func (devicep->target_id, ptr, size)
+ > 0);
+
+ /* Unified shared memory (or true shared memory). */
+ if (devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
+ return true;
- return devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM;
+ return false;
}
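+
+/* For illustration only, not part of the plugin interface proper: a minimal
+   sketch of a device hook honoring the contract above, where
+   device_sees_all_host_memory is a hypothetical per-device query.  Real
+   plugins (see the GCN and nvptx implementations in this patch) use their
+   runtime's own pointer-inspection APIs instead.
+
+     int
+     GOMP_OFFLOAD_is_accessible_ptr (int dev, const void *ptr, size_t size)
+     {
+       if (device_sees_all_host_memory (dev))
+	 return 1;
+       // When in doubt, report inaccessible; callers treat <= 0 as "no".
+       return 0;
+     }  */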
int
DLSYM (free);
DLSYM_OPT (managed_alloc, managed_alloc);
DLSYM_OPT (managed_free, managed_free);
+ DLSYM_OPT (is_accessible_ptr, is_accessible_ptr);
DLSYM_OPT (page_locked_host_alloc, page_locked_host_alloc);
DLSYM_OPT (page_locked_host_free, page_locked_host_free);
DLSYM (dev2host);
#include <omp.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+//#define __builtin_abort() __builtin_printf ("fail at line %d\n", __LINE__)
int
main ()
int d = omp_get_default_device ();
int id = omp_get_initial_device ();
int n = omp_get_num_devices ();
- void *p;
+ int *p = (int*)malloc (sizeof (int));
if (d < 0 || d >= n)
d = id;
if (omp_target_is_accessible (p, sizeof (int), n + 1))
__builtin_abort ();
- /* Currently, a host pointer is accessible if the device supports shared
- memory or omp_target_is_accessible is executed on the host. This
- test case must be adapted when unified shared memory is avialable. */
int a[128];
for (int d = 0; d <= omp_get_num_devices (); d++)
{
+ if (omp_target_is_accessible (NULL, 1, d))
+ __builtin_abort ();
+
+ if (omp_target_is_accessible (p, 0, d))
+ __builtin_abort ();
+
+ /* Check if libgomp is treating the device as a shared memory device. */
int shared_mem = 0;
#pragma omp target map (alloc: shared_mem) device (d)
shared_mem = 1;
+
+ int heap_accessible = shared_mem;
if (omp_target_is_accessible (p, sizeof (int), d) != shared_mem)
- __builtin_abort ();
+ {
+ if (shared_mem)
+ __builtin_abort ();
+
+	/* shared_mem is false, but omp_target_is_accessible reports the
+	   memory as accessible, so verify that by actually reading it.  We
+	   should not do so unconditionally because if the report is wrong
+	   then we'll probably get a memory fault.  */
+	*p = 123;
+	uintptr_t addr = (uintptr_t)p;
+	#pragma omp target is_device_ptr(p) map(from:heap_accessible) \
+		device(d)
+	  /* Assign unconditionally: a "from" mapping copies back whatever
+	     the region leaves here, so it must not be left unset.  */
+	  heap_accessible = ((uintptr_t)p == addr && *p == 123);
+
+ if (!heap_accessible)
+ __builtin_abort ();
+ }
+
+ int stack_accessible = shared_mem;
if (omp_target_is_accessible (a, 128 * sizeof (int), d) != shared_mem)
- __builtin_abort ();
+ {
+ if (shared_mem)
+ __builtin_abort ();
+
+	/* shared_mem is false, but omp_target_is_accessible reports the
+	   memory as accessible, so verify that by actually reading it.  We
+	   should not do so unconditionally because if the report is wrong
+	   then we'll probably get a memory fault.  */
+	int test_accessible = 123;
+	uintptr_t addr = (uintptr_t)&test_accessible;
+	#pragma omp target has_device_addr(test_accessible) \
+		map(from:stack_accessible) device(d)
+	  /* Assign unconditionally: a "from" mapping copies back whatever
+	     the region leaves here, so it must not be left unset.  */
+	  stack_accessible = ((uintptr_t)&test_accessible == addr
+			      && test_accessible == 123);
+
+ if (!stack_accessible)
+ __builtin_abort ();
+ }
+ __builtin_printf ("device #%d: shared_mem=%d heap_accessible=%d "
+ "stack_accessible=%d\n",
+ d, shared_mem, heap_accessible, stack_accessible);
+
+ /* omp_target_is_accessible returns false if *any* of the array is
+ inaccessible, so we only check the aggregate result.
+ (Varying access observed on amdgcn without xnack.) */
+ bool accessible = true;
for (int i = 0; i < 128; i++)
- if (omp_target_is_accessible (&a[i], sizeof (int), d) != shared_mem)
- __builtin_abort ();
+ if (!omp_target_is_accessible (&a[i], sizeof (int), d))
+ accessible = false;
+ if (accessible != (shared_mem || stack_accessible))
+ __builtin_abort ();
}
return 0;
--- /dev/null
+/* { dg-require-effective-target omp_usm } */
+
+#pragma omp requires unified_shared_memory
+
+#include "target-is-accessible-1.c"
--- /dev/null
+/* { dg-require-effective-target offload_target_amdgcn_with_xnack } */
+/* { dg-additional-options "-foffload-options=amdgcn-amdhsa=-mxnack=on" } */
+
+#include "target-is-accessible-1.c"
--- /dev/null
+/* { dg-require-effective-target omp_managedmem } */
+
+#include <omp.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+//#define __builtin_abort() __builtin_printf ("fail at line %d\n", __LINE__)
+
+int
+main ()
+{
+  int *p = (int*)omp_alloc (sizeof (int), ompx_gnu_managed_mem_alloc);
+
+  /* Check the allocation before dereferencing it.  */
+  if (!p
+      || !omp_target_is_accessible (p, sizeof (int),
+				    omp_get_default_device ()))
+    __builtin_abort ();
+
+  *p = 42;
+  uintptr_t a_p = (uintptr_t)p;
+
+  #pragma omp target is_device_ptr(p)
+  {
+    if (*p != 42 || a_p != (uintptr_t)p)
+      __builtin_abort ();
+  }
+
+  omp_free (p, ompx_gnu_managed_mem_alloc);
+
+  return 0;
+}
--- /dev/null
+#include <omp.h>
+
+void check (int dev)
+{
+ constexpr int N = 10;
+ constexpr int size = N*sizeof(int);
+ int A[N] = {};
+
+ void *ptr = omp_target_alloc (size, dev);
+
+ if (ptr == nullptr || !omp_target_is_accessible (ptr, size, dev))
+ __builtin_abort ();
+
+ #pragma omp target device(dev) firstprivate(ptr)
+ for (int i = 0; i < N; i++)
+ ((int *)ptr)[i] = i + 1;
+
+ if (omp_target_memcpy (A, ptr, size, 0, 0, omp_initial_device, dev) != 0)
+ __builtin_abort ();
+
+ for (int i = 0; i < N; i++)
+ if (A[i] != i + 1)
+ __builtin_abort ();
+
+ omp_target_free (ptr, dev);
+}
+
+int main ()
+{
+ check (omp_default_device);
+ for (int dev = 0; dev <= omp_get_num_devices(); dev++)
+ check (dev);
+}
+! { dg-do run }
+
program main
use omp_lib
use iso_c_binding
implicit none (external, type)
- integer :: d, id, n, shared_mem, i
+ integer :: d, id, n, shared_mem, i, heap_accessible, stack_accessible
+ integer, target :: test_accessible
+ integer, allocatable, target :: p(:)
integer, target :: a(1:128)
- type(c_ptr) :: p
+ integer(c_intptr_t) :: addr
+ logical :: condition
d = omp_get_default_device ()
id = omp_get_initial_device ()
n = omp_get_num_devices ()
+ allocate (p(1))
if (d < 0 .or. d >= n) &
d = id
- if (omp_target_is_accessible (p, c_sizeof (d), n) /= 1) &
+ if (omp_target_is_accessible (c_loc(p), c_sizeof(p(1)), n) == 0) &
stop 1
- if (omp_target_is_accessible (p, c_sizeof (d), id) /= 1) &
+ if (omp_target_is_accessible (c_loc(p), c_sizeof(p(1)), id) == 0) &
stop 2
- if (omp_target_is_accessible (p, c_sizeof (d), omp_initial_device) /= 1) &
+ if (omp_target_is_accessible (c_loc(p), c_sizeof(p(1)), omp_initial_device) == 0) &
stop 3
- if (omp_target_is_accessible (p, c_sizeof (d), -6) /= 0) & ! -6 = omp_default_device - 1
+ if (omp_target_is_accessible (c_loc(p), c_sizeof(p(1)), -6) /= 0) & ! -6 = omp_default_device - 1
stop 4
- if (omp_target_is_accessible (p, c_sizeof (d), n + 1) /= 0) &
+ if (omp_target_is_accessible (c_loc(p), c_sizeof(p(1)), n + 1) /= 0) &
stop 5
- ! Currently, a host pointer is accessible if the device supports shared
- ! memory or omp_target_is_accessible is executed on the host. This
- ! test case must be adapted when unified shared memory is avialable.
do d = 0, omp_get_num_devices ()
- shared_mem = 0;
+ ! Check if libgomp is treating the device as a shared memory device.
+ shared_mem = 0
!$omp target map (alloc: shared_mem) device (d)
- shared_mem = 1;
+ shared_mem = 1
!$omp end target
- if (omp_target_is_accessible (p, c_sizeof (d), d) /= shared_mem) &
- stop 6;
+ heap_accessible = shared_mem
+ condition = omp_target_is_accessible (c_loc(p), c_sizeof(p(1)), d) /= shared_mem
+ if (condition) then
+ if (shared_mem /= 0) &
+ stop 6
+
+    ! shared_mem is false, but omp_target_is_accessible reports the
+    ! memory as accessible, so verify that by actually reading it. We
+    ! should not do so unconditionally because if the report is wrong
+    ! then we'll probably get a memory fault.
+    p(1) = 123
+    addr = transfer(c_loc(p), addr)
+
+    ! Assign heap_accessible unconditionally: a "from" mapping copies back
+    ! whatever the region leaves in it, so it must not be left unset.
+    !$omp target has_device_addr(p) map(from:heap_accessible) device(d)
+    if (transfer(c_loc(p), addr) == addr .and. p(1) == 123) then
+      heap_accessible = 1
+    else
+      heap_accessible = 0
+    end if
+    !$omp end target
+
+ if (heap_accessible == 0) &
+ stop 7
+ end if
+
+ stack_accessible = shared_mem
+ condition = omp_target_is_accessible (c_loc(a), 128 * c_sizeof(a(1)), d) /= shared_mem
+ if (condition) then
+ if (shared_mem /= 0) &
+ stop 8
- if (omp_target_is_accessible (c_loc (a), 128 * sizeof (a(1)), d) /= shared_mem) &
- stop 7;
+    ! shared_mem is false, but omp_target_is_accessible reports the
+    ! memory as accessible, so verify that by actually reading it. We
+    ! should not do so unconditionally because if the report is wrong
+    ! then we'll probably get a memory fault.
+    test_accessible = 123
+    addr = transfer(c_loc(test_accessible), addr)
+
+    ! Assign stack_accessible unconditionally: a "from" mapping copies back
+    ! whatever the region leaves in it, so it must not be left unset.
+    !$omp target has_device_addr(test_accessible) map(from:stack_accessible) device(d)
+    if (transfer(c_loc(test_accessible), addr) == addr &
+        .and. test_accessible == 123) then
+      stack_accessible = 1
+    else
+      stack_accessible = 0
+    end if
+    !$omp end target
+
+ if (stack_accessible == 0) &
+ stop 9
+ end if
+
+ print '(A,I0,A,I0,A,I0,A,I0)', &
+ 'device #', d, &
+ ': shared_mem=', shared_mem, &
+ ' heap_accessible=', heap_accessible, &
+ ' stack_accessible=', stack_accessible
+
+ ! omp_target_is_accessible returns false if *any* of the array is
+ ! inaccessible, so we only check the aggregate result.
+ ! (Varying access observed on amdgcn without xnack.)
+ condition = .true.
do i = 1, 128
- if (omp_target_is_accessible (c_loc (a(i)), sizeof (a(i)), d) /= shared_mem) &
- stop 8;
+ if (omp_target_is_accessible (c_loc(a(i)), c_sizeof(a(i)), d) == 0) &
+ condition = .false.
end do
-
+ if (condition .neqv. stack_accessible /= 0) &
+ stop 10
end do
+ deallocate (p)
+
end program main