}
/* Assume that a constant expression in the range 1 (omp_default_mem_alloc)
- to 8 (omp_thread_mem_alloc) range, or 200 (ompx_gnu_pinned_mem_alloc) is
- fine. The original symbol name is already lost during matching via
- gfc_match_expr. */
+ to GOMP_OMP_PREDEF_ALLOC_MAX, or GOMP_OMPX_PREDEF_ALLOC_MIN to
+ GOMP_OMPX_PREDEF_ALLOC_MAX is fine. The original symbol name is already
+ lost during matching via gfc_match_expr. */
static bool
is_predefined_allocator (gfc_expr *expr)
{
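For context, the acceptance test described in the comment above reduces to two range checks on the expression's constant value; a minimal sketch (hypothetical helper name, assuming the GOMP_* range macros and <stdbool.h> are visible; the real gfortran code must additionally verify that expr is an integer constant expression):

static bool
value_in_predefined_alloc_range (long v)
{
  /* Accept 1..GOMP_OMP_PREDEF_ALLOC_MAX and
     GOMP_OMPX_PREDEF_ALLOC_MIN..GOMP_OMPX_PREDEF_ALLOC_MAX.  */
  return ((v >= 1 && v <= GOMP_OMP_PREDEF_ALLOC_MAX)
          || (v >= GOMP_OMPX_PREDEF_ALLOC_MIN
              && v <= GOMP_OMPX_PREDEF_ALLOC_MAX));
}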
CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88
} CUdevice_attribute;
+typedef enum {
+ CU_MEM_ATTACH_GLOBAL = 0x1
+} CUmemAttach_flags;
+
enum {
CU_EVENT_DEFAULT = 0,
CU_EVENT_DISABLE_TIMING = 2
#define cuMemAllocHost cuMemAllocHost_v2
CUresult cuMemAllocHost (void **, size_t);
CUresult cuMemHostAlloc (void **, size_t, unsigned int);
+CUresult cuMemAllocManaged (CUdeviceptr *, size_t, unsigned int);
CUresult cuMemcpy (CUdeviceptr, CUdeviceptr, size_t);
CUresult cuMemcpyPeer (CUdeviceptr, CUcontext, CUdeviceptr, CUcontext, size_t);
CUresult cuMemcpyPeerAsync (CUdeviceptr, CUcontext, CUdeviceptr, CUcontext, size_t, CUstream);
/* Predefined allocator value ranges. */
#define GOMP_OMP_PREDEF_ALLOC_MAX 8
#define GOMP_OMPX_PREDEF_ALLOC_MIN 200
-#define GOMP_OMPX_PREDEF_ALLOC_MAX 200
+#define GOMP_OMPX_PREDEF_ALLOC_MAX 201
+
+/* Predefined memspace value ranges. */
+#define GOMP_OMP_PREDEF_MEMSPACE_MAX 4
+#define GOMP_OMPX_PREDEF_MEMSPACE_MIN 200
+#define GOMP_OMPX_PREDEF_MEMSPACE_MAX 200
/* Predefined allocator with access == thread. */
#define GOMP_OMP_PREDEF_ALLOC_THREADS 8
#define omp_max_predefined_alloc omp_thread_mem_alloc
#define ompx_gnu_min_predefined_alloc ompx_gnu_pinned_mem_alloc
-#define ompx_gnu_max_predefined_alloc ompx_gnu_pinned_mem_alloc
+#define ompx_gnu_max_predefined_alloc ompx_gnu_managed_mem_alloc
_Static_assert (GOMP_OMP_PREDEF_ALLOC_MAX == omp_thread_mem_alloc,
"GOMP_OMP_PREDEF_ALLOC_MAX == omp_thread_mem_alloc");
_Static_assert (GOMP_OMPX_PREDEF_ALLOC_MIN == ompx_gnu_min_predefined_alloc,
- "GOMP_OMP_PREDEF_ALLOC_MAX == omp_thread_mem_alloc");
+ "GOMP_OMPX_PREDEF_ALLOC_MIN == ompx_gnu_min_predefined_alloc");
_Static_assert (GOMP_OMPX_PREDEF_ALLOC_MAX == ompx_gnu_max_predefined_alloc,
- "GOMP_OMP_PREDEF_ALLOC_MAX == omp_thread_mem_alloc");
+ "GOMP_OMPX_PREDEF_ALLOC_MAX == ompx_gnu_max_predefined_alloc");
_Static_assert (GOMP_OMP_PREDEF_ALLOC_THREADS == omp_thread_mem_alloc,
"GOMP_OMP_PREDEF_ALLOC_THREADS == omp_thread_mem_alloc");
+#define omp_max_predefined_mem_space omp_low_lat_mem_space
+#define ompx_gnu_min_predefined_mem_space ompx_gnu_managed_mem_space
+#define ompx_gnu_max_predefined_mem_space ompx_gnu_managed_mem_space
+
+_Static_assert (GOMP_OMP_PREDEF_MEMSPACE_MAX == omp_max_predefined_mem_space,
+ "GOMP_OMP_PREDEF_MEMSPACE_MAX == omp_max_predefined_mem_space");
+_Static_assert (GOMP_OMPX_PREDEF_MEMSPACE_MIN == ompx_gnu_min_predefined_mem_space,
+ "GOMP_OMPX_PREDEF_MEMSPACE_MIN == ompx_gnu_min_predefined_mem_space");
+_Static_assert (GOMP_OMPX_PREDEF_MEMSPACE_MAX == ompx_gnu_max_predefined_mem_space,
+ "GOMP_OMPX_PREDEF_MEMSPACE_MAX == ompx_gnu_max_predefined_mem_space");
+
+#if 0 /* Only for testing that the fall-back macros compile. */
+#undef MEMSPACE_ALLOC
+#undef MEMSPACE_CALLOC
+#undef MEMSPACE_REALLOC
+#undef MEMSPACE_FREE
+#undef MEMSPACE_VALIDATE
+#endif
+
/* These macros may be overridden in config/<target>/allocator.c.
The defaults (no override) are to return NULL for pinned memory requests
- and pass through to the regular OS calls otherwise.
+ or non-standard memory spaces (these need a deliberate implementation), and
+ pass through to the regular OS calls otherwise.
The following definitions (ab)use comma operators to avoid unused
variable errors. */
#ifndef MEMSPACE_ALLOC
#define MEMSPACE_ALLOC(MEMSPACE, SIZE, PIN) \
- (PIN ? NULL : malloc (((void)(MEMSPACE), (SIZE))))
+ ((PIN) || (MEMSPACE) > GOMP_OMP_PREDEF_MEMSPACE_MAX \
+ ? NULL : malloc (((void)(MEMSPACE), (SIZE))))
#endif
#ifndef MEMSPACE_CALLOC
#define MEMSPACE_CALLOC(MEMSPACE, SIZE, PIN) \
- (PIN ? NULL : calloc (1, (((void)(MEMSPACE), (SIZE)))))
+ ((PIN) || (MEMSPACE) > GOMP_OMP_PREDEF_MEMSPACE_MAX \
+ ? NULL : calloc (1, (((void)(MEMSPACE), (SIZE)))))
#endif
#ifndef MEMSPACE_REALLOC
#define MEMSPACE_REALLOC(MEMSPACE, ADDR, OLDSIZE, SIZE, OLDPIN, PIN) \
- ((PIN) || (OLDPIN) ? NULL \
- : realloc (ADDR, (((void)(MEMSPACE), (void)(OLDSIZE), (SIZE)))))
+ ((PIN) || (OLDPIN) || (MEMSPACE) > GOMP_OMP_PREDEF_MEMSPACE_MAX \
+ ? NULL \
+ : realloc (ADDR, (((void)(MEMSPACE), (void)(OLDSIZE), (SIZE)))))
#endif
#ifndef MEMSPACE_FREE
#define MEMSPACE_FREE(MEMSPACE, ADDR, SIZE, PIN) \
  free (((void)(MEMSPACE), (void)(SIZE), (void)(PIN), (ADDR)))
#endif
#ifndef MEMSPACE_VALIDATE
#define MEMSPACE_VALIDATE(MEMSPACE, ACCESS, PIN) \
- (PIN ? 0 : ((void)(MEMSPACE), (void)(ACCESS), 1))
+ ((PIN) || (MEMSPACE) > GOMP_OMP_PREDEF_MEMSPACE_MAX \
+ ? 0 : ((void)(MEMSPACE), (void)(ACCESS), 1))
#endif
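To make the comma-operator idiom concrete: in each fall-back expansion, ((void)(MEMSPACE), (SIZE)) evaluates MEMSPACE, discards the result via the void cast so the argument counts as used, and then yields SIZE as the value of the whole expression. A minimal stand-alone illustration (not part of libgomp; example_alloc is a hypothetical name):

#include <stdlib.h>

/* Mirrors the MEMSPACE_ALLOC fall-back above: memspace is "used" via the
   comma operator even though only pin and size affect the result.  */
static void *
example_alloc (int memspace, size_t size, int pin)
{
  return (pin ? NULL : malloc (((void)(memspace), (size))));
}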
/* Map the predefined allocators to the correct memory space.
};
static const omp_memspace_handle_t predefined_ompx_gnu_alloc_mapping[] = {
omp_default_mem_space, /* ompx_gnu_pinned_mem_alloc. */
+ ompx_gnu_managed_mem_space, /* ompx_gnu_managed_mem_alloc. */
};
#define ARRAY_SIZE(A) (sizeof (A) / sizeof ((A)[0]))
struct omp_allocator_data *ret;
int i;
- if (memspace > omp_low_lat_mem_space)
+ if (memspace > omp_max_predefined_mem_space
+ && (memspace < ompx_gnu_min_predefined_mem_space
+ || memspace > ompx_gnu_max_predefined_mem_space))
return omp_null_allocator;
for (i = 0; i < ntraits; i++)
switch (traits[i].key)
return __gcn_lowlat_alloc (shared_pool, size);
}
+ else if (memspace > GOMP_OMP_PREDEF_MEMSPACE_MAX)
+ /* No non-standard memspaces are implemented for device-side amdgcn. */
+ return NULL;
else
return malloc (size);
}
static void *
return __gcn_lowlat_calloc (shared_pool, size);
}
+ else if (memspace > GOMP_OMP_PREDEF_MEMSPACE_MAX)
+ /* No non-standard memspaces are implemented for device-side amdgcn. */
+ return NULL;
else
return calloc (1, size);
}
return __gcn_lowlat_realloc (shared_pool, addr, oldsize, size);
}
+ else if (memspace > GOMP_OMP_PREDEF_MEMSPACE_MAX)
+ /* No non-standard memspaces are implemented for device-side amdgcn. */
+ return NULL;
else
return realloc (addr, size);
}
{
/* Disallow use of low-latency memory when it must be accessible by
all threads. */
- return (memspace != omp_low_lat_mem_space
- || access != omp_atv_all);
+ if (memspace == omp_low_lat_mem_space
+ && access == omp_atv_all)
+ return false;
+
+ /* Otherwise, standard memspaces are accepted, even when we don't have
+ anything special to do with them, and non-standard memspaces are assumed
+ to need explicit support. */
+ return (memspace <= GOMP_OMP_PREDEF_MEMSPACE_MAX);
}
#define MEMSPACE_ALLOC(MEMSPACE, SIZE, PIN) \
{
void *addr = NULL;
- if (pin)
+ if (memspace == ompx_gnu_managed_mem_space)
+ addr = gomp_managed_alloc (size);
+ else if (pin)
{
int using_device = __atomic_load_n (&using_device_for_page_locked,
MEMMODEL_RELAXED);
static void *
linux_memspace_calloc (omp_memspace_handle_t memspace, size_t size, int pin)
{
- if (pin)
+ if (memspace == ompx_gnu_managed_mem_space)
+ {
+ void *ret = gomp_managed_alloc (size);
+ if (!ret)
+ return NULL;
+ memset (ret, 0, size);
+ return ret;
+ }
+ else if (pin)
return linux_memspace_alloc (memspace, size, pin, true);
else
return calloc (1, size);
linux_memspace_free (omp_memspace_handle_t memspace, void *addr, size_t size,
int pin)
{
- if (pin)
+ if (memspace == ompx_gnu_managed_mem_space)
+ gomp_managed_free (addr);
+ else if (pin)
{
int using_device
= __atomic_load_n (&using_device_for_page_locked,
linux_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
size_t oldsize, size_t size, int oldpin, int pin)
{
- if (oldpin && pin)
+ if (memspace == ompx_gnu_managed_mem_space)
+    /* Realloc is not implemented for device Managed Memory; fall through. */
+ ;
+ else if (oldpin && pin)
{
int using_device
= __atomic_load_n (&using_device_for_page_locked,
static int
linux_memspace_validate (omp_memspace_handle_t, unsigned, int)
{
- /* Everything should be accepted on Linux, including pinning. */
+ /* Everything should be accepted on Linux, including pinning and
+ non-standard memspaces. */
return 1;
}
return __nvptx_lowlat_alloc (shared_pool, size);
}
+ else if (memspace > GOMP_OMP_PREDEF_MEMSPACE_MAX)
+ /* No non-standard memspaces are implemented for device-side nvptx. */
+ return NULL;
else
return malloc (size);
}
return __nvptx_lowlat_calloc (shared_pool, size);
}
+ else if (memspace > GOMP_OMP_PREDEF_MEMSPACE_MAX)
+ /* No non-standard memspaces are implemented for device-side nvptx. */
+ return NULL;
else
return calloc (1, size);
}
return __nvptx_lowlat_realloc (shared_pool, addr, oldsize, size);
}
+ else if (memspace > GOMP_OMP_PREDEF_MEMSPACE_MAX)
+ /* No non-standard memspaces are implemented for device-side nvptx. */
+ return NULL;
else
return realloc (addr, size);
}
|| (__PTX_ISA_VERSION_MAJOR__ == 4 && __PTX_ISA_VERSION_MINOR__ >= 1)
/* Disallow use of low-latency memory when it must be accessible by
all threads. */
- return (memspace != omp_low_lat_mem_space
- || access != omp_atv_all);
+ if (memspace == omp_low_lat_mem_space
+ && access == omp_atv_all)
+ return false;
#else
/* Low-latency memory is not available before PTX 4.1. */
- return (memspace != omp_low_lat_mem_space);
+ if (memspace == omp_low_lat_mem_space)
+ return false;
#endif
+
+ /* Otherwise, standard memspaces are accepted, even when we don't have
+ anything special to do with them, and non-standard memspaces are assumed
+ to need explicit support. */
+ return (memspace <= GOMP_OMP_PREDEF_MEMSPACE_MAX);
}
#define MEMSPACE_ALLOC(MEMSPACE, SIZE, PIN) \
return false;
}
+/* These are reminders to add new allocators and memory spaces to
+   parse_allocator. */
+_Static_assert (GOMP_OMP_PREDEF_ALLOC_MAX == omp_thread_mem_alloc,
+                "GOMP_OMP_PREDEF_ALLOC_MAX == omp_thread_mem_alloc");
+_Static_assert (GOMP_OMPX_PREDEF_ALLOC_MAX == ompx_gnu_managed_mem_alloc,
+                "GOMP_OMPX_PREDEF_ALLOC_MAX == ompx_gnu_managed_mem_alloc");
+_Static_assert (GOMP_OMP_PREDEF_MEMSPACE_MAX == omp_low_lat_mem_space,
+                "GOMP_OMP_PREDEF_MEMSPACE_MAX == omp_low_lat_mem_space");
+_Static_assert (GOMP_OMPX_PREDEF_MEMSPACE_MAX == ompx_gnu_managed_mem_space,
+                "GOMP_OMPX_PREDEF_MEMSPACE_MAX == ompx_gnu_managed_mem_space");
+
/* Parse the OMP_ALLOCATOR environment variable and return the value. */
static bool
parse_allocator (const char *env, const char *val, void *const params[])
++val;
if (0)
;
-#define C(v, m) \
+#define C(v, is_memspace) \
else if (strncasecmp (val, #v, sizeof (#v) - 1) == 0) \
{ \
*ret = v; \
val += sizeof (#v) - 1; \
- memspace = m; \
+ memspace = is_memspace; \
}
C (omp_default_mem_alloc, false)
C (omp_large_cap_mem_alloc, false)
C (omp_pteam_mem_alloc, false)
C (omp_thread_mem_alloc, false)
C (ompx_gnu_pinned_mem_alloc, false)
+ C (ompx_gnu_managed_mem_alloc, false)
C (omp_default_mem_space, true)
C (omp_large_cap_mem_space, true)
C (omp_const_mem_space, true)
C (omp_high_bw_mem_space, true)
C (omp_low_lat_mem_space, true)
+ C (ompx_gnu_managed_mem_space, true)
#undef C
else
goto invalid;
extern bool GOMP_OFFLOAD_unload_image (int, unsigned, const void *);
extern void *GOMP_OFFLOAD_alloc (int, size_t);
extern bool GOMP_OFFLOAD_free (int, void *);
+extern void *GOMP_OFFLOAD_managed_alloc (int, size_t);
+extern bool GOMP_OFFLOAD_managed_free (int, void *);
extern bool GOMP_OFFLOAD_page_locked_host_alloc (void **, size_t);
extern bool GOMP_OFFLOAD_page_locked_host_free (void *);
extern bool GOMP_OFFLOAD_dev2host (int, void *, const void *, size_t);
extern bool gomp_target_task_fn (void *);
extern void gomp_target_rev (uint64_t, uint64_t, uint64_t, uint64_t, uint64_t,
int, struct goacc_asyncqueue *);
+extern void *gomp_managed_alloc (size_t);
+extern void gomp_managed_free (void *);
extern bool gomp_page_locked_host_alloc (void **, size_t);
extern void gomp_page_locked_host_free (void *);
__typeof (GOMP_OFFLOAD_unload_image) *unload_image_func;
__typeof (GOMP_OFFLOAD_alloc) *alloc_func;
__typeof (GOMP_OFFLOAD_free) *free_func;
+ __typeof (GOMP_OFFLOAD_managed_alloc) *managed_alloc_func;
+ __typeof (GOMP_OFFLOAD_managed_free) *managed_free_func;
__typeof (GOMP_OFFLOAD_page_locked_host_alloc) *page_locked_host_alloc_func;
__typeof (GOMP_OFFLOAD_page_locked_host_free) *page_locked_host_free_func;
__typeof (GOMP_OFFLOAD_dev2host) *dev2host_func;
@item omp_pteam_mem_alloc @tab omp_low_lat_mem_space (implementation defined)
@item omp_thread_mem_alloc @tab omp_low_lat_mem_space (implementation defined)
@item ompx_gnu_pinned_mem_alloc @tab omp_default_mem_space (GNU extension)
+@item ompx_gnu_managed_mem_alloc @tab ompx_gnu_managed_mem_space (GNU extension)
@end multitable
Each predefined allocator, including @code{omp_null_allocator}, has a corresponding
@item omp_pteam_mem_alloc @tab omp::allocator::pteam_mem
@item omp_thread_mem_alloc @tab omp::allocator::thread_mem
@item ompx_gnu_pinned_mem_alloc @tab ompx::allocator::gnu_pinned_mem
+@item ompx_gnu_managed_mem_alloc @tab ompx::allocator::gnu_managed_mem
@end multitable
The following traits are available when constructing a new allocator;
unless the memkind library is available
@item @code{omp_high_bw_mem_space} maps to @code{omp_default_mem_space},
unless the memkind library is available
+@item @code{ompx_gnu_managed_mem_space} is a GNU extension that provides
+ managed memory accessible by both host and devices. The memory space is
+ available if the offload target associated with the
+ @var{default-device-var} ICV supports managed memory (see
+ @ref{Offload-Target Specifics}). This memory is accessible by both the
+ host and the device at the same address, so it need not be mapped with
+ @code{map} clauses. Instead, use the @code{is_device_ptr} clause or
+ @code{has_device_addr} clause to indicate that the pointer is already
+ accessible on the device. If managed memory is not supported by the
+ default device, as configured at the moment the allocator is called, then
+ the allocator will use the fall-back setting. If the default device is
+ configured differently when the memory is freed, via @code{omp_free} or
+ @code{omp_realloc}, the result may be undefined.
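+
+  For illustration, the typical usage pattern looks as follows (a sketch
+  based on the test cases added for this feature; error handling omitted):
+
+  @smallexample
+  int *p = (int *) omp_alloc (sizeof (int), ompx_gnu_managed_mem_alloc);
+  *p = 42;                            /* Host write, no mapping needed.  */
+  #pragma omp target is_device_ptr(p)
+    *p += 1;                          /* Device sees the same address.  */
+  omp_free (p, ompx_gnu_managed_mem_alloc);
+  @end smallexample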
@end itemize
On Linux systems, where the @uref{https://github.com/memkind/memkind, memkind
a performance boost for NVPTX offload code and also allows unlimited use
of pinned memory regardless of the OS @code{ulimit}/@code{rlimit}
settings.
+@item Managed memory, allocated with the OpenMP
+  @code{ompx_gnu_managed_mem_alloc} allocator or in the
+  @code{ompx_gnu_managed_mem_space} (both GNU extensions), is not
+  currently supported for AMD GPU devices; requesting it from an
+  allocator triggers the allocator's fall-back trait.
@item The OpenMP routines @code{omp_target_memcpy_rect} and
@code{omp_target_memcpy_rect_async} and the @code{target update}
directive for non-contiguous list items use the 3D memory-copy function
@code{omp_thread_mem_alloc}, all use low-latency memory as first
preference, and fall back to main graphics memory when the low-latency
pool is exhausted.
+@item Memory allocated on the host with the
+  @code{ompx_gnu_managed_mem_alloc} allocator or in the
+  @code{ompx_gnu_managed_mem_space} (both GNU extensions) is placed in
+  the CUDA Managed Memory space using @code{cuMemAllocManaged}. This
+ memory is accessible by both the host and the device at the same address,
+ so it need not be mapped with @code{map} clauses. Instead, use the
+ @code{is_device_ptr} clause or @code{has_device_addr} clause to indicate
+ that the pointer is already accessible on the device. The CUDA runtime
+ will automatically handle data migration between host and device as
+ needed. If managed memory is not supported by the default device, as
+ configured at the moment the allocator is called, then the allocator will
+ use the fall-back setting. If the default device is configured
+ differently when the memory is freed, via @code{omp_free} or
+ @code{omp_realloc}, the result may be undefined.
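+
+  For example, a user-defined allocator in this memory space can make the
+  fall-back behavior explicit; a sketch (@code{omp_atv_null_fb} makes the
+  allocation fail with a null pointer instead of falling back):
+
+  @smallexample
+  omp_alloctrait_t traits[] = @{ @{ omp_atk_fallback, omp_atv_null_fb @} @};
+  omp_allocator_handle_t a
+    = omp_init_allocator (ompx_gnu_managed_mem_space, 1, traits);
+  void *p = omp_alloc (1024, a);  /* NULL if managed memory is unavailable.  */
+  omp_free (p, a);
+  omp_destroy_allocator (a);
+  @end smallexample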
@item The OpenMP routines @code{omp_target_memcpy_rect} and
@code{omp_target_memcpy_rect_async} and the @code{target update}
directive for non-contiguous list items use the 2D and 3D memory-copy
omp_const_mem_space = 2,
omp_high_bw_mem_space = 3,
omp_low_lat_mem_space = 4,
+ ompx_gnu_managed_mem_space = 200,
__omp_memspace_handle_t_max__ = __UINTPTR_MAX__
} omp_memspace_handle_t;
omp_pteam_mem_alloc = 7,
omp_thread_mem_alloc = 8,
ompx_gnu_pinned_mem_alloc = 200,
+ ompx_gnu_managed_mem_alloc = 201,
__omp_allocator_handle_t_max__ = __UINTPTR_MAX__
} omp_allocator_handle_t;
struct gnu_pinned_mem
: omp::allocator::__detail::__allocator_templ <__T,
ompx_gnu_pinned_mem_alloc> {};
+template <typename __T>
+struct gnu_managed_mem
+ : omp::allocator::__detail::__allocator_templ <__T,
+ ompx_gnu_managed_mem_alloc> {};
} /* namespace allocator */
parameter :: omp_thread_mem_alloc = 8
integer (kind=omp_allocator_handle_kind), &
parameter :: ompx_gnu_pinned_mem_alloc = 200
+ integer (kind=omp_allocator_handle_kind), &
+ parameter :: ompx_gnu_managed_mem_alloc = 201
integer (omp_memspace_handle_kind), &
parameter :: omp_default_mem_space = 0
integer (omp_memspace_handle_kind), &
parameter :: omp_high_bw_mem_space = 3
integer (omp_memspace_handle_kind), &
parameter :: omp_low_lat_mem_space = 4
+ integer (omp_memspace_handle_kind), &
+ parameter :: ompx_gnu_managed_mem_space = 200
integer, parameter :: omp_initial_device = -1
integer, parameter :: omp_invalid_device = -4
integer, parameter :: omp_default_device = -5
integer (omp_allocator_handle_kind) omp_pteam_mem_alloc
integer (omp_allocator_handle_kind) omp_thread_mem_alloc
integer (omp_allocator_handle_kind) ompx_gnu_pinned_mem_alloc
+ integer (omp_allocator_handle_kind) ompx_gnu_managed_mem_alloc
parameter (omp_null_allocator = 0)
parameter (omp_default_mem_alloc = 1)
parameter (omp_large_cap_mem_alloc = 2)
parameter (omp_pteam_mem_alloc = 7)
parameter (omp_thread_mem_alloc = 8)
parameter (ompx_gnu_pinned_mem_alloc = 200)
+ parameter (ompx_gnu_managed_mem_alloc = 201)
integer (omp_memspace_handle_kind) omp_default_mem_space
integer (omp_memspace_handle_kind) omp_large_cap_mem_space
integer (omp_memspace_handle_kind) omp_const_mem_space
integer (omp_memspace_handle_kind) omp_high_bw_mem_space
integer (omp_memspace_handle_kind) omp_low_lat_mem_space
+ integer (omp_memspace_handle_kind) ompx_gnu_managed_mem_space
parameter (omp_default_mem_space = 0)
parameter (omp_large_cap_mem_space = 1)
parameter (omp_const_mem_space = 2)
parameter (omp_high_bw_mem_space = 3)
parameter (omp_low_lat_mem_space = 4)
+ parameter (ompx_gnu_managed_mem_space = 200)
integer omp_initial_device, omp_invalid_device, omp_default_device
parameter (omp_initial_device = -1)
parameter (omp_invalid_device = -4)
CUDA_ONE_CALL (cuMemAlloc)
CUDA_ONE_CALL (cuMemAllocHost)
CUDA_ONE_CALL (cuMemHostAlloc)
+CUDA_ONE_CALL (cuMemAllocManaged)
CUDA_ONE_CALL (cuMemcpy)
CUDA_ONE_CALL (cuMemcpyDtoDAsync)
CUDA_ONE_CALL (cuMemcpyDtoH)
}
static void *
-nvptx_alloc (size_t s, bool suppress_errors)
+nvptx_alloc (size_t s, bool suppress_errors, bool managed)
{
CUdeviceptr d;
- CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &d, s);
+ CUresult r = (managed ? CUDA_CALL_NOCHECK (cuMemAllocManaged, &d, s,
+ CU_MEM_ATTACH_GLOBAL)
+ : CUDA_CALL_NOCHECK (cuMemAlloc, &d, s));
if (suppress_errors && r == CUDA_ERROR_OUT_OF_MEMORY)
return NULL;
else if (r != CUDA_SUCCESS)
return ret;
}
-void *
-GOMP_OFFLOAD_alloc (int ord, size_t size)
+static void *
+cleanup_and_alloc (int ord, size_t size, bool managed)
{
if (!nvptx_attach_host_thread_to_device (ord))
return NULL;
blocks = tmp;
}
- void *d = nvptx_alloc (size, true);
+ void *d = nvptx_alloc (size, true, managed);
if (d)
return d;
else
/* Memory allocation failed. Try freeing the stacks block, and
retrying. */
nvptx_stacks_free (ptx_dev, true);
- return nvptx_alloc (size, false);
+ return nvptx_alloc (size, false, managed);
}
}
+void *
+GOMP_OFFLOAD_alloc (int ord, size_t size)
+{
+ return cleanup_and_alloc (ord, size, false);
+}
+
+void *
+GOMP_OFFLOAD_managed_alloc (int ord, size_t size)
+{
+ return cleanup_and_alloc (ord, size, true);
+}
+
bool
GOMP_OFFLOAD_free (int ord, void *ptr)
{
&& nvptx_free (ptr, ptx_devices[ord]));
}
+bool
+GOMP_OFFLOAD_managed_free (int ord, void *ptr)
+{
+ return GOMP_OFFLOAD_free (ord, ptr);
+}
+
bool
GOMP_OFFLOAD_page_locked_host_alloc (void **ptr, size_t size)
{
gomp_mutex_unlock (&devicep->lock);
}
+void *
+gomp_managed_alloc (size_t size)
+{
+ struct gomp_task_icv *icv = gomp_icv (false);
+ struct gomp_device_descr *devicep = resolve_device (icv->default_device_var,
+ false);
+ if (devicep == NULL)
+ return NULL;
+
+ void *ret = NULL;
+ gomp_mutex_lock (&devicep->lock);
+ if (devicep->managed_alloc_func)
+ ret = devicep->managed_alloc_func (devicep->target_id, size);
+ gomp_mutex_unlock (&devicep->lock);
+ return ret;
+}
+
+void
+gomp_managed_free (void *device_ptr)
+{
+ if (device_ptr == NULL)
+ return;
+
+ struct gomp_task_icv *icv = gomp_icv (false);
+ struct gomp_device_descr *devicep = resolve_device (icv->default_device_var,
+ false);
+ if (devicep == NULL)
+ gomp_fatal ("attempted to free managed memory at %p, but the default "
+ "device is set to the host device", device_ptr);
+
+ gomp_mutex_lock (&devicep->lock);
+ if (!devicep->managed_free_func
+ || !devicep->managed_free_func (devicep->target_id, device_ptr))
+ {
+ gomp_mutex_unlock (&devicep->lock);
+ gomp_fatal ("error in freeing managed memory block at %p", device_ptr);
+ }
+ gomp_mutex_unlock (&devicep->lock);
+}
+
/* Device (really: libgomp plugin) to use for paged-locked memory. We
assume there is either none or exactly one such device for the lifetime of
the process. */
DLSYM (unload_image);
DLSYM (alloc);
DLSYM (free);
+ DLSYM_OPT (managed_alloc, managed_alloc);
+ DLSYM_OPT (managed_free, managed_free);
DLSYM_OPT (page_locked_host_alloc, page_locked_host_alloc);
DLSYM_OPT (page_locked_host_free, page_locked_host_free);
DLSYM (dev2host);
return 0;
} } "-lhipblas" ]
}
+
+# return 1 if OpenMP Device Managed Memory is supported
+
+proc check_effective_target_omp_managedmem { } {
+ if { [check_effective_target_offload_device_nvptx] } {
+ return 1
+ }
+ return 0
+}
--- /dev/null
+// { dg-do run }
+// { dg-require-effective-target omp_managedmem }
+
+// Check that the ompx::allocator::gnu_managed_mem allocator can allocate
+// Managed Memory, and that host and target can see the data, at the same
+// address, without a mapping.
+
+#include <omp.h>
+#include <cstdint>
+#include <memory>
+
+int
+main ()
+{
+ using Allocator = ompx::allocator::gnu_managed_mem<int>;
+ using Traits = std::allocator_traits<Allocator>;
+
+ Allocator alloc;
+ int *a = Traits::allocate (alloc, 1);
+ if (!a)
+ __builtin_abort ();
+
+ Traits::construct (alloc, a, 42);
+ std::uintptr_t a_p = reinterpret_cast<std::uintptr_t>(a);
+
+ #pragma omp target is_device_ptr(a)
+ {
+ if (*a != 42 || a_p != reinterpret_cast<std::uintptr_t>(a))
+ __builtin_abort ();
+ }
+
+ Traits::destroy (alloc, a);
+ Traits::deallocate (alloc, a, 1);
+ return 0;
+}
--- /dev/null
+/* { dg-do run } */
+/* { dg-require-effective-target omp_managedmem } */
+
+/* Check that omp_alloc can allocate Managed Memory, and that host and target
+ can see the data, at the same address, without a mapping. */
+
+#include <omp.h>
+#include <stdint.h>
+
+int
+main ()
+{
+  int *a = (int *) omp_alloc (sizeof (int), ompx_gnu_managed_mem_alloc);
+ if (!a)
+ __builtin_abort ();
+
+ *a = 42;
+  uintptr_t a_p = (uintptr_t) a;
+
+ #pragma omp target is_device_ptr(a)
+ {
+    if (*a != 42 || a_p != (uintptr_t) a)
+ __builtin_abort ();
+ }
+
+  omp_free (a, ompx_gnu_managed_mem_alloc);
+ return 0;
+}
--- /dev/null
+/* { dg-do run } */
+/* { dg-require-effective-target omp_managedmem } */
+
+/* Check that omp_calloc can allocate Managed Memory, and that host and target
+ can see the data, at the same address, without a mapping. */
+
+#include <omp.h>
+#include <stdint.h>
+
+int
+main ()
+{
+  int *a = (int *) omp_calloc (5, sizeof (int), ompx_gnu_managed_mem_alloc);
+ if (!a)
+ __builtin_abort ();
+
+  /* Check that memory is zero-initialized. */
+ for (int i = 0; i < 5; i++)
+ if (a[i] != 0)
+ __builtin_abort ();
+
+ a[0] = 42;
+ a[4] = 99;
+  uintptr_t a_p = (uintptr_t) a;
+
+ #pragma omp target is_device_ptr(a)
+ {
+    if (a[0] != 42 || a[4] != 99 || a_p != (uintptr_t) a)
+ __builtin_abort ();
+    /* Check zero-initialization on the device side. */
+ for (int i = 1; i < 4; i++)
+ if (a[i] != 0)
+ __builtin_abort ();
+ }
+
+  omp_free (a, ompx_gnu_managed_mem_alloc);
+ return 0;
+}
--- /dev/null
+/* { dg-do run } */
+/* { dg-require-effective-target omp_managedmem } */
+
+/* Check that omp_realloc can allocate Managed Memory, and that host and target
+ can see the data, at the same address, without a mapping. */
+
+#include <omp.h>
+#include <stdint.h>
+
+int
+main ()
+{
+  int *a = (int *) omp_alloc (2 * sizeof (int), ompx_gnu_managed_mem_alloc);
+ if (!a)
+ __builtin_abort ();
+
+ a[0] = 42;
+ a[1] = 43;
+
+  /* Reallocate to a larger size. */
+  int *b = (int *) omp_realloc (a, 5 * sizeof (int),
+                                ompx_gnu_managed_mem_alloc,
+                                ompx_gnu_managed_mem_alloc);
+ if (!b)
+ __builtin_abort ();
+
+  /* Check that the original data is preserved. */
+ if (b[0] != 42 || b[1] != 43)
+ __builtin_abort ();
+
+ b[2] = 44;
+ b[3] = 45;
+ b[4] = 46;
+  uintptr_t b_p = (uintptr_t) b;
+
+ #pragma omp target is_device_ptr(b)
+ {
+ if (b[0] != 42 || b[1] != 43 || b[2] != 44 || b[3] != 45 || b[4] != 46
+        || b_p != (uintptr_t) b)
+ __builtin_abort ();
+ }
+
+  omp_free (b, ompx_gnu_managed_mem_alloc);
+ return 0;
+}
--- /dev/null
+/* { dg-do run } */
+/* { dg-require-effective-target omp_managedmem } */
+/* { dg-shouldfail "" } */
+/* { dg-output "libgomp: attempted to free managed memory at 0x\[0-9a-f\]+, but the default device is set to the host device" } */
+
+/* Check that omp_free emits an error if the default device has been changed
+ to the host device. */
+
+#include <omp.h>
+#include <stdint.h>
+
+int
+main ()
+{
+  int *a = (int *) omp_alloc (2 * sizeof (int), ompx_gnu_managed_mem_alloc);
+ if (!a)
+ __builtin_abort ();
+
+ omp_set_default_device (omp_initial_device);
+  omp_free (a, ompx_gnu_managed_mem_alloc);
+ return 0;
+}
--- /dev/null
+! { dg-do run }
+! { dg-require-effective-target omp_managedmem }
+
+! Check that omp_alloc can allocate Managed Memory, and that host and target
+! can see the data, at the same address, without a mapping.
+
+program main
+ use omp_lib
+ use iso_c_binding
+ implicit none
+
+ type(c_ptr) :: cptr
+ integer, pointer :: a
+ integer(c_intptr_t) :: a_p, a_p2
+
+ cptr = omp_alloc(c_sizeof(a), ompx_gnu_managed_mem_alloc)
+ if (.not. c_associated(cptr)) stop 1
+
+ call c_f_pointer(cptr, a)
+ a = 42
+ a_p = transfer(c_loc(a), a_p)
+
+ !$omp target is_device_ptr(a)
+ a_p2 = transfer(c_loc(a), a_p2)
+ if (a /= 42 .or. a_p /= a_p2) stop 2
+ !$omp end target
+
+ call omp_free(cptr, ompx_gnu_managed_mem_alloc)
+end program main