+2022-03-11 Andrew Stubbs <ams@codesourcery.com>
+
+ Backport of the patch posted at
+ https://gcc.gnu.org/pipermail/gcc-patches/2022-March/591352.html
+
+ * allocator.c (omp_max_predefined_alloc): Update.
+ (omp_aligned_alloc): Don't fallback ompx_host_mem_alloc.
+ (omp_aligned_calloc): Likewise.
+ (omp_realloc): Likewise.
+ * config/linux/allocator.c (linux_memspace_alloc): Handle USM.
+ (linux_memspace_calloc): Handle USM.
+ (linux_memspace_free): Handle USM.
+ (linux_memspace_realloc): Handle USM.
+ * config/nvptx/allocator.c (nvptx_memspace_alloc): Reject
+ ompx_host_mem_alloc.
+ (nvptx_memspace_calloc): Likewise.
+ (nvptx_memspace_realloc): Likewise.
+ * libgomp-plugin.h (GOMP_OFFLOAD_usm_alloc): New prototype.
+ (GOMP_OFFLOAD_usm_free): New prototype.
+ (GOMP_OFFLOAD_is_usm_ptr): New prototype.
+ * libgomp.h (gomp_usm_alloc): New prototype.
+ (gomp_usm_free): New prototype.
+ (gomp_is_usm_ptr): New prototype.
+ (struct gomp_device_descr): Add USM functions.
+ * omp.h.in (omp_memspace_handle_t): Add ompx_unified_shared_mem_space
+ and ompx_host_mem_space.
+ (omp_allocator_handle_t): Add ompx_unified_shared_mem_alloc and
+ ompx_host_mem_alloc.
+ * omp_lib.f90.in: Likewise.
+ * plugin/plugin-nvptx.c (nvptx_alloc): Add "usm" parameter.
+ Call cuMemAllocManaged as appropriate.
+ (GOMP_OFFLOAD_alloc): Move internals to ...
+ (GOMP_OFFLOAD_alloc_1): ... this, and add usm parameter.
+ (GOMP_OFFLOAD_usm_alloc): New function.
+ (GOMP_OFFLOAD_usm_free): New function.
+ (GOMP_OFFLOAD_is_usm_ptr): New function.
+ * target.c (gomp_map_vars_internal): Add USM support.
+ (gomp_usm_alloc): New function.
+ (gomp_usm_free): New function.
+ (gomp_load_plugin_for_device): New function.
+ * testsuite/libgomp.c/usm-1.c: New test.
+ * testsuite/libgomp.c/usm-2.c: New test.
+ * testsuite/libgomp.c/usm-3.c: New test.
+ * testsuite/libgomp.c/usm-4.c: New test.
+ * testsuite/libgomp.c/usm-5.c: New test.
+
2022-03-11 Andrew Stubbs <ams@codesourcery.com>
Backport of a patch posted at
#include <stdlib.h>
#include <string.h>
-#define omp_max_predefined_alloc ompx_pinned_mem_alloc
+#define omp_max_predefined_alloc ompx_host_mem_alloc
/* These macros may be overridden in config/<target>/allocator.c. */
#ifndef MEMSPACE_ALLOC
omp_low_lat_mem_space, /* omp_pteam_mem_alloc. */
omp_low_lat_mem_space, /* omp_thread_mem_alloc. */
omp_default_mem_space, /* ompx_pinned_mem_alloc. */
+ ompx_unified_shared_mem_space, /* ompx_unified_shared_mem_alloc. */
+ ompx_host_mem_space, /* ompx_host_mem_alloc. */
};
struct omp_allocator_data
int fallback = (allocator_data
? allocator_data->fallback
: (allocator == omp_default_mem_alloc
- || allocator == ompx_pinned_mem_alloc)
+ || allocator == ompx_pinned_mem_alloc
+ || allocator == ompx_host_mem_alloc)
? omp_atv_null_fb
: omp_atv_default_mem_fb);
switch (fallback)
int fallback = (allocator_data
? allocator_data->fallback
: (allocator == omp_default_mem_alloc
- || allocator == ompx_pinned_mem_alloc)
+ || allocator == ompx_pinned_mem_alloc
+ || allocator == ompx_host_mem_alloc)
? omp_atv_null_fb
: omp_atv_default_mem_fb);
switch (fallback)
int fallback = (allocator_data
? allocator_data->fallback
: (allocator == omp_default_mem_alloc
- || allocator == ompx_pinned_mem_alloc)
+ || allocator == ompx_pinned_mem_alloc
+ || allocator == ompx_host_mem_alloc)
? omp_atv_null_fb
: omp_atv_default_mem_fb);
switch (fallback)
static void *
linux_memspace_alloc (omp_memspace_handle_t memspace, size_t size, int pin)
{
- (void)memspace;
-
- if (pin)
+ if (memspace == ompx_unified_shared_mem_space)
+ {
+ return gomp_usm_alloc (size, GOMP_DEVICE_ICV);
+ }
+ else if (pin)
{
void *addr = mmap (NULL, size, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
static void *
linux_memspace_calloc (omp_memspace_handle_t memspace, size_t size, int pin)
{
- if (pin)
+ if (memspace == ompx_unified_shared_mem_space)
+ {
+ void *ret = gomp_usm_alloc (size, GOMP_DEVICE_ICV);
+ memset (ret, 0, size);
+ return ret;
+ }
+ else if (memspace == ompx_unified_shared_mem_space
+ || pin)
return linux_memspace_alloc (memspace, size, pin);
else
return calloc (1, size);
linux_memspace_free (omp_memspace_handle_t memspace, void *addr, size_t size,
int pin)
{
- (void)memspace;
-
- if (pin)
+ if (memspace == ompx_unified_shared_mem_space)
+ gomp_usm_free (addr, GOMP_DEVICE_ICV);
+ else if (pin)
munmap (addr, size);
else
free (addr);
linux_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
size_t oldsize, size_t size, int oldpin, int pin)
{
- if (oldpin && pin)
+ if (memspace == ompx_unified_shared_mem_space)
+ goto manual_realloc;
+ else if (oldpin && pin)
{
void *newaddr = mremap (addr, oldsize, size, MREMAP_MAYMOVE);
if (newaddr == MAP_FAILED)
return newaddr;
}
else if (oldpin || pin)
- {
- void *newaddr = linux_memspace_alloc (memspace, size, pin);
- if (newaddr)
- {
- memcpy (newaddr, addr, oldsize < size ? oldsize : size);
- linux_memspace_free (memspace, addr, oldsize, oldpin);
- }
-
- return newaddr;
- }
+ goto manual_realloc;
else
return realloc (addr, size);
+
+manual_realloc:
+ void *newaddr = linux_memspace_alloc (memspace, size, pin);
+ if (newaddr)
+ {
+ memcpy (newaddr, addr, oldsize < size ? oldsize : size);
+ linux_memspace_free (memspace, addr, oldsize, oldpin);
+ }
+
+ return newaddr;
}
#define MEMSPACE_ALLOC(MEMSPACE, SIZE, PIN) \
__atomic_store_n (&__nvptx_lowlat_heap_root, root.raw, MEMMODEL_RELEASE);
return result;
}
+ else if (memspace == ompx_host_mem_space)
+ return NULL;
else
return malloc (size);
}
return result;
}
+ else if (memspace == ompx_host_mem_space)
+ return NULL;
else
return calloc (1, size);
}
}
return result;
}
+ else if (memspace == ompx_host_mem_space)
+ return NULL;
else
return realloc (addr, size);
}
extern bool GOMP_OFFLOAD_unload_image (int, unsigned, const void *);
extern void *GOMP_OFFLOAD_alloc (int, size_t);
extern bool GOMP_OFFLOAD_free (int, void *);
+extern void *GOMP_OFFLOAD_usm_alloc (int, size_t);
+extern bool GOMP_OFFLOAD_usm_free (int, void *);
+extern bool GOMP_OFFLOAD_is_usm_ptr (void *);
extern bool GOMP_OFFLOAD_dev2host (int, void *, const void *, size_t);
extern bool GOMP_OFFLOAD_host2dev (int, void *, const void *, size_t);
extern bool GOMP_OFFLOAD_dev2dev (int, void *, const void *, size_t);
extern void gomp_init_targets_once (void);
extern int gomp_get_num_devices (void);
extern bool gomp_target_task_fn (void *);
+extern void * gomp_usm_alloc (size_t size, int device_num);
+extern void gomp_usm_free (void *device_ptr, int device_num);
+extern bool gomp_is_usm_ptr (void *ptr);
/* Splay tree definitions. */
typedef struct splay_tree_node_s *splay_tree_node;
__typeof (GOMP_OFFLOAD_unload_image) *unload_image_func;
__typeof (GOMP_OFFLOAD_alloc) *alloc_func;
__typeof (GOMP_OFFLOAD_free) *free_func;
+ __typeof (GOMP_OFFLOAD_usm_alloc) *usm_alloc_func;
+ __typeof (GOMP_OFFLOAD_usm_free) *usm_free_func;
+ __typeof (GOMP_OFFLOAD_is_usm_ptr) *is_usm_ptr_func;
__typeof (GOMP_OFFLOAD_dev2host) *dev2host_func;
__typeof (GOMP_OFFLOAD_host2dev) *host2dev_func;
__typeof (GOMP_OFFLOAD_dev2dev) *dev2dev_func;
omp_const_mem_space = 2,
omp_high_bw_mem_space = 3,
omp_low_lat_mem_space = 4,
+ ompx_unified_shared_mem_space = 5,
+ ompx_host_mem_space = 6,
__omp_memspace_handle_t_max__ = __UINTPTR_MAX__
} omp_memspace_handle_t;
omp_pteam_mem_alloc = 7,
omp_thread_mem_alloc = 8,
ompx_pinned_mem_alloc = 9,
+ ompx_unified_shared_mem_alloc = 10,
+ ompx_host_mem_alloc = 11,
__omp_allocator_handle_t_max__ = __UINTPTR_MAX__
} omp_allocator_handle_t;
parameter :: omp_thread_mem_alloc = 8
integer (kind=omp_allocator_handle_kind), &
parameter :: ompx_pinned_mem_alloc = 9
+ integer (kind=omp_allocator_handle_kind), &
+ parameter :: ompx_unified_shared_mem_alloc = 10
+ integer (kind=omp_allocator_handle_kind), &
+ parameter :: ompx_host_mem_alloc = 11
integer (omp_memspace_handle_kind), &
parameter :: omp_default_mem_space = 0
integer (omp_memspace_handle_kind), &
parameter :: omp_high_bw_mem_space = 3
integer (omp_memspace_handle_kind), &
parameter :: omp_low_lat_mem_space = 4
+ integer (omp_memspace_handle_kind), &
+ parameter :: omp_unified_shared_mem_space = 5
+ integer (omp_memspace_handle_kind), &
+ parameter :: omp_host_mem_space = 6
type omp_alloctrait
integer (kind=omp_alloctrait_key_kind) key
}
static void *
-nvptx_alloc (size_t s, bool suppress_errors)
+nvptx_alloc (size_t s, bool suppress_errors, bool usm)
{
CUdeviceptr d;
- CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &d, s);
+ CUresult r = (usm ? CUDA_CALL_NOCHECK (cuMemAllocManaged, &d, s,
+ CU_MEM_ATTACH_GLOBAL)
+ : CUDA_CALL_NOCHECK (cuMemAlloc, &d, s));
if (suppress_errors && r == CUDA_ERROR_OUT_OF_MEMORY)
return NULL;
else if (r != CUDA_SUCCESS)
return ret;
}
-void *
-GOMP_OFFLOAD_alloc (int ord, size_t size)
+static void *
+GOMP_OFFLOAD_alloc_1 (int ord, size_t size, bool usm)
{
if (!nvptx_attach_host_thread_to_device (ord))
return NULL;
blocks = tmp;
}
- void *d = nvptx_alloc (size, true);
+ void *d = nvptx_alloc (size, true, usm);
if (d)
return d;
else
/* Memory allocation failed. Try freeing the stacks block, and
retrying. */
nvptx_stacks_free (ptx_dev, true);
- return nvptx_alloc (size, false);
+ return nvptx_alloc (size, false, usm);
}
}
+void *
+GOMP_OFFLOAD_alloc (int ord, size_t size)
+{
+ return GOMP_OFFLOAD_alloc_1 (ord, size, false);
+}
+
+void *
+GOMP_OFFLOAD_usm_alloc (int ord, size_t size)
+{
+ return GOMP_OFFLOAD_alloc_1 (ord, size, true);
+}
+
bool
GOMP_OFFLOAD_free (int ord, void *ptr)
{
&& nvptx_free (ptr, ptx_devices[ord]));
}
+bool
+GOMP_OFFLOAD_usm_free (int ord, void *ptr)
+{
+ return GOMP_OFFLOAD_free (ord, ptr);
+}
+
+bool
+GOMP_OFFLOAD_is_usm_ptr (void *ptr)
+{
+ bool managed = false;
+ /* This returns 3 outcomes ...
+ CUDA_ERROR_INVALID_VALUE - Not a Cuda allocated pointer.
+ CUDA_SUCCESS, managed:false - Cuda allocated, but not USM.
+ CUDA_SUCCESS, managed:true - USM. */
+ CUDA_CALL_NOCHECK (cuPointerGetAttribute, &managed,
+ CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)ptr);
+ return managed;
+}
+
void
GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
void **hostaddrs, void **devaddrs,
tgt->list[i].offset = 0;
continue;
}
+ else if (devicep->is_usm_ptr_func
+ && devicep->is_usm_ptr_func (hostaddrs[i]))
+ {
+ /* The memory is visible from both host and target
+ so nothing needs to be moved. */
+ tgt->list[i].key = NULL;
+ tgt->list[i].offset = OFFSET_INLINED;
+ continue;
+ }
else if ((kind & typemask) == GOMP_MAP_STRUCT)
{
size_t first = i + 1;
continue;
}
default:
+ if (tgt->list[i].offset == OFFSET_INLINED
+ && !array)
+ continue;
break;
}
gomp_mutex_unlock (&devicep->lock);
}
+void *
+gomp_usm_alloc (size_t size, int device_num)
+{
+ if (device_num == gomp_get_num_devices ())
+ return malloc (size);
+
+ struct gomp_device_descr *devicep = resolve_device (device_num);
+ if (devicep == NULL)
+ return NULL;
+
+ if (!(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
+ || devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
+ return malloc (size);
+
+ void *ret = NULL;
+ gomp_mutex_lock (&devicep->lock);
+ if (devicep->usm_alloc_func)
+ ret = devicep->usm_alloc_func (devicep->target_id, size);
+ gomp_mutex_unlock (&devicep->lock);
+ return ret;
+}
+
+void
+gomp_usm_free (void *device_ptr, int device_num)
+{
+ if (device_ptr == NULL)
+ return;
+
+ if (device_num == gomp_get_num_devices ())
+ {
+ free (device_ptr);
+ return;
+ }
+
+ struct gomp_device_descr *devicep = resolve_device (device_num);
+ if (devicep == NULL)
+ return;
+
+ if (!(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
+ || devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
+ {
+ free (device_ptr);
+ return;
+ }
+
+ gomp_mutex_lock (&devicep->lock);
+ if (devicep->usm_free_func
+ && !devicep->usm_free_func (devicep->target_id, device_ptr))
+ {
+ gomp_mutex_unlock (&devicep->lock);
+ gomp_fatal ("error in freeing device memory block at %p", device_ptr);
+ }
+ gomp_mutex_unlock (&devicep->lock);
+}
+
int
omp_target_is_present (const void *ptr, int device_num)
{
DLSYM (unload_image);
DLSYM (alloc);
DLSYM (free);
+ DLSYM_OPT (usm_alloc, usm_alloc);
+ DLSYM_OPT (usm_free, usm_free);
+ DLSYM_OPT (is_usm_ptr, is_usm_ptr);
DLSYM (dev2host);
DLSYM (host2dev);
DLSYM (evaluate_device);
--- /dev/null
+/* { dg-do run } */
+
+#include <omp.h>
+#include <stdint.h>
+
+int
+main ()
+{
+ int *a = (int *) omp_alloc(sizeof(int), ompx_unified_shared_mem_alloc);
+ if (!a)
+ __builtin_abort ();
+
+ *a = 42;
+ uintptr_t a_p = (uintptr_t)a;
+
+ #pragma omp target is_device_ptr(a)
+ {
+ if (*a != 42 || a_p != (uintptr_t)a)
+ __builtin_abort ();
+ }
+
+ omp_free(a, ompx_unified_shared_mem_alloc);
+ return 0;
+}
--- /dev/null
+/* { dg-do run } */
+
+#include <omp.h>
+#include <stdint.h>
+
+int
+main ()
+{
+ int *a = (int *) omp_alloc(sizeof(int)*2, ompx_unified_shared_mem_alloc);
+ if (!a)
+ __builtin_abort ();
+
+ a[0] = 42;
+ a[1] = 43;
+
+ uintptr_t a_p = (uintptr_t)a;
+
+ #pragma omp target map(a[0])
+ {
+ if (a[0] != 42 || a_p != (uintptr_t)a)
+ __builtin_abort ();
+ }
+
+ #pragma omp target map(a[1])
+ {
+ if (a[1] != 43 || a_p != (uintptr_t)a)
+ __builtin_abort ();
+ }
+
+ omp_free(a, ompx_unified_shared_mem_alloc);
+ return 0;
+}
--- /dev/null
+/* { dg-do run } */
+
+#include <omp.h>
+#include <stdint.h>
+
+int
+main ()
+{
+ int *a = (int *) omp_alloc(sizeof(int)*2, ompx_unified_shared_mem_alloc);
+ if (!a)
+ __builtin_abort ();
+
+ a[0] = 42;
+ a[1] = 43;
+
+ uintptr_t a_p = (uintptr_t)a;
+
+#pragma omp target data map(a[0:2])
+ {
+#pragma omp target
+ {
+ if (a[0] != 42 || a_p != (uintptr_t)a)
+ __builtin_abort ();
+ }
+
+#pragma omp target
+ {
+ if (a[1] != 43 || a_p != (uintptr_t)a)
+ __builtin_abort ();
+ }
+ }
+
+ omp_free(a, ompx_unified_shared_mem_alloc);
+ return 0;
+}
--- /dev/null
+/* { dg-do run } */
+
+#include <omp.h>
+#include <stdint.h>
+
+int
+main ()
+{
+ int *a = (int *) omp_alloc(sizeof(int)*2, ompx_unified_shared_mem_alloc);
+ if (!a)
+ __builtin_abort ();
+
+ a[0] = 42;
+ a[1] = 43;
+
+ uintptr_t a_p = (uintptr_t)a;
+
+#pragma omp target enter data map(to:a[0:2])
+
+#pragma omp target
+ {
+ if (a[0] != 42 || a_p != (uintptr_t)a)
+ __builtin_abort ();
+ }
+
+#pragma omp target
+ {
+ if (a[1] != 43 || a_p != (uintptr_t)a)
+ __builtin_abort ();
+ }
+
+#pragma omp target exit data map(delete:a[0:2])
+
+ omp_free(a, ompx_unified_shared_mem_alloc);
+ return 0;
+}
--- /dev/null
+/* { dg-do run } */
+/* { dg-require-effective-target offload_device } */
+
+#include <omp.h>
+#include <stdint.h>
+
+#pragma omp requires unified_shared_memory
+
+int
+main ()
+{
+ int *a = (int *) omp_alloc(sizeof(int), ompx_host_mem_alloc);
+ if (!a)
+ __builtin_abort ();
+
+ a[0] = 42;
+
+ uintptr_t a_p = (uintptr_t)a;
+
+#pragma omp target map(a[0:1])
+ {
+ if (a[0] != 42 || a_p == (uintptr_t)a)
+ __builtin_abort ();
+ }
+
+ omp_free(a, ompx_host_mem_alloc);
+ return 0;
+}