libgomp: Enable USM for some nvptx devices

author Tobias Burnus <tburnus@baylibre.com>

Wed, 29 May 2024 13:14:38 +0000 (15:14 +0200)

committer Tobias Burnus <tburnus@baylibre.com>

Wed, 29 May 2024 13:14:38 +0000 (15:14 +0200)
author Tobias Burnus <tburnus@baylibre.com>
Wed, 29 May 2024 13:14:38 +0000 (15:14 +0200)
committer Tobias Burnus <tburnus@baylibre.com>
Wed, 29 May 2024 13:14:38 +0000 (15:14 +0200)
diff --git a/include/cuda/cuda.h b/include/cuda/cuda.h

index 0dca4b3a5c0b055dcc13ceef7b2559c22fd73a36..804d08ca57eadb9691e750b4832a8c77604c3d8f 100644 (file)
--- a/include/cuda/cuda.h
+++ b/include/cuda/cuda.h
@@ -83,7 +83,8 @@ typedef enum {
    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39,
    CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40,
    CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41,
-  CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82
+  CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82,
+  CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88
  } CUdevice_attribute;
  
  enum {
diff --git a/libgomp/libgomp.texi b/libgomp/libgomp.texi

index 71d62105a20e6fb89b1808dcc953c20d58e0aa84..22868635230c40dceafe140fbee111ae390d43b8 100644 (file)
--- a/libgomp/libgomp.texi
+++ b/libgomp/libgomp.texi
@@ -6435,8 +6435,11 @@ The implementation remark:
        the next reverse offload region is only executed after the previous
        one returned.
  @item OpenMP code that has a @code{requires} directive with
-      @code{unified_shared_memory} will remove any nvptx device from the
-      list of available devices (``host fallback'').
+      @code{unified_shared_memory} runs on nvptx devices if and only if
+      all of those support the @code{pageableMemoryAccess} property;@footnote{
+      @uref{https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#um-requirements}}
+      otherwise, all nvptx devices are removed from the list of available
+      devices (``host fallback'').
  @item The default per-warp stack size is 128 kiB; see also @code{-msoft-stack}
        in the GCC manual.
  @item The OpenMP routines @code{omp_target_memcpy_rect} and
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c

index 5aad3448a8db5cd9821ec23ab9354994586881ab..4cedc5390a31a99d9b7d977531ab4734c7f0175c 100644 (file)
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -1201,8 +1201,23 @@ GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask)
    if (num_devices > 0
        && ((omp_requires_mask
            & ~(GOMP_REQUIRES_UNIFIED_ADDRESS
+              | GOMP_REQUIRES_UNIFIED_SHARED_MEMORY
                | GOMP_REQUIRES_REVERSE_OFFLOAD)) != 0))
      return -1;
+  /* Check whether host page access (direct or via migration) is supported;
+     if so, enable USM.  Currently, capabilities is per device type, hence,
+     check all devices.  */
+  if (num_devices > 0
+      && (omp_requires_mask & GOMP_REQUIRES_UNIFIED_SHARED_MEMORY))
+    for (int dev = 0; dev < num_devices; dev++)
+      {
+       int pi;
+       CUresult r;
+       r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
+                              CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS, dev);
+       if (r != CUDA_SUCCESS || pi == 0)
+         return -1;
+      }
    return num_devices;
  }
  
diff --git a/libgomp/target.c b/libgomp/target.c

index 5ec19ae489ec841e6a002ffd4953dcd6e0139404..48689920d4a3e60533ea1b377fab7ab1fa545fa7 100644 (file)
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -2969,8 +2969,25 @@ gomp_copy_back_icvs (struct gomp_device_descr *devicep, int device)
    if (item == NULL)
      return;
  
+  gomp_mutex_lock (&devicep->lock);
+
+  struct splay_tree_s *mem_map = &devicep->mem_map;
+  struct splay_tree_key_s cur_node;
+  void *dev_ptr = NULL;
+
    void *host_ptr = &item->icvs;
-  void *dev_ptr = omp_get_mapped_ptr (host_ptr, device);
+  cur_node.host_start = (uintptr_t) host_ptr;
+  cur_node.host_end = cur_node.host_start;
+  splay_tree_key n = gomp_map_0len_lookup (mem_map, &cur_node);
+
+  if (n)
+    {
+      uintptr_t offset = cur_node.host_start - n->host_start;
+      dev_ptr = (void *) (n->tgt->tgt_start + n->tgt_offset + offset);
+    }
+
+  gomp_mutex_unlock (&devicep->lock);
+
    if (dev_ptr != NULL)
      gomp_copy_dev2host (devicep, NULL, host_ptr, dev_ptr,
                         sizeof (struct gomp_offload_icvs));
@@ -5303,6 +5320,11 @@ gomp_target_init (void)
               {
                 /* Augment DEVICES and NUM_DEVICES.  */
  
+               /* If USM has been requested and is supported by all devices
+                  of this type, set the capability accordingly.  */
+               if (omp_requires_mask & GOMP_REQUIRES_UNIFIED_SHARED_MEMORY)
+                 current_device.capabilities |= GOMP_OFFLOAD_CAP_SHARED_MEM;
+
                 devs = realloc (devs, (num_devs + new_num_devs)
                                       * sizeof (struct gomp_device_descr));
                 if (!devs)
author	Tobias Burnus <tburnus@baylibre.com>
	Wed, 29 May 2024 13:14:38 +0000 (15:14 +0200)
committer	Tobias Burnus <tburnus@baylibre.com>
	Wed, 29 May 2024 13:14:38 +0000 (15:14 +0200)
include/cuda/cuda.h		patch \| blob \| blame \| history
libgomp/libgomp.texi		patch \| blob \| blame \| history
libgomp/plugin/plugin-nvptx.c		patch \| blob \| blame \| history
libgomp/target.c		patch \| blob \| blame \| history