git.ipfire.org Git - thirdparty/gcc.git/commitdiff
libgomp: Simplify OpenMP reverse offload host <-> device memory copy implementation
author Thomas Schwinge <thomas@codesourcery.com>
Tue, 21 Mar 2023 15:14:16 +0000 (16:14 +0100)
committer Thomas Schwinge <thomas@codesourcery.com>
Mon, 8 May 2023 13:58:05 +0000 (15:58 +0200)
... by using the existing 'goacc_asyncqueue' instead of re-coding parts of it.

Follow-up to commit 131d18e928a3ea1ab2d3bf61aa92d68a8a254609
"libgomp/nvptx: Prepare for reverse-offload callback handling",
and commit ea4b23d9c82d9be3b982c3519fe5e8e9d833a6a8
"libgomp: Handle OpenMP's reverse offloads".

libgomp/
* target.c (gomp_target_rev): Instead of 'dev_to_host_cpy',
'host_to_dev_cpy', 'token', take a single 'goacc_asyncqueue'.
* libgomp.h (gomp_target_rev): Adjust.
* libgomp-plugin.c (GOMP_PLUGIN_target_rev): Adjust.
* libgomp-plugin.h (GOMP_PLUGIN_target_rev): Adjust.
* plugin/plugin-gcn.c (process_reverse_offload): Adjust.
* plugin/plugin-nvptx.c (rev_off_dev_to_host_cpy)
(rev_off_host_to_dev_cpy): Remove.
(GOMP_OFFLOAD_run): Adjust.

libgomp/libgomp-plugin.c
libgomp/libgomp-plugin.h
libgomp/libgomp.h
libgomp/plugin/plugin-gcn.c
libgomp/plugin/plugin-nvptx.c
libgomp/target.c

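diff --git a/libgomp/libgomp-plugin.c b/libgomp/libgomp-plugin.c
index 27e7c94ba9b464b4c1a570f3d549b9e05edbf29c..d696515eeb6e86399995f1ce7e9e91e7d4165c92 100644
--- a/libgomp/libgomp-plugin.c
+++ b/libgomp/libgomp-plugin.c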
@@ -82,11 +82,8 @@ GOMP_PLUGIN_fatal (const char *msg, ...)
 void
 GOMP_PLUGIN_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
                        uint64_t sizes_ptr, uint64_t kinds_ptr, int dev_num,
-                       void (*dev_to_host_cpy) (void *, const void *, size_t,
-                                                void *),
-                       void (*host_to_dev_cpy) (void *, const void *, size_t,
-                                                void *), void *token)
+                       struct goacc_asyncqueue *aq)
 {
   gomp_target_rev (fn_ptr, mapnum, devaddrs_ptr, sizes_ptr, kinds_ptr, dev_num,
-                  dev_to_host_cpy, host_to_dev_cpy, token);
+                  aq);
 }
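diff --git a/libgomp/libgomp-plugin.h b/libgomp/libgomp-plugin.h
index 28267f75f7aee62225b481b79492cacc6f4225e5..42ee3d6c7f97b876f73c63303bde46358a22e896 100644
--- a/libgomp/libgomp-plugin.h
+++ b/libgomp/libgomp-plugin.h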
@@ -121,11 +121,7 @@ extern void GOMP_PLUGIN_fatal (const char *, ...)
        __attribute__ ((noreturn, format (printf, 1, 2)));
 
 extern void GOMP_PLUGIN_target_rev (uint64_t, uint64_t, uint64_t, uint64_t,
-                                   uint64_t, int,
-                                   void (*) (void *, const void *, size_t,
-                                             void *),
-                                   void (*) (void *, const void *, size_t,
-                                             void *), void *);
+                                   uint64_t, int, struct goacc_asyncqueue *);
 
 /* Prototypes for functions implemented by libgomp plugins.  */
 extern const char *GOMP_OFFLOAD_get_name (void);
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index ba8fe348aba85c9e109b32d17214fc827862d073..4d2bfab4b716a9b5fee65917547e6b5be0c47546 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -1130,10 +1130,7 @@ extern void gomp_init_targets_once (void);
 extern int gomp_get_num_devices (void);
 extern bool gomp_target_task_fn (void *);
 extern void gomp_target_rev (uint64_t, uint64_t, uint64_t, uint64_t, uint64_t,
-                            int,
-                            void (*) (void *, const void *, size_t, void *),
-                            void (*) (void *, const void *, size_t, void *),
-                            void *);
+                            int, struct goacc_asyncqueue *);
 
 /* Splay tree definitions.  */
 typedef struct splay_tree_node_s *splay_tree_node;
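diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
index 347803762eb9ddc99cc95fc95140ba2a74ac9f12..2181bf0235f7e5aad5e57deaf640b1e41c3edf0a 100644
--- a/libgomp/plugin/plugin-gcn.c
+++ b/libgomp/plugin/plugin-gcn.c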
@@ -1949,7 +1949,7 @@ process_reverse_offload (uint64_t fn, uint64_t mapnum, uint64_t hostaddrs,
 {
   int dev_num = dev_num64;
   GOMP_PLUGIN_target_rev (fn, mapnum, hostaddrs, sizes, kinds, dev_num,
-                         NULL, NULL, NULL);
+                         NULL);
 }
 
 /* Output any data written to console output from the kernel.  It is expected
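diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index b3481c408c90b824e290ebceaf1907589dd15f0a..ffc8e2d79d132449b7c97c53d33172f1312506aa 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c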
@@ -56,6 +56,7 @@
 #include <unistd.h>
 #include <assert.h>
 #include <errno.h>
+#include <stdlib.h>
 
 /* An arbitrary fixed limit (128MB) for the size of the OpenMP soft stacks
    block to cache between kernel invocations.  For soft-stacks blocks bigger
@@ -1625,11 +1626,11 @@ GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream)
   return 1;
 }
 
-struct goacc_asyncqueue *
-GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
+static struct goacc_asyncqueue *
+nvptx_goacc_asyncqueue_construct (unsigned int flags)
 {
   CUstream stream = NULL;
-  CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, CU_STREAM_DEFAULT);
+  CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, flags);
 
   struct goacc_asyncqueue *aq
     = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue));
@@ -1637,14 +1638,26 @@ GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
   return aq;
 }
 
-bool
-GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
+struct goacc_asyncqueue *
+GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
+{
+  return nvptx_goacc_asyncqueue_construct (CU_STREAM_DEFAULT);
+}
+
+static bool
+nvptx_goacc_asyncqueue_destruct (struct goacc_asyncqueue *aq)
 {
   CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream);
   free (aq);
   return true;
 }
 
+bool
+GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
+{
+  return nvptx_goacc_asyncqueue_destruct (aq);
+}
+
 int
 GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
 {
@@ -1658,13 +1671,19 @@ GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
   return -1;
 }
 
-bool
-GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
+static bool
+nvptx_goacc_asyncqueue_synchronize (struct goacc_asyncqueue *aq)
 {
   CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream);
   return true;
 }
 
+bool
+GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
+{
+  return nvptx_goacc_asyncqueue_synchronize (aq);
+}
+
 bool
 GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
                                      struct goacc_asyncqueue *aq2)
@@ -1924,22 +1943,6 @@ nvptx_stacks_acquire (struct ptx_device *ptx_dev, size_t size, int num)
 }
 
 
-void
-rev_off_dev_to_host_cpy (void *dest, const void *src, size_t size,
-                        CUstream stream)
-{
-  CUDA_CALL_ASSERT (cuMemcpyDtoHAsync, dest, (CUdeviceptr) src, size, stream);
-  CUDA_CALL_ASSERT (cuStreamSynchronize, stream);
-}
-
-void
-rev_off_host_to_dev_cpy (void *dest, const void *src, size_t size,
-                        CUstream stream)
-{
-  CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, (CUdeviceptr) dest, src, size, stream);
-  CUDA_CALL_ASSERT (cuStreamSynchronize, stream);
-}
-
 void
 GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
 {
@@ -1973,9 +1976,17 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
     }
   nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
 
-  size_t stack_size = nvptx_stacks_size ();
   bool reverse_offload = ptx_dev->rev_data != NULL;
-  CUstream copy_stream = NULL;
+  struct goacc_asyncqueue *reverse_offload_aq = NULL;
+  if (reverse_offload)
+    {
+      reverse_offload_aq
+       = nvptx_goacc_asyncqueue_construct (CU_STREAM_NON_BLOCKING);
+      if (!reverse_offload_aq)
+       exit (EXIT_FAILURE);
+    }
+
+  size_t stack_size = nvptx_stacks_size ();
 
   pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
   void *stacks = nvptx_stacks_acquire (ptx_dev, stack_size, teams * threads);
@@ -1989,8 +2000,6 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
   GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
                     " [(teams: %u), 1, 1] [(lanes: 32), (threads: %u), 1]\n",
                     __FUNCTION__, fn_name, teams, threads);
-  if (reverse_offload)
-    CUDA_CALL_ASSERT (cuStreamCreate, &copy_stream, CU_STREAM_NON_BLOCKING);
   r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
                         32, threads, 1, 0, NULL, NULL, config);
   if (r != CUDA_SUCCESS)
@@ -2013,17 +2022,15 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
            GOMP_PLUGIN_target_rev (rev_data->fn, rev_data->mapnum,
                                    rev_data->addrs, rev_data->sizes,
                                    rev_data->kinds, rev_data->dev_num,
-                                   rev_off_dev_to_host_cpy,
-                                   rev_off_host_to_dev_cpy, copy_stream);
-           CUDA_CALL_ASSERT (cuStreamSynchronize, copy_stream);
+                                   reverse_offload_aq);
+           if (!nvptx_goacc_asyncqueue_synchronize (reverse_offload_aq))
+             exit (EXIT_FAILURE);
            __atomic_store_n (&rev_data->fn, 0, __ATOMIC_RELEASE);
          }
        usleep (1);
       }
   else
     r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
-  if (reverse_offload)
-    CUDA_CALL_ASSERT (cuStreamDestroy, copy_stream);
   if (r == CUDA_ERROR_LAUNCH_FAILED)
     GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
                       maybe_abort_msg);
@@ -2031,6 +2038,12 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
     GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
 
   pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
+
+  if (reverse_offload)
+    {
+      if (!nvptx_goacc_asyncqueue_destruct (reverse_offload_aq))
+       exit (EXIT_FAILURE);
+    }
 }
 
 /* TODO: Implement GOMP_OFFLOAD_async_run. */
diff --git a/libgomp/target.c b/libgomp/target.c
index b30c6a50c7e24bdef11f6f235cf758f097cccb16..32389540accd306cc1aeaa468d6f7583bedacc75 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -3299,9 +3299,7 @@ gomp_map_cdata_lookup (struct cpy_data *d, uint64_t *devaddrs,
 void
 gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
                 uint64_t sizes_ptr, uint64_t kinds_ptr, int dev_num,
-                void (*dev_to_host_cpy) (void *, const void *, size_t, void*),
-                void (*host_to_dev_cpy) (void *, const void *, size_t, void*),
-                void *token)
+                struct goacc_asyncqueue *aq)
 {
   /* Return early if there is no offload code.  */
   if (sizeof (OFFLOAD_PLUGINS) == sizeof (""))
@@ -3343,26 +3341,17 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
       devaddrs = (uint64_t *) gomp_malloc (mapnum * sizeof (uint64_t));
       sizes = (uint64_t *) gomp_malloc (mapnum * sizeof (uint64_t));
       kinds = (unsigned short *) gomp_malloc (mapnum * sizeof (unsigned short));
-      if (dev_to_host_cpy)
-       {
-         dev_to_host_cpy (devaddrs, (const void *) (uintptr_t) devaddrs_ptr,
-                          mapnum * sizeof (uint64_t), token);
-         dev_to_host_cpy (sizes, (const void *) (uintptr_t) sizes_ptr,
-                          mapnum * sizeof (uint64_t), token);
-         dev_to_host_cpy (kinds, (const void *) (uintptr_t) kinds_ptr,
-                          mapnum * sizeof (unsigned short), token);
-       }
-      else
-       {
-         gomp_copy_dev2host (devicep, NULL, devaddrs,
-                             (const void *) (uintptr_t) devaddrs_ptr,
-                             mapnum * sizeof (uint64_t));
-         gomp_copy_dev2host (devicep, NULL, sizes,
-                             (const void *) (uintptr_t) sizes_ptr,
-                             mapnum * sizeof (uint64_t));
-         gomp_copy_dev2host (devicep, NULL, kinds, (const void *) (uintptr_t) kinds_ptr,
-                             mapnum * sizeof (unsigned short));
-       }
+      gomp_copy_dev2host (devicep, aq, devaddrs,
+                         (const void *) (uintptr_t) devaddrs_ptr,
+                         mapnum * sizeof (uint64_t));
+      gomp_copy_dev2host (devicep, aq, sizes,
+                         (const void *) (uintptr_t) sizes_ptr,
+                         mapnum * sizeof (uint64_t));
+      gomp_copy_dev2host (devicep, aq, kinds,
+                         (const void *) (uintptr_t) kinds_ptr,
+                         mapnum * sizeof (unsigned short));
+      if (aq && !devicep->openacc.async.synchronize_func (aq))
+       exit (EXIT_FAILURE);
     }
 
   size_t tgt_align = 0, tgt_size = 0;
@@ -3389,13 +3378,14 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
            if (devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
              memcpy (tgt + tgt_size, (void *) (uintptr_t) devaddrs[i],
                      (size_t) sizes[i]);
-           else if (dev_to_host_cpy)
-             dev_to_host_cpy (tgt + tgt_size, (void *) (uintptr_t) devaddrs[i],
-                              (size_t) sizes[i], token);
            else
-             gomp_copy_dev2host (devicep, NULL, tgt + tgt_size,
-                                 (void *) (uintptr_t) devaddrs[i],
-                                 (size_t) sizes[i]);
+             {
+               gomp_copy_dev2host (devicep, aq, tgt + tgt_size,
+                                   (void *) (uintptr_t) devaddrs[i],
+                                   (size_t) sizes[i]);
+               if (aq && !devicep->openacc.async.synchronize_func (aq))
+                 exit (EXIT_FAILURE);
+             }
            devaddrs[i] = (uint64_t) (uintptr_t) tgt + tgt_size;
            tgt_size = tgt_size + sizes[i];
            if ((devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
@@ -3485,15 +3475,15 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
                    || kind == GOMP_MAP_ALWAYS_TO
                    || kind == GOMP_MAP_ALWAYS_TOFROM)
                  {
-                   if (dev_to_host_cpy)
-                     dev_to_host_cpy ((void *) (uintptr_t) devaddrs[i],
-                                      (void *) (uintptr_t) cdata[i].devaddr,
-                                      sizes[i], token);
-                   else
-                     gomp_copy_dev2host (devicep, NULL,
-                                         (void *) (uintptr_t) devaddrs[i],
-                                         (void *) (uintptr_t) cdata[i].devaddr,
-                                         sizes[i]);
+                   gomp_copy_dev2host (devicep, aq,
+                                       (void *) (uintptr_t) devaddrs[i],
+                                       (void *) (uintptr_t) cdata[i].devaddr,
+                                       sizes[i]);
+                   if (aq && !devicep->openacc.async.synchronize_func (aq))
+                     {
+                       gomp_mutex_unlock (&devicep->lock);
+                       exit (EXIT_FAILURE);
+                     }
                  }
                if (struct_cpy)
                  struct_cpy--;
@@ -3560,15 +3550,15 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
                    devaddrs[i]
                      = (uint64_t) (uintptr_t) gomp_aligned_alloc (align,
                                                                   sizes[i]);
-                   if (dev_to_host_cpy)
-                     dev_to_host_cpy ((void *) (uintptr_t) devaddrs[i],
-                                      (void *) (uintptr_t) cdata[i].devaddr,
-                                      sizes[i], token);
-                   else
-                     gomp_copy_dev2host (devicep, NULL,
-                                         (void *) (uintptr_t) devaddrs[i],
-                                         (void *) (uintptr_t) cdata[i].devaddr,
-                                         sizes[i]);
+                   gomp_copy_dev2host (devicep, aq,
+                                       (void *) (uintptr_t) devaddrs[i],
+                                       (void *) (uintptr_t) cdata[i].devaddr,
+                                       sizes[i]);
+                   if (aq && !devicep->openacc.async.synchronize_func (aq))
+                     {
+                       gomp_mutex_unlock (&devicep->lock);
+                       exit (EXIT_FAILURE);
+                     }
                  }
                for (j = i + 1; j < mapnum; j++)
                  {
@@ -3672,15 +3662,15 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
                /* FALLTHRU */
              case GOMP_MAP_FROM:
              case GOMP_MAP_TOFROM:
-               if (copy && host_to_dev_cpy)
-                 host_to_dev_cpy ((void *) (uintptr_t) cdata[i].devaddr,
-                                  (void *) (uintptr_t) devaddrs[i],
-                                  sizes[i], token);
-               else if (copy)
-                 gomp_copy_host2dev (devicep, NULL,
-                                     (void *) (uintptr_t) cdata[i].devaddr,
-                                     (void *) (uintptr_t) devaddrs[i],
-                                     sizes[i], false, NULL);
+               if (copy)
+                 {
+                   gomp_copy_host2dev (devicep, aq,
+                                       (void *) (uintptr_t) cdata[i].devaddr,
+                                       (void *) (uintptr_t) devaddrs[i],
+                                       sizes[i], false, NULL);
+                   if (aq && !devicep->openacc.async.synchronize_func (aq))
+                     exit (EXIT_FAILURE);
+                 }
              default:
                break;
            }