files (via 'CRT_BEGIN' and 'CRT_END'): 'crtbegin.o' and 'crtend.o', but we
do so anyway, for symmetry with other configurations. */
+
+/* See 'crt0.c', 'mgomp.c'. */
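+/* One slot per warp: the soft-stack pointer and the uniform-SIMT state, kept
+ in '.shared' memory; the '__mgomp' entry wrappers below initialize slot 0. */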
+#if defined(__nvptx_softstack__) && defined(__nvptx_unisimt__)
+extern void *__nvptx_stacks[32] __attribute__((shared,nocommon));
+extern unsigned __nvptx_uni[32] __attribute__((shared,nocommon));
+#endif
+
+
#ifdef CRT_BEGIN
void
DO_GLOBAL_CTORS_BODY;
}
+/* Need '.entry' wrapper for offloading. */
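+/* The 'kernel' attribute makes these PTX '.entry' functions, so the libgomp
+ nvptx plugin can launch them via 'cuLaunchKernel'; a plain '.func' is not
+ launchable from the host. */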
+
+# if defined(__nvptx_softstack__) && defined(__nvptx_unisimt__)
+
+__attribute__((kernel)) void __do_global_ctors__entry__mgomp (void *);
+
+void
+__do_global_ctors__entry__mgomp (void *nvptx_stacks_0)
+{
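+ /* As in 'crt0.c': install the soft-stack pointer for warp 0 and clear its
+ uniform-SIMT state; this wrapper runs as a single device thread. */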
+ __nvptx_stacks[0] = nvptx_stacks_0;
+ __nvptx_uni[0] = 0;
+
+ __do_global_ctors ();
+}
+
+# else
+
+__attribute__((kernel)) void __do_global_ctors__entry (void);
+
+void
+__do_global_ctors__entry (void)
+{
+ __do_global_ctors ();
+}
+
+# endif
+
#elif defined(CRT_END) /* ! CRT_BEGIN */
void
/* In this configuration here, there's no way that "this routine is run more
than once [...] when exit is called recursively": for nvptx target, the
call to '__do_global_dtors' is registered via 'atexit', which doesn't
- re-enter a function already run.
+ re-enter a function already run, and neither does the nvptx offload target.
Therefore, we do *not* "arrange to remember where in the list we left off
processing". */
func_ptr *p;
(*p++) ();
}
+/* Need '.entry' wrapper for offloading. */
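+/* This mirrors the '__do_global_ctors' wrappers above. */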
+
+# if defined(__nvptx_softstack__) && defined(__nvptx_unisimt__)
+
+__attribute__((kernel)) void __do_global_dtors__entry__mgomp (void *);
+
+void
+__do_global_dtors__entry__mgomp (void *nvptx_stacks_0)
+{
+ __nvptx_stacks[0] = nvptx_stacks_0;
+ __nvptx_uni[0] = 0;
+
+ __do_global_dtors ();
+}
+
+# else
+
+__attribute__((kernel)) void __do_global_dtors__entry (void);
+
+void
+__do_global_dtors__entry (void)
+{
+ __do_global_dtors ();
+}
+
+# endif
+
#else /* ! CRT_BEGIN && ! CRT_END */
#error "One of CRT_BEGIN or CRT_END must be defined."
#endif
default is set here. */
static unsigned lowlat_pool_size = 8*1024;
+static bool nvptx_do_global_cdtors (CUmodule, struct ptx_device *,
+ const char *);
+static size_t nvptx_stacks_size ();
+static void *nvptx_stacks_acquire (struct ptx_device *, size_t, int);
+
static inline struct nvptx_thread *
nvptx_thread (void)
{
if (!ptx_dev)
return true;
+ bool ret = true;
+
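+ /* Run the global destructors of all images still loaded on this device
+ before tearing it down. */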
+ for (struct ptx_image_data *image = ptx_dev->images;
+ image != NULL;
+ image = image->next)
+ {
+ if (!nvptx_do_global_cdtors (image->module, ptx_dev,
+ "__do_global_dtors__entry"))
+ ret = false;
+ }
+
for (struct ptx_free_block *b = ptx_dev->free_blocks; b;)
{
struct ptx_free_block *b_next = b->next;
CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
free (ptx_dev);
- return true;
+
+ return ret;
}
static int
GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
}
+/* Invoke MODULE's global constructors or destructors, as selected by FUNCNAME. */
+
+static bool
+nvptx_do_global_cdtors (CUmodule module, struct ptx_device *ptx_dev,
+ const char *funcname)
+{
+ bool ret = true;
+ char *funcname_mgomp = NULL;
+ CUresult r;
+ CUfunction funcptr;
+ r = CUDA_CALL_NOCHECK (cuModuleGetFunction,
+ &funcptr, module, funcname);
+ GOMP_PLUGIN_debug (0, "cuModuleGetFunction (%s): %s\n",
+ funcname, cuda_error (r));
+ if (r == CUDA_ERROR_NOT_FOUND)
+ {
+ /* Try '[funcname]__mgomp'. */
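+ /* That is the variant built for '-msoft-stack'/'-muniform-simt' (the
+ '-mgomp' multilib), which expects its initial stack pointer as a kernel
+ argument. */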
+
+ size_t funcname_len = strlen (funcname);
+ const char *mgomp_suffix = "__mgomp";
+ size_t mgomp_suffix_len = strlen (mgomp_suffix);
+ funcname_mgomp
+ = GOMP_PLUGIN_malloc (funcname_len + mgomp_suffix_len + 1);
+ memcpy (funcname_mgomp, funcname, funcname_len);
+ memcpy (funcname_mgomp + funcname_len,
+ mgomp_suffix, mgomp_suffix_len + 1);
+ funcname = funcname_mgomp;
+
+ r = CUDA_CALL_NOCHECK (cuModuleGetFunction,
+ &funcptr, module, funcname);
+ GOMP_PLUGIN_debug (0, "cuModuleGetFunction (%s): %s\n",
+ funcname, cuda_error (r));
+ }
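+ /* If neither variant is present, there is nothing to run for this module. */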
+ if (r == CUDA_ERROR_NOT_FOUND)
+ ;
+ else if (r != CUDA_SUCCESS)
+ {
+ GOMP_PLUGIN_error ("cuModuleGetFunction (%s) error: %s",
+ funcname, cuda_error (r));
+ ret = false;
+ }
+ else
+ {
+ /* If necessary, set up soft stack. */
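+ /* The '__mgomp' variant takes the initial soft-stack pointer as its only
+ kernel argument; the stack grows downward, so pass the top of the acquired
+ stacks block. */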
+ void *nvptx_stacks_0;
+ void *kargs[1];
+ if (funcname_mgomp)
+ {
+ size_t stack_size = nvptx_stacks_size ();
+ pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
+ nvptx_stacks_0 = nvptx_stacks_acquire (ptx_dev, stack_size, 1);
+ nvptx_stacks_0 += stack_size;
+ kargs[0] = &nvptx_stacks_0;
+ }
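+ /* Launch a single thread in a single block to run the constructors or
+ destructors sequentially. */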
+ r = CUDA_CALL_NOCHECK (cuLaunchKernel,
+ funcptr,
+ 1, 1, 1, 1, 1, 1,
+ /* sharedMemBytes */ 0,
+ /* hStream */ NULL,
+ /* kernelParams */ funcname_mgomp ? kargs : NULL,
+ /* extra */ NULL);
+ if (r != CUDA_SUCCESS)
+ {
+ GOMP_PLUGIN_error ("cuLaunchKernel (%s) error: %s",
+ funcname, cuda_error (r));
+ ret = false;
+ }
+
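+ /* Wait for the kernel to finish; NULL designates the default stream it was
+ launched on. */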
+ r = CUDA_CALL_NOCHECK (cuStreamSynchronize,
+ NULL);
+ if (r != CUDA_SUCCESS)
+ {
+ GOMP_PLUGIN_error ("cuStreamSynchronize (%s) error: %s",
+ funcname, cuda_error (r));
+ ret = false;
+ }
+
+ if (funcname_mgomp)
+ pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
+ }
+
+ if (funcname_mgomp)
+ free (funcname_mgomp);
+
+ return ret;
+}
+
/* Load the (partial) program described by TARGET_DATA to device
number ORD. Allocate and return TARGET_TABLE. If not NULL, REV_FN_TABLE
will contain the on-device addresses of the functions for reverse offload.
nvptx_set_clocktick (module, dev);
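+ /* Run the freshly loaded image's global constructors. */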
+ if (!nvptx_do_global_cdtors (module, dev, "__do_global_ctors__entry"))
+ return -1;
+
return fn_entries + var_entries + other_entries;
}
for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
if (image->target_data == target_data)
{
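+ /* Run the image's global destructors before the module is unloaded. */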
+ if (!nvptx_do_global_cdtors (image->module, dev,
+ "__do_global_dtors__entry"))
+ ret = false;
+
*prev_p = image->next;
if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
ret = false;