]> git.ipfire.org Git - thirdparty/gcc.git/commitdiff
[nvptx] Expand OpenACC child function arguments to use CUDA params space
authorKwok Cheung Yeung <kcy@codesourcery.com>
Mon, 19 Apr 2021 22:01:03 +0000 (15:01 -0700)
committerKwok Cheung Yeung <kcy@codesourcery.com>
Tue, 20 Apr 2021 18:49:36 +0000 (11:49 -0700)
This patch replaces the patches "Use functional parameters for data mappings
in OpenACC child functions" and "Make OpenACC function-parameter explosion
optional".

2019-09-10  Chung-Lin Tang  (cltang@codesourcery.com)

* config/nvptx/nvptx.c (nvptx_expand_to_rtl_hook): New function
implementing CUDA .params space transformation.
(TARGET_EXPAND_TO_RTL_HOOK): implement hook with
nvptx_expand_to_rtl_hook.

libgomp/
* plugin/plugin-nvptx.c (nvptx_exec): Adjust arguments, add
kernel argument setup code, adjust cuLaunchKernel calling code.
(GOMP_OFFLOAD_openacc_exec): Adjust nvptx_exec call, delete
profiling code.
(GOMP_OFFLOAD_openacc_async_exec): Likewise.
(cuda_free_argmem): Delete function.

gcc/c/ChangeLog.omp
gcc/config/nvptx/nvptx.c
libgomp/ChangeLog.omp
libgomp/plugin/plugin-nvptx.c

index 673f344544bf6af649cf0a33c97c6752de0c84bb..f65e2ea9327f2c05c261fc96ac9f777ea907c3b5 100644 (file)
@@ -1,3 +1,10 @@
+2019-09-10  Chung-Lin Tang  <cltang@codesourcery.com>
+
+       * config/nvptx/nvptx.c (nvptx_expand_to_rtl_hook): New function
+       implementing CUDA .params space transformation.
+       (TARGET_EXPAND_TO_RTL_HOOK): implement hook with
+       nvptx_expand_to_rtl_hook.
+
 2018-12-13  Cesar Philippidis  <cesar@codesourcery.com>
             Nathan Sidwell  <nathan@acm.org>
             Julian Brown  <julian@codesourcery.com>
index 245624247ba7eaa9178262232524ee450e1099b9..eb24628eb0586f9deaf9f2399fa9f09ea8a85a58 100644 (file)
 #include "attribs.h"
 #include "tree-vrp.h"
 #include "tree-ssa-operands.h"
+#include "tree-pretty-print.h"
+#include "gimple-pretty-print.h"
+#include "tree-cfg.h"
+#include "gimple-ssa.h"
 #include "tree-ssanames.h"
 #include "gimplify.h"
 #include "tree-phinodes.h"
@@ -6848,6 +6852,228 @@ nvptx_libc_has_function (enum function_class fn_class, tree type)
   return default_libc_has_function (fn_class, type);
 }
 
+static void
+nvptx_expand_to_rtl_hook (void)
+{
+  /* For utilizing CUDA .param kernel arguments, we detect and modify
+     the gimple of offloaded child functions, here before RTL expansion,
+     starting with standard OMP form:
+      foo._omp_fn.0 (const struct .omp_data_t.8 & restrict .omp_data_i) { ... }
+   
+     and transform it into a style where the OMP data record fields are
+     "exploded" into individual scalar arguments:
+      foo._omp_fn.0 (int * a, int * b, int * c) { ... }
+
+     Note that there are implicit assumptions of how OMP lowering (and/or other
+     intervening passes) behaves contained in this transformation code;
+     if those passes change in their output, this code may possibly need
+     updating.  */
+
+  if (lookup_attribute ("omp target entrypoint",
+                       DECL_ATTRIBUTES (current_function_decl))
+      /* The rather indirect manner in which OpenMP target functions are
+        launched makes this transformation only valid for OpenACC currently.
+        TODO: e.g. write_omp_entry(), nvptx_declare_function_name(), etc.
+        needs changes for this to work with OpenMP.  */
+      && lookup_attribute ("oacc function",
+                          DECL_ATTRIBUTES (current_function_decl))
+      && VOID_TYPE_P (TREE_TYPE (DECL_RESULT (current_function_decl))))
+    {
+      tree omp_data_arg = DECL_ARGUMENTS (current_function_decl);
+      tree argtype = TREE_TYPE (omp_data_arg);
+
+      /* Ensure this function is of the form of a single reference argument
+        to the OMP data record, or a single void* argument (when no values
+        passed)  */
+      if (! (DECL_CHAIN (omp_data_arg) == NULL_TREE
+            && ((TREE_CODE (argtype) == REFERENCE_TYPE
+                 && TREE_CODE (TREE_TYPE (argtype)) == RECORD_TYPE)
+                || (TREE_CODE (argtype) == POINTER_TYPE
+                    && TREE_TYPE (argtype) == void_type_node))))
+       return;
+
+      if (dump_file)
+       {
+         fprintf (dump_file, "Detected offloaded child function %s, "
+                  "starting parameter conversion\n",
+                  print_generic_expr_to_str (current_function_decl));
+         fprintf (dump_file, "OMP data record argument: %s (tree type: %s)\n",
+                  print_generic_expr_to_str (omp_data_arg),
+                  print_generic_expr_to_str (argtype));
+         fprintf (dump_file, "Data record fields:\n");
+       }
+      
+      hash_map<tree,tree> fld_to_args;
+      tree fld, rectype = TREE_TYPE (argtype);
+      tree arglist = NULL_TREE, argtypelist = NULL_TREE;
+
+      if (TREE_CODE (rectype) == RECORD_TYPE)
+       {
+         /* For each field in the OMP data record type, create a corresponding
+            PARM_DECL, and map field -> parm using the fld_to_args hash_map.
+            Also create the tree chains for creating function type and
+            DECL_ARGUMENTS below.  */
+         for (fld = TYPE_FIELDS (rectype); fld; fld = DECL_CHAIN (fld))
+           {
+             tree narg = build_decl (DECL_SOURCE_LOCATION (fld), PARM_DECL,
+                                     DECL_NAME (fld), TREE_TYPE (fld));
+             DECL_ARTIFICIAL (narg) = 1;
+             DECL_ARG_TYPE (narg) = TREE_TYPE (fld);
+             DECL_CONTEXT (narg) = current_function_decl;
+             TREE_USED (narg) = 1;
+             TREE_READONLY (narg) = 1;
+
+             if (dump_file)
+               fprintf (dump_file, "\t%s, type: %s, offset: %s bytes + %s bits\n",
+                        print_generic_expr_to_str (fld),
+                        print_generic_expr_to_str (TREE_TYPE (fld)),
+                        print_generic_expr_to_str (DECL_FIELD_OFFSET (fld)),
+                        print_generic_expr_to_str (DECL_FIELD_BIT_OFFSET (fld)));
+             fld_to_args.put (fld, narg);
+
+             TREE_CHAIN (narg) = arglist;
+             arglist = narg;
+             argtypelist = tree_cons (NULL_TREE, TREE_TYPE (narg),
+                                      argtypelist);
+           }
+         arglist = nreverse (arglist);
+         argtypelist = nreverse (argtypelist);
+       }
+      /* This is needed to not be mistaken for a stdarg function.  */
+      argtypelist = chainon (argtypelist, void_list_node);
+
+      if (dump_file)
+       {
+         fprintf (dump_file, "Function before OMP data arg replaced:\n");
+         dump_function_to_file (current_function_decl, dump_file, dump_flags);
+       }
+
+      /* Actually modify the tree type and DECL_ARGUMENTS here.  */
+      TREE_TYPE (current_function_decl) = build_function_type (void_type_node,
+                                                              argtypelist);
+      DECL_ARGUMENTS (current_function_decl) = arglist;
+
+      /* Remove local decls which correspond to *.omp_data_i->FIELD entries, by
+        scanning and skipping those entries, creating a new local_decls list.
+        We assume a very specific MEM_REF tree expression shape.  */
+      tree decl;
+      unsigned int i;
+      vec<tree, va_gc> *new_local_decls = NULL;
+      FOR_EACH_VEC_SAFE_ELT (cfun->local_decls, i, decl)
+       {
+         if (DECL_HAS_VALUE_EXPR_P (decl))
+           {
+             tree t = DECL_VALUE_EXPR (decl);
+             if (TREE_CODE (t) == MEM_REF
+                 && TREE_CODE (TREE_OPERAND (t, 0)) == COMPONENT_REF
+                 && TREE_CODE (TREE_OPERAND (TREE_OPERAND (t, 0), 0)) == MEM_REF
+                 && (TREE_OPERAND (TREE_OPERAND (TREE_OPERAND (t, 0), 0), 0)
+                     == omp_data_arg))
+               continue;
+           }
+         vec_safe_push (new_local_decls, decl);
+       }
+      vec_free (cfun->local_decls);
+      cfun->local_decls = new_local_decls;
+      
+      /* Scan function body for assignments from .omp_data_i->FIELD, and using
+        the above created fld_to_args hash map, convert them to reads of
+        function arguments.  */
+      basic_block bb;
+      gimple_stmt_iterator gsi;
+      FOR_EACH_BB_FN (bb, cfun)
+       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+         {
+           tree val, *val_ptr = NULL;
+           gimple *stmt = gsi_stmt (gsi);
+           if (is_gimple_assign (stmt)
+               && gimple_assign_rhs_class (stmt) == GIMPLE_SINGLE_RHS)
+             val_ptr = gimple_assign_rhs1_ptr (stmt);
+           else if (is_gimple_debug (stmt) && gimple_debug_bind_p (stmt))
+             val_ptr = gimple_debug_bind_get_value_ptr (stmt);
+
+           if (val_ptr == NULL || (val = *val_ptr) == NULL_TREE)
+             continue;
+
+           tree new_val = NULL_TREE, fld = NULL_TREE;
+
+           if (TREE_CODE (val) == COMPONENT_REF
+               && TREE_CODE (TREE_OPERAND (val, 0)) == MEM_REF
+               && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (val, 0), 0))
+                   == SSA_NAME)
+               && (SSA_NAME_VAR (TREE_OPERAND (TREE_OPERAND (val, 0), 0))
+                   == omp_data_arg))
+             {
+               /* .omp_data->FIELD case.  */
+               fld = TREE_OPERAND (val, 1);
+               new_val = *fld_to_args.get (fld);
+             }
+           else if (TREE_CODE (val) == MEM_REF
+                    && TREE_CODE (TREE_OPERAND (val, 0)) == SSA_NAME
+                    && SSA_NAME_VAR (TREE_OPERAND (val, 0)) == omp_data_arg)
+             {
+               /* This case may happen in the final tree level optimization
+                  output, due to SLP:
+                  vect.XX = MEM <vector(1) unsigned long> [(void *).omp_data_i_5(D) + 8B]
+
+                  Therefore here we need a more elaborate search of the field
+                  list to reverse map to which field the offset is referring
+                  to.  */
+               unsigned HOST_WIDE_INT offset
+                 = tree_to_uhwi (TREE_OPERAND (val, 1));
+
+               for (hash_map<tree, tree>::iterator i = fld_to_args.begin ();
+                    i != fld_to_args.end (); ++i)
+                 {
+                   tree cur_fld = (*i).first;
+                   tree cur_arg = (*i).second;
+                   gcc_assert (TREE_CODE (cur_arg) == PARM_DECL);
+
+                   unsigned HOST_WIDE_INT cur_offset =
+                     (tree_to_uhwi (DECL_FIELD_OFFSET (cur_fld))
+                      + (tree_to_uhwi (DECL_FIELD_BIT_OFFSET (cur_fld))
+                         / BITS_PER_UNIT));
+
+                   if (offset == cur_offset)
+                     {
+                       new_val = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (val),
+                                         cur_arg);
+                       break;
+                     }
+                 }
+             }
+
+           /* If we found the corresponding OMP data record field, replace the
+              RHS with the new created PARM_DECL.  */
+           if (new_val != NULL_TREE)
+             {
+               if (dump_file)
+                 {
+                   fprintf (dump_file, "For gimple stmt: ");
+                   print_gimple_stmt (dump_file, stmt, 0);
+                   fprintf (dump_file, "\tReplacing OMP recv ref %s with %s\n",
+                            print_generic_expr_to_str (val),
+                            print_generic_expr_to_str (new_val));
+                 }
+               /* Write in looked up ARG as new RHS value.  */
+               *val_ptr = new_val;
+             }
+         }
+
+      /* Delete SSA_NAMEs of .omp_data_i by setting them to NULL_TREE.  */
+      tree name;
+      FOR_EACH_SSA_NAME (i, name, cfun)
+       if (SSA_NAME_VAR (name) == omp_data_arg)
+         (*SSANAMES (cfun))[SSA_NAME_VERSION (name)] = NULL_TREE;
+
+      if (dump_file)
+       {
+         fprintf (dump_file, "Function after OMP data arg replaced: ");
+         dump_function_to_file (current_function_decl, dump_file, dump_flags);
+       }
+    }
+}
+
 #undef TARGET_OPTION_OVERRIDE
 #define TARGET_OPTION_OVERRIDE nvptx_option_override
 
@@ -6999,6 +7225,9 @@ nvptx_libc_has_function (enum function_class fn_class, tree type)
 #undef TARGET_LIBC_HAS_FUNCTION
 #define TARGET_LIBC_HAS_FUNCTION nvptx_libc_has_function
 
+#undef TARGET_EXPAND_TO_RTL_HOOK
+#define TARGET_EXPAND_TO_RTL_HOOK nvptx_expand_to_rtl_hook
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 #include "gt-nvptx.h"
index 8e48296fc8c04f1f2a2fa722820614af7f4ed8f1..ffefb43a5997b35f5d6a71b79d0dfe324d857d22 100644 (file)
@@ -1,3 +1,12 @@
+2019-09-10  Chung-Lin Tang  <cltang@codesourcery.com>
+
+       * plugin/plugin-nvptx.c (nvptx_exec): Adjust arguments, add
+       kernel argument setup code, adjust cuLaunchKernel calling code.
+       (GOMP_OFFLOAD_openacc_exec): Adjust nvptx_exec call, delete
+       profiling code.
+       (GOMP_OFFLOAD_openacc_async_exec): Likewise.
+       (cuda_free_argmem): Delete function.
+
 2018-12-11  Julian Brown  <julian@codesourcery.com>
            Chung-Lin Tang  <cltang@codesourcery.com>
 
index 1215212d501b5cb3dd99d39645370dc9cbeaf92a..1bea0bedccb66a9b17eebaf5d148e77a2b96fe9c 100644 (file)
@@ -730,16 +730,24 @@ link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
 
 static void
 nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
-           unsigned *dims, void *targ_mem_desc,
-           CUdeviceptr dp, CUstream stream)
+           unsigned *dims, CUstream stream)
 {
   struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
   CUfunction function;
   int i;
-  void *kargs[1];
   struct nvptx_thread *nvthd = nvptx_thread ();
   int warp_size = nvthd->ptx_dev->warp_size;
+  void **kernel_args = NULL;
 
+  GOMP_PLUGIN_debug (0, "prepare mappings (mapnum: %u)\n", (unsigned) mapnum);
+
+  if (mapnum > 0)
+    {
+      kernel_args = alloca (mapnum * sizeof (void *));
+      for (int i = 0; i < mapnum; i++)
+       kernel_args[i] = (devaddrs[i] ? &devaddrs[i] : &hostaddrs[i]);
+    }
+  
   function = targ_fn->fn;
 
   /* Initialize the launch dimensions.  Typically this is constant,
@@ -971,11 +979,10 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
                                            api_info);
     }
 
-  kargs[0] = &dp;
   CUDA_CALL_ASSERT (cuLaunchKernel, function,
                    dims[GOMP_DIM_GANG], 1, 1,
                    dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
-                   0, stream, kargs, 0);
+                   0, stream, kernel_args, 0);
 
   if (profiling_p)
     {
@@ -1431,66 +1438,7 @@ GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
                           void **hostaddrs, void **devaddrs,
                           unsigned *dims, void *targ_mem_desc)
 {
-  GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
-
-  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
-  acc_prof_info *prof_info = thr->prof_info;
-  acc_event_info data_event_info;
-  acc_api_info *api_info = thr->api_info;
-  bool profiling_p = __builtin_expect (prof_info != NULL, false);
-
-  void **hp = NULL;
-  CUdeviceptr dp = 0;
-
-  if (mapnum > 0)
-    {
-      size_t s = mapnum * sizeof (void *);
-      hp = alloca (s);
-      for (int i = 0; i < mapnum; i++)
-       hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
-      CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
-      if (profiling_p)
-       goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
-    }
-
-  /* Copy the (device) pointers to arguments to the device (dp and hp might in
-     fact have the same value on a unified-memory system).  */
-  if (mapnum > 0)
-    {
-      if (profiling_p)
-       {
-         prof_info->event_type = acc_ev_enqueue_upload_start;
-
-         data_event_info.data_event.event_type = prof_info->event_type;
-         data_event_info.data_event.valid_bytes
-           = _ACC_DATA_EVENT_INFO_VALID_BYTES;
-         data_event_info.data_event.parent_construct
-           = acc_construct_parallel;
-         data_event_info.data_event.implicit = 1; /* Always implicit.  */
-         data_event_info.data_event.tool_info = NULL;
-         data_event_info.data_event.var_name = NULL;
-         data_event_info.data_event.bytes = mapnum * sizeof (void *);
-         data_event_info.data_event.host_ptr = hp;
-         data_event_info.data_event.device_ptr = (const void *) dp;
-
-         api_info->device_api = acc_device_api_cuda;
-
-         GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
-                                               api_info);
-       }
-      CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, (void *) hp,
-                       mapnum * sizeof (void *));
-      if (profiling_p)
-       {
-         prof_info->event_type = acc_ev_enqueue_upload_end;
-         data_event_info.data_event.event_type = prof_info->event_type;
-         GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
-                                               api_info);
-       }
-    }
-
-  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
-             dp, NULL);
+  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, NULL);
 
   CUresult r = CUDA_CALL_NOCHECK (cuStreamSynchronize, NULL);
   const char *maybe_abort_msg = "(perhaps abort was called)";
@@ -1499,18 +1447,6 @@ GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
                       maybe_abort_msg);
   else if (r != CUDA_SUCCESS)
     GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
-
-  CUDA_CALL_ASSERT (cuMemFree, dp);
-  if (profiling_p)
-    goacc_profiling_acc_ev_free (thr, (void *) dp);
-}
-
-static void
-cuda_free_argmem (void *ptr)
-{
-  void **block = (void **) ptr;
-  nvptx_free (block[0], (struct ptx_device *) block[1]);
-  free (block);
 }
 
 void
@@ -1519,78 +1455,7 @@ GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *), size_t mapnum,
                                 unsigned *dims, void *targ_mem_desc,
                                 struct goacc_asyncqueue *aq)
 {
-  GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
-
-  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
-  acc_prof_info *prof_info = thr->prof_info;
-  acc_event_info data_event_info;
-  acc_api_info *api_info = thr->api_info;
-  bool profiling_p = __builtin_expect (prof_info != NULL, false);
-
-  void **hp = NULL;
-  CUdeviceptr dp = 0;
-  void **block = NULL;
-
-  if (mapnum > 0)
-    {
-      size_t s = mapnum * sizeof (void *);
-      block = (void **) GOMP_PLUGIN_malloc (2 * sizeof (void *) + s);
-      hp = block + 2;
-      for (int i = 0; i < mapnum; i++)
-       hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
-      CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
-      if (profiling_p)
-       goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
-    }
-
-  /* Copy the (device) pointers to arguments to the device (dp and hp might in
-     fact have the same value on a unified-memory system).  */
-  if (mapnum > 0)
-    {
-      if (profiling_p)
-       {
-         prof_info->event_type = acc_ev_enqueue_upload_start;
-
-         data_event_info.data_event.event_type = prof_info->event_type;
-         data_event_info.data_event.valid_bytes
-           = _ACC_DATA_EVENT_INFO_VALID_BYTES;
-         data_event_info.data_event.parent_construct
-           = acc_construct_parallel;
-         data_event_info.data_event.implicit = 1; /* Always implicit.  */
-         data_event_info.data_event.tool_info = NULL;
-         data_event_info.data_event.var_name = NULL;
-         data_event_info.data_event.bytes = mapnum * sizeof (void *);
-         data_event_info.data_event.host_ptr = hp;
-         data_event_info.data_event.device_ptr = (const void *) dp;
-
-         api_info->device_api = acc_device_api_cuda;
-
-         GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
-                                               api_info);
-       }
-
-      CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, dp, (void *) hp,
-                       mapnum * sizeof (void *), aq->cuda_stream);
-      block[0] = (void *) dp;
-
-      struct nvptx_thread *nvthd =
-       (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
-      block[1] = (void *) nvthd->ptx_dev;
-
-      if (profiling_p)
-       {
-         prof_info->event_type = acc_ev_enqueue_upload_end;
-         data_event_info.data_event.event_type = prof_info->event_type;
-         GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
-                                               api_info);
-       }
-    }
-
-  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
-             dp, aq->cuda_stream);
-
-  if (mapnum > 0)
-    GOMP_OFFLOAD_openacc_async_queue_callback (aq, cuda_free_argmem, block);
+  nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, aq->cuda_stream);
 }
 
 void *