Run all kernels regions with GOMP_MAP_FORCE_TOFROM mappings synchronously

author Julian Brown <julian@codesourcery.com>

Tue, 22 Oct 2019 00:22:31 +0000 (17:22 -0700)

committer Kwok Cheung Yeung <kcy@codesourcery.com>

Thu, 22 Apr 2021 17:14:21 +0000 (10:14 -0700)
author Julian Brown <julian@codesourcery.com>
Tue, 22 Oct 2019 00:22:31 +0000 (17:22 -0700)
committer Kwok Cheung Yeung <kcy@codesourcery.com>
Thu, 22 Apr 2021 17:14:21 +0000 (10:14 -0700)
diff --git a/gcc/ChangeLog.omp b/gcc/ChangeLog.omp

index b9fb82e39d84e1f8994a374bf0f4c95e32b7e351..71d5c5fddbf8d281498c195bac31eb624470ba4f 100644 (file)
--- a/gcc/ChangeLog.omp
+++ b/gcc/ChangeLog.omp
@@ -1,3 +1,13 @@
+2019-10-21  Julian Brown  <julian@codesourcery.com>
+
+       * omp-oacc-kernels-decompose.cc (decompose_kernels_region_body): Add
+       inhibit_async parameter.  Force asynchronous kernel launches to
+       run synchronously if they have problematic variable mappings.
+       Don't add explicit wait for decomposed kernels regions we forced
+       synchronous.
+       (omp_oacc_kernels_decompose_1): Detect problematic variable mappings,
+       and inhibit asynchronous execution if we find any.
+
  2020-02-06  Tobias Burnus  <tobias@codesourcery.com>
  
         * omp-low.c (convert_from_firstprivate_int):
diff --git a/gcc/omp-oacc-kernels-decompose.cc b/gcc/omp-oacc-kernels-decompose.cc

index 26c26e5b896f4e14f5bed9738056058024fe4766..bceb03c422e0f3b249864c305a6e9e933ed534ca 100644 (file)
--- a/gcc/omp-oacc-kernels-decompose.cc
+++ b/gcc/omp-oacc-kernels-decompose.cc
@@ -1154,7 +1154,8 @@ control_flow_regions::compute_regions (gimple_seq seq)
     with the KERNELS_CLAUSES, into a series of compute constructs.  */
  
  static gimple *
-decompose_kernels_region_body (gimple *kernels_region, tree kernels_clauses)
+decompose_kernels_region_body (gimple *kernels_region, tree kernels_clauses,
+                              bool inhibit_async)
  {
    location_t loc = gimple_location (kernels_region);
  
@@ -1280,6 +1281,48 @@ decompose_kernels_region_body (gimple *kernels_region, tree kernels_clauses)
       separated from the loop.  */
    bool only_simple_assignments = true;
  
+  /* If we have variable mappings that are problematic to use with
+     asynchronous compute regions (see transform_kernels_region), force
+     decomposed asynchronous kernels launches to run in synchronous mode.
+     FIXME: This is potentially a performance loss.  */
+  if (inhibit_async && async_clause)
+    {
+      int num_waits = 0;
+
+      tree prev_clause = NULL_TREE;
+      for (tree clause = kernels_clauses;
+          clause;
+          clause = OMP_CLAUSE_CHAIN (clause))
+       {
+         if (OMP_CLAUSE_CODE (clause) == OMP_CLAUSE_ASYNC)
+           {
+             /* Wait for the async queue given on the original kernels
+                construct.  We're running the decomposed sequence of parallels
+                synchronously now.  */
+             tree wait_fn = builtin_decl_explicit (BUILT_IN_GOACC_WAIT);
+             tree sync_arg
+               = build_int_cst (integer_type_node, GOMP_ASYNC_SYNC);
+             tree wait = unshare_expr (OMP_CLAUSE_OPERAND (clause, 0));
+             gimple *wait_call
+               = gimple_build_call (wait_fn, 3, sync_arg, integer_one_node,
+                                    wait);
+             gimple_set_location (wait_call, loc);
+             gimple_seq_add_stmt (&region_body, wait_call);
+
+             num_waits++;
+
+             /* Remove async clause.  */
+             if (prev_clause)
+               OMP_CLAUSE_CHAIN (prev_clause) = OMP_CLAUSE_CHAIN (clause);
+             else
+               kernels_clauses = OMP_CLAUSE_CHAIN (clause);
+           }
+         prev_clause = clause;
+       }
+
+      gcc_assert (num_waits == 1);
+    }
+
    /* Precompute the control flow region information to determine whether an
       OpenACC loop is executed conditionally or unconditionally.  */
    control_flow_regions cf_regions (body_sequence);
@@ -1391,21 +1434,24 @@ decompose_kernels_region_body (gimple *kernels_region, tree kernels_clauses)
        gimple_seq_add_stmt (&region_body, single_region);
      }
  
-  /* We want to launch these kernels asynchronously.  If the original
-     kernels region had an async clause, this is done automatically because
-     that async clause was copied to the individual regions we created.
-     Otherwise, add an async clause to each newly created region, as well as
-     a wait directive at the end.  */
-  if (async_clause == NULL)
-    add_async_clauses_and_wait (loc, &region_body);
-  else
-    /* !!! If we have asynchronous parallel blocks inside a (synchronous) data
-       region, then target memory will get unmapped at the point the data
-       region ends, even if the inner asynchronous parallels have not yet
-       completed.  For kernels marked "async", we might want to use "enter data
-       async(...)" and "exit data async(...)" instead.
-       For now, insert a (synchronous) wait at the end of the block.  */
-    add_wait (loc, &region_body);
+  if (!inhibit_async)
+    {
+      /* We want to launch these kernels asynchronously.  If the original
+        kernels region had an async clause, this is done automatically because
+        that async clause was copied to the individual regions we created.
+        Otherwise, add an async clause to each newly created region, as well
+        as a wait directive at the end.  */
+      if (async_clause == NULL)
+       add_async_clauses_and_wait (loc, &region_body);
+      else
+       /* !!! If we have asynchronous parallel blocks inside a (synchronous)
+          data region, then target memory will get unmapped at the point the
+          data region ends, even if the inner asynchronous parallels have not
+          yet completed.  For kernels marked "async", we might want to use
+          "enter data async(...)" and "exit data async(...)" instead.
+          For now, insert a (synchronous) wait at the end of the block.  */
+       add_wait (loc, &region_body);
+    }
  
    tree kernels_locals = gimple_bind_vars (as_a <gbind *> (kernels_body));
    gimple *body = gimple_build_bind (kernels_locals, region_body,
@@ -1426,6 +1472,8 @@ decompose_kernels_region_body (gimple *kernels_region, tree kernels_clauses)
  static gimple *
  omp_oacc_kernels_decompose_1 (gimple *kernels_stmt)
  {
+  bool inhibit_async = false;
+
    gcc_checking_assert (gimple_omp_target_kind (kernels_stmt)
                        == GF_OMP_TARGET_KIND_OACC_KERNELS);
    location_t loc = gimple_location (kernels_stmt);
@@ -1475,9 +1523,22 @@ omp_oacc_kernels_decompose_1 (gimple *kernels_stmt)
                 }
               break;
  
+           case GOMP_MAP_FORCE_TOFROM:
+             /* If we have a FORCE_TOFROM mapping, it might be to a
+                non-addressable scalar.  The value is copied directly from the
+                offload kernel without waiting for an async to complete (a
+                race condition).  We have to be careful since the scalar might
+                be used as a loop index variable, and those are handled
+                specially.  We can't just force such variables to be
+                addressable.  Hence, run parts of kernels with such mappings
+                synchronously.
+                FIXME: Ideally we want to remove this hack and be able to
+                move this mapping to the enclosing data region.  */
+             inhibit_async = true;
+             break;
+
             case GOMP_MAP_POINTER:
             case GOMP_MAP_TO_PSET:
-           case GOMP_MAP_FORCE_TOFROM:
             case GOMP_MAP_FIRSTPRIVATE_POINTER:
             case GOMP_MAP_FIRSTPRIVATE_REFERENCE:
               /* ??? Copying these map kinds leads to internal compiler
@@ -1509,7 +1570,8 @@ omp_oacc_kernels_decompose_1 (gimple *kernels_stmt)
    /* Transform the body of the kernels region into a sequence of compute
       constructs.  */
    gimple *body = decompose_kernels_region_body (kernels_stmt,
-                                               kernels_clauses);
+                                               kernels_clauses,
+                                               inhibit_async);
  
    /* Put the transformed pieces together.  The entire body of the region is
       wrapped in a try-finally statement that calls __builtin_GOACC_data_end
author	Julian Brown <julian@codesourcery.com>
	Tue, 22 Oct 2019 00:22:31 +0000 (17:22 -0700)
committer	Kwok Cheung Yeung <kcy@codesourcery.com>
	Thu, 22 Apr 2021 17:14:21 +0000 (10:14 -0700)
gcc/ChangeLog.omp		patch \| blob \| blame \| history
gcc/omp-oacc-kernels-decompose.cc		patch \| blob \| blame \| history