From: Julian Brown <julian@codesourcery.com>
Date: Tue, 22 Oct 2019 00:22:31 +0000 (-0700)
Subject: Run all kernels regions with GOMP_MAP_FORCE_TOFROM mappings synchronously
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=8f5bf07a50155ddac1f38c657736cf09865a40ea;p=thirdparty%2Fgcc.git

Run all kernels regions with GOMP_MAP_FORCE_TOFROM mappings synchronously

	gcc/
	* omp-oacc-kernels-decompose.cc (decompose_kernels_region_body): Add
	inhibit_async parameter.  Force asynchronous kernel launches to
	run synchronously if they have problematic variable mappings.
	Don't add explicit wait for decomposed kernels regions we forced
	synchronous.
	(omp_oacc_kernels_decompose_1): Detect problematic variable mappings,
	and inhibit asynchronous execution if we find any.
---

diff --git a/gcc/ChangeLog.omp b/gcc/ChangeLog.omp
index b9fb82e39d84..71d5c5fddbf8 100644
--- a/gcc/ChangeLog.omp
+++ b/gcc/ChangeLog.omp
@@ -1,3 +1,13 @@
+2019-10-21  Julian Brown  <julian@codesourcery.com>
+
+	* omp-oacc-kernels-decompose.cc (decompose_kernels_region_body): Add
+	inhibit_async parameter.  Force asynchronous kernel launches to
+	run synchronously if they have problematic variable mappings.
+	Don't add explicit wait for decomposed kernels regions we forced
+	synchronous.
+	(omp_oacc_kernels_decompose_1): Detect problematic variable mappings,
+	and inhibit asynchronous execution if we find any.
+
 2020-02-06  Tobias Burnus  <tobias@codesourcery.com>
 
 	* omp-low.c (convert_from_firstprivate_int):
diff --git a/gcc/omp-oacc-kernels-decompose.cc b/gcc/omp-oacc-kernels-decompose.cc
index 26c26e5b896f..bceb03c422e0 100644
--- a/gcc/omp-oacc-kernels-decompose.cc
+++ b/gcc/omp-oacc-kernels-decompose.cc
@@ -1154,7 +1154,8 @@ control_flow_regions::compute_regions (gimple_seq seq)
    with the KERNELS_CLAUSES, into a series of compute constructs.  */
 
 static gimple *
-decompose_kernels_region_body (gimple *kernels_region, tree kernels_clauses)
+decompose_kernels_region_body (gimple *kernels_region, tree kernels_clauses,
+			       bool inhibit_async)
 {
   location_t loc = gimple_location (kernels_region);
 
@@ -1280,6 +1281,48 @@ decompose_kernels_region_body (gimple *kernels_region, tree kernels_clauses)
      separated from the loop.  */
   bool only_simple_assignments = true;
 
+  /* If we have variable mappings that are problematic to use with
+     asynchronous compute regions (see transform_kernels_region), force
+     decomposed asynchronous kernels launches to run in synchronous mode.
+     FIXME: This is potentially a performance loss.  */
+  if (inhibit_async && async_clause)
+    {
+      int num_waits = 0;
+
+      tree prev_clause = NULL_TREE;
+      for (tree clause = kernels_clauses;
+	   clause;
+	   clause = OMP_CLAUSE_CHAIN (clause))
+	{
+	  if (OMP_CLAUSE_CODE (clause) == OMP_CLAUSE_ASYNC)
+	    {
+	      /* Wait for the async queue given on the original kernels
+		 construct.  We're running the decomposed sequence of parallels
+		 synchronously now.  */
+	      tree wait_fn = builtin_decl_explicit (BUILT_IN_GOACC_WAIT);
+	      tree sync_arg
+		= build_int_cst (integer_type_node, GOMP_ASYNC_SYNC);
+	      tree wait = unshare_expr (OMP_CLAUSE_OPERAND (clause, 0));
+	      gimple *wait_call
+		= gimple_build_call (wait_fn, 3, sync_arg, integer_one_node,
+				     wait);
+	      gimple_set_location (wait_call, loc);
+	      gimple_seq_add_stmt (&region_body, wait_call);
+
+	      num_waits++;
+
+	      /* Remove async clause.  */
+	      if (prev_clause)
+		OMP_CLAUSE_CHAIN (prev_clause) = OMP_CLAUSE_CHAIN (clause);
+	      else
+		kernels_clauses = OMP_CLAUSE_CHAIN (clause);
+	    }
+	  prev_clause = clause;
+	}
+
+      gcc_assert (num_waits == 1);
+    }
+
   /* Precompute the control flow region information to determine whether an
      OpenACC loop is executed conditionally or unconditionally.  */
   control_flow_regions cf_regions (body_sequence);
@@ -1391,21 +1434,24 @@ decompose_kernels_region_body (gimple *kernels_region, tree kernels_clauses)
       gimple_seq_add_stmt (&region_body, single_region);
     }
 
-  /* We want to launch these kernels asynchronously.  If the original
-     kernels region had an async clause, this is done automatically because
-     that async clause was copied to the individual regions we created.
-     Otherwise, add an async clause to each newly created region, as well as
-     a wait directive at the end.  */
-  if (async_clause == NULL)
-    add_async_clauses_and_wait (loc, &region_body);
-  else
-    /* !!! If we have asynchronous parallel blocks inside a (synchronous) data
-       region, then target memory will get unmapped at the point the data
-       region ends, even if the inner asynchronous parallels have not yet
-       completed.  For kernels marked "async", we might want to use "enter data
-       async(...)" and "exit data async(...)" instead.
-       For now, insert a (synchronous) wait at the end of the block.  */
-    add_wait (loc, &region_body);
+  if (!inhibit_async)
+    {
+      /* We want to launch these kernels asynchronously.  If the original
+	 kernels region had an async clause, this is done automatically because
+	 that async clause was copied to the individual regions we created.
+	 Otherwise, add an async clause to each newly created region, as well
+	 as a wait directive at the end.  */
+      if (async_clause == NULL)
+	add_async_clauses_and_wait (loc, &region_body);
+      else
+	/* !!! If we have asynchronous parallel blocks inside a (synchronous)
+	   data region, then target memory will get unmapped at the point the
+	   data region ends, even if the inner asynchronous parallels have not
+	   yet completed.  For kernels marked "async", we might want to use
+	   "enter data async(...)" and "exit data async(...)" instead.
+	   For now, insert a (synchronous) wait at the end of the block.  */
+	add_wait (loc, &region_body);
+    }
 
   tree kernels_locals = gimple_bind_vars (as_a <gbind *> (kernels_body));
   gimple *body = gimple_build_bind (kernels_locals, region_body,
@@ -1426,6 +1472,8 @@ decompose_kernels_region_body (gimple *kernels_region, tree kernels_clauses)
 static gimple *
 omp_oacc_kernels_decompose_1 (gimple *kernels_stmt)
 {
+  bool inhibit_async = false;
+
   gcc_checking_assert (gimple_omp_target_kind (kernels_stmt)
 		       == GF_OMP_TARGET_KIND_OACC_KERNELS);
   location_t loc = gimple_location (kernels_stmt);
@@ -1475,9 +1523,22 @@ omp_oacc_kernels_decompose_1 (gimple *kernels_stmt)
 		}
 	      break;
 
+	    case GOMP_MAP_FORCE_TOFROM:
+	      /* If we have a FORCE_TOFROM mapping, it might be to a
+		 non-addressable scalar.  The value is copied directly from the
+		 offload kernel without waiting for an async to complete (a
+		 race condition).  We have to be careful since the scalar might
+		 be used as a loop index variable, and those are handled
+		 specially.  We can't just force such variables to be
+		 addressable.  Hence, run parts of kernels with such mappings
+		 synchronously.
+		 FIXME: Ideally we want to remove this hack and be able to
+		 move this mapping to the enclosing data region.  */
+	      inhibit_async = true;
+	      break;
+
 	    case GOMP_MAP_POINTER:
 	    case GOMP_MAP_TO_PSET:
-	    case GOMP_MAP_FORCE_TOFROM:
 	    case GOMP_MAP_FIRSTPRIVATE_POINTER:
 	    case GOMP_MAP_FIRSTPRIVATE_REFERENCE:
 	      /* ??? Copying these map kinds leads to internal compiler
@@ -1509,7 +1570,8 @@ omp_oacc_kernels_decompose_1 (gimple *kernels_stmt)
   /* Transform the body of the kernels region into a sequence of compute
      constructs.  */
   gimple *body = decompose_kernels_region_body (kernels_stmt,
-						kernels_clauses);
+						kernels_clauses,
+						inhibit_async);
 
   /* Put the transformed pieces together.  The entire body of the region is
      wrapped in a try-finally statement that calls __builtin_GOACC_data_end