From: Julian Brown Date: Tue, 22 Oct 2019 00:22:31 +0000 (-0700) Subject: Run all kernels regions with GOMP_MAP_FORCE_TOFROM mappings synchronously X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=8f5bf07a50155ddac1f38c657736cf09865a40ea;p=thirdparty%2Fgcc.git Run all kernels regions with GOMP_MAP_FORCE_TOFROM mappings synchronously gcc/ * omp-oacc-kernels-decompose.cc (decompose_kernels_region_body): Add inhibit_async parameter. Force asynchronous kernel launches to run synchronously if they have problematic variable mappings. Don't add explicit wait for decomposed kernels regions we forced synchronous. (omp_oacc_kernels_decompose_1): Detect problematic variable mappings, and inhibit asynchronous execution if we find any. --- diff --git a/gcc/ChangeLog.omp b/gcc/ChangeLog.omp index b9fb82e39d84..71d5c5fddbf8 100644 --- a/gcc/ChangeLog.omp +++ b/gcc/ChangeLog.omp @@ -1,3 +1,13 @@ +2019-10-21 Julian Brown + + * omp-oacc-kernels-decompose.cc (decompose_kernels_region_body): Add + inhibit_async parameter. Force asynchronous kernel launches to + run synchronously if they have problematic variable mappings. + Don't add explicit wait for decomposed kernels regions we forced + synchronous. + (omp_oacc_kernels_decompose_1): Detect problematic variable mappings, + and inhibit asynchronous execution if we find any. + 2020-02-06 Tobias Burnus * omp-low.c (convert_from_firstprivate_int): diff --git a/gcc/omp-oacc-kernels-decompose.cc b/gcc/omp-oacc-kernels-decompose.cc index 26c26e5b896f..bceb03c422e0 100644 --- a/gcc/omp-oacc-kernels-decompose.cc +++ b/gcc/omp-oacc-kernels-decompose.cc @@ -1154,7 +1154,8 @@ control_flow_regions::compute_regions (gimple_seq seq) with the KERNELS_CLAUSES, into a series of compute constructs. */ static gimple * -decompose_kernels_region_body (gimple *kernels_region, tree kernels_clauses) +decompose_kernels_region_body (gimple *kernels_region, tree kernels_clauses, + bool inhibit_async) { location_t loc = gimple_location (kernels_region); @@ -1280,6 +1281,48 @@ decompose_kernels_region_body (gimple *kernels_region, tree kernels_clauses) separated from the loop. */ bool only_simple_assignments = true; + /* If we have variable mappings that are problematic to use with + asynchronous compute regions (see transform_kernels_region), force + decomposed asynchronous kernels launches to run in synchronous mode. + FIXME: This is potentially a performance loss. */ + if (inhibit_async && async_clause) + { + int num_waits = 0; + + tree prev_clause = NULL_TREE; + for (tree clause = kernels_clauses; + clause; + clause = OMP_CLAUSE_CHAIN (clause)) + { + if (OMP_CLAUSE_CODE (clause) == OMP_CLAUSE_ASYNC) + { + /* Wait for the async queue given on the original kernels + construct. We're running the decomposed sequence of parallels + synchronously now. */ + tree wait_fn = builtin_decl_explicit (BUILT_IN_GOACC_WAIT); + tree sync_arg + = build_int_cst (integer_type_node, GOMP_ASYNC_SYNC); + tree wait = unshare_expr (OMP_CLAUSE_OPERAND (clause, 0)); + gimple *wait_call + = gimple_build_call (wait_fn, 3, sync_arg, integer_one_node, + wait); + gimple_set_location (wait_call, loc); + gimple_seq_add_stmt (®ion_body, wait_call); + + num_waits++; + + /* Remove async clause. */ + if (prev_clause) + OMP_CLAUSE_CHAIN (prev_clause) = OMP_CLAUSE_CHAIN (clause); + else + kernels_clauses = OMP_CLAUSE_CHAIN (clause); + } + prev_clause = clause; + } + + gcc_assert (num_waits == 1); + } + /* Precompute the control flow region information to determine whether an OpenACC loop is executed conditionally or unconditionally. */ control_flow_regions cf_regions (body_sequence); @@ -1391,21 +1434,24 @@ decompose_kernels_region_body (gimple *kernels_region, tree kernels_clauses) gimple_seq_add_stmt (®ion_body, single_region); } - /* We want to launch these kernels asynchronously. If the original - kernels region had an async clause, this is done automatically because - that async clause was copied to the individual regions we created. - Otherwise, add an async clause to each newly created region, as well as - a wait directive at the end. */ - if (async_clause == NULL) - add_async_clauses_and_wait (loc, ®ion_body); - else - /* !!! If we have asynchronous parallel blocks inside a (synchronous) data - region, then target memory will get unmapped at the point the data - region ends, even if the inner asynchronous parallels have not yet - completed. For kernels marked "async", we might want to use "enter data - async(...)" and "exit data async(...)" instead. - For now, insert a (synchronous) wait at the end of the block. */ - add_wait (loc, ®ion_body); + if (!inhibit_async) + { + /* We want to launch these kernels asynchronously. If the original + kernels region had an async clause, this is done automatically because + that async clause was copied to the individual regions we created. + Otherwise, add an async clause to each newly created region, as well + as a wait directive at the end. */ + if (async_clause == NULL) + add_async_clauses_and_wait (loc, ®ion_body); + else + /* !!! If we have asynchronous parallel blocks inside a (synchronous) + data region, then target memory will get unmapped at the point the + data region ends, even if the inner asynchronous parallels have not + yet completed. For kernels marked "async", we might want to use + "enter data async(...)" and "exit data async(...)" instead. + For now, insert a (synchronous) wait at the end of the block. */ + add_wait (loc, ®ion_body); + } tree kernels_locals = gimple_bind_vars (as_a (kernels_body)); gimple *body = gimple_build_bind (kernels_locals, region_body, @@ -1426,6 +1472,8 @@ decompose_kernels_region_body (gimple *kernels_region, tree kernels_clauses) static gimple * omp_oacc_kernels_decompose_1 (gimple *kernels_stmt) { + bool inhibit_async = false; + gcc_checking_assert (gimple_omp_target_kind (kernels_stmt) == GF_OMP_TARGET_KIND_OACC_KERNELS); location_t loc = gimple_location (kernels_stmt); @@ -1475,9 +1523,22 @@ omp_oacc_kernels_decompose_1 (gimple *kernels_stmt) } break; + case GOMP_MAP_FORCE_TOFROM: + /* If we have a FORCE_TOFROM mapping, it might be to a + non-addressable scalar. The value is copied directly from the + offload kernel without waiting for an async to complete (a + race condition). We have to be careful since the scalar might + be used as a loop index variable, and those are handled + specially. We can't just force such variables to be + addressable. Hence, run parts of kernels with such mappings + synchronously. + FIXME: Ideally we want to remove this hack and be able to + move this mapping to the enclosing data region. */ + inhibit_async = true; + break; + case GOMP_MAP_POINTER: case GOMP_MAP_TO_PSET: - case GOMP_MAP_FORCE_TOFROM: case GOMP_MAP_FIRSTPRIVATE_POINTER: case GOMP_MAP_FIRSTPRIVATE_REFERENCE: /* ??? Copying these map kinds leads to internal compiler @@ -1509,7 +1570,8 @@ omp_oacc_kernels_decompose_1 (gimple *kernels_stmt) /* Transform the body of the kernels region into a sequence of compute constructs. */ gimple *body = decompose_kernels_region_body (kernels_stmt, - kernels_clauses); + kernels_clauses, + inhibit_async); /* Put the transformed pieces together. The entire body of the region is wrapped in a try-finally statement that calls __builtin_GOACC_data_end