From: Julian Brown Date: Tue, 26 Feb 2019 21:39:03 +0000 (-0800) Subject: Generate sequential loop for OpenACC loop directive inside kernels X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=5e34073bd779684f0fc67ee208e98762610b2f94;p=thirdparty%2Fgcc.git Generate sequential loop for OpenACC loop directive inside kernels 2019-09-20 Chung-Lin Tang Cesar Philippidis gcc/ * omp-expand.c (struct omp_region): Add inside_kernels_p field. (expand_omp_for_generic): Adjust to generate a 'sequential' loop when GOMP builtin arguments are BUILT_IN_NONE. (expand_omp_for): Use expand_omp_for_generic to generate a non-parallelized loop for OMP_FORs inside OpenACC kernels regions. (expand_omp): Mark inside_kernels_p field true for regions nested inside OpenACC kernels constructs. gcc/testsuite/ * c-c++-common/goacc/kernels-loop-acc-loop.c: New test. * c-c++-common/goacc/kernels-loop-2-acc-loop.c: New test. * c-c++-common/goacc/kernels-loop-3-acc-loop.c: New test. * c-c++-common/goacc/kernels-loop-n-acc-loop.c: New test. * c-c++-common/goacc/kernels-acc-loop-reduction.c: New test. * c-c++-common/goacc/kernels-acc-loop-smaller-equal.c: New test. (cherry picked from openacc-gcc-9-branch commit 691d1aa4a69aac5230ada0f116efee445a2ea4fe) --- diff --git a/gcc/ChangeLog.omp b/gcc/ChangeLog.omp index f23ad4ed2202..f0a68bfed8d7 100644 --- a/gcc/ChangeLog.omp +++ b/gcc/ChangeLog.omp @@ -1,3 +1,14 @@ +2019-09-20 Chung-Lin Tang + Cesar Philippidis + + * omp-expand.c (struct omp_region): Add inside_kernels_p field. + (expand_omp_for_generic): Adjust to generate a 'sequential' loop + when GOMP builtin arguments are BUILT_IN_NONE. + (expand_omp_for): Use expand_omp_for_generic to generate a + non-parallelized loop for OMP_FORs inside OpenACC kernels regions. + (expand_omp): Mark inside_kernels_p field true for regions + nested inside OpenACC kernels constructs. + 2018-09-20 Cesar Philippidis Julian Brown diff --git a/gcc/omp-expand.c b/gcc/omp-expand.c index 89a3a3232671..27de3be190d6 100644 --- a/gcc/omp-expand.c +++ b/gcc/omp-expand.c @@ -103,6 +103,9 @@ struct omp_region /* The ordered stmt if type is GIMPLE_OMP_ORDERED and it has a depend clause. */ gomp_ordered *ord_stmt; + + /* True if this is nested inside an OpenACC kernels construct. */ + bool inside_kernels_p; }; static struct omp_region *root_omp_region; @@ -2645,6 +2648,7 @@ expand_omp_for_generic (struct omp_region *region, gassign *assign_stmt; bool in_combined_parallel = is_combined_parallel (region); bool broken_loop = region->cont == NULL; + bool seq_loop = (start_fn == BUILT_IN_NONE || next_fn == BUILT_IN_NONE); edge e, ne; tree *counts = NULL; int i; @@ -2766,8 +2770,12 @@ expand_omp_for_generic (struct omp_region *region, type = TREE_TYPE (fd->loop.v); istart0 = create_tmp_var (fd->iter_type, ".istart0"); iend0 = create_tmp_var (fd->iter_type, ".iend0"); - TREE_ADDRESSABLE (istart0) = 1; - TREE_ADDRESSABLE (iend0) = 1; + + if (!seq_loop) + { + TREE_ADDRESSABLE (istart0) = 1; + TREE_ADDRESSABLE (iend0) = 1; + } /* See if we need to bias by LLONG_MIN. */ if (fd->iter_type == long_long_unsigned_type_node @@ -2797,7 +2805,25 @@ expand_omp_for_generic (struct omp_region *region, gsi_prev (&gsif); tree arr = NULL_TREE; - if (in_combined_parallel) + if (seq_loop) + { + tree n1 = fold_convert (fd->iter_type, fd->loop.n1); + tree n2 = fold_convert (fd->iter_type, fd->loop.n2); + + n1 = force_gimple_operand_gsi_1 (&gsi, n1, is_gimple_reg, NULL_TREE, true, + GSI_SAME_STMT); + n2 = force_gimple_operand_gsi_1 (&gsi, n2, is_gimple_reg, NULL_TREE, true, + GSI_SAME_STMT); + + assign_stmt = gimple_build_assign (istart0, n1); + gsi_insert_before (&gsi, assign_stmt, GSI_SAME_STMT); + + assign_stmt = gimple_build_assign (iend0, n2); + gsi_insert_before (&gsi, assign_stmt, GSI_SAME_STMT); + + t = fold_build2 (NE_EXPR, boolean_type_node, istart0, iend0); + } + else if (in_combined_parallel) { gcc_assert (fd->ordered == 0); /* In a combined parallel loop, emit a call to @@ -3246,48 +3272,55 @@ expand_omp_for_generic (struct omp_region *region, collapse_bb = extract_omp_for_update_vars (fd, cont_bb, l1_bb); /* Emit code to get the next parallel iteration in L2_BB. */ - gsi = gsi_start_bb (l2_bb); + if (!seq_loop) + { + gsi = gsi_start_bb (l2_bb); - t = build_call_expr (builtin_decl_explicit (next_fn), 2, - build_fold_addr_expr (istart0), - build_fold_addr_expr (iend0)); - t = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE, - false, GSI_CONTINUE_LINKING); - if (TREE_TYPE (t) != boolean_type_node) - t = fold_build2 (NE_EXPR, boolean_type_node, - t, build_int_cst (TREE_TYPE (t), 0)); - gcond *cond_stmt = gimple_build_cond_empty (t); - gsi_insert_after (&gsi, cond_stmt, GSI_CONTINUE_LINKING); + t = build_call_expr (builtin_decl_explicit (next_fn), 2, + build_fold_addr_expr (istart0), + build_fold_addr_expr (iend0)); + t = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE, + false, GSI_CONTINUE_LINKING); + if (TREE_TYPE (t) != boolean_type_node) + t = fold_build2 (NE_EXPR, boolean_type_node, + t, build_int_cst (TREE_TYPE (t), 0)); + gcond *cond_stmt = gimple_build_cond_empty (t); + gsi_insert_after (&gsi, cond_stmt, GSI_CONTINUE_LINKING); + } } /* Add the loop cleanup function. */ gsi = gsi_last_nondebug_bb (exit_bb); - if (gimple_omp_return_nowait_p (gsi_stmt (gsi))) - t = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END_NOWAIT); - else if (gimple_omp_return_lhs (gsi_stmt (gsi))) - t = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END_CANCEL); - else - t = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END); - gcall *call_stmt = gimple_build_call (t, 0); - if (fd->ordered) + if (!seq_loop) { - tree arr = counts[fd->ordered]; - tree clobber = build_constructor (TREE_TYPE (arr), NULL); - TREE_THIS_VOLATILE (clobber) = 1; - gsi_insert_after (&gsi, gimple_build_assign (arr, clobber), - GSI_SAME_STMT); - } - if (gimple_omp_return_lhs (gsi_stmt (gsi))) - { - gimple_call_set_lhs (call_stmt, gimple_omp_return_lhs (gsi_stmt (gsi))); - if (fd->have_reductemp) + if (gimple_omp_return_nowait_p (gsi_stmt (gsi))) + t = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END_NOWAIT); + else if (gimple_omp_return_lhs (gsi_stmt (gsi))) + t = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END_CANCEL); + else + t = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END); + gcall *call_stmt = gimple_build_call (t, 0); + if (fd->ordered) { - gimple *g = gimple_build_assign (reductions, NOP_EXPR, - gimple_call_lhs (call_stmt)); - gsi_insert_after (&gsi, g, GSI_SAME_STMT); + tree arr = counts[fd->ordered]; + tree clobber = build_constructor (TREE_TYPE (arr), NULL); + TREE_THIS_VOLATILE (clobber) = 1; + gsi_insert_after (&gsi, gimple_build_assign (arr, clobber), + GSI_SAME_STMT); } + if (gimple_omp_return_lhs (gsi_stmt (gsi))) + { + gimple_call_set_lhs (call_stmt, + gimple_omp_return_lhs (gsi_stmt (gsi))); + if (fd->have_reductemp) + { + gimple *g = gimple_build_assign (reductions, NOP_EXPR, + gimple_call_lhs (call_stmt)); + gsi_insert_after (&gsi, g, GSI_SAME_STMT); + } + } + gsi_insert_after (&gsi, call_stmt, GSI_SAME_STMT); } - gsi_insert_after (&gsi, call_stmt, GSI_SAME_STMT); gsi_remove (&gsi, true); /* Connect the new blocks. */ @@ -3299,7 +3332,8 @@ expand_omp_for_generic (struct omp_region *region, gimple_seq phis; e = find_edge (cont_bb, l3_bb); - ne = make_edge (l2_bb, l3_bb, EDGE_FALSE_VALUE); + ne = make_edge (l2_bb, l3_bb, + seq_loop ? EDGE_FALLTHRU : EDGE_FALSE_VALUE); phis = phi_nodes (l3_bb); for (gsi = gsi_start (phis); !gsi_end_p (gsi); gsi_next (&gsi)) @@ -3339,7 +3373,8 @@ expand_omp_for_generic (struct omp_region *region, e = find_edge (cont_bb, l2_bb); e->flags = EDGE_FALLTHRU; } - make_edge (l2_bb, l0_bb, EDGE_TRUE_VALUE); + if (!seq_loop) + make_edge (l2_bb, l0_bb, EDGE_TRUE_VALUE); if (gimple_in_ssa_p (cfun)) { @@ -3398,12 +3433,16 @@ expand_omp_for_generic (struct omp_region *region, add_bb_to_loop (l2_bb, outer_loop); - /* We've added a new loop around the original loop. Allocate the - corresponding loop struct. */ - struct loop *new_loop = alloc_loop (); - new_loop->header = l0_bb; - new_loop->latch = l2_bb; - add_loop (new_loop, outer_loop); + struct loop *new_loop = NULL; + if (!seq_loop) + { + /* We've added a new loop around the original loop. Allocate the + corresponding loop struct. */ + new_loop = alloc_loop (); + new_loop->header = l0_bb; + new_loop->latch = l2_bb; + add_loop (new_loop, outer_loop); + } /* Allocate a loop structure for the original loop unless we already had one. */ @@ -3413,7 +3452,8 @@ expand_omp_for_generic (struct omp_region *region, struct loop *orig_loop = alloc_loop (); orig_loop->header = l1_bb; /* The loop may have multiple latches. */ - add_loop (orig_loop, new_loop); + add_loop (orig_loop, + new_loop != NULL ? new_loop : outer_loop); } } } @@ -5944,7 +5984,10 @@ expand_omp_for (struct omp_region *region, gimple *inner_stmt) original loops from being detected. Fix that up. */ loops_state_set (LOOPS_NEED_FIXUP); - if (gimple_omp_for_kind (fd.for_stmt) & GF_OMP_FOR_SIMD) + if (region->inside_kernels_p) + expand_omp_for_generic (region, &fd, BUILT_IN_NONE, BUILT_IN_NONE, + NULL_TREE, inner_stmt); + else if (gimple_omp_for_kind (fd.for_stmt) & GF_OMP_FOR_SIMD) expand_omp_simd (region, &fd); else if (gimple_omp_for_kind (fd.for_stmt) == GF_OMP_FOR_KIND_OACC_LOOP) { @@ -8129,7 +8172,19 @@ expand_omp (struct omp_region *region) if (region->type == GIMPLE_OMP_PARALLEL) determine_parallel_type (region); else if (region->type == GIMPLE_OMP_TARGET) - grid_expand_target_grid_body (region); + { + grid_expand_target_grid_body (region); + + if (region->inner) + { + gomp_target *entry + = as_a (last_stmt (region->entry)); + if (region->inside_kernels_p + || (gimple_omp_target_kind (entry) + == GF_OMP_TARGET_KIND_OACC_KERNELS)) + region->inner->inside_kernels_p = true; + } + } if (region->type == GIMPLE_OMP_FOR && gimple_omp_for_combined_p (last_stmt (region->entry))) diff --git a/gcc/testsuite/ChangeLog.omp b/gcc/testsuite/ChangeLog.omp index 11be8f7ed887..b7e3a2a814fd 100644 --- a/gcc/testsuite/ChangeLog.omp +++ b/gcc/testsuite/ChangeLog.omp @@ -1,3 +1,13 @@ +2019-09-20 Chung-Lin Tang + Cesar Philippidis + + * c-c++-common/goacc/kernels-loop-acc-loop.c: New test. + * c-c++-common/goacc/kernels-loop-2-acc-loop.c: New test. + * c-c++-common/goacc/kernels-loop-3-acc-loop.c: New test. + * c-c++-common/goacc/kernels-loop-n-acc-loop.c: New test. + * c-c++-common/goacc/kernels-acc-loop-reduction.c: New test. + * c-c++-common/goacc/kernels-acc-loop-smaller-equal.c: New test. + 2018-10-22 Cesar Philippidis * g++.dg/goacc/loop-1.c: New test. diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-reduction.c b/gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-reduction.c new file mode 100644 index 000000000000..4824e5309254 --- /dev/null +++ b/gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-reduction.c @@ -0,0 +1,23 @@ +/* { dg-additional-options "-O2" } */ +/* { dg-additional-options "-fdump-tree-parloops1-all" } */ +/* { dg-additional-options "-fdump-tree-optimized" } */ + +unsigned int +foo (int n, unsigned int *a) +{ + unsigned int sum = 0; + +#pragma acc kernels loop gang reduction(+:sum) + for (int i = 0; i < n; i++) + sum += a[i]; + + return sum; +} + +/* Check that only one loop is analyzed, and that it can be parallelized. */ +/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops1" } } */ +/* { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 1 "parloops1" } } */ +/* { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } } */ + +/* Check that the loop has been split off into a function. */ +/* { dg-final { scan-tree-dump-times "(?n);; Function .*foo.*\\._omp_fn\\.0" 1 "optimized" } } */ diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-smaller-equal.c b/gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-smaller-equal.c new file mode 100644 index 000000000000..d70afb0e662a --- /dev/null +++ b/gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-smaller-equal.c @@ -0,0 +1,23 @@ +/* { dg-additional-options "-O2" } */ +/* { dg-additional-options "-fdump-tree-parloops1-all" } */ +/* { dg-additional-options "-fdump-tree-optimized" } */ + +unsigned int +foo (int n) +{ + unsigned int sum = 1; + + #pragma acc kernels loop + for (int i = 1; i <= n; i++) + sum += i; + + return sum; +} + +/* Check that only one loop is analyzed, and that it can be parallelized. */ +/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops1" } } */ +/* { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 1 "parloops1" } } */ +/* { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } } */ + +/* Check that the loop has been split off into a function. */ +/* { dg-final { scan-tree-dump-times "(?n);; Function .*foo.*\\._omp_fn\\.0" 1 "optimized" } } */ diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-2-acc-loop.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-2-acc-loop.c new file mode 100644 index 000000000000..7b127cb6fd9d --- /dev/null +++ b/gcc/testsuite/c-c++-common/goacc/kernels-loop-2-acc-loop.c @@ -0,0 +1,18 @@ +/* { dg-additional-options "-O2" } */ +/* { dg-additional-options "-fdump-tree-parloops1-all" } */ +/* { dg-additional-options "-fdump-tree-optimized" } */ + +/* Check that loops with '#pragma acc loop' tagged gets properly parallelized. */ +#define ACC_LOOP +#include "kernels-loop-2.c" + +/* Check that only three loops are analyzed, and that all can be + parallelized. */ +/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 3 "parloops1" } } */ +/* { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 3 "parloops1" } } */ +/* { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } } */ + +/* Check that the loop has been split off into a function. */ +/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */ +/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.1" 1 "optimized" } } */ +/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.2" 1 "optimized" } } */ diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-3-acc-loop.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-3-acc-loop.c new file mode 100644 index 000000000000..a040e096fc19 --- /dev/null +++ b/gcc/testsuite/c-c++-common/goacc/kernels-loop-3-acc-loop.c @@ -0,0 +1,15 @@ +/* { dg-additional-options "-O2" } */ +/* { dg-additional-options "-fdump-tree-parloops1-all" } */ +/* { dg-additional-options "-fdump-tree-optimized" } */ + +/* Check that loops with '#pragma acc loop' tagged gets properly parallelized. */ +#define ACC_LOOP +#include "kernels-loop-3.c" + +/* Check that only one loop is analyzed, and that it can be parallelized. */ +/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops1" } } */ +/* { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 1 "parloops1" } } */ +/* { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } } */ + +/* Check that the loop has been split off into a function. */ +/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */ diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-acc-loop.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-acc-loop.c new file mode 100644 index 000000000000..070a5b5bf3d4 --- /dev/null +++ b/gcc/testsuite/c-c++-common/goacc/kernels-loop-acc-loop.c @@ -0,0 +1,15 @@ +/* { dg-additional-options "-O2" } */ +/* { dg-additional-options "-fdump-tree-parloops1-all" } */ +/* { dg-additional-options "-fdump-tree-optimized" } */ + +/* Check that loops with '#pragma acc loop' tagged gets properly parallelized. */ +#define ACC_LOOP +#include "kernels-loop.c" + +/* Check that only one loop is analyzed, and that it can be parallelized. */ +/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops1" } } */ +/* { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 1 "parloops1" } } */ +/* { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } } */ + +/* Check that the loop has been split off into a function. */ +/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */ diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-n-acc-loop.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-n-acc-loop.c new file mode 100644 index 000000000000..1f25e63fbbb1 --- /dev/null +++ b/gcc/testsuite/c-c++-common/goacc/kernels-loop-n-acc-loop.c @@ -0,0 +1,15 @@ +/* { dg-additional-options "-O2" } */ +/* { dg-additional-options "-fdump-tree-parloops1-all" } */ +/* { dg-additional-options "-fdump-tree-optimized" } */ + +/* Check that loops with '#pragma acc loop' tagged gets properly parallelized. */ +#define ACC_LOOP +#include "kernels-loop-n.c" + +/* Check that only one loop is analyzed, and that it can be parallelized. */ +/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops1" } } */ +/* { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 1 "parloops1" } } */ +/* { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } } */ + +/* Check that the loop has been split off into a function. */ +/* { dg-final { scan-tree-dump-times "(?n);; Function .*foo.*._omp_fn.0" 1 "optimized" } } */