]> git.ipfire.org Git - thirdparty/gcc.git/commitdiff
Generate sequential loop for OpenACC loop directive inside kernels
authorJulian Brown <julian@codesourcery.com>
Tue, 26 Feb 2019 21:39:03 +0000 (13:39 -0800)
committerThomas Schwinge <thomas@codesourcery.com>
Tue, 3 Mar 2020 11:13:59 +0000 (12:13 +0100)
2019-09-20  Chung-Lin Tang <cltang@codesourcery.com>
    Cesar Philippidis  <cesar@codesourcery.com>

gcc/
* omp-expand.c (struct omp_region): Add inside_kernels_p field.
(expand_omp_for_generic): Adjust to generate a 'sequential' loop
when GOMP builtin arguments are BUILT_IN_NONE.
(expand_omp_for): Use expand_omp_for_generic to generate a
non-parallelized loop for OMP_FORs inside OpenACC kernels regions.
(expand_omp): Mark inside_kernels_p field true for regions
nested inside OpenACC kernels constructs.
gcc/testsuite/
* c-c++-common/goacc/kernels-loop-acc-loop.c: New test.
* c-c++-common/goacc/kernels-loop-2-acc-loop.c: New test.
* c-c++-common/goacc/kernels-loop-3-acc-loop.c: New test.
* c-c++-common/goacc/kernels-loop-n-acc-loop.c: New test.
* c-c++-common/goacc/kernels-acc-loop-reduction.c: New test.
* c-c++-common/goacc/kernels-acc-loop-smaller-equal.c: New test.

(cherry picked from openacc-gcc-9-branch commit
691d1aa4a69aac5230ada0f116efee445a2ea4fe)

gcc/ChangeLog.omp
gcc/omp-expand.c
gcc/testsuite/ChangeLog.omp
gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-reduction.c [new file with mode: 0644]
gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-smaller-equal.c [new file with mode: 0644]
gcc/testsuite/c-c++-common/goacc/kernels-loop-2-acc-loop.c [new file with mode: 0644]
gcc/testsuite/c-c++-common/goacc/kernels-loop-3-acc-loop.c [new file with mode: 0644]
gcc/testsuite/c-c++-common/goacc/kernels-loop-acc-loop.c [new file with mode: 0644]
gcc/testsuite/c-c++-common/goacc/kernels-loop-n-acc-loop.c [new file with mode: 0644]

index f23ad4ed2202994cf21f3741b5f46dc24493d15c..f0a68bfed8d7755c5e9c188ceeee9387e847eabe 100644 (file)
@@ -1,3 +1,14 @@
+2019-09-20  Chung-Lin Tang <cltang@codesourcery.com>
+           Cesar Philippidis  <cesar@codesourcery.com>
+
+       * omp-expand.c (struct omp_region): Add inside_kernels_p field.
+       (expand_omp_for_generic): Adjust to generate a 'sequential' loop
+       when GOMP builtin arguments are BUILT_IN_NONE.
+       (expand_omp_for): Use expand_omp_for_generic to generate a
+       non-parallelized loop for OMP_FORs inside OpenACC kernels regions.
+       (expand_omp): Mark inside_kernels_p field true for regions
+       nested inside OpenACC kernels constructs.
+
 2018-09-20  Cesar Philippidis  <cesar@codesourcery.com>
            Julian Brown  <julian@codesourcery.com>
 
index 89a3a3232671954418ee5b2892891fb6ad2f0bb7..27de3be190d6373c03a14afc3aa029514b93a9ca 100644 (file)
@@ -103,6 +103,9 @@ struct omp_region
   /* The ordered stmt if type is GIMPLE_OMP_ORDERED and it has
      a depend clause.  */
   gomp_ordered *ord_stmt;
+
+  /* True if this is nested inside an OpenACC kernels construct.  */
+  bool inside_kernels_p;
 };
 
 static struct omp_region *root_omp_region;
@@ -2645,6 +2648,7 @@ expand_omp_for_generic (struct omp_region *region,
   gassign *assign_stmt;
   bool in_combined_parallel = is_combined_parallel (region);
   bool broken_loop = region->cont == NULL;
+  bool seq_loop = (start_fn == BUILT_IN_NONE || next_fn == BUILT_IN_NONE);
   edge e, ne;
   tree *counts = NULL;
   int i;
@@ -2766,8 +2770,12 @@ expand_omp_for_generic (struct omp_region *region,
   type = TREE_TYPE (fd->loop.v);
   istart0 = create_tmp_var (fd->iter_type, ".istart0");
   iend0 = create_tmp_var (fd->iter_type, ".iend0");
-  TREE_ADDRESSABLE (istart0) = 1;
-  TREE_ADDRESSABLE (iend0) = 1;
+
+  if (!seq_loop)
+    {
+      TREE_ADDRESSABLE (istart0) = 1;
+      TREE_ADDRESSABLE (iend0) = 1;
+    }
 
   /* See if we need to bias by LLONG_MIN.  */
   if (fd->iter_type == long_long_unsigned_type_node
@@ -2797,7 +2805,25 @@ expand_omp_for_generic (struct omp_region *region,
   gsi_prev (&gsif);
 
   tree arr = NULL_TREE;
-  if (in_combined_parallel)
+  if (seq_loop)
+    {
+      tree n1 = fold_convert (fd->iter_type, fd->loop.n1);
+      tree n2 = fold_convert (fd->iter_type, fd->loop.n2);
+
+      n1 = force_gimple_operand_gsi_1 (&gsi, n1, is_gimple_reg, NULL_TREE, true,
+                                      GSI_SAME_STMT);
+      n2 = force_gimple_operand_gsi_1 (&gsi, n2, is_gimple_reg, NULL_TREE, true,
+                                      GSI_SAME_STMT);
+
+      assign_stmt = gimple_build_assign (istart0, n1);
+      gsi_insert_before (&gsi, assign_stmt, GSI_SAME_STMT);
+
+      assign_stmt = gimple_build_assign (iend0, n2);
+      gsi_insert_before (&gsi, assign_stmt, GSI_SAME_STMT);
+
+      t = fold_build2 (NE_EXPR, boolean_type_node, istart0, iend0);
+    }
+  else if (in_combined_parallel)
     {
       gcc_assert (fd->ordered == 0);
       /* In a combined parallel loop, emit a call to
@@ -3246,48 +3272,55 @@ expand_omp_for_generic (struct omp_region *region,
        collapse_bb = extract_omp_for_update_vars (fd, cont_bb, l1_bb);
 
       /* Emit code to get the next parallel iteration in L2_BB.  */
-      gsi = gsi_start_bb (l2_bb);
+      if (!seq_loop)
+        {
+         gsi = gsi_start_bb (l2_bb);
 
-      t = build_call_expr (builtin_decl_explicit (next_fn), 2,
-                          build_fold_addr_expr (istart0),
-                          build_fold_addr_expr (iend0));
-      t = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE,
-                                   false, GSI_CONTINUE_LINKING);
-      if (TREE_TYPE (t) != boolean_type_node)
-       t = fold_build2 (NE_EXPR, boolean_type_node,
-                        t, build_int_cst (TREE_TYPE (t), 0));
-      gcond *cond_stmt = gimple_build_cond_empty (t);
-      gsi_insert_after (&gsi, cond_stmt, GSI_CONTINUE_LINKING);
+         t = build_call_expr (builtin_decl_explicit (next_fn), 2,
+                              build_fold_addr_expr (istart0),
+                              build_fold_addr_expr (iend0));
+         t = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE,
+                                       false, GSI_CONTINUE_LINKING);
+         if (TREE_TYPE (t) != boolean_type_node)
+           t = fold_build2 (NE_EXPR, boolean_type_node,
+                            t, build_int_cst (TREE_TYPE (t), 0));
+         gcond *cond_stmt = gimple_build_cond_empty (t);
+         gsi_insert_after (&gsi, cond_stmt, GSI_CONTINUE_LINKING);
+       }
     }
 
   /* Add the loop cleanup function.  */
   gsi = gsi_last_nondebug_bb (exit_bb);
-  if (gimple_omp_return_nowait_p (gsi_stmt (gsi)))
-    t = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END_NOWAIT);
-  else if (gimple_omp_return_lhs (gsi_stmt (gsi)))
-    t = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END_CANCEL);
-  else
-    t = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END);
-  gcall *call_stmt = gimple_build_call (t, 0);
-  if (fd->ordered)
+  if (!seq_loop)
     {
-      tree arr = counts[fd->ordered];
-      tree clobber = build_constructor (TREE_TYPE (arr), NULL);
-      TREE_THIS_VOLATILE (clobber) = 1;
-      gsi_insert_after (&gsi, gimple_build_assign (arr, clobber),
-                       GSI_SAME_STMT);
-    }
-  if (gimple_omp_return_lhs (gsi_stmt (gsi)))
-    {
-      gimple_call_set_lhs (call_stmt, gimple_omp_return_lhs (gsi_stmt (gsi)));
-      if (fd->have_reductemp)
+      if (gimple_omp_return_nowait_p (gsi_stmt (gsi)))
+       t = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END_NOWAIT);
+      else if (gimple_omp_return_lhs (gsi_stmt (gsi)))
+       t = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END_CANCEL);
+      else
+       t = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END);
+      gcall *call_stmt = gimple_build_call (t, 0);
+      if (fd->ordered)
        {
-         gimple *g = gimple_build_assign (reductions, NOP_EXPR,
-                                          gimple_call_lhs (call_stmt));
-         gsi_insert_after (&gsi, g, GSI_SAME_STMT);
+         tree arr = counts[fd->ordered];
+         tree clobber = build_constructor (TREE_TYPE (arr), NULL);
+         TREE_THIS_VOLATILE (clobber) = 1;
+         gsi_insert_after (&gsi, gimple_build_assign (arr, clobber),
+                           GSI_SAME_STMT);
        }
+      if (gimple_omp_return_lhs (gsi_stmt (gsi)))
+       {
+         gimple_call_set_lhs (call_stmt,
+                              gimple_omp_return_lhs (gsi_stmt (gsi)));
+         if (fd->have_reductemp)
+           {
+             gimple *g = gimple_build_assign (reductions, NOP_EXPR,
+                                              gimple_call_lhs (call_stmt));
+             gsi_insert_after (&gsi, g, GSI_SAME_STMT);
+           }
+       }
+      gsi_insert_after (&gsi, call_stmt, GSI_SAME_STMT);
     }
-  gsi_insert_after (&gsi, call_stmt, GSI_SAME_STMT);
   gsi_remove (&gsi, true);
 
   /* Connect the new blocks.  */
@@ -3299,7 +3332,8 @@ expand_omp_for_generic (struct omp_region *region,
       gimple_seq phis;
 
       e = find_edge (cont_bb, l3_bb);
-      ne = make_edge (l2_bb, l3_bb, EDGE_FALSE_VALUE);
+      ne = make_edge (l2_bb, l3_bb,
+                     seq_loop ? EDGE_FALLTHRU : EDGE_FALSE_VALUE);
 
       phis = phi_nodes (l3_bb);
       for (gsi = gsi_start (phis); !gsi_end_p (gsi); gsi_next (&gsi))
@@ -3339,7 +3373,8 @@ expand_omp_for_generic (struct omp_region *region,
          e = find_edge (cont_bb, l2_bb);
          e->flags = EDGE_FALLTHRU;
        }
-      make_edge (l2_bb, l0_bb, EDGE_TRUE_VALUE);
+      if (!seq_loop)
+       make_edge (l2_bb, l0_bb, EDGE_TRUE_VALUE);
 
       if (gimple_in_ssa_p (cfun))
        {
@@ -3398,12 +3433,16 @@ expand_omp_for_generic (struct omp_region *region,
 
       add_bb_to_loop (l2_bb, outer_loop);
 
-      /* We've added a new loop around the original loop.  Allocate the
-        corresponding loop struct.  */
-      struct loop *new_loop = alloc_loop ();
-      new_loop->header = l0_bb;
-      new_loop->latch = l2_bb;
-      add_loop (new_loop, outer_loop);
+      struct loop *new_loop = NULL;
+      if (!seq_loop)
+       {
+         /* We've added a new loop around the original loop.  Allocate the
+            corresponding loop struct.  */
+         new_loop = alloc_loop ();
+         new_loop->header = l0_bb;
+         new_loop->latch = l2_bb;
+         add_loop (new_loop, outer_loop);
+       }
 
       /* Allocate a loop structure for the original loop unless we already
         had one.  */
@@ -3413,7 +3452,8 @@ expand_omp_for_generic (struct omp_region *region,
          struct loop *orig_loop = alloc_loop ();
          orig_loop->header = l1_bb;
          /* The loop may have multiple latches.  */
-         add_loop (orig_loop, new_loop);
+         add_loop (orig_loop,
+                   new_loop != NULL ? new_loop : outer_loop);
        }
     }
 }
@@ -5944,7 +5984,10 @@ expand_omp_for (struct omp_region *region, gimple *inner_stmt)
        original loops from being detected.  Fix that up.  */
     loops_state_set (LOOPS_NEED_FIXUP);
 
-  if (gimple_omp_for_kind (fd.for_stmt) & GF_OMP_FOR_SIMD)
+  if (region->inside_kernels_p)
+    expand_omp_for_generic (region, &fd, BUILT_IN_NONE, BUILT_IN_NONE,
+                           NULL_TREE, inner_stmt);
+  else if (gimple_omp_for_kind (fd.for_stmt) & GF_OMP_FOR_SIMD)
     expand_omp_simd (region, &fd);
   else if (gimple_omp_for_kind (fd.for_stmt) == GF_OMP_FOR_KIND_OACC_LOOP)
     {
@@ -8129,7 +8172,19 @@ expand_omp (struct omp_region *region)
       if (region->type == GIMPLE_OMP_PARALLEL)
        determine_parallel_type (region);
       else if (region->type == GIMPLE_OMP_TARGET)
-       grid_expand_target_grid_body (region);
+       {
+         grid_expand_target_grid_body (region);
+
+         if (region->inner)
+           {
+             gomp_target *entry
+               = as_a <gomp_target *> (last_stmt (region->entry));
+             if (region->inside_kernels_p
+                 || (gimple_omp_target_kind (entry)
+                     == GF_OMP_TARGET_KIND_OACC_KERNELS))
+               region->inner->inside_kernels_p = true;
+           }
+       }
 
       if (region->type == GIMPLE_OMP_FOR
          && gimple_omp_for_combined_p (last_stmt (region->entry)))
index 11be8f7ed887cf677544ab944e3afeb5c66a5e72..b7e3a2a814fdf51da8fbf3a0f1cbb275195e2c3e 100644 (file)
@@ -1,3 +1,13 @@
+2019-09-20  Chung-Lin Tang <cltang@codesourcery.com>
+           Cesar Philippidis  <cesar@codesourcery.com>
+
+       * c-c++-common/goacc/kernels-loop-acc-loop.c: New test.
+       * c-c++-common/goacc/kernels-loop-2-acc-loop.c: New test.
+       * c-c++-common/goacc/kernels-loop-3-acc-loop.c: New test.
+       * c-c++-common/goacc/kernels-loop-n-acc-loop.c: New test.
+       * c-c++-common/goacc/kernels-acc-loop-reduction.c: New test.
+       * c-c++-common/goacc/kernels-acc-loop-smaller-equal.c: New test.
+
 2018-10-22  Cesar Philippidis  <cesar@codesourcery.com>
 
        * g++.dg/goacc/loop-1.c: New test.
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-reduction.c b/gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-reduction.c
new file mode 100644 (file)
index 0000000..4824e53
--- /dev/null
@@ -0,0 +1,23 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-fdump-tree-parloops1-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+unsigned int
+foo (int n, unsigned int *a)
+{
+  unsigned int sum = 0;
+
+#pragma acc kernels loop gang reduction(+:sum)
+  for (int i = 0; i < n; i++)
+    sum += a[i];
+
+  return sum;
+}
+
+/* Check that only one loop is analyzed, and that it can be parallelized.  */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops1" } } */
+/* { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 1 "parloops1" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } } */
+
+/* Check that the loop has been split off into a function.  */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*foo.*\\._omp_fn\\.0" 1 "optimized" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-smaller-equal.c b/gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-smaller-equal.c
new file mode 100644 (file)
index 0000000..d70afb0
--- /dev/null
@@ -0,0 +1,23 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-fdump-tree-parloops1-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+unsigned int
+foo (int n)
+{
+  unsigned int sum = 1;
+
+  #pragma acc kernels loop
+  for (int i = 1; i <= n; i++)
+    sum += i;
+
+  return sum;
+}
+
+/* Check that only one loop is analyzed, and that it can be parallelized.  */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops1" } } */
+/* { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 1 "parloops1" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } } */
+
+/* Check that the loop has been split off into a function.  */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*foo.*\\._omp_fn\\.0" 1 "optimized" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-2-acc-loop.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-2-acc-loop.c
new file mode 100644 (file)
index 0000000..7b127cb
--- /dev/null
@@ -0,0 +1,18 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-fdump-tree-parloops1-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+/* Check that loops with '#pragma acc loop' tagged gets properly parallelized.  */
+#define ACC_LOOP
+#include "kernels-loop-2.c"
+
+/* Check that only three loops are analyzed, and that all can be
+   parallelized.  */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 3 "parloops1" } } */
+/* { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 3 "parloops1" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } } */
+
+/* Check that the loop has been split off into a function.  */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.1" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.2" 1 "optimized" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-3-acc-loop.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-3-acc-loop.c
new file mode 100644 (file)
index 0000000..a040e09
--- /dev/null
@@ -0,0 +1,15 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-fdump-tree-parloops1-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+/* Check that loops with '#pragma acc loop' tagged gets properly parallelized.  */
+#define ACC_LOOP
+#include "kernels-loop-3.c"
+
+/* Check that only one loop is analyzed, and that it can be parallelized.  */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops1" } } */
+/* { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 1 "parloops1" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } } */
+
+/* Check that the loop has been split off into a function.  */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-acc-loop.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-acc-loop.c
new file mode 100644 (file)
index 0000000..070a5b5
--- /dev/null
@@ -0,0 +1,15 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-fdump-tree-parloops1-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+/* Check that loops with '#pragma acc loop' tagged gets properly parallelized.  */
+#define ACC_LOOP
+#include "kernels-loop.c"
+
+/* Check that only one loop is analyzed, and that it can be parallelized.  */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops1" } } */
+/* { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 1 "parloops1" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } } */
+
+/* Check that the loop has been split off into a function.  */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-n-acc-loop.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-n-acc-loop.c
new file mode 100644 (file)
index 0000000..1f25e63
--- /dev/null
@@ -0,0 +1,15 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-fdump-tree-parloops1-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+/* Check that loops with '#pragma acc loop' tagged gets properly parallelized.  */
+#define ACC_LOOP
+#include "kernels-loop-n.c"
+
+/* Check that only one loop is analyzed, and that it can be parallelized.  */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops1" } } */
+/* { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 1 "parloops1" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } } */
+
+/* Check that the loop has been split off into a function.  */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*foo.*._omp_fn.0" 1 "optimized" } } */