From: Julian Brown Date: Thu, 21 Mar 2019 22:09:24 +0000 (-0700) Subject: Add support for gang local storage allocation in shared memory X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=0f0f24da61ead3a3aceed8a4253845058294d29a;p=thirdparty%2Fgcc.git Add support for gang local storage allocation in shared memory 2018-12-11 Julian Brown Chung-Lin Tang gcc/ * config/nvptx/nvptx.c (tree-hash-traits.h): Include. (gangprivate_shared_size): New global variable. (gangprivate_shared_align): Likewise. (gangprivate_shared_sym): Likewise. (gangprivate_shared_hmap): Likewise. (nvptx_option_override): Initialize gangprivate_shared_sym, gangprivate_shared_align. (nvptx_file_end): Output gangprivate_shared_sym. (nvptx_goacc_expand_accel_var): New function. (nvptx_set_current_function): New function. (TARGET_SET_CURRENT_FUNCTION): Define hook. (TARGET_GOACC_EXPAND_ACCEL): Likewise. * doc/tm.texi (TARGET_GOACC_EXPAND_ACCEL_VAR): Document new hook. * doc/tm.texi.in (TARGET_GOACC_EXPAND_ACCEL_VAR): Likewise. * expr.c (expand_expr_real_1): Remap decls marked with the "oacc gangprivate" attribute. * omp-low.c (omp_context): Add oacc_partitioning_level and oacc_addressable_var_decls fields. (new_omp_context): Initialize oacc_addressable_var_decls in new omp_context. (delete_omp_context): Delete oacc_addressable_var_decls in old omp_context. (lower_oacc_head_tail): Record partitioning-level count in omp context. (oacc_record_private_var_clauses, oacc_record_vars_in_bind) (mark_oacc_gangprivate): New functions. (lower_omp_for): Call oacc_record_private_var_clauses with "for" clauses. Call mark_oacc_gangprivate for gang-partitioned loops. (lower_omp_target): Call oacc_record_private_var_clauses with "target" clauses. Call mark_oacc_gangprivate for offloaded target regions. (lower_omp_1): Call vars_in_bind for GIMPLE_BIND within OMP regions. * target.def (expand_accel_var): New hook. libgomp/ * testsuite/libgomp.oacc-c-c++-common/gang-private-1.c: New test. * testsuite/libgomp.oacc-c-c++-common/loop-gwv-2.c: New test. * testsuite/libgomp.oacc-c/pr85465.c: New test. * testsuite/libgomp.oacc-fortran/gangprivate-attrib-1.f90: New test.kk (cherry picked from openacc-gcc-9-branch commit 605f776041c41d77bd541cc0732cd26a2939c581) --- diff --git a/gcc/ChangeLog.omp b/gcc/ChangeLog.omp index 0e76faedc3f5..5281ba66d3bd 100644 --- a/gcc/ChangeLog.omp +++ b/gcc/ChangeLog.omp @@ -1,3 +1,39 @@ +2018-12-11 Julian Brown + Chung-Lin Tang + + * config/nvptx/nvptx.c (tree-hash-traits.h): Include. + (gangprivate_shared_size): New global variable. + (gangprivate_shared_align): Likewise. + (gangprivate_shared_sym): Likewise. + (gangprivate_shared_hmap): Likewise. + (nvptx_option_override): Initialize gangprivate_shared_sym, + gangprivate_shared_align. + (nvptx_file_end): Output gangprivate_shared_sym. + (nvptx_goacc_expand_accel_var): New function. + (nvptx_set_current_function): New function. + (TARGET_SET_CURRENT_FUNCTION): Define hook. + (TARGET_GOACC_EXPAND_ACCEL): Likewise. + * doc/tm.texi (TARGET_GOACC_EXPAND_ACCEL_VAR): Document new hook. + * doc/tm.texi.in (TARGET_GOACC_EXPAND_ACCEL_VAR): Likewise. + * expr.c (expand_expr_real_1): Remap decls marked with the + "oacc gangprivate" attribute. + * omp-low.c (omp_context): Add oacc_partitioning_level and + oacc_addressable_var_decls fields. + (new_omp_context): Initialize oacc_addressable_var_decls in new + omp_context. + (delete_omp_context): Delete oacc_addressable_var_decls in old + omp_context. + (lower_oacc_head_tail): Record partitioning-level count in omp context. + (oacc_record_private_var_clauses, oacc_record_vars_in_bind) + (mark_oacc_gangprivate): New functions. + (lower_omp_for): Call oacc_record_private_var_clauses with "for" + clauses. Call mark_oacc_gangprivate for gang-partitioned loops. + (lower_omp_target): Call oacc_record_private_var_clauses with "target" + clauses. + Call mark_oacc_gangprivate for offloaded target regions. + (lower_omp_1): Call vars_in_bind for GIMPLE_BIND within OMP regions. + * target.def (expand_accel_var): New hook. + 2018-09-05 Cesar Philippidis Chung-Lin Tang diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c index 28fa35d74cae..0b4feec2e9ec 100644 --- a/gcc/config/nvptx/nvptx.c +++ b/gcc/config/nvptx/nvptx.c @@ -74,6 +74,7 @@ #include "cfgloop.h" #include "fold-const.h" #include "intl.h" +#include "tree-hash-traits.h" /* This file should be included last. */ #include "target-def.h" @@ -166,6 +167,12 @@ static unsigned vector_red_align; static unsigned vector_red_partition; static GTY(()) rtx vector_red_sym; +/* Shared memory block for gang-private variables. */ +static unsigned gangprivate_shared_size; +static unsigned gangprivate_shared_align; +static GTY(()) rtx gangprivate_shared_sym; +static hash_map gangprivate_shared_hmap; + /* Global lock variable, needed for 128bit worker & gang reductions. */ static GTY(()) tree global_lock_var; @@ -247,6 +254,10 @@ nvptx_option_override (void) vector_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT; vector_red_partition = 0; + gangprivate_shared_sym = gen_rtx_SYMBOL_REF (Pmode, "__gangprivate_shared"); + SET_SYMBOL_DATA_AREA (gangprivate_shared_sym, DATA_AREA_SHARED); + gangprivate_shared_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT; + diagnose_openacc_conflict (TARGET_GOMP, "-mgomp"); diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack"); diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt"); @@ -5286,6 +5297,10 @@ nvptx_file_end (void) write_shared_buffer (asm_out_file, vector_red_sym, vector_red_align, vector_red_size); + if (gangprivate_shared_size) + write_shared_buffer (asm_out_file, gangprivate_shared_sym, + gangprivate_shared_align, gangprivate_shared_size); + if (need_softstack_decl) { write_var_marker (asm_out_file, false, true, "__nvptx_stacks"); @@ -6618,6 +6633,38 @@ nvptx_can_change_mode_class (machine_mode, machine_mode, reg_class_t) return false; } +/* Implement TARGET_GOACC_EXPAND_ACCEL_VAR. Place "oacc gangprivate" + variables in shared memory. */ + +static rtx +nvptx_goacc_expand_accel_var (tree var) +{ + if (VAR_P (var) + && lookup_attribute ("oacc gangprivate", DECL_ATTRIBUTES (var))) + { + unsigned int offset, *poffset; + poffset = gangprivate_shared_hmap.get (var); + if (poffset) + offset = *poffset; + else + { + unsigned HOST_WIDE_INT align = DECL_ALIGN (var); + gangprivate_shared_size + = (gangprivate_shared_size + align - 1) & ~(align - 1); + if (gangprivate_shared_align < align) + gangprivate_shared_align = align; + + offset = gangprivate_shared_size; + bool existed = gangprivate_shared_hmap.put (var, offset); + gcc_assert (!existed); + gangprivate_shared_size += tree_to_uhwi (DECL_SIZE_UNIT (var)); + } + rtx addr = plus_constant (Pmode, gangprivate_shared_sym, offset); + return gen_rtx_MEM (TYPE_MODE (TREE_TYPE (var)), addr); + } + return NULL_RTX; +} + static GTY(()) tree nvptx_previous_fndecl; static void @@ -6626,6 +6673,7 @@ nvptx_set_current_function (tree fndecl) if (!fndecl || fndecl == nvptx_previous_fndecl) return; + gangprivate_shared_hmap.empty (); nvptx_previous_fndecl = fndecl; vector_red_partition = 0; oacc_bcast_partition = 0; @@ -6767,6 +6815,9 @@ nvptx_set_current_function (tree fndecl) #undef TARGET_HAVE_SPECULATION_SAFE_VALUE #define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed +#undef TARGET_GOACC_EXPAND_ACCEL_VAR +#define TARGET_GOACC_EXPAND_ACCEL_VAR nvptx_goacc_expand_accel_var + #undef TARGET_SET_CURRENT_FUNCTION #define TARGET_SET_CURRENT_FUNCTION nvptx_set_current_function diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index 8c8978bb13a7..9f6bf8d190c0 100644 --- a/gcc/doc/tm.texi +++ b/gcc/doc/tm.texi @@ -6154,6 +6154,14 @@ like @code{cond_add@var{m}}. The default implementation returns a zero constant of type @var{type}. @end deftypefn +@deftypefn {Target Hook} rtx TARGET_GOACC_EXPAND_ACCEL_VAR (tree @var{var}) +This hook, if defined, is used by accelerator target back-ends to expand +specially handled kinds of VAR_DECL expressions. A particular use is to +place variables with specific attributes inside special accelarator +memories. A return value of NULL indicates that the target does not +handle this VAR_DECL, and normal RTL expanding is resumed. +@end deftypefn + @node Anchored Addresses @section Anchored Addresses @cindex anchored addresses diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in index fe1194ef91ae..a3ec9702ac8e 100644 --- a/gcc/doc/tm.texi.in +++ b/gcc/doc/tm.texi.in @@ -4208,6 +4208,8 @@ address; but often a machine-dependent strategy can generate better code. @hook TARGET_PREFERRED_ELSE_VALUE +@hook TARGET_GOACC_EXPAND_ACCEL_VAR + @node Anchored Addresses @section Anchored Addresses @cindex anchored addresses diff --git a/gcc/expr.c b/gcc/expr.c index fa15b7eceae9..50e90e016c1b 100644 --- a/gcc/expr.c +++ b/gcc/expr.c @@ -9963,8 +9963,19 @@ expand_expr_real_1 (tree exp, rtx target, machine_mode tmode, exp = SSA_NAME_VAR (ssa_name); goto expand_decl_rtl; - case PARM_DECL: case VAR_DECL: + /* Allow accel compiler to handle specific cases of variables, + specifically those tagged with the "oacc gangprivate" attribute, + which may be intended to be placed in special memory in GPUs. */ + if (flag_openacc && targetm.goacc.expand_accel_var) + { + temp = targetm.goacc.expand_accel_var (exp); + if (temp) + return temp; + } + /* ... fall through ... */ + + case PARM_DECL: /* If a static var's type was incomplete when the decl was written, but the type is complete now, lay out the decl now. */ if (DECL_SIZE (exp) == 0 diff --git a/gcc/omp-low.c b/gcc/omp-low.c index 171abd54745a..dc95d01e51a4 100644 --- a/gcc/omp-low.c +++ b/gcc/omp-low.c @@ -137,6 +137,12 @@ struct omp_context /* Hash map of dynamic arrays in this context. */ hash_map *dynamic_arrays; + + /* The number of levels of OpenACC partitioning invoked in this context. */ + unsigned oacc_partitioning_levels; + + /* Addressable variable decls in this context. */ + vec *oacc_addressable_var_decls; }; static splay_tree all_contexts; @@ -1015,6 +1021,7 @@ new_omp_context (gimple *stmt, omp_context *outer_ctx) ctx->cb.decl_map = new hash_map; ctx->dynamic_arrays = new hash_map; + ctx->oacc_addressable_var_decls = new vec (); return ctx; } @@ -1097,6 +1104,7 @@ delete_omp_context (splay_tree_value value) } delete ctx->dynamic_arrays; + delete ctx->oacc_addressable_var_decls; XDELETE (ctx); } @@ -6823,6 +6831,9 @@ lower_oacc_head_tail (location_t loc, tree clauses, tree join_kind = build_int_cst (unsigned_type_node, IFN_UNIQUE_OACC_JOIN); gcc_assert (count); + + ctx->oacc_partitioning_levels = count; + for (unsigned done = 1; count; count--, done++) { gimple_seq fork_seq = NULL; @@ -8507,6 +8518,68 @@ lower_omp_for_lastprivate (struct omp_for_data *fd, gimple_seq *body_p, } } +/* Record vars listed in private clauses in CLAUSES in CTX. This information + is used to mark up variables that should be made private per-gang. */ + +static void +oacc_record_private_var_clauses (omp_context *ctx, tree clauses) +{ + tree c; + + if (!ctx) + return; + + for (c = clauses; c; c = OMP_CLAUSE_CHAIN (c)) + if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_PRIVATE) + { + tree decl = OMP_CLAUSE_DECL (c); + if (VAR_P (decl) && TREE_ADDRESSABLE (decl)) + ctx->oacc_addressable_var_decls->safe_push (decl); + } +} + +/* Record addressable vars declared in BINDVARS in CTX. This information is + used to mark up variables that should be made private per-gang. */ + +static void +oacc_record_vars_in_bind (omp_context *ctx, tree bindvars) +{ + if (!ctx) + return; + + for (tree v = bindvars; v; v = DECL_CHAIN (v)) + if (VAR_P (v) && TREE_ADDRESSABLE (v)) + ctx->oacc_addressable_var_decls->safe_push (v); +} + +/* Mark addressable variables which are declared implicitly or explicitly as + gang private with a special attribute. These may need to have their + declarations altered later on in compilation (e.g. in + execute_oacc_device_lower or the backend, depending on how the OpenACC + execution model is implemented on a given target) to ensure that sharing + semantics are correct. */ + +static void +mark_oacc_gangprivate (vec *decls) +{ + int i; + tree decl; + + FOR_EACH_VEC_ELT (*decls, i, decl) + if (!lookup_attribute ("oacc gangprivate", DECL_ATTRIBUTES (decl))) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, + "Setting 'oacc gangprivate' attribute for decl:"); + print_generic_decl (dump_file, decl, TDF_SLIM); + fputc ('\n', dump_file); + } + DECL_ATTRIBUTES (decl) + = tree_cons (get_identifier ("oacc gangprivate"), + NULL, DECL_ATTRIBUTES (decl)); + } +} /* Lower code for an OMP loop directive. */ @@ -8524,6 +8597,8 @@ lower_omp_for (gimple_stmt_iterator *gsi_p, omp_context *ctx) push_gimplify_context (); + oacc_record_private_var_clauses (ctx, gimple_omp_for_clauses (stmt)); + lower_omp (gimple_omp_for_pre_body_ptr (stmt), ctx); block = make_node (BLOCK); @@ -8679,7 +8754,20 @@ lower_omp_for (gimple_stmt_iterator *gsi_p, omp_context *ctx) /* Add OpenACC partitioning and reduction markers just before the loop. */ if (oacc_head) - gimple_seq_add_seq (&body, oacc_head); + { + gimple_seq_add_seq (&body, oacc_head); + + unsigned level_total = 0; + omp_context *thisctx; + + for (thisctx = ctx; thisctx; thisctx = thisctx->outer) + level_total += thisctx->oacc_partitioning_levels; + + /* If the current context and parent contexts are distributed over a + total of one parallelism level, we have gang partitioning. */ + if (level_total == 1) + mark_oacc_gangprivate (ctx->oacc_addressable_var_decls); + } lower_omp_for_lastprivate (&fd, &body, &dlist, ctx); @@ -9540,6 +9628,8 @@ lower_omp_target (gimple_stmt_iterator *gsi_p, omp_context *ctx) clauses = gimple_omp_target_clauses (stmt); + oacc_record_private_var_clauses (ctx, clauses); + gimple_seq dep_ilist = NULL; gimple_seq dep_olist = NULL; if (omp_find_clause (clauses, OMP_CLAUSE_DEPEND)) @@ -9833,6 +9923,8 @@ lower_omp_target (gimple_stmt_iterator *gsi_p, omp_context *ctx) if (offloaded) { + mark_oacc_gangprivate (ctx->oacc_addressable_var_decls); + /* Declare all the variables created by mapping and the variables declared in the scope of the target body. */ record_vars_into (ctx->block_vars, child_fn); @@ -11075,6 +11167,7 @@ lower_omp_1 (gimple_stmt_iterator *gsi_p, omp_context *ctx) ctx); break; case GIMPLE_BIND: + oacc_record_vars_in_bind (ctx, gimple_bind_vars (as_a (stmt))); lower_omp (gimple_bind_body_ptr (as_a (stmt)), ctx); maybe_remove_omp_member_access_dummy_vars (as_a (stmt)); break; diff --git a/gcc/target.def b/gcc/target.def index 66cee075018b..294af6cb1d67 100644 --- a/gcc/target.def +++ b/gcc/target.def @@ -1719,6 +1719,16 @@ for allocating any storage for reductions when necessary.", void, (gcall *call), default_goacc_reduction) +DEFHOOK +(expand_accel_var, +"This hook, if defined, is used by accelerator target back-ends to expand\n\ +specially handled kinds of VAR_DECL expressions. A particular use is to\n\ +place variables with specific attributes inside special accelarator\n\ +memories. A return value of NULL indicates that the target does not\n\ +handle this VAR_DECL, and normal RTL expanding is resumed.", +rtx, (tree var), +NULL) + HOOK_VECTOR_END (goacc) /* Functions relating to vectorization. */ diff --git a/libgomp/ChangeLog.omp b/libgomp/ChangeLog.omp index 1c76fc01018e..f338eb1cf483 100644 --- a/libgomp/ChangeLog.omp +++ b/libgomp/ChangeLog.omp @@ -1,3 +1,11 @@ +2018-12-11 Julian Brown + Chung-Lin Tang + + * testsuite/libgomp.oacc-c-c++-common/gang-private-1.c: New test. + * testsuite/libgomp.oacc-c-c++-common/loop-gwv-2.c: New test. + * testsuite/libgomp.oacc-c/pr85465.c: New test. + * testsuite/libgomp.oacc-fortran/gangprivate-attrib-1.f90: New test.kk + 2019-03-19 Julian Brown * testsuite/libgomp.oacc-c-c++-common/lib-93.c: Adjust target selector. diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/gang-private-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/gang-private-1.c new file mode 100644 index 000000000000..f378346ed0a9 --- /dev/null +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/gang-private-1.c @@ -0,0 +1,38 @@ +#include + +int main (void) +{ + int ret; + + #pragma acc parallel num_gangs(1) num_workers(32) copyout(ret) + { + int w = 0; + + #pragma acc loop worker + for (int i = 0; i < 32; i++) + { + #pragma acc atomic update + w++; + } + + ret = (w == 32); + } + assert (ret); + + #pragma acc parallel num_gangs(1) vector_length(32) copyout(ret) + { + int v = 0; + + #pragma acc loop vector + for (int i = 0; i < 32; i++) + { + #pragma acc atomic update + v++; + } + + ret = (v == 32); + } + assert (ret); + + return 0; +} diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-2.c new file mode 100644 index 000000000000..a4f81a39e242 --- /dev/null +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-2.c @@ -0,0 +1,95 @@ +#include +#include +#include +#include +#include +#include + +#if 0 +#define DEBUG(DIM, IDX, VAL) \ + fprintf (stderr, "%sdist[%d] = %d\n", (DIM), (IDX), (VAL)) +#else +#define DEBUG(DIM, IDX, VAL) +#endif + +#define N (32*32*32) + +int +check (const char *dim, int *dist, int dimsize) +{ + int ix; + int exit = 0; + + for (ix = 0; ix < dimsize; ix++) + { + DEBUG(dim, ix, dist[ix]); + if (dist[ix] < (N) / (dimsize + 0.5) + || dist[ix] > (N) / (dimsize - 0.5)) + { + fprintf (stderr, "did not distribute to %ss (%d not between %d " + "and %d)\n", dim, dist[ix], (int) ((N) / (dimsize + 0.5)), + (int) ((N) / (dimsize - 0.5))); + exit |= 1; + } + } + + return exit; +} + +int main () +{ + int ary[N]; + int ix; + int exit = 0; + int gangsize = 0, workersize = 0, vectorsize = 0; + int *gangdist, *workerdist, *vectordist; + + for (ix = 0; ix < N;ix++) + ary[ix] = -1; + +#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \ + copy(ary) copyout(gangsize, workersize, vectorsize) + { +#pragma acc loop gang worker vector + for (unsigned ix = 0; ix < N; ix++) + { + int g, w, v; + + g = __builtin_goacc_parlevel_id (GOMP_DIM_GANG); + w = __builtin_goacc_parlevel_id (GOMP_DIM_WORKER); + v = __builtin_goacc_parlevel_id (GOMP_DIM_VECTOR); + + ary[ix] = (g << 16) | (w << 8) | v; + } + + gangsize = __builtin_goacc_parlevel_size (GOMP_DIM_GANG); + workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER); + vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR); + } + + gangdist = (int *) alloca (gangsize * sizeof (int)); + workerdist = (int *) alloca (workersize * sizeof (int)); + vectordist = (int *) alloca (vectorsize * sizeof (int)); + memset (gangdist, 0, gangsize * sizeof (int)); + memset (workerdist, 0, workersize * sizeof (int)); + memset (vectordist, 0, vectorsize * sizeof (int)); + + /* Test that work is shared approximately equally amongst each active + gang/worker/vector. */ + for (ix = 0; ix < N; ix++) + { + int g = (ary[ix] >> 16) & 255; + int w = (ary[ix] >> 8) & 255; + int v = ary[ix] & 255; + + gangdist[g]++; + workerdist[w]++; + vectordist[v]++; + } + + exit = check ("gang", gangdist, gangsize); + exit |= check ("worker", workerdist, workersize); + exit |= check ("vector", vectordist, vectorsize); + + return exit; +} diff --git a/libgomp/testsuite/libgomp.oacc-c/pr85465.c b/libgomp/testsuite/libgomp.oacc-c/pr85465.c new file mode 100644 index 000000000000..329e8a09cf9a --- /dev/null +++ b/libgomp/testsuite/libgomp.oacc-c/pr85465.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-w" } */ + +int +main (void) +{ +#pragma acc parallel + foo (); + + return 0; +} diff --git a/libgomp/testsuite/libgomp.oacc-fortran/gangprivate-attrib-1.f90 b/libgomp/testsuite/libgomp.oacc-fortran/gangprivate-attrib-1.f90 new file mode 100644 index 000000000000..9158b6f4768b --- /dev/null +++ b/libgomp/testsuite/libgomp.oacc-fortran/gangprivate-attrib-1.f90 @@ -0,0 +1,25 @@ +! Test for "oacc gangprivate" attribute on gang-private variables + +! { dg-do run } +! { dg-additional-options "-fdump-tree-omplower-details" } +! { dg-final { scan-tree-dump-times "Setting 'oacc gangprivate' attribute for decl: integer\\(kind=4\\) w;" 1 "omplower" } } */ + +program main + integer :: w, arr(0:31) + + !$acc parallel num_gangs(32) num_workers(32) copyout(arr) ! { dg-warning "region is worker partitioned" } + !$acc loop gang private(w) + do j = 0, 31 + w = 0 + !$acc loop seq + do i = 0, 31 + !$acc atomic update + w = w + 1 + !$acc end atomic + end do + arr(j) = w + end do + !$acc end parallel + + if (any (arr .ne. 32)) stop 1 +end program main