tree offset = gimple_call_arg (call, 5);
if (array_p)
{
+ tree copy_src = !integer_zerop (ref_to_res) ? ref_to_res : array_addr;
tree decl = gcn_goacc_get_worker_array_reduction_buffer
(array_type, array_max_idx, &seq);
tree ptr = make_ssa_name (TREE_TYPE (array_addr));
gimplify_assign (ptr, build_fold_addr_expr (decl), &seq);
/* Store incoming value to worker reduction buffer. */
- oacc_build_array_copy (ptr, array_addr, array_max_idx, &seq);
+ oacc_build_array_copy (ptr, copy_src, array_max_idx, &seq);
}
else
{
if (TREE_CONSTANT (size_expr))
size = TREE_INT_CST_LOW (size_expr);
+ /* Default size for unknown size expression. */
+ if (size == 0)
+ size = 256;
+
if (vector)
{
offload_attrs oa;
populate_offload_attrs (&oa);
- /* Default size for unknown size expression. */
- if (size == 0)
- size = 256;
-
unsigned int psize = ROUND_UP (size + offset, align);
unsigned int pnum = nvptx_mach_max_workers ();
vector_red_partition = MAX (vector_red_partition, psize);
NVPTX_BUILTIN_BAR_RED_AND,
NVPTX_BUILTIN_BAR_RED_OR,
NVPTX_BUILTIN_BAR_RED_POPC,
+ NVPTX_BUILTIN_BAR_WARPSYNC,
NVPTX_BUILTIN_BREV,
NVPTX_BUILTIN_BREVLL,
NVPTX_BUILTIN_COND_UNI,
DEF (BAR_RED_POPC, "bar_red_popc",
(UINT, UINT, UINT, UINT, UINT, NULL_TREE));
+ DEF (BAR_WARPSYNC, "bar_warpsync", (VOID, VOID, NULL_TREE));
+
DEF (BREV, "brev", (UINT, UINT, NULL_TREE));
DEF (BREVLL, "brevll", (LLUINT, LLUINT, NULL_TREE));
case NVPTX_BUILTIN_BAR_RED_POPC:
return nvptx_expand_bar_red (exp, target, mode, ignore);
+ case NVPTX_BUILTIN_BAR_WARPSYNC:
+ emit_insn (gen_nvptx_warpsync ());
+ return NULL_RTX;
+
case NVPTX_BUILTIN_BREV:
case NVPTX_BUILTIN_BREVLL:
return nvptx_expand_brev (exp, target, mode, ignore);
push_gimplify_context (true);
+ /* Copy the receiver object. */
+ tree ref_to_res = gimple_call_arg (call, 1);
+
if (level != GOMP_DIM_GANG)
{
- /* Copy the receiver object. */
- tree ref_to_res = gimple_call_arg (call, 1);
-
if (!integer_zerop (ref_to_res))
{
if (!array_p)
tree call, ptr;
if (array_p)
{
+ tree copy_src = !integer_zerop (ref_to_res) ? ref_to_res : array_addr;
tree array_elem_type = TREE_TYPE (array_type);
call = nvptx_get_shared_red_addr (array_elem_type, array_max_idx,
offset, level == GOMP_DIM_VECTOR);
ptr = make_ssa_name (TREE_TYPE (call));
gimplify_assign (ptr, call, &seq);
oacc_build_array_copy (fold_convert (TREE_TYPE (array_addr), ptr),
- array_addr, array_max_idx, &seq);
+ copy_src, array_max_idx, &seq);
}
else
{
else
r = nvptx_reduction_update (gimple_location (call), &gsi,
accum, var, op, level);
+
+ if (TARGET_SM70 && level == GOMP_DIM_VECTOR)
+ {
+ /* After SM70, with Independent Thread Scheduling introduced,
+ place a warpsync after vector-mode update of accum buffer. */
+ tree fn = nvptx_builtin_decl (NVPTX_BUILTIN_BAR_WARPSYNC, true);
+ gimple_seq_add_stmt (&seq, gimple_build_call (fn, 0));
+ }
}
}
if (a[i] != o[i])
__builtin_abort ();
+ #pragma acc parallel
+ #pragma acc loop gang reduction(+:a[1:2])
+ ARRAY_BODY (a, 1, 2)
+ ARRAY_BODY (o, 1, 2)
+ for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+ if (a[i] != o[i])
+ __builtin_abort ();
+
#pragma acc parallel copy(a[3:2])
#pragma acc loop reduction(+:a[3:2])
ARRAY_BODY (a, 3, 2)
if (a[i] != o[i])
__builtin_abort ();
+ #pragma acc parallel copy(a[3:2])
+ #pragma acc loop worker reduction(+:a[3:2])
+ ARRAY_BODY (a, 3, 2)
+ ARRAY_BODY (o, 3, 2)
+ for (int i = 0; i < 6; i++)
+ if (a[i] != o[i])
+ __builtin_abort ();
+
#pragma acc parallel copy(a)
#pragma acc loop reduction(+:a[0:5])
ARRAY_BODY (a, 0, 5)
if (a[i] != o[i])
__builtin_abort ();
+ #pragma acc parallel copy(a)
+ #pragma acc loop vector reduction(+:a[0:5])
+ ARRAY_BODY (a, 0, 5)
+ ARRAY_BODY (o, 0, 5)
+ for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+ if (a[i] != o[i])
+ __builtin_abort ();
+
#pragma acc parallel
#pragma acc loop reduction(+:a)
ARRAY_BODY (a, 4, 1)
if (a[i] != o[i])
__builtin_abort ();
+ #pragma acc parallel
+ #pragma acc loop gang reduction(+:a[one:2])
+ ARRAY_BODY (a, one, 2)
+ ARRAY_BODY (o, one, 2)
+ for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+ if (a[i] != o[i])
+ __builtin_abort ();
+
#pragma acc parallel copy(a[three:2])
#pragma acc loop reduction(+:a[three:2])
ARRAY_BODY (a, three, 2)
if (a[i] != o[i])
__builtin_abort ();
+ #pragma acc parallel copy(a[three:2])
+ #pragma acc loop worker reduction(+:a[three:2])
+ ARRAY_BODY (a, three, 2)
+ ARRAY_BODY (o, three, 2)
+ for (int i = 0; i < 6; i++)
+ if (a[i] != o[i])
+ __builtin_abort ();
+
#pragma acc parallel copy(a)
#pragma acc loop reduction(+:a[zero:5])
ARRAY_BODY (a, zero, 5)
if (a[i] != o[i])
__builtin_abort ();
+ #pragma acc parallel copy(a)
+ #pragma acc loop vector reduction(+:a[zero:5])
+ ARRAY_BODY (a, zero, 5)
+ ARRAY_BODY (o, zero, 5)
+ for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+ if (a[i] != o[i])
+ __builtin_abort ();
+
#pragma acc parallel
#pragma acc loop reduction(+:a)
ARRAY_BODY (a, four, 1)
if (a[i] != o[i])
__builtin_abort ();
+ #pragma acc parallel
+ #pragma acc loop gang reduction(+:a[one:two])
+ ARRAY_BODY (a, one, two)
+ ARRAY_BODY (o, one, two)
+ for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+ if (a[i] != o[i])
+ __builtin_abort ();
+
#pragma acc parallel copy(a[three:two])
#pragma acc loop reduction(+:a[three:two])
ARRAY_BODY (a, three, two)
if (a[i] != o[i])
__builtin_abort ();
+ #pragma acc parallel copy(a[three:two])
+ #pragma acc loop worker reduction(+:a[three:two])
+ ARRAY_BODY (a, three, two)
+ ARRAY_BODY (o, three, two)
+ for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+ if (a[i] != o[i])
+ __builtin_abort ();
+
#pragma acc parallel copy(a)
#pragma acc loop reduction(+:a[zero:five])
ARRAY_BODY (a, zero, five)
if (a[i] != o[i])
__builtin_abort ();
+ #pragma acc parallel copy(a)
+ #pragma acc loop vector reduction(+:a[zero:five])
+ ARRAY_BODY (a, zero, five)
+ ARRAY_BODY (o, zero, five)
+ for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+ if (a[i] != o[i])
+ __builtin_abort ();
+
#pragma acc parallel
#pragma acc loop reduction(+:a)
ARRAY_BODY (a, four, one)
if (a[i] != o[i])
__builtin_abort ();
+ #pragma acc parallel
+ #pragma acc loop gang reduction(+:a[one:two])
+ ARRAY_BODY (a, one, two)
+ ARRAY_BODY (o, one, two)
+ for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+ if (a[i] != o[i])
+ __builtin_abort ();
+
#pragma acc parallel copy(a[three:two])
#pragma acc loop reduction(+:a[three:two])
ARRAY_BODY (a, three, two)
if (a[i] != o[i])
__builtin_abort ();
+ #pragma acc parallel copy(a[three:two])
+ #pragma acc loop worker reduction(+:a[three:two])
+ ARRAY_BODY (a, three, two)
+ ARRAY_BODY (o, three, two)
+ for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+ if (a[i] != o[i])
+ __builtin_abort ();
+
#pragma acc parallel copy(a)
#pragma acc loop reduction(+:a[zero:five])
ARRAY_BODY (a, zero, five)
if (a[i] != o[i])
__builtin_abort ();
+ #pragma acc parallel copy(a)
+ #pragma acc loop vector reduction(+:a[zero:five])
+ ARRAY_BODY (a, zero, five)
+ ARRAY_BODY (o, zero, five)
+ for (int i = 0; i < sizeof (a) / sizeof (int); i++)
+ if (a[i] != o[i])
+ __builtin_abort ();
+
#pragma acc parallel
#pragma acc loop reduction(+:a)
ARRAY_BODY (a, four, one)