ass = gimple_build_assign (chunk_no, expr);
gsi_insert_before (&gsi, ass, GSI_SAME_STMT);
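+ /* The extra trailing argument is the "noalias" flag read by the
+ IFN_GOACC_LOOP lowering below; integer_one_node here means the loop is
+ assumed free of aliasing and may be distributed across all threads.  */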
- call = gimple_build_call_internal (IFN_GOACC_LOOP, 6,
+ call = gimple_build_call_internal (IFN_GOACC_LOOP, 7,
build_int_cst (integer_type_node,
IFN_GOACC_LOOP_CHUNKS),
- dir, range, s, chunk_size, gwv);
+ dir, range, s, chunk_size, gwv,
+ integer_one_node);
gimple_call_set_lhs (call, chunk_max);
gimple_set_location (call, loc);
gsi_insert_before (&gsi, call, GSI_SAME_STMT);
else
chunk_size = chunk_no;
- call = gimple_build_call_internal (IFN_GOACC_LOOP, 6,
+ call = gimple_build_call_internal (IFN_GOACC_LOOP, 7,
build_int_cst (integer_type_node,
IFN_GOACC_LOOP_STEP),
- dir, range, s, chunk_size, gwv);
+ dir, range, s, chunk_size, gwv,
+ integer_one_node);
gimple_call_set_lhs (call, step);
gimple_set_location (call, loc);
gsi_insert_before (&gsi, call, GSI_SAME_STMT);
/* Loop offset & bound go into head_bb. */
gsi = gsi_start_bb (head_bb);
- call = gimple_build_call_internal (IFN_GOACC_LOOP, 7,
+ call = gimple_build_call_internal (IFN_GOACC_LOOP, 8,
build_int_cst (integer_type_node,
IFN_GOACC_LOOP_OFFSET),
- dir, range, s,
- chunk_size, gwv, chunk_no);
+ dir, range, s, chunk_size, gwv, chunk_no,
+ integer_one_node);
gimple_call_set_lhs (call, offset_init);
gimple_set_location (call, loc);
gsi_insert_after (&gsi, call, GSI_CONTINUE_LINKING);
- call = gimple_build_call_internal (IFN_GOACC_LOOP, 7,
+ call = gimple_build_call_internal (IFN_GOACC_LOOP, 8,
build_int_cst (integer_type_node,
IFN_GOACC_LOOP_BOUND),
- dir, range, s,
- chunk_size, gwv, offset_init);
+ dir, range, s, chunk_size, gwv,
+ offset_init, integer_one_node);
gimple_call_set_lhs (call, bound);
gimple_set_location (call, loc);
gsi_insert_after (&gsi, call, GSI_CONTINUE_LINKING);
tree chunk = build_int_cst (diff_type, 0); /* Never chunked. */
t = build_int_cst (integer_type_node, IFN_GOACC_LOOP_OFFSET);
- call = gimple_build_call_internal (IFN_GOACC_LOOP, 7, t, dir, e_range,
- element_s, chunk, e_gwv, chunk);
+ call = gimple_build_call_internal (IFN_GOACC_LOOP, 8, t, dir, e_range,
+ element_s, chunk, e_gwv, chunk,
+ integer_one_node);
gimple_call_set_lhs (call, e_offset);
gimple_set_location (call, loc);
gsi_insert_before (&gsi, call, GSI_SAME_STMT);
t = build_int_cst (integer_type_node, IFN_GOACC_LOOP_BOUND);
- call = gimple_build_call_internal (IFN_GOACC_LOOP, 7, t, dir, e_range,
- element_s, chunk, e_gwv, e_offset);
+ call = gimple_build_call_internal (IFN_GOACC_LOOP, 8, t, dir, e_range,
+ element_s, chunk, e_gwv, e_offset,
+ integer_one_node);
gimple_call_set_lhs (call, e_bound);
gimple_set_location (call, loc);
gsi_insert_before (&gsi, call, GSI_SAME_STMT);
t = build_int_cst (integer_type_node, IFN_GOACC_LOOP_STEP);
- call = gimple_build_call_internal (IFN_GOACC_LOOP, 6, t, dir, e_range,
- element_s, chunk, e_gwv);
+ call = gimple_build_call_internal (IFN_GOACC_LOOP, 7, t, dir, e_range,
+ element_s, chunk, e_gwv,
+ integer_one_node);
gimple_call_set_lhs (call, e_step);
gimple_set_location (call, loc);
gsi_insert_before (&gsi, call, GSI_SAME_STMT);
unsigned outer_mask = mask & (~mask + 1); // Outermost partitioning
unsigned inner_mask = mask & ~outer_mask; // Inner partitioning (if any)
tree vf_by_vectorizer = NULL_TREE;
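+ /* Last IFN_GOACC_LOOP argument; zero means the loop may access aliased
+ storage, in which case the expansion below confines execution to a
+ single thread.  */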
+ tree noalias = NULL_TREE;
/* Skip lowering if return value of IFN_GOACC_LOOP call is not used. */
if (!lhs)
switch (code)
{
- default: gcc_unreachable ();
+ default:
+ gcc_unreachable ();
case IFN_GOACC_LOOP_CHUNKS:
+ noalias = gimple_call_arg (call, 6);
if (!chunking)
- r = build_int_cst (type, 1);
+ r = build_int_cst (type, 1);
else
- {
- /* chunk_max
- = (range - dir) / (chunks * step * num_threads) + dir */
- tree per = oacc_thread_numbers (false, mask, &seq);
- per = fold_convert (type, per);
- chunk_size = fold_convert (type, chunk_size);
- per = fold_build2 (MULT_EXPR, type, per, chunk_size);
- per = fold_build2 (MULT_EXPR, type, per, step);
- r = fold_build2 (MINUS_EXPR, type, range, dir);
- r = fold_build2 (PLUS_EXPR, type, r, per);
- r = build2 (TRUNC_DIV_EXPR, type, r, per);
- }
+ {
+ /* chunk_max
+ = (range - dir) / (chunks * step * num_threads) + dir */
+ tree per = oacc_thread_numbers (false, mask, &seq);
+ per = fold_convert (type, per);
+ noalias = fold_convert (type, noalias);
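+ /* If aliasing may prevent parallelization (noalias == 0), clamp the
+ effective thread count to one when computing chunk_max.  */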
+ per = fold_build2 (MULT_EXPR, type, per, noalias);
+ per = fold_build2 (MAX_EXPR, type, per,
+ build_int_cst (type, 1));
+ chunk_size = fold_convert (type, chunk_size);
+ per = fold_build2 (MULT_EXPR, type, per, chunk_size);
+ per = fold_build2 (MULT_EXPR, type, per, step);
+ r = fold_build2 (MINUS_EXPR, type, range, dir);
+ r = fold_build2 (PLUS_EXPR, type, r, per);
+ r = build2 (TRUNC_DIV_EXPR, type, r, per);
+ }
break;
case IFN_GOACC_LOOP_STEP:
+ noalias = gimple_call_arg (call, 6);
{
- if (vf_by_vectorizer)
- r = step;
- else
- {
- /* If striding, step by the entire compute volume, otherwise
- step by the inner volume. */
- unsigned volume = striding ? mask : inner_mask;
-
- r = oacc_thread_numbers (false, volume, &seq);
- r = build2 (MULT_EXPR, type, fold_convert (type, r), step);
- }
+ if (vf_by_vectorizer)
+ r = step;
+ else
+ {
+ /* If striding, step by the entire compute volume, otherwise
+ step by the inner volume. */
+ unsigned volume = striding ? mask : inner_mask;
+
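+ /* Scale the step by the thread count, but clamp it to a single
+ thread's step when aliasing may prevent parallelization
+ (noalias == 0).  */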
+ noalias = fold_convert (type, noalias);
+ r = oacc_thread_numbers (false, volume, &seq);
+ r = fold_convert (type, r);
+ r = build2 (MULT_EXPR, type, r, noalias);
+ r = build2 (MAX_EXPR, type, r, build_int_cst (type, 1));
+ r = build2 (MULT_EXPR, type, r, step);
+ }
+ break;
}
- break;
-
- case IFN_GOACC_LOOP_OFFSET:
- if (vf_by_vectorizer)
- {
- /* If not -fno-tree-loop-vectorize, hint that we want to vectorize
- the loop. */
- if (flag_tree_loop_vectorize
- || !global_options_set.x_flag_tree_loop_vectorize)
- {
- /* Enable vectorization on non-SIMT targets. */
- basic_block bb = gsi_bb (gsi);
- class loop *chunk_loop = bb->loop_father;
- class loop *inner_loop = chunk_loop->inner;
-
- /* Chunking isn't supported for VF_BY_VECTORIZER loops yet,
- so we know that the outer chunking loop will be executed just
- once and the inner loop is the one which must be
- vectorized (unless it has been optimized out for some
- reason). */
- gcc_assert (!chunking);
-
- if (inner_loop)
- {
- inner_loop->force_vectorize = true;
- inner_loop->safelen = INT_MAX;
-
- cfun->has_force_vectorize_loops = true;
- }
- }
- /* ...and expand the abstract loops such that the vectorizer can
- work on them more effectively.
-
- It might be nicer to merge this code with the "!striding" case
- below, particularly if chunking support is added. */
- tree warppos
- = oacc_thread_numbers (true, mask, vf_by_vectorizer, &seq);
- warppos = fold_convert (diff_type, warppos);
-
- tree volume
- = oacc_thread_numbers (false, mask, vf_by_vectorizer, &seq);
- volume = fold_convert (diff_type, volume);
-
- tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
- chunk_size = fold_build2 (PLUS_EXPR, diff_type, range, per);
- chunk_size = fold_build2 (MINUS_EXPR, diff_type, chunk_size, dir);
- chunk_size = fold_build2 (TRUNC_DIV_EXPR, diff_type, chunk_size,
- per);
-
- warppos = fold_build2 (MULT_EXPR, diff_type, warppos, chunk_size);
-
- tree chunk = fold_convert (diff_type, gimple_call_arg (call, 6));
- chunk = fold_build2 (MULT_EXPR, diff_type, chunk, volume);
- r = fold_build2 (PLUS_EXPR, diff_type, chunk, warppos);
- }
- else if (striding)
- {
- r = oacc_thread_numbers (true, mask, &seq);
- r = fold_convert (diff_type, r);
- }
- else
- {
- tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
- tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
- tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
- inner_size, outer_size);
-
- volume = fold_convert (diff_type, volume);
- if (chunking)
- chunk_size = fold_convert (diff_type, chunk_size);
- else
- {
- tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
- /* chunk_size = (range + per - 1) / per. */
- chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
- chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
- chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
- }
-
- tree span = build2 (MULT_EXPR, diff_type, chunk_size,
- fold_convert (diff_type, inner_size));
- r = oacc_thread_numbers (true, outer_mask, &seq);
- r = fold_convert (diff_type, r);
- r = build2 (MULT_EXPR, diff_type, r, span);
-
- tree inner = oacc_thread_numbers (true, inner_mask, &seq);
- inner = fold_convert (diff_type, inner);
- r = fold_build2 (PLUS_EXPR, diff_type, r, inner);
-
- if (chunking)
- {
- tree chunk = fold_convert (diff_type, gimple_call_arg (call, 6));
- tree per
- = fold_build2 (MULT_EXPR, diff_type, volume, chunk_size);
- per = build2 (MULT_EXPR, diff_type, per, chunk);
-
- r = build2 (PLUS_EXPR, diff_type, r, per);
- }
- }
- r = fold_build2 (MULT_EXPR, diff_type, r, step);
- if (type != diff_type)
- r = fold_convert (type, r);
- break;
-
- case IFN_GOACC_LOOP_BOUND:
- if (vf_by_vectorizer)
- {
- tree volume
- = oacc_thread_numbers (false, mask, vf_by_vectorizer, &seq);
- volume = fold_convert (diff_type, volume);
-
- tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
- chunk_size = fold_build2 (PLUS_EXPR, diff_type, range, per);
- chunk_size = fold_build2 (MINUS_EXPR, diff_type, chunk_size, dir);
- chunk_size = fold_build2 (TRUNC_DIV_EXPR, diff_type, chunk_size,
- per);
-
- vf_by_vectorizer = fold_convert (diff_type, vf_by_vectorizer);
- tree vecsize = fold_build2 (MULT_EXPR, diff_type, chunk_size,
- vf_by_vectorizer);
- vecsize = fold_build2 (MULT_EXPR, diff_type, vecsize, step);
- tree vecend = fold_convert (diff_type, gimple_call_arg (call, 6));
- vecend = fold_build2 (PLUS_EXPR, diff_type, vecend, vecsize);
- r = fold_build2 (integer_onep (dir) ? MIN_EXPR : MAX_EXPR, diff_type,
- range, vecend);
- }
- else if (striding)
- r = range;
- else
- {
- tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
- tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
- tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
- inner_size, outer_size);
-
- volume = fold_convert (diff_type, volume);
- if (chunking)
- chunk_size = fold_convert (diff_type, chunk_size);
- else
- {
- tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
- /* chunk_size = (range + per - 1) / per. */
- chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
- chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
- chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
- }
-
- tree span = build2 (MULT_EXPR, diff_type, chunk_size,
- fold_convert (diff_type, inner_size));
-
- r = fold_build2 (MULT_EXPR, diff_type, span, step);
+ case IFN_GOACC_LOOP_OFFSET:
+ noalias = gimple_call_arg (call, 7);
+ if (vf_by_vectorizer)
+ {
+ /* If not -fno-tree-loop-vectorize, hint that we want to vectorize
+ the loop. */
+ if (flag_tree_loop_vectorize
+ || !global_options_set.x_flag_tree_loop_vectorize)
+ {
+ /* Enable vectorization on non-SIMT targets. */
+ basic_block bb = gsi_bb (gsi);
+ class loop *chunk_loop = bb->loop_father;
+ class loop *inner_loop = chunk_loop->inner;
+
+ /* Chunking isn't supported for VF_BY_VECTORIZER loops yet,
+ so we know that the outer chunking loop will be executed
+ just once and the inner loop is the one which must be
+ vectorized (unless it has been optimized out for some
+ reason). */
+ gcc_assert (!chunking);
+
+ if (inner_loop)
+ {
+ inner_loop->force_vectorize = true;
+ inner_loop->safelen = INT_MAX;
+
+ cfun->has_force_vectorize_loops = true;
+ }
+ }
+
+ /* ...and expand the abstract loops such that the vectorizer can
+ work on them more effectively.
+
+ It might be nicer to merge this code with the "!striding" case
+ below, particularly if chunking support is added. */
+ tree warppos
+ = oacc_thread_numbers (true, mask, vf_by_vectorizer, &seq);
+ warppos = fold_convert (diff_type, warppos);
+
+ tree volume
+ = oacc_thread_numbers (false, mask, vf_by_vectorizer, &seq);
+ volume = fold_convert (diff_type, volume);
+
+ tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
+ chunk_size = fold_build2 (PLUS_EXPR, diff_type, range, per);
+ chunk_size = fold_build2 (MINUS_EXPR, diff_type, chunk_size, dir);
+ chunk_size
+ = fold_build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
+
+ warppos = fold_build2 (MULT_EXPR, diff_type, warppos, chunk_size);
+
+ tree chunk = fold_convert (diff_type, gimple_call_arg (call, 6));
+ chunk = fold_build2 (MULT_EXPR, diff_type, chunk, volume);
+ r = fold_build2 (PLUS_EXPR, diff_type, chunk, warppos);
+ }
+ else if (striding)
+ {
+ r = oacc_thread_numbers (true, mask, &seq);
+ r = fold_convert (diff_type, r);
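+ /* When aliasing may prevent parallelization (noalias == 0), push the
+ offset past the range in every thread but the first so that only
+ one thread executes the loop.  */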
+ tree tmp1 = build2 (NE_EXPR, boolean_type_node, r,
+ build_int_cst (diff_type, 0));
+ tree tmp2 = build2 (EQ_EXPR, boolean_type_node, noalias,
+ build_int_cst (TREE_TYPE (noalias), 0));
+ tree tmp3 = build2 (BIT_AND_EXPR, diff_type,
+ fold_convert (diff_type, tmp1),
+ fold_convert (diff_type, tmp2));
+ tree tmp4 = build2 (MULT_EXPR, diff_type, tmp3, range);
+ r = build2 (PLUS_EXPR, diff_type, r, tmp4);
+ }
+ else
+ {
+ tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
+ tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
+ tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
+ inner_size, outer_size);
+
+ volume = fold_convert (diff_type, volume);
+ if (chunking)
+ chunk_size = fold_convert (diff_type, chunk_size);
+ else
+ {
+ tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
+ /* chunk_size = (range + per - 1) / per. */
+ chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
+ chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
+ chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
+ }
+
+ /* Curtail the range in all but one thread when aliasing may
+ prevent parallelization.  */
+ tree n = oacc_thread_numbers (true, mask, &seq);
+ n = fold_convert (diff_type, n);
+ tree tmp1 = build2 (NE_EXPR, boolean_type_node, n,
+ build_int_cst (diff_type, 0));
+ tree tmp2 = build2 (EQ_EXPR, boolean_type_node, noalias,
+ build_int_cst (TREE_TYPE (noalias), 0));
+ tree tmp3 = build2 (BIT_AND_EXPR, diff_type,
+ fold_convert (diff_type, tmp1),
+ fold_convert (diff_type, tmp2));
+ range = build2 (MULT_EXPR, diff_type, tmp3, range);
+
+ tree span = build2 (MULT_EXPR, diff_type, chunk_size,
+ fold_convert (diff_type, inner_size));
+ r = oacc_thread_numbers (true, outer_mask, &seq);
+ r = fold_convert (diff_type, r);
+ r = build2 (PLUS_EXPR, diff_type, r, range);
+ r = build2 (MULT_EXPR, diff_type, r, span);
+
+ tree inner = oacc_thread_numbers (true, inner_mask, &seq);
+
+ inner = fold_convert (diff_type, inner);
+ r = fold_build2 (PLUS_EXPR, diff_type, r, inner);
+
+ if (chunking)
+ {
+ tree chunk
+ = fold_convert (diff_type, gimple_call_arg (call, 6));
+ tree per
+ = fold_build2 (MULT_EXPR, diff_type, volume, chunk_size);
+ per = build2 (MULT_EXPR, diff_type, per, chunk);
+
+ r = build2 (PLUS_EXPR, diff_type, r, per);
+ }
+ }
+ r = fold_build2 (MULT_EXPR, diff_type, r, step);
+ if (type != diff_type)
+ r = fold_convert (type, r);
+ break;
- tree offset = gimple_call_arg (call, 6);
- r = build2 (PLUS_EXPR, diff_type, r,
- fold_convert (diff_type, offset));
- r = build2 (integer_onep (dir) ? MIN_EXPR : MAX_EXPR,
- diff_type, r, range);
- }
- if (diff_type != type)
- r = fold_convert (type, r);
- break;
+ case IFN_GOACC_LOOP_BOUND:
+ if (vf_by_vectorizer)
+ {
+ tree volume
+ = oacc_thread_numbers (false, mask, vf_by_vectorizer, &seq);
+ volume = fold_convert (diff_type, volume);
+
+ tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
+ chunk_size = fold_build2 (PLUS_EXPR, diff_type, range, per);
+ chunk_size = fold_build2 (MINUS_EXPR, diff_type, chunk_size, dir);
+ chunk_size
+ = fold_build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
+
+ vf_by_vectorizer = fold_convert (diff_type, vf_by_vectorizer);
+ tree vecsize = fold_build2 (MULT_EXPR, diff_type, chunk_size,
+ vf_by_vectorizer);
+ vecsize = fold_build2 (MULT_EXPR, diff_type, vecsize, step);
+ tree vecend = fold_convert (diff_type, gimple_call_arg (call, 6));
+ vecend = fold_build2 (PLUS_EXPR, diff_type, vecend, vecsize);
+ r = fold_build2 (integer_onep (dir) ? MIN_EXPR : MAX_EXPR,
+ diff_type, range, vecend);
+ }
+ else if (striding)
+ r = range;
+ else
+ {
+ noalias = fold_convert (diff_type, gimple_call_arg (call, 7));
+
+ tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
+ tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
+ tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
+ inner_size, outer_size);
+
+ volume = fold_convert (diff_type, volume);
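+ /* As in the OFFSET expansion, treat the volume as a single thread
+ when aliasing may prevent parallelization (noalias == 0).  */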
+ volume = fold_build2 (MULT_EXPR, diff_type, volume, noalias);
+ volume = fold_build2 (MAX_EXPR, diff_type, volume,
+ build_int_cst (diff_type, 1));
+ if (chunking)
+ chunk_size = fold_convert (diff_type, chunk_size);
+ else
+ {
+ tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
+ /* chunk_size = (range + per - 1) / per. */
+ chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
+ chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
+ chunk_size
+ = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
+ }
+
+ tree span = build2 (MULT_EXPR, diff_type, chunk_size,
+ fold_convert (diff_type, inner_size));
+
+ r = fold_build2 (MULT_EXPR, diff_type, span, step);
+
+ tree offset = gimple_call_arg (call, 6);
+ r = build2 (PLUS_EXPR, diff_type, r,
+ fold_convert (diff_type, offset));
+ r = build2 (integer_onep (dir) ? MIN_EXPR : MAX_EXPR, diff_type, r,
+ range);
+ }
+ if (diff_type != type)
+ r = fold_convert (type, r);
+ break;
}
gimplify_assign (lhs, r, &seq);