From: Jakub Jelinek <jakub@redhat.com>
Date: Wed, 7 Nov 2018 19:21:43 +0000 (+0100)
Subject: builtin-types.def (BT_FN_VOID_BOOL, [...]): New.
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=0bb1abc55ad30f708911b66727140754424f9af3;p=thirdparty%2Fgcc.git

builtin-types.def (BT_FN_VOID_BOOL, [...]): New.

	* builtin-types.def (BT_FN_VOID_BOOL, BT_FN_UINT_UINT_PTR_PTR,
	BT_FN_BOOL_UINT_LONGPTR_LONG_LONG_LONGPTR_LONGPTR_PTR_PTR,
	BT_FN_BOOL_UINT_ULLPTR_LONG_ULL_ULLPTR_ULLPTR_PTR_PTR,
	BT_FN_BOOL_LONG_LONG_LONG_LONG_LONG_LONGPTR_LONGPTR_PTR_PTR,
	BT_FN_BOOL_BOOL_ULL_ULL_ULL_LONG_ULL_ULLPTR_ULLPTR_PTR_PTR): New.
	* omp-builtins.def (BUILT_IN_GOMP_LOOP_START,
	BUILT_IN_GOMP_LOOP_ORDERED_START, BUILT_IN_GOMP_LOOP_DOACROSS_START,
	BUILT_IN_GOMP_LOOP_ULL_START, BUILT_IN_GOMP_LOOP_ULL_ORDERED_START,
	BUILT_IN_GOMP_LOOP_ULL_DOACROSS_START, BUILT_IN_GOMP_SECTIONS2_START,
	BUILT_IN_GOMP_WORKSHARE_TASK_REDUCTION_UNREGISTER): New.
	* omp-general.h (struct omp_for_data): Add have_reductemp member.
	* omp-general.c (omp_extract_for_data): Initialize it.
	* omp-low.c (build_outer_var_ref): Ignore taskgroup outer contexts.
	Fix up the condition when lookup_decl should be used.
	(scan_sharing_clauses): Call install_var_local for reductions with
	task modifier even in worksharing contexts.
	(lower_rec_input_clauses): Don't lookup_decl reductemp in worksharing
	contexts.  Handle reductions with task modifier in worksharing
	contexts.  Ignore _reductemp_ clause in worksharing contexts.
	(lower_reduction_clauses): Ignore reduction clause with task modifiers
	even in worksharing contexts.
	(lower_send_clauses): Likewise.
	(maybe_add_implicit_barrier_cancel): Add OMP_RETURN argument, don't
	rely that it is the last stmt in body so far.  Ignore outer taskgroup
	contexts.
	(omp_task_reductions_find_first): Move earlier.
	(lower_omp_task_reductions): Add forward declaration.  Handle
	OMP_FOR and OMP_SECTIONS, add support for parallel cancellation.
	(lower_omp_sections): Handle reduction clauses with taskgroup
	modifiers.  Adjust maybe_add_implicit_barrier_cancel caller.
	(lower_omp_single): Adjust maybe_add_implicit_barrier_cancel caller.
	(lower_omp_for): Likewise.  Handle reduction clauses with taskgroup
	modifiers.
	* omp-expand.c (omp_adjust_chunk_size): Don't adjust anything if
	chunk_size is zero.
	(determine_parallel_type): Don't combine parallel with worksharing
	which has _reductemp_ clause.
	(expand_omp_for_generic): Add SCHED_ARG argument.  Handle expansion
	of worksharing loops with task reductions.
	(expand_omp_for_static_nochunk): Handle expansion of worksharing
	loops with task reductions.
	(expand_omp_for_static_chunk): Likewise.
	(expand_omp_for): Adjust expand_omp_for_generic caller, use
	GOMP_loop{,_ull}{,_ordered,_doacross}_start builtins if there are
	task reductions.
	(expand_omp_sections): Handle expansion of sections with task
	reductions.

gcc/fortran/
	* types.def (BT_FN_VOID_BOOL, BT_FN_UINT_UINT_PTR_PTR,
	BT_FN_BOOL_UINT_LONGPTR_LONG_LONG_LONGPTR_LONGPTR_PTR_PTR,
	BT_FN_BOOL_UINT_ULLPTR_LONG_ULL_ULLPTR_ULLPTR_PTR_PTR,
	BT_FN_VOID_INT_OMPFN_SIZE_PTR_PTR_PTR_UINT_PTR_PTR,
	BT_FN_BOOL_LONG_LONG_LONG_LONG_LONG_LONGPTR_LONGPTR_PTR_PTR,
	BT_FN_BOOL_BOOL_ULL_ULL_ULL_LONG_ULL_ULLPTR_ULLPTR_PTR_PTR): New.

libgomp/
	* libgomp_g.h (GOMP_loop_start, GOMP_loop_ordered_start,
	GOMP_loop_doacross_start, GOMP_loop_ull_start,
	GOMP_loop_ull_ordered_start, GOMP_loop_ull_doacross_start,
	GOMP_workshare_task_reduction_unregister, GOMP_sections2_start): New
	prototypes.
	* libgomp.h (struct gomp_doacross_work_share): Add extra field.
	(struct gomp_work_share): Add task_reductions field.
	(struct gomp_taskgroup): Add workshare flag.
	(gomp_doacross_init, gomp_doacross_ull_init): Add size_t argument.
	(gomp_workshare_taskgroup_start,
	gomp_workshare_task_reduction_register): New prototypes.
	(gomp_init_work_share, gomp_work_share_start): Change bool argument
	to size_t.
	* libgomp.map (GOMP_5.0): Export GOMP_loop_start,
	GOMP_loop_ordered_start, GOMP_loop_doacross_start,
	GOMP_loop_ull_start, GOMP_loop_ull_ordered_start,
	GOMP_loop_ull_doacross_start,
	GOMP_workshare_task_reduction_unregister and GOMP_sections2_start.
	* loop.c: Include string.h.
	(GOMP_loop_runtime_next): Add ialias.
	(GOMP_taskgroup_reduction_register): Add ialias_redirect.
	(gomp_loop_static_start, gomp_loop_dynamic_start,
	gomp_loop_guided_start, gomp_loop_ordered_static_start,
	gomp_loop_ordered_dynamic_start, gomp_loop_ordered_guided_start,
	gomp_loop_doacross_static_start, gomp_loop_doacross_dynamic_start,
	gomp_loop_doacross_guided_start): Adjust gomp_work_share_start
	or gomp_doacross_init callers.
	(gomp_adjust_sched, GOMP_loop_start, GOMP_loop_ordered_start,
	GOMP_loop_doacross_start): New functions.
	* loop_ull.c: Include string.h.
	(GOMP_loop_ull_runtime_next): Add ialias.
	(GOMP_taskgroup_reduction_register): Add ialias_redirect.
	(gomp_loop_ull_static_start, gomp_loop_ull_dynamic_start,
	gomp_loop_ull_guided_start, gomp_loop_ull_ordered_static_start,
	gomp_loop_ull_ordered_dynamic_start,
	gomp_loop_ull_ordered_guided_start,
	gomp_loop_ull_doacross_static_start,
	gomp_loop_ull_doacross_dynamic_start,
	gomp_loop_ull_doacross_guided_start): Adjust gomp_work_share_start
	and gomp_doacross_ull_init callers.
	(gomp_adjust_sched, GOMP_loop_ull_start, GOMP_loop_ull_ordered_start,
	GOMP_loop_ull_doacross_start): New functions.
	* sections.c: Include string.h.
	(GOMP_taskgroup_reduction_register): Add ialias_redirect.
	(GOMP_sections_start): Adjust gomp_work_share_start caller.
	(GOMP_sections2_start): New function.
	* ordered.c (gomp_doacross_init, gomp_doacross_ull_init): Add
	EXTRA argument.  If not needed to prepare array, if extra is 0,
	clear ws->doacross, otherwise allocate just doacross structure and
	extra payload.  If array is needed, allocate also extra payload.
	(GOMP_doacross_post, GOMP_doacross_wait, GOMP_doacross_ull_post,
	GOMP_doacross_ull_wait): Handle doacross->array == NULL like
	doacross == NULL.
	* parallel.c (GOMP_cancellation_point): If taskgroup has workshare
	flag set, check cancelled of prev taskgroup if any.
	(GOMP_cancel): If taskgroup has workshare flag set, set cancelled
	on prev taskgroup if any.
	* single.c (GOMP_single_start, GOMP_single_copy_start): Adjust
	gomp_work_share_start callers.
	* target.c (GOMP_target_update_ext, GOMP_target_enter_exit_data):
	If taskgroup has workshare flag set, check cancelled on prev
	taskgroup if any.  Guard all cancellation tests with
	gomp_cancel_var test.
	* taskloop.c (GOMP_taskloop): Likewise.
	* task.c (GOMP_task, gomp_create_target_task, gomp_task_run_pre,
	GOMP_taskwait_depend): Likewise.
	(gomp_taskgroup_init): Clear workshare flag, reorder initialization.
	(gomp_reduction_register): Add always_inline attribute.  Add ORIG
	argument, if non-NULL, don't allocate memory, but copy it from there.
	(gomp_create_artificial_team): New function.
	(GOMP_taskgroup_reduction_register): Extend function comment.
	Use gomp_create_artificial_team.  Adjust gomp_reduction_register
	caller.
	(gomp_parallel_reduction_register): Adjust gomp_reduction_register
	caller.
	(gomp_workshare_task_reduction_register,
	gomp_workshare_taskgroup_start,
	GOMP_workshare_task_reduction_unregister): New functions.
	* team.c (gomp_new_team): Adjust gomp_init_work_share caller.
	* work.c (gomp_init_work_share): Change ORDERED argument from
	bool to size_t, if more than 1 allocate also extra payload at the
	end of array.  Never keep ordered_team_ids NULL, set it to
	inline_ordered_team_ids instead.
	(gomp_work_share_start): Change ORDERED argument from bool to
	size_t, return true instead of ws.
	* testsuite/libgomp.c-c++-common/cancel-parallel-1.c: New test.
	* testsuite/libgomp.c-c++-common/cancel-taskgroup-3.c: New test.
	* testsuite/libgomp.c-c++-common/task-reduction-6.c (struct S): Use
	unsigned long long int instead of unsigned long int.
	(main): Verify r == t.
	* testsuite/libgomp.c-c++-common/task-reduction-8.c: New test.
	* testsuite/libgomp.c-c++-common/task-reduction-9.c: New test.
	* testsuite/libgomp.c-c++-common/task-reduction-11.c: New test.
	* testsuite/libgomp.c-c++-common/task-reduction-12.c: New test.
	* testsuite/libgomp.c++/task-reduction-14.C: New test.
	* testsuite/libgomp.c++/task-reduction-15.C: New test.
	* testsuite/libgomp.c++/task-reduction-16.C: New test.
	* testsuite/libgomp.c++/task-reduction-17.C: New test.
	* testsuite/libgomp.c++/task-reduction-18.C: New test.
	* testsuite/libgomp.c++/task-reduction-19.C: New test.

From-SVN: r265885
---
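For background: the constructs these new entry points serve are OpenMP 5.0
reductions with the task modifier on worksharing constructs.  A rough,
illustrative example (not one of the new testcases; names and bounds are
arbitrary) of what now expands through GOMP_loop_start and friends:

  /* Compile with -fopenmp.  The task modifier on the reduction makes the
     compiler start the loop through GOMP_loop_start (or the
     _ordered/_doacross/_ull variants) so libgomp can register SUM as a
     task reduction for the implicit taskgroup; a sections construct with
     such a clause goes through GOMP_sections2_start the same way.  */
  #include <stdlib.h>

  int
  main ()
  {
    long long sum = 0;
  #pragma omp parallel
  #pragma omp for reduction (task, +: sum)
    for (int i = 0; i < 64; i++)
      {
  #pragma omp task in_reduction (+: sum)
	sum += i;
      }
    if (sum != 64 * 63 / 2)
      abort ();
    return 0;
  }

The workshare flag added to struct gomp_taskgroup, together with the
parallel.c/target.c/task.c changes, is what lets #pragma omp cancel parallel
interact with such an artificial taskgroup (see the new cancel-parallel-1.c
and cancel-taskgroup-3.c tests).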
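The shapes of the new libgomp entry points follow from the builtin type
strings registered below; as a reading aid they correspond roughly to the
following C declarations.  The parameter names are only guesses patterned on
the existing GOMP_loop_*_start functions; the last two pointers are the
task-reduction descriptor and the extra-memory argument, which the compiler
for now passes as a null pointer (see the "/* For now. */" comment in the
expand_omp_for_generic hunk):

  #include <stdbool.h>

  /* BT_FN_BOOL_LONG_LONG_LONG_LONG_LONG_LONGPTR_LONGPTR_PTR_PTR;
     GOMP_loop_ordered_start has the same shape.  */
  bool GOMP_loop_start (long start, long end, long incr, long sched,
			long chunk, long *istart, long *iend,
			void *reductions, void *mem);
  /* BT_FN_BOOL_UINT_LONGPTR_LONG_LONG_LONGPTR_LONGPTR_PTR_PTR.  */
  bool GOMP_loop_doacross_start (unsigned ncounts, long *counts, long sched,
				 long chunk, long *istart, long *iend,
				 void *reductions, void *mem);
  /* BT_FN_BOOL_BOOL_ULL_ULL_ULL_LONG_ULL_ULLPTR_ULLPTR_PTR_PTR;
     GOMP_loop_ull_ordered_start likewise, and the _ull_doacross variant
     is the unsigned long long analogue of the doacross one.  */
  bool GOMP_loop_ull_start (bool up, unsigned long long start,
			    unsigned long long end, unsigned long long incr,
			    long sched, unsigned long long chunk,
			    unsigned long long *istart,
			    unsigned long long *iend,
			    void *reductions, void *mem);
  /* BT_FN_UINT_UINT_PTR_PTR.  */
  unsigned GOMP_sections2_start (unsigned count, void *reductions,
				 void *mem);
  /* BT_FN_VOID_BOOL.  */
  void GOMP_workshare_task_reduction_unregister (bool cancelled);

As read off the expand_omp_for hunk below, SCHED encodes the schedule kind in
the low bits (0 runtime, 1 static, 2 dynamic, 3 guided, with 4 used in one
nonmonotonic runtime case) and sets bit 31 for monotonic schedules; ordered
static passes (1 << 31) + 1, and a zero chunk is passed when none was given.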
diff --git a/gcc/ChangeLog.gomp b/gcc/ChangeLog.gomp
index 7940067c0a40..66fd79a4c4ff 100644
--- a/gcc/ChangeLog.gomp
+++ b/gcc/ChangeLog.gomp
@@ -1,3 +1,53 @@
+2018-11-07  Jakub Jelinek  <jakub@redhat.com>
+
+	* builtin-types.def (BT_FN_VOID_BOOL, BT_FN_UINT_UINT_PTR_PTR,
+	BT_FN_BOOL_UINT_LONGPTR_LONG_LONG_LONGPTR_LONGPTR_PTR_PTR,
+	BT_FN_BOOL_UINT_ULLPTR_LONG_ULL_ULLPTR_ULLPTR_PTR_PTR,
+	BT_FN_BOOL_LONG_LONG_LONG_LONG_LONG_LONGPTR_LONGPTR_PTR_PTR,
+	BT_FN_BOOL_BOOL_ULL_ULL_ULL_LONG_ULL_ULLPTR_ULLPTR_PTR_PTR): New.
+	* omp-builtins.def (BUILT_IN_GOMP_LOOP_START,
+	BUILT_IN_GOMP_LOOP_ORDERED_START, BUILT_IN_GOMP_LOOP_DOACROSS_START,
+	BUILT_IN_GOMP_LOOP_ULL_START, BUILT_IN_GOMP_LOOP_ULL_ORDERED_START,
+	BUILT_IN_GOMP_LOOP_ULL_DOACROSS_START, BUILT_IN_GOMP_SECTIONS2_START,
+	BUILT_IN_GOMP_WORKSHARE_TASK_REDUCTION_UNREGISTER): New.
+	* omp-general.h (struct omp_for_data): Add have_reductemp member.
+	* omp-general.c (omp_extract_for_data): Initialize it.
+	* omp-low.c (build_outer_var_ref): Ignore taskgroup outer contexts.
+	Fix up the condition when lookup_decl should be used.
+	(scan_sharing_clauses): Call install_var_local for reductions with
+	task modifier even in worksharing contexts.
+	(lower_rec_input_clauses): Don't lookup_decl reductemp in worksharing
+	contexts.  Handle reductions with task modifier in worksharing
+	contexts.  Ignore _reductemp_ clause in worksharing contexts.
+	(lower_reduction_clauses): Ignore reduction clause with task modifiers
+	even in worksharing contexts.
+	(lower_send_clauses): Likewise.
+	(maybe_add_implicit_barrier_cancel): Add OMP_RETURN argument, don't
+	rely that it is the last stmt in body so far.  Ignore outer taskgroup
+	contexts.
+	(omp_task_reductions_find_first): Move earlier.
+	(lower_omp_task_reductions): Add forward declaration.  Handle
+	OMP_FOR and OMP_SECTIONS, add support for parallel cancellation.
+	(lower_omp_sections): Handle reduction clauses with taskgroup
+	modifiers.  Adjust maybe_add_implicit_barrier_cancel caller.
+	(lower_omp_single): Adjust maybe_add_implicit_barrier_cancel caller.
+	(lower_omp_for): Likewise.  Handle reduction clauses with taskgroup
+	modifiers.
+	* omp-expand.c (omp_adjust_chunk_size): Don't adjust anything if
+	chunk_size is zero.
+	(determine_parallel_type): Don't combine parallel with worksharing
+	which has _reductemp_ clause.
+	(expand_omp_for_generic): Add SCHED_ARG argument.  Handle expansion
+	of worksharing loops with task reductions.
+	(expand_omp_for_static_nochunk): Handle expansion of worksharing
+	loops with task reductions.
+	(expand_omp_for_static_chunk): Likewise.
+	(expand_omp_for): Adjust expand_omp_for_generic caller, use
+	GOMP_loop{,_ull}{,_ordered,_doacross}_start builtins if there are
+	task reductions.
+	(expand_omp_sections): Handle expansion of sections with task
+	reductions.
+
 2018-10-25  Jakub Jelinek  <jakub@redhat.com>
 
 	* omp-builtins.def (BUILT_IN_GOMP_LOOP_NONMONOTONIC_RUNTIME_START,
diff --git a/gcc/builtin-types.def b/gcc/builtin-types.def
index 6e8448486a1a..92c0b0bb779e 100644
--- a/gcc/builtin-types.def
+++ b/gcc/builtin-types.def
@@ -251,6 +251,7 @@ DEF_FUNCTION_TYPE_1 (BT_FN_INT_CONST_STRING, BT_INT, BT_CONST_STRING)
 DEF_FUNCTION_TYPE_1 (BT_FN_PTR_PTR, BT_PTR, BT_PTR)
 DEF_FUNCTION_TYPE_1 (BT_FN_VOID_VALIST_REF, BT_VOID, BT_VALIST_REF)
 DEF_FUNCTION_TYPE_1 (BT_FN_VOID_INT, BT_VOID, BT_INT)
+DEF_FUNCTION_TYPE_1 (BT_FN_VOID_BOOL, BT_VOID, BT_BOOL)
 DEF_FUNCTION_TYPE_1 (BT_FN_FLOAT_CONST_STRING, BT_FLOAT, BT_CONST_STRING)
 DEF_FUNCTION_TYPE_1 (BT_FN_DOUBLE_CONST_STRING, BT_DOUBLE, BT_CONST_STRING)
 DEF_FUNCTION_TYPE_1 (BT_FN_LONGDOUBLE_CONST_STRING,
@@ -621,6 +622,7 @@ DEF_FUNCTION_TYPE_3 (BT_FN_VOID_UINT32_UINT32_PTR,
 		     BT_VOID, BT_UINT32, BT_UINT32, BT_PTR)
 DEF_FUNCTION_TYPE_3 (BT_FN_VOID_SIZE_SIZE_PTR, BT_VOID, BT_SIZE, BT_SIZE,
 		     BT_PTR)
+DEF_FUNCTION_TYPE_3 (BT_FN_UINT_UINT_PTR_PTR, BT_UINT, BT_UINT, BT_PTR, BT_PTR)
 
 DEF_FUNCTION_TYPE_4 (BT_FN_SIZE_CONST_PTR_SIZE_SIZE_FILEPTR,
 		     BT_SIZE, BT_CONST_PTR, BT_SIZE, BT_SIZE, BT_FILEPTR)
@@ -731,6 +733,12 @@ DEF_FUNCTION_TYPE_7 (BT_FN_VOID_INT_SIZE_PTR_PTR_PTR_UINT_PTR,
 DEF_FUNCTION_TYPE_8 (BT_FN_VOID_OMPFN_PTR_UINT_LONG_LONG_LONG_LONG_UINT,
 		     BT_VOID, BT_PTR_FN_VOID_PTR, BT_PTR, BT_UINT, BT_LONG,
 		     BT_LONG, BT_LONG, BT_LONG, BT_UINT)
+DEF_FUNCTION_TYPE_8 (BT_FN_BOOL_UINT_LONGPTR_LONG_LONG_LONGPTR_LONGPTR_PTR_PTR,
+		     BT_BOOL, BT_UINT, BT_PTR_LONG, BT_LONG, BT_LONG,
+		     BT_PTR_LONG, BT_PTR_LONG, BT_PTR, BT_PTR)
+DEF_FUNCTION_TYPE_8 (BT_FN_BOOL_UINT_ULLPTR_LONG_ULL_ULLPTR_ULLPTR_PTR_PTR,
+		     BT_BOOL, BT_UINT, BT_PTR_ULONGLONG, BT_LONG, BT_ULONGLONG,
+		     BT_PTR_ULONGLONG, BT_PTR_ULONGLONG, BT_PTR, BT_PTR)
 
 DEF_FUNCTION_TYPE_9 (BT_FN_VOID_OMPFN_PTR_OMPCPYFN_LONG_LONG_BOOL_UINT_PTR_INT,
 		     BT_VOID, BT_PTR_FN_VOID_PTR, BT_PTR,
@@ -739,6 +747,14 @@ DEF_FUNCTION_TYPE_9 (BT_FN_VOID_OMPFN_PTR_OMPCPYFN_LONG_LONG_BOOL_UINT_PTR_INT,
 DEF_FUNCTION_TYPE_9 (BT_FN_VOID_INT_OMPFN_SIZE_PTR_PTR_PTR_UINT_PTR_PTR,
 		     BT_VOID, BT_INT, BT_PTR_FN_VOID_PTR, BT_SIZE, BT_PTR,
 		     BT_PTR, BT_PTR, BT_UINT, BT_PTR, BT_PTR)
+DEF_FUNCTION_TYPE_9 (BT_FN_BOOL_LONG_LONG_LONG_LONG_LONG_LONGPTR_LONGPTR_PTR_PTR,
+		     BT_BOOL, BT_LONG, BT_LONG, BT_LONG, BT_LONG, BT_LONG,
+		     BT_PTR_LONG, BT_PTR_LONG, BT_PTR, BT_PTR)
+
+DEF_FUNCTION_TYPE_10 (BT_FN_BOOL_BOOL_ULL_ULL_ULL_LONG_ULL_ULLPTR_ULLPTR_PTR_PTR,
+		      BT_BOOL, BT_BOOL, BT_ULONGLONG, BT_ULONGLONG,
+		      BT_ULONGLONG, BT_LONG, BT_ULONGLONG, BT_PTR_ULONGLONG,
+		      BT_PTR_ULONGLONG, BT_PTR, BT_PTR)
 
 DEF_FUNCTION_TYPE_11 (BT_FN_VOID_OMPFN_PTR_OMPCPYFN_LONG_LONG_UINT_LONG_INT_LONG_LONG_LONG,
 		      BT_VOID, BT_PTR_FN_VOID_PTR, BT_PTR,
diff --git a/gcc/fortran/ChangeLog.gomp b/gcc/fortran/ChangeLog.gomp
index 999509f3af9f..5355ef1f35df 100644
--- a/gcc/fortran/ChangeLog.gomp
+++ b/gcc/fortran/ChangeLog.gomp
@@ -1,3 +1,12 @@
+2018-11-07  Jakub Jelinek  <jakub@redhat.com>
+
+	* types.def (BT_FN_VOID_BOOL, BT_FN_UINT_UINT_PTR_PTR,
+	BT_FN_BOOL_UINT_LONGPTR_LONG_LONG_LONGPTR_LONGPTR_PTR_PTR,
+	BT_FN_BOOL_UINT_ULLPTR_LONG_ULL_ULLPTR_ULLPTR_PTR_PTR,
+	BT_FN_VOID_INT_OMPFN_SIZE_PTR_PTR_PTR_UINT_PTR_PTR,
+	BT_FN_BOOL_LONG_LONG_LONG_LONG_LONG_LONGPTR_LONGPTR_PTR_PTR,
+	BT_FN_BOOL_BOOL_ULL_ULL_ULL_LONG_ULL_ULLPTR_ULLPTR_PTR_PTR): New.
+
 2018-10-23  Jakub Jelinek  <jakub@redhat.com>
 
 	* types.def (BT_FN_UINT_OMPFN_PTR_UINT_UINT): New.
diff --git a/gcc/fortran/types.def b/gcc/fortran/types.def
index ba12ede7d938..7ba23bac34c0 100644
--- a/gcc/fortran/types.def
+++ b/gcc/fortran/types.def
@@ -86,6 +86,7 @@ DEF_FUNCTION_TYPE_1 (BT_FN_INT_INT, BT_INT, BT_INT)
 DEF_FUNCTION_TYPE_1 (BT_FN_UINT_UINT, BT_UINT, BT_UINT)
 DEF_FUNCTION_TYPE_1 (BT_FN_PTR_PTR, BT_PTR, BT_PTR)
 DEF_FUNCTION_TYPE_1 (BT_FN_VOID_INT, BT_VOID, BT_INT)
+DEF_FUNCTION_TYPE_1 (BT_FN_VOID_BOOL, BT_VOID, BT_BOOL)
 DEF_FUNCTION_TYPE_1 (BT_FN_BOOL_INT, BT_BOOL, BT_INT)
 
 DEF_POINTER_TYPE (BT_PTR_FN_VOID_PTR, BT_FN_VOID_PTR)
@@ -147,6 +148,7 @@ DEF_FUNCTION_TYPE_3 (BT_FN_VOID_VPTR_I8_INT, BT_VOID, BT_VOLATILE_PTR, BT_I8, BT
 DEF_FUNCTION_TYPE_3 (BT_FN_VOID_VPTR_I16_INT, BT_VOID, BT_VOLATILE_PTR, BT_I16, BT_INT)
 DEF_FUNCTION_TYPE_3 (BT_FN_VOID_SIZE_SIZE_PTR, BT_VOID, BT_SIZE, BT_SIZE, BT_PTR)
+DEF_FUNCTION_TYPE_3 (BT_FN_UINT_UINT_PTR_PTR, BT_UINT, BT_UINT, BT_PTR, BT_PTR)
 
 DEF_FUNCTION_TYPE_4 (BT_FN_VOID_OMPFN_PTR_UINT_UINT,
 		     BT_VOID, BT_PTR_FN_VOID_PTR, BT_PTR, BT_UINT, BT_UINT)
@@ -221,14 +223,28 @@ DEF_FUNCTION_TYPE_7 (BT_FN_VOID_INT_SIZE_PTR_PTR_PTR_UINT_PTR,
 DEF_FUNCTION_TYPE_8 (BT_FN_VOID_OMPFN_PTR_UINT_LONG_LONG_LONG_LONG_UINT,
 		     BT_VOID, BT_PTR_FN_VOID_PTR, BT_PTR, BT_UINT, BT_LONG,
 		     BT_LONG, BT_LONG, BT_LONG, BT_UINT)
+DEF_FUNCTION_TYPE_8 (BT_FN_BOOL_UINT_LONGPTR_LONG_LONG_LONGPTR_LONGPTR_PTR_PTR,
+		     BT_BOOL, BT_UINT, BT_PTR_LONG, BT_LONG, BT_LONG,
+		     BT_PTR_LONG, BT_PTR_LONG, BT_PTR, BT_PTR)
+DEF_FUNCTION_TYPE_8 (BT_FN_BOOL_UINT_ULLPTR_LONG_ULL_ULLPTR_ULLPTR_PTR_PTR,
+		     BT_BOOL, BT_UINT, BT_PTR_ULONGLONG, BT_LONG, BT_ULONGLONG,
+		     BT_PTR_ULONGLONG, BT_PTR_ULONGLONG, BT_PTR, BT_PTR)
 
 DEF_FUNCTION_TYPE_9 (BT_FN_VOID_OMPFN_PTR_OMPCPYFN_LONG_LONG_BOOL_UINT_PTR_INT,
 		     BT_VOID, BT_PTR_FN_VOID_PTR, BT_PTR,
 		     BT_PTR_FN_VOID_PTR_PTR, BT_LONG, BT_LONG,
 		     BT_BOOL, BT_UINT, BT_PTR, BT_INT)
 DEF_FUNCTION_TYPE_9 (BT_FN_VOID_INT_OMPFN_SIZE_PTR_PTR_PTR_UINT_PTR_PTR,
-	             BT_VOID, BT_INT, BT_PTR_FN_VOID_PTR, BT_SIZE, BT_PTR,
-	             BT_PTR, BT_PTR, BT_UINT, BT_PTR, BT_PTR)
+		     BT_VOID, BT_INT, BT_PTR_FN_VOID_PTR, BT_SIZE, BT_PTR,
+		     BT_PTR, BT_PTR, BT_UINT, BT_PTR, BT_PTR)
+DEF_FUNCTION_TYPE_9 (BT_FN_BOOL_LONG_LONG_LONG_LONG_LONG_LONGPTR_LONGPTR_PTR_PTR,
+		     BT_BOOL, BT_LONG, BT_LONG, BT_LONG, BT_LONG, BT_LONG,
+		     BT_PTR_LONG, BT_PTR_LONG, BT_PTR, BT_PTR)
+
+DEF_FUNCTION_TYPE_10 (BT_FN_BOOL_BOOL_ULL_ULL_ULL_LONG_ULL_ULLPTR_ULLPTR_PTR_PTR,
+		      BT_BOOL, BT_BOOL, BT_ULONGLONG, BT_ULONGLONG,
+		      BT_ULONGLONG, BT_LONG, BT_ULONGLONG, BT_PTR_ULONGLONG,
+		      BT_PTR_ULONGLONG, BT_PTR, BT_PTR)
 
 DEF_FUNCTION_TYPE_11 (BT_FN_VOID_OMPFN_PTR_OMPCPYFN_LONG_LONG_UINT_LONG_INT_LONG_LONG_LONG,
 		      BT_VOID, BT_PTR_FN_VOID_PTR, BT_PTR,
diff --git a/gcc/omp-builtins.def b/gcc/omp-builtins.def
index 5698af7a6a21..70051635fa0a 100644
--- a/gcc/omp-builtins.def
+++ b/gcc/omp-builtins.def
@@ -164,6 +164,18 @@ DEF_GOMP_BUILTIN (BUILT_IN_GOMP_LOOP_DOACROSS_RUNTIME_START,
 		  "GOMP_loop_doacross_runtime_start",
 		  BT_FN_BOOL_UINT_LONGPTR_LONGPTR_LONGPTR,
 		  ATTR_NOTHROW_LEAF_LIST)
+DEF_GOMP_BUILTIN (BUILT_IN_GOMP_LOOP_START,
+		  "GOMP_loop_start",
+		  BT_FN_BOOL_LONG_LONG_LONG_LONG_LONG_LONGPTR_LONGPTR_PTR_PTR,
+		  ATTR_NOTHROW_LEAF_LIST)
+DEF_GOMP_BUILTIN (BUILT_IN_GOMP_LOOP_ORDERED_START,
+		  "GOMP_loop_ordered_start",
+		  BT_FN_BOOL_LONG_LONG_LONG_LONG_LONG_LONGPTR_LONGPTR_PTR_PTR,
+		  ATTR_NOTHROW_LEAF_LIST)
+DEF_GOMP_BUILTIN (BUILT_IN_GOMP_LOOP_DOACROSS_START,
+		  "GOMP_loop_doacross_start",
+		  BT_FN_BOOL_UINT_LONGPTR_LONG_LONG_LONGPTR_LONGPTR_PTR_PTR,
+		  ATTR_NOTHROW_LEAF_LIST)
 DEF_GOMP_BUILTIN (BUILT_IN_GOMP_LOOP_STATIC_NEXT, "GOMP_loop_static_next",
 		  BT_FN_BOOL_LONGPTR_LONGPTR, ATTR_NOTHROW_LEAF_LIST)
 DEF_GOMP_BUILTIN (BUILT_IN_GOMP_LOOP_DYNAMIC_NEXT, "GOMP_loop_dynamic_next",
@@ -260,6 +272,18 @@ DEF_GOMP_BUILTIN (BUILT_IN_GOMP_LOOP_ULL_DOACROSS_RUNTIME_START,
 		  "GOMP_loop_ull_doacross_runtime_start",
 		  BT_FN_BOOL_UINT_ULLPTR_ULLPTR_ULLPTR,
 		  ATTR_NOTHROW_LEAF_LIST)
+DEF_GOMP_BUILTIN (BUILT_IN_GOMP_LOOP_ULL_START,
+		  "GOMP_loop_ull_start",
+		  BT_FN_BOOL_BOOL_ULL_ULL_ULL_LONG_ULL_ULLPTR_ULLPTR_PTR_PTR,
+		  ATTR_NOTHROW_LEAF_LIST)
+DEF_GOMP_BUILTIN (BUILT_IN_GOMP_LOOP_ULL_ORDERED_START,
+		  "GOMP_loop_ull_ordered_start",
+		  BT_FN_BOOL_BOOL_ULL_ULL_ULL_LONG_ULL_ULLPTR_ULLPTR_PTR_PTR,
+		  ATTR_NOTHROW_LEAF_LIST)
+DEF_GOMP_BUILTIN (BUILT_IN_GOMP_LOOP_ULL_DOACROSS_START,
+		  "GOMP_loop_ull_doacross_start",
+		  BT_FN_BOOL_UINT_ULLPTR_LONG_ULL_ULLPTR_ULLPTR_PTR_PTR,
+		  ATTR_NOTHROW_LEAF_LIST)
 DEF_GOMP_BUILTIN (BUILT_IN_GOMP_LOOP_ULL_STATIC_NEXT,
 		  "GOMP_loop_ull_static_next",
 		  BT_FN_BOOL_ULONGLONGPTR_ULONGLONGPTR, ATTR_NOTHROW_LEAF_LIST)
@@ -365,6 +389,8 @@ DEF_GOMP_BUILTIN (BUILT_IN_GOMP_TASKLOOP_ULL, "GOMP_taskloop_ull",
 		  ATTR_NOTHROW_LIST)
 DEF_GOMP_BUILTIN (BUILT_IN_GOMP_SECTIONS_START, "GOMP_sections_start",
 		  BT_FN_UINT_UINT, ATTR_NOTHROW_LEAF_LIST)
+DEF_GOMP_BUILTIN (BUILT_IN_GOMP_SECTIONS2_START, "GOMP_sections2_start",
+		  BT_FN_UINT_UINT_PTR_PTR, ATTR_NOTHROW_LEAF_LIST)
 DEF_GOMP_BUILTIN (BUILT_IN_GOMP_SECTIONS_NEXT, "GOMP_sections_next",
 		  BT_FN_UINT, ATTR_NOTHROW_LEAF_LIST)
 DEF_GOMP_BUILTIN (BUILT_IN_GOMP_PARALLEL_SECTIONS,
@@ -415,5 +441,8 @@ DEF_GOMP_BUILTIN (BUILT_IN_GOMP_TASKGROUP_REDUCTION_UNREGISTER,
 DEF_GOMP_BUILTIN (BUILT_IN_GOMP_TASK_REDUCTION_REMAP,
 		  "GOMP_task_reduction_remap",
 		  BT_FN_VOID_SIZE_SIZE_PTR, ATTR_NOTHROW_LEAF_LIST)
+DEF_GOMP_BUILTIN (BUILT_IN_GOMP_WORKSHARE_TASK_REDUCTION_UNREGISTER,
+		  "GOMP_workshare_task_reduction_unregister",
+		  BT_FN_VOID_BOOL, ATTR_NOTHROW_LEAF_LIST)
 DEF_GOACC_BUILTIN (BUILT_IN_GOACC_DECLARE, "GOACC_declare",
 		   BT_FN_VOID_INT_SIZE_PTR_PTR_PTR, ATTR_NOTHROW_LIST)
diff --git a/gcc/omp-expand.c b/gcc/omp-expand.c
index 11790682857f..ccb94ba9de4a 100644
--- a/gcc/omp-expand.c
+++ b/gcc/omp-expand.c
@@ -204,7 +204,7 @@ workshare_safe_to_combine_p (basic_block ws_entry_bb)
 static tree
 omp_adjust_chunk_size (tree chunk_size, bool simd_schedule)
 {
-  if (!simd_schedule)
+  if (!simd_schedule || integer_zerop (chunk_size))
     return chunk_size;
 
   poly_uint64 vf = omp_max_vf ();
@@ -345,13 +345,14 @@ determine_parallel_type (struct omp_region *region)
       if (c == NULL
 	  || ((OMP_CLAUSE_SCHEDULE_KIND (c) & OMP_CLAUSE_SCHEDULE_MASK)
 	      == OMP_CLAUSE_SCHEDULE_STATIC)
-	  || omp_find_clause (clauses, OMP_CLAUSE_ORDERED))
-	{
-	  region->is_combined_parallel = false;
-	  region->inner->is_combined_parallel = false;
-	  return;
-	}
+	  || omp_find_clause (clauses, OMP_CLAUSE_ORDERED)
+	  || omp_find_clause (clauses, OMP_CLAUSE__REDUCTEMP_))
+	return;
     }
+  else if (region->inner->type == GIMPLE_OMP_SECTIONS
+	   && omp_find_clause (gimple_omp_sections_clauses (ws_stmt),
+			       OMP_CLAUSE__REDUCTEMP_))
+    return;
 
   region->is_combined_parallel = true;
region->inner->is_combined_parallel = true; @@ -2618,6 +2619,7 @@ expand_omp_for_generic (struct omp_region *region, struct omp_for_data *fd, enum built_in_function start_fn, enum built_in_function next_fn, + tree sched_arg, gimple *inner_stmt) { tree type, istart0, iend0, iend; @@ -2665,6 +2667,30 @@ expand_omp_for_generic (struct omp_region *region, && omp_find_clause (gimple_omp_for_clauses (gsi_stmt (gsi)), OMP_CLAUSE_LASTPRIVATE)) ordered_lastprivate = false; + tree reductions = NULL_TREE; + tree mem = NULL_TREE; + if (sched_arg) + { + if (fd->have_reductemp) + { + tree c = omp_find_clause (gimple_omp_for_clauses (gsi_stmt (gsi)), + OMP_CLAUSE__REDUCTEMP_); + reductions = OMP_CLAUSE_DECL (c); + gcc_assert (TREE_CODE (reductions) == SSA_NAME); + gimple *g = SSA_NAME_DEF_STMT (reductions); + reductions = gimple_assign_rhs1 (g); + OMP_CLAUSE_DECL (c) = reductions; + entry_bb = gimple_bb (g); + edge e = split_block (entry_bb, g); + if (region->entry == entry_bb) + region->entry = e->dest; + gsi = gsi_last_bb (entry_bb); + } + else + reductions = null_pointer_node; + /* For now. */ + mem = null_pointer_node; + } if (fd->collapse > 1 || fd->ordered) { int first_zero_iter1 = -1, first_zero_iter2 = -1; @@ -2851,7 +2877,18 @@ expand_omp_for_generic (struct omp_region *region, { t = fold_convert (fd->iter_type, fd->chunk_size); t = omp_adjust_chunk_size (t, fd->simd_schedule); - if (fd->ordered) + if (sched_arg) + { + if (fd->ordered) + t = build_call_expr (builtin_decl_explicit (start_fn), + 8, t0, t1, sched_arg, t, t3, t4, + reductions, mem); + else + t = build_call_expr (builtin_decl_explicit (start_fn), + 9, t0, t1, t2, sched_arg, t, t3, t4, + reductions, mem); + } + else if (fd->ordered) t = build_call_expr (builtin_decl_explicit (start_fn), 5, t0, t1, t, t3, t4); else @@ -2884,7 +2921,11 @@ expand_omp_for_generic (struct omp_region *region, tree bfn_decl = builtin_decl_explicit (start_fn); t = fold_convert (fd->iter_type, fd->chunk_size); t = omp_adjust_chunk_size (t, fd->simd_schedule); - t = build_call_expr (bfn_decl, 7, t5, t0, t1, t2, t, t3, t4); + if (sched_arg) + t = build_call_expr (bfn_decl, 10, t5, t0, t1, t2, sched_arg, + t, t3, t4, reductions, mem); + else + t = build_call_expr (bfn_decl, 7, t5, t0, t1, t2, t, t3, t4); } else t = build_call_expr (builtin_decl_explicit (start_fn), @@ -2903,6 +2944,17 @@ expand_omp_for_generic (struct omp_region *region, gsi_insert_before (&gsi, gimple_build_assign (arr, clobber), GSI_SAME_STMT); } + if (fd->have_reductemp) + { + gimple *g = gsi_stmt (gsi); + gsi_remove (&gsi, true); + release_ssa_name (gimple_assign_lhs (g)); + + entry_bb = region->entry; + gsi = gsi_last_nondebug_bb (entry_bb); + + gcc_assert (gimple_code (gsi_stmt (gsi)) == GIMPLE_OMP_FOR); + } gsi_insert_after (&gsi, gimple_build_cond_empty (t), GSI_SAME_STMT); /* Remove the GIMPLE_OMP_FOR statement. 
*/ @@ -3201,9 +3253,6 @@ expand_omp_for_generic (struct omp_region *region, else t = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END); gcall *call_stmt = gimple_build_call (t, 0); - if (gimple_omp_return_lhs (gsi_stmt (gsi))) - gimple_call_set_lhs (call_stmt, gimple_omp_return_lhs (gsi_stmt (gsi))); - gsi_insert_after (&gsi, call_stmt, GSI_SAME_STMT); if (fd->ordered) { tree arr = counts[fd->ordered]; @@ -3212,6 +3261,17 @@ expand_omp_for_generic (struct omp_region *region, gsi_insert_after (&gsi, gimple_build_assign (arr, clobber), GSI_SAME_STMT); } + if (gimple_omp_return_lhs (gsi_stmt (gsi))) + { + gimple_call_set_lhs (call_stmt, gimple_omp_return_lhs (gsi_stmt (gsi))); + if (fd->have_reductemp) + { + gimple *g = gimple_build_assign (reductions, NOP_EXPR, + gimple_call_lhs (call_stmt)); + gsi_insert_after (&gsi, g, GSI_SAME_STMT); + } + } + gsi_insert_after (&gsi, call_stmt, GSI_SAME_STMT); gsi_remove (&gsi, true); /* Connect the new blocks. */ @@ -3394,6 +3454,7 @@ expand_omp_for_static_nochunk (struct omp_region *region, bool broken_loop = region->cont == NULL; tree *counts = NULL; tree n1, n2, step; + tree reductions = NULL_TREE; itype = type = TREE_TYPE (fd->loop.v); if (POINTER_TYPE_P (type)) @@ -3477,6 +3538,29 @@ expand_omp_for_static_nochunk (struct omp_region *region, gsi = gsi_last_bb (entry_bb); } + if (fd->have_reductemp) + { + tree t1 = build_int_cst (long_integer_type_node, 0); + tree t2 = build_int_cst (long_integer_type_node, 1); + tree t3 = build_int_cstu (long_integer_type_node, + (HOST_WIDE_INT_1U << 31) + 1); + tree clauses = gimple_omp_for_clauses (fd->for_stmt); + clauses = omp_find_clause (clauses, OMP_CLAUSE__REDUCTEMP_); + reductions = OMP_CLAUSE_DECL (clauses); + gcc_assert (TREE_CODE (reductions) == SSA_NAME); + gimple *g = SSA_NAME_DEF_STMT (reductions); + reductions = gimple_assign_rhs1 (g); + OMP_CLAUSE_DECL (clauses) = reductions; + gimple_stmt_iterator gsi2 = gsi_for_stmt (g); + tree t + = build_call_expr (builtin_decl_explicit (BUILT_IN_GOMP_LOOP_START), + 9, t1, t2, t2, t3, t1, null_pointer_node, + null_pointer_node, reductions, null_pointer_node); + force_gimple_operand_gsi (&gsi2, t, true, NULL_TREE, + true, GSI_SAME_STMT); + gsi_remove (&gsi2, true); + release_ssa_name (gimple_assign_lhs (g)); + } switch (gimple_omp_for_kind (fd->for_stmt)) { case GF_OMP_FOR_KIND_FOR: @@ -3747,7 +3831,25 @@ expand_omp_for_static_nochunk (struct omp_region *region, if (!gimple_omp_return_nowait_p (gsi_stmt (gsi))) { t = gimple_omp_return_lhs (gsi_stmt (gsi)); - gsi_insert_after (&gsi, omp_build_barrier (t), GSI_SAME_STMT); + if (fd->have_reductemp) + { + tree fn; + if (t) + fn = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END_CANCEL); + else + fn = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END); + gcall *g = gimple_build_call (fn, 0); + if (t) + { + gimple_call_set_lhs (g, t); + gsi_insert_after (&gsi, gimple_build_assign (reductions, + NOP_EXPR, t), + GSI_SAME_STMT); + } + gsi_insert_after (&gsi, g, GSI_SAME_STMT); + } + else + gsi_insert_after (&gsi, omp_build_barrier (t), GSI_SAME_STMT); } gsi_remove (&gsi, true); @@ -3884,6 +3986,7 @@ expand_omp_for_static_chunk (struct omp_region *region, bool broken_loop = region->cont == NULL; tree *counts = NULL; tree n1, n2, step; + tree reductions = NULL_TREE; itype = type = TREE_TYPE (fd->loop.v); if (POINTER_TYPE_P (type)) @@ -3971,6 +4074,29 @@ expand_omp_for_static_chunk (struct omp_region *region, gsi = gsi_last_bb (entry_bb); } + if (fd->have_reductemp) + { + tree t1 = build_int_cst (long_integer_type_node, 0); + tree t2 = 
build_int_cst (long_integer_type_node, 1); + tree t3 = build_int_cstu (long_integer_type_node, + (HOST_WIDE_INT_1U << 31) + 1); + tree clauses = gimple_omp_for_clauses (fd->for_stmt); + clauses = omp_find_clause (clauses, OMP_CLAUSE__REDUCTEMP_); + reductions = OMP_CLAUSE_DECL (clauses); + gcc_assert (TREE_CODE (reductions) == SSA_NAME); + gimple *g = SSA_NAME_DEF_STMT (reductions); + reductions = gimple_assign_rhs1 (g); + OMP_CLAUSE_DECL (clauses) = reductions; + gimple_stmt_iterator gsi2 = gsi_for_stmt (g); + tree t + = build_call_expr (builtin_decl_explicit (BUILT_IN_GOMP_LOOP_START), + 9, t1, t2, t2, t3, t1, null_pointer_node, + null_pointer_node, reductions, null_pointer_node); + force_gimple_operand_gsi (&gsi2, t, true, NULL_TREE, + true, GSI_SAME_STMT); + gsi_remove (&gsi2, true); + release_ssa_name (gimple_assign_lhs (g)); + } switch (gimple_omp_for_kind (fd->for_stmt)) { case GF_OMP_FOR_KIND_FOR: @@ -4274,7 +4400,25 @@ expand_omp_for_static_chunk (struct omp_region *region, if (!gimple_omp_return_nowait_p (gsi_stmt (gsi))) { t = gimple_omp_return_lhs (gsi_stmt (gsi)); - gsi_insert_after (&gsi, omp_build_barrier (t), GSI_SAME_STMT); + if (fd->have_reductemp) + { + tree fn; + if (t) + fn = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END_CANCEL); + else + fn = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END); + gcall *g = gimple_build_call (fn, 0); + if (t) + { + gimple_call_set_lhs (g, t); + gsi_insert_after (&gsi, gimple_build_assign (reductions, + NOP_EXPR, t), + GSI_SAME_STMT); + } + gsi_insert_after (&gsi, g, GSI_SAME_STMT); + } + else + gsi_insert_after (&gsi, omp_build_barrier (t), GSI_SAME_STMT); } gsi_remove (&gsi, true); @@ -5809,6 +5953,8 @@ expand_omp_for (struct omp_region *region, gimple *inner_stmt) else { int fn_index, start_ix, next_ix; + unsigned HOST_WIDE_INT sched = 0; + tree sched_arg = NULL_TREE; gcc_assert (gimple_omp_for_kind (fd.for_stmt) == GF_OMP_FOR_KIND_FOR); @@ -5822,12 +5968,16 @@ expand_omp_for (struct omp_region *region, gimple *inner_stmt) { gcc_assert (!fd.have_ordered); fn_index = 6; + sched = 4; } else if ((fd.sched_modifiers & OMP_CLAUSE_SCHEDULE_MONOTONIC) == 0 && !fd.have_ordered) fn_index = 7; else - fn_index = 3; + { + fn_index = 3; + sched = (HOST_WIDE_INT_1U << 31); + } break; case OMP_CLAUSE_SCHEDULE_DYNAMIC: case OMP_CLAUSE_SCHEDULE_GUIDED: @@ -5835,13 +5985,17 @@ expand_omp_for (struct omp_region *region, gimple *inner_stmt) && !fd.have_ordered) { fn_index = 3 + fd.sched_kind; + sched = (fd.sched_kind == OMP_CLAUSE_SCHEDULE_GUIDED) + 2; break; } fn_index = fd.sched_kind; + sched = (fd.sched_kind == OMP_CLAUSE_SCHEDULE_GUIDED) + 2; + sched += (HOST_WIDE_INT_1U << 31); break; case OMP_CLAUSE_SCHEDULE_STATIC: gcc_assert (fd.have_ordered); fn_index = 0; + sched = (HOST_WIDE_INT_1U << 31) + 1; break; default: gcc_unreachable (); @@ -5853,6 +6007,18 @@ expand_omp_for (struct omp_region *region, gimple *inner_stmt) else start_ix = ((int)BUILT_IN_GOMP_LOOP_STATIC_START) + fn_index; next_ix = ((int)BUILT_IN_GOMP_LOOP_STATIC_NEXT) + fn_index; + if (fd.have_reductemp) + { + if (fd.ordered) + start_ix = (int)BUILT_IN_GOMP_LOOP_DOACROSS_START; + else if (fd.have_ordered) + start_ix = (int)BUILT_IN_GOMP_LOOP_ORDERED_START; + else + start_ix = (int)BUILT_IN_GOMP_LOOP_START; + sched_arg = build_int_cstu (long_integer_type_node, sched); + if (!fd.chunk_size) + fd.chunk_size = integer_zero_node; + } if (fd.iter_type == long_long_unsigned_type_node) { start_ix += ((int)BUILT_IN_GOMP_LOOP_ULL_STATIC_START @@ -5861,7 +6027,8 @@ expand_omp_for (struct omp_region 
*region, gimple *inner_stmt) - (int)BUILT_IN_GOMP_LOOP_STATIC_NEXT); } expand_omp_for_generic (region, &fd, (enum built_in_function) start_ix, - (enum built_in_function) next_ix, inner_stmt); + (enum built_in_function) next_ix, sched_arg, + inner_stmt); } if (gimple_in_ssa_p (cfun)) @@ -5961,7 +6128,25 @@ expand_omp_sections (struct omp_region *region) sections_stmt = as_a (gsi_stmt (si)); gcc_assert (gimple_code (sections_stmt) == GIMPLE_OMP_SECTIONS); vin = gimple_omp_sections_control (sections_stmt); - if (!is_combined_parallel (region)) + tree clauses = gimple_omp_sections_clauses (sections_stmt); + tree reductmp = omp_find_clause (clauses, OMP_CLAUSE__REDUCTEMP_); + if (reductmp) + { + tree reductions = OMP_CLAUSE_DECL (reductmp); + gcc_assert (TREE_CODE (reductions) == SSA_NAME); + gimple *g = SSA_NAME_DEF_STMT (reductions); + reductions = gimple_assign_rhs1 (g); + OMP_CLAUSE_DECL (reductmp) = reductions; + gimple_stmt_iterator gsi = gsi_for_stmt (g); + t = build_int_cst (unsigned_type_node, len - 1); + u = builtin_decl_explicit (BUILT_IN_GOMP_SECTIONS2_START); + stmt = gimple_build_call (u, 3, t, reductions, null_pointer_node); + gimple_call_set_lhs (stmt, vin); + gsi_insert_before (&gsi, stmt, GSI_SAME_STMT); + gsi_remove (&gsi, true); + release_ssa_name (gimple_assign_lhs (g)); + } + else if (!is_combined_parallel (region)) { /* If we are not inside a combined parallel+sections region, call GOMP_sections_start. */ @@ -5975,8 +6160,11 @@ expand_omp_sections (struct omp_region *region) u = builtin_decl_explicit (BUILT_IN_GOMP_SECTIONS_NEXT); stmt = gimple_build_call (u, 0); } - gimple_call_set_lhs (stmt, vin); - gsi_insert_after (&si, stmt, GSI_SAME_STMT); + if (!reductmp) + { + gimple_call_set_lhs (stmt, vin); + gsi_insert_after (&si, stmt, GSI_SAME_STMT); + } gsi_remove (&si, true); /* The switch() statement replacing GIMPLE_OMP_SECTIONS_SWITCH goes in diff --git a/gcc/omp-general.c b/gcc/omp-general.c index 2d53e105ec39..99d8226ef213 100644 --- a/gcc/omp-general.c +++ b/gcc/omp-general.c @@ -138,6 +138,7 @@ omp_extract_for_data (gomp_for *for_stmt, struct omp_for_data *fd, fd->pre = NULL; fd->have_nowait = distribute || simd; fd->have_ordered = false; + fd->have_reductemp = false; fd->tiling = NULL_TREE; fd->collapse = 1; fd->ordered = 0; @@ -188,6 +189,8 @@ omp_extract_for_data (gomp_for *for_stmt, struct omp_for_data *fd, collapse_iter = &OMP_CLAUSE_TILE_ITERVAR (t); collapse_count = &OMP_CLAUSE_TILE_COUNT (t); break; + case OMP_CLAUSE__REDUCTEMP_: + fd->have_reductemp = true; default: break; } diff --git a/gcc/omp-general.h b/gcc/omp-general.h index b5af0e07ebc0..b847506d4528 100644 --- a/gcc/omp-general.h +++ b/gcc/omp-general.h @@ -62,7 +62,7 @@ struct omp_for_data tree tiling; /* Tiling values (if non null). */ int collapse; /* Collapsed loops, 1 for a non-collapsed loop. 
*/ int ordered; - bool have_nowait, have_ordered, simd_schedule; + bool have_nowait, have_ordered, simd_schedule, have_reductemp; unsigned char sched_modifiers; enum omp_clause_schedule_kind sched_kind; struct omp_for_data_loop *loops; diff --git a/gcc/omp-low.c b/gcc/omp-low.c index 040795950e64..0679904c7edb 100644 --- a/gcc/omp-low.c +++ b/gcc/omp-low.c @@ -542,6 +542,9 @@ build_outer_var_ref (tree var, omp_context *ctx, enum omp_clause_code code = OMP_CLAUSE_ERROR) { tree x; + omp_context *outer = ctx->outer; + while (outer && gimple_code (outer->stmt) == GIMPLE_OMP_TASKGROUP) + outer = outer->outer; if (is_global_var (maybe_lookup_decl_in_outer_ctx (var, ctx))) x = var; @@ -568,44 +571,43 @@ build_outer_var_ref (tree var, omp_context *ctx, Similarly for OMP_CLAUSE_PRIVATE with outer ref, that can refer to private vars in all worksharing constructs. */ x = NULL_TREE; - if (ctx->outer && is_taskreg_ctx (ctx)) - x = lookup_decl (var, ctx->outer); - else if (ctx->outer) + if (outer && is_taskreg_ctx (outer)) + x = lookup_decl (var, outer); + else if (outer) x = maybe_lookup_decl_in_outer_ctx (var, ctx); if (x == NULL_TREE) x = var; } else if (code == OMP_CLAUSE_LASTPRIVATE && is_taskloop_ctx (ctx)) { - gcc_assert (ctx->outer); + gcc_assert (outer); splay_tree_node n - = splay_tree_lookup (ctx->outer->field_map, + = splay_tree_lookup (outer->field_map, (splay_tree_key) &DECL_UID (var)); if (n == NULL) { - if (is_global_var (maybe_lookup_decl_in_outer_ctx (var, ctx->outer))) + if (is_global_var (maybe_lookup_decl_in_outer_ctx (var, outer))) x = var; else - x = lookup_decl (var, ctx->outer); + x = lookup_decl (var, outer); } else { tree field = (tree) n->value; /* If the receiver record type was remapped in the child function, remap the field into the new record type. 
*/ - x = maybe_lookup_field (field, ctx->outer); + x = maybe_lookup_field (field, outer); if (x != NULL) field = x; - x = build_simple_mem_ref (ctx->outer->receiver_decl); + x = build_simple_mem_ref (outer->receiver_decl); x = omp_build_component_ref (x, field); - if (use_pointer_for_field (var, ctx->outer)) + if (use_pointer_for_field (var, outer)) x = build_simple_mem_ref (x); } } - else if (ctx->outer) + else if (outer) { - omp_context *outer = ctx->outer; if (gimple_code (outer->stmt) == GIMPLE_OMP_GRID_BODY) { outer = outer->outer; @@ -1130,6 +1132,12 @@ scan_sharing_clauses (tree clauses, omp_context *ctx) install_var_local (decl, ctx); break; } + if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_REDUCTION + && OMP_CLAUSE_REDUCTION_TASK (c)) + { + install_var_local (decl, ctx); + break; + } goto do_private; case OMP_CLAUSE_LASTPRIVATE: @@ -3833,7 +3841,9 @@ lower_rec_input_clauses (tree clauses, gimple_seq *ilist, gimple_seq *dlist, gimple_call_set_lhs (g, v); gimple_seq_add_stmt (ilist, g); c = omp_find_clause (clauses, OMP_CLAUSE__REDUCTEMP_); - tskred_temp = lookup_decl (OMP_CLAUSE_DECL (c), ctx); + tskred_temp = OMP_CLAUSE_DECL (c); + if (is_taskreg_ctx (ctx)) + tskred_temp = lookup_decl (tskred_temp, ctx); tree v2 = create_tmp_var (sizetype); g = gimple_build_assign (v2, NOP_EXPR, v); gimple_seq_add_stmt (ilist, g); @@ -3890,8 +3900,7 @@ lower_rec_input_clauses (tree clauses, gimple_seq *ilist, gimple_seq *dlist, break; case OMP_CLAUSE_REDUCTION: case OMP_CLAUSE_IN_REDUCTION: - if (is_task_ctx (ctx) - || (OMP_CLAUSE_REDUCTION_TASK (c) && is_parallel_ctx (ctx))) + if (is_task_ctx (ctx) || OMP_CLAUSE_REDUCTION_TASK (c)) { task_reduction_p = true; if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_REDUCTION) @@ -3923,8 +3932,11 @@ lower_rec_input_clauses (tree clauses, gimple_seq *ilist, gimple_seq *dlist, else if (OMP_CLAUSE_REDUCTION_OMP_ORIG_REF (c)) reduction_omp_orig_ref = true; break; - case OMP_CLAUSE__LOOPTEMP_: case OMP_CLAUSE__REDUCTEMP_: + if (!is_taskreg_ctx (ctx)) + continue; + /* FALLTHRU */ + case OMP_CLAUSE__LOOPTEMP_: /* Handle _looptemp_/_reductemp_ clauses only on parallel/task. */ if (fd) @@ -5761,8 +5773,7 @@ lower_reduction_clauses (tree clauses, gimple_seq *stmt_seqp, omp_context *ctx) update in that case, otherwise use a lock. */ for (c = clauses; c && count < 2; c = OMP_CLAUSE_CHAIN (c)) if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_REDUCTION - && (!OMP_CLAUSE_REDUCTION_TASK (c) - || !is_parallel_ctx (ctx))) + && !OMP_CLAUSE_REDUCTION_TASK (c)) { if (OMP_CLAUSE_REDUCTION_PLACEHOLDER (c) || TREE_CODE (OMP_CLAUSE_DECL (c)) == MEM_REF) @@ -5784,8 +5795,7 @@ lower_reduction_clauses (tree clauses, gimple_seq *stmt_seqp, omp_context *ctx) location_t clause_loc = OMP_CLAUSE_LOCATION (c); if (OMP_CLAUSE_CODE (c) != OMP_CLAUSE_REDUCTION - || (OMP_CLAUSE_REDUCTION_TASK (c) - && is_parallel_ctx (ctx))) + || OMP_CLAUSE_REDUCTION_TASK (c)) continue; enum omp_clause_code ccode = OMP_CLAUSE_REDUCTION; @@ -6078,9 +6088,7 @@ lower_send_clauses (tree clauses, gimple_seq *ilist, gimple_seq *olist, case OMP_CLAUSE__REDUCTEMP_: break; case OMP_CLAUSE_REDUCTION: - if (is_task_ctx (ctx)) - continue; - if (OMP_CLAUSE_REDUCTION_TASK (c) && is_parallel_ctx (ctx)) + if (is_task_ctx (ctx) || OMP_CLAUSE_REDUCTION_TASK (c)) continue; break; case OMP_CLAUSE_SHARED: @@ -6511,30 +6519,55 @@ maybe_catch_exception (gimple_seq body) cancellation in the implicit barrier. 
*/ static void -maybe_add_implicit_barrier_cancel (omp_context *ctx, gimple_seq *body) +maybe_add_implicit_barrier_cancel (omp_context *ctx, gimple *omp_return, + gimple_seq *body) { - gimple *omp_return = gimple_seq_last_stmt (*body); gcc_assert (gimple_code (omp_return) == GIMPLE_OMP_RETURN); if (gimple_omp_return_nowait_p (omp_return)) return; - if (ctx->outer - && gimple_code (ctx->outer->stmt) == GIMPLE_OMP_PARALLEL - && ctx->outer->cancellable) - { - tree fndecl = builtin_decl_explicit (BUILT_IN_GOMP_CANCEL); - tree c_bool_type = TREE_TYPE (TREE_TYPE (fndecl)); - tree lhs = create_tmp_var (c_bool_type); - gimple_omp_return_set_lhs (omp_return, lhs); - tree fallthru_label = create_artificial_label (UNKNOWN_LOCATION); - gimple *g = gimple_build_cond (NE_EXPR, lhs, - fold_convert (c_bool_type, - boolean_false_node), - ctx->outer->cancel_label, fallthru_label); - gimple_seq_add_stmt (body, g); - gimple_seq_add_stmt (body, gimple_build_label (fallthru_label)); + for (omp_context *outer = ctx->outer; outer; outer = outer->outer) + if (gimple_code (outer->stmt) == GIMPLE_OMP_PARALLEL + && outer->cancellable) + { + tree fndecl = builtin_decl_explicit (BUILT_IN_GOMP_CANCEL); + tree c_bool_type = TREE_TYPE (TREE_TYPE (fndecl)); + tree lhs = create_tmp_var (c_bool_type); + gimple_omp_return_set_lhs (omp_return, lhs); + tree fallthru_label = create_artificial_label (UNKNOWN_LOCATION); + gimple *g = gimple_build_cond (NE_EXPR, lhs, + fold_convert (c_bool_type, + boolean_false_node), + outer->cancel_label, fallthru_label); + gimple_seq_add_stmt (body, g); + gimple_seq_add_stmt (body, gimple_build_label (fallthru_label)); + } + else if (gimple_code (outer->stmt) != GIMPLE_OMP_TASKGROUP) + return; +} + +/* Find the first task_reduction or reduction clause or return NULL + if there are none. */ + +static inline tree +omp_task_reductions_find_first (tree clauses, enum tree_code code, + enum omp_clause_code ccode) +{ + while (1) + { + clauses = omp_find_clause (clauses, ccode); + if (clauses == NULL_TREE) + return NULL_TREE; + if (ccode != OMP_CLAUSE_REDUCTION + || code == OMP_TASKLOOP + || OMP_CLAUSE_REDUCTION_TASK (clauses)) + return clauses; + clauses = OMP_CLAUSE_CHAIN (clauses); } } +static void lower_omp_task_reductions (omp_context *, enum tree_code, tree, + gimple_seq *, gimple_seq *); + /* Lower the OpenMP sections directive in the current statement in GSI_P. CTX is the enclosing OMP context for the current statement. 
*/ @@ -6546,7 +6579,7 @@ lower_omp_sections (gimple_stmt_iterator *gsi_p, omp_context *ctx) gomp_sections *stmt; gimple *t; gbind *new_stmt, *bind; - gimple_seq ilist, dlist, olist, new_body; + gimple_seq ilist, dlist, olist, tred_dlist = NULL, new_body; stmt = as_a (gsi_stmt (*gsi_p)); @@ -6554,6 +6587,27 @@ lower_omp_sections (gimple_stmt_iterator *gsi_p, omp_context *ctx) dlist = NULL; ilist = NULL; + + tree rclauses + = omp_task_reductions_find_first (gimple_omp_sections_clauses (stmt), + OMP_SECTIONS, OMP_CLAUSE_REDUCTION); + tree rtmp = NULL_TREE; + if (rclauses) + { + tree type = build_pointer_type (pointer_sized_int_node); + tree temp = create_tmp_var (type); + tree c = build_omp_clause (UNKNOWN_LOCATION, OMP_CLAUSE__REDUCTEMP_); + OMP_CLAUSE_DECL (c) = temp; + OMP_CLAUSE_CHAIN (c) = gimple_omp_sections_clauses (stmt); + gimple_omp_sections_set_clauses (stmt, c); + lower_omp_task_reductions (ctx, OMP_SECTIONS, + gimple_omp_sections_clauses (stmt), + &ilist, &tred_dlist); + rclauses = c; + rtmp = make_ssa_name (type); + gimple_seq_add_stmt (&ilist, gimple_build_assign (rtmp, temp)); + } + lower_rec_input_clauses (gimple_omp_sections_clauses (stmt), &ilist, &dlist, ctx, NULL); @@ -6625,7 +6679,11 @@ lower_omp_sections (gimple_stmt_iterator *gsi_p, omp_context *ctx) OMP_CLAUSE_NOWAIT) != NULL_TREE; t = gimple_build_omp_return (nowait); gimple_seq_add_stmt (&new_body, t); - maybe_add_implicit_barrier_cancel (ctx, &new_body); + gimple_seq_add_seq (&new_body, tred_dlist); + maybe_add_implicit_barrier_cancel (ctx, t, &new_body); + + if (rclauses) + OMP_CLAUSE_DECL (rclauses) = rtmp; gimple_bind_set_body (new_stmt, new_body); } @@ -6787,7 +6845,7 @@ lower_omp_single (gimple_stmt_iterator *gsi_p, omp_context *ctx) OMP_CLAUSE_NOWAIT) != NULL_TREE; gimple *g = gimple_build_omp_return (nowait); gimple_seq_add_stmt (&bind_body_tail, g); - maybe_add_implicit_barrier_cancel (ctx, &bind_body_tail); + maybe_add_implicit_barrier_cancel (ctx, g, &bind_body_tail); if (ctx->record_type) { gimple_stmt_iterator gsi = gsi_start (bind_body_tail); @@ -6849,26 +6907,6 @@ lower_omp_master (gimple_stmt_iterator *gsi_p, omp_context *ctx) BLOCK_VARS (block) = ctx->block_vars; } -/* Find the first task_reduction or reduction clause or return NULL - if there are none. */ - -static inline tree -omp_task_reductions_find_first (tree clauses, enum tree_code code, - enum omp_clause_code ccode) -{ - while (1) - { - clauses = omp_find_clause (clauses, ccode); - if (clauses == NULL_TREE) - return NULL_TREE; - if (ccode != OMP_CLAUSE_REDUCTION - || code == OMP_TASKLOOP - || OMP_CLAUSE_REDUCTION_TASK (clauses)) - return clauses; - clauses = OMP_CLAUSE_CHAIN (clauses); - } -} - /* Helper function for lower_omp_task_reductions. For a specific PASS find out the current clause it should be processed, or return false if all have been processed already. */ @@ -6918,12 +6956,35 @@ lower_omp_task_reductions (omp_context *ctx, enum tree_code code, tree clauses, enum omp_clause_code ccode = (code == OMP_TASKGROUP ? 
OMP_CLAUSE_TASK_REDUCTION : OMP_CLAUSE_REDUCTION); + tree cancellable = NULL_TREE; clauses = omp_task_reductions_find_first (clauses, code, ccode); if (clauses == NULL_TREE) return; + if (code == OMP_FOR || code == OMP_SECTIONS) + { + for (omp_context *outer = ctx->outer; outer; outer = outer->outer) + if (gimple_code (outer->stmt) == GIMPLE_OMP_PARALLEL + && outer->cancellable) + { + cancellable = error_mark_node; + break; + } + else if (gimple_code (outer->stmt) != GIMPLE_OMP_TASKGROUP) + break; + } tree record_type = lang_hooks.types.make_type (RECORD_TYPE); tree *last = &TYPE_FIELDS (record_type); unsigned cnt = 0; + if (cancellable) + { + tree field = build_decl (UNKNOWN_LOCATION, FIELD_DECL, NULL_TREE, + ptr_type_node); + tree ifield = build_decl (UNKNOWN_LOCATION, FIELD_DECL, NULL_TREE, + integer_type_node); + *last = field; + DECL_CHAIN (field) = ifield; + last = &DECL_CHAIN (ifield); + } for (int pass = 0; pass < 2; pass++) { tree decl, type, next; @@ -7010,7 +7071,49 @@ lower_omp_task_reductions (omp_context *ctx, enum tree_code code, tree clauses, tree idx = create_tmp_var (size_type_node); gimple_seq_add_stmt (end, gimple_build_assign (idx, size_zero_node)); tree num_thr_sz = create_tmp_var (size_type_node); + tree lab1 = create_artificial_label (UNKNOWN_LOCATION); + tree lab2 = create_artificial_label (UNKNOWN_LOCATION); + tree lab3 = NULL_TREE; gimple *g; + if (code == OMP_FOR || code == OMP_SECTIONS) + { + /* For worksharing constructs, only perform it in the master thread, + with the exception of cancelled implicit barriers - then only handle + the current thread. */ + tree lab4 = create_artificial_label (UNKNOWN_LOCATION); + t = builtin_decl_explicit (BUILT_IN_OMP_GET_THREAD_NUM); + tree thr_num = create_tmp_var (integer_type_node); + g = gimple_build_call (t, 0); + gimple_call_set_lhs (g, thr_num); + gimple_seq_add_stmt (end, g); + if (cancellable) + { + tree c; + tree lab5 = create_artificial_label (UNKNOWN_LOCATION); + tree lab6 = create_artificial_label (UNKNOWN_LOCATION); + lab3 = create_artificial_label (UNKNOWN_LOCATION); + if (code == OMP_FOR) + c = gimple_omp_for_clauses (ctx->stmt); + else if (code == OMP_SECTIONS) + c = gimple_omp_sections_clauses (ctx->stmt); + c = OMP_CLAUSE_DECL (omp_find_clause (c, OMP_CLAUSE__REDUCTEMP_)); + cancellable = c; + g = gimple_build_cond (NE_EXPR, c, build_zero_cst (TREE_TYPE (c)), + lab5, lab6); + gimple_seq_add_stmt (end, g); + gimple_seq_add_stmt (end, gimple_build_label (lab5)); + g = gimple_build_assign (idx, NOP_EXPR, thr_num); + gimple_seq_add_stmt (end, g); + g = gimple_build_assign (num_thr_sz, PLUS_EXPR, idx, + build_one_cst (TREE_TYPE (idx))); + gimple_seq_add_stmt (end, g); + gimple_seq_add_stmt (end, gimple_build_goto (lab3)); + gimple_seq_add_stmt (end, gimple_build_label (lab6)); + } + g = gimple_build_cond (NE_EXPR, thr_num, integer_zero_node, lab2, lab4); + gimple_seq_add_stmt (end, g); + gimple_seq_add_stmt (end, gimple_build_label (lab4)); + } if (code != OMP_PARALLEL) { t = builtin_decl_explicit (BUILT_IN_OMP_GET_NUM_THREADS); @@ -7020,6 +7123,8 @@ lower_omp_task_reductions (omp_context *ctx, enum tree_code code, tree clauses, gimple_seq_add_stmt (end, g); g = gimple_build_assign (num_thr_sz, NOP_EXPR, num_thr); gimple_seq_add_stmt (end, g); + if (cancellable) + gimple_seq_add_stmt (end, gimple_build_label (lab3)); } else { @@ -7033,8 +7138,6 @@ lower_omp_task_reductions (omp_context *ctx, enum tree_code code, tree clauses, NULL_TREE, NULL_TREE); tree data = create_tmp_var (pointer_sized_int_node); 
gimple_seq_add_stmt (end, gimple_build_assign (data, t)); - tree lab1 = create_artificial_label (UNKNOWN_LOCATION); - tree lab2 = create_artificial_label (UNKNOWN_LOCATION); gimple_seq_add_stmt (end, gimple_build_label (lab1)); tree ptr; if (TREE_CODE (TYPE_SIZE_UNIT (record_type)) == INTEGER_CST) @@ -7045,6 +7148,8 @@ lower_omp_task_reductions (omp_context *ctx, enum tree_code code, tree clauses, tree field = TYPE_FIELDS (record_type); cnt = 0; + if (cancellable) + field = DECL_CHAIN (DECL_CHAIN (field)); for (int pass = 0; pass < 2; pass++) { tree decl, type, next; @@ -7117,9 +7222,9 @@ lower_omp_task_reductions (omp_context *ctx, enum tree_code code, tree clauses, tree bfield = DECL_CHAIN (field); tree cond; - if (code == OMP_PARALLEL) - /* In parallel all threads unconditionally initialize all their - task reduction private variables. */ + if (code == OMP_PARALLEL || code == OMP_FOR || code == OMP_SECTIONS) + /* In parallel or worksharing all threads unconditionally + initialize all their task reduction private variables. */ cond = boolean_true_node; else if (TREE_TYPE (ptr) == ptr_type_node) { @@ -7143,6 +7248,18 @@ lower_omp_task_reductions (omp_context *ctx, enum tree_code code, tree clauses, lab3, lab4); gimple_seq_add_stmt (end, g); gimple_seq_add_stmt (end, gimple_build_label (lab3)); + if (cancellable && OMP_CLAUSE_REDUCTION_PLACEHOLDER (c) == NULL_TREE) + { + /* If this reduction doesn't need destruction and parallel + has been cancelled, there is nothing to do for this + reduction, so jump around the merge operation. */ + tree lab5 = create_artificial_label (UNKNOWN_LOCATION); + g = gimple_build_cond (NE_EXPR, cancellable, + build_zero_cst (TREE_TYPE (cancellable)), + lab4, lab5); + gimple_seq_add_stmt (end, g); + gimple_seq_add_stmt (end, gimple_build_label (lab5)); + } tree new_var; if (TREE_TYPE (ptr) == ptr_type_node) @@ -7202,6 +7319,20 @@ lower_omp_task_reductions (omp_context *ctx, enum tree_code code, tree clauses, tree placeholder = OMP_CLAUSE_REDUCTION_PLACEHOLDER (c); tree decl_placeholder = OMP_CLAUSE_REDUCTION_DECL_PLACEHOLDER (c); + tree lab6 = NULL_TREE; + if (cancellable) + { + /* If this reduction needs destruction and parallel + has been cancelled, jump around the merge operation + to the destruction. */ + tree lab5 = create_artificial_label (UNKNOWN_LOCATION); + lab6 = create_artificial_label (UNKNOWN_LOCATION); + tree zero = build_zero_cst (TREE_TYPE (cancellable)); + g = gimple_build_cond (NE_EXPR, cancellable, zero, + lab6, lab5); + gimple_seq_add_stmt (end, g); + gimple_seq_add_stmt (end, gimple_build_label (lab5)); + } SET_DECL_VALUE_EXPR (placeholder, out); DECL_HAS_VALUE_EXPR_P (placeholder) = 1; SET_DECL_VALUE_EXPR (decl_placeholder, priv); @@ -7215,6 +7346,8 @@ lower_omp_task_reductions (omp_context *ctx, enum tree_code code, tree clauses, OMP_CLAUSE_REDUCTION_PLACEHOLDER (c) = NULL; OMP_CLAUSE_REDUCTION_DECL_PLACEHOLDER (c) = NULL; } + if (cancellable) + gimple_seq_add_stmt (end, gimple_build_label (lab6)); tree x = lang_hooks.decls.omp_clause_dtor (c, priv); if (x) { @@ -7247,7 +7380,20 @@ lower_omp_task_reductions (omp_context *ctx, enum tree_code code, tree clauses, { tree placeholder = OMP_CLAUSE_REDUCTION_PLACEHOLDER (c); tree oldv = NULL_TREE; - + tree lab6 = NULL_TREE; + if (cancellable) + { + /* If this reduction needs destruction and parallel + has been cancelled, jump around the merge operation + to the destruction. 
*/ + tree lab5 = create_artificial_label (UNKNOWN_LOCATION); + lab6 = create_artificial_label (UNKNOWN_LOCATION); + tree zero = build_zero_cst (TREE_TYPE (cancellable)); + g = gimple_build_cond (NE_EXPR, cancellable, zero, + lab6, lab5); + gimple_seq_add_stmt (end, g); + gimple_seq_add_stmt (end, gimple_build_label (lab5)); + } if (omp_is_reference (decl) && !useless_type_conversion_p (TREE_TYPE (placeholder), TREE_TYPE (ref))) @@ -7283,6 +7429,8 @@ lower_omp_task_reductions (omp_context *ctx, enum tree_code code, tree clauses, OMP_CLAUSE_REDUCTION_GIMPLE_MERGE (c) = NULL; if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_TASK_REDUCTION) OMP_CLAUSE_REDUCTION_PLACEHOLDER (c) = NULL; + if (cancellable) + gimple_seq_add_stmt (end, gimple_build_label (lab6)); tree x = lang_hooks.decls.omp_clause_dtor (c, new_var); if (x) { @@ -7309,10 +7457,16 @@ lower_omp_task_reductions (omp_context *ctx, enum tree_code code, tree clauses, g = gimple_build_call (t, 1, build_fold_addr_expr (avar)); gimple_seq_add_stmt (start, g); } - else if (code == OMP_TASKLOOP || code == OMP_PARALLEL) + else { - tree c = omp_find_clause (gimple_omp_taskreg_clauses (ctx->stmt), - OMP_CLAUSE__REDUCTEMP_); + tree c; + if (code == OMP_FOR) + c = gimple_omp_for_clauses (ctx->stmt); + else if (code == OMP_SECTIONS) + c = gimple_omp_sections_clauses (ctx->stmt); + else + c = gimple_omp_taskreg_clauses (ctx->stmt); + c = omp_find_clause (c, OMP_CLAUSE__REDUCTEMP_); t = fold_convert (TREE_TYPE (OMP_CLAUSE_DECL (c)), build_fold_addr_expr (avar)); gimplify_assign (OMP_CLAUSE_DECL (c), t, start); @@ -7324,8 +7478,28 @@ lower_omp_task_reductions (omp_context *ctx, enum tree_code code, tree clauses, g = gimple_build_cond (NE_EXPR, idx, num_thr_sz, lab1, lab2); gimple_seq_add_stmt (end, g); gimple_seq_add_stmt (end, gimple_build_label (lab2)); - t = builtin_decl_explicit (BUILT_IN_GOMP_TASKGROUP_REDUCTION_UNREGISTER); - g = gimple_build_call (t, 1, build_fold_addr_expr (avar)); + if (code == OMP_FOR || code == OMP_SECTIONS) + { + enum built_in_function bfn + = BUILT_IN_GOMP_WORKSHARE_TASK_REDUCTION_UNREGISTER; + t = builtin_decl_explicit (bfn); + tree c_bool_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (t))); + tree arg; + if (cancellable) + { + arg = create_tmp_var (c_bool_type); + gimple_seq_add_stmt (end, gimple_build_assign (arg, NOP_EXPR, + cancellable)); + } + else + arg = build_int_cst (c_bool_type, 0); + g = gimple_build_call (t, 1, arg); + } + else + { + t = builtin_decl_explicit (BUILT_IN_GOMP_TASKGROUP_REDUCTION_UNREGISTER); + g = gimple_build_call (t, 1, build_fold_addr_expr (avar)); + } gimple_seq_add_stmt (end, g); t = build_constructor (atype, NULL); TREE_THIS_VOLATILE (t) = 1; @@ -7953,7 +8127,8 @@ lower_omp_for (gimple_stmt_iterator *gsi_p, omp_context *ctx) struct omp_for_data fd, *fdp = NULL; gomp_for *stmt = as_a (gsi_stmt (*gsi_p)); gbind *new_stmt; - gimple_seq omp_for_body, body, dlist; + gimple_seq omp_for_body, body, dlist, tred_ilist = NULL, tred_dlist = NULL; + gimple_seq cnt_list = NULL; gimple_seq oacc_head = NULL, oacc_tail = NULL; size_t i; @@ -8046,9 +8221,30 @@ lower_omp_for (gimple_stmt_iterator *gsi_p, omp_context *ctx) /* The pre-body and input clauses go before the lowered GIMPLE_OMP_FOR. 
*/ dlist = NULL; body = NULL; + tree rclauses + = omp_task_reductions_find_first (gimple_omp_for_clauses (stmt), OMP_FOR, + OMP_CLAUSE_REDUCTION); + tree rtmp = NULL_TREE; + if (rclauses) + { + tree type = build_pointer_type (pointer_sized_int_node); + tree temp = create_tmp_var (type); + tree c = build_omp_clause (UNKNOWN_LOCATION, OMP_CLAUSE__REDUCTEMP_); + OMP_CLAUSE_DECL (c) = temp; + OMP_CLAUSE_CHAIN (c) = gimple_omp_for_clauses (stmt); + gimple_omp_for_set_clauses (stmt, c); + lower_omp_task_reductions (ctx, OMP_FOR, + gimple_omp_for_clauses (stmt), + &tred_ilist, &tred_dlist); + rclauses = c; + rtmp = make_ssa_name (type); + gimple_seq_add_stmt (&body, gimple_build_assign (rtmp, temp)); + } + lower_rec_input_clauses (gimple_omp_for_clauses (stmt), &body, &dlist, ctx, fdp); - gimple_seq_add_seq (&body, gimple_omp_for_pre_body (stmt)); + gimple_seq_add_seq (rclauses ? &tred_ilist : &body, + gimple_omp_for_pre_body (stmt)); lower_omp (gimple_omp_body_ptr (stmt), ctx); @@ -8063,20 +8259,24 @@ lower_omp_for (gimple_stmt_iterator *gsi_p, omp_context *ctx) { rhs_p = gimple_omp_for_initial_ptr (stmt, i); if (!is_gimple_min_invariant (*rhs_p)) - *rhs_p = get_formal_tmp_var (*rhs_p, &body); + *rhs_p = get_formal_tmp_var (*rhs_p, &cnt_list); else if (TREE_CODE (*rhs_p) == ADDR_EXPR) recompute_tree_invariant_for_addr_expr (*rhs_p); rhs_p = gimple_omp_for_final_ptr (stmt, i); if (!is_gimple_min_invariant (*rhs_p)) - *rhs_p = get_formal_tmp_var (*rhs_p, &body); + *rhs_p = get_formal_tmp_var (*rhs_p, &cnt_list); else if (TREE_CODE (*rhs_p) == ADDR_EXPR) recompute_tree_invariant_for_addr_expr (*rhs_p); rhs_p = &TREE_OPERAND (gimple_omp_for_incr (stmt, i), 1); if (!is_gimple_min_invariant (*rhs_p)) - *rhs_p = get_formal_tmp_var (*rhs_p, &body); + *rhs_p = get_formal_tmp_var (*rhs_p, &cnt_list); } + if (rclauses) + gimple_seq_add_seq (&tred_ilist, cnt_list); + else + gimple_seq_add_seq (&body, cnt_list); /* Once lowered, extract the bounds and clauses. */ omp_extract_for_data (stmt, &fd, NULL); @@ -8123,13 +8323,26 @@ lower_omp_for (gimple_stmt_iterator *gsi_p, omp_context *ctx) gimple_seq_add_seq (&body, dlist); + if (rclauses) + { + gimple_seq_add_seq (&tred_ilist, body); + body = tred_ilist; + } + body = maybe_catch_exception (body); if (!phony_loop) { /* Region exit marker goes at the end of the loop body. */ - gimple_seq_add_stmt (&body, gimple_build_omp_return (fd.have_nowait)); - maybe_add_implicit_barrier_cancel (ctx, &body); + gimple *g = gimple_build_omp_return (fd.have_nowait); + gimple_seq_add_stmt (&body, g); + + gimple_seq_add_seq (&body, tred_dlist); + + maybe_add_implicit_barrier_cancel (ctx, g, &body); + + if (rclauses) + OMP_CLAUSE_DECL (rclauses) = rtmp; } /* Add OpenACC joining and reduction markers just after the loop. */ diff --git a/libgomp/ChangeLog.gomp b/libgomp/ChangeLog.gomp index cddc1365d05d..0f459346aeea 100644 --- a/libgomp/ChangeLog.gomp +++ b/libgomp/ChangeLog.gomp @@ -1,3 +1,107 @@ +2018-11-07 Jakub Jelinek + + * libgomp_g.h (GOMP_loop_start, GOMP_loop_ordered_start, + GOMP_loop_doacross_start, GOMP_loop_ull_start, + GOMP_loop_ull_ordered_start, GOMP_loop_ull_doacross_start, + GOMP_workshare_task_reduction_unregister, GOMP_sections2_start): New + prototypes. + * libgomp.h (struct gomp_doacross_work_share): Add extra field. + (struct gomp_work_share): Add task_reductions field. + (struct gomp_taskgroup): Add workshare flag. + (gomp_doacross_init, gomp_doacross_ull_init): Add size_t argument. 
+ (gomp_workshare_taskgroup_start, + gomp_workshare_task_reduction_register): New prototypes. + (gomp_init_work_share, gomp_work_share_start): Change bool argument + to size_t. + * libgomp.map (GOMP_5.0): Export GOMP_loop_start, + GOMP_loop_ordered_start, GOMP_loop_doacross_start, + GOMP_loop_ull_start, GOMP_loop_ull_ordered_start, + GOMP_loop_ull_doacross_start, + GOMP_workshare_task_reduction_unregister and GOMP_sections2_start. + * loop.c: Include string.h. + (GOMP_loop_runtime_next): Add ialias. + (GOMP_taskgroup_reduction_register): Add ialias_redirect. + (gomp_loop_static_start, gomp_loop_dynamic_start, + gomp_loop_guided_start, gomp_loop_ordered_static_start, + gomp_loop_ordered_dynamic_start, gomp_loop_ordered_guided_start, + gomp_loop_doacross_static_start, gomp_loop_doacross_dynamic_start, + gomp_loop_doacross_guided_start): Adjust gomp_work_share_start + or gomp_doacross_init callers. + (gomp_adjust_sched, GOMP_loop_start, GOMP_loop_ordered_start, + GOMP_loop_doacross_start): New functions. + * loop_ull.c: Include string.h. + (GOMP_loop_ull_runtime_next): Add ialias. + (GOMP_taskgroup_reduction_register): Add ialias_redirect. + (gomp_loop_ull_static_start, gomp_loop_ull_dynamic_start, + gomp_loop_ull_guided_start, gomp_loop_ull_ordered_static_start, + gomp_loop_ull_ordered_dynamic_start, + gomp_loop_ull_ordered_guided_start, + gomp_loop_ull_doacross_static_start, + gomp_loop_ull_doacross_dynamic_start, + gomp_loop_ull_doacross_guided_start): Adjust gomp_work_share_start + and gomp_doacross_ull_init callers. + (gomp_adjust_sched, GOMP_loop_ull_start, GOMP_loop_ull_ordered_start, + GOMP_loop_ull_doacross_start): New functions. + * sections.c: Include string.h. + (GOMP_taskgroup_reduction_register): Add ialias_redirect. + (GOMP_sections_start): Adjust gomp_work_share_start caller. + (GOMP_sections2_start): New function. + * ordered.c (gomp_doacross_init, gomp_doacross_ull_init): Add + EXTRA argument. If not needed to prepare array, if extra is 0, + clear ws->doacross, otherwise allocate just doacross structure and + extra payload. If array is needed, allocate also extra payload. + (GOMP_doacross_post, GOMP_doacross_wait, GOMP_doacross_ull_post, + GOMP_doacross_ull_wait): Handle doacross->array == NULL like + doacross == NULL. + * parallel.c (GOMP_cancellation_point): If taskgroup has workshare + flag set, check cancelled of prev taskgroup if any. + (GOMP_cancel): If taskgroup has workshare flag set, set cancelled + on prev taskgroup if any. + * single.c (GOMP_single_start, GOMP_single_copy_start): Adjust + gomp_work_share_start callers. + * target.c (GOMP_target_update_ext, GOMP_target_enter_exit_data): + If taskgroup has workshare flag set, check cancelled on prev + taskgroup if any. Guard all cancellation tests with + gomp_cancel_var test. + * taskloop.c (GOMP_taskloop): Likewise. + * task.c (GOMP_task, gomp_create_target_task, gomp_task_run_pre, + GOMP_taskwait_depend): Likewise. + (gomp_taskgroup_init): Clear workshare flag, reorder initialization. + (gomp_reduction_register): Add always_inline attribute. Add + ORIG argument, if non-NULL, don't allocate memory, but copy it + from there. + (gomp_create_artificial_team): New function. + (GOMP_taskgroup_reduction_register): Extend function comment. + Use gomp_create_artificial_team. Adjust gomp_reduction_register + caller. + (gomp_parallel_reduction_register): Adjust gomp_reduction_register + caller. 
+ (gomp_workshare_task_reduction_register, + gomp_workshare_taskgroup_start, + GOMP_workshare_task_reduction_unregister): New functions. + * team.c (gomp_new_team): Adjust gomp_init_work_share caller. + * work.c (gomp_init_work_share): Change ORDERED argument from + bool to size_t, if more than 1 allocate also extra payload at the + end of array. Never keep ordered_team_ids NULL, set it + to inline_ordered_team_ids instead. + (gomp_work_share_start): Change ORDERED argument from bool to size_t, + return true instead of ws. + * testsuite/libgomp.c-c++-common/cancel-parallel-1.c: New test. + * testsuite/libgomp.c-c++-common/cancel-taskgroup-3.c: New test. + * testsuite/libgomp.c-c++-common/task-reduction-6.c (struct S): + Use unsigned long long int instead of unsigned long int. + (main): Verify r == t. + * testsuite/libgomp.c-c++-common/task-reduction-8.c: New test. + * testsuite/libgomp.c-c++-common/task-reduction-9.c: New test. + * testsuite/libgomp.c-c++-common/task-reduction-11.c: New test. + * testsuite/libgomp.c-c++-common/task-reduction-12.c: New test. + * testsuite/libgomp.c++/task-reduction-14.C: New test. + * testsuite/libgomp.c++/task-reduction-15.C: New test. + * testsuite/libgomp.c++/task-reduction-16.C: New test. + * testsuite/libgomp.c++/task-reduction-17.C: New test. + * testsuite/libgomp.c++/task-reduction-18.C: New test. + * testsuite/libgomp.c++/task-reduction-19.C: New test. + 2018-10-26 Jakub Jelinek * libgomp.h (GOMP_HAVE_EFFICIENT_ALIGNED_ALLOC): Define unless diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h index 9728c8e5f687..828e9b0095b3 100644 --- a/libgomp/libgomp.h +++ b/libgomp/libgomp.h @@ -188,6 +188,8 @@ struct gomp_doacross_work_share /* Likewise, but for the ull implementation. */ unsigned long long boundary_ull; }; + /* Pointer to extra memory if needed for lastprivate(conditional). */ + void *extra; /* Array of shift counts for each dimension if they can be flattened. */ unsigned int shift_counts[]; }; @@ -289,6 +291,9 @@ struct gomp_work_share struct gomp_work_share *next_free; }; + /* Task reductions for this work-sharing construct. */ + uintptr_t *task_reductions; + /* If only few threads are in the team, ordered_team_ids can point to this array which fills the padding at the end of this struct. 
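The GOMP_loop_start/GOMP_sections2_start entry points added in this patch reuse this same inline space for the extra memory the compiler can request through their MEM argument: a request of SIZE bytes is served from the padding whenever SIZE <= sizeof (struct gomp_work_share) - offsetof (struct gomp_work_share, inline_ordered_team_ids), and is gomp_malloc_cleared'ed separately otherwise. With illustrative numbers only: if the struct were 512 bytes and the flexible array started at offset 424, requests of up to 88 bytes would need no extra allocation.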
*/ unsigned inline_ordered_team_ids[0]; @@ -490,6 +495,7 @@ struct gomp_taskgroup uintptr_t *reductions; bool in_taskgroup_wait; bool cancelled; + bool workshare; gomp_sem_t taskgroup_sem; size_t num_children; }; @@ -795,9 +801,9 @@ extern void gomp_ordered_next (void); extern void gomp_ordered_static_init (void); extern void gomp_ordered_static_next (void); extern void gomp_ordered_sync (void); -extern void gomp_doacross_init (unsigned, long *, long); +extern void gomp_doacross_init (unsigned, long *, long, size_t); extern void gomp_doacross_ull_init (unsigned, unsigned long long *, - unsigned long long); + unsigned long long, size_t); /* parallel.c */ @@ -822,6 +828,8 @@ extern bool gomp_create_target_task (struct gomp_device_descr *, enum gomp_target_task_state); extern struct gomp_taskgroup *gomp_parallel_reduction_register (uintptr_t *, unsigned); +extern void gomp_workshare_taskgroup_start (void); +extern void gomp_workshare_task_reduction_register (uintptr_t *, uintptr_t *); static void inline gomp_finish_task (struct gomp_task *task) @@ -1061,9 +1069,9 @@ extern bool gomp_remove_var (struct gomp_device_descr *, splay_tree_key); /* work.c */ -extern void gomp_init_work_share (struct gomp_work_share *, bool, unsigned); +extern void gomp_init_work_share (struct gomp_work_share *, size_t, unsigned); extern void gomp_fini_work_share (struct gomp_work_share *); -extern bool gomp_work_share_start (bool); +extern bool gomp_work_share_start (size_t); extern void gomp_work_share_end (void); extern bool gomp_work_share_end_cancel (void); extern void gomp_work_share_end_nowait (void); diff --git a/libgomp/libgomp.map b/libgomp/libgomp.map index 0ea7578e027b..4c19a259eaa6 100644 --- a/libgomp/libgomp.map +++ b/libgomp/libgomp.map @@ -316,22 +316,30 @@ GOMP_4.5 { GOMP_5.0 { global: + GOMP_loop_doacross_start; GOMP_loop_maybe_nonmonotonic_runtime_next; GOMP_loop_maybe_nonmonotonic_runtime_start; GOMP_loop_nonmonotonic_runtime_next; GOMP_loop_nonmonotonic_runtime_start; + GOMP_loop_ordered_start; + GOMP_loop_start; + GOMP_loop_ull_doacross_start; GOMP_loop_ull_maybe_nonmonotonic_runtime_next; GOMP_loop_ull_maybe_nonmonotonic_runtime_start; GOMP_loop_ull_nonmonotonic_runtime_next; GOMP_loop_ull_nonmonotonic_runtime_start; + GOMP_loop_ull_ordered_start; + GOMP_loop_ull_start; GOMP_parallel_loop_maybe_nonmonotonic_runtime; GOMP_parallel_loop_nonmonotonic_runtime; GOMP_parallel_reductions; + GOMP_sections2_start; GOMP_taskgroup_reduction_register; GOMP_taskgroup_reduction_unregister; GOMP_task_reduction_remap; GOMP_taskwait_depend; GOMP_teams_reg; + GOMP_workshare_task_reduction_unregister; } GOMP_4.5; OACC_2.0 { diff --git a/libgomp/libgomp_g.h b/libgomp/libgomp_g.h index 6d24a4f0192a..5b54839b29e5 100644 --- a/libgomp/libgomp_g.h +++ b/libgomp/libgomp_g.h @@ -61,6 +61,8 @@ extern bool GOMP_loop_nonmonotonic_runtime_start (long, long, long, long *, long *); extern bool GOMP_loop_maybe_nonmonotonic_runtime_start (long, long, long, long *, long *); +extern bool GOMP_loop_start (long, long, long, long, long, long *, long *, + uintptr_t *, void **); extern bool GOMP_loop_ordered_static_start (long, long, long, long, long *, long *); @@ -69,6 +71,8 @@ extern bool GOMP_loop_ordered_dynamic_start (long, long, long, long, extern bool GOMP_loop_ordered_guided_start (long, long, long, long, long *, long *); extern bool GOMP_loop_ordered_runtime_start (long, long, long, long *, long *); +extern bool GOMP_loop_ordered_start (long, long, long, long, long, long *, + long *, uintptr_t *, void **); extern bool 
GOMP_loop_static_next (long *, long *); extern bool GOMP_loop_dynamic_next (long *, long *); @@ -92,6 +96,8 @@ extern bool GOMP_loop_doacross_guided_start (unsigned, long *, long, long *, long *); extern bool GOMP_loop_doacross_runtime_start (unsigned, long *, long *, long *); +extern bool GOMP_loop_doacross_start (unsigned, long *, long, long, long *, + long *, uintptr_t *, void **); extern void GOMP_parallel_loop_static_start (void (*)(void *), void *, unsigned, long, long, long, long); @@ -179,6 +185,10 @@ extern bool GOMP_loop_ull_maybe_nonmonotonic_runtime_start (bool, unsigned long long, unsigned long long *, unsigned long long *); +extern bool GOMP_loop_ull_start (bool, unsigned long long, unsigned long long, + unsigned long long, long, unsigned long long, + unsigned long long *, unsigned long long *, + uintptr_t *, void **); extern bool GOMP_loop_ull_ordered_static_start (bool, unsigned long long, unsigned long long, @@ -203,6 +213,13 @@ extern bool GOMP_loop_ull_ordered_runtime_start (bool, unsigned long long, unsigned long long, unsigned long long *, unsigned long long *); +extern bool GOMP_loop_ull_ordered_start (bool, unsigned long long, + unsigned long long, + unsigned long long, long, + unsigned long long, + unsigned long long *, + unsigned long long *, + uintptr_t *, void **); extern bool GOMP_loop_ull_static_next (unsigned long long *, unsigned long long *); @@ -249,6 +266,11 @@ extern bool GOMP_loop_ull_doacross_runtime_start (unsigned, unsigned long long *, unsigned long long *, unsigned long long *); +extern bool GOMP_loop_ull_doacross_start (unsigned, unsigned long long *, + long, unsigned long long, + unsigned long long *, + unsigned long long *, + uintptr_t *, void **); /* ordered.c */ @@ -289,10 +311,12 @@ extern void GOMP_taskgroup_end (void); extern void GOMP_taskgroup_reduction_register (uintptr_t *); extern void GOMP_taskgroup_reduction_unregister (uintptr_t *); extern void GOMP_task_reduction_remap (size_t, size_t, void **); +extern void GOMP_workshare_task_reduction_unregister (bool); /* sections.c */ extern unsigned GOMP_sections_start (unsigned); +extern unsigned GOMP_sections2_start (unsigned, uintptr_t *, void **); extern unsigned GOMP_sections_next (void); extern void GOMP_parallel_sections_start (void (*) (void *), void *, unsigned, unsigned); diff --git a/libgomp/loop.c b/libgomp/loop.c index bafd89a0c20b..4e0683ba675a 100644 --- a/libgomp/loop.c +++ b/libgomp/loop.c @@ -27,9 +27,13 @@ #include <limits.h> #include <stdlib.h> +#include <string.h> #include "libgomp.h" +ialias (GOMP_loop_runtime_next) +ialias_redirect (GOMP_taskgroup_reduction_register) + /* Initialize the given work share construct from the given arguments.
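Note that with the combined GOMP_loop_start entry point added below, the SCHED argument may still be GFS_RUNTIME or GFS_AUTO when a thread arrives, so it is first normalized by gomp_adjust_sched. Roughly, per the comments in that function:

   schedule clause           SCHED value           resolved to
   static/dynamic/guided     GFS_* (possibly       itself, with the
                             | GFS_MONOTONIC)      GFS_MONOTONIC bit stripped
   runtime [monotonic]       GFS_RUNTIME           ICV run_sched_var
   runtime nonmonotonic      GFS_AUTO              ICV run_sched_var
                                                   (auto becomes static, 0)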
*/ static inline void @@ -101,7 +105,7 @@ gomp_loop_static_start (long start, long end, long incr, long chunk_size, struct gomp_thread *thr = gomp_thread (); thr->ts.static_trip = 0; - if (gomp_work_share_start (false)) + if (gomp_work_share_start (0)) { gomp_loop_init (thr->ts.work_share, start, end, incr, GFS_STATIC, chunk_size); @@ -123,7 +127,7 @@ gomp_loop_dynamic_start (long start, long end, long incr, long chunk_size, struct gomp_thread *thr = gomp_thread (); bool ret; - if (gomp_work_share_start (false)) + if (gomp_work_share_start (0)) { gomp_loop_init (thr->ts.work_share, start, end, incr, GFS_DYNAMIC, chunk_size); @@ -151,7 +155,7 @@ gomp_loop_guided_start (long start, long end, long incr, long chunk_size, struct gomp_thread *thr = gomp_thread (); bool ret; - if (gomp_work_share_start (false)) + if (gomp_work_share_start (0)) { gomp_loop_init (thr->ts.work_share, start, end, incr, GFS_GUIDED, chunk_size); @@ -197,6 +201,100 @@ GOMP_loop_runtime_start (long start, long end, long incr, } } +static long +gomp_adjust_sched (long sched, long *chunk_size) +{ + sched &= ~GFS_MONOTONIC; + switch (sched) + { + case GFS_STATIC: + case GFS_DYNAMIC: + case GFS_GUIDED: + return sched; + /* GFS_RUNTIME is used for runtime schedule without monotonic + or nonmonotonic modifiers on the clause. + GFS_RUNTIME|GFS_MONOTONIC for runtime schedule with monotonic + modifier. */ + case GFS_RUNTIME: + /* GFS_AUTO is used for runtime schedule with nonmonotonic + modifier. */ + case GFS_AUTO: + { + struct gomp_task_icv *icv = gomp_icv (false); + sched = icv->run_sched_var & ~GFS_MONOTONIC; + switch (sched) + { + case GFS_STATIC: + case GFS_DYNAMIC: + case GFS_GUIDED: + *chunk_size = icv->run_sched_chunk_size; + break; + case GFS_AUTO: + sched = GFS_STATIC; + *chunk_size = 0; + break; + default: + abort (); + } + return sched; + } + default: + abort (); + } +} + +bool +GOMP_loop_start (long start, long end, long incr, long sched, + long chunk_size, long *istart, long *iend, + uintptr_t *reductions, void **mem) +{ + struct gomp_thread *thr = gomp_thread (); + + thr->ts.static_trip = 0; + if (reductions) + gomp_workshare_taskgroup_start (); + if (gomp_work_share_start (0)) + { + sched = gomp_adjust_sched (sched, &chunk_size); + gomp_loop_init (thr->ts.work_share, start, end, incr, + sched, chunk_size); + if (reductions) + { + GOMP_taskgroup_reduction_register (reductions); + thr->task->taskgroup->workshare = true; + thr->ts.work_share->task_reductions = reductions; + } + if (mem) + { + uintptr_t size = (uintptr_t) *mem; + if (size > (sizeof (struct gomp_work_share) + - offsetof (struct gomp_work_share, + inline_ordered_team_ids))) + thr->ts.work_share->ordered_team_ids + = gomp_malloc_cleared (size); + else + memset (thr->ts.work_share->ordered_team_ids, '\0', size); + *mem = (void *) thr->ts.work_share->ordered_team_ids; + } + gomp_work_share_init_done (); + } + else + { + if (reductions) + { + uintptr_t *first_reductions = thr->ts.work_share->task_reductions; + gomp_workshare_task_reduction_register (reductions, + first_reductions); + } + if (mem) + *mem = (void *) thr->ts.work_share->ordered_team_ids; + } + + if (!istart) + return true; + return ialias_call (GOMP_loop_runtime_next) (istart, iend); +} + /* The *_ordered_*_start routines are similar. The only difference is that this work-share construct is initialized to expect an ORDERED section. 
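GOMP_loop_ordered_start below follows the same first-thread/latecomer protocol as GOMP_loop_start above; a sketch of the split, using the names from this file:

   first thread to arrive:               every later thread:
     gomp_workshare_taskgroup_start ()     gomp_workshare_taskgroup_start ()
     GOMP_taskgroup_reduction_register     gomp_workshare_task_reduction_register
     taskgroup->workshare = true             (copies the buffers the first
     ws->task_reductions = reductions         thread already allocated)

In addition, a MEM request is folded into the ORDERED count passed to gomp_work_share_start, and the pointer handed back through *MEM is aligned to __alignof__ (long long) just past the ordered_team_ids array.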
*/ @@ -207,7 +305,7 @@ gomp_loop_ordered_static_start (long start, long end, long incr, struct gomp_thread *thr = gomp_thread (); thr->ts.static_trip = 0; - if (gomp_work_share_start (true)) + if (gomp_work_share_start (1)) { gomp_loop_init (thr->ts.work_share, start, end, incr, GFS_STATIC, chunk_size); @@ -225,7 +323,7 @@ gomp_loop_ordered_dynamic_start (long start, long end, long incr, struct gomp_thread *thr = gomp_thread (); bool ret; - if (gomp_work_share_start (true)) + if (gomp_work_share_start (1)) { gomp_loop_init (thr->ts.work_share, start, end, incr, GFS_DYNAMIC, chunk_size); @@ -250,7 +348,7 @@ gomp_loop_ordered_guided_start (long start, long end, long incr, struct gomp_thread *thr = gomp_thread (); bool ret; - if (gomp_work_share_start (true)) + if (gomp_work_share_start (1)) { gomp_loop_init (thr->ts.work_share, start, end, incr, GFS_GUIDED, chunk_size); @@ -297,6 +395,81 @@ GOMP_loop_ordered_runtime_start (long start, long end, long incr, } } +bool +GOMP_loop_ordered_start (long start, long end, long incr, long sched, + long chunk_size, long *istart, long *iend, + uintptr_t *reductions, void **mem) +{ + struct gomp_thread *thr = gomp_thread (); + size_t ordered = 1; + bool ret; + + thr->ts.static_trip = 0; + if (reductions) + gomp_workshare_taskgroup_start (); + if (mem) + ordered += (uintptr_t) *mem; + if (gomp_work_share_start (ordered)) + { + sched = gomp_adjust_sched (sched, &chunk_size); + gomp_loop_init (thr->ts.work_share, start, end, incr, + sched, chunk_size); + if (reductions) + { + GOMP_taskgroup_reduction_register (reductions); + thr->task->taskgroup->workshare = true; + thr->ts.work_share->task_reductions = reductions; + } + if (sched == GFS_STATIC) + gomp_ordered_static_init (); + else + gomp_mutex_lock (&thr->ts.work_share->lock); + gomp_work_share_init_done (); + } + else + { + if (reductions) + { + uintptr_t *first_reductions = thr->ts.work_share->task_reductions; + gomp_workshare_task_reduction_register (reductions, + first_reductions); + } + sched = thr->ts.work_share->sched; + if (sched != GFS_STATIC) + gomp_mutex_lock (&thr->ts.work_share->lock); + } + + if (mem) + { + uintptr_t p + = (uintptr_t) (thr->ts.work_share->ordered_team_ids + + (thr->ts.team ? thr->ts.team->nthreads : 1)); + p += __alignof__ (long long) - 1; + p &= ~(__alignof__ (long long) - 1); + *mem = (void *) p; + } + + switch (sched) + { + case GFS_STATIC: + case GFS_AUTO: + return !gomp_iter_static_next (istart, iend); + case GFS_DYNAMIC: + ret = gomp_iter_dynamic_next_locked (istart, iend); + break; + case GFS_GUIDED: + ret = gomp_iter_guided_next_locked (istart, iend); + break; + default: + abort (); + } + + if (ret) + gomp_ordered_first (); + gomp_mutex_unlock (&thr->ts.work_share->lock); + return ret; +} + /* The *_doacross_*_start routines are similar. 
The only difference is that this work-share construct is initialized to expect an ORDERED(N) - DOACROSS section, and the worksharing loop iterates always from 0 to COUNTS[0] - 1 @@ -310,11 +483,11 @@ gomp_loop_doacross_static_start (unsigned ncounts, long *counts, struct gomp_thread *thr = gomp_thread (); thr->ts.static_trip = 0; - if (gomp_work_share_start (false)) + if (gomp_work_share_start (0)) { gomp_loop_init (thr->ts.work_share, 0, counts[0], 1, GFS_STATIC, chunk_size); - gomp_doacross_init (ncounts, counts, chunk_size); + gomp_doacross_init (ncounts, counts, chunk_size, 0); gomp_work_share_init_done (); } @@ -328,11 +501,11 @@ gomp_loop_doacross_dynamic_start (unsigned ncounts, long *counts, struct gomp_thread *thr = gomp_thread (); bool ret; - if (gomp_work_share_start (false)) + if (gomp_work_share_start (0)) { gomp_loop_init (thr->ts.work_share, 0, counts[0], 1, GFS_DYNAMIC, chunk_size); - gomp_doacross_init (ncounts, counts, chunk_size); + gomp_doacross_init (ncounts, counts, chunk_size, 0); gomp_work_share_init_done (); } @@ -354,11 +527,11 @@ gomp_loop_doacross_guided_start (unsigned ncounts, long *counts, struct gomp_thread *thr = gomp_thread (); bool ret; - if (gomp_work_share_start (false)) + if (gomp_work_share_start (0)) { gomp_loop_init (thr->ts.work_share, 0, counts[0], 1, GFS_GUIDED, chunk_size); - gomp_doacross_init (ncounts, counts, chunk_size); + gomp_doacross_init (ncounts, counts, chunk_size, 0); gomp_work_share_init_done (); } @@ -402,6 +575,50 @@ GOMP_loop_doacross_runtime_start (unsigned ncounts, long *counts, } } +bool +GOMP_loop_doacross_start (unsigned ncounts, long *counts, long sched, + long chunk_size, long *istart, long *iend, + uintptr_t *reductions, void **mem) +{ + struct gomp_thread *thr = gomp_thread (); + + thr->ts.static_trip = 0; + if (reductions) + gomp_workshare_taskgroup_start (); + if (gomp_work_share_start (0)) + { + size_t extra = 0; + if (mem) + extra = (uintptr_t) *mem; + sched = gomp_adjust_sched (sched, &chunk_size); + gomp_loop_init (thr->ts.work_share, 0, counts[0], 1, + sched, chunk_size); + gomp_doacross_init (ncounts, counts, chunk_size, extra); + if (reductions) + { + GOMP_taskgroup_reduction_register (reductions); + thr->task->taskgroup->workshare = true; + thr->ts.work_share->task_reductions = reductions; + } + gomp_work_share_init_done (); + } + else + { + if (reductions) + { + uintptr_t *first_reductions = thr->ts.work_share->task_reductions; + gomp_workshare_task_reduction_register (reductions, + first_reductions); + } + sched = thr->ts.work_share->sched; + } + + if (mem) + *mem = thr->ts.work_share->doacross->extra; + + return ialias_call (GOMP_loop_runtime_next) (istart, iend); +} + /* The *_next routines are called when the thread completes processing of the iteration block currently assigned to it. If the work-share construct is bound directly to a parallel construct, then the iteration diff --git a/libgomp/loop_ull.c b/libgomp/loop_ull.c index 7b2dfe6e26b2..ac658023e13b 100644 --- a/libgomp/loop_ull.c +++ b/libgomp/loop_ull.c @@ -27,8 +27,12 @@ #include <limits.h> #include <stdlib.h> +#include <string.h> #include "libgomp.h" +ialias (GOMP_loop_ull_runtime_next) +ialias_redirect (GOMP_taskgroup_reduction_register) + typedef unsigned long long gomp_ull; /* Initialize the given work share construct from the given arguments.
*/ @@ -104,7 +108,7 @@ gomp_loop_ull_static_start (bool up, gomp_ull start, gomp_ull end, struct gomp_thread *thr = gomp_thread (); thr->ts.static_trip = 0; - if (gomp_work_share_start (false)) + if (gomp_work_share_start (0)) { gomp_loop_ull_init (thr->ts.work_share, up, start, end, incr, GFS_STATIC, chunk_size); @@ -122,7 +126,7 @@ gomp_loop_ull_dynamic_start (bool up, gomp_ull start, gomp_ull end, struct gomp_thread *thr = gomp_thread (); bool ret; - if (gomp_work_share_start (false)) + if (gomp_work_share_start (0)) { gomp_loop_ull_init (thr->ts.work_share, up, start, end, incr, GFS_DYNAMIC, chunk_size); @@ -148,7 +152,7 @@ gomp_loop_ull_guided_start (bool up, gomp_ull start, gomp_ull end, struct gomp_thread *thr = gomp_thread (); bool ret; - if (gomp_work_share_start (false)) + if (gomp_work_share_start (0)) { gomp_loop_ull_init (thr->ts.work_share, up, start, end, incr, GFS_GUIDED, chunk_size); @@ -195,6 +199,99 @@ GOMP_loop_ull_runtime_start (bool up, gomp_ull start, gomp_ull end, } } +static long +gomp_adjust_sched (long sched, gomp_ull *chunk_size) +{ + sched &= ~GFS_MONOTONIC; + switch (sched) + { + case GFS_STATIC: + case GFS_DYNAMIC: + case GFS_GUIDED: + return sched; + /* GFS_RUNTIME is used for runtime schedule without monotonic + or nonmonotonic modifiers on the clause. + GFS_RUNTIME|GFS_MONOTONIC for runtime schedule with monotonic + modifier. */ + case GFS_RUNTIME: + /* GFS_AUTO is used for runtime schedule with nonmonotonic + modifier. */ + case GFS_AUTO: + { + struct gomp_task_icv *icv = gomp_icv (false); + sched = icv->run_sched_var & ~GFS_MONOTONIC; + switch (sched) + { + case GFS_STATIC: + case GFS_DYNAMIC: + case GFS_GUIDED: + *chunk_size = icv->run_sched_chunk_size; + break; + case GFS_AUTO: + sched = GFS_STATIC; + *chunk_size = 0; + break; + default: + abort (); + } + return sched; + } + default: + abort (); + } +} + +bool +GOMP_loop_ull_start (bool up, gomp_ull start, gomp_ull end, + gomp_ull incr, long sched, gomp_ull chunk_size, + gomp_ull *istart, gomp_ull *iend, + uintptr_t *reductions, void **mem) +{ + struct gomp_thread *thr = gomp_thread (); + + thr->ts.static_trip = 0; + if (reductions) + gomp_workshare_taskgroup_start (); + if (gomp_work_share_start (0)) + { + sched = gomp_adjust_sched (sched, &chunk_size); + gomp_loop_ull_init (thr->ts.work_share, up, start, end, incr, + sched, chunk_size); + if (reductions) + { + GOMP_taskgroup_reduction_register (reductions); + thr->task->taskgroup->workshare = true; + thr->ts.work_share->task_reductions = reductions; + } + if (mem) + { + uintptr_t size = (uintptr_t) *mem; + if (size > (sizeof (struct gomp_work_share) + - offsetof (struct gomp_work_share, + inline_ordered_team_ids))) + thr->ts.work_share->ordered_team_ids + = gomp_malloc_cleared (size); + else + memset (thr->ts.work_share->ordered_team_ids, '\0', size); + *mem = (void *) thr->ts.work_share->ordered_team_ids; + } + gomp_work_share_init_done (); + } + else + { + if (reductions) + { + uintptr_t *first_reductions = thr->ts.work_share->task_reductions; + gomp_workshare_task_reduction_register (reductions, + first_reductions); + } + if (mem) + *mem = (void *) thr->ts.work_share->ordered_team_ids; + } + + return ialias_call (GOMP_loop_ull_runtime_next) (istart, iend); +} + /* The *_ordered_*_start routines are similar. The only difference is that this work-share construct is initialized to expect an ORDERED section. 
*/ @@ -206,7 +303,7 @@ gomp_loop_ull_ordered_static_start (bool up, gomp_ull start, gomp_ull end, struct gomp_thread *thr = gomp_thread (); thr->ts.static_trip = 0; - if (gomp_work_share_start (true)) + if (gomp_work_share_start (1)) { gomp_loop_ull_init (thr->ts.work_share, up, start, end, incr, GFS_STATIC, chunk_size); @@ -225,7 +322,7 @@ gomp_loop_ull_ordered_dynamic_start (bool up, gomp_ull start, gomp_ull end, struct gomp_thread *thr = gomp_thread (); bool ret; - if (gomp_work_share_start (true)) + if (gomp_work_share_start (1)) { gomp_loop_ull_init (thr->ts.work_share, up, start, end, incr, GFS_DYNAMIC, chunk_size); @@ -251,7 +348,7 @@ gomp_loop_ull_ordered_guided_start (bool up, gomp_ull start, gomp_ull end, struct gomp_thread *thr = gomp_thread (); bool ret; - if (gomp_work_share_start (true)) + if (gomp_work_share_start (1)) { gomp_loop_ull_init (thr->ts.work_share, up, start, end, incr, GFS_GUIDED, chunk_size); @@ -299,6 +396,82 @@ GOMP_loop_ull_ordered_runtime_start (bool up, gomp_ull start, gomp_ull end, } } +bool +GOMP_loop_ull_ordered_start (bool up, gomp_ull start, gomp_ull end, + gomp_ull incr, long sched, gomp_ull chunk_size, + gomp_ull *istart, gomp_ull *iend, + uintptr_t *reductions, void **mem) +{ + struct gomp_thread *thr = gomp_thread (); + size_t ordered = 1; + bool ret; + + thr->ts.static_trip = 0; + if (reductions) + gomp_workshare_taskgroup_start (); + if (mem) + ordered += (uintptr_t) *mem; + if (gomp_work_share_start (ordered)) + { + sched = gomp_adjust_sched (sched, &chunk_size); + gomp_loop_ull_init (thr->ts.work_share, up, start, end, incr, + sched, chunk_size); + if (reductions) + { + GOMP_taskgroup_reduction_register (reductions); + thr->task->taskgroup->workshare = true; + thr->ts.work_share->task_reductions = reductions; + } + if (sched == GFS_STATIC) + gomp_ordered_static_init (); + else + gomp_mutex_lock (&thr->ts.work_share->lock); + gomp_work_share_init_done (); + } + else + { + if (reductions) + { + uintptr_t *first_reductions = thr->ts.work_share->task_reductions; + gomp_workshare_task_reduction_register (reductions, + first_reductions); + } + sched = thr->ts.work_share->sched; + if (sched != GFS_STATIC) + gomp_mutex_lock (&thr->ts.work_share->lock); + } + + if (mem) + { + uintptr_t p + = (uintptr_t) (thr->ts.work_share->ordered_team_ids + + (thr->ts.team ? thr->ts.team->nthreads : 1)); + p += __alignof__ (long long) - 1; + p &= ~(__alignof__ (long long) - 1); + *mem = (void *) p; + } + + switch (sched) + { + case GFS_STATIC: + case GFS_AUTO: + return !gomp_iter_ull_static_next (istart, iend); + case GFS_DYNAMIC: + ret = gomp_iter_ull_dynamic_next_locked (istart, iend); + break; + case GFS_GUIDED: + ret = gomp_iter_ull_guided_next_locked (istart, iend); + break; + default: + abort (); + } + + if (ret) + gomp_ordered_first (); + gomp_mutex_unlock (&thr->ts.work_share->lock); + return ret; +} + /* The *_doacross_*_start routines are similar. 
The only difference is that this work-share construct is initialized to expect an ORDERED(N) - DOACROSS section, and the worksharing loop iterates always from 0 to COUNTS[0] - 1 @@ -313,11 +486,11 @@ gomp_loop_ull_doacross_static_start (unsigned ncounts, gomp_ull *counts, struct gomp_thread *thr = gomp_thread (); thr->ts.static_trip = 0; - if (gomp_work_share_start (false)) + if (gomp_work_share_start (0)) { gomp_loop_ull_init (thr->ts.work_share, true, 0, counts[0], 1, GFS_STATIC, chunk_size); - gomp_doacross_ull_init (ncounts, counts, chunk_size); + gomp_doacross_ull_init (ncounts, counts, chunk_size, 0); gomp_work_share_init_done (); } @@ -332,11 +505,11 @@ gomp_loop_ull_doacross_dynamic_start (unsigned ncounts, gomp_ull *counts, struct gomp_thread *thr = gomp_thread (); bool ret; - if (gomp_work_share_start (false)) + if (gomp_work_share_start (0)) { gomp_loop_ull_init (thr->ts.work_share, true, 0, counts[0], 1, GFS_DYNAMIC, chunk_size); - gomp_doacross_ull_init (ncounts, counts, chunk_size); + gomp_doacross_ull_init (ncounts, counts, chunk_size, 0); gomp_work_share_init_done (); } @@ -359,11 +532,11 @@ gomp_loop_ull_doacross_guided_start (unsigned ncounts, gomp_ull *counts, struct gomp_thread *thr = gomp_thread (); bool ret; - if (gomp_work_share_start (false)) + if (gomp_work_share_start (0)) { gomp_loop_ull_init (thr->ts.work_share, true, 0, counts[0], 1, GFS_GUIDED, chunk_size); - gomp_doacross_ull_init (ncounts, counts, chunk_size); + gomp_doacross_ull_init (ncounts, counts, chunk_size, 0); gomp_work_share_init_done (); } @@ -407,6 +580,51 @@ GOMP_loop_ull_doacross_runtime_start (unsigned ncounts, gomp_ull *counts, } } +bool +GOMP_loop_ull_doacross_start (unsigned ncounts, gomp_ull *counts, + long sched, gomp_ull chunk_size, + gomp_ull *istart, gomp_ull *iend, + uintptr_t *reductions, void **mem) +{ + struct gomp_thread *thr = gomp_thread (); + + thr->ts.static_trip = 0; + if (reductions) + gomp_workshare_taskgroup_start (); + if (gomp_work_share_start (0)) + { + size_t extra = 0; + if (mem) + extra = (uintptr_t) *mem; + sched = gomp_adjust_sched (sched, &chunk_size); + gomp_loop_ull_init (thr->ts.work_share, true, 0, counts[0], 1, + sched, chunk_size); + gomp_doacross_ull_init (ncounts, counts, chunk_size, extra); + if (reductions) + { + GOMP_taskgroup_reduction_register (reductions); + thr->task->taskgroup->workshare = true; + thr->ts.work_share->task_reductions = reductions; + } + gomp_work_share_init_done (); + } + else + { + if (reductions) + { + uintptr_t *first_reductions = thr->ts.work_share->task_reductions; + gomp_workshare_task_reduction_register (reductions, + first_reductions); + } + sched = thr->ts.work_share->sched; + } + + if (mem) + *mem = thr->ts.work_share->doacross->extra; + + return ialias_call (GOMP_loop_ull_runtime_next) (istart, iend); +} + /* The *_next routines are called when the thread completes processing of the iteration block currently assigned to it. 
If the work-share construct is bound directly to a parallel construct, then the iteration diff --git a/libgomp/ordered.c b/libgomp/ordered.c index 1bdd5b2f25bc..521e9122d908 100644 --- a/libgomp/ordered.c +++ b/libgomp/ordered.c @@ -259,7 +259,8 @@ GOMP_ordered_end (void) #define MAX_COLLAPSED_BITS (__SIZEOF_LONG__ * __CHAR_BIT__) void -gomp_doacross_init (unsigned ncounts, long *counts, long chunk_size) +gomp_doacross_init (unsigned ncounts, long *counts, long chunk_size, + size_t extra) { struct gomp_thread *thr = gomp_thread (); struct gomp_team *team = thr->ts.team; @@ -269,13 +270,24 @@ gomp_doacross_init (unsigned ncounts, long *counts, long chunk_size) struct gomp_doacross_work_share *doacross; if (team == NULL || team->nthreads == 1) - return; + { + empty: + if (!extra) + ws->doacross = NULL; + else + { + doacross = gomp_malloc_cleared (sizeof (*doacross) + extra); + doacross->extra = (void *) (doacross + 1); + ws->doacross = doacross; + } + return; + } for (i = 0; i < ncounts; i++) { /* If any count is 0, GOMP_doacross_{post,wait} can't be called. */ if (counts[i] == 0) - return; + goto empty; if (num_bits <= MAX_COLLAPSED_BITS) { @@ -314,7 +326,7 @@ gomp_doacross_init (unsigned ncounts, long *counts, long chunk_size) elt_sz = (elt_sz + 63) & ~63UL; doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz - + shift_sz); + + shift_sz + extra); doacross->chunk_size = chunk_size; doacross->elt_sz = elt_sz; doacross->ncounts = ncounts; @@ -322,6 +334,13 @@ gomp_doacross_init (unsigned ncounts, long *counts, long chunk_size) doacross->array = (unsigned char *) ((((uintptr_t) (doacross + 1)) + 63 + shift_sz) & ~(uintptr_t) 63); + if (extra) + { + doacross->extra = doacross->array + num_ents * elt_sz; + memset (doacross->extra, '\0', extra); + } + else + doacross->extra = NULL; if (num_bits <= MAX_COLLAPSED_BITS) { unsigned int shift_count = 0; @@ -360,7 +379,8 @@ GOMP_doacross_post (long *counts) unsigned long ent; unsigned int i; - if (__builtin_expect (doacross == NULL, 0)) + if (__builtin_expect (doacross == NULL, 0) + || __builtin_expect (doacross->array == NULL, 0)) { __sync_synchronize (); return; @@ -411,7 +431,8 @@ GOMP_doacross_wait (long first, ...) unsigned long ent; unsigned int i; - if (__builtin_expect (doacross == NULL, 0)) + if (__builtin_expect (doacross == NULL, 0) + || __builtin_expect (doacross->array == NULL, 0)) { __sync_synchronize (); return; @@ -488,7 +509,8 @@ GOMP_doacross_wait (long first, ...) typedef unsigned long long gomp_ull; void -gomp_doacross_ull_init (unsigned ncounts, gomp_ull *counts, gomp_ull chunk_size) +gomp_doacross_ull_init (unsigned ncounts, gomp_ull *counts, + gomp_ull chunk_size, size_t extra) { struct gomp_thread *thr = gomp_thread (); struct gomp_team *team = thr->ts.team; @@ -498,13 +520,24 @@ gomp_doacross_ull_init (unsigned ncounts, gomp_ull *counts, gomp_ull chunk_size) struct gomp_doacross_work_share *doacross; if (team == NULL || team->nthreads == 1) - return; + { + empty: + if (!extra) + ws->doacross = NULL; + else + { + doacross = gomp_malloc_cleared (sizeof (*doacross) + extra); + doacross->extra = (void *) (doacross + 1); + ws->doacross = doacross; + } + return; + } for (i = 0; i < ncounts; i++) { /* If any count is 0, GOMP_doacross_{post,wait} can't be called. 
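As in gomp_doacross_init above, this can no longer simply return: even when no doacross synchronization can ever happen, the caller may still need the EXTRA payload (GOMP_loop_ull_doacross_start hands it out through *MEM), so we jump to the empty: path instead, which allocates just

   [ struct gomp_doacross_work_share | extra bytes ]
                                       ^ doacross->extra

and leaves doacross->array NULL; the post/wait entry points below treat a NULL array the same as a missing doacross structure.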
*/ if (counts[i] == 0) - return; + goto empty; if (num_bits <= MAX_COLLAPSED_BITS) { @@ -557,6 +590,13 @@ gomp_doacross_ull_init (unsigned ncounts, gomp_ull *counts, gomp_ull chunk_size) doacross->array = (unsigned char *) ((((uintptr_t) (doacross + 1)) + 63 + shift_sz) & ~(uintptr_t) 63); + if (extra) + { + doacross->extra = doacross->array + num_ents * elt_sz; + memset (doacross->extra, '\0', extra); + } + else + doacross->extra = NULL; if (num_bits <= MAX_COLLAPSED_BITS) { unsigned int shift_count = 0; @@ -595,7 +635,8 @@ GOMP_doacross_ull_post (gomp_ull *counts) unsigned long ent; unsigned int i; - if (__builtin_expect (doacross == NULL, 0)) + if (__builtin_expect (doacross == NULL, 0) + || __builtin_expect (doacross->array == NULL, 0)) { __sync_synchronize (); return; @@ -667,7 +708,8 @@ GOMP_doacross_ull_wait (gomp_ull first, ...) unsigned long ent; unsigned int i; - if (__builtin_expect (doacross == NULL, 0)) + if (__builtin_expect (doacross == NULL, 0) + || __builtin_expect (doacross->array == NULL, 0)) { __sync_synchronize (); return; diff --git a/libgomp/parallel.c b/libgomp/parallel.c index ead1394aaa3a..c7a8c788a3bc 100644 --- a/libgomp/parallel.c +++ b/libgomp/parallel.c @@ -205,8 +205,15 @@ GOMP_cancellation_point (int which) } else if (which & GOMP_CANCEL_TASKGROUP) { - if (thr->task->taskgroup && thr->task->taskgroup->cancelled) - return true; + if (thr->task->taskgroup) + { + if (thr->task->taskgroup->cancelled) + return true; + if (thr->task->taskgroup->workshare + && thr->task->taskgroup->prev + && thr->task->taskgroup->prev->cancelled) + return true; + } /* FALLTHRU into the GOMP_CANCEL_PARALLEL case, as #pragma omp cancel parallel also cancels all explicit tasks. */ @@ -238,11 +245,17 @@ GOMP_cancel (int which, bool do_cancel) } else if (which & GOMP_CANCEL_TASKGROUP) { - if (thr->task->taskgroup && !thr->task->taskgroup->cancelled) + if (thr->task->taskgroup) { - gomp_mutex_lock (&team->task_lock); - thr->task->taskgroup->cancelled = true; - gomp_mutex_unlock (&team->task_lock); + struct gomp_taskgroup *taskgroup = thr->task->taskgroup; + if (taskgroup->workshare && taskgroup->prev) + taskgroup = taskgroup->prev; + if (!taskgroup->cancelled) + { + gomp_mutex_lock (&team->task_lock); + taskgroup->cancelled = true; + gomp_mutex_unlock (&team->task_lock); + } } return true; } diff --git a/libgomp/sections.c b/libgomp/sections.c index 2a6e6ec1c9d4..3449e0067ddc 100644 --- a/libgomp/sections.c +++ b/libgomp/sections.c @@ -26,8 +26,11 @@ /* This file handles the SECTIONS construct. */ #include "libgomp.h" +#include <string.h> +ialias_redirect (GOMP_taskgroup_reduction_register) + /* Initialize the given work share construct from the given arguments.
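GOMP_sections2_start below is the reductions- and mem-aware counterpart of GOMP_sections_start. A minimal user-level sketch of the construct it services (hypothetical example, not from the testsuite; compile with -fopenmp):

   #include <stdio.h>

   int
   main ()
   {
     int r = 0;
     #pragma omp parallel
     #pragma omp sections reduction (task, +: r)
     {
       #pragma omp section
       {
         #pragma omp task in_reduction (+: r)
         r += 1;
       }
       #pragma omp section
       {
         #pragma omp task in_reduction (+: r)
         r += 2;
       }
     }
     printf ("%d\n", r); /* Prints 3.  */
     return 0;
   }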
*/ static inline void @@ -72,7 +75,7 @@ GOMP_sections_start (unsigned count) struct gomp_thread *thr = gomp_thread (); long s, e, ret; - if (gomp_work_share_start (false)) + if (gomp_work_share_start (0)) { gomp_sections_init (thr->ts.work_share, count); gomp_work_share_init_done (); @@ -95,6 +98,66 @@ GOMP_sections_start (unsigned count) return ret; } +unsigned +GOMP_sections2_start (unsigned count, uintptr_t *reductions, void **mem) +{ + struct gomp_thread *thr = gomp_thread (); + long s, e, ret; + + if (reductions) + gomp_workshare_taskgroup_start (); + if (gomp_work_share_start (0)) + { + gomp_sections_init (thr->ts.work_share, count); + if (reductions) + { + GOMP_taskgroup_reduction_register (reductions); + thr->task->taskgroup->workshare = true; + thr->ts.work_share->task_reductions = reductions; + } + if (mem) + { + uintptr_t size = (uintptr_t) *mem; + if (size > (sizeof (struct gomp_work_share) + - offsetof (struct gomp_work_share, + inline_ordered_team_ids))) + thr->ts.work_share->ordered_team_ids + = gomp_malloc_cleared (size); + else + memset (thr->ts.work_share->ordered_team_ids, '\0', size); + *mem = (void *) thr->ts.work_share->ordered_team_ids; + } + gomp_work_share_init_done (); + } + else + { + if (reductions) + { + uintptr_t *first_reductions = thr->ts.work_share->task_reductions; + gomp_workshare_task_reduction_register (reductions, + first_reductions); + } + if (mem) + *mem = (void *) thr->ts.work_share->ordered_team_ids; + } + +#ifdef HAVE_SYNC_BUILTINS + if (gomp_iter_dynamic_next (&s, &e)) + ret = s; + else + ret = 0; +#else + gomp_mutex_lock (&thr->ts.work_share->lock); + if (gomp_iter_dynamic_next_locked (&s, &e)) + ret = s; + else + ret = 0; + gomp_mutex_unlock (&thr->ts.work_share->lock); +#endif + + return ret; +} + /* This routine is called when the thread completes processing of the section currently assigned to it. If the work-share construct is bound directly to a parallel construct, then the construct may have diff --git a/libgomp/single.c b/libgomp/single.c index 24a7780ad93a..d5093c6730c3 100644 --- a/libgomp/single.c +++ b/libgomp/single.c @@ -47,7 +47,7 @@ GOMP_single_start (void) return __sync_bool_compare_and_swap (&team->single_count, single_count, single_count + 1L); #else - bool ret = gomp_work_share_start (false); + bool ret = gomp_work_share_start (0); if (ret) gomp_work_share_init_done (); gomp_work_share_end_nowait (); @@ -68,7 +68,7 @@ GOMP_single_copy_start (void) bool first; void *ret; - first = gomp_work_share_start (false); + first = gomp_work_share_start (0); if (first) { diff --git a/libgomp/target.c b/libgomp/target.c index d9288274243d..8ebc2a370a16 100644 --- a/libgomp/target.c +++ b/libgomp/target.c @@ -1854,11 +1854,20 @@ GOMP_target_update_ext (int device, size_t mapnum, void **hostaddrs, struct gomp_team *team = thr->ts.team; /* If parallel or taskgroup has been cancelled, don't start new tasks. 
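Two refinements recur in the cancellation checks below (the same pattern repeats in the task.c and taskloop.c hunks that follow): the whole test is now skipped unless cancellation is enabled at all (gomp_cancel_var, i.e. OMP_CANCELLATION=true), and because a worksharing construct with task reductions runs inside an artificial taskgroup whose workshare flag is set, a cancellation is recorded on the enclosing taskgroup, so the check has to look one level up as well:

   if (tg->cancelled)
     return;
   if (tg->workshare && tg->prev && tg->prev->cancelled)
     return;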
*/ - if (team - && (gomp_team_barrier_cancelled (&team->barrier) - || (thr->task->taskgroup - && thr->task->taskgroup->cancelled))) - return; + if (__builtin_expect (gomp_cancel_var, 0) && team) + { + if (gomp_team_barrier_cancelled (&team->barrier)) + return; + if (thr->task->taskgroup) + { + if (thr->task->taskgroup->cancelled) + return; + if (thr->task->taskgroup->workshare + && thr->task->taskgroup->prev + && thr->task->taskgroup->prev->cancelled) + return; + } + } gomp_task_maybe_wait_for_dependencies (depend); } @@ -1873,10 +1882,20 @@ GOMP_target_update_ext (int device, size_t mapnum, void **hostaddrs, struct gomp_thread *thr = gomp_thread (); struct gomp_team *team = thr->ts.team; /* If parallel or taskgroup has been cancelled, don't start new tasks. */ - if (team - && (gomp_team_barrier_cancelled (&team->barrier) - || (thr->task->taskgroup && thr->task->taskgroup->cancelled))) - return; + if (__builtin_expect (gomp_cancel_var, 0) && team) + { + if (gomp_team_barrier_cancelled (&team->barrier)) + return; + if (thr->task->taskgroup) + { + if (thr->task->taskgroup->cancelled) + return; + if (thr->task->taskgroup->workshare + && thr->task->taskgroup->prev + && thr->task->taskgroup->prev->cancelled) + return; + } + } gomp_update (devicep, mapnum, hostaddrs, sizes, kinds, true); } @@ -1985,11 +2004,20 @@ GOMP_target_enter_exit_data (int device, size_t mapnum, void **hostaddrs, struct gomp_team *team = thr->ts.team; /* If parallel or taskgroup has been cancelled, don't start new tasks. */ - if (team - && (gomp_team_barrier_cancelled (&team->barrier) - || (thr->task->taskgroup - && thr->task->taskgroup->cancelled))) - return; + if (__builtin_expect (gomp_cancel_var, 0) && team) + { + if (gomp_team_barrier_cancelled (&team->barrier)) + return; + if (thr->task->taskgroup) + { + if (thr->task->taskgroup->cancelled) + return; + if (thr->task->taskgroup->workshare + && thr->task->taskgroup->prev + && thr->task->taskgroup->prev->cancelled) + return; + } + } gomp_task_maybe_wait_for_dependencies (depend); } @@ -2004,10 +2032,20 @@ GOMP_target_enter_exit_data (int device, size_t mapnum, void **hostaddrs, struct gomp_thread *thr = gomp_thread (); struct gomp_team *team = thr->ts.team; /* If parallel or taskgroup has been cancelled, don't start new tasks. */ - if (team - && (gomp_team_barrier_cancelled (&team->barrier) - || (thr->task->taskgroup && thr->task->taskgroup->cancelled))) - return; + if (__builtin_expect (gomp_cancel_var, 0) && team) + { + if (gomp_team_barrier_cancelled (&team->barrier)) + return; + if (thr->task->taskgroup) + { + if (thr->task->taskgroup->cancelled) + return; + if (thr->task->taskgroup->workshare + && thr->task->taskgroup->prev + && thr->task->taskgroup->prev->cancelled) + return; + } + } size_t i; if ((flags & GOMP_TARGET_FLAG_EXIT_DATA) == 0) diff --git a/libgomp/task.c b/libgomp/task.c index 77e12c404ac5..0c78b3c939cd 100644 --- a/libgomp/task.c +++ b/libgomp/task.c @@ -363,10 +363,20 @@ GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *), #endif /* If parallel or taskgroup has been cancelled, don't start new tasks. 
*/ - if (team - && (gomp_team_barrier_cancelled (&team->barrier) - || (thr->task->taskgroup && thr->task->taskgroup->cancelled))) - return; + if (__builtin_expect (gomp_cancel_var, 0) && team) + { + if (gomp_team_barrier_cancelled (&team->barrier)) + return; + if (thr->task->taskgroup) + { + if (thr->task->taskgroup->cancelled) + return; + if (thr->task->taskgroup->workshare + && thr->task->taskgroup->prev + && thr->task->taskgroup->prev->cancelled) + return; + } + } if ((flags & GOMP_TASK_FLAG_PRIORITY) == 0) priority = 0; @@ -464,14 +474,26 @@ GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *), gomp_mutex_lock (&team->task_lock); /* If parallel or taskgroup has been cancelled, don't start new tasks. */ - if (__builtin_expect ((gomp_team_barrier_cancelled (&team->barrier) - || (taskgroup && taskgroup->cancelled)) - && !task->copy_ctors_done, 0)) + if (__builtin_expect (gomp_cancel_var, 0) + && !task->copy_ctors_done) { - gomp_mutex_unlock (&team->task_lock); - gomp_finish_task (task); - free (task); - return; + if (gomp_team_barrier_cancelled (&team->barrier)) + { + do_cancel: + gomp_mutex_unlock (&team->task_lock); + gomp_finish_task (task); + free (task); + return; + } + if (taskgroup) + { + if (taskgroup->cancelled) + goto do_cancel; + if (taskgroup->workshare + && taskgroup->prev + && taskgroup->prev->cancelled) + goto do_cancel; + } } if (taskgroup) taskgroup->num_children++; @@ -662,10 +684,20 @@ gomp_create_target_task (struct gomp_device_descr *devicep, struct gomp_team *team = thr->ts.team; /* If parallel or taskgroup has been cancelled, don't start new tasks. */ - if (team - && (gomp_team_barrier_cancelled (&team->barrier) - || (thr->task->taskgroup && thr->task->taskgroup->cancelled))) - return true; + if (__builtin_expect (gomp_cancel_var, 0) && team) + { + if (gomp_team_barrier_cancelled (&team->barrier)) + return true; + if (thr->task->taskgroup) + { + if (thr->task->taskgroup->cancelled) + return true; + if (thr->task->taskgroup->workshare + && thr->task->taskgroup->prev + && thr->task->taskgroup->prev->cancelled) + return true; + } + } struct gomp_target_task *ttask; struct gomp_task *task; @@ -748,13 +780,25 @@ gomp_create_target_task (struct gomp_device_descr *devicep, task->final_task = 0; gomp_mutex_lock (&team->task_lock); /* If parallel or taskgroup has been cancelled, don't start new tasks. 
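This re-check runs with team->task_lock held because a cancellation may have arrived since the test at function entry; every cancelled case funnels through a single do_cancel label, a sketch of which (matching the expansion below) is:

   do_cancel:
     gomp_mutex_unlock (&team->task_lock);
     gomp_finish_task (task);
     free (task);
     return true;

The copy_ctors_done guard in the corresponding GOMP_task hunk above presumably exists so that tasks whose firstprivate copy constructors have already run are still executed rather than silently dropped.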
*/ - if (__builtin_expect (gomp_team_barrier_cancelled (&team->barrier) - || (taskgroup && taskgroup->cancelled), 0)) + if (__builtin_expect (gomp_cancel_var, 0)) { - gomp_mutex_unlock (&team->task_lock); - gomp_finish_task (task); - free (task); - return true; + if (gomp_team_barrier_cancelled (&team->barrier)) + { + do_cancel: + gomp_mutex_unlock (&team->task_lock); + gomp_finish_task (task); + free (task); + return true; + } + if (taskgroup) + { + if (taskgroup->cancelled) + goto do_cancel; + if (taskgroup->workshare + && taskgroup->prev + && taskgroup->prev->cancelled) + goto do_cancel; + } } if (depend_size) { @@ -1047,10 +1091,21 @@ gomp_task_run_pre (struct gomp_task *child_task, struct gomp_task *parent, if (--team->task_queued_count == 0) gomp_team_barrier_clear_task_pending (&team->barrier); - if ((gomp_team_barrier_cancelled (&team->barrier) - || (taskgroup && taskgroup->cancelled)) + if (__builtin_expect (gomp_cancel_var, 0) && !child_task->copy_ctors_done) - return true; + { + if (gomp_team_barrier_cancelled (&team->barrier)) + return true; + if (taskgroup) + { + if (taskgroup->cancelled) + return true; + if (taskgroup->workshare + && taskgroup->prev + && taskgroup->prev->cancelled) + return true; + } + } return false; } @@ -1527,10 +1582,20 @@ GOMP_taskwait_depend (void **depend) struct gomp_team *team = thr->ts.team; /* If parallel or taskgroup has been cancelled, return early. */ - if (team - && (gomp_team_barrier_cancelled (&team->barrier) - || (thr->task->taskgroup && thr->task->taskgroup->cancelled))) - return; + if (__builtin_expect (gomp_cancel_var, 0) && team) + { + if (gomp_team_barrier_cancelled (&team->barrier)) + return; + if (thr->task->taskgroup) + { + if (thr->task->taskgroup->cancelled) + return; + if (thr->task->taskgroup->workshare + && thr->task->taskgroup->prev + && thr->task->taskgroup->prev->cancelled) + return; + } + } if (thr->task && thr->task->depend_hash) gomp_task_maybe_wait_for_dependencies (depend); @@ -1770,9 +1835,10 @@ gomp_taskgroup_init (struct gomp_taskgroup *prev) = gomp_malloc (sizeof (struct gomp_taskgroup)); taskgroup->prev = prev; priority_queue_init (&taskgroup->taskgroup_queue); - taskgroup->in_taskgroup_wait = false; taskgroup->reductions = prev ? prev->reductions : NULL; + taskgroup->in_taskgroup_wait = false; taskgroup->cancelled = false; + taskgroup->workshare = false; taskgroup->num_children = 0; gomp_sem_init (&taskgroup->taskgroup_sem, 0); return taskgroup; @@ -1956,21 +2022,34 @@ GOMP_taskgroup_end (void) free (taskgroup); } -static inline void -gomp_reduction_register (uintptr_t *data, uintptr_t *old, unsigned nthreads) +static inline __attribute__((always_inline)) void +gomp_reduction_register (uintptr_t *data, uintptr_t *old, uintptr_t *orig, + unsigned nthreads) { size_t total_cnt = 0; uintptr_t *d = data; struct htab *old_htab = NULL, *new_htab; do { - size_t sz = d[1] * nthreads; - /* Should use omp_alloc if d[3] is not -1. */ - void *ptr = gomp_aligned_alloc (d[2], sz); - memset (ptr, '\0', sz); - d[2] = (uintptr_t) ptr; + if (__builtin_expect (orig != NULL, 0)) + { + /* For worksharing task reductions, memory has been allocated + already by some other thread that encountered the construct + earlier. */ + d[2] = orig[2]; + d[6] = orig[6]; + orig = (uintptr_t *) orig[4]; + } + else + { + size_t sz = d[1] * nthreads; + /* Should use omp_alloc if d[3] is not -1. 
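A worked example with illustrative numbers: registering a single long long reduction for a team of 4 threads gives d[1] == 8, hence sz == 32; on entry d[2] holds the requested alignment for gomp_aligned_alloc, on exit it holds the pointer to the 32 cleared bytes, and d[6] records one past the end of that block. In the ORIG path above, latecomer threads of a worksharing construct instead copy d[2] and d[6] from the array registered by the first thread, so all threads share one set of buffers.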
*/ + void *ptr = gomp_aligned_alloc (d[2], sz); + memset (ptr, '\0', sz); + d[2] = (uintptr_t) ptr; + d[6] = d[2] + sz; + } d[5] = 0; - d[6] = d[2] + sz; total_cnt += d[0]; if (d[4] == 0) { @@ -2028,6 +2107,38 @@ gomp_reduction_register (uintptr_t *data, uintptr_t *old, unsigned nthreads) d[5] = (uintptr_t) new_htab; } +static void +gomp_create_artificial_team (void) +{ + struct gomp_thread *thr = gomp_thread (); + struct gomp_task_icv *icv; + struct gomp_team *team = gomp_new_team (1); + struct gomp_task *task = thr->task; + icv = task ? &task->icv : &gomp_global_icv; + team->prev_ts = thr->ts; + thr->ts.team = team; + thr->ts.team_id = 0; + thr->ts.work_share = &team->work_shares[0]; + thr->ts.last_work_share = NULL; +#ifdef HAVE_SYNC_BUILTINS + thr->ts.single_count = 0; +#endif + thr->ts.static_trip = 0; + thr->task = &team->implicit_task[0]; + gomp_init_task (thr->task, NULL, icv); + if (task) + { + thr->task = task; + gomp_end_task (); + free (task); + thr->task = &team->implicit_task[0]; + } +#ifdef LIBGOMP_USE_PTHREADS + else + pthread_setspecific (gomp_thread_destructor, thr); +#endif +} + /* The format of data is: data[0] cnt data[1] size @@ -2039,7 +2150,12 @@ gomp_reduction_register (uintptr_t *data, uintptr_t *old, unsigned nthreads) cnt times ent[0] address ent[1] offset - ent[2] used internally (pointer to data[0]). */ + ent[2] used internally (pointer to data[0]) + The entries are sorted by increasing offset, so that a binary + search can be performed. Normally, data[8] is 0, exception is + for worksharing construct task reductions in cancellable parallel, + where at offset 0 there should be space for a pointer and an integer + which are used internally. */ void GOMP_taskgroup_reduction_register (uintptr_t *data) @@ -2047,41 +2163,18 @@ GOMP_taskgroup_reduction_register (uintptr_t *data) struct gomp_thread *thr = gomp_thread (); struct gomp_team *team = thr->ts.team; struct gomp_task *task; + unsigned nthreads; if (__builtin_expect (team == NULL, 0)) { /* The task reduction code needs a team and task, so for orphaned taskgroups just create the implicit team. */ - struct gomp_task_icv *icv; - team = gomp_new_team (1); - task = thr->task; - icv = task ? 
&task->icv : &gomp_global_icv; - team->prev_ts = thr->ts; - thr->ts.team = team; - thr->ts.team_id = 0; - thr->ts.work_share = &team->work_shares[0]; - thr->ts.last_work_share = NULL; -#ifdef HAVE_SYNC_BUILTINS - thr->ts.single_count = 0; -#endif - thr->ts.static_trip = 0; - thr->task = &team->implicit_task[0]; - gomp_init_task (thr->task, NULL, icv); - if (task) - { - thr->task = task; - gomp_end_task (); - free (task); - thr->task = &team->implicit_task[0]; - } -#ifdef LIBGOMP_USE_PTHREADS - else - pthread_setspecific (gomp_thread_destructor, thr); -#endif - GOMP_taskgroup_start (); + gomp_create_artificial_team (); + ialias_call (GOMP_taskgroup_start) (); + team = thr->ts.team; } - unsigned nthreads = team->nthreads; + nthreads = team->nthreads; task = thr->task; - gomp_reduction_register (data, task->taskgroup->reductions, nthreads); + gomp_reduction_register (data, task->taskgroup->reductions, NULL, nthreads); task->taskgroup->reductions = data; } @@ -2175,11 +2268,56 @@ struct gomp_taskgroup * gomp_parallel_reduction_register (uintptr_t *data, unsigned nthreads) { struct gomp_taskgroup *taskgroup = gomp_taskgroup_init (NULL); - gomp_reduction_register (data, NULL, nthreads); + gomp_reduction_register (data, NULL, NULL, nthreads); taskgroup->reductions = data; return taskgroup; } +void +gomp_workshare_task_reduction_register (uintptr_t *data, uintptr_t *orig) +{ + struct gomp_thread *thr = gomp_thread (); + struct gomp_team *team = thr->ts.team; + struct gomp_task *task = thr->task; + unsigned nthreads = team->nthreads; + gomp_reduction_register (data, task->taskgroup->reductions, orig, nthreads); + task->taskgroup->reductions = data; +} + +void +gomp_workshare_taskgroup_start (void) +{ + struct gomp_thread *thr = gomp_thread (); + struct gomp_team *team = thr->ts.team; + struct gomp_task *task; + + if (team == NULL) + { + gomp_create_artificial_team (); + team = thr->ts.team; + } + task = thr->task; + task->taskgroup = gomp_taskgroup_init (task->taskgroup); + task->taskgroup->workshare = true; +} + +void +GOMP_workshare_task_reduction_unregister (bool cancelled) +{ + struct gomp_thread *thr = gomp_thread (); + struct gomp_task *task = thr->task; + struct gomp_team *team = thr->ts.team; + uintptr_t *data = task->taskgroup->reductions; + ialias_call (GOMP_taskgroup_end) (); + if (thr->ts.team_id == 0) + ialias_call (GOMP_taskgroup_reduction_unregister) (data); + else + htab_free ((struct htab *) data[5]); + + if (!cancelled) + gomp_team_barrier_wait (&team->barrier); +} + int omp_in_final (void) { diff --git a/libgomp/taskloop.c b/libgomp/taskloop.c index d20af399d385..4621405aa58e 100644 --- a/libgomp/taskloop.c +++ b/libgomp/taskloop.c @@ -149,8 +149,17 @@ GOMP_taskloop (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *), if (flags & GOMP_TASK_FLAG_NOGROUP) { - if (thr->task && thr->task->taskgroup && thr->task->taskgroup->cancelled) - return; + if (__builtin_expect (gomp_cancel_var, 0) + && thr->task + && thr->task->taskgroup) + { + if (thr->task->taskgroup->cancelled) + return; + if (thr->task->taskgroup->workshare + && thr->task->taskgroup->prev + && thr->task->taskgroup->prev->cancelled) + return; + } } else { @@ -292,19 +301,31 @@ GOMP_taskloop (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *), gomp_mutex_lock (&team->task_lock); /* If parallel or taskgroup has been cancelled, don't start new tasks. 
*/ - if (__builtin_expect ((gomp_team_barrier_cancelled (&team->barrier) - || (taskgroup && taskgroup->cancelled)) - && cpyfn == NULL, 0)) + if (__builtin_expect (gomp_cancel_var, 0) + && cpyfn == NULL) { - gomp_mutex_unlock (&team->task_lock); - for (i = 0; i < num_tasks; i++) + if (gomp_team_barrier_cancelled (&team->barrier)) + { + do_cancel: + gomp_mutex_unlock (&team->task_lock); + for (i = 0; i < num_tasks; i++) + { + gomp_finish_task (tasks[i]); + free (tasks[i]); + } + if ((flags & GOMP_TASK_FLAG_NOGROUP) == 0) + ialias_call (GOMP_taskgroup_end) (); + return; + } + if (taskgroup) { - gomp_finish_task (tasks[i]); - free (tasks[i]); + if (taskgroup->cancelled) + goto do_cancel; + if (taskgroup->workshare + && taskgroup->prev + && taskgroup->prev->cancelled) + goto do_cancel; } - if ((flags & GOMP_TASK_FLAG_NOGROUP) == 0) - ialias_call (GOMP_taskgroup_end) (); - return; } if (taskgroup) taskgroup->num_children += num_tasks; diff --git a/libgomp/team.c b/libgomp/team.c index 0956a1f8f1ff..e3e4c4d1ef27 100644 --- a/libgomp/team.c +++ b/libgomp/team.c @@ -187,7 +187,7 @@ gomp_new_team (unsigned nthreads) team->single_count = 0; #endif team->work_shares_to_free = &team->work_shares[0]; - gomp_init_work_share (&team->work_shares[0], false, nthreads); + gomp_init_work_share (&team->work_shares[0], 0, nthreads); team->work_shares[0].next_alloc = NULL; team->work_share_list_free = NULL; team->work_share_list_alloc = &team->work_shares[1]; diff --git a/libgomp/testsuite/libgomp.c++/task-reduction-14.C b/libgomp/testsuite/libgomp.c++/task-reduction-14.C new file mode 100644 index 000000000000..3f4e79b16c59 --- /dev/null +++ b/libgomp/testsuite/libgomp.c++/task-reduction-14.C @@ -0,0 +1,72 @@ +#include <stdlib.h> +#include <omp.h> + +struct A { A (); ~A (); A (const A &); static int cnt1, cnt2, cnt3; int a; }; +int A::cnt1; +int A::cnt2; +int A::cnt3; +A::A () : a (0) +{ + #pragma omp atomic + cnt1++; +} +A::A (const A &x) : a (x.a) +{ + #pragma omp atomic + cnt2++; +} +A::~A () +{ + #pragma omp atomic + cnt3++; +} +#pragma omp declare reduction (+: A: omp_out.a += omp_in.a) + +void +foo (int x) +{ + A a, b[2]; + int d = 1; + long int e[2] = { 1L, 1L }; + int c = 0; + #pragma omp parallel + { + if (x && omp_get_thread_num () == 0) + { + for (int i = 0; i < 10000000; ++i) + asm volatile (""); + c = 1; + #pragma omp cancel parallel + } + #pragma omp for reduction (task, +: a, b) reduction (task, *: d, e) + for (int i = 0; i < 64; i++) + #pragma omp task in_reduction (+: a, b) in_reduction (*: d, e) + { + a.a++; + b[0].a += 2; + b[1].a += 3; + d *= ((i & 7) == 0) + 1; + e[0] *= ((i & 7) == 3) + 1; + e[1] *= ((i & 3) == 2) + 1; + } + if (x && omp_get_cancellation ()) + abort (); + } + if (!c) + { + if (a.a != 64 || b[0].a != 128 || b[1].a != 192) + abort (); + if (d != 256 || e[0] != 256L || e[1] != 65536L) + abort (); + } +} + +int +main () +{ + int c1 = A::cnt1, c2 = A::cnt2, c3 = A::cnt3; + volatile int zero = 0; + foo (zero); + if (A::cnt1 + A::cnt2 - c1 - c2 != A::cnt3 - c3) + abort (); +} diff --git a/libgomp/testsuite/libgomp.c++/task-reduction-15.C b/libgomp/testsuite/libgomp.c++/task-reduction-15.C new file mode 100644 index 000000000000..8a01e6b240a1 --- /dev/null +++ b/libgomp/testsuite/libgomp.c++/task-reduction-15.C @@ -0,0 +1,75 @@ +extern "C" void abort (); + +int as; +int &a = as; +long int bs = 1; +long int &b = bs; + +template <typename T, typename U> +void +foo (T &c, U &d) +{ + T i; + for (i = 0; i < 2; i++) + #pragma omp task in_reduction (*: d) in_reduction (+: c) \ + in_reduction (+: a) in_reduction (*: b) + { + a += 7;
+      b *= 2;
+      c += 9;
+      d *= 3;
+    }
+}
+
+template <typename T, typename U>
+void
+bar ()
+{
+  T cs = 0;
+  T &c = cs;
+  U ds = 1;
+  #pragma omp parallel if (0)
+  {
+    U &d = ds;
+    #pragma omp parallel
+    {
+      T i;
+      #pragma omp for reduction (task, +: a, c) reduction (task, *: b, d)
+      for (i = 0; i < 4; i++)
+	#pragma omp task in_reduction (+: a, c) in_reduction (*: b, d)
+	{
+	  T j;
+	  a += 7;
+	  b *= 2;
+	  for (j = 0; j < 2; j++)
+	    #pragma omp task in_reduction (+: a, c) in_reduction (*: b, d)
+	    {
+	      a += 7;
+	      b *= 2;
+	      c += 9;
+	      d *= 3;
+	      foo (c, d);
+	    }
+	  c += 9;
+	  d *= 3;
+	}
+#define THREEP4 (3LL * 3LL * 3LL * 3LL)
+      if (d != (THREEP4 * THREEP4 * THREEP4 * THREEP4 * THREEP4 * THREEP4
+		* THREEP4))
+	abort ();
+      if (a != 28 * 7 || b != (1L << 28) || c != 28 * 9)
+	abort ();
+    }
+  }
+  if (a != 28 * 7 || b != (1L << 28) || c != 28 * 9)
+    abort ();
+  if (ds != (THREEP4 * THREEP4 * THREEP4 * THREEP4 * THREEP4 * THREEP4
+	     * THREEP4))
+    abort ();
+}
+
+int
+main ()
+{
+  bar <int, long long int> ();
+}
diff --git a/libgomp/testsuite/libgomp.c++/task-reduction-16.C b/libgomp/testsuite/libgomp.c++/task-reduction-16.C
new file mode 100644
index 000000000000..5835edcbd5b3
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/task-reduction-16.C
@@ -0,0 +1,130 @@
+extern "C" void abort ();
+
+struct S { S (); S (long long int, int); ~S (); static int cnt1, cnt2, cnt3; long long int s; int t; };
+
+int S::cnt1;
+int S::cnt2;
+int S::cnt3;
+
+S::S ()
+{
+  #pragma omp atomic
+  cnt1++;
+}
+
+S::S (long long int x, int y) : s (x), t (y)
+{
+  #pragma omp atomic update
+  ++cnt2;
+}
+
+S::~S ()
+{
+  #pragma omp atomic
+  cnt3 = cnt3 + 1;
+  if (t < 3 || t > 9 || (t & 1) == 0)
+    abort ();
+}
+
+void
+bar (S *p, S *o)
+{
+  p->s = 1;
+  if (o->t != 5)
+    abort ();
+  p->t = 9;
+}
+
+static inline void
+baz (S *o, S *i)
+{
+  if (o->t != 5 || i->t != 9)
+    abort ();
+  o->s *= i->s;
+}
+
+#pragma omp declare reduction (+: S : omp_out.s += omp_in.s) initializer (omp_priv (0, 3))
+#pragma omp declare reduction (*: S : baz (&omp_out, &omp_in)) initializer (bar (&omp_priv, &omp_orig))
+
+S as = { 0LL, 7 };
+S &a = as;
+S bs (1LL, 5);
+S &b = bs;
+
+void
+foo (S &c, S &d)
+{
+  int i;
+  for (i = 0; i < 2; i++)
+    #pragma omp task in_reduction (+: c) in_reduction (*: b, d) in_reduction (+: a)
+    {
+      a.s += 7;
+      b.s *= 2;
+      c.s += 9;
+      d.s *= 3;
+      if ((a.t != 7 && a.t != 3) || (b.t != 5 && b.t != 9)
+	  || (c.t != 7 && c.t != 3) || (d.t != 5 && d.t != 9))
+	abort ();
+    }
+}
+
+void
+test ()
+{
+  S cs = { 0LL, 7 };
+  S &c = cs;
+  S ds (1LL, 5);
+  #pragma omp parallel if (0)
+  {
+    S &d = ds;
+    #pragma omp parallel shared (a, b, c, d)
+    {
+      #pragma omp for schedule (static, 1) reduction (task, +: a, c) reduction (task, *: b, d)
+      for (int i = 0; i < 4; i++)
+	#pragma omp task in_reduction (*: b, d) in_reduction (+: a, c)
+	{
+	  int j;
+	  a.s += 7;
+	  b.s *= 2;
+	  for (j = 0; j < 2; j++)
+	    #pragma omp task in_reduction (+: a) in_reduction (*: b) \
+			     in_reduction (+: c) in_reduction (*: d)
+	    {
+	      a.s += 7;
+	      b.s *= 2;
+	      c.s += 9;
+	      d.s *= 3;
+	      foo (c, d);
+	      if ((a.t != 7 && a.t != 3) || (b.t != 5 && b.t != 9)
+		  || (c.t != 7 && c.t != 3) || (d.t != 5 && d.t != 9))
+		abort ();
+	    }
+	  c.s += 9;
+	  d.s *= 3;
+	  if ((a.t != 7 && a.t != 3) || (b.t != 5 && b.t != 9)
+	      || (c.t != 7 && c.t != 3) || (d.t != 5 && d.t != 9))
+	    abort ();
+	}
+#define THREEP7 (3LL * 3LL * 3LL * 3LL * 3LL * 3LL * 3LL)
+      if (d.s != (THREEP7 * THREEP7 * THREEP7 * THREEP7) || d.t != 5)
+	abort ();
+      if (a.s != 28 * 7 || a.t != 7 || b.s != (1L << 28) || b.t != 5
+	  || c.s != 28 * 9 || c.t != 7)
+	abort ();
+    }
+  }
+  if (a.s != 28 * 7
+      || a.t != 7 || b.s != (1L << 28) || b.t != 5
+      || c.s != 28 * 9 || c.t != 7)
+    abort ();
+  if (ds.s != (THREEP7 * THREEP7 * THREEP7 * THREEP7) || ds.t != 5)
+    abort ();
+}
+
+int
+main ()
+{
+  int c1 = S::cnt1, c2 = S::cnt2, c3 = S::cnt3;
+  test ();
+  if (S::cnt1 + S::cnt2 - c1 - c2 != S::cnt3 - c3)
+    abort ();
+}
diff --git a/libgomp/testsuite/libgomp.c++/task-reduction-17.C b/libgomp/testsuite/libgomp.c++/task-reduction-17.C
new file mode 100644
index 000000000000..c00c8e46542e
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/task-reduction-17.C
@@ -0,0 +1,300 @@
+extern "C" void abort ();
+
+int as[2];
+int (&a)[2] = as;
+long long int bs[7] = { 9, 11, 1, 1, 1, 13, 15 };
+long long int (&b)[7] = bs;
+int es[3] = { 5, 0, 5 };
+int (&e)[3] = es;
+int fs[5] = { 6, 7, 0, 0, 9 };
+int (&f)[5] = fs;
+int gs[4] = { 1, 0, 0, 2 };
+int (&g)[4] = gs;
+int hs[3] = { 0, 1, 4 };
+int (&h)[3] = hs;
+int ks[4][2] = { { 5, 6 }, { 0, 0 }, { 0, 0 }, { 7, 8 } };
+int (&k)[4][2] = ks;
+long long *ss;
+long long *&s = ss;
+long long (*ts)[2];
+long long (*&t)[2] = ts;
+
+template <typename T>
+void
+foo (T &n, T *&c, long long int *&d, T (&m)[3], T *&r, T (&o)[4], T *&p, T (&q)[4][2])
+{
+  T i;
+  for (i = 0; i < 2; i++)
+    #pragma omp task in_reduction (+: a, c[:2]) in_reduction (*: b[2 * n:3 * n], d[0:2]) \
+		     in_reduction (+: o[n:n*2], m[1], k[1:2][:], p[0], f[2:2]) \
+		     in_reduction (+: q[1:2][:], g[n:n*2], e[1], h[0], r[2:2]) \
+		     in_reduction (*: s[1:2], t[2:2][:])
+    {
+      a[0] += 7;
+      a[1] += 17;
+      b[2] *= 2;
+      b[4] *= 2;
+      c[0] += 6;
+      d[1] *= 2;
+      e[1] += 19;
+      f[2] += 21;
+      f[3] += 23;
+      g[1] += 25;
+      g[2] += 27;
+      h[0] += 29;
+      k[1][0] += 31;
+      k[2][1] += 33;
+      m[1] += 19;
+      r[2] += 21;
+      r[3] += 23;
+      o[1] += 25;
+      o[2] += 27;
+      p[0] += 29;
+      q[1][0] += 31;
+      q[2][1] += 33;
+      s[1] *= 2;
+      t[2][0] *= 2;
+      t[3][1] *= 2;
+    }
+}
+
+template <typename T, typename I>
+void
+test (T &n, I x, I y)
+{
+  T cs[2] = { 0, 0 };
+  T (&c)[2] = cs;
+  T ps[3] = { 0, 1, 4 };
+  T (&p)[3] = ps;
+  T qs[4][2] = { { 5, 6 }, { 0, 0 }, { 0, 0 }, { 7, 8 } };
+  T (&q)[4][2] = qs;
+  long long sb[4] = { 5, 1, 1, 6 };
+  long long tb[5][2] = { { 9, 10 }, { 11, 12 }, { 1, 1 }, { 1, 1 }, { 13, 14 } };
+  T ms[3] = { 5, 0, 5 };
+  T os[4] = { 1, 0, 0, 2 };
+  s = sb;
+  t = tb;
+  #pragma omp parallel if (0)
+  {
+    long long int ds[] = { 1, 1 };
+    long long int (&d)[2] = ds;
+    T (&m)[3] = ms;
+    T rs[5] = { 6, 7, 0, 0, 9 };
+    T (&r)[5] = rs;
+    T (&o)[4] = os;
+    #pragma omp parallel
+    {
+      #pragma omp for reduction (task,+: a, c) reduction (task,*: b[2 * n:3 * n], d) \
+		      reduction (task,+: e[1], f[2:2], g[n:n*2], h[0], k[1:2][0:2]) \
+		      reduction (task,+: o[n:n*2], m[1], q[1:2][:], p[0], r[2:2]) \
+		      reduction (task,*: t[2:2][:], s[1:n + 1]) schedule (dynamic)
+      for (I i = x; i != y; i++)
+	#pragma omp task in_reduction (+: a, c) in_reduction (*: b[2 * n:3 * n], d) \
+			 in_reduction (+: o[n:n*2], q[1:2][:], p[0], m[1], r[2:2]) \
+			 in_reduction (+: g[n:n * 2], e[1], k[1:2][:], h[0], f[2:2]) \
+			 in_reduction (*: s[1:2], t[2:2][:])
+	{
+	  T j;
+	  a[0] += 2;
+	  a[1] += 3;
+	  b[2] *= 2;
+	  f[3] += 8;
+	  g[1] += 9;
+	  g[2] += 10;
+	  h[0] += 11;
+	  k[1][1] += 13;
+	  k[2][1] += 15;
+	  m[1] += 16;
+	  r[2] += 8;
+	  s[1] *= 2;
+	  t[2][1] *= 2;
+	  t[3][1] *= 2;
+	  for (j = 0; j < 2; j++)
+	    #pragma omp task in_reduction (+: a, c[:2]) \
+			     in_reduction (*: b[2 * n:3 * n], d[n - 1:n + 1]) \
+			     in_reduction (+: e[1], f[2:2], g[n:n*2], h[0], k[1:2][:2]) \
+			     in_reduction (+: m[1], r[2:2], o[n:n*2], p[0], q[1:2][:2]) \
+			     in_reduction (*: s[n:2], t[2:2][:])
+	    {
+	      m[1] += 6;
+	      r[2] += 7;
+	      q[1][0] += 17;
+	      q[2][0] += 19;
+	      a[0] += 4;
+	      a[1] += 5;
+	      b[3] *= 2;
+	      b[4] *= 2;
+	      f[3] += 18;
+	      g[1] += 29;
+	      g[2] += 18;
+	      h[0] += 19;
+	      s[2] *= 2;
+	      t[2][0] *= 2;
+	      t[3][0] *= 2;
+	      T *cp = c;
+	      long long int *dp = d;
+	      T *rp = r;
+	      T *pp = p;
+	      foo (n, cp, dp, m, rp, o, pp, q);
+	      r[3] += 18;
+	      o[1] += 29;
+	      o[2] += 18;
+	      p[0] += 19;
+	      c[0] += 4;
+	      c[1] += 5;
+	      d[0] *= 2;
+	      e[1] += 6;
+	      f[2] += 7;
+	      k[1][0] += 17;
+	      k[2][0] += 19;
+	    }
+	  r[3] += 8;
+	  o[1] += 9;
+	  o[2] += 10;
+	  p[0] += 11;
+	  q[1][1] += 13;
+	  q[2][1] += 15;
+	  b[3] *= 2;
+	  c[0] += 4;
+	  c[1] += 9;
+	  d[0] *= 2;
+	  e[1] += 16;
+	  f[2] += 8;
+	}
+      if (d[0] != 1LL << (8 + 4)
+	  || d[1] != 1LL << 16
+	  || m[0] != 5
+	  || m[1] != 19 * 16 + 6 * 8 + 16 * 4
+	  || m[2] != 5
+	  || r[0] != 6
+	  || r[1] != 7
+	  || r[2] != 21 * 16 + 7 * 8 + 8 * 4
+	  || r[3] != 23 * 16 + 18 * 8 + 8 * 4
+	  || r[4] != 9
+	  || o[0] != 1
+	  || o[1] != 25 * 16 + 29 * 8 + 9 * 4
+	  || o[2] != 27 * 16 + 18 * 8 + 10 * 4
+	  || o[3] != 2)
+	abort ();
+      if (a[0] != 7 * 16 + 4 * 8 + 2 * 4
+	  || a[1] != 17 * 16 + 5 * 8 + 3 * 4
+	  || b[0] != 9 || b[1] != 11
+	  || b[2] != 1LL << (16 + 4)
+	  || b[3] != 1LL << (8 + 4)
+	  || b[4] != 1LL << (16 + 8)
+	  || b[5] != 13 || b[6] != 15
+	  || c[0] != 6 * 16 + 4 * 8 + 4 * 4
+	  || c[1] != 5 * 8 + 9 * 4
+	  || e[0] != 5
+	  || e[1] != 19 * 16 + 6 * 8 + 16 * 4
+	  || e[2] != 5
+	  || f[0] != 6
+	  || f[1] != 7
+	  || f[2] != 21 * 16 + 7 * 8 + 8 * 4
+	  || f[3] != 23 * 16 + 18 * 8 + 8 * 4
+	  || f[4] != 9
+	  || g[0] != 1
+	  || g[1] != 25 * 16 + 29 * 8 + 9 * 4
+	  || g[2] != 27 * 16 + 18 * 8 + 10 * 4
+	  || g[3] != 2
+	  || h[0] != 29 * 16 + 19 * 8 + 11 * 4
+	  || h[1] != 1 || h[2] != 4
+	  || k[0][0] != 5 || k[0][1] != 6
+	  || k[1][0] != 31 * 16 + 17 * 8
+	  || k[1][1] != 13 * 4
+	  || k[2][0] != 19 * 8
+	  || k[2][1] != 33 * 16 + 15 * 4
+	  || k[3][0] != 7 || k[3][1] != 8
+	  || p[0] != 29 * 16 + 19 * 8 + 11 * 4
+	  || p[1] != 1 || p[2] != 4
+	  || q[0][0] != 5 || q[0][1] != 6
+	  || q[1][0] != 31 * 16 + 17 * 8
+	  || q[1][1] != 13 * 4
+	  || q[2][0] != 19 * 8
+	  || q[2][1] != 33 * 16 + 15 * 4
+	  || q[3][0] != 7 || q[3][1] != 8
+	  || sb[0] != 5
+	  || sb[1] != 1LL << (16 + 4)
+	  || sb[2] != 1LL << 8
+	  || sb[3] != 6
+	  || tb[0][0] != 9 || tb[0][1] != 10 || tb[1][0] != 11 || tb[1][1] != 12
+	  || tb[2][0] != 1LL << (16 + 8)
+	  || tb[2][1] != 1LL << 4
+	  || tb[3][0] != 1LL << 8
+	  || tb[3][1] != 1LL << (16 + 4)
+	  || tb[4][0] != 13 || tb[4][1] != 14)
+	abort ();
+    }
+    if (d[0] != 1LL << (8 + 4)
+	|| d[1] != 1LL << 16
+	|| m[0] != 5
+	|| m[1] != 19 * 16 + 6 * 8 + 16 * 4
+	|| m[2] != 5
+	|| r[0] != 6
+	|| r[1] != 7
+	|| r[2] != 21 * 16 + 7 * 8 + 8 * 4
+	|| r[3] != 23 * 16 + 18 * 8 + 8 * 4
+	|| r[4] != 9
+	|| o[0] != 1
+	|| o[1] != 25 * 16 + 29 * 8 + 9 * 4
+	|| o[2] != 27 * 16 + 18 * 8 + 10 * 4
+	|| o[3] != 2)
+      abort ();
+  }
+  if (a[0] != 7 * 16 + 4 * 8 + 2 * 4
+      || a[1] != 17 * 16 + 5 * 8 + 3 * 4
+      || b[0] != 9 || b[1] != 11
+      || b[2] != 1LL << (16 + 4)
+      || b[3] != 1LL << (8 + 4)
+      || b[4] != 1LL << (16 + 8)
+      || b[5] != 13 || b[6] != 15
+      || c[0] != 6 * 16 + 4 * 8 + 4 * 4
+      || c[1] != 5 * 8 + 9 * 4
+      || e[0] != 5
+      || e[1] != 19 * 16 + 6 * 8 + 16 * 4
+      || e[2] != 5
+      || f[0] != 6
+      || f[1] != 7
+      || f[2] != 21 * 16 + 7 * 8 + 8 * 4
+      || f[3] != 23 * 16 + 18 * 8 + 8 * 4
+      || f[4] != 9
+      || g[0] != 1
+      || g[1] != 25 * 16 + 29 * 8 + 9 * 4
+      || g[2] != 27 * 16 + 18 * 8 + 10 * 4
+      || g[3] != 2
+      || h[0] != 29 * 16 + 19 * 8 + 11 * 4
+      || h[1] != 1 || h[2] != 4
+      || k[0][0] != 5 || k[0][1] != 6
+      || k[1][0] != 31 * 16 + 17 * 8
+      || k[1][1] != 13 * 4
+      || k[2][0] != 19 * 8
+      || k[2][1] != 33 * 16 + 15 * 4
+      || k[3][0] != 7 || k[3][1] != 8
+      || p[0] != 29 * 16 + 19 * 8 + 11 * 4
+      || p[1] != 1 || p[2] != 4
+      || q[0][0] != 5 || q[0][1] != 6
+      || q[1][0] != 31 * 16 + 17 * 8
+      || q[1][1] != 13 * 4
+      || q[2][0] != 19 * 8
+      || q[2][1] != 33 * 16 + 15 * 4
+      || q[3][0] != 7 || q[3][1] != 8
+      || sb[0] != 5
+      || sb[1] != 1LL << (16 + 4)
+      || sb[2] != 1LL << 8
+      || sb[3] != 6
+      || tb[0][0] != 9 || tb[0][1] != 10 || tb[1][0] != 11 || tb[1][1] != 12
+      || tb[2][0] != 1LL << (16 + 8)
+      || tb[2][1] != 1LL << 4
+      || tb[3][0] != 1LL << 8
+      || tb[3][1] != 1LL << (16 + 4)
+      || tb[4][0] != 13 || tb[4][1] != 14)
+    abort ();
+}
+
+int
+main ()
+{
+  int n = 1;
+  test (n, 0ULL, 4ULL);
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c++/task-reduction-18.C b/libgomp/testsuite/libgomp.c++/task-reduction-18.C
new file mode 100644
index 000000000000..99c0e3727d43
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/task-reduction-18.C
@@ -0,0 +1,325 @@
+extern "C" void abort ();
+
+struct S { S (); S (long int, long int); ~S (); static int cnt1, cnt2, cnt3; long int s, t; };
+
+int S::cnt1;
+int S::cnt2;
+int S::cnt3;
+
+S::S ()
+{
+  #pragma omp atomic
+  cnt1++;
+}
+
+S::S (long int x, long int y) : s (x), t (y)
+{
+  #pragma omp atomic update
+  ++cnt2;
+}
+
+S::~S ()
+{
+  #pragma omp atomic
+  cnt3 = cnt3 + 1;
+  if (t < 3 || t > 9 || (t & 1) == 0)
+    abort ();
+}
+
+void
+bar (S *p, S *o)
+{
+  p->s = 1;
+  if (o->t != 5)
+    abort ();
+  p->t = 9;
+}
+
+static inline void
+baz (S *o, S *i)
+{
+  if (o->t != 5 || i->t != 9)
+    abort ();
+  o->s *= i->s;
+}
+
+#pragma omp declare reduction (+: S : omp_out.s += omp_in.s) initializer (omp_priv (0, 3))
+#pragma omp declare reduction (*: S : baz (&omp_out, &omp_in)) initializer (bar (&omp_priv, &omp_orig))
+
+S a[2] = { { 0, 7 }, { 0, 7 } };
+S b[7] = { { 9, 5 }, { 11, 5 }, { 1, 5 }, { 1, 5 }, { 1, 5 }, { 13, 5 }, { 15, 5 } };
+S e[3] = { { 5, 7 }, { 0, 7 }, { 5, 7 } };
+S f[5] = { { 6, 7 }, { 7, 7 }, { 0, 7 }, { 0, 7 }, { 9, 7 } };
+S g[4] = { { 1, 7 }, { 0, 7 }, { 0, 7 }, { 2, 7 } };
+S h[3] = { { 0, 7 }, { 1, 7 }, { 4, 7 } };
+S k[4][2] = { { { 5, 7 }, { 6, 7 } }, { { 0, 7 }, { 0, 7 } }, { { 0, 7 }, { 0, 7 } }, { { 7, 7 }, { 8, 7 } } };
+S *s;
+S (*t)[2];
+
+template <int N>
+void
+foo (int n, S *c, S *d, S m[3], S *r, S o[4], S *p, S q[4][2])
+{
+  int i;
+  for (i = 0; i < 2; i++)
+    #pragma omp task in_reduction (+: a, c[:2]) in_reduction (*: b[2 * n:3 * n], d[0:2]) \
+		     in_reduction (+: o[n:n*2], m[1], k[1:2][:], p[0], f[2:2]) \
+		     in_reduction (+: q[1:2][:], g[n:n*2], e[1], h[0], r[2:2]) \
+		     in_reduction (*: s[1:2], t[2:2][:])
+    {
+      a[0].s += 7;
+      a[1].s += 17;
+      b[2].s *= 2;
+      b[4].s *= 2;
+      c[0].s += 6;
+      d[1].s *= 2;
+      e[1].s += 19;
+      f[2].s += 21;
+      f[3].s += 23;
+      g[1].s += 25;
+      g[2].s += 27;
+      h[0].s += 29;
+      k[1][0].s += 31;
+      k[2][1].s += 33;
+      m[1].s += 19;
+      r[2].s += 21;
+      r[3].s += 23;
+      o[1].s += 25;
+      o[2].s += 27;
+      p[0].s += 29;
+      q[1][0].s += 31;
+      q[2][1].s += 33;
+      s[1].s *= 2;
+      t[2][0].s *= 2;
+      t[3][1].s *= 2;
+      if ((e[1].t != 7 && e[1].t != 3) || (h[0].t != 7 && h[0].t != 3)
+	  || (m[1].t != 7 && m[1].t != 3) || (p[0].t != 7 && p[0].t != 3))
+	abort ();
+      for (int z = 0; z < 2; z++)
+	if ((a[z].t != 7 && a[z].t != 3) || (c[z].t != 7 && c[z].t != 3)
+	    || (d[z].t != 5 && d[z].t != 9) || (f[z + 2].t != 7 && f[z + 2].t != 3)
+	    || (g[z + 1].t != 7 && g[z + 1].t != 3) || (r[z + 2].t != 7 && r[z + 2].t != 3)
+	    || (s[z + 1].t != 5 && s[z + 1].t != 9) || (o[z + 1].t != 7 && o[z + 1].t != 3)
+	    || (k[z + 1][0].t != 7 && k[z + 1][0].t != 3) || (k[z + 1][1].t != 7 && k[z + 1][1].t != 3)
+	    || (q[z + 1][0].t != 7 && q[z + 1][0].t != 3) || (q[z + 1][1].t != 7 && q[z + 1][1].t != 3)
+	    || (t[z + 2][0].t != 5 && t[z + 2][0].t != 9) || (t[z + 2][1].t != 5 && t[z + 2][1].t != 9))
+	  abort ();
+      for (int z = 0; z < 3; z++)
+	if (b[z + 2].t != 5 && b[z + 2].t != 9)
+	  abort ();
+    }
+}
+
+template <int N>
+void
+test (int n)
+{
+  S c[2] = { { 0, 7 }, { 0, 7 } };
+  S p[3] = { { 0, 7 }, { 1, 7 }, { 4, 7 } };
+  S q[4][2] = { { { 5, 7 }, { 6, 7 } }, { { 0, 7 }, { 0, 7 } }, { { 0, 7 }, { 0, 7 } }, { { 7, 7 }, { 8, 7 } } };
+  S ss[4] = { { 5, 5 }, { 1, 5 }, { 1, 5 }, { 6, 5 } };
+  S tt[5][2] = { { { 9, 5 }, { 10, 5 } }, { { 11, 5 }, { 12, 5 } }, { { 1, 5 }, { 1, 5 } }, { { 1, 5 }, { 1, 5 } }, { { 13, 5 }, { 14, 5 } } };
+  s = ss;
+  t = tt;
+  #pragma omp parallel num_threads (1) if (0)
+  {
+    S d[] = { { 1, 5 }, { 1, 5 } };
+    S m[3] = { { 5, 7 }, { 0, 7 }, { 5, 7 } };
+    S r[5] = { { 6, 7 }, { 7, 7 }, { 0, 7 }, { 0, 7 }, { 9, 7 } };
+    S o[4] = { { 1, 7 }, { 0, 7 }, { 0, 7 }, { 2, 7 } };
+    volatile unsigned long long x = 0;
+    volatile unsigned long long y = 4;
+    volatile unsigned long long z = 1;
+    #pragma omp parallel
+    {
+      #pragma omp for reduction (task, +: a, c) reduction (task, *: b[2 * n:3 * n], d) \
+		      reduction (task, +: e[1], f[2:2], g[n:n*2], h[0], k[1:2][0:2]) \
+		      reduction (task, +: o[n:n*2], m[1], q[1:2][:], p[0], r[2:2]) \
+		      reduction (task, *: t[2:2][:], s[1:n + 1]) \
+		      schedule (nonmonotonic: guided, 1)
+      for (unsigned long long i = x; i < y; i += z)
+	#pragma omp task in_reduction (+: a, c) in_reduction (*: b[2 * n:3 * n], d) \
+			 in_reduction (+: o[n:n*2], q[1:2][:], p[0], m[1], r[2:2]) \
+			 in_reduction (+: g[n:n * 2], e[1], k[1:2][:], h[0], f[2:2]) \
+			 in_reduction (*: s[1:2], t[2:2][:])
+	{
+	  int j;
+	  a[0].s += 2;
+	  a[1].s += 3;
+	  b[2].s *= 2;
+	  f[3].s += 8;
+	  g[1].s += 9;
+	  g[2].s += 10;
+	  h[0].s += 11;
+	  k[1][1].s += 13;
+	  k[2][1].s += 15;
+	  m[1].s += 16;
+	  r[2].s += 8;
+	  s[1].s *= 2;
+	  t[2][1].s *= 2;
+	  t[3][1].s *= 2;
+	  if ((e[1].t != 7 && e[1].t != 3) || (h[0].t != 7 && h[0].t != 3)
+	      || (m[1].t != 7 && m[1].t != 3) || (p[0].t != 7 && p[0].t != 3))
+	    abort ();
+	  for (int z = 0; z < 2; z++)
+	    if ((a[z].t != 7 && a[z].t != 3) || (c[z].t != 7 && c[z].t != 3)
+		|| (d[z].t != 5 && d[z].t != 9) || (f[z + 2].t != 7 && f[z + 2].t != 3)
+		|| (g[z + 1].t != 7 && g[z + 1].t != 3) || (r[z + 2].t != 7 && r[z + 2].t != 3)
+		|| (s[z + 1].t != 5 && s[z + 1].t != 9) || (o[z + 1].t != 7 && o[z + 1].t != 3)
+		|| (k[z + 1][0].t != 7 && k[z + 1][0].t != 3) || (k[z + 1][1].t != 7 && k[z + 1][1].t != 3)
+		|| (q[z + 1][0].t != 7 && q[z + 1][0].t != 3) || (q[z + 1][1].t != 7 && q[z + 1][1].t != 3)
+		|| (t[z + 2][0].t != 5 && t[z + 2][0].t != 9) || (t[z + 2][1].t != 5 && t[z + 2][1].t != 9))
+	      abort ();
+	  for (int z = 0; z < 3; z++)
+	    if (b[z + 2].t != 5 && b[z + 2].t != 9)
+	      abort ();
+	  for (j = 0; j < 2; j++)
+	    #pragma omp task in_reduction (+: a, c[:2]) \
+			     in_reduction (*: b[2 * n:3 * n], d[n - 1:n + 1]) \
+			     in_reduction (+: e[1], f[2:2], g[n:n*2], h[0], k[1:2][:2]) \
+			     in_reduction (+: m[1], r[2:2], o[n:n*2], p[0], q[1:2][:2]) \
+			     in_reduction (*: s[n:2], t[2:2][:])
+	    {
+	      m[1].s += 6;
+	      r[2].s += 7;
+	      q[1][0].s += 17;
+	      q[2][0].s += 19;
+	      a[0].s += 4;
+	      a[1].s += 5;
+	      b[3].s *= 2;
+	      b[4].s *= 2;
+	      f[3].s += 18;
+	      g[1].s += 29;
+	      g[2].s += 18;
+	      h[0].s += 19;
+	      s[2].s *= 2;
+	      t[2][0].s *= 2;
+	      t[3][0].s *= 2;
+	      foo<N> (n, c, d, m, r, o, p, q);
+	      if ((e[1].t != 7 && e[1].t != 3) || (h[0].t != 7 && h[0].t != 3)
+		  || (m[1].t != 7 && m[1].t != 3) || (p[0].t != 7 && p[0].t != 3))
+		abort ();
+	      for (int z = 0; z < 2; z++)
+		if ((a[z].t != 7 && a[z].t != 3) || (c[z].t != 7 && c[z].t != 3)
+		    || (d[z].t != 5 && d[z].t != 9) || (f[z + 2].t != 7 && f[z + 2].t != 3)
+		    || (g[z + 1].t != 7 && g[z + 1].t != 3) || (r[z + 2].t != 7 && r[z + 2].t != 3)
+		    || (s[z + 1].t != 5 && s[z + 1].t != 9) || (o[z + 1].t != 7 && o[z + 1].t != 3)
+		    || (k[z + 1][0].t != 7 && k[z + 1][0].t != 3) || (k[z + 1][1].t != 7 && k[z + 1][1].t != 3)
+		    || (q[z + 1][0].t != 7 && q[z + 1][0].t != 3) || (q[z + 1][1].t != 7 && q[z + 1][1].t != 3)
+		    || (t[z + 2][0].t != 5 && t[z + 2][0].t != 9) || (t[z + 2][1].t != 5 && t[z + 2][1].t != 9))
+		  abort ();
+	      for (int z = 0; z < 3; z++)
+		if (b[z + 2].t != 5 && b[z + 2].t != 9)
+		  abort ();
+	      r[3].s += 18;
+	      o[1].s += 29;
+	      o[2].s += 18;
+	      p[0].s += 19;
+	      c[0].s += 4;
+	      c[1].s += 5;
+	      d[0].s *= 2;
+	      e[1].s += 6;
+	      f[2].s += 7;
+	      k[1][0].s += 17;
+	      k[2][0].s += 19;
+	    }
+	  r[3].s += 8;
+	  o[1].s += 9;
+	  o[2].s += 10;
+	  p[0].s += 11;
+	  q[1][1].s += 13;
+	  q[2][1].s += 15;
+	  b[3].s *= 2;
+	  c[0].s += 4;
+	  c[1].s += 9;
+	  d[0].s *= 2;
+	  e[1].s += 16;
+	  f[2].s += 8;
+	}
+      if (a[0].s != 7 * 16 + 4 * 8 + 2 * 4
+	  || a[1].s != 17 * 16 + 5 * 8 + 3 * 4
+	  || b[0].s != 9 || b[1].s != 11
+	  || b[2].s != 1LL << (16 + 4)
+	  || b[3].s != 1LL << (8 + 4)
+	  || b[4].s != 1LL << (16 + 8)
+	  || b[5].s != 13 || b[6].s != 15
+	  || c[0].s != 6 * 16 + 4 * 8 + 4 * 4
+	  || c[1].s != 5 * 8 + 9 * 4
+	  || e[0].s != 5
+	  || e[1].s != 19 * 16 + 6 * 8 + 16 * 4
+	  || e[2].s != 5
+	  || f[0].s != 6
+	  || f[1].s != 7
+	  || f[2].s != 21 * 16 + 7 * 8 + 8 * 4
+	  || f[3].s != 23 * 16 + 18 * 8 + 8 * 4
+	  || f[4].s != 9
+	  || g[0].s != 1
+	  || g[1].s != 25 * 16 + 29 * 8 + 9 * 4
+	  || g[2].s != 27 * 16 + 18 * 8 + 10 * 4
+	  || g[3].s != 2
+	  || h[0].s != 29 * 16 + 19 * 8 + 11 * 4
+	  || h[1].s != 1 || h[2].s != 4
+	  || k[0][0].s != 5 || k[0][1].s != 6
+	  || k[1][0].s != 31 * 16 + 17 * 8
+	  || k[1][1].s != 13 * 4
+	  || k[2][0].s != 19 * 8
+	  || k[2][1].s != 33 * 16 + 15 * 4
+	  || k[3][0].s != 7 || k[3][1].s != 8
+	  || p[0].s != 29 * 16 + 19 * 8 + 11 * 4
+	  || p[1].s != 1 || p[2].s != 4
+	  || q[0][0].s != 5 || q[0][1].s != 6
+	  || q[1][0].s != 31 * 16 + 17 * 8
+	  || q[1][1].s != 13 * 4
+	  || q[2][0].s != 19 * 8
+	  || q[2][1].s != 33 * 16 + 15 * 4
+	  || q[3][0].s != 7 || q[3][1].s != 8
+	  || ss[0].s != 5
+	  || ss[1].s != 1LL << (16 + 4)
+	  || ss[2].s != 1LL << 8
+	  || ss[3].s != 6
+	  || tt[0][0].s != 9 || tt[0][1].s != 10 || tt[1][0].s != 11 || tt[1][1].s != 12
+	  || tt[2][0].s != 1LL << (16 + 8)
+	  || tt[2][1].s != 1LL << 4
+	  || tt[3][0].s != 1LL << 8
+	  || tt[3][1].s != 1LL << (16 + 4)
+	  || tt[4][0].s != 13 || tt[4][1].s != 14)
+	abort ();
+    }
+    if (d[0].s != 1LL << (8 + 4)
+	|| d[1].s != 1LL << 16
+	|| m[0].s != 5
+	|| m[1].s != 19 * 16 + 6 * 8 + 16 * 4
+	|| m[2].s != 5
+	|| r[0].s != 6
+	|| r[1].s != 7
+	|| r[2].s != 21 * 16 + 7 * 8 + 8 * 4
+	|| r[3].s != 23 * 16 + 18 * 8 + 8 * 4
+	|| r[4].s != 9
+	|| o[0].s != 1
+	|| o[1].s != 25 * 16 + 29 * 8 + 9 * 4
+	|| o[2].s != 27 * 16 + 18 * 8 + 10 * 4
+	|| o[3].s != 2)
+      abort ();
+    if (e[1].t != 7 || h[0].t != 7 || m[1].t != 7 || p[0].t != 7)
+      abort ();
+    for (int z = 0; z < 2; z++)
+      if (a[z].t != 7 || c[z].t != 7 || d[z].t != 5 || f[z + 2].t != 7
+	  || g[z + 1].t != 7 || r[z + 2].t != 7 || s[z + 1].t != 5 || o[z + 1].t != 7
+	  || k[z + 1][0].t != 7 || k[z + 1][1].t != 7 || q[z + 1][0].t != 7 || q[z + 1][1].t != 7
+	  || t[z + 2][0].t != 5 || t[z + 2][1].t != 5)
+	abort ();
+    for (int z = 0; z < 3; z++)
+      if (b[z + 2].t != 5)
+	abort ();
+  }
+}
+
+int
+main ()
+{
+  int c1 = S::cnt1, c2 = S::cnt2, c3 = S::cnt3;
+  test<0> (1);
+  if (S::cnt1 + S::cnt2 - c1 - c2 != S::cnt3 - c3)
+    abort ();
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c++/task-reduction-19.C b/libgomp/testsuite/libgomp.c++/task-reduction-19.C
new file mode 100644
index 000000000000..15945c57cc24
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/task-reduction-19.C
@@ -0,0 +1,343 @@
+extern "C" void abort ();
+
+struct S { S (); S (long int, long int); ~S (); static int cnt1, cnt2, cnt3; long int s, t; };
+
+int S::cnt1;
+int S::cnt2;
+int S::cnt3;
+
+S::S ()
+{
+  #pragma omp atomic
+  cnt1++;
+}
+
+S::S (long int x, long int y) : s (x), t (y)
+{
+  #pragma omp atomic update
+  ++cnt2;
+}
+
+S::~S ()
+{
+  #pragma omp atomic
+  cnt3 = cnt3 + 1;
+  if (t < 3 || t > 9 || (t & 1) == 0)
+    abort ();
+}
+
+void
+bar (S *p, S *o)
+{
+  p->s = 1;
+  if (o->t != 5)
+    abort ();
+  p->t = 9;
+}
+
+static inline void
+baz (S *o, S *i)
+{
+  if (o->t != 5 || i->t != 9)
+    abort ();
+  o->s *= i->s;
+}
+
+#pragma omp declare reduction (+: S : omp_out.s += omp_in.s) initializer (omp_priv (0, 3))
+#pragma omp declare reduction (*: S : baz (&omp_out, &omp_in)) initializer (bar (&omp_priv, &omp_orig))
+
+S as[2] = { { 0, 7 }, { 0, 7 } };
+S (&a)[2] = as;
+S bs[7] = { { 9, 5 }, { 11, 5 }, { 1, 5 }, { 1, 5 }, { 1, 5 }, { 13, 5 }, { 15, 5 } };
+S (&b)[7] = bs;
+S es[3] = { { 5, 7 }, { 0, 7 }, { 5, 7 } };
+S (&e)[3] = es;
+S fs[5] = { { 6, 7 }, { 7, 7 }, { 0, 7 }, { 0, 7 }, { 9, 7 } };
+S (&f)[5] = fs;
+S gs[4] = { { 1, 7 }, { 0, 7 }, { 0, 7 }, { 2, 7 } };
+S (&g)[4] = gs;
+S hs[3] = { { 0, 7 }, { 1, 7 }, { 4, 7 } };
+S (&h)[3] = hs;
+S ks[4][2] = { { { 5, 7 }, { 6, 7 } }, { { 0, 7 }, { 0, 7 } }, { { 0, 7 }, { 0, 7 } }, { { 7, 7 }, { 8, 7 } } };
+S (&k)[4][2] = ks;
+S *ss;
+S *&s = ss;
+S (*ts)[2];
+S (*&t)[2] = ts;
+
+template <typename T>
+void
+foo (T &n, S *&c, S *&d, S (&m)[3], S *&r, S (&o)[4], S *&p, S (&q)[4][2])
+{
+  T i;
+  for (i = 0; i < 2; i++)
+    #pragma omp task in_reduction (+: a, c[:2]) in_reduction (*: b[2 * n:3 * n], d[0:2]) \
+		     in_reduction (+: o[n:n*2], m[1], k[1:2][:], p[0], f[2:2]) \
+		     in_reduction (+: q[1:2][:], g[n:n*2], e[1], h[0], r[2:2]) \
+		     in_reduction (*: s[1:2], t[2:2][:])
+    {
+      a[0].s += 7;
+      a[1].s += 17;
+      b[2].s *= 2;
+      b[4].s *= 2;
+      c[0].s += 6;
+      d[1].s *= 2;
+      e[1].s += 19;
+      f[2].s += 21;
+      f[3].s += 23;
+      g[1].s += 25;
+      g[2].s += 27;
+      h[0].s += 29;
+      k[1][0].s += 31;
+      k[2][1].s += 33;
+      m[1].s += 19;
+      r[2].s += 21;
+      r[3].s += 23;
+      o[1].s += 25;
+      o[2].s += 27;
+      p[0].s += 29;
+      q[1][0].s += 31;
+      q[2][1].s += 33;
+      s[1].s *= 2;
+      t[2][0].s *= 2;
+      t[3][1].s *= 2;
+      if ((e[1].t != 7 && e[1].t != 3) || (h[0].t != 7 && h[0].t != 3)
+	  || (m[1].t != 7 && m[1].t != 3) || (p[0].t != 7 && p[0].t != 3))
+	abort ();
+      for (T z = 0; z < 2; z++)
+	if ((a[z].t != 7 && a[z].t != 3) || (c[z].t != 7 && c[z].t != 3)
+	    || (d[z].t != 5 && d[z].t != 9) || (f[z + 2].t != 7 && f[z + 2].t != 3)
+	    || (g[z + 1].t != 7 && g[z + 1].t != 3) || (r[z + 2].t != 7 && r[z + 2].t != 3)
+	    || (s[z + 1].t != 5 && s[z + 1].t != 9) || (o[z + 1].t != 7 && o[z + 1].t != 3)
+	    || (k[z + 1][0].t != 7 && k[z + 1][0].t != 3) || (k[z + 1][1].t != 7 && k[z + 1][1].t != 3)
+	    || (q[z + 1][0].t != 7 && q[z + 1][0].t != 3) || (q[z + 1][1].t != 7 && q[z + 1][1].t != 3)
+	    || (t[z + 2][0].t != 5 && t[z + 2][0].t != 9) || (t[z + 2][1].t != 5 && t[z + 2][1].t != 9))
+	  abort ();
+      for (T z = 0; z < 3; z++)
+	if (b[z + 2].t != 5 && b[z + 2].t != 9)
+	  abort ();
+    }
+}
+
+template <typename T>
+void
+test (T &n)
+{
+  S cs[2] = { { 0, 7 }, { 0, 7 } };
+  S (&c)[2] = cs;
+  S ps[3] = { { 0, 7 }, { 1, 7 }, { 4, 7 } };
+  S (&p)[3] = ps;
+  S qs[4][2] = { { { 5, 7 }, { 6, 7 } }, { { 0, 7 }, { 0, 7 } }, { { 0, 7 }, { 0, 7 } }, { { 7, 7 }, { 8, 7 } } };
+  S (&q)[4][2] = qs;
+  S sb[4] = { { 5, 5 }, { 1, 5 }, { 1, 5 }, { 6, 5 } };
+  S tb[5][2] = { { { 9, 5 }, { 10, 5 } }, { { 11, 5 }, { 12, 5 } }, { { 1, 5 }, { 1, 5 } }, { { 1, 5 }, { 1, 5 } }, { { 13, 5 }, { 14, 5 } } };
+  S ms[3] = { { 5, 7 }, { 0, 7 }, { 5, 7 } };
+  S os[4] = { { 1, 7 }, { 0, 7 }, { 0, 7 }, { 2, 7 } };
+  s = sb;
+  t = tb;
+  #pragma omp parallel if (0)
+  {
+    S ds[] = { { 1, 5 }, { 1, 5 } };
+    S (&d)[2] = ds;
+    S (&m)[3] = ms;
+    S rs[5] = { { 6, 7 }, { 7, 7 }, { 0, 7 }, { 0, 7 }, { 9, 7 } };
+    S (&r)[5] = rs;
+    S (&o)[4] = os;
+    #pragma omp parallel
+    {
+      #pragma omp for reduction (task, +: a, c) reduction (task, *: b[2 * n:3 * n], d) \
+		      reduction (task, +: e[1], f[2:2], g[n:n*2], h[0], k[1:2][0:2]) \
+		      reduction (task, +: o[n:n*2], m[1], q[1:2][:], p[0], r[2:2]) \
+		      reduction (task, *: t[2:2][:], s[1:n + 1]) \
+		      schedule (monotonic: runtime)
+      for (T i = 0; i < 4; i++)
+	#pragma omp task in_reduction (+: a, c) in_reduction (*: b[2 * n:3 * n], d) \
+			 in_reduction (+: o[n:n*2], q[1:2][:], p[0], m[1], r[2:2]) \
+			 in_reduction (+: g[n:n * 2], e[1], k[1:2][:], h[0], f[2:2]) \
+			 in_reduction (*: s[1:2], t[2:2][:])
+	{
+	  T j;
+	  a[0].s += 2;
+	  a[1].s += 3;
+	  b[2].s *= 2;
+	  f[3].s += 8;
+	  g[1].s += 9;
+	  g[2].s += 10;
+	  h[0].s += 11;
+	  k[1][1].s += 13;
+	  k[2][1].s += 15;
+	  m[1].s += 16;
+	  r[2].s += 8;
+	  s[1].s *= 2;
+	  t[2][1].s *= 2;
+	  t[3][1].s *= 2;
+	  if ((e[1].t != 7 && e[1].t != 3) || (h[0].t != 7 && h[0].t != 3)
+	      || (m[1].t != 7 && m[1].t != 3) || (p[0].t != 7 && p[0].t != 3))
+	    abort ();
+	  for (T z = 0; z < 2; z++)
+	    if ((a[z].t != 7 && a[z].t != 3) || (c[z].t != 7 && c[z].t != 3)
+		|| (d[z].t != 5 && d[z].t != 9) || (f[z + 2].t != 7 && f[z + 2].t != 3)
+		|| (g[z + 1].t != 7 && g[z + 1].t != 3) || (r[z + 2].t != 7 && r[z + 2].t != 3)
+		|| (s[z + 1].t != 5 && s[z + 1].t != 9) || (o[z + 1].t != 7 && o[z + 1].t != 3)
+		|| (k[z + 1][0].t != 7 && k[z + 1][0].t != 3) || (k[z + 1][1].t != 7 && k[z + 1][1].t != 3)
+		|| (q[z + 1][0].t != 7 && q[z + 1][0].t != 3) || (q[z + 1][1].t != 7 && q[z + 1][1].t != 3)
+		|| (t[z + 2][0].t != 5 && t[z + 2][0].t != 9) || (t[z + 2][1].t != 5 && t[z + 2][1].t != 9))
+	      abort ();
+	  for (T z = 0; z < 3; z++)
+	    if (b[z + 2].t != 5 && b[z + 2].t != 9)
+	      abort ();
+	  for (j = 0; j < 2; j++)
+	    #pragma omp task in_reduction (+: a, c[:2]) \
+			     in_reduction (*: b[2 * n:3 * n], d[n - 1:n + 1]) \
+			     in_reduction (+: e[1], f[2:2], g[n:n*2], h[0], k[1:2][:2]) \
+			     in_reduction (+: m[1], r[2:2], o[n:n*2], p[0], q[1:2][:2]) \
+			     in_reduction (*: s[n:2], t[2:2][:])
+	    {
+	      m[1].s += 6;
+	      r[2].s += 7;
+	      q[1][0].s += 17;
+	      q[2][0].s += 19;
+	      a[0].s += 4;
+	      a[1].s += 5;
+	      b[3].s *= 2;
+	      b[4].s *= 2;
+	      f[3].s += 18;
+	      g[1].s += 29;
+	      g[2].s += 18;
+	      h[0].s += 19;
+	      s[2].s *= 2;
+	      t[2][0].s *= 2;
+	      t[3][0].s *= 2;
+	      S *cp = c;
+	      S *dp = d;
+	      S *rp = r;
+	      S *pp = p;
+	      if ((e[1].t != 7 && e[1].t != 3) || (h[0].t != 7 && h[0].t != 3)
+		  || (m[1].t != 7 && m[1].t != 3) || (p[0].t != 7 && p[0].t != 3))
+		abort ();
+	      for (T z = 0; z < 2; z++)
+		if ((a[z].t != 7 && a[z].t != 3) || (c[z].t != 7 && c[z].t != 3)
+		    || (d[z].t != 5 && d[z].t != 9) || (f[z + 2].t != 7 && f[z + 2].t != 3)
+		    || (g[z + 1].t != 7 && g[z + 1].t != 3) || (r[z + 2].t != 7 && r[z + 2].t != 3)
+		    || (s[z + 1].t != 5 && s[z + 1].t != 9) || (o[z + 1].t != 7 && o[z + 1].t != 3)
+		    || (k[z + 1][0].t != 7 && k[z + 1][0].t != 3) || (k[z + 1][1].t != 7 && k[z + 1][1].t != 3)
+		    || (q[z + 1][0].t != 7 && q[z + 1][0].t != 3) || (q[z + 1][1].t != 7 && q[z + 1][1].t != 3)
+		    || (t[z + 2][0].t != 5 && t[z + 2][0].t != 9) || (t[z + 2][1].t != 5 && t[z + 2][1].t != 9))
+		  abort ();
+	      for (T z = 0; z < 3; z++)
+		if (b[z + 2].t != 5 && b[z + 2].t != 9)
+		  abort ();
+	      foo (n, cp, dp, m, rp, o, pp, q);
+	      r[3].s += 18;
+	      o[1].s += 29;
+	      o[2].s += 18;
+	      p[0].s += 19;
+	      c[0].s += 4;
+	      c[1].s += 5;
+	      d[0].s *= 2;
+	      e[1].s += 6;
+	      f[2].s += 7;
+	      k[1][0].s += 17;
+	      k[2][0].s += 19;
+	    }
+	  r[3].s += 8;
+	  o[1].s += 9;
+	  o[2].s += 10;
+	  p[0].s += 11;
+	  q[1][1].s += 13;
+	  q[2][1].s += 15;
+	  b[3].s *= 2;
+	  c[0].s += 4;
+	  c[1].s += 9;
+	  d[0].s *= 2;
+	  e[1].s += 16;
+	  f[2].s += 8;
+	}
+      if (a[0].s != 7 * 16 + 4 * 8 + 2 * 4
+	  || a[1].s != 17 * 16 + 5 * 8 + 3 * 4
+	  || b[0].s != 9 || b[1].s != 11
+	  || b[2].s != 1LL << (16 + 4)
+	  || b[3].s != 1LL << (8 + 4)
+	  || b[4].s != 1LL << (16 + 8)
+	  || b[5].s != 13 || b[6].s != 15
+	  || c[0].s != 6 * 16 + 4 * 8 + 4 * 4
+	  || c[1].s != 5 * 8 + 9 * 4
+	  || e[0].s != 5
+	  || e[1].s != 19 * 16 + 6 * 8 + 16 * 4
+	  || e[2].s != 5
+	  || f[0].s != 6
+	  || f[1].s != 7
+	  || f[2].s != 21 * 16 + 7 * 8 + 8 * 4
+	  || f[3].s != 23 * 16 + 18 * 8 + 8 * 4
+	  || f[4].s != 9
+	  || g[0].s != 1
+	  || g[1].s != 25 * 16 + 29 * 8 + 9 * 4
+	  || g[2].s != 27 * 16 + 18 * 8 + 10 * 4
+	  || g[3].s != 2
+	  || h[0].s != 29 * 16 + 19 * 8 + 11 * 4
+	  || h[1].s != 1 || h[2].s != 4
+	  || k[0][0].s != 5 || k[0][1].s != 6
+	  || k[1][0].s != 31 * 16 + 17 * 8
+	  || k[1][1].s != 13 * 4
+	  || k[2][0].s != 19 * 8
+	  || k[2][1].s != 33 * 16 + 15 * 4
+	  || k[3][0].s != 7 || k[3][1].s != 8
+	  || p[0].s != 29 * 16 + 19 * 8 + 11 * 4
+	  || p[1].s != 1 || p[2].s != 4
+	  || q[0][0].s != 5 || q[0][1].s != 6
+	  || q[1][0].s != 31 * 16 + 17 * 8
+	  || q[1][1].s != 13 * 4
+	  || q[2][0].s != 19 * 8
+	  || q[2][1].s != 33 * 16 + 15 * 4
+	  || q[3][0].s != 7 || q[3][1].s != 8
+	  || sb[0].s != 5
+	  || sb[1].s != 1LL << (16 + 4)
+	  || sb[2].s != 1LL << 8
+	  || sb[3].s != 6
+	  || tb[0][0].s != 9 || tb[0][1].s != 10 || tb[1][0].s != 11 || tb[1][1].s != 12
+	  || tb[2][0].s != 1LL << (16 + 8)
+	  || tb[2][1].s != 1LL << 4
+	  || tb[3][0].s != 1LL << 8
+	  || tb[3][1].s != 1LL << (16 + 4)
+	  || tb[4][0].s != 13 || tb[4][1].s != 14)
+	abort ();
+      if (d[0].s != 1LL << (8 + 4)
+	  || d[1].s != 1LL << 16
+	  || m[0].s != 5
+	  || m[1].s != 19 * 16 + 6 * 8 + 16 * 4
+	  || m[2].s != 5
+	  || r[0].s != 6
+	  || r[1].s != 7
+	  || r[2].s != 21 * 16 + 7 * 8 + 8 * 4
+	  || r[3].s != 23 * 16 + 18 * 8 + 8 * 4
+	  || r[4].s != 9
+	  || o[0].s != 1
+	  || o[1].s != 25 * 16 + 29 * 8 + 9 * 4
+	  || o[2].s != 27 * 16 + 18 * 8 + 10 * 4
+	  || o[3].s != 2)
+	abort ();
+      if (e[1].t != 7 || h[0].t != 7 || m[1].t != 7 || p[0].t != 7)
+	abort ();
+      for (T z = 0; z < 2; z++)
+	if (a[z].t != 7 || c[z].t != 7 || d[z].t != 5 || f[z + 2].t != 7
+	    || g[z + 1].t != 7 || r[z + 2].t != 7 || s[z + 1].t != 5 || o[z + 1].t != 7
+	    || k[z + 1][0].t != 7 || k[z + 1][1].t != 7 || q[z + 1][0].t != 7 || q[z + 1][1].t != 7
+	    || t[z + 2][0].t != 5 || t[z + 2][1].t != 5)
+	  abort ();
+      for (T z = 0; z < 3; z++)
+	if (b[z + 2].t != 5)
+	  abort ();
+    }
+  }
+}
+
+int
+main ()
+{
+  int c1 = S::cnt1, c2 = S::cnt2, c3 = S::cnt3;
+  int n = 1;
+  test (n);
+  if (S::cnt1 + S::cnt2 - c1 - c2 != S::cnt3 - c3)
+    abort ();
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c-c++-common/cancel-parallel-1.c b/libgomp/testsuite/libgomp.c-c++-common/cancel-parallel-1.c
new file mode 100644
index 000000000000..77395e2b0f3e
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/cancel-parallel-1.c
@@ -0,0 +1,42 @@
+/* { dg-do run } */
+/* { dg-set-target-env-var OMP_CANCELLATION "true" } */
+
+#include <stdlib.h>
+#include <omp.h>
+
+int
+main ()
+{
+  int a[64];
+  #pragma omp parallel
+  {
+    #pragma omp barrier
+    if (omp_get_thread_num () == 0)
+      {
+	#pragma omp cancel parallel
+      }
+    #pragma omp for
+    for (int i = 0; i < 64; i++)
+      a[i] = i;
+    if (omp_get_cancellation ())
+      abort ();
+  }
+  #pragma omp parallel
+  {
+    #pragma omp barrier
+    if (omp_get_thread_num () == 0)
+      {
+	#pragma omp cancel parallel
+      }
+    #pragma omp taskgroup
+    {
+      #pragma omp for
+      for (int i = 0; i < 64; i++)
+	#pragma omp task
+	a[i] += i;
+      if (omp_get_cancellation ())
+	abort ();
+    }
+  }
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c-c++-common/cancel-taskgroup-3.c b/libgomp/testsuite/libgomp.c-c++-common/cancel-taskgroup-3.c
new file mode 100644
index 000000000000..b9af83595b0e
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/cancel-taskgroup-3.c
@@ -0,0 +1,68 @@
+/* { dg-do run } */
+/* { dg-set-target-env-var OMP_CANCELLATION "true" } */
+
+#include <stdlib.h>
+#include <omp.h>
+
+int
+main ()
+{
+  int a = 0, i;
+  #pragma omp parallel
+  #pragma omp taskgroup
+  {
+    #pragma omp task
+    {
+      #pragma omp cancel taskgroup
+      if (omp_get_cancellation ())
+	abort ();
+    }
+    #pragma omp taskwait
+    #pragma omp for reduction (task, +: a)
+    for (i = 0; i < 64; ++i)
+      {
+	a++;
+	#pragma omp task in_reduction (+: a)
+	{
+	  volatile int zero = 0;
+	  a += zero;
+	  if (omp_get_cancellation ())
+	    abort ();
+	}
+      }
+    if (a != 64)
+      abort ();
+    #pragma omp task
+    {
+      if (omp_get_cancellation ())
+	abort ();
+    }
+  }
+  a = 0;
+  #pragma omp parallel
+  #pragma omp taskgroup
+  {
+    #pragma omp taskwait
+    #pragma omp for reduction (task, +: a)
+    for (i = 0; i < 64; ++i)
+      {
+	a++;
+	#pragma omp task in_reduction (+: a)
+	{
+	  volatile int zero = 0;
+	  a += zero;
+	  #pragma omp cancel taskgroup
+	  if (omp_get_cancellation ())
+	    abort ();
+	}
+      }
+    if (a != 64)
+      abort ();
+    #pragma omp task
+    {
+      if (omp_get_cancellation ())
+	abort ();
+    }
+  }
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c-c++-common/task-reduction-11.c b/libgomp/testsuite/libgomp.c-c++-common/task-reduction-11.c
new file mode 100644
index 000000000000..038b0e269e76
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/task-reduction-11.c
@@ -0,0 +1,56 @@
+extern
+#ifdef __cplusplus
+"C"
+#endif
+void abort (void);
+int a, b[3] = { 1, 1, 1 };
+unsigned long int c[2] = { ~0UL, ~0UL };
+
+void
+bar (int i)
+{
+  #pragma omp task in_reduction (*: b[:3]) in_reduction (&: c[1:]) \
+		   in_reduction (+: a)
+  {
+    a += 4;
+    b[1] *= 4;
+    c[1] &= ~(1UL << (i + 16));
+  }
+}
+
+void
+foo (unsigned long long int x, unsigned long long int y, unsigned long long int z)
+{
+  unsigned long long int i;
+  #pragma omp for schedule(runtime) reduction (task, +: a) \
+		  reduction (task, *: b) reduction (task, &: c[1:1])
+  for (i = x; i < y; i += z)
+    {
+      a++;
+      b[0] *= 2;
+      bar (i);
+      b[2] *= 3;
+      c[1] &= ~(1UL << i);
+    }
+}
+
+int
+main ()
+{
+  volatile int two = 2;
+  foo (two, 7 * two, two);
+  if (a != 30 || b[0] != 64 || b[1] != (1 << 12) || b[2] != 3 * 3 * 3 * 3 * 3 * 3
+      || c[0] != ~0UL || c[1] != ~0x15541554UL)
+    abort ();
+  a = 0;
+  b[0] = 1;
+  b[1] = 1;
+  b[2] = 1;
+  c[1] = ~0UL;
+  #pragma omp parallel
+  foo (two, 8 * two, two);
+  if (a != 35 || b[0] != 128 || b[1] != (1 << 14) || b[2] != 3 * 3 * 3 * 3 * 3 * 3 * 3
+      || c[0] != ~0UL || c[1] != ~0x55545554UL)
+    abort ();
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c-c++-common/task-reduction-12.c b/libgomp/testsuite/libgomp.c-c++-common/task-reduction-12.c
new file mode 100644
index 000000000000..0ad92735ca74
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/task-reduction-12.c
@@ -0,0 +1,67 @@
+extern
+#ifdef __cplusplus
+"C"
+#endif
+void abort (void);
+int a, b[3] = { 1, 1, 1 };
+unsigned long int c[2] = { ~0UL, ~0UL };
+
+void
+bar (int i)
+{
+  #pragma omp task in_reduction (*: b[:3]) in_reduction (&: c[1:]) \
+		   in_reduction (+: a)
+  {
+    a += 4;
+    b[1] *= 4;
+    c[1] &= ~(1UL << (i + 16));
+  }
+}
+
+void
+foo (int x)
+{
+  #pragma omp sections reduction (task, +: a) reduction (task, *: b) \
+		       reduction (task, &: c[1:1])
+  {
+    {
+      a++; b[0] *= 2; bar (2); b[2] *= 3; c[1] &= ~(1UL << 2);
+    }
+    #pragma omp section
+    { b[0] *= 2; bar (4); b[2] *= 3; c[1] &= ~(1UL << 4); a++; }
+    #pragma omp section
+    { bar (6); b[2] *= 3; c[1] &= ~(1UL << 6); a++; b[0] *= 2; }
+    #pragma omp section
+    { b[2] *= 3; c[1] &= ~(1UL << 8); a++; b[0] *= 2; bar (8); }
+    #pragma omp section
+    { c[1] &= ~(1UL << 10); a++; b[0] *= 2; bar (10); b[2] *= 3; }
+    #pragma omp section
+    { a++; b[0] *= 2; b[2] *= 3; c[1] &= ~(1UL << 12); bar (12); }
+    #pragma omp section
+    if (x)
+      {
+	a++; b[0] *= 2; b[2] *= 3; bar (14); c[1] &= ~(1UL << 14);
+      }
+  }
+}
+
+int
+main ()
+{
+  volatile int one = 1;
+  foo (!one);
+  if (a != 30 || b[0] != 64 || b[1] != (1 << 12) || b[2] != 3 * 3 * 3 * 3 * 3 * 3
+      || c[0] != ~0UL || c[1] != ~0x15541554UL)
+    abort ();
+  a = 0;
+  b[0] = 1;
+  b[1] = 1;
+  b[2] = 1;
+  c[1] = ~0UL;
+  #pragma omp parallel
+  foo (one);
+  if (a != 35 || b[0] != 128 || b[1] != (1 << 14) || b[2] != 3 * 3 * 3 * 3 * 3 * 3 * 3
+      || c[0] != ~0UL || c[1] != ~0x55545554UL)
+    abort ();
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c-c++-common/task-reduction-6.c b/libgomp/testsuite/libgomp.c-c++-common/task-reduction-6.c
index b7a29f043250..e0a946efba17 100644
--- a/libgomp/testsuite/libgomp.c-c++-common/task-reduction-6.c
+++ b/libgomp/testsuite/libgomp.c-c++-common/task-reduction-6.c
@@ -1,7 +1,7 @@
 #include <stdlib.h>
 #include <omp.h>
 
-struct S { unsigned long int s, t; };
+struct S { unsigned long long int s, t; };
 
 void
 rbar (struct S *p, struct S *o)
@@ -119,5 +119,7 @@ main ()
     abort ();
   if (m.s != 63 * 64 * 4 || m.t != 7)
     abort ();
+  if (r != t)
+    abort ();
   return 0;
 }
diff --git a/libgomp/testsuite/libgomp.c-c++-common/task-reduction-8.c b/libgomp/testsuite/libgomp.c-c++-common/task-reduction-8.c
new file mode 100644
index 000000000000..7b0859db6f01
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/task-reduction-8.c
@@ -0,0 +1,141 @@
+#include <stdlib.h>
+#include <omp.h>
+
+struct S { unsigned long long int s, t; };
+
+void
+rbar (struct S *p, struct S *o)
+{
+  p->s = 1;
+  if (o->t != 5)
+    abort ();
+  p->t = 9;
+}
+
+static inline void
+rbaz (struct S *o, struct S *i)
+{
+  if (o->t != 5 || i->t != 9)
+    abort ();
+  o->s *= i->s;
+}
+
+#pragma omp declare reduction (+: struct S : omp_out.s += omp_in.s) \
+  initializer (omp_priv = { 0, 3 })
+#pragma omp declare reduction (*: struct S : rbaz (&omp_out, &omp_in)) \
+  initializer (rbar (&omp_priv, &omp_orig))
+
+struct S g = { 0, 7 };
+struct S h = { 1, 5 };
+
+int
+foo (int z, int *a, int *b)
+{
+  int x = 0;
+  #pragma omp taskloop reduction (+:x) in_reduction (+:b[0])
+  for (int i = z; i < z + 8; i++)
+    {
+      x += a[i];
+      *b += a[i] * 2;
+    }
+  return x;
+}
+
+unsigned long long int
+bar (int z, int *a, unsigned long long int *b, int *s)
+{
+  unsigned long long int x = 1;
+  #pragma omp taskloop reduction (*:x) in_reduction (*:b[0])
+  for (int i = z; i < z + 8; i++)
+    {
+      #pragma omp task in_reduction (*:x)
+      x *= a[i];
+      #pragma omp task in_reduction (*:b[0])
+      *b *= (3 - a[i]);
+      s[0]++;
+    }
+  return x;
+}
+
+void
+baz (int i, int *a, int *c)
+{
+  #pragma omp task in_reduction (*:h) in_reduction (+:g)
+  {
+    g.s += 7 * a[i];
+    h.s *= (3 - c[i]);
+    if ((g.t != 7 && g.t != 3) || (h.t != 5 && h.t != 9))
+      abort ();
+  }
+}
+
+int
+main ()
+{
+  int i, j = 0, a[64], b = 0, c[64], f = 0;
+  unsigned long long int d = 1, e = 1;
+  volatile int one = 1;
+  int r = 0, s = 0, t;
+  struct S m = { 0, 7 };
+  struct S n = { 1, 5 };
+  for (i = 0; i < 64; i++)
+    {
+      a[i] = 2 * i;
+      c[i] = 1 + ((i % 3) != 1);
+    }
+  #pragma omp parallel reduction (task, +:b) shared(t) reduction(+:r, s)
+  {
+    int z, q1, q2, q3;
+    #pragma omp master
+    t = omp_get_num_threads ();
+    #pragma omp for schedule(static) reduction (task, +: f) reduction (+: j)
+    for (z = 0; z < 64; z += 8)
+      {
+	f++;
+	j += foo (z, a, &b);
+	j += foo (z, a, &f);
+      }
+    if (j != 63 * 64 * 2 || f != 63 * 64 * 2 + 8)
+      abort ();
+    r++;
+    #pragma omp taskgroup task_reduction (+: s)
+    {
+      #pragma omp for schedule(static, 1) reduction(task, *: d) reduction (*: e)
+      for (z = 0; z < 64; z += 8)
+	e *= bar (z, c, &d, &s);
+    }
+    if (e != (1ULL << 43) || d != (1ULL << 21))
+      abort ();
+    #pragma omp for schedule(monotonic: dynamic, 1) reduction (task, +: g, m) \
+		    reduction (task, *: h, n) collapse(3)
+    for (q1 = 0; q1 < one; q1++)
+      for (q2 = 0; q2 < 64; q2 += 8)
+	for (q3 = 0; q3 < one; ++q3)
+	  #pragma omp taskloop in_reduction (+: g, m) in_reduction (*: h, n) \
+			       nogroup
+	  for (i = q2; i < q2 + 8; ++i)
+	    {
+	      g.s += 3 * a[i];
+	      h.s *= (3 - c[i]);
+	      m.s += 4 * a[i];
+	      n.s *= c[i];
+	      if ((g.t != 7 && g.t != 3) || (h.t != 5 && h.t != 9)
+		  || (m.t != 7 && m.t != 3) || (n.t != 5 && n.t != 9))
+		abort ();
+	      baz (i, a, c);
+	    }
+    if (n.s != (1ULL << 43) || n.t != 5)
+      abort ();
+    if (g.s != 63 * 64 * 10 || g.t != 7)
+      abort ();
+    if (h.s != (1ULL << 42) || h.t != 5)
+      abort ();
+    if (m.s != 63 * 64 * 4 || m.t != 7)
+      abort ();
+  }
+  if (b != 63 * 64 * 2)
+    abort ();
+  if (r != t || s != 64)
+    abort ();
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c-c++-common/task-reduction-9.c b/libgomp/testsuite/libgomp.c-c++-common/task-reduction-9.c
new file mode 100644
index 000000000000..3d71fef86708
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/task-reduction-9.c
@@ -0,0 +1,217 @@
+#ifdef __cplusplus
+extern "C"
+#endif
+void abort (void);
+
+int a[2];
+long long int b[7] = { 9, 11, 1, 1, 1, 13, 15 };
+int e[3] = { 5, 0, 5 };
+int f[5] = { 6, 7, 0, 0, 9 };
+int g[4] = { 1, 0, 0, 2 };
+int h[3] = { 0, 1, 4 };
+int k[4][2] = { { 5, 6 }, { 0, 0 }, { 0, 0 }, { 7, 8 } };
+long long *s;
+long long (*t)[2];
+
+void
+foo (int n, int *c, long long int *d, int m[3], int *r, int o[4], int *p, int q[4][2])
+{
+  int i;
+  for (i = 0; i < 2; i++)
+    #pragma omp task in_reduction (+: a, c[:2]) in_reduction (*: b[2 * n:3 * n], d[0:2]) \
+		     in_reduction (+: o[n:n*2], m[1], k[1:2][:], p[0], f[2:2]) \
+		     in_reduction (+: q[1:2][:], g[n:n*2], e[1], h[0], r[2:2]) \
+		     in_reduction (*: s[1:2], t[2:2][:])
+    {
+      a[0] += 7;
+      a[1] += 17;
+      b[2] *= 2;
+      b[4] *= 2;
+      c[0] += 6;
+      d[1] *= 2;
+      e[1] += 19;
+      f[2] += 21;
+      f[3] += 23;
+      g[1] += 25;
+      g[2] += 27;
+      h[0] += 29;
+      k[1][0] += 31;
+      k[2][1] += 33;
+      m[1] += 19;
+      r[2] += 21;
+      r[3] += 23;
+      o[1] += 25;
+      o[2] += 27;
+      p[0] += 29;
+      q[1][0] += 31;
+      q[2][1] += 33;
+      s[1] *= 2;
+      t[2][0] *= 2;
+      t[3][1] *= 2;
+    }
+}
+
+void
+test (int n)
+{
+  int c[2] = { 0, 0 };
+  int p[3] = { 0, 1, 4 };
+  int q[4][2] = { { 5, 6 }, { 0, 0 }, { 0, 0 }, { 7, 8 } };
+  long long ss[4] = { 5, 1, 1, 6 };
+  long long tt[5][2] = { { 9, 10 }, { 11, 12 }, { 1, 1 }, { 1, 1 }, { 13, 14 } };
+  long long int d[] = { 1, 1 };
+  int m[3] = { 5, 0, 5 };
+  int r[5] = { 6, 7, 0, 0, 9 };
+  int o[4] = { 1, 0, 0, 2 };
+  s = ss;
+  t = tt;
+  #pragma omp parallel num_threads(4)
+  {
+    int i;
+    #pragma omp for reduction (task, +: a, c) reduction (task, *: b[2 * n:3 * n], d) \
+		    reduction (task, +: e[1], f[2:2], g[n:n*2], h[0], k[1:2][0:2]) \
+		    reduction (task, +: o[n:n*2], m[1], q[1:2][:], p[0], r[2:2]) \
+		    reduction (task, *: t[2:2][:], s[1:n + 1]) \
+		    schedule(nonmonotonic: runtime)
+    for (i = 0; i < 4; i++)
+      {
+	#pragma omp task in_reduction (+: a, c) in_reduction (*: b[2 * n:3 * n], d) \
+			 in_reduction (+: o[n:n*2], q[1:2][:], p[0], m[1], r[2:2]) \
+			 in_reduction (+: g[n:n * 2], e[1], k[1:2][:], h[0], f[2:2]) \
+			 in_reduction (*: s[1:2], t[2:2][:])
+	{
+	  int j;
+	  a[0] += 2;
+	  a[1] += 3;
+	  b[2] *= 2;
+	  f[3] += 8;
+	  g[1] += 9;
+	  g[2] += 10;
+	  h[0] += 11;
+	  k[1][1] += 13;
+	  k[2][1] += 15;
+	  m[1] += 16;
+	  r[2] += 8;
+	  s[1] *= 2;
+	  t[2][1] *= 2;
+	  t[3][1] *= 2;
+	  for (j = 0; j < 2; j++)
+	    #pragma omp task in_reduction (+: a, c[:2]) \
+			     in_reduction (*: b[2 * n:3 * n], d[n - 1:n + 1]) \
+			     in_reduction (+: e[1], f[2:2], g[n:n*2], h[0], k[1:2][:2]) \
+			     in_reduction (+: m[1], r[2:2], o[n:n*2], p[0], q[1:2][:2]) \
+			     in_reduction (*: s[n:2], t[2:2][:])
+	    {
+	      m[1] += 6;
+	      r[2] += 7;
+	      q[1][0] += 17;
+	      q[2][0] += 19;
+	      a[0] += 4;
+	      a[1] += 5;
+	      b[3] *= 2;
+	      b[4] *= 2;
+	      f[3] += 18;
+	      g[1] += 29;
+	      g[2] += 18;
+	      h[0] += 19;
+	      s[2] *= 2;
+	      t[2][0] *= 2;
+	      t[3][0] *= 2;
+	      foo (n, c, d, m, r, o, p, q);
+	      r[3] += 18;
+	      o[1] += 29;
+	      o[2] += 18;
+	      p[0] += 19;
+	      c[0] += 4;
+	      c[1] += 5;
+	      d[0] *= 2;
+	      e[1] += 6;
+	      f[2] += 7;
+	      k[1][0] += 17;
+	      k[2][0] += 19;
+	    }
+	  r[3] += 8;
+	  o[1] += 9;
+	  o[2] += 10;
+	  p[0] += 11;
+	  q[1][1] += 13;
+	  q[2][1] += 15;
+	  b[3] *= 2;
+	  c[0] += 4;
+	  c[1] += 9;
+	  d[0] *= 2;
+	  e[1] += 16;
+	  f[2] += 8;
+	}
+      }
+  }
+  if (a[0] != 7 * 16 + 4 * 8 + 2 * 4
+      || a[1] != 17 * 16 + 5 * 8 + 3 * 4
+      || b[0] != 9 || b[1] != 11
+      || b[2] != 1LL << (16 + 4)
+      || b[3] != 1LL << (8 + 4)
+      || b[4] != 1LL << (16 + 8)
+      || b[5] != 13 || b[6] != 15
+      || c[0] != 6 * 16 + 4 * 8 + 4 * 4
+      || c[1] != 5 * 8 + 9 * 4
+      || d[0] != 1LL << (8 + 4)
+      || d[1] != 1LL << 16
+      || e[0] != 5
+      || e[1] != 19 * 16 + 6 * 8 + 16 * 4
+      || e[2] != 5
+      || f[0] != 6
+      || f[1] != 7
+      || f[2] != 21 * 16 + 7 * 8 + 8 * 4
+      || f[3] != 23 * 16 + 18 * 8 + 8 * 4
+      || f[4] != 9
+      || g[0] != 1
+      || g[1] != 25 * 16 + 29 * 8 + 9 * 4
+      || g[2] != 27 * 16 + 18 * 8 + 10 * 4
+      || g[3] != 2
+      || h[0] != 29 * 16 + 19 * 8 + 11 * 4
+      || h[1] != 1 || h[2] != 4
+      || k[0][0] != 5 || k[0][1] != 6
+      || k[1][0] != 31 * 16 + 17 * 8
+      || k[1][1] != 13 * 4
+      || k[2][0] != 19 * 8
+      || k[2][1] != 33 * 16 + 15 * 4
+      || k[3][0] != 7 || k[3][1] != 8
+      || m[0] != 5
+      || m[1] != 19 * 16 + 6 * 8 + 16 * 4
+      || m[2] != 5
+      || o[0] != 1
+      || o[1] != 25 * 16 + 29 * 8 + 9 * 4
+      || o[2] != 27 * 16 + 18 * 8 + 10 * 4
+      || o[3] != 2
+      || p[0] != 29 * 16 + 19 * 8 + 11 * 4
+      || p[1] != 1 || p[2] != 4
+      || q[0][0] != 5 || q[0][1] != 6
+      || q[1][0] != 31 * 16 + 17 * 8
+      || q[1][1] != 13 * 4
+      || q[2][0] != 19 * 8
+      || q[2][1] != 33 * 16 + 15 * 4
+      || q[3][0] != 7 || q[3][1] != 8
+      || r[0] != 6
+      || r[1] != 7
+      || r[2] != 21 * 16 + 7 * 8 + 8 * 4
+      || r[3] != 23 * 16 + 18 * 8 + 8 * 4
+      || r[4] != 9
+      || ss[0] != 5
+      || ss[1] != 1LL << (16 + 4)
+      || ss[2] != 1LL << 8
+      || ss[3] != 6
+      || tt[0][0] != 9 || tt[0][1] != 10 || tt[1][0] != 11 || tt[1][1] != 12
+      || tt[2][0] != 1LL << (16 + 8)
+      || tt[2][1] != 1LL << 4
+      || tt[3][0] != 1LL << 8
+      || tt[3][1] != 1LL << (16 + 4)
+      || tt[4][0] != 13 || tt[4][1] != 14)
+    abort ();
+}
+
+int
+main ()
+{
+  test (1);
+  return 0;
+}
diff --git a/libgomp/work.c b/libgomp/work.c
index ac2f0233120f..16fc7076eddf 100644
--- a/libgomp/work.c
+++ b/libgomp/work.c
@@ -98,30 +98,35 @@ alloc_work_share (struct gomp_team *team)
    This shouldn't touch the next_alloc field.  */
 
 void
-gomp_init_work_share (struct gomp_work_share *ws, bool ordered,
+gomp_init_work_share (struct gomp_work_share *ws, size_t ordered,
 		      unsigned nthreads)
 {
   gomp_mutex_init (&ws->lock);
   if (__builtin_expect (ordered, 0))
     {
-#define INLINE_ORDERED_TEAM_IDS_CNT \
-  ((sizeof (struct gomp_work_share) \
-    - offsetof (struct gomp_work_share, inline_ordered_team_ids)) \
-   / sizeof (((struct gomp_work_share *) 0)->inline_ordered_team_ids[0]))
-
-      if (nthreads > INLINE_ORDERED_TEAM_IDS_CNT)
-	ws->ordered_team_ids
-	  = gomp_malloc (nthreads * sizeof (*ws->ordered_team_ids));
+#define INLINE_ORDERED_TEAM_IDS_SIZE \
+  (sizeof (struct gomp_work_share) \
+   - offsetof (struct gomp_work_share, inline_ordered_team_ids))
+
+      if (__builtin_expect (ordered != 1, 0))
+	{
+	  ordered += nthreads * sizeof (*ws->ordered_team_ids) - 1;
+	  ordered = ordered + __alignof__ (long long) - 1;
+	  ordered &= ~(__alignof__ (long long) - 1);
+	}
+      else
+	ordered = nthreads * sizeof (*ws->ordered_team_ids);
+      if (ordered > INLINE_ORDERED_TEAM_IDS_SIZE)
+	ws->ordered_team_ids = gomp_malloc (ordered);
       else
	ws->ordered_team_ids = ws->inline_ordered_team_ids;
-      memset (ws->ordered_team_ids, '\0',
-	      nthreads * sizeof (*ws->ordered_team_ids));
+      memset (ws->ordered_team_ids, '\0', ordered);
       ws->ordered_num_used = 0;
       ws->ordered_owner = -1;
       ws->ordered_cur = 0;
     }
   else
-    ws->ordered_team_ids = NULL;
+    ws->ordered_team_ids = ws->inline_ordered_team_ids;
   gomp_ptrlock_init (&ws->next_ws, NULL);
   ws->threads_completed = 0;
 }
@@ -174,7 +179,7 @@ free_work_share (struct gomp_team *team, struct gomp_work_share *ws)
    if this was the first thread to reach this point.  */
 
 bool
-gomp_work_share_start (bool ordered)
+gomp_work_share_start (size_t ordered)
 {
   struct gomp_thread *thr = gomp_thread ();
   struct gomp_team *team = thr->ts.team;
@@ -186,7 +191,7 @@ gomp_work_share_start (bool ordered)
       ws = gomp_malloc (sizeof (*ws));
       gomp_init_work_share (ws, ordered, 1);
       thr->ts.work_share = ws;
-      return ws;
+      return true;
     }
 
   ws = thr->ts.work_share;