/* Copyright (C) 2005-2024 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This file contains routines for managing work-share iteration, both
   for loops and sections.  */

#include "libgomp.h"
#include <stdlib.h>


/* This function implements the STATIC scheduling method.  The caller should
   iterate *pstart <= x < *pend.  Return zero if there are more iterations
   to perform; nonzero if not.  Return less than 0 if this thread had
   received the absolutely last iteration.  */
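/* Illustrative caller sketch (hypothetical; the real callers live elsewhere
   in libgomp, e.g. loop.c), shown for a loop with a positive increment,
   where incr and body stand in for the loop's increment and body:

       long s, e;
       while (gomp_iter_static_next (&s, &e) == 0)
         for (long x = s; x < e; x += incr)
           body (x);  */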
int
gomp_iter_static_next (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned long nthreads = team ? team->nthreads : 1;

  if (thr->ts.static_trip == -1)
    return -1;

  /* Quick test for degenerate teams and orphaned constructs.  */
  if (nthreads == 1)
    {
      *pstart = ws->next;
      *pend = ws->end;
      thr->ts.static_trip = -1;
      return ws->next == ws->end;
    }

  /* We interpret chunk_size zero as "unspecified", which means that we
     should break up the iterations such that each thread makes only one
     trip through the outer loop.  */
  if (ws->chunk_size == 0)
    {
      unsigned long n, q, i, t;
      unsigned long s0, e0;
      long s, e;

      if (thr->ts.static_trip > 0)
        return 1;

      /* Compute the total number of iterations.  */
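      /* The bias s below turns the division into a round-up: n becomes
         (end - next) / incr rounded away from zero, valid for either sign
         of the increment.  For example, next = 0, end = 10, incr = 3 gives
         n = (10 + 2) / 3 = 4, matching the iterations 0, 3, 6, 9.  */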
      s = ws->incr + (ws->incr > 0 ? -1 : 1);
      n = (ws->end - ws->next + s) / ws->incr;
      i = thr->ts.team_id;

      /* Compute the "zero-based" start and end points.  That is, as
         if the loop began at zero and incremented by one.  */
      q = n / nthreads;
      t = n % nthreads;
      if (i < t)
        {
          t = 0;
          q++;
        }
      s0 = q * i + t;
      e0 = s0 + q;
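      /* For example, n = 10 iterations over nthreads = 4 threads yields
         q = 2, t = 2; threads 0 and 1 each take three iterations ([0,3)
         and [3,6)), threads 2 and 3 two each ([6,8) and [8,10)).  */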
      /* Notice when no iterations are allocated for this thread.  */
      if (s0 >= e0)
        {
          thr->ts.static_trip = 1;
          return 1;
        }

      /* Transform these to the actual start and end numbers.  */
      s = (long)s0 * ws->incr + ws->next;
      e = (long)e0 * ws->incr + ws->next;

      *pstart = s;
      *pend = e;
      thr->ts.static_trip = (e0 == n ? -1 : 1);
      return 0;
    }
  else
    {
      unsigned long n, s0, e0, i, c;
      long s, e;

      /* Otherwise, each thread gets exactly chunk_size iterations
         (if available) each time through the loop.  */

      s = ws->incr + (ws->incr > 0 ? -1 : 1);
      n = (ws->end - ws->next + s) / ws->incr;
      i = thr->ts.team_id;
      c = ws->chunk_size;

      /* Initial guess is a C sized chunk positioned static_trip * nthreads
         chunks in, offset by our thread number.  */
      s0 = (thr->ts.static_trip * nthreads + i) * c;
      e0 = s0 + c;
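      /* Illustrative example: with nthreads == 3 and chunk_size == 4,
         thread 1 sees zero-based chunks [4,8) on trip 0, [16,20) on
         trip 1, [28,32) on trip 2, and so on.  */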
      /* Detect overflow.  */
      if (s0 >= n)
        return 1;
      if (e0 > n)
        e0 = n;

      /* Transform these to the actual start and end numbers.  */
      s = (long)s0 * ws->incr + ws->next;
      e = (long)e0 * ws->incr + ws->next;

      *pstart = s;
      *pend = e;

      if (e0 == n)
        thr->ts.static_trip = -1;
      else
        thr->ts.static_trip++;
      return 0;
    }
}


/* This function implements the DYNAMIC scheduling method.  Arguments are
   as for gomp_iter_static_next.  This function must be called with ws->lock
   held.  */
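/* Each call simply advances ws->next by one chunk of iterations under the
   work-share lock and hands that range back to the caller; threads that
   finish early come back for more until ws->next reaches ws->end.  */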
bool
gomp_iter_dynamic_next_locked (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  long start, end, chunk, left;

  start = ws->next;
  if (start == ws->end)
    return false;

  chunk = ws->chunk_size;
  left = ws->end - start;
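  /* Clamp the chunk to the iterations that remain.  Note that for dynamic
     scheduling, ws->chunk_size is assumed here to have been pre-scaled by
     the increment when the work share was initialized (in loop.c), so with
     a negative increment both chunk and left are negative and the
     comparison direction below is reversed.  */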
  if (ws->incr < 0)
    {
      if (chunk < left)
        chunk = left;
    }
  else
    {
      if (chunk > left)
        chunk = left;
    }
  end = start + chunk;

  ws->next = end;
  *pstart = start;
  *pend = end;
  return true;
}


#ifdef HAVE_SYNC_BUILTINS
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next.  */

bool
gomp_iter_dynamic_next (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  long start, end, nend, chunk, incr;

  end = ws->end;
  incr = ws->incr;
  chunk = ws->chunk_size;

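  /* Fast path: when ws->mode is nonzero, the work-share setup has
     (presumably) determined that ws->next can be pushed past ws->end
     without overflow, so an unconditional fetch-and-add suffices; a
     thread that fetches a start at or past the end simply reports no
     work.  */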
  if (__builtin_expect (ws->mode, 1))
    {
      long tmp = __sync_fetch_and_add (&ws->next, chunk);
      if (incr > 0)
        {
          if (tmp >= end)
            return false;
          nend = tmp + chunk;
          if (nend > end)
            nend = end;
          *pstart = tmp;
          *pend = nend;
          return true;
        }
      else
        {
          if (tmp <= end)
            return false;
          nend = tmp + chunk;
          if (nend < end)
            nend = end;
          *pstart = tmp;
          *pend = nend;
          return true;
        }
    }

  start = __atomic_load_n (&ws->next, MEMMODEL_RELAXED);
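  /* Slow path: claim the next chunk with a compare-and-swap loop.  If the
     CAS fails, another thread took iterations first; retry from the value
     it installed (returned in tmp) and recompute the chunk.  */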
  while (1)
    {
      long left = end - start;
      long tmp;

      if (start == end)
        return false;

      if (incr < 0)
        {
          if (chunk < left)
            chunk = left;
        }
      else
        {
          if (chunk > left)
            chunk = left;
        }
      nend = start + chunk;

      tmp = __sync_val_compare_and_swap (&ws->next, start, nend);
      if (__builtin_expect (tmp == start, 1))
        break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */


/* This function implements the GUIDED scheduling method.  Arguments are
   as for gomp_iter_static_next.  This function must be called with the
   work share lock held.  */
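/* Each call hands out roughly (remaining iterations) / nthreads iterations,
   so chunks shrink as the loop drains but never drop below chunk_size.
   Illustrative example: with 100 iterations left, 4 threads and chunk_size
   1, successive calls take about 25, 19, 14, 11, ... iterations.  */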
bool
gomp_iter_guided_next_locked (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  unsigned long nthreads = team ? team->nthreads : 1;
  unsigned long n, q;
  long start, end;

  if (ws->next == ws->end)
    return false;

  start = ws->next;
  n = (ws->end - start) / ws->incr;
  q = (n + nthreads - 1) / nthreads;

  if (q < ws->chunk_size)
    q = ws->chunk_size;
  if (q <= n)
    end = start + q * ws->incr;
  else
    end = ws->end;

  ws->next = end;
  *pstart = start;
  *pend = end;
  return true;
}

#ifdef HAVE_SYNC_BUILTINS
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next.  */

bool
gomp_iter_guided_next (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  unsigned long nthreads = team ? team->nthreads : 1;
  long start, end, nend, incr;
  unsigned long chunk_size;

  start = __atomic_load_n (&ws->next, MEMMODEL_RELAXED);
  end = ws->end;
  incr = ws->incr;
  chunk_size = ws->chunk_size;

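  /* Lock-free variant: recompute the chunk from the freshly observed
     ws->next on every retry, since a failed compare-and-swap below means
     another thread already claimed part of the remaining range.  */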
  while (1)
    {
      unsigned long n, q;
      long tmp;

      if (start == end)
        return false;

      n = (end - start) / incr;
      q = (n + nthreads - 1) / nthreads;

      if (q < chunk_size)
        q = chunk_size;
      if (__builtin_expect (q <= n, 1))
        nend = start + q * incr;
      else
        nend = end;

      tmp = __sync_val_compare_and_swap (&ws->next, start, nend);
      if (__builtin_expect (tmp == start, 1))
        break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */