/* Copyright (C) 2005-2024 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This file contains routines for managing work-share iteration, both
   for loops and sections.  */

#include "libgomp.h"
#include <stdlib.h>

typedef unsigned long long gomp_ull;

/* This function implements the STATIC scheduling method.  The caller should
   iterate *pstart <= x < *pend.  Return zero if there are more iterations
   to perform; nonzero if not.  Return a value less than zero if this thread
   received the very last iteration.  */

int
gomp_iter_ull_static_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned long nthreads = team ? team->nthreads : 1;

  if (thr->ts.static_trip == -1)
    return -1;

  /* Quick test for degenerate teams and orphaned constructs.  */
  if (nthreads == 1)
    {
      *pstart = ws->next_ull;
      *pend = ws->end_ull;
      thr->ts.static_trip = -1;
      return ws->next_ull == ws->end_ull;
    }

  /* We interpret chunk_size zero as "unspecified", which means that we
     should break up the iterations such that each thread makes only one
     trip through the outer loop.  */
  if (ws->chunk_size_ull == 0)
    {
      gomp_ull n, q, i, t, s0, e0, s, e;

      if (thr->ts.static_trip > 0)
        return 1;

      /* Compute the total number of iterations.  */
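      /* Note: a nonzero ws->mode here indicates a loop with a negative
         increment, where incr_ull holds the step's unsigned (wrapped)
         representation, so the trip count is computed from -incr_ull.  */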
      if (__builtin_expect (ws->mode, 0) == 0)
        n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
      else
        n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
      i = thr->ts.team_id;

      /* Compute the "zero-based" start and end points.  That is, as
         if the loop began at zero and incremented by one.  */
      q = n / nthreads;
      t = n % nthreads;
      if (i < t)
        {
          t = 0;
          q++;
        }
      s0 = q * i + t;
      e0 = s0 + q;
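      /* For example, n = 10 iterations split over nthreads = 4 gives
         q = 2 and t = 2, so threads 0-3 receive the zero-based ranges
         [0,3), [3,6), [6,8) and [8,10) respectively.  */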

      /* Notice when no iterations are allocated for this thread.  */
      if (s0 >= e0)
        {
          thr->ts.static_trip = 1;
          return 1;
        }

      /* Transform these to the actual start and end numbers.  */
      s = s0 * ws->incr_ull + ws->next_ull;
      e = e0 * ws->incr_ull + ws->next_ull;

      *pstart = s;
      *pend = e;
      thr->ts.static_trip = (e0 == n ? -1 : 1);
      return 0;
    }
  else
    {
      gomp_ull n, s0, e0, i, c, s, e;

      /* Otherwise, each thread gets exactly chunk_size iterations
         (if available) each time through the loop.  */

      if (__builtin_expect (ws->mode, 0) == 0)
        n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
      else
        n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
      i = thr->ts.team_id;
      c = ws->chunk_size_ull;

      /* Initial guess is a C sized chunk positioned nthreads iterations
         in, offset by our thread number.  */
      s0 = (thr->ts.static_trip * (gomp_ull) nthreads + i) * c;
      e0 = s0 + c;
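      /* For example, with c = 4 and nthreads = 3, thread 1 computes the
         zero-based chunk [4,8) on its first trip (static_trip == 0) and
         [16,20) on its second.  */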

      /* Detect overflow.  */
      if (s0 >= n)
        return 1;
      if (e0 > n)
        e0 = n;

      /* Transform these to the actual start and end numbers.  */
      s = s0 * ws->incr_ull + ws->next_ull;
      e = e0 * ws->incr_ull + ws->next_ull;

      *pstart = s;
      *pend = e;

      if (e0 == n)
        thr->ts.static_trip = -1;
      else
        thr->ts.static_trip++;
      return 0;
    }
}


/* This function implements the DYNAMIC scheduling method.  Arguments are
   as for gomp_iter_ull_static_next.  This function must be called with
   ws->lock held.  */

bool
gomp_iter_ull_dynamic_next_locked (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  gomp_ull start, end, chunk, left;

  start = ws->next_ull;
  if (start == ws->end_ull)
    return false;

  chunk = ws->chunk_size_ull;
  left = ws->end_ull - start;
  if (__builtin_expect (ws->mode & 2, 0))
    {
      if (chunk < left)
        chunk = left;
    }
  else
    {
      if (chunk > left)
        chunk = left;
    }
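  /* For an upward unit-stride loop, e.g. next_ull = 0, end_ull = 100 and
     chunk_size_ull = 16, successive calls return [0,16), [16,32), ... and
     finally the short tail [96,100).  The ws->mode & 2 branch above handles
     downward loops, where chunk and left are negative values stored in
     unsigned form.  */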
  end = start + chunk;

  ws->next_ull = end;
  *pstart = start;
  *pend = end;
  return true;
}


#if defined HAVE_SYNC_BUILTINS && defined __LP64__
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next_ull.  */

bool
gomp_iter_ull_dynamic_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  gomp_ull start, end, nend, chunk;

  end = ws->end_ull;
  chunk = ws->chunk_size_ull;

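  /* Fast path: when bit 0 of ws->mode is set, next_ull is advanced with a
     single fetch-and-add instead of the compare-and-swap retry loop below;
     the work-share initialization only enables this when the unconditional
     addition is known not to wrap around.  */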
  if (__builtin_expect (ws->mode & 1, 1))
    {
      gomp_ull tmp = __sync_fetch_and_add (&ws->next_ull, chunk);
      if (__builtin_expect (ws->mode & 2, 0) == 0)
        {
          if (tmp >= end)
            return false;
          nend = tmp + chunk;
          if (nend > end)
            nend = end;
          *pstart = tmp;
          *pend = nend;
          return true;
        }
      else
        {
          if (tmp <= end)
            return false;
          nend = tmp + chunk;
          if (nend < end)
            nend = end;
          *pstart = tmp;
          *pend = nend;
          return true;
        }
    }

  start = __atomic_load_n (&ws->next_ull, MEMMODEL_RELAXED);
  while (1)
    {
      gomp_ull left = end - start;
      gomp_ull tmp;

      if (start == end)
        return false;

      if (__builtin_expect (ws->mode & 2, 0))
        {
          if (chunk < left)
            chunk = left;
        }
      else
        {
          if (chunk > left)
            chunk = left;
        }
      nend = start + chunk;

      tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
      if (__builtin_expect (tmp == start, 1))
        break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */


/* This function implements the GUIDED scheduling method.  Arguments are
   as for gomp_iter_ull_static_next.  This function must be called with the
   work share lock held.  */

bool
gomp_iter_ull_guided_next_locked (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  gomp_ull nthreads = team ? team->nthreads : 1;
  gomp_ull n, q;
  gomp_ull start, end;

  if (ws->next_ull == ws->end_ull)
    return false;

  start = ws->next_ull;
  if (__builtin_expect (ws->mode, 0) == 0)
    n = (ws->end_ull - start) / ws->incr_ull;
  else
    n = (start - ws->end_ull) / -ws->incr_ull;
  q = (n + nthreads - 1) / nthreads;

  if (q < ws->chunk_size_ull)
    q = ws->chunk_size_ull;
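  /* For example, a loop of 100 iterations with incr_ull = 1, nthreads = 4
     and chunk_size_ull = 1 hands out 25, then 19, then 14, ... iterations,
     shrinking towards chunk_size_ull as the remaining work decreases.  */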
  if (q <= n)
    end = start + q * ws->incr_ull;
  else
    end = ws->end_ull;

  ws->next_ull = end;
  *pstart = start;
  *pend = end;
  return true;
}

#if defined HAVE_SYNC_BUILTINS && defined __LP64__
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next_ull.  */

bool
gomp_iter_ull_guided_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  gomp_ull nthreads = team ? team->nthreads : 1;
  gomp_ull start, end, nend, incr;
  gomp_ull chunk_size;

  start = __atomic_load_n (&ws->next_ull, MEMMODEL_RELAXED);
  end = ws->end_ull;
  incr = ws->incr_ull;
  chunk_size = ws->chunk_size_ull;

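  /* end, incr and chunk_size do not change while the work share is active,
     so they are read once up front; only next_ull is contended, and the
     compare-and-swap below re-reads it on each retry via the returned
     value.  */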
  while (1)
    {
      gomp_ull n, q;
      gomp_ull tmp;

      if (start == end)
        return false;

      if (__builtin_expect (ws->mode, 0) == 0)
        n = (end - start) / incr;
      else
        n = (start - end) / -incr;
      q = (n + nthreads - 1) / nthreads;

      if (q < chunk_size)
        q = chunk_size;
      if (__builtin_expect (q <= n, 1))
        nend = start + q * incr;
      else
        nend = end;

      tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
      if (__builtin_expect (tmp == start, 1))
        break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */