/* Copyright (C) 2005, 2008, 2009 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU OpenMP Library (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This file contains routines for managing work-share iteration, both
   for loops and sections.  */

#include "libgomp.h"
#include <stdlib.h>

typedef unsigned long long gomp_ull;

/* This function implements the STATIC scheduling method.  The caller should
   iterate *pstart <= x < *pend.  Return zero if there are more iterations
   to perform; nonzero if not.  Return less than 0 if this thread had
   received the absolutely last iteration.  */

int
gomp_iter_ull_static_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned long nthreads = team ? team->nthreads : 1;

  if (thr->ts.static_trip == -1)
    return -1;

  /* Quick test for degenerate teams and orphaned constructs.  */
  if (nthreads == 1)
    {
      *pstart = ws->next_ull;
      *pend = ws->end_ull;
      thr->ts.static_trip = -1;
      return ws->next_ull == ws->end_ull;
    }

  /* We interpret chunk_size zero as "unspecified", which means that we
     should break up the iterations such that each thread makes only one
     trip through the outer loop.  */
  if (ws->chunk_size_ull == 0)
    {
      gomp_ull n, q, i, s0, e0, s, e;

      if (thr->ts.static_trip > 0)
        return 1;

      /* Compute the total number of iterations.  */
      if (__builtin_expect (ws->mode, 0) == 0)
        n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
      else
        n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
      i = thr->ts.team_id;

      /* Compute the "zero-based" start and end points.  That is, as
         if the loop began at zero and incremented by one.  */
      q = n / nthreads;
      q += (q * nthreads != n);
      s0 = q * i;
      e0 = s0 + q;
      if (e0 > n)
        e0 = n;
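
      /* For example, with n = 10 total iterations and nthreads = 4,
         q becomes 3 and the zero-based ranges handed out are [0,3),
         [3,6), [6,9) and the short tail [9,10) for threads 0..3.  */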

      /* Notice when no iterations allocated for this thread.  */
      if (s0 >= e0)
        {
          thr->ts.static_trip = 1;
          return 1;
        }

      /* Transform these to the actual start and end numbers.  */
      s = s0 * ws->incr_ull + ws->next_ull;
      e = e0 * ws->incr_ull + ws->next_ull;

      *pstart = s;
      *pend = e;
      thr->ts.static_trip = (e0 == n ? -1 : 1);
      return 0;
    }
  else
    {
      gomp_ull n, s0, e0, i, c, s, e;

      /* Otherwise, each thread gets exactly chunk_size iterations
         (if available) each time through the loop.  */

      if (__builtin_expect (ws->mode, 0) == 0)
        n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
      else
        n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
      i = thr->ts.team_id;
      c = ws->chunk_size_ull;

      /* Initial guess is a C sized chunk positioned nthreads iterations
         in, offset by our thread number.  */
      s0 = (thr->ts.static_trip * (gomp_ull) nthreads + i) * c;
      e0 = s0 + c;
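
      /* For example, with chunk size c = 4, nthreads = 3 and team_id i = 1,
         successive trips claim the zero-based chunks [4,8), [16,20),
         [28,32), and so on.  */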

      /* Detect overflow.  */
      if (s0 >= n)
        return 1;
      if (e0 > n)
        e0 = n;

      /* Transform these to the actual start and end numbers.  */
      s = s0 * ws->incr_ull + ws->next_ull;
      e = e0 * ws->incr_ull + ws->next_ull;

      *pstart = s;
      *pend = e;

      if (e0 == n)
        thr->ts.static_trip = -1;
      else
        thr->ts.static_trip++;
      return 0;
    }
}
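
/* Illustrative sketch (hypothetical caller, not part of this file): per the
   contract documented above, a worker could drive the static schedule like
   this for an upward loop, where run_body is a placeholder for the loop
   body:

     gomp_ull start, end, i;
     gomp_ull incr = gomp_thread ()->ts.work_share->incr_ull;
     while (gomp_iter_ull_static_next (&start, &end) == 0)
       for (i = start; i < end; i += incr)
         run_body (i);
*/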


/* This function implements the DYNAMIC scheduling method.  Arguments are
   as for gomp_iter_ull_static_next.  This function must be called with
   ws->lock held.  */

bool
gomp_iter_ull_dynamic_next_locked (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  gomp_ull start, end, chunk, left;

  start = ws->next_ull;
  if (start == ws->end_ull)
    return false;

  chunk = ws->chunk_size_ull;
  left = ws->end_ull - start;
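  /* Clamp the chunk to the work that remains.  Bit 2 of ws->mode appears
     to mark a loop that counts downward, in which case chunk and left are
     negative steps in wrapped unsigned arithmetic and the sense of the
     comparison is reversed.  For an upward loop, e.g. start = 90,
     end = 100 and chunk = 16 gives left = 10, so the caller receives
     [90, 100).  */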
  if (__builtin_expect (ws->mode & 2, 0))
    {
      if (chunk < left)
        chunk = left;
    }
  else
    {
      if (chunk > left)
        chunk = left;
    }
  end = start + chunk;

  ws->next_ull = end;
  *pstart = start;
  *pend = end;
  return true;
}


#if defined HAVE_SYNC_BUILTINS && defined __LP64__
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next_ull.  */

bool
gomp_iter_ull_dynamic_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  gomp_ull start, end, nend, chunk;

  end = ws->end_ull;
  chunk = ws->chunk_size_ull;

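  /* Bit 1 of ws->mode appears to be set when a plain atomic fetch-and-add
     on ws->next_ull cannot overflow, so each thread can claim a chunk with
     a single atomic addition.  For example, with chunk = 8, next_ull
     starting at 0 and at least 24 iterations remaining, three racing
     threads obtain tmp = 0, 8 and 16 and hand back the ranges [0,8),
     [8,16) and [16,24).  */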
  if (__builtin_expect (ws->mode & 1, 1))
    {
      gomp_ull tmp = __sync_fetch_and_add (&ws->next_ull, chunk);
      if (__builtin_expect (ws->mode & 2, 0) == 0)
        {
          if (tmp >= end)
            return false;
          nend = tmp + chunk;
          if (nend > end)
            nend = end;
          *pstart = tmp;
          *pend = nend;
          return true;
        }
      else
        {
          if (tmp <= end)
            return false;
          nend = tmp + chunk;
          if (nend < end)
            nend = end;
          *pstart = tmp;
          *pend = nend;
          return true;
        }
    }

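  /* Slow path: claim a chunk with a compare-and-swap retry loop.  If two
     threads both read the same start and compute the same nend, only one
     CAS succeeds; the loser sees the winner's value in tmp, adopts it as
     its new start and recomputes its chunk from what is actually left.  */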
  start = ws->next_ull;
  while (1)
    {
      gomp_ull left = end - start;
      gomp_ull tmp;

      if (start == end)
        return false;

      if (__builtin_expect (ws->mode & 2, 0))
        {
          if (chunk < left)
            chunk = left;
        }
      else
        {
          if (chunk > left)
            chunk = left;
        }
      nend = start + chunk;

      tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
      if (__builtin_expect (tmp == start, 1))
        break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */


/* This function implements the GUIDED scheduling method.  Arguments are
   as for gomp_iter_ull_static_next.  This function must be called with the
   work share lock held.  */

bool
gomp_iter_ull_guided_next_locked (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  gomp_ull nthreads = team ? team->nthreads : 1;
  gomp_ull n, q;
  gomp_ull start, end;

  if (ws->next_ull == ws->end_ull)
    return false;

  start = ws->next_ull;
  if (__builtin_expect (ws->mode, 0) == 0)
    n = (ws->end_ull - start) / ws->incr_ull;
  else
    n = (start - ws->end_ull) / -ws->incr_ull;
  q = (n + nthreads - 1) / nthreads;

  if (q < ws->chunk_size_ull)
    q = ws->chunk_size_ull;
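  /* Each call hands out roughly a 1/nthreads share of what remains, never
     less than chunk_size.  For example, with incr = 1, 100 iterations
     remaining, nthreads = 4 and chunk_size = 5, successive calls produce
     chunks of 25, 19, 14, 11, ... shrinking toward the chunk_size floor.  */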
  if (q <= n)
    end = start + q * ws->incr_ull;
  else
    end = ws->end_ull;

  ws->next_ull = end;
  *pstart = start;
  *pend = end;
  return true;
}

#if defined HAVE_SYNC_BUILTINS && defined __LP64__
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next_ull.  */

bool
gomp_iter_ull_guided_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  gomp_ull nthreads = team ? team->nthreads : 1;
  gomp_ull start, end, nend, incr;
  gomp_ull chunk_size;

  start = ws->next_ull;
  end = ws->end_ull;
  incr = ws->incr_ull;
  chunk_size = ws->chunk_size_ull;

  while (1)
    {
      gomp_ull n, q;
      gomp_ull tmp;

      if (start == end)
        return false;

      if (__builtin_expect (ws->mode, 0) == 0)
        n = (end - start) / incr;
      else
        n = (start - end) / -incr;
      q = (n + nthreads - 1) / nthreads;

      if (q < chunk_size)
        q = chunk_size;
      if (__builtin_expect (q <= n, 1))
        nend = start + q * incr;
      else
        nend = end;

      tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
      if (__builtin_expect (tmp == start, 1))
        break;

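      /* The CAS lost a race: ws->next_ull had already been advanced by
         another thread.  Retry from the value it now holds, which also
         recomputes the guided chunk from the smaller remaining count.  */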
      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */