/* Copyright (C) 2005-2021 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This file handles the ORDERED construct.  */

#include "libgomp.h"
#include <stdarg.h>
#include <string.h>
#include "doacross.h"

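/* For orientation, a rough (non-authoritative) sketch of the construct
   served by this file; the helper names compute/emit are purely
   illustrative:

       #pragma omp for ordered schedule(dynamic)
       for (i = 0; i < n; i++)
         {
           compute (i);
       #pragma omp ordered
           emit (i);             // must execute in iteration order
         }

   is, roughly speaking, lowered so that the body of the "ordered" region
   is bracketed by calls to GOMP_ordered_start () and GOMP_ordered_end (),
   while the iteration-allocation code uses the gomp_ordered_* helpers
   below to keep a queue of threads in iteration order.  */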

/* This function is called when first allocating an iteration block.  That
   is, the thread is not currently on the queue.  The work-share lock must
   be held on entry.  */

void
gomp_ordered_first (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned index;

  /* Work share constructs can be orphaned.  */
  if (team == NULL || team->nthreads == 1)
    return;

  index = ws->ordered_cur + ws->ordered_num_used;
  if (index >= team->nthreads)
    index -= team->nthreads;
  ws->ordered_team_ids[index] = thr->ts.team_id;

  /* If this is the first and only thread in the queue, then there is
     no one to release us when we get to our ordered section.  Post to
     our own release queue now so that we won't block later.  */
  if (ws->ordered_num_used++ == 0)
    gomp_sem_post (team->ordered_release[thr->ts.team_id]);
}

/* This function is called when completing the last iteration block.  That
   is, there are no more iterations to perform and so the thread should be
   removed from the queue entirely.  Because of the way ORDERED blocks are
   managed, it follows that we currently own access to the ORDERED block,
   and should now pass it on to the next thread.  The work-share lock must
   be held on entry.  */

void
gomp_ordered_last (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned next_id;

  /* Work share constructs can be orphaned.  */
  if (team == NULL || team->nthreads == 1)
    return;

  /* We're no longer the owner.  */
  ws->ordered_owner = -1;

  /* If we're not the last thread in the queue, then wake the next.  */
  if (--ws->ordered_num_used > 0)
    {
      unsigned next = ws->ordered_cur + 1;
      if (next == team->nthreads)
        next = 0;
      ws->ordered_cur = next;

      next_id = ws->ordered_team_ids[next];
      gomp_sem_post (team->ordered_release[next_id]);
    }
}


/* This function is called when allocating a subsequent allocation block.
   That is, we're done with the current iteration block and we're allocating
   another.  This is the logical combination of a call to gomp_ordered_last
   followed by a call to gomp_ordered_first.  The work-share lock must be
   held on entry.  */

void
gomp_ordered_next (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned index, next_id;

  /* Work share constructs can be orphaned.  */
  if (team == NULL || team->nthreads == 1)
    return;

  /* We're no longer the owner.  */
  ws->ordered_owner = -1;

  /* If there's only one thread in the queue, that must be us.  */
  if (ws->ordered_num_used == 1)
    {
      /* We have a similar situation as in gomp_ordered_first
         where we need to post to our own release semaphore.  */
      gomp_sem_post (team->ordered_release[thr->ts.team_id]);
      return;
    }

  /* If the queue is entirely full, then we move ourselves to the end of
     the queue merely by incrementing ordered_cur.  Only if it's not
     full do we have to write our id.  */
  if (ws->ordered_num_used < team->nthreads)
    {
      index = ws->ordered_cur + ws->ordered_num_used;
      if (index >= team->nthreads)
        index -= team->nthreads;
      ws->ordered_team_ids[index] = thr->ts.team_id;
    }

  index = ws->ordered_cur + 1;
  if (index == team->nthreads)
    index = 0;
  ws->ordered_cur = index;

  next_id = ws->ordered_team_ids[index];
  gomp_sem_post (team->ordered_release[next_id]);
}
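
/* A worked example of the queue bookkeeping above (a sketch, not a trace
   of any particular run), assuming a team of four threads:

       ordered_team_ids   { 2, 0, 3, 1 }
       ordered_cur        0
       ordered_num_used   4

   Thread 2 is at the head and owns (or will next own) the ordered section.
   When it calls gomp_ordered_next, ordered_cur advances to 1 and thread 0's
   release semaphore is posted; because the queue is full, thread 2 is
   requeued simply by that advance, since its id is still stored at index 0,
   which has become the logical tail.  */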


/* This function is called when a statically scheduled loop is first
   being created.  */

void
gomp_ordered_static_init (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;

  if (team == NULL || team->nthreads == 1)
    return;

  gomp_sem_post (team->ordered_release[0]);
}

/* This function is called when a statically scheduled loop is moving to
   the next allocation block.  Static schedules are not first come first
   served like the others, so we're to move to the numerically next thread,
   not the next thread on a list.  The work-share lock should *not* be held
   on entry.  */

void
gomp_ordered_static_next (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned id = thr->ts.team_id;

  if (team == NULL || team->nthreads == 1)
    return;

  ws->ordered_owner = -1;

  /* This thread currently owns the lock.  Increment the owner.  */
  if (++id == team->nthreads)
    id = 0;
  ws->ordered_team_ids[0] = id;
  gomp_sem_post (team->ordered_release[id]);
}

/* This function is called when we need to assert that the thread owns the
   ordered section.  Due to the problem of posted-but-not-waited semaphores,
   this needs to happen before completing a loop iteration.  */

void
gomp_ordered_sync (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;

  /* Work share constructs can be orphaned.  But this clearly means that
     we are the only thread, and so we automatically own the section.  */
  if (team == NULL || team->nthreads == 1)
    return;

  /* ??? I believe it to be safe to access this data without taking the
     ws->lock.  The only presumed race condition is with the previous
     thread on the queue incrementing ordered_cur such that it points
     to us, concurrently with our check below.  But our team_id is
     already present in the queue, and the other thread will always
     post to our release semaphore.  So the two cases are that we will
     either win the race and momentarily block on the semaphore, or lose
     the race and find the semaphore already unlocked and so not block.
     Either way we get correct results.
     However, there is an implicit flush on entry to an ordered region,
     so we do need to have a barrier here.  If we were taking a lock
     this could be MEMMODEL_RELEASE since the acquire would be covered
     by the lock.  */

  __atomic_thread_fence (MEMMODEL_ACQ_REL);
  if (ws->ordered_owner != thr->ts.team_id)
    {
      gomp_sem_wait (team->ordered_release[thr->ts.team_id]);
      ws->ordered_owner = thr->ts.team_id;
    }
}

/* This function is called by user code when encountering the start of an
   ORDERED block.  We must check to see if the current thread is at the
   head of the queue, and if not, block.  */

#ifdef HAVE_ATTRIBUTE_ALIAS
extern void GOMP_ordered_start (void)
        __attribute__((alias ("gomp_ordered_sync")));
#else
void
GOMP_ordered_start (void)
{
  gomp_ordered_sync ();
}
#endif

/* This function is called by user code when encountering the end of an
   ORDERED block.  With the current ORDERED implementation there's nothing
   for us to do.

   However, the current implementation has a flaw in that it does not allow
   the next thread into the ORDERED section immediately after the current
   thread exits the ORDERED section in its last iteration.  The existence
   of this function allows the implementation to change.  */

void
GOMP_ordered_end (void)
{
}

/* DOACROSS initialization.  */

#define MAX_COLLAPSED_BITS (__SIZEOF_LONG__ * __CHAR_BIT__)

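/* For orientation, a rough (non-authoritative) sketch of the source-level
   construct served by the doacross code below; the array, bounds and the
   exact lowering shown here are illustrative only:

       #pragma omp for ordered(2)
       for (i = 1; i < n; i++)
         for (j = 1; j < m; j++)
           {
       #pragma omp ordered depend(sink: i - 1, j) depend(sink: i, j - 1)
             a[i][j] = f (a[i - 1][j], a[i][j - 1]);
       #pragma omp ordered depend(source)
           }

   Each depend(sink: ...) clause corresponds to a GOMP_doacross_wait call
   and each depend(source) to a GOMP_doacross_post call, while the
   iteration counts of the ordered loops are recorded up front via
   gomp_doacross_init (or the _ull variant).  */
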
void
gomp_doacross_init (unsigned ncounts, long *counts, long chunk_size,
                    size_t extra)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned int i, bits[MAX_COLLAPSED_BITS], num_bits = 0;
  unsigned long ent, num_ents, elt_sz, shift_sz;
  struct gomp_doacross_work_share *doacross;

  if (team == NULL || team->nthreads == 1)
    {
    empty:
      if (!extra)
        ws->doacross = NULL;
      else
        {
          doacross = gomp_malloc_cleared (sizeof (*doacross) + extra);
          doacross->extra = (void *) (doacross + 1);
          ws->doacross = doacross;
        }
      return;
    }

  for (i = 0; i < ncounts; i++)
    {
      /* If any count is 0, GOMP_doacross_{post,wait} can't be called.  */
      if (counts[i] == 0)
        goto empty;

      if (num_bits <= MAX_COLLAPSED_BITS)
        {
          unsigned int this_bits;
          if (counts[i] == 1)
            this_bits = 1;
          else
            this_bits = __SIZEOF_LONG__ * __CHAR_BIT__
                        - __builtin_clzl (counts[i] - 1);
          if (num_bits + this_bits <= MAX_COLLAPSED_BITS)
            {
              bits[i] = this_bits;
              num_bits += this_bits;
            }
          else
            num_bits = MAX_COLLAPSED_BITS + 1;
        }
    }

  if (ws->sched == GFS_STATIC)
    num_ents = team->nthreads;
  else if (ws->sched == GFS_GUIDED)
    num_ents = counts[0];
  else
    num_ents = (counts[0] - 1) / chunk_size + 1;
  if (num_bits <= MAX_COLLAPSED_BITS)
    {
      elt_sz = sizeof (unsigned long);
      shift_sz = ncounts * sizeof (unsigned int);
    }
  else
    {
      elt_sz = sizeof (unsigned long) * ncounts;
      shift_sz = 0;
    }
  elt_sz = (elt_sz + 63) & ~63UL;

  doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz
                          + shift_sz + extra);
  doacross->chunk_size = chunk_size;
  doacross->elt_sz = elt_sz;
  doacross->ncounts = ncounts;
  doacross->flattened = false;
  doacross->array = (unsigned char *)
                    ((((uintptr_t) (doacross + 1)) + 63 + shift_sz)
                     & ~(uintptr_t) 63);
  if (extra)
    {
      doacross->extra = doacross->array + num_ents * elt_sz;
      memset (doacross->extra, '\0', extra);
    }
  else
    doacross->extra = NULL;
  if (num_bits <= MAX_COLLAPSED_BITS)
    {
      unsigned int shift_count = 0;
      doacross->flattened = true;
      for (i = ncounts; i > 0; i--)
        {
          doacross->shift_counts[i - 1] = shift_count;
          shift_count += bits[i - 1];
        }
      for (ent = 0; ent < num_ents; ent++)
        *(unsigned long *) (doacross->array + ent * elt_sz) = 0;
    }
  else
    for (ent = 0; ent < num_ents; ent++)
      memset (doacross->array + ent * elt_sz, '\0',
              sizeof (unsigned long) * ncounts);
  if (ws->sched == GFS_STATIC && chunk_size == 0)
    {
      unsigned long q = counts[0] / num_ents;
      unsigned long t = counts[0] % num_ents;
      doacross->boundary = t * (q + 1);
      doacross->q = q;
      doacross->t = t;
    }
  ws->doacross = doacross;
}
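
/* A worked example of the flattened encoding set up above, with made-up
   counts: for ncounts == 2 and counts == { 1000, 50 }, bits[0] is 10
   (999 needs 10 bits) and bits[1] is 6 (49 needs 6 bits), so num_bits is
   16 <= MAX_COLLAPSED_BITS and the flattened form is used.  shift_counts
   becomes { 6, 0 }, and a completed iteration (c0, c1) is published as
   the single unsigned long

       ((unsigned long) c0 << 6 | (unsigned long) c1) + 1

   so that comparing two flattened values compares iterations
   lexicographically, while the + 1 keeps iteration (0, 0) distinct from
   the initial all-zero array contents.  */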

/* DOACROSS POST operation.  */

void
GOMP_doacross_post (long *counts)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  unsigned long ent;
  unsigned int i;

  if (__builtin_expect (doacross == NULL, 0)
      || __builtin_expect (doacross->array == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    ent = thr->ts.team_id;
  else if (ws->sched == GFS_GUIDED)
    ent = counts[0];
  else
    ent = counts[0] / doacross->chunk_size;
  unsigned long *array = (unsigned long *) (doacross->array
                                            + ent * doacross->elt_sz);

  if (__builtin_expect (doacross->flattened, 1))
    {
      unsigned long flattened
        = (unsigned long) counts[0] << doacross->shift_counts[0];

      for (i = 1; i < doacross->ncounts; i++)
        flattened |= (unsigned long) counts[i]
                     << doacross->shift_counts[i];
      flattened++;
      if (flattened == __atomic_load_n (array, MEMMODEL_ACQUIRE))
        __atomic_thread_fence (MEMMODEL_RELEASE);
      else
        __atomic_store_n (array, flattened, MEMMODEL_RELEASE);
      return;
    }

  __atomic_thread_fence (MEMMODEL_ACQUIRE);
  for (i = doacross->ncounts; i-- > 0; )
    {
      if (counts[i] + 1UL != __atomic_load_n (&array[i], MEMMODEL_RELAXED))
        __atomic_store_n (&array[i], counts[i] + 1UL, MEMMODEL_RELEASE);
    }
}

/* DOACROSS WAIT operation.  */

void
GOMP_doacross_wait (long first, ...)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  va_list ap;
  unsigned long ent;
  unsigned int i;

  if (__builtin_expect (doacross == NULL, 0)
      || __builtin_expect (doacross->array == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    {
      if (ws->chunk_size == 0)
        {
          if (first < doacross->boundary)
            ent = first / (doacross->q + 1);
          else
            ent = (first - doacross->boundary) / doacross->q
                  + doacross->t;
        }
      else
        ent = first / ws->chunk_size % thr->ts.team->nthreads;
    }
  else if (ws->sched == GFS_GUIDED)
    ent = first;
  else
    ent = first / doacross->chunk_size;
  unsigned long *array = (unsigned long *) (doacross->array
                                            + ent * doacross->elt_sz);

  if (__builtin_expect (doacross->flattened, 1))
    {
      unsigned long flattened
        = (unsigned long) first << doacross->shift_counts[0];
      unsigned long cur;

      va_start (ap, first);
      for (i = 1; i < doacross->ncounts; i++)
        flattened |= (unsigned long) va_arg (ap, long)
                     << doacross->shift_counts[i];
      cur = __atomic_load_n (array, MEMMODEL_ACQUIRE);
      if (flattened < cur)
        {
          __atomic_thread_fence (MEMMODEL_RELEASE);
          va_end (ap);
          return;
        }
      doacross_spin (array, flattened, cur);
      __atomic_thread_fence (MEMMODEL_RELEASE);
      va_end (ap);
      return;
    }

  do
    {
      va_start (ap, first);
      for (i = 0; i < doacross->ncounts; i++)
        {
          unsigned long thisv
            = (unsigned long) (i ? va_arg (ap, long) : first) + 1;
          unsigned long cur = __atomic_load_n (&array[i], MEMMODEL_RELAXED);
          if (thisv < cur)
            {
              i = doacross->ncounts;
              break;
            }
          if (thisv > cur)
            break;
        }
      va_end (ap);
      if (i == doacross->ncounts)
        break;
      cpu_relax ();
    }
  while (1);
  __sync_synchronize ();
}

typedef unsigned long long gomp_ull;

void
gomp_doacross_ull_init (unsigned ncounts, gomp_ull *counts,
                        gomp_ull chunk_size, size_t extra)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned int i, bits[MAX_COLLAPSED_BITS], num_bits = 0;
  unsigned long ent, num_ents, elt_sz, shift_sz;
  struct gomp_doacross_work_share *doacross;

  if (team == NULL || team->nthreads == 1)
    {
    empty:
      if (!extra)
        ws->doacross = NULL;
      else
        {
          doacross = gomp_malloc_cleared (sizeof (*doacross) + extra);
          doacross->extra = (void *) (doacross + 1);
          ws->doacross = doacross;
        }
      return;
    }

  for (i = 0; i < ncounts; i++)
    {
      /* If any count is 0, GOMP_doacross_{post,wait} can't be called.  */
      if (counts[i] == 0)
        goto empty;

      if (num_bits <= MAX_COLLAPSED_BITS)
        {
          unsigned int this_bits;
          if (counts[i] == 1)
            this_bits = 1;
          else
            this_bits = __SIZEOF_LONG_LONG__ * __CHAR_BIT__
                        - __builtin_clzll (counts[i] - 1);
          if (num_bits + this_bits <= MAX_COLLAPSED_BITS)
            {
              bits[i] = this_bits;
              num_bits += this_bits;
            }
          else
            num_bits = MAX_COLLAPSED_BITS + 1;
        }
    }

  if (ws->sched == GFS_STATIC)
    num_ents = team->nthreads;
  else if (ws->sched == GFS_GUIDED)
    num_ents = counts[0];
  else
    num_ents = (counts[0] - 1) / chunk_size + 1;
  if (num_bits <= MAX_COLLAPSED_BITS)
    {
      elt_sz = sizeof (unsigned long);
      shift_sz = ncounts * sizeof (unsigned int);
    }
  else
    {
      if (sizeof (gomp_ull) == sizeof (unsigned long))
        elt_sz = sizeof (gomp_ull) * ncounts;
      else if (sizeof (gomp_ull) == 2 * sizeof (unsigned long))
        elt_sz = sizeof (unsigned long) * 2 * ncounts;
      else
        abort ();
      shift_sz = 0;
    }
  elt_sz = (elt_sz + 63) & ~63UL;

  doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz
                          + shift_sz);
  doacross->chunk_size_ull = chunk_size;
  doacross->elt_sz = elt_sz;
  doacross->ncounts = ncounts;
  doacross->flattened = false;
  doacross->boundary = 0;
  doacross->array = (unsigned char *)
                    ((((uintptr_t) (doacross + 1)) + 63 + shift_sz)
                     & ~(uintptr_t) 63);
  if (extra)
    {
      doacross->extra = doacross->array + num_ents * elt_sz;
      memset (doacross->extra, '\0', extra);
    }
  else
    doacross->extra = NULL;
  if (num_bits <= MAX_COLLAPSED_BITS)
    {
      unsigned int shift_count = 0;
      doacross->flattened = true;
      for (i = ncounts; i > 0; i--)
        {
          doacross->shift_counts[i - 1] = shift_count;
          shift_count += bits[i - 1];
        }
      for (ent = 0; ent < num_ents; ent++)
        *(unsigned long *) (doacross->array + ent * elt_sz) = 0;
    }
  else
    for (ent = 0; ent < num_ents; ent++)
      memset (doacross->array + ent * elt_sz, '\0',
              sizeof (unsigned long) * ncounts);
  if (ws->sched == GFS_STATIC && chunk_size == 0)
    {
      gomp_ull q = counts[0] / num_ents;
      gomp_ull t = counts[0] % num_ents;
      doacross->boundary_ull = t * (q + 1);
      doacross->q_ull = q;
      doacross->t = t;
    }
  ws->doacross = doacross;
}

/* DOACROSS POST operation.  */

void
GOMP_doacross_ull_post (gomp_ull *counts)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  unsigned long ent;
  unsigned int i;

  if (__builtin_expect (doacross == NULL, 0)
      || __builtin_expect (doacross->array == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    ent = thr->ts.team_id;
  else if (ws->sched == GFS_GUIDED)
    ent = counts[0];
  else
    ent = counts[0] / doacross->chunk_size_ull;

  if (__builtin_expect (doacross->flattened, 1))
    {
      unsigned long *array = (unsigned long *) (doacross->array
                                                + ent * doacross->elt_sz);
      gomp_ull flattened
        = counts[0] << doacross->shift_counts[0];

      for (i = 1; i < doacross->ncounts; i++)
        flattened |= counts[i] << doacross->shift_counts[i];
      flattened++;
      if (flattened == __atomic_load_n (array, MEMMODEL_ACQUIRE))
        __atomic_thread_fence (MEMMODEL_RELEASE);
      else
        __atomic_store_n (array, flattened, MEMMODEL_RELEASE);
      return;
    }

  __atomic_thread_fence (MEMMODEL_ACQUIRE);
  if (sizeof (gomp_ull) == sizeof (unsigned long))
    {
      gomp_ull *array = (gomp_ull *) (doacross->array
                                      + ent * doacross->elt_sz);

      for (i = doacross->ncounts; i-- > 0; )
        {
          if (counts[i] + 1UL != __atomic_load_n (&array[i], MEMMODEL_RELAXED))
            __atomic_store_n (&array[i], counts[i] + 1UL, MEMMODEL_RELEASE);
        }
    }
  else
    {
      unsigned long *array = (unsigned long *) (doacross->array
                                                + ent * doacross->elt_sz);

      for (i = doacross->ncounts; i-- > 0; )
        {
          gomp_ull cull = counts[i] + 1UL;
          unsigned long c = (unsigned long) cull;
          if (c != __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED))
            __atomic_store_n (&array[2 * i + 1], c, MEMMODEL_RELEASE);
          c = cull >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2);
          if (c != __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED))
            __atomic_store_n (&array[2 * i], c, MEMMODEL_RELEASE);
        }
    }
}
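
/* Note on the non-flattened layout used above and below when gomp_ull is
   wider than unsigned long (e.g. a 64-bit long long on a 32-bit target):
   each count occupies two unsigned longs, with array[2 * i] holding the
   high half and array[2 * i + 1] the low half, so the wait side can
   compare counts high half first using only word-sized atomic loads.  */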

/* DOACROSS WAIT operation.  */

void
GOMP_doacross_ull_wait (gomp_ull first, ...)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  va_list ap;
  unsigned long ent;
  unsigned int i;

  if (__builtin_expect (doacross == NULL, 0)
      || __builtin_expect (doacross->array == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    {
      if (ws->chunk_size_ull == 0)
        {
          if (first < doacross->boundary_ull)
            ent = first / (doacross->q_ull + 1);
          else
            ent = (first - doacross->boundary_ull) / doacross->q_ull
                  + doacross->t;
        }
      else
        ent = first / ws->chunk_size_ull % thr->ts.team->nthreads;
    }
  else if (ws->sched == GFS_GUIDED)
    ent = first;
  else
    ent = first / doacross->chunk_size_ull;

  if (__builtin_expect (doacross->flattened, 1))
    {
      unsigned long *array = (unsigned long *) (doacross->array
                                                + ent * doacross->elt_sz);
      gomp_ull flattened = first << doacross->shift_counts[0];
      unsigned long cur;

      va_start (ap, first);
      for (i = 1; i < doacross->ncounts; i++)
        flattened |= va_arg (ap, gomp_ull)
                     << doacross->shift_counts[i];
      cur = __atomic_load_n (array, MEMMODEL_ACQUIRE);
      if (flattened < cur)
        {
          __atomic_thread_fence (MEMMODEL_RELEASE);
          va_end (ap);
          return;
        }
      doacross_spin (array, flattened, cur);
      __atomic_thread_fence (MEMMODEL_RELEASE);
      va_end (ap);
      return;
    }

  if (sizeof (gomp_ull) == sizeof (unsigned long))
    {
      gomp_ull *array = (gomp_ull *) (doacross->array
                                      + ent * doacross->elt_sz);
      do
        {
          va_start (ap, first);
          for (i = 0; i < doacross->ncounts; i++)
            {
              gomp_ull thisv
                = (i ? va_arg (ap, gomp_ull) : first) + 1;
              gomp_ull cur = __atomic_load_n (&array[i], MEMMODEL_RELAXED);
              if (thisv < cur)
                {
                  i = doacross->ncounts;
                  break;
                }
              if (thisv > cur)
                break;
            }
          va_end (ap);
          if (i == doacross->ncounts)
            break;
          cpu_relax ();
        }
      while (1);
    }
  else
    {
      unsigned long *array = (unsigned long *) (doacross->array
                                                + ent * doacross->elt_sz);
      do
        {
          va_start (ap, first);
          for (i = 0; i < doacross->ncounts; i++)
            {
              gomp_ull thisv
                = (i ? va_arg (ap, gomp_ull) : first) + 1;
              unsigned long t
                = thisv >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2);
              unsigned long cur
                = __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED);
              if (t < cur)
                {
                  i = doacross->ncounts;
                  break;
                }
              if (t > cur)
                break;
              t = thisv;
              cur = __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED);
              if (t < cur)
                {
                  i = doacross->ncounts;
                  break;
                }
              if (t > cur)
                break;
            }
          va_end (ap);
          if (i == doacross->ncounts)
            break;
          cpu_relax ();
        }
      while (1);
    }
  __sync_synchronize ();
}