]>
Commit | Line | Data |
---|---|---|
a5544970 | 1 | /* Copyright (C) 2005-2019 Free Software Foundation, Inc. |
953ff289 DN |
2 | Contributed by Richard Henderson <rth@redhat.com>. |
3 | ||
f1f3453e TS |
4 | This file is part of the GNU Offloading and Multi Processing Library |
5 | (libgomp). | |
953ff289 DN |
6 | |
7 | Libgomp is free software; you can redistribute it and/or modify it | |
748086b7 JJ |
8 | under the terms of the GNU General Public License as published by |
9 | the Free Software Foundation; either version 3, or (at your option) | |
10 | any later version. | |
953ff289 DN |
11 | |
12 | Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY | |
13 | WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | |
748086b7 | 14 | FOR A PARTICULAR PURPOSE. See the GNU General Public License for |
953ff289 DN |
15 | more details. |
16 | ||
748086b7 JJ |
17 | Under Section 7 of GPL version 3, you are granted additional |
18 | permissions described in the GCC Runtime Library Exception, version | |
19 | 3.1, as published by the Free Software Foundation. | |
20 | ||
21 | You should have received a copy of the GNU General Public License and | |
22 | a copy of the GCC Runtime Library Exception along with this program; | |
23 | see the files COPYING3 and COPYING.RUNTIME respectively. If not, see | |
24 | <http://www.gnu.org/licenses/>. */ | |
953ff289 DN |
25 | |
26 | /* This file handles the ORDERED construct. */ | |
27 | ||
28 | #include "libgomp.h" | |
d9a6bd32 JJ |
29 | #include <stdarg.h> |
30 | #include <string.h> | |
31 | #include "doacross.h" | |
953ff289 DN |
32 | |
33 | ||
34 | /* This function is called when first allocating an iteration block. That | |
35 | is, the thread is not currently on the queue. The work-share lock must | |
36 | be held on entry. */ | |
37 | ||
38 | void | |
39 | gomp_ordered_first (void) | |
40 | { | |
41 | struct gomp_thread *thr = gomp_thread (); | |
42 | struct gomp_team *team = thr->ts.team; | |
43 | struct gomp_work_share *ws = thr->ts.work_share; | |
44 | unsigned index; | |
45 | ||
46 | /* Work share constructs can be orphaned. */ | |
47 | if (team == NULL || team->nthreads == 1) | |
48 | return; | |
49 | ||
50 | index = ws->ordered_cur + ws->ordered_num_used; | |
51 | if (index >= team->nthreads) | |
52 | index -= team->nthreads; | |
53 | ws->ordered_team_ids[index] = thr->ts.team_id; | |
54 | ||
55 | /* If this is the first and only thread in the queue, then there is | |
56 | no one to release us when we get to our ordered section. Post to | |
57 | our own release queue now so that we won't block later. */ | |
58 | if (ws->ordered_num_used++ == 0) | |
59 | gomp_sem_post (team->ordered_release[thr->ts.team_id]); | |
60 | } | |
61 | ||
62 | /* This function is called when completing the last iteration block. That | |
63 | is, there are no more iterations to perform and so the thread should be | |
64 | removed from the queue entirely. Because of the way ORDERED blocks are | |
65 | managed, it follows that we currently own access to the ORDERED block, | |
66 | and should now pass it on to the next thread. The work-share lock must | |
67 | be held on entry. */ | |
68 | ||
69 | void | |
70 | gomp_ordered_last (void) | |
71 | { | |
72 | struct gomp_thread *thr = gomp_thread (); | |
73 | struct gomp_team *team = thr->ts.team; | |
74 | struct gomp_work_share *ws = thr->ts.work_share; | |
75 | unsigned next_id; | |
76 | ||
77 | /* Work share constructs can be orphaned. */ | |
78 | if (team == NULL || team->nthreads == 1) | |
79 | return; | |
80 | ||
81 | /* We're no longer the owner. */ | |
82 | ws->ordered_owner = -1; | |
83 | ||
84 | /* If we're not the last thread in the queue, then wake the next. */ | |
85 | if (--ws->ordered_num_used > 0) | |
86 | { | |
87 | unsigned next = ws->ordered_cur + 1; | |
88 | if (next == team->nthreads) | |
89 | next = 0; | |
90 | ws->ordered_cur = next; | |
91 | ||
92 | next_id = ws->ordered_team_ids[next]; | |
93 | gomp_sem_post (team->ordered_release[next_id]); | |
94 | } | |
95 | } | |
96 | ||
97 | ||
98 | /* This function is called when allocating a subsequent allocation block. | |
99 | That is, we're done with the current iteration block and we're allocating | |
100 | another. This is the logical combination of a call to gomp_ordered_last | |
101 | followed by a call to gomp_ordered_first. The work-share lock must be | |
102 | held on entry. */ | |
103 | ||
104 | void | |
105 | gomp_ordered_next (void) | |
106 | { | |
107 | struct gomp_thread *thr = gomp_thread (); | |
108 | struct gomp_team *team = thr->ts.team; | |
109 | struct gomp_work_share *ws = thr->ts.work_share; | |
110 | unsigned index, next_id; | |
111 | ||
112 | /* Work share constructs can be orphaned. */ | |
113 | if (team == NULL || team->nthreads == 1) | |
114 | return; | |
115 | ||
116 | /* We're no longer the owner. */ | |
117 | ws->ordered_owner = -1; | |
118 | ||
119 | /* If there's only one thread in the queue, that must be us. */ | |
120 | if (ws->ordered_num_used == 1) | |
121 | { | |
122 | /* We have a similar situation as in gomp_ordered_first | |
123 | where we need to post to our own release semaphore. */ | |
124 | gomp_sem_post (team->ordered_release[thr->ts.team_id]); | |
125 | return; | |
126 | } | |
127 | ||
128 | /* If the queue is entirely full, then we move ourself to the end of | |
129 | the queue merely by incrementing ordered_cur. Only if it's not | |
130 | full do we have to write our id. */ | |
131 | if (ws->ordered_num_used < team->nthreads) | |
132 | { | |
133 | index = ws->ordered_cur + ws->ordered_num_used; | |
134 | if (index >= team->nthreads) | |
135 | index -= team->nthreads; | |
136 | ws->ordered_team_ids[index] = thr->ts.team_id; | |
137 | } | |
138 | ||
139 | index = ws->ordered_cur + 1; | |
140 | if (index == team->nthreads) | |
141 | index = 0; | |
142 | ws->ordered_cur = index; | |
143 | ||
144 | next_id = ws->ordered_team_ids[index]; | |
145 | gomp_sem_post (team->ordered_release[next_id]); | |
146 | } | |
147 | ||
148 | ||
149 | /* This function is called when a statically scheduled loop is first | |
150 | being created. */ | |
151 | ||
152 | void | |
153 | gomp_ordered_static_init (void) | |
154 | { | |
155 | struct gomp_thread *thr = gomp_thread (); | |
156 | struct gomp_team *team = thr->ts.team; | |
157 | ||
158 | if (team == NULL || team->nthreads == 1) | |
159 | return; | |
160 | ||
161 | gomp_sem_post (team->ordered_release[0]); | |
162 | } | |
163 | ||
164 | /* This function is called when a statically scheduled loop is moving to | |
165 | the next allocation block. Static schedules are not first come first | |
166 | served like the others, so we're to move to the numerically next thread, | |
167 | not the next thread on a list. The work-share lock should *not* be held | |
168 | on entry. */ | |
169 | ||
170 | void | |
171 | gomp_ordered_static_next (void) | |
172 | { | |
173 | struct gomp_thread *thr = gomp_thread (); | |
174 | struct gomp_team *team = thr->ts.team; | |
175 | struct gomp_work_share *ws = thr->ts.work_share; | |
176 | unsigned id = thr->ts.team_id; | |
177 | ||
178 | if (team == NULL || team->nthreads == 1) | |
179 | return; | |
180 | ||
181 | ws->ordered_owner = -1; | |
182 | ||
183 | /* This thread currently owns the lock. Increment the owner. */ | |
184 | if (++id == team->nthreads) | |
185 | id = 0; | |
186 | ws->ordered_team_ids[0] = id; | |
187 | gomp_sem_post (team->ordered_release[id]); | |
188 | } | |
189 | ||
190 | /* This function is called when we need to assert that the thread owns the | |
191 | ordered section. Due to the problem of posted-but-not-waited semaphores, | |
192 | this needs to happen before completing a loop iteration. */ | |
193 | ||
194 | void | |
195 | gomp_ordered_sync (void) | |
196 | { | |
197 | struct gomp_thread *thr = gomp_thread (); | |
198 | struct gomp_team *team = thr->ts.team; | |
199 | struct gomp_work_share *ws = thr->ts.work_share; | |
200 | ||
201 | /* Work share constructs can be orphaned. But this clearly means that | |
202 | we are the only thread, and so we automatically own the section. */ | |
203 | if (team == NULL || team->nthreads == 1) | |
204 | return; | |
205 | ||
206 | /* ??? I believe it to be safe to access this data without taking the | |
207 | ws->lock. The only presumed race condition is with the previous | |
208 | thread on the queue incrementing ordered_cur such that it points | |
209 | to us, concurrently with our check below. But our team_id is | |
210 | already present in the queue, and the other thread will always | |
211 | post to our release semaphore. So the two cases are that we will | |
212 | either win the race an momentarily block on the semaphore, or lose | |
213 | the race and find the semaphore already unlocked and so not block. | |
b40c885f AM |
214 | Either way we get correct results. |
215 | However, there is an implicit flush on entry to an ordered region, | |
216 | so we do need to have a barrier here. If we were taking a lock | |
217 | this could be MEMMODEL_RELEASE since the acquire would be coverd | |
218 | by the lock. */ | |
953ff289 | 219 | |
b40c885f | 220 | __atomic_thread_fence (MEMMODEL_ACQ_REL); |
953ff289 DN |
221 | if (ws->ordered_owner != thr->ts.team_id) |
222 | { | |
223 | gomp_sem_wait (team->ordered_release[thr->ts.team_id]); | |
224 | ws->ordered_owner = thr->ts.team_id; | |
225 | } | |
226 | } | |
227 | ||
/* Entry point called by user code at the start of an ORDERED block.
   It checks whether the current thread is at the head of the queue and
   blocks if it is not; all of that is done by gomp_ordered_sync, to
   which this symbol is either aliased or forwards.  */

#ifndef HAVE_ATTRIBUTE_ALIAS
void
GOMP_ordered_start (void)
{
  gomp_ordered_sync ();
}
#else
extern void GOMP_ordered_start (void)
  __attribute__((alias ("gomp_ordered_sync")));
#endif
242 | ||
/* Entry point called by user code at the end of an ORDERED block.  With
   the current ORDERED implementation there is nothing to do here.

   The current implementation has a flaw, though: it does not let the
   next thread into the ORDERED section immediately after the current
   thread leaves the ORDERED section in its last iteration.  The
   existence of this function allows the implementation to change.  */

void
GOMP_ordered_end (void)
{
  /* Intentionally empty.  */
}
d9a6bd32 JJ |
256 | |
257 | /* DOACROSS initialization. */ | |
258 | ||
259 | #define MAX_COLLAPSED_BITS (__SIZEOF_LONG__ * __CHAR_BIT__) | |
260 | ||
261 | void | |
28567c40 JJ |
262 | gomp_doacross_init (unsigned ncounts, long *counts, long chunk_size, |
263 | size_t extra) | |
d9a6bd32 JJ |
264 | { |
265 | struct gomp_thread *thr = gomp_thread (); | |
266 | struct gomp_team *team = thr->ts.team; | |
267 | struct gomp_work_share *ws = thr->ts.work_share; | |
268 | unsigned int i, bits[MAX_COLLAPSED_BITS], num_bits = 0; | |
269 | unsigned long ent, num_ents, elt_sz, shift_sz; | |
270 | struct gomp_doacross_work_share *doacross; | |
271 | ||
272 | if (team == NULL || team->nthreads == 1) | |
28567c40 JJ |
273 | { |
274 | empty: | |
275 | if (!extra) | |
276 | ws->doacross = NULL; | |
277 | else | |
278 | { | |
279 | doacross = gomp_malloc_cleared (sizeof (*doacross) + extra); | |
280 | doacross->extra = (void *) (doacross + 1); | |
281 | ws->doacross = doacross; | |
282 | } | |
283 | return; | |
284 | } | |
d9a6bd32 JJ |
285 | |
286 | for (i = 0; i < ncounts; i++) | |
287 | { | |
288 | /* If any count is 0, GOMP_doacross_{post,wait} can't be called. */ | |
289 | if (counts[i] == 0) | |
28567c40 | 290 | goto empty; |
d9a6bd32 JJ |
291 | |
292 | if (num_bits <= MAX_COLLAPSED_BITS) | |
293 | { | |
294 | unsigned int this_bits; | |
295 | if (counts[i] == 1) | |
296 | this_bits = 1; | |
297 | else | |
298 | this_bits = __SIZEOF_LONG__ * __CHAR_BIT__ | |
299 | - __builtin_clzl (counts[i] - 1); | |
300 | if (num_bits + this_bits <= MAX_COLLAPSED_BITS) | |
301 | { | |
302 | bits[i] = this_bits; | |
303 | num_bits += this_bits; | |
304 | } | |
305 | else | |
306 | num_bits = MAX_COLLAPSED_BITS + 1; | |
307 | } | |
308 | } | |
309 | ||
310 | if (ws->sched == GFS_STATIC) | |
311 | num_ents = team->nthreads; | |
e4606348 JJ |
312 | else if (ws->sched == GFS_GUIDED) |
313 | num_ents = counts[0]; | |
d9a6bd32 JJ |
314 | else |
315 | num_ents = (counts[0] - 1) / chunk_size + 1; | |
316 | if (num_bits <= MAX_COLLAPSED_BITS) | |
317 | { | |
318 | elt_sz = sizeof (unsigned long); | |
319 | shift_sz = ncounts * sizeof (unsigned int); | |
320 | } | |
321 | else | |
322 | { | |
323 | elt_sz = sizeof (unsigned long) * ncounts; | |
324 | shift_sz = 0; | |
325 | } | |
326 | elt_sz = (elt_sz + 63) & ~63UL; | |
327 | ||
328 | doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz | |
28567c40 | 329 | + shift_sz + extra); |
d9a6bd32 JJ |
330 | doacross->chunk_size = chunk_size; |
331 | doacross->elt_sz = elt_sz; | |
332 | doacross->ncounts = ncounts; | |
333 | doacross->flattened = false; | |
334 | doacross->array = (unsigned char *) | |
335 | ((((uintptr_t) (doacross + 1)) + 63 + shift_sz) | |
336 | & ~(uintptr_t) 63); | |
28567c40 JJ |
337 | if (extra) |
338 | { | |
339 | doacross->extra = doacross->array + num_ents * elt_sz; | |
340 | memset (doacross->extra, '\0', extra); | |
341 | } | |
342 | else | |
343 | doacross->extra = NULL; | |
d9a6bd32 JJ |
344 | if (num_bits <= MAX_COLLAPSED_BITS) |
345 | { | |
346 | unsigned int shift_count = 0; | |
347 | doacross->flattened = true; | |
348 | for (i = ncounts; i > 0; i--) | |
349 | { | |
350 | doacross->shift_counts[i - 1] = shift_count; | |
351 | shift_count += bits[i - 1]; | |
352 | } | |
353 | for (ent = 0; ent < num_ents; ent++) | |
354 | *(unsigned long *) (doacross->array + ent * elt_sz) = 0; | |
355 | } | |
356 | else | |
357 | for (ent = 0; ent < num_ents; ent++) | |
358 | memset (doacross->array + ent * elt_sz, '\0', | |
359 | sizeof (unsigned long) * ncounts); | |
360 | if (ws->sched == GFS_STATIC && chunk_size == 0) | |
361 | { | |
362 | unsigned long q = counts[0] / num_ents; | |
363 | unsigned long t = counts[0] % num_ents; | |
364 | doacross->boundary = t * (q + 1); | |
365 | doacross->q = q; | |
366 | doacross->t = t; | |
367 | } | |
368 | ws->doacross = doacross; | |
369 | } | |
370 | ||
371 | /* DOACROSS POST operation. */ | |
372 | ||
373 | void | |
374 | GOMP_doacross_post (long *counts) | |
375 | { | |
376 | struct gomp_thread *thr = gomp_thread (); | |
377 | struct gomp_work_share *ws = thr->ts.work_share; | |
378 | struct gomp_doacross_work_share *doacross = ws->doacross; | |
379 | unsigned long ent; | |
380 | unsigned int i; | |
381 | ||
28567c40 JJ |
382 | if (__builtin_expect (doacross == NULL, 0) |
383 | || __builtin_expect (doacross->array == NULL, 0)) | |
d9a6bd32 JJ |
384 | { |
385 | __sync_synchronize (); | |
386 | return; | |
387 | } | |
388 | ||
389 | if (__builtin_expect (ws->sched == GFS_STATIC, 1)) | |
390 | ent = thr->ts.team_id; | |
e4606348 JJ |
391 | else if (ws->sched == GFS_GUIDED) |
392 | ent = counts[0]; | |
d9a6bd32 JJ |
393 | else |
394 | ent = counts[0] / doacross->chunk_size; | |
395 | unsigned long *array = (unsigned long *) (doacross->array | |
396 | + ent * doacross->elt_sz); | |
397 | ||
398 | if (__builtin_expect (doacross->flattened, 1)) | |
399 | { | |
400 | unsigned long flattened | |
401 | = (unsigned long) counts[0] << doacross->shift_counts[0]; | |
402 | ||
403 | for (i = 1; i < doacross->ncounts; i++) | |
404 | flattened |= (unsigned long) counts[i] | |
405 | << doacross->shift_counts[i]; | |
406 | flattened++; | |
407 | if (flattened == __atomic_load_n (array, MEMMODEL_ACQUIRE)) | |
408 | __atomic_thread_fence (MEMMODEL_RELEASE); | |
409 | else | |
410 | __atomic_store_n (array, flattened, MEMMODEL_RELEASE); | |
411 | return; | |
412 | } | |
413 | ||
414 | __atomic_thread_fence (MEMMODEL_ACQUIRE); | |
415 | for (i = doacross->ncounts; i-- > 0; ) | |
416 | { | |
417 | if (counts[i] + 1UL != __atomic_load_n (&array[i], MEMMODEL_RELAXED)) | |
418 | __atomic_store_n (&array[i], counts[i] + 1UL, MEMMODEL_RELEASE); | |
419 | } | |
420 | } | |
421 | ||
422 | /* DOACROSS WAIT operation. */ | |
423 | ||
424 | void | |
425 | GOMP_doacross_wait (long first, ...) | |
426 | { | |
427 | struct gomp_thread *thr = gomp_thread (); | |
428 | struct gomp_work_share *ws = thr->ts.work_share; | |
429 | struct gomp_doacross_work_share *doacross = ws->doacross; | |
430 | va_list ap; | |
431 | unsigned long ent; | |
432 | unsigned int i; | |
433 | ||
28567c40 JJ |
434 | if (__builtin_expect (doacross == NULL, 0) |
435 | || __builtin_expect (doacross->array == NULL, 0)) | |
d9a6bd32 JJ |
436 | { |
437 | __sync_synchronize (); | |
438 | return; | |
439 | } | |
440 | ||
441 | if (__builtin_expect (ws->sched == GFS_STATIC, 1)) | |
442 | { | |
443 | if (ws->chunk_size == 0) | |
444 | { | |
445 | if (first < doacross->boundary) | |
446 | ent = first / (doacross->q + 1); | |
447 | else | |
448 | ent = (first - doacross->boundary) / doacross->q | |
449 | + doacross->t; | |
450 | } | |
451 | else | |
452 | ent = first / ws->chunk_size % thr->ts.team->nthreads; | |
453 | } | |
e4606348 JJ |
454 | else if (ws->sched == GFS_GUIDED) |
455 | ent = first; | |
d9a6bd32 JJ |
456 | else |
457 | ent = first / doacross->chunk_size; | |
458 | unsigned long *array = (unsigned long *) (doacross->array | |
459 | + ent * doacross->elt_sz); | |
460 | ||
461 | if (__builtin_expect (doacross->flattened, 1)) | |
462 | { | |
463 | unsigned long flattened | |
464 | = (unsigned long) first << doacross->shift_counts[0]; | |
465 | unsigned long cur; | |
466 | ||
467 | va_start (ap, first); | |
468 | for (i = 1; i < doacross->ncounts; i++) | |
469 | flattened |= (unsigned long) va_arg (ap, long) | |
470 | << doacross->shift_counts[i]; | |
471 | cur = __atomic_load_n (array, MEMMODEL_ACQUIRE); | |
472 | if (flattened < cur) | |
473 | { | |
474 | __atomic_thread_fence (MEMMODEL_RELEASE); | |
475 | va_end (ap); | |
476 | return; | |
477 | } | |
478 | doacross_spin (array, flattened, cur); | |
479 | __atomic_thread_fence (MEMMODEL_RELEASE); | |
480 | va_end (ap); | |
481 | return; | |
482 | } | |
483 | ||
484 | do | |
485 | { | |
486 | va_start (ap, first); | |
487 | for (i = 0; i < doacross->ncounts; i++) | |
488 | { | |
489 | unsigned long thisv | |
490 | = (unsigned long) (i ? va_arg (ap, long) : first) + 1; | |
491 | unsigned long cur = __atomic_load_n (&array[i], MEMMODEL_RELAXED); | |
492 | if (thisv < cur) | |
493 | { | |
494 | i = doacross->ncounts; | |
495 | break; | |
496 | } | |
497 | if (thisv > cur) | |
498 | break; | |
499 | } | |
500 | va_end (ap); | |
501 | if (i == doacross->ncounts) | |
502 | break; | |
503 | cpu_relax (); | |
504 | } | |
505 | while (1); | |
506 | __sync_synchronize (); | |
507 | } | |
508 | ||
509 | typedef unsigned long long gomp_ull; | |
510 | ||
511 | void | |
28567c40 JJ |
512 | gomp_doacross_ull_init (unsigned ncounts, gomp_ull *counts, |
513 | gomp_ull chunk_size, size_t extra) | |
d9a6bd32 JJ |
514 | { |
515 | struct gomp_thread *thr = gomp_thread (); | |
516 | struct gomp_team *team = thr->ts.team; | |
517 | struct gomp_work_share *ws = thr->ts.work_share; | |
518 | unsigned int i, bits[MAX_COLLAPSED_BITS], num_bits = 0; | |
519 | unsigned long ent, num_ents, elt_sz, shift_sz; | |
520 | struct gomp_doacross_work_share *doacross; | |
521 | ||
522 | if (team == NULL || team->nthreads == 1) | |
28567c40 JJ |
523 | { |
524 | empty: | |
525 | if (!extra) | |
526 | ws->doacross = NULL; | |
527 | else | |
528 | { | |
529 | doacross = gomp_malloc_cleared (sizeof (*doacross) + extra); | |
530 | doacross->extra = (void *) (doacross + 1); | |
531 | ws->doacross = doacross; | |
532 | } | |
533 | return; | |
534 | } | |
d9a6bd32 JJ |
535 | |
536 | for (i = 0; i < ncounts; i++) | |
537 | { | |
538 | /* If any count is 0, GOMP_doacross_{post,wait} can't be called. */ | |
539 | if (counts[i] == 0) | |
28567c40 | 540 | goto empty; |
d9a6bd32 JJ |
541 | |
542 | if (num_bits <= MAX_COLLAPSED_BITS) | |
543 | { | |
544 | unsigned int this_bits; | |
545 | if (counts[i] == 1) | |
546 | this_bits = 1; | |
547 | else | |
548 | this_bits = __SIZEOF_LONG_LONG__ * __CHAR_BIT__ | |
549 | - __builtin_clzll (counts[i] - 1); | |
550 | if (num_bits + this_bits <= MAX_COLLAPSED_BITS) | |
551 | { | |
552 | bits[i] = this_bits; | |
553 | num_bits += this_bits; | |
554 | } | |
555 | else | |
556 | num_bits = MAX_COLLAPSED_BITS + 1; | |
557 | } | |
558 | } | |
559 | ||
560 | if (ws->sched == GFS_STATIC) | |
561 | num_ents = team->nthreads; | |
e4606348 JJ |
562 | else if (ws->sched == GFS_GUIDED) |
563 | num_ents = counts[0]; | |
d9a6bd32 JJ |
564 | else |
565 | num_ents = (counts[0] - 1) / chunk_size + 1; | |
566 | if (num_bits <= MAX_COLLAPSED_BITS) | |
567 | { | |
568 | elt_sz = sizeof (unsigned long); | |
569 | shift_sz = ncounts * sizeof (unsigned int); | |
570 | } | |
571 | else | |
572 | { | |
573 | if (sizeof (gomp_ull) == sizeof (unsigned long)) | |
574 | elt_sz = sizeof (gomp_ull) * ncounts; | |
575 | else if (sizeof (gomp_ull) == 2 * sizeof (unsigned long)) | |
576 | elt_sz = sizeof (unsigned long) * 2 * ncounts; | |
577 | else | |
578 | abort (); | |
579 | shift_sz = 0; | |
580 | } | |
581 | elt_sz = (elt_sz + 63) & ~63UL; | |
582 | ||
583 | doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz | |
584 | + shift_sz); | |
585 | doacross->chunk_size_ull = chunk_size; | |
586 | doacross->elt_sz = elt_sz; | |
587 | doacross->ncounts = ncounts; | |
588 | doacross->flattened = false; | |
589 | doacross->boundary = 0; | |
590 | doacross->array = (unsigned char *) | |
591 | ((((uintptr_t) (doacross + 1)) + 63 + shift_sz) | |
592 | & ~(uintptr_t) 63); | |
28567c40 JJ |
593 | if (extra) |
594 | { | |
595 | doacross->extra = doacross->array + num_ents * elt_sz; | |
596 | memset (doacross->extra, '\0', extra); | |
597 | } | |
598 | else | |
599 | doacross->extra = NULL; | |
d9a6bd32 JJ |
600 | if (num_bits <= MAX_COLLAPSED_BITS) |
601 | { | |
602 | unsigned int shift_count = 0; | |
603 | doacross->flattened = true; | |
604 | for (i = ncounts; i > 0; i--) | |
605 | { | |
606 | doacross->shift_counts[i - 1] = shift_count; | |
607 | shift_count += bits[i - 1]; | |
608 | } | |
609 | for (ent = 0; ent < num_ents; ent++) | |
610 | *(unsigned long *) (doacross->array + ent * elt_sz) = 0; | |
611 | } | |
612 | else | |
613 | for (ent = 0; ent < num_ents; ent++) | |
614 | memset (doacross->array + ent * elt_sz, '\0', | |
615 | sizeof (unsigned long) * ncounts); | |
616 | if (ws->sched == GFS_STATIC && chunk_size == 0) | |
617 | { | |
618 | gomp_ull q = counts[0] / num_ents; | |
619 | gomp_ull t = counts[0] % num_ents; | |
620 | doacross->boundary_ull = t * (q + 1); | |
621 | doacross->q_ull = q; | |
622 | doacross->t = t; | |
623 | } | |
624 | ws->doacross = doacross; | |
625 | } | |
626 | ||
627 | /* DOACROSS POST operation. */ | |
628 | ||
629 | void | |
630 | GOMP_doacross_ull_post (gomp_ull *counts) | |
631 | { | |
632 | struct gomp_thread *thr = gomp_thread (); | |
633 | struct gomp_work_share *ws = thr->ts.work_share; | |
634 | struct gomp_doacross_work_share *doacross = ws->doacross; | |
635 | unsigned long ent; | |
636 | unsigned int i; | |
637 | ||
28567c40 JJ |
638 | if (__builtin_expect (doacross == NULL, 0) |
639 | || __builtin_expect (doacross->array == NULL, 0)) | |
d9a6bd32 JJ |
640 | { |
641 | __sync_synchronize (); | |
642 | return; | |
643 | } | |
644 | ||
645 | if (__builtin_expect (ws->sched == GFS_STATIC, 1)) | |
646 | ent = thr->ts.team_id; | |
e4606348 JJ |
647 | else if (ws->sched == GFS_GUIDED) |
648 | ent = counts[0]; | |
d9a6bd32 JJ |
649 | else |
650 | ent = counts[0] / doacross->chunk_size_ull; | |
651 | ||
652 | if (__builtin_expect (doacross->flattened, 1)) | |
653 | { | |
654 | unsigned long *array = (unsigned long *) (doacross->array | |
655 | + ent * doacross->elt_sz); | |
656 | gomp_ull flattened | |
657 | = counts[0] << doacross->shift_counts[0]; | |
658 | ||
659 | for (i = 1; i < doacross->ncounts; i++) | |
660 | flattened |= counts[i] << doacross->shift_counts[i]; | |
661 | flattened++; | |
662 | if (flattened == __atomic_load_n (array, MEMMODEL_ACQUIRE)) | |
663 | __atomic_thread_fence (MEMMODEL_RELEASE); | |
664 | else | |
665 | __atomic_store_n (array, flattened, MEMMODEL_RELEASE); | |
666 | return; | |
667 | } | |
668 | ||
669 | __atomic_thread_fence (MEMMODEL_ACQUIRE); | |
670 | if (sizeof (gomp_ull) == sizeof (unsigned long)) | |
671 | { | |
672 | gomp_ull *array = (gomp_ull *) (doacross->array | |
673 | + ent * doacross->elt_sz); | |
674 | ||
675 | for (i = doacross->ncounts; i-- > 0; ) | |
676 | { | |
677 | if (counts[i] + 1UL != __atomic_load_n (&array[i], MEMMODEL_RELAXED)) | |
678 | __atomic_store_n (&array[i], counts[i] + 1UL, MEMMODEL_RELEASE); | |
679 | } | |
680 | } | |
681 | else | |
682 | { | |
683 | unsigned long *array = (unsigned long *) (doacross->array | |
684 | + ent * doacross->elt_sz); | |
685 | ||
686 | for (i = doacross->ncounts; i-- > 0; ) | |
687 | { | |
688 | gomp_ull cull = counts[i] + 1UL; | |
689 | unsigned long c = (unsigned long) cull; | |
690 | if (c != __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED)) | |
691 | __atomic_store_n (&array[2 * i + 1], c, MEMMODEL_RELEASE); | |
692 | c = cull >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2); | |
693 | if (c != __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED)) | |
694 | __atomic_store_n (&array[2 * i], c, MEMMODEL_RELEASE); | |
695 | } | |
696 | } | |
697 | } | |
698 | ||
699 | /* DOACROSS WAIT operation. */ | |
700 | ||
701 | void | |
702 | GOMP_doacross_ull_wait (gomp_ull first, ...) | |
703 | { | |
704 | struct gomp_thread *thr = gomp_thread (); | |
705 | struct gomp_work_share *ws = thr->ts.work_share; | |
706 | struct gomp_doacross_work_share *doacross = ws->doacross; | |
707 | va_list ap; | |
708 | unsigned long ent; | |
709 | unsigned int i; | |
710 | ||
28567c40 JJ |
711 | if (__builtin_expect (doacross == NULL, 0) |
712 | || __builtin_expect (doacross->array == NULL, 0)) | |
d9a6bd32 JJ |
713 | { |
714 | __sync_synchronize (); | |
715 | return; | |
716 | } | |
717 | ||
718 | if (__builtin_expect (ws->sched == GFS_STATIC, 1)) | |
719 | { | |
720 | if (ws->chunk_size_ull == 0) | |
721 | { | |
722 | if (first < doacross->boundary_ull) | |
723 | ent = first / (doacross->q_ull + 1); | |
724 | else | |
725 | ent = (first - doacross->boundary_ull) / doacross->q_ull | |
726 | + doacross->t; | |
727 | } | |
728 | else | |
729 | ent = first / ws->chunk_size_ull % thr->ts.team->nthreads; | |
730 | } | |
e4606348 JJ |
731 | else if (ws->sched == GFS_GUIDED) |
732 | ent = first; | |
d9a6bd32 JJ |
733 | else |
734 | ent = first / doacross->chunk_size_ull; | |
735 | ||
736 | if (__builtin_expect (doacross->flattened, 1)) | |
737 | { | |
738 | unsigned long *array = (unsigned long *) (doacross->array | |
739 | + ent * doacross->elt_sz); | |
740 | gomp_ull flattened = first << doacross->shift_counts[0]; | |
741 | unsigned long cur; | |
742 | ||
743 | va_start (ap, first); | |
744 | for (i = 1; i < doacross->ncounts; i++) | |
745 | flattened |= va_arg (ap, gomp_ull) | |
746 | << doacross->shift_counts[i]; | |
747 | cur = __atomic_load_n (array, MEMMODEL_ACQUIRE); | |
748 | if (flattened < cur) | |
749 | { | |
750 | __atomic_thread_fence (MEMMODEL_RELEASE); | |
751 | va_end (ap); | |
752 | return; | |
753 | } | |
754 | doacross_spin (array, flattened, cur); | |
755 | __atomic_thread_fence (MEMMODEL_RELEASE); | |
756 | va_end (ap); | |
757 | return; | |
758 | } | |
759 | ||
760 | if (sizeof (gomp_ull) == sizeof (unsigned long)) | |
761 | { | |
762 | gomp_ull *array = (gomp_ull *) (doacross->array | |
763 | + ent * doacross->elt_sz); | |
764 | do | |
765 | { | |
766 | va_start (ap, first); | |
767 | for (i = 0; i < doacross->ncounts; i++) | |
768 | { | |
769 | gomp_ull thisv | |
770 | = (i ? va_arg (ap, gomp_ull) : first) + 1; | |
771 | gomp_ull cur = __atomic_load_n (&array[i], MEMMODEL_RELAXED); | |
772 | if (thisv < cur) | |
773 | { | |
774 | i = doacross->ncounts; | |
775 | break; | |
776 | } | |
777 | if (thisv > cur) | |
778 | break; | |
779 | } | |
780 | va_end (ap); | |
781 | if (i == doacross->ncounts) | |
782 | break; | |
783 | cpu_relax (); | |
784 | } | |
785 | while (1); | |
786 | } | |
787 | else | |
788 | { | |
789 | unsigned long *array = (unsigned long *) (doacross->array | |
790 | + ent * doacross->elt_sz); | |
791 | do | |
792 | { | |
793 | va_start (ap, first); | |
794 | for (i = 0; i < doacross->ncounts; i++) | |
795 | { | |
796 | gomp_ull thisv | |
797 | = (i ? va_arg (ap, gomp_ull) : first) + 1; | |
798 | unsigned long t | |
799 | = thisv >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2); | |
800 | unsigned long cur | |
801 | = __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED); | |
802 | if (t < cur) | |
803 | { | |
804 | i = doacross->ncounts; | |
805 | break; | |
806 | } | |
807 | if (t > cur) | |
808 | break; | |
809 | t = thisv; | |
810 | cur = __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED); | |
811 | if (t < cur) | |
812 | { | |
813 | i = doacross->ncounts; | |
814 | break; | |
815 | } | |
816 | if (t > cur) | |
817 | break; | |
818 | } | |
819 | va_end (ap); | |
820 | if (i == doacross->ncounts) | |
821 | break; | |
822 | cpu_relax (); | |
823 | } | |
824 | while (1); | |
825 | } | |
826 | __sync_synchronize (); | |
827 | } |