/* Copyright (C) 2003-2019 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Martin Schwidefsky <schwidefsky@de.ibm.com>, 2003.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <endian.h>
#include <errno.h>
#include <sysdep.h>
#include <futex-internal.h>
#include <pthread.h>
#include <pthreadP.h>
#include <sys/time.h>
#include <atomic.h>
#include <stdint.h>
#include <stdbool.h>

#include <shlib-compat.h>
#include <stap-probe.h>
#include <time.h>

#include "pthread_cond_common.c"

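/* Arguments for the cancellation cleanup handler (see
   __condvar_cleanup_waiting below); a pointer to this buffer is registered
   via __pthread_cleanup_push before blocking on the futex.  */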
struct _condvar_cleanup_buffer
{
  uint64_t wseq;
  pthread_cond_t *cond;
  pthread_mutex_t *mutex;
  int private;
};


/* Decrease the waiter reference count.  */
static void
__condvar_confirm_wakeup (pthread_cond_t *cond, int private)
{
  /* If destruction is pending (i.e., the wake-request flag is nonzero) and we
     are the last waiter (prior value of __wrefs was 1 << 3), then wake any
     threads waiting in pthread_cond_destroy.  Release MO to synchronize with
     these threads.  Don't bother clearing the wake-up request flag.  */
  if ((atomic_fetch_add_release (&cond->__data.__wrefs, -8) >> 2) == 3)
    futex_wake (&cond->__data.__wrefs, INT_MAX, private);
}
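
/* Illustrative sketch only (not part of the implementation): __wrefs keeps
   the waiter reference count in bits 3 and up, so references are taken and
   dropped in steps of 8.  The test above is a compact way of checking
   "wake-request flag set and we were the last waiter":

     unsigned int prev = atomic_fetch_add_release (&cond->__data.__wrefs, -8);
     bool wake_requested = (prev & 4) != 0;  // bit 2
     bool last_waiter = (prev >> 3) == 1;    // refcount before the decrement
     // (prev >> 2) == 3 holds iff both conditions above hold.
*/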


/* Cancel waiting after having registered as a waiter previously.  SEQ is our
   position and G is our group index.
   The goal of cancellation is to make our group smaller if that is still
   possible.  If we are in a closed group, this is not possible anymore; in
   this case, we need to send a replacement signal for the one we effectively
   consumed because the signal should have gotten consumed by another waiter
   instead; we must not both cancel waiting and consume a signal.

   Must not be called while still holding a reference on the group.

   There is no return value; if we effectively consumed a signal, we send a
   replacement signal before returning.

   On some kinds of timeouts, we may be able to pretend that a signal we
   effectively consumed happened before the timeout (i.e., similarly to first
   spinning on signals before actually checking whether the timeout has
   passed already).  Doing this would allow us to skip sending a replacement
   signal, but this case might happen rarely because the end of the timeout
   must race with someone else sending a signal.  Therefore, we don't bother
   trying to optimize this.  */
static void
__condvar_cancel_waiting (pthread_cond_t *cond, uint64_t seq, unsigned int g,
			  int private)
{
  bool consumed_signal = false;

  /* No deadlock with group switching is possible here because we do not
     hold a reference on the group.  */
  __condvar_acquire_lock (cond, private);

  uint64_t g1_start = __condvar_load_g1_start_relaxed (cond) >> 1;
  if (g1_start > seq)
    {
      /* Our group is closed, so someone provided enough signals for it.
	 Thus, we effectively consumed a signal.  */
      consumed_signal = true;
    }
  else
    {
      if (g1_start + __condvar_get_orig_size (cond) <= seq)
	{
	  /* We are in the current G2 and thus cannot have consumed a signal.
	     Reduce its effective size or handle overflow.  Remember that in
	     G2, the unsigned size is zero or an effectively negative value
	     (unsigned overflow makes it hold negative values too).  */
	  if (cond->__data.__g_size[g] + __PTHREAD_COND_MAX_GROUP_SIZE > 0)
	    {
	      cond->__data.__g_size[g]--;
	    }
	  else
	    {
	      /* Cancellations would overflow the maximum group size.  Just
		 wake up everyone spuriously to create a clean state.  This
		 also means we do not consume a signal someone else sent.  */
	      __condvar_release_lock (cond, private);
	      __pthread_cond_broadcast (cond);
	      return;
	    }
	}
      else
	{
	  /* We are in the current G1.  If the group's size is zero, someone
	     put a signal in the group that nobody else but us can consume.  */
	  if (cond->__data.__g_size[g] == 0)
	    consumed_signal = true;
	  else
	    {
	      /* Otherwise, we decrease the size of the group.  This is
		 equivalent to atomically putting in a signal just for us and
		 consuming it right away.  We do not consume a signal sent
		 by someone else.  We also cannot have consumed a futex
		 wake-up because if we were cancelled or timed out in a futex
		 call, the futex will wake another waiter.  */
	      cond->__data.__g_size[g]--;
	    }
	}
    }

  __condvar_release_lock (cond, private);

  if (consumed_signal)
    {
      /* We effectively consumed a signal even though we didn't want to.
	 Therefore, we need to send a replacement signal.
	 If we wanted to optimize this, we could do what
	 pthread_cond_signal does right in the critical section above.  */
      __pthread_cond_signal (cond);
    }
}

/* Wake up any signalers that might be waiting.  */
static void
__condvar_dec_grefs (pthread_cond_t *cond, unsigned int g, int private)
{
  /* Release MO to synchronize-with the acquire load in
     __condvar_quiesce_and_switch_g1.  */
  if (atomic_fetch_add_release (cond->__data.__g_refs + g, -2) == 3)
    {
      /* Clear the wake-up request flag before waking up.  We do not need more
	 than relaxed MO and it doesn't matter if we apply this for an aliased
	 group because we wake all futex waiters right after clearing the
	 flag.  */
      atomic_fetch_and_relaxed (cond->__data.__g_refs + g, ~(unsigned int) 1);
      futex_wake (cond->__data.__g_refs + g, INT_MAX, private);
    }
}
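
/* Illustrative sketch only (not part of the implementation): __g_refs keeps
   the futex-waiter reference count in bits 1 and up, so references change in
   steps of 2, and the LSB is the wake-request flag.  The "== 3" test above
   therefore means "the flag was set and we just dropped the last reference":

     unsigned int prev
       = atomic_fetch_add_release (cond->__data.__g_refs + g, -2);
     bool wake_requested = (prev & 1) != 0;   // LSB
     bool last_reference = (prev >> 1) == 1;  // refcount before the decrement
     // prev == 3 holds iff both conditions above hold.
*/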

/* Clean-up for cancellation of waiters waiting for normal signals.  We cancel
   our registration as a waiter, confirm we have woken up, and re-acquire the
   mutex.  */
static void
__condvar_cleanup_waiting (void *arg)
{
  struct _condvar_cleanup_buffer *cbuffer =
    (struct _condvar_cleanup_buffer *) arg;
  pthread_cond_t *cond = cbuffer->cond;
  unsigned g = cbuffer->wseq & 1;

  __condvar_dec_grefs (cond, g, cbuffer->private);

  __condvar_cancel_waiting (cond, cbuffer->wseq >> 1, g, cbuffer->private);
  /* FIXME With the current cancellation implementation, it is possible that
     a thread is cancelled after it has returned from a syscall.  This could
     result in a cancelled waiter consuming a futex wake-up that is then
     causing another waiter in the same group to not wake up.  To work around
     this issue until we have fixed cancellation, just add a futex wake-up
     conservatively.  */
  futex_wake (cond->__data.__g_signals + g, 1, cbuffer->private);

  __condvar_confirm_wakeup (cond, cbuffer->private);

  /* XXX If locking the mutex fails, should we just stop execution?  This
     might be better than silently ignoring the error.  */
  __pthread_mutex_cond_lock (cbuffer->mutex);
}

/* This condvar implementation guarantees that all calls to signal and
   broadcast and all of the three virtually atomic parts of each call to wait
   (i.e., (1) releasing the mutex and blocking, (2) unblocking, and (3) re-
   acquiring the mutex) happen in some total order that is consistent with the
   happens-before relations in the calling program.  However, this order does
   not necessarily result in additional happens-before relations being
   established (which aligns well with spurious wake-ups being allowed).

   All waiters acquire a certain position in a 64b waiter sequence (__wseq).
   This sequence determines which waiters are allowed to consume signals.
   A broadcast is equal to sending as many signals as there are unblocked
   waiters.  When a signal arrives, it samples the current value of __wseq
   with a relaxed-MO load (i.e., the position the next waiter would get).
   (This is sufficient because it is consistent with happens-before; the
   caller can enforce stronger ordering constraints by calling signal while
   holding the mutex.)  Only waiters with a position less than the __wseq
   value observed by the signal are eligible to consume this signal.

   This would be straightforward to implement if waiters would just spin, but
   we need to let them block using futexes.  Futexes give no guarantee of
   waking in FIFO order, so we cannot reliably wake eligible waiters if we
   just use a single futex.  Also, futex words are 32b in size, but we need
   to distinguish more than 1<<32 states because we need to represent the
   order of wake-up (and thus which waiters are eligible to consume signals);
   blocking in a futex is not atomic with a waiter determining its position in
   the waiter sequence, so we need the futex word to reliably notify waiters
   that they should not attempt to block anymore because they have been
   already signaled in the meantime.  While an ABA issue on a 32b value will
   be rare, ignoring it when we are aware of it is not the right thing to do
   either.

   Therefore, we use a 64b counter to represent the waiter sequence (on
   architectures which only support 32b atomics, we use a few bits less).
   To deal with the blocking using futexes, we maintain two groups of waiters:
   * Group G1 consists of waiters that are all eligible to consume signals;
     incoming signals will always signal waiters in this group until all
     waiters in G1 have been signaled.
   * Group G2 consists of waiters that arrive when a G1 is present and still
     contains waiters that have not been signaled.  When all waiters in G1
     are signaled and a new signal arrives, the new signal will convert G2
     into the new G1 and create a new G2 for future waiters.

   We cannot allocate new memory because of process-shared condvars, so we
   have just two slots of groups that change their role between G1 and G2.
   Each has a separate futex word, a number of signals available for
   consumption, a size (number of waiters in the group that have not been
   signaled), and a reference count.

   The group reference count is used to maintain the number of waiters that
   are using the group's futex.  Before a group can change its role, the
   reference count must show that no waiters are using the futex anymore; this
   prevents ABA issues on the futex word.

   To represent which intervals in the waiter sequence the groups cover (and
   thus also which group slot contains G1 or G2), we use a 64b counter to
   designate the start position of G1 (inclusive), and a single bit in the
   waiter sequence counter to represent which group slot currently contains
   G2.  This allows us to switch group roles atomically wrt. waiters obtaining
   a position in the waiter sequence.  The G1 start position allows waiters to
   figure out whether they are in a group that has already been completely
   signaled (i.e., if the current G1 starts at a later position than the
   waiter's position).  Waiters cannot determine whether they are currently
   in G2 or G1 -- but they do not have to, because all they are interested in
   is whether there are available signals, and they always start in G2 (whose
   group slot they know because of the bit in the waiter sequence).  Signalers
   will simply fill the right group until it is completely signaled and can
   be closed (they do not switch group roles until they really have to, to
   decrease the likelihood of having to wait for waiters still holding a
   reference on the now-closed G1).

   Signalers maintain the initial size of G1 to be able to determine where
   G2 starts (G2 is always open-ended until it becomes G1).  They track the
   remaining size of a group; when waiters cancel waiting (due to PThreads
   cancellation or timeouts), they will decrease this remaining size as well.

   To implement condvar destruction requirements (i.e., that
   pthread_cond_destroy can be called as soon as all waiters have been
   signaled), waiters increment a reference count before starting to wait and
   decrement it after they stopped waiting but right before they acquire the
   mutex associated with the condvar.

   pthread_cond_t thus consists of the following fields (bits that are used
   for flags and are not part of the primary value of each field but
   necessary to make some things atomic or because there was no space for
   them elsewhere in the data structure):

   __wseq: Waiter sequence counter
     * LSB is the index of the current G2.
     * Waiters fetch-add while having acquired the mutex associated with the
       condvar.  Signalers load it and fetch-xor it concurrently.
   __g1_start: Starting position of G1 (inclusive)
     * LSB is the index of the current G2.
     * Modified by signalers while having acquired the condvar-internal lock
       and observed concurrently by waiters.
   __g1_orig_size: Initial size of G1
     * The two least-significant bits represent the condvar-internal lock.
     * Only accessed while having acquired the condvar-internal lock.
   __wrefs: Waiter reference counter.
     * Bit 2 is true if waiters should run futex_wake when they remove the
       last reference.  pthread_cond_destroy uses this as futex word.
     * Bit 1 is the clock ID (0 == CLOCK_REALTIME, 1 == CLOCK_MONOTONIC).
     * Bit 0 is true iff this is a process-shared condvar.
     * Simple reference count used by both waiters and pthread_cond_destroy.
     (If the format of __wrefs is changed, update nptl_lock_constants.pysym
      and the pretty printers.)
   For each of the two groups, we have:
   __g_refs: Futex waiter reference count.
     * LSB is true if waiters should run futex_wake when they remove the
       last reference.
     * Reference count used by waiters concurrently with signalers that have
       acquired the condvar-internal lock.
   __g_signals: The number of signals that can still be consumed.
     * Used as a futex word by waiters.  Used concurrently by waiters and
       signalers.
     * LSB is true iff this group has been completely signaled (i.e., it is
       closed).
   __g_size: Waiters remaining in this group (i.e., which have not been
     signaled yet).
     * Accessed by signalers and waiters that cancel waiting (both do so only
       when having acquired the condvar-internal lock).
     * The size of G2 is always zero because it cannot be determined until
       the group becomes G1.
     * Although this is of unsigned type, we rely on unsigned overflow rules
       to make this hold effectively negative values too (in particular,
       when waiters in G2 cancel waiting).

   A PTHREAD_COND_INITIALIZER condvar has all fields set to zero, which yields
   a condvar that has G2 starting at position 0 and a G1 that is closed.

   Because waiters do not claim ownership of a group right when obtaining a
   position in __wseq but only reference-count the group when using futexes
   to block, it can happen that a group gets closed before a waiter can
   increment the reference count.  Therefore, waiters have to check whether
   their group is already closed using __g1_start.  They also have to perform
   this check when spinning while trying to grab a signal from __g_signals.
   Note that for these checks, using relaxed MO to load __g1_start is
   sufficient because if a waiter can see a sufficiently large value, it
   could also have consumed a signal in the waiter's group.

   Waiters try to grab a signal from __g_signals without holding a reference
   count, which can lead to stealing a signal from a more recent group after
   their own group was already closed.  They cannot always detect whether they
   in fact did so because they do not know when they stole, but they can
   conservatively add a signal back to the group they stole from; if they
   did so unnecessarily, all that happens is a spurious wake-up.  To make this
   even less likely, __g1_start contains the index of the current G2 too,
   which allows waiters to check if there is aliasing on the group slots; if
   there wasn't, they didn't steal from the current G1, which means that the
   G1 they stole from must have been already closed and they do not need to
   fix anything.

   It is essential that the last field in pthread_cond_t is __g_signals[1]:
   The previous condvar used a pointer-sized field in pthread_cond_t, so a
   PTHREAD_COND_INITIALIZER from that condvar implementation might only
   initialize 4 bytes to zero instead of the 8 bytes we need (i.e., 44 bytes
   in total instead of the 48 we need).  __g_signals[1] is not accessed before
   the first group switch (G2 starts at index 0), which will set its value to
   zero after a harmless fetch-or whose return value is ignored.  This
   effectively completes initialization.


   Limitations:
   * This condvar isn't designed to allow for more than
     __PTHREAD_COND_MAX_GROUP_SIZE * (1 << 31) calls to __pthread_cond_wait.
   * More than __PTHREAD_COND_MAX_GROUP_SIZE concurrent waiters are not
     supported.
   * Beyond what is allowed as errors by POSIX or documented, we can also
     return the following errors:
     * EPERM if MUTEX is a recursive mutex and the caller doesn't own it.
     * EOWNERDEAD or ENOTRECOVERABLE when using robust mutexes.  Unlike
       for other errors, this can happen when we re-acquire the mutex; this
       isn't allowed by POSIX (which requires all errors to virtually happen
       before we release the mutex or change the condvar state), but there's
       nothing we can do really.
     * When using PTHREAD_MUTEX_PP_* mutexes, we can also return all errors
       returned by __pthread_tpp_change_priority.  We will already have
       released the mutex in such cases, so the caller cannot expect to own
       MUTEX.

   Other notes:
   * Instead of the normal mutex unlock / lock functions, we use
     __pthread_mutex_unlock_usercnt(m, 0) / __pthread_mutex_cond_lock(m)
     because those will not change the mutex-internal users count, so that it
     can be detected when a condvar is still associated with a particular
     mutex because there is a waiter blocked on this condvar using this
     mutex.  */
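
/* Illustrative sketch only (not part of the implementation): given the field
   layout documented above, a waiter's view of the condvar state decodes as
   follows (mirroring what the code below does piecemeal):

     uint64_t wseq = __condvar_load_wseq_relaxed (cond);
     unsigned int g2 = wseq & 1;     // group slot currently holding G2
     uint64_t next_pos = wseq >> 1;  // position the next waiter would get

     unsigned int wrefs = atomic_load_relaxed (&cond->__data.__wrefs);
     bool pshared = (wrefs & 1) != 0;    // bit 0: process-shared?
     bool monotonic = (wrefs & 2) != 0;  // bit 1: CLOCK_MONOTONIC?
     unsigned int refs = wrefs >> 3;     // bits 3+: waiter references
*/
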
static __always_inline int
__pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
			    clockid_t clockid,
			    const struct timespec *abstime)
{
  const int maxspin = 0;
  int err;
  int result = 0;

  LIBC_PROBE (cond_wait, 2, cond, mutex);

  /* clockid will already have been checked by
     __pthread_cond_clockwait or pthread_condattr_setclock, or we
     don't use it if abstime is NULL, so we don't need to check it
     here.  */

  /* Acquire a position (SEQ) in the waiter sequence (WSEQ).  We use an
     atomic operation because signals and broadcasts may update the group
     switch without acquiring the mutex.  We do not need release MO here
     because we do not need to establish any happens-before relation with
     signalers (see __pthread_cond_signal); modification order alone
     establishes a total order of waiters/signals.  We do need acquire MO
     to synchronize with group reinitialization in
     __condvar_quiesce_and_switch_g1.  */
  uint64_t wseq = __condvar_fetch_add_wseq_acquire (cond, 2);
  /* Find our group's index.  We always go into what was G2 when we acquired
     our position.  */
  unsigned int g = wseq & 1;
  uint64_t seq = wseq >> 1;

  /* Increase the waiter reference count.  Relaxed MO is sufficient because
     we only need to synchronize when decrementing the reference count.  */
  unsigned int flags = atomic_fetch_add_relaxed (&cond->__data.__wrefs, 8);
  int private = __condvar_get_private (flags);

  /* Now that we are registered as a waiter, we can release the mutex.
     Waiting on the condvar must be atomic with releasing the mutex, so if
     the mutex is used to establish a happens-before relation with any
     signaler, the waiter must be visible to the latter; thus, we release the
     mutex after registering as a waiter.
     If releasing the mutex fails, we just cancel our registration as a
     waiter and confirm that we have woken up.  */
  err = __pthread_mutex_unlock_usercnt (mutex, 0);
  if (__glibc_unlikely (err != 0))
    {
      __condvar_cancel_waiting (cond, seq, g, private);
      __condvar_confirm_wakeup (cond, private);
      return err;
    }

  /* Now wait until a signal is available in our group or it is closed.
     Acquire MO so that if we observe a value of zero written after group
     switching in __condvar_quiesce_and_switch_g1, we synchronize with that
     store and will see the prior update of __g1_start done while switching
     groups too.  */
  unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g);

  do
    {
      while (1)
	{
	  /* Spin-wait first.
	     Note that spinning first without checking whether a timeout
	     passed might lead to what looks like a spurious wake-up even
	     though we should return ETIMEDOUT (e.g., if the caller provides
	     an absolute timeout that is clearly in the past).  However,
	     (1) spurious wake-ups are allowed, (2) it seems unlikely that a
	     user will (ab)use pthread_cond_wait as a check for whether a
	     point in time is in the past, and (3) spinning first without
	     having to compare against the current time seems to be the right
	     choice from a performance perspective for most use cases.  */
	  unsigned int spin = maxspin;
	  while (signals == 0 && spin > 0)
	    {
	      /* Check that we are not spinning on a group that's already
		 closed.  */
	      if (seq < (__condvar_load_g1_start_relaxed (cond) >> 1))
		goto done;

	      /* TODO Back off.  */

	      /* Reload signals.  See above for MO.  */
	      signals = atomic_load_acquire (cond->__data.__g_signals + g);
	      spin--;
	    }

	  /* If our group will be closed as indicated by the flag on signals,
	     don't bother grabbing a signal.  */
	  if (signals & 1)
	    goto done;

	  /* If there is an available signal, don't block.  */
	  if (signals != 0)
	    break;

	  /* No signals available after spinning, so prepare to block.
	     We first acquire a group reference and use acquire MO for that so
	     that we synchronize with the dummy read-modify-write in
	     __condvar_quiesce_and_switch_g1 if we read from that.  In turn,
	     in this case this will make us see the closed flag on __g_signals
	     that designates a concurrent attempt to reuse the group's slot.
	     We use acquire MO for the __g_signals check to make the
	     __g1_start check work (see spinning above).
	     Note that the group reference acquisition will not mask the
	     release MO when decrementing the reference count because we use
	     an atomic read-modify-write operation and thus extend the release
	     sequence.  */
	  atomic_fetch_add_acquire (cond->__data.__g_refs + g, 2);
	  if (((atomic_load_acquire (cond->__data.__g_signals + g) & 1) != 0)
	      || (seq < (__condvar_load_g1_start_relaxed (cond) >> 1)))
	    {
	      /* Our group is closed.  Wake up any signalers that might be
		 waiting.  */
	      __condvar_dec_grefs (cond, g, private);
	      goto done;
	    }

	  // Now block.
	  struct _pthread_cleanup_buffer buffer;
	  struct _condvar_cleanup_buffer cbuffer;
	  cbuffer.wseq = wseq;
	  cbuffer.cond = cond;
	  cbuffer.mutex = mutex;
	  cbuffer.private = private;
	  __pthread_cleanup_push (&buffer, __condvar_cleanup_waiting, &cbuffer);

	  if (abstime == NULL)
	    {
	      /* Block without a timeout.  */
	      err = futex_wait_cancelable (
		  cond->__data.__g_signals + g, 0, private);
	    }
	  else
	    {
	      /* Block, but with a timeout.
		 Work around the fact that the kernel rejects negative timeout
		 values despite them being valid.  */
	      if (__glibc_unlikely (abstime->tv_sec < 0))
		err = ETIMEDOUT;
	      else
		{
		  err = futex_abstimed_wait_cancelable
		    (cond->__data.__g_signals + g, 0, clockid, abstime,
		     private);
		}
	    }

	  __pthread_cleanup_pop (&buffer, 0);

	  if (__glibc_unlikely (err == ETIMEDOUT))
	    {
	      __condvar_dec_grefs (cond, g, private);
	      /* If we timed out, we effectively cancel waiting.  Note that
		 we have decremented __g_refs before cancellation, so that a
		 deadlock between waiting for quiescence of our group in
		 __condvar_quiesce_and_switch_g1 and us trying to acquire
		 the lock during cancellation is not possible.  */
	      __condvar_cancel_waiting (cond, seq, g, private);
	      result = ETIMEDOUT;
	      goto done;
	    }
	  else
	    __condvar_dec_grefs (cond, g, private);

	  /* Reload signals.  See above for MO.  */
	  signals = atomic_load_acquire (cond->__data.__g_signals + g);
	}

    }
  /* Try to grab a signal.  Use acquire MO so that we see an up-to-date value
     of __g1_start below (see spinning above for a similar case).  In
     particular, if we steal from a more recent group, we will also see a
     more recent __g1_start below.  */
  while (!atomic_compare_exchange_weak_acquire (cond->__data.__g_signals + g,
						&signals, signals - 2));

  /* We consumed a signal but we could have consumed from a more recent group
     that aliased with ours due to being in the same group slot.  If this
     might be the case, our group must be closed as visible through
     __g1_start.  */
  uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
  if (seq < (g1_start >> 1))
    {
      /* We potentially stole a signal from a more recent group but we do not
	 know which group we really consumed from.
	 We do not care about groups older than the current G1 because they
	 are closed; we could have stolen from these, but then we just add a
	 spurious wake-up for the current group.
	 We will never steal a signal from the current G2 that was really
	 intended for G2 because G2 never receives signals (until it becomes
	 G1).  We could have stolen a signal from G2 that was conservatively
	 added by a previous waiter that also thought it stole a signal -- but
	 given that that signal was added unnecessarily, it's not a problem if
	 we steal it.
	 Thus, the remaining case is that we could have stolen from the
	 current G1, where "current" means the __g1_start value we observed.
	 However, if the current G1 does not have the same slot index as we
	 do, we did not steal from it and do not need to undo that.  This is
	 the reason for putting a bit with G2's index into __g1_start as
	 well.  */
      if (((g1_start & 1) ^ 1) == g)
	{
	  /* We have to conservatively undo our potential mistake of stealing
	     a signal.  We can stop trying to do that when the current G1
	     changes because other spinning waiters will notice this too and
	     __condvar_quiesce_and_switch_g1 has checked that there are no
	     futex waiters anymore before switching G1.
	     Relaxed MO is fine for the __g1_start load because we need to
	     merely be able to observe this fact and not have to observe
	     something else as well.
	     ??? Would it help to spin for a little while to see whether the
	     current G1 gets closed?  This might be worthwhile if the group is
	     small or close to being closed.  */
	  unsigned int s = atomic_load_relaxed (cond->__data.__g_signals + g);
	  while (__condvar_load_g1_start_relaxed (cond) == g1_start)
	    {
	      /* Try to add a signal.  We don't need to acquire the lock
		 because at worst we can cause a spurious wake-up.  If the
		 group is in the process of being closed (LSB is true), this
		 has an effect similar to us adding a signal.  */
	      if (((s & 1) != 0)
		  || atomic_compare_exchange_weak_relaxed
		       (cond->__data.__g_signals + g, &s, s + 2))
		{
		  /* If we added a signal, we also need to add a wake-up on
		     the futex.  We also need to do that if we skipped adding
		     a signal because the group is being closed: while
		     __condvar_quiesce_and_switch_g1 could have closed the
		     group, it might still be waiting for futex waiters to
		     leave (and one of those waiters might be the one we stole
		     the signal from, which could cause it to block using the
		     futex).  */
		  futex_wake (cond->__data.__g_signals + g, 1, private);
		  break;
		}
	      /* TODO Back off.  */
	    }
	}
    }

 done:

  /* Confirm that we have been woken.  We do that before acquiring the mutex
     to allow for execution of pthread_cond_destroy while having acquired the
     mutex.  */
  __condvar_confirm_wakeup (cond, private);

  /* Woken up; now re-acquire the mutex.  If this doesn't fail, return RESULT,
     which is set to ETIMEDOUT if a timeout occurred, or zero otherwise.  */
  err = __pthread_mutex_cond_lock (mutex);
  /* XXX Abort on errors that are disallowed by POSIX?  */
  return (err != 0) ? err : result;
}
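
/* Illustrative usage sketch (caller side, not part of this file): because
   spurious wake-ups are allowed, as documented above, callers must always
   re-check their predicate in a loop.  Here m, cond, and predicate stand for
   the caller's own objects:

     pthread_mutex_lock (&m);
     while (!predicate)
       pthread_cond_wait (&cond, &m);
     // ... use the state protected by m ...
     pthread_mutex_unlock (&m);
*/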


/* See __pthread_cond_wait_common.  */
int
__pthread_cond_wait (pthread_cond_t *cond, pthread_mutex_t *mutex)
{
  /* clockid is unused when abstime is NULL.  */
  return __pthread_cond_wait_common (cond, mutex, 0, NULL);
}

/* See __pthread_cond_wait_common.  */
int
__pthread_cond_timedwait (pthread_cond_t *cond, pthread_mutex_t *mutex,
			  const struct timespec *abstime)
{
  /* Check parameter validity.  This should also tell the compiler that
     it can assume that abstime is not NULL.  */
  if (abstime->tv_nsec < 0 || abstime->tv_nsec >= 1000000000)
    return EINVAL;

  /* Relaxed MO is sufficient because the clock ID bit is only modified
     when the condvar is created.  */
  unsigned int flags = atomic_load_relaxed (&cond->__data.__wrefs);
  clockid_t clockid = (flags & __PTHREAD_COND_CLOCK_MONOTONIC_MASK)
    ? CLOCK_MONOTONIC : CLOCK_REALTIME;
  return __pthread_cond_wait_common (cond, mutex, clockid, abstime);
}

versioned_symbol (libpthread, __pthread_cond_wait, pthread_cond_wait,
		  GLIBC_2_3_2);
versioned_symbol (libpthread, __pthread_cond_timedwait, pthread_cond_timedwait,
		  GLIBC_2_3_2);
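
/* Illustrative usage sketch (caller side, not part of this file): the
   absolute timeout for pthread_cond_timedwait is measured against the clock
   selected with pthread_condattr_setclock (CLOCK_REALTIME by default), so a
   relative timeout is converted like this (m, cond, and predicate stand for
   the caller's own objects):

     struct timespec ts;
     clock_gettime (CLOCK_REALTIME, &ts);
     ts.tv_sec += 5;  // wait for at most five seconds
     pthread_mutex_lock (&m);
     while (!predicate)
       if (pthread_cond_timedwait (&cond, &m, &ts) == ETIMEDOUT)
	 break;
     pthread_mutex_unlock (&m);
*/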

/* See __pthread_cond_wait_common.  */
int
__pthread_cond_clockwait (pthread_cond_t *cond, pthread_mutex_t *mutex,
			  clockid_t clockid,
			  const struct timespec *abstime)
{
  /* Check parameter validity.  This should also tell the compiler that
     it can assume that abstime is not NULL.  */
  if (abstime->tv_nsec < 0 || abstime->tv_nsec >= 1000000000)
    return EINVAL;

  if (!futex_abstimed_supported_clockid (clockid))
    return EINVAL;

  /* If we do not support waiting using CLOCK_MONOTONIC, return an error.  */
  if (clockid == CLOCK_MONOTONIC
      && !futex_supports_exact_relative_timeouts ())
    return EINVAL;

  return __pthread_cond_wait_common (cond, mutex, clockid, abstime);
}
weak_alias (__pthread_cond_clockwait, pthread_cond_clockwait);