]>
Commit | Line | Data |
---|---|---|
7b8cad35 GKH |
1 | From 59fa6245192159ab5e1e17b8e31f15afa9cff4bf Mon Sep 17 00:00:00 2001 |
2 | From: Thomas Gleixner <tglx@linutronix.de> | |
3 | Date: Tue, 23 Oct 2012 22:29:38 +0200 | |
4 | Subject: futex: Handle futex_pi OWNER_DIED take over correctly | |
5 | ||
6 | From: Thomas Gleixner <tglx@linutronix.de> | |
7 | ||
8 | commit 59fa6245192159ab5e1e17b8e31f15afa9cff4bf upstream. | |
9 | ||
10 | Siddhesh analyzed a failure in the take over of pi futexes in case the | |
11 | owner died and provided a workaround. | |
12 | See: http://sourceware.org/bugzilla/show_bug.cgi?id=14076 | |
13 | ||
14 | The detailed problem analysis shows: | |
15 | ||
16 | Futex F is initialized with PTHREAD_PRIO_INHERIT and | |
17 | PTHREAD_MUTEX_ROBUST_NP attributes. | |
18 | ||
19 | T1 lock_futex_pi(F); | |
20 | ||
21 | T2 lock_futex_pi(F); | |
22 | --> T2 blocks on the futex and creates pi_state which is associated | |
23 | to T1. | |
24 | ||
25 | T1 exits | |
26 | --> exit_robust_list() runs | |
27 | --> Futex F userspace value TID field is set to 0 and | |
28 | FUTEX_OWNER_DIED bit is set. | |
29 | ||
30 | T3 lock_futex_pi(F); | |
31 | --> Succeeds due to the check for F's userspace TID field == 0 | |
32 | --> Claims ownership of the futex and sets its own TID into the | |
33 | userspace TID field of futex F | |
34 | --> returns to user space | |
35 | ||
36 | T1 --> exit_pi_state_list() | |
37 | --> Transfers pi_state to waiter T2 and wakes T2 via | |
38 | rt_mutex_unlock(&pi_state->mutex) | |
39 | ||
40 | T2 --> acquires pi_state->mutex and gains real ownership of the | |
41 | pi_state | |
42 | --> Claims ownership of the futex and sets its own TID into the | |
43 | userspace TID field of futex F | |
44 | --> returns to user space | |
45 | ||
46 | T3 --> observes inconsistent state | |
47 | ||
48 | This problem is independent of UP/SMP, preemptible/non preemptible | |
49 | kernels, or process shared vs. private. The only difference is that | |
50 | certain configurations are more likely to expose it. | |
51 | ||
52 | So as Siddhesh correctly analyzed the following check in | |
53 | futex_lock_pi_atomic() is the culprit: | |
54 | ||
55 | if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { | |
56 | ||
57 | We check the userspace value for a TID value of 0 and take over the | |
58 | futex unconditionally if that's true. | |
59 | ||
60 | AFAICT this check is there as it is correct for a different corner | |
61 | case of futexes: the WAITERS bit became stale. | |
62 | ||
63 | Now the proposed change | |
64 | ||
65 | - if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { | |
66 | + if (unlikely(ownerdied || | |
67 | + !(curval & (FUTEX_TID_MASK | FUTEX_WAITERS)))) { | |
68 | ||
69 | solves the problem, but it's not obvious why and it wreckages the | |
70 | "stale WAITERS bit" case. | |
71 | ||
72 | What happens is, that due to the WAITERS bit being set (T2 is blocked | |
73 | on that futex) it enforces T3 to go through lookup_pi_state(), which | |
74 | in the above case returns an existing pi_state and therefor forces T3 | |
75 | to legitimately fight with T2 over the ownership of the pi_state (via | |
76 | pi_state->mutex). Probelm solved! | |
77 | ||
78 | Though that does not work for the "WAITERS bit is stale" problem | |
79 | because if lookup_pi_state() does not find existing pi_state it | |
80 | returns -ERSCH (due to TID == 0) which causes futex_lock_pi() to | |
81 | return -ESRCH to user space because the OWNER_DIED bit is not set. | |
82 | ||
83 | Now there is a different solution to that problem. Do not look at the | |
84 | user space value at all and enforce a lookup of possibly available | |
85 | pi_state. If pi_state can be found, then the new incoming locker T3 | |
86 | blocks on that pi_state and legitimately races with T2 to acquire the | |
87 | rt_mutex and the pi_state and therefor the proper ownership of the | |
88 | user space futex. | |
89 | ||
90 | lookup_pi_state() has the correct order of checks. It first tries to | |
91 | find a pi_state associated with the user space futex and only if that | |
92 | fails it checks for futex TID value = 0. If no pi_state is available | |
93 | nothing can create new state at that point because this happens with | |
94 | the hash bucket lock held. | |
95 | ||
96 | So the above scenario changes to: | |
97 | ||
98 | T1 lock_futex_pi(F); | |
99 | ||
100 | T2 lock_futex_pi(F); | |
101 | --> T2 blocks on the futex and creates pi_state which is associated | |
102 | to T1. | |
103 | ||
104 | T1 exits | |
105 | --> exit_robust_list() runs | |
106 | --> Futex F userspace value TID field is set to 0 and | |
107 | FUTEX_OWNER_DIED bit is set. | |
108 | ||
109 | T3 lock_futex_pi(F); | |
110 | --> Finds pi_state and blocks on pi_state->rt_mutex | |
111 | ||
112 | T1 --> exit_pi_state_list() | |
113 | --> Transfers pi_state to waiter T2 and wakes it via | |
114 | rt_mutex_unlock(&pi_state->mutex) | |
115 | ||
116 | T2 --> acquires pi_state->mutex and gains ownership of the pi_state | |
117 | --> Claims ownership of the futex and sets its own TID into the | |
118 | userspace TID field of futex F | |
119 | --> returns to user space | |
120 | ||
121 | This covers all gazillion points on which T3 might come in between | |
122 | T1's exit_robust_list() clearing the TID field and T2 fixing it up. It | |
123 | also solves the "WAITERS bit stale" problem by forcing the take over. | |
124 | ||
125 | Another benefit of changing the code this way is that it makes it less | |
126 | dependent on untrusted user space values and therefor minimizes the | |
127 | possible wreckage which might be inflicted. | |
128 | ||
129 | As usual after staring for too long at the futex code my brain hurts | |
130 | so much that I really want to ditch that whole optimization of | |
131 | avoiding the syscall for the non contended case for PI futexes and rip | |
132 | out the maze of corner case handling code. Unfortunately we can't as | |
133 | user space relies on that existing behaviour, but at least thinking | |
134 | about it helps me to preserve my mental sanity. Maybe we should | |
135 | nevertheless :) | |
136 | ||
137 | Reported-and-tested-by: Siddhesh Poyarekar <siddhesh.poyarekar@gmail.com> | |
138 | Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1210232138540.2756@ionos | |
139 | Acked-by: Darren Hart <dvhart@linux.intel.com> | |
140 | Signed-off-by: Thomas Gleixner <tglx@linutronix.de> | |
141 | Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> | |
142 | ||
143 | --- | |
144 | kernel/futex.c | 41 ++++++++++++++++++++++------------------- | |
145 | 1 file changed, 22 insertions(+), 19 deletions(-) | |
146 | ||
147 | --- a/kernel/futex.c | |
148 | +++ b/kernel/futex.c | |
149 | @@ -716,7 +716,7 @@ static int futex_lock_pi_atomic(u32 __us | |
150 | struct futex_pi_state **ps, | |
151 | struct task_struct *task, int set_waiters) | |
152 | { | |
153 | - int lock_taken, ret, ownerdied = 0; | |
154 | + int lock_taken, ret, force_take = 0; | |
155 | u32 uval, newval, curval, vpid = task_pid_vnr(task); | |
156 | ||
157 | retry: | |
158 | @@ -755,17 +755,15 @@ retry: | |
159 | newval = curval | FUTEX_WAITERS; | |
160 | ||
161 | /* | |
162 | - * There are two cases, where a futex might have no owner (the | |
163 | - * owner TID is 0): OWNER_DIED. We take over the futex in this | |
164 | - * case. We also do an unconditional take over, when the owner | |
165 | - * of the futex died. | |
166 | - * | |
167 | - * This is safe as we are protected by the hash bucket lock ! | |
168 | + * Should we force take the futex? See below. | |
169 | */ | |
170 | - if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { | |
171 | - /* Keep the OWNER_DIED bit */ | |
172 | + if (unlikely(force_take)) { | |
173 | + /* | |
174 | + * Keep the OWNER_DIED and the WAITERS bit and set the | |
175 | + * new TID value. | |
176 | + */ | |
177 | newval = (curval & ~FUTEX_TID_MASK) | vpid; | |
178 | - ownerdied = 0; | |
179 | + force_take = 0; | |
180 | lock_taken = 1; | |
181 | } | |
182 | ||
183 | @@ -775,7 +773,7 @@ retry: | |
184 | goto retry; | |
185 | ||
186 | /* | |
187 | - * We took the lock due to owner died take over. | |
188 | + * We took the lock due to forced take over. | |
189 | */ | |
190 | if (unlikely(lock_taken)) | |
191 | return 1; | |
192 | @@ -790,20 +788,25 @@ retry: | |
193 | switch (ret) { | |
194 | case -ESRCH: | |
195 | /* | |
196 | - * No owner found for this futex. Check if the | |
197 | - * OWNER_DIED bit is set to figure out whether | |
198 | - * this is a robust futex or not. | |
199 | + * We failed to find an owner for this | |
200 | + * futex. So we have no pi_state to block | |
201 | + * on. This can happen in two cases: | |
202 | + * | |
203 | + * 1) The owner died | |
204 | + * 2) A stale FUTEX_WAITERS bit | |
205 | + * | |
206 | + * Re-read the futex value. | |
207 | */ | |
208 | if (get_futex_value_locked(&curval, uaddr)) | |
209 | return -EFAULT; | |
210 | ||
211 | /* | |
212 | - * We simply start over in case of a robust | |
213 | - * futex. The code above will take the futex | |
214 | - * and return happy. | |
215 | + * If the owner died or we have a stale | |
216 | + * WAITERS bit the owner TID in the user space | |
217 | + * futex is 0. | |
218 | */ | |
219 | - if (curval & FUTEX_OWNER_DIED) { | |
220 | - ownerdied = 1; | |
221 | + if (!(curval & FUTEX_TID_MASK)) { | |
222 | + force_take = 1; | |
223 | goto retry; | |
224 | } | |
225 | default: |