]>
Commit | Line | Data |
---|---|---|
7ce78e23 | 1 | /* Macros for atomic functionality for tile. |
f1717362 | 2 | Copyright (C) 2011-2016 Free Software Foundation, Inc. |
7ce78e23 | 3 | Contributed by Walter Lee (walt@tilera.com) |
4 | ||
5 | This file is free software; you can redistribute it and/or modify it | |
6 | under the terms of the GNU General Public License as published by the | |
7 | Free Software Foundation; either version 3, or (at your option) any | |
8 | later version. | |
9 | ||
10 | This file is distributed in the hope that it will be useful, but | |
11 | WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | General Public License for more details. | |
14 | ||
15 | Under Section 7 of GPL version 3, you are granted additional | |
16 | permissions described in the GCC Runtime Library Exception, version | |
17 | 3.1, as published by the Free Software Foundation. | |
18 | ||
19 | You should have received a copy of the GNU General Public License and | |
20 | a copy of the GCC Runtime Library Exception along with this program; | |
21 | see the files COPYING3 and COPYING.RUNTIME respectively. If not, see | |
22 | <http://www.gnu.org/licenses/>. */ | |
23 | ||
24 | ||
25 | /* Provides macros for common atomic functionality. */ | |
26 | ||
27 | #ifndef _ATOMIC_H_ | |
28 | #define _ATOMIC_H_ | |
29 | ||
30 | #ifdef __tilegx__ | |
31 | /* Atomic instruction macros | |
32 | ||
33 | The macros provided by atomic.h simplify access to the TILE-Gx | |
34 | architecture's atomic instructions. The architecture provides a | |
35 | variety of atomic instructions, including "exchange", "compare and | |
36 | exchange", "fetch and ADD", "fetch and AND", "fetch and OR", and | |
37 | "fetch and ADD if greater than or equal to zero". | |
38 | ||
39 | No barrier or fence semantics are implied by any of the atomic | |
40 | instructions for manipulating memory; you must specify the barriers | |
41 | that you wish explicitly, using the provided macros. | |
42 | ||
43 | Any integral 32- or 64-bit value can be used as the argument | |
44 | to these macros, such as "int", "long long", "unsigned long", etc. | |
45 | The pointers must be aligned to 4 or 8 bytes for 32- or 64-bit data. | |
46 | The "exchange" and "compare and exchange" macros may also take | |
47 | pointer values. We use the pseudo-type "VAL" in the documentation | |
48 | to indicate the use of an appropriate type. */ | |
49 | #else | |
50 | /* Atomic instruction macros | |
51 | ||
52 | The macros provided by atomic.h simplify access to the Tile | |
53 | architecture's atomic instructions. Since the architecture | |
54 | supports test-and-set as its only in-silicon atomic operation, many | |
55 | of the operations provided by this header are implemented as | |
56 | fast-path calls to Linux emulation routines. | |
57 | ||
58 | Using the kernel for atomic operations allows userspace to take | |
59 | advantage of the kernel's existing atomic-integer support (managed | |
60 | by a distributed array of locks). The kernel provides proper | |
61 | ordering among simultaneous atomic operations on different cores, | |
62 | and guarantees a process cannot be context-switched part way | |
63 | through an atomic operation. By virtue of sharing the kernel | |
64 | atomic implementation, the userspace atomic operations | |
65 | are compatible with the atomic methods provided by the kernel's | |
66 | futex() syscall API. Note that these operations never cause Linux | |
67 | kernel scheduling, and are in fact invisible to the kernel; they | |
68 | simply act as regular function calls but with an elevated privilege | |
69 | level. Note that the kernel's distributed lock array is hashed by | |
70 | using only VA bits from the atomic value's address (to avoid the | |
71 | performance hit of page table locking and multiple page-table | |
72 | lookups to get the PA) and only the VA bits that are below page | |
73 | granularity (to properly lock simultaneous accesses to the same | |
74 | page mapped at different VAs). As a result, simultaneous atomic | |
75 | operations on values whose addresses are at the same offset on a | |
76 | page will contend in the kernel for the same lock array element. | |
77 | ||
78 | No barrier or fence semantics are implied by any of the atomic | |
79 | instructions for manipulating memory; you must specify the barriers | |
80 | that you wish explicitly, using the provided macros. | |
81 | ||
82 | Any integral 32- or 64-bit value can be used as the argument | |
83 | to these macros, such as "int", "long long", "unsigned long", etc. | |
84 | The pointers must be aligned to 4 or 8 bytes for 32- or 64-bit data. | |
85 | The "exchange" and "compare and exchange" macros may also take | |
86 | pointer values. We use the pseudo-type "VAL" in the documentation | |
87 | to indicate the use of an appropriate type. | |
88 | ||
89 | The 32-bit routines are implemented using a single kernel fast | |
90 | syscall, as is the 64-bit compare-and-exchange. The other 64-bit | |
91 | routines are implemented by looping over the 64-bit | |
92 | compare-and-exchange routine, so may be potentially less efficient. */ | |
93 | #endif | |
94 | ||
7ce78e23 | 95 | #ifdef __tilegx__ |
96 | #include <arch/spr_def.h> | |
97 | #else | |
98 | #include <asm/unistd.h> | |
99 | #endif | |
100 | ||
101 | ||
102 | /* 32-bit integer compare-and-exchange. */ | |
103 | static __inline __attribute__ ((always_inline)) | |
3d14844b | 104 | int arch_atomic_val_compare_and_exchange_4 (volatile int *mem, |
105 | int oldval, int newval) | |
7ce78e23 | 106 | { |
107 | #ifdef __tilegx__ | |
108 | __insn_mtspr (SPR_CMPEXCH_VALUE, oldval); | |
109 | return __insn_cmpexch4 (mem, newval); | |
110 | #else | |
111 | int result; | |
112 | __asm__ __volatile__ ("swint1":"=R00" (result), | |
113 | "=m" (*mem):"R10" (__NR_FAST_cmpxchg), "R00" (mem), | |
114 | "R01" (oldval), "R02" (newval), "m" (*mem):"r20", | |
115 | "r21", "r22", "r23", "r24", "r25", "r26", "r27", | |
116 | "r28", "r29", "memory"); | |
117 | return result; | |
118 | #endif | |
119 | } | |
120 | ||
121 | /* 64-bit integer compare-and-exchange. */ | |
122 | static __inline __attribute__ ((always_inline)) | |
1c229e23 | 123 | long long arch_atomic_val_compare_and_exchange_8 (volatile long long |
124 | *mem, long long oldval, | |
125 | long long newval) | |
7ce78e23 | 126 | { |
127 | #ifdef __tilegx__ | |
128 | __insn_mtspr (SPR_CMPEXCH_VALUE, oldval); | |
129 | return __insn_cmpexch (mem, newval); | |
130 | #else | |
131 | unsigned int result_lo, result_hi; | |
132 | unsigned int oldval_lo = oldval & 0xffffffffu, oldval_hi = oldval >> 32; | |
133 | unsigned int newval_lo = newval & 0xffffffffu, newval_hi = newval >> 32; | |
134 | __asm__ __volatile__ ("swint1":"=R00" (result_lo), "=R01" (result_hi), | |
135 | "=m" (*mem):"R10" (__NR_FAST_cmpxchg64), "R00" (mem), | |
136 | "R02" (oldval_lo), "R03" (oldval_hi), | |
137 | "R04" (newval_lo), "R05" (newval_hi), | |
138 | "m" (*mem):"r20", "r21", "r22", "r23", "r24", "r25", | |
139 | "r26", "r27", "r28", "r29", "memory"); | |
1c229e23 | 140 | return ((long long) result_hi) << 32 | result_lo; |
7ce78e23 | 141 | #endif |
142 | } | |
143 | ||
/* This symbol is deliberately never defined; it is referenced for
   sizeof values other than 4 and 8, so a bad caller gets a
   compile-time warning (and a link failure if the call survives).  */
extern int __arch_atomic_error_bad_argument_size (void)
  __attribute__ ((warning ("sizeof atomic argument not 4 or 8")));
148 | ||
149 | ||
/* Atomic compare-and-exchange, returning the previous value of *MEM.
   Dispatches on sizeof(*(mem)) to the 4- or 8-byte primitive; any
   other size triggers the bad-argument-size diagnostic.  */
#define arch_atomic_val_compare_and_exchange(mem, o, n)                 \
  __extension__ ({                                                      \
    (__typeof(*(mem)))(__typeof(*(mem)-*(mem)))                         \
      ((sizeof(*(mem)) == 4) ?                                          \
       arch_atomic_val_compare_and_exchange_4(                          \
         (volatile int*)(mem), (__typeof((o)-(o)))(o),                  \
         (__typeof((n)-(n)))(n)) :                                      \
       (sizeof(*(mem)) == 8) ?                                          \
       arch_atomic_val_compare_and_exchange_8(                          \
         (volatile long long*)(mem), (__typeof((o)-(o)))(o),            \
         (__typeof((n)-(n)))(n)) :                                      \
       __arch_atomic_error_bad_argument_size());                        \
  })
163 | ||
/* Atomic compare-and-exchange, returning nonzero iff the exchange
   took place (i.e. *MEM held O beforehand).  Success is marked as the
   expected outcome for branch prediction.  */
#define arch_atomic_bool_compare_and_exchange(mem, o, n)                \
  __extension__ ({                                                      \
    __typeof(o) __bce_old = (o);                                        \
    __builtin_expect(                                                   \
      __bce_old ==                                                      \
        arch_atomic_val_compare_and_exchange((mem), __bce_old, (n)),    \
      1);                                                               \
  })
170 | ||
171 | ||
/* Loop with compare_and_exchange until we guess the correct value.
   Normally "expr" will be an expression using __old and __value
   (those two names are part of this macro's contract and must not be
   renamed).  Evaluates to the value *MEM held just before the
   successful update.  */
#define __arch_atomic_update_cmpxchg(mem, value, expr)                  \
  __extension__ ({                                                      \
    __typeof(value) __value = (value);                                  \
    __typeof(*(mem)) *__ptr = (mem), __old = *__ptr, __seen;            \
    do {                                                                \
      __seen = __old;                                                   \
      __old = arch_atomic_val_compare_and_exchange(__ptr, __old, (expr)); \
    } while (__builtin_expect(__old != __seen, 0));                     \
    __old;                                                              \
  })
184 | ||
185 | #ifdef __tilegx__ | |
186 | ||
187 | /* Generic atomic op with 8- or 4-byte variant. | |
188 | The _mask, _addend, and _expr arguments are ignored on tilegx. */ | |
3d14844b | 189 | #define __arch_atomic_update(mem, value, op, _mask, _addend, _expr) \ |
1c229e23 | 190 | __extension__ ({ \ |
7ce78e23 | 191 | ((__typeof(*(mem))) \ |
192 | ((sizeof(*(mem)) == 8) ? (__typeof(*(mem)-*(mem)))__insn_##op( \ | |
1c229e23 | 193 | (volatile void *)(mem), \ |
194 | (long long)(__typeof((value)-(value)))(value)) : \ | |
7ce78e23 | 195 | (sizeof(*(mem)) == 4) ? (int)__insn_##op##4( \ |
1c229e23 | 196 | (volatile void *)(mem), \ |
197 | (int)(__typeof((value)-(value)))(value)) : \ | |
3d14844b | 198 | __arch_atomic_error_bad_argument_size())); \ |
7ce78e23 | 199 | }) |
200 | ||
201 | #else | |
202 | ||
203 | /* This uses TILEPro's fast syscall support to atomically compute: | |
204 | ||
205 | int old = *ptr; | |
206 | *ptr = (old & mask) + addend; | |
207 | return old; | |
208 | ||
209 | This primitive can be used for atomic exchange, add, or, and. | |
210 | Only 32-bit support is provided. */ | |
211 | static __inline __attribute__ ((always_inline)) | |
212 | int | |
3d14844b | 213 | __arch_atomic_update_4 (volatile int *mem, int mask, int addend) |
7ce78e23 | 214 | { |
215 | int result; | |
216 | __asm__ __volatile__ ("swint1":"=R00" (result), | |
217 | "=m" (*mem):"R10" (__NR_FAST_atomic_update), | |
218 | "R00" (mem), "R01" (mask), "R02" (addend), | |
219 | "m" (*mem):"r20", "r21", "r22", "r23", "r24", "r25", | |
220 | "r26", "r27", "r28", "r29", "memory"); | |
221 | return result; | |
222 | } | |
223 | ||
224 | /* Generic atomic op with 8- or 4-byte variant. | |
225 | The _op argument is ignored on tilepro. */ | |
3d14844b | 226 | #define __arch_atomic_update(mem, value, _op, mask, addend, expr) \ |
1c229e23 | 227 | __extension__ ({ \ |
7ce78e23 | 228 | (__typeof(*(mem)))(__typeof(*(mem)-*(mem))) \ |
229 | ((sizeof(*(mem)) == 8) ? \ | |
3d14844b | 230 | __arch_atomic_update_cmpxchg((mem), (value), (expr)) : \ |
7ce78e23 | 231 | (sizeof(*(mem)) == 4) ? \ |
3d14844b | 232 | __arch_atomic_update_4((volatile int*)(mem), \ |
233 | (__typeof((mask)-(mask)))(mask), \ | |
234 | (__typeof((addend)-(addend)))(addend)) : \ | |
235 | __arch_atomic_error_bad_argument_size()); \ | |
7ce78e23 | 236 | }) |
237 | ||
238 | #endif /* __tilegx__ */ | |
239 | ||
240 | ||
/* Atomically store NEWVALUE to *MEM; evaluates to the previous value.  */
#define arch_atomic_exchange(mem, newvalue) \
  __arch_atomic_update(mem, newvalue, exch, 0, newvalue, __value)

/* Atomically add VALUE to *MEM; evaluates to the previous value.  */
#define arch_atomic_add(mem, value) \
  __arch_atomic_update(mem, value, fetchadd, -1, value, __old + __value)

/* Atomically subtract VALUE from *MEM; evaluates to the previous value.  */
#define arch_atomic_sub(mem, value) arch_atomic_add((mem), -(value))

/* Atomically add one to *MEM; evaluates to the previous value.  */
#define arch_atomic_increment(mem) arch_atomic_add((mem), 1)

/* Atomically subtract one from *MEM; evaluates to the previous value.  */
#define arch_atomic_decrement(mem) arch_atomic_add((mem), -1)
7ce78e23 | 252 | |
/* Atomically AND MASK into *MEM; evaluates to the previous value.  */
#define arch_atomic_and(mem, mask) \
  __arch_atomic_update(mem, mask, fetchand, mask, 0, __old & __value)

/* Atomically OR MASK into *MEM; evaluates to the previous value.  */
#define arch_atomic_or(mem, mask) \
  __arch_atomic_update(mem, mask, fetchor, ~mask, mask, __old | __value)

/* Atomically XOR MASK into *MEM; evaluates to the previous value.
   No direct hardware primitive exists, so this always uses the
   compare-and-exchange loop.  */
#define arch_atomic_xor(mem, mask) \
  __arch_atomic_update_cmpxchg(mem, mask, __old ^ __value)

/* Atomically NAND MASK into *MEM; evaluates to the previous value.  */
#define arch_atomic_nand(mem, mask) \
  __arch_atomic_update_cmpxchg(mem, mask, ~(__old & __value))
264 | ||
/* Atomically set bit BIT of *MEM; evaluates to nonzero iff the bit
   was already set beforehand.  */
#define arch_atomic_bit_set(mem, bit)                                   \
  __extension__ ({                                                      \
    __typeof(*(mem)) __bs_mask = (__typeof(*(mem)))1 << (bit);          \
    __bs_mask & arch_atomic_or((mem), __bs_mask);                       \
  })

/* Atomically clear bit BIT of *MEM; evaluates to nonzero iff the bit
   was set beforehand.  */
#define arch_atomic_bit_clear(mem, bit)                                 \
  __extension__ ({                                                      \
    __typeof(*(mem)) __bc_mask = (__typeof(*(mem)))1 << (bit);          \
    __bc_mask & arch_atomic_and((mem), ~__bc_mask);                     \
  })
276 | ||
#ifdef __tilegx__
/* Atomically store a new value to memory.
   Note that you can freely use types of any size here, unlike the
   other atomic routines, which require 32- or 64-bit types.
   This accessor is provided for compatibility with TILEPro, which
   required an explicit atomic operation for stores that needed to be
   atomic with respect to other atomic methods in this header; on
   tilegx a plain store suffices.  */
#define arch_atomic_write(mem, value) ((void) (*(mem) = (value)))
#else
/* Atomically store a new value to memory, for 1-, 2-, 4-, or 8-byte
   types.  4- and 8-byte stores go through the kernel-assisted
   primitives; 1- and 2-byte stores are emulated by a cmpxchg loop
   that rewrites only the addressed lane of the enclosing naturally
   aligned 32-bit word.  */
#define arch_atomic_write(mem, value)                                   \
  do {                                                                  \
    __typeof(mem) __aw_mem = (mem);                                     \
    __typeof(value) __aw_val = (value);                                 \
    unsigned int *__aw_mem32, __aw_intval, __aw_val32, __aw_off, __aw_mask; \
    __aw_intval = (__typeof((value) - (value)))__aw_val;                \
    switch (sizeof(*__aw_mem)) {                                        \
    case 8:                                                             \
      __arch_atomic_update_cmpxchg(__aw_mem, __aw_val, __value);        \
      break;                                                            \
    case 4:                                                             \
      /* mask 0 + addend == unconditional store of the new value.  */   \
      __arch_atomic_update_4((int *)__aw_mem, 0, __aw_intval);          \
      break;                                                            \
    case 2:                                                             \
      /* Position the halfword within its aligned 32-bit container.  */ \
      __aw_off = 8 * ((long)__aw_mem & 0x2);                            \
      __aw_mask = 0xffffU << __aw_off;                                  \
      __aw_mem32 = (unsigned int *)((long)__aw_mem & ~0x2);             \
      __aw_val32 = (__aw_intval << __aw_off) & __aw_mask;               \
      __arch_atomic_update_cmpxchg(__aw_mem32, __aw_val32,              \
                                   (__old & ~__aw_mask) | __value);     \
      break;                                                            \
    case 1:                                                             \
      /* Same as the 2-byte case, for a single byte lane.  */           \
      __aw_off = 8 * ((long)__aw_mem & 0x3);                            \
      __aw_mask = 0xffU << __aw_off;                                    \
      __aw_mem32 = (unsigned int *)((long)__aw_mem & ~0x3);             \
      __aw_val32 = (__aw_intval << __aw_off) & __aw_mask;               \
      __arch_atomic_update_cmpxchg(__aw_mem32, __aw_val32,              \
                                   (__old & ~__aw_mask) | __value);     \
      break;                                                            \
    }                                                                   \
  } while (0)
#endif
318 | ||
/* Compiler barrier.

   Prevents the compiler from moving loads or stores across this point
   in either direction; any value loaded before the barrier must be
   reloaded afterward.  Emits no machine instructions.  */
#define arch_atomic_compiler_barrier() __asm__ __volatile__("" ::: "memory")
7ce78e23 | 325 | |
/* Full memory barrier.

   This macro has the semantics of arch_atomic_compiler_barrier(), but
   also ensures that previous stores are visible to other cores, and
   that all previously loaded values have been placed into their
   target registers on this core.  */
#define arch_atomic_full_barrier() __insn_mf()
7ce78e23 | 333 | |
/* Read memory barrier.

   Ensures all reads issued by this processor before the barrier have
   completed, and that no read after the barrier is initiated before
   it.

   On current TILE chips this is implemented as a full barrier; later
   versions of the architecture may use something lighter.

   See arch_atomic_acquire_barrier() for the appropriate idiom to use
   to ensure no reads are lifted above an atomic lock instruction.  */
#define arch_atomic_read_barrier() arch_atomic_full_barrier()
7ce78e23 | 347 | |
/* Write memory barrier.

   Ensures all writes issued by this processor before the barrier have
   completed, and that no write after the barrier is initiated before
   it.

   On current TILE chips this is implemented as a full barrier; later
   versions of the architecture may use something lighter.

   See arch_atomic_release_barrier() for the appropriate idiom to use
   to ensure all writes are complete prior to an atomic unlock
   instruction.  */
#define arch_atomic_write_barrier() arch_atomic_full_barrier()
7ce78e23 | 361 | |
/* Lock acquisition barrier.

   Prevents load operations that follow this macro from issuing before
   it.  Without such a barrier the compiler may reorder them earlier,
   and hardware could issue them speculatively (current Tile
   microarchitectures do not, but using this operation improves
   portability to future implementations).

   Intended for the "acquire" path of locking, i.e. when entering a
   critical section: place it after the atomic operation that actually
   acquires the lock, in conjunction with a "control dependency" that
   tests the atomic result to see whether the lock was in fact taken.
   See arch_atomic_read_barrier() for a heavier-weight barrier for
   certain unusual constructs, or arch_atomic_acquire_barrier_value()
   when no control dependency exists.  */
#define arch_atomic_acquire_barrier() arch_atomic_compiler_barrier()
7ce78e23 | 380 | |
/* Lock release barrier.

   Ensures that no store preceding this macro in the program completes
   after the barrier.  Without it, the compiler can delay stores, or
   the stores can still be outstanding in the memory network.

   Intended for the "release" path of locking, i.e. when leaving a
   critical section: place it before the operation (such as a store of
   zero) that actually releases the lock.  */
#define arch_atomic_release_barrier() arch_atomic_write_barrier()
7ce78e23 | 393 | |
/* Barrier until the read of a particular value is complete.

   Occasionally useful in lock constructions: a routine might issue an
   atomic instruction to enter a critical section, read one or more
   values inside it without first checking whether the section was in
   fact acquired, and only later inspect the atomic result.  If the
   lock turns out to have been taken, the values read were valid and
   the routine can release the lock normally.

   For that to be sound, the result of the atomic instruction must be
   waited for even though its value is not yet examined; this
   guarantees that if the lock was successfully taken, it was held
   before any read in the critical section issued.  The no-op "move"
   below forces VAL into a register, creating that dependency.  */
#define arch_atomic_acquire_barrier_value(val) \
  __asm__ __volatile__("move %0, %0" :: "r"(val))
411 | ||
/* Access the given variable in memory exactly once.

   Some algorithms must force a real memory access that the compiler
   would otherwise optimize away — for example, a loop polling a
   location another cpu may update.  This is only needed for carefully
   hand-tuned code; gratuitous use costs performance.

   A second use is to keep the compiler from rematerializing "x" by
   reloading it from memory unexpectedly: the volatile qualification
   forbids rematerialization, which helps when a value read without
   locking must stay consistent across several uses in the algorithm.

   Multiple uses of this macro are guaranteed to be ordered with
   respect to each other: the compiler will not reorder loads or
   stores wrapped in arch_atomic_access_once().  */
#define arch_atomic_access_once(x) (*(volatile __typeof(x) *)&(x))
432 | ||
7ce78e23 | 433 | |
434 | ||
435 | #endif /* !_ATOMIC_H_ */ |