/* Copyright (C) 2009, 2011 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU Transactional Memory Library (libitm).

   Libitm is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   Libitm is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#ifndef LIBITM_X86_UNALIGNED_H
#define LIBITM_X86_UNALIGNED_H 1

#define HAVE_ARCH_UNALIGNED_LOAD2_U4 1
#define HAVE_ARCH_UNALIGNED_LOAD2_U8 1

#include "config/generic/unaligned.h"

namespace GTM HIDDEN {

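// unaligned_load2 reassembles a value that straddles a cacheline boundary:
// the value begins in the final bytes of C1 and ends in the first bytes of
// C2, with OFS giving its byte offset within C1.  For a 32-bit value only
// the last word of C1 and the first word of C2 are involved, so a single
// SHRD shifts the 64-bit pair right by (ofs & 3) * 8 bits to extract it.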
template<>
inline uint32_t
unaligned_load2<uint32_t>(const gtm_cacheline *c1,
                          const gtm_cacheline *c2, size_t ofs)
{
  uint32_t r, lo, hi;
  lo = c1->u32[CACHELINE_SIZE / sizeof(uint32_t) - 1];
  hi = c2->u32[0];
  asm("shrd %b2, %1, %0" : "=r"(r) : "r"(hi), "c"((ofs & 3) * 8), "0"(lo));
  return r;
}

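// The 64-bit variant of the same idea.  On x86-64 a single SHRD on the two
// adjoining 64-bit words suffices.  On 32-bit x86 the result is assembled
// from three 32-bit words with a pair of SHRDs and returned in edx:eax via
// the "A" constraint; which three words are needed depends on where within
// C1 the value begins.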
template<>
inline uint64_t
unaligned_load2<uint64_t>(const gtm_cacheline *c1,
                          const gtm_cacheline *c2, size_t ofs)
{
#ifdef __x86_64__
  uint64_t r, lo, hi;
  lo = c1->u64[CACHELINE_SIZE / sizeof(uint64_t) - 1];
  hi = c2->u64[0];
  asm("shrd %b2, %1, %0" : "=r"(r) : "r"(hi), "c"((ofs & 7) * 8), "0"(lo));
  return r;
#else
  uint32_t v0, v1, v2;
  uint64_t r;

  if (ofs < CACHELINE_SIZE - 4)
    {
      v0 = c1->u32[CACHELINE_SIZE / sizeof(uint32_t) - 2];
      v1 = c1->u32[CACHELINE_SIZE / sizeof(uint32_t) - 1];
      v2 = c2->u32[0];
    }
  else
    {
      v0 = c1->u32[CACHELINE_SIZE / sizeof(uint32_t) - 1];
      v1 = c2->u32[0];
      v2 = c2->u32[1];
    }
  ofs = (ofs & 3) * 8;
  asm("shrd %%cl, %[v1], %[v0]; shrd %%cl, %[v2], %[v1]"
      : "=A"(r) : "c"(ofs), [v0] "a"(v0), [v1] "d"(v1), [v2] "r"(v2));

  return r;
#endif
}

#if defined(__SSE2__) || defined(__MMX__)
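// Straddling load of the 64-bit vector type.  Combine the last eight bytes
// of C1 with the first eight bytes of C2 using 64-bit shifts in the vector
// unit: SSE registers on x86-64, or the MMX unit on 32-bit (see below).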
template<>
inline _ITM_TYPE_M64
unaligned_load2<_ITM_TYPE_M64>(const gtm_cacheline *c1,
                               const gtm_cacheline *c2, size_t ofs)
{
# ifdef __x86_64__
  __m128i lo = _mm_movpi64_epi64 (c1->m64[CACHELINE_SIZE / 8 - 1]);
  __m128i hi = _mm_movpi64_epi64 (c2->m64[0]);

  ofs = (ofs & 7) * 8;
  lo = _mm_srli_epi64 (lo, ofs);
  hi = _mm_slli_epi64 (hi, 64 - ofs);
  lo = lo | hi;
  return _mm_movepi64_pi64 (lo);
# else
  // On 32-bit we're about to return the result in an MMX register, so go
  // ahead and do the computation in that unit, even if SSE2 is available.
  __m64 lo = c1->m64[CACHELINE_SIZE / 8 - 1];
  __m64 hi = c2->m64[0];

  ofs = (ofs & 7) * 8;
  lo = _mm_srli_si64 (lo, ofs);
  hi = _mm_slli_si64 (hi, 64 - ofs);
  return lo | hi;
# endif
}
#endif // SSE2 or MMX

// The SSE types are strictly aligned.
#ifdef __SSE__
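// Marking _ITM_TYPE_M128 as strictly aligned tells the generic code not to
// access it through plain (aligned) vector loads and stores; the
// specializations below expand to the unaligned movups forms instead.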
template<>
  struct strict_alignment<_ITM_TYPE_M128>
    : public std::true_type
  { };

// Expand the unaligned SSE move instructions.
template<>
inline _ITM_TYPE_M128
unaligned_load<_ITM_TYPE_M128>(const void *t)
{
  return _mm_loadu_ps (static_cast<const float *>(t));
}

template<>
inline void
unaligned_store<_ITM_TYPE_M128>(void *t, _ITM_TYPE_M128 val)
{
  _mm_storeu_ps (static_cast<float *>(t), val);
}
#endif // SSE

#ifdef __AVX__
// The AVX types are strictly aligned when it comes to vmovaps vs vmovups.
template<>
  struct strict_alignment<_ITM_TYPE_M256>
    : public std::true_type
  { };

template<>
inline _ITM_TYPE_M256
unaligned_load<_ITM_TYPE_M256>(const void *t)
{
  return _mm256_loadu_ps (static_cast<const float *>(t));
}

template<>
inline void
unaligned_store<_ITM_TYPE_M256>(void *t, _ITM_TYPE_M256 val)
{
  _mm256_storeu_ps (static_cast<float *>(t), val);
}
#endif // AVX

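// realign_m128i extracts the 16 bytes starting at BYTE_COUNT from the
// 32-byte concatenation LO:HI.  With XOP this is a single VPPERM through a
// table of selector vectors.  The AVX, SSSE3, and SSE2 variants instead
// indirect-call into a table indexed by BYTE_COUNT, since the corresponding
// shift instructions (VPALIGNR, PALIGNR, PSRLDQ/PSLLDQ) only take immediate
// byte counts; each table entry is a small code stub that performs the
// shift for that count and returns.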
#ifdef __XOP__
# define HAVE_ARCH_REALIGN_M128I 1
extern const __v16qi GTM_vpperm_shift[16];
inline __m128i
realign_m128i (__m128i lo, __m128i hi, unsigned byte_count)
{
  return _mm_perm_epi8 (lo, hi, GTM_vpperm_shift[byte_count]);
}
#elif defined(__AVX__)
# define HAVE_ARCH_REALIGN_M128I 1
extern "C" const uint64_t GTM_vpalignr_table[16];
inline __m128i
realign_m128i (__m128i lo, __m128i hi, unsigned byte_count)
{
  register __m128i xmm0 __asm__("xmm0") = hi;
  register __m128i xmm1 __asm__("xmm1") = lo;
  __asm("call *%2" : "+x"(xmm0) : "x"(xmm1),
        "r"(&GTM_vpalignr_table[byte_count]));
  return xmm0;
}
#elif defined(__SSSE3__)
# define HAVE_ARCH_REALIGN_M128I 1
extern "C" const uint64_t GTM_palignr_table[16];
inline __m128i
realign_m128i (__m128i lo, __m128i hi, unsigned byte_count)
{
  register __m128i xmm0 __asm__("xmm0") = hi;
  register __m128i xmm1 __asm__("xmm1") = lo;
  __asm("call *%2" : "+x"(xmm0) : "x"(xmm1),
        "r"(&GTM_palignr_table[byte_count]));
  return xmm0;
}
#elif defined(__SSE2__)
# define HAVE_ARCH_REALIGN_M128I 1
extern "C" const char GTM_pshift_table[16 * 16];
inline __m128i
realign_m128i (__m128i lo, __m128i hi, unsigned byte_count)
{
  register __m128i xmm0 __asm__("xmm0") = lo;
  register __m128i xmm1 __asm__("xmm1") = hi;
  __asm("call *%2" : "+x"(xmm0), "+x"(xmm1)
        : "r"(GTM_pshift_table + byte_count*16));
  return xmm0;
}
#endif // XOP, AVX, SSSE3, SSE2

#ifdef HAVE_ARCH_REALIGN_M128I
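// With a realign primitive available, a straddling 16-byte load is simply
// a realign of the last 16 bytes of C1 with the first 16 bytes of C2.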
template<>
inline _ITM_TYPE_M128
unaligned_load2<_ITM_TYPE_M128>(const gtm_cacheline *c1,
                                const gtm_cacheline *c2, size_t ofs)
{
  return (_ITM_TYPE_M128)
    realign_m128i (c1->m128i[CACHELINE_SIZE / 16 - 1],
                   c2->m128i[0], ofs & 15);
}
#endif // HAVE_ARCH_REALIGN_M128I

#ifdef __AVX__
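// A straddling 32-byte load is assembled from two 16-byte halves: the half
// that actually crosses the cacheline boundary comes from the M128
// specialization above, while the other half lies entirely within C1 or C2
// and can be fetched with an ordinary unaligned 16-byte load.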
template<>
inline _ITM_TYPE_M256
unaligned_load2<_ITM_TYPE_M256>(const gtm_cacheline *c1,
                                const gtm_cacheline *c2, size_t ofs)
{
  __m128i v0, v1;
  __m256i r;

  v0 = (__m128i) unaligned_load2<_ITM_TYPE_M128>(c1, c2, ofs);
  if (ofs < CACHELINE_SIZE - 16)
    v1 = v0, v0 = _mm_loadu_si128 ((const __m128i *) &c1->b[ofs]);
  else
    v1 = _mm_loadu_si128((const __m128i *)&c2->b[ofs + 16 - CACHELINE_SIZE]);

  r = _mm256_castsi128_si256 ((__m128i)v0);
  r = _mm256_insertf128_si256 (r, (__m128i)v1, 1);
  return (_ITM_TYPE_M256) r;
}
#endif // AVX

} // namespace GTM

#endif // LIBITM_X86_UNALIGNED_H