]>
Commit | Line | Data |
---|---|---|
99dee823 | 1 | /* Copyright (C) 2002-2021 Free Software Foundation, Inc. |
5a9335ef NC |
2 | |
3 | This file is part of GCC. | |
4 | ||
5 | GCC is free software; you can redistribute it and/or modify it | |
6 | under the terms of the GNU General Public License as published | |
748086b7 | 7 | by the Free Software Foundation; either version 3, or (at your |
5a9335ef NC |
8 | option) any later version. |
9 | ||
10 | GCC is distributed in the hope that it will be useful, but WITHOUT | |
11 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY | |
12 | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public | |
13 | License for more details. | |
14 | ||
748086b7 JJ |
15 | Under Section 7 of GPL version 3, you are granted additional |
16 | permissions described in the GCC Runtime Library Exception, version | |
17 | 3.1, as published by the Free Software Foundation. | |
18 | ||
19 | You should have received a copy of the GNU General Public License and | |
20 | a copy of the GCC Runtime Library Exception along with this program; | |
21 | see the files COPYING3 and COPYING.RUNTIME respectively. If not, see | |
22 | <http://www.gnu.org/licenses/>. */ | |
5a9335ef NC |
23 | |
24 | #ifndef _MMINTRIN_H_INCLUDED | |
25 | #define _MMINTRIN_H_INCLUDED | |
26 | ||
8fd03515 XQ |
27 | #ifndef __IWMMXT__ |
28 | #error mmintrin.h included without enabling WMMX/WMMX2 instructions (e.g. -march=iwmmxt or -march=iwmmxt2) | |
29 | #endif | |
30 | ||
31 | ||
32 | #if defined __cplusplus | |
33 | extern "C" { | |
34 | /* Intrinsics use C name-mangling. */ | |
35 | #endif /* __cplusplus */ | |
36 | ||
5a9335ef NC |
/* Public 64-bit types: both are plain unsigned 64-bit integers, so values
   convert between them without any change of representation.  */
typedef unsigned long long __m64;
typedef unsigned long long __int64;

/* Eight-byte vector views used only inside the intrinsic implementations:
   two ints, four shorts, or eight signed chars.  */
typedef int __v2si __attribute__ ((__vector_size__ (8)));
typedef short __v4hi __attribute__ ((__vector_size__ (8)));
typedef signed char __v8qi __attribute__ ((__vector_size__ (8)));
5a9335ef | 44 | |
4ad4fa63 MT |
/* No-op kept so that code written against the MMX intrinsics still
   compiles; it takes no arguments and does nothing.  */
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{
  /* Intentionally empty.  */
}
50 | ||
5a9335ef | 51 | /* "Convert" __m64 and __int64 into each other. */ |
8fd03515 | 52 | static __inline __m64 |
5a9335ef NC |
53 | _mm_cvtsi64_m64 (__int64 __i) |
54 | { | |
55 | return __i; | |
56 | } | |
57 | ||
58 | static __inline __int64 | |
59 | _mm_cvtm64_si64 (__m64 __i) | |
60 | { | |
61 | return __i; | |
62 | } | |
63 | ||
64 | static __inline int | |
65 | _mm_cvtsi64_si32 (__int64 __i) | |
66 | { | |
67 | return __i; | |
68 | } | |
69 | ||
70 | static __inline __int64 | |
71 | _mm_cvtsi32_si64 (int __i) | |
72 | { | |
8fd03515 | 73 | return (__i & 0xffffffff); |
5a9335ef NC |
74 | } |
75 | ||
76 | /* Pack the four 16-bit values from M1 into the lower four 8-bit values of | |
77 | the result, and the four 16-bit values from M2 into the upper four 8-bit | |
78 | values of the result, all with signed saturation. */ | |
79 | static __inline __m64 | |
80 | _mm_packs_pi16 (__m64 __m1, __m64 __m2) | |
81 | { | |
82 | return (__m64) __builtin_arm_wpackhss ((__v4hi)__m1, (__v4hi)__m2); | |
83 | } | |
84 | ||
85 | /* Pack the two 32-bit values from M1 in to the lower two 16-bit values of | |
86 | the result, and the two 32-bit values from M2 into the upper two 16-bit | |
87 | values of the result, all with signed saturation. */ | |
88 | static __inline __m64 | |
89 | _mm_packs_pi32 (__m64 __m1, __m64 __m2) | |
90 | { | |
91 | return (__m64) __builtin_arm_wpackwss ((__v2si)__m1, (__v2si)__m2); | |
92 | } | |
93 | ||
94 | /* Copy the 64-bit value from M1 into the lower 32-bits of the result, and | |
95 | the 64-bit value from M2 into the upper 32-bits of the result, all with | |
96 | signed saturation for values that do not fit exactly into 32-bits. */ | |
97 | static __inline __m64 | |
98 | _mm_packs_pi64 (__m64 __m1, __m64 __m2) | |
99 | { | |
100 | return (__m64) __builtin_arm_wpackdss ((long long)__m1, (long long)__m2); | |
101 | } | |
102 | ||
103 | /* Pack the four 16-bit values from M1 into the lower four 8-bit values of | |
104 | the result, and the four 16-bit values from M2 into the upper four 8-bit | |
105 | values of the result, all with unsigned saturation. */ | |
106 | static __inline __m64 | |
107 | _mm_packs_pu16 (__m64 __m1, __m64 __m2) | |
108 | { | |
109 | return (__m64) __builtin_arm_wpackhus ((__v4hi)__m1, (__v4hi)__m2); | |
110 | } | |
111 | ||
112 | /* Pack the two 32-bit values from M1 into the lower two 16-bit values of | |
113 | the result, and the two 32-bit values from M2 into the upper two 16-bit | |
114 | values of the result, all with unsigned saturation. */ | |
115 | static __inline __m64 | |
116 | _mm_packs_pu32 (__m64 __m1, __m64 __m2) | |
117 | { | |
118 | return (__m64) __builtin_arm_wpackwus ((__v2si)__m1, (__v2si)__m2); | |
119 | } | |
120 | ||
121 | /* Copy the 64-bit value from M1 into the lower 32-bits of the result, and | |
122 | the 64-bit value from M2 into the upper 32-bits of the result, all with | |
123 | unsigned saturation for values that do not fit exactly into 32-bits. */ | |
124 | static __inline __m64 | |
125 | _mm_packs_pu64 (__m64 __m1, __m64 __m2) | |
126 | { | |
127 | return (__m64) __builtin_arm_wpackdus ((long long)__m1, (long long)__m2); | |
128 | } | |
129 | ||
130 | /* Interleave the four 8-bit values from the high half of M1 with the four | |
131 | 8-bit values from the high half of M2. */ | |
132 | static __inline __m64 | |
133 | _mm_unpackhi_pi8 (__m64 __m1, __m64 __m2) | |
134 | { | |
135 | return (__m64) __builtin_arm_wunpckihb ((__v8qi)__m1, (__v8qi)__m2); | |
136 | } | |
137 | ||
138 | /* Interleave the two 16-bit values from the high half of M1 with the two | |
139 | 16-bit values from the high half of M2. */ | |
140 | static __inline __m64 | |
141 | _mm_unpackhi_pi16 (__m64 __m1, __m64 __m2) | |
142 | { | |
143 | return (__m64) __builtin_arm_wunpckihh ((__v4hi)__m1, (__v4hi)__m2); | |
144 | } | |
145 | ||
146 | /* Interleave the 32-bit value from the high half of M1 with the 32-bit | |
147 | value from the high half of M2. */ | |
148 | static __inline __m64 | |
149 | _mm_unpackhi_pi32 (__m64 __m1, __m64 __m2) | |
150 | { | |
151 | return (__m64) __builtin_arm_wunpckihw ((__v2si)__m1, (__v2si)__m2); | |
152 | } | |
153 | ||
154 | /* Interleave the four 8-bit values from the low half of M1 with the four | |
155 | 8-bit values from the low half of M2. */ | |
156 | static __inline __m64 | |
157 | _mm_unpacklo_pi8 (__m64 __m1, __m64 __m2) | |
158 | { | |
159 | return (__m64) __builtin_arm_wunpckilb ((__v8qi)__m1, (__v8qi)__m2); | |
160 | } | |
161 | ||
162 | /* Interleave the two 16-bit values from the low half of M1 with the two | |
163 | 16-bit values from the low half of M2. */ | |
164 | static __inline __m64 | |
165 | _mm_unpacklo_pi16 (__m64 __m1, __m64 __m2) | |
166 | { | |
167 | return (__m64) __builtin_arm_wunpckilh ((__v4hi)__m1, (__v4hi)__m2); | |
168 | } | |
169 | ||
170 | /* Interleave the 32-bit value from the low half of M1 with the 32-bit | |
171 | value from the low half of M2. */ | |
172 | static __inline __m64 | |
173 | _mm_unpacklo_pi32 (__m64 __m1, __m64 __m2) | |
174 | { | |
175 | return (__m64) __builtin_arm_wunpckilw ((__v2si)__m1, (__v2si)__m2); | |
176 | } | |
177 | ||
178 | /* Take the four 8-bit values from the low half of M1, sign extend them, | |
179 | and return the result as a vector of four 16-bit quantities. */ | |
180 | static __inline __m64 | |
181 | _mm_unpackel_pi8 (__m64 __m1) | |
182 | { | |
183 | return (__m64) __builtin_arm_wunpckelsb ((__v8qi)__m1); | |
184 | } | |
185 | ||
186 | /* Take the two 16-bit values from the low half of M1, sign extend them, | |
187 | and return the result as a vector of two 32-bit quantities. */ | |
188 | static __inline __m64 | |
189 | _mm_unpackel_pi16 (__m64 __m1) | |
190 | { | |
191 | return (__m64) __builtin_arm_wunpckelsh ((__v4hi)__m1); | |
192 | } | |
193 | ||
194 | /* Take the 32-bit value from the low half of M1, and return it sign extended | |
195 | to 64 bits. */ | |
196 | static __inline __m64 | |
197 | _mm_unpackel_pi32 (__m64 __m1) | |
198 | { | |
199 | return (__m64) __builtin_arm_wunpckelsw ((__v2si)__m1); | |
200 | } | |
201 | ||
202 | /* Take the four 8-bit values from the high half of M1, sign extend them, | |
203 | and return the result as a vector of four 16-bit quantities. */ | |
204 | static __inline __m64 | |
205 | _mm_unpackeh_pi8 (__m64 __m1) | |
206 | { | |
207 | return (__m64) __builtin_arm_wunpckehsb ((__v8qi)__m1); | |
208 | } | |
209 | ||
210 | /* Take the two 16-bit values from the high half of M1, sign extend them, | |
211 | and return the result as a vector of two 32-bit quantities. */ | |
212 | static __inline __m64 | |
213 | _mm_unpackeh_pi16 (__m64 __m1) | |
214 | { | |
215 | return (__m64) __builtin_arm_wunpckehsh ((__v4hi)__m1); | |
216 | } | |
217 | ||
218 | /* Take the 32-bit value from the high half of M1, and return it sign extended | |
219 | to 64 bits. */ | |
220 | static __inline __m64 | |
221 | _mm_unpackeh_pi32 (__m64 __m1) | |
222 | { | |
223 | return (__m64) __builtin_arm_wunpckehsw ((__v2si)__m1); | |
224 | } | |
225 | ||
226 | /* Take the four 8-bit values from the low half of M1, zero extend them, | |
227 | and return the result as a vector of four 16-bit quantities. */ | |
228 | static __inline __m64 | |
229 | _mm_unpackel_pu8 (__m64 __m1) | |
230 | { | |
231 | return (__m64) __builtin_arm_wunpckelub ((__v8qi)__m1); | |
232 | } | |
233 | ||
234 | /* Take the two 16-bit values from the low half of M1, zero extend them, | |
235 | and return the result as a vector of two 32-bit quantities. */ | |
236 | static __inline __m64 | |
237 | _mm_unpackel_pu16 (__m64 __m1) | |
238 | { | |
239 | return (__m64) __builtin_arm_wunpckeluh ((__v4hi)__m1); | |
240 | } | |
241 | ||
242 | /* Take the 32-bit value from the low half of M1, and return it zero extended | |
243 | to 64 bits. */ | |
244 | static __inline __m64 | |
245 | _mm_unpackel_pu32 (__m64 __m1) | |
246 | { | |
247 | return (__m64) __builtin_arm_wunpckeluw ((__v2si)__m1); | |
248 | } | |
249 | ||
250 | /* Take the four 8-bit values from the high half of M1, zero extend them, | |
251 | and return the result as a vector of four 16-bit quantities. */ | |
252 | static __inline __m64 | |
253 | _mm_unpackeh_pu8 (__m64 __m1) | |
254 | { | |
255 | return (__m64) __builtin_arm_wunpckehub ((__v8qi)__m1); | |
256 | } | |
257 | ||
258 | /* Take the two 16-bit values from the high half of M1, zero extend them, | |
259 | and return the result as a vector of two 32-bit quantities. */ | |
260 | static __inline __m64 | |
261 | _mm_unpackeh_pu16 (__m64 __m1) | |
262 | { | |
263 | return (__m64) __builtin_arm_wunpckehuh ((__v4hi)__m1); | |
264 | } | |
265 | ||
266 | /* Take the 32-bit value from the high half of M1, and return it zero extended | |
267 | to 64 bits. */ | |
268 | static __inline __m64 | |
269 | _mm_unpackeh_pu32 (__m64 __m1) | |
270 | { | |
271 | return (__m64) __builtin_arm_wunpckehuw ((__v2si)__m1); | |
272 | } | |
273 | ||
274 | /* Add the 8-bit values in M1 to the 8-bit values in M2. */ | |
275 | static __inline __m64 | |
276 | _mm_add_pi8 (__m64 __m1, __m64 __m2) | |
277 | { | |
278 | return (__m64) __builtin_arm_waddb ((__v8qi)__m1, (__v8qi)__m2); | |
279 | } | |
280 | ||
281 | /* Add the 16-bit values in M1 to the 16-bit values in M2. */ | |
282 | static __inline __m64 | |
283 | _mm_add_pi16 (__m64 __m1, __m64 __m2) | |
284 | { | |
285 | return (__m64) __builtin_arm_waddh ((__v4hi)__m1, (__v4hi)__m2); | |
286 | } | |
287 | ||
288 | /* Add the 32-bit values in M1 to the 32-bit values in M2. */ | |
289 | static __inline __m64 | |
290 | _mm_add_pi32 (__m64 __m1, __m64 __m2) | |
291 | { | |
292 | return (__m64) __builtin_arm_waddw ((__v2si)__m1, (__v2si)__m2); | |
293 | } | |
294 | ||
295 | /* Add the 8-bit values in M1 to the 8-bit values in M2 using signed | |
296 | saturated arithmetic. */ | |
297 | static __inline __m64 | |
298 | _mm_adds_pi8 (__m64 __m1, __m64 __m2) | |
299 | { | |
300 | return (__m64) __builtin_arm_waddbss ((__v8qi)__m1, (__v8qi)__m2); | |
301 | } | |
302 | ||
303 | /* Add the 16-bit values in M1 to the 16-bit values in M2 using signed | |
304 | saturated arithmetic. */ | |
305 | static __inline __m64 | |
306 | _mm_adds_pi16 (__m64 __m1, __m64 __m2) | |
307 | { | |
308 | return (__m64) __builtin_arm_waddhss ((__v4hi)__m1, (__v4hi)__m2); | |
309 | } | |
310 | ||
311 | /* Add the 32-bit values in M1 to the 32-bit values in M2 using signed | |
312 | saturated arithmetic. */ | |
313 | static __inline __m64 | |
314 | _mm_adds_pi32 (__m64 __m1, __m64 __m2) | |
315 | { | |
316 | return (__m64) __builtin_arm_waddwss ((__v2si)__m1, (__v2si)__m2); | |
317 | } | |
318 | ||
319 | /* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned | |
320 | saturated arithmetic. */ | |
321 | static __inline __m64 | |
322 | _mm_adds_pu8 (__m64 __m1, __m64 __m2) | |
323 | { | |
324 | return (__m64) __builtin_arm_waddbus ((__v8qi)__m1, (__v8qi)__m2); | |
325 | } | |
326 | ||
327 | /* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned | |
328 | saturated arithmetic. */ | |
329 | static __inline __m64 | |
330 | _mm_adds_pu16 (__m64 __m1, __m64 __m2) | |
331 | { | |
332 | return (__m64) __builtin_arm_waddhus ((__v4hi)__m1, (__v4hi)__m2); | |
333 | } | |
334 | ||
335 | /* Add the 32-bit values in M1 to the 32-bit values in M2 using unsigned | |
336 | saturated arithmetic. */ | |
337 | static __inline __m64 | |
338 | _mm_adds_pu32 (__m64 __m1, __m64 __m2) | |
339 | { | |
340 | return (__m64) __builtin_arm_waddwus ((__v2si)__m1, (__v2si)__m2); | |
341 | } | |
342 | ||
343 | /* Subtract the 8-bit values in M2 from the 8-bit values in M1. */ | |
344 | static __inline __m64 | |
345 | _mm_sub_pi8 (__m64 __m1, __m64 __m2) | |
346 | { | |
347 | return (__m64) __builtin_arm_wsubb ((__v8qi)__m1, (__v8qi)__m2); | |
348 | } | |
349 | ||
350 | /* Subtract the 16-bit values in M2 from the 16-bit values in M1. */ | |
351 | static __inline __m64 | |
352 | _mm_sub_pi16 (__m64 __m1, __m64 __m2) | |
353 | { | |
354 | return (__m64) __builtin_arm_wsubh ((__v4hi)__m1, (__v4hi)__m2); | |
355 | } | |
356 | ||
357 | /* Subtract the 32-bit values in M2 from the 32-bit values in M1. */ | |
358 | static __inline __m64 | |
359 | _mm_sub_pi32 (__m64 __m1, __m64 __m2) | |
360 | { | |
361 | return (__m64) __builtin_arm_wsubw ((__v2si)__m1, (__v2si)__m2); | |
362 | } | |
363 | ||
364 | /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed | |
365 | saturating arithmetic. */ | |
366 | static __inline __m64 | |
367 | _mm_subs_pi8 (__m64 __m1, __m64 __m2) | |
368 | { | |
369 | return (__m64) __builtin_arm_wsubbss ((__v8qi)__m1, (__v8qi)__m2); | |
370 | } | |
371 | ||
372 | /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using | |
373 | signed saturating arithmetic. */ | |
374 | static __inline __m64 | |
375 | _mm_subs_pi16 (__m64 __m1, __m64 __m2) | |
376 | { | |
377 | return (__m64) __builtin_arm_wsubhss ((__v4hi)__m1, (__v4hi)__m2); | |
378 | } | |
379 | ||
380 | /* Subtract the 32-bit values in M2 from the 32-bit values in M1 using | |
381 | signed saturating arithmetic. */ | |
382 | static __inline __m64 | |
383 | _mm_subs_pi32 (__m64 __m1, __m64 __m2) | |
384 | { | |
385 | return (__m64) __builtin_arm_wsubwss ((__v2si)__m1, (__v2si)__m2); | |
386 | } | |
387 | ||
388 | /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using | |
389 | unsigned saturating arithmetic. */ | |
390 | static __inline __m64 | |
391 | _mm_subs_pu8 (__m64 __m1, __m64 __m2) | |
392 | { | |
393 | return (__m64) __builtin_arm_wsubbus ((__v8qi)__m1, (__v8qi)__m2); | |
394 | } | |
395 | ||
396 | /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using | |
397 | unsigned saturating arithmetic. */ | |
398 | static __inline __m64 | |
399 | _mm_subs_pu16 (__m64 __m1, __m64 __m2) | |
400 | { | |
401 | return (__m64) __builtin_arm_wsubhus ((__v4hi)__m1, (__v4hi)__m2); | |
402 | } | |
403 | ||
404 | /* Subtract the 32-bit values in M2 from the 32-bit values in M1 using | |
405 | unsigned saturating arithmetic. */ | |
406 | static __inline __m64 | |
407 | _mm_subs_pu32 (__m64 __m1, __m64 __m2) | |
408 | { | |
409 | return (__m64) __builtin_arm_wsubwus ((__v2si)__m1, (__v2si)__m2); | |
410 | } | |
411 | ||
412 | /* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing | |
413 | four 32-bit intermediate results, which are then summed by pairs to | |
414 | produce two 32-bit results. */ | |
415 | static __inline __m64 | |
416 | _mm_madd_pi16 (__m64 __m1, __m64 __m2) | |
417 | { | |
418 | return (__m64) __builtin_arm_wmadds ((__v4hi)__m1, (__v4hi)__m2); | |
419 | } | |
420 | ||
421 | /* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing | |
422 | four 32-bit intermediate results, which are then summed by pairs to | |
423 | produce two 32-bit results. */ | |
424 | static __inline __m64 | |
425 | _mm_madd_pu16 (__m64 __m1, __m64 __m2) | |
426 | { | |
427 | return (__m64) __builtin_arm_wmaddu ((__v4hi)__m1, (__v4hi)__m2); | |
428 | } | |
429 | ||
430 | /* Multiply four signed 16-bit values in M1 by four signed 16-bit values in | |
431 | M2 and produce the high 16 bits of the 32-bit results. */ | |
432 | static __inline __m64 | |
433 | _mm_mulhi_pi16 (__m64 __m1, __m64 __m2) | |
434 | { | |
f07a6b21 | 435 | return (__m64) __builtin_arm_wmulsm ((__v4hi)__m1, (__v4hi)__m2); |
5a9335ef NC |
436 | } |
437 | ||
438 | /* Multiply four signed 16-bit values in M1 by four signed 16-bit values in | |
439 | M2 and produce the high 16 bits of the 32-bit results. */ | |
440 | static __inline __m64 | |
441 | _mm_mulhi_pu16 (__m64 __m1, __m64 __m2) | |
442 | { | |
f07a6b21 | 443 | return (__m64) __builtin_arm_wmulum ((__v4hi)__m1, (__v4hi)__m2); |
5a9335ef NC |
444 | } |
445 | ||
446 | /* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce | |
447 | the low 16 bits of the results. */ | |
448 | static __inline __m64 | |
449 | _mm_mullo_pi16 (__m64 __m1, __m64 __m2) | |
450 | { | |
451 | return (__m64) __builtin_arm_wmulul ((__v4hi)__m1, (__v4hi)__m2); | |
452 | } | |
453 | ||
454 | /* Shift four 16-bit values in M left by COUNT. */ | |
455 | static __inline __m64 | |
456 | _mm_sll_pi16 (__m64 __m, __m64 __count) | |
457 | { | |
458 | return (__m64) __builtin_arm_wsllh ((__v4hi)__m, __count); | |
459 | } | |
460 | ||
461 | static __inline __m64 | |
462 | _mm_slli_pi16 (__m64 __m, int __count) | |
463 | { | |
464 | return (__m64) __builtin_arm_wsllhi ((__v4hi)__m, __count); | |
465 | } | |
466 | ||
467 | /* Shift two 32-bit values in M left by COUNT. */ | |
468 | static __inline __m64 | |
469 | _mm_sll_pi32 (__m64 __m, __m64 __count) | |
470 | { | |
471 | return (__m64) __builtin_arm_wsllw ((__v2si)__m, __count); | |
472 | } | |
473 | ||
474 | static __inline __m64 | |
475 | _mm_slli_pi32 (__m64 __m, int __count) | |
476 | { | |
477 | return (__m64) __builtin_arm_wsllwi ((__v2si)__m, __count); | |
478 | } | |
479 | ||
480 | /* Shift the 64-bit value in M left by COUNT. */ | |
481 | static __inline __m64 | |
482 | _mm_sll_si64 (__m64 __m, __m64 __count) | |
483 | { | |
484 | return (__m64) __builtin_arm_wslld (__m, __count); | |
485 | } | |
486 | ||
487 | static __inline __m64 | |
488 | _mm_slli_si64 (__m64 __m, int __count) | |
489 | { | |
490 | return (__m64) __builtin_arm_wslldi (__m, __count); | |
491 | } | |
492 | ||
493 | /* Shift four 16-bit values in M right by COUNT; shift in the sign bit. */ | |
494 | static __inline __m64 | |
495 | _mm_sra_pi16 (__m64 __m, __m64 __count) | |
496 | { | |
497 | return (__m64) __builtin_arm_wsrah ((__v4hi)__m, __count); | |
498 | } | |
499 | ||
500 | static __inline __m64 | |
501 | _mm_srai_pi16 (__m64 __m, int __count) | |
502 | { | |
503 | return (__m64) __builtin_arm_wsrahi ((__v4hi)__m, __count); | |
504 | } | |
505 | ||
506 | /* Shift two 32-bit values in M right by COUNT; shift in the sign bit. */ | |
507 | static __inline __m64 | |
508 | _mm_sra_pi32 (__m64 __m, __m64 __count) | |
509 | { | |
510 | return (__m64) __builtin_arm_wsraw ((__v2si)__m, __count); | |
511 | } | |
512 | ||
513 | static __inline __m64 | |
514 | _mm_srai_pi32 (__m64 __m, int __count) | |
515 | { | |
516 | return (__m64) __builtin_arm_wsrawi ((__v2si)__m, __count); | |
517 | } | |
518 | ||
519 | /* Shift the 64-bit value in M right by COUNT; shift in the sign bit. */ | |
520 | static __inline __m64 | |
521 | _mm_sra_si64 (__m64 __m, __m64 __count) | |
522 | { | |
523 | return (__m64) __builtin_arm_wsrad (__m, __count); | |
524 | } | |
525 | ||
526 | static __inline __m64 | |
527 | _mm_srai_si64 (__m64 __m, int __count) | |
528 | { | |
529 | return (__m64) __builtin_arm_wsradi (__m, __count); | |
530 | } | |
531 | ||
532 | /* Shift four 16-bit values in M right by COUNT; shift in zeros. */ | |
533 | static __inline __m64 | |
534 | _mm_srl_pi16 (__m64 __m, __m64 __count) | |
535 | { | |
536 | return (__m64) __builtin_arm_wsrlh ((__v4hi)__m, __count); | |
537 | } | |
538 | ||
539 | static __inline __m64 | |
540 | _mm_srli_pi16 (__m64 __m, int __count) | |
541 | { | |
542 | return (__m64) __builtin_arm_wsrlhi ((__v4hi)__m, __count); | |
543 | } | |
544 | ||
545 | /* Shift two 32-bit values in M right by COUNT; shift in zeros. */ | |
546 | static __inline __m64 | |
547 | _mm_srl_pi32 (__m64 __m, __m64 __count) | |
548 | { | |
549 | return (__m64) __builtin_arm_wsrlw ((__v2si)__m, __count); | |
550 | } | |
551 | ||
552 | static __inline __m64 | |
553 | _mm_srli_pi32 (__m64 __m, int __count) | |
554 | { | |
555 | return (__m64) __builtin_arm_wsrlwi ((__v2si)__m, __count); | |
556 | } | |
557 | ||
558 | /* Shift the 64-bit value in M left by COUNT; shift in zeros. */ | |
559 | static __inline __m64 | |
560 | _mm_srl_si64 (__m64 __m, __m64 __count) | |
561 | { | |
562 | return (__m64) __builtin_arm_wsrld (__m, __count); | |
563 | } | |
564 | ||
565 | static __inline __m64 | |
566 | _mm_srli_si64 (__m64 __m, int __count) | |
567 | { | |
568 | return (__m64) __builtin_arm_wsrldi (__m, __count); | |
569 | } | |
570 | ||
571 | /* Rotate four 16-bit values in M right by COUNT. */ | |
572 | static __inline __m64 | |
573 | _mm_ror_pi16 (__m64 __m, __m64 __count) | |
574 | { | |
575 | return (__m64) __builtin_arm_wrorh ((__v4hi)__m, __count); | |
576 | } | |
577 | ||
578 | static __inline __m64 | |
579 | _mm_rori_pi16 (__m64 __m, int __count) | |
580 | { | |
581 | return (__m64) __builtin_arm_wrorhi ((__v4hi)__m, __count); | |
582 | } | |
583 | ||
584 | /* Rotate two 32-bit values in M right by COUNT. */ | |
585 | static __inline __m64 | |
586 | _mm_ror_pi32 (__m64 __m, __m64 __count) | |
587 | { | |
588 | return (__m64) __builtin_arm_wrorw ((__v2si)__m, __count); | |
589 | } | |
590 | ||
591 | static __inline __m64 | |
592 | _mm_rori_pi32 (__m64 __m, int __count) | |
593 | { | |
594 | return (__m64) __builtin_arm_wrorwi ((__v2si)__m, __count); | |
595 | } | |
596 | ||
597 | /* Rotate two 64-bit values in M right by COUNT. */ | |
598 | static __inline __m64 | |
599 | _mm_ror_si64 (__m64 __m, __m64 __count) | |
600 | { | |
601 | return (__m64) __builtin_arm_wrord (__m, __count); | |
602 | } | |
603 | ||
604 | static __inline __m64 | |
605 | _mm_rori_si64 (__m64 __m, int __count) | |
606 | { | |
607 | return (__m64) __builtin_arm_wrordi (__m, __count); | |
608 | } | |
609 | ||
610 | /* Bit-wise AND the 64-bit values in M1 and M2. */ | |
611 | static __inline __m64 | |
612 | _mm_and_si64 (__m64 __m1, __m64 __m2) | |
613 | { | |
614 | return __builtin_arm_wand (__m1, __m2); | |
615 | } | |
616 | ||
617 | /* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the | |
618 | 64-bit value in M2. */ | |
619 | static __inline __m64 | |
620 | _mm_andnot_si64 (__m64 __m1, __m64 __m2) | |
621 | { | |
8fd03515 | 622 | return __builtin_arm_wandn (__m2, __m1); |
5a9335ef NC |
623 | } |
624 | ||
625 | /* Bit-wise inclusive OR the 64-bit values in M1 and M2. */ | |
626 | static __inline __m64 | |
627 | _mm_or_si64 (__m64 __m1, __m64 __m2) | |
628 | { | |
629 | return __builtin_arm_wor (__m1, __m2); | |
630 | } | |
631 | ||
632 | /* Bit-wise exclusive OR the 64-bit values in M1 and M2. */ | |
633 | static __inline __m64 | |
634 | _mm_xor_si64 (__m64 __m1, __m64 __m2) | |
635 | { | |
636 | return __builtin_arm_wxor (__m1, __m2); | |
637 | } | |
638 | ||
639 | /* Compare eight 8-bit values. The result of the comparison is 0xFF if the | |
640 | test is true and zero if false. */ | |
641 | static __inline __m64 | |
642 | _mm_cmpeq_pi8 (__m64 __m1, __m64 __m2) | |
643 | { | |
644 | return (__m64) __builtin_arm_wcmpeqb ((__v8qi)__m1, (__v8qi)__m2); | |
645 | } | |
646 | ||
647 | static __inline __m64 | |
648 | _mm_cmpgt_pi8 (__m64 __m1, __m64 __m2) | |
649 | { | |
650 | return (__m64) __builtin_arm_wcmpgtsb ((__v8qi)__m1, (__v8qi)__m2); | |
651 | } | |
652 | ||
653 | static __inline __m64 | |
654 | _mm_cmpgt_pu8 (__m64 __m1, __m64 __m2) | |
655 | { | |
656 | return (__m64) __builtin_arm_wcmpgtub ((__v8qi)__m1, (__v8qi)__m2); | |
657 | } | |
658 | ||
659 | /* Compare four 16-bit values. The result of the comparison is 0xFFFF if | |
660 | the test is true and zero if false. */ | |
661 | static __inline __m64 | |
662 | _mm_cmpeq_pi16 (__m64 __m1, __m64 __m2) | |
663 | { | |
664 | return (__m64) __builtin_arm_wcmpeqh ((__v4hi)__m1, (__v4hi)__m2); | |
665 | } | |
666 | ||
667 | static __inline __m64 | |
668 | _mm_cmpgt_pi16 (__m64 __m1, __m64 __m2) | |
669 | { | |
670 | return (__m64) __builtin_arm_wcmpgtsh ((__v4hi)__m1, (__v4hi)__m2); | |
671 | } | |
672 | ||
673 | static __inline __m64 | |
674 | _mm_cmpgt_pu16 (__m64 __m1, __m64 __m2) | |
675 | { | |
676 | return (__m64) __builtin_arm_wcmpgtuh ((__v4hi)__m1, (__v4hi)__m2); | |
677 | } | |
678 | ||
679 | /* Compare two 32-bit values. The result of the comparison is 0xFFFFFFFF if | |
680 | the test is true and zero if false. */ | |
681 | static __inline __m64 | |
682 | _mm_cmpeq_pi32 (__m64 __m1, __m64 __m2) | |
683 | { | |
684 | return (__m64) __builtin_arm_wcmpeqw ((__v2si)__m1, (__v2si)__m2); | |
685 | } | |
686 | ||
687 | static __inline __m64 | |
688 | _mm_cmpgt_pi32 (__m64 __m1, __m64 __m2) | |
689 | { | |
690 | return (__m64) __builtin_arm_wcmpgtsw ((__v2si)__m1, (__v2si)__m2); | |
691 | } | |
692 | ||
693 | static __inline __m64 | |
694 | _mm_cmpgt_pu32 (__m64 __m1, __m64 __m2) | |
695 | { | |
696 | return (__m64) __builtin_arm_wcmpgtuw ((__v2si)__m1, (__v2si)__m2); | |
697 | } | |
698 | ||
699 | /* Element-wise multiplication of unsigned 16-bit values __B and __C, followed | |
700 | by accumulate across all elements and __A. */ | |
701 | static __inline __m64 | |
702 | _mm_mac_pu16 (__m64 __A, __m64 __B, __m64 __C) | |
703 | { | |
704 | return __builtin_arm_wmacu (__A, (__v4hi)__B, (__v4hi)__C); | |
705 | } | |
706 | ||
707 | /* Element-wise multiplication of signed 16-bit values __B and __C, followed | |
708 | by accumulate across all elements and __A. */ | |
709 | static __inline __m64 | |
710 | _mm_mac_pi16 (__m64 __A, __m64 __B, __m64 __C) | |
711 | { | |
712 | return __builtin_arm_wmacs (__A, (__v4hi)__B, (__v4hi)__C); | |
713 | } | |
714 | ||
715 | /* Element-wise multiplication of unsigned 16-bit values __B and __C, followed | |
716 | by accumulate across all elements. */ | |
717 | static __inline __m64 | |
718 | _mm_macz_pu16 (__m64 __A, __m64 __B) | |
719 | { | |
720 | return __builtin_arm_wmacuz ((__v4hi)__A, (__v4hi)__B); | |
721 | } | |
722 | ||
723 | /* Element-wise multiplication of signed 16-bit values __B and __C, followed | |
724 | by accumulate across all elements. */ | |
725 | static __inline __m64 | |
726 | _mm_macz_pi16 (__m64 __A, __m64 __B) | |
727 | { | |
728 | return __builtin_arm_wmacsz ((__v4hi)__A, (__v4hi)__B); | |
729 | } | |
730 | ||
731 | /* Accumulate across all unsigned 8-bit values in __A. */ | |
732 | static __inline __m64 | |
733 | _mm_acc_pu8 (__m64 __A) | |
734 | { | |
735 | return __builtin_arm_waccb ((__v8qi)__A); | |
736 | } | |
737 | ||
738 | /* Accumulate across all unsigned 16-bit values in __A. */ | |
739 | static __inline __m64 | |
740 | _mm_acc_pu16 (__m64 __A) | |
741 | { | |
742 | return __builtin_arm_wacch ((__v4hi)__A); | |
743 | } | |
744 | ||
745 | /* Accumulate across all unsigned 32-bit values in __A. */ | |
746 | static __inline __m64 | |
747 | _mm_acc_pu32 (__m64 __A) | |
748 | { | |
749 | return __builtin_arm_waccw ((__v2si)__A); | |
750 | } | |
751 | ||
752 | static __inline __m64 | |
753 | _mm_mia_si64 (__m64 __A, int __B, int __C) | |
754 | { | |
755 | return __builtin_arm_tmia (__A, __B, __C); | |
756 | } | |
757 | ||
758 | static __inline __m64 | |
759 | _mm_miaph_si64 (__m64 __A, int __B, int __C) | |
760 | { | |
761 | return __builtin_arm_tmiaph (__A, __B, __C); | |
762 | } | |
763 | ||
764 | static __inline __m64 | |
765 | _mm_miabb_si64 (__m64 __A, int __B, int __C) | |
766 | { | |
767 | return __builtin_arm_tmiabb (__A, __B, __C); | |
768 | } | |
769 | ||
770 | static __inline __m64 | |
771 | _mm_miabt_si64 (__m64 __A, int __B, int __C) | |
772 | { | |
773 | return __builtin_arm_tmiabt (__A, __B, __C); | |
774 | } | |
775 | ||
776 | static __inline __m64 | |
777 | _mm_miatb_si64 (__m64 __A, int __B, int __C) | |
778 | { | |
779 | return __builtin_arm_tmiatb (__A, __B, __C); | |
780 | } | |
781 | ||
782 | static __inline __m64 | |
783 | _mm_miatt_si64 (__m64 __A, int __B, int __C) | |
784 | { | |
785 | return __builtin_arm_tmiatt (__A, __B, __C); | |
786 | } | |
787 | ||
/* Extract one of the elements of A and sign extend.  The selector N must
   be immediate.  Each expansion is fully parenthesized, matching the
   _mm_insert_* macros, so it behaves as a single operand in any
   surrounding expression.  */
#define _mm_extract_pi8(A, N) \
  (__builtin_arm_textrmsb ((__v8qi)(A), (N)))
#define _mm_extract_pi16(A, N) \
  (__builtin_arm_textrmsh ((__v4hi)(A), (N)))
#define _mm_extract_pi32(A, N) \
  (__builtin_arm_textrmsw ((__v2si)(A), (N)))
793 | ||
/* Extract one of the elements of A and zero extend.  The selector N must
   be immediate.  Each expansion is fully parenthesized, matching the
   _mm_insert_* macros, so it behaves as a single operand in any
   surrounding expression.  */
#define _mm_extract_pu8(A, N) \
  (__builtin_arm_textrmub ((__v8qi)(A), (N)))
#define _mm_extract_pu16(A, N) \
  (__builtin_arm_textrmuh ((__v4hi)(A), (N)))
#define _mm_extract_pu32(A, N) \
  (__builtin_arm_textrmuw ((__v2si)(A), (N)))
799 | ||
/* Insert the word D into the element of A chosen by N and yield the
   updated 64-bit value.  N has to be an immediate.  */
#define _mm_insert_pi8(A, D, N)						\
  ((__m64) __builtin_arm_tinsrb ((__v8qi)(A), (D), (N)))
#define _mm_insert_pi16(A, D, N)					\
  ((__m64) __builtin_arm_tinsrh ((__v4hi)(A), (D), (N)))
#define _mm_insert_pi32(A, D, N)					\
  ((__m64) __builtin_arm_tinsrw ((__v2si)(A), (D), (N)))
808 | ||
809 | /* Compute the element-wise maximum of signed 8-bit values. */ | |
810 | static __inline __m64 | |
811 | _mm_max_pi8 (__m64 __A, __m64 __B) | |
812 | { | |
813 | return (__m64) __builtin_arm_wmaxsb ((__v8qi)__A, (__v8qi)__B); | |
814 | } | |
815 | ||
816 | /* Compute the element-wise maximum of signed 16-bit values. */ | |
817 | static __inline __m64 | |
818 | _mm_max_pi16 (__m64 __A, __m64 __B) | |
819 | { | |
820 | return (__m64) __builtin_arm_wmaxsh ((__v4hi)__A, (__v4hi)__B); | |
821 | } | |
822 | ||
823 | /* Compute the element-wise maximum of signed 32-bit values. */ | |
824 | static __inline __m64 | |
825 | _mm_max_pi32 (__m64 __A, __m64 __B) | |
826 | { | |
827 | return (__m64) __builtin_arm_wmaxsw ((__v2si)__A, (__v2si)__B); | |
828 | } | |
829 | ||
830 | /* Compute the element-wise maximum of unsigned 8-bit values. */ | |
831 | static __inline __m64 | |
832 | _mm_max_pu8 (__m64 __A, __m64 __B) | |
833 | { | |
834 | return (__m64) __builtin_arm_wmaxub ((__v8qi)__A, (__v8qi)__B); | |
835 | } | |
836 | ||
837 | /* Compute the element-wise maximum of unsigned 16-bit values. */ | |
838 | static __inline __m64 | |
839 | _mm_max_pu16 (__m64 __A, __m64 __B) | |
840 | { | |
841 | return (__m64) __builtin_arm_wmaxuh ((__v4hi)__A, (__v4hi)__B); | |
842 | } | |
843 | ||
844 | /* Compute the element-wise maximum of unsigned 32-bit values. */ | |
845 | static __inline __m64 | |
846 | _mm_max_pu32 (__m64 __A, __m64 __B) | |
847 | { | |
848 | return (__m64) __builtin_arm_wmaxuw ((__v2si)__A, (__v2si)__B); | |
849 | } | |
850 | ||
851 | /* Compute the element-wise minimum of signed 16-bit values. */ | |
852 | static __inline __m64 | |
853 | _mm_min_pi8 (__m64 __A, __m64 __B) | |
854 | { | |
855 | return (__m64) __builtin_arm_wminsb ((__v8qi)__A, (__v8qi)__B); | |
856 | } | |
857 | ||
858 | /* Compute the element-wise minimum of signed 16-bit values. */ | |
859 | static __inline __m64 | |
860 | _mm_min_pi16 (__m64 __A, __m64 __B) | |
861 | { | |
862 | return (__m64) __builtin_arm_wminsh ((__v4hi)__A, (__v4hi)__B); | |
863 | } | |
864 | ||
865 | /* Compute the element-wise minimum of signed 32-bit values. */ | |
866 | static __inline __m64 | |
867 | _mm_min_pi32 (__m64 __A, __m64 __B) | |
868 | { | |
869 | return (__m64) __builtin_arm_wminsw ((__v2si)__A, (__v2si)__B); | |
870 | } | |
871 | ||
872 | /* Compute the element-wise minimum of unsigned 16-bit values. */ | |
873 | static __inline __m64 | |
874 | _mm_min_pu8 (__m64 __A, __m64 __B) | |
875 | { | |
876 | return (__m64) __builtin_arm_wminub ((__v8qi)__A, (__v8qi)__B); | |
877 | } | |
878 | ||
879 | /* Compute the element-wise minimum of unsigned 16-bit values. */ | |
880 | static __inline __m64 | |
881 | _mm_min_pu16 (__m64 __A, __m64 __B) | |
882 | { | |
883 | return (__m64) __builtin_arm_wminuh ((__v4hi)__A, (__v4hi)__B); | |
884 | } | |
885 | ||
886 | /* Compute the element-wise minimum of unsigned 32-bit values. */ | |
887 | static __inline __m64 | |
888 | _mm_min_pu32 (__m64 __A, __m64 __B) | |
889 | { | |
890 | return (__m64) __builtin_arm_wminuw ((__v2si)__A, (__v2si)__B); | |
891 | } | |
892 | ||
893 | /* Create an 8-bit mask of the signs of 8-bit values. */ | |
894 | static __inline int | |
895 | _mm_movemask_pi8 (__m64 __A) | |
896 | { | |
897 | return __builtin_arm_tmovmskb ((__v8qi)__A); | |
898 | } | |
899 | ||
900 | /* Create an 8-bit mask of the signs of 16-bit values. */ | |
901 | static __inline int | |
902 | _mm_movemask_pi16 (__m64 __A) | |
903 | { | |
904 | return __builtin_arm_tmovmskh ((__v4hi)__A); | |
905 | } | |
906 | ||
907 | /* Create an 8-bit mask of the signs of 32-bit values. */ | |
908 | static __inline int | |
909 | _mm_movemask_pi32 (__m64 __A) | |
910 | { | |
911 | return __builtin_arm_tmovmskw ((__v2si)__A); | |
912 | } | |
913 | ||
/* Yield a rearrangement of the four 16-bit values of A chosen by the
   selector N.  N has to be an immediate.  */
#define _mm_shuffle_pi16(A, N) \
  ((__m64) __builtin_arm_wshufh ((__v4hi) (A), (N)))
918 | ||
919 | ||
920 | /* Compute the rounded averages of the unsigned 8-bit values in A and B. */ | |
921 | static __inline __m64 | |
922 | _mm_avg_pu8 (__m64 __A, __m64 __B) | |
923 | { | |
924 | return (__m64) __builtin_arm_wavg2br ((__v8qi)__A, (__v8qi)__B); | |
925 | } | |
926 | ||
927 | /* Compute the rounded averages of the unsigned 16-bit values in A and B. */ | |
928 | static __inline __m64 | |
929 | _mm_avg_pu16 (__m64 __A, __m64 __B) | |
930 | { | |
931 | return (__m64) __builtin_arm_wavg2hr ((__v4hi)__A, (__v4hi)__B); | |
932 | } | |
933 | ||
934 | /* Compute the averages of the unsigned 8-bit values in A and B. */ | |
935 | static __inline __m64 | |
936 | _mm_avg2_pu8 (__m64 __A, __m64 __B) | |
937 | { | |
938 | return (__m64) __builtin_arm_wavg2b ((__v8qi)__A, (__v8qi)__B); | |
939 | } | |
940 | ||
941 | /* Compute the averages of the unsigned 16-bit values in A and B. */ | |
942 | static __inline __m64 | |
943 | _mm_avg2_pu16 (__m64 __A, __m64 __B) | |
944 | { | |
945 | return (__m64) __builtin_arm_wavg2h ((__v4hi)__A, (__v4hi)__B); | |
946 | } | |
947 | ||
948 | /* Compute the sum of the absolute differences of the unsigned 8-bit | |
949 | values in A and B. Return the value in the lower 16-bit word; the | |
950 | upper words are cleared. */ | |
951 | static __inline __m64 | |
952 | _mm_sad_pu8 (__m64 __A, __m64 __B) | |
953 | { | |
8fd03515 XQ |
954 | return (__m64) __builtin_arm_wsadbz ((__v8qi)__A, (__v8qi)__B); |
955 | } | |
956 | ||
957 | static __inline __m64 | |
958 | _mm_sada_pu8 (__m64 __A, __m64 __B, __m64 __C) | |
959 | { | |
960 | return (__m64) __builtin_arm_wsadb ((__v2si)__A, (__v8qi)__B, (__v8qi)__C); | |
5a9335ef NC |
961 | } |
962 | ||
963 | /* Compute the sum of the absolute differences of the unsigned 16-bit | |
964 | values in A and B. Return the value in the lower 32-bit word; the | |
965 | upper words are cleared. */ | |
966 | static __inline __m64 | |
967 | _mm_sad_pu16 (__m64 __A, __m64 __B) | |
968 | { | |
8fd03515 | 969 | return (__m64) __builtin_arm_wsadhz ((__v4hi)__A, (__v4hi)__B); |
5a9335ef NC |
970 | } |
971 | ||
8fd03515 XQ |
972 | static __inline __m64 |
973 | _mm_sada_pu16 (__m64 __A, __m64 __B, __m64 __C) | |
974 | { | |
975 | return (__m64) __builtin_arm_wsadh ((__v2si)__A, (__v4hi)__B, (__v4hi)__C); | |
976 | } | |
977 | ||
978 | ||
5a9335ef NC |
979 | /* Compute the sum of the absolute differences of the unsigned 8-bit |
980 | values in A and B. Return the value in the lower 16-bit word; the | |
981 | upper words are cleared. */ | |
982 | static __inline __m64 | |
983 | _mm_sadz_pu8 (__m64 __A, __m64 __B) | |
984 | { | |
985 | return (__m64) __builtin_arm_wsadbz ((__v8qi)__A, (__v8qi)__B); | |
986 | } | |
987 | ||
988 | /* Compute the sum of the absolute differences of the unsigned 16-bit | |
989 | values in A and B. Return the value in the lower 32-bit word; the | |
990 | upper words are cleared. */ | |
991 | static __inline __m64 | |
992 | _mm_sadz_pu16 (__m64 __A, __m64 __B) | |
993 | { | |
994 | return (__m64) __builtin_arm_wsadhz ((__v4hi)__A, (__v4hi)__B); | |
995 | } | |
996 | ||
8fd03515 XQ |
/* Wrapper for __builtin_arm_walign: __A and __B are passed as eight
   8-bit lanes together with the selector N (presumably a byte offset
   that must be an immediate -- confirm against the WALIGNI
   description).  The whole expansion is parenthesized so that it
   behaves as a single primary expression (e.g. under sizeof), like the
   other intrinsic macros in this header.  */
#define _mm_align_si64(__A, __B, N) \
  ((__m64) __builtin_arm_walign ((__v8qi) (__A), (__v8qi) (__B), (N)))
5a9335ef NC |
999 | |
1000 | /* Creates a 64-bit zero. */ | |
1001 | static __inline __m64 | |
1002 | _mm_setzero_si64 (void) | |
1003 | { | |
1004 | return __builtin_arm_wzero (); | |
1005 | } | |
1006 | ||
/* Set and Get arbitrary iWMMXt Control registers.
   Note only registers 0-3 and 8-11 are currently defined,
   the rest are reserved.  */

/* Write __value to the control register selected by __regno.
   Registers 0-3 are written with a TMCR instruction, registers 8-11
   through the setwcgr builtins; any other number is silently
   ignored.  */
static __inline void
_mm_setwcx (const int __value, const int __regno)
{
  switch (__regno)
    {
    case 0:
      __asm __volatile ("tmcr wcid, %0" :: "r"(__value));
      return;
    case 1:
      __asm __volatile ("tmcr wcon, %0" :: "r"(__value));
      return;
    case 2:
      __asm __volatile ("tmcr wcssf, %0" :: "r"(__value));
      return;
    case 3:
      __asm __volatile ("tmcr wcasf, %0" :: "r"(__value));
      return;
    case 8:
      __builtin_arm_setwcgr0 (__value);
      return;
    case 9:
      __builtin_arm_setwcgr1 (__value);
      return;
    case 10:
      __builtin_arm_setwcgr2 (__value);
      return;
    case 11:
      __builtin_arm_setwcgr3 (__value);
      return;
    default:
      /* Reserved register number: nothing is written.  */
      return;
    }
}
1044 | ||
/* Read the control register selected by __regno.  Registers 0-3 are
   read with a TMRC instruction, registers 8-11 through the getwcgr
   builtins.  For any other (reserved) register number 0 is returned;
   __value is initialized so that this path never reads an
   indeterminate object.  */
static __inline int
_mm_getwcx (const int __regno)
{
  int __value = 0;

  switch (__regno)
    {
    case 0:
      __asm __volatile ("tmrc %0, wcid" : "=r"(__value));
      break;
    case 1:
      __asm __volatile ("tmrc %0, wcon" : "=r"(__value));
      break;
    case 2:
      __asm __volatile ("tmrc %0, wcssf" : "=r"(__value));
      break;
    case 3:
      __asm __volatile ("tmrc %0, wcasf" : "=r"(__value));
      break;
    case 8:
      return __builtin_arm_getwcgr0 ();
    case 9:
      return __builtin_arm_getwcgr1 ();
    case 10:
      return __builtin_arm_getwcgr2 ();
    case 11:
      return __builtin_arm_getwcgr3 ();
    default:
      /* Reserved register: fall through with the zero default.  */
      break;
    }
  return __value;
}
1076 | ||
1077 | /* Creates a vector of two 32-bit values; I0 is least significant. */ | |
1078 | static __inline __m64 | |
1079 | _mm_set_pi32 (int __i1, int __i0) | |
1080 | { | |
8fd03515 XQ |
1081 | union |
1082 | { | |
5a9335ef | 1083 | __m64 __q; |
8fd03515 XQ |
1084 | struct |
1085 | { | |
5a9335ef NC |
1086 | unsigned int __i0; |
1087 | unsigned int __i1; | |
1088 | } __s; | |
1089 | } __u; | |
1090 | ||
1091 | __u.__s.__i0 = __i0; | |
1092 | __u.__s.__i1 = __i1; | |
1093 | ||
1094 | return __u.__q; | |
1095 | } | |
1096 | ||
1097 | /* Creates a vector of four 16-bit values; W0 is least significant. */ | |
1098 | static __inline __m64 | |
1099 | _mm_set_pi16 (short __w3, short __w2, short __w1, short __w0) | |
1100 | { | |
8fd03515 XQ |
1101 | unsigned int __i1 = (unsigned short) __w3 << 16 | (unsigned short) __w2; |
1102 | unsigned int __i0 = (unsigned short) __w1 << 16 | (unsigned short) __w0; | |
1103 | ||
5a9335ef | 1104 | return _mm_set_pi32 (__i1, __i0); |
5a9335ef NC |
1105 | } |
1106 | ||
1107 | /* Creates a vector of eight 8-bit values; B0 is least significant. */ | |
1108 | static __inline __m64 | |
1109 | _mm_set_pi8 (char __b7, char __b6, char __b5, char __b4, | |
1110 | char __b3, char __b2, char __b1, char __b0) | |
1111 | { | |
1112 | unsigned int __i1, __i0; | |
1113 | ||
1114 | __i1 = (unsigned char)__b7; | |
1115 | __i1 = __i1 << 8 | (unsigned char)__b6; | |
1116 | __i1 = __i1 << 8 | (unsigned char)__b5; | |
1117 | __i1 = __i1 << 8 | (unsigned char)__b4; | |
1118 | ||
1119 | __i0 = (unsigned char)__b3; | |
1120 | __i0 = __i0 << 8 | (unsigned char)__b2; | |
1121 | __i0 = __i0 << 8 | (unsigned char)__b1; | |
1122 | __i0 = __i0 << 8 | (unsigned char)__b0; | |
1123 | ||
1124 | return _mm_set_pi32 (__i1, __i0); | |
1125 | } | |
1126 | ||
1127 | /* Similar, but with the arguments in reverse order. */ | |
1128 | static __inline __m64 | |
1129 | _mm_setr_pi32 (int __i0, int __i1) | |
1130 | { | |
1131 | return _mm_set_pi32 (__i1, __i0); | |
1132 | } | |
1133 | ||
1134 | static __inline __m64 | |
1135 | _mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3) | |
1136 | { | |
1137 | return _mm_set_pi16 (__w3, __w2, __w1, __w0); | |
1138 | } | |
1139 | ||
1140 | static __inline __m64 | |
1141 | _mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3, | |
1142 | char __b4, char __b5, char __b6, char __b7) | |
1143 | { | |
1144 | return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0); | |
1145 | } | |
1146 | ||
1147 | /* Creates a vector of two 32-bit values, both elements containing I. */ | |
1148 | static __inline __m64 | |
1149 | _mm_set1_pi32 (int __i) | |
1150 | { | |
1151 | return _mm_set_pi32 (__i, __i); | |
1152 | } | |
1153 | ||
1154 | /* Creates a vector of four 16-bit values, all elements containing W. */ | |
1155 | static __inline __m64 | |
1156 | _mm_set1_pi16 (short __w) | |
1157 | { | |
1158 | unsigned int __i = (unsigned short)__w << 16 | (unsigned short)__w; | |
1159 | return _mm_set1_pi32 (__i); | |
1160 | } | |
1161 | ||
1162 | /* Creates a vector of four 16-bit values, all elements containing B. */ | |
1163 | static __inline __m64 | |
1164 | _mm_set1_pi8 (char __b) | |
1165 | { | |
1166 | unsigned int __w = (unsigned char)__b << 8 | (unsigned char)__b; | |
1167 | unsigned int __i = __w << 16 | __w; | |
1168 | return _mm_set1_pi32 (__i); | |
1169 | } | |
1170 | ||
8fd03515 XQ |
1171 | #ifdef __IWMMXT2__ |
1172 | static __inline __m64 | |
1173 | _mm_abs_pi8 (__m64 m1) | |
1174 | { | |
1175 | return (__m64) __builtin_arm_wabsb ((__v8qi)m1); | |
1176 | } | |
1177 | ||
1178 | static __inline __m64 | |
1179 | _mm_abs_pi16 (__m64 m1) | |
1180 | { | |
1181 | return (__m64) __builtin_arm_wabsh ((__v4hi)m1); | |
1182 | ||
1183 | } | |
1184 | ||
1185 | static __inline __m64 | |
1186 | _mm_abs_pi32 (__m64 m1) | |
1187 | { | |
1188 | return (__m64) __builtin_arm_wabsw ((__v2si)m1); | |
1189 | ||
1190 | } | |
1191 | ||
1192 | static __inline __m64 | |
1193 | _mm_addsubhx_pi16 (__m64 a, __m64 b) | |
1194 | { | |
1195 | return (__m64) __builtin_arm_waddsubhx ((__v4hi)a, (__v4hi)b); | |
1196 | } | |
1197 | ||
1198 | static __inline __m64 | |
1199 | _mm_absdiff_pu8 (__m64 a, __m64 b) | |
1200 | { | |
1201 | return (__m64) __builtin_arm_wabsdiffb ((__v8qi)a, (__v8qi)b); | |
1202 | } | |
1203 | ||
1204 | static __inline __m64 | |
1205 | _mm_absdiff_pu16 (__m64 a, __m64 b) | |
1206 | { | |
1207 | return (__m64) __builtin_arm_wabsdiffh ((__v4hi)a, (__v4hi)b); | |
1208 | } | |
1209 | ||
1210 | static __inline __m64 | |
1211 | _mm_absdiff_pu32 (__m64 a, __m64 b) | |
1212 | { | |
1213 | return (__m64) __builtin_arm_wabsdiffw ((__v2si)a, (__v2si)b); | |
1214 | } | |
1215 | ||
1216 | static __inline __m64 | |
1217 | _mm_addc_pu16 (__m64 a, __m64 b) | |
1218 | { | |
1219 | __m64 result; | |
1220 | __asm__ __volatile__ ("waddhc %0, %1, %2" : "=y" (result) : "y" (a), "y" (b)); | |
1221 | return result; | |
1222 | } | |
1223 | ||
1224 | static __inline __m64 | |
1225 | _mm_addc_pu32 (__m64 a, __m64 b) | |
1226 | { | |
1227 | __m64 result; | |
1228 | __asm__ __volatile__ ("waddwc %0, %1, %2" : "=y" (result) : "y" (a), "y" (b)); | |
1229 | return result; | |
1230 | } | |
1231 | ||
1232 | static __inline __m64 | |
1233 | _mm_avg4_pu8 (__m64 a, __m64 b) | |
1234 | { | |
1235 | return (__m64) __builtin_arm_wavg4 ((__v8qi)a, (__v8qi)b); | |
1236 | } | |
1237 | ||
1238 | static __inline __m64 | |
1239 | _mm_avg4r_pu8 (__m64 a, __m64 b) | |
1240 | { | |
1241 | return (__m64) __builtin_arm_wavg4r ((__v8qi)a, (__v8qi)b); | |
1242 | } | |
1243 | ||
1244 | static __inline __m64 | |
1245 | _mm_maddx_pi16 (__m64 a, __m64 b) | |
1246 | { | |
1247 | return (__m64) __builtin_arm_wmaddsx ((__v4hi)a, (__v4hi)b); | |
1248 | } | |
1249 | ||
1250 | static __inline __m64 | |
1251 | _mm_maddx_pu16 (__m64 a, __m64 b) | |
1252 | { | |
1253 | return (__m64) __builtin_arm_wmaddux ((__v4hi)a, (__v4hi)b); | |
1254 | } | |
1255 | ||
1256 | static __inline __m64 | |
1257 | _mm_msub_pi16 (__m64 a, __m64 b) | |
1258 | { | |
1259 | return (__m64) __builtin_arm_wmaddsn ((__v4hi)a, (__v4hi)b); | |
1260 | } | |
1261 | ||
1262 | static __inline __m64 | |
1263 | _mm_msub_pu16 (__m64 a, __m64 b) | |
1264 | { | |
1265 | return (__m64) __builtin_arm_wmaddun ((__v4hi)a, (__v4hi)b); | |
1266 | } | |
1267 | ||
1268 | static __inline __m64 | |
1269 | _mm_mulhi_pi32 (__m64 a, __m64 b) | |
1270 | { | |
1271 | return (__m64) __builtin_arm_wmulwsm ((__v2si)a, (__v2si)b); | |
1272 | } | |
1273 | ||
1274 | static __inline __m64 | |
1275 | _mm_mulhi_pu32 (__m64 a, __m64 b) | |
1276 | { | |
1277 | return (__m64) __builtin_arm_wmulwum ((__v2si)a, (__v2si)b); | |
1278 | } | |
1279 | ||
1280 | static __inline __m64 | |
1281 | _mm_mulhir_pi16 (__m64 a, __m64 b) | |
1282 | { | |
1283 | return (__m64) __builtin_arm_wmulsmr ((__v4hi)a, (__v4hi)b); | |
1284 | } | |
1285 | ||
1286 | static __inline __m64 | |
1287 | _mm_mulhir_pi32 (__m64 a, __m64 b) | |
1288 | { | |
1289 | return (__m64) __builtin_arm_wmulwsmr ((__v2si)a, (__v2si)b); | |
1290 | } | |
1291 | ||
1292 | static __inline __m64 | |
1293 | _mm_mulhir_pu16 (__m64 a, __m64 b) | |
1294 | { | |
1295 | return (__m64) __builtin_arm_wmulumr ((__v4hi)a, (__v4hi)b); | |
1296 | } | |
1297 | ||
1298 | static __inline __m64 | |
1299 | _mm_mulhir_pu32 (__m64 a, __m64 b) | |
1300 | { | |
1301 | return (__m64) __builtin_arm_wmulwumr ((__v2si)a, (__v2si)b); | |
1302 | } | |
1303 | ||
1304 | static __inline __m64 | |
1305 | _mm_mullo_pi32 (__m64 a, __m64 b) | |
1306 | { | |
1307 | return (__m64) __builtin_arm_wmulwl ((__v2si)a, (__v2si)b); | |
1308 | } | |
1309 | ||
1310 | static __inline __m64 | |
1311 | _mm_qmulm_pi16 (__m64 a, __m64 b) | |
1312 | { | |
1313 | return (__m64) __builtin_arm_wqmulm ((__v4hi)a, (__v4hi)b); | |
1314 | } | |
1315 | ||
1316 | static __inline __m64 | |
1317 | _mm_qmulm_pi32 (__m64 a, __m64 b) | |
1318 | { | |
1319 | return (__m64) __builtin_arm_wqmulwm ((__v2si)a, (__v2si)b); | |
1320 | } | |
1321 | ||
1322 | static __inline __m64 | |
1323 | _mm_qmulmr_pi16 (__m64 a, __m64 b) | |
1324 | { | |
1325 | return (__m64) __builtin_arm_wqmulmr ((__v4hi)a, (__v4hi)b); | |
1326 | } | |
1327 | ||
1328 | static __inline __m64 | |
1329 | _mm_qmulmr_pi32 (__m64 a, __m64 b) | |
1330 | { | |
1331 | return (__m64) __builtin_arm_wqmulwmr ((__v2si)a, (__v2si)b); | |
1332 | } | |
1333 | ||
1334 | static __inline __m64 | |
1335 | _mm_subaddhx_pi16 (__m64 a, __m64 b) | |
1336 | { | |
1337 | return (__m64) __builtin_arm_wsubaddhx ((__v4hi)a, (__v4hi)b); | |
1338 | } | |
1339 | ||
1340 | static __inline __m64 | |
1341 | _mm_addbhusl_pu8 (__m64 a, __m64 b) | |
1342 | { | |
1343 | return (__m64) __builtin_arm_waddbhusl ((__v4hi)a, (__v8qi)b); | |
1344 | } | |
1345 | ||
1346 | static __inline __m64 | |
1347 | _mm_addbhusm_pu8 (__m64 a, __m64 b) | |
1348 | { | |
1349 | return (__m64) __builtin_arm_waddbhusm ((__v4hi)a, (__v8qi)b); | |
1350 | } | |
1351 | ||
/* _mm_qmia{bb,bt,tb,tt}[n]_pi32 (acc, m1, m2): forward the accumulator
   (viewed as two 32-bit lanes) and two operands (viewed as four 16-bit
   lanes each) to the matching __builtin_arm_wqmia* builtin and yield
   the new accumulator value.  Every argument is evaluated exactly once,
   in order.  The temporaries use reserved names so that a caller
   variable such as `_acc' or `_m1' passed as an argument is not
   captured by the macro's own locals.  */
#define _mm_qmiabb_pi32(acc, m1, m2) \
  ({\
   __m64 __acc = (acc);\
   __m64 __m1 = (m1);\
   __m64 __m2 = (m2);\
   __acc = (__m64) __builtin_arm_wqmiabb ((__v2si)__acc, (__v4hi)__m1, (__v4hi)__m2);\
   __acc;\
   })

#define _mm_qmiabbn_pi32(acc, m1, m2) \
  ({\
   __m64 __acc = (acc);\
   __m64 __m1 = (m1);\
   __m64 __m2 = (m2);\
   __acc = (__m64) __builtin_arm_wqmiabbn ((__v2si)__acc, (__v4hi)__m1, (__v4hi)__m2);\
   __acc;\
   })

#define _mm_qmiabt_pi32(acc, m1, m2) \
  ({\
   __m64 __acc = (acc);\
   __m64 __m1 = (m1);\
   __m64 __m2 = (m2);\
   __acc = (__m64) __builtin_arm_wqmiabt ((__v2si)__acc, (__v4hi)__m1, (__v4hi)__m2);\
   __acc;\
   })

#define _mm_qmiabtn_pi32(acc, m1, m2) \
  ({\
   __m64 __acc = (acc);\
   __m64 __m1 = (m1);\
   __m64 __m2 = (m2);\
   __acc = (__m64) __builtin_arm_wqmiabtn ((__v2si)__acc, (__v4hi)__m1, (__v4hi)__m2);\
   __acc;\
   })

#define _mm_qmiatb_pi32(acc, m1, m2) \
  ({\
   __m64 __acc = (acc);\
   __m64 __m1 = (m1);\
   __m64 __m2 = (m2);\
   __acc = (__m64) __builtin_arm_wqmiatb ((__v2si)__acc, (__v4hi)__m1, (__v4hi)__m2);\
   __acc;\
   })

#define _mm_qmiatbn_pi32(acc, m1, m2) \
  ({\
   __m64 __acc = (acc);\
   __m64 __m1 = (m1);\
   __m64 __m2 = (m2);\
   __acc = (__m64) __builtin_arm_wqmiatbn ((__v2si)__acc, (__v4hi)__m1, (__v4hi)__m2);\
   __acc;\
   })

#define _mm_qmiatt_pi32(acc, m1, m2) \
  ({\
   __m64 __acc = (acc);\
   __m64 __m1 = (m1);\
   __m64 __m2 = (m2);\
   __acc = (__m64) __builtin_arm_wqmiatt ((__v2si)__acc, (__v4hi)__m1, (__v4hi)__m2);\
   __acc;\
   })

#define _mm_qmiattn_pi32(acc, m1, m2) \
  ({\
   __m64 __acc = (acc);\
   __m64 __m1 = (m1);\
   __m64 __m2 = (m2);\
   __acc = (__m64) __builtin_arm_wqmiattn ((__v2si)__acc, (__v4hi)__m1, (__v4hi)__m2);\
   __acc;\
   })
1423 | ||
/* _mm_wmia{bb,bt,tb,tt}[n]_si64 (acc, m1, m2): forward the 64-bit
   accumulator and two operands (viewed as four 16-bit lanes each) to
   the matching __builtin_arm_wmia* builtin and yield the new
   accumulator value.  Every argument is evaluated exactly once, in
   order.  The temporaries use reserved names so that a caller variable
   such as `_acc' or `_m1' passed as an argument is not captured by the
   macro's own locals.  */
#define _mm_wmiabb_si64(acc, m1, m2) \
  ({\
   __m64 __acc = (acc);\
   __m64 __m1 = (m1);\
   __m64 __m2 = (m2);\
   __acc = (__m64) __builtin_arm_wmiabb (__acc, (__v4hi)__m1, (__v4hi)__m2);\
   __acc;\
   })

#define _mm_wmiabbn_si64(acc, m1, m2) \
  ({\
   __m64 __acc = (acc);\
   __m64 __m1 = (m1);\
   __m64 __m2 = (m2);\
   __acc = (__m64) __builtin_arm_wmiabbn (__acc, (__v4hi)__m1, (__v4hi)__m2);\
   __acc;\
   })

#define _mm_wmiabt_si64(acc, m1, m2) \
  ({\
   __m64 __acc = (acc);\
   __m64 __m1 = (m1);\
   __m64 __m2 = (m2);\
   __acc = (__m64) __builtin_arm_wmiabt (__acc, (__v4hi)__m1, (__v4hi)__m2);\
   __acc;\
   })

#define _mm_wmiabtn_si64(acc, m1, m2) \
  ({\
   __m64 __acc = (acc);\
   __m64 __m1 = (m1);\
   __m64 __m2 = (m2);\
   __acc = (__m64) __builtin_arm_wmiabtn (__acc, (__v4hi)__m1, (__v4hi)__m2);\
   __acc;\
   })

#define _mm_wmiatb_si64(acc, m1, m2) \
  ({\
   __m64 __acc = (acc);\
   __m64 __m1 = (m1);\
   __m64 __m2 = (m2);\
   __acc = (__m64) __builtin_arm_wmiatb (__acc, (__v4hi)__m1, (__v4hi)__m2);\
   __acc;\
   })

#define _mm_wmiatbn_si64(acc, m1, m2) \
  ({\
   __m64 __acc = (acc);\
   __m64 __m1 = (m1);\
   __m64 __m2 = (m2);\
   __acc = (__m64) __builtin_arm_wmiatbn (__acc, (__v4hi)__m1, (__v4hi)__m2);\
   __acc;\
   })

#define _mm_wmiatt_si64(acc, m1, m2) \
  ({\
   __m64 __acc = (acc);\
   __m64 __m1 = (m1);\
   __m64 __m2 = (m2);\
   __acc = (__m64) __builtin_arm_wmiatt (__acc, (__v4hi)__m1, (__v4hi)__m2);\
   __acc;\
   })

#define _mm_wmiattn_si64(acc, m1, m2) \
  ({\
   __m64 __acc = (acc);\
   __m64 __m1 = (m1);\
   __m64 __m2 = (m2);\
   __acc = (__m64) __builtin_arm_wmiattn (__acc, (__v4hi)__m1, (__v4hi)__m2);\
   __acc;\
   })
1495 | ||
/* _mm_wmiaw{bb,bt,tb,tt}[n]_si64 (acc, m1, m2): forward the 64-bit
   accumulator and two operands (viewed as two 32-bit lanes each) to
   the matching __builtin_arm_wmiaw* builtin and yield the new
   accumulator value.  Every argument is evaluated exactly once, in
   order.  The temporaries use reserved names so that a caller variable
   such as `_acc' or `_m1' passed as an argument is not captured by the
   macro's own locals.  */
#define _mm_wmiawbb_si64(acc, m1, m2) \
  ({\
   __m64 __acc = (acc);\
   __m64 __m1 = (m1);\
   __m64 __m2 = (m2);\
   __acc = (__m64) __builtin_arm_wmiawbb (__acc, (__v2si)__m1, (__v2si)__m2);\
   __acc;\
   })

#define _mm_wmiawbbn_si64(acc, m1, m2) \
  ({\
   __m64 __acc = (acc);\
   __m64 __m1 = (m1);\
   __m64 __m2 = (m2);\
   __acc = (__m64) __builtin_arm_wmiawbbn (__acc, (__v2si)__m1, (__v2si)__m2);\
   __acc;\
   })

#define _mm_wmiawbt_si64(acc, m1, m2) \
  ({\
   __m64 __acc = (acc);\
   __m64 __m1 = (m1);\
   __m64 __m2 = (m2);\
   __acc = (__m64) __builtin_arm_wmiawbt (__acc, (__v2si)__m1, (__v2si)__m2);\
   __acc;\
   })

#define _mm_wmiawbtn_si64(acc, m1, m2) \
  ({\
   __m64 __acc = (acc);\
   __m64 __m1 = (m1);\
   __m64 __m2 = (m2);\
   __acc = (__m64) __builtin_arm_wmiawbtn (__acc, (__v2si)__m1, (__v2si)__m2);\
   __acc;\
   })

#define _mm_wmiawtb_si64(acc, m1, m2) \
  ({\
   __m64 __acc = (acc);\
   __m64 __m1 = (m1);\
   __m64 __m2 = (m2);\
   __acc = (__m64) __builtin_arm_wmiawtb (__acc, (__v2si)__m1, (__v2si)__m2);\
   __acc;\
   })

#define _mm_wmiawtbn_si64(acc, m1, m2) \
  ({\
   __m64 __acc = (acc);\
   __m64 __m1 = (m1);\
   __m64 __m2 = (m2);\
   __acc = (__m64) __builtin_arm_wmiawtbn (__acc, (__v2si)__m1, (__v2si)__m2);\
   __acc;\
   })

#define _mm_wmiawtt_si64(acc, m1, m2) \
  ({\
   __m64 __acc = (acc);\
   __m64 __m1 = (m1);\
   __m64 __m2 = (m2);\
   __acc = (__m64) __builtin_arm_wmiawtt (__acc, (__v2si)__m1, (__v2si)__m2);\
   __acc;\
   })

#define _mm_wmiawttn_si64(acc, m1, m2) \
  ({\
   __m64 __acc = (acc);\
   __m64 __m1 = (m1);\
   __m64 __m2 = (m2);\
   __acc = (__m64) __builtin_arm_wmiawttn (__acc, (__v2si)__m1, (__v2si)__m2);\
   __acc;\
   })
1567 | ||
/* Wrapper for __builtin_arm_wmerge on the 64-bit values a and b.  The
   third argument should be an immediate.  Written as a plain
   expression with no local temporary, so a caller variable named
   `result' passed as an argument cannot be shadowed by the macro.  */
#define _mm_merge_si64(a, b, n) \
  ((__m64) __builtin_arm_wmerge ((__m64) (a), (__m64) (b), (n)))
1575 | #endif /* __IWMMXT2__ */ | |
1576 | ||
1577 | static __inline __m64 | |
1578 | _mm_alignr0_si64 (__m64 a, __m64 b) | |
1579 | { | |
1580 | return (__m64) __builtin_arm_walignr0 ((__v8qi) a, (__v8qi) b); | |
1581 | } | |
1582 | ||
1583 | static __inline __m64 | |
1584 | _mm_alignr1_si64 (__m64 a, __m64 b) | |
1585 | { | |
1586 | return (__m64) __builtin_arm_walignr1 ((__v8qi) a, (__v8qi) b); | |
1587 | } | |
1588 | ||
1589 | static __inline __m64 | |
1590 | _mm_alignr2_si64 (__m64 a, __m64 b) | |
1591 | { | |
1592 | return (__m64) __builtin_arm_walignr2 ((__v8qi) a, (__v8qi) b); | |
1593 | } | |
1594 | ||
1595 | static __inline __m64 | |
1596 | _mm_alignr3_si64 (__m64 a, __m64 b) | |
1597 | { | |
1598 | return (__m64) __builtin_arm_walignr3 ((__v8qi) a, (__v8qi) b); | |
1599 | } | |
1600 | ||
/* Emit `tandcb r15'.  Takes no arguments; declared with (void) so the
   definition is a real prototype in C and stray arguments are
   diagnosed.  */
static __inline void
_mm_tandcb (void)
{
  __asm __volatile ("tandcb r15");
}
1606 | ||
/* Emit `tandch r15'.  Takes no arguments; declared with (void) so the
   definition is a real prototype in C and stray arguments are
   diagnosed.  */
static __inline void
_mm_tandch (void)
{
  __asm __volatile ("tandch r15");
}
1612 | ||
/* Emit `tandcw r15'.  Takes no arguments; declared with (void) so the
   definition is a real prototype in C and stray arguments are
   diagnosed.  */
static __inline void
_mm_tandcw (void)
{
  __asm __volatile ("tandcw r15");
}
1618 | ||
/* Emit `textrcb r15, n'.  n is bound with the "i" constraint, so it
   has to be an immediate.  The macro yields no value.  */
#define _mm_textrcb(n) \
  ({ __asm__ __volatile__ ("textrcb r15, %0" : : "i" (n)); })
1624 | ||
/* Emit `textrch r15, n'.  n is bound with the "i" constraint, so it
   has to be an immediate.  The macro yields no value.  */
#define _mm_textrch(n) \
  ({ __asm__ __volatile__ ("textrch r15, %0" : : "i" (n)); })
1630 | ||
/* Emit `textrcw r15, n'.  n is bound with the "i" constraint, so it
   has to be an immediate.  The macro yields no value.  */
#define _mm_textrcw(n) \
  ({ __asm__ __volatile__ ("textrcw r15, %0" : : "i" (n)); })
1636 | ||
/* Emit `torcb r15'.  Takes no arguments; declared with (void) so the
   definition is a real prototype in C and stray arguments are
   diagnosed.  */
static __inline void
_mm_torcb (void)
{
  __asm __volatile ("torcb r15");
}
1642 | ||
/* Emit `torch r15'.  Takes no arguments; declared with (void) so the
   definition is a real prototype in C and stray arguments are
   diagnosed.  */
static __inline void
_mm_torch (void)
{
  __asm __volatile ("torch r15");
}
1648 | ||
/* Emit `torcw r15'.  Takes no arguments; declared with (void) so the
   definition is a real prototype in C and stray arguments are
   diagnosed.  */
static __inline void
_mm_torcw (void)
{
  __asm __volatile ("torcw r15");
}
1654 | ||
1655 | #ifdef __IWMMXT2__ | |
/* Emit `torvscb r15'.  Takes no arguments; declared with (void) so the
   definition is a real prototype in C and stray arguments are
   diagnosed.  */
static __inline void
_mm_torvscb (void)
{
  __asm __volatile ("torvscb r15");
}
1661 | ||
/* Emit `torvsch r15'.  Takes no arguments; declared with (void) so the
   definition is a real prototype in C and stray arguments are
   diagnosed.  */
static __inline void
_mm_torvsch (void)
{
  __asm __volatile ("torvsch r15");
}
1667 | ||
/* Emit `torvscw r15'.  Takes no arguments; declared with (void) so the
   definition is a real prototype in C and stray arguments are
   diagnosed.  */
static __inline void
_mm_torvscw (void)
{
  __asm __volatile ("torvscw r15");
}
220e70df | 1673 | #endif /* __IWMMXT2__ */ |
8fd03515 XQ |
1674 | |
1675 | static __inline __m64 | |
1676 | _mm_tbcst_pi8 (int value) | |
1677 | { | |
1678 | return (__m64) __builtin_arm_tbcstb ((signed char) value); | |
1679 | } | |
1680 | ||
1681 | static __inline __m64 | |
1682 | _mm_tbcst_pi16 (int value) | |
1683 | { | |
1684 | return (__m64) __builtin_arm_tbcsth ((short) value); | |
1685 | } | |
1686 | ||
5a9335ef | 1687 | static __inline __m64 |
8fd03515 | 1688 | _mm_tbcst_pi32 (int value) |
5a9335ef | 1689 | { |
8fd03515 | 1690 | return (__m64) __builtin_arm_tbcstw (value); |
5a9335ef NC |
1691 | } |
1692 | ||
/* Mnemonic-style aliases.  Each name on the left simply expands to the
   _mm_ intrinsic on the right, which is expected to be provided by
   this header.  */
#define _m_empty _mm_empty
#define _m_packsswb _mm_packs_pi16
#define _m_packssdw _mm_packs_pi32
#define _m_packuswb _mm_packs_pu16
#define _m_packusdw _mm_packs_pu32
#define _m_packssqd _mm_packs_pi64
#define _m_packusqd _mm_packs_pu64
#define _mm_packs_si64 _mm_packs_pi64
#define _mm_packs_su64 _mm_packs_pu64
#define _m_punpckhbw _mm_unpackhi_pi8
#define _m_punpckhwd _mm_unpackhi_pi16
#define _m_punpckhdq _mm_unpackhi_pi32
#define _m_punpcklbw _mm_unpacklo_pi8
#define _m_punpcklwd _mm_unpacklo_pi16
#define _m_punpckldq _mm_unpacklo_pi32
#define _m_punpckehsbw _mm_unpackeh_pi8
#define _m_punpckehswd _mm_unpackeh_pi16
#define _m_punpckehsdq _mm_unpackeh_pi32
#define _m_punpckehubw _mm_unpackeh_pu8
#define _m_punpckehuwd _mm_unpackeh_pu16
#define _m_punpckehudq _mm_unpackeh_pu32
#define _m_punpckelsbw _mm_unpackel_pi8
#define _m_punpckelswd _mm_unpackel_pi16
#define _m_punpckelsdq _mm_unpackel_pi32
#define _m_punpckelubw _mm_unpackel_pu8
#define _m_punpckeluwd _mm_unpackel_pu16
#define _m_punpckeludq _mm_unpackel_pu32
#define _m_paddb _mm_add_pi8
#define _m_paddw _mm_add_pi16
#define _m_paddd _mm_add_pi32
#define _m_paddsb _mm_adds_pi8
#define _m_paddsw _mm_adds_pi16
#define _m_paddsd _mm_adds_pi32
#define _m_paddusb _mm_adds_pu8
#define _m_paddusw _mm_adds_pu16
#define _m_paddusd _mm_adds_pu32
#define _m_psubb _mm_sub_pi8
#define _m_psubw _mm_sub_pi16
#define _m_psubd _mm_sub_pi32
#define _m_psubsb _mm_subs_pi8
#define _m_psubsw _mm_subs_pi16
/* _m_psubsd follows the _m_paddsb/_m_paddsw/_m_paddsd naming pattern.
   _m_psubuw is the historical spelling of the same alias and is kept
   for existing users.  */
#define _m_psubsd _mm_subs_pi32
#define _m_psubuw _mm_subs_pi32
#define _m_psubusb _mm_subs_pu8
#define _m_psubusw _mm_subs_pu16
#define _m_psubusd _mm_subs_pu32
#define _m_pmaddwd _mm_madd_pi16
#define _m_pmadduwd _mm_madd_pu16
#define _m_pmulhw _mm_mulhi_pi16
#define _m_pmulhuw _mm_mulhi_pu16
#define _m_pmullw _mm_mullo_pi16
#define _m_pmacsw _mm_mac_pi16
#define _m_pmacuw _mm_mac_pu16
#define _m_pmacszw _mm_macz_pi16
#define _m_pmacuzw _mm_macz_pu16
#define _m_paccb _mm_acc_pu8
#define _m_paccw _mm_acc_pu16
#define _m_paccd _mm_acc_pu32
#define _m_pmia _mm_mia_si64
#define _m_pmiaph _mm_miaph_si64
#define _m_pmiabb _mm_miabb_si64
#define _m_pmiabt _mm_miabt_si64
#define _m_pmiatb _mm_miatb_si64
#define _m_pmiatt _mm_miatt_si64
#define _m_psllw _mm_sll_pi16
#define _m_psllwi _mm_slli_pi16
#define _m_pslld _mm_sll_pi32
#define _m_pslldi _mm_slli_pi32
#define _m_psllq _mm_sll_si64
#define _m_psllqi _mm_slli_si64
#define _m_psraw _mm_sra_pi16
#define _m_psrawi _mm_srai_pi16
#define _m_psrad _mm_sra_pi32
#define _m_psradi _mm_srai_pi32
#define _m_psraq _mm_sra_si64
#define _m_psraqi _mm_srai_si64
#define _m_psrlw _mm_srl_pi16
#define _m_psrlwi _mm_srli_pi16
#define _m_psrld _mm_srl_pi32
#define _m_psrldi _mm_srli_pi32
#define _m_psrlq _mm_srl_si64
#define _m_psrlqi _mm_srli_si64
#define _m_prorw _mm_ror_pi16
#define _m_prorwi _mm_rori_pi16
#define _m_prord _mm_ror_pi32
#define _m_prordi _mm_rori_pi32
#define _m_prorq _mm_ror_si64
#define _m_prorqi _mm_rori_si64
#define _m_pand _mm_and_si64
#define _m_pandn _mm_andnot_si64
#define _m_por _mm_or_si64
#define _m_pxor _mm_xor_si64
#define _m_pcmpeqb _mm_cmpeq_pi8
#define _m_pcmpeqw _mm_cmpeq_pi16
#define _m_pcmpeqd _mm_cmpeq_pi32
#define _m_pcmpgtb _mm_cmpgt_pi8
#define _m_pcmpgtub _mm_cmpgt_pu8
#define _m_pcmpgtw _mm_cmpgt_pi16
#define _m_pcmpgtuw _mm_cmpgt_pu16
#define _m_pcmpgtd _mm_cmpgt_pi32
#define _m_pcmpgtud _mm_cmpgt_pu32
#define _m_pextrb _mm_extract_pi8
#define _m_pextrw _mm_extract_pi16
#define _m_pextrd _mm_extract_pi32
#define _m_pextrub _mm_extract_pu8
#define _m_pextruw _mm_extract_pu16
#define _m_pextrud _mm_extract_pu32
#define _m_pinsrb _mm_insert_pi8
#define _m_pinsrw _mm_insert_pi16
#define _m_pinsrd _mm_insert_pi32
#define _m_pmaxsb _mm_max_pi8
#define _m_pmaxsw _mm_max_pi16
#define _m_pmaxsd _mm_max_pi32
#define _m_pmaxub _mm_max_pu8
#define _m_pmaxuw _mm_max_pu16
#define _m_pmaxud _mm_max_pu32
#define _m_pminsb _mm_min_pi8
#define _m_pminsw _mm_min_pi16
#define _m_pminsd _mm_min_pi32
#define _m_pminub _mm_min_pu8
#define _m_pminuw _mm_min_pu16
#define _m_pminud _mm_min_pu32
#define _m_pmovmskb _mm_movemask_pi8
#define _m_pmovmskw _mm_movemask_pi16
#define _m_pmovmskd _mm_movemask_pi32
#define _m_pshufw _mm_shuffle_pi16
#define _m_pavgb _mm_avg_pu8
#define _m_pavgw _mm_avg_pu16
#define _m_pavg2b _mm_avg2_pu8
#define _m_pavg2w _mm_avg2_pu16
#define _m_psadbw _mm_sad_pu8
#define _m_psadwd _mm_sad_pu16
#define _m_psadzbw _mm_sadz_pu8
#define _m_psadzwd _mm_sadz_pu16
#define _m_paligniq _mm_align_si64
#define _m_cvt_si2pi _mm_cvtsi64_m64
#define _m_cvt_pi2si _mm_cvtm64_si64
#define _m_from_int _mm_cvtsi32_si64
#define _m_to_int _mm_cvtsi64_si32
5a9335ef | 1831 | |
8fd03515 XQ |
1832 | #if defined __cplusplus |
1833 | }; /* End "C" */ | |
1834 | #endif /* __cplusplus */ | |
220e70df | 1835 | |
5a9335ef | 1836 | #endif /* _MMINTRIN_H_INCLUDED */ |