]>
Commit | Line | Data |
---|---|---|
8d9254fc | 1 | /* Copyright (C) 2003-2020 Free Software Foundation, Inc. |
b1ddadac PC |
2 | |
3 | This file is part of GCC. | |
4 | ||
5 | GCC is free software; you can redistribute it and/or modify | |
6 | it under the terms of the GNU General Public License as published by | |
7 | the Free Software Foundation; either version 3, or (at your option) | |
8 | any later version. | |
9 | ||
10 | GCC is distributed in the hope that it will be useful, | |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 | GNU General Public License for more details. | |
14 | ||
15 | Under Section 7 of GPL version 3, you are granted additional | |
16 | permissions described in the GCC Runtime Library Exception, version | |
17 | 3.1, as published by the Free Software Foundation. | |
18 | ||
19 | You should have received a copy of the GNU General Public License and | |
20 | a copy of the GCC Runtime Library Exception along with this program; | |
21 | see the files COPYING3 and COPYING.RUNTIME respectively. If not, see | |
22 | <http://www.gnu.org/licenses/>. */ | |
23 | ||
24 | /* Implemented from the specification included in the Intel C++ Compiler | |
25 | User Guide and Reference, version 9.0. */ | |
26 | ||
27 | #ifndef NO_WARN_X86_INTRINSICS | |
28 | /* This header is distributed to simplify porting x86_64 code that | |
29 | makes explicit use of Intel intrinsics to powerpc64le. | |
30 | It is the user's responsibility to determine if the results are | |
31 | acceptable and make additional changes as necessary. | |
32 | Note that much code that uses Intel intrinsics can be rewritten in | |
33 | standard C or GNU C extensions, which are more portable and better | |
34 | optimized across multiple targets. */ | |
35 | #endif | |
36 | ||
37 | #ifndef TMMINTRIN_H_ | |
38 | #define TMMINTRIN_H_ | |
39 | ||
40 | #include <altivec.h> | |
41 | #include <assert.h> | |
42 | ||
43 | /* We need definitions from the SSE header files. */ | |
44 | #include <pmmintrin.h> | |
45 | ||
46 | extern __inline __m128i | |
47 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
48 | _mm_abs_epi16 (__m128i __A) | |
49 | { | |
50 | return (__m128i) vec_abs ((__v8hi) __A); | |
51 | } | |
52 | ||
53 | extern __inline __m128i | |
54 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
55 | _mm_abs_epi32 (__m128i __A) | |
56 | { | |
57 | return (__m128i) vec_abs ((__v4si) __A); | |
58 | } | |
59 | ||
60 | extern __inline __m128i | |
61 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
62 | _mm_abs_epi8 (__m128i __A) | |
63 | { | |
64 | return (__m128i) vec_abs ((__v16qi) __A); | |
65 | } | |
66 | ||
/* 64-bit (MMX) variant of _mm_abs_epi16: splat the operand into both
   doublewords of a full vector, take the absolute value, and return
   the low doubleword.  */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi16 (__m64 __A)
{
  __v8hi __B = (__v8hi) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}
74 | ||
/* 64-bit (MMX) variant of _mm_abs_epi32: splat the operand into both
   doublewords of a full vector, take the absolute value, and return
   the low doubleword.  */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi32 (__m64 __A)
{
  __v4si __B = (__v4si) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}
82 | ||
/* 64-bit (MMX) variant of _mm_abs_epi8: splat the operand into both
   doublewords of a full vector, take the absolute value, and return
   the low doubleword.  */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi8 (__m64 __A)
{
  __v16qi __B = (__v16qi) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}
90 | ||
/* PALIGNR: concatenate __A:__B (A in the high half) and extract the
   16 bytes starting __count bytes up from the least-significant end.
   __count >= 32 yields zero; 16 <= __count < 32 shifts only __A.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_alignr_epi8 (__m128i __A, __m128i __B, const unsigned int __count)
{
  if (__builtin_constant_p (__count) && __count < 16)
    {
      /* vec_sld needs a literal shift count, hence the
	 __builtin_constant_p guard.  It also counts bytes from the
	 big-endian left, so on little-endian reverse the inputs
	 first and reverse the result back afterwards.  */
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_reve ((__v16qu) __A);
      __B = (__m128i) vec_reve ((__v16qu) __B);
#endif
      __A = (__m128i) vec_sld ((__v16qu) __B, (__v16qu) __A, __count);
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_reve ((__v16qu) __A);
#endif
      return __A;
    }

  if (__count == 0)
    return __B;

  if (__count >= 16)
    {
      if (__count >= 32)
	{
	  /* Everything is shifted out; the result is all zeros.  */
	  const __v16qu zero = { 0 };
	  return (__m128i) zero;
	}
      else
	{
	  /* Only bytes of __A survive: shift it down by the bytes
	     (converted to bits) beyond the first 16.  */
	  const __v16qu __shift =
	    vec_splats ((unsigned char) ((__count - 16) * 8));
#ifdef __LITTLE_ENDIAN__
	  return (__m128i) vec_sro ((__v16qu) __A, __shift);
#else
	  return (__m128i) vec_slo ((__v16qu) __A, __shift);
#endif
	}
    }
  else
    {
      /* General variable case: combine the low (16 - __count) bytes
	 of __A with the high __count bytes of __B.  vec_slo/vec_sro
	 take the shift amount in bits, hence the * 8.  */
      const __v16qu __shiftA =
	vec_splats ((unsigned char) ((16 - __count) * 8));
      const __v16qu __shiftB = vec_splats ((unsigned char) (__count * 8));
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_slo ((__v16qu) __A, __shiftA);
      __B = (__m128i) vec_sro ((__v16qu) __B, __shiftB);
#else
      __A = (__m128i) vec_sro ((__v16qu) __A, __shiftA);
      __B = (__m128i) vec_slo ((__v16qu) __B, __shiftB);
#endif
      return (__m128i) vec_or ((__v16qu) __A, (__v16qu) __B);
    }
}
144 | ||
/* 64-bit PALIGNR: concatenate __A:__B into one vector, shift right by
   __count bytes, and return the low doubleword.  __count >= 16 (i.e.
   beyond both operands) yields zero.  */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_alignr_pi8 (__m64 __A, __m64 __B, unsigned int __count)
{
  if (__count < 16)
    {
      __v2du __C = { __B, __A };
      /* Shift amount is in bits (<< 3) and must sit in the byte of
	 the shift vector that vec_sro/vec_slo reads on this
	 endianness.  */
#ifdef __LITTLE_ENDIAN__
      const __v4su __shift = { __count << 3, 0, 0, 0 };
      __C = (__v2du) vec_sro ((__v16qu) __C, (__v16qu) __shift);
#else
      const __v4su __shift = { 0, 0, 0, __count << 3 };
      __C = (__v2du) vec_slo ((__v16qu) __C, (__v16qu) __shift);
#endif
      return (__m64) __C[0];
    }
  else
    {
      const __m64 __zero = { 0 };
      return __zero;
    }
}
167 | ||
168 | extern __inline __m128i | |
169 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
170 | _mm_hadd_epi16 (__m128i __A, __m128i __B) | |
171 | { | |
172 | const __v16qu __P = | |
173 | { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }; | |
174 | const __v16qu __Q = | |
175 | { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 }; | |
176 | __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P); | |
177 | __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q); | |
178 | return (__m128i) vec_add (__C, __D); | |
179 | } | |
180 | ||
181 | extern __inline __m128i | |
182 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
183 | _mm_hadd_epi32 (__m128i __A, __m128i __B) | |
184 | { | |
185 | const __v16qu __P = | |
186 | { 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 }; | |
187 | const __v16qu __Q = | |
188 | { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 }; | |
189 | __v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P); | |
190 | __v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q); | |
191 | return (__m128i) vec_add (__C, __D); | |
192 | } | |
193 | ||
/* 64-bit PHADDW: add adjacent 16-bit pairs of __A:__B.  Both halves
   of the permute patterns repeat, so each half of __C holds the full
   result; element [1] is returned.  */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pi16 (__m64 __A, __m64 __B)
{
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 };
  __v8hi __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_add (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}
208 | ||
/* 64-bit PHADDD: add adjacent 32-bit pairs of __A:__B.  The permute
   patterns repeat across both halves, so either doubleword of __C
   holds the result; element [1] is returned.  */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pi32 (__m64 __A, __m64 __B)
{
  __v4si __C = (__v4si) (__v2du) { __A, __B };
  const __v16qu __P =
    { 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11 };
  const __v16qu __Q =
    { 4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15 };
  __v4si __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_add (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}
223 | ||
/* PHADDSW: add adjacent 16-bit pairs with signed saturation.
   vec_sum4s produces the pairwise sums widened to 32 bits (so no
   intermediate overflow); vec_packs then narrows back to 16 bits
   with saturation.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadds_epi16 (__m128i __A, __m128i __B)
{
  __v4si __C = { 0 }, __D = { 0 };
  __C = vec_sum4s ((__v8hi) __A, __C);
  __D = vec_sum4s ((__v8hi) __B, __D);
  __C = (__v4si) vec_packs (__C, __D);
  return (__m128i) __C;
}
234 | ||
/* 64-bit PHADDSW: pairwise sums of __A:__B widened to 32 bits by
   vec_sum4s, then narrowed with signed saturation.  Both halves of
   the pack are identical; element [1] is returned.  */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadds_pi16 (__m64 __A, __m64 __B)
{
  const __v4si __zero = { 0 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v4si __D = vec_sum4s (__C, __zero);
  __C = vec_packs (__D, __D);
  return (__m64) ((__v2du) __C)[1];
}
245 | ||
246 | extern __inline __m128i | |
247 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
248 | _mm_hsub_epi16 (__m128i __A, __m128i __B) | |
249 | { | |
250 | const __v16qu __P = | |
251 | { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }; | |
252 | const __v16qu __Q = | |
253 | { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 }; | |
254 | __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P); | |
255 | __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q); | |
256 | return (__m128i) vec_sub (__C, __D); | |
257 | } | |
258 | ||
259 | extern __inline __m128i | |
260 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
261 | _mm_hsub_epi32 (__m128i __A, __m128i __B) | |
262 | { | |
263 | const __v16qu __P = | |
264 | { 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 }; | |
265 | const __v16qu __Q = | |
266 | { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 }; | |
9a0317e7 PC |
267 | __v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P); |
268 | __v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q); | |
b1ddadac PC |
269 | return (__m128i) vec_sub (__C, __D); |
270 | } | |
271 | ||
/* 64-bit PHSUBW: horizontal subtract of adjacent 16-bit pairs of
   __A:__B.  The repeated permute patterns put the result in both
   halves; element [1] is returned.  */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pi16 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v8hi __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_sub (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}
286 | ||
/* 64-bit PHSUBD: horizontal subtract of adjacent 32-bit pairs of
   __A:__B.  The repeated permute patterns put the result in both
   halves; element [1] is returned.  */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pi32 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    { 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11 };
  const __v16qu __Q =
    { 4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15 };
  __v4si __C = (__v4si) (__v2du) { __A, __B };
  __v4si __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_sub (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}
301 | ||
302 | extern __inline __m128i | |
303 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
304 | _mm_hsubs_epi16 (__m128i __A, __m128i __B) | |
305 | { | |
306 | const __v16qu __P = | |
307 | { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }; | |
308 | const __v16qu __Q = | |
309 | { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 }; | |
310 | __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P); | |
311 | __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q); | |
312 | return (__m128i) vec_subs (__C, __D); | |
313 | } | |
314 | ||
/* 64-bit PHSUBSW: horizontal subtract of adjacent 16-bit pairs of
   __A:__B with signed saturation (vec_subs); element [1] holds the
   result.  */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubs_pi16 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v8hi __D = vec_perm (__C, __C, __P);
  __v8hi __E = vec_perm (__C, __C, __Q);
  __C = vec_subs (__D, __E);
  return (__m64) ((__v2du) __C)[1];
}
329 | ||
/* PSHUFB: permute the bytes of __A by the low nibbles of __B.
   vec_perm only looks at the low 4 bits of each control byte (with
   __A supplied as both inputs), which matches PSHUFB's index
   truncation; bytes whose control has the sign bit set are then
   forced to zero via the select mask.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi8 (__m128i __A, __m128i __B)
{
  const __v16qi __zero = { 0 };
  __vector __bool char __select = vec_cmplt ((__v16qi) __B, __zero);
  __v16qi __C = vec_perm ((__v16qi) __A, (__v16qi) __A, (__v16qu) __B);
  return (__m128i) vec_sel (__C, __zero, __select);
}
339 | ||
/* 64-bit PSHUFB: splat both operands to full vectors, permute the
   bytes of __A by __B (negative control bytes zero the result byte),
   and return the low doubleword.  */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi8 (__m64 __A, __m64 __B)
{
  const __v16qi __zero = { 0 };
  __v16qi __C = (__v16qi) (__v2du) { __A, __A };
  __v16qi __D = (__v16qi) (__v2du) { __B, __B };
  __vector __bool char __select = vec_cmplt ((__v16qi) __D, __zero);
  __C = vec_perm ((__v16qi) __C, (__v16qi) __C, (__v16qu) __D);
  __C = vec_sel (__C, __zero, __select);
  return (__m64) ((__v2du) (__C))[0];
}
352 | ||
353 | extern __inline __m128i | |
354 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
355 | _mm_sign_epi8 (__m128i __A, __m128i __B) | |
356 | { | |
357 | const __v16qi __zero = { 0 }; | |
358 | __v16qi __selectneg = (__v16qi) vec_cmplt ((__v16qi) __B, __zero); | |
359 | __v16qi __selectpos = | |
360 | (__v16qi) vec_neg ((__v16qi) vec_cmpgt ((__v16qi) __B, __zero)); | |
361 | __v16qi __conv = vec_add (__selectneg, __selectpos); | |
362 | return (__m128i) vec_mul ((__v16qi) __A, (__v16qi) __conv); | |
363 | } | |
364 | ||
365 | extern __inline __m128i | |
366 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
367 | _mm_sign_epi16 (__m128i __A, __m128i __B) | |
368 | { | |
369 | const __v8hi __zero = { 0 }; | |
370 | __v8hi __selectneg = (__v8hi) vec_cmplt ((__v8hi) __B, __zero); | |
371 | __v8hi __selectpos = | |
372 | (__v8hi) vec_neg ((__v8hi) vec_cmpgt ((__v8hi) __B, __zero)); | |
373 | __v8hi __conv = vec_add (__selectneg, __selectpos); | |
374 | return (__m128i) vec_mul ((__v8hi) __A, (__v8hi) __conv); | |
375 | } | |
376 | ||
377 | extern __inline __m128i | |
378 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
379 | _mm_sign_epi32 (__m128i __A, __m128i __B) | |
380 | { | |
381 | const __v4si __zero = { 0 }; | |
382 | __v4si __selectneg = (__v4si) vec_cmplt ((__v4si) __B, __zero); | |
383 | __v4si __selectpos = | |
384 | (__v4si) vec_neg ((__v4si) vec_cmpgt ((__v4si) __B, __zero)); | |
385 | __v4si __conv = vec_add (__selectneg, __selectpos); | |
386 | return (__m128i) vec_mul ((__v4si) __A, (__v4si) __conv); | |
387 | } | |
388 | ||
389 | extern __inline __m64 | |
390 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
391 | _mm_sign_pi8 (__m64 __A, __m64 __B) | |
392 | { | |
393 | const __v16qi __zero = { 0 }; | |
394 | __v16qi __C = (__v16qi) (__v2du) { __A, __A }; | |
395 | __v16qi __D = (__v16qi) (__v2du) { __B, __B }; | |
396 | __C = (__v16qi) _mm_sign_epi8 ((__m128i) __C, (__m128i) __D); | |
397 | return (__m64) ((__v2du) (__C))[0]; | |
398 | } | |
399 | ||
400 | extern __inline __m64 | |
401 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
402 | _mm_sign_pi16 (__m64 __A, __m64 __B) | |
403 | { | |
404 | const __v8hi __zero = { 0 }; | |
405 | __v8hi __C = (__v8hi) (__v2du) { __A, __A }; | |
406 | __v8hi __D = (__v8hi) (__v2du) { __B, __B }; | |
407 | __C = (__v8hi) _mm_sign_epi16 ((__m128i) __C, (__m128i) __D); | |
408 | return (__m64) ((__v2du) (__C))[0]; | |
409 | } | |
410 | ||
411 | extern __inline __m64 | |
412 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
413 | _mm_sign_pi32 (__m64 __A, __m64 __B) | |
414 | { | |
415 | const __v4si __zero = { 0 }; | |
416 | __v4si __C = (__v4si) (__v2du) { __A, __A }; | |
417 | __v4si __D = (__v4si) (__v2du) { __B, __B }; | |
418 | __C = (__v4si) _mm_sign_epi32 ((__m128i) __C, (__m128i) __D); | |
419 | return (__m64) ((__v2du) (__C))[0]; | |
420 | } | |
421 | ||
/* PMADDUBSW: multiply unsigned bytes of __A by signed bytes of __B,
   then add adjacent 16-bit products with signed saturation.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddubs_epi16 (__m128i __A, __m128i __B)
{
  /* vec_unpackh/l sign-extend; masking with 0x00ff turns the
     extension of __A's bytes into a zero-extension (unsigned
     operand), while __B keeps its sign extension.  */
  __v8hi __unsigned = vec_splats ((signed short) 0x00ff);
  __v8hi __C = vec_and (vec_unpackh ((__v16qi) __A), __unsigned);
  __v8hi __D = vec_and (vec_unpackl ((__v16qi) __A), __unsigned);
  __v8hi __E = vec_unpackh ((__v16qi) __B);
  __v8hi __F = vec_unpackl ((__v16qi) __B);
  __C = vec_mul (__C, __E);
  __D = vec_mul (__D, __F);
  /* Re-interleave so adjacent products line up, then add with
     saturation.  */
  const __v16qu __odds =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __evens =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __E = vec_perm (__C, __D, __odds);
  __F = vec_perm (__C, __D, __evens);
  return (__m128i) vec_adds (__E, __F);
}
441 | ||
/* 64-bit PMADDUBSW: multiply unsigned bytes of __A by signed bytes
   of __B and add adjacent products with signed saturation; only one
   doubleword of input is processed.  */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddubs_pi16 (__m64 __A, __m64 __B)
{
  __v8hi __C = (__v8hi) (__v2du) { __A, __A };
  __C = vec_unpackl ((__v16qi) __C);
  /* Mask off the sign extension so __A's bytes act as unsigned.  */
  const __v8hi __unsigned = vec_splats ((signed short) 0x00ff);
  __C = vec_and (__C, __unsigned);
  __v8hi __D = (__v8hi) (__v2du) { __B, __B };
  __D = vec_unpackl ((__v16qi) __D);
  __D = vec_mul (__C, __D);
  /* Pair up adjacent products and add with saturation.  */
  const __v16qu __odds =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __evens =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __C = vec_perm (__D, __D, __odds);
  __D = vec_perm (__D, __D, __evens);
  __C = vec_adds (__C, __D);
  return (__m64) ((__v2du) (__C))[0];
}
462 | ||
/* PMULHRSW: multiply 16-bit elements to a 32-bit product, shift
   right 14, add 1 to round, shift right 1, and keep bits 15:0 —
   i.e. round((a * b) >> 15) per element.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhrs_epi16 (__m128i __A, __m128i __B)
{
  /* Widen each half to 32 bits and multiply exactly.  */
  __v4si __C = vec_unpackh ((__v8hi) __A);
  __v4si __D = vec_unpackh ((__v8hi) __B);
  __C = vec_mul (__C, __D);
  __D = vec_unpackl ((__v8hi) __A);
  __v4si __E = vec_unpackl ((__v8hi) __B);
  __D = vec_mul (__D, __E);
  /* >> 14, + 1, >> 1 implements the round-to-nearest >> 15.  */
  const __v4su __shift = vec_splats ((unsigned int) 14);
  __C = vec_sr (__C, __shift);
  __D = vec_sr (__D, __shift);
  const __v4si __ones = vec_splats ((signed int) 1);
  __C = vec_add (__C, __ones);
  __C = vec_sr (__C, (__v4su) __ones);
  __D = vec_add (__D, __ones);
  __D = vec_sr (__D, (__v4su) __ones);
  /* Narrow both halves back to 16 bits.  */
  return (__m128i) vec_pack (__C, __D);
}
483 | ||
/* 64-bit PMULHRSW: round((a * b) >> 15) for each 16-bit element of
   one doubleword.  Only __C carries real results; the unrounded __D
   lane is packed into the half of __E that is discarded when element
   [0] is returned.  */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhrs_pi16 (__m64 __A, __m64 __B)
{
  __v4si __C = (__v4si) (__v2du) { __A, __A };
  __C = vec_unpackh ((__v8hi) __C);
  __v4si __D = (__v4si) (__v2du) { __B, __B };
  __D = vec_unpackh ((__v8hi) __D);
  __C = vec_mul (__C, __D);
  /* >> 14, + 1, >> 1 implements the round-to-nearest >> 15.  */
  const __v4su __shift = vec_splats ((unsigned int) 14);
  __C = vec_sr (__C, __shift);
  const __v4si __ones = vec_splats ((signed int) 1);
  __C = vec_add (__C, __ones);
  __C = vec_sr (__C, (__v4su) __ones);
  __v8hi __E = vec_pack (__C, __D);
  return (__m64) ((__v2du) (__E))[0];
}
501 | ||
502 | #endif |