]>
Commit | Line | Data |
---|---|---|
99dee823 | 1 | /* Copyright (C) 2011-2021 Free Software Foundation, Inc. |
b76f1550 SM |
2 | |
3 | This file is part of GCC. | |
4 | ||
5 | GCC is free software; you can redistribute it and/or modify | |
6 | it under the terms of the GNU General Public License as published by | |
7 | the Free Software Foundation; either version 3, or (at your option) | |
8 | any later version. | |
9 | ||
10 | GCC is distributed in the hope that it will be useful, | |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 | GNU General Public License for more details. | |
14 | ||
15 | Under Section 7 of GPL version 3, you are granted additional | |
16 | permissions described in the GCC Runtime Library Exception, version | |
17 | 3.1, as published by the Free Software Foundation. | |
18 | ||
19 | You should have received a copy of the GNU General Public License and | |
20 | a copy of the GCC Runtime Library Exception along with this program; | |
21 | see the files COPYING3 and COPYING.RUNTIME respectively. If not, see | |
22 | <http://www.gnu.org/licenses/>. */ | |
23 | ||
24 | /* This header is distributed to simplify porting x86_64 code that | |
25 | makes explicit use of Intel intrinsics to powerpc64le. | |
26 | It is the user's responsibility to determine if the results are | |
27 | acceptable and make additional changes as necessary. | |
28 | Note that much code that uses Intel intrinsics can be rewritten in | |
29 | standard C or GNU C extensions, which are more portable and better | |
30 | optimized across multiple targets. */ | |
31 | ||
32 | #if !defined _X86INTRIN_H_INCLUDED | |
33 | # error "Never use <bmi2intrin.h> directly; include <x86intrin.h> instead." | |
34 | #endif | |
35 | ||
36 | #ifndef _BMI2INTRIN_H_INCLUDED | |
37 | #define _BMI2INTRIN_H_INCLUDED | |
38 | ||
/* Emulate the x86 BZHI instruction for 32-bit operands: return __X with
   every bit at position INDEX and above cleared, where INDEX is the low
   8 bits of __Y (the x86 instruction ignores the remaining bits).

   An INDEX of 0 yields 0 and an INDEX of 32 or more returns __X
   unchanged.  Those cases are handled with an explicit test and a mask
   rather than a pair of shifts by (32 - __Y), because that shift count
   equals the operand width for __Y == 0 and wraps around for __Y > 32,
   both of which are undefined behavior in C.  */
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_bzhi_u32 (unsigned int __X, unsigned int __Y)
{
  const unsigned int __index = __Y & 0xFFU;

  if (__index >= 32)
    return __X;
  /* __index is in 0..31 here, so the shift is always well defined;
     for __index == 0 the mask is 0.  */
  return __X & ((1U << __index) - 1U);
}
45 | ||
/* Emulate the x86 MULX instruction for 32-bit operands: form the full
   64-bit unsigned product of __X and __Y, store its upper 32 bits
   through __P and return its lower 32 bits.  */
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mulx_u32 (unsigned int __X, unsigned int __Y, unsigned int *__P)
{
  /* Widen one operand first so the multiplication is carried out in
     64 bits and cannot wrap.  */
  const unsigned long long __wide_x = __X;
  const unsigned long long __product = __wide_x * __Y;
  const unsigned int __low = (unsigned int) __product;

  *__P = (unsigned int) (__product >> 32);
  return __low;
}
54 | ||
55 | #ifdef __PPC64__ | |
/* Emulate the x86 BZHI instruction for 64-bit operands: return __X with
   every bit at position INDEX and above cleared, where INDEX is the low
   8 bits of __Y (the x86 instruction ignores the remaining bits).

   An INDEX of 0 yields 0 and an INDEX of 64 or more returns __X
   unchanged.  Those cases are handled with an explicit test and a mask
   rather than a pair of shifts by (64 - __Y), because that shift count
   equals the operand width for __Y == 0 and wraps around for __Y > 64,
   both of which are undefined behavior in C.  */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_bzhi_u64 (unsigned long long __X, unsigned long long __Y)
{
  const unsigned long long __index = __Y & 0xFFULL;

  if (__index >= 64)
    return __X;
  /* __index is in 0..63 here, so the shift is always well defined;
     for __index == 0 the mask is 0.  */
  return __X & ((1ULL << __index) - 1ULL);
}
62 | ||
/* Emulate the x86 MULX instruction for 64-bit operands: form the full
   128-bit unsigned product of __X and __Y, store its upper 64 bits
   through __P and return its lower 64 bits.

   The intermediate uses unsigned __int128, which requires a 64-bit
   base target.  */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mulx_u64 (unsigned long long __X, unsigned long long __Y,
           unsigned long long *__P)
{
  /* Widen one operand first so the multiplication is carried out in
     128 bits and cannot wrap.  */
  const unsigned __int128 __wide_x = __X;
  const unsigned __int128 __product = __wide_x * __Y;
  const unsigned long long __low = (unsigned long long) __product;

  *__P = (unsigned long long) (__product >> 64);
  return __low;
}
73 | ||
74 | #ifdef _ARCH_PWR7 | |
75 | /* popcount and bpermd require power7 minimum. */ | |
/* Emulate the x86 PDEP instruction (parallel bit deposit): scatter the
   low-order bits of __X to the bit positions that are set in __M.  The
   least significant bit of __X lands on the lowest set position of
   __M, the next bit on the next set position, and so on.  Result bits
   whose mask bit is clear are zero.  */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pdep_u64 (unsigned long long __X, unsigned long long __M)
{
  const unsigned long __top_bit = 0x8000000000000000UL;
  unsigned long __deposited = 0x0UL;
  /* Mask bits still to be handled; the highest one is cleared on each
     iteration.  */
  unsigned long __pending = __M;
  /* The pop-count of the mask is the number of source bits consumed.
     64 minus that count, less the number of leading zeros of the mask
     bit being handled, is the left shift that moves the matching
     source bit onto that mask bit.  */
  unsigned long __adjust = 64 - __builtin_popcountl (__M);

  /* One iteration per '1' bit in the mask, from the most significant
     mask bit down to the least significant one.  */
  while (__pending != 0)
    {
      const unsigned long __lead = __builtin_clzl (__pending);
      const unsigned long __select = __top_bit >> __lead;

      __deposited |= (__X << (__adjust - __lead)) & __select;
      __pending ^= __select;
      /* The next (lower) mask bit takes the next lower source bit.  */
      __adjust++;
    }
  return __deposited;
}
103 | ||
/* Emulate the x86 PEXT instruction (parallel bit extract): gather the
   bits of __X found at the positions set in __M and pack them, in
   order, into the low-order bits of the result.  The bit of __X under
   the lowest set bit of __M becomes bit 0 of the result.  */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pext_u64 (unsigned long long __X, unsigned long long __M)
{
  const unsigned long __top_bit = 0x8000000000000000UL;
  /* Initial bit permute control.  Each byte is a bit index into __X;
     0x40 (64) lies outside the 0..63 index range, so (per the Power
     ISA description of bpermd) such an entry contributes a zero
     bit.  */
  unsigned long __control = 0x4040404040404040UL;
  /* Mask bits still to be handled; the highest one is cleared on each
     iteration.  */
  unsigned long __pending = __M;
  unsigned long __lead;
  unsigned long __extracted;

  /* If the mask is constant and selects 8 bits or fewer, a single bit
     permute instruction can do the whole extraction.  */
  if (__builtin_constant_p (__M) && (__builtin_popcountl (__M) <= 8))
    {
      /* With a constant mask the pop-count is constant too, so this
         loop can be evaluated at compile time, leaving a constant bit
         permute control word.  Each iteration appends the index of
         the next mask bit (counted from the most significant end) as
         the new low byte of the control word.  */
      for (long __i = 0; __i < __builtin_popcountl (__M); __i++)
        {
          __lead = __builtin_clzl (__pending);
          __control = (__control << 8) | __lead;
          __pending ^= (__top_bit >> __lead);
        }
      __extracted = __builtin_bpermd (__control, __X);
    }
  else
    {
      /* 64 minus the pop-count of the mask, less the number of leading
         zeros of the mask bit being handled, is the right shift that
         moves the selected source bit to its place in the result.  */
      unsigned long __adjust = 64 - __builtin_popcountl (__M);

      __extracted = 0;
      /* A for loop could be used here, but combined with
         -funroll-loops it can expand to a lot of code.  The while
         loop avoids unrolling, and the compiler commons the xor that
         clears the mask bit with the (__pending != 0) test, giving a
         more compact loop setup and body.  */
      while (__pending != 0)
        {
          unsigned long __select;

          __lead = __builtin_clzl (__pending);
          __select = __top_bit >> __lead;
          __extracted |= (__X & __select) >> (__adjust - __lead);
          __pending ^= __select;
          __adjust++;
        }
    }
  return __extracted;
}
150 | ||
/* The 32-bit pdep/pext variants are implemented on top of the 64-bit
   ones and therefore share their dependency on _ARCH_PWR7.  */

/* 32-bit parallel bit deposit: deposit the low-order bits of __X at
   the bit positions set in the mask __Y.  Both operands are
   zero-extended to 64 bits, handed to _pdep_u64, and the result is
   narrowed back to 32 bits.  */
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pdep_u32 (unsigned int __X, unsigned int __Y)
{
  const unsigned long long __wide_result
    = _pdep_u64 ((unsigned long long) __X, (unsigned long long) __Y);

  return (unsigned int) __wide_result;
}
159 | ||
/* 32-bit parallel bit extract: gather the bits of __X found at the
   positions set in the mask __Y into the low-order bits of the result.
   Both operands are zero-extended to 64 bits, handed to _pext_u64, and
   the result is narrowed back to 32 bits.  */
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pext_u32 (unsigned int __X, unsigned int __Y)
{
  const unsigned long long __wide_result
    = _pext_u64 ((unsigned long long) __X, (unsigned long long) __Y);

  return (unsigned int) __wide_result;
}
166 | #endif /* _ARCH_PWR7 */ | |
167 | #endif /* __PPC64__ */ | |
168 | ||
169 | #endif /* _BMI2INTRIN_H_INCLUDED */ |