/* Copyright (C) 2019-2023 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */
24 | #ifndef _IMMINTRIN_H_INCLUDED | |
25 | #error "Never use <avx512bf16vlintrin.h> directly; include <immintrin.h> instead." | |
26 | #endif | |
27 | ||
28 | #ifndef _AVX512BF16VLINTRIN_H_INCLUDED | |
29 | #define _AVX512BF16VLINTRIN_H_INCLUDED | |
30 | ||
31 | #if !defined(__AVX512VL__) || !defined(__AVX512BF16__) | |
32 | #pragma GCC push_options | |
33 | #pragma GCC target("avx512bf16,avx512vl") | |
34 | #define __DISABLE_AVX512BF16VL__ | |
35 | #endif /* __AVX512BF16__ */ | |
36 | ||
37 | /* Internal data types for implementing the intrinsics. */ | |
87235f1e | 38 | typedef __bf16 __v16bf __attribute__ ((__vector_size__ (32))); |
39 | typedef __bf16 __v8bf __attribute__ ((__vector_size__ (16))); | |
4f0e90fa HL |
40 | |
41 | /* The Intel API is flexible enough that we must allow aliasing with other | |
42 | vector types, and their scalar components. */ | |
87235f1e | 43 | typedef __bf16 __m256bh __attribute__ ((__vector_size__ (32), __may_alias__)); |
44 | typedef __bf16 __m128bh __attribute__ ((__vector_size__ (16), __may_alias__)); | |
45 | ||
46 | typedef __bf16 __bfloat16; | |
4f0e90fa | 47 | |
58685b93 | 48 | #define _mm256_cvtneps_pbh(A) \ |
49 | (__m128bh) __builtin_ia32_cvtneps2bf16_v8sf (A) | |
50 | #define _mm_cvtneps_pbh(A) \ | |
51 | (__m128bh) __builtin_ia32_cvtneps2bf16_v4sf (A) | |
52 | ||
4f0e90fa HL |
53 | /* vcvtne2ps2bf16 */ |
54 | ||
55 | extern __inline __m256bh | |
56 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
57 | _mm256_cvtne2ps_pbh (__m256 __A, __m256 __B) | |
58 | { | |
87235f1e | 59 | return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16bf(__A, __B); |
4f0e90fa HL |
60 | } |
61 | ||
62 | extern __inline __m256bh | |
63 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
64 | _mm256_mask_cvtne2ps_pbh (__m256bh __A, __mmask16 __B, __m256 __C, __m256 __D) | |
65 | { | |
87235f1e | 66 | return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16bf_mask(__C, __D, __A, __B); |
4f0e90fa HL |
67 | } |
68 | ||
69 | extern __inline __m256bh | |
70 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
71 | _mm256_maskz_cvtne2ps_pbh (__mmask16 __A, __m256 __B, __m256 __C) | |
72 | { | |
87235f1e | 73 | return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16bf_maskz(__B, __C, __A); |
4f0e90fa HL |
74 | } |
75 | ||
76 | extern __inline __m128bh | |
77 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
78 | _mm_cvtne2ps_pbh (__m128 __A, __m128 __B) | |
79 | { | |
87235f1e | 80 | return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8bf(__A, __B); |
4f0e90fa HL |
81 | } |
82 | ||
83 | extern __inline __m128bh | |
84 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
85 | _mm_mask_cvtne2ps_pbh (__m128bh __A, __mmask8 __B, __m128 __C, __m128 __D) | |
86 | { | |
87235f1e | 87 | return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8bf_mask(__C, __D, __A, __B); |
4f0e90fa HL |
88 | } |
89 | ||
90 | extern __inline __m128bh | |
91 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
92 | _mm_maskz_cvtne2ps_pbh (__mmask8 __A, __m128 __B, __m128 __C) | |
93 | { | |
87235f1e | 94 | return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8bf_maskz(__B, __C, __A); |
4f0e90fa HL |
95 | } |
96 | ||
97 | /* vcvtneps2bf16 */ | |
98 | ||
4f0e90fa HL |
99 | extern __inline __m128bh |
100 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
101 | _mm256_mask_cvtneps_pbh (__m128bh __A, __mmask8 __B, __m256 __C) | |
102 | { | |
103 | return (__m128bh)__builtin_ia32_cvtneps2bf16_v8sf_mask(__C, __A, __B); | |
104 | } | |
105 | ||
106 | extern __inline __m128bh | |
107 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
108 | _mm256_maskz_cvtneps_pbh (__mmask8 __A, __m256 __B) | |
109 | { | |
110 | return (__m128bh)__builtin_ia32_cvtneps2bf16_v8sf_maskz(__B, __A); | |
111 | } | |
112 | ||
4f0e90fa HL |
113 | extern __inline __m128bh |
114 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
115 | _mm_mask_cvtneps_pbh (__m128bh __A, __mmask8 __B, __m128 __C) | |
116 | { | |
117 | return (__m128bh)__builtin_ia32_cvtneps2bf16_v4sf_mask(__C, __A, __B); | |
118 | } | |
119 | ||
120 | extern __inline __m128bh | |
121 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
122 | _mm_maskz_cvtneps_pbh (__mmask8 __A, __m128 __B) | |
123 | { | |
124 | return (__m128bh)__builtin_ia32_cvtneps2bf16_v4sf_maskz(__B, __A); | |
125 | } | |
126 | ||
127 | /* vdpbf16ps */ | |
128 | ||
129 | extern __inline __m256 | |
130 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
131 | _mm256_dpbf16_ps (__m256 __A, __m256bh __B, __m256bh __C) | |
132 | { | |
133 | return (__m256)__builtin_ia32_dpbf16ps_v8sf(__A, __B, __C); | |
134 | } | |
135 | ||
136 | extern __inline __m256 | |
137 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
138 | _mm256_mask_dpbf16_ps (__m256 __A, __mmask8 __B, __m256bh __C, __m256bh __D) | |
139 | { | |
140 | return (__m256)__builtin_ia32_dpbf16ps_v8sf_mask(__A, __C, __D, __B); | |
141 | } | |
142 | ||
143 | extern __inline __m256 | |
144 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
145 | _mm256_maskz_dpbf16_ps (__mmask8 __A, __m256 __B, __m256bh __C, __m256bh __D) | |
146 | { | |
147 | return (__m256)__builtin_ia32_dpbf16ps_v8sf_maskz(__B, __C, __D, __A); | |
148 | } | |
149 | ||
150 | extern __inline __m128 | |
151 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
152 | _mm_dpbf16_ps (__m128 __A, __m128bh __B, __m128bh __C) | |
153 | { | |
154 | return (__m128)__builtin_ia32_dpbf16ps_v4sf(__A, __B, __C); | |
155 | } | |
156 | ||
157 | extern __inline __m128 | |
158 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
159 | _mm_mask_dpbf16_ps (__m128 __A, __mmask8 __B, __m128bh __C, __m128bh __D) | |
160 | { | |
161 | return (__m128)__builtin_ia32_dpbf16ps_v4sf_mask(__A, __C, __D, __B); | |
162 | } | |
163 | ||
164 | extern __inline __m128 | |
165 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) | |
166 | _mm_maskz_dpbf16_ps (__mmask8 __A, __m128 __B, __m128bh __C, __m128bh __D) | |
167 | { | |
168 | return (__m128)__builtin_ia32_dpbf16ps_v4sf_maskz(__B, __C, __D, __A); | |
169 | } | |
170 | ||
87235f1e | 171 | extern __inline __bf16 |
61e53698 | 172 | __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) |
173 | _mm_cvtness_sbh (float __A) | |
174 | { | |
175 | __v4sf __V = {__A, 0, 0, 0}; | |
87235f1e | 176 | __v8bf __R = __builtin_ia32_cvtneps2bf16_v4sf_mask ((__v4sf)__V, |
177 | (__v8bf)_mm_undefined_si128 (), (__mmask8)-1); | |
61e53698 | 178 | return __R[0]; |
179 | } | |
180 | ||
181 | extern __inline __m128 | |
182 | __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) | |
183 | _mm_cvtpbh_ps (__m128bh __A) | |
184 | { | |
185 | return (__m128)_mm_castsi128_ps ((__m128i)_mm_slli_epi32 ( | |
186 | (__m128i)_mm_cvtepi16_epi32 ((__m128i)__A), 16)); | |
187 | } | |
188 | ||
189 | extern __inline __m256 | |
190 | __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) | |
191 | _mm256_cvtpbh_ps (__m128bh __A) | |
192 | { | |
193 | return (__m256)_mm256_castsi256_ps ((__m256i)_mm256_slli_epi32 ( | |
194 | (__m256i)_mm256_cvtepi16_epi32 ((__m128i)__A), 16)); | |
195 | } | |
196 | ||
197 | extern __inline __m128 | |
198 | __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) | |
199 | _mm_maskz_cvtpbh_ps (__mmask8 __U, __m128bh __A) | |
200 | { | |
201 | return (__m128)_mm_castsi128_ps ((__m128i)_mm_slli_epi32 ( | |
202 | (__m128i)_mm_maskz_cvtepi16_epi32 ( | |
203 | (__mmask8)__U, (__m128i)__A), 16)); | |
204 | } | |
205 | ||
206 | extern __inline __m256 | |
207 | __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) | |
208 | _mm256_maskz_cvtpbh_ps (__mmask8 __U, __m128bh __A) | |
209 | { | |
210 | return (__m256)_mm256_castsi256_ps ((__m256i)_mm256_slli_epi32 ( | |
211 | (__m256i)_mm256_maskz_cvtepi16_epi32 ( | |
212 | (__mmask8)__U, (__m128i)__A), 16)); | |
213 | } | |
214 | ||
215 | extern __inline __m128 | |
216 | __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) | |
217 | _mm_mask_cvtpbh_ps (__m128 __S, __mmask8 __U, __m128bh __A) | |
218 | { | |
219 | return (__m128)_mm_castsi128_ps ((__m128i)_mm_mask_slli_epi32 ( | |
220 | (__m128i)__S, (__mmask8)__U, (__m128i)_mm_cvtepi16_epi32 ( | |
221 | (__m128i)__A), 16)); | |
222 | } | |
223 | ||
224 | extern __inline __m256 | |
225 | __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) | |
226 | _mm256_mask_cvtpbh_ps (__m256 __S, __mmask8 __U, __m128bh __A) | |
227 | { | |
228 | return (__m256)_mm256_castsi256_ps ((__m256i)_mm256_mask_slli_epi32 ( | |
229 | (__m256i)__S, (__mmask8)__U, (__m256i)_mm256_cvtepi16_epi32 ( | |
230 | (__m128i)__A), 16)); | |
231 | } | |
232 | ||
4f0e90fa HL |
233 | #ifdef __DISABLE_AVX512BF16VL__ |
234 | #undef __DISABLE_AVX512BF16VL__ | |
235 | #pragma GCC pop_options | |
236 | #endif /* __DISABLE_AVX512BF16VL__ */ | |
237 | ||
238 | #endif /* _AVX512BF16VLINTRIN_H_INCLUDED */ |