/* Macro library used to help during conversion of scalar math functions to
   vectorized SIMD equivalents on AMD GCN.

   Copyright (C) 2023-2024 Free Software Foundation, Inc.
   Contributed by Siemens.

   This file is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by the
   Free Software Foundation; either version 3, or (at your option) any
   later version.

   This file is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */
/* Union overlaying all the vector types used by this library, so the same
   register contents can be reinterpreted as a different element type or
   element count without any conversion instructions.  The member naming
   convention is t_<mode> for GCC vector mode <mode> (sf = float,
   df = double, qi/hi/si/di = 8/16/32/64-bit integer; the leading vN is
   the lane count).  */
typedef union {
  v2sf t_v2sf;
  v4sf t_v4sf;
  v8sf t_v8sf;
  v16sf t_v16sf;
  v32sf t_v32sf;
  v64sf t_v64sf;

  v2df t_v2df;
  v4df t_v4df;
  v8df t_v8df;
  v16df t_v16df;
  v32df t_v32df;
  v64df t_v64df;

  v64qi t_v64qi;
  v64hi t_v64hi;

  v2si t_v2si;
  v4si t_v4si;
  v8si t_v8si;
  v16si t_v16si;
  v32si t_v32si;
  v64si t_v64si;

  v64usi t_v64usi;

  v2di t_v2di;
  v4di t_v4di;
  v8di t_v8di;
  v16di t_v16di;
  v32di t_v32di;
  v64di t_v64di;
} vector_union;
60 | ||
/* Cast between vectors with a different number of elements, or type.
   The empty asm with a matching "0" constraint ties FROM and the result
   to the same VGPRs, so the register contents are simply reinterpreted
   as TO_T without emitting any instructions.  */

#define VGPR_CAST(to_t, from) \
({ \
  to_t __res; \
  __asm__ ("" : "=v"(__res) : "0"(from)); \
  __res; \
})
69 | ||
/* Combine two 32-bit-element vectors into the low (%L0) and high (%H0)
   halves of a 64-bit-element vector.  The "e"(-1L) input sets the EXEC
   lane selector to all-ones so every lane is moved; the output is
   earlyclobber ("=&v") so HIGH is not clobbered before it is read.  */
#define PACK_SI_PAIR(low, high) \
({ \
  v64udi __res; \
  asm ("v_mov_b32\t%L0, %1\n\t" \
       "v_mov_b32\t%H0, %2" \
       : "=&v"(__res) : "v0"(low), "v"(high), "e"(-1L)); \
  __res; \
})
78 | ||
/* Extract the low 32 bits of each 64-bit element: a plain register
   reinterpretation suffices, since the low half is stored first.  */
#define UNPACK_SI_LOW(to_t, pair) VGPR_CAST(to_t, pair)
/* Extract the high 32 bits (%H1) of each 64-bit element into a
   32-bit-element vector; EXEC is set to all lanes via "e"(-1L).  */
#define UNPACK_SI_HIGH(to_t, pair) \
({ \
  to_t __res; \
  asm ("v_mov_b32\t%0, %H1" : "=v"(__res) : "v"(pair), "e"(-1L)); \
  __res; \
})
86 | ||
/* Combine two 64-bit-element vectors into a 128-bit-element vector:
   %L0/%H0 receive the two halves of LOW and %J0/%K0 the two halves of
   HIGH.  EXEC is set to all lanes; the output is earlyclobber so HIGH
   is not overwritten before its halves are copied.  */
#define PACK_DI_PAIR(low, high) \
({ \
  v64uti __res; \
  asm ("v_mov_b32\t%L0, %L1\n\t" \
       "v_mov_b32\t%H0, %H1\n\t" \
       "v_mov_b32\t%J0, %L2\n\t" \
       "v_mov_b32\t%K0, %H2" \
       : "=&v"(__res) : "v0"(low), "v"(high), "e"(-1L)); \
  __res; \
})
97 | ||
/* Extract the low 64 bits of each 128-bit element by register
   reinterpretation (the low half is stored first).  */
#define UNPACK_DI_LOW(to_t, pair) VGPR_CAST(to_t, pair)
/* Extract the high 64 bits (%J1/%K1) of each 128-bit element into a
   64-bit-element vector.  */
#define UNPACK_DI_HIGH(to_t, pair) \
({ \
  to_t __res; \
  asm ("v_mov_b32\t%L0, %J1\n\t" \
       "v_mov_b32\t%H0, %K1" : "=v"(__res) : "v"(pair), "e"(-1L)); \
  __res; \
})
106 | ||
/* Convenience name: pass the enclosing function's lane mask as the
   condition, i.e. "no additional condition".  */
#define NO_COND __mask

/* Note - __mask is _not_ accounted for in VECTOR_MERGE!
   Lane-wise select: returns VEC1 in lanes where COND is all-ones and
   VEC2 where it is zero (COND is expected to be a -1/0 lane mask, as
   produced by vector comparisons).  The operands are viewed through a
   union as an integer vector of the same total size, so the select is a
   plain bitwise blend regardless of element type; __builtin_choose_expr
   picks the integer width matching sizeof (vec1) at compile time.  */
#define VECTOR_MERGE(vec1, vec2, cond) \
({ \
  _Static_assert (__builtin_types_compatible_p (typeof (vec1), typeof (vec2))); \
  union { \
    typeof (vec1) val; \
    v64qi t_v64qi; \
    v64hi t_v64hi; \
    v64si t_v64si; \
    v64di t_v64di; \
  } __vec1, __vec2, __res; \
  __vec1.val = (vec1); \
  __vec2.val = (vec2); \
  __builtin_choose_expr ( \
        sizeof (vec1) == sizeof (v64si), \
        ({ \
          v64si __bitmask = __builtin_convertvector ((cond), v64si); \
          __res.t_v64si = (__vec1.t_v64si & __bitmask) \
                          | (__vec2.t_v64si & ~__bitmask); \
        }), \
        __builtin_choose_expr ( \
              sizeof (vec1) == sizeof (v64hi), \
              ({ \
                v64hi __bitmask = __builtin_convertvector ((cond), v64hi); \
                __res.t_v64hi = (__vec1.t_v64hi & __bitmask) \
                                | (__vec2.t_v64hi & ~__bitmask); \
              }), \
              __builtin_choose_expr ( \
                    sizeof (vec1) == sizeof (v64qi), \
                    ({ \
                      v64qi __bitmask = __builtin_convertvector ((cond), v64qi); \
                      __res.t_v64qi = (__vec1.t_v64qi & __bitmask) \
                                      | (__vec2.t_v64qi & ~__bitmask); \
                    }), \
                    ({ \
                      v64di __bitmask = __builtin_convertvector ((cond), v64di); \
                      __res.t_v64di = (__vec1.t_v64di & __bitmask) \
                                      | (__vec2.t_v64di & ~__bitmask); \
                    })))); \
  __res.val; \
})
150 | ||
/* Assign VAL into VAR only in lanes where COND holds, additionally
   gated by the enclosing function's lane mask __mask (which must be in
   scope).  Unlike VECTOR_MERGE used directly, inactive lanes keep
   their previous VAR value.  */
#define VECTOR_COND_MOVE(var, val, cond) \
do { \
  _Static_assert (__builtin_types_compatible_p (typeof (var), typeof (val))); \
  __auto_type __cond = __builtin_convertvector ((cond), typeof (__mask)); \
  var = VECTOR_MERGE ((val), var, __cond & __mask); \
} while (0)
157 | ||
/* Vectorized if/else-if/else construct.  Usage pattern:

     VECTOR_IF (cond, cond_var)
       ... VECTOR_COND_MOVE (x, y, cond_var); ...
     VECTOR_ELSEIF (cond2, cond_var)
       ...
     VECTOR_ELSE (cond_var)
       ...
     VECTOR_ENDIF

   Each branch body is entered only if some lane is active (the scalar
   "if" is an optimization — skipping a branch no lane takes); COND_VAR
   holds the per-lane condition for the branch and must be used to
   predicate every assignment inside via VECTOR_COND_MOVE.  __inv_cond
   accumulates the lanes not yet taken by an earlier branch.  Note the
   macros open/close braces across invocations, so they must be matched
   exactly as in the pattern above.  */
#define VECTOR_IF(cond, cond_var) \
{ \
  __auto_type cond_var = (cond); \
  __auto_type __inv_cond __attribute__((unused)) = ~cond_var; \
  if (!ALL_ZEROES_P (cond_var)) \
  {

#define VECTOR_ELSEIF(cond, cond_var) \
  } \
  cond_var = __inv_cond & (cond); \
  __inv_cond &= ~(cond); \
  if (!ALL_ZEROES_P (cond_var)) \
  {

#define VECTOR_ELSE(cond_var) \
  } \
  cond_var = __inv_cond; \
  if (!ALL_ZEROES_P (cond_var)) \
  {

/* Nested variants: PREV_COND_VAR is the active-lane mask of the
   enclosing VECTOR_IF branch, so inner branches only enable lanes that
   were already active outside.  */
#define VECTOR_IF2(cond, cond_var, prev_cond_var) \
{ \
  __auto_type cond_var = (cond) & __builtin_convertvector (prev_cond_var, typeof (cond)); \
  __auto_type __inv_cond __attribute__((unused)) = ~cond_var; \
  if (!ALL_ZEROES_P (cond_var)) \
  {

#define VECTOR_ELSEIF2(cond, cond_var, prev_cond_var) \
  } \
  cond_var = (cond) & __inv_cond & __builtin_convertvector (prev_cond_var, typeof (cond)); \
  __inv_cond &= ~(cond); \
  if (!ALL_ZEROES_P (cond_var)) \
  {

#define VECTOR_ELSE2(cond_var, prev_cond_var) \
  } \
  cond_var = __inv_cond & __builtin_convertvector (prev_cond_var, typeof (__inv_cond)); \
  if (!ALL_ZEROES_P (cond_var)) \
  {


#define VECTOR_ENDIF \
  } \
}
202 | ||
/* Broadcast scalar X into all 64 lanes of a vector of the given TYPE.
   X is evaluated exactly once (into __e) to avoid the usual macro
   multiple-evaluation hazard.  */
#define VECTOR_INIT_AUX(x, type) \
({ \
  typeof (x) __e = (x); \
  type __tmp = { \
    __e, __e, __e, __e, __e, __e, __e, __e, \
    __e, __e, __e, __e, __e, __e, __e, __e, \
    __e, __e, __e, __e, __e, __e, __e, __e, \
    __e, __e, __e, __e, __e, __e, __e, __e, \
    __e, __e, __e, __e, __e, __e, __e, __e, \
    __e, __e, __e, __e, __e, __e, __e, __e, \
    __e, __e, __e, __e, __e, __e, __e, __e, \
    __e, __e, __e, __e, __e, __e, __e, __e }; \
  __tmp; \
})
217 | ||
/* Type-generic broadcast: splat scalar X into the 64-lane vector type
   matching X's scalar type, selected at compile time via _Generic.  */
#define VECTOR_INIT(x) \
  (_Generic ((x), int: VECTOR_INIT_AUX ((x), v64si), \
                  unsigned: VECTOR_INIT_AUX ((x), v64usi), \
                  char: VECTOR_INIT_AUX ((x), v64qi), \
                  unsigned char: VECTOR_INIT_AUX ((x), v64uqi), \
                  short: VECTOR_INIT_AUX ((x), v64hi), \
                  unsigned short: VECTOR_INIT_AUX ((x), v64uhi), \
                  long: VECTOR_INIT_AUX ((x), v64di), \
                  unsigned long: VECTOR_INIT_AUX ((x), v64udi), \
                  float: VECTOR_INIT_AUX ((x), v64sf), \
                  double: VECTOR_INIT_AUX ((x), v64df)))
229 | ||
230 | ||
/* CDNA3_PLUS is 1 on ISAs newer than the ones listed below; it is used
   (in VECTOR_INIT_MASK) to decide whether a 64-bit-element mask can be
   written with a single v_mov%B0 instruction rather than two 32-bit
   moves.  NOTE(review): presumably CDNA3+ supports the wider move —
   confirm against the target ISA manual if adding new targets.  */
#if defined (__GCN3__) || defined (__GCN5__) \
    || defined (__CDNA1__) || defined (__CDNA2__) \
    || defined (__RDNA2__)
#define CDNA3_PLUS 0
#else
#define CDNA3_PLUS 1
#endif
238 | ||
/* Build a lane mask of type MASKMODE (which the including file must
   define, e.g. v64si or v64di) with the low COUNT lanes all-ones and
   the remaining lanes zero.  The "e" input constraint supplies the EXEC
   lane selector, so each v_mov only writes the selected lanes: first
   all lanes are zeroed under a full EXEC, then -1 is written under an
   EXEC of the low COUNT bits.  For 512-byte (64-bit element) masks on
   pre-CDNA3 ISAs each 32-bit half (%L0/%H0) is written separately.  */
#define VECTOR_INIT_MASK(COUNT) \
({ \
  MASKMODE __mask; \
  int count = (COUNT); \
  if (count == 64) \
    { \
      if (sizeof (MASKMODE) < 512 || CDNA3_PLUS) \
        asm ("v_mov%B0\t%0, -1" : "=v"(__mask) : "e"(-1L)); \
      else \
        asm ("v_mov_b32\t%L0, -1\n\t" \
             "v_mov_b32\t%H0, -1" : "=v"(__mask) : "e"(-1L)); \
    } \
  else \
    { \
      /* Shift 1L, not 1: a 32-bit shift by count >= 32 (e.g. the \
         COUNT == 32 variant) is undefined behaviour.  */ \
      long bitmask = (count == 64 ? -1 : (1L << count) - 1); \
      if (sizeof (MASKMODE) < 512 || CDNA3_PLUS) \
        { \
          asm ("v_mov%B0\t%0, 0" : "=v"(__mask) : "e"(-1L)); \
          asm ("v_mov%B0\t%0, -1" : "+v"(__mask) : "e"(bitmask)); \
        } \
      else \
        { \
          asm ("v_mov_b32\t%L0, 0\n\t" \
               "v_mov_b32\t%H0, 0" : "=v"(__mask) : "e"(-1L)); \
          asm ("v_mov_b32\t%L0, -1\n\t" \
               "v_mov_b32\t%H0, -1" : "+v"(__mask) : "e"(bitmask)); \
        } \
    } \
  __mask; \
})
269 | ||
/* True when no lane of X (restricted to the active lanes of the
   enclosing __mask) is non-zero.  */
#define ALL_ZEROES_P(x) (COND_TO_BITMASK(x) == 0)

/* Reduce a per-lane condition vector X to a 64-bit scalar bitmask, one
   bit per lane, using v_cmp_ne against zero.  X is first ANDed with the
   enclosing __mask (which must be in scope), so inactive lanes always
   contribute 0.  The element width of the compare (u32 vs u64) is
   chosen from sizeof (__mask): 512 bytes means 64-bit elements.  */
#define COND_TO_BITMASK(x) \
({ \
  long __tmp = 0; \
  __auto_type __x = __builtin_convertvector((x), typeof (__mask)) & __mask; \
  __builtin_choose_expr (sizeof (__mask) != 512, \
                         ({ asm ("v_cmp_ne_u32_e64 %0, %1, 0" \
                                 : "=Sg" (__tmp) \
                                 : "v" (__x)); }), \
                         ({ asm ("v_cmp_ne_u64_e64 %0, %1, 0" \
                                 : "=Sg" (__tmp) \
                                 : "v" (__x)); })); \
  __tmp; \
})
285 | ||
/* Vectorized while loop: iterate while any lane of COND (within the
   lanes active in PREV_COND_VAR) remains true.  COND_VAR narrows
   monotonically — once a lane's condition fails it stays inactive —
   and must be used to predicate assignments in the body.  Must be
   closed with VECTOR_ENDWHILE (the braces span both macros).  */
#define VECTOR_WHILE(cond, cond_var, prev_cond_var) \
{ \
  __auto_type cond_var = prev_cond_var; \
  for (;;) { \
    cond_var &= (cond); \
    if (ALL_ZEROES_P (cond_var)) \
      break;

#define VECTOR_ENDWHILE \
  } \
}
297 | ||
/* Define one public entry point FUN##v##COUNT##SUFFIX operating on
   COUNT-lane vectors: the arguments are widened (register-reinterpreted)
   to 64 lanes, a mask enabling only the low COUNT lanes is built, the
   64-lane worker FUN##v64##SUFFIX##_aux does the computation, and the
   result is narrowed back.  OTYPE/TYPE are the output/input element-type
   suffixes (e.g. sf, df).  */
#define DEF_VARIANT(FUN, SUFFIX, OTYPE, TYPE, COUNT) \
v##COUNT##OTYPE \
FUN##v##COUNT##SUFFIX (v##COUNT##TYPE __arg1, v##COUNT##TYPE __arg2) \
{ \
  __auto_type __upsized_arg1 = VGPR_CAST (v64##TYPE, __arg1); \
  __auto_type __upsized_arg2 = VGPR_CAST (v64##TYPE, __arg2); \
  __auto_type __mask = VECTOR_INIT_MASK (COUNT); \
  __auto_type __result = FUN##v64##SUFFIX##_aux (__upsized_arg1, __upsized_arg2, __mask); \
  return VGPR_CAST (v##COUNT##OTYPE, __result); \
}
308 | ||
/* Instantiate entry points for every supported lane count (2..64) where
   the output element type equals the input element type.  */
#define DEF_VARIANTS(FUN, SUFFIX, TYPE) \
  DEF_VARIANT (FUN, SUFFIX, TYPE, TYPE, 2) \
  DEF_VARIANT (FUN, SUFFIX, TYPE, TYPE, 4) \
  DEF_VARIANT (FUN, SUFFIX, TYPE, TYPE, 8) \
  DEF_VARIANT (FUN, SUFFIX, TYPE, TYPE, 16) \
  DEF_VARIANT (FUN, SUFFIX, TYPE, TYPE, 32) \
  DEF_VARIANT (FUN, SUFFIX, TYPE, TYPE, 64)

/* As DEF_VARIANTS, but for functions whose output element type (OTYPE)
   differs from the input element type (TYPE).  */
#define DEF_VARIANTS_B(FUN, SUFFIX, OTYPE, TYPE) \
  DEF_VARIANT (FUN, SUFFIX, OTYPE, TYPE, 2) \
  DEF_VARIANT (FUN, SUFFIX, OTYPE, TYPE, 4) \
  DEF_VARIANT (FUN, SUFFIX, OTYPE, TYPE, 8) \
  DEF_VARIANT (FUN, SUFFIX, OTYPE, TYPE, 16) \
  DEF_VARIANT (FUN, SUFFIX, OTYPE, TYPE, 32) \
  DEF_VARIANT (FUN, SUFFIX, OTYPE, TYPE, 64)