return (m128) vsubq_u64((uint64x2_t)a, (uint64x2_t)b);
}
-static really_really_inline
+static really_inline
m128 lshift_m128(m128 a, unsigned b) {
- return (m128) vshlq_n_u32((uint32x4_t)a, b);
+#if defined(HAVE__BUILTIN_CONSTANT_P)
+ if (__builtin_constant_p(b)) {
+ return (m128) vshlq_n_u32((uint32x4_t)a, b);
+ }
+#endif
+#define CASE_LSHIFT_m128(a, offset) case offset: return (m128)vshlq_n_u32((uint32x4_t)(a), (offset)); break;
+ switch (b) {
+ case 0: return a; break;
+ CASE_LSHIFT_m128(a, 1);
+ CASE_LSHIFT_m128(a, 2);
+ CASE_LSHIFT_m128(a, 3);
+ CASE_LSHIFT_m128(a, 4);
+ CASE_LSHIFT_m128(a, 5);
+ CASE_LSHIFT_m128(a, 6);
+ CASE_LSHIFT_m128(a, 7);
+ CASE_LSHIFT_m128(a, 8);
+ CASE_LSHIFT_m128(a, 9);
+ CASE_LSHIFT_m128(a, 10);
+ CASE_LSHIFT_m128(a, 11);
+ CASE_LSHIFT_m128(a, 12);
+ CASE_LSHIFT_m128(a, 13);
+ CASE_LSHIFT_m128(a, 14);
+ CASE_LSHIFT_m128(a, 15);
+ CASE_LSHIFT_m128(a, 16);
+ CASE_LSHIFT_m128(a, 17);
+ CASE_LSHIFT_m128(a, 18);
+ CASE_LSHIFT_m128(a, 19);
+ CASE_LSHIFT_m128(a, 20);
+ CASE_LSHIFT_m128(a, 21);
+ CASE_LSHIFT_m128(a, 22);
+ CASE_LSHIFT_m128(a, 23);
+ CASE_LSHIFT_m128(a, 24);
+ CASE_LSHIFT_m128(a, 25);
+ CASE_LSHIFT_m128(a, 26);
+ CASE_LSHIFT_m128(a, 27);
+ CASE_LSHIFT_m128(a, 28);
+ CASE_LSHIFT_m128(a, 29);
+ CASE_LSHIFT_m128(a, 30);
+ CASE_LSHIFT_m128(a, 31);
+ default: return zeroes128(); break;
+ }
+#undef CASE_LSHIFT_m128
}
static really_really_inline
m128 rshift_m128(m128 a, unsigned b) {
- return (m128) vshrq_n_u32((uint32x4_t)a, b);
+#if defined(HAVE__BUILTIN_CONSTANT_P)
+ if (__builtin_constant_p(b)) {
+ return (m128) vshrq_n_u32((uint32x4_t)a, b);
+ }
+#endif
+#define CASE_RSHIFT_m128(a, offset) case offset: return (m128)vshrq_n_u32((uint32x4_t)(a), (offset)); break;
+ switch (b) {
+ case 0: return a; break;
+ CASE_RSHIFT_m128(a, 1);
+ CASE_RSHIFT_m128(a, 2);
+ CASE_RSHIFT_m128(a, 3);
+ CASE_RSHIFT_m128(a, 4);
+ CASE_RSHIFT_m128(a, 5);
+ CASE_RSHIFT_m128(a, 6);
+ CASE_RSHIFT_m128(a, 7);
+ CASE_RSHIFT_m128(a, 8);
+ CASE_RSHIFT_m128(a, 9);
+ CASE_RSHIFT_m128(a, 10);
+ CASE_RSHIFT_m128(a, 11);
+ CASE_RSHIFT_m128(a, 12);
+ CASE_RSHIFT_m128(a, 13);
+ CASE_RSHIFT_m128(a, 14);
+ CASE_RSHIFT_m128(a, 15);
+ CASE_RSHIFT_m128(a, 16);
+ CASE_RSHIFT_m128(a, 17);
+ CASE_RSHIFT_m128(a, 18);
+ CASE_RSHIFT_m128(a, 19);
+ CASE_RSHIFT_m128(a, 20);
+ CASE_RSHIFT_m128(a, 21);
+ CASE_RSHIFT_m128(a, 22);
+ CASE_RSHIFT_m128(a, 23);
+ CASE_RSHIFT_m128(a, 24);
+ CASE_RSHIFT_m128(a, 25);
+ CASE_RSHIFT_m128(a, 26);
+ CASE_RSHIFT_m128(a, 27);
+ CASE_RSHIFT_m128(a, 28);
+ CASE_RSHIFT_m128(a, 29);
+ CASE_RSHIFT_m128(a, 30);
+ CASE_RSHIFT_m128(a, 31);
+ default: return zeroes128(); break;
+ }
+#undef CASE_RSHIFT_m128
}
static really_really_inline
m128 lshift64_m128(m128 a, unsigned b) {
- return (m128) vshlq_n_u64((uint64x2_t)a, b);
+#if defined(HAVE__BUILTIN_CONSTANT_P)
+ if (__builtin_constant_p(b)) {
+ return (m128) vshlq_n_u64((uint64x2_t)a, b);
+ }
+#endif
+#define CASE_LSHIFT64_m128(a, offset) case offset: return (m128)vshlq_n_u64((uint64x2_t)(a), (offset)); break;
+ switch (b) {
+ case 0: return a; break;
+ CASE_LSHIFT64_m128(a, 1);
+ CASE_LSHIFT64_m128(a, 2);
+ CASE_LSHIFT64_m128(a, 3);
+ CASE_LSHIFT64_m128(a, 4);
+ CASE_LSHIFT64_m128(a, 5);
+ CASE_LSHIFT64_m128(a, 6);
+ CASE_LSHIFT64_m128(a, 7);
+ CASE_LSHIFT64_m128(a, 8);
+ CASE_LSHIFT64_m128(a, 9);
+ CASE_LSHIFT64_m128(a, 10);
+ CASE_LSHIFT64_m128(a, 11);
+ CASE_LSHIFT64_m128(a, 12);
+ CASE_LSHIFT64_m128(a, 13);
+ CASE_LSHIFT64_m128(a, 14);
+ CASE_LSHIFT64_m128(a, 15);
+ CASE_LSHIFT64_m128(a, 16);
+ CASE_LSHIFT64_m128(a, 17);
+ CASE_LSHIFT64_m128(a, 18);
+ CASE_LSHIFT64_m128(a, 19);
+ CASE_LSHIFT64_m128(a, 20);
+ CASE_LSHIFT64_m128(a, 21);
+ CASE_LSHIFT64_m128(a, 22);
+ CASE_LSHIFT64_m128(a, 23);
+ CASE_LSHIFT64_m128(a, 24);
+ CASE_LSHIFT64_m128(a, 25);
+ CASE_LSHIFT64_m128(a, 26);
+ CASE_LSHIFT64_m128(a, 27);
+ CASE_LSHIFT64_m128(a, 28);
+ CASE_LSHIFT64_m128(a, 29);
+ CASE_LSHIFT64_m128(a, 30);
+ CASE_LSHIFT64_m128(a, 31);
+ CASE_LSHIFT64_m128(a, 32);
+ CASE_LSHIFT64_m128(a, 33);
+ CASE_LSHIFT64_m128(a, 34);
+ CASE_LSHIFT64_m128(a, 35);
+ CASE_LSHIFT64_m128(a, 36);
+ CASE_LSHIFT64_m128(a, 37);
+ CASE_LSHIFT64_m128(a, 38);
+ CASE_LSHIFT64_m128(a, 39);
+ CASE_LSHIFT64_m128(a, 40);
+ CASE_LSHIFT64_m128(a, 41);
+ CASE_LSHIFT64_m128(a, 42);
+ CASE_LSHIFT64_m128(a, 43);
+ CASE_LSHIFT64_m128(a, 44);
+ CASE_LSHIFT64_m128(a, 45);
+ CASE_LSHIFT64_m128(a, 46);
+ CASE_LSHIFT64_m128(a, 47);
+ CASE_LSHIFT64_m128(a, 48);
+ CASE_LSHIFT64_m128(a, 49);
+ CASE_LSHIFT64_m128(a, 50);
+ CASE_LSHIFT64_m128(a, 51);
+ CASE_LSHIFT64_m128(a, 52);
+ CASE_LSHIFT64_m128(a, 53);
+ CASE_LSHIFT64_m128(a, 54);
+ CASE_LSHIFT64_m128(a, 55);
+ CASE_LSHIFT64_m128(a, 56);
+ CASE_LSHIFT64_m128(a, 57);
+ CASE_LSHIFT64_m128(a, 58);
+ CASE_LSHIFT64_m128(a, 59);
+ CASE_LSHIFT64_m128(a, 60);
+ CASE_LSHIFT64_m128(a, 61);
+ CASE_LSHIFT64_m128(a, 62);
+ CASE_LSHIFT64_m128(a, 63);
+ default: return zeroes128(); break;
+ }
+#undef CASE_LSHIFT64_m128
}
static really_really_inline
m128 rshift64_m128(m128 a, unsigned b) {
- return (m128) vshrq_n_u64((uint64x2_t)a, b);
+#if defined(HAVE__BUILTIN_CONSTANT_P)
+ if (__builtin_constant_p(b)) {
+ return (m128) vshrq_n_u64((uint64x2_t)a, b);
+ }
+#endif
+#define CASE_RSHIFT64_m128(a, offset) case offset: return (m128)vshrq_n_u64((uint64x2_t)(a), (offset)); break;
+ switch (b) {
+ case 0: return a; break;
+ CASE_RSHIFT64_m128(a, 1);
+ CASE_RSHIFT64_m128(a, 2);
+ CASE_RSHIFT64_m128(a, 3);
+ CASE_RSHIFT64_m128(a, 4);
+ CASE_RSHIFT64_m128(a, 5);
+ CASE_RSHIFT64_m128(a, 6);
+ CASE_RSHIFT64_m128(a, 7);
+ CASE_RSHIFT64_m128(a, 8);
+ CASE_RSHIFT64_m128(a, 9);
+ CASE_RSHIFT64_m128(a, 10);
+ CASE_RSHIFT64_m128(a, 11);
+ CASE_RSHIFT64_m128(a, 12);
+ CASE_RSHIFT64_m128(a, 13);
+ CASE_RSHIFT64_m128(a, 14);
+ CASE_RSHIFT64_m128(a, 15);
+ CASE_RSHIFT64_m128(a, 16);
+ CASE_RSHIFT64_m128(a, 17);
+ CASE_RSHIFT64_m128(a, 18);
+ CASE_RSHIFT64_m128(a, 19);
+ CASE_RSHIFT64_m128(a, 20);
+ CASE_RSHIFT64_m128(a, 21);
+ CASE_RSHIFT64_m128(a, 22);
+ CASE_RSHIFT64_m128(a, 23);
+ CASE_RSHIFT64_m128(a, 24);
+ CASE_RSHIFT64_m128(a, 25);
+ CASE_RSHIFT64_m128(a, 26);
+ CASE_RSHIFT64_m128(a, 27);
+ CASE_RSHIFT64_m128(a, 28);
+ CASE_RSHIFT64_m128(a, 29);
+ CASE_RSHIFT64_m128(a, 30);
+ CASE_RSHIFT64_m128(a, 31);
+ CASE_RSHIFT64_m128(a, 32);
+ CASE_RSHIFT64_m128(a, 33);
+ CASE_RSHIFT64_m128(a, 34);
+ CASE_RSHIFT64_m128(a, 35);
+ CASE_RSHIFT64_m128(a, 36);
+ CASE_RSHIFT64_m128(a, 37);
+ CASE_RSHIFT64_m128(a, 38);
+ CASE_RSHIFT64_m128(a, 39);
+ CASE_RSHIFT64_m128(a, 40);
+ CASE_RSHIFT64_m128(a, 41);
+ CASE_RSHIFT64_m128(a, 42);
+ CASE_RSHIFT64_m128(a, 43);
+ CASE_RSHIFT64_m128(a, 44);
+ CASE_RSHIFT64_m128(a, 45);
+ CASE_RSHIFT64_m128(a, 46);
+ CASE_RSHIFT64_m128(a, 47);
+ CASE_RSHIFT64_m128(a, 48);
+ CASE_RSHIFT64_m128(a, 49);
+ CASE_RSHIFT64_m128(a, 50);
+ CASE_RSHIFT64_m128(a, 51);
+ CASE_RSHIFT64_m128(a, 52);
+ CASE_RSHIFT64_m128(a, 53);
+ CASE_RSHIFT64_m128(a, 54);
+ CASE_RSHIFT64_m128(a, 55);
+ CASE_RSHIFT64_m128(a, 56);
+ CASE_RSHIFT64_m128(a, 57);
+ CASE_RSHIFT64_m128(a, 58);
+ CASE_RSHIFT64_m128(a, 59);
+ CASE_RSHIFT64_m128(a, 60);
+ CASE_RSHIFT64_m128(a, 61);
+ CASE_RSHIFT64_m128(a, 62);
+ CASE_RSHIFT64_m128(a, 63);
+ default: return zeroes128(); break;
+ }
+#undef CASE_RSHIFT64_m128
}
static really_inline m128 eq128(m128 a, m128 b) {
template<>
template<>
-really_inline SuperVector<16>::SuperVector<int8x16_t>(int8x16_t other)
+really_inline SuperVector<16>::SuperVector(int8x16_t other)
{
u.s8x16[0] = other;
}
template<>
template<>
-really_inline SuperVector<16>::SuperVector<uint8x16_t>(uint8x16_t other)
+really_inline SuperVector<16>::SuperVector(uint8x16_t other)
{
u.u8x16[0] = other;
}
template<>
template<>
-really_inline SuperVector<16>::SuperVector<int16x8_t>(int16x8_t other)
+really_inline SuperVector<16>::SuperVector(int16x8_t other)
{
u.s16x8[0] = other;
}
template<>
template<>
-really_inline SuperVector<16>::SuperVector<uint16x8_t>(uint16x8_t other)
+really_inline SuperVector<16>::SuperVector(uint16x8_t other)
{
u.u16x8[0] = other;
}
template<>
template<>
-really_inline SuperVector<16>::SuperVector<int32x4_t>(int32x4_t other)
+really_inline SuperVector<16>::SuperVector(int32x4_t other)
{
u.s32x4[0] = other;
}
template<>
template<>
-really_inline SuperVector<16>::SuperVector<uint32x4_t>(uint32x4_t other)
+really_inline SuperVector<16>::SuperVector(uint32x4_t other)
{
u.u32x4[0] = other;
}
template<>
template<>
-really_inline SuperVector<16>::SuperVector<int64x2_t>(int64x2_t other)
+really_inline SuperVector<16>::SuperVector(int64x2_t other)
{
u.s64x2[0] = other;
}
template<>
template<>
-really_inline SuperVector<16>::SuperVector<uint64x2_t>(uint64x2_t other)
+really_inline SuperVector<16>::SuperVector(uint64x2_t other)
{
u.u64x2[0] = other;
}
template<>
template<>
-really_inline SuperVector<16>::SuperVector<int8_t>(int8_t const other)
+really_inline SuperVector<16>::SuperVector(int8_t const other)
{
u.s8x16[0] = vdupq_n_s8(other);
}
template<>
template<>
-really_inline SuperVector<16>::SuperVector<uint8_t>(uint8_t const other)
+really_inline SuperVector<16>::SuperVector(uint8_t const other)
{
u.u8x16[0] = vdupq_n_u8(other);
}
template<>
template<>
-really_inline SuperVector<16>::SuperVector<int16_t>(int16_t const other)
+really_inline SuperVector<16>::SuperVector(int16_t const other)
{
u.s16x8[0] = vdupq_n_s16(other);
}
template<>
template<>
-really_inline SuperVector<16>::SuperVector<uint16_t>(uint16_t const other)
+really_inline SuperVector<16>::SuperVector(uint16_t const other)
{
u.u16x8[0] = vdupq_n_u16(other);
}
template<>
template<>
-really_inline SuperVector<16>::SuperVector<int32_t>(int32_t const other)
+really_inline SuperVector<16>::SuperVector(int32_t const other)
{
u.s32x4[0] = vdupq_n_s32(other);
}
template<>
template<>
-really_inline SuperVector<16>::SuperVector<uint32_t>(uint32_t const other)
+really_inline SuperVector<16>::SuperVector(uint32_t const other)
{
u.u32x4[0] = vdupq_n_u32(other);
}
template<>
template<>
-really_inline SuperVector<16>::SuperVector<int64_t>(int64_t const other)
+really_inline SuperVector<16>::SuperVector(int64_t const other)
{
u.s64x2[0] = vdupq_n_s64(other);
}
template<>
template<>
-really_inline SuperVector<16>::SuperVector<uint64_t>(uint64_t const other)
+really_inline SuperVector<16>::SuperVector(uint64_t const other)
{
u.u64x2[0] = vdupq_n_u64(other);
}
if (N == 0) return *this;
- if (N == 16) return Zeroes();
+ if (N == 8) return Zeroes();
SuperVector result;
- Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u8(u.u8x16[0], n)}; });
+ Unroller<1, 8>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u8(v->u.u8x16[0], n)}; });
return result;
}
if (N == 0) return *this;
if (N == 16) return Zeroes();
SuperVector result;
- Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u16(u.u16x8[0], n)}; });
+ Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u16(v->u.u16x8[0], n)}; });
return result;
}
really_inline SuperVector<16> SuperVector<16>::vshl_32 (uint8_t const N) const
{
if (N == 0) return *this;
- if (N == 16) return Zeroes();
+ if (N == 32) return Zeroes();
SuperVector result;
- Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u32(u.u32x4[0], n)}; });
+ Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u32(v->u.u32x4[0], n)}; });
return result;
}
really_inline SuperVector<16> SuperVector<16>::vshl_64 (uint8_t const N) const
{
if (N == 0) return *this;
- if (N == 16) return Zeroes();
+ if (N == 64) return Zeroes();
SuperVector result;
- Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u64(u.u64x2[0], n)}; });
+ Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshlq_n_u64(v->u.u64x2[0], n)}; });
return result;
}
if (N == 0) return *this;
if (N == 16) return Zeroes();
SuperVector result;
- Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vextq_u8(vdupq_n_u8(0), u.u8x16[0], 16 - n)}; });
+ Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vextq_u8(vdupq_n_u8(0), v->u.u8x16[0], 16 - n)}; });
return result;
}
really_inline SuperVector<16> SuperVector<16>::vshr_8 (uint8_t const N) const
{
if (N == 0) return *this;
- if (N == 16) return Zeroes();
+ if (N == 8) return Zeroes();
SuperVector result;
- Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u8(u.u8x16[0], n)}; });
+ Unroller<1, 8>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u8(v->u.u8x16[0], n)}; });
return result;
}
if (N == 0) return *this;
if (N == 16) return Zeroes();
SuperVector result;
- Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u16(u.u16x8[0], n)}; });
+ Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u16(v->u.u16x8[0], n)}; });
return result;
}
really_inline SuperVector<16> SuperVector<16>::vshr_32 (uint8_t const N) const
{
if (N == 0) return *this;
- if (N == 16) return Zeroes();
+ if (N == 32) return Zeroes();
SuperVector result;
- Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u32(u.u32x4[0], n)}; });
+ Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u32(v->u.u32x4[0], n)}; });
return result;
}
really_inline SuperVector<16> SuperVector<16>::vshr_64 (uint8_t const N) const
{
if (N == 0) return *this;
- if (N == 16) return Zeroes();
+ if (N == 64) return Zeroes();
SuperVector result;
- Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u64(u.u64x2[0], n)}; });
+ Unroller<1, 64>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vshrq_n_u64(v->u.u64x2[0], n)}; });
return result;
}
if (N == 0) return *this;
if (N == 16) return Zeroes();
SuperVector result;
- Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vextq_u8(u.u8x16[0], vdupq_n_u8(0), n)}; });
+ Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {vextq_u8(v->u.u8x16[0], vdupq_n_u8(0), n)}; });
return result;
}