}
#endif
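+// 128-bit: with only a single 128-bit lane, lshift128()/rshift128() simply forward to the generic shift operators.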
+template<>
+really_inline SuperVector<16> SuperVector<16>::lshift128(uint8_t const N)
+{
+ return *this << N;
+}
+
+template<>
+really_inline SuperVector<16> SuperVector<16>::rshift128(uint8_t const N)
+{
+ return *this >> N;
+}
+
// 256-bit AVX2 implementation
#if defined(HAVE_AVX2)
template<>
case 13: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 13)}; break;
case 14: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 14)}; break;
case 15: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 15)}; break;
+ case 16: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 16)}; break;
+ case 17: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 17)}; break;
+ case 18: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 18)}; break;
+ case 19: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 19)}; break;
+ case 20: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 20)}; break;
+ case 21: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 21)}; break;
+ case 22: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 22)}; break;
+ case 23: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 23)}; break;
+ case 24: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 24)}; break;
+ case 25: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 25)}; break;
+ case 26: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 26)}; break;
+ case 27: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 27)}; break;
+ case 28: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 28)}; break;
+ case 29: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 29)}; break;
+ case 30: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 30)}; break;
+ case 31: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 31)}; break;
default: break;
}
return *this;
case 13: return {_mm256_slli_epi64(u.v256[0], 13)}; break;
case 14: return {_mm256_slli_epi64(u.v256[0], 14)}; break;
case 15: return {_mm256_slli_epi64(u.v256[0], 15)}; break;
+ case 16: return {_mm256_slli_epi64(u.v256[0], 16)}; break;
+ case 17: return {_mm256_slli_epi64(u.v256[0], 17)}; break;
+ case 18: return {_mm256_slli_epi64(u.v256[0], 18)}; break;
+ case 19: return {_mm256_slli_epi64(u.v256[0], 19)}; break;
+ case 20: return {_mm256_slli_epi64(u.v256[0], 20)}; break;
+ case 21: return {_mm256_slli_epi64(u.v256[0], 21)}; break;
+ case 22: return {_mm256_slli_epi64(u.v256[0], 22)}; break;
+ case 23: return {_mm256_slli_epi64(u.v256[0], 23)}; break;
+ case 24: return {_mm256_slli_epi64(u.v256[0], 24)}; break;
+ case 25: return {_mm256_slli_epi64(u.v256[0], 25)}; break;
+ case 26: return {_mm256_slli_epi64(u.v256[0], 26)}; break;
+ case 27: return {_mm256_slli_epi64(u.v256[0], 27)}; break;
+ case 28: return {_mm256_slli_epi64(u.v256[0], 28)}; break;
+ case 29: return {_mm256_slli_epi64(u.v256[0], 29)}; break;
+ case 30: return {_mm256_slli_epi64(u.v256[0], 30)}; break;
+ case 31: return {_mm256_slli_epi64(u.v256[0], 31)}; break;
default: break;
}
return *this;
case 13: return {_mm256_srli_epi64(u.v256[0], 13)}; break;
case 14: return {_mm256_srli_epi64(u.v256[0], 14)}; break;
case 15: return {_mm256_srli_epi64(u.v256[0], 15)}; break;
+ case 16: return {_mm256_srli_epi64(u.v256[0], 16)}; break;
+ case 17: return {_mm256_srli_epi64(u.v256[0], 17)}; break;
+ case 18: return {_mm256_srli_epi64(u.v256[0], 18)}; break;
+ case 19: return {_mm256_srli_epi64(u.v256[0], 19)}; break;
+ case 20: return {_mm256_srli_epi64(u.v256[0], 20)}; break;
+ case 21: return {_mm256_srli_epi64(u.v256[0], 21)}; break;
+ case 22: return {_mm256_srli_epi64(u.v256[0], 22)}; break;
+ case 23: return {_mm256_srli_epi64(u.v256[0], 23)}; break;
+ case 24: return {_mm256_srli_epi64(u.v256[0], 24)}; break;
+ case 25: return {_mm256_srli_epi64(u.v256[0], 25)}; break;
+ case 26: return {_mm256_srli_epi64(u.v256[0], 26)}; break;
+ case 27: return {_mm256_srli_epi64(u.v256[0], 27)}; break;
+ case 28: return {_mm256_srli_epi64(u.v256[0], 28)}; break;
+ case 29: return {_mm256_srli_epi64(u.v256[0], 29)}; break;
+ case 30: return {_mm256_srli_epi64(u.v256[0], 30)}; break;
+ case 31: return {_mm256_srli_epi64(u.v256[0], 31)}; break;
+ default: break;
+ }
+ return *this;
+}
+#endif
+
+#ifdef HS_OPTIMIZE
+template<>
+really_inline SuperVector<32> SuperVector<32>::lshift128(uint8_t const N)
+{
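+ // HS_OPTIMIZE: N is expected to fold to a compile-time constant after inlining, since _mm256_slli_si256() takes an immediate count.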
+ return {_mm256_slli_si256(u.v256[0], N)};
+}
+#else
+template<>
+really_inline SuperVector<32> SuperVector<32>::lshift128(uint8_t const N)
+{
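+ // Without HS_OPTIMIZE, map the runtime count onto the immediate operand required by _mm256_slli_si256().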
+ switch(N) {
+ case 0: return *this; break;
+ case 1: return {_mm256_slli_si256(u.v256[0], 1)}; break;
+ case 2: return {_mm256_slli_si256(u.v256[0], 2)}; break;
+ case 3: return {_mm256_slli_si256(u.v256[0], 3)}; break;
+ case 4: return {_mm256_slli_si256(u.v256[0], 4)}; break;
+ case 5: return {_mm256_slli_si256(u.v256[0], 5)}; break;
+ case 6: return {_mm256_slli_si256(u.v256[0], 6)}; break;
+ case 7: return {_mm256_slli_si256(u.v256[0], 7)}; break;
+ case 8: return {_mm256_slli_si256(u.v256[0], 8)}; break;
+ case 9: return {_mm256_slli_si256(u.v256[0], 9)}; break;
+ case 10: return {_mm256_slli_si256(u.v256[0], 10)}; break;
+ case 11: return {_mm256_slli_si256(u.v256[0], 11)}; break;
+ case 12: return {_mm256_slli_si256(u.v256[0], 12)}; break;
+ case 13: return {_mm256_slli_si256(u.v256[0], 13)}; break;
+ case 14: return {_mm256_slli_si256(u.v256[0], 14)}; break;
+ case 15: return {_mm256_slli_si256(u.v256[0], 15)}; break;
+ case 16: return {_mm256_slli_si256(u.v256[0], 16)}; break;
+ case 17: return {_mm256_slli_si256(u.v256[0], 17)}; break;
+ case 18: return {_mm256_slli_si256(u.v256[0], 18)}; break;
+ case 19: return {_mm256_slli_si256(u.v256[0], 19)}; break;
+ case 20: return {_mm256_slli_si256(u.v256[0], 20)}; break;
+ case 21: return {_mm256_slli_si256(u.v256[0], 21)}; break;
+ case 22: return {_mm256_slli_si256(u.v256[0], 22)}; break;
+ case 23: return {_mm256_slli_si256(u.v256[0], 23)}; break;
+ case 24: return {_mm256_slli_si256(u.v256[0], 24)}; break;
+ case 25: return {_mm256_slli_si256(u.v256[0], 25)}; break;
+ case 26: return {_mm256_slli_si256(u.v256[0], 26)}; break;
+ case 27: return {_mm256_slli_si256(u.v256[0], 27)}; break;
+ case 28: return {_mm256_slli_si256(u.v256[0], 28)}; break;
+ case 29: return {_mm256_slli_si256(u.v256[0], 29)}; break;
+ case 30: return {_mm256_slli_si256(u.v256[0], 30)}; break;
+ case 31: return {_mm256_slli_si256(u.v256[0], 31)}; break;
+ default: break;
+ }
+ return *this;
+}
+#endif
+
+#ifdef HS_OPTIMIZE
+template<>
+really_inline SuperVector<32> SuperVector<32>::rshift128(uint8_t const N)
+{
+ return {_mm256_srli_si256(u.v256[0], N)};
+}
+#else
+template<>
+really_inline SuperVector<32> SuperVector<32>::rshift128(uint8_t const N)
+{
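+ // As in lshift128(), map the runtime count onto the immediate required by _mm256_srli_si256().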
+ switch(N) {
+ case 0: return *this; break;
+ case 1: return {_mm256_srli_si256(u.v256[0], 1)}; break;
+ case 2: return {_mm256_srli_si256(u.v256[0], 2)}; break;
+ case 3: return {_mm256_srli_si256(u.v256[0], 3)}; break;
+ case 4: return {_mm256_srli_si256(u.v256[0], 4)}; break;
+ case 5: return {_mm256_srli_si256(u.v256[0], 5)}; break;
+ case 6: return {_mm256_srli_si256(u.v256[0], 6)}; break;
+ case 7: return {_mm256_srli_si256(u.v256[0], 7)}; break;
+ case 8: return {_mm256_srli_si256(u.v256[0], 8)}; break;
+ case 9: return {_mm256_srli_si256(u.v256[0], 9)}; break;
+ case 10: return {_mm256_srli_si256(u.v256[0], 10)}; break;
+ case 11: return {_mm256_srli_si256(u.v256[0], 11)}; break;
+ case 12: return {_mm256_srli_si256(u.v256[0], 12)}; break;
+ case 13: return {_mm256_srli_si256(u.v256[0], 13)}; break;
+ case 14: return {_mm256_srli_si256(u.v256[0], 14)}; break;
+ case 15: return {_mm256_srli_si256(u.v256[0], 15)}; break;
+ case 16: return {_mm256_srli_si256(u.v256[0], 16)}; break;
+ case 17: return {_mm256_srli_si256(u.v256[0], 17)}; break;
+ case 18: return {_mm256_srli_si256(u.v256[0], 18)}; break;
+ case 19: return {_mm256_srli_si256(u.v256[0], 19)}; break;
+ case 20: return {_mm256_srli_si256(u.v256[0], 20)}; break;
+ case 21: return {_mm256_srli_si256(u.v256[0], 21)}; break;
+ case 22: return {_mm256_srli_si256(u.v256[0], 22)}; break;
+ case 23: return {_mm256_srli_si256(u.v256[0], 23)}; break;
+ case 24: return {_mm256_srli_si256(u.v256[0], 24)}; break;
+ case 25: return {_mm256_srli_si256(u.v256[0], 25)}; break;
+ case 26: return {_mm256_srli_si256(u.v256[0], 26)}; break;
+ case 27: return {_mm256_srli_si256(u.v256[0], 27)}; break;
+ case 28: return {_mm256_srli_si256(u.v256[0], 28)}; break;
+ case 29: return {_mm256_srli_si256(u.v256[0], 29)}; break;
+ case 30: return {_mm256_srli_si256(u.v256[0], 30)}; break;
+ case 31: return {_mm256_srli_si256(u.v256[0], 31)}; break;
default: break;
}
return *this;
TEST_LSHIFT256(buf, vec, SP, 14);
TEST_LSHIFT256(buf, vec, SP, 15);
TEST_LSHIFT256(buf, vec, SP, 16);
+ TEST_LSHIFT256(buf, vec, SP, 17);
+ TEST_LSHIFT256(buf, vec, SP, 18);
+ TEST_LSHIFT256(buf, vec, SP, 19);
+ TEST_LSHIFT256(buf, vec, SP, 20);
+ TEST_LSHIFT256(buf, vec, SP, 21);
+ TEST_LSHIFT256(buf, vec, SP, 22);
+ TEST_LSHIFT256(buf, vec, SP, 23);
+ TEST_LSHIFT256(buf, vec, SP, 24);
+ TEST_LSHIFT256(buf, vec, SP, 25);
+ TEST_LSHIFT256(buf, vec, SP, 26);
+ TEST_LSHIFT256(buf, vec, SP, 27);
+ TEST_LSHIFT256(buf, vec, SP, 28);
+ TEST_LSHIFT256(buf, vec, SP, 29);
+ TEST_LSHIFT256(buf, vec, SP, 30);
+ TEST_LSHIFT256(buf, vec, SP, 31);
+ TEST_LSHIFT256(buf, vec, SP, 32);
}
/*
TEST_RSHIFT256(buf, vec, SP, 14);
TEST_RSHIFT256(buf, vec, SP, 15);
TEST_RSHIFT256(buf, vec, SP, 16);
+ TEST_RSHIFT256(buf, vec, SP, 17);
+ TEST_RSHIFT256(buf, vec, SP, 18);
+ TEST_RSHIFT256(buf, vec, SP, 19);
+ TEST_RSHIFT256(buf, vec, SP, 20);
+ TEST_RSHIFT256(buf, vec, SP, 21);
+ TEST_RSHIFT256(buf, vec, SP, 22);
+ TEST_RSHIFT256(buf, vec, SP, 23);
+ TEST_RSHIFT256(buf, vec, SP, 24);
+ TEST_RSHIFT256(buf, vec, SP, 25);
+ TEST_RSHIFT256(buf, vec, SP, 26);
+ TEST_RSHIFT256(buf, vec, SP, 27);
+ TEST_RSHIFT256(buf, vec, SP, 28);
+ TEST_RSHIFT256(buf, vec, SP, 29);
+ TEST_RSHIFT256(buf, vec, SP, 30);
+ TEST_RSHIFT256(buf, vec, SP, 31);
+ TEST_RSHIFT256(buf, vec, SP, 32);
}
/*
#define TEST_ALIGNR256(v1, v2, buf, l) { \
auto v_aligned = v2.alignr(v1, l); \
+ v_aligned.print8("v_aligned");\
for (size_t i=0; i<32; i++) { \
+ printf("vec[%ld] = %02x\n", i+l, vec[i+l]);\
ASSERT_EQ(v_aligned.u.u8[i], vec[i + l]); \
} \
}
TEST_ALIGNR256(SP1, SP2, vec, 14);
TEST_ALIGNR256(SP1, SP2, vec, 15);
TEST_ALIGNR256(SP1, SP2, vec, 16);
+ TEST_ALIGNR256(SP1, SP2, vec, 17);
+ TEST_ALIGNR256(SP1, SP2, vec, 18);
+ TEST_ALIGNR256(SP1, SP2, vec, 19);
+ TEST_ALIGNR256(SP1, SP2, vec, 20);
+ TEST_ALIGNR256(SP1, SP2, vec, 21);
+ TEST_ALIGNR256(SP1, SP2, vec, 22);
+ TEST_ALIGNR256(SP1, SP2, vec, 23);
+ TEST_ALIGNR256(SP1, SP2, vec, 24);
+ TEST_ALIGNR256(SP1, SP2, vec, 25);
+ TEST_ALIGNR256(SP1, SP2, vec, 26);
+ TEST_ALIGNR256(SP1, SP2, vec, 27);
+ TEST_ALIGNR256(SP1, SP2, vec, 28);
+ TEST_ALIGNR256(SP1, SP2, vec, 29);
+ TEST_ALIGNR256(SP1, SP2, vec, 30);
+ TEST_ALIGNR256(SP1, SP2, vec, 31);
+ TEST_ALIGNR256(SP1, SP2, vec, 32);
}
*/