u32 tmp = lv_u16(z->start + z->shift - 1, z->buf, z->end + 1);
tmp &= fdr->domainMask;
s = *((const m128 *)ft + tmp);
- s = shiftRight8Bits(s);
+ s = rshiftbyte_m128(s, 1);
} else {
s = fdr->start;
}
m128 st14 = *(const m128 *)(ft + v14*8);
m128 st15 = *(const m128 *)(ft + v15*8);
- st1 = byteShiftLeft128(st1, 1);
- st2 = byteShiftLeft128(st2, 2);
- st3 = byteShiftLeft128(st3, 3);
- st4 = byteShiftLeft128(st4, 4);
- st5 = byteShiftLeft128(st5, 5);
- st6 = byteShiftLeft128(st6, 6);
- st7 = byteShiftLeft128(st7, 7);
- st9 = byteShiftLeft128(st9, 1);
- st10 = byteShiftLeft128(st10, 2);
- st11 = byteShiftLeft128(st11, 3);
- st12 = byteShiftLeft128(st12, 4);
- st13 = byteShiftLeft128(st13, 5);
- st14 = byteShiftLeft128(st14, 6);
- st15 = byteShiftLeft128(st15, 7);
+ st1 = lshiftbyte_m128(st1, 1);
+ st2 = lshiftbyte_m128(st2, 2);
+ st3 = lshiftbyte_m128(st3, 3);
+ st4 = lshiftbyte_m128(st4, 4);
+ st5 = lshiftbyte_m128(st5, 5);
+ st6 = lshiftbyte_m128(st6, 6);
+ st7 = lshiftbyte_m128(st7, 7);
+ st9 = lshiftbyte_m128(st9, 1);
+ st10 = lshiftbyte_m128(st10, 2);
+ st11 = lshiftbyte_m128(st11, 3);
+ st12 = lshiftbyte_m128(st12, 4);
+ st13 = lshiftbyte_m128(st13, 5);
+ st14 = lshiftbyte_m128(st14, 6);
+ st15 = lshiftbyte_m128(st15, 7);
*s = or128(*s, st0);
*s = or128(*s, st1);
*s = or128(*s, st6);
*s = or128(*s, st7);
*conf0 = movq(*s);
- *s = byteShiftRight128(*s, 8);
+ *s = rshiftbyte_m128(*s, 8);
*conf0 ^= ~0ULL;
*s = or128(*s, st8);
*s = or128(*s, st14);
*s = or128(*s, st15);
*conf8 = movq(*s);
- *s = byteShiftRight128(*s, 8);
+ *s = rshiftbyte_m128(*s, 8);
*conf8 ^= ~0ULL;
}
m128 st12 = *(const m128 *)(ft + v12*8);
m128 st14 = *(const m128 *)(ft + v14*8);
- st2 = byteShiftLeft128(st2, 2);
- st4 = byteShiftLeft128(st4, 4);
- st6 = byteShiftLeft128(st6, 6);
- st10 = byteShiftLeft128(st10, 2);
- st12 = byteShiftLeft128(st12, 4);
- st14 = byteShiftLeft128(st14, 6);
+ st2 = lshiftbyte_m128(st2, 2);
+ st4 = lshiftbyte_m128(st4, 4);
+ st6 = lshiftbyte_m128(st6, 6);
+ st10 = lshiftbyte_m128(st10, 2);
+ st12 = lshiftbyte_m128(st12, 4);
+ st14 = lshiftbyte_m128(st14, 6);
*s = or128(*s, st0);
*s = or128(*s, st2);
*s = or128(*s, st4);
*s = or128(*s, st6);
*conf0 = movq(*s);
- *s = byteShiftRight128(*s, 8);
+ *s = rshiftbyte_m128(*s, 8);
*conf0 ^= ~0ULL;
*s = or128(*s, st8);
*s = or128(*s, st12);
*s = or128(*s, st14);
*conf8 = movq(*s);
- *s = byteShiftRight128(*s, 8);
+ *s = rshiftbyte_m128(*s, 8);
*conf8 ^= ~0ULL;
}
m128 st8 = *(const m128 *)(ft + v8*8);
m128 st12 = *(const m128 *)(ft + v12*8);
- st4 = byteShiftLeft128(st4, 4);
- st12 = byteShiftLeft128(st12, 4);
+ st4 = lshiftbyte_m128(st4, 4);
+ st12 = lshiftbyte_m128(st12, 4);
*s = or128(*s, st0);
*s = or128(*s, st4);
*conf0 = movq(*s);
- *s = byteShiftRight128(*s, 8);
+ *s = rshiftbyte_m128(*s, 8);
*conf0 ^= ~0ULL;
*s = or128(*s, st8);
*s = or128(*s, st12);
*conf8 = movq(*s);
- *s = byteShiftRight128(*s, 8);
+ *s = rshiftbyte_m128(*s, 8);
*conf8 ^= ~0ULL;
}
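/*
 * Sketch of the confirm-extraction pattern shared by the variants above
 * (illustrative, not part of the change): the per-position state vectors st*
 * are byte-shifted into alignment and OR'd into *s, then the two halves of
 * the 128-bit state are pulled out 64 bits at a time:
 *
 *     *conf0 = movq(*s);            // low 8 bytes -> positions 0..7
 *     *s = rshiftbyte_m128(*s, 8);  // move the high 8 bytes down
 *     *conf0 ^= ~0ULL;              // flip all bits before confirmation
 *     *conf8 = movq(*s);            // remaining 8 bytes -> positions 8..15
 *     *conf8 ^= ~0ULL;
 */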
do { \
if (unlikely(isnonzero128(var))) { \
u64a lo = movq(var); \
- u64a hi = movq(byteShiftRight128(var, 8)); \
+ u64a hi = movq(rshiftbyte_m128(var, 8)); \
if (unlikely(lo)) { \
conf_fn(&lo, bucket, offset, confBase, reason, a, ptr, \
control, &last_match); \
do { \
if (unlikely(isnonzero128(var))) { \
u32 part1 = movd(var); \
- u32 part2 = movd(byteShiftRight128(var, 4)); \
- u32 part3 = movd(byteShiftRight128(var, 8)); \
- u32 part4 = movd(byteShiftRight128(var, 12)); \
+ u32 part2 = movd(rshiftbyte_m128(var, 4)); \
+ u32 part3 = movd(rshiftbyte_m128(var, 8)); \
+ u32 part4 = movd(rshiftbyte_m128(var, 12)); \
if (unlikely(part1)) { \
conf_fn(&part1, bucket, offset, confBase, reason, a, ptr, \
control, &last_match); \
m128 prep_conf_teddy_m1(const m128 *maskBase, m128 p_mask, m128 val) {
m128 mask = set16x8(0xf);
m128 lo = and128(val, mask);
- m128 hi = and128(rshift2x64(val, 4), mask);
+ m128 hi = and128(rshift64_m128(val, 4), mask);
return and128(and128(pshufb(maskBase[0*2], lo),
pshufb(maskBase[0*2+1], hi)), p_mask);
}
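/*
 * Nibble-split sketch (scalar equivalent for a single byte b of 'val'; not
 * part of the change):
 *     lo_nibble = b & 0xf;         // indexes pshufb(maskBase[0], ...)
 *     hi_nibble = (b >> 4) & 0xf;  // indexes pshufb(maskBase[1], ...)
 * rshift64_m128(val, 4) shifts whole 64-bit lanes, so any bits carried in
 * from the neighbouring byte are discarded by the following and128 with 0xf.
 */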
m128 val) {
m128 mask = set16x8(0xf);
m128 lo = and128(val, mask);
- m128 hi = and128(rshift2x64(val, 4), mask);
+ m128 hi = and128(rshift64_m128(val, 4), mask);
m128 r = prep_conf_teddy_m1(maskBase, p_mask, val);
m128 res_1 = and128(pshufb(maskBase[1*2], lo),
m128 p_mask, m128 val) {
m128 mask = set16x8(0xf);
m128 lo = and128(val, mask);
- m128 hi = and128(rshift2x64(val, 4), mask);
+ m128 hi = and128(rshift64_m128(val, 4), mask);
m128 r = prep_conf_teddy_m2(maskBase, old_1, p_mask, val);
m128 res_2 = and128(pshufb(maskBase[2*2], lo),
m128 *old_3, m128 p_mask, m128 val) {
m128 mask = set16x8(0xf);
m128 lo = and128(val, mask);
- m128 hi = and128(rshift2x64(val, 4), mask);
+ m128 hi = and128(rshift64_m128(val, 4), mask);
m128 r = prep_conf_teddy_m3(maskBase, old_1, old_2, p_mask, val);
m128 res_3 = and128(pshufb(maskBase[3*2], lo),
64 * (offset);
*arrCnt += 1;
}
- u64a part_1 = movq(byteShiftRight128(var, 8));
+ u64a part_1 = movq(rshiftbyte_m128(var, 8));
while (unlikely(part_1)) {
bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&part_1) +
64 * (offset + 1);
32 * (offset * 2);
*arrCnt += 1;
}
- u32 part_1 = movd(byteShiftRight128(var, 4));
+ u32 part_1 = movd(rshiftbyte_m128(var, 4));
while (unlikely(part_1)) {
bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&part_1) +
32 * (offset * 2 + 1);
*arrCnt += 1;
}
- u32 part_2 = movd(byteShiftRight128(var, 8));
+ u32 part_2 = movd(rshiftbyte_m128(var, 8));
while (unlikely(part_2)) {
bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&part_2) +
32 * (offset * 2 + 2);
*arrCnt += 1;
}
- u32 part_3 = movd(byteShiftRight128(var, 12));
+ u32 part_3 = movd(rshiftbyte_m128(var, 12));
while (unlikely(part_3)) {
bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&part_3) +
32 * (offset * 2 + 3);
m256 prep_conf_fat_teddy_m1(const m256 *maskBase, m256 p_mask, m256 val) {
m256 mask = set32x8(0xf);
m256 lo = and256(val, mask);
- m256 hi = and256(rshift4x64(val, 4), mask);
+ m256 hi = and256(rshift64_m256(val, 4), mask);
return and256(and256(vpshufb(maskBase[0*2], lo),
vpshufb(maskBase[0*2+1], hi)), p_mask);
}
m256 val) {
m256 mask = set32x8(0xf);
m256 lo = and256(val, mask);
- m256 hi = and256(rshift4x64(val, 4), mask);
+ m256 hi = and256(rshift64_m256(val, 4), mask);
m256 r = prep_conf_fat_teddy_m1(maskBase, p_mask, val);
m256 res_1 = and256(vpshufb(maskBase[1*2], lo),
m256 p_mask, m256 val) {
m256 mask = set32x8(0xf);
m256 lo = and256(val, mask);
- m256 hi = and256(rshift4x64(val, 4), mask);
+ m256 hi = and256(rshift64_m256(val, 4), mask);
m256 r = prep_conf_fat_teddy_m2(maskBase, old_1, p_mask, val);
m256 res_2 = and256(vpshufb(maskBase[2*2], lo),
m256 *old_3, m256 p_mask, m256 val) {
m256 mask = set32x8(0xf);
m256 lo = and256(val, mask);
- m256 hi = and256(rshift4x64(val, 4), mask);
+ m256 hi = and256(rshift64_m256(val, 4), mask);
m256 r = prep_conf_fat_teddy_m3(maskBase, old_1, old_2, p_mask, val);
m256 res_3 = and256(vpshufb(maskBase[3*2], lo),
m256 prep_conf_fast_teddy_m1(m256 val, m256 mask, m256 maskLo, m256 maskHi,
m256 p_mask) {
m256 lo = and256(val, mask);
- m256 hi = and256(rshift4x64(val, 4), mask);
+ m256 hi = and256(rshift64_m256(val, 4), mask);
m256 res = and256(vpshufb(maskLo, lo), vpshufb(maskHi, hi));
return and256(res, p_mask);
}
v = and128(v, caseMask);
}
- u32 z = movemask128(and128(shiftLeft8Bits(eq128(mask1, v)), eq128(mask2, v)));
+ u32 z = movemask128(and128(lshiftbyte_m128(eq128(mask1, v), 1),
+ eq128(mask2, v)));
// mask out where we can't match
u32 mask = (0xFFFF >> (16 - l));
v = and128(v, caseMask);
}
- u32 z = movemask128(and128(shiftLeft8Bits(eq128(mask1, v)), eq128(mask2, v)));
+ u32 z = movemask128(and128(lshiftbyte_m128(eq128(mask1, v), 1),
+ eq128(mask2, v)));
// mask out where we can't match
u32 buf_off = start - offset;
// Shift macros for Limited NFAs. Defined in terms of uniform ops.
// LimExNFAxxx ptr in 'limex' and the current state in 's'
#define NFA_EXEC_LIM_SHIFT(nels_type, nels_i) \
- (JOIN(shift_, nels_type)( \
+ (JOIN(lshift_, nels_type)( \
JOIN(and_, nels_type)(s, \
JOIN(load_, nels_type)(&limex->shift[nels_i])), \
limex->shiftAmount[nels_i]))
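/*
 * Expansion sketch (illustrative): with nels_type == m128 and nels_i == 0,
 * NFA_EXEC_LIM_SHIFT pastes together
 *     lshift_m128(and_m128(s, load_m128(&limex->shift[0])),
 *                 limex->shiftAmount[0])
 * i.e. a per-64-bit-lane left shift (lshift64_m128 under the uniform macros
 * renamed below) of the masked state.
 */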
#include "shufti_common.h"
-
/** \brief Naive byte-by-byte implementation. */
static really_inline
const u8 *shuftiRevSlow(const u8 *lo, const u8 *hi, const u8 *buf,
m128 c2_lo = pshufb(mask2_lo, chars_lo);
m128 c2_hi = pshufb(mask2_hi, chars_hi);
- m128 t2 = or128(t, shiftRight8Bits(or128(c2_lo, c2_hi)));
+ m128 t2 = or128(t, rshiftbyte_m128(or128(c2_lo, c2_hi), 1));
#ifdef DEBUG
DEBUG_PRINTF(" c2_lo: "); dumpMsk128(c2_lo); printf("\n");
m256 c2_lo = vpshufb(mask2_lo, chars_lo);
m256 c2_hi = vpshufb(mask2_hi, chars_hi);
- m256 t2 = or256(t, shift256Right8Bits(or256(c2_lo, c2_hi)));
+ m256 t2 = or256(t, rshift128_m256(or256(c2_lo, c2_hi), 1));
#ifdef DEBUG
DEBUG_PRINTF(" c2_lo: "); dumpMsk256(c2_lo); printf("\n");
#endif
#define GET_LO_4(chars) and128(chars, low4bits)
-#define GET_HI_4(chars) rshift2x64(andnot128(low4bits, chars), 4)
+#define GET_HI_4(chars) rshift64_m128(andnot128(low4bits, chars), 4)
static really_inline
u32 block(m128 mask_lo, m128 mask_hi, m128 chars, const m128 low4bits,
#endif
#define GET_LO_4(chars) and256(chars, low4bits)
-#define GET_HI_4(chars) rshift4x64(andnot256(low4bits, chars), 4)
+#define GET_HI_4(chars) rshift64_m256(andnot256(low4bits, chars), 4)
static really_inline
u32 block(m256 mask_lo, m256 mask_hi, m256 chars, const m256 low4bits,
return NULL; // no match
}
-#define shift128r(a, b) _mm_srli_epi64((a), (b))
static really_inline
u32 block(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, m128 v) {
m128 shuf1 = pshufb(shuf_mask_lo_highclear, v);
m128 t1 = xor128(v, highconst);
m128 shuf2 = pshufb(shuf_mask_lo_highset, t1);
- m128 t2 = andnot128(highconst, shift128r(v, 4));
+ m128 t2 = andnot128(highconst, rshift64_m128(v, 4));
m128 shuf3 = pshufb(shuf_mask_hi, t2);
m128 tmp = and128(or128(shuf1, shuf2), shuf3);
m128 tmp2 = eq128(tmp, zeroes128());
return NULL; // no match
}
-#define shift256r(a, b) _mm256_srli_epi64((a), (b))
static really_inline
u32 block(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset, m256 v) {
m256 shuf1 = vpshufb(shuf_mask_lo_highclear, v);
m256 t1 = xor256(v, highconst);
m256 shuf2 = vpshufb(shuf_mask_lo_highset, t1);
- m256 t2 = andnot256(highconst, shift256r(v, 4));
+ m256 t2 = andnot256(highconst, rshift64_m256(v, 4));
m256 shuf3 = vpshufb(shuf_mask_hi, t2);
m256 tmp = and256(or256(shuf1, shuf2), shuf3);
m256 tmp2 = eq256(tmp, zeroes256());
for (; buf + 16 < buf_end; buf += 16) {
m128 data = load128(buf);
u32 z = movemask128(and128(eq128(chars1, data),
- shiftRight8Bits(eq128(chars2, data))));
+ rshiftbyte_m128(eq128(chars2, data), 1)));
if (buf[15] == c1 && buf[16] == c2) {
z |= (1 << 15);
}
m128 data = load128(buf);
m128 v = and128(casemask, data);
u32 z = movemask128(and128(eq128(chars1, v),
- shiftRight8Bits(eq128(chars2, v))));
+ rshiftbyte_m128(eq128(chars2, v), 1)));
if ((buf[15] & CASE_CLEAR) == c1 && (buf[16] & CASE_CLEAR) == c2) {
z |= (1 << 15);
}
for (; buf + 16 < buf_end; buf += 16) {
m128 data = load128(buf);
- u32 z = movemask128(and128(eq128(chars1, and128(data, mask1)),
- shiftRight8Bits(eq128(chars2, and128(data, mask2)))));
+ m128 v1 = eq128(chars1, and128(data, mask1));
+ m128 v2 = eq128(chars2, and128(data, mask2));
+ u32 z = movemask128(and128(v1, rshiftbyte_m128(v2, 1)));
+
if ((buf[15] & m1) == c1 && (buf[16] & m2) == c2) {
z |= (1 << 15);
}
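/*
 * Pair-match sketch: eq128(chars1, ...) flags bytes equal to the first
 * character and eq128(chars2, ...) flags bytes equal to the second;
 * rshiftbyte_m128(..., 1) moves the "second char at i+1" flags down to lane
 * i, so each set bit of z marks a position where the two-byte pair starts.
 * Byte 15's partner lives in the next block, hence the scalar buf[15]/buf[16]
 * fixup above. The reverse scanners below use lshiftbyte_m128 for the same
 * reason in the other direction.
 */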
const u8 *dvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) {
m128 data = loadu128(buf); // unaligned
u32 z = movemask128(and128(eq128(chars1, data),
- shiftRight8Bits(eq128(chars2, data))));
+ rshiftbyte_m128(eq128(chars2, data), 1)));
/* no fixup of the boundary required - the aligned run will pick it up */
if (unlikely(z)) {
m128 data = loadu128(buf); // unaligned
m128 v = and128(casemask, data);
u32 z = movemask128(and128(eq128(chars1, v),
- shiftRight8Bits(eq128(chars2, v))));
+ rshiftbyte_m128(eq128(chars2, v), 1)));
/* no fixup of the boundary required - the aligned run will pick it up */
if (unlikely(z)) {
const u8 *dvermPreconditionMasked(m128 chars1, m128 chars2,
m128 mask1, m128 mask2, const u8 *buf) {
m128 data = loadu128(buf); // unaligned
- u32 z = movemask128(and128(eq128(chars1, and128(data, mask1)),
- shiftRight8Bits(eq128(chars2, and128(data, mask2)))));
+ m128 v1 = eq128(chars1, and128(data, mask1));
+ m128 v2 = eq128(chars2, and128(data, mask2));
+ u32 z = movemask128(and128(v1, rshiftbyte_m128(v2, 1)));
/* no fixup of the boundary required - the aligned run will pick it up */
if (unlikely(z)) {
for (; buf + 16 < buf_end; buf_end -= 16) {
m128 data = load128(buf_end - 16);
u32 z = movemask128(and128(eq128(chars2, data),
- shiftLeft8Bits(eq128(chars1, data))));
+ lshiftbyte_m128(eq128(chars1, data), 1)));
if (buf_end[-17] == c1 && buf_end[-16] == c2) {
z |= 1;
}
m128 data = load128(buf_end - 16);
m128 v = and128(casemask, data);
u32 z = movemask128(and128(eq128(chars2, v),
- shiftLeft8Bits(eq128(chars1, v))));
+ lshiftbyte_m128(eq128(chars1, v), 1)));
if ((buf_end[-17] & CASE_CLEAR) == c1
&& (buf_end[-16] & CASE_CLEAR) == c2) {
z |= 1;
const u8 *rdvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) {
m128 data = loadu128(buf);
u32 z = movemask128(and128(eq128(chars2, data),
- shiftLeft8Bits(eq128(chars1, data))));
+ lshiftbyte_m128(eq128(chars1, data), 1)));
/* no fixup of the boundary required - the aligned run will pick it up */
if (unlikely(z)) {
m128 data = loadu128(buf);
m128 v = and128(casemask, data);
u32 z = movemask128(and128(eq128(chars2, v),
- shiftLeft8Bits(eq128(chars1, v))));
+ lshiftbyte_m128(eq128(chars1, v), 1)));
/* no fixup of the boundary required - the aligned run will pick it up */
if (unlikely(z)) {
return lastMatchOffset(buf + 16, z);
}
#define GET_LO_4(chars) and128(chars, low4bits)
-#define GET_HI_4(chars) rshift2x64(andnot128(low4bits, chars), 4)
+#define GET_HI_4(chars) rshift64_m128(andnot128(low4bits, chars), 4)
static really_inline
u32 roseCountingMiracleScanShufti(m128 mask_lo, m128 mask_hi, u8 poison,
#endif
}
-#define shift2x64(a, b) _mm_slli_epi64((a), (b))
-#define rshift2x64(a, b) _mm_srli_epi64((a), (b))
+#define lshift64_m128(a, b) _mm_slli_epi64((a), (b))
+#define rshift64_m128(a, b) _mm_srli_epi64((a), (b))
#define eq128(a, b) _mm_cmpeq_epi8((a), (b))
#define movemask128(a) ((u32)_mm_movemask_epi8((a)))
#endif
}
-static really_inline m128 shiftRight8Bits(m128 a) {
- return _mm_srli_si128(a,1);
-}
-
-static really_inline m128 shiftLeft8Bits(m128 a) {
- return _mm_slli_si128(a,1);
-}
-
-#define byteShiftRight128(a, count_immed) _mm_srli_si128(a, count_immed)
-#define byteShiftLeft128(a, count_immed) _mm_slli_si128(a, count_immed)
+#define rshiftbyte_m128(a, count_immed) _mm_srli_si128(a, count_immed)
+#define lshiftbyte_m128(a, count_immed) _mm_slli_si128(a, count_immed)
#if !defined(__AVX2__)
// TODO: this entire file needs restructuring - this carveout is awful
#define extract32from256(a, imm) _mm_extract_epi32((imm >> 2) ? a.hi : a.lo, imm % 4)
#define extract64from256(a, imm) _mm_extract_epi64((imm >> 2) ? a.hi : a.lo, imm % 2)
#else
-#define extract32from256(a, imm) movd(byteShiftRight128((imm >> 2) ? a.hi : a.lo, (imm % 4) * 8))
-#define extract64from256(a, imm) movq(byteShiftRight128((imm >> 2) ? a.hi : a.lo, (imm % 2) * 8))
+#define extract32from256(a, imm) movd(_mm_srli_si128((imm >> 2) ? a.hi : a.lo, (imm % 4) * 8))
+#define extract64from256(a, imm) movq(_mm_srli_si128((imm >> 2) ? a.hi : a.lo, (imm % 2) * 8))
#endif
#endif // !AVX2
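/*
 * Naming sketch after the rename (illustrative values, not part of the
 * change): rshiftbyte_m128/lshiftbyte_m128 shift the whole 128-bit register
 * by an immediate number of bytes, while rshift64_m128/lshift64_m128 shift
 * bits within each 64-bit lane.
 *
 *     m128 x = set16x8(0x80);    // every byte 0x80
 *     rshift64_m128(x, 4);       // lane-wise bit shift  -> every byte 0x08
 *     rshiftbyte_m128(x, 1);     // register byte shift  -> top byte becomes 0
 */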
return _mm_andnot_si128(a, b);
}
-// The shift amount is an immediate, so we define these operations as macros on
-// Intel SIMD.
-#define shift128(a, b) _mm_slli_epi64((a), (b))
-
// aligned load
static really_inline m128 load128(const void *ptr) {
assert(ISALIGNED_N(ptr, alignof(m128)));
****/
#if defined(__AVX2__)
-#define shift4x64(a, b) _mm256_slli_epi64((a), (b))
-#define rshift4x64(a, b) _mm256_srli_epi64((a), (b))
+#define lshift64_m256(a, b) _mm256_slli_epi64((a), (b))
+#define rshift64_m256(a, b) _mm256_srli_epi64((a), (b))
static really_inline
m256 set32x8(u32 in) {
#else
static really_inline
-m256 shift4x64(m256 a, int b) {
+m256 lshift64_m256(m256 a, int b) {
m256 rv = a;
- rv.lo = shift2x64(rv.lo, b);
- rv.hi = shift2x64(rv.hi, b);
+ rv.lo = lshift64_m128(rv.lo, b);
+ rv.hi = lshift64_m128(rv.hi, b);
return rv;
}
static really_inline
-m256 rshift4x64(m256 a, int b) {
+m256 rshift64_m256(m256 a, int b) {
m256 rv = a;
- rv.lo = rshift2x64(rv.lo, b);
- rv.hi = rshift2x64(rv.hi, b);
+ rv.lo = rshift64_m128(rv.lo, b);
+ rv.hi = rshift64_m128(rv.hi, b);
return rv;
}
static really_inline
}
#endif
-// The shift amount is an immediate
-#if defined(__AVX2__)
-#define shift256(a, b) _mm256_slli_epi64((a), (b))
-#else
-static really_really_inline m256 shift256(m256 a, unsigned b) {
- m256 rv;
- rv.lo = shift128(a.lo, b);
- rv.hi = shift128(a.hi, b);
- return rv;
-}
-#endif
-
static really_inline int diff256(m256 a, m256 b) {
#if defined(__AVX2__)
return !!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(a, b)) ^ (int)-1);
return _mm256_extracti128_si256(x, 0);
}
-static really_inline
-m256 shift256Right8Bits(m256 a) {
- return _mm256_srli_si256(a, 1);
-}
-
-static really_inline
-m256 shift256Left8Bits(m256 a) {
- return _mm256_slli_si256(a, 1);
-}
#define cast256to128(a) _mm256_castsi256_si128(a)
#define cast128to256(a) _mm256_castsi128_si256(a)
#define swap128in256(a) _mm256_permute4x64_epi64(a, 0x4E)
#define insert128to256(a, b, imm) _mm256_inserti128_si256(a, b, imm)
-#define byteShiftRight256(a, count_immed) _mm256_srli_si256(a, count_immed)
-#define byteShiftLeft256(a, count_immed) _mm256_slli_si256(a, count_immed)
+#define rshift128_m256(a, count_immed) _mm256_srli_si256(a, count_immed)
+#define lshift128_m256(a, count_immed) _mm256_slli_si256(a, count_immed)
#define extract64from256(a, imm) _mm_extract_epi64(_mm256_extracti128_si256(a, imm >> 1), imm % 2)
#define extract32from256(a, imm) _mm_extract_epi32(_mm256_extracti128_si256(a, imm >> 2), imm % 4)
#define extractlow64from256(a) _mm_cvtsi128_si64(cast256to128(a))
}
// The shift amount is an immediate
-static really_really_inline m384 shift384(m384 a, unsigned b) {
+static really_really_inline
+m384 lshift64_m384(m384 a, unsigned b) {
m384 rv;
- rv.lo = shift128(a.lo, b);
- rv.mid = shift128(a.mid, b);
- rv.hi = shift128(a.hi, b);
+ rv.lo = lshift64_m128(a.lo, b);
+ rv.mid = lshift64_m128(a.mid, b);
+ rv.hi = lshift64_m128(a.hi, b);
return rv;
}
}
// The shift amount is an immediate
-static really_really_inline m512 shift512(m512 a, unsigned b) {
+static really_really_inline
+m512 lshift64_m512(m512 a, unsigned b) {
m512 rv;
- rv.lo = shift256(a.lo, b);
- rv.hi = shift256(a.hi, b);
+ rv.lo = lshift64_m256(a.lo, b);
+ rv.hi = lshift64_m256(a.hi, b);
return rv;
}
/*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2016, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
#define andnot_m384(a, b) (andnot384(a, b))
#define andnot_m512(a, b) (andnot512(a, b))
-#define shift_u32(a, b) ((a) << (b))
-#define shift_u64a(a, b) ((a) << (b))
-#define shift_m128(a, b) (shift128(a, b))
-#define shift_m256(a, b) (shift256(a, b))
-#define shift_m384(a, b) (shift384(a, b))
-#define shift_m512(a, b) (shift512(a, b))
+#define lshift_u32(a, b) ((a) << (b))
+#define lshift_u64a(a, b) ((a) << (b))
+#define lshift_m128(a, b) (lshift64_m128(a, b))
+#define lshift_m256(a, b) (lshift64_m256(a, b))
+#define lshift_m384(a, b) (lshift64_m384(a, b))
+#define lshift_m512(a, b) (lshift64_m512(a, b))
#define isZero_u8(a) ((a) == 0)
#define isZero_u32(a) ((a) == 0)
char base[] = "0123456789ABCDEF";
m128 in = loadu128(base);
- EXPECT_TRUE(!diff128(byteShiftRight128(in, 0),
+ EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 0),
variable_byte_shift_m128(in, 0)));
- EXPECT_TRUE(!diff128(byteShiftRight128(in, 1),
+ EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 1),
variable_byte_shift_m128(in, -1)));
- EXPECT_TRUE(!diff128(byteShiftRight128(in, 2),
+ EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 2),
variable_byte_shift_m128(in, -2)));
- EXPECT_TRUE(!diff128(byteShiftRight128(in, 3),
+ EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 3),
variable_byte_shift_m128(in, -3)));
- EXPECT_TRUE(!diff128(byteShiftRight128(in, 4),
+ EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 4),
variable_byte_shift_m128(in, -4)));
- EXPECT_TRUE(!diff128(byteShiftRight128(in, 5),
+ EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 5),
variable_byte_shift_m128(in, -5)));
- EXPECT_TRUE(!diff128(byteShiftRight128(in, 6),
+ EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 6),
variable_byte_shift_m128(in, -6)));
- EXPECT_TRUE(!diff128(byteShiftRight128(in, 7),
+ EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 7),
variable_byte_shift_m128(in, -7)));
- EXPECT_TRUE(!diff128(byteShiftRight128(in, 8),
+ EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 8),
variable_byte_shift_m128(in, -8)));
- EXPECT_TRUE(!diff128(byteShiftRight128(in, 9),
+ EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 9),
variable_byte_shift_m128(in, -9)));
- EXPECT_TRUE(!diff128(byteShiftRight128(in, 10),
+ EXPECT_TRUE(!diff128(rshiftbyte_m128(in, 10),
variable_byte_shift_m128(in, -10)));
- EXPECT_TRUE(!diff128(byteShiftLeft128(in, 0),
+ EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 0),
variable_byte_shift_m128(in, 0)));
- EXPECT_TRUE(!diff128(byteShiftLeft128(in, 1),
+ EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 1),
variable_byte_shift_m128(in, 1)));
- EXPECT_TRUE(!diff128(byteShiftLeft128(in, 2),
+ EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 2),
variable_byte_shift_m128(in, 2)));
- EXPECT_TRUE(!diff128(byteShiftLeft128(in, 3),
+ EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 3),
variable_byte_shift_m128(in, 3)));
- EXPECT_TRUE(!diff128(byteShiftLeft128(in, 4),
+ EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 4),
variable_byte_shift_m128(in, 4)));
- EXPECT_TRUE(!diff128(byteShiftLeft128(in, 5),
+ EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 5),
variable_byte_shift_m128(in, 5)));
- EXPECT_TRUE(!diff128(byteShiftLeft128(in, 6),
+ EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 6),
variable_byte_shift_m128(in, 6)));
- EXPECT_TRUE(!diff128(byteShiftLeft128(in, 7),
+ EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 7),
variable_byte_shift_m128(in, 7)));
- EXPECT_TRUE(!diff128(byteShiftLeft128(in, 8),
+ EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 8),
variable_byte_shift_m128(in, 8)));
- EXPECT_TRUE(!diff128(byteShiftLeft128(in, 9),
+ EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 9),
variable_byte_shift_m128(in, 9)));
- EXPECT_TRUE(!diff128(byteShiftLeft128(in, 10),
+ EXPECT_TRUE(!diff128(lshiftbyte_m128(in, 10),
variable_byte_shift_m128(in, 10)));
EXPECT_TRUE(!diff128(zeroes128(), variable_byte_shift_m128(in, 16)));
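// Taken together, these checks document the semantics relied on above:
// variable_byte_shift_m128(in, n) matches lshiftbyte_m128(in, n) for n >= 0,
// matches rshiftbyte_m128(in, -n) for n < 0, and a shift of 16 bytes yields
// zeroes128().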