/* Count number of codes for each code length. */
static inline void count_lengths(uint16_t *lens, int codes, uint16_t *count) {
int sym;
- static const ALIGNED_(32) uint8_t one[32] = {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ static const ALIGNED_(16) uint8_t one[256] = {
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
};
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
uint8x16_t s1 = vdupq_n_u8(0);
uint8x16_t s2 = vdupq_n_u8(0);
- const uint8_t *p = &one[16];
if (codes & 1) {
- s1 = vld1q_u8(&p[-lens[0]]);
+ s1 = vld1q_u8(&one[16 * lens[0]]);
}
for (sym = codes & 1; sym < codes; sym += 2) {
- s1 = vaddq_u8(s1, vld1q_u8(&p[-lens[sym]]));
- s2 = vaddq_u8(s2, vld1q_u8(&p[-lens[sym+1]]));
+ s1 = vaddq_u8(s1, vld1q_u8(&one[16 * lens[sym]]));
+ s2 = vaddq_u8(s2, vld1q_u8(&one[16 * lens[sym+1]]));
}
vst1q_u16(&count[0], vaddl_u8(vget_low_u8(s1), vget_low_u8(s2)));
__m128i s1 = _mm_setzero_si128();
__m128i s2 = _mm_setzero_si128();
- const uint8_t *p = (uint8_t*)&one[16];
if (codes & 1) {
- s1 = _mm_loadu_si128((const __m128i*)&p[-lens[0]]);
+ s1 = _mm_load_si128((const __m128i*)&one[16 * lens[0]]);
}
for (sym = codes & 1; sym < codes; sym += 2) {
- s1 = _mm_add_epi8(s1, _mm_loadu_si128((const __m128i*)&p[-lens[sym]])); // vaddq_u8
- s2 = _mm_add_epi8(s2, _mm_loadu_si128((const __m128i*)&p[-lens[sym+1]]));
+ s1 = _mm_add_epi8(s1, _mm_load_si128((const __m128i*)&one[16 * lens[sym]])); // vaddq_u8
+ s2 = _mm_add_epi8(s2, _mm_load_si128((const __m128i*)&one[16 * lens[sym+1]]));
}
# if defined(__AVX2__)