Current code aligns to 2x VEC_SIZE. Aligning to 2x has no affect on
performance other than potentially resulting in an additional
iteration of the loop.
1x maintains aligned stores (the only reason to align in this case)
and doesn't incur any unnecessary loop iterations.
Reviewed-by: Sunil K Pandey <skpgkp2@gmail.com>
(cherry picked from commit
9469261cf1924d350feeec64d2c80cafbbdcdd4d)
leaq (VEC_SIZE * 4)(%rax), %LOOP_REG
#endif
/* Align dst for loop. */
- andq $(VEC_SIZE * -2), %LOOP_REG
+ andq $(VEC_SIZE * -1), %LOOP_REG
.p2align 4
L(loop):
VMOVA %VEC(0), LOOP_4X_OFFSET(%LOOP_REG)