# NOTE: residual git-blame table markup from the original extraction removed
# (source revision: commit 265785b7).
#ifdef __APPLE__
#define get_checksum1_avx2 _get_checksum1_avx2   /* Mach-O C symbols carry a leading underscore */
#endif

.intel_syntax noprefix
.text

.p2align 5
.globl get_checksum1_avx2

#-----------------------------------------------------------------------
# get_checksum1_avx2 — AVX2 inner loop of the rolling checksum.
# ABI:   SysV AMD64
# In:    rdi = buf, esi = len, edx = i (current offset),
#        rcx = ps1 (uint32 *), r8 = ps2 (uint32 *)
# Out:   eax = updated offset; *ps1 and *ps2 updated in place.
# Note:  processes 64-byte chunks only while at least 128 bytes remain;
#        the scalar caller finishes the tail.
#-----------------------------------------------------------------------
get_checksum1_avx2:
        vmovd   xmm6, [rcx]             # s1 accumulator seeded with *ps1
        lea     eax, [rsi-128]          # do we still have >= 128 bytes ahead?
        cmp     edx, eax
        jg      .done                   # no: nothing for the vector path to do

        lea     rax, .mul_T2[rip]       # RIP-relative address of the T2
        vmovntdqa ymm7, [rax]           # multiplier table; low 32 weights
        vmovntdqa ymm12, [rax+32]       # and high 32 weights
        vpcmpeqd ymm15, ymm15, ymm15    # all-ones pattern (every element = -1)

#if CHAR_OFFSET != 0
        mov     eax, 32*CHAR_OFFSET     # per-iteration s1 bias: 32 bytes offset
        vmovd   xmm10, eax
        vpbroadcastd ymm10, xmm10
        mov     eax, 528*CHAR_OFFSET    # per-iteration s2 bias
        vmovd   xmm13, eax
        vpbroadcastd ymm13, xmm13
#endif
        vpabsb  ymm15, ymm15            # |-1| per byte -> every byte = 1
        add     rdi, rdx                # rdi = &buf[i]
        vmovdqu ymm2, [rdi]             # prime the pipeline with the
        vmovdqu ymm3, [rdi+32]          # first 64 input bytes
        and     esi, ~63                # round len down to a 64-byte multiple;
                                        # done early so the loop-top alignment
                                        # pad below stays short
        add     edx, esi                # edx = offset to report on exit
        shr     rsi, 6                  # rsi = number of 64-byte chunks
                                        # (64-bit form chosen for encoding size)
        add     rdi, 64                 # rdi now points at the *next* chunk
        vpxor   xmm1, xmm1, xmm1        # clear the s2 partial-sum accumulator
        vpxor   xmm4, xmm4, xmm4        # clear the s1-carry accumulator
        mov     eax, [r8]               # eax = incoming *ps2

.p2align 4                              # keep the loop body in one fetch window
.chunk_loop:
        vpmaddubsw ymm0, ymm15, ymm2    # s1: sum byte pairs (weights all 1)
        vpmaddubsw ymm5, ymm15, ymm3
        vmovdqu ymm8, [rdi]             # prefetch next 64 bytes into regs
        vmovdqu ymm9, [rdi+32]          # while the ALU work proceeds
        add     rdi, 64
        vpaddd  ymm4, ymm4, ymm6        # accumulate s1 history (scaled later)
        vpaddw  ymm5, ymm5, ymm0
        vpsrld  ymm0, ymm5, 16
        vpaddw  ymm5, ymm0, ymm5        # fold word sums into dword lanes
        vpaddd  ymm6, ymm5, ymm6        # s1 += sum of this chunk's bytes
        vpmaddubsw ymm2, ymm7, ymm2     # s2: position-weighted byte sums
        vpmaddubsw ymm3, ymm12, ymm3
        prefetcht0 [rdi+384]            # stay 6 cachelines ahead of the loads
        vpaddw  ymm3, ymm2, ymm3
        vpsrldq ymm2, ymm3, 2
        vpaddd  ymm3, ymm2, ymm3        # fold weighted words into dwords
        vpaddd  ymm1, ymm1, ymm3        # s2 partial sums += this chunk

#if CHAR_OFFSET != 0
        vpaddd  ymm6, ymm10, ymm6       # s1 += 32*CHAR_OFFSET
        vpaddd  ymm1, ymm13, ymm1       # s2 += 528*CHAR_OFFSET
#endif
        vmovdqa ymm2, ymm8              # rotate the preloaded chunk into
        vmovdqa ymm3, ymm9              # place for the next iteration
        sub     esi, 1                  # sub (not dec) avoids partial-flag use
        jnz     .chunk_loop

        # Horizontal reduction of the lane-wise partial sums.
        vpslld  ymm3, ymm4, 6           # carried s1 history contributes *64
        vpsrldq ymm2, ymm6, 4

        vpaddd  ymm0, ymm3, ymm1        # combine s2 partials with s1 carry
        vpaddd  ymm6, ymm2, ymm6
        vpsrlq  ymm3, ymm0, 32

        vpsrldq ymm2, ymm6, 8
        vpaddd  ymm0, ymm3, ymm0
        vpsrldq ymm3, ymm0, 8
        vpaddd  ymm6, ymm2, ymm6
        vpaddd  ymm0, ymm3, ymm0
        vextracti128 xmm2, ymm6, 0x1    # fold upper 128-bit halves down
        vextracti128 xmm1, ymm0, 0x1
        vpaddd  xmm6, xmm2, xmm6
        vmovd   [rcx], xmm6             # store the reduced s1 to *ps1
        vpaddd  xmm1, xmm1, xmm0
        vmovd   ecx, xmm1
        add     eax, ecx                # s2 = old *ps2 + reduced partials
        mov     [r8], eax

.done:
        vzeroupper                      # required before returning to SSE/C code
        mov     eax, edx                # return the new offset
        ret
97 | ||
#ifdef __APPLE__
.data
.align 6                                # Mach-O .align is a power of two: 2^6 = 64
#else
.section .rodata
.p2align 6                              # 64-byte aligned, as vmovntdqa requires
#endif
# T2 multiplier table: the descending byte weights 64..1, consumed 32 at a
# time (ymm7 = first half, ymm12 = second half) by the vpmaddubsw-based
# s2 partial-sum computation above.
.mul_T2:
        .byte   64, 63, 62, 61, 60, 59, 58, 57
        .byte   56, 55, 54, 53, 52, 51, 50, 49
        .byte   48, 47, 46, 45, 44, 43, 42, 41
        .byte   40, 39, 38, 37, 36, 35, 34, 33
        .byte   32, 31, 30, 29, 28, 27, 26, 25
        .byte   24, 23, 22, 21, 20, 19, 18, 17
        .byte   16, 15, 14, 13, 12, 11, 10,  9
        .byte    8,  7,  6,  5,  4,  3,  2,  1