# x86-64 AVX2 rolling-checksum routine (GNU as, Intel syntax).
#include "config.h"

#ifdef USE_ROLL_ASM /* { */

#define CHAR_OFFSET 0 /* Keep this the same as rsync.h, which isn't likely to change. */

/* Mach-O C symbols carry a leading underscore; export the mangled name
 * on Apple platforms so the C caller's `get_checksum1_avx2_asm` links. */
#ifdef __APPLE__
#define get_checksum1_avx2_asm _get_checksum1_avx2_asm
#endif

.intel_syntax noprefix
.text

.p2align 5                              # 32-byte align the entry point.
.globl get_checksum1_avx2_asm

# Register roles on entry (SysV AMD64 integer-arg order):
# rdi=*buf, esi=len, edx=i, rcx= *ps1, r8= *ps2
#-----------------------------------------------------------------------
# get_checksum1_avx2_asm
# ABI:   SysV AMD64
# In:    rdi = buf, esi = len, edx = i, rcx = ps1, r8 = ps2
# Out:   eax = updated index (final edx); *ps1 and *ps2 updated in
#        memory only when the fast path ran (>= 128 bytes available).
# Clobb: rax, rcx, rsi, rdi, ymm0-ymm15, flags (all volatile under
#        SysV; vzeroupper is issued before returning to SSE/C code).
#
# Processes 64 bytes per iteration, keeping SIMD partial sums of the
# two rolling-checksum halves: s1 (plain byte sum, via the all-ones
# multiplier in ymm15) and s2 (position-weighted sum, via the 64..1
# weights in .mul_T2), then reduces them to scalars at the end.
#-----------------------------------------------------------------------
get_checksum1_avx2_asm:
	vmovd	xmm6, [rcx]             # load *ps1 into lane 0; s1 accumulator.
	lea	eax, [rsi-128]          # at least 128 bytes to process?
	cmp	edx, eax
	jg	.exit                   # too short: return i unchanged, no stores.
	lea	rax, .mul_T2[rip]       # RIP-relative; PIC-safe table address.
	vmovntdqa ymm7, [rax]           # load T2 multiplication constants
	vmovntdqa ymm12, [rax+32]       # from memory (requires 64-byte aligned table).
	vpcmpeqd ymm15, ymm15, ymm15    # set all elements to -1.

#if CHAR_OFFSET != 0
	# Per-iteration bias terms: 64 bytes each add 32*CHAR_OFFSET to the
	# paired s1 words and 528*CHAR_OFFSET to the weighted s2 sums.
	mov	eax, 32*CHAR_OFFSET
	vmovd	xmm10, eax
	vpbroadcastd ymm10, xmm10
	mov	eax, 528*CHAR_OFFSET
	vmovd	xmm13, eax
	vpbroadcastd ymm13, xmm13
#endif
	vpabsb	ymm15, ymm15            # set all byte size elements to 1 (|-1| = 1).
	add	rdi, rdx                # rdi = &buf[i]
	vmovdqu	ymm2, [rdi]             # preload the first 64 bytes.
	vmovdqu	ymm3, [rdi+32]
	and	esi, ~63                # only needed during final reduction,
	                                # done here to avoid a longer nop for
	                                # alignment below.
	add	edx, esi                # edx = return index after the loop.
	                                # NOTE(review): chunk count derives from
	                                # len, not len-i — confirm callers pass a
	                                # window where this is intended.
	shr	rsi, 6                  # rsi = number of 64-byte chunks
	                                # (longer opcode for alignment).
	add	rdi, 64
	vpxor	xmm1, xmm1, xmm1        # reset both partial sums accumulators:
	vpxor	xmm4, xmm4, xmm4        # ymm1 = s2 parts, ymm4 = running s1 sums.
	mov	eax, [r8]               # eax = *ps2, updated scalar at the end.
	.p2align 4                      # should fit into the LSD allocation queue.
.loop:
	vpmaddubsw ymm0, ymm15, ymm2    # s1 partial sums: pairwise byte adds
	vpmaddubsw ymm5, ymm15, ymm3    # (multiplier is all-ones bytes).
	vmovdqu	ymm8, [rdi]             # preload the next
	vmovdqu	ymm9, [rdi+32]          # 64 bytes.
	add	rdi, 64
	vpaddd	ymm4, ymm4, ymm6        # accumulate prefix s1 (feeds s2 * 64 later).
	vpaddw	ymm5, ymm5, ymm0
	vpsrld	ymm0, ymm5, 16          # fold high word onto low word
	vpaddw	ymm5, ymm0, ymm5        # so each dword holds a byte-sum.
	vpaddd	ymm6, ymm5, ymm6
	vpmaddubsw ymm2, ymm7, ymm2     # s2 partial sums: weights 64..33
	vpmaddubsw ymm3, ymm12, ymm3    # and 32..1 from .mul_T2.
	prefetcht0 [rdi+384]            # prefetch 6 cachelines ahead.
	vpaddw	ymm3, ymm2, ymm3
	vpsrldq	ymm2, ymm3, 2           # fold adjacent words before widening
	vpaddd	ymm3, ymm2, ymm3        # to dword accumulation.
	vpaddd	ymm1, ymm1, ymm3

#if CHAR_OFFSET != 0
	vpaddd	ymm6, ymm10, ymm6       # 32*CHAR_OFFSET
	vpaddd	ymm1, ymm13, ymm1       # 528*CHAR_OFFSET
#endif
	vmovdqa	ymm2, ymm8              # move the next 64 bytes
	vmovdqa	ymm3, ymm9              # into the right registers
	sub	esi, 1
	jnz	.loop

	# Now we reduce the partial sums to scalars.
	vpslld	ymm3, ymm4, 6           # prefix-s1 sums * 64 complete the s2 weights.
	vpsrldq	ymm2, ymm6, 4           # horizontal reduction of s1: fold dwords,

	vpaddd	ymm0, ymm3, ymm1        # combined s2 partial sums,
	vpaddd	ymm6, ymm2, ymm6
	vpsrlq	ymm3, ymm0, 32          # folded the same way,

	vpsrldq	ymm2, ymm6, 8           # then qwords,
	vpaddd	ymm0, ymm3, ymm0
	vpsrldq	ymm3, ymm0, 8
	vpaddd	ymm6, ymm2, ymm6
	vpaddd	ymm0, ymm3, ymm0
	vextracti128 xmm2, ymm6, 0x1    # then the upper 128-bit lane.
	vextracti128 xmm1, ymm0, 0x1
	vpaddd	xmm6, xmm2, xmm6
	vmovd	[rcx], xmm6             # *ps1 = reduced s1.
	vpaddd	xmm1, xmm1, xmm0
	vmovd	ecx, xmm1
	add	eax, ecx                # *ps2 += reduced s2 delta.
	mov	[r8], eax
.exit:
	vzeroupper                      # avoid AVX->SSE transition penalties in caller.
	mov	eax, edx                # return the updated index.
	ret
103 | ||
# 64-entry multiplier table for the s2 partial sums: descending weights
# 64..1, one per byte lane of the two 32-byte vmovntdqa loads above.
# Must be 64-byte aligned — vmovntdqa faults on unaligned operands.
#ifdef __APPLE__
.data                                   # Apple assembler: no .section .rodata here;
.align 6                                # .align is power-of-two, 2^6 = 64 bytes.
#else
.section .rodata
.p2align 6                              # 64-byte alignment.
#endif
.mul_T2:
	.byte 64
	.byte 63
	.byte 62
	.byte 61
	.byte 60
	.byte 59
	.byte 58
	.byte 57
	.byte 56
	.byte 55
	.byte 54
	.byte 53
	.byte 52
	.byte 51
	.byte 50
	.byte 49
	.byte 48
	.byte 47
	.byte 46
	.byte 45
	.byte 44
	.byte 43
	.byte 42
	.byte 41
	.byte 40
	.byte 39
	.byte 38
	.byte 37
	.byte 36
	.byte 35
	.byte 34
	.byte 33
	.byte 32
	.byte 31
	.byte 30
	.byte 29
	.byte 28
	.byte 27
	.byte 26
	.byte 25
	.byte 24
	.byte 23
	.byte 22
	.byte 21
	.byte 20
	.byte 19
	.byte 18
	.byte 17
	.byte 16
	.byte 15
	.byte 14
	.byte 13
	.byte 12
	.byte 11
	.byte 10
	.byte 9
	.byte 8
	.byte 7
	.byte 6
	.byte 5
	.byte 4
	.byte 3
	.byte 2
	.byte 1

#endif /* } USE_ROLL_ASM */