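# simd-checksum-avx2.S: AVX2 inner loop for rsync's get_checksum1()
# rolling checksum.  s1 is the running sum of the input bytes and s2 the
# running sum of the s1 values; both are processed here in 64-byte blocks.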
#ifdef __APPLE__
#define get_checksum1_avx2 _get_checksum1_avx2
#endif

.intel_syntax noprefix
.text

        .p2align 5
        .globl get_checksum1_avx2

# rdi=*buf, esi=len, edx=i, rcx=*ps1, r8=*ps2
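# When at least 128 bytes remain past index i, whole 64-byte blocks are
# consumed; the updated sums are stored back through *ps1 and *ps2 and
# the index of the first unprocessed byte is returned in eax.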
get_checksum1_avx2:
        vmovd   xmm6, [rcx]                  # load *ps1
        lea     eax, [rsi-128]               # at least 128 bytes to process?
        cmp     edx, eax
        jg      .exit
        lea     rax, .mul_T2[rip]
        vmovntdqa ymm7, [rax]                # load T2 multiplication constants
        vmovntdqa ymm12, [rax+32]            # from memory.
        vpcmpeqd ymm15, ymm15, ymm15         # set all elements to -1.

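# For builds with a nonzero CHAR_OFFSET, broadcast the bias constants
# that get added to the s1 and s2 accumulators once per 64-byte block.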
#if CHAR_OFFSET != 0
        mov     eax, 32*CHAR_OFFSET
        vmovd   xmm10, eax
        vpbroadcastd ymm10, xmm10
        mov     eax, 528*CHAR_OFFSET
        vmovd   xmm13, eax
        vpbroadcastd ymm13, xmm13
#endif
        vpabsb  ymm15, ymm15                 # set all byte-size elements to 1.
        add     rdi, rdx
        vmovdqu ymm2, [rdi]                  # preload the first 64 bytes.
        vmovdqu ymm3, [rdi+32]
        and     esi, ~63                     # only needed during the final reduction,
                                             # done here to avoid a longer nop for
                                             # alignment below.
        add     edx, esi
        shr     rsi, 6                       # longer opcode for alignment
        add     rdi, 64
        vpxor   xmm1, xmm1, xmm1             # reset both partial-sum accumulators.
        vpxor   xmm4, xmm4, xmm4
        mov     eax, [r8]                    # load *ps2
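# Register roles in the loop below:
#   ymm2/ymm3  current 64-byte block, ymm8/ymm9 preload of the next one.
#   ymm6       running s1 sum.
#   ymm1       within-block weighted (s2) partial sums.
#   ymm4       sum of the pre-block s1 values; multiplied by 64 during
#              the final reduction to form the cross-block s2 term.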
        .p2align 4                           # should fit into the LSD allocation queue.
.loop:
        vpmaddubsw ymm0, ymm15, ymm2         # s1 partial sums
        vpmaddubsw ymm5, ymm15, ymm3
        vmovdqu ymm8, [rdi]                  # preload the next
        vmovdqu ymm9, [rdi+32]               # 64 bytes.
        add     rdi, 64
        vpaddd  ymm4, ymm4, ymm6
        vpaddw  ymm5, ymm5, ymm0
        vpsrld  ymm0, ymm5, 16
        vpaddw  ymm5, ymm0, ymm5
        vpaddd  ymm6, ymm5, ymm6
        vpmaddubsw ymm2, ymm7, ymm2          # s2 partial sums
        vpmaddubsw ymm3, ymm12, ymm3
        prefetcht0 [rdi+384]                 # prefetch 6 cachelines ahead.
        vpaddw  ymm3, ymm2, ymm3
        vpsrldq ymm2, ymm3, 2
        vpaddd  ymm3, ymm2, ymm3
        vpaddd  ymm1, ymm1, ymm3

#if CHAR_OFFSET != 0
        vpaddd  ymm6, ymm10, ymm6            # 32*CHAR_OFFSET
        vpaddd  ymm1, ymm13, ymm1            # 528*CHAR_OFFSET
#endif
        vmovdqa ymm2, ymm8                   # move the next 64 bytes
        vmovdqa ymm3, ymm9                   # into the right registers
        sub     esi, 1
        jnz     .loop

        # now we reduce the partial sums.
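        # ymm4 is scaled by 64 (the block length) to give the cross-block
        # s2 contribution, then each accumulator is folded horizontally
        # across its lanes before being stored.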
        vpslld  ymm3, ymm4, 6
        vpsrldq ymm2, ymm6, 4

        vpaddd  ymm0, ymm3, ymm1
        vpaddd  ymm6, ymm2, ymm6
        vpsrlq  ymm3, ymm0, 32

        vpsrldq ymm2, ymm6, 8
        vpaddd  ymm0, ymm3, ymm0
        vpsrldq ymm3, ymm0, 8
        vpaddd  ymm6, ymm2, ymm6
        vpaddd  ymm0, ymm3, ymm0
        vextracti128 xmm2, ymm6, 0x1
        vextracti128 xmm1, ymm0, 0x1
        vpaddd  xmm6, xmm2, xmm6
        vmovd   [rcx], xmm6                  # store the updated s1 back to *ps1.
        vpaddd  xmm1, xmm1, xmm0
        vmovd   ecx, xmm1
        add     eax, ecx
        mov     [r8], eax                    # store the updated s2 back to *ps2.
.exit:
        vzeroupper
        mov     eax, edx                     # return the updated index.
        ret

#ifdef __APPLE__
.data
        .align 6
#else
.section .rodata
        .p2align 6
#endif
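# 64 descending byte weights (64..1): byte k of a block is multiplied
# by (64 - k) when forming the within-block portion of s2.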
.mul_T2:
        .byte 64, 63, 62, 61, 60, 59, 58, 57
        .byte 56, 55, 54, 53, 52, 51, 50, 49
        .byte 48, 47, 46, 45, 44, 43, 42, 41
        .byte 40, 39, 38, 37, 36, 35, 34, 33
        .byte 32, 31, 30, 29, 28, 27, 26, 25
        .byte 24, 23, 22, 21, 20, 19, 18, 17
        .byte 16, 15, 14, 13, 12, 11, 10,  9
        .byte  8,  7,  6,  5,  4,  3,  2,  1