#include "config.h"

#ifdef USE_ROLL_ASM /* { */

#define CHAR_OFFSET 0 /* Keep this the same as rsync.h, which isn't likely to change. */

#ifdef __APPLE__
#define get_checksum1_avx2_asm _get_checksum1_avx2_asm
#endif
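# Mach-O prefixes C symbol names with an underscore, so the alias above keeps
# the exported name in line with what the C caller expects.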

.intel_syntax noprefix
.text

        .p2align 5
        .globl get_checksum1_avx2_asm

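# Overview (a rough scalar sketch; see get_checksum1() in rsync's C code for
# the reference): starting at buf + i the routine consumes the buffer in
# 64-byte blocks, maintaining the rolling-checksum sums so that for each
# byte b:
#
#     s1 += b + CHAR_OFFSET;
#     s2 += s1;
#
# *ps1 and *ps2 are read on entry and written back on return; eax returns
# the updated index.  Any tail shorter than a full block is left to the
# C caller.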
# rdi = *buf, esi = len, edx = i, rcx = *ps1, r8 = *ps2
get_checksum1_avx2_asm:
        vmovd   xmm6, [rcx]                  # load *ps1
        lea     eax, [rsi-128]               # at least 128 bytes to process?
        cmp     edx, eax
        jg      .exit
        lea     rax, .mul_T2[rip]
        vmovntdqa ymm7, [rax]                # load T2 multiplication constants
        vmovntdqa ymm12, [rax+32]            # from memory.
        vpcmpeqd ymm15, ymm15, ymm15         # set all elements to -1.

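# When CHAR_OFFSET is non-zero, ymm10 and ymm13 are loaded with broadcast
# bias constants (32*CHAR_OFFSET and 528*CHAR_OFFSET) that the main loop
# adds once per 64-byte block to fold the offset into the s1 and s2
# accumulators.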
#if CHAR_OFFSET != 0
        mov     eax, 32*CHAR_OFFSET
        vmovd   xmm10, eax
        vpbroadcastd ymm10, xmm10
        mov     eax, 528*CHAR_OFFSET
        vmovd   xmm13, eax
        vpbroadcastd ymm13, xmm13
#endif
        vpabsb  ymm15, ymm15                 # set all byte-size elements to 1.
        add     rdi, rdx
        vmovdqu ymm2, [rdi]                  # preload the first 64 bytes.
        vmovdqu ymm3, [rdi+32]
        and     esi, ~63                     # only needed during final reduction,
                                             # done here to avoid a longer nop for
                                             # alignment below.
        add     edx, esi
        shr     rsi, 6                       # longer opcode for alignment
        add     rdi, 64
        vpxor   xmm1, xmm1, xmm1             # reset both partial-sum accumulators.
        vpxor   xmm4, xmm4, xmm4
        mov     eax, [r8]                    # load *ps2
        .p2align 4                           # should fit into the LSD allocation queue.
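# Main loop: each iteration consumes 64 bytes.  vpmaddubsw against the
# all-ones bytes in ymm15 yields the plain byte sums feeding s1, while
# vpmaddubsw against the descending weights in ymm7/ymm12 yields the
# position-weighted sums feeding s2; the next 64 bytes are preloaded and
# prefetched while the current block is summed.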
.loop:
        vpmaddubsw ymm0, ymm15, ymm2         # s1 partial sums
        vpmaddubsw ymm5, ymm15, ymm3
        vmovdqu ymm8, [rdi]                  # preload the next
        vmovdqu ymm9, [rdi+32]               # 64 bytes.
        add     rdi, 64
        vpaddd  ymm4, ymm4, ymm6
        vpaddw  ymm5, ymm5, ymm0
        vpsrld  ymm0, ymm5, 16
        vpaddw  ymm5, ymm0, ymm5
        vpaddd  ymm6, ymm5, ymm6
        vpmaddubsw ymm2, ymm7, ymm2          # s2 partial sums
        vpmaddubsw ymm3, ymm12, ymm3
        prefetcht0 [rdi+384]                 # prefetch 6 cachelines ahead.
        vpaddw  ymm3, ymm2, ymm3
        vpsrldq ymm2, ymm3, 2
        vpaddd  ymm3, ymm2, ymm3
        vpaddd  ymm1, ymm1, ymm3

#if CHAR_OFFSET != 0
        vpaddd  ymm6, ymm10, ymm6            # 32*CHAR_OFFSET
        vpaddd  ymm1, ymm13, ymm1            # 528*CHAR_OFFSET
#endif
        vmovdqa ymm2, ymm8                   # move the next 64 bytes
        vmovdqa ymm3, ymm9                   # into the right registers
        sub     esi, 1
        jnz     .loop

        # now we reduce the partial sums.
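        # Roughly: ymm6 carries the per-lane s1 byte sums, ymm4 the per-block
        # s1 snapshots (scaled by 64 below, as every byte of a block adds the
        # prior s1 into s2), and ymm1 the weighted sums.  The shifts and adds
        # that follow fold each accumulator's lanes into a single dword, which
        # is then stored to *ps1 and added into *ps2.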
        vpslld  ymm3, ymm4, 6
        vpsrldq ymm2, ymm6, 4

        vpaddd  ymm0, ymm3, ymm1
        vpaddd  ymm6, ymm2, ymm6
        vpsrlq  ymm3, ymm0, 32

        vpsrldq ymm2, ymm6, 8
        vpaddd  ymm0, ymm3, ymm0
        vpsrldq ymm3, ymm0, 8
        vpaddd  ymm6, ymm2, ymm6
        vpaddd  ymm0, ymm3, ymm0
        vextracti128 xmm2, ymm6, 0x1
        vextracti128 xmm1, ymm0, 0x1
        vpaddd  xmm6, xmm2, xmm6
        vmovd   [rcx], xmm6                  # store the reduced s1 to *ps1.
        vpaddd  xmm1, xmm1, xmm0
        vmovd   ecx, xmm1
        add     eax, ecx
        mov     [r8], eax                    # store the updated s2 to *ps2.
.exit:
        vzeroupper
        mov     eax, edx                     # return the updated index.
        ret

#ifdef __APPLE__
.data
        .align 6
#else
.section .rodata
        .p2align 6
#endif
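# Descending multipliers 64..1 for vpmaddubsw: within a 64-byte block the
# byte at offset j contributes (64 - j) times to s2, so the weighted sums
# can be taken directly from the raw bytes.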
.mul_T2:
        .byte 64, 63, 62, 61, 60, 59, 58, 57
        .byte 56, 55, 54, 53, 52, 51, 50, 49
        .byte 48, 47, 46, 45, 44, 43, 42, 41
        .byte 40, 39, 38, 37, 36, 35, 34, 33
        .byte 32, 31, 30, 29, 28, 27, 26, 25
        .byte 24, 23, 22, 21, 20, 19, 18, 17
        .byte 16, 15, 14, 13, 12, 11, 10,  9
        .byte  8,  7,  6,  5,  4,  3,  2,  1

#endif /* } USE_ROLL_ASM */