]>
Commit | Line | Data |
---|---|---|
6aa36e8e RS |
1 | #! /usr/bin/env perl |
2 | # Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. | |
3 | # | |
4 | # Licensed under the OpenSSL license (the "License"). You may not use | |
5 | # this file except in compliance with the License. You can obtain a copy | |
6 | # in the file LICENSE in the source distribution or at | |
7 | # https://www.openssl.org/source/license.html | |
8 | ||
a98c648e AP |
9 | # |
10 | # ==================================================================== | |
11 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | |
12 | # project. The module is, however, dual licensed under OpenSSL and | |
13 | # CRYPTOGAMS licenses depending on where you obtain it. For further | |
14 | # details see http://www.openssl.org/~appro/cryptogams/. | |
15 | # ==================================================================== | |
16 | # | |
17 | # November 2014 | |
18 | # | |
19 | # ChaCha20 for x86_64. | |
20 | # | |
abb8c44f AP |
21 | # December 2016 |
22 | # | |
23 | # Add AVX512F code path. | |
24 | # | |
a98c648e AP |
25 | # Performance in cycles per byte out of large buffer. |
26 | # | |
27 | # IALU/gcc 4.8(i) 1xSSSE3/SSE2 4xSSSE3 8xAVX2 | |
28 | # | |
29 | # P4 9.48/+99% -/22.7(ii) - | |
30 | # Core2 7.83/+55% 7.90/8.08 4.35 | |
31 | # Westmere 7.19/+50% 5.60/6.70 3.00 | |
32 | # Sandy Bridge 8.31/+42% 5.45/6.76 2.72 | |
33 | # Ivy Bridge 6.71/+46% 5.40/6.49 2.41 | |
34 | # Haswell 5.92/+43% 5.20/6.45 2.42 1.23 | |
a30b0522 | 35 | # Skylake 5.87/+39% 4.70/- 2.31 1.19 |
a98c648e | 36 | # Silvermont 12.0/+33% 7.75/7.40 7.03(iii) |
ace05265 | 37 | # Goldmont 10.6/+17% 5.10/- 3.28 |
a98c648e AP |
38 | # Sledgehammer 7.28/+52% -/14.2(ii) - |
39 | # Bulldozer 9.66/+28% 9.85/11.1 3.06(iv) | |
54f8f9a1 | 40 | # Ryzen 5.96/+50% 5.19/- 2.40 2.09 |
a98c648e AP |
41 | # VIA Nano 10.5/+46% 6.72/8.60 6.05 |
42 | # | |
43 | # (i) compared to older gcc 3.x one can observe >2x improvement on | |
44 | # most platforms; | |
45 | # (ii) as it can be seen, SSE2 performance is too low on legacy | |
46 | # processors; NxSSE2 results are naturally better, but not | |
47 | # impressively better than IALU ones, which is why you won't | |
48 | # find SSE2 code below; | |
49 | # (iii) this is not optimal result for Atom because of MSROM | |
50 | # limitations, SSE2 can do better, but gain is considered too | |
51 | # low to justify the [maintenance] effort; | |
52 | # (iv) Bulldozer actually executes 4xXOP code path that delivers 2.20; | |
53 | ||
54 | $flavour = shift; | |
55 | $output = shift; | |
56 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } | |
57 | ||
58 | $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | |
59 | ||
60 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | |
61 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | |
62 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | |
63 | die "can't locate x86_64-xlate.pl"; | |
64 | ||
65 | if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` | |
66 | =~ /GNU assembler version ([2-9]\.[0-9]+)/) { | |
abb8c44f | 67 | $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25); |
a98c648e AP |
68 | } |
69 | ||
70 | if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && | |
1ea01427 | 71 | `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) { |
abb8c44f AP |
72 | $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12); |
73 | $avx += 1 if ($1==2.11 && $2>=8); | |
a98c648e AP |
74 | } |
75 | ||
76 | if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && | |
77 | `ml64 2>&1` =~ /Version ([0-9]+)\./) { | |
78 | $avx = ($1>=10) + ($1>=11); | |
79 | } | |
80 | ||
81 | if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) { | |
82 | $avx = ($2>=3.0) + ($2>3.0); | |
83 | } | |
84 | ||
cfe1d992 | 85 | open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; |
a98c648e AP |
86 | *STDOUT=*OUT; |
87 | ||
88 | # input parameter block | |
89 | ($out,$inp,$len,$key,$counter)=("%rdi","%rsi","%rdx","%rcx","%r8"); | |
90 | ||
91 | $code.=<<___; | |
92 | .text | |
93 | ||
94 | .extern OPENSSL_ia32cap_P | |
95 | ||
96 | .align 64 | |
97 | .Lzero: | |
98 | .long 0,0,0,0 | |
99 | .Lone: | |
100 | .long 1,0,0,0 | |
101 | .Linc: | |
102 | .long 0,1,2,3 | |
103 | .Lfour: | |
104 | .long 4,4,4,4 | |
105 | .Lincy: | |
106 | .long 0,2,4,6,1,3,5,7 | |
107 | .Leight: | |
108 | .long 8,8,8,8,8,8,8,8 | |
109 | .Lrot16: | |
110 | .byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd | |
111 | .Lrot24: | |
112 | .byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe | |
113 | .Lsigma: | |
114 | .asciz "expand 32-byte k" | |
abb8c44f | 115 | .align 64 |
3c274a6e AP |
116 | .Lzeroz: |
117 | .long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0 | |
118 | .Lfourz: | |
119 | .long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0 | |
abb8c44f AP |
120 | .Lincz: |
121 | .long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 | |
122 | .Lsixteen: | |
123 | .long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16 | |
a98c648e AP |
124 | .asciz "ChaCha20 for x86_64, CRYPTOGAMS by <appro\@openssl.org>" |
125 | ___ | |
126 | ||
127 | sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm | |
128 | { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; | |
129 | my $arg = pop; | |
130 | $arg = "\$$arg" if ($arg*1 eq $arg); | |
131 | $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; | |
132 | } | |
133 | ||
134 | @x=("%eax","%ebx","%ecx","%edx",map("%r${_}d",(8..11)), | |
135 | "%nox","%nox","%nox","%nox",map("%r${_}d",(12..15))); | |
136 | @t=("%esi","%edi"); | |
137 | ||
138 | sub ROUND { # critical path is 24 cycles per round | |
139 | my ($a0,$b0,$c0,$d0)=@_; | |
140 | my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); | |
141 | my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); | |
142 | my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); | |
143 | my ($xc,$xc_)=map("\"$_\"",@t); | |
144 | my @x=map("\"$_\"",@x); | |
145 | ||
146 | # Consider order in which variables are addressed by their | |
147 | # index: | |
148 | # | |
149 | # a b c d | |
150 | # | |
151 | # 0 4 8 12 < even round | |
152 | # 1 5 9 13 | |
153 | # 2 6 10 14 | |
154 | # 3 7 11 15 | |
155 | # 0 5 10 15 < odd round | |
156 | # 1 6 11 12 | |
157 | # 2 7 8 13 | |
158 | # 3 4 9 14 | |
159 | # | |
160 | # 'a', 'b' and 'd's are permanently allocated in registers, | |
161 | # @x[0..7,12..15], while 'c's are maintained in memory. If | |
162 | # you observe 'c' column, you'll notice that pair of 'c's is | |
163 | # invariant between rounds. This means that we have to reload | |
164 | # them once per round, in the middle. This is why you'll see | |
165 | # bunch of 'c' stores and loads in the middle, but none in | |
166 | # the beginning or end. | |
167 | ||
168 | # Normally instructions would be interleaved to favour in-order | |
169 | # execution. Generally out-of-order cores manage it gracefully, | |
170 | # but not this time for some reason. As in-order execution | |
171 | # cores are dying breed, old Atom is the only one around, | |
172 | # instructions are left uninterleaved. Besides, Atom is better | |
173 | # off executing 1xSSSE3 code anyway... | |
174 | ||
175 | ( | |
176 | "&add (@x[$a0],@x[$b0])", # Q1 | |
177 | "&xor (@x[$d0],@x[$a0])", | |
178 | "&rol (@x[$d0],16)", | |
179 | "&add (@x[$a1],@x[$b1])", # Q2 | |
180 | "&xor (@x[$d1],@x[$a1])", | |
181 | "&rol (@x[$d1],16)", | |
182 | ||
183 | "&add ($xc,@x[$d0])", | |
184 | "&xor (@x[$b0],$xc)", | |
185 | "&rol (@x[$b0],12)", | |
186 | "&add ($xc_,@x[$d1])", | |
187 | "&xor (@x[$b1],$xc_)", | |
188 | "&rol (@x[$b1],12)", | |
189 | ||
190 | "&add (@x[$a0],@x[$b0])", | |
191 | "&xor (@x[$d0],@x[$a0])", | |
192 | "&rol (@x[$d0],8)", | |
193 | "&add (@x[$a1],@x[$b1])", | |
194 | "&xor (@x[$d1],@x[$a1])", | |
195 | "&rol (@x[$d1],8)", | |
196 | ||
197 | "&add ($xc,@x[$d0])", | |
198 | "&xor (@x[$b0],$xc)", | |
199 | "&rol (@x[$b0],7)", | |
200 | "&add ($xc_,@x[$d1])", | |
201 | "&xor (@x[$b1],$xc_)", | |
202 | "&rol (@x[$b1],7)", | |
203 | ||
204 | "&mov (\"4*$c0(%rsp)\",$xc)", # reload pair of 'c's | |
205 | "&mov (\"4*$c1(%rsp)\",$xc_)", | |
206 | "&mov ($xc,\"4*$c2(%rsp)\")", | |
207 | "&mov ($xc_,\"4*$c3(%rsp)\")", | |
208 | ||
209 | "&add (@x[$a2],@x[$b2])", # Q3 | |
210 | "&xor (@x[$d2],@x[$a2])", | |
211 | "&rol (@x[$d2],16)", | |
212 | "&add (@x[$a3],@x[$b3])", # Q4 | |
213 | "&xor (@x[$d3],@x[$a3])", | |
214 | "&rol (@x[$d3],16)", | |
215 | ||
216 | "&add ($xc,@x[$d2])", | |
217 | "&xor (@x[$b2],$xc)", | |
218 | "&rol (@x[$b2],12)", | |
219 | "&add ($xc_,@x[$d3])", | |
220 | "&xor (@x[$b3],$xc_)", | |
221 | "&rol (@x[$b3],12)", | |
222 | ||
223 | "&add (@x[$a2],@x[$b2])", | |
224 | "&xor (@x[$d2],@x[$a2])", | |
225 | "&rol (@x[$d2],8)", | |
226 | "&add (@x[$a3],@x[$b3])", | |
227 | "&xor (@x[$d3],@x[$a3])", | |
228 | "&rol (@x[$d3],8)", | |
229 | ||
230 | "&add ($xc,@x[$d2])", | |
231 | "&xor (@x[$b2],$xc)", | |
232 | "&rol (@x[$b2],7)", | |
233 | "&add ($xc_,@x[$d3])", | |
234 | "&xor (@x[$b3],$xc_)", | |
235 | "&rol (@x[$b3],7)" | |
236 | ); | |
237 | } | |
238 | ||
239 | ######################################################################## | |
240 | # Generic code path that handles all lengths on pre-SSSE3 processors. | |
241 | $code.=<<___; | |
242 | .globl ChaCha20_ctr32 | |
243 | .type ChaCha20_ctr32,\@function,5 | |
244 | .align 64 | |
245 | ChaCha20_ctr32: | |
f17652e5 | 246 | .cfi_startproc |
622a531c AP |
247 | cmp \$0,$len |
248 | je .Lno_data | |
a98c648e | 249 | mov OPENSSL_ia32cap_P+4(%rip),%r10 |
3c274a6e AP |
250 | ___ |
251 | $code.=<<___ if ($avx>2); | |
252 | bt \$48,%r10 # check for AVX512F | |
253 | jc .LChaCha20_avx512 | |
254 | ___ | |
255 | $code.=<<___; | |
a98c648e AP |
256 | test \$`1<<(41-32)`,%r10d |
257 | jnz .LChaCha20_ssse3 | |
258 | ||
259 | push %rbx | |
f17652e5 | 260 | .cfi_push %rbx |
a98c648e | 261 | push %rbp |
f17652e5 | 262 | .cfi_push %rbp |
a98c648e | 263 | push %r12 |
f17652e5 | 264 | .cfi_push %r12 |
a98c648e | 265 | push %r13 |
f17652e5 | 266 | .cfi_push %r13 |
a98c648e | 267 | push %r14 |
f17652e5 | 268 | .cfi_push %r14 |
a98c648e | 269 | push %r15 |
f17652e5 | 270 | .cfi_push %r15 |
a98c648e | 271 | sub \$64+24,%rsp |
f17652e5 | 272 | .cfi_adjust_cfa_offset 64+24 |
384e6de4 | 273 | .Lctr32_body: |
a98c648e AP |
274 | |
275 | #movdqa .Lsigma(%rip),%xmm0 | |
276 | movdqu ($key),%xmm1 | |
277 | movdqu 16($key),%xmm2 | |
278 | movdqu ($counter),%xmm3 | |
279 | movdqa .Lone(%rip),%xmm4 | |
280 | ||
281 | #movdqa %xmm0,4*0(%rsp) # key[0] | |
282 | movdqa %xmm1,4*4(%rsp) # key[1] | |
283 | movdqa %xmm2,4*8(%rsp) # key[2] | |
284 | movdqa %xmm3,4*12(%rsp) # key[3] | |
285 | mov $len,%rbp # reassign $len | |
286 | jmp .Loop_outer | |
287 | ||
288 | .align 32 | |
289 | .Loop_outer: | |
290 | mov \$0x61707865,@x[0] # 'expa' | |
291 | mov \$0x3320646e,@x[1] # 'nd 3' | |
292 | mov \$0x79622d32,@x[2] # '2-by' | |
293 | mov \$0x6b206574,@x[3] # 'te k' | |
294 | mov 4*4(%rsp),@x[4] | |
295 | mov 4*5(%rsp),@x[5] | |
296 | mov 4*6(%rsp),@x[6] | |
297 | mov 4*7(%rsp),@x[7] | |
298 | movd %xmm3,@x[12] | |
299 | mov 4*13(%rsp),@x[13] | |
300 | mov 4*14(%rsp),@x[14] | |
301 | mov 4*15(%rsp),@x[15] | |
302 | ||
303 | mov %rbp,64+0(%rsp) # save len | |
304 | mov \$10,%ebp | |
305 | mov $inp,64+8(%rsp) # save inp | |
306 | movq %xmm2,%rsi # "@x[8]" | |
307 | mov $out,64+16(%rsp) # save out | |
308 | mov %rsi,%rdi | |
309 | shr \$32,%rdi # "@x[9]" | |
310 | jmp .Loop | |
311 | ||
312 | .align 32 | |
313 | .Loop: | |
314 | ___ | |
315 | foreach (&ROUND (0, 4, 8,12)) { eval; } | |
316 | foreach (&ROUND (0, 5,10,15)) { eval; } | |
317 | &dec ("%ebp"); | |
318 | &jnz (".Loop"); | |
319 | ||
320 | $code.=<<___; | |
321 | mov @t[1],4*9(%rsp) # modulo-scheduled | |
322 | mov @t[0],4*8(%rsp) | |
323 | mov 64(%rsp),%rbp # load len | |
324 | movdqa %xmm2,%xmm1 | |
325 | mov 64+8(%rsp),$inp # load inp | |
326 | paddd %xmm4,%xmm3 # increment counter | |
327 | mov 64+16(%rsp),$out # load out | |
328 | ||
329 | add \$0x61707865,@x[0] # 'expa' | |
330 | add \$0x3320646e,@x[1] # 'nd 3' | |
331 | add \$0x79622d32,@x[2] # '2-by' | |
332 | add \$0x6b206574,@x[3] # 'te k' | |
333 | add 4*4(%rsp),@x[4] | |
334 | add 4*5(%rsp),@x[5] | |
335 | add 4*6(%rsp),@x[6] | |
336 | add 4*7(%rsp),@x[7] | |
337 | add 4*12(%rsp),@x[12] | |
338 | add 4*13(%rsp),@x[13] | |
339 | add 4*14(%rsp),@x[14] | |
340 | add 4*15(%rsp),@x[15] | |
341 | paddd 4*8(%rsp),%xmm1 | |
342 | ||
343 | cmp \$64,%rbp | |
344 | jb .Ltail | |
345 | ||
346 | xor 4*0($inp),@x[0] # xor with input | |
347 | xor 4*1($inp),@x[1] | |
348 | xor 4*2($inp),@x[2] | |
349 | xor 4*3($inp),@x[3] | |
350 | xor 4*4($inp),@x[4] | |
351 | xor 4*5($inp),@x[5] | |
352 | xor 4*6($inp),@x[6] | |
353 | xor 4*7($inp),@x[7] | |
354 | movdqu 4*8($inp),%xmm0 | |
355 | xor 4*12($inp),@x[12] | |
356 | xor 4*13($inp),@x[13] | |
357 | xor 4*14($inp),@x[14] | |
358 | xor 4*15($inp),@x[15] | |
359 | lea 4*16($inp),$inp # inp+=64 | |
360 | pxor %xmm1,%xmm0 | |
361 | ||
362 | movdqa %xmm2,4*8(%rsp) | |
363 | movd %xmm3,4*12(%rsp) | |
364 | ||
365 | mov @x[0],4*0($out) # write output | |
366 | mov @x[1],4*1($out) | |
367 | mov @x[2],4*2($out) | |
368 | mov @x[3],4*3($out) | |
369 | mov @x[4],4*4($out) | |
370 | mov @x[5],4*5($out) | |
371 | mov @x[6],4*6($out) | |
372 | mov @x[7],4*7($out) | |
373 | movdqu %xmm0,4*8($out) | |
374 | mov @x[12],4*12($out) | |
375 | mov @x[13],4*13($out) | |
376 | mov @x[14],4*14($out) | |
377 | mov @x[15],4*15($out) | |
378 | lea 4*16($out),$out # out+=64 | |
379 | ||
380 | sub \$64,%rbp | |
381 | jnz .Loop_outer | |
382 | ||
383 | jmp .Ldone | |
384 | ||
385 | .align 16 | |
386 | .Ltail: | |
387 | mov @x[0],4*0(%rsp) | |
a98c648e | 388 | mov @x[1],4*1(%rsp) |
29880e97 | 389 | xor %rbx,%rbx |
a98c648e AP |
390 | mov @x[2],4*2(%rsp) |
391 | mov @x[3],4*3(%rsp) | |
392 | mov @x[4],4*4(%rsp) | |
393 | mov @x[5],4*5(%rsp) | |
394 | mov @x[6],4*6(%rsp) | |
395 | mov @x[7],4*7(%rsp) | |
396 | movdqa %xmm1,4*8(%rsp) | |
397 | mov @x[12],4*12(%rsp) | |
398 | mov @x[13],4*13(%rsp) | |
399 | mov @x[14],4*14(%rsp) | |
400 | mov @x[15],4*15(%rsp) | |
401 | ||
402 | .Loop_tail: | |
403 | movzb ($inp,%rbx),%eax | |
404 | movzb (%rsp,%rbx),%edx | |
405 | lea 1(%rbx),%rbx | |
406 | xor %edx,%eax | |
407 | mov %al,-1($out,%rbx) | |
408 | dec %rbp | |
409 | jnz .Loop_tail | |
410 | ||
411 | .Ldone: | |
384e6de4 | 412 | lea 64+24+48(%rsp),%rsi |
f17652e5 | 413 | .cfi_def_cfa %rsi,8 |
384e6de4 | 414 | mov -48(%rsi),%r15 |
f17652e5 | 415 | .cfi_restore %r15 |
384e6de4 | 416 | mov -40(%rsi),%r14 |
f17652e5 | 417 | .cfi_restore %r14 |
384e6de4 | 418 | mov -32(%rsi),%r13 |
f17652e5 | 419 | .cfi_restore %r13 |
384e6de4 | 420 | mov -24(%rsi),%r12 |
f17652e5 | 421 | .cfi_restore %r12 |
384e6de4 | 422 | mov -16(%rsi),%rbp |
f17652e5 | 423 | .cfi_restore %rbp |
384e6de4 | 424 | mov -8(%rsi),%rbx |
f17652e5 | 425 | .cfi_restore %rbx |
384e6de4 | 426 | lea (%rsi),%rsp |
f17652e5 | 427 | .cfi_def_cfa_register %rsp |
622a531c | 428 | .Lno_data: |
a98c648e | 429 | ret |
f17652e5 | 430 | .cfi_endproc |
a98c648e AP |
431 | .size ChaCha20_ctr32,.-ChaCha20_ctr32 |
432 | ___ | |
433 | ||
434 | ######################################################################## | |
435 | # SSSE3 code path that handles shorter lengths | |
436 | { | |
437 | my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(0..7)); | |
438 | ||
439 | sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round | |
440 | &paddd ($a,$b); | |
441 | &pxor ($d,$a); | |
442 | &pshufb ($d,$rot16); | |
443 | ||
444 | &paddd ($c,$d); | |
445 | &pxor ($b,$c); | |
446 | &movdqa ($t,$b); | |
447 | &psrld ($b,20); | |
448 | &pslld ($t,12); | |
449 | &por ($b,$t); | |
450 | ||
451 | &paddd ($a,$b); | |
452 | &pxor ($d,$a); | |
453 | &pshufb ($d,$rot24); | |
454 | ||
455 | &paddd ($c,$d); | |
456 | &pxor ($b,$c); | |
457 | &movdqa ($t,$b); | |
458 | &psrld ($b,25); | |
459 | &pslld ($t,7); | |
460 | &por ($b,$t); | |
461 | } | |
462 | ||
384e6de4 | 463 | my $xframe = $win64 ? 32+8 : 8; |
a98c648e AP |
464 | |
465 | $code.=<<___; | |
466 | .type ChaCha20_ssse3,\@function,5 | |
467 | .align 32 | |
468 | ChaCha20_ssse3: | |
f17652e5 | 469 | .cfi_startproc |
a98c648e | 470 | .LChaCha20_ssse3: |
384e6de4 | 471 | mov %rsp,%r9 # frame pointer |
f17652e5 | 472 | .cfi_def_cfa_register %r9 |
a98c648e AP |
473 | ___ |
474 | $code.=<<___ if ($avx); | |
475 | test \$`1<<(43-32)`,%r10d | |
476 | jnz .LChaCha20_4xop # XOP is fastest even if we use 1/4 | |
477 | ___ | |
478 | $code.=<<___; | |
479 | cmp \$128,$len # we might throw away some data, | |
480 | ja .LChaCha20_4x # but overall it won't be slower | |
481 | ||
482 | .Ldo_sse3_after_all: | |
a98c648e AP |
483 | sub \$64+$xframe,%rsp |
484 | ___ | |
485 | $code.=<<___ if ($win64); | |
384e6de4 AP |
486 | movaps %xmm6,-0x28(%r9) |
487 | movaps %xmm7,-0x18(%r9) | |
488 | .Lssse3_body: | |
a98c648e AP |
489 | ___ |
490 | $code.=<<___; | |
491 | movdqa .Lsigma(%rip),$a | |
492 | movdqu ($key),$b | |
493 | movdqu 16($key),$c | |
494 | movdqu ($counter),$d | |
495 | movdqa .Lrot16(%rip),$rot16 | |
496 | movdqa .Lrot24(%rip),$rot24 | |
497 | ||
498 | movdqa $a,0x00(%rsp) | |
499 | movdqa $b,0x10(%rsp) | |
500 | movdqa $c,0x20(%rsp) | |
501 | movdqa $d,0x30(%rsp) | |
3c274a6e | 502 | mov \$10,$counter # reuse $counter |
a98c648e AP |
503 | jmp .Loop_ssse3 |
504 | ||
505 | .align 32 | |
506 | .Loop_outer_ssse3: | |
507 | movdqa .Lone(%rip),$d | |
508 | movdqa 0x00(%rsp),$a | |
509 | movdqa 0x10(%rsp),$b | |
510 | movdqa 0x20(%rsp),$c | |
511 | paddd 0x30(%rsp),$d | |
3c274a6e | 512 | mov \$10,$counter |
a98c648e AP |
513 | movdqa $d,0x30(%rsp) |
514 | jmp .Loop_ssse3 | |
515 | ||
516 | .align 32 | |
517 | .Loop_ssse3: | |
518 | ___ | |
519 | &SSSE3ROUND(); | |
520 | &pshufd ($c,$c,0b01001110); | |
521 | &pshufd ($b,$b,0b00111001); | |
522 | &pshufd ($d,$d,0b10010011); | |
523 | &nop (); | |
524 | ||
525 | &SSSE3ROUND(); | |
526 | &pshufd ($c,$c,0b01001110); | |
527 | &pshufd ($b,$b,0b10010011); | |
528 | &pshufd ($d,$d,0b00111001); | |
529 | ||
3c274a6e | 530 | &dec ($counter); |
a98c648e AP |
531 | &jnz (".Loop_ssse3"); |
532 | ||
533 | $code.=<<___; | |
534 | paddd 0x00(%rsp),$a | |
535 | paddd 0x10(%rsp),$b | |
536 | paddd 0x20(%rsp),$c | |
537 | paddd 0x30(%rsp),$d | |
538 | ||
539 | cmp \$64,$len | |
540 | jb .Ltail_ssse3 | |
541 | ||
542 | movdqu 0x00($inp),$t | |
543 | movdqu 0x10($inp),$t1 | |
544 | pxor $t,$a # xor with input | |
545 | movdqu 0x20($inp),$t | |
546 | pxor $t1,$b | |
547 | movdqu 0x30($inp),$t1 | |
548 | lea 0x40($inp),$inp # inp+=64 | |
549 | pxor $t,$c | |
550 | pxor $t1,$d | |
551 | ||
552 | movdqu $a,0x00($out) # write output | |
553 | movdqu $b,0x10($out) | |
554 | movdqu $c,0x20($out) | |
555 | movdqu $d,0x30($out) | |
556 | lea 0x40($out),$out # out+=64 | |
557 | ||
558 | sub \$64,$len | |
559 | jnz .Loop_outer_ssse3 | |
560 | ||
561 | jmp .Ldone_ssse3 | |
562 | ||
563 | .align 16 | |
564 | .Ltail_ssse3: | |
565 | movdqa $a,0x00(%rsp) | |
566 | movdqa $b,0x10(%rsp) | |
567 | movdqa $c,0x20(%rsp) | |
568 | movdqa $d,0x30(%rsp) | |
3c274a6e | 569 | xor $counter,$counter |
a98c648e AP |
570 | |
571 | .Loop_tail_ssse3: | |
3c274a6e AP |
572 | movzb ($inp,$counter),%eax |
573 | movzb (%rsp,$counter),%ecx | |
574 | lea 1($counter),$counter | |
29880e97 | 575 | xor %ecx,%eax |
3c274a6e | 576 | mov %al,-1($out,$counter) |
29880e97 | 577 | dec $len |
a98c648e AP |
578 | jnz .Loop_tail_ssse3 |
579 | ||
580 | .Ldone_ssse3: | |
581 | ___ | |
582 | $code.=<<___ if ($win64); | |
384e6de4 AP |
583 | movaps -0x28(%r9),%xmm6 |
584 | movaps -0x18(%r9),%xmm7 | |
a98c648e AP |
585 | ___ |
586 | $code.=<<___; | |
384e6de4 | 587 | lea (%r9),%rsp |
f17652e5 | 588 | .cfi_def_cfa_register %rsp |
384e6de4 | 589 | .Lssse3_epilogue: |
a98c648e | 590 | ret |
f17652e5 | 591 | .cfi_endproc |
a98c648e AP |
592 | .size ChaCha20_ssse3,.-ChaCha20_ssse3 |
593 | ___ | |
594 | } | |
595 | ||
596 | ######################################################################## | |
597 | # SSSE3 code path that handles longer messages. | |
598 | { | |
599 | # assign variables to favor Atom front-end | |
600 | my ($xd0,$xd1,$xd2,$xd3, $xt0,$xt1,$xt2,$xt3, | |
601 | $xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3)=map("%xmm$_",(0..15)); | |
602 | my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, | |
603 | "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3); | |
604 | ||
605 | sub SSSE3_lane_ROUND { | |
606 | my ($a0,$b0,$c0,$d0)=@_; | |
607 | my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); | |
608 | my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); | |
609 | my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); | |
610 | my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3); | |
611 | my @x=map("\"$_\"",@xx); | |
612 | ||
613 | # Consider order in which variables are addressed by their | |
614 | # index: | |
615 | # | |
616 | # a b c d | |
617 | # | |
618 | # 0 4 8 12 < even round | |
619 | # 1 5 9 13 | |
620 | # 2 6 10 14 | |
621 | # 3 7 11 15 | |
622 | # 0 5 10 15 < odd round | |
623 | # 1 6 11 12 | |
624 | # 2 7 8 13 | |
625 | # 3 4 9 14 | |
626 | # | |
627 | # 'a', 'b' and 'd's are permanently allocated in registers, | |
628 | # @x[0..7,12..15], while 'c's are maintained in memory. If | |
629 | # you observe 'c' column, you'll notice that pair of 'c's is | |
630 | # invariant between rounds. This means that we have to reload | |
631 | # them once per round, in the middle. This is why you'll see | |
632 | # bunch of 'c' stores and loads in the middle, but none in | |
633 | # the beginning or end. | |
634 | ||
635 | ( | |
636 | "&paddd (@x[$a0],@x[$b0])", # Q1 | |
637 | "&paddd (@x[$a1],@x[$b1])", # Q2 | |
638 | "&pxor (@x[$d0],@x[$a0])", | |
639 | "&pxor (@x[$d1],@x[$a1])", | |
640 | "&pshufb (@x[$d0],$t1)", | |
641 | "&pshufb (@x[$d1],$t1)", | |
642 | ||
643 | "&paddd ($xc,@x[$d0])", | |
644 | "&paddd ($xc_,@x[$d1])", | |
645 | "&pxor (@x[$b0],$xc)", | |
646 | "&pxor (@x[$b1],$xc_)", | |
647 | "&movdqa ($t0,@x[$b0])", | |
648 | "&pslld (@x[$b0],12)", | |
649 | "&psrld ($t0,20)", | |
650 | "&movdqa ($t1,@x[$b1])", | |
651 | "&pslld (@x[$b1],12)", | |
652 | "&por (@x[$b0],$t0)", | |
653 | "&psrld ($t1,20)", | |
654 | "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip) | |
655 | "&por (@x[$b1],$t1)", | |
656 | ||
657 | "&paddd (@x[$a0],@x[$b0])", | |
658 | "&paddd (@x[$a1],@x[$b1])", | |
659 | "&pxor (@x[$d0],@x[$a0])", | |
660 | "&pxor (@x[$d1],@x[$a1])", | |
661 | "&pshufb (@x[$d0],$t0)", | |
662 | "&pshufb (@x[$d1],$t0)", | |
663 | ||
664 | "&paddd ($xc,@x[$d0])", | |
665 | "&paddd ($xc_,@x[$d1])", | |
666 | "&pxor (@x[$b0],$xc)", | |
667 | "&pxor (@x[$b1],$xc_)", | |
668 | "&movdqa ($t1,@x[$b0])", | |
669 | "&pslld (@x[$b0],7)", | |
670 | "&psrld ($t1,25)", | |
671 | "&movdqa ($t0,@x[$b1])", | |
672 | "&pslld (@x[$b1],7)", | |
673 | "&por (@x[$b0],$t1)", | |
674 | "&psrld ($t0,25)", | |
675 | "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip) | |
676 | "&por (@x[$b1],$t0)", | |
677 | ||
678 | "&movdqa (\"`16*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's | |
679 | "&movdqa (\"`16*($c1-8)`(%rsp)\",$xc_)", | |
680 | "&movdqa ($xc,\"`16*($c2-8)`(%rsp)\")", | |
681 | "&movdqa ($xc_,\"`16*($c3-8)`(%rsp)\")", | |
682 | ||
683 | "&paddd (@x[$a2],@x[$b2])", # Q3 | |
684 | "&paddd (@x[$a3],@x[$b3])", # Q4 | |
685 | "&pxor (@x[$d2],@x[$a2])", | |
686 | "&pxor (@x[$d3],@x[$a3])", | |
687 | "&pshufb (@x[$d2],$t1)", | |
688 | "&pshufb (@x[$d3],$t1)", | |
689 | ||
690 | "&paddd ($xc,@x[$d2])", | |
691 | "&paddd ($xc_,@x[$d3])", | |
692 | "&pxor (@x[$b2],$xc)", | |
693 | "&pxor (@x[$b3],$xc_)", | |
694 | "&movdqa ($t0,@x[$b2])", | |
695 | "&pslld (@x[$b2],12)", | |
696 | "&psrld ($t0,20)", | |
697 | "&movdqa ($t1,@x[$b3])", | |
698 | "&pslld (@x[$b3],12)", | |
699 | "&por (@x[$b2],$t0)", | |
700 | "&psrld ($t1,20)", | |
701 | "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip) | |
702 | "&por (@x[$b3],$t1)", | |
703 | ||
704 | "&paddd (@x[$a2],@x[$b2])", | |
705 | "&paddd (@x[$a3],@x[$b3])", | |
706 | "&pxor (@x[$d2],@x[$a2])", | |
707 | "&pxor (@x[$d3],@x[$a3])", | |
708 | "&pshufb (@x[$d2],$t0)", | |
709 | "&pshufb (@x[$d3],$t0)", | |
710 | ||
711 | "&paddd ($xc,@x[$d2])", | |
712 | "&paddd ($xc_,@x[$d3])", | |
713 | "&pxor (@x[$b2],$xc)", | |
714 | "&pxor (@x[$b3],$xc_)", | |
715 | "&movdqa ($t1,@x[$b2])", | |
716 | "&pslld (@x[$b2],7)", | |
717 | "&psrld ($t1,25)", | |
718 | "&movdqa ($t0,@x[$b3])", | |
719 | "&pslld (@x[$b3],7)", | |
720 | "&por (@x[$b2],$t1)", | |
721 | "&psrld ($t0,25)", | |
722 | "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip) | |
723 | "&por (@x[$b3],$t0)" | |
724 | ); | |
725 | } | |
726 | ||
384e6de4 | 727 | my $xframe = $win64 ? 0xa8 : 8; |
a98c648e AP |
728 | |
729 | $code.=<<___; | |
730 | .type ChaCha20_4x,\@function,5 | |
731 | .align 32 | |
732 | ChaCha20_4x: | |
f17652e5 | 733 | .cfi_startproc |
a98c648e | 734 | .LChaCha20_4x: |
384e6de4 | 735 | mov %rsp,%r9 # frame pointer |
f17652e5 | 736 | .cfi_def_cfa_register %r9 |
a98c648e AP |
737 | mov %r10,%r11 |
738 | ___ | |
739 | $code.=<<___ if ($avx>1); | |
740 | shr \$32,%r10 # OPENSSL_ia32cap_P+8 | |
741 | test \$`1<<5`,%r10 # test AVX2 | |
742 | jnz .LChaCha20_8x | |
743 | ___ | |
744 | $code.=<<___; | |
745 | cmp \$192,$len | |
746 | ja .Lproceed4x | |
747 | ||
748 | and \$`1<<26|1<<22`,%r11 # isolate XSAVE+MOVBE | |
749 | cmp \$`1<<22`,%r11 # check for MOVBE without XSAVE | |
750 | je .Ldo_sse3_after_all # to detect Atom | |
751 | ||
752 | .Lproceed4x: | |
384e6de4 | 753 | sub \$0x140+$xframe,%rsp |
a98c648e AP |
754 | ___ |
755 | ################ stack layout | |
756 | # +0x00 SIMD equivalent of @x[8-12] | |
757 | # ... | |
758 | # +0x40 constant copy of key[0-2] smashed by lanes | |
759 | # ... | |
760 | # +0x100 SIMD counters (with nonce smashed by lanes) | |
761 | # ... | |
762 | # +0x140 | |
763 | $code.=<<___ if ($win64); | |
384e6de4 AP |
764 | movaps %xmm6,-0xa8(%r9) |
765 | movaps %xmm7,-0x98(%r9) | |
766 | movaps %xmm8,-0x88(%r9) | |
767 | movaps %xmm9,-0x78(%r9) | |
768 | movaps %xmm10,-0x68(%r9) | |
769 | movaps %xmm11,-0x58(%r9) | |
770 | movaps %xmm12,-0x48(%r9) | |
771 | movaps %xmm13,-0x38(%r9) | |
772 | movaps %xmm14,-0x28(%r9) | |
773 | movaps %xmm15,-0x18(%r9) | |
774 | .L4x_body: | |
a98c648e AP |
775 | ___ |
776 | $code.=<<___; | |
777 | movdqa .Lsigma(%rip),$xa3 # key[0] | |
778 | movdqu ($key),$xb3 # key[1] | |
779 | movdqu 16($key),$xt3 # key[2] | |
780 | movdqu ($counter),$xd3 # key[3] | |
781 | lea 0x100(%rsp),%rcx # size optimization | |
782 | lea .Lrot16(%rip),%r10 | |
783 | lea .Lrot24(%rip),%r11 | |
784 | ||
785 | pshufd \$0x00,$xa3,$xa0 # smash key by lanes... | |
786 | pshufd \$0x55,$xa3,$xa1 | |
787 | movdqa $xa0,0x40(%rsp) # ... and offload | |
788 | pshufd \$0xaa,$xa3,$xa2 | |
789 | movdqa $xa1,0x50(%rsp) | |
790 | pshufd \$0xff,$xa3,$xa3 | |
791 | movdqa $xa2,0x60(%rsp) | |
792 | movdqa $xa3,0x70(%rsp) | |
793 | ||
794 | pshufd \$0x00,$xb3,$xb0 | |
795 | pshufd \$0x55,$xb3,$xb1 | |
796 | movdqa $xb0,0x80-0x100(%rcx) | |
797 | pshufd \$0xaa,$xb3,$xb2 | |
798 | movdqa $xb1,0x90-0x100(%rcx) | |
799 | pshufd \$0xff,$xb3,$xb3 | |
800 | movdqa $xb2,0xa0-0x100(%rcx) | |
801 | movdqa $xb3,0xb0-0x100(%rcx) | |
802 | ||
803 | pshufd \$0x00,$xt3,$xt0 # "$xc0" | |
804 | pshufd \$0x55,$xt3,$xt1 # "$xc1" | |
805 | movdqa $xt0,0xc0-0x100(%rcx) | |
806 | pshufd \$0xaa,$xt3,$xt2 # "$xc2" | |
807 | movdqa $xt1,0xd0-0x100(%rcx) | |
808 | pshufd \$0xff,$xt3,$xt3 # "$xc3" | |
809 | movdqa $xt2,0xe0-0x100(%rcx) | |
810 | movdqa $xt3,0xf0-0x100(%rcx) | |
811 | ||
812 | pshufd \$0x00,$xd3,$xd0 | |
813 | pshufd \$0x55,$xd3,$xd1 | |
814 | paddd .Linc(%rip),$xd0 # don't save counters yet | |
815 | pshufd \$0xaa,$xd3,$xd2 | |
816 | movdqa $xd1,0x110-0x100(%rcx) | |
817 | pshufd \$0xff,$xd3,$xd3 | |
818 | movdqa $xd2,0x120-0x100(%rcx) | |
819 | movdqa $xd3,0x130-0x100(%rcx) | |
820 | ||
821 | jmp .Loop_enter4x | |
822 | ||
823 | .align 32 | |
824 | .Loop_outer4x: | |
825 | movdqa 0x40(%rsp),$xa0 # re-load smashed key | |
826 | movdqa 0x50(%rsp),$xa1 | |
827 | movdqa 0x60(%rsp),$xa2 | |
828 | movdqa 0x70(%rsp),$xa3 | |
829 | movdqa 0x80-0x100(%rcx),$xb0 | |
830 | movdqa 0x90-0x100(%rcx),$xb1 | |
831 | movdqa 0xa0-0x100(%rcx),$xb2 | |
832 | movdqa 0xb0-0x100(%rcx),$xb3 | |
833 | movdqa 0xc0-0x100(%rcx),$xt0 # "$xc0" | |
834 | movdqa 0xd0-0x100(%rcx),$xt1 # "$xc1" | |
835 | movdqa 0xe0-0x100(%rcx),$xt2 # "$xc2" | |
836 | movdqa 0xf0-0x100(%rcx),$xt3 # "$xc3" | |
837 | movdqa 0x100-0x100(%rcx),$xd0 | |
838 | movdqa 0x110-0x100(%rcx),$xd1 | |
839 | movdqa 0x120-0x100(%rcx),$xd2 | |
840 | movdqa 0x130-0x100(%rcx),$xd3 | |
841 | paddd .Lfour(%rip),$xd0 # next SIMD counters | |
842 | ||
843 | .Loop_enter4x: | |
844 | movdqa $xt2,0x20(%rsp) # SIMD equivalent of "@x[10]" | |
845 | movdqa $xt3,0x30(%rsp) # SIMD equivalent of "@x[11]" | |
846 | movdqa (%r10),$xt3 # .Lrot16(%rip) | |
847 | mov \$10,%eax | |
848 | movdqa $xd0,0x100-0x100(%rcx) # save SIMD counters | |
849 | jmp .Loop4x | |
850 | ||
851 | .align 32 | |
852 | .Loop4x: | |
853 | ___ | |
854 | foreach (&SSSE3_lane_ROUND(0, 4, 8,12)) { eval; } | |
855 | foreach (&SSSE3_lane_ROUND(0, 5,10,15)) { eval; } | |
856 | $code.=<<___; | |
857 | dec %eax | |
858 | jnz .Loop4x | |
859 | ||
860 | paddd 0x40(%rsp),$xa0 # accumulate key material | |
861 | paddd 0x50(%rsp),$xa1 | |
862 | paddd 0x60(%rsp),$xa2 | |
863 | paddd 0x70(%rsp),$xa3 | |
864 | ||
865 | movdqa $xa0,$xt2 # "de-interlace" data | |
866 | punpckldq $xa1,$xa0 | |
867 | movdqa $xa2,$xt3 | |
868 | punpckldq $xa3,$xa2 | |
869 | punpckhdq $xa1,$xt2 | |
870 | punpckhdq $xa3,$xt3 | |
871 | movdqa $xa0,$xa1 | |
872 | punpcklqdq $xa2,$xa0 # "a0" | |
873 | movdqa $xt2,$xa3 | |
874 | punpcklqdq $xt3,$xt2 # "a2" | |
875 | punpckhqdq $xa2,$xa1 # "a1" | |
876 | punpckhqdq $xt3,$xa3 # "a3" | |
877 | ___ | |
878 | ($xa2,$xt2)=($xt2,$xa2); | |
879 | $code.=<<___; | |
880 | paddd 0x80-0x100(%rcx),$xb0 | |
881 | paddd 0x90-0x100(%rcx),$xb1 | |
882 | paddd 0xa0-0x100(%rcx),$xb2 | |
883 | paddd 0xb0-0x100(%rcx),$xb3 | |
884 | ||
885 | movdqa $xa0,0x00(%rsp) # offload $xaN | |
886 | movdqa $xa1,0x10(%rsp) | |
887 | movdqa 0x20(%rsp),$xa0 # "xc2" | |
888 | movdqa 0x30(%rsp),$xa1 # "xc3" | |
889 | ||
890 | movdqa $xb0,$xt2 | |
891 | punpckldq $xb1,$xb0 | |
892 | movdqa $xb2,$xt3 | |
893 | punpckldq $xb3,$xb2 | |
894 | punpckhdq $xb1,$xt2 | |
895 | punpckhdq $xb3,$xt3 | |
896 | movdqa $xb0,$xb1 | |
897 | punpcklqdq $xb2,$xb0 # "b0" | |
898 | movdqa $xt2,$xb3 | |
899 | punpcklqdq $xt3,$xt2 # "b2" | |
900 | punpckhqdq $xb2,$xb1 # "b1" | |
901 | punpckhqdq $xt3,$xb3 # "b3" | |
902 | ___ | |
903 | ($xb2,$xt2)=($xt2,$xb2); | |
904 | my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1); | |
905 | $code.=<<___; | |
906 | paddd 0xc0-0x100(%rcx),$xc0 | |
907 | paddd 0xd0-0x100(%rcx),$xc1 | |
908 | paddd 0xe0-0x100(%rcx),$xc2 | |
909 | paddd 0xf0-0x100(%rcx),$xc3 | |
910 | ||
911 | movdqa $xa2,0x20(%rsp) # keep offloading $xaN | |
912 | movdqa $xa3,0x30(%rsp) | |
913 | ||
914 | movdqa $xc0,$xt2 | |
915 | punpckldq $xc1,$xc0 | |
916 | movdqa $xc2,$xt3 | |
917 | punpckldq $xc3,$xc2 | |
918 | punpckhdq $xc1,$xt2 | |
919 | punpckhdq $xc3,$xt3 | |
920 | movdqa $xc0,$xc1 | |
921 | punpcklqdq $xc2,$xc0 # "c0" | |
922 | movdqa $xt2,$xc3 | |
923 | punpcklqdq $xt3,$xt2 # "c2" | |
924 | punpckhqdq $xc2,$xc1 # "c1" | |
925 | punpckhqdq $xt3,$xc3 # "c3" | |
926 | ___ | |
927 | ($xc2,$xt2)=($xt2,$xc2); | |
928 | ($xt0,$xt1)=($xa2,$xa3); # use $xaN as temporary | |
929 | $code.=<<___; | |
930 | paddd 0x100-0x100(%rcx),$xd0 | |
931 | paddd 0x110-0x100(%rcx),$xd1 | |
932 | paddd 0x120-0x100(%rcx),$xd2 | |
933 | paddd 0x130-0x100(%rcx),$xd3 | |
934 | ||
935 | movdqa $xd0,$xt2 | |
936 | punpckldq $xd1,$xd0 | |
937 | movdqa $xd2,$xt3 | |
938 | punpckldq $xd3,$xd2 | |
939 | punpckhdq $xd1,$xt2 | |
940 | punpckhdq $xd3,$xt3 | |
941 | movdqa $xd0,$xd1 | |
942 | punpcklqdq $xd2,$xd0 # "d0" | |
943 | movdqa $xt2,$xd3 | |
944 | punpcklqdq $xt3,$xt2 # "d2" | |
945 | punpckhqdq $xd2,$xd1 # "d1" | |
946 | punpckhqdq $xt3,$xd3 # "d3" | |
947 | ___ | |
948 | ($xd2,$xt2)=($xt2,$xd2); | |
949 | $code.=<<___; | |
950 | cmp \$64*4,$len | |
951 | jb .Ltail4x | |
952 | ||
953 | movdqu 0x00($inp),$xt0 # xor with input | |
954 | movdqu 0x10($inp),$xt1 | |
955 | movdqu 0x20($inp),$xt2 | |
956 | movdqu 0x30($inp),$xt3 | |
957 | pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember? | |
958 | pxor $xb0,$xt1 | |
959 | pxor $xc0,$xt2 | |
960 | pxor $xd0,$xt3 | |
961 | ||
962 | movdqu $xt0,0x00($out) | |
963 | movdqu 0x40($inp),$xt0 | |
964 | movdqu $xt1,0x10($out) | |
965 | movdqu 0x50($inp),$xt1 | |
966 | movdqu $xt2,0x20($out) | |
967 | movdqu 0x60($inp),$xt2 | |
968 | movdqu $xt3,0x30($out) | |
969 | movdqu 0x70($inp),$xt3 | |
970 | lea 0x80($inp),$inp # size optimization | |
971 | pxor 0x10(%rsp),$xt0 | |
972 | pxor $xb1,$xt1 | |
973 | pxor $xc1,$xt2 | |
974 | pxor $xd1,$xt3 | |
975 | ||
976 | movdqu $xt0,0x40($out) | |
977 | movdqu 0x00($inp),$xt0 | |
978 | movdqu $xt1,0x50($out) | |
979 | movdqu 0x10($inp),$xt1 | |
980 | movdqu $xt2,0x60($out) | |
981 | movdqu 0x20($inp),$xt2 | |
982 | movdqu $xt3,0x70($out) | |
983 | lea 0x80($out),$out # size optimization | |
984 | movdqu 0x30($inp),$xt3 | |
985 | pxor 0x20(%rsp),$xt0 | |
986 | pxor $xb2,$xt1 | |
987 | pxor $xc2,$xt2 | |
988 | pxor $xd2,$xt3 | |
989 | ||
990 | movdqu $xt0,0x00($out) | |
991 | movdqu 0x40($inp),$xt0 | |
992 | movdqu $xt1,0x10($out) | |
993 | movdqu 0x50($inp),$xt1 | |
994 | movdqu $xt2,0x20($out) | |
995 | movdqu 0x60($inp),$xt2 | |
996 | movdqu $xt3,0x30($out) | |
997 | movdqu 0x70($inp),$xt3 | |
998 | lea 0x80($inp),$inp # inp+=64*4 | |
999 | pxor 0x30(%rsp),$xt0 | |
1000 | pxor $xb3,$xt1 | |
1001 | pxor $xc3,$xt2 | |
1002 | pxor $xd3,$xt3 | |
1003 | movdqu $xt0,0x40($out) | |
1004 | movdqu $xt1,0x50($out) | |
1005 | movdqu $xt2,0x60($out) | |
1006 | movdqu $xt3,0x70($out) | |
1007 | lea 0x80($out),$out # out+=64*4 | |
1008 | ||
1009 | sub \$64*4,$len | |
1010 | jnz .Loop_outer4x | |
1011 | ||
1012 | jmp .Ldone4x | |
1013 | ||
1014 | .Ltail4x: | |
1015 | cmp \$192,$len | |
1016 | jae .L192_or_more4x | |
1017 | cmp \$128,$len | |
1018 | jae .L128_or_more4x | |
1019 | cmp \$64,$len | |
1020 | jae .L64_or_more4x | |
1021 | ||
1022 | #movdqa 0x00(%rsp),$xt0 # $xaN is offloaded, remember? | |
1023 | xor %r10,%r10 | |
1024 | #movdqa $xt0,0x00(%rsp) | |
1025 | movdqa $xb0,0x10(%rsp) | |
1026 | movdqa $xc0,0x20(%rsp) | |
1027 | movdqa $xd0,0x30(%rsp) | |
1028 | jmp .Loop_tail4x | |
1029 | ||
1030 | .align 32 | |
1031 | .L64_or_more4x: | |
1032 | movdqu 0x00($inp),$xt0 # xor with input | |
1033 | movdqu 0x10($inp),$xt1 | |
1034 | movdqu 0x20($inp),$xt2 | |
1035 | movdqu 0x30($inp),$xt3 | |
1036 | pxor 0x00(%rsp),$xt0 # $xaxN is offloaded, remember? | |
1037 | pxor $xb0,$xt1 | |
1038 | pxor $xc0,$xt2 | |
1039 | pxor $xd0,$xt3 | |
1040 | movdqu $xt0,0x00($out) | |
1041 | movdqu $xt1,0x10($out) | |
1042 | movdqu $xt2,0x20($out) | |
1043 | movdqu $xt3,0x30($out) | |
1044 | je .Ldone4x | |
1045 | ||
1046 | movdqa 0x10(%rsp),$xt0 # $xaN is offloaded, remember? | |
1047 | lea 0x40($inp),$inp # inp+=64*1 | |
1048 | xor %r10,%r10 | |
1049 | movdqa $xt0,0x00(%rsp) | |
1050 | movdqa $xb1,0x10(%rsp) | |
1051 | lea 0x40($out),$out # out+=64*1 | |
1052 | movdqa $xc1,0x20(%rsp) | |
1053 | sub \$64,$len # len-=64*1 | |
1054 | movdqa $xd1,0x30(%rsp) | |
1055 | jmp .Loop_tail4x | |
1056 | ||
1057 | .align 32 | |
1058 | .L128_or_more4x: | |
1059 | movdqu 0x00($inp),$xt0 # xor with input | |
1060 | movdqu 0x10($inp),$xt1 | |
1061 | movdqu 0x20($inp),$xt2 | |
1062 | movdqu 0x30($inp),$xt3 | |
1063 | pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember? | |
1064 | pxor $xb0,$xt1 | |
1065 | pxor $xc0,$xt2 | |
1066 | pxor $xd0,$xt3 | |
1067 | ||
1068 | movdqu $xt0,0x00($out) | |
1069 | movdqu 0x40($inp),$xt0 | |
1070 | movdqu $xt1,0x10($out) | |
1071 | movdqu 0x50($inp),$xt1 | |
1072 | movdqu $xt2,0x20($out) | |
1073 | movdqu 0x60($inp),$xt2 | |
1074 | movdqu $xt3,0x30($out) | |
1075 | movdqu 0x70($inp),$xt3 | |
1076 | pxor 0x10(%rsp),$xt0 | |
1077 | pxor $xb1,$xt1 | |
1078 | pxor $xc1,$xt2 | |
1079 | pxor $xd1,$xt3 | |
1080 | movdqu $xt0,0x40($out) | |
1081 | movdqu $xt1,0x50($out) | |
1082 | movdqu $xt2,0x60($out) | |
1083 | movdqu $xt3,0x70($out) | |
1084 | je .Ldone4x | |
1085 | ||
1086 | movdqa 0x20(%rsp),$xt0 # $xaN is offloaded, remember? | |
1087 | lea 0x80($inp),$inp # inp+=64*2 | |
1088 | xor %r10,%r10 | |
1089 | movdqa $xt0,0x00(%rsp) | |
1090 | movdqa $xb2,0x10(%rsp) | |
1091 | lea 0x80($out),$out # out+=64*2 | |
1092 | movdqa $xc2,0x20(%rsp) | |
1093 | sub \$128,$len # len-=64*2 | |
1094 | movdqa $xd2,0x30(%rsp) | |
1095 | jmp .Loop_tail4x | |
1096 | ||
1097 | .align 32 | |
1098 | .L192_or_more4x: | |
1099 | movdqu 0x00($inp),$xt0 # xor with input | |
1100 | movdqu 0x10($inp),$xt1 | |
1101 | movdqu 0x20($inp),$xt2 | |
1102 | movdqu 0x30($inp),$xt3 | |
1103 | pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember? | |
1104 | pxor $xb0,$xt1 | |
1105 | pxor $xc0,$xt2 | |
1106 | pxor $xd0,$xt3 | |
1107 | ||
1108 | movdqu $xt0,0x00($out) | |
1109 | movdqu 0x40($inp),$xt0 | |
1110 | movdqu $xt1,0x10($out) | |
1111 | movdqu 0x50($inp),$xt1 | |
1112 | movdqu $xt2,0x20($out) | |
1113 | movdqu 0x60($inp),$xt2 | |
1114 | movdqu $xt3,0x30($out) | |
1115 | movdqu 0x70($inp),$xt3 | |
1116 | lea 0x80($inp),$inp # size optimization | |
1117 | pxor 0x10(%rsp),$xt0 | |
1118 | pxor $xb1,$xt1 | |
1119 | pxor $xc1,$xt2 | |
1120 | pxor $xd1,$xt3 | |
1121 | ||
1122 | movdqu $xt0,0x40($out) | |
1123 | movdqu 0x00($inp),$xt0 | |
1124 | movdqu $xt1,0x50($out) | |
1125 | movdqu 0x10($inp),$xt1 | |
1126 | movdqu $xt2,0x60($out) | |
1127 | movdqu 0x20($inp),$xt2 | |
1128 | movdqu $xt3,0x70($out) | |
1129 | lea 0x80($out),$out # size optimization | |
1130 | movdqu 0x30($inp),$xt3 | |
1131 | pxor 0x20(%rsp),$xt0 | |
1132 | pxor $xb2,$xt1 | |
1133 | pxor $xc2,$xt2 | |
1134 | pxor $xd2,$xt3 | |
1135 | movdqu $xt0,0x00($out) | |
1136 | movdqu $xt1,0x10($out) | |
1137 | movdqu $xt2,0x20($out) | |
1138 | movdqu $xt3,0x30($out) | |
1139 | je .Ldone4x | |
1140 | ||
1141 | movdqa 0x30(%rsp),$xt0 # $xaN is offloaded, remember? | |
1142 | lea 0x40($inp),$inp # inp+=64*3 | |
1143 | xor %r10,%r10 | |
1144 | movdqa $xt0,0x00(%rsp) | |
1145 | movdqa $xb3,0x10(%rsp) | |
1146 | lea 0x40($out),$out # out+=64*3 | |
1147 | movdqa $xc3,0x20(%rsp) | |
1148 | sub \$192,$len # len-=64*3 | |
1149 | movdqa $xd3,0x30(%rsp) | |
1150 | ||
1151 | .Loop_tail4x: | |
1152 | movzb ($inp,%r10),%eax | |
1153 | movzb (%rsp,%r10),%ecx | |
1154 | lea 1(%r10),%r10 | |
1155 | xor %ecx,%eax | |
1156 | mov %al,-1($out,%r10) | |
1157 | dec $len | |
1158 | jnz .Loop_tail4x | |
1159 | ||
1160 | .Ldone4x: | |
1161 | ___ | |
1162 | $code.=<<___ if ($win64); | |
384e6de4 AP |
1163 | movaps -0xa8(%r9),%xmm6 |
1164 | movaps -0x98(%r9),%xmm7 | |
1165 | movaps -0x88(%r9),%xmm8 | |
1166 | movaps -0x78(%r9),%xmm9 | |
1167 | movaps -0x68(%r9),%xmm10 | |
1168 | movaps -0x58(%r9),%xmm11 | |
1169 | movaps -0x48(%r9),%xmm12 | |
1170 | movaps -0x38(%r9),%xmm13 | |
1171 | movaps -0x28(%r9),%xmm14 | |
1172 | movaps -0x18(%r9),%xmm15 | |
a98c648e AP |
1173 | ___ |
1174 | $code.=<<___; | |
384e6de4 | 1175 | lea (%r9),%rsp |
f17652e5 | 1176 | .cfi_def_cfa_register %rsp |
384e6de4 | 1177 | .L4x_epilogue: |
a98c648e | 1178 | ret |
f17652e5 | 1179 | .cfi_endproc |
a98c648e AP |
1180 | .size ChaCha20_4x,.-ChaCha20_4x |
1181 | ___ | |
1182 | } | |
1183 | ||
1184 | ######################################################################## | |
1185 | # XOP code path that handles all lengths. | |
1186 | if ($avx) { | |
1187 | # There is some "anomaly" observed depending on instructions' size or | |
1188 | # alignment. If you look closely at below code you'll notice that | |
1189 | # sometimes argument order varies. The order affects instruction | |
1190 | # encoding by making it larger, and such fiddling gives 5% performance | |
1191 | # improvement. This is on FX-4100... | |
1192 | ||
1193 | my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3, | |
1194 | $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%xmm$_",(0..15)); | |
1195 | my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, | |
1196 | $xt0,$xt1,$xt2,$xt3, $xd0,$xd1,$xd2,$xd3); | |
1197 | ||
# Emit one ChaCha20 "double-round" for the XOP (AMD Bulldozer-family)
# code path: four independent quarter-rounds (Q1..Q4) interleaved for
# instruction-level parallelism.  The arguments are the @xx indices of
# the first quarter-round's (a,b,c,d) registers; the remaining three
# lanes are derived arithmetically (see below).  Returns a list of
# strings which the caller evals one by one to emit instructions.
#
# XOP's vprotd does each 16/12/8/7-bit rotate in a single instruction,
# so no vpslld/vpsrld/vpor triplets are needed on this path.
sub XOP_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
# ($_&~3)+(($_+1)&3) keeps the index's group-of-four base and advances
# the lane within the group modulo 4, yielding the register sets for
# Q2, Q3 and Q4 from the Q1 indices.
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my @x=map("\"$_\"",@xx);	# quote register names for later eval

	# NOTE: "flip" marks commutative operands in deliberately swapped
	# order.  Per the comment at the top of this section, the order
	# affects instruction encoding size, and this fiddling was measured
	# to give ~5% on FX-4100.  Do not "normalize" these.
	(
	"&vpaddd	(@x[$a0],@x[$a0],@x[$b0])",	# Q1
	"&vpaddd	(@x[$a1],@x[$a1],@x[$b1])",	# Q2
	"&vpaddd	(@x[$a2],@x[$a2],@x[$b2])",	# Q3
	"&vpaddd	(@x[$a3],@x[$a3],@x[$b3])",	# Q4
	"&vpxor		(@x[$d0],@x[$a0],@x[$d0])",
	"&vpxor		(@x[$d1],@x[$a1],@x[$d1])",
	"&vpxor		(@x[$d2],@x[$a2],@x[$d2])",
	"&vpxor		(@x[$d3],@x[$a3],@x[$d3])",
	"&vprotd	(@x[$d0],@x[$d0],16)",
	"&vprotd	(@x[$d1],@x[$d1],16)",
	"&vprotd	(@x[$d2],@x[$d2],16)",
	"&vprotd	(@x[$d3],@x[$d3],16)",

	"&vpaddd	(@x[$c0],@x[$c0],@x[$d0])",
	"&vpaddd	(@x[$c1],@x[$c1],@x[$d1])",
	"&vpaddd	(@x[$c2],@x[$c2],@x[$d2])",
	"&vpaddd	(@x[$c3],@x[$c3],@x[$d3])",
	"&vpxor		(@x[$b0],@x[$c0],@x[$b0])",
	"&vpxor		(@x[$b1],@x[$c1],@x[$b1])",
	"&vpxor		(@x[$b2],@x[$b2],@x[$c2])",	# flip
	"&vpxor		(@x[$b3],@x[$b3],@x[$c3])",	# flip
	"&vprotd	(@x[$b0],@x[$b0],12)",
	"&vprotd	(@x[$b1],@x[$b1],12)",
	"&vprotd	(@x[$b2],@x[$b2],12)",
	"&vprotd	(@x[$b3],@x[$b3],12)",

	"&vpaddd	(@x[$a0],@x[$b0],@x[$a0])",	# flip
	"&vpaddd	(@x[$a1],@x[$b1],@x[$a1])",	# flip
	"&vpaddd	(@x[$a2],@x[$a2],@x[$b2])",
	"&vpaddd	(@x[$a3],@x[$a3],@x[$b3])",
	"&vpxor		(@x[$d0],@x[$a0],@x[$d0])",
	"&vpxor		(@x[$d1],@x[$a1],@x[$d1])",
	"&vpxor		(@x[$d2],@x[$a2],@x[$d2])",
	"&vpxor		(@x[$d3],@x[$a3],@x[$d3])",
	"&vprotd	(@x[$d0],@x[$d0],8)",
	"&vprotd	(@x[$d1],@x[$d1],8)",
	"&vprotd	(@x[$d2],@x[$d2],8)",
	"&vprotd	(@x[$d3],@x[$d3],8)",

	"&vpaddd	(@x[$c0],@x[$c0],@x[$d0])",
	"&vpaddd	(@x[$c1],@x[$c1],@x[$d1])",
	"&vpaddd	(@x[$c2],@x[$c2],@x[$d2])",
	"&vpaddd	(@x[$c3],@x[$c3],@x[$d3])",
	"&vpxor		(@x[$b0],@x[$c0],@x[$b0])",
	"&vpxor		(@x[$b1],@x[$c1],@x[$b1])",
	"&vpxor		(@x[$b2],@x[$b2],@x[$c2])",	# flip
	"&vpxor		(@x[$b3],@x[$b3],@x[$c3])",	# flip
	"&vprotd	(@x[$b0],@x[$b0],7)",
	"&vprotd	(@x[$b1],@x[$b1],7)",
	"&vprotd	(@x[$b2],@x[$b2],7)",
	"&vprotd	(@x[$b3],@x[$b3],7)"
	);
}
1259 | ||
384e6de4 | 1260 | my $xframe = $win64 ? 0xa8 : 8; |
a98c648e AP |
1261 | |
1262 | $code.=<<___; | |
1263 | .type ChaCha20_4xop,\@function,5 | |
1264 | .align 32 | |
1265 | ChaCha20_4xop: | |
f17652e5 | 1266 | .cfi_startproc |
a98c648e | 1267 | .LChaCha20_4xop: |
384e6de4 | 1268 | mov %rsp,%r9 # frame pointer |
f17652e5 | 1269 | .cfi_def_cfa_register %r9 |
384e6de4 | 1270 | sub \$0x140+$xframe,%rsp |
a98c648e AP |
1271 | ___ |
1272 | ################ stack layout | |
1273 | # +0x00 SIMD equivalent of @x[8-12] | |
1274 | # ... | |
1275 | # +0x40 constant copy of key[0-2] smashed by lanes | |
1276 | # ... | |
1277 | # +0x100 SIMD counters (with nonce smashed by lanes) | |
1278 | # ... | |
1279 | # +0x140 | |
1280 | $code.=<<___ if ($win64); | |
384e6de4 AP |
1281 | movaps %xmm6,-0xa8(%r9) |
1282 | movaps %xmm7,-0x98(%r9) | |
1283 | movaps %xmm8,-0x88(%r9) | |
1284 | movaps %xmm9,-0x78(%r9) | |
1285 | movaps %xmm10,-0x68(%r9) | |
1286 | movaps %xmm11,-0x58(%r9) | |
1287 | movaps %xmm12,-0x48(%r9) | |
1288 | movaps %xmm13,-0x38(%r9) | |
1289 | movaps %xmm14,-0x28(%r9) | |
1290 | movaps %xmm15,-0x18(%r9) | |
1291 | .L4xop_body: | |
a98c648e AP |
1292 | ___ |
1293 | $code.=<<___; | |
1294 | vzeroupper | |
1295 | ||
1296 | vmovdqa .Lsigma(%rip),$xa3 # key[0] | |
1297 | vmovdqu ($key),$xb3 # key[1] | |
1298 | vmovdqu 16($key),$xt3 # key[2] | |
1299 | vmovdqu ($counter),$xd3 # key[3] | |
1300 | lea 0x100(%rsp),%rcx # size optimization | |
1301 | ||
1302 | vpshufd \$0x00,$xa3,$xa0 # smash key by lanes... | |
1303 | vpshufd \$0x55,$xa3,$xa1 | |
1304 | vmovdqa $xa0,0x40(%rsp) # ... and offload | |
1305 | vpshufd \$0xaa,$xa3,$xa2 | |
1306 | vmovdqa $xa1,0x50(%rsp) | |
1307 | vpshufd \$0xff,$xa3,$xa3 | |
1308 | vmovdqa $xa2,0x60(%rsp) | |
1309 | vmovdqa $xa3,0x70(%rsp) | |
1310 | ||
1311 | vpshufd \$0x00,$xb3,$xb0 | |
1312 | vpshufd \$0x55,$xb3,$xb1 | |
1313 | vmovdqa $xb0,0x80-0x100(%rcx) | |
1314 | vpshufd \$0xaa,$xb3,$xb2 | |
1315 | vmovdqa $xb1,0x90-0x100(%rcx) | |
1316 | vpshufd \$0xff,$xb3,$xb3 | |
1317 | vmovdqa $xb2,0xa0-0x100(%rcx) | |
1318 | vmovdqa $xb3,0xb0-0x100(%rcx) | |
1319 | ||
1320 | vpshufd \$0x00,$xt3,$xt0 # "$xc0" | |
1321 | vpshufd \$0x55,$xt3,$xt1 # "$xc1" | |
1322 | vmovdqa $xt0,0xc0-0x100(%rcx) | |
1323 | vpshufd \$0xaa,$xt3,$xt2 # "$xc2" | |
1324 | vmovdqa $xt1,0xd0-0x100(%rcx) | |
1325 | vpshufd \$0xff,$xt3,$xt3 # "$xc3" | |
1326 | vmovdqa $xt2,0xe0-0x100(%rcx) | |
1327 | vmovdqa $xt3,0xf0-0x100(%rcx) | |
1328 | ||
1329 | vpshufd \$0x00,$xd3,$xd0 | |
1330 | vpshufd \$0x55,$xd3,$xd1 | |
1331 | vpaddd .Linc(%rip),$xd0,$xd0 # don't save counters yet | |
1332 | vpshufd \$0xaa,$xd3,$xd2 | |
1333 | vmovdqa $xd1,0x110-0x100(%rcx) | |
1334 | vpshufd \$0xff,$xd3,$xd3 | |
1335 | vmovdqa $xd2,0x120-0x100(%rcx) | |
1336 | vmovdqa $xd3,0x130-0x100(%rcx) | |
1337 | ||
1338 | jmp .Loop_enter4xop | |
1339 | ||
1340 | .align 32 | |
1341 | .Loop_outer4xop: | |
1342 | vmovdqa 0x40(%rsp),$xa0 # re-load smashed key | |
1343 | vmovdqa 0x50(%rsp),$xa1 | |
1344 | vmovdqa 0x60(%rsp),$xa2 | |
1345 | vmovdqa 0x70(%rsp),$xa3 | |
1346 | vmovdqa 0x80-0x100(%rcx),$xb0 | |
1347 | vmovdqa 0x90-0x100(%rcx),$xb1 | |
1348 | vmovdqa 0xa0-0x100(%rcx),$xb2 | |
1349 | vmovdqa 0xb0-0x100(%rcx),$xb3 | |
1350 | vmovdqa 0xc0-0x100(%rcx),$xt0 # "$xc0" | |
1351 | vmovdqa 0xd0-0x100(%rcx),$xt1 # "$xc1" | |
1352 | vmovdqa 0xe0-0x100(%rcx),$xt2 # "$xc2" | |
1353 | vmovdqa 0xf0-0x100(%rcx),$xt3 # "$xc3" | |
1354 | vmovdqa 0x100-0x100(%rcx),$xd0 | |
1355 | vmovdqa 0x110-0x100(%rcx),$xd1 | |
1356 | vmovdqa 0x120-0x100(%rcx),$xd2 | |
1357 | vmovdqa 0x130-0x100(%rcx),$xd3 | |
1358 | vpaddd .Lfour(%rip),$xd0,$xd0 # next SIMD counters | |
1359 | ||
1360 | .Loop_enter4xop: | |
1361 | mov \$10,%eax | |
1362 | vmovdqa $xd0,0x100-0x100(%rcx) # save SIMD counters | |
1363 | jmp .Loop4xop | |
1364 | ||
1365 | .align 32 | |
1366 | .Loop4xop: | |
1367 | ___ | |
1368 | foreach (&XOP_lane_ROUND(0, 4, 8,12)) { eval; } | |
1369 | foreach (&XOP_lane_ROUND(0, 5,10,15)) { eval; } | |
1370 | $code.=<<___; | |
1371 | dec %eax | |
1372 | jnz .Loop4xop | |
1373 | ||
1374 | vpaddd 0x40(%rsp),$xa0,$xa0 # accumulate key material | |
1375 | vpaddd 0x50(%rsp),$xa1,$xa1 | |
1376 | vpaddd 0x60(%rsp),$xa2,$xa2 | |
1377 | vpaddd 0x70(%rsp),$xa3,$xa3 | |
1378 | ||
1379 | vmovdqa $xt2,0x20(%rsp) # offload $xc2,3 | |
1380 | vmovdqa $xt3,0x30(%rsp) | |
1381 | ||
1382 | vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data | |
1383 | vpunpckldq $xa3,$xa2,$xt3 | |
1384 | vpunpckhdq $xa1,$xa0,$xa0 | |
1385 | vpunpckhdq $xa3,$xa2,$xa2 | |
1386 | vpunpcklqdq $xt3,$xt2,$xa1 # "a0" | |
1387 | vpunpckhqdq $xt3,$xt2,$xt2 # "a1" | |
1388 | vpunpcklqdq $xa2,$xa0,$xa3 # "a2" | |
1389 | vpunpckhqdq $xa2,$xa0,$xa0 # "a3" | |
1390 | ___ | |
1391 | ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); | |
1392 | $code.=<<___; | |
1393 | vpaddd 0x80-0x100(%rcx),$xb0,$xb0 | |
1394 | vpaddd 0x90-0x100(%rcx),$xb1,$xb1 | |
1395 | vpaddd 0xa0-0x100(%rcx),$xb2,$xb2 | |
1396 | vpaddd 0xb0-0x100(%rcx),$xb3,$xb3 | |
1397 | ||
1398 | vmovdqa $xa0,0x00(%rsp) # offload $xa0,1 | |
1399 | vmovdqa $xa1,0x10(%rsp) | |
1400 | vmovdqa 0x20(%rsp),$xa0 # "xc2" | |
1401 | vmovdqa 0x30(%rsp),$xa1 # "xc3" | |
1402 | ||
1403 | vpunpckldq $xb1,$xb0,$xt2 | |
1404 | vpunpckldq $xb3,$xb2,$xt3 | |
1405 | vpunpckhdq $xb1,$xb0,$xb0 | |
1406 | vpunpckhdq $xb3,$xb2,$xb2 | |
1407 | vpunpcklqdq $xt3,$xt2,$xb1 # "b0" | |
1408 | vpunpckhqdq $xt3,$xt2,$xt2 # "b1" | |
1409 | vpunpcklqdq $xb2,$xb0,$xb3 # "b2" | |
1410 | vpunpckhqdq $xb2,$xb0,$xb0 # "b3" | |
1411 | ___ | |
1412 | ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); | |
1413 | my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1); | |
1414 | $code.=<<___; | |
1415 | vpaddd 0xc0-0x100(%rcx),$xc0,$xc0 | |
1416 | vpaddd 0xd0-0x100(%rcx),$xc1,$xc1 | |
1417 | vpaddd 0xe0-0x100(%rcx),$xc2,$xc2 | |
1418 | vpaddd 0xf0-0x100(%rcx),$xc3,$xc3 | |
1419 | ||
1420 | vpunpckldq $xc1,$xc0,$xt2 | |
1421 | vpunpckldq $xc3,$xc2,$xt3 | |
1422 | vpunpckhdq $xc1,$xc0,$xc0 | |
1423 | vpunpckhdq $xc3,$xc2,$xc2 | |
1424 | vpunpcklqdq $xt3,$xt2,$xc1 # "c0" | |
1425 | vpunpckhqdq $xt3,$xt2,$xt2 # "c1" | |
1426 | vpunpcklqdq $xc2,$xc0,$xc3 # "c2" | |
1427 | vpunpckhqdq $xc2,$xc0,$xc0 # "c3" | |
1428 | ___ | |
1429 | ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); | |
1430 | $code.=<<___; | |
1431 | vpaddd 0x100-0x100(%rcx),$xd0,$xd0 | |
1432 | vpaddd 0x110-0x100(%rcx),$xd1,$xd1 | |
1433 | vpaddd 0x120-0x100(%rcx),$xd2,$xd2 | |
1434 | vpaddd 0x130-0x100(%rcx),$xd3,$xd3 | |
1435 | ||
1436 | vpunpckldq $xd1,$xd0,$xt2 | |
1437 | vpunpckldq $xd3,$xd2,$xt3 | |
1438 | vpunpckhdq $xd1,$xd0,$xd0 | |
1439 | vpunpckhdq $xd3,$xd2,$xd2 | |
1440 | vpunpcklqdq $xt3,$xt2,$xd1 # "d0" | |
1441 | vpunpckhqdq $xt3,$xt2,$xt2 # "d1" | |
1442 | vpunpcklqdq $xd2,$xd0,$xd3 # "d2" | |
1443 | vpunpckhqdq $xd2,$xd0,$xd0 # "d3" | |
1444 | ___ | |
1445 | ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); | |
1446 | ($xa0,$xa1)=($xt2,$xt3); | |
1447 | $code.=<<___; | |
1448 | vmovdqa 0x00(%rsp),$xa0 # restore $xa0,1 | |
1449 | vmovdqa 0x10(%rsp),$xa1 | |
1450 | ||
1451 | cmp \$64*4,$len | |
1452 | jb .Ltail4xop | |
1453 | ||
1454 | vpxor 0x00($inp),$xa0,$xa0 # xor with input | |
1455 | vpxor 0x10($inp),$xb0,$xb0 | |
1456 | vpxor 0x20($inp),$xc0,$xc0 | |
1457 | vpxor 0x30($inp),$xd0,$xd0 | |
1458 | vpxor 0x40($inp),$xa1,$xa1 | |
1459 | vpxor 0x50($inp),$xb1,$xb1 | |
1460 | vpxor 0x60($inp),$xc1,$xc1 | |
1461 | vpxor 0x70($inp),$xd1,$xd1 | |
1462 | lea 0x80($inp),$inp # size optimization | |
1463 | vpxor 0x00($inp),$xa2,$xa2 | |
1464 | vpxor 0x10($inp),$xb2,$xb2 | |
1465 | vpxor 0x20($inp),$xc2,$xc2 | |
1466 | vpxor 0x30($inp),$xd2,$xd2 | |
1467 | vpxor 0x40($inp),$xa3,$xa3 | |
1468 | vpxor 0x50($inp),$xb3,$xb3 | |
1469 | vpxor 0x60($inp),$xc3,$xc3 | |
1470 | vpxor 0x70($inp),$xd3,$xd3 | |
1471 | lea 0x80($inp),$inp # inp+=64*4 | |
1472 | ||
1473 | vmovdqu $xa0,0x00($out) | |
1474 | vmovdqu $xb0,0x10($out) | |
1475 | vmovdqu $xc0,0x20($out) | |
1476 | vmovdqu $xd0,0x30($out) | |
1477 | vmovdqu $xa1,0x40($out) | |
1478 | vmovdqu $xb1,0x50($out) | |
1479 | vmovdqu $xc1,0x60($out) | |
1480 | vmovdqu $xd1,0x70($out) | |
1481 | lea 0x80($out),$out # size optimization | |
1482 | vmovdqu $xa2,0x00($out) | |
1483 | vmovdqu $xb2,0x10($out) | |
1484 | vmovdqu $xc2,0x20($out) | |
1485 | vmovdqu $xd2,0x30($out) | |
1486 | vmovdqu $xa3,0x40($out) | |
1487 | vmovdqu $xb3,0x50($out) | |
1488 | vmovdqu $xc3,0x60($out) | |
1489 | vmovdqu $xd3,0x70($out) | |
1490 | lea 0x80($out),$out # out+=64*4 | |
1491 | ||
1492 | sub \$64*4,$len | |
1493 | jnz .Loop_outer4xop | |
1494 | ||
1495 | jmp .Ldone4xop | |
1496 | ||
1497 | .align 32 | |
1498 | .Ltail4xop: | |
1499 | cmp \$192,$len | |
1500 | jae .L192_or_more4xop | |
1501 | cmp \$128,$len | |
1502 | jae .L128_or_more4xop | |
1503 | cmp \$64,$len | |
1504 | jae .L64_or_more4xop | |
1505 | ||
1506 | xor %r10,%r10 | |
1507 | vmovdqa $xa0,0x00(%rsp) | |
1508 | vmovdqa $xb0,0x10(%rsp) | |
1509 | vmovdqa $xc0,0x20(%rsp) | |
1510 | vmovdqa $xd0,0x30(%rsp) | |
1511 | jmp .Loop_tail4xop | |
1512 | ||
1513 | .align 32 | |
1514 | .L64_or_more4xop: | |
1515 | vpxor 0x00($inp),$xa0,$xa0 # xor with input | |
1516 | vpxor 0x10($inp),$xb0,$xb0 | |
1517 | vpxor 0x20($inp),$xc0,$xc0 | |
1518 | vpxor 0x30($inp),$xd0,$xd0 | |
1519 | vmovdqu $xa0,0x00($out) | |
1520 | vmovdqu $xb0,0x10($out) | |
1521 | vmovdqu $xc0,0x20($out) | |
1522 | vmovdqu $xd0,0x30($out) | |
1523 | je .Ldone4xop | |
1524 | ||
1525 | lea 0x40($inp),$inp # inp+=64*1 | |
1526 | vmovdqa $xa1,0x00(%rsp) | |
1527 | xor %r10,%r10 | |
1528 | vmovdqa $xb1,0x10(%rsp) | |
1529 | lea 0x40($out),$out # out+=64*1 | |
1530 | vmovdqa $xc1,0x20(%rsp) | |
1531 | sub \$64,$len # len-=64*1 | |
1532 | vmovdqa $xd1,0x30(%rsp) | |
1533 | jmp .Loop_tail4xop | |
1534 | ||
1535 | .align 32 | |
1536 | .L128_or_more4xop: | |
1537 | vpxor 0x00($inp),$xa0,$xa0 # xor with input | |
1538 | vpxor 0x10($inp),$xb0,$xb0 | |
1539 | vpxor 0x20($inp),$xc0,$xc0 | |
1540 | vpxor 0x30($inp),$xd0,$xd0 | |
1541 | vpxor 0x40($inp),$xa1,$xa1 | |
1542 | vpxor 0x50($inp),$xb1,$xb1 | |
1543 | vpxor 0x60($inp),$xc1,$xc1 | |
1544 | vpxor 0x70($inp),$xd1,$xd1 | |
1545 | ||
1546 | vmovdqu $xa0,0x00($out) | |
1547 | vmovdqu $xb0,0x10($out) | |
1548 | vmovdqu $xc0,0x20($out) | |
1549 | vmovdqu $xd0,0x30($out) | |
1550 | vmovdqu $xa1,0x40($out) | |
1551 | vmovdqu $xb1,0x50($out) | |
1552 | vmovdqu $xc1,0x60($out) | |
1553 | vmovdqu $xd1,0x70($out) | |
1554 | je .Ldone4xop | |
1555 | ||
1556 | lea 0x80($inp),$inp # inp+=64*2 | |
1557 | vmovdqa $xa2,0x00(%rsp) | |
1558 | xor %r10,%r10 | |
1559 | vmovdqa $xb2,0x10(%rsp) | |
1560 | lea 0x80($out),$out # out+=64*2 | |
1561 | vmovdqa $xc2,0x20(%rsp) | |
1562 | sub \$128,$len # len-=64*2 | |
1563 | vmovdqa $xd2,0x30(%rsp) | |
1564 | jmp .Loop_tail4xop | |
1565 | ||
1566 | .align 32 | |
1567 | .L192_or_more4xop: | |
1568 | vpxor 0x00($inp),$xa0,$xa0 # xor with input | |
1569 | vpxor 0x10($inp),$xb0,$xb0 | |
1570 | vpxor 0x20($inp),$xc0,$xc0 | |
1571 | vpxor 0x30($inp),$xd0,$xd0 | |
1572 | vpxor 0x40($inp),$xa1,$xa1 | |
1573 | vpxor 0x50($inp),$xb1,$xb1 | |
1574 | vpxor 0x60($inp),$xc1,$xc1 | |
1575 | vpxor 0x70($inp),$xd1,$xd1 | |
1576 | lea 0x80($inp),$inp # size optimization | |
1577 | vpxor 0x00($inp),$xa2,$xa2 | |
1578 | vpxor 0x10($inp),$xb2,$xb2 | |
1579 | vpxor 0x20($inp),$xc2,$xc2 | |
1580 | vpxor 0x30($inp),$xd2,$xd2 | |
1581 | ||
1582 | vmovdqu $xa0,0x00($out) | |
1583 | vmovdqu $xb0,0x10($out) | |
1584 | vmovdqu $xc0,0x20($out) | |
1585 | vmovdqu $xd0,0x30($out) | |
1586 | vmovdqu $xa1,0x40($out) | |
1587 | vmovdqu $xb1,0x50($out) | |
1588 | vmovdqu $xc1,0x60($out) | |
1589 | vmovdqu $xd1,0x70($out) | |
1590 | lea 0x80($out),$out # size optimization | |
1591 | vmovdqu $xa2,0x00($out) | |
1592 | vmovdqu $xb2,0x10($out) | |
1593 | vmovdqu $xc2,0x20($out) | |
1594 | vmovdqu $xd2,0x30($out) | |
1595 | je .Ldone4xop | |
1596 | ||
1597 | lea 0x40($inp),$inp # inp+=64*3 | |
f2188228 | 1598 | vmovdqa $xa3,0x00(%rsp) |
a98c648e | 1599 | xor %r10,%r10 |
f2188228 | 1600 | vmovdqa $xb3,0x10(%rsp) |
a98c648e | 1601 | lea 0x40($out),$out # out+=64*3 |
f2188228 | 1602 | vmovdqa $xc3,0x20(%rsp) |
a98c648e | 1603 | sub \$192,$len # len-=64*3 |
f2188228 | 1604 | vmovdqa $xd3,0x30(%rsp) |
a98c648e AP |
1605 | |
1606 | .Loop_tail4xop: | |
1607 | movzb ($inp,%r10),%eax | |
1608 | movzb (%rsp,%r10),%ecx | |
1609 | lea 1(%r10),%r10 | |
1610 | xor %ecx,%eax | |
1611 | mov %al,-1($out,%r10) | |
1612 | dec $len | |
1613 | jnz .Loop_tail4xop | |
1614 | ||
1615 | .Ldone4xop: | |
1616 | vzeroupper | |
1617 | ___ | |
1618 | $code.=<<___ if ($win64); | |
384e6de4 AP |
1619 | movaps -0xa8(%r9),%xmm6 |
1620 | movaps -0x98(%r9),%xmm7 | |
1621 | movaps -0x88(%r9),%xmm8 | |
1622 | movaps -0x78(%r9),%xmm9 | |
1623 | movaps -0x68(%r9),%xmm10 | |
1624 | movaps -0x58(%r9),%xmm11 | |
1625 | movaps -0x48(%r9),%xmm12 | |
1626 | movaps -0x38(%r9),%xmm13 | |
1627 | movaps -0x28(%r9),%xmm14 | |
1628 | movaps -0x18(%r9),%xmm15 | |
a98c648e AP |
1629 | ___ |
1630 | $code.=<<___; | |
384e6de4 | 1631 | lea (%r9),%rsp |
f17652e5 | 1632 | .cfi_def_cfa_register %rsp |
384e6de4 | 1633 | .L4xop_epilogue: |
a98c648e | 1634 | ret |
f17652e5 | 1635 | .cfi_endproc |
a98c648e AP |
1636 | .size ChaCha20_4xop,.-ChaCha20_4xop |
1637 | ___ | |
1638 | } | |
1639 | ||
1640 | ######################################################################## | |
1641 | # AVX2 code path | |
1642 | if ($avx>1) { | |
1643 | my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3, | |
1644 | $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%ymm$_",(0..15)); | |
1645 | my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, | |
1646 | "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3); | |
1647 | ||
# Emit one ChaCha20 "double-round" for the AVX2 code path: four
# quarter-rounds (Q1..Q4) interleaved in pairs.  Arguments are the @xx
# indices of the first quarter-round's (a,b,c,d); the other lanes are
# derived by rotating the lane index within each group of four.
# Returns a list of strings which the caller evals to emit code.
#
# Unlike the XOP path there is no single-instruction rotate:
#  - 16- and 8-bit rotates are done with vpshufb against the .Lrot16 /
#    .Lrot24 shuffle masks kept in $t1 / $t0 and periodically reloaded
#    from (%r10) / (%r11) via vbroadcasti128;
#  - 12- and 7-bit rotates use vpslld/vpsrld/vpor pairs with $t0/$t1
#    as scratch.
sub AVX2_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
# ($_&~3)+(($_+1)&3): keep the group-of-four base, advance lane mod 4.
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
# $xc/$xc_ hold the two currently-live 'c' rows; $t0/$t1 are scratch.
my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
my @x=map("\"$_\"",@xx);

	# Consider order in which variables are addressed by their
	# index:
	#
	#	a   b   c   d
	#
	#	0   4   8  12 < even round
	#	1   5   9  13
	#	2   6  10  14
	#	3   7  11  15
	#	0   5  10  15 < odd round
	#	1   6  11  12
	#	2   7   8  13
	#	3   4   9  14
	#
	# 'a', 'b' and 'd's are permanently allocated in registers,
	# @x[0..7,12..15], while 'c's are maintained in memory. If
	# you observe 'c' column, you'll notice that pair of 'c's is
	# invariant between rounds. This means that we have to reload
	# them once per round, in the middle. This is why you'll see
	# bunch of 'c' stores and loads in the middle, but none in
	# the beginning or end.

	(
	"&vpaddd	(@x[$a0],@x[$a0],@x[$b0])",	# Q1
	"&vpxor		(@x[$d0],@x[$a0],@x[$d0])",
	"&vpshufb	(@x[$d0],@x[$d0],$t1)",		# d = (d^a)>>>16
	"&vpaddd	(@x[$a1],@x[$a1],@x[$b1])",	# Q2
	"&vpxor		(@x[$d1],@x[$a1],@x[$d1])",
	"&vpshufb	(@x[$d1],@x[$d1],$t1)",

	"&vpaddd	($xc,$xc,@x[$d0])",
	"&vpxor		(@x[$b0],$xc,@x[$b0])",
	"&vpslld	($t0,@x[$b0],12)",		# b = (b^c)>>>20,
	"&vpsrld	(@x[$b0],@x[$b0],20)",		# via shift/shift/or
	"&vpor		(@x[$b0],$t0,@x[$b0])",
	"&vbroadcasti128($t0,'(%r11)')",		# .Lrot24(%rip)
	"&vpaddd	($xc_,$xc_,@x[$d1])",
	"&vpxor		(@x[$b1],$xc_,@x[$b1])",
	"&vpslld	($t1,@x[$b1],12)",
	"&vpsrld	(@x[$b1],@x[$b1],20)",
	"&vpor		(@x[$b1],$t1,@x[$b1])",

	"&vpaddd	(@x[$a0],@x[$a0],@x[$b0])",
	"&vpxor		(@x[$d0],@x[$a0],@x[$d0])",
	"&vpshufb	(@x[$d0],@x[$d0],$t0)",		# d = (d^a)>>>24
	"&vpaddd	(@x[$a1],@x[$a1],@x[$b1])",
	"&vpxor		(@x[$d1],@x[$a1],@x[$d1])",
	"&vpshufb	(@x[$d1],@x[$d1],$t0)",

	"&vpaddd	($xc,$xc,@x[$d0])",
	"&vpxor		(@x[$b0],$xc,@x[$b0])",
	"&vpslld	($t1,@x[$b0],7)",		# b = (b^c)>>>25
	"&vpsrld	(@x[$b0],@x[$b0],25)",
	"&vpor		(@x[$b0],$t1,@x[$b0])",
	"&vbroadcasti128($t1,'(%r10)')",		# .Lrot16(%rip)
	"&vpaddd	($xc_,$xc_,@x[$d1])",
	"&vpxor		(@x[$b1],$xc_,@x[$b1])",
	"&vpslld	($t0,@x[$b1],7)",
	"&vpsrld	(@x[$b1],@x[$b1],25)",
	"&vpor		(@x[$b1],$t0,@x[$b1])",

	"&vmovdqa	(\"`32*($c0-8)`(%rsp)\",$xc)",	# reload pair of 'c's
	"&vmovdqa	(\"`32*($c1-8)`(%rsp)\",$xc_)",
	"&vmovdqa	($xc,\"`32*($c2-8)`(%rsp)\")",
	"&vmovdqa	($xc_,\"`32*($c3-8)`(%rsp)\")",

	"&vpaddd	(@x[$a2],@x[$a2],@x[$b2])",	# Q3
	"&vpxor		(@x[$d2],@x[$a2],@x[$d2])",
	"&vpshufb	(@x[$d2],@x[$d2],$t1)",
	"&vpaddd	(@x[$a3],@x[$a3],@x[$b3])",	# Q4
	"&vpxor		(@x[$d3],@x[$a3],@x[$d3])",
	"&vpshufb	(@x[$d3],@x[$d3],$t1)",

	"&vpaddd	($xc,$xc,@x[$d2])",
	"&vpxor		(@x[$b2],$xc,@x[$b2])",
	"&vpslld	($t0,@x[$b2],12)",
	"&vpsrld	(@x[$b2],@x[$b2],20)",
	"&vpor		(@x[$b2],$t0,@x[$b2])",
	"&vbroadcasti128($t0,'(%r11)')",		# .Lrot24(%rip)
	"&vpaddd	($xc_,$xc_,@x[$d3])",
	"&vpxor		(@x[$b3],$xc_,@x[$b3])",
	"&vpslld	($t1,@x[$b3],12)",
	"&vpsrld	(@x[$b3],@x[$b3],20)",
	"&vpor		(@x[$b3],$t1,@x[$b3])",

	"&vpaddd	(@x[$a2],@x[$a2],@x[$b2])",
	"&vpxor		(@x[$d2],@x[$a2],@x[$d2])",
	"&vpshufb	(@x[$d2],@x[$d2],$t0)",
	"&vpaddd	(@x[$a3],@x[$a3],@x[$b3])",
	"&vpxor		(@x[$d3],@x[$a3],@x[$d3])",
	"&vpshufb	(@x[$d3],@x[$d3],$t0)",

	"&vpaddd	($xc,$xc,@x[$d2])",
	"&vpxor		(@x[$b2],$xc,@x[$b2])",
	"&vpslld	($t1,@x[$b2],7)",
	"&vpsrld	(@x[$b2],@x[$b2],25)",
	"&vpor		(@x[$b2],$t1,@x[$b2])",
	"&vbroadcasti128($t1,'(%r10)')",		# .Lrot16(%rip)
	"&vpaddd	($xc_,$xc_,@x[$d3])",
	"&vpxor		(@x[$b3],$xc_,@x[$b3])",
	"&vpslld	($t0,@x[$b3],7)",
	"&vpsrld	(@x[$b3],@x[$b3],25)",
	"&vpor		(@x[$b3],$t0,@x[$b3])"
	);
}
1761 | ||
384e6de4 | 1762 | my $xframe = $win64 ? 0xa8 : 8; |
a98c648e AP |
1763 | |
1764 | $code.=<<___; | |
1765 | .type ChaCha20_8x,\@function,5 | |
1766 | .align 32 | |
1767 | ChaCha20_8x: | |
f17652e5 | 1768 | .cfi_startproc |
a98c648e | 1769 | .LChaCha20_8x: |
384e6de4 | 1770 | mov %rsp,%r9 # frame register |
f17652e5 | 1771 | .cfi_def_cfa_register %r9 |
a98c648e AP |
1772 | sub \$0x280+$xframe,%rsp |
1773 | and \$-32,%rsp | |
1774 | ___ | |
1775 | $code.=<<___ if ($win64); | |
384e6de4 AP |
1776 | movaps %xmm6,-0xa8(%r9) |
1777 | movaps %xmm7,-0x98(%r9) | |
1778 | movaps %xmm8,-0x88(%r9) | |
1779 | movaps %xmm9,-0x78(%r9) | |
1780 | movaps %xmm10,-0x68(%r9) | |
1781 | movaps %xmm11,-0x58(%r9) | |
1782 | movaps %xmm12,-0x48(%r9) | |
1783 | movaps %xmm13,-0x38(%r9) | |
1784 | movaps %xmm14,-0x28(%r9) | |
1785 | movaps %xmm15,-0x18(%r9) | |
1786 | .L8x_body: | |
a98c648e AP |
1787 | ___ |
1788 | $code.=<<___; | |
1789 | vzeroupper | |
a98c648e AP |
1790 | |
1791 | ################ stack layout | |
1792 | # +0x00 SIMD equivalent of @x[8-12] | |
1793 | # ... | |
1794 | # +0x80 constant copy of key[0-2] smashed by lanes | |
1795 | # ... | |
1796 | # +0x200 SIMD counters (with nonce smashed by lanes) | |
1797 | # ... | |
384e6de4 | 1798 | # +0x280 |
a98c648e AP |
1799 | |
1800 | vbroadcasti128 .Lsigma(%rip),$xa3 # key[0] | |
1801 | vbroadcasti128 ($key),$xb3 # key[1] | |
1802 | vbroadcasti128 16($key),$xt3 # key[2] | |
1803 | vbroadcasti128 ($counter),$xd3 # key[3] | |
1804 | lea 0x100(%rsp),%rcx # size optimization | |
1805 | lea 0x200(%rsp),%rax # size optimization | |
1806 | lea .Lrot16(%rip),%r10 | |
1807 | lea .Lrot24(%rip),%r11 | |
1808 | ||
1809 | vpshufd \$0x00,$xa3,$xa0 # smash key by lanes... | |
1810 | vpshufd \$0x55,$xa3,$xa1 | |
1811 | vmovdqa $xa0,0x80-0x100(%rcx) # ... and offload | |
1812 | vpshufd \$0xaa,$xa3,$xa2 | |
1813 | vmovdqa $xa1,0xa0-0x100(%rcx) | |
1814 | vpshufd \$0xff,$xa3,$xa3 | |
1815 | vmovdqa $xa2,0xc0-0x100(%rcx) | |
1816 | vmovdqa $xa3,0xe0-0x100(%rcx) | |
1817 | ||
1818 | vpshufd \$0x00,$xb3,$xb0 | |
1819 | vpshufd \$0x55,$xb3,$xb1 | |
1820 | vmovdqa $xb0,0x100-0x100(%rcx) | |
1821 | vpshufd \$0xaa,$xb3,$xb2 | |
1822 | vmovdqa $xb1,0x120-0x100(%rcx) | |
1823 | vpshufd \$0xff,$xb3,$xb3 | |
1824 | vmovdqa $xb2,0x140-0x100(%rcx) | |
1825 | vmovdqa $xb3,0x160-0x100(%rcx) | |
1826 | ||
1827 | vpshufd \$0x00,$xt3,$xt0 # "xc0" | |
1828 | vpshufd \$0x55,$xt3,$xt1 # "xc1" | |
1829 | vmovdqa $xt0,0x180-0x200(%rax) | |
1830 | vpshufd \$0xaa,$xt3,$xt2 # "xc2" | |
1831 | vmovdqa $xt1,0x1a0-0x200(%rax) | |
1832 | vpshufd \$0xff,$xt3,$xt3 # "xc3" | |
1833 | vmovdqa $xt2,0x1c0-0x200(%rax) | |
1834 | vmovdqa $xt3,0x1e0-0x200(%rax) | |
1835 | ||
1836 | vpshufd \$0x00,$xd3,$xd0 | |
1837 | vpshufd \$0x55,$xd3,$xd1 | |
1838 | vpaddd .Lincy(%rip),$xd0,$xd0 # don't save counters yet | |
1839 | vpshufd \$0xaa,$xd3,$xd2 | |
1840 | vmovdqa $xd1,0x220-0x200(%rax) | |
1841 | vpshufd \$0xff,$xd3,$xd3 | |
1842 | vmovdqa $xd2,0x240-0x200(%rax) | |
1843 | vmovdqa $xd3,0x260-0x200(%rax) | |
1844 | ||
1845 | jmp .Loop_enter8x | |
1846 | ||
1847 | .align 32 | |
1848 | .Loop_outer8x: | |
1849 | vmovdqa 0x80-0x100(%rcx),$xa0 # re-load smashed key | |
1850 | vmovdqa 0xa0-0x100(%rcx),$xa1 | |
1851 | vmovdqa 0xc0-0x100(%rcx),$xa2 | |
1852 | vmovdqa 0xe0-0x100(%rcx),$xa3 | |
1853 | vmovdqa 0x100-0x100(%rcx),$xb0 | |
1854 | vmovdqa 0x120-0x100(%rcx),$xb1 | |
1855 | vmovdqa 0x140-0x100(%rcx),$xb2 | |
1856 | vmovdqa 0x160-0x100(%rcx),$xb3 | |
1857 | vmovdqa 0x180-0x200(%rax),$xt0 # "xc0" | |
1858 | vmovdqa 0x1a0-0x200(%rax),$xt1 # "xc1" | |
1859 | vmovdqa 0x1c0-0x200(%rax),$xt2 # "xc2" | |
1860 | vmovdqa 0x1e0-0x200(%rax),$xt3 # "xc3" | |
1861 | vmovdqa 0x200-0x200(%rax),$xd0 | |
1862 | vmovdqa 0x220-0x200(%rax),$xd1 | |
1863 | vmovdqa 0x240-0x200(%rax),$xd2 | |
1864 | vmovdqa 0x260-0x200(%rax),$xd3 | |
1865 | vpaddd .Leight(%rip),$xd0,$xd0 # next SIMD counters | |
1866 | ||
1867 | .Loop_enter8x: | |
1868 | vmovdqa $xt2,0x40(%rsp) # SIMD equivalent of "@x[10]" | |
1869 | vmovdqa $xt3,0x60(%rsp) # SIMD equivalent of "@x[11]" | |
1870 | vbroadcasti128 (%r10),$xt3 | |
1871 | vmovdqa $xd0,0x200-0x200(%rax) # save SIMD counters | |
1872 | mov \$10,%eax | |
1873 | jmp .Loop8x | |
1874 | ||
1875 | .align 32 | |
1876 | .Loop8x: | |
1877 | ___ | |
1878 | foreach (&AVX2_lane_ROUND(0, 4, 8,12)) { eval; } | |
1879 | foreach (&AVX2_lane_ROUND(0, 5,10,15)) { eval; } | |
1880 | $code.=<<___; | |
1881 | dec %eax | |
1882 | jnz .Loop8x | |
1883 | ||
1884 | lea 0x200(%rsp),%rax # size optimization | |
1885 | vpaddd 0x80-0x100(%rcx),$xa0,$xa0 # accumulate key | |
1886 | vpaddd 0xa0-0x100(%rcx),$xa1,$xa1 | |
1887 | vpaddd 0xc0-0x100(%rcx),$xa2,$xa2 | |
1888 | vpaddd 0xe0-0x100(%rcx),$xa3,$xa3 | |
1889 | ||
1890 | vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data | |
1891 | vpunpckldq $xa3,$xa2,$xt3 | |
1892 | vpunpckhdq $xa1,$xa0,$xa0 | |
1893 | vpunpckhdq $xa3,$xa2,$xa2 | |
1894 | vpunpcklqdq $xt3,$xt2,$xa1 # "a0" | |
1895 | vpunpckhqdq $xt3,$xt2,$xt2 # "a1" | |
1896 | vpunpcklqdq $xa2,$xa0,$xa3 # "a2" | |
1897 | vpunpckhqdq $xa2,$xa0,$xa0 # "a3" | |
1898 | ___ | |
1899 | ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); | |
1900 | $code.=<<___; | |
1901 | vpaddd 0x100-0x100(%rcx),$xb0,$xb0 | |
1902 | vpaddd 0x120-0x100(%rcx),$xb1,$xb1 | |
1903 | vpaddd 0x140-0x100(%rcx),$xb2,$xb2 | |
1904 | vpaddd 0x160-0x100(%rcx),$xb3,$xb3 | |
1905 | ||
1906 | vpunpckldq $xb1,$xb0,$xt2 | |
1907 | vpunpckldq $xb3,$xb2,$xt3 | |
1908 | vpunpckhdq $xb1,$xb0,$xb0 | |
1909 | vpunpckhdq $xb3,$xb2,$xb2 | |
1910 | vpunpcklqdq $xt3,$xt2,$xb1 # "b0" | |
1911 | vpunpckhqdq $xt3,$xt2,$xt2 # "b1" | |
1912 | vpunpcklqdq $xb2,$xb0,$xb3 # "b2" | |
1913 | vpunpckhqdq $xb2,$xb0,$xb0 # "b3" | |
1914 | ___ | |
1915 | ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); | |
1916 | $code.=<<___; | |
1917 | vperm2i128 \$0x20,$xb0,$xa0,$xt3 # "de-interlace" further | |
1918 | vperm2i128 \$0x31,$xb0,$xa0,$xb0 | |
1919 | vperm2i128 \$0x20,$xb1,$xa1,$xa0 | |
1920 | vperm2i128 \$0x31,$xb1,$xa1,$xb1 | |
1921 | vperm2i128 \$0x20,$xb2,$xa2,$xa1 | |
1922 | vperm2i128 \$0x31,$xb2,$xa2,$xb2 | |
1923 | vperm2i128 \$0x20,$xb3,$xa3,$xa2 | |
1924 | vperm2i128 \$0x31,$xb3,$xa3,$xb3 | |
1925 | ___ | |
1926 | ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3); | |
1927 | my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1); | |
1928 | $code.=<<___; | |
1929 | vmovdqa $xa0,0x00(%rsp) # offload $xaN | |
1930 | vmovdqa $xa1,0x20(%rsp) | |
1931 | vmovdqa 0x40(%rsp),$xc2 # $xa0 | |
1932 | vmovdqa 0x60(%rsp),$xc3 # $xa1 | |
1933 | ||
1934 | vpaddd 0x180-0x200(%rax),$xc0,$xc0 | |
1935 | vpaddd 0x1a0-0x200(%rax),$xc1,$xc1 | |
1936 | vpaddd 0x1c0-0x200(%rax),$xc2,$xc2 | |
1937 | vpaddd 0x1e0-0x200(%rax),$xc3,$xc3 | |
1938 | ||
1939 | vpunpckldq $xc1,$xc0,$xt2 | |
1940 | vpunpckldq $xc3,$xc2,$xt3 | |
1941 | vpunpckhdq $xc1,$xc0,$xc0 | |
1942 | vpunpckhdq $xc3,$xc2,$xc2 | |
1943 | vpunpcklqdq $xt3,$xt2,$xc1 # "c0" | |
1944 | vpunpckhqdq $xt3,$xt2,$xt2 # "c1" | |
1945 | vpunpcklqdq $xc2,$xc0,$xc3 # "c2" | |
1946 | vpunpckhqdq $xc2,$xc0,$xc0 # "c3" | |
1947 | ___ | |
1948 | ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); | |
1949 | $code.=<<___; | |
1950 | vpaddd 0x200-0x200(%rax),$xd0,$xd0 | |
1951 | vpaddd 0x220-0x200(%rax),$xd1,$xd1 | |
1952 | vpaddd 0x240-0x200(%rax),$xd2,$xd2 | |
1953 | vpaddd 0x260-0x200(%rax),$xd3,$xd3 | |
1954 | ||
1955 | vpunpckldq $xd1,$xd0,$xt2 | |
1956 | vpunpckldq $xd3,$xd2,$xt3 | |
1957 | vpunpckhdq $xd1,$xd0,$xd0 | |
1958 | vpunpckhdq $xd3,$xd2,$xd2 | |
1959 | vpunpcklqdq $xt3,$xt2,$xd1 # "d0" | |
1960 | vpunpckhqdq $xt3,$xt2,$xt2 # "d1" | |
1961 | vpunpcklqdq $xd2,$xd0,$xd3 # "d2" | |
1962 | vpunpckhqdq $xd2,$xd0,$xd0 # "d3" | |
1963 | ___ | |
1964 | ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); | |
1965 | $code.=<<___; | |
1966 | vperm2i128 \$0x20,$xd0,$xc0,$xt3 # "de-interlace" further | |
1967 | vperm2i128 \$0x31,$xd0,$xc0,$xd0 | |
1968 | vperm2i128 \$0x20,$xd1,$xc1,$xc0 | |
1969 | vperm2i128 \$0x31,$xd1,$xc1,$xd1 | |
1970 | vperm2i128 \$0x20,$xd2,$xc2,$xc1 | |
1971 | vperm2i128 \$0x31,$xd2,$xc2,$xd2 | |
1972 | vperm2i128 \$0x20,$xd3,$xc3,$xc2 | |
1973 | vperm2i128 \$0x31,$xd3,$xc3,$xd3 | |
1974 | ___ | |
1975 | ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3); | |
1976 | ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)= | |
1977 | ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3); | |
1978 | ($xa0,$xa1)=($xt2,$xt3); | |
1979 | $code.=<<___; | |
1980 | vmovdqa 0x00(%rsp),$xa0 # $xaN was offloaded, remember? | |
1981 | vmovdqa 0x20(%rsp),$xa1 | |
1982 | ||
1983 | cmp \$64*8,$len | |
1984 | jb .Ltail8x | |
1985 | ||
1986 | vpxor 0x00($inp),$xa0,$xa0 # xor with input | |
1987 | vpxor 0x20($inp),$xb0,$xb0 | |
1988 | vpxor 0x40($inp),$xc0,$xc0 | |
1989 | vpxor 0x60($inp),$xd0,$xd0 | |
1990 | lea 0x80($inp),$inp # size optimization | |
1991 | vmovdqu $xa0,0x00($out) | |
1992 | vmovdqu $xb0,0x20($out) | |
1993 | vmovdqu $xc0,0x40($out) | |
1994 | vmovdqu $xd0,0x60($out) | |
1995 | lea 0x80($out),$out # size optimization | |
1996 | ||
1997 | vpxor 0x00($inp),$xa1,$xa1 | |
1998 | vpxor 0x20($inp),$xb1,$xb1 | |
1999 | vpxor 0x40($inp),$xc1,$xc1 | |
2000 | vpxor 0x60($inp),$xd1,$xd1 | |
2001 | lea 0x80($inp),$inp # size optimization | |
2002 | vmovdqu $xa1,0x00($out) | |
2003 | vmovdqu $xb1,0x20($out) | |
2004 | vmovdqu $xc1,0x40($out) | |
2005 | vmovdqu $xd1,0x60($out) | |
2006 | lea 0x80($out),$out # size optimization | |
2007 | ||
2008 | vpxor 0x00($inp),$xa2,$xa2 | |
2009 | vpxor 0x20($inp),$xb2,$xb2 | |
2010 | vpxor 0x40($inp),$xc2,$xc2 | |
2011 | vpxor 0x60($inp),$xd2,$xd2 | |
2012 | lea 0x80($inp),$inp # size optimization | |
2013 | vmovdqu $xa2,0x00($out) | |
2014 | vmovdqu $xb2,0x20($out) | |
2015 | vmovdqu $xc2,0x40($out) | |
2016 | vmovdqu $xd2,0x60($out) | |
2017 | lea 0x80($out),$out # size optimization | |
2018 | ||
2019 | vpxor 0x00($inp),$xa3,$xa3 | |
2020 | vpxor 0x20($inp),$xb3,$xb3 | |
2021 | vpxor 0x40($inp),$xc3,$xc3 | |
2022 | vpxor 0x60($inp),$xd3,$xd3 | |
2023 | lea 0x80($inp),$inp # size optimization | |
2024 | vmovdqu $xa3,0x00($out) | |
2025 | vmovdqu $xb3,0x20($out) | |
2026 | vmovdqu $xc3,0x40($out) | |
2027 | vmovdqu $xd3,0x60($out) | |
2028 | lea 0x80($out),$out # size optimization | |
2029 | ||
2030 | sub \$64*8,$len | |
2031 | jnz .Loop_outer8x | |
2032 | ||
2033 | jmp .Ldone8x | |
2034 | ||
2035 | .Ltail8x: | |
2036 | cmp \$448,$len | |
2037 | jae .L448_or_more8x | |
2038 | cmp \$384,$len | |
2039 | jae .L384_or_more8x | |
2040 | cmp \$320,$len | |
2041 | jae .L320_or_more8x | |
2042 | cmp \$256,$len | |
2043 | jae .L256_or_more8x | |
2044 | cmp \$192,$len | |
2045 | jae .L192_or_more8x | |
2046 | cmp \$128,$len | |
2047 | jae .L128_or_more8x | |
2048 | cmp \$64,$len | |
2049 | jae .L64_or_more8x | |
2050 | ||
2051 | xor %r10,%r10 | |
2052 | vmovdqa $xa0,0x00(%rsp) | |
2053 | vmovdqa $xb0,0x20(%rsp) | |
2054 | jmp .Loop_tail8x | |
2055 | ||
2056 | .align 32 | |
2057 | .L64_or_more8x: | |
2058 | vpxor 0x00($inp),$xa0,$xa0 # xor with input | |
2059 | vpxor 0x20($inp),$xb0,$xb0 | |
2060 | vmovdqu $xa0,0x00($out) | |
2061 | vmovdqu $xb0,0x20($out) | |
2062 | je .Ldone8x | |
2063 | ||
2064 | lea 0x40($inp),$inp # inp+=64*1 | |
2065 | xor %r10,%r10 | |
2066 | vmovdqa $xc0,0x00(%rsp) | |
2067 | lea 0x40($out),$out # out+=64*1 | |
2068 | sub \$64,$len # len-=64*1 | |
2069 | vmovdqa $xd0,0x20(%rsp) | |
2070 | jmp .Loop_tail8x | |
2071 | ||
2072 | .align 32 | |
2073 | .L128_or_more8x: | |
2074 | vpxor 0x00($inp),$xa0,$xa0 # xor with input | |
2075 | vpxor 0x20($inp),$xb0,$xb0 | |
2076 | vpxor 0x40($inp),$xc0,$xc0 | |
2077 | vpxor 0x60($inp),$xd0,$xd0 | |
2078 | vmovdqu $xa0,0x00($out) | |
2079 | vmovdqu $xb0,0x20($out) | |
2080 | vmovdqu $xc0,0x40($out) | |
2081 | vmovdqu $xd0,0x60($out) | |
2082 | je .Ldone8x | |
2083 | ||
2084 | lea 0x80($inp),$inp # inp+=64*2 | |
2085 | xor %r10,%r10 | |
2086 | vmovdqa $xa1,0x00(%rsp) | |
2087 | lea 0x80($out),$out # out+=64*2 | |
2088 | sub \$128,$len # len-=64*2 | |
2089 | vmovdqa $xb1,0x20(%rsp) | |
2090 | jmp .Loop_tail8x | |
2091 | ||
2092 | .align 32 | |
2093 | .L192_or_more8x: | |
2094 | vpxor 0x00($inp),$xa0,$xa0 # xor with input | |
2095 | vpxor 0x20($inp),$xb0,$xb0 | |
2096 | vpxor 0x40($inp),$xc0,$xc0 | |
2097 | vpxor 0x60($inp),$xd0,$xd0 | |
2098 | vpxor 0x80($inp),$xa1,$xa1 | |
2099 | vpxor 0xa0($inp),$xb1,$xb1 | |
2100 | vmovdqu $xa0,0x00($out) | |
2101 | vmovdqu $xb0,0x20($out) | |
2102 | vmovdqu $xc0,0x40($out) | |
2103 | vmovdqu $xd0,0x60($out) | |
2104 | vmovdqu $xa1,0x80($out) | |
2105 | vmovdqu $xb1,0xa0($out) | |
2106 | je .Ldone8x | |
2107 | ||
2108 | lea 0xc0($inp),$inp # inp+=64*3 | |
2109 | xor %r10,%r10 | |
2110 | vmovdqa $xc1,0x00(%rsp) | |
2111 | lea 0xc0($out),$out # out+=64*3 | |
2112 | sub \$192,$len # len-=64*3 | |
2113 | vmovdqa $xd1,0x20(%rsp) | |
2114 | jmp .Loop_tail8x | |
2115 | ||
2116 | .align 32 | |
2117 | .L256_or_more8x: | |
2118 | vpxor 0x00($inp),$xa0,$xa0 # xor with input | |
2119 | vpxor 0x20($inp),$xb0,$xb0 | |
2120 | vpxor 0x40($inp),$xc0,$xc0 | |
2121 | vpxor 0x60($inp),$xd0,$xd0 | |
2122 | vpxor 0x80($inp),$xa1,$xa1 | |
2123 | vpxor 0xa0($inp),$xb1,$xb1 | |
2124 | vpxor 0xc0($inp),$xc1,$xc1 | |
2125 | vpxor 0xe0($inp),$xd1,$xd1 | |
2126 | vmovdqu $xa0,0x00($out) | |
2127 | vmovdqu $xb0,0x20($out) | |
2128 | vmovdqu $xc0,0x40($out) | |
2129 | vmovdqu $xd0,0x60($out) | |
2130 | vmovdqu $xa1,0x80($out) | |
2131 | vmovdqu $xb1,0xa0($out) | |
2132 | vmovdqu $xc1,0xc0($out) | |
2133 | vmovdqu $xd1,0xe0($out) | |
2134 | je .Ldone8x | |
2135 | ||
2136 | lea 0x100($inp),$inp # inp+=64*4 | |
2137 | xor %r10,%r10 | |
2138 | vmovdqa $xa2,0x00(%rsp) | |
2139 | lea 0x100($out),$out # out+=64*4 | |
2140 | sub \$256,$len # len-=64*4 | |
2141 | vmovdqa $xb2,0x20(%rsp) | |
2142 | jmp .Loop_tail8x | |
2143 | ||
2144 | .align 32 | |
2145 | .L320_or_more8x: | |
2146 | vpxor 0x00($inp),$xa0,$xa0 # xor with input | |
2147 | vpxor 0x20($inp),$xb0,$xb0 | |
2148 | vpxor 0x40($inp),$xc0,$xc0 | |
2149 | vpxor 0x60($inp),$xd0,$xd0 | |
2150 | vpxor 0x80($inp),$xa1,$xa1 | |
2151 | vpxor 0xa0($inp),$xb1,$xb1 | |
2152 | vpxor 0xc0($inp),$xc1,$xc1 | |
2153 | vpxor 0xe0($inp),$xd1,$xd1 | |
2154 | vpxor 0x100($inp),$xa2,$xa2 | |
2155 | vpxor 0x120($inp),$xb2,$xb2 | |
2156 | vmovdqu $xa0,0x00($out) | |
2157 | vmovdqu $xb0,0x20($out) | |
2158 | vmovdqu $xc0,0x40($out) | |
2159 | vmovdqu $xd0,0x60($out) | |
2160 | vmovdqu $xa1,0x80($out) | |
2161 | vmovdqu $xb1,0xa0($out) | |
2162 | vmovdqu $xc1,0xc0($out) | |
2163 | vmovdqu $xd1,0xe0($out) | |
2164 | vmovdqu $xa2,0x100($out) | |
2165 | vmovdqu $xb2,0x120($out) | |
2166 | je .Ldone8x | |
2167 | ||
2168 | lea 0x140($inp),$inp # inp+=64*5 | |
2169 | xor %r10,%r10 | |
2170 | vmovdqa $xc2,0x00(%rsp) | |
2171 | lea 0x140($out),$out # out+=64*5 | |
2172 | sub \$320,$len # len-=64*5 | |
2173 | vmovdqa $xd2,0x20(%rsp) | |
2174 | jmp .Loop_tail8x | |
2175 | ||
2176 | .align 32 | |
2177 | .L384_or_more8x: | |
2178 | vpxor 0x00($inp),$xa0,$xa0 # xor with input | |
2179 | vpxor 0x20($inp),$xb0,$xb0 | |
2180 | vpxor 0x40($inp),$xc0,$xc0 | |
2181 | vpxor 0x60($inp),$xd0,$xd0 | |
2182 | vpxor 0x80($inp),$xa1,$xa1 | |
2183 | vpxor 0xa0($inp),$xb1,$xb1 | |
2184 | vpxor 0xc0($inp),$xc1,$xc1 | |
2185 | vpxor 0xe0($inp),$xd1,$xd1 | |
2186 | vpxor 0x100($inp),$xa2,$xa2 | |
2187 | vpxor 0x120($inp),$xb2,$xb2 | |
2188 | vpxor 0x140($inp),$xc2,$xc2 | |
2189 | vpxor 0x160($inp),$xd2,$xd2 | |
2190 | vmovdqu $xa0,0x00($out) | |
2191 | vmovdqu $xb0,0x20($out) | |
2192 | vmovdqu $xc0,0x40($out) | |
2193 | vmovdqu $xd0,0x60($out) | |
2194 | vmovdqu $xa1,0x80($out) | |
2195 | vmovdqu $xb1,0xa0($out) | |
2196 | vmovdqu $xc1,0xc0($out) | |
2197 | vmovdqu $xd1,0xe0($out) | |
2198 | vmovdqu $xa2,0x100($out) | |
2199 | vmovdqu $xb2,0x120($out) | |
2200 | vmovdqu $xc2,0x140($out) | |
2201 | vmovdqu $xd2,0x160($out) | |
2202 | je .Ldone8x | |
2203 | ||
2204 | lea 0x180($inp),$inp # inp+=64*6 | |
2205 | xor %r10,%r10 | |
2206 | vmovdqa $xa3,0x00(%rsp) | |
2207 | lea 0x180($out),$out # out+=64*6 | |
2208 | sub \$384,$len # len-=64*6 | |
2209 | vmovdqa $xb3,0x20(%rsp) | |
2210 | jmp .Loop_tail8x | |
2211 | ||
2212 | .align 32 | |
2213 | .L448_or_more8x: | |
2214 | vpxor 0x00($inp),$xa0,$xa0 # xor with input | |
2215 | vpxor 0x20($inp),$xb0,$xb0 | |
2216 | vpxor 0x40($inp),$xc0,$xc0 | |
2217 | vpxor 0x60($inp),$xd0,$xd0 | |
2218 | vpxor 0x80($inp),$xa1,$xa1 | |
2219 | vpxor 0xa0($inp),$xb1,$xb1 | |
2220 | vpxor 0xc0($inp),$xc1,$xc1 | |
2221 | vpxor 0xe0($inp),$xd1,$xd1 | |
2222 | vpxor 0x100($inp),$xa2,$xa2 | |
2223 | vpxor 0x120($inp),$xb2,$xb2 | |
2224 | vpxor 0x140($inp),$xc2,$xc2 | |
2225 | vpxor 0x160($inp),$xd2,$xd2 | |
2226 | vpxor 0x180($inp),$xa3,$xa3 | |
2227 | vpxor 0x1a0($inp),$xb3,$xb3 | |
2228 | vmovdqu $xa0,0x00($out) | |
2229 | vmovdqu $xb0,0x20($out) | |
2230 | vmovdqu $xc0,0x40($out) | |
2231 | vmovdqu $xd0,0x60($out) | |
2232 | vmovdqu $xa1,0x80($out) | |
2233 | vmovdqu $xb1,0xa0($out) | |
2234 | vmovdqu $xc1,0xc0($out) | |
2235 | vmovdqu $xd1,0xe0($out) | |
2236 | vmovdqu $xa2,0x100($out) | |
2237 | vmovdqu $xb2,0x120($out) | |
2238 | vmovdqu $xc2,0x140($out) | |
2239 | vmovdqu $xd2,0x160($out) | |
2240 | vmovdqu $xa3,0x180($out) | |
2241 | vmovdqu $xb3,0x1a0($out) | |
2242 | je .Ldone8x | |
2243 | ||
2244 | lea 0x1c0($inp),$inp # inp+=64*7 | |
2245 | xor %r10,%r10 | |
2246 | vmovdqa $xc3,0x00(%rsp) | |
2247 | lea 0x1c0($out),$out # out+=64*7 | |
2248 | sub \$448,$len # len-=64*7 | |
2249 | vmovdqa $xd3,0x20(%rsp) | |
2250 | ||
2251 | .Loop_tail8x: | |
2252 | movzb ($inp,%r10),%eax | |
2253 | movzb (%rsp,%r10),%ecx | |
2254 | lea 1(%r10),%r10 | |
2255 | xor %ecx,%eax | |
2256 | mov %al,-1($out,%r10) | |
2257 | dec $len | |
2258 | jnz .Loop_tail8x | |
2259 | ||
2260 | .Ldone8x: | |
3c274a6e | 2261 | vzeroall |
a98c648e AP |
2262 | ___ |
2263 | $code.=<<___ if ($win64); | |
384e6de4 AP |
2264 | movaps -0xa8(%r9),%xmm6 |
2265 | movaps -0x98(%r9),%xmm7 | |
2266 | movaps -0x88(%r9),%xmm8 | |
2267 | movaps -0x78(%r9),%xmm9 | |
2268 | movaps -0x68(%r9),%xmm10 | |
2269 | movaps -0x58(%r9),%xmm11 | |
2270 | movaps -0x48(%r9),%xmm12 | |
2271 | movaps -0x38(%r9),%xmm13 | |
2272 | movaps -0x28(%r9),%xmm14 | |
2273 | movaps -0x18(%r9),%xmm15 | |
a98c648e AP |
2274 | ___ |
2275 | $code.=<<___; | |
384e6de4 | 2276 | lea (%r9),%rsp |
f17652e5 | 2277 | .cfi_def_cfa_register %rsp |
384e6de4 | 2278 | .L8x_epilogue: |
a98c648e | 2279 | ret |
f17652e5 | 2280 | .cfi_endproc |
a98c648e AP |
2281 | .size ChaCha20_8x,.-ChaCha20_8x |
2282 | ___ | |
2283 | } | |
2284 | ||
abb8c44f AP |
2285 | ######################################################################## |
2286 | # AVX512 code paths | |
2287 | if ($avx>2) { | |
3c274a6e AP |
2288 | # This one handles shorter inputs... |
2289 | ||
2290 | my ($a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz) = map("%zmm$_",(0..3,16..20)); | |
2291 | my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7)); | |
2292 | ||
2293 | sub AVX512ROUND { # critical path is 14 "SIMD ticks" per round | |
2294 | &vpaddd ($a,$a,$b); | |
2295 | &vpxord ($d,$d,$a); | |
2296 | &vprold ($d,$d,16); | |
2297 | ||
2298 | &vpaddd ($c,$c,$d); | |
2299 | &vpxord ($b,$b,$c); | |
2300 | &vprold ($b,$b,12); | |
2301 | ||
2302 | &vpaddd ($a,$a,$b); | |
2303 | &vpxord ($d,$d,$a); | |
2304 | &vprold ($d,$d,8); | |
2305 | ||
2306 | &vpaddd ($c,$c,$d); | |
2307 | &vpxord ($b,$b,$c); | |
2308 | &vprold ($b,$b,7); | |
2309 | } | |
2310 | ||
384e6de4 | 2311 | my $xframe = $win64 ? 32+8 : 8; |
3c274a6e AP |
2312 | |
2313 | $code.=<<___; | |
2314 | .type ChaCha20_avx512,\@function,5 | |
2315 | .align 32 | |
2316 | ChaCha20_avx512: | |
f17652e5 | 2317 | .cfi_startproc |
3c274a6e | 2318 | .LChaCha20_avx512: |
384e6de4 | 2319 | mov %rsp,%r9 # frame pointer |
f17652e5 | 2320 | .cfi_def_cfa_register %r9 |
3c274a6e AP |
2321 | cmp \$512,$len |
2322 | ja .LChaCha20_16x | |
2323 | ||
3c274a6e AP |
2324 | sub \$64+$xframe,%rsp |
2325 | ___ | |
2326 | $code.=<<___ if ($win64); | |
384e6de4 AP |
2327 | movaps %xmm6,-0x28(%r9) |
2328 | movaps %xmm7,-0x18(%r9) | |
2329 | .Lavx512_body: | |
3c274a6e AP |
2330 | ___ |
2331 | $code.=<<___; | |
2332 | vbroadcasti32x4 .Lsigma(%rip),$a | |
2333 | vbroadcasti32x4 ($key),$b | |
2334 | vbroadcasti32x4 16($key),$c | |
2335 | vbroadcasti32x4 ($counter),$d | |
2336 | ||
2337 | vmovdqa32 $a,$a_ | |
2338 | vmovdqa32 $b,$b_ | |
2339 | vmovdqa32 $c,$c_ | |
2340 | vpaddd .Lzeroz(%rip),$d,$d | |
2341 | vmovdqa32 .Lfourz(%rip),$fourz | |
2342 | mov \$10,$counter # reuse $counter | |
2343 | vmovdqa32 $d,$d_ | |
2344 | jmp .Loop_avx512 | |
2345 | ||
2346 | .align 16 | |
2347 | .Loop_outer_avx512: | |
2348 | vmovdqa32 $a_,$a | |
2349 | vmovdqa32 $b_,$b | |
2350 | vmovdqa32 $c_,$c | |
2351 | vpaddd $fourz,$d_,$d | |
2352 | mov \$10,$counter | |
2353 | vmovdqa32 $d,$d_ | |
2354 | jmp .Loop_avx512 | |
2355 | ||
2356 | .align 32 | |
2357 | .Loop_avx512: | |
2358 | ___ | |
2359 | &AVX512ROUND(); | |
2360 | &vpshufd ($c,$c,0b01001110); | |
2361 | &vpshufd ($b,$b,0b00111001); | |
2362 | &vpshufd ($d,$d,0b10010011); | |
2363 | ||
2364 | &AVX512ROUND(); | |
2365 | &vpshufd ($c,$c,0b01001110); | |
2366 | &vpshufd ($b,$b,0b10010011); | |
2367 | &vpshufd ($d,$d,0b00111001); | |
2368 | ||
2369 | &dec ($counter); | |
2370 | &jnz (".Loop_avx512"); | |
2371 | ||
2372 | $code.=<<___; | |
2373 | vpaddd $a_,$a,$a | |
2374 | vpaddd $b_,$b,$b | |
2375 | vpaddd $c_,$c,$c | |
2376 | vpaddd $d_,$d,$d | |
2377 | ||
2378 | sub \$64,$len | |
2379 | jb .Ltail64_avx512 | |
2380 | ||
2381 | vpxor 0x00($inp),%x#$a,$t0 # xor with input | |
2382 | vpxor 0x10($inp),%x#$b,$t1 | |
2383 | vpxor 0x20($inp),%x#$c,$t2 | |
2384 | vpxor 0x30($inp),%x#$d,$t3 | |
2385 | lea 0x40($inp),$inp # inp+=64 | |
2386 | ||
2387 | vmovdqu $t0,0x00($out) # write output | |
2388 | vmovdqu $t1,0x10($out) | |
2389 | vmovdqu $t2,0x20($out) | |
2390 | vmovdqu $t3,0x30($out) | |
2391 | lea 0x40($out),$out # out+=64 | |
2392 | ||
2393 | jz .Ldone_avx512 | |
2394 | ||
2395 | vextracti32x4 \$1,$a,$t0 | |
2396 | vextracti32x4 \$1,$b,$t1 | |
2397 | vextracti32x4 \$1,$c,$t2 | |
2398 | vextracti32x4 \$1,$d,$t3 | |
2399 | ||
2400 | sub \$64,$len | |
2401 | jb .Ltail_avx512 | |
2402 | ||
2403 | vpxor 0x00($inp),$t0,$t0 # xor with input | |
2404 | vpxor 0x10($inp),$t1,$t1 | |
2405 | vpxor 0x20($inp),$t2,$t2 | |
2406 | vpxor 0x30($inp),$t3,$t3 | |
2407 | lea 0x40($inp),$inp # inp+=64 | |
2408 | ||
2409 | vmovdqu $t0,0x00($out) # write output | |
2410 | vmovdqu $t1,0x10($out) | |
2411 | vmovdqu $t2,0x20($out) | |
2412 | vmovdqu $t3,0x30($out) | |
2413 | lea 0x40($out),$out # out+=64 | |
2414 | ||
2415 | jz .Ldone_avx512 | |
2416 | ||
2417 | vextracti32x4 \$2,$a,$t0 | |
2418 | vextracti32x4 \$2,$b,$t1 | |
2419 | vextracti32x4 \$2,$c,$t2 | |
2420 | vextracti32x4 \$2,$d,$t3 | |
2421 | ||
2422 | sub \$64,$len | |
2423 | jb .Ltail_avx512 | |
2424 | ||
2425 | vpxor 0x00($inp),$t0,$t0 # xor with input | |
2426 | vpxor 0x10($inp),$t1,$t1 | |
2427 | vpxor 0x20($inp),$t2,$t2 | |
2428 | vpxor 0x30($inp),$t3,$t3 | |
2429 | lea 0x40($inp),$inp # inp+=64 | |
2430 | ||
2431 | vmovdqu $t0,0x00($out) # write output | |
2432 | vmovdqu $t1,0x10($out) | |
2433 | vmovdqu $t2,0x20($out) | |
2434 | vmovdqu $t3,0x30($out) | |
2435 | lea 0x40($out),$out # out+=64 | |
2436 | ||
2437 | jz .Ldone_avx512 | |
2438 | ||
2439 | vextracti32x4 \$3,$a,$t0 | |
2440 | vextracti32x4 \$3,$b,$t1 | |
2441 | vextracti32x4 \$3,$c,$t2 | |
2442 | vextracti32x4 \$3,$d,$t3 | |
2443 | ||
2444 | sub \$64,$len | |
2445 | jb .Ltail_avx512 | |
2446 | ||
2447 | vpxor 0x00($inp),$t0,$t0 # xor with input | |
2448 | vpxor 0x10($inp),$t1,$t1 | |
2449 | vpxor 0x20($inp),$t2,$t2 | |
2450 | vpxor 0x30($inp),$t3,$t3 | |
2451 | lea 0x40($inp),$inp # inp+=64 | |
2452 | ||
2453 | vmovdqu $t0,0x00($out) # write output | |
2454 | vmovdqu $t1,0x10($out) | |
2455 | vmovdqu $t2,0x20($out) | |
2456 | vmovdqu $t3,0x30($out) | |
2457 | lea 0x40($out),$out # out+=64 | |
2458 | ||
2459 | jnz .Loop_outer_avx512 | |
2460 | ||
2461 | jmp .Ldone_avx512 | |
2462 | ||
2463 | .align 16 | |
2464 | .Ltail64_avx512: | |
2465 | vmovdqa %x#$a,0x00(%rsp) | |
2466 | vmovdqa %x#$b,0x10(%rsp) | |
2467 | vmovdqa %x#$c,0x20(%rsp) | |
2468 | vmovdqa %x#$d,0x30(%rsp) | |
2469 | add \$64,$len | |
2470 | jmp .Loop_tail_avx512 | |
2471 | ||
2472 | .align 16 | |
2473 | .Ltail_avx512: | |
2474 | vmovdqa $t0,0x00(%rsp) | |
2475 | vmovdqa $t1,0x10(%rsp) | |
2476 | vmovdqa $t2,0x20(%rsp) | |
2477 | vmovdqa $t3,0x30(%rsp) | |
2478 | add \$64,$len | |
2479 | ||
2480 | .Loop_tail_avx512: | |
2481 | movzb ($inp,$counter),%eax | |
2482 | movzb (%rsp,$counter),%ecx | |
2483 | lea 1($counter),$counter | |
2484 | xor %ecx,%eax | |
2485 | mov %al,-1($out,$counter) | |
2486 | dec $len | |
2487 | jnz .Loop_tail_avx512 | |
2488 | ||
2489 | vmovdqa32 $a_,0x00(%rsp) | |
2490 | ||
2491 | .Ldone_avx512: | |
2492 | vzeroall | |
2493 | ___ | |
2494 | $code.=<<___ if ($win64); | |
384e6de4 AP |
2495 | movaps -0x28(%r9),%xmm6 |
2496 | movaps -0x18(%r9),%xmm7 | |
3c274a6e AP |
2497 | ___ |
2498 | $code.=<<___; | |
384e6de4 | 2499 | lea (%r9),%rsp |
f17652e5 | 2500 | .cfi_def_cfa_register %rsp |
384e6de4 | 2501 | .Lavx512_epilogue: |
3c274a6e | 2502 | ret |
f17652e5 | 2503 | .cfi_endproc |
3c274a6e AP |
2504 | .size ChaCha20_avx512,.-ChaCha20_avx512 |
2505 | ___ | |
2506 | } | |
2507 | if ($avx>2) { | |
2508 | # This one handles longer inputs... | |
2509 | ||
abb8c44f AP |
2510 | my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, |
2511 | $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%zmm$_",(0..15)); | |
2512 | my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, | |
2513 | $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3); | |
2514 | my @key=map("%zmm$_",(16..31)); | |
2515 | my ($xt0,$xt1,$xt2,$xt3)=@key[0..3]; | |
2516 | ||
2517 | sub AVX512_lane_ROUND { | |
2518 | my ($a0,$b0,$c0,$d0)=@_; | |
2519 | my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); | |
2520 | my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); | |
2521 | my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); | |
2522 | my @x=map("\"$_\"",@xx); | |
2523 | ||
2524 | ( | |
2525 | "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1 | |
2526 | "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2 | |
2527 | "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3 | |
2528 | "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4 | |
2529 | "&vpxord (@x[$d0],@x[$d0],@x[$a0])", | |
2530 | "&vpxord (@x[$d1],@x[$d1],@x[$a1])", | |
2531 | "&vpxord (@x[$d2],@x[$d2],@x[$a2])", | |
2532 | "&vpxord (@x[$d3],@x[$d3],@x[$a3])", | |
2533 | "&vprold (@x[$d0],@x[$d0],16)", | |
2534 | "&vprold (@x[$d1],@x[$d1],16)", | |
2535 | "&vprold (@x[$d2],@x[$d2],16)", | |
2536 | "&vprold (@x[$d3],@x[$d3],16)", | |
2537 | ||
2538 | "&vpaddd (@x[$c0],@x[$c0],@x[$d0])", | |
2539 | "&vpaddd (@x[$c1],@x[$c1],@x[$d1])", | |
2540 | "&vpaddd (@x[$c2],@x[$c2],@x[$d2])", | |
2541 | "&vpaddd (@x[$c3],@x[$c3],@x[$d3])", | |
2542 | "&vpxord (@x[$b0],@x[$b0],@x[$c0])", | |
2543 | "&vpxord (@x[$b1],@x[$b1],@x[$c1])", | |
2544 | "&vpxord (@x[$b2],@x[$b2],@x[$c2])", | |
2545 | "&vpxord (@x[$b3],@x[$b3],@x[$c3])", | |
2546 | "&vprold (@x[$b0],@x[$b0],12)", | |
2547 | "&vprold (@x[$b1],@x[$b1],12)", | |
2548 | "&vprold (@x[$b2],@x[$b2],12)", | |
2549 | "&vprold (@x[$b3],@x[$b3],12)", | |
2550 | ||
2551 | "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", | |
2552 | "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", | |
2553 | "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", | |
2554 | "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", | |
2555 | "&vpxord (@x[$d0],@x[$d0],@x[$a0])", | |
2556 | "&vpxord (@x[$d1],@x[$d1],@x[$a1])", | |
2557 | "&vpxord (@x[$d2],@x[$d2],@x[$a2])", | |
2558 | "&vpxord (@x[$d3],@x[$d3],@x[$a3])", | |
2559 | "&vprold (@x[$d0],@x[$d0],8)", | |
2560 | "&vprold (@x[$d1],@x[$d1],8)", | |
2561 | "&vprold (@x[$d2],@x[$d2],8)", | |
2562 | "&vprold (@x[$d3],@x[$d3],8)", | |
2563 | ||
2564 | "&vpaddd (@x[$c0],@x[$c0],@x[$d0])", | |
2565 | "&vpaddd (@x[$c1],@x[$c1],@x[$d1])", | |
2566 | "&vpaddd (@x[$c2],@x[$c2],@x[$d2])", | |
2567 | "&vpaddd (@x[$c3],@x[$c3],@x[$d3])", | |
2568 | "&vpxord (@x[$b0],@x[$b0],@x[$c0])", | |
2569 | "&vpxord (@x[$b1],@x[$b1],@x[$c1])", | |
2570 | "&vpxord (@x[$b2],@x[$b2],@x[$c2])", | |
2571 | "&vpxord (@x[$b3],@x[$b3],@x[$c3])", | |
2572 | "&vprold (@x[$b0],@x[$b0],7)", | |
2573 | "&vprold (@x[$b1],@x[$b1],7)", | |
2574 | "&vprold (@x[$b2],@x[$b2],7)", | |
2575 | "&vprold (@x[$b3],@x[$b3],7)" | |
2576 | ); | |
2577 | } | |
2578 | ||
384e6de4 | 2579 | my $xframe = $win64 ? 0xa8 : 8; |
abb8c44f AP |
2580 | |
2581 | $code.=<<___; | |
2582 | .type ChaCha20_16x,\@function,5 | |
2583 | .align 32 | |
2584 | ChaCha20_16x: | |
f17652e5 | 2585 | .cfi_startproc |
abb8c44f | 2586 | .LChaCha20_16x: |
384e6de4 | 2587 | mov %rsp,%r9 # frame register |
f17652e5 | 2588 | .cfi_def_cfa_register %r9 |
abb8c44f AP |
2589 | sub \$64+$xframe,%rsp |
2590 | and \$-64,%rsp | |
2591 | ___ | |
2592 | $code.=<<___ if ($win64); | |
384e6de4 AP |
2593 | movaps %xmm6,-0xa8(%r9) |
2594 | movaps %xmm7,-0x98(%r9) | |
2595 | movaps %xmm8,-0x88(%r9) | |
2596 | movaps %xmm9,-0x78(%r9) | |
2597 | movaps %xmm10,-0x68(%r9) | |
2598 | movaps %xmm11,-0x58(%r9) | |
2599 | movaps %xmm12,-0x48(%r9) | |
2600 | movaps %xmm13,-0x38(%r9) | |
2601 | movaps %xmm14,-0x28(%r9) | |
2602 | movaps %xmm15,-0x18(%r9) | |
2603 | .L16x_body: | |
abb8c44f AP |
2604 | ___ |
2605 | $code.=<<___; | |
2606 | vzeroupper | |
2607 | ||
2608 | lea .Lsigma(%rip),%r10 | |
2609 | vbroadcasti32x4 (%r10),$xa3 # key[0] | |
2610 | vbroadcasti32x4 ($key),$xb3 # key[1] | |
2611 | vbroadcasti32x4 16($key),$xc3 # key[2] | |
2612 | vbroadcasti32x4 ($counter),$xd3 # key[3] | |
2613 | ||
2614 | vpshufd \$0x00,$xa3,$xa0 # smash key by lanes... | |
2615 | vpshufd \$0x55,$xa3,$xa1 | |
2616 | vpshufd \$0xaa,$xa3,$xa2 | |
2617 | vpshufd \$0xff,$xa3,$xa3 | |
2618 | vmovdqa64 $xa0,@key[0] | |
2619 | vmovdqa64 $xa1,@key[1] | |
2620 | vmovdqa64 $xa2,@key[2] | |
2621 | vmovdqa64 $xa3,@key[3] | |
2622 | ||
2623 | vpshufd \$0x00,$xb3,$xb0 | |
2624 | vpshufd \$0x55,$xb3,$xb1 | |
2625 | vpshufd \$0xaa,$xb3,$xb2 | |
2626 | vpshufd \$0xff,$xb3,$xb3 | |
2627 | vmovdqa64 $xb0,@key[4] | |
2628 | vmovdqa64 $xb1,@key[5] | |
2629 | vmovdqa64 $xb2,@key[6] | |
2630 | vmovdqa64 $xb3,@key[7] | |
2631 | ||
2632 | vpshufd \$0x00,$xc3,$xc0 | |
2633 | vpshufd \$0x55,$xc3,$xc1 | |
2634 | vpshufd \$0xaa,$xc3,$xc2 | |
2635 | vpshufd \$0xff,$xc3,$xc3 | |
2636 | vmovdqa64 $xc0,@key[8] | |
2637 | vmovdqa64 $xc1,@key[9] | |
2638 | vmovdqa64 $xc2,@key[10] | |
2639 | vmovdqa64 $xc3,@key[11] | |
2640 | ||
2641 | vpshufd \$0x00,$xd3,$xd0 | |
2642 | vpshufd \$0x55,$xd3,$xd1 | |
2643 | vpshufd \$0xaa,$xd3,$xd2 | |
2644 | vpshufd \$0xff,$xd3,$xd3 | |
2645 | vpaddd .Lincz(%rip),$xd0,$xd0 # don't save counters yet | |
2646 | vmovdqa64 $xd0,@key[12] | |
2647 | vmovdqa64 $xd1,@key[13] | |
2648 | vmovdqa64 $xd2,@key[14] | |
2649 | vmovdqa64 $xd3,@key[15] | |
2650 | ||
2651 | mov \$10,%eax | |
2652 | jmp .Loop16x | |
2653 | ||
2654 | .align 32 | |
2655 | .Loop_outer16x: | |
2656 | vpbroadcastd 0(%r10),$xa0 # reload key | |
2657 | vpbroadcastd 4(%r10),$xa1 | |
2658 | vpbroadcastd 8(%r10),$xa2 | |
2659 | vpbroadcastd 12(%r10),$xa3 | |
2660 | vpaddd .Lsixteen(%rip),@key[12],@key[12] # next SIMD counters | |
2661 | vmovdqa64 @key[4],$xb0 | |
2662 | vmovdqa64 @key[5],$xb1 | |
2663 | vmovdqa64 @key[6],$xb2 | |
2664 | vmovdqa64 @key[7],$xb3 | |
2665 | vmovdqa64 @key[8],$xc0 | |
2666 | vmovdqa64 @key[9],$xc1 | |
2667 | vmovdqa64 @key[10],$xc2 | |
2668 | vmovdqa64 @key[11],$xc3 | |
2669 | vmovdqa64 @key[12],$xd0 | |
2670 | vmovdqa64 @key[13],$xd1 | |
2671 | vmovdqa64 @key[14],$xd2 | |
2672 | vmovdqa64 @key[15],$xd3 | |
2673 | ||
2674 | vmovdqa64 $xa0,@key[0] | |
2675 | vmovdqa64 $xa1,@key[1] | |
2676 | vmovdqa64 $xa2,@key[2] | |
2677 | vmovdqa64 $xa3,@key[3] | |
2678 | ||
2679 | mov \$10,%eax | |
2680 | jmp .Loop16x | |
2681 | ||
2682 | .align 32 | |
2683 | .Loop16x: | |
2684 | ___ | |
2685 | foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; } | |
2686 | foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; } | |
2687 | $code.=<<___; | |
2688 | dec %eax | |
2689 | jnz .Loop16x | |
2690 | ||
2691 | vpaddd @key[0],$xa0,$xa0 # accumulate key | |
2692 | vpaddd @key[1],$xa1,$xa1 | |
2693 | vpaddd @key[2],$xa2,$xa2 | |
2694 | vpaddd @key[3],$xa3,$xa3 | |
2695 | ||
2696 | vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data | |
2697 | vpunpckldq $xa3,$xa2,$xt3 | |
2698 | vpunpckhdq $xa1,$xa0,$xa0 | |
2699 | vpunpckhdq $xa3,$xa2,$xa2 | |
2700 | vpunpcklqdq $xt3,$xt2,$xa1 # "a0" | |
2701 | vpunpckhqdq $xt3,$xt2,$xt2 # "a1" | |
2702 | vpunpcklqdq $xa2,$xa0,$xa3 # "a2" | |
2703 | vpunpckhqdq $xa2,$xa0,$xa0 # "a3" | |
2704 | ___ | |
2705 | ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); | |
2706 | $code.=<<___; | |
2707 | vpaddd @key[4],$xb0,$xb0 | |
2708 | vpaddd @key[5],$xb1,$xb1 | |
2709 | vpaddd @key[6],$xb2,$xb2 | |
2710 | vpaddd @key[7],$xb3,$xb3 | |
2711 | ||
2712 | vpunpckldq $xb1,$xb0,$xt2 | |
2713 | vpunpckldq $xb3,$xb2,$xt3 | |
2714 | vpunpckhdq $xb1,$xb0,$xb0 | |
2715 | vpunpckhdq $xb3,$xb2,$xb2 | |
2716 | vpunpcklqdq $xt3,$xt2,$xb1 # "b0" | |
2717 | vpunpckhqdq $xt3,$xt2,$xt2 # "b1" | |
2718 | vpunpcklqdq $xb2,$xb0,$xb3 # "b2" | |
2719 | vpunpckhqdq $xb2,$xb0,$xb0 # "b3" | |
2720 | ___ | |
# Perl-level renaming only: no code is emitted here, the scalars are
# re-labelled so the names keep tracking where each logical row of the
# state matrix ended up after the preceding unpack step.
($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
$code.=<<___;
	vshufi32x4	\$0x44,$xb0,$xa0,$xt3	# "de-interlace" further
	vshufi32x4	\$0xee,$xb0,$xa0,$xb0
	vshufi32x4	\$0x44,$xb1,$xa1,$xa0
	vshufi32x4	\$0xee,$xb1,$xa1,$xb1
	vshufi32x4	\$0x44,$xb2,$xa2,$xa1
	vshufi32x4	\$0xee,$xb2,$xa2,$xb2
	vshufi32x4	\$0x44,$xb3,$xa3,$xa2
	vshufi32x4	\$0xee,$xb3,$xa3,$xb3
___
($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);	# rename, no code emitted
$code.=<<___;
	vpaddd		@key[8],$xc0,$xc0	# add saved key material ("c" rows)
	vpaddd		@key[9],$xc1,$xc1
	vpaddd		@key[10],$xc2,$xc2
	vpaddd		@key[11],$xc3,$xc3

	vpunpckldq	$xc1,$xc0,$xt2		# interleave 32-bit lanes
	vpunpckldq	$xc3,$xc2,$xt3
	vpunpckhdq	$xc1,$xc0,$xc0
	vpunpckhdq	$xc3,$xc2,$xc2
	vpunpcklqdq	$xt3,$xt2,$xc1		# "c0"
	vpunpckhqdq	$xt3,$xt2,$xt2		# "c1"
	vpunpcklqdq	$xc2,$xc0,$xc3		# "c2"
	vpunpckhqdq	$xc2,$xc0,$xc0		# "c3"
___
($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);	# rename, no code emitted
$code.=<<___;
	vpaddd		@key[12],$xd0,$xd0	# add saved key material ("d" rows)
	vpaddd		@key[13],$xd1,$xd1
	vpaddd		@key[14],$xd2,$xd2
	vpaddd		@key[15],$xd3,$xd3

	vpunpckldq	$xd1,$xd0,$xt2		# interleave 32-bit lanes
	vpunpckldq	$xd3,$xd2,$xt3
	vpunpckhdq	$xd1,$xd0,$xd0
	vpunpckhdq	$xd3,$xd2,$xd2
	vpunpcklqdq	$xt3,$xt2,$xd1		# "d0"
	vpunpckhqdq	$xt3,$xt2,$xt2		# "d1"
	vpunpcklqdq	$xd2,$xd0,$xd3		# "d2"
	vpunpckhqdq	$xd2,$xd0,$xd0		# "d3"
___
($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);	# rename, no code emitted
$code.=<<___;
	vshufi32x4	\$0x44,$xd0,$xc0,$xt3	# "de-interlace" further
	vshufi32x4	\$0xee,$xd0,$xc0,$xd0
	vshufi32x4	\$0x44,$xd1,$xc1,$xc0
	vshufi32x4	\$0xee,$xd1,$xc1,$xd1
	vshufi32x4	\$0x44,$xd2,$xc2,$xc1
	vshufi32x4	\$0xee,$xd2,$xc2,$xd2
	vshufi32x4	\$0x44,$xd3,$xc3,$xc2
	vshufi32x4	\$0xee,$xd3,$xc3,$xd3
___
($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);	# rename, no code emitted
$code.=<<___;
	vshufi32x4	\$0x88,$xc0,$xa0,$xt0	# "de-interlace" further
	vshufi32x4	\$0xdd,$xc0,$xa0,$xa0
	 vshufi32x4	\$0x88,$xd0,$xb0,$xc0
	 vshufi32x4	\$0xdd,$xd0,$xb0,$xd0
	vshufi32x4	\$0x88,$xc1,$xa1,$xt1
	vshufi32x4	\$0xdd,$xc1,$xa1,$xa1
	 vshufi32x4	\$0x88,$xd1,$xb1,$xc1
	 vshufi32x4	\$0xdd,$xd1,$xb1,$xd1
	vshufi32x4	\$0x88,$xc2,$xa2,$xt2
	vshufi32x4	\$0xdd,$xc2,$xa2,$xa2
	 vshufi32x4	\$0x88,$xd2,$xb2,$xc2
	 vshufi32x4	\$0xdd,$xd2,$xb2,$xd2
	vshufi32x4	\$0x88,$xc3,$xa3,$xt3
	vshufi32x4	\$0xdd,$xc3,$xa3,$xa3
	 vshufi32x4	\$0x88,$xd3,$xb3,$xc3
	 vshufi32x4	\$0xdd,$xd3,$xb3,$xd3
___
# Final Perl-level renames: after the shuffles above, re-label the scalars
# so ($xa0,$xb0,$xc0,$xd0, ...) name consecutive 64-byte output blocks in
# the order they are xor'ed/stored below.  Emits no code.
($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3)=
($xt0,$xt1,$xt2,$xt3,$xa0,$xa1,$xa2,$xa3);

($xa0,$xb0,$xc0,$xd0, $xa1,$xb1,$xc1,$xd1,
 $xa2,$xb2,$xc2,$xd2, $xa3,$xb3,$xc3,$xd3) =
($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
$code.=<<___;
	cmp		\$64*16,$len		# full 1024-byte (16 x 64B) chunk left?
	jb		.Ltail16x

	vpxord		0x00($inp),$xa0,$xa0	# xor with input
	vpxord		0x40($inp),$xb0,$xb0
	vpxord		0x80($inp),$xc0,$xc0
	vpxord		0xc0($inp),$xd0,$xd0
	vmovdqu32	$xa0,0x00($out)
	vmovdqu32	$xb0,0x40($out)
	vmovdqu32	$xc0,0x80($out)
	vmovdqu32	$xd0,0xc0($out)

	vpxord		0x100($inp),$xa1,$xa1
	vpxord		0x140($inp),$xb1,$xb1
	vpxord		0x180($inp),$xc1,$xc1
	vpxord		0x1c0($inp),$xd1,$xd1
	vmovdqu32	$xa1,0x100($out)
	vmovdqu32	$xb1,0x140($out)
	vmovdqu32	$xc1,0x180($out)
	vmovdqu32	$xd1,0x1c0($out)

	vpxord		0x200($inp),$xa2,$xa2
	vpxord		0x240($inp),$xb2,$xb2
	vpxord		0x280($inp),$xc2,$xc2
	vpxord		0x2c0($inp),$xd2,$xd2
	vmovdqu32	$xa2,0x200($out)
	vmovdqu32	$xb2,0x240($out)
	vmovdqu32	$xc2,0x280($out)
	vmovdqu32	$xd2,0x2c0($out)

	vpxord		0x300($inp),$xa3,$xa3
	vpxord		0x340($inp),$xb3,$xb3
	vpxord		0x380($inp),$xc3,$xc3
	vpxord		0x3c0($inp),$xd3,$xd3
	lea		0x400($inp),$inp	# advance pointers by 1024 bytes
	vmovdqu32	$xa3,0x300($out)
	vmovdqu32	$xb3,0x340($out)
	vmovdqu32	$xc3,0x380($out)
	vmovdqu32	$xd3,0x3c0($out)
	lea		0x400($out),$out

	sub		\$64*16,$len
	jnz		.Loop_outer16x

	jmp		.Ldone16x

.align	32
.Ltail16x:
	# Partial chunk: process one 64-byte block at a time.  $out is turned
	# into the out-in displacement so each store can address ($out,$inp).
	# Before each 64-byte step the next key-stream block is copied into
	# $xa0, which .Less_than_64_16x stashes on the stack for the byte tail.
	xor		%r10,%r10
	sub		$inp,$out
	cmp		\$64*1,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xa0,$xa0	# xor with input
	vmovdqu32	$xa0,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xb0,$xa0
	lea		64($inp),$inp

	cmp		\$64*2,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xb0,$xb0
	vmovdqu32	$xb0,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xc0,$xa0
	lea		64($inp),$inp

	cmp		\$64*3,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xc0,$xc0
	vmovdqu32	$xc0,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xd0,$xa0
	lea		64($inp),$inp

	cmp		\$64*4,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xd0,$xd0
	vmovdqu32	$xd0,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xa1,$xa0
	lea		64($inp),$inp

	cmp		\$64*5,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xa1,$xa1
	vmovdqu32	$xa1,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xb1,$xa0
	lea		64($inp),$inp

	cmp		\$64*6,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xb1,$xb1
	vmovdqu32	$xb1,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xc1,$xa0
	lea		64($inp),$inp

	cmp		\$64*7,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xc1,$xc1
	vmovdqu32	$xc1,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xd1,$xa0
	lea		64($inp),$inp

	cmp		\$64*8,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xd1,$xd1
	vmovdqu32	$xd1,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xa2,$xa0
	lea		64($inp),$inp

	cmp		\$64*9,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xa2,$xa2
	vmovdqu32	$xa2,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xb2,$xa0
	lea		64($inp),$inp

	cmp		\$64*10,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xb2,$xb2
	vmovdqu32	$xb2,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xc2,$xa0
	lea		64($inp),$inp

	cmp		\$64*11,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xc2,$xc2
	vmovdqu32	$xc2,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xd2,$xa0
	lea		64($inp),$inp

	cmp		\$64*12,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xd2,$xd2
	vmovdqu32	$xd2,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xa3,$xa0
	lea		64($inp),$inp

	cmp		\$64*13,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xa3,$xa3
	vmovdqu32	$xa3,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xb3,$xa0
	lea		64($inp),$inp

	cmp		\$64*14,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xb3,$xb3
	vmovdqu32	$xb3,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xc3,$xa0
	lea		64($inp),$inp

	cmp		\$64*15,$len
	jb		.Less_than_64_16x
	vpxord		($inp),$xc3,$xc3
	vmovdqu32	$xc3,($out,$inp)
	je		.Ldone16x
	vmovdqa32	$xd3,$xa0
	lea		64($inp),$inp

.Less_than_64_16x:
	# 1..63 bytes remain: stash the current key-stream block on the stack
	# and xor the input against it byte by byte.
	vmovdqa32	$xa0,0x00(%rsp)
	lea		($out,$inp),$out	# restore absolute output pointer
	and		\$63,$len

.Loop_tail16x:
	movzb		($inp,%r10),%eax
	movzb		(%rsp,%r10),%ecx
	lea		1(%r10),%r10
	xor		%ecx,%eax
	mov		%al,-1($out,%r10)
	dec		$len
	jnz		.Loop_tail16x

	vpxord		$xa0,$xa0,$xa0		# wipe key-stream block stashed
	vmovdqa32	$xa0,0(%rsp)		# at 0(%rsp) above

.Ldone16x:
	vzeroall				# clear all vector registers
___
# Win64 only: restore the non-volatile XMM registers saved by the
# prologue; %r9 holds the frame pointer here.
$code.=<<___	if ($win64);
	movaps		-0xa8(%r9),%xmm6
	movaps		-0x98(%r9),%xmm7
	movaps		-0x88(%r9),%xmm8
	movaps		-0x78(%r9),%xmm9
	movaps		-0x68(%r9),%xmm10
	movaps		-0x58(%r9),%xmm11
	movaps		-0x48(%r9),%xmm12
	movaps		-0x38(%r9),%xmm13
	movaps		-0x28(%r9),%xmm14
	movaps		-0x18(%r9),%xmm15
___
$code.=<<___;
	lea		(%r9),%rsp		# restore %rsp from frame pointer
.cfi_def_cfa_register	%rsp
.L16x_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_16x,.-ChaCha20_16x
___
}
3013 | ||
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

# se_handler: unwind handler for the integer-only ChaCha20_ctr32 path.
# If the fault lies inside [.Lctr32_body,.Lno_data) it recovers the six
# saved GPRs from the fixed-layout frame, then falls into the common tail.
$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	lea	.Lctr32_body(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lno_data(%rip),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lcommon_seh_tail

	lea	64+24+48(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	# Shared tail: fix up Rsp/Rsi/Rdi in the context, copy the context
	# record and let RtlVirtualUnwind continue the unwind.
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT) in qwords (rep movsq)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

# ssse3_handler: handler for frames that save %xmm6-%xmm7 only; the
# prologue/epilogue bounds come from HandlerData[0..1], the frame
# pointer from context->R9.  Copies 4 qwords (2 XMM regs) back into
# the context starting at Xmm6.
.type	ssse3_handler,\@abi-omnipotent
.align	16
ssse3_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	192($context),%rax	# pull context->R9

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	-0x28(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$4,%ecx		# 4 qwords = %xmm6-%xmm7
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lcommon_seh_tail
.size	ssse3_handler,.-ssse3_handler

# full_handler: same shape as ssse3_handler but for frames that save
# the full %xmm6-%xmm15 set: copies 20 qwords (10 XMM regs) from
# -0xa8 off the frame pointer.
.type	full_handler,\@abi-omnipotent
.align	16
full_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	192($context),%rax	# pull context->R9

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	-0xa8(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx		# 20 qwords = %xmm6-%xmm15
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lcommon_seh_tail
.size	full_handler,.-full_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_ChaCha20_ctr32
	.rva	.LSEH_end_ChaCha20_ctr32
	.rva	.LSEH_info_ChaCha20_ctr32

	.rva	.LSEH_begin_ChaCha20_ssse3
	.rva	.LSEH_end_ChaCha20_ssse3
	.rva	.LSEH_info_ChaCha20_ssse3

	.rva	.LSEH_begin_ChaCha20_4x
	.rva	.LSEH_end_ChaCha20_4x
	.rva	.LSEH_info_ChaCha20_4x
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_ChaCha20_4xop
	.rva	.LSEH_end_ChaCha20_4xop
	.rva	.LSEH_info_ChaCha20_4xop
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_ChaCha20_8x
	.rva	.LSEH_end_ChaCha20_8x
	.rva	.LSEH_info_ChaCha20_8x
___
$code.=<<___ if ($avx>2);
	.rva	.LSEH_begin_ChaCha20_avx512
	.rva	.LSEH_end_ChaCha20_avx512
	.rva	.LSEH_info_ChaCha20_avx512

	.rva	.LSEH_begin_ChaCha20_16x
	.rva	.LSEH_end_ChaCha20_16x
	.rva	.LSEH_info_ChaCha20_16x
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_ChaCha20_ctr32:
	.byte	9,0,0,0
	.rva	se_handler

.LSEH_info_ChaCha20_ssse3:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lssse3_body,.Lssse3_epilogue

.LSEH_info_ChaCha20_4x:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.L4x_body,.L4x_epilogue
___
$code.=<<___ if ($avx);
.LSEH_info_ChaCha20_4xop:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.L4xop_body,.L4xop_epilogue	# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_ChaCha20_8x:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.L8x_body,.L8x_epilogue		# HandlerData[]
___
$code.=<<___ if ($avx>2);
.LSEH_info_ChaCha20_avx512:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lavx512_body,.Lavx512_epilogue	# HandlerData[]

.LSEH_info_ChaCha20_16x:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.L16x_body,.L16x_epilogue	# HandlerData[]
___
}
3263 | ||
# Post-process the accumulated assembly text and emit it on STDOUT.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;	# expand `...` expressions at generation time

	s/%x#%[yz]/%x/g;		# "down-shift": collapse %x#%y / %x#%z
					# register tags to the %xmm form

	print $_,"\n";
}

# STDOUT is the assembler input; a silently-failed close (ENOSPC, broken
# pipe) would otherwise hand the assembler a truncated file with rc=0.
close STDOUT or die "error closing STDOUT: $!";