]>
Commit | Line | Data |
---|---|---|
6aa36e8e | 1 | #! /usr/bin/env perl |
d5487a45 | 2 | # Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved. |
6aa36e8e RS |
3 | # |
4 | # Licensed under the OpenSSL license (the "License"). You may not use | |
5 | # this file except in compliance with the License. You can obtain a copy | |
6 | # in the file LICENSE in the source distribution or at | |
7 | # https://www.openssl.org/source/license.html | |
8 | ||
a98c648e AP |
9 | # |
10 | # ==================================================================== | |
11 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | |
12 | # project. The module is, however, dual licensed under OpenSSL and | |
13 | # CRYPTOGAMS licenses depending on where you obtain it. For further | |
14 | # details see http://www.openssl.org/~appro/cryptogams/. | |
15 | # ==================================================================== | |
16 | # | |
17 | # November 2014 | |
18 | # | |
19 | # ChaCha20 for x86_64. | |
20 | # | |
abb8c44f AP |
21 | # December 2016 |
22 | # | |
23 | # Add AVX512F code path. | |
24 | # | |
cded9513 AP |
25 | # December 2017 |
26 | # | |
27 | # Add AVX512VL code path. | |
28 | # | |
a98c648e AP |
29 | # Performance in cycles per byte out of large buffer. |
30 | # | |
d5487a45 | 31 | # IALU/gcc 4.8(i) 1x/2xSSSE3(ii) 4xSSSE3 NxAVX(v) |
a98c648e | 32 | # |
d5487a45 AP |
33 | # P4 9.48/+99% - - |
34 | # Core2 7.83/+55% 7.90/5.76 4.35 | |
35 | # Westmere 7.19/+50% 5.60/4.50 3.00 | |
36 | # Sandy Bridge 8.31/+42% 5.45/4.00 2.72 | |
37 | # Ivy Bridge 6.71/+46% 5.40/? 2.41 | |
38 | # Haswell 5.92/+43% 5.20/3.45 2.42 1.23 | |
39 | # Skylake[-X] 5.87/+39% 4.70/3.22 2.31 1.19[0.80(vi)] | |
40 | # Silvermont 12.0/+33% 7.75/6.90 7.03(iii) | |
41 | # Knights L 11.7/- ? 9.60(iii) 0.80 | |
42 | # Goldmont 10.6/+17% 5.10/3.52 3.28 | |
43 | # Sledgehammer 7.28/+52% - - | |
44 | # Bulldozer 9.66/+28% 9.85/5.35(iv) 3.06(iv) | |
45 | # Ryzen 5.96/+50% 5.19/3.00 2.40 2.09 | |
46 | # VIA Nano 10.5/+46% 6.72/6.88 6.05 | |
a98c648e AP |
47 | # |
48 | # (i) compared to older gcc 3.x one can observe >2x improvement on | |
49 | # most platforms; | |
d5487a45 AP |
50 | # (ii) 2xSSSE3 is code path optimized specifically for 128 bytes used |
51 | # by chacha20_poly1305_tls_cipher, results are EVP-free; | |
a98c648e AP |
52 | # (iii) this is not optimal result for Atom because of MSROM |
53 | # limitations, SSE2 can do better, but gain is considered too | |
54 | # low to justify the [maintenance] effort; | |
d5487a45 AP |
55 | # (iv) Bulldozer actually executes 4xXOP code path that delivers 2.20 |
56 | # and 4.85 for 128-byte inputs; | |
cded9513 AP |
57 | # (v) 8xAVX2, 8xAVX512VL or 16xAVX512F, whichever best applicable; |
58 | # (vi) even though Skylake-X can execute AVX512F code and deliver 0.57 | |
59 | # cpb in single thread, the corresponding capability is suppressed; | |
a98c648e AP |
60 | |
61 | $flavour = shift; | |
62 | $output = shift; | |
63 | if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } | |
64 | ||
65 | $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); | |
66 | ||
67 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | |
68 | ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or | |
69 | ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or | |
70 | die "can't locate x86_64-xlate.pl"; | |
71 | ||
72 | if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` | |
73 | =~ /GNU assembler version ([2-9]\.[0-9]+)/) { | |
abb8c44f | 74 | $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25); |
a98c648e AP |
75 | } |
76 | ||
77 | if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && | |
1ea01427 | 78 | `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) { |
abb8c44f AP |
79 | $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12); |
80 | $avx += 1 if ($1==2.11 && $2>=8); | |
a98c648e AP |
81 | } |
82 | ||
83 | if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && | |
84 | `ml64 2>&1` =~ /Version ([0-9]+)\./) { | |
85 | $avx = ($1>=10) + ($1>=11); | |
86 | } | |
87 | ||
88 | if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) { | |
89 | $avx = ($2>=3.0) + ($2>3.0); | |
90 | } | |
91 | ||
cfe1d992 | 92 | open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; |
a98c648e AP |
93 | *STDOUT=*OUT; |
94 | ||
95 | # input parameter block | |
96 | ($out,$inp,$len,$key,$counter)=("%rdi","%rsi","%rdx","%rcx","%r8"); | |
97 | ||
98 | $code.=<<___; | |
99 | .text | |
100 | ||
101 | .extern OPENSSL_ia32cap_P | |
102 | ||
103 | .align 64 | |
104 | .Lzero: | |
105 | .long 0,0,0,0 | |
106 | .Lone: | |
107 | .long 1,0,0,0 | |
108 | .Linc: | |
109 | .long 0,1,2,3 | |
110 | .Lfour: | |
111 | .long 4,4,4,4 | |
112 | .Lincy: | |
113 | .long 0,2,4,6,1,3,5,7 | |
114 | .Leight: | |
115 | .long 8,8,8,8,8,8,8,8 | |
116 | .Lrot16: | |
117 | .byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd | |
118 | .Lrot24: | |
119 | .byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe | |
cded9513 AP |
120 | .Ltwoy: |
121 | .long 2,0,0,0, 2,0,0,0 | |
abb8c44f | 122 | .align 64 |
3c274a6e AP |
123 | .Lzeroz: |
124 | .long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0 | |
125 | .Lfourz: | |
126 | .long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0 | |
abb8c44f AP |
127 | .Lincz: |
128 | .long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 | |
129 | .Lsixteen: | |
130 | .long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16 | |
cded9513 AP |
131 | .Lsigma: |
132 | .asciz "expand 32-byte k" | |
a98c648e AP |
133 | .asciz "ChaCha20 for x86_64, CRYPTOGAMS by <appro\@openssl.org>" |
134 | ___ | |
135 | ||
136 | sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm | |
137 | { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; | |
138 | my $arg = pop; | |
139 | $arg = "\$$arg" if ($arg*1 eq $arg); | |
140 | $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; | |
141 | } | |
142 | ||
143 | @x=("%eax","%ebx","%ecx","%edx",map("%r${_}d",(8..11)), | |
144 | "%nox","%nox","%nox","%nox",map("%r${_}d",(12..15))); | |
145 | @t=("%esi","%edi"); | |
146 | ||
147 | sub ROUND { # critical path is 24 cycles per round | |
148 | my ($a0,$b0,$c0,$d0)=@_; | |
149 | my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); | |
150 | my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); | |
151 | my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); | |
152 | my ($xc,$xc_)=map("\"$_\"",@t); | |
153 | my @x=map("\"$_\"",@x); | |
154 | ||
155 | # Consider order in which variables are addressed by their | |
156 | # index: | |
157 | # | |
158 | # a b c d | |
159 | # | |
160 | # 0 4 8 12 < even round | |
161 | # 1 5 9 13 | |
162 | # 2 6 10 14 | |
163 | # 3 7 11 15 | |
164 | # 0 5 10 15 < odd round | |
165 | # 1 6 11 12 | |
166 | # 2 7 8 13 | |
167 | # 3 4 9 14 | |
168 | # | |
169 | # 'a', 'b' and 'd's are permanently allocated in registers, | |
170 | # @x[0..7,12..15], while 'c's are maintained in memory. If | |
171 | # you observe 'c' column, you'll notice that pair of 'c's is | |
172 | # invariant between rounds. This means that we have to reload | |
173 | # them once per round, in the middle. This is why you'll see | |
174 | # bunch of 'c' stores and loads in the middle, but none in | |
175 | # the beginning or end. | |
176 | ||
177 | # Normally instructions would be interleaved to favour in-order | |
178 | # execution. Generally out-of-order cores manage it gracefully, | |
179 | # but not this time for some reason. As in-order execution | |
180 | # cores are dying breed, old Atom is the only one around, | |
181 | # instructions are left uninterleaved. Besides, Atom is better | |
182 | # off executing 1xSSSE3 code anyway... | |
183 | ||
184 | ( | |
185 | "&add (@x[$a0],@x[$b0])", # Q1 | |
186 | "&xor (@x[$d0],@x[$a0])", | |
187 | "&rol (@x[$d0],16)", | |
188 | "&add (@x[$a1],@x[$b1])", # Q2 | |
189 | "&xor (@x[$d1],@x[$a1])", | |
190 | "&rol (@x[$d1],16)", | |
191 | ||
192 | "&add ($xc,@x[$d0])", | |
193 | "&xor (@x[$b0],$xc)", | |
194 | "&rol (@x[$b0],12)", | |
195 | "&add ($xc_,@x[$d1])", | |
196 | "&xor (@x[$b1],$xc_)", | |
197 | "&rol (@x[$b1],12)", | |
198 | ||
199 | "&add (@x[$a0],@x[$b0])", | |
200 | "&xor (@x[$d0],@x[$a0])", | |
201 | "&rol (@x[$d0],8)", | |
202 | "&add (@x[$a1],@x[$b1])", | |
203 | "&xor (@x[$d1],@x[$a1])", | |
204 | "&rol (@x[$d1],8)", | |
205 | ||
206 | "&add ($xc,@x[$d0])", | |
207 | "&xor (@x[$b0],$xc)", | |
208 | "&rol (@x[$b0],7)", | |
209 | "&add ($xc_,@x[$d1])", | |
210 | "&xor (@x[$b1],$xc_)", | |
211 | "&rol (@x[$b1],7)", | |
212 | ||
213 | "&mov (\"4*$c0(%rsp)\",$xc)", # reload pair of 'c's | |
214 | "&mov (\"4*$c1(%rsp)\",$xc_)", | |
215 | "&mov ($xc,\"4*$c2(%rsp)\")", | |
216 | "&mov ($xc_,\"4*$c3(%rsp)\")", | |
217 | ||
218 | "&add (@x[$a2],@x[$b2])", # Q3 | |
219 | "&xor (@x[$d2],@x[$a2])", | |
220 | "&rol (@x[$d2],16)", | |
221 | "&add (@x[$a3],@x[$b3])", # Q4 | |
222 | "&xor (@x[$d3],@x[$a3])", | |
223 | "&rol (@x[$d3],16)", | |
224 | ||
225 | "&add ($xc,@x[$d2])", | |
226 | "&xor (@x[$b2],$xc)", | |
227 | "&rol (@x[$b2],12)", | |
228 | "&add ($xc_,@x[$d3])", | |
229 | "&xor (@x[$b3],$xc_)", | |
230 | "&rol (@x[$b3],12)", | |
231 | ||
232 | "&add (@x[$a2],@x[$b2])", | |
233 | "&xor (@x[$d2],@x[$a2])", | |
234 | "&rol (@x[$d2],8)", | |
235 | "&add (@x[$a3],@x[$b3])", | |
236 | "&xor (@x[$d3],@x[$a3])", | |
237 | "&rol (@x[$d3],8)", | |
238 | ||
239 | "&add ($xc,@x[$d2])", | |
240 | "&xor (@x[$b2],$xc)", | |
241 | "&rol (@x[$b2],7)", | |
242 | "&add ($xc_,@x[$d3])", | |
243 | "&xor (@x[$b3],$xc_)", | |
244 | "&rol (@x[$b3],7)" | |
245 | ); | |
246 | } | |
247 | ||
248 | ######################################################################## | |
249 | # Generic code path that handles all lengths on pre-SSSE3 processors. | |
250 | $code.=<<___; | |
251 | .globl ChaCha20_ctr32 | |
252 | .type ChaCha20_ctr32,\@function,5 | |
253 | .align 64 | |
254 | ChaCha20_ctr32: | |
f17652e5 | 255 | .cfi_startproc |
622a531c AP |
256 | cmp \$0,$len |
257 | je .Lno_data | |
a98c648e | 258 | mov OPENSSL_ia32cap_P+4(%rip),%r10 |
3c274a6e AP |
259 | ___ |
260 | $code.=<<___ if ($avx>2); | |
261 | bt \$48,%r10 # check for AVX512F | |
262 | jc .LChaCha20_avx512 | |
cded9513 AP |
263 | test %r10,%r10 # check for AVX512VL |
264 | js .LChaCha20_avx512vl | |
3c274a6e AP |
265 | ___ |
266 | $code.=<<___; | |
a98c648e AP |
267 | test \$`1<<(41-32)`,%r10d |
268 | jnz .LChaCha20_ssse3 | |
269 | ||
270 | push %rbx | |
f17652e5 | 271 | .cfi_push %rbx |
a98c648e | 272 | push %rbp |
f17652e5 | 273 | .cfi_push %rbp |
a98c648e | 274 | push %r12 |
f17652e5 | 275 | .cfi_push %r12 |
a98c648e | 276 | push %r13 |
f17652e5 | 277 | .cfi_push %r13 |
a98c648e | 278 | push %r14 |
f17652e5 | 279 | .cfi_push %r14 |
a98c648e | 280 | push %r15 |
f17652e5 | 281 | .cfi_push %r15 |
a98c648e | 282 | sub \$64+24,%rsp |
f17652e5 | 283 | .cfi_adjust_cfa_offset 64+24 |
384e6de4 | 284 | .Lctr32_body: |
a98c648e AP |
285 | |
286 | #movdqa .Lsigma(%rip),%xmm0 | |
287 | movdqu ($key),%xmm1 | |
288 | movdqu 16($key),%xmm2 | |
289 | movdqu ($counter),%xmm3 | |
290 | movdqa .Lone(%rip),%xmm4 | |
291 | ||
292 | #movdqa %xmm0,4*0(%rsp) # key[0] | |
293 | movdqa %xmm1,4*4(%rsp) # key[1] | |
294 | movdqa %xmm2,4*8(%rsp) # key[2] | |
295 | movdqa %xmm3,4*12(%rsp) # key[3] | |
296 | mov $len,%rbp # reassign $len | |
297 | jmp .Loop_outer | |
298 | ||
299 | .align 32 | |
300 | .Loop_outer: | |
301 | mov \$0x61707865,@x[0] # 'expa' | |
302 | mov \$0x3320646e,@x[1] # 'nd 3' | |
303 | mov \$0x79622d32,@x[2] # '2-by' | |
304 | mov \$0x6b206574,@x[3] # 'te k' | |
305 | mov 4*4(%rsp),@x[4] | |
306 | mov 4*5(%rsp),@x[5] | |
307 | mov 4*6(%rsp),@x[6] | |
308 | mov 4*7(%rsp),@x[7] | |
309 | movd %xmm3,@x[12] | |
310 | mov 4*13(%rsp),@x[13] | |
311 | mov 4*14(%rsp),@x[14] | |
312 | mov 4*15(%rsp),@x[15] | |
313 | ||
314 | mov %rbp,64+0(%rsp) # save len | |
315 | mov \$10,%ebp | |
316 | mov $inp,64+8(%rsp) # save inp | |
317 | movq %xmm2,%rsi # "@x[8]" | |
318 | mov $out,64+16(%rsp) # save out | |
319 | mov %rsi,%rdi | |
320 | shr \$32,%rdi # "@x[9]" | |
321 | jmp .Loop | |
322 | ||
323 | .align 32 | |
324 | .Loop: | |
325 | ___ | |
326 | foreach (&ROUND (0, 4, 8,12)) { eval; } | |
327 | foreach (&ROUND (0, 5,10,15)) { eval; } | |
328 | &dec ("%ebp"); | |
329 | &jnz (".Loop"); | |
330 | ||
331 | $code.=<<___; | |
332 | mov @t[1],4*9(%rsp) # modulo-scheduled | |
333 | mov @t[0],4*8(%rsp) | |
334 | mov 64(%rsp),%rbp # load len | |
335 | movdqa %xmm2,%xmm1 | |
336 | mov 64+8(%rsp),$inp # load inp | |
337 | paddd %xmm4,%xmm3 # increment counter | |
338 | mov 64+16(%rsp),$out # load out | |
339 | ||
340 | add \$0x61707865,@x[0] # 'expa' | |
341 | add \$0x3320646e,@x[1] # 'nd 3' | |
342 | add \$0x79622d32,@x[2] # '2-by' | |
343 | add \$0x6b206574,@x[3] # 'te k' | |
344 | add 4*4(%rsp),@x[4] | |
345 | add 4*5(%rsp),@x[5] | |
346 | add 4*6(%rsp),@x[6] | |
347 | add 4*7(%rsp),@x[7] | |
348 | add 4*12(%rsp),@x[12] | |
349 | add 4*13(%rsp),@x[13] | |
350 | add 4*14(%rsp),@x[14] | |
351 | add 4*15(%rsp),@x[15] | |
352 | paddd 4*8(%rsp),%xmm1 | |
353 | ||
354 | cmp \$64,%rbp | |
355 | jb .Ltail | |
356 | ||
357 | xor 4*0($inp),@x[0] # xor with input | |
358 | xor 4*1($inp),@x[1] | |
359 | xor 4*2($inp),@x[2] | |
360 | xor 4*3($inp),@x[3] | |
361 | xor 4*4($inp),@x[4] | |
362 | xor 4*5($inp),@x[5] | |
363 | xor 4*6($inp),@x[6] | |
364 | xor 4*7($inp),@x[7] | |
365 | movdqu 4*8($inp),%xmm0 | |
366 | xor 4*12($inp),@x[12] | |
367 | xor 4*13($inp),@x[13] | |
368 | xor 4*14($inp),@x[14] | |
369 | xor 4*15($inp),@x[15] | |
370 | lea 4*16($inp),$inp # inp+=64 | |
371 | pxor %xmm1,%xmm0 | |
372 | ||
373 | movdqa %xmm2,4*8(%rsp) | |
374 | movd %xmm3,4*12(%rsp) | |
375 | ||
376 | mov @x[0],4*0($out) # write output | |
377 | mov @x[1],4*1($out) | |
378 | mov @x[2],4*2($out) | |
379 | mov @x[3],4*3($out) | |
380 | mov @x[4],4*4($out) | |
381 | mov @x[5],4*5($out) | |
382 | mov @x[6],4*6($out) | |
383 | mov @x[7],4*7($out) | |
384 | movdqu %xmm0,4*8($out) | |
385 | mov @x[12],4*12($out) | |
386 | mov @x[13],4*13($out) | |
387 | mov @x[14],4*14($out) | |
388 | mov @x[15],4*15($out) | |
389 | lea 4*16($out),$out # out+=64 | |
390 | ||
391 | sub \$64,%rbp | |
392 | jnz .Loop_outer | |
393 | ||
394 | jmp .Ldone | |
395 | ||
396 | .align 16 | |
397 | .Ltail: | |
398 | mov @x[0],4*0(%rsp) | |
a98c648e | 399 | mov @x[1],4*1(%rsp) |
29880e97 | 400 | xor %rbx,%rbx |
a98c648e AP |
401 | mov @x[2],4*2(%rsp) |
402 | mov @x[3],4*3(%rsp) | |
403 | mov @x[4],4*4(%rsp) | |
404 | mov @x[5],4*5(%rsp) | |
405 | mov @x[6],4*6(%rsp) | |
406 | mov @x[7],4*7(%rsp) | |
407 | movdqa %xmm1,4*8(%rsp) | |
408 | mov @x[12],4*12(%rsp) | |
409 | mov @x[13],4*13(%rsp) | |
410 | mov @x[14],4*14(%rsp) | |
411 | mov @x[15],4*15(%rsp) | |
412 | ||
413 | .Loop_tail: | |
414 | movzb ($inp,%rbx),%eax | |
415 | movzb (%rsp,%rbx),%edx | |
416 | lea 1(%rbx),%rbx | |
417 | xor %edx,%eax | |
418 | mov %al,-1($out,%rbx) | |
419 | dec %rbp | |
420 | jnz .Loop_tail | |
421 | ||
422 | .Ldone: | |
384e6de4 | 423 | lea 64+24+48(%rsp),%rsi |
f17652e5 | 424 | .cfi_def_cfa %rsi,8 |
384e6de4 | 425 | mov -48(%rsi),%r15 |
f17652e5 | 426 | .cfi_restore %r15 |
384e6de4 | 427 | mov -40(%rsi),%r14 |
f17652e5 | 428 | .cfi_restore %r14 |
384e6de4 | 429 | mov -32(%rsi),%r13 |
f17652e5 | 430 | .cfi_restore %r13 |
384e6de4 | 431 | mov -24(%rsi),%r12 |
f17652e5 | 432 | .cfi_restore %r12 |
384e6de4 | 433 | mov -16(%rsi),%rbp |
f17652e5 | 434 | .cfi_restore %rbp |
384e6de4 | 435 | mov -8(%rsi),%rbx |
f17652e5 | 436 | .cfi_restore %rbx |
384e6de4 | 437 | lea (%rsi),%rsp |
f17652e5 | 438 | .cfi_def_cfa_register %rsp |
622a531c | 439 | .Lno_data: |
a98c648e | 440 | ret |
f17652e5 | 441 | .cfi_endproc |
a98c648e AP |
442 | .size ChaCha20_ctr32,.-ChaCha20_ctr32 |
443 | ___ | |
444 | ||
445 | ######################################################################## | |
446 | # SSSE3 code path that handles shorter lengths | |
447 | { | |
448 | my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(0..7)); | |
449 | ||
450 | sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round | |
451 | &paddd ($a,$b); | |
452 | &pxor ($d,$a); | |
453 | &pshufb ($d,$rot16); | |
454 | ||
455 | &paddd ($c,$d); | |
456 | &pxor ($b,$c); | |
457 | &movdqa ($t,$b); | |
458 | &psrld ($b,20); | |
459 | &pslld ($t,12); | |
460 | &por ($b,$t); | |
461 | ||
462 | &paddd ($a,$b); | |
463 | &pxor ($d,$a); | |
464 | &pshufb ($d,$rot24); | |
465 | ||
466 | &paddd ($c,$d); | |
467 | &pxor ($b,$c); | |
468 | &movdqa ($t,$b); | |
469 | &psrld ($b,25); | |
470 | &pslld ($t,7); | |
471 | &por ($b,$t); | |
472 | } | |
473 | ||
384e6de4 | 474 | my $xframe = $win64 ? 32+8 : 8; |
a98c648e AP |
475 | |
476 | $code.=<<___; | |
477 | .type ChaCha20_ssse3,\@function,5 | |
478 | .align 32 | |
479 | ChaCha20_ssse3: | |
f17652e5 | 480 | .cfi_startproc |
a98c648e | 481 | .LChaCha20_ssse3: |
384e6de4 | 482 | mov %rsp,%r9 # frame pointer |
f17652e5 | 483 | .cfi_def_cfa_register %r9 |
a98c648e AP |
484 | ___ |
485 | $code.=<<___ if ($avx); | |
486 | test \$`1<<(43-32)`,%r10d | |
487 | jnz .LChaCha20_4xop # XOP is fastest even if we use 1/4 | |
488 | ___ | |
489 | $code.=<<___; | |
490 | cmp \$128,$len # we might throw away some data, | |
d5487a45 | 491 | je .LChaCha20_128 |
a98c648e AP |
492 | ja .LChaCha20_4x # but overall it won't be slower |
493 | ||
494 | .Ldo_sse3_after_all: | |
a98c648e AP |
495 | sub \$64+$xframe,%rsp |
496 | ___ | |
497 | $code.=<<___ if ($win64); | |
384e6de4 AP |
498 | movaps %xmm6,-0x28(%r9) |
499 | movaps %xmm7,-0x18(%r9) | |
500 | .Lssse3_body: | |
a98c648e AP |
501 | ___ |
502 | $code.=<<___; | |
503 | movdqa .Lsigma(%rip),$a | |
504 | movdqu ($key),$b | |
505 | movdqu 16($key),$c | |
506 | movdqu ($counter),$d | |
507 | movdqa .Lrot16(%rip),$rot16 | |
508 | movdqa .Lrot24(%rip),$rot24 | |
509 | ||
510 | movdqa $a,0x00(%rsp) | |
511 | movdqa $b,0x10(%rsp) | |
512 | movdqa $c,0x20(%rsp) | |
513 | movdqa $d,0x30(%rsp) | |
3c274a6e | 514 | mov \$10,$counter # reuse $counter |
a98c648e AP |
515 | jmp .Loop_ssse3 |
516 | ||
517 | .align 32 | |
518 | .Loop_outer_ssse3: | |
519 | movdqa .Lone(%rip),$d | |
520 | movdqa 0x00(%rsp),$a | |
521 | movdqa 0x10(%rsp),$b | |
522 | movdqa 0x20(%rsp),$c | |
523 | paddd 0x30(%rsp),$d | |
3c274a6e | 524 | mov \$10,$counter |
a98c648e AP |
525 | movdqa $d,0x30(%rsp) |
526 | jmp .Loop_ssse3 | |
527 | ||
528 | .align 32 | |
529 | .Loop_ssse3: | |
530 | ___ | |
531 | &SSSE3ROUND(); | |
532 | &pshufd ($c,$c,0b01001110); | |
533 | &pshufd ($b,$b,0b00111001); | |
534 | &pshufd ($d,$d,0b10010011); | |
535 | &nop (); | |
536 | ||
537 | &SSSE3ROUND(); | |
538 | &pshufd ($c,$c,0b01001110); | |
539 | &pshufd ($b,$b,0b10010011); | |
540 | &pshufd ($d,$d,0b00111001); | |
541 | ||
3c274a6e | 542 | &dec ($counter); |
a98c648e AP |
543 | &jnz (".Loop_ssse3"); |
544 | ||
545 | $code.=<<___; | |
546 | paddd 0x00(%rsp),$a | |
547 | paddd 0x10(%rsp),$b | |
548 | paddd 0x20(%rsp),$c | |
549 | paddd 0x30(%rsp),$d | |
550 | ||
551 | cmp \$64,$len | |
552 | jb .Ltail_ssse3 | |
553 | ||
554 | movdqu 0x00($inp),$t | |
555 | movdqu 0x10($inp),$t1 | |
556 | pxor $t,$a # xor with input | |
557 | movdqu 0x20($inp),$t | |
558 | pxor $t1,$b | |
559 | movdqu 0x30($inp),$t1 | |
560 | lea 0x40($inp),$inp # inp+=64 | |
561 | pxor $t,$c | |
562 | pxor $t1,$d | |
563 | ||
564 | movdqu $a,0x00($out) # write output | |
565 | movdqu $b,0x10($out) | |
566 | movdqu $c,0x20($out) | |
567 | movdqu $d,0x30($out) | |
568 | lea 0x40($out),$out # out+=64 | |
569 | ||
570 | sub \$64,$len | |
571 | jnz .Loop_outer_ssse3 | |
572 | ||
573 | jmp .Ldone_ssse3 | |
574 | ||
575 | .align 16 | |
576 | .Ltail_ssse3: | |
577 | movdqa $a,0x00(%rsp) | |
578 | movdqa $b,0x10(%rsp) | |
579 | movdqa $c,0x20(%rsp) | |
580 | movdqa $d,0x30(%rsp) | |
3c274a6e | 581 | xor $counter,$counter |
a98c648e AP |
582 | |
583 | .Loop_tail_ssse3: | |
3c274a6e AP |
584 | movzb ($inp,$counter),%eax |
585 | movzb (%rsp,$counter),%ecx | |
586 | lea 1($counter),$counter | |
29880e97 | 587 | xor %ecx,%eax |
3c274a6e | 588 | mov %al,-1($out,$counter) |
29880e97 | 589 | dec $len |
a98c648e AP |
590 | jnz .Loop_tail_ssse3 |
591 | ||
592 | .Ldone_ssse3: | |
593 | ___ | |
594 | $code.=<<___ if ($win64); | |
384e6de4 AP |
595 | movaps -0x28(%r9),%xmm6 |
596 | movaps -0x18(%r9),%xmm7 | |
a98c648e AP |
597 | ___ |
598 | $code.=<<___; | |
384e6de4 | 599 | lea (%r9),%rsp |
f17652e5 | 600 | .cfi_def_cfa_register %rsp |
384e6de4 | 601 | .Lssse3_epilogue: |
a98c648e | 602 | ret |
f17652e5 | 603 | .cfi_endproc |
a98c648e AP |
604 | .size ChaCha20_ssse3,.-ChaCha20_ssse3 |
605 | ___ | |
606 | } | |
607 | ||
d5487a45 AP |
608 | ######################################################################## |
609 | # SSSE3 code path that handles 128-byte inputs | |
610 | { | |
611 | my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(8,9,2..7)); | |
612 | my ($a1,$b1,$c1,$d1)=map("%xmm$_",(10,11,0,1)); | |
613 | ||
614 | sub SSSE3ROUND_2x { | |
615 | &paddd ($a,$b); | |
616 | &pxor ($d,$a); | |
617 | &paddd ($a1,$b1); | |
618 | &pxor ($d1,$a1); | |
619 | &pshufb ($d,$rot16); | |
620 | &pshufb($d1,$rot16); | |
621 | ||
622 | &paddd ($c,$d); | |
623 | &paddd ($c1,$d1); | |
624 | &pxor ($b,$c); | |
625 | &pxor ($b1,$c1); | |
626 | &movdqa ($t,$b); | |
627 | &psrld ($b,20); | |
628 | &movdqa($t1,$b1); | |
629 | &pslld ($t,12); | |
630 | &psrld ($b1,20); | |
631 | &por ($b,$t); | |
632 | &pslld ($t1,12); | |
633 | &por ($b1,$t1); | |
634 | ||
635 | &paddd ($a,$b); | |
636 | &pxor ($d,$a); | |
637 | &paddd ($a1,$b1); | |
638 | &pxor ($d1,$a1); | |
639 | &pshufb ($d,$rot24); | |
640 | &pshufb($d1,$rot24); | |
641 | ||
642 | &paddd ($c,$d); | |
643 | &paddd ($c1,$d1); | |
644 | &pxor ($b,$c); | |
645 | &pxor ($b1,$c1); | |
646 | &movdqa ($t,$b); | |
647 | &psrld ($b,25); | |
648 | &movdqa($t1,$b1); | |
649 | &pslld ($t,7); | |
650 | &psrld ($b1,25); | |
651 | &por ($b,$t); | |
652 | &pslld ($t1,7); | |
653 | &por ($b1,$t1); | |
654 | } | |
655 | ||
656 | my $xframe = $win64 ? 0x68 : 8; | |
657 | ||
658 | $code.=<<___; | |
659 | .type ChaCha20_128,\@function,5 | |
660 | .align 32 | |
661 | ChaCha20_128: | |
662 | .cfi_startproc | |
663 | .LChaCha20_128: | |
664 | mov %rsp,%r9 # frame pointer | |
665 | .cfi_def_cfa_register %r9 | |
666 | sub \$64+$xframe,%rsp | |
667 | ___ | |
668 | $code.=<<___ if ($win64); | |
669 | movaps %xmm6,-0x68(%r9) | |
670 | movaps %xmm7,-0x58(%r9) | |
671 | movaps %xmm8,-0x48(%r9) | |
672 | movaps %xmm9,-0x38(%r9) | |
673 | movaps %xmm10,-0x28(%r9) | |
674 | movaps %xmm11,-0x18(%r9) | |
675 | .L128_body: | |
676 | ___ | |
677 | $code.=<<___; | |
678 | movdqa .Lsigma(%rip),$a | |
679 | movdqu ($key),$b | |
680 | movdqu 16($key),$c | |
681 | movdqu ($counter),$d | |
682 | movdqa .Lone(%rip),$d1 | |
683 | movdqa .Lrot16(%rip),$rot16 | |
684 | movdqa .Lrot24(%rip),$rot24 | |
685 | ||
686 | movdqa $a,$a1 | |
687 | movdqa $a,0x00(%rsp) | |
688 | movdqa $b,$b1 | |
689 | movdqa $b,0x10(%rsp) | |
690 | movdqa $c,$c1 | |
691 | movdqa $c,0x20(%rsp) | |
692 | paddd $d,$d1 | |
693 | movdqa $d,0x30(%rsp) | |
694 | mov \$10,$counter # reuse $counter | |
695 | jmp .Loop_128 | |
696 | ||
697 | .align 32 | |
698 | .Loop_128: | |
699 | ___ | |
700 | &SSSE3ROUND_2x(); | |
701 | &pshufd ($c,$c,0b01001110); | |
702 | &pshufd ($b,$b,0b00111001); | |
703 | &pshufd ($d,$d,0b10010011); | |
704 | &pshufd ($c1,$c1,0b01001110); | |
705 | &pshufd ($b1,$b1,0b00111001); | |
706 | &pshufd ($d1,$d1,0b10010011); | |
707 | ||
708 | &SSSE3ROUND_2x(); | |
709 | &pshufd ($c,$c,0b01001110); | |
710 | &pshufd ($b,$b,0b10010011); | |
711 | &pshufd ($d,$d,0b00111001); | |
712 | &pshufd ($c1,$c1,0b01001110); | |
713 | &pshufd ($b1,$b1,0b10010011); | |
714 | &pshufd ($d1,$d1,0b00111001); | |
715 | ||
716 | &dec ($counter); | |
717 | &jnz (".Loop_128"); | |
718 | ||
719 | $code.=<<___; | |
720 | paddd 0x00(%rsp),$a | |
721 | paddd 0x10(%rsp),$b | |
722 | paddd 0x20(%rsp),$c | |
723 | paddd 0x30(%rsp),$d | |
724 | paddd .Lone(%rip),$d1 | |
725 | paddd 0x00(%rsp),$a1 | |
726 | paddd 0x10(%rsp),$b1 | |
727 | paddd 0x20(%rsp),$c1 | |
728 | paddd 0x30(%rsp),$d1 | |
729 | ||
730 | movdqu 0x00($inp),$t | |
731 | movdqu 0x10($inp),$t1 | |
732 | pxor $t,$a # xor with input | |
733 | movdqu 0x20($inp),$t | |
734 | pxor $t1,$b | |
735 | movdqu 0x30($inp),$t1 | |
736 | pxor $t,$c | |
737 | movdqu 0x40($inp),$t | |
738 | pxor $t1,$d | |
739 | movdqu 0x50($inp),$t1 | |
740 | pxor $t,$a1 | |
741 | movdqu 0x60($inp),$t | |
742 | pxor $t1,$b1 | |
743 | movdqu 0x70($inp),$t1 | |
744 | pxor $t,$c1 | |
745 | pxor $t1,$d1 | |
746 | ||
747 | movdqu $a,0x00($out) # write output | |
748 | movdqu $b,0x10($out) | |
749 | movdqu $c,0x20($out) | |
750 | movdqu $d,0x30($out) | |
751 | movdqu $a1,0x40($out) | |
752 | movdqu $b1,0x50($out) | |
753 | movdqu $c1,0x60($out) | |
754 | movdqu $d1,0x70($out) | |
755 | ___ | |
756 | $code.=<<___ if ($win64); | |
757 | movaps -0x68(%r9),%xmm6 | |
758 | movaps -0x58(%r9),%xmm7 | |
759 | movaps -0x48(%r9),%xmm8 | |
760 | movaps -0x38(%r9),%xmm9 | |
761 | movaps -0x28(%r9),%xmm10 | |
762 | movaps -0x18(%r9),%xmm11 | |
763 | ___ | |
764 | $code.=<<___; | |
765 | lea (%r9),%rsp | |
766 | .cfi_def_cfa_register %rsp | |
767 | .L128_epilogue: | |
768 | ret | |
769 | .cfi_endproc | |
770 | .size ChaCha20_128,.-ChaCha20_128 | |
771 | ___ | |
772 | } | |
773 | ||
a98c648e AP |
774 | ######################################################################## |
775 | # SSSE3 code path that handles longer messages. | |
776 | { | |
777 | # assign variables to favor Atom front-end | |
778 | my ($xd0,$xd1,$xd2,$xd3, $xt0,$xt1,$xt2,$xt3, | |
779 | $xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3)=map("%xmm$_",(0..15)); | |
780 | my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, | |
781 | "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3); | |
782 | ||
783 | sub SSSE3_lane_ROUND { | |
784 | my ($a0,$b0,$c0,$d0)=@_; | |
785 | my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); | |
786 | my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); | |
787 | my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); | |
788 | my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3); | |
789 | my @x=map("\"$_\"",@xx); | |
790 | ||
791 | # Consider order in which variables are addressed by their | |
792 | # index: | |
793 | # | |
794 | # a b c d | |
795 | # | |
796 | # 0 4 8 12 < even round | |
797 | # 1 5 9 13 | |
798 | # 2 6 10 14 | |
799 | # 3 7 11 15 | |
800 | # 0 5 10 15 < odd round | |
801 | # 1 6 11 12 | |
802 | # 2 7 8 13 | |
803 | # 3 4 9 14 | |
804 | # | |
805 | # 'a', 'b' and 'd's are permanently allocated in registers, | |
806 | # @x[0..7,12..15], while 'c's are maintained in memory. If | |
807 | # you observe 'c' column, you'll notice that pair of 'c's is | |
808 | # invariant between rounds. This means that we have to reload | |
809 | # them once per round, in the middle. This is why you'll see | |
810 | # bunch of 'c' stores and loads in the middle, but none in | |
811 | # the beginning or end. | |
812 | ||
813 | ( | |
814 | "&paddd (@x[$a0],@x[$b0])", # Q1 | |
815 | "&paddd (@x[$a1],@x[$b1])", # Q2 | |
816 | "&pxor (@x[$d0],@x[$a0])", | |
817 | "&pxor (@x[$d1],@x[$a1])", | |
818 | "&pshufb (@x[$d0],$t1)", | |
819 | "&pshufb (@x[$d1],$t1)", | |
820 | ||
821 | "&paddd ($xc,@x[$d0])", | |
822 | "&paddd ($xc_,@x[$d1])", | |
823 | "&pxor (@x[$b0],$xc)", | |
824 | "&pxor (@x[$b1],$xc_)", | |
825 | "&movdqa ($t0,@x[$b0])", | |
826 | "&pslld (@x[$b0],12)", | |
827 | "&psrld ($t0,20)", | |
828 | "&movdqa ($t1,@x[$b1])", | |
829 | "&pslld (@x[$b1],12)", | |
830 | "&por (@x[$b0],$t0)", | |
831 | "&psrld ($t1,20)", | |
832 | "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip) | |
833 | "&por (@x[$b1],$t1)", | |
834 | ||
835 | "&paddd (@x[$a0],@x[$b0])", | |
836 | "&paddd (@x[$a1],@x[$b1])", | |
837 | "&pxor (@x[$d0],@x[$a0])", | |
838 | "&pxor (@x[$d1],@x[$a1])", | |
839 | "&pshufb (@x[$d0],$t0)", | |
840 | "&pshufb (@x[$d1],$t0)", | |
841 | ||
842 | "&paddd ($xc,@x[$d0])", | |
843 | "&paddd ($xc_,@x[$d1])", | |
844 | "&pxor (@x[$b0],$xc)", | |
845 | "&pxor (@x[$b1],$xc_)", | |
846 | "&movdqa ($t1,@x[$b0])", | |
847 | "&pslld (@x[$b0],7)", | |
848 | "&psrld ($t1,25)", | |
849 | "&movdqa ($t0,@x[$b1])", | |
850 | "&pslld (@x[$b1],7)", | |
851 | "&por (@x[$b0],$t1)", | |
852 | "&psrld ($t0,25)", | |
853 | "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip) | |
854 | "&por (@x[$b1],$t0)", | |
855 | ||
856 | "&movdqa (\"`16*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's | |
857 | "&movdqa (\"`16*($c1-8)`(%rsp)\",$xc_)", | |
858 | "&movdqa ($xc,\"`16*($c2-8)`(%rsp)\")", | |
859 | "&movdqa ($xc_,\"`16*($c3-8)`(%rsp)\")", | |
860 | ||
861 | "&paddd (@x[$a2],@x[$b2])", # Q3 | |
862 | "&paddd (@x[$a3],@x[$b3])", # Q4 | |
863 | "&pxor (@x[$d2],@x[$a2])", | |
864 | "&pxor (@x[$d3],@x[$a3])", | |
865 | "&pshufb (@x[$d2],$t1)", | |
866 | "&pshufb (@x[$d3],$t1)", | |
867 | ||
868 | "&paddd ($xc,@x[$d2])", | |
869 | "&paddd ($xc_,@x[$d3])", | |
870 | "&pxor (@x[$b2],$xc)", | |
871 | "&pxor (@x[$b3],$xc_)", | |
872 | "&movdqa ($t0,@x[$b2])", | |
873 | "&pslld (@x[$b2],12)", | |
874 | "&psrld ($t0,20)", | |
875 | "&movdqa ($t1,@x[$b3])", | |
876 | "&pslld (@x[$b3],12)", | |
877 | "&por (@x[$b2],$t0)", | |
878 | "&psrld ($t1,20)", | |
879 | "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip) | |
880 | "&por (@x[$b3],$t1)", | |
881 | ||
882 | "&paddd (@x[$a2],@x[$b2])", | |
883 | "&paddd (@x[$a3],@x[$b3])", | |
884 | "&pxor (@x[$d2],@x[$a2])", | |
885 | "&pxor (@x[$d3],@x[$a3])", | |
886 | "&pshufb (@x[$d2],$t0)", | |
887 | "&pshufb (@x[$d3],$t0)", | |
888 | ||
889 | "&paddd ($xc,@x[$d2])", | |
890 | "&paddd ($xc_,@x[$d3])", | |
891 | "&pxor (@x[$b2],$xc)", | |
892 | "&pxor (@x[$b3],$xc_)", | |
893 | "&movdqa ($t1,@x[$b2])", | |
894 | "&pslld (@x[$b2],7)", | |
895 | "&psrld ($t1,25)", | |
896 | "&movdqa ($t0,@x[$b3])", | |
897 | "&pslld (@x[$b3],7)", | |
898 | "&por (@x[$b2],$t1)", | |
899 | "&psrld ($t0,25)", | |
900 | "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip) | |
901 | "&por (@x[$b3],$t0)" | |
902 | ); | |
903 | } | |
904 | ||
384e6de4 | 905 | my $xframe = $win64 ? 0xa8 : 8; |
a98c648e AP |
906 | |
907 | $code.=<<___; | |
908 | .type ChaCha20_4x,\@function,5 | |
909 | .align 32 | |
910 | ChaCha20_4x: | |
f17652e5 | 911 | .cfi_startproc |
a98c648e | 912 | .LChaCha20_4x: |
384e6de4 | 913 | mov %rsp,%r9 # frame pointer |
f17652e5 | 914 | .cfi_def_cfa_register %r9 |
a98c648e AP |
915 | mov %r10,%r11 |
916 | ___ | |
917 | $code.=<<___ if ($avx>1); | |
918 | shr \$32,%r10 # OPENSSL_ia32cap_P+8 | |
919 | test \$`1<<5`,%r10 # test AVX2 | |
920 | jnz .LChaCha20_8x | |
921 | ___ | |
922 | $code.=<<___; | |
923 | cmp \$192,$len | |
924 | ja .Lproceed4x | |
925 | ||
926 | and \$`1<<26|1<<22`,%r11 # isolate XSAVE+MOVBE | |
927 | cmp \$`1<<22`,%r11 # check for MOVBE without XSAVE | |
928 | je .Ldo_sse3_after_all # to detect Atom | |
929 | ||
930 | .Lproceed4x: | |
384e6de4 | 931 | sub \$0x140+$xframe,%rsp |
a98c648e AP |
932 | ___ |
933 | ################ stack layout | |
934 | # +0x00 SIMD equivalent of @x[8-12] | |
935 | # ... | |
936 | # +0x40 constant copy of key[0-2] smashed by lanes | |
937 | # ... | |
938 | # +0x100 SIMD counters (with nonce smashed by lanes) | |
939 | # ... | |
940 | # +0x140 | |
941 | $code.=<<___ if ($win64); | |
384e6de4 AP |
942 | movaps %xmm6,-0xa8(%r9) |
943 | movaps %xmm7,-0x98(%r9) | |
944 | movaps %xmm8,-0x88(%r9) | |
945 | movaps %xmm9,-0x78(%r9) | |
946 | movaps %xmm10,-0x68(%r9) | |
947 | movaps %xmm11,-0x58(%r9) | |
948 | movaps %xmm12,-0x48(%r9) | |
949 | movaps %xmm13,-0x38(%r9) | |
950 | movaps %xmm14,-0x28(%r9) | |
951 | movaps %xmm15,-0x18(%r9) | |
952 | .L4x_body: | |
a98c648e AP |
953 | ___ |
954 | $code.=<<___; | |
955 | movdqa .Lsigma(%rip),$xa3 # key[0] | |
956 | movdqu ($key),$xb3 # key[1] | |
957 | movdqu 16($key),$xt3 # key[2] | |
958 | movdqu ($counter),$xd3 # key[3] | |
959 | lea 0x100(%rsp),%rcx # size optimization | |
960 | lea .Lrot16(%rip),%r10 | |
961 | lea .Lrot24(%rip),%r11 | |
962 | ||
963 | pshufd \$0x00,$xa3,$xa0 # smash key by lanes... | |
964 | pshufd \$0x55,$xa3,$xa1 | |
965 | movdqa $xa0,0x40(%rsp) # ... and offload | |
966 | pshufd \$0xaa,$xa3,$xa2 | |
967 | movdqa $xa1,0x50(%rsp) | |
968 | pshufd \$0xff,$xa3,$xa3 | |
969 | movdqa $xa2,0x60(%rsp) | |
970 | movdqa $xa3,0x70(%rsp) | |
971 | ||
972 | pshufd \$0x00,$xb3,$xb0 | |
973 | pshufd \$0x55,$xb3,$xb1 | |
974 | movdqa $xb0,0x80-0x100(%rcx) | |
975 | pshufd \$0xaa,$xb3,$xb2 | |
976 | movdqa $xb1,0x90-0x100(%rcx) | |
977 | pshufd \$0xff,$xb3,$xb3 | |
978 | movdqa $xb2,0xa0-0x100(%rcx) | |
979 | movdqa $xb3,0xb0-0x100(%rcx) | |
980 | ||
981 | pshufd \$0x00,$xt3,$xt0 # "$xc0" | |
982 | pshufd \$0x55,$xt3,$xt1 # "$xc1" | |
983 | movdqa $xt0,0xc0-0x100(%rcx) | |
984 | pshufd \$0xaa,$xt3,$xt2 # "$xc2" | |
985 | movdqa $xt1,0xd0-0x100(%rcx) | |
986 | pshufd \$0xff,$xt3,$xt3 # "$xc3" | |
987 | movdqa $xt2,0xe0-0x100(%rcx) | |
988 | movdqa $xt3,0xf0-0x100(%rcx) | |
989 | ||
990 | pshufd \$0x00,$xd3,$xd0 | |
991 | pshufd \$0x55,$xd3,$xd1 | |
992 | paddd .Linc(%rip),$xd0 # don't save counters yet | |
993 | pshufd \$0xaa,$xd3,$xd2 | |
994 | movdqa $xd1,0x110-0x100(%rcx) | |
995 | pshufd \$0xff,$xd3,$xd3 | |
996 | movdqa $xd2,0x120-0x100(%rcx) | |
997 | movdqa $xd3,0x130-0x100(%rcx) | |
998 | ||
999 | jmp .Loop_enter4x | |
1000 | ||
1001 | .align 32 | |
1002 | .Loop_outer4x: | |
1003 | movdqa 0x40(%rsp),$xa0 # re-load smashed key | |
1004 | movdqa 0x50(%rsp),$xa1 | |
1005 | movdqa 0x60(%rsp),$xa2 | |
1006 | movdqa 0x70(%rsp),$xa3 | |
1007 | movdqa 0x80-0x100(%rcx),$xb0 | |
1008 | movdqa 0x90-0x100(%rcx),$xb1 | |
1009 | movdqa 0xa0-0x100(%rcx),$xb2 | |
1010 | movdqa 0xb0-0x100(%rcx),$xb3 | |
1011 | movdqa 0xc0-0x100(%rcx),$xt0 # "$xc0" | |
1012 | movdqa 0xd0-0x100(%rcx),$xt1 # "$xc1" | |
1013 | movdqa 0xe0-0x100(%rcx),$xt2 # "$xc2" | |
1014 | movdqa 0xf0-0x100(%rcx),$xt3 # "$xc3" | |
1015 | movdqa 0x100-0x100(%rcx),$xd0 | |
1016 | movdqa 0x110-0x100(%rcx),$xd1 | |
1017 | movdqa 0x120-0x100(%rcx),$xd2 | |
1018 | movdqa 0x130-0x100(%rcx),$xd3 | |
1019 | paddd .Lfour(%rip),$xd0 # next SIMD counters | |
1020 | ||
1021 | .Loop_enter4x: | |
1022 | movdqa $xt2,0x20(%rsp) # SIMD equivalent of "@x[10]" | |
1023 | movdqa $xt3,0x30(%rsp) # SIMD equivalent of "@x[11]" | |
1024 | movdqa (%r10),$xt3 # .Lrot16(%rip) | |
1025 | mov \$10,%eax | |
1026 | movdqa $xd0,0x100-0x100(%rcx) # save SIMD counters | |
1027 | jmp .Loop4x | |
1028 | ||
1029 | .align 32 | |
1030 | .Loop4x: | |
1031 | ___ | |
1032 | foreach (&SSSE3_lane_ROUND(0, 4, 8,12)) { eval; } | |
1033 | foreach (&SSSE3_lane_ROUND(0, 5,10,15)) { eval; } | |
1034 | $code.=<<___; | |
1035 | dec %eax | |
1036 | jnz .Loop4x | |
1037 | ||
1038 | paddd 0x40(%rsp),$xa0 # accumulate key material | |
1039 | paddd 0x50(%rsp),$xa1 | |
1040 | paddd 0x60(%rsp),$xa2 | |
1041 | paddd 0x70(%rsp),$xa3 | |
1042 | ||
1043 | movdqa $xa0,$xt2 # "de-interlace" data | |
1044 | punpckldq $xa1,$xa0 | |
1045 | movdqa $xa2,$xt3 | |
1046 | punpckldq $xa3,$xa2 | |
1047 | punpckhdq $xa1,$xt2 | |
1048 | punpckhdq $xa3,$xt3 | |
1049 | movdqa $xa0,$xa1 | |
1050 | punpcklqdq $xa2,$xa0 # "a0" | |
1051 | movdqa $xt2,$xa3 | |
1052 | punpcklqdq $xt3,$xt2 # "a2" | |
1053 | punpckhqdq $xa2,$xa1 # "a1" | |
1054 | punpckhqdq $xt3,$xa3 # "a3" | |
1055 | ___ | |
1056 | ($xa2,$xt2)=($xt2,$xa2); | |
1057 | $code.=<<___; | |
1058 | paddd 0x80-0x100(%rcx),$xb0 | |
1059 | paddd 0x90-0x100(%rcx),$xb1 | |
1060 | paddd 0xa0-0x100(%rcx),$xb2 | |
1061 | paddd 0xb0-0x100(%rcx),$xb3 | |
1062 | ||
1063 | movdqa $xa0,0x00(%rsp) # offload $xaN | |
1064 | movdqa $xa1,0x10(%rsp) | |
1065 | movdqa 0x20(%rsp),$xa0 # "xc2" | |
1066 | movdqa 0x30(%rsp),$xa1 # "xc3" | |
1067 | ||
1068 | movdqa $xb0,$xt2 | |
1069 | punpckldq $xb1,$xb0 | |
1070 | movdqa $xb2,$xt3 | |
1071 | punpckldq $xb3,$xb2 | |
1072 | punpckhdq $xb1,$xt2 | |
1073 | punpckhdq $xb3,$xt3 | |
1074 | movdqa $xb0,$xb1 | |
1075 | punpcklqdq $xb2,$xb0 # "b0" | |
1076 | movdqa $xt2,$xb3 | |
1077 | punpcklqdq $xt3,$xt2 # "b2" | |
1078 | punpckhqdq $xb2,$xb1 # "b1" | |
1079 | punpckhqdq $xt3,$xb3 # "b3" | |
1080 | ___ | |
1081 | ($xb2,$xt2)=($xt2,$xb2); | |
1082 | my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1); | |
1083 | $code.=<<___; | |
1084 | paddd 0xc0-0x100(%rcx),$xc0 | |
1085 | paddd 0xd0-0x100(%rcx),$xc1 | |
1086 | paddd 0xe0-0x100(%rcx),$xc2 | |
1087 | paddd 0xf0-0x100(%rcx),$xc3 | |
1088 | ||
1089 | movdqa $xa2,0x20(%rsp) # keep offloading $xaN | |
1090 | movdqa $xa3,0x30(%rsp) | |
1091 | ||
1092 | movdqa $xc0,$xt2 | |
1093 | punpckldq $xc1,$xc0 | |
1094 | movdqa $xc2,$xt3 | |
1095 | punpckldq $xc3,$xc2 | |
1096 | punpckhdq $xc1,$xt2 | |
1097 | punpckhdq $xc3,$xt3 | |
1098 | movdqa $xc0,$xc1 | |
1099 | punpcklqdq $xc2,$xc0 # "c0" | |
1100 | movdqa $xt2,$xc3 | |
1101 | punpcklqdq $xt3,$xt2 # "c2" | |
1102 | punpckhqdq $xc2,$xc1 # "c1" | |
1103 | punpckhqdq $xt3,$xc3 # "c3" | |
1104 | ___ | |
1105 | ($xc2,$xt2)=($xt2,$xc2); | |
1106 | ($xt0,$xt1)=($xa2,$xa3); # use $xaN as temporary | |
1107 | $code.=<<___; | |
1108 | paddd 0x100-0x100(%rcx),$xd0 | |
1109 | paddd 0x110-0x100(%rcx),$xd1 | |
1110 | paddd 0x120-0x100(%rcx),$xd2 | |
1111 | paddd 0x130-0x100(%rcx),$xd3 | |
1112 | ||
1113 | movdqa $xd0,$xt2 | |
1114 | punpckldq $xd1,$xd0 | |
1115 | movdqa $xd2,$xt3 | |
1116 | punpckldq $xd3,$xd2 | |
1117 | punpckhdq $xd1,$xt2 | |
1118 | punpckhdq $xd3,$xt3 | |
1119 | movdqa $xd0,$xd1 | |
1120 | punpcklqdq $xd2,$xd0 # "d0" | |
1121 | movdqa $xt2,$xd3 | |
1122 | punpcklqdq $xt3,$xt2 # "d2" | |
1123 | punpckhqdq $xd2,$xd1 # "d1" | |
1124 | punpckhqdq $xt3,$xd3 # "d3" | |
1125 | ___ | |
1126 | ($xd2,$xt2)=($xt2,$xd2); | |
1127 | $code.=<<___; | |
1128 | cmp \$64*4,$len | |
1129 | jb .Ltail4x | |
1130 | ||
1131 | movdqu 0x00($inp),$xt0 # xor with input | |
1132 | movdqu 0x10($inp),$xt1 | |
1133 | movdqu 0x20($inp),$xt2 | |
1134 | movdqu 0x30($inp),$xt3 | |
1135 | pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember? | |
1136 | pxor $xb0,$xt1 | |
1137 | pxor $xc0,$xt2 | |
1138 | pxor $xd0,$xt3 | |
1139 | ||
1140 | movdqu $xt0,0x00($out) | |
1141 | movdqu 0x40($inp),$xt0 | |
1142 | movdqu $xt1,0x10($out) | |
1143 | movdqu 0x50($inp),$xt1 | |
1144 | movdqu $xt2,0x20($out) | |
1145 | movdqu 0x60($inp),$xt2 | |
1146 | movdqu $xt3,0x30($out) | |
1147 | movdqu 0x70($inp),$xt3 | |
1148 | lea 0x80($inp),$inp # size optimization | |
1149 | pxor 0x10(%rsp),$xt0 | |
1150 | pxor $xb1,$xt1 | |
1151 | pxor $xc1,$xt2 | |
1152 | pxor $xd1,$xt3 | |
1153 | ||
1154 | movdqu $xt0,0x40($out) | |
1155 | movdqu 0x00($inp),$xt0 | |
1156 | movdqu $xt1,0x50($out) | |
1157 | movdqu 0x10($inp),$xt1 | |
1158 | movdqu $xt2,0x60($out) | |
1159 | movdqu 0x20($inp),$xt2 | |
1160 | movdqu $xt3,0x70($out) | |
1161 | lea 0x80($out),$out # size optimization | |
1162 | movdqu 0x30($inp),$xt3 | |
1163 | pxor 0x20(%rsp),$xt0 | |
1164 | pxor $xb2,$xt1 | |
1165 | pxor $xc2,$xt2 | |
1166 | pxor $xd2,$xt3 | |
1167 | ||
1168 | movdqu $xt0,0x00($out) | |
1169 | movdqu 0x40($inp),$xt0 | |
1170 | movdqu $xt1,0x10($out) | |
1171 | movdqu 0x50($inp),$xt1 | |
1172 | movdqu $xt2,0x20($out) | |
1173 | movdqu 0x60($inp),$xt2 | |
1174 | movdqu $xt3,0x30($out) | |
1175 | movdqu 0x70($inp),$xt3 | |
1176 | lea 0x80($inp),$inp # inp+=64*4 | |
1177 | pxor 0x30(%rsp),$xt0 | |
1178 | pxor $xb3,$xt1 | |
1179 | pxor $xc3,$xt2 | |
1180 | pxor $xd3,$xt3 | |
1181 | movdqu $xt0,0x40($out) | |
1182 | movdqu $xt1,0x50($out) | |
1183 | movdqu $xt2,0x60($out) | |
1184 | movdqu $xt3,0x70($out) | |
1185 | lea 0x80($out),$out # out+=64*4 | |
1186 | ||
1187 | sub \$64*4,$len | |
1188 | jnz .Loop_outer4x | |
1189 | ||
1190 | jmp .Ldone4x | |
1191 | ||
1192 | .Ltail4x: | |
1193 | cmp \$192,$len | |
1194 | jae .L192_or_more4x | |
1195 | cmp \$128,$len | |
1196 | jae .L128_or_more4x | |
1197 | cmp \$64,$len | |
1198 | jae .L64_or_more4x | |
1199 | ||
1200 | #movdqa 0x00(%rsp),$xt0 # $xaN is offloaded, remember? | |
1201 | xor %r10,%r10 | |
1202 | #movdqa $xt0,0x00(%rsp) | |
1203 | movdqa $xb0,0x10(%rsp) | |
1204 | movdqa $xc0,0x20(%rsp) | |
1205 | movdqa $xd0,0x30(%rsp) | |
1206 | jmp .Loop_tail4x | |
1207 | ||
1208 | .align 32 | |
1209 | .L64_or_more4x: | |
1210 | movdqu 0x00($inp),$xt0 # xor with input | |
1211 | movdqu 0x10($inp),$xt1 | |
1212 | movdqu 0x20($inp),$xt2 | |
1213 | movdqu 0x30($inp),$xt3 | |
1214 | pxor 0x00(%rsp),$xt0 # $xaxN is offloaded, remember? | |
1215 | pxor $xb0,$xt1 | |
1216 | pxor $xc0,$xt2 | |
1217 | pxor $xd0,$xt3 | |
1218 | movdqu $xt0,0x00($out) | |
1219 | movdqu $xt1,0x10($out) | |
1220 | movdqu $xt2,0x20($out) | |
1221 | movdqu $xt3,0x30($out) | |
1222 | je .Ldone4x | |
1223 | ||
1224 | movdqa 0x10(%rsp),$xt0 # $xaN is offloaded, remember? | |
1225 | lea 0x40($inp),$inp # inp+=64*1 | |
1226 | xor %r10,%r10 | |
1227 | movdqa $xt0,0x00(%rsp) | |
1228 | movdqa $xb1,0x10(%rsp) | |
1229 | lea 0x40($out),$out # out+=64*1 | |
1230 | movdqa $xc1,0x20(%rsp) | |
1231 | sub \$64,$len # len-=64*1 | |
1232 | movdqa $xd1,0x30(%rsp) | |
1233 | jmp .Loop_tail4x | |
1234 | ||
1235 | .align 32 | |
1236 | .L128_or_more4x: | |
1237 | movdqu 0x00($inp),$xt0 # xor with input | |
1238 | movdqu 0x10($inp),$xt1 | |
1239 | movdqu 0x20($inp),$xt2 | |
1240 | movdqu 0x30($inp),$xt3 | |
1241 | pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember? | |
1242 | pxor $xb0,$xt1 | |
1243 | pxor $xc0,$xt2 | |
1244 | pxor $xd0,$xt3 | |
1245 | ||
1246 | movdqu $xt0,0x00($out) | |
1247 | movdqu 0x40($inp),$xt0 | |
1248 | movdqu $xt1,0x10($out) | |
1249 | movdqu 0x50($inp),$xt1 | |
1250 | movdqu $xt2,0x20($out) | |
1251 | movdqu 0x60($inp),$xt2 | |
1252 | movdqu $xt3,0x30($out) | |
1253 | movdqu 0x70($inp),$xt3 | |
1254 | pxor 0x10(%rsp),$xt0 | |
1255 | pxor $xb1,$xt1 | |
1256 | pxor $xc1,$xt2 | |
1257 | pxor $xd1,$xt3 | |
1258 | movdqu $xt0,0x40($out) | |
1259 | movdqu $xt1,0x50($out) | |
1260 | movdqu $xt2,0x60($out) | |
1261 | movdqu $xt3,0x70($out) | |
1262 | je .Ldone4x | |
1263 | ||
1264 | movdqa 0x20(%rsp),$xt0 # $xaN is offloaded, remember? | |
1265 | lea 0x80($inp),$inp # inp+=64*2 | |
1266 | xor %r10,%r10 | |
1267 | movdqa $xt0,0x00(%rsp) | |
1268 | movdqa $xb2,0x10(%rsp) | |
1269 | lea 0x80($out),$out # out+=64*2 | |
1270 | movdqa $xc2,0x20(%rsp) | |
1271 | sub \$128,$len # len-=64*2 | |
1272 | movdqa $xd2,0x30(%rsp) | |
1273 | jmp .Loop_tail4x | |
1274 | ||
1275 | .align 32 | |
1276 | .L192_or_more4x: | |
1277 | movdqu 0x00($inp),$xt0 # xor with input | |
1278 | movdqu 0x10($inp),$xt1 | |
1279 | movdqu 0x20($inp),$xt2 | |
1280 | movdqu 0x30($inp),$xt3 | |
1281 | pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember? | |
1282 | pxor $xb0,$xt1 | |
1283 | pxor $xc0,$xt2 | |
1284 | pxor $xd0,$xt3 | |
1285 | ||
1286 | movdqu $xt0,0x00($out) | |
1287 | movdqu 0x40($inp),$xt0 | |
1288 | movdqu $xt1,0x10($out) | |
1289 | movdqu 0x50($inp),$xt1 | |
1290 | movdqu $xt2,0x20($out) | |
1291 | movdqu 0x60($inp),$xt2 | |
1292 | movdqu $xt3,0x30($out) | |
1293 | movdqu 0x70($inp),$xt3 | |
1294 | lea 0x80($inp),$inp # size optimization | |
1295 | pxor 0x10(%rsp),$xt0 | |
1296 | pxor $xb1,$xt1 | |
1297 | pxor $xc1,$xt2 | |
1298 | pxor $xd1,$xt3 | |
1299 | ||
1300 | movdqu $xt0,0x40($out) | |
1301 | movdqu 0x00($inp),$xt0 | |
1302 | movdqu $xt1,0x50($out) | |
1303 | movdqu 0x10($inp),$xt1 | |
1304 | movdqu $xt2,0x60($out) | |
1305 | movdqu 0x20($inp),$xt2 | |
1306 | movdqu $xt3,0x70($out) | |
1307 | lea 0x80($out),$out # size optimization | |
1308 | movdqu 0x30($inp),$xt3 | |
1309 | pxor 0x20(%rsp),$xt0 | |
1310 | pxor $xb2,$xt1 | |
1311 | pxor $xc2,$xt2 | |
1312 | pxor $xd2,$xt3 | |
1313 | movdqu $xt0,0x00($out) | |
1314 | movdqu $xt1,0x10($out) | |
1315 | movdqu $xt2,0x20($out) | |
1316 | movdqu $xt3,0x30($out) | |
1317 | je .Ldone4x | |
1318 | ||
1319 | movdqa 0x30(%rsp),$xt0 # $xaN is offloaded, remember? | |
1320 | lea 0x40($inp),$inp # inp+=64*3 | |
1321 | xor %r10,%r10 | |
1322 | movdqa $xt0,0x00(%rsp) | |
1323 | movdqa $xb3,0x10(%rsp) | |
1324 | lea 0x40($out),$out # out+=64*3 | |
1325 | movdqa $xc3,0x20(%rsp) | |
1326 | sub \$192,$len # len-=64*3 | |
1327 | movdqa $xd3,0x30(%rsp) | |
1328 | ||
1329 | .Loop_tail4x: | |
1330 | movzb ($inp,%r10),%eax | |
1331 | movzb (%rsp,%r10),%ecx | |
1332 | lea 1(%r10),%r10 | |
1333 | xor %ecx,%eax | |
1334 | mov %al,-1($out,%r10) | |
1335 | dec $len | |
1336 | jnz .Loop_tail4x | |
1337 | ||
1338 | .Ldone4x: | |
1339 | ___ | |
1340 | $code.=<<___ if ($win64); | |
384e6de4 AP |
1341 | movaps -0xa8(%r9),%xmm6 |
1342 | movaps -0x98(%r9),%xmm7 | |
1343 | movaps -0x88(%r9),%xmm8 | |
1344 | movaps -0x78(%r9),%xmm9 | |
1345 | movaps -0x68(%r9),%xmm10 | |
1346 | movaps -0x58(%r9),%xmm11 | |
1347 | movaps -0x48(%r9),%xmm12 | |
1348 | movaps -0x38(%r9),%xmm13 | |
1349 | movaps -0x28(%r9),%xmm14 | |
1350 | movaps -0x18(%r9),%xmm15 | |
a98c648e AP |
1351 | ___ |
1352 | $code.=<<___; | |
384e6de4 | 1353 | lea (%r9),%rsp |
f17652e5 | 1354 | .cfi_def_cfa_register %rsp |
384e6de4 | 1355 | .L4x_epilogue: |
a98c648e | 1356 | ret |
f17652e5 | 1357 | .cfi_endproc |
a98c648e AP |
1358 | .size ChaCha20_4x,.-ChaCha20_4x |
1359 | ___ | |
1360 | } | |
1361 | ||
1362 | ######################################################################## | |
1363 | # XOP code path that handles all lengths. | |
1364 | if ($avx) { | |
1365 | # There is some "anomaly" observed depending on instructions' size or | |
1366 | # alignment. If you look closely at below code you'll notice that | |
1367 | # sometimes argument order varies. The order affects instruction | |
1368 | # encoding by making it larger, and such fiddling gives 5% performance | |
1369 | # improvement. This is on FX-4100... | |
1370 | ||
1371 | my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3, | |
1372 | $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%xmm$_",(0..15)); | |
1373 | my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, | |
1374 | $xt0,$xt1,$xt2,$xt3, $xd0,$xd1,$xd2,$xd3); | |
1375 | ||
1376 | sub XOP_lane_ROUND { | |
1377 | my ($a0,$b0,$c0,$d0)=@_; | |
1378 | my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); | |
1379 | my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); | |
1380 | my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); | |
1381 | my @x=map("\"$_\"",@xx); | |
1382 | ||
1383 | ( | |
1384 | "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1 | |
1385 | "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2 | |
1386 | "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3 | |
1387 | "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4 | |
1388 | "&vpxor (@x[$d0],@x[$a0],@x[$d0])", | |
1389 | "&vpxor (@x[$d1],@x[$a1],@x[$d1])", | |
1390 | "&vpxor (@x[$d2],@x[$a2],@x[$d2])", | |
1391 | "&vpxor (@x[$d3],@x[$a3],@x[$d3])", | |
1392 | "&vprotd (@x[$d0],@x[$d0],16)", | |
1393 | "&vprotd (@x[$d1],@x[$d1],16)", | |
1394 | "&vprotd (@x[$d2],@x[$d2],16)", | |
1395 | "&vprotd (@x[$d3],@x[$d3],16)", | |
1396 | ||
1397 | "&vpaddd (@x[$c0],@x[$c0],@x[$d0])", | |
1398 | "&vpaddd (@x[$c1],@x[$c1],@x[$d1])", | |
1399 | "&vpaddd (@x[$c2],@x[$c2],@x[$d2])", | |
1400 | "&vpaddd (@x[$c3],@x[$c3],@x[$d3])", | |
1401 | "&vpxor (@x[$b0],@x[$c0],@x[$b0])", | |
1402 | "&vpxor (@x[$b1],@x[$c1],@x[$b1])", | |
1403 | "&vpxor (@x[$b2],@x[$b2],@x[$c2])", # flip | |
1404 | "&vpxor (@x[$b3],@x[$b3],@x[$c3])", # flip | |
1405 | "&vprotd (@x[$b0],@x[$b0],12)", | |
1406 | "&vprotd (@x[$b1],@x[$b1],12)", | |
1407 | "&vprotd (@x[$b2],@x[$b2],12)", | |
1408 | "&vprotd (@x[$b3],@x[$b3],12)", | |
1409 | ||
1410 | "&vpaddd (@x[$a0],@x[$b0],@x[$a0])", # flip | |
1411 | "&vpaddd (@x[$a1],@x[$b1],@x[$a1])", # flip | |
1412 | "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", | |
1413 | "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", | |
1414 | "&vpxor (@x[$d0],@x[$a0],@x[$d0])", | |
1415 | "&vpxor (@x[$d1],@x[$a1],@x[$d1])", | |
1416 | "&vpxor (@x[$d2],@x[$a2],@x[$d2])", | |
1417 | "&vpxor (@x[$d3],@x[$a3],@x[$d3])", | |
1418 | "&vprotd (@x[$d0],@x[$d0],8)", | |
1419 | "&vprotd (@x[$d1],@x[$d1],8)", | |
1420 | "&vprotd (@x[$d2],@x[$d2],8)", | |
1421 | "&vprotd (@x[$d3],@x[$d3],8)", | |
1422 | ||
1423 | "&vpaddd (@x[$c0],@x[$c0],@x[$d0])", | |
1424 | "&vpaddd (@x[$c1],@x[$c1],@x[$d1])", | |
1425 | "&vpaddd (@x[$c2],@x[$c2],@x[$d2])", | |
1426 | "&vpaddd (@x[$c3],@x[$c3],@x[$d3])", | |
1427 | "&vpxor (@x[$b0],@x[$c0],@x[$b0])", | |
1428 | "&vpxor (@x[$b1],@x[$c1],@x[$b1])", | |
1429 | "&vpxor (@x[$b2],@x[$b2],@x[$c2])", # flip | |
1430 | "&vpxor (@x[$b3],@x[$b3],@x[$c3])", # flip | |
1431 | "&vprotd (@x[$b0],@x[$b0],7)", | |
1432 | "&vprotd (@x[$b1],@x[$b1],7)", | |
1433 | "&vprotd (@x[$b2],@x[$b2],7)", | |
1434 | "&vprotd (@x[$b3],@x[$b3],7)" | |
1435 | ); | |
1436 | } | |
1437 | ||
384e6de4 | 1438 | my $xframe = $win64 ? 0xa8 : 8; |
a98c648e AP |
1439 | |
1440 | $code.=<<___; | |
1441 | .type ChaCha20_4xop,\@function,5 | |
1442 | .align 32 | |
1443 | ChaCha20_4xop: | |
f17652e5 | 1444 | .cfi_startproc |
a98c648e | 1445 | .LChaCha20_4xop: |
384e6de4 | 1446 | mov %rsp,%r9 # frame pointer |
f17652e5 | 1447 | .cfi_def_cfa_register %r9 |
384e6de4 | 1448 | sub \$0x140+$xframe,%rsp |
a98c648e AP |
1449 | ___ |
1450 | ################ stack layout | |
1451 | # +0x00 SIMD equivalent of @x[8-12] | |
1452 | # ... | |
1453 | # +0x40 constant copy of key[0-2] smashed by lanes | |
1454 | # ... | |
1455 | # +0x100 SIMD counters (with nonce smashed by lanes) | |
1456 | # ... | |
1457 | # +0x140 | |
1458 | $code.=<<___ if ($win64); | |
384e6de4 AP |
1459 | movaps %xmm6,-0xa8(%r9) |
1460 | movaps %xmm7,-0x98(%r9) | |
1461 | movaps %xmm8,-0x88(%r9) | |
1462 | movaps %xmm9,-0x78(%r9) | |
1463 | movaps %xmm10,-0x68(%r9) | |
1464 | movaps %xmm11,-0x58(%r9) | |
1465 | movaps %xmm12,-0x48(%r9) | |
1466 | movaps %xmm13,-0x38(%r9) | |
1467 | movaps %xmm14,-0x28(%r9) | |
1468 | movaps %xmm15,-0x18(%r9) | |
1469 | .L4xop_body: | |
a98c648e AP |
1470 | ___ |
1471 | $code.=<<___; | |
1472 | vzeroupper | |
1473 | ||
1474 | vmovdqa .Lsigma(%rip),$xa3 # key[0] | |
1475 | vmovdqu ($key),$xb3 # key[1] | |
1476 | vmovdqu 16($key),$xt3 # key[2] | |
1477 | vmovdqu ($counter),$xd3 # key[3] | |
1478 | lea 0x100(%rsp),%rcx # size optimization | |
1479 | ||
1480 | vpshufd \$0x00,$xa3,$xa0 # smash key by lanes... | |
1481 | vpshufd \$0x55,$xa3,$xa1 | |
1482 | vmovdqa $xa0,0x40(%rsp) # ... and offload | |
1483 | vpshufd \$0xaa,$xa3,$xa2 | |
1484 | vmovdqa $xa1,0x50(%rsp) | |
1485 | vpshufd \$0xff,$xa3,$xa3 | |
1486 | vmovdqa $xa2,0x60(%rsp) | |
1487 | vmovdqa $xa3,0x70(%rsp) | |
1488 | ||
1489 | vpshufd \$0x00,$xb3,$xb0 | |
1490 | vpshufd \$0x55,$xb3,$xb1 | |
1491 | vmovdqa $xb0,0x80-0x100(%rcx) | |
1492 | vpshufd \$0xaa,$xb3,$xb2 | |
1493 | vmovdqa $xb1,0x90-0x100(%rcx) | |
1494 | vpshufd \$0xff,$xb3,$xb3 | |
1495 | vmovdqa $xb2,0xa0-0x100(%rcx) | |
1496 | vmovdqa $xb3,0xb0-0x100(%rcx) | |
1497 | ||
1498 | vpshufd \$0x00,$xt3,$xt0 # "$xc0" | |
1499 | vpshufd \$0x55,$xt3,$xt1 # "$xc1" | |
1500 | vmovdqa $xt0,0xc0-0x100(%rcx) | |
1501 | vpshufd \$0xaa,$xt3,$xt2 # "$xc2" | |
1502 | vmovdqa $xt1,0xd0-0x100(%rcx) | |
1503 | vpshufd \$0xff,$xt3,$xt3 # "$xc3" | |
1504 | vmovdqa $xt2,0xe0-0x100(%rcx) | |
1505 | vmovdqa $xt3,0xf0-0x100(%rcx) | |
1506 | ||
1507 | vpshufd \$0x00,$xd3,$xd0 | |
1508 | vpshufd \$0x55,$xd3,$xd1 | |
1509 | vpaddd .Linc(%rip),$xd0,$xd0 # don't save counters yet | |
1510 | vpshufd \$0xaa,$xd3,$xd2 | |
1511 | vmovdqa $xd1,0x110-0x100(%rcx) | |
1512 | vpshufd \$0xff,$xd3,$xd3 | |
1513 | vmovdqa $xd2,0x120-0x100(%rcx) | |
1514 | vmovdqa $xd3,0x130-0x100(%rcx) | |
1515 | ||
1516 | jmp .Loop_enter4xop | |
1517 | ||
1518 | .align 32 | |
1519 | .Loop_outer4xop: | |
1520 | vmovdqa 0x40(%rsp),$xa0 # re-load smashed key | |
1521 | vmovdqa 0x50(%rsp),$xa1 | |
1522 | vmovdqa 0x60(%rsp),$xa2 | |
1523 | vmovdqa 0x70(%rsp),$xa3 | |
1524 | vmovdqa 0x80-0x100(%rcx),$xb0 | |
1525 | vmovdqa 0x90-0x100(%rcx),$xb1 | |
1526 | vmovdqa 0xa0-0x100(%rcx),$xb2 | |
1527 | vmovdqa 0xb0-0x100(%rcx),$xb3 | |
1528 | vmovdqa 0xc0-0x100(%rcx),$xt0 # "$xc0" | |
1529 | vmovdqa 0xd0-0x100(%rcx),$xt1 # "$xc1" | |
1530 | vmovdqa 0xe0-0x100(%rcx),$xt2 # "$xc2" | |
1531 | vmovdqa 0xf0-0x100(%rcx),$xt3 # "$xc3" | |
1532 | vmovdqa 0x100-0x100(%rcx),$xd0 | |
1533 | vmovdqa 0x110-0x100(%rcx),$xd1 | |
1534 | vmovdqa 0x120-0x100(%rcx),$xd2 | |
1535 | vmovdqa 0x130-0x100(%rcx),$xd3 | |
1536 | vpaddd .Lfour(%rip),$xd0,$xd0 # next SIMD counters | |
1537 | ||
1538 | .Loop_enter4xop: | |
1539 | mov \$10,%eax | |
1540 | vmovdqa $xd0,0x100-0x100(%rcx) # save SIMD counters | |
1541 | jmp .Loop4xop | |
1542 | ||
1543 | .align 32 | |
1544 | .Loop4xop: | |
1545 | ___ | |
1546 | foreach (&XOP_lane_ROUND(0, 4, 8,12)) { eval; } | |
1547 | foreach (&XOP_lane_ROUND(0, 5,10,15)) { eval; } | |
1548 | $code.=<<___; | |
1549 | dec %eax | |
1550 | jnz .Loop4xop | |
1551 | ||
1552 | vpaddd 0x40(%rsp),$xa0,$xa0 # accumulate key material | |
1553 | vpaddd 0x50(%rsp),$xa1,$xa1 | |
1554 | vpaddd 0x60(%rsp),$xa2,$xa2 | |
1555 | vpaddd 0x70(%rsp),$xa3,$xa3 | |
1556 | ||
1557 | vmovdqa $xt2,0x20(%rsp) # offload $xc2,3 | |
1558 | vmovdqa $xt3,0x30(%rsp) | |
1559 | ||
1560 | vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data | |
1561 | vpunpckldq $xa3,$xa2,$xt3 | |
1562 | vpunpckhdq $xa1,$xa0,$xa0 | |
1563 | vpunpckhdq $xa3,$xa2,$xa2 | |
1564 | vpunpcklqdq $xt3,$xt2,$xa1 # "a0" | |
1565 | vpunpckhqdq $xt3,$xt2,$xt2 # "a1" | |
1566 | vpunpcklqdq $xa2,$xa0,$xa3 # "a2" | |
1567 | vpunpckhqdq $xa2,$xa0,$xa0 # "a3" | |
1568 | ___ | |
1569 | ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); | |
1570 | $code.=<<___; | |
1571 | vpaddd 0x80-0x100(%rcx),$xb0,$xb0 | |
1572 | vpaddd 0x90-0x100(%rcx),$xb1,$xb1 | |
1573 | vpaddd 0xa0-0x100(%rcx),$xb2,$xb2 | |
1574 | vpaddd 0xb0-0x100(%rcx),$xb3,$xb3 | |
1575 | ||
1576 | vmovdqa $xa0,0x00(%rsp) # offload $xa0,1 | |
1577 | vmovdqa $xa1,0x10(%rsp) | |
1578 | vmovdqa 0x20(%rsp),$xa0 # "xc2" | |
1579 | vmovdqa 0x30(%rsp),$xa1 # "xc3" | |
1580 | ||
1581 | vpunpckldq $xb1,$xb0,$xt2 | |
1582 | vpunpckldq $xb3,$xb2,$xt3 | |
1583 | vpunpckhdq $xb1,$xb0,$xb0 | |
1584 | vpunpckhdq $xb3,$xb2,$xb2 | |
1585 | vpunpcklqdq $xt3,$xt2,$xb1 # "b0" | |
1586 | vpunpckhqdq $xt3,$xt2,$xt2 # "b1" | |
1587 | vpunpcklqdq $xb2,$xb0,$xb3 # "b2" | |
1588 | vpunpckhqdq $xb2,$xb0,$xb0 # "b3" | |
1589 | ___ | |
1590 | ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); | |
1591 | my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1); | |
1592 | $code.=<<___; | |
1593 | vpaddd 0xc0-0x100(%rcx),$xc0,$xc0 | |
1594 | vpaddd 0xd0-0x100(%rcx),$xc1,$xc1 | |
1595 | vpaddd 0xe0-0x100(%rcx),$xc2,$xc2 | |
1596 | vpaddd 0xf0-0x100(%rcx),$xc3,$xc3 | |
1597 | ||
1598 | vpunpckldq $xc1,$xc0,$xt2 | |
1599 | vpunpckldq $xc3,$xc2,$xt3 | |
1600 | vpunpckhdq $xc1,$xc0,$xc0 | |
1601 | vpunpckhdq $xc3,$xc2,$xc2 | |
1602 | vpunpcklqdq $xt3,$xt2,$xc1 # "c0" | |
1603 | vpunpckhqdq $xt3,$xt2,$xt2 # "c1" | |
1604 | vpunpcklqdq $xc2,$xc0,$xc3 # "c2" | |
1605 | vpunpckhqdq $xc2,$xc0,$xc0 # "c3" | |
1606 | ___ | |
1607 | ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); | |
1608 | $code.=<<___; | |
1609 | vpaddd 0x100-0x100(%rcx),$xd0,$xd0 | |
1610 | vpaddd 0x110-0x100(%rcx),$xd1,$xd1 | |
1611 | vpaddd 0x120-0x100(%rcx),$xd2,$xd2 | |
1612 | vpaddd 0x130-0x100(%rcx),$xd3,$xd3 | |
1613 | ||
1614 | vpunpckldq $xd1,$xd0,$xt2 | |
1615 | vpunpckldq $xd3,$xd2,$xt3 | |
1616 | vpunpckhdq $xd1,$xd0,$xd0 | |
1617 | vpunpckhdq $xd3,$xd2,$xd2 | |
1618 | vpunpcklqdq $xt3,$xt2,$xd1 # "d0" | |
1619 | vpunpckhqdq $xt3,$xt2,$xt2 # "d1" | |
1620 | vpunpcklqdq $xd2,$xd0,$xd3 # "d2" | |
1621 | vpunpckhqdq $xd2,$xd0,$xd0 # "d3" | |
1622 | ___ | |
1623 | ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); | |
1624 | ($xa0,$xa1)=($xt2,$xt3); | |
1625 | $code.=<<___; | |
1626 | vmovdqa 0x00(%rsp),$xa0 # restore $xa0,1 | |
1627 | vmovdqa 0x10(%rsp),$xa1 | |
1628 | ||
1629 | cmp \$64*4,$len | |
1630 | jb .Ltail4xop | |
1631 | ||
1632 | vpxor 0x00($inp),$xa0,$xa0 # xor with input | |
1633 | vpxor 0x10($inp),$xb0,$xb0 | |
1634 | vpxor 0x20($inp),$xc0,$xc0 | |
1635 | vpxor 0x30($inp),$xd0,$xd0 | |
1636 | vpxor 0x40($inp),$xa1,$xa1 | |
1637 | vpxor 0x50($inp),$xb1,$xb1 | |
1638 | vpxor 0x60($inp),$xc1,$xc1 | |
1639 | vpxor 0x70($inp),$xd1,$xd1 | |
1640 | lea 0x80($inp),$inp # size optimization | |
1641 | vpxor 0x00($inp),$xa2,$xa2 | |
1642 | vpxor 0x10($inp),$xb2,$xb2 | |
1643 | vpxor 0x20($inp),$xc2,$xc2 | |
1644 | vpxor 0x30($inp),$xd2,$xd2 | |
1645 | vpxor 0x40($inp),$xa3,$xa3 | |
1646 | vpxor 0x50($inp),$xb3,$xb3 | |
1647 | vpxor 0x60($inp),$xc3,$xc3 | |
1648 | vpxor 0x70($inp),$xd3,$xd3 | |
1649 | lea 0x80($inp),$inp # inp+=64*4 | |
1650 | ||
1651 | vmovdqu $xa0,0x00($out) | |
1652 | vmovdqu $xb0,0x10($out) | |
1653 | vmovdqu $xc0,0x20($out) | |
1654 | vmovdqu $xd0,0x30($out) | |
1655 | vmovdqu $xa1,0x40($out) | |
1656 | vmovdqu $xb1,0x50($out) | |
1657 | vmovdqu $xc1,0x60($out) | |
1658 | vmovdqu $xd1,0x70($out) | |
1659 | lea 0x80($out),$out # size optimization | |
1660 | vmovdqu $xa2,0x00($out) | |
1661 | vmovdqu $xb2,0x10($out) | |
1662 | vmovdqu $xc2,0x20($out) | |
1663 | vmovdqu $xd2,0x30($out) | |
1664 | vmovdqu $xa3,0x40($out) | |
1665 | vmovdqu $xb3,0x50($out) | |
1666 | vmovdqu $xc3,0x60($out) | |
1667 | vmovdqu $xd3,0x70($out) | |
1668 | lea 0x80($out),$out # out+=64*4 | |
1669 | ||
1670 | sub \$64*4,$len | |
1671 | jnz .Loop_outer4xop | |
1672 | ||
1673 | jmp .Ldone4xop | |
1674 | ||
1675 | .align 32 | |
1676 | .Ltail4xop: | |
1677 | cmp \$192,$len | |
1678 | jae .L192_or_more4xop | |
1679 | cmp \$128,$len | |
1680 | jae .L128_or_more4xop | |
1681 | cmp \$64,$len | |
1682 | jae .L64_or_more4xop | |
1683 | ||
1684 | xor %r10,%r10 | |
1685 | vmovdqa $xa0,0x00(%rsp) | |
1686 | vmovdqa $xb0,0x10(%rsp) | |
1687 | vmovdqa $xc0,0x20(%rsp) | |
1688 | vmovdqa $xd0,0x30(%rsp) | |
1689 | jmp .Loop_tail4xop | |
1690 | ||
1691 | .align 32 | |
1692 | .L64_or_more4xop: | |
1693 | vpxor 0x00($inp),$xa0,$xa0 # xor with input | |
1694 | vpxor 0x10($inp),$xb0,$xb0 | |
1695 | vpxor 0x20($inp),$xc0,$xc0 | |
1696 | vpxor 0x30($inp),$xd0,$xd0 | |
1697 | vmovdqu $xa0,0x00($out) | |
1698 | vmovdqu $xb0,0x10($out) | |
1699 | vmovdqu $xc0,0x20($out) | |
1700 | vmovdqu $xd0,0x30($out) | |
1701 | je .Ldone4xop | |
1702 | ||
1703 | lea 0x40($inp),$inp # inp+=64*1 | |
1704 | vmovdqa $xa1,0x00(%rsp) | |
1705 | xor %r10,%r10 | |
1706 | vmovdqa $xb1,0x10(%rsp) | |
1707 | lea 0x40($out),$out # out+=64*1 | |
1708 | vmovdqa $xc1,0x20(%rsp) | |
1709 | sub \$64,$len # len-=64*1 | |
1710 | vmovdqa $xd1,0x30(%rsp) | |
1711 | jmp .Loop_tail4xop | |
1712 | ||
1713 | .align 32 | |
1714 | .L128_or_more4xop: | |
1715 | vpxor 0x00($inp),$xa0,$xa0 # xor with input | |
1716 | vpxor 0x10($inp),$xb0,$xb0 | |
1717 | vpxor 0x20($inp),$xc0,$xc0 | |
1718 | vpxor 0x30($inp),$xd0,$xd0 | |
1719 | vpxor 0x40($inp),$xa1,$xa1 | |
1720 | vpxor 0x50($inp),$xb1,$xb1 | |
1721 | vpxor 0x60($inp),$xc1,$xc1 | |
1722 | vpxor 0x70($inp),$xd1,$xd1 | |
1723 | ||
1724 | vmovdqu $xa0,0x00($out) | |
1725 | vmovdqu $xb0,0x10($out) | |
1726 | vmovdqu $xc0,0x20($out) | |
1727 | vmovdqu $xd0,0x30($out) | |
1728 | vmovdqu $xa1,0x40($out) | |
1729 | vmovdqu $xb1,0x50($out) | |
1730 | vmovdqu $xc1,0x60($out) | |
1731 | vmovdqu $xd1,0x70($out) | |
1732 | je .Ldone4xop | |
1733 | ||
1734 | lea 0x80($inp),$inp # inp+=64*2 | |
1735 | vmovdqa $xa2,0x00(%rsp) | |
1736 | xor %r10,%r10 | |
1737 | vmovdqa $xb2,0x10(%rsp) | |
1738 | lea 0x80($out),$out # out+=64*2 | |
1739 | vmovdqa $xc2,0x20(%rsp) | |
1740 | sub \$128,$len # len-=64*2 | |
1741 | vmovdqa $xd2,0x30(%rsp) | |
1742 | jmp .Loop_tail4xop | |
1743 | ||
1744 | .align 32 | |
1745 | .L192_or_more4xop: | |
1746 | vpxor 0x00($inp),$xa0,$xa0 # xor with input | |
1747 | vpxor 0x10($inp),$xb0,$xb0 | |
1748 | vpxor 0x20($inp),$xc0,$xc0 | |
1749 | vpxor 0x30($inp),$xd0,$xd0 | |
1750 | vpxor 0x40($inp),$xa1,$xa1 | |
1751 | vpxor 0x50($inp),$xb1,$xb1 | |
1752 | vpxor 0x60($inp),$xc1,$xc1 | |
1753 | vpxor 0x70($inp),$xd1,$xd1 | |
1754 | lea 0x80($inp),$inp # size optimization | |
1755 | vpxor 0x00($inp),$xa2,$xa2 | |
1756 | vpxor 0x10($inp),$xb2,$xb2 | |
1757 | vpxor 0x20($inp),$xc2,$xc2 | |
1758 | vpxor 0x30($inp),$xd2,$xd2 | |
1759 | ||
1760 | vmovdqu $xa0,0x00($out) | |
1761 | vmovdqu $xb0,0x10($out) | |
1762 | vmovdqu $xc0,0x20($out) | |
1763 | vmovdqu $xd0,0x30($out) | |
1764 | vmovdqu $xa1,0x40($out) | |
1765 | vmovdqu $xb1,0x50($out) | |
1766 | vmovdqu $xc1,0x60($out) | |
1767 | vmovdqu $xd1,0x70($out) | |
1768 | lea 0x80($out),$out # size optimization | |
1769 | vmovdqu $xa2,0x00($out) | |
1770 | vmovdqu $xb2,0x10($out) | |
1771 | vmovdqu $xc2,0x20($out) | |
1772 | vmovdqu $xd2,0x30($out) | |
1773 | je .Ldone4xop | |
1774 | ||
1775 | lea 0x40($inp),$inp # inp+=64*3 | |
f2188228 | 1776 | vmovdqa $xa3,0x00(%rsp) |
a98c648e | 1777 | xor %r10,%r10 |
f2188228 | 1778 | vmovdqa $xb3,0x10(%rsp) |
a98c648e | 1779 | lea 0x40($out),$out # out+=64*3 |
f2188228 | 1780 | vmovdqa $xc3,0x20(%rsp) |
a98c648e | 1781 | sub \$192,$len # len-=64*3 |
f2188228 | 1782 | vmovdqa $xd3,0x30(%rsp) |
a98c648e AP |
1783 | |
1784 | .Loop_tail4xop: | |
1785 | movzb ($inp,%r10),%eax | |
1786 | movzb (%rsp,%r10),%ecx | |
1787 | lea 1(%r10),%r10 | |
1788 | xor %ecx,%eax | |
1789 | mov %al,-1($out,%r10) | |
1790 | dec $len | |
1791 | jnz .Loop_tail4xop | |
1792 | ||
1793 | .Ldone4xop: | |
1794 | vzeroupper | |
1795 | ___ | |
1796 | $code.=<<___ if ($win64); | |
384e6de4 AP |
1797 | movaps -0xa8(%r9),%xmm6 |
1798 | movaps -0x98(%r9),%xmm7 | |
1799 | movaps -0x88(%r9),%xmm8 | |
1800 | movaps -0x78(%r9),%xmm9 | |
1801 | movaps -0x68(%r9),%xmm10 | |
1802 | movaps -0x58(%r9),%xmm11 | |
1803 | movaps -0x48(%r9),%xmm12 | |
1804 | movaps -0x38(%r9),%xmm13 | |
1805 | movaps -0x28(%r9),%xmm14 | |
1806 | movaps -0x18(%r9),%xmm15 | |
a98c648e AP |
1807 | ___ |
1808 | $code.=<<___; | |
384e6de4 | 1809 | lea (%r9),%rsp |
f17652e5 | 1810 | .cfi_def_cfa_register %rsp |
384e6de4 | 1811 | .L4xop_epilogue: |
a98c648e | 1812 | ret |
f17652e5 | 1813 | .cfi_endproc |
a98c648e AP |
1814 | .size ChaCha20_4xop,.-ChaCha20_4xop |
1815 | ___ | |
1816 | } | |
1817 | ||
1818 | ######################################################################## | |
1819 | # AVX2 code path | |
1820 | if ($avx>1) { | |
1821 | my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3, | |
1822 | $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%ymm$_",(0..15)); | |
1823 | my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, | |
1824 | "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3); | |
1825 | ||
1826 | sub AVX2_lane_ROUND { | |
1827 | my ($a0,$b0,$c0,$d0)=@_; | |
1828 | my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); | |
1829 | my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); | |
1830 | my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); | |
1831 | my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3); | |
1832 | my @x=map("\"$_\"",@xx); | |
1833 | ||
1834 | # Consider order in which variables are addressed by their | |
1835 | # index: | |
1836 | # | |
1837 | # a b c d | |
1838 | # | |
1839 | # 0 4 8 12 < even round | |
1840 | # 1 5 9 13 | |
1841 | # 2 6 10 14 | |
1842 | # 3 7 11 15 | |
1843 | # 0 5 10 15 < odd round | |
1844 | # 1 6 11 12 | |
1845 | # 2 7 8 13 | |
1846 | # 3 4 9 14 | |
1847 | # | |
1848 | # 'a', 'b' and 'd's are permanently allocated in registers, | |
1849 | # @x[0..7,12..15], while 'c's are maintained in memory. If | |
1850 | # you observe 'c' column, you'll notice that pair of 'c's is | |
1851 | # invariant between rounds. This means that we have to reload | |
1852 | # them once per round, in the middle. This is why you'll see | |
1853 | # bunch of 'c' stores and loads in the middle, but none in | |
1854 | # the beginning or end. | |
1855 | ||
1856 | ( | |
1857 | "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1 | |
1858 | "&vpxor (@x[$d0],@x[$a0],@x[$d0])", | |
1859 | "&vpshufb (@x[$d0],@x[$d0],$t1)", | |
1860 | "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2 | |
1861 | "&vpxor (@x[$d1],@x[$a1],@x[$d1])", | |
1862 | "&vpshufb (@x[$d1],@x[$d1],$t1)", | |
1863 | ||
1864 | "&vpaddd ($xc,$xc,@x[$d0])", | |
1865 | "&vpxor (@x[$b0],$xc,@x[$b0])", | |
1866 | "&vpslld ($t0,@x[$b0],12)", | |
1867 | "&vpsrld (@x[$b0],@x[$b0],20)", | |
1868 | "&vpor (@x[$b0],$t0,@x[$b0])", | |
1869 | "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip) | |
1870 | "&vpaddd ($xc_,$xc_,@x[$d1])", | |
1871 | "&vpxor (@x[$b1],$xc_,@x[$b1])", | |
1872 | "&vpslld ($t1,@x[$b1],12)", | |
1873 | "&vpsrld (@x[$b1],@x[$b1],20)", | |
1874 | "&vpor (@x[$b1],$t1,@x[$b1])", | |
1875 | ||
1876 | "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", | |
1877 | "&vpxor (@x[$d0],@x[$a0],@x[$d0])", | |
1878 | "&vpshufb (@x[$d0],@x[$d0],$t0)", | |
1879 | "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", | |
1880 | "&vpxor (@x[$d1],@x[$a1],@x[$d1])", | |
1881 | "&vpshufb (@x[$d1],@x[$d1],$t0)", | |
1882 | ||
1883 | "&vpaddd ($xc,$xc,@x[$d0])", | |
1884 | "&vpxor (@x[$b0],$xc,@x[$b0])", | |
1885 | "&vpslld ($t1,@x[$b0],7)", | |
1886 | "&vpsrld (@x[$b0],@x[$b0],25)", | |
1887 | "&vpor (@x[$b0],$t1,@x[$b0])", | |
1888 | "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip) | |
1889 | "&vpaddd ($xc_,$xc_,@x[$d1])", | |
1890 | "&vpxor (@x[$b1],$xc_,@x[$b1])", | |
1891 | "&vpslld ($t0,@x[$b1],7)", | |
1892 | "&vpsrld (@x[$b1],@x[$b1],25)", | |
1893 | "&vpor (@x[$b1],$t0,@x[$b1])", | |
1894 | ||
1895 | "&vmovdqa (\"`32*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's | |
1896 | "&vmovdqa (\"`32*($c1-8)`(%rsp)\",$xc_)", | |
1897 | "&vmovdqa ($xc,\"`32*($c2-8)`(%rsp)\")", | |
1898 | "&vmovdqa ($xc_,\"`32*($c3-8)`(%rsp)\")", | |
1899 | ||
1900 | "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3 | |
1901 | "&vpxor (@x[$d2],@x[$a2],@x[$d2])", | |
1902 | "&vpshufb (@x[$d2],@x[$d2],$t1)", | |
1903 | "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4 | |
1904 | "&vpxor (@x[$d3],@x[$a3],@x[$d3])", | |
1905 | "&vpshufb (@x[$d3],@x[$d3],$t1)", | |
1906 | ||
1907 | "&vpaddd ($xc,$xc,@x[$d2])", | |
1908 | "&vpxor (@x[$b2],$xc,@x[$b2])", | |
1909 | "&vpslld ($t0,@x[$b2],12)", | |
1910 | "&vpsrld (@x[$b2],@x[$b2],20)", | |
1911 | "&vpor (@x[$b2],$t0,@x[$b2])", | |
1912 | "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip) | |
1913 | "&vpaddd ($xc_,$xc_,@x[$d3])", | |
1914 | "&vpxor (@x[$b3],$xc_,@x[$b3])", | |
1915 | "&vpslld ($t1,@x[$b3],12)", | |
1916 | "&vpsrld (@x[$b3],@x[$b3],20)", | |
1917 | "&vpor (@x[$b3],$t1,@x[$b3])", | |
1918 | ||
1919 | "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", | |
1920 | "&vpxor (@x[$d2],@x[$a2],@x[$d2])", | |
1921 | "&vpshufb (@x[$d2],@x[$d2],$t0)", | |
1922 | "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", | |
1923 | "&vpxor (@x[$d3],@x[$a3],@x[$d3])", | |
1924 | "&vpshufb (@x[$d3],@x[$d3],$t0)", | |
1925 | ||
1926 | "&vpaddd ($xc,$xc,@x[$d2])", | |
1927 | "&vpxor (@x[$b2],$xc,@x[$b2])", | |
1928 | "&vpslld ($t1,@x[$b2],7)", | |
1929 | "&vpsrld (@x[$b2],@x[$b2],25)", | |
1930 | "&vpor (@x[$b2],$t1,@x[$b2])", | |
1931 | "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip) | |
1932 | "&vpaddd ($xc_,$xc_,@x[$d3])", | |
1933 | "&vpxor (@x[$b3],$xc_,@x[$b3])", | |
1934 | "&vpslld ($t0,@x[$b3],7)", | |
1935 | "&vpsrld (@x[$b3],@x[$b3],25)", | |
1936 | "&vpor (@x[$b3],$t0,@x[$b3])" | |
1937 | ); | |
1938 | } | |
1939 | ||
384e6de4 | 1940 | my $xframe = $win64 ? 0xa8 : 8; |
a98c648e AP |
1941 | |
1942 | $code.=<<___; | |
1943 | .type ChaCha20_8x,\@function,5 | |
1944 | .align 32 | |
1945 | ChaCha20_8x: | |
f17652e5 | 1946 | .cfi_startproc |
a98c648e | 1947 | .LChaCha20_8x: |
384e6de4 | 1948 | mov %rsp,%r9 # frame register |
f17652e5 | 1949 | .cfi_def_cfa_register %r9 |
a98c648e AP |
1950 | sub \$0x280+$xframe,%rsp |
1951 | and \$-32,%rsp | |
1952 | ___ | |
1953 | $code.=<<___ if ($win64); | |
384e6de4 AP |
1954 | movaps %xmm6,-0xa8(%r9) |
1955 | movaps %xmm7,-0x98(%r9) | |
1956 | movaps %xmm8,-0x88(%r9) | |
1957 | movaps %xmm9,-0x78(%r9) | |
1958 | movaps %xmm10,-0x68(%r9) | |
1959 | movaps %xmm11,-0x58(%r9) | |
1960 | movaps %xmm12,-0x48(%r9) | |
1961 | movaps %xmm13,-0x38(%r9) | |
1962 | movaps %xmm14,-0x28(%r9) | |
1963 | movaps %xmm15,-0x18(%r9) | |
1964 | .L8x_body: | |
a98c648e AP |
1965 | ___ |
1966 | $code.=<<___; | |
1967 | vzeroupper | |
a98c648e AP |
1968 | |
1969 | ################ stack layout | |
1970 | # +0x00 SIMD equivalent of @x[8-12] | |
1971 | # ... | |
1972 | # +0x80 constant copy of key[0-2] smashed by lanes | |
1973 | # ... | |
1974 | # +0x200 SIMD counters (with nonce smashed by lanes) | |
1975 | # ... | |
384e6de4 | 1976 | # +0x280 |
a98c648e AP |
1977 | |
1978 | vbroadcasti128 .Lsigma(%rip),$xa3 # key[0] | |
1979 | vbroadcasti128 ($key),$xb3 # key[1] | |
1980 | vbroadcasti128 16($key),$xt3 # key[2] | |
1981 | vbroadcasti128 ($counter),$xd3 # key[3] | |
1982 | lea 0x100(%rsp),%rcx # size optimization | |
1983 | lea 0x200(%rsp),%rax # size optimization | |
1984 | lea .Lrot16(%rip),%r10 | |
1985 | lea .Lrot24(%rip),%r11 | |
1986 | ||
1987 | vpshufd \$0x00,$xa3,$xa0 # smash key by lanes... | |
1988 | vpshufd \$0x55,$xa3,$xa1 | |
1989 | vmovdqa $xa0,0x80-0x100(%rcx) # ... and offload | |
1990 | vpshufd \$0xaa,$xa3,$xa2 | |
1991 | vmovdqa $xa1,0xa0-0x100(%rcx) | |
1992 | vpshufd \$0xff,$xa3,$xa3 | |
1993 | vmovdqa $xa2,0xc0-0x100(%rcx) | |
1994 | vmovdqa $xa3,0xe0-0x100(%rcx) | |
1995 | ||
1996 | vpshufd \$0x00,$xb3,$xb0 | |
1997 | vpshufd \$0x55,$xb3,$xb1 | |
1998 | vmovdqa $xb0,0x100-0x100(%rcx) | |
1999 | vpshufd \$0xaa,$xb3,$xb2 | |
2000 | vmovdqa $xb1,0x120-0x100(%rcx) | |
2001 | vpshufd \$0xff,$xb3,$xb3 | |
2002 | vmovdqa $xb2,0x140-0x100(%rcx) | |
2003 | vmovdqa $xb3,0x160-0x100(%rcx) | |
2004 | ||
2005 | vpshufd \$0x00,$xt3,$xt0 # "xc0" | |
2006 | vpshufd \$0x55,$xt3,$xt1 # "xc1" | |
2007 | vmovdqa $xt0,0x180-0x200(%rax) | |
2008 | vpshufd \$0xaa,$xt3,$xt2 # "xc2" | |
2009 | vmovdqa $xt1,0x1a0-0x200(%rax) | |
2010 | vpshufd \$0xff,$xt3,$xt3 # "xc3" | |
2011 | vmovdqa $xt2,0x1c0-0x200(%rax) | |
2012 | vmovdqa $xt3,0x1e0-0x200(%rax) | |
2013 | ||
2014 | vpshufd \$0x00,$xd3,$xd0 | |
2015 | vpshufd \$0x55,$xd3,$xd1 | |
2016 | vpaddd .Lincy(%rip),$xd0,$xd0 # don't save counters yet | |
2017 | vpshufd \$0xaa,$xd3,$xd2 | |
2018 | vmovdqa $xd1,0x220-0x200(%rax) | |
2019 | vpshufd \$0xff,$xd3,$xd3 | |
2020 | vmovdqa $xd2,0x240-0x200(%rax) | |
2021 | vmovdqa $xd3,0x260-0x200(%rax) | |
2022 | ||
2023 | jmp .Loop_enter8x | |
2024 | ||
2025 | .align 32 | |
2026 | .Loop_outer8x: | |
2027 | vmovdqa 0x80-0x100(%rcx),$xa0 # re-load smashed key | |
2028 | vmovdqa 0xa0-0x100(%rcx),$xa1 | |
2029 | vmovdqa 0xc0-0x100(%rcx),$xa2 | |
2030 | vmovdqa 0xe0-0x100(%rcx),$xa3 | |
2031 | vmovdqa 0x100-0x100(%rcx),$xb0 | |
2032 | vmovdqa 0x120-0x100(%rcx),$xb1 | |
2033 | vmovdqa 0x140-0x100(%rcx),$xb2 | |
2034 | vmovdqa 0x160-0x100(%rcx),$xb3 | |
2035 | vmovdqa 0x180-0x200(%rax),$xt0 # "xc0" | |
2036 | vmovdqa 0x1a0-0x200(%rax),$xt1 # "xc1" | |
2037 | vmovdqa 0x1c0-0x200(%rax),$xt2 # "xc2" | |
2038 | vmovdqa 0x1e0-0x200(%rax),$xt3 # "xc3" | |
2039 | vmovdqa 0x200-0x200(%rax),$xd0 | |
2040 | vmovdqa 0x220-0x200(%rax),$xd1 | |
2041 | vmovdqa 0x240-0x200(%rax),$xd2 | |
2042 | vmovdqa 0x260-0x200(%rax),$xd3 | |
2043 | vpaddd .Leight(%rip),$xd0,$xd0 # next SIMD counters | |
2044 | ||
2045 | .Loop_enter8x: | |
2046 | vmovdqa $xt2,0x40(%rsp) # SIMD equivalent of "@x[10]" | |
2047 | vmovdqa $xt3,0x60(%rsp) # SIMD equivalent of "@x[11]" | |
2048 | vbroadcasti128 (%r10),$xt3 | |
2049 | vmovdqa $xd0,0x200-0x200(%rax) # save SIMD counters | |
2050 | mov \$10,%eax | |
2051 | jmp .Loop8x | |
2052 | ||
2053 | .align 32 | |
2054 | .Loop8x: | |
2055 | ___ | |
2056 | foreach (&AVX2_lane_ROUND(0, 4, 8,12)) { eval; } | |
2057 | foreach (&AVX2_lane_ROUND(0, 5,10,15)) { eval; } | |
2058 | $code.=<<___; | |
2059 | dec %eax | |
2060 | jnz .Loop8x | |
2061 | ||
2062 | lea 0x200(%rsp),%rax # size optimization | |
2063 | vpaddd 0x80-0x100(%rcx),$xa0,$xa0 # accumulate key | |
2064 | vpaddd 0xa0-0x100(%rcx),$xa1,$xa1 | |
2065 | vpaddd 0xc0-0x100(%rcx),$xa2,$xa2 | |
2066 | vpaddd 0xe0-0x100(%rcx),$xa3,$xa3 | |
2067 | ||
2068 | vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data | |
2069 | vpunpckldq $xa3,$xa2,$xt3 | |
2070 | vpunpckhdq $xa1,$xa0,$xa0 | |
2071 | vpunpckhdq $xa3,$xa2,$xa2 | |
2072 | vpunpcklqdq $xt3,$xt2,$xa1 # "a0" | |
2073 | vpunpckhqdq $xt3,$xt2,$xt2 # "a1" | |
2074 | vpunpcklqdq $xa2,$xa0,$xa3 # "a2" | |
2075 | vpunpckhqdq $xa2,$xa0,$xa0 # "a3" | |
2076 | ___ | |
2077 | ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); | |
2078 | $code.=<<___; | |
2079 | vpaddd 0x100-0x100(%rcx),$xb0,$xb0 | |
2080 | vpaddd 0x120-0x100(%rcx),$xb1,$xb1 | |
2081 | vpaddd 0x140-0x100(%rcx),$xb2,$xb2 | |
2082 | vpaddd 0x160-0x100(%rcx),$xb3,$xb3 | |
2083 | ||
2084 | vpunpckldq $xb1,$xb0,$xt2 | |
2085 | vpunpckldq $xb3,$xb2,$xt3 | |
2086 | vpunpckhdq $xb1,$xb0,$xb0 | |
2087 | vpunpckhdq $xb3,$xb2,$xb2 | |
2088 | vpunpcklqdq $xt3,$xt2,$xb1 # "b0" | |
2089 | vpunpckhqdq $xt3,$xt2,$xt2 # "b1" | |
2090 | vpunpcklqdq $xb2,$xb0,$xb3 # "b2" | |
2091 | vpunpckhqdq $xb2,$xb0,$xb0 # "b3" | |
2092 | ___ | |
2093 | ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); | |
2094 | $code.=<<___; | |
2095 | vperm2i128 \$0x20,$xb0,$xa0,$xt3 # "de-interlace" further | |
2096 | vperm2i128 \$0x31,$xb0,$xa0,$xb0 | |
2097 | vperm2i128 \$0x20,$xb1,$xa1,$xa0 | |
2098 | vperm2i128 \$0x31,$xb1,$xa1,$xb1 | |
2099 | vperm2i128 \$0x20,$xb2,$xa2,$xa1 | |
2100 | vperm2i128 \$0x31,$xb2,$xa2,$xb2 | |
2101 | vperm2i128 \$0x20,$xb3,$xa3,$xa2 | |
2102 | vperm2i128 \$0x31,$xb3,$xa3,$xb3 | |
2103 | ___ | |
2104 | ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3); | |
2105 | my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1); | |
2106 | $code.=<<___; | |
2107 | vmovdqa $xa0,0x00(%rsp) # offload $xaN | |
2108 | vmovdqa $xa1,0x20(%rsp) | |
2109 | vmovdqa 0x40(%rsp),$xc2 # $xa0 | |
2110 | vmovdqa 0x60(%rsp),$xc3 # $xa1 | |
2111 | ||
2112 | vpaddd 0x180-0x200(%rax),$xc0,$xc0 | |
2113 | vpaddd 0x1a0-0x200(%rax),$xc1,$xc1 | |
2114 | vpaddd 0x1c0-0x200(%rax),$xc2,$xc2 | |
2115 | vpaddd 0x1e0-0x200(%rax),$xc3,$xc3 | |
2116 | ||
2117 | vpunpckldq $xc1,$xc0,$xt2 | |
2118 | vpunpckldq $xc3,$xc2,$xt3 | |
2119 | vpunpckhdq $xc1,$xc0,$xc0 | |
2120 | vpunpckhdq $xc3,$xc2,$xc2 | |
2121 | vpunpcklqdq $xt3,$xt2,$xc1 # "c0" | |
2122 | vpunpckhqdq $xt3,$xt2,$xt2 # "c1" | |
2123 | vpunpcklqdq $xc2,$xc0,$xc3 # "c2" | |
2124 | vpunpckhqdq $xc2,$xc0,$xc0 # "c3" | |
2125 | ___ | |
2126 | ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); | |
2127 | $code.=<<___; | |
2128 | vpaddd 0x200-0x200(%rax),$xd0,$xd0 | |
2129 | vpaddd 0x220-0x200(%rax),$xd1,$xd1 | |
2130 | vpaddd 0x240-0x200(%rax),$xd2,$xd2 | |
2131 | vpaddd 0x260-0x200(%rax),$xd3,$xd3 | |
2132 | ||
2133 | vpunpckldq $xd1,$xd0,$xt2 | |
2134 | vpunpckldq $xd3,$xd2,$xt3 | |
2135 | vpunpckhdq $xd1,$xd0,$xd0 | |
2136 | vpunpckhdq $xd3,$xd2,$xd2 | |
2137 | vpunpcklqdq $xt3,$xt2,$xd1 # "d0" | |
2138 | vpunpckhqdq $xt3,$xt2,$xt2 # "d1" | |
2139 | vpunpcklqdq $xd2,$xd0,$xd3 # "d2" | |
2140 | vpunpckhqdq $xd2,$xd0,$xd0 # "d3" | |
2141 | ___ | |
2142 | ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); | |
2143 | $code.=<<___; | |
2144 | vperm2i128 \$0x20,$xd0,$xc0,$xt3 # "de-interlace" further | |
2145 | vperm2i128 \$0x31,$xd0,$xc0,$xd0 | |
2146 | vperm2i128 \$0x20,$xd1,$xc1,$xc0 | |
2147 | vperm2i128 \$0x31,$xd1,$xc1,$xd1 | |
2148 | vperm2i128 \$0x20,$xd2,$xc2,$xc1 | |
2149 | vperm2i128 \$0x31,$xd2,$xc2,$xd2 | |
2150 | vperm2i128 \$0x20,$xd3,$xc3,$xc2 | |
2151 | vperm2i128 \$0x31,$xd3,$xc3,$xd3 | |
2152 | ___ | |
2153 | ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3); | |
2154 | ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)= | |
2155 | ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3); | |
2156 | ($xa0,$xa1)=($xt2,$xt3); | |
2157 | $code.=<<___; | |
2158 | vmovdqa 0x00(%rsp),$xa0 # $xaN was offloaded, remember? | |
2159 | vmovdqa 0x20(%rsp),$xa1 | |
2160 | ||
2161 | cmp \$64*8,$len | |
2162 | jb .Ltail8x | |
2163 | ||
2164 | vpxor 0x00($inp),$xa0,$xa0 # xor with input | |
2165 | vpxor 0x20($inp),$xb0,$xb0 | |
2166 | vpxor 0x40($inp),$xc0,$xc0 | |
2167 | vpxor 0x60($inp),$xd0,$xd0 | |
2168 | lea 0x80($inp),$inp # size optimization | |
2169 | vmovdqu $xa0,0x00($out) | |
2170 | vmovdqu $xb0,0x20($out) | |
2171 | vmovdqu $xc0,0x40($out) | |
2172 | vmovdqu $xd0,0x60($out) | |
2173 | lea 0x80($out),$out # size optimization | |
2174 | ||
2175 | vpxor 0x00($inp),$xa1,$xa1 | |
2176 | vpxor 0x20($inp),$xb1,$xb1 | |
2177 | vpxor 0x40($inp),$xc1,$xc1 | |
2178 | vpxor 0x60($inp),$xd1,$xd1 | |
2179 | lea 0x80($inp),$inp # size optimization | |
2180 | vmovdqu $xa1,0x00($out) | |
2181 | vmovdqu $xb1,0x20($out) | |
2182 | vmovdqu $xc1,0x40($out) | |
2183 | vmovdqu $xd1,0x60($out) | |
2184 | lea 0x80($out),$out # size optimization | |
2185 | ||
2186 | vpxor 0x00($inp),$xa2,$xa2 | |
2187 | vpxor 0x20($inp),$xb2,$xb2 | |
2188 | vpxor 0x40($inp),$xc2,$xc2 | |
2189 | vpxor 0x60($inp),$xd2,$xd2 | |
2190 | lea 0x80($inp),$inp # size optimization | |
2191 | vmovdqu $xa2,0x00($out) | |
2192 | vmovdqu $xb2,0x20($out) | |
2193 | vmovdqu $xc2,0x40($out) | |
2194 | vmovdqu $xd2,0x60($out) | |
2195 | lea 0x80($out),$out # size optimization | |
2196 | ||
2197 | vpxor 0x00($inp),$xa3,$xa3 | |
2198 | vpxor 0x20($inp),$xb3,$xb3 | |
2199 | vpxor 0x40($inp),$xc3,$xc3 | |
2200 | vpxor 0x60($inp),$xd3,$xd3 | |
2201 | lea 0x80($inp),$inp # size optimization | |
2202 | vmovdqu $xa3,0x00($out) | |
2203 | vmovdqu $xb3,0x20($out) | |
2204 | vmovdqu $xc3,0x40($out) | |
2205 | vmovdqu $xd3,0x60($out) | |
2206 | lea 0x80($out),$out # size optimization | |
2207 | ||
2208 | sub \$64*8,$len | |
2209 | jnz .Loop_outer8x | |
2210 | ||
2211 | jmp .Ldone8x | |
2212 | ||
2213 | .Ltail8x: | |
2214 | cmp \$448,$len | |
2215 | jae .L448_or_more8x | |
2216 | cmp \$384,$len | |
2217 | jae .L384_or_more8x | |
2218 | cmp \$320,$len | |
2219 | jae .L320_or_more8x | |
2220 | cmp \$256,$len | |
2221 | jae .L256_or_more8x | |
2222 | cmp \$192,$len | |
2223 | jae .L192_or_more8x | |
2224 | cmp \$128,$len | |
2225 | jae .L128_or_more8x | |
2226 | cmp \$64,$len | |
2227 | jae .L64_or_more8x | |
2228 | ||
2229 | xor %r10,%r10 | |
2230 | vmovdqa $xa0,0x00(%rsp) | |
2231 | vmovdqa $xb0,0x20(%rsp) | |
2232 | jmp .Loop_tail8x | |
2233 | ||
2234 | .align 32 | |
2235 | .L64_or_more8x: | |
2236 | vpxor 0x00($inp),$xa0,$xa0 # xor with input | |
2237 | vpxor 0x20($inp),$xb0,$xb0 | |
2238 | vmovdqu $xa0,0x00($out) | |
2239 | vmovdqu $xb0,0x20($out) | |
2240 | je .Ldone8x | |
2241 | ||
2242 | lea 0x40($inp),$inp # inp+=64*1 | |
2243 | xor %r10,%r10 | |
2244 | vmovdqa $xc0,0x00(%rsp) | |
2245 | lea 0x40($out),$out # out+=64*1 | |
2246 | sub \$64,$len # len-=64*1 | |
2247 | vmovdqa $xd0,0x20(%rsp) | |
2248 | jmp .Loop_tail8x | |
2249 | ||
2250 | .align 32 | |
2251 | .L128_or_more8x: | |
2252 | vpxor 0x00($inp),$xa0,$xa0 # xor with input | |
2253 | vpxor 0x20($inp),$xb0,$xb0 | |
2254 | vpxor 0x40($inp),$xc0,$xc0 | |
2255 | vpxor 0x60($inp),$xd0,$xd0 | |
2256 | vmovdqu $xa0,0x00($out) | |
2257 | vmovdqu $xb0,0x20($out) | |
2258 | vmovdqu $xc0,0x40($out) | |
2259 | vmovdqu $xd0,0x60($out) | |
2260 | je .Ldone8x | |
2261 | ||
2262 | lea 0x80($inp),$inp # inp+=64*2 | |
2263 | xor %r10,%r10 | |
2264 | vmovdqa $xa1,0x00(%rsp) | |
2265 | lea 0x80($out),$out # out+=64*2 | |
2266 | sub \$128,$len # len-=64*2 | |
2267 | vmovdqa $xb1,0x20(%rsp) | |
2268 | jmp .Loop_tail8x | |
2269 | ||
2270 | .align 32 | |
2271 | .L192_or_more8x: | |
2272 | vpxor 0x00($inp),$xa0,$xa0 # xor with input | |
2273 | vpxor 0x20($inp),$xb0,$xb0 | |
2274 | vpxor 0x40($inp),$xc0,$xc0 | |
2275 | vpxor 0x60($inp),$xd0,$xd0 | |
2276 | vpxor 0x80($inp),$xa1,$xa1 | |
2277 | vpxor 0xa0($inp),$xb1,$xb1 | |
2278 | vmovdqu $xa0,0x00($out) | |
2279 | vmovdqu $xb0,0x20($out) | |
2280 | vmovdqu $xc0,0x40($out) | |
2281 | vmovdqu $xd0,0x60($out) | |
2282 | vmovdqu $xa1,0x80($out) | |
2283 | vmovdqu $xb1,0xa0($out) | |
2284 | je .Ldone8x | |
2285 | ||
2286 | lea 0xc0($inp),$inp # inp+=64*3 | |
2287 | xor %r10,%r10 | |
2288 | vmovdqa $xc1,0x00(%rsp) | |
2289 | lea 0xc0($out),$out # out+=64*3 | |
2290 | sub \$192,$len # len-=64*3 | |
2291 | vmovdqa $xd1,0x20(%rsp) | |
2292 | jmp .Loop_tail8x | |
2293 | ||
2294 | .align 32 | |
2295 | .L256_or_more8x: | |
2296 | vpxor 0x00($inp),$xa0,$xa0 # xor with input | |
2297 | vpxor 0x20($inp),$xb0,$xb0 | |
2298 | vpxor 0x40($inp),$xc0,$xc0 | |
2299 | vpxor 0x60($inp),$xd0,$xd0 | |
2300 | vpxor 0x80($inp),$xa1,$xa1 | |
2301 | vpxor 0xa0($inp),$xb1,$xb1 | |
2302 | vpxor 0xc0($inp),$xc1,$xc1 | |
2303 | vpxor 0xe0($inp),$xd1,$xd1 | |
2304 | vmovdqu $xa0,0x00($out) | |
2305 | vmovdqu $xb0,0x20($out) | |
2306 | vmovdqu $xc0,0x40($out) | |
2307 | vmovdqu $xd0,0x60($out) | |
2308 | vmovdqu $xa1,0x80($out) | |
2309 | vmovdqu $xb1,0xa0($out) | |
2310 | vmovdqu $xc1,0xc0($out) | |
2311 | vmovdqu $xd1,0xe0($out) | |
2312 | je .Ldone8x | |
2313 | ||
2314 | lea 0x100($inp),$inp # inp+=64*4 | |
2315 | xor %r10,%r10 | |
2316 | vmovdqa $xa2,0x00(%rsp) | |
2317 | lea 0x100($out),$out # out+=64*4 | |
2318 | sub \$256,$len # len-=64*4 | |
2319 | vmovdqa $xb2,0x20(%rsp) | |
2320 | jmp .Loop_tail8x | |
2321 | ||
2322 | .align 32 | |
2323 | .L320_or_more8x: | |
2324 | vpxor 0x00($inp),$xa0,$xa0 # xor with input | |
2325 | vpxor 0x20($inp),$xb0,$xb0 | |
2326 | vpxor 0x40($inp),$xc0,$xc0 | |
2327 | vpxor 0x60($inp),$xd0,$xd0 | |
2328 | vpxor 0x80($inp),$xa1,$xa1 | |
2329 | vpxor 0xa0($inp),$xb1,$xb1 | |
2330 | vpxor 0xc0($inp),$xc1,$xc1 | |
2331 | vpxor 0xe0($inp),$xd1,$xd1 | |
2332 | vpxor 0x100($inp),$xa2,$xa2 | |
2333 | vpxor 0x120($inp),$xb2,$xb2 | |
2334 | vmovdqu $xa0,0x00($out) | |
2335 | vmovdqu $xb0,0x20($out) | |
2336 | vmovdqu $xc0,0x40($out) | |
2337 | vmovdqu $xd0,0x60($out) | |
2338 | vmovdqu $xa1,0x80($out) | |
2339 | vmovdqu $xb1,0xa0($out) | |
2340 | vmovdqu $xc1,0xc0($out) | |
2341 | vmovdqu $xd1,0xe0($out) | |
2342 | vmovdqu $xa2,0x100($out) | |
2343 | vmovdqu $xb2,0x120($out) | |
2344 | je .Ldone8x | |
2345 | ||
2346 | lea 0x140($inp),$inp # inp+=64*5 | |
2347 | xor %r10,%r10 | |
2348 | vmovdqa $xc2,0x00(%rsp) | |
2349 | lea 0x140($out),$out # out+=64*5 | |
2350 | sub \$320,$len # len-=64*5 | |
2351 | vmovdqa $xd2,0x20(%rsp) | |
2352 | jmp .Loop_tail8x | |
2353 | ||
2354 | .align 32 | |
2355 | .L384_or_more8x: | |
2356 | vpxor 0x00($inp),$xa0,$xa0 # xor with input | |
2357 | vpxor 0x20($inp),$xb0,$xb0 | |
2358 | vpxor 0x40($inp),$xc0,$xc0 | |
2359 | vpxor 0x60($inp),$xd0,$xd0 | |
2360 | vpxor 0x80($inp),$xa1,$xa1 | |
2361 | vpxor 0xa0($inp),$xb1,$xb1 | |
2362 | vpxor 0xc0($inp),$xc1,$xc1 | |
2363 | vpxor 0xe0($inp),$xd1,$xd1 | |
2364 | vpxor 0x100($inp),$xa2,$xa2 | |
2365 | vpxor 0x120($inp),$xb2,$xb2 | |
2366 | vpxor 0x140($inp),$xc2,$xc2 | |
2367 | vpxor 0x160($inp),$xd2,$xd2 | |
2368 | vmovdqu $xa0,0x00($out) | |
2369 | vmovdqu $xb0,0x20($out) | |
2370 | vmovdqu $xc0,0x40($out) | |
2371 | vmovdqu $xd0,0x60($out) | |
2372 | vmovdqu $xa1,0x80($out) | |
2373 | vmovdqu $xb1,0xa0($out) | |
2374 | vmovdqu $xc1,0xc0($out) | |
2375 | vmovdqu $xd1,0xe0($out) | |
2376 | vmovdqu $xa2,0x100($out) | |
2377 | vmovdqu $xb2,0x120($out) | |
2378 | vmovdqu $xc2,0x140($out) | |
2379 | vmovdqu $xd2,0x160($out) | |
2380 | je .Ldone8x | |
2381 | ||
2382 | lea 0x180($inp),$inp # inp+=64*6 | |
2383 | xor %r10,%r10 | |
2384 | vmovdqa $xa3,0x00(%rsp) | |
2385 | lea 0x180($out),$out # out+=64*6 | |
2386 | sub \$384,$len # len-=64*6 | |
2387 | vmovdqa $xb3,0x20(%rsp) | |
2388 | jmp .Loop_tail8x | |
2389 | ||
2390 | .align 32 | |
2391 | .L448_or_more8x: | |
2392 | vpxor 0x00($inp),$xa0,$xa0 # xor with input | |
2393 | vpxor 0x20($inp),$xb0,$xb0 | |
2394 | vpxor 0x40($inp),$xc0,$xc0 | |
2395 | vpxor 0x60($inp),$xd0,$xd0 | |
2396 | vpxor 0x80($inp),$xa1,$xa1 | |
2397 | vpxor 0xa0($inp),$xb1,$xb1 | |
2398 | vpxor 0xc0($inp),$xc1,$xc1 | |
2399 | vpxor 0xe0($inp),$xd1,$xd1 | |
2400 | vpxor 0x100($inp),$xa2,$xa2 | |
2401 | vpxor 0x120($inp),$xb2,$xb2 | |
2402 | vpxor 0x140($inp),$xc2,$xc2 | |
2403 | vpxor 0x160($inp),$xd2,$xd2 | |
2404 | vpxor 0x180($inp),$xa3,$xa3 | |
2405 | vpxor 0x1a0($inp),$xb3,$xb3 | |
2406 | vmovdqu $xa0,0x00($out) | |
2407 | vmovdqu $xb0,0x20($out) | |
2408 | vmovdqu $xc0,0x40($out) | |
2409 | vmovdqu $xd0,0x60($out) | |
2410 | vmovdqu $xa1,0x80($out) | |
2411 | vmovdqu $xb1,0xa0($out) | |
2412 | vmovdqu $xc1,0xc0($out) | |
2413 | vmovdqu $xd1,0xe0($out) | |
2414 | vmovdqu $xa2,0x100($out) | |
2415 | vmovdqu $xb2,0x120($out) | |
2416 | vmovdqu $xc2,0x140($out) | |
2417 | vmovdqu $xd2,0x160($out) | |
2418 | vmovdqu $xa3,0x180($out) | |
2419 | vmovdqu $xb3,0x1a0($out) | |
2420 | je .Ldone8x | |
2421 | ||
2422 | lea 0x1c0($inp),$inp # inp+=64*7 | |
2423 | xor %r10,%r10 | |
2424 | vmovdqa $xc3,0x00(%rsp) | |
2425 | lea 0x1c0($out),$out # out+=64*7 | |
2426 | sub \$448,$len # len-=64*7 | |
2427 | vmovdqa $xd3,0x20(%rsp) | |
2428 | ||
2429 | .Loop_tail8x: | |
2430 | movzb ($inp,%r10),%eax | |
2431 | movzb (%rsp,%r10),%ecx | |
2432 | lea 1(%r10),%r10 | |
2433 | xor %ecx,%eax | |
2434 | mov %al,-1($out,%r10) | |
2435 | dec $len | |
2436 | jnz .Loop_tail8x | |
2437 | ||
2438 | .Ldone8x: | |
3c274a6e | 2439 | vzeroall |
a98c648e AP |
2440 | ___ |
2441 | $code.=<<___ if ($win64); | |
384e6de4 AP |
2442 | movaps -0xa8(%r9),%xmm6 |
2443 | movaps -0x98(%r9),%xmm7 | |
2444 | movaps -0x88(%r9),%xmm8 | |
2445 | movaps -0x78(%r9),%xmm9 | |
2446 | movaps -0x68(%r9),%xmm10 | |
2447 | movaps -0x58(%r9),%xmm11 | |
2448 | movaps -0x48(%r9),%xmm12 | |
2449 | movaps -0x38(%r9),%xmm13 | |
2450 | movaps -0x28(%r9),%xmm14 | |
2451 | movaps -0x18(%r9),%xmm15 | |
a98c648e AP |
2452 | ___ |
2453 | $code.=<<___; | |
384e6de4 | 2454 | lea (%r9),%rsp |
f17652e5 | 2455 | .cfi_def_cfa_register %rsp |
384e6de4 | 2456 | .L8x_epilogue: |
a98c648e | 2457 | ret |
f17652e5 | 2458 | .cfi_endproc |
a98c648e AP |
2459 | .size ChaCha20_8x,.-ChaCha20_8x |
2460 | ___ | |
2461 | } | |
2462 | ||
abb8c44f AP |
2463 | ######################################################################## |
2464 | # AVX512 code paths | |
2465 | if ($avx>2) { | |
3c274a6e AP |
2466 | # This one handles shorter inputs... |
2467 | ||
2468 | my ($a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz) = map("%zmm$_",(0..3,16..20)); | |
2469 | my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7)); | |
2470 | ||
cded9513 AP |
2471 | sub vpxord() # size optimization |
2472 | { my $opcode = "vpxor"; # adhere to vpxor when possible | |
2473 | ||
2474 | foreach (@_) { | |
2475 | if (/%([zy])mm([0-9]+)/ && ($1 eq "z" || $2>=16)) { | |
2476 | $opcode = "vpxord"; | |
2477 | last; | |
2478 | } | |
2479 | } | |
2480 | ||
2481 | $code .= "\t$opcode\t".join(',',reverse @_)."\n"; | |
2482 | } | |
2483 | ||
3c274a6e AP |
2484 | sub AVX512ROUND { # critical path is 14 "SIMD ticks" per round |
2485 | &vpaddd ($a,$a,$b); | |
2486 | &vpxord ($d,$d,$a); | |
2487 | &vprold ($d,$d,16); | |
2488 | ||
2489 | &vpaddd ($c,$c,$d); | |
2490 | &vpxord ($b,$b,$c); | |
2491 | &vprold ($b,$b,12); | |
2492 | ||
2493 | &vpaddd ($a,$a,$b); | |
2494 | &vpxord ($d,$d,$a); | |
2495 | &vprold ($d,$d,8); | |
2496 | ||
2497 | &vpaddd ($c,$c,$d); | |
2498 | &vpxord ($b,$b,$c); | |
2499 | &vprold ($b,$b,7); | |
2500 | } | |
2501 | ||
384e6de4 | 2502 | my $xframe = $win64 ? 32+8 : 8; |
3c274a6e AP |
2503 | |
2504 | $code.=<<___; | |
2505 | .type ChaCha20_avx512,\@function,5 | |
2506 | .align 32 | |
2507 | ChaCha20_avx512: | |
f17652e5 | 2508 | .cfi_startproc |
3c274a6e | 2509 | .LChaCha20_avx512: |
384e6de4 | 2510 | mov %rsp,%r9 # frame pointer |
f17652e5 | 2511 | .cfi_def_cfa_register %r9 |
3c274a6e AP |
2512 | cmp \$512,$len |
2513 | ja .LChaCha20_16x | |
2514 | ||
3c274a6e AP |
2515 | sub \$64+$xframe,%rsp |
2516 | ___ | |
2517 | $code.=<<___ if ($win64); | |
384e6de4 AP |
2518 | movaps %xmm6,-0x28(%r9) |
2519 | movaps %xmm7,-0x18(%r9) | |
2520 | .Lavx512_body: | |
3c274a6e AP |
2521 | ___ |
2522 | $code.=<<___; | |
2523 | vbroadcasti32x4 .Lsigma(%rip),$a | |
2524 | vbroadcasti32x4 ($key),$b | |
2525 | vbroadcasti32x4 16($key),$c | |
2526 | vbroadcasti32x4 ($counter),$d | |
2527 | ||
2528 | vmovdqa32 $a,$a_ | |
2529 | vmovdqa32 $b,$b_ | |
2530 | vmovdqa32 $c,$c_ | |
2531 | vpaddd .Lzeroz(%rip),$d,$d | |
2532 | vmovdqa32 .Lfourz(%rip),$fourz | |
2533 | mov \$10,$counter # reuse $counter | |
2534 | vmovdqa32 $d,$d_ | |
2535 | jmp .Loop_avx512 | |
2536 | ||
2537 | .align 16 | |
2538 | .Loop_outer_avx512: | |
2539 | vmovdqa32 $a_,$a | |
2540 | vmovdqa32 $b_,$b | |
2541 | vmovdqa32 $c_,$c | |
2542 | vpaddd $fourz,$d_,$d | |
2543 | mov \$10,$counter | |
2544 | vmovdqa32 $d,$d_ | |
2545 | jmp .Loop_avx512 | |
2546 | ||
2547 | .align 32 | |
2548 | .Loop_avx512: | |
2549 | ___ | |
2550 | &AVX512ROUND(); | |
2551 | &vpshufd ($c,$c,0b01001110); | |
2552 | &vpshufd ($b,$b,0b00111001); | |
2553 | &vpshufd ($d,$d,0b10010011); | |
2554 | ||
2555 | &AVX512ROUND(); | |
2556 | &vpshufd ($c,$c,0b01001110); | |
2557 | &vpshufd ($b,$b,0b10010011); | |
2558 | &vpshufd ($d,$d,0b00111001); | |
2559 | ||
2560 | &dec ($counter); | |
2561 | &jnz (".Loop_avx512"); | |
2562 | ||
2563 | $code.=<<___; | |
2564 | vpaddd $a_,$a,$a | |
2565 | vpaddd $b_,$b,$b | |
2566 | vpaddd $c_,$c,$c | |
2567 | vpaddd $d_,$d,$d | |
2568 | ||
2569 | sub \$64,$len | |
2570 | jb .Ltail64_avx512 | |
2571 | ||
2572 | vpxor 0x00($inp),%x#$a,$t0 # xor with input | |
2573 | vpxor 0x10($inp),%x#$b,$t1 | |
2574 | vpxor 0x20($inp),%x#$c,$t2 | |
2575 | vpxor 0x30($inp),%x#$d,$t3 | |
2576 | lea 0x40($inp),$inp # inp+=64 | |
2577 | ||
2578 | vmovdqu $t0,0x00($out) # write output | |
2579 | vmovdqu $t1,0x10($out) | |
2580 | vmovdqu $t2,0x20($out) | |
2581 | vmovdqu $t3,0x30($out) | |
2582 | lea 0x40($out),$out # out+=64 | |
2583 | ||
2584 | jz .Ldone_avx512 | |
2585 | ||
2586 | vextracti32x4 \$1,$a,$t0 | |
2587 | vextracti32x4 \$1,$b,$t1 | |
2588 | vextracti32x4 \$1,$c,$t2 | |
2589 | vextracti32x4 \$1,$d,$t3 | |
2590 | ||
2591 | sub \$64,$len | |
2592 | jb .Ltail_avx512 | |
2593 | ||
2594 | vpxor 0x00($inp),$t0,$t0 # xor with input | |
2595 | vpxor 0x10($inp),$t1,$t1 | |
2596 | vpxor 0x20($inp),$t2,$t2 | |
2597 | vpxor 0x30($inp),$t3,$t3 | |
2598 | lea 0x40($inp),$inp # inp+=64 | |
2599 | ||
2600 | vmovdqu $t0,0x00($out) # write output | |
2601 | vmovdqu $t1,0x10($out) | |
2602 | vmovdqu $t2,0x20($out) | |
2603 | vmovdqu $t3,0x30($out) | |
2604 | lea 0x40($out),$out # out+=64 | |
2605 | ||
2606 | jz .Ldone_avx512 | |
2607 | ||
2608 | vextracti32x4 \$2,$a,$t0 | |
2609 | vextracti32x4 \$2,$b,$t1 | |
2610 | vextracti32x4 \$2,$c,$t2 | |
2611 | vextracti32x4 \$2,$d,$t3 | |
2612 | ||
2613 | sub \$64,$len | |
2614 | jb .Ltail_avx512 | |
2615 | ||
2616 | vpxor 0x00($inp),$t0,$t0 # xor with input | |
2617 | vpxor 0x10($inp),$t1,$t1 | |
2618 | vpxor 0x20($inp),$t2,$t2 | |
2619 | vpxor 0x30($inp),$t3,$t3 | |
2620 | lea 0x40($inp),$inp # inp+=64 | |
2621 | ||
2622 | vmovdqu $t0,0x00($out) # write output | |
2623 | vmovdqu $t1,0x10($out) | |
2624 | vmovdqu $t2,0x20($out) | |
2625 | vmovdqu $t3,0x30($out) | |
2626 | lea 0x40($out),$out # out+=64 | |
2627 | ||
2628 | jz .Ldone_avx512 | |
2629 | ||
2630 | vextracti32x4 \$3,$a,$t0 | |
2631 | vextracti32x4 \$3,$b,$t1 | |
2632 | vextracti32x4 \$3,$c,$t2 | |
2633 | vextracti32x4 \$3,$d,$t3 | |
2634 | ||
2635 | sub \$64,$len | |
2636 | jb .Ltail_avx512 | |
2637 | ||
2638 | vpxor 0x00($inp),$t0,$t0 # xor with input | |
2639 | vpxor 0x10($inp),$t1,$t1 | |
2640 | vpxor 0x20($inp),$t2,$t2 | |
2641 | vpxor 0x30($inp),$t3,$t3 | |
2642 | lea 0x40($inp),$inp # inp+=64 | |
2643 | ||
2644 | vmovdqu $t0,0x00($out) # write output | |
2645 | vmovdqu $t1,0x10($out) | |
2646 | vmovdqu $t2,0x20($out) | |
2647 | vmovdqu $t3,0x30($out) | |
2648 | lea 0x40($out),$out # out+=64 | |
2649 | ||
2650 | jnz .Loop_outer_avx512 | |
2651 | ||
2652 | jmp .Ldone_avx512 | |
2653 | ||
2654 | .align 16 | |
2655 | .Ltail64_avx512: | |
2656 | vmovdqa %x#$a,0x00(%rsp) | |
2657 | vmovdqa %x#$b,0x10(%rsp) | |
2658 | vmovdqa %x#$c,0x20(%rsp) | |
2659 | vmovdqa %x#$d,0x30(%rsp) | |
2660 | add \$64,$len | |
2661 | jmp .Loop_tail_avx512 | |
2662 | ||
2663 | .align 16 | |
2664 | .Ltail_avx512: | |
2665 | vmovdqa $t0,0x00(%rsp) | |
2666 | vmovdqa $t1,0x10(%rsp) | |
2667 | vmovdqa $t2,0x20(%rsp) | |
2668 | vmovdqa $t3,0x30(%rsp) | |
2669 | add \$64,$len | |
2670 | ||
2671 | .Loop_tail_avx512: | |
2672 | movzb ($inp,$counter),%eax | |
2673 | movzb (%rsp,$counter),%ecx | |
2674 | lea 1($counter),$counter | |
2675 | xor %ecx,%eax | |
2676 | mov %al,-1($out,$counter) | |
2677 | dec $len | |
2678 | jnz .Loop_tail_avx512 | |
2679 | ||
47c9926a | 2680 | vmovdqu32 $a_,0x00(%rsp) |
3c274a6e AP |
2681 | |
2682 | .Ldone_avx512: | |
2683 | vzeroall | |
2684 | ___ | |
2685 | $code.=<<___ if ($win64); | |
384e6de4 AP |
2686 | movaps -0x28(%r9),%xmm6 |
2687 | movaps -0x18(%r9),%xmm7 | |
3c274a6e AP |
2688 | ___ |
2689 | $code.=<<___; | |
384e6de4 | 2690 | lea (%r9),%rsp |
f17652e5 | 2691 | .cfi_def_cfa_register %rsp |
384e6de4 | 2692 | .Lavx512_epilogue: |
3c274a6e | 2693 | ret |
f17652e5 | 2694 | .cfi_endproc |
3c274a6e AP |
2695 | .size ChaCha20_avx512,.-ChaCha20_avx512 |
2696 | ___ | |
cded9513 AP |
2697 | |
2698 | map(s/%z/%y/, $a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz); | |
2699 | ||
2700 | $code.=<<___; | |
2701 | .type ChaCha20_avx512vl,\@function,5 | |
2702 | .align 32 | |
2703 | ChaCha20_avx512vl: | |
2704 | .cfi_startproc | |
2705 | .LChaCha20_avx512vl: | |
2706 | mov %rsp,%r9 # frame pointer | |
2707 | .cfi_def_cfa_register %r9 | |
2708 | cmp \$128,$len | |
2709 | ja .LChaCha20_8xvl | |
2710 | ||
2711 | sub \$64+$xframe,%rsp | |
2712 | ___ | |
2713 | $code.=<<___ if ($win64); | |
2714 | movaps %xmm6,-0x28(%r9) | |
2715 | movaps %xmm7,-0x18(%r9) | |
2716 | .Lavx512vl_body: | |
2717 | ___ | |
2718 | $code.=<<___; | |
2719 | vbroadcasti128 .Lsigma(%rip),$a | |
2720 | vbroadcasti128 ($key),$b | |
2721 | vbroadcasti128 16($key),$c | |
2722 | vbroadcasti128 ($counter),$d | |
2723 | ||
2724 | vmovdqa32 $a,$a_ | |
2725 | vmovdqa32 $b,$b_ | |
2726 | vmovdqa32 $c,$c_ | |
2727 | vpaddd .Lzeroz(%rip),$d,$d | |
2728 | vmovdqa32 .Ltwoy(%rip),$fourz | |
2729 | mov \$10,$counter # reuse $counter | |
2730 | vmovdqa32 $d,$d_ | |
2731 | jmp .Loop_avx512vl | |
2732 | ||
2733 | .align 16 | |
2734 | .Loop_outer_avx512vl: | |
2735 | vmovdqa32 $c_,$c | |
2736 | vpaddd $fourz,$d_,$d | |
2737 | mov \$10,$counter | |
2738 | vmovdqa32 $d,$d_ | |
2739 | jmp .Loop_avx512vl | |
2740 | ||
2741 | .align 32 | |
2742 | .Loop_avx512vl: | |
2743 | ___ | |
2744 | &AVX512ROUND(); | |
2745 | &vpshufd ($c,$c,0b01001110); | |
2746 | &vpshufd ($b,$b,0b00111001); | |
2747 | &vpshufd ($d,$d,0b10010011); | |
2748 | ||
2749 | &AVX512ROUND(); | |
2750 | &vpshufd ($c,$c,0b01001110); | |
2751 | &vpshufd ($b,$b,0b10010011); | |
2752 | &vpshufd ($d,$d,0b00111001); | |
2753 | ||
2754 | &dec ($counter); | |
2755 | &jnz (".Loop_avx512vl"); | |
2756 | ||
2757 | $code.=<<___; | |
2758 | vpaddd $a_,$a,$a | |
2759 | vpaddd $b_,$b,$b | |
2760 | vpaddd $c_,$c,$c | |
2761 | vpaddd $d_,$d,$d | |
2762 | ||
2763 | sub \$64,$len | |
2764 | jb .Ltail64_avx512vl | |
2765 | ||
2766 | vpxor 0x00($inp),%x#$a,$t0 # xor with input | |
2767 | vpxor 0x10($inp),%x#$b,$t1 | |
2768 | vpxor 0x20($inp),%x#$c,$t2 | |
2769 | vpxor 0x30($inp),%x#$d,$t3 | |
2770 | lea 0x40($inp),$inp # inp+=64 | |
2771 | ||
2772 | vmovdqu $t0,0x00($out) # write output | |
2773 | vmovdqu $t1,0x10($out) | |
2774 | vmovdqu $t2,0x20($out) | |
2775 | vmovdqu $t3,0x30($out) | |
2776 | lea 0x40($out),$out # out+=64 | |
2777 | ||
2778 | jz .Ldone_avx512vl | |
2779 | ||
2780 | vextracti128 \$1,$a,$t0 | |
2781 | vextracti128 \$1,$b,$t1 | |
2782 | vextracti128 \$1,$c,$t2 | |
2783 | vextracti128 \$1,$d,$t3 | |
2784 | ||
2785 | sub \$64,$len | |
2786 | jb .Ltail_avx512vl | |
2787 | ||
2788 | vpxor 0x00($inp),$t0,$t0 # xor with input | |
2789 | vpxor 0x10($inp),$t1,$t1 | |
2790 | vpxor 0x20($inp),$t2,$t2 | |
2791 | vpxor 0x30($inp),$t3,$t3 | |
2792 | lea 0x40($inp),$inp # inp+=64 | |
2793 | ||
2794 | vmovdqu $t0,0x00($out) # write output | |
2795 | vmovdqu $t1,0x10($out) | |
2796 | vmovdqu $t2,0x20($out) | |
2797 | vmovdqu $t3,0x30($out) | |
2798 | lea 0x40($out),$out # out+=64 | |
2799 | ||
2800 | vmovdqa32 $a_,$a | |
2801 | vmovdqa32 $b_,$b | |
2802 | jnz .Loop_outer_avx512vl | |
2803 | ||
2804 | jmp .Ldone_avx512vl | |
2805 | ||
2806 | .align 16 | |
2807 | .Ltail64_avx512vl: | |
2808 | vmovdqa %x#$a,0x00(%rsp) | |
2809 | vmovdqa %x#$b,0x10(%rsp) | |
2810 | vmovdqa %x#$c,0x20(%rsp) | |
2811 | vmovdqa %x#$d,0x30(%rsp) | |
2812 | add \$64,$len | |
2813 | jmp .Loop_tail_avx512vl | |
2814 | ||
2815 | .align 16 | |
2816 | .Ltail_avx512vl: | |
2817 | vmovdqa $t0,0x00(%rsp) | |
2818 | vmovdqa $t1,0x10(%rsp) | |
2819 | vmovdqa $t2,0x20(%rsp) | |
2820 | vmovdqa $t3,0x30(%rsp) | |
2821 | add \$64,$len | |
2822 | ||
2823 | .Loop_tail_avx512vl: | |
2824 | movzb ($inp,$counter),%eax | |
2825 | movzb (%rsp,$counter),%ecx | |
2826 | lea 1($counter),$counter | |
2827 | xor %ecx,%eax | |
2828 | mov %al,-1($out,$counter) | |
2829 | dec $len | |
2830 | jnz .Loop_tail_avx512vl | |
2831 | ||
2832 | vmovdqu32 $a_,0x00(%rsp) | |
2833 | vmovdqu32 $a_,0x20(%rsp) | |
2834 | ||
2835 | .Ldone_avx512vl: | |
2836 | vzeroall | |
2837 | ___ | |
2838 | $code.=<<___ if ($win64); | |
2839 | movaps -0x28(%r9),%xmm6 | |
2840 | movaps -0x18(%r9),%xmm7 | |
2841 | ___ | |
2842 | $code.=<<___; | |
2843 | lea (%r9),%rsp | |
2844 | .cfi_def_cfa_register %rsp | |
2845 | .Lavx512vl_epilogue: | |
2846 | ret | |
2847 | .cfi_endproc | |
2848 | .size ChaCha20_avx512vl,.-ChaCha20_avx512vl | |
2849 | ___ | |
3c274a6e AP |
2850 | } |
2851 | if ($avx>2) { | |
2852 | # This one handles longer inputs... | |
2853 | ||
abb8c44f AP |
2854 | my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, |
2855 | $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%zmm$_",(0..15)); | |
2856 | my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, | |
2857 | $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3); | |
2858 | my @key=map("%zmm$_",(16..31)); | |
2859 | my ($xt0,$xt1,$xt2,$xt3)=@key[0..3]; | |
2860 | ||
2861 | sub AVX512_lane_ROUND { | |
2862 | my ($a0,$b0,$c0,$d0)=@_; | |
2863 | my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); | |
2864 | my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); | |
2865 | my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); | |
2866 | my @x=map("\"$_\"",@xx); | |
2867 | ||
2868 | ( | |
2869 | "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1 | |
2870 | "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2 | |
2871 | "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3 | |
2872 | "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4 | |
2873 | "&vpxord (@x[$d0],@x[$d0],@x[$a0])", | |
2874 | "&vpxord (@x[$d1],@x[$d1],@x[$a1])", | |
2875 | "&vpxord (@x[$d2],@x[$d2],@x[$a2])", | |
2876 | "&vpxord (@x[$d3],@x[$d3],@x[$a3])", | |
2877 | "&vprold (@x[$d0],@x[$d0],16)", | |
2878 | "&vprold (@x[$d1],@x[$d1],16)", | |
2879 | "&vprold (@x[$d2],@x[$d2],16)", | |
2880 | "&vprold (@x[$d3],@x[$d3],16)", | |
2881 | ||
2882 | "&vpaddd (@x[$c0],@x[$c0],@x[$d0])", | |
2883 | "&vpaddd (@x[$c1],@x[$c1],@x[$d1])", | |
2884 | "&vpaddd (@x[$c2],@x[$c2],@x[$d2])", | |
2885 | "&vpaddd (@x[$c3],@x[$c3],@x[$d3])", | |
2886 | "&vpxord (@x[$b0],@x[$b0],@x[$c0])", | |
2887 | "&vpxord (@x[$b1],@x[$b1],@x[$c1])", | |
2888 | "&vpxord (@x[$b2],@x[$b2],@x[$c2])", | |
2889 | "&vpxord (@x[$b3],@x[$b3],@x[$c3])", | |
2890 | "&vprold (@x[$b0],@x[$b0],12)", | |
2891 | "&vprold (@x[$b1],@x[$b1],12)", | |
2892 | "&vprold (@x[$b2],@x[$b2],12)", | |
2893 | "&vprold (@x[$b3],@x[$b3],12)", | |
2894 | ||
2895 | "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", | |
2896 | "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", | |
2897 | "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", | |
2898 | "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", | |
2899 | "&vpxord (@x[$d0],@x[$d0],@x[$a0])", | |
2900 | "&vpxord (@x[$d1],@x[$d1],@x[$a1])", | |
2901 | "&vpxord (@x[$d2],@x[$d2],@x[$a2])", | |
2902 | "&vpxord (@x[$d3],@x[$d3],@x[$a3])", | |
2903 | "&vprold (@x[$d0],@x[$d0],8)", | |
2904 | "&vprold (@x[$d1],@x[$d1],8)", | |
2905 | "&vprold (@x[$d2],@x[$d2],8)", | |
2906 | "&vprold (@x[$d3],@x[$d3],8)", | |
2907 | ||
2908 | "&vpaddd (@x[$c0],@x[$c0],@x[$d0])", | |
2909 | "&vpaddd (@x[$c1],@x[$c1],@x[$d1])", | |
2910 | "&vpaddd (@x[$c2],@x[$c2],@x[$d2])", | |
2911 | "&vpaddd (@x[$c3],@x[$c3],@x[$d3])", | |
2912 | "&vpxord (@x[$b0],@x[$b0],@x[$c0])", | |
2913 | "&vpxord (@x[$b1],@x[$b1],@x[$c1])", | |
2914 | "&vpxord (@x[$b2],@x[$b2],@x[$c2])", | |
2915 | "&vpxord (@x[$b3],@x[$b3],@x[$c3])", | |
2916 | "&vprold (@x[$b0],@x[$b0],7)", | |
2917 | "&vprold (@x[$b1],@x[$b1],7)", | |
2918 | "&vprold (@x[$b2],@x[$b2],7)", | |
2919 | "&vprold (@x[$b3],@x[$b3],7)" | |
2920 | ); | |
2921 | } | |
2922 | ||
384e6de4 | 2923 | my $xframe = $win64 ? 0xa8 : 8; |
abb8c44f AP |
2924 | |
2925 | $code.=<<___; | |
2926 | .type ChaCha20_16x,\@function,5 | |
2927 | .align 32 | |
2928 | ChaCha20_16x: | |
f17652e5 | 2929 | .cfi_startproc |
abb8c44f | 2930 | .LChaCha20_16x: |
384e6de4 | 2931 | mov %rsp,%r9 # frame register |
f17652e5 | 2932 | .cfi_def_cfa_register %r9 |
abb8c44f AP |
2933 | sub \$64+$xframe,%rsp |
2934 | and \$-64,%rsp | |
2935 | ___ | |
2936 | $code.=<<___ if ($win64); | |
384e6de4 AP |
2937 | movaps %xmm6,-0xa8(%r9) |
2938 | movaps %xmm7,-0x98(%r9) | |
2939 | movaps %xmm8,-0x88(%r9) | |
2940 | movaps %xmm9,-0x78(%r9) | |
2941 | movaps %xmm10,-0x68(%r9) | |
2942 | movaps %xmm11,-0x58(%r9) | |
2943 | movaps %xmm12,-0x48(%r9) | |
2944 | movaps %xmm13,-0x38(%r9) | |
2945 | movaps %xmm14,-0x28(%r9) | |
2946 | movaps %xmm15,-0x18(%r9) | |
2947 | .L16x_body: | |
abb8c44f AP |
2948 | ___ |
2949 | $code.=<<___; | |
2950 | vzeroupper | |
2951 | ||
2952 | lea .Lsigma(%rip),%r10 | |
2953 | vbroadcasti32x4 (%r10),$xa3 # key[0] | |
2954 | vbroadcasti32x4 ($key),$xb3 # key[1] | |
2955 | vbroadcasti32x4 16($key),$xc3 # key[2] | |
2956 | vbroadcasti32x4 ($counter),$xd3 # key[3] | |
2957 | ||
2958 | vpshufd \$0x00,$xa3,$xa0 # smash key by lanes... | |
2959 | vpshufd \$0x55,$xa3,$xa1 | |
2960 | vpshufd \$0xaa,$xa3,$xa2 | |
2961 | vpshufd \$0xff,$xa3,$xa3 | |
2962 | vmovdqa64 $xa0,@key[0] | |
2963 | vmovdqa64 $xa1,@key[1] | |
2964 | vmovdqa64 $xa2,@key[2] | |
2965 | vmovdqa64 $xa3,@key[3] | |
2966 | ||
2967 | vpshufd \$0x00,$xb3,$xb0 | |
2968 | vpshufd \$0x55,$xb3,$xb1 | |
2969 | vpshufd \$0xaa,$xb3,$xb2 | |
2970 | vpshufd \$0xff,$xb3,$xb3 | |
2971 | vmovdqa64 $xb0,@key[4] | |
2972 | vmovdqa64 $xb1,@key[5] | |
2973 | vmovdqa64 $xb2,@key[6] | |
2974 | vmovdqa64 $xb3,@key[7] | |
2975 | ||
2976 | vpshufd \$0x00,$xc3,$xc0 | |
2977 | vpshufd \$0x55,$xc3,$xc1 | |
2978 | vpshufd \$0xaa,$xc3,$xc2 | |
2979 | vpshufd \$0xff,$xc3,$xc3 | |
2980 | vmovdqa64 $xc0,@key[8] | |
2981 | vmovdqa64 $xc1,@key[9] | |
2982 | vmovdqa64 $xc2,@key[10] | |
2983 | vmovdqa64 $xc3,@key[11] | |
2984 | ||
2985 | vpshufd \$0x00,$xd3,$xd0 | |
2986 | vpshufd \$0x55,$xd3,$xd1 | |
2987 | vpshufd \$0xaa,$xd3,$xd2 | |
2988 | vpshufd \$0xff,$xd3,$xd3 | |
2989 | vpaddd .Lincz(%rip),$xd0,$xd0 # don't save counters yet | |
2990 | vmovdqa64 $xd0,@key[12] | |
2991 | vmovdqa64 $xd1,@key[13] | |
2992 | vmovdqa64 $xd2,@key[14] | |
2993 | vmovdqa64 $xd3,@key[15] | |
2994 | ||
2995 | mov \$10,%eax | |
2996 | jmp .Loop16x | |
2997 | ||
2998 | .align 32 | |
2999 | .Loop_outer16x: | |
3000 | vpbroadcastd 0(%r10),$xa0 # reload key | |
3001 | vpbroadcastd 4(%r10),$xa1 | |
3002 | vpbroadcastd 8(%r10),$xa2 | |
3003 | vpbroadcastd 12(%r10),$xa3 | |
3004 | vpaddd .Lsixteen(%rip),@key[12],@key[12] # next SIMD counters | |
3005 | vmovdqa64 @key[4],$xb0 | |
3006 | vmovdqa64 @key[5],$xb1 | |
3007 | vmovdqa64 @key[6],$xb2 | |
3008 | vmovdqa64 @key[7],$xb3 | |
3009 | vmovdqa64 @key[8],$xc0 | |
3010 | vmovdqa64 @key[9],$xc1 | |
3011 | vmovdqa64 @key[10],$xc2 | |
3012 | vmovdqa64 @key[11],$xc3 | |
3013 | vmovdqa64 @key[12],$xd0 | |
3014 | vmovdqa64 @key[13],$xd1 | |
3015 | vmovdqa64 @key[14],$xd2 | |
3016 | vmovdqa64 @key[15],$xd3 | |
3017 | ||
3018 | vmovdqa64 $xa0,@key[0] | |
3019 | vmovdqa64 $xa1,@key[1] | |
3020 | vmovdqa64 $xa2,@key[2] | |
3021 | vmovdqa64 $xa3,@key[3] | |
3022 | ||
3023 | mov \$10,%eax | |
3024 | jmp .Loop16x | |
3025 | ||
3026 | .align 32 | |
3027 | .Loop16x: | |
3028 | ___ | |
3029 | foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; } | |
3030 | foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; } | |
3031 | $code.=<<___; | |
3032 | dec %eax | |
3033 | jnz .Loop16x | |
3034 | ||
3035 | vpaddd @key[0],$xa0,$xa0 # accumulate key | |
3036 | vpaddd @key[1],$xa1,$xa1 | |
3037 | vpaddd @key[2],$xa2,$xa2 | |
3038 | vpaddd @key[3],$xa3,$xa3 | |
3039 | ||
3040 | vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data | |
3041 | vpunpckldq $xa3,$xa2,$xt3 | |
3042 | vpunpckhdq $xa1,$xa0,$xa0 | |
3043 | vpunpckhdq $xa3,$xa2,$xa2 | |
3044 | vpunpcklqdq $xt3,$xt2,$xa1 # "a0" | |
3045 | vpunpckhqdq $xt3,$xt2,$xt2 # "a1" | |
3046 | vpunpcklqdq $xa2,$xa0,$xa3 # "a2" | |
3047 | vpunpckhqdq $xa2,$xa0,$xa0 # "a3" | |
3048 | ___ | |
3049 | ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); | |
3050 | $code.=<<___; | |
3051 | vpaddd @key[4],$xb0,$xb0 | |
3052 | vpaddd @key[5],$xb1,$xb1 | |
3053 | vpaddd @key[6],$xb2,$xb2 | |
3054 | vpaddd @key[7],$xb3,$xb3 | |
3055 | ||
3056 | vpunpckldq $xb1,$xb0,$xt2 | |
3057 | vpunpckldq $xb3,$xb2,$xt3 | |
3058 | vpunpckhdq $xb1,$xb0,$xb0 | |
3059 | vpunpckhdq $xb3,$xb2,$xb2 | |
3060 | vpunpcklqdq $xt3,$xt2,$xb1 # "b0" | |
3061 | vpunpckhqdq $xt3,$xt2,$xt2 # "b1" | |
3062 | vpunpcklqdq $xb2,$xb0,$xb3 # "b2" | |
3063 | vpunpckhqdq $xb2,$xb0,$xb0 # "b3" | |
3064 | ___ | |
3065 | ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); | |
3066 | $code.=<<___; | |
3067 | vshufi32x4 \$0x44,$xb0,$xa0,$xt3 # "de-interlace" further | |
3068 | vshufi32x4 \$0xee,$xb0,$xa0,$xb0 | |
3069 | vshufi32x4 \$0x44,$xb1,$xa1,$xa0 | |
3070 | vshufi32x4 \$0xee,$xb1,$xa1,$xb1 | |
3071 | vshufi32x4 \$0x44,$xb2,$xa2,$xa1 | |
3072 | vshufi32x4 \$0xee,$xb2,$xa2,$xb2 | |
3073 | vshufi32x4 \$0x44,$xb3,$xa3,$xa2 | |
3074 | vshufi32x4 \$0xee,$xb3,$xa3,$xb3 | |
3075 | ___ | |
3076 | ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3); | |
3077 | $code.=<<___; | |
3078 | vpaddd @key[8],$xc0,$xc0 | |
3079 | vpaddd @key[9],$xc1,$xc1 | |
3080 | vpaddd @key[10],$xc2,$xc2 | |
3081 | vpaddd @key[11],$xc3,$xc3 | |
3082 | ||
3083 | vpunpckldq $xc1,$xc0,$xt2 | |
3084 | vpunpckldq $xc3,$xc2,$xt3 | |
3085 | vpunpckhdq $xc1,$xc0,$xc0 | |
3086 | vpunpckhdq $xc3,$xc2,$xc2 | |
3087 | vpunpcklqdq $xt3,$xt2,$xc1 # "c0" | |
3088 | vpunpckhqdq $xt3,$xt2,$xt2 # "c1" | |
3089 | vpunpcklqdq $xc2,$xc0,$xc3 # "c2" | |
3090 | vpunpckhqdq $xc2,$xc0,$xc0 # "c3" | |
3091 | ___ | |
3092 | ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); | |
3093 | $code.=<<___; | |
3094 | vpaddd @key[12],$xd0,$xd0 | |
3095 | vpaddd @key[13],$xd1,$xd1 | |
3096 | vpaddd @key[14],$xd2,$xd2 | |
3097 | vpaddd @key[15],$xd3,$xd3 | |
3098 | ||
3099 | vpunpckldq $xd1,$xd0,$xt2 | |
3100 | vpunpckldq $xd3,$xd2,$xt3 | |
3101 | vpunpckhdq $xd1,$xd0,$xd0 | |
3102 | vpunpckhdq $xd3,$xd2,$xd2 | |
3103 | vpunpcklqdq $xt3,$xt2,$xd1 # "d0" | |
3104 | vpunpckhqdq $xt3,$xt2,$xt2 # "d1" | |
3105 | vpunpcklqdq $xd2,$xd0,$xd3 # "d2" | |
3106 | vpunpckhqdq $xd2,$xd0,$xd0 # "d3" | |
3107 | ___ | |
3108 | ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); | |
3109 | $code.=<<___; | |
3110 | vshufi32x4 \$0x44,$xd0,$xc0,$xt3 # "de-interlace" further | |
3111 | vshufi32x4 \$0xee,$xd0,$xc0,$xd0 | |
3112 | vshufi32x4 \$0x44,$xd1,$xc1,$xc0 | |
3113 | vshufi32x4 \$0xee,$xd1,$xc1,$xd1 | |
3114 | vshufi32x4 \$0x44,$xd2,$xc2,$xc1 | |
3115 | vshufi32x4 \$0xee,$xd2,$xc2,$xd2 | |
3116 | vshufi32x4 \$0x44,$xd3,$xc3,$xc2 | |
3117 | vshufi32x4 \$0xee,$xd3,$xc3,$xd3 | |
3118 | ___ | |
3119 | ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3); | |
3120 | $code.=<<___; | |
3121 | vshufi32x4 \$0x88,$xc0,$xa0,$xt0 # "de-interlace" further | |
3122 | vshufi32x4 \$0xdd,$xc0,$xa0,$xa0 | |
3123 | vshufi32x4 \$0x88,$xd0,$xb0,$xc0 | |
3124 | vshufi32x4 \$0xdd,$xd0,$xb0,$xd0 | |
3125 | vshufi32x4 \$0x88,$xc1,$xa1,$xt1 | |
3126 | vshufi32x4 \$0xdd,$xc1,$xa1,$xa1 | |
3127 | vshufi32x4 \$0x88,$xd1,$xb1,$xc1 | |
3128 | vshufi32x4 \$0xdd,$xd1,$xb1,$xd1 | |
3129 | vshufi32x4 \$0x88,$xc2,$xa2,$xt2 | |
3130 | vshufi32x4 \$0xdd,$xc2,$xa2,$xa2 | |
3131 | vshufi32x4 \$0x88,$xd2,$xb2,$xc2 | |
3132 | vshufi32x4 \$0xdd,$xd2,$xb2,$xd2 | |
3133 | vshufi32x4 \$0x88,$xc3,$xa3,$xt3 | |
3134 | vshufi32x4 \$0xdd,$xc3,$xa3,$xa3 | |
3135 | vshufi32x4 \$0x88,$xd3,$xb3,$xc3 | |
3136 | vshufi32x4 \$0xdd,$xd3,$xb3,$xd3 | |
3137 | ___ | |
3138 | ($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3)= | |
3139 | ($xt0,$xt1,$xt2,$xt3,$xa0,$xa1,$xa2,$xa3); | |
3140 | ||
3141 | ($xa0,$xb0,$xc0,$xd0, $xa1,$xb1,$xc1,$xd1, | |
3142 | $xa2,$xb2,$xc2,$xd2, $xa3,$xb3,$xc3,$xd3) = | |
3143 | ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, | |
3144 | $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3); | |
3145 | $code.=<<___; | |
3146 | cmp \$64*16,$len | |
3147 | jb .Ltail16x | |
3148 | ||
3149 | vpxord 0x00($inp),$xa0,$xa0 # xor with input | |
3150 | vpxord 0x40($inp),$xb0,$xb0 | |
3151 | vpxord 0x80($inp),$xc0,$xc0 | |
3152 | vpxord 0xc0($inp),$xd0,$xd0 | |
3153 | vmovdqu32 $xa0,0x00($out) | |
3154 | vmovdqu32 $xb0,0x40($out) | |
3155 | vmovdqu32 $xc0,0x80($out) | |
3156 | vmovdqu32 $xd0,0xc0($out) | |
3157 | ||
3158 | vpxord 0x100($inp),$xa1,$xa1 | |
3159 | vpxord 0x140($inp),$xb1,$xb1 | |
3160 | vpxord 0x180($inp),$xc1,$xc1 | |
3161 | vpxord 0x1c0($inp),$xd1,$xd1 | |
3162 | vmovdqu32 $xa1,0x100($out) | |
3163 | vmovdqu32 $xb1,0x140($out) | |
3164 | vmovdqu32 $xc1,0x180($out) | |
3165 | vmovdqu32 $xd1,0x1c0($out) | |
3166 | ||
3167 | vpxord 0x200($inp),$xa2,$xa2 | |
3168 | vpxord 0x240($inp),$xb2,$xb2 | |
3169 | vpxord 0x280($inp),$xc2,$xc2 | |
3170 | vpxord 0x2c0($inp),$xd2,$xd2 | |
3171 | vmovdqu32 $xa2,0x200($out) | |
3172 | vmovdqu32 $xb2,0x240($out) | |
3173 | vmovdqu32 $xc2,0x280($out) | |
3174 | vmovdqu32 $xd2,0x2c0($out) | |
3175 | ||
3176 | vpxord 0x300($inp),$xa3,$xa3 | |
3177 | vpxord 0x340($inp),$xb3,$xb3 | |
3178 | vpxord 0x380($inp),$xc3,$xc3 | |
3179 | vpxord 0x3c0($inp),$xd3,$xd3 | |
3180 | lea 0x400($inp),$inp | |
3181 | vmovdqu32 $xa3,0x300($out) | |
3182 | vmovdqu32 $xb3,0x340($out) | |
3183 | vmovdqu32 $xc3,0x380($out) | |
3184 | vmovdqu32 $xd3,0x3c0($out) | |
3185 | lea 0x400($out),$out | |
3186 | ||
3187 | sub \$64*16,$len | |
3188 | jnz .Loop_outer16x | |
3189 | ||
3190 | jmp .Ldone16x | |
3191 | ||
3192 | .align 32 | |
3193 | .Ltail16x: | |
3194 | xor %r10,%r10 | |
3195 | sub $inp,$out | |
3196 | cmp \$64*1,$len | |
3197 | jb .Less_than_64_16x | |
3198 | vpxord ($inp),$xa0,$xa0 # xor with input | |
3199 | vmovdqu32 $xa0,($out,$inp) | |
3200 | je .Ldone16x | |
3201 | vmovdqa32 $xb0,$xa0 | |
3202 | lea 64($inp),$inp | |
3203 | ||
3204 | cmp \$64*2,$len | |
3205 | jb .Less_than_64_16x | |
3206 | vpxord ($inp),$xb0,$xb0 | |
3207 | vmovdqu32 $xb0,($out,$inp) | |
3208 | je .Ldone16x | |
3209 | vmovdqa32 $xc0,$xa0 | |
3210 | lea 64($inp),$inp | |
3211 | ||
3212 | cmp \$64*3,$len | |
3213 | jb .Less_than_64_16x | |
3214 | vpxord ($inp),$xc0,$xc0 | |
3215 | vmovdqu32 $xc0,($out,$inp) | |
3216 | je .Ldone16x | |
3217 | vmovdqa32 $xd0,$xa0 | |
3218 | lea 64($inp),$inp | |
3219 | ||
3220 | cmp \$64*4,$len | |
3221 | jb .Less_than_64_16x | |
3222 | vpxord ($inp),$xd0,$xd0 | |
3223 | vmovdqu32 $xd0,($out,$inp) | |
3224 | je .Ldone16x | |
3225 | vmovdqa32 $xa1,$xa0 | |
3226 | lea 64($inp),$inp | |
3227 | ||
3228 | cmp \$64*5,$len | |
3229 | jb .Less_than_64_16x | |
3230 | vpxord ($inp),$xa1,$xa1 | |
3231 | vmovdqu32 $xa1,($out,$inp) | |
3232 | je .Ldone16x | |
3233 | vmovdqa32 $xb1,$xa0 | |
3234 | lea 64($inp),$inp | |
3235 | ||
3236 | cmp \$64*6,$len | |
3237 | jb .Less_than_64_16x | |
3238 | vpxord ($inp),$xb1,$xb1 | |
3239 | vmovdqu32 $xb1,($out,$inp) | |
3240 | je .Ldone16x | |
3241 | vmovdqa32 $xc1,$xa0 | |
3242 | lea 64($inp),$inp | |
3243 | ||
3244 | cmp \$64*7,$len | |
3245 | jb .Less_than_64_16x | |
3246 | vpxord ($inp),$xc1,$xc1 | |
3247 | vmovdqu32 $xc1,($out,$inp) | |
3248 | je .Ldone16x | |
3249 | vmovdqa32 $xd1,$xa0 | |
3250 | lea 64($inp),$inp | |
3251 | ||
3252 | cmp \$64*8,$len | |
3253 | jb .Less_than_64_16x | |
3254 | vpxord ($inp),$xd1,$xd1 | |
3255 | vmovdqu32 $xd1,($out,$inp) | |
3256 | je .Ldone16x | |
3257 | vmovdqa32 $xa2,$xa0 | |
3258 | lea 64($inp),$inp | |
3259 | ||
3260 | cmp \$64*9,$len | |
3261 | jb .Less_than_64_16x | |
3262 | vpxord ($inp),$xa2,$xa2 | |
3263 | vmovdqu32 $xa2,($out,$inp) | |
3264 | je .Ldone16x | |
3265 | vmovdqa32 $xb2,$xa0 | |
3266 | lea 64($inp),$inp | |
3267 | ||
3268 | cmp \$64*10,$len | |
3269 | jb .Less_than_64_16x | |
3270 | vpxord ($inp),$xb2,$xb2 | |
3271 | vmovdqu32 $xb2,($out,$inp) | |
3272 | je .Ldone16x | |
3273 | vmovdqa32 $xc2,$xa0 | |
3274 | lea 64($inp),$inp | |
3275 | ||
3276 | cmp \$64*11,$len | |
3277 | jb .Less_than_64_16x | |
3278 | vpxord ($inp),$xc2,$xc2 | |
3279 | vmovdqu32 $xc2,($out,$inp) | |
3280 | je .Ldone16x | |
3281 | vmovdqa32 $xd2,$xa0 | |
3282 | lea 64($inp),$inp | |
3283 | ||
3284 | cmp \$64*12,$len | |
3285 | jb .Less_than_64_16x | |
3286 | vpxord ($inp),$xd2,$xd2 | |
3287 | vmovdqu32 $xd2,($out,$inp) | |
3288 | je .Ldone16x | |
3289 | vmovdqa32 $xa3,$xa0 | |
3290 | lea 64($inp),$inp | |
3291 | ||
3292 | cmp \$64*13,$len | |
3293 | jb .Less_than_64_16x | |
3294 | vpxord ($inp),$xa3,$xa3 | |
3295 | vmovdqu32 $xa3,($out,$inp) | |
3296 | je .Ldone16x | |
3297 | vmovdqa32 $xb3,$xa0 | |
3298 | lea 64($inp),$inp | |
3299 | ||
3300 | cmp \$64*14,$len | |
3301 | jb .Less_than_64_16x | |
3302 | vpxord ($inp),$xb3,$xb3 | |
3303 | vmovdqu32 $xb3,($out,$inp) | |
3304 | je .Ldone16x | |
3305 | vmovdqa32 $xc3,$xa0 | |
3306 | lea 64($inp),$inp | |
3307 | ||
3308 | cmp \$64*15,$len | |
3309 | jb .Less_than_64_16x | |
3310 | vpxord ($inp),$xc3,$xc3 | |
3311 | vmovdqu32 $xc3,($out,$inp) | |
3312 | je .Ldone16x | |
3313 | vmovdqa32 $xd3,$xa0 | |
3314 | lea 64($inp),$inp | |
3315 | ||
3316 | .Less_than_64_16x: | |
3317 | vmovdqa32 $xa0,0x00(%rsp) | |
3318 | lea ($out,$inp),$out | |
3319 | and \$63,$len | |
3320 | ||
3321 | .Loop_tail16x: | |
3322 | movzb ($inp,%r10),%eax | |
3323 | movzb (%rsp,%r10),%ecx | |
3324 | lea 1(%r10),%r10 | |
3325 | xor %ecx,%eax | |
3326 | mov %al,-1($out,%r10) | |
3327 | dec $len | |
3328 | jnz .Loop_tail16x | |
3329 | ||
3c274a6e AP |
3330 | vpxord $xa0,$xa0,$xa0 |
3331 | vmovdqa32 $xa0,0(%rsp) | |
3332 | ||
abb8c44f | 3333 | .Ldone16x: |
3c274a6e | 3334 | vzeroall |
abb8c44f AP |
3335 | ___ |
3336 | $code.=<<___ if ($win64); | |
384e6de4 AP |
3337 | movaps -0xa8(%r9),%xmm6 |
3338 | movaps -0x98(%r9),%xmm7 | |
3339 | movaps -0x88(%r9),%xmm8 | |
3340 | movaps -0x78(%r9),%xmm9 | |
3341 | movaps -0x68(%r9),%xmm10 | |
3342 | movaps -0x58(%r9),%xmm11 | |
3343 | movaps -0x48(%r9),%xmm12 | |
3344 | movaps -0x38(%r9),%xmm13 | |
3345 | movaps -0x28(%r9),%xmm14 | |
3346 | movaps -0x18(%r9),%xmm15 | |
abb8c44f AP |
3347 | ___ |
3348 | $code.=<<___; | |
384e6de4 | 3349 | lea (%r9),%rsp |
f17652e5 | 3350 | .cfi_def_cfa_register %rsp |
384e6de4 | 3351 | .L16x_epilogue: |
abb8c44f | 3352 | ret |
f17652e5 | 3353 | .cfi_endproc |
abb8c44f AP |
3354 | .size ChaCha20_16x,.-ChaCha20_16x |
3355 | ___ | |
cded9513 AP |
3356 | |
3357 | # switch to %ymm domain | |
3358 | ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, | |
3359 | $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%ymm$_",(0..15)); | |
3360 | @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, | |
3361 | $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3); | |
3362 | @key=map("%ymm$_",(16..31)); | |
3363 | ($xt0,$xt1,$xt2,$xt3)=@key[0..3]; | |
3364 | ||
3365 | $code.=<<___; | |
3366 | .type ChaCha20_8xvl,\@function,5 | |
3367 | .align 32 | |
3368 | ChaCha20_8xvl: | |
3369 | .cfi_startproc | |
3370 | .LChaCha20_8xvl: | |
3371 | mov %rsp,%r9 # frame register | |
3372 | .cfi_def_cfa_register %r9 | |
3373 | sub \$64+$xframe,%rsp | |
3374 | and \$-64,%rsp | |
3375 | ___ | |
3376 | $code.=<<___ if ($win64); | |
3377 | movaps %xmm6,-0xa8(%r9) | |
3378 | movaps %xmm7,-0x98(%r9) | |
3379 | movaps %xmm8,-0x88(%r9) | |
3380 | movaps %xmm9,-0x78(%r9) | |
3381 | movaps %xmm10,-0x68(%r9) | |
3382 | movaps %xmm11,-0x58(%r9) | |
3383 | movaps %xmm12,-0x48(%r9) | |
3384 | movaps %xmm13,-0x38(%r9) | |
3385 | movaps %xmm14,-0x28(%r9) | |
3386 | movaps %xmm15,-0x18(%r9) | |
3387 | .L8xvl_body: | |
3388 | ___ | |
3389 | $code.=<<___; | |
3390 | vzeroupper | |
3391 | ||
3392 | lea .Lsigma(%rip),%r10 | |
3393 | vbroadcasti128 (%r10),$xa3 # key[0] | |
3394 | vbroadcasti128 ($key),$xb3 # key[1] | |
3395 | vbroadcasti128 16($key),$xc3 # key[2] | |
3396 | vbroadcasti128 ($counter),$xd3 # key[3] | |
3397 | ||
3398 | vpshufd \$0x00,$xa3,$xa0 # smash key by lanes... | |
3399 | vpshufd \$0x55,$xa3,$xa1 | |
3400 | vpshufd \$0xaa,$xa3,$xa2 | |
3401 | vpshufd \$0xff,$xa3,$xa3 | |
3402 | vmovdqa64 $xa0,@key[0] | |
3403 | vmovdqa64 $xa1,@key[1] | |
3404 | vmovdqa64 $xa2,@key[2] | |
3405 | vmovdqa64 $xa3,@key[3] | |
3406 | ||
3407 | vpshufd \$0x00,$xb3,$xb0 | |
3408 | vpshufd \$0x55,$xb3,$xb1 | |
3409 | vpshufd \$0xaa,$xb3,$xb2 | |
3410 | vpshufd \$0xff,$xb3,$xb3 | |
3411 | vmovdqa64 $xb0,@key[4] | |
3412 | vmovdqa64 $xb1,@key[5] | |
3413 | vmovdqa64 $xb2,@key[6] | |
3414 | vmovdqa64 $xb3,@key[7] | |
3415 | ||
3416 | vpshufd \$0x00,$xc3,$xc0 | |
3417 | vpshufd \$0x55,$xc3,$xc1 | |
3418 | vpshufd \$0xaa,$xc3,$xc2 | |
3419 | vpshufd \$0xff,$xc3,$xc3 | |
3420 | vmovdqa64 $xc0,@key[8] | |
3421 | vmovdqa64 $xc1,@key[9] | |
3422 | vmovdqa64 $xc2,@key[10] | |
3423 | vmovdqa64 $xc3,@key[11] | |
3424 | ||
3425 | vpshufd \$0x00,$xd3,$xd0 | |
3426 | vpshufd \$0x55,$xd3,$xd1 | |
3427 | vpshufd \$0xaa,$xd3,$xd2 | |
3428 | vpshufd \$0xff,$xd3,$xd3 | |
3429 | vpaddd .Lincy(%rip),$xd0,$xd0 # don't save counters yet | |
3430 | vmovdqa64 $xd0,@key[12] | |
3431 | vmovdqa64 $xd1,@key[13] | |
3432 | vmovdqa64 $xd2,@key[14] | |
3433 | vmovdqa64 $xd3,@key[15] | |
3434 | ||
3435 | mov \$10,%eax | |
3436 | jmp .Loop8xvl | |
3437 | ||
3438 | .align 32 | |
3439 | .Loop_outer8xvl: | |
3440 | #vpbroadcastd 0(%r10),$xa0 # reload key | |
3441 | #vpbroadcastd 4(%r10),$xa1 | |
3442 | vpbroadcastd 8(%r10),$xa2 | |
3443 | vpbroadcastd 12(%r10),$xa3 | |
3444 | vpaddd .Leight(%rip),@key[12],@key[12] # next SIMD counters | |
3445 | vmovdqa64 @key[4],$xb0 | |
3446 | vmovdqa64 @key[5],$xb1 | |
3447 | vmovdqa64 @key[6],$xb2 | |
3448 | vmovdqa64 @key[7],$xb3 | |
3449 | vmovdqa64 @key[8],$xc0 | |
3450 | vmovdqa64 @key[9],$xc1 | |
3451 | vmovdqa64 @key[10],$xc2 | |
3452 | vmovdqa64 @key[11],$xc3 | |
3453 | vmovdqa64 @key[12],$xd0 | |
3454 | vmovdqa64 @key[13],$xd1 | |
3455 | vmovdqa64 @key[14],$xd2 | |
3456 | vmovdqa64 @key[15],$xd3 | |
3457 | ||
3458 | vmovdqa64 $xa0,@key[0] | |
3459 | vmovdqa64 $xa1,@key[1] | |
3460 | vmovdqa64 $xa2,@key[2] | |
3461 | vmovdqa64 $xa3,@key[3] | |
3462 | ||
3463 | mov \$10,%eax | |
3464 | jmp .Loop8xvl | |
3465 | ||
3466 | .align 32 | |
3467 | .Loop8xvl: | |
3468 | ___ | |
3469 | foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; } | |
3470 | foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; } | |
3471 | $code.=<<___; | |
3472 | dec %eax | |
3473 | jnz .Loop8xvl | |
3474 | ||
3475 | vpaddd @key[0],$xa0,$xa0 # accumulate key | |
3476 | vpaddd @key[1],$xa1,$xa1 | |
3477 | vpaddd @key[2],$xa2,$xa2 | |
3478 | vpaddd @key[3],$xa3,$xa3 | |
3479 | ||
3480 | vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data | |
3481 | vpunpckldq $xa3,$xa2,$xt3 | |
3482 | vpunpckhdq $xa1,$xa0,$xa0 | |
3483 | vpunpckhdq $xa3,$xa2,$xa2 | |
3484 | vpunpcklqdq $xt3,$xt2,$xa1 # "a0" | |
3485 | vpunpckhqdq $xt3,$xt2,$xt2 # "a1" | |
3486 | vpunpcklqdq $xa2,$xa0,$xa3 # "a2" | |
3487 | vpunpckhqdq $xa2,$xa0,$xa0 # "a3" | |
3488 | ___ | |
3489 | ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); | |
3490 | $code.=<<___; | |
3491 | vpaddd @key[4],$xb0,$xb0 | |
3492 | vpaddd @key[5],$xb1,$xb1 | |
3493 | vpaddd @key[6],$xb2,$xb2 | |
3494 | vpaddd @key[7],$xb3,$xb3 | |
3495 | ||
3496 | vpunpckldq $xb1,$xb0,$xt2 | |
3497 | vpunpckldq $xb3,$xb2,$xt3 | |
3498 | vpunpckhdq $xb1,$xb0,$xb0 | |
3499 | vpunpckhdq $xb3,$xb2,$xb2 | |
3500 | vpunpcklqdq $xt3,$xt2,$xb1 # "b0" | |
3501 | vpunpckhqdq $xt3,$xt2,$xt2 # "b1" | |
3502 | vpunpcklqdq $xb2,$xb0,$xb3 # "b2" | |
3503 | vpunpckhqdq $xb2,$xb0,$xb0 # "b3" | |
3504 | ___ | |
3505 | ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); | |
3506 | $code.=<<___; | |
3507 | vshufi32x4 \$0,$xb0,$xa0,$xt3 # "de-interlace" further | |
3508 | vshufi32x4 \$3,$xb0,$xa0,$xb0 | |
3509 | vshufi32x4 \$0,$xb1,$xa1,$xa0 | |
3510 | vshufi32x4 \$3,$xb1,$xa1,$xb1 | |
3511 | vshufi32x4 \$0,$xb2,$xa2,$xa1 | |
3512 | vshufi32x4 \$3,$xb2,$xa2,$xb2 | |
3513 | vshufi32x4 \$0,$xb3,$xa3,$xa2 | |
3514 | vshufi32x4 \$3,$xb3,$xa3,$xb3 | |
3515 | ___ | |
3516 | ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3); | |
3517 | $code.=<<___; | |
3518 | vpaddd @key[8],$xc0,$xc0 | |
3519 | vpaddd @key[9],$xc1,$xc1 | |
3520 | vpaddd @key[10],$xc2,$xc2 | |
3521 | vpaddd @key[11],$xc3,$xc3 | |
3522 | ||
3523 | vpunpckldq $xc1,$xc0,$xt2 | |
3524 | vpunpckldq $xc3,$xc2,$xt3 | |
3525 | vpunpckhdq $xc1,$xc0,$xc0 | |
3526 | vpunpckhdq $xc3,$xc2,$xc2 | |
3527 | vpunpcklqdq $xt3,$xt2,$xc1 # "c0" | |
3528 | vpunpckhqdq $xt3,$xt2,$xt2 # "c1" | |
3529 | vpunpcklqdq $xc2,$xc0,$xc3 # "c2" | |
3530 | vpunpckhqdq $xc2,$xc0,$xc0 # "c3" | |
3531 | ___ | |
3532 | ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); | |
3533 | $code.=<<___; | |
3534 | vpaddd @key[12],$xd0,$xd0 | |
3535 | vpaddd @key[13],$xd1,$xd1 | |
3536 | vpaddd @key[14],$xd2,$xd2 | |
3537 | vpaddd @key[15],$xd3,$xd3 | |
3538 | ||
3539 | vpunpckldq $xd1,$xd0,$xt2 | |
3540 | vpunpckldq $xd3,$xd2,$xt3 | |
3541 | vpunpckhdq $xd1,$xd0,$xd0 | |
3542 | vpunpckhdq $xd3,$xd2,$xd2 | |
3543 | vpunpcklqdq $xt3,$xt2,$xd1 # "d0" | |
3544 | vpunpckhqdq $xt3,$xt2,$xt2 # "d1" | |
3545 | vpunpcklqdq $xd2,$xd0,$xd3 # "d2" | |
3546 | vpunpckhqdq $xd2,$xd0,$xd0 # "d3" | |
3547 | ___ | |
3548 | ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); | |
3549 | $code.=<<___; | |
3550 | vperm2i128 \$0x20,$xd0,$xc0,$xt3 # "de-interlace" further | |
3551 | vperm2i128 \$0x31,$xd0,$xc0,$xd0 | |
3552 | vperm2i128 \$0x20,$xd1,$xc1,$xc0 | |
3553 | vperm2i128 \$0x31,$xd1,$xc1,$xd1 | |
3554 | vperm2i128 \$0x20,$xd2,$xc2,$xc1 | |
3555 | vperm2i128 \$0x31,$xd2,$xc2,$xd2 | |
3556 | vperm2i128 \$0x20,$xd3,$xc3,$xc2 | |
3557 | vperm2i128 \$0x31,$xd3,$xc3,$xd3 | |
3558 | ___ | |
3559 | ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3); | |
3560 | ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)= | |
3561 | ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3); | |
3562 | $code.=<<___; | |
3563 | cmp \$64*8,$len | |
3564 | jb .Ltail8xvl | |
3565 | ||
3566 | mov \$0x80,%eax # size optimization | |
3567 | vpxord 0x00($inp),$xa0,$xa0 # xor with input | |
3568 | vpxor 0x20($inp),$xb0,$xb0 | |
3569 | vpxor 0x40($inp),$xc0,$xc0 | |
3570 | vpxor 0x60($inp),$xd0,$xd0 | |
3571 | lea ($inp,%rax),$inp # size optimization | |
3572 | vmovdqu32 $xa0,0x00($out) | |
3573 | vmovdqu $xb0,0x20($out) | |
3574 | vmovdqu $xc0,0x40($out) | |
3575 | vmovdqu $xd0,0x60($out) | |
3576 | lea ($out,%rax),$out # size optimization | |
3577 | ||
3578 | vpxor 0x00($inp),$xa1,$xa1 | |
3579 | vpxor 0x20($inp),$xb1,$xb1 | |
3580 | vpxor 0x40($inp),$xc1,$xc1 | |
3581 | vpxor 0x60($inp),$xd1,$xd1 | |
3582 | lea ($inp,%rax),$inp # size optimization | |
3583 | vmovdqu $xa1,0x00($out) | |
3584 | vmovdqu $xb1,0x20($out) | |
3585 | vmovdqu $xc1,0x40($out) | |
3586 | vmovdqu $xd1,0x60($out) | |
3587 | lea ($out,%rax),$out # size optimization | |
3588 | ||
3589 | vpxord 0x00($inp),$xa2,$xa2 | |
3590 | vpxor 0x20($inp),$xb2,$xb2 | |
3591 | vpxor 0x40($inp),$xc2,$xc2 | |
3592 | vpxor 0x60($inp),$xd2,$xd2 | |
3593 | lea ($inp,%rax),$inp # size optimization | |
3594 | vmovdqu32 $xa2,0x00($out) | |
3595 | vmovdqu $xb2,0x20($out) | |
3596 | vmovdqu $xc2,0x40($out) | |
3597 | vmovdqu $xd2,0x60($out) | |
3598 | lea ($out,%rax),$out # size optimization | |
3599 | ||
3600 | vpxor 0x00($inp),$xa3,$xa3 | |
3601 | vpxor 0x20($inp),$xb3,$xb3 | |
3602 | vpxor 0x40($inp),$xc3,$xc3 | |
3603 | vpxor 0x60($inp),$xd3,$xd3 | |
3604 | lea ($inp,%rax),$inp # size optimization | |
3605 | vmovdqu $xa3,0x00($out) | |
3606 | vmovdqu $xb3,0x20($out) | |
3607 | vmovdqu $xc3,0x40($out) | |
3608 | vmovdqu $xd3,0x60($out) | |
3609 | lea ($out,%rax),$out # size optimization | |
3610 | ||
3611 | vpbroadcastd 0(%r10),%ymm0 # reload key | |
3612 | vpbroadcastd 4(%r10),%ymm1 | |
3613 | ||
3614 | sub \$64*8,$len | |
3615 | jnz .Loop_outer8xvl | |
3616 | ||
3617 | jmp .Ldone8xvl | |
3618 | ||
3619 | .align 32 | |
3620 | .Ltail8xvl: | |
3621 | vmovdqa64 $xa0,%ymm8 # size optimization | |
3622 | ___ | |
3623 | $xa0 = "%ymm8"; | |
3624 | $code.=<<___; | |
3625 | xor %r10,%r10 | |
3626 | sub $inp,$out | |
3627 | cmp \$64*1,$len | |
3628 | jb .Less_than_64_8xvl | |
3629 | vpxor 0x00($inp),$xa0,$xa0 # xor with input | |
3630 | vpxor 0x20($inp),$xb0,$xb0 | |
3631 | vmovdqu $xa0,0x00($out,$inp) | |
3632 | vmovdqu $xb0,0x20($out,$inp) | |
3633 | je .Ldone8xvl | |
3634 | vmovdqa $xc0,$xa0 | |
3635 | vmovdqa $xd0,$xb0 | |
3636 | lea 64($inp),$inp | |
3637 | ||
3638 | cmp \$64*2,$len | |
3639 | jb .Less_than_64_8xvl | |
3640 | vpxor 0x00($inp),$xc0,$xc0 | |
3641 | vpxor 0x20($inp),$xd0,$xd0 | |
3642 | vmovdqu $xc0,0x00($out,$inp) | |
3643 | vmovdqu $xd0,0x20($out,$inp) | |
3644 | je .Ldone8xvl | |
3645 | vmovdqa $xa1,$xa0 | |
3646 | vmovdqa $xb1,$xb0 | |
3647 | lea 64($inp),$inp | |
3648 | ||
3649 | cmp \$64*3,$len | |
3650 | jb .Less_than_64_8xvl | |
3651 | vpxor 0x00($inp),$xa1,$xa1 | |
3652 | vpxor 0x20($inp),$xb1,$xb1 | |
3653 | vmovdqu $xa1,0x00($out,$inp) | |
3654 | vmovdqu $xb1,0x20($out,$inp) | |
3655 | je .Ldone8xvl | |
3656 | vmovdqa $xc1,$xa0 | |
3657 | vmovdqa $xd1,$xb0 | |
3658 | lea 64($inp),$inp | |
3659 | ||
3660 | cmp \$64*4,$len | |
3661 | jb .Less_than_64_8xvl | |
3662 | vpxor 0x00($inp),$xc1,$xc1 | |
3663 | vpxor 0x20($inp),$xd1,$xd1 | |
3664 | vmovdqu $xc1,0x00($out,$inp) | |
3665 | vmovdqu $xd1,0x20($out,$inp) | |
3666 | je .Ldone8xvl | |
3667 | vmovdqa32 $xa2,$xa0 | |
3668 | vmovdqa $xb2,$xb0 | |
3669 | lea 64($inp),$inp | |
3670 | ||
3671 | cmp \$64*5,$len | |
3672 | jb .Less_than_64_8xvl | |
3673 | vpxord 0x00($inp),$xa2,$xa2 | |
3674 | vpxor 0x20($inp),$xb2,$xb2 | |
3675 | vmovdqu32 $xa2,0x00($out,$inp) | |
3676 | vmovdqu $xb2,0x20($out,$inp) | |
3677 | je .Ldone8xvl | |
3678 | vmovdqa $xc2,$xa0 | |
3679 | vmovdqa $xd2,$xb0 | |
3680 | lea 64($inp),$inp | |
3681 | ||
3682 | cmp \$64*6,$len | |
3683 | jb .Less_than_64_8xvl | |
3684 | vpxor 0x00($inp),$xc2,$xc2 | |
3685 | vpxor 0x20($inp),$xd2,$xd2 | |
3686 | vmovdqu $xc2,0x00($out,$inp) | |
3687 | vmovdqu $xd2,0x20($out,$inp) | |
3688 | je .Ldone8xvl | |
3689 | vmovdqa $xa3,$xa0 | |
3690 | vmovdqa $xb3,$xb0 | |
3691 | lea 64($inp),$inp | |
3692 | ||
3693 | cmp \$64*7,$len | |
3694 | jb .Less_than_64_8xvl | |
3695 | vpxor 0x00($inp),$xa3,$xa3 | |
3696 | vpxor 0x20($inp),$xb3,$xb3 | |
3697 | vmovdqu $xa3,0x00($out,$inp) | |
3698 | vmovdqu $xb3,0x20($out,$inp) | |
3699 | je .Ldone8xvl | |
3700 | vmovdqa $xc3,$xa0 | |
3701 | vmovdqa $xd3,$xb0 | |
3702 | lea 64($inp),$inp | |
3703 | ||
3704 | .Less_than_64_8xvl: | |
3705 | vmovdqa $xa0,0x00(%rsp) | |
3706 | vmovdqa $xb0,0x20(%rsp) | |
3707 | lea ($out,$inp),$out | |
3708 | and \$63,$len | |
3709 | ||
3710 | .Loop_tail8xvl: | |
3711 | movzb ($inp,%r10),%eax | |
3712 | movzb (%rsp,%r10),%ecx | |
3713 | lea 1(%r10),%r10 | |
3714 | xor %ecx,%eax | |
3715 | mov %al,-1($out,%r10) | |
3716 | dec $len | |
3717 | jnz .Loop_tail8xvl | |
3718 | ||
3719 | vpxor $xa0,$xa0,$xa0 | |
3720 | vmovdqa $xa0,0x00(%rsp) | |
3721 | vmovdqa $xa0,0x20(%rsp) | |
3722 | ||
3723 | .Ldone8xvl: | |
3724 | vzeroall | |
3725 | ___ | |
3726 | $code.=<<___ if ($win64); | |
3727 | movaps -0xa8(%r9),%xmm6 | |
3728 | movaps -0x98(%r9),%xmm7 | |
3729 | movaps -0x88(%r9),%xmm8 | |
3730 | movaps -0x78(%r9),%xmm9 | |
3731 | movaps -0x68(%r9),%xmm10 | |
3732 | movaps -0x58(%r9),%xmm11 | |
3733 | movaps -0x48(%r9),%xmm12 | |
3734 | movaps -0x38(%r9),%xmm13 | |
3735 | movaps -0x28(%r9),%xmm14 | |
3736 | movaps -0x18(%r9),%xmm15 | |
3737 | ___ | |
3738 | $code.=<<___; | |
3739 | lea (%r9),%rsp | |
3740 | .cfi_def_cfa_register %rsp | |
3741 | .L8xvl_epilogue: | |
3742 | ret | |
3743 | .cfi_endproc | |
3744 | .size ChaCha20_8xvl,.-ChaCha20_8xvl | |
3745 | ___ | |
abb8c44f AP |
3746 | } |
3747 | ||
384e6de4 AP |
3748 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, |
3749 | # CONTEXT *context,DISPATCHER_CONTEXT *disp) | |
3750 | if ($win64) { | |
3751 | $rec="%rcx"; | |
3752 | $frame="%rdx"; | |
3753 | $context="%r8"; | |
3754 | $disp="%r9"; | |
3755 | ||
3756 | $code.=<<___; | |
3757 | .extern __imp_RtlVirtualUnwind | |
3758 | .type se_handler,\@abi-omnipotent | |
3759 | .align 16 | |
3760 | se_handler: | |
3761 | push %rsi | |
3762 | push %rdi | |
3763 | push %rbx | |
3764 | push %rbp | |
3765 | push %r12 | |
3766 | push %r13 | |
3767 | push %r14 | |
3768 | push %r15 | |
3769 | pushfq | |
3770 | sub \$64,%rsp | |
3771 | ||
3772 | mov 120($context),%rax # pull context->Rax | |
3773 | mov 248($context),%rbx # pull context->Rip | |
3774 | ||
3775 | mov 8($disp),%rsi # disp->ImageBase | |
3776 | mov 56($disp),%r11 # disp->HandlerData | |
3777 | ||
3778 | lea .Lctr32_body(%rip),%r10 | |
3779 | cmp %r10,%rbx # context->Rip<.Lprologue | |
3780 | jb .Lcommon_seh_tail | |
3781 | ||
3782 | mov 152($context),%rax # pull context->Rsp | |
3783 | ||
3784 | lea .Lno_data(%rip),%r10 # epilogue label | |
3785 | cmp %r10,%rbx # context->Rip>=.Lepilogue | |
3786 | jae .Lcommon_seh_tail | |
3787 | ||
3788 | lea 64+24+48(%rax),%rax | |
3789 | ||
3790 | mov -8(%rax),%rbx | |
3791 | mov -16(%rax),%rbp | |
3792 | mov -24(%rax),%r12 | |
3793 | mov -32(%rax),%r13 | |
3794 | mov -40(%rax),%r14 | |
3795 | mov -48(%rax),%r15 | |
3796 | mov %rbx,144($context) # restore context->Rbx | |
3797 | mov %rbp,160($context) # restore context->Rbp | |
3798 | mov %r12,216($context) # restore context->R12 | |
3799 | mov %r13,224($context) # restore context->R13 | |
3800 | mov %r14,232($context) # restore context->R14 | |
3801 | mov %r15,240($context) # restore context->R14 | |
3802 | ||
3803 | .Lcommon_seh_tail: | |
3804 | mov 8(%rax),%rdi | |
3805 | mov 16(%rax),%rsi | |
3806 | mov %rax,152($context) # restore context->Rsp | |
3807 | mov %rsi,168($context) # restore context->Rsi | |
3808 | mov %rdi,176($context) # restore context->Rdi | |
3809 | ||
3810 | mov 40($disp),%rdi # disp->ContextRecord | |
3811 | mov $context,%rsi # context | |
3812 | mov \$154,%ecx # sizeof(CONTEXT) | |
3813 | .long 0xa548f3fc # cld; rep movsq | |
3814 | ||
3815 | mov $disp,%rsi | |
3816 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | |
3817 | mov 8(%rsi),%rdx # arg2, disp->ImageBase | |
3818 | mov 0(%rsi),%r8 # arg3, disp->ControlPc | |
3819 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | |
3820 | mov 40(%rsi),%r10 # disp->ContextRecord | |
3821 | lea 56(%rsi),%r11 # &disp->HandlerData | |
3822 | lea 24(%rsi),%r12 # &disp->EstablisherFrame | |
3823 | mov %r10,32(%rsp) # arg5 | |
3824 | mov %r11,40(%rsp) # arg6 | |
3825 | mov %r12,48(%rsp) # arg7 | |
3826 | mov %rcx,56(%rsp) # arg8, (NULL) | |
3827 | call *__imp_RtlVirtualUnwind(%rip) | |
3828 | ||
3829 | mov \$1,%eax # ExceptionContinueSearch | |
3830 | add \$64,%rsp | |
3831 | popfq | |
3832 | pop %r15 | |
3833 | pop %r14 | |
3834 | pop %r13 | |
3835 | pop %r12 | |
3836 | pop %rbp | |
3837 | pop %rbx | |
3838 | pop %rdi | |
3839 | pop %rsi | |
3840 | ret | |
3841 | .size se_handler,.-se_handler | |
3842 | ||
d5487a45 | 3843 | .type simd_handler,\@abi-omnipotent |
384e6de4 | 3844 | .align 16 |
d5487a45 | 3845 | simd_handler: |
384e6de4 AP |
3846 | push %rsi |
3847 | push %rdi | |
3848 | push %rbx | |
3849 | push %rbp | |
3850 | push %r12 | |
3851 | push %r13 | |
3852 | push %r14 | |
3853 | push %r15 | |
3854 | pushfq | |
3855 | sub \$64,%rsp | |
3856 | ||
3857 | mov 120($context),%rax # pull context->Rax | |
3858 | mov 248($context),%rbx # pull context->Rip | |
3859 | ||
3860 | mov 8($disp),%rsi # disp->ImageBase | |
3861 | mov 56($disp),%r11 # disp->HandlerData | |
3862 | ||
3863 | mov 0(%r11),%r10d # HandlerData[0] | |
3864 | lea (%rsi,%r10),%r10 # prologue label | |
3865 | cmp %r10,%rbx # context->Rip<prologue label | |
3866 | jb .Lcommon_seh_tail | |
3867 | ||
3868 | mov 192($context),%rax # pull context->R9 | |
3869 | ||
3870 | mov 4(%r11),%r10d # HandlerData[1] | |
d5487a45 | 3871 | mov 8(%r11),%ecx # HandlerData[2] |
384e6de4 AP |
3872 | lea (%rsi,%r10),%r10 # epilogue label |
3873 | cmp %r10,%rbx # context->Rip>=epilogue label | |
3874 | jae .Lcommon_seh_tail | |
3875 | ||
d5487a45 AP |
3876 | neg %rcx |
3877 | lea -8(%rax,%rcx),%rsi | |
384e6de4 | 3878 | lea 512($context),%rdi # &context.Xmm6 |
d5487a45 AP |
3879 | neg %ecx |
3880 | shr \$3,%ecx | |
384e6de4 AP |
3881 | .long 0xa548f3fc # cld; rep movsq |
3882 | ||
3883 | jmp .Lcommon_seh_tail | |
d5487a45 | 3884 | .size simd_handler,.-simd_handler |
384e6de4 AP |
3885 | |
3886 | .section .pdata | |
3887 | .align 4 | |
3888 | .rva .LSEH_begin_ChaCha20_ctr32 | |
3889 | .rva .LSEH_end_ChaCha20_ctr32 | |
3890 | .rva .LSEH_info_ChaCha20_ctr32 | |
3891 | ||
3892 | .rva .LSEH_begin_ChaCha20_ssse3 | |
3893 | .rva .LSEH_end_ChaCha20_ssse3 | |
3894 | .rva .LSEH_info_ChaCha20_ssse3 | |
3895 | ||
d5487a45 AP |
3896 | .rva .LSEH_begin_ChaCha20_128 |
3897 | .rva .LSEH_end_ChaCha20_128 | |
3898 | .rva .LSEH_info_ChaCha20_128 | |
3899 | ||
384e6de4 AP |
3900 | .rva .LSEH_begin_ChaCha20_4x |
3901 | .rva .LSEH_end_ChaCha20_4x | |
3902 | .rva .LSEH_info_ChaCha20_4x | |
3903 | ___ | |
3904 | $code.=<<___ if ($avx); | |
3905 | .rva .LSEH_begin_ChaCha20_4xop | |
3906 | .rva .LSEH_end_ChaCha20_4xop | |
3907 | .rva .LSEH_info_ChaCha20_4xop | |
3908 | ___ | |
3909 | $code.=<<___ if ($avx>1); | |
3910 | .rva .LSEH_begin_ChaCha20_8x | |
3911 | .rva .LSEH_end_ChaCha20_8x | |
3912 | .rva .LSEH_info_ChaCha20_8x | |
3913 | ___ | |
3914 | $code.=<<___ if ($avx>2); | |
3915 | .rva .LSEH_begin_ChaCha20_avx512 | |
3916 | .rva .LSEH_end_ChaCha20_avx512 | |
3917 | .rva .LSEH_info_ChaCha20_avx512 | |
3918 | ||
cded9513 AP |
3919 | .rva .LSEH_begin_ChaCha20_avx512vl |
3920 | .rva .LSEH_end_ChaCha20_avx512vl | |
3921 | .rva .LSEH_info_ChaCha20_avx512vl | |
3922 | ||
384e6de4 AP |
3923 | .rva .LSEH_begin_ChaCha20_16x |
3924 | .rva .LSEH_end_ChaCha20_16x | |
3925 | .rva .LSEH_info_ChaCha20_16x | |
cded9513 AP |
3926 | |
3927 | .rva .LSEH_begin_ChaCha20_8xvl | |
3928 | .rva .LSEH_end_ChaCha20_8xvl | |
3929 | .rva .LSEH_info_ChaCha20_8xvl | |
384e6de4 AP |
3930 | ___ |
3931 | $code.=<<___; | |
3932 | .section .xdata | |
3933 | .align 8 | |
3934 | .LSEH_info_ChaCha20_ctr32: | |
3935 | .byte 9,0,0,0 | |
3936 | .rva se_handler | |
3937 | ||
3938 | .LSEH_info_ChaCha20_ssse3: | |
3939 | .byte 9,0,0,0 | |
d5487a45 | 3940 | .rva simd_handler |
384e6de4 | 3941 | .rva .Lssse3_body,.Lssse3_epilogue |
d5487a45 AP |
3942 | .long 0x20,0 |
3943 | ||
3944 | .LSEH_info_ChaCha20_128: | |
3945 | .byte 9,0,0,0 | |
3946 | .rva simd_handler | |
3947 | .rva .L128_body,.L128_epilogue | |
3948 | .long 0x60,0 | |
384e6de4 AP |
3949 | |
3950 | .LSEH_info_ChaCha20_4x: | |
3951 | .byte 9,0,0,0 | |
d5487a45 | 3952 | .rva simd_handler |
384e6de4 | 3953 | .rva .L4x_body,.L4x_epilogue |
d5487a45 | 3954 | .long 0xa0,0 |
384e6de4 AP |
3955 | ___ |
3956 | $code.=<<___ if ($avx); | |
3957 | .LSEH_info_ChaCha20_4xop: | |
3958 | .byte 9,0,0,0 | |
d5487a45 | 3959 | .rva simd_handler |
384e6de4 | 3960 | .rva .L4xop_body,.L4xop_epilogue # HandlerData[] |
d5487a45 | 3961 | .long 0xa0,0 |
384e6de4 AP |
3962 | ___ |
3963 | $code.=<<___ if ($avx>1); | |
3964 | .LSEH_info_ChaCha20_8x: | |
3965 | .byte 9,0,0,0 | |
d5487a45 | 3966 | .rva simd_handler |
384e6de4 | 3967 | .rva .L8x_body,.L8x_epilogue # HandlerData[] |
d5487a45 | 3968 | .long 0xa0,0 |
384e6de4 AP |
3969 | ___ |
3970 | $code.=<<___ if ($avx>2); | |
3971 | .LSEH_info_ChaCha20_avx512: | |
3972 | .byte 9,0,0,0 | |
d5487a45 | 3973 | .rva simd_handler |
384e6de4 | 3974 | .rva .Lavx512_body,.Lavx512_epilogue # HandlerData[] |
d5487a45 | 3975 | .long 0x20,0 |
384e6de4 | 3976 | |
cded9513 AP |
3977 | .LSEH_info_ChaCha20_avx512vl: |
3978 | .byte 9,0,0,0 | |
d5487a45 | 3979 | .rva simd_handler |
cded9513 | 3980 | .rva .Lavx512vl_body,.Lavx512vl_epilogue # HandlerData[] |
d5487a45 | 3981 | .long 0x20,0 |
cded9513 | 3982 | |
384e6de4 AP |
3983 | .LSEH_info_ChaCha20_16x: |
3984 | .byte 9,0,0,0 | |
d5487a45 | 3985 | .rva simd_handler |
384e6de4 | 3986 | .rva .L16x_body,.L16x_epilogue # HandlerData[] |
d5487a45 | 3987 | .long 0xa0,0 |
cded9513 AP |
3988 | |
3989 | .LSEH_info_ChaCha20_8xvl: | |
3990 | .byte 9,0,0,0 | |
d5487a45 | 3991 | .rva simd_handler |
cded9513 | 3992 | .rva .L8xvl_body,.L8xvl_epilogue # HandlerData[] |
d5487a45 | 3993 | .long 0xa0,0 |
384e6de4 AP |
3994 | ___ |
3995 | } | |
3996 | ||
a98c648e | 3997 | foreach (split("\n",$code)) { |
3c274a6e | 3998 | s/\`([^\`]*)\`/eval $1/ge; |
a98c648e | 3999 | |
3c274a6e | 4000 | s/%x#%[yz]/%x/g; # "down-shift" |
a98c648e AP |
4001 | |
4002 | print $_,"\n"; | |
4003 | } | |
4004 | ||
4005 | close STDOUT; |