]>
Commit | Line | Data |
---|---|---|
6aa36e8e | 1 | #! /usr/bin/env perl |
33388b44 | 2 | # Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved. |
6aa36e8e | 3 | # |
03d770d9 | 4 | # Licensed under the Apache License 2.0 (the "License"). You may not use |
6aa36e8e RS |
5 | # this file except in compliance with the License. You can obtain a copy |
6 | # in the file LICENSE in the source distribution or at | |
7 | # https://www.openssl.org/source/license.html | |
8 | ||
a98c648e AP |
9 | # |
10 | # ==================================================================== | |
11 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | |
12 | # project. The module is, however, dual licensed under OpenSSL and | |
13 | # CRYPTOGAMS licenses depending on where you obtain it. For further | |
14 | # details see http://www.openssl.org/~appro/cryptogams/. | |
15 | # ==================================================================== | |
16 | # | |
17 | # November 2014 | |
18 | # | |
19 | # ChaCha20 for x86_64. | |
20 | # | |
abb8c44f AP |
21 | # December 2016 |
22 | # | |
23 | # Add AVX512F code path. | |
24 | # | |
cded9513 AP |
25 | # December 2017 |
26 | # | |
27 | # Add AVX512VL code path. | |
28 | # | |
a98c648e AP |
29 | # Performance in cycles per byte out of large buffer. |
30 | # | |
d5487a45 | 31 | # IALU/gcc 4.8(i) 1x/2xSSSE3(ii) 4xSSSE3 NxAVX(v) |
a98c648e | 32 | # |
d5487a45 AP |
33 | # P4 9.48/+99% - - |
34 | # Core2 7.83/+55% 7.90/5.76 4.35 | |
35 | # Westmere 7.19/+50% 5.60/4.50 3.00 | |
36 | # Sandy Bridge 8.31/+42% 5.45/4.00 2.72 | |
37 | # Ivy Bridge 6.71/+46% 5.40/? 2.41 | |
38 | # Haswell 5.92/+43% 5.20/3.45 2.42 1.23 | |
39 | # Skylake[-X] 5.87/+39% 4.70/3.22 2.31 1.19[0.80(vi)] | |
40 | # Silvermont 12.0/+33% 7.75/6.90 7.03(iii) | |
41 | # Knights L 11.7/- ? 9.60(iii) 0.80 | |
42 | # Goldmont 10.6/+17% 5.10/3.52 3.28 | |
43 | # Sledgehammer 7.28/+52% - - | |
44 | # Bulldozer 9.66/+28% 9.85/5.35(iv) 3.06(iv) | |
45 | # Ryzen 5.96/+50% 5.19/3.00 2.40 2.09 | |
46 | # VIA Nano 10.5/+46% 6.72/6.88 6.05 | |
a98c648e AP |
47 | # |
48 | # (i) compared to older gcc 3.x one can observe >2x improvement on | |
49 | # most platforms; | |
d5487a45 AP |
50 | # (ii) 2xSSSE3 is code path optimized specifically for 128 bytes used |
51 | # by chacha20_poly1305_tls_cipher, results are EVP-free; | |
a98c648e AP |
52 | # (iii) this is not optimal result for Atom because of MSROM |
53 | # limitations, SSE2 can do better, but gain is considered too | |
54 | # low to justify the [maintenance] effort; | |
d5487a45 AP |
55 | # (iv) Bulldozer actually executes 4xXOP code path that delivers 2.20 |
56 | # and 4.85 for 128-byte inputs; | |
cded9513 AP |
57 | # (v) 8xAVX2, 8xAVX512VL or 16xAVX512F, whichever best applicable; |
58 | # (vi) even though Skylake-X can execute AVX512F code and deliver 0.57 | |
59 | # cpb in single thread, the corresponding capability is suppressed; | |
a98c648e | 60 | |
1aa89a7a RL |
# Command-line handling and toolchain probing.
#
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Win64 output is selected either by the flavour name or by an .asm
# output file name.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64-xlate.pl translator: first next to this script,
# then in ../../perlasm/ relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# $avx is a 0..3 level derived from the assembler/compiler version.
# Code further down emits the XOP dispatch when $avx is non-zero, the
# AVX2 dispatch when $avx>1 and the AVX512 dispatch when $avx>2.
# Each probe below only runs if the previous ones left $avx unset.

# GNU assembler, invoked through $CC.
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
}

# NASM (Win64 only); 2.11.8 and later 2.11.x releases get one extra level.
if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
	$avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
	$avx += 1 if ($1==2.11 && $2>=8);
}

# Microsoft ml64 (Win64 only); tops out at level 2.
if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

# clang / LLVM-based compilers; tops out at level 2.
if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

# Everything printed to STDOUT from here on is piped through the
# translator, which is run with the same perl binary as this script.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
96 | ||
# input parameter block
# Registers holding the five arguments of the generated functions, in
# order: output pointer, input pointer, byte length, key pointer and
# counter/nonce pointer.
($out,$inp,$len,$key,$counter)=("%rdi","%rsi","%rdx","%rcx","%r8");

# Read-only constant pool shared by all code paths: counter increments
# for the 1x/4x/8x/16x paths (.Lone, .Linc/.Lfour, .Lincy/.Leight/.Ltwoy,
# .Lzeroz/.Lfourz/.Lincz/.Lsixteen), the pshufb masks used to rotate
# 32-bit words by 16 and 24 bits (.Lrot16/.Lrot24) and the ChaCha
# "expand 32-byte k" constant (.Lsigma).
$code.=<<___;
.text

.extern OPENSSL_ia32cap_P

.align	64
.Lzero:
.long	0,0,0,0
.Lone:
.long	1,0,0,0
.Linc:
.long	0,1,2,3
.Lfour:
.long	4,4,4,4
.Lincy:
.long	0,2,4,6,1,3,5,7
.Leight:
.long	8,8,8,8,8,8,8,8
.Lrot16:
.byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
.Lrot24:
.byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
.Ltwoy:
.long	2,0,0,0, 2,0,0,0
.align	64
.Lzeroz:
.long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
.Lfourz:
.long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
.Lincz:
.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
.Lsixteen:
.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
.Lsigma:
.asciz	"expand 32-byte k"
.asciz	"ChaCha20 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___
137 | ||
# Catch-all for calls such as &paddd($dst,$src): any call to an undefined
# sub is turned into one line of assembly appended to $code.
#
# The sub name becomes the mnemonic.  Arguments are given destination
# first (32-bit perlasm style) and are emitted in reverse order, so the
# last argument comes out first.  If that last argument is a plain number
# it is prefixed with '$' to form an immediate.
sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{
    my $opcode = $AUTOLOAD;
    $opcode =~ s/.*:://;		# strip the package qualifier

    my $arg = pop;
    # numeric round-trip test: true only for plain numbers
    $arg = "\$$arg" if ($arg*1 eq $arg);

    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
144 | ||
# Integer-path register allocation for the 16 ChaCha state words.
# Words 8..11 (the 'c' row) have no permanent register ("%nox"); two of
# them at a time are kept in @t, the rest on the stack.
@x=("%eax","%ebx","%ecx","%edx",map("%r${_}d",(8..11)),
    "%nox","%nox","%nox","%nox",map("%r${_}d",(12..15)));
@t=("%esi","%edi");

# ROUND(a0,b0,c0,d0) returns, as a list of perlasm statements to be
# eval-ed by the caller, one full ChaCha round (four quarter-rounds).
# The arguments are the state-word indices of the first quarter-round;
# the other three are derived by stepping each index within its row.
# Intended to be called in list context.
sub ROUND {			# critical path is 24 cycles per round
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my ($xc,$xc_)=map("\"$_\"",@t);
my @x=map("\"$_\"",@x);

	# Consider order in which variables are addressed by their
	# index:
	#
	#	a   b   c   d
	#
	#	0   4   8  12 < even round
	#	1   5   9  13
	#	2   6  10  14
	#	3   7  11  15
	#	0   5  10  15 < odd round
	#	1   6  11  12
	#	2   7   8  13
	#	3   4   9  14
	#
	# 'a', 'b' and 'd's are permanently allocated in registers,
	# @x[0..7,12..15], while 'c's are maintained in memory. If
	# you observe 'c' column, you'll notice that pair of 'c's is
	# invariant between rounds. This means that we have to reload
	# them once per round, in the middle. This is why you'll see
	# bunch of 'c' stores and loads in the middle, but none in
	# the beginning or end.

	# Normally instructions would be interleaved to favour in-order
	# execution. Generally out-of-order cores manage it gracefully,
	# but not this time for some reason. As in-order execution
	# cores are dying breed, old Atom is the only one around,
	# instructions are left uninterleaved. Besides, Atom is better
	# off executing 1xSSSE3 code anyway...

	# Two quarter-rounds back to back, the first using $xc and the
	# second $xc_ as its 'c' word.  Arguments are the a/b/d indices
	# of each.  Rotation amounts are 16,12 in the first pass and 8,7
	# in the second.
	my $qr_pair = sub {
	    my ($ai,$bi,$di, $aj,$bj,$dj)=@_;
	    my @ops;
	    foreach my $rot ([16,12],[8,7]) {
		my ($rd,$rb)=@$rot;
		push @ops,
		    "&add	($x[$ai],$x[$bi])",
		    "&xor	($x[$di],$x[$ai])",
		    "&rol	($x[$di],$rd)",
		    "&add	($x[$aj],$x[$bj])",
		    "&xor	($x[$dj],$x[$aj])",
		    "&rol	($x[$dj],$rd)",

		    "&add	($xc,$x[$di])",
		    "&xor	($x[$bi],$xc)",
		    "&rol	($x[$bi],$rb)",
		    "&add	($xc_,$x[$dj])",
		    "&xor	($x[$bj],$xc_)",
		    "&rol	($x[$bj],$rb)";
	    }
	    return @ops;
	};

	(
	$qr_pair->($a0,$b0,$d0, $a1,$b1,$d1),	# Q1, Q2

	"&mov	(\"4*$c0(%rsp)\",$xc)",		# reload pair of 'c's
	"&mov	(\"4*$c1(%rsp)\",$xc_)",
	"&mov	($xc,\"4*$c2(%rsp)\")",
	"&mov	($xc_,\"4*$c3(%rsp)\")",

	$qr_pair->($a2,$b2,$d2, $a3,$b3,$d3)	# Q3, Q4
	);
}
249 | ||
########################################################################
# Generic code path that handles all lengths on pre-SSSE3 processors.
#
# Emits the ChaCha20_ctr32 entry point. It returns immediately for a
# zero length, branches to the AVX512/AVX512VL paths (only when $avx>2)
# or to the SSSE3 path according to OPENSSL_ia32cap_P, and otherwise
# runs the integer implementation: 10 iterations of an even plus an odd
# ROUND per 64-byte block, with a byte-wise tail loop for the final
# partial block.
$code.=<<___;
.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,\@function,5
.align	64
ChaCha20_ctr32:
.cfi_startproc
	cmp	\$0,$len
	je	.Lno_data
	mov	OPENSSL_ia32cap_P+4(%rip),%r10
___
$code.=<<___	if ($avx>2);
	bt	\$48,%r10		# check for AVX512F
	jc	.LChaCha20_avx512
	test	%r10,%r10		# check for AVX512VL
	js	.LChaCha20_avx512vl
___
$code.=<<___;
	test	\$`1<<(41-32)`,%r10d
	jnz	.LChaCha20_ssse3

	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	sub	\$64+24,%rsp
.cfi_adjust_cfa_offset	64+24
.Lctr32_body:

	#movdqa	.Lsigma(%rip),%xmm0
	movdqu	($key),%xmm1
	movdqu	16($key),%xmm2
	movdqu	($counter),%xmm3
	movdqa	.Lone(%rip),%xmm4

	#movdqa	%xmm0,4*0(%rsp)		# key[0]
	movdqa	%xmm1,4*4(%rsp)		# key[1]
	movdqa	%xmm2,4*8(%rsp)		# key[2]
	movdqa	%xmm3,4*12(%rsp)	# key[3]
	mov	$len,%rbp		# reassign $len
	jmp	.Loop_outer

.align	32
.Loop_outer:
	mov	\$0x61707865,@x[0]      # 'expa'
	mov	\$0x3320646e,@x[1]      # 'nd 3'
	mov	\$0x79622d32,@x[2]      # '2-by'
	mov	\$0x6b206574,@x[3]      # 'te k'
	mov	4*4(%rsp),@x[4]
	mov	4*5(%rsp),@x[5]
	mov	4*6(%rsp),@x[6]
	mov	4*7(%rsp),@x[7]
	movd	%xmm3,@x[12]
	mov	4*13(%rsp),@x[13]
	mov	4*14(%rsp),@x[14]
	mov	4*15(%rsp),@x[15]

	mov	%rbp,64+0(%rsp)		# save len
	mov	\$10,%ebp
	mov	$inp,64+8(%rsp)		# save inp
	movq	%xmm2,%rsi		# "@x[8]"
	mov	$out,64+16(%rsp)	# save out
	mov	%rsi,%rdi
	shr	\$32,%rdi		# "@x[9]"
	jmp	.Loop

.align	32
.Loop:
___
	# one loop iteration = even round followed by odd round
	foreach (&ROUND (0, 4, 8,12)) { eval; }
	foreach (&ROUND (0, 5,10,15)) { eval; }
	&dec	("%ebp");
	&jnz	(".Loop");

$code.=<<___;
	mov	@t[1],4*9(%rsp)		# modulo-scheduled
	mov	@t[0],4*8(%rsp)
	mov	64(%rsp),%rbp		# load len
	movdqa	%xmm2,%xmm1
	mov	64+8(%rsp),$inp		# load inp
	paddd	%xmm4,%xmm3		# increment counter
	mov	64+16(%rsp),$out	# load out

	add	\$0x61707865,@x[0]      # 'expa'
	add	\$0x3320646e,@x[1]      # 'nd 3'
	add	\$0x79622d32,@x[2]      # '2-by'
	add	\$0x6b206574,@x[3]      # 'te k'
	add	4*4(%rsp),@x[4]
	add	4*5(%rsp),@x[5]
	add	4*6(%rsp),@x[6]
	add	4*7(%rsp),@x[7]
	add	4*12(%rsp),@x[12]
	add	4*13(%rsp),@x[13]
	add	4*14(%rsp),@x[14]
	add	4*15(%rsp),@x[15]
	paddd	4*8(%rsp),%xmm1

	cmp	\$64,%rbp
	jb	.Ltail

	xor	4*0($inp),@x[0]		# xor with input
	xor	4*1($inp),@x[1]
	xor	4*2($inp),@x[2]
	xor	4*3($inp),@x[3]
	xor	4*4($inp),@x[4]
	xor	4*5($inp),@x[5]
	xor	4*6($inp),@x[6]
	xor	4*7($inp),@x[7]
	movdqu	4*8($inp),%xmm0
	xor	4*12($inp),@x[12]
	xor	4*13($inp),@x[13]
	xor	4*14($inp),@x[14]
	xor	4*15($inp),@x[15]
	lea	4*16($inp),$inp		# inp+=64
	pxor	%xmm1,%xmm0

	movdqa	%xmm2,4*8(%rsp)
	movd	%xmm3,4*12(%rsp)

	mov	@x[0],4*0($out)		# write output
	mov	@x[1],4*1($out)
	mov	@x[2],4*2($out)
	mov	@x[3],4*3($out)
	mov	@x[4],4*4($out)
	mov	@x[5],4*5($out)
	mov	@x[6],4*6($out)
	mov	@x[7],4*7($out)
	movdqu	%xmm0,4*8($out)
	mov	@x[12],4*12($out)
	mov	@x[13],4*13($out)
	mov	@x[14],4*14($out)
	mov	@x[15],4*15($out)
	lea	4*16($out),$out		# out+=64

	sub	\$64,%rbp
	jnz	.Loop_outer

	jmp	.Ldone

.align	16
.Ltail:
	mov	@x[0],4*0(%rsp)
	mov	@x[1],4*1(%rsp)
	xor	%rbx,%rbx
	mov	@x[2],4*2(%rsp)
	mov	@x[3],4*3(%rsp)
	mov	@x[4],4*4(%rsp)
	mov	@x[5],4*5(%rsp)
	mov	@x[6],4*6(%rsp)
	mov	@x[7],4*7(%rsp)
	movdqa	%xmm1,4*8(%rsp)
	mov	@x[12],4*12(%rsp)
	mov	@x[13],4*13(%rsp)
	mov	@x[14],4*14(%rsp)
	mov	@x[15],4*15(%rsp)

.Loop_tail:
	movzb	($inp,%rbx),%eax
	movzb	(%rsp,%rbx),%edx
	lea	1(%rbx),%rbx
	xor	%edx,%eax
	mov	%al,-1($out,%rbx)
	dec	%rbp
	jnz	.Loop_tail

.Ldone:
	lea	64+24+48(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lno_data:
	ret
.cfi_endproc
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
___
446 | ||
########################################################################
# SSSE3 code path that handles shorter lengths
#
# One 64-byte block at a time, with the four state rows held in
# $a,$b,$c,$d. On entry the generated code redirects to the XOP path
# (only when $avx is set), to the 128-byte path for exactly 128 bytes,
# or to the 4x path for anything longer.
{
my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(0..7));

# Emits one ChaCha round on whole rows: rotations by 16 and 24 bits use
# pshufb with the $rot16/$rot24 masks, rotations by 12 and 7 bits use a
# shift/shift/or sequence through $t.
sub SSSE3ROUND {	# critical path is 20 "SIMD ticks" per round
	&paddd	($a,$b);
	&pxor	($d,$a);
	&pshufb	($d,$rot16);

	&paddd	($c,$d);
	&pxor	($b,$c);
	&movdqa	($t,$b);
	&psrld	($b,20);
	&pslld	($t,12);
	&por	($b,$t);

	&paddd	($a,$b);
	&pxor	($d,$a);
	&pshufb	($d,$rot24);

	&paddd	($c,$d);
	&pxor	($b,$c);
	&movdqa	($t,$b);
	&psrld	($b,25);
	&pslld	($t,7);
	&por	($b,$t);
}

# Bytes reserved below the 64-byte state copy; on Win64 this also holds
# the saved %xmm6 and %xmm7.
my $xframe = $win64 ? 32+8 : 8;

$code.=<<___;
.type	ChaCha20_ssse3,\@function,5
.align	32
ChaCha20_ssse3:
.cfi_startproc
.LChaCha20_ssse3:
	mov	%rsp,%r9		# frame pointer
.cfi_def_cfa_register	%r9
___
$code.=<<___	if ($avx);
	test	\$`1<<(43-32)`,%r10d
	jnz	.LChaCha20_4xop		# XOP is fastest even if we use 1/4
___
$code.=<<___;
	cmp	\$128,$len		# we might throw away some data,
	je	.LChaCha20_128
	ja	.LChaCha20_4x		# but overall it won't be slower

.Ldo_sse3_after_all:
	sub	\$64+$xframe,%rsp
___
$code.=<<___	if ($win64);
	movaps	%xmm6,-0x28(%r9)
	movaps	%xmm7,-0x18(%r9)
.Lssse3_body:
___
$code.=<<___;
	movdqa	.Lsigma(%rip),$a
	movdqu	($key),$b
	movdqu	16($key),$c
	movdqu	($counter),$d
	movdqa	.Lrot16(%rip),$rot16
	movdqa	.Lrot24(%rip),$rot24

	movdqa	$a,0x00(%rsp)
	movdqa	$b,0x10(%rsp)
	movdqa	$c,0x20(%rsp)
	movdqa	$d,0x30(%rsp)
	mov	\$10,$counter		# reuse $counter
	jmp	.Loop_ssse3

.align	32
.Loop_outer_ssse3:
	movdqa	.Lone(%rip),$d
	movdqa	0x00(%rsp),$a
	movdqa	0x10(%rsp),$b
	movdqa	0x20(%rsp),$c
	paddd	0x30(%rsp),$d
	mov	\$10,$counter
	movdqa	$d,0x30(%rsp)
	jmp	.Loop_ssse3

.align	32
.Loop_ssse3:
___
	# first round, then rotate rows b,c,d by 1,2,3 words
	&SSSE3ROUND();
	&pshufd	($c,$c,0b01001110);
	&pshufd	($b,$b,0b00111001);
	&pshufd	($d,$d,0b10010011);
	&nop	();

	# second round, then rotate the rows back
	&SSSE3ROUND();
	&pshufd	($c,$c,0b01001110);
	&pshufd	($b,$b,0b10010011);
	&pshufd	($d,$d,0b00111001);

	&dec	($counter);
	&jnz	(".Loop_ssse3");

$code.=<<___;
	paddd	0x00(%rsp),$a
	paddd	0x10(%rsp),$b
	paddd	0x20(%rsp),$c
	paddd	0x30(%rsp),$d

	cmp	\$64,$len
	jb	.Ltail_ssse3

	movdqu	0x00($inp),$t
	movdqu	0x10($inp),$t1
	pxor	$t,$a			# xor with input
	movdqu	0x20($inp),$t
	pxor	$t1,$b
	movdqu	0x30($inp),$t1
	lea	0x40($inp),$inp		# inp+=64
	pxor	$t,$c
	pxor	$t1,$d

	movdqu	$a,0x00($out)		# write output
	movdqu	$b,0x10($out)
	movdqu	$c,0x20($out)
	movdqu	$d,0x30($out)
	lea	0x40($out),$out		# out+=64

	sub	\$64,$len
	jnz	.Loop_outer_ssse3

	jmp	.Ldone_ssse3

.align	16
.Ltail_ssse3:
	movdqa	$a,0x00(%rsp)
	movdqa	$b,0x10(%rsp)
	movdqa	$c,0x20(%rsp)
	movdqa	$d,0x30(%rsp)
	xor	$counter,$counter

.Loop_tail_ssse3:
	movzb	($inp,$counter),%eax
	movzb	(%rsp,$counter),%ecx
	lea	1($counter),$counter
	xor	%ecx,%eax
	mov	%al,-1($out,$counter)
	dec	$len
	jnz	.Loop_tail_ssse3

.Ldone_ssse3:
___
$code.=<<___	if ($win64);
	movaps	-0x28(%r9),%xmm6
	movaps	-0x18(%r9),%xmm7
___
$code.=<<___;
	lea	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.Lssse3_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_ssse3,.-ChaCha20_ssse3
___
}
609 | ||
d5487a45 AP |
########################################################################
# SSSE3 code path that handles 128-byte inputs
#
# Two 64-byte blocks are computed side by side: ($a,$b,$c,$d) holds the
# first block and ($a1,$b1,$c1,$d1) the second, whose counter row is the
# first one plus .Lone. There is no loop over the input and no tail
# handling; exactly 128 bytes are read and written.
{
my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(8,9,2..7));
my ($a1,$b1,$c1,$d1)=map("%xmm$_",(10,11,0,1));

# Emits one ChaCha round for both blocks, with the two instruction
# streams interleaved. $t and $t1 are the per-block scratch registers.
sub SSSE3ROUND_2x {
	&paddd	($a,$b);
	&pxor	($d,$a);
	 &paddd	($a1,$b1);
	 &pxor	($d1,$a1);
	&pshufb	($d,$rot16);
	 &pshufb($d1,$rot16);

	&paddd	($c,$d);
	 &paddd	($c1,$d1);
	&pxor	($b,$c);
	 &pxor	($b1,$c1);
	&movdqa	($t,$b);
	&psrld	($b,20);
	 &movdqa($t1,$b1);
	&pslld	($t,12);
	 &psrld	($b1,20);
	&por	($b,$t);
	 &pslld	($t1,12);
	 &por	($b1,$t1);

	&paddd	($a,$b);
	&pxor	($d,$a);
	 &paddd	($a1,$b1);
	 &pxor	($d1,$a1);
	&pshufb	($d,$rot24);
	 &pshufb($d1,$rot24);

	&paddd	($c,$d);
	 &paddd	($c1,$d1);
	&pxor	($b,$c);
	 &pxor	($b1,$c1);
	&movdqa	($t,$b);
	&psrld	($b,25);
	 &movdqa($t1,$b1);
	&pslld	($t,7);
	 &psrld	($b1,25);
	&por	($b,$t);
	 &pslld	($t1,7);
	 &por	($b1,$t1);
}

# Bytes reserved below the 64-byte state copy; on Win64 this also holds
# the saved %xmm6-%xmm11.
my $xframe = $win64 ? 0x68 : 8;

$code.=<<___;
.type	ChaCha20_128,\@function,5
.align	32
ChaCha20_128:
.cfi_startproc
.LChaCha20_128:
	mov	%rsp,%r9		# frame pointer
.cfi_def_cfa_register	%r9
	sub	\$64+$xframe,%rsp
___
$code.=<<___	if ($win64);
	movaps	%xmm6,-0x68(%r9)
	movaps	%xmm7,-0x58(%r9)
	movaps	%xmm8,-0x48(%r9)
	movaps	%xmm9,-0x38(%r9)
	movaps	%xmm10,-0x28(%r9)
	movaps	%xmm11,-0x18(%r9)
.L128_body:
___
$code.=<<___;
	movdqa	.Lsigma(%rip),$a
	movdqu	($key),$b
	movdqu	16($key),$c
	movdqu	($counter),$d
	movdqa	.Lone(%rip),$d1
	movdqa	.Lrot16(%rip),$rot16
	movdqa	.Lrot24(%rip),$rot24

	movdqa	$a,$a1
	movdqa	$a,0x00(%rsp)
	movdqa	$b,$b1
	movdqa	$b,0x10(%rsp)
	movdqa	$c,$c1
	movdqa	$c,0x20(%rsp)
	paddd	$d,$d1
	movdqa	$d,0x30(%rsp)
	mov	\$10,$counter		# reuse $counter
	jmp	.Loop_128

.align	32
.Loop_128:
___
	# first round, then rotate rows b,c,d of both blocks by 1,2,3 words
	&SSSE3ROUND_2x();
	&pshufd	($c,$c,0b01001110);
	&pshufd	($b,$b,0b00111001);
	&pshufd	($d,$d,0b10010011);
	&pshufd	($c1,$c1,0b01001110);
	&pshufd	($b1,$b1,0b00111001);
	&pshufd	($d1,$d1,0b10010011);

	# second round, then rotate the rows back
	&SSSE3ROUND_2x();
	&pshufd	($c,$c,0b01001110);
	&pshufd	($b,$b,0b10010011);
	&pshufd	($d,$d,0b00111001);
	&pshufd	($c1,$c1,0b01001110);
	&pshufd	($b1,$b1,0b10010011);
	&pshufd	($d1,$d1,0b00111001);

	&dec	($counter);
	&jnz	(".Loop_128");

$code.=<<___;
	paddd	0x00(%rsp),$a
	paddd	0x10(%rsp),$b
	paddd	0x20(%rsp),$c
	paddd	0x30(%rsp),$d
	paddd	.Lone(%rip),$d1
	paddd	0x00(%rsp),$a1
	paddd	0x10(%rsp),$b1
	paddd	0x20(%rsp),$c1
	paddd	0x30(%rsp),$d1

	movdqu	0x00($inp),$t
	movdqu	0x10($inp),$t1
	pxor	$t,$a			# xor with input
	movdqu	0x20($inp),$t
	pxor	$t1,$b
	movdqu	0x30($inp),$t1
	pxor	$t,$c
	movdqu	0x40($inp),$t
	pxor	$t1,$d
	movdqu	0x50($inp),$t1
	pxor	$t,$a1
	movdqu	0x60($inp),$t
	pxor	$t1,$b1
	movdqu	0x70($inp),$t1
	pxor	$t,$c1
	pxor	$t1,$d1

	movdqu	$a,0x00($out)		# write output
	movdqu	$b,0x10($out)
	movdqu	$c,0x20($out)
	movdqu	$d,0x30($out)
	movdqu	$a1,0x40($out)
	movdqu	$b1,0x50($out)
	movdqu	$c1,0x60($out)
	movdqu	$d1,0x70($out)
___
$code.=<<___	if ($win64);
	movaps	-0x68(%r9),%xmm6
	movaps	-0x58(%r9),%xmm7
	movaps	-0x48(%r9),%xmm8
	movaps	-0x38(%r9),%xmm9
	movaps	-0x28(%r9),%xmm10
	movaps	-0x18(%r9),%xmm11
___
$code.=<<___;
	lea	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.L128_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_128,.-ChaCha20_128
___
}
775 | ||
a98c648e AP |
776 | ######################################################################## |
777 | # SSSE3 code path that handles longer messages. | |
778 | { | |
# assign variables to favor Atom front-end
# 'd' rows live in %xmm0-3, temporaries in %xmm4-7, 'a' rows in
# %xmm8-11 and 'b' rows in %xmm12-15. The 'c' rows have no permanent
# register ("%nox" placeholders in @xx).
my ($xd0,$xd1,$xd2,$xd3, $xt0,$xt1,$xt2,$xt3,
    $xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3)=map("%xmm$_",(0..15));
my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
	"%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);

# SSSE3_lane_ROUND(a0,b0,c0,d0) returns, as a list of perlasm statements
# to be eval-ed by the caller, one ChaCha round with one state word per
# register. The arguments are the state-word indices of the first
# quarter-round; the other three are derived by stepping each index
# within its row. $xt0/$xt1 hold the current pair of 'c' words, and
# $xt2/$xt3 double as scratch and as holders of the pshufb masks loaded
# from (%r10) and (%r11). On entry the second of them is used as the
# rotate-by-16 mask, and every half ends by reloading it from (%r10).
# Intended to be called in list context.
sub SSSE3_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
my @x=map("\"$_\"",@xx);

	# Consider order in which variables are addressed by their
	# index:
	#
	#	a   b   c   d
	#
	#	0   4   8  12 < even round
	#	1   5   9  13
	#	2   6  10  14
	#	3   7  11  15
	#	0   5  10  15 < odd round
	#	1   6  11  12
	#	2   7   8  13
	#	3   4   9  14
	#
	# 'a', 'b' and 'd's are permanently allocated in registers,
	# @x[0..7,12..15], while 'c's are maintained in memory. If
	# you observe 'c' column, you'll notice that pair of 'c's is
	# invariant between rounds. This means that we have to reload
	# them once per round, in the middle. This is why you'll see
	# bunch of 'c' stores and loads in the middle, but none in
	# the beginning or end.

	# Two interleaved quarter-rounds, the first using $xc and the
	# second $xc_ as its 'c' word.  Arguments are the a/b/d indices
	# of each.
	my $qr_pair = sub {
	    my ($ai,$bi,$di, $aj,$bj,$dj)=@_;
	    (
	    "&paddd		($x[$ai],$x[$bi])",
	     "&paddd	($x[$aj],$x[$bj])",
	    "&pxor		($x[$di],$x[$ai])",
	     "&pxor		($x[$dj],$x[$aj])",
	    "&pshufb	($x[$di],$t1)",
	     "&pshufb	($x[$dj],$t1)",

	    "&paddd		($xc,$x[$di])",
	     "&paddd	($xc_,$x[$dj])",
	    "&pxor		($x[$bi],$xc)",
	     "&pxor		($x[$bj],$xc_)",
	    "&movdqa	($t0,$x[$bi])",
	    "&pslld		($x[$bi],12)",
	    "&psrld		($t0,20)",
	     "&movdqa	($t1,$x[$bj])",
	     "&pslld	($x[$bj],12)",
	    "&por		($x[$bi],$t0)",
	     "&psrld	($t1,20)",
	    "&movdqa	($t0,'(%r11)')",	# .Lrot24(%rip)
	     "&por		($x[$bj],$t1)",

	    "&paddd		($x[$ai],$x[$bi])",
	     "&paddd	($x[$aj],$x[$bj])",
	    "&pxor		($x[$di],$x[$ai])",
	     "&pxor		($x[$dj],$x[$aj])",
	    "&pshufb	($x[$di],$t0)",
	     "&pshufb	($x[$dj],$t0)",

	    "&paddd		($xc,$x[$di])",
	     "&paddd	($xc_,$x[$dj])",
	    "&pxor		($x[$bi],$xc)",
	     "&pxor		($x[$bj],$xc_)",
	    "&movdqa	($t1,$x[$bi])",
	    "&pslld		($x[$bi],7)",
	    "&psrld		($t1,25)",
	     "&movdqa	($t0,$x[$bj])",
	     "&pslld	($x[$bj],7)",
	    "&por		($x[$bi],$t1)",
	     "&psrld	($t0,25)",
	    "&movdqa	($t1,'(%r10)')",	# .Lrot16(%rip)
	     "&por		($x[$bj],$t0)"
	    );
	};

	(
	$qr_pair->($a0,$b0,$d0, $a1,$b1,$d1),		# Q1, Q2

	"&movdqa	(\"`16*($c0-8)`(%rsp)\",$xc)",	# reload pair of 'c's
	 "&movdqa	(\"`16*($c1-8)`(%rsp)\",$xc_)",
	"&movdqa	($xc,\"`16*($c2-8)`(%rsp)\")",
	 "&movdqa	($xc_,\"`16*($c3-8)`(%rsp)\")",

	$qr_pair->($a2,$b2,$d2, $a3,$b3,$d3)		# Q3, Q4
	);
}
906 | ||
384e6de4 | 907 | my $xframe = $win64 ? 0xa8 : 8; |
a98c648e AP |
908 | |
909 | $code.=<<___; | |
910 | .type ChaCha20_4x,\@function,5 | |
911 | .align 32 | |
912 | ChaCha20_4x: | |
f17652e5 | 913 | .cfi_startproc |
a98c648e | 914 | .LChaCha20_4x: |
384e6de4 | 915 | mov %rsp,%r9 # frame pointer |
f17652e5 | 916 | .cfi_def_cfa_register %r9 |
a98c648e AP |
917 | mov %r10,%r11 |
918 | ___ | |
919 | $code.=<<___ if ($avx>1); | |
920 | shr \$32,%r10 # OPENSSL_ia32cap_P+8 | |
921 | test \$`1<<5`,%r10 # test AVX2 | |
922 | jnz .LChaCha20_8x | |
923 | ___ | |
924 | $code.=<<___; | |
925 | cmp \$192,$len | |
926 | ja .Lproceed4x | |
927 | ||
928 | and \$`1<<26|1<<22`,%r11 # isolate XSAVE+MOVBE | |
929 | cmp \$`1<<22`,%r11 # check for MOVBE without XSAVE | |
930 | je .Ldo_sse3_after_all # to detect Atom | |
931 | ||
932 | .Lproceed4x: | |
384e6de4 | 933 | sub \$0x140+$xframe,%rsp |
a98c648e AP |
934 | ___ |
935 | ################ stack layout | |
936 | # +0x00 SIMD equivalent of @x[8-12] | |
937 | # ... | |
938 | # +0x40 constant copy of key[0-2] smashed by lanes | |
939 | # ... | |
940 | # +0x100 SIMD counters (with nonce smashed by lanes) | |
941 | # ... | |
942 | # +0x140 | |
943 | $code.=<<___ if ($win64); | |
384e6de4 AP |
944 | movaps %xmm6,-0xa8(%r9) |
945 | movaps %xmm7,-0x98(%r9) | |
946 | movaps %xmm8,-0x88(%r9) | |
947 | movaps %xmm9,-0x78(%r9) | |
948 | movaps %xmm10,-0x68(%r9) | |
949 | movaps %xmm11,-0x58(%r9) | |
950 | movaps %xmm12,-0x48(%r9) | |
951 | movaps %xmm13,-0x38(%r9) | |
952 | movaps %xmm14,-0x28(%r9) | |
953 | movaps %xmm15,-0x18(%r9) | |
954 | .L4x_body: | |
a98c648e AP |
955 | ___ |
# State set-up and head of the outer loop.
#  - Load the four 16-byte input rows: .Lsigma, 32 bytes at $key, and 16 bytes
#    at $counter.
#  - Broadcast every 32-bit word of each row to all four lanes with pshufd
#    (0x00/0x55/0xaa/0xff) and store the broadcast copies at 0x40..0x13f of the
#    frame, so .Loop_outer4x can reload them for every batch.
#  - %rcx = %rsp+0x100, so the upper slots are written as "offset-0x100(%rcx)".
#    NOTE(review): "size optimization" presumably means those displacements
#    fit in one byte - confirm.
#  - The lane counters ($xd0) get .Linc added on first entry and .Lfour on each
#    later outer iteration; they are saved only at .Loop_enter4x.
#  - %r10/%r11 point at .Lrot16/.Lrot24 for the round generator; %eax counts
#    10 iterations of .Loop4x.
#  - "c2"/"c3" ($xt2/$xt3) are parked at 0x20/0x30(%rsp) and $xt3 is reloaded
#    with the .Lrot16 mask before the rounds start.
956 | $code.=<<___; | |
957 | movdqa .Lsigma(%rip),$xa3 # key[0] | |
958 | movdqu ($key),$xb3 # key[1] | |
959 | movdqu 16($key),$xt3 # key[2] | |
960 | movdqu ($counter),$xd3 # key[3] | |
961 | lea 0x100(%rsp),%rcx # size optimization | |
962 | lea .Lrot16(%rip),%r10 | |
963 | lea .Lrot24(%rip),%r11 | |
964 | ||
965 | pshufd \$0x00,$xa3,$xa0 # smash key by lanes... | |
966 | pshufd \$0x55,$xa3,$xa1 | |
967 | movdqa $xa0,0x40(%rsp) # ... and offload | |
968 | pshufd \$0xaa,$xa3,$xa2 | |
969 | movdqa $xa1,0x50(%rsp) | |
970 | pshufd \$0xff,$xa3,$xa3 | |
971 | movdqa $xa2,0x60(%rsp) | |
972 | movdqa $xa3,0x70(%rsp) | |
973 | ||
974 | pshufd \$0x00,$xb3,$xb0 | |
975 | pshufd \$0x55,$xb3,$xb1 | |
976 | movdqa $xb0,0x80-0x100(%rcx) | |
977 | pshufd \$0xaa,$xb3,$xb2 | |
978 | movdqa $xb1,0x90-0x100(%rcx) | |
979 | pshufd \$0xff,$xb3,$xb3 | |
980 | movdqa $xb2,0xa0-0x100(%rcx) | |
981 | movdqa $xb3,0xb0-0x100(%rcx) | |
982 | ||
983 | pshufd \$0x00,$xt3,$xt0 # "$xc0" | |
984 | pshufd \$0x55,$xt3,$xt1 # "$xc1" | |
985 | movdqa $xt0,0xc0-0x100(%rcx) | |
986 | pshufd \$0xaa,$xt3,$xt2 # "$xc2" | |
987 | movdqa $xt1,0xd0-0x100(%rcx) | |
988 | pshufd \$0xff,$xt3,$xt3 # "$xc3" | |
989 | movdqa $xt2,0xe0-0x100(%rcx) | |
990 | movdqa $xt3,0xf0-0x100(%rcx) | |
991 | ||
992 | pshufd \$0x00,$xd3,$xd0 | |
993 | pshufd \$0x55,$xd3,$xd1 | |
994 | paddd .Linc(%rip),$xd0 # don't save counters yet | |
995 | pshufd \$0xaa,$xd3,$xd2 | |
996 | movdqa $xd1,0x110-0x100(%rcx) | |
997 | pshufd \$0xff,$xd3,$xd3 | |
998 | movdqa $xd2,0x120-0x100(%rcx) | |
999 | movdqa $xd3,0x130-0x100(%rcx) | |
1000 | ||
1001 | jmp .Loop_enter4x | |
1002 | ||
1003 | .align 32 | |
1004 | .Loop_outer4x: | |
1005 | movdqa 0x40(%rsp),$xa0 # re-load smashed key | |
1006 | movdqa 0x50(%rsp),$xa1 | |
1007 | movdqa 0x60(%rsp),$xa2 | |
1008 | movdqa 0x70(%rsp),$xa3 | |
1009 | movdqa 0x80-0x100(%rcx),$xb0 | |
1010 | movdqa 0x90-0x100(%rcx),$xb1 | |
1011 | movdqa 0xa0-0x100(%rcx),$xb2 | |
1012 | movdqa 0xb0-0x100(%rcx),$xb3 | |
1013 | movdqa 0xc0-0x100(%rcx),$xt0 # "$xc0" | |
1014 | movdqa 0xd0-0x100(%rcx),$xt1 # "$xc1" | |
1015 | movdqa 0xe0-0x100(%rcx),$xt2 # "$xc2" | |
1016 | movdqa 0xf0-0x100(%rcx),$xt3 # "$xc3" | |
1017 | movdqa 0x100-0x100(%rcx),$xd0 | |
1018 | movdqa 0x110-0x100(%rcx),$xd1 | |
1019 | movdqa 0x120-0x100(%rcx),$xd2 | |
1020 | movdqa 0x130-0x100(%rcx),$xd3 | |
1021 | paddd .Lfour(%rip),$xd0 # next SIMD counters | |
1022 | ||
1023 | .Loop_enter4x: | |
1024 | movdqa $xt2,0x20(%rsp) # SIMD equivalent of "@x[10]" | |
1025 | movdqa $xt3,0x30(%rsp) # SIMD equivalent of "@x[11]" | |
1026 | movdqa (%r10),$xt3 # .Lrot16(%rip) | |
1027 | mov \$10,%eax | |
1028 | movdqa $xd0,0x100-0x100(%rcx) # save SIMD counters | |
1029 | jmp .Loop4x | |
1030 | ||
1031 | .align 32 | |
1032 | .Loop4x: | |
1033 | ___ | |
# Body of .Loop4x: one round over state indices (0,4,8,12) followed by one
# over (0,5,10,15); with %eax counting down from 10 that is 20 rounds.
# SSSE3_lane_ROUND returns instruction strings, each of which is eval'ed as a
# Perl call.
# NOTE(review): the calls are presumably resolved by an AUTOLOAD defined
# earlier in the file that appends the instruction to $code - confirm.
1034 | foreach (&SSSE3_lane_ROUND(0, 4, 8,12)) { eval; } | |
1035 | foreach (&SSSE3_lane_ROUND(0, 5,10,15)) { eval; } | |
# Close the round loop, add the saved input 'a' rows (0x40..0x70) back in, and
# transpose the 4x4 matrix of dwords held in $xa0..$xa3 ("de-interlace"), so
# that each register ends up holding the 16-byte 'a' row of one block.
1036 | $code.=<<___; | |
1037 | dec %eax | |
1038 | jnz .Loop4x | |
1039 | ||
1040 | paddd 0x40(%rsp),$xa0 # accumulate key material | |
1041 | paddd 0x50(%rsp),$xa1 | |
1042 | paddd 0x60(%rsp),$xa2 | |
1043 | paddd 0x70(%rsp),$xa3 | |
1044 | ||
1045 | movdqa $xa0,$xt2 # "de-interlace" data | |
1046 | punpckldq $xa1,$xa0 | |
1047 | movdqa $xa2,$xt3 | |
1048 | punpckldq $xa3,$xa2 | |
1049 | punpckhdq $xa1,$xt2 | |
1050 | punpckhdq $xa3,$xt3 | |
1051 | movdqa $xa0,$xa1 | |
1052 | punpcklqdq $xa2,$xa0 # "a0" | |
1053 | movdqa $xt2,$xa3 | |
1054 | punpcklqdq $xt3,$xt2 # "a2" | |
1055 | punpckhqdq $xa2,$xa1 # "a1" | |
1056 | punpckhqdq $xt3,$xa3 # "a3" | |
1057 | ___ | |
# The transpose left "a2" in the register named $xt2 (see the inline labels).
# Swap the Perl names so $xa2 again denotes the register holding a2 and $xt2
# denotes a free scratch register.  No instruction is emitted for this.
1058 | ($xa2,$xt2)=($xt2,$xa2); | |
# 'b' rows: add the saved input rows (0x80..0xb0) and transpose.  Before that,
# a0/a1 are spilled to 0x00/0x10(%rsp) and their registers are reloaded with
# the "xc2"/"xc3" values parked at 0x20/0x30(%rsp).
1059 | $code.=<<___; | |
1060 | paddd 0x80-0x100(%rcx),$xb0 | |
1061 | paddd 0x90-0x100(%rcx),$xb1 | |
1062 | paddd 0xa0-0x100(%rcx),$xb2 | |
1063 | paddd 0xb0-0x100(%rcx),$xb3 | |
1064 | ||
1065 | movdqa $xa0,0x00(%rsp) # offload $xaN | |
1066 | movdqa $xa1,0x10(%rsp) | |
1067 | movdqa 0x20(%rsp),$xa0 # "xc2" | |
1068 | movdqa 0x30(%rsp),$xa1 # "xc3" | |
1069 | ||
1070 | movdqa $xb0,$xt2 | |
1071 | punpckldq $xb1,$xb0 | |
1072 | movdqa $xb2,$xt3 | |
1073 | punpckldq $xb3,$xb2 | |
1074 | punpckhdq $xb1,$xt2 | |
1075 | punpckhdq $xb3,$xt3 | |
1076 | movdqa $xb0,$xb1 | |
1077 | punpcklqdq $xb2,$xb0 # "b0" | |
1078 | movdqa $xt2,$xb3 | |
1079 | punpcklqdq $xt3,$xt2 # "b2" | |
1080 | punpckhqdq $xb2,$xb1 # "b1" | |
1081 | punpckhqdq $xt3,$xb3 # "b3" | |
1082 | ___ | |
# "b2" ended up in $xt2: swap the Perl names, as was done for the 'a' rows.
1083 | ($xb2,$xt2)=($xt2,$xb2); | |
# Names for the 'c' rows: c0/c1 are still in $xt0/$xt1, c2/c3 were just loaded
# into the registers named $xa0/$xa1.
1084 | my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1); | |
# 'c' rows: add the saved input rows (0xc0..0xf0) and transpose.  a2/a3 are
# spilled to 0x20/0x30(%rsp) first, so after this chunk all four 'a' rows live
# at 0x00..0x3f(%rsp).
1085 | $code.=<<___; | |
1086 | paddd 0xc0-0x100(%rcx),$xc0 | |
1087 | paddd 0xd0-0x100(%rcx),$xc1 | |
1088 | paddd 0xe0-0x100(%rcx),$xc2 | |
1089 | paddd 0xf0-0x100(%rcx),$xc3 | |
1090 | ||
1091 | movdqa $xa2,0x20(%rsp) # keep offloading $xaN | |
1092 | movdqa $xa3,0x30(%rsp) | |
1093 | ||
1094 | movdqa $xc0,$xt2 | |
1095 | punpckldq $xc1,$xc0 | |
1096 | movdqa $xc2,$xt3 | |
1097 | punpckldq $xc3,$xc2 | |
1098 | punpckhdq $xc1,$xt2 | |
1099 | punpckhdq $xc3,$xt3 | |
1100 | movdqa $xc0,$xc1 | |
1101 | punpcklqdq $xc2,$xc0 # "c0" | |
1102 | movdqa $xt2,$xc3 | |
1103 | punpcklqdq $xt3,$xt2 # "c2" | |
1104 | punpckhqdq $xc2,$xc1 # "c1" | |
1105 | punpckhqdq $xt3,$xc3 # "c3" | |
1106 | ___ | |
# "c2" ended up in $xt2: swap the Perl names.  The registers that held a2/a3
# were spilled above and are free, so they become the temporaries $xt0/$xt1.
1107 | ($xc2,$xt2)=($xt2,$xc2); | |
1108 | ($xt0,$xt1)=($xa2,$xa3); # use $xaN as temporary | |
# 'd' rows: add the saved counter/nonce lanes (0x100..0x130) and transpose.
1109 | $code.=<<___; | |
1110 | paddd 0x100-0x100(%rcx),$xd0 | |
1111 | paddd 0x110-0x100(%rcx),$xd1 | |
1112 | paddd 0x120-0x100(%rcx),$xd2 | |
1113 | paddd 0x130-0x100(%rcx),$xd3 | |
1114 | ||
1115 | movdqa $xd0,$xt2 | |
1116 | punpckldq $xd1,$xd0 | |
1117 | movdqa $xd2,$xt3 | |
1118 | punpckldq $xd3,$xd2 | |
1119 | punpckhdq $xd1,$xt2 | |
1120 | punpckhdq $xd3,$xt3 | |
1121 | movdqa $xd0,$xd1 | |
1122 | punpcklqdq $xd2,$xd0 # "d0" | |
1123 | movdqa $xt2,$xd3 | |
1124 | punpcklqdq $xt3,$xt2 # "d2" | |
1125 | punpckhqdq $xd2,$xd1 # "d1" | |
1126 | punpckhqdq $xt3,$xd3 # "d3" | |
1127 | ___ | |
# "d2" ended up in $xt2: swap the Perl names once more.
1128 | ($xd2,$xt2)=($xt2,$xd2); | |
# Output stage.
#  - With at least 64*4 bytes left: XOR four whole blocks of key stream into
#    the data, 16 bytes at a time.  The 'a' rows come from 0x00..0x30(%rsp),
#    the b/c/d rows from registers.  Advance $inp/$out by 0x100 and loop to
#    .Loop_outer4x while $len is non-zero.
#  - Otherwise (.Ltail4x): process as many whole 64-byte blocks as fit (0..3),
#    store the next block of key stream at 0x00..0x3f(%rsp) and finish byte by
#    byte in .Loop_tail4x.
# Notes on the tail paths:
#  - Each "je .Ldone4x" consumes the flags of the cmp that selected the path
#    (len == 64/128/192); the SSE loads, xors and stores in between leave the
#    flags untouched.
#  - In the shortest path a0 is already at 0x00(%rsp), which is why the two
#    movdqa lines there are commented out.
#  - %r10 (until now the .Lrot16 pointer) becomes the byte index and %ecx
#    (low half of the frame pointer helper %rcx) a scratch byte; neither is
#    needed again because the tail never returns to the round loop.
1129 | $code.=<<___; | |
1130 | cmp \$64*4,$len | |
1131 | jb .Ltail4x | |
1132 | ||
1133 | movdqu 0x00($inp),$xt0 # xor with input | |
1134 | movdqu 0x10($inp),$xt1 | |
1135 | movdqu 0x20($inp),$xt2 | |
1136 | movdqu 0x30($inp),$xt3 | |
1137 | pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember? | |
1138 | pxor $xb0,$xt1 | |
1139 | pxor $xc0,$xt2 | |
1140 | pxor $xd0,$xt3 | |
1141 | ||
1142 | movdqu $xt0,0x00($out) | |
1143 | movdqu 0x40($inp),$xt0 | |
1144 | movdqu $xt1,0x10($out) | |
1145 | movdqu 0x50($inp),$xt1 | |
1146 | movdqu $xt2,0x20($out) | |
1147 | movdqu 0x60($inp),$xt2 | |
1148 | movdqu $xt3,0x30($out) | |
1149 | movdqu 0x70($inp),$xt3 | |
1150 | lea 0x80($inp),$inp # size optimization | |
1151 | pxor 0x10(%rsp),$xt0 | |
1152 | pxor $xb1,$xt1 | |
1153 | pxor $xc1,$xt2 | |
1154 | pxor $xd1,$xt3 | |
1155 | ||
1156 | movdqu $xt0,0x40($out) | |
1157 | movdqu 0x00($inp),$xt0 | |
1158 | movdqu $xt1,0x50($out) | |
1159 | movdqu 0x10($inp),$xt1 | |
1160 | movdqu $xt2,0x60($out) | |
1161 | movdqu 0x20($inp),$xt2 | |
1162 | movdqu $xt3,0x70($out) | |
1163 | lea 0x80($out),$out # size optimization | |
1164 | movdqu 0x30($inp),$xt3 | |
1165 | pxor 0x20(%rsp),$xt0 | |
1166 | pxor $xb2,$xt1 | |
1167 | pxor $xc2,$xt2 | |
1168 | pxor $xd2,$xt3 | |
1169 | ||
1170 | movdqu $xt0,0x00($out) | |
1171 | movdqu 0x40($inp),$xt0 | |
1172 | movdqu $xt1,0x10($out) | |
1173 | movdqu 0x50($inp),$xt1 | |
1174 | movdqu $xt2,0x20($out) | |
1175 | movdqu 0x60($inp),$xt2 | |
1176 | movdqu $xt3,0x30($out) | |
1177 | movdqu 0x70($inp),$xt3 | |
1178 | lea 0x80($inp),$inp # inp+=64*4 | |
1179 | pxor 0x30(%rsp),$xt0 | |
1180 | pxor $xb3,$xt1 | |
1181 | pxor $xc3,$xt2 | |
1182 | pxor $xd3,$xt3 | |
1183 | movdqu $xt0,0x40($out) | |
1184 | movdqu $xt1,0x50($out) | |
1185 | movdqu $xt2,0x60($out) | |
1186 | movdqu $xt3,0x70($out) | |
1187 | lea 0x80($out),$out # out+=64*4 | |
1188 | ||
1189 | sub \$64*4,$len | |
1190 | jnz .Loop_outer4x | |
1191 | ||
1192 | jmp .Ldone4x | |
1193 | ||
1194 | .Ltail4x: | |
1195 | cmp \$192,$len | |
1196 | jae .L192_or_more4x | |
1197 | cmp \$128,$len | |
1198 | jae .L128_or_more4x | |
1199 | cmp \$64,$len | |
1200 | jae .L64_or_more4x | |
1201 | ||
1202 | #movdqa 0x00(%rsp),$xt0 # $xaN is offloaded, remember? | |
1203 | xor %r10,%r10 | |
1204 | #movdqa $xt0,0x00(%rsp) | |
1205 | movdqa $xb0,0x10(%rsp) | |
1206 | movdqa $xc0,0x20(%rsp) | |
1207 | movdqa $xd0,0x30(%rsp) | |
1208 | jmp .Loop_tail4x | |
1209 | ||
1210 | .align 32 | |
1211 | .L64_or_more4x: | |
1212 | movdqu 0x00($inp),$xt0 # xor with input | |
1213 | movdqu 0x10($inp),$xt1 | |
1214 | movdqu 0x20($inp),$xt2 | |
1215 | movdqu 0x30($inp),$xt3 | |
1216 | pxor 0x00(%rsp),$xt0 # $xaxN is offloaded, remember? | |
1217 | pxor $xb0,$xt1 | |
1218 | pxor $xc0,$xt2 | |
1219 | pxor $xd0,$xt3 | |
1220 | movdqu $xt0,0x00($out) | |
1221 | movdqu $xt1,0x10($out) | |
1222 | movdqu $xt2,0x20($out) | |
1223 | movdqu $xt3,0x30($out) | |
1224 | je .Ldone4x | |
1225 | ||
1226 | movdqa 0x10(%rsp),$xt0 # $xaN is offloaded, remember? | |
1227 | lea 0x40($inp),$inp # inp+=64*1 | |
1228 | xor %r10,%r10 | |
1229 | movdqa $xt0,0x00(%rsp) | |
1230 | movdqa $xb1,0x10(%rsp) | |
1231 | lea 0x40($out),$out # out+=64*1 | |
1232 | movdqa $xc1,0x20(%rsp) | |
1233 | sub \$64,$len # len-=64*1 | |
1234 | movdqa $xd1,0x30(%rsp) | |
1235 | jmp .Loop_tail4x | |
1236 | ||
1237 | .align 32 | |
1238 | .L128_or_more4x: | |
1239 | movdqu 0x00($inp),$xt0 # xor with input | |
1240 | movdqu 0x10($inp),$xt1 | |
1241 | movdqu 0x20($inp),$xt2 | |
1242 | movdqu 0x30($inp),$xt3 | |
1243 | pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember? | |
1244 | pxor $xb0,$xt1 | |
1245 | pxor $xc0,$xt2 | |
1246 | pxor $xd0,$xt3 | |
1247 | ||
1248 | movdqu $xt0,0x00($out) | |
1249 | movdqu 0x40($inp),$xt0 | |
1250 | movdqu $xt1,0x10($out) | |
1251 | movdqu 0x50($inp),$xt1 | |
1252 | movdqu $xt2,0x20($out) | |
1253 | movdqu 0x60($inp),$xt2 | |
1254 | movdqu $xt3,0x30($out) | |
1255 | movdqu 0x70($inp),$xt3 | |
1256 | pxor 0x10(%rsp),$xt0 | |
1257 | pxor $xb1,$xt1 | |
1258 | pxor $xc1,$xt2 | |
1259 | pxor $xd1,$xt3 | |
1260 | movdqu $xt0,0x40($out) | |
1261 | movdqu $xt1,0x50($out) | |
1262 | movdqu $xt2,0x60($out) | |
1263 | movdqu $xt3,0x70($out) | |
1264 | je .Ldone4x | |
1265 | ||
1266 | movdqa 0x20(%rsp),$xt0 # $xaN is offloaded, remember? | |
1267 | lea 0x80($inp),$inp # inp+=64*2 | |
1268 | xor %r10,%r10 | |
1269 | movdqa $xt0,0x00(%rsp) | |
1270 | movdqa $xb2,0x10(%rsp) | |
1271 | lea 0x80($out),$out # out+=64*2 | |
1272 | movdqa $xc2,0x20(%rsp) | |
1273 | sub \$128,$len # len-=64*2 | |
1274 | movdqa $xd2,0x30(%rsp) | |
1275 | jmp .Loop_tail4x | |
1276 | ||
1277 | .align 32 | |
1278 | .L192_or_more4x: | |
1279 | movdqu 0x00($inp),$xt0 # xor with input | |
1280 | movdqu 0x10($inp),$xt1 | |
1281 | movdqu 0x20($inp),$xt2 | |
1282 | movdqu 0x30($inp),$xt3 | |
1283 | pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember? | |
1284 | pxor $xb0,$xt1 | |
1285 | pxor $xc0,$xt2 | |
1286 | pxor $xd0,$xt3 | |
1287 | ||
1288 | movdqu $xt0,0x00($out) | |
1289 | movdqu 0x40($inp),$xt0 | |
1290 | movdqu $xt1,0x10($out) | |
1291 | movdqu 0x50($inp),$xt1 | |
1292 | movdqu $xt2,0x20($out) | |
1293 | movdqu 0x60($inp),$xt2 | |
1294 | movdqu $xt3,0x30($out) | |
1295 | movdqu 0x70($inp),$xt3 | |
1296 | lea 0x80($inp),$inp # size optimization | |
1297 | pxor 0x10(%rsp),$xt0 | |
1298 | pxor $xb1,$xt1 | |
1299 | pxor $xc1,$xt2 | |
1300 | pxor $xd1,$xt3 | |
1301 | ||
1302 | movdqu $xt0,0x40($out) | |
1303 | movdqu 0x00($inp),$xt0 | |
1304 | movdqu $xt1,0x50($out) | |
1305 | movdqu 0x10($inp),$xt1 | |
1306 | movdqu $xt2,0x60($out) | |
1307 | movdqu 0x20($inp),$xt2 | |
1308 | movdqu $xt3,0x70($out) | |
1309 | lea 0x80($out),$out # size optimization | |
1310 | movdqu 0x30($inp),$xt3 | |
1311 | pxor 0x20(%rsp),$xt0 | |
1312 | pxor $xb2,$xt1 | |
1313 | pxor $xc2,$xt2 | |
1314 | pxor $xd2,$xt3 | |
1315 | movdqu $xt0,0x00($out) | |
1316 | movdqu $xt1,0x10($out) | |
1317 | movdqu $xt2,0x20($out) | |
1318 | movdqu $xt3,0x30($out) | |
1319 | je .Ldone4x | |
1320 | ||
1321 | movdqa 0x30(%rsp),$xt0 # $xaN is offloaded, remember? | |
1322 | lea 0x40($inp),$inp # inp+=64*3 | |
1323 | xor %r10,%r10 | |
1324 | movdqa $xt0,0x00(%rsp) | |
1325 | movdqa $xb3,0x10(%rsp) | |
1326 | lea 0x40($out),$out # out+=64*3 | |
1327 | movdqa $xc3,0x20(%rsp) | |
1328 | sub \$192,$len # len-=64*3 | |
1329 | movdqa $xd3,0x30(%rsp) | |
1330 | ||
1331 | .Loop_tail4x: | |
1332 | movzb ($inp,%r10),%eax | |
1333 | movzb (%rsp,%r10),%ecx | |
1334 | lea 1(%r10),%r10 | |
1335 | xor %ecx,%eax | |
1336 | mov %al,-1($out,%r10) | |
1337 | dec $len | |
1338 | jnz .Loop_tail4x | |
1339 | ||
1340 | .Ldone4x: | |
1341 | ___ | |
# Win64 only: reload %xmm6-%xmm15 from the slots written in the prologue.
1342 | $code.=<<___ if ($win64); | |
384e6de4 AP |
1343 | movaps -0xa8(%r9),%xmm6 | |
1344 | movaps -0x98(%r9),%xmm7 | |
1345 | movaps -0x88(%r9),%xmm8 | |
1346 | movaps -0x78(%r9),%xmm9 | |
1347 | movaps -0x68(%r9),%xmm10 | |
1348 | movaps -0x58(%r9),%xmm11 | |
1349 | movaps -0x48(%r9),%xmm12 | |
1350 | movaps -0x38(%r9),%xmm13 | |
1351 | movaps -0x28(%r9),%xmm14 | |
1352 | movaps -0x18(%r9),%xmm15 | |
a98c648e AP |
1353 | ___ | |
# Common epilogue: restore %rsp from the frame pointer saved in %r9 at entry,
# switch the CFA register back to %rsp for the unwinder, and return.
1354 | $code.=<<___; | |
384e6de4 | 1355 | lea (%r9),%rsp |
f17652e5 | 1356 | .cfi_def_cfa_register %rsp |
384e6de4 | 1357 | .L4x_epilogue: |
a98c648e | 1358 | ret |
f17652e5 | 1359 | .cfi_endproc |
a98c648e AP |
1360 | .size ChaCha20_4x,.-ChaCha20_4x | |
1361 | ___ | |
1362 | } | |
1363 | ||
1364 | ######################################################################## | |
1365 | # XOP code path that handles all lengths. | |
1366 | if ($avx) { | |
1367 | # There is some "anomaly" observed depending on instructions' size or | |
1368 | # alignment. If you look closely at below code you'll notice that | |
1369 | # sometimes argument order varies. The order affects instruction | |
1370 | # encoding by making it larger, and such fiddling gives 5% performance | |
1371 | # improvement. This is on FX-4100... | |
1372 | ||
# Register allocation.  The sixteen Perl names are bound to %xmm0..%xmm15 in
# the order written ($xb0 is %xmm0, ..., $xt3 is %xmm15).  @xx then lists the
# same registers by ChaCha state index: 0..3 'a', 4..7 'b', 8..11 'c' (kept in
# $xt0..$xt3) and 12..15 'd'.  XOP_lane_ROUND addresses its operands through
# this index order, so the whole state stays in registers during the rounds.
1373 | my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3, | |
1374 | $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%xmm$_",(0..15)); | |
1375 | my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, | |
1376 | $xt0,$xt1,$xt2,$xt3, $xd0,$xd1,$xd2,$xd3); | |
1377 | ||
# XOP_lane_ROUND($a0,$b0,$c0,$d0)
#
# Returns the list of instruction strings for one ChaCha round, i.e. four
# quarter-rounds (Q1..Q4) interleaved step by step.
#
# Arguments: the state indices (into @xx) of the a, b, c and d words of the
#   first quarter-round, e.g. (0,4,8,12) or (0,5,10,15).
# Returns: 48 strings of Perl source such as
#   &vpaddd ("%xmmA","%xmmA","%xmmB")
#   which the caller is expected to eval one by one.
#
# The indices of Q2..Q4 are derived by stepping each index to the next
# position inside its own group of four: ($_&~3) keeps the group base and
# (($_+1)&3) advances the position with wrap-around.
#
# Lines tagged "flip" pass the two source operands in the opposite order.
# vpaddd/vpxor are commutative, so the result is the same; the file's own note
# on instruction encoding size explains why the order is varied.
1378 | sub XOP_lane_ROUND { | |
1379 | my ($a0,$b0,$c0,$d0)=@_; | |
1380 | my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); | |
1381 | my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); | |
1382 | my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); | |
# Wrap every register name in double quotes so that it appears as a string
# literal inside the generated call.
1383 | my @x=map("\"$_\"",@xx); | |
1384 | ||
1385 | ( | |
# a += b; d ^= a; d <<<= 16
1386 | "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1 | |
1387 | "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2 | |
1388 | "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3 | |
1389 | "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4 | |
1390 | "&vpxor (@x[$d0],@x[$a0],@x[$d0])", | |
1391 | "&vpxor (@x[$d1],@x[$a1],@x[$d1])", | |
1392 | "&vpxor (@x[$d2],@x[$a2],@x[$d2])", | |
1393 | "&vpxor (@x[$d3],@x[$a3],@x[$d3])", | |
1394 | "&vprotd (@x[$d0],@x[$d0],16)", | |
1395 | "&vprotd (@x[$d1],@x[$d1],16)", | |
1396 | "&vprotd (@x[$d2],@x[$d2],16)", | |
1397 | "&vprotd (@x[$d3],@x[$d3],16)", | |
1398 | ||
# c += d; b ^= c; b <<<= 12
1399 | "&vpaddd (@x[$c0],@x[$c0],@x[$d0])", | |
1400 | "&vpaddd (@x[$c1],@x[$c1],@x[$d1])", | |
1401 | "&vpaddd (@x[$c2],@x[$c2],@x[$d2])", | |
1402 | "&vpaddd (@x[$c3],@x[$c3],@x[$d3])", | |
1403 | "&vpxor (@x[$b0],@x[$c0],@x[$b0])", | |
1404 | "&vpxor (@x[$b1],@x[$c1],@x[$b1])", | |
1405 | "&vpxor (@x[$b2],@x[$b2],@x[$c2])", # flip | |
1406 | "&vpxor (@x[$b3],@x[$b3],@x[$c3])", # flip | |
1407 | "&vprotd (@x[$b0],@x[$b0],12)", | |
1408 | "&vprotd (@x[$b1],@x[$b1],12)", | |
1409 | "&vprotd (@x[$b2],@x[$b2],12)", | |
1410 | "&vprotd (@x[$b3],@x[$b3],12)", | |
1411 | ||
# a += b; d ^= a; d <<<= 8
1412 | "&vpaddd (@x[$a0],@x[$b0],@x[$a0])", # flip | |
1413 | "&vpaddd (@x[$a1],@x[$b1],@x[$a1])", # flip | |
1414 | "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", | |
1415 | "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", | |
1416 | "&vpxor (@x[$d0],@x[$a0],@x[$d0])", | |
1417 | "&vpxor (@x[$d1],@x[$a1],@x[$d1])", | |
1418 | "&vpxor (@x[$d2],@x[$a2],@x[$d2])", | |
1419 | "&vpxor (@x[$d3],@x[$a3],@x[$d3])", | |
1420 | "&vprotd (@x[$d0],@x[$d0],8)", | |
1421 | "&vprotd (@x[$d1],@x[$d1],8)", | |
1422 | "&vprotd (@x[$d2],@x[$d2],8)", | |
1423 | "&vprotd (@x[$d3],@x[$d3],8)", | |
1424 | ||
# c += d; b ^= c; b <<<= 7
1425 | "&vpaddd (@x[$c0],@x[$c0],@x[$d0])", | |
1426 | "&vpaddd (@x[$c1],@x[$c1],@x[$d1])", | |
1427 | "&vpaddd (@x[$c2],@x[$c2],@x[$d2])", | |
1428 | "&vpaddd (@x[$c3],@x[$c3],@x[$d3])", | |
1429 | "&vpxor (@x[$b0],@x[$c0],@x[$b0])", | |
1430 | "&vpxor (@x[$b1],@x[$c1],@x[$b1])", | |
1431 | "&vpxor (@x[$b2],@x[$b2],@x[$c2])", # flip | |
1432 | "&vpxor (@x[$b3],@x[$b3],@x[$c3])", # flip | |
1433 | "&vprotd (@x[$b0],@x[$b0],7)", | |
1434 | "&vprotd (@x[$b1],@x[$b1],7)", | |
1435 | "&vprotd (@x[$b2],@x[$b2],7)", | |
1436 | "&vprotd (@x[$b3],@x[$b3],7)" | |
1437 | ); | |
1438 | } | |
1439 | ||
# Extra stack reserved on top of the 0x140-byte working area: on Win64 ten
# 16-byte save slots for %xmm6-%xmm15 plus 8 (0xa8 in total), elsewhere 8.
# NOTE(review): the 8 presumably re-aligns %rsp to 16 bytes for the aligned
# vmovdqa stack accesses - confirm against the calling convention.
384e6de4 | 1440 | my $xframe = $win64 ? 0xa8 : 8; |
a98c648e AP |
1441 | |
# Function entry: keep the incoming %rsp in %r9 as frame pointer and allocate
# the frame.  Unlike ChaCha20_4x there is no capability or length dispatch
# in this function.
1442 | $code.=<<___; | |
1443 | .type ChaCha20_4xop,\@function,5 | |
1444 | .align 32 | |
1445 | ChaCha20_4xop: | |
f17652e5 | 1446 | .cfi_startproc |
a98c648e | 1447 | .LChaCha20_4xop: |
384e6de4 | 1448 | mov %rsp,%r9 # frame pointer |
f17652e5 | 1449 | .cfi_def_cfa_register %r9 |
384e6de4 | 1450 | sub \$0x140+$xframe,%rsp |
a98c648e AP |
1451 | ___ | |
1452 | ################ stack layout | |
1453 | # +0x00 scratch: spill slots while de-interlacing, tail key stream | |
# (in this code path @x[8..11] stay in $xt0..$xt3 during the rounds, so the
# first 0x40 bytes are not a permanent home for the 'c' words)
1454 | # ... | |
1455 | # +0x40 constant copy of key[0-2] smashed by lanes | |
1456 | # ... | |
1457 | # +0x100 SIMD counters (with nonce smashed by lanes) | |
1458 | # ... | |
1459 | # +0x140 | |
# Win64 only: spill %xmm6-%xmm15 into the $xframe area below the frame
# pointer; the epilogue reloads them from the same offsets.
1460 | $code.=<<___ if ($win64); | |
384e6de4 AP |
1461 | movaps %xmm6,-0xa8(%r9) | |
1462 | movaps %xmm7,-0x98(%r9) | |
1463 | movaps %xmm8,-0x88(%r9) | |
1464 | movaps %xmm9,-0x78(%r9) | |
1465 | movaps %xmm10,-0x68(%r9) | |
1466 | movaps %xmm11,-0x58(%r9) | |
1467 | movaps %xmm12,-0x48(%r9) | |
1468 | movaps %xmm13,-0x38(%r9) | |
1469 | movaps %xmm14,-0x28(%r9) | |
1470 | movaps %xmm15,-0x18(%r9) | |
1471 | .L4xop_body: | |
a98c648e AP |
1472 | ___ | |
# State set-up and head of the outer loop (VEX-encoded twin of the SSSE3
# set-up).
#  - vzeroupper first, then load .Lsigma, 32 bytes at $key and 16 bytes at
#    $counter.
#  - Broadcast every 32-bit word to all four lanes with vpshufd and store the
#    broadcast copies at 0x40..0x13f of the frame; .Loop_outer4xop reloads
#    them for every batch.
#  - %rcx = %rsp+0x100 is the base for the upper slots ("size optimization").
#  - The lane counters ($xd0) get .Linc added on first entry and .Lfour on
#    each later outer iteration; they are saved at .Loop_enter4xop.
#  - No rotate-mask pointers are set up here: the round generator uses vprotd
#    with immediate counts.  %eax counts 10 iterations of .Loop4xop.
1473 | $code.=<<___; | |
1474 | vzeroupper | |
1475 | ||
1476 | vmovdqa .Lsigma(%rip),$xa3 # key[0] | |
1477 | vmovdqu ($key),$xb3 # key[1] | |
1478 | vmovdqu 16($key),$xt3 # key[2] | |
1479 | vmovdqu ($counter),$xd3 # key[3] | |
1480 | lea 0x100(%rsp),%rcx # size optimization | |
1481 | ||
1482 | vpshufd \$0x00,$xa3,$xa0 # smash key by lanes... | |
1483 | vpshufd \$0x55,$xa3,$xa1 | |
1484 | vmovdqa $xa0,0x40(%rsp) # ... and offload | |
1485 | vpshufd \$0xaa,$xa3,$xa2 | |
1486 | vmovdqa $xa1,0x50(%rsp) | |
1487 | vpshufd \$0xff,$xa3,$xa3 | |
1488 | vmovdqa $xa2,0x60(%rsp) | |
1489 | vmovdqa $xa3,0x70(%rsp) | |
1490 | ||
1491 | vpshufd \$0x00,$xb3,$xb0 | |
1492 | vpshufd \$0x55,$xb3,$xb1 | |
1493 | vmovdqa $xb0,0x80-0x100(%rcx) | |
1494 | vpshufd \$0xaa,$xb3,$xb2 | |
1495 | vmovdqa $xb1,0x90-0x100(%rcx) | |
1496 | vpshufd \$0xff,$xb3,$xb3 | |
1497 | vmovdqa $xb2,0xa0-0x100(%rcx) | |
1498 | vmovdqa $xb3,0xb0-0x100(%rcx) | |
1499 | ||
1500 | vpshufd \$0x00,$xt3,$xt0 # "$xc0" | |
1501 | vpshufd \$0x55,$xt3,$xt1 # "$xc1" | |
1502 | vmovdqa $xt0,0xc0-0x100(%rcx) | |
1503 | vpshufd \$0xaa,$xt3,$xt2 # "$xc2" | |
1504 | vmovdqa $xt1,0xd0-0x100(%rcx) | |
1505 | vpshufd \$0xff,$xt3,$xt3 # "$xc3" | |
1506 | vmovdqa $xt2,0xe0-0x100(%rcx) | |
1507 | vmovdqa $xt3,0xf0-0x100(%rcx) | |
1508 | ||
1509 | vpshufd \$0x00,$xd3,$xd0 | |
1510 | vpshufd \$0x55,$xd3,$xd1 | |
1511 | vpaddd .Linc(%rip),$xd0,$xd0 # don't save counters yet | |
1512 | vpshufd \$0xaa,$xd3,$xd2 | |
1513 | vmovdqa $xd1,0x110-0x100(%rcx) | |
1514 | vpshufd \$0xff,$xd3,$xd3 | |
1515 | vmovdqa $xd2,0x120-0x100(%rcx) | |
1516 | vmovdqa $xd3,0x130-0x100(%rcx) | |
1517 | ||
1518 | jmp .Loop_enter4xop | |
1519 | ||
1520 | .align 32 | |
1521 | .Loop_outer4xop: | |
1522 | vmovdqa 0x40(%rsp),$xa0 # re-load smashed key | |
1523 | vmovdqa 0x50(%rsp),$xa1 | |
1524 | vmovdqa 0x60(%rsp),$xa2 | |
1525 | vmovdqa 0x70(%rsp),$xa3 | |
1526 | vmovdqa 0x80-0x100(%rcx),$xb0 | |
1527 | vmovdqa 0x90-0x100(%rcx),$xb1 | |
1528 | vmovdqa 0xa0-0x100(%rcx),$xb2 | |
1529 | vmovdqa 0xb0-0x100(%rcx),$xb3 | |
1530 | vmovdqa 0xc0-0x100(%rcx),$xt0 # "$xc0" | |
1531 | vmovdqa 0xd0-0x100(%rcx),$xt1 # "$xc1" | |
1532 | vmovdqa 0xe0-0x100(%rcx),$xt2 # "$xc2" | |
1533 | vmovdqa 0xf0-0x100(%rcx),$xt3 # "$xc3" | |
1534 | vmovdqa 0x100-0x100(%rcx),$xd0 | |
1535 | vmovdqa 0x110-0x100(%rcx),$xd1 | |
1536 | vmovdqa 0x120-0x100(%rcx),$xd2 | |
1537 | vmovdqa 0x130-0x100(%rcx),$xd3 | |
1538 | vpaddd .Lfour(%rip),$xd0,$xd0 # next SIMD counters | |
1539 | ||
1540 | .Loop_enter4xop: | |
1541 | mov \$10,%eax | |
1542 | vmovdqa $xd0,0x100-0x100(%rcx) # save SIMD counters | |
1543 | jmp .Loop4xop | |
1544 | ||
1545 | .align 32 | |
1546 | .Loop4xop: | |
1547 | ___ | |
# Body of .Loop4xop: one round over state indices (0,4,8,12) followed by one
# over (0,5,10,15); with %eax counting down from 10 that is 20 rounds.  Each
# string returned by XOP_lane_ROUND is eval'ed as a Perl call.
# NOTE(review): the calls are presumably resolved by an AUTOLOAD defined
# earlier in the file that appends the instruction to $code - confirm.
1548 | foreach (&XOP_lane_ROUND(0, 4, 8,12)) { eval; } | |
1549 | foreach (&XOP_lane_ROUND(0, 5,10,15)) { eval; } | |
# Close the round loop and add the saved input 'a' rows (0x40..0x70) back in.
# c2/c3 ($xt2/$xt3) are parked at 0x20/0x30(%rsp) so those two registers can
# serve as scratch for the three-operand 4x4 dword transpose that follows.
1550 | $code.=<<___; | |
1551 | dec %eax | |
1552 | jnz .Loop4xop | |
1553 | ||
1554 | vpaddd 0x40(%rsp),$xa0,$xa0 # accumulate key material | |
1555 | vpaddd 0x50(%rsp),$xa1,$xa1 | |
1556 | vpaddd 0x60(%rsp),$xa2,$xa2 | |
1557 | vpaddd 0x70(%rsp),$xa3,$xa3 | |
1558 | ||
1559 | vmovdqa $xt2,0x20(%rsp) # offload $xc2,3 | |
1560 | vmovdqa $xt3,0x30(%rsp) | |
1561 | ||
1562 | vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data | |
1563 | vpunpckldq $xa3,$xa2,$xt3 | |
1564 | vpunpckhdq $xa1,$xa0,$xa0 | |
1565 | vpunpckhdq $xa3,$xa2,$xa2 | |
1566 | vpunpcklqdq $xt3,$xt2,$xa1 # "a0" | |
1567 | vpunpckhqdq $xt3,$xt2,$xt2 # "a1" | |
1568 | vpunpcklqdq $xa2,$xa0,$xa3 # "a2" | |
1569 | vpunpckhqdq $xa2,$xa0,$xa0 # "a3" | |
1570 | ___ | |
# Per the inline labels the transpose wrote a0,a1,a2,a3 into the registers
# named $xa1,$xt2,$xa3,$xa0.  Rebind the Perl names so that $xaN denotes the
# register holding aN again; the register formerly called $xa2 becomes the
# scratch register $xt2.  No instruction is emitted for this.
1571 | ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); | |
# 'b' rows: add the saved input rows (0x80..0xb0) and transpose.  Before that,
# a0/a1 are spilled to 0x00/0x10(%rsp) and their registers are reloaded with
# the "xc2"/"xc3" values parked at 0x20/0x30(%rsp).
1572 | $code.=<<___; | |
1573 | vpaddd 0x80-0x100(%rcx),$xb0,$xb0 | |
1574 | vpaddd 0x90-0x100(%rcx),$xb1,$xb1 | |
1575 | vpaddd 0xa0-0x100(%rcx),$xb2,$xb2 | |
1576 | vpaddd 0xb0-0x100(%rcx),$xb3,$xb3 | |
1577 | ||
1578 | vmovdqa $xa0,0x00(%rsp) # offload $xa0,1 | |
1579 | vmovdqa $xa1,0x10(%rsp) | |
1580 | vmovdqa 0x20(%rsp),$xa0 # "xc2" | |
1581 | vmovdqa 0x30(%rsp),$xa1 # "xc3" | |
1582 | ||
1583 | vpunpckldq $xb1,$xb0,$xt2 | |
1584 | vpunpckldq $xb3,$xb2,$xt3 | |
1585 | vpunpckhdq $xb1,$xb0,$xb0 | |
1586 | vpunpckhdq $xb3,$xb2,$xb2 | |
1587 | vpunpcklqdq $xt3,$xt2,$xb1 # "b0" | |
1588 | vpunpckhqdq $xt3,$xt2,$xt2 # "b1" | |
1589 | vpunpcklqdq $xb2,$xb0,$xb3 # "b2" | |
1590 | vpunpckhqdq $xb2,$xb0,$xb0 # "b3" | |
1591 | ___ | |
# Rebind the names as for the 'a' rows: b0..b3 landed in the registers named
# $xb1,$xt2,$xb3,$xb0, and the old $xb2 register becomes scratch $xt2.
1592 | ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); | |
# Names for the 'c' rows: c0/c1 are still in $xt0/$xt1, c2/c3 were just loaded
# into the registers named $xa0/$xa1.
1593 | my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1); | |
# 'c' rows: add the saved input rows (0xc0..0xf0) and transpose.
1594 | $code.=<<___; | |
1595 | vpaddd 0xc0-0x100(%rcx),$xc0,$xc0 | |
1596 | vpaddd 0xd0-0x100(%rcx),$xc1,$xc1 | |
1597 | vpaddd 0xe0-0x100(%rcx),$xc2,$xc2 | |
1598 | vpaddd 0xf0-0x100(%rcx),$xc3,$xc3 | |
1599 | ||
1600 | vpunpckldq $xc1,$xc0,$xt2 | |
1601 | vpunpckldq $xc3,$xc2,$xt3 | |
1602 | vpunpckhdq $xc1,$xc0,$xc0 | |
1603 | vpunpckhdq $xc3,$xc2,$xc2 | |
1604 | vpunpcklqdq $xt3,$xt2,$xc1 # "c0" | |
1605 | vpunpckhqdq $xt3,$xt2,$xt2 # "c1" | |
1606 | vpunpcklqdq $xc2,$xc0,$xc3 # "c2" | |
1607 | vpunpckhqdq $xc2,$xc0,$xc0 # "c3" | |
1608 | ___ | |
# Rebind the names: c0..c3 landed in the registers named $xc1,$xt2,$xc3,$xc0,
# and the old $xc2 register becomes scratch $xt2.
1609 | ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); | |
# 'd' rows: add the saved counter/nonce lanes (0x100..0x130) and transpose.
1610 | $code.=<<___; | |
1611 | vpaddd 0x100-0x100(%rcx),$xd0,$xd0 | |
1612 | vpaddd 0x110-0x100(%rcx),$xd1,$xd1 | |
1613 | vpaddd 0x120-0x100(%rcx),$xd2,$xd2 | |
1614 | vpaddd 0x130-0x100(%rcx),$xd3,$xd3 | |
1615 | ||
1616 | vpunpckldq $xd1,$xd0,$xt2 | |
1617 | vpunpckldq $xd3,$xd2,$xt3 | |
1618 | vpunpckhdq $xd1,$xd0,$xd0 | |
1619 | vpunpckhdq $xd3,$xd2,$xd2 | |
1620 | vpunpcklqdq $xt3,$xt2,$xd1 # "d0" | |
1621 | vpunpckhqdq $xt3,$xt2,$xt2 # "d1" | |
1622 | vpunpcklqdq $xd2,$xd0,$xd3 # "d2" | |
1623 | vpunpckhqdq $xd2,$xd0,$xd0 # "d3" | |
1624 | ___ | |
# Rebind the names: d0..d3 landed in the registers named $xd1,$xt2,$xd3,$xd0,
# and the old $xd2 register becomes scratch $xt2.
1625 | ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); | |
# a0/a1 were spilled to 0x00/0x10(%rsp) earlier.  The two scratch registers
# are free now, so they are renamed $xa0/$xa1; the next chunk of assembly
# reloads a0/a1 into them.
1626 | ($xa0,$xa1)=($xt2,$xt3); | |
# Output stage.  a0/a1 are reloaded from the stack, so all sixteen rows of
# key stream are in registers.
#  - With at least 64*4 bytes left: XOR the input into the key-stream registers
#    (vpxor with a memory source), store the results, advance $inp/$out by
#    0x100 and loop to .Loop_outer4xop while $len is non-zero.
#  - Otherwise (.Ltail4xop): process as many whole 64-byte blocks as fit
#    (0..3), store the next block of key stream at 0x00..0x3f(%rsp) and finish
#    byte by byte in .Loop_tail4xop.
# Notes on the tail paths:
#  - Each "je .Ldone4xop" consumes the flags of the cmp that selected the path
#    (len == 64/128/192); the vector xors and stores in between leave the
#    flags untouched.
#  - %r10 is the byte index and %ecx a scratch byte in the tail loop.  %rcx
#    (frame base +0x100) is not needed again because the tail never returns
#    to the round loop.
#  - vzeroupper is issued again at .Ldone4xop before the epilogue.
1627 | $code.=<<___; | |
1628 | vmovdqa 0x00(%rsp),$xa0 # restore $xa0,1 | |
1629 | vmovdqa 0x10(%rsp),$xa1 | |
1630 | ||
1631 | cmp \$64*4,$len | |
1632 | jb .Ltail4xop | |
1633 | ||
1634 | vpxor 0x00($inp),$xa0,$xa0 # xor with input | |
1635 | vpxor 0x10($inp),$xb0,$xb0 | |
1636 | vpxor 0x20($inp),$xc0,$xc0 | |
1637 | vpxor 0x30($inp),$xd0,$xd0 | |
1638 | vpxor 0x40($inp),$xa1,$xa1 | |
1639 | vpxor 0x50($inp),$xb1,$xb1 | |
1640 | vpxor 0x60($inp),$xc1,$xc1 | |
1641 | vpxor 0x70($inp),$xd1,$xd1 | |
1642 | lea 0x80($inp),$inp # size optimization | |
1643 | vpxor 0x00($inp),$xa2,$xa2 | |
1644 | vpxor 0x10($inp),$xb2,$xb2 | |
1645 | vpxor 0x20($inp),$xc2,$xc2 | |
1646 | vpxor 0x30($inp),$xd2,$xd2 | |
1647 | vpxor 0x40($inp),$xa3,$xa3 | |
1648 | vpxor 0x50($inp),$xb3,$xb3 | |
1649 | vpxor 0x60($inp),$xc3,$xc3 | |
1650 | vpxor 0x70($inp),$xd3,$xd3 | |
1651 | lea 0x80($inp),$inp # inp+=64*4 | |
1652 | ||
1653 | vmovdqu $xa0,0x00($out) | |
1654 | vmovdqu $xb0,0x10($out) | |
1655 | vmovdqu $xc0,0x20($out) | |
1656 | vmovdqu $xd0,0x30($out) | |
1657 | vmovdqu $xa1,0x40($out) | |
1658 | vmovdqu $xb1,0x50($out) | |
1659 | vmovdqu $xc1,0x60($out) | |
1660 | vmovdqu $xd1,0x70($out) | |
1661 | lea 0x80($out),$out # size optimization | |
1662 | vmovdqu $xa2,0x00($out) | |
1663 | vmovdqu $xb2,0x10($out) | |
1664 | vmovdqu $xc2,0x20($out) | |
1665 | vmovdqu $xd2,0x30($out) | |
1666 | vmovdqu $xa3,0x40($out) | |
1667 | vmovdqu $xb3,0x50($out) | |
1668 | vmovdqu $xc3,0x60($out) | |
1669 | vmovdqu $xd3,0x70($out) | |
1670 | lea 0x80($out),$out # out+=64*4 | |
1671 | ||
1672 | sub \$64*4,$len | |
1673 | jnz .Loop_outer4xop | |
1674 | ||
1675 | jmp .Ldone4xop | |
1676 | ||
1677 | .align 32 | |
1678 | .Ltail4xop: | |
1679 | cmp \$192,$len | |
1680 | jae .L192_or_more4xop | |
1681 | cmp \$128,$len | |
1682 | jae .L128_or_more4xop | |
1683 | cmp \$64,$len | |
1684 | jae .L64_or_more4xop | |
1685 | ||
1686 | xor %r10,%r10 | |
1687 | vmovdqa $xa0,0x00(%rsp) | |
1688 | vmovdqa $xb0,0x10(%rsp) | |
1689 | vmovdqa $xc0,0x20(%rsp) | |
1690 | vmovdqa $xd0,0x30(%rsp) | |
1691 | jmp .Loop_tail4xop | |
1692 | ||
1693 | .align 32 | |
1694 | .L64_or_more4xop: | |
1695 | vpxor 0x00($inp),$xa0,$xa0 # xor with input | |
1696 | vpxor 0x10($inp),$xb0,$xb0 | |
1697 | vpxor 0x20($inp),$xc0,$xc0 | |
1698 | vpxor 0x30($inp),$xd0,$xd0 | |
1699 | vmovdqu $xa0,0x00($out) | |
1700 | vmovdqu $xb0,0x10($out) | |
1701 | vmovdqu $xc0,0x20($out) | |
1702 | vmovdqu $xd0,0x30($out) | |
1703 | je .Ldone4xop | |
1704 | ||
1705 | lea 0x40($inp),$inp # inp+=64*1 | |
1706 | vmovdqa $xa1,0x00(%rsp) | |
1707 | xor %r10,%r10 | |
1708 | vmovdqa $xb1,0x10(%rsp) | |
1709 | lea 0x40($out),$out # out+=64*1 | |
1710 | vmovdqa $xc1,0x20(%rsp) | |
1711 | sub \$64,$len # len-=64*1 | |
1712 | vmovdqa $xd1,0x30(%rsp) | |
1713 | jmp .Loop_tail4xop | |
1714 | ||
1715 | .align 32 | |
1716 | .L128_or_more4xop: | |
1717 | vpxor 0x00($inp),$xa0,$xa0 # xor with input | |
1718 | vpxor 0x10($inp),$xb0,$xb0 | |
1719 | vpxor 0x20($inp),$xc0,$xc0 | |
1720 | vpxor 0x30($inp),$xd0,$xd0 | |
1721 | vpxor 0x40($inp),$xa1,$xa1 | |
1722 | vpxor 0x50($inp),$xb1,$xb1 | |
1723 | vpxor 0x60($inp),$xc1,$xc1 | |
1724 | vpxor 0x70($inp),$xd1,$xd1 | |
1725 | ||
1726 | vmovdqu $xa0,0x00($out) | |
1727 | vmovdqu $xb0,0x10($out) | |
1728 | vmovdqu $xc0,0x20($out) | |
1729 | vmovdqu $xd0,0x30($out) | |
1730 | vmovdqu $xa1,0x40($out) | |
1731 | vmovdqu $xb1,0x50($out) | |
1732 | vmovdqu $xc1,0x60($out) | |
1733 | vmovdqu $xd1,0x70($out) | |
1734 | je .Ldone4xop | |
1735 | ||
1736 | lea 0x80($inp),$inp # inp+=64*2 | |
1737 | vmovdqa $xa2,0x00(%rsp) | |
1738 | xor %r10,%r10 | |
1739 | vmovdqa $xb2,0x10(%rsp) | |
1740 | lea 0x80($out),$out # out+=64*2 | |
1741 | vmovdqa $xc2,0x20(%rsp) | |
1742 | sub \$128,$len # len-=64*2 | |
1743 | vmovdqa $xd2,0x30(%rsp) | |
1744 | jmp .Loop_tail4xop | |
1745 | ||
1746 | .align 32 | |
1747 | .L192_or_more4xop: | |
1748 | vpxor 0x00($inp),$xa0,$xa0 # xor with input | |
1749 | vpxor 0x10($inp),$xb0,$xb0 | |
1750 | vpxor 0x20($inp),$xc0,$xc0 | |
1751 | vpxor 0x30($inp),$xd0,$xd0 | |
1752 | vpxor 0x40($inp),$xa1,$xa1 | |
1753 | vpxor 0x50($inp),$xb1,$xb1 | |
1754 | vpxor 0x60($inp),$xc1,$xc1 | |
1755 | vpxor 0x70($inp),$xd1,$xd1 | |
1756 | lea 0x80($inp),$inp # size optimization | |
1757 | vpxor 0x00($inp),$xa2,$xa2 | |
1758 | vpxor 0x10($inp),$xb2,$xb2 | |
1759 | vpxor 0x20($inp),$xc2,$xc2 | |
1760 | vpxor 0x30($inp),$xd2,$xd2 | |
1761 | ||
1762 | vmovdqu $xa0,0x00($out) | |
1763 | vmovdqu $xb0,0x10($out) | |
1764 | vmovdqu $xc0,0x20($out) | |
1765 | vmovdqu $xd0,0x30($out) | |
1766 | vmovdqu $xa1,0x40($out) | |
1767 | vmovdqu $xb1,0x50($out) | |
1768 | vmovdqu $xc1,0x60($out) | |
1769 | vmovdqu $xd1,0x70($out) | |
1770 | lea 0x80($out),$out # size optimization | |
1771 | vmovdqu $xa2,0x00($out) | |
1772 | vmovdqu $xb2,0x10($out) | |
1773 | vmovdqu $xc2,0x20($out) | |
1774 | vmovdqu $xd2,0x30($out) | |
1775 | je .Ldone4xop | |
1776 | ||
1777 | lea 0x40($inp),$inp # inp+=64*3 | |
f2188228 | 1778 | vmovdqa $xa3,0x00(%rsp) |
a98c648e | 1779 | xor %r10,%r10 |
f2188228 | 1780 | vmovdqa $xb3,0x10(%rsp) |
a98c648e | 1781 | lea 0x40($out),$out # out+=64*3 |
f2188228 | 1782 | vmovdqa $xc3,0x20(%rsp) |
a98c648e | 1783 | sub \$192,$len # len-=64*3 |
f2188228 | 1784 | vmovdqa $xd3,0x30(%rsp) |
a98c648e AP |
1785 | |
1786 | .Loop_tail4xop: | |
1787 | movzb ($inp,%r10),%eax | |
1788 | movzb (%rsp,%r10),%ecx | |
1789 | lea 1(%r10),%r10 | |
1790 | xor %ecx,%eax | |
1791 | mov %al,-1($out,%r10) | |
1792 | dec $len | |
1793 | jnz .Loop_tail4xop | |
1794 | ||
1795 | .Ldone4xop: | |
1796 | vzeroupper | |
1797 | ___ | |
# Win64 only: reload %xmm6-%xmm15 from the slots written in the prologue.
1798 | $code.=<<___ if ($win64); | |
384e6de4 AP |
1799 | movaps -0xa8(%r9),%xmm6 | |
1800 | movaps -0x98(%r9),%xmm7 | |
1801 | movaps -0x88(%r9),%xmm8 | |
1802 | movaps -0x78(%r9),%xmm9 | |
1803 | movaps -0x68(%r9),%xmm10 | |
1804 | movaps -0x58(%r9),%xmm11 | |
1805 | movaps -0x48(%r9),%xmm12 | |
1806 | movaps -0x38(%r9),%xmm13 | |
1807 | movaps -0x28(%r9),%xmm14 | |
1808 | movaps -0x18(%r9),%xmm15 | |
a98c648e AP |
1809 | ___ | |
# Common epilogue: restore %rsp from the frame pointer saved in %r9 at entry,
# switch the CFA register back to %rsp for the unwinder, and return.
1810 | $code.=<<___; | |
384e6de4 | 1811 | lea (%r9),%rsp |
f17652e5 | 1812 | .cfi_def_cfa_register %rsp |
384e6de4 | 1813 | .L4xop_epilogue: |
a98c648e | 1814 | ret |
f17652e5 | 1815 | .cfi_endproc |
a98c648e AP |
1816 | .size ChaCha20_4xop,.-ChaCha20_4xop | |
1817 | ___ | |
1818 | } | |
1819 | ||
1820 | ######################################################################## | |
1821 | # AVX2 code path | |
1822 | if ($avx>1) { | |
1823 | my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3, | |
1824 | $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%ymm$_",(0..15)); | |
1825 | my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, | |
1826 | "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3); | |
1827 | ||
# AVX2_lane_ROUND(a0,b0,c0,d0) builds, but does not itself emit, one full
# round (quarter-rounds Q1..Q4) for the 8-lane AVX2 path.
#   Arguments: the state word indices (0..15) used by the first quarter-round.
#   Returns:   a list of strings, each a perlasm instruction call; the caller
#              is expected to eval them in order to append the code.
# On entry $t1 must already hold the shuffle mask loaded from (%r10), since
# the first vpshufb uses it before any load in this routine; each half of
# the round reloads it at its end.
1828 | sub AVX2_lane_ROUND { |
1829 | my ($a0,$b0,$c0,$d0)=@_; |
# Indices for quarter-rounds 2..4: keep the row (bits above the low two) and
# advance the low two bits modulo 4.
1830 | my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); |
1831 | my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); |
1832 | my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); |
# $xc/$xc_ are the registers holding the pair of 'c' words currently in use;
# $t0/$t1 are scratch. All names are wrapped in quotes so they survive as
# string arguments inside the generated instruction calls.
1833 | my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3); |
1834 | my @x=map("\"$_\"",@xx); |
1835 | ||
1836 | # Consider order in which variables are addressed by their |
1837 | # index: |
1838 | # |
1839 | # a b c d |
1840 | # |
1841 | # 0 4 8 12 < even round |
1842 | # 1 5 9 13 |
1843 | # 2 6 10 14 |
1844 | # 3 7 11 15 |
1845 | # 0 5 10 15 < odd round |
1846 | # 1 6 11 12 |
1847 | # 2 7 8 13 |
1848 | # 3 4 9 14 |
1849 | # |
1850 | # 'a', 'b' and 'd's are permanently allocated in registers, |
1851 | # @x[0..7,12..15], while 'c's are maintained in memory. If |
1852 | # you observe 'c' column, you'll notice that pair of 'c's is |
1853 | # invariant between rounds. This means that we have to reload |
1854 | # them once per round, in the middle. This is why you'll see |
1855 | # bunch of 'c' stores and loads in the middle, but none in |
1856 | # the beginning or end. |
1857 | ||
# Q1 and Q2 are interleaved, then Q3 and Q4. The two byte-granular rotations
# of 'd' are done with vpshufb and the masks annotated .Lrot16/.Lrot24; the
# 12- and 7-bit rotations of 'b' are done with vpslld/vpsrld/vpor.
1858 | ( |
1859 | "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1 |
1860 | "&vpxor (@x[$d0],@x[$a0],@x[$d0])", |
1861 | "&vpshufb (@x[$d0],@x[$d0],$t1)", |
1862 | "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2 |
1863 | "&vpxor (@x[$d1],@x[$a1],@x[$d1])", |
1864 | "&vpshufb (@x[$d1],@x[$d1],$t1)", |
1865 | ||
1866 | "&vpaddd ($xc,$xc,@x[$d0])", |
1867 | "&vpxor (@x[$b0],$xc,@x[$b0])", |
1868 | "&vpslld ($t0,@x[$b0],12)", |
1869 | "&vpsrld (@x[$b0],@x[$b0],20)", |
1870 | "&vpor (@x[$b0],$t0,@x[$b0])", |
1871 | "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip) |
1872 | "&vpaddd ($xc_,$xc_,@x[$d1])", |
1873 | "&vpxor (@x[$b1],$xc_,@x[$b1])", |
1874 | "&vpslld ($t1,@x[$b1],12)", |
1875 | "&vpsrld (@x[$b1],@x[$b1],20)", |
1876 | "&vpor (@x[$b1],$t1,@x[$b1])", |
1877 | ||
1878 | "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", |
1879 | "&vpxor (@x[$d0],@x[$a0],@x[$d0])", |
1880 | "&vpshufb (@x[$d0],@x[$d0],$t0)", |
1881 | "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", |
1882 | "&vpxor (@x[$d1],@x[$a1],@x[$d1])", |
1883 | "&vpshufb (@x[$d1],@x[$d1],$t0)", |
1884 | ||
1885 | "&vpaddd ($xc,$xc,@x[$d0])", |
1886 | "&vpxor (@x[$b0],$xc,@x[$b0])", |
1887 | "&vpslld ($t1,@x[$b0],7)", |
1888 | "&vpsrld (@x[$b0],@x[$b0],25)", |
1889 | "&vpor (@x[$b0],$t1,@x[$b0])", |
1890 | "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip) |
1891 | "&vpaddd ($xc_,$xc_,@x[$d1])", |
1892 | "&vpxor (@x[$b1],$xc_,@x[$b1])", |
1893 | "&vpslld ($t0,@x[$b1],7)", |
1894 | "&vpsrld (@x[$b1],@x[$b1],25)", |
1895 | "&vpor (@x[$b1],$t0,@x[$b1])", |
1896 | ||
# Mid-round swap: spill the 'c' pair used by Q1/Q2 to its stack slots
# (32 bytes per word, slot index = state index - 8) and load the pair
# needed by Q3/Q4.
1897 | "&vmovdqa (\"`32*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's |
1898 | "&vmovdqa (\"`32*($c1-8)`(%rsp)\",$xc_)", |
1899 | "&vmovdqa ($xc,\"`32*($c2-8)`(%rsp)\")", |
1900 | "&vmovdqa ($xc_,\"`32*($c3-8)`(%rsp)\")", |
1901 | ||
1902 | "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3 |
1903 | "&vpxor (@x[$d2],@x[$a2],@x[$d2])", |
1904 | "&vpshufb (@x[$d2],@x[$d2],$t1)", |
1905 | "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4 |
1906 | "&vpxor (@x[$d3],@x[$a3],@x[$d3])", |
1907 | "&vpshufb (@x[$d3],@x[$d3],$t1)", |
1908 | ||
1909 | "&vpaddd ($xc,$xc,@x[$d2])", |
1910 | "&vpxor (@x[$b2],$xc,@x[$b2])", |
1911 | "&vpslld ($t0,@x[$b2],12)", |
1912 | "&vpsrld (@x[$b2],@x[$b2],20)", |
1913 | "&vpor (@x[$b2],$t0,@x[$b2])", |
1914 | "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip) |
1915 | "&vpaddd ($xc_,$xc_,@x[$d3])", |
1916 | "&vpxor (@x[$b3],$xc_,@x[$b3])", |
1917 | "&vpslld ($t1,@x[$b3],12)", |
1918 | "&vpsrld (@x[$b3],@x[$b3],20)", |
1919 | "&vpor (@x[$b3],$t1,@x[$b3])", |
1920 | ||
1921 | "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", |
1922 | "&vpxor (@x[$d2],@x[$a2],@x[$d2])", |
1923 | "&vpshufb (@x[$d2],@x[$d2],$t0)", |
1924 | "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", |
1925 | "&vpxor (@x[$d3],@x[$a3],@x[$d3])", |
1926 | "&vpshufb (@x[$d3],@x[$d3],$t0)", |
1927 | ||
1928 | "&vpaddd ($xc,$xc,@x[$d2])", |
1929 | "&vpxor (@x[$b2],$xc,@x[$b2])", |
1930 | "&vpslld ($t1,@x[$b2],7)", |
1931 | "&vpsrld (@x[$b2],@x[$b2],25)", |
1932 | "&vpor (@x[$b2],$t1,@x[$b2])", |
1933 | "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip) |
1934 | "&vpaddd ($xc_,$xc_,@x[$d3])", |
1935 | "&vpxor (@x[$b3],$xc_,@x[$b3])", |
1936 | "&vpslld ($t0,@x[$b3],7)", |
1937 | "&vpsrld (@x[$b3],@x[$b3],25)", |
1938 | "&vpor (@x[$b3],$t0,@x[$b3])" |
1939 | ); |
1940 | } |
1941 | ||
# Start of ChaCha20_8x. $xframe is the stack space reserved in addition to
# the 0x280-byte work area: 0xa8 bytes on Win64, where %xmm6-%xmm15 are
# spilled at negative offsets from the frame register %r9, otherwise 8.
384e6de4 | 1942 | my $xframe = $win64 ? 0xa8 : 8; |
a98c648e AP |
1943 | |
1944 | $code.=<<___; |
1945 | .type ChaCha20_8x,\@function,5 |
1946 | .align 32 |
1947 | ChaCha20_8x: |
f17652e5 | 1948 | .cfi_startproc |
a98c648e | 1949 | .LChaCha20_8x: |
384e6de4 | 1950 | mov %rsp,%r9 # frame register |
f17652e5 | 1951 | .cfi_def_cfa_register %r9 |
a98c648e AP |
1952 | sub \$0x280+$xframe,%rsp |
1953 | and \$-32,%rsp |
1954 | ___ |
# Win64 only: save %xmm6-%xmm15 below the frame register; they are restored
# from the same offsets before the function returns.
1955 | $code.=<<___ if ($win64); |
384e6de4 AP |
1956 | movaps %xmm6,-0xa8(%r9) |
1957 | movaps %xmm7,-0x98(%r9) |
1958 | movaps %xmm8,-0x88(%r9) |
1959 | movaps %xmm9,-0x78(%r9) |
1960 | movaps %xmm10,-0x68(%r9) |
1961 | movaps %xmm11,-0x58(%r9) |
1962 | movaps %xmm12,-0x48(%r9) |
1963 | movaps %xmm13,-0x38(%r9) |
1964 | movaps %xmm14,-0x28(%r9) |
1965 | movaps %xmm15,-0x18(%r9) |
1966 | .L8x_body: |
a98c648e AP |
1967 | ___ |
1968 | $code.=<<___; |
1969 | vzeroupper |
a98c648e AP |
1970 | |
1971 | ################ stack layout | |
1972 | # +0x00 SIMD equivalent of @x[8-11] |
1973 | # ... | |
1974 | # +0x80 constant copy of key[0-2] smashed by lanes | |
1975 | # ... | |
1976 | # +0x200 SIMD counters (with nonce smashed by lanes) | |
1977 | # ... | |
384e6de4 | 1978 | # +0x280 |
a98c648e AP |
1979 | |
1980 | vbroadcasti128 .Lsigma(%rip),$xa3 # key[0] |
1981 | vbroadcasti128 ($key),$xb3 # key[1] |
1982 | vbroadcasti128 16($key),$xt3 # key[2] |
1983 | vbroadcasti128 ($counter),$xd3 # key[3] |
1984 | lea 0x100(%rsp),%rcx # size optimization |
1985 | lea 0x200(%rsp),%rax # size optimization |
1986 | lea .Lrot16(%rip),%r10 |
1987 | lea .Lrot24(%rip),%r11 |
1988 | ||
1989 | vpshufd \$0x00,$xa3,$xa0 # smash key by lanes... |
1990 | vpshufd \$0x55,$xa3,$xa1 |
1991 | vmovdqa $xa0,0x80-0x100(%rcx) # ... and offload |
1992 | vpshufd \$0xaa,$xa3,$xa2 |
1993 | vmovdqa $xa1,0xa0-0x100(%rcx) |
1994 | vpshufd \$0xff,$xa3,$xa3 |
1995 | vmovdqa $xa2,0xc0-0x100(%rcx) |
1996 | vmovdqa $xa3,0xe0-0x100(%rcx) |
1997 | ||
1998 | vpshufd \$0x00,$xb3,$xb0 |
1999 | vpshufd \$0x55,$xb3,$xb1 |
2000 | vmovdqa $xb0,0x100-0x100(%rcx) |
2001 | vpshufd \$0xaa,$xb3,$xb2 |
2002 | vmovdqa $xb1,0x120-0x100(%rcx) |
2003 | vpshufd \$0xff,$xb3,$xb3 |
2004 | vmovdqa $xb2,0x140-0x100(%rcx) |
2005 | vmovdqa $xb3,0x160-0x100(%rcx) |
2006 | ||
2007 | vpshufd \$0x00,$xt3,$xt0 # "xc0" |
2008 | vpshufd \$0x55,$xt3,$xt1 # "xc1" |
2009 | vmovdqa $xt0,0x180-0x200(%rax) |
2010 | vpshufd \$0xaa,$xt3,$xt2 # "xc2" |
2011 | vmovdqa $xt1,0x1a0-0x200(%rax) |
2012 | vpshufd \$0xff,$xt3,$xt3 # "xc3" |
2013 | vmovdqa $xt2,0x1c0-0x200(%rax) |
2014 | vmovdqa $xt3,0x1e0-0x200(%rax) |
2015 | ||
2016 | vpshufd \$0x00,$xd3,$xd0 |
2017 | vpshufd \$0x55,$xd3,$xd1 |
2018 | vpaddd .Lincy(%rip),$xd0,$xd0 # don't save counters yet |
2019 | vpshufd \$0xaa,$xd3,$xd2 |
2020 | vmovdqa $xd1,0x220-0x200(%rax) |
2021 | vpshufd \$0xff,$xd3,$xd3 |
2022 | vmovdqa $xd2,0x240-0x200(%rax) |
2023 | vmovdqa $xd3,0x260-0x200(%rax) |
2024 | ||
2025 | jmp .Loop_enter8x |
2026 | ||
2027 | .align 32 |
2028 | .Loop_outer8x: |
2029 | vmovdqa 0x80-0x100(%rcx),$xa0 # re-load smashed key |
2030 | vmovdqa 0xa0-0x100(%rcx),$xa1 |
2031 | vmovdqa 0xc0-0x100(%rcx),$xa2 |
2032 | vmovdqa 0xe0-0x100(%rcx),$xa3 |
2033 | vmovdqa 0x100-0x100(%rcx),$xb0 |
2034 | vmovdqa 0x120-0x100(%rcx),$xb1 |
2035 | vmovdqa 0x140-0x100(%rcx),$xb2 |
2036 | vmovdqa 0x160-0x100(%rcx),$xb3 |
2037 | vmovdqa 0x180-0x200(%rax),$xt0 # "xc0" |
2038 | vmovdqa 0x1a0-0x200(%rax),$xt1 # "xc1" |
2039 | vmovdqa 0x1c0-0x200(%rax),$xt2 # "xc2" |
2040 | vmovdqa 0x1e0-0x200(%rax),$xt3 # "xc3" |
2041 | vmovdqa 0x200-0x200(%rax),$xd0 |
2042 | vmovdqa 0x220-0x200(%rax),$xd1 |
2043 | vmovdqa 0x240-0x200(%rax),$xd2 |
2044 | vmovdqa 0x260-0x200(%rax),$xd3 |
2045 | vpaddd .Leight(%rip),$xd0,$xd0 # next SIMD counters |
2046 | ||
2047 | .Loop_enter8x: |
2048 | vmovdqa $xt2,0x40(%rsp) # SIMD equivalent of "@x[10]" |
2049 | vmovdqa $xt3,0x60(%rsp) # SIMD equivalent of "@x[11]" |
2050 | vbroadcasti128 (%r10),$xt3 |
2051 | vmovdqa $xd0,0x200-0x200(%rax) # save SIMD counters |
2052 | mov \$10,%eax |
2053 | jmp .Loop8x |
2054 | ||
2055 | .align 32 |
2056 | .Loop8x: |
2057 | ___ |
# Body of .Loop8x: one even round followed by one odd round (index patterns
# as tabulated in AVX2_lane_ROUND). The loop counter in %eax was set to 10
# above, so the pair is executed ten times. Each returned string is eval'ed
# to append the corresponding instruction to the generated code.
2058 | foreach (&AVX2_lane_ROUND(0, 4, 8,12)) { eval; } |
2059 | foreach (&AVX2_lane_ROUND(0, 5,10,15)) { eval; } |
2060 | $code.=<<___; |
2061 | dec %eax |
2062 | jnz .Loop8x |
2063 | ||
2064 | lea 0x200(%rsp),%rax # size optimization |
2065 | vpaddd 0x80-0x100(%rcx),$xa0,$xa0 # accumulate key |
2066 | vpaddd 0xa0-0x100(%rcx),$xa1,$xa1 |
2067 | vpaddd 0xc0-0x100(%rcx),$xa2,$xa2 |
2068 | vpaddd 0xe0-0x100(%rcx),$xa3,$xa3 |
2069 | ||
2070 | vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data |
2071 | vpunpckldq $xa3,$xa2,$xt3 |
2072 | vpunpckhdq $xa1,$xa0,$xa0 |
2073 | vpunpckhdq $xa3,$xa2,$xa2 |
2074 | vpunpcklqdq $xt3,$xt2,$xa1 # "a0" |
2075 | vpunpckhqdq $xt3,$xt2,$xt2 # "a1" |
2076 | vpunpcklqdq $xa2,$xa0,$xa3 # "a2" |
2077 | vpunpckhqdq $xa2,$xa0,$xa0 # "a3" |
2078 | ___ |
# Generator-side renaming only (no instruction is emitted): make $xa0..$xa3
# name the registers that now hold "a0".."a3" per the annotations above, and
# let $xt2 name the register that was freed. The same pattern is repeated
# for the 'b', 'c' and 'd' groups below.
2079 | ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); |
2080 | $code.=<<___; |
2081 | vpaddd 0x100-0x100(%rcx),$xb0,$xb0 |
2082 | vpaddd 0x120-0x100(%rcx),$xb1,$xb1 |
2083 | vpaddd 0x140-0x100(%rcx),$xb2,$xb2 |
2084 | vpaddd 0x160-0x100(%rcx),$xb3,$xb3 |
2085 | ||
2086 | vpunpckldq $xb1,$xb0,$xt2 |
2087 | vpunpckldq $xb3,$xb2,$xt3 |
2088 | vpunpckhdq $xb1,$xb0,$xb0 |
2089 | vpunpckhdq $xb3,$xb2,$xb2 |
2090 | vpunpcklqdq $xt3,$xt2,$xb1 # "b0" |
2091 | vpunpckhqdq $xt3,$xt2,$xt2 # "b1" |
2092 | vpunpcklqdq $xb2,$xb0,$xb3 # "b2" |
2093 | vpunpckhqdq $xb2,$xb0,$xb0 # "b3" |
2094 | ___ |
2095 | ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); |
2096 | $code.=<<___; |
2097 | vperm2i128 \$0x20,$xb0,$xa0,$xt3 # "de-interlace" further |
2098 | vperm2i128 \$0x31,$xb0,$xa0,$xb0 |
2099 | vperm2i128 \$0x20,$xb1,$xa1,$xa0 |
2100 | vperm2i128 \$0x31,$xb1,$xa1,$xb1 |
2101 | vperm2i128 \$0x20,$xb2,$xa2,$xa1 |
2102 | vperm2i128 \$0x31,$xb2,$xa2,$xb2 |
2103 | vperm2i128 \$0x20,$xb3,$xa3,$xa2 |
2104 | vperm2i128 \$0x31,$xb3,$xa3,$xb3 |
2105 | ___ |
# Rename again after the 128-bit permutes. The 'c' group is then named:
# the first two 'c' vectors are still in $xt0/$xt1 (the pair left live by
# the last round), while the other two are reloaded from 0x40/0x60(%rsp)
# into the registers currently called $xa0/$xa1 -- which is why those two
# are first offloaded to 0x00/0x20(%rsp) by the code emitted next.
2106 | ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3); |
2107 | my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1); |
2108 | $code.=<<___; |
2109 | vmovdqa $xa0,0x00(%rsp) # offload $xaN |
2110 | vmovdqa $xa1,0x20(%rsp) |
2111 | vmovdqa 0x40(%rsp),$xc2 # $xa0 |
2112 | vmovdqa 0x60(%rsp),$xc3 # $xa1 |
2113 | ||
2114 | vpaddd 0x180-0x200(%rax),$xc0,$xc0 |
2115 | vpaddd 0x1a0-0x200(%rax),$xc1,$xc1 |
2116 | vpaddd 0x1c0-0x200(%rax),$xc2,$xc2 |
2117 | vpaddd 0x1e0-0x200(%rax),$xc3,$xc3 |
2118 | ||
2119 | vpunpckldq $xc1,$xc0,$xt2 |
2120 | vpunpckldq $xc3,$xc2,$xt3 |
2121 | vpunpckhdq $xc1,$xc0,$xc0 |
2122 | vpunpckhdq $xc3,$xc2,$xc2 |
2123 | vpunpcklqdq $xt3,$xt2,$xc1 # "c0" |
2124 | vpunpckhqdq $xt3,$xt2,$xt2 # "c1" |
2125 | vpunpcklqdq $xc2,$xc0,$xc3 # "c2" |
2126 | vpunpckhqdq $xc2,$xc0,$xc0 # "c3" |
2127 | ___ |
2128 | ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); |
2129 | $code.=<<___; |
2130 | vpaddd 0x200-0x200(%rax),$xd0,$xd0 |
2131 | vpaddd 0x220-0x200(%rax),$xd1,$xd1 |
2132 | vpaddd 0x240-0x200(%rax),$xd2,$xd2 |
2133 | vpaddd 0x260-0x200(%rax),$xd3,$xd3 |
2134 | ||
2135 | vpunpckldq $xd1,$xd0,$xt2 |
2136 | vpunpckldq $xd3,$xd2,$xt3 |
2137 | vpunpckhdq $xd1,$xd0,$xd0 |
2138 | vpunpckhdq $xd3,$xd2,$xd2 |
2139 | vpunpcklqdq $xt3,$xt2,$xd1 # "d0" |
2140 | vpunpckhqdq $xt3,$xt2,$xt2 # "d1" |
2141 | vpunpcklqdq $xd2,$xd0,$xd3 # "d2" |
2142 | vpunpckhqdq $xd2,$xd0,$xd0 # "d3" |
2143 | ___ |
2144 | ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); |
2145 | $code.=<<___; |
2146 | vperm2i128 \$0x20,$xd0,$xc0,$xt3 # "de-interlace" further |
2147 | vperm2i128 \$0x31,$xd0,$xc0,$xd0 |
2148 | vperm2i128 \$0x20,$xd1,$xc1,$xc0 |
2149 | vperm2i128 \$0x31,$xd1,$xc1,$xd1 |
2150 | vperm2i128 \$0x20,$xd2,$xc2,$xc1 |
2151 | vperm2i128 \$0x31,$xd2,$xc2,$xd2 |
2152 | vperm2i128 \$0x20,$xd3,$xc3,$xc2 |
2153 | vperm2i128 \$0x31,$xd3,$xc3,$xd3 |
2154 | ___ |
# Final renaming. The 'b' and 'c' names are exchanged so that, for each N,
# $xaN,$xbN,$xcN,$xdN are in the order in which the following code XORs them
# with consecutive 32-byte pieces of input. $xa0/$xa1 are re-pointed at the
# two scratch registers into which the offloaded values are reloaded next.
2155 | ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3); |
2156 | ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)= |
2157 | ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3); |
2158 | ($xa0,$xa1)=($xt2,$xt3); |
# Output stage of ChaCha20_8x. The two offloaded vectors are reloaded, then:
#  - with at least 64*8 bytes left, the whole 512-byte batch is XORed with
#    the input and stored, and the outer loop repeats while bytes remain;
#  - otherwise .Ltail8x picks the largest multiple of 64 bytes that fits,
#    XORs that much directly, copies the next 64 bytes of keystream to
#    0x00/0x20(%rsp) and .Loop_tail8x finishes the remainder byte by byte.
2159 | $code.=<<___; |
2160 | vmovdqa 0x00(%rsp),$xa0 # $xaN was offloaded, remember? |
2161 | vmovdqa 0x20(%rsp),$xa1 |
2162 | ||
2163 | cmp \$64*8,$len |
2164 | jb .Ltail8x |
2165 | ||
2166 | vpxor 0x00($inp),$xa0,$xa0 # xor with input |
2167 | vpxor 0x20($inp),$xb0,$xb0 |
2168 | vpxor 0x40($inp),$xc0,$xc0 |
2169 | vpxor 0x60($inp),$xd0,$xd0 |
2170 | lea 0x80($inp),$inp # size optimization |
2171 | vmovdqu $xa0,0x00($out) |
2172 | vmovdqu $xb0,0x20($out) |
2173 | vmovdqu $xc0,0x40($out) |
2174 | vmovdqu $xd0,0x60($out) |
2175 | lea 0x80($out),$out # size optimization |
2176 | ||
2177 | vpxor 0x00($inp),$xa1,$xa1 |
2178 | vpxor 0x20($inp),$xb1,$xb1 |
2179 | vpxor 0x40($inp),$xc1,$xc1 |
2180 | vpxor 0x60($inp),$xd1,$xd1 |
2181 | lea 0x80($inp),$inp # size optimization |
2182 | vmovdqu $xa1,0x00($out) |
2183 | vmovdqu $xb1,0x20($out) |
2184 | vmovdqu $xc1,0x40($out) |
2185 | vmovdqu $xd1,0x60($out) |
2186 | lea 0x80($out),$out # size optimization |
2187 | ||
2188 | vpxor 0x00($inp),$xa2,$xa2 |
2189 | vpxor 0x20($inp),$xb2,$xb2 |
2190 | vpxor 0x40($inp),$xc2,$xc2 |
2191 | vpxor 0x60($inp),$xd2,$xd2 |
2192 | lea 0x80($inp),$inp # size optimization |
2193 | vmovdqu $xa2,0x00($out) |
2194 | vmovdqu $xb2,0x20($out) |
2195 | vmovdqu $xc2,0x40($out) |
2196 | vmovdqu $xd2,0x60($out) |
2197 | lea 0x80($out),$out # size optimization |
2198 | ||
2199 | vpxor 0x00($inp),$xa3,$xa3 |
2200 | vpxor 0x20($inp),$xb3,$xb3 |
2201 | vpxor 0x40($inp),$xc3,$xc3 |
2202 | vpxor 0x60($inp),$xd3,$xd3 |
2203 | lea 0x80($inp),$inp # size optimization |
2204 | vmovdqu $xa3,0x00($out) |
2205 | vmovdqu $xb3,0x20($out) |
2206 | vmovdqu $xc3,0x40($out) |
2207 | vmovdqu $xd3,0x60($out) |
2208 | lea 0x80($out),$out # size optimization |
2209 | ||
2210 | sub \$64*8,$len |
2211 | jnz .Loop_outer8x |
2212 | ||
2213 | jmp .Ldone8x |
2214 | ||
2215 | .Ltail8x: |
2216 | cmp \$448,$len |
2217 | jae .L448_or_more8x |
2218 | cmp \$384,$len |
2219 | jae .L384_or_more8x |
2220 | cmp \$320,$len |
2221 | jae .L320_or_more8x |
2222 | cmp \$256,$len |
2223 | jae .L256_or_more8x |
2224 | cmp \$192,$len |
2225 | jae .L192_or_more8x |
2226 | cmp \$128,$len |
2227 | jae .L128_or_more8x |
2228 | cmp \$64,$len |
2229 | jae .L64_or_more8x |
2230 | ||
2231 | xor %r10,%r10 |
2232 | vmovdqa $xa0,0x00(%rsp) |
2233 | vmovdqa $xb0,0x20(%rsp) |
2234 | jmp .Loop_tail8x |
2235 | ||
2236 | .align 32 |
2237 | .L64_or_more8x: |
2238 | vpxor 0x00($inp),$xa0,$xa0 # xor with input |
2239 | vpxor 0x20($inp),$xb0,$xb0 |
2240 | vmovdqu $xa0,0x00($out) |
2241 | vmovdqu $xb0,0x20($out) |
2242 | je .Ldone8x |
2243 | ||
2244 | lea 0x40($inp),$inp # inp+=64*1 |
2245 | xor %r10,%r10 |
2246 | vmovdqa $xc0,0x00(%rsp) |
2247 | lea 0x40($out),$out # out+=64*1 |
2248 | sub \$64,$len # len-=64*1 |
2249 | vmovdqa $xd0,0x20(%rsp) |
2250 | jmp .Loop_tail8x |
2251 | ||
2252 | .align 32 |
2253 | .L128_or_more8x: |
2254 | vpxor 0x00($inp),$xa0,$xa0 # xor with input |
2255 | vpxor 0x20($inp),$xb0,$xb0 |
2256 | vpxor 0x40($inp),$xc0,$xc0 |
2257 | vpxor 0x60($inp),$xd0,$xd0 |
2258 | vmovdqu $xa0,0x00($out) |
2259 | vmovdqu $xb0,0x20($out) |
2260 | vmovdqu $xc0,0x40($out) |
2261 | vmovdqu $xd0,0x60($out) |
2262 | je .Ldone8x |
2263 | ||
2264 | lea 0x80($inp),$inp # inp+=64*2 |
2265 | xor %r10,%r10 |
2266 | vmovdqa $xa1,0x00(%rsp) |
2267 | lea 0x80($out),$out # out+=64*2 |
2268 | sub \$128,$len # len-=64*2 |
2269 | vmovdqa $xb1,0x20(%rsp) |
2270 | jmp .Loop_tail8x |
2271 | ||
2272 | .align 32 |
2273 | .L192_or_more8x: |
2274 | vpxor 0x00($inp),$xa0,$xa0 # xor with input |
2275 | vpxor 0x20($inp),$xb0,$xb0 |
2276 | vpxor 0x40($inp),$xc0,$xc0 |
2277 | vpxor 0x60($inp),$xd0,$xd0 |
2278 | vpxor 0x80($inp),$xa1,$xa1 |
2279 | vpxor 0xa0($inp),$xb1,$xb1 |
2280 | vmovdqu $xa0,0x00($out) |
2281 | vmovdqu $xb0,0x20($out) |
2282 | vmovdqu $xc0,0x40($out) |
2283 | vmovdqu $xd0,0x60($out) |
2284 | vmovdqu $xa1,0x80($out) |
2285 | vmovdqu $xb1,0xa0($out) |
2286 | je .Ldone8x |
2287 | ||
2288 | lea 0xc0($inp),$inp # inp+=64*3 |
2289 | xor %r10,%r10 |
2290 | vmovdqa $xc1,0x00(%rsp) |
2291 | lea 0xc0($out),$out # out+=64*3 |
2292 | sub \$192,$len # len-=64*3 |
2293 | vmovdqa $xd1,0x20(%rsp) |
2294 | jmp .Loop_tail8x |
2295 | ||
2296 | .align 32 |
2297 | .L256_or_more8x: |
2298 | vpxor 0x00($inp),$xa0,$xa0 # xor with input |
2299 | vpxor 0x20($inp),$xb0,$xb0 |
2300 | vpxor 0x40($inp),$xc0,$xc0 |
2301 | vpxor 0x60($inp),$xd0,$xd0 |
2302 | vpxor 0x80($inp),$xa1,$xa1 |
2303 | vpxor 0xa0($inp),$xb1,$xb1 |
2304 | vpxor 0xc0($inp),$xc1,$xc1 |
2305 | vpxor 0xe0($inp),$xd1,$xd1 |
2306 | vmovdqu $xa0,0x00($out) |
2307 | vmovdqu $xb0,0x20($out) |
2308 | vmovdqu $xc0,0x40($out) |
2309 | vmovdqu $xd0,0x60($out) |
2310 | vmovdqu $xa1,0x80($out) |
2311 | vmovdqu $xb1,0xa0($out) |
2312 | vmovdqu $xc1,0xc0($out) |
2313 | vmovdqu $xd1,0xe0($out) |
2314 | je .Ldone8x |
2315 | ||
2316 | lea 0x100($inp),$inp # inp+=64*4 |
2317 | xor %r10,%r10 |
2318 | vmovdqa $xa2,0x00(%rsp) |
2319 | lea 0x100($out),$out # out+=64*4 |
2320 | sub \$256,$len # len-=64*4 |
2321 | vmovdqa $xb2,0x20(%rsp) |
2322 | jmp .Loop_tail8x |
2323 | ||
2324 | .align 32 |
2325 | .L320_or_more8x: |
2326 | vpxor 0x00($inp),$xa0,$xa0 # xor with input |
2327 | vpxor 0x20($inp),$xb0,$xb0 |
2328 | vpxor 0x40($inp),$xc0,$xc0 |
2329 | vpxor 0x60($inp),$xd0,$xd0 |
2330 | vpxor 0x80($inp),$xa1,$xa1 |
2331 | vpxor 0xa0($inp),$xb1,$xb1 |
2332 | vpxor 0xc0($inp),$xc1,$xc1 |
2333 | vpxor 0xe0($inp),$xd1,$xd1 |
2334 | vpxor 0x100($inp),$xa2,$xa2 |
2335 | vpxor 0x120($inp),$xb2,$xb2 |
2336 | vmovdqu $xa0,0x00($out) |
2337 | vmovdqu $xb0,0x20($out) |
2338 | vmovdqu $xc0,0x40($out) |
2339 | vmovdqu $xd0,0x60($out) |
2340 | vmovdqu $xa1,0x80($out) |
2341 | vmovdqu $xb1,0xa0($out) |
2342 | vmovdqu $xc1,0xc0($out) |
2343 | vmovdqu $xd1,0xe0($out) |
2344 | vmovdqu $xa2,0x100($out) |
2345 | vmovdqu $xb2,0x120($out) |
2346 | je .Ldone8x |
2347 | ||
2348 | lea 0x140($inp),$inp # inp+=64*5 |
2349 | xor %r10,%r10 |
2350 | vmovdqa $xc2,0x00(%rsp) |
2351 | lea 0x140($out),$out # out+=64*5 |
2352 | sub \$320,$len # len-=64*5 |
2353 | vmovdqa $xd2,0x20(%rsp) |
2354 | jmp .Loop_tail8x |
2355 | ||
2356 | .align 32 |
2357 | .L384_or_more8x: |
2358 | vpxor 0x00($inp),$xa0,$xa0 # xor with input |
2359 | vpxor 0x20($inp),$xb0,$xb0 |
2360 | vpxor 0x40($inp),$xc0,$xc0 |
2361 | vpxor 0x60($inp),$xd0,$xd0 |
2362 | vpxor 0x80($inp),$xa1,$xa1 |
2363 | vpxor 0xa0($inp),$xb1,$xb1 |
2364 | vpxor 0xc0($inp),$xc1,$xc1 |
2365 | vpxor 0xe0($inp),$xd1,$xd1 |
2366 | vpxor 0x100($inp),$xa2,$xa2 |
2367 | vpxor 0x120($inp),$xb2,$xb2 |
2368 | vpxor 0x140($inp),$xc2,$xc2 |
2369 | vpxor 0x160($inp),$xd2,$xd2 |
2370 | vmovdqu $xa0,0x00($out) |
2371 | vmovdqu $xb0,0x20($out) |
2372 | vmovdqu $xc0,0x40($out) |
2373 | vmovdqu $xd0,0x60($out) |
2374 | vmovdqu $xa1,0x80($out) |
2375 | vmovdqu $xb1,0xa0($out) |
2376 | vmovdqu $xc1,0xc0($out) |
2377 | vmovdqu $xd1,0xe0($out) |
2378 | vmovdqu $xa2,0x100($out) |
2379 | vmovdqu $xb2,0x120($out) |
2380 | vmovdqu $xc2,0x140($out) |
2381 | vmovdqu $xd2,0x160($out) |
2382 | je .Ldone8x |
2383 | ||
2384 | lea 0x180($inp),$inp # inp+=64*6 |
2385 | xor %r10,%r10 |
2386 | vmovdqa $xa3,0x00(%rsp) |
2387 | lea 0x180($out),$out # out+=64*6 |
2388 | sub \$384,$len # len-=64*6 |
2389 | vmovdqa $xb3,0x20(%rsp) |
2390 | jmp .Loop_tail8x |
2391 | ||
2392 | .align 32 |
2393 | .L448_or_more8x: |
2394 | vpxor 0x00($inp),$xa0,$xa0 # xor with input |
2395 | vpxor 0x20($inp),$xb0,$xb0 |
2396 | vpxor 0x40($inp),$xc0,$xc0 |
2397 | vpxor 0x60($inp),$xd0,$xd0 |
2398 | vpxor 0x80($inp),$xa1,$xa1 |
2399 | vpxor 0xa0($inp),$xb1,$xb1 |
2400 | vpxor 0xc0($inp),$xc1,$xc1 |
2401 | vpxor 0xe0($inp),$xd1,$xd1 |
2402 | vpxor 0x100($inp),$xa2,$xa2 |
2403 | vpxor 0x120($inp),$xb2,$xb2 |
2404 | vpxor 0x140($inp),$xc2,$xc2 |
2405 | vpxor 0x160($inp),$xd2,$xd2 |
2406 | vpxor 0x180($inp),$xa3,$xa3 |
2407 | vpxor 0x1a0($inp),$xb3,$xb3 |
2408 | vmovdqu $xa0,0x00($out) |
2409 | vmovdqu $xb0,0x20($out) |
2410 | vmovdqu $xc0,0x40($out) |
2411 | vmovdqu $xd0,0x60($out) |
2412 | vmovdqu $xa1,0x80($out) |
2413 | vmovdqu $xb1,0xa0($out) |
2414 | vmovdqu $xc1,0xc0($out) |
2415 | vmovdqu $xd1,0xe0($out) |
2416 | vmovdqu $xa2,0x100($out) |
2417 | vmovdqu $xb2,0x120($out) |
2418 | vmovdqu $xc2,0x140($out) |
2419 | vmovdqu $xd2,0x160($out) |
2420 | vmovdqu $xa3,0x180($out) |
2421 | vmovdqu $xb3,0x1a0($out) |
2422 | je .Ldone8x |
2423 | ||
2424 | lea 0x1c0($inp),$inp # inp+=64*7 |
2425 | xor %r10,%r10 |
2426 | vmovdqa $xc3,0x00(%rsp) |
2427 | lea 0x1c0($out),$out # out+=64*7 |
2428 | sub \$448,$len # len-=64*7 |
2429 | vmovdqa $xd3,0x20(%rsp) |
2430 | ||
2431 | .Loop_tail8x: |
2432 | movzb ($inp,%r10),%eax |
2433 | movzb (%rsp,%r10),%ecx |
2434 | lea 1(%r10),%r10 |
2435 | xor %ecx,%eax |
2436 | mov %al,-1($out,%r10) |
2437 | dec $len |
2438 | jnz .Loop_tail8x |
2439 | ||
2440 | .Ldone8x: |
3c274a6e | 2441 | vzeroall |
a98c648e AP |
2442 | ___ |
# Win64 only: reload %xmm6-%xmm15 from the slots written in the prologue.
2443 | $code.=<<___ if ($win64); |
384e6de4 AP |
2444 | movaps -0xa8(%r9),%xmm6 |
2445 | movaps -0x98(%r9),%xmm7 |
2446 | movaps -0x88(%r9),%xmm8 |
2447 | movaps -0x78(%r9),%xmm9 |
2448 | movaps -0x68(%r9),%xmm10 |
2449 | movaps -0x58(%r9),%xmm11 |
2450 | movaps -0x48(%r9),%xmm12 |
2451 | movaps -0x38(%r9),%xmm13 |
2452 | movaps -0x28(%r9),%xmm14 |
2453 | movaps -0x18(%r9),%xmm15 |
a98c648e AP |
2454 | ___ |
# Common epilogue: restore %rsp from the frame register %r9 and return.
2455 | $code.=<<___; |
384e6de4 | 2456 | lea (%r9),%rsp |
f17652e5 | 2457 | .cfi_def_cfa_register %rsp |
384e6de4 | 2458 | .L8x_epilogue: |
a98c648e | 2459 | ret |
f17652e5 | 2460 | .cfi_endproc |
a98c648e AP |
2461 | .size ChaCha20_8x,.-ChaCha20_8x |
2462 | ___ |
2463 | } |
2464 | ||
abb8c44f AP |
2465 | ######################################################################## |
2466 | # AVX512 code paths | |
# AVX512 code paths, generated only when $avx>2. Register naming for the
# short-input routines:
#   $a,$b,$c,$d     - working state rows, %zmm0-3
#   $a_,$b_,$c_,$d_ - saved copy of the input state, %zmm16-19
#   $fourz          - per-iteration counter increment, %zmm20
#   $t0..$t3        - %xmm4-7, 128-bit scratch used when producing output
2467 | if ($avx>2) { |
3c274a6e AP |
2468 | # This one handles shorter inputs... |
2469 | |
2470 | my ($a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz) = map("%zmm$_",(0..3,16..20)); |
2471 | my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7)); |
2472 | ||
cded9513 AP |
# vpxord(dst,src1,src2): append one XOR instruction to $code.
# The operands are given destination-first and written out in reverse
# (source-first) order. Plain "vpxor" is used unless some operand is a %zmm
# register or a %ymm register numbered 16 or higher, in which case "vpxord"
# is emitted instead (presumably because only the longer encoding can
# address those registers -- hence "size optimization").
# NOTE(review): %xmm16 and above are not matched by the pattern; no caller
# visible here passes such a register.
2473 | sub vpxord() # size optimization |
2474 | { my $opcode = "vpxor"; # adhere to vpxor when possible |
2475 | |
2476 | foreach (@_) { |
2477 | if (/%([zy])mm([0-9]+)/ && ($1 eq "z" || $2>=16)) { |
2478 | $opcode = "vpxord"; |
2479 | last; |
2480 | } |
2481 | } |
2482 | |
2483 | $code .= "\t$opcode\t".join(',',reverse @_)."\n"; |
2484 | } |
2485 | ||
3c274a6e AP |
# AVX512ROUND(): emit one quarter-round on the vectors $a,$b,$c,$d taken
# from the enclosing scope: four add/xor/rotate steps with left-rotation
# counts 16, 12, 8 and 7, each rotation being a single vprold. Takes no
# arguments. The &vpaddd/&vprold emitters are not defined in this section;
# presumably they are provided elsewhere in the file.
2486 | sub AVX512ROUND { # critical path is 14 "SIMD ticks" per round |
2487 | &vpaddd ($a,$a,$b); |
2488 | &vpxord ($d,$d,$a); |
2489 | &vprold ($d,$d,16); |
2490 | |
2491 | &vpaddd ($c,$c,$d); |
2492 | &vpxord ($b,$b,$c); |
2493 | &vprold ($b,$b,12); |
2494 | |
2495 | &vpaddd ($a,$a,$b); |
2496 | &vpxord ($d,$d,$a); |
2497 | &vprold ($d,$d,8); |
2498 | |
2499 | &vpaddd ($c,$c,$d); |
2500 | &vpxord ($b,$b,$c); |
2501 | &vprold ($b,$b,7); |
2502 | } |
2503 | ||
# ChaCha20_avx512: handles inputs of at most 512 bytes itself and branches
# to .LChaCha20_16x for anything longer. $xframe is the stack space reserved
# on top of the 64-byte tail buffer: 32 bytes for %xmm6/%xmm7 plus 8 on
# Win64, otherwise 8.
384e6de4 | 2504 | my $xframe = $win64 ? 32+8 : 8; |
3c274a6e AP |
2505 | |
2506 | $code.=<<___; |
2507 | .type ChaCha20_avx512,\@function,5 |
2508 | .align 32 |
2509 | ChaCha20_avx512: |
f17652e5 | 2510 | .cfi_startproc |
3c274a6e | 2511 | .LChaCha20_avx512: |
384e6de4 | 2512 | mov %rsp,%r9 # frame pointer |
f17652e5 | 2513 | .cfi_def_cfa_register %r9 |
3c274a6e AP |
2514 | cmp \$512,$len |
2515 | ja .LChaCha20_16x |
2516 | |
3c274a6e AP |
2517 | sub \$64+$xframe,%rsp |
2518 | ___ |
# Win64 only: save %xmm6 and %xmm7, which are among the scratch registers
# $t0..$t3 (%xmm4-7) used below.
2519 | $code.=<<___ if ($win64); |
384e6de4 AP |
2520 | movaps %xmm6,-0x28(%r9) |
2521 | movaps %xmm7,-0x18(%r9) |
2522 | .Lavx512_body: |
3c274a6e AP |
2523 | ___ |
2524 | $code.=<<___; |
2525 | vbroadcasti32x4 .Lsigma(%rip),$a |
2526 | vbroadcasti32x4 ($key),$b |
2527 | vbroadcasti32x4 16($key),$c |
2528 | vbroadcasti32x4 ($counter),$d |
2529 | |
2530 | vmovdqa32 $a,$a_ |
2531 | vmovdqa32 $b,$b_ |
2532 | vmovdqa32 $c,$c_ |
2533 | vpaddd .Lzeroz(%rip),$d,$d |
2534 | vmovdqa32 .Lfourz(%rip),$fourz |
2535 | mov \$10,$counter # reuse $counter |
2536 | vmovdqa32 $d,$d_ |
2537 | jmp .Loop_avx512 |
2538 | |
2539 | .align 16 |
2540 | .Loop_outer_avx512: |
2541 | vmovdqa32 $a_,$a |
2542 | vmovdqa32 $b_,$b |
2543 | vmovdqa32 $c_,$c |
2544 | vpaddd $fourz,$d_,$d |
2545 | mov \$10,$counter |
2546 | vmovdqa32 $d,$d_ |
2547 | jmp .Loop_avx512 |
2548 | |
2549 | .align 32 |
2550 | .Loop_avx512: |
2551 | ___ |
# Body of .Loop_avx512, executed ten times ($counter was set to 10): a
# quarter-round on the rows as loaded, a vpshufd of $b/$c/$d to re-align the
# words for the second quarter-round, then the inverse shuffles of $b and $d
# ($c's shuffle is its own inverse) to restore the original layout.
2552 | &AVX512ROUND(); |
2553 | &vpshufd ($c,$c,0b01001110); |
2554 | &vpshufd ($b,$b,0b00111001); |
2555 | &vpshufd ($d,$d,0b10010011); |
2556 | |
2557 | &AVX512ROUND(); |
2558 | &vpshufd ($c,$c,0b01001110); |
2559 | &vpshufd ($b,$b,0b10010011); |
2560 | &vpshufd ($d,$d,0b00111001); |
2561 | |
2562 | &dec ($counter); |
2563 | &jnz (".Loop_avx512"); |
2564 | |
# Add the saved input state, then produce output 64 bytes at a time, taking
# one 128-bit lane (0..3) of each row per step. When fewer than 64 bytes
# remain, the current lane's keystream is stored at 0x00..0x30(%rsp) and
# XORed with the input byte by byte; that stack copy is overwritten before
# returning.
2565 | $code.=<<___; |
2566 | vpaddd $a_,$a,$a |
2567 | vpaddd $b_,$b,$b |
2568 | vpaddd $c_,$c,$c |
2569 | vpaddd $d_,$d,$d |
2570 | |
2571 | sub \$64,$len |
2572 | jb .Ltail64_avx512 |
2573 | |
2574 | vpxor 0x00($inp),%x#$a,$t0 # xor with input |
2575 | vpxor 0x10($inp),%x#$b,$t1 |
2576 | vpxor 0x20($inp),%x#$c,$t2 |
2577 | vpxor 0x30($inp),%x#$d,$t3 |
2578 | lea 0x40($inp),$inp # inp+=64 |
2579 | |
2580 | vmovdqu $t0,0x00($out) # write output |
2581 | vmovdqu $t1,0x10($out) |
2582 | vmovdqu $t2,0x20($out) |
2583 | vmovdqu $t3,0x30($out) |
2584 | lea 0x40($out),$out # out+=64 |
2585 | |
2586 | jz .Ldone_avx512 |
2587 | |
2588 | vextracti32x4 \$1,$a,$t0 |
2589 | vextracti32x4 \$1,$b,$t1 |
2590 | vextracti32x4 \$1,$c,$t2 |
2591 | vextracti32x4 \$1,$d,$t3 |
2592 | |
2593 | sub \$64,$len |
2594 | jb .Ltail_avx512 |
2595 | |
2596 | vpxor 0x00($inp),$t0,$t0 # xor with input |
2597 | vpxor 0x10($inp),$t1,$t1 |
2598 | vpxor 0x20($inp),$t2,$t2 |
2599 | vpxor 0x30($inp),$t3,$t3 |
2600 | lea 0x40($inp),$inp # inp+=64 |
2601 | |
2602 | vmovdqu $t0,0x00($out) # write output |
2603 | vmovdqu $t1,0x10($out) |
2604 | vmovdqu $t2,0x20($out) |
2605 | vmovdqu $t3,0x30($out) |
2606 | lea 0x40($out),$out # out+=64 |
2607 | |
2608 | jz .Ldone_avx512 |
2609 | |
2610 | vextracti32x4 \$2,$a,$t0 |
2611 | vextracti32x4 \$2,$b,$t1 |
2612 | vextracti32x4 \$2,$c,$t2 |
2613 | vextracti32x4 \$2,$d,$t3 |
2614 | |
2615 | sub \$64,$len |
2616 | jb .Ltail_avx512 |
2617 | |
2618 | vpxor 0x00($inp),$t0,$t0 # xor with input |
2619 | vpxor 0x10($inp),$t1,$t1 |
2620 | vpxor 0x20($inp),$t2,$t2 |
2621 | vpxor 0x30($inp),$t3,$t3 |
2622 | lea 0x40($inp),$inp # inp+=64 |
2623 | |
2624 | vmovdqu $t0,0x00($out) # write output |
2625 | vmovdqu $t1,0x10($out) |
2626 | vmovdqu $t2,0x20($out) |
2627 | vmovdqu $t3,0x30($out) |
2628 | lea 0x40($out),$out # out+=64 |
2629 | |
2630 | jz .Ldone_avx512 |
2631 | |
2632 | vextracti32x4 \$3,$a,$t0 |
2633 | vextracti32x4 \$3,$b,$t1 |
2634 | vextracti32x4 \$3,$c,$t2 |
2635 | vextracti32x4 \$3,$d,$t3 |
2636 | |
2637 | sub \$64,$len |
2638 | jb .Ltail_avx512 |
2639 | |
2640 | vpxor 0x00($inp),$t0,$t0 # xor with input |
2641 | vpxor 0x10($inp),$t1,$t1 |
2642 | vpxor 0x20($inp),$t2,$t2 |
2643 | vpxor 0x30($inp),$t3,$t3 |
2644 | lea 0x40($inp),$inp # inp+=64 |
2645 | |
2646 | vmovdqu $t0,0x00($out) # write output |
2647 | vmovdqu $t1,0x10($out) |
2648 | vmovdqu $t2,0x20($out) |
2649 | vmovdqu $t3,0x30($out) |
2650 | lea 0x40($out),$out # out+=64 |
2651 | |
2652 | jnz .Loop_outer_avx512 |
2653 | |
2654 | jmp .Ldone_avx512 |
2655 | |
2656 | .align 16 |
2657 | .Ltail64_avx512: |
2658 | vmovdqa %x#$a,0x00(%rsp) |
2659 | vmovdqa %x#$b,0x10(%rsp) |
2660 | vmovdqa %x#$c,0x20(%rsp) |
2661 | vmovdqa %x#$d,0x30(%rsp) |
2662 | add \$64,$len |
2663 | jmp .Loop_tail_avx512 |
2664 | |
2665 | .align 16 |
2666 | .Ltail_avx512: |
2667 | vmovdqa $t0,0x00(%rsp) |
2668 | vmovdqa $t1,0x10(%rsp) |
2669 | vmovdqa $t2,0x20(%rsp) |
2670 | vmovdqa $t3,0x30(%rsp) |
2671 | add \$64,$len |
2672 | |
2673 | .Loop_tail_avx512: |
2674 | movzb ($inp,$counter),%eax |
2675 | movzb (%rsp,$counter),%ecx |
2676 | lea 1($counter),$counter |
2677 | xor %ecx,%eax |
2678 | mov %al,-1($out,$counter) |
2679 | dec $len |
2680 | jnz .Loop_tail_avx512 |
2681 | |
47c9926a | 2682 | vmovdqu32 $a_,0x00(%rsp) |
3c274a6e AP |
2683 | |
2684 | .Ldone_avx512: |
2685 | vzeroall |
2686 | ___ |
# Win64 only: reload %xmm6 and %xmm7 from the slots written after entry.
2687 | $code.=<<___ if ($win64); |
384e6de4 AP |
2688 | movaps -0x28(%r9),%xmm6 |
2689 | movaps -0x18(%r9),%xmm7 |
3c274a6e AP |
2690 | ___ |
# Common epilogue: restore %rsp from the frame pointer %r9 and return.
2691 | $code.=<<___; |
384e6de4 | 2692 | lea (%r9),%rsp |
f17652e5 | 2693 | .cfi_def_cfa_register %rsp |
384e6de4 | 2694 | .Lavx512_epilogue: |
3c274a6e | 2695 | ret |
f17652e5 | 2696 | .cfi_endproc |
3c274a6e AP |
2697 | .size ChaCha20_avx512,.-ChaCha20_avx512 |
2698 | ___ |
cded9513 AP |
2699 | |
# ChaCha20_avx512vl: 256-bit variant of the routine above. The same Perl
# register variables are reused after rewriting their names from %zmm to
# %ymm in place, so each row carries two 128-bit lanes (two 64-byte blocks
# per outer iteration) and $fourz is loaded from .Ltwoy. Inputs longer than
# 128 bytes branch to .LChaCha20_8xvl.
2700 | map(s/%z/%y/, $a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz); |
2701 | ||
2702 | $code.=<<___; |
2703 | .type ChaCha20_avx512vl,\@function,5 |
2704 | .align 32 |
2705 | ChaCha20_avx512vl: |
2706 | .cfi_startproc |
2707 | .LChaCha20_avx512vl: |
2708 | mov %rsp,%r9 # frame pointer |
2709 | .cfi_def_cfa_register %r9 |
2710 | cmp \$128,$len |
2711 | ja .LChaCha20_8xvl |
2712 | ||
2713 | sub \$64+$xframe,%rsp |
2714 | ___ |
# Win64 only: save %xmm6 and %xmm7 (part of the $t0..$t3 scratch set).
2715 | $code.=<<___ if ($win64); |
2716 | movaps %xmm6,-0x28(%r9) |
2717 | movaps %xmm7,-0x18(%r9) |
2718 | .Lavx512vl_body: |
2719 | ___ |
2720 | $code.=<<___; |
2721 | vbroadcasti128 .Lsigma(%rip),$a |
2722 | vbroadcasti128 ($key),$b |
2723 | vbroadcasti128 16($key),$c |
2724 | vbroadcasti128 ($counter),$d |
2725 | ||
2726 | vmovdqa32 $a,$a_ |
2727 | vmovdqa32 $b,$b_ |
2728 | vmovdqa32 $c,$c_ |
2729 | vpaddd .Lzeroz(%rip),$d,$d |
2730 | vmovdqa32 .Ltwoy(%rip),$fourz |
2731 | mov \$10,$counter # reuse $counter |
2732 | vmovdqa32 $d,$d_ |
2733 | jmp .Loop_avx512vl |
2734 | ||
2735 | .align 16 |
2736 | .Loop_outer_avx512vl: |
2737 | vmovdqa32 $c_,$c |
2738 | vpaddd $fourz,$d_,$d |
2739 | mov \$10,$counter |
2740 | vmovdqa32 $d,$d_ |
2741 | jmp .Loop_avx512vl |
2742 | ||
2743 | .align 32 |
2744 | .Loop_avx512vl: |
2745 | ___ |
# Body of .Loop_avx512vl, executed ten times: the same two quarter-rounds
# and vpshufd re-alignments as in the 512-bit routine, now on %ymm names.
2746 | &AVX512ROUND(); |
2747 | &vpshufd ($c,$c,0b01001110); |
2748 | &vpshufd ($b,$b,0b00111001); |
2749 | &vpshufd ($d,$d,0b10010011); |
2750 | ||
2751 | &AVX512ROUND(); |
2752 | &vpshufd ($c,$c,0b01001110); |
2753 | &vpshufd ($b,$b,0b10010011); |
2754 | &vpshufd ($d,$d,0b00111001); |
2755 | ||
2756 | &dec ($counter); |
2757 | &jnz (".Loop_avx512vl"); |
2758 | ||
# Add the saved input state and output up to two 64-byte blocks (low lane,
# then the lane obtained with vextracti128 1). Note that $a and $b are
# restored from their saved copies just before looping back, whereas $c and
# $d are restored at .Loop_outer_avx512vl. A trailing partial block is
# XORed byte by byte from a keystream copy at 0x00..0x30(%rsp), which is
# overwritten afterwards.
2759 | $code.=<<___; |
2760 | vpaddd $a_,$a,$a |
2761 | vpaddd $b_,$b,$b |
2762 | vpaddd $c_,$c,$c |
2763 | vpaddd $d_,$d,$d |
2764 | ||
2765 | sub \$64,$len |
2766 | jb .Ltail64_avx512vl |
2767 | ||
2768 | vpxor 0x00($inp),%x#$a,$t0 # xor with input |
2769 | vpxor 0x10($inp),%x#$b,$t1 |
2770 | vpxor 0x20($inp),%x#$c,$t2 |
2771 | vpxor 0x30($inp),%x#$d,$t3 |
2772 | lea 0x40($inp),$inp # inp+=64 |
2773 | ||
2774 | vmovdqu $t0,0x00($out) # write output |
2775 | vmovdqu $t1,0x10($out) |
2776 | vmovdqu $t2,0x20($out) |
2777 | vmovdqu $t3,0x30($out) |
2778 | lea 0x40($out),$out # out+=64 |
2779 | ||
2780 | jz .Ldone_avx512vl |
2781 | ||
2782 | vextracti128 \$1,$a,$t0 |
2783 | vextracti128 \$1,$b,$t1 |
2784 | vextracti128 \$1,$c,$t2 |
2785 | vextracti128 \$1,$d,$t3 |
2786 | ||
2787 | sub \$64,$len |
2788 | jb .Ltail_avx512vl |
2789 | ||
2790 | vpxor 0x00($inp),$t0,$t0 # xor with input |
2791 | vpxor 0x10($inp),$t1,$t1 |
2792 | vpxor 0x20($inp),$t2,$t2 |
2793 | vpxor 0x30($inp),$t3,$t3 |
2794 | lea 0x40($inp),$inp # inp+=64 |
2795 | ||
2796 | vmovdqu $t0,0x00($out) # write output |
2797 | vmovdqu $t1,0x10($out) |
2798 | vmovdqu $t2,0x20($out) |
2799 | vmovdqu $t3,0x30($out) |
2800 | lea 0x40($out),$out # out+=64 |
2801 | ||
2802 | vmovdqa32 $a_,$a |
2803 | vmovdqa32 $b_,$b |
2804 | jnz .Loop_outer_avx512vl |
2805 | ||
2806 | jmp .Ldone_avx512vl |
2807 | ||
2808 | .align 16 |
2809 | .Ltail64_avx512vl: |
2810 | vmovdqa %x#$a,0x00(%rsp) |
2811 | vmovdqa %x#$b,0x10(%rsp) |
2812 | vmovdqa %x#$c,0x20(%rsp) |
2813 | vmovdqa %x#$d,0x30(%rsp) |
2814 | add \$64,$len |
2815 | jmp .Loop_tail_avx512vl |
2816 | ||
2817 | .align 16 |
2818 | .Ltail_avx512vl: |
2819 | vmovdqa $t0,0x00(%rsp) |
2820 | vmovdqa $t1,0x10(%rsp) |
2821 | vmovdqa $t2,0x20(%rsp) |
2822 | vmovdqa $t3,0x30(%rsp) |
2823 | add \$64,$len |
2824 | ||
2825 | .Loop_tail_avx512vl: |
2826 | movzb ($inp,$counter),%eax |
2827 | movzb (%rsp,$counter),%ecx |
2828 | lea 1($counter),$counter |
2829 | xor %ecx,%eax |
2830 | mov %al,-1($out,$counter) |
2831 | dec $len |
2832 | jnz .Loop_tail_avx512vl |
2833 | ||
2834 | vmovdqu32 $a_,0x00(%rsp) |
2835 | vmovdqu32 $a_,0x20(%rsp) |
2836 | ||
2837 | .Ldone_avx512vl: |
2838 | vzeroall |
2839 | ___ |
# Win64 only: reload %xmm6 and %xmm7 from the slots written after entry.
2840 | $code.=<<___ if ($win64); |
2841 | movaps -0x28(%r9),%xmm6 |
2842 | movaps -0x18(%r9),%xmm7 |
2843 | ___ |
# Common epilogue: restore %rsp from the frame pointer %r9 and return.
2844 | $code.=<<___; |
2845 | lea (%r9),%rsp |
2846 | .cfi_def_cfa_register %rsp |
2847 | .Lavx512vl_epilogue: |
2848 | ret |
2849 | .cfi_endproc |
2850 | .size ChaCha20_avx512vl,.-ChaCha20_avx512vl |
2851 | ___ |
3c274a6e AP |
2852 | } |
2853 | if ($avx>2) { | |
2854 | # This one handles longer inputs... | |
2855 | ||
# Register naming for the code emitted in this branch: %zmm0-15 are bound
# to $xa0..$xd3 (and, in the same order, to @xx for access by index), and
# %zmm16-31 are bound to @key.  $xt0..$xt3 are not separate registers but
# further names for @key[0..3].
abb8c44f AP |
2856 | my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, |
2857 | $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%zmm$_",(0..15)); | |
2858 | my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, | |
2859 | $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3); | |
2860 | my @key=map("%zmm$_",(16..31)); | |
2861 | my ($xt0,$xt1,$xt2,$xt3)=@key[0..3]; | |
2862 | ||
# Build the instruction list for one round made of four quarter-rounds
# that are issued interleaved.
#
# Arguments: the @xx indices (a,b,c,d) of the first quarter-round.  The
# remaining three quarter-rounds are obtained by moving every index to the
# next position inside its group of four, wrapping around at the group end.
#
# Returns 48 strings, each the text of a perlasm call ("&vpaddd (...)",
# "&vpxord (...)" or "&vprold (...)") naming quoted registers taken from
# @xx.  The caller is expected to eval them in the order returned.
sub AVX512_lane_ROUND {
    # Index quadruples (a,b,c,d) for Q1..Q4.
    my @quads = ([ @_[0..3] ]);
    while (@quads < 4) {
        push @quads, [ map { ($_ & ~3) + (($_ + 1) & 3) } @{ $quads[-1] } ];
    }

    # Register names wrapped in double quotes, ready to be pasted into the
    # generated call text.
    my @reg = map { "\"$_\"" } @xx;

    # Each step is (sum, addend, mixed, rotate), given as positions inside a
    # quadruple: sum += addend; mixed ^= sum; mixed is rotated left.
    my @steps = ([0, 1, 3, 16], [2, 3, 1, 12], [0, 1, 3, 8], [2, 3, 1, 7]);

    my @insns;
    for my $step (@steps) {
        my ($sum, $addend, $mixed, $rot) = @$step;

        # Same operation for all four quarter-rounds before moving on.
        for my $q (@quads) {
            my ($dst, $src) = @reg[ @{$q}[$sum, $addend] ];
            push @insns, "&vpaddd ($dst,$dst,$src)";
        }
        for my $q (@quads) {
            my ($dst, $src) = @reg[ @{$q}[$mixed, $sum] ];
            push @insns, "&vpxord ($dst,$dst,$src)";
        }
        for my $q (@quads) {
            my $dst = $reg[ $q->[$mixed] ];
            push @insns, "&vprold ($dst,$dst,$rot)";
        }
    }

    # Hand back a slice rather than the array itself so that a scalar-context
    # call still yields the last string, as a literal list would.
    @insns[0 .. $#insns];
}
2924 | ||
# Extra bytes reserved on top of the 64-byte scratch area when %rsp is
# lowered: 0xa8 on Win64, where xmm6-xmm15 are stored below the %r9 frame
# register, and 8 otherwise.
384e6de4 | 2925 | my $xframe = $win64 ? 0xa8 : 8; |
abb8c44f AP |
2926 | |
# Emit the ChaCha20_16x entry: %r9 keeps the incoming %rsp, then %rsp is
# lowered by 64+$xframe and rounded down to a multiple of 64.
2927 | $code.=<<___; | |
2928 | .type ChaCha20_16x,\@function,5 | |
2929 | .align 32 | |
2930 | ChaCha20_16x: | |
f17652e5 | 2931 | .cfi_startproc |
abb8c44f | 2932 | .LChaCha20_16x: |
384e6de4 | 2933 | mov %rsp,%r9 # frame register |
f17652e5 | 2934 | .cfi_def_cfa_register %r9 |
abb8c44f AP |
2935 | sub \$64+$xframe,%rsp |
2936 | and \$-64,%rsp | |
2937 | ___ | |
# Win64 only: store xmm6-xmm15 at fixed negative offsets from %r9.
2938 | $code.=<<___ if ($win64); | |
384e6de4 AP |
2939 | movaps %xmm6,-0xa8(%r9) |
2940 | movaps %xmm7,-0x98(%r9) | |
2941 | movaps %xmm8,-0x88(%r9) | |
2942 | movaps %xmm9,-0x78(%r9) | |
2943 | movaps %xmm10,-0x68(%r9) | |
2944 | movaps %xmm11,-0x58(%r9) | |
2945 | movaps %xmm12,-0x48(%r9) | |
2946 | movaps %xmm13,-0x38(%r9) | |
2947 | movaps %xmm14,-0x28(%r9) | |
2948 | movaps %xmm15,-0x18(%r9) | |
2949 | .L16x_body: | |
abb8c44f AP |
2950 | ___ |
# Key setup and outer-loop head.  The 16 bytes at .Lsigma, at ($key), at
# 16($key) and at ($counter) are broadcast, every dword is then replicated
# with vpshufd and a copy of each resulting register is kept in @key.
# .Loop_outer16x rebuilds the working registers from those copies (adding
# .Lsixteen to @key[12] first).  %eax is set to 10 before .Loop16x is entered.
2951 | $code.=<<___; | |
2952 | vzeroupper | |
2953 | ||
2954 | lea .Lsigma(%rip),%r10 | |
2955 | vbroadcasti32x4 (%r10),$xa3 # key[0] | |
2956 | vbroadcasti32x4 ($key),$xb3 # key[1] | |
2957 | vbroadcasti32x4 16($key),$xc3 # key[2] | |
2958 | vbroadcasti32x4 ($counter),$xd3 # key[3] | |
2959 | ||
2960 | vpshufd \$0x00,$xa3,$xa0 # smash key by lanes... | |
2961 | vpshufd \$0x55,$xa3,$xa1 | |
2962 | vpshufd \$0xaa,$xa3,$xa2 | |
2963 | vpshufd \$0xff,$xa3,$xa3 | |
2964 | vmovdqa64 $xa0,@key[0] | |
2965 | vmovdqa64 $xa1,@key[1] | |
2966 | vmovdqa64 $xa2,@key[2] | |
2967 | vmovdqa64 $xa3,@key[3] | |
2968 | ||
2969 | vpshufd \$0x00,$xb3,$xb0 | |
2970 | vpshufd \$0x55,$xb3,$xb1 | |
2971 | vpshufd \$0xaa,$xb3,$xb2 | |
2972 | vpshufd \$0xff,$xb3,$xb3 | |
2973 | vmovdqa64 $xb0,@key[4] | |
2974 | vmovdqa64 $xb1,@key[5] | |
2975 | vmovdqa64 $xb2,@key[6] | |
2976 | vmovdqa64 $xb3,@key[7] | |
2977 | ||
2978 | vpshufd \$0x00,$xc3,$xc0 | |
2979 | vpshufd \$0x55,$xc3,$xc1 | |
2980 | vpshufd \$0xaa,$xc3,$xc2 | |
2981 | vpshufd \$0xff,$xc3,$xc3 | |
2982 | vmovdqa64 $xc0,@key[8] | |
2983 | vmovdqa64 $xc1,@key[9] | |
2984 | vmovdqa64 $xc2,@key[10] | |
2985 | vmovdqa64 $xc3,@key[11] | |
2986 | ||
2987 | vpshufd \$0x00,$xd3,$xd0 | |
2988 | vpshufd \$0x55,$xd3,$xd1 | |
2989 | vpshufd \$0xaa,$xd3,$xd2 | |
2990 | vpshufd \$0xff,$xd3,$xd3 | |
2991 | vpaddd .Lincz(%rip),$xd0,$xd0 # don't save counters yet | |
2992 | vmovdqa64 $xd0,@key[12] | |
2993 | vmovdqa64 $xd1,@key[13] | |
2994 | vmovdqa64 $xd2,@key[14] | |
2995 | vmovdqa64 $xd3,@key[15] | |
2996 | ||
2997 | mov \$10,%eax | |
2998 | jmp .Loop16x | |
2999 | ||
3000 | .align 32 | |
3001 | .Loop_outer16x: | |
3002 | vpbroadcastd 0(%r10),$xa0 # reload key | |
3003 | vpbroadcastd 4(%r10),$xa1 | |
3004 | vpbroadcastd 8(%r10),$xa2 | |
3005 | vpbroadcastd 12(%r10),$xa3 | |
3006 | vpaddd .Lsixteen(%rip),@key[12],@key[12] # next SIMD counters | |
3007 | vmovdqa64 @key[4],$xb0 | |
3008 | vmovdqa64 @key[5],$xb1 | |
3009 | vmovdqa64 @key[6],$xb2 | |
3010 | vmovdqa64 @key[7],$xb3 | |
3011 | vmovdqa64 @key[8],$xc0 | |
3012 | vmovdqa64 @key[9],$xc1 | |
3013 | vmovdqa64 @key[10],$xc2 | |
3014 | vmovdqa64 @key[11],$xc3 | |
3015 | vmovdqa64 @key[12],$xd0 | |
3016 | vmovdqa64 @key[13],$xd1 | |
3017 | vmovdqa64 @key[14],$xd2 | |
3018 | vmovdqa64 @key[15],$xd3 | |
3019 | ||
3020 | vmovdqa64 $xa0,@key[0] | |
3021 | vmovdqa64 $xa1,@key[1] | |
3022 | vmovdqa64 $xa2,@key[2] | |
3023 | vmovdqa64 $xa3,@key[3] | |
3024 | ||
3025 | mov \$10,%eax | |
3026 | jmp .Loop16x | |
3027 | ||
3028 | .align 32 | |
3029 | .Loop16x: | |
3030 | ___ | |
# Body of .Loop16x: one round over the index sets starting at (0,4,8,12)
# followed by one over those starting at (0,5,10,15).  AVX512_lane_ROUND
# returns perlasm call strings and each one is eval'ed in turn
# (presumably appending one instruction to $code; the called helpers are
# defined outside this section).
3031 | foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; } | |
3032 | foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; } | |
# Close the loop on %eax, add the saved @key[0..3] to the a-registers and
# begin the "de-interlace" with dword/qword unpacks.
3033 | $code.=<<___; | |
3034 | dec %eax | |
3035 | jnz .Loop16x | |
3036 | ||
3037 | vpaddd @key[0],$xa0,$xa0 # accumulate key | |
3038 | vpaddd @key[1],$xa1,$xa1 | |
3039 | vpaddd @key[2],$xa2,$xa2 | |
3040 | vpaddd @key[3],$xa3,$xa3 | |
3041 | ||
3042 | vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data | |
3043 | vpunpckldq $xa3,$xa2,$xt3 | |
3044 | vpunpckhdq $xa1,$xa0,$xa0 | |
3045 | vpunpckhdq $xa3,$xa2,$xa2 | |
3046 | vpunpcklqdq $xt3,$xt2,$xa1 # "a0" | |
3047 | vpunpckhqdq $xt3,$xt2,$xt2 # "a1" | |
3048 | vpunpcklqdq $xa2,$xa0,$xa3 # "a2" | |
3049 | vpunpckhqdq $xa2,$xa0,$xa0 # "a3" | |
3050 | ___ | |
# Perl-side renaming only, no instruction is emitted: $xa0..$xa3 are made
# to name the registers tagged "a0".."a3" above and $xt2 takes over the
# register that is no longer needed.  The same pattern repeats for b, c, d.
3051 | ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); | |
3052 | $code.=<<___; | |
3053 | vpaddd @key[4],$xb0,$xb0 | |
3054 | vpaddd @key[5],$xb1,$xb1 | |
3055 | vpaddd @key[6],$xb2,$xb2 | |
3056 | vpaddd @key[7],$xb3,$xb3 | |
3057 | ||
3058 | vpunpckldq $xb1,$xb0,$xt2 | |
3059 | vpunpckldq $xb3,$xb2,$xt3 | |
3060 | vpunpckhdq $xb1,$xb0,$xb0 | |
3061 | vpunpckhdq $xb3,$xb2,$xb2 | |
3062 | vpunpcklqdq $xt3,$xt2,$xb1 # "b0" | |
3063 | vpunpckhqdq $xt3,$xt2,$xt2 # "b1" | |
3064 | vpunpcklqdq $xb2,$xb0,$xb3 # "b2" | |
3065 | vpunpckhqdq $xb2,$xb0,$xb0 # "b3" | |
3066 | ___ | |
3067 | ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); | |
3068 | $code.=<<___; | |
3069 | vshufi32x4 \$0x44,$xb0,$xa0,$xt3 # "de-interlace" further | |
3070 | vshufi32x4 \$0xee,$xb0,$xa0,$xb0 | |
3071 | vshufi32x4 \$0x44,$xb1,$xa1,$xa0 | |
3072 | vshufi32x4 \$0xee,$xb1,$xa1,$xb1 | |
3073 | vshufi32x4 \$0x44,$xb2,$xa2,$xa1 | |
3074 | vshufi32x4 \$0xee,$xb2,$xa2,$xb2 | |
3075 | vshufi32x4 \$0x44,$xb3,$xa3,$xa2 | |
3076 | vshufi32x4 \$0xee,$xb3,$xa3,$xb3 | |
3077 | ___ | |
# Follow the destinations of the \$0x44 shuffles: each result landed one
# register "earlier", with the first one in $xt3.
3078 | ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3); | |
3079 | $code.=<<___; | |
3080 | vpaddd @key[8],$xc0,$xc0 | |
3081 | vpaddd @key[9],$xc1,$xc1 | |
3082 | vpaddd @key[10],$xc2,$xc2 | |
3083 | vpaddd @key[11],$xc3,$xc3 | |
3084 | ||
3085 | vpunpckldq $xc1,$xc0,$xt2 | |
3086 | vpunpckldq $xc3,$xc2,$xt3 | |
3087 | vpunpckhdq $xc1,$xc0,$xc0 | |
3088 | vpunpckhdq $xc3,$xc2,$xc2 | |
3089 | vpunpcklqdq $xt3,$xt2,$xc1 # "c0" | |
3090 | vpunpckhqdq $xt3,$xt2,$xt2 # "c1" | |
3091 | vpunpcklqdq $xc2,$xc0,$xc3 # "c2" | |
3092 | vpunpckhqdq $xc2,$xc0,$xc0 # "c3" | |
3093 | ___ | |
3094 | ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); | |
3095 | $code.=<<___; | |
3096 | vpaddd @key[12],$xd0,$xd0 | |
3097 | vpaddd @key[13],$xd1,$xd1 | |
3098 | vpaddd @key[14],$xd2,$xd2 | |
3099 | vpaddd @key[15],$xd3,$xd3 | |
3100 | ||
3101 | vpunpckldq $xd1,$xd0,$xt2 | |
3102 | vpunpckldq $xd3,$xd2,$xt3 | |
3103 | vpunpckhdq $xd1,$xd0,$xd0 | |
3104 | vpunpckhdq $xd3,$xd2,$xd2 | |
3105 | vpunpcklqdq $xt3,$xt2,$xd1 # "d0" | |
3106 | vpunpckhqdq $xt3,$xt2,$xt2 # "d1" | |
3107 | vpunpcklqdq $xd2,$xd0,$xd3 # "d2" | |
3108 | vpunpckhqdq $xd2,$xd0,$xd0 # "d3" | |
3109 | ___ | |
3110 | ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); | |
3111 | $code.=<<___; | |
3112 | vshufi32x4 \$0x44,$xd0,$xc0,$xt3 # "de-interlace" further | |
3113 | vshufi32x4 \$0xee,$xd0,$xc0,$xd0 | |
3114 | vshufi32x4 \$0x44,$xd1,$xc1,$xc0 | |
3115 | vshufi32x4 \$0xee,$xd1,$xc1,$xd1 | |
3116 | vshufi32x4 \$0x44,$xd2,$xc2,$xc1 | |
3117 | vshufi32x4 \$0xee,$xd2,$xc2,$xd2 | |
3118 | vshufi32x4 \$0x44,$xd3,$xc3,$xc2 | |
3119 | vshufi32x4 \$0xee,$xd3,$xc3,$xd3 | |
3120 | ___ | |
3121 | ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3); | |
3122 | $code.=<<___; | |
3123 | vshufi32x4 \$0x88,$xc0,$xa0,$xt0 # "de-interlace" further | |
3124 | vshufi32x4 \$0xdd,$xc0,$xa0,$xa0 | |
3125 | vshufi32x4 \$0x88,$xd0,$xb0,$xc0 | |
3126 | vshufi32x4 \$0xdd,$xd0,$xb0,$xd0 | |
3127 | vshufi32x4 \$0x88,$xc1,$xa1,$xt1 | |
3128 | vshufi32x4 \$0xdd,$xc1,$xa1,$xa1 | |
3129 | vshufi32x4 \$0x88,$xc2,$xa2,$xt2 | |
3130 | vshufi32x4 \$0xdd,$xc2,$xa2,$xa2 | |
3131 | vshufi32x4 \$0x88,$xd1,$xb1,$xc1 | |
3132 | vshufi32x4 \$0xdd,$xd1,$xb1,$xd1 | |
3133 | vshufi32x4 \$0x88,$xd2,$xb2,$xc2 | |
3134 | vshufi32x4 \$0xdd,$xd2,$xb2,$xd2 | |
3135 | vshufi32x4 \$0x88,$xc3,$xa3,$xt3 | |
3136 | vshufi32x4 \$0xdd,$xc3,$xa3,$xa3 | |
3137 | vshufi32x4 \$0x88,$xd3,$xb3,$xc3 | |
3138 | vshufi32x4 \$0xdd,$xd3,$xb3,$xd3 | |
3139 | ___ | |
# The \$0x88 results went to $xt0..$xt3 and the \$0xdd results stayed in the
# old a-registers: call them a and b from now on.
3140 | ($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3)= | |
3141 | ($xt0,$xt1,$xt2,$xt3,$xa0,$xa1,$xa2,$xa3); | |
3142 | ||
# Swap rows and columns of the 4x4 grid of names, so that what was called
# (a0,a1,a2,a3) is addressed as (a0,b0,c0,d0) below, and so on.
3143 | ($xa0,$xb0,$xc0,$xd0, $xa1,$xb1,$xc1,$xd1, | |
3144 | $xa2,$xb2,$xc2,$xd2, $xa3,$xb3,$xc3,$xd3) = | |
3145 | ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, | |
3146 | $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3); | |
# Output stage.  With at least 64*16 bytes left, all sixteen registers are
# XORed with the input and stored, and the code loops to .Loop_outer16x
# while bytes remain.  Otherwise .Ltail16x consumes whole 64-byte blocks one
# register at a time, and .Less_than_64_16x finishes a partial block byte by
# byte from a copy of the next register stored at (%rsp); that stack slot is
# overwritten with zeros once the byte loop is done.
3147 | $code.=<<___; | |
3148 | cmp \$64*16,$len | |
3149 | jb .Ltail16x | |
3150 | ||
3151 | vpxord 0x00($inp),$xa0,$xa0 # xor with input | |
3152 | vpxord 0x40($inp),$xb0,$xb0 | |
3153 | vpxord 0x80($inp),$xc0,$xc0 | |
3154 | vpxord 0xc0($inp),$xd0,$xd0 | |
3155 | vmovdqu32 $xa0,0x00($out) | |
3156 | vmovdqu32 $xb0,0x40($out) | |
3157 | vmovdqu32 $xc0,0x80($out) | |
3158 | vmovdqu32 $xd0,0xc0($out) | |
3159 | ||
3160 | vpxord 0x100($inp),$xa1,$xa1 | |
3161 | vpxord 0x140($inp),$xb1,$xb1 | |
3162 | vpxord 0x180($inp),$xc1,$xc1 | |
3163 | vpxord 0x1c0($inp),$xd1,$xd1 | |
3164 | vmovdqu32 $xa1,0x100($out) | |
3165 | vmovdqu32 $xb1,0x140($out) | |
3166 | vmovdqu32 $xc1,0x180($out) | |
3167 | vmovdqu32 $xd1,0x1c0($out) | |
3168 | ||
3169 | vpxord 0x200($inp),$xa2,$xa2 | |
3170 | vpxord 0x240($inp),$xb2,$xb2 | |
3171 | vpxord 0x280($inp),$xc2,$xc2 | |
3172 | vpxord 0x2c0($inp),$xd2,$xd2 | |
3173 | vmovdqu32 $xa2,0x200($out) | |
3174 | vmovdqu32 $xb2,0x240($out) | |
3175 | vmovdqu32 $xc2,0x280($out) | |
3176 | vmovdqu32 $xd2,0x2c0($out) | |
3177 | ||
3178 | vpxord 0x300($inp),$xa3,$xa3 | |
3179 | vpxord 0x340($inp),$xb3,$xb3 | |
3180 | vpxord 0x380($inp),$xc3,$xc3 | |
3181 | vpxord 0x3c0($inp),$xd3,$xd3 | |
3182 | lea 0x400($inp),$inp | |
3183 | vmovdqu32 $xa3,0x300($out) | |
3184 | vmovdqu32 $xb3,0x340($out) | |
3185 | vmovdqu32 $xc3,0x380($out) | |
3186 | vmovdqu32 $xd3,0x3c0($out) | |
3187 | lea 0x400($out),$out | |
3188 | ||
3189 | sub \$64*16,$len | |
3190 | jnz .Loop_outer16x | |
3191 | ||
3192 | jmp .Ldone16x | |
3193 | ||
3194 | .align 32 | |
3195 | .Ltail16x: | |
3196 | xor %r10,%r10 | |
3197 | sub $inp,$out | |
3198 | cmp \$64*1,$len | |
3199 | jb .Less_than_64_16x | |
3200 | vpxord ($inp),$xa0,$xa0 # xor with input | |
3201 | vmovdqu32 $xa0,($out,$inp) | |
3202 | je .Ldone16x | |
3203 | vmovdqa32 $xb0,$xa0 | |
3204 | lea 64($inp),$inp | |
3205 | ||
3206 | cmp \$64*2,$len | |
3207 | jb .Less_than_64_16x | |
3208 | vpxord ($inp),$xb0,$xb0 | |
3209 | vmovdqu32 $xb0,($out,$inp) | |
3210 | je .Ldone16x | |
3211 | vmovdqa32 $xc0,$xa0 | |
3212 | lea 64($inp),$inp | |
3213 | ||
3214 | cmp \$64*3,$len | |
3215 | jb .Less_than_64_16x | |
3216 | vpxord ($inp),$xc0,$xc0 | |
3217 | vmovdqu32 $xc0,($out,$inp) | |
3218 | je .Ldone16x | |
3219 | vmovdqa32 $xd0,$xa0 | |
3220 | lea 64($inp),$inp | |
3221 | ||
3222 | cmp \$64*4,$len | |
3223 | jb .Less_than_64_16x | |
3224 | vpxord ($inp),$xd0,$xd0 | |
3225 | vmovdqu32 $xd0,($out,$inp) | |
3226 | je .Ldone16x | |
3227 | vmovdqa32 $xa1,$xa0 | |
3228 | lea 64($inp),$inp | |
3229 | ||
3230 | cmp \$64*5,$len | |
3231 | jb .Less_than_64_16x | |
3232 | vpxord ($inp),$xa1,$xa1 | |
3233 | vmovdqu32 $xa1,($out,$inp) | |
3234 | je .Ldone16x | |
3235 | vmovdqa32 $xb1,$xa0 | |
3236 | lea 64($inp),$inp | |
3237 | ||
3238 | cmp \$64*6,$len | |
3239 | jb .Less_than_64_16x | |
3240 | vpxord ($inp),$xb1,$xb1 | |
3241 | vmovdqu32 $xb1,($out,$inp) | |
3242 | je .Ldone16x | |
3243 | vmovdqa32 $xc1,$xa0 | |
3244 | lea 64($inp),$inp | |
3245 | ||
3246 | cmp \$64*7,$len | |
3247 | jb .Less_than_64_16x | |
3248 | vpxord ($inp),$xc1,$xc1 | |
3249 | vmovdqu32 $xc1,($out,$inp) | |
3250 | je .Ldone16x | |
3251 | vmovdqa32 $xd1,$xa0 | |
3252 | lea 64($inp),$inp | |
3253 | ||
3254 | cmp \$64*8,$len | |
3255 | jb .Less_than_64_16x | |
3256 | vpxord ($inp),$xd1,$xd1 | |
3257 | vmovdqu32 $xd1,($out,$inp) | |
3258 | je .Ldone16x | |
3259 | vmovdqa32 $xa2,$xa0 | |
3260 | lea 64($inp),$inp | |
3261 | ||
3262 | cmp \$64*9,$len | |
3263 | jb .Less_than_64_16x | |
3264 | vpxord ($inp),$xa2,$xa2 | |
3265 | vmovdqu32 $xa2,($out,$inp) | |
3266 | je .Ldone16x | |
3267 | vmovdqa32 $xb2,$xa0 | |
3268 | lea 64($inp),$inp | |
3269 | ||
3270 | cmp \$64*10,$len | |
3271 | jb .Less_than_64_16x | |
3272 | vpxord ($inp),$xb2,$xb2 | |
3273 | vmovdqu32 $xb2,($out,$inp) | |
3274 | je .Ldone16x | |
3275 | vmovdqa32 $xc2,$xa0 | |
3276 | lea 64($inp),$inp | |
3277 | ||
3278 | cmp \$64*11,$len | |
3279 | jb .Less_than_64_16x | |
3280 | vpxord ($inp),$xc2,$xc2 | |
3281 | vmovdqu32 $xc2,($out,$inp) | |
3282 | je .Ldone16x | |
3283 | vmovdqa32 $xd2,$xa0 | |
3284 | lea 64($inp),$inp | |
3285 | ||
3286 | cmp \$64*12,$len | |
3287 | jb .Less_than_64_16x | |
3288 | vpxord ($inp),$xd2,$xd2 | |
3289 | vmovdqu32 $xd2,($out,$inp) | |
3290 | je .Ldone16x | |
3291 | vmovdqa32 $xa3,$xa0 | |
3292 | lea 64($inp),$inp | |
3293 | ||
3294 | cmp \$64*13,$len | |
3295 | jb .Less_than_64_16x | |
3296 | vpxord ($inp),$xa3,$xa3 | |
3297 | vmovdqu32 $xa3,($out,$inp) | |
3298 | je .Ldone16x | |
3299 | vmovdqa32 $xb3,$xa0 | |
3300 | lea 64($inp),$inp | |
3301 | ||
3302 | cmp \$64*14,$len | |
3303 | jb .Less_than_64_16x | |
3304 | vpxord ($inp),$xb3,$xb3 | |
3305 | vmovdqu32 $xb3,($out,$inp) | |
3306 | je .Ldone16x | |
3307 | vmovdqa32 $xc3,$xa0 | |
3308 | lea 64($inp),$inp | |
3309 | ||
3310 | cmp \$64*15,$len | |
3311 | jb .Less_than_64_16x | |
3312 | vpxord ($inp),$xc3,$xc3 | |
3313 | vmovdqu32 $xc3,($out,$inp) | |
3314 | je .Ldone16x | |
3315 | vmovdqa32 $xd3,$xa0 | |
3316 | lea 64($inp),$inp | |
3317 | ||
3318 | .Less_than_64_16x: | |
3319 | vmovdqa32 $xa0,0x00(%rsp) | |
3320 | lea ($out,$inp),$out | |
3321 | and \$63,$len | |
3322 | ||
3323 | .Loop_tail16x: | |
3324 | movzb ($inp,%r10),%eax | |
3325 | movzb (%rsp,%r10),%ecx | |
3326 | lea 1(%r10),%r10 | |
3327 | xor %ecx,%eax | |
3328 | mov %al,-1($out,%r10) | |
3329 | dec $len | |
3330 | jnz .Loop_tail16x | |
3331 | ||
3c274a6e AP |
3332 | vpxord $xa0,$xa0,$xa0 |
3333 | vmovdqa32 $xa0,0(%rsp) | |
3334 | ||
abb8c44f | 3335 | .Ldone16x: |
3c274a6e | 3336 | vzeroall |
abb8c44f AP |
3337 | ___ |
# Win64 only: reload xmm6-xmm15 from their slots below %r9.
3338 | $code.=<<___ if ($win64); | |
384e6de4 AP |
3339 | movaps -0xa8(%r9),%xmm6 |
3340 | movaps -0x98(%r9),%xmm7 | |
3341 | movaps -0x88(%r9),%xmm8 | |
3342 | movaps -0x78(%r9),%xmm9 | |
3343 | movaps -0x68(%r9),%xmm10 | |
3344 | movaps -0x58(%r9),%xmm11 | |
3345 | movaps -0x48(%r9),%xmm12 | |
3346 | movaps -0x38(%r9),%xmm13 | |
3347 | movaps -0x28(%r9),%xmm14 | |
3348 | movaps -0x18(%r9),%xmm15 | |
abb8c44f AP |
3349 | ___ |
# Put %rsp back from the %r9 frame register and return.
3350 | $code.=<<___; | |
384e6de4 | 3351 | lea (%r9),%rsp |
f17652e5 | 3352 | .cfi_def_cfa_register %rsp |
384e6de4 | 3353 | .L16x_epilogue: |
abb8c44f | 3354 | ret |
f17652e5 | 3355 | .cfi_endproc |
abb8c44f AP |
3356 | .size ChaCha20_16x,.-ChaCha20_16x |
3357 | ___ | |
cded9513 AP |
3358 | |
3359 | # switch to %ymm domain | |
# The same Perl variables are rebound to %ymm0-15 / %ymm16-31, so the
# heredocs below and AVX512_lane_ROUND (which reads @xx) now name 256-bit
# registers.
3360 | ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, | |
3361 | $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%ymm$_",(0..15)); | |
3362 | @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, | |
3363 | $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3); | |
3364 | @key=map("%ymm$_",(16..31)); | |
3365 | ($xt0,$xt1,$xt2,$xt3)=@key[0..3]; | |
3366 | ||
# Emit the ChaCha20_8xvl entry: %r9 keeps the incoming %rsp, then %rsp is
# lowered by 64+$xframe and rounded down to a multiple of 64.
3367 | $code.=<<___; | |
3368 | .type ChaCha20_8xvl,\@function,5 | |
3369 | .align 32 | |
3370 | ChaCha20_8xvl: | |
3371 | .cfi_startproc | |
3372 | .LChaCha20_8xvl: | |
3373 | mov %rsp,%r9 # frame register | |
3374 | .cfi_def_cfa_register %r9 | |
3375 | sub \$64+$xframe,%rsp | |
3376 | and \$-64,%rsp | |
3377 | ___ | |
# Win64 only: store xmm6-xmm15 at fixed negative offsets from %r9.
3378 | $code.=<<___ if ($win64); | |
3379 | movaps %xmm6,-0xa8(%r9) | |
3380 | movaps %xmm7,-0x98(%r9) | |
3381 | movaps %xmm8,-0x88(%r9) | |
3382 | movaps %xmm9,-0x78(%r9) | |
3383 | movaps %xmm10,-0x68(%r9) | |
3384 | movaps %xmm11,-0x58(%r9) | |
3385 | movaps %xmm12,-0x48(%r9) | |
3386 | movaps %xmm13,-0x38(%r9) | |
3387 | movaps %xmm14,-0x28(%r9) | |
3388 | movaps %xmm15,-0x18(%r9) | |
3389 | .L8xvl_body: | |
3390 | ___ | |
# Key setup and outer-loop head.  The 16 bytes at .Lsigma, at ($key), at
# 16($key) and at ($counter) are broadcast, every dword is then replicated
# with vpshufd and a copy of each resulting register is kept in @key.
# .Loop_outer8xvl itself reloads only the dwords at 8(%r10) and 12(%r10);
# its loads from 0(%r10) and 4(%r10) are commented out.  %eax is set to 10
# before .Loop8xvl is entered.
3391 | $code.=<<___; | |
3392 | vzeroupper | |
3393 | ||
3394 | lea .Lsigma(%rip),%r10 | |
3395 | vbroadcasti128 (%r10),$xa3 # key[0] | |
3396 | vbroadcasti128 ($key),$xb3 # key[1] | |
3397 | vbroadcasti128 16($key),$xc3 # key[2] | |
3398 | vbroadcasti128 ($counter),$xd3 # key[3] | |
3399 | ||
3400 | vpshufd \$0x00,$xa3,$xa0 # smash key by lanes... | |
3401 | vpshufd \$0x55,$xa3,$xa1 | |
3402 | vpshufd \$0xaa,$xa3,$xa2 | |
3403 | vpshufd \$0xff,$xa3,$xa3 | |
3404 | vmovdqa64 $xa0,@key[0] | |
3405 | vmovdqa64 $xa1,@key[1] | |
3406 | vmovdqa64 $xa2,@key[2] | |
3407 | vmovdqa64 $xa3,@key[3] | |
3408 | ||
3409 | vpshufd \$0x00,$xb3,$xb0 | |
3410 | vpshufd \$0x55,$xb3,$xb1 | |
3411 | vpshufd \$0xaa,$xb3,$xb2 | |
3412 | vpshufd \$0xff,$xb3,$xb3 | |
3413 | vmovdqa64 $xb0,@key[4] | |
3414 | vmovdqa64 $xb1,@key[5] | |
3415 | vmovdqa64 $xb2,@key[6] | |
3416 | vmovdqa64 $xb3,@key[7] | |
3417 | ||
3418 | vpshufd \$0x00,$xc3,$xc0 | |
3419 | vpshufd \$0x55,$xc3,$xc1 | |
3420 | vpshufd \$0xaa,$xc3,$xc2 | |
3421 | vpshufd \$0xff,$xc3,$xc3 | |
3422 | vmovdqa64 $xc0,@key[8] | |
3423 | vmovdqa64 $xc1,@key[9] | |
3424 | vmovdqa64 $xc2,@key[10] | |
3425 | vmovdqa64 $xc3,@key[11] | |
3426 | ||
3427 | vpshufd \$0x00,$xd3,$xd0 | |
3428 | vpshufd \$0x55,$xd3,$xd1 | |
3429 | vpshufd \$0xaa,$xd3,$xd2 | |
3430 | vpshufd \$0xff,$xd3,$xd3 | |
3431 | vpaddd .Lincy(%rip),$xd0,$xd0 # don't save counters yet | |
3432 | vmovdqa64 $xd0,@key[12] | |
3433 | vmovdqa64 $xd1,@key[13] | |
3434 | vmovdqa64 $xd2,@key[14] | |
3435 | vmovdqa64 $xd3,@key[15] | |
3436 | ||
3437 | mov \$10,%eax | |
3438 | jmp .Loop8xvl | |
3439 | ||
3440 | .align 32 | |
3441 | .Loop_outer8xvl: | |
3442 | #vpbroadcastd 0(%r10),$xa0 # reload key | |
3443 | #vpbroadcastd 4(%r10),$xa1 | |
3444 | vpbroadcastd 8(%r10),$xa2 | |
3445 | vpbroadcastd 12(%r10),$xa3 | |
3446 | vpaddd .Leight(%rip),@key[12],@key[12] # next SIMD counters | |
3447 | vmovdqa64 @key[4],$xb0 | |
3448 | vmovdqa64 @key[5],$xb1 | |
3449 | vmovdqa64 @key[6],$xb2 | |
3450 | vmovdqa64 @key[7],$xb3 | |
3451 | vmovdqa64 @key[8],$xc0 | |
3452 | vmovdqa64 @key[9],$xc1 | |
3453 | vmovdqa64 @key[10],$xc2 | |
3454 | vmovdqa64 @key[11],$xc3 | |
3455 | vmovdqa64 @key[12],$xd0 | |
3456 | vmovdqa64 @key[13],$xd1 | |
3457 | vmovdqa64 @key[14],$xd2 | |
3458 | vmovdqa64 @key[15],$xd3 | |
3459 | ||
3460 | vmovdqa64 $xa0,@key[0] | |
3461 | vmovdqa64 $xa1,@key[1] | |
3462 | vmovdqa64 $xa2,@key[2] | |
3463 | vmovdqa64 $xa3,@key[3] | |
3464 | ||
3465 | mov \$10,%eax | |
3466 | jmp .Loop8xvl | |
3467 | ||
3468 | .align 32 | |
3469 | .Loop8xvl: | |
3470 | ___ | |
# Body of .Loop8xvl: one round over the index sets starting at (0,4,8,12)
# followed by one over those starting at (0,5,10,15).  AVX512_lane_ROUND
# returns perlasm call strings and each one is eval'ed in turn
# (presumably appending one instruction to $code; the called helpers are
# defined outside this section).
3471 | foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; } | |
3472 | foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; } | |
# Close the loop on %eax, add the saved @key[0..3] to the a-registers and
# begin the "de-interlace" with dword/qword unpacks.
3473 | $code.=<<___; | |
3474 | dec %eax | |
3475 | jnz .Loop8xvl | |
3476 | ||
3477 | vpaddd @key[0],$xa0,$xa0 # accumulate key | |
3478 | vpaddd @key[1],$xa1,$xa1 | |
3479 | vpaddd @key[2],$xa2,$xa2 | |
3480 | vpaddd @key[3],$xa3,$xa3 | |
3481 | ||
3482 | vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data | |
3483 | vpunpckldq $xa3,$xa2,$xt3 | |
3484 | vpunpckhdq $xa1,$xa0,$xa0 | |
3485 | vpunpckhdq $xa3,$xa2,$xa2 | |
3486 | vpunpcklqdq $xt3,$xt2,$xa1 # "a0" | |
3487 | vpunpckhqdq $xt3,$xt2,$xt2 # "a1" | |
3488 | vpunpcklqdq $xa2,$xa0,$xa3 # "a2" | |
3489 | vpunpckhqdq $xa2,$xa0,$xa0 # "a3" | |
3490 | ___ | |
# Perl-side renaming only, no instruction is emitted: $xa0..$xa3 are made
# to name the registers tagged "a0".."a3" above and $xt2 takes over the
# register that is no longer needed.  The same pattern repeats for b, c, d.
3491 | ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); | |
3492 | $code.=<<___; | |
3493 | vpaddd @key[4],$xb0,$xb0 | |
3494 | vpaddd @key[5],$xb1,$xb1 | |
3495 | vpaddd @key[6],$xb2,$xb2 | |
3496 | vpaddd @key[7],$xb3,$xb3 | |
3497 | ||
3498 | vpunpckldq $xb1,$xb0,$xt2 | |
3499 | vpunpckldq $xb3,$xb2,$xt3 | |
3500 | vpunpckhdq $xb1,$xb0,$xb0 | |
3501 | vpunpckhdq $xb3,$xb2,$xb2 | |
3502 | vpunpcklqdq $xt3,$xt2,$xb1 # "b0" | |
3503 | vpunpckhqdq $xt3,$xt2,$xt2 # "b1" | |
3504 | vpunpcklqdq $xb2,$xb0,$xb3 # "b2" | |
3505 | vpunpckhqdq $xb2,$xb0,$xb0 # "b3" | |
3506 | ___ | |
3507 | ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); | |
3508 | $code.=<<___; | |
3509 | vshufi32x4 \$0,$xb0,$xa0,$xt3 # "de-interlace" further | |
3510 | vshufi32x4 \$3,$xb0,$xa0,$xb0 | |
3511 | vshufi32x4 \$0,$xb1,$xa1,$xa0 | |
3512 | vshufi32x4 \$3,$xb1,$xa1,$xb1 | |
3513 | vshufi32x4 \$0,$xb2,$xa2,$xa1 | |
3514 | vshufi32x4 \$3,$xb2,$xa2,$xb2 | |
3515 | vshufi32x4 \$0,$xb3,$xa3,$xa2 | |
3516 | vshufi32x4 \$3,$xb3,$xa3,$xb3 | |
3517 | ___ | |
# Follow the destinations of the \$0 shuffles: each result landed one
# register "earlier", with the first one in $xt3.
3518 | ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3); | |
3519 | $code.=<<___; | |
3520 | vpaddd @key[8],$xc0,$xc0 | |
3521 | vpaddd @key[9],$xc1,$xc1 | |
3522 | vpaddd @key[10],$xc2,$xc2 | |
3523 | vpaddd @key[11],$xc3,$xc3 | |
3524 | ||
3525 | vpunpckldq $xc1,$xc0,$xt2 | |
3526 | vpunpckldq $xc3,$xc2,$xt3 | |
3527 | vpunpckhdq $xc1,$xc0,$xc0 | |
3528 | vpunpckhdq $xc3,$xc2,$xc2 | |
3529 | vpunpcklqdq $xt3,$xt2,$xc1 # "c0" | |
3530 | vpunpckhqdq $xt3,$xt2,$xt2 # "c1" | |
3531 | vpunpcklqdq $xc2,$xc0,$xc3 # "c2" | |
3532 | vpunpckhqdq $xc2,$xc0,$xc0 # "c3" | |
3533 | ___ | |
3534 | ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); | |
3535 | $code.=<<___; | |
3536 | vpaddd @key[12],$xd0,$xd0 | |
3537 | vpaddd @key[13],$xd1,$xd1 | |
3538 | vpaddd @key[14],$xd2,$xd2 | |
3539 | vpaddd @key[15],$xd3,$xd3 | |
3540 | ||
3541 | vpunpckldq $xd1,$xd0,$xt2 | |
3542 | vpunpckldq $xd3,$xd2,$xt3 | |
3543 | vpunpckhdq $xd1,$xd0,$xd0 | |
3544 | vpunpckhdq $xd3,$xd2,$xd2 | |
3545 | vpunpcklqdq $xt3,$xt2,$xd1 # "d0" | |
3546 | vpunpckhqdq $xt3,$xt2,$xt2 # "d1" | |
3547 | vpunpcklqdq $xd2,$xd0,$xd3 # "d2" | |
3548 | vpunpckhqdq $xd2,$xd0,$xd0 # "d3" | |
3549 | ___ | |
3550 | ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); | |
3551 | $code.=<<___; | |
3552 | vperm2i128 \$0x20,$xd0,$xc0,$xt3 # "de-interlace" further | |
3553 | vperm2i128 \$0x31,$xd0,$xc0,$xd0 | |
3554 | vperm2i128 \$0x20,$xd1,$xc1,$xc0 | |
3555 | vperm2i128 \$0x31,$xd1,$xc1,$xd1 | |
3556 | vperm2i128 \$0x20,$xd2,$xc2,$xc1 | |
3557 | vperm2i128 \$0x31,$xd2,$xc2,$xd2 | |
3558 | vperm2i128 \$0x20,$xd3,$xc3,$xc2 | |
3559 | vperm2i128 \$0x31,$xd3,$xc3,$xd3 | |
3560 | ___ | |
3561 | ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3); | |
# Exchange the b and c names so that the output code below can address the
# registers in a,b,c,d order.
3562 | ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)= | |
3563 | ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3); | |
# Output stage.  With at least 64*8 bytes left, all sixteen registers are
# XORed with the input and stored, 0x80 bytes per group; %eax holds 0x80 so
# the pointer updates can use register-indexed lea.  Before looping back to
# .Loop_outer8xvl, %ymm0 and %ymm1 are reloaded from 0(%r10) and 4(%r10).
# Shorter remainders branch to .Ltail8xvl, which first copies $xa0 to %ymm8.
3564 | $code.=<<___; | |
3565 | cmp \$64*8,$len | |
3566 | jb .Ltail8xvl | |
3567 | ||
3568 | mov \$0x80,%eax # size optimization | |
3569 | vpxord 0x00($inp),$xa0,$xa0 # xor with input | |
3570 | vpxor 0x20($inp),$xb0,$xb0 | |
3571 | vpxor 0x40($inp),$xc0,$xc0 | |
3572 | vpxor 0x60($inp),$xd0,$xd0 | |
3573 | lea ($inp,%rax),$inp # size optimization | |
3574 | vmovdqu32 $xa0,0x00($out) | |
3575 | vmovdqu $xb0,0x20($out) | |
3576 | vmovdqu $xc0,0x40($out) | |
3577 | vmovdqu $xd0,0x60($out) | |
3578 | lea ($out,%rax),$out # size optimization | |
3579 | ||
3580 | vpxor 0x00($inp),$xa1,$xa1 | |
3581 | vpxor 0x20($inp),$xb1,$xb1 | |
3582 | vpxor 0x40($inp),$xc1,$xc1 | |
3583 | vpxor 0x60($inp),$xd1,$xd1 | |
3584 | lea ($inp,%rax),$inp # size optimization | |
3585 | vmovdqu $xa1,0x00($out) | |
3586 | vmovdqu $xb1,0x20($out) | |
3587 | vmovdqu $xc1,0x40($out) | |
3588 | vmovdqu $xd1,0x60($out) | |
3589 | lea ($out,%rax),$out # size optimization | |
3590 | ||
3591 | vpxord 0x00($inp),$xa2,$xa2 | |
3592 | vpxor 0x20($inp),$xb2,$xb2 | |
3593 | vpxor 0x40($inp),$xc2,$xc2 | |
3594 | vpxor 0x60($inp),$xd2,$xd2 | |
3595 | lea ($inp,%rax),$inp # size optimization | |
3596 | vmovdqu32 $xa2,0x00($out) | |
3597 | vmovdqu $xb2,0x20($out) | |
3598 | vmovdqu $xc2,0x40($out) | |
3599 | vmovdqu $xd2,0x60($out) | |
3600 | lea ($out,%rax),$out # size optimization | |
3601 | ||
3602 | vpxor 0x00($inp),$xa3,$xa3 | |
3603 | vpxor 0x20($inp),$xb3,$xb3 | |
3604 | vpxor 0x40($inp),$xc3,$xc3 | |
3605 | vpxor 0x60($inp),$xd3,$xd3 | |
3606 | lea ($inp,%rax),$inp # size optimization | |
3607 | vmovdqu $xa3,0x00($out) | |
3608 | vmovdqu $xb3,0x20($out) | |
3609 | vmovdqu $xc3,0x40($out) | |
3610 | vmovdqu $xd3,0x60($out) | |
3611 | lea ($out,%rax),$out # size optimization | |
3612 | ||
3613 | vpbroadcastd 0(%r10),%ymm0 # reload key | |
3614 | vpbroadcastd 4(%r10),%ymm1 | |
3615 | ||
3616 | sub \$64*8,$len | |
3617 | jnz .Loop_outer8xvl | |
3618 | ||
3619 | jmp .Ldone8xvl | |
3620 | ||
3621 | .align 32 | |
3622 | .Ltail8xvl: | |
3623 | vmovdqa64 $xa0,%ymm8 # size optimization | |
3624 | ___ | |
# From here on $xa0 names %ymm8, matching the copy emitted just above.
3625 | $xa0 = "%ymm8"; | |
# Tail: whole 64-byte blocks are consumed one register pair at a time;
# .Less_than_64_8xvl finishes a partial block byte by byte from a copy of the
# next pair stored at (%rsp), and those 64 stack bytes are overwritten with
# zeros once the byte loop is done.
3626 | $code.=<<___; | |
3627 | xor %r10,%r10 | |
3628 | sub $inp,$out | |
3629 | cmp \$64*1,$len | |
3630 | jb .Less_than_64_8xvl | |
3631 | vpxor 0x00($inp),$xa0,$xa0 # xor with input | |
3632 | vpxor 0x20($inp),$xb0,$xb0 | |
3633 | vmovdqu $xa0,0x00($out,$inp) | |
3634 | vmovdqu $xb0,0x20($out,$inp) | |
3635 | je .Ldone8xvl | |
3636 | vmovdqa $xc0,$xa0 | |
3637 | vmovdqa $xd0,$xb0 | |
3638 | lea 64($inp),$inp | |
3639 | ||
3640 | cmp \$64*2,$len | |
3641 | jb .Less_than_64_8xvl | |
3642 | vpxor 0x00($inp),$xc0,$xc0 | |
3643 | vpxor 0x20($inp),$xd0,$xd0 | |
3644 | vmovdqu $xc0,0x00($out,$inp) | |
3645 | vmovdqu $xd0,0x20($out,$inp) | |
3646 | je .Ldone8xvl | |
3647 | vmovdqa $xa1,$xa0 | |
3648 | vmovdqa $xb1,$xb0 | |
3649 | lea 64($inp),$inp | |
3650 | ||
3651 | cmp \$64*3,$len | |
3652 | jb .Less_than_64_8xvl | |
3653 | vpxor 0x00($inp),$xa1,$xa1 | |
3654 | vpxor 0x20($inp),$xb1,$xb1 | |
3655 | vmovdqu $xa1,0x00($out,$inp) | |
3656 | vmovdqu $xb1,0x20($out,$inp) | |
3657 | je .Ldone8xvl | |
3658 | vmovdqa $xc1,$xa0 | |
3659 | vmovdqa $xd1,$xb0 | |
3660 | lea 64($inp),$inp | |
3661 | ||
3662 | cmp \$64*4,$len | |
3663 | jb .Less_than_64_8xvl | |
3664 | vpxor 0x00($inp),$xc1,$xc1 | |
3665 | vpxor 0x20($inp),$xd1,$xd1 | |
3666 | vmovdqu $xc1,0x00($out,$inp) | |
3667 | vmovdqu $xd1,0x20($out,$inp) | |
3668 | je .Ldone8xvl | |
3669 | vmovdqa32 $xa2,$xa0 | |
3670 | vmovdqa $xb2,$xb0 | |
3671 | lea 64($inp),$inp | |
3672 | ||
3673 | cmp \$64*5,$len | |
3674 | jb .Less_than_64_8xvl | |
3675 | vpxord 0x00($inp),$xa2,$xa2 | |
3676 | vpxor 0x20($inp),$xb2,$xb2 | |
3677 | vmovdqu32 $xa2,0x00($out,$inp) | |
3678 | vmovdqu $xb2,0x20($out,$inp) | |
3679 | je .Ldone8xvl | |
3680 | vmovdqa $xc2,$xa0 | |
3681 | vmovdqa $xd2,$xb0 | |
3682 | lea 64($inp),$inp | |
3683 | ||
3684 | cmp \$64*6,$len | |
3685 | jb .Less_than_64_8xvl | |
3686 | vpxor 0x00($inp),$xc2,$xc2 | |
3687 | vpxor 0x20($inp),$xd2,$xd2 | |
3688 | vmovdqu $xc2,0x00($out,$inp) | |
3689 | vmovdqu $xd2,0x20($out,$inp) | |
3690 | je .Ldone8xvl | |
3691 | vmovdqa $xa3,$xa0 | |
3692 | vmovdqa $xb3,$xb0 | |
3693 | lea 64($inp),$inp | |
3694 | ||
3695 | cmp \$64*7,$len | |
3696 | jb .Less_than_64_8xvl | |
3697 | vpxor 0x00($inp),$xa3,$xa3 | |
3698 | vpxor 0x20($inp),$xb3,$xb3 | |
3699 | vmovdqu $xa3,0x00($out,$inp) | |
3700 | vmovdqu $xb3,0x20($out,$inp) | |
3701 | je .Ldone8xvl | |
3702 | vmovdqa $xc3,$xa0 | |
3703 | vmovdqa $xd3,$xb0 | |
3704 | lea 64($inp),$inp | |
3705 | ||
3706 | .Less_than_64_8xvl: | |
3707 | vmovdqa $xa0,0x00(%rsp) | |
3708 | vmovdqa $xb0,0x20(%rsp) | |
3709 | lea ($out,$inp),$out | |
3710 | and \$63,$len | |
3711 | ||
3712 | .Loop_tail8xvl: | |
3713 | movzb ($inp,%r10),%eax | |
3714 | movzb (%rsp,%r10),%ecx | |
3715 | lea 1(%r10),%r10 | |
3716 | xor %ecx,%eax | |
3717 | mov %al,-1($out,%r10) | |
3718 | dec $len | |
3719 | jnz .Loop_tail8xvl | |
3720 | ||
3721 | vpxor $xa0,$xa0,$xa0 | |
3722 | vmovdqa $xa0,0x00(%rsp) | |
3723 | vmovdqa $xa0,0x20(%rsp) | |
3724 | ||
3725 | .Ldone8xvl: | |
3726 | vzeroall | |
3727 | ___ | |
# Win64 only: reload xmm6-xmm15 from their slots below %r9.
3728 | $code.=<<___ if ($win64); | |
3729 | movaps -0xa8(%r9),%xmm6 | |
3730 | movaps -0x98(%r9),%xmm7 | |
3731 | movaps -0x88(%r9),%xmm8 | |
3732 | movaps -0x78(%r9),%xmm9 | |
3733 | movaps -0x68(%r9),%xmm10 | |
3734 | movaps -0x58(%r9),%xmm11 | |
3735 | movaps -0x48(%r9),%xmm12 | |
3736 | movaps -0x38(%r9),%xmm13 | |
3737 | movaps -0x28(%r9),%xmm14 | |
3738 | movaps -0x18(%r9),%xmm15 | |
3739 | ___ | |
# Put %rsp back from the %r9 frame register and return.
3740 | $code.=<<___; | |
3741 | lea (%r9),%rsp | |
3742 | .cfi_def_cfa_register %rsp | |
3743 | .L8xvl_epilogue: | |
3744 | ret | |
3745 | .cfi_endproc | |
3746 | .size ChaCha20_8xvl,.-ChaCha20_8xvl | |
3747 | ___ | |
abb8c44f AP |
3748 | } |
3749 | ||
384e6de4 AP |
3750 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, |
3751 | # CONTEXT *context,DISPATCHER_CONTEXT *disp) | |
3752 | if ($win64) { | |
3753 | $rec="%rcx"; | |
3754 | $frame="%rdx"; | |
3755 | $context="%r8"; | |
3756 | $disp="%r9"; | |
3757 | ||
3758 | $code.=<<___; | |
3759 | .extern __imp_RtlVirtualUnwind | |
3760 | .type se_handler,\@abi-omnipotent | |
3761 | .align 16 | |
3762 | se_handler: | |
3763 | push %rsi | |
3764 | push %rdi | |
3765 | push %rbx | |
3766 | push %rbp | |
3767 | push %r12 | |
3768 | push %r13 | |
3769 | push %r14 | |
3770 | push %r15 | |
3771 | pushfq | |
3772 | sub \$64,%rsp | |
3773 | |
3774 | mov 120($context),%rax # pull context->Rax | |
3775 | mov 248($context),%rbx # pull context->Rip | |
3776 | |
3777 | mov 8($disp),%rsi # disp->ImageBase | |
3778 | mov 56($disp),%r11 # disp->HandlerData | |
3779 | |
3780 | lea .Lctr32_body(%rip),%r10 | |
3781 | cmp %r10,%rbx # context->Rip<.Lctr32_body | |
3782 | jb .Lcommon_seh_tail | |
3783 | |
3784 | mov 152($context),%rax # pull context->Rsp | |
3785 | |
3786 | lea .Lno_data(%rip),%r10 # epilogue label | |
3787 | cmp %r10,%rbx # context->Rip>=.Lno_data | |
3788 | jae .Lcommon_seh_tail | |
3789 | |
3790 | lea 64+24+48(%rax),%rax | |
3791 | |
3792 | mov -8(%rax),%rbx | |
3793 | mov -16(%rax),%rbp | |
3794 | mov -24(%rax),%r12 | |
3795 | mov -32(%rax),%r13 | |
3796 | mov -40(%rax),%r14 | |
3797 | mov -48(%rax),%r15 | |
3798 | mov %rbx,144($context) # restore context->Rbx | |
3799 | mov %rbp,160($context) # restore context->Rbp | |
3800 | mov %r12,216($context) # restore context->R12 | |
3801 | mov %r13,224($context) # restore context->R13 | |
3802 | mov %r14,232($context) # restore context->R14 | |
3803 | mov %r15,240($context) # restore context->R15 | |
3804 | |
3805 | .Lcommon_seh_tail: | |
3806 | mov 8(%rax),%rdi | |
3807 | mov 16(%rax),%rsi | |
3808 | mov %rax,152($context) # restore context->Rsp | |
3809 | mov %rsi,168($context) # restore context->Rsi | |
3810 | mov %rdi,176($context) # restore context->Rdi | |
3811 | |
3812 | mov 40($disp),%rdi # disp->ContextRecord | |
3813 | mov $context,%rsi # context | |
3814 | mov \$154,%ecx # sizeof(CONTEXT) in qwords | |
3815 | .long 0xa548f3fc # cld; rep movsq | |
3816 | |
3817 | mov $disp,%rsi | |
3818 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | |
3819 | mov 8(%rsi),%rdx # arg2, disp->ImageBase | |
3820 | mov 0(%rsi),%r8 # arg3, disp->ControlPc | |
3821 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | |
3822 | mov 40(%rsi),%r10 # disp->ContextRecord | |
3823 | lea 56(%rsi),%r11 # &disp->HandlerData | |
3824 | lea 24(%rsi),%r12 # &disp->EstablisherFrame | |
3825 | mov %r10,32(%rsp) # arg5 | |
3826 | mov %r11,40(%rsp) # arg6 | |
3827 | mov %r12,48(%rsp) # arg7 | |
3828 | mov %rcx,56(%rsp) # arg8, (NULL) | |
3829 | call *__imp_RtlVirtualUnwind(%rip) | |
3830 | |
3831 | mov \$1,%eax # ExceptionContinueSearch | |
3832 | add \$64,%rsp | |
3833 | popfq | |
3834 | pop %r15 | |
3835 | pop %r14 | |
3836 | pop %r13 | |
3837 | pop %r12 | |
3838 | pop %rbp | |
3839 | pop %rbx | |
3840 | pop %rdi | |
3841 | pop %rsi | |
3842 | ret | |
3843 | .size se_handler,.-se_handler | |
3844 | ||
d5487a45 | 3845 | .type simd_handler,\@abi-omnipotent |
384e6de4 | 3846 | .align 16 |
d5487a45 | 3847 | simd_handler: |
384e6de4 AP |
3848 | push %rsi |
3849 | push %rdi | |
3850 | push %rbx | |
3851 | push %rbp | |
3852 | push %r12 | |
3853 | push %r13 | |
3854 | push %r14 | |
3855 | push %r15 | |
3856 | pushfq | |
3857 | sub \$64,%rsp | |
3858 | |
3859 | mov 120($context),%rax # pull context->Rax | |
3860 | mov 248($context),%rbx # pull context->Rip | |
3861 | |
3862 | mov 8($disp),%rsi # disp->ImageBase | |
3863 | mov 56($disp),%r11 # disp->HandlerData | |
3864 | |
3865 | mov 0(%r11),%r10d # HandlerData[0], RVA of body label | |
3866 | lea (%rsi,%r10),%r10 # prologue label (ImageBase+RVA) | |
3867 | cmp %r10,%rbx # context->Rip<prologue label | |
3868 | jb .Lcommon_seh_tail | |
3869 | |
3870 | mov 192($context),%rax # pull context->R9, from which the epilogue restores %rsp | |
3871 | |
3872 | mov 4(%r11),%r10d # HandlerData[1], RVA of epilogue label | |
d5487a45 | 3873 | mov 8(%r11),%ecx # HandlerData[2], byte size of saved %xmm area |
384e6de4 AP |
3874 | lea (%rsi,%r10),%r10 # epilogue label (ImageBase+RVA) |
3875 | cmp %r10,%rbx # context->Rip>=epilogue label | |
3876 | jae .Lcommon_seh_tail | |
3877 | |
d5487a45 AP |
3878 | neg %rcx |
3879 | lea -8(%rax,%rcx),%rsi | |
384e6de4 | 3880 | lea 512($context),%rdi # &context.Xmm6, destination for the saved %xmm6 and up |
d5487a45 AP |
3881 | neg %ecx |
3882 | shr \$3,%ecx | |
384e6de4 AP |
3883 | .long 0xa548f3fc # cld; rep movsq, copies %ecx qwords from (%rsi) to (%rdi) |
3884 | |
3885 | jmp .Lcommon_seh_tail | |
d5487a45 | 3886 | .size simd_handler,.-simd_handler |
384e6de4 AP |
3887 | |
3888 | .section .pdata | |
3889 | .align 4 | |
3890 | .rva .LSEH_begin_ChaCha20_ctr32 | |
3891 | .rva .LSEH_end_ChaCha20_ctr32 | |
3892 | .rva .LSEH_info_ChaCha20_ctr32 | |
3893 | ||
3894 | .rva .LSEH_begin_ChaCha20_ssse3 | |
3895 | .rva .LSEH_end_ChaCha20_ssse3 | |
3896 | .rva .LSEH_info_ChaCha20_ssse3 | |
3897 | ||
d5487a45 AP |
3898 | .rva .LSEH_begin_ChaCha20_128 |
3899 | .rva .LSEH_end_ChaCha20_128 | |
3900 | .rva .LSEH_info_ChaCha20_128 | |
3901 | ||
384e6de4 AP |
3902 | .rva .LSEH_begin_ChaCha20_4x |
3903 | .rva .LSEH_end_ChaCha20_4x | |
3904 | .rva .LSEH_info_ChaCha20_4x | |
3905 | ___ | |
3906 | $code.=<<___ if ($avx); | |
3907 | .rva .LSEH_begin_ChaCha20_4xop | |
3908 | .rva .LSEH_end_ChaCha20_4xop | |
3909 | .rva .LSEH_info_ChaCha20_4xop | |
3910 | ___ | |
3911 | $code.=<<___ if ($avx>1); | |
3912 | .rva .LSEH_begin_ChaCha20_8x | |
3913 | .rva .LSEH_end_ChaCha20_8x | |
3914 | .rva .LSEH_info_ChaCha20_8x | |
3915 | ___ | |
3916 | $code.=<<___ if ($avx>2); | |
3917 | .rva .LSEH_begin_ChaCha20_avx512 | |
3918 | .rva .LSEH_end_ChaCha20_avx512 | |
3919 | .rva .LSEH_info_ChaCha20_avx512 | |
3920 | ||
cded9513 AP |
3921 | .rva .LSEH_begin_ChaCha20_avx512vl |
3922 | .rva .LSEH_end_ChaCha20_avx512vl | |
3923 | .rva .LSEH_info_ChaCha20_avx512vl | |
3924 | ||
384e6de4 AP |
3925 | .rva .LSEH_begin_ChaCha20_16x |
3926 | .rva .LSEH_end_ChaCha20_16x | |
3927 | .rva .LSEH_info_ChaCha20_16x | |
cded9513 AP |
3928 | |
3929 | .rva .LSEH_begin_ChaCha20_8xvl | |
3930 | .rva .LSEH_end_ChaCha20_8xvl | |
3931 | .rva .LSEH_info_ChaCha20_8xvl | |
384e6de4 AP |
3932 | ___ |
3933 | $code.=<<___; | |
3934 | .section .xdata | |
3935 | .align 8 | |
3936 | .LSEH_info_ChaCha20_ctr32: | |
3937 | .byte 9,0,0,0 | |
3938 | .rva se_handler | |
3939 | |
3940 | .LSEH_info_ChaCha20_ssse3: | |
3941 | .byte 9,0,0,0 | |
d5487a45 | 3942 | .rva simd_handler |
384e6de4 | 3943 | .rva .Lssse3_body,.Lssse3_epilogue |
d5487a45 AP |
3944 | .long 0x20,0 |
3945 | |
3946 | .LSEH_info_ChaCha20_128: | |
3947 | .byte 9,0,0,0 | |
3948 | .rva simd_handler | |
3949 | .rva .L128_body,.L128_epilogue | |
3950 | .long 0x60,0 | |
384e6de4 AP |
3951 | |
3952 | .LSEH_info_ChaCha20_4x: | |
3953 | .byte 9,0,0,0 | |
d5487a45 | 3954 | .rva simd_handler |
384e6de4 | 3955 | .rva .L4x_body,.L4x_epilogue |
d5487a45 | 3956 | .long 0xa0,0 |
384e6de4 AP |
3957 | ___ |
3958 | $code.=<<___ if ($avx); | |
3959 | .LSEH_info_ChaCha20_4xop: | |
3960 | .byte 9,0,0,0 | |
d5487a45 | 3961 | .rva simd_handler |
384e6de4 | 3962 | .rva .L4xop_body,.L4xop_epilogue # HandlerData[0..1]; .long below is [2], %xmm save size |
d5487a45 | 3963 | .long 0xa0,0 |
384e6de4 AP |
3964 | ___ |
3965 | $code.=<<___ if ($avx>1); | |
3966 | .LSEH_info_ChaCha20_8x: | |
3967 | .byte 9,0,0,0 | |
d5487a45 | 3968 | .rva simd_handler |
384e6de4 | 3969 | .rva .L8x_body,.L8x_epilogue # HandlerData[0..1]; .long below is [2], %xmm save size |
d5487a45 | 3970 | .long 0xa0,0 |
384e6de4 AP |
3971 | ___ |
3972 | $code.=<<___ if ($avx>2); | |
3973 | .LSEH_info_ChaCha20_avx512: | |
3974 | .byte 9,0,0,0 | |
d5487a45 | 3975 | .rva simd_handler |
384e6de4 | 3976 | .rva .Lavx512_body,.Lavx512_epilogue # HandlerData[0..1]; .long below is [2], %xmm save size |
d5487a45 | 3977 | .long 0x20,0 |
384e6de4 | 3978 | |
cded9513 AP |
3979 | .LSEH_info_ChaCha20_avx512vl: |
3980 | .byte 9,0,0,0 | |
d5487a45 | 3981 | .rva simd_handler |
cded9513 | 3982 | .rva .Lavx512vl_body,.Lavx512vl_epilogue # HandlerData[0..1]; .long below is [2], %xmm save size |
d5487a45 | 3983 | .long 0x20,0 |
cded9513 | 3984 | |
384e6de4 AP |
3985 | .LSEH_info_ChaCha20_16x: |
3986 | .byte 9,0,0,0 | |
d5487a45 | 3987 | .rva simd_handler |
384e6de4 | 3988 | .rva .L16x_body,.L16x_epilogue # HandlerData[0..1]; .long below is [2], %xmm save size |
d5487a45 | 3989 | .long 0xa0,0 |
cded9513 AP |
3990 | |
3991 | .LSEH_info_ChaCha20_8xvl: | |
3992 | .byte 9,0,0,0 | |
d5487a45 | 3993 | .rva simd_handler |
cded9513 | 3994 | .rva .L8xvl_body,.L8xvl_epilogue # HandlerData[0..1]; .long below is [2], %xmm save size |
d5487a45 | 3995 | .long 0xa0,0 |
384e6de4 AP |
3996 | ___ |
3997 | } | |
3998 | ||
a98c648e | 3999 | foreach (split("\n",$code)) { |
3c274a6e | 4000 | s/\`([^\`]*)\`/eval $1/ge; |
a98c648e | 4001 | |
3c274a6e | 4002 | s/%x#%[yz]/%x/g; # "down-shift": %x#%ymmN or %x#%zmmN becomes %xmmN |
a98c648e AP |
4003 | |
4004 | print $_,"\n"; | |
4005 | } | |
4006 | ||
a21314db | 4007 | close STDOUT or die "error closing STDOUT: $!"; |