]>
git.ipfire.org Git - thirdparty/openssl.git/blob - crypto/chacha/asm/chacha-x86.pl
24c6966d3e4a7fae0772523b5c1b70ac5372db83
2 # Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
21 # Performance in cycles per byte out of large buffer.
27 # Core2 9.56/+89% 4.83
28 # Westmere 9.50/+45% 3.35
29 # Sandy Bridge 10.5/+47% 3.20
30 # Haswell 8.15/+50% 2.83
31 # Skylake 7.53/+22% 2.75
32 # Silvermont 17.4/+36% 8.35
33 # Goldmont 13.4/+40% 4.36
34 # Sledgehammer 10.2/+54%
35 # Bulldozer 13.4/+50% 4.38(*)
37 # (*) Bulldozer actually executes 4xXOP code path that delivers 3.55;
# Split the path of this script ($0) at its last '/' or '\'; $dir receives the
# directory part, including the trailing separator.
39 $0 =~ m/(.*[\/\\])[^\
/\\]+$/; $dir=$1;
# Make the script's own directory and the sibling ../../perlasm directory
# searchable for modules.
40 push(@INC,"${dir}","${dir}../../perlasm");
# Pop the last command-line argument as the output file name and, when it is
# a true value, reopen STDOUT onto that file.
43 $output = pop and open STDOUT
,">$output";
# asm_init is defined outside this excerpt; it is handed the first argument
# and a flag telling whether the last remaining argument is the string "386".
45 &asm_init
($ARGV[0],$ARGV[$#ARGV] eq "386");
# $xmm is switched on when -DOPENSSL_IA32_SSE2 appears among the arguments.
48 for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); }
# NOTE(review): the opening of the next statement is not present in this
# excerpt. The visible tail runs $ENV{CC} on an empty assembler input, captures
# the reported GNU assembler version into $gasver and compares it with 2.19.
# NOTE(review): here and below versions are compared as decimal numbers, so a
# string such as "2.9" compares greater than 2.19 -- confirm this is intended.
51 `$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
52 =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
53 ($gasver=$1)>=2.19); # first version supporting AVX
# Each of the remaining probes is attempted only while $xmm is set and $ymm is
# still unset: NASM (flavour "win32n"), ml (flavour "win32"), then clang/LLVM.
55 $ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32n" &&
56 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
57 $1>=2.03); # first version supporting AVX
59 $ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32" &&
60 `ml 2>&1` =~ /Version ([0-9]+)\./ &&
61 $1>=10); # first version supporting AVX
# The version number is the second capture group here, hence $2.
63 $ymm=1 if ($xmm && !$ymm &&
64 `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([0-9]+\.[0-9]+)/ &&
65 $2>=3.0); # first version supporting AVX
# Register names for the scalar code path: b, c and d each get a primary
# register and an alternate ("_") register.
68 ($b,$b_)=("ebx","ebp");
69 ($c,$c_)=("ecx","esi");
70 ($d,$d_)=("edx","edi");
# NOTE(review): the `sub` header and closing brace that enclose the statements
# below are not present in this excerpt. Judging by the call sites further
# down, this is the body of QUARTERROUND(ai, bi, ci, di, i): four state-word
# indices plus the position $i (0..7) of the call within the eight-call round
# sequence. Only the load/store scheduling is visible here; the add/xor/rotate
# instructions of the quarter-round itself are missing from the excerpt.
73 my ($ai,$bi,$ci,$di,$i)=@_;
# ($_&~3) keeps the index rounded down to a multiple of 4; the second term
# steps the low two bits by +1 (next) or -1 (previous), wrapping modulo 4.
74 my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di)); # next
75 my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di)); # previous
# The two comment lines below are what is left of a larger index diagram.
79 # 0 4 8 12 < even round
83 # 0 5 10 15 < odd round
# NOTE(review): the conditions selecting among the next four adjustments, and
# the definition of $j, are not visible in this excerpt.
90 ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp));
93 ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn));
96 ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp));
99 ($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn));
102 #&add ($a,$b); # see elsewhere
# State words are addressed as 4*index off esp. While $ai is 1 or 2 the
# alternate c register is written back to the previous c slot ...
104 &mov
(&DWP
(4*$cp,"esp"),$c_) if ($ai>0 && $ai<3);
# ... and, on every call but the first, the alternate b register is written
# back to the previous b slot.
106 &mov
(&DWP
(4*$bp,"esp"),$b_) if ($i!=0);
# Fetch the operands for the following call into the alternate registers.
108 &mov
($c_,&DWP
(4*$cn,"esp")) if ($ai>0 && $ai<3);
110 &mov
($d_,&DWP
(4*$dn,"esp")) if ($di!=$dn);
112 &mov
($b_,&DWP
(4*$bn,"esp")) if ($i<7);
# On the last call of the sequence $b_ receives the dword at 128(esp) instead.
113 &mov
($b_,&DWP
(128,"esp")) if ($i==7); # loop counter
# Store the current a word and load the next one into the same register.
116 &mov
(&DWP
(4*$ai,"esp"),$a);
118 &mov
($a,&DWP
(4*$an,"esp"));
# d is stored to its slot when the next call uses a different d index;
# otherwise it is copied register-to-register into $d_.
120 &mov
(&DWP
(4*$di,"esp"),$d) if ($di!=$dn);
121 &mov
($d_,$d) if ($di==$dn);
# Opening add of the following quarter-round, issued early (this is the
# "elsewhere" the commented-out &add above refers to).
123 &add
($a,$b_) if ($i<7); # elsewhere
# Labels declared via static_label ahead of use; they are referenced from
# more than one of the functions generated below.
131 &static_label
("ssse3_shortcut");
132 &static_label
("xop_shortcut");
133 &static_label
("ssse3_data");
134 &static_label
("pic_point");
# ChaCha20_ctr32: as used in this file, wparam(0) is the output pointer,
# wparam(1) the input pointer, wparam(2) the length, wparam(3) the key and
# wparam(4) the counter-and-nonce block.
136 &function_begin
("ChaCha20_ctr32");
# NOTE(review): the instruction that sets eax before this compare is not
# visible in this excerpt; per the trailing comment the intent is to leave
# early when the length is zero.
138 &cmp ("eax",&wparam
(2)); # len==0?
139 &je
(&label
("no_data"));
# call-to-next-label sequence used to obtain the run-time address of pic_point.
141 &call
(&label
("pic_point"));
142 &set_label
("pic_point");
# picmeup (defined outside this file) is asked to place the address of
# OPENSSL_ia32cap_P in ebp, relative to pic_point.
144 &picmeup
("ebp","OPENSSL_ia32cap_P","eax",&label
("pic_point"));
# NOTE(review): the conditional branches that act on the two capability tests
# below are not visible in this excerpt; only the jump to ssse3_shortcut is.
145 &test
(&DWP
(0,"ebp"),1<<24); # test FXSR bit
147 &test
(&DWP
(4,"ebp"),1<<9); # test SSSE3 bit
149 &jmp
(&label
("ssse3_shortcut"));
# Scalar path: fetch the key and counter/nonce pointers.
152 &mov
("esi",&wparam
(3)); # key
153 &mov
("edi",&wparam
(4)); # counter and nonce
157 &mov
("eax",&DWP
(4*0,"esi")); # copy key
158 &mov
("ebx",&DWP
(4*1,"esi"));
159 &mov
("ecx",&DWP
(4*2,"esi"));
160 &mov
("edx",&DWP
(4*3,"esi"));
161 &mov
(&DWP
(64+4*4,"esp"),"eax");
162 &mov
(&DWP
(64+4*5,"esp"),"ebx");
163 &mov
(&DWP
(64+4*6,"esp"),"ecx");
164 &mov
(&DWP
(64+4*7,"esp"),"edx");
165 &mov
("eax",&DWP
(4*4,"esi"));
166 &mov
("ebx",&DWP
(4*5,"esi"));
167 &mov
("ecx",&DWP
(4*6,"esi"));
168 &mov
("edx",&DWP
(4*7,"esi"));
169 &mov
(&DWP
(64+4*8,"esp"),"eax");
170 &mov
(&DWP
(64+4*9,"esp"),"ebx");
171 &mov
(&DWP
(64+4*10,"esp"),"ecx");
172 &mov
(&DWP
(64+4*11,"esp"),"edx");
173 &mov
("eax",&DWP
(4*0,"edi")); # copy counter and nonce
174 &mov
("ebx",&DWP
(4*1,"edi"));
175 &mov
("ecx",&DWP
(4*2,"edi"));
176 &mov
("edx",&DWP
(4*3,"edi"));
178 &mov
(&DWP
(64+4*12,"esp"),"eax");
179 &mov
(&DWP
(64+4*13,"esp"),"ebx");
180 &mov
(&DWP
(64+4*14,"esp"),"ecx");
181 &mov
(&DWP
(64+4*15,"esp"),"edx");
182 &jmp
(&label
("entry"));
184 &set_label
("outer_loop",16);
185 &mov
(&wparam
(1),$b); # save input
186 &mov
(&wparam
(0),$a); # save output
187 &mov
(&wparam
(2),$c); # save len
189 &mov
($a,0x61707865);
190 &mov
(&DWP
(4*1,"esp"),0x3320646e);
191 &mov
(&DWP
(4*2,"esp"),0x79622d32);
192 &mov
(&DWP
(4*3,"esp"),0x6b206574);
194 &mov
($b, &DWP
(64+4*5,"esp")); # copy key material
195 &mov
($b_,&DWP
(64+4*6,"esp"));
196 &mov
($c, &DWP
(64+4*10,"esp"));
197 &mov
($c_,&DWP
(64+4*11,"esp"));
198 &mov
($d, &DWP
(64+4*13,"esp"));
199 &mov
($d_,&DWP
(64+4*14,"esp"));
200 &mov
(&DWP
(4*5,"esp"),$b);
201 &mov
(&DWP
(4*6,"esp"),$b_);
202 &mov
(&DWP
(4*10,"esp"),$c);
203 &mov
(&DWP
(4*11,"esp"),$c_);
204 &mov
(&DWP
(4*13,"esp"),$d);
205 &mov
(&DWP
(4*14,"esp"),$d_);
207 &mov
($b, &DWP
(64+4*7,"esp"));
208 &mov
($d_,&DWP
(64+4*15,"esp"));
209 &mov
($d, &DWP
(64+4*12,"esp"));
210 &mov
($b_,&DWP
(64+4*4,"esp"));
211 &mov
($c, &DWP
(64+4*8,"esp"));
212 &mov
($c_,&DWP
(64+4*9,"esp"));
213 &add
($d,1); # counter value
214 &mov
(&DWP
(4*7,"esp"),$b);
215 &mov
(&DWP
(4*15,"esp"),$d_);
216 &mov
(&DWP
(64+4*12,"esp"),$d); # save counter value
218 &mov
($b,10); # loop counter
219 &jmp
(&label
("loop"));
# Scalar round loop. Each pass emits eight quarter-rounds: the first four use
# the index pattern marked "even round" (0,4,8,12 ...), the last four the
# pattern marked "odd round" (0,5,10,15 ...). The fifth argument is the
# position of the call in the sequence.
221 &set_label
("loop",16);
# Opening add of the first quarter-round; the generator omits it from the
# quarter-round body itself.
222 &add
($a,$b_); # elsewhere
# $b doubles as the loop counter and is parked at 128(esp) for the duration of
# the rounds; the last quarter-round call reloads that slot.
223 &mov
(&DWP
(128,"esp"),$b); # save loop counter
225 &QUARTERROUND
(0, 4, 8, 12, 0);
226 &QUARTERROUND
(1, 5, 9, 13, 1);
227 &QUARTERROUND
(2, 6,10, 14, 2);
228 &QUARTERROUND
(3, 7,11, 15, 3);
229 &QUARTERROUND
(0, 5,10, 15, 4);
230 &QUARTERROUND
(1, 6,11, 12, 5);
231 &QUARTERROUND
(2, 7, 8, 13, 6);
232 &QUARTERROUND
(3, 4, 9, 14, 7);
# NOTE(review): the instruction that updates the counter and sets the flags
# tested here is not visible in this excerpt.
234 &jnz
(&label
("loop"));
236 &mov
($b,&wparam
(2)); # load len
238 &add
($a,0x61707865); # accumulate key material
239 &add
($b_,&DWP
(64+4*4,"esp"));
240 &add
($c, &DWP
(64+4*8,"esp"));
241 &add
($c_,&DWP
(64+4*9,"esp"));
244 &jb
(&label
("tail"));
246 &mov
($b,&wparam
(1)); # load input pointer
247 &add
($d, &DWP
(64+4*12,"esp"));
248 &add
($d_,&DWP
(64+4*14,"esp"));
250 &xor ($a, &DWP
(4*0,$b)); # xor with input
251 &xor ($b_,&DWP
(4*4,$b));
252 &mov
(&DWP
(4*0,"esp"),$a);
253 &mov
($a,&wparam
(0)); # load output pointer
254 &xor ($c, &DWP
(4*8,$b));
255 &xor ($c_,&DWP
(4*9,$b));
256 &xor ($d, &DWP
(4*12,$b));
257 &xor ($d_,&DWP
(4*14,$b));
258 &mov
(&DWP
(4*4,$a),$b_); # write output
259 &mov
(&DWP
(4*8,$a),$c);
260 &mov
(&DWP
(4*9,$a),$c_);
261 &mov
(&DWP
(4*12,$a),$d);
262 &mov
(&DWP
(4*14,$a),$d_);
264 &mov
($b_,&DWP
(4*1,"esp"));
265 &mov
($c, &DWP
(4*2,"esp"));
266 &mov
($c_,&DWP
(4*3,"esp"));
267 &mov
($d, &DWP
(4*5,"esp"));
268 &mov
($d_,&DWP
(4*6,"esp"));
269 &add
($b_,0x3320646e); # accumulate key material
270 &add
($c, 0x79622d32);
271 &add
($c_,0x6b206574);
272 &add
($d, &DWP
(64+4*5,"esp"));
273 &add
($d_,&DWP
(64+4*6,"esp"));
274 &xor ($b_,&DWP
(4*1,$b));
275 &xor ($c, &DWP
(4*2,$b));
276 &xor ($c_,&DWP
(4*3,$b));
277 &xor ($d, &DWP
(4*5,$b));
278 &xor ($d_,&DWP
(4*6,$b));
279 &mov
(&DWP
(4*1,$a),$b_);
280 &mov
(&DWP
(4*2,$a),$c);
281 &mov
(&DWP
(4*3,$a),$c_);
282 &mov
(&DWP
(4*5,$a),$d);
283 &mov
(&DWP
(4*6,$a),$d_);
285 &mov
($b_,&DWP
(4*7,"esp"));
286 &mov
($c, &DWP
(4*10,"esp"));
287 &mov
($c_,&DWP
(4*11,"esp"));
288 &mov
($d, &DWP
(4*13,"esp"));
289 &mov
($d_,&DWP
(4*15,"esp"));
290 &add
($b_,&DWP
(64+4*7,"esp"));
291 &add
($c, &DWP
(64+4*10,"esp"));
292 &add
($c_,&DWP
(64+4*11,"esp"));
293 &add
($d, &DWP
(64+4*13,"esp"));
294 &add
($d_,&DWP
(64+4*15,"esp"));
295 &xor ($b_,&DWP
(4*7,$b));
296 &xor ($c, &DWP
(4*10,$b));
297 &xor ($c_,&DWP
(4*11,$b));
298 &xor ($d, &DWP
(4*13,$b));
299 &xor ($d_,&DWP
(4*15,$b));
300 &lea
($b,&DWP
(4*16,$b));
301 &mov
(&DWP
(4*7,$a),$b_);
302 &mov
($b_,&DWP
(4*0,"esp"));
303 &mov
(&DWP
(4*10,$a),$c);
304 &mov
($c,&wparam
(2)); # len
305 &mov
(&DWP
(4*11,$a),$c_);
306 &mov
(&DWP
(4*13,$a),$d);
307 &mov
(&DWP
(4*15,$a),$d_);
308 &mov
(&DWP
(4*0,$a),$b_);
309 &lea
($a,&DWP
(4*16,$a));
311 &jnz
(&label
("outer_loop"));
313 &jmp
(&label
("done"));
316 &add
($d, &DWP
(64+4*12,"esp"));
317 &add
($d_,&DWP
(64+4*14,"esp"));
318 &mov
(&DWP
(4*0,"esp"),$a);
319 &mov
(&DWP
(4*4,"esp"),$b_);
320 &mov
(&DWP
(4*8,"esp"),$c);
321 &mov
(&DWP
(4*9,"esp"),$c_);
322 &mov
(&DWP
(4*12,"esp"),$d);
323 &mov
(&DWP
(4*14,"esp"),$d_);
325 &mov
($b_,&DWP
(4*1,"esp"));
326 &mov
($c, &DWP
(4*2,"esp"));
327 &mov
($c_,&DWP
(4*3,"esp"));
328 &mov
($d, &DWP
(4*5,"esp"));
329 &mov
($d_,&DWP
(4*6,"esp"));
330 &add
($b_,0x3320646e); # accumulate key material
331 &add
($c, 0x79622d32);
332 &add
($c_,0x6b206574);
333 &add
($d, &DWP
(64+4*5,"esp"));
334 &add
($d_,&DWP
(64+4*6,"esp"));
335 &mov
(&DWP
(4*1,"esp"),$b_);
336 &mov
(&DWP
(4*2,"esp"),$c);
337 &mov
(&DWP
(4*3,"esp"),$c_);
338 &mov
(&DWP
(4*5,"esp"),$d);
339 &mov
(&DWP
(4*6,"esp"),$d_);
341 &mov
($b_,&DWP
(4*7,"esp"));
342 &mov
($c, &DWP
(4*10,"esp"));
343 &mov
($c_,&DWP
(4*11,"esp"));
344 &mov
($d, &DWP
(4*13,"esp"));
345 &mov
($d_,&DWP
(4*15,"esp"));
346 &add
($b_,&DWP
(64+4*7,"esp"));
347 &add
($c, &DWP
(64+4*10,"esp"));
348 &add
($c_,&DWP
(64+4*11,"esp"));
349 &add
($d, &DWP
(64+4*13,"esp"));
350 &add
($d_,&DWP
(64+4*15,"esp"));
351 &mov
(&DWP
(4*7,"esp"),$b_);
352 &mov
($b_,&wparam
(1)); # load input
353 &mov
(&DWP
(4*10,"esp"),$c);
354 &mov
($c,&wparam
(0)); # load output
355 &mov
(&DWP
(4*11,"esp"),$c_);
357 &mov
(&DWP
(4*13,"esp"),$d);
358 &mov
(&DWP
(4*15,"esp"),$d_);
# Byte-at-a-time tail of the scalar path. The preceding code loads the input
# pointer into $b_ and the output pointer into $c, and stores the block words
# at esp; $c_ serves as the byte index.
# NOTE(review): the instruction that initialises $c_, the one that combines al
# with dl, and the one that sets the flags for jnz are not visible in this
# excerpt.
362 &set_label
("tail_loop");
# al <- input byte at index $c_
363 &movb
("al",&BP
(0,$c_,$b_));
# dl <- byte at esp + $c_
364 &movb
("dl",&BP
(0,"esp",$c_));
# Advance the index with lea.
365 &lea
($c_,&DWP
(1,$c_));
# Store al at output index $c_-1, i.e. the index used for the loads above.
367 &mov
(&BP
(-1,$c,$c_),"al");
369 &jnz
(&label
("tail_loop"));
# Target of the early exit taken when the length check at function entry
# branches to no_data.
373 &set_label
("no_data");
374 &function_end
("ChaCha20_ctr32");
# Register names for the 4x SSSE3 path: a, b, c and d each get a primary and
# an alternate ("_") xmm register, mapped onto xmm0..xmm7 in order.
377 my ($xa,$xa_,$xb,$xb_,$xc,$xc_,$xd,$xd_)=map("xmm$_",(0..7));
378 my ($out,$inp,$len)=("edi","esi","ecx");
# QUARTERROUND_SSSE3(ai, bi, ci, di, i): emits one quarter-round step for the
# given state-word indices; $i is the position (0..7) of the call within the
# eight-call round sequence.
# NOTE(review): several lines of this sub, including most arithmetic
# instructions and the closing brace, are not present in this excerpt.
380 sub QUARTERROUND_SSSE3
{
381 my ($ai,$bi,$ci,$di,$i)=@_;
# ($_&~3) keeps the index rounded down to a multiple of 4; the second term
# steps the low two bits by +1 (next) or -1 (previous), wrapping modulo 4.
382 my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di)); # next
383 my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di)); # previous
# The two comment lines below are what is left of a larger index diagram.
387 # 0 4 8 12 < even round
391 # 0 5 10 15 < odd round
# NOTE(review): the conditions selecting among the next four adjustments, and
# the definition of $j, are not visible in this excerpt.
398 ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp));
401 ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn));
404 ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp));
407 ($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn));
410 #&paddd ($xa,$xb); # see elsewhere
411 #&pxor ($xd,$xa); # see elsewhere
# Each state word occupies a 16-byte slot addressed as 16*index-128 off ebx.
# Write-backs of the previous call's alternate registers and loads for the
# next call are interleaved with the arithmetic of the current call.
412 &movdqa
(&QWP
(16*$cp-128,"ebx"),$xc_) if ($ai>0 && $ai<3);
# Byte shuffle with the mask at offset 0 of the table eax points to.
413 &pshufb
($xd,&QWP
(0,"eax")); # rot16
414 &movdqa
(&QWP
(16*$bp-128,"ebx"),$xb_) if ($i!=0);
416 &movdqa
($xc_,&QWP
(16*$cn-128,"ebx")) if ($ai>0 && $ai<3);
418 &movdqa
($xb_,&QWP
(16*$bn-128,"ebx")) if ($i<7);
419 &movdqa
($xa_,$xb); # borrow as temporary
# $xa_ is free again after its use as a temporary: load the next a word.
423 &movdqa
($xa_,&QWP
(16*$an-128,"ebx"));
425 &movdqa
($xd_,&QWP
(16*$dn-128,"ebx")) if ($di!=$dn);
427 &movdqa
(&QWP
(16*$ai-128,"ebx"),$xa);
# Byte shuffle with the mask at offset 16 of the same table.
428 &pshufb
($xd,&QWP
(16,"eax")); # rot8
# d is stored to its slot when the next call uses a different d index;
# otherwise it is copied register-to-register into $xd_.
430 &movdqa
(&QWP
(16*$di-128,"ebx"),$xd) if ($di!=$dn);
431 &movdqa
($xd_,$xd) if ($di==$dn);
# Opening add/xor of the following quarter-round, issued early (the
# "elsewhere" the commented-out paddd/pxor above refer to).
433 &paddd
($xa_,$xb_) if ($i<7); # elsewhere
434 &movdqa
($xa,$xb); # borrow as temporary
437 &pxor
($xd_,$xa_) if ($i<7); # elsewhere
# Exchange the primary and alternate register names. The variables are
# file-scoped lexicals, so the next call works on the other register set.
440 ($xa,$xa_)=($xa_,$xa);
441 ($xb,$xb_)=($xb_,$xb);
442 ($xc,$xc_)=($xc_,$xc);
443 ($xd,$xd_)=($xd_,$xd);
# ChaCha20_ssse3: SSSE3 code path. The scalar entry point jumps to the
# ssse3_shortcut label placed right after the function prologue.
446 &function_begin
("ChaCha20_ssse3");
447 &set_label
("ssse3_shortcut");
# Divert to the XOP implementation when the tested capability bit is set.
# NOTE(review): any generator-time guard around this test is not visible in
# this excerpt.
449 &test
(&DWP
(4,"ebp"),1<<11); # test XOP bit
450 &jnz
(&label
("xop_shortcut"));
# Load the arguments: output, input, length, key, counter-and-nonce.
453 &mov
($out,&wparam
(0));
454 &mov
($inp,&wparam
(1));
455 &mov
($len,&wparam
(2));
456 &mov
("edx",&wparam
(3)); # key
457 &mov
("ebx",&wparam
(4)); # counter and nonce
# ebp is parked at 512(esp); the function's exit path reloads esp from this
# slot. NOTE(review): the instructions that set ebp and carve out the stack
# frame are not visible in this excerpt -- presumably ebp holds the caller's
# stack pointer here; confirm against the full file.
462 &mov
(&DWP
(512,"esp"),"ebp");
# eax += (ssse3_data - pic_point). Assuming eax holds the run-time address of
# pic_point on entry, it then points at the ssse3_data table -- TODO confirm.
464 &lea
("eax",&DWP
(&label
("ssse3_data")."-".
465 &label
("pic_point"),"eax"));
466 &movdqu
("xmm3",&QWP
(0,"ebx")); # counter and nonce
468 if (defined($gasver) && $gasver>=2.17) { # even though we encode
469 # pshufb manually, we
470 # handle only register
471 # operands, while this
472 # segment uses memory
477 &mov
(&DWP
(512+4,"esp"),"edx"); # offload pointers
478 &mov
(&DWP
(512+8,"esp"),"ebx");
479 &sub ($len,64*4); # bias len
480 &lea
("ebp",&DWP
(256+128,"esp")); # size optimization
482 &movdqu
("xmm7",&QWP
(0,"edx")); # key
483 &pshufd
("xmm0","xmm3",0x00);
484 &pshufd
("xmm1","xmm3",0x55);
485 &pshufd
("xmm2","xmm3",0xaa);
486 &pshufd
("xmm3","xmm3",0xff);
487 &paddd
("xmm0",&QWP
(16*3,"eax")); # fix counters
488 &pshufd
("xmm4","xmm7",0x00);
489 &pshufd
("xmm5","xmm7",0x55);
490 &psubd
("xmm0",&QWP
(16*4,"eax"));
491 &pshufd
("xmm6","xmm7",0xaa);
492 &pshufd
("xmm7","xmm7",0xff);
493 &movdqa
(&QWP
(16*12-128,"ebp"),"xmm0");
494 &movdqa
(&QWP
(16*13-128,"ebp"),"xmm1");
495 &movdqa
(&QWP
(16*14-128,"ebp"),"xmm2");
496 &movdqa
(&QWP
(16*15-128,"ebp"),"xmm3");
497 &movdqu
("xmm3",&QWP
(16,"edx")); # key
498 &movdqa
(&QWP
(16*4-128,"ebp"),"xmm4");
499 &movdqa
(&QWP
(16*5-128,"ebp"),"xmm5");
500 &movdqa
(&QWP
(16*6-128,"ebp"),"xmm6");
501 &movdqa
(&QWP
(16*7-128,"ebp"),"xmm7");
502 &movdqa
("xmm7",&QWP
(16*2,"eax")); # sigma
503 &lea
("ebx",&DWP
(128,"esp")); # size optimization
505 &pshufd
("xmm0","xmm3",0x00);
506 &pshufd
("xmm1","xmm3",0x55);
507 &pshufd
("xmm2","xmm3",0xaa);
508 &pshufd
("xmm3","xmm3",0xff);
509 &pshufd
("xmm4","xmm7",0x00);
510 &pshufd
("xmm5","xmm7",0x55);
511 &pshufd
("xmm6","xmm7",0xaa);
512 &pshufd
("xmm7","xmm7",0xff);
513 &movdqa
(&QWP
(16*8-128,"ebp"),"xmm0");
514 &movdqa
(&QWP
(16*9-128,"ebp"),"xmm1");
515 &movdqa
(&QWP
(16*10-128,"ebp"),"xmm2");
516 &movdqa
(&QWP
(16*11-128,"ebp"),"xmm3");
517 &movdqa
(&QWP
(16*0-128,"ebp"),"xmm4");
518 &movdqa
(&QWP
(16*1-128,"ebp"),"xmm5");
519 &movdqa
(&QWP
(16*2-128,"ebp"),"xmm6");
520 &movdqa
(&QWP
(16*3-128,"ebp"),"xmm7");
522 &lea
($inp,&DWP
(128,$inp)); # size optimization
523 &lea
($out,&DWP
(128,$out)); # size optimization
524 &jmp
(&label
("outer_loop"));
526 &set_label
("outer_loop",16);
527 #&movdqa ("xmm0",&QWP(16*0-128,"ebp")); # copy key material
528 &movdqa
("xmm1",&QWP
(16*1-128,"ebp"));
529 &movdqa
("xmm2",&QWP
(16*2-128,"ebp"));
530 &movdqa
("xmm3",&QWP
(16*3-128,"ebp"));
531 #&movdqa ("xmm4",&QWP(16*4-128,"ebp"));
532 &movdqa
("xmm5",&QWP
(16*5-128,"ebp"));
533 &movdqa
("xmm6",&QWP
(16*6-128,"ebp"));
534 &movdqa
("xmm7",&QWP
(16*7-128,"ebp"));
535 #&movdqa (&QWP(16*0-128,"ebx"),"xmm0");
536 &movdqa
(&QWP
(16*1-128,"ebx"),"xmm1");
537 &movdqa
(&QWP
(16*2-128,"ebx"),"xmm2");
538 &movdqa
(&QWP
(16*3-128,"ebx"),"xmm3");
539 #&movdqa (&QWP(16*4-128,"ebx"),"xmm4");
540 &movdqa
(&QWP
(16*5-128,"ebx"),"xmm5");
541 &movdqa
(&QWP
(16*6-128,"ebx"),"xmm6");
542 &movdqa
(&QWP
(16*7-128,"ebx"),"xmm7");
543 #&movdqa ("xmm0",&QWP(16*8-128,"ebp"));
544 #&movdqa ("xmm1",&QWP(16*9-128,"ebp"));
545 &movdqa
("xmm2",&QWP
(16*10-128,"ebp"));
546 &movdqa
("xmm3",&QWP
(16*11-128,"ebp"));
547 &movdqa
("xmm4",&QWP
(16*12-128,"ebp"));
548 &movdqa
("xmm5",&QWP
(16*13-128,"ebp"));
549 &movdqa
("xmm6",&QWP
(16*14-128,"ebp"));
550 &movdqa
("xmm7",&QWP
(16*15-128,"ebp"));
551 &paddd
("xmm4",&QWP
(16*4,"eax")); # counter value
552 #&movdqa (&QWP(16*8-128,"ebx"),"xmm0");
553 #&movdqa (&QWP(16*9-128,"ebx"),"xmm1");
554 &movdqa
(&QWP
(16*10-128,"ebx"),"xmm2");
555 &movdqa
(&QWP
(16*11-128,"ebx"),"xmm3");
556 &movdqa
(&QWP
(16*12-128,"ebx"),"xmm4");
557 &movdqa
(&QWP
(16*13-128,"ebx"),"xmm5");
558 &movdqa
(&QWP
(16*14-128,"ebx"),"xmm6");
559 &movdqa
(&QWP
(16*15-128,"ebx"),"xmm7");
560 &movdqa
(&QWP
(16*12-128,"ebp"),"xmm4"); # save counter value
562 &movdqa
($xa, &QWP
(16*0-128,"ebp"));
563 &movdqa
($xd, "xmm4");
564 &movdqa
($xb_,&QWP
(16*4-128,"ebp"));
565 &movdqa
($xc, &QWP
(16*8-128,"ebp"));
566 &movdqa
($xc_,&QWP
(16*9-128,"ebp"));
568 &mov
("edx",10); # loop counter
571 &set_label
("loop",16);
572 &paddd
($xa,$xb_); # elsewhere
574 &pxor
($xd,$xa); # elsewhere
575 &QUARTERROUND_SSSE3
(0, 4, 8, 12, 0);
576 &QUARTERROUND_SSSE3
(1, 5, 9, 13, 1);
577 &QUARTERROUND_SSSE3
(2, 6,10, 14, 2);
578 &QUARTERROUND_SSSE3
(3, 7,11, 15, 3);
579 &QUARTERROUND_SSSE3
(0, 5,10, 15, 4);
580 &QUARTERROUND_SSSE3
(1, 6,11, 12, 5);
581 &QUARTERROUND_SSSE3
(2, 7, 8, 13, 6);
582 &QUARTERROUND_SSSE3
(3, 4, 9, 14, 7);
584 &jnz
(&label
("loop"));
586 &movdqa
(&QWP
(16*4-128,"ebx"),$xb_);
587 &movdqa
(&QWP
(16*8-128,"ebx"),$xc);
588 &movdqa
(&QWP
(16*9-128,"ebx"),$xc_);
589 &movdqa
(&QWP
(16*12-128,"ebx"),$xd);
590 &movdqa
(&QWP
(16*14-128,"ebx"),$xd_);
592 my ($xa0,$xa1,$xa2,$xa3,$xt0,$xt1,$xt2,$xt3)=map("xmm$_",(0..7));
594 #&movdqa ($xa0,&QWP(16*0-128,"ebx")); # it's there
595 &movdqa
($xa1,&QWP
(16*1-128,"ebx"));
596 &movdqa
($xa2,&QWP
(16*2-128,"ebx"));
597 &movdqa
($xa3,&QWP
(16*3-128,"ebx"));
599 for($i=0;$i<256;$i+=64) {
600 &paddd
($xa0,&QWP
($i+16*0-128,"ebp")); # accumulate key material
601 &paddd
($xa1,&QWP
($i+16*1-128,"ebp"));
602 &paddd
($xa2,&QWP
($i+16*2-128,"ebp"));
603 &paddd
($xa3,&QWP
($i+16*3-128,"ebp"));
605 &movdqa
($xt2,$xa0); # "de-interlace" data
606 &punpckldq
($xa0,$xa1);
608 &punpckldq
($xa2,$xa3);
609 &punpckhdq
($xt2,$xa1);
610 &punpckhdq
($xt3,$xa3);
612 &punpcklqdq
($xa0,$xa2); # "a0"
614 &punpcklqdq
($xt2,$xt3); # "a2"
615 &punpckhqdq
($xa1,$xa2); # "a1"
616 &punpckhqdq
($xa3,$xt3); # "a3"
618 #($xa2,$xt2)=($xt2,$xa2);
620 &movdqu
($xt0,&QWP
(64*0-128,$inp)); # load input
621 &movdqu
($xt1,&QWP
(64*1-128,$inp));
622 &movdqu
($xa2,&QWP
(64*2-128,$inp));
623 &movdqu
($xt3,&QWP
(64*3-128,$inp));
624 &lea
($inp,&QWP
($i<192?
16:(64*4-16*3),$inp));
626 &movdqa
($xa0,&QWP
($i+16*4-128,"ebx")) if ($i<192);
628 &movdqa
($xa1,&QWP
($i+16*5-128,"ebx")) if ($i<192);
630 &movdqa
($xa2,&QWP
($i+16*6-128,"ebx")) if ($i<192);
632 &movdqa
($xa3,&QWP
($i+16*7-128,"ebx")) if ($i<192);
633 &movdqu
(&QWP
(64*0-128,$out),$xt0); # store output
634 &movdqu
(&QWP
(64*1-128,$out),$xt1);
635 &movdqu
(&QWP
(64*2-128,$out),$xt2);
636 &movdqu
(&QWP
(64*3-128,$out),$xt3);
637 &lea
($out,&QWP
($i<192?
16:(64*4-16*3),$out));
640 &jnc
(&label
("outer_loop"));
643 &jz
(&label
("done"));
645 &mov
("ebx",&DWP
(512+8,"esp")); # restore pointers
646 &lea
($inp,&DWP
(-128,$inp));
647 &mov
("edx",&DWP
(512+4,"esp"));
648 &lea
($out,&DWP
(-128,$out));
650 &movd
("xmm2",&DWP
(16*12-128,"ebp")); # counter value
651 &movdqu
("xmm3",&QWP
(0,"ebx"));
652 &paddd
("xmm2",&QWP
(16*6,"eax")); # +four
653 &pand
("xmm3",&QWP
(16*7,"eax"));
654 &por
("xmm3","xmm2"); # counter value
657 my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("xmm$_",(0..7));
659 sub SSSE3ROUND
{ # critical path is 20 "SIMD ticks" per round
684 &movdqa
($a,&QWP
(16*2,"eax")); # sigma
685 &movdqu
($b,&QWP
(0,"edx"));
686 &movdqu
($c,&QWP
(16,"edx"));
687 #&movdqu ($d,&QWP(0,"ebx")); # already loaded
688 &movdqa
($rot16,&QWP
(0,"eax"));
689 &movdqa
($rot24,&QWP
(16,"eax"));
690 &mov
(&DWP
(16*3,"esp"),"ebp");
692 &movdqa
(&QWP
(16*0,"esp"),$a);
693 &movdqa
(&QWP
(16*1,"esp"),$b);
694 &movdqa
(&QWP
(16*2,"esp"),$c);
695 &movdqa
(&QWP
(16*3,"esp"),$d);
697 &jmp
(&label
("loop1x"));
699 &set_label
("outer1x",16);
700 &movdqa
($d,&QWP
(16*5,"eax")); # one
701 &movdqa
($a,&QWP
(16*0,"esp"));
702 &movdqa
($b,&QWP
(16*1,"esp"));
703 &movdqa
($c,&QWP
(16*2,"esp"));
704 &paddd
($d,&QWP
(16*3,"esp"));
706 &movdqa
(&QWP
(16*3,"esp"),$d);
707 &jmp
(&label
("loop1x"));
709 &set_label
("loop1x",16);
711 &pshufd
($c,$c,0b01001110
);
712 &pshufd
($b,$b,0b00111001
);
713 &pshufd
($d,$d,0b10010011
);
717 &pshufd
($c,$c,0b01001110
);
718 &pshufd
($b,$b,0b10010011
);
719 &pshufd
($d,$d,0b00111001
);
722 &jnz
(&label
("loop1x"));
724 &paddd
($a,&QWP
(16*0,"esp"));
725 &paddd
($b,&QWP
(16*1,"esp"));
726 &paddd
($c,&QWP
(16*2,"esp"));
727 &paddd
($d,&QWP
(16*3,"esp"));
730 &jb
(&label
("tail"));
732 &movdqu
($t,&QWP
(16*0,$inp));
733 &movdqu
($t1,&QWP
(16*1,$inp));
734 &pxor
($a,$t); # xor with input
735 &movdqu
($t,&QWP
(16*2,$inp));
737 &movdqu
($t1,&QWP
(16*3,$inp));
740 &lea
($inp,&DWP
(16*4,$inp)); # inp+=64
742 &movdqu
(&QWP
(16*0,$out),$a); # write output
743 &movdqu
(&QWP
(16*1,$out),$b);
744 &movdqu
(&QWP
(16*2,$out),$c);
745 &movdqu
(&QWP
(16*3,$out),$d);
746 &lea
($out,&DWP
(16*4,$out)); # out+=64
749 &jnz
(&label
("outer1x"));
751 &jmp
(&label
("done"));
754 &movdqa
(&QWP
(16*0,"esp"),$a);
755 &movdqa
(&QWP
(16*1,"esp"),$b);
756 &movdqa
(&QWP
(16*2,"esp"),$c);
757 &movdqa
(&QWP
(16*3,"esp"),$d);
763 &set_label
("tail_loop");
764 &movb
("al",&BP
(0,"esp","ebp"));
765 &movb
("dl",&BP
(0,$inp,"ebp"));
766 &lea
("ebp",&DWP
(1,"ebp"));
768 &movb
(&BP
(-1,$out,"ebp"),"al");
770 &jnz
(&label
("tail_loop"));
773 &mov
("esp",&DWP
(512,"esp"));
774 &function_end
("ChaCha20_ssse3");
# Constant table shared by the SSSE3 and XOP code paths, which address it
# through eax at multiples of 16 bytes.
777 &set_label
("ssse3_data");
# Offset 0: byte-shuffle mask used with pshufb at the "rot16" sites.
778 &data_byte
(0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd);
# Offset 16: byte-shuffle mask used at the "rot8" sites (loaded as $rot24 by
# the 1x code).
779 &data_byte
(0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe);
# Offset 16*2: the four "sigma" words, the same constants the scalar path
# uses as immediates.
780 &data_word
(0x61707865,0x3320646e,0x79622d32,0x6b206574);
# NOTE(review): the rows the code references at offsets 16*3 through 16*6 are
# not present in this excerpt. The row below is presumably the one the code
# ANDs with the counter/nonce block at offset 16*7; it clears the first dword
# and keeps the other three -- confirm against the full table.
785 &data_word
(0,-1,-1,-1);
# Identification string emitted after the table.
788 &asciz
("ChaCha20 for x86, CRYPTOGAMS by <appro\@openssl.org>");
# Register names for the 4x XOP path: a, b, c and d each get a primary and an
# alternate ("_") xmm register, mapped onto xmm0..xmm7 in order.
791 my ($xa,$xa_,$xb,$xb_,$xc,$xc_,$xd,$xd_)=map("xmm$_",(0..7));
792 my ($out,$inp,$len)=("edi","esi","ecx");
# QUARTERROUND_XOP(ai, bi, ci, di, i): emits one quarter-round step for the
# given state-word indices using three-operand AVX forms and vprotd for the
# rotates; $i is the position (0..7) of the call within the eight-call round
# sequence.
# NOTE(review): some lines of this sub, including two of the rotates and the
# closing brace, are not present in this excerpt.
794 sub QUARTERROUND_XOP
{
795 my ($ai,$bi,$ci,$di,$i)=@_;
# ($_&~3) keeps the index rounded down to a multiple of 4; the second term
# steps the low two bits by +1 (next) or -1 (previous), wrapping modulo 4.
796 my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di)); # next
797 my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di)); # previous
# The two comment lines below are what is left of a larger index diagram.
801 # 0 4 8 12 < even round
805 # 0 5 10 15 < odd round
# NOTE(review): the conditions selecting among the next four adjustments, and
# the definition of $j, are not visible in this excerpt.
812 ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp));
815 ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn));
818 ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp));
821 ($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn));
824 #&vpaddd ($xa,$xa,$xb); # see elsewhere
825 #&vpxor ($xd,$xd,$xa); # see elsewhere
# Each state word occupies a 16-byte slot addressed as 16*index-128 off ebx.
# Write-backs of the previous call's alternate registers and loads for the
# next call are interleaved with the arithmetic of the current call.
826 &vmovdqa
(&QWP
(16*$cp-128,"ebx"),$xc_) if ($ai>0 && $ai<3);
827 &vprotd
($xd,$xd,16);
828 &vmovdqa
(&QWP
(16*$bp-128,"ebx"),$xb_) if ($i!=0);
829 &vpaddd
($xc,$xc,$xd);
830 &vmovdqa
($xc_,&QWP
(16*$cn-128,"ebx")) if ($ai>0 && $ai<3);
# On the first call of the sequence (i==0) the b value is read from $xb_
# rather than $xb.
831 &vpxor
($xb,$i!=0?
$xb:$xb_,$xc);
832 &vmovdqa
($xa_,&QWP
(16*$an-128,"ebx"));
833 &vprotd
($xb,$xb,12);
834 &vmovdqa
($xb_,&QWP
(16*$bn-128,"ebx")) if ($i<7);
835 &vpaddd
($xa,$xa,$xb);
836 &vmovdqa
($xd_,&QWP
(16*$dn-128,"ebx")) if ($di!=$dn);
837 &vpxor
($xd,$xd,$xa);
# Opening add of the following quarter-round, issued early.
838 &vpaddd
($xa_,$xa_,$xb_) if ($i<7); # elsewhere
840 &vmovdqa
(&QWP
(16*$ai-128,"ebx"),$xa);
841 &vpaddd
($xc,$xc,$xd);
842 &vmovdqa
(&QWP
(16*$di-128,"ebx"),$xd) if ($di!=$dn);
843 &vpxor
($xb,$xb,$xc);
# Opening xor of the following quarter-round; when the next call reuses the
# same d index the source is $xd itself, so no register copy is needed.
844 &vpxor
($xd_,$di==$dn?
$xd:$xd_,$xa_) if ($i<7); # elsewhere
# Exchange the primary and alternate register names. The variables are
# file-scoped lexicals, so the next call works on the other register set.
847 ($xa,$xa_)=($xa_,$xa);
848 ($xb,$xb_)=($xb_,$xb);
849 ($xc,$xc_)=($xc_,$xc);
850 ($xd,$xd_)=($xd_,$xd);
# ChaCha20_xop: XOP code path. The SSSE3 function jumps to the xop_shortcut
# label placed right after the function prologue.
853 &function_begin
("ChaCha20_xop");
854 &set_label
("xop_shortcut");
# Load the arguments: output, input, length, key, counter-and-nonce.
855 &mov
($out,&wparam
(0));
856 &mov
($inp,&wparam
(1));
857 &mov
($len,&wparam
(2));
858 &mov
("edx",&wparam
(3)); # key
859 &mov
("ebx",&wparam
(4)); # counter and nonce
# ebp is parked at 512(esp); the function's exit path reloads esp from this
# slot. NOTE(review): the instructions that set ebp and carve out the stack
# frame are not visible in this excerpt -- presumably ebp holds the caller's
# stack pointer here; confirm against the full file.
865 &mov
(&DWP
(512,"esp"),"ebp");
# eax += (ssse3_data - pic_point). Assuming eax holds the run-time address of
# pic_point on entry, it then points at the ssse3_data table -- TODO confirm.
867 &lea
("eax",&DWP
(&label
("ssse3_data")."-".
868 &label
("pic_point"),"eax"));
869 &vmovdqu
("xmm3",&QWP
(0,"ebx")); # counter and nonce
874 &mov
(&DWP
(512+4,"esp"),"edx"); # offload pointers
875 &mov
(&DWP
(512+8,"esp"),"ebx");
876 &sub ($len,64*4); # bias len
877 &lea
("ebp",&DWP
(256+128,"esp")); # size optimization
879 &vmovdqu
("xmm7",&QWP
(0,"edx")); # key
880 &vpshufd
("xmm0","xmm3",0x00);
881 &vpshufd
("xmm1","xmm3",0x55);
882 &vpshufd
("xmm2","xmm3",0xaa);
883 &vpshufd
("xmm3","xmm3",0xff);
884 &vpaddd
("xmm0","xmm0",&QWP
(16*3,"eax")); # fix counters
885 &vpshufd
("xmm4","xmm7",0x00);
886 &vpshufd
("xmm5","xmm7",0x55);
887 &vpsubd
("xmm0","xmm0",&QWP
(16*4,"eax"));
888 &vpshufd
("xmm6","xmm7",0xaa);
889 &vpshufd
("xmm7","xmm7",0xff);
890 &vmovdqa
(&QWP
(16*12-128,"ebp"),"xmm0");
891 &vmovdqa
(&QWP
(16*13-128,"ebp"),"xmm1");
892 &vmovdqa
(&QWP
(16*14-128,"ebp"),"xmm2");
893 &vmovdqa
(&QWP
(16*15-128,"ebp"),"xmm3");
894 &vmovdqu
("xmm3",&QWP
(16,"edx")); # key
895 &vmovdqa
(&QWP
(16*4-128,"ebp"),"xmm4");
896 &vmovdqa
(&QWP
(16*5-128,"ebp"),"xmm5");
897 &vmovdqa
(&QWP
(16*6-128,"ebp"),"xmm6");
898 &vmovdqa
(&QWP
(16*7-128,"ebp"),"xmm7");
899 &vmovdqa
("xmm7",&QWP
(16*2,"eax")); # sigma
900 &lea
("ebx",&DWP
(128,"esp")); # size optimization
902 &vpshufd
("xmm0","xmm3",0x00);
903 &vpshufd
("xmm1","xmm3",0x55);
904 &vpshufd
("xmm2","xmm3",0xaa);
905 &vpshufd
("xmm3","xmm3",0xff);
906 &vpshufd
("xmm4","xmm7",0x00);
907 &vpshufd
("xmm5","xmm7",0x55);
908 &vpshufd
("xmm6","xmm7",0xaa);
909 &vpshufd
("xmm7","xmm7",0xff);
910 &vmovdqa
(&QWP
(16*8-128,"ebp"),"xmm0");
911 &vmovdqa
(&QWP
(16*9-128,"ebp"),"xmm1");
912 &vmovdqa
(&QWP
(16*10-128,"ebp"),"xmm2");
913 &vmovdqa
(&QWP
(16*11-128,"ebp"),"xmm3");
914 &vmovdqa
(&QWP
(16*0-128,"ebp"),"xmm4");
915 &vmovdqa
(&QWP
(16*1-128,"ebp"),"xmm5");
916 &vmovdqa
(&QWP
(16*2-128,"ebp"),"xmm6");
917 &vmovdqa
(&QWP
(16*3-128,"ebp"),"xmm7");
919 &lea
($inp,&DWP
(128,$inp)); # size optimization
920 &lea
($out,&DWP
(128,$out)); # size optimization
921 &jmp
(&label
("outer_loop"));
923 &set_label
("outer_loop",32);
924 #&vmovdqa ("xmm0",&QWP(16*0-128,"ebp")); # copy key material
925 &vmovdqa
("xmm1",&QWP
(16*1-128,"ebp"));
926 &vmovdqa
("xmm2",&QWP
(16*2-128,"ebp"));
927 &vmovdqa
("xmm3",&QWP
(16*3-128,"ebp"));
928 #&vmovdqa ("xmm4",&QWP(16*4-128,"ebp"));
929 &vmovdqa
("xmm5",&QWP
(16*5-128,"ebp"));
930 &vmovdqa
("xmm6",&QWP
(16*6-128,"ebp"));
931 &vmovdqa
("xmm7",&QWP
(16*7-128,"ebp"));
932 #&vmovdqa (&QWP(16*0-128,"ebx"),"xmm0");
933 &vmovdqa
(&QWP
(16*1-128,"ebx"),"xmm1");
934 &vmovdqa
(&QWP
(16*2-128,"ebx"),"xmm2");
935 &vmovdqa
(&QWP
(16*3-128,"ebx"),"xmm3");
936 #&vmovdqa (&QWP(16*4-128,"ebx"),"xmm4");
937 &vmovdqa
(&QWP
(16*5-128,"ebx"),"xmm5");
938 &vmovdqa
(&QWP
(16*6-128,"ebx"),"xmm6");
939 &vmovdqa
(&QWP
(16*7-128,"ebx"),"xmm7");
940 #&vmovdqa ("xmm0",&QWP(16*8-128,"ebp"));
941 #&vmovdqa ("xmm1",&QWP(16*9-128,"ebp"));
942 &vmovdqa
("xmm2",&QWP
(16*10-128,"ebp"));
943 &vmovdqa
("xmm3",&QWP
(16*11-128,"ebp"));
944 &vmovdqa
("xmm4",&QWP
(16*12-128,"ebp"));
945 &vmovdqa
("xmm5",&QWP
(16*13-128,"ebp"));
946 &vmovdqa
("xmm6",&QWP
(16*14-128,"ebp"));
947 &vmovdqa
("xmm7",&QWP
(16*15-128,"ebp"));
948 &vpaddd
("xmm4","xmm4",&QWP
(16*4,"eax")); # counter value
949 #&vmovdqa (&QWP(16*8-128,"ebx"),"xmm0");
950 #&vmovdqa (&QWP(16*9-128,"ebx"),"xmm1");
951 &vmovdqa
(&QWP
(16*10-128,"ebx"),"xmm2");
952 &vmovdqa
(&QWP
(16*11-128,"ebx"),"xmm3");
953 &vmovdqa
(&QWP
(16*12-128,"ebx"),"xmm4");
954 &vmovdqa
(&QWP
(16*13-128,"ebx"),"xmm5");
955 &vmovdqa
(&QWP
(16*14-128,"ebx"),"xmm6");
956 &vmovdqa
(&QWP
(16*15-128,"ebx"),"xmm7");
957 &vmovdqa
(&QWP
(16*12-128,"ebp"),"xmm4"); # save counter value
959 &vmovdqa
($xa, &QWP
(16*0-128,"ebp"));
960 &vmovdqa
($xd, "xmm4");
961 &vmovdqa
($xb_,&QWP
(16*4-128,"ebp"));
962 &vmovdqa
($xc, &QWP
(16*8-128,"ebp"));
963 &vmovdqa
($xc_,&QWP
(16*9-128,"ebp"));
965 &mov
("edx",10); # loop counter
968 &set_label
("loop",32);
969 &vpaddd
($xa,$xa,$xb_); # elsewhere
970 &vpxor
($xd,$xd,$xa); # elsewhere
971 &QUARTERROUND_XOP
(0, 4, 8, 12, 0);
972 &QUARTERROUND_XOP
(1, 5, 9, 13, 1);
973 &QUARTERROUND_XOP
(2, 6,10, 14, 2);
974 &QUARTERROUND_XOP
(3, 7,11, 15, 3);
975 &QUARTERROUND_XOP
(0, 5,10, 15, 4);
976 &QUARTERROUND_XOP
(1, 6,11, 12, 5);
977 &QUARTERROUND_XOP
(2, 7, 8, 13, 6);
978 &QUARTERROUND_XOP
(3, 4, 9, 14, 7);
980 &jnz
(&label
("loop"));
982 &vmovdqa
(&QWP
(16*4-128,"ebx"),$xb_);
983 &vmovdqa
(&QWP
(16*8-128,"ebx"),$xc);
984 &vmovdqa
(&QWP
(16*9-128,"ebx"),$xc_);
985 &vmovdqa
(&QWP
(16*12-128,"ebx"),$xd);
986 &vmovdqa
(&QWP
(16*14-128,"ebx"),$xd_);
988 my ($xa0,$xa1,$xa2,$xa3,$xt0,$xt1,$xt2,$xt3)=map("xmm$_",(0..7));
990 #&vmovdqa ($xa0,&QWP(16*0-128,"ebx")); # it's there
991 &vmovdqa
($xa1,&QWP
(16*1-128,"ebx"));
992 &vmovdqa
($xa2,&QWP
(16*2-128,"ebx"));
993 &vmovdqa
($xa3,&QWP
(16*3-128,"ebx"));
995 for($i=0;$i<256;$i+=64) {
996 &vpaddd
($xa0,$xa0,&QWP
($i+16*0-128,"ebp")); # accumulate key material
997 &vpaddd
($xa1,$xa1,&QWP
($i+16*1-128,"ebp"));
998 &vpaddd
($xa2,$xa2,&QWP
($i+16*2-128,"ebp"));
999 &vpaddd
($xa3,$xa3,&QWP
($i+16*3-128,"ebp"));
1001 &vpunpckldq
($xt2,$xa0,$xa1); # "de-interlace" data
1002 &vpunpckldq
($xt3,$xa2,$xa3);
1003 &vpunpckhdq
($xa0,$xa0,$xa1);
1004 &vpunpckhdq
($xa2,$xa2,$xa3);
1005 &vpunpcklqdq
($xa1,$xt2,$xt3); # "a0"
1006 &vpunpckhqdq
($xt2,$xt2,$xt3); # "a1"
1007 &vpunpcklqdq
($xt3,$xa0,$xa2); # "a2"
1008 &vpunpckhqdq
($xa3,$xa0,$xa2); # "a3"
1010 &vpxor
($xt0,$xa1,&QWP
(64*0-128,$inp));
1011 &vpxor
($xt1,$xt2,&QWP
(64*1-128,$inp));
1012 &vpxor
($xt2,$xt3,&QWP
(64*2-128,$inp));
1013 &vpxor
($xt3,$xa3,&QWP
(64*3-128,$inp));
1014 &lea
($inp,&QWP
($i<192?
16:(64*4-16*3),$inp));
1015 &vmovdqa
($xa0,&QWP
($i+16*4-128,"ebx")) if ($i<192);
1016 &vmovdqa
($xa1,&QWP
($i+16*5-128,"ebx")) if ($i<192);
1017 &vmovdqa
($xa2,&QWP
($i+16*6-128,"ebx")) if ($i<192);
1018 &vmovdqa
($xa3,&QWP
($i+16*7-128,"ebx")) if ($i<192);
1019 &vmovdqu
(&QWP
(64*0-128,$out),$xt0); # store output
1020 &vmovdqu
(&QWP
(64*1-128,$out),$xt1);
1021 &vmovdqu
(&QWP
(64*2-128,$out),$xt2);
1022 &vmovdqu
(&QWP
(64*3-128,$out),$xt3);
1023 &lea
($out,&QWP
($i<192?
16:(64*4-16*3),$out));
1026 &jnc
(&label
("outer_loop"));
1029 &jz
(&label
("done"));
1031 &mov
("ebx",&DWP
(512+8,"esp")); # restore pointers
1032 &lea
($inp,&DWP
(-128,$inp));
1033 &mov
("edx",&DWP
(512+4,"esp"));
1034 &lea
($out,&DWP
(-128,$out));
1036 &vmovd
("xmm2",&DWP
(16*12-128,"ebp")); # counter value
1037 &vmovdqu
("xmm3",&QWP
(0,"ebx"));
1038 &vpaddd
("xmm2","xmm2",&QWP
(16*6,"eax"));# +four
1039 &vpand
("xmm3","xmm3",&QWP
(16*7,"eax"));
1040 &vpor
("xmm3","xmm3","xmm2"); # counter value
1042 my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("xmm$_",(0..7));
1063 &vmovdqa
($a,&QWP
(16*2,"eax")); # sigma
1064 &vmovdqu
($b,&QWP
(0,"edx"));
1065 &vmovdqu
($c,&QWP
(16,"edx"));
1066 #&vmovdqu ($d,&QWP(0,"ebx")); # already loaded
1067 &vmovdqa
($rot16,&QWP
(0,"eax"));
1068 &vmovdqa
($rot24,&QWP
(16,"eax"));
1069 &mov
(&DWP
(16*3,"esp"),"ebp");
1071 &vmovdqa
(&QWP
(16*0,"esp"),$a);
1072 &vmovdqa
(&QWP
(16*1,"esp"),$b);
1073 &vmovdqa
(&QWP
(16*2,"esp"),$c);
1074 &vmovdqa
(&QWP
(16*3,"esp"),$d);
1076 &jmp
(&label
("loop1x"));
1078 &set_label
("outer1x",16);
1079 &vmovdqa
($d,&QWP
(16*5,"eax")); # one
1080 &vmovdqa
($a,&QWP
(16*0,"esp"));
1081 &vmovdqa
($b,&QWP
(16*1,"esp"));
1082 &vmovdqa
($c,&QWP
(16*2,"esp"));
1083 &vpaddd
($d,$d,&QWP
(16*3,"esp"));
1085 &vmovdqa
(&QWP
(16*3,"esp"),$d);
1086 &jmp
(&label
("loop1x"));
1088 &set_label
("loop1x",16);
1090 &vpshufd
($c,$c,0b01001110
);
1091 &vpshufd
($b,$b,0b00111001
);
1092 &vpshufd
($d,$d,0b10010011
);
1095 &vpshufd
($c,$c,0b01001110
);
1096 &vpshufd
($b,$b,0b10010011
);
1097 &vpshufd
($d,$d,0b00111001
);
1100 &jnz
(&label
("loop1x"));
1102 &vpaddd
($a,$a,&QWP
(16*0,"esp"));
1103 &vpaddd
($b,$b,&QWP
(16*1,"esp"));
1104 &vpaddd
($c,$c,&QWP
(16*2,"esp"));
1105 &vpaddd
($d,$d,&QWP
(16*3,"esp"));
1108 &jb
(&label
("tail"));
1110 &vpxor
($a,$a,&QWP
(16*0,$inp)); # xor with input
1111 &vpxor
($b,$b,&QWP
(16*1,$inp));
1112 &vpxor
($c,$c,&QWP
(16*2,$inp));
1113 &vpxor
($d,$d,&QWP
(16*3,$inp));
1114 &lea
($inp,&DWP
(16*4,$inp)); # inp+=64
1116 &vmovdqu
(&QWP
(16*0,$out),$a); # write output
1117 &vmovdqu
(&QWP
(16*1,$out),$b);
1118 &vmovdqu
(&QWP
(16*2,$out),$c);
1119 &vmovdqu
(&QWP
(16*3,$out),$d);
1120 &lea
($out,&DWP
(16*4,$out)); # out+=64
1123 &jnz
(&label
("outer1x"));
1125 &jmp
(&label
("done"));
1128 &vmovdqa
(&QWP
(16*0,"esp"),$a);
1129 &vmovdqa
(&QWP
(16*1,"esp"),$b);
1130 &vmovdqa
(&QWP
(16*2,"esp"),$c);
1131 &vmovdqa
(&QWP
(16*3,"esp"),$d);
1137 &set_label
("tail_loop");
1138 &movb
("al",&BP
(0,"esp","ebp"));
1139 &movb
("dl",&BP
(0,$inp,"ebp"));
1140 &lea
("ebp",&DWP
(1,"ebp"));
1142 &movb
(&BP
(-1,$out,"ebp"),"al");
1144 &jnz
(&label
("tail_loop"));
1148 &mov
("esp",&DWP
(512,"esp"));
1149 &function_end
("ChaCha20_xop");
# STDOUT was redirected to the output file at the top of the script; checking
# close here reports a failure to close it instead of ignoring it.
1154 close STDOUT
or die "error closing STDOUT: $!";