#! /usr/bin/env perl
# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# January 2015
#
# ChaCha20 for x86.
#
# Performance in cycles per byte out of large buffer.
#
#               1xIALU/gcc      4xSSSE3
# Pentium       17.5/+80%
# PIII          14.2/+60%
# P4            18.6/+84%
# Core2         9.56/+89%       4.83
# Westmere      9.50/+45%       3.35
# Sandy Bridge  10.5/+47%       3.20
# Haswell       8.15/+50%       2.83
# Silvermont    17.4/+36%       8.35
# Sledgehammer  10.2/+54%
# Bulldozer     13.4/+50%       4.38(*)
#
# (*) Bulldozer actually executes 4xXOP code path that delivers 3.55;

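# For orientation only (not part of the original file): the generated
# routines are expected to match the interface used by OpenSSL's C glue
# (crypto/chacha/chacha_enc.c), roughly
#
#       void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp,
#                           size_t len, const unsigned int key[8],
#                           const unsigned int counter[4]);
#
# i.e. wparam(0..4) below are out, inp, len, key, and counter||nonce.
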
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

$output=pop;
open STDOUT,">$output";

&asm_init($ARGV[0],"chacha-x86.pl",$ARGV[$#ARGV] eq "386");
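# Build-time usage (a sketch of how the build system is assumed to drive
# perlasm scripts; exact flags come from the OpenSSL build):
#
#       perl chacha-x86.pl <flavour> [flags ...] <output file>
#
# The last argument is popped above and used as the output file, the first
# selects the perlasm flavour (e.g. elf, coff, win32, win32n), and a literal
# "386" as the last flag requests 386-only code generation.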

$xmm=$ymm=0;
for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); }

$ymm=1 if ($xmm &&
`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
$1>=2.19); # first version supporting AVX

$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32n" &&
`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
$1>=2.03); # first version supporting AVX

$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32" &&
`ml 2>&1` =~ /Version ([0-9]+)\./ &&
$1>=10); # first version supporting AVX

$ymm=1 if ($xmm && !$ymm &&
`$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/ &&
$2>=3.0); # first version supporting AVX
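
# $xmm gates assembly of the SSSE3 code path (OPENSSL_IA32_SSE2 build flag);
# $ymm additionally gates the AVX/XOP code path and is only set when the
# assembler in use is new enough to understand AVX encodings (the version
# probes above).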

$a="eax";
($b,$b_)=("ebx","ebp");
($c,$c_)=("ecx","esi");
($d,$d_)=("edx","edi");

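# Only seven general-purpose registers are available for sixteen state
# words, so each of b/c/d has a "shadow" register ($b_/$c_/$d_) holding the
# corresponding word of the next quarter-round while the current one is in
# flight; QUARTERROUND spills and reloads the remaining words on the stack
# and swaps the pairs on exit.
#
# For orientation, one ChaCha quarter-round on words (a,b,c,d) is, in
# reference pseudocode (RFC 7539; added here for readability):
#
#       a += b; d ^= a; d = ROTL32(d,16);
#       c += d; b ^= c; b = ROTL32(b,12);
#       a += b; d ^= a; d = ROTL32(d, 8);
#       c += d; b ^= c; b = ROTL32(b, 7);
#
# The leading "a += b" of each quarter-round is hoisted into the caller or
# the tail of the previous quarter-round (the "see elsewhere" comments
# below).
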
sub QUARTERROUND {
my ($ai,$bi,$ci,$di,$i)=@_;
my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di)); # next
my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di)); # previous

#   a   b   c   d
#
#   0   4   8  12 < even round
#   1   5   9  13
#   2   6  10  14
#   3   7  11  15
#   0   5  10  15 < odd round
#   1   6  11  12
#   2   7   8  13
#   3   4   9  14

if ($i==0) {
my $j=4;
($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp));
} elsif ($i==3) {
my $j=0;
($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn));
} elsif ($i==4) {
my $j=4;
($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp));
} elsif ($i==7) {
my $j=0;
($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn));
}

#&add ($a,$b); # see elsewhere
&xor ($d,$a);
&mov (&DWP(4*$cp,"esp"),$c_) if ($ai>0 && $ai<3);
&rol ($d,16);
&mov (&DWP(4*$bp,"esp"),$b_) if ($i!=0);
&add ($c,$d);
&mov ($c_,&DWP(4*$cn,"esp")) if ($ai>0 && $ai<3);
&xor ($b,$c);
&mov ($d_,&DWP(4*$dn,"esp")) if ($di!=$dn);
&rol ($b,12);
&mov ($b_,&DWP(4*$bn,"esp")) if ($i<7);
&mov ($b_,&DWP(128,"esp")) if ($i==7); # loop counter
&add ($a,$b);
&xor ($d,$a);
&mov (&DWP(4*$ai,"esp"),$a);
&rol ($d,8);
&mov ($a,&DWP(4*$an,"esp"));
&add ($c,$d);
&mov (&DWP(4*$di,"esp"),$d) if ($di!=$dn);
&mov ($d_,$d) if ($di==$dn);
&xor ($b,$c);
&add ($a,$b_) if ($i<7); # elsewhere
&rol ($b,7);

($b,$b_)=($b_,$b);
($c,$c_)=($c_,$c);
($d,$d_)=($d_,$d);
}
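
# Note that loads and stores belonging to the next/previous quarter-round
# are interleaved with the arithmetic above (the conditional &mov's and the
# "elsewhere" add), i.e. the quarter-rounds are software-pipelined; $i==7
# additionally reloads the outer-loop counter from esp+128.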

&static_label("ssse3_shortcut");
&static_label("xop_shortcut");
&static_label("ssse3_data");
&static_label("pic_point");

&function_begin("ChaCha20_ctr32");
&xor ("eax","eax");
&cmp ("eax",&wparam(2)); # len==0?
&je (&label("no_data"));
if ($xmm) {
&call (&label("pic_point"));
&set_label("pic_point");
&blindpop("eax");
&picmeup("ebp","OPENSSL_ia32cap_P","eax",&label("pic_point"));
&test (&DWP(0,"ebp"),1<<24); # test FXSR bit
&jz (&label("x86"));
&test (&DWP(4,"ebp"),1<<9); # test SSSE3 bit
&jz (&label("x86"));
&jmp (&label("ssse3_shortcut"));
&set_label("x86");
}
&mov ("esi",&wparam(3)); # key
&mov ("edi",&wparam(4)); # counter and nonce

&stack_push(33);
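# Stack frame (33 dwords): esp+0..63 holds the working 4x4 state matrix,
# esp+64..127 a copy of the key/counter/nonce ("key material") that the
# state is rebuilt from and accumulated with, and esp+128 the saved
# double-round loop counter.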

&mov ("eax",&DWP(4*0,"esi")); # copy key
&mov ("ebx",&DWP(4*1,"esi"));
&mov ("ecx",&DWP(4*2,"esi"));
&mov ("edx",&DWP(4*3,"esi"));
&mov (&DWP(64+4*4,"esp"),"eax");
&mov (&DWP(64+4*5,"esp"),"ebx");
&mov (&DWP(64+4*6,"esp"),"ecx");
&mov (&DWP(64+4*7,"esp"),"edx");
&mov ("eax",&DWP(4*4,"esi"));
&mov ("ebx",&DWP(4*5,"esi"));
&mov ("ecx",&DWP(4*6,"esi"));
&mov ("edx",&DWP(4*7,"esi"));
&mov (&DWP(64+4*8,"esp"),"eax");
&mov (&DWP(64+4*9,"esp"),"ebx");
&mov (&DWP(64+4*10,"esp"),"ecx");
&mov (&DWP(64+4*11,"esp"),"edx");
&mov ("eax",&DWP(4*0,"edi")); # copy counter and nonce
&mov ("ebx",&DWP(4*1,"edi"));
&mov ("ecx",&DWP(4*2,"edi"));
&mov ("edx",&DWP(4*3,"edi"));
&sub ("eax",1);
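# The saved counter is pre-decremented here because each pass through the
# outer loop below increments it before use ("&add ($d,1)").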
&mov (&DWP(64+4*12,"esp"),"eax");
&mov (&DWP(64+4*13,"esp"),"ebx");
&mov (&DWP(64+4*14,"esp"),"ecx");
&mov (&DWP(64+4*15,"esp"),"edx");
&jmp (&label("entry"));

&set_label("outer_loop",16);
&mov (&wparam(1),$b); # save input
&mov (&wparam(0),$a); # save output
&mov (&wparam(2),$c); # save len
&set_label("entry");
&mov ($a,0x61707865);
&mov (&DWP(4*1,"esp"),0x3320646e);
&mov (&DWP(4*2,"esp"),0x79622d32);
&mov (&DWP(4*3,"esp"),0x6b206574);

&mov ($b, &DWP(64+4*5,"esp")); # copy key material
&mov ($b_,&DWP(64+4*6,"esp"));
&mov ($c, &DWP(64+4*10,"esp"));
&mov ($c_,&DWP(64+4*11,"esp"));
&mov ($d, &DWP(64+4*13,"esp"));
&mov ($d_,&DWP(64+4*14,"esp"));
&mov (&DWP(4*5,"esp"),$b);
&mov (&DWP(4*6,"esp"),$b_);
&mov (&DWP(4*10,"esp"),$c);
&mov (&DWP(4*11,"esp"),$c_);
&mov (&DWP(4*13,"esp"),$d);
&mov (&DWP(4*14,"esp"),$d_);

&mov ($b, &DWP(64+4*7,"esp"));
&mov ($d_,&DWP(64+4*15,"esp"));
&mov ($d, &DWP(64+4*12,"esp"));
&mov ($b_,&DWP(64+4*4,"esp"));
&mov ($c, &DWP(64+4*8,"esp"));
&mov ($c_,&DWP(64+4*9,"esp"));
&add ($d,1); # counter value
&mov (&DWP(4*7,"esp"),$b);
&mov (&DWP(4*15,"esp"),$d_);
&mov (&DWP(64+4*12,"esp"),$d); # save counter value

&mov ($b,10); # loop counter
&jmp (&label("loop"));

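# One pass through .Lloop is one ChaCha double round: QUARTERROUNDs 0-3
# form the column round and 4-7 the diagonal round, so ten iterations give
# the full twenty rounds.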
&set_label("loop",16);
&add ($a,$b_); # elsewhere
&mov (&DWP(128,"esp"),$b); # save loop counter
&mov ($b,$b_);
&QUARTERROUND(0, 4, 8, 12, 0);
&QUARTERROUND(1, 5, 9, 13, 1);
&QUARTERROUND(2, 6,10, 14, 2);
&QUARTERROUND(3, 7,11, 15, 3);
&QUARTERROUND(0, 5,10, 15, 4);
&QUARTERROUND(1, 6,11, 12, 5);
&QUARTERROUND(2, 7, 8, 13, 6);
&QUARTERROUND(3, 4, 9, 14, 7);
&dec ($b);
&jnz (&label("loop"));

&mov ($b,&wparam(2)); # load len

&add ($a,0x61707865); # accumulate key material
&add ($b_,&DWP(64+4*4,"esp"));
&add ($c, &DWP(64+4*8,"esp"));
&add ($c_,&DWP(64+4*9,"esp"));

&cmp ($b,64);
&jb (&label("tail"));

&mov ($b,&wparam(1)); # load input pointer
&add ($d, &DWP(64+4*12,"esp"));
&add ($d_,&DWP(64+4*14,"esp"));

&xor ($a, &DWP(4*0,$b)); # xor with input
&xor ($b_,&DWP(4*4,$b));
&mov (&DWP(4*0,"esp"),$a);
&mov ($a,&wparam(0)); # load output pointer
&xor ($c, &DWP(4*8,$b));
&xor ($c_,&DWP(4*9,$b));
&xor ($d, &DWP(4*12,$b));
&xor ($d_,&DWP(4*14,$b));
&mov (&DWP(4*4,$a),$b_); # write output
&mov (&DWP(4*8,$a),$c);
&mov (&DWP(4*9,$a),$c_);
&mov (&DWP(4*12,$a),$d);
&mov (&DWP(4*14,$a),$d_);

&mov ($b_,&DWP(4*1,"esp"));
&mov ($c, &DWP(4*2,"esp"));
&mov ($c_,&DWP(4*3,"esp"));
&mov ($d, &DWP(4*5,"esp"));
&mov ($d_,&DWP(4*6,"esp"));
&add ($b_,0x3320646e); # accumulate key material
&add ($c, 0x79622d32);
&add ($c_,0x6b206574);
&add ($d, &DWP(64+4*5,"esp"));
&add ($d_,&DWP(64+4*6,"esp"));
&xor ($b_,&DWP(4*1,$b));
&xor ($c, &DWP(4*2,$b));
&xor ($c_,&DWP(4*3,$b));
&xor ($d, &DWP(4*5,$b));
&xor ($d_,&DWP(4*6,$b));
&mov (&DWP(4*1,$a),$b_);
&mov (&DWP(4*2,$a),$c);
&mov (&DWP(4*3,$a),$c_);
&mov (&DWP(4*5,$a),$d);
&mov (&DWP(4*6,$a),$d_);

&mov ($b_,&DWP(4*7,"esp"));
&mov ($c, &DWP(4*10,"esp"));
&mov ($c_,&DWP(4*11,"esp"));
&mov ($d, &DWP(4*13,"esp"));
&mov ($d_,&DWP(4*15,"esp"));
&add ($b_,&DWP(64+4*7,"esp"));
&add ($c, &DWP(64+4*10,"esp"));
&add ($c_,&DWP(64+4*11,"esp"));
&add ($d, &DWP(64+4*13,"esp"));
&add ($d_,&DWP(64+4*15,"esp"));
&xor ($b_,&DWP(4*7,$b));
&xor ($c, &DWP(4*10,$b));
&xor ($c_,&DWP(4*11,$b));
&xor ($d, &DWP(4*13,$b));
&xor ($d_,&DWP(4*15,$b));
&lea ($b,&DWP(4*16,$b));
&mov (&DWP(4*7,$a),$b_);
&mov ($b_,&DWP(4*0,"esp"));
&mov (&DWP(4*10,$a),$c);
&mov ($c,&wparam(2)); # len
&mov (&DWP(4*11,$a),$c_);
&mov (&DWP(4*13,$a),$d);
&mov (&DWP(4*15,$a),$d_);
&mov (&DWP(4*0,$a),$b_);
&lea ($a,&DWP(4*16,$a));
&sub ($c,64);
&jnz (&label("outer_loop"));

&jmp (&label("done"));

&set_label("tail");
&add ($d, &DWP(64+4*12,"esp"));
&add ($d_,&DWP(64+4*14,"esp"));
&mov (&DWP(4*0,"esp"),$a);
&mov (&DWP(4*4,"esp"),$b_);
&mov (&DWP(4*8,"esp"),$c);
&mov (&DWP(4*9,"esp"),$c_);
&mov (&DWP(4*12,"esp"),$d);
&mov (&DWP(4*14,"esp"),$d_);

&mov ($b_,&DWP(4*1,"esp"));
&mov ($c, &DWP(4*2,"esp"));
&mov ($c_,&DWP(4*3,"esp"));
&mov ($d, &DWP(4*5,"esp"));
&mov ($d_,&DWP(4*6,"esp"));
&add ($b_,0x3320646e); # accumulate key material
&add ($c, 0x79622d32);
&add ($c_,0x6b206574);
&add ($d, &DWP(64+4*5,"esp"));
&add ($d_,&DWP(64+4*6,"esp"));
&mov (&DWP(4*1,"esp"),$b_);
&mov (&DWP(4*2,"esp"),$c);
&mov (&DWP(4*3,"esp"),$c_);
&mov (&DWP(4*5,"esp"),$d);
&mov (&DWP(4*6,"esp"),$d_);

&mov ($b_,&DWP(4*7,"esp"));
&mov ($c, &DWP(4*10,"esp"));
&mov ($c_,&DWP(4*11,"esp"));
&mov ($d, &DWP(4*13,"esp"));
&mov ($d_,&DWP(4*15,"esp"));
&add ($b_,&DWP(64+4*7,"esp"));
&add ($c, &DWP(64+4*10,"esp"));
&add ($c_,&DWP(64+4*11,"esp"));
&add ($d, &DWP(64+4*13,"esp"));
&add ($d_,&DWP(64+4*15,"esp"));
&mov (&DWP(4*7,"esp"),$b_);
&mov ($b_,&wparam(1)); # load input
&mov (&DWP(4*10,"esp"),$c);
&mov ($c,&wparam(0)); # load output
&mov (&DWP(4*11,"esp"),$c_);
&xor ($c_,$c_);
&mov (&DWP(4*13,"esp"),$d);
&mov (&DWP(4*15,"esp"),$d_);

&xor ("eax","eax");
&xor ("edx","edx");
&set_label("tail_loop");
&movb ("al",&BP(0,$c_,$b_));
&movb ("dl",&BP(0,"esp",$c_));
&lea ($c_,&DWP(1,$c_));
&xor ("al","dl");
&mov (&BP(-1,$c,$c_),"al");
&dec ($b);
&jnz (&label("tail_loop"));

&set_label("done");
&stack_pop(33);
&set_label("no_data");
&function_end("ChaCha20_ctr32");

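# The SSSE3 code path below processes four 64-byte blocks in parallel. The
# 4x4 state is kept "vertically": sixteen xmm vectors, each holding the
# same state word for all four blocks, live at ebx-128..ebx+127, with the
# corresponding key material at ebp-128..ebp+127 (both pointers are biased
# by 128, the "size optimization" noted below, so that one-byte
# displacements cover all sixteen slots).
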
if ($xmm) {
my ($xa,$xa_,$xb,$xb_,$xc,$xc_,$xd,$xd_)=map("xmm$_",(0..7));
my ($out,$inp,$len)=("edi","esi","ecx");

sub QUARTERROUND_SSSE3 {
my ($ai,$bi,$ci,$di,$i)=@_;
my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di)); # next
my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di)); # previous

#   a   b   c   d
#
#   0   4   8  12 < even round
#   1   5   9  13
#   2   6  10  14
#   3   7  11  15
#   0   5  10  15 < odd round
#   1   6  11  12
#   2   7   8  13
#   3   4   9  14

if ($i==0) {
my $j=4;
($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp));
} elsif ($i==3) {
my $j=0;
($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn));
} elsif ($i==4) {
my $j=4;
($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp));
} elsif ($i==7) {
my $j=0;
($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn));
}

#&paddd ($xa,$xb); # see elsewhere
#&pxor ($xd,$xa); # see elsewhere
&movdqa(&QWP(16*$cp-128,"ebx"),$xc_) if ($ai>0 && $ai<3);
&pshufb ($xd,&QWP(0,"eax")); # rot16
&movdqa(&QWP(16*$bp-128,"ebx"),$xb_) if ($i!=0);
&paddd ($xc,$xd);
&movdqa($xc_,&QWP(16*$cn-128,"ebx")) if ($ai>0 && $ai<3);
&pxor ($xb,$xc);
&movdqa($xb_,&QWP(16*$bn-128,"ebx")) if ($i<7);
&movdqa ($xa_,$xb); # borrow as temporary
&pslld ($xb,12);
&psrld ($xa_,20);
&por ($xb,$xa_);
&movdqa($xa_,&QWP(16*$an-128,"ebx"));
&paddd ($xa,$xb);
&movdqa($xd_,&QWP(16*$dn-128,"ebx")) if ($di!=$dn);
&pxor ($xd,$xa);
&movdqa (&QWP(16*$ai-128,"ebx"),$xa);
&pshufb ($xd,&QWP(16,"eax")); # rot8
&paddd ($xc,$xd);
&movdqa (&QWP(16*$di-128,"ebx"),$xd) if ($di!=$dn);
&movdqa ($xd_,$xd) if ($di==$dn);
&pxor ($xb,$xc);
&paddd ($xa_,$xb_) if ($i<7); # elsewhere
&movdqa ($xa,$xb); # borrow as temporary
&pslld ($xb,7);
&psrld ($xa,25);
&pxor ($xd_,$xa_) if ($i<7); # elsewhere
&por ($xb,$xa);

($xa,$xa_)=($xa_,$xa);
($xb,$xb_)=($xb_,$xb);
($xc,$xc_)=($xc_,$xc);
($xd,$xd_)=($xd_,$xd);
}
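
# Rotations by 16 and 8 are done with pshufb byte shuffles (masks loaded
# from the ssse3_data table through "eax"), while 12 and 7 fall back to
# pslld/psrld/por pairs; the same software pipelining as in the scalar
# QUARTERROUND applies.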

&function_begin("ChaCha20_ssse3");
&set_label("ssse3_shortcut");
if ($ymm) {
&test (&DWP(4,"ebp"),1<<11); # test XOP bit
&jnz (&label("xop_shortcut"));
}

&mov ($out,&wparam(0));
&mov ($inp,&wparam(1));
&mov ($len,&wparam(2));
&mov ("edx",&wparam(3)); # key
&mov ("ebx",&wparam(4)); # counter and nonce

&mov ("ebp","esp");
&stack_push (131);
&and ("esp",-64);
&mov (&DWP(512,"esp"),"ebp");

&lea ("eax",&DWP(&label("ssse3_data")."-".
&label("pic_point"),"eax"));
&movdqu ("xmm3",&QWP(0,"ebx")); # counter and nonce

&cmp ($len,64*4);
&jb (&label("1x"));

&mov (&DWP(512+4,"esp"),"edx"); # offload pointers
&mov (&DWP(512+8,"esp"),"ebx");
&sub ($len,64*4); # bias len
&lea ("ebp",&DWP(256+128,"esp")); # size optimization

&movdqu ("xmm7",&QWP(0,"edx")); # key
&pshufd ("xmm0","xmm3",0x00);
&pshufd ("xmm1","xmm3",0x55);
&pshufd ("xmm2","xmm3",0xaa);
&pshufd ("xmm3","xmm3",0xff);
&paddd ("xmm0",&QWP(16*3,"eax")); # fix counters
&pshufd ("xmm4","xmm7",0x00);
&pshufd ("xmm5","xmm7",0x55);
&psubd ("xmm0",&QWP(16*4,"eax"));
&pshufd ("xmm6","xmm7",0xaa);
&pshufd ("xmm7","xmm7",0xff);
&movdqa (&QWP(16*12-128,"ebp"),"xmm0");
&movdqa (&QWP(16*13-128,"ebp"),"xmm1");
&movdqa (&QWP(16*14-128,"ebp"),"xmm2");
&movdqa (&QWP(16*15-128,"ebp"),"xmm3");
&movdqu ("xmm3",&QWP(16,"edx")); # key
&movdqa (&QWP(16*4-128,"ebp"),"xmm4");
&movdqa (&QWP(16*5-128,"ebp"),"xmm5");
&movdqa (&QWP(16*6-128,"ebp"),"xmm6");
&movdqa (&QWP(16*7-128,"ebp"),"xmm7");
&movdqa ("xmm7",&QWP(16*2,"eax")); # sigma
&lea ("ebx",&DWP(128,"esp")); # size optimization

&pshufd ("xmm0","xmm3",0x00);
&pshufd ("xmm1","xmm3",0x55);
&pshufd ("xmm2","xmm3",0xaa);
&pshufd ("xmm3","xmm3",0xff);
&pshufd ("xmm4","xmm7",0x00);
&pshufd ("xmm5","xmm7",0x55);
&pshufd ("xmm6","xmm7",0xaa);
&pshufd ("xmm7","xmm7",0xff);
&movdqa (&QWP(16*8-128,"ebp"),"xmm0");
&movdqa (&QWP(16*9-128,"ebp"),"xmm1");
&movdqa (&QWP(16*10-128,"ebp"),"xmm2");
&movdqa (&QWP(16*11-128,"ebp"),"xmm3");
&movdqa (&QWP(16*0-128,"ebp"),"xmm4");
&movdqa (&QWP(16*1-128,"ebp"),"xmm5");
&movdqa (&QWP(16*2-128,"ebp"),"xmm6");
&movdqa (&QWP(16*3-128,"ebp"),"xmm7");

&lea ($inp,&DWP(128,$inp)); # size optimization
&lea ($out,&DWP(128,$out)); # size optimization
&jmp (&label("outer_loop"));

&set_label("outer_loop",16);
#&movdqa ("xmm0",&QWP(16*0-128,"ebp")); # copy key material
&movdqa ("xmm1",&QWP(16*1-128,"ebp"));
&movdqa ("xmm2",&QWP(16*2-128,"ebp"));
&movdqa ("xmm3",&QWP(16*3-128,"ebp"));
#&movdqa ("xmm4",&QWP(16*4-128,"ebp"));
&movdqa ("xmm5",&QWP(16*5-128,"ebp"));
&movdqa ("xmm6",&QWP(16*6-128,"ebp"));
&movdqa ("xmm7",&QWP(16*7-128,"ebp"));
#&movdqa (&QWP(16*0-128,"ebx"),"xmm0");
&movdqa (&QWP(16*1-128,"ebx"),"xmm1");
&movdqa (&QWP(16*2-128,"ebx"),"xmm2");
&movdqa (&QWP(16*3-128,"ebx"),"xmm3");
#&movdqa (&QWP(16*4-128,"ebx"),"xmm4");
&movdqa (&QWP(16*5-128,"ebx"),"xmm5");
&movdqa (&QWP(16*6-128,"ebx"),"xmm6");
&movdqa (&QWP(16*7-128,"ebx"),"xmm7");
#&movdqa ("xmm0",&QWP(16*8-128,"ebp"));
#&movdqa ("xmm1",&QWP(16*9-128,"ebp"));
&movdqa ("xmm2",&QWP(16*10-128,"ebp"));
&movdqa ("xmm3",&QWP(16*11-128,"ebp"));
&movdqa ("xmm4",&QWP(16*12-128,"ebp"));
&movdqa ("xmm5",&QWP(16*13-128,"ebp"));
&movdqa ("xmm6",&QWP(16*14-128,"ebp"));
&movdqa ("xmm7",&QWP(16*15-128,"ebp"));
&paddd ("xmm4",&QWP(16*4,"eax")); # counter value
#&movdqa (&QWP(16*8-128,"ebx"),"xmm0");
#&movdqa (&QWP(16*9-128,"ebx"),"xmm1");
&movdqa (&QWP(16*10-128,"ebx"),"xmm2");
&movdqa (&QWP(16*11-128,"ebx"),"xmm3");
&movdqa (&QWP(16*12-128,"ebx"),"xmm4");
&movdqa (&QWP(16*13-128,"ebx"),"xmm5");
&movdqa (&QWP(16*14-128,"ebx"),"xmm6");
&movdqa (&QWP(16*15-128,"ebx"),"xmm7");
&movdqa (&QWP(16*12-128,"ebp"),"xmm4"); # save counter value

&movdqa ($xa, &QWP(16*0-128,"ebp"));
&movdqa ($xd, "xmm4");
&movdqa ($xb_,&QWP(16*4-128,"ebp"));
&movdqa ($xc, &QWP(16*8-128,"ebp"));
&movdqa ($xc_,&QWP(16*9-128,"ebp"));

&mov ("edx",10); # loop counter
&nop ();

&set_label("loop",16);
&paddd ($xa,$xb_); # elsewhere
&movdqa ($xb,$xb_);
&pxor ($xd,$xa); # elsewhere
&QUARTERROUND_SSSE3(0, 4, 8, 12, 0);
&QUARTERROUND_SSSE3(1, 5, 9, 13, 1);
&QUARTERROUND_SSSE3(2, 6,10, 14, 2);
&QUARTERROUND_SSSE3(3, 7,11, 15, 3);
&QUARTERROUND_SSSE3(0, 5,10, 15, 4);
&QUARTERROUND_SSSE3(1, 6,11, 12, 5);
&QUARTERROUND_SSSE3(2, 7, 8, 13, 6);
&QUARTERROUND_SSSE3(3, 4, 9, 14, 7);
&dec ("edx");
&jnz (&label("loop"));

&movdqa (&QWP(16*4-128,"ebx"),$xb_);
&movdqa (&QWP(16*8-128,"ebx"),$xc);
&movdqa (&QWP(16*9-128,"ebx"),$xc_);
&movdqa (&QWP(16*12-128,"ebx"),$xd);
&movdqa (&QWP(16*14-128,"ebx"),$xd_);

my ($xa0,$xa1,$xa2,$xa3,$xt0,$xt1,$xt2,$xt3)=map("xmm$_",(0..7));

#&movdqa ($xa0,&QWP(16*0-128,"ebx")); # it's there
&movdqa ($xa1,&QWP(16*1-128,"ebx"));
&movdqa ($xa2,&QWP(16*2-128,"ebx"));
&movdqa ($xa3,&QWP(16*3-128,"ebx"));

for($i=0;$i<256;$i+=64) {
&paddd ($xa0,&QWP($i+16*0-128,"ebp")); # accumulate key material
&paddd ($xa1,&QWP($i+16*1-128,"ebp"));
&paddd ($xa2,&QWP($i+16*2-128,"ebp"));
&paddd ($xa3,&QWP($i+16*3-128,"ebp"));

&movdqa ($xt2,$xa0); # "de-interlace" data
&punpckldq ($xa0,$xa1);
&movdqa ($xt3,$xa2);
&punpckldq ($xa2,$xa3);
&punpckhdq ($xt2,$xa1);
&punpckhdq ($xt3,$xa3);
&movdqa ($xa1,$xa0);
&punpcklqdq ($xa0,$xa2); # "a0"
&movdqa ($xa3,$xt2);
&punpcklqdq ($xt2,$xt3); # "a2"
&punpckhqdq ($xa1,$xa2); # "a1"
&punpckhqdq ($xa3,$xt3); # "a3"

#($xa2,$xt2)=($xt2,$xa2);

&movdqu ($xt0,&QWP(64*0-128,$inp)); # load input
&movdqu ($xt1,&QWP(64*1-128,$inp));
&movdqu ($xa2,&QWP(64*2-128,$inp));
&movdqu ($xt3,&QWP(64*3-128,$inp));
&lea ($inp,&QWP($i<192?16:(64*4-16*3),$inp));
&pxor ($xt0,$xa0);
&movdqa ($xa0,&QWP($i+16*4-128,"ebx")) if ($i<192);
&pxor ($xt1,$xa1);
&movdqa ($xa1,&QWP($i+16*5-128,"ebx")) if ($i<192);
&pxor ($xt2,$xa2);
&movdqa ($xa2,&QWP($i+16*6-128,"ebx")) if ($i<192);
&pxor ($xt3,$xa3);
&movdqa ($xa3,&QWP($i+16*7-128,"ebx")) if ($i<192);
&movdqu (&QWP(64*0-128,$out),$xt0); # store output
&movdqu (&QWP(64*1-128,$out),$xt1);
&movdqu (&QWP(64*2-128,$out),$xt2);
&movdqu (&QWP(64*3-128,$out),$xt3);
&lea ($out,&QWP($i<192?16:(64*4-16*3),$out));
}
&sub ($len,64*4);
&jnc (&label("outer_loop"));

&add ($len,64*4);
&jz (&label("done"));

&mov ("ebx",&DWP(512+8,"esp")); # restore pointers
&lea ($inp,&DWP(-128,$inp));
&mov ("edx",&DWP(512+4,"esp"));
&lea ($out,&DWP(-128,$out));

&movd ("xmm2",&DWP(16*12-128,"ebp")); # counter value
&movdqu ("xmm3",&QWP(0,"ebx"));
&paddd ("xmm2",&QWP(16*6,"eax")); # +four
&pand ("xmm3",&QWP(16*7,"eax"));
&por ("xmm3","xmm2"); # counter value
{
my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("xmm$_",(0..7));

sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round
&paddd ($a,$b);
&pxor ($d,$a);
&pshufb ($d,$rot16);

&paddd ($c,$d);
&pxor ($b,$c);
&movdqa ($t,$b);
&psrld ($b,20);
&pslld ($t,12);
&por ($b,$t);

&paddd ($a,$b);
&pxor ($d,$a);
&pshufb ($d,$rot24);

&paddd ($c,$d);
&pxor ($b,$c);
&movdqa ($t,$b);
&psrld ($b,25);
&pslld ($t,7);
&por ($b,$t);
}
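
# The "1x" path below keeps one whole block in four xmm registers, one row
# of the state matrix per register. Instead of indexing columns and
# diagonals separately, it rotates the b/c/d rows across lanes with pshufd
# between the two half-rounds (0b00111001 etc. are lane permutations),
# which is the usual single-block SSE formulation.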

&set_label("1x");
&movdqa ($a,&QWP(16*2,"eax")); # sigma
&movdqu ($b,&QWP(0,"edx"));
&movdqu ($c,&QWP(16,"edx"));
#&movdqu ($d,&QWP(0,"ebx")); # already loaded
&movdqa ($rot16,&QWP(0,"eax"));
&movdqa ($rot24,&QWP(16,"eax"));
&mov (&DWP(16*3,"esp"),"ebp");

&movdqa (&QWP(16*0,"esp"),$a);
&movdqa (&QWP(16*1,"esp"),$b);
&movdqa (&QWP(16*2,"esp"),$c);
&movdqa (&QWP(16*3,"esp"),$d);
&mov ("edx",10);
&jmp (&label("loop1x"));

&set_label("outer1x",16);
&movdqa ($d,&QWP(16*5,"eax")); # one
&movdqa ($a,&QWP(16*0,"esp"));
&movdqa ($b,&QWP(16*1,"esp"));
&movdqa ($c,&QWP(16*2,"esp"));
&paddd ($d,&QWP(16*3,"esp"));
&mov ("edx",10);
&movdqa (&QWP(16*3,"esp"),$d);
&jmp (&label("loop1x"));

&set_label("loop1x",16);
&SSSE3ROUND();
&pshufd ($c,$c,0b01001110);
&pshufd ($b,$b,0b00111001);
&pshufd ($d,$d,0b10010011);
&nop ();

&SSSE3ROUND();
&pshufd ($c,$c,0b01001110);
&pshufd ($b,$b,0b10010011);
&pshufd ($d,$d,0b00111001);

&dec ("edx");
&jnz (&label("loop1x"));

&paddd ($a,&QWP(16*0,"esp"));
&paddd ($b,&QWP(16*1,"esp"));
&paddd ($c,&QWP(16*2,"esp"));
&paddd ($d,&QWP(16*3,"esp"));

&cmp ($len,64);
&jb (&label("tail"));

&movdqu ($t,&QWP(16*0,$inp));
&movdqu ($t1,&QWP(16*1,$inp));
&pxor ($a,$t); # xor with input
&movdqu ($t,&QWP(16*2,$inp));
&pxor ($b,$t1);
&movdqu ($t1,&QWP(16*3,$inp));
&pxor ($c,$t);
&pxor ($d,$t1);
&lea ($inp,&DWP(16*4,$inp)); # inp+=64

&movdqu (&QWP(16*0,$out),$a); # write output
&movdqu (&QWP(16*1,$out),$b);
&movdqu (&QWP(16*2,$out),$c);
&movdqu (&QWP(16*3,$out),$d);
&lea ($out,&DWP(16*4,$out)); # out+=64

&sub ($len,64);
&jnz (&label("outer1x"));

&jmp (&label("done"));

&set_label("tail");
&movdqa (&QWP(16*0,"esp"),$a);
&movdqa (&QWP(16*1,"esp"),$b);
&movdqa (&QWP(16*2,"esp"),$c);
&movdqa (&QWP(16*3,"esp"),$d);

&xor ("eax","eax");
&xor ("edx","edx");
&xor ("ebp","ebp");

&set_label("tail_loop");
&movb ("al",&BP(0,"esp","ebp"));
&movb ("dl",&BP(0,$inp,"ebp"));
&lea ("ebp",&DWP(1,"ebp"));
&xor ("al","dl");
&movb (&BP(-1,$out,"ebp"),"al");
&dec ($len);
&jnz (&label("tail_loop"));
}
&set_label("done");
&mov ("esp",&DWP(512,"esp"));
&function_end("ChaCha20_ssse3");

&align (64);
&set_label("ssse3_data");
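# Layout of the ssse3_data table (addressed through "eax" above):
#   +0    byte-shuffle mask rotating each dword left by 16
#   +16   byte-shuffle mask rotating each dword left by 8 (a.k.a. rot24)
#   +32   ChaCha "sigma" constant ("expa" "nd 3" "2-by" "te k")
#   +48   {0,1,2,3}    per-lane block-counter offsets
#   +64   {4,4,4,4}    counter increment for four parallel blocks
#   +80   {1,0,0,0}    counter increment for a single block
#   +96   {4,0,0,0}    scalar counter advance when leaving the 4x path
#   +112  {0,-1,-1,-1} mask keeping the nonce, clearing the counter lane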
&data_byte(0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd);
&data_byte(0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe);
&data_word(0x61707865,0x3320646e,0x79622d32,0x6b206574);
&data_word(0,1,2,3);
&data_word(4,4,4,4);
&data_word(1,0,0,0);
&data_word(4,0,0,0);
&data_word(0,-1,-1,-1);
&align (64);
}
&asciz ("ChaCha20 for x86, CRYPTOGAMS by <appro\@openssl.org>");

if ($ymm) {
my ($xa,$xa_,$xb,$xb_,$xc,$xc_,$xd,$xd_)=map("xmm$_",(0..7));
my ($out,$inp,$len)=("edi","esi","ecx");

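# The XOP path is structurally the same as the SSSE3 path above; the
# difference is that vprotd (XOP's native dword rotate) replaces the
# shuffle/shift-and-or rotate sequences, and three-operand AVX forms remove
# most register-to-register moves.
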
sub QUARTERROUND_XOP {
my ($ai,$bi,$ci,$di,$i)=@_;
my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di)); # next
my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di)); # previous

#   a   b   c   d
#
#   0   4   8  12 < even round
#   1   5   9  13
#   2   6  10  14
#   3   7  11  15
#   0   5  10  15 < odd round
#   1   6  11  12
#   2   7   8  13
#   3   4   9  14

if ($i==0) {
my $j=4;
($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp));
} elsif ($i==3) {
my $j=0;
($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn));
} elsif ($i==4) {
my $j=4;
($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp));
} elsif ($i==7) {
my $j=0;
($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn));
}

#&vpaddd ($xa,$xa,$xb); # see elsewhere
#&vpxor ($xd,$xd,$xa); # see elsewhere
&vmovdqa (&QWP(16*$cp-128,"ebx"),$xc_) if ($ai>0 && $ai<3);
&vprotd ($xd,$xd,16);
&vmovdqa (&QWP(16*$bp-128,"ebx"),$xb_) if ($i!=0);
&vpaddd ($xc,$xc,$xd);
&vmovdqa ($xc_,&QWP(16*$cn-128,"ebx")) if ($ai>0 && $ai<3);
&vpxor ($xb,$i!=0?$xb:$xb_,$xc);
&vmovdqa ($xa_,&QWP(16*$an-128,"ebx"));
&vprotd ($xb,$xb,12);
&vmovdqa ($xb_,&QWP(16*$bn-128,"ebx")) if ($i<7);
&vpaddd ($xa,$xa,$xb);
&vmovdqa ($xd_,&QWP(16*$dn-128,"ebx")) if ($di!=$dn);
&vpxor ($xd,$xd,$xa);
&vpaddd ($xa_,$xa_,$xb_) if ($i<7); # elsewhere
&vprotd ($xd,$xd,8);
&vmovdqa (&QWP(16*$ai-128,"ebx"),$xa);
&vpaddd ($xc,$xc,$xd);
&vmovdqa (&QWP(16*$di-128,"ebx"),$xd) if ($di!=$dn);
&vpxor ($xb,$xb,$xc);
&vpxor ($xd_,$di==$dn?$xd:$xd_,$xa_) if ($i<7); # elsewhere
&vprotd ($xb,$xb,7);

($xa,$xa_)=($xa_,$xa);
($xb,$xb_)=($xb_,$xb);
($xc,$xc_)=($xc_,$xc);
($xd,$xd_)=($xd_,$xd);
}

&function_begin("ChaCha20_xop");
&set_label("xop_shortcut");
&mov ($out,&wparam(0));
&mov ($inp,&wparam(1));
&mov ($len,&wparam(2));
&mov ("edx",&wparam(3)); # key
&mov ("ebx",&wparam(4)); # counter and nonce
&vzeroupper ();

&mov ("ebp","esp");
&stack_push (131);
&and ("esp",-64);
&mov (&DWP(512,"esp"),"ebp");

&lea ("eax",&DWP(&label("ssse3_data")."-".
&label("pic_point"),"eax"));
&vmovdqu ("xmm3",&QWP(0,"ebx")); # counter and nonce

&cmp ($len,64*4);
&jb (&label("1x"));

&mov (&DWP(512+4,"esp"),"edx"); # offload pointers
&mov (&DWP(512+8,"esp"),"ebx");
&sub ($len,64*4); # bias len
&lea ("ebp",&DWP(256+128,"esp")); # size optimization

&vmovdqu ("xmm7",&QWP(0,"edx")); # key
&vpshufd ("xmm0","xmm3",0x00);
&vpshufd ("xmm1","xmm3",0x55);
&vpshufd ("xmm2","xmm3",0xaa);
&vpshufd ("xmm3","xmm3",0xff);
&vpaddd ("xmm0","xmm0",&QWP(16*3,"eax")); # fix counters
&vpshufd ("xmm4","xmm7",0x00);
&vpshufd ("xmm5","xmm7",0x55);
&vpsubd ("xmm0","xmm0",&QWP(16*4,"eax"));
&vpshufd ("xmm6","xmm7",0xaa);
&vpshufd ("xmm7","xmm7",0xff);
&vmovdqa (&QWP(16*12-128,"ebp"),"xmm0");
&vmovdqa (&QWP(16*13-128,"ebp"),"xmm1");
&vmovdqa (&QWP(16*14-128,"ebp"),"xmm2");
&vmovdqa (&QWP(16*15-128,"ebp"),"xmm3");
&vmovdqu ("xmm3",&QWP(16,"edx")); # key
&vmovdqa (&QWP(16*4-128,"ebp"),"xmm4");
&vmovdqa (&QWP(16*5-128,"ebp"),"xmm5");
&vmovdqa (&QWP(16*6-128,"ebp"),"xmm6");
&vmovdqa (&QWP(16*7-128,"ebp"),"xmm7");
&vmovdqa ("xmm7",&QWP(16*2,"eax")); # sigma
&lea ("ebx",&DWP(128,"esp")); # size optimization

&vpshufd ("xmm0","xmm3",0x00);
&vpshufd ("xmm1","xmm3",0x55);
&vpshufd ("xmm2","xmm3",0xaa);
&vpshufd ("xmm3","xmm3",0xff);
&vpshufd ("xmm4","xmm7",0x00);
&vpshufd ("xmm5","xmm7",0x55);
&vpshufd ("xmm6","xmm7",0xaa);
&vpshufd ("xmm7","xmm7",0xff);
&vmovdqa (&QWP(16*8-128,"ebp"),"xmm0");
&vmovdqa (&QWP(16*9-128,"ebp"),"xmm1");
&vmovdqa (&QWP(16*10-128,"ebp"),"xmm2");
&vmovdqa (&QWP(16*11-128,"ebp"),"xmm3");
&vmovdqa (&QWP(16*0-128,"ebp"),"xmm4");
&vmovdqa (&QWP(16*1-128,"ebp"),"xmm5");
&vmovdqa (&QWP(16*2-128,"ebp"),"xmm6");
&vmovdqa (&QWP(16*3-128,"ebp"),"xmm7");

&lea ($inp,&DWP(128,$inp)); # size optimization
&lea ($out,&DWP(128,$out)); # size optimization
&jmp (&label("outer_loop"));

&set_label("outer_loop",32);
#&vmovdqa ("xmm0",&QWP(16*0-128,"ebp")); # copy key material
&vmovdqa ("xmm1",&QWP(16*1-128,"ebp"));
&vmovdqa ("xmm2",&QWP(16*2-128,"ebp"));
&vmovdqa ("xmm3",&QWP(16*3-128,"ebp"));
#&vmovdqa ("xmm4",&QWP(16*4-128,"ebp"));
&vmovdqa ("xmm5",&QWP(16*5-128,"ebp"));
&vmovdqa ("xmm6",&QWP(16*6-128,"ebp"));
&vmovdqa ("xmm7",&QWP(16*7-128,"ebp"));
#&vmovdqa (&QWP(16*0-128,"ebx"),"xmm0");
&vmovdqa (&QWP(16*1-128,"ebx"),"xmm1");
&vmovdqa (&QWP(16*2-128,"ebx"),"xmm2");
&vmovdqa (&QWP(16*3-128,"ebx"),"xmm3");
#&vmovdqa (&QWP(16*4-128,"ebx"),"xmm4");
&vmovdqa (&QWP(16*5-128,"ebx"),"xmm5");
&vmovdqa (&QWP(16*6-128,"ebx"),"xmm6");
&vmovdqa (&QWP(16*7-128,"ebx"),"xmm7");
#&vmovdqa ("xmm0",&QWP(16*8-128,"ebp"));
#&vmovdqa ("xmm1",&QWP(16*9-128,"ebp"));
&vmovdqa ("xmm2",&QWP(16*10-128,"ebp"));
&vmovdqa ("xmm3",&QWP(16*11-128,"ebp"));
&vmovdqa ("xmm4",&QWP(16*12-128,"ebp"));
&vmovdqa ("xmm5",&QWP(16*13-128,"ebp"));
&vmovdqa ("xmm6",&QWP(16*14-128,"ebp"));
&vmovdqa ("xmm7",&QWP(16*15-128,"ebp"));
&vpaddd ("xmm4","xmm4",&QWP(16*4,"eax")); # counter value
#&vmovdqa (&QWP(16*8-128,"ebx"),"xmm0");
#&vmovdqa (&QWP(16*9-128,"ebx"),"xmm1");
&vmovdqa (&QWP(16*10-128,"ebx"),"xmm2");
&vmovdqa (&QWP(16*11-128,"ebx"),"xmm3");
&vmovdqa (&QWP(16*12-128,"ebx"),"xmm4");
&vmovdqa (&QWP(16*13-128,"ebx"),"xmm5");
&vmovdqa (&QWP(16*14-128,"ebx"),"xmm6");
&vmovdqa (&QWP(16*15-128,"ebx"),"xmm7");
&vmovdqa (&QWP(16*12-128,"ebp"),"xmm4"); # save counter value

&vmovdqa ($xa, &QWP(16*0-128,"ebp"));
&vmovdqa ($xd, "xmm4");
&vmovdqa ($xb_,&QWP(16*4-128,"ebp"));
&vmovdqa ($xc, &QWP(16*8-128,"ebp"));
&vmovdqa ($xc_,&QWP(16*9-128,"ebp"));

&mov ("edx",10); # loop counter
&nop ();

&set_label("loop",32);
&vpaddd ($xa,$xa,$xb_); # elsewhere
&vpxor ($xd,$xd,$xa); # elsewhere
&QUARTERROUND_XOP(0, 4, 8, 12, 0);
&QUARTERROUND_XOP(1, 5, 9, 13, 1);
&QUARTERROUND_XOP(2, 6,10, 14, 2);
&QUARTERROUND_XOP(3, 7,11, 15, 3);
&QUARTERROUND_XOP(0, 5,10, 15, 4);
&QUARTERROUND_XOP(1, 6,11, 12, 5);
&QUARTERROUND_XOP(2, 7, 8, 13, 6);
&QUARTERROUND_XOP(3, 4, 9, 14, 7);
&dec ("edx");
&jnz (&label("loop"));

&vmovdqa (&QWP(16*4-128,"ebx"),$xb_);
&vmovdqa (&QWP(16*8-128,"ebx"),$xc);
&vmovdqa (&QWP(16*9-128,"ebx"),$xc_);
&vmovdqa (&QWP(16*12-128,"ebx"),$xd);
&vmovdqa (&QWP(16*14-128,"ebx"),$xd_);

my ($xa0,$xa1,$xa2,$xa3,$xt0,$xt1,$xt2,$xt3)=map("xmm$_",(0..7));

#&vmovdqa ($xa0,&QWP(16*0-128,"ebx")); # it's there
&vmovdqa ($xa1,&QWP(16*1-128,"ebx"));
&vmovdqa ($xa2,&QWP(16*2-128,"ebx"));
&vmovdqa ($xa3,&QWP(16*3-128,"ebx"));

for($i=0;$i<256;$i+=64) {
&vpaddd ($xa0,$xa0,&QWP($i+16*0-128,"ebp")); # accumulate key material
&vpaddd ($xa1,$xa1,&QWP($i+16*1-128,"ebp"));
&vpaddd ($xa2,$xa2,&QWP($i+16*2-128,"ebp"));
&vpaddd ($xa3,$xa3,&QWP($i+16*3-128,"ebp"));

&vpunpckldq ($xt2,$xa0,$xa1); # "de-interlace" data
&vpunpckldq ($xt3,$xa2,$xa3);
&vpunpckhdq ($xa0,$xa0,$xa1);
&vpunpckhdq ($xa2,$xa2,$xa3);
&vpunpcklqdq ($xa1,$xt2,$xt3); # "a0"
&vpunpckhqdq ($xt2,$xt2,$xt3); # "a1"
&vpunpcklqdq ($xt3,$xa0,$xa2); # "a2"
&vpunpckhqdq ($xa3,$xa0,$xa2); # "a3"

&vpxor ($xt0,$xa1,&QWP(64*0-128,$inp));
&vpxor ($xt1,$xt2,&QWP(64*1-128,$inp));
&vpxor ($xt2,$xt3,&QWP(64*2-128,$inp));
&vpxor ($xt3,$xa3,&QWP(64*3-128,$inp));
&lea ($inp,&QWP($i<192?16:(64*4-16*3),$inp));
&vmovdqa ($xa0,&QWP($i+16*4-128,"ebx")) if ($i<192);
&vmovdqa ($xa1,&QWP($i+16*5-128,"ebx")) if ($i<192);
&vmovdqa ($xa2,&QWP($i+16*6-128,"ebx")) if ($i<192);
&vmovdqa ($xa3,&QWP($i+16*7-128,"ebx")) if ($i<192);
&vmovdqu (&QWP(64*0-128,$out),$xt0); # store output
&vmovdqu (&QWP(64*1-128,$out),$xt1);
&vmovdqu (&QWP(64*2-128,$out),$xt2);
&vmovdqu (&QWP(64*3-128,$out),$xt3);
&lea ($out,&QWP($i<192?16:(64*4-16*3),$out));
}
&sub ($len,64*4);
&jnc (&label("outer_loop"));

&add ($len,64*4);
&jz (&label("done"));

&mov ("ebx",&DWP(512+8,"esp")); # restore pointers
&lea ($inp,&DWP(-128,$inp));
&mov ("edx",&DWP(512+4,"esp"));
&lea ($out,&DWP(-128,$out));

&vmovd ("xmm2",&DWP(16*12-128,"ebp")); # counter value
&vmovdqu ("xmm3",&QWP(0,"ebx"));
&vpaddd ("xmm2","xmm2",&QWP(16*6,"eax"));# +four
&vpand ("xmm3","xmm3",&QWP(16*7,"eax"));
&vpor ("xmm3","xmm3","xmm2"); # counter value
{
my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("xmm$_",(0..7));

sub XOPROUND {
&vpaddd ($a,$a,$b);
&vpxor ($d,$d,$a);
&vprotd ($d,$d,16);

&vpaddd ($c,$c,$d);
&vpxor ($b,$b,$c);
&vprotd ($b,$b,12);

&vpaddd ($a,$a,$b);
&vpxor ($d,$d,$a);
&vprotd ($d,$d,8);

&vpaddd ($c,$c,$d);
&vpxor ($b,$b,$c);
&vprotd ($b,$b,7);
}

&set_label("1x");
&vmovdqa ($a,&QWP(16*2,"eax")); # sigma
&vmovdqu ($b,&QWP(0,"edx"));
&vmovdqu ($c,&QWP(16,"edx"));
#&vmovdqu ($d,&QWP(0,"ebx")); # already loaded
&vmovdqa ($rot16,&QWP(0,"eax"));
&vmovdqa ($rot24,&QWP(16,"eax"));
&mov (&DWP(16*3,"esp"),"ebp");

&vmovdqa (&QWP(16*0,"esp"),$a);
&vmovdqa (&QWP(16*1,"esp"),$b);
&vmovdqa (&QWP(16*2,"esp"),$c);
&vmovdqa (&QWP(16*3,"esp"),$d);
&mov ("edx",10);
&jmp (&label("loop1x"));

&set_label("outer1x",16);
&vmovdqa ($d,&QWP(16*5,"eax")); # one
&vmovdqa ($a,&QWP(16*0,"esp"));
&vmovdqa ($b,&QWP(16*1,"esp"));
&vmovdqa ($c,&QWP(16*2,"esp"));
&vpaddd ($d,$d,&QWP(16*3,"esp"));
&mov ("edx",10);
&vmovdqa (&QWP(16*3,"esp"),$d);
&jmp (&label("loop1x"));

&set_label("loop1x",16);
&XOPROUND();
&vpshufd ($c,$c,0b01001110);
&vpshufd ($b,$b,0b00111001);
&vpshufd ($d,$d,0b10010011);

&XOPROUND();
&vpshufd ($c,$c,0b01001110);
&vpshufd ($b,$b,0b10010011);
&vpshufd ($d,$d,0b00111001);

&dec ("edx");
&jnz (&label("loop1x"));

&vpaddd ($a,$a,&QWP(16*0,"esp"));
&vpaddd ($b,$b,&QWP(16*1,"esp"));
&vpaddd ($c,$c,&QWP(16*2,"esp"));
&vpaddd ($d,$d,&QWP(16*3,"esp"));

&cmp ($len,64);
&jb (&label("tail"));

&vpxor ($a,$a,&QWP(16*0,$inp)); # xor with input
&vpxor ($b,$b,&QWP(16*1,$inp));
&vpxor ($c,$c,&QWP(16*2,$inp));
&vpxor ($d,$d,&QWP(16*3,$inp));
&lea ($inp,&DWP(16*4,$inp)); # inp+=64

&vmovdqu (&QWP(16*0,$out),$a); # write output
&vmovdqu (&QWP(16*1,$out),$b);
&vmovdqu (&QWP(16*2,$out),$c);
&vmovdqu (&QWP(16*3,$out),$d);
&lea ($out,&DWP(16*4,$out)); # out+=64

&sub ($len,64);
&jnz (&label("outer1x"));

&jmp (&label("done"));

&set_label("tail");
&vmovdqa (&QWP(16*0,"esp"),$a);
&vmovdqa (&QWP(16*1,"esp"),$b);
&vmovdqa (&QWP(16*2,"esp"),$c);
&vmovdqa (&QWP(16*3,"esp"),$d);

&xor ("eax","eax");
&xor ("edx","edx");
&xor ("ebp","ebp");

&set_label("tail_loop");
&movb ("al",&BP(0,"esp","ebp"));
&movb ("dl",&BP(0,$inp,"ebp"));
&lea ("ebp",&DWP(1,"ebp"));
&xor ("al","dl");
&movb (&BP(-1,$out,"ebp"),"al");
&dec ($len);
&jnz (&label("tail_loop"));
}
&set_label("done");
&vzeroupper ();
&mov ("esp",&DWP(512,"esp"));
&function_end("ChaCha20_xop");
}

&asm_finish();

close STDOUT;