#! /usr/bin/env perl
# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# January 2015
#
# ChaCha20 for x86.
#
# Performance in cycles per byte out of large buffer.
#
#                 1xIALU/gcc    4xSSSE3
# Pentium         17.5/+80%
# PIII            14.2/+60%
# P4              18.6/+84%
# Core2           9.56/+89%     4.83
# Westmere        9.50/+45%     3.35
# Sandy Bridge    10.5/+47%     3.20
# Haswell         8.15/+50%     2.83
# Skylake         7.53/+22%     2.75
# Silvermont      17.4/+36%     8.35
# Goldmont        13.4/+40%     4.36
# Sledgehammer    10.2/+54%
# Bulldozer       13.4/+50%     4.38(*)
#
# (*) Bulldozer actually executes 4xXOP code path that delivers 3.55;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

$output=pop;
open STDOUT,">$output";

&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386");

$xmm=$ymm=0;
for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); }

$ymm=1 if ($xmm &&
	`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
	($gasver=$1)>=2.19);	# first version supporting AVX

$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32n" &&
	`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
	$1>=2.03);		# first version supporting AVX

$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32" &&
	`ml 2>&1` =~ /Version ([0-9]+)\./ &&
	$1>=10);		# first version supporting AVX

$ymm=1 if ($xmm && !$ymm &&
	`$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([0-9]+\.[0-9]+)/ &&
	$2>=3.0);		# first version supporting AVX

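# Typical invocation (illustrative only; the actual perlasm flavour and
# flags are chosen by the OpenSSL build system): judging from the argument
# handling above, the flavour comes first and the output file name last,
# with any -D flags in between, e.g.
#
#	perl chacha-x86.pl elf -DOPENSSL_IA32_SSE2 chacha-x86.S
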
68$a="eax";
69($b,$b_)=("ebx","ebp");
70($c,$c_)=("ecx","esi");
71($d,$d_)=("edx","edi");
72
73sub QUARTERROUND {
74my ($ai,$bi,$ci,$di,$i)=@_;
75my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di)); # next
76my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di)); # previous
77
78 # a b c d
79 #
80 # 0 4 8 12 < even round
81 # 1 5 9 13
82 # 2 6 10 14
83 # 3 7 11 15
84 # 0 5 10 15 < odd round
85 # 1 6 11 12
86 # 2 7 8 13
87 # 3 4 9 14
88
89 if ($i==0) {
90 my $j=4;
91 ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp));
92 } elsif ($i==3) {
93 my $j=0;
94 ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn));
95 } elsif ($i==4) {
96 my $j=4;
97 ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp));
98 } elsif ($i==7) {
99 my $j=0;
100 ($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn));
101 }
102
103 #&add ($a,$b); # see elsewhere
104 &xor ($d,$a);
105 &mov (&DWP(4*$cp,"esp"),$c_) if ($ai>0 && $ai<3);
106 &rol ($d,16);
107 &mov (&DWP(4*$bp,"esp"),$b_) if ($i!=0);
108 &add ($c,$d);
109 &mov ($c_,&DWP(4*$cn,"esp")) if ($ai>0 && $ai<3);
110 &xor ($b,$c);
111 &mov ($d_,&DWP(4*$dn,"esp")) if ($di!=$dn);
112 &rol ($b,12);
113 &mov ($b_,&DWP(4*$bn,"esp")) if ($i<7);
114 &mov ($b_,&DWP(128,"esp")) if ($i==7); # loop counter
115 &add ($a,$b);
116 &xor ($d,$a);
117 &mov (&DWP(4*$ai,"esp"),$a);
118 &rol ($d,8);
119 &mov ($a,&DWP(4*$an,"esp"));
120 &add ($c,$d);
121 &mov (&DWP(4*$di,"esp"),$d) if ($di!=$dn);
122 &mov ($d_,$d) if ($di==$dn);
123 &xor ($b,$c);
124 &add ($a,$b_) if ($i<7); # elsewhere
125 &rol ($b,7);
126
127 ($b,$b_)=($b_,$b);
128 ($c,$c_)=($c_,$c);
129 ($d,$d_)=($d_,$d);
130}
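
# For reference, the operation the interleaved scheduling above implements
# is the standard ChaCha20 quarter-round; a minimal Perl sketch (names here
# are illustrative and unused elsewhere in this file) would be:
#
#	sub QR_REF {
#	my ($a,$b,$c,$d)=@_;
#	    $a=($a+$b)&0xffffffff; $d^=$a; $d=(($d<<16)|($d>>16))&0xffffffff;
#	    $c=($c+$d)&0xffffffff; $b^=$c; $b=(($b<<12)|($b>>20))&0xffffffff;
#	    $a=($a+$b)&0xffffffff; $d^=$a; $d=(($d<< 8)|($d>>24))&0xffffffff;
#	    $c=($c+$d)&0xffffffff; $b^=$c; $b=(($b<< 7)|($b>>25))&0xffffffff;
#	    return ($a,$b,$c,$d);
#	}
#
# The generator emits the same adds, xors and rotates, but spreads loads and
# stores of the in-memory state words between them to hide latency.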

&static_label("ssse3_shortcut");
&static_label("xop_shortcut");
&static_label("ssse3_data");
&static_label("pic_point");

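# The entry points below share one calling convention; based on the wparam()
# usage (out, inp, len, key, counter) they correspond to a C prototype along
# the lines of the following (exact types are an assumption):
#
#	void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp,
#			    size_t len, const unsigned int key[8],
#			    const unsigned int counter[4]);
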
&function_begin("ChaCha20_ctr32");
	&xor ("eax","eax");
	&cmp ("eax",&wparam(2));		# len==0?
	&je (&label("no_data"));
if ($xmm) {
	&call (&label("pic_point"));
&set_label("pic_point");
	&blindpop("eax");
	&picmeup("ebp","OPENSSL_ia32cap_P","eax",&label("pic_point"));
	&test (&DWP(0,"ebp"),1<<24);		# test FXSR bit
	&jz (&label("x86"));
	&test (&DWP(4,"ebp"),1<<9);		# test SSSE3 bit
	&jz (&label("x86"));
	&jmp (&label("ssse3_shortcut"));
&set_label("x86");
}
	&mov ("esi",&wparam(3));		# key
	&mov ("edi",&wparam(4));		# counter and nonce

	&stack_push(33);

	&mov ("eax",&DWP(4*0,"esi"));		# copy key
	&mov ("ebx",&DWP(4*1,"esi"));
	&mov ("ecx",&DWP(4*2,"esi"));
	&mov ("edx",&DWP(4*3,"esi"));
	&mov (&DWP(64+4*4,"esp"),"eax");
	&mov (&DWP(64+4*5,"esp"),"ebx");
	&mov (&DWP(64+4*6,"esp"),"ecx");
	&mov (&DWP(64+4*7,"esp"),"edx");
	&mov ("eax",&DWP(4*4,"esi"));
	&mov ("ebx",&DWP(4*5,"esi"));
	&mov ("ecx",&DWP(4*6,"esi"));
	&mov ("edx",&DWP(4*7,"esi"));
	&mov (&DWP(64+4*8,"esp"),"eax");
	&mov (&DWP(64+4*9,"esp"),"ebx");
	&mov (&DWP(64+4*10,"esp"),"ecx");
	&mov (&DWP(64+4*11,"esp"),"edx");
	&mov ("eax",&DWP(4*0,"edi"));		# copy counter and nonce
	&mov ("ebx",&DWP(4*1,"edi"));
	&mov ("ecx",&DWP(4*2,"edi"));
	&mov ("edx",&DWP(4*3,"edi"));
	&sub ("eax",1);
	&mov (&DWP(64+4*12,"esp"),"eax");
	&mov (&DWP(64+4*13,"esp"),"ebx");
	&mov (&DWP(64+4*14,"esp"),"ecx");
	&mov (&DWP(64+4*15,"esp"),"edx");
	&jmp (&label("entry"));

&set_label("outer_loop",16);
	&mov (&wparam(1),$b);			# save input
	&mov (&wparam(0),$a);			# save output
	&mov (&wparam(2),$c);			# save len
&set_label("entry");
	&mov ($a,0x61707865);
	&mov (&DWP(4*1,"esp"),0x3320646e);
	&mov (&DWP(4*2,"esp"),0x79622d32);
	&mov (&DWP(4*3,"esp"),0x6b206574);

	&mov ($b, &DWP(64+4*5,"esp"));		# copy key material
	&mov ($b_,&DWP(64+4*6,"esp"));
	&mov ($c, &DWP(64+4*10,"esp"));
	&mov ($c_,&DWP(64+4*11,"esp"));
	&mov ($d, &DWP(64+4*13,"esp"));
	&mov ($d_,&DWP(64+4*14,"esp"));
	&mov (&DWP(4*5,"esp"),$b);
	&mov (&DWP(4*6,"esp"),$b_);
	&mov (&DWP(4*10,"esp"),$c);
	&mov (&DWP(4*11,"esp"),$c_);
	&mov (&DWP(4*13,"esp"),$d);
	&mov (&DWP(4*14,"esp"),$d_);

	&mov ($b, &DWP(64+4*7,"esp"));
	&mov ($d_,&DWP(64+4*15,"esp"));
	&mov ($d, &DWP(64+4*12,"esp"));
	&mov ($b_,&DWP(64+4*4,"esp"));
	&mov ($c, &DWP(64+4*8,"esp"));
	&mov ($c_,&DWP(64+4*9,"esp"));
	&add ($d,1);				# counter value
	&mov (&DWP(4*7,"esp"),$b);
	&mov (&DWP(4*15,"esp"),$d_);
	&mov (&DWP(64+4*12,"esp"),$d);		# save counter value

	&mov ($b,10);				# loop counter
	&jmp (&label("loop"));

&set_label("loop",16);
	&add ($a,$b_);				# elsewhere
	&mov (&DWP(128,"esp"),$b);		# save loop counter
	&mov ($b,$b_);
	&QUARTERROUND(0, 4, 8, 12, 0);
	&QUARTERROUND(1, 5, 9, 13, 1);
	&QUARTERROUND(2, 6,10, 14, 2);
	&QUARTERROUND(3, 7,11, 15, 3);
	&QUARTERROUND(0, 5,10, 15, 4);
	&QUARTERROUND(1, 6,11, 12, 5);
	&QUARTERROUND(2, 7, 8, 13, 6);
	&QUARTERROUND(3, 4, 9, 14, 7);
	&dec ($b);
	&jnz (&label("loop"));

	&mov ($b,&wparam(2));			# load len

	&add ($a,0x61707865);			# accumulate key material
	&add ($b_,&DWP(64+4*4,"esp"));
	&add ($c, &DWP(64+4*8,"esp"));
	&add ($c_,&DWP(64+4*9,"esp"));

	&cmp ($b,64);
	&jb (&label("tail"));

	&mov ($b,&wparam(1));			# load input pointer
	&add ($d, &DWP(64+4*12,"esp"));
	&add ($d_,&DWP(64+4*14,"esp"));

	&xor ($a, &DWP(4*0,$b));		# xor with input
	&xor ($b_,&DWP(4*4,$b));
	&mov (&DWP(4*0,"esp"),$a);
	&mov ($a,&wparam(0));			# load output pointer
	&xor ($c, &DWP(4*8,$b));
	&xor ($c_,&DWP(4*9,$b));
	&xor ($d, &DWP(4*12,$b));
	&xor ($d_,&DWP(4*14,$b));
	&mov (&DWP(4*4,$a),$b_);		# write output
	&mov (&DWP(4*8,$a),$c);
	&mov (&DWP(4*9,$a),$c_);
	&mov (&DWP(4*12,$a),$d);
	&mov (&DWP(4*14,$a),$d_);

	&mov ($b_,&DWP(4*1,"esp"));
	&mov ($c, &DWP(4*2,"esp"));
	&mov ($c_,&DWP(4*3,"esp"));
	&mov ($d, &DWP(4*5,"esp"));
	&mov ($d_,&DWP(4*6,"esp"));
	&add ($b_,0x3320646e);			# accumulate key material
	&add ($c, 0x79622d32);
	&add ($c_,0x6b206574);
	&add ($d, &DWP(64+4*5,"esp"));
	&add ($d_,&DWP(64+4*6,"esp"));
	&xor ($b_,&DWP(4*1,$b));
	&xor ($c, &DWP(4*2,$b));
	&xor ($c_,&DWP(4*3,$b));
	&xor ($d, &DWP(4*5,$b));
	&xor ($d_,&DWP(4*6,$b));
	&mov (&DWP(4*1,$a),$b_);
	&mov (&DWP(4*2,$a),$c);
	&mov (&DWP(4*3,$a),$c_);
	&mov (&DWP(4*5,$a),$d);
	&mov (&DWP(4*6,$a),$d_);

	&mov ($b_,&DWP(4*7,"esp"));
	&mov ($c, &DWP(4*10,"esp"));
	&mov ($c_,&DWP(4*11,"esp"));
	&mov ($d, &DWP(4*13,"esp"));
	&mov ($d_,&DWP(4*15,"esp"));
	&add ($b_,&DWP(64+4*7,"esp"));
	&add ($c, &DWP(64+4*10,"esp"));
	&add ($c_,&DWP(64+4*11,"esp"));
	&add ($d, &DWP(64+4*13,"esp"));
	&add ($d_,&DWP(64+4*15,"esp"));
	&xor ($b_,&DWP(4*7,$b));
	&xor ($c, &DWP(4*10,$b));
	&xor ($c_,&DWP(4*11,$b));
	&xor ($d, &DWP(4*13,$b));
	&xor ($d_,&DWP(4*15,$b));
	&lea ($b,&DWP(4*16,$b));
	&mov (&DWP(4*7,$a),$b_);
	&mov ($b_,&DWP(4*0,"esp"));
	&mov (&DWP(4*10,$a),$c);
	&mov ($c,&wparam(2));			# len
	&mov (&DWP(4*11,$a),$c_);
	&mov (&DWP(4*13,$a),$d);
	&mov (&DWP(4*15,$a),$d_);
	&mov (&DWP(4*0,$a),$b_);
	&lea ($a,&DWP(4*16,$a));
	&sub ($c,64);
	&jnz (&label("outer_loop"));

	&jmp (&label("done"));

&set_label("tail");
	&add ($d, &DWP(64+4*12,"esp"));
	&add ($d_,&DWP(64+4*14,"esp"));
	&mov (&DWP(4*0,"esp"),$a);
	&mov (&DWP(4*4,"esp"),$b_);
	&mov (&DWP(4*8,"esp"),$c);
	&mov (&DWP(4*9,"esp"),$c_);
	&mov (&DWP(4*12,"esp"),$d);
	&mov (&DWP(4*14,"esp"),$d_);

	&mov ($b_,&DWP(4*1,"esp"));
	&mov ($c, &DWP(4*2,"esp"));
	&mov ($c_,&DWP(4*3,"esp"));
	&mov ($d, &DWP(4*5,"esp"));
	&mov ($d_,&DWP(4*6,"esp"));
	&add ($b_,0x3320646e);			# accumulate key material
	&add ($c, 0x79622d32);
	&add ($c_,0x6b206574);
	&add ($d, &DWP(64+4*5,"esp"));
	&add ($d_,&DWP(64+4*6,"esp"));
	&mov (&DWP(4*1,"esp"),$b_);
	&mov (&DWP(4*2,"esp"),$c);
	&mov (&DWP(4*3,"esp"),$c_);
	&mov (&DWP(4*5,"esp"),$d);
	&mov (&DWP(4*6,"esp"),$d_);

	&mov ($b_,&DWP(4*7,"esp"));
	&mov ($c, &DWP(4*10,"esp"));
	&mov ($c_,&DWP(4*11,"esp"));
	&mov ($d, &DWP(4*13,"esp"));
	&mov ($d_,&DWP(4*15,"esp"));
	&add ($b_,&DWP(64+4*7,"esp"));
	&add ($c, &DWP(64+4*10,"esp"));
	&add ($c_,&DWP(64+4*11,"esp"));
	&add ($d, &DWP(64+4*13,"esp"));
	&add ($d_,&DWP(64+4*15,"esp"));
	&mov (&DWP(4*7,"esp"),$b_);
	&mov ($b_,&wparam(1));			# load input
	&mov (&DWP(4*10,"esp"),$c);
	&mov ($c,&wparam(0));			# load output
	&mov (&DWP(4*11,"esp"),$c_);
	&xor ($c_,$c_);
	&mov (&DWP(4*13,"esp"),$d);
	&mov (&DWP(4*15,"esp"),$d_);

	&xor ("eax","eax");
	&xor ("edx","edx");
&set_label("tail_loop");
	&movb ("al",&BP(0,$c_,$b_));
	&movb ("dl",&BP(0,"esp",$c_));
	&lea ($c_,&DWP(1,$c_));
	&xor ("al","dl");
	&mov (&BP(-1,$c,$c_),"al");
	&dec ($b);
	&jnz (&label("tail_loop"));

&set_label("done");
	&stack_pop(33);
&set_label("no_data");
&function_end("ChaCha20_ctr32");

if ($xmm) {
my ($xa,$xa_,$xb,$xb_,$xc,$xc_,$xd,$xd_)=map("xmm$_",(0..7));
my ($out,$inp,$len)=("edi","esi","ecx");

sub QUARTERROUND_SSSE3 {
my ($ai,$bi,$ci,$di,$i)=@_;
my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di));	# next
my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di));	# previous

	# a   b   c   d
	#
	# 0   4   8  12 < even round
	# 1   5   9  13
	# 2   6  10  14
	# 3   7  11  15
	# 0   5  10  15 < odd round
	# 1   6  11  12
	# 2   7   8  13
	# 3   4   9  14

	if ($i==0) {
	my $j=4;
	($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp));
	} elsif ($i==3) {
	my $j=0;
	($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn));
	} elsif ($i==4) {
	my $j=4;
	($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp));
	} elsif ($i==7) {
	my $j=0;
	($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn));
	}

	#&paddd ($xa,$xb);			# see elsewhere
	#&pxor ($xd,$xa);			# see elsewhere
	&movdqa (&QWP(16*$cp-128,"ebx"),$xc_)	if ($ai>0 && $ai<3);
	&pshufb ($xd,&QWP(0,"eax"));		# rot16
	&movdqa (&QWP(16*$bp-128,"ebx"),$xb_)	if ($i!=0);
	&paddd ($xc,$xd);
	&movdqa ($xc_,&QWP(16*$cn-128,"ebx"))	if ($ai>0 && $ai<3);
	&pxor ($xb,$xc);
	&movdqa ($xb_,&QWP(16*$bn-128,"ebx"))	if ($i<7);
	&movdqa ($xa_,$xb);			# borrow as temporary
	&pslld ($xb,12);
	&psrld ($xa_,20);
	&por ($xb,$xa_);
	&movdqa ($xa_,&QWP(16*$an-128,"ebx"));
	&paddd ($xa,$xb);
	&movdqa ($xd_,&QWP(16*$dn-128,"ebx"))	if ($di!=$dn);
	&pxor ($xd,$xa);
	&movdqa (&QWP(16*$ai-128,"ebx"),$xa);
	&pshufb ($xd,&QWP(16,"eax"));		# rot8
	&paddd ($xc,$xd);
	&movdqa (&QWP(16*$di-128,"ebx"),$xd)	if ($di!=$dn);
	&movdqa ($xd_,$xd)			if ($di==$dn);
	&pxor ($xb,$xc);
	&paddd ($xa_,$xb_)			if ($i<7);	# elsewhere
	&movdqa ($xa,$xb);			# borrow as temporary
	&pslld ($xb,7);
	&psrld ($xa,25);
	&pxor ($xd_,$xa_)			if ($i<7);	# elsewhere
	&por ($xb,$xa);

	($xa,$xa_)=($xa_,$xa);
	($xb,$xb_)=($xb_,$xb);
	($xc,$xc_)=($xc_,$xc);
	($xd,$xd_)=($xd_,$xd);
}
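
# A note on the 4x layout used below, as suggested by the addressing above:
# each of the 16 ChaCha state words occupies one 16-byte slot with four
# lanes, so four independent 64-byte blocks are processed in parallel.
# "ebx" points 128 bytes into the 256-byte working-state area and "ebp"
# 128 bytes into the copied key material, so that every slot is reachable
# with a one-byte displacement (the "size optimization" noted below).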

&function_begin("ChaCha20_ssse3");
&set_label("ssse3_shortcut");
if ($ymm) {
	&test (&DWP(4,"ebp"),1<<11);		# test XOP bit
	&jnz (&label("xop_shortcut"));
}

	&mov ($out,&wparam(0));
	&mov ($inp,&wparam(1));
	&mov ($len,&wparam(2));
	&mov ("edx",&wparam(3));		# key
	&mov ("ebx",&wparam(4));		# counter and nonce

	&mov ("ebp","esp");
	&stack_push (131);
	&and ("esp",-64);
	&mov (&DWP(512,"esp"),"ebp");

	&lea ("eax",&DWP(&label("ssse3_data")."-".
			 &label("pic_point"),"eax"));
	&movdqu ("xmm3",&QWP(0,"ebx"));		# counter and nonce

if (defined($gasver) && $gasver>=2.17) {	# even though we encode
						# pshufb manually, we
						# handle only register
						# operands, while this
						# segment uses memory
						# operand...
	&cmp ($len,64*4);
	&jb (&label("1x"));

	&mov (&DWP(512+4,"esp"),"edx");		# offload pointers
	&mov (&DWP(512+8,"esp"),"ebx");
	&sub ($len,64*4);			# bias len
	&lea ("ebp",&DWP(256+128,"esp"));	# size optimization

	&movdqu ("xmm7",&QWP(0,"edx"));		# key
	&pshufd ("xmm0","xmm3",0x00);
	&pshufd ("xmm1","xmm3",0x55);
	&pshufd ("xmm2","xmm3",0xaa);
	&pshufd ("xmm3","xmm3",0xff);
	&paddd ("xmm0",&QWP(16*3,"eax"));	# fix counters
	&pshufd ("xmm4","xmm7",0x00);
	&pshufd ("xmm5","xmm7",0x55);
	&psubd ("xmm0",&QWP(16*4,"eax"));
	&pshufd ("xmm6","xmm7",0xaa);
	&pshufd ("xmm7","xmm7",0xff);
	&movdqa (&QWP(16*12-128,"ebp"),"xmm0");
	&movdqa (&QWP(16*13-128,"ebp"),"xmm1");
	&movdqa (&QWP(16*14-128,"ebp"),"xmm2");
	&movdqa (&QWP(16*15-128,"ebp"),"xmm3");
	&movdqu ("xmm3",&QWP(16,"edx"));	# key
	&movdqa (&QWP(16*4-128,"ebp"),"xmm4");
	&movdqa (&QWP(16*5-128,"ebp"),"xmm5");
	&movdqa (&QWP(16*6-128,"ebp"),"xmm6");
	&movdqa (&QWP(16*7-128,"ebp"),"xmm7");
	&movdqa ("xmm7",&QWP(16*2,"eax"));	# sigma
	&lea ("ebx",&DWP(128,"esp"));		# size optimization

	&pshufd ("xmm0","xmm3",0x00);
	&pshufd ("xmm1","xmm3",0x55);
	&pshufd ("xmm2","xmm3",0xaa);
	&pshufd ("xmm3","xmm3",0xff);
	&pshufd ("xmm4","xmm7",0x00);
	&pshufd ("xmm5","xmm7",0x55);
	&pshufd ("xmm6","xmm7",0xaa);
	&pshufd ("xmm7","xmm7",0xff);
	&movdqa (&QWP(16*8-128,"ebp"),"xmm0");
	&movdqa (&QWP(16*9-128,"ebp"),"xmm1");
	&movdqa (&QWP(16*10-128,"ebp"),"xmm2");
	&movdqa (&QWP(16*11-128,"ebp"),"xmm3");
	&movdqa (&QWP(16*0-128,"ebp"),"xmm4");
	&movdqa (&QWP(16*1-128,"ebp"),"xmm5");
	&movdqa (&QWP(16*2-128,"ebp"),"xmm6");
	&movdqa (&QWP(16*3-128,"ebp"),"xmm7");

	&lea ($inp,&DWP(128,$inp));		# size optimization
	&lea ($out,&DWP(128,$out));		# size optimization
	&jmp (&label("outer_loop"));

&set_label("outer_loop",16);
	#&movdqa ("xmm0",&QWP(16*0-128,"ebp"));	# copy key material
	&movdqa ("xmm1",&QWP(16*1-128,"ebp"));
	&movdqa ("xmm2",&QWP(16*2-128,"ebp"));
	&movdqa ("xmm3",&QWP(16*3-128,"ebp"));
	#&movdqa ("xmm4",&QWP(16*4-128,"ebp"));
	&movdqa ("xmm5",&QWP(16*5-128,"ebp"));
	&movdqa ("xmm6",&QWP(16*6-128,"ebp"));
	&movdqa ("xmm7",&QWP(16*7-128,"ebp"));
	#&movdqa (&QWP(16*0-128,"ebx"),"xmm0");
	&movdqa (&QWP(16*1-128,"ebx"),"xmm1");
	&movdqa (&QWP(16*2-128,"ebx"),"xmm2");
	&movdqa (&QWP(16*3-128,"ebx"),"xmm3");
	#&movdqa (&QWP(16*4-128,"ebx"),"xmm4");
	&movdqa (&QWP(16*5-128,"ebx"),"xmm5");
	&movdqa (&QWP(16*6-128,"ebx"),"xmm6");
	&movdqa (&QWP(16*7-128,"ebx"),"xmm7");
	#&movdqa ("xmm0",&QWP(16*8-128,"ebp"));
	#&movdqa ("xmm1",&QWP(16*9-128,"ebp"));
	&movdqa ("xmm2",&QWP(16*10-128,"ebp"));
	&movdqa ("xmm3",&QWP(16*11-128,"ebp"));
	&movdqa ("xmm4",&QWP(16*12-128,"ebp"));
	&movdqa ("xmm5",&QWP(16*13-128,"ebp"));
	&movdqa ("xmm6",&QWP(16*14-128,"ebp"));
	&movdqa ("xmm7",&QWP(16*15-128,"ebp"));
	&paddd ("xmm4",&QWP(16*4,"eax"));	# counter value
	#&movdqa (&QWP(16*8-128,"ebx"),"xmm0");
	#&movdqa (&QWP(16*9-128,"ebx"),"xmm1");
	&movdqa (&QWP(16*10-128,"ebx"),"xmm2");
	&movdqa (&QWP(16*11-128,"ebx"),"xmm3");
	&movdqa (&QWP(16*12-128,"ebx"),"xmm4");
	&movdqa (&QWP(16*13-128,"ebx"),"xmm5");
	&movdqa (&QWP(16*14-128,"ebx"),"xmm6");
	&movdqa (&QWP(16*15-128,"ebx"),"xmm7");
	&movdqa (&QWP(16*12-128,"ebp"),"xmm4");	# save counter value

	&movdqa ($xa, &QWP(16*0-128,"ebp"));
	&movdqa ($xd, "xmm4");
	&movdqa ($xb_,&QWP(16*4-128,"ebp"));
	&movdqa ($xc, &QWP(16*8-128,"ebp"));
	&movdqa ($xc_,&QWP(16*9-128,"ebp"));

	&mov ("edx",10);			# loop counter
	&nop ();

&set_label("loop",16);
	&paddd ($xa,$xb_);			# elsewhere
	&movdqa ($xb,$xb_);
	&pxor ($xd,$xa);			# elsewhere
	&QUARTERROUND_SSSE3(0, 4, 8, 12, 0);
	&QUARTERROUND_SSSE3(1, 5, 9, 13, 1);
	&QUARTERROUND_SSSE3(2, 6,10, 14, 2);
	&QUARTERROUND_SSSE3(3, 7,11, 15, 3);
	&QUARTERROUND_SSSE3(0, 5,10, 15, 4);
	&QUARTERROUND_SSSE3(1, 6,11, 12, 5);
	&QUARTERROUND_SSSE3(2, 7, 8, 13, 6);
	&QUARTERROUND_SSSE3(3, 4, 9, 14, 7);
	&dec ("edx");
	&jnz (&label("loop"));

	&movdqa (&QWP(16*4-128,"ebx"),$xb_);
	&movdqa (&QWP(16*8-128,"ebx"),$xc);
	&movdqa (&QWP(16*9-128,"ebx"),$xc_);
	&movdqa (&QWP(16*12-128,"ebx"),$xd);
	&movdqa (&QWP(16*14-128,"ebx"),$xd_);

	my ($xa0,$xa1,$xa2,$xa3,$xt0,$xt1,$xt2,$xt3)=map("xmm$_",(0..7));

	#&movdqa ($xa0,&QWP(16*0-128,"ebx"));	# it's there
	&movdqa ($xa1,&QWP(16*1-128,"ebx"));
	&movdqa ($xa2,&QWP(16*2-128,"ebx"));
	&movdqa ($xa3,&QWP(16*3-128,"ebx"));

	for($i=0;$i<256;$i+=64) {
	&paddd ($xa0,&QWP($i+16*0-128,"ebp"));	# accumulate key material
	&paddd ($xa1,&QWP($i+16*1-128,"ebp"));
	&paddd ($xa2,&QWP($i+16*2-128,"ebp"));
	&paddd ($xa3,&QWP($i+16*3-128,"ebp"));

	&movdqa ($xt2,$xa0);			# "de-interlace" data
	&punpckldq ($xa0,$xa1);
	&movdqa ($xt3,$xa2);
	&punpckldq ($xa2,$xa3);
	&punpckhdq ($xt2,$xa1);
	&punpckhdq ($xt3,$xa3);
	&movdqa ($xa1,$xa0);
	&punpcklqdq ($xa0,$xa2);		# "a0"
	&movdqa ($xa3,$xt2);
	&punpcklqdq ($xt2,$xt3);		# "a2"
	&punpckhqdq ($xa1,$xa2);		# "a1"
	&punpckhqdq ($xa3,$xt3);		# "a3"

	#($xa2,$xt2)=($xt2,$xa2);

	&movdqu ($xt0,&QWP(64*0-128,$inp));	# load input
	&movdqu ($xt1,&QWP(64*1-128,$inp));
	&movdqu ($xa2,&QWP(64*2-128,$inp));
	&movdqu ($xt3,&QWP(64*3-128,$inp));
	&lea ($inp,&QWP($i<192?16:(64*4-16*3),$inp));
	&pxor ($xt0,$xa0);
	&movdqa ($xa0,&QWP($i+16*4-128,"ebx"))	if ($i<192);
	&pxor ($xt1,$xa1);
	&movdqa ($xa1,&QWP($i+16*5-128,"ebx"))	if ($i<192);
	&pxor ($xt2,$xa2);
	&movdqa ($xa2,&QWP($i+16*6-128,"ebx"))	if ($i<192);
	&pxor ($xt3,$xa3);
	&movdqa ($xa3,&QWP($i+16*7-128,"ebx"))	if ($i<192);
	&movdqu (&QWP(64*0-128,$out),$xt0);	# store output
	&movdqu (&QWP(64*1-128,$out),$xt1);
	&movdqu (&QWP(64*2-128,$out),$xt2);
	&movdqu (&QWP(64*3-128,$out),$xt3);
	&lea ($out,&QWP($i<192?16:(64*4-16*3),$out));
	}
	&sub ($len,64*4);
	&jnc (&label("outer_loop"));

	&add ($len,64*4);
	&jz (&label("done"));

	&mov ("ebx",&DWP(512+8,"esp"));		# restore pointers
	&lea ($inp,&DWP(-128,$inp));
	&mov ("edx",&DWP(512+4,"esp"));
	&lea ($out,&DWP(-128,$out));

	&movd ("xmm2",&DWP(16*12-128,"ebp"));	# counter value
	&movdqu ("xmm3",&QWP(0,"ebx"));
	&paddd ("xmm2",&QWP(16*6,"eax"));	# +four
	&pand ("xmm3",&QWP(16*7,"eax"));
	&por ("xmm3","xmm2");			# counter value
}
{
my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("xmm$_",(0..7));

sub SSSE3ROUND {	# critical path is 20 "SIMD ticks" per round
	&paddd ($a,$b);
	&pxor ($d,$a);
	&pshufb ($d,$rot16);

	&paddd ($c,$d);
	&pxor ($b,$c);
	&movdqa ($t,$b);
	&psrld ($b,20);
	&pslld ($t,12);
	&por ($b,$t);

	&paddd ($a,$b);
	&pxor ($d,$a);
	&pshufb ($d,$rot24);

	&paddd ($c,$d);
	&pxor ($b,$c);
	&movdqa ($t,$b);
	&psrld ($b,25);
	&pslld ($t,7);
	&por ($b,$t);
}
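
# In the 1x path below each xmm register holds one full row of the 4x4
# state, so SSSE3ROUND performs a column round on all four columns at once.
# Between rounds the rows are realigned with pshufd: 0b00111001 (0x39)
# rotates a row by one lane, 0b01001110 (0x4e) by two and 0b10010011 (0x93)
# by three, which turns the next SSSE3ROUND into a diagonal round; the
# second set of shuffles rotates the rows back into place.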

&set_label("1x");
	&movdqa ($a,&QWP(16*2,"eax"));		# sigma
	&movdqu ($b,&QWP(0,"edx"));
	&movdqu ($c,&QWP(16,"edx"));
	#&movdqu ($d,&QWP(0,"ebx"));		# already loaded
	&movdqa ($rot16,&QWP(0,"eax"));
	&movdqa ($rot24,&QWP(16,"eax"));
	&mov (&DWP(16*3,"esp"),"ebp");

	&movdqa (&QWP(16*0,"esp"),$a);
	&movdqa (&QWP(16*1,"esp"),$b);
	&movdqa (&QWP(16*2,"esp"),$c);
	&movdqa (&QWP(16*3,"esp"),$d);
	&mov ("edx",10);
	&jmp (&label("loop1x"));

&set_label("outer1x",16);
	&movdqa ($d,&QWP(16*5,"eax"));		# one
	&movdqa ($a,&QWP(16*0,"esp"));
	&movdqa ($b,&QWP(16*1,"esp"));
	&movdqa ($c,&QWP(16*2,"esp"));
	&paddd ($d,&QWP(16*3,"esp"));
	&mov ("edx",10);
	&movdqa (&QWP(16*3,"esp"),$d);
	&jmp (&label("loop1x"));

&set_label("loop1x",16);
	&SSSE3ROUND();
	&pshufd ($c,$c,0b01001110);
	&pshufd ($b,$b,0b00111001);
	&pshufd ($d,$d,0b10010011);
	&nop ();

	&SSSE3ROUND();
	&pshufd ($c,$c,0b01001110);
	&pshufd ($b,$b,0b10010011);
	&pshufd ($d,$d,0b00111001);

	&dec ("edx");
	&jnz (&label("loop1x"));

	&paddd ($a,&QWP(16*0,"esp"));
	&paddd ($b,&QWP(16*1,"esp"));
	&paddd ($c,&QWP(16*2,"esp"));
	&paddd ($d,&QWP(16*3,"esp"));

	&cmp ($len,64);
	&jb (&label("tail"));

	&movdqu ($t,&QWP(16*0,$inp));
	&movdqu ($t1,&QWP(16*1,$inp));
	&pxor ($a,$t);				# xor with input
	&movdqu ($t,&QWP(16*2,$inp));
	&pxor ($b,$t1);
	&movdqu ($t1,&QWP(16*3,$inp));
	&pxor ($c,$t);
	&pxor ($d,$t1);
	&lea ($inp,&DWP(16*4,$inp));		# inp+=64

	&movdqu (&QWP(16*0,$out),$a);		# write output
	&movdqu (&QWP(16*1,$out),$b);
	&movdqu (&QWP(16*2,$out),$c);
	&movdqu (&QWP(16*3,$out),$d);
	&lea ($out,&DWP(16*4,$out));		# out+=64

	&sub ($len,64);
	&jnz (&label("outer1x"));

	&jmp (&label("done"));

&set_label("tail");
	&movdqa (&QWP(16*0,"esp"),$a);
	&movdqa (&QWP(16*1,"esp"),$b);
	&movdqa (&QWP(16*2,"esp"),$c);
	&movdqa (&QWP(16*3,"esp"),$d);

	&xor ("eax","eax");
	&xor ("edx","edx");
	&xor ("ebp","ebp");

&set_label("tail_loop");
	&movb ("al",&BP(0,"esp","ebp"));
	&movb ("dl",&BP(0,$inp,"ebp"));
	&lea ("ebp",&DWP(1,"ebp"));
	&xor ("al","dl");
	&movb (&BP(-1,$out,"ebp"),"al");
	&dec ($len);
	&jnz (&label("tail_loop"));
}
&set_label("done");
	&mov ("esp",&DWP(512,"esp"));
&function_end("ChaCha20_ssse3");

&align (64);
&set_label("ssse3_data");
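# Layout of the constant table below, as it is read from the code above
# (offsets relative to the "ssse3_data" label; a reading of the code, kept
# here for orientation):
#
#	16*0	pshufb mask rotating each dword left by 16 bits ($rot16)
#	16*1	pshufb mask rotating each dword left by 8 bits ($rot24)
#	16*2	ChaCha20 "sigma" constants
#	16*3	0,1,2,3		per-lane counter offsets for the 4x path
#	16*4	4,4,4,4		counter increment per 4-block iteration
#	16*5	1,0,0,0		counter increment for the 1x path ("one")
#	16*6	4,0,0,0		added to the saved counter when the 4x
#				path falls back to 1x ("+four")
#	16*7	0,-1,-1,-1	mask keeping the nonce words while the
#				counter lane is re-inserted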
&data_byte(0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd);
&data_byte(0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe);
&data_word(0x61707865,0x3320646e,0x79622d32,0x6b206574);
&data_word(0,1,2,3);
&data_word(4,4,4,4);
&data_word(1,0,0,0);
&data_word(4,0,0,0);
&data_word(0,-1,-1,-1);
&align (64);
}
&asciz ("ChaCha20 for x86, CRYPTOGAMS by <appro\@openssl.org>");

if ($ymm) {
my ($xa,$xa_,$xb,$xb_,$xc,$xc_,$xd,$xd_)=map("xmm$_",(0..7));
my ($out,$inp,$len)=("edi","esi","ecx");

sub QUARTERROUND_XOP {
my ($ai,$bi,$ci,$di,$i)=@_;
my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di));	# next
my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di));	# previous

	# a   b   c   d
	#
	# 0   4   8  12 < even round
	# 1   5   9  13
	# 2   6  10  14
	# 3   7  11  15
	# 0   5  10  15 < odd round
	# 1   6  11  12
	# 2   7   8  13
	# 3   4   9  14

	if ($i==0) {
	my $j=4;
	($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp));
	} elsif ($i==3) {
	my $j=0;
	($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn));
	} elsif ($i==4) {
	my $j=4;
	($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp));
	} elsif ($i==7) {
	my $j=0;
	($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn));
	}

	#&vpaddd ($xa,$xa,$xb);			# see elsewhere
	#&vpxor ($xd,$xd,$xa);			# see elsewhere
	&vmovdqa (&QWP(16*$cp-128,"ebx"),$xc_)	if ($ai>0 && $ai<3);
	&vprotd ($xd,$xd,16);
	&vmovdqa (&QWP(16*$bp-128,"ebx"),$xb_)	if ($i!=0);
	&vpaddd ($xc,$xc,$xd);
	&vmovdqa ($xc_,&QWP(16*$cn-128,"ebx"))	if ($ai>0 && $ai<3);
	&vpxor ($xb,$i!=0?$xb:$xb_,$xc);
	&vmovdqa ($xa_,&QWP(16*$an-128,"ebx"));
	&vprotd ($xb,$xb,12);
	&vmovdqa ($xb_,&QWP(16*$bn-128,"ebx"))	if ($i<7);
	&vpaddd ($xa,$xa,$xb);
	&vmovdqa ($xd_,&QWP(16*$dn-128,"ebx"))	if ($di!=$dn);
	&vpxor ($xd,$xd,$xa);
	&vpaddd ($xa_,$xa_,$xb_)		if ($i<7);	# elsewhere
	&vprotd ($xd,$xd,8);
	&vmovdqa (&QWP(16*$ai-128,"ebx"),$xa);
	&vpaddd ($xc,$xc,$xd);
	&vmovdqa (&QWP(16*$di-128,"ebx"),$xd)	if ($di!=$dn);
	&vpxor ($xb,$xb,$xc);
	&vpxor ($xd_,$di==$dn?$xd:$xd_,$xa_)	if ($i<7);	# elsewhere
	&vprotd ($xb,$xb,7);

	($xa,$xa_)=($xa_,$xa);
	($xb,$xb_)=($xb_,$xb);
	($xc,$xc_)=($xc_,$xc);
	($xd,$xd_)=($xd_,$xd);
}

&function_begin("ChaCha20_xop");
&set_label("xop_shortcut");
	&mov ($out,&wparam(0));
	&mov ($inp,&wparam(1));
	&mov ($len,&wparam(2));
	&mov ("edx",&wparam(3));		# key
	&mov ("ebx",&wparam(4));		# counter and nonce
	&vzeroupper ();

	&mov ("ebp","esp");
	&stack_push (131);
	&and ("esp",-64);
	&mov (&DWP(512,"esp"),"ebp");

	&lea ("eax",&DWP(&label("ssse3_data")."-".
			 &label("pic_point"),"eax"));
	&vmovdqu ("xmm3",&QWP(0,"ebx"));	# counter and nonce

	&cmp ($len,64*4);
	&jb (&label("1x"));

	&mov (&DWP(512+4,"esp"),"edx");		# offload pointers
	&mov (&DWP(512+8,"esp"),"ebx");
	&sub ($len,64*4);			# bias len
	&lea ("ebp",&DWP(256+128,"esp"));	# size optimization

	&vmovdqu ("xmm7",&QWP(0,"edx"));	# key
	&vpshufd ("xmm0","xmm3",0x00);
	&vpshufd ("xmm1","xmm3",0x55);
	&vpshufd ("xmm2","xmm3",0xaa);
	&vpshufd ("xmm3","xmm3",0xff);
	&vpaddd ("xmm0","xmm0",&QWP(16*3,"eax"));	# fix counters
	&vpshufd ("xmm4","xmm7",0x00);
	&vpshufd ("xmm5","xmm7",0x55);
	&vpsubd ("xmm0","xmm0",&QWP(16*4,"eax"));
	&vpshufd ("xmm6","xmm7",0xaa);
	&vpshufd ("xmm7","xmm7",0xff);
	&vmovdqa (&QWP(16*12-128,"ebp"),"xmm0");
	&vmovdqa (&QWP(16*13-128,"ebp"),"xmm1");
	&vmovdqa (&QWP(16*14-128,"ebp"),"xmm2");
	&vmovdqa (&QWP(16*15-128,"ebp"),"xmm3");
	&vmovdqu ("xmm3",&QWP(16,"edx"));	# key
	&vmovdqa (&QWP(16*4-128,"ebp"),"xmm4");
	&vmovdqa (&QWP(16*5-128,"ebp"),"xmm5");
	&vmovdqa (&QWP(16*6-128,"ebp"),"xmm6");
	&vmovdqa (&QWP(16*7-128,"ebp"),"xmm7");
	&vmovdqa ("xmm7",&QWP(16*2,"eax"));	# sigma
	&lea ("ebx",&DWP(128,"esp"));		# size optimization

	&vpshufd ("xmm0","xmm3",0x00);
	&vpshufd ("xmm1","xmm3",0x55);
	&vpshufd ("xmm2","xmm3",0xaa);
	&vpshufd ("xmm3","xmm3",0xff);
	&vpshufd ("xmm4","xmm7",0x00);
	&vpshufd ("xmm5","xmm7",0x55);
	&vpshufd ("xmm6","xmm7",0xaa);
	&vpshufd ("xmm7","xmm7",0xff);
	&vmovdqa (&QWP(16*8-128,"ebp"),"xmm0");
	&vmovdqa (&QWP(16*9-128,"ebp"),"xmm1");
	&vmovdqa (&QWP(16*10-128,"ebp"),"xmm2");
	&vmovdqa (&QWP(16*11-128,"ebp"),"xmm3");
	&vmovdqa (&QWP(16*0-128,"ebp"),"xmm4");
	&vmovdqa (&QWP(16*1-128,"ebp"),"xmm5");
	&vmovdqa (&QWP(16*2-128,"ebp"),"xmm6");
	&vmovdqa (&QWP(16*3-128,"ebp"),"xmm7");

	&lea ($inp,&DWP(128,$inp));		# size optimization
	&lea ($out,&DWP(128,$out));		# size optimization
	&jmp (&label("outer_loop"));

&set_label("outer_loop",32);
	#&vmovdqa ("xmm0",&QWP(16*0-128,"ebp"));	# copy key material
	&vmovdqa ("xmm1",&QWP(16*1-128,"ebp"));
	&vmovdqa ("xmm2",&QWP(16*2-128,"ebp"));
	&vmovdqa ("xmm3",&QWP(16*3-128,"ebp"));
	#&vmovdqa ("xmm4",&QWP(16*4-128,"ebp"));
	&vmovdqa ("xmm5",&QWP(16*5-128,"ebp"));
	&vmovdqa ("xmm6",&QWP(16*6-128,"ebp"));
	&vmovdqa ("xmm7",&QWP(16*7-128,"ebp"));
	#&vmovdqa (&QWP(16*0-128,"ebx"),"xmm0");
	&vmovdqa (&QWP(16*1-128,"ebx"),"xmm1");
	&vmovdqa (&QWP(16*2-128,"ebx"),"xmm2");
	&vmovdqa (&QWP(16*3-128,"ebx"),"xmm3");
	#&vmovdqa (&QWP(16*4-128,"ebx"),"xmm4");
	&vmovdqa (&QWP(16*5-128,"ebx"),"xmm5");
	&vmovdqa (&QWP(16*6-128,"ebx"),"xmm6");
	&vmovdqa (&QWP(16*7-128,"ebx"),"xmm7");
	#&vmovdqa ("xmm0",&QWP(16*8-128,"ebp"));
	#&vmovdqa ("xmm1",&QWP(16*9-128,"ebp"));
	&vmovdqa ("xmm2",&QWP(16*10-128,"ebp"));
	&vmovdqa ("xmm3",&QWP(16*11-128,"ebp"));
	&vmovdqa ("xmm4",&QWP(16*12-128,"ebp"));
	&vmovdqa ("xmm5",&QWP(16*13-128,"ebp"));
	&vmovdqa ("xmm6",&QWP(16*14-128,"ebp"));
	&vmovdqa ("xmm7",&QWP(16*15-128,"ebp"));
	&vpaddd ("xmm4","xmm4",&QWP(16*4,"eax"));	# counter value
	#&vmovdqa (&QWP(16*8-128,"ebx"),"xmm0");
	#&vmovdqa (&QWP(16*9-128,"ebx"),"xmm1");
	&vmovdqa (&QWP(16*10-128,"ebx"),"xmm2");
	&vmovdqa (&QWP(16*11-128,"ebx"),"xmm3");
	&vmovdqa (&QWP(16*12-128,"ebx"),"xmm4");
	&vmovdqa (&QWP(16*13-128,"ebx"),"xmm5");
	&vmovdqa (&QWP(16*14-128,"ebx"),"xmm6");
	&vmovdqa (&QWP(16*15-128,"ebx"),"xmm7");
	&vmovdqa (&QWP(16*12-128,"ebp"),"xmm4");	# save counter value

	&vmovdqa ($xa, &QWP(16*0-128,"ebp"));
	&vmovdqa ($xd, "xmm4");
	&vmovdqa ($xb_,&QWP(16*4-128,"ebp"));
	&vmovdqa ($xc, &QWP(16*8-128,"ebp"));
	&vmovdqa ($xc_,&QWP(16*9-128,"ebp"));

	&mov ("edx",10);			# loop counter
	&nop ();

&set_label("loop",32);
	&vpaddd ($xa,$xa,$xb_);			# elsewhere
	&vpxor ($xd,$xd,$xa);			# elsewhere
	&QUARTERROUND_XOP(0, 4, 8, 12, 0);
	&QUARTERROUND_XOP(1, 5, 9, 13, 1);
	&QUARTERROUND_XOP(2, 6,10, 14, 2);
	&QUARTERROUND_XOP(3, 7,11, 15, 3);
	&QUARTERROUND_XOP(0, 5,10, 15, 4);
	&QUARTERROUND_XOP(1, 6,11, 12, 5);
	&QUARTERROUND_XOP(2, 7, 8, 13, 6);
	&QUARTERROUND_XOP(3, 4, 9, 14, 7);
	&dec ("edx");
	&jnz (&label("loop"));

	&vmovdqa (&QWP(16*4-128,"ebx"),$xb_);
	&vmovdqa (&QWP(16*8-128,"ebx"),$xc);
	&vmovdqa (&QWP(16*9-128,"ebx"),$xc_);
	&vmovdqa (&QWP(16*12-128,"ebx"),$xd);
	&vmovdqa (&QWP(16*14-128,"ebx"),$xd_);

	my ($xa0,$xa1,$xa2,$xa3,$xt0,$xt1,$xt2,$xt3)=map("xmm$_",(0..7));

	#&vmovdqa ($xa0,&QWP(16*0-128,"ebx"));	# it's there
	&vmovdqa ($xa1,&QWP(16*1-128,"ebx"));
	&vmovdqa ($xa2,&QWP(16*2-128,"ebx"));
	&vmovdqa ($xa3,&QWP(16*3-128,"ebx"));

	for($i=0;$i<256;$i+=64) {
	&vpaddd ($xa0,$xa0,&QWP($i+16*0-128,"ebp"));	# accumulate key material
	&vpaddd ($xa1,$xa1,&QWP($i+16*1-128,"ebp"));
	&vpaddd ($xa2,$xa2,&QWP($i+16*2-128,"ebp"));
	&vpaddd ($xa3,$xa3,&QWP($i+16*3-128,"ebp"));

	&vpunpckldq ($xt2,$xa0,$xa1);		# "de-interlace" data
	&vpunpckldq ($xt3,$xa2,$xa3);
	&vpunpckhdq ($xa0,$xa0,$xa1);
	&vpunpckhdq ($xa2,$xa2,$xa3);
	&vpunpcklqdq ($xa1,$xt2,$xt3);		# "a0"
	&vpunpckhqdq ($xt2,$xt2,$xt3);		# "a1"
	&vpunpcklqdq ($xt3,$xa0,$xa2);		# "a2"
	&vpunpckhqdq ($xa3,$xa0,$xa2);		# "a3"

	&vpxor ($xt0,$xa1,&QWP(64*0-128,$inp));
	&vpxor ($xt1,$xt2,&QWP(64*1-128,$inp));
	&vpxor ($xt2,$xt3,&QWP(64*2-128,$inp));
	&vpxor ($xt3,$xa3,&QWP(64*3-128,$inp));
	&lea ($inp,&QWP($i<192?16:(64*4-16*3),$inp));
	&vmovdqa ($xa0,&QWP($i+16*4-128,"ebx"))	if ($i<192);
	&vmovdqa ($xa1,&QWP($i+16*5-128,"ebx"))	if ($i<192);
	&vmovdqa ($xa2,&QWP($i+16*6-128,"ebx"))	if ($i<192);
	&vmovdqa ($xa3,&QWP($i+16*7-128,"ebx"))	if ($i<192);
	&vmovdqu (&QWP(64*0-128,$out),$xt0);	# store output
	&vmovdqu (&QWP(64*1-128,$out),$xt1);
	&vmovdqu (&QWP(64*2-128,$out),$xt2);
	&vmovdqu (&QWP(64*3-128,$out),$xt3);
	&lea ($out,&QWP($i<192?16:(64*4-16*3),$out));
	}
	&sub ($len,64*4);
	&jnc (&label("outer_loop"));

	&add ($len,64*4);
	&jz (&label("done"));

	&mov ("ebx",&DWP(512+8,"esp"));		# restore pointers
	&lea ($inp,&DWP(-128,$inp));
	&mov ("edx",&DWP(512+4,"esp"));
	&lea ($out,&DWP(-128,$out));

	&vmovd ("xmm2",&DWP(16*12-128,"ebp"));	# counter value
	&vmovdqu ("xmm3",&QWP(0,"ebx"));
	&vpaddd ("xmm2","xmm2",&QWP(16*6,"eax"));	# +four
	&vpand ("xmm3","xmm3",&QWP(16*7,"eax"));
	&vpor ("xmm3","xmm3","xmm2");		# counter value
{
my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("xmm$_",(0..7));

sub XOPROUND {
	&vpaddd ($a,$a,$b);
	&vpxor ($d,$d,$a);
	&vprotd ($d,$d,16);

	&vpaddd ($c,$c,$d);
	&vpxor ($b,$b,$c);
	&vprotd ($b,$b,12);

	&vpaddd ($a,$a,$b);
	&vpxor ($d,$d,$a);
	&vprotd ($d,$d,8);

	&vpaddd ($c,$c,$d);
	&vpxor ($b,$b,$c);
	&vprotd ($b,$b,7);
}

&set_label("1x");
	&vmovdqa ($a,&QWP(16*2,"eax"));		# sigma
	&vmovdqu ($b,&QWP(0,"edx"));
	&vmovdqu ($c,&QWP(16,"edx"));
	#&vmovdqu ($d,&QWP(0,"ebx"));		# already loaded
	&vmovdqa ($rot16,&QWP(0,"eax"));
	&vmovdqa ($rot24,&QWP(16,"eax"));
	&mov (&DWP(16*3,"esp"),"ebp");

	&vmovdqa (&QWP(16*0,"esp"),$a);
	&vmovdqa (&QWP(16*1,"esp"),$b);
	&vmovdqa (&QWP(16*2,"esp"),$c);
	&vmovdqa (&QWP(16*3,"esp"),$d);
	&mov ("edx",10);
	&jmp (&label("loop1x"));

&set_label("outer1x",16);
	&vmovdqa ($d,&QWP(16*5,"eax"));		# one
	&vmovdqa ($a,&QWP(16*0,"esp"));
	&vmovdqa ($b,&QWP(16*1,"esp"));
	&vmovdqa ($c,&QWP(16*2,"esp"));
	&vpaddd ($d,$d,&QWP(16*3,"esp"));
	&mov ("edx",10);
	&vmovdqa (&QWP(16*3,"esp"),$d);
	&jmp (&label("loop1x"));

&set_label("loop1x",16);
	&XOPROUND();
	&vpshufd ($c,$c,0b01001110);
	&vpshufd ($b,$b,0b00111001);
	&vpshufd ($d,$d,0b10010011);

	&XOPROUND();
	&vpshufd ($c,$c,0b01001110);
	&vpshufd ($b,$b,0b10010011);
	&vpshufd ($d,$d,0b00111001);

	&dec ("edx");
	&jnz (&label("loop1x"));

	&vpaddd ($a,$a,&QWP(16*0,"esp"));
	&vpaddd ($b,$b,&QWP(16*1,"esp"));
	&vpaddd ($c,$c,&QWP(16*2,"esp"));
	&vpaddd ($d,$d,&QWP(16*3,"esp"));

	&cmp ($len,64);
	&jb (&label("tail"));

	&vpxor ($a,$a,&QWP(16*0,$inp));		# xor with input
	&vpxor ($b,$b,&QWP(16*1,$inp));
	&vpxor ($c,$c,&QWP(16*2,$inp));
	&vpxor ($d,$d,&QWP(16*3,$inp));
	&lea ($inp,&DWP(16*4,$inp));		# inp+=64

	&vmovdqu (&QWP(16*0,$out),$a);		# write output
	&vmovdqu (&QWP(16*1,$out),$b);
	&vmovdqu (&QWP(16*2,$out),$c);
	&vmovdqu (&QWP(16*3,$out),$d);
	&lea ($out,&DWP(16*4,$out));		# out+=64

	&sub ($len,64);
	&jnz (&label("outer1x"));

	&jmp (&label("done"));

&set_label("tail");
	&vmovdqa (&QWP(16*0,"esp"),$a);
	&vmovdqa (&QWP(16*1,"esp"),$b);
	&vmovdqa (&QWP(16*2,"esp"),$c);
	&vmovdqa (&QWP(16*3,"esp"),$d);

	&xor ("eax","eax");
	&xor ("edx","edx");
	&xor ("ebp","ebp");

&set_label("tail_loop");
	&movb ("al",&BP(0,"esp","ebp"));
	&movb ("dl",&BP(0,$inp,"ebp"));
	&lea ("ebp",&DWP(1,"ebp"));
	&xor ("al","dl");
	&movb (&BP(-1,$out,"ebp"),"al");
	&dec ($len);
	&jnz (&label("tail_loop"));
}
&set_label("done");
	&vzeroupper ();
	&mov ("esp",&DWP(512,"esp"));
&function_end("ChaCha20_xop");
}

&asm_finish();

close STDOUT or die "error closing STDOUT: $!";