]> git.ipfire.org Git - thirdparty/openssl.git/blame - crypto/aes/asm/aesni-x86.pl
Remove eng_aesni.c as AES-NI support is integrated directly at EVP.
[thirdparty/openssl.git] / crypto / aes / asm / aesni-x86.pl
CommitLineData
d64a7232
AP
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# This module implements support for Intel AES-NI extension. In
11# OpenSSL context it's used with Intel engine, but can also be used as
12# drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
13# details].
d7d119a3
AP
14#
15# Performance.
16#
17# To start with see corresponding paragraph in aesni-x86_64.pl...
18# Instead of filling table similar to one found there I've chosen to
19# summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
20# The simplified table below represents 32-bit performance relative
21# to 64-bit one in every given point. Ratios vary for different
22# encryption modes, therefore interval values.
23#
24# 16-byte 64-byte 256-byte 1-KB 8-KB
25# 53-67% 67-84% 91-94% 95-98% 97-99.5%
26#
27# Lower ratios for smaller block sizes are perfectly understandable,
28# because function call overhead is higher in 32-bit mode. Largest
29# 8-KB block performance is virtually same: 32-bit code is less than
f8501464
AP
30# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
31
32# January 2011
33#
34# See aesni-x86_64.pl for details. Unlike x86_64 version this module
35# interleaves at most 6 aes[enc|dec] instructions, because there are
36# not enough registers for 8x interleave [which should be optimal for
37# Sandy Bridge]. Actually, performance results for 6x interleave
38# factor presented in aesni-x86_64.pl (except for CTR) are for this
39# module.
40
41# April 2011
42#
43# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
44# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
d64a7232
AP
45
46$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
47 # generates drop-in replacement for
48 # crypto/aes/asm/aes-586.pl:-)
6f766a41 49$inline=1; # inline _aesni_[en|de]crypt
d64a7232
AP
50
51$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
52push(@INC,"${dir}","${dir}../../perlasm");
53require "x86asm.pl";
54
55&asm_init($ARGV[0],$0);
56
# Instruction used throughout this module to load round keys from the key
# schedule.  The choice used to be made per $PREFIX, but both flavours
# ("aesni" and the drop-in "AES" replacement) ended up with the same
# unaligned movups form, so it is assigned unconditionally.
$movekey=*movups;
d64a7232
AP
59
60$len="eax";
61$rounds="ecx";
62$key="edx";
63$inp="esi";
64$out="edi";
d608b4d6
AP
65$rounds_="ebx"; # backup copy for $rounds
66$key_="ebp"; # backup copy for $key
d64a7232 67
f8501464
AP
68$rndkey0="xmm0";
69$rndkey1="xmm1";
70$inout0="xmm2";
71$inout1="xmm3";
72$inout2="xmm4";
73$inout3="xmm5"; $in1="xmm5";
74$inout4="xmm6"; $in0="xmm6";
75$inout5="xmm7"; $ivec="xmm7";
133a7f9a
AP
76
77# AESNI extension
# Emit AESKEYGENASSIST dst,src,imm8 as raw bytes (66 0F 3A DF /r ib)
# through data_byte.
#
#   $dst, $src	XMM register names "xmm0".."xmm7"; they are packed into a
#		register-direct ModR/M byte (0xc0 | dst<<3 | src)
#   $imm	immediate byte appended to the encoding
#
# Dies on any other operand form.  Quietly emitting nothing, as this used
# to do, would drop an instruction from the generated code.
sub aeskeygenassist
{ my($dst,$src,$imm)=@_;
    if ("$dst:$src" =~ /\Axmm([0-7]):xmm([0-7])\z/)
    {	&data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm);	}
    else
    {	die "aeskeygenassist: unsupported operands '$dst','$src'\n";	}
}

# Common encoder for the two-operand AES-NI instructions (66 0F 38 xx /r).
#
#   $opcodelet	last opcode byte, selects the instruction
#   $dst, $src	XMM register names "xmm0".."xmm7"
#
# Dies on any other operand form instead of silently emitting nothing.
# Call sites that go through string eval will trap that exception in $@.
sub aescommon
{ my($opcodelet,$dst,$src)=@_;
    if ("$dst:$src" =~ /\Axmm([0-7]):xmm([0-7])\z/)
    {	&data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);	}
    else
    {	die sprintf("AES-NI opcode 0x%02x: unsupported operands '%s','%s'\n",
		    $opcodelet,$dst,$src);
    }
}

# Thin per-instruction wrappers, arguments are (dst,src).
sub aesimc	{ aescommon(0xdb,@_); }
sub aesenc	{ aescommon(0xdc,@_); }
sub aesenclast	{ aescommon(0xdd,@_); }
sub aesdec	{ aescommon(0xde,@_); }
sub aesdeclast	{ aescommon(0xdf,@_); }
6c83629b 93\f
# Inline version of internal aesni_[en|de]crypt1: expands the
# single-block routine at the point of call.
#
#   $p		"enc" or "dec", selects the aes${p}/aes${p}last mnemonics
#   $inout	register holding the block, $inout0 when omitted
#   $ivec	optional register; when given it is xor-ed with round key 0
#		and then into $inout (so $ivec itself is modified), otherwise
#		round key 0 is xor-ed into $inout directly
#
# The emitted code advances $key and counts $rounds down to zero.
{ my $serial;	# bumped on every expansion so loop labels stay unique
sub aesni_inline_generate1
{ my ($p,$inout,$ivec)=@_;
  my $chained=defined($ivec);
  my $loop;

	$inout=$inout0	unless (defined($inout));
	$serial++;
	$loop="${p}1_loop_$serial";

	&$movekey	($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($ivec,$rndkey0)	if ($chained);
	&lea		($key,&DWP(32,$key));
	if ($chained)	{ &xorps ($inout,$ivec);    }
	else		{ &xorps ($inout,$rndkey0); }

    &set_label($loop);
	eval"&aes${p}	($inout,$rndkey1)";
	&dec		($rounds);
	&$movekey	($rndkey1,&QWP(0,$key));
	&lea		($key,&DWP(16,$key));
	&jnz		(&label($loop));
	eval"&aes${p}last	($inout,$rndkey1)";
}}
d64a7232
AP
114
# Emit the out-of-line, fully unrolled single-block routine
# _aesni_${p}rypt1.
#
#   $p		"enc" or "dec"
#   $inout	register holding the block, $inout0 when omitted
#
# Round keys are consumed alternately from $rndkey1 and $rndkey0; each
# register is reloaded right after it has been used.  $rounds below 11
# enters the sequence at ${p}128, $rounds equal to 11 at ${p}192, and
# anything larger runs the whole sequence.
sub aesni_generate1	# fully unrolled loop
{ my ($p,$inout)=@_;
  my @keyreg=($rndkey1,$rndkey0);
  my $turn=0;
  # one round with the register whose turn it is, then refill that
  # register from $disp($key)
  my $round_then_reload=sub
  { my $disp=shift;
    my $reg=$keyreg[$turn++&1];
	eval"&aes${p}	($inout,$reg)";
	&$movekey	($reg,&QWP($disp,$key));
  };

	$inout=$inout0	unless (defined($inout));

    &function_begin_B("_aesni_${p}rypt1");
	&movups		($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey1,&QWP(0x10,$key));
	&xorps		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(0x20,$key));
	&lea		($key,&DWP(0x30,$key));
	&cmp		($rounds,11);
	&jb		(&label("${p}128"));
	&lea		($key,&DWP(0x20,$key));
	&je		(&label("${p}192"));
	&lea		($key,&DWP(0x20,$key));

	foreach my $disp (-0x40,-0x30)	{ &$round_then_reload($disp); }
    &set_label("${p}192");
	foreach my $disp (-0x20,-0x10)	{ &$round_then_reload($disp); }
    &set_label("${p}128");
	foreach my $slot (0..7)		{ &$round_then_reload(0x10*$slot); }

	eval"&aes${p}	($inout,$rndkey1)";
	eval"&aes${p}last	($inout,$rndkey0)";
	&ret();
    &function_end_B("_aesni_${p}rypt1");
}
6c83629b 160\f
# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
#
# Single-block entry points.  Both directions share one template: load
# the block from inp, pick up the rounds count at offset 240 of key, run
# the one-block routine (inlined, or called out of line when $inline is
# off) and store the result to out.
foreach my $dir ("enc","dec")
{ my $entry="${PREFIX}_${dir}rypt";
  my $worker="_aesni_${dir}rypt1";

	# the out-of-line worker is needed only when nothing is inlined
	&aesni_generate1($dir)	unless ($inline);

    &function_begin_B($entry);
	&mov	("eax",&wparam(0));
	&mov	($key,&wparam(2));
	&movups	($inout0,&QWP(0,"eax"));
	&mov	($rounds,&DWP(240,$key));
	&mov	("eax",&wparam(1));
	if (!$inline)	{ &call ($worker); }
	else		{ &aesni_inline_generate1($dir); }
	&movups	(&QWP(0,"eax"),$inout0);
	&ret	();
    &function_end_B($entry);
}
6c83629b 192
f8501464
AP
193# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
194# factor. Why were 3x subroutines originally used in loops? Even though
195# aes[enc|dec] latency was originally 6, it could be scheduled only
196# every *2nd* cycle. Thus 3x interleave was the one providing optimal
d608b4d6
AP
197# utilization, i.e. when subroutine's throughput is virtually same as
198# of non-interleaved subroutine [for number of input blocks up to 3].
f8501464
AP
199# This is why it makes no sense to implement 2x subroutine.
200# aes[enc|dec] latency in next processor generation is 8, but the
201# instructions can be scheduled every cycle. Optimal interleave for
202# new processor is therefore 8x, but it's unfeasible to accommodate it
203# in XMM registers addressable in 32-bit mode and therefore 6x is
204# used instead...
205
d64a7232
AP
# Emit _aesni_${p}rypt3, which processes $inout0..$inout2 in parallel.
#
#   $p	"enc" or "dec"
#
# $rounds is halved on entry because every loop iteration applies two
# round keys ($rndkey1, then $rndkey0).  $key is advanced by 32 per
# iteration.
sub aesni_generate3
{ my $p=shift;
  my $fn="_aesni_${p}rypt3";
  my $top="${p}3_loop";
  my @all=($inout0,$inout1,$inout2);
  # apply $insn with round key $rk to each listed register, in order
  my $apply=sub
  { my ($insn,$rk,@regs)=@_;
	foreach my $reg (@regs)	{ eval"&${insn}	($reg,$rk)"; }
  };

    &function_begin_B($fn);
	&$movekey	($rndkey0,&QWP(0,$key));
	&shr		($rounds,1);
	&$movekey	($rndkey1,&QWP(16,$key));
	&lea		($key,&DWP(32,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&pxor		($inout2,$rndkey0);
	&$movekey	($rndkey0,&QWP(0,$key));

    &set_label($top);
	&$apply		("aes${p}",$rndkey1,$inout0,$inout1);
	&dec		($rounds);
	&$apply		("aes${p}",$rndkey1,$inout2);
	&$movekey	($rndkey1,&QWP(16,$key));
	&$apply		("aes${p}",$rndkey0,$inout0,$inout1);
	&lea		($key,&DWP(32,$key));
	&$apply		("aes${p}",$rndkey0,$inout2);
	&$movekey	($rndkey0,&QWP(0,$key));
	&jnz		(&label($top));

	&$apply		("aes${p}",$rndkey1,@all);
	&$apply		("aes${p}last",$rndkey0,@all);
	&ret();
    &function_end_B($fn);
}
d608b4d6
AP
240
241# 4x interleave is implemented to improve small block performance,
242# most notably [and naturally] 4 block by ~30%. One can argue that one
243# should have implemented 5x as well, but improvement would be <20%,
244# so it's not worth it...
# Emit _aesni_${p}rypt4, which processes $inout0..$inout3 in parallel.
#
#   $p	"enc" or "dec"
#
# Same scheme as the 3x routine: $rounds is halved, each loop iteration
# applies $rndkey1 and then $rndkey0 to all four blocks, and $key is
# advanced by 32 per iteration.
sub aesni_generate4
{ my $p=shift;
  my $fn="_aesni_${p}rypt4";
  my $top="${p}4_loop";
  my @lo=($inout0,$inout1);
  my @hi=($inout2,$inout3);
  # apply $insn with round key $rk to each listed register, in order
  my $apply=sub
  { my ($insn,$rk,@regs)=@_;
	foreach my $reg (@regs)	{ eval"&${insn}	($reg,$rk)"; }
  };

    &function_begin_B($fn);
	&$movekey	($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey1,&QWP(16,$key));
	&shr		($rounds,1);
	&lea		($key,&DWP(32,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&pxor		($inout2,$rndkey0);
	&pxor		($inout3,$rndkey0);
	&$movekey	($rndkey0,&QWP(0,$key));

    &set_label($top);
	&$apply		("aes${p}",$rndkey1,@lo);
	&dec		($rounds);
	&$apply		("aes${p}",$rndkey1,@hi);
	&$movekey	($rndkey1,&QWP(16,$key));
	&$apply		("aes${p}",$rndkey0,@lo);
	&lea		($key,&DWP(32,$key));
	&$apply		("aes${p}",$rndkey0,@hi);
	&$movekey	($rndkey0,&QWP(0,$key));
	&jnz		(&label($top));

	&$apply		("aes${p}",$rndkey1,@lo,@hi);
	&$apply		("aes${p}last",$rndkey0,@lo,@hi);
	&ret();
    &function_end_B($fn);
}
f8501464
AP
285
# Emit _aesni_${p}rypt6, which processes $inout0..$inout5 in parallel.
#
#   $p	"enc" or "dec"
#
# As in the 3x/4x routines, $rounds is halved and each loop iteration
# applies $rndkey1 and then $rndkey0, advancing $key by 32.  The loop is
# entered in the middle: the prologue interleaves the initial round-key-0
# xor with the first $rndkey1 round (and does the matching dec), then
# jumps to _aesni_${p}rypt6_enter for the $rndkey0 half.  That label is
# also called directly by aesni_ctr32_encrypt_blocks, which inlines an
# equivalent prologue of its own.
286sub aesni_generate6
287{ my $p=shift;
288
289 &function_begin_B("_aesni_${p}rypt6");
# declared static so that it can be referenced from outside this function
290 &static_label("_aesni_${p}rypt6_enter");
291 &$movekey ($rndkey0,&QWP(0,$key));
292 &shr ($rounds,1);
293 &$movekey ($rndkey1,&QWP(16,$key));
294 &lea ($key,&DWP(32,$key));
295 &xorps ($inout0,$rndkey0);
296 &pxor ($inout1,$rndkey0); # pxor does better here
297 eval"&aes${p} ($inout0,$rndkey1)";
298 &pxor ($inout2,$rndkey0);
299 eval"&aes${p} ($inout1,$rndkey1)";
300 &pxor ($inout3,$rndkey0);
# The flags set by this dec are consumed by the jnz at the bottom of the
# loop; pxor, aes*, the key loads, lea and jmp in between leave EFLAGS
# untouched.
301 &dec ($rounds);
302 eval"&aes${p} ($inout2,$rndkey1)";
303 &pxor ($inout4,$rndkey0);
304 eval"&aes${p} ($inout3,$rndkey1)";
305 &pxor ($inout5,$rndkey0);
306 eval"&aes${p} ($inout4,$rndkey1)";
307 &$movekey ($rndkey0,&QWP(0,$key));
308 eval"&aes${p} ($inout5,$rndkey1)";
309 &jmp (&label("_aesni_${p}rypt6_enter"));
310
# first half of an iteration: round with $rndkey1
311 &set_label("${p}6_loop",16);
312 eval"&aes${p} ($inout0,$rndkey1)";
313 eval"&aes${p} ($inout1,$rndkey1)";
314 &dec ($rounds);
315 eval"&aes${p} ($inout2,$rndkey1)";
316 eval"&aes${p} ($inout3,$rndkey1)";
317 eval"&aes${p} ($inout4,$rndkey1)";
318 eval"&aes${p} ($inout5,$rndkey1)";
# second half: round with $rndkey0.  Callers entering here must already
# have done the first half, loaded $rndkey0 and set the flags from dec.
319 &set_label("_aesni_${p}rypt6_enter",16);
320 &$movekey ($rndkey1,&QWP(16,$key));
321 eval"&aes${p} ($inout0,$rndkey0)";
322 eval"&aes${p} ($inout1,$rndkey0)";
323 &lea ($key,&DWP(32,$key));
324 eval"&aes${p} ($inout2,$rndkey0)";
325 eval"&aes${p} ($inout3,$rndkey0)";
326 eval"&aes${p} ($inout4,$rndkey0)";
327 eval"&aes${p} ($inout5,$rndkey0)";
328 &$movekey ($rndkey0,&QWP(0,$key));
329 &jnz (&label("${p}6_loop"));
330
# tail: one more $rndkey1 round, then the final round with $rndkey0
331 eval"&aes${p} ($inout0,$rndkey1)";
332 eval"&aes${p} ($inout1,$rndkey1)";
333 eval"&aes${p} ($inout2,$rndkey1)";
334 eval"&aes${p} ($inout3,$rndkey1)";
335 eval"&aes${p} ($inout4,$rndkey1)";
336 eval"&aes${p} ($inout5,$rndkey1)";
337 eval"&aes${p}last ($inout0,$rndkey0)";
338 eval"&aes${p}last ($inout1,$rndkey0)";
339 eval"&aes${p}last ($inout2,$rndkey0)";
340 eval"&aes${p}last ($inout3,$rndkey0)";
341 eval"&aes${p}last ($inout4,$rndkey0)";
342 eval"&aes${p}last ($inout5,$rndkey0)";
343 &ret();
344 &function_end_B("_aesni_${p}rypt6");
345}
d64a7232
AP
# Instantiate the 3x, 4x and 6x interleaved helpers, in that order.  The
# decrypt flavour is always emitted; the encrypt flavour only when the
# module is generated with the "aesni" prefix.
foreach my $generate (\&aesni_generate3,\&aesni_generate4,\&aesni_generate6)
{	&$generate("enc")	if ($PREFIX eq "aesni");
	&$generate("dec");
}
6c83629b 352\f
d64a7232 353if ($PREFIX eq "aesni") {
6c83629b 354######################################################################
d64a7232
AP
355# void aesni_ecb_encrypt (const void *in, void *out,
356# size_t length, const AES_KEY *key,
357# int enc);
d64a7232
AP
358&function_begin("aesni_ecb_encrypt");
359 &mov ($inp,&wparam(0));
360 &mov ($out,&wparam(1));
361 &mov ($len,&wparam(2));
362 &mov ($key,&wparam(3));
f8501464 363 &mov ($rounds_,&wparam(4));
d64a7232 364 &and ($len,-16);
f8501464 365 &jz (&label("ecb_ret"));
d64a7232 366 &mov ($rounds,&DWP(240,$key));
f8501464
AP
367 &test ($rounds_,$rounds_);
368 &jz (&label("ecb_decrypt"));
369
d64a7232
AP
370 &mov ($key_,$key); # backup $key
371 &mov ($rounds_,$rounds); # backup $rounds
f8501464
AP
372 &cmp ($len,0x60);
373 &jb (&label("ecb_enc_tail"));
374
375 &movdqu ($inout0,&QWP(0,$inp));
376 &movdqu ($inout1,&QWP(0x10,$inp));
377 &movdqu ($inout2,&QWP(0x20,$inp));
378 &movdqu ($inout3,&QWP(0x30,$inp));
379 &movdqu ($inout4,&QWP(0x40,$inp));
380 &movdqu ($inout5,&QWP(0x50,$inp));
381 &lea ($inp,&DWP(0x60,$inp));
382 &sub ($len,0x60);
383 &jmp (&label("ecb_enc_loop6_enter"));
384
385&set_label("ecb_enc_loop6",16);
386 &movups (&QWP(0,$out),$inout0);
387 &movdqu ($inout0,&QWP(0,$inp));
388 &movups (&QWP(0x10,$out),$inout1);
389 &movdqu ($inout1,&QWP(0x10,$inp));
390 &movups (&QWP(0x20,$out),$inout2);
391 &movdqu ($inout2,&QWP(0x20,$inp));
392 &movups (&QWP(0x30,$out),$inout3);
393 &movdqu ($inout3,&QWP(0x30,$inp));
394 &movups (&QWP(0x40,$out),$inout4);
395 &movdqu ($inout4,&QWP(0x40,$inp));
396 &movups (&QWP(0x50,$out),$inout5);
397 &lea ($out,&DWP(0x60,$out));
398 &movdqu ($inout5,&QWP(0x50,$inp));
399 &lea ($inp,&DWP(0x60,$inp));
400&set_label("ecb_enc_loop6_enter");
d64a7232 401
f8501464 402 &call ("_aesni_encrypt6");
d64a7232 403
d64a7232 404 &mov ($key,$key_); # restore $key
d64a7232 405 &mov ($rounds,$rounds_); # restore $rounds
f8501464
AP
406 &sub ($len,0x60);
407 &jnc (&label("ecb_enc_loop6"));
408
409 &movups (&QWP(0,$out),$inout0);
410 &movups (&QWP(0x10,$out),$inout1);
d7d119a3 411 &movups (&QWP(0x20,$out),$inout2);
f8501464
AP
412 &movups (&QWP(0x30,$out),$inout3);
413 &movups (&QWP(0x40,$out),$inout4);
414 &movups (&QWP(0x50,$out),$inout5);
415 &lea ($out,&DWP(0x60,$out));
416 &add ($len,0x60);
417 &jz (&label("ecb_ret"));
d64a7232 418
# Encrypt the 1 to 5 blocks left over after (or instead of) the 6x loop.
# $len is a non-zero multiple of 16 below 0x60 here.  Input blocks are
# loaded incrementally between the compares; the movups loads do not
# disturb the flags, so each cmp feeds both the jb and the je after it.
&set_label("ecb_enc_tail");
	&movups	($inout0,&QWP(0,$inp));
	&cmp	($len,0x20);
	&jb	(&label("ecb_enc_one"));
	&movups	($inout1,&QWP(0x10,$inp));
	&je	(&label("ecb_enc_two"));
	&movups	($inout2,&QWP(0x20,$inp));
	&cmp	($len,0x40);
	&jb	(&label("ecb_enc_three"));
	&movups	($inout3,&QWP(0x30,$inp));
	&je	(&label("ecb_enc_four"));
	# five blocks: run the 6x routine with a zeroed sixth lane whose
	# result is simply not stored
	&movups	($inout4,&QWP(0x40,$inp));
	&xorps	($inout5,$inout5);
	&call	("_aesni_encrypt6");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&movups	(&QWP(0x40,$out),$inout4);
	&jmp	(&label("ecb_ret"));
439
440&set_label("ecb_enc_one",16);
6f766a41
AP
441 if ($inline)
442 { &aesni_inline_generate1("enc"); }
443 else
444 { &call ("_aesni_encrypt1"); }
d64a7232
AP
445 &movups (&QWP(0,$out),$inout0);
446 &jmp (&label("ecb_ret"));
447
d608b4d6 448&set_label("ecb_enc_two",16);
f8501464 449 &xorps ($inout2,$inout2);
d608b4d6
AP
450 &call ("_aesni_encrypt3");
451 &movups (&QWP(0,$out),$inout0);
452 &movups (&QWP(0x10,$out),$inout1);
453 &jmp (&label("ecb_ret"));
454
455&set_label("ecb_enc_three",16);
456 &call ("_aesni_encrypt3");
457 &movups (&QWP(0,$out),$inout0);
458 &movups (&QWP(0x10,$out),$inout1);
459 &movups (&QWP(0x20,$out),$inout2);
460 &jmp (&label("ecb_ret"));
f8501464
AP
461
462&set_label("ecb_enc_four",16);
463 &call ("_aesni_encrypt4");
464 &movups (&QWP(0,$out),$inout0);
465 &movups (&QWP(0x10,$out),$inout1);
466 &movups (&QWP(0x20,$out),$inout2);
467 &movups (&QWP(0x30,$out),$inout3);
468 &jmp (&label("ecb_ret"));
6c83629b 469######################################################################
d64a7232 470&set_label("ecb_decrypt",16);
f8501464
AP
471 &mov ($key_,$key); # backup $key
472 &mov ($rounds_,$rounds); # backup $rounds
473 &cmp ($len,0x60);
474 &jb (&label("ecb_dec_tail"));
475
476 &movdqu ($inout0,&QWP(0,$inp));
477 &movdqu ($inout1,&QWP(0x10,$inp));
478 &movdqu ($inout2,&QWP(0x20,$inp));
479 &movdqu ($inout3,&QWP(0x30,$inp));
480 &movdqu ($inout4,&QWP(0x40,$inp));
481 &movdqu ($inout5,&QWP(0x50,$inp));
482 &lea ($inp,&DWP(0x60,$inp));
483 &sub ($len,0x60);
484 &jmp (&label("ecb_dec_loop6_enter"));
485
486&set_label("ecb_dec_loop6",16);
d7d119a3 487 &movups (&QWP(0,$out),$inout0);
f8501464 488 &movdqu ($inout0,&QWP(0,$inp));
d7d119a3 489 &movups (&QWP(0x10,$out),$inout1);
f8501464
AP
490 &movdqu ($inout1,&QWP(0x10,$inp));
491 &movups (&QWP(0x20,$out),$inout2);
492 &movdqu ($inout2,&QWP(0x20,$inp));
493 &movups (&QWP(0x30,$out),$inout3);
494 &movdqu ($inout3,&QWP(0x30,$inp));
495 &movups (&QWP(0x40,$out),$inout4);
496 &movdqu ($inout4,&QWP(0x40,$inp));
497 &movups (&QWP(0x50,$out),$inout5);
498 &lea ($out,&DWP(0x60,$out));
499 &movdqu ($inout5,&QWP(0x50,$inp));
500 &lea ($inp,&DWP(0x60,$inp));
501&set_label("ecb_dec_loop6_enter");
502
503 &call ("_aesni_decrypt6");
504
505 &mov ($key,$key_); # restore $key
d64a7232 506 &mov ($rounds,$rounds_); # restore $rounds
f8501464
AP
507 &sub ($len,0x60);
508 &jnc (&label("ecb_dec_loop6"));
509
510 &movups (&QWP(0,$out),$inout0);
511 &movups (&QWP(0x10,$out),$inout1);
d7d119a3 512 &movups (&QWP(0x20,$out),$inout2);
f8501464
AP
513 &movups (&QWP(0x30,$out),$inout3);
514 &movups (&QWP(0x40,$out),$inout4);
515 &movups (&QWP(0x50,$out),$inout5);
516 &lea ($out,&DWP(0x60,$out));
517 &add ($len,0x60);
518 &jz (&label("ecb_ret"));
d64a7232 519
6c83629b 520&set_label("ecb_dec_tail");
6c83629b 521 &movups ($inout0,&QWP(0,$inp));
d7d119a3 522 &cmp ($len,0x20);
6c83629b 523 &jb (&label("ecb_dec_one"));
d64a7232 524 &movups ($inout1,&QWP(0x10,$inp));
d608b4d6 525 &je (&label("ecb_dec_two"));
d608b4d6 526 &movups ($inout2,&QWP(0x20,$inp));
f8501464
AP
527 &cmp ($len,0x40);
528 &jb (&label("ecb_dec_three"));
d608b4d6 529 &movups ($inout3,&QWP(0x30,$inp));
f8501464
AP
530 &je (&label("ecb_dec_four"));
531 &movups ($inout4,&QWP(0x40,$inp));
532 &xorps ($inout5,$inout5);
533 &call ("_aesni_decrypt6");
d64a7232
AP
534 &movups (&QWP(0,$out),$inout0);
535 &movups (&QWP(0x10,$out),$inout1);
d608b4d6
AP
536 &movups (&QWP(0x20,$out),$inout2);
537 &movups (&QWP(0x30,$out),$inout3);
f8501464 538 &movups (&QWP(0x40,$out),$inout4);
d608b4d6 539 &jmp (&label("ecb_ret"));
d64a7232
AP
540
541&set_label("ecb_dec_one",16);
6f766a41
AP
542 if ($inline)
543 { &aesni_inline_generate1("dec"); }
544 else
545 { &call ("_aesni_decrypt1"); }
d64a7232 546 &movups (&QWP(0,$out),$inout0);
d608b4d6
AP
547 &jmp (&label("ecb_ret"));
548
549&set_label("ecb_dec_two",16);
f8501464 550 &xorps ($inout2,$inout2);
d608b4d6
AP
551 &call ("_aesni_decrypt3");
552 &movups (&QWP(0,$out),$inout0);
553 &movups (&QWP(0x10,$out),$inout1);
554 &jmp (&label("ecb_ret"));
555
556&set_label("ecb_dec_three",16);
557 &call ("_aesni_decrypt3");
558 &movups (&QWP(0,$out),$inout0);
559 &movups (&QWP(0x10,$out),$inout1);
560 &movups (&QWP(0x20,$out),$inout2);
f8501464
AP
561 &jmp (&label("ecb_ret"));
562
563&set_label("ecb_dec_four",16);
564 &call ("_aesni_decrypt4");
565 &movups (&QWP(0,$out),$inout0);
566 &movups (&QWP(0x10,$out),$inout1);
567 &movups (&QWP(0x20,$out),$inout2);
568 &movups (&QWP(0x30,$out),$inout3);
d64a7232
AP
569
570&set_label("ecb_ret");
571&function_end("aesni_ecb_encrypt");
6c83629b
AP
572\f
573######################################################################
d7d119a3
AP
574# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
575# size_t blocks, const AES_KEY *key,
576# const char *ivec,char *cmac);
577#
578# Handles only complete blocks, operates on 64-bit counter and
579# does not update *ivec! Nor does it finalize CMAC value
580# (see engine/eng_aesni.c for details)
6c83629b 581#
f8501464 582{ my $cmac=$inout1;
d7d119a3
AP
583&function_begin("aesni_ccm64_encrypt_blocks");
584 &mov ($inp,&wparam(0));
585 &mov ($out,&wparam(1));
586 &mov ($len,&wparam(2));
587 &mov ($key,&wparam(3));
588 &mov ($rounds_,&wparam(4));
589 &mov ($rounds,&wparam(5));
590 &mov ($key_,"esp");
591 &sub ("esp",60);
592 &and ("esp",-16); # align stack
593 &mov (&DWP(48,"esp"),$key_);
594
595 &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
f8501464 596 &movdqu ($cmac,&QWP(0,$rounds)); # load cmac
267b481c 597 &mov ($rounds,&DWP(240,$key));
d7d119a3
AP
598
599 # compose byte-swap control mask for pshufb on stack
600 &mov (&DWP(0,"esp"),0x0c0d0e0f);
601 &mov (&DWP(4,"esp"),0x08090a0b);
602 &mov (&DWP(8,"esp"),0x04050607);
603 &mov (&DWP(12,"esp"),0x00010203);
604
605 # compose counter increment vector on stack
267b481c 606 &mov ($rounds_,1);
d7d119a3 607 &xor ($key_,$key_);
267b481c 608 &mov (&DWP(16,"esp"),$rounds_);
d7d119a3
AP
609 &mov (&DWP(20,"esp"),$key_);
610 &mov (&DWP(24,"esp"),$key_);
611 &mov (&DWP(28,"esp"),$key_);
612
267b481c
AP
613 &shr ($rounds,1);
614 &lea ($key_,&DWP(0,$key));
d7d119a3 615 &movdqa ($inout0,$ivec);
267b481c
AP
616 &mov ($rounds_,$rounds);
617 &movdqa ($inout3,&QWP(0,"esp"));
d7d119a3
AP
618
619&set_label("ccm64_enc_outer");
267b481c 620 &$movekey ($rndkey0,&QWP(0,$key_));
f8501464 621 &mov ($rounds,$rounds_);
267b481c 622 &movups ($in0,&QWP(0,$inp));
d7d119a3 623
f8501464 624 &xorps ($inout0,$rndkey0);
267b481c
AP
625 &$movekey ($rndkey1,&QWP(16,$key_));
626 &xorps ($rndkey0,$in0);
627 &lea ($key,&DWP(32,$key_));
628 &xorps ($cmac,$rndkey0); # cmac^=inp
f8501464
AP
629 &$movekey ($rndkey0,&QWP(0,$key));
630
631&set_label("ccm64_enc2_loop");
632 &aesenc ($inout0,$rndkey1);
633 &dec ($rounds);
634 &aesenc ($cmac,$rndkey1);
635 &$movekey ($rndkey1,&QWP(16,$key));
636 &aesenc ($inout0,$rndkey0);
637 &lea ($key,&DWP(32,$key));
638 &aesenc ($cmac,$rndkey0);
639 &$movekey ($rndkey0,&QWP(0,$key));
640 &jnz (&label("ccm64_enc2_loop"));
267b481c 641 &pshufb ($ivec,$inout3);
f8501464
AP
642 &aesenc ($inout0,$rndkey1);
643 &aesenc ($cmac,$rndkey1);
267b481c 644 &paddq ($ivec,&QWP(16,"esp"));
f8501464
AP
645 &aesenclast ($inout0,$rndkey0);
646 &aesenclast ($cmac,$rndkey0);
d7d119a3 647
d7d119a3
AP
648 &dec ($len);
649 &lea ($inp,&DWP(16,$inp));
f8501464 650 &xorps ($in0,$inout0); # inp^=E(ivec)
d7d119a3 651 &movdqa ($inout0,$ivec);
267b481c 652 &movups (&QWP(0,$out),$in0); # save output
d7d119a3 653 &lea ($out,&DWP(16,$out));
267b481c 654 &pshufb ($ivec,$inout3);
d7d119a3
AP
655 &jnz (&label("ccm64_enc_outer"));
656
657 &mov ("esp",&DWP(48,"esp"));
658 &mov ($out,&wparam(5));
f8501464 659 &movups (&QWP(0,$out),$cmac);
d7d119a3
AP
660&function_end("aesni_ccm64_encrypt_blocks");
661
662&function_begin("aesni_ccm64_decrypt_blocks");
663 &mov ($inp,&wparam(0));
664 &mov ($out,&wparam(1));
665 &mov ($len,&wparam(2));
666 &mov ($key,&wparam(3));
667 &mov ($rounds_,&wparam(4));
668 &mov ($rounds,&wparam(5));
669 &mov ($key_,"esp");
670 &sub ("esp",60);
671 &and ("esp",-16); # align stack
672 &mov (&DWP(48,"esp"),$key_);
673
674 &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
f8501464 675 &movdqu ($cmac,&QWP(0,$rounds)); # load cmac
267b481c 676 &mov ($rounds,&DWP(240,$key));
d7d119a3
AP
677
678 # compose byte-swap control mask for pshufb on stack
679 &mov (&DWP(0,"esp"),0x0c0d0e0f);
680 &mov (&DWP(4,"esp"),0x08090a0b);
681 &mov (&DWP(8,"esp"),0x04050607);
682 &mov (&DWP(12,"esp"),0x00010203);
683
684 # compose counter increment vector on stack
267b481c 685 &mov ($rounds_,1);
d7d119a3 686 &xor ($key_,$key_);
267b481c 687 &mov (&DWP(16,"esp"),$rounds_);
d7d119a3
AP
688 &mov (&DWP(20,"esp"),$key_);
689 &mov (&DWP(24,"esp"),$key_);
690 &mov (&DWP(28,"esp"),$key_);
691
692 &movdqa ($inout3,&QWP(0,"esp")); # bswap mask
693 &movdqa ($inout0,$ivec);
d7d119a3 694
d7d119a3
AP
695 &mov ($key_,$key);
696 &mov ($rounds_,$rounds);
697
267b481c 698 &pshufb ($ivec,$inout3);
d7d119a3
AP
699 if ($inline)
700 { &aesni_inline_generate1("enc"); }
701 else
702 { &call ("_aesni_encrypt1"); }
f8501464 703 &movups ($in0,&QWP(0,$inp)); # load inp
267b481c
AP
704 &paddq ($ivec,&QWP(16,"esp"));
705 &pshufb ($ivec,$inout3);
f8501464 706 &lea ($inp,&QWP(16,$inp));
267b481c
AP
707 &jmp (&label("ccm64_dec_outer"));
708
709&set_label("ccm64_dec_outer",16);
710 &xorps ($in0,$inout0); # inp ^= E(ivec)
711 &movdqa ($inout0,$ivec);
d7d119a3 712 &mov ($rounds,$rounds_);
267b481c 713 &movups (&QWP(0,$out),$in0); # save output
d7d119a3
AP
714 &lea ($out,&DWP(16,$out));
715
f8501464 716 &sub ($len,1);
d7d119a3
AP
717 &jz (&label("ccm64_dec_break"));
718
267b481c 719 &$movekey ($rndkey0,&QWP(0,$key_));
f8501464 720 &shr ($rounds,1);
267b481c 721 &$movekey ($rndkey1,&QWP(16,$key_));
f8501464 722 &xorps ($in0,$rndkey0);
267b481c 723 &lea ($key,&DWP(32,$key_));
f8501464
AP
724 &xorps ($inout0,$rndkey0);
725 &xorps ($cmac,$in0); # cmac^=out
726 &$movekey ($rndkey0,&QWP(0,$key));
d7d119a3 727
f8501464
AP
728&set_label("ccm64_dec2_loop");
729 &aesenc ($inout0,$rndkey1);
730 &dec ($rounds);
731 &aesenc ($cmac,$rndkey1);
732 &$movekey ($rndkey1,&QWP(16,$key));
733 &aesenc ($inout0,$rndkey0);
734 &lea ($key,&DWP(32,$key));
735 &aesenc ($cmac,$rndkey0);
736 &$movekey ($rndkey0,&QWP(0,$key));
737 &jnz (&label("ccm64_dec2_loop"));
267b481c
AP
738 &movups ($in0,&QWP(0,$inp)); # load inp
739 &paddq ($ivec,&QWP(16,"esp"));
f8501464
AP
740 &aesenc ($inout0,$rndkey1);
741 &aesenc ($cmac,$rndkey1);
267b481c
AP
742 &pshufb ($ivec,$inout3);
743 &lea ($inp,&QWP(16,$inp));
f8501464
AP
744 &aesenclast ($inout0,$rndkey0);
745 &aesenclast ($cmac,$rndkey0);
d7d119a3
AP
746 &jmp (&label("ccm64_dec_outer"));
747
# Reached once the last block has been written out: finish by encrypting
# $cmac one more time with the original key schedule.
748&set_label("ccm64_dec_break",16);
267b481c 749 &mov ($key,$key_);
# Inline path: $in0 is passed as the generator's "ivec" operand, so the
# emitted code xors $in0 with round key 0 and folds the result into $cmac
# before running the rounds.
d7d119a3 750 if ($inline)
f8501464 751 { &aesni_inline_generate1("enc",$cmac,$in0); }
# NOTE(review): this fallback does not look equivalent to the inline path.
# _aesni_encrypt1 is instantiated by aesni_generate1("enc") with its
# default operand $inout0 and has no ivec handling, and what &call does
# with the extra $cmac argument is not visible here -- confirm before
# ever building with $inline=0.
d7d119a3 752 else
f8501464 753 { &call ("_aesni_encrypt1",$cmac); }
d7d119a3
AP
754
755 &mov ("esp",&DWP(48,"esp"));
756 &mov ($out,&wparam(5));
f8501464 757 &movups (&QWP(0,$out),$cmac);
d7d119a3 758&function_end("aesni_ccm64_decrypt_blocks");
f8501464 759}
d7d119a3
AP
760\f
761######################################################################
6c83629b
AP
762# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
763# size_t blocks, const AES_KEY *key,
764# const char *ivec);
d7d119a3
AP
765#
766# Handles only complete blocks, operates on 32-bit counter and
767# does not update *ivec! (see engine/eng_aesni.c for details)
768#
f8501464
AP
769# stack layout:
770# 0 pshufb mask
771# 16 vector addend: 0,6,6,6
772# 32 counter-less ivec
773# 48 1st triplet of counter vector
774# 64 2nd triplet of counter vector
775# 80 saved %esp
776
6c83629b
AP
777&function_begin("aesni_ctr32_encrypt_blocks");
778 &mov ($inp,&wparam(0));
779 &mov ($out,&wparam(1));
780 &mov ($len,&wparam(2));
781 &mov ($key,&wparam(3));
782 &mov ($rounds_,&wparam(4));
783 &mov ($key_,"esp");
f8501464 784 &sub ("esp",88);
6c83629b 785 &and ("esp",-16); # align stack
f8501464 786 &mov (&DWP(80,"esp"),$key_);
6c83629b 787
d7d119a3
AP
788 &cmp ($len,1);
789 &je (&label("ctr32_one_shortcut"));
790
f8501464 791 &movdqu ($inout5,&QWP(0,$rounds_)); # load ivec
6c83629b
AP
792
793 # compose byte-swap control mask for pshufb on stack
794 &mov (&DWP(0,"esp"),0x0c0d0e0f);
795 &mov (&DWP(4,"esp"),0x08090a0b);
796 &mov (&DWP(8,"esp"),0x04050607);
797 &mov (&DWP(12,"esp"),0x00010203);
798
799 # compose counter increment vector on stack
f8501464 800 &mov ($rounds,6);
6c83629b
AP
801 &xor ($key_,$key_);
802 &mov (&DWP(16,"esp"),$rounds);
803 &mov (&DWP(20,"esp"),$rounds);
804 &mov (&DWP(24,"esp"),$rounds);
805 &mov (&DWP(28,"esp"),$key_);
806
f8501464
AP
807 &pextrd ($rounds_,$inout5,3); # pull 32-bit counter
808 &pinsrd ($inout5,$key_,3); # wipe 32-bit counter
6c83629b
AP
809
810 &mov ($rounds,&DWP(240,$key)); # key->rounds
6c83629b 811
f8501464 812 # compose 2 vectors of 3x32-bit counters
6c83629b 813 &bswap ($rounds_);
f8501464
AP
814 &pxor ($rndkey1,$rndkey1);
815 &pxor ($rndkey0,$rndkey0);
816 &movdqa ($inout0,&QWP(0,"esp")); # load byte-swap mask
817 &pinsrd ($rndkey1,$rounds_,0);
818 &lea ($key_,&DWP(3,$rounds_));
819 &pinsrd ($rndkey0,$key_,0);
6c83629b 820 &inc ($rounds_);
f8501464
AP
821 &pinsrd ($rndkey1,$rounds_,1);
822 &inc ($key_);
823 &pinsrd ($rndkey0,$key_,1);
6c83629b 824 &inc ($rounds_);
f8501464
AP
825 &pinsrd ($rndkey1,$rounds_,2);
826 &inc ($key_);
827 &pinsrd ($rndkey0,$key_,2);
828 &movdqa (&QWP(48,"esp"),$rndkey1); # save 1st triplet
829 &pshufb ($rndkey1,$inout0); # byte swap
830 &movdqa (&QWP(64,"esp"),$rndkey0); # save 2nd triplet
831 &pshufb ($rndkey0,$inout0); # byte swap
832
833 &pshufd ($inout0,$rndkey1,3<<6); # place counter to upper dword
834 &pshufd ($inout1,$rndkey1,2<<6);
835 &cmp ($len,6);
836 &jb (&label("ctr32_tail"));
837 &movdqa (&QWP(32,"esp"),$inout5); # save counter-less ivec
838 &shr ($rounds,1);
839 &mov ($key_,$key); # backup $key
840 &mov ($rounds_,$rounds); # backup $rounds
841 &sub ($len,6);
842 &jmp (&label("ctr32_loop6"));
843
844&set_label("ctr32_loop6",16);
845 &pshufd ($inout2,$rndkey1,1<<6);
846 &movdqa ($rndkey1,&QWP(32,"esp")); # pull counter-less ivec
847 &pshufd ($inout3,$rndkey0,3<<6);
848 &por ($inout0,$rndkey1); # merge counter-less ivec
849 &pshufd ($inout4,$rndkey0,2<<6);
850 &por ($inout1,$rndkey1);
851 &pshufd ($inout5,$rndkey0,1<<6);
852 &por ($inout2,$rndkey1);
853 &por ($inout3,$rndkey1);
854 &por ($inout4,$rndkey1);
855 &por ($inout5,$rndkey1);
856
857 # inlining _aesni_encrypt6's prologue gives ~4% improvement...
858 &$movekey ($rndkey0,&QWP(0,$key_));
859 &$movekey ($rndkey1,&QWP(16,$key_));
860 &lea ($key,&DWP(32,$key_));
861 &dec ($rounds);
d7d119a3
AP
862 &pxor ($inout0,$rndkey0);
863 &pxor ($inout1,$rndkey0);
d7d119a3 864 &aesenc ($inout0,$rndkey1);
f8501464 865 &pxor ($inout2,$rndkey0);
d7d119a3 866 &aesenc ($inout1,$rndkey1);
f8501464 867 &pxor ($inout3,$rndkey0);
d7d119a3 868 &aesenc ($inout2,$rndkey1);
f8501464
AP
869 &pxor ($inout4,$rndkey0);
870 &aesenc ($inout3,$rndkey1);
871 &pxor ($inout5,$rndkey0);
872 &aesenc ($inout4,$rndkey1);
d7d119a3 873 &$movekey ($rndkey0,&QWP(0,$key));
f8501464 874 &aesenc ($inout5,$rndkey1);
d7d119a3 875
f8501464
AP
876 &call (&label("_aesni_encrypt6_enter"));
877
878 &movups ($rndkey1,&QWP(0,$inp));
879 &movups ($rndkey0,&QWP(0x10,$inp));
880 &xorps ($inout0,$rndkey1);
881 &movups ($rndkey1,&QWP(0x20,$inp));
882 &xorps ($inout1,$rndkey0);
883 &movups (&QWP(0,$out),$inout0);
884 &movdqa ($rndkey0,&QWP(16,"esp")); # load increment
885 &xorps ($inout2,$rndkey1);
886 &movdqa ($rndkey1,&QWP(48,"esp")); # load 1st triplet
887 &movups (&QWP(0x10,$out),$inout1);
888 &movups (&QWP(0x20,$out),$inout2);
889
890 &paddd ($rndkey1,$rndkey0); # 1st triplet increment
891 &paddd ($rndkey0,&QWP(64,"esp")); # 2nd triplet increment
892 &movdqa ($inout0,&QWP(0,"esp")); # load byte swap mask
893
894 &movups ($inout1,&QWP(0x30,$inp));
895 &movups ($inout2,&QWP(0x40,$inp));
896 &xorps ($inout3,$inout1);
897 &movups ($inout1,&QWP(0x50,$inp));
898 &lea ($inp,&DWP(0x60,$inp));
899 &movdqa (&QWP(48,"esp"),$rndkey1); # save 1st triplet
900 &pshufb ($rndkey1,$inout0); # byte swap
901 &xorps ($inout4,$inout2);
902 &movups (&QWP(0x30,$out),$inout3);
903 &xorps ($inout5,$inout1);
904 &movdqa (&QWP(64,"esp"),$rndkey0); # save 2nd triplet
905 &pshufb ($rndkey0,$inout0); # byte swap
906 &movups (&QWP(0x40,$out),$inout4);
907 &pshufd ($inout0,$rndkey1,3<<6);
908 &movups (&QWP(0x50,$out),$inout5);
909 &lea ($out,&DWP(0x60,$out));
d7d119a3 910
6c83629b 911 &mov ($rounds,$rounds_);
f8501464
AP
912 &pshufd ($inout1,$rndkey1,2<<6);
913 &sub ($len,6);
914 &jnc (&label("ctr32_loop6"));
6c83629b 915
f8501464
AP
916 &add ($len,6);
917 &jz (&label("ctr32_ret"));
918 &mov ($key,$key_);
919 &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds
920 &movdqa ($inout5,&QWP(32,"esp")); # pull count-less ivec
6c83629b
AP
921
922&set_label("ctr32_tail");
f8501464 923 &por ($inout0,$inout5);
d7d119a3 924 &cmp ($len,2);
6c83629b 925 &jb (&label("ctr32_one"));
6c83629b 926
f8501464
AP
927 &pshufd ($inout2,$rndkey1,1<<6);
928 &por ($inout1,$inout5);
929 &je (&label("ctr32_two"));
6c83629b 930
f8501464
AP
931 &pshufd ($inout3,$rndkey0,3<<6);
932 &por ($inout2,$inout5);
933 &cmp ($len,4);
934 &jb (&label("ctr32_three"));
935
936 &pshufd ($inout4,$rndkey0,2<<6);
937 &por ($inout3,$inout5);
938 &je (&label("ctr32_four"));
939
940 &por ($inout4,$inout5);
941 &call ("_aesni_encrypt6");
942 &movups ($rndkey1,&QWP(0,$inp));
943 &movups ($rndkey0,&QWP(0x10,$inp));
944 &xorps ($inout0,$rndkey1);
945 &movups ($rndkey1,&QWP(0x20,$inp));
946 &xorps ($inout1,$rndkey0);
947 &movups ($rndkey0,&QWP(0x30,$inp));
948 &xorps ($inout2,$rndkey1);
949 &movups ($rndkey1,&QWP(0x40,$inp));
950 &xorps ($inout3,$rndkey0);
951 &movups (&QWP(0,$out),$inout0);
952 &xorps ($inout4,$rndkey1);
953 &movups (&QWP(0x10,$out),$inout1);
954 &movups (&QWP(0x20,$out),$inout2);
955 &movups (&QWP(0x30,$out),$inout3);
956 &movups (&QWP(0x40,$out),$inout4);
6c83629b
AP
957 &jmp (&label("ctr32_ret"));
958
d7d119a3 959&set_label("ctr32_one_shortcut",16);
f8501464 960 &movups ($inout0,&QWP(0,$rounds_)); # load ivec
d7d119a3
AP
961 &mov ($rounds,&DWP(240,$key));
962
963&set_label("ctr32_one");
6c83629b
AP
964 if ($inline)
965 { &aesni_inline_generate1("enc"); }
966 else
967 { &call ("_aesni_encrypt1"); }
f8501464
AP
968 &movups ($in0,&QWP(0,$inp));
969 &xorps ($in0,$inout0);
970 &movups (&QWP(0,$out),$in0);
6c83629b 971 &jmp (&label("ctr32_ret"));
d64a7232 972
6c83629b
AP
973&set_label("ctr32_two",16);
974 &call ("_aesni_encrypt3");
f8501464
AP
975 &movups ($inout3,&QWP(0,$inp));
976 &movups ($inout4,&QWP(0x10,$inp));
977 &xorps ($inout0,$inout3);
978 &xorps ($inout1,$inout4);
979 &movups (&QWP(0,$out),$inout0);
980 &movups (&QWP(0x10,$out),$inout1);
6c83629b
AP
981 &jmp (&label("ctr32_ret"));
982
983&set_label("ctr32_three",16);
984 &call ("_aesni_encrypt3");
f8501464
AP
985 &movups ($inout3,&QWP(0,$inp));
986 &movups ($inout4,&QWP(0x10,$inp));
987 &xorps ($inout0,$inout3);
988 &movups ($inout5,&QWP(0x20,$inp));
989 &xorps ($inout1,$inout4);
990 &movups (&QWP(0,$out),$inout0);
991 &xorps ($inout2,$inout5);
992 &movups (&QWP(0x10,$out),$inout1);
993 &movups (&QWP(0x20,$out),$inout2);
994 &jmp (&label("ctr32_ret"));
995
996&set_label("ctr32_four",16);
997 &call ("_aesni_encrypt4");
998 &movups ($inout4,&QWP(0,$inp));
999 &movups ($inout5,&QWP(0x10,$inp));
1000 &movups ($rndkey1,&QWP(0x20,$inp));
1001 &xorps ($inout0,$inout4);
1002 &movups ($rndkey0,&QWP(0x30,$inp));
1003 &xorps ($inout1,$inout5);
1004 &movups (&QWP(0,$out),$inout0);
1005 &xorps ($inout2,$rndkey1);
1006 &movups (&QWP(0x10,$out),$inout1);
1007 &xorps ($inout3,$rndkey0);
1008 &movups (&QWP(0x20,$out),$inout2);
1009 &movups (&QWP(0x30,$out),$inout3);
6c83629b
AP
1010
1011&set_label("ctr32_ret");
f8501464 1012 &mov ("esp",&DWP(80,"esp"));
6c83629b 1013&function_end("aesni_ctr32_encrypt_blocks");
f8501464
AP
1014\f
1015######################################################################
1016# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1017# const AES_KEY *key1, const AES_KEY *key2,
1018# const unsigned char iv[16]);
1019#
1020{ my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1);
1021
# aesni_xts_encrypt -- emits the XTS encryption routine.
# Stack arguments, as read through wparam() below: 0=inp, 1=out, 2=len,
# 3=key1 (used for the data blocks), 4=key2 (used only to encrypt the
# tweak), 5=pointer to the 16-byte clear-text tweak.
# Outline of the emitted code:
#  - the clear-text tweak is encrypted with key2 to form the initial tweak;
#  - a 16-byte aligned frame is carved from the stack: slots 16*0..16*5
#    hold per-block tweaks, 16*6 holds the constant used by the tweak
#    update, 16*7+0 holds $len and 16*7+4 holds the caller's %esp;
#  - whole blocks are processed six at a time (xts_enc_loop6), then the
#    remaining one to five blocks are dispatched from xts_enc_short;
#  - when len%16 is non-zero, the trailing bytes go through the
#    xts_enc_steal byte-swap loop followed by one more block encryption.
# $tweak/$twtmp/$twres/$twmask are aliases of $rndkey1/$rndkey0/$inout0/
# $inout1 (see the my() list opening the enclosing scope), so their
# contents are only valid until those registers are reused for keys or data.
1022&function_begin("aesni_xts_encrypt");
1023 &mov ($key,&wparam(4)); # key2
1024 &mov ($inp,&wparam(5)); # clear-text tweak
1025
1026 &mov ($rounds,&DWP(240,$key)); # key2->rounds
1027 &movups ($inout0,&QWP(0,$inp));
 # encrypt the clear-text tweak with key2; the result is copied from
 # $inout0 into $tweak further down
1028 if ($inline)
1029 { &aesni_inline_generate1("enc"); }
1030 else
1031 { &call ("_aesni_encrypt1"); }
1032
1033 &mov ($inp,&wparam(0));
1034 &mov ($out,&wparam(1));
1035 &mov ($len,&wparam(2));
1036 &mov ($key,&wparam(3)); # key1
1037
 # $key_ temporarily carries the caller's %esp until it is stored in the
 # frame at 16*7+4 below
1038 &mov ($key_,"esp");
1039 &sub ("esp",16*7+8);
1040 &mov ($rounds,&DWP(240,$key)); # key1->rounds
1041 &and ("esp",-16); # align stack
1042
1043 &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
1044 &mov (&DWP(16*6+4,"esp"),0);
1045 &mov (&DWP(16*6+8,"esp"),1);
1046 &mov (&DWP(16*6+12,"esp"),0);
1047 &mov (&DWP(16*7+0,"esp"),$len); # save original $len
1048 &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp
1049
1050 &movdqa ($tweak,$inout0);
1051 &pxor ($twtmp,$twtmp);
1052 &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
1053 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1054
1055 &and ($len,-16);
1056 &mov ($key_,$key); # backup $key
1057 &mov ($rounds_,$rounds); # backup $rounds
1058 &sub ($len,16*6);
1059 &jc (&label("xts_enc_short"));
1060
 # the six-block loop works with a halved round count; the full value is
 # rebuilt as 2*$rounds+1 after the loop
 # NOTE(review): presumably what _aesni_encrypt6_enter expects -- that
 # routine is defined outside this block, confirm there
1061 &shr ($rounds,1);
1062 &mov ($rounds_,$rounds);
1063 &jmp (&label("xts_enc_loop6"));
1064
1065&set_label("xts_enc_loop6",16);
 # generator-time loop: emits four copies of "store current tweak in slot
 # $i, then advance the tweak"
 # NOTE(review): the paddq/pand/pxor sequence with the 0x87 constant looks
 # like the XTS multiply-by-x step in GF(2^128) -- confirm against the spec
1066 for ($i=0;$i<4;$i++) {
1067 &pshufd ($twres,$twtmp,0x13);
1068 &pxor ($twtmp,$twtmp);
1069 &movdqa (&QWP(16*$i,"esp"),$tweak);
1070 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1071 &pand ($twres,$twmask); # isolate carry and residue
1072 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1073 &pxor ($tweak,$twres);
1074 }
 # $i is 4 here: the fifth tweak goes to slot 4 and the post-increment
 # makes the "save last tweak" store below land in slot 5
1075 &pshufd ($inout5,$twtmp,0x13);
1076 &movdqa (&QWP(16*$i++,"esp"),$tweak);
1077 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1078 &$movekey ($rndkey0,&QWP(0,$key_));
1079 &pand ($inout5,$twmask); # isolate carry and residue
1080 &movups ($inout0,&QWP(0,$inp)); # load input
1081 &pxor ($inout5,$tweak);
1082
1083 # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
1084 &movdqu ($inout1,&QWP(16*1,$inp));
1085 &xorps ($inout0,$rndkey0); # input^=rndkey[0]
1086 &movdqu ($inout2,&QWP(16*2,$inp));
1087 &pxor ($inout1,$rndkey0);
1088 &movdqu ($inout3,&QWP(16*3,$inp));
1089 &pxor ($inout2,$rndkey0);
1090 &movdqu ($inout4,&QWP(16*4,$inp));
1091 &pxor ($inout3,$rndkey0);
1092 &movdqu ($rndkey1,&QWP(16*5,$inp));
1093 &pxor ($inout4,$rndkey0);
1094 &lea ($inp,&DWP(16*6,$inp));
1095 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1096 &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
1097 &pxor ($inout5,$rndkey1);
1098
1099 &$movekey ($rndkey1,&QWP(16,$key_));
1100 &lea ($key,&DWP(32,$key_));
1101 &pxor ($inout1,&QWP(16*1,"esp"));
1102 &aesenc ($inout0,$rndkey1);
1103 &pxor ($inout2,&QWP(16*2,"esp"));
1104 &aesenc ($inout1,$rndkey1);
1105 &pxor ($inout3,&QWP(16*3,"esp"));
1106 &dec ($rounds);
1107 &aesenc ($inout2,$rndkey1);
1108 &pxor ($inout4,&QWP(16*4,"esp"));
1109 &aesenc ($inout3,$rndkey1);
1110 &pxor ($inout5,$rndkey0);
1111 &aesenc ($inout4,$rndkey1);
1112 &$movekey ($rndkey0,&QWP(0,$key));
1113 &aesenc ($inout5,$rndkey1);
1114 &call (&label("_aesni_encrypt6_enter"));
1115
1116 &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
1117 &pxor ($twtmp,$twtmp);
1118 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1119 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1120 &xorps ($inout1,&QWP(16*1,"esp"));
1121 &movups (&QWP(16*0,$out),$inout0); # write output
1122 &xorps ($inout2,&QWP(16*2,"esp"));
1123 &movups (&QWP(16*1,$out),$inout1);
1124 &xorps ($inout3,&QWP(16*3,"esp"));
1125 &movups (&QWP(16*2,$out),$inout2);
1126 &xorps ($inout4,&QWP(16*4,"esp"));
1127 &movups (&QWP(16*3,$out),$inout3);
1128 &xorps ($inout5,$tweak);
1129 &movups (&QWP(16*4,$out),$inout4);
1130 &pshufd ($twres,$twtmp,0x13);
1131 &movups (&QWP(16*5,$out),$inout5);
1132 &lea ($out,&DWP(16*6,$out));
1133 &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
1134
 # advance past the last tweak used, so $tweak is ready for the next
 # iteration or for the tail code
1135 &pxor ($twtmp,$twtmp);
1136 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1137 &pand ($twres,$twmask); # isolate carry and residue
1138 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1139 &mov ($rounds,$rounds_); # restore $rounds
1140 &pxor ($tweak,$twres);
1141
1142 &sub ($len,16*6);
1143 &jnc (&label("xts_enc_loop6"));
1144
1145 &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds
1146 &mov ($key,$key_); # restore $key
1147 &mov ($rounds_,$rounds);
1148
1149&set_label("xts_enc_short");
 # $len becomes the number of remaining whole-block bytes (0..0x50);
 # dispatch on it: 0 -> done6x, 0x10 -> one, 0x20 -> two, 0x30 -> three,
 # 0x40 -> four, otherwise fall through to the five-block code
1150 &add ($len,16*6);
1151 &jz (&label("xts_enc_done6x"));
1152
1153 &movdqa ($inout3,$tweak); # put aside previous tweak
1154 &cmp ($len,0x20);
1155 &jb (&label("xts_enc_one"));
1156
 # the SSE instructions below leave the flags from the preceding cmp
 # intact, which is what the je further down relies on
1157 &pshufd ($twres,$twtmp,0x13);
1158 &pxor ($twtmp,$twtmp);
1159 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1160 &pand ($twres,$twmask); # isolate carry and residue
1161 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1162 &pxor ($tweak,$twres);
1163 &je (&label("xts_enc_two"));
1164
1165 &pshufd ($twres,$twtmp,0x13);
1166 &pxor ($twtmp,$twtmp);
1167 &movdqa ($inout4,$tweak); # put aside previous tweak
1168 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1169 &pand ($twres,$twmask); # isolate carry and residue
1170 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1171 &pxor ($tweak,$twres);
1172 &cmp ($len,0x40);
1173 &jb (&label("xts_enc_three"));
1174
1175 &pshufd ($twres,$twtmp,0x13);
1176 &pxor ($twtmp,$twtmp);
1177 &movdqa ($inout5,$tweak); # put aside previous tweak
1178 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1179 &pand ($twres,$twmask); # isolate carry and residue
1180 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1181 &pxor ($tweak,$twres);
1182 &movdqa (&QWP(16*0,"esp"),$inout3);
1183 &movdqa (&QWP(16*1,"esp"),$inout4);
1184 &je (&label("xts_enc_four"));
1185
 # five blocks remain: tweaks 0..3 go to stack slots 0..3, the fifth is
 # built in $inout5 and saved in slot 4
1186 &movdqa (&QWP(16*2,"esp"),$inout5);
1187 &pshufd ($inout5,$twtmp,0x13);
1188 &movdqa (&QWP(16*3,"esp"),$tweak);
1189 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1190 &pand ($inout5,$twmask); # isolate carry and residue
1191 &pxor ($inout5,$tweak);
1192
1193 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
1194 &movdqu ($inout1,&QWP(16*1,$inp));
1195 &movdqu ($inout2,&QWP(16*2,$inp));
1196 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1197 &movdqu ($inout3,&QWP(16*3,$inp));
1198 &pxor ($inout1,&QWP(16*1,"esp"));
1199 &movdqu ($inout4,&QWP(16*4,$inp));
1200 &pxor ($inout2,&QWP(16*2,"esp"));
1201 &lea ($inp,&DWP(16*5,$inp));
1202 &pxor ($inout3,&QWP(16*3,"esp"));
1203 &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
1204 &pxor ($inout4,$inout5);
1205
 # only $inout0..$inout4 are stored afterwards; the sixth lane's result
 # is never written out
1206 &call ("_aesni_encrypt6");
1207
1208 &movaps ($tweak,&QWP(16*4,"esp")); # last tweak
1209 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1210 &xorps ($inout1,&QWP(16*1,"esp"));
1211 &xorps ($inout2,&QWP(16*2,"esp"));
1212 &movups (&QWP(16*0,$out),$inout0); # write output
1213 &xorps ($inout3,&QWP(16*3,"esp"));
1214 &movups (&QWP(16*1,$out),$inout1);
1215 &xorps ($inout4,$tweak);
1216 &movups (&QWP(16*2,$out),$inout2);
1217 &movups (&QWP(16*3,$out),$inout3);
1218 &movups (&QWP(16*4,$out),$inout4);
1219 &lea ($out,&DWP(16*5,$out));
1220 &jmp (&label("xts_enc_done"));
1221
1222&set_label("xts_enc_one",16);
1223 &movups ($inout0,&QWP(16*0,$inp)); # load input
1224 &lea ($inp,&DWP(16*1,$inp));
1225 &xorps ($inout0,$inout3); # input^=tweak
1226 if ($inline)
1227 { &aesni_inline_generate1("enc"); }
1228 else
1229 { &call ("_aesni_encrypt1"); }
1230 &xorps ($inout0,$inout3); # output^=tweak
1231 &movups (&QWP(16*0,$out),$inout0); # write output
1232 &lea ($out,&DWP(16*1,$out));
1233
1234 &movdqa ($tweak,$inout3); # last tweak
1235 &jmp (&label("xts_enc_done"));
1236
1237&set_label("xts_enc_two",16);
1238 &movaps ($inout4,$tweak); # put aside last tweak
1239
1240 &movups ($inout0,&QWP(16*0,$inp)); # load input
1241 &movups ($inout1,&QWP(16*1,$inp));
1242 &lea ($inp,&DWP(16*2,$inp));
1243 &xorps ($inout0,$inout3); # input^=tweak
1244 &xorps ($inout1,$inout4);
1245 &xorps ($inout2,$inout2); # zero the unused third lane
1246
1247 &call ("_aesni_encrypt3");
1248
1249 &xorps ($inout0,$inout3); # output^=tweak
1250 &xorps ($inout1,$inout4);
1251 &movups (&QWP(16*0,$out),$inout0); # write output
1252 &movups (&QWP(16*1,$out),$inout1);
1253 &lea ($out,&DWP(16*2,$out));
1254
1255 &movdqa ($tweak,$inout4); # last tweak
1256 &jmp (&label("xts_enc_done"));
1257
1258&set_label("xts_enc_three",16);
1259 &movaps ($inout5,$tweak); # put aside last tweak
1260 &movups ($inout0,&QWP(16*0,$inp)); # load input
1261 &movups ($inout1,&QWP(16*1,$inp));
1262 &movups ($inout2,&QWP(16*2,$inp));
1263 &lea ($inp,&DWP(16*3,$inp));
1264 &xorps ($inout0,$inout3); # input^=tweak
1265 &xorps ($inout1,$inout4);
1266 &xorps ($inout2,$inout5);
1267
1268 &call ("_aesni_encrypt3");
1269
1270 &xorps ($inout0,$inout3); # output^=tweak
1271 &xorps ($inout1,$inout4);
1272 &xorps ($inout2,$inout5);
1273 &movups (&QWP(16*0,$out),$inout0); # write output
1274 &movups (&QWP(16*1,$out),$inout1);
1275 &movups (&QWP(16*2,$out),$inout2);
1276 &lea ($out,&DWP(16*3,$out));
1277
1278 &movdqa ($tweak,$inout5); # last tweak
1279 &jmp (&label("xts_enc_done"));
1280
1281&set_label("xts_enc_four",16);
 # tweaks 0 and 1 were stored in stack slots 0 and 1 before the jump here;
 # tweak 2 is in $inout5 and tweak 3 is in $tweak
1282 &movaps ($inout4,$tweak); # put aside last tweak
1283
1284 &movups ($inout0,&QWP(16*0,$inp)); # load input
1285 &movups ($inout1,&QWP(16*1,$inp));
1286 &movups ($inout2,&QWP(16*2,$inp));
1287 &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
1288 &movups ($inout3,&QWP(16*3,$inp));
1289 &lea ($inp,&DWP(16*4,$inp));
1290 &xorps ($inout1,&QWP(16*1,"esp"));
1291 &xorps ($inout2,$inout5);
1292 &xorps ($inout3,$inout4);
1293
1294 &call ("_aesni_encrypt4");
1295
1296 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1297 &xorps ($inout1,&QWP(16*1,"esp"));
1298 &xorps ($inout2,$inout5);
1299 &movups (&QWP(16*0,$out),$inout0); # write output
1300 &xorps ($inout3,$inout4);
1301 &movups (&QWP(16*1,$out),$inout1);
1302 &movups (&QWP(16*2,$out),$inout2);
1303 &movups (&QWP(16*3,$out),$inout3);
1304 &lea ($out,&DWP(16*4,$out));
1305
1306 &movdqa ($tweak,$inout4); # last tweak
1307 &jmp (&label("xts_enc_done"));
1308
1309&set_label("xts_enc_done6x",16); # $tweak is pre-calculated
1310 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1311 &and ($len,15);
1312 &jz (&label("xts_enc_ret"));
1313 &movdqa ($inout3,$tweak);
1314 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1315 &jmp (&label("xts_enc_steal"));
1316
1317&set_label("xts_enc_done",16);
 # reached from the one..five block paths with $tweak holding the last
 # tweak used; if there is a partial tail, derive the next tweak into
 # $inout3 for the final block
1318 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1319 &pxor ($twtmp,$twtmp);
1320 &and ($len,15);
1321 &jz (&label("xts_enc_ret"));
1322
1323 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1324 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1325 &pshufd ($inout3,$twtmp,0x13);
1326 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1327 &pand ($inout3,&QWP(16*6,"esp")); # isolate carry and residue
1328 &pxor ($inout3,$tweak);
1329
1330&set_label("xts_enc_steal");
 # byte loop, $len (= len%16) iterations: each input tail byte replaces
 # the corresponding byte of the previous output block, and the displaced
 # output byte is written to the tail position; $rounds and $key serve as
 # scratch here and are restored after the loop
1331 &movz ($rounds,&BP(0,$inp));
1332 &movz ($key,&BP(-16,$out));
1333 &lea ($inp,&DWP(1,$inp));
1334 &mov (&BP(-16,$out),&LB($rounds));
1335 &mov (&BP(0,$out),&LB($key));
1336 &lea ($out,&DWP(1,$out));
1337 &sub ($len,1);
1338 &jnz (&label("xts_enc_steal"));
1339
1340 &sub ($out,&DWP(16*7+0,"esp")); # rewind $out
1341 &mov ($key,$key_); # restore $key
1342 &mov ($rounds,$rounds_); # restore $rounds
1343
 # encrypt the patched previous block in place with the tweak in $inout3
1344 &movups ($inout0,&QWP(-16,$out)); # load input
1345 &xorps ($inout0,$inout3); # input^=tweak
1346 if ($inline)
1347 { &aesni_inline_generate1("enc"); }
1348 else
1349 { &call ("_aesni_encrypt1"); }
1350 &xorps ($inout0,$inout3); # output^=tweak
1351 &movups (&QWP(-16,$out),$inout0); # write output
1352
1353&set_label("xts_enc_ret");
1354 &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
1355&function_end("aesni_xts_encrypt");
1356
# aesni_xts_decrypt -- emits the XTS decryption routine.
# Stack arguments, as read through wparam() below: 0=inp, 1=out, 2=len,
# 3=key1 (used for the data blocks), 4=key2 (used only to encrypt the
# tweak), 5=pointer to the 16-byte clear-text tweak.
# Outline of the emitted code:
#  - the clear-text tweak is run through the *encrypt* primitive with
#    key2 ("enc" is generated below, not "dec");
#  - when len%16 is non-zero, $len is reduced by 16 up front so that the
#    last whole block is left for the tail code instead of the bulk paths;
#  - the stack frame layout is the same as the one built here: slots
#    16*0..16*5 for tweaks, 16*6 for the tweak-update constant, 16*7+0
#    for $len and 16*7+4 for the caller's %esp;
#  - whole blocks are processed six at a time (xts_dec_loop6), then the
#    remaining one to five blocks are dispatched from xts_dec_short;
#  - for a partial tail, xts_dec_only_one_more decrypts the last whole
#    block with the *next* tweak, xts_dec_steal swaps the tail bytes, and
#    the reassembled block is decrypted with the *previous* tweak.
# $tweak/$twtmp/$twres/$twmask are aliases of $rndkey1/$rndkey0/$inout0/
# $inout1 (see the my() list opening the enclosing scope), so their
# contents are only valid until those registers are reused for keys or data.
1357&function_begin("aesni_xts_decrypt");
1358 &mov ($key,&wparam(4)); # key2
1359 &mov ($inp,&wparam(5)); # clear-text tweak
1360
1361 &mov ($rounds,&DWP(240,$key)); # key2->rounds
1362 &movups ($inout0,&QWP(0,$inp));
 # the tweak is encrypted (not decrypted) with key2
1363 if ($inline)
1364 { &aesni_inline_generate1("enc"); }
1365 else
1366 { &call ("_aesni_encrypt1"); }
1367
1368 &mov ($inp,&wparam(0));
1369 &mov ($out,&wparam(1));
1370 &mov ($len,&wparam(2));
1371 &mov ($key,&wparam(3)); # key1
1372
 # $key_ temporarily carries the caller's %esp until it is stored in the
 # frame at 16*7+4 below
1373 &mov ($key_,"esp");
1374 &sub ("esp",16*7+8);
1375 &and ("esp",-16); # align stack
1376
 # $rounds_ is used as scratch here: it becomes 16 when len%16!=0 and 0
 # otherwise
1377 &xor ($rounds_,$rounds_); # if(len%16) len-=16;
1378 &test ($len,15);
1379 &setnz (&LB($rounds_));
1380 &shl ($rounds_,4);
1381 &sub ($len,$rounds_);
1382
1383 &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
1384 &mov (&DWP(16*6+4,"esp"),0);
1385 &mov (&DWP(16*6+8,"esp"),1);
1386 &mov (&DWP(16*6+12,"esp"),0);
1387 &mov (&DWP(16*7+0,"esp"),$len); # save $len (already reduced by 16 if len%16)
1388 &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp
1389
1390 &mov ($rounds,&DWP(240,$key)); # key1->rounds
1391 &mov ($key_,$key); # backup $key
1392 &mov ($rounds_,$rounds); # backup $rounds
1393
1394 &movdqa ($tweak,$inout0);
1395 &pxor ($twtmp,$twtmp);
1396 &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
1397 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1398
1399 &and ($len,-16);
1400 &sub ($len,16*6);
1401 &jc (&label("xts_dec_short"));
1402
 # the six-block loop works with a halved round count; the full value is
 # rebuilt as 2*$rounds+1 after the loop
 # NOTE(review): presumably what _aesni_decrypt6_enter expects -- that
 # routine is defined outside this block, confirm there
1403 &shr ($rounds,1);
1404 &mov ($rounds_,$rounds);
1405 &jmp (&label("xts_dec_loop6"));
1406
1407&set_label("xts_dec_loop6",16);
 # generator-time loop: emits four copies of "store current tweak in slot
 # $i, then advance the tweak"
 # NOTE(review): the paddq/pand/pxor sequence with the 0x87 constant looks
 # like the XTS multiply-by-x step in GF(2^128) -- confirm against the spec
1408 for ($i=0;$i<4;$i++) {
1409 &pshufd ($twres,$twtmp,0x13);
1410 &pxor ($twtmp,$twtmp);
1411 &movdqa (&QWP(16*$i,"esp"),$tweak);
1412 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1413 &pand ($twres,$twmask); # isolate carry and residue
1414 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1415 &pxor ($tweak,$twres);
1416 }
 # $i is 4 here: the fifth tweak goes to slot 4 and the post-increment
 # makes the "save last tweak" store below land in slot 5
1417 &pshufd ($inout5,$twtmp,0x13);
1418 &movdqa (&QWP(16*$i++,"esp"),$tweak);
1419 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1420 &$movekey ($rndkey0,&QWP(0,$key_));
1421 &pand ($inout5,$twmask); # isolate carry and residue
1422 &movups ($inout0,&QWP(0,$inp)); # load input
1423 &pxor ($inout5,$tweak);
1424
1425 # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
1426 &movdqu ($inout1,&QWP(16*1,$inp));
1427 &xorps ($inout0,$rndkey0); # input^=rndkey[0]
1428 &movdqu ($inout2,&QWP(16*2,$inp));
1429 &pxor ($inout1,$rndkey0);
1430 &movdqu ($inout3,&QWP(16*3,$inp));
1431 &pxor ($inout2,$rndkey0);
1432 &movdqu ($inout4,&QWP(16*4,$inp));
1433 &pxor ($inout3,$rndkey0);
1434 &movdqu ($rndkey1,&QWP(16*5,$inp));
1435 &pxor ($inout4,$rndkey0);
1436 &lea ($inp,&DWP(16*6,$inp));
1437 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1438 &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
1439 &pxor ($inout5,$rndkey1);
1440
1441 &$movekey ($rndkey1,&QWP(16,$key_));
1442 &lea ($key,&DWP(32,$key_));
1443 &pxor ($inout1,&QWP(16*1,"esp"));
1444 &aesdec ($inout0,$rndkey1);
1445 &pxor ($inout2,&QWP(16*2,"esp"));
1446 &aesdec ($inout1,$rndkey1);
1447 &pxor ($inout3,&QWP(16*3,"esp"));
1448 &dec ($rounds);
1449 &aesdec ($inout2,$rndkey1);
1450 &pxor ($inout4,&QWP(16*4,"esp"));
1451 &aesdec ($inout3,$rndkey1);
1452 &pxor ($inout5,$rndkey0);
1453 &aesdec ($inout4,$rndkey1);
1454 &$movekey ($rndkey0,&QWP(0,$key));
1455 &aesdec ($inout5,$rndkey1);
1456 &call (&label("_aesni_decrypt6_enter"));
1457
1458 &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
1459 &pxor ($twtmp,$twtmp);
1460 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1461 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1462 &xorps ($inout1,&QWP(16*1,"esp"));
1463 &movups (&QWP(16*0,$out),$inout0); # write output
1464 &xorps ($inout2,&QWP(16*2,"esp"));
1465 &movups (&QWP(16*1,$out),$inout1);
1466 &xorps ($inout3,&QWP(16*3,"esp"));
1467 &movups (&QWP(16*2,$out),$inout2);
1468 &xorps ($inout4,&QWP(16*4,"esp"));
1469 &movups (&QWP(16*3,$out),$inout3);
1470 &xorps ($inout5,$tweak);
1471 &movups (&QWP(16*4,$out),$inout4);
1472 &pshufd ($twres,$twtmp,0x13);
1473 &movups (&QWP(16*5,$out),$inout5);
1474 &lea ($out,&DWP(16*6,$out));
1475 &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
1476
 # advance past the last tweak used, so $tweak is ready for the next
 # iteration or for the tail code
1477 &pxor ($twtmp,$twtmp);
1478 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1479 &pand ($twres,$twmask); # isolate carry and residue
1480 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1481 &mov ($rounds,$rounds_); # restore $rounds
1482 &pxor ($tweak,$twres);
1483
1484 &sub ($len,16*6);
1485 &jnc (&label("xts_dec_loop6"));
1486
1487 &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds
1488 &mov ($key,$key_); # restore $key
1489 &mov ($rounds_,$rounds);
1490
1491&set_label("xts_dec_short");
 # $len becomes the number of remaining whole-block bytes (0..0x50);
 # dispatch on it: 0 -> done6x, 0x10 -> one, 0x20 -> two, 0x30 -> three,
 # 0x40 -> four, otherwise fall through to the five-block code
1492 &add ($len,16*6);
1493 &jz (&label("xts_dec_done6x"));
1494
1495 &movdqa ($inout3,$tweak); # put aside previous tweak
1496 &cmp ($len,0x20);
1497 &jb (&label("xts_dec_one"));
1498
 # the SSE instructions below leave the flags from the preceding cmp
 # intact, which is what the je further down relies on
1499 &pshufd ($twres,$twtmp,0x13);
1500 &pxor ($twtmp,$twtmp);
1501 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1502 &pand ($twres,$twmask); # isolate carry and residue
1503 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1504 &pxor ($tweak,$twres);
1505 &je (&label("xts_dec_two"));
1506
1507 &pshufd ($twres,$twtmp,0x13);
1508 &pxor ($twtmp,$twtmp);
1509 &movdqa ($inout4,$tweak); # put aside previous tweak
1510 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1511 &pand ($twres,$twmask); # isolate carry and residue
1512 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1513 &pxor ($tweak,$twres);
1514 &cmp ($len,0x40);
1515 &jb (&label("xts_dec_three"));
1516
1517 &pshufd ($twres,$twtmp,0x13);
1518 &pxor ($twtmp,$twtmp);
1519 &movdqa ($inout5,$tweak); # put aside previous tweak
1520 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1521 &pand ($twres,$twmask); # isolate carry and residue
1522 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1523 &pxor ($tweak,$twres);
1524 &movdqa (&QWP(16*0,"esp"),$inout3);
1525 &movdqa (&QWP(16*1,"esp"),$inout4);
1526 &je (&label("xts_dec_four"));
1527
 # five blocks remain: tweaks 0..3 go to stack slots 0..3, the fifth is
 # built in $inout5 and saved in slot 4
1528 &movdqa (&QWP(16*2,"esp"),$inout5);
1529 &pshufd ($inout5,$twtmp,0x13);
1530 &movdqa (&QWP(16*3,"esp"),$tweak);
1531 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1532 &pand ($inout5,$twmask); # isolate carry and residue
1533 &pxor ($inout5,$tweak);
1534
1535 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
1536 &movdqu ($inout1,&QWP(16*1,$inp));
1537 &movdqu ($inout2,&QWP(16*2,$inp));
1538 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1539 &movdqu ($inout3,&QWP(16*3,$inp));
1540 &pxor ($inout1,&QWP(16*1,"esp"));
1541 &movdqu ($inout4,&QWP(16*4,$inp));
1542 &pxor ($inout2,&QWP(16*2,"esp"));
1543 &lea ($inp,&DWP(16*5,$inp));
1544 &pxor ($inout3,&QWP(16*3,"esp"));
1545 &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
1546 &pxor ($inout4,$inout5);
1547
 # only $inout0..$inout4 are stored afterwards; the sixth lane's result
 # is never written out
1548 &call ("_aesni_decrypt6");
1549
1550 &movaps ($tweak,&QWP(16*4,"esp")); # last tweak
1551 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1552 &xorps ($inout1,&QWP(16*1,"esp"));
1553 &xorps ($inout2,&QWP(16*2,"esp"));
1554 &movups (&QWP(16*0,$out),$inout0); # write output
1555 &xorps ($inout3,&QWP(16*3,"esp"));
1556 &movups (&QWP(16*1,$out),$inout1);
1557 &xorps ($inout4,$tweak);
1558 &movups (&QWP(16*2,$out),$inout2);
1559 &movups (&QWP(16*3,$out),$inout3);
1560 &movups (&QWP(16*4,$out),$inout4);
1561 &lea ($out,&DWP(16*5,$out));
1562 &jmp (&label("xts_dec_done"));
1563
1564&set_label("xts_dec_one",16);
1565 &movups ($inout0,&QWP(16*0,$inp)); # load input
1566 &lea ($inp,&DWP(16*1,$inp));
1567 &xorps ($inout0,$inout3); # input^=tweak
1568 if ($inline)
1569 { &aesni_inline_generate1("dec"); }
1570 else
1571 { &call ("_aesni_decrypt1"); }
1572 &xorps ($inout0,$inout3); # output^=tweak
1573 &movups (&QWP(16*0,$out),$inout0); # write output
1574 &lea ($out,&DWP(16*1,$out));
1575
1576 &movdqa ($tweak,$inout3); # last tweak
1577 &jmp (&label("xts_dec_done"));
1578
1579&set_label("xts_dec_two",16);
 # unlike xts_enc_two, $inout2 is not cleared before the 3-block call;
 # whatever it contains is processed and the result is discarded
1580 &movaps ($inout4,$tweak); # put aside last tweak
1581
1582 &movups ($inout0,&QWP(16*0,$inp)); # load input
1583 &movups ($inout1,&QWP(16*1,$inp));
1584 &lea ($inp,&DWP(16*2,$inp));
1585 &xorps ($inout0,$inout3); # input^=tweak
1586 &xorps ($inout1,$inout4);
1587
1588 &call ("_aesni_decrypt3");
1589
1590 &xorps ($inout0,$inout3); # output^=tweak
1591 &xorps ($inout1,$inout4);
1592 &movups (&QWP(16*0,$out),$inout0); # write output
1593 &movups (&QWP(16*1,$out),$inout1);
1594 &lea ($out,&DWP(16*2,$out));
1595
1596 &movdqa ($tweak,$inout4); # last tweak
1597 &jmp (&label("xts_dec_done"));
1598
1599&set_label("xts_dec_three",16);
1600 &movaps ($inout5,$tweak); # put aside last tweak
1601 &movups ($inout0,&QWP(16*0,$inp)); # load input
1602 &movups ($inout1,&QWP(16*1,$inp));
1603 &movups ($inout2,&QWP(16*2,$inp));
1604 &lea ($inp,&DWP(16*3,$inp));
1605 &xorps ($inout0,$inout3); # input^=tweak
1606 &xorps ($inout1,$inout4);
1607 &xorps ($inout2,$inout5);
1608
1609 &call ("_aesni_decrypt3");
1610
1611 &xorps ($inout0,$inout3); # output^=tweak
1612 &xorps ($inout1,$inout4);
1613 &xorps ($inout2,$inout5);
1614 &movups (&QWP(16*0,$out),$inout0); # write output
1615 &movups (&QWP(16*1,$out),$inout1);
1616 &movups (&QWP(16*2,$out),$inout2);
1617 &lea ($out,&DWP(16*3,$out));
1618
1619 &movdqa ($tweak,$inout5); # last tweak
1620 &jmp (&label("xts_dec_done"));
1621
1622&set_label("xts_dec_four",16);
 # tweaks 0 and 1 were stored in stack slots 0 and 1 before the jump here;
 # tweak 2 is in $inout5 and tweak 3 is in $tweak
1623 &movaps ($inout4,$tweak); # put aside last tweak
1624
1625 &movups ($inout0,&QWP(16*0,$inp)); # load input
1626 &movups ($inout1,&QWP(16*1,$inp));
1627 &movups ($inout2,&QWP(16*2,$inp));
1628 &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
1629 &movups ($inout3,&QWP(16*3,$inp));
1630 &lea ($inp,&DWP(16*4,$inp));
1631 &xorps ($inout1,&QWP(16*1,"esp"));
1632 &xorps ($inout2,$inout5);
1633 &xorps ($inout3,$inout4);
1634
1635 &call ("_aesni_decrypt4");
1636
1637 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1638 &xorps ($inout1,&QWP(16*1,"esp"));
1639 &xorps ($inout2,$inout5);
1640 &movups (&QWP(16*0,$out),$inout0); # write output
1641 &xorps ($inout3,$inout4);
1642 &movups (&QWP(16*1,$out),$inout1);
1643 &movups (&QWP(16*2,$out),$inout2);
1644 &movups (&QWP(16*3,$out),$inout3);
1645 &lea ($out,&DWP(16*4,$out));
1646
1647 &movdqa ($tweak,$inout4); # last tweak
1648 &jmp (&label("xts_dec_done"));
1649
1650&set_label("xts_dec_done6x",16); # $tweak is pre-calculated
 # $twtmp and $twmask are still as left by the setup code or by the end
 # of the six-block loop, so the tail code can be entered directly
1651 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1652 &and ($len,15);
1653 &jz (&label("xts_dec_ret"));
1654 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1655 &jmp (&label("xts_dec_only_one_more"));
1656
1657&set_label("xts_dec_done",16);
 # reached from the one..five block paths with $tweak holding the last
 # tweak used; advance it once and reload $twmask before the tail code
1658 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1659 &pxor ($twtmp,$twtmp);
1660 &and ($len,15);
1661 &jz (&label("xts_dec_ret"));
1662
1663 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1664 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1665 &pshufd ($twres,$twtmp,0x13);
1666 &pxor ($twtmp,$twtmp);
1667 &movdqa ($twmask,&QWP(16*6,"esp"));
1668 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1669 &pand ($twres,$twmask); # isolate carry and residue
1670 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1671 &pxor ($tweak,$twres);
1672
1673&set_label("xts_dec_only_one_more");
 # $inout4 keeps the current tweak, $inout3 receives the following one;
 # the last whole input block is decrypted with the following tweak first
1674 &pshufd ($inout3,$twtmp,0x13);
1675 &movdqa ($inout4,$tweak); # put aside previous tweak
1676 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1677 &pand ($inout3,$twmask); # isolate carry and residue
1678 &pxor ($inout3,$tweak);
1679
1680 &mov ($key,$key_); # restore $key
1681 &mov ($rounds,$rounds_); # restore $rounds
1682
1683 &movups ($inout0,&QWP(0,$inp)); # load input
1684 &xorps ($inout0,$inout3); # input^=tweak
1685 if ($inline)
1686 { &aesni_inline_generate1("dec"); }
1687 else
1688 { &call ("_aesni_decrypt1"); }
1689 &xorps ($inout0,$inout3); # output^=tweak
1690 &movups (&QWP(0,$out),$inout0); # write output
1691
1692&set_label("xts_dec_steal");
 # byte loop, $len (= len%16) iterations: each input tail byte (at
 # $inp+16) replaces the corresponding byte of the block just written,
 # and the displaced byte is written to the tail position at $out+16;
 # $rounds and $key serve as scratch here and are restored after the loop
1693 &movz ($rounds,&BP(16,$inp));
1694 &movz ($key,&BP(0,$out));
1695 &lea ($inp,&DWP(1,$inp));
1696 &mov (&BP(0,$out),&LB($rounds));
1697 &mov (&BP(16,$out),&LB($key));
1698 &lea ($out,&DWP(1,$out));
1699 &sub ($len,1);
1700 &jnz (&label("xts_dec_steal"));
1701
1702 &sub ($out,&DWP(16*7+0,"esp")); # rewind $out
1703 &mov ($key,$key_); # restore $key
1704 &mov ($rounds,$rounds_); # restore $rounds
1705
 # decrypt the reassembled block in place with the earlier tweak ($inout4)
1706 &movups ($inout0,&QWP(0,$out)); # load input
1707 &xorps ($inout0,$inout4); # input^=tweak
1708 if ($inline)
1709 { &aesni_inline_generate1("dec"); }
1710 else
1711 { &call ("_aesni_decrypt1"); }
1712 &xorps ($inout0,$inout4); # output^=tweak
1713 &movups (&QWP(0,$out),$inout0); # write output
1714
1715&set_label("xts_dec_ret");
1716 &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
1717&function_end("aesni_xts_decrypt");
1718}
6c83629b
AP
1719}
1720\f
1721######################################################################
d64a7232
AP
1722# void $PREFIX_cbc_encrypt (const void *inp, void *out,
1723# size_t length, const AES_KEY *key,
1724# unsigned char *ivp,const int enc);
1725&function_begin("${PREFIX}_cbc_encrypt");
1726 &mov ($inp,&wparam(0));
f8501464 1727 &mov ($rounds_,"esp");
d64a7232 1728 &mov ($out,&wparam(1));
f8501464 1729 &sub ($rounds_,24);
d64a7232 1730 &mov ($len,&wparam(2));
f8501464 1731 &and ($rounds_,-16);
d64a7232 1732 &mov ($key,&wparam(3));
d64a7232 1733 &mov ($key_,&wparam(4));
d7d119a3 1734 &test ($len,$len);
f8501464 1735 &jz (&label("cbc_abort"));
d64a7232
AP
1736
1737 &cmp (&wparam(5),0);
f8501464
AP
1738 &xchg ($rounds_,"esp"); # alloca
1739 &movups ($ivec,&QWP(0,$key_)); # load IV
d64a7232 1740 &mov ($rounds,&DWP(240,$key));
f8501464
AP
1741 &mov ($key_,$key); # backup $key
1742 &mov (&DWP(16,"esp"),$rounds_); # save original %esp
1743 &mov ($rounds_,$rounds); # backup $rounds
d64a7232
AP
1744 &je (&label("cbc_decrypt"));
1745
f8501464 1746 &movaps ($inout0,$ivec);
d64a7232
AP
1747 &cmp ($len,16);
1748 &jb (&label("cbc_enc_tail"));
1749 &sub ($len,16);
1750 &jmp (&label("cbc_enc_loop"));
1751
1752&set_label("cbc_enc_loop",16);
f8501464 1753 &movups ($ivec,&QWP(0,$inp)); # input actually
d64a7232 1754 &lea ($inp,&DWP(16,$inp));
6f766a41 1755 if ($inline)
f8501464 1756 { &aesni_inline_generate1("enc",$inout0,$ivec); }
6f766a41 1757 else
f8501464 1758 { &xorps($inout0,$ivec); &call("_aesni_encrypt1"); }
d64a7232
AP
1759 &mov ($rounds,$rounds_); # restore $rounds
1760 &mov ($key,$key_); # restore $key
d7d119a3
AP
1761 &movups (&QWP(0,$out),$inout0); # store output
1762 &lea ($out,&DWP(16,$out));
1763 &sub ($len,16);
d64a7232
AP
1764 &jnc (&label("cbc_enc_loop"));
1765 &add ($len,16);
1766 &jnz (&label("cbc_enc_tail"));
1767 &movaps ($ivec,$inout0);
1768 &jmp (&label("cbc_ret"));
1769
1770&set_label("cbc_enc_tail");
1771 &mov ("ecx",$len); # zaps $rounds
1772 &data_word(0xA4F3F689); # rep movsb
1773 &mov ("ecx",16); # zero tail
1774 &sub ("ecx",$len);
1775 &xor ("eax","eax"); # zaps $len
1776 &data_word(0xAAF3F689); # rep stosb
1777 &lea ($out,&DWP(-16,$out)); # rewind $out by 1 block
1778 &mov ($rounds,$rounds_); # restore $rounds
1779 &mov ($inp,$out); # $inp and $out are the same
1780 &mov ($key,$key_); # restore $key
1781 &jmp (&label("cbc_enc_loop"));
6c83629b 1782######################################################################
d64a7232 1783&set_label("cbc_decrypt",16);
f8501464 1784 &cmp ($len,0x50);
d608b4d6 1785 &jbe (&label("cbc_dec_tail"));
f8501464
AP
1786 &movaps (&QWP(0,"esp"),$ivec); # save IV
1787 &sub ($len,0x50);
1788 &jmp (&label("cbc_dec_loop6_enter"));
d64a7232 1789
f8501464
AP
1790&set_label("cbc_dec_loop6",16);
1791 &movaps (&QWP(0,"esp"),$rndkey0); # save IV
1792 &movups (&QWP(0,$out),$inout5);
1793 &lea ($out,&DWP(0x10,$out));
1794&set_label("cbc_dec_loop6_enter");
1795 &movdqu ($inout0,&QWP(0,$inp));
1796 &movdqu ($inout1,&QWP(0x10,$inp));
1797 &movdqu ($inout2,&QWP(0x20,$inp));
1798 &movdqu ($inout3,&QWP(0x30,$inp));
1799 &movdqu ($inout4,&QWP(0x40,$inp));
1800 &movdqu ($inout5,&QWP(0x50,$inp));
1801
1802 &call ("_aesni_decrypt6");
1803
1804 &movups ($rndkey1,&QWP(0,$inp));
1805 &movups ($rndkey0,&QWP(0x10,$inp));
1806 &xorps ($inout0,&QWP(0,"esp")); # ^=IV
1807 &xorps ($inout1,$rndkey1);
1808 &movups ($rndkey1,&QWP(0x20,$inp));
1809 &xorps ($inout2,$rndkey0);
1810 &movups ($rndkey0,&QWP(0x30,$inp));
1811 &xorps ($inout3,$rndkey1);
1812 &movups ($rndkey1,&QWP(0x40,$inp));
1813 &xorps ($inout4,$rndkey0);
1814 &movups ($rndkey0,&QWP(0x50,$inp)); # IV
1815 &xorps ($inout5,$rndkey1);
1816 &movups (&QWP(0,$out),$inout0);
1817 &movups (&QWP(0x10,$out),$inout1);
1818 &lea ($inp,&DWP(0x60,$inp));
1819 &movups (&QWP(0x20,$out),$inout2);
1820 &mov ($rounds,$rounds_) # restore $rounds
1821 &movups (&QWP(0x30,$out),$inout3);
1822 &mov ($key,$key_); # restore $key
1823 &movups (&QWP(0x40,$out),$inout4);
1824 &lea ($out,&DWP(0x50,$out));
1825 &sub ($len,0x60);
1826 &ja (&label("cbc_dec_loop6"));
1827
1828 &movaps ($inout0,$inout5);
1829 &movaps ($ivec,$rndkey0);
1830 &add ($len,0x50);
1831 &jle (&label("cbc_dec_tail_collected"));
1832 &movups (&QWP(0,$out),$inout0);
1833 &lea ($out,&DWP(0x10,$out));
6c83629b 1834&set_label("cbc_dec_tail");
d64a7232 1835 &movups ($inout0,&QWP(0,$inp));
d64a7232 1836 &movaps ($in0,$inout0);
d7d119a3 1837 &cmp ($len,0x10);
d64a7232 1838 &jbe (&label("cbc_dec_one"));
f8501464 1839
d64a7232 1840 &movups ($inout1,&QWP(0x10,$inp));
d64a7232 1841 &movaps ($in1,$inout1);
d7d119a3 1842 &cmp ($len,0x20);
d64a7232 1843 &jbe (&label("cbc_dec_two"));
f8501464 1844
d64a7232 1845 &movups ($inout2,&QWP(0x20,$inp));
d608b4d6
AP
1846 &cmp ($len,0x30);
1847 &jbe (&label("cbc_dec_three"));
f8501464 1848
d608b4d6 1849 &movups ($inout3,&QWP(0x30,$inp));
f8501464
AP
1850 &cmp ($len,0x40);
1851 &jbe (&label("cbc_dec_four"));
1852
1853 &movups ($inout4,&QWP(0x40,$inp));
1854 &movaps (&QWP(0,"esp"),$ivec); # save IV
1855 &movups ($inout0,&QWP(0,$inp));
1856 &xorps ($inout5,$inout5);
1857 &call ("_aesni_decrypt6");
1858 &movups ($rndkey1,&QWP(0,$inp));
1859 &movups ($rndkey0,&QWP(0x10,$inp));
1860 &xorps ($inout0,&QWP(0,"esp")); # ^= IV
1861 &xorps ($inout1,$rndkey1);
1862 &movups ($rndkey1,&QWP(0x20,$inp));
1863 &xorps ($inout2,$rndkey0);
1864 &movups ($rndkey0,&QWP(0x30,$inp));
1865 &xorps ($inout3,$rndkey1);
1866 &movups ($ivec,&QWP(0x40,$inp)); # IV
1867 &xorps ($inout4,$rndkey0);
1868 &movups (&QWP(0,$out),$inout0);
1869 &movups (&QWP(0x10,$out),$inout1);
1870 &movups (&QWP(0x20,$out),$inout2);
1871 &movups (&QWP(0x30,$out),$inout3);
1872 &lea ($out,&DWP(0x40,$out));
1873 &movaps ($inout0,$inout4);
1874 &sub ($len,0x50);
d64a7232
AP
1875 &jmp (&label("cbc_dec_tail_collected"));
1876
d7d119a3 1877&set_label("cbc_dec_one",16);
6f766a41
AP
1878 if ($inline)
1879 { &aesni_inline_generate1("dec"); }
1880 else
1881 { &call ("_aesni_decrypt1"); }
f8501464
AP
1882 &xorps ($inout0,$ivec);
1883 &movaps ($ivec,$in0);
1884 &sub ($len,0x10);
d64a7232
AP
1885 &jmp (&label("cbc_dec_tail_collected"));
1886
d7d119a3 1887&set_label("cbc_dec_two",16);
f8501464 1888 &xorps ($inout2,$inout2);
d64a7232 1889 &call ("_aesni_decrypt3");
f8501464
AP
1890 &xorps ($inout0,$ivec);
1891 &xorps ($inout1,$in0);
1892 &movups (&QWP(0,$out),$inout0);
1893 &movaps ($inout0,$inout1);
d64a7232 1894 &lea ($out,&DWP(0x10,$out));
f8501464
AP
1895 &movaps ($ivec,$in1);
1896 &sub ($len,0x20);
d608b4d6
AP
1897 &jmp (&label("cbc_dec_tail_collected"));
1898
d7d119a3 1899&set_label("cbc_dec_three",16);
d608b4d6 1900 &call ("_aesni_decrypt3");
f8501464
AP
1901 &xorps ($inout0,$ivec);
1902 &xorps ($inout1,$in0);
1903 &xorps ($inout2,$in1);
1904 &movups (&QWP(0,$out),$inout0);
1905 &movaps ($inout0,$inout2);
1906 &movups (&QWP(0x10,$out),$inout1);
d608b4d6 1907 &lea ($out,&DWP(0x20,$out));
f8501464
AP
1908 &movups ($ivec,&QWP(0x20,$inp));
1909 &sub ($len,0x30);
1910 &jmp (&label("cbc_dec_tail_collected"));
1911
1912&set_label("cbc_dec_four",16);
1913 &call ("_aesni_decrypt4");
1914 &movups ($rndkey1,&QWP(0x10,$inp));
1915 &movups ($rndkey0,&QWP(0x20,$inp));
1916 &xorps ($inout0,$ivec);
1917 &movups ($ivec,&QWP(0x30,$inp));
1918 &xorps ($inout1,$in0);
1919 &movups (&QWP(0,$out),$inout0);
1920 &xorps ($inout2,$rndkey1);
1921 &movups (&QWP(0x10,$out),$inout1);
1922 &xorps ($inout3,$rndkey0);
1923 &movups (&QWP(0x20,$out),$inout2);
1924 &lea ($out,&DWP(0x30,$out));
1925 &movaps ($inout0,$inout3);
1926 &sub ($len,0x40);
d64a7232
AP
1927
1928&set_label("cbc_dec_tail_collected");
1929 &and ($len,15);
1930 &jnz (&label("cbc_dec_tail_partial"));
f8501464 1931 &movups (&QWP(0,$out),$inout0);
d64a7232
AP
1932 &jmp (&label("cbc_ret"));
1933
d7d119a3 1934&set_label("cbc_dec_tail_partial",16);
f8501464
AP
1935 &movaps (&QWP(0,"esp"),$inout0);
1936 &mov ("ecx",16);
d64a7232 1937 &mov ($inp,"esp");
f8501464 1938 &sub ("ecx",$len);
d64a7232 1939 &data_word(0xA4F3F689); # rep movsb
d64a7232
AP
1940
1941&set_label("cbc_ret");
f8501464 1942 &mov ("esp",&DWP(16,"esp")); # pull original %esp
d64a7232
AP
1943 &mov ($key_,&wparam(4));
1944 &movups (&QWP(0,$key_),$ivec); # output IV
f8501464 1945&set_label("cbc_abort");
d64a7232 1946&function_end("${PREFIX}_cbc_encrypt");
6c83629b
AP
1947\f
######################################################################
# Mechanical port from aesni-x86_64.pl.
#
# _aesni_set_encrypt_key is private interface,
# input:
#	"eax"		const unsigned char *userKey
#	$rounds		int bits
#	$key		AES_KEY *key
# output:
#	"eax"		return code: 0 on success, -1 if userKey or key is
#			NULL, -2 if bits is not 128, 192 or 256
#	$rounds		rounds-1 (9, 11 or 13); the same value is also
#			written at offset 240 from the start of *key
#
# Every &aeskeygenassist below is paired with a &call to a local helper
# label (key_128, key_192a/b, key_256a/b).  The helper stores the round
# key(s) computed so far, advances $key and folds the assist result into
# the next round key.  The *_cold entry points skip the store/advance
# part and are used for the very first invocation only.

&function_begin_B("_aesni_set_encrypt_key");
	&test	("eax","eax");
	&jz	(&label("bad_pointer"));
	&test	($key,$key);
	&jz	(&label("bad_pointer"));

	&movups	("xmm0",&QWP(0,"eax"));	# pull first 128 bits of *userKey
	&xorps	("xmm4","xmm4");	# low dword of xmm4 is assumed 0
	&lea	($key,&DWP(16,$key));
	&cmp	($rounds,256);
	&je	(&label("14rounds"));
	&cmp	($rounds,192);
	&je	(&label("12rounds"));
	&cmp	($rounds,128);
	&jne	(&label("bad_keybits"));

&set_label("10rounds",16);
	&mov		($rounds,9);
	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
	&aeskeygenassist("xmm1","xmm0",0x01);		# round 1
	&call		(&label("key_128_cold"));
	&aeskeygenassist("xmm1","xmm0",0x02);		# round 2
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x04);		# round 3
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x08);		# round 4
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x10);		# round 5
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x20);		# round 6
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x40);		# round 7
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x80);		# round 8
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x1b);		# round 9
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x36);		# round 10
	&call		(&label("key_128"));
	&$movekey	(&QWP(0,$key),"xmm0");		# last round key
	&mov		(&DWP(80,$key),$rounds);	# lands at offset 240 of *key
	&xor		("eax","eax");
	&ret();

&set_label("key_128",16);
	&$movekey	(&QWP(0,$key),"xmm0");
	&lea		($key,&DWP(16,$key));
&set_label("key_128_cold");
	&shufps		("xmm4","xmm0",0b00010000);
	&xorps		("xmm0","xmm4");
	&shufps		("xmm4","xmm0",0b10001100);
	&xorps		("xmm0","xmm4");
	&shufps		("xmm1","xmm1",0b11111111);	# critical path
	&xorps		("xmm0","xmm1");
	&ret();

&set_label("12rounds",16);
	&movq		("xmm2",&QWP(16,"eax"));	# remaining 1/3 of *userKey
	&mov		($rounds,11);
	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
	&aeskeygenassist("xmm1","xmm2",0x01);		# round 1,2
	&call		(&label("key_192a_cold"));
	&aeskeygenassist("xmm1","xmm2",0x02);		# round 2,3
	&call		(&label("key_192b"));
	&aeskeygenassist("xmm1","xmm2",0x04);		# round 4,5
	&call		(&label("key_192a"));
	&aeskeygenassist("xmm1","xmm2",0x08);		# round 5,6
	&call		(&label("key_192b"));
	&aeskeygenassist("xmm1","xmm2",0x10);		# round 7,8
	&call		(&label("key_192a"));
	&aeskeygenassist("xmm1","xmm2",0x20);		# round 8,9
	&call		(&label("key_192b"));
	&aeskeygenassist("xmm1","xmm2",0x40);		# round 10,11
	&call		(&label("key_192a"));
	&aeskeygenassist("xmm1","xmm2",0x80);		# round 11,12
	&call		(&label("key_192b"));
	&$movekey	(&QWP(0,$key),"xmm0");		# last round key
	&mov		(&DWP(48,$key),$rounds);	# lands at offset 240 of *key
	&xor		("eax","eax");
	&ret();

&set_label("key_192a",16);
	&$movekey	(&QWP(0,$key),"xmm0");
	&lea		($key,&DWP(16,$key));
&set_label("key_192a_cold",16);
	&movaps		("xmm5","xmm2");	# keep a copy for key_192b
&set_label("key_192b_warm");
	&shufps		("xmm4","xmm0",0b00010000);
	&movdqa		("xmm3","xmm2");
	&xorps		("xmm0","xmm4");
	&shufps		("xmm4","xmm0",0b10001100);
	&pslldq		("xmm3",4);
	&xorps		("xmm0","xmm4");
	&pshufd		("xmm1","xmm1",0b01010101);	# critical path
	&pxor		("xmm2","xmm3");
	&pxor		("xmm0","xmm1");
	&pshufd		("xmm3","xmm0",0b11111111);
	&pxor		("xmm2","xmm3");
	&ret();

&set_label("key_192b",16);
	&movaps		("xmm3","xmm0");
	&shufps		("xmm5","xmm0",0b01000100);
	&$movekey	(&QWP(0,$key),"xmm5");
	&shufps		("xmm3","xmm2",0b01001110);
	&$movekey	(&QWP(16,$key),"xmm3");
	&lea		($key,&DWP(32,$key));	# two round keys stored
	&jmp		(&label("key_192b_warm"));

&set_label("14rounds",16);
	&movups		("xmm2",&QWP(16,"eax"));	# remaining half of *userKey
	&mov		($rounds,13);
	&lea		($key,&DWP(16,$key));
	&$movekey	(&QWP(-32,$key),"xmm0");	# round 0
	&$movekey	(&QWP(-16,$key),"xmm2");	# round 1
	&aeskeygenassist("xmm1","xmm2",0x01);		# round 2
	&call		(&label("key_256a_cold"));
	&aeskeygenassist("xmm1","xmm0",0x01);		# round 3
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x02);		# round 4
	&call		(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x02);		# round 5
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x04);		# round 6
	&call		(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x04);		# round 7
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x08);		# round 8
	&call		(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x08);		# round 9
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x10);		# round 10
	&call		(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x10);		# round 11
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x20);		# round 12
	&call		(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x20);		# round 13
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x40);		# round 14
	&call		(&label("key_256a"));
	&$movekey	(&QWP(0,$key),"xmm0");		# last round key
	&mov		(&DWP(16,$key),$rounds);	# lands at offset 240 of *key
	&xor		("eax","eax");
	&ret();

&set_label("key_256a",16);
	&$movekey	(&QWP(0,$key),"xmm2");
	&lea		($key,&DWP(16,$key));
&set_label("key_256a_cold");
	&shufps		("xmm4","xmm0",0b00010000);
	&xorps		("xmm0","xmm4");
	&shufps		("xmm4","xmm0",0b10001100);
	&xorps		("xmm0","xmm4");
	&shufps		("xmm1","xmm1",0b11111111);	# critical path
	&xorps		("xmm0","xmm1");
	&ret();

&set_label("key_256b",16);
	&$movekey	(&QWP(0,$key),"xmm0");
	&lea		($key,&DWP(16,$key));

	&shufps		("xmm4","xmm2",0b00010000);
	&xorps		("xmm2","xmm4");
	&shufps		("xmm4","xmm2",0b10001100);
	&xorps		("xmm2","xmm4");
	&shufps		("xmm1","xmm1",0b10101010);	# critical path
	&xorps		("xmm2","xmm1");
	&ret();

&set_label("bad_pointer",4);
	&mov	("eax",-1);
	&ret	();
&set_label("bad_keybits",4);
	&mov	("eax",-2);
	&ret	();
&function_end_B("_aesni_set_encrypt_key");
2137
# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
#                              AES_KEY *key)
#
# Public entry point: moves the three stack arguments into the registers
# expected by the private _aesni_set_encrypt_key routine ("eax", $rounds,
# $key) and returns whatever that routine leaves in "eax".
{
	my $fn = "${PREFIX}_set_encrypt_key";

	&function_begin_B($fn);
		&mov	("eax",&wparam(0));	# userKey
		&mov	($rounds,&wparam(1));	# bits
		&mov	($key,&wparam(2));	# key
		&call	("_aesni_set_encrypt_key");
		&ret	();
	&function_end_B($fn);
}
2147
# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
#                              AES_KEY *key)
#
# Builds the encryption key schedule with _aesni_set_encrypt_key and, if
# that succeeded, converts it in place into a decryption schedule: the
# order of the round keys is reversed and every round key except the
# first and the last one is passed through aesimc.  A non-zero return
# code from _aesni_set_encrypt_key is returned unchanged; success is 0.
&function_begin_B("${PREFIX}_set_decrypt_key");
	&mov	("eax",&wparam(0));
	&mov	($rounds,&wparam(1));
	&mov	($key,&wparam(2));
	&call	("_aesni_set_encrypt_key");
	&mov	($key,&wparam(2));	# reload, the callee advanced $key
	&shl	($rounds,4);		# rounds-1 after _aesni_set_encrypt_key
	&test	("eax","eax");
	&jnz	(&label("dec_key_ret"));
	&lea	("eax",&DWP(16,$key,$rounds));	# end of key schedule

	&$movekey	("xmm0",&QWP(0,$key));	# just swap
	&$movekey	("xmm1",&QWP(0,"eax"));
	&$movekey	(&QWP(0,"eax"),"xmm0");
	&$movekey	(&QWP(0,$key),"xmm1");
	&lea		($key,&DWP(16,$key));
	&lea		("eax",&DWP(-16,"eax"));

	# $key walks up from the front, "eax" walks down from the back; the
	# loop runs until the two pointers meet.
&set_label("dec_key_inverse");
	&$movekey	("xmm0",&QWP(0,$key));	# swap and inverse
	&$movekey	("xmm1",&QWP(0,"eax"));
	&aesimc		("xmm0","xmm0");
	&aesimc		("xmm1","xmm1");
	&lea		($key,&DWP(16,$key));
	&lea		("eax",&DWP(-16,"eax"));
	&$movekey	(&QWP(16,"eax"),"xmm0");	# offsets undo the lea above
	&$movekey	(&QWP(-16,$key),"xmm1");
	&cmp		("eax",$key);
	&ja		(&label("dec_key_inverse"));

	&$movekey	("xmm0",&QWP(0,$key));	# inverse middle
	&aesimc		("xmm0","xmm0");
	&$movekey	(&QWP(0,$key),"xmm0");

	&xor		("eax","eax");		# return success
&set_label("dec_key_ret");
	&ret	();
&function_end_B("${PREFIX}_set_decrypt_key");
# Trailer: hand the module's identification string to &asciz, then call
# &asm_finish (both are perlasm helpers defined outside this file section).
&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");

&asm_finish();