]> git.ipfire.org Git - thirdparty/openssl.git/blame - crypto/aes/asm/aesni-x86.pl
Remove filename argument to x86 asm_init.
[thirdparty/openssl.git] / crypto / aes / asm / aesni-x86.pl
CommitLineData
6aa36e8e
RS
1#! /usr/bin/env perl
2# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
d64a7232
AP
9
10# ====================================================================
d8ba0dc9 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
d64a7232
AP
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# This module implements support for Intel AES-NI extension. In
18# OpenSSL context it's used with Intel engine, but can also be used as
19# drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
20# details].
d7d119a3
AP
21#
22# Performance.
23#
24# To start with see corresponding paragraph in aesni-x86_64.pl...
25# Instead of filling table similar to one found there I've chosen to
26# summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
27# The simplified table below represents 32-bit performance relative
28# to 64-bit one in every given point. Ratios vary for different
29# encryption modes, therefore interval values.
30#
31# 16-byte 64-byte 256-byte 1-KB 8-KB
32# 53-67% 67-84% 91-94% 95-98% 97-99.5%
33#
34# Lower ratios for smaller block sizes are perfectly understandable,
35# because function call overhead is higher in 32-bit mode. Largest
36# 8-KB block performance is virtually same: 32-bit code is less than
f8501464
AP
37# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
38
39# January 2011
40#
41# See aesni-x86_64.pl for details. Unlike x86_64 version this module
42# interleaves at most 6 aes[enc|dec] instructions, because there are
43# not enough registers for 8x interleave [which should be optimal for
44# Sandy Bridge]. Actually, performance results for 6x interleave
45# factor presented in aesni-x86_64.pl (except for CTR) are for this
46# module.
47
48# April 2011
49#
50# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
51# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
d64a7232 52
bd30091c
AP
53# November 2015
54#
55# Add aesni_ocb_[en|de]crypt.
56
5599c733
AP
57######################################################################
58# Current large-block performance in cycles per byte processed with
59# 128-bit key (less is better).
60#
bd30091c 61# CBC en-/decrypt CTR XTS ECB OCB
5599c733 62# Westmere 3.77/1.37 1.37 1.52 1.27
bd30091c
AP
63# * Bridge 5.07/0.98 0.99 1.09 0.91 1.10
64# Haswell 4.44/0.80 0.97 1.03 0.72 0.76
a30b0522 65# Skylake 2.68/0.65 0.65 0.66 0.64 0.66
bd30091c 66# Silvermont 5.77/3.56 3.67 4.03 3.46 4.03
a30b0522 67# Goldmont 3.84/1.39 1.39 1.63 1.31 1.70
bd30091c 68# Bulldozer 5.80/0.98 1.05 1.24 0.93 1.23
5599c733 69
d64a7232
AP
# If $PREFIX is set to "AES", the script generates a drop-in
# replacement for crypto/aes/asm/aes-586.pl.
$PREFIX = "aesni";

# Non-zero: expand _aesni_[en|de]crypt inline where a single block is
# processed, instead of calling a separately generated subroutine.
$inline = 1;

# Make x86asm.pl loadable from the script's own directory or from
# ../../perlasm relative to it.  $dir stays undefined (and interpolates
# as an empty string) when $0 carries no directory component.
($dir) = $0 =~ m/(.*[\/\\])[^\/\\]+$/;
push(@INC, "${dir}", "${dir}../../perlasm");
require "x86asm.pl";
78
184bc45f
RL
# The last command-line argument names the output file.  STDOUT is
# aliased to it so that everything the perlasm back-end prints ends up
# there.  Fail loudly when the file cannot be opened: a silently closed
# STDOUT would make the script "succeed" while producing no assembly.
$output = pop;
defined($output) or die "$0: no output file specified\n";
open OUT, ">", $output or die "$0: can't open $output: $!\n";
*STDOUT=*OUT;

# First remaining argument is handed to the perlasm back-end.
&asm_init($ARGV[0]);
d64a7232 84
23f6eec7
AP
&external_label("OPENSSL_ia32cap_P");
&static_label("key_const");

# Instruction used to load round keys.  The same instruction is chosen
# for every value of $PREFIX.
$movekey = \&movups;

# General-purpose register assignment.
($len, $rounds, $key, $inp, $out) = ("eax", "ecx", "edx", "esi", "edi");
$rounds_ = "ebx";	# backup copy for $rounds
$key_    = "ebp";	# backup copy for $key

# XMM register assignment: two round-key registers followed by six
# data registers.
($rndkey0, $rndkey1) = ("xmm0", "xmm1");
($inout0, $inout1, $inout2) = ("xmm2", "xmm3", "xmm4");
($inout3, $inout4, $inout5) = ("xmm5", "xmm6", "xmm7");

# Alternative names sharing registers with the last three data registers.
$in1  = "xmm5";
$in0  = "xmm6";
$ivec = "xmm7";
133a7f9a 107
d900a015 108# AESNI extension
133a7f9a
AP
# Emit AESKEYGENASSIST xmm,xmm,imm8 as raw bytes (66 0F 3A DF /r ib).
#   $dst, $src - register names "xmm0".."xmm7"
#   $imm       - immediate byte appended after the ModR/M byte
# Dies on any other operand form instead of silently emitting nothing.
sub aeskeygenassist
{ my($dst,$src,$imm)=@_;
    if ("$dst:$src" =~ /\Axmm([0-7]):xmm([0-7])\z/)
    {	&data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm);	}
    else
    {	die "aeskeygenassist: unsupported operands '$dst','$src'\n";	}
}

# Emit a register-to-register instruction from the 66 0F 38 xx family.
#   $opcodelet - final opcode byte selecting the instruction
#   $dst, $src - register names "xmm0".."xmm7"
# The ModR/M byte is 0xc0|(dst<<3)|src, i.e. register-direct addressing.
# Dies on any other operand form: dropping an AES round without a
# diagnostic would yield assembly that builds but computes garbage.
sub aescommon
{ my($opcodelet,$dst,$src)=@_;
    if ("$dst:$src" =~ /\Axmm([0-7]):xmm([0-7])\z/)
    {	&data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);	}
    else
    {	die sprintf("aescommon(0x%02x): unsupported operands '%s','%s'\n",
		    $opcodelet,$dst,$src);	}
}
sub aesimc	{ aescommon(0xdb,@_); }
sub aesenc	{ aescommon(0xdc,@_); }
sub aesenclast	{ aescommon(0xdd,@_); }
sub aesdec	{ aescommon(0xde,@_); }
sub aesdeclast	{ aescommon(0xdf,@_); }
6c83629b 124\f
d608b4d6 125# Inline version of internal aesni_[en|de]crypt1
{ my $sn;	# expansion counter, keeps loop labels unique per call

# Emit an inline single-block AES operation.
#   $p     - "enc" or "dec"; selects aes${p} / aes${p}last
#   $inout - register holding the block (defaults to $inout0)
#   $ivec  - optional register: it is xored with round key 0 and the
#            result is xored into $inout, replacing the plain
#            "$inout ^= round key 0" step
# The emitted code advances $key and counts $rounds down to zero.
sub aesni_inline_generate1
{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
  # Resolve the round instructions once; unlike string eval, a failure
  # inside them propagates to the caller instead of being swallowed.
  my $aes     = \&{"aes${p}"};
  my $aeslast = \&{"aes${p}last"};
  $sn++;

    &$movekey		($rndkey0,&QWP(0,$key));
    &$movekey		($rndkey1,&QWP(16,$key));
    &xorps		($ivec,$rndkey0)	if (defined($ivec));
    &lea		($key,&DWP(32,$key));
    &xorps		($inout,$ivec)		if (defined($ivec));
    &xorps		($inout,$rndkey0)	if (!defined($ivec));
    &set_label("${p}1_loop_$sn");
	&$aes		($inout,$rndkey1);
	&dec		($rounds);
	&$movekey	($rndkey1,&QWP(0,$key));
	&lea		($key,&DWP(16,$key));
    &jnz		(&label("${p}1_loop_$sn"));
    &$aeslast		($inout,$rndkey1);
}}
d64a7232
AP
145
# Emit the private subroutine _aesni_${p}rypt1: a fully unrolled
# single-block AES operation.
#   $p     - "enc" or "dec"
#   $inout - register holding the block (defaults to $inout0)
# The emitted code compares $rounds with 11 and enters the unrolled
# sequence at label ${p}128 (below), ${p}192 (equal) or at the top
# (above), so shorter key schedules skip the leading rounds.
sub aesni_generate1	# fully unrolled loop
{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
  # Code references instead of string eval, so that errors surface.
  my $aes     = \&{"aes${p}"};
  my $aeslast = \&{"aes${p}last"};

    &function_begin_B("_aesni_${p}rypt1");
	&movups		($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey1,&QWP(0x10,$key));
	&xorps		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(0x20,$key));
	&lea		($key,&DWP(0x30,$key));
	&cmp		($rounds,11);
	&jb		(&label("${p}128"));
	&lea		($key,&DWP(0x20,$key));
	&je		(&label("${p}192"));
	&lea		($key,&DWP(0x20,$key));
	&$aes		($inout,$rndkey1);
	&$movekey	($rndkey1,&QWP(-0x40,$key));
	&$aes		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(-0x30,$key));
    &set_label("${p}192");
	&$aes		($inout,$rndkey1);
	&$movekey	($rndkey1,&QWP(-0x20,$key));
	&$aes		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(-0x10,$key));
    &set_label("${p}128");
	# Eight rounds alternating between the two key registers; each
	# register is reloaded right after it has been consumed.
	for (my $off=0; $off<0x80; $off+=0x20) {
	    &$aes	($inout,$rndkey1);
	    &$movekey	($rndkey1,&QWP($off,$key));
	    &$aes	($inout,$rndkey0);
	    &$movekey	($rndkey0,&QWP($off+0x10,$key));
	}
	&$aes		($inout,$rndkey1);
	&$aeslast	($inout,$rndkey0);
	&ret();
    &function_end_B("_aesni_${p}rypt1");
}
6c83629b 191\f
# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
#
# Both entry points share one shape and differ only in direction, so
# they are produced by a single loop: "enc" first, then "dec".
foreach my $p ("enc","dec") {
    # The out-of-line single-block helper is needed only when the
    # operation is not expanded inline.
    &aesni_generate1($p) unless ($inline);

    &function_begin_B("${PREFIX}_${p}rypt");
	&mov	("eax",&wparam(0));		# inp
	&mov	($key,&wparam(2));
	&movups	($inout0,&QWP(0,"eax"));	# load the block
	&mov	($rounds,&DWP(240,$key));
	&mov	("eax",&wparam(1));		# out
	if ($inline) {
	    &aesni_inline_generate1($p);
	}
	else {
	    &call	("_aesni_${p}rypt1");
	}
	&pxor	($rndkey0,$rndkey0);		# clear register bank
	&pxor	($rndkey1,$rndkey1);
	&movups	(&QWP(0,"eax"),$inout0);	# store the result
	&pxor	($inout0,$inout0);
	&ret	();
    &function_end_B("${PREFIX}_${p}rypt");
}
6c83629b 229
f8501464
AP
230# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
231# factor. Why 3x subroutine were originally used in loops? Even though
232# aes[enc|dec] latency was originally 6, it could be scheduled only
233# every *2nd* cycle. Thus 3x interleave was the one providing optimal
d608b4d6
AP
234# utilization, i.e. when subroutine's throughput is virtually same as
235# of non-interleaved subroutine [for number of input blocks up to 3].
214368ff
AP
236# This is why it originally made no sense to implement 2x subroutine.
237# But times change and it became appropriate to spend extra 192 bytes
238# on 2x subroutine on Atom Silvermont account. For processors that
239# can schedule aes[enc|dec] every cycle optimal interleave factor
240# equals to corresponding instructions latency. 8x is optimal for
241# * Bridge, but it's unfeasible to accommodate such implementation
242# in XMM registers addressable in 32-bit mode and therefore maximum
243# of 6x is used instead...
244
# Emit the private subroutine _aesni_${p}rypt2, processing $inout0 and
# $inout1 in an interleaved fashion.
#   $p - "enc" or "dec"
# The emitted code scales $rounds by 16, points $key at
# 32+$key+$rounds and then uses the negated $rounds (plus 16) as an
# index that is stepped by 32 per iteration until it reaches zero.
sub aesni_generate2
{ my $p=shift;
  # Code references instead of string eval, so that errors surface.
  my $aes     = \&{"aes${p}"};
  my $aeslast = \&{"aes${p}last"};

    &function_begin_B("_aesni_${p}rypt2");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&add		($rounds,16);

    &set_label("${p}2_loop");
	&$aes		($inout0,$rndkey1);
	&$aes		($inout1,$rndkey1);
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	&$aes		($inout0,$rndkey0);
	&$aes		($inout1,$rndkey0);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("${p}2_loop"));
	&$aes		($inout0,$rndkey1);
	&$aes		($inout1,$rndkey1);
	&$aeslast	($inout0,$rndkey0);
	&$aeslast	($inout1,$rndkey0);
	&ret();
    &function_end_B("_aesni_${p}rypt2");
}
f8501464 275
d64a7232
AP
# Emit the private subroutine _aesni_${p}rypt3, processing $inout0,
# $inout1 and $inout2 in an interleaved fashion.
#   $p - "enc" or "dec"
# Key-schedule addressing follows the same scheme as the 2x variant:
# $key is moved to 32+$key+16*$rounds and the negated, scaled $rounds
# serves as an index stepped by 32 until it reaches zero.
sub aesni_generate3
{ my $p=shift;
  # Code references instead of string eval, so that errors surface.
  my $aes     = \&{"aes${p}"};
  my $aeslast = \&{"aes${p}last"};

    &function_begin_B("_aesni_${p}rypt3");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&pxor		($inout2,$rndkey0);
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&add		($rounds,16);

    &set_label("${p}3_loop");
	&$aes		($inout0,$rndkey1);
	&$aes		($inout1,$rndkey1);
	&$aes		($inout2,$rndkey1);
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	&$aes		($inout0,$rndkey0);
	&$aes		($inout1,$rndkey0);
	&$aes		($inout2,$rndkey0);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("${p}3_loop"));
	&$aes		($inout0,$rndkey1);
	&$aes		($inout1,$rndkey1);
	&$aes		($inout2,$rndkey1);
	&$aeslast	($inout0,$rndkey0);
	&$aeslast	($inout1,$rndkey0);
	&$aeslast	($inout2,$rndkey0);
	&ret();
    &function_end_B("_aesni_${p}rypt3");
}
d608b4d6
AP
311
312# 4x interleave is implemented to improve small block performance,
313# most notably [and naturally] 4 block by ~30%. One can argue that one
314# should have implemented 5x as well, but improvement would be <20%,
315# so it's not worth it...
# Emit the private subroutine _aesni_${p}rypt4, processing $inout0
# through $inout3 in an interleaved fashion.
#   $p - "enc" or "dec"
# Key-schedule addressing follows the same scheme as the 2x/3x
# variants.
sub aesni_generate4
{ my $p=shift;
  # Code references instead of string eval, so that errors surface.
  my $aes     = \&{"aes${p}"};
  my $aeslast = \&{"aes${p}last"};

    &function_begin_B("_aesni_${p}rypt4");
	&$movekey	($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey1,&QWP(16,$key));
	&shl		($rounds,4);
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&pxor		($inout2,$rndkey0);
	&pxor		($inout3,$rndkey0);
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	# raw bytes 0f 1f 40 00: a 4-byte NOP, presumably padding so
	# that the loop below starts at a favourable address
	&data_byte	(0x0f,0x1f,0x40,0x00);
	&add		($rounds,16);

    &set_label("${p}4_loop");
	&$aes		($inout0,$rndkey1);
	&$aes		($inout1,$rndkey1);
	&$aes		($inout2,$rndkey1);
	&$aes		($inout3,$rndkey1);
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	&$aes		($inout0,$rndkey0);
	&$aes		($inout1,$rndkey0);
	&$aes		($inout2,$rndkey0);
	&$aes		($inout3,$rndkey0);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    &jnz		(&label("${p}4_loop"));

	&$aes		($inout0,$rndkey1);
	&$aes		($inout1,$rndkey1);
	&$aes		($inout2,$rndkey1);
	&$aes		($inout3,$rndkey1);
	&$aeslast	($inout0,$rndkey0);
	&$aeslast	($inout1,$rndkey0);
	&$aeslast	($inout2,$rndkey0);
	&$aeslast	($inout3,$rndkey0);
	&ret();
    &function_end_B("_aesni_${p}rypt4");
}
f8501464
AP
358
# Emit the private subroutine _aesni_${p}rypt6, processing $inout0
# through $inout5 in an interleaved fashion.
#   $p - "enc" or "dec"
# The first round of the first three blocks is interleaved with the
# initial key xor of the remaining ones; the code then jumps into the
# middle of the loop body at _aesni_${p}rypt6_inner.  The label
# _aesni_${p}rypt6_enter is declared static and marks the point right
# after the first half-round inside the loop.
sub aesni_generate6
{ my $p=shift;
  # Code references instead of string eval, so that errors surface.
  my $aes     = \&{"aes${p}"};
  my $aeslast = \&{"aes${p}last"};

    &function_begin_B("_aesni_${p}rypt6");
    &static_label("_aesni_${p}rypt6_enter");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);	# pxor does better here
	&pxor		($inout2,$rndkey0);
	&$aes		($inout0,$rndkey1);
	&pxor		($inout3,$rndkey0);
	&pxor		($inout4,$rndkey0);
	&$aes		($inout1,$rndkey1);
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&$aes		($inout2,$rndkey1);
	&pxor		($inout5,$rndkey0);
	&$movekey	($rndkey0,&QWP(0,$key,$rounds));
	&add		($rounds,16);
	&jmp		(&label("_aesni_${p}rypt6_inner"));

    &set_label("${p}6_loop",16);
	&$aes		($inout0,$rndkey1);
	&$aes		($inout1,$rndkey1);
	&$aes		($inout2,$rndkey1);
    &set_label("_aesni_${p}rypt6_inner");
	&$aes		($inout3,$rndkey1);
	&$aes		($inout4,$rndkey1);
	&$aes		($inout5,$rndkey1);
    &set_label("_aesni_${p}rypt6_enter");
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	&$aes		($inout0,$rndkey0);
	&$aes		($inout1,$rndkey0);
	&$aes		($inout2,$rndkey0);
	&$aes		($inout3,$rndkey0);
	&$aes		($inout4,$rndkey0);
	&$aes		($inout5,$rndkey0);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    &jnz		(&label("${p}6_loop"));

	&$aes		($inout0,$rndkey1);
	&$aes		($inout1,$rndkey1);
	&$aes		($inout2,$rndkey1);
	&$aes		($inout3,$rndkey1);
	&$aes		($inout4,$rndkey1);
	&$aes		($inout5,$rndkey1);
	&$aeslast	($inout0,$rndkey0);
	&$aeslast	($inout1,$rndkey0);
	&$aeslast	($inout2,$rndkey0);
	&$aeslast	($inout3,$rndkey0);
	&$aeslast	($inout4,$rndkey0);
	&$aeslast	($inout5,$rndkey0);
	&ret();
    &function_end_B("_aesni_${p}rypt6");
}
214368ff
AP
# Instantiate the 2x, 3x, 4x and 6x interleaved subroutines.  The
# decrypt flavour is always generated, the encrypt flavour only when
# $PREFIX is "aesni".
foreach my $factor (2,3,4,6) {
    my $generate = \&{"aesni_generate$factor"};
    &$generate("enc")	if ($PREFIX eq "aesni");
    &$generate("dec");
}
6c83629b 425\f
d64a7232 426if ($PREFIX eq "aesni") {
6c83629b 427######################################################################
d64a7232
AP
428# void aesni_ecb_encrypt (const void *in, void *out,
429# size_t length, const AES_KEY *key,
430# int enc);
d64a7232
AP
# Emit aesni_ecb_encrypt.  Arguments of the generated function, in
# wparam order: in, out, length, key, enc.  The length is rounded down
# to a multiple of 16; a zero "enc" selects the decrypt half.
#
# The encrypt and decrypt halves are identical apart from direction,
# so both are produced by one loop; emission order is "enc" half first,
# then the "dec" half (which starts at label ecb_decrypt and falls
# through into ecb_ret at its end).  Every branch is emitted through
# &jmp, including the one that used to be spelled as a bare jmp().
&function_begin("aesni_ecb_encrypt");
	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&mov	($rounds_,&wparam(4));
	&and	($len,-16);
	&jz	(&label("ecb_ret"));
	&mov	($rounds,&DWP(240,$key));
	&test	($rounds_,$rounds_);
	&jz	(&label("ecb_decrypt"));

foreach my $p ("enc","dec") {
    my @blk  = ($inout0,$inout1,$inout2,$inout3,$inout4,$inout5);
    my %word = (2=>"two", 3=>"three", 4=>"four");

    &set_label("ecb_decrypt",16)	if ($p eq "dec");
	&mov	($key_,$key);		# backup $key
	&mov	($rounds_,$rounds);	# backup $rounds
	&cmp	($len,0x60);
	&jb	(&label("ecb_${p}_tail"));

	# load the first six blocks
	for my $i (0..5) {
	    &movdqu	($blk[$i],&QWP(0x10*$i,$inp));
	}
	&lea	($inp,&DWP(0x60,$inp));
	&sub	($len,0x60);
	&jmp	(&label("ecb_${p}_loop6_enter"));

    &set_label("ecb_${p}_loop6",16);
	# store the previous six results while loading the next six
	for my $i (0..4) {
	    &movups	(&QWP(0x10*$i,$out),$blk[$i]);
	    &movdqu	($blk[$i],&QWP(0x10*$i,$inp));
	}
	&movups	(&QWP(0x50,$out),$inout5);
	&lea	($out,&DWP(0x60,$out));
	&movdqu	($inout5,&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
    &set_label("ecb_${p}_loop6_enter");

	&call	("_aesni_${p}rypt6");

	&mov	($key,$key_);		# restore $key
	&mov	($rounds,$rounds_);	# restore $rounds
	&sub	($len,0x60);
	&jnc	(&label("ecb_${p}_loop6"));

	for my $i (0..5) {
	    &movups	(&QWP(0x10*$i,$out),$blk[$i]);
	}
	&lea	($out,&DWP(0x60,$out));
	&add	($len,0x60);
	&jz	(&label("ecb_ret"));

    # one to five blocks left
    &set_label("ecb_${p}_tail");
	&movups	($inout0,&QWP(0,$inp));
	&cmp	($len,0x20);
	&jb	(&label("ecb_${p}_one"));
	&movups	($inout1,&QWP(0x10,$inp));
	&je	(&label("ecb_${p}_two"));
	&movups	($inout2,&QWP(0x20,$inp));
	&cmp	($len,0x40);
	&jb	(&label("ecb_${p}_three"));
	&movups	($inout3,&QWP(0x30,$inp));
	&je	(&label("ecb_${p}_four"));
	&movups	($inout4,&QWP(0x40,$inp));
	&xorps	($inout5,$inout5);
	&call	("_aesni_${p}rypt6");
	for my $i (0..4) {
	    &movups	(&QWP(0x10*$i,$out),$blk[$i]);
	}
	&jmp	(&label("ecb_ret"));

    &set_label("ecb_${p}_one",16);
	if ($inline)
	{   &aesni_inline_generate1($p);	}
	else
	{   &call	("_aesni_${p}rypt1");	}
	&movups	(&QWP(0,$out),$inout0);
	&jmp	(&label("ecb_ret"));

    for my $n (2..4) {
	&set_label("ecb_${p}_$word{$n}",16);
	&call	("_aesni_${p}rypt$n");
	for my $i (0..$n-1) {
	    &movups	(&QWP(0x10*$i,$out),$blk[$i]);
	}
	# the very last case of the decrypt half falls through to ecb_ret
	&jmp	(&label("ecb_ret"))	unless ($p eq "dec" && $n == 4);
    }
}

&set_label("ecb_ret");
	# clear register bank
	for my $i (0..7) {
	    &pxor	("xmm$i","xmm$i");
	}
&function_end("aesni_ecb_encrypt");
6c83629b
AP
651\f
652######################################################################
d7d119a3
AP
653# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
654# size_t blocks, const AES_KEY *key,
655# const char *ivec,char *cmac);
656#
657# Handles only complete blocks, operates on 64-bit counter and
658# does not update *ivec! Nor does it finalize CMAC value
659# (see engine/eng_aesni.c for details)
6c83629b 660#
{ my $cmac=$inout1;

# Prologue shared by both ccm64 functions: fetch the six arguments,
# set up a 16-byte aligned stack frame with the caller's %esp saved at
# 48(%esp), load ivec and cmac, fetch the round count from 240($key)
# and compose the two constants kept on the stack.
my $ccm64_prologue = sub {
	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&mov	($rounds_,&wparam(4));
	&mov	($rounds,&wparam(5));
	&mov	($key_,"esp");
	&sub	("esp",60);
	&and	("esp",-16);			# align stack
	&mov	(&DWP(48,"esp"),$key_);

	&movdqu	($ivec,&QWP(0,$rounds_));	# load ivec
	&movdqu	($cmac,&QWP(0,$rounds));	# load cmac
	&mov	($rounds,&DWP(240,$key));

	# compose byte-swap control mask for pshufb on stack
	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
	&mov	(&DWP(4,"esp"),0x08090a0b);
	&mov	(&DWP(8,"esp"),0x04050607);
	&mov	(&DWP(12,"esp"),0x00010203);

	# compose counter increment vector on stack
	&mov	($rounds_,1);
	&xor	($key_,$key_);
	&mov	(&DWP(16,"esp"),$rounds_);
	&mov	(&DWP(20,"esp"),$key_);
	&mov	(&DWP(24,"esp"),$key_);
	&mov	(&DWP(28,"esp"),$key_);
};

# Epilogue shared by both ccm64 functions: restore %esp, write cmac
# back through the sixth argument and wipe all XMM registers.
my $ccm64_epilogue = sub {
	&mov	("esp",&DWP(48,"esp"));
	&mov	($out,&wparam(5));
	&movups	(&QWP(0,$out),$cmac);

	# clear register bank
	for my $i (0..7) {
	    &pxor	("xmm$i","xmm$i");
	}
};

&function_begin("aesni_ccm64_encrypt_blocks");
	&$ccm64_prologue();

	&shl	($rounds,4);
	&mov	($rounds_,16);
	&lea	($key_,&DWP(0,$key));
	&movdqa	($inout3,&QWP(0,"esp"));
	&movdqa	($inout0,$ivec);
	&lea	($key,&DWP(32,$key,$rounds));
	&sub	($rounds_,$rounds);
	&pshufb	($ivec,$inout3);

&set_label("ccm64_enc_outer");
	&$movekey	($rndkey0,&QWP(0,$key_));
	&mov		($rounds,$rounds_);
	&movups		($in0,&QWP(0,$inp));

	&xorps		($inout0,$rndkey0);
	&$movekey	($rndkey1,&QWP(16,$key_));
	&xorps		($rndkey0,$in0);
	&xorps		($cmac,$rndkey0);		# cmac^=inp
	&$movekey	($rndkey0,&QWP(32,$key_));

&set_label("ccm64_enc2_loop");
	&aesenc		($inout0,$rndkey1);
	&aesenc		($cmac,$rndkey1);
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	&aesenc		($inout0,$rndkey0);
	&aesenc		($cmac,$rndkey0);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("ccm64_enc2_loop"));
	&aesenc		($inout0,$rndkey1);
	&aesenc		($cmac,$rndkey1);
	&paddq		($ivec,&QWP(16,"esp"));
	&dec		($len);
	&aesenclast	($inout0,$rndkey0);
	&aesenclast	($cmac,$rndkey0);

	&lea	($inp,&DWP(16,$inp));
	&xorps	($in0,$inout0);			# inp^=E(ivec)
	&movdqa	($inout0,$ivec);
	&movups	(&QWP(0,$out),$in0);		# save output
	&pshufb	($inout0,$inout3);
	&lea	($out,&DWP(16,$out));
	&jnz	(&label("ccm64_enc_outer"));

	&$ccm64_epilogue();
&function_end("aesni_ccm64_encrypt_blocks");

&function_begin("aesni_ccm64_decrypt_blocks");
	&$ccm64_prologue();

	&movdqa	($inout3,&QWP(0,"esp"));	# bswap mask
	&movdqa	($inout0,$ivec);

	&mov	($key_,$key);
	&mov	($rounds_,$rounds);

	&pshufb	($ivec,$inout3);
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}
	&shl	($rounds_,4);
	&mov	($rounds,16);
	&movups	($in0,&QWP(0,$inp));		# load inp
	&paddq	($ivec,&QWP(16,"esp"));
	&lea	($inp,&QWP(16,$inp));
	&sub	($rounds,$rounds_);
	&lea	($key,&DWP(32,$key_,$rounds_));
	&mov	($rounds_,$rounds);
	&jmp	(&label("ccm64_dec_outer"));

&set_label("ccm64_dec_outer",16);
	&xorps	($in0,$inout0);			# inp ^= E(ivec)
	&movdqa	($inout0,$ivec);
	&movups	(&QWP(0,$out),$in0);		# save output
	&lea	($out,&DWP(16,$out));
	&pshufb	($inout0,$inout3);

	&sub	($len,1);
	&jz	(&label("ccm64_dec_break"));

	&$movekey	($rndkey0,&QWP(0,$key_));
	&mov		($rounds,$rounds_);
	&$movekey	($rndkey1,&QWP(16,$key_));
	&xorps		($in0,$rndkey0);
	&xorps		($inout0,$rndkey0);
	&xorps		($cmac,$in0);		# cmac^=out
	&$movekey	($rndkey0,&QWP(32,$key_));

&set_label("ccm64_dec2_loop");
	&aesenc		($inout0,$rndkey1);
	&aesenc		($cmac,$rndkey1);
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	&aesenc		($inout0,$rndkey0);
	&aesenc		($cmac,$rndkey0);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("ccm64_dec2_loop"));
	&movups		($in0,&QWP(0,$inp));	# load inp
	&paddq		($ivec,&QWP(16,"esp"));
	&aesenc		($inout0,$rndkey1);
	&aesenc		($cmac,$rndkey1);
	&aesenclast	($inout0,$rndkey0);
	&aesenclast	($cmac,$rndkey0);
	&lea		($inp,&QWP(16,$inp));
	&jmp	(&label("ccm64_dec_outer"));

&set_label("ccm64_dec_break",16);
	&mov	($rounds,&DWP(240,$key_));
	&mov	($key,$key_);
	if ($inline)
	{   &aesni_inline_generate1("enc",$cmac,$in0);	}
	else
	{   # _aesni_encrypt1 operates on $inout0 only, and any extra
	    # argument to &call never reaches it.  Fold the last output
	    # block into cmac first, then route cmac through $inout0 so
	    # that the result matches the inline expansion above.
	    &xorps	($cmac,$in0);		# cmac^=out
	    &movdqa	($inout0,$cmac);
	    &call	("_aesni_encrypt1");
	    &movdqa	($cmac,$inout0);
	}

	&$ccm64_epilogue();
&function_end("aesni_ccm64_decrypt_blocks");
}
d7d119a3
AP
859\f
860######################################################################
6c83629b
AP
861# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
862# size_t blocks, const AES_KEY *key,
863# const char *ivec);
d7d119a3
AP
864#
865# Handles only complete blocks, operates on 32-bit counter and
d8ba0dc9 866# does not update *ivec! (see crypto/modes/ctr128.c for details)
d7d119a3 867#
f8501464
AP
868# stack layout:
869# 0 pshufb mask
870# 16 vector addend: 0,6,6,6
871# 32 counter-less ivec
872# 48 1st triplet of counter vector
873# 64 2nd triplet of counter vector
874# 80 saved %esp
875
6c83629b
AP
# Emits aesni_ctr32_encrypt_blocks. Arguments are read with wparam(0..4) into
# $inp, $out, $len (a block count), $key and, temporarily, $rounds_ (the ivec
# pointer). The caller's %esp is parked in $key_ and then stored at 80(%esp)
# of a 16-byte-aligned scratch frame (layout is listed in the header comment).
# Paths: a one-block shortcut, a six-blocks-per-iteration loop, and a tail
# that handles one to five remaining blocks.
876&function_begin("aesni_ctr32_encrypt_blocks");
877 &mov ($inp,&wparam(0));
878 &mov ($out,&wparam(1));
879 &mov ($len,&wparam(2));
880 &mov ($key,&wparam(3));
881 &mov ($rounds_,&wparam(4));
882 &mov ($key_,"esp");
f8501464 883 &sub ("esp",88);
6c83629b 884 &and ("esp",-16); # align stack
f8501464 885 &mov (&DWP(80,"esp"),$key_);
6c83629b 886
d7d119a3
AP
# exactly one block requested: skip all counter-vector setup
887 &cmp ($len,1);
888 &je (&label("ctr32_one_shortcut"));
889
f8501464 890 &movdqu ($inout5,&QWP(0,$rounds_)); # load ivec
6c83629b
AP
891
892 # compose byte-swap control mask for pshufb on stack
# (the four dwords below spell byte indices 15..0, i.e. a full 16-byte reversal)
893 &mov (&DWP(0,"esp"),0x0c0d0e0f);
894 &mov (&DWP(4,"esp"),0x08090a0b);
895 &mov (&DWP(8,"esp"),0x04050607);
896 &mov (&DWP(12,"esp"),0x00010203);
897
898 # compose counter increment vector on stack
# (dwords at 16/20/24 are 6, dword at 28 is 0; $key_ is zeroed for reuse below)
f8501464 899 &mov ($rounds,6);
6c83629b
AP
900 &xor ($key_,$key_);
901 &mov (&DWP(16,"esp"),$rounds);
902 &mov (&DWP(20,"esp"),$rounds);
903 &mov (&DWP(24,"esp"),$rounds);
904 &mov (&DWP(28,"esp"),$key_);
905
f8501464
AP
# split the ivec: dword 3 goes to $rounds_, and is replaced by 0 in $inout5
906 &pextrd ($rounds_,$inout5,3); # pull 32-bit counter
907 &pinsrd ($inout5,$key_,3); # wipe 32-bit counter
6c83629b
AP
908
909 &mov ($rounds,&DWP(240,$key)); # key->rounds
6c83629b 910
f8501464 911 # compose 2 vectors of 3x32-bit counters
# With n = byte-swapped counter: $rndkey0 = {n, n+1, n+2, 0} and
# $rndkey1 = {n+3, n+4, n+5, 0}. The un-shuffled vectors are saved at
# 48(%esp)/64(%esp); the register copies are then reversed with pshufb.
6c83629b 912 &bswap ($rounds_);
f8501464 913 &pxor ($rndkey0,$rndkey0);
d8ba0dc9 914 &pxor ($rndkey1,$rndkey1);
f8501464 915 &movdqa ($inout0,&QWP(0,"esp")); # load byte-swap mask
d8ba0dc9 916 &pinsrd ($rndkey0,$rounds_,0);
f8501464 917 &lea ($key_,&DWP(3,$rounds_));
d8ba0dc9 918 &pinsrd ($rndkey1,$key_,0);
6c83629b 919 &inc ($rounds_);
d8ba0dc9 920 &pinsrd ($rndkey0,$rounds_,1);
f8501464 921 &inc ($key_);
d8ba0dc9 922 &pinsrd ($rndkey1,$key_,1);
6c83629b 923 &inc ($rounds_);
d8ba0dc9 924 &pinsrd ($rndkey0,$rounds_,2);
f8501464 925 &inc ($key_);
d8ba0dc9
AP
926 &pinsrd ($rndkey1,$key_,2);
927 &movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet
f8501464 928 &pshufb ($rndkey0,$inout0); # byte swap
d8ba0dc9
AP
929 &movdqu ($inout4,&QWP(0,$key)); # key[0]
930 &movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet
931 &pshufb ($rndkey1,$inout0); # byte swap
f8501464 932
d8ba0dc9
AP
# After the reversal the counters sit in dwords 3,2,1 and dword 0 is zero;
# pshufd with immediate k<<6 copies dword k to the top and dword 0 elsewhere.
933 &pshufd ($inout0,$rndkey0,3<<6); # place counter to upper dword
934 &pshufd ($inout1,$rndkey0,2<<6);
f8501464
AP
# fewer than six blocks: only the tail code runs
935 &cmp ($len,6);
936 &jb (&label("ctr32_tail"));
d8ba0dc9
AP
# Bulk-path setup: $rounds_ becomes 16-16*rounds, $key_ keeps the original
# key pointer and $key is advanced by 16*rounds+32 bytes.
# NOTE(review): presumably the form _aesni_encrypt6_enter expects; that
# routine is outside this block -- confirm there.
937 &pxor ($inout5,$inout4); # counter-less ivec^key[0]
938 &shl ($rounds,4);
939 &mov ($rounds_,16);
940 &movdqa (&QWP(32,"esp"),$inout5); # save counter-less ivec^key[0]
f8501464 941 &mov ($key_,$key); # backup $key
d8ba0dc9
AP
942 &sub ($rounds_,$rounds); # backup twisted $rounds
943 &lea ($key,&DWP(32,$key,$rounds));
f8501464
AP
944 &sub ($len,6);
945 &jmp (&label("ctr32_loop6"));
946
947&set_label("ctr32_loop6",16);
d8ba0dc9
AP
948 # inlining _aesni_encrypt6's prologue gives ~6% improvement...
# Build the remaining four counter blocks, xor all six with the value saved
# at 32(%esp), and apply one aesenc with the round key at 16($key_); the
# round key at 32($key_) is preloaded into $rndkey0 before the call.
949 &pshufd ($inout2,$rndkey0,1<<6);
950 &movdqa ($rndkey0,&QWP(32,"esp")); # pull counter-less ivec
951 &pshufd ($inout3,$rndkey1,3<<6);
952 &pxor ($inout0,$rndkey0); # merge counter-less ivec
953 &pshufd ($inout4,$rndkey1,2<<6);
d7d119a3 954 &pxor ($inout1,$rndkey0);
d8ba0dc9
AP
955 &pshufd ($inout5,$rndkey1,1<<6);
956 &$movekey ($rndkey1,&QWP(16,$key_));
f8501464 957 &pxor ($inout2,$rndkey0);
f8501464 958 &pxor ($inout3,$rndkey0);
d8ba0dc9 959 &aesenc ($inout0,$rndkey1);
f8501464 960 &pxor ($inout4,$rndkey0);
f8501464 961 &pxor ($inout5,$rndkey0);
d8ba0dc9
AP
962 &aesenc ($inout1,$rndkey1);
963 &$movekey ($rndkey0,&QWP(32,$key_));
964 &mov ($rounds,$rounds_);
965 &aesenc ($inout2,$rndkey1);
966 &aesenc ($inout3,$rndkey1);
f8501464 967 &aesenc ($inout4,$rndkey1);
f8501464 968 &aesenc ($inout5,$rndkey1);
d7d119a3 969
f8501464
AP
970 &call (&label("_aesni_encrypt6_enter"));
971
# xor the six results with six input blocks and store them; interleaved with
# that, both counter triplets are advanced by the increment vector, saved
# back to the stack, byte-swapped, and the first two counter blocks of the
# next iteration are prepared in $inout0/$inout1.
972 &movups ($rndkey1,&QWP(0,$inp));
973 &movups ($rndkey0,&QWP(0x10,$inp));
974 &xorps ($inout0,$rndkey1);
975 &movups ($rndkey1,&QWP(0x20,$inp));
976 &xorps ($inout1,$rndkey0);
977 &movups (&QWP(0,$out),$inout0);
978 &movdqa ($rndkey0,&QWP(16,"esp")); # load increment
979 &xorps ($inout2,$rndkey1);
d8ba0dc9 980 &movdqa ($rndkey1,&QWP(64,"esp")); # load 2nd triplet
f8501464
AP
981 &movups (&QWP(0x10,$out),$inout1);
982 &movups (&QWP(0x20,$out),$inout2);
983
d8ba0dc9
AP
984 &paddd ($rndkey1,$rndkey0); # 2nd triplet increment
985 &paddd ($rndkey0,&QWP(48,"esp")); # 1st triplet increment
f8501464
AP
986 &movdqa ($inout0,&QWP(0,"esp")); # load byte swap mask
987
988 &movups ($inout1,&QWP(0x30,$inp));
989 &movups ($inout2,&QWP(0x40,$inp));
990 &xorps ($inout3,$inout1);
991 &movups ($inout1,&QWP(0x50,$inp));
992 &lea ($inp,&DWP(0x60,$inp));
d8ba0dc9
AP
993 &movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet
994 &pshufb ($rndkey0,$inout0); # byte swap
f8501464
AP
995 &xorps ($inout4,$inout2);
996 &movups (&QWP(0x30,$out),$inout3);
997 &xorps ($inout5,$inout1);
d8ba0dc9
AP
998 &movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet
999 &pshufb ($rndkey1,$inout0); # byte swap
f8501464 1000 &movups (&QWP(0x40,$out),$inout4);
d8ba0dc9 1001 &pshufd ($inout0,$rndkey0,3<<6);
f8501464
AP
1002 &movups (&QWP(0x50,$out),$inout5);
1003 &lea ($out,&DWP(0x60,$out));
d7d119a3 1004
d8ba0dc9 1005 &pshufd ($inout1,$rndkey0,2<<6);
f8501464
AP
# loop while the subtraction does not borrow (at least six blocks were left)
1006 &sub ($len,6);
1007 &jnc (&label("ctr32_loop6"));
6c83629b 1008
f8501464
AP
# undo the last subtraction; zero means nothing is left, otherwise restore
# $key, $rounds and the plain counter-less ivec for the tail
1009 &add ($len,6);
1010 &jz (&label("ctr32_ret"));
d8ba0dc9 1011 &movdqu ($inout5,&QWP(0,$key_));
f8501464 1012 &mov ($key,$key_);
d8ba0dc9
AP
1013 &pxor ($inout5,&QWP(32,"esp")); # restore counter-less ivec
1014 &mov ($rounds,&DWP(240,$key_)); # restore $rounds
6c83629b
AP
1015
# Tail: 1..5 blocks. Each counter block is completed by or-ing in the
# counter-less ivec from $inout5. The je after each cmp/jb pair reuses the
# flags of that cmp; only SSE instructions sit in between.
1016&set_label("ctr32_tail");
f8501464 1017 &por ($inout0,$inout5);
d7d119a3 1018 &cmp ($len,2);
6c83629b 1019 &jb (&label("ctr32_one"));
6c83629b 1020
d8ba0dc9 1021 &pshufd ($inout2,$rndkey0,1<<6);
f8501464
AP
1022 &por ($inout1,$inout5);
1023 &je (&label("ctr32_two"));
6c83629b 1024
d8ba0dc9 1025 &pshufd ($inout3,$rndkey1,3<<6);
f8501464
AP
1026 &por ($inout2,$inout5);
1027 &cmp ($len,4);
1028 &jb (&label("ctr32_three"));
1029
d8ba0dc9 1030 &pshufd ($inout4,$rndkey1,2<<6);
f8501464
AP
1031 &por ($inout3,$inout5);
1032 &je (&label("ctr32_four"));
1033
# five blocks: _aesni_encrypt6 is called, but only $inout0..$inout4 are
# xored with input and stored
1034 &por ($inout4,$inout5);
1035 &call ("_aesni_encrypt6");
1036 &movups ($rndkey1,&QWP(0,$inp));
1037 &movups ($rndkey0,&QWP(0x10,$inp));
1038 &xorps ($inout0,$rndkey1);
1039 &movups ($rndkey1,&QWP(0x20,$inp));
1040 &xorps ($inout1,$rndkey0);
1041 &movups ($rndkey0,&QWP(0x30,$inp));
1042 &xorps ($inout2,$rndkey1);
1043 &movups ($rndkey1,&QWP(0x40,$inp));
1044 &xorps ($inout3,$rndkey0);
1045 &movups (&QWP(0,$out),$inout0);
1046 &xorps ($inout4,$rndkey1);
1047 &movups (&QWP(0x10,$out),$inout1);
1048 &movups (&QWP(0x20,$out),$inout2);
1049 &movups (&QWP(0x30,$out),$inout3);
1050 &movups (&QWP(0x40,$out),$inout4);
6c83629b
AP
1051 &jmp (&label("ctr32_ret"));
1052
# one-block shortcut: $rounds_ still holds the ivec pointer here, so the
# ivec itself is the only counter block
d7d119a3 1053&set_label("ctr32_one_shortcut",16);
f8501464 1054 &movups ($inout0,&QWP(0,$rounds_)); # load ivec
d7d119a3 1055 &mov ($rounds,&DWP(240,$key));
609b0852 1056
d7d119a3 1057&set_label("ctr32_one");
6c83629b
AP
1058 if ($inline)
1059 { &aesni_inline_generate1("enc"); }
1060 else
1061 { &call ("_aesni_encrypt1"); }
f8501464
AP
1062 &movups ($in0,&QWP(0,$inp));
1063 &xorps ($in0,$inout0);
1064 &movups (&QWP(0,$out),$in0);
6c83629b 1065 &jmp (&label("ctr32_ret"));
d64a7232 1066
6c83629b 1067&set_label("ctr32_two",16);
214368ff 1068 &call ("_aesni_encrypt2");
f8501464
AP
1069 &movups ($inout3,&QWP(0,$inp));
1070 &movups ($inout4,&QWP(0x10,$inp));
1071 &xorps ($inout0,$inout3);
1072 &xorps ($inout1,$inout4);
1073 &movups (&QWP(0,$out),$inout0);
1074 &movups (&QWP(0x10,$out),$inout1);
6c83629b
AP
1075 &jmp (&label("ctr32_ret"));
1076
1077&set_label("ctr32_three",16);
1078 &call ("_aesni_encrypt3");
f8501464
AP
1079 &movups ($inout3,&QWP(0,$inp));
1080 &movups ($inout4,&QWP(0x10,$inp));
1081 &xorps ($inout0,$inout3);
1082 &movups ($inout5,&QWP(0x20,$inp));
1083 &xorps ($inout1,$inout4);
1084 &movups (&QWP(0,$out),$inout0);
1085 &xorps ($inout2,$inout5);
1086 &movups (&QWP(0x10,$out),$inout1);
1087 &movups (&QWP(0x20,$out),$inout2);
1088 &jmp (&label("ctr32_ret"));
1089
1090&set_label("ctr32_four",16);
1091 &call ("_aesni_encrypt4");
1092 &movups ($inout4,&QWP(0,$inp));
1093 &movups ($inout5,&QWP(0x10,$inp));
1094 &movups ($rndkey1,&QWP(0x20,$inp));
1095 &xorps ($inout0,$inout4);
1096 &movups ($rndkey0,&QWP(0x30,$inp));
1097 &xorps ($inout1,$inout5);
1098 &movups (&QWP(0,$out),$inout0);
1099 &xorps ($inout2,$rndkey1);
1100 &movups (&QWP(0x10,$out),$inout1);
1101 &xorps ($inout3,$rndkey0);
1102 &movups (&QWP(0x20,$out),$inout2);
1103 &movups (&QWP(0x30,$out),$inout3);
6c83629b
AP
1104
# Common exit: zero xmm0-xmm7 and the stack slots at 32/48/64(%esp) (saved
# ivec and counter triplets), then reload the caller's %esp from 80(%esp).
1105&set_label("ctr32_ret");
23f6eec7
AP
1106 &pxor ("xmm0","xmm0"); # clear register bank
1107 &pxor ("xmm1","xmm1");
1108 &pxor ("xmm2","xmm2");
1109 &pxor ("xmm3","xmm3");
1110 &pxor ("xmm4","xmm4");
1111 &movdqa (&QWP(32,"esp"),"xmm0"); # clear stack
1112 &pxor ("xmm5","xmm5");
1113 &movdqa (&QWP(48,"esp"),"xmm0");
1114 &pxor ("xmm6","xmm6");
1115 &movdqa (&QWP(64,"esp"),"xmm0");
1116 &pxor ("xmm7","xmm7");
f8501464 1117 &mov ("esp",&DWP(80,"esp"));
6c83629b 1118&function_end("aesni_ctr32_encrypt_blocks");
f8501464
AP
1119\f
1120######################################################################
1121# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1122# const AES_KEY *key1, const AES_KEY *key2,
1123# const unsigned char iv[16]);
1124#
1125{ my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1);
1126
# Emits aesni_xts_encrypt. The 16 bytes at wparam(5) are first encrypted
# under key2 (wparam(4)) to form the initial tweak; the data at wparam(0) is
# then processed under key1 (wparam(3)) into wparam(1), six blocks per loop
# iteration, followed by a 1..5 block tail and, when len is not a multiple
# of 16, a byte-swapping "steal" step on the last full output block.
#
# Scratch frame used below (16-byte aligned):
#   16*0 .. 16*5  per-block tweaks
#   16*6          constant dwords 0x87,0,1,0
#   16*7+0        saved $len (later len%16)
#   16*7+4        caller's %esp
1127&function_begin("aesni_xts_encrypt");
1128 &mov ($key,&wparam(4)); # key2
1129 &mov ($inp,&wparam(5)); # clear-text tweak
1130
1131 &mov ($rounds,&DWP(240,$key)); # key2->rounds
1132 &movups ($inout0,&QWP(0,$inp));
1133 if ($inline)
1134 { &aesni_inline_generate1("enc"); }
1135 else
1136 { &call ("_aesni_encrypt1"); }
1137
1138 &mov ($inp,&wparam(0));
1139 &mov ($out,&wparam(1));
1140 &mov ($len,&wparam(2));
1141 &mov ($key,&wparam(3)); # key1
1142
1143 &mov ($key_,"esp");
1144 &sub ("esp",16*7+8);
1145 &mov ($rounds,&DWP(240,$key)); # key1->rounds
1146 &and ("esp",-16); # align stack
1147
1148 &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
1149 &mov (&DWP(16*6+4,"esp"),0);
1150 &mov (&DWP(16*6+8,"esp"),1);
1151 &mov (&DWP(16*6+12,"esp"),0);
1152 &mov (&DWP(16*7+0,"esp"),$len); # save original $len
1153 &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp
1154
# $tweak = encrypted iv; $twtmp gets all-ones in every dword of $tweak whose
# top bit is set (signed compare of 0 against $tweak)
1155 &movdqa ($tweak,$inout0);
1156 &pxor ($twtmp,$twtmp);
1157 &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
1158 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1159
# round len down to whole blocks; fewer than six blocks goes to the tail
1160 &and ($len,-16);
1161 &mov ($key_,$key); # backup $key
1162 &mov ($rounds_,$rounds); # backup $rounds
1163 &sub ($len,16*6);
1164 &jc (&label("xts_enc_short"));
1165
d8ba0dc9
AP
# same bulk-path setup as in the ctr32 loop: $rounds_ = 16-16*rounds and
# $key advanced by 16*rounds+32 bytes
1166 &shl ($rounds,4);
1167 &mov ($rounds_,16);
1168 &sub ($rounds_,$rounds);
1169 &lea ($key,&DWP(32,$key,$rounds));
f8501464
AP
1170 &jmp (&label("xts_enc_loop6"));
1171
1172&set_label("xts_enc_loop6",16);
# The Perl loop emits four copies of the tweak step: store the current tweak
# at 16*$i(%esp), double each 64-bit half with paddq, and xor in the masked
# carries (0x87 into the low dword when bit 127 was set, 1 into dword 2 when
# bit 63 was set). The fifth tweak is stored at 16*4 and the sixth is built
# in $inout5 and saved at 16*5 further down.
1173 for ($i=0;$i<4;$i++) {
1174 &pshufd ($twres,$twtmp,0x13);
1175 &pxor ($twtmp,$twtmp);
1176 &movdqa (&QWP(16*$i,"esp"),$tweak);
1177 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1178 &pand ($twres,$twmask); # isolate carry and residue
1179 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1180 &pxor ($tweak,$twres);
1181 }
1182 &pshufd ($inout5,$twtmp,0x13);
1183 &movdqa (&QWP(16*$i++,"esp"),$tweak);
1184 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1185 &$movekey ($rndkey0,&QWP(0,$key_));
1186 &pand ($inout5,$twmask); # isolate carry and residue
1187 &movups ($inout0,&QWP(0,$inp)); # load input
1188 &pxor ($inout5,$tweak);
1189
1190 # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
d8ba0dc9 1191 &mov ($rounds,$rounds_); # restore $rounds
f8501464
AP
1192 &movdqu ($inout1,&QWP(16*1,$inp));
1193 &xorps ($inout0,$rndkey0); # input^=rndkey[0]
1194 &movdqu ($inout2,&QWP(16*2,$inp));
1195 &pxor ($inout1,$rndkey0);
1196 &movdqu ($inout3,&QWP(16*3,$inp));
1197 &pxor ($inout2,$rndkey0);
1198 &movdqu ($inout4,&QWP(16*4,$inp));
1199 &pxor ($inout3,$rndkey0);
1200 &movdqu ($rndkey1,&QWP(16*5,$inp));
1201 &pxor ($inout4,$rndkey0);
1202 &lea ($inp,&DWP(16*6,$inp));
1203 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1204 &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
1205 &pxor ($inout5,$rndkey1);
1206
1207 &$movekey ($rndkey1,&QWP(16,$key_));
f8501464 1208 &pxor ($inout1,&QWP(16*1,"esp"));
f8501464 1209 &pxor ($inout2,&QWP(16*2,"esp"));
d8ba0dc9 1210 &aesenc ($inout0,$rndkey1);
f8501464 1211 &pxor ($inout3,&QWP(16*3,"esp"));
f8501464 1212 &pxor ($inout4,&QWP(16*4,"esp"));
d8ba0dc9 1213 &aesenc ($inout1,$rndkey1);
f8501464 1214 &pxor ($inout5,$rndkey0);
d8ba0dc9
AP
1215 &$movekey ($rndkey0,&QWP(32,$key_));
1216 &aesenc ($inout2,$rndkey1);
1217 &aesenc ($inout3,$rndkey1);
f8501464 1218 &aesenc ($inout4,$rndkey1);
f8501464
AP
1219 &aesenc ($inout5,$rndkey1);
1220 &call (&label("_aesni_encrypt6_enter"));
1221
# xor each result with its saved tweak and store; interleaved with that, the
# tweak for the next iteration is derived from the sixth one
1222 &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
1223 &pxor ($twtmp,$twtmp);
1224 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1225 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1226 &xorps ($inout1,&QWP(16*1,"esp"));
1227 &movups (&QWP(16*0,$out),$inout0); # write output
1228 &xorps ($inout2,&QWP(16*2,"esp"));
1229 &movups (&QWP(16*1,$out),$inout1);
1230 &xorps ($inout3,&QWP(16*3,"esp"));
1231 &movups (&QWP(16*2,$out),$inout2);
1232 &xorps ($inout4,&QWP(16*4,"esp"));
1233 &movups (&QWP(16*3,$out),$inout3);
1234 &xorps ($inout5,$tweak);
1235 &movups (&QWP(16*4,$out),$inout4);
1236 &pshufd ($twres,$twtmp,0x13);
1237 &movups (&QWP(16*5,$out),$inout5);
1238 &lea ($out,&DWP(16*6,$out));
1239 &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
1240
1241 &pxor ($twtmp,$twtmp);
1242 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1243 &pand ($twres,$twmask); # isolate carry and residue
1244 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
f8501464
AP
1245 &pxor ($tweak,$twres);
1246
1247 &sub ($len,16*6);
1248 &jnc (&label("xts_enc_loop6"));
1249
d8ba0dc9 1250 &mov ($rounds,&DWP(240,$key_)); # restore $rounds
f8501464
AP
1251 &mov ($key,$key_); # restore $key
1252 &mov ($rounds_,$rounds);
1253
# Tail: undo the last subtraction. Zero whole blocks left goes to done6x;
# otherwise dispatch on 16/32/48/64/80 remaining bytes. Each je reuses the
# flags of the preceding cmp; only SSE instructions sit in between.
1254&set_label("xts_enc_short");
1255 &add ($len,16*6);
1256 &jz (&label("xts_enc_done6x"));
1257
1258 &movdqa ($inout3,$tweak); # put aside previous tweak
1259 &cmp ($len,0x20);
1260 &jb (&label("xts_enc_one"));
1261
1262 &pshufd ($twres,$twtmp,0x13);
1263 &pxor ($twtmp,$twtmp);
1264 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1265 &pand ($twres,$twmask); # isolate carry and residue
1266 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1267 &pxor ($tweak,$twres);
1268 &je (&label("xts_enc_two"));
1269
1270 &pshufd ($twres,$twtmp,0x13);
1271 &pxor ($twtmp,$twtmp);
1272 &movdqa ($inout4,$tweak); # put aside previous tweak
1273 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1274 &pand ($twres,$twmask); # isolate carry and residue
1275 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1276 &pxor ($tweak,$twres);
1277 &cmp ($len,0x40);
1278 &jb (&label("xts_enc_three"));
1279
1280 &pshufd ($twres,$twtmp,0x13);
1281 &pxor ($twtmp,$twtmp);
1282 &movdqa ($inout5,$tweak); # put aside previous tweak
1283 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1284 &pand ($twres,$twmask); # isolate carry and residue
1285 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1286 &pxor ($tweak,$twres);
1287 &movdqa (&QWP(16*0,"esp"),$inout3);
1288 &movdqa (&QWP(16*1,"esp"),$inout4);
1289 &je (&label("xts_enc_four"));
1290
# five blocks: tweaks 0..3 live at 16*0..16*3(%esp), the fifth is built in
# $inout5 and saved at 16*4; _aesni_encrypt6 is called but only
# $inout0..$inout4 are written out
1291 &movdqa (&QWP(16*2,"esp"),$inout5);
1292 &pshufd ($inout5,$twtmp,0x13);
1293 &movdqa (&QWP(16*3,"esp"),$tweak);
1294 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1295 &pand ($inout5,$twmask); # isolate carry and residue
1296 &pxor ($inout5,$tweak);
1297
1298 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
1299 &movdqu ($inout1,&QWP(16*1,$inp));
1300 &movdqu ($inout2,&QWP(16*2,$inp));
1301 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1302 &movdqu ($inout3,&QWP(16*3,$inp));
1303 &pxor ($inout1,&QWP(16*1,"esp"));
1304 &movdqu ($inout4,&QWP(16*4,$inp));
1305 &pxor ($inout2,&QWP(16*2,"esp"));
1306 &lea ($inp,&DWP(16*5,$inp));
1307 &pxor ($inout3,&QWP(16*3,"esp"));
1308 &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
1309 &pxor ($inout4,$inout5);
1310
1311 &call ("_aesni_encrypt6");
1312
1313 &movaps ($tweak,&QWP(16*4,"esp")); # last tweak
1314 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1315 &xorps ($inout1,&QWP(16*1,"esp"));
1316 &xorps ($inout2,&QWP(16*2,"esp"));
1317 &movups (&QWP(16*0,$out),$inout0); # write output
1318 &xorps ($inout3,&QWP(16*3,"esp"));
1319 &movups (&QWP(16*1,$out),$inout1);
1320 &xorps ($inout4,$tweak);
1321 &movups (&QWP(16*2,$out),$inout2);
1322 &movups (&QWP(16*3,$out),$inout3);
1323 &movups (&QWP(16*4,$out),$inout4);
1324 &lea ($out,&DWP(16*5,$out));
1325 &jmp (&label("xts_enc_done"));
1326
# one block: tweak is in $inout3
1327&set_label("xts_enc_one",16);
1328 &movups ($inout0,&QWP(16*0,$inp)); # load input
1329 &lea ($inp,&DWP(16*1,$inp));
1330 &xorps ($inout0,$inout3); # input^=tweak
1331 if ($inline)
1332 { &aesni_inline_generate1("enc"); }
1333 else
1334 { &call ("_aesni_encrypt1"); }
1335 &xorps ($inout0,$inout3); # output^=tweak
1336 &movups (&QWP(16*0,$out),$inout0); # write output
1337 &lea ($out,&DWP(16*1,$out));
1338
1339 &movdqa ($tweak,$inout3); # last tweak
1340 &jmp (&label("xts_enc_done"));
1341
# two blocks: tweaks in $inout3 and $inout4
1342&set_label("xts_enc_two",16);
1343 &movaps ($inout4,$tweak); # put aside last tweak
1344
1345 &movups ($inout0,&QWP(16*0,$inp)); # load input
1346 &movups ($inout1,&QWP(16*1,$inp));
1347 &lea ($inp,&DWP(16*2,$inp));
1348 &xorps ($inout0,$inout3); # input^=tweak
1349 &xorps ($inout1,$inout4);
f8501464 1350
214368ff 1351 &call ("_aesni_encrypt2");
f8501464
AP
1352
1353 &xorps ($inout0,$inout3); # output^=tweak
1354 &xorps ($inout1,$inout4);
1355 &movups (&QWP(16*0,$out),$inout0); # write output
1356 &movups (&QWP(16*1,$out),$inout1);
1357 &lea ($out,&DWP(16*2,$out));
1358
1359 &movdqa ($tweak,$inout4); # last tweak
1360 &jmp (&label("xts_enc_done"));
1361
# three blocks: tweaks in $inout3, $inout4 and $inout5
1362&set_label("xts_enc_three",16);
1363 &movaps ($inout5,$tweak); # put aside last tweak
1364 &movups ($inout0,&QWP(16*0,$inp)); # load input
1365 &movups ($inout1,&QWP(16*1,$inp));
1366 &movups ($inout2,&QWP(16*2,$inp));
1367 &lea ($inp,&DWP(16*3,$inp));
1368 &xorps ($inout0,$inout3); # input^=tweak
1369 &xorps ($inout1,$inout4);
1370 &xorps ($inout2,$inout5);
1371
1372 &call ("_aesni_encrypt3");
1373
1374 &xorps ($inout0,$inout3); # output^=tweak
1375 &xorps ($inout1,$inout4);
1376 &xorps ($inout2,$inout5);
1377 &movups (&QWP(16*0,$out),$inout0); # write output
1378 &movups (&QWP(16*1,$out),$inout1);
1379 &movups (&QWP(16*2,$out),$inout2);
1380 &lea ($out,&DWP(16*3,$out));
1381
1382 &movdqa ($tweak,$inout5); # last tweak
1383 &jmp (&label("xts_enc_done"));
1384
# four blocks: first two tweaks were spilled to 16*0/16*1(%esp) because
# $inout3 is needed for data; the other two are in $inout5 and $inout4
1385&set_label("xts_enc_four",16);
1386 &movaps ($inout4,$tweak); # put aside last tweak
1387
1388 &movups ($inout0,&QWP(16*0,$inp)); # load input
1389 &movups ($inout1,&QWP(16*1,$inp));
1390 &movups ($inout2,&QWP(16*2,$inp));
1391 &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
1392 &movups ($inout3,&QWP(16*3,$inp));
1393 &lea ($inp,&DWP(16*4,$inp));
1394 &xorps ($inout1,&QWP(16*1,"esp"));
1395 &xorps ($inout2,$inout5);
1396 &xorps ($inout3,$inout4);
1397
1398 &call ("_aesni_encrypt4");
1399
1400 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1401 &xorps ($inout1,&QWP(16*1,"esp"));
1402 &xorps ($inout2,$inout5);
1403 &movups (&QWP(16*0,$out),$inout0); # write output
1404 &xorps ($inout3,$inout4);
1405 &movups (&QWP(16*1,$out),$inout1);
1406 &movups (&QWP(16*2,$out),$inout2);
1407 &movups (&QWP(16*3,$out),$inout3);
1408 &lea ($out,&DWP(16*4,$out));
1409
1410 &movdqa ($tweak,$inout4); # last tweak
1411 &jmp (&label("xts_enc_done"));
1412
# reached with no whole blocks pending: $tweak is already the next unused
# tweak, so it is copied to $inout3 unchanged for the steal step
1413&set_label("xts_enc_done6x",16); # $tweak is pre-calculated
1414 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1415 &and ($len,15);
1416 &jz (&label("xts_enc_ret"));
1417 &movdqa ($inout3,$tweak);
1418 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1419 &jmp (&label("xts_enc_steal"));
1420
# reached from the tail paths: $tweak is the last tweak that was used, so
# the next one is computed into $inout3 if a partial block remains
1421&set_label("xts_enc_done",16);
1422 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1423 &pxor ($twtmp,$twtmp);
1424 &and ($len,15);
1425 &jz (&label("xts_enc_ret"));
1426
1427 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1428 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1429 &pshufd ($inout3,$twtmp,0x13);
1430 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1431 &pand ($inout3,&QWP(16*6,"esp")); # isolate carry and residue
1432 &pxor ($inout3,$tweak);
1433
# Byte loop over the len%16 trailing bytes: each input byte replaces the
# corresponding byte of the previous output block (at -16($out)), and the
# byte it displaces is written to the current output position. $rounds and
# $key serve as byte scratch here and are restored afterwards.
1434&set_label("xts_enc_steal");
1435 &movz ($rounds,&BP(0,$inp));
1436 &movz ($key,&BP(-16,$out));
1437 &lea ($inp,&DWP(1,$inp));
1438 &mov (&BP(-16,$out),&LB($rounds));
1439 &mov (&BP(0,$out),&LB($key));
1440 &lea ($out,&DWP(1,$out));
1441 &sub ($len,1);
1442 &jnz (&label("xts_enc_steal"));
1443
1444 &sub ($out,&DWP(16*7+0,"esp")); # rewind $out
1445 &mov ($key,$key_); # restore $key
1446 &mov ($rounds,$rounds_); # restore $rounds
1447
# encrypt the patched block at -16($out) in place with the tweak in $inout3
1448 &movups ($inout0,&QWP(-16,$out)); # load input
1449 &xorps ($inout0,$inout3); # input^=tweak
1450 if ($inline)
1451 { &aesni_inline_generate1("enc"); }
1452 else
1453 { &call ("_aesni_encrypt1"); }
1454 &xorps ($inout0,$inout3); # output^=tweak
1455 &movups (&QWP(-16,$out),$inout0); # write output
1456
# Common exit: zero xmm0-xmm7 and the six tweak slots on the stack, then
# reload the caller's %esp from 16*7+4(%esp).
1457&set_label("xts_enc_ret");
23f6eec7
AP
1458 &pxor ("xmm0","xmm0"); # clear register bank
1459 &pxor ("xmm1","xmm1");
1460 &pxor ("xmm2","xmm2");
1461 &movdqa (&QWP(16*0,"esp"),"xmm0"); # clear stack
1462 &pxor ("xmm3","xmm3");
1463 &movdqa (&QWP(16*1,"esp"),"xmm0");
1464 &pxor ("xmm4","xmm4");
1465 &movdqa (&QWP(16*2,"esp"),"xmm0");
1466 &pxor ("xmm5","xmm5");
1467 &movdqa (&QWP(16*3,"esp"),"xmm0");
1468 &pxor ("xmm6","xmm6");
1469 &movdqa (&QWP(16*4,"esp"),"xmm0");
1470 &pxor ("xmm7","xmm7");
1471 &movdqa (&QWP(16*5,"esp"),"xmm0");
f8501464
AP
1472 &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
1473&function_end("aesni_xts_encrypt");
1474
1475&function_begin("aesni_xts_decrypt");
1476 &mov ($key,&wparam(4)); # key2
1477 &mov ($inp,&wparam(5)); # clear-text tweak
1478
1479 &mov ($rounds,&DWP(240,$key)); # key2->rounds
1480 &movups ($inout0,&QWP(0,$inp));
1481 if ($inline)
1482 { &aesni_inline_generate1("enc"); }
1483 else
1484 { &call ("_aesni_encrypt1"); }
1485
1486 &mov ($inp,&wparam(0));
1487 &mov ($out,&wparam(1));
1488 &mov ($len,&wparam(2));
1489 &mov ($key,&wparam(3)); # key1
1490
1491 &mov ($key_,"esp");
1492 &sub ("esp",16*7+8);
1493 &and ("esp",-16); # align stack
1494
1495 &xor ($rounds_,$rounds_); # if(len%16) len-=16;
1496 &test ($len,15);
1497 &setnz (&LB($rounds_));
1498 &shl ($rounds_,4);
1499 &sub ($len,$rounds_);
1500
1501 &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
1502 &mov (&DWP(16*6+4,"esp"),0);
1503 &mov (&DWP(16*6+8,"esp"),1);
1504 &mov (&DWP(16*6+12,"esp"),0);
1505 &mov (&DWP(16*7+0,"esp"),$len); # save original $len
1506 &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp
1507
1508 &mov ($rounds,&DWP(240,$key)); # key1->rounds
1509 &mov ($key_,$key); # backup $key
1510 &mov ($rounds_,$rounds); # backup $rounds
1511
1512 &movdqa ($tweak,$inout0);
1513 &pxor ($twtmp,$twtmp);
1514 &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
1515 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1516
1517 &and ($len,-16);
1518 &sub ($len,16*6);
1519 &jc (&label("xts_dec_short"));
1520
d8ba0dc9
AP
1521 &shl ($rounds,4);
1522 &mov ($rounds_,16);
1523 &sub ($rounds_,$rounds);
1524 &lea ($key,&DWP(32,$key,$rounds));
f8501464
AP
1525 &jmp (&label("xts_dec_loop6"));
1526
1527&set_label("xts_dec_loop6",16);
1528 for ($i=0;$i<4;$i++) {
1529 &pshufd ($twres,$twtmp,0x13);
1530 &pxor ($twtmp,$twtmp);
1531 &movdqa (&QWP(16*$i,"esp"),$tweak);
1532 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1533 &pand ($twres,$twmask); # isolate carry and residue
1534 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1535 &pxor ($tweak,$twres);
1536 }
1537 &pshufd ($inout5,$twtmp,0x13);
1538 &movdqa (&QWP(16*$i++,"esp"),$tweak);
1539 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1540 &$movekey ($rndkey0,&QWP(0,$key_));
1541 &pand ($inout5,$twmask); # isolate carry and residue
1542 &movups ($inout0,&QWP(0,$inp)); # load input
1543 &pxor ($inout5,$tweak);
1544
1545 # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
d8ba0dc9 1546 &mov ($rounds,$rounds_);
f8501464
AP
1547 &movdqu ($inout1,&QWP(16*1,$inp));
1548 &xorps ($inout0,$rndkey0); # input^=rndkey[0]
1549 &movdqu ($inout2,&QWP(16*2,$inp));
1550 &pxor ($inout1,$rndkey0);
1551 &movdqu ($inout3,&QWP(16*3,$inp));
1552 &pxor ($inout2,$rndkey0);
1553 &movdqu ($inout4,&QWP(16*4,$inp));
1554 &pxor ($inout3,$rndkey0);
1555 &movdqu ($rndkey1,&QWP(16*5,$inp));
1556 &pxor ($inout4,$rndkey0);
1557 &lea ($inp,&DWP(16*6,$inp));
1558 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1559 &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
1560 &pxor ($inout5,$rndkey1);
1561
1562 &$movekey ($rndkey1,&QWP(16,$key_));
f8501464 1563 &pxor ($inout1,&QWP(16*1,"esp"));
f8501464 1564 &pxor ($inout2,&QWP(16*2,"esp"));
d8ba0dc9 1565 &aesdec ($inout0,$rndkey1);
f8501464 1566 &pxor ($inout3,&QWP(16*3,"esp"));
f8501464 1567 &pxor ($inout4,&QWP(16*4,"esp"));
d8ba0dc9 1568 &aesdec ($inout1,$rndkey1);
f8501464 1569 &pxor ($inout5,$rndkey0);
d8ba0dc9
AP
1570 &$movekey ($rndkey0,&QWP(32,$key_));
1571 &aesdec ($inout2,$rndkey1);
1572 &aesdec ($inout3,$rndkey1);
f8501464 1573 &aesdec ($inout4,$rndkey1);
f8501464
AP
1574 &aesdec ($inout5,$rndkey1);
1575 &call (&label("_aesni_decrypt6_enter"));
1576
1577 &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
1578 &pxor ($twtmp,$twtmp);
1579 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1580 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1581 &xorps ($inout1,&QWP(16*1,"esp"));
1582 &movups (&QWP(16*0,$out),$inout0); # write output
1583 &xorps ($inout2,&QWP(16*2,"esp"));
1584 &movups (&QWP(16*1,$out),$inout1);
1585 &xorps ($inout3,&QWP(16*3,"esp"));
1586 &movups (&QWP(16*2,$out),$inout2);
1587 &xorps ($inout4,&QWP(16*4,"esp"));
1588 &movups (&QWP(16*3,$out),$inout3);
1589 &xorps ($inout5,$tweak);
1590 &movups (&QWP(16*4,$out),$inout4);
1591 &pshufd ($twres,$twtmp,0x13);
1592 &movups (&QWP(16*5,$out),$inout5);
1593 &lea ($out,&DWP(16*6,$out));
1594 &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
1595
1596 &pxor ($twtmp,$twtmp);
1597 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1598 &pand ($twres,$twmask); # isolate carry and residue
1599 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
f8501464
AP
1600 &pxor ($tweak,$twres);
1601
1602 &sub ($len,16*6);
1603 &jnc (&label("xts_dec_loop6"));
1604
d8ba0dc9 1605 &mov ($rounds,&DWP(240,$key_)); # restore $rounds
f8501464
AP
1606 &mov ($key,$key_); # restore $key
1607 &mov ($rounds_,$rounds);
1608
1609&set_label("xts_dec_short");
1610 &add ($len,16*6);
1611 &jz (&label("xts_dec_done6x"));
1612
1613 &movdqa ($inout3,$tweak); # put aside previous tweak
1614 &cmp ($len,0x20);
1615 &jb (&label("xts_dec_one"));
1616
1617 &pshufd ($twres,$twtmp,0x13);
1618 &pxor ($twtmp,$twtmp);
1619 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1620 &pand ($twres,$twmask); # isolate carry and residue
1621 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1622 &pxor ($tweak,$twres);
1623 &je (&label("xts_dec_two"));
1624
1625 &pshufd ($twres,$twtmp,0x13);
1626 &pxor ($twtmp,$twtmp);
1627 &movdqa ($inout4,$tweak); # put aside previous tweak
1628 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1629 &pand ($twres,$twmask); # isolate carry and residue
1630 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1631 &pxor ($tweak,$twres);
1632 &cmp ($len,0x40);
1633 &jb (&label("xts_dec_three"));
1634
1635 &pshufd ($twres,$twtmp,0x13);
1636 &pxor ($twtmp,$twtmp);
1637 &movdqa ($inout5,$tweak); # put aside previous tweak
1638 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1639 &pand ($twres,$twmask); # isolate carry and residue
1640 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1641 &pxor ($tweak,$twres);
1642 &movdqa (&QWP(16*0,"esp"),$inout3);
1643 &movdqa (&QWP(16*1,"esp"),$inout4);
1644 &je (&label("xts_dec_four"));
1645
1646 &movdqa (&QWP(16*2,"esp"),$inout5);
1647 &pshufd ($inout5,$twtmp,0x13);
1648 &movdqa (&QWP(16*3,"esp"),$tweak);
1649 &paddq ($tweak,$tweak); # &psllq($inout0,1);
1650 &pand ($inout5,$twmask); # isolate carry and residue
1651 &pxor ($inout5,$tweak);
1652
1653 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
1654 &movdqu ($inout1,&QWP(16*1,$inp));
1655 &movdqu ($inout2,&QWP(16*2,$inp));
1656 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1657 &movdqu ($inout3,&QWP(16*3,$inp));
1658 &pxor ($inout1,&QWP(16*1,"esp"));
1659 &movdqu ($inout4,&QWP(16*4,$inp));
1660 &pxor ($inout2,&QWP(16*2,"esp"));
1661 &lea ($inp,&DWP(16*5,$inp));
1662 &pxor ($inout3,&QWP(16*3,"esp"));
1663 &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
1664 &pxor ($inout4,$inout5);
1665
1666 &call ("_aesni_decrypt6");
1667
1668 &movaps ($tweak,&QWP(16*4,"esp")); # last tweak
1669 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1670 &xorps ($inout1,&QWP(16*1,"esp"));
1671 &xorps ($inout2,&QWP(16*2,"esp"));
1672 &movups (&QWP(16*0,$out),$inout0); # write output
1673 &xorps ($inout3,&QWP(16*3,"esp"));
1674 &movups (&QWP(16*1,$out),$inout1);
1675 &xorps ($inout4,$tweak);
1676 &movups (&QWP(16*2,$out),$inout2);
1677 &movups (&QWP(16*3,$out),$inout3);
1678 &movups (&QWP(16*4,$out),$inout4);
1679 &lea ($out,&DWP(16*5,$out));
1680 &jmp (&label("xts_dec_done"));
1681
1682&set_label("xts_dec_one",16);
1683 &movups ($inout0,&QWP(16*0,$inp)); # load input
1684 &lea ($inp,&DWP(16*1,$inp));
1685 &xorps ($inout0,$inout3); # input^=tweak
1686 if ($inline)
1687 { &aesni_inline_generate1("dec"); }
1688 else
1689 { &call ("_aesni_decrypt1"); }
1690 &xorps ($inout0,$inout3); # output^=tweak
1691 &movups (&QWP(16*0,$out),$inout0); # write output
1692 &lea ($out,&DWP(16*1,$out));
1693
1694 &movdqa ($tweak,$inout3); # last tweak
1695 &jmp (&label("xts_dec_done"));
1696
1697&set_label("xts_dec_two",16);
1698 &movaps ($inout4,$tweak); # put aside last tweak
1699
1700 &movups ($inout0,&QWP(16*0,$inp)); # load input
1701 &movups ($inout1,&QWP(16*1,$inp));
1702 &lea ($inp,&DWP(16*2,$inp));
1703 &xorps ($inout0,$inout3); # input^=tweak
1704 &xorps ($inout1,$inout4);
1705
214368ff 1706 &call ("_aesni_decrypt2");
f8501464
AP
1707
1708 &xorps ($inout0,$inout3); # output^=tweak
1709 &xorps ($inout1,$inout4);
1710 &movups (&QWP(16*0,$out),$inout0); # write output
1711 &movups (&QWP(16*1,$out),$inout1);
1712 &lea ($out,&DWP(16*2,$out));
1713
1714 &movdqa ($tweak,$inout4); # last tweak
1715 &jmp (&label("xts_dec_done"));
1716
1717&set_label("xts_dec_three",16);
1718 &movaps ($inout5,$tweak); # put aside last tweak
1719 &movups ($inout0,&QWP(16*0,$inp)); # load input
1720 &movups ($inout1,&QWP(16*1,$inp));
1721 &movups ($inout2,&QWP(16*2,$inp));
1722 &lea ($inp,&DWP(16*3,$inp));
1723 &xorps ($inout0,$inout3); # input^=tweak
1724 &xorps ($inout1,$inout4);
1725 &xorps ($inout2,$inout5);
1726
1727 &call ("_aesni_decrypt3");
1728
1729 &xorps ($inout0,$inout3); # output^=tweak
1730 &xorps ($inout1,$inout4);
1731 &xorps ($inout2,$inout5);
1732 &movups (&QWP(16*0,$out),$inout0); # write output
1733 &movups (&QWP(16*1,$out),$inout1);
1734 &movups (&QWP(16*2,$out),$inout2);
1735 &lea ($out,&DWP(16*3,$out));
1736
1737 &movdqa ($tweak,$inout5); # last tweak
1738 &jmp (&label("xts_dec_done"));
1739
1740&set_label("xts_dec_four",16);
1741 &movaps ($inout4,$tweak); # put aside last tweak
1742
1743 &movups ($inout0,&QWP(16*0,$inp)); # load input
1744 &movups ($inout1,&QWP(16*1,$inp));
1745 &movups ($inout2,&QWP(16*2,$inp));
1746 &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
1747 &movups ($inout3,&QWP(16*3,$inp));
1748 &lea ($inp,&DWP(16*4,$inp));
1749 &xorps ($inout1,&QWP(16*1,"esp"));
1750 &xorps ($inout2,$inout5);
1751 &xorps ($inout3,$inout4);
1752
1753 &call ("_aesni_decrypt4");
1754
1755 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1756 &xorps ($inout1,&QWP(16*1,"esp"));
1757 &xorps ($inout2,$inout5);
1758 &movups (&QWP(16*0,$out),$inout0); # write output
1759 &xorps ($inout3,$inout4);
1760 &movups (&QWP(16*1,$out),$inout1);
1761 &movups (&QWP(16*2,$out),$inout2);
1762 &movups (&QWP(16*3,$out),$inout3);
1763 &lea ($out,&DWP(16*4,$out));
1764
1765 &movdqa ($tweak,$inout4); # last tweak
1766 &jmp (&label("xts_dec_done"));
1767
1768&set_label("xts_dec_done6x",16); # $tweak is pre-calculated
1769 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1770 &and ($len,15);
1771 &jz (&label("xts_dec_ret"));
1772 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1773 &jmp (&label("xts_dec_only_one_more"));
1774
1775&set_label("xts_dec_done",16);
1776 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1777 &pxor ($twtmp,$twtmp);
1778 &and ($len,15);
1779 &jz (&label("xts_dec_ret"));
1780
1781 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1782 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1783 &pshufd ($twres,$twtmp,0x13);
1784 &pxor ($twtmp,$twtmp);
1785 &movdqa ($twmask,&QWP(16*6,"esp"));
1786 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1787 &pand ($twres,$twmask); # isolate carry and residue
1788 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1789 &pxor ($tweak,$twres);
1790
1791&set_label("xts_dec_only_one_more");
1792 &pshufd ($inout3,$twtmp,0x13);
1793 &movdqa ($inout4,$tweak); # put aside previous tweak
1794 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1795 &pand ($inout3,$twmask); # isolate carry and residue
1796 &pxor ($inout3,$tweak);
1797
1798 &mov ($key,$key_); # restore $key
1799 &mov ($rounds,$rounds_); # restore $rounds
1800
1801 &movups ($inout0,&QWP(0,$inp)); # load input
1802 &xorps ($inout0,$inout3); # input^=tweak
1803 if ($inline)
1804 { &aesni_inline_generate1("dec"); }
1805 else
1806 { &call ("_aesni_decrypt1"); }
1807 &xorps ($inout0,$inout3); # output^=tweak
1808 &movups (&QWP(0,$out),$inout0); # write output
1809
1810&set_label("xts_dec_steal");
1811 &movz ($rounds,&BP(16,$inp));
1812 &movz ($key,&BP(0,$out));
1813 &lea ($inp,&DWP(1,$inp));
1814 &mov (&BP(0,$out),&LB($rounds));
1815 &mov (&BP(16,$out),&LB($key));
1816 &lea ($out,&DWP(1,$out));
1817 &sub ($len,1);
1818 &jnz (&label("xts_dec_steal"));
1819
1820 &sub ($out,&DWP(16*7+0,"esp")); # rewind $out
1821 &mov ($key,$key_); # restore $key
1822 &mov ($rounds,$rounds_); # restore $rounds
1823
1824 &movups ($inout0,&QWP(0,$out)); # load input
1825 &xorps ($inout0,$inout4); # input^=tweak
1826 if ($inline)
1827 { &aesni_inline_generate1("dec"); }
1828 else
1829 { &call ("_aesni_decrypt1"); }
1830 &xorps ($inout0,$inout4); # output^=tweak
1831 &movups (&QWP(0,$out),$inout0); # write output
1832
1833&set_label("xts_dec_ret");
23f6eec7
AP
1834 &pxor ("xmm0","xmm0"); # clear register bank
1835 &pxor ("xmm1","xmm1");
1836 &pxor ("xmm2","xmm2");
1837 &movdqa (&QWP(16*0,"esp"),"xmm0"); # clear stack
1838 &pxor ("xmm3","xmm3");
1839 &movdqa (&QWP(16*1,"esp"),"xmm0");
1840 &pxor ("xmm4","xmm4");
1841 &movdqa (&QWP(16*2,"esp"),"xmm0");
1842 &pxor ("xmm5","xmm5");
1843 &movdqa (&QWP(16*3,"esp"),"xmm0");
1844 &pxor ("xmm6","xmm6");
1845 &movdqa (&QWP(16*4,"esp"),"xmm0");
1846 &pxor ("xmm7","xmm7");
1847 &movdqa (&QWP(16*5,"esp"),"xmm0");
f8501464
AP
1848 &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
1849&function_end("aesni_xts_decrypt");
1850}
bd30091c
AP
1851\f
1852######################################################################
1853# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
1854# const AES_KEY *key, unsigned int start_block_num,
1855# unsigned char offset_i[16], const unsigned char L_[][16],
1856# unsigned char checksum[16]);
1857#
1858{
1859# offsets within stack frame
1860my $checksum = 16*6;
1861my ($key_off,$rounds_off,$out_off,$end_off,$esp_off)=map(16*7+4*$_,(0..4));
1862
1863# reassigned registers
1864my ($l_,$block,$i1,$i3,$i5) = ($rounds_,$key_,$rounds,$len,$out);
1865 # $l_, $block, $inp, $key are permanently allocated in registers;
1866 # the remaining non-volatile values are offloaded to the stack and
1867 # stay invariant once they have been written there.
1868
1869&function_begin("aesni_ocb_encrypt"); # arguments as listed in the prototype comment above
1870 &mov ($rounds,&wparam(5)); # &offset_i
1871 &mov ($rounds_,&wparam(7)); # &checksum
1872
1873 &mov ($inp,&wparam(0));
1874 &mov ($out,&wparam(1));
1875 &mov ($len,&wparam(2)); # block count
1876 &mov ($key,&wparam(3));
1877 &movdqu ($rndkey0,&QWP(0,$rounds)); # load offset_i
1878 &mov ($block,&wparam(4)); # start_block_num
1879 &movdqu ($rndkey1,&QWP(0,$rounds_)); # load checksum
1880 &mov ($l_,&wparam(6)); # L_ ($l_ shares $rounds_, so &checksum is dropped here)
1881
1882 &mov ($rounds,"esp"); # remember caller's %esp
1883 &sub ("esp",$esp_off+4); # alloca: 7 xmm slots + 5 dwords
1884 &and ("esp",-16); # align stack
1885
1886 &sub ($out,$inp); # keep $out as the (out - inp) displacement
1887 &shl ($len,4); # blocks -> bytes
1888 &lea ($len,&DWP(-16*6,$inp,$len)); # end of input - 16*6
1889 &mov (&DWP($out_off,"esp"),$out); # spill displacement
1890 &mov (&DWP($end_off,"esp"),$len); # spill loop bound
1891 &mov (&DWP($esp_off,"esp"),$rounds); # spill caller's %esp
1892
1893 &mov ($rounds,&DWP(240,$key)); # dword at offset 240 of the key structure
1894
1895 &test ($block,1);
1896 &jnz (&label("odd")); # odd start_block_num: nothing to do up front
1897
1898 &bsf ($i3,$block); # even: lowest-set-bit index of block number ($i3 shares $len)
1899 &add ($block,1); # $block is odd from here on
1900 &shl ($i3,4); # *16: byte offset into L_
1901 &movdqu ($inout5,&QWP(0,$l_,$i3));
1902 &mov ($i3,$key); # put aside key
1903
1904 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
1905 &lea ($inp,&DWP(16,$inp));
1906
1907 &pxor ($inout5,$rndkey0); # ^ last offset_i
1908 &pxor ($rndkey1,$inout0); # checksum
1909 &pxor ($inout0,$inout5); # ^ offset_i
1910
1911 &movdqa ($inout4,$rndkey1); # park checksum in $inout4 across the cipher call
1912 if ($inline)
1913 { &aesni_inline_generate1("enc"); }
1914 else
1915 { &call ("_aesni_encrypt1"); }
1916
1917 &xorps ($inout0,$inout5); # ^ offset_i
1918 &movdqa ($rndkey0,$inout5); # pass last offset_i
1919 &movdqa ($rndkey1,$inout4); # pass the checksum
1920
1921 &movups (&QWP(-16,$out,$inp),$inout0); # store output ($inp was already advanced by 16)
1922
1923 &mov ($rounds,&DWP(240,$i3)); # reload via the key pointer put aside above
1924 &mov ($key,$i3); # restore key
1925 &mov ($len,&DWP($end_off,"esp")); # reload loop bound, $i3 used its register
1926
1927&set_label("odd"); # $block is odd here; set up key addressing for the bulk code
1928 &shl ($rounds,4); # rounds*16
1929 &mov ($out,16);
1930 &sub ($out,$rounds); # twisted rounds: 16 - 16*rounds
1931 &mov (&DWP($key_off,"esp"),$key); # spill original key pointer
1932 &lea ($key,&DWP(32,$key,$rounds)); # end of key schedule (key + 32 + 16*rounds)
1933 &mov (&DWP($rounds_off,"esp"),$out); # spill twisted rounds
1934
1935 &cmp ($inp,$len); # $len holds end of input - 16*6
1936 &ja (&label("short")); # fewer than six blocks available
1937 &jmp (&label("grandloop"));
1938
1939&set_label("grandloop",32); # bulk path: six blocks per iteration
1940 &lea ($i1,&DWP(1,$block));
1941 &lea ($i3,&DWP(3,$block));
1942 &lea ($i5,&DWP(5,$block));
1943 &add ($block,6);
1944 &bsf ($i1,$i1); # lowest-set-bit index of $block+1, +3, +5 ...
1945 &bsf ($i3,$i3);
1946 &bsf ($i5,$i5);
1947 &shl ($i1,4); # ... scaled by 16 into byte offsets within L_
1948 &shl ($i3,4);
1949 &shl ($i5,4);
1950 &movdqu ($inout0,&QWP(0,$l_)); # first L_ entry, reused for $inout2 and $inout4
1951 &movdqu ($inout1,&QWP(0,$l_,$i1));
1952 &mov ($rounds,&DWP($rounds_off,"esp")); # $i1 used $rounds' register: reload twisted rounds
1953 &movdqa ($inout2,$inout0);
1954 &movdqu ($inout3,&QWP(0,$l_,$i3));
1955 &movdqa ($inout4,$inout0);
1956 &movdqu ($inout5,&QWP(0,$l_,$i5));
1957
1958 &pxor ($inout0,$rndkey0); # ^ last offset_i
1959 &pxor ($inout1,$inout0); # each offset chains off the previous one
1960 &movdqa (&QWP(16*0,"esp"),$inout0); # park the six offsets in stack slots 0..5
1961 &pxor ($inout2,$inout1);
1962 &movdqa (&QWP(16*1,"esp"),$inout1);
1963 &pxor ($inout3,$inout2);
1964 &movdqa (&QWP(16*2,"esp"),$inout2);
1965 &pxor ($inout4,$inout3);
1966 &movdqa (&QWP(16*3,"esp"),$inout3);
1967 &pxor ($inout5,$inout4);
1968 &movdqa (&QWP(16*4,"esp"),$inout4);
1969 &movdqa (&QWP(16*5,"esp"),$inout5);
1970
1971 &$movekey ($rndkey0,&QWP(-48,$key,$rounds)); # twisted $rounds makes this the start of the schedule
1972 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
1973 &movdqu ($inout1,&QWP(16*1,$inp));
1974 &movdqu ($inout2,&QWP(16*2,$inp));
1975 &movdqu ($inout3,&QWP(16*3,$inp));
1976 &movdqu ($inout4,&QWP(16*4,$inp));
1977 &movdqu ($inout5,&QWP(16*5,$inp));
1978 &lea ($inp,&DWP(16*6,$inp));
1979
1980 &pxor ($rndkey1,$inout0); # checksum
1981 &pxor ($inout0,$rndkey0); # ^ roundkey[0]
1982 &pxor ($rndkey1,$inout1);
1983 &pxor ($inout1,$rndkey0);
1984 &pxor ($rndkey1,$inout2);
1985 &pxor ($inout2,$rndkey0);
1986 &pxor ($rndkey1,$inout3);
1987 &pxor ($inout3,$rndkey0);
1988 &pxor ($rndkey1,$inout4);
1989 &pxor ($inout4,$rndkey0);
1990 &pxor ($rndkey1,$inout5);
1991 &pxor ($inout5,$rndkey0);
1992 &movdqa (&QWP($checksum,"esp"),$rndkey1); # spill checksum, $rndkey1 is reused for round keys
1993
1994 &$movekey ($rndkey1,&QWP(-32,$key,$rounds)); # next round key (schedule + 16)
1995 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
1996 &pxor ($inout1,&QWP(16*1,"esp"));
1997 &pxor ($inout2,&QWP(16*2,"esp"));
1998 &pxor ($inout3,&QWP(16*3,"esp"));
1999 &pxor ($inout4,&QWP(16*4,"esp"));
2000 &pxor ($inout5,&QWP(16*5,"esp"));
2001
2002 &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); # schedule + 32
2003 &aesenc ($inout0,$rndkey1); # first round issued here; remaining rounds are left to _aesni_encrypt6_enter
2004 &aesenc ($inout1,$rndkey1);
2005 &aesenc ($inout2,$rndkey1);
2006 &aesenc ($inout3,$rndkey1);
2007 &aesenc ($inout4,$rndkey1);
2008 &aesenc ($inout5,$rndkey1);
2009
2010 &mov ($out,&DWP($out_off,"esp")); # $i5 used $out's register: reload displacement
2011 &mov ($len,&DWP($end_off,"esp")); # $i3 used $len's register: reload loop bound
2012 &call ("_aesni_encrypt6_enter");
2013
2014 &movdqa ($rndkey0,&QWP(16*5,"esp")); # pass last offset_i
2015 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2016 &pxor ($inout1,&QWP(16*1,"esp"));
2017 &pxor ($inout2,&QWP(16*2,"esp"));
2018 &pxor ($inout3,&QWP(16*3,"esp"));
2019 &pxor ($inout4,&QWP(16*4,"esp"));
2020 &pxor ($inout5,$rndkey0);
2021 &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2022
2023 &movdqu (&QWP(-16*6,$out,$inp),$inout0);# store output
2024 &movdqu (&QWP(-16*5,$out,$inp),$inout1);
2025 &movdqu (&QWP(-16*4,$out,$inp),$inout2);
2026 &movdqu (&QWP(-16*3,$out,$inp),$inout3);
2027 &movdqu (&QWP(-16*2,$out,$inp),$inout4);
2028 &movdqu (&QWP(-16*1,$out,$inp),$inout5);
2029 &cmp ($inp,$len); # done yet?
2030 &jbe (&label("grandloop")); # $inp <= end-16*6: six or more blocks remain; the tail code handles at most five
2031
2032&set_label("short"); # tail dispatch on the number of bytes left
2033 &add ($len,16*6);
2034 &sub ($len,$inp); # $len = end of input - $inp
2035 &jz (&label("done"));
2036
2037 &cmp ($len,16*2);
2038 &jb (&label("one"));
2039 &je (&label("two"));
2040
2041 &cmp ($len,16*4);
2042 &jb (&label("three"));
2043 &je (&label("four"));
2044
2045 &lea ($i1,&DWP(1,$block)); # anything above 16*4 falls through to this five-block code
2046 &lea ($i3,&DWP(3,$block));
2047 &bsf ($i1,$i1);
2048 &bsf ($i3,$i3);
2049 &shl ($i1,4); # *16: byte offsets into L_
2050 &shl ($i3,4);
2051 &movdqu ($inout0,&QWP(0,$l_));
2052 &movdqu ($inout1,&QWP(0,$l_,$i1));
2053 &mov ($rounds,&DWP($rounds_off,"esp")); # $i1 used $rounds' register: reload twisted rounds
2054 &movdqa ($inout2,$inout0);
2055 &movdqu ($inout3,&QWP(0,$l_,$i3));
2056 &movdqa ($inout4,$inout0);
2057
2058 &pxor ($inout0,$rndkey0); # ^ last offset_i
2059 &pxor ($inout1,$inout0);
2060 &movdqa (&QWP(16*0,"esp"),$inout0); # park five offsets in stack slots 0..4
2061 &pxor ($inout2,$inout1);
2062 &movdqa (&QWP(16*1,"esp"),$inout1);
2063 &pxor ($inout3,$inout2);
2064 &movdqa (&QWP(16*2,"esp"),$inout2);
2065 &pxor ($inout4,$inout3);
2066 &movdqa (&QWP(16*3,"esp"),$inout3);
2067 &pxor ($inout5,$inout4); # NOTE(review): result is discarded, $inout5 is zeroed below
2068 &movdqa (&QWP(16*4,"esp"),$inout4);
2069
2070 &$movekey ($rndkey0,&QWP(-48,$key,$rounds));
2071 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2072 &movdqu ($inout1,&QWP(16*1,$inp));
2073 &movdqu ($inout2,&QWP(16*2,$inp));
2074 &movdqu ($inout3,&QWP(16*3,$inp));
2075 &movdqu ($inout4,&QWP(16*4,$inp));
2076 &pxor ($inout5,$inout5); # sixth lane carries no data
2077
2078 &pxor ($rndkey1,$inout0); # checksum
2079 &pxor ($inout0,$rndkey0); # ^ roundkey[0]
2080 &pxor ($rndkey1,$inout1);
2081 &pxor ($inout1,$rndkey0);
2082 &pxor ($rndkey1,$inout2);
2083 &pxor ($inout2,$rndkey0);
2084 &pxor ($rndkey1,$inout3);
2085 &pxor ($inout3,$rndkey0);
2086 &pxor ($rndkey1,$inout4);
2087 &pxor ($inout4,$rndkey0);
2088 &movdqa (&QWP($checksum,"esp"),$rndkey1); # spill checksum, $rndkey1 is reused for round keys
2089
2090 &$movekey ($rndkey1,&QWP(-32,$key,$rounds));
2091 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2092 &pxor ($inout1,&QWP(16*1,"esp"));
2093 &pxor ($inout2,&QWP(16*2,"esp"));
2094 &pxor ($inout3,&QWP(16*3,"esp"));
2095 &pxor ($inout4,&QWP(16*4,"esp"));
2096
2097 &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
2098 &aesenc ($inout0,$rndkey1);
2099 &aesenc ($inout1,$rndkey1);
2100 &aesenc ($inout2,$rndkey1);
2101 &aesenc ($inout3,$rndkey1);
2102 &aesenc ($inout4,$rndkey1);
2103 &aesenc ($inout5,$rndkey1);
2104
2105 &mov ($out,&DWP($out_off,"esp")); # reload (out - inp) displacement
2106 &call ("_aesni_encrypt6_enter");
2107
2108 &movdqa ($rndkey0,&QWP(16*4,"esp")); # pass last offset_i
2109 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2110 &pxor ($inout1,&QWP(16*1,"esp"));
2111 &pxor ($inout2,&QWP(16*2,"esp"));
2112 &pxor ($inout3,&QWP(16*3,"esp"));
2113 &pxor ($inout4,$rndkey0);
2114 &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2115
2116 &movdqu (&QWP(16*0,$out,$inp),$inout0); # store output ($inp was not advanced in this path)
2117 &movdqu (&QWP(16*1,$out,$inp),$inout1);
2118 &movdqu (&QWP(16*2,$out,$inp),$inout2);
2119 &movdqu (&QWP(16*3,$out,$inp),$inout3);
2120 &movdqu (&QWP(16*4,$out,$inp),$inout4);
2121
2122 &jmp (&label("done"));
2123
2124&set_label("one",16); # tail: reached from "short" when fewer than 16*2 bytes are left
2125 &movdqu ($inout5,&QWP(0,$l_)); # first L_ entry
2126 &mov ($key,&DWP($key_off,"esp")); # restore key
2127
2128 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2129 &mov ($rounds,&DWP(240,$key)); # untwisted value from the key structure
2130
2131 &pxor ($inout5,$rndkey0); # ^ last offset_i
2132 &pxor ($rndkey1,$inout0); # checksum
2133 &pxor ($inout0,$inout5); # ^ offset_i
2134
2135 &movdqa ($inout4,$rndkey1); # park checksum in $inout4 across the cipher call
2136 &mov ($out,&DWP($out_off,"esp")); # reload (out - inp) displacement
2137 if ($inline)
2138 { &aesni_inline_generate1("enc"); }
2139 else
2140 { &call ("_aesni_encrypt1"); }
2141
2142 &xorps ($inout0,$inout5); # ^ offset_i
2143 &movdqa ($rndkey0,$inout5); # pass last offset_i
2144 &movdqa ($rndkey1,$inout4); # pass the checksum
2145 &movups (&QWP(0,$out,$inp),$inout0); # store output
2146
2147 &jmp (&label("done"));
2148
2149&set_label("two",16); # tail: reached from "short" when exactly 16*2 bytes are left
2150 &lea ($i1,&DWP(1,$block));
2151 &mov ($key,&DWP($key_off,"esp")); # restore key
2152 &bsf ($i1,$i1); # lowest-set-bit index of $block+1
2153 &shl ($i1,4); # *16: byte offset into L_
2154 &movdqu ($inout4,&QWP(0,$l_));
2155 &movdqu ($inout5,&QWP(0,$l_,$i1));
2156
2157 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2158 &movdqu ($inout1,&QWP(16*1,$inp));
2159 &mov ($rounds,&DWP(240,$key)); # $i1 used $rounds' register: reload from the key structure
2160
2161 &pxor ($inout4,$rndkey0); # ^ last offset_i
2162 &pxor ($inout5,$inout4);
2163
2164 &pxor ($rndkey1,$inout0); # checksum
2165 &pxor ($inout0,$inout4); # ^ offset_i
2166 &pxor ($rndkey1,$inout1);
2167 &pxor ($inout1,$inout5);
2168
2169 &movdqa ($inout3,$rndkey1); # park checksum in $inout3 across the cipher call
2170 &mov ($out,&DWP($out_off,"esp")); # reload (out - inp) displacement
2171 &call ("_aesni_encrypt2");
2172
2173 &xorps ($inout0,$inout4); # ^ offset_i
2174 &xorps ($inout1,$inout5);
2175 &movdqa ($rndkey0,$inout5); # pass last offset_i
2176 &movdqa ($rndkey1,$inout3); # pass the checksum
2177 &movups (&QWP(16*0,$out,$inp),$inout0); # store output
2178 &movups (&QWP(16*1,$out,$inp),$inout1);
2179
2180 &jmp (&label("done"));
2181
2182&set_label("three",16); # tail: reached from "short" when more than 16*2 but fewer than 16*4 bytes are left
2183 &lea ($i1,&DWP(1,$block));
2184 &mov ($key,&DWP($key_off,"esp")); # restore key
2185 &bsf ($i1,$i1); # lowest-set-bit index of $block+1
2186 &shl ($i1,4); # *16: byte offset into L_
2187 &movdqu ($inout3,&QWP(0,$l_));
2188 &movdqu ($inout4,&QWP(0,$l_,$i1));
2189 &movdqa ($inout5,$inout3); # third block reuses the first L_ entry
2190
2191 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2192 &movdqu ($inout1,&QWP(16*1,$inp));
2193 &movdqu ($inout2,&QWP(16*2,$inp));
2194 &mov ($rounds,&DWP(240,$key)); # $i1 used $rounds' register: reload from the key structure
2195
2196 &pxor ($inout3,$rndkey0); # ^ last offset_i
2197 &pxor ($inout4,$inout3);
2198 &pxor ($inout5,$inout4);
2199
2200 &pxor ($rndkey1,$inout0); # checksum
2201 &pxor ($inout0,$inout3); # ^ offset_i
2202 &pxor ($rndkey1,$inout1);
2203 &pxor ($inout1,$inout4);
2204 &pxor ($rndkey1,$inout2);
2205 &pxor ($inout2,$inout5);
2206
2207 &movdqa (&QWP($checksum,"esp"),$rndkey1); # spill checksum across the cipher call
2208 &mov ($out,&DWP($out_off,"esp")); # reload (out - inp) displacement
2209 &call ("_aesni_encrypt3");
2210
2211 &xorps ($inout0,$inout3); # ^ offset_i
2212 &xorps ($inout1,$inout4);
2213 &xorps ($inout2,$inout5);
2214 &movdqa ($rndkey0,$inout5); # pass last offset_i
2215 &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2216 &movups (&QWP(16*0,$out,$inp),$inout0); # store output
2217 &movups (&QWP(16*1,$out,$inp),$inout1);
2218 &movups (&QWP(16*2,$out,$inp),$inout2);
2219
2220 &jmp (&label("done"));
2221
2222&set_label("four",16); # tail: reached from "short" when exactly 16*4 bytes are left
2223 &lea ($i1,&DWP(1,$block));
2224 &lea ($i3,&DWP(3,$block));
2225 &bsf ($i1,$i1); # lowest-set-bit index of $block+1 and $block+3
2226 &bsf ($i3,$i3);
2227 &mov ($key,&DWP($key_off,"esp")); # restore key
2228 &shl ($i1,4); # *16: byte offsets into L_
2229 &shl ($i3,4);
2230 &movdqu ($inout2,&QWP(0,$l_));
2231 &movdqu ($inout3,&QWP(0,$l_,$i1));
2232 &movdqa ($inout4,$inout2); # third block reuses the first L_ entry
2233 &movdqu ($inout5,&QWP(0,$l_,$i3));
2234
2235 &pxor ($inout2,$rndkey0); # ^ last offset_i
2236 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2237 &pxor ($inout3,$inout2);
2238 &movdqu ($inout1,&QWP(16*1,$inp));
2239 &pxor ($inout4,$inout3);
2240 &movdqa (&QWP(16*0,"esp"),$inout2); # first two offsets go to the stack ...
2241 &pxor ($inout5,$inout4);
2242 &movdqa (&QWP(16*1,"esp"),$inout3);
2243 &movdqu ($inout2,&QWP(16*2,$inp)); # ... because $inout2/$inout3 are needed for data
2244 &movdqu ($inout3,&QWP(16*3,$inp));
2245 &mov ($rounds,&DWP(240,$key)); # $i1 used $rounds' register: reload from the key structure
2246
2247 &pxor ($rndkey1,$inout0); # checksum
2248 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2249 &pxor ($rndkey1,$inout1);
2250 &pxor ($inout1,&QWP(16*1,"esp"));
2251 &pxor ($rndkey1,$inout2);
2252 &pxor ($inout2,$inout4);
2253 &pxor ($rndkey1,$inout3);
2254 &pxor ($inout3,$inout5);
2255
2256 &movdqa (&QWP($checksum,"esp"),$rndkey1); # spill checksum across the cipher call
2257 &mov ($out,&DWP($out_off,"esp")); # reload (out - inp) displacement
2258 &call ("_aesni_encrypt4");
2259
2260 &xorps ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2261 &xorps ($inout1,&QWP(16*1,"esp"));
2262 &xorps ($inout2,$inout4);
2263 &movups (&QWP(16*0,$out,$inp),$inout0); # store output
2264 &xorps ($inout3,$inout5);
2265 &movups (&QWP(16*1,$out,$inp),$inout1);
2266 &movdqa ($rndkey0,$inout5); # pass last offset_i
2267 &movups (&QWP(16*2,$out,$inp),$inout2);
2268 &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2269 &movups (&QWP(16*3,$out,$inp),$inout3);
2270
2271&set_label("done"); # common exit: scrub state, then hand back offset_i and checksum
2272 &mov ($key,&DWP($esp_off,"esp")); # caller's %esp saved in the prologue
2273 &pxor ($inout0,$inout0); # clear register bank
2274 &pxor ($inout1,$inout1);
2275 &movdqa (&QWP(16*0,"esp"),$inout0); # clear stack
2276 &pxor ($inout2,$inout2);
2277 &movdqa (&QWP(16*1,"esp"),$inout0);
2278 &pxor ($inout3,$inout3);
2279 &movdqa (&QWP(16*2,"esp"),$inout0);
2280 &pxor ($inout4,$inout4);
2281 &movdqa (&QWP(16*3,"esp"),$inout0);
2282 &pxor ($inout5,$inout5);
2283 &movdqa (&QWP(16*4,"esp"),$inout0);
2284 &movdqa (&QWP(16*5,"esp"),$inout0);
2285 &movdqa (&QWP(16*6,"esp"),$inout0); # slot 16*6 is the $checksum spill slot
2286
2287 &lea ("esp",&DWP(0,$key)); # restore %esp before &wparam() is used again
2288 &mov ($rounds,&wparam(5)); # &offset_i
2289 &mov ($rounds_,&wparam(7)); # &checksum
2290 &movdqu (&QWP(0,$rounds),$rndkey0); # write back last offset_i
2291 &pxor ($rndkey0,$rndkey0);
2292 &movdqu (&QWP(0,$rounds_),$rndkey1); # write back checksum
2293 &pxor ($rndkey1,$rndkey1);
2294&function_end("aesni_ocb_encrypt");
2295
2296&function_begin("aesni_ocb_decrypt"); # arguments as listed in the aesni_ocb_[en|de]crypt prototype comment
2297 &mov ($rounds,&wparam(5)); # &offset_i
2298 &mov ($rounds_,&wparam(7)); # &checksum
2299
2300 &mov ($inp,&wparam(0));
2301 &mov ($out,&wparam(1));
2302 &mov ($len,&wparam(2)); # block count
2303 &mov ($key,&wparam(3));
2304 &movdqu ($rndkey0,&QWP(0,$rounds)); # load offset_i
2305 &mov ($block,&wparam(4)); # start_block_num
2306 &movdqu ($rndkey1,&QWP(0,$rounds_)); # load checksum
2307 &mov ($l_,&wparam(6)); # L_ ($l_ shares $rounds_, so &checksum is dropped here)
2308
2309 &mov ($rounds,"esp"); # remember caller's %esp
2310 &sub ("esp",$esp_off+4); # alloca: 7 xmm slots + 5 dwords
2311 &and ("esp",-16); # align stack
2312
2313 &sub ($out,$inp); # keep $out as the (out - inp) displacement
2314 &shl ($len,4); # blocks -> bytes
2315 &lea ($len,&DWP(-16*6,$inp,$len)); # end of input - 16*6
2316 &mov (&DWP($out_off,"esp"),$out); # spill displacement
2317 &mov (&DWP($end_off,"esp"),$len); # spill loop bound
2318 &mov (&DWP($esp_off,"esp"),$rounds); # spill caller's %esp
2319
2320 &mov ($rounds,&DWP(240,$key)); # dword at offset 240 of the key structure
2321
2322 &test ($block,1);
2323 &jnz (&label("odd")); # odd start_block_num: nothing to do up front
2324
2325 &bsf ($i3,$block); # even: lowest-set-bit index of block number ($i3 shares $len)
2326 &add ($block,1); # $block is odd from here on
2327 &shl ($i3,4); # *16: byte offset into L_
2328 &movdqu ($inout5,&QWP(0,$l_,$i3));
2329 &mov ($i3,$key); # put aside key
2330
2331 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2332 &lea ($inp,&DWP(16,$inp));
2333
2334 &pxor ($inout5,$rndkey0); # ^ last offset_i
2335 &pxor ($inout0,$inout5); # ^ offset_i
2336
2337 &movdqa ($inout4,$rndkey1); # park checksum in $inout4 across the cipher call
2338 if ($inline)
2339 { &aesni_inline_generate1("dec"); }
2340 else
2341 { &call ("_aesni_decrypt1"); }
2342
2343 &xorps ($inout0,$inout5); # ^ offset_i
2344 &movaps ($rndkey1,$inout4); # pass the checksum
2345 &movdqa ($rndkey0,$inout5); # pass last offset_i
2346 &xorps ($rndkey1,$inout0); # checksum (folded in after decryption, i.e. over the output)
2347 &movups (&QWP(-16,$out,$inp),$inout0); # store output ($inp was already advanced by 16)
2348
2349 &mov ($rounds,&DWP(240,$i3)); # reload via the key pointer put aside above
2350 &mov ($key,$i3); # restore key
2351 &mov ($len,&DWP($end_off,"esp")); # reload loop bound, $i3 used its register
2352
2353&set_label("odd"); # $block is odd here; set up key addressing for the bulk code
2354 &shl ($rounds,4); # rounds*16
2355 &mov ($out,16);
2356 &sub ($out,$rounds); # twisted rounds: 16 - 16*rounds
2357 &mov (&DWP($key_off,"esp"),$key); # spill original key pointer
2358 &lea ($key,&DWP(32,$key,$rounds)); # end of key schedule (key + 32 + 16*rounds)
2359 &mov (&DWP($rounds_off,"esp"),$out); # spill twisted rounds
2360
2361 &cmp ($inp,$len); # $len holds end of input - 16*6
2362 &ja (&label("short")); # fewer than six blocks available
2363 &jmp (&label("grandloop"));
2364
2365&set_label("grandloop",32); # bulk path: six blocks per iteration
2366 &lea ($i1,&DWP(1,$block));
2367 &lea ($i3,&DWP(3,$block));
2368 &lea ($i5,&DWP(5,$block));
2369 &add ($block,6);
2370 &bsf ($i1,$i1); # lowest-set-bit index of $block+1, +3, +5 ...
2371 &bsf ($i3,$i3);
2372 &bsf ($i5,$i5);
2373 &shl ($i1,4); # ... scaled by 16 into byte offsets within L_
2374 &shl ($i3,4);
2375 &shl ($i5,4);
2376 &movdqu ($inout0,&QWP(0,$l_)); # first L_ entry, reused for $inout2 and $inout4
2377 &movdqu ($inout1,&QWP(0,$l_,$i1));
2378 &mov ($rounds,&DWP($rounds_off,"esp")); # $i1 used $rounds' register: reload twisted rounds
2379 &movdqa ($inout2,$inout0);
2380 &movdqu ($inout3,&QWP(0,$l_,$i3));
2381 &movdqa ($inout4,$inout0);
2382 &movdqu ($inout5,&QWP(0,$l_,$i5));
2383
2384 &pxor ($inout0,$rndkey0); # ^ last offset_i
2385 &pxor ($inout1,$inout0); # each offset chains off the previous one
2386 &movdqa (&QWP(16*0,"esp"),$inout0); # park the six offsets in stack slots 0..5
2387 &pxor ($inout2,$inout1);
2388 &movdqa (&QWP(16*1,"esp"),$inout1);
2389 &pxor ($inout3,$inout2);
2390 &movdqa (&QWP(16*2,"esp"),$inout2);
2391 &pxor ($inout4,$inout3);
2392 &movdqa (&QWP(16*3,"esp"),$inout3);
2393 &pxor ($inout5,$inout4);
2394 &movdqa (&QWP(16*4,"esp"),$inout4);
2395 &movdqa (&QWP(16*5,"esp"),$inout5);
2396
2397 &$movekey ($rndkey0,&QWP(-48,$key,$rounds)); # twisted $rounds makes this the start of the schedule
2398 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2399 &movdqu ($inout1,&QWP(16*1,$inp));
2400 &movdqu ($inout2,&QWP(16*2,$inp));
2401 &movdqu ($inout3,&QWP(16*3,$inp));
2402 &movdqu ($inout4,&QWP(16*4,$inp));
2403 &movdqu ($inout5,&QWP(16*5,$inp));
2404 &lea ($inp,&DWP(16*6,$inp));
2405
2406 &movdqa (&QWP($checksum,"esp"),$rndkey1); # spill checksum, $rndkey1 is reused for round keys
2407 &pxor ($inout0,$rndkey0); # ^ roundkey[0]
2408 &pxor ($inout1,$rndkey0);
2409 &pxor ($inout2,$rndkey0);
2410 &pxor ($inout3,$rndkey0);
2411 &pxor ($inout4,$rndkey0);
2412 &pxor ($inout5,$rndkey0);
2413
2414 &$movekey ($rndkey1,&QWP(-32,$key,$rounds)); # next round key (schedule + 16)
2415 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2416 &pxor ($inout1,&QWP(16*1,"esp"));
2417 &pxor ($inout2,&QWP(16*2,"esp"));
2418 &pxor ($inout3,&QWP(16*3,"esp"));
2419 &pxor ($inout4,&QWP(16*4,"esp"));
2420 &pxor ($inout5,&QWP(16*5,"esp"));
2421
2422 &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); # schedule + 32
2423 &aesdec ($inout0,$rndkey1); # first round issued here; remaining rounds are left to _aesni_decrypt6_enter
2424 &aesdec ($inout1,$rndkey1);
2425 &aesdec ($inout2,$rndkey1);
2426 &aesdec ($inout3,$rndkey1);
2427 &aesdec ($inout4,$rndkey1);
2428 &aesdec ($inout5,$rndkey1);
2429
2430 &mov ($out,&DWP($out_off,"esp")); # $i5 used $out's register: reload displacement
2431 &mov ($len,&DWP($end_off,"esp")); # $i3 used $len's register: reload loop bound
2432 &call ("_aesni_decrypt6_enter");
2433
2434 &movdqa ($rndkey0,&QWP(16*5,"esp")); # pass last offset_i
2435 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2436 &movdqa ($rndkey1,&QWP($checksum,"esp")); # reload checksum
2437 &pxor ($inout1,&QWP(16*1,"esp"));
2438 &pxor ($inout2,&QWP(16*2,"esp"));
2439 &pxor ($inout3,&QWP(16*3,"esp"));
2440 &pxor ($inout4,&QWP(16*4,"esp"));
2441 &pxor ($inout5,$rndkey0);
2442
2443 &pxor ($rndkey1,$inout0); # checksum
2444 &movdqu (&QWP(-16*6,$out,$inp),$inout0);# store output
2445 &pxor ($rndkey1,$inout1);
2446 &movdqu (&QWP(-16*5,$out,$inp),$inout1);
2447 &pxor ($rndkey1,$inout2);
2448 &movdqu (&QWP(-16*4,$out,$inp),$inout2);
2449 &pxor ($rndkey1,$inout3);
2450 &movdqu (&QWP(-16*3,$out,$inp),$inout3);
2451 &pxor ($rndkey1,$inout4);
2452 &movdqu (&QWP(-16*2,$out,$inp),$inout4);
2453 &pxor ($rndkey1,$inout5);
2454 &movdqu (&QWP(-16*1,$out,$inp),$inout5);
2455 &cmp ($inp,$len); # done yet?
2456 &jbe (&label("grandloop")); # $inp <= end-16*6: six or more blocks remain; the tail code handles at most five
2457
2458&set_label("short"); # tail dispatch on the number of bytes left
2459 &add ($len,16*6);
2460 &sub ($len,$inp); # $len = end of input - $inp
2461 &jz (&label("done"));
2462
2463 &cmp ($len,16*2);
2464 &jb (&label("one"));
2465 &je (&label("two"));
2466
2467 &cmp ($len,16*4);
2468 &jb (&label("three"));
2469 &je (&label("four"));
2470
2471 &lea ($i1,&DWP(1,$block)); # anything above 16*4 falls through to this five-block code
2472 &lea ($i3,&DWP(3,$block));
2473 &bsf ($i1,$i1);
2474 &bsf ($i3,$i3);
2475 &shl ($i1,4); # *16: byte offsets into L_
2476 &shl ($i3,4);
2477 &movdqu ($inout0,&QWP(0,$l_));
2478 &movdqu ($inout1,&QWP(0,$l_,$i1));
2479 &mov ($rounds,&DWP($rounds_off,"esp")); # $i1 used $rounds' register: reload twisted rounds
2480 &movdqa ($inout2,$inout0);
2481 &movdqu ($inout3,&QWP(0,$l_,$i3));
2482 &movdqa ($inout4,$inout0);
2483
2484 &pxor ($inout0,$rndkey0); # ^ last offset_i
2485 &pxor ($inout1,$inout0);
2486 &movdqa (&QWP(16*0,"esp"),$inout0); # park five offsets in stack slots 0..4
2487 &pxor ($inout2,$inout1);
2488 &movdqa (&QWP(16*1,"esp"),$inout1);
2489 &pxor ($inout3,$inout2);
2490 &movdqa (&QWP(16*2,"esp"),$inout2);
2491 &pxor ($inout4,$inout3);
2492 &movdqa (&QWP(16*3,"esp"),$inout3);
2493 &pxor ($inout5,$inout4); # NOTE(review): result is discarded, $inout5 is zeroed below
2494 &movdqa (&QWP(16*4,"esp"),$inout4);
2495
2496 &$movekey ($rndkey0,&QWP(-48,$key,$rounds));
2497 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2498 &movdqu ($inout1,&QWP(16*1,$inp));
2499 &movdqu ($inout2,&QWP(16*2,$inp));
2500 &movdqu ($inout3,&QWP(16*3,$inp));
2501 &movdqu ($inout4,&QWP(16*4,$inp));
2502 &pxor ($inout5,$inout5); # sixth lane carries no data
2503
2504 &movdqa (&QWP($checksum,"esp"),$rndkey1); # spill checksum, $rndkey1 is reused for round keys
2505 &pxor ($inout0,$rndkey0); # ^ roundkey[0]
2506 &pxor ($inout1,$rndkey0);
2507 &pxor ($inout2,$rndkey0);
2508 &pxor ($inout3,$rndkey0);
2509 &pxor ($inout4,$rndkey0);
2510
2511 &$movekey ($rndkey1,&QWP(-32,$key,$rounds));
2512 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2513 &pxor ($inout1,&QWP(16*1,"esp"));
2514 &pxor ($inout2,&QWP(16*2,"esp"));
2515 &pxor ($inout3,&QWP(16*3,"esp"));
2516 &pxor ($inout4,&QWP(16*4,"esp"));
2517
2518 &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
2519 &aesdec ($inout0,$rndkey1);
2520 &aesdec ($inout1,$rndkey1);
2521 &aesdec ($inout2,$rndkey1);
2522 &aesdec ($inout3,$rndkey1);
2523 &aesdec ($inout4,$rndkey1);
2524 &aesdec ($inout5,$rndkey1);
2525
2526 &mov ($out,&DWP($out_off,"esp")); # reload (out - inp) displacement
2527 &call ("_aesni_decrypt6_enter");
2528
2529 &movdqa ($rndkey0,&QWP(16*4,"esp")); # pass last offset_i
2530 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2531 &movdqa ($rndkey1,&QWP($checksum,"esp")); # reload checksum
2532 &pxor ($inout1,&QWP(16*1,"esp"));
2533 &pxor ($inout2,&QWP(16*2,"esp"));
2534 &pxor ($inout3,&QWP(16*3,"esp"));
2535 &pxor ($inout4,$rndkey0);
2536
2537 &pxor ($rndkey1,$inout0); # checksum
2538 &movdqu (&QWP(16*0,$out,$inp),$inout0); # store output
2539 &pxor ($rndkey1,$inout1);
2540 &movdqu (&QWP(16*1,$out,$inp),$inout1);
2541 &pxor ($rndkey1,$inout2);
2542 &movdqu (&QWP(16*2,$out,$inp),$inout2);
2543 &pxor ($rndkey1,$inout3);
2544 &movdqu (&QWP(16*3,$out,$inp),$inout3);
2545 &pxor ($rndkey1,$inout4);
2546 &movdqu (&QWP(16*4,$out,$inp),$inout4);
2547
2548 &jmp (&label("done"));
2549
2550&set_label("one",16); # tail: reached from "short" when fewer than 16*2 bytes are left
2551 &movdqu ($inout5,&QWP(0,$l_)); # first L_ entry
2552 &mov ($key,&DWP($key_off,"esp")); # restore key
2553
2554 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2555 &mov ($rounds,&DWP(240,$key)); # untwisted value from the key structure
2556
2557 &pxor ($inout5,$rndkey0); # ^ last offset_i
2558 &pxor ($inout0,$inout5); # ^ offset_i
2559
2560 &movdqa ($inout4,$rndkey1); # park checksum in $inout4 across the cipher call
2561 &mov ($out,&DWP($out_off,"esp")); # reload (out - inp) displacement
2562 if ($inline)
2563 { &aesni_inline_generate1("dec"); }
2564 else
2565 { &call ("_aesni_decrypt1"); }
2566
2567 &xorps ($inout0,$inout5); # ^ offset_i
2568 &movaps ($rndkey1,$inout4); # pass the checksum
2569 &movdqa ($rndkey0,$inout5); # pass last offset_i
2570 &xorps ($rndkey1,$inout0); # checksum
2571 &movups (&QWP(0,$out,$inp),$inout0); # store output
2572
2573 &jmp (&label("done"));
2574
2575&set_label("two",16); # tail: reached from "short" when exactly 16*2 bytes are left
2576 &lea ($i1,&DWP(1,$block));
2577 &mov ($key,&DWP($key_off,"esp")); # restore key
2578 &bsf ($i1,$i1); # lowest-set-bit index of $block+1
2579 &shl ($i1,4); # *16: byte offset into L_
2580 &movdqu ($inout4,&QWP(0,$l_));
2581 &movdqu ($inout5,&QWP(0,$l_,$i1));
2582
2583 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2584 &movdqu ($inout1,&QWP(16*1,$inp));
2585 &mov ($rounds,&DWP(240,$key)); # $i1 used $rounds' register: reload from the key structure
2586
2587 &movdqa ($inout3,$rndkey1); # park checksum in $inout3 across the cipher call
2588 &pxor ($inout4,$rndkey0); # ^ last offset_i
2589 &pxor ($inout5,$inout4);
2590
2591 &pxor ($inout0,$inout4); # ^ offset_i
2592 &pxor ($inout1,$inout5);
2593
2594 &mov ($out,&DWP($out_off,"esp")); # reload (out - inp) displacement
2595 &call ("_aesni_decrypt2");
2596
2597 &xorps ($inout0,$inout4); # ^ offset_i
2598 &xorps ($inout1,$inout5);
2599 &movdqa ($rndkey0,$inout5); # pass last offset_i
2600 &xorps ($inout3,$inout0); # checksum
2601 &movups (&QWP(16*0,$out,$inp),$inout0); # store output
2602 &xorps ($inout3,$inout1);
2603 &movups (&QWP(16*1,$out,$inp),$inout1);
2604 &movaps ($rndkey1,$inout3); # pass the checksum
2605
2606 &jmp (&label("done"));
2607
2608&set_label("three",16); # tail: reached from "short" when more than 16*2 but fewer than 16*4 bytes are left
2609 &lea ($i1,&DWP(1,$block));
2610 &mov ($key,&DWP($key_off,"esp")); # restore key
2611 &bsf ($i1,$i1); # lowest-set-bit index of $block+1
2612 &shl ($i1,4); # *16: byte offset into L_
2613 &movdqu ($inout3,&QWP(0,$l_));
2614 &movdqu ($inout4,&QWP(0,$l_,$i1));
2615 &movdqa ($inout5,$inout3); # third block reuses the first L_ entry
2616
2617 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2618 &movdqu ($inout1,&QWP(16*1,$inp));
2619 &movdqu ($inout2,&QWP(16*2,$inp));
2620 &mov ($rounds,&DWP(240,$key)); # $i1 used $rounds' register: reload from the key structure
2621
2622 &movdqa (&QWP($checksum,"esp"),$rndkey1); # spill checksum across the cipher call
2623 &pxor ($inout3,$rndkey0); # ^ last offset_i
2624 &pxor ($inout4,$inout3);
2625 &pxor ($inout5,$inout4);
2626
2627 &pxor ($inout0,$inout3); # ^ offset_i
2628 &pxor ($inout1,$inout4);
2629 &pxor ($inout2,$inout5);
2630
2631 &mov ($out,&DWP($out_off,"esp")); # reload (out - inp) displacement
2632 &call ("_aesni_decrypt3");
2633
2634 &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2635 &xorps ($inout0,$inout3); # ^ offset_i
2636 &xorps ($inout1,$inout4);
2637 &xorps ($inout2,$inout5);
2638 &movups (&QWP(16*0,$out,$inp),$inout0); # store output
2639 &pxor ($rndkey1,$inout0); # checksum
2640 &movdqa ($rndkey0,$inout5); # pass last offset_i
2641 &movups (&QWP(16*1,$out,$inp),$inout1);
2642 &pxor ($rndkey1,$inout1);
2643 &movups (&QWP(16*2,$out,$inp),$inout2);
2644 &pxor ($rndkey1,$inout2);
2645
2646 &jmp (&label("done"));
2647
2648&set_label("four",16); # tail: reached from "short" when exactly 16*4 bytes are left
2649 &lea ($i1,&DWP(1,$block));
2650 &lea ($i3,&DWP(3,$block));
2651 &bsf ($i1,$i1); # lowest-set-bit index of $block+1 and $block+3
2652 &bsf ($i3,$i3);
2653 &mov ($key,&DWP($key_off,"esp")); # restore key
2654 &shl ($i1,4); # *16: byte offsets into L_
2655 &shl ($i3,4);
2656 &movdqu ($inout2,&QWP(0,$l_));
2657 &movdqu ($inout3,&QWP(0,$l_,$i1));
2658 &movdqa ($inout4,$inout2); # third block reuses the first L_ entry
2659 &movdqu ($inout5,&QWP(0,$l_,$i3));
2660
2661 &pxor ($inout2,$rndkey0); # ^ last offset_i
2662 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2663 &pxor ($inout3,$inout2);
2664 &movdqu ($inout1,&QWP(16*1,$inp));
2665 &pxor ($inout4,$inout3);
2666 &movdqa (&QWP(16*0,"esp"),$inout2); # first two offsets go to the stack ...
2667 &pxor ($inout5,$inout4);
2668 &movdqa (&QWP(16*1,"esp"),$inout3);
2669 &movdqu ($inout2,&QWP(16*2,$inp)); # ... because $inout2/$inout3 are needed for data
2670 &movdqu ($inout3,&QWP(16*3,$inp));
2671 &mov ($rounds,&DWP(240,$key)); # $i1 used $rounds' register: reload from the key structure
2672
2673 &movdqa (&QWP($checksum,"esp"),$rndkey1); # spill checksum across the cipher call
2674 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2675 &pxor ($inout1,&QWP(16*1,"esp"));
2676 &pxor ($inout2,$inout4);
2677 &pxor ($inout3,$inout5);
2678
2679 &mov ($out,&DWP($out_off,"esp")); # reload (out - inp) displacement
2680 &call ("_aesni_decrypt4");
2681
2682 &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2683 &xorps ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2684 &xorps ($inout1,&QWP(16*1,"esp"));
2685 &xorps ($inout2,$inout4);
2686 &movups (&QWP(16*0,$out,$inp),$inout0); # store output
2687 &pxor ($rndkey1,$inout0); # checksum
2688 &xorps ($inout3,$inout5);
2689 &movups (&QWP(16*1,$out,$inp),$inout1);
2690 &pxor ($rndkey1,$inout1);
2691 &movdqa ($rndkey0,$inout5); # pass last offset_i
2692 &movups (&QWP(16*2,$out,$inp),$inout2);
2693 &pxor ($rndkey1,$inout2);
2694 &movups (&QWP(16*3,$out,$inp),$inout3);
2695 &pxor ($rndkey1,$inout3);
2696
2697&set_label("done"); # common exit: scrub state, then hand back offset_i and checksum
2698 &mov ($key,&DWP($esp_off,"esp")); # caller's %esp saved in the prologue
2699 &pxor ($inout0,$inout0); # clear register bank
2700 &pxor ($inout1,$inout1);
2701 &movdqa (&QWP(16*0,"esp"),$inout0); # clear stack
2702 &pxor ($inout2,$inout2);
2703 &movdqa (&QWP(16*1,"esp"),$inout0);
2704 &pxor ($inout3,$inout3);
2705 &movdqa (&QWP(16*2,"esp"),$inout0);
2706 &pxor ($inout4,$inout4);
2707 &movdqa (&QWP(16*3,"esp"),$inout0);
2708 &pxor ($inout5,$inout5);
2709 &movdqa (&QWP(16*4,"esp"),$inout0);
2710 &movdqa (&QWP(16*5,"esp"),$inout0);
2711 &movdqa (&QWP(16*6,"esp"),$inout0); # slot 16*6 is the $checksum spill slot
2712
2713 &lea ("esp",&DWP(0,$key)); # restore %esp before &wparam() is used again
2714 &mov ($rounds,&wparam(5)); # &offset_i
2715 &mov ($rounds_,&wparam(7)); # &checksum
2716 &movdqu (&QWP(0,$rounds),$rndkey0); # write back last offset_i
2717 &pxor ($rndkey0,$rndkey0);
2718 &movdqu (&QWP(0,$rounds_),$rndkey1); # write back checksum
2719 &pxor ($rndkey1,$rndkey1);
2720&function_end("aesni_ocb_decrypt");
2721}
6c83629b
AP
2722}
2723\f
2724######################################################################
d64a7232
AP
2725# void $PREFIX_cbc_encrypt (const void *inp, void *out,
2726# size_t length, const AES_KEY *key,
2727# unsigned char *ivp,const int enc);
# Entry and encrypt path of the CBC routine. Arguments are fetched with
# wparam(0..5) in prototype order: inp, out, length, key, ivp, enc.
# A zero length jumps straight to cbc_abort, so the IV is not written back.
2728&function_begin("${PREFIX}_cbc_encrypt");
2729 &mov ($inp,&wparam(0));
# $rounds_ temporarily computes the new stack pointer: (esp-24) rounded
# down to a 16-byte boundary. It is swapped into esp by the xchg below.
f8501464 2730 &mov ($rounds_,"esp");
d64a7232 2731 &mov ($out,&wparam(1));
f8501464 2732 &sub ($rounds_,24);
d64a7232 2733 &mov ($len,&wparam(2));
f8501464 2734 &and ($rounds_,-16);
d64a7232 2735 &mov ($key,&wparam(3));
d64a7232 2736 &mov ($key_,&wparam(4));
d7d119a3 2737 &test ($len,$len);
f8501464 2738 &jz (&label("cbc_abort"));
d64a7232
AP
2739
# Test the enc flag now; the result is consumed by the je further down.
# None of the mov/xchg/movups in between modify EFLAGS.
# NOTE(review): presumably wparam() is esp-relative, which is why this
# comparison precedes the stack switch - confirm against x86asm.pl.
2740 &cmp (&wparam(5),0);
f8501464
AP
2741 &xchg ($rounds_,"esp"); # alloca
2742 &movups ($ivec,&QWP(0,$key_)); # load IV
d64a7232 2743 &mov ($rounds,&DWP(240,$key));
f8501464
AP
2744 &mov ($key_,$key); # backup $key
2745 &mov (&DWP(16,"esp"),$rounds_); # save original %esp
2746 &mov ($rounds_,$rounds); # backup $rounds
d64a7232
AP
2747 &je (&label("cbc_decrypt"));
2748
# Encrypt: $inout0 carries the chaining value (IV, then previous ciphertext).
f8501464 2749 &movaps ($inout0,$ivec);
d64a7232
AP
2750 &cmp ($len,16);
2751 &jb (&label("cbc_enc_tail"));
2752 &sub ($len,16);
2753 &jmp (&label("cbc_enc_loop"));
2754
# One block per iteration: load plaintext into $ivec, xor it into the
# chaining value, encrypt, store. $rounds and $key are reloaded from their
# backups after each block.
2755&set_label("cbc_enc_loop",16);
f8501464 2756 &movups ($ivec,&QWP(0,$inp)); # input actually
d64a7232 2757 &lea ($inp,&DWP(16,$inp));
6f766a41 2758 if ($inline)
f8501464 2759 { &aesni_inline_generate1("enc",$inout0,$ivec); }
6f766a41 2760 else
f8501464 2761 { &xorps($inout0,$ivec); &call("_aesni_encrypt1"); }
d64a7232
AP
2762 &mov ($rounds,$rounds_); # restore $rounds
2763 &mov ($key,$key_); # restore $key
d7d119a3
AP
2764 &movups (&QWP(0,$out),$inout0); # store output
2765 &lea ($out,&DWP(16,$out));
# Loop while the subtraction does not borrow, i.e. a full block remains.
2766 &sub ($len,16);
d64a7232
AP
2767 &jnc (&label("cbc_enc_loop"));
# Undo the last subtraction; a non-zero result is a trailing partial block.
2768 &add ($len,16);
2769 &jnz (&label("cbc_enc_tail"));
# Done: last ciphertext block becomes the output IV, data register is wiped.
2770 &movaps ($ivec,$inout0);
23f6eec7 2771 &pxor ($inout0,$inout0);
d64a7232
AP
2772 &jmp (&label("cbc_ret"));
2773
# Partial final block: copy the $len leftover bytes to the output buffer,
# zero-fill up to 16 bytes, then point $inp at that padded block and run it
# through the loop once more (encrypting it in place). $len is 0 by then,
# so the loop exits after that single block.
2774&set_label("cbc_enc_tail");
2775 &mov ("ecx",$len); # zaps $rounds
2776 &data_word(0xA4F3F689); # rep movsb
2777 &mov ("ecx",16); # zero tail
2778 &sub ("ecx",$len);
2779 &xor ("eax","eax"); # zaps $len
2780 &data_word(0xAAF3F689); # rep stosb
2781 &lea ($out,&DWP(-16,$out)); # rewind $out by 1 block
2782 &mov ($rounds,$rounds_); # restore $rounds
2783 &mov ($inp,$out); # $inp and $out are the same
2784 &mov ($key,$key_); # restore $key
2785 &jmp (&label("cbc_enc_loop"));
6c83629b 2786######################################################################
# CBC decrypt, bulk path. Inputs of 0x50 bytes or less go directly to
# cbc_dec_tail; longer inputs are processed six blocks (0x60 bytes) per
# iteration. The current IV lives at 0(esp) across the _aesni_decrypt6 call.
d64a7232 2787&set_label("cbc_decrypt",16);
f8501464 2788 &cmp ($len,0x50);
d608b4d6 2789 &jbe (&label("cbc_dec_tail"));
f8501464
AP
2790 &movaps (&QWP(0,"esp"),$ivec); # save IV
2791 &sub ($len,0x50);
2792 &jmp (&label("cbc_dec_loop6_enter"));
d64a7232 2793
f8501464
AP
# Loop head (not executed on first entry): $rndkey0 holds the last
# ciphertext block of the previous batch, which is the next IV, and
# $inout5 holds that batch's sixth plaintext block, whose store was deferred.
2794&set_label("cbc_dec_loop6",16);
2795 &movaps (&QWP(0,"esp"),$rndkey0); # save IV
2796 &movups (&QWP(0,$out),$inout5);
2797 &lea ($out,&DWP(0x10,$out));
2798&set_label("cbc_dec_loop6_enter");
2799 &movdqu ($inout0,&QWP(0,$inp));
2800 &movdqu ($inout1,&QWP(0x10,$inp));
2801 &movdqu ($inout2,&QWP(0x20,$inp));
2802 &movdqu ($inout3,&QWP(0x30,$inp));
2803 &movdqu ($inout4,&QWP(0x40,$inp));
2804 &movdqu ($inout5,&QWP(0x50,$inp));
2805
2806 &call ("_aesni_decrypt6");
2807
# CBC chaining: block i is xored with ciphertext block i-1, re-read from
# $inp through $rndkey0/$rndkey1 (alternating); block 0 uses the saved IV.
2808 &movups ($rndkey1,&QWP(0,$inp));
2809 &movups ($rndkey0,&QWP(0x10,$inp));
2810 &xorps ($inout0,&QWP(0,"esp")); # ^=IV
2811 &xorps ($inout1,$rndkey1);
2812 &movups ($rndkey1,&QWP(0x20,$inp));
2813 &xorps ($inout2,$rndkey0);
2814 &movups ($rndkey0,&QWP(0x30,$inp));
2815 &xorps ($inout3,$rndkey1);
2816 &movups ($rndkey1,&QWP(0x40,$inp));
2817 &xorps ($inout4,$rndkey0);
2818 &movups ($rndkey0,&QWP(0x50,$inp)); # IV
2819 &xorps ($inout5,$rndkey1);
# Only five blocks are stored here; the sixth stays in $inout5 and is
# written at the loop head or after the loop.
2820 &movups (&QWP(0,$out),$inout0);
2821 &movups (&QWP(0x10,$out),$inout1);
2822 &lea ($inp,&DWP(0x60,$inp));
2823 &movups (&QWP(0x20,$out),$inout2);
f9c5e5d9 2824 &mov ($rounds,$rounds_); # restore $rounds
f8501464
AP
2825 &movups (&QWP(0x30,$out),$inout3);
2826 &mov ($key,$key_); # restore $key
2827 &movups (&QWP(0x40,$out),$inout4);
2828 &lea ($out,&DWP(0x50,$out));
2829 &sub ($len,0x60);
2830 &ja (&label("cbc_dec_loop6"));
2831
# Loop exit: move the pending sixth block into $inout0 and the next IV
# into $ivec. Undo the initial 0x50 bias on $len; if nothing is left, go
# and store the pending block via the collected path, otherwise store it
# here and fall through into cbc_dec_tail.
2832 &movaps ($inout0,$inout5);
2833 &movaps ($ivec,$rndkey0);
2834 &add ($len,0x50);
23f6eec7 2835 &jle (&label("cbc_dec_clear_tail_collected"));
f8501464
AP
2836 &movups (&QWP(0,$out),$inout0);
2837 &lea ($out,&DWP(0x10,$out));
# CBC decrypt tail: at most 0x50 bytes remain. Blocks are loaded one at a
# time and $len is compared against 0x10/0x20/0x30/0x40 to dispatch to the
# one/two/three/four-block handlers. The first two ciphertext blocks are
# also copied to $in0/$in1 for later chaining. Anything larger than 0x40
# falls through to the five-block case below.
6c83629b 2838&set_label("cbc_dec_tail");
d64a7232 2839 &movups ($inout0,&QWP(0,$inp));
d64a7232 2840 &movaps ($in0,$inout0);
d7d119a3 2841 &cmp ($len,0x10);
d64a7232 2842 &jbe (&label("cbc_dec_one"));
f8501464 2843
d64a7232 2844 &movups ($inout1,&QWP(0x10,$inp));
d64a7232 2845 &movaps ($in1,$inout1);
d7d119a3 2846 &cmp ($len,0x20);
d64a7232 2847 &jbe (&label("cbc_dec_two"));
f8501464 2848
d64a7232 2849 &movups ($inout2,&QWP(0x20,$inp));
d608b4d6
AP
2850 &cmp ($len,0x30);
2851 &jbe (&label("cbc_dec_three"));
f8501464 2852
d608b4d6 2853 &movups ($inout3,&QWP(0x30,$inp));
f8501464
AP
2854 &cmp ($len,0x40);
2855 &jbe (&label("cbc_dec_four"));
2856
# Five-block case: reuse _aesni_decrypt6 with a zeroed sixth input.
# The IV is parked at 0(esp) for the duration of the call.
2857 &movups ($inout4,&QWP(0x40,$inp));
2858 &movaps (&QWP(0,"esp"),$ivec); # save IV
2859 &movups ($inout0,&QWP(0,$inp));
2860 &xorps ($inout5,$inout5);
2861 &call ("_aesni_decrypt6");
# Chain each block with the preceding ciphertext block re-read from $inp;
# the fifth ciphertext block becomes the new IV.
2862 &movups ($rndkey1,&QWP(0,$inp));
2863 &movups ($rndkey0,&QWP(0x10,$inp));
2864 &xorps ($inout0,&QWP(0,"esp")); # ^= IV
2865 &xorps ($inout1,$rndkey1);
2866 &movups ($rndkey1,&QWP(0x20,$inp));
2867 &xorps ($inout2,$rndkey0);
2868 &movups ($rndkey0,&QWP(0x30,$inp));
2869 &xorps ($inout3,$rndkey1);
2870 &movups ($ivec,&QWP(0x40,$inp)); # IV
2871 &xorps ($inout4,$rndkey0);
# Store the first four blocks, wiping each data register once stored.
# The last block is handed to cbc_dec_tail_collected in $inout0.
2872 &movups (&QWP(0,$out),$inout0);
2873 &movups (&QWP(0x10,$out),$inout1);
23f6eec7 2874 &pxor ($inout1,$inout1);
f8501464 2875 &movups (&QWP(0x20,$out),$inout2);
23f6eec7 2876 &pxor ($inout2,$inout2);
f8501464 2877 &movups (&QWP(0x30,$out),$inout3);
23f6eec7 2878 &pxor ($inout3,$inout3);
f8501464
AP
2879 &lea ($out,&DWP(0x40,$out));
2880 &movaps ($inout0,$inout4);
23f6eec7 2881 &pxor ($inout4,$inout4);
f8501464 2882 &sub ($len,0x50);
d64a7232
AP
2883 &jmp (&label("cbc_dec_tail_collected"));
2884
# One- to four-block tail handlers. Common contract on exit to
# cbc_dec_tail_collected: every block but the last has been stored, the
# last plaintext block is in $inout0 (not yet stored), $out points at its
# destination, $ivec holds the last ciphertext block (next IV), and $len
# has been reduced by 16 times the number of blocks processed.

# One block: plaintext = D(c0) ^ IV; c0 (kept in $in0) is the next IV.
d7d119a3 2885&set_label("cbc_dec_one",16);
6f766a41
AP
2886 if ($inline)
2887 { &aesni_inline_generate1("dec"); }
2888 else
2889 { &call ("_aesni_decrypt1"); }
f8501464
AP
2890 &xorps ($inout0,$ivec);
2891 &movaps ($ivec,$in0);
2892 &sub ($len,0x10);
d64a7232
AP
2893 &jmp (&label("cbc_dec_tail_collected"));
2894
# Two blocks: chain with IV and c0 ($in0); c1 ($in1) is the next IV.
d7d119a3 2895&set_label("cbc_dec_two",16);
214368ff 2896 &call ("_aesni_decrypt2");
f8501464
AP
2897 &xorps ($inout0,$ivec);
2898 &xorps ($inout1,$in0);
2899 &movups (&QWP(0,$out),$inout0);
2900 &movaps ($inout0,$inout1);
23f6eec7 2901 &pxor ($inout1,$inout1);
d64a7232 2902 &lea ($out,&DWP(0x10,$out));
f8501464
AP
2903 &movaps ($ivec,$in1);
2904 &sub ($len,0x20);
d608b4d6
AP
2905 &jmp (&label("cbc_dec_tail_collected"));
2906
# Three blocks: chain with IV, c0, c1; c2 is re-read from $inp as next IV.
d7d119a3 2907&set_label("cbc_dec_three",16);
d608b4d6 2908 &call ("_aesni_decrypt3");
f8501464
AP
2909 &xorps ($inout0,$ivec);
2910 &xorps ($inout1,$in0);
2911 &xorps ($inout2,$in1);
2912 &movups (&QWP(0,$out),$inout0);
2913 &movaps ($inout0,$inout2);
23f6eec7 2914 &pxor ($inout2,$inout2);
f8501464 2915 &movups (&QWP(0x10,$out),$inout1);
23f6eec7 2916 &pxor ($inout1,$inout1);
d608b4d6 2917 &lea ($out,&DWP(0x20,$out));
f8501464
AP
2918 &movups ($ivec,&QWP(0x20,$inp));
2919 &sub ($len,0x30);
2920 &jmp (&label("cbc_dec_tail_collected"));
2921
# Four blocks: c1 and c2 are re-read from $inp into $rndkey1/$rndkey0 for
# chaining; c3 is re-read as the next IV (after $ivec's old value is used).
2922&set_label("cbc_dec_four",16);
2923 &call ("_aesni_decrypt4");
2924 &movups ($rndkey1,&QWP(0x10,$inp));
2925 &movups ($rndkey0,&QWP(0x20,$inp));
2926 &xorps ($inout0,$ivec);
2927 &movups ($ivec,&QWP(0x30,$inp));
2928 &xorps ($inout1,$in0);
2929 &movups (&QWP(0,$out),$inout0);
2930 &xorps ($inout2,$rndkey1);
2931 &movups (&QWP(0x10,$out),$inout1);
23f6eec7 2932 &pxor ($inout1,$inout1);
f8501464
AP
2933 &xorps ($inout3,$rndkey0);
2934 &movups (&QWP(0x20,$out),$inout2);
23f6eec7 2935 &pxor ($inout2,$inout2);
f8501464
AP
2936 &lea ($out,&DWP(0x30,$out));
2937 &movaps ($inout0,$inout3);
23f6eec7 2938 &pxor ($inout3,$inout3);
f8501464 2939 &sub ($len,0x40);
23f6eec7 2940 &jmp (&label("cbc_dec_tail_collected"));
d64a7232 2941
23f6eec7
AP
# Entered from the six-block loop when no input remains: wipe the data
# registers that still hold plaintext, then fall through.
2942&set_label("cbc_dec_clear_tail_collected",16);
2943 &pxor ($inout1,$inout1);
2944 &pxor ($inout2,$inout2);
2945 &pxor ($inout3,$inout3);
2946 &pxor ($inout4,$inout4);
d64a7232
AP
# Common tail: $inout0 holds the last, not yet stored, plaintext block.
# If the low four bits of $len are non-zero the length was not a multiple
# of 16 and the block is written through cbc_dec_tail_partial; otherwise
# it is stored whole and $rndkey0 is wiped.
2947&set_label("cbc_dec_tail_collected");
2948 &and ($len,15);
2949 &jnz (&label("cbc_dec_tail_partial"));
f8501464 2950 &movups (&QWP(0,$out),$inout0);
23f6eec7 2951 &pxor ($rndkey0,$rndkey0);
d64a7232
AP
2952 &jmp (&label("cbc_ret"));
2953
# Partial last block: spill the decrypted block to the aligned scratch slot
# at 0(esp), byte-copy part of it to the output with rep movsb, then scrub
# the scratch slot so no plaintext is left behind on the stack.
# The copy count is ecx = 16 - ($len & 15).
# NOTE(review): verify this count against the intended partial-block length.
d7d119a3 2954&set_label("cbc_dec_tail_partial",16);
f8501464 2955 &movaps (&QWP(0,"esp"),$inout0);
23f6eec7 2956 &pxor ($rndkey0,$rndkey0);
f8501464 2957 &mov ("ecx",16);
d64a7232 2958 &mov ($inp,"esp");
f8501464 2959 &sub ("ecx",$len);
d64a7232 2960 &data_word(0xA4F3F689); # rep movsb
# Overwrite the scratch slot with the zeroed $rndkey0. Storing $inout0 here
# would merely rewrite the same plaintext ($inout0 is not cleared until
# cbc_ret). Only 16(esp), the saved stack pointer, is read after this point.
23f6eec7 2961 &movdqa (&QWP(0,"esp"),$rndkey0);
d64a7232
AP
2962
# Common exit: switch back to the caller's stack (saved at 16(esp) in the
# prologue), re-fetch the ivp argument, write the updated IV through it and
# wipe the registers that held data or IV material.
# cbc_abort (taken when length is zero) skips all of this, so the caller's
# IV is left untouched.
2963&set_label("cbc_ret");
f8501464 2964 &mov ("esp",&DWP(16,"esp")); # pull original %esp
d64a7232 2965 &mov ($key_,&wparam(4));
23f6eec7
AP
2966 &pxor ($inout0,$inout0);
2967 &pxor ($rndkey1,$rndkey1);
d64a7232 2968 &movups (&QWP(0,$key_),$ivec); # output IV
23f6eec7 2969 &pxor ($ivec,$ivec);
f8501464 2970&set_label("cbc_abort");
d64a7232 2971&function_end("${PREFIX}_cbc_encrypt");
6c83629b
AP
2972\f
2973######################################################################
d64a7232
AP
2974# Mechanical port from aesni-x86_64.pl.
2975#
2976# _aesni_set_encrypt_key is private interface,
2977# input:
2978# "eax" const unsigned char *userKey
2979# $rounds int bits
2980# $key AES_KEY *key
2981# output:
2982# "eax" return code
2983# $rounds rounds-1 (9, 11 or 13 on success)
2984
# Prologue and dispatch of the private key-schedule routine.
# Saves ebp/ebx, rejects NULL userKey (eax) or NULL $key with bad_pointer,
# then selects a schedule by key size in $rounds: 256 -> 14rounds,
# 192 -> 12rounds, 128 -> 10rounds (fall-through), else bad_keybits.
2985&function_begin_B("_aesni_set_encrypt_key");
23f6eec7
AP
2986 &push ("ebp");
2987 &push ("ebx");
d64a7232
AP
2988 &test ("eax","eax");
2989 &jz (&label("bad_pointer"));
2990 &test ($key,$key);
2991 &jz (&label("bad_pointer"));
2992
23f6eec7
AP
# Position-independent addressing: call/pop yields the address of "pic",
# and the lea rebases ebx to key_const for the *_alt paths.
2993 &call (&label("pic"));
2994&set_label("pic");
2995 &blindpop("ebx");
2996 &lea ("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx"));
2997
# NOTE(review): picmeup presumably leaves the address of OPENSSL_ia32cap_P
# in ebp - confirm against x86asm.pl. The dword at offset 4 is then
# masked down to bits 28 and 11; each schedule compares the result with
# 1<<28, so the *_alt path runs only when bit 28 is set and bit 11 clear.
2998 &picmeup("ebp","OPENSSL_ia32cap_P","ebx",&label("key_const"));
d64a7232 2999 &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
f8501464 3000 &xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0
23f6eec7 3001 &mov ("ebp",&DWP(4,"ebp"));
# $key is biased by 16 so that round key 0 is stored at -16($key).
d64a7232 3002 &lea ($key,&DWP(16,$key));
23f6eec7 3003 &and ("ebp",1<<28|1<<11); # AVX and XOP bits
d64a7232
AP
3004 &cmp ($rounds,256);
3005 &je (&label("14rounds"));
3006 &cmp ($rounds,192);
3007 &je (&label("12rounds"));
3008 &cmp ($rounds,128);
3009 &jne (&label("bad_keybits"));
3010
# 128-bit key schedule using aeskeygenassist. $rounds is set to 9.
# Round key 0 (the user key) goes to -16($key). Each key_128 call stores
# the current round key at 0($key), advances $key by 16 and derives the
# next key into xmm0; the first call enters at key_128_cold, which skips
# the store. After the nine key_128 calls $key has advanced by 144, so the
# final store is round key 10 and the $rounds store at 80($key) lands at
# offset 240 from the start of the schedule.
3011&set_label("10rounds",16);
23f6eec7
AP
3012 &cmp ("ebp",1<<28);
3013 &je (&label("10rounds_alt"));
3014
d608b4d6 3015 &mov ($rounds,9);
d64a7232
AP
3016 &$movekey (&QWP(-16,$key),"xmm0"); # round 0
3017 &aeskeygenassist("xmm1","xmm0",0x01); # round 1
3018 &call (&label("key_128_cold"));
3019 &aeskeygenassist("xmm1","xmm0",0x2); # round 2
3020 &call (&label("key_128"));
3021 &aeskeygenassist("xmm1","xmm0",0x04); # round 3
3022 &call (&label("key_128"));
3023 &aeskeygenassist("xmm1","xmm0",0x08); # round 4
3024 &call (&label("key_128"));
3025 &aeskeygenassist("xmm1","xmm0",0x10); # round 5
3026 &call (&label("key_128"));
3027 &aeskeygenassist("xmm1","xmm0",0x20); # round 6
3028 &call (&label("key_128"));
3029 &aeskeygenassist("xmm1","xmm0",0x40); # round 7
3030 &call (&label("key_128"));
3031 &aeskeygenassist("xmm1","xmm0",0x80); # round 8
3032 &call (&label("key_128"));
3033 &aeskeygenassist("xmm1","xmm0",0x1b); # round 9
3034 &call (&label("key_128"));
3035 &aeskeygenassist("xmm1","xmm0",0x36); # round 10
3036 &call (&label("key_128"));
3037 &$movekey (&QWP(0,$key),"xmm0");
3038 &mov (&DWP(80,$key),$rounds);
23f6eec7
AP
3039
3040 &jmp (&label("good_key"));
d64a7232
AP
3041
# Local subroutine (reached via call, returns via ret).
# In: xmm0 = previous round key, xmm1 = aeskeygenassist result,
#     xmm4 = scratch whose low dword must be zero.
# Out: xmm0 = next round key. The two shufps/xorps pairs fold the lower
# words of xmm0 into the upper ones; the final shufps broadcasts the top
# dword of xmm1, which is then xored in.
3042&set_label("key_128",16);
3043 &$movekey (&QWP(0,$key),"xmm0");
3044 &lea ($key,&DWP(16,$key));
3045&set_label("key_128_cold");
3046 &shufps ("xmm4","xmm0",0b00010000);
f8501464
AP
3047 &xorps ("xmm0","xmm4");
3048 &shufps ("xmm4","xmm0",0b10001100);
3049 &xorps ("xmm0","xmm4");
3050 &shufps ("xmm1","xmm1",0b11111111); # critical path
3051 &xorps ("xmm0","xmm1");
d64a7232
AP
3052 &ret();
3053
23f6eec7
AP
# Alternative 128-bit schedule that avoids aeskeygenassist.
# xmm5 = byte-shuffle mask from key_const+0x00, xmm4 = round constant
# vector from key_const+0x20 (starts at 1, doubled by pslld every round),
# xmm2 = copy of the previous round key.
# NOTE(review): pshufb+aesenclast presumably reproduce the
# RotWord/SubWord/rcon step of the AES key expansion - confirm against the
# x86_64 module this was ported from.
3054&set_label("10rounds_alt",16);
3055 &movdqa ("xmm5",&QWP(0x00,"ebx"));
3056 &mov ($rounds,8);
3057 &movdqa ("xmm4",&QWP(0x20,"ebx"));
3058 &movdqa ("xmm2","xmm0");
7be6bc68 3059 &movdqu (&QWP(-16,$key),"xmm0");
23f6eec7
AP
3060
# Eight iterations produce round keys 1..8; $rounds is the loop counter.
3061&set_label("loop_key128");
3062 &pshufb ("xmm0","xmm5");
3063 &aesenclast ("xmm0","xmm4");
3064 &pslld ("xmm4",1);
3065 &lea ($key,&DWP(16,$key));
3066
# Running xor across the four dwords of the previous key: after this
# sequence dword i of xmm2 is w0^...^wi.
3067 &movdqa ("xmm3","xmm2");
3068 &pslldq ("xmm2",4);
3069 &pxor ("xmm3","xmm2");
3070 &pslldq ("xmm2",4);
3071 &pxor ("xmm3","xmm2");
3072 &pslldq ("xmm2",4);
3073 &pxor ("xmm2","xmm3");
3074
3075 &pxor ("xmm0","xmm2");
3076 &movdqu (&QWP(-16,$key),"xmm0");
3077 &movdqa ("xmm2","xmm0");
3078
3079 &dec ($rounds);
3080 &jnz (&label("loop_key128"));
3081
# Rounds 9 and 10 are unrolled: the round constant is reloaded as 0x1b
# from key_const+0x30 and doubled once for the last round.
3082 &movdqa ("xmm4",&QWP(0x30,"ebx"));
3083
3084 &pshufb ("xmm0","xmm5");
3085 &aesenclast ("xmm0","xmm4");
3086 &pslld ("xmm4",1);
3087
3088 &movdqa ("xmm3","xmm2");
3089 &pslldq ("xmm2",4);
3090 &pxor ("xmm3","xmm2");
3091 &pslldq ("xmm2",4);
3092 &pxor ("xmm3","xmm2");
3093 &pslldq ("xmm2",4);
3094 &pxor ("xmm2","xmm3");
3095
3096 &pxor ("xmm0","xmm2");
3097 &movdqu (&QWP(0,$key),"xmm0");
3098
3099 &movdqa ("xmm2","xmm0");
3100 &pshufb ("xmm0","xmm5");
3101 &aesenclast ("xmm0","xmm4");
3102
3103 &movdqa ("xmm3","xmm2");
3104 &pslldq ("xmm2",4);
3105 &pxor ("xmm3","xmm2");
3106 &pslldq ("xmm2",4);
3107 &pxor ("xmm3","xmm2");
3108 &pslldq ("xmm2",4);
3109 &pxor ("xmm2","xmm3");
3110
3111 &pxor ("xmm0","xmm2");
3112 &movdqu (&QWP(16,$key),"xmm0");
3113
# $key has advanced by 128 in the loop, so 96($key) is offset 240 from
# the start of the schedule, the same slot the non-alt path writes.
3114 &mov ($rounds,9);
3115 &mov (&DWP(96,$key),$rounds);
3116
3117 &jmp (&label("good_key"));
3118
d64a7232
AP
# 192-bit key schedule using aeskeygenassist. xmm0 holds the first 128
# key bits, xmm2 the remaining 64. $rounds is set to 11.
# Calls alternate between key_192a (stores one 16-byte round key) and
# key_192b (stores two); the first call enters at key_192a_cold, which
# stores nothing. $key advances by 176 in total, so the final store is at
# offset 192 and the $rounds store at 48($key) lands at offset 240 from the
# start of the schedule.
3119&set_label("12rounds",16);
3120 &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey
23f6eec7
AP
3121 &cmp ("ebp",1<<28);
3122 &je (&label("12rounds_alt"));
3123
d608b4d6 3124 &mov ($rounds,11);
f9c5e5d9 3125 &$movekey (&QWP(-16,$key),"xmm0"); # round 0
d64a7232
AP
3126 &aeskeygenassist("xmm1","xmm2",0x01); # round 1,2
3127 &call (&label("key_192a_cold"));
3128 &aeskeygenassist("xmm1","xmm2",0x02); # round 2,3
3129 &call (&label("key_192b"));
3130 &aeskeygenassist("xmm1","xmm2",0x04); # round 4,5
3131 &call (&label("key_192a"));
3132 &aeskeygenassist("xmm1","xmm2",0x08); # round 5,6
3133 &call (&label("key_192b"));
3134 &aeskeygenassist("xmm1","xmm2",0x10); # round 7,8
3135 &call (&label("key_192a"));
3136 &aeskeygenassist("xmm1","xmm2",0x20); # round 8,9
3137 &call (&label("key_192b"));
3138 &aeskeygenassist("xmm1","xmm2",0x40); # round 10,11
3139 &call (&label("key_192a"));
3140 &aeskeygenassist("xmm1","xmm2",0x80); # round 11,12
3141 &call (&label("key_192b"));
3142 &$movekey (&QWP(0,$key),"xmm0");
3143 &mov (&DWP(48,$key),$rounds);
23f6eec7
AP
3144
3145 &jmp (&label("good_key"));
d64a7232
AP
3146
# Local subroutine: store xmm0 as a round key, then derive the next 192
# bits of key material. xmm5 keeps a copy of xmm2 for key_192b to combine.
3147&set_label("key_192a",16);
3148 &$movekey (&QWP(0,$key),"xmm0");
3149 &lea ($key,&DWP(16,$key));
3150&set_label("key_192a_cold",16);
3151 &movaps ("xmm5","xmm2");
# Shared derivation step: updates xmm0 (four dwords) and xmm2 from the
# aeskeygenassist result in xmm1; xmm3 and xmm4 are scratch.
3152&set_label("key_192b_warm");
3153 &shufps ("xmm4","xmm0",0b00010000);
f8501464
AP
3154 &movdqa ("xmm3","xmm2");
3155 &xorps ("xmm0","xmm4");
d64a7232
AP
3156 &shufps ("xmm4","xmm0",0b10001100);
3157 &pslldq ("xmm3",4);
f8501464 3158 &xorps ("xmm0","xmm4");
d64a7232
AP
3159 &pshufd ("xmm1","xmm1",0b01010101); # critical path
3160 &pxor ("xmm2","xmm3");
3161 &pxor ("xmm0","xmm1");
3162 &pshufd ("xmm3","xmm0",0b11111111);
3163 &pxor ("xmm2","xmm3");
3164 &ret();
3165
# Local subroutine: assemble two 16-byte round keys from xmm5 (saved xmm2),
# xmm0 and xmm2 with shufps, store both, advance $key by 32, then continue
# with the shared derivation step above (which performs the ret).
3166&set_label("key_192b",16);
3167 &movaps ("xmm3","xmm0");
3168 &shufps ("xmm5","xmm0",0b01000100);
3169 &$movekey (&QWP(0,$key),"xmm5");
3170 &shufps ("xmm3","xmm2",0b01001110);
3171 &$movekey (&QWP(16,$key),"xmm3");
3172 &lea ($key,&DWP(32,$key));
3173 &jmp (&label("key_192b_warm"));
3174
23f6eec7
AP
# Alternative 192-bit schedule that avoids aeskeygenassist.
# xmm5 = byte-shuffle mask from key_const+0x10, xmm4 = round constant
# vector from key_const+0x20 (doubled each iteration). Each of the eight
# iterations writes 24 bytes of schedule: the low 8 bytes of xmm2 at
# 0($key), then (after $key += 24) the new xmm0 at -16($key).
3175&set_label("12rounds_alt",16);
3176 &movdqa ("xmm5",&QWP(0x10,"ebx"));
3177 &movdqa ("xmm4",&QWP(0x20,"ebx"));
3178 &mov ($rounds,8);
3179 &movdqu (&QWP(-16,$key),"xmm0");
3180
3181&set_label("loop_key192");
3182 &movq (&QWP(0,$key),"xmm2");
3183 &movdqa ("xmm1","xmm2");
3184 &pshufb ("xmm2","xmm5");
3185 &aesenclast ("xmm2","xmm4");
3186 &pslld ("xmm4",1);
3187 &lea ($key,&DWP(24,$key));
3188
# Running xor across the four dwords of xmm0 (dword i becomes w0^...^wi).
3189 &movdqa ("xmm3","xmm0");
3190 &pslldq ("xmm0",4);
3191 &pxor ("xmm3","xmm0");
3192 &pslldq ("xmm0",4);
3193 &pxor ("xmm3","xmm0");
3194 &pslldq ("xmm0",4);
3195 &pxor ("xmm0","xmm3");
3196
# Extend the chain into the remaining two key words (kept in xmm1/xmm2):
# broadcast the top dword of xmm0 and xor with the running xor of xmm1.
3197 &pshufd ("xmm3","xmm0",0xff);
3198 &pxor ("xmm3","xmm1");
3199 &pslldq ("xmm1",4);
3200 &pxor ("xmm3","xmm1");
3201
3202 &pxor ("xmm0","xmm2");
3203 &pxor ("xmm2","xmm3");
3204 &movdqu (&QWP(-16,$key),"xmm0");
3205
3206 &dec ($rounds);
3207 &jnz (&label("loop_key192"));
3208
# $key has advanced by 8*24 = 192, so 32($key) is offset 240 from the
# start of the schedule.
3209 &mov ($rounds,11);
3210 &mov (&DWP(32,$key),$rounds);
3211
3212 &jmp (&label("good_key"));
3213
d64a7232
AP
# 256-bit key schedule using aeskeygenassist. xmm0/xmm2 hold the two
# halves of the user key; $key is biased by a further 16 so that round keys
# 0 and 1 go to -32($key) and -16($key). $rounds is set to 13.
# key_256a derives the next even round key into xmm0 (storing xmm2 first),
# key_256b the next odd one into xmm2 (storing xmm0 first); the first call
# enters at key_256a_cold, which stores nothing. $key advances by 192 over
# the twelve storing calls, so the final store is at offset 224 and the
# $rounds store at 16($key) lands at offset 240 from the schedule start.
3214&set_label("14rounds",16);
3215 &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey
d64a7232 3216 &lea ($key,&DWP(16,$key));
23f6eec7
AP
3217 &cmp ("ebp",1<<28);
3218 &je (&label("14rounds_alt"));
3219
3220 &mov ($rounds,13);
d64a7232
AP
3221 &$movekey (&QWP(-32,$key),"xmm0"); # round 0
3222 &$movekey (&QWP(-16,$key),"xmm2"); # round 1
3223 &aeskeygenassist("xmm1","xmm2",0x01); # round 2
3224 &call (&label("key_256a_cold"));
3225 &aeskeygenassist("xmm1","xmm0",0x01); # round 3
3226 &call (&label("key_256b"));
3227 &aeskeygenassist("xmm1","xmm2",0x02); # round 4
3228 &call (&label("key_256a"));
3229 &aeskeygenassist("xmm1","xmm0",0x02); # round 5
3230 &call (&label("key_256b"));
3231 &aeskeygenassist("xmm1","xmm2",0x04); # round 6
3232 &call (&label("key_256a"));
3233 &aeskeygenassist("xmm1","xmm0",0x04); # round 7
3234 &call (&label("key_256b"));
3235 &aeskeygenassist("xmm1","xmm2",0x08); # round 8
3236 &call (&label("key_256a"));
3237 &aeskeygenassist("xmm1","xmm0",0x08); # round 9
3238 &call (&label("key_256b"));
3239 &aeskeygenassist("xmm1","xmm2",0x10); # round 10
3240 &call (&label("key_256a"));
3241 &aeskeygenassist("xmm1","xmm0",0x10); # round 11
3242 &call (&label("key_256b"));
3243 &aeskeygenassist("xmm1","xmm2",0x20); # round 12
3244 &call (&label("key_256a"));
3245 &aeskeygenassist("xmm1","xmm0",0x20); # round 13
3246 &call (&label("key_256b"));
3247 &aeskeygenassist("xmm1","xmm2",0x40); # round 14
3248 &call (&label("key_256a"));
3249 &$movekey (&QWP(0,$key),"xmm0");
3250 &mov (&DWP(16,$key),$rounds);
# eax is cleared again at good_key, so this xor is redundant there.
3251 &xor ("eax","eax");
23f6eec7
AP
3252
3253 &jmp (&label("good_key"));
d64a7232
AP
3254
# Local subroutine: store xmm2, advance $key, then fold xmm0 with itself
# (via xmm4 scratch) and xor in the broadcast top dword of xmm1.
3255&set_label("key_256a",16);
3256 &$movekey (&QWP(0,$key),"xmm2");
3257 &lea ($key,&DWP(16,$key));
3258&set_label("key_256a_cold");
3259 &shufps ("xmm4","xmm0",0b00010000);
f8501464 3260 &xorps ("xmm0","xmm4");
d64a7232 3261 &shufps ("xmm4","xmm0",0b10001100);
f8501464
AP
3262 &xorps ("xmm0","xmm4");
3263 &shufps ("xmm1","xmm1",0b11111111); # critical path
3264 &xorps ("xmm0","xmm1");
d64a7232
AP
3265 &ret();
3266
# Local subroutine: store xmm0, advance $key, then apply the same folding
# to xmm2, this time xoring in dword 2 of xmm1 (selector 0b10101010).
3267&set_label("key_256b",16);
3268 &$movekey (&QWP(0,$key),"xmm0");
3269 &lea ($key,&DWP(16,$key));
3270
3271 &shufps ("xmm4","xmm2",0b00010000);
f8501464 3272 &xorps ("xmm2","xmm4");
d64a7232 3273 &shufps ("xmm4","xmm2",0b10001100);
f8501464
AP
3274 &xorps ("xmm2","xmm4");
3275 &shufps ("xmm1","xmm1",0b10101010); # critical path
3276 &xorps ("xmm2","xmm1");
d64a7232
AP
3277 &ret();
3278
23f6eec7
AP
# Alternative 256-bit schedule that avoids aeskeygenassist.
# xmm5 = byte-shuffle mask from key_const+0x00, xmm4 = round constant
# vector from key_const+0x20 (doubled once per iteration). xmm0/xmm1 carry
# the two most recent round keys, xmm2 is the working register.
# The loop counter is 7: six full iterations write two round keys each
# (at 0($key) and 16($key), then $key += 32); the seventh writes only the
# first and exits through done_key256.
3279&set_label("14rounds_alt",16);
3280 &movdqa ("xmm5",&QWP(0x00,"ebx"));
3281 &movdqa ("xmm4",&QWP(0x20,"ebx"));
3282 &mov ($rounds,7);
3283 &movdqu (&QWP(-32,$key),"xmm0");
3284 &movdqa ("xmm1","xmm2");
3285 &movdqu (&QWP(-16,$key),"xmm2");
3286
3287&set_label("loop_key256");
3288 &pshufb ("xmm2","xmm5");
3289 &aesenclast ("xmm2","xmm4");
3290
# Running xor across the four dwords of xmm0 (dword i becomes w0^...^wi).
3291 &movdqa ("xmm3","xmm0");
3292 &pslldq ("xmm0",4);
3293 &pxor ("xmm3","xmm0");
3294 &pslldq ("xmm0",4);
3295 &pxor ("xmm3","xmm0");
3296 &pslldq ("xmm0",4);
3297 &pxor ("xmm0","xmm3");
3298 &pslld ("xmm4",1);
3299
3300 &pxor ("xmm0","xmm2");
3301 &movdqu (&QWP(0,$key),"xmm0");
3302
3303 &dec ($rounds);
3304 &jz (&label("done_key256"));
3305
# Second half of the iteration: broadcast the top dword of the key just
# produced and run it through aesenclast with an all-zero round key
# (no shuffle mask and no round constant on this step).
3306 &pshufd ("xmm2","xmm0",0xff);
3307 &pxor ("xmm3","xmm3");
3308 &aesenclast ("xmm2","xmm3");
3309
# Same running xor, applied to xmm1.
bd30091c 3310 &movdqa ("xmm3","xmm1");
23f6eec7
AP
3311 &pslldq ("xmm1",4);
3312 &pxor ("xmm3","xmm1");
3313 &pslldq ("xmm1",4);
3314 &pxor ("xmm3","xmm1");
3315 &pslldq ("xmm1",4);
3316 &pxor ("xmm1","xmm3");
3317
3318 &pxor ("xmm2","xmm1");
3319 &movdqu (&QWP(16,$key),"xmm2");
3320 &lea ($key,&DWP(32,$key));
3321 &movdqa ("xmm1","xmm2");
3322 &jmp (&label("loop_key256"));
3323
# $key has advanced by 6*32 = 192 beyond its bias of 32, so 16($key) is
# offset 240 from the start of the schedule. Falls through to good_key.
3324&set_label("done_key256");
3325 &mov ($rounds,13);
3326 &mov (&DWP(16,$key),$rounds);
3327
# Exit paths of _aesni_set_encrypt_key. Every path restores ebx/ebp in
# reverse push order before returning.
# Success: wipe xmm0..xmm5 (all held key material) and return 0 in eax.
3328&set_label("good_key");
3329 &pxor ("xmm0","xmm0");
3330 &pxor ("xmm1","xmm1");
3331 &pxor ("xmm2","xmm2");
3332 &pxor ("xmm3","xmm3");
3333 &pxor ("xmm4","xmm4");
3334 &pxor ("xmm5","xmm5");
3335 &xor ("eax","eax");
3336 &pop ("ebx");
3337 &pop ("ebp");
3338 &ret ();
3339
d64a7232
AP
# NULL userKey or NULL key pointer: return -1. No xmm register has been
# loaded with key material at this point.
3340&set_label("bad_pointer",4);
3341 &mov ("eax",-1);
23f6eec7
AP
3342 &pop ("ebx");
3343 &pop ("ebp");
d64a7232
AP
3344 &ret ();
# Unsupported key size: return -2. xmm0 already holds the first 128 bits
# of the user key, so it is wiped before returning.
3345&set_label("bad_keybits",4);
23f6eec7 3346 &pxor ("xmm0","xmm0");
d64a7232 3347 &mov ("eax",-2);
23f6eec7
AP
3348 &pop ("ebx");
3349 &pop ("ebp");
d64a7232
AP
3350 &ret ();
3351&function_end_B("_aesni_set_encrypt_key");
3352
3353# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
3354# AES_KEY *key)
# Public wrapper: moves the three stack arguments (userKey, bits, key) into
# the registers _aesni_set_encrypt_key expects (eax, $rounds, $key) and
# returns that routine's result in eax unchanged.
3355&function_begin_B("${PREFIX}_set_encrypt_key");
3356 &mov ("eax",&wparam(0));
3357 &mov ($rounds,&wparam(1));
3358 &mov ($key,&wparam(2));
3359 &call ("_aesni_set_encrypt_key");
3360 &ret ();
3361&function_end_B("${PREFIX}_set_encrypt_key");
3362
3363# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
3364# AES_KEY *key)
# Builds a decryption key schedule: first the encryption schedule via
# _aesni_set_encrypt_key, then the round keys are reversed in place, with
# aesimc applied to every key except the first and the last.
# Returns 0 on success, or the non-zero error code of the helper.
3365&function_begin_B("${PREFIX}_set_decrypt_key");
3366 &mov ("eax",&wparam(0));
3367 &mov ($rounds,&wparam(1));
3368 &mov ($key,&wparam(2));
3369 &call ("_aesni_set_encrypt_key");
# The helper advanced $key, so reload the schedule base. $rounds*16 is
# computed before the error test; the test that follows resets the flags.
3370 &mov ($key,&wparam(2));
f9c5e5d9 3371 &shl ($rounds,4); # rounds-1 after _aesni_set_encrypt_key
d64a7232
AP
3372 &test ("eax","eax");
3373 &jnz (&label("dec_key_ret"));
d608b4d6 3374 &lea ("eax",&DWP(16,$key,$rounds)); # end of key schedule
d64a7232
AP
3375
# First and last round keys trade places unmodified.
3376 &$movekey ("xmm0",&QWP(0,$key)); # just swap
3377 &$movekey ("xmm1",&QWP(0,"eax"));
3378 &$movekey (&QWP(0,"eax"),"xmm0");
3379 &$movekey (&QWP(0,$key),"xmm1");
3380 &lea ($key,&DWP(16,$key));
3381 &lea ("eax",&DWP(-16,"eax"));
d64a7232 3382
# Walk inwards from both ends: $key ascends, eax descends. Each pair is
# transformed with aesimc and written to the opposite slot (the stores use
# +16/-16 because both pointers were already stepped). Stops when the
# pointers meet.
d608b4d6 3383&set_label("dec_key_inverse");
d64a7232
AP
3384 &$movekey ("xmm0",&QWP(0,$key)); # swap and inverse
3385 &$movekey ("xmm1",&QWP(0,"eax"));
3386 &aesimc ("xmm0","xmm0");
3387 &aesimc ("xmm1","xmm1");
3388 &lea ($key,&DWP(16,$key));
3389 &lea ("eax",&DWP(-16,"eax"));
d64a7232
AP
3390 &$movekey (&QWP(16,"eax"),"xmm0");
3391 &$movekey (&QWP(-16,$key),"xmm1");
d7d119a3 3392 &cmp ("eax",$key);
d64a7232
AP
3393 &ja (&label("dec_key_inverse"));
3394
# The schedule has an odd number of round keys, so one middle key remains;
# it is transformed in place.
3395 &$movekey ("xmm0",&QWP(0,$key)); # inverse middle
3396 &aesimc ("xmm0","xmm0");
3397 &$movekey (&QWP(0,$key),"xmm0");
3398
23f6eec7
AP
# Wipe the registers that held round keys.
3399 &pxor ("xmm0","xmm0");
3400 &pxor ("xmm1","xmm1");
d64a7232
AP
3401 &xor ("eax","eax"); # return success
3402&set_label("dec_key_ret");
3403 &ret ();
3404&function_end_B("${PREFIX}_set_decrypt_key");
23f6eec7
AP
3405
# Constant table used by the *_alt key schedules, addressed relative to ebx.
# The loads in this file use offsets 0x00/0x10/0x20/0x30, i.e. one row of
# four words per 16 bytes:
#   +0x00  pshufb mask for the 128- and 256-bit schedules
#   +0x10  pshufb mask for the 192-bit schedule
#   +0x20  initial round constant (1 in every dword)
#   +0x30  round constant 0x1b for the last two 128-bit rounds
# NOTE(review): the second set_label argument is presumably an alignment
# request - confirm against x86asm.pl.
3406&set_label("key_const",64);
3407&data_word(0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d);
3408&data_word(0x04070605,0x04070605,0x04070605,0x04070605);
3409&data_word(1,1,1,1);
3410&data_word(0x1b,0x1b,0x1b,0x1b);
d64a7232
AP
3411&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");
3412
# Finish code generation (asm_finish presumably emits the accumulated
# assembly - see x86asm.pl) and close the output stream.
# The close result is checked because buffered write errors (full disk,
# broken pipe) only surface at close time; without the check a truncated
# assembly file would be treated as a successful run by the build.
3413&asm_finish();
184bc45f
RL
3414
3415close STDOUT or die "error closing STDOUT: $!";