]> git.ipfire.org Git - thirdparty/openssl.git/blame - crypto/aes/asm/aesni-x86.pl
Update fuzz corpora
[thirdparty/openssl.git] / crypto / aes / asm / aesni-x86.pl
CommitLineData
6aa36e8e
RS
1#! /usr/bin/env perl
2# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License"). You may not use
5# this file except in compliance with the License. You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
d64a7232
AP
9
10# ====================================================================
d8ba0dc9 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
d64a7232
AP
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# This module implements support for Intel AES-NI extension. In
18# OpenSSL context it's used with Intel engine, but can also be used as
19# drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
20# details].
d7d119a3
AP
21#
22# Performance.
23#
24# To start with see corresponding paragraph in aesni-x86_64.pl...
25# Instead of filling table similar to one found there I've chosen to
26# summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
27# The simplified table below represents 32-bit performance relative
28# to 64-bit one in every given point. Ratios vary for different
29# encryption modes, therefore interval values.
30#
31# 16-byte 64-byte 256-byte 1-KB 8-KB
32# 53-67% 67-84% 91-94% 95-98% 97-99.5%
33#
34# Lower ratios for smaller block sizes are perfectly understandable,
35# because function call overhead is higher in 32-bit mode. Largest
36# 8-KB block performance is virtually same: 32-bit code is less than
f8501464
AP
37# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
38
39# January 2011
40#
41# See aesni-x86_64.pl for details. Unlike x86_64 version this module
42# interleaves at most 6 aes[enc|dec] instructions, because there are
43# not enough registers for 8x interleave [which should be optimal for
44# Sandy Bridge]. Actually, performance results for 6x interleave
45# factor presented in aesni-x86_64.pl (except for CTR) are for this
46# module.
47
48# April 2011
49#
50# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
51# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
d64a7232 52
bd30091c
AP
53# November 2015
54#
55# Add aesni_ocb_[en|de]crypt.
56
5599c733
AP
57######################################################################
58# Current large-block performance in cycles per byte processed with
59# 128-bit key (less is better).
60#
bd30091c 61# CBC en-/decrypt CTR XTS ECB OCB
5599c733 62# Westmere 3.77/1.37 1.37 1.52 1.27
bd30091c
AP
63# * Bridge 5.07/0.98 0.99 1.09 0.91 1.10
64# Haswell 4.44/0.80 0.97 1.03 0.72 0.76
65# Silvermont 5.77/3.56 3.67 4.03 3.46 4.03
66# Bulldozer 5.80/0.98 1.05 1.24 0.93 1.23
5599c733 67
d64a7232
AP
# ---- generator configuration -----------------------------------------
$PREFIX="aesni";	# if $PREFIX is set to "AES", the script
			# generates drop-in replacement for
			# crypto/aes/asm/aes-586.pl:-)
$inline=1;		# inline _aesni_[en|de]crypt

# Make the perlasm framework reachable: both the script's own directory
# and ../../perlasm relative to it are appended to @INC.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# The last command-line argument names the output file.  STDOUT is
# aliased to it, so an open failure must be fatal: otherwise everything
# printed afterwards would be silently discarded.
$output = pop;
die "$0: no output file given\n" unless defined($output);
open OUT,">",$output or die "$0: can't open $output: $!\n";
*STDOUT=*OUT;

&asm_init($ARGV[0],$0);

&external_label("OPENSSL_ia32cap_P");
&static_label("key_const");

# Instruction used to fetch round keys.  It is movups for every value
# of $PREFIX; the indirection is kept so that callers can keep writing
# &$movekey(...).
$movekey=\&movups;

# ---- general-purpose register assignment -----------------------------
$len="eax";
$rounds="ecx";
$key="edx";
$inp="esi";
$out="edi";
$rounds_="ebx";	# backup copy for $rounds
$key_="ebp";	# backup copy for $key

# ---- XMM register assignment -----------------------------------------
# $in1/$in0/$ivec are aliases: they share xmm5/xmm6/xmm7 with
# $inout3/$inout4/$inout5.
$rndkey0="xmm0";
$rndkey1="xmm1";
$inout0="xmm2";
$inout1="xmm3";
$inout2="xmm4";
$inout3="xmm5";	$in1="xmm5";
$inout4="xmm6";	$in0="xmm6";
$inout5="xmm7";	$ivec="xmm7";
133a7f9a 105
d900a015 106# AESNI extension
133a7f9a
AP
# Emit AESKEYGENASSIST dst,src,imm as raw bytes
# (0x66,0x0f,0x3a,0xdf, ModR/M, imm).
#   $dst, $src - register names of the form "xmm0".."xmm7"
#   $imm       - immediate byte appended after the ModR/M byte
# Nothing is emitted when the operands are not an xmm0-7 pair.
sub aeskeygenassist
{ my ($dst,$src,$imm)=@_;
  my ($reg,$rm) = "$dst:$src" =~ /xmm([0-7]):xmm([0-7])/ or return;

    # ModR/M: mod=11 (register direct), reg=dst, r/m=src
    &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($reg<<3)|$rm,$imm);
}
# Emit one register-to-register AES-NI instruction as raw bytes
# (0x66,0x0f,0x38, opcodelet, ModR/M).
#   $opcodelet - last opcode byte, selects the instruction
#   $dst, $src - register names of the form "xmm0".."xmm7"
# Nothing is emitted when the operands are not an xmm0-7 pair.
sub aescommon
{ my ($opcodelet,$dst,$src)=@_;
  my ($reg,$rm) = "$dst:$src" =~ /xmm([0-7]):xmm([0-7])/ or return;

    # ModR/M: mod=11 (register direct), reg=dst, r/m=src
    &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($reg<<3)|$rm);
}
# Thin wrappers, one per instruction; arguments are (dst,src).
sub aesimc	{ aescommon(0xdb,@_); }
sub aesenc	{ aescommon(0xdc,@_); }
sub aesenclast	{ aescommon(0xdd,@_); }
sub aesdec	{ aescommon(0xde,@_); }
sub aesdeclast	{ aescommon(0xdf,@_); }
6c83629b 122\f
d608b4d6 123# Inline version of internal aesni_[en|de]crypt1
# Inline version of internal aesni_[en|de]crypt1.
#
#   $p     - "enc" or "dec", selects aes${p}/aes${p}last
#   $inout - register holding the block to process (default $inout0)
#   $ivec  - optional register; when given, round key 0 is xored into
#            it and the result is xored into $inout, instead of xoring
#            round key 0 into $inout directly
#
# Reads the key schedule through $key and counts $rounds down to zero;
# both registers are modified by the emitted code.  Every expansion
# gets a unique loop label thanks to the $sn counter.
{ my $sn;
sub aesni_inline_generate1
{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
  # resolve the instruction emitters once instead of string-eval'ing
  # "&aes${p} (...)" at every use, which would hide any error in $@
  my ($aes,$aeslast)=(\&{"aes${p}"},\&{"aes${p}last"});
  my $loop="${p}1_loop_".(++$sn);

    &$movekey		($rndkey0,&QWP(0,$key));
    &$movekey		($rndkey1,&QWP(16,$key));
    &xorps		($ivec,$rndkey0)	if (defined($ivec));
    &lea		($key,&DWP(32,$key));
    if (defined($ivec))	{ &xorps ($inout,$ivec);	}
    else		{ &xorps ($inout,$rndkey0);	}
    &set_label($loop);
	&$aes		($inout,$rndkey1);
	&dec		($rounds);
	&$movekey	($rndkey1,&QWP(0,$key));
	&lea		($key,&DWP(16,$key));
    &jnz		(&label($loop));
    &$aeslast		($inout,$rndkey1);
}}
d64a7232
AP
143
# Emit the out-of-line helper _aesni_${p}rypt1 as a fully unrolled loop.
#
#   $p     - "enc" or "dec"
#   $inout - register holding the block (default $inout0)
#
# The emitted code compares $rounds with 11 and enters the unrolled
# sequence at one of three points (labels ${p}128/${p}192 or straight
# through), so that the number of aes${p} steps matches the key size.
# $key is advanced by the emitted code.
sub aesni_generate1	# fully unrolled loop
{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
  # code references instead of string eval, so failures are not hidden
  my ($aes,$aeslast)=(\&{"aes${p}"},\&{"aes${p}last"});

    &function_begin_B("_aesni_${p}rypt1");
	&movups		($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey1,&QWP(0x10,$key));
	&xorps		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(0x20,$key));
	&lea		($key,&DWP(0x30,$key));
	&cmp		($rounds,11);
	&jb		(&label("${p}128"));
	&lea		($key,&DWP(0x20,$key));
	&je		(&label("${p}192"));
	&lea		($key,&DWP(0x20,$key));
	&$aes		($inout,$rndkey1);
	&$movekey	($rndkey1,&QWP(-0x40,$key));
	&$aes		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(-0x30,$key));
    &set_label("${p}192");
	&$aes		($inout,$rndkey1);
	&$movekey	($rndkey1,&QWP(-0x20,$key));
	&$aes		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(-0x10,$key));
    &set_label("${p}128");
	# Common tail: the two round-key registers are used alternately;
	# each is reloaded with the next-but-one key right after use.
	foreach my $i (0..7) {
	    my $rk = ($i&1) ? $rndkey0 : $rndkey1;
	    &$aes	($inout,$rk);
	    &$movekey	($rk,&QWP(0x10*$i,$key));
	}
	&$aes		($inout,$rndkey1);
	&$aeslast	($inout,$rndkey0);
    &ret();
    &function_end_B("_aesni_${p}rypt1");
}
6c83629b 189\f
# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
#
# Single-block encryption.  The block is loaded from inp into $inout0,
# the round count is read from offset 240 of the key structure, and
# the result is stored to out.  "eax" serves as scratch pointer for
# inp and then out.  All XMM registers touched are zeroed before return.
&aesni_generate1("enc") if (!$inline);
&function_begin_B("${PREFIX}_encrypt");
	&mov	("eax",&wparam(0));
	&mov	($key,&wparam(2));
	&movups	($inout0,&QWP(0,"eax"));
	&mov	($rounds,&DWP(240,$key));
	&mov	("eax",&wparam(1));
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}
	&pxor	($rndkey0,$rndkey0);		# clear register bank
	&pxor	($rndkey1,$rndkey1);
	&movups	(&QWP(0,"eax"),$inout0);
	&pxor	($inout0,$inout0);
	&ret	();
&function_end_B("${PREFIX}_encrypt");
208
# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
#
# Single-block decryption.  The block is loaded from inp into $inout0,
# the round count is read from offset 240 of the key structure, and
# the result is stored to out.  "eax" serves as scratch pointer for
# inp and then out.  All XMM registers touched are zeroed before return.
&aesni_generate1("dec") if(!$inline);
&function_begin_B("${PREFIX}_decrypt");
	&mov	("eax",&wparam(0));
	&mov	($key,&wparam(2));
	&movups	($inout0,&QWP(0,"eax"));
	&mov	($rounds,&DWP(240,$key));
	&mov	("eax",&wparam(1));
	if ($inline)
	{   &aesni_inline_generate1("dec");	}
	else
	{   &call	("_aesni_decrypt1");	}
	&pxor	($rndkey0,$rndkey0);		# clear register bank
	&pxor	($rndkey1,$rndkey1);
	&movups	(&QWP(0,"eax"),$inout0);
	&pxor	($inout0,$inout0);
	&ret	();
&function_end_B("${PREFIX}_decrypt");
6c83629b 227
f8501464
AP
228# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
229# factor. Why were 3x subroutines originally used in loops? Even though
230# aes[enc|dec] latency was originally 6, it could be scheduled only
231# every *2nd* cycle. Thus 3x interleave was the one providing optimal
d608b4d6
AP
232# utilization, i.e. when subroutine's throughput is virtually same as
233# of non-interleaved subroutine [for number of input blocks up to 3].
214368ff
AP
234# This is why it originally made no sense to implement 2x subroutine.
235# But times change and it became appropriate to spend extra 192 bytes
236# on 2x subroutine on Atom Silvermont account. For processors that
237# can schedule aes[enc|dec] every cycle, the optimal interleave factor
238# equals the corresponding instruction's latency. 8x is optimal for
239# * Bridge, but it's unfeasible to accommodate such implementation
240# in XMM registers addressable in 32-bit mode and therefore maximum
241# of 6x is used instead...
242
# Emit _aesni_${p}rypt2: processes $inout0 and $inout1 in parallel.
#
#   $p - "enc" or "dec"
#
# $rounds is scaled by 16, $key is moved to 32+$key+$rounds, and the
# negated, biased $rounds then serves as an index that counts up to
# zero; the loop consumes two round keys per iteration.  $key and
# $rounds are modified by the emitted code.
sub aesni_generate2
{ my $p=shift;
  # code references instead of string eval, so failures are not hidden
  my ($aes,$aeslast)=(\&{"aes${p}"},\&{"aes${p}last"});
  my @blk=($inout0,$inout1);

    &function_begin_B("_aesni_${p}rypt2");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&add		($rounds,16);

    &set_label("${p}2_loop");
	foreach my $x (@blk) { &$aes ($x,$rndkey1); }
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	foreach my $x (@blk) { &$aes ($x,$rndkey0); }
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("${p}2_loop"));
	foreach my $x (@blk) { &$aes ($x,$rndkey1); }
	foreach my $x (@blk) { &$aeslast ($x,$rndkey0); }
	&ret();
    &function_end_B("_aesni_${p}rypt2");
}
f8501464 273
d64a7232
AP
# Emit _aesni_${p}rypt3: processes $inout0..$inout2 in parallel.
#
#   $p - "enc" or "dec"
#
# Same indexing scheme as the 2x helper: $rounds is scaled by 16, $key
# is moved to 32+$key+$rounds, and the negated, biased $rounds counts
# up to zero while two round keys are consumed per iteration.  $key and
# $rounds are modified by the emitted code.
sub aesni_generate3
{ my $p=shift;
  # code references instead of string eval, so failures are not hidden
  my ($aes,$aeslast)=(\&{"aes${p}"},\&{"aes${p}last"});
  my @blk=($inout0,$inout1,$inout2);

    &function_begin_B("_aesni_${p}rypt3");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&pxor		($inout2,$rndkey0);
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&add		($rounds,16);

    &set_label("${p}3_loop");
	foreach my $x (@blk) { &$aes ($x,$rndkey1); }
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	foreach my $x (@blk) { &$aes ($x,$rndkey0); }
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("${p}3_loop"));
	foreach my $x (@blk) { &$aes ($x,$rndkey1); }
	foreach my $x (@blk) { &$aeslast ($x,$rndkey0); }
	&ret();
    &function_end_B("_aesni_${p}rypt3");
}
d608b4d6
AP
309
310# 4x interleave is implemented to improve small block performance,
311# most notably [and naturally] 4 block by ~30%. One can argue that one
312# should have implemented 5x as well, but improvement would be <20%,
313# so it's not worth it...
# Emit _aesni_${p}rypt4: processes $inout0..$inout3 in parallel.
#
#   $p - "enc" or "dec"
#
# Same indexing scheme as the 2x/3x helpers.  Note that $inout3 shares
# its register with $in1.  $key and $rounds are modified by the emitted
# code.
sub aesni_generate4
{ my $p=shift;
  # code references instead of string eval, so failures are not hidden
  my ($aes,$aeslast)=(\&{"aes${p}"},\&{"aes${p}last"});
  my @blk=($inout0,$inout1,$inout2,$inout3);

    &function_begin_B("_aesni_${p}rypt4");
	&$movekey	($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey1,&QWP(16,$key));
	&shl		($rounds,4);
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&pxor		($inout2,$rndkey0);
	&pxor		($inout3,$rndkey0);
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	# four raw bytes; NOTE(review): 0f 1f 40 00 looks like a 4-byte
	# NOP, presumably padding in front of the loop - confirm
	&data_byte	(0x0f,0x1f,0x40,0x00);
	&add		($rounds,16);

    &set_label("${p}4_loop");
	foreach my $x (@blk) { &$aes ($x,$rndkey1); }
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	foreach my $x (@blk) { &$aes ($x,$rndkey0); }
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    &jnz		(&label("${p}4_loop"));

	foreach my $x (@blk) { &$aes ($x,$rndkey1); }
	foreach my $x (@blk) { &$aeslast ($x,$rndkey0); }
	&ret();
    &function_end_B("_aesni_${p}rypt4");
}
f8501464
AP
356
# Emit _aesni_${p}rypt6: processes $inout0..$inout5 in parallel.
#
#   $p - "enc" or "dec"
#
# The first round on $inout0..$inout2 is interleaved with the initial
# key xor of the remaining blocks, after which control jumps into the
# middle of the loop body (label _aesni_${p}rypt6_inner).  A second
# label, _aesni_${p}rypt6_enter, is declared static and placed after
# the first half-round of the loop body.  $key and $rounds are modified
# by the emitted code.
sub aesni_generate6
{ my $p=shift;
  # code references instead of string eval, so failures are not hidden
  my ($aes,$aeslast)=(\&{"aes${p}"},\&{"aes${p}last"});
  my @blk=($inout0,$inout1,$inout2,$inout3,$inout4,$inout5);

    &function_begin_B("_aesni_${p}rypt6");
    &static_label("_aesni_${p}rypt6_enter");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);	# pxor does better here
	&pxor		($inout2,$rndkey0);
	&$aes		($inout0,$rndkey1);
	&pxor		($inout3,$rndkey0);
	&pxor		($inout4,$rndkey0);
	&$aes		($inout1,$rndkey1);
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&$aes		($inout2,$rndkey1);
	&pxor		($inout5,$rndkey0);
	&$movekey	($rndkey0,&QWP(0,$key,$rounds));
	&add		($rounds,16);
	&jmp		(&label("_aesni_${p}rypt6_inner"));

    &set_label("${p}6_loop",16);
	foreach my $x (@blk[0..2]) { &$aes ($x,$rndkey1); }
    &set_label("_aesni_${p}rypt6_inner");
	foreach my $x (@blk[3..5]) { &$aes ($x,$rndkey1); }
    &set_label("_aesni_${p}rypt6_enter");
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	foreach my $x (@blk) { &$aes ($x,$rndkey0); }
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    &jnz		(&label("${p}6_loop"));

	foreach my $x (@blk) { &$aes ($x,$rndkey1); }
	foreach my $x (@blk) { &$aeslast ($x,$rndkey0); }
	&ret();
    &function_end_B("_aesni_${p}rypt6");
}
214368ff
AP
# Instantiate the interleaved helpers in ascending interleave order.
# The decrypt flavour is always emitted; the encrypt flavour only when
# $PREFIX is "aesni".
foreach my $generate (\&aesni_generate2,\&aesni_generate3,
		      \&aesni_generate4,\&aesni_generate6) {
	&$generate("enc") if ($PREFIX eq "aesni");
	&$generate("dec");
}
6c83629b 423\f
d64a7232 424if ($PREFIX eq "aesni") {
6c83629b 425######################################################################
d64a7232
AP
426# void aesni_ecb_encrypt (const void *in, void *out,
427# size_t length, const AES_KEY *key,
428# int enc);
d64a7232
AP
# ECB bulk routine.  Arguments are fetched with wparam(0..4) into
# $inp, $out, $len, $key and (the enc flag) $rounds_.  $len is rounded
# down to a multiple of 16; a zero enc flag selects decryption.
#
# Each direction works the same way: while at least 0x60 bytes remain,
# six blocks go through _aesni_${dir}rypt6, with the stores of one batch
# interleaved with the loads of the next; the remaining 1..5 blocks are
# dispatched to the 1x/2x/3x/4x/6x helper.  $key_/$rounds_ keep backup
# copies because the helpers modify $key and $rounds.  All eight XMM
# registers are zeroed before return.
&function_begin("aesni_ecb_encrypt");
	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&mov	($rounds_,&wparam(4));
	&and	($len,-16);
	&jz	(&label("ecb_ret"));
	&mov	($rounds,&DWP(240,$key));
	&test	($rounds_,$rounds_);
	&jz	(&label("ecb_decrypt"));

{ my @blk=($inout0,$inout1,$inout2,$inout3,$inout4,$inout5);

  # store the first $n blocks to consecutive 16-byte slots at $out
  my $store = sub { my $n=shift;
	foreach my $i (0..$n-1) { &movups (&QWP(16*$i,$out),$blk[$i]); }
  };

  # Emit one direction.  $dir is "enc" or "dec" and selects both the
  # label names (ecb_${dir}_*) and the helpers (_aesni_${dir}rypt*).
  # With $falls_through set, the final 4-block case does not jump to
  # ecb_ret, because that label follows immediately.
  my $ecb_half = sub { my ($dir,$falls_through)=@_;

	&mov	($key_,$key);		# backup $key
	&mov	($rounds_,$rounds);	# backup $rounds
	&cmp	($len,0x60);
	&jb	(&label("ecb_${dir}_tail"));

	foreach my $i (0..5) { &movdqu ($blk[$i],&QWP(16*$i,$inp)); }
	&lea	($inp,&DWP(0x60,$inp));
	&sub	($len,0x60);
	&jmp	(&label("ecb_${dir}_loop6_enter"));

    &set_label("ecb_${dir}_loop6",16);
	# write out the previous batch while loading the next one
	foreach my $i (0..4) {
	    &movups	(&QWP(16*$i,$out),$blk[$i]);
	    &movdqu	($blk[$i],&QWP(16*$i,$inp));
	}
	&movups	(&QWP(0x50,$out),$inout5);
	&lea	($out,&DWP(0x60,$out));
	&movdqu	($inout5,&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
    &set_label("ecb_${dir}_loop6_enter");

	&call	("_aesni_${dir}rypt6");

	&mov	($key,$key_);		# restore $key
	&mov	($rounds,$rounds_);	# restore $rounds
	&sub	($len,0x60);
	&jnc	(&label("ecb_${dir}_loop6"));

	&$store	(6);
	&lea	($out,&DWP(0x60,$out));
	&add	($len,0x60);
	&jz	(&label("ecb_ret"));

    &set_label("ecb_${dir}_tail");
	# 1..5 blocks left: load them while deciding which helper to use
	&movups	($inout0,&QWP(0,$inp));
	&cmp	($len,0x20);
	&jb	(&label("ecb_${dir}_one"));
	&movups	($inout1,&QWP(0x10,$inp));
	&je	(&label("ecb_${dir}_two"));
	&movups	($inout2,&QWP(0x20,$inp));
	&cmp	($len,0x40);
	&jb	(&label("ecb_${dir}_three"));
	&movups	($inout3,&QWP(0x30,$inp));
	&je	(&label("ecb_${dir}_four"));
	&movups	($inout4,&QWP(0x40,$inp));
	&xorps	($inout5,$inout5);	# 6th lane is unused, feed zeros
	&call	("_aesni_${dir}rypt6");
	&$store	(5);
	&jmp	(&label("ecb_ret"));

    &set_label("ecb_${dir}_one",16);
	if ($inline)
	{   &aesni_inline_generate1($dir);	}
	else
	{   &call	("_aesni_${dir}rypt1");	}
	&$store	(1);
	&jmp	(&label("ecb_ret"));

	foreach my $case (["two",2],["three",3],["four",4]) {
	    my ($name,$n)=@$case;
	    &set_label("ecb_${dir}_$name",16);
	    &call	("_aesni_${dir}rypt$n");
	    &$store	($n);
	    &jmp	(&label("ecb_ret"))
				unless ($falls_through && $name eq "four");
	}
  };

  &$ecb_half("enc",0);
######################################################################
  &set_label("ecb_decrypt",16);
  &$ecb_half("dec",1);
}

&set_label("ecb_ret");
	foreach my $i (0..7) {		# clear register bank
	    &pxor	("xmm$i","xmm$i");
	}
&function_end("aesni_ecb_encrypt");
6c83629b
AP
649\f
650######################################################################
d7d119a3
AP
651# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
652# size_t blocks, const AES_KEY *key,
653# const char *ivec,char *cmac);
654#
655# Handles only complete blocks, operates on 64-bit counter and
656# does not update *ivec! Nor does it finalize CMAC value
657# (see engine/eng_aesni.c for details)
6c83629b 658#
# aesni_ccm64_encrypt_blocks / aesni_ccm64_decrypt_blocks.
#
# Arguments are fetched with wparam(0..5): in, out, blocks, key, ivec
# and cmac.  Each block is processed with two AES streams running side
# by side: one encrypts the counter block ($inout0), the other updates
# the MAC ($cmac, which lives in $inout1's register).
#
# Stack frame after alignment:
#    0	pshufb byte-swap mask
#   16	counter increment vector (1,0,0,0)
#   48	saved %esp
{ my $cmac=$inout1;

  # Prologue shared by both functions: fetch arguments, build the
  # aligned frame, load ivec/cmac and the round count.
  my $ccm64_prologue = sub {
	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&mov	($rounds_,&wparam(4));
	&mov	($rounds,&wparam(5));
	&mov	($key_,"esp");
	&sub	("esp",60);
	&and	("esp",-16);			# align stack
	&mov	(&DWP(48,"esp"),$key_);

	&movdqu	($ivec,&QWP(0,$rounds_));	# load ivec
	&movdqu	($cmac,&QWP(0,$rounds));	# load cmac
	&mov	($rounds,&DWP(240,$key));

	# compose byte-swap control mask for pshufb on stack
	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
	&mov	(&DWP(4,"esp"),0x08090a0b);
	&mov	(&DWP(8,"esp"),0x04050607);
	&mov	(&DWP(12,"esp"),0x00010203);

	# compose counter increment vector on stack
	&mov	($rounds_,1);
	&xor	($key_,$key_);
	&mov	(&DWP(16,"esp"),$rounds_);
	&mov	(&DWP(20,"esp"),$key_);
	&mov	(&DWP(24,"esp"),$key_);
	&mov	(&DWP(28,"esp"),$key_);
  };

  # Epilogue shared by both functions: restore %esp, write the MAC
  # back through the 6th argument and wipe the XMM registers.
  my $ccm64_epilogue = sub {
	&mov	("esp",&DWP(48,"esp"));
	&mov	($out,&wparam(5));
	&movups	(&QWP(0,$out),$cmac);

	foreach my $i (0..7) {			# clear register bank
	    &pxor	("xmm$i","xmm$i");
	}
  };

&function_begin("aesni_ccm64_encrypt_blocks");
	&$ccm64_prologue();

	&shl	($rounds,4);
	&mov	($rounds_,16);
	&lea	($key_,&DWP(0,$key));
	&movdqa	($inout3,&QWP(0,"esp"));
	&movdqa	($inout0,$ivec);
	&lea	($key,&DWP(32,$key,$rounds));
	&sub	($rounds_,$rounds);
	&pshufb	($ivec,$inout3);

&set_label("ccm64_enc_outer");
	&$movekey	($rndkey0,&QWP(0,$key_));
	&mov		($rounds,$rounds_);
	&movups		($in0,&QWP(0,$inp));

	&xorps		($inout0,$rndkey0);
	&$movekey	($rndkey1,&QWP(16,$key_));
	&xorps		($rndkey0,$in0);
	&xorps		($cmac,$rndkey0);		# cmac^=inp
	&$movekey	($rndkey0,&QWP(32,$key_));

&set_label("ccm64_enc2_loop");
	&aesenc		($inout0,$rndkey1);
	&aesenc		($cmac,$rndkey1);
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	&aesenc		($inout0,$rndkey0);
	&aesenc		($cmac,$rndkey0);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("ccm64_enc2_loop"));
	&aesenc		($inout0,$rndkey1);
	&aesenc		($cmac,$rndkey1);
	&paddq		($ivec,&QWP(16,"esp"));
	&dec		($len);		# flags are consumed by the jnz below
	&aesenclast	($inout0,$rndkey0);
	&aesenclast	($cmac,$rndkey0);

	&lea	($inp,&DWP(16,$inp));
	&xorps	($in0,$inout0);			# inp^=E(ivec)
	&movdqa	($inout0,$ivec);
	&movups	(&QWP(0,$out),$in0);		# save output
	&pshufb	($inout0,$inout3);
	&lea	($out,&DWP(16,$out));
	&jnz	(&label("ccm64_enc_outer"));

	&$ccm64_epilogue();
&function_end("aesni_ccm64_encrypt_blocks");

&function_begin("aesni_ccm64_decrypt_blocks");
	&$ccm64_prologue();

	&movdqa	($inout3,&QWP(0,"esp"));	# bswap mask
	&movdqa	($inout0,$ivec);

	&mov	($key_,$key);
	&mov	($rounds_,$rounds);

	&pshufb	($ivec,$inout3);
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}
	&shl	($rounds_,4);
	&mov	($rounds,16);
	&movups	($in0,&QWP(0,$inp));		# load inp
	&paddq	($ivec,&QWP(16,"esp"));
	&lea	($inp,&QWP(16,$inp));
	&sub	($rounds,$rounds_);
	&lea	($key,&DWP(32,$key_,$rounds_));
	&mov	($rounds_,$rounds);
	&jmp	(&label("ccm64_dec_outer"));

&set_label("ccm64_dec_outer",16);
	&xorps	($in0,$inout0);			# inp ^= E(ivec)
	&movdqa	($inout0,$ivec);
	&movups	(&QWP(0,$out),$in0);		# save output
	&lea	($out,&DWP(16,$out));
	&pshufb	($inout0,$inout3);

	&sub	($len,1);
	&jz	(&label("ccm64_dec_break"));

	&$movekey	($rndkey0,&QWP(0,$key_));
	&mov		($rounds,$rounds_);
	&$movekey	($rndkey1,&QWP(16,$key_));
	&xorps		($in0,$rndkey0);
	&xorps		($inout0,$rndkey0);
	&xorps		($cmac,$in0);		# cmac^=out
	&$movekey	($rndkey0,&QWP(32,$key_));

&set_label("ccm64_dec2_loop");
	&aesenc		($inout0,$rndkey1);
	&aesenc		($cmac,$rndkey1);
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	&aesenc		($inout0,$rndkey0);
	&aesenc		($cmac,$rndkey0);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("ccm64_dec2_loop"));
	&movups		($in0,&QWP(0,$inp));	# load inp
	&paddq		($ivec,&QWP(16,"esp"));
	&aesenc		($inout0,$rndkey1);
	&aesenc		($cmac,$rndkey1);
	&aesenclast	($inout0,$rndkey0);
	&aesenclast	($cmac,$rndkey0);
	&lea		($inp,&QWP(16,$inp));
	&jmp		(&label("ccm64_dec_outer"));

&set_label("ccm64_dec_break",16);
	# last block: fold the final output into the MAC, cmac=E(cmac^out)
	&mov	($rounds,&DWP(240,$key_));
	&mov	($key,$key_);
	if ($inline)
	{   &aesni_inline_generate1("enc",$cmac,$in0);	}
	else
	{   # _aesni_encrypt1 works on $inout0 only, so the MAC has to be
	    # moved there and back; $inout0 holds nothing live any more.
	    &xorps	($cmac,$in0);
	    &movdqa	($inout0,$cmac);
	    &call	("_aesni_encrypt1");
	    &movdqa	($cmac,$inout0);
	}

	&$ccm64_epilogue();
&function_end("aesni_ccm64_decrypt_blocks");
}
d7d119a3
AP
857\f
858######################################################################
6c83629b
AP
859# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
860# size_t blocks, const AES_KEY *key,
861# const char *ivec);
d7d119a3
AP
862#
863# Handles only complete blocks, operates on 32-bit counter and
d8ba0dc9 864# does not update *ivec! (see crypto/modes/ctr128.c for details)
d7d119a3 865#
f8501464
AP
866# stack layout:
867# 0 pshufb mask
868# 16 vector addend: 0,6,6,6
869# 32 counter-less ivec
870# 48 1st triplet of counter vector
871# 64 2nd triplet of counter vector
872# 80 saved %esp
873
6c83629b
AP
# Emits aesni_ctr32_encrypt_blocks. The five arguments are fetched with
# wparam(0..4): input pointer, output pointer, block count, key schedule
# and ivec pointer. Six counter blocks are processed per main-loop
# iteration; one to five leftover blocks are handled by the tail code.
874&function_begin("aesni_ctr32_encrypt_blocks");
875 &mov ($inp,&wparam(0));
876 &mov ($out,&wparam(1));
877 &mov ($len,&wparam(2));
878 &mov ($key,&wparam(3));
# $rounds_ temporarily carries the ivec pointer (5th argument)
879 &mov ($rounds_,&wparam(4));
# carve out a 16-byte aligned scratch frame; the caller's %esp is parked
# at 80(%esp) and restored right before function_end
880 &mov ($key_,"esp");
f8501464 881 &sub ("esp",88);
6c83629b 882 &and ("esp",-16); # align stack
f8501464 883 &mov (&DWP(80,"esp"),$key_);
6c83629b 884
d7d119a3
AP
# a single block bypasses all of the counter-vector setup below
885 &cmp ($len,1);
886 &je (&label("ctr32_one_shortcut"));
887
f8501464 888 &movdqu ($inout5,&QWP(0,$rounds_)); # load ivec
6c83629b
AP
889
890 # compose byte-swap control mask for pshufb on stack
# (the four dwords below make pshufb reverse all 16 bytes of a vector)
891 &mov (&DWP(0,"esp"),0x0c0d0e0f);
892 &mov (&DWP(4,"esp"),0x08090a0b);
893 &mov (&DWP(8,"esp"),0x04050607);
894 &mov (&DWP(12,"esp"),0x00010203);
895
896 # compose counter increment vector on stack
# dwords 0..2 receive 6 (blocks per iteration), dword 3 receives 0
f8501464 897 &mov ($rounds,6);
6c83629b
AP
898 &xor ($key_,$key_);
899 &mov (&DWP(16,"esp"),$rounds);
900 &mov (&DWP(20,"esp"),$rounds);
901 &mov (&DWP(24,"esp"),$rounds);
902 &mov (&DWP(28,"esp"),$key_);
903
f8501464
AP
# $key_ is zero here, so the pinsrd blanks dword 3 of the ivec copy
904 &pextrd ($rounds_,$inout5,3); # pull 32-bit counter
905 &pinsrd ($inout5,$key_,3); # wipe 32-bit counter
6c83629b
AP
906
907 &mov ($rounds,&DWP(240,$key)); # key->rounds
6c83629b 908
f8501464 909 # compose 2 vectors of 3x32-bit counters
# $rndkey0 gets counter, counter+1, counter+2 and $rndkey1 gets
# counter+3, counter+4, counter+5 in dwords 0..2; the values are kept in
# byte-swapped (bswap'ed) form on the stack so that paddd can bump them
6c83629b 910 &bswap ($rounds_);
f8501464 911 &pxor ($rndkey0,$rndkey0);
d8ba0dc9 912 &pxor ($rndkey1,$rndkey1);
f8501464 913 &movdqa ($inout0,&QWP(0,"esp")); # load byte-swap mask
d8ba0dc9 914 &pinsrd ($rndkey0,$rounds_,0);
f8501464 915 &lea ($key_,&DWP(3,$rounds_));
d8ba0dc9 916 &pinsrd ($rndkey1,$key_,0);
6c83629b 917 &inc ($rounds_);
d8ba0dc9 918 &pinsrd ($rndkey0,$rounds_,1);
f8501464 919 &inc ($key_);
d8ba0dc9 920 &pinsrd ($rndkey1,$key_,1);
6c83629b 921 &inc ($rounds_);
d8ba0dc9 922 &pinsrd ($rndkey0,$rounds_,2);
f8501464 923 &inc ($key_);
d8ba0dc9
AP
924 &pinsrd ($rndkey1,$key_,2);
925 &movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet
f8501464 926 &pshufb ($rndkey0,$inout0); # byte swap
d8ba0dc9
AP
927 &movdqu ($inout4,&QWP(0,$key)); # key[0]
928 &movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet
929 &pshufb ($rndkey1,$inout0); # byte swap
f8501464 930
d8ba0dc9
AP
# after the full 16-byte reversal the first counter of a triplet sits in
# dword 3, the second in dword 2 and the third in dword 1; pshufd with
# n<<6 copies dword n into dword 3 and dword 0 (zero) everywhere else
931 &pshufd ($inout0,$rndkey0,3<<6); # place counter to upper dword
932 &pshufd ($inout1,$rndkey0,2<<6);
f8501464
AP
933 &cmp ($len,6);
934 &jb (&label("ctr32_tail"));
d8ba0dc9
AP
# main-loop setup: $rounds_ becomes 16-16*rounds, $key is advanced to
# key+32+16*rounds and the original key pointer is kept in $key_
# NOTE(review): this "twisted" form is what _aesni_encrypt6_enter is
# presumably written to consume; that routine is outside this view
935 &pxor ($inout5,$inout4); # counter-less ivec^key[0]
936 &shl ($rounds,4);
937 &mov ($rounds_,16);
938 &movdqa (&QWP(32,"esp"),$inout5); # save counter-less ivec^key[0]
f8501464 939 &mov ($key_,$key); # backup $key
d8ba0dc9
AP
940 &sub ($rounds_,$rounds); # backup twisted $rounds
941 &lea ($key,&DWP(32,$key,$rounds));
f8501464
AP
942 &sub ($len,6);
943 &jmp (&label("ctr32_loop6"));
944
945&set_label("ctr32_loop6",16);
d8ba0dc9
AP
946 # inlining _aesni_encrypt6's prologue gives ~6% improvement...
# $inout0 and $inout1 were prepared before entry (or at the bottom of the
# previous iteration); the remaining four counter blocks are built here,
# merged with ivec^key[0] and run through the first aesenc round
947 &pshufd ($inout2,$rndkey0,1<<6);
948 &movdqa ($rndkey0,&QWP(32,"esp")); # pull counter-less ivec
949 &pshufd ($inout3,$rndkey1,3<<6);
950 &pxor ($inout0,$rndkey0); # merge counter-less ivec
951 &pshufd ($inout4,$rndkey1,2<<6);
d7d119a3 952 &pxor ($inout1,$rndkey0);
d8ba0dc9
AP
953 &pshufd ($inout5,$rndkey1,1<<6);
954 &$movekey ($rndkey1,&QWP(16,$key_));
f8501464 955 &pxor ($inout2,$rndkey0);
f8501464 956 &pxor ($inout3,$rndkey0);
d8ba0dc9 957 &aesenc ($inout0,$rndkey1);
f8501464 958 &pxor ($inout4,$rndkey0);
f8501464 959 &pxor ($inout5,$rndkey0);
d8ba0dc9
AP
960 &aesenc ($inout1,$rndkey1);
961 &$movekey ($rndkey0,&QWP(32,$key_));
962 &mov ($rounds,$rounds_);
963 &aesenc ($inout2,$rndkey1);
964 &aesenc ($inout3,$rndkey1);
f8501464 965 &aesenc ($inout4,$rndkey1);
f8501464 966 &aesenc ($inout5,$rndkey1);
d7d119a3 967
f8501464
AP
968 &call (&label("_aesni_encrypt6_enter"));
969
# xor the six keystream blocks with the input and store them, interleaved
# with advancing both saved counter triplets by the addend at 16(%esp)
970 &movups ($rndkey1,&QWP(0,$inp));
971 &movups ($rndkey0,&QWP(0x10,$inp));
972 &xorps ($inout0,$rndkey1);
973 &movups ($rndkey1,&QWP(0x20,$inp));
974 &xorps ($inout1,$rndkey0);
975 &movups (&QWP(0,$out),$inout0);
976 &movdqa ($rndkey0,&QWP(16,"esp")); # load increment
977 &xorps ($inout2,$rndkey1);
d8ba0dc9 978 &movdqa ($rndkey1,&QWP(64,"esp")); # load 2nd triplet
f8501464
AP
979 &movups (&QWP(0x10,$out),$inout1);
980 &movups (&QWP(0x20,$out),$inout2);
981
d8ba0dc9
AP
982 &paddd ($rndkey1,$rndkey0); # 2nd triplet increment
983 &paddd ($rndkey0,&QWP(48,"esp")); # 1st triplet increment
f8501464
AP
984 &movdqa ($inout0,&QWP(0,"esp")); # load byte swap mask
985
986 &movups ($inout1,&QWP(0x30,$inp));
987 &movups ($inout2,&QWP(0x40,$inp));
988 &xorps ($inout3,$inout1);
989 &movups ($inout1,&QWP(0x50,$inp));
990 &lea ($inp,&DWP(0x60,$inp));
d8ba0dc9
AP
991 &movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet
992 &pshufb ($rndkey0,$inout0); # byte swap
f8501464
AP
993 &xorps ($inout4,$inout2);
994 &movups (&QWP(0x30,$out),$inout3);
995 &xorps ($inout5,$inout1);
d8ba0dc9
AP
996 &movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet
997 &pshufb ($rndkey1,$inout0); # byte swap
f8501464 998 &movups (&QWP(0x40,$out),$inout4);
# pre-build the first two counter blocks for the next pass (or the tail)
d8ba0dc9 999 &pshufd ($inout0,$rndkey0,3<<6);
f8501464
AP
1000 &movups (&QWP(0x50,$out),$inout5);
1001 &lea ($out,&DWP(0x60,$out));
d7d119a3 1002
d8ba0dc9 1003 &pshufd ($inout1,$rndkey0,2<<6);
f8501464
AP
1004 &sub ($len,6);
1005 &jnc (&label("ctr32_loop6"));
6c83629b 1006
f8501464
AP
# $len went negative; add the 6 back to get the 0..5 leftover blocks and
# undo the main-loop twists on $key/$rounds before entering the tail
1007 &add ($len,6);
1008 &jz (&label("ctr32_ret"));
d8ba0dc9 1009 &movdqu ($inout5,&QWP(0,$key_));
f8501464 1010 &mov ($key,$key_);
d8ba0dc9
AP
# key[0] ^ (ivec^key[0]) leaves the plain counter-less ivec in $inout5
1011 &pxor ($inout5,&QWP(32,"esp")); # restore count-less ivec
1012 &mov ($rounds,&DWP(240,$key_)); # restore $rounds
6c83629b
AP
1013
1014&set_label("ctr32_tail");
# 1..5 blocks remain. Counter blocks are built only as far as needed and
# each is or'ed with the counter-less ivec held in $inout5. The je
# branches reuse the flags of the preceding cmp: the pshufd/por in
# between are SSE instructions and leave EFLAGS untouched.
f8501464 1015 &por ($inout0,$inout5);
d7d119a3 1016 &cmp ($len,2);
6c83629b 1017 &jb (&label("ctr32_one"));
6c83629b 1018
d8ba0dc9 1019 &pshufd ($inout2,$rndkey0,1<<6);
f8501464
AP
1020 &por ($inout1,$inout5);
1021 &je (&label("ctr32_two"));
6c83629b 1022
d8ba0dc9 1023 &pshufd ($inout3,$rndkey1,3<<6);
f8501464
AP
1024 &por ($inout2,$inout5);
1025 &cmp ($len,4);
1026 &jb (&label("ctr32_three"));
1027
d8ba0dc9 1028 &pshufd ($inout4,$rndkey1,2<<6);
f8501464
AP
1029 &por ($inout3,$inout5);
1030 &je (&label("ctr32_four"));
1031
# five blocks: the 6-block primitive is used and only five results are
# stored ($inout5 is processed too but its output is never written)
1032 &por ($inout4,$inout5);
1033 &call ("_aesni_encrypt6");
1034 &movups ($rndkey1,&QWP(0,$inp));
1035 &movups ($rndkey0,&QWP(0x10,$inp));
1036 &xorps ($inout0,$rndkey1);
1037 &movups ($rndkey1,&QWP(0x20,$inp));
1038 &xorps ($inout1,$rndkey0);
1039 &movups ($rndkey0,&QWP(0x30,$inp));
1040 &xorps ($inout2,$rndkey1);
1041 &movups ($rndkey1,&QWP(0x40,$inp));
1042 &xorps ($inout3,$rndkey0);
1043 &movups (&QWP(0,$out),$inout0);
1044 &xorps ($inout4,$rndkey1);
1045 &movups (&QWP(0x10,$out),$inout1);
1046 &movups (&QWP(0x20,$out),$inout2);
1047 &movups (&QWP(0x30,$out),$inout3);
1048 &movups (&QWP(0x40,$out),$inout4);
6c83629b
AP
1049 &jmp (&label("ctr32_ret"));
1050
# single-block entry taken straight from the prologue: the ivec is used
# as the counter block unmodified ($rounds_ still holds the ivec pointer)
d7d119a3 1051&set_label("ctr32_one_shortcut",16);
f8501464 1052 &movups ($inout0,&QWP(0,$rounds_)); # load ivec
d7d119a3 1053 &mov ($rounds,&DWP(240,$key));
609b0852 1054
d7d119a3 1055&set_label("ctr32_one");
6c83629b
AP
1056 if ($inline)
1057 { &aesni_inline_generate1("enc"); }
1058 else
1059 { &call ("_aesni_encrypt1"); }
f8501464
AP
1060 &movups ($in0,&QWP(0,$inp));
1061 &xorps ($in0,$inout0);
1062 &movups (&QWP(0,$out),$in0);
6c83629b 1063 &jmp (&label("ctr32_ret"));
d64a7232 1064
6c83629b 1065&set_label("ctr32_two",16);
214368ff 1066 &call ("_aesni_encrypt2");
f8501464
AP
1067 &movups ($inout3,&QWP(0,$inp));
1068 &movups ($inout4,&QWP(0x10,$inp));
1069 &xorps ($inout0,$inout3);
1070 &xorps ($inout1,$inout4);
1071 &movups (&QWP(0,$out),$inout0);
1072 &movups (&QWP(0x10,$out),$inout1);
6c83629b
AP
1073 &jmp (&label("ctr32_ret"));
1074
1075&set_label("ctr32_three",16);
1076 &call ("_aesni_encrypt3");
f8501464
AP
1077 &movups ($inout3,&QWP(0,$inp));
1078 &movups ($inout4,&QWP(0x10,$inp));
1079 &xorps ($inout0,$inout3);
1080 &movups ($inout5,&QWP(0x20,$inp));
1081 &xorps ($inout1,$inout4);
1082 &movups (&QWP(0,$out),$inout0);
1083 &xorps ($inout2,$inout5);
1084 &movups (&QWP(0x10,$out),$inout1);
1085 &movups (&QWP(0x20,$out),$inout2);
1086 &jmp (&label("ctr32_ret"));
1087
1088&set_label("ctr32_four",16);
1089 &call ("_aesni_encrypt4");
1090 &movups ($inout4,&QWP(0,$inp));
1091 &movups ($inout5,&QWP(0x10,$inp));
1092 &movups ($rndkey1,&QWP(0x20,$inp));
1093 &xorps ($inout0,$inout4);
1094 &movups ($rndkey0,&QWP(0x30,$inp));
1095 &xorps ($inout1,$inout5);
1096 &movups (&QWP(0,$out),$inout0);
1097 &xorps ($inout2,$rndkey1);
1098 &movups (&QWP(0x10,$out),$inout1);
1099 &xorps ($inout3,$rndkey0);
1100 &movups (&QWP(0x20,$out),$inout2);
1101 &movups (&QWP(0x30,$out),$inout3);
6c83629b
AP
1102
1103&set_label("ctr32_ret");
23f6eec7
AP
# common exit: zero all eight xmm registers and the stack slots at 32, 48
# and 64; the mask and addend slots at 0 and 16 are not wiped here
1104 &pxor ("xmm0","xmm0"); # clear register bank
1105 &pxor ("xmm1","xmm1");
1106 &pxor ("xmm2","xmm2");
1107 &pxor ("xmm3","xmm3");
1108 &pxor ("xmm4","xmm4");
1109 &movdqa (&QWP(32,"esp"),"xmm0"); # clear stack
1110 &pxor ("xmm5","xmm5");
1111 &movdqa (&QWP(48,"esp"),"xmm0");
1112 &pxor ("xmm6","xmm6");
1113 &movdqa (&QWP(64,"esp"),"xmm0");
1114 &pxor ("xmm7","xmm7");
f8501464 1115 &mov ("esp",&DWP(80,"esp"));
6c83629b 1116&function_end("aesni_ctr32_encrypt_blocks");
f8501464
AP
1117\f
1118######################################################################
1119# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1120# const AES_KEY *key1, const AES_KEY *key2,
1121# const unsigned char iv[16]);
1122#
1123{ my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1);
1124
# Emits aesni_xts_encrypt. The 16 bytes at wparam(5) are first encrypted
# with the key schedule at wparam(4) to form the initial tweak; the data
# at wparam(0) is then encrypted to wparam(1) with the key schedule at
# wparam(3), six blocks per main-loop iteration, followed by a 1..5 block
# tail and a byte-swapping "steal" step when wparam(2) is not a multiple
# of 16.
#
# Stack frame used below (offsets from the aligned %esp):
#   16*0..16*5  per-block tweaks
#   16*6        constant dwords 0x87,0,1,0 used to fold the carry back in
#   16*7+0      saved length (later overwritten with length%16)
#   16*7+4      caller's %esp
1125&function_begin("aesni_xts_encrypt");
1126 &mov ($key,&wparam(4)); # key2
1127 &mov ($inp,&wparam(5)); # clear-text tweak
1128
1129 &mov ($rounds,&DWP(240,$key)); # key2->rounds
1130 &movups ($inout0,&QWP(0,$inp));
1131 if ($inline)
1132 { &aesni_inline_generate1("enc"); }
1133 else
1134 { &call ("_aesni_encrypt1"); }
1135
1136 &mov ($inp,&wparam(0));
1137 &mov ($out,&wparam(1));
1138 &mov ($len,&wparam(2));
1139 &mov ($key,&wparam(3)); # key1
1140
1141 &mov ($key_,"esp");
1142 &sub ("esp",16*7+8);
1143 &mov ($rounds,&DWP(240,$key)); # key1->rounds
1144 &and ("esp",-16); # align stack
1145
1146 &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
1147 &mov (&DWP(16*6+4,"esp"),0);
1148 &mov (&DWP(16*6+8,"esp"),1);
1149 &mov (&DWP(16*6+12,"esp"),0);
1150 &mov (&DWP(16*7+0,"esp"),$len); # save original $len
1151 &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp
1152
# $tweak takes the encrypted tweak; $twtmp (zeroed, then pcmpgtd) ends up
# holding an all-ones dword wherever the matching tweak dword is negative
1153 &movdqa ($tweak,$inout0);
1154 &pxor ($twtmp,$twtmp);
1155 &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
1156 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1157
# work on whole blocks only; fewer than six of them go to the short path
1158 &and ($len,-16);
1159 &mov ($key_,$key); # backup $key
1160 &mov ($rounds_,$rounds); # backup $rounds
1161 &sub ($len,16*6);
1162 &jc (&label("xts_enc_short"));
1163
d8ba0dc9
AP
# main-loop setup: $rounds_ becomes 16-16*rounds and $key is advanced to
# key+32+16*rounds; the untouched key pointer stays in $key_
# NOTE(review): presumably the form _aesni_encrypt6_enter expects; that
# routine is outside this view
1164 &shl ($rounds,4);
1165 &mov ($rounds_,16);
1166 &sub ($rounds_,$rounds);
1167 &lea ($key,&DWP(32,$key,$rounds));
f8501464
AP
1168 &jmp (&label("xts_enc_loop6"));
1169
1170&set_label("xts_enc_loop6",16);
# Perl-time unrolling: emits four copies of "store current tweak at
# 16*$i, then advance it". One advance step is: pshufd 0x13 routes the
# sign masks to where carries belong, pand keeps 0x87 in dword 0 and 1 in
# dword 2, paddq doubles both 64-bit halves, pxor folds carry and residue
# back into the tweak.
1171 for ($i=0;$i<4;$i++) {
1172 &pshufd ($twres,$twtmp,0x13);
1173 &pxor ($twtmp,$twtmp);
1174 &movdqa (&QWP(16*$i,"esp"),$tweak);
1175 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1176 &pand ($twres,$twmask); # isolate carry and residue
1177 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1178 &pxor ($tweak,$twres);
1179 }
# fifth tweak is stored at 16*4 ($i is post-incremented to 5 here); the
# sixth is assembled directly in $inout5 and stored at 16*5 further down
1180 &pshufd ($inout5,$twtmp,0x13);
1181 &movdqa (&QWP(16*$i++,"esp"),$tweak);
1182 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1183 &$movekey ($rndkey0,&QWP(0,$key_));
1184 &pand ($inout5,$twmask); # isolate carry and residue
1185 &movups ($inout0,&QWP(0,$inp)); # load input
1186 &pxor ($inout5,$tweak);
1187
1188 # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
d8ba0dc9 1189 &mov ($rounds,$rounds_); # restore $rounds
f8501464
AP
1190 &movdqu ($inout1,&QWP(16*1,$inp));
1191 &xorps ($inout0,$rndkey0); # input^=rndkey[0]
1192 &movdqu ($inout2,&QWP(16*2,$inp));
1193 &pxor ($inout1,$rndkey0);
1194 &movdqu ($inout3,&QWP(16*3,$inp));
1195 &pxor ($inout2,$rndkey0);
1196 &movdqu ($inout4,&QWP(16*4,$inp));
1197 &pxor ($inout3,$rndkey0);
# sixth input block is staged in $rndkey1 because $inout5 still holds
# the sixth tweak until it has been saved
1198 &movdqu ($rndkey1,&QWP(16*5,$inp));
1199 &pxor ($inout4,$rndkey0);
1200 &lea ($inp,&DWP(16*6,$inp));
1201 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1202 &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
1203 &pxor ($inout5,$rndkey1);
1204
1205 &$movekey ($rndkey1,&QWP(16,$key_));
f8501464 1206 &pxor ($inout1,&QWP(16*1,"esp"));
f8501464 1207 &pxor ($inout2,&QWP(16*2,"esp"));
d8ba0dc9 1208 &aesenc ($inout0,$rndkey1);
f8501464 1209 &pxor ($inout3,&QWP(16*3,"esp"));
f8501464 1210 &pxor ($inout4,&QWP(16*4,"esp"));
d8ba0dc9 1211 &aesenc ($inout1,$rndkey1);
f8501464 1212 &pxor ($inout5,$rndkey0);
d8ba0dc9
AP
1213 &$movekey ($rndkey0,&QWP(32,$key_));
1214 &aesenc ($inout2,$rndkey1);
1215 &aesenc ($inout3,$rndkey1);
f8501464 1216 &aesenc ($inout4,$rndkey1);
f8501464
AP
1217 &aesenc ($inout5,$rndkey1);
1218 &call (&label("_aesni_encrypt6_enter"));
1219
# xor each result with its saved tweak and store it, interleaved with
# preparing the first tweak of the next iteration from the sixth one
1220 &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
1221 &pxor ($twtmp,$twtmp);
1222 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1223 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1224 &xorps ($inout1,&QWP(16*1,"esp"));
1225 &movups (&QWP(16*0,$out),$inout0); # write output
1226 &xorps ($inout2,&QWP(16*2,"esp"));
1227 &movups (&QWP(16*1,$out),$inout1);
1228 &xorps ($inout3,&QWP(16*3,"esp"));
1229 &movups (&QWP(16*2,$out),$inout2);
1230 &xorps ($inout4,&QWP(16*4,"esp"));
1231 &movups (&QWP(16*3,$out),$inout3);
1232 &xorps ($inout5,$tweak);
1233 &movups (&QWP(16*4,$out),$inout4);
1234 &pshufd ($twres,$twtmp,0x13);
1235 &movups (&QWP(16*5,$out),$inout5);
1236 &lea ($out,&DWP(16*6,$out));
# $twmask shares a register with $inout1, so the constant is reloaded
1237 &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
1238
1239 &pxor ($twtmp,$twtmp);
1240 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1241 &pand ($twres,$twmask); # isolate carry and residue
1242 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
f8501464
AP
1243 &pxor ($tweak,$twres);
1244
1245 &sub ($len,16*6);
1246 &jnc (&label("xts_enc_loop6"));
1247
d8ba0dc9 1248 &mov ($rounds,&DWP(240,$key_)); # restore $rounds
f8501464
AP
1249 &mov ($key,$key_); # restore $key
1250 &mov ($rounds_,$rounds);
1251
1252&set_label("xts_enc_short");
# $len is (whole-block bytes - 96) here; adding 96 back leaves 0..80.
# The je branches below reuse the flags of the preceding cmp: only SSE
# instructions sit in between and they leave EFLAGS untouched.
1253 &add ($len,16*6);
1254 &jz (&label("xts_enc_done6x"));
1255
1256 &movdqa ($inout3,$tweak); # put aside previous tweak
1257 &cmp ($len,0x20);
1258 &jb (&label("xts_enc_one"));
1259
1260 &pshufd ($twres,$twtmp,0x13);
1261 &pxor ($twtmp,$twtmp);
1262 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1263 &pand ($twres,$twmask); # isolate carry and residue
1264 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1265 &pxor ($tweak,$twres);
1266 &je (&label("xts_enc_two"));
1267
1268 &pshufd ($twres,$twtmp,0x13);
1269 &pxor ($twtmp,$twtmp);
1270 &movdqa ($inout4,$tweak); # put aside previous tweak
1271 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1272 &pand ($twres,$twmask); # isolate carry and residue
1273 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1274 &pxor ($tweak,$twres);
1275 &cmp ($len,0x40);
1276 &jb (&label("xts_enc_three"));
1277
1278 &pshufd ($twres,$twtmp,0x13);
1279 &pxor ($twtmp,$twtmp);
1280 &movdqa ($inout5,$tweak); # put aside previous tweak
1281 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1282 &pand ($twres,$twmask); # isolate carry and residue
1283 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1284 &pxor ($tweak,$twres);
1285 &movdqa (&QWP(16*0,"esp"),$inout3);
1286 &movdqa (&QWP(16*1,"esp"),$inout4);
1287 &je (&label("xts_enc_four"));
1288
# five blocks: tweaks 0..3 go to the stack, the fifth is built in $inout5
1289 &movdqa (&QWP(16*2,"esp"),$inout5);
1290 &pshufd ($inout5,$twtmp,0x13);
1291 &movdqa (&QWP(16*3,"esp"),$tweak);
1292 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1293 &pand ($inout5,$twmask); # isolate carry and residue
1294 &pxor ($inout5,$tweak);
1295
1296 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
1297 &movdqu ($inout1,&QWP(16*1,$inp));
1298 &movdqu ($inout2,&QWP(16*2,$inp));
1299 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1300 &movdqu ($inout3,&QWP(16*3,$inp));
1301 &pxor ($inout1,&QWP(16*1,"esp"));
1302 &movdqu ($inout4,&QWP(16*4,$inp));
1303 &pxor ($inout2,&QWP(16*2,"esp"));
1304 &lea ($inp,&DWP(16*5,$inp));
1305 &pxor ($inout3,&QWP(16*3,"esp"));
1306 &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
1307 &pxor ($inout4,$inout5);
1308
# the 6-block primitive is used; only five results are stored afterwards
1309 &call ("_aesni_encrypt6");
1310
1311 &movaps ($tweak,&QWP(16*4,"esp")); # last tweak
1312 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1313 &xorps ($inout1,&QWP(16*1,"esp"));
1314 &xorps ($inout2,&QWP(16*2,"esp"));
1315 &movups (&QWP(16*0,$out),$inout0); # write output
1316 &xorps ($inout3,&QWP(16*3,"esp"));
1317 &movups (&QWP(16*1,$out),$inout1);
1318 &xorps ($inout4,$tweak);
1319 &movups (&QWP(16*2,$out),$inout2);
1320 &movups (&QWP(16*3,$out),$inout3);
1321 &movups (&QWP(16*4,$out),$inout4);
1322 &lea ($out,&DWP(16*5,$out));
1323 &jmp (&label("xts_enc_done"));
1324
# each of the short cases below leaves the tweak of its last processed
# block in $tweak before jumping to xts_enc_done
1325&set_label("xts_enc_one",16);
1326 &movups ($inout0,&QWP(16*0,$inp)); # load input
1327 &lea ($inp,&DWP(16*1,$inp));
1328 &xorps ($inout0,$inout3); # input^=tweak
1329 if ($inline)
1330 { &aesni_inline_generate1("enc"); }
1331 else
1332 { &call ("_aesni_encrypt1"); }
1333 &xorps ($inout0,$inout3); # output^=tweak
1334 &movups (&QWP(16*0,$out),$inout0); # write output
1335 &lea ($out,&DWP(16*1,$out));
1336
1337 &movdqa ($tweak,$inout3); # last tweak
1338 &jmp (&label("xts_enc_done"));
1339
1340&set_label("xts_enc_two",16);
1341 &movaps ($inout4,$tweak); # put aside last tweak
1342
1343 &movups ($inout0,&QWP(16*0,$inp)); # load input
1344 &movups ($inout1,&QWP(16*1,$inp));
1345 &lea ($inp,&DWP(16*2,$inp));
1346 &xorps ($inout0,$inout3); # input^=tweak
1347 &xorps ($inout1,$inout4);
f8501464 1348
214368ff 1349 &call ("_aesni_encrypt2");
f8501464
AP
1350
1351 &xorps ($inout0,$inout3); # output^=tweak
1352 &xorps ($inout1,$inout4);
1353 &movups (&QWP(16*0,$out),$inout0); # write output
1354 &movups (&QWP(16*1,$out),$inout1);
1355 &lea ($out,&DWP(16*2,$out));
1356
1357 &movdqa ($tweak,$inout4); # last tweak
1358 &jmp (&label("xts_enc_done"));
1359
1360&set_label("xts_enc_three",16);
1361 &movaps ($inout5,$tweak); # put aside last tweak
1362 &movups ($inout0,&QWP(16*0,$inp)); # load input
1363 &movups ($inout1,&QWP(16*1,$inp));
1364 &movups ($inout2,&QWP(16*2,$inp));
1365 &lea ($inp,&DWP(16*3,$inp));
1366 &xorps ($inout0,$inout3); # input^=tweak
1367 &xorps ($inout1,$inout4);
1368 &xorps ($inout2,$inout5);
1369
1370 &call ("_aesni_encrypt3");
1371
1372 &xorps ($inout0,$inout3); # output^=tweak
1373 &xorps ($inout1,$inout4);
1374 &xorps ($inout2,$inout5);
1375 &movups (&QWP(16*0,$out),$inout0); # write output
1376 &movups (&QWP(16*1,$out),$inout1);
1377 &movups (&QWP(16*2,$out),$inout2);
1378 &lea ($out,&DWP(16*3,$out));
1379
1380 &movdqa ($tweak,$inout5); # last tweak
1381 &jmp (&label("xts_enc_done"));
1382
# four blocks: the first two tweaks were already written to 16*0 and 16*1
# before the branch; the third is in $inout5 and the fourth in $tweak
1383&set_label("xts_enc_four",16);
1384 &movaps ($inout4,$tweak); # put aside last tweak
1385
1386 &movups ($inout0,&QWP(16*0,$inp)); # load input
1387 &movups ($inout1,&QWP(16*1,$inp));
1388 &movups ($inout2,&QWP(16*2,$inp));
1389 &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
1390 &movups ($inout3,&QWP(16*3,$inp));
1391 &lea ($inp,&DWP(16*4,$inp));
1392 &xorps ($inout1,&QWP(16*1,"esp"));
1393 &xorps ($inout2,$inout5);
1394 &xorps ($inout3,$inout4);
1395
1396 &call ("_aesni_encrypt4");
1397
1398 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1399 &xorps ($inout1,&QWP(16*1,"esp"));
1400 &xorps ($inout2,$inout5);
1401 &movups (&QWP(16*0,$out),$inout0); # write output
1402 &xorps ($inout3,$inout4);
1403 &movups (&QWP(16*1,$out),$inout1);
1404 &movups (&QWP(16*2,$out),$inout2);
1405 &movups (&QWP(16*3,$out),$inout3);
1406 &lea ($out,&DWP(16*4,$out));
1407
1408 &movdqa ($tweak,$inout4); # last tweak
1409 &jmp (&label("xts_enc_done"));
1410
# reached when the whole-block count was an exact multiple of six: the
# main loop already advanced $tweak, so it is used as-is for the steal
1411&set_label("xts_enc_done6x",16); # $tweak is pre-calculated
1412 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1413 &and ($len,15);
1414 &jz (&label("xts_enc_ret"));
1415 &movdqa ($inout3,$tweak);
1416 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1417 &jmp (&label("xts_enc_steal"));
1418
# reached from the 1..5 block cases: $tweak is the tweak of the last
# processed block, so one more advance step is done into $inout3
1419&set_label("xts_enc_done",16);
1420 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1421 &pxor ($twtmp,$twtmp);
1422 &and ($len,15);
1423 &jz (&label("xts_enc_ret"));
1424
1425 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1426 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1427 &pshufd ($inout3,$twtmp,0x13);
1428 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1429 &pand ($inout3,&QWP(16*6,"esp")); # isolate carry and residue
1430 &pxor ($inout3,$tweak);
1431
# byte loop, run $len (= length%16) times: the next input byte replaces
# a byte of the last written output block at -16($out), and the byte it
# displaces is appended at 0($out); $rounds/$key serve as byte scratch
1432&set_label("xts_enc_steal");
1433 &movz ($rounds,&BP(0,$inp));
1434 &movz ($key,&BP(-16,$out));
1435 &lea ($inp,&DWP(1,$inp));
1436 &mov (&BP(-16,$out),&LB($rounds));
1437 &mov (&BP(0,$out),&LB($key));
1438 &lea ($out,&DWP(1,$out));
1439 &sub ($len,1);
1440 &jnz (&label("xts_enc_steal"));
1441
# step $out back by length%16 and encrypt the patched block in place
# with the tweak held in $inout3
1442 &sub ($out,&DWP(16*7+0,"esp")); # rewind $out
1443 &mov ($key,$key_); # restore $key
1444 &mov ($rounds,$rounds_); # restore $rounds
1445
1446 &movups ($inout0,&QWP(-16,$out)); # load input
1447 &xorps ($inout0,$inout3); # input^=tweak
1448 if ($inline)
1449 { &aesni_inline_generate1("enc"); }
1450 else
1451 { &call ("_aesni_encrypt1"); }
1452 &xorps ($inout0,$inout3); # output^=tweak
1453 &movups (&QWP(-16,$out),$inout0); # write output
1454
1455&set_label("xts_enc_ret");
23f6eec7
AP
# common exit: zero all eight xmm registers and the six tweak slots
# 16*0..16*5; the constant at 16*6 is not wiped here
1456 &pxor ("xmm0","xmm0"); # clear register bank
1457 &pxor ("xmm1","xmm1");
1458 &pxor ("xmm2","xmm2");
1459 &movdqa (&QWP(16*0,"esp"),"xmm0"); # clear stack
1460 &pxor ("xmm3","xmm3");
1461 &movdqa (&QWP(16*1,"esp"),"xmm0");
1462 &pxor ("xmm4","xmm4");
1463 &movdqa (&QWP(16*2,"esp"),"xmm0");
1464 &pxor ("xmm5","xmm5");
1465 &movdqa (&QWP(16*3,"esp"),"xmm0");
1466 &pxor ("xmm6","xmm6");
1467 &movdqa (&QWP(16*4,"esp"),"xmm0");
1468 &pxor ("xmm7","xmm7");
1469 &movdqa (&QWP(16*5,"esp"),"xmm0");
f8501464
AP
1470 &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
1471&function_end("aesni_xts_encrypt");
1472
1473&function_begin("aesni_xts_decrypt");
1474 &mov ($key,&wparam(4)); # key2
1475 &mov ($inp,&wparam(5)); # clear-text tweak
1476
1477 &mov ($rounds,&DWP(240,$key)); # key2->rounds
1478 &movups ($inout0,&QWP(0,$inp));
1479 if ($inline)
1480 { &aesni_inline_generate1("enc"); }
1481 else
1482 { &call ("_aesni_encrypt1"); }
1483
1484 &mov ($inp,&wparam(0));
1485 &mov ($out,&wparam(1));
1486 &mov ($len,&wparam(2));
1487 &mov ($key,&wparam(3)); # key1
1488
1489 &mov ($key_,"esp");
1490 &sub ("esp",16*7+8);
1491 &and ("esp",-16); # align stack
1492
1493 &xor ($rounds_,$rounds_); # if(len%16) len-=16;
1494 &test ($len,15);
1495 &setnz (&LB($rounds_));
1496 &shl ($rounds_,4);
1497 &sub ($len,$rounds_);
1498
1499 &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
1500 &mov (&DWP(16*6+4,"esp"),0);
1501 &mov (&DWP(16*6+8,"esp"),1);
1502 &mov (&DWP(16*6+12,"esp"),0);
1503 &mov (&DWP(16*7+0,"esp"),$len); # save original $len
1504 &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp
1505
1506 &mov ($rounds,&DWP(240,$key)); # key1->rounds
1507 &mov ($key_,$key); # backup $key
1508 &mov ($rounds_,$rounds); # backup $rounds
1509
1510 &movdqa ($tweak,$inout0);
1511 &pxor ($twtmp,$twtmp);
1512 &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
1513 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1514
1515 &and ($len,-16);
1516 &sub ($len,16*6);
1517 &jc (&label("xts_dec_short"));
1518
d8ba0dc9
AP
1519 &shl ($rounds,4);
1520 &mov ($rounds_,16);
1521 &sub ($rounds_,$rounds);
1522 &lea ($key,&DWP(32,$key,$rounds));
f8501464
AP
1523 &jmp (&label("xts_dec_loop6"));
1524
1525&set_label("xts_dec_loop6",16);
1526 for ($i=0;$i<4;$i++) {
1527 &pshufd ($twres,$twtmp,0x13);
1528 &pxor ($twtmp,$twtmp);
1529 &movdqa (&QWP(16*$i,"esp"),$tweak);
1530 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1531 &pand ($twres,$twmask); # isolate carry and residue
1532 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1533 &pxor ($tweak,$twres);
1534 }
1535 &pshufd ($inout5,$twtmp,0x13);
1536 &movdqa (&QWP(16*$i++,"esp"),$tweak);
1537 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1538 &$movekey ($rndkey0,&QWP(0,$key_));
1539 &pand ($inout5,$twmask); # isolate carry and residue
1540 &movups ($inout0,&QWP(0,$inp)); # load input
1541 &pxor ($inout5,$tweak);
1542
1543 # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
d8ba0dc9 1544 &mov ($rounds,$rounds_);
f8501464
AP
1545 &movdqu ($inout1,&QWP(16*1,$inp));
1546 &xorps ($inout0,$rndkey0); # input^=rndkey[0]
1547 &movdqu ($inout2,&QWP(16*2,$inp));
1548 &pxor ($inout1,$rndkey0);
1549 &movdqu ($inout3,&QWP(16*3,$inp));
1550 &pxor ($inout2,$rndkey0);
1551 &movdqu ($inout4,&QWP(16*4,$inp));
1552 &pxor ($inout3,$rndkey0);
1553 &movdqu ($rndkey1,&QWP(16*5,$inp));
1554 &pxor ($inout4,$rndkey0);
1555 &lea ($inp,&DWP(16*6,$inp));
1556 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1557 &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
1558 &pxor ($inout5,$rndkey1);
1559
1560 &$movekey ($rndkey1,&QWP(16,$key_));
f8501464 1561 &pxor ($inout1,&QWP(16*1,"esp"));
f8501464 1562 &pxor ($inout2,&QWP(16*2,"esp"));
d8ba0dc9 1563 &aesdec ($inout0,$rndkey1);
f8501464 1564 &pxor ($inout3,&QWP(16*3,"esp"));
f8501464 1565 &pxor ($inout4,&QWP(16*4,"esp"));
d8ba0dc9 1566 &aesdec ($inout1,$rndkey1);
f8501464 1567 &pxor ($inout5,$rndkey0);
d8ba0dc9
AP
1568 &$movekey ($rndkey0,&QWP(32,$key_));
1569 &aesdec ($inout2,$rndkey1);
1570 &aesdec ($inout3,$rndkey1);
f8501464 1571 &aesdec ($inout4,$rndkey1);
f8501464
AP
1572 &aesdec ($inout5,$rndkey1);
1573 &call (&label("_aesni_decrypt6_enter"));
1574
1575 &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
1576 &pxor ($twtmp,$twtmp);
1577 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1578 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1579 &xorps ($inout1,&QWP(16*1,"esp"));
1580 &movups (&QWP(16*0,$out),$inout0); # write output
1581 &xorps ($inout2,&QWP(16*2,"esp"));
1582 &movups (&QWP(16*1,$out),$inout1);
1583 &xorps ($inout3,&QWP(16*3,"esp"));
1584 &movups (&QWP(16*2,$out),$inout2);
1585 &xorps ($inout4,&QWP(16*4,"esp"));
1586 &movups (&QWP(16*3,$out),$inout3);
1587 &xorps ($inout5,$tweak);
1588 &movups (&QWP(16*4,$out),$inout4);
1589 &pshufd ($twres,$twtmp,0x13);
1590 &movups (&QWP(16*5,$out),$inout5);
1591 &lea ($out,&DWP(16*6,$out));
1592 &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
1593
1594 &pxor ($twtmp,$twtmp);
1595 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1596 &pand ($twres,$twmask); # isolate carry and residue
1597 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
f8501464
AP
1598 &pxor ($tweak,$twres);
1599
1600 &sub ($len,16*6);
1601 &jnc (&label("xts_dec_loop6"));
1602
d8ba0dc9 1603 &mov ($rounds,&DWP(240,$key_)); # restore $rounds
f8501464
AP
1604 &mov ($key,$key_); # restore $key
1605 &mov ($rounds_,$rounds);
1606
1607&set_label("xts_dec_short");
1608 &add ($len,16*6);
1609 &jz (&label("xts_dec_done6x"));
1610
1611 &movdqa ($inout3,$tweak); # put aside previous tweak
1612 &cmp ($len,0x20);
1613 &jb (&label("xts_dec_one"));
1614
1615 &pshufd ($twres,$twtmp,0x13);
1616 &pxor ($twtmp,$twtmp);
1617 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1618 &pand ($twres,$twmask); # isolate carry and residue
1619 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1620 &pxor ($tweak,$twres);
1621 &je (&label("xts_dec_two"));
1622
1623 &pshufd ($twres,$twtmp,0x13);
1624 &pxor ($twtmp,$twtmp);
1625 &movdqa ($inout4,$tweak); # put aside previous tweak
1626 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1627 &pand ($twres,$twmask); # isolate carry and residue
1628 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1629 &pxor ($tweak,$twres);
1630 &cmp ($len,0x40);
1631 &jb (&label("xts_dec_three"));
1632
1633 &pshufd ($twres,$twtmp,0x13);
1634 &pxor ($twtmp,$twtmp);
1635 &movdqa ($inout5,$tweak); # put aside previous tweak
1636 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1637 &pand ($twres,$twmask); # isolate carry and residue
1638 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1639 &pxor ($tweak,$twres);
1640 &movdqa (&QWP(16*0,"esp"),$inout3);
1641 &movdqa (&QWP(16*1,"esp"),$inout4);
1642 &je (&label("xts_dec_four"));
1643
1644 &movdqa (&QWP(16*2,"esp"),$inout5);
1645 &pshufd ($inout5,$twtmp,0x13);
1646 &movdqa (&QWP(16*3,"esp"),$tweak);
1647 &paddq ($tweak,$tweak); # &psllq($inout0,1);
1648 &pand ($inout5,$twmask); # isolate carry and residue
1649 &pxor ($inout5,$tweak);
1650
1651 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
1652 &movdqu ($inout1,&QWP(16*1,$inp));
1653 &movdqu ($inout2,&QWP(16*2,$inp));
1654 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1655 &movdqu ($inout3,&QWP(16*3,$inp));
1656 &pxor ($inout1,&QWP(16*1,"esp"));
1657 &movdqu ($inout4,&QWP(16*4,$inp));
1658 &pxor ($inout2,&QWP(16*2,"esp"));
1659 &lea ($inp,&DWP(16*5,$inp));
1660 &pxor ($inout3,&QWP(16*3,"esp"));
1661 &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
1662 &pxor ($inout4,$inout5);
1663
1664 &call ("_aesni_decrypt6");
1665
1666 &movaps ($tweak,&QWP(16*4,"esp")); # last tweak
1667 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1668 &xorps ($inout1,&QWP(16*1,"esp"));
1669 &xorps ($inout2,&QWP(16*2,"esp"));
1670 &movups (&QWP(16*0,$out),$inout0); # write output
1671 &xorps ($inout3,&QWP(16*3,"esp"));
1672 &movups (&QWP(16*1,$out),$inout1);
1673 &xorps ($inout4,$tweak);
1674 &movups (&QWP(16*2,$out),$inout2);
1675 &movups (&QWP(16*3,$out),$inout3);
1676 &movups (&QWP(16*4,$out),$inout4);
1677 &lea ($out,&DWP(16*5,$out));
1678 &jmp (&label("xts_dec_done"));
1679
1680&set_label("xts_dec_one",16);
1681 &movups ($inout0,&QWP(16*0,$inp)); # load input
1682 &lea ($inp,&DWP(16*1,$inp));
1683 &xorps ($inout0,$inout3); # input^=tweak
1684 if ($inline)
1685 { &aesni_inline_generate1("dec"); }
1686 else
1687 { &call ("_aesni_decrypt1"); }
1688 &xorps ($inout0,$inout3); # output^=tweak
1689 &movups (&QWP(16*0,$out),$inout0); # write output
1690 &lea ($out,&DWP(16*1,$out));
1691
1692 &movdqa ($tweak,$inout3); # last tweak
1693 &jmp (&label("xts_dec_done"));
1694
1695&set_label("xts_dec_two",16);
1696 &movaps ($inout4,$tweak); # put aside last tweak
1697
1698 &movups ($inout0,&QWP(16*0,$inp)); # load input
1699 &movups ($inout1,&QWP(16*1,$inp));
1700 &lea ($inp,&DWP(16*2,$inp));
1701 &xorps ($inout0,$inout3); # input^=tweak
1702 &xorps ($inout1,$inout4);
1703
214368ff 1704 &call ("_aesni_decrypt2");
f8501464
AP
1705
1706 &xorps ($inout0,$inout3); # output^=tweak
1707 &xorps ($inout1,$inout4);
1708 &movups (&QWP(16*0,$out),$inout0); # write output
1709 &movups (&QWP(16*1,$out),$inout1);
1710 &lea ($out,&DWP(16*2,$out));
1711
1712 &movdqa ($tweak,$inout4); # last tweak
1713 &jmp (&label("xts_dec_done"));
1714
1715&set_label("xts_dec_three",16);
1716 &movaps ($inout5,$tweak); # put aside last tweak
1717 &movups ($inout0,&QWP(16*0,$inp)); # load input
1718 &movups ($inout1,&QWP(16*1,$inp));
1719 &movups ($inout2,&QWP(16*2,$inp));
1720 &lea ($inp,&DWP(16*3,$inp));
1721 &xorps ($inout0,$inout3); # input^=tweak
1722 &xorps ($inout1,$inout4);
1723 &xorps ($inout2,$inout5);
1724
1725 &call ("_aesni_decrypt3");
1726
1727 &xorps ($inout0,$inout3); # output^=tweak
1728 &xorps ($inout1,$inout4);
1729 &xorps ($inout2,$inout5);
1730 &movups (&QWP(16*0,$out),$inout0); # write output
1731 &movups (&QWP(16*1,$out),$inout1);
1732 &movups (&QWP(16*2,$out),$inout2);
1733 &lea ($out,&DWP(16*3,$out));
1734
1735 &movdqa ($tweak,$inout5); # last tweak
1736 &jmp (&label("xts_dec_done"));
1737
1738&set_label("xts_dec_four",16);
1739 &movaps ($inout4,$tweak); # put aside last tweak
1740
1741 &movups ($inout0,&QWP(16*0,$inp)); # load input
1742 &movups ($inout1,&QWP(16*1,$inp));
1743 &movups ($inout2,&QWP(16*2,$inp));
1744 &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
1745 &movups ($inout3,&QWP(16*3,$inp));
1746 &lea ($inp,&DWP(16*4,$inp));
1747 &xorps ($inout1,&QWP(16*1,"esp"));
1748 &xorps ($inout2,$inout5);
1749 &xorps ($inout3,$inout4);
1750
1751 &call ("_aesni_decrypt4");
1752
1753 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1754 &xorps ($inout1,&QWP(16*1,"esp"));
1755 &xorps ($inout2,$inout5);
1756 &movups (&QWP(16*0,$out),$inout0); # write output
1757 &xorps ($inout3,$inout4);
1758 &movups (&QWP(16*1,$out),$inout1);
1759 &movups (&QWP(16*2,$out),$inout2);
1760 &movups (&QWP(16*3,$out),$inout3);
1761 &lea ($out,&DWP(16*4,$out));
1762
1763 &movdqa ($tweak,$inout4); # last tweak
1764 &jmp (&label("xts_dec_done"));
1765
1766&set_label("xts_dec_done6x",16); # $tweak is pre-calculated
1767 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1768 &and ($len,15);
1769 &jz (&label("xts_dec_ret"));
1770 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1771 &jmp (&label("xts_dec_only_one_more"));
1772
1773&set_label("xts_dec_done",16);
1774 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1775 &pxor ($twtmp,$twtmp);
1776 &and ($len,15);
1777 &jz (&label("xts_dec_ret"));
1778
1779 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1780 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1781 &pshufd ($twres,$twtmp,0x13);
1782 &pxor ($twtmp,$twtmp);
1783 &movdqa ($twmask,&QWP(16*6,"esp"));
1784 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1785 &pand ($twres,$twmask); # isolate carry and residue
1786 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1787 &pxor ($tweak,$twres);
1788
1789&set_label("xts_dec_only_one_more");
1790 &pshufd ($inout3,$twtmp,0x13);
1791 &movdqa ($inout4,$tweak); # put aside previous tweak
1792 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1793 &pand ($inout3,$twmask); # isolate carry and residue
1794 &pxor ($inout3,$tweak);
1795
1796 &mov ($key,$key_); # restore $key
1797 &mov ($rounds,$rounds_); # restore $rounds
1798
1799 &movups ($inout0,&QWP(0,$inp)); # load input
1800 &xorps ($inout0,$inout3); # input^=tweak
1801 if ($inline)
1802 { &aesni_inline_generate1("dec"); }
1803 else
1804 { &call ("_aesni_decrypt1"); }
1805 &xorps ($inout0,$inout3); # output^=tweak
1806 &movups (&QWP(0,$out),$inout0); # write output
1807
1808&set_label("xts_dec_steal");
1809 &movz ($rounds,&BP(16,$inp));
1810 &movz ($key,&BP(0,$out));
1811 &lea ($inp,&DWP(1,$inp));
1812 &mov (&BP(0,$out),&LB($rounds));
1813 &mov (&BP(16,$out),&LB($key));
1814 &lea ($out,&DWP(1,$out));
1815 &sub ($len,1);
1816 &jnz (&label("xts_dec_steal"));
1817
1818 &sub ($out,&DWP(16*7+0,"esp")); # rewind $out
1819 &mov ($key,$key_); # restore $key
1820 &mov ($rounds,$rounds_); # restore $rounds
1821
1822 &movups ($inout0,&QWP(0,$out)); # load input
1823 &xorps ($inout0,$inout4); # input^=tweak
1824 if ($inline)
1825 { &aesni_inline_generate1("dec"); }
1826 else
1827 { &call ("_aesni_decrypt1"); }
1828 &xorps ($inout0,$inout4); # output^=tweak
1829 &movups (&QWP(0,$out),$inout0); # write output
1830
1831&set_label("xts_dec_ret");
23f6eec7
AP
1832 &pxor ("xmm0","xmm0"); # clear register bank
1833 &pxor ("xmm1","xmm1");
1834 &pxor ("xmm2","xmm2");
1835 &movdqa (&QWP(16*0,"esp"),"xmm0"); # clear stack
1836 &pxor ("xmm3","xmm3");
1837 &movdqa (&QWP(16*1,"esp"),"xmm0");
1838 &pxor ("xmm4","xmm4");
1839 &movdqa (&QWP(16*2,"esp"),"xmm0");
1840 &pxor ("xmm5","xmm5");
1841 &movdqa (&QWP(16*3,"esp"),"xmm0");
1842 &pxor ("xmm6","xmm6");
1843 &movdqa (&QWP(16*4,"esp"),"xmm0");
1844 &pxor ("xmm7","xmm7");
1845 &movdqa (&QWP(16*5,"esp"),"xmm0");
f8501464
AP
1846 &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
1847&function_end("aesni_xts_decrypt");
1848}
bd30091c
AP
1849\f
1850######################################################################
1851# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
1852# const AES_KEY *key, unsigned int start_block_num,
1853# unsigned char offset_i[16], const unsigned char L_[][16],
1854# unsigned char checksum[16]);
1855#
1856{
1857# offsets within stack frame: 16-byte slots 16*0..16*5 hold per-block
# offset_i values, 16*6 holds the running checksum, and five dwords
# starting at 16*7 hold the saved scalars listed below
1858my $checksum = 16*6;
1859my ($key_off,$rounds_off,$out_off,$end_off,$esp_off)=map(16*7+4*$_,(0..4));
1860
1861# reassigned registers
1862my ($l_,$block,$i1,$i3,$i5) = ($rounds_,$key_,$rounds,$len,$out);
1863# $l_, $block, $inp, $key are permanently allocated in registers;
1864# the remaining non-volatile values are offloaded to the stack and
1865# stay invariant once they have been written there.
1866
# aesni_ocb_encrypt prologue: load the arguments, build a 16-byte aligned
# stack frame and, when start_block_num is even, encrypt one block up
# front so that $block is odd by the time the "odd" label is reached.
1867&function_begin("aesni_ocb_encrypt");
1868 &mov ($rounds,&wparam(5)); # &offset_i
1869 &mov ($rounds_,&wparam(7)); # &checksum
1870
1871 &mov ($inp,&wparam(0));
1872 &mov ($out,&wparam(1));
1873 &mov ($len,&wparam(2));
1874 &mov ($key,&wparam(3));
1875 &movdqu ($rndkey0,&QWP(0,$rounds)); # load offset_i
1876 &mov ($block,&wparam(4)); # start_block_num
1877 &movdqu ($rndkey1,&QWP(0,$rounds_)); # load checksum
1878 &mov ($l_,&wparam(6)); # L_
1879
1880 &mov ($rounds,"esp"); # remember caller's %esp
1881 &sub ("esp",$esp_off+4); # alloca
1882 &and ("esp",-16); # align stack
1883
1884 &sub ($out,$inp); # keep out-inp; stores address ($out,$inp)
1885 &shl ($len,4); # block count -> byte count
1886 &lea ($len,&DWP(-16*6,$inp,$len)); # end of input - 16*6
1887 &mov (&DWP($out_off,"esp"),$out);
1888 &mov (&DWP($end_off,"esp"),$len);
1889 &mov (&DWP($esp_off,"esp"),$rounds); # original %esp, restored at "done"
1890
1891 &mov ($rounds,&DWP(240,$key)); # dword at 240($key) is loaded as $rounds
1892
1893 &test ($block,1);
1894 &jnz (&label("odd")); # already odd: skip the single-block step
1895
1896 &bsf ($i3,$block); # index of lowest set bit of block number
1897 &add ($block,1); # $block is odd from here on
1898 &shl ($i3,4); # scale to 16-byte L_ entries
1899 &movdqu ($inout5,&QWP(0,$l_,$i3));
1900 &mov ($i3,$key); # put aside key ($i3 aliases $len)
1901
1902 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
1903 &lea ($inp,&DWP(16,$inp));
1904
1905 &pxor ($inout5,$rndkey0); # ^ last offset_i
1906 &pxor ($rndkey1,$inout0); # checksum
1907 &pxor ($inout0,$inout5); # ^ offset_i
1908
1909 &movdqa ($inout4,$rndkey1); # park checksum; copied back after the encrypt
1910 if ($inline)
1911 { &aesni_inline_generate1("enc"); }
1912 else
1913 { &call ("_aesni_encrypt1"); }
1914
1915 &xorps ($inout0,$inout5); # ^ offset_i
1916 &movdqa ($rndkey0,$inout5); # pass last offset_i
1917 &movdqa ($rndkey1,$inout4); # pass the checksum
1918
1919 &movups (&QWP(-16,$out,$inp),$inout0); # store output
1920
1921 &mov ($rounds,&DWP(240,$i3)); # reload $rounds from the put-aside key
1922 &mov ($key,$i3); # restore key
1923 &mov ($len,&DWP($end_off,"esp")); # reload end pointer ($len was used as $i3)
1924
1925&set_label("odd");
1926 &shl ($rounds,4); # 16 bytes per round key
1927 &mov ($out,16);
1928 &sub ($out,$rounds); # twisted rounds
1929 &mov (&DWP($key_off,"esp"),$key); # original key pointer, reloaded by tail cases
1930 &lea ($key,&DWP(32,$key,$rounds)); # end of key schedule
1931 &mov (&DWP($rounds_off,"esp"),$out);
1932
1933 &cmp ($inp,$len);
1934 &ja (&label("short")); # fewer than six blocks left
1935 &jmp (&label("grandloop"));
1936
# Main loop of aesni_ocb_encrypt: six blocks per iteration.
# $block is odd on entry and stays odd (it advances by 6), so the blocks
# at +0, +2 and +4 take the first L_ entry directly and only +1, +3 and
# +5 need a bsf lookup.
# $key was moved to key+32+16*rounds and $rounds is reloaded with
# 16-16*rounds, so -48/-32/-16($key,$rounds) address the first three
# round keys of the schedule.
1937&set_label("grandloop",32);
1938 &lea ($i1,&DWP(1,$block));
1939 &lea ($i3,&DWP(3,$block));
1940 &lea ($i5,&DWP(5,$block));
1941 &add ($block,6);
1942 &bsf ($i1,$i1); # lowest set bit index of each even block number
1943 &bsf ($i3,$i3);
1944 &bsf ($i5,$i5);
1945 &shl ($i1,4); # scale to 16-byte L_ entries
1946 &shl ($i3,4);
1947 &shl ($i5,4);
1948 &movdqu ($inout0,&QWP(0,$l_));
1949 &movdqu ($inout1,&QWP(0,$l_,$i1));
1950 &mov ($rounds,&DWP($rounds_off,"esp")); # twisted rounds ($rounds was used as $i1)
1951 &movdqa ($inout2,$inout0);
1952 &movdqu ($inout3,&QWP(0,$l_,$i3));
1953 &movdqa ($inout4,$inout0);
1954 &movdqu ($inout5,&QWP(0,$l_,$i5));
1955
1956 &pxor ($inout0,$rndkey0); # ^ last offset_i
1957 &pxor ($inout1,$inout0); # each offset chains from the previous one
1958 &movdqa (&QWP(16*0,"esp"),$inout0);
1959 &pxor ($inout2,$inout1);
1960 &movdqa (&QWP(16*1,"esp"),$inout1);
1961 &pxor ($inout3,$inout2);
1962 &movdqa (&QWP(16*2,"esp"),$inout2);
1963 &pxor ($inout4,$inout3);
1964 &movdqa (&QWP(16*3,"esp"),$inout3);
1965 &pxor ($inout5,$inout4);
1966 &movdqa (&QWP(16*4,"esp"),$inout4);
1967 &movdqa (&QWP(16*5,"esp"),$inout5);
1968
1969 &$movekey ($rndkey0,&QWP(-48,$key,$rounds));
1970 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
1971 &movdqu ($inout1,&QWP(16*1,$inp));
1972 &movdqu ($inout2,&QWP(16*2,$inp));
1973 &movdqu ($inout3,&QWP(16*3,$inp));
1974 &movdqu ($inout4,&QWP(16*4,$inp));
1975 &movdqu ($inout5,&QWP(16*5,$inp));
1976 &lea ($inp,&DWP(16*6,$inp));
1977
1978 &pxor ($rndkey1,$inout0); # checksum
1979 &pxor ($inout0,$rndkey0); # ^ roundkey[0]
1980 &pxor ($rndkey1,$inout1);
1981 &pxor ($inout1,$rndkey0);
1982 &pxor ($rndkey1,$inout2);
1983 &pxor ($inout2,$rndkey0);
1984 &pxor ($rndkey1,$inout3);
1985 &pxor ($inout3,$rndkey0);
1986 &pxor ($rndkey1,$inout4);
1987 &pxor ($inout4,$rndkey0);
1988 &pxor ($rndkey1,$inout5);
1989 &pxor ($inout5,$rndkey0);
1990 &movdqa (&QWP($checksum,"esp"),$rndkey1); # spill checksum; $rndkey1 is reused for a round key
1991
1992 &$movekey ($rndkey1,&QWP(-32,$key,$rounds));
1993 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
1994 &pxor ($inout1,&QWP(16*1,"esp"));
1995 &pxor ($inout2,&QWP(16*2,"esp"));
1996 &pxor ($inout3,&QWP(16*3,"esp"));
1997 &pxor ($inout4,&QWP(16*4,"esp"));
1998 &pxor ($inout5,&QWP(16*5,"esp"));
1999
2000 &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
2001 &aesenc ($inout0,$rndkey1); # first round issued here
2002 &aesenc ($inout1,$rndkey1);
2003 &aesenc ($inout2,$rndkey1);
2004 &aesenc ($inout3,$rndkey1);
2005 &aesenc ($inout4,$rndkey1);
2006 &aesenc ($inout5,$rndkey1);
2007
2008 &mov ($out,&DWP($out_off,"esp")); # reload out-inp ($out was used as $i5)
2009 &mov ($len,&DWP($end_off,"esp")); # reload end pointer ($len was used as $i3)
2010 &call ("_aesni_encrypt6_enter"); # defined elsewhere; presumably runs the remaining rounds
2011
2012 &movdqa ($rndkey0,&QWP(16*5,"esp")); # pass last offset_i
2013 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2014 &pxor ($inout1,&QWP(16*1,"esp"));
2015 &pxor ($inout2,&QWP(16*2,"esp"));
2016 &pxor ($inout3,&QWP(16*3,"esp"));
2017 &pxor ($inout4,&QWP(16*4,"esp"));
2018 &pxor ($inout5,$rndkey0);
2019 &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2020
2021 &movdqu (&QWP(-16*6,$out,$inp),$inout0);# store output
2022 &movdqu (&QWP(-16*5,$out,$inp),$inout1);
2023 &movdqu (&QWP(-16*4,$out,$inp),$inout2);
2024 &movdqu (&QWP(-16*3,$out,$inp),$inout3);
2025 &movdqu (&QWP(-16*2,$out,$inp),$inout4);
2026 &movdqu (&QWP(-16*1,$out,$inp),$inout5);
2027 &cmp ($inp,$len); # done yet?
2028 &jb (&label("grandloop"));
2029
# Tail dispatch of aesni_ocb_encrypt: compute the number of bytes left
# (0..5 blocks) and branch to the matching case. The fall-through case
# handles five blocks by running them through the 6-block routine with
# a zeroed sixth lane.
2030&set_label("short");
2031 &add ($len,16*6); # undo the -16*6 bias: $len = end of input
2032 &sub ($len,$inp); # bytes remaining
2033 &jz (&label("done"));
2034
2035 &cmp ($len,16*2);
2036 &jb (&label("one"));
2037 &je (&label("two"));
2038
2039 &cmp ($len,16*4);
2040 &jb (&label("three"));
2041 &je (&label("four"));
2042
2043 &lea ($i1,&DWP(1,$block));
2044 &lea ($i3,&DWP(3,$block));
2045 &bsf ($i1,$i1);
2046 &bsf ($i3,$i3);
2047 &shl ($i1,4);
2048 &shl ($i3,4);
2049 &movdqu ($inout0,&QWP(0,$l_));
2050 &movdqu ($inout1,&QWP(0,$l_,$i1));
2051 &mov ($rounds,&DWP($rounds_off,"esp")); # twisted rounds ($rounds was used as $i1)
2052 &movdqa ($inout2,$inout0);
2053 &movdqu ($inout3,&QWP(0,$l_,$i3));
2054 &movdqa ($inout4,$inout0);
2055
2056 &pxor ($inout0,$rndkey0); # ^ last offset_i
2057 &pxor ($inout1,$inout0);
2058 &movdqa (&QWP(16*0,"esp"),$inout0);
2059 &pxor ($inout2,$inout1);
2060 &movdqa (&QWP(16*1,"esp"),$inout1);
2061 &pxor ($inout3,$inout2);
2062 &movdqa (&QWP(16*2,"esp"),$inout2);
2063 &pxor ($inout4,$inout3);
2064 &movdqa (&QWP(16*3,"esp"),$inout3);
2065 &pxor ($inout5,$inout4); # result unused: $inout5 is zeroed below
2066 &movdqa (&QWP(16*4,"esp"),$inout4);
2067
2068 &$movekey ($rndkey0,&QWP(-48,$key,$rounds));
2069 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2070 &movdqu ($inout1,&QWP(16*1,$inp));
2071 &movdqu ($inout2,&QWP(16*2,$inp));
2072 &movdqu ($inout3,&QWP(16*3,$inp));
2073 &movdqu ($inout4,&QWP(16*4,$inp));
2074 &pxor ($inout5,$inout5); # sixth lane carries no data
2075
2076 &pxor ($rndkey1,$inout0); # checksum
2077 &pxor ($inout0,$rndkey0); # ^ roundkey[0]
2078 &pxor ($rndkey1,$inout1);
2079 &pxor ($inout1,$rndkey0);
2080 &pxor ($rndkey1,$inout2);
2081 &pxor ($inout2,$rndkey0);
2082 &pxor ($rndkey1,$inout3);
2083 &pxor ($inout3,$rndkey0);
2084 &pxor ($rndkey1,$inout4);
2085 &pxor ($inout4,$rndkey0);
2086 &movdqa (&QWP($checksum,"esp"),$rndkey1); # spill checksum; $rndkey1 is reused for a round key
2087
2088 &$movekey ($rndkey1,&QWP(-32,$key,$rounds));
2089 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2090 &pxor ($inout1,&QWP(16*1,"esp"));
2091 &pxor ($inout2,&QWP(16*2,"esp"));
2092 &pxor ($inout3,&QWP(16*3,"esp"));
2093 &pxor ($inout4,&QWP(16*4,"esp"));
2094
2095 &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
2096 &aesenc ($inout0,$rndkey1);
2097 &aesenc ($inout1,$rndkey1);
2098 &aesenc ($inout2,$rndkey1);
2099 &aesenc ($inout3,$rndkey1);
2100 &aesenc ($inout4,$rndkey1);
2101 &aesenc ($inout5,$rndkey1);
2102
2103 &mov ($out,&DWP($out_off,"esp")); # reload out-inp
2104 &call ("_aesni_encrypt6_enter");
2105
2106 &movdqa ($rndkey0,&QWP(16*4,"esp")); # pass last offset_i
2107 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2108 &pxor ($inout1,&QWP(16*1,"esp"));
2109 &pxor ($inout2,&QWP(16*2,"esp"));
2110 &pxor ($inout3,&QWP(16*3,"esp"));
2111 &pxor ($inout4,$rndkey0);
2112 &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2113
2114 &movdqu (&QWP(16*0,$out,$inp),$inout0); # store output
2115 &movdqu (&QWP(16*1,$out,$inp),$inout1);
2116 &movdqu (&QWP(16*2,$out,$inp),$inout2);
2117 &movdqu (&QWP(16*3,$out,$inp),$inout3);
2118 &movdqu (&QWP(16*4,$out,$inp),$inout4);
2119
2120 &jmp (&label("done"));
2121
# Tail case of aesni_ocb_encrypt: exactly one block remains.
# $block is odd here, so the first L_ entry is used without a lookup.
# $key and $out were repurposed at "odd", hence the reloads from the
# stack frame before the single-block encrypt.
2122&set_label("one",16);
2123 &movdqu ($inout5,&QWP(0,$l_));
2124 &mov ($key,&DWP($key_off,"esp")); # restore key
2125
2126 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2127 &mov ($rounds,&DWP(240,$key));
2128
2129 &pxor ($inout5,$rndkey0); # ^ last offset_i
2130 &pxor ($rndkey1,$inout0); # checksum
2131 &pxor ($inout0,$inout5); # ^ offset_i
2132
2133 &movdqa ($inout4,$rndkey1); # park checksum; copied back after the encrypt
2134 &mov ($out,&DWP($out_off,"esp")); # reload out-inp
2135 if ($inline)
2136 { &aesni_inline_generate1("enc"); }
2137 else
2138 { &call ("_aesni_encrypt1"); }
2139
2140 &xorps ($inout0,$inout5); # ^ offset_i
2141 &movdqa ($rndkey0,$inout5); # pass last offset_i
2142 &movdqa ($rndkey1,$inout4); # pass the checksum
2143 &movups (&QWP(0,$out,$inp),$inout0); # store output
2144
2145 &jmp (&label("done"));
2146
# Tail case of aesni_ocb_encrypt: exactly two blocks remain.
# $block is odd here, so the first block takes the first L_ entry and
# only block+1 needs a bsf lookup. The checksum is parked in $inout3
# around the call and copied back into $rndkey1 afterwards.
2147&set_label("two",16);
2148 &lea ($i1,&DWP(1,$block));
2149 &mov ($key,&DWP($key_off,"esp")); # restore key
2150 &bsf ($i1,$i1); # lowest set bit index of block+1
2151 &shl ($i1,4); # scale to 16-byte L_ entries
2152 &movdqu ($inout4,&QWP(0,$l_));
2153 &movdqu ($inout5,&QWP(0,$l_,$i1));
2154
2155 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2156 &movdqu ($inout1,&QWP(16*1,$inp));
2157 &mov ($rounds,&DWP(240,$key));
2158
2159 &pxor ($inout4,$rndkey0); # ^ last offset_i
2160 &pxor ($inout5,$inout4);
2161
2162 &pxor ($rndkey1,$inout0); # checksum
2163 &pxor ($inout0,$inout4); # ^ offset_i
2164 &pxor ($rndkey1,$inout1);
2165 &pxor ($inout1,$inout5);
2166
# The terminating ';' matters here: without it Perl parses this line
# and the next as a single "&movdqa(...) & mov(...)" expression.
2167 &movdqa ($inout3,$rndkey1); # park checksum across the call
2168 &mov ($out,&DWP($out_off,"esp")); # reload out-inp
2169 &call ("_aesni_encrypt2");
2170
2171 &xorps ($inout0,$inout4); # ^ offset_i
2172 &xorps ($inout1,$inout5);
2173 &movdqa ($rndkey0,$inout5); # pass last offset_i
2174 &movdqa ($rndkey1,$inout3); # pass the checksum
2175 &movups (&QWP(16*0,$out,$inp),$inout0); # store output
2176 &movups (&QWP(16*1,$out,$inp),$inout1);
2177
2178 &jmp (&label("done"));
2179
# Tail case of aesni_ocb_encrypt: exactly three blocks remain.
# The three offsets live in $inout3..$inout5; the checksum is spilled
# to its stack slot across the call and reloaded afterwards.
2180&set_label("three",16);
2181 &lea ($i1,&DWP(1,$block));
2182 &mov ($key,&DWP($key_off,"esp")); # restore key
2183 &bsf ($i1,$i1); # lowest set bit index of block+1
2184 &shl ($i1,4); # scale to 16-byte L_ entries
2185 &movdqu ($inout3,&QWP(0,$l_));
2186 &movdqu ($inout4,&QWP(0,$l_,$i1));
2187 &movdqa ($inout5,$inout3); # third block reuses the first L_ entry
2188
2189 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2190 &movdqu ($inout1,&QWP(16*1,$inp));
2191 &movdqu ($inout2,&QWP(16*2,$inp));
2192 &mov ($rounds,&DWP(240,$key));
2193
2194 &pxor ($inout3,$rndkey0); # ^ last offset_i
2195 &pxor ($inout4,$inout3);
2196 &pxor ($inout5,$inout4);
2197
2198 &pxor ($rndkey1,$inout0); # checksum
2199 &pxor ($inout0,$inout3); # ^ offset_i
2200 &pxor ($rndkey1,$inout1);
2201 &pxor ($inout1,$inout4);
2202 &pxor ($rndkey1,$inout2);
2203 &pxor ($inout2,$inout5);
2204
2205 &movdqa (&QWP($checksum,"esp"),$rndkey1); # spill checksum across the call
2206 &mov ($out,&DWP($out_off,"esp")); # reload out-inp
2207 &call ("_aesni_encrypt3");
2208
2209 &xorps ($inout0,$inout3); # ^ offset_i
2210 &xorps ($inout1,$inout4);
2211 &xorps ($inout2,$inout5);
2212 &movdqa ($rndkey0,$inout5); # pass last offset_i
2213 &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2214 &movups (&QWP(16*0,$out,$inp),$inout0); # store output
2215 &movups (&QWP(16*1,$out,$inp),$inout1);
2216 &movups (&QWP(16*2,$out,$inp),$inout2);
2217
2218 &jmp (&label("done"));
2219
# Tail case of aesni_ocb_encrypt: exactly four blocks remain.
# The first two offsets are spilled to stack slots 16*0 and 16*1 so
# that $inout2/$inout3 can take input blocks; the last two offsets stay
# in $inout4/$inout5. Falls through to "done".
2220&set_label("four",16);
2221 &lea ($i1,&DWP(1,$block));
2222 &lea ($i3,&DWP(3,$block));
2223 &bsf ($i1,$i1); # lowest set bit index of block+1 / block+3
2224 &bsf ($i3,$i3);
2225 &mov ($key,&DWP($key_off,"esp")); # restore key
2226 &shl ($i1,4); # scale to 16-byte L_ entries
2227 &shl ($i3,4);
2228 &movdqu ($inout2,&QWP(0,$l_));
2229 &movdqu ($inout3,&QWP(0,$l_,$i1));
2230 &movdqa ($inout4,$inout2); # third block reuses the first L_ entry
2231 &movdqu ($inout5,&QWP(0,$l_,$i3));
2232
2233 &pxor ($inout2,$rndkey0); # ^ last offset_i
2234 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2235 &pxor ($inout3,$inout2);
2236 &movdqu ($inout1,&QWP(16*1,$inp));
2237 &pxor ($inout4,$inout3);
2238 &movdqa (&QWP(16*0,"esp"),$inout2); # spill first offset
2239 &pxor ($inout5,$inout4);
2240 &movdqa (&QWP(16*1,"esp"),$inout3); # spill second offset
2241 &movdqu ($inout2,&QWP(16*2,$inp));
2242 &movdqu ($inout3,&QWP(16*3,$inp));
2243 &mov ($rounds,&DWP(240,$key));
2244
2245 &pxor ($rndkey1,$inout0); # checksum
2246 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2247 &pxor ($rndkey1,$inout1);
2248 &pxor ($inout1,&QWP(16*1,"esp"));
2249 &pxor ($rndkey1,$inout2);
2250 &pxor ($inout2,$inout4);
2251 &pxor ($rndkey1,$inout3);
2252 &pxor ($inout3,$inout5);
2253
# The terminating ';' matters here: without it Perl parses this line
# and the next as a single "&movdqa(...) & mov(...)" expression.
2254 &movdqa (&QWP($checksum,"esp"),$rndkey1); # spill checksum across the call
2255 &mov ($out,&DWP($out_off,"esp")); # reload out-inp
2256 &call ("_aesni_encrypt4");
2257
2258 &xorps ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2259 &xorps ($inout1,&QWP(16*1,"esp"));
2260 &xorps ($inout2,$inout4);
2261 &movups (&QWP(16*0,$out,$inp),$inout0); # store output
2262 &xorps ($inout3,$inout5);
2263 &movups (&QWP(16*1,$out,$inp),$inout1);
2264 &movdqa ($rndkey0,$inout5); # pass last offset_i
2265 &movups (&QWP(16*2,$out,$inp),$inout2);
2266 &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2267 &movups (&QWP(16*3,$out,$inp),$inout3);
2268
# Epilogue of aesni_ocb_encrypt: wipe the data registers and the seven
# 16-byte stack slots (offsets and checksum), restore the caller's
# %esp, then write the final offset_i and checksum back through the
# pointer arguments and clear the registers that held them.
2269&set_label("done");
2270 &mov ($key,&DWP($esp_off,"esp")); # saved original %esp
2271 &pxor ($inout0,$inout0); # clear register bank
2272 &pxor ($inout1,$inout1);
2273 &movdqa (&QWP(16*0,"esp"),$inout0); # clear stack
2274 &pxor ($inout2,$inout2);
2275 &movdqa (&QWP(16*1,"esp"),$inout0);
2276 &pxor ($inout3,$inout3);
2277 &movdqa (&QWP(16*2,"esp"),$inout0);
2278 &pxor ($inout4,$inout4);
2279 &movdqa (&QWP(16*3,"esp"),$inout0);
2280 &pxor ($inout5,$inout5);
2281 &movdqa (&QWP(16*4,"esp"),$inout0);
2282 &movdqa (&QWP(16*5,"esp"),$inout0);
2283 &movdqa (&QWP(16*6,"esp"),$inout0);
2284
2285 &lea ("esp",&DWP(0,$key)); # restore %esp
2286 &mov ($rounds,&wparam(5)); # &offset_i
2287 &mov ($rounds_,&wparam(7)); # &checksum
2288 &movdqu (&QWP(0,$rounds),$rndkey0); # write back offset_i
2289 &pxor ($rndkey0,$rndkey0);
2290 &movdqu (&QWP(0,$rounds_),$rndkey1); # write back checksum
2291 &pxor ($rndkey1,$rndkey1);
2292&function_end("aesni_ocb_encrypt");
2293
# aesni_ocb_decrypt prologue: load the arguments, build a 16-byte aligned
# stack frame and, when start_block_num is even, decrypt one block up
# front so that $block is odd by the time the "odd" label is reached.
# Here the checksum is updated with the decrypted output rather than
# with the input.
2294&function_begin("aesni_ocb_decrypt");
2295 &mov ($rounds,&wparam(5)); # &offset_i
2296 &mov ($rounds_,&wparam(7)); # &checksum
2297
2298 &mov ($inp,&wparam(0));
2299 &mov ($out,&wparam(1));
2300 &mov ($len,&wparam(2));
2301 &mov ($key,&wparam(3));
2302 &movdqu ($rndkey0,&QWP(0,$rounds)); # load offset_i
2303 &mov ($block,&wparam(4)); # start_block_num
2304 &movdqu ($rndkey1,&QWP(0,$rounds_)); # load checksum
2305 &mov ($l_,&wparam(6)); # L_
2306
2307 &mov ($rounds,"esp"); # remember caller's %esp
2308 &sub ("esp",$esp_off+4); # alloca
2309 &and ("esp",-16); # align stack
2310
2311 &sub ($out,$inp); # keep out-inp; stores address ($out,$inp)
2312 &shl ($len,4); # block count -> byte count
2313 &lea ($len,&DWP(-16*6,$inp,$len)); # end of input - 16*6
2314 &mov (&DWP($out_off,"esp"),$out);
2315 &mov (&DWP($end_off,"esp"),$len);
2316 &mov (&DWP($esp_off,"esp"),$rounds); # original %esp, restored at "done"
2317
2318 &mov ($rounds,&DWP(240,$key)); # dword at 240($key) is loaded as $rounds
2319
2320 &test ($block,1);
2321 &jnz (&label("odd")); # already odd: skip the single-block step
2322
2323 &bsf ($i3,$block); # index of lowest set bit of block number
2324 &add ($block,1); # $block is odd from here on
2325 &shl ($i3,4); # scale to 16-byte L_ entries
2326 &movdqu ($inout5,&QWP(0,$l_,$i3));
2327 &mov ($i3,$key); # put aside key ($i3 aliases $len)
2328
2329 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2330 &lea ($inp,&DWP(16,$inp));
2331
2332 &pxor ($inout5,$rndkey0); # ^ last offset_i
2333 &pxor ($inout0,$inout5); # ^ offset_i
2334
2335 &movdqa ($inout4,$rndkey1); # park checksum; copied back after the decrypt
2336 if ($inline)
2337 { &aesni_inline_generate1("dec"); }
2338 else
2339 { &call ("_aesni_decrypt1"); }
2340
2341 &xorps ($inout0,$inout5); # ^ offset_i
2342 &movaps ($rndkey1,$inout4); # pass the checksum
2343 &movdqa ($rndkey0,$inout5); # pass last offset_i
2344 &xorps ($rndkey1,$inout0); # checksum
2345 &movups (&QWP(-16,$out,$inp),$inout0); # store output
2346
2347 &mov ($rounds,&DWP(240,$i3)); # reload $rounds from the put-aside key
2348 &mov ($key,$i3); # restore key
2349 &mov ($len,&DWP($end_off,"esp")); # reload end pointer ($len was used as $i3)
2350
2351&set_label("odd");
2352 &shl ($rounds,4); # 16 bytes per round key
2353 &mov ($out,16);
2354 &sub ($out,$rounds); # twisted rounds
2355 &mov (&DWP($key_off,"esp"),$key); # original key pointer, reloaded by tail cases
2356 &lea ($key,&DWP(32,$key,$rounds)); # end of key schedule
2357 &mov (&DWP($rounds_off,"esp"),$out);
2358
2359 &cmp ($inp,$len);
2360 &ja (&label("short")); # fewer than six blocks left
2361 &jmp (&label("grandloop"));
2362
# Main loop of aesni_ocb_decrypt: six blocks per iteration.
# $block is odd on entry and stays odd (it advances by 6), so the blocks
# at +0, +2 and +4 take the first L_ entry directly and only +1, +3 and
# +5 need a bsf lookup.
# $key was moved to key+32+16*rounds and $rounds is reloaded with
# 16-16*rounds, so -48/-32/-16($key,$rounds) address the first three
# round keys of the schedule.
# The checksum is spilled before the rounds and updated with the
# decrypted blocks afterwards.
2363&set_label("grandloop",32);
2364 &lea ($i1,&DWP(1,$block));
2365 &lea ($i3,&DWP(3,$block));
2366 &lea ($i5,&DWP(5,$block));
2367 &add ($block,6);
2368 &bsf ($i1,$i1); # lowest set bit index of each even block number
2369 &bsf ($i3,$i3);
2370 &bsf ($i5,$i5);
2371 &shl ($i1,4); # scale to 16-byte L_ entries
2372 &shl ($i3,4);
2373 &shl ($i5,4);
2374 &movdqu ($inout0,&QWP(0,$l_));
2375 &movdqu ($inout1,&QWP(0,$l_,$i1));
2376 &mov ($rounds,&DWP($rounds_off,"esp")); # twisted rounds ($rounds was used as $i1)
2377 &movdqa ($inout2,$inout0);
2378 &movdqu ($inout3,&QWP(0,$l_,$i3));
2379 &movdqa ($inout4,$inout0);
2380 &movdqu ($inout5,&QWP(0,$l_,$i5));
2381
2382 &pxor ($inout0,$rndkey0); # ^ last offset_i
2383 &pxor ($inout1,$inout0); # each offset chains from the previous one
2384 &movdqa (&QWP(16*0,"esp"),$inout0);
2385 &pxor ($inout2,$inout1);
2386 &movdqa (&QWP(16*1,"esp"),$inout1);
2387 &pxor ($inout3,$inout2);
2388 &movdqa (&QWP(16*2,"esp"),$inout2);
2389 &pxor ($inout4,$inout3);
2390 &movdqa (&QWP(16*3,"esp"),$inout3);
2391 &pxor ($inout5,$inout4);
2392 &movdqa (&QWP(16*4,"esp"),$inout4);
2393 &movdqa (&QWP(16*5,"esp"),$inout5);
2394
2395 &$movekey ($rndkey0,&QWP(-48,$key,$rounds));
2396 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2397 &movdqu ($inout1,&QWP(16*1,$inp));
2398 &movdqu ($inout2,&QWP(16*2,$inp));
2399 &movdqu ($inout3,&QWP(16*3,$inp));
2400 &movdqu ($inout4,&QWP(16*4,$inp));
2401 &movdqu ($inout5,&QWP(16*5,$inp));
2402 &lea ($inp,&DWP(16*6,$inp));
2403
2404 &movdqa (&QWP($checksum,"esp"),$rndkey1); # spill checksum; $rndkey1 is reused for a round key
2405 &pxor ($inout0,$rndkey0); # ^ roundkey[0]
2406 &pxor ($inout1,$rndkey0);
2407 &pxor ($inout2,$rndkey0);
2408 &pxor ($inout3,$rndkey0);
2409 &pxor ($inout4,$rndkey0);
2410 &pxor ($inout5,$rndkey0);
2411
2412 &$movekey ($rndkey1,&QWP(-32,$key,$rounds));
2413 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2414 &pxor ($inout1,&QWP(16*1,"esp"));
2415 &pxor ($inout2,&QWP(16*2,"esp"));
2416 &pxor ($inout3,&QWP(16*3,"esp"));
2417 &pxor ($inout4,&QWP(16*4,"esp"));
2418 &pxor ($inout5,&QWP(16*5,"esp"));
2419
2420 &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
2421 &aesdec ($inout0,$rndkey1); # first round issued here
2422 &aesdec ($inout1,$rndkey1);
2423 &aesdec ($inout2,$rndkey1);
2424 &aesdec ($inout3,$rndkey1);
2425 &aesdec ($inout4,$rndkey1);
2426 &aesdec ($inout5,$rndkey1);
2427
2428 &mov ($out,&DWP($out_off,"esp")); # reload out-inp ($out was used as $i5)
2429 &mov ($len,&DWP($end_off,"esp")); # reload end pointer ($len was used as $i3)
2430 &call ("_aesni_decrypt6_enter"); # defined elsewhere; presumably runs the remaining rounds
2431
2432 &movdqa ($rndkey0,&QWP(16*5,"esp")); # pass last offset_i
2433 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2434 &movdqa ($rndkey1,&QWP($checksum,"esp")); # reload checksum
2435 &pxor ($inout1,&QWP(16*1,"esp"));
2436 &pxor ($inout2,&QWP(16*2,"esp"));
2437 &pxor ($inout3,&QWP(16*3,"esp"));
2438 &pxor ($inout4,&QWP(16*4,"esp"));
2439 &pxor ($inout5,$rndkey0);
2440
2441 &pxor ($rndkey1,$inout0); # checksum
2442 &movdqu (&QWP(-16*6,$out,$inp),$inout0);# store output
2443 &pxor ($rndkey1,$inout1);
2444 &movdqu (&QWP(-16*5,$out,$inp),$inout1);
2445 &pxor ($rndkey1,$inout2);
2446 &movdqu (&QWP(-16*4,$out,$inp),$inout2);
2447 &pxor ($rndkey1,$inout3);
2448 &movdqu (&QWP(-16*3,$out,$inp),$inout3);
2449 &pxor ($rndkey1,$inout4);
2450 &movdqu (&QWP(-16*2,$out,$inp),$inout4);
2451 &pxor ($rndkey1,$inout5);
2452 &movdqu (&QWP(-16*1,$out,$inp),$inout5);
2453 &cmp ($inp,$len); # done yet?
2454 &jb (&label("grandloop"));
2455
# Tail dispatch of aesni_ocb_decrypt: compute the number of bytes left
# (0..5 blocks) and branch to the matching case. The fall-through case
# handles five blocks by running them through the 6-block routine with
# a zeroed sixth lane; the checksum is updated with the five decrypted
# blocks afterwards.
2456&set_label("short");
2457 &add ($len,16*6); # undo the -16*6 bias: $len = end of input
2458 &sub ($len,$inp); # bytes remaining
2459 &jz (&label("done"));
2460
2461 &cmp ($len,16*2);
2462 &jb (&label("one"));
2463 &je (&label("two"));
2464
2465 &cmp ($len,16*4);
2466 &jb (&label("three"));
2467 &je (&label("four"));
2468
2469 &lea ($i1,&DWP(1,$block));
2470 &lea ($i3,&DWP(3,$block));
2471 &bsf ($i1,$i1);
2472 &bsf ($i3,$i3);
2473 &shl ($i1,4);
2474 &shl ($i3,4);
2475 &movdqu ($inout0,&QWP(0,$l_));
2476 &movdqu ($inout1,&QWP(0,$l_,$i1));
2477 &mov ($rounds,&DWP($rounds_off,"esp")); # twisted rounds ($rounds was used as $i1)
2478 &movdqa ($inout2,$inout0);
2479 &movdqu ($inout3,&QWP(0,$l_,$i3));
2480 &movdqa ($inout4,$inout0);
2481
2482 &pxor ($inout0,$rndkey0); # ^ last offset_i
2483 &pxor ($inout1,$inout0);
2484 &movdqa (&QWP(16*0,"esp"),$inout0);
2485 &pxor ($inout2,$inout1);
2486 &movdqa (&QWP(16*1,"esp"),$inout1);
2487 &pxor ($inout3,$inout2);
2488 &movdqa (&QWP(16*2,"esp"),$inout2);
2489 &pxor ($inout4,$inout3);
2490 &movdqa (&QWP(16*3,"esp"),$inout3);
2491 &pxor ($inout5,$inout4); # result unused: $inout5 is zeroed below
2492 &movdqa (&QWP(16*4,"esp"),$inout4);
2493
2494 &$movekey ($rndkey0,&QWP(-48,$key,$rounds));
2495 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2496 &movdqu ($inout1,&QWP(16*1,$inp));
2497 &movdqu ($inout2,&QWP(16*2,$inp));
2498 &movdqu ($inout3,&QWP(16*3,$inp));
2499 &movdqu ($inout4,&QWP(16*4,$inp));
2500 &pxor ($inout5,$inout5); # sixth lane carries no data
2501
2502 &movdqa (&QWP($checksum,"esp"),$rndkey1); # spill checksum; $rndkey1 is reused for a round key
2503 &pxor ($inout0,$rndkey0); # ^ roundkey[0]
2504 &pxor ($inout1,$rndkey0);
2505 &pxor ($inout2,$rndkey0);
2506 &pxor ($inout3,$rndkey0);
2507 &pxor ($inout4,$rndkey0);
2508
2509 &$movekey ($rndkey1,&QWP(-32,$key,$rounds));
2510 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2511 &pxor ($inout1,&QWP(16*1,"esp"));
2512 &pxor ($inout2,&QWP(16*2,"esp"));
2513 &pxor ($inout3,&QWP(16*3,"esp"));
2514 &pxor ($inout4,&QWP(16*4,"esp"));
2515
2516 &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
2517 &aesdec ($inout0,$rndkey1);
2518 &aesdec ($inout1,$rndkey1);
2519 &aesdec ($inout2,$rndkey1);
2520 &aesdec ($inout3,$rndkey1);
2521 &aesdec ($inout4,$rndkey1);
2522 &aesdec ($inout5,$rndkey1);
2523
2524 &mov ($out,&DWP($out_off,"esp")); # reload out-inp
2525 &call ("_aesni_decrypt6_enter");
2526
2527 &movdqa ($rndkey0,&QWP(16*4,"esp")); # pass last offset_i
2528 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2529 &movdqa ($rndkey1,&QWP($checksum,"esp")); # reload checksum
2530 &pxor ($inout1,&QWP(16*1,"esp"));
2531 &pxor ($inout2,&QWP(16*2,"esp"));
2532 &pxor ($inout3,&QWP(16*3,"esp"));
2533 &pxor ($inout4,$rndkey0);
2534
2535 &pxor ($rndkey1,$inout0); # checksum
2536 &movdqu (&QWP(16*0,$out,$inp),$inout0); # store output
2537 &pxor ($rndkey1,$inout1);
2538 &movdqu (&QWP(16*1,$out,$inp),$inout1);
2539 &pxor ($rndkey1,$inout2);
2540 &movdqu (&QWP(16*2,$out,$inp),$inout2);
2541 &pxor ($rndkey1,$inout3);
2542 &movdqu (&QWP(16*3,$out,$inp),$inout3);
2543 &pxor ($rndkey1,$inout4);
2544 &movdqu (&QWP(16*4,$out,$inp),$inout4);
2545
2546 &jmp (&label("done"));
2547
# Tail case of aesni_ocb_decrypt: exactly one block remains.
# $block is odd here, so the first L_ entry is used without a lookup.
# $key and $out were repurposed at "odd", hence the reloads from the
# stack frame. The checksum is updated with the decrypted block.
2548&set_label("one",16);
2549 &movdqu ($inout5,&QWP(0,$l_));
2550 &mov ($key,&DWP($key_off,"esp")); # restore key
2551
2552 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2553 &mov ($rounds,&DWP(240,$key));
2554
2555 &pxor ($inout5,$rndkey0); # ^ last offset_i
2556 &pxor ($inout0,$inout5); # ^ offset_i
2557
2558 &movdqa ($inout4,$rndkey1); # park checksum; copied back after the decrypt
2559 &mov ($out,&DWP($out_off,"esp")); # reload out-inp
2560 if ($inline)
2561 { &aesni_inline_generate1("dec"); }
2562 else
2563 { &call ("_aesni_decrypt1"); }
2564
2565 &xorps ($inout0,$inout5); # ^ offset_i
2566 &movaps ($rndkey1,$inout4); # pass the checksum
2567 &movdqa ($rndkey0,$inout5); # pass last offset_i
2568 &xorps ($rndkey1,$inout0); # checksum
2569 &movups (&QWP(0,$out,$inp),$inout0); # store output
2570
2571 &jmp (&label("done"));
2572
# Tail case of aesni_ocb_decrypt: exactly two blocks remain.
# $block is odd here, so the first block takes the first L_ entry and
# only block+1 needs a bsf lookup. The checksum is parked in $inout3
# across the call, updated there with both decrypted blocks, and then
# copied back into $rndkey1.
2573&set_label("two",16);
2574 &lea ($i1,&DWP(1,$block));
2575 &mov ($key,&DWP($key_off,"esp")); # restore key
2576 &bsf ($i1,$i1); # lowest set bit index of block+1
2577 &shl ($i1,4); # scale to 16-byte L_ entries
2578 &movdqu ($inout4,&QWP(0,$l_));
2579 &movdqu ($inout5,&QWP(0,$l_,$i1));
2580
2581 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2582 &movdqu ($inout1,&QWP(16*1,$inp));
2583 &mov ($rounds,&DWP(240,$key));
2584
2585 &movdqa ($inout3,$rndkey1); # park checksum across the call
2586 &pxor ($inout4,$rndkey0); # ^ last offset_i
2587 &pxor ($inout5,$inout4);
2588
2589 &pxor ($inout0,$inout4); # ^ offset_i
2590 &pxor ($inout1,$inout5);
2591
2592 &mov ($out,&DWP($out_off,"esp")); # reload out-inp
2593 &call ("_aesni_decrypt2");
2594
2595 &xorps ($inout0,$inout4); # ^ offset_i
2596 &xorps ($inout1,$inout5);
2597 &movdqa ($rndkey0,$inout5); # pass last offset_i
2598 &xorps ($inout3,$inout0); # checksum
2599 &movups (&QWP(16*0,$out,$inp),$inout0); # store output
2600 &xorps ($inout3,$inout1);
2601 &movups (&QWP(16*1,$out,$inp),$inout1);
2602 &movaps ($rndkey1,$inout3); # pass the checksum
2603
2604 &jmp (&label("done"));
2605
# Tail case of aesni_ocb_decrypt: exactly three blocks remain.
# The three offsets live in $inout3..$inout5; the checksum is spilled
# to its stack slot across the call, reloaded, and updated with the
# three decrypted blocks.
2606&set_label("three",16);
2607 &lea ($i1,&DWP(1,$block));
2608 &mov ($key,&DWP($key_off,"esp")); # restore key
2609 &bsf ($i1,$i1); # lowest set bit index of block+1
2610 &shl ($i1,4); # scale to 16-byte L_ entries
2611 &movdqu ($inout3,&QWP(0,$l_));
2612 &movdqu ($inout4,&QWP(0,$l_,$i1));
2613 &movdqa ($inout5,$inout3); # third block reuses the first L_ entry
2614
2615 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2616 &movdqu ($inout1,&QWP(16*1,$inp));
2617 &movdqu ($inout2,&QWP(16*2,$inp));
2618 &mov ($rounds,&DWP(240,$key));
2619
2620 &movdqa (&QWP($checksum,"esp"),$rndkey1); # spill checksum across the call
2621 &pxor ($inout3,$rndkey0); # ^ last offset_i
2622 &pxor ($inout4,$inout3);
2623 &pxor ($inout5,$inout4);
2624
2625 &pxor ($inout0,$inout3); # ^ offset_i
2626 &pxor ($inout1,$inout4);
2627 &pxor ($inout2,$inout5);
2628
2629 &mov ($out,&DWP($out_off,"esp")); # reload out-inp
2630 &call ("_aesni_decrypt3");
2631
2632 &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2633 &xorps ($inout0,$inout3); # ^ offset_i
2634 &xorps ($inout1,$inout4);
2635 &xorps ($inout2,$inout5);
2636 &movups (&QWP(16*0,$out,$inp),$inout0); # store output
2637 &pxor ($rndkey1,$inout0); # checksum
2638 &movdqa ($rndkey0,$inout5); # pass last offset_i
2639 &movups (&QWP(16*1,$out,$inp),$inout1);
2640 &pxor ($rndkey1,$inout1);
2641 &movups (&QWP(16*2,$out,$inp),$inout2);
2642 &pxor ($rndkey1,$inout2);
2643
2644 &jmp (&label("done"));
2645
# Tail case of aesni_ocb_decrypt: exactly four blocks remain.
# The first two offsets are spilled to stack slots 16*0 and 16*1 so
# that $inout2/$inout3 can take input blocks; the last two offsets stay
# in $inout4/$inout5. The checksum is spilled across the call and then
# updated with the four decrypted blocks. Falls through to "done".
2646&set_label("four",16);
2647 &lea ($i1,&DWP(1,$block));
2648 &lea ($i3,&DWP(3,$block));
2649 &bsf ($i1,$i1); # lowest set bit index of block+1 / block+3
2650 &bsf ($i3,$i3);
2651 &mov ($key,&DWP($key_off,"esp")); # restore key
2652 &shl ($i1,4); # scale to 16-byte L_ entries
2653 &shl ($i3,4);
2654 &movdqu ($inout2,&QWP(0,$l_));
2655 &movdqu ($inout3,&QWP(0,$l_,$i1));
2656 &movdqa ($inout4,$inout2); # third block reuses the first L_ entry
2657 &movdqu ($inout5,&QWP(0,$l_,$i3));
2658
2659 &pxor ($inout2,$rndkey0); # ^ last offset_i
2660 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2661 &pxor ($inout3,$inout2);
2662 &movdqu ($inout1,&QWP(16*1,$inp));
2663 &pxor ($inout4,$inout3);
2664 &movdqa (&QWP(16*0,"esp"),$inout2); # spill first offset
2665 &pxor ($inout5,$inout4);
2666 &movdqa (&QWP(16*1,"esp"),$inout3); # spill second offset
2667 &movdqu ($inout2,&QWP(16*2,$inp));
2668 &movdqu ($inout3,&QWP(16*3,$inp));
2669 &mov ($rounds,&DWP(240,$key));
2670
2671 &movdqa (&QWP($checksum,"esp"),$rndkey1); # spill checksum across the call
2672 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2673 &pxor ($inout1,&QWP(16*1,"esp"));
2674 &pxor ($inout2,$inout4);
2675 &pxor ($inout3,$inout5);
2676
2677 &mov ($out,&DWP($out_off,"esp")); # reload out-inp
2678 &call ("_aesni_decrypt4");
2679
2680 &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2681 &xorps ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2682 &xorps ($inout1,&QWP(16*1,"esp"));
2683 &xorps ($inout2,$inout4);
2684 &movups (&QWP(16*0,$out,$inp),$inout0); # store output
2685 &pxor ($rndkey1,$inout0); # checksum
2686 &xorps ($inout3,$inout5);
2687 &movups (&QWP(16*1,$out,$inp),$inout1);
2688 &pxor ($rndkey1,$inout1);
2689 &movdqa ($rndkey0,$inout5); # pass last offset_i
2690 &movups (&QWP(16*2,$out,$inp),$inout2);
2691 &pxor ($rndkey1,$inout2);
2692 &movups (&QWP(16*3,$out,$inp),$inout3);
2693 &pxor ($rndkey1,$inout3);
2694
# Epilogue of aesni_ocb_decrypt: wipe the data registers and the seven
# 16-byte stack slots (offsets and checksum), restore the caller's
# %esp, then write the final offset_i and checksum back through the
# pointer arguments and clear the registers that held them.
2695&set_label("done");
2696 &mov ($key,&DWP($esp_off,"esp")); # saved original %esp
2697 &pxor ($inout0,$inout0); # clear register bank
2698 &pxor ($inout1,$inout1);
2699 &movdqa (&QWP(16*0,"esp"),$inout0); # clear stack
2700 &pxor ($inout2,$inout2);
2701 &movdqa (&QWP(16*1,"esp"),$inout0);
2702 &pxor ($inout3,$inout3);
2703 &movdqa (&QWP(16*2,"esp"),$inout0);
2704 &pxor ($inout4,$inout4);
2705 &movdqa (&QWP(16*3,"esp"),$inout0);
2706 &pxor ($inout5,$inout5);
2707 &movdqa (&QWP(16*4,"esp"),$inout0);
2708 &movdqa (&QWP(16*5,"esp"),$inout0);
2709 &movdqa (&QWP(16*6,"esp"),$inout0);
2710
2711 &lea ("esp",&DWP(0,$key)); # restore %esp
2712 &mov ($rounds,&wparam(5)); # &offset_i
2713 &mov ($rounds_,&wparam(7)); # &checksum
2714 &movdqu (&QWP(0,$rounds),$rndkey0); # write back offset_i
2715 &pxor ($rndkey0,$rndkey0);
2716 &movdqu (&QWP(0,$rounds_),$rndkey1); # write back checksum
2717 &pxor ($rndkey1,$rndkey1);
2718&function_end("aesni_ocb_decrypt");
2719}
6c83629b
AP
2720}
2721\f
2722######################################################################
d64a7232
AP
2723# void $PREFIX_cbc_encrypt (const void *inp, void *out,
2724# size_t length, const AES_KEY *key,
2725# unsigned char *ivp,const int enc);
# Emits ${PREFIX}_cbc_encrypt: AES-CBC over wparam(2) bytes from wparam(0)
# to wparam(1) using the key schedule in wparam(3) and the IV in wparam(4).
# wparam(5) selects the direction: non-zero encrypts, zero decrypts.
# A zero length returns at once (cbc_abort) without touching the IV.
#
# Stack scratch, 16-byte aligned, carved out below the caller's %esp:
#    0(%esp)  16 bytes - saved IV during 6-block decrypt / partial tail block
#   16(%esp)  caller's original %esp, reloaded at cbc_ret
#
# On exit the last chaining value is written back through wparam(4) and
# the xmm registers that held data are zeroed.
2726&function_begin("${PREFIX}_cbc_encrypt");
2727 &mov ($inp,&wparam(0));
f8501464 2728 &mov ($rounds_,"esp");
d64a7232 2729 &mov ($out,&wparam(1));
f8501464 2730 &sub ($rounds_,24);
d64a7232 2731 &mov ($len,&wparam(2));
f8501464 2732 &and ($rounds_,-16);
d64a7232 2733 &mov ($key,&wparam(3));
d64a7232 2734 &mov ($key_,&wparam(4));
d7d119a3 2735 &test ($len,$len);
f8501464 2736 &jz (&label("cbc_abort"));
d64a7232
AP
2737
# The enc flag is compared before %esp is switched; the resulting flags are
# consumed by the je further down (xchg/movups/mov leave EFLAGS untouched).
2738 &cmp (&wparam(5),0);
f8501464
AP
2739 &xchg ($rounds_,"esp"); # alloca
2740 &movups ($ivec,&QWP(0,$key_)); # load IV
d64a7232 2741 &mov ($rounds,&DWP(240,$key));
f8501464
AP
2742 &mov ($key_,$key); # backup $key
2743 &mov (&DWP(16,"esp"),$rounds_); # save original %esp
2744 &mov ($rounds_,$rounds); # backup $rounds
d64a7232
AP
2745 &je (&label("cbc_decrypt"));
2746
# ---- encryption: strictly one block at a time, $inout0 carries the chain
f8501464 2747 &movaps ($inout0,$ivec);
d64a7232
AP
2748 &cmp ($len,16);
2749 &jb (&label("cbc_enc_tail"));
2750 &sub ($len,16);
2751 &jmp (&label("cbc_enc_loop"));
2752
2753&set_label("cbc_enc_loop",16);
f8501464 2754 &movups ($ivec,&QWP(0,$inp)); # input actually
d64a7232 2755 &lea ($inp,&DWP(16,$inp));
6f766a41 2756 if ($inline)
f8501464 2757 { &aesni_inline_generate1("enc",$inout0,$ivec); }
6f766a41 2758 else
f8501464 2759 { &xorps($inout0,$ivec); &call("_aesni_encrypt1"); }
d64a7232
AP
2760 &mov ($rounds,$rounds_); # restore $rounds
2761 &mov ($key,$key_); # restore $key
d7d119a3
AP
2762 &movups (&QWP(0,$out),$inout0); # store output
2763 &lea ($out,&DWP(16,$out));
2764 &sub ($len,16);
d64a7232
AP
2765 &jnc (&label("cbc_enc_loop"));
2766 &add ($len,16);
2767 &jnz (&label("cbc_enc_tail"));
2768 &movaps ($ivec,$inout0);
23f6eec7 2769 &pxor ($inout0,$inout0);
d64a7232
AP
2770 &jmp (&label("cbc_ret"));
2771
# Partial final block: copy the $len remaining bytes to the output, pad the
# rest of that 16-byte block with zeros, then run it through the loop once
# more with $inp pointing at the padded block in the output buffer.
# The data words decode as "mov %esi,%esi" (89 F6) followed by the rep
# string instruction named in the trailing comment.
2772&set_label("cbc_enc_tail");
2773 &mov ("ecx",$len); # zaps $rounds
2774 &data_word(0xA4F3F689); # rep movsb
2775 &mov ("ecx",16); # zero tail
2776 &sub ("ecx",$len);
2777 &xor ("eax","eax"); # zaps $len
2778 &data_word(0xAAF3F689); # rep stosb
2779 &lea ($out,&DWP(-16,$out)); # rewind $out by 1 block
2780 &mov ($rounds,$rounds_); # restore $rounds
2781 &mov ($inp,$out); # $inp and $out are the same
2782 &mov ($key,$key_); # restore $key
2783 &jmp (&label("cbc_enc_loop"));
6c83629b 2784######################################################################
# ---- decryption: six blocks per iteration while more than 0x50 bytes
# remain, then a 1..5 block tail.
d64a7232 2785&set_label("cbc_decrypt",16);
f8501464 2786 &cmp ($len,0x50);
d608b4d6 2787 &jbe (&label("cbc_dec_tail"));
f8501464
AP
2788 &movaps (&QWP(0,"esp"),$ivec); # save IV
2789 &sub ($len,0x50);
2790 &jmp (&label("cbc_dec_loop6_enter"));
d64a7232 2791
f8501464
AP
# Re-entry point of the loop: the sixth block of the previous iteration is
# stored here (it is left pending in $inout5 at the bottom of the loop).
2792&set_label("cbc_dec_loop6",16);
2793 &movaps (&QWP(0,"esp"),$rndkey0); # save IV
2794 &movups (&QWP(0,$out),$inout5);
2795 &lea ($out,&DWP(0x10,$out));
2796&set_label("cbc_dec_loop6_enter");
2797 &movdqu ($inout0,&QWP(0,$inp));
2798 &movdqu ($inout1,&QWP(0x10,$inp));
2799 &movdqu ($inout2,&QWP(0x20,$inp));
2800 &movdqu ($inout3,&QWP(0x30,$inp));
2801 &movdqu ($inout4,&QWP(0x40,$inp));
2802 &movdqu ($inout5,&QWP(0x50,$inp));
2803
2804 &call ("_aesni_decrypt6");
2805
# XOR each decrypted block with the preceding ciphertext block (re-read
# from $inp); the first block uses the IV saved at 0(%esp).
2806 &movups ($rndkey1,&QWP(0,$inp));
2807 &movups ($rndkey0,&QWP(0x10,$inp));
2808 &xorps ($inout0,&QWP(0,"esp")); # ^=IV
2809 &xorps ($inout1,$rndkey1);
2810 &movups ($rndkey1,&QWP(0x20,$inp));
2811 &xorps ($inout2,$rndkey0);
2812 &movups ($rndkey0,&QWP(0x30,$inp));
2813 &xorps ($inout3,$rndkey1);
2814 &movups ($rndkey1,&QWP(0x40,$inp));
2815 &xorps ($inout4,$rndkey0);
2816 &movups ($rndkey0,&QWP(0x50,$inp)); # IV
2817 &xorps ($inout5,$rndkey1);
2818 &movups (&QWP(0,$out),$inout0);
2819 &movups (&QWP(0x10,$out),$inout1);
2820 &lea ($inp,&DWP(0x60,$inp));
2821 &movups (&QWP(0x20,$out),$inout2);
f9c5e5d9 2822 &mov ($rounds,$rounds_); # restore $rounds
f8501464
AP
2823 &movups (&QWP(0x30,$out),$inout3);
2824 &mov ($key,$key_); # restore $key
2825 &movups (&QWP(0x40,$out),$inout4);
2826 &lea ($out,&DWP(0x50,$out));
2827 &sub ($len,0x60);
2828 &ja (&label("cbc_dec_loop6"));
2829
# Loop exit: the pending sixth block moves to $inout0, the next IV to $ivec.
# If nothing is left only that pending block remains to be written.
2830 &movaps ($inout0,$inout5);
2831 &movaps ($ivec,$rndkey0);
2832 &add ($len,0x50);
23f6eec7 2833 &jle (&label("cbc_dec_clear_tail_collected"));
f8501464
AP
2834 &movups (&QWP(0,$out),$inout0);
2835 &lea ($out,&DWP(0x10,$out));
# Tail dispatch on the remaining length: <=0x10, <=0x20, <=0x30, <=0x40,
# otherwise five blocks are handled via _aesni_decrypt6 with a zero sixth.
6c83629b 2836&set_label("cbc_dec_tail");
d64a7232 2837 &movups ($inout0,&QWP(0,$inp));
d64a7232 2838 &movaps ($in0,$inout0);
d7d119a3 2839 &cmp ($len,0x10);
d64a7232 2840 &jbe (&label("cbc_dec_one"));
f8501464 2841
d64a7232 2842 &movups ($inout1,&QWP(0x10,$inp));
d64a7232 2843 &movaps ($in1,$inout1);
d7d119a3 2844 &cmp ($len,0x20);
d64a7232 2845 &jbe (&label("cbc_dec_two"));
f8501464 2846
d64a7232 2847 &movups ($inout2,&QWP(0x20,$inp));
d608b4d6
AP
2848 &cmp ($len,0x30);
2849 &jbe (&label("cbc_dec_three"));
f8501464 2850
d608b4d6 2851 &movups ($inout3,&QWP(0x30,$inp));
f8501464
AP
2852 &cmp ($len,0x40);
2853 &jbe (&label("cbc_dec_four"));
2854
2855 &movups ($inout4,&QWP(0x40,$inp));
2856 &movaps (&QWP(0,"esp"),$ivec); # save IV
2857 &movups ($inout0,&QWP(0,$inp));
2858 &xorps ($inout5,$inout5);
2859 &call ("_aesni_decrypt6");
2860 &movups ($rndkey1,&QWP(0,$inp));
2861 &movups ($rndkey0,&QWP(0x10,$inp));
2862 &xorps ($inout0,&QWP(0,"esp")); # ^= IV
2863 &xorps ($inout1,$rndkey1);
2864 &movups ($rndkey1,&QWP(0x20,$inp));
2865 &xorps ($inout2,$rndkey0);
2866 &movups ($rndkey0,&QWP(0x30,$inp));
2867 &xorps ($inout3,$rndkey1);
2868 &movups ($ivec,&QWP(0x40,$inp)); # IV
2869 &xorps ($inout4,$rndkey0);
2870 &movups (&QWP(0,$out),$inout0);
2871 &movups (&QWP(0x10,$out),$inout1);
23f6eec7 2872 &pxor ($inout1,$inout1);
f8501464 2873 &movups (&QWP(0x20,$out),$inout2);
23f6eec7 2874 &pxor ($inout2,$inout2);
f8501464 2875 &movups (&QWP(0x30,$out),$inout3);
23f6eec7 2876 &pxor ($inout3,$inout3);
f8501464
AP
2877 &lea ($out,&DWP(0x40,$out));
2878 &movaps ($inout0,$inout4);
23f6eec7 2879 &pxor ($inout4,$inout4);
f8501464 2880 &sub ($len,0x50);
d64a7232
AP
2881 &jmp (&label("cbc_dec_tail_collected"));
2882
# Each tail variant below leaves its last decrypted block in $inout0 (not
# yet stored), the next IV in $ivec, and $len reduced by the block bytes.
d7d119a3 2883&set_label("cbc_dec_one",16);
6f766a41
AP
2884 if ($inline)
2885 { &aesni_inline_generate1("dec"); }
2886 else
2887 { &call ("_aesni_decrypt1"); }
f8501464
AP
2888 &xorps ($inout0,$ivec);
2889 &movaps ($ivec,$in0);
2890 &sub ($len,0x10);
d64a7232
AP
2891 &jmp (&label("cbc_dec_tail_collected"));
2892
d7d119a3 2893&set_label("cbc_dec_two",16);
214368ff 2894 &call ("_aesni_decrypt2");
f8501464
AP
2895 &xorps ($inout0,$ivec);
2896 &xorps ($inout1,$in0);
2897 &movups (&QWP(0,$out),$inout0);
2898 &movaps ($inout0,$inout1);
23f6eec7 2899 &pxor ($inout1,$inout1);
d64a7232 2900 &lea ($out,&DWP(0x10,$out));
f8501464
AP
2901 &movaps ($ivec,$in1);
2902 &sub ($len,0x20);
d608b4d6
AP
2903 &jmp (&label("cbc_dec_tail_collected"));
2904
d7d119a3 2905&set_label("cbc_dec_three",16);
d608b4d6 2906 &call ("_aesni_decrypt3");
f8501464
AP
2907 &xorps ($inout0,$ivec);
2908 &xorps ($inout1,$in0);
2909 &xorps ($inout2,$in1);
2910 &movups (&QWP(0,$out),$inout0);
2911 &movaps ($inout0,$inout2);
23f6eec7 2912 &pxor ($inout2,$inout2);
f8501464 2913 &movups (&QWP(0x10,$out),$inout1);
23f6eec7 2914 &pxor ($inout1,$inout1);
d608b4d6 2915 &lea ($out,&DWP(0x20,$out));
f8501464
AP
2916 &movups ($ivec,&QWP(0x20,$inp));
2917 &sub ($len,0x30);
2918 &jmp (&label("cbc_dec_tail_collected"));
2919
2920&set_label("cbc_dec_four",16);
2921 &call ("_aesni_decrypt4");
2922 &movups ($rndkey1,&QWP(0x10,$inp));
2923 &movups ($rndkey0,&QWP(0x20,$inp));
2924 &xorps ($inout0,$ivec);
2925 &movups ($ivec,&QWP(0x30,$inp));
2926 &xorps ($inout1,$in0);
2927 &movups (&QWP(0,$out),$inout0);
2928 &xorps ($inout2,$rndkey1);
2929 &movups (&QWP(0x10,$out),$inout1);
23f6eec7 2930 &pxor ($inout1,$inout1);
f8501464
AP
2931 &xorps ($inout3,$rndkey0);
2932 &movups (&QWP(0x20,$out),$inout2);
23f6eec7 2933 &pxor ($inout2,$inout2);
f8501464
AP
2934 &lea ($out,&DWP(0x30,$out));
2935 &movaps ($inout0,$inout3);
23f6eec7 2936 &pxor ($inout3,$inout3);
f8501464 2937 &sub ($len,0x40);
23f6eec7 2938 &jmp (&label("cbc_dec_tail_collected"));
d64a7232 2939
23f6eec7
AP
# Reached straight from the 6-block loop: wipe the registers that still
# hold decrypted blocks before falling into the common epilogue.
2940&set_label("cbc_dec_clear_tail_collected",16);
2941 &pxor ($inout1,$inout1);
2942 &pxor ($inout2,$inout2);
2943 &pxor ($inout3,$inout3);
2944 &pxor ($inout4,$inout4);
d64a7232
AP
# Whole final block: store it directly. Otherwise go through the stack.
2945&set_label("cbc_dec_tail_collected");
2946 &and ($len,15);
2947 &jnz (&label("cbc_dec_tail_partial"));
f8501464 2948 &movups (&QWP(0,$out),$inout0);
23f6eec7 2949 &pxor ($rndkey0,$rndkey0);
d64a7232
AP
2950 &jmp (&label("cbc_ret"));
2951
# Partial final block: spill it to the stack scratch and byte-copy from
# there, then overwrite the scratch so no plaintext is left on the stack.
d7d119a3 2952&set_label("cbc_dec_tail_partial",16);
f8501464 2953 &movaps (&QWP(0,"esp"),$inout0);
23f6eec7 2954 &pxor ($rndkey0,$rndkey0);
f8501464 2955 &mov ("ecx",16);
d64a7232 2956 &mov ($inp,"esp");
f8501464 2957 &sub ("ecx",$len);
d64a7232 2958 &data_word(0xA4F3F689); # rep movsb
# $rndkey0 was zeroed above; storing $inout0 here would merely rewrite the
# plaintext block and leave it behind in the scratch area.
23f6eec7 2959 &movdqa (&QWP(0,"esp"),$rndkey0); # clear stack
d64a7232
AP
2960
# Common exit: switch back to the caller's stack (wparam() is usable again),
# hand the chaining value back and clear the remaining data registers.
2961&set_label("cbc_ret");
f8501464 2962 &mov ("esp",&DWP(16,"esp")); # pull original %esp
d64a7232 2963 &mov ($key_,&wparam(4));
23f6eec7
AP
2964 &pxor ($inout0,$inout0);
2965 &pxor ($rndkey1,$rndkey1);
d64a7232 2966 &movups (&QWP(0,$key_),$ivec); # output IV
23f6eec7 2967 &pxor ($ivec,$ivec);
f8501464 2968&set_label("cbc_abort");
d64a7232 2969&function_end("${PREFIX}_cbc_encrypt");
6c83629b
AP
2970\f
2971######################################################################
d64a7232
AP
2972# Mechanical port from aesni-x86_64.pl.
2973#
2974# _aesni_set_encrypt_key is private interface,
2975# input:
2976# "eax" const unsigned char *userKey
2977# $rounds int bits
2978# $key AES_KEY *key
2979# output:
2980# "eax" return code
2981 # $rounds number of rounds minus one (9, 11 or 13)
2982
# Emits the private key-expansion routine shared by the public
# set_encrypt_key/set_decrypt_key entry points.
#
# Inputs are passed in registers: "eax" = userKey, $rounds = key size in
# bits (128/192/256), $key = AES_KEY to fill.  On return "eax" is 0 on
# success, -1 if either pointer is NULL, -2 for an unsupported bit count.
# On success $rounds holds 9, 11 or 13 and the same value has been stored
# at byte offset 240 from the start of the schedule.
#
# Two code paths exist per key size.  "ebp" is loaded with the second
# dword of OPENSSL_ia32cap_P masked to bits 28 and 11; when it equals
# exactly 1<<28 the *_alt variants (pshufb + aesenclast with constants
# from key_const) are used instead of aeskeygenassist.
#
# The key_128/key_192a/key_192b/key_256a/key_256b labels are local
# subroutines reached with call and terminated with ret.
2983&function_begin_B("_aesni_set_encrypt_key");
23f6eec7
AP
2984 &push ("ebp");
2985 &push ("ebx");
d64a7232
AP
2986 &test ("eax","eax");
2987 &jz (&label("bad_pointer"));
2988 &test ($key,$key);
2989 &jz (&label("bad_pointer"));
2990
23f6eec7
AP
# Position-independent address of key_const: call/pop yields the address
# of "pic", the lea adds the link-time distance to the table.
2991 &call (&label("pic"));
2992&set_label("pic");
2993 &blindpop("ebx");
2994 &lea ("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx"));
2995
2996 &picmeup("ebp","OPENSSL_ia32cap_P","ebx",&label("key_const"));
d64a7232 2997 &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
f8501464 2998 &xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0
23f6eec7 2999 &mov ("ebp",&DWP(4,"ebp"));
d64a7232 3000 &lea ($key,&DWP(16,$key));
23f6eec7 3001 &and ("ebp",1<<28|1<<11); # AVX and XOP bits
d64a7232
AP
3002 &cmp ($rounds,256);
3003 &je (&label("14rounds"));
3004 &cmp ($rounds,192);
3005 &je (&label("12rounds"));
3006 &cmp ($rounds,128);
3007 &jne (&label("bad_keybits"));
3008
# ---- 128-bit key, aeskeygenassist path.  $key already points one slot
# past round 0; each key_128 call stores the previous round key and
# advances $key by 16, the final round key is stored inline below.
3009&set_label("10rounds",16);
23f6eec7
AP
3010 &cmp ("ebp",1<<28);
3011 &je (&label("10rounds_alt"));
3012
d608b4d6 3013 &mov ($rounds,9);
d64a7232
AP
3014 &$movekey (&QWP(-16,$key),"xmm0"); # round 0
3015 &aeskeygenassist("xmm1","xmm0",0x01); # round 1
3016 &call (&label("key_128_cold"));
3017 &aeskeygenassist("xmm1","xmm0",0x2); # round 2
3018 &call (&label("key_128"));
3019 &aeskeygenassist("xmm1","xmm0",0x04); # round 3
3020 &call (&label("key_128"));
3021 &aeskeygenassist("xmm1","xmm0",0x08); # round 4
3022 &call (&label("key_128"));
3023 &aeskeygenassist("xmm1","xmm0",0x10); # round 5
3024 &call (&label("key_128"));
3025 &aeskeygenassist("xmm1","xmm0",0x20); # round 6
3026 &call (&label("key_128"));
3027 &aeskeygenassist("xmm1","xmm0",0x40); # round 7
3028 &call (&label("key_128"));
3029 &aeskeygenassist("xmm1","xmm0",0x80); # round 8
3030 &call (&label("key_128"));
3031 &aeskeygenassist("xmm1","xmm0",0x1b); # round 9
3032 &call (&label("key_128"));
3033 &aeskeygenassist("xmm1","xmm0",0x36); # round 10
3034 &call (&label("key_128"));
3035 &$movekey (&QWP(0,$key),"xmm0");
3036 &mov (&DWP(80,$key),$rounds);
23f6eec7
AP
3037
3038 &jmp (&label("good_key"));
d64a7232
AP
3039
# key_128: store current round key, advance, then derive the next one.
# key_128_cold: derive only (used for the first step, round 0 is stored
# by the caller).
3040&set_label("key_128",16);
3041 &$movekey (&QWP(0,$key),"xmm0");
3042 &lea ($key,&DWP(16,$key));
3043&set_label("key_128_cold");
3044 &shufps ("xmm4","xmm0",0b00010000);
f8501464
AP
3045 &xorps ("xmm0","xmm4");
3046 &shufps ("xmm4","xmm0",0b10001100);
3047 &xorps ("xmm0","xmm4");
3048 &shufps ("xmm1","xmm1",0b11111111); # critical path
3049 &xorps ("xmm0","xmm1");
d64a7232
AP
3050 &ret();
3051
23f6eec7
AP
# ---- 128-bit key, alternative path.  xmm5 = byte-shuffle mask from
# key_const+0x00, xmm4 = per-round constant vector from key_const+0x20,
# doubled with pslld after each use.  Eight looped rounds, then the
# constant is reloaded from key_const+0x30 for the last two rounds.
3052&set_label("10rounds_alt",16);
3053 &movdqa ("xmm5",&QWP(0x00,"ebx"));
3054 &mov ($rounds,8);
3055 &movdqa ("xmm4",&QWP(0x20,"ebx"));
3056 &movdqa ("xmm2","xmm0");
7be6bc68 3057 &movdqu (&QWP(-16,$key),"xmm0");
23f6eec7
AP
3058
3059&set_label("loop_key128");
3060 &pshufb ("xmm0","xmm5");
3061 &aesenclast ("xmm0","xmm4");
3062 &pslld ("xmm4",1);
3063 &lea ($key,&DWP(16,$key));
3064
3065 &movdqa ("xmm3","xmm2");
3066 &pslldq ("xmm2",4);
3067 &pxor ("xmm3","xmm2");
3068 &pslldq ("xmm2",4);
3069 &pxor ("xmm3","xmm2");
3070 &pslldq ("xmm2",4);
3071 &pxor ("xmm2","xmm3");
3072
3073 &pxor ("xmm0","xmm2");
3074 &movdqu (&QWP(-16,$key),"xmm0");
3075 &movdqa ("xmm2","xmm0");
3076
3077 &dec ($rounds);
3078 &jnz (&label("loop_key128"));
3079
3080 &movdqa ("xmm4",&QWP(0x30,"ebx"));
3081
3082 &pshufb ("xmm0","xmm5");
3083 &aesenclast ("xmm0","xmm4");
3084 &pslld ("xmm4",1);
3085
3086 &movdqa ("xmm3","xmm2");
3087 &pslldq ("xmm2",4);
3088 &pxor ("xmm3","xmm2");
3089 &pslldq ("xmm2",4);
3090 &pxor ("xmm3","xmm2");
3091 &pslldq ("xmm2",4);
3092 &pxor ("xmm2","xmm3");
3093
3094 &pxor ("xmm0","xmm2");
3095 &movdqu (&QWP(0,$key),"xmm0");
3096
3097 &movdqa ("xmm2","xmm0");
3098 &pshufb ("xmm0","xmm5");
3099 &aesenclast ("xmm0","xmm4");
3100
3101 &movdqa ("xmm3","xmm2");
3102 &pslldq ("xmm2",4);
3103 &pxor ("xmm3","xmm2");
3104 &pslldq ("xmm2",4);
3105 &pxor ("xmm3","xmm2");
3106 &pslldq ("xmm2",4);
3107 &pxor ("xmm2","xmm3");
3108
3109 &pxor ("xmm0","xmm2");
3110 &movdqu (&QWP(16,$key),"xmm0");
3111
3112 &mov ($rounds,9);
3113 &mov (&DWP(96,$key),$rounds);
3114
3115 &jmp (&label("good_key"));
3116
d64a7232
AP
# ---- 192-bit key.  The upper 64 bits of the user key live in xmm2; since
# 192 bits do not fill whole 128-bit slots, key_192a and key_192b
# alternate, with key_192b emitting two schedule slots per call.
3117&set_label("12rounds",16);
3118 &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey
23f6eec7
AP
3119 &cmp ("ebp",1<<28);
3120 &je (&label("12rounds_alt"));
3121
d608b4d6 3122 &mov ($rounds,11);
f9c5e5d9 3123 &$movekey (&QWP(-16,$key),"xmm0"); # round 0
d64a7232
AP
3124 &aeskeygenassist("xmm1","xmm2",0x01); # round 1,2
3125 &call (&label("key_192a_cold"));
3126 &aeskeygenassist("xmm1","xmm2",0x02); # round 2,3
3127 &call (&label("key_192b"));
3128 &aeskeygenassist("xmm1","xmm2",0x04); # round 4,5
3129 &call (&label("key_192a"));
3130 &aeskeygenassist("xmm1","xmm2",0x08); # round 5,6
3131 &call (&label("key_192b"));
3132 &aeskeygenassist("xmm1","xmm2",0x10); # round 7,8
3133 &call (&label("key_192a"));
3134 &aeskeygenassist("xmm1","xmm2",0x20); # round 8,9
3135 &call (&label("key_192b"));
3136 &aeskeygenassist("xmm1","xmm2",0x40); # round 10,11
3137 &call (&label("key_192a"));
3138 &aeskeygenassist("xmm1","xmm2",0x80); # round 11,12
3139 &call (&label("key_192b"));
3140 &$movekey (&QWP(0,$key),"xmm0");
3141 &mov (&DWP(48,$key),$rounds);
23f6eec7
AP
3142
3143 &jmp (&label("good_key"));
d64a7232
AP
3144
# key_192a: store one slot, advance 16, keep a copy of xmm2 in xmm5 for
# the following key_192b, then fall into the shared derivation.
3145&set_label("key_192a",16);
3146 &$movekey (&QWP(0,$key),"xmm0");
3147 &lea ($key,&DWP(16,$key));
3148&set_label("key_192a_cold",16);
3149 &movaps ("xmm5","xmm2");
3150&set_label("key_192b_warm");
3151 &shufps ("xmm4","xmm0",0b00010000);
f8501464
AP
3152 &movdqa ("xmm3","xmm2");
3153 &xorps ("xmm0","xmm4");
d64a7232
AP
3154 &shufps ("xmm4","xmm0",0b10001100);
3155 &pslldq ("xmm3",4);
f8501464 3156 &xorps ("xmm0","xmm4");
d64a7232
AP
3157 &pshufd ("xmm1","xmm1",0b01010101); # critical path
3158 &pxor ("xmm2","xmm3");
3159 &pxor ("xmm0","xmm1");
3160 &pshufd ("xmm3","xmm0",0b11111111);
3161 &pxor ("xmm2","xmm3");
3162 &ret();
3163
# key_192b: assemble two full slots from xmm5/xmm0/xmm2, store both,
# advance 32, then jump into the shared derivation (which returns).
3164&set_label("key_192b",16);
3165 &movaps ("xmm3","xmm0");
3166 &shufps ("xmm5","xmm0",0b01000100);
3167 &$movekey (&QWP(0,$key),"xmm5");
3168 &shufps ("xmm3","xmm2",0b01001110);
3169 &$movekey (&QWP(16,$key),"xmm3");
3170 &lea ($key,&DWP(32,$key));
3171 &jmp (&label("key_192b_warm"));
3172
23f6eec7
AP
# ---- 192-bit key, alternative path.  Shuffle mask comes from
# key_const+0x10; each of the eight iterations advances $key by 24 bytes.
3173&set_label("12rounds_alt",16);
3174 &movdqa ("xmm5",&QWP(0x10,"ebx"));
3175 &movdqa ("xmm4",&QWP(0x20,"ebx"));
3176 &mov ($rounds,8);
3177 &movdqu (&QWP(-16,$key),"xmm0");
3178
3179&set_label("loop_key192");
3180 &movq (&QWP(0,$key),"xmm2");
3181 &movdqa ("xmm1","xmm2");
3182 &pshufb ("xmm2","xmm5");
3183 &aesenclast ("xmm2","xmm4");
3184 &pslld ("xmm4",1);
3185 &lea ($key,&DWP(24,$key));
3186
3187 &movdqa ("xmm3","xmm0");
3188 &pslldq ("xmm0",4);
3189 &pxor ("xmm3","xmm0");
3190 &pslldq ("xmm0",4);
3191 &pxor ("xmm3","xmm0");
3192 &pslldq ("xmm0",4);
3193 &pxor ("xmm0","xmm3");
3194
3195 &pshufd ("xmm3","xmm0",0xff);
3196 &pxor ("xmm3","xmm1");
3197 &pslldq ("xmm1",4);
3198 &pxor ("xmm3","xmm1");
3199
3200 &pxor ("xmm0","xmm2");
3201 &pxor ("xmm2","xmm3");
3202 &movdqu (&QWP(-16,$key),"xmm0");
3203
3204 &dec ($rounds);
3205 &jnz (&label("loop_key192"));
3206
3207 &mov ($rounds,11);
3208 &mov (&DWP(32,$key),$rounds);
3209
3210 &jmp (&label("good_key"));
3211
d64a7232
AP
# ---- 256-bit key.  xmm0/xmm2 hold the two halves of the user key and are
# updated alternately by key_256a (xmm0) and key_256b (xmm2).  $key is
# advanced a further 16 bytes here, hence the -32/-16 offsets below.
3212&set_label("14rounds",16);
3213 &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey
d64a7232 3214 &lea ($key,&DWP(16,$key));
23f6eec7
AP
3215 &cmp ("ebp",1<<28);
3216 &je (&label("14rounds_alt"));
3217
3218 &mov ($rounds,13);
d64a7232
AP
3219 &$movekey (&QWP(-32,$key),"xmm0"); # round 0
3220 &$movekey (&QWP(-16,$key),"xmm2"); # round 1
3221 &aeskeygenassist("xmm1","xmm2",0x01); # round 2
3222 &call (&label("key_256a_cold"));
3223 &aeskeygenassist("xmm1","xmm0",0x01); # round 3
3224 &call (&label("key_256b"));
3225 &aeskeygenassist("xmm1","xmm2",0x02); # round 4
3226 &call (&label("key_256a"));
3227 &aeskeygenassist("xmm1","xmm0",0x02); # round 5
3228 &call (&label("key_256b"));
3229 &aeskeygenassist("xmm1","xmm2",0x04); # round 6
3230 &call (&label("key_256a"));
3231 &aeskeygenassist("xmm1","xmm0",0x04); # round 7
3232 &call (&label("key_256b"));
3233 &aeskeygenassist("xmm1","xmm2",0x08); # round 8
3234 &call (&label("key_256a"));
3235 &aeskeygenassist("xmm1","xmm0",0x08); # round 9
3236 &call (&label("key_256b"));
3237 &aeskeygenassist("xmm1","xmm2",0x10); # round 10
3238 &call (&label("key_256a"));
3239 &aeskeygenassist("xmm1","xmm0",0x10); # round 11
3240 &call (&label("key_256b"));
3241 &aeskeygenassist("xmm1","xmm2",0x20); # round 12
3242 &call (&label("key_256a"));
3243 &aeskeygenassist("xmm1","xmm0",0x20); # round 13
3244 &call (&label("key_256b"));
3245 &aeskeygenassist("xmm1","xmm2",0x40); # round 14
3246 &call (&label("key_256a"));
3247 &$movekey (&QWP(0,$key),"xmm0");
3248 &mov (&DWP(16,$key),$rounds);
# NOTE(review): this xor is repeated at good_key, so it looks redundant.
3249 &xor ("eax","eax");
23f6eec7
AP
3250
3251 &jmp (&label("good_key"));
d64a7232
AP
3252
# key_256a: store xmm2, advance, derive next xmm0 (dword 3 of xmm1).
3253&set_label("key_256a",16);
3254 &$movekey (&QWP(0,$key),"xmm2");
3255 &lea ($key,&DWP(16,$key));
3256&set_label("key_256a_cold");
3257 &shufps ("xmm4","xmm0",0b00010000);
f8501464 3258 &xorps ("xmm0","xmm4");
d64a7232 3259 &shufps ("xmm4","xmm0",0b10001100);
f8501464
AP
3260 &xorps ("xmm0","xmm4");
3261 &shufps ("xmm1","xmm1",0b11111111); # critical path
3262 &xorps ("xmm0","xmm1");
d64a7232
AP
3263 &ret();
3264
# key_256b: store xmm0, advance, derive next xmm2 (dword 2 of xmm1).
3265&set_label("key_256b",16);
3266 &$movekey (&QWP(0,$key),"xmm0");
3267 &lea ($key,&DWP(16,$key));
3268
3269 &shufps ("xmm4","xmm2",0b00010000);
f8501464 3270 &xorps ("xmm2","xmm4");
d64a7232 3271 &shufps ("xmm4","xmm2",0b10001100);
f8501464
AP
3272 &xorps ("xmm2","xmm4");
3273 &shufps ("xmm1","xmm1",0b10101010); # critical path
3274 &xorps ("xmm2","xmm1");
d64a7232
AP
3275 &ret();
3276
23f6eec7
AP
# ---- 256-bit key, alternative path.  Seven iterations; each produces one
# slot from xmm0 and, except on the last pass, a second one from xmm1.
3277&set_label("14rounds_alt",16);
3278 &movdqa ("xmm5",&QWP(0x00,"ebx"));
3279 &movdqa ("xmm4",&QWP(0x20,"ebx"));
3280 &mov ($rounds,7);
3281 &movdqu (&QWP(-32,$key),"xmm0");
3282 &movdqa ("xmm1","xmm2");
3283 &movdqu (&QWP(-16,$key),"xmm2");
3284
3285&set_label("loop_key256");
3286 &pshufb ("xmm2","xmm5");
3287 &aesenclast ("xmm2","xmm4");
3288
3289 &movdqa ("xmm3","xmm0");
3290 &pslldq ("xmm0",4);
3291 &pxor ("xmm3","xmm0");
3292 &pslldq ("xmm0",4);
3293 &pxor ("xmm3","xmm0");
3294 &pslldq ("xmm0",4);
3295 &pxor ("xmm0","xmm3");
3296 &pslld ("xmm4",1);
3297
3298 &pxor ("xmm0","xmm2");
3299 &movdqu (&QWP(0,$key),"xmm0");
3300
3301 &dec ($rounds);
3302 &jz (&label("done_key256"));
3303
3304 &pshufd ("xmm2","xmm0",0xff);
3305 &pxor ("xmm3","xmm3");
3306 &aesenclast ("xmm2","xmm3");
3307
bd30091c 3308 &movdqa ("xmm3","xmm1");
23f6eec7
AP
3309 &pslldq ("xmm1",4);
3310 &pxor ("xmm3","xmm1");
3311 &pslldq ("xmm1",4);
3312 &pxor ("xmm3","xmm1");
3313 &pslldq ("xmm1",4);
3314 &pxor ("xmm1","xmm3");
3315
3316 &pxor ("xmm2","xmm1");
3317 &movdqu (&QWP(16,$key),"xmm2");
3318 &lea ($key,&DWP(32,$key));
3319 &movdqa ("xmm1","xmm2");
3320 &jmp (&label("loop_key256"));
3321
3322&set_label("done_key256");
3323 &mov ($rounds,13);
3324 &mov (&DWP(16,$key),$rounds);
3325
# Success exit: wipe every xmm register that carried key material.
3326&set_label("good_key");
3327 &pxor ("xmm0","xmm0");
3328 &pxor ("xmm1","xmm1");
3329 &pxor ("xmm2","xmm2");
3330 &pxor ("xmm3","xmm3");
3331 &pxor ("xmm4","xmm4");
3332 &pxor ("xmm5","xmm5");
3333 &xor ("eax","eax");
3334 &pop ("ebx");
3335 &pop ("ebp");
3336 &ret ();
3337
d64a7232
AP
# Error exits.  bad_pointer is taken before any key bytes are loaded;
# bad_keybits is taken after xmm0 received the first 128 key bits, so
# xmm0 is cleared there.
3338&set_label("bad_pointer",4);
3339 &mov ("eax",-1);
23f6eec7
AP
3340 &pop ("ebx");
3341 &pop ("ebp");
d64a7232
AP
3342 &ret ();
3343&set_label("bad_keybits",4);
23f6eec7 3344 &pxor ("xmm0","xmm0");
d64a7232 3345 &mov ("eax",-2);
23f6eec7
AP
3346 &pop ("ebx");
3347 &pop ("ebp");
d64a7232
AP
3348 &ret ();
3349&function_end_B("_aesni_set_encrypt_key");
3350
3351# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
3352# AES_KEY *key)
# Public C-callable wrapper: moves the three stack arguments into the
# registers _aesni_set_encrypt_key expects ("eax" = userKey,
# $rounds = bits, $key = key) and returns that routine's "eax" unchanged.
3353&function_begin_B("${PREFIX}_set_encrypt_key");
3354 &mov ("eax",&wparam(0));
3355 &mov ($rounds,&wparam(1));
3356 &mov ($key,&wparam(2));
3357 &call ("_aesni_set_encrypt_key");
3358 &ret ();
3359&function_end_B("${PREFIX}_set_encrypt_key");
3360
3361# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
3362# AES_KEY *key)
# Builds a decryption schedule in place: first expands the encryption
# schedule via _aesni_set_encrypt_key, then reverses the order of the
# round keys and applies aesimc to every key except the first and last.
# If the expansion fails its error code in "eax" is returned as is and the
# schedule is left untouched by this routine.
3363&function_begin_B("${PREFIX}_set_decrypt_key");
3364 &mov ("eax",&wparam(0));
3365 &mov ($rounds,&wparam(1));
3366 &mov ($key,&wparam(2));
3367 &call ("_aesni_set_encrypt_key");
# $key was advanced by the callee, so reload it.  The shl turns the round
# count into a byte offset; the flags it sets are overwritten by the test.
3368 &mov ($key,&wparam(2));
f9c5e5d9 3369 &shl ($rounds,4); # rounds-1 after _aesni_set_encrypt_key
d64a7232
AP
3370 &test ("eax","eax");
3371 &jnz (&label("dec_key_ret"));
d608b4d6 3372 &lea ("eax",&DWP(16,$key,$rounds)); # end of key schedule
d64a7232
AP
3373
# Outermost pair: exchanged without aesimc.
3374 &$movekey ("xmm0",&QWP(0,$key)); # just swap
3375 &$movekey ("xmm1",&QWP(0,"eax"));
3376 &$movekey (&QWP(0,"eax"),"xmm0");
3377 &$movekey (&QWP(0,$key),"xmm1");
3378 &lea ($key,&DWP(16,$key));
3379 &lea ("eax",&DWP(-16,"eax"));
d64a7232 3380
# Walk inwards from both ends; the pointers are stepped before the stores,
# hence the 16/-16 offsets that address the slots just read.
d608b4d6 3381&set_label("dec_key_inverse");
d64a7232
AP
3382 &$movekey ("xmm0",&QWP(0,$key)); # swap and inverse
3383 &$movekey ("xmm1",&QWP(0,"eax"));
3384 &aesimc ("xmm0","xmm0");
3385 &aesimc ("xmm1","xmm1");
3386 &lea ($key,&DWP(16,$key));
3387 &lea ("eax",&DWP(-16,"eax"));
d64a7232
AP
3388 &$movekey (&QWP(16,"eax"),"xmm0");
3389 &$movekey (&QWP(-16,$key),"xmm1");
d7d119a3 3390 &cmp ("eax",$key);
d64a7232
AP
3391 &ja (&label("dec_key_inverse"));
3392
3393 &$movekey ("xmm0",&QWP(0,$key)); # inverse middle
3394 &aesimc ("xmm0","xmm0");
3395 &$movekey (&QWP(0,$key),"xmm0");
3396
23f6eec7
AP
# Do not leave round-key material behind in the xmm registers.
3397 &pxor ("xmm0","xmm0");
3398 &pxor ("xmm1","xmm1");
d64a7232
AP
3399 &xor ("eax","eax"); # return success
3400&set_label("dec_key_ret");
3401 &ret ();
3402&function_end_B("${PREFIX}_set_decrypt_key");
23f6eec7
AP
3403
# Constant table for the *_alt key-expansion paths, addressed relative to
# "ebx" inside _aesni_set_encrypt_key (64-byte aligned, 16 bytes per row):
#   +0x00  pshufb mask loaded by 10rounds_alt and 14rounds_alt
#   +0x10  pshufb mask loaded by 12rounds_alt
#   +0x20  initial round-constant vector (1 in every dword)
#   +0x30  round-constant vector 0x1b, reloaded late in 10rounds_alt
3404&set_label("key_const",64);
3405&data_word(0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d);
3406&data_word(0x04070605,0x04070605,0x04070605,0x04070605);
3407&data_word(1,1,1,1);
3408&data_word(0x1b,0x1b,0x1b,0x1b);
d64a7232
AP
# Identification string embedded in the generated object.
3409&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");
3410
# Flush the accumulated assembly to STDOUT and make sure it really got
# there: buffered write errors (e.g. a full disk) only surface at close,
# and an unchecked close would leave a truncated output file behind while
# the build carries on.
3411&asm_finish();
184bc45f
RL
3412
3413close STDOUT or die "error closing STDOUT: $!";