]> git.ipfire.org Git - thirdparty/openssl.git/blob - crypto/aes/asm/aesni-x86.pl
9b2e37aafb1a05977a8675bccb3bf508ab9f49e4
[thirdparty/openssl.git] / crypto / aes / asm / aesni-x86.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9 #
10 # This module implements support for Intel AES-NI extension. In
11 # OpenSSL context it's used with Intel engine, but can also be used as
12 # drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
13 # details].
14 #
15 # Performance.
16 #
17 # To start with see corresponding paragraph in aesni-x86_64.pl...
18 # Instead of filling table similar to one found there I've chosen to
19 # summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
20 # The simplified table below represents 32-bit performance relative
21 # to 64-bit one in every given point. Ratios vary for different
22 # encryption modes, therefore interval values.
23 #
24 # 16-byte 64-byte 256-byte 1-KB 8-KB
25 # 53-67% 67-84% 91-94% 95-98% 97-99.5%
26 #
27 # Lower ratios for smaller block sizes are perfectly understandable,
28 # because function call overhead is higher in 32-bit mode. Largest
29 # 8-KB block performance is virtually same: 32-bit code is less than
30 # 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
31
32 # January 2011
33 #
34 # See aesni-x86_64.pl for details. Unlike x86_64 version this module
35 # interleaves at most 6 aes[enc|dec] instructions, because there are
36 # not enough registers for 8x interleave [which should be optimal for
37 # Sandy Bridge]. Actually, performance results for 6x interleave
38 # factor presented in aesni-x86_64.pl (except for CTR) are for this
39 # module.
40
41 # April 2011
42 #
43 # Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
44 # one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
45
46 ######################################################################
47 # Current large-block performance in cycles per byte processed with
48 # 128-bit key (less is better).
49 #
50 # CBC en-/decrypt CTR XTS ECB
51 # Westmere 3.77/1.37 1.37 1.52 1.27
52 # * Bridge 5.07/0.98 0.99 1.09 0.91
53 # Haswell 4.44/0.80 0.97 1.03 0.72
54 # Silvermont 5.77/3.56 3.67 4.03 3.46
55 # Bulldozer 5.80/0.98 1.05 1.24 0.93
56
# Build-time configuration for the generator.
57 $PREFIX="aesni"; # if $PREFIX is set to "AES", the script
58 # generates drop-in replacement for
59 # crypto/aes/asm/aes-586.pl:-)
60 $inline=1; # inline _aesni_[en|de]crypt
61
# Take the directory part of $0 (everything up to the last '/' or '\') and
# add it, together with its ../../perlasm sibling, to @INC before loading
# x86asm.pl.
62 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
63 push(@INC,"${dir}","${dir}../../perlasm");
64 require "x86asm.pl";
65
66 &asm_init($ARGV[0],$0);
67
68 &external_label("OPENSSL_ia32cap_P");
69 &static_label("key_const");
70
# $movekey is the emitter used for round-key loads. Both branches currently
# select movups, so the choice does not depend on $PREFIX.
71 if ($PREFIX eq "aesni") { $movekey=\&movups; }
72 else { $movekey=\&movups; }
73
# Symbolic names for the general-purpose registers used by the emitted code.
74 $len="eax";
75 $rounds="ecx";
76 $key="edx";
77 $inp="esi";
78 $out="edi";
79 $rounds_="ebx"; # backup copy for $rounds
80 $key_="ebp"; # backup copy for $key
81
# XMM register allocation. $in1, $in0 and $ivec name the same registers as
# $inout3, $inout4 and $inout5 respectively (xmm5, xmm6, xmm7), so a routine
# can use one name or the other for a given register, not both at once.
82 $rndkey0="xmm0";
83 $rndkey1="xmm1";
84 $inout0="xmm2";
85 $inout1="xmm3";
86 $inout2="xmm4";
87 $inout3="xmm5"; $in1="xmm5";
88 $inout4="xmm6"; $in0="xmm6";
89 $inout5="xmm7"; $ivec="xmm7";
90
91 # AESNI extension
# Emit AESKEYGENASSIST $dst,$src,$imm as raw bytes (66 0F 3A DF /r ib).
# Only register-to-register forms with xmm0..xmm7 operands are encoded;
# any other operand combination emits nothing.
sub aeskeygenassist
{ my ($dst,$src,$imm)=@_;
  my $operands="$dst:$src";

    if ($operands =~ /xmm([0-7]):xmm([0-7])/)
    {	my ($reg,$rm)=($1,$2);
	# ModR/M: mod=11 (register direct), reg=destination, r/m=source
	&data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($reg<<3)|$rm,$imm);
    }
}
# Emit one two-operand AES-NI instruction from the 66 0F 38 xx /r group as
# raw bytes. $opcodelet is the final opcode byte; $dst and $src must both be
# xmm0..xmm7, otherwise nothing is emitted.
sub aescommon
{ my ($opcodelet,$dst,$src)=@_;
  my $operands="$dst:$src";

    if ($operands =~ /xmm([0-7]):xmm([0-7])/)
    {	my ($reg,$rm)=($1,$2);
	# ModR/M: mod=11 (register direct), reg=destination, r/m=source
	&data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($reg<<3)|$rm);
    }
}
# Mnemonic wrappers around aescommon; each one supplies only the opcode byte
# and forwards its operands unchanged:
#   aesimc 0xdb, aesenc 0xdc, aesenclast 0xdd, aesdec 0xde, aesdeclast 0xdf
sub aesimc	{ my @operands=@_; return aescommon(0xdb,@operands); }
sub aesenc	{ my @operands=@_; return aescommon(0xdc,@operands); }
sub aesenclast	{ my @operands=@_; return aescommon(0xdd,@operands); }
sub aesdec	{ my @operands=@_; return aescommon(0xde,@operands); }
sub aesdeclast	{ my @operands=@_; return aescommon(0xdf,@operands); }
107 \f
# Inline version of internal aesni_[en|de]crypt1.
#
# Emits a single-block round loop in place, driven by $rounds and $key (both
# are modified by the emitted code).
#   $p     - "enc" or "dec", selects the aes${p}/aes${p}last instructions
#   $inout - register holding the block, defaults to $inout0
#   $ivec  - optional register; when given, round key 0 is xor-ed into it and
#            the result is xor-ed into $inout, instead of xor-ing round key 0
#            into $inout directly
# Every call gets its own loop label through a private counter.
{ my $seq;	# number of sequences emitted so far, makes loop labels unique

sub aesni_inline_generate1
{ my ($p,$inout,$ivec)=@_;
  $inout=$inout0 unless defined($inout);
  my $loop="${p}1_loop_".(++$seq);

    &$movekey		($rndkey0,&QWP(0,$key));
    &$movekey		($rndkey1,&QWP(16,$key));
    if (defined($ivec))
    {	&xorps		($ivec,$rndkey0);
	&lea		($key,&DWP(32,$key));
	&xorps		($inout,$ivec);
    }
    else
    {	&lea		($key,&DWP(32,$key));
	&xorps		($inout,$rndkey0);
    }
    &set_label($loop);
	eval"&aes${p}	($inout,$rndkey1)";
	&dec		($rounds);
	&$movekey	($rndkey1,&QWP(0,$key));
	&lea		($key,&DWP(16,$key));
    &jnz		(&label($loop));
    eval"&aes${p}last	($inout,$rndkey1)";
}
}
128
# Emit the out-of-line, fully unrolled _aesni_${p}rypt1 routine.
#   $p     - "enc" or "dec"
#   $inout - register holding the block, defaults to $inout0
# The emitted code compares $rounds with 11 and enters the unrolled round
# sequence at the ${p}128 label (below 11), the ${p}192 label (equal) or at
# its very top (above 11), after advancing $key so that the remaining round
# keys are reached with fixed displacements.
sub aesni_generate1	# fully unrolled loop
{ my ($p,$inout)=@_;
  $inout=$inout0 unless defined($inout);

  # Two rounds: one with $rndkey1, one with $rndkey0, each followed by a
  # reload of the register just consumed from $off and $off+0x10.
  my $two_rounds=sub
  { my $off=shift;
	eval"&aes${p}	($inout,$rndkey1)";
	&$movekey	($rndkey1,&QWP($off,$key));
	eval"&aes${p}	($inout,$rndkey0)";
	&$movekey	($rndkey0,&QWP($off+0x10,$key));
  };

    &function_begin_B("_aesni_${p}rypt1");
	&movups		($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey1,&QWP(0x10,$key));
	&xorps		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(0x20,$key));
	&lea		($key,&DWP(0x30,$key));
	&cmp		($rounds,11);
	&jb		(&label("${p}128"));
	&lea		($key,&DWP(0x20,$key));
	&je		(&label("${p}192"));
	&lea		($key,&DWP(0x20,$key));
	$two_rounds->(-0x40);
    &set_label("${p}192");
	$two_rounds->(-0x20);
    &set_label("${p}128");
	foreach my $off (0,0x20,0x40,0x60)
	{   $two_rounds->($off);   }
	eval"&aes${p}	($inout,$rndkey1)";
    eval"&aes${p}last	($inout,$rndkey0)";
    &ret();
    &function_end_B("_aesni_${p}rypt1");
}
174 \f
# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
#
# The two single-block entry points differ only in direction, so both are
# emitted by the same sequence: load the block from inp, run one
# encrypt/decrypt pass (inlined or through _aesni_${p}rypt1, which is emitted
# just ahead of its caller when $inline is off), store the result to out and
# clear the XMM registers that were used.
foreach my $p ("enc","dec")
{ my $entry="${PREFIX}_${p}rypt";

    &aesni_generate1($p)	if (!$inline);
    &function_begin_B($entry);
	&mov	("eax",&wparam(0));
	&mov	($key,&wparam(2));
	&movups	($inout0,&QWP(0,"eax"));
	&mov	($rounds,&DWP(240,$key));
	&mov	("eax",&wparam(1));
	if ($inline)
	{   &aesni_inline_generate1($p);	}
	else
	{   &call	("_aesni_${p}rypt1");	}
	&pxor	($rndkey0,$rndkey0);		# clear register bank
	&pxor	($rndkey1,$rndkey1);
	&movups	(&QWP(0,"eax"),$inout0);
	&pxor	($inout0,$inout0);
	&ret	();
    &function_end_B($entry);
}
212
213 # _aesni_[en|de]cryptN are private interfaces, N denotes interleave
214 # factor. Why were 3x subroutines originally used in loops? Even though
215 # aes[enc|dec] latency was originally 6, it could be scheduled only
216 # every *2nd* cycle. Thus 3x interleave was the one providing optimal
217 # utilization, i.e. when subroutine's throughput is virtually same as
218 # of non-interleaved subroutine [for number of input blocks up to 3].
219 # This is why it originally made no sense to implement 2x subroutine.
220 # But times change and it became appropriate to spend extra 192 bytes
221 # on 2x subroutine on Atom Silvermont account. For processors that
222 # can schedule aes[enc|dec] every cycle optimal interleave factor
223 # equals to corresponding instructions latency. 8x is optimal for
224 # * Bridge, but it's unfeasible to accommodate such implementation
225 # in XMM registers addressable in 32-bit mode and therefore maximum
226 # of 6x is used instead...
227
# Emit _aesni_${p}rypt2, which processes $inout0 and $inout1 in parallel.
# $p is "enc" or "dec". The emitted code turns $rounds into a negative byte
# offset that counts up to zero and points $key past the key schedule, so
# the round loop needs no separate counter; both registers are clobbered.
sub aesni_generate2
{ my $p=shift;
  my @blocks=($inout0,$inout1);

  # apply instruction $op with round key $rk to every block, in order
  my $each_block=sub
  { my ($op,$rk)=@_;
	foreach my $blk (@blocks)
	{   eval"&$op	($blk,$rk)";   }
  };

    &function_begin_B("_aesni_${p}rypt2");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($blocks[0],$rndkey0);
	&pxor		($blocks[1],$rndkey0);
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&add		($rounds,16);

    &set_label("${p}2_loop");
	$each_block->("aes${p}",$rndkey1);
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	$each_block->("aes${p}",$rndkey0);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    &jnz		(&label("${p}2_loop"));
	$each_block->("aes${p}",$rndkey1);
	$each_block->("aes${p}last",$rndkey0);
    &ret();
    &function_end_B("_aesni_${p}rypt2");
}
258
# Emit _aesni_${p}rypt3, which processes $inout0..$inout2 in parallel.
# $p is "enc" or "dec". The emitted code turns $rounds into a negative byte
# offset that counts up to zero and points $key past the key schedule, so
# the round loop needs no separate counter; both registers are clobbered.
sub aesni_generate3
{ my $p=shift;
  my @blocks=($inout0,$inout1,$inout2);

  # apply instruction $op with round key $rk to every block, in order
  my $each_block=sub
  { my ($op,$rk)=@_;
	foreach my $blk (@blocks)
	{   eval"&$op	($blk,$rk)";   }
  };

    &function_begin_B("_aesni_${p}rypt3");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($blocks[0],$rndkey0);
	&pxor		($blocks[1],$rndkey0);
	&pxor		($blocks[2],$rndkey0);
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&add		($rounds,16);

    &set_label("${p}3_loop");
	$each_block->("aes${p}",$rndkey1);
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	$each_block->("aes${p}",$rndkey0);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    &jnz		(&label("${p}3_loop"));
	$each_block->("aes${p}",$rndkey1);
	$each_block->("aes${p}last",$rndkey0);
    &ret();
    &function_end_B("_aesni_${p}rypt3");
}
294
# 4x interleave is implemented to improve small block performance,
# most notably [and naturally] 4 block by ~30%. One can argue that one
# should have implemented 5x as well, but improvement would be <20%,
# so it's not worth it...
#
# Emits _aesni_${p}rypt4, which processes $inout0..$inout3 in parallel.
# $p is "enc" or "dec"; $key and $rounds are clobbered by the emitted code.
sub aesni_generate4
{ my $p=shift;
  my @blocks=($inout0,$inout1,$inout2,$inout3);

  # apply instruction $op with round key $rk to every block, in order
  my $each_block=sub
  { my ($op,$rk)=@_;
	foreach my $blk (@blocks)
	{   eval"&$op	($blk,$rk)";   }
  };

    &function_begin_B("_aesni_${p}rypt4");
	&$movekey	($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey1,&QWP(16,$key));
	&shl		($rounds,4);
	&xorps		($blocks[0],$rndkey0);
	&pxor		($blocks[1],$rndkey0);
	&pxor		($blocks[2],$rndkey0);
	&pxor		($blocks[3],$rndkey0);
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&data_byte	(0x0f,0x1f,0x40,0x00);	# four raw bytes emitted ahead of the loop
	&add		($rounds,16);

    &set_label("${p}4_loop");
	$each_block->("aes${p}",$rndkey1);
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	$each_block->("aes${p}",$rndkey0);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    &jnz		(&label("${p}4_loop"));

	$each_block->("aes${p}",$rndkey1);
	$each_block->("aes${p}last",$rndkey0);
    &ret();
    &function_end_B("_aesni_${p}rypt4");
}
341
# Emit _aesni_${p}rypt6, which processes $inout0..$inout5 in parallel.
# $p is "enc" or "dec"; $key and $rounds are clobbered by the emitted code.
# The prologue already issues the first round for $inout0..$inout2 while the
# remaining blocks are still being xor-ed with round key 0, then jumps into
# the middle of the loop body (the "_inner" label). The "_enter" label is
# declared static so that code outside this routine can reference it.
sub aesni_generate6
{ my $p=shift;
  my $aes="aes${p}";
  my @lower=($inout0,$inout1,$inout2);
  my @upper=($inout3,$inout4,$inout5);

  # apply instruction $op with round key $rk to the listed registers, in order
  my $emit=sub
  { my ($op,$rk,@regs)=@_;
	foreach my $reg (@regs)
	{   eval"&$op	($reg,$rk)";   }
  };

    &function_begin_B("_aesni_${p}rypt6");
    &static_label("_aesni_${p}rypt6_enter");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);	# pxor does better here
	&pxor		($inout2,$rndkey0);
	$emit->($aes,$rndkey1,$inout0);
	&pxor		($inout3,$rndkey0);
	&pxor		($inout4,$rndkey0);
	$emit->($aes,$rndkey1,$inout1);
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	$emit->($aes,$rndkey1,$inout2);
	&pxor		($inout5,$rndkey0);
	&$movekey	($rndkey0,&QWP(0,$key,$rounds));
	&add		($rounds,16);
	&jmp		(&label("_aesni_${p}rypt6_inner"));

    &set_label("${p}6_loop",16);
	$emit->($aes,$rndkey1,@lower);
    &set_label("_aesni_${p}rypt6_inner");
	$emit->($aes,$rndkey1,@upper);
    &set_label("_aesni_${p}rypt6_enter");
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	$emit->($aes,$rndkey0,@lower,@upper);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    &jnz		(&label("${p}6_loop"));

	$emit->($aes,$rndkey1,@lower,@upper);
	$emit->("${aes}last",$rndkey0,@lower,@upper);
    &ret();
    &function_end_B("_aesni_${p}rypt6");
}
# Emit the 2x, 3x, 4x and 6x helpers in that order. The decrypt flavour is
# always emitted; the encrypt flavour only when $PREFIX is "aesni".
foreach my $generate (\&aesni_generate2,\&aesni_generate3,
		      \&aesni_generate4,\&aesni_generate6)
{   &$generate("enc")	if ($PREFIX eq "aesni");
    &$generate("dec");
}
408 \f
409 if ($PREFIX eq "aesni") {
410 ######################################################################
411 # void aesni_ecb_encrypt (const void *in, void *out,
412 # size_t length, const AES_KEY *key,
413 # int enc);
# Prologue: load the five arguments, round the length down to a whole number
# of 16-byte blocks and return at once if nothing is left. The round count is
# read from offset 240 of the key structure. A zero fifth argument selects
# the decrypt path.
414 &function_begin("aesni_ecb_encrypt");
415 &mov ($inp,&wparam(0));
416 &mov ($out,&wparam(1));
417 &mov ($len,&wparam(2));
418 &mov ($key,&wparam(3));
# $rounds_ temporarily holds the enc/dec flag until it is tested below
419 &mov ($rounds_,&wparam(4));
420 &and ($len,-16);
421 &jz (&label("ecb_ret"));
422 &mov ($rounds,&DWP(240,$key));
423 &test ($rounds_,$rounds_);
424 &jz (&label("ecb_decrypt"));
425
# ECB encrypt, bulk path: six blocks (0x60 bytes) per iteration.
# $key and $rounds are saved first because _aesni_encrypt6 modifies both
# (see aesni_generate6) and they must be restored after every call.
426 &mov ($key_,$key); # backup $key
427 &mov ($rounds_,$rounds); # backup $rounds
428 &cmp ($len,0x60);
429 &jb (&label("ecb_enc_tail"));
430
# Load the first six blocks and enter the loop at the call, skipping the
# store half of the loop body, which has nothing to store yet.
431 &movdqu ($inout0,&QWP(0,$inp));
432 &movdqu ($inout1,&QWP(0x10,$inp));
433 &movdqu ($inout2,&QWP(0x20,$inp));
434 &movdqu ($inout3,&QWP(0x30,$inp));
435 &movdqu ($inout4,&QWP(0x40,$inp));
436 &movdqu ($inout5,&QWP(0x50,$inp));
437 &lea ($inp,&DWP(0x60,$inp));
438 &sub ($len,0x60);
439 &jmp (&label("ecb_enc_loop6_enter"));
440
# Loop body: store the six results of the previous iteration, each store
# paired with the load of the next input block into the same register.
441 &set_label("ecb_enc_loop6",16);
442 &movups (&QWP(0,$out),$inout0);
443 &movdqu ($inout0,&QWP(0,$inp));
444 &movups (&QWP(0x10,$out),$inout1);
445 &movdqu ($inout1,&QWP(0x10,$inp));
446 &movups (&QWP(0x20,$out),$inout2);
447 &movdqu ($inout2,&QWP(0x20,$inp));
448 &movups (&QWP(0x30,$out),$inout3);
449 &movdqu ($inout3,&QWP(0x30,$inp));
450 &movups (&QWP(0x40,$out),$inout4);
451 &movdqu ($inout4,&QWP(0x40,$inp));
452 &movups (&QWP(0x50,$out),$inout5);
453 &lea ($out,&DWP(0x60,$out));
454 &movdqu ($inout5,&QWP(0x50,$inp));
455 &lea ($inp,&DWP(0x60,$inp));
456 &set_label("ecb_enc_loop6_enter");
457
458 &call ("_aesni_encrypt6");
459
460 &mov ($key,$key_); # restore $key
461 &mov ($rounds,$rounds_); # restore $rounds
# keep looping while at least six more blocks remain (no borrow)
462 &sub ($len,0x60);
463 &jnc (&label("ecb_enc_loop6"));
464
# Store the results of the final six-block pass.
465 &movups (&QWP(0,$out),$inout0);
466 &movups (&QWP(0x10,$out),$inout1);
467 &movups (&QWP(0x20,$out),$inout2);
468 &movups (&QWP(0x30,$out),$inout3);
469 &movups (&QWP(0x40,$out),$inout4);
470 &movups (&QWP(0x50,$out),$inout5);
471 &lea ($out,&DWP(0x60,$out));
# undo the last subtraction; what is left is the tail length in bytes
472 &add ($len,0x60);
473 &jz (&label("ecb_ret"));
474
# ECB encrypt, tail: one to five blocks remain ($len is 0x10..0x50 here).
# Blocks are loaded one at a time between the comparisons; each je reuses the
# flags of the preceding cmp, since the loads in between do not change them.
# Five blocks go through the 6x routine with $inout5 zeroed, and only the
# first five results are stored.
475 &set_label("ecb_enc_tail");
476 &movups ($inout0,&QWP(0,$inp));
477 &cmp ($len,0x20);
478 &jb (&label("ecb_enc_one"));
479 &movups ($inout1,&QWP(0x10,$inp));
480 &je (&label("ecb_enc_two"));
481 &movups ($inout2,&QWP(0x20,$inp));
482 &cmp ($len,0x40);
483 &jb (&label("ecb_enc_three"));
484 &movups ($inout3,&QWP(0x30,$inp));
485 &je (&label("ecb_enc_four"));
486 &movups ($inout4,&QWP(0x40,$inp));
487 &xorps ($inout5,$inout5);
488 &call ("_aesni_encrypt6");
489 &movups (&QWP(0,$out),$inout0);
490 &movups (&QWP(0x10,$out),$inout1);
491 &movups (&QWP(0x20,$out),$inout2);
492 &movups (&QWP(0x30,$out),$inout3);
493 &movups (&QWP(0x40,$out),$inout4);
# called with the & sigil like every other emitter in this file
494 &jmp (&label("ecb_ret"));
495
# one block: inlined single-block sequence, or a call when $inline is off
496 &set_label("ecb_enc_one",16);
497 if ($inline)
498 { &aesni_inline_generate1("enc"); }
499 else
500 { &call ("_aesni_encrypt1"); }
501 &movups (&QWP(0,$out),$inout0);
502 &jmp (&label("ecb_ret"));
503
# two blocks
504 &set_label("ecb_enc_two",16);
505 &call ("_aesni_encrypt2");
506 &movups (&QWP(0,$out),$inout0);
507 &movups (&QWP(0x10,$out),$inout1);
508 &jmp (&label("ecb_ret"));
509
# three blocks
510 &set_label("ecb_enc_three",16);
511 &call ("_aesni_encrypt3");
512 &movups (&QWP(0,$out),$inout0);
513 &movups (&QWP(0x10,$out),$inout1);
514 &movups (&QWP(0x20,$out),$inout2);
515 &jmp (&label("ecb_ret"));
516
# four blocks
517 &set_label("ecb_enc_four",16);
518 &call ("_aesni_encrypt4");
519 &movups (&QWP(0,$out),$inout0);
520 &movups (&QWP(0x10,$out),$inout1);
521 &movups (&QWP(0x20,$out),$inout2);
522 &movups (&QWP(0x30,$out),$inout3);
523 &jmp (&label("ecb_ret"));
524 ######################################################################
# ECB decrypt, bulk path: six blocks (0x60 bytes) per iteration.
# $key and $rounds are saved first because _aesni_decrypt6 modifies both
# (see aesni_generate6) and they must be restored after every call.
525 &set_label("ecb_decrypt",16);
526 &mov ($key_,$key); # backup $key
527 &mov ($rounds_,$rounds); # backup $rounds
528 &cmp ($len,0x60);
529 &jb (&label("ecb_dec_tail"));
530
# Load the first six blocks and enter the loop at the call, skipping the
# store half of the loop body, which has nothing to store yet.
531 &movdqu ($inout0,&QWP(0,$inp));
532 &movdqu ($inout1,&QWP(0x10,$inp));
533 &movdqu ($inout2,&QWP(0x20,$inp));
534 &movdqu ($inout3,&QWP(0x30,$inp));
535 &movdqu ($inout4,&QWP(0x40,$inp));
536 &movdqu ($inout5,&QWP(0x50,$inp));
537 &lea ($inp,&DWP(0x60,$inp));
538 &sub ($len,0x60);
539 &jmp (&label("ecb_dec_loop6_enter"));
540
# Loop body: store the six results of the previous iteration, each store
# paired with the load of the next input block into the same register.
541 &set_label("ecb_dec_loop6",16);
542 &movups (&QWP(0,$out),$inout0);
543 &movdqu ($inout0,&QWP(0,$inp));
544 &movups (&QWP(0x10,$out),$inout1);
545 &movdqu ($inout1,&QWP(0x10,$inp));
546 &movups (&QWP(0x20,$out),$inout2);
547 &movdqu ($inout2,&QWP(0x20,$inp));
548 &movups (&QWP(0x30,$out),$inout3);
549 &movdqu ($inout3,&QWP(0x30,$inp));
550 &movups (&QWP(0x40,$out),$inout4);
551 &movdqu ($inout4,&QWP(0x40,$inp));
552 &movups (&QWP(0x50,$out),$inout5);
553 &lea ($out,&DWP(0x60,$out));
554 &movdqu ($inout5,&QWP(0x50,$inp));
555 &lea ($inp,&DWP(0x60,$inp));
556 &set_label("ecb_dec_loop6_enter");
557
558 &call ("_aesni_decrypt6");
559
560 &mov ($key,$key_); # restore $key
561 &mov ($rounds,$rounds_); # restore $rounds
# keep looping while at least six more blocks remain (no borrow)
562 &sub ($len,0x60);
563 &jnc (&label("ecb_dec_loop6"));
564
# Store the results of the final six-block pass.
565 &movups (&QWP(0,$out),$inout0);
566 &movups (&QWP(0x10,$out),$inout1);
567 &movups (&QWP(0x20,$out),$inout2);
568 &movups (&QWP(0x30,$out),$inout3);
569 &movups (&QWP(0x40,$out),$inout4);
570 &movups (&QWP(0x50,$out),$inout5);
571 &lea ($out,&DWP(0x60,$out));
# undo the last subtraction; what is left is the tail length in bytes
572 &add ($len,0x60);
573 &jz (&label("ecb_ret"));
574
# ECB decrypt, tail: one to five blocks remain ($len is 0x10..0x50 here).
# Blocks are loaded one at a time between the comparisons; each je reuses the
# flags of the preceding cmp, since the loads in between do not change them.
# Five blocks go through the 6x routine with $inout5 zeroed, and only the
# first five results are stored.
575 &set_label("ecb_dec_tail");
576 &movups ($inout0,&QWP(0,$inp));
577 &cmp ($len,0x20);
578 &jb (&label("ecb_dec_one"));
579 &movups ($inout1,&QWP(0x10,$inp));
580 &je (&label("ecb_dec_two"));
581 &movups ($inout2,&QWP(0x20,$inp));
582 &cmp ($len,0x40);
583 &jb (&label("ecb_dec_three"));
584 &movups ($inout3,&QWP(0x30,$inp));
585 &je (&label("ecb_dec_four"));
586 &movups ($inout4,&QWP(0x40,$inp));
587 &xorps ($inout5,$inout5);
588 &call ("_aesni_decrypt6");
589 &movups (&QWP(0,$out),$inout0);
590 &movups (&QWP(0x10,$out),$inout1);
591 &movups (&QWP(0x20,$out),$inout2);
592 &movups (&QWP(0x30,$out),$inout3);
593 &movups (&QWP(0x40,$out),$inout4);
594 &jmp (&label("ecb_ret"));
595
# one block: inlined single-block sequence, or a call when $inline is off
596 &set_label("ecb_dec_one",16);
597 if ($inline)
598 { &aesni_inline_generate1("dec"); }
599 else
600 { &call ("_aesni_decrypt1"); }
601 &movups (&QWP(0,$out),$inout0);
602 &jmp (&label("ecb_ret"));
603
# two blocks
604 &set_label("ecb_dec_two",16);
605 &call ("_aesni_decrypt2");
606 &movups (&QWP(0,$out),$inout0);
607 &movups (&QWP(0x10,$out),$inout1);
608 &jmp (&label("ecb_ret"));
609
# three blocks
610 &set_label("ecb_dec_three",16);
611 &call ("_aesni_decrypt3");
612 &movups (&QWP(0,$out),$inout0);
613 &movups (&QWP(0x10,$out),$inout1);
614 &movups (&QWP(0x20,$out),$inout2);
615 &jmp (&label("ecb_ret"));
616
# four blocks; no jump afterwards, execution continues with whatever is
# emitted next
617 &set_label("ecb_dec_four",16);
618 &call ("_aesni_decrypt4");
619 &movups (&QWP(0,$out),$inout0);
620 &movups (&QWP(0x10,$out),$inout1);
621 &movups (&QWP(0x20,$out),$inout2);
622 &movups (&QWP(0x30,$out),$inout3);
623
&set_label("ecb_ret");
	# clear register bank: zero xmm0 through xmm7 before returning
	foreach my $n (0..7)
	{   &pxor	("xmm$n","xmm$n");   }
&function_end("aesni_ecb_encrypt");
634 \f
635 ######################################################################
636 # void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
637 # size_t blocks, const AES_KEY *key,
638 # const char *ivec,char *cmac);
639 #
640 # Handles only complete blocks, operates on 64-bit counter and
641 # does not update *ivec! Nor does it finalize CMAC value
642 # (see engine/eng_aesni.c for details)
643 #
644 { my $cmac=$inout1;
# Emit aesni_ccm64_encrypt_blocks. Each outer iteration handles one block:
# the counter block ($inout0) and the running CMAC ($cmac) go through the
# AES rounds side by side, then the input block is xor-ed with the encrypted
# counter and written out.
645 &function_begin("aesni_ccm64_encrypt_blocks");
646 &mov ($inp,&wparam(0));
647 &mov ($out,&wparam(1));
648 &mov ($len,&wparam(2));
649 &mov ($key,&wparam(3));
# $rounds_ and $rounds temporarily hold the ivec and cmac pointers
650 &mov ($rounds_,&wparam(4));
651 &mov ($rounds,&wparam(5));
# carve out a 16-byte aligned scratch area; the caller's %esp is kept at 48(%esp)
652 &mov ($key_,"esp");
653 &sub ("esp",60);
654 &and ("esp",-16); # align stack
655 &mov (&DWP(48,"esp"),$key_);
656
657 &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
658 &movdqu ($cmac,&QWP(0,$rounds)); # load cmac
659 &mov ($rounds,&DWP(240,$key));
660
661 # compose byte-swap control mask for pshufb on stack
662 &mov (&DWP(0,"esp"),0x0c0d0e0f);
663 &mov (&DWP(4,"esp"),0x08090a0b);
664 &mov (&DWP(8,"esp"),0x04050607);
665 &mov (&DWP(12,"esp"),0x00010203);
666
667 # compose counter increment vector on stack
668 &mov ($rounds_,1);
669 &xor ($key_,$key_);
670 &mov (&DWP(16,"esp"),$rounds_);
671 &mov (&DWP(20,"esp"),$key_);
672 &mov (&DWP(24,"esp"),$key_);
673 &mov (&DWP(28,"esp"),$key_);
674
# $key_ keeps the start of the key schedule, $key is moved past it, and
# $rounds_ = 16 - 16*rounds is the (negative) starting offset the inner loop
# counts up to zero from.
675 &shl ($rounds,4);
676 &mov ($rounds_,16);
677 &lea ($key_,&DWP(0,$key));
# $inout3 holds the byte-swap mask for the whole routine
678 &movdqa ($inout3,&QWP(0,"esp"));
# the first counter block is the ivec exactly as loaded; $ivec itself is kept
# byte-swapped from here on so that paddq can increment it
679 &movdqa ($inout0,$ivec);
680 &lea ($key,&DWP(32,$key,$rounds));
681 &sub ($rounds_,$rounds);
682 &pshufb ($ivec,$inout3);
683
684 &set_label("ccm64_enc_outer");
685 &$movekey ($rndkey0,&QWP(0,$key_));
686 &mov ($rounds,$rounds_);
687 &movups ($in0,&QWP(0,$inp));
688
689 &xorps ($inout0,$rndkey0);
690 &$movekey ($rndkey1,&QWP(16,$key_));
# fold round key 0 into the input block so a single xor both adds the input
# to the CMAC and applies the initial key xor
691 &xorps ($rndkey0,$in0);
692 &xorps ($cmac,$rndkey0); # cmac^=inp
693 &$movekey ($rndkey0,&QWP(32,$key_));
694
695 &set_label("ccm64_enc2_loop");
696 &aesenc ($inout0,$rndkey1);
697 &aesenc ($cmac,$rndkey1);
698 &$movekey ($rndkey1,&QWP(0,$key,$rounds));
699 &add ($rounds,32);
700 &aesenc ($inout0,$rndkey0);
701 &aesenc ($cmac,$rndkey0);
702 &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
703 &jnz (&label("ccm64_enc2_loop"));
704 &aesenc ($inout0,$rndkey1);
705 &aesenc ($cmac,$rndkey1);
# advance the byte-swapped counter by the increment vector at 16(%esp)
706 &paddq ($ivec,&QWP(16,"esp"));
# block counter; its zero flag is what the jnz at the end of the outer loop tests
707 &dec ($len);
708 &aesenclast ($inout0,$rndkey0);
709 &aesenclast ($cmac,$rndkey0);
710
711 &lea ($inp,&DWP(16,$inp));
712 &xorps ($in0,$inout0); # inp^=E(ivec)
# next counter block: copy the incremented counter and swap it back
713 &movdqa ($inout0,$ivec);
714 &movups (&QWP(0,$out),$in0); # save output
715 &pshufb ($inout0,$inout3);
716 &lea ($out,&DWP(16,$out));
717 &jnz (&label("ccm64_enc_outer"));
718
# restore the caller's %esp and write the CMAC back through the 6th argument
719 &mov ("esp",&DWP(48,"esp"));
720 &mov ($out,&wparam(5));
721 &movups (&QWP(0,$out),$cmac);
722
723 &pxor ("xmm0","xmm0"); # clear register bank
724 &pxor ("xmm1","xmm1");
725 &pxor ("xmm2","xmm2");
726 &pxor ("xmm3","xmm3");
727 &pxor ("xmm4","xmm4");
728 &pxor ("xmm5","xmm5");
729 &pxor ("xmm6","xmm6");
730 &pxor ("xmm7","xmm7");
731 &function_end("aesni_ccm64_encrypt_blocks");
732
# Emit aesni_ccm64_decrypt_blocks. The first counter block is encrypted on
# its own; after that each pass through the outer loop writes one output
# block and then runs the next counter block and the CMAC (updated with the
# block just written) through the AES rounds side by side. The CMAC update
# for the last block happens after the loop, at ccm64_dec_break.
733 &function_begin("aesni_ccm64_decrypt_blocks");
734 &mov ($inp,&wparam(0));
735 &mov ($out,&wparam(1));
736 &mov ($len,&wparam(2));
737 &mov ($key,&wparam(3));
# $rounds_ and $rounds temporarily hold the ivec and cmac pointers
738 &mov ($rounds_,&wparam(4));
739 &mov ($rounds,&wparam(5));
# carve out a 16-byte aligned scratch area; the caller's %esp is kept at 48(%esp)
740 &mov ($key_,"esp");
741 &sub ("esp",60);
742 &and ("esp",-16); # align stack
743 &mov (&DWP(48,"esp"),$key_);
744
745 &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
746 &movdqu ($cmac,&QWP(0,$rounds)); # load cmac
747 &mov ($rounds,&DWP(240,$key));
748
749 # compose byte-swap control mask for pshufb on stack
750 &mov (&DWP(0,"esp"),0x0c0d0e0f);
751 &mov (&DWP(4,"esp"),0x08090a0b);
752 &mov (&DWP(8,"esp"),0x04050607);
753 &mov (&DWP(12,"esp"),0x00010203);
754
755 # compose counter increment vector on stack
756 &mov ($rounds_,1);
757 &xor ($key_,$key_);
758 &mov (&DWP(16,"esp"),$rounds_);
759 &mov (&DWP(20,"esp"),$key_);
760 &mov (&DWP(24,"esp"),$key_);
761 &mov (&DWP(28,"esp"),$key_);
762
763 &movdqa ($inout3,&QWP(0,"esp")); # bswap mask
764 &movdqa ($inout0,$ivec);
765
# keep copies of the key pointer and round count; the single-block sequence
# below modifies $key and $rounds
766 &mov ($key_,$key);
767 &mov ($rounds_,$rounds);
768
769 &pshufb ($ivec,$inout3);
# encrypt the first counter block (held in $inout0)
770 if ($inline)
771 { &aesni_inline_generate1("enc"); }
772 else
773 { &call ("_aesni_encrypt1"); }
# Set up the two-lane inner loop: $key is moved past the key schedule and
# $rounds_ ends up as 16 - 16*rounds, the (negative) offset the loop counts
# up to zero from.
774 &shl ($rounds_,4);
775 &mov ($rounds,16);
776 &movups ($in0,&QWP(0,$inp)); # load inp
777 &paddq ($ivec,&QWP(16,"esp"));
778 &lea ($inp,&QWP(16,$inp));
779 &sub ($rounds,$rounds_);
780 &lea ($key,&DWP(32,$key_,$rounds_));
781 &mov ($rounds_,$rounds);
782 &jmp (&label("ccm64_dec_outer"));
783
784 &set_label("ccm64_dec_outer",16);
785 &xorps ($in0,$inout0); # inp ^= E(ivec)
# next counter block: copy the incremented counter and swap it back
786 &movdqa ($inout0,$ivec);
787 &movups (&QWP(0,$out),$in0); # save output
788 &lea ($out,&DWP(16,$out));
789 &pshufb ($inout0,$inout3);
790
# leave the loop once the last block has been written
791 &sub ($len,1);
792 &jz (&label("ccm64_dec_break"));
793
794 &$movekey ($rndkey0,&QWP(0,$key_));
795 &mov ($rounds,$rounds_);
796 &$movekey ($rndkey1,&QWP(16,$key_));
# fold round key 0 into the block just written so a single xor both adds it
# to the CMAC and applies the initial key xor
797 &xorps ($in0,$rndkey0);
798 &xorps ($inout0,$rndkey0);
799 &xorps ($cmac,$in0); # cmac^=out
800 &$movekey ($rndkey0,&QWP(32,$key_));
801
802 &set_label("ccm64_dec2_loop");
803 &aesenc ($inout0,$rndkey1);
804 &aesenc ($cmac,$rndkey1);
805 &$movekey ($rndkey1,&QWP(0,$key,$rounds));
806 &add ($rounds,32);
807 &aesenc ($inout0,$rndkey0);
808 &aesenc ($cmac,$rndkey0);
809 &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
810 &jnz (&label("ccm64_dec2_loop"));
811 &movups ($in0,&QWP(0,$inp)); # load inp
812 &paddq ($ivec,&QWP(16,"esp"));
813 &aesenc ($inout0,$rndkey1);
814 &aesenc ($cmac,$rndkey1);
815 &aesenclast ($inout0,$rndkey0);
816 &aesenclast ($cmac,$rndkey0);
817 &lea ($inp,&QWP(16,$inp));
818 &jmp (&label("ccm64_dec_outer"));
819
# Final CMAC update: reload the round count and key pointer, then encrypt
# $cmac with the last output block ($in0) mixed in.
820 &set_label("ccm64_dec_break",16);
821 &mov ($rounds,&DWP(240,$key_));
822 &mov ($key,$key_);
823 if ($inline)
824 { &aesni_inline_generate1("enc",$cmac,$in0); }
825 else
# NOTE(review): this fallback passes $cmac as a second argument to &call and
# never mixes $in0 into $cmac, unlike the inline branch; _aesni_encrypt1 as
# generated by aesni_generate1("enc") operates on $inout0. Only reachable
# with $inline=0 -- confirm against x86asm.pl before relying on it.
826 { &call ("_aesni_encrypt1",$cmac); }
827
# restore the caller's %esp and write the CMAC back through the 6th argument
828 &mov ("esp",&DWP(48,"esp"));
829 &mov ($out,&wparam(5));
830 &movups (&QWP(0,$out),$cmac);
831
832 &pxor ("xmm0","xmm0"); # clear register bank
833 &pxor ("xmm1","xmm1");
834 &pxor ("xmm2","xmm2");
835 &pxor ("xmm3","xmm3");
836 &pxor ("xmm4","xmm4");
837 &pxor ("xmm5","xmm5");
838 &pxor ("xmm6","xmm6");
839 &pxor ("xmm7","xmm7");
840 &function_end("aesni_ccm64_decrypt_blocks");
841 }
842 \f
843 ######################################################################
844 # void aesni_ctr32_encrypt_blocks (const void *in, void *out,
845 # size_t blocks, const AES_KEY *key,
846 # const char *ivec);
847 #
848 # Handles only complete blocks, operates on 32-bit counter and
849 # does not update *ivec! (see crypto/modes/ctr128.c for details)
850 #
851 # stack layout:
852 # 0 pshufb mask
853 # 16 vector addend: 0,6,6,6
854 # 32 counter-less ivec
855 # 48 1st triplet of counter vector
856 # 64 2nd triplet of counter vector
857 # 80 saved %esp
858
859 &function_begin("aesni_ctr32_encrypt_blocks");
860 &mov ($inp,&wparam(0));
861 &mov ($out,&wparam(1));
862 &mov ($len,&wparam(2));
863 &mov ($key,&wparam(3));
864 &mov ($rounds_,&wparam(4));
865 &mov ($key_,"esp");
866 &sub ("esp",88);
867 &and ("esp",-16); # align stack
868 &mov (&DWP(80,"esp"),$key_);
869
870 &cmp ($len,1);
871 &je (&label("ctr32_one_shortcut"));
872
873 &movdqu ($inout5,&QWP(0,$rounds_)); # load ivec
874
875 # compose byte-swap control mask for pshufb on stack
876 &mov (&DWP(0,"esp"),0x0c0d0e0f);
877 &mov (&DWP(4,"esp"),0x08090a0b);
878 &mov (&DWP(8,"esp"),0x04050607);
879 &mov (&DWP(12,"esp"),0x00010203);
880
881 # compose counter increment vector on stack
882 &mov ($rounds,6);
883 &xor ($key_,$key_);
884 &mov (&DWP(16,"esp"),$rounds);
885 &mov (&DWP(20,"esp"),$rounds);
886 &mov (&DWP(24,"esp"),$rounds);
887 &mov (&DWP(28,"esp"),$key_);
888
889 &pextrd ($rounds_,$inout5,3); # pull 32-bit counter
890 &pinsrd ($inout5,$key_,3); # wipe 32-bit counter
891
892 &mov ($rounds,&DWP(240,$key)); # key->rounds
893
894 # compose 2 vectors of 3x32-bit counters
895 &bswap ($rounds_);
896 &pxor ($rndkey0,$rndkey0);
897 &pxor ($rndkey1,$rndkey1);
898 &movdqa ($inout0,&QWP(0,"esp")); # load byte-swap mask
899 &pinsrd ($rndkey0,$rounds_,0);
900 &lea ($key_,&DWP(3,$rounds_));
901 &pinsrd ($rndkey1,$key_,0);
902 &inc ($rounds_);
903 &pinsrd ($rndkey0,$rounds_,1);
904 &inc ($key_);
905 &pinsrd ($rndkey1,$key_,1);
906 &inc ($rounds_);
907 &pinsrd ($rndkey0,$rounds_,2);
908 &inc ($key_);
909 &pinsrd ($rndkey1,$key_,2);
910 &movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet
911 &pshufb ($rndkey0,$inout0); # byte swap
912 &movdqu ($inout4,&QWP(0,$key)); # key[0]
913 &movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet
914 &pshufb ($rndkey1,$inout0); # byte swap
915
916 &pshufd ($inout0,$rndkey0,3<<6); # place counter to upper dword
917 &pshufd ($inout1,$rndkey0,2<<6);
918 &cmp ($len,6);
919 &jb (&label("ctr32_tail"));
920 &pxor ($inout5,$inout4); # counter-less ivec^key[0]
921 &shl ($rounds,4);
922 &mov ($rounds_,16);
923 &movdqa (&QWP(32,"esp"),$inout5); # save counter-less ivec^key[0]
924 &mov ($key_,$key); # backup $key
925 &sub ($rounds_,$rounds); # backup twisted $rounds
926 &lea ($key,&DWP(32,$key,$rounds));
927 &sub ($len,6);
928 &jmp (&label("ctr32_loop6"));
929
930 &set_label("ctr32_loop6",16);
931 # inlining _aesni_encrypt6's prologue gives ~6% improvement...
932 &pshufd ($inout2,$rndkey0,1<<6);
933 &movdqa ($rndkey0,&QWP(32,"esp")); # pull counter-less ivec
934 &pshufd ($inout3,$rndkey1,3<<6);
935 &pxor ($inout0,$rndkey0); # merge counter-less ivec
936 &pshufd ($inout4,$rndkey1,2<<6);
937 &pxor ($inout1,$rndkey0);
938 &pshufd ($inout5,$rndkey1,1<<6);
939 &$movekey ($rndkey1,&QWP(16,$key_));
940 &pxor ($inout2,$rndkey0);
941 &pxor ($inout3,$rndkey0);
942 &aesenc ($inout0,$rndkey1);
943 &pxor ($inout4,$rndkey0);
944 &pxor ($inout5,$rndkey0);
945 &aesenc ($inout1,$rndkey1);
946 &$movekey ($rndkey0,&QWP(32,$key_));
947 &mov ($rounds,$rounds_);
948 &aesenc ($inout2,$rndkey1);
949 &aesenc ($inout3,$rndkey1);
950 &aesenc ($inout4,$rndkey1);
951 &aesenc ($inout5,$rndkey1);
952
953 &call (&label("_aesni_encrypt6_enter"));
954
955 &movups ($rndkey1,&QWP(0,$inp));
956 &movups ($rndkey0,&QWP(0x10,$inp));
957 &xorps ($inout0,$rndkey1);
958 &movups ($rndkey1,&QWP(0x20,$inp));
959 &xorps ($inout1,$rndkey0);
960 &movups (&QWP(0,$out),$inout0);
961 &movdqa ($rndkey0,&QWP(16,"esp")); # load increment
962 &xorps ($inout2,$rndkey1);
963 &movdqa ($rndkey1,&QWP(64,"esp")); # load 2nd triplet
964 &movups (&QWP(0x10,$out),$inout1);
965 &movups (&QWP(0x20,$out),$inout2);
966
967 &paddd ($rndkey1,$rndkey0); # 2nd triplet increment
968 &paddd ($rndkey0,&QWP(48,"esp")); # 1st triplet increment
969 &movdqa ($inout0,&QWP(0,"esp")); # load byte swap mask
970
971 &movups ($inout1,&QWP(0x30,$inp));
972 &movups ($inout2,&QWP(0x40,$inp));
973 &xorps ($inout3,$inout1);
974 &movups ($inout1,&QWP(0x50,$inp));
975 &lea ($inp,&DWP(0x60,$inp));
976 &movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet
977 &pshufb ($rndkey0,$inout0); # byte swap
978 &xorps ($inout4,$inout2);
979 &movups (&QWP(0x30,$out),$inout3);
980 &xorps ($inout5,$inout1);
981 &movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet
982 &pshufb ($rndkey1,$inout0); # byte swap
983 &movups (&QWP(0x40,$out),$inout4);
984 &pshufd ($inout0,$rndkey0,3<<6);
985 &movups (&QWP(0x50,$out),$inout5);
986 &lea ($out,&DWP(0x60,$out));
987
988 &pshufd ($inout1,$rndkey0,2<<6);
989 &sub ($len,6);
990 &jnc (&label("ctr32_loop6"));
991
992 &add ($len,6);
993 &jz (&label("ctr32_ret"));
994 &movdqu ($inout5,&QWP(0,$key_));
995 &mov ($key,$key_);
996 &pxor ($inout5,&QWP(32,"esp")); # restore count-less ivec
997 &mov ($rounds,&DWP(240,$key_)); # restore $rounds
998
999 &set_label("ctr32_tail");
1000 &por ($inout0,$inout5);
1001 &cmp ($len,2);
1002 &jb (&label("ctr32_one"));
1003
1004 &pshufd ($inout2,$rndkey0,1<<6);
1005 &por ($inout1,$inout5);
1006 &je (&label("ctr32_two"));
1007
1008 &pshufd ($inout3,$rndkey1,3<<6);
1009 &por ($inout2,$inout5);
1010 &cmp ($len,4);
1011 &jb (&label("ctr32_three"));
1012
1013 &pshufd ($inout4,$rndkey1,2<<6);
1014 &por ($inout3,$inout5);
1015 &je (&label("ctr32_four"));
1016
1017 &por ($inout4,$inout5);
1018 &call ("_aesni_encrypt6");
1019 &movups ($rndkey1,&QWP(0,$inp));
1020 &movups ($rndkey0,&QWP(0x10,$inp));
1021 &xorps ($inout0,$rndkey1);
1022 &movups ($rndkey1,&QWP(0x20,$inp));
1023 &xorps ($inout1,$rndkey0);
1024 &movups ($rndkey0,&QWP(0x30,$inp));
1025 &xorps ($inout2,$rndkey1);
1026 &movups ($rndkey1,&QWP(0x40,$inp));
1027 &xorps ($inout3,$rndkey0);
1028 &movups (&QWP(0,$out),$inout0);
1029 &xorps ($inout4,$rndkey1);
1030 &movups (&QWP(0x10,$out),$inout1);
1031 &movups (&QWP(0x20,$out),$inout2);
1032 &movups (&QWP(0x30,$out),$inout3);
1033 &movups (&QWP(0x40,$out),$inout4);
1034 &jmp (&label("ctr32_ret"));
1035
1036 &set_label("ctr32_one_shortcut",16);
1037 &movups ($inout0,&QWP(0,$rounds_)); # load ivec
1038 &mov ($rounds,&DWP(240,$key));
1039
1040 &set_label("ctr32_one");
1041 if ($inline)
1042 { &aesni_inline_generate1("enc"); }
1043 else
1044 { &call ("_aesni_encrypt1"); }
1045 &movups ($in0,&QWP(0,$inp));
1046 &xorps ($in0,$inout0);
1047 &movups (&QWP(0,$out),$in0);
1048 &jmp (&label("ctr32_ret"));
1049
1050 &set_label("ctr32_two",16);
1051 &call ("_aesni_encrypt2");
1052 &movups ($inout3,&QWP(0,$inp));
1053 &movups ($inout4,&QWP(0x10,$inp));
1054 &xorps ($inout0,$inout3);
1055 &xorps ($inout1,$inout4);
1056 &movups (&QWP(0,$out),$inout0);
1057 &movups (&QWP(0x10,$out),$inout1);
1058 &jmp (&label("ctr32_ret"));
1059
1060 &set_label("ctr32_three",16);
1061 &call ("_aesni_encrypt3");
1062 &movups ($inout3,&QWP(0,$inp));
1063 &movups ($inout4,&QWP(0x10,$inp));
1064 &xorps ($inout0,$inout3);
1065 &movups ($inout5,&QWP(0x20,$inp));
1066 &xorps ($inout1,$inout4);
1067 &movups (&QWP(0,$out),$inout0);
1068 &xorps ($inout2,$inout5);
1069 &movups (&QWP(0x10,$out),$inout1);
1070 &movups (&QWP(0x20,$out),$inout2);
1071 &jmp (&label("ctr32_ret"));
1072
1073 &set_label("ctr32_four",16);
1074 &call ("_aesni_encrypt4");
1075 &movups ($inout4,&QWP(0,$inp));
1076 &movups ($inout5,&QWP(0x10,$inp));
1077 &movups ($rndkey1,&QWP(0x20,$inp));
1078 &xorps ($inout0,$inout4);
1079 &movups ($rndkey0,&QWP(0x30,$inp));
1080 &xorps ($inout1,$inout5);
1081 &movups (&QWP(0,$out),$inout0);
1082 &xorps ($inout2,$rndkey1);
1083 &movups (&QWP(0x10,$out),$inout1);
1084 &xorps ($inout3,$rndkey0);
1085 &movups (&QWP(0x20,$out),$inout2);
1086 &movups (&QWP(0x30,$out),$inout3);
1087
1088 &set_label("ctr32_ret");
1089 &pxor ("xmm0","xmm0"); # clear register bank
1090 &pxor ("xmm1","xmm1");
1091 &pxor ("xmm2","xmm2");
1092 &pxor ("xmm3","xmm3");
1093 &pxor ("xmm4","xmm4");
1094 &movdqa (&QWP(32,"esp"),"xmm0"); # clear stack
1095 &pxor ("xmm5","xmm5");
1096 &movdqa (&QWP(48,"esp"),"xmm0");
1097 &pxor ("xmm6","xmm6");
1098 &movdqa (&QWP(64,"esp"),"xmm0");
1099 &pxor ("xmm7","xmm7");
1100 &mov ("esp",&DWP(80,"esp"));
1101 &function_end("aesni_ctr32_encrypt_blocks");
1102 \f
1103 ######################################################################
1104 # void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1105 # const AES_KEY *key1, const AES_KEY *key2,
1106 # const unsigned char iv[16]);
1107 #
1108 { my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1);
1109
# aesni_xts_encrypt: emits the 32-bit XTS encryption routine.
#
# Arguments are fetched with wparam(0)..wparam(5): inp, out, len, key1,
# key2 and the 16-byte clear-text tweak.  The tweak is first encrypted
# with key2; key1 is then used for the data blocks.
#
# Stack frame after alignment (offsets from %esp):
#   16*0 .. 16*5   per-block tweak save slots
#   16*6           constant dwords {0x87,0,1,0} used as $twmask
#   16*7+0         saved $len (later overwritten with $len%16)
#   16*7+4         caller's %esp
#
# Tweak update idiom used throughout: pcmpgtd against zero broadcasts the
# sign bit of every dword of $tweak, pshufd 0x13 moves those masks into
# position, pand with $twmask keeps the carry/residue bits, paddq doubles
# both 64-bit halves and pxor folds the carry/residue back in.
1110 &function_begin("aesni_xts_encrypt");
1111 &mov ($key,&wparam(4)); # key2
1112 &mov ($inp,&wparam(5)); # clear-text tweak
1113
1114 &mov ($rounds,&DWP(240,$key)); # key2->rounds
1115 &movups ($inout0,&QWP(0,$inp));
# encrypt the tweak with key2; the result is picked up from $inout0 below
1116 if ($inline)
1117 { &aesni_inline_generate1("enc"); }
1118 else
1119 { &call ("_aesni_encrypt1"); }
1120
1121 &mov ($inp,&wparam(0));
1122 &mov ($out,&wparam(1));
1123 &mov ($len,&wparam(2));
1124 &mov ($key,&wparam(3)); # key1
1125
# allocate the 16-byte aligned frame; $key_ temporarily holds the old %esp
1126 &mov ($key_,"esp");
1127 &sub ("esp",16*7+8);
1128 &mov ($rounds,&DWP(240,$key)); # key1->rounds
1129 &and ("esp",-16); # align stack
1130
1131 &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
1132 &mov (&DWP(16*6+4,"esp"),0);
1133 &mov (&DWP(16*6+8,"esp"),1);
1134 &mov (&DWP(16*6+12,"esp"),0);
1135 &mov (&DWP(16*7+0,"esp"),$len); # save original $len
1136 &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp
1137
1138 &movdqa ($tweak,$inout0);
1139 &pxor ($twtmp,$twtmp);
1140 &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
1141 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1142
# round $len down to whole blocks and bias it by six blocks; a borrow
# means fewer than six blocks are available, so skip the 6x loop
1143 &and ($len,-16);
1144 &mov ($key_,$key); # backup $key
1145 &mov ($rounds_,$rounds); # backup $rounds
1146 &sub ($len,16*6);
1147 &jc (&label("xts_enc_short"));
1148
# set up $key/$rounds_ in the form consumed after the
# _aesni_encrypt6_enter call: $key = key+32+16*rounds, $rounds_ = 16-16*rounds
1149 &shl ($rounds,4);
1150 &mov ($rounds_,16);
1151 &sub ($rounds_,$rounds);
1152 &lea ($key,&DWP(32,$key,$rounds));
1153 &jmp (&label("xts_enc_loop6"));
1154
1155 &set_label("xts_enc_loop6",16);
# generation-time unroll: save tweaks 0..3 to their slots and advance $tweak
1156 for ($i=0;$i<4;$i++) {
1157 &pshufd ($twres,$twtmp,0x13);
1158 &pxor ($twtmp,$twtmp);
1159 &movdqa (&QWP(16*$i,"esp"),$tweak);
1160 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1161 &pand ($twres,$twmask); # isolate carry and residue
1162 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1163 &pxor ($tweak,$twres);
1164 }
# tweak 4 goes to slot 4; tweak 5 is built in $inout5 ($i becomes 5 here)
1165 &pshufd ($inout5,$twtmp,0x13);
1166 &movdqa (&QWP(16*$i++,"esp"),$tweak);
1167 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1168 &$movekey ($rndkey0,&QWP(0,$key_));
1169 &pand ($inout5,$twmask); # isolate carry and residue
1170 &movups ($inout0,&QWP(0,$inp)); # load input
1171 &pxor ($inout5,$tweak);
1172
1173 # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
1174 &mov ($rounds,$rounds_); # restore $rounds
1175 &movdqu ($inout1,&QWP(16*1,$inp));
1176 &xorps ($inout0,$rndkey0); # input^=rndkey[0]
1177 &movdqu ($inout2,&QWP(16*2,$inp));
1178 &pxor ($inout1,$rndkey0);
1179 &movdqu ($inout3,&QWP(16*3,$inp));
1180 &pxor ($inout2,$rndkey0);
1181 &movdqu ($inout4,&QWP(16*4,$inp));
1182 &pxor ($inout3,$rndkey0);
# sixth input block is parked in $rndkey1 because $inout5 still holds the tweak
1183 &movdqu ($rndkey1,&QWP(16*5,$inp));
1184 &pxor ($inout4,$rndkey0);
1185 &lea ($inp,&DWP(16*6,$inp));
1186 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1187 &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
1188 &pxor ($inout5,$rndkey1);
1189
# first AES round is issued here, interleaved with the remaining tweak xors
1190 &$movekey ($rndkey1,&QWP(16,$key_));
1191 &pxor ($inout1,&QWP(16*1,"esp"));
1192 &pxor ($inout2,&QWP(16*2,"esp"));
1193 &aesenc ($inout0,$rndkey1);
1194 &pxor ($inout3,&QWP(16*3,"esp"));
1195 &pxor ($inout4,&QWP(16*4,"esp"));
1196 &aesenc ($inout1,$rndkey1);
1197 &pxor ($inout5,$rndkey0);
1198 &$movekey ($rndkey0,&QWP(32,$key_));
1199 &aesenc ($inout2,$rndkey1);
1200 &aesenc ($inout3,$rndkey1);
1201 &aesenc ($inout4,$rndkey1);
1202 &aesenc ($inout5,$rndkey1);
1203 &call (&label("_aesni_encrypt6_enter"));
1204
# xor the six results with their saved tweaks and store them, while
# starting the computation of the next iteration's first tweak
1205 &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
1206 &pxor ($twtmp,$twtmp);
1207 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1208 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1209 &xorps ($inout1,&QWP(16*1,"esp"));
1210 &movups (&QWP(16*0,$out),$inout0); # write output
1211 &xorps ($inout2,&QWP(16*2,"esp"));
1212 &movups (&QWP(16*1,$out),$inout1);
1213 &xorps ($inout3,&QWP(16*3,"esp"));
1214 &movups (&QWP(16*2,$out),$inout2);
1215 &xorps ($inout4,&QWP(16*4,"esp"));
1216 &movups (&QWP(16*3,$out),$inout3);
1217 &xorps ($inout5,$tweak);
1218 &movups (&QWP(16*4,$out),$inout4);
1219 &pshufd ($twres,$twtmp,0x13);
1220 &movups (&QWP(16*5,$out),$inout5);
1221 &lea ($out,&DWP(16*6,$out));
# $twmask shares a register with $inout1, so it is reloaded after the stores
1222 &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
1223
1224 &pxor ($twtmp,$twtmp);
1225 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1226 &pand ($twres,$twmask); # isolate carry and residue
1227 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1228 &pxor ($tweak,$twres);
1229
1230 &sub ($len,16*6);
1231 &jnc (&label("xts_enc_loop6"));
1232
# leaving the 6x loop: undo the $key/$rounds_ transformation made before it
1233 &mov ($rounds,&DWP(240,$key_)); # restore $rounds
1234 &mov ($key,$key_); # restore $key
1235 &mov ($rounds_,$rounds);
1236
1237 &set_label("xts_enc_short");
# remove the six-block bias; zero means no whole blocks are left
1238 &add ($len,16*6);
1239 &jz (&label("xts_enc_done6x"));
1240
# 1..5 whole blocks remain: tweaks are accumulated in $inout3/$inout4/$inout5
# and the block count is dispatched on comparisons against 0x20 and 0x40
1241 &movdqa ($inout3,$tweak); # put aside previous tweak
1242 &cmp ($len,0x20);
1243 &jb (&label("xts_enc_one"));
1244
1245 &pshufd ($twres,$twtmp,0x13);
1246 &pxor ($twtmp,$twtmp);
1247 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1248 &pand ($twres,$twmask); # isolate carry and residue
1249 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1250 &pxor ($tweak,$twres);
# flags are still those of the cmp with 0x20; the SSE ops above leave EFLAGS alone
1251 &je (&label("xts_enc_two"));
1252
1253 &pshufd ($twres,$twtmp,0x13);
1254 &pxor ($twtmp,$twtmp);
1255 &movdqa ($inout4,$tweak); # put aside previous tweak
1256 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1257 &pand ($twres,$twmask); # isolate carry and residue
1258 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1259 &pxor ($tweak,$twres);
1260 &cmp ($len,0x40);
1261 &jb (&label("xts_enc_three"));
1262
1263 &pshufd ($twres,$twtmp,0x13);
1264 &pxor ($twtmp,$twtmp);
1265 &movdqa ($inout5,$tweak); # put aside previous tweak
1266 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1267 &pand ($twres,$twmask); # isolate carry and residue
1268 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1269 &pxor ($tweak,$twres);
1270 &movdqa (&QWP(16*0,"esp"),$inout3);
1271 &movdqa (&QWP(16*1,"esp"),$inout4);
# flags are still those of the cmp with 0x40
1272 &je (&label("xts_enc_four"));
1273
# five-block case: tweaks 0..3 live in stack slots, the fifth in $inout5
1274 &movdqa (&QWP(16*2,"esp"),$inout5);
1275 &pshufd ($inout5,$twtmp,0x13);
1276 &movdqa (&QWP(16*3,"esp"),$tweak);
1277 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1278 &pand ($inout5,$twmask); # isolate carry and residue
1279 &pxor ($inout5,$tweak);
1280
1281 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
1282 &movdqu ($inout1,&QWP(16*1,$inp));
1283 &movdqu ($inout2,&QWP(16*2,$inp));
1284 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1285 &movdqu ($inout3,&QWP(16*3,$inp));
1286 &pxor ($inout1,&QWP(16*1,"esp"));
1287 &movdqu ($inout4,&QWP(16*4,$inp));
1288 &pxor ($inout2,&QWP(16*2,"esp"));
1289 &lea ($inp,&DWP(16*5,$inp));
1290 &pxor ($inout3,&QWP(16*3,"esp"));
1291 &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
1292 &pxor ($inout4,$inout5);
1293
# the 6-lane routine is used for five blocks; only lanes 0..4 are stored below
1294 &call ("_aesni_encrypt6");
1295
1296 &movaps ($tweak,&QWP(16*4,"esp")); # last tweak
1297 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1298 &xorps ($inout1,&QWP(16*1,"esp"));
1299 &xorps ($inout2,&QWP(16*2,"esp"));
1300 &movups (&QWP(16*0,$out),$inout0); # write output
1301 &xorps ($inout3,&QWP(16*3,"esp"));
1302 &movups (&QWP(16*1,$out),$inout1);
1303 &xorps ($inout4,$tweak);
1304 &movups (&QWP(16*2,$out),$inout2);
1305 &movups (&QWP(16*3,$out),$inout3);
1306 &movups (&QWP(16*4,$out),$inout4);
1307 &lea ($out,&DWP(16*5,$out));
1308 &jmp (&label("xts_enc_done"));
1309
# each of the 1..4 block tails below leaves the last tweak it used in $tweak
1310 &set_label("xts_enc_one",16);
1311 &movups ($inout0,&QWP(16*0,$inp)); # load input
1312 &lea ($inp,&DWP(16*1,$inp));
1313 &xorps ($inout0,$inout3); # input^=tweak
1314 if ($inline)
1315 { &aesni_inline_generate1("enc"); }
1316 else
1317 { &call ("_aesni_encrypt1"); }
1318 &xorps ($inout0,$inout3); # output^=tweak
1319 &movups (&QWP(16*0,$out),$inout0); # write output
1320 &lea ($out,&DWP(16*1,$out));
1321
1322 &movdqa ($tweak,$inout3); # last tweak
1323 &jmp (&label("xts_enc_done"));
1324
1325 &set_label("xts_enc_two",16);
1326 &movaps ($inout4,$tweak); # put aside last tweak
1327
1328 &movups ($inout0,&QWP(16*0,$inp)); # load input
1329 &movups ($inout1,&QWP(16*1,$inp));
1330 &lea ($inp,&DWP(16*2,$inp));
1331 &xorps ($inout0,$inout3); # input^=tweak
1332 &xorps ($inout1,$inout4);
1333
1334 &call ("_aesni_encrypt2");
1335
1336 &xorps ($inout0,$inout3); # output^=tweak
1337 &xorps ($inout1,$inout4);
1338 &movups (&QWP(16*0,$out),$inout0); # write output
1339 &movups (&QWP(16*1,$out),$inout1);
1340 &lea ($out,&DWP(16*2,$out));
1341
1342 &movdqa ($tweak,$inout4); # last tweak
1343 &jmp (&label("xts_enc_done"));
1344
1345 &set_label("xts_enc_three",16);
1346 &movaps ($inout5,$tweak); # put aside last tweak
1347 &movups ($inout0,&QWP(16*0,$inp)); # load input
1348 &movups ($inout1,&QWP(16*1,$inp));
1349 &movups ($inout2,&QWP(16*2,$inp));
1350 &lea ($inp,&DWP(16*3,$inp));
1351 &xorps ($inout0,$inout3); # input^=tweak
1352 &xorps ($inout1,$inout4);
1353 &xorps ($inout2,$inout5);
1354
1355 &call ("_aesni_encrypt3");
1356
1357 &xorps ($inout0,$inout3); # output^=tweak
1358 &xorps ($inout1,$inout4);
1359 &xorps ($inout2,$inout5);
1360 &movups (&QWP(16*0,$out),$inout0); # write output
1361 &movups (&QWP(16*1,$out),$inout1);
1362 &movups (&QWP(16*2,$out),$inout2);
1363 &lea ($out,&DWP(16*3,$out));
1364
1365 &movdqa ($tweak,$inout5); # last tweak
1366 &jmp (&label("xts_enc_done"));
1367
# four-block case: tweaks 0 and 1 were spilled to the stack before the jump
# here, tweak 2 is in $inout5 and tweak 3 is the current $tweak
1368 &set_label("xts_enc_four",16);
1369 &movaps ($inout4,$tweak); # put aside last tweak
1370
1371 &movups ($inout0,&QWP(16*0,$inp)); # load input
1372 &movups ($inout1,&QWP(16*1,$inp));
1373 &movups ($inout2,&QWP(16*2,$inp));
1374 &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
1375 &movups ($inout3,&QWP(16*3,$inp));
1376 &lea ($inp,&DWP(16*4,$inp));
1377 &xorps ($inout1,&QWP(16*1,"esp"));
1378 &xorps ($inout2,$inout5);
1379 &xorps ($inout3,$inout4);
1380
1381 &call ("_aesni_encrypt4");
1382
1383 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1384 &xorps ($inout1,&QWP(16*1,"esp"));
1385 &xorps ($inout2,$inout5);
1386 &movups (&QWP(16*0,$out),$inout0); # write output
1387 &xorps ($inout3,$inout4);
1388 &movups (&QWP(16*1,$out),$inout1);
1389 &movups (&QWP(16*2,$out),$inout2);
1390 &movups (&QWP(16*3,$out),$inout3);
1391 &lea ($out,&DWP(16*4,$out));
1392
1393 &movdqa ($tweak,$inout4); # last tweak
1394 &jmp (&label("xts_enc_done"));
1395
# reached when no whole blocks were left after the 6x loop (or at all);
# $tweak already holds the next tweak, so it is used directly for stealing
1396 &set_label("xts_enc_done6x",16); # $tweak is pre-calculated
1397 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1398 &and ($len,15);
1399 &jz (&label("xts_enc_ret"));
1400 &movdqa ($inout3,$tweak);
1401 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1402 &jmp (&label("xts_enc_steal"));
1403
# reached from the 1..5 block tails: $tweak is the last tweak used, so one
# more tweak update is needed (result in $inout3) if a partial block follows
1404 &set_label("xts_enc_done",16);
1405 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1406 &pxor ($twtmp,$twtmp);
1407 &and ($len,15);
1408 &jz (&label("xts_enc_ret"));
1409
1410 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1411 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1412 &pshufd ($inout3,$twtmp,0x13);
1413 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1414 &pand ($inout3,&QWP(16*6,"esp")); # isolate carry and residue
1415 &pxor ($inout3,$tweak);
1416
# byte-wise swap for the trailing $len%16 bytes: the byte of the previous
# output block (out[-16]) moves to out[0] and is replaced by the input byte.
# $rounds and $key serve as byte scratch here and are restored afterwards.
1417 &set_label("xts_enc_steal");
1418 &movz ($rounds,&BP(0,$inp));
1419 &movz ($key,&BP(-16,$out));
1420 &lea ($inp,&DWP(1,$inp));
1421 &mov (&BP(-16,$out),&LB($rounds));
1422 &mov (&BP(0,$out),&LB($key));
1423 &lea ($out,&DWP(1,$out));
1424 &sub ($len,1);
1425 &jnz (&label("xts_enc_steal"));
1426
1427 &sub ($out,&DWP(16*7+0,"esp")); # rewind $out
1428 &mov ($key,$key_); # restore $key
1429 &mov ($rounds,$rounds_); # restore $rounds
1430
# encrypt the patched block at out[-16] in place with the tweak in $inout3
1431 &movups ($inout0,&QWP(-16,$out)); # load input
1432 &xorps ($inout0,$inout3); # input^=tweak
1433 if ($inline)
1434 { &aesni_inline_generate1("enc"); }
1435 else
1436 { &call ("_aesni_encrypt1"); }
1437 &xorps ($inout0,$inout3); # output^=tweak
1438 &movups (&QWP(-16,$out),$inout0); # write output
1439
# epilogue: zero xmm0-xmm7 and tweak slots 0..5, then restore the caller's %esp
1440 &set_label("xts_enc_ret");
1441 &pxor ("xmm0","xmm0"); # clear register bank
1442 &pxor ("xmm1","xmm1");
1443 &pxor ("xmm2","xmm2");
1444 &movdqa (&QWP(16*0,"esp"),"xmm0"); # clear stack
1445 &pxor ("xmm3","xmm3");
1446 &movdqa (&QWP(16*1,"esp"),"xmm0");
1447 &pxor ("xmm4","xmm4");
1448 &movdqa (&QWP(16*2,"esp"),"xmm0");
1449 &pxor ("xmm5","xmm5");
1450 &movdqa (&QWP(16*3,"esp"),"xmm0");
1451 &pxor ("xmm6","xmm6");
1452 &movdqa (&QWP(16*4,"esp"),"xmm0");
1453 &pxor ("xmm7","xmm7");
1454 &movdqa (&QWP(16*5,"esp"),"xmm0");
1455 &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
1456 &function_end("aesni_xts_encrypt");
1457
# aesni_xts_decrypt: emits the 32-bit XTS decryption routine.
#
# Arguments are fetched with wparam(0)..wparam(5): inp, out, len, key1,
# key2 and the 16-byte clear-text tweak.  The tweak is encrypted (not
# decrypted) with key2; key1 is then used to decrypt the data blocks.
#
# When len is not a multiple of 16, one whole block is held back from the
# bulk processing and handled together with the partial block at the end.
#
# Stack frame after alignment (offsets from %esp):
#   16*0 .. 16*5   per-block tweak save slots
#   16*6           constant dwords {0x87,0,1,0} used as $twmask
#   16*7+0         saved (adjusted) $len, later overwritten with $len%16
#   16*7+4         caller's %esp
1458 &function_begin("aesni_xts_decrypt");
1459 &mov ($key,&wparam(4)); # key2
1460 &mov ($inp,&wparam(5)); # clear-text tweak
1461
1462 &mov ($rounds,&DWP(240,$key)); # key2->rounds
1463 &movups ($inout0,&QWP(0,$inp));
# the initial tweak is produced with the *encrypt* primitive, also for decryption
1464 if ($inline)
1465 { &aesni_inline_generate1("enc"); }
1466 else
1467 { &call ("_aesni_encrypt1"); }
1468
1469 &mov ($inp,&wparam(0));
1470 &mov ($out,&wparam(1));
1471 &mov ($len,&wparam(2));
1472 &mov ($key,&wparam(3)); # key1
1473
# allocate the 16-byte aligned frame; $key_ temporarily holds the old %esp
1474 &mov ($key_,"esp");
1475 &sub ("esp",16*7+8);
1476 &and ("esp",-16); # align stack
1477
# $rounds_ becomes 16 if $len has a partial block, else 0, and is subtracted
1478 &xor ($rounds_,$rounds_); # if(len%16) len-=16;
1479 &test ($len,15);
1480 &setnz (&LB($rounds_));
1481 &shl ($rounds_,4);
1482 &sub ($len,$rounds_);
1483
1484 &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
1485 &mov (&DWP(16*6+4,"esp"),0);
1486 &mov (&DWP(16*6+8,"esp"),1);
1487 &mov (&DWP(16*6+12,"esp"),0);
1488 &mov (&DWP(16*7+0,"esp"),$len); # save original $len
1489 &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp
1490
1491 &mov ($rounds,&DWP(240,$key)); # key1->rounds
1492 &mov ($key_,$key); # backup $key
1493 &mov ($rounds_,$rounds); # backup $rounds
1494
1495 &movdqa ($tweak,$inout0);
1496 &pxor ($twtmp,$twtmp);
1497 &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
1498 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1499
# round $len down to whole blocks and bias it by six blocks; a borrow
# means fewer than six blocks are available, so skip the 6x loop
1500 &and ($len,-16);
1501 &sub ($len,16*6);
1502 &jc (&label("xts_dec_short"));
1503
# set up $key/$rounds_ in the form consumed after the
# _aesni_decrypt6_enter call: $key = key+32+16*rounds, $rounds_ = 16-16*rounds
1504 &shl ($rounds,4);
1505 &mov ($rounds_,16);
1506 &sub ($rounds_,$rounds);
1507 &lea ($key,&DWP(32,$key,$rounds));
1508 &jmp (&label("xts_dec_loop6"));
1509
1510 &set_label("xts_dec_loop6",16);
# generation-time unroll: save tweaks 0..3 to their slots and advance $tweak
1511 for ($i=0;$i<4;$i++) {
1512 &pshufd ($twres,$twtmp,0x13);
1513 &pxor ($twtmp,$twtmp);
1514 &movdqa (&QWP(16*$i,"esp"),$tweak);
1515 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1516 &pand ($twres,$twmask); # isolate carry and residue
1517 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1518 &pxor ($tweak,$twres);
1519 }
# tweak 4 goes to slot 4; tweak 5 is built in $inout5 ($i becomes 5 here)
1520 &pshufd ($inout5,$twtmp,0x13);
1521 &movdqa (&QWP(16*$i++,"esp"),$tweak);
1522 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1523 &$movekey ($rndkey0,&QWP(0,$key_));
1524 &pand ($inout5,$twmask); # isolate carry and residue
1525 &movups ($inout0,&QWP(0,$inp)); # load input
1526 &pxor ($inout5,$tweak);
1527
1528 # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
1529 &mov ($rounds,$rounds_);
1530 &movdqu ($inout1,&QWP(16*1,$inp));
1531 &xorps ($inout0,$rndkey0); # input^=rndkey[0]
1532 &movdqu ($inout2,&QWP(16*2,$inp));
1533 &pxor ($inout1,$rndkey0);
1534 &movdqu ($inout3,&QWP(16*3,$inp));
1535 &pxor ($inout2,$rndkey0);
1536 &movdqu ($inout4,&QWP(16*4,$inp));
1537 &pxor ($inout3,$rndkey0);
# sixth input block is parked in $rndkey1 because $inout5 still holds the tweak
1538 &movdqu ($rndkey1,&QWP(16*5,$inp));
1539 &pxor ($inout4,$rndkey0);
1540 &lea ($inp,&DWP(16*6,$inp));
1541 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1542 &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
1543 &pxor ($inout5,$rndkey1);
1544
# first AES round is issued here, interleaved with the remaining tweak xors
1545 &$movekey ($rndkey1,&QWP(16,$key_));
1546 &pxor ($inout1,&QWP(16*1,"esp"));
1547 &pxor ($inout2,&QWP(16*2,"esp"));
1548 &aesdec ($inout0,$rndkey1);
1549 &pxor ($inout3,&QWP(16*3,"esp"));
1550 &pxor ($inout4,&QWP(16*4,"esp"));
1551 &aesdec ($inout1,$rndkey1);
1552 &pxor ($inout5,$rndkey0);
1553 &$movekey ($rndkey0,&QWP(32,$key_));
1554 &aesdec ($inout2,$rndkey1);
1555 &aesdec ($inout3,$rndkey1);
1556 &aesdec ($inout4,$rndkey1);
1557 &aesdec ($inout5,$rndkey1);
1558 &call (&label("_aesni_decrypt6_enter"));
1559
# xor the six results with their saved tweaks and store them, while
# starting the computation of the next iteration's first tweak
1560 &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
1561 &pxor ($twtmp,$twtmp);
1562 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1563 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1564 &xorps ($inout1,&QWP(16*1,"esp"));
1565 &movups (&QWP(16*0,$out),$inout0); # write output
1566 &xorps ($inout2,&QWP(16*2,"esp"));
1567 &movups (&QWP(16*1,$out),$inout1);
1568 &xorps ($inout3,&QWP(16*3,"esp"));
1569 &movups (&QWP(16*2,$out),$inout2);
1570 &xorps ($inout4,&QWP(16*4,"esp"));
1571 &movups (&QWP(16*3,$out),$inout3);
1572 &xorps ($inout5,$tweak);
1573 &movups (&QWP(16*4,$out),$inout4);
1574 &pshufd ($twres,$twtmp,0x13);
1575 &movups (&QWP(16*5,$out),$inout5);
1576 &lea ($out,&DWP(16*6,$out));
# $twmask shares a register with $inout1, so it is reloaded after the stores
1577 &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
1578
1579 &pxor ($twtmp,$twtmp);
1580 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1581 &pand ($twres,$twmask); # isolate carry and residue
1582 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1583 &pxor ($tweak,$twres);
1584
1585 &sub ($len,16*6);
1586 &jnc (&label("xts_dec_loop6"));
1587
# leaving the 6x loop: undo the $key/$rounds_ transformation made before it
1588 &mov ($rounds,&DWP(240,$key_)); # restore $rounds
1589 &mov ($key,$key_); # restore $key
1590 &mov ($rounds_,$rounds);
1591
1592 &set_label("xts_dec_short");
# remove the six-block bias; zero means no whole blocks are left
1593 &add ($len,16*6);
1594 &jz (&label("xts_dec_done6x"));
1595
# 1..5 whole blocks remain: tweaks are accumulated in $inout3/$inout4/$inout5
# and the block count is dispatched on comparisons against 0x20 and 0x40
1596 &movdqa ($inout3,$tweak); # put aside previous tweak
1597 &cmp ($len,0x20);
1598 &jb (&label("xts_dec_one"));
1599
1600 &pshufd ($twres,$twtmp,0x13);
1601 &pxor ($twtmp,$twtmp);
1602 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1603 &pand ($twres,$twmask); # isolate carry and residue
1604 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1605 &pxor ($tweak,$twres);
# flags are still those of the cmp with 0x20; the SSE ops above leave EFLAGS alone
1606 &je (&label("xts_dec_two"));
1607
1608 &pshufd ($twres,$twtmp,0x13);
1609 &pxor ($twtmp,$twtmp);
1610 &movdqa ($inout4,$tweak); # put aside previous tweak
1611 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1612 &pand ($twres,$twmask); # isolate carry and residue
1613 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1614 &pxor ($tweak,$twres);
1615 &cmp ($len,0x40);
1616 &jb (&label("xts_dec_three"));
1617
1618 &pshufd ($twres,$twtmp,0x13);
1619 &pxor ($twtmp,$twtmp);
1620 &movdqa ($inout5,$tweak); # put aside previous tweak
1621 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1622 &pand ($twres,$twmask); # isolate carry and residue
1623 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1624 &pxor ($tweak,$twres);
1625 &movdqa (&QWP(16*0,"esp"),$inout3);
1626 &movdqa (&QWP(16*1,"esp"),$inout4);
# flags are still those of the cmp with 0x40
1627 &je (&label("xts_dec_four"));
1628
# five-block case: tweaks 0..3 live in stack slots, the fifth in $inout5
1629 &movdqa (&QWP(16*2,"esp"),$inout5);
1630 &pshufd ($inout5,$twtmp,0x13);
1631 &movdqa (&QWP(16*3,"esp"),$tweak);
1632 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1633 &pand ($inout5,$twmask); # isolate carry and residue
1634 &pxor ($inout5,$tweak);
1635
1636 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
1637 &movdqu ($inout1,&QWP(16*1,$inp));
1638 &movdqu ($inout2,&QWP(16*2,$inp));
1639 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1640 &movdqu ($inout3,&QWP(16*3,$inp));
1641 &pxor ($inout1,&QWP(16*1,"esp"));
1642 &movdqu ($inout4,&QWP(16*4,$inp));
1643 &pxor ($inout2,&QWP(16*2,"esp"));
1644 &lea ($inp,&DWP(16*5,$inp));
1645 &pxor ($inout3,&QWP(16*3,"esp"));
1646 &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
1647 &pxor ($inout4,$inout5);
1648
# the 6-lane routine is used for five blocks; only lanes 0..4 are stored below
1649 &call ("_aesni_decrypt6");
1650
1651 &movaps ($tweak,&QWP(16*4,"esp")); # last tweak
1652 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1653 &xorps ($inout1,&QWP(16*1,"esp"));
1654 &xorps ($inout2,&QWP(16*2,"esp"));
1655 &movups (&QWP(16*0,$out),$inout0); # write output
1656 &xorps ($inout3,&QWP(16*3,"esp"));
1657 &movups (&QWP(16*1,$out),$inout1);
1658 &xorps ($inout4,$tweak);
1659 &movups (&QWP(16*2,$out),$inout2);
1660 &movups (&QWP(16*3,$out),$inout3);
1661 &movups (&QWP(16*4,$out),$inout4);
1662 &lea ($out,&DWP(16*5,$out));
1663 &jmp (&label("xts_dec_done"));
1664
# each of the 1..4 block tails below leaves the last tweak it used in $tweak
1665 &set_label("xts_dec_one",16);
1666 &movups ($inout0,&QWP(16*0,$inp)); # load input
1667 &lea ($inp,&DWP(16*1,$inp));
1668 &xorps ($inout0,$inout3); # input^=tweak
1669 if ($inline)
1670 { &aesni_inline_generate1("dec"); }
1671 else
1672 { &call ("_aesni_decrypt1"); }
1673 &xorps ($inout0,$inout3); # output^=tweak
1674 &movups (&QWP(16*0,$out),$inout0); # write output
1675 &lea ($out,&DWP(16*1,$out));
1676
1677 &movdqa ($tweak,$inout3); # last tweak
1678 &jmp (&label("xts_dec_done"));
1679
1680 &set_label("xts_dec_two",16);
1681 &movaps ($inout4,$tweak); # put aside last tweak
1682
1683 &movups ($inout0,&QWP(16*0,$inp)); # load input
1684 &movups ($inout1,&QWP(16*1,$inp));
1685 &lea ($inp,&DWP(16*2,$inp));
1686 &xorps ($inout0,$inout3); # input^=tweak
1687 &xorps ($inout1,$inout4);
1688
1689 &call ("_aesni_decrypt2");
1690
1691 &xorps ($inout0,$inout3); # output^=tweak
1692 &xorps ($inout1,$inout4);
1693 &movups (&QWP(16*0,$out),$inout0); # write output
1694 &movups (&QWP(16*1,$out),$inout1);
1695 &lea ($out,&DWP(16*2,$out));
1696
1697 &movdqa ($tweak,$inout4); # last tweak
1698 &jmp (&label("xts_dec_done"));
1699
1700 &set_label("xts_dec_three",16);
1701 &movaps ($inout5,$tweak); # put aside last tweak
1702 &movups ($inout0,&QWP(16*0,$inp)); # load input
1703 &movups ($inout1,&QWP(16*1,$inp));
1704 &movups ($inout2,&QWP(16*2,$inp));
1705 &lea ($inp,&DWP(16*3,$inp));
1706 &xorps ($inout0,$inout3); # input^=tweak
1707 &xorps ($inout1,$inout4);
1708 &xorps ($inout2,$inout5);
1709
1710 &call ("_aesni_decrypt3");
1711
1712 &xorps ($inout0,$inout3); # output^=tweak
1713 &xorps ($inout1,$inout4);
1714 &xorps ($inout2,$inout5);
1715 &movups (&QWP(16*0,$out),$inout0); # write output
1716 &movups (&QWP(16*1,$out),$inout1);
1717 &movups (&QWP(16*2,$out),$inout2);
1718 &lea ($out,&DWP(16*3,$out));
1719
1720 &movdqa ($tweak,$inout5); # last tweak
1721 &jmp (&label("xts_dec_done"));
1722
# four-block case: tweaks 0 and 1 were spilled to the stack before the jump
# here, tweak 2 is in $inout5 and tweak 3 is the current $tweak
1723 &set_label("xts_dec_four",16);
1724 &movaps ($inout4,$tweak); # put aside last tweak
1725
1726 &movups ($inout0,&QWP(16*0,$inp)); # load input
1727 &movups ($inout1,&QWP(16*1,$inp));
1728 &movups ($inout2,&QWP(16*2,$inp));
1729 &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
1730 &movups ($inout3,&QWP(16*3,$inp));
1731 &lea ($inp,&DWP(16*4,$inp));
1732 &xorps ($inout1,&QWP(16*1,"esp"));
1733 &xorps ($inout2,$inout5);
1734 &xorps ($inout3,$inout4);
1735
1736 &call ("_aesni_decrypt4");
1737
1738 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1739 &xorps ($inout1,&QWP(16*1,"esp"));
1740 &xorps ($inout2,$inout5);
1741 &movups (&QWP(16*0,$out),$inout0); # write output
1742 &xorps ($inout3,$inout4);
1743 &movups (&QWP(16*1,$out),$inout1);
1744 &movups (&QWP(16*2,$out),$inout2);
1745 &movups (&QWP(16*3,$out),$inout3);
1746 &lea ($out,&DWP(16*4,$out));
1747
1748 &movdqa ($tweak,$inout4); # last tweak
1749 &jmp (&label("xts_dec_done"));
1750
# reached when no whole blocks were left after the 6x loop (or at all).
# NOTE(review): this path does not reload $twtmp/$twmask; it appears to rely
# on the values left by the most recent tweak update -- confirm when editing.
1751 &set_label("xts_dec_done6x",16); # $tweak is pre-calculated
1752 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1753 &and ($len,15);
1754 &jz (&label("xts_dec_ret"));
1755 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1756 &jmp (&label("xts_dec_only_one_more"));
1757
# reached from the 1..5 block tails: $tweak is the last tweak used, so it is
# advanced once here before falling into the two-tweak computation below
1758 &set_label("xts_dec_done",16);
1759 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1760 &pxor ($twtmp,$twtmp);
1761 &and ($len,15);
1762 &jz (&label("xts_dec_ret"));
1763
1764 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1765 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1766 &pshufd ($twres,$twtmp,0x13);
1767 &pxor ($twtmp,$twtmp);
1768 &movdqa ($twmask,&QWP(16*6,"esp"));
1769 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1770 &pand ($twres,$twmask); # isolate carry and residue
1771 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1772 &pxor ($tweak,$twres);
1773
# two tweaks are needed for the final pair: $inout4 keeps the current tweak
# (used last, after stealing) and $inout3 receives the following one (used
# first, on the held-back whole block)
1774 &set_label("xts_dec_only_one_more");
1775 &pshufd ($inout3,$twtmp,0x13);
1776 &movdqa ($inout4,$tweak); # put aside previous tweak
1777 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1778 &pand ($inout3,$twmask); # isolate carry and residue
1779 &pxor ($inout3,$tweak);
1780
1781 &mov ($key,$key_); # restore $key
1782 &mov ($rounds,$rounds_); # restore $rounds
1783
# decrypt the held-back whole block with the later tweak ($inout3)
1784 &movups ($inout0,&QWP(0,$inp)); # load input
1785 &xorps ($inout0,$inout3); # input^=tweak
1786 if ($inline)
1787 { &aesni_inline_generate1("dec"); }
1788 else
1789 { &call ("_aesni_decrypt1"); }
1790 &xorps ($inout0,$inout3); # output^=tweak
1791 &movups (&QWP(0,$out),$inout0); # write output
1792
# byte-wise swap for the trailing $len%16 bytes: the byte just written at
# out[0] moves to out[16] and is replaced by the input byte at inp[16].
# $rounds and $key serve as byte scratch here and are restored afterwards.
1793 &set_label("xts_dec_steal");
1794 &movz ($rounds,&BP(16,$inp));
1795 &movz ($key,&BP(0,$out));
1796 &lea ($inp,&DWP(1,$inp));
1797 &mov (&BP(0,$out),&LB($rounds));
1798 &mov (&BP(16,$out),&LB($key));
1799 &lea ($out,&DWP(1,$out));
1800 &sub ($len,1);
1801 &jnz (&label("xts_dec_steal"));
1802
1803 &sub ($out,&DWP(16*7+0,"esp")); # rewind $out
1804 &mov ($key,$key_); # restore $key
1805 &mov ($rounds,$rounds_); # restore $rounds
1806
# decrypt the patched block at out[0] in place with the earlier tweak ($inout4)
1807 &movups ($inout0,&QWP(0,$out)); # load input
1808 &xorps ($inout0,$inout4); # input^=tweak
1809 if ($inline)
1810 { &aesni_inline_generate1("dec"); }
1811 else
1812 { &call ("_aesni_decrypt1"); }
1813 &xorps ($inout0,$inout4); # output^=tweak
1814 &movups (&QWP(0,$out),$inout0); # write output
1815
# epilogue: zero xmm0-xmm7 and tweak slots 0..5, then restore the caller's %esp
1816 &set_label("xts_dec_ret");
1817 &pxor ("xmm0","xmm0"); # clear register bank
1818 &pxor ("xmm1","xmm1");
1819 &pxor ("xmm2","xmm2");
1820 &movdqa (&QWP(16*0,"esp"),"xmm0"); # clear stack
1821 &pxor ("xmm3","xmm3");
1822 &movdqa (&QWP(16*1,"esp"),"xmm0");
1823 &pxor ("xmm4","xmm4");
1824 &movdqa (&QWP(16*2,"esp"),"xmm0");
1825 &pxor ("xmm5","xmm5");
1826 &movdqa (&QWP(16*3,"esp"),"xmm0");
1827 &pxor ("xmm6","xmm6");
1828 &movdqa (&QWP(16*4,"esp"),"xmm0");
1829 &pxor ("xmm7","xmm7");
1830 &movdqa (&QWP(16*5,"esp"),"xmm0");
1831 &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
1832 &function_end("aesni_xts_decrypt");
1833 }
1834 }
1835 \f
1836 ######################################################################
1837 # void $PREFIX_cbc_encrypt (const void *inp, void *out,
1838 # size_t length, const AES_KEY *key,
1839 # unsigned char *ivp,const int enc);
# CBC entry point and encryption path.
# Arguments are fetched with wparam(0..5) in the order inp, out, length,
# key, ivp, enc.  A zero length jumps straight to "cbc_abort"; a zero enc
# flag selects the "cbc_decrypt" path, anything else encrypts.
# A scratch frame is carved out below the caller's %esp: ($esp-24) rounded
# down to a 16-byte boundary.  Offset 0 of the frame is used as a 16-byte
# scratch slot and offset 16 keeps the caller's %esp.
# $key_ and $rounds_ serve as backups of $key and $rounds, which are
# restored from them after every cipher call below.
1840 &function_begin("${PREFIX}_cbc_encrypt");
1841 &mov ($inp,&wparam(0));
1842 &mov ($rounds_,"esp");
1843 &mov ($out,&wparam(1));
1844 &sub ($rounds_,24);
1845 &mov ($len,&wparam(2));
1846 &and ($rounds_,-16);
1847 &mov ($key,&wparam(3));
1848 &mov ($key_,&wparam(4));
1849 &test ($len,$len);
1850 &jz (&label("cbc_abort"));
1851
# The comparison of enc against 0 is consumed by the je at the end of this
# group; only moves and an xchg are emitted in between.
1852 &cmp (&wparam(5),0);
1853 &xchg ($rounds_,"esp"); # alloca
1854 &movups ($ivec,&QWP(0,$key_)); # load IV
1855 &mov ($rounds,&DWP(240,$key));
1856 &mov ($key_,$key); # backup $key
1857 &mov (&DWP(16,"esp"),$rounds_); # save original %esp
1858 &mov ($rounds_,$rounds); # backup $rounds
1859 &je (&label("cbc_decrypt"));
1860
# Encryption: $inout0 carries the chaining value (the IV at first, then the
# previous ciphertext block).  Fewer than 16 bytes goes to the tail code.
1861 &movaps ($inout0,$ivec);
1862 &cmp ($len,16);
1863 &jb (&label("cbc_enc_tail"));
1864 &sub ($len,16);
1865 &jmp (&label("cbc_enc_loop"));
1866
# One block per iteration.  $ivec is reused as the input-block register.
1867 &set_label("cbc_enc_loop",16);
1868 &movups ($ivec,&QWP(0,$inp)); # input actually
1869 &lea ($inp,&DWP(16,$inp));
# Non-inline variant: xor the input into the chaining value, then call the
# one-block encryptor.  The inline variant receives the same two registers;
# presumably it folds the xor into the generated sequence (helper is
# defined outside this section -- confirm there).
1870 if ($inline)
1871 { &aesni_inline_generate1("enc",$inout0,$ivec); }
1872 else
1873 { &xorps($inout0,$ivec); &call("_aesni_encrypt1"); }
1874 &mov ($rounds,$rounds_); # restore $rounds
1875 &mov ($key,$key_); # restore $key
1876 &movups (&QWP(0,$out),$inout0); # store output
1877 &lea ($out,&DWP(16,$out));
# Keep looping while the subtraction does not borrow; afterwards undo the
# last subtraction and handle any leftover bytes in the tail code.
1878 &sub ($len,16);
1879 &jnc (&label("cbc_enc_loop"));
1880 &add ($len,16);
1881 &jnz (&label("cbc_enc_tail"));
1882 &movaps ($ivec,$inout0);
1883 &pxor ($inout0,$inout0);
1884 &jmp (&label("cbc_ret"));
1885
# Short final block: copy the $len leftover input bytes to the output
# buffer, zero-fill the remaining 16-$len bytes there, then point $inp at
# that padded block and run it through the loop once more.  $len is zero
# on re-entry (eax was cleared), so the loop exits after that block.
# NOTE(review): the rep movsb/stosb encodings imply $inp/$out are esi/edi;
# the register assignments are made outside this section.
1886 &set_label("cbc_enc_tail");
1887 &mov ("ecx",$len); # zaps $rounds
1888 &data_word(0xA4F3F689); # rep movsb
1889 &mov ("ecx",16); # zero tail
1890 &sub ("ecx",$len);
1891 &xor ("eax","eax"); # zaps $len
1892 &data_word(0xAAF3F689); # rep stosb
1893 &lea ($out,&DWP(-16,$out)); # rewind $out by 1 block
1894 &mov ($rounds,$rounds_); # restore $rounds
1895 &mov ($inp,$out); # $inp and $out are the same
1896 &mov ($key,$key_); # restore $key
1897 &jmp (&label("cbc_enc_loop"));
1898 ######################################################################
# CBC decryption path of ${PREFIX}_cbc_encrypt.
# Inputs longer than 0x50 bytes go through a six-block loop; whatever is
# left (at most five blocks plus the pending one) is handled by the tail
# dispatch below.  The chaining value for the first block of each batch is
# kept in the scratch slot at 0(%esp).
1899 &set_label("cbc_decrypt",16);
1900 &cmp ($len,0x50);
1901 &jbe (&label("cbc_dec_tail"));
1902 &movaps (&QWP(0,"esp"),$ivec); # save IV
1903 &sub ($len,0x50);
1904 &jmp (&label("cbc_dec_loop6_enter"));
1905
# Loop head for every batch after the first: $rndkey0 holds the last
# ciphertext block of the previous batch (loaded at the "# IV" line below)
# and becomes the new chaining value; the sixth plaintext block of the
# previous batch, whose store was deferred, is written out here.
1906 &set_label("cbc_dec_loop6",16);
1907 &movaps (&QWP(0,"esp"),$rndkey0); # save IV
1908 &movups (&QWP(0,$out),$inout5);
1909 &lea ($out,&DWP(0x10,$out));
1910 &set_label("cbc_dec_loop6_enter");
1911 &movdqu ($inout0,&QWP(0,$inp));
1912 &movdqu ($inout1,&QWP(0x10,$inp));
1913 &movdqu ($inout2,&QWP(0x20,$inp));
1914 &movdqu ($inout3,&QWP(0x30,$inp));
1915 &movdqu ($inout4,&QWP(0x40,$inp));
1916 &movdqu ($inout5,&QWP(0x50,$inp));
1917
1918 &call ("_aesni_decrypt6");
1919
# Undo the chaining: block 0 is xored with the saved IV, block i with
# ciphertext block i-1, which is re-read from the input buffer.
1920 &movups ($rndkey1,&QWP(0,$inp));
1921 &movups ($rndkey0,&QWP(0x10,$inp));
1922 &xorps ($inout0,&QWP(0,"esp")); # ^=IV
1923 &xorps ($inout1,$rndkey1);
1924 &movups ($rndkey1,&QWP(0x20,$inp));
1925 &xorps ($inout2,$rndkey0);
1926 &movups ($rndkey0,&QWP(0x30,$inp));
1927 &xorps ($inout3,$rndkey1);
1928 &movups ($rndkey1,&QWP(0x40,$inp));
1929 &xorps ($inout4,$rndkey0);
1930 &movups ($rndkey0,&QWP(0x50,$inp)); # IV
1931 &xorps ($inout5,$rndkey1);
1932 &movups (&QWP(0,$out),$inout0);
1933 &movups (&QWP(0x10,$out),$inout1);
1934 &lea ($inp,&DWP(0x60,$inp));
1935 &movups (&QWP(0x20,$out),$inout2);
1936 &mov ($rounds,$rounds_); # restore $rounds
1937 &movups (&QWP(0x30,$out),$inout3);
1938 &mov ($key,$key_); # restore $key
1939 &movups (&QWP(0x40,$out),$inout4);
# Only five of the six blocks are stored here; $out advances by 0x50 and
# $inout5 stays pending.
1940 &lea ($out,&DWP(0x50,$out));
1941 &sub ($len,0x60);
1942 &ja (&label("cbc_dec_loop6"));
1943
# Loop exit: the pending block moves to $inout0 and the last ciphertext
# block becomes $ivec.  If the re-biased length is <= 0 the pending block
# is finished by the collection code; otherwise it is stored and the
# remaining input is handled by the tail dispatch.
1944 &movaps ($inout0,$inout5);
1945 &movaps ($ivec,$rndkey0);
1946 &add ($len,0x50);
1947 &jle (&label("cbc_dec_clear_tail_collected"));
1948 &movups (&QWP(0,$out),$inout0);
1949 &lea ($out,&DWP(0x10,$out));
# Tail dispatch: blocks are loaded one at a time and $len is compared with
# 0x10/0x20/0x30/0x40 to pick the 1-, 2-, 3- or 4-block routine.  $in0 and
# $in1 keep copies of the first two ciphertext blocks for the chaining xor.
1950 &set_label("cbc_dec_tail");
1951 &movups ($inout0,&QWP(0,$inp));
1952 &movaps ($in0,$inout0);
1953 &cmp ($len,0x10);
1954 &jbe (&label("cbc_dec_one"));
1955
1956 &movups ($inout1,&QWP(0x10,$inp));
1957 &movaps ($in1,$inout1);
1958 &cmp ($len,0x20);
1959 &jbe (&label("cbc_dec_two"));
1960
1961 &movups ($inout2,&QWP(0x20,$inp));
1962 &cmp ($len,0x30);
1963 &jbe (&label("cbc_dec_three"));
1964
1965 &movups ($inout3,&QWP(0x30,$inp));
1966 &cmp ($len,0x40);
1967 &jbe (&label("cbc_dec_four"));
1968
# Five blocks: reuse the six-block routine with a zeroed sixth register.
# The first four results are stored and their registers cleared; the fifth
# is left in $inout0 for the collection code.
1969 &movups ($inout4,&QWP(0x40,$inp));
1970 &movaps (&QWP(0,"esp"),$ivec); # save IV
1971 &movups ($inout0,&QWP(0,$inp));
1972 &xorps ($inout5,$inout5);
1973 &call ("_aesni_decrypt6");
1974 &movups ($rndkey1,&QWP(0,$inp));
1975 &movups ($rndkey0,&QWP(0x10,$inp));
1976 &xorps ($inout0,&QWP(0,"esp")); # ^= IV
1977 &xorps ($inout1,$rndkey1);
1978 &movups ($rndkey1,&QWP(0x20,$inp));
1979 &xorps ($inout2,$rndkey0);
1980 &movups ($rndkey0,&QWP(0x30,$inp));
1981 &xorps ($inout3,$rndkey1);
1982 &movups ($ivec,&QWP(0x40,$inp)); # IV
1983 &xorps ($inout4,$rndkey0);
1984 &movups (&QWP(0,$out),$inout0);
1985 &movups (&QWP(0x10,$out),$inout1);
1986 &pxor ($inout1,$inout1);
1987 &movups (&QWP(0x20,$out),$inout2);
1988 &pxor ($inout2,$inout2);
1989 &movups (&QWP(0x30,$out),$inout3);
1990 &pxor ($inout3,$inout3);
1991 &lea ($out,&DWP(0x40,$out));
1992 &movaps ($inout0,$inout4);
1993 &pxor ($inout4,$inout4);
1994 &sub ($len,0x50);
1995 &jmp (&label("cbc_dec_tail_collected"));
1996
# One block: decrypt, xor with the IV, and make the ciphertext copy in
# $in0 the next IV.  The result stays in $inout0 for the collection code.
1997 &set_label("cbc_dec_one",16);
1998 if ($inline)
1999 { &aesni_inline_generate1("dec"); }
2000 else
2001 { &call ("_aesni_decrypt1"); }
2002 &xorps ($inout0,$ivec);
2003 &movaps ($ivec,$in0);
2004 &sub ($len,0x10);
2005 &jmp (&label("cbc_dec_tail_collected"));
2006
# Two blocks: the first result is stored, the second is moved to $inout0,
# and the second ciphertext block ($in1) becomes the next IV.
2007 &set_label("cbc_dec_two",16);
2008 &call ("_aesni_decrypt2");
2009 &xorps ($inout0,$ivec);
2010 &xorps ($inout1,$in0);
2011 &movups (&QWP(0,$out),$inout0);
2012 &movaps ($inout0,$inout1);
2013 &pxor ($inout1,$inout1);
2014 &lea ($out,&DWP(0x10,$out));
2015 &movaps ($ivec,$in1);
2016 &sub ($len,0x20);
2017 &jmp (&label("cbc_dec_tail_collected"));
2018
# Three blocks: same pattern; the next IV is re-read from 0x20($inp).
2019 &set_label("cbc_dec_three",16);
2020 &call ("_aesni_decrypt3");
2021 &xorps ($inout0,$ivec);
2022 &xorps ($inout1,$in0);
2023 &xorps ($inout2,$in1);
2024 &movups (&QWP(0,$out),$inout0);
2025 &movaps ($inout0,$inout2);
2026 &pxor ($inout2,$inout2);
2027 &movups (&QWP(0x10,$out),$inout1);
2028 &pxor ($inout1,$inout1);
2029 &lea ($out,&DWP(0x20,$out));
2030 &movups ($ivec,&QWP(0x20,$inp));
2031 &sub ($len,0x30);
2032 &jmp (&label("cbc_dec_tail_collected"));
2033
# Four blocks: ciphertext blocks 1 and 2 are re-read into $rndkey1 and
# $rndkey0 for the chaining xor; the next IV is re-read from 0x30($inp).
2034 &set_label("cbc_dec_four",16);
2035 &call ("_aesni_decrypt4");
2036 &movups ($rndkey1,&QWP(0x10,$inp));
2037 &movups ($rndkey0,&QWP(0x20,$inp));
2038 &xorps ($inout0,$ivec);
2039 &movups ($ivec,&QWP(0x30,$inp));
2040 &xorps ($inout1,$in0);
2041 &movups (&QWP(0,$out),$inout0);
2042 &xorps ($inout2,$rndkey1);
2043 &movups (&QWP(0x10,$out),$inout1);
2044 &pxor ($inout1,$inout1);
2045 &xorps ($inout3,$rndkey0);
2046 &movups (&QWP(0x20,$out),$inout2);
2047 &pxor ($inout2,$inout2);
2048 &lea ($out,&DWP(0x30,$out));
2049 &movaps ($inout0,$inout3);
2050 &pxor ($inout3,$inout3);
2051 &sub ($len,0x40);
2052 &jmp (&label("cbc_dec_tail_collected"));
2053
# Entry used after the six-block loop: clear the registers the tail
# routines above clear themselves, then fall into the common collection.
2054 &set_label("cbc_dec_clear_tail_collected",16);
2055 &pxor ($inout1,$inout1);
2056 &pxor ($inout2,$inout2);
2057 &pxor ($inout3,$inout3);
2058 &pxor ($inout4,$inout4);
# Common collection: $inout0 holds the last plaintext block, not yet
# stored.  If the low four bits of $len are zero it is stored whole;
# otherwise the partial-block code takes over.
2059 &set_label("cbc_dec_tail_collected");
2060 &and ($len,15);
2061 &jnz (&label("cbc_dec_tail_partial"));
2062 &movups (&QWP(0,$out),$inout0);
2063 &pxor ($rndkey0,$rndkey0);
2064 &jmp (&label("cbc_ret"));
2065
# Partial final block of the CBC decryption path.
# The last decrypted block is spilled to the 16-byte scratch slot at
# 0(%esp) and 16-$len bytes of it are copied to the output with rep movsb
# ($len was reduced to its low four bits by the code that branches here).
# Afterwards the scratch slot is overwritten with zeros so that no
# plaintext is left on the stack: $rndkey0 is cleared before the copy and
# nothing modifies it in between.  Execution falls through to "cbc_ret".
2066 &set_label("cbc_dec_tail_partial",16);
2067 &movaps (&QWP(0,"esp"),$inout0);
2068 &pxor ($rndkey0,$rndkey0);
2069 &mov ("ecx",16);
2070 &mov ($inp,"esp");
2071 &sub ("ecx",$len);
2072 &data_word(0xA4F3F689); # rep movsb
2073 &movdqa (&QWP(0,"esp"),$rndkey0); # scrub plaintext copy with the zeroed register
2074
# Common exit of ${PREFIX}_cbc_encrypt.
# "cbc_ret" drops the scratch frame by reloading the caller's %esp from
# 16(%esp), writes the final chaining value in $ivec back through the ivp
# argument (wparam(4)) and clears $inout0, $rndkey1 and $ivec.
# "cbc_abort" is the target for a zero-length call and skips all of that.
2075 &set_label("cbc_ret");
2076 &mov ("esp",&DWP(16,"esp")); # pull original %esp
2077 &mov ($key_,&wparam(4));
2078 &pxor ($inout0,$inout0);
2079 &pxor ($rndkey1,$rndkey1);
2080 &movups (&QWP(0,$key_),$ivec); # output IV
2081 &pxor ($ivec,$ivec);
2082 &set_label("cbc_abort");
2083 &function_end("${PREFIX}_cbc_encrypt");
2084 \f
2085 ######################################################################
2086 # Mechanical port from aesni-x86_64.pl.
2087 #
2088 # _aesni_set_encrypt_key is private interface,
2089 # input:
2090 # "eax" const unsigned char *userKey
2091 # $rounds int bits
2092 # $key AES_KEY *key
2093 # output:
2094 # "eax" return code
2095 # $rounds rounds-1 (9, 11 or 13)
2096
# Private key-schedule routine: prologue and key-size dispatch.
# Register interface: eax = userKey, $rounds = key length in bits,
# $key = AES_KEY to fill.  ebp and ebx are saved on the stack and restored
# at every exit.  A null userKey or $key branches to "bad_pointer".
2097 &function_begin_B("_aesni_set_encrypt_key");
2098 &push ("ebp");
2099 &push ("ebx");
2100 &test ("eax","eax");
2101 &jz (&label("bad_pointer"));
2102 &test ($key,$key);
2103 &jz (&label("bad_pointer"));
2104
# call/pop puts the run-time address of "pic" into ebx; the lea then
# rebases ebx onto the "key_const" table used by the *_alt paths.
2105 &call (&label("pic"));
2106 &set_label("pic");
2107 &blindpop("ebx");
2108 &lea ("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx"));
2109
# picmeup is defined outside this section; presumably it leaves the
# address of OPENSSL_ia32cap_P in ebp -- confirm there.  The dword at
# offset 4 is then masked down to two capability bits; the key-size
# sections compare the result with 1<<28 to choose their "_alt" variant.
2110 &picmeup("ebp","OPENSSL_ia32cap_P","ebx",&label("key_const"));
2111 &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
2112 &xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0
2113 &mov ("ebp",&DWP(4,"ebp"));
# $key is advanced by one round key, so round 0 is stored at -16($key).
2114 &lea ($key,&DWP(16,$key));
2115 &and ("ebp",1<<28|1<<11); # AVX and XOP bits
# Dispatch on the key length; anything other than 256, 192 or 128 bits
# branches to "bad_keybits".  128 bits falls through to "10rounds".
2116 &cmp ($rounds,256);
2117 &je (&label("14rounds"));
2118 &cmp ($rounds,192);
2119 &je (&label("12rounds"));
2120 &cmp ($rounds,128);
2121 &jne (&label("bad_keybits"));
2122
# 128-bit key schedule built with aeskeygenassist.
# If ebp equals 1<<28 the "10rounds_alt" variant is used instead.
# Round 0 is the user key itself.  Each later round key is produced by one
# aeskeygenassist (immediate = round constant) followed by a call to the
# local "key_128" helper.  $rounds is set to 9 and stored next to the
# schedule before jumping to "good_key".
2123 &set_label("10rounds",16);
2124 &cmp ("ebp",1<<28);
2125 &je (&label("10rounds_alt"));
2126
2127 &mov ($rounds,9);
2128 &$movekey (&QWP(-16,$key),"xmm0"); # round 0
2129 &aeskeygenassist("xmm1","xmm0",0x01); # round 1
2130 &call (&label("key_128_cold"));
2131 &aeskeygenassist("xmm1","xmm0",0x2); # round 2
2132 &call (&label("key_128"));
2133 &aeskeygenassist("xmm1","xmm0",0x04); # round 3
2134 &call (&label("key_128"));
2135 &aeskeygenassist("xmm1","xmm0",0x08); # round 4
2136 &call (&label("key_128"));
2137 &aeskeygenassist("xmm1","xmm0",0x10); # round 5
2138 &call (&label("key_128"));
2139 &aeskeygenassist("xmm1","xmm0",0x20); # round 6
2140 &call (&label("key_128"));
2141 &aeskeygenassist("xmm1","xmm0",0x40); # round 7
2142 &call (&label("key_128"));
2143 &aeskeygenassist("xmm1","xmm0",0x80); # round 8
2144 &call (&label("key_128"));
2145 &aeskeygenassist("xmm1","xmm0",0x1b); # round 9
2146 &call (&label("key_128"));
2147 &aeskeygenassist("xmm1","xmm0",0x36); # round 10
2148 &call (&label("key_128"));
# Store the last round key; the round count goes 80 bytes past it.
2149 &$movekey (&QWP(0,$key),"xmm0");
2150 &mov (&DWP(80,$key),$rounds);
2151
2152 &jmp (&label("good_key"));
2153
# Local helper, reached with call and left with ret.
# "key_128" first stores the previous round key from xmm0 and advances
# $key; "key_128_cold" skips that for the first round, whose predecessor
# (round 0) was stored by the caller.  The shufps/xorps pairs fold shifted
# copies of xmm0 into itself through xmm4, then the top dword of the
# aeskeygenassist result in xmm1 is broadcast and xored in, leaving the
# new round key in xmm0.
2154 &set_label("key_128",16);
2155 &$movekey (&QWP(0,$key),"xmm0");
2156 &lea ($key,&DWP(16,$key));
2157 &set_label("key_128_cold");
2158 &shufps ("xmm4","xmm0",0b00010000);
2159 &xorps ("xmm0","xmm4");
2160 &shufps ("xmm4","xmm0",0b10001100);
2161 &xorps ("xmm0","xmm4");
2162 &shufps ("xmm1","xmm1",0b11111111); # critical path
2163 &xorps ("xmm0","xmm1");
2164 &ret();
2165
# Alternative 128-bit key schedule that avoids aeskeygenassist.
# xmm5 = byte-shuffle mask from key_const+0x00, xmm4 = round constant
# vector from key_const+0x20, xmm2 = copy of the previous round key.
# Each step applies pshufb with the mask and aesenclast against the round
# constant to the previous key (presumably the RotWord/SubWord/Rcon part of
# the standard AES key expansion), xors in the running dword-prefix xor of
# the previous key, and stores the result.
2166 &set_label("10rounds_alt",16);
2167 &movdqa ("xmm5",&QWP(0x00,"ebx"));
2168 &mov ($rounds,8);
2169 &movdqa ("xmm4",&QWP(0x20,"ebx"));
2170 &movdqa ("xmm2","xmm0");
2171 &movdqu (&QWP(-16,$key),"xmm0");
2172
# Rounds 1..8: $rounds counts the iterations down; the round constant is
# doubled with pslld after every use.
2173 &set_label("loop_key128");
2174 &pshufb ("xmm0","xmm5");
2175 &aesenclast ("xmm0","xmm4");
2176 &pslld ("xmm4",1);
2177 &lea ($key,&DWP(16,$key));
2178
# xmm2 = k ^ (k<<32) ^ (k<<64) ^ (k<<96), k being the previous round key.
2179 &movdqa ("xmm3","xmm2");
2180 &pslldq ("xmm2",4);
2181 &pxor ("xmm3","xmm2");
2182 &pslldq ("xmm2",4);
2183 &pxor ("xmm3","xmm2");
2184 &pslldq ("xmm2",4);
2185 &pxor ("xmm2","xmm3");
2186
2187 &pxor ("xmm0","xmm2");
2188 &movdqu (&QWP(-16,$key),"xmm0");
2189 &movdqa ("xmm2","xmm0");
2190
2191 &dec ($rounds);
2192 &jnz (&label("loop_key128"));
2193
# Rounds 9 and 10 are unrolled: the round constant is reloaded from
# key_const+0x30 and doubled once between the two steps.
2194 &movdqa ("xmm4",&QWP(0x30,"ebx"));
2195
2196 &pshufb ("xmm0","xmm5");
2197 &aesenclast ("xmm0","xmm4");
2198 &pslld ("xmm4",1);
2199
2200 &movdqa ("xmm3","xmm2");
2201 &pslldq ("xmm2",4);
2202 &pxor ("xmm3","xmm2");
2203 &pslldq ("xmm2",4);
2204 &pxor ("xmm3","xmm2");
2205 &pslldq ("xmm2",4);
2206 &pxor ("xmm2","xmm3");
2207
2208 &pxor ("xmm0","xmm2");
2209 &movdqu (&QWP(0,$key),"xmm0");
2210
2211 &movdqa ("xmm2","xmm0");
2212 &pshufb ("xmm0","xmm5");
2213 &aesenclast ("xmm0","xmm4");
2214
2215 &movdqa ("xmm3","xmm2");
2216 &pslldq ("xmm2",4);
2217 &pxor ("xmm3","xmm2");
2218 &pslldq ("xmm2",4);
2219 &pxor ("xmm3","xmm2");
2220 &pslldq ("xmm2",4);
2221 &pxor ("xmm2","xmm3");
2222
2223 &pxor ("xmm0","xmm2");
2224 &movdqu (&QWP(16,$key),"xmm0");
2225
# $rounds was consumed as the loop counter, so it is set again before
# being stored and returned.
2226 &mov ($rounds,9);
2227 &mov (&DWP(96,$key),$rounds);
2228
2229 &jmp (&label("good_key"));
2230
# 192-bit key schedule built with aeskeygenassist.
# xmm0 holds the first 128 key bits, xmm2 receives the remaining 64.
# If ebp equals 1<<28 the "12rounds_alt" variant is used instead.
# Every aeskeygenassist is followed by a call to one of the local helpers
# "key_192a" / "key_192b", which alternate because each expansion step
# yields one and a half round keys.  $rounds is set to 11 and stored next
# to the schedule before jumping to "good_key".
2231 &set_label("12rounds",16);
2232 &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey
2233 &cmp ("ebp",1<<28);
2234 &je (&label("12rounds_alt"));
2235
2236 &mov ($rounds,11);
2237 &$movekey (&QWP(-16,$key),"xmm0"); # round 0
2238 &aeskeygenassist("xmm1","xmm2",0x01); # round 1,2
2239 &call (&label("key_192a_cold"));
2240 &aeskeygenassist("xmm1","xmm2",0x02); # round 2,3
2241 &call (&label("key_192b"));
2242 &aeskeygenassist("xmm1","xmm2",0x04); # round 4,5
2243 &call (&label("key_192a"));
2244 &aeskeygenassist("xmm1","xmm2",0x08); # round 5,6
2245 &call (&label("key_192b"));
2246 &aeskeygenassist("xmm1","xmm2",0x10); # round 7,8
2247 &call (&label("key_192a"));
2248 &aeskeygenassist("xmm1","xmm2",0x20); # round 8,9
2249 &call (&label("key_192b"));
2250 &aeskeygenassist("xmm1","xmm2",0x40); # round 10,11
2251 &call (&label("key_192a"));
2252 &aeskeygenassist("xmm1","xmm2",0x80); # round 11,12
2253 &call (&label("key_192b"));
# Store the last round key; the round count goes 48 bytes past it.
2254 &$movekey (&QWP(0,$key),"xmm0");
2255 &mov (&DWP(48,$key),$rounds);
2256
2257 &jmp (&label("good_key"));
2258
# Local helper (call/ret).  "key_192a" stores xmm0 as a complete round key
# and advances $key by 16; "key_192a_cold" skips the store.  Both save xmm2
# in xmm5 for the following "key_192b" call, then run the shared expansion
# step at "key_192b_warm", which updates xmm0 and xmm2.
2259 &set_label("key_192a",16);
2260 &$movekey (&QWP(0,$key),"xmm0");
2261 &lea ($key,&DWP(16,$key));
2262 &set_label("key_192a_cold",16);
2263 &movaps ("xmm5","xmm2");
2264 &set_label("key_192b_warm");
2265 &shufps ("xmm4","xmm0",0b00010000);
2266 &movdqa ("xmm3","xmm2");
2267 &xorps ("xmm0","xmm4");
2268 &shufps ("xmm4","xmm0",0b10001100);
2269 &pslldq ("xmm3",4);
2270 &xorps ("xmm0","xmm4");
2271 &pshufd ("xmm1","xmm1",0b01010101); # critical path
2272 &pxor ("xmm2","xmm3");
2273 &pxor ("xmm0","xmm1");
2274 &pshufd ("xmm3","xmm0",0b11111111);
2275 &pxor ("xmm2","xmm3");
2276 &ret();
2277
# Local helper (call/ret via key_192b_warm).  Assembles two round keys
# from the halves held in xmm5, xmm0 and xmm2, stores them, advances $key
# by 32 and continues with the shared expansion step.
2278 &set_label("key_192b",16);
2279 &movaps ("xmm3","xmm0");
2280 &shufps ("xmm5","xmm0",0b01000100);
2281 &$movekey (&QWP(0,$key),"xmm5");
2282 &shufps ("xmm3","xmm2",0b01001110);
2283 &$movekey (&QWP(16,$key),"xmm3");
2284 &lea ($key,&DWP(32,$key));
2285 &jmp (&label("key_192b_warm"));
2286
# Alternative 192-bit key schedule that avoids aeskeygenassist.
# xmm5 = byte-shuffle mask from key_const+0x10, xmm4 = round constant
# vector from key_const+0x20 (doubled with pslld after each use).
# xmm0 holds four key dwords and xmm2 the following two; every iteration
# emits 24 bytes of schedule, so $key advances by 24 and the loop runs
# eight times.
2287 &set_label("12rounds_alt",16);
2288 &movdqa ("xmm5",&QWP(0x10,"ebx"));
2289 &movdqa ("xmm4",&QWP(0x20,"ebx"));
2290 &mov ($rounds,8);
2291 &movdqu (&QWP(-16,$key),"xmm0");
2292
2293 &set_label("loop_key192");
# Store the pending 8 bytes, keep a copy in xmm1, and run pshufb +
# aesenclast on xmm2 (presumably the RotWord/SubWord/Rcon step of the
# standard AES key expansion).
2294 &movq (&QWP(0,$key),"xmm2");
2295 &movdqa ("xmm1","xmm2");
2296 &pshufb ("xmm2","xmm5");
2297 &aesenclast ("xmm2","xmm4");
2298 &pslld ("xmm4",1);
2299 &lea ($key,&DWP(24,$key));
2300
# xmm0 = k ^ (k<<32) ^ (k<<64) ^ (k<<96) for the 128-bit part k.
2301 &movdqa ("xmm3","xmm0");
2302 &pslldq ("xmm0",4);
2303 &pxor ("xmm3","xmm0");
2304 &pslldq ("xmm0",4);
2305 &pxor ("xmm3","xmm0");
2306 &pslldq ("xmm0",4);
2307 &pxor ("xmm0","xmm3");
2308
# Same prefix-xor for the two trailing dwords, seeded with the top dword
# of the value just computed in xmm0.
2309 &pshufd ("xmm3","xmm0",0xff);
2310 &pxor ("xmm3","xmm1");
2311 &pslldq ("xmm1",4);
2312 &pxor ("xmm3","xmm1");
2313
2314 &pxor ("xmm0","xmm2");
2315 &pxor ("xmm2","xmm3");
2316 &movdqu (&QWP(-16,$key),"xmm0");
2317
2318 &dec ($rounds);
2319 &jnz (&label("loop_key192"));
2320
# $rounds was consumed as the loop counter, so it is set again before
# being stored and returned.
2321 &mov ($rounds,11);
2322 &mov (&DWP(32,$key),$rounds);
2323
2324 &jmp (&label("good_key"));
2325
# 256-bit key schedule built with aeskeygenassist.
# xmm0 holds the first key half, xmm2 the second; both are stored as
# rounds 0 and 1 after $key has been advanced by another 16 bytes.
# If ebp equals 1<<28 the "14rounds_alt" variant is used instead.
# The helpers "key_256a" and "key_256b" alternate: "a" derives the next
# even round key in xmm0, "b" the next odd round key in xmm2.  $rounds is
# set to 13 and stored next to the schedule before jumping to "good_key".
2326 &set_label("14rounds",16);
2327 &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey
2328 &lea ($key,&DWP(16,$key));
2329 &cmp ("ebp",1<<28);
2330 &je (&label("14rounds_alt"));
2331
2332 &mov ($rounds,13);
2333 &$movekey (&QWP(-32,$key),"xmm0"); # round 0
2334 &$movekey (&QWP(-16,$key),"xmm2"); # round 1
2335 &aeskeygenassist("xmm1","xmm2",0x01); # round 2
2336 &call (&label("key_256a_cold"));
2337 &aeskeygenassist("xmm1","xmm0",0x01); # round 3
2338 &call (&label("key_256b"));
2339 &aeskeygenassist("xmm1","xmm2",0x02); # round 4
2340 &call (&label("key_256a"));
2341 &aeskeygenassist("xmm1","xmm0",0x02); # round 5
2342 &call (&label("key_256b"));
2343 &aeskeygenassist("xmm1","xmm2",0x04); # round 6
2344 &call (&label("key_256a"));
2345 &aeskeygenassist("xmm1","xmm0",0x04); # round 7
2346 &call (&label("key_256b"));
2347 &aeskeygenassist("xmm1","xmm2",0x08); # round 8
2348 &call (&label("key_256a"));
2349 &aeskeygenassist("xmm1","xmm0",0x08); # round 9
2350 &call (&label("key_256b"));
2351 &aeskeygenassist("xmm1","xmm2",0x10); # round 10
2352 &call (&label("key_256a"));
2353 &aeskeygenassist("xmm1","xmm0",0x10); # round 11
2354 &call (&label("key_256b"));
2355 &aeskeygenassist("xmm1","xmm2",0x20); # round 12
2356 &call (&label("key_256a"));
2357 &aeskeygenassist("xmm1","xmm0",0x20); # round 13
2358 &call (&label("key_256b"));
2359 &aeskeygenassist("xmm1","xmm2",0x40); # round 14
2360 &call (&label("key_256a"));
# Store the last round key; the round count goes 16 bytes past it.
# eax is cleared here and once more at "good_key".
2361 &$movekey (&QWP(0,$key),"xmm0");
2362 &mov (&DWP(16,$key),$rounds);
2363 &xor ("eax","eax");
2364
2365 &jmp (&label("good_key"));
2366
# Local helper (call/ret).  "key_256a" stores the odd round key held in
# xmm2 and advances $key; "key_256a_cold" skips the store.  The body
# updates xmm0 using the broadcast top dword of the aeskeygenassist result.
2367 &set_label("key_256a",16);
2368 &$movekey (&QWP(0,$key),"xmm2");
2369 &lea ($key,&DWP(16,$key));
2370 &set_label("key_256a_cold");
2371 &shufps ("xmm4","xmm0",0b00010000);
2372 &xorps ("xmm0","xmm4");
2373 &shufps ("xmm4","xmm0",0b10001100);
2374 &xorps ("xmm0","xmm4");
2375 &shufps ("xmm1","xmm1",0b11111111); # critical path
2376 &xorps ("xmm0","xmm1");
2377 &ret();
2378
# Local helper (call/ret).  Stores the even round key held in xmm0,
# advances $key, then updates xmm2 using dword 2 of the aeskeygenassist
# result (shuffle immediate 0b10101010) instead of the top dword.
2379 &set_label("key_256b",16);
2380 &$movekey (&QWP(0,$key),"xmm0");
2381 &lea ($key,&DWP(16,$key));
2382
2383 &shufps ("xmm4","xmm2",0b00010000);
2384 &xorps ("xmm2","xmm4");
2385 &shufps ("xmm4","xmm2",0b10001100);
2386 &xorps ("xmm2","xmm4");
2387 &shufps ("xmm1","xmm1",0b10101010); # critical path
2388 &xorps ("xmm2","xmm1");
2389 &ret();
2390
# Alternative 256-bit key schedule that avoids aeskeygenassist.
# xmm5 = byte-shuffle mask from key_const+0x00, xmm4 = round constant
# vector from key_const+0x20 (doubled with pslld once per iteration).
# xmm0/xmm1 carry the previous even/odd round keys; rounds 0 and 1 are the
# two halves of the user key.  Each loop iteration emits one even round
# key at 0($key) and, unless it was the last one, one odd round key at
# 16($key) before advancing $key by 32.  $rounds counts the seven even
# keys down, so the loop is left from its middle via "done_key256".
2391 &set_label("14rounds_alt",16);
2392 &movdqa ("xmm5",&QWP(0x00,"ebx"));
2393 &movdqa ("xmm4",&QWP(0x20,"ebx"));
2394 &mov ($rounds,7);
2395 &movdqu (&QWP(-32,$key),"xmm0");
2396 &movdqa ("xmm1","xmm2");
2397 &movdqu (&QWP(-16,$key),"xmm2");
2398
2399 &set_label("loop_key256");
# Even key: pshufb + aesenclast against the round constant on the previous
# odd key (presumably the RotWord/SubWord/Rcon step of the standard AES
# key expansion) ...
2400 &pshufb ("xmm2","xmm5");
2401 &aesenclast ("xmm2","xmm4");
2402
# ... xored with k ^ (k<<32) ^ (k<<64) ^ (k<<96) of the previous even key.
2403 &movdqa ("xmm3","xmm0");
2404 &pslldq ("xmm0",4);
2405 &pxor ("xmm3","xmm0");
2406 &pslldq ("xmm0",4);
2407 &pxor ("xmm3","xmm0");
2408 &pslldq ("xmm0",4);
2409 &pxor ("xmm0","xmm3");
2410 &pslld ("xmm4",1);
2411
2412 &pxor ("xmm0","xmm2");
2413 &movdqu (&QWP(0,$key),"xmm0");
2414
2415 &dec ($rounds);
2416 &jz (&label("done_key256"));
2417
# Odd key: broadcast the top dword of the new even key and run it through
# aesenclast with an all-zero round key (no round constant is mixed in).
2418 &pshufd ("xmm2","xmm0",0xff);
2419 &pxor ("xmm3","xmm3");
2420 &aesenclast ("xmm2","xmm3");
2421
# Each emitter is a statement of its own.  The movdqa below must be
# terminated: without the semicolon Perl reads it and the following
# pslldq as one bitwise-AND expression.
2422 &movdqa ("xmm3","xmm1");
2423 &pslldq ("xmm1",4);
2424 &pxor ("xmm3","xmm1");
2425 &pslldq ("xmm1",4);
2426 &pxor ("xmm3","xmm1");
2427 &pslldq ("xmm1",4);
2428 &pxor ("xmm1","xmm3");
2429
2430 &pxor ("xmm2","xmm1");
2431 &movdqu (&QWP(16,$key),"xmm2");
2432 &lea ($key,&DWP(32,$key));
2433 &movdqa ("xmm1","xmm2");
2434 &jmp (&label("loop_key256"));
2435
# $rounds was consumed as the loop counter, so it is set again before
# being stored 16 bytes past the last round key.  Execution then falls
# through to "good_key".
2436 &set_label("done_key256");
2437 &mov ($rounds,13);
2438 &mov (&DWP(16,$key),$rounds);
2439
# Exits of _aesni_set_encrypt_key.
# "good_key": clear xmm0..xmm5, which held key material, return 0 in eax.
# "bad_pointer": return -1 (null userKey or key argument).
# "bad_keybits": clear xmm0, which already holds the first 128 key bits,
# and return -2 (key length not 128, 192 or 256).
# Every exit pops ebx and ebp in the reverse order of the prologue pushes.
2440 &set_label("good_key");
2441 &pxor ("xmm0","xmm0");
2442 &pxor ("xmm1","xmm1");
2443 &pxor ("xmm2","xmm2");
2444 &pxor ("xmm3","xmm3");
2445 &pxor ("xmm4","xmm4");
2446 &pxor ("xmm5","xmm5");
2447 &xor ("eax","eax");
2448 &pop ("ebx");
2449 &pop ("ebp");
2450 &ret ();
2451
2452 &set_label("bad_pointer",4);
2453 &mov ("eax",-1);
2454 &pop ("ebx");
2455 &pop ("ebp");
2456 &ret ();
2457 &set_label("bad_keybits",4);
2458 &pxor ("xmm0","xmm0");
2459 &mov ("eax",-2);
2460 &pop ("ebx");
2461 &pop ("ebp");
2462 &ret ();
2463 &function_end_B("_aesni_set_encrypt_key");
2464
2465 # int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
2466 # AES_KEY *key)
# Public wrapper: moves the three stack arguments (userKey, bits, key)
# into the registers the private routine expects -- eax, $rounds and
# $key -- calls it, and returns its result code in eax unchanged.
2467 &function_begin_B("${PREFIX}_set_encrypt_key");
2468 &mov ("eax",&wparam(0));
2469 &mov ($rounds,&wparam(1));
2470 &mov ($key,&wparam(2));
2471 &call ("_aesni_set_encrypt_key");
2472 &ret ();
2473 &function_end_B("${PREFIX}_set_encrypt_key");
2474
2475 # int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
2476 # AES_KEY *key)
# Public decryption-key setup.
# Builds the encryption schedule with the private routine, then converts
# it in place: the round keys are reversed end-for-end and every key except
# the first and last is passed through aesimc.  A non-zero result code from
# the private routine is returned as is, without touching the schedule.
2477 &function_begin_B("${PREFIX}_set_decrypt_key");
2478 &mov ("eax",&wparam(0));
2479 &mov ($rounds,&wparam(1));
2480 &mov ($key,&wparam(2));
2481 &call ("_aesni_set_encrypt_key");
# The private routine advanced $key, so reload it.  $rounds comes back as
# rounds-1; shifted left by 4 it is a byte offset into the schedule.
2482 &mov ($key,&wparam(2));
2483 &shl ($rounds,4); # rounds-1 after _aesni_set_encrypt_key
2484 &test ("eax","eax");
2485 &jnz (&label("dec_key_ret"));
2486 &lea ("eax",&DWP(16,$key,$rounds)); # end of key schedule
2487
# First and last round keys only trade places.
2488 &$movekey ("xmm0",&QWP(0,$key)); # just swap
2489 &$movekey ("xmm1",&QWP(0,"eax"));
2490 &$movekey (&QWP(0,"eax"),"xmm0");
2491 &$movekey (&QWP(0,$key),"xmm1");
2492 &lea ($key,&DWP(16,$key));
2493 &lea ("eax",&DWP(-16,"eax"));
2494
# Walk inwards from both ends: $key moves up, eax moves down.  Both
# pointers are stepped before the stores, hence the 16(eax) and -16($key)
# addressing.  The loop runs while eax is still above $key.
2495 &set_label("dec_key_inverse");
2496 &$movekey ("xmm0",&QWP(0,$key)); # swap and inverse
2497 &$movekey ("xmm1",&QWP(0,"eax"));
2498 &aesimc ("xmm0","xmm0");
2499 &aesimc ("xmm1","xmm1");
2500 &lea ($key,&DWP(16,$key));
2501 &lea ("eax",&DWP(-16,"eax"));
2502 &$movekey (&QWP(16,"eax"),"xmm0");
2503 &$movekey (&QWP(-16,$key),"xmm1");
2504 &cmp ("eax",$key);
2505 &ja (&label("dec_key_inverse"));
2506
# The two pointers have met on the middle round key, which has no partner
# to swap with.
2507 &$movekey ("xmm0",&QWP(0,$key)); # inverse middle
2508 &aesimc ("xmm0","xmm0");
2509 &$movekey (&QWP(0,$key),"xmm0");
2510
# Clear the registers that held round keys.
2511 &pxor ("xmm0","xmm0");
2512 &pxor ("xmm1","xmm1");
2513 &xor ("eax","eax"); # return success
2514 &set_label("dec_key_ret");
2515 &ret ();
2516 &function_end_B("${PREFIX}_set_decrypt_key");
2517
# Constant table used by the "_alt" key-schedule paths, addressed through
# ebx, followed by the module's identification string.
#   +0x00  pshufb mask loaded by 10rounds_alt and 14rounds_alt
#   +0x10  pshufb mask loaded by 12rounds_alt
#   +0x20  initial round-constant vector (1 in every dword), doubled with
#          pslld as the schedule progresses
#   +0x30  round-constant vector 0x1b, loaded by 10rounds_alt for its last
#          two round keys
2518 &set_label("key_const",64);
2519 &data_word(0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d);
2520 &data_word(0x04070605,0x04070605,0x04070605,0x04070605);
2521 &data_word(1,1,1,1);
2522 &data_word(0x1b,0x1b,0x1b,0x1b);
2523 &asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");
2524
# Finish the perlasm run (helper defined outside this section).
2525 &asm_finish();