]> git.ipfire.org Git - thirdparty/openssl.git/blame - crypto/aes/asm/vpaes-x86.pl
Update copyright year
[thirdparty/openssl.git] / crypto / aes / asm / vpaes-x86.pl
CommitLineData
#! /usr/bin/env perl
# Copyright 2011-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


######################################################################
## Constant-time SSSE3 AES core implementation.
## version 0.1
##
## By Mike Hamburg (Stanford University), 2009
## Public domain.
##
## For details see http://shiftleft.org/papers/vector_aes/ and
## http://crypto.stanford.edu/vpaes/.

######################################################################
# September 2011.
#
# Port vpaes-x86_64.pl as 32-bit "almost" drop-in replacement for
# aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt
# doesn't handle partial vectors (doesn't have to if called from
# EVP only). "Drop-in" implies that this module doesn't share key
# schedule structure with the original nor does it make assumption
# about its alignment...
#
# Performance summary. aes-586.pl column lists large-block CBC
# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
# byte processed with 128-bit key, and vpaes-x86.pl column - [also
# large-block CBC] encrypt/decrypt.
#
#               aes-586.pl              vpaes-x86.pl
#
# Core 2(**)    28.1/41.4/18.3          21.9/25.2(***)
# Nehalem       27.9/40.4/18.1          10.2/11.9
# Atom          70.7/92.1/60.1          61.1/75.4(***)
# Silvermont    45.4/62.9/24.1          49.2/61.1(***)
#
# (*)   "Hyper-threading" in the context refers rather to cache shared
#       among multiple cores, than to specifically Intel HTT. As vast
#       majority of contemporary cores share cache, slower code path
#       is common place. In other words "with-hyper-threading-off"
#       results are presented mostly for reference purposes.
#
# (**)  "Core 2" refers to initial 65nm design, a.k.a. Conroe.
#
# (***) Less impressive improvement on Core 2 and Atom is due to slow
#       pshufb, yet it's respectable +28%/64% improvement on Core 2
#       and +15% on Atom (as implied, over "hyper-threading-safe"
#       code path).
#
#                                               <appro@openssl.org>

# Locate the directory this script lives in so that the perlasm helper
# (x86asm.pl) can be found either next to it or in ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# The last command-line argument names the output file.  Redirect STDOUT
# to it and fail loudly if that is impossible, rather than silently
# emitting the generated assembly to whatever STDOUT happened to be.
$output = pop;
if ($output) {
    open(STDOUT, '>', $output) or die "can't open $output: $!";
}

# $x86only is true when the last remaining argument is the string "386".
&asm_init($ARGV[0],$x86only = $ARGV[$#ARGV] eq "386");

# Prefix of every exported symbol generated below.
$PREFIX="vpaes";

# Symbolic names for the general-purpose registers used throughout.
my  ($round, $base, $magic, $key, $const, $inp, $out)=
    ("eax",  "ebx", "ecx",  "edx","ebp",  "esi","edi");
69
##
## Constant tables.
##
## Each $k_xxx below is the byte offset of a table relative to
## _vpaes_consts+0x30, which is the value the public entry points load
## into $const; that is why the first table ($k_inv) sits at -0x30.
## Every &data_word line emits one 16-byte row.
##
&static_label("_vpaes_consts");
&static_label("_vpaes_schedule_low_round");

&set_label("_vpaes_consts",64);
$k_inv=-0x30;           # inv, inva
    &data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309);
    &data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C);

$k_s0F=-0x10;           # s0F
    &data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F);

$k_ipt=0x00;            # input transform (lo, hi)
    &data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090);
    &data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC);

$k_sb1=0x20;            # sb1u, sb1t
    &data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E);
    &data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1);
$k_sb2=0x40;            # sb2u, sb2t
    &data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955);
    &data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8);
$k_sbo=0x60;            # sbou, sbot
    &data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A);
    &data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1);

$k_mc_forward=0x80;     # mc_forward
    &data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D);
    &data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201);
    &data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605);
    &data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09);

$k_mc_backward=0xc0;    # mc_backward
    &data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F);
    &data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B);
    &data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407);
    &data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003);

$k_sr=0x100;            # sr
    &data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C);
    &data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C);
    &data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C);
    &data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C);

$k_rcon=0x140;          # rcon
    &data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808);

$k_s63=0x150;           # s63: all equal to 0x63 transformed
    &data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B);

$k_opt=0x160;           # output transform
    &data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121);
    &data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1);

$k_deskew=0x180;        # deskew tables: inverts the sbox's "skew"
    &data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A);
    &data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB);
##
##  Decryption stuff
##  Key schedule constants
##
$k_dksd=0x1a0;          # decryption key schedule: invskew x*D
    &data_word(0xA3E44700,0xFEB91A5D,0x5A1DBEF9,0x0740E3A4);
    &data_word(0xB5368300,0x41C277F4,0xAB289D1E,0x5FDC69EA);
$k_dksb=0x1c0;          # decryption key schedule: invskew x*B
    &data_word(0x8550D500,0x9A4FCA1F,0x1CC94C99,0x03D65386);
    &data_word(0xB6FC4A00,0x115BEDA7,0x7E3482C8,0xD993256F);
$k_dkse=0x1e0;          # decryption key schedule: invskew x*E + 0x63
    &data_word(0x1FC9D600,0xD5031CCA,0x994F5086,0x53859A4C);
    &data_word(0x4FDC7BE8,0xA2319605,0x20B31487,0xCD5EF96A);
$k_dks9=0x200;          # decryption key schedule: invskew x*9
    &data_word(0x7ED9A700,0xB6116FC8,0x82255BFC,0x4AED9334);
    &data_word(0x27143300,0x45765162,0xE9DAFDCE,0x8BB89FAC);

##
##  Decryption stuff
##  Round function constants
##
$k_dipt=0x220;          # decryption input transform
    &data_word(0x0B545F00,0x0F505B04,0x114E451A,0x154A411E);
    &data_word(0x60056500,0x86E383E6,0xF491F194,0x12771772);

$k_dsb9=0x240;          # decryption sbox output *9*u, *9*t
    &data_word(0x9A86D600,0x851C0353,0x4F994CC9,0xCAD51F50);
    &data_word(0xECD74900,0xC03B1789,0xB2FBA565,0x725E2C9E);
$k_dsbd=0x260;          # decryption sbox output *D*u, *D*t
    &data_word(0xE6B1A200,0x7D57CCDF,0x882A4439,0xF56E9B13);
    &data_word(0x24C6CB00,0x3CE2FAF7,0x15DEEFD3,0x2931180D);
$k_dsbb=0x280;          # decryption sbox output *B*u, *B*t
    &data_word(0x96B44200,0xD0226492,0xB0F2D404,0x602646F6);
    &data_word(0xCD596700,0xC19498A6,0x3255AA6B,0xF3FF0C3E);
$k_dsbe=0x2a0;          # decryption sbox output *E*u, *E*t
    &data_word(0x26D4D000,0x46F29296,0x64B4F6B0,0x22426004);
    &data_word(0xFFAAC100,0x0C55A6CD,0x98593E32,0x9467F36B);
$k_dsbo=0x2c0;          # decryption sbox final output
    &data_word(0x7EF94000,0x1387EA53,0xD4943E2D,0xC7AA6DB9);
    &data_word(0x93441D00,0x12D7560F,0xD8C58E9C,0xCA4B8159);
&asciz  ("Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)");
&align  (64);
168
##
##  _vpaes_preheat
##
##  Turns the position-independent offset in $const into an absolute
##  pointer and loads the two tables every round needs.
##
##  Inputs:
##     $const = (_vpaes_consts+0x30) - (address of the instruction
##              following the call), as set up by the callers' lea
##
##  Outputs:
##     $const = _vpaes_consts+0x30
##     %xmm7  = $k_inv table row
##     %xmm6  = $k_s0F mask
##
&function_begin_B("_vpaes_preheat");
    &add    ($const,&DWP(0,"esp"));         # add our own return address
    &movdqa ("xmm7",&QWP($k_inv,$const));
    &movdqa ("xmm6",&QWP($k_s0F,$const));
    &ret    ();
&function_end_B("_vpaes_preheat");
175
##
##  _vpaes_encrypt_core
##
##  AES-encrypt %xmm0.
##
##  Inputs:
##     %xmm0 = input
##     %xmm6-%xmm7 as in _vpaes_preheat
##    (%edx) = scheduled keys; the round count is read from 240(%edx)
##     %ebp  = constant table pointer ($const) as left by _vpaes_preheat
##
##  Output in %xmm0
##  Clobbers  %xmm1-%xmm5, %eax, %ebx, %ecx, %edx
##
##
&function_begin_B("_vpaes_encrypt_core");
    &mov    ($magic,16);
    &mov    ($round,&DWP(240,$key));
    # NB: each emitter call must be its own statement.  Without the
    # terminating semicolon Perl would parse this call and the next one
    # as a single bitwise-AND expression.
    &movdqa ("xmm1","xmm6");
    &movdqa ("xmm2",&QWP($k_ipt,$const));
    &pandn  ("xmm1","xmm0");
    &pand   ("xmm0","xmm6");
    &movdqu ("xmm5",&QWP(0,$key));          # 5 = round key 0
    &pshufb ("xmm2","xmm0");
    &movdqa ("xmm0",&QWP($k_ipt+16,$const));
    &pxor   ("xmm2","xmm5");
    &psrld  ("xmm1",4);
    &add    ($key,16);
    &pshufb ("xmm0","xmm1");
    &lea    ($base,&DWP($k_mc_backward,$const));
    &pxor   ("xmm0","xmm2");
    &jmp    (&label("enc_entry"));


&set_label("enc_loop",16);
    # middle of middle round
    &movdqa ("xmm4",&QWP($k_sb1,$const));   # 4 : sb1u
    &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t
    &pshufb ("xmm4","xmm2");                # 4 = sb1u
    &pshufb ("xmm0","xmm3");                # 0 = sb1t
    &pxor   ("xmm4","xmm5");                # 4 = sb1u + k
    &movdqa ("xmm5",&QWP($k_sb2,$const));   # 5 : sb2u
    &pxor   ("xmm0","xmm4");                # 0 = A
    &movdqa ("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[]
    &pshufb ("xmm5","xmm2");                # 5 = sb2u
    &movdqa ("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t
    &movdqa ("xmm4",&QWP(0,$base,$magic));  # .Lk_mc_backward[]
    &pshufb ("xmm2","xmm3");                # 2 = sb2t
    &movdqa ("xmm3","xmm0");                # 3 = A
    &pxor   ("xmm2","xmm5");                # 2 = 2A
    &pshufb ("xmm0","xmm1");                # 0 = B
    &add    ($key,16);                      # next key
    &pxor   ("xmm0","xmm2");                # 0 = 2A+B
    &pshufb ("xmm3","xmm4");                # 3 = D
    &add    ($magic,16);                    # next mc
    &pxor   ("xmm3","xmm0");                # 3 = 2A+B+D
    &pshufb ("xmm0","xmm1");                # 0 = 2B+C
    &and    ($magic,0x30);                  # ... mod 4
    &sub    ($round,1);                     # nr--
    &pxor   ("xmm0","xmm3");                # 0 = 2A+3B+C+D

&set_label("enc_entry");
    # top of round
    &movdqa ("xmm1","xmm6");                # 1 : i
    &movdqa ("xmm5",&QWP($k_inv+16,$const));# 5 : a/k
    &pandn  ("xmm1","xmm0");                # 1 = i<<4
    &psrld  ("xmm1",4);                     # 1 = i
    &pand   ("xmm0","xmm6");                # 0 = k
    &pshufb ("xmm5","xmm0");                # 5 = a/k
    &movdqa ("xmm3","xmm7");                # 3 : 1/i
    &pxor   ("xmm0","xmm1");                # 0 = j
    &pshufb ("xmm3","xmm1");                # 3 = 1/i
    &movdqa ("xmm4","xmm7");                # 4 : 1/j
    &pxor   ("xmm3","xmm5");                # 3 = iak = 1/i + a/k
    &pshufb ("xmm4","xmm0");                # 4 = 1/j
    &movdqa ("xmm2","xmm7");                # 2 : 1/iak
    &pxor   ("xmm4","xmm5");                # 4 = jak = 1/j + a/k
    &pshufb ("xmm2","xmm3");                # 2 = 1/iak
    &movdqa ("xmm3","xmm7");                # 3 : 1/jak
    &pxor   ("xmm2","xmm0");                # 2 = io
    &pshufb ("xmm3","xmm4");                # 3 = 1/jak
    &movdqu ("xmm5",&QWP(0,$key));          # 5 = current round key
    &pxor   ("xmm3","xmm1");                # 3 = jo
    # The branch consumes the flags of the last integer instruction
    # (the sub above, or the add before the initial jump here); the SSE
    # instructions in between do not alter them.
    &jnz    (&label("enc_loop"));

    # middle of last round
    &movdqa ("xmm4",&QWP($k_sbo,$const));   # 4 : sbou      .Lk_sbo
    &movdqa ("xmm0",&QWP($k_sbo+16,$const));# 0 : sbot      .Lk_sbo+16
    &pshufb ("xmm4","xmm2");                # 4 = sbou
    &pxor   ("xmm4","xmm5");                # 4 = sbou + k
    &pshufb ("xmm0","xmm3");                # 0 = sbot
    &movdqa ("xmm1",&QWP(0x40,$base,$magic));# .Lk_sr[]
    &pxor   ("xmm0","xmm4");                # 0 = A
    &pshufb ("xmm0","xmm1");
    &ret    ();
&function_end_B("_vpaes_encrypt_core");
271
##
##  Decryption core
##
##  Same API as encryption core: block in %xmm0, %xmm6-%xmm7 as in
##  _vpaes_preheat, scheduled keys at (%edx) with the round count at
##  240(%edx), constant table pointer in %ebp.  Result in %xmm0.
##
&function_begin_B("_vpaes_decrypt_core");
    # $base is anchored at the dsbd table; the other decryption tables
    # are reached with small displacements from it.
    &lea    ($base,&DWP($k_dsbd,$const));
    &mov    ($round,&DWP(240,$key));
    &movdqa ("xmm1","xmm6");
    &movdqa ("xmm2",&QWP($k_dipt-$k_dsbd,$base));
    &pandn  ("xmm1","xmm0");
    &mov    ($magic,$round);
    # NB: each emitter call must be its own statement.  Without the
    # terminating semicolon Perl would parse this call and the next one
    # as a single bitwise-AND expression.
    &psrld  ("xmm1",4);
    &movdqu ("xmm5",&QWP(0,$key));          # 5 = round key 0
    &shl    ($magic,4);
    &pand   ("xmm0","xmm6");
    &pshufb ("xmm2","xmm0");
    &movdqa ("xmm0",&QWP($k_dipt-$k_dsbd+16,$base));
    &xor    ($magic,0x30);
    &pshufb ("xmm0","xmm1");
    &and    ($magic,0x30);
    &pxor   ("xmm2","xmm5");
    &movdqa ("xmm5",&QWP($k_mc_forward+48,$const));
    &pxor   ("xmm0","xmm2");
    &add    ($key,16);
    # $magic becomes a direct pointer to the selected row of the sr table
    &lea    ($magic,&DWP($k_sr-$k_dsbd,$base,$magic));
    &jmp    (&label("dec_entry"));

&set_label("dec_loop",16);
##
##  Inverse mix columns
##
    &movdqa ("xmm4",&QWP(-0x20,$base));     # 4 : sb9u
    &movdqa ("xmm1",&QWP(-0x10,$base));     # 1 : sb9t
    &pshufb ("xmm4","xmm2");                # 4 = sb9u
    &pshufb ("xmm1","xmm3");                # 1 = sb9t
    &pxor   ("xmm0","xmm4");                # 0 holds the round key here
    &movdqa ("xmm4",&QWP(0,$base));         # 4 : sbdu
    &pxor   ("xmm0","xmm1");                # 0 = ch
    &movdqa ("xmm1",&QWP(0x10,$base));      # 1 : sbdt

    &pshufb ("xmm4","xmm2");                # 4 = sbdu
    &pshufb ("xmm0","xmm5");                # MC ch
    &pshufb ("xmm1","xmm3");                # 1 = sbdt
    &pxor   ("xmm0","xmm4");                # 0 = ch
    &movdqa ("xmm4",&QWP(0x20,$base));      # 4 : sbbu
    &pxor   ("xmm0","xmm1");                # 0 = ch
    &movdqa ("xmm1",&QWP(0x30,$base));      # 1 : sbbt

    &pshufb ("xmm4","xmm2");                # 4 = sbbu
    &pshufb ("xmm0","xmm5");                # MC ch
    &pshufb ("xmm1","xmm3");                # 1 = sbbt
    &pxor   ("xmm0","xmm4");                # 0 = ch
    &movdqa ("xmm4",&QWP(0x40,$base));      # 4 : sbeu
    &pxor   ("xmm0","xmm1");                # 0 = ch
    &movdqa ("xmm1",&QWP(0x50,$base));      # 1 : sbet

    &pshufb ("xmm4","xmm2");                # 4 = sbeu
    &pshufb ("xmm0","xmm5");                # MC ch
    &pshufb ("xmm1","xmm3");                # 1 = sbet
    &pxor   ("xmm0","xmm4");                # 0 = ch
    &add    ($key,16);                      # next round key
    &palignr("xmm5","xmm5",12);
    &pxor   ("xmm0","xmm1");                # 0 = ch
    &sub    ($round,1);                     # nr--

&set_label("dec_entry");
    # top of round
    &movdqa ("xmm1","xmm6");                # 1 : i
    &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
    &pandn  ("xmm1","xmm0");                # 1 = i<<4
    &pand   ("xmm0","xmm6");                # 0 = k
    &psrld  ("xmm1",4);                     # 1 = i
    &pshufb ("xmm2","xmm0");                # 2 = a/k
    &movdqa ("xmm3","xmm7");                # 3 : 1/i
    &pxor   ("xmm0","xmm1");                # 0 = j
    &pshufb ("xmm3","xmm1");                # 3 = 1/i
    &movdqa ("xmm4","xmm7");                # 4 : 1/j
    &pxor   ("xmm3","xmm2");                # 3 = iak = 1/i + a/k
    &pshufb ("xmm4","xmm0");                # 4 = 1/j
    &pxor   ("xmm4","xmm2");                # 4 = jak = 1/j + a/k
    &movdqa ("xmm2","xmm7");                # 2 : 1/iak
    &pshufb ("xmm2","xmm3");                # 2 = 1/iak
    &movdqa ("xmm3","xmm7");                # 3 : 1/jak
    &pxor   ("xmm2","xmm0");                # 2 = io
    &pshufb ("xmm3","xmm4");                # 3 = 1/jak
    &movdqu ("xmm0",&QWP(0,$key));          # 0 = current round key
    &pxor   ("xmm3","xmm1");                # 3 = jo
    # The branch consumes the flags of the last integer instruction
    # (the sub above, or the add before the initial jump here); the SSE
    # instructions in between do not alter them.
    &jnz    (&label("dec_loop"));

    # middle of last round
    &movdqa ("xmm4",&QWP(0x60,$base));      # 4 : sbou
    &pshufb ("xmm4","xmm2");                # 4 = sbou
    &pxor   ("xmm4","xmm0");                # 4 = sbou + k
    &movdqa ("xmm0",&QWP(0x70,$base));      # 0 : sbot
    &movdqa ("xmm2",&QWP(0,$magic));        # sr row chosen before the loop
    &pshufb ("xmm0","xmm3");                # 0 = sbot
    &pxor   ("xmm0","xmm4");                # 0 = A
    &pshufb ("xmm0","xmm2");
    &ret    ();
&function_end_B("_vpaes_decrypt_core");
373
########################################################
##                                                    ##
##                  AES key schedule                  ##
##                                                    ##
########################################################
##
##  _vpaes_schedule_core
##
##  Inputs (as set up by the set_{en,de}crypt_key wrappers):
##     $inp   (%esi) = user key
##     $round (%eax) = key length in bits
##     $key   (%edx) = where the first round key is written
##     $out   (%edi) = zero when encrypting, non-zero when decrypting
##     $magic (%ecx) = initial byte offset into the sr table
##     $const (%ebp) = PIC offset, made absolute on entry
##
##  The caller provides a 16-byte aligned scratch frame.  The rcon
##  value lives at 4(%esp) as seen from here, which is the same slot
##  the nested helpers address as 8(%esp), one return address deeper.
##  20(%esp) is used to park %xmm7 in the 256-bit path.
##
&function_begin_B("_vpaes_schedule_core");
    &add    ($const,&DWP(0,"esp"));         # add our own return address
    &movdqu ("xmm0",&QWP(0,$inp));          # load key (unaligned)
    &movdqa ("xmm2",&QWP($k_rcon,$const));  # load rcon

    # input transform
    &movdqa ("xmm3","xmm0");
    &lea    ($base,&DWP($k_ipt,$const));
    &movdqa (&QWP(4,"esp"),"xmm2");         # xmm8
    &call   ("_vpaes_schedule_transform");
    &movdqa ("xmm7","xmm0");

    &test   ($out,$out);
    &jnz    (&label("schedule_am_decrypting"));

    # encrypting, output zeroth round key after transform
    &movdqu (&QWP(0,$key),"xmm0");
    &jmp    (&label("schedule_go"));

&set_label("schedule_am_decrypting");
    # decrypting, output zeroth round key after shiftrows
    &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
    &pshufb ("xmm3","xmm1");
    &movdqu (&QWP(0,$key),"xmm3");
    &xor    ($magic,0x30);

&set_label("schedule_go");
    &cmp    ($round,192);
    &ja     (&label("schedule_256"));
    &je     (&label("schedule_192"));
    # 128: fall through

##
##  .schedule_128
##
##  128-bit specific part of key schedule.
##
##  This schedule is really simple, because all its parts
##  are accomplished by the subroutines.
##
&set_label("schedule_128");
    &mov    ($round,10);

&set_label("loop_schedule_128");
    &call   ("_vpaes_schedule_round");
    &dec    ($round);
    &jz     (&label("schedule_mangle_last"));
    &call   ("_vpaes_schedule_mangle");     # write output
    &jmp    (&label("loop_schedule_128"));

##
##  .aes_schedule_192
##
##  192-bit specific part of key schedule.
##
##  The main body of this schedule is the same as the 128-bit
##  schedule, but with more smearing.  The long, high side is
##  stored in %xmm7 as before, and the short, low side is in
##  the high bits of %xmm6.
##
##  This schedule is somewhat nastier, however, because each
##  round produces 192 bits of key material, or 1.5 round keys.
##  Therefore, on each cycle we do 2 rounds and produce 3 round
##  keys.
##
&set_label("schedule_192",16);
    &movdqu ("xmm0",&QWP(8,$inp));          # load key part 2 (very unaligned)
    &call   ("_vpaes_schedule_transform");  # input transform
    &movdqa ("xmm6","xmm0");                # save short part
    &pxor   ("xmm4","xmm4");                # clear 4
    &movhlps("xmm6","xmm4");                # clobber low side with zeros
    &mov    ($round,4);

&set_label("loop_schedule_192");
    &call   ("_vpaes_schedule_round");
    &palignr("xmm0","xmm6",8);
    &call   ("_vpaes_schedule_mangle");     # save key n
    &call   ("_vpaes_schedule_192_smear");
    &call   ("_vpaes_schedule_mangle");     # save key n+1
    &call   ("_vpaes_schedule_round");
    &dec    ($round);
    &jz     (&label("schedule_mangle_last"));
    &call   ("_vpaes_schedule_mangle");     # save key n+2
    &call   ("_vpaes_schedule_192_smear");
    &jmp    (&label("loop_schedule_192"));

##
##  .aes_schedule_256
##
##  256-bit specific part of key schedule.
##
##  The structure here is very similar to the 128-bit
##  schedule, but with an additional "low side" in
##  %xmm6.  The low side's rounds are the same as the
##  high side's, except no rcon and no rotation.
##
&set_label("schedule_256",16);
    &movdqu ("xmm0",&QWP(16,$inp));         # load key part 2 (unaligned)
    &call   ("_vpaes_schedule_transform");  # input transform
    &mov    ($round,7);

&set_label("loop_schedule_256");
    &call   ("_vpaes_schedule_mangle");     # output low result
    &movdqa ("xmm6","xmm0");                # save cur_lo in xmm6

    # high round
    &call   ("_vpaes_schedule_round");
    &dec    ($round);
    &jz     (&label("schedule_mangle_last"));
    &call   ("_vpaes_schedule_mangle");

    # low round. swap xmm7 and xmm6
    &pshufd ("xmm0","xmm0",0xFF);
    &movdqa (&QWP(20,"esp"),"xmm7");        # park the high side on the stack
    &movdqa ("xmm7","xmm6");
    &call   ("_vpaes_schedule_low_round");
    &movdqa ("xmm7",&QWP(20,"esp"));        # and bring it back

    &jmp    (&label("loop_schedule_256"));

##
##  .aes_schedule_mangle_last
##
##  Mangler for last round of key schedule
##  Mangles %xmm0
##    when encrypting, outputs out(%xmm0) ^ 63
##    when decrypting, outputs unskew(%xmm0)
##
##  Always called right before return... jumps to cleanup and exits
##
&set_label("schedule_mangle_last",16);
    # schedule last round key from xmm0
    &lea    ($base,&DWP($k_deskew,$const));
    &test   ($out,$out);
    &jnz    (&label("schedule_mangle_last_dec"));

    # encrypting
    &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
    &pshufb ("xmm0","xmm1");                # output permute
    &lea    ($base,&DWP($k_opt,$const));    # prepare to output transform
    &add    ($key,32);

&set_label("schedule_mangle_last_dec");
    &add    ($key,-16);
    &pxor   ("xmm0",&QWP($k_s63,$const));
    &call   ("_vpaes_schedule_transform");  # output transform
    &movdqu (&QWP(0,$key),"xmm0");          # save last key

    # cleanup: leave no key material behind in the vector registers
    &pxor   ("xmm0","xmm0");
    &pxor   ("xmm1","xmm1");
    &pxor   ("xmm2","xmm2");
    &pxor   ("xmm3","xmm3");
    &pxor   ("xmm4","xmm4");
    &pxor   ("xmm5","xmm5");
    &pxor   ("xmm6","xmm6");
    &pxor   ("xmm7","xmm7");
    &ret    ();
&function_end_B("_vpaes_schedule_core");
538
##
##  .aes_schedule_192_smear
##
##  Smear the short, low side in the 192-bit key schedule.
##
##  Inputs:
##    %xmm7: high side, b  a  x  y
##    %xmm6:  low side, d  c  0  0
##
##  Outputs:
##    %xmm6: b+c+d  b+c  0  0
##    %xmm0: b+c+d  b+c  b  a
##
##  Clobbers %xmm1 (it is zeroed here and used to clear the low half
##  of %xmm6, so no zero register has to be passed in).
##
&function_begin_B("_vpaes_schedule_192_smear");
    &pshufd ("xmm1","xmm6",0x80);           # d c 0 0 -> c 0 0 0
    &pshufd ("xmm0","xmm7",0xFE);           # b a _ _ -> b b b a
    &pxor   ("xmm6","xmm1");                # -> c+d c 0 0
    &pxor   ("xmm1","xmm1");
    &pxor   ("xmm6","xmm0");                # -> b+c+d b+c b a
    &movdqa ("xmm0","xmm6");
    &movhlps("xmm6","xmm1");                # clobber low side with zeros
    &ret    ();
&function_end_B("_vpaes_schedule_192_smear");
563
##
##  .aes_schedule_round
##
##  Runs one main round of the key schedule on %xmm0, %xmm7
##
##  Specifically, runs subbytes on the high dword of %xmm0
##  then rotates it by one byte and xors into the low dword of
##  %xmm7.
##
##  Adds rcon from the 16-byte stack slot at 8(%esp) (tagged "xmm8"
##  in the comments below), then rotates that slot for the next rcon.
##
##  Smears the dwords of %xmm7 by xoring the low into the
##  second low, result into third, result into highest.
##
##  Returns results in %xmm7 = %xmm0.
##  Clobbers %xmm1-%xmm5.
##
##  _vpaes_schedule_low_round is a second entry point into the same
##  body that skips the rcon and rotation steps.
##
&function_begin_B("_vpaes_schedule_round");
    # extract rcon from xmm8
    &movdqa ("xmm2",&QWP(8,"esp"));         # xmm8
    &pxor   ("xmm1","xmm1");
    &palignr("xmm1","xmm2",15);
    &palignr("xmm2","xmm2",15);
    &pxor   ("xmm7","xmm1");

    # rotate
    &pshufd ("xmm0","xmm0",0xFF);
    &palignr("xmm0","xmm0",1);

    # fall through...
    &movdqa (&QWP(8,"esp"),"xmm2");         # xmm8

    # low round: same as high round, but no rotation and no rcon.
&set_label("_vpaes_schedule_low_round");
    # smear xmm7
    &movdqa ("xmm1","xmm7");
    &pslldq ("xmm7",4);
    &pxor   ("xmm7","xmm1");
    &movdqa ("xmm1","xmm7");
    &pslldq ("xmm7",8);
    &pxor   ("xmm7","xmm1");
    &pxor   ("xmm7",&QWP($k_s63,$const));

    # subbyte
    &movdqa ("xmm4",&QWP($k_s0F,$const));
    &movdqa ("xmm5",&QWP($k_inv,$const));   # 5 : inv
    &movdqa ("xmm1","xmm4");
    &pandn  ("xmm1","xmm0");
    &psrld  ("xmm1",4);                     # 1 = i
    &pand   ("xmm0","xmm4");                # 0 = k
    &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
    &pshufb ("xmm2","xmm0");                # 2 = a/k
    &pxor   ("xmm0","xmm1");                # 0 = j
    &movdqa ("xmm3","xmm5");                # 3 : 1/i
    &pshufb ("xmm3","xmm1");                # 3 = 1/i
    &pxor   ("xmm3","xmm2");                # 3 = iak = 1/i + a/k
    &movdqa ("xmm4","xmm5");                # 4 : 1/j
    &pshufb ("xmm4","xmm0");                # 4 = 1/j
    &pxor   ("xmm4","xmm2");                # 4 = jak = 1/j + a/k
    &movdqa ("xmm2","xmm5");                # 2 : 1/iak
    &pshufb ("xmm2","xmm3");                # 2 = 1/iak
    &pxor   ("xmm2","xmm0");                # 2 = io
    &movdqa ("xmm3","xmm5");                # 3 : 1/jak
    &pshufb ("xmm3","xmm4");                # 3 = 1/jak
    &pxor   ("xmm3","xmm1");                # 3 = jo
    &movdqa ("xmm4",&QWP($k_sb1,$const));   # 4 : sb1u
    &pshufb ("xmm4","xmm2");                # 4 = sb1u
    &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t
    &pshufb ("xmm0","xmm3");                # 0 = sb1t
    &pxor   ("xmm0","xmm4");                # 0 = sbox output

    # add in smeared stuff
    &pxor   ("xmm0","xmm7");
    &movdqa ("xmm7","xmm0");
    &ret    ();
&function_end_B("_vpaes_schedule_round");
641
##
##  .aes_schedule_transform
##
##  Linear-transform %xmm0 according to tables at (%ebx):
##  the low nibbles index the row at 0(%ebx), the high nibbles the
##  row at 16(%ebx), and the two lookups are xored together.
##
##  Output in %xmm0
##  Clobbers %xmm1, %xmm2
##
&function_begin_B("_vpaes_schedule_transform");
    &movdqa ("xmm2",&QWP($k_s0F,$const));   # 2 = nibble mask
    &movdqa ("xmm1","xmm2");
    &pandn  ("xmm1","xmm0");
    &psrld  ("xmm1",4);                     # 1 = high nibbles
    &pand   ("xmm0","xmm2");                # 0 = low nibbles
    &movdqa ("xmm2",&QWP(0,$base));
    &pshufb ("xmm2","xmm0");
    &movdqa ("xmm0",&QWP(16,$base));
    &pshufb ("xmm0","xmm1");
    &pxor   ("xmm0","xmm2");
    &ret    ();
&function_end_B("_vpaes_schedule_transform");
663
##
##  .aes_schedule_mangle
##
##  Mangle xmm0 from (basis-transformed) standard version
##  to our version.
##
##  On encrypt,
##    xor with 0x63
##    multiply by circulant 0,1,1,1
##    apply shiftrows transform
##
##  On decrypt,
##    xor with 0x63
##    multiply by "inverse mixcolumns" circulant E,B,D,9
##    deskew
##    apply shiftrows transform
##
##
##  Writes out to (%edx), and increments or decrements it
##  Keeps track of round number mod 4 in %ecx
##  Preserves xmm0
##  Clobbers xmm1-xmm5
##  On the decrypt path %esi ($inp) is overwritten with a table pointer.
##
&function_begin_B("_vpaes_schedule_mangle");
    &movdqa ("xmm4","xmm0");        # save xmm0 for later
    &movdqa ("xmm5",&QWP($k_mc_forward,$const));
    &test   ($out,$out);
    &jnz    (&label("schedule_mangle_dec"));

    # encrypting
    &add    ($key,16);
    &pxor   ("xmm4",&QWP($k_s63,$const));
    &pshufb ("xmm4","xmm5");
    &movdqa ("xmm3","xmm4");
    &pshufb ("xmm4","xmm5");
    &pxor   ("xmm3","xmm4");
    &pshufb ("xmm4","xmm5");
    &pxor   ("xmm3","xmm4");

    &jmp    (&label("schedule_mangle_both"));

&set_label("schedule_mangle_dec",16);
    # inverse mix columns
    &movdqa ("xmm2",&QWP($k_s0F,$const));
    &lea    ($inp,&DWP($k_dksd,$const));    # $inp -> dksd/dksb/dkse/dks9
    &movdqa ("xmm1","xmm2");
    &pandn  ("xmm1","xmm4");
    &psrld  ("xmm1",4);                     # 1 = hi
    &pand   ("xmm4","xmm2");                # 4 = lo

    &movdqa ("xmm2",&QWP(0,$inp));
    &pshufb ("xmm2","xmm4");
    &movdqa ("xmm3",&QWP(0x10,$inp));
    &pshufb ("xmm3","xmm1");
    &pxor   ("xmm3","xmm2");
    &pshufb ("xmm3","xmm5");

    &movdqa ("xmm2",&QWP(0x20,$inp));
    &pshufb ("xmm2","xmm4");
    &pxor   ("xmm2","xmm3");
    &movdqa ("xmm3",&QWP(0x30,$inp));
    &pshufb ("xmm3","xmm1");
    &pxor   ("xmm3","xmm2");
    &pshufb ("xmm3","xmm5");

    &movdqa ("xmm2",&QWP(0x40,$inp));
    &pshufb ("xmm2","xmm4");
    &pxor   ("xmm2","xmm3");
    &movdqa ("xmm3",&QWP(0x50,$inp));
    &pshufb ("xmm3","xmm1");
    &pxor   ("xmm3","xmm2");
    &pshufb ("xmm3","xmm5");

    &movdqa ("xmm2",&QWP(0x60,$inp));
    &pshufb ("xmm2","xmm4");
    &pxor   ("xmm2","xmm3");
    &movdqa ("xmm3",&QWP(0x70,$inp));
    &pshufb ("xmm3","xmm1");
    &pxor   ("xmm3","xmm2");

    &add    ($key,-16);

&set_label("schedule_mangle_both");
    &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
    &pshufb ("xmm3","xmm1");
    &add    ($magic,-16);
    &and    ($magic,0x30);
    &movdqu (&QWP(0,$key),"xmm3");
    &ret    ();
&function_end_B("_vpaes_schedule_mangle");
754
#
# Interface to OpenSSL
#
# ${PREFIX}_set_encrypt_key(inp, bits, key)
#
# Expands the user key at inp (bits long) into an encryption schedule at
# key.  Stores bits/32+5 at offset 240 of key and returns 0 in %eax.
# A 16-byte aligned scratch frame is carved out below the stack pointer;
# the original %esp is kept at 48(%esp) and restored before returning.
&function_begin("${PREFIX}_set_encrypt_key");
    &mov    ($inp,&wparam(0));              # inp
    &lea    ($base,&DWP(-56,"esp"));
    &mov    ($round,&wparam(1));            # bits
    &and    ($base,-16);
    &mov    ($key,&wparam(2));              # key
    &xchg   ($base,"esp");                  # alloca
    &mov    (&DWP(48,"esp"),$base);

    &mov    ($base,$round);
    &shr    ($base,5);
    &add    ($base,5);
    &mov    (&DWP(240,$key),$base);         # AES_KEY->rounds = nbits/32+5;
    &mov    ($magic,0x30);
    &mov    ($out,0);                       # 0 = encrypting

    # $const = offset of the tables from the return address of the call
    # below; the callee adds that return address to make it absolute.
    &lea    ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
    &call   ("_vpaes_schedule_core");
&set_label("pic_point");

    &mov    ("esp",&DWP(48,"esp"));
    &xor    ("eax","eax");
&function_end("${PREFIX}_set_encrypt_key");
781
# ${PREFIX}_set_decrypt_key(inp, bits, key)
#
# Expands the user key at inp (bits long) into a decryption schedule at
# key.  Stores bits/32+5 at offset 240 of key, then points the output
# cursor at key + 16 + 16*(bits/32+5) so that the schedule is written
# from the far end backwards.  Returns 0 in %eax.
# A 16-byte aligned scratch frame is carved out below the stack pointer;
# the original %esp is kept at 48(%esp) and restored before returning.
&function_begin("${PREFIX}_set_decrypt_key");
    &mov    ($inp,&wparam(0));              # inp
    &lea    ($base,&DWP(-56,"esp"));
    &mov    ($round,&wparam(1));            # bits
    &and    ($base,-16);
    &mov    ($key,&wparam(2));              # key
    &xchg   ($base,"esp");                  # alloca
    &mov    (&DWP(48,"esp"),$base);

    &mov    ($base,$round);
    &shr    ($base,5);
    &add    ($base,5);
    &mov    (&DWP(240,$key),$base);         # AES_KEY->rounds = nbits/32+5;
    &shl    ($base,4);
    &lea    ($key,&DWP(16,$key,$base));

    &mov    ($out,1);                       # non-zero = decrypting
    &mov    ($magic,$round);
    &shr    ($magic,1);
    &and    ($magic,32);
    &xor    ($magic,32);                    # nbits==192?0:32;

    # $const = offset of the tables from the return address of the call
    # below; the callee adds that return address to make it absolute.
    &lea    ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
    &call   ("_vpaes_schedule_core");
&set_label("pic_point");

    &mov    ("esp",&DWP(48,"esp"));
    &xor    ("eax","eax");
&function_end("${PREFIX}_set_decrypt_key");
811
# ${PREFIX}_encrypt(inp, out, key)
#
# Encrypts the single 16-byte block at inp with the schedule at key and
# stores the result at out.  Input and output are accessed with
# unaligned loads/stores.
&function_begin("${PREFIX}_encrypt");
    # $const = offset of the tables from the return address of the call
    # below; _vpaes_preheat adds that return address to make it absolute.
    &lea    ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
    &call   ("_vpaes_preheat");
&set_label("pic_point");
    &mov    ($inp,&wparam(0));              # inp
    &lea    ($base,&DWP(-56,"esp"));
    &mov    ($out,&wparam(1));              # out
    &and    ($base,-16);
    &mov    ($key,&wparam(2));              # key
    &xchg   ($base,"esp");                  # alloca
    &mov    (&DWP(48,"esp"),$base);         # remember the original %esp

    &movdqu ("xmm0",&QWP(0,$inp));
    &call   ("_vpaes_encrypt_core");
    &movdqu (&QWP(0,$out),"xmm0");

    &mov    ("esp",&DWP(48,"esp"));
&function_end("${PREFIX}_encrypt");
830
# ${PREFIX}_decrypt(inp, out, key)
#
# Decrypts the single 16-byte block at inp with the schedule at key and
# stores the result at out.  Input and output are accessed with
# unaligned loads/stores.
&function_begin("${PREFIX}_decrypt");
    # $const = offset of the tables from the return address of the call
    # below; _vpaes_preheat adds that return address to make it absolute.
    &lea    ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
    &call   ("_vpaes_preheat");
&set_label("pic_point");
    &mov    ($inp,&wparam(0));              # inp
    &lea    ($base,&DWP(-56,"esp"));
    &mov    ($out,&wparam(1));              # out
    &and    ($base,-16);
    &mov    ($key,&wparam(2));              # key
    &xchg   ($base,"esp");                  # alloca
    &mov    (&DWP(48,"esp"),$base);         # remember the original %esp

    &movdqu ("xmm0",&QWP(0,$inp));
    &call   ("_vpaes_decrypt_core");
    &movdqu (&QWP(0,$out),"xmm0");

    &mov    ("esp",&DWP(48,"esp"));
&function_end("${PREFIX}_decrypt");
849
# ${PREFIX}_cbc_encrypt(inp, out, len, key, ivp, enc)
#
# CBC-mode encryption (enc != 0) or decryption (enc == 0) of len bytes
# from inp to out, 16 bytes at a time.  If len is below 16 the function
# returns immediately without touching anything; otherwise it keeps
# processing blocks while at least 16 bytes remain and finally writes
# the updated IV back to ivp.
#
# Scratch frame layout (16-byte aligned %esp):
#    0(%esp)  out - inp
#    4(%esp)  key
#    8(%esp)  ivp
#   16(%esp)  IV for the block being decrypted
#   32(%esp)  ciphertext of the current block (the next IV)
#   48(%esp)  original %esp
&function_begin("${PREFIX}_cbc_encrypt");
    &mov    ($inp,&wparam(0));              # inp
    &mov    ($out,&wparam(1));              # out
    &mov    ($round,&wparam(2));            # len
    &mov    ($key,&wparam(3));              # key
    &sub    ($round,16);
    &jc     (&label("cbc_abort"));          # len < 16: nothing to do
    &lea    ($base,&DWP(-56,"esp"));
    &mov    ($const,&wparam(4));            # ivp
    &and    ($base,-16);
    &mov    ($magic,&wparam(5));            # enc
    &xchg   ($base,"esp");                  # alloca
    &movdqu ("xmm1",&QWP(0,$const));        # load IV
    &sub    ($out,$inp);                    # $out = out - inp
    &mov    (&DWP(48,"esp"),$base);

    &mov    (&DWP(0,"esp"),$out);           # save out - inp
    # NB: each emitter call must be its own statement.  Without the
    # terminating semicolon Perl would parse this call and the next one
    # as a single bitwise-AND expression.
    &mov    (&DWP(4,"esp"),$key);           # save key
    &mov    (&DWP(8,"esp"),$const);         # save ivp
    &mov    ($out,$round);                  # $out works as $len

    # $const = offset of the tables from the return address of the call
    # below; _vpaes_preheat adds that return address to make it absolute.
    &lea    ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
    &call   ("_vpaes_preheat");
&set_label("pic_point");
    &cmp    ($magic,0);
    &je     (&label("cbc_dec_loop"));
    &jmp    (&label("cbc_enc_loop"));

&set_label("cbc_enc_loop",16);
    &movdqu ("xmm0",&QWP(0,$inp));          # load input
    &pxor   ("xmm0","xmm1");                # inp^=iv
    &call   ("_vpaes_encrypt_core");
    &mov    ($base,&DWP(0,"esp"));          # restore out - inp
    &mov    ($key,&DWP(4,"esp"));           # restore key
    &movdqa ("xmm1","xmm0");                # ciphertext is the next IV
    &movdqu (&QWP(0,$base,$inp),"xmm0");    # write output
    &lea    ($inp,&DWP(16,$inp));
    &sub    ($out,16);
    &jnc    (&label("cbc_enc_loop"));
    &jmp    (&label("cbc_done"));

&set_label("cbc_dec_loop",16);
    &movdqu ("xmm0",&QWP(0,$inp));          # load input
    &movdqa (&QWP(16,"esp"),"xmm1");        # save IV
    &movdqa (&QWP(32,"esp"),"xmm0");        # save future IV
    &call   ("_vpaes_decrypt_core");
    &mov    ($base,&DWP(0,"esp"));          # restore out - inp
    &mov    ($key,&DWP(4,"esp"));           # restore key
    &pxor   ("xmm0",&QWP(16,"esp"));        # out^=iv
    &movdqa ("xmm1",&QWP(32,"esp"));        # load next IV
    &movdqu (&QWP(0,$base,$inp),"xmm0");    # write output
    &lea    ($inp,&DWP(16,$inp));
    &sub    ($out,16);
    &jnc    (&label("cbc_dec_loop"));

&set_label("cbc_done");
    &mov    ($base,&DWP(8,"esp"));          # restore ivp
    &mov    ("esp",&DWP(48,"esp"));
    &movdqu (&QWP(0,$base),"xmm1");         # write IV
&set_label("cbc_abort");
&function_end("${PREFIX}_cbc_encrypt");
911
# Let the assembler front end emit everything collected so far.
&asm_finish();

# Buffered write errors only surface at close time, so check it.
close STDOUT or die "error closing STDOUT: $!";