]>
Commit | Line | Data |
---|---|---|
6aa36e8e RS |
1 | #! /usr/bin/env perl |
2 | # Copyright 2004-2016 The OpenSSL Project Authors. All Rights Reserved. | |
3 | # | |
4 | # Licensed under the OpenSSL license (the "License"). You may not use | |
5 | # this file except in compliance with the License. You can obtain a copy | |
6 | # in the file LICENSE in the source distribution or at | |
7 | # https://www.openssl.org/source/license.html | |
8 | ||
71314710 AP |
9 | # |
10 | # ==================================================================== | |
e3713c36 | 11 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL |
96b0f6c1 AP |
12 | # project. The module is, however, dual licensed under OpenSSL and |
13 | # CRYPTOGAMS licenses depending on where you obtain it. For further | |
14 | # details see http://www.openssl.org/~appro/cryptogams/. | |
71314710 AP |
15 | # ==================================================================== |
16 | # | |
6c69aa53 | 17 | # Version 4.3. |
e7e11507 | 18 | # |
71314710 | 19 | # You might fail to appreciate this module performance from the first |
e7e11507 AP |
20 | # try. If compared to "vanilla" linux-ia32-icc target, i.e. considered |
21 | # to be *the* best Intel C compiler without -KPIC, performance appears | |
22 | # to be virtually identical... But try to re-configure with shared | |
23 | # library support... Aha! Intel compiler "suddenly" lags behind by 30% | |
24 | # [on P4, more on others]:-) And if compared to position-independent | |
25 | # code generated by GNU C, this code performs *more* than *twice* as | |
26 | # fast! Yes, all this buzz about PIC means that unlike other hand- | |
27 | # coded implementations, this one was explicitly designed to be safe | |
28 | # to use even in shared library context... This also means that this | |
29 | # code isn't necessarily absolutely fastest "ever," because in order | |
30 | # to achieve position independence an extra register has to be | |
31 | # off-loaded to stack, which affects the benchmark result. | |
71314710 AP |
32 | # |
33 | # Special note about instruction choice. Do you recall RC4_INT code | |
34 | # performing poorly on P4? It might be the time to figure out why. | |
35 | # RC4_INT code implies effective address calculations in base+offset*4 | |
36 | # form. Trouble is that it seems that offset scaling turned to be | |
37 | # critical path... At least eliminating scaling resulted in 2.8x RC4 | |
38 | # performance improvement [as you might recall]. As AES code is hungry | |
39 | # for scaling too, I [try to] avoid the latter by favoring off-by-2 | |
40 | # shifts and masking the result with 0xFF<<2 instead of "boring" 0xFF. | |
3b3df98c | 41 | # |
e3713c36 | 42 | # As was shown by Dean Gaudet, the above note turned out to be |
3b3df98c AP |
43 | # void. Performance improvement with off-by-2 shifts was observed on |
44 | # intermediate implementation, which was spilling yet another register | |
45 | # to stack... Final offset*4 code below runs just a tad faster on P4, | |
46 | # but exhibits up to 10% improvement on other cores. | |
e7e11507 AP |
47 | # |
48 | # Second version is "monolithic" replacement for aes_core.c, which in | |
49 | # addition to AES_[de|en]crypt implements AES_set_[de|en]cryption_key. | |
50 | # This made it possible to implement little-endian variant of the | |
51 | # algorithm without modifying the base C code. Motivating factor for | |
52 | # the undertaken effort was that it appeared that in tight IA-32 | |
53 | # register window little-endian flavor could achieve slightly higher | |
54 | # Instruction Level Parallelism, and it indeed resulted in up to 15% | |
053fa39a | 55 | # better performance on most recent µ-archs... |
e7e11507 | 56 | # |
addb6e16 | 57 | # Third version adds AES_cbc_encrypt implementation, which resulted in |
46f4e1be JS |
58 | # up to 40% performance improvement of CBC benchmark results. 40% was |
59 | # observed on P4 core, where "overall" improvement coefficient, i.e. if | |
bac252a5 AP |
60 | # compared to PIC generated by GCC and in CBC mode, was observed to be |
61 | # as large as 4x:-) CBC performance is virtually identical to ECB now | |
c8d5c71a AP |
62 | # and on some platforms even better, e.g. 17.6 "small" cycles/byte on |
63 | # Opteron, because certain function prologues and epilogues are | |
addb6e16 | 64 | # effectively taken out of the loop... |
c8d5c71a AP |
65 | # |
66 | # Version 3.2 implements compressed tables and prefetch of these tables | |
67 | # in CBC[!] mode. Former means that 3/4 of table references are now | |
68 | # misaligned, which unfortunately has negative impact on elder IA-32 | |
69 | # implementations, Pentium suffered 30% penalty, PIII - 10%. | |
70 | # | |
3d5fd312 AP |
71 | # Version 3.3 avoids L1 cache aliasing between stack frame and |
72 | # S-boxes, and 3.4 - L1 cache aliasing even between key schedule. The | |
73 | # latter is achieved by copying the key schedule to controlled place in | |
74 | # stack. This unfortunately has rather strong impact on small block CBC | |
75 | # performance, ~2x deterioration on 16-byte block if compared to 3.3. | |
76 | # | |
53a20bfd | 77 | # Version 3.5 checks if there is L1 cache aliasing between user-supplied |
a2806233 | 78 | # key schedule and S-boxes and abstains from copying the former if |
53a20bfd | 79 | # there is none. This allows end-user to consciously retain small block |
a2806233 AP |
80 | # performance by aligning key schedule in specific manner. |
81 | # | |
9598fa87 AP |
82 | # Version 3.6 compresses Td4 to 256 bytes and prefetches it in ECB. |
83 | # | |
c8d5c71a AP |
84 | # Current ECB performance numbers for 128-bit key in CPU cycles per |
85 | # processed byte [measure commonly used by AES benchmarkers] are: | |
86 | # | |
87 | # small footprint fully unrolled | |
88 | # P4 24 22 | |
89 | # AMD K8 20 19 | |
90 | # PIII 25 23 | |
91 | # Pentium 81 78 | |
af8c1d81 AP |
92 | # |
93 | # Version 3.7 reimplements outer rounds as "compact." Meaning that | |
94 | # first and last rounds reference compact 256 bytes S-box. This means | |
95 | # that first round consumes a lot more CPU cycles and that encrypt | |
96 | # and decrypt performance becomes asymmetric. Encrypt performance | |
97 | # drops by 10-12%, while decrypt - by 20-25%:-( 256 bytes S-box is | |
98 | # aggressively pre-fetched. | |
22c268e6 AP |
99 | # |
100 | # Version 4.0 effectively rolls back to 3.6 and instead implements | |
fc924142 | 101 | # additional set of functions, _[x86|sse]_AES_[en|de]crypt_compact, |
22c268e6 AP |
102 | # which use exclusively 256 byte S-box. These functions are to be |
103 | # called in modes not concealing plain text, such as ECB, or when | |
104 | # we're asked to process smaller amount of data [or unconditionally | |
105 | # on hyper-threading CPU]. Currently it's called unconditionally from | |
106 | # AES_[en|de]crypt, which affects all modes, but CBC. CBC routine | |
107 | # still needs to be modified to switch between slower and faster | |
108 | # mode when appropriate... But in either case benchmark landscape | |
109 | # changes dramatically and below numbers are CPU cycles per processed | |
110 | # byte for 128-bit key. | |
111 | # | |
112 | # ECB encrypt ECB decrypt CBC large chunk | |
89f1eb82 AP |
113 | # P4 52[54] 83[95] 23 |
114 | # AMD K8 46[41] 66[70] 18 | |
115 | # PIII 41[50] 60[77] 24 | |
116 | # Core 2 31[36] 45[64] 18.5 | |
117 | # Atom 76[100] 96[138] 60 | |
118 | # Pentium 115 150 77 | |
53154d71 AP |
119 | # |
120 | # Version 4.1 switches to compact S-box even in key schedule setup. | |
fc924142 AP |
121 | # |
122 | # Version 4.2 prefetches compact S-box in every SSE round or in other | |
123 | # words every cache-line is *guaranteed* to be accessed within ~50 | |
124 | # cycles window. Why just SSE? Because it's needed on hyper-threading | |
125 | # CPU! Which is also why it's prefetched with 64 byte stride. Best | |
609b0852 | 126 | # part is that it has no negative effect on performance:-) |
6c69aa53 AP |
127 | # |
128 | # Version 4.3 implements switch between compact and non-compact block | |
129 | # functions in AES_cbc_encrypt depending on how much data was asked | |
3a8012cb | 130 | # to be processed in one stroke. |
6c69aa53 | 131 | # |
3a8012cb | 132 | ###################################################################### |
6c69aa53 | 133 | # Timing attacks are classified in two classes: synchronous when |
2b8a5406 | 134 | # attacker consciously initiates cryptographic operation and collects |
6c69aa53 AP |
135 | # timing data of various character afterwards, and asynchronous when |
136 | # malicious code is executed on same CPU simultaneously with AES, | |
137 | # instruments itself and performs statistical analysis of this data. | |
138 | # | |
139 | # As far as synchronous attacks go the root to the AES timing | |
140 | # vulnerability is twofold. Firstly, of 256 S-box elements at most 160 | |
141 | # are referred to in single 128-bit block operation. Well, in C | |
142 | # implementation with 4 distinct tables it's actually as little as 40 | |
143 | # references per 256 elements table, but anyway... Secondly, even | |
144 | # though S-box elements are clustered into smaller amount of cache- | |
145 | # lines, smaller than 160 and even 40, it turned out that for certain | |
146 | # plain-text pattern[s] or simply put chosen plain-text and given key | |
147 | # few cache-lines remain unaccessed during block operation. Now, if | |
148 | # attacker can figure out this access pattern, he can deduce the key | |
149 | # [or at least part of it]. The natural way to mitigate this kind of | |
150 | # attacks is to minimize the amount of cache-lines in S-box and/or | |
151 | # prefetch them to ensure that every one is accessed for more uniform | |
152 | # timing. But note that *if* plain-text was concealed in such way that | |
153 | # input to block function is distributed *uniformly*, then attack | |
154 | # wouldn't apply. Now note that some encryption modes, most notably | |
3a8012cb | 155 | # CBC, do mask the plain-text in this exact way [secure cipher output |
6c69aa53 AP |
156 | # is distributed uniformly]. Yes, one still might find input that |
157 | # would reveal the information about given key, but if amount of | |
2b8a5406 | 158 | # candidate inputs to be tried is larger than amount of possible key |
6c69aa53 AP |
159 | # combinations then attack becomes infeasible. This is why revised |
160 | # AES_cbc_encrypt "dares" to switch to larger S-box when larger chunk | |
161 | # of data is to be processed in one stroke. The current size limit of | |
46f4e1be | 162 | # 512 bytes is chosen to provide same [diminishingly low] probability |
6c69aa53 AP |
163 | # for cache-line to remain untouched in large chunk operation with |
164 | # large S-box as for single block operation with compact S-box and | |
165 | # surely needs more careful consideration... | |
166 | # | |
167 | # As for asynchronous attacks. There are two flavours: attacker code | |
168 | # being interleaved with AES on hyper-threading CPU at *instruction* | |
169 | # level, and two processes time sharing single core. As for latter. | |
170 | # Two vectors. 1. Given that attacker process has higher priority, | |
171 | # yield execution to process performing AES just before timer fires | |
172 | # off the scheduler, immediately regain control of CPU and analyze the | |
173 | # cache state. For this attack to be efficient attacker would have to | |
46f4e1be | 174 | # effectively slow down the operation by several *orders* of magnitude, |
6c69aa53 AP |
175 | # by ratio of time slice to duration of handful of AES rounds, which |
176 | # is unlikely to remain unnoticed. Not to mention that this also means | |
46f4e1be | 177 | # that he would spend correspondingly more time to collect enough |
6c69aa53 | 178 | # statistical data to mount the attack. It's probably appropriate to |
46f4e1be | 179 | # say that if adversary reckons that this attack is beneficial and |
6c69aa53 AP |
180 | # risks to be noticed, you probably have larger problems having him |
181 | # mere opportunity. In other words suggested code design expects you | |
182 | # to preclude/mitigate this attack by overall system security design. | |
183 | # 2. Attacker manages to make his code interrupt driven. In order for | |
184 | # this kind of attack to be feasible, interrupt rate has to be high | |
185 | # enough, again comparable to duration of handful of AES rounds. But | |
186 | # is there interrupt source of such rate? Hardly, not even 1Gbps NIC | |
187 | # generates interrupts at such raging rate... | |
188 | # | |
189 | # And now back to the former, hyper-threading CPU or more specifically | |
190 | # Intel P4. Recall that asynchronous attack implies that malicious | |
191 | # code instruments itself. And naturally instrumentation granularity | |
192 | # has to be noticeably lower than duration of codepath accessing S-box. | |
193 | # Given that all cache-lines are accessed during that time that is. | |
194 | # Current implementation accesses *all* cache-lines within ~50 cycles | |
195 | # window, which is actually *less* than RDTSC latency on Intel P4! | |
71314710 | 196 | |
96b0f6c1 AP |
197 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; |
198 | push(@INC,"${dir}","${dir}../../perlasm"); | |
71314710 AP |
199 | require "x86asm.pl"; |
200 | ||
184bc45f RL |
201 | $output = pop; |
202 | open OUT,">$output"; | |
203 | *STDOUT=*OUT; | |
204 | ||
e195c8a2 | 205 | &asm_init($ARGV[0],$x86only = $ARGV[$#ARGV] eq "386"); |
79eeb470 AP |
206 | &static_label("AES_Te"); |
207 | &static_label("AES_Td"); | |
71314710 | 208 | |
addb6e16 AP |
209 | $s0="eax"; |
210 | $s1="ebx"; | |
211 | $s2="ecx"; | |
212 | $s3="edx"; | |
213 | $key="edi"; | |
214 | $acc="esi"; | |
22c268e6 AP |
215 | $tbl="ebp"; |
216 | ||
6c69aa53 AP |
217 | # stack frame layout in _[x86|sse]_AES_* routines, frame is allocated |
218 | # by caller; offsets are relative to %esp as seen inside the routine | |
219 | $__ra=&DWP(0,"esp"); # return address | |
220 | $__s0=&DWP(4,"esp"); # s0 backing store | |
221 | $__s1=&DWP(8,"esp"); # s1 backing store | |
222 | $__s2=&DWP(12,"esp"); # s2 backing store | |
223 | $__s3=&DWP(16,"esp"); # s3 backing store | |
224 | $__key=&DWP(20,"esp"); # pointer to key schedule | |
225 | $__end=&DWP(24,"esp"); # pointer to end of key schedule | |
226 | $__tbl=&DWP(28,"esp"); # %ebp backing store | |
227 | ||
228 | # stack frame layout in AES_[en|de]crypt routines, which differs from | |
229 | # above by 4 and overlaps by %ebp backing store | |
230 | $_tbl=&DWP(24,"esp"); | |
231 | $_esp=&DWP(28,"esp"); | |
232 | ||
22c268e6 | 233 | sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } } |
addb6e16 | 234 | |
6c69aa53 AP |
235 | $speed_limit=512; # chunks smaller than $speed_limit bytes are |
236 | # processed with compact routine in CBC mode | |
e7e11507 | 237 | $small_footprint=1; # $small_footprint=1 code is ~5% slower [on |
053fa39a | 238 | # recent µ-archs], but ~5 times smaller! |
bac252a5 AP |
239 | # I favor compact code to minimize cache |
240 | # contention and in hope to "collect" 5% back | |
241 | # in real-life applications... | |
22c268e6 | 242 | |
46f4e1be | 243 | $vertical_spin=0; # shift "vertically" defaults to 0, because of |
addb6e16 | 244 | # its proof-of-concept status... |
e7e11507 AP |
245 | # Note that there is no decvert(), as well as last encryption round is |
246 | # performed with "horizontal" shifts. This is because this "vertical" | |
247 | # implementation [one which groups shifts on a given $s[i] to form a | |
248 | # "column," unlike "horizontal" one, which groups shifts on different | |
249 | # $s[i] to form a "row"] is work in progress. It was observed to run | |
250 | # few percents faster on Intel cores, but not AMD. On AMD K8 core it's | |
251 | # whole 12% slower:-( So we face a trade-off... Shall it be resolved | |
252 | # some day? Till then the code is considered experimental and by | |
253 | # default remains dormant... encvert() takes $te followed by the four state registers @s and borrows $acc and $key as scratch ($v0,$v1). | |
254 | ||
255 | sub encvert() |
256 | { my ($te,@s) = @_; | |
f9c5e5d9 | 257 | my ($v0,$v1) = ($acc,$key); |
e7e11507 AP |
258 | |
259 | &mov ($v0,$s[3]); # copy s3 | |
04d0d0ac | 260 | &mov (&DWP(4,"esp"),$s[2]); # save s2 (same frame slot as $__s0) |
e7e11507 | 261 | &mov ($v1,$s[0]); # copy s0 |
04d0d0ac | 262 | &mov (&DWP(8,"esp"),$s[1]); # save s1 (same frame slot as $__s1) |
e7e11507 AP |
263 | |
264 | &movz ($s[2],&HB($s[0])); | |
265 | &and ($s[0],0xFF); | |
c8d5c71a | 266 | &mov ($s[0],&DWP(0,$te,$s[0],8)); # s0>>0 |
e7e11507 | 267 | &shr ($v1,16); |
c8d5c71a | 268 | &mov ($s[3],&DWP(3,$te,$s[2],8)); # s0>>8 |
e7e11507 AP |
269 | &movz ($s[1],&HB($v1)); |
270 | &and ($v1,0xFF); | |
c8d5c71a | 271 | &mov ($s[2],&DWP(2,$te,$v1,8)); # s0>>16 |
e7e11507 | 272 | &mov ($v1,$v0); |
c8d5c71a | 273 | &mov ($s[1],&DWP(1,$te,$s[1],8)); # s0>>24 |
e7e11507 AP |
274 | |
275 | &and ($v0,0xFF); | |
c8d5c71a | 276 | &xor ($s[3],&DWP(0,$te,$v0,8)); # s3>>0 |
e7e11507 AP |
277 | &movz ($v0,&HB($v1)); |
278 | &shr ($v1,16); | |
c8d5c71a | 279 | &xor ($s[2],&DWP(3,$te,$v0,8)); # s3>>8 |
e7e11507 AP |
280 | &movz ($v0,&HB($v1)); |
281 | &and ($v1,0xFF); | |
c8d5c71a | 282 | &xor ($s[1],&DWP(2,$te,$v1,8)); # s3>>16 |
04d0d0ac | 283 | &mov ($v1,&DWP(4,"esp")); # restore s2 (into scratch $v1, not $s[2]) |
c8d5c71a | 284 | &xor ($s[0],&DWP(1,$te,$v0,8)); # s3>>24 |
e7e11507 AP |
285 | |
286 | &mov ($v0,$v1); | |
287 | &and ($v1,0xFF); | |
c8d5c71a | 288 | &xor ($s[2],&DWP(0,$te,$v1,8)); # s2>>0 |
e7e11507 AP |
289 | &movz ($v1,&HB($v0)); |
290 | &shr ($v0,16); | |
c8d5c71a | 291 | &xor ($s[1],&DWP(3,$te,$v1,8)); # s2>>8 |
e7e11507 AP |
292 | &movz ($v1,&HB($v0)); |
293 | &and ($v0,0xFF); | |
c8d5c71a | 294 | &xor ($s[0],&DWP(2,$te,$v0,8)); # s2>>16 |
04d0d0ac | 295 | &mov ($v0,&DWP(8,"esp")); # restore s1 (into scratch $v0, not $s[1]) |
c8d5c71a | 296 | &xor ($s[3],&DWP(1,$te,$v1,8)); # s2>>24 |
e7e11507 AP |
297 | |
298 | &mov ($v1,$v0); | |
299 | &and ($v0,0xFF); | |
c8d5c71a | 300 | &xor ($s[1],&DWP(0,$te,$v0,8)); # s1>>0 |
e7e11507 AP |
301 | &movz ($v0,&HB($v1)); |
302 | &shr ($v1,16); | |
c8d5c71a | 303 | &xor ($s[0],&DWP(3,$te,$v0,8)); # s1>>8 |
e7e11507 AP |
304 | &movz ($v0,&HB($v1)); |
305 | &and ($v1,0xFF); | |
c8d5c71a | 306 | &xor ($s[3],&DWP(2,$te,$v1,8)); # s1>>16 |
6c69aa53 | 307 | &mov ($key,$__key); # reincarnate v1 as key (reload key schedule pointer from $__key) |
c8d5c71a | 308 | &xor ($s[2],&DWP(1,$te,$v0,8)); # s1>>24 |
e7e11507 | 309 | } |
71314710 | 310 | |
22c268e6 AP |
311 | # Another experimental routine, which features "horizontal spin," but |
312 | # eliminates one reference to stack. Strangely enough runs slower... NOTE(review): $te is used below but is neither declared nor assigned in this sub -- confirm where it is expected to come from before enabling this routine. | |
313 | sub enchoriz() |
f9c5e5d9 | 314 | { my ($v0,$v1) = ($key,$acc); |
22c268e6 AP |
315 | |
316 | &movz ($v0,&LB($s0)); # 3, 2, 1, 0* | |
317 | &rotr ($s2,8); # 8,11,10, 9 | |
318 | &mov ($v1,&DWP(0,$te,$v0,8)); # 0 | |
319 | &movz ($v0,&HB($s1)); # 7, 6, 5*, 4 | |
320 | &rotr ($s3,16); # 13,12,15,14 | |
321 | &xor ($v1,&DWP(3,$te,$v0,8)); # 5 | |
322 | &movz ($v0,&HB($s2)); # 8,11,10*, 9 | |
323 | &rotr ($s0,16); # 1, 0, 3, 2 | |
324 | &xor ($v1,&DWP(2,$te,$v0,8)); # 10 | |
325 | &movz ($v0,&HB($s3)); # 13,12,15*,14 | |
326 | &xor ($v1,&DWP(1,$te,$v0,8)); # 15, t[0] collected | |
6c69aa53 | 327 | &mov ($__s0,$v1); # t[0] saved in the s0 backing store |
22c268e6 AP |
328 | |
329 | &movz ($v0,&LB($s1)); # 7, 6, 5, 4* | |
330 | &shr ($s1,16); # -, -, 7, 6 | |
331 | &mov ($v1,&DWP(0,$te,$v0,8)); # 4 | |
332 | &movz ($v0,&LB($s3)); # 13,12,15,14* | |
333 | &xor ($v1,&DWP(2,$te,$v0,8)); # 14 | |
334 | &movz ($v0,&HB($s0)); # 1, 0, 3*, 2 | |
335 | &and ($s3,0xffff0000); # 13,12, -, - | |
336 | &xor ($v1,&DWP(1,$te,$v0,8)); # 3 | |
337 | &movz ($v0,&LB($s2)); # 8,11,10, 9* | |
338 | &or ($s3,$s1); # 13,12, 7, 6 | |
339 | &xor ($v1,&DWP(3,$te,$v0,8)); # 9, t[1] collected | |
340 | &mov ($s1,$v1); # s[1]=t[1] | |
341 | ||
342 | &movz ($v0,&LB($s0)); # 1, 0, 3, 2* | |
343 | &shr ($s2,16); # -, -, 8,11 | |
344 | &mov ($v1,&DWP(2,$te,$v0,8)); # 2 | |
345 | &movz ($v0,&HB($s3)); # 13,12, 7*, 6 | |
346 | &xor ($v1,&DWP(1,$te,$v0,8)); # 7 | |
347 | &movz ($v0,&HB($s2)); # -, -, 8*,11 | |
348 | &xor ($v1,&DWP(0,$te,$v0,8)); # 8 | |
349 | &mov ($v0,$s3); | |
350 | &shr ($v0,24); # 13 | |
351 | &xor ($v1,&DWP(3,$te,$v0,8)); # 13, t[2] collected | |
352 | ||
353 | &movz ($v0,&LB($s2)); # -, -, 8,11* | |
354 | &shr ($s0,24); # 1* | |
355 | &mov ($s2,&DWP(1,$te,$v0,8)); # 11 | |
356 | &xor ($s2,&DWP(3,$te,$s0,8)); # 1 | |
6c69aa53 | 357 | &mov ($s0,$__s0); # s[0]=t[0] |
22c268e6 AP |
358 | &movz ($v0,&LB($s3)); # 13,12, 7, 6* |
359 | &shr ($s3,16); # , ,13,12 | |
360 | &xor ($s2,&DWP(2,$te,$v0,8)); # 6 | |
6c69aa53 | 361 | &mov ($key,$__key); # reincarnate v0 as key |
22c268e6 AP |
362 | &and ($s3,0xff); # , ,13,12* |
363 | &mov ($s3,&DWP(0,$te,$s3,8)); # 12 | |
364 | &xor ($s3,$s2); # s[3]=t[3] collected | |
365 | &mov ($s2,$v1); # s[2]=t[2] | |
366 | } | |
71314710 | 367 | |
fc924142 | 368 | # More experimental code... SSE one... Even though this one eliminates
22c268e6 | 369 | # *all* references to stack, it's not faster... Round key material is read directly from 16($key), 24($key) and 28($key).
fc924142 | 370 | sub sse_encbody()
22c268e6 | 371 | {
53154d71 AP |
372 | &movz ($acc,&LB("eax")); # 0 |
373 | &mov ("ecx",&DWP(0,$tbl,$acc,8)); # 0 | |
22c268e6 AP |
374 | &pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2 |
375 | &movz ("edx",&HB("eax")); # 1 | |
376 | &mov ("edx",&DWP(3,$tbl,"edx",8)); # 1 | |
377 | &shr ("eax",16); # 5, 4 | |
378 | ||
53154d71 AP |
379 | &movz ($acc,&LB("ebx")); # 10 |
380 | &xor ("ecx",&DWP(2,$tbl,$acc,8)); # 10 | |
22c268e6 | 381 | &pshufw ("mm6","mm4",0x08); # 13,12, 9, 8 |
53154d71 AP |
382 | &movz ($acc,&HB("ebx")); # 11 |
383 | &xor ("edx",&DWP(1,$tbl,$acc,8)); # 11 | |
22c268e6 AP |
384 | &shr ("ebx",16); # 15,14 |
385 | ||
53154d71 AP |
386 | &movz ($acc,&HB("eax")); # 5 |
387 | &xor ("ecx",&DWP(3,$tbl,$acc,8)); # 5 | |
22c268e6 | 388 | &movq ("mm3",QWP(16,$key)); |
53154d71 AP |
389 | &movz ($acc,&HB("ebx")); # 15 |
390 | &xor ("ecx",&DWP(1,$tbl,$acc,8)); # 15 | |
22c268e6 AP |
391 | &movd ("mm0","ecx"); # t[0] collected |
392 | ||
53154d71 AP |
393 | &movz ($acc,&LB("eax")); # 4 |
394 | &mov ("ecx",&DWP(0,$tbl,$acc,8)); # 4 | |
22c268e6 | 395 | &movd ("eax","mm2"); # 7, 6, 3, 2 |
53154d71 AP |
396 | &movz ($acc,&LB("ebx")); # 14 |
397 | &xor ("ecx",&DWP(2,$tbl,$acc,8)); # 14 | |
22c268e6 AP |
398 | &movd ("ebx","mm6"); # 13,12, 9, 8 |
399 | ||
53154d71 AP |
400 | &movz ($acc,&HB("eax")); # 3 |
401 | &xor ("ecx",&DWP(1,$tbl,$acc,8)); # 3 | |
402 | &movz ($acc,&HB("ebx")); # 9 | |
403 | &xor ("ecx",&DWP(3,$tbl,$acc,8)); # 9 | |
22c268e6 AP |
404 | &movd ("mm1","ecx"); # t[1] collected |
405 | ||
53154d71 AP |
406 | &movz ($acc,&LB("eax")); # 2 |
407 | &mov ("ecx",&DWP(2,$tbl,$acc,8)); # 2 | |
22c268e6 AP |
408 | &shr ("eax",16); # 7, 6 |
409 | &punpckldq ("mm0","mm1"); # t[0,1] collected | |
53154d71 AP |
410 | &movz ($acc,&LB("ebx")); # 8 |
411 | &xor ("ecx",&DWP(0,$tbl,$acc,8)); # 8 | |
22c268e6 AP |
412 | &shr ("ebx",16); # 13,12 |
413 | ||
53154d71 AP |
414 | &movz ($acc,&HB("eax")); # 7 |
415 | &xor ("ecx",&DWP(1,$tbl,$acc,8)); # 7 | |
22c268e6 AP |
416 | &pxor ("mm0","mm3"); |
417 | &movz ("eax",&LB("eax")); # 6 | |
418 | &xor ("edx",&DWP(2,$tbl,"eax",8)); # 6 | |
419 | &pshufw ("mm1","mm0",0x08); # 5, 4, 1, 0 | |
53154d71 AP |
420 | &movz ($acc,&HB("ebx")); # 13 |
421 | &xor ("ecx",&DWP(3,$tbl,$acc,8)); # 13 | |
22c268e6 AP |
422 | &xor ("ecx",&DWP(24,$key)); # t[2] ^= key word at 24($key) |
423 | &movd ("mm4","ecx"); # t[2] collected | |
424 | &movz ("ebx",&LB("ebx")); # 12 | |
425 | &xor ("edx",&DWP(0,$tbl,"ebx",8)); # 12 | |
426 | &shr ("ecx",16); | |
427 | &movd ("eax","mm1"); # 5, 4, 1, 0 | |
428 | &mov ("ebx",&DWP(28,$key)); # t[3] starts from key word at 28($key) | |
429 | &xor ("ebx","edx"); | |
430 | &movd ("mm5","ebx"); # t[3] collected | |
431 | &and ("ebx",0xffff0000); | |
432 | &or ("ebx","ecx"); | |
433 | ||
434 | &punpckldq ("mm4","mm5"); # t[2,3] collected | |
71314710 AP |
435 | } |
436 | ||
22c268e6 AP |
437 | ###################################################################### |
438 | # "Compact" block function: enccompact($i,$te,@s) assembles output word $i from four byte lookups at -128($te), merged at bit offsets 0, 8, 16 and 24 | |
439 | ###################################################################### | |
440 | ||
441 | sub enccompact() |
f9c5e5d9 | 442 | { my $Fn = \&mov; |
af8c1d81 AP |
443 | while ($#_>5) { pop(@_); $Fn=sub{}; } |
444 | my ($i,$te,@s)=@_; | |
445 | my $tmp = $key; | |
446 | my $out = $i==3?$s[0]:$acc; | |
447 | ||
af8c1d81 AP |
448 | # $Fn is &mov unless an extra trailing argument was passed, in which |
449 | # case it is a no-op; its purpose is to void the $i==3 reload of $key | |
22c268e6 | 450 | # from stack, so that after 4xenccompact with extra argument $key value is left there...
6c69aa53 | 451 | if ($i==3) { &$Fn ($key,$__key); }##%edx |
af8c1d81 AP |
452 | else { &mov ($out,$s[0]); } |
453 | &and ($out,0xFF); | |
454 | if ($i==1) { &shr ($s[0],16); }#%ebx[1] | |
455 | if ($i==2) { &shr ($s[0],24); }#%ecx[2] | |
53154d71 | 456 | &movz ($out,&BP(-128,$te,$out,1)); |
af8c1d81 AP |
457 | |
458 | if ($i==3) { $tmp=$s[1]; }##%eax | |
459 | &movz ($tmp,&HB($s[1])); | |
53154d71 | 460 | &movz ($tmp,&BP(-128,$te,$tmp,1)); |
af8c1d81 AP |
461 | &shl ($tmp,8); |
462 | &xor ($out,$tmp); | |
463 | ||
6c69aa53 | 464 | if ($i==3) { $tmp=$s[2]; &mov ($s[1],$__s0); }##%ebx |
af8c1d81 AP |
465 | else { &mov ($tmp,$s[2]); |
466 | &shr ($tmp,16); } | |
467 | if ($i==2) { &and ($s[1],0xFF); }#%edx[2] | |
468 | &and ($tmp,0xFF); | |
53154d71 | 469 | &movz ($tmp,&BP(-128,$te,$tmp,1)); |
af8c1d81 AP |
470 | &shl ($tmp,16); |
471 | &xor ($out,$tmp); | |
472 | ||
6c69aa53 | 473 | if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx |
af8c1d81 AP |
474 | elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2] |
475 | else { &mov ($tmp,$s[3]); | |
476 | &shr ($tmp,24); } | |
53154d71 | 477 | &movz ($tmp,&BP(-128,$te,$tmp,1)); |
af8c1d81 AP |
478 | &shl ($tmp,24); |
479 | &xor ($out,$tmp); | |
480 | if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); } | |
22c268e6 | 481 | if ($i==3) { &mov ($s[3],$acc); } |
af8c1d81 AP |
482 | &comment(); |
483 | } | |
484 | ||
485 | sub enctransform() |
486 | { my @s = ($s0,$s1,$s2,$s3); | |
487 | my $i = shift; | |
22c268e6 AP |
488 | my $tmp = $tbl; |
489 | my $r2 = $key ; | |
af8c1d81 | 490 | |
89f1eb82 | 491 | &and ($tmp,$s[$i]); |
96b0f6c1 | 492 | &lea ($r2,&DWP(0,$s[$i],$s[$i])); |
89f1eb82 AP |
493 | &mov ($acc,$tmp); |
494 | &shr ($tmp,7); | |
96b0f6c1 | 495 | &and ($r2,0xfefefefe); |
89f1eb82 | 496 | &sub ($acc,$tmp); |
af8c1d81 | 497 | &mov ($tmp,$s[$i]); |
89f1eb82 AP |
498 | &and ($acc,0x1b1b1b1b); |
499 | &rotr ($tmp,16); | |
22c268e6 | 500 | &xor ($acc,$r2); # r2: every byte of r0=s[i] doubled, with 0x1b folded in where its top bit was set (relies on $tbl holding 0x80808080 on entry) |
89f1eb82 | 501 | &mov ($r2,$s[$i]); |
af8c1d81 | 502 | |
22c268e6 | 503 | &xor ($s[$i],$acc); # r0 ^ r2 |
89f1eb82 AP |
504 | &rotr ($r2,16+8); |
505 | &xor ($acc,$tmp); | |
af8c1d81 | 506 | &rotl ($s[$i],24); |
89f1eb82 AP |
507 | &xor ($acc,$r2); |
508 | &mov ($tmp,0x80808080) if ($i!=1); | |
f9c5e5d9 | 509 | &xor ($s[$i],$acc); # ROTATE(r2^r0,24) ^ r2 ^ (r0 rotated by 16) ^ (r0 rotated right by 24) |
af8c1d81 AP |
510 | } |
511 | ||
22c268e6 AP |
512 | &function_begin_B("_x86_AES_encrypt_compact"); |
513 | # note that caller is expected to allocate stack frame for me! State is taken in $s0-$s3 and the key schedule pointer in $key; table bytes are addressed at -128($tbl). | |
6c69aa53 | 514 | &mov ($__key,$key); # save key |
22c268e6 AP |
515 | |
516 | &xor ($s0,&DWP(0,$key)); # xor with key | |
517 | &xor ($s1,&DWP(4,$key)); | |
518 | &xor ($s2,&DWP(8,$key)); | |
519 | &xor ($s3,&DWP(12,$key)); | |
520 | ||
521 | &mov ($acc,&DWP(240,$key)); # load key->rounds | |
522 | &lea ($acc,&DWP(-2,$acc,$acc)); | |
523 | &lea ($acc,&DWP(0,$key,$acc,8)); | |
6c69aa53 | 524 | &mov ($__end,$acc); # end of key schedule, i.e. key+16*(rounds-1) |
22c268e6 AP |
525 | |
526 | # prefetch Te4: one dword load every 32 bytes across 256 bytes, loaded values are not used | |
527 | &mov ($key,&DWP(0-128,$tbl)); | |
528 | &mov ($acc,&DWP(32-128,$tbl)); | |
529 | &mov ($key,&DWP(64-128,$tbl)); | |
530 | &mov ($acc,&DWP(96-128,$tbl)); | |
531 | &mov ($key,&DWP(128-128,$tbl)); | |
532 | &mov ($acc,&DWP(160-128,$tbl)); | |
533 | &mov ($key,&DWP(192-128,$tbl)); | |
534 | &mov ($acc,&DWP(224-128,$tbl)); | |
535 | ||
536 | &set_label("loop",16); | |
537 | ||
538 | &enccompact(0,$tbl,$s0,$s1,$s2,$s3,1); | |
539 | &enccompact(1,$tbl,$s1,$s2,$s3,$s0,1); | |
540 | &enccompact(2,$tbl,$s2,$s3,$s0,$s1,1); | |
541 | &enccompact(3,$tbl,$s3,$s0,$s1,$s2,1); | |
89f1eb82 | 542 | &mov ($tbl,0x80808080); |
22c268e6 AP |
543 | &enctransform(2); |
544 | &enctransform(3); | |
545 | &enctransform(0); | |
546 | &enctransform(1); | |
6c69aa53 AP |
547 | &mov ($key,$__key); |
548 | &mov ($tbl,$__tbl); | |
22c268e6 AP |
549 | &add ($key,16); # advance rd_key by one 16-byte round key |
550 | &xor ($s0,&DWP(0,$key)); | |
551 | &xor ($s1,&DWP(4,$key)); | |
552 | &xor ($s2,&DWP(8,$key)); | |
553 | &xor ($s3,&DWP(12,$key)); | |
554 | ||
6c69aa53 AP |
555 | &cmp ($key,$__end); |
556 | &mov ($__key,$key); | |
22c268e6 AP |
557 | &jb (&label("loop")); |
558 | ||
559 | &enccompact(0,$tbl,$s0,$s1,$s2,$s3); | |
560 | &enccompact(1,$tbl,$s1,$s2,$s3,$s0); | |
561 | &enccompact(2,$tbl,$s2,$s3,$s0,$s1); | |
562 | &enccompact(3,$tbl,$s3,$s0,$s1,$s2); | |
563 | ||
564 | &xor ($s0,&DWP(16,$key)); | |
565 | &xor ($s1,&DWP(20,$key)); | |
566 | &xor ($s2,&DWP(24,$key)); | |
567 | &xor ($s3,&DWP(28,$key)); | |
568 | ||
569 | &ret (); | |
570 | &function_end_B("_x86_AES_encrypt_compact"); | |
571 | ||
572 | ###################################################################### | |
fc924142 | 573 | # "Compact" SSE block function. |
22c268e6 AP |
574 | ###################################################################### |
575 | # | |
576 | # Performance is not actually extraordinary in comparison to pure | |
577 | # x86 code. In particular encrypt performance is virtually the same. | |
53154d71 | 578 | # Decrypt performance on the other hand is 15-20% better on newer |
053fa39a | 579 | # µ-archs [but we're thankful for *any* improvement here], and ~50% |
53154d71 | 580 | # better on PIII:-) And additionally on the pros side this code |
22c268e6 AP |
581 | # eliminates redundant references to stack and thus relieves/ |
582 | # minimizes the pressure on the memory bus. | |
583 | # | |
584 | # MMX register layout lsb | |
585 | # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ | |
586 | # | mm4 | mm0 | | |
587 | # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ | |
609b0852 | 588 | # | s3 | s2 | s1 | s0 | |
22c268e6 AP |
589 | # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ |
590 | # |15|14|13|12|11|10| 9| 8| 7| 6| 5| 4| 3| 2| 1| 0| | |
591 | # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ | |
592 | # | |
593 | # Indexes translate as s[N/4]>>(8*(N%4)), e.g. 5 means s1>>8. | |
594 | # In these terms encryption and decryption "compact" permutation | |
595 | # matrices can be depicted as following: | |
596 | # | |
597 | # encryption lsb # decryption lsb | |
598 | # +----++----+----+----+----+ # +----++----+----+----+----+ | |
599 | # | t0 || 15 | 10 | 5 | 0 | # | t0 || 7 | 10 | 13 | 0 | | |
600 | # +----++----+----+----+----+ # +----++----+----+----+----+ | |
601 | # | t1 || 3 | 14 | 9 | 4 | # | t1 || 11 | 14 | 1 | 4 | | |
602 | # +----++----+----+----+----+ # +----++----+----+----+----+ | |
603 | # | t2 || 7 | 2 | 13 | 8 | # | t2 || 15 | 2 | 5 | 8 | | |
604 | # +----++----+----+----+----+ # +----++----+----+----+----+ | |
605 | # | t3 || 11 | 6 | 1 | 12 | # | t3 || 3 | 6 | 9 | 12 | | |
606 | # +----++----+----+----+----+ # +----++----+----+----+----+ | |
607 | # | |
608 | ###################################################################### | |
609 | # Why not xmm registers? Short answer. It was actually tested and | |
610 | # was not any faster, but *contrary*, most notably on Intel CPUs. | |
611 | # Longer answer. Main advantage of using mm registers is that movd | |
612 | # latency is lower, especially on Intel P4. While arithmetic | |
613 | # instructions are twice as many, they can be scheduled every cycle | |
614 | # and not every second one when they are operating on xmm register, | |
615 | # so that "arithmetic throughput" remains virtually the same. And | |
fc924142 | 616 | # finally the code can be executed even on elder SSE-only CPUs:-) |
22c268e6 | 617 | |
# sse_enccompact() -- emit one "compact" substitution + permutation pass
# for the MMX/SSE code path.
#
# Takes no arguments; it works on fixed registers:
#   in:    mm0 = state bytes 7..0, mm4 = state bytes 15..8
#   out:   mm0 = t[0,1], mm4 = t[2,3]
#   uses:  eax, ebx, ecx, edx, $acc and $key as integer scratch,
#          mm1, mm2, mm5, mm6 as MMX scratch
#
# Every state byte is used as an index into the byte table addressed as
# -128($tbl), and the looked-up bytes are gathered into t[0]..t[3] in the
# order given by the "encryption" permutation matrix depicted above
# (t[0] = 15|10|5|0, t[1] = 3|14|9|4, t[2] = 7|2|13|8, t[3] = 11|6|1|12).
#
# $key is borrowed as an index register, so it is parked in $__key on
# entry and reloaded right before the results go back to mm registers.
#
# The number(s) in the end-of-line comments name the state byte(s) the
# instruction is working on.  Loads, shifts and ORs of different bytes are
# interleaved on purpose (this looks like manual scheduling), so do not
# reorder the sequence casually.
sub sse_enccompact()
{
	&pshufw	("mm1","mm0",0x08);		#  5, 4, 1, 0
	&pshufw	("mm5","mm4",0x0d);		# 15,14,11,10
	&movd	("eax","mm1");			#  5, 4, 1, 0
	&movd	("ebx","mm5");			# 15,14,11,10
	&mov	($__key,$key);			# free $key for use as index

	&movz	($acc,&LB("eax"));		#  0
	&movz	("edx",&HB("eax"));		#  1
	&pshufw	("mm2","mm0",0x0d);		#  7, 6, 3, 2
	&movz	("ecx",&BP(-128,$tbl,$acc,1));	#  0
	&movz	($key,&LB("ebx"));		# 10
	&movz	("edx",&BP(-128,$tbl,"edx",1));	#  1
	&shr	("eax",16);			#  5, 4
	&shl	("edx",8);			#  1

	&movz	($acc,&BP(-128,$tbl,$key,1));	# 10
	&movz	($key,&HB("ebx"));		# 11
	&shl	($acc,16);			# 10
	&pshufw	("mm6","mm4",0x08);		# 13,12, 9, 8
	&or	("ecx",$acc);			# 10
	&movz	($acc,&BP(-128,$tbl,$key,1));	# 11
	&movz	($key,&HB("eax"));		#  5
	&shl	($acc,24);			# 11
	&shr	("ebx",16);			# 15,14
	&or	("edx",$acc);			# 11

	&movz	($acc,&BP(-128,$tbl,$key,1));	#  5
	&movz	($key,&HB("ebx"));		# 15
	&shl	($acc,8);			#  5
	&or	("ecx",$acc);			#  5
	&movz	($acc,&BP(-128,$tbl,$key,1));	# 15
	&movz	($key,&LB("eax"));		#  4
	&shl	($acc,24);			# 15
	&or	("ecx",$acc);			# 15

	&movz	($acc,&BP(-128,$tbl,$key,1));	#  4
	&movz	($key,&LB("ebx"));		# 14
	&movd	("eax","mm2");			#  7, 6, 3, 2
	&movd	("mm0","ecx");			# t[0] collected
	&movz	("ecx",&BP(-128,$tbl,$key,1));	# 14
	&movz	($key,&HB("eax"));		#  3
	&shl	("ecx",16);			# 14
	&movd	("ebx","mm6");			# 13,12, 9, 8
	&or	("ecx",$acc);			# 14

	&movz	($acc,&BP(-128,$tbl,$key,1));	#  3
	&movz	($key,&HB("ebx"));		#  9
	&shl	($acc,24);			#  3
	&or	("ecx",$acc);			#  3
	&movz	($acc,&BP(-128,$tbl,$key,1));	#  9
	&movz	($key,&LB("ebx"));		#  8
	&shl	($acc,8);			#  9
	&shr	("ebx",16);			# 13,12
	&or	("ecx",$acc);			#  9

	&movz	($acc,&BP(-128,$tbl,$key,1));	#  8
	&movz	($key,&LB("eax"));		#  2
	&shr	("eax",16);			#  7, 6
	&movd	("mm1","ecx");			# t[1] collected
	&movz	("ecx",&BP(-128,$tbl,$key,1));	#  2
	&movz	($key,&HB("eax"));		#  7
	&shl	("ecx",16);			#  2
	&and	("eax",0xff);			#  6
	&or	("ecx",$acc);			#  2

	&punpckldq	("mm0","mm1");		# t[0,1] collected

	&movz	($acc,&BP(-128,$tbl,$key,1));	#  7
	&movz	($key,&HB("ebx"));		# 13
	&shl	($acc,24);			#  7
	&and	("ebx",0xff);			# 12
	&movz	("eax",&BP(-128,$tbl,"eax",1));	#  6
	&or	("ecx",$acc);			#  7
	&shl	("eax",16);			#  6
	&movz	($acc,&BP(-128,$tbl,$key,1));	# 13
	&or	("edx","eax");			#  6
	&shl	($acc,8);			# 13
	&movz	("ebx",&BP(-128,$tbl,"ebx",1));	# 12
	&or	("ecx",$acc);			# 13
	&or	("edx","ebx");			# 12
	&mov	($key,$__key);			# give $key back to the caller
	&movd	("mm4","ecx");			# t[2] collected
	&movd	("mm5","edx");			# t[3] collected

	&punpckldq	("mm4","mm5");		# t[2,3] collected
}
706 | ||
# _sse_AES_encrypt_compact -- internal MMX/SSE block function, generated
# only when the build is not restricted to plain x86 ($x86only unset).
#
#   in:   mm0/mm4 = input block (bytes 7..0 / 15..8)
#         $key    = pointer to the key schedule; the round count is read
#                   from offset 240
#         $tbl    = byte table pointer, biased so that the table is
#                   addressed with a -128 displacement
#   out:  mm0/mm4 = processed block
#   The caller must have allocated the stack frame: $__key/$__end are used
#   and 8(%esp)..15(%esp) receives the 0x1b byte mask.
#   $s0-$s3 are clobbered by the table-touching loads.
if (!$x86only) {
&function_begin_B("_sse_AES_encrypt_compact");
	&pxor	("mm0",&QWP(0,$key));	#  7, 6, 5, 4, 3, 2, 1, 0
	&pxor	("mm4",&QWP(8,$key));	# 15,14,13,12,11,10, 9, 8

	# note that caller is expected to allocate stack frame for me!
	# $__end = $key + 8*(2*rounds-2) = $key + 16*(rounds-1)
	&mov	($acc,&DWP(240,$key));		# load key->rounds
	&lea	($acc,&DWP(-2,$acc,$acc));
	&lea	($acc,&DWP(0,$key,$acc,8));
	&mov	($__end,$acc);			# end of key schedule

	# eight bytes of 0x1b, later fetched as one quadword into mm2
	&mov	($s0,0x1b1b1b1b);		# magic constant
	&mov	(&DWP(8,"esp"),$s0);
	&mov	(&DWP(12,"esp"),$s0);

	# prefetch Te4: touch the 256-byte table every 32 bytes; the loaded
	# values themselves are discarded
	&mov	($s0,&DWP(0-128,$tbl));
	&mov	($s1,&DWP(32-128,$tbl));
	&mov	($s2,&DWP(64-128,$tbl));
	&mov	($s3,&DWP(96-128,$tbl));
	&mov	($s0,&DWP(128-128,$tbl));
	&mov	($s1,&DWP(160-128,$tbl));
	&mov	($s2,&DWP(192-128,$tbl));
	&mov	($s3,&DWP(224-128,$tbl));

	&set_label("loop",16);
		&sse_enccompact();
		&add	($key,16);
		&cmp	($key,$__end);
		&ja	(&label("out"));	# past $__end: only the final
						# key addition is left

		# Column mixing on both halves in parallel (mm0.. and mm4..).
		# r2 is r0 with every byte doubled: paddb doubles, and bytes
		# whose top bit was set (pcmpgtb against zero) get 0x1b
		# XORed in.
		&movq	("mm2",&QWP(8,"esp"));
		&pxor	("mm3","mm3");		&pxor	("mm7","mm7");
		&movq	("mm1","mm0");		&movq	("mm5","mm4");	# r0
		&pcmpgtb("mm3","mm0");		&pcmpgtb("mm7","mm4");
		&pand	("mm3","mm2");		&pand	("mm7","mm2");
		&pshufw	("mm2","mm0",0xb1);	&pshufw	("mm6","mm4",0xb1);# ROTATE(r0,16)
		&paddb	("mm0","mm0");		&paddb	("mm4","mm4");
		&pxor	("mm0","mm3");		&pxor	("mm4","mm7");	# = r2
		&pshufw	("mm3","mm2",0xb1);	&pshufw	("mm7","mm6",0xb1);# r0
		&pxor	("mm1","mm0");		&pxor	("mm5","mm4");	# r0^r2
		&pxor	("mm0","mm2");		&pxor	("mm4","mm6");	# ^= ROTATE(r0,16)

		&movq	("mm2","mm3");		&movq	("mm6","mm7");
		&pslld	("mm3",8);		&pslld	("mm7",8);
		&psrld	("mm2",24);		&psrld	("mm6",24);
		&pxor	("mm0","mm3");		&pxor	("mm4","mm7");	# ^= r0<<8
		&pxor	("mm0","mm2");		&pxor	("mm4","mm6");	# ^= r0>>24

		# the scalar loads interleaved below touch the table again,
		# this time every 64 bytes; their results are discarded
		&movq	("mm3","mm1");		&movq	("mm7","mm5");
		&movq	("mm2",&QWP(0,$key));	&movq	("mm6",&QWP(8,$key));
		&psrld	("mm1",8);		&psrld	("mm5",8);
		&mov	($s0,&DWP(0-128,$tbl));
		&pslld	("mm3",24);		&pslld	("mm7",24);
		&mov	($s1,&DWP(64-128,$tbl));
		&pxor	("mm0","mm1");		&pxor	("mm4","mm5");	# ^= (r2^r0)<<8
		&mov	($s2,&DWP(128-128,$tbl));
		&pxor	("mm0","mm3");		&pxor	("mm4","mm7");	# ^= (r2^r0)>>24
		&mov	($s3,&DWP(192-128,$tbl));

		&pxor	("mm0","mm2");		&pxor	("mm4","mm6");	# add round key
		&jmp	(&label("loop"));

	&set_label("out",16);
	&pxor	("mm0",&QWP(0,$key));		# final round key
	&pxor	("mm4",&QWP(8,$key));

	&ret	();
&function_end_B("_sse_AES_encrypt_compact");
}
22c268e6 AP |
777 | |
778 | ###################################################################### | |
779 | # Vanilla block function. | |
780 | ###################################################################### | |
781 | ||
# encstep($i,$te,@s) -- emit the code that computes output word $i of one
# table-driven round.
#
#   $i   step number, 0..3
#   $te  register holding the table base; entries are addressed with
#        scale 8 and byte displacements 0, 3, 2 and 1
#   @s   the four state registers, rotated by the caller so that
#        $s[0] supplies byte 0, $s[1] byte 1, $s[2] byte 2, $s[3] byte 3
#
# Result placement: steps 0 and 1 spill their word to 4(%esp)/8(%esp),
# step 2 leaves it in $acc, step 3 builds it in place in $s[0].  Step 3
# also reloads $key from $__key and refills the other three state
# registers from $__s0, $__s1 and $acc (presumably $__s0/$__s1 name the
# two spill slots written by steps 0 and 1 -- they are defined elsewhere).
#
# The four steps are NOT independent: the "reordered" instructions shift
# or mask a register early on behalf of a later step (e.g. step 2 masks
# the register that step 3 then uses, unmasked, as its first index).  The
# sub must therefore be invoked as four consecutive calls with $i=0..3
# and the register list rotated by one each time.
sub encstep()
{ my ($i,$te,@s) = @_;
  my $tmp = $key;
  my $out = $i==3?$s[0]:$acc;

	# lines marked with #%e?x[i] denote "reordered" instructions...
	if ($i==3)  {	&mov	($key,$__key);			}##%edx
	else        {	&mov	($out,$s[0]);
			&and	($out,0xFF);			}
	if ($i==1)  {	&shr	($s[0],16);			}#%ebx[1]
	if ($i==2)  {	&shr	($s[0],24);			}#%ecx[2]
			&mov	($out,&DWP(0,$te,$out,8));

	if ($i==3)  {	$tmp=$s[1];				}##%eax
			&movz	($tmp,&HB($s[1]));
			&xor	($out,&DWP(3,$te,$tmp,8));

	if ($i==3)  {	$tmp=$s[2]; &mov ($s[1],$__s0);		}##%ebx
	else        {	&mov	($tmp,$s[2]);
			&shr	($tmp,16);			}
	if ($i==2)  {	&and	($s[1],0xFF);			}#%edx[2]
			&and	($tmp,0xFF);
			&xor	($out,&DWP(2,$te,$tmp,8));

	if ($i==3)  {	$tmp=$s[3]; &mov ($s[2],$__s1);		}##%ecx
	elsif($i==2){	&movz	($tmp,&HB($s[3]));		}#%ebx[2]
	else        {	&mov	($tmp,$s[3]);
			&shr	($tmp,24)			}
			&xor	($out,&DWP(1,$te,$tmp,8));
	if ($i<2)   {	&mov	(&DWP(4+4*$i,"esp"),$out);	}
	if ($i==3)  {	&mov	($s[3],$acc);			}
			&comment();
}
815 | ||
# enclast($i,$te,@s) -- emit the code that computes output word $i of the
# last round.
#
# Same calling convention and result placement as the regular round step
# (four consecutive calls with $i=0..3 and a rotated register list; words
# 0 and 1 are spilled to 4(%esp)/8(%esp), word 2 stays in $acc, word 3 is
# built in $s[0]).  The difference is in how the table is used: each
# lookup is a 32-bit load at displacement 2 or 0 from the 8-byte entry,
# and a mask keeps exactly one byte lane of it (0x000000ff, 0x0000ff00,
# 0x00ff0000, 0xff000000), so every output byte comes from a single
# lookup.
sub enclast()
{ my ($i,$te,@s)=@_;
  my $tmp = $key;
  my $out = $i==3?$s[0]:$acc;

	if ($i==3)  {	&mov	($key,$__key);			}##%edx
	else        {	&mov	($out,$s[0]);			}
			&and	($out,0xFF);
	if ($i==1)  {	&shr	($s[0],16);			}#%ebx[1]
	if ($i==2)  {	&shr	($s[0],24);			}#%ecx[2]
			&mov	($out,&DWP(2,$te,$out,8));
			&and	($out,0x000000ff);

	if ($i==3)  {	$tmp=$s[1];				}##%eax
			&movz	($tmp,&HB($s[1]));
			&mov	($tmp,&DWP(0,$te,$tmp,8));
			&and	($tmp,0x0000ff00);
			&xor	($out,$tmp);

	if ($i==3)  {	$tmp=$s[2]; &mov ($s[1],$__s0);		}##%ebx
	else        {	&mov	($tmp,$s[2]);
			&shr	($tmp,16);			}
	if ($i==2)  {	&and	($s[1],0xFF);			}#%edx[2]
			&and	($tmp,0xFF);
			&mov	($tmp,&DWP(0,$te,$tmp,8));
			&and	($tmp,0x00ff0000);
			&xor	($out,$tmp);

	if ($i==3)  {	$tmp=$s[3]; &mov ($s[2],$__s1);		}##%ecx
	elsif($i==2){	&movz	($tmp,&HB($s[3]));		}#%ebx[2]
	else        {	&mov	($tmp,$s[3]);
			&shr	($tmp,24);			}
			&mov	($tmp,&DWP(2,$te,$tmp,8));
			&and	($tmp,0xff000000);
			&xor	($out,$tmp);
	if ($i<2)   {	&mov	(&DWP(4+4*$i,"esp"),$out);	}
	if ($i==3)  {	&mov	($s[3],$acc);			}
}
854 | ||
addb6e16 AP |
# _x86_AES_encrypt -- internal integer-only block function driven by the
# large lookup table.
#
#   in/out: $s0-$s3 = 128-bit state
#   in:     $key    = pointer to the key schedule; the round count is
#                     read from offset 240
#           $tbl    = table base handed to encstep()/enclast()
#   The caller must have allocated the stack frame: $__key, $__end and
#   the spill slots written by encstep()/enclast() are used.
#
# Two code shapes are generated:
#   $small_footprint  a loop executed rounds-1 times
#                     ($__end = $key + 16*(rounds-1));
#   otherwise         fully unrolled code entered at the "14rounds",
#                     "12rounds" or "10rounds" label depending on the
#                     round count.
# Either way the last round is emitted separately through enclast().
&function_begin_B("_x86_AES_encrypt");
	if ($vertical_spin) {
		# I need high parts of volatile registers to be accessible...
		# (the assignments also rebind the Perl-level register names
		# used for the code generated from here on)
		&exch	($s1="edi",$key="ebx");
		&mov	($s2="esi",$acc="ecx");
	}

	# note that caller is expected to allocate stack frame for me!
	&mov	($__key,$key);			# save key

	&xor	($s0,&DWP(0,$key));		# xor with key
	&xor	($s1,&DWP(4,$key));
	&xor	($s2,&DWP(8,$key));
	&xor	($s3,&DWP(12,$key));

	&mov	($acc,&DWP(240,$key));		# load key->rounds

	if ($small_footprint) {
	    # $__end = $key + 8*(2*rounds-2) = $key + 16*(rounds-1)
	    &lea	($acc,&DWP(-2,$acc,$acc));
	    &lea	($acc,&DWP(0,$key,$acc,8));
	    &mov	($__end,$acc);		# end of key schedule

	    &set_label("loop",16);
		if ($vertical_spin) {
		    &encvert($tbl,$s0,$s1,$s2,$s3);
		} else {
		    &encstep(0,$tbl,$s0,$s1,$s2,$s3);
		    &encstep(1,$tbl,$s1,$s2,$s3,$s0);
		    &encstep(2,$tbl,$s2,$s3,$s0,$s1);
		    &encstep(3,$tbl,$s3,$s0,$s1,$s2);
		}
		&add	($key,16);		# advance rd_key
		&xor	($s0,&DWP(0,$key));
		&xor	($s1,&DWP(4,$key));
		&xor	($s2,&DWP(8,$key));
		&xor	($s3,&DWP(12,$key));
		&cmp	($key,$__end);
		&mov	($__key,$key);		# mov leaves the flags alone
	    &jb		(&label("loop"));
	}
	else {
	    &cmp	($acc,10);
	    &jle	(&label("10rounds"));
	    &cmp	($acc,12);
	    &jle	(&label("12rounds"));

	# two extra rounds, then fall through
	&set_label("14rounds",4);
	    for ($i=1;$i<3;$i++) {
		if ($vertical_spin) {
		    &encvert($tbl,$s0,$s1,$s2,$s3);
		} else {
		    &encstep(0,$tbl,$s0,$s1,$s2,$s3);
		    &encstep(1,$tbl,$s1,$s2,$s3,$s0);
		    &encstep(2,$tbl,$s2,$s3,$s0,$s1);
		    &encstep(3,$tbl,$s3,$s0,$s1,$s2);
		}
		&xor	($s0,&DWP(16*$i+0,$key));
		&xor	($s1,&DWP(16*$i+4,$key));
		&xor	($s2,&DWP(16*$i+8,$key));
		&xor	($s3,&DWP(16*$i+12,$key));
	    }
	    &add	($key,32);
	    &mov	($__key,$key);		# advance rd_key
	# two extra rounds, then fall through
	&set_label("12rounds",4);
	    for ($i=1;$i<3;$i++) {
		if ($vertical_spin) {
		    &encvert($tbl,$s0,$s1,$s2,$s3);
		} else {
		    &encstep(0,$tbl,$s0,$s1,$s2,$s3);
		    &encstep(1,$tbl,$s1,$s2,$s3,$s0);
		    &encstep(2,$tbl,$s2,$s3,$s0,$s1);
		    &encstep(3,$tbl,$s3,$s0,$s1,$s2);
		}
		&xor	($s0,&DWP(16*$i+0,$key));
		&xor	($s1,&DWP(16*$i+4,$key));
		&xor	($s2,&DWP(16*$i+8,$key));
		&xor	($s3,&DWP(16*$i+12,$key));
	    }
	    &add	($key,32);
	    &mov	($__key,$key);		# advance rd_key
	# nine regular rounds; $key itself is not advanced here, the round
	# keys are reached through the 16*$i displacements instead
	&set_label("10rounds",4);
	    for ($i=1;$i<10;$i++) {
		if ($vertical_spin) {
		    &encvert($tbl,$s0,$s1,$s2,$s3);
		} else {
		    &encstep(0,$tbl,$s0,$s1,$s2,$s3);
		    &encstep(1,$tbl,$s1,$s2,$s3,$s0);
		    &encstep(2,$tbl,$s2,$s3,$s0,$s1);
		    &encstep(3,$tbl,$s3,$s0,$s1,$s2);
		}
		&xor	($s0,&DWP(16*$i+0,$key));
		&xor	($s1,&DWP(16*$i+4,$key));
		&xor	($s2,&DWP(16*$i+8,$key));
		&xor	($s3,&DWP(16*$i+12,$key));
	    }
	}

	if ($vertical_spin) {
	    # "reincarnate" some registers for "horizontal" spin...
	    &mov	($s1="ebx",$key="edi");
	    &mov	($s2="ecx",$acc="esi");
	}
	&enclast(0,$tbl,$s0,$s1,$s2,$s3);
	&enclast(1,$tbl,$s1,$s2,$s3,$s0);
	&enclast(2,$tbl,$s2,$s3,$s0,$s1);
	&enclast(3,$tbl,$s3,$s0,$s1,$s2);

	# step to the last round key: one slot further in the looped variant,
	# ten slots (160 bytes) in the unrolled one, where $key still points
	# at the start of the "10rounds" section
	&add	($key,$small_footprint?16:160);
	&xor	($s0,&DWP(0,$key));
	&xor	($s1,&DWP(4,$key));
	&xor	($s2,&DWP(8,$key));
	&xor	($s3,&DWP(12,$key));

	&ret	();
969 | ||
fc924142 | 970 | &set_label("AES_Te",64); # Yes! I keep it in the code segment! |
c8d5c71a AP |
971 | &_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6); |
972 | &_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591); | |
973 | &_data_word(0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56); | |
974 | &_data_word(0x19fefee7, 0x62d7d7b5, 0xe6abab4d, 0x9a7676ec); | |
975 | &_data_word(0x45caca8f, 0x9d82821f, 0x40c9c989, 0x877d7dfa); | |
976 | &_data_word(0x15fafaef, 0xeb5959b2, 0xc947478e, 0x0bf0f0fb); | |
977 | &_data_word(0xecadad41, 0x67d4d4b3, 0xfda2a25f, 0xeaafaf45); | |
978 | &_data_word(0xbf9c9c23, 0xf7a4a453, 0x967272e4, 0x5bc0c09b); | |
979 | &_data_word(0xc2b7b775, 0x1cfdfde1, 0xae93933d, 0x6a26264c); | |
980 | &_data_word(0x5a36366c, 0x413f3f7e, 0x02f7f7f5, 0x4fcccc83); | |
981 | &_data_word(0x5c343468, 0xf4a5a551, 0x34e5e5d1, 0x08f1f1f9); | |
982 | &_data_word(0x937171e2, 0x73d8d8ab, 0x53313162, 0x3f15152a); | |
983 | &_data_word(0x0c040408, 0x52c7c795, 0x65232346, 0x5ec3c39d); | |
984 | &_data_word(0x28181830, 0xa1969637, 0x0f05050a, 0xb59a9a2f); | |
985 | &_data_word(0x0907070e, 0x36121224, 0x9b80801b, 0x3de2e2df); | |
986 | &_data_word(0x26ebebcd, 0x6927274e, 0xcdb2b27f, 0x9f7575ea); | |
987 | &_data_word(0x1b090912, 0x9e83831d, 0x742c2c58, 0x2e1a1a34); | |
988 | &_data_word(0x2d1b1b36, 0xb26e6edc, 0xee5a5ab4, 0xfba0a05b); | |
989 | &_data_word(0xf65252a4, 0x4d3b3b76, 0x61d6d6b7, 0xceb3b37d); | |
990 | &_data_word(0x7b292952, 0x3ee3e3dd, 0x712f2f5e, 0x97848413); | |
991 | &_data_word(0xf55353a6, 0x68d1d1b9, 0x00000000, 0x2cededc1); | |
992 | &_data_word(0x60202040, 0x1ffcfce3, 0xc8b1b179, 0xed5b5bb6); | |
993 | &_data_word(0xbe6a6ad4, 0x46cbcb8d, 0xd9bebe67, 0x4b393972); | |
994 | &_data_word(0xde4a4a94, 0xd44c4c98, 0xe85858b0, 0x4acfcf85); | |
995 | &_data_word(0x6bd0d0bb, 0x2aefefc5, 0xe5aaaa4f, 0x16fbfbed); | |
996 | &_data_word(0xc5434386, 0xd74d4d9a, 0x55333366, 0x94858511); | |
997 | &_data_word(0xcf45458a, 0x10f9f9e9, 0x06020204, 0x817f7ffe); | |
998 | &_data_word(0xf05050a0, 0x443c3c78, 0xba9f9f25, 0xe3a8a84b); | |
999 | &_data_word(0xf35151a2, 0xfea3a35d, 0xc0404080, 0x8a8f8f05); | |
1000 | &_data_word(0xad92923f, 0xbc9d9d21, 0x48383870, 0x04f5f5f1); | |
1001 | &_data_word(0xdfbcbc63, 0xc1b6b677, 0x75dadaaf, 0x63212142); | |
1002 | &_data_word(0x30101020, 0x1affffe5, 0x0ef3f3fd, 0x6dd2d2bf); | |
1003 | &_data_word(0x4ccdcd81, 0x140c0c18, 0x35131326, 0x2fececc3); | |
1004 | &_data_word(0xe15f5fbe, 0xa2979735, 0xcc444488, 0x3917172e); | |
1005 | &_data_word(0x57c4c493, 0xf2a7a755, 0x827e7efc, 0x473d3d7a); | |
1006 | &_data_word(0xac6464c8, 0xe75d5dba, 0x2b191932, 0x957373e6); | |
1007 | &_data_word(0xa06060c0, 0x98818119, 0xd14f4f9e, 0x7fdcdca3); | |
1008 | &_data_word(0x66222244, 0x7e2a2a54, 0xab90903b, 0x8388880b); | |
1009 | &_data_word(0xca46468c, 0x29eeeec7, 0xd3b8b86b, 0x3c141428); | |
1010 | &_data_word(0x79dedea7, 0xe25e5ebc, 0x1d0b0b16, 0x76dbdbad); | |
1011 | &_data_word(0x3be0e0db, 0x56323264, 0x4e3a3a74, 0x1e0a0a14); | |
1012 | &_data_word(0xdb494992, 0x0a06060c, 0x6c242448, 0xe45c5cb8); | |
1013 | &_data_word(0x5dc2c29f, 0x6ed3d3bd, 0xefacac43, 0xa66262c4); | |
1014 | &_data_word(0xa8919139, 0xa4959531, 0x37e4e4d3, 0x8b7979f2); | |
1015 | &_data_word(0x32e7e7d5, 0x43c8c88b, 0x5937376e, 0xb76d6dda); | |
1016 | &_data_word(0x8c8d8d01, 0x64d5d5b1, 0xd24e4e9c, 0xe0a9a949); | |
1017 | &_data_word(0xb46c6cd8, 0xfa5656ac, 0x07f4f4f3, 0x25eaeacf); | |
1018 | &_data_word(0xaf6565ca, 0x8e7a7af4, 0xe9aeae47, 0x18080810); | |
1019 | &_data_word(0xd5baba6f, 0x887878f0, 0x6f25254a, 0x722e2e5c); | |
1020 | &_data_word(0x241c1c38, 0xf1a6a657, 0xc7b4b473, 0x51c6c697); | |
1021 | &_data_word(0x23e8e8cb, 0x7cdddda1, 0x9c7474e8, 0x211f1f3e); | |
1022 | &_data_word(0xdd4b4b96, 0xdcbdbd61, 0x868b8b0d, 0x858a8a0f); | |
1023 | &_data_word(0x907070e0, 0x423e3e7c, 0xc4b5b571, 0xaa6666cc); | |
1024 | &_data_word(0xd8484890, 0x05030306, 0x01f6f6f7, 0x120e0e1c); | |
1025 | &_data_word(0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969); | |
1026 | &_data_word(0x91868617, 0x58c1c199, 0x271d1d3a, 0xb99e9e27); | |
1027 | &_data_word(0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122); | |
1028 | &_data_word(0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433); | |
1029 | &_data_word(0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9); | |
1030 | &_data_word(0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5); | |
1031 | &_data_word(0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a); | |
1032 | &_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0); | |
1033 | &_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e); | |
1034 | &_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c); | |
22c268e6 AP |
1035 | |
1036 | #Te4 # four copies of Te4 to choose from to avoid L1 aliasing | |
1037 | &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); | |
1038 | &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); | |
1039 | &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); | |
1040 | &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); | |
1041 | &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); | |
1042 | &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); | |
1043 | &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); | |
1044 | &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); | |
1045 | &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); | |
1046 | &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); | |
1047 | &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); | |
1048 | &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); | |
1049 | &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); | |
1050 | &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); | |
1051 | &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); | |
1052 | &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); | |
1053 | &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); | |
1054 | &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); | |
1055 | &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); | |
1056 | &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); | |
1057 | &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); | |
1058 | &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); | |
1059 | &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); | |
1060 | &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); | |
1061 | &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); | |
1062 | &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); | |
1063 | &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); | |
1064 | &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); | |
1065 | &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); | |
1066 | &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); | |
1067 | &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); | |
1068 | &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); | |
1069 | ||
1070 | &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); | |
1071 | &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); | |
1072 | &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); | |
1073 | &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); | |
1074 | &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); | |
1075 | &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); | |
1076 | &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); | |
1077 | &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); | |
1078 | &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); | |
1079 | &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); | |
1080 | &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); | |
1081 | &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); | |
1082 | &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); | |
1083 | &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); | |
1084 | &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); | |
1085 | &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); | |
1086 | &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); | |
1087 | &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); | |
1088 | &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); | |
1089 | &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); | |
1090 | &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); | |
1091 | &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); | |
1092 | &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); | |
1093 | &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); | |
1094 | &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); | |
1095 | &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); | |
1096 | &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); | |
1097 | &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); | |
1098 | &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); | |
1099 | &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); | |
1100 | &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); | |
1101 | &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); | |
1102 | ||
1103 | &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); | |
1104 | &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); | |
1105 | &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); | |
1106 | &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); | |
1107 | &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); | |
1108 | &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); | |
1109 | &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); | |
1110 | &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); | |
1111 | &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); | |
1112 | &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); | |
1113 | &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); | |
1114 | &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); | |
1115 | &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); | |
1116 | &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); | |
1117 | &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); | |
1118 | &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); | |
1119 | &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); | |
1120 | &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); | |
1121 | &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); | |
1122 | &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); | |
1123 | &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); | |
1124 | &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); | |
1125 | &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); | |
1126 | &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); | |
1127 | &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); | |
1128 | &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); | |
1129 | &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); | |
1130 | &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); | |
1131 | &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); | |
1132 | &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); | |
1133 | &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); | |
1134 | &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); | |
1135 | ||
af8c1d81 AP |
1136 | &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); |
1137 | &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); | |
1138 | &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); | |
1139 | &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); | |
1140 | &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); | |
1141 | &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); | |
1142 | &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); | |
1143 | &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); | |
1144 | &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); | |
1145 | &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); | |
1146 | &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); | |
1147 | &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); | |
1148 | &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); | |
1149 | &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); | |
1150 | &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); | |
1151 | &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); | |
1152 | &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); | |
1153 | &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); | |
1154 | &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); | |
1155 | &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); | |
1156 | &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); | |
1157 | &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); | |
1158 | &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); | |
1159 | &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); | |
1160 | &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); | |
1161 | &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); | |
1162 | &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); | |
1163 | &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); | |
1164 | &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); | |
1165 | &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); | |
1166 | &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); | |
1167 | &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); | |
e7e11507 AP |
1168 | #rcon: |
1169 | &data_word(0x00000001, 0x00000002, 0x00000004, 0x00000008); | |
c8d5c71a | 1170 | &data_word(0x00000010, 0x00000020, 0x00000040, 0x00000080); |
fc924142 AP |
1171 | &data_word(0x0000001b, 0x00000036, 0x00000000, 0x00000000); |
1172 | &data_word(0x00000000, 0x00000000, 0x00000000, 0x00000000); | |
addb6e16 | 1173 | &function_end_B("_x86_AES_encrypt"); |
71314710 | 1174 | |
# void AES_encrypt (const void *inp,void *out,const AES_KEY *key);
#
# Public entry point.  It
#   1. builds a private, cache-line aligned stack frame whose position
#      is derived from the address of the key schedule,
#   2. locates the AES_Te table position-independently and picks one of
#      the four 256-byte table copies that follow it,
#   3. unless built with $x86only, tests bit 25 of OPENSSL_ia32cap_P and
#      takes the MMX/SSE routine when it is set,
#   4. otherwise loads the block into $s0-$s3 and calls the integer
#      compact routine,
# then restores %esp and stores the 16-byte result to out.
&function_begin("AES_encrypt");
	&mov	($acc,&wparam(0));		# load inp
	&mov	($key,&wparam(2));		# load key

	&mov	($s0,"esp");			# remember caller's %esp
	&sub	("esp",36);
	&and	("esp",-64);			# align to cache-line

	# place stack frame just "above" the key schedule
	&lea	($s1,&DWP(-64-63,$key));
	&sub	($s1,"esp");
	&neg	($s1);
	&and	($s1,0x3C0);	# modulo 1024, but aligned to cache-line
	&sub	("esp",$s1);
	&add	("esp",4);	# 4 is reserved for caller's return address
	&mov	($_esp,$s0);			# save stack pointer

	# call/pop leaves the address of pic_point in $tbl
	&call	(&label("pic_point"));		# make it PIC!
	&set_label("pic_point");
	&blindpop($tbl);
	&picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if (!$x86only);
	&lea	($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));

	# pick Te4 copy which can't "overlap" with stack frame or key schedule
	# ($s1 & 0x300 selects one of four copies 256 bytes apart; 2048 skips
	# what precedes the byte tables at AES_Te and 128 is the bias undone
	# by the -128 displacements in the compact routines)
	&lea	($s1,&DWP(768-4,"esp"));
	&sub	($s1,$tbl);
	&and	($s1,0x300);
	&lea	($tbl,&DWP(2048+128,$tbl,$s1));

					if (!$x86only) {
	&bt	(&DWP(0,$s0),25);		# check for SSE bit
	&jnc	(&label("x86"));

	&movq	("mm0",&QWP(0,$acc));		# load input data
	&movq	("mm4",&QWP(8,$acc));
	&call	("_sse_AES_encrypt_compact");
	&mov	("esp",$_esp);			# restore stack pointer
	&mov	($acc,&wparam(1));		# load out
	&movq	(&QWP(0,$acc),"mm0");		# write output data
	&movq	(&QWP(8,$acc),"mm4");
	&emms	();				# leave MMX state
	&function_end_A();
					}
	&set_label("x86",16);
	&mov	($_tbl,$tbl);
	&mov	($s0,&DWP(0,$acc));		# load input data
	&mov	($s1,&DWP(4,$acc));
	&mov	($s2,&DWP(8,$acc));
	&mov	($s3,&DWP(12,$acc));
	&call	("_x86_AES_encrypt_compact");
	&mov	("esp",$_esp);			# restore stack pointer
	&mov	($acc,&wparam(1));		# load out
	&mov	(&DWP(0,$acc),$s0);		# write output data
	&mov	(&DWP(4,$acc),$s1);
	&mov	(&DWP(8,$acc),$s2);
	&mov	(&DWP(12,$acc),$s3);
&function_end("AES_encrypt");
71314710 | 1233 | |
22c268e6 | 1234 | #--------------------------------------------------------------------# |
71314710 | 1235 | |
22c268e6 AP |
1236 | ###################################################################### |
1237 | # "Compact" block function | |
1238 | ###################################################################### | |
71314710 | 1239 | |
# deccompact($i,$td,@s[,extra...]) -- emit the code that computes output
# word $i of one "compact" decryption substitution pass.
#
#   $i   step number, 0..3
#   $td  register holding the byte table pointer; the table is addressed
#        with a -128 displacement
#   @s   the four state registers, rotated by the caller so that
#        $s[0] supplies byte 0, $s[1] byte 1, $s[2] byte 2, $s[3] byte 3
#
# Result placement: steps 0 and 1 spill their word to 4(%esp)/8(%esp),
# step 2 leaves it in $acc, step 3 builds it in $s[0] and copies $acc
# into $s[1].
#
# Any argument beyond the usual six is dropped and switches $Fn to a
# no-op, which suppresses the three reloads done in step 3 ($key from
# $__key, $s[2] from $__s1, $s[3] from $__s0).
sub deccompact()
{ my $Fn = \&mov;
  while ($#_>5) { pop(@_); $Fn=sub{}; }
  my ($i,$td,@s)=@_;
  my $tmp = $key;
  my $out = $i==3?$s[0]:$acc;

	# $Fn is used in first compact round and its purpose is to
	# void restoration of some values from stack, so that after
	# 4xdeccompact with extra argument $key, $s0 and $s1 values
	# are left there...
	if($i==3)   {	&$Fn	($key,$__key);			}
	else        {	&mov	($out,$s[0]);			}
			&and	($out,0xFF);
			&movz	($out,&BP(-128,$td,$out,1));

	if ($i==3)  {	$tmp=$s[1];				}
			&movz	($tmp,&HB($s[1]));
			&movz	($tmp,&BP(-128,$td,$tmp,1));
			&shl	($tmp,8);
			&xor	($out,$tmp);

	if ($i==3)  {	$tmp=$s[2]; &mov ($s[1],$acc);		}
	else        {	&mov	($tmp,$s[2]);			}
			&shr	($tmp,16);
			&and	($tmp,0xFF);
			&movz	($tmp,&BP(-128,$td,$tmp,1));
			&shl	($tmp,16);
			&xor	($out,$tmp);

	if ($i==3)  {	$tmp=$s[3]; &$Fn ($s[2],$__s1);		}
	else        {	&mov	($tmp,$s[3]);			}
			&shr	($tmp,24);
			&movz	($tmp,&BP(-128,$td,$tmp,1));
			&shl	($tmp,24);
			&xor	($out,$tmp);
	if ($i<2)   {	&mov	(&DWP(4+4*$i,"esp"),$out);	}
	if ($i==3)  {	&$Fn	($s[3],$__s0);			}
}
1279 | ||
# dectransform($i) - emit the code that rewrites state word $s[$i] in
# place as
#
#	ROTATE(tp1,8) ^ (tp2^tp1) ^ (tp4^tp1) ^ tp8
#	  ^ ROTATE(tp8^tp2^tp1,24) ^ ROTATE(tp8^tp4^tp1,16) ^ ROTATE(tp8,8)
#
# where tp1 is the incoming word and tp2, tp4, tp8 are obtained by three
# successive packed doublings: every byte is shifted left by one and
# 0x1b is xor-ed into those bytes whose top bit was set (presumably the
# usual AES field doubling, i.e. the inverse column mix - confirm
# against the C reference).
#
# Scratch registers: $key and $acc are temporaries, tp8 lives in $tbl
# (so the caller has to reload $tbl afterwards), and tp2/tp4 borrow two
# of the other state registers.  That borrowing only works if those
# registers hold nothing live at that point, hence:
#
# must be called with 2,3,0,1 as argument sequence!!!
#
# Words 2 and 3 are written to 4+4*$i(%esp) once transformed.  $s0 and
# $s1 are loaded from $__s0/$__s1 during the $i==2 and $i==3 calls, and
# $s2/$s3 are reloaded from $__s2/$__s3 at the end of the $i==1 call.
sub dectransform()
{ my @s = ($s0,$s1,$s2,$s3);
  my $i = shift;
  my $tmp = $key;
  # plain element access; a one-element slice in scalar assignment
  # yields the same register but trips Perl's "better written as" warning
  my $tp2 = $s[($i+2)%4]; $tp2 = $s[2] if ($i==1);
  my $tp4 = $s[($i+3)%4]; $tp4 = $s[3] if ($i==1);
  my $tp8 = $tbl;

	# tp2 = packed double of tp1
	&mov	($tmp,0x80808080);
	&and	($tmp,$s[$i]);
	&mov	($acc,$tmp);
	&shr	($tmp,7);
	&lea	($tp2,&DWP(0,$s[$i],$s[$i]));
	&sub	($acc,$tmp);
	&and	($tp2,0xfefefefe);
	&and	($acc,0x1b1b1b1b);
	&xor	($tp2,$acc);
	&mov	($tmp,0x80808080);

	# tp4 = packed double of tp2
	&and	($tmp,$tp2);
	&mov	($acc,$tmp);
	&shr	($tmp,7);
	&lea	($tp4,&DWP(0,$tp2,$tp2));
	&sub	($acc,$tmp);
	&and	($tp4,0xfefefefe);
	&and	($acc,0x1b1b1b1b);
	 &xor	($tp2,$s[$i]);	# tp2^tp1
	&xor	($tp4,$acc);
	&mov	($tmp,0x80808080);

	# tp8 = packed double of tp4
	&and	($tmp,$tp4);
	&mov	($acc,$tmp);
	&shr	($tmp,7);
	&lea	($tp8,&DWP(0,$tp4,$tp4));
	&sub	($acc,$tmp);
	&and	($tp8,0xfefefefe);
	&and	($acc,0x1b1b1b1b);
	 &xor	($tp4,$s[$i]);	# tp4^tp1
	 &rotl	($s[$i],8);	# = ROTATE(tp1,8)
	&xor	($tp8,$acc);

	# fold everything into $s[$i]
	&xor	($s[$i],$tp2);
	&xor	($tp2,$tp8);
	&xor	($s[$i],$tp4);
	&xor	($tp4,$tp8);
	&rotl	($tp2,24);
	&xor	($s[$i],$tp8);	# ^= tp8^(tp4^tp1)^(tp2^tp1)
	&rotl	($tp4,16);
	&xor	($s[$i],$tp2);	# ^= ROTATE(tp8^tp2^tp1,24)
	&rotl	($tp8,8);
	&xor	($s[$i],$tp4);	# ^= ROTATE(tp8^tp4^tp1,16)
	 &mov	($s[0],$__s0)			if($i==2); #prefetch $s0
	 &mov	($s[1],$__s1)			if($i==3); #prefetch $s1
	 &mov	($s[2],$__s2)			if($i==1);
	&xor	($s[$i],$tp8);	# ^= ROTATE(tp8,8)

	&mov	($s[3],$__s3)			if($i==1);
	&mov	(&DWP(4+4*$i,"esp"),$s[$i])	if($i>=2);
}
1340 | ||
# _x86_AES_decrypt_compact - integer-register "compact" decryption body.
# On entry $s0..$s3 hold the input block, $key the key schedule and $tbl
# the byte table (addressed with a -128 bias).  The state is first
# xor-ed with the first 16 bytes of the schedule; every loop iteration
# then emits four deccompact columns, four dectransform steps and a
# round-key xor until $key reaches $__end (= $key + 16*(rounds-1)).  A
# final deccompact pass plus one more key xor finishes the block, which
# is returned in $s0..$s3.
&function_begin_B("_x86_AES_decrypt_compact");
{
	my @st = ($s0,$s1,$s2,$s3);

	# xor the four state words with the 16 bytes at $off($key)
	my $xor_rk = sub {
		my $off = shift;
		foreach my $w (0..3) {
			&xor	($st[$w],&DWP($off+4*$w,$key));
		}
	};

	# columns 0..3 in order; each call sees the state rotated so
	# that its own word comes first, followed by words -1, -2, -3
	my $columns = sub {
		my @extra = @_;
		foreach my $c (0..3) {
			&deccompact($c,$tbl,$st[$c],$st[($c+3)%4],
					    $st[($c+2)%4],$st[($c+1)%4],@extra);
		}
	};

	# note that caller is expected to allocate stack frame for me!
	&mov	($__key,$key);			# save key

	$xor_rk->(0);				# xor with key

	&mov	($acc,&DWP(240,$key));		# load key->rounds

	&lea	($acc,&DWP(-2,$acc,$acc));
	&lea	($acc,&DWP(0,$key,$acc,8));
	&mov	($__end,$acc);			# end of key schedule

	# prefetch Td4: one load per 32 bytes, results discarded,
	# destinations alternating $key,$acc
	foreach my $line (0..7) {
		&mov	(($line&1) ? $acc : $key,&DWP(32*$line-128,$tbl));
	}

	&set_label("loop",16);

		$columns->(1);			# extra arg: skip stack reloads
		foreach my $w (2,3,0,1) {	# order required by dectransform
			&dectransform($w);
		}
		&mov	($key,$__key);
		&mov	($tbl,$__tbl);		# dectransform used $tbl as scratch
		&add	($key,16);		# advance rd_key
		$xor_rk->(0);

		&cmp	($key,$__end);
		&mov	($__key,$key);
		&jb	(&label("loop"));

	$columns->();

	$xor_rk->(16);

	&ret	();
}
&function_end_B("_x86_AES_decrypt_compact");
1400 | ||
1401 | ###################################################################### | |
fc924142 | 1402 | # "Compact" SSE block function. |
22c268e6 AP |
1403 | ###################################################################### |
1404 | ||
# sse_deccompact() - emit one table-substitution pass over the 16-byte
# state held in mm0 (bytes 0..7) and mm4 (bytes 8..15).
#
# Byte groups are moved into eax/ebx with pshufw+movd, each byte is
# replaced by the table byte at -128($tbl,index), and the results are
# assembled into four 32-bit words (byte numbers as in the trailing
# comments, lowest lane first):
#	t[0] = 0,13,10, 7	t[1] = 4, 1,14,11
#	t[2] = 8, 5, 2,15	t[3] = 12, 9, 6, 3
# On exit mm0 = t[0,1] and mm4 = t[2,3].
#
# $key and $acc serve as index/scratch registers, so $key is saved to
# $__key first and reloaded just before the end.  eax, ebx, ecx, edx,
# mm1, mm2, mm5 and mm6 are overwritten.  The lookups are interleaved
# with the shuffles and shifts; keep the instruction order as is.
fc924142 | 1405 | sub sse_deccompact() |
22c268e6 AP |
1406 | { |
1407 | &pshufw ("mm1","mm0",0x0c); # 7, 6, 1, 0 | |
89f1eb82 | 1408 | &pshufw ("mm5","mm4",0x09); # 13,12,11,10 |
22c268e6 | 1409 | &movd ("eax","mm1"); # 7, 6, 1, 0 |
89f1eb82 AP |
1410 | &movd ("ebx","mm5"); # 13,12,11,10 |
1411 | &mov ($__key,$key); | |
22c268e6 | 1412 | |
53154d71 | 1413 | &movz ($acc,&LB("eax")); # 0 |
22c268e6 | 1414 | &movz ("edx",&HB("eax")); # 1 |
89f1eb82 AP |
1415 | &pshufw ("mm2","mm0",0x06); # 3, 2, 5, 4 |
1416 | &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0 | |
1417 | &movz ($key,&LB("ebx")); # 10 | |
53154d71 | 1418 | &movz ("edx",&BP(-128,$tbl,"edx",1)); # 1 |
89f1eb82 | 1419 | &shr ("eax",16); # 7, 6 |
22c268e6 AP |
1420 | &shl ("edx",8); # 1 |
1421 | ||
89f1eb82 AP |
1422 | &movz ($acc,&BP(-128,$tbl,$key,1)); # 10 |
1423 | &movz ($key,&HB("ebx")); # 11 | |
53154d71 | 1424 | &shl ($acc,16); # 10 |
89f1eb82 | 1425 | &pshufw ("mm6","mm4",0x03); # 9, 8,15,14 |
53154d71 | 1426 | &or ("ecx",$acc); # 10 |
89f1eb82 AP |
1427 | &movz ($acc,&BP(-128,$tbl,$key,1)); # 11 |
1428 | &movz ($key,&HB("eax")); # 7 | |
53154d71 | 1429 | &shl ($acc,24); # 11 |
22c268e6 | 1430 | &shr ("ebx",16); # 13,12 |
89f1eb82 | 1431 | &or ("edx",$acc); # 11 |
22c268e6 | 1432 | |
89f1eb82 AP |
1433 | &movz ($acc,&BP(-128,$tbl,$key,1)); # 7 |
1434 | &movz ($key,&HB("ebx")); # 13 | |
53154d71 AP |
1435 | &shl ($acc,24); # 7 |
1436 | &or ("ecx",$acc); # 7 | |
89f1eb82 AP |
1437 | &movz ($acc,&BP(-128,$tbl,$key,1)); # 13 |
1438 | &movz ($key,&LB("eax")); # 6 | |
53154d71 | 1439 | &shl ($acc,8); # 13 |
89f1eb82 | 1440 | &movd ("eax","mm2"); # 3, 2, 5, 4 |
53154d71 | 1441 | &or ("ecx",$acc); # 13 |
22c268e6 | 1442 | |
89f1eb82 AP |
1443 | &movz ($acc,&BP(-128,$tbl,$key,1)); # 6 |
1444 | &movz ($key,&LB("ebx")); # 12 | |
1445 | &shl ($acc,16); # 6 | |
22c268e6 | 1446 | &movd ("ebx","mm6"); # 9, 8,15,14 |
89f1eb82 AP |
# ecx now holds t[0]; it is moved out and ecx is reused for t[3]
1447 | &movd ("mm0","ecx"); # t[0] collected |
1448 | &movz ("ecx",&BP(-128,$tbl,$key,1)); # 12 | |
1449 | &movz ($key,&LB("eax")); # 4 | |
53154d71 AP |
1450 | &or ("ecx",$acc); # 12 |
1451 | ||
89f1eb82 AP |
1452 | &movz ($acc,&BP(-128,$tbl,$key,1)); # 4 |
1453 | &movz ($key,&LB("ebx")); # 14 | |
53154d71 | 1454 | &or ("edx",$acc); # 4 |
89f1eb82 AP |
1455 | &movz ($acc,&BP(-128,$tbl,$key,1)); # 14 |
1456 | &movz ($key,&HB("eax")); # 5 | |
53154d71 | 1457 | &shl ($acc,16); # 14 |
89f1eb82 | 1458 | &shr ("eax",16); # 3, 2 |
53154d71 | 1459 | &or ("edx",$acc); # 14 |
22c268e6 | 1460 | |
89f1eb82 AP |
1461 | &movz ($acc,&BP(-128,$tbl,$key,1)); # 5 |
1462 | &movz ($key,&HB("ebx")); # 15 | |
22c268e6 | 1463 | &shr ("ebx",16); # 9, 8 |
89f1eb82 AP |
1464 | &shl ($acc,8); # 5 |
# edx now holds t[1]; it is moved out and edx is reused for t[2]
1465 | &movd ("mm1","edx"); # t[1] collected | |
1466 | &movz ("edx",&BP(-128,$tbl,$key,1)); # 15 | |
1467 | &movz ($key,&HB("ebx")); # 9 | |
1468 | &shl ("edx",24); # 15 | |
1469 | &and ("ebx",0xff); # 8 | |
1470 | &or ("edx",$acc); # 15 | |
22c268e6 AP |
1471 | |
1472 | &punpckldq ("mm0","mm1"); # t[0,1] collected | |
1473 | ||
89f1eb82 AP |
1474 | &movz ($acc,&BP(-128,$tbl,$key,1)); # 9 |
1475 | &movz ($key,&LB("eax")); # 2 | |
53154d71 | 1476 | &shl ($acc,8); # 9 |
89f1eb82 | 1477 | &movz ("eax",&HB("eax")); # 3 |
53154d71 | 1478 | &movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 8 |
89f1eb82 AP |
1479 | &or ("ecx",$acc); # 9 |
1480 | &movz ($acc,&BP(-128,$tbl,$key,1)); # 2 | |
22c268e6 | 1481 | &or ("edx","ebx"); # 8 |
53154d71 | 1482 | &shl ($acc,16); # 2 |
53154d71 | 1483 | &movz ("eax",&BP(-128,$tbl,"eax",1)); # 3 |
89f1eb82 | 1484 | &or ("edx",$acc); # 2 |
22c268e6 AP |
1485 | &shl ("eax",24); # 3 |
1486 | &or ("ecx","eax"); # 3 | |
89f1eb82 AP |
# all lookups done: give $key back to the caller
1487 | &mov ($key,$__key); |
1488 | &movd ("mm4","edx"); # t[2] collected | |
22c268e6 AP |
1489 | &movd ("mm5","ecx"); # t[3] collected |
1490 | ||
1491 | &punpckldq ("mm4","mm5"); # t[2,3] collected | |
1492 | } | |
1493 | ||
# _sse_AES_decrypt_compact - MMX/SSE "compact" decryption body, emitted
# only when $x86only is false.  The block is carried in mm0 (bytes 0..7)
# and mm4 (bytes 8..15); $key addresses the key schedule and $tbl the
# byte table (-128 bias).
#
# Flow: xor the state with the first 16 bytes of the schedule, compute
# $__end = $key + 16*(rounds-1), store 0x1b1b1b1b twice to form a
# quadword constant at 8(%esp), touch the table, then loop:
#	sse_deccompact; $key += 16; leave the loop once $key is above
#	$__end (unsigned); otherwise combine tp0 (the substituted
#	state) with its packed doublings tp2/tp4/tp8 and their
#	rotations, xor in the 16 bytes at $key, and repeat.
# At "out" the state is xor-ed with the 16 bytes at $key and the
# function returns with the result in mm0/mm4.
#
# Every step is issued twice, once for the mm0..mm3 half and once for
# the mm4..mm7 half.  A packed doubling is: pcmpgtb against zero builds
# a mask of the bytes whose top bit is set, pand keeps 0x1b in those
# bytes, paddb doubles each byte, pxor merges the two.
6c69aa53 | 1494 | if (!$x86only) { |
fc924142 | 1495 | &function_begin_B("_sse_AES_decrypt_compact"); |
22c268e6 AP |
1496 | &pxor ("mm0",&QWP(0,$key)); # 7, 6, 5, 4, 3, 2, 1, 0 |
1497 | &pxor ("mm4",&QWP(8,$key)); # 15,14,13,12,11,10, 9, 8 | |
1498 | ||
1499 | # note that caller is expected to allocate stack frame for me! | |
1500 | &mov ($acc,&DWP(240,$key)); # load key->rounds | |
1501 | &lea ($acc,&DWP(-2,$acc,$acc)); | |
1502 | &lea ($acc,&DWP(0,$key,$acc,8)); | |
6c69aa53 | 1503 | &mov ($__end,$acc); # end of key schedule |
22c268e6 AP |
1504 | |
1505 | &mov ($s0,0x1b1b1b1b); # magic constant | |
1506 | &mov (&DWP(8,"esp"),$s0); | |
1507 | &mov (&DWP(12,"esp"),$s0); | |
1508 | ||
1509 | # prefetch Td4 | |
1510 | &mov ($s0,&DWP(0-128,$tbl)); | |
1511 | &mov ($s1,&DWP(32-128,$tbl)); | |
1512 | &mov ($s2,&DWP(64-128,$tbl)); | |
1513 | &mov ($s3,&DWP(96-128,$tbl)); | |
1514 | &mov ($s0,&DWP(128-128,$tbl)); | |
1515 | &mov ($s1,&DWP(160-128,$tbl)); | |
1516 | &mov ($s2,&DWP(192-128,$tbl)); | |
1517 | &mov ($s3,&DWP(224-128,$tbl)); | |
1518 | ||
22c268e6 | 1519 | &set_label("loop",16); |
fc924142 | 1520 | &sse_deccompact(); |
22c268e6 | 1521 | &add ($key,16); |
6c69aa53 | 1522 | &cmp ($key,$__end); |
22c268e6 AP |
1523 | &ja (&label("out")); |
1524 | ||
1525 | # ROTATE(x^y,N) == ROTATE(x,N)^ROTATE(y,N) | |
1526 | &movq ("mm3","mm0"); &movq ("mm7","mm4"); | |
1527 | &movq ("mm2","mm0",1); &movq ("mm6","mm4",1); | |
1528 | &movq ("mm1","mm0"); &movq ("mm5","mm4"); | |
1529 | &pshufw ("mm0","mm0",0xb1); &pshufw ("mm4","mm4",0xb1);# = ROTATE(tp0,16) | |
1530 | &pslld ("mm2",8); &pslld ("mm6",8); | |
1531 | &psrld ("mm3",8); &psrld ("mm7",8); | |
1532 | &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= tp0<<8 | |
1533 | &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp0>>8 | |
1534 | &pslld ("mm2",16); &pslld ("mm6",16); | |
1535 | &psrld ("mm3",16); &psrld ("mm7",16); | |
1536 | &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= tp0<<24 | |
1537 | &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp0>>24 | |
1538 | ||
# mm1/mm5 still hold tp0; double them into tp2 using the 0x1b constant
1539 | &movq ("mm3",&QWP(8,"esp")); | |
1540 | &pxor ("mm2","mm2"); &pxor ("mm6","mm6"); | |
1541 | &pcmpgtb("mm2","mm1"); &pcmpgtb("mm6","mm5"); | |
1542 | &pand ("mm2","mm3"); &pand ("mm6","mm3"); | |
1543 | &paddb ("mm1","mm1"); &paddb ("mm5","mm5"); | |
1544 | &pxor ("mm1","mm2"); &pxor ("mm5","mm6"); # tp2 | |
1545 | &movq ("mm3","mm1"); &movq ("mm7","mm5"); | |
1546 | &movq ("mm2","mm1"); &movq ("mm6","mm5"); | |
1547 | &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp2 | |
1548 | &pslld ("mm3",24); &pslld ("mm7",24); | |
1549 | &psrld ("mm2",8); &psrld ("mm6",8); | |
1550 | &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp2<<24 | |
1551 | &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= tp2>>8 | |
1552 | ||
# second doubling: tp2 -> tp4; mm2 keeps the constant for the third one too
1553 | &movq ("mm2",&QWP(8,"esp")); | |
1554 | &pxor ("mm3","mm3"); &pxor ("mm7","mm7"); | |
1555 | &pcmpgtb("mm3","mm1"); &pcmpgtb("mm7","mm5"); | |
1556 | &pand ("mm3","mm2"); &pand ("mm7","mm2"); | |
1557 | &paddb ("mm1","mm1"); &paddb ("mm5","mm5"); | |
1558 | &pxor ("mm1","mm3"); &pxor ("mm5","mm7"); # tp4 | |
1559 | &pshufw ("mm3","mm1",0xb1); &pshufw ("mm7","mm5",0xb1); | |
1560 | &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp4 | |
609b0852 | 1561 | &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= ROTATE(tp4,16) |
22c268e6 AP |
1562 | |
# third doubling: tp4 -> tp8
1563 | &pxor ("mm3","mm3"); &pxor ("mm7","mm7"); | |
1564 | &pcmpgtb("mm3","mm1"); &pcmpgtb("mm7","mm5"); | |
1565 | &pand ("mm3","mm2"); &pand ("mm7","mm2"); | |
1566 | &paddb ("mm1","mm1"); &paddb ("mm5","mm5"); | |
1567 | &pxor ("mm1","mm3"); &pxor ("mm5","mm7"); # tp8 | |
1568 | &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8 | |
1569 | &movq ("mm3","mm1"); &movq ("mm7","mm5"); | |
1570 | &pshufw ("mm2","mm1",0xb1); &pshufw ("mm6","mm5",0xb1); | |
1571 | &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= ROTATE(tp8,16) | |
1572 | &pslld ("mm1",8); &pslld ("mm5",8); | |
1573 | &psrld ("mm3",8); &psrld ("mm7",8); | |
# round key is fetched early into mm2/mm6; the $s0..$s3 loads below read
# the table at 64-byte steps and their results are not used in this
# block (presumably a cache re-touch - confirm)
fc924142 | 1574 | &movq ("mm2",&QWP(0,$key)); &movq ("mm6",&QWP(8,$key)); |
22c268e6 AP |
1575 | &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8<<8 |
1576 | &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp8>>8 | |
fc924142 | 1577 | &mov ($s0,&DWP(0-128,$tbl)); |
22c268e6 | 1578 | &pslld ("mm1",16); &pslld ("mm5",16); |
fc924142 | 1579 | &mov ($s1,&DWP(64-128,$tbl)); |
22c268e6 | 1580 | &psrld ("mm3",16); &psrld ("mm7",16); |
fc924142 | 1581 | &mov ($s2,&DWP(128-128,$tbl)); |
22c268e6 | 1582 | &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8<<24 |
fc924142 | 1583 | &mov ($s3,&DWP(192-128,$tbl)); |
22c268e6 AP |
1584 | &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp8>>24 |
1585 | ||
1586 | &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); | |
1587 | &jmp (&label("loop")); | |
1588 | ||
1589 | &set_label("out",16); | |
1590 | &pxor ("mm0",&QWP(0,$key)); | |
1591 | &pxor ("mm4",&QWP(8,$key)); | |
1592 | ||
1593 | &ret (); | |
fc924142 | 1594 | &function_end_B("_sse_AES_decrypt_compact"); |
6c69aa53 | 1595 | } |
22c268e6 AP |
1596 | |
1597 | ###################################################################### | |
1598 | # Vanilla block function. | |
1599 | ###################################################################### | |
1600 | ||
# decstep($i,$td,@s) - emit one output column of a full-table decryption
# round.  $i is the column (0..3), $td the register addressing the table
# of 8-byte entries, @s the state registers in the caller's rotated
# order.  The column is the xor of four dword loads from $td+8*byte, at
# byte offsets 0, 3, 2 and 1 inside the entry.  Columns 0 and 1 go to
# 4+4*$i(%esp), column 2 stays in $acc, column 3 is built in $s[0].
sub decstep()
{ my ($col,$table,@st) = @_;
  my $last    = ($col==3);
  my $scratch = $key;
  my $word    = $last ? $st[0] : $acc;

	# no instructions are reordered, as performance appears
	# optimal... or rather that all attempts to reorder didn't
	# result in better performance [which by the way is not a
	# bit lower than encryption].

	# low byte of $st[0]; the last column works in place and uses
	# this slot to reload $key instead
	if ($last)	{ &mov	($key,$__key);   }
	else		{ &mov	($word,$st[0]); }
	&and	($word,0xFF);
	&mov	($word,&DWP(0,$table,$word,8));

	# &HB() byte of $st[1]; for the last column the source register
	# doubles as the index register
	$scratch = $st[1] if ($last);
	&movz	($scratch,&HB($st[1]));
	&xor	($word,&DWP(3,$table,$scratch,8));

	# bits 16..23 of $st[2]; the last column also moves column 2
	# out of $acc into $st[1]
	if ($last)	{ $scratch = $st[2]; &mov ($st[1],$acc); }
	else		{ &mov	($scratch,$st[2]); }
	&shr	($scratch,16);
	&and	($scratch,0xFF);
	&xor	($word,&DWP(2,$table,$scratch,8));

	# top byte of $st[3]; the last column reloads $st[2] from $__s1
	if ($last)	{ $scratch = $st[3]; &mov ($st[2],$__s1); }
	else		{ &mov	($scratch,$st[3]); }
	&shr	($scratch,24);
	&xor	($word,&DWP(1,$table,$scratch,8));

	&mov	(&DWP(4+4*$col,"esp"),$word)	if ($col<2);
	&mov	($st[3],$__s0)			if ($last);
	&comment();
}
1633 | ||
# declast($i,$td,@s) - emit one output column of the final decryption
# round, which uses byte-wide lookups instead of the 8-byte entries
# read by decstep.
#
#   $i   column being produced, 0..3
#   $td  table register; the $i==0 call advances it by 2048 bytes and
#        the $i==3 call moves it back, so the four calls for a round
#        must be issued in order 0,1,2,3
#   @s   the four state registers in the caller's rotated order
#
# The $i==0 call also reads eight dwords, 32 bytes apart, from the
# 256 bytes at the new $td and discards them (presumably to pull the
# byte table - Td4 in the data section - into cache; confirm).
# Results are placed exactly as decstep does: columns 0 and 1 at
# 4+4*$i(%esp), column 2 in $acc, column 3 in $s[0].
sub declast()
{ my ($i,$td,@s)=@_;
  my $tmp = $key;
  my $out = $i==3?$s[0]:$acc;

	# +2048+128 now, -128 after the loads: net effect $td += 2048
	if($i==0)   {	&lea	($td,&DWP(2048+128,$td));
			&mov	($tmp,&DWP(0-128,$td));
			&mov	($acc,&DWP(32-128,$td));
			&mov	($tmp,&DWP(64-128,$td));
			&mov	($acc,&DWP(96-128,$td));
			&mov	($tmp,&DWP(128-128,$td));
			&mov	($acc,&DWP(160-128,$td));
			&mov	($tmp,&DWP(192-128,$td));
			&mov	($acc,&DWP(224-128,$td));
			&lea	($td,&DWP(-128,$td));		}
	if($i==3)   {	&mov	($key,$__key);			}
	else        {	&mov	($out,$s[0]);			}
			&and	($out,0xFF);
			&movz	($out,&BP(0,$td,$out,1));

	# for $i==3 the source register doubles as scratch from here on
	if ($i==3)  {	$tmp=$s[1];				}
			&movz	($tmp,&HB($s[1]));
			&movz	($tmp,&BP(0,$td,$tmp,1));
			&shl	($tmp,8);
			&xor	($out,$tmp);

	if ($i==3)  {	$tmp=$s[2]; &mov ($s[1],$acc);		}
	else        {	&mov	($tmp,$s[2]);			}
			&shr	($tmp,16);
			&and	($tmp,0xFF);
			&movz	($tmp,&BP(0,$td,$tmp,1));
			&shl	($tmp,16);
			&xor	($out,$tmp);

	if ($i==3)  {	$tmp=$s[3]; &mov ($s[2],$__s1);		}
	else        {	&mov	($tmp,$s[3]);			}
			&shr	($tmp,24);
			&movz	($tmp,&BP(0,$td,$tmp,1));
			&shl	($tmp,24);
			&xor	($out,$tmp);
	if ($i<2)   {	&mov	(&DWP(4+4*$i,"esp"),$out);	}
	if ($i==3)  {	&mov	($s[3],$__s0);
			&lea	($td,&DWP(-2048,$td));		}
}
1678 | ||
22c268e6 AP |
1679 | &function_begin_B("_x86_AES_decrypt"); |
1680 | # note that caller is expected to allocate stack frame for me! | |
6c69aa53 | 1681 | &mov ($__key,$key); # save key |
22c268e6 AP |
1682 | |
1683 | &xor ($s0,&DWP(0,$key)); # xor with key | |
1684 | &xor ($s1,&DWP(4,$key)); | |
1685 | &xor ($s2,&DWP(8,$key)); | |
1686 | &xor ($s3,&DWP(12,$key)); | |
1687 | ||
e7e11507 | 1688 | &mov ($acc,&DWP(240,$key)); # load key->rounds |
71314710 AP |
1689 | |
1690 | if ($small_footprint) { | |
e7e11507 AP |
1691 | &lea ($acc,&DWP(-2,$acc,$acc)); |
1692 | &lea ($acc,&DWP(0,$key,$acc,8)); | |
6c69aa53 | 1693 | &mov ($__end,$acc); # end of key schedule |
22c268e6 AP |
1694 | &set_label("loop",16); |
1695 | &decstep(0,$tbl,$s0,$s3,$s2,$s1); | |
1696 | &decstep(1,$tbl,$s1,$s0,$s3,$s2); | |
1697 | &decstep(2,$tbl,$s2,$s1,$s0,$s3); | |
1698 | &decstep(3,$tbl,$s3,$s2,$s1,$s0); | |
04d0d0ac | 1699 | &add ($key,16); # advance rd_key |
e7e11507 AP |
1700 | &xor ($s0,&DWP(0,$key)); |
1701 | &xor ($s1,&DWP(4,$key)); | |
1702 | &xor ($s2,&DWP(8,$key)); | |
1703 | &xor ($s3,&DWP(12,$key)); | |
6c69aa53 AP |
1704 | &cmp ($key,$__end); |
1705 | &mov ($__key,$key); | |
71314710 AP |
1706 | &jb (&label("loop")); |
1707 | } | |
1708 | else { | |
e7e11507 | 1709 | &cmp ($acc,10); |
71314710 | 1710 | &jle (&label("10rounds")); |
e7e11507 | 1711 | &cmp ($acc,12); |
71314710 AP |
1712 | &jle (&label("12rounds")); |
1713 | ||
22c268e6 AP |
1714 | &set_label("14rounds",4); |
1715 | for ($i=1;$i<3;$i++) { | |
1716 | &decstep(0,$tbl,$s0,$s3,$s2,$s1); | |
1717 | &decstep(1,$tbl,$s1,$s0,$s3,$s2); | |
1718 | &decstep(2,$tbl,$s2,$s1,$s0,$s3); | |
1719 | &decstep(3,$tbl,$s3,$s2,$s1,$s0); | |
e7e11507 AP |
1720 | &xor ($s0,&DWP(16*$i+0,$key)); |
1721 | &xor ($s1,&DWP(16*$i+4,$key)); | |
1722 | &xor ($s2,&DWP(16*$i+8,$key)); | |
1723 | &xor ($s3,&DWP(16*$i+12,$key)); | |
71314710 | 1724 | } |
e7e11507 | 1725 | &add ($key,32); |
6c69aa53 | 1726 | &mov ($__key,$key); # advance rd_key |
22c268e6 AP |
1727 | &set_label("12rounds",4); |
1728 | for ($i=1;$i<3;$i++) { | |
1729 | &decstep(0,$tbl,$s0,$s3,$s2,$s1); | |
1730 | &decstep(1,$tbl,$s1,$s0,$s3,$s2); | |
1731 | &decstep(2,$tbl,$s2,$s1,$s0,$s3); | |
1732 | &decstep(3,$tbl,$s3,$s2,$s1,$s0); | |
e7e11507 AP |
1733 | &xor ($s0,&DWP(16*$i+0,$key)); |
1734 | &xor ($s1,&DWP(16*$i+4,$key)); | |
1735 | &xor ($s2,&DWP(16*$i+8,$key)); | |
1736 | &xor ($s3,&DWP(16*$i+12,$key)); | |
71314710 | 1737 | } |
e7e11507 | 1738 | &add ($key,32); |
6c69aa53 | 1739 | &mov ($__key,$key); # advance rd_key |
22c268e6 AP |
1740 | &set_label("10rounds",4); |
1741 | for ($i=1;$i<10;$i++) { | |
1742 | &decstep(0,$tbl,$s0,$s3,$s2,$s1); | |
1743 | &decstep(1,$tbl,$s1,$s0,$s3,$s2); | |
1744 | &decstep(2,$tbl,$s2,$s1,$s0,$s3); | |
1745 | &decstep(3,$tbl,$s3,$s2,$s1,$s0); | |
e7e11507 AP |
1746 | &xor ($s0,&DWP(16*$i+0,$key)); |
1747 | &xor ($s1,&DWP(16*$i+4,$key)); | |
1748 | &xor ($s2,&DWP(16*$i+8,$key)); | |
1749 | &xor ($s3,&DWP(16*$i+12,$key)); | |
71314710 AP |
1750 | } |
1751 | } | |
1752 | ||
22c268e6 AP |
1753 | &declast(0,$tbl,$s0,$s3,$s2,$s1); |
1754 | &declast(1,$tbl,$s1,$s0,$s3,$s2); | |
1755 | &declast(2,$tbl,$s2,$s1,$s0,$s3); | |
1756 | &declast(3,$tbl,$s3,$s2,$s1,$s0); | |
71314710 | 1757 | |
e7e11507 AP |
1758 | &add ($key,$small_footprint?16:160); |
1759 | &xor ($s0,&DWP(0,$key)); | |
1760 | &xor ($s1,&DWP(4,$key)); | |
1761 | &xor ($s2,&DWP(8,$key)); | |
1762 | &xor ($s3,&DWP(12,$key)); | |
1763 | ||
71314710 AP |
1764 | &ret (); |
1765 | ||
fc924142 | 1766 | &set_label("AES_Td",64); # Yes! I keep it in the code segment! |
c8d5c71a AP |
1767 | &_data_word(0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a); |
1768 | &_data_word(0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b); | |
1769 | &_data_word(0x55fa3020, 0xf66d76ad, 0x9176cc88, 0x254c02f5); | |
1770 | &_data_word(0xfcd7e54f, 0xd7cb2ac5, 0x80443526, 0x8fa362b5); | |
1771 | &_data_word(0x495ab1de, 0x671bba25, 0x980eea45, 0xe1c0fe5d); | |
1772 | &_data_word(0x02752fc3, 0x12f04c81, 0xa397468d, 0xc6f9d36b); | |
1773 | &_data_word(0xe75f8f03, 0x959c9215, 0xeb7a6dbf, 0xda595295); | |
1774 | &_data_word(0x2d83bed4, 0xd3217458, 0x2969e049, 0x44c8c98e); | |
1775 | &_data_word(0x6a89c275, 0x78798ef4, 0x6b3e5899, 0xdd71b927); | |
1776 | &_data_word(0xb64fe1be, 0x17ad88f0, 0x66ac20c9, 0xb43ace7d); | |
1777 | &_data_word(0x184adf63, 0x82311ae5, 0x60335197, 0x457f5362); | |
1778 | &_data_word(0xe07764b1, 0x84ae6bbb, 0x1ca081fe, 0x942b08f9); | |
1779 | &_data_word(0x58684870, 0x19fd458f, 0x876cde94, 0xb7f87b52); | |
1780 | &_data_word(0x23d373ab, 0xe2024b72, 0x578f1fe3, 0x2aab5566); | |
1781 | &_data_word(0x0728ebb2, 0x03c2b52f, 0x9a7bc586, 0xa50837d3); | |
1782 | &_data_word(0xf2872830, 0xb2a5bf23, 0xba6a0302, 0x5c8216ed); | |
1783 | &_data_word(0x2b1ccf8a, 0x92b479a7, 0xf0f207f3, 0xa1e2694e); | |
1784 | &_data_word(0xcdf4da65, 0xd5be0506, 0x1f6234d1, 0x8afea6c4); | |
1785 | &_data_word(0x9d532e34, 0xa055f3a2, 0x32e18a05, 0x75ebf6a4); | |
1786 | &_data_word(0x39ec830b, 0xaaef6040, 0x069f715e, 0x51106ebd); | |
1787 | &_data_word(0xf98a213e, 0x3d06dd96, 0xae053edd, 0x46bde64d); | |
1788 | &_data_word(0xb58d5491, 0x055dc471, 0x6fd40604, 0xff155060); | |
1789 | &_data_word(0x24fb9819, 0x97e9bdd6, 0xcc434089, 0x779ed967); | |
1790 | &_data_word(0xbd42e8b0, 0x888b8907, 0x385b19e7, 0xdbeec879); | |
1791 | &_data_word(0x470a7ca1, 0xe90f427c, 0xc91e84f8, 0x00000000); | |
1792 | &_data_word(0x83868009, 0x48ed2b32, 0xac70111e, 0x4e725a6c); | |
1793 | &_data_word(0xfbff0efd, 0x5638850f, 0x1ed5ae3d, 0x27392d36); | |
1794 | &_data_word(0x64d90f0a, 0x21a65c68, 0xd1545b9b, 0x3a2e3624); | |
1795 | &_data_word(0xb1670a0c, 0x0fe75793, 0xd296eeb4, 0x9e919b1b); | |
1796 | &_data_word(0x4fc5c080, 0xa220dc61, 0x694b775a, 0x161a121c); | |
1797 | &_data_word(0x0aba93e2, 0xe52aa0c0, 0x43e0223c, 0x1d171b12); | |
1798 | &_data_word(0x0b0d090e, 0xadc78bf2, 0xb9a8b62d, 0xc8a91e14); | |
1799 | &_data_word(0x8519f157, 0x4c0775af, 0xbbdd99ee, 0xfd607fa3); | |
1800 | &_data_word(0x9f2601f7, 0xbcf5725c, 0xc53b6644, 0x347efb5b); | |
1801 | &_data_word(0x7629438b, 0xdcc623cb, 0x68fcedb6, 0x63f1e4b8); | |
1802 | &_data_word(0xcadc31d7, 0x10856342, 0x40229713, 0x2011c684); | |
1803 | &_data_word(0x7d244a85, 0xf83dbbd2, 0x1132f9ae, 0x6da129c7); | |
1804 | &_data_word(0x4b2f9e1d, 0xf330b2dc, 0xec52860d, 0xd0e3c177); | |
1805 | &_data_word(0x6c16b32b, 0x99b970a9, 0xfa489411, 0x2264e947); | |
1806 | &_data_word(0xc48cfca8, 0x1a3ff0a0, 0xd82c7d56, 0xef903322); | |
1807 | &_data_word(0xc74e4987, 0xc1d138d9, 0xfea2ca8c, 0x360bd498); | |
1808 | &_data_word(0xcf81f5a6, 0x28de7aa5, 0x268eb7da, 0xa4bfad3f); | |
1809 | &_data_word(0xe49d3a2c, 0x0d927850, 0x9bcc5f6a, 0x62467e54); | |
1810 | &_data_word(0xc2138df6, 0xe8b8d890, 0x5ef7392e, 0xf5afc382); | |
1811 | &_data_word(0xbe805d9f, 0x7c93d069, 0xa92dd56f, 0xb31225cf); | |
1812 | &_data_word(0x3b99acc8, 0xa77d1810, 0x6e639ce8, 0x7bbb3bdb); | |
1813 | &_data_word(0x097826cd, 0xf418596e, 0x01b79aec, 0xa89a4f83); | |
1814 | &_data_word(0x656e95e6, 0x7ee6ffaa, 0x08cfbc21, 0xe6e815ef); | |
1815 | &_data_word(0xd99be7ba, 0xce366f4a, 0xd4099fea, 0xd67cb029); | |
1816 | &_data_word(0xafb2a431, 0x31233f2a, 0x3094a5c6, 0xc066a235); | |
1817 | &_data_word(0x37bc4e74, 0xa6ca82fc, 0xb0d090e0, 0x15d8a733); | |
1818 | &_data_word(0x4a9804f1, 0xf7daec41, 0x0e50cd7f, 0x2ff69117); | |
1819 | &_data_word(0x8dd64d76, 0x4db0ef43, 0x544daacc, 0xdf0496e4); | |
1820 | &_data_word(0xe3b5d19e, 0x1b886a4c, 0xb81f2cc1, 0x7f516546); | |
1821 | &_data_word(0x04ea5e9d, 0x5d358c01, 0x737487fa, 0x2e410bfb); | |
1822 | &_data_word(0x5a1d67b3, 0x52d2db92, 0x335610e9, 0x1347d66d); | |
1823 | &_data_word(0x8c61d79a, 0x7a0ca137, 0x8e14f859, 0x893c13eb); | |
1824 | &_data_word(0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a); | |
1825 | &_data_word(0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773); | |
1826 | &_data_word(0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478); | |
1827 | &_data_word(0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2); | |
1828 | &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff); | |
1829 | &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664); | |
1830 | &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0); | |
22c268e6 AP |
1831 | |
1832 | #Td4: # four copies of Td4 to choose from to avoid L1 aliasing | |
1833 | &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); | |
1834 | &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); | |
1835 | &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); | |
1836 | &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb); | |
1837 | &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d); | |
1838 | &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e); | |
1839 | &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2); | |
1840 | &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25); | |
1841 | &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16); | |
1842 | &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92); | |
1843 | &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda); | |
1844 | &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84); | |
1845 | &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a); | |
1846 | &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06); | |
1847 | &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02); | |
1848 | &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b); | |
1849 | &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea); | |
1850 | &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73); | |
1851 | &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85); | |
1852 | &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e); | |
1853 | &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89); | |
1854 | &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b); | |
1855 | &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20); | |
1856 | &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4); | |
1857 | &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31); | |
1858 | &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f); | |
1859 | &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d); | |
1860 | &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef); | |
1861 | &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0); | |
1862 | &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61); | |
1863 | &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26); | |
1864 | &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); | |
1865 | ||
1866 | &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); | |
1867 | &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); | |
1868 | &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); | |
1869 | &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb); | |
1870 | &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d); | |
1871 | &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e); | |
1872 | &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2); | |
1873 | &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25); | |
1874 | &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16); | |
1875 | &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92); | |
1876 | &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda); | |
1877 | &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84); | |
1878 | &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a); | |
1879 | &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06); | |
1880 | &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02); | |
1881 | &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b); | |
1882 | &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea); | |
1883 | &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73); | |
1884 | &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85); | |
1885 | &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e); | |
1886 | &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89); | |
1887 | &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b); | |
1888 | &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20); | |
1889 | &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4); | |
1890 | &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31); | |
1891 | &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f); | |
1892 | &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d); | |
1893 | &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef); | |
1894 | &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0); | |
1895 | &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61); | |
1896 | &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26); | |
1897 | &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); | |
1898 | ||
1899 | &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); | |
1900 | &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); | |
1901 | &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); | |
1902 | &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb); | |
1903 | &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d); | |
1904 | &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e); | |
1905 | &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2); | |
1906 | &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25); | |
1907 | &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16); | |
1908 | &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92); | |
1909 | &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda); | |
1910 | &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84); | |
1911 | &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a); | |
1912 | &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06); | |
1913 | &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02); | |
1914 | &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b); | |
1915 | &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea); | |
1916 | &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73); | |
1917 | &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85); | |
1918 | &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e); | |
1919 | &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89); | |
1920 | &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b); | |
1921 | &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20); | |
1922 | &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4); | |
1923 | &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31); | |
1924 | &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f); | |
1925 | &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d); | |
1926 | &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef); | |
1927 | &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0); | |
1928 | &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61); | |
1929 | &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26); | |
1930 | &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); | |
1931 | ||
9598fa87 AP |
1932 | &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); |
1933 | &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); | |
1934 | &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); | |
1935 | &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb); | |
1936 | &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d); | |
1937 | &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e); | |
1938 | &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2); | |
1939 | &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25); | |
1940 | &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16); | |
1941 | &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92); | |
1942 | &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda); | |
1943 | &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84); | |
1944 | &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a); | |
1945 | &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06); | |
1946 | &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02); | |
1947 | &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b); | |
1948 | &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea); | |
1949 | &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73); | |
1950 | &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85); | |
1951 | &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e); | |
1952 | &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89); | |
1953 | &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b); | |
1954 | &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20); | |
1955 | &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4); | |
1956 | &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31); | |
1957 | &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f); | |
1958 | &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d); | |
1959 | &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef); | |
1960 | &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0); | |
1961 | &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61); | |
1962 | &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26); | |
1963 | &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); | |
addb6e16 AP |
1964 | &function_end_B("_x86_AES_decrypt"); |
1965 | ||
1966 | # void AES_decrypt (const void *inp,void *out,const AES_KEY *key); -- one 16-byte block: load from inp, call the compact decrypt routine (SSE or plain x86), store to out | |
addb6e16 AP |
1967 | &function_begin("AES_decrypt"); |
1968 | &mov ($acc,&wparam(0)); # load inp | |
1969 | &mov ($key,&wparam(2)); # load key | |
1970 | |
04d0d0ac | 1971 | &mov ($s0,"esp"); |
af8c1d81 | 1972 | &sub ("esp",36); |
22c268e6 AP |
1973 | &and ("esp",-64); # round esp down to a 64-byte boundary (cache-line) | |
1974 | |
1975 | # place stack frame just "above" the key schedule | |
1976 | &lea ($s1,&DWP(-64-63,$key)); | |
1977 | &sub ($s1,"esp"); | |
1978 | &neg ($s1); | |
1979 | &and ($s1,0x3C0); # mask keeps bits 6..9: modulo 1024, but aligned to cache-line | |
1980 | &sub ("esp",$s1); | |
1981 | &add ("esp",4); # 4 is reserved for caller's return address | |
6c69aa53 | 1982 | &mov ($_esp,$s0); # save stack pointer (the esp value copied into $s0 before the frame was carved out) |
04d0d0ac | 1983 |
addb6e16 AP |
1984 | &call (&label("pic_point")); # make it PIC! |
1985 | &set_label("pic_point"); | |
22c268e6 | 1986 | &blindpop($tbl); |
6c69aa53 | 1987 | &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if(!$x86only); |
22c268e6 | 1988 | &lea ($tbl,&DWP(&label("AES_Td")."-".&label("pic_point"),$tbl)); |
6c69aa53 | 1989 |
22c268e6 | 1990 | # pick Td4 copy which can't "overlap" with stack frame or key schedule |
fc924142 AP |
1991 | &lea ($s1,&DWP(768-4,"esp")); |
1992 | &sub ($s1,$tbl); | |
22c268e6 AP |
1993 | &and ($s1,0x300); |
1994 | &lea ($tbl,&DWP(2048+128,$tbl,$s1)); | |
1995 | |
6c69aa53 AP |
1996 | if (!$x86only) { |
1997 | &bt (&DWP(0,$s0),25); # check for SSE bit (bit 25); if clear, branch to the plain x86 path | |
fc924142 AP |
1998 | &jnc (&label("x86")); |
1999 | |
2000 | &movq ("mm0",&QWP(0,$acc)); | |
2001 | &movq ("mm4",&QWP(8,$acc)); | |
2002 | &call ("_sse_AES_decrypt_compact"); | |
6c69aa53 | 2003 | &mov ("esp",$_esp); # restore stack pointer |
fc924142 AP |
2004 | &mov ($acc,&wparam(1)); # load out |
2005 | &movq (&QWP(0,$acc),"mm0"); # write output data (two 8-byte halves) | |
2006 | &movq (&QWP(8,$acc),"mm4"); | |
2007 | &emms (); | |
2008 | &function_end_A(); | |
6c69aa53 | 2009 | } |
fc924142 | 2010 | &set_label("x86",16); |
6c69aa53 | 2011 | &mov ($_tbl,$tbl); |
addb6e16 AP |
2012 | &mov ($s0,&DWP(0,$acc)); # load input data (four dwords) |
2013 | &mov ($s1,&DWP(4,$acc)); | |
2014 | &mov ($s2,&DWP(8,$acc)); | |
2015 | &mov ($s3,&DWP(12,$acc)); | |
22c268e6 | 2016 | &call ("_x86_AES_decrypt_compact"); |
6c69aa53 | 2017 | &mov ("esp",$_esp); # restore stack pointer |
addb6e16 AP |
2018 | &mov ($acc,&wparam(1)); # load out |
2019 | &mov (&DWP(0,$acc),$s0); # write output data (four dwords) | |
2020 | &mov (&DWP(4,$acc),$s1); | |
2021 | &mov (&DWP(8,$acc),$s2); | |
2022 | &mov (&DWP(12,$acc),$s3); | |
2023 | &function_end("AES_decrypt"); | |
2024 | ||
2025 | # void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out, | |
2026 | # size_t length, const AES_KEY *key, | |
04d0d0ac AP |
2027 | # unsigned char *ivp,const int enc); |
2028 | { | |
2029 | # stack frame layout (second offset column presumably shows the same slot as seen after a call has pushed a return address -- TODO confirm) | |
6c69aa53 | 2030 | # -4(%esp) # return address 0(%esp) |
609b0852 | 2031 | # 0(%esp) # s0 backing store 4(%esp) |
6c69aa53 AP |
2032 | # 4(%esp) # s1 backing store 8(%esp) |
2033 | # 8(%esp) # s2 backing store 12(%esp) | |
2034 | # 12(%esp) # s3 backing store 16(%esp) | |
2035 | # 16(%esp) # key backup 20(%esp) | |
2036 | # 20(%esp) # end of key schedule 24(%esp) | |
2037 | # 24(%esp) # %ebp backup 28(%esp) | |
2038 | # 28(%esp) # %esp backup | |
2039 | my $_inp=&DWP(32,"esp"); # copy of wparam(0) | |
2040 | my $_out=&DWP(36,"esp"); # copy of wparam(1) | |
2041 | my $_len=&DWP(40,"esp"); # copy of wparam(2) | |
2042 | my $_key=&DWP(44,"esp"); # copy of wparam(3) | |
2043 | my $_ivp=&DWP(48,"esp"); # copy of wparam(4) | |
2044 | my $_tmp=&DWP(52,"esp"); # volatile variable (next-iv pointer in fast decrypt, OPENSSL_ia32cap word in the slow paths) | |
2045 | # | |
2046 | my $ivec=&DWP(60,"esp"); # ivec[16] | |
2047 | my $aes_key=&DWP(76,"esp"); # copy of aes_key | |
2048 | my $mark=&DWP(76+240,"esp"); # copy of aes_key->rounds (dword at offset 240 of the on-stack key copy) | |
04d0d0ac | 2049 |
addb6e16 AP |
2050 | &function_begin("AES_cbc_encrypt"); |
2051 | &mov ($s2 eq "ecx"? $s2 : "",&wparam(2)); # load len | |
2052 | &cmp ($s2,0); | |
6c69aa53 | 2053 | &je (&label("drop_out")); |
addb6e16 | 2054 |
04d0d0ac | 2055 | &call (&label("pic_point")); # make it PIC! |
addb6e16 | 2056 | &set_label("pic_point"); |
22c268e6 | 2057 | &blindpop($tbl); |
6c69aa53 | 2058 | &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if(!$x86only); |
3d5fd312 | 2059 |
addb6e16 | 2060 | &cmp (&wparam(5),0); |
22c268e6 | 2061 | &lea ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl)); |
6c69aa53 AP |
2062 | &jne (&label("picked_te")); |
2063 | &lea ($tbl,&DWP(&label("AES_Td")."-".&label("AES_Te"),$tbl)); | |
2064 | &set_label("picked_te"); | |
addb6e16 | 2065 |
6c69aa53 AP |
2066 | # one can argue if this is required |
2067 | &pushf (); | |
2068 | &cld (); | |
04d0d0ac | 2069 |
6c69aa53 AP |
2070 | &cmp ($s2,$speed_limit); |
2071 | &jb (&label("slow_way")); | |
2072 | &test ($s2,15); | |
2073 | &jnz (&label("slow_way")); | |
2074 | if (!$x86only) { | |
2075 | &bt (&DWP(0,$s0),28); # check for hyper-threading bit (bit 28); if set, take the slow way | |
2076 | &jc (&label("slow_way")); | |
2077 | } | |
2078 | # pre-allocate aligned stack frame... | |
2079 | &lea ($acc,&DWP(-80-244,"esp")); | |
2080 | &and ($acc,-64); | |
2081 | |
2082 | # ... and make sure it doesn't alias with $tbl modulo 4096 | |
22c268e6 | 2083 | &mov ($s0,$tbl); |
6c69aa53 AP |
2084 | &lea ($s1,&DWP(2048+256,$tbl)); |
2085 | &mov ($s3,$acc); | |
3d5fd312 | 2086 | &and ($s0,0xfff); # s = %ebp&0xfff |
6c69aa53 | 2087 | &and ($s1,0xfff); # e = (%ebp+2048+256)&0xfff |
04d0d0ac AP |
2088 | &and ($s3,0xfff); # p = %esp&0xfff |
2089 | |
3d5fd312 | 2090 | &cmp ($s3,$s1); # if (p>=e) %esp -= (p-e); |
6c69aa53 | 2091 | &jb (&label("tbl_break_out")); |
3d5fd312 | 2092 | &sub ($s3,$s1); |
6c69aa53 AP |
2093 | &sub ($acc,$s3); |
2094 | &jmp (&label("tbl_ok")); | |
2095 | &set_label("tbl_break_out",4); # else %esp -= (p-s)&0xfff + framesz; (framesz is the 384 added below) | |
3d5fd312 AP |
2096 | &sub ($s3,$s0); |
2097 | &and ($s3,0xfff); | |
6c69aa53 AP |
2098 | &add ($s3,384); |
2099 | &sub ($acc,$s3); | |
2100 | &set_label("tbl_ok",4); | |
04d0d0ac | 2101 |
6c69aa53 AP |
2102 | &lea ($s3,&wparam(0)); # obtain pointer to parameter block |
2103 | &exch ("esp",$acc); # allocate stack frame | |
04d0d0ac | 2104 | &add ("esp",4); # reserve for return address! |
6c69aa53 AP |
2105 | &mov ($_tbl,$tbl); # save %ebp |
2106 | &mov ($_esp,$acc); # save %esp | |
2107 | |
2108 | &mov ($s0,&DWP(0,$s3)); # load inp | |
2109 | &mov ($s1,&DWP(4,$s3)); # load out | |
2110 | #&mov ($s2,&DWP(8,$s3)); # load len | |
2111 | &mov ($key,&DWP(12,$s3)); # load key | |
2112 | &mov ($acc,&DWP(16,$s3)); # load ivp | |
2113 | &mov ($s3,&DWP(20,$s3)); # load enc flag | |
04d0d0ac AP |
2114 | |
2115 | &mov ($_inp,$s0); # save copy of inp | |
2116 | &mov ($_out,$s1); # save copy of out | |
2117 | &mov ($_len,$s2); # save copy of len | |
6c69aa53 | 2118 | &mov ($_key,$key); # save copy of key |
04d0d0ac AP |
2119 | &mov ($_ivp,$acc); # save copy of ivp |
2120 | |
a2806233 | 2121 | &mov ($mark,0); # copy of aes_key->rounds = 0; |
a2806233 | 2122 | # do we copy key schedule to stack? |
6c69aa53 | 2123 | &mov ($s1 eq "ebx" ? $s1 : "",$key); |
a2806233 | 2124 | &mov ($s2 eq "ecx" ? $s2 : "",244/4); |
22c268e6 | 2125 | &sub ($s1,$tbl); |
6c69aa53 | 2126 | &mov ("esi",$key); |
a2806233 | 2127 | &and ($s1,0xfff); |
3d5fd312 | 2128 | &lea ("edi",$aes_key); |
6c69aa53 AP |
2129 | &cmp ($s1,2048+256); |
2130 | &jb (&label("do_copy")); | |
a2806233 | 2131 | &cmp ($s1,4096-244); |
6c69aa53 AP |
2132 | &jb (&label("skip_copy")); |
2133 | &set_label("do_copy",4); | |
a2806233 | 2134 | &mov ($_key,"edi"); |
4e28f132 | 2135 | &data_word(0xA5F3F689); # rep movsd: copy the key schedule (244/4 dwords, esi -> edi) |
6c69aa53 | 2136 | &set_label("skip_copy"); |
3d5fd312 | 2137 |
22c268e6 | 2138 | &mov ($key,16); |
6c69aa53 | 2139 | &set_label("prefetch_tbl",4); |
22c268e6 AP |
2140 | &mov ($s0,&DWP(0,$tbl)); |
2141 | &mov ($s1,&DWP(32,$tbl)); | |
2142 | &mov ($s2,&DWP(64,$tbl)); | |
6c69aa53 | 2143 | &mov ($acc,&DWP(96,$tbl)); |
22c268e6 | 2144 | &lea ($tbl,&DWP(128,$tbl)); |
6c69aa53 AP |
2145 | &sub ($key,1); |
2146 | &jnz (&label("prefetch_tbl")); | |
22c268e6 | 2147 | &sub ($tbl,2048); |
04d0d0ac | 2148 |
6c69aa53 | 2149 | &mov ($acc,$_inp); |
04d0d0ac | 2150 | &mov ($key,$_ivp); |
addb6e16 | 2151 |
6c69aa53 AP |
2152 | &cmp ($s3,0); |
2153 | &je (&label("fast_decrypt")); | |
2154 | |
2155 | #----------------------------- ENCRYPT -----------------------------# | |
addb6e16 AP |
2156 | &mov ($s0,&DWP(0,$key)); # load iv |
2157 | &mov ($s1,&DWP(4,$key)); | |
2158 | |
6c69aa53 | 2159 | &set_label("fast_enc_loop",16); |
addb6e16 AP |
2160 | &mov ($s2,&DWP(8,$key)); |
2161 | &mov ($s3,&DWP(12,$key)); | |
2162 | |
04d0d0ac | 2163 | &xor ($s0,&DWP(0,$acc)); # xor input data |
addb6e16 AP |
2164 | &xor ($s1,&DWP(4,$acc)); |
2165 | &xor ($s2,&DWP(8,$acc)); | |
2166 | &xor ($s3,&DWP(12,$acc)); | |
2167 | |
04d0d0ac | 2168 | &mov ($key,$_key); # load key |
addb6e16 AP |
2169 | &call ("_x86_AES_encrypt"); |
2170 | |
04d0d0ac AP |
2171 | &mov ($acc,$_inp); # load inp |
2172 | &mov ($key,$_out); # load out | |
addb6e16 | 2173 |
04d0d0ac | 2174 | &mov (&DWP(0,$key),$s0); # save output data |
addb6e16 AP |
2175 | &mov (&DWP(4,$key),$s1); |
2176 | &mov (&DWP(8,$key),$s2); | |
2177 | &mov (&DWP(12,$key),$s3); | |
2178 | |
6c69aa53 | 2179 | &lea ($acc,&DWP(16,$acc)); # advance inp |
04d0d0ac | 2180 | &mov ($s2,$_len); # load len |
04d0d0ac | 2181 | &mov ($_inp,$acc); # save inp |
6c69aa53 | 2182 | &lea ($s3,&DWP(16,$key)); # advance out |
04d0d0ac | 2183 | &mov ($_out,$s3); # save out |
6c69aa53 | 2184 | &sub ($s2,16); # decrease len |
04d0d0ac | 2185 | &mov ($_len,$s2); # save len |
6c69aa53 | 2186 | &jnz (&label("fast_enc_loop")); |
04d0d0ac | 2187 | &mov ($acc,$_ivp); # load ivp |
6c69aa53 | 2188 | &mov ($s2,&DWP(8,$key)); # restore last 2 dwords |
addb6e16 | 2189 | &mov ($s3,&DWP(12,$key)); |
04d0d0ac | 2190 | &mov (&DWP(0,$acc),$s0); # save ivec |
addb6e16 AP |
2191 | &mov (&DWP(4,$acc),$s1); |
2192 | &mov (&DWP(8,$acc),$s2); | |
2193 | &mov (&DWP(12,$acc),$s3); | |
3d5fd312 | 2194 |
a2806233 | 2195 | &cmp ($mark,0); # was the key schedule copied? |
3d5fd312 | 2196 | &mov ("edi",$_key); |
a2806233 | 2197 | &je (&label("skip_ezero")); |
3d5fd312 | 2198 | # zero copy of key schedule |
57ee0070 | 2199 | &mov ("ecx",240/4); |
3d5fd312 AP |
2200 | &xor ("eax","eax"); |
2201 | &align (4); | |
f9c5e5d9 AP |
2202 | &data_word(0xABF3F689); # rep stosd: store eax=0 over 240/4 dwords at edi |
2203 | &set_label("skip_ezero"); | |
6c69aa53 | 2204 | &mov ("esp",$_esp); |
3d5fd312 | 2205 | &popf (); |
6c69aa53 | 2206 | &set_label("drop_out"); |
addb6e16 | 2207 | &function_end_A(); |
3d5fd312 | 2208 | &pushf (); # kludge, never executed |
addb6e16 | 2209 |
addb6e16 | 2210 | #----------------------------- DECRYPT -----------------------------# |
6c69aa53 | 2211 | &set_label("fast_decrypt",16); |
04d0d0ac AP |
2212 | |
2213 | &cmp ($acc,$_out); | |
6c69aa53 | 2214 | &je (&label("fast_dec_in_place")); # in-place processing... |
addb6e16 | 2215 |
04d0d0ac | 2216 | &mov ($_tmp,$key); |
addb6e16 AP |
2217 | |
2218 | &align (4); | |
6c69aa53 | 2219 | &set_label("fast_dec_loop",16); |
04d0d0ac | 2220 | &mov ($s0,&DWP(0,$acc)); # read input |
addb6e16 AP |
2221 | &mov ($s1,&DWP(4,$acc)); |
2222 | &mov ($s2,&DWP(8,$acc)); | |
2223 | &mov ($s3,&DWP(12,$acc)); | |
2224 | |
04d0d0ac | 2225 | &mov ($key,$_key); # load key |
addb6e16 AP |
2226 | &call ("_x86_AES_decrypt"); |
2227 | |
04d0d0ac AP |
2228 | &mov ($key,$_tmp); # load ivp |
2229 | &mov ($acc,$_len); # load len (value is overwritten by the inp load below) | |
2230 | &xor ($s0,&DWP(0,$key)); # xor iv | |
addb6e16 AP |
2231 | &xor ($s1,&DWP(4,$key)); |
2232 | &xor ($s2,&DWP(8,$key)); | |
2233 | &xor ($s3,&DWP(12,$key)); | |
2234 | |
04d0d0ac | 2235 | &mov ($key,$_out); # load out |
6c69aa53 | 2236 | &mov ($acc,$_inp); # load inp |
addb6e16 | 2237 |
04d0d0ac | 2238 | &mov (&DWP(0,$key),$s0); # write output |
addb6e16 AP |
2239 | &mov (&DWP(4,$key),$s1); |
2240 | &mov (&DWP(8,$key),$s2); | |
2241 | &mov (&DWP(12,$key),$s3); | |
2242 | |
6c69aa53 | 2243 | &mov ($s2,$_len); # load len |
04d0d0ac | 2244 | &mov ($_tmp,$acc); # save ivp (the input block just consumed is the next iv) |
6c69aa53 | 2245 | &lea ($acc,&DWP(16,$acc)); # advance inp |
04d0d0ac | 2246 | &mov ($_inp,$acc); # save inp |
6c69aa53 | 2247 | &lea ($key,&DWP(16,$key)); # advance out |
04d0d0ac | 2248 | &mov ($_out,$key); # save out |
6c69aa53 AP |
2249 | &sub ($s2,16); # decrease len |
2250 | &mov ($_len,$s2); # save len | |
2251 | &jnz (&label("fast_dec_loop")); | |
04d0d0ac | 2252 | &mov ($key,$_tmp); # load temp ivp |
04d0d0ac | 2253 | &mov ($acc,$_ivp); # load user ivp |
addb6e16 AP |
2254 | &mov ($s0,&DWP(0,$key)); # load iv |
2255 | &mov ($s1,&DWP(4,$key)); | |
2256 | &mov ($s2,&DWP(8,$key)); | |
2257 | &mov ($s3,&DWP(12,$key)); | |
2258 | &mov (&DWP(0,$acc),$s0); # copy back to user | |
2259 | &mov (&DWP(4,$acc),$s1); | |
2260 | &mov (&DWP(8,$acc),$s2); | |
2261 | &mov (&DWP(12,$acc),$s3); | |
6c69aa53 | 2262 | &jmp (&label("fast_dec_out")); |
addb6e16 | 2263 |
6c69aa53 AP |
2264 | &set_label("fast_dec_in_place",16); |
2265 | &set_label("fast_dec_in_place_loop"); | |
04d0d0ac | 2266 | &mov ($s0,&DWP(0,$acc)); # read input |
addb6e16 AP |
2267 | &mov ($s1,&DWP(4,$acc)); |
2268 | &mov ($s2,&DWP(8,$acc)); | |
2269 | &mov ($s3,&DWP(12,$acc)); | |
2270 | |
6c69aa53 | 2271 | &lea ($key,$ivec); |
04d0d0ac | 2272 | &mov (&DWP(0,$key),$s0); # copy to temp |
addb6e16 AP |
2273 | &mov (&DWP(4,$key),$s1); |
2274 | &mov (&DWP(8,$key),$s2); | |
2275 | &mov (&DWP(12,$key),$s3); | |
2276 | |
04d0d0ac | 2277 | &mov ($key,$_key); # load key |
addb6e16 AP |
2278 | &call ("_x86_AES_decrypt"); |
2279 | |
04d0d0ac AP |
2280 | &mov ($key,$_ivp); # load ivp |
2281 | &mov ($acc,$_out); # load out | |
2282 | &xor ($s0,&DWP(0,$key)); # xor iv | |
addb6e16 AP |
2283 | &xor ($s1,&DWP(4,$key)); |
2284 | &xor ($s2,&DWP(8,$key)); | |
2285 | &xor ($s3,&DWP(12,$key)); | |
2286 | |
04d0d0ac | 2287 | &mov (&DWP(0,$acc),$s0); # write output |
addb6e16 AP |
2288 | &mov (&DWP(4,$acc),$s1); |
2289 | &mov (&DWP(8,$acc),$s2); | |
2290 | &mov (&DWP(12,$acc),$s3); | |
2291 | |
6c69aa53 | 2292 | &lea ($acc,&DWP(16,$acc)); # advance out |
04d0d0ac | 2293 | &mov ($_out,$acc); # save out |
addb6e16 | 2294 |
04d0d0ac AP |
2295 | &lea ($acc,$ivec); |
2296 | &mov ($s0,&DWP(0,$acc)); # read temp | |
addb6e16 AP |
2297 | &mov ($s1,&DWP(4,$acc)); |
2298 | &mov ($s2,&DWP(8,$acc)); | |
2299 | &mov ($s3,&DWP(12,$acc)); | |
2300 | |
04d0d0ac | 2301 | &mov (&DWP(0,$key),$s0); # copy iv |
addb6e16 AP |
2302 | &mov (&DWP(4,$key),$s1); |
2303 | &mov (&DWP(8,$key),$s2); | |
2304 | &mov (&DWP(12,$key),$s3); | |
2305 | |
04d0d0ac | 2306 | &mov ($acc,$_inp); # load inp |
6c69aa53 AP |
2307 | &mov ($s2,$_len); # load len |
2308 | &lea ($acc,&DWP(16,$acc)); # advance inp | |
2309 | &mov ($_inp,$acc); # save inp | |
2310 | &sub ($s2,16); # decrease len | |
2311 | &mov ($_len,$s2); # save len | |
2312 | &jnz (&label("fast_dec_in_place_loop")); | |
addb6e16 | 2313 |
6c69aa53 AP |
2314 | &set_label("fast_dec_out",4); |
2315 | &cmp ($mark,0); # was the key schedule copied? | |
2316 | &mov ("edi",$_key); | |
2317 | &je (&label("skip_dzero")); | |
2318 | # zero copy of key schedule | |
2319 | &mov ("ecx",240/4); | |
2320 | &xor ("eax","eax"); | |
2321 | &align (4); | |
f9c5e5d9 AP |
2322 | &data_word(0xABF3F689); # rep stosd: store eax=0 over 240/4 dwords at edi |
2323 | &set_label("skip_dzero"); | |
6c69aa53 AP |
2324 | &mov ("esp",$_esp); |
2325 | &popf (); | |
2326 | &function_end_A(); | |
2327 | &pushf (); # kludge, never executed | |
2328 | |
2329 | #--------------------------- SLOW ROUTINE ---------------------------# | |
2330 | &set_label("slow_way",16); | |
2331 | |
2332 | &mov ($s0,&DWP(0,$s0)) if (!$x86only);# load OPENSSL_ia32cap | |
2333 | &mov ($key,&wparam(3)); # load key | |
2334 | |
2335 | # pre-allocate aligned stack frame... | |
2336 | &lea ($acc,&DWP(-80,"esp")); | |
2337 | &and ($acc,-64); | |
2338 | |
2339 | # ... and make sure it doesn't alias with $key modulo 1024 | |
2340 | &lea ($s1,&DWP(-80-63,$key)); | |
2341 | &sub ($s1,$acc); | |
2342 | &neg ($s1); | |
2343 | &and ($s1,0x3C0); # modulo 1024, but aligned to cache-line | |
2344 | &sub ($acc,$s1); | |
2345 | |
2346 | # pick S-box copy which can't overlap with stack frame or $key | |
2347 | &lea ($s1,&DWP(768,$acc)); | |
2348 | &sub ($s1,$tbl); | |
2349 | &and ($s1,0x300); | |
2350 | &lea ($tbl,&DWP(2048+128,$tbl,$s1)); | |
2351 | |
2352 | &lea ($s3,&wparam(0)); # pointer to parameter block | |
2353 | |
2354 | &exch ("esp",$acc); | |
2355 | &add ("esp",4); # reserve for return address! | |
2356 | &mov ($_tbl,$tbl); # save %ebp | |
2357 | &mov ($_esp,$acc); # save %esp | |
2358 | &mov ($_tmp,$s0); # save OPENSSL_ia32cap | |
2359 | |
2360 | &mov ($s0,&DWP(0,$s3)); # load inp | |
2361 | &mov ($s1,&DWP(4,$s3)); # load out | |
2362 | #&mov ($s2,&DWP(8,$s3)); # load len | |
2363 | #&mov ($key,&DWP(12,$s3)); # load key | |
2364 | &mov ($acc,&DWP(16,$s3)); # load ivp | |
2365 | &mov ($s3,&DWP(20,$s3)); # load enc flag | |
2366 | |
2367 | &mov ($_inp,$s0); # save copy of inp | |
2368 | &mov ($_out,$s1); # save copy of out | |
2369 | &mov ($_len,$s2); # save copy of len | |
2370 | &mov ($_key,$key); # save copy of key | |
2371 | &mov ($_ivp,$acc); # save copy of ivp | |
2372 | |
2373 | &mov ($key,$acc); | |
2374 | &mov ($acc,$s0); | |
2375 | |
2376 | &cmp ($s3,0); | |
2377 | &je (&label("slow_decrypt")); | |
2378 | |
2379 | #--------------------------- SLOW ENCRYPT ---------------------------# | |
2380 | &cmp ($s2,16); | |
3b0ee0d2 | 2381 | &mov ($s3,$s1); |
6c69aa53 AP |
2382 | &jb (&label("slow_enc_tail")); |
2383 | |
2384 | if (!$x86only) { | |
2385 | &bt ($_tmp,25); # check for SSE bit | |
2386 | &jnc (&label("slow_enc_x86")); | |
2387 | |
2388 | &movq ("mm0",&QWP(0,$key)); # load iv | |
2389 | &movq ("mm4",&QWP(8,$key)); | |
2390 | |
2391 | &set_label("slow_enc_loop_sse",16); | |
2392 | &pxor ("mm0",&QWP(0,$acc)); # xor input data | |
2393 | &pxor ("mm4",&QWP(8,$acc)); | |
2394 | |
2395 | &mov ($key,$_key); | |
2396 | &call ("_sse_AES_encrypt_compact"); | |
2397 | |
2398 | &mov ($acc,$_inp); # load inp | |
2399 | &mov ($key,$_out); # load out | |
2400 | &mov ($s2,$_len); # load len | |
2401 | |
2402 | &movq (&QWP(0,$key),"mm0"); # save output data | |
2403 | &movq (&QWP(8,$key),"mm4"); | |
2404 | |
2405 | &lea ($acc,&DWP(16,$acc)); # advance inp | |
04d0d0ac | 2406 | &mov ($_inp,$acc); # save inp |
6c69aa53 AP |
2407 | &lea ($s3,&DWP(16,$key)); # advance out |
2408 | &mov ($_out,$s3); # save out | |
2409 | &sub ($s2,16); # decrease len | |
2410 | &cmp ($s2,16); | |
2411 | &mov ($_len,$s2); # save len | |
2412 | &jae (&label("slow_enc_loop_sse")); | |
2413 | &test ($s2,15); | |
2414 | &jnz (&label("slow_enc_tail")); | |
2415 | &mov ($acc,$_ivp); # load ivp | |
2416 | &movq (&QWP(0,$acc),"mm0"); # save ivec | |
2417 | &movq (&QWP(8,$acc),"mm4"); | |
2418 | &emms (); | |
2419 | &mov ("esp",$_esp); | |
2420 | &popf (); | |
2421 | &function_end_A(); | |
2422 | &pushf (); # kludge, never executed | |
2423 | } | |
2424 | &set_label("slow_enc_x86",16); | |
2425 | &mov ($s0,&DWP(0,$key)); # load iv | |
2426 | &mov ($s1,&DWP(4,$key)); | |
2427 | |
2428 | &set_label("slow_enc_loop_x86",4); | |
2429 | &mov ($s2,&DWP(8,$key)); | |
2430 | &mov ($s3,&DWP(12,$key)); | |
2431 | |
2432 | &xor ($s0,&DWP(0,$acc)); # xor input data | |
2433 | &xor ($s1,&DWP(4,$acc)); | |
2434 | &xor ($s2,&DWP(8,$acc)); | |
2435 | &xor ($s3,&DWP(12,$acc)); | |
2436 | |
2437 | &mov ($key,$_key); # load key | |
2438 | &call ("_x86_AES_encrypt_compact"); | |
2439 | |
2440 | &mov ($acc,$_inp); # load inp | |
2441 | &mov ($key,$_out); # load out | |
2442 | |
2443 | &mov (&DWP(0,$key),$s0); # save output data | |
2444 | &mov (&DWP(4,$key),$s1); | |
2445 | &mov (&DWP(8,$key),$s2); | |
2446 | &mov (&DWP(12,$key),$s3); | |
addb6e16 | 2447 |
04d0d0ac | 2448 | &mov ($s2,$_len); # load len |
6c69aa53 AP |
2449 | &lea ($acc,&DWP(16,$acc)); # advance inp |
2450 | &mov ($_inp,$acc); # save inp | |
2451 | &lea ($s3,&DWP(16,$key)); # advance out | |
2452 | &mov ($_out,$s3); # save out | |
2453 | &sub ($s2,16); # decrease len | |
2454 | &cmp ($s2,16); | |
04d0d0ac | 2455 | &mov ($_len,$s2); # save len |
6c69aa53 AP |
2456 | &jae (&label("slow_enc_loop_x86")); |
2457 | &test ($s2,15); | |
2458 | &jnz (&label("slow_enc_tail")); | |
2459 | &mov ($acc,$_ivp); # load ivp | |
2460 | &mov ($s2,&DWP(8,$key)); # restore last dwords | |
2461 | &mov ($s3,&DWP(12,$key)); | |
2462 | &mov (&DWP(0,$acc),$s0); # save ivec | |
2463 | &mov (&DWP(4,$acc),$s1); | |
2464 | &mov (&DWP(8,$acc),$s2); | |
2465 | &mov (&DWP(12,$acc),$s3); | |
2466 | |
2467 | &mov ("esp",$_esp); | |
2468 | &popf (); | |
2469 | &function_end_A(); | |
2470 | &pushf (); # kludge, never executed | |
2471 | |
2472 | &set_label("slow_enc_tail",16); | |
3a8012cb | 2473 | &emms () if (!$x86only); |
6c69aa53 AP |
2474 | &mov ($key eq "edi"? $key:"",$s3); # load out to edi |
2475 | &mov ($s1,16); | |
2476 | &sub ($s1,$s2); | |
2477 | &cmp ($key,$acc eq "esi"? $acc:""); # compare with inp | |
2478 | &je (&label("enc_in_place")); | |
2479 | &align (4); | |
2480 | &data_word(0xA4F3F689); # rep movsb # copy input | |
2481 | &jmp (&label("enc_skip_in_place")); | |
2482 | &set_label("enc_in_place"); | |
addb6e16 | 2483 | &lea ($key,&DWP(0,$key,$s2)); |
6c69aa53 AP |
2484 | &set_label("enc_skip_in_place"); |
2485 | &mov ($s2,$s1); | |
2486 | &xor ($s0,$s0); | |
2487 | &align (4); | |
2488 | &data_word(0xAAF3F689); # rep stosb # zero tail | |
2489 | |
3b0ee0d2 | 2490 | &mov ($key,$_ivp); # restore ivp |
6c69aa53 AP |
2491 | &mov ($acc,$s3); # output as input |
2492 | &mov ($s0,&DWP(0,$key)); | |
2493 | &mov ($s1,&DWP(4,$key)); | |
2494 | &mov ($_len,16); # len=16 | |
2495 | &jmp (&label("slow_enc_loop_x86")); # one more spin... to encrypt the zero-padded tail block | |
2496 | |
2497 | #--------------------------- SLOW DECRYPT ---------------------------# | |
2498 | &set_label("slow_decrypt",16); | |
2499 | if (!$x86only) { | |
2500 | &bt ($_tmp,25); # check for SSE bit | |
2501 | &jnc (&label("slow_dec_loop_x86")); | |
2502 | |
2503 | &set_label("slow_dec_loop_sse",4); | |
2504 | &movq ("mm0",&QWP(0,$acc)); # read input | |
2505 | &movq ("mm4",&QWP(8,$acc)); | |
2506 | |
2507 | &mov ($key,$_key); | |
2508 | &call ("_sse_AES_decrypt_compact"); | |
2509 | |
2510 | &mov ($acc,$_inp); # load inp | |
2511 | &lea ($s0,$ivec); | |
2512 | &mov ($s1,$_out); # load out | |
2513 | &mov ($s2,$_len); # load len | |
2514 | &mov ($key,$_ivp); # load ivp | |
2515 | |
2516 | &movq ("mm1",&QWP(0,$acc)); # re-read input | |
2517 | &movq ("mm5",&QWP(8,$acc)); | |
2518 | |
2519 | &pxor ("mm0",&QWP(0,$key)); # xor iv | |
2520 | &pxor ("mm4",&QWP(8,$key)); | |
2521 | |
2522 | &movq (&QWP(0,$key),"mm1"); # copy input to iv | |
2523 | &movq (&QWP(8,$key),"mm5"); | |
2524 | |
2525 | &sub ($s2,16); # decrease len | |
2526 | &jc (&label("slow_dec_partial_sse")); | |
2527 | |
2528 | &movq (&QWP(0,$s1),"mm0"); # write output | |
2529 | &movq (&QWP(8,$s1),"mm4"); | |
2530 | |
2531 | &lea ($s1,&DWP(16,$s1)); # advance out | |
2532 | &mov ($_out,$s1); # save out | |
2533 | &lea ($acc,&DWP(16,$acc)); # advance inp | |
2534 | &mov ($_inp,$acc); # save inp | |
2535 | &mov ($_len,$s2); # save len | |
2536 | &jnz (&label("slow_dec_loop_sse")); | |
2537 | &emms (); | |
2538 | &mov ("esp",$_esp); | |
2539 | &popf (); | |
2540 | &function_end_A(); | |
2541 | &pushf (); # kludge, never executed | |
2542 | |
2543 | &set_label("slow_dec_partial_sse",16); | |
2544 | &movq (&QWP(0,$s0),"mm0"); # save output to temp | |
2545 | &movq (&QWP(8,$s0),"mm4"); | |
2546 | &emms (); | |
2547 | |
2548 | &add ($s2 eq "ecx" ? "ecx":"",16); | |
2549 | &mov ("edi",$s1); # out | |
2550 | &mov ("esi",$s0); # temp | |
2551 | &align (4); | |
2552 | &data_word(0xA4F3F689); # rep movsb # copy partial output | |
2553 | |
2554 | &mov ("esp",$_esp); | |
2555 | &popf (); | |
2556 | &function_end_A(); | |
2557 | &pushf (); # kludge, never executed | |
2558 | } | |
2559 | &set_label("slow_dec_loop_x86",16); | |
2560 | &mov ($s0,&DWP(0,$acc)); # read input | |
2561 | &mov ($s1,&DWP(4,$acc)); | |
2562 | &mov ($s2,&DWP(8,$acc)); | |
2563 | &mov ($s3,&DWP(12,$acc)); | |
2564 | |
2565 | &lea ($key,$ivec); | |
2566 | &mov (&DWP(0,$key),$s0); # copy to temp | |
2567 | &mov (&DWP(4,$key),$s1); | |
2568 | &mov (&DWP(8,$key),$s2); | |
2569 | &mov (&DWP(12,$key),$s3); | |
2570 | |
2571 | &mov ($key,$_key); # load key | |
2572 | &call ("_x86_AES_decrypt_compact"); | |
2573 | |
2574 | &mov ($key,$_ivp); # load ivp | |
2575 | &mov ($acc,$_len); # load len | |
2576 | &xor ($s0,&DWP(0,$key)); # xor iv | |
2577 | &xor ($s1,&DWP(4,$key)); | |
2578 | &xor ($s2,&DWP(8,$key)); | |
2579 | &xor ($s3,&DWP(12,$key)); | |
2580 | |
2581 | &sub ($acc,16); | |
2582 | &jc (&label("slow_dec_partial_x86")); | |
2583 | |
2584 | &mov ($_len,$acc); # save len | |
2585 | &mov ($acc,$_out); # load out | |
2586 | |
2587 | &mov (&DWP(0,$acc),$s0); # write output | |
2588 | &mov (&DWP(4,$acc),$s1); | |
2589 | &mov (&DWP(8,$acc),$s2); | |
2590 | &mov (&DWP(12,$acc),$s3); | |
2591 | |
2592 | &lea ($acc,&DWP(16,$acc)); # advance out | |
2593 | &mov ($_out,$acc); # save out | |
2594 | |
2595 | &lea ($acc,$ivec); | |
2596 | &mov ($s0,&DWP(0,$acc)); # read temp | |
2597 | &mov ($s1,&DWP(4,$acc)); | |
2598 | &mov ($s2,&DWP(8,$acc)); | |
2599 | &mov ($s3,&DWP(12,$acc)); | |
2600 | |
2601 | &mov (&DWP(0,$key),$s0); # copy it to iv | |
2602 | &mov (&DWP(4,$key),$s1); | |
2603 | &mov (&DWP(8,$key),$s2); | |
2604 | &mov (&DWP(12,$key),$s3); | |
2605 | |
2606 | &mov ($acc,$_inp); # load inp | |
2607 | &lea ($acc,&DWP(16,$acc)); # advance inp | |
2608 | &mov ($_inp,$acc); # save inp | |
6c69aa53 AP |
2609 | &jnz (&label("slow_dec_loop_x86")); |
2610 | &mov ("esp",$_esp); | |
2611 | &popf (); | |
2612 | &function_end_A(); | |
2613 | &pushf (); # kludge, never executed | |
2614 | |
2615 | &set_label("slow_dec_partial_x86",16); | |
2616 | &lea ($acc,$ivec); | |
2617 | &mov (&DWP(0,$acc),$s0); # save output to temp | |
2618 | &mov (&DWP(4,$acc),$s1); | |
2619 | &mov (&DWP(8,$acc),$s2); | |
2620 | &mov (&DWP(12,$acc),$s3); | |
2621 | |
2622 | &mov ($acc,$_inp); | |
2623 | &mov ($s0,&DWP(0,$acc)); # re-read input | |
2624 | &mov ($s1,&DWP(4,$acc)); | |
2625 | &mov ($s2,&DWP(8,$acc)); | |
2626 | &mov ($s3,&DWP(12,$acc)); | |
2627 | |
2628 | &mov (&DWP(0,$key),$s0); # copy it to iv | |
2629 | &mov (&DWP(4,$key),$s1); | |
2630 | &mov (&DWP(8,$key),$s2); | |
2631 | &mov (&DWP(12,$key),$s3); | |
2632 | |
2633 | &mov ("ecx",$_len); | |
2634 | &mov ("edi",$_out); | |
2635 | &lea ("esi",$ivec); | |
2636 | &align (4); | |
2637 | &data_word(0xA4F3F689); # rep movsb # copy partial output | |
2638 | |
2639 | &mov ("esp",$_esp); | |
2640 | &popf (); | |
addb6e16 | 2641 | &function_end("AES_cbc_encrypt"); |
04d0d0ac | 2642 | } |
addb6e16 AP |
2643 | |
2644 | #------------------------------------------------------------------# | |
71314710 | 2645 | |
e7e11507 AP |
2646 | sub enckey() |
2647 | { | |
2648 | &movz ("esi",&LB("edx")); # rk[i]>>0; each byte of edx is looked up in the byte table at $tbl-128, shifted into place and xored into eax | |
53154d71 | 2649 | &movz ("ebx",&BP(-128,$tbl,"esi",1)); |
e7e11507 | 2650 | &movz ("esi",&HB("edx")); # rk[i]>>8 |
53154d71 | 2651 | &shl ("ebx",24); |
e7e11507 AP |
2652 | &xor ("eax","ebx"); |
2653 | |
53154d71 | 2654 | &movz ("ebx",&BP(-128,$tbl,"esi",1)); |
e7e11507 | 2655 | &shr ("edx",16); |
e7e11507 AP |
2656 | &movz ("esi",&LB("edx")); # rk[i]>>16 |
2657 | &xor ("eax","ebx"); | |
2658 | |
53154d71 | 2659 | &movz ("ebx",&BP(-128,$tbl,"esi",1)); |
e7e11507 | 2660 | &movz ("esi",&HB("edx")); # rk[i]>>24 |
53154d71 | 2661 | &shl ("ebx",8); |
e7e11507 AP |
2662 | &xor ("eax","ebx"); |
2663 | |
53154d71 AP |
2664 | &movz ("ebx",&BP(-128,$tbl,"esi",1)); |
2665 | &shl ("ebx",16); | |
e7e11507 AP |
2666 | &xor ("eax","ebx"); |
2667 | |
fc924142 | 2668 | &xor ("eax",&DWP(1024-128,$tbl,"ecx",4)); # rcon: dword at $tbl+1024-128, indexed by ecx |
e7e11507 AP |
2669 | } |
2670 | ||
ad8bd4ec AP |
2671 | &function_begin("_x86_AES_set_encrypt_key"); |
# Internal key-expansion routine; AES_set_encrypt_key and
# AES_set_decrypt_key both reach it with a plain `call`.
# Arguments are fetched as wparam(1)=user key, wparam(2)=bits,
# wparam(3)=key schedule, i.e. one slot higher than their position in the
# C prototypes -- presumably to step over the wrapper's return address
# (TODO confirm against wparam() in the perlasm support code).
# Result in eax: 0 on success, -1 if either pointer is zero, -2 if bits is
# not 128, 192 or 256. On success the round count (10/12/14) is stored as
# a dword at byte offset 240 of the key schedule.
2672 | &mov ("esi",&wparam(1)); # user supplied key | |
2673 | &mov ("edi",&wparam(3)); # private key schedule | |
e7e11507 AP |
2674 | |
2675 | &test ("esi",-1); | |
2676 | &jz (&label("badpointer")); | |
2677 | &test ("edi",-1); | |
2678 | &jz (&label("badpointer")); | |
2679 | ||
# Position-independent table address: the call pushes the address of
# pic_point (blindpop presumably pops it into $tbl -- confirm), then
# $tbl = AES_Te + 2048 + 128. The +128 bias is cancelled by the -128
# displacements used below and in enckey.
2680 | &call (&label("pic_point")); | |
2681 | &set_label("pic_point"); | |
22c268e6 AP |
2682 | &blindpop($tbl); |
2683 | &lea ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl)); | |
53154d71 AP |
2684 | &lea ($tbl,&DWP(2048+128,$tbl)); |
2685 | ||
2686 | # prefetch Te4 | |
# The eight loads touch 256 bytes at a 32-byte stride; the loaded values
# are never used (all four registers are overwritten before being read).
2687 | &mov ("eax",&DWP(0-128,$tbl)); | |
2688 | &mov ("ebx",&DWP(32-128,$tbl)); | |
2689 | &mov ("ecx",&DWP(64-128,$tbl)); | |
2690 | &mov ("edx",&DWP(96-128,$tbl)); | |
2691 | &mov ("eax",&DWP(128-128,$tbl)); | |
2692 | &mov ("ebx",&DWP(160-128,$tbl)); | |
2693 | &mov ("ecx",&DWP(192-128,$tbl)); | |
2694 | &mov ("edx",&DWP(224-128,$tbl)); | |
e7e11507 | 2695 | |
ad8bd4ec | 2696 | &mov ("ecx",&wparam(2)); # number of bits in key |
e7e11507 AP |
2697 | &cmp ("ecx",128); |
2698 | &je (&label("10rounds")); | |
2699 | &cmp ("ecx",192); | |
2700 | &je (&label("12rounds")); | |
2701 | &cmp ("ecx",256); | |
2702 | &je (&label("14rounds")); | |
2703 | &mov ("eax",-2); # invalid number of bits | |
2704 | &jmp (&label("exit")); | |
2705 | ||
# 128-bit key: copy 4 dwords, then 10 passes (ecx = 0..9, also the rcon
# index) each producing 4 new words and advancing edi by 16 bytes.
# The first pass enters at 10shortcut because eax and edx still hold key
# words 0 and 3 from the copy.
2706 | &set_label("10rounds"); | |
2707 | &mov ("eax",&DWP(0,"esi")); # copy first 4 dwords | |
2708 | &mov ("ebx",&DWP(4,"esi")); | |
2709 | &mov ("ecx",&DWP(8,"esi")); | |
2710 | &mov ("edx",&DWP(12,"esi")); | |
2711 | &mov (&DWP(0,"edi"),"eax"); | |
2712 | &mov (&DWP(4,"edi"),"ebx"); | |
2713 | &mov (&DWP(8,"edi"),"ecx"); | |
2714 | &mov (&DWP(12,"edi"),"edx"); | |
2715 | ||
2716 | &xor ("ecx","ecx"); | |
2717 | &jmp (&label("10shortcut")); | |
2718 | ||
2719 | &align (4); | |
2720 | &set_label("10loop"); | |
2721 | &mov ("eax",&DWP(0,"edi")); # rk[0] | |
2722 | &mov ("edx",&DWP(12,"edi")); # rk[3] | |
2723 | &set_label("10shortcut"); | |
2724 | &enckey (); | |
2725 | ||
2726 | &mov (&DWP(16,"edi"),"eax"); # rk[4] | |
2727 | &xor ("eax",&DWP(4,"edi")); | |
2728 | &mov (&DWP(20,"edi"),"eax"); # rk[5] | |
2729 | &xor ("eax",&DWP(8,"edi")); | |
2730 | &mov (&DWP(24,"edi"),"eax"); # rk[6] | |
2731 | &xor ("eax",&DWP(12,"edi")); | |
2732 | &mov (&DWP(28,"edi"),"eax"); # rk[7] | |
2733 | &inc ("ecx"); | |
2734 | &add ("edi",16); | |
2735 | &cmp ("ecx",10); | |
2736 | &jl (&label("10loop")); | |
2737 | ||
# edi has advanced 10*16 = 160 bytes, so 80(edi) is offset 240 of the schedule.
2738 | &mov (&DWP(80,"edi"),10); # setup number of rounds | |
2739 | &xor ("eax","eax"); | |
2740 | &jmp (&label("exit")); | |
609b0852 | 2741 | |
# 192-bit key: copy 6 dwords. Each pass produces words 6..9; the loop
# leaves after the pass that starts with ecx == 7, otherwise ecx is bumped,
# words 10..11 are added and edi moves on by 24 bytes.
# The first pass enters at 12shortcut with eax/edx = key words 0 and 5.
e7e11507 AP |
2742 | &set_label("12rounds"); |
2743 | &mov ("eax",&DWP(0,"esi")); # copy first 6 dwords | |
2744 | &mov ("ebx",&DWP(4,"esi")); | |
2745 | &mov ("ecx",&DWP(8,"esi")); | |
2746 | &mov ("edx",&DWP(12,"esi")); | |
2747 | &mov (&DWP(0,"edi"),"eax"); | |
2748 | &mov (&DWP(4,"edi"),"ebx"); | |
2749 | &mov (&DWP(8,"edi"),"ecx"); | |
2750 | &mov (&DWP(12,"edi"),"edx"); | |
2751 | &mov ("ecx",&DWP(16,"esi")); | |
2752 | &mov ("edx",&DWP(20,"esi")); | |
2753 | &mov (&DWP(16,"edi"),"ecx"); | |
2754 | &mov (&DWP(20,"edi"),"edx"); | |
2755 | ||
2756 | &xor ("ecx","ecx"); | |
2757 | &jmp (&label("12shortcut")); | |
2758 | ||
2759 | &align (4); | |
2760 | &set_label("12loop"); | |
2761 | &mov ("eax",&DWP(0,"edi")); # rk[0] | |
2762 | &mov ("edx",&DWP(20,"edi")); # rk[5] | |
2763 | &set_label("12shortcut"); | |
2764 | &enckey (); | |
2765 | ||
2766 | &mov (&DWP(24,"edi"),"eax"); # rk[6] | |
2767 | &xor ("eax",&DWP(4,"edi")); | |
2768 | &mov (&DWP(28,"edi"),"eax"); # rk[7] | |
2769 | &xor ("eax",&DWP(8,"edi")); | |
2770 | &mov (&DWP(32,"edi"),"eax"); # rk[8] | |
2771 | &xor ("eax",&DWP(12,"edi")); | |
2772 | &mov (&DWP(36,"edi"),"eax"); # rk[9] | |
2773 | ||
2774 | &cmp ("ecx",7); | |
2775 | &je (&label("12break")); | |
2776 | &inc ("ecx"); | |
2777 | ||
2778 | &xor ("eax",&DWP(16,"edi")); | |
2779 | &mov (&DWP(40,"edi"),"eax"); # rk[10] | |
2780 | &xor ("eax",&DWP(20,"edi")); | |
2781 | &mov (&DWP(44,"edi"),"eax"); # rk[11] | |
2782 | ||
2783 | &add ("edi",24); | |
2784 | &jmp (&label("12loop")); | |
2785 | ||
# edi has advanced 7*24 = 168 bytes here, so 72(edi) is offset 240 of the schedule.
2786 | &set_label("12break"); | |
2787 | &mov (&DWP(72,"edi"),12); # setup number of rounds | |
2788 | &xor ("eax","eax"); | |
2789 | &jmp (&label("exit")); | |
2790 | ||
# 256-bit key: copy 8 dwords. Each pass produces words 8..11 via enckey and
# leaves after the pass that starts with ecx == 6; otherwise words 12..15
# follow and edi moves on by 32 bytes. The first pass enters at 14shortcut
# with edx still holding key word 7; eax is (re)loaded there on every pass.
2791 | &set_label("14rounds"); | |
2792 | &mov ("eax",&DWP(0,"esi")); # copy first 8 dwords | |
2793 | &mov ("ebx",&DWP(4,"esi")); | |
2794 | &mov ("ecx",&DWP(8,"esi")); | |
2795 | &mov ("edx",&DWP(12,"esi")); | |
2796 | &mov (&DWP(0,"edi"),"eax"); | |
2797 | &mov (&DWP(4,"edi"),"ebx"); | |
2798 | &mov (&DWP(8,"edi"),"ecx"); | |
2799 | &mov (&DWP(12,"edi"),"edx"); | |
2800 | &mov ("eax",&DWP(16,"esi")); | |
2801 | &mov ("ebx",&DWP(20,"esi")); | |
2802 | &mov ("ecx",&DWP(24,"esi")); | |
2803 | &mov ("edx",&DWP(28,"esi")); | |
2804 | &mov (&DWP(16,"edi"),"eax"); | |
2805 | &mov (&DWP(20,"edi"),"ebx"); | |
2806 | &mov (&DWP(24,"edi"),"ecx"); | |
2807 | &mov (&DWP(28,"edi"),"edx"); | |
2808 | ||
2809 | &xor ("ecx","ecx"); | |
2810 | &jmp (&label("14shortcut")); | |
2811 | ||
2812 | &align (4); | |
2813 | &set_label("14loop"); | |
2814 | &mov ("edx",&DWP(28,"edi")); # rk[7] | |
2815 | &set_label("14shortcut"); | |
2816 | &mov ("eax",&DWP(0,"edi")); # rk[0] | |
2817 | ||
2818 | &enckey (); | |
2819 | ||
2820 | &mov (&DWP(32,"edi"),"eax"); # rk[8] | |
2821 | &xor ("eax",&DWP(4,"edi")); | |
2822 | &mov (&DWP(36,"edi"),"eax"); # rk[9] | |
2823 | &xor ("eax",&DWP(8,"edi")); | |
2824 | &mov (&DWP(40,"edi"),"eax"); # rk[10] | |
2825 | &xor ("eax",&DWP(12,"edi")); | |
2826 | &mov (&DWP(44,"edi"),"eax"); # rk[11] | |
2827 | ||
2828 | &cmp ("ecx",6); | |
2829 | &je (&label("14break")); | |
2830 | &inc ("ecx"); | |
2831 | ||
# Second half of the pass: eax = rk[4] XOR the bytewise table lookup of
# rk[11]. Unlike enckey, every byte stays in its own position (shifts of
# 0/8/16/24 match the source byte) and no rcon is applied.
2832 | &mov ("edx","eax"); | |
2833 | &mov ("eax",&DWP(16,"edi")); # rk[4] | |
2834 | &movz ("esi",&LB("edx")); # rk[11]>>0 | |
53154d71 | 2835 | &movz ("ebx",&BP(-128,$tbl,"esi",1)); |
e7e11507 | 2836 | &movz ("esi",&HB("edx")); # rk[11]>>8 |
e7e11507 AP |
2837 | &xor ("eax","ebx"); |
2838 | ||
53154d71 | 2839 | &movz ("ebx",&BP(-128,$tbl,"esi",1)); |
e7e11507 | 2840 | &shr ("edx",16); |
53154d71 | 2841 | &shl ("ebx",8); |
e7e11507 AP |
2842 | &movz ("esi",&LB("edx")); # rk[11]>>16 |
2843 | &xor ("eax","ebx"); | |
2844 | ||
53154d71 | 2845 | &movz ("ebx",&BP(-128,$tbl,"esi",1)); |
e7e11507 | 2846 | &movz ("esi",&HB("edx")); # rk[11]>>24 |
53154d71 | 2847 | &shl ("ebx",16); |
e7e11507 AP |
2848 | &xor ("eax","ebx"); |
2849 | ||
53154d71 AP |
2850 | &movz ("ebx",&BP(-128,$tbl,"esi",1)); |
2851 | &shl ("ebx",24); | |
e7e11507 AP |
2852 | &xor ("eax","ebx"); |
2853 | ||
2854 | &mov (&DWP(48,"edi"),"eax"); # rk[12] | |
2855 | &xor ("eax",&DWP(20,"edi")); | |
2856 | &mov (&DWP(52,"edi"),"eax"); # rk[13] | |
2857 | &xor ("eax",&DWP(24,"edi")); | |
2858 | &mov (&DWP(56,"edi"),"eax"); # rk[14] | |
2859 | &xor ("eax",&DWP(28,"edi")); | |
2860 | &mov (&DWP(60,"edi"),"eax"); # rk[15] | |
2861 | ||
2862 | &add ("edi",32); | |
2863 | &jmp (&label("14loop")); | |
2864 | ||
# edi has advanced 6*32 = 192 bytes here, so 48(edi) is offset 240 of the schedule.
2865 | &set_label("14break"); | |
2866 | &mov (&DWP(48,"edi"),14); # setup number of rounds | |
2867 | &xor ("eax","eax"); | |
2868 | &jmp (&label("exit")); | |
2869 | ||
2870 | &set_label("badpointer"); | |
2871 | &mov ("eax",-1); | |
2872 | &set_label("exit"); | |
ad8bd4ec AP |
2873 | &function_end("_x86_AES_set_encrypt_key"); |
2874 | ||
2875 | # int AES_set_encrypt_key(const unsigned char *userKey, const int bits, | |
2876 | # AES_KEY *key) | |
# Public entry point: calls _x86_AES_set_encrypt_key and returns straight
# away, so the caller sees whatever that routine left in eax (0, -1 or -2).
# No arguments are re-pushed; the helper reads them from this frame.
2877 | &function_begin_B("AES_set_encrypt_key"); | |
2878 | &call ("_x86_AES_set_encrypt_key"); | |
2879 | &ret (); | |
2880 | &function_end_B("AES_set_encrypt_key"); | |
e7e11507 AP |
2881 | |
2882 | sub deckey() |
# deckey($i,$key,$tp1,$tp2,$tp4,$tp8): emit code that rewrites, in place,
# the dword at 4*$i($key).
#   $i    - dword index within the current 16-byte round key (0..3)
#   $key  - register pointing at that round key
#   $tp1  - register that must already hold the dword to transform
#   $tp2, $tp4, $tp8 - scratch registers
# On the four packed bytes it derives tp2, tp4 and tp8 by repeated bytewise
# doubling (shift left by one, then XOR 0x1b into every byte whose top bit
# was set) and stores
#   ROTATE(tp1,8) ^ tp2^tp4^tp8 ^ ROTATE(tp8^tp2^tp1,24)
#                 ^ ROTATE(tp8^tp4^tp1,16) ^ ROTATE(tp8,8)
# NOTE(review): these are the 0x0e/0x0b/0x0d/0x09 multiples, which looks
# like AES InvMixColumns applied to one word -- confirm against the spec.
# Side effects of the emitted code: $acc and $tbl (aliased as $tmp) are
# clobbered, and $tp2 is left holding the dword at 4*($i+1)($key); the
# call sites pass that register as $tp1 of the following deckey call.
53154d71 AP |
2883 | { my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_; |
2884 | my $tmp = $tbl; | |
e7e11507 | 2885 | |
# tp2 = tp1 doubled bytewise: acc becomes 0x1b in each byte whose high bit
# was set (0x80 - 0x01 = 0x7f, masked with 0x1b).
89f1eb82 AP |
2886 | &mov ($tmp,0x80808080); |
2887 | &and ($tmp,$tp1); | |
96b0f6c1 | 2888 | &lea ($tp2,&DWP(0,$tp1,$tp1)); |
89f1eb82 AP |
2889 | &mov ($acc,$tmp); |
2890 | &shr ($tmp,7); | |
53154d71 | 2891 | &sub ($acc,$tmp); |
96b0f6c1 | 2892 | &and ($tp2,0xfefefefe); |
53154d71 | 2893 | &and ($acc,0x1b1b1b1b); |
89f1eb82 AP |
2894 | &xor ($tp2,$acc); |
2895 | &mov ($tmp,0x80808080); | |
53154d71 | 2896 | |
# tp4 = tp2 doubled the same way; tp2 is folded with tp1 only after the
# lea has consumed its unmodified value.
89f1eb82 | 2897 | &and ($tmp,$tp2); |
96b0f6c1 | 2898 | &lea ($tp4,&DWP(0,$tp2,$tp2)); |
89f1eb82 AP |
2899 | &mov ($acc,$tmp); |
2900 | &shr ($tmp,7); | |
53154d71 | 2901 | &sub ($acc,$tmp); |
96b0f6c1 | 2902 | &and ($tp4,0xfefefefe); |
53154d71 | 2903 | &and ($acc,0x1b1b1b1b); |
96b0f6c1 | 2904 | &xor ($tp2,$tp1); # tp2^tp1 |
89f1eb82 AP |
2905 | &xor ($tp4,$acc); |
2906 | &mov ($tmp,0x80808080); | |
53154d71 | 2907 | |
# tp8 = tp4 doubled; again tp4 is folded with tp1 only after it was read.
89f1eb82 | 2908 | &and ($tmp,$tp4); |
96b0f6c1 | 2909 | &lea ($tp8,&DWP(0,$tp4,$tp4)); |
89f1eb82 AP |
2910 | &mov ($acc,$tmp); |
2911 | &shr ($tmp,7); | |
96b0f6c1 | 2912 | &xor ($tp4,$tp1); # tp4^tp1 |
53154d71 | 2913 | &sub ($acc,$tmp); |
96b0f6c1 | 2914 | &and ($tp8,0xfefefefe); |
53154d71 AP |
2915 | &and ($acc,0x1b1b1b1b); |
2916 | &rotl ($tp1,8); # = ROTATE(tp1,8) | |
2917 | &xor ($tp8,$acc); | |
2918 | ||
2919 | &mov ($tmp,&DWP(4*($i+1),$key)); # modulo-scheduled load | |
2920 | ||
# Fold the rotated terms into tp1; $tmp (the next dword) is handed over in
# $tp2 once $tp2's old value has been consumed.
2921 | &xor ($tp1,$tp2); | |
2922 | &xor ($tp2,$tp8); | |
2923 | &xor ($tp1,$tp4); | |
2924 | &rotl ($tp2,24); | |
2925 | &xor ($tp4,$tp8); | |
2926 | &xor ($tp1,$tp8); # ^= tp8^(tp4^tp1)^(tp2^tp1) | |
2927 | &rotl ($tp4,16); | |
2928 | &xor ($tp1,$tp2); # ^= ROTATE(tp8^tp2^tp1,24) | |
2929 | &rotl ($tp8,8); | |
2930 | &xor ($tp1,$tp4); # ^= ROTATE(tp8^tp4^tp1,16) | |
2931 | &mov ($tp2,$tmp); | |
2932 | &xor ($tp1,$tp8); # ^= ROTATE(tp8,8) | |
2933 | ||
2934 | &mov (&DWP(4*$i,$key),$tp1); | |
e7e11507 AP |
2935 | } |
2936 | ||
2937 | # int AES_set_decrypt_key(const unsigned char *userKey, const int bits, | |
2938 | # AES_KEY *key) | |
# Builds the encryption schedule with _x86_AES_set_encrypt_key and, if that
# returned 0, converts it in place in two steps:
#   1. the 16-byte round keys are reversed in order;
#   2. every round key except the first and the last is run through deckey.
# A non-zero result from the helper is returned to the caller unchanged.
# Returns 0 in eax on success.
e7e11507 | 2939 | &function_begin_B("AES_set_decrypt_key"); |
ad8bd4ec | 2940 | &call ("_x86_AES_set_encrypt_key"); |
e7e11507 AP |
2941 | &cmp ("eax",0); |
2942 | &je (&label("proceed")); | |
2943 | &ret (); | |
2944 | ||
2945 | &set_label("proceed"); | |
2946 | &push ("ebp"); | |
2947 | &push ("ebx"); | |
2948 | &push ("esi"); | |
2949 | &push ("edi"); | |
2950 | ||
# esi = first round key, edi = key + 16*rounds = last round key.
# The swap loop below stops when the two pointers meet, which relies on the
# round count being even (the helper only stores 10, 12 or 14).
2951 | &mov ("esi",&wparam(2)); | |
2952 | &mov ("ecx",&DWP(240,"esi")); # pull number of rounds | |
2953 | &lea ("ecx",&DWP(0,"","ecx",4)); | |
2954 | &lea ("edi",&DWP(0,"esi","ecx",4)); # pointer to last chunk | |
2955 | ||
53154d71 | 2956 | &set_label("invert",4); # invert order of chunks |
e7e11507 AP |
2957 | &mov ("eax",&DWP(0,"esi")); |
2958 | &mov ("ebx",&DWP(4,"esi")); | |
2959 | &mov ("ecx",&DWP(0,"edi")); | |
2960 | &mov ("edx",&DWP(4,"edi")); | |
2961 | &mov (&DWP(0,"edi"),"eax"); | |
2962 | &mov (&DWP(4,"edi"),"ebx"); | |
2963 | &mov (&DWP(0,"esi"),"ecx"); | |
2964 | &mov (&DWP(4,"esi"),"edx"); | |
2965 | &mov ("eax",&DWP(8,"esi")); | |
2966 | &mov ("ebx",&DWP(12,"esi")); | |
2967 | &mov ("ecx",&DWP(8,"edi")); | |
2968 | &mov ("edx",&DWP(12,"edi")); | |
2969 | &mov (&DWP(8,"edi"),"eax"); | |
2970 | &mov (&DWP(12,"edi"),"ebx"); | |
2971 | &mov (&DWP(8,"esi"),"ecx"); | |
2972 | &mov (&DWP(12,"esi"),"edx"); | |
2973 | &add ("esi",16); | |
2974 | &sub ("edi",16); | |
2975 | &cmp ("esi","edi"); | |
2976 | &jne (&label("invert")); | |
2977 | ||
# $acc = key + 8*(2*rounds-2) = key + 16*(rounds-1): the address of the last
# round key to transform. It is parked in the stack slot of the third
# argument (overwriting the key pointer there) and used as the loop bound.
53154d71 AP |
2978 | &mov ($key,&wparam(2)); |
2979 | &mov ($acc,&DWP(240,$key)); # pull number of rounds | |
2980 | &lea ($acc,&DWP(-2,$acc,$acc)); | |
2981 | &lea ($acc,&DWP(0,$key,$acc,8)); | |
2982 | &mov (&wparam(2),$acc); | |
e7e11507 | 2983 | |
# $s0 is preloaded with the first dword of round key 1; each deckey call
# leaves the next dword in the register the following call consumes, so the
# register roles rotate through $s0..$s3 and $s0 is ready again for the
# next pass. The loop bound is compared unsigned (jb) since it is an address.
53154d71 AP |
2984 | &mov ($s0,&DWP(16,$key)); # modulo-scheduled load |
2985 | &set_label("permute",4); # permute the key schedule | |
2986 | &add ($key,16); | |
2987 | &deckey (0,$key,$s0,$s1,$s2,$s3); | |
2988 | &deckey (1,$key,$s1,$s2,$s3,$s0); | |
2989 | &deckey (2,$key,$s2,$s3,$s0,$s1); | |
2990 | &deckey (3,$key,$s3,$s0,$s1,$s2); | |
2991 | &cmp ($key,&wparam(2)); | |
2992 | &jb (&label("permute")); | |
e7e11507 AP |
2993 | |
2994 | &xor ("eax","eax"); # return success | |
2995 | &function_end("AES_set_decrypt_key"); | |
# File epilogue: embed the identification string in the generated module,
# let the perlasm backend emit everything it has collected, then close the
# output stream.
96b0f6c1 | 2996 | &asciz("AES for x86, CRYPTOGAMS by <appro\@openssl.org>"); |
e7e11507 | 2997 | |
71314710 | 2998 | &asm_finish(); |
184bc45f RL |
2999 | |
# The generated assembly is written to STDOUT, so a failed final flush
# (full disk, broken pipe) only surfaces as close()'s return value. Abort
# with the OS error instead of exiting 0 with a truncated output file.
3000 | close STDOUT or die "error closing STDOUT: $!";