git.ipfire.org Git - thirdparty/openssl.git/blob - crypto/aes/aes_x86core.c
Add option for "compact" rounds to aes_x86core.c. "Compact" rounds are
[thirdparty/openssl.git] / crypto / aes / aes_x86core.c
1 /* crypto/aes/aes_core.c -*- mode:C; c-file-style: "eay" -*- */
2 /**
3 * rijndael-alg-fst.c
4 *
5 * @version 3.0 (December 2000)
6 *
7 * Optimised ANSI C code for the Rijndael cipher (now AES)
8 *
9 * @author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
10 * @author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
11 * @author Paulo Barreto <paulo.barreto@terra.com.br>
12 *
13 * This code is hereby placed in the public domain.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
16 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
19 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
22 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
23 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
24 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
25 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28 /*
29 * This is an experimental x86[_64] derivative. It assumes little-endian
30 * byte order and expects the CPU to sustain unaligned memory references.
31 * It is used as a playground for cache-timing attack mitigations and
32 * serves as a reference C implementation for the x86[_64] assembler.
33 *
34 * <appro@fy.chalmers.se>
35 */
36
37
38 #ifndef AES_DEBUG
39 # ifndef NDEBUG
40 # define NDEBUG
41 # endif
42 #endif
43 #include <assert.h>
44
45 #include <stdlib.h>
46 #include <openssl/aes.h>
47 #include "aes_locl.h"
48
49 /*
50 * These two parameters control which table, 256-byte or 2KB, is
51 * referenced in outer and respectively inner rounds.
52 */
/*
 * AES_COMPACT_IN_OUTER_ROUNDS: when defined, the outer rounds use the
 * 256-byte S-box (Te4/Td4, preceded by prefetch256()) instead of the
 * 2KB Te/Td tables; in AES_encrypt these are the first and the last round.
 * AES_COMPACT_IN_INNER_ROUNDS: the same choice for the Nr-2 middle rounds.
 * It is forcibly undefined below whenever the outer option is enabled.
 */
53 #define AES_COMPACT_IN_OUTER_ROUNDS
54 #ifdef AES_COMPACT_IN_OUTER_ROUNDS
55 /* AES_COMPACT_IN_OUTER_ROUNDS costs ~30% in performance, while
56 * adding AES_COMPACT_IN_INNER_ROUNDS reduces benchmark *further*
57 * by factor of ~2. */
58 # undef AES_COMPACT_IN_INNER_ROUNDS
59 #endif
60
#if 1
/*
 * Read one word out of every 32-byte stride of a 256-byte table; judging
 * by the name, the intent is to pull the whole table into cache before it
 * is indexed. The loads go through a volatile pointer and their XOR is
 * stored into a volatile local, so the compiler may not discard them.
 * The table itself is never written.
 */
static void prefetch256(const void *table)
{
    volatile unsigned long *word = (void *)table;
    volatile unsigned long sink;
    unsigned long folded = 0;
    int pos = 0;

    /* 32 is common least cache-line size */
    while (pos < 256/sizeof(word[0])) {
        folded ^= word[pos];
        pos += 32/sizeof(word[0]);
    }

    sink = folded;
}
#else
# define prefetch256(t)
#endif
76
/*
 * GETU32(p): load a 32-bit word straight through a u32 pointer cast, i.e.
 * in host byte order and with no alignment handling (the file header
 * states that little-endian order and unaligned access are assumed).
 * Any earlier definition of GETU32 is discarded first.
 */
77 #undef GETU32
78 #define GETU32(p) (*((u32*)(p)))
79
/*
 * u64: unsigned 64-bit integer type, and U64(C): appends the matching
 * literal suffix to constant C, selected per compiler/platform.
 */
80 #if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__)
81 typedef unsigned __int64 u64;
82 #define U64(C) C##UI64
83 #elif defined(__arch64__)
84 typedef unsigned long u64;
85 #define U64(C) C##UL
86 #else
87 typedef unsigned long long u64;
88 #define U64(C) C##ULL
89 #endif
90
/*
 * ROTATE(a,n): rotate the 32-bit value a left by n bits, using the
 * _lrotl intrinsic on MSVC/ICC or an x86 "roll" instruction via GCC
 * inline asm (the "I" constraint requires n to be a compile-time
 * constant). On any other compiler/target ROTATE is left undefined;
 * callers test "#if defined(ROTATE)" and fall back to plain shifts.
 */
91 #undef ROTATE
92 #if defined(_MSC_VER) || defined(__ICC)
93 # define ROTATE(a,n) _lrotl(a,n)
94 #elif defined(__GNUC__) && __GNUC__>=2
95 # if defined(__i386) || defined(__i386__) || defined(__x86_64) || defined(__x86_64__)
96 # define ROTATE(a,n) ({ register unsigned int ret; \
97 asm ( \
98 "roll %1,%0" \
99 : "=r"(ret) \
100 : "I"(n), "0"(a) \
101 : "cc"); \
102 ret; \
103 })
104 # endif
105 #endif
106 /*
107 Te [x] = S [x].[02, 01, 01, 03, 02, 01, 01, 03];
108 Te0[x] = S [x].[02, 01, 01, 03];
109 Te1[x] = S [x].[03, 02, 01, 01];
110 Te2[x] = S [x].[01, 03, 02, 01];
111 Te3[x] = S [x].[01, 01, 03, 02];
112 */
/*
 * Each macro is meant to be followed by a subscript: TeN[x] expands to
 * (u32)((u64*)((u8*)Te+k))[x]. The subscript binds before the (u32) cast,
 * so this loads the 64-bit entry at byte offset 8*x+k and keeps its low
 * 32 bits. Because every Te entry holds the same 4-byte pattern twice,
 * the byte offset k yields the rotated tables listed above.
 * NOTE(review): for k != 0 and x == 255 the 8-byte load extends k bytes
 * past the end of Te before truncation - confirm this is acceptable.
 */
113 #define Te0 (u32)((u64*)((u8*)Te+0))
114 #define Te1 (u32)((u64*)((u8*)Te+3))
115 #define Te2 (u32)((u64*)((u8*)Te+2))
116 #define Te3 (u32)((u64*)((u8*)Te+1))
117 /*
118 Td [x] = Si[x].[0e, 09, 0d, 0b, 0e, 09, 0d, 0b];
119 Td0[x] = Si[x].[0e, 09, 0d, 0b];
120 Td1[x] = Si[x].[0b, 0e, 09, 0d];
121 Td2[x] = Si[x].[0d, 0b, 0e, 09];
122 Td3[x] = Si[x].[09, 0d, 0b, 0e];
123 Td4[x] = Si[x].[01];
124 */
/*
 * Same construction as Te0..Te3, applied to the decryption table:
 * TdN[x] expands to (u32)((u64*)((u8*)Td+k))[x], i.e. the low 32 bits of
 * the 64-bit load at byte offset 8*x+k inside Td.
 * NOTE(review): for k != 0 and x == 255 the 8-byte load extends k bytes
 * past the end of Td before truncation - confirm this is acceptable.
 */
125 #define Td0 (u32)((u64*)((u8*)Td+0))
126 #define Td1 (u32)((u64*)((u8*)Td+3))
127 #define Td2 (u32)((u64*)((u8*)Td+2))
128 #define Td3 (u32)((u64*)((u8*)Td+1))
129
130 static const u64 Te[256] = {
131 U64(0xa56363c6a56363c6), U64(0x847c7cf8847c7cf8),
132 U64(0x997777ee997777ee), U64(0x8d7b7bf68d7b7bf6),
133 U64(0x0df2f2ff0df2f2ff), U64(0xbd6b6bd6bd6b6bd6),
134 U64(0xb16f6fdeb16f6fde), U64(0x54c5c59154c5c591),
135 U64(0x5030306050303060), U64(0x0301010203010102),
136 U64(0xa96767cea96767ce), U64(0x7d2b2b567d2b2b56),
137 U64(0x19fefee719fefee7), U64(0x62d7d7b562d7d7b5),
138 U64(0xe6abab4de6abab4d), U64(0x9a7676ec9a7676ec),
139 U64(0x45caca8f45caca8f), U64(0x9d82821f9d82821f),
140 U64(0x40c9c98940c9c989), U64(0x877d7dfa877d7dfa),
141 U64(0x15fafaef15fafaef), U64(0xeb5959b2eb5959b2),
142 U64(0xc947478ec947478e), U64(0x0bf0f0fb0bf0f0fb),
143 U64(0xecadad41ecadad41), U64(0x67d4d4b367d4d4b3),
144 U64(0xfda2a25ffda2a25f), U64(0xeaafaf45eaafaf45),
145 U64(0xbf9c9c23bf9c9c23), U64(0xf7a4a453f7a4a453),
146 U64(0x967272e4967272e4), U64(0x5bc0c09b5bc0c09b),
147 U64(0xc2b7b775c2b7b775), U64(0x1cfdfde11cfdfde1),
148 U64(0xae93933dae93933d), U64(0x6a26264c6a26264c),
149 U64(0x5a36366c5a36366c), U64(0x413f3f7e413f3f7e),
150 U64(0x02f7f7f502f7f7f5), U64(0x4fcccc834fcccc83),
151 U64(0x5c3434685c343468), U64(0xf4a5a551f4a5a551),
152 U64(0x34e5e5d134e5e5d1), U64(0x08f1f1f908f1f1f9),
153 U64(0x937171e2937171e2), U64(0x73d8d8ab73d8d8ab),
154 U64(0x5331316253313162), U64(0x3f15152a3f15152a),
155 U64(0x0c0404080c040408), U64(0x52c7c79552c7c795),
156 U64(0x6523234665232346), U64(0x5ec3c39d5ec3c39d),
157 U64(0x2818183028181830), U64(0xa1969637a1969637),
158 U64(0x0f05050a0f05050a), U64(0xb59a9a2fb59a9a2f),
159 U64(0x0907070e0907070e), U64(0x3612122436121224),
160 U64(0x9b80801b9b80801b), U64(0x3de2e2df3de2e2df),
161 U64(0x26ebebcd26ebebcd), U64(0x6927274e6927274e),
162 U64(0xcdb2b27fcdb2b27f), U64(0x9f7575ea9f7575ea),
163 U64(0x1b0909121b090912), U64(0x9e83831d9e83831d),
164 U64(0x742c2c58742c2c58), U64(0x2e1a1a342e1a1a34),
165 U64(0x2d1b1b362d1b1b36), U64(0xb26e6edcb26e6edc),
166 U64(0xee5a5ab4ee5a5ab4), U64(0xfba0a05bfba0a05b),
167 U64(0xf65252a4f65252a4), U64(0x4d3b3b764d3b3b76),
168 U64(0x61d6d6b761d6d6b7), U64(0xceb3b37dceb3b37d),
169 U64(0x7b2929527b292952), U64(0x3ee3e3dd3ee3e3dd),
170 U64(0x712f2f5e712f2f5e), U64(0x9784841397848413),
171 U64(0xf55353a6f55353a6), U64(0x68d1d1b968d1d1b9),
172 U64(0x0000000000000000), U64(0x2cededc12cededc1),
173 U64(0x6020204060202040), U64(0x1ffcfce31ffcfce3),
174 U64(0xc8b1b179c8b1b179), U64(0xed5b5bb6ed5b5bb6),
175 U64(0xbe6a6ad4be6a6ad4), U64(0x46cbcb8d46cbcb8d),
176 U64(0xd9bebe67d9bebe67), U64(0x4b3939724b393972),
177 U64(0xde4a4a94de4a4a94), U64(0xd44c4c98d44c4c98),
178 U64(0xe85858b0e85858b0), U64(0x4acfcf854acfcf85),
179 U64(0x6bd0d0bb6bd0d0bb), U64(0x2aefefc52aefefc5),
180 U64(0xe5aaaa4fe5aaaa4f), U64(0x16fbfbed16fbfbed),
181 U64(0xc5434386c5434386), U64(0xd74d4d9ad74d4d9a),
182 U64(0x5533336655333366), U64(0x9485851194858511),
183 U64(0xcf45458acf45458a), U64(0x10f9f9e910f9f9e9),
184 U64(0x0602020406020204), U64(0x817f7ffe817f7ffe),
185 U64(0xf05050a0f05050a0), U64(0x443c3c78443c3c78),
186 U64(0xba9f9f25ba9f9f25), U64(0xe3a8a84be3a8a84b),
187 U64(0xf35151a2f35151a2), U64(0xfea3a35dfea3a35d),
188 U64(0xc0404080c0404080), U64(0x8a8f8f058a8f8f05),
189 U64(0xad92923fad92923f), U64(0xbc9d9d21bc9d9d21),
190 U64(0x4838387048383870), U64(0x04f5f5f104f5f5f1),
191 U64(0xdfbcbc63dfbcbc63), U64(0xc1b6b677c1b6b677),
192 U64(0x75dadaaf75dadaaf), U64(0x6321214263212142),
193 U64(0x3010102030101020), U64(0x1affffe51affffe5),
194 U64(0x0ef3f3fd0ef3f3fd), U64(0x6dd2d2bf6dd2d2bf),
195 U64(0x4ccdcd814ccdcd81), U64(0x140c0c18140c0c18),
196 U64(0x3513132635131326), U64(0x2fececc32fececc3),
197 U64(0xe15f5fbee15f5fbe), U64(0xa2979735a2979735),
198 U64(0xcc444488cc444488), U64(0x3917172e3917172e),
199 U64(0x57c4c49357c4c493), U64(0xf2a7a755f2a7a755),
200 U64(0x827e7efc827e7efc), U64(0x473d3d7a473d3d7a),
201 U64(0xac6464c8ac6464c8), U64(0xe75d5dbae75d5dba),
202 U64(0x2b1919322b191932), U64(0x957373e6957373e6),
203 U64(0xa06060c0a06060c0), U64(0x9881811998818119),
204 U64(0xd14f4f9ed14f4f9e), U64(0x7fdcdca37fdcdca3),
205 U64(0x6622224466222244), U64(0x7e2a2a547e2a2a54),
206 U64(0xab90903bab90903b), U64(0x8388880b8388880b),
207 U64(0xca46468cca46468c), U64(0x29eeeec729eeeec7),
208 U64(0xd3b8b86bd3b8b86b), U64(0x3c1414283c141428),
209 U64(0x79dedea779dedea7), U64(0xe25e5ebce25e5ebc),
210 U64(0x1d0b0b161d0b0b16), U64(0x76dbdbad76dbdbad),
211 U64(0x3be0e0db3be0e0db), U64(0x5632326456323264),
212 U64(0x4e3a3a744e3a3a74), U64(0x1e0a0a141e0a0a14),
213 U64(0xdb494992db494992), U64(0x0a06060c0a06060c),
214 U64(0x6c2424486c242448), U64(0xe45c5cb8e45c5cb8),
215 U64(0x5dc2c29f5dc2c29f), U64(0x6ed3d3bd6ed3d3bd),
216 U64(0xefacac43efacac43), U64(0xa66262c4a66262c4),
217 U64(0xa8919139a8919139), U64(0xa4959531a4959531),
218 U64(0x37e4e4d337e4e4d3), U64(0x8b7979f28b7979f2),
219 U64(0x32e7e7d532e7e7d5), U64(0x43c8c88b43c8c88b),
220 U64(0x5937376e5937376e), U64(0xb76d6ddab76d6dda),
221 U64(0x8c8d8d018c8d8d01), U64(0x64d5d5b164d5d5b1),
222 U64(0xd24e4e9cd24e4e9c), U64(0xe0a9a949e0a9a949),
223 U64(0xb46c6cd8b46c6cd8), U64(0xfa5656acfa5656ac),
224 U64(0x07f4f4f307f4f4f3), U64(0x25eaeacf25eaeacf),
225 U64(0xaf6565caaf6565ca), U64(0x8e7a7af48e7a7af4),
226 U64(0xe9aeae47e9aeae47), U64(0x1808081018080810),
227 U64(0xd5baba6fd5baba6f), U64(0x887878f0887878f0),
228 U64(0x6f25254a6f25254a), U64(0x722e2e5c722e2e5c),
229 U64(0x241c1c38241c1c38), U64(0xf1a6a657f1a6a657),
230 U64(0xc7b4b473c7b4b473), U64(0x51c6c69751c6c697),
231 U64(0x23e8e8cb23e8e8cb), U64(0x7cdddda17cdddda1),
232 U64(0x9c7474e89c7474e8), U64(0x211f1f3e211f1f3e),
233 U64(0xdd4b4b96dd4b4b96), U64(0xdcbdbd61dcbdbd61),
234 U64(0x868b8b0d868b8b0d), U64(0x858a8a0f858a8a0f),
235 U64(0x907070e0907070e0), U64(0x423e3e7c423e3e7c),
236 U64(0xc4b5b571c4b5b571), U64(0xaa6666ccaa6666cc),
237 U64(0xd8484890d8484890), U64(0x0503030605030306),
238 U64(0x01f6f6f701f6f6f7), U64(0x120e0e1c120e0e1c),
239 U64(0xa36161c2a36161c2), U64(0x5f35356a5f35356a),
240 U64(0xf95757aef95757ae), U64(0xd0b9b969d0b9b969),
241 U64(0x9186861791868617), U64(0x58c1c19958c1c199),
242 U64(0x271d1d3a271d1d3a), U64(0xb99e9e27b99e9e27),
243 U64(0x38e1e1d938e1e1d9), U64(0x13f8f8eb13f8f8eb),
244 U64(0xb398982bb398982b), U64(0x3311112233111122),
245 U64(0xbb6969d2bb6969d2), U64(0x70d9d9a970d9d9a9),
246 U64(0x898e8e07898e8e07), U64(0xa7949433a7949433),
247 U64(0xb69b9b2db69b9b2d), U64(0x221e1e3c221e1e3c),
248 U64(0x9287871592878715), U64(0x20e9e9c920e9e9c9),
249 U64(0x49cece8749cece87), U64(0xff5555aaff5555aa),
250 U64(0x7828285078282850), U64(0x7adfdfa57adfdfa5),
251 U64(0x8f8c8c038f8c8c03), U64(0xf8a1a159f8a1a159),
252 U64(0x8089890980898909), U64(0x170d0d1a170d0d1a),
253 U64(0xdabfbf65dabfbf65), U64(0x31e6e6d731e6e6d7),
254 U64(0xc6424284c6424284), U64(0xb86868d0b86868d0),
255 U64(0xc3414182c3414182), U64(0xb0999929b0999929),
256 U64(0x772d2d5a772d2d5a), U64(0x110f0f1e110f0f1e),
257 U64(0xcbb0b07bcbb0b07b), U64(0xfc5454a8fc5454a8),
258 U64(0xd6bbbb6dd6bbbb6d), U64(0x3a16162c3a16162c)
259 };
260
261 static const u8 Te4[256] = {
262 0x63U, 0x7cU, 0x77U, 0x7bU, 0xf2U, 0x6bU, 0x6fU, 0xc5U,
263 0x30U, 0x01U, 0x67U, 0x2bU, 0xfeU, 0xd7U, 0xabU, 0x76U,
264 0xcaU, 0x82U, 0xc9U, 0x7dU, 0xfaU, 0x59U, 0x47U, 0xf0U,
265 0xadU, 0xd4U, 0xa2U, 0xafU, 0x9cU, 0xa4U, 0x72U, 0xc0U,
266 0xb7U, 0xfdU, 0x93U, 0x26U, 0x36U, 0x3fU, 0xf7U, 0xccU,
267 0x34U, 0xa5U, 0xe5U, 0xf1U, 0x71U, 0xd8U, 0x31U, 0x15U,
268 0x04U, 0xc7U, 0x23U, 0xc3U, 0x18U, 0x96U, 0x05U, 0x9aU,
269 0x07U, 0x12U, 0x80U, 0xe2U, 0xebU, 0x27U, 0xb2U, 0x75U,
270 0x09U, 0x83U, 0x2cU, 0x1aU, 0x1bU, 0x6eU, 0x5aU, 0xa0U,
271 0x52U, 0x3bU, 0xd6U, 0xb3U, 0x29U, 0xe3U, 0x2fU, 0x84U,
272 0x53U, 0xd1U, 0x00U, 0xedU, 0x20U, 0xfcU, 0xb1U, 0x5bU,
273 0x6aU, 0xcbU, 0xbeU, 0x39U, 0x4aU, 0x4cU, 0x58U, 0xcfU,
274 0xd0U, 0xefU, 0xaaU, 0xfbU, 0x43U, 0x4dU, 0x33U, 0x85U,
275 0x45U, 0xf9U, 0x02U, 0x7fU, 0x50U, 0x3cU, 0x9fU, 0xa8U,
276 0x51U, 0xa3U, 0x40U, 0x8fU, 0x92U, 0x9dU, 0x38U, 0xf5U,
277 0xbcU, 0xb6U, 0xdaU, 0x21U, 0x10U, 0xffU, 0xf3U, 0xd2U,
278 0xcdU, 0x0cU, 0x13U, 0xecU, 0x5fU, 0x97U, 0x44U, 0x17U,
279 0xc4U, 0xa7U, 0x7eU, 0x3dU, 0x64U, 0x5dU, 0x19U, 0x73U,
280 0x60U, 0x81U, 0x4fU, 0xdcU, 0x22U, 0x2aU, 0x90U, 0x88U,
281 0x46U, 0xeeU, 0xb8U, 0x14U, 0xdeU, 0x5eU, 0x0bU, 0xdbU,
282 0xe0U, 0x32U, 0x3aU, 0x0aU, 0x49U, 0x06U, 0x24U, 0x5cU,
283 0xc2U, 0xd3U, 0xacU, 0x62U, 0x91U, 0x95U, 0xe4U, 0x79U,
284 0xe7U, 0xc8U, 0x37U, 0x6dU, 0x8dU, 0xd5U, 0x4eU, 0xa9U,
285 0x6cU, 0x56U, 0xf4U, 0xeaU, 0x65U, 0x7aU, 0xaeU, 0x08U,
286 0xbaU, 0x78U, 0x25U, 0x2eU, 0x1cU, 0xa6U, 0xb4U, 0xc6U,
287 0xe8U, 0xddU, 0x74U, 0x1fU, 0x4bU, 0xbdU, 0x8bU, 0x8aU,
288 0x70U, 0x3eU, 0xb5U, 0x66U, 0x48U, 0x03U, 0xf6U, 0x0eU,
289 0x61U, 0x35U, 0x57U, 0xb9U, 0x86U, 0xc1U, 0x1dU, 0x9eU,
290 0xe1U, 0xf8U, 0x98U, 0x11U, 0x69U, 0xd9U, 0x8eU, 0x94U,
291 0x9bU, 0x1eU, 0x87U, 0xe9U, 0xceU, 0x55U, 0x28U, 0xdfU,
292 0x8cU, 0xa1U, 0x89U, 0x0dU, 0xbfU, 0xe6U, 0x42U, 0x68U,
293 0x41U, 0x99U, 0x2dU, 0x0fU, 0xb0U, 0x54U, 0xbbU, 0x16U
294 };
295
296 static const u64 Td[256] = {
297 U64(0x50a7f45150a7f451), U64(0x5365417e5365417e),
298 U64(0xc3a4171ac3a4171a), U64(0x965e273a965e273a),
299 U64(0xcb6bab3bcb6bab3b), U64(0xf1459d1ff1459d1f),
300 U64(0xab58faacab58faac), U64(0x9303e34b9303e34b),
301 U64(0x55fa302055fa3020), U64(0xf66d76adf66d76ad),
302 U64(0x9176cc889176cc88), U64(0x254c02f5254c02f5),
303 U64(0xfcd7e54ffcd7e54f), U64(0xd7cb2ac5d7cb2ac5),
304 U64(0x8044352680443526), U64(0x8fa362b58fa362b5),
305 U64(0x495ab1de495ab1de), U64(0x671bba25671bba25),
306 U64(0x980eea45980eea45), U64(0xe1c0fe5de1c0fe5d),
307 U64(0x02752fc302752fc3), U64(0x12f04c8112f04c81),
308 U64(0xa397468da397468d), U64(0xc6f9d36bc6f9d36b),
309 U64(0xe75f8f03e75f8f03), U64(0x959c9215959c9215),
310 U64(0xeb7a6dbfeb7a6dbf), U64(0xda595295da595295),
311 U64(0x2d83bed42d83bed4), U64(0xd3217458d3217458),
312 U64(0x2969e0492969e049), U64(0x44c8c98e44c8c98e),
313 U64(0x6a89c2756a89c275), U64(0x78798ef478798ef4),
314 U64(0x6b3e58996b3e5899), U64(0xdd71b927dd71b927),
315 U64(0xb64fe1beb64fe1be), U64(0x17ad88f017ad88f0),
316 U64(0x66ac20c966ac20c9), U64(0xb43ace7db43ace7d),
317 U64(0x184adf63184adf63), U64(0x82311ae582311ae5),
318 U64(0x6033519760335197), U64(0x457f5362457f5362),
319 U64(0xe07764b1e07764b1), U64(0x84ae6bbb84ae6bbb),
320 U64(0x1ca081fe1ca081fe), U64(0x942b08f9942b08f9),
321 U64(0x5868487058684870), U64(0x19fd458f19fd458f),
322 U64(0x876cde94876cde94), U64(0xb7f87b52b7f87b52),
323 U64(0x23d373ab23d373ab), U64(0xe2024b72e2024b72),
324 U64(0x578f1fe3578f1fe3), U64(0x2aab55662aab5566),
325 U64(0x0728ebb20728ebb2), U64(0x03c2b52f03c2b52f),
326 U64(0x9a7bc5869a7bc586), U64(0xa50837d3a50837d3),
327 U64(0xf2872830f2872830), U64(0xb2a5bf23b2a5bf23),
328 U64(0xba6a0302ba6a0302), U64(0x5c8216ed5c8216ed),
329 U64(0x2b1ccf8a2b1ccf8a), U64(0x92b479a792b479a7),
330 U64(0xf0f207f3f0f207f3), U64(0xa1e2694ea1e2694e),
331 U64(0xcdf4da65cdf4da65), U64(0xd5be0506d5be0506),
332 U64(0x1f6234d11f6234d1), U64(0x8afea6c48afea6c4),
333 U64(0x9d532e349d532e34), U64(0xa055f3a2a055f3a2),
334 U64(0x32e18a0532e18a05), U64(0x75ebf6a475ebf6a4),
335 U64(0x39ec830b39ec830b), U64(0xaaef6040aaef6040),
336 U64(0x069f715e069f715e), U64(0x51106ebd51106ebd),
337 U64(0xf98a213ef98a213e), U64(0x3d06dd963d06dd96),
338 U64(0xae053eddae053edd), U64(0x46bde64d46bde64d),
339 U64(0xb58d5491b58d5491), U64(0x055dc471055dc471),
340 U64(0x6fd406046fd40604), U64(0xff155060ff155060),
341 U64(0x24fb981924fb9819), U64(0x97e9bdd697e9bdd6),
342 U64(0xcc434089cc434089), U64(0x779ed967779ed967),
343 U64(0xbd42e8b0bd42e8b0), U64(0x888b8907888b8907),
344 U64(0x385b19e7385b19e7), U64(0xdbeec879dbeec879),
345 U64(0x470a7ca1470a7ca1), U64(0xe90f427ce90f427c),
346 U64(0xc91e84f8c91e84f8), U64(0x0000000000000000),
347 U64(0x8386800983868009), U64(0x48ed2b3248ed2b32),
348 U64(0xac70111eac70111e), U64(0x4e725a6c4e725a6c),
349 U64(0xfbff0efdfbff0efd), U64(0x5638850f5638850f),
350 U64(0x1ed5ae3d1ed5ae3d), U64(0x27392d3627392d36),
351 U64(0x64d90f0a64d90f0a), U64(0x21a65c6821a65c68),
352 U64(0xd1545b9bd1545b9b), U64(0x3a2e36243a2e3624),
353 U64(0xb1670a0cb1670a0c), U64(0x0fe757930fe75793),
354 U64(0xd296eeb4d296eeb4), U64(0x9e919b1b9e919b1b),
355 U64(0x4fc5c0804fc5c080), U64(0xa220dc61a220dc61),
356 U64(0x694b775a694b775a), U64(0x161a121c161a121c),
357 U64(0x0aba93e20aba93e2), U64(0xe52aa0c0e52aa0c0),
358 U64(0x43e0223c43e0223c), U64(0x1d171b121d171b12),
359 U64(0x0b0d090e0b0d090e), U64(0xadc78bf2adc78bf2),
360 U64(0xb9a8b62db9a8b62d), U64(0xc8a91e14c8a91e14),
361 U64(0x8519f1578519f157), U64(0x4c0775af4c0775af),
362 U64(0xbbdd99eebbdd99ee), U64(0xfd607fa3fd607fa3),
363 U64(0x9f2601f79f2601f7), U64(0xbcf5725cbcf5725c),
364 U64(0xc53b6644c53b6644), U64(0x347efb5b347efb5b),
365 U64(0x7629438b7629438b), U64(0xdcc623cbdcc623cb),
366 U64(0x68fcedb668fcedb6), U64(0x63f1e4b863f1e4b8),
367 U64(0xcadc31d7cadc31d7), U64(0x1085634210856342),
368 U64(0x4022971340229713), U64(0x2011c6842011c684),
369 U64(0x7d244a857d244a85), U64(0xf83dbbd2f83dbbd2),
370 U64(0x1132f9ae1132f9ae), U64(0x6da129c76da129c7),
371 U64(0x4b2f9e1d4b2f9e1d), U64(0xf330b2dcf330b2dc),
372 U64(0xec52860dec52860d), U64(0xd0e3c177d0e3c177),
373 U64(0x6c16b32b6c16b32b), U64(0x99b970a999b970a9),
374 U64(0xfa489411fa489411), U64(0x2264e9472264e947),
375 U64(0xc48cfca8c48cfca8), U64(0x1a3ff0a01a3ff0a0),
376 U64(0xd82c7d56d82c7d56), U64(0xef903322ef903322),
377 U64(0xc74e4987c74e4987), U64(0xc1d138d9c1d138d9),
378 U64(0xfea2ca8cfea2ca8c), U64(0x360bd498360bd498),
379 U64(0xcf81f5a6cf81f5a6), U64(0x28de7aa528de7aa5),
380 U64(0x268eb7da268eb7da), U64(0xa4bfad3fa4bfad3f),
381 U64(0xe49d3a2ce49d3a2c), U64(0x0d9278500d927850),
382 U64(0x9bcc5f6a9bcc5f6a), U64(0x62467e5462467e54),
383 U64(0xc2138df6c2138df6), U64(0xe8b8d890e8b8d890),
384 U64(0x5ef7392e5ef7392e), U64(0xf5afc382f5afc382),
385 U64(0xbe805d9fbe805d9f), U64(0x7c93d0697c93d069),
386 U64(0xa92dd56fa92dd56f), U64(0xb31225cfb31225cf),
387 U64(0x3b99acc83b99acc8), U64(0xa77d1810a77d1810),
388 U64(0x6e639ce86e639ce8), U64(0x7bbb3bdb7bbb3bdb),
389 U64(0x097826cd097826cd), U64(0xf418596ef418596e),
390 U64(0x01b79aec01b79aec), U64(0xa89a4f83a89a4f83),
391 U64(0x656e95e6656e95e6), U64(0x7ee6ffaa7ee6ffaa),
392 U64(0x08cfbc2108cfbc21), U64(0xe6e815efe6e815ef),
393 U64(0xd99be7bad99be7ba), U64(0xce366f4ace366f4a),
394 U64(0xd4099fead4099fea), U64(0xd67cb029d67cb029),
395 U64(0xafb2a431afb2a431), U64(0x31233f2a31233f2a),
396 U64(0x3094a5c63094a5c6), U64(0xc066a235c066a235),
397 U64(0x37bc4e7437bc4e74), U64(0xa6ca82fca6ca82fc),
398 U64(0xb0d090e0b0d090e0), U64(0x15d8a73315d8a733),
399 U64(0x4a9804f14a9804f1), U64(0xf7daec41f7daec41),
400 U64(0x0e50cd7f0e50cd7f), U64(0x2ff691172ff69117),
401 U64(0x8dd64d768dd64d76), U64(0x4db0ef434db0ef43),
402 U64(0x544daacc544daacc), U64(0xdf0496e4df0496e4),
403 U64(0xe3b5d19ee3b5d19e), U64(0x1b886a4c1b886a4c),
404 U64(0xb81f2cc1b81f2cc1), U64(0x7f5165467f516546),
405 U64(0x04ea5e9d04ea5e9d), U64(0x5d358c015d358c01),
406 U64(0x737487fa737487fa), U64(0x2e410bfb2e410bfb),
407 U64(0x5a1d67b35a1d67b3), U64(0x52d2db9252d2db92),
408 U64(0x335610e9335610e9), U64(0x1347d66d1347d66d),
409 U64(0x8c61d79a8c61d79a), U64(0x7a0ca1377a0ca137),
410 U64(0x8e14f8598e14f859), U64(0x893c13eb893c13eb),
411 U64(0xee27a9ceee27a9ce), U64(0x35c961b735c961b7),
412 U64(0xede51ce1ede51ce1), U64(0x3cb1477a3cb1477a),
413 U64(0x59dfd29c59dfd29c), U64(0x3f73f2553f73f255),
414 U64(0x79ce141879ce1418), U64(0xbf37c773bf37c773),
415 U64(0xeacdf753eacdf753), U64(0x5baafd5f5baafd5f),
416 U64(0x146f3ddf146f3ddf), U64(0x86db447886db4478),
417 U64(0x81f3afca81f3afca), U64(0x3ec468b93ec468b9),
418 U64(0x2c3424382c342438), U64(0x5f40a3c25f40a3c2),
419 U64(0x72c31d1672c31d16), U64(0x0c25e2bc0c25e2bc),
420 U64(0x8b493c288b493c28), U64(0x41950dff41950dff),
421 U64(0x7101a8397101a839), U64(0xdeb30c08deb30c08),
422 U64(0x9ce4b4d89ce4b4d8), U64(0x90c1566490c15664),
423 U64(0x6184cb7b6184cb7b), U64(0x70b632d570b632d5),
424 U64(0x745c6c48745c6c48), U64(0x4257b8d04257b8d0)
425 };
426 static const u8 Td4[256] = {
427 0x52U, 0x09U, 0x6aU, 0xd5U, 0x30U, 0x36U, 0xa5U, 0x38U,
428 0xbfU, 0x40U, 0xa3U, 0x9eU, 0x81U, 0xf3U, 0xd7U, 0xfbU,
429 0x7cU, 0xe3U, 0x39U, 0x82U, 0x9bU, 0x2fU, 0xffU, 0x87U,
430 0x34U, 0x8eU, 0x43U, 0x44U, 0xc4U, 0xdeU, 0xe9U, 0xcbU,
431 0x54U, 0x7bU, 0x94U, 0x32U, 0xa6U, 0xc2U, 0x23U, 0x3dU,
432 0xeeU, 0x4cU, 0x95U, 0x0bU, 0x42U, 0xfaU, 0xc3U, 0x4eU,
433 0x08U, 0x2eU, 0xa1U, 0x66U, 0x28U, 0xd9U, 0x24U, 0xb2U,
434 0x76U, 0x5bU, 0xa2U, 0x49U, 0x6dU, 0x8bU, 0xd1U, 0x25U,
435 0x72U, 0xf8U, 0xf6U, 0x64U, 0x86U, 0x68U, 0x98U, 0x16U,
436 0xd4U, 0xa4U, 0x5cU, 0xccU, 0x5dU, 0x65U, 0xb6U, 0x92U,
437 0x6cU, 0x70U, 0x48U, 0x50U, 0xfdU, 0xedU, 0xb9U, 0xdaU,
438 0x5eU, 0x15U, 0x46U, 0x57U, 0xa7U, 0x8dU, 0x9dU, 0x84U,
439 0x90U, 0xd8U, 0xabU, 0x00U, 0x8cU, 0xbcU, 0xd3U, 0x0aU,
440 0xf7U, 0xe4U, 0x58U, 0x05U, 0xb8U, 0xb3U, 0x45U, 0x06U,
441 0xd0U, 0x2cU, 0x1eU, 0x8fU, 0xcaU, 0x3fU, 0x0fU, 0x02U,
442 0xc1U, 0xafU, 0xbdU, 0x03U, 0x01U, 0x13U, 0x8aU, 0x6bU,
443 0x3aU, 0x91U, 0x11U, 0x41U, 0x4fU, 0x67U, 0xdcU, 0xeaU,
444 0x97U, 0xf2U, 0xcfU, 0xceU, 0xf0U, 0xb4U, 0xe6U, 0x73U,
445 0x96U, 0xacU, 0x74U, 0x22U, 0xe7U, 0xadU, 0x35U, 0x85U,
446 0xe2U, 0xf9U, 0x37U, 0xe8U, 0x1cU, 0x75U, 0xdfU, 0x6eU,
447 0x47U, 0xf1U, 0x1aU, 0x71U, 0x1dU, 0x29U, 0xc5U, 0x89U,
448 0x6fU, 0xb7U, 0x62U, 0x0eU, 0xaaU, 0x18U, 0xbeU, 0x1bU,
449 0xfcU, 0x56U, 0x3eU, 0x4bU, 0xc6U, 0xd2U, 0x79U, 0x20U,
450 0x9aU, 0xdbU, 0xc0U, 0xfeU, 0x78U, 0xcdU, 0x5aU, 0xf4U,
451 0x1fU, 0xddU, 0xa8U, 0x33U, 0x88U, 0x07U, 0xc7U, 0x31U,
452 0xb1U, 0x12U, 0x10U, 0x59U, 0x27U, 0x80U, 0xecU, 0x5fU,
453 0x60U, 0x51U, 0x7fU, 0xa9U, 0x19U, 0xb5U, 0x4aU, 0x0dU,
454 0x2dU, 0xe5U, 0x7aU, 0x9fU, 0x93U, 0xc9U, 0x9cU, 0xefU,
455 0xa0U, 0xe0U, 0x3bU, 0x4dU, 0xaeU, 0x2aU, 0xf5U, 0xb0U,
456 0xc8U, 0xebU, 0xbbU, 0x3cU, 0x83U, 0x53U, 0x99U, 0x61U,
457 0x17U, 0x2bU, 0x04U, 0x7eU, 0xbaU, 0x77U, 0xd6U, 0x26U,
458 0xe1U, 0x69U, 0x14U, 0x63U, 0x55U, 0x21U, 0x0cU, 0x7dU
459 };
460
/*
 * Key-schedule round constants. AES_set_encrypt_key XORs rcon[i] into the
 * first word of each newly derived key group; the constant sits in the
 * low byte of the word, matching the little-endian word layout this file
 * assumes.
 */
461 static const u32 rcon[] = {
462 0x00000001U, 0x00000002U, 0x00000004U, 0x00000008U,
463 0x00000010U, 0x00000020U, 0x00000040U, 0x00000080U,
464 0x0000001bU, 0x00000036U, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
465 };
466
467 /**
468 * Expand the cipher key into the encryption key schedule.
469 */
470 int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
471 AES_KEY *key) {
472
473 u32 *rk;
474 int i = 0;
475 u32 temp;
476
477 if (!userKey || !key)
478 return -1;
479 if (bits != 128 && bits != 192 && bits != 256)
480 return -2;
481
482 rk = key->rd_key;
483
484 if (bits==128)
485 key->rounds = 10;
486 else if (bits==192)
487 key->rounds = 12;
488 else
489 key->rounds = 14;
490
491 rk[0] = GETU32(userKey );
492 rk[1] = GETU32(userKey + 4);
493 rk[2] = GETU32(userKey + 8);
494 rk[3] = GETU32(userKey + 12);
495 if (bits == 128) {
496 while (1) {
497 temp = rk[3];
498 rk[4] = rk[0] ^
499 (Te2[(temp >> 8) & 0xff] & 0x000000ffU) ^
500 (Te3[(temp >> 16) & 0xff] & 0x0000ff00U) ^
501 (Te0[(temp >> 24) ] & 0x00ff0000U) ^
502 (Te1[(temp ) & 0xff] & 0xff000000U) ^
503 rcon[i];
504 rk[5] = rk[1] ^ rk[4];
505 rk[6] = rk[2] ^ rk[5];
506 rk[7] = rk[3] ^ rk[6];
507 if (++i == 10) {
508 return 0;
509 }
510 rk += 4;
511 }
512 }
513 rk[4] = GETU32(userKey + 16);
514 rk[5] = GETU32(userKey + 20);
515 if (bits == 192) {
516 while (1) {
517 temp = rk[ 5];
518 rk[ 6] = rk[ 0] ^
519 (Te2[(temp >> 8) & 0xff] & 0x000000ffU) ^
520 (Te3[(temp >> 16) & 0xff] & 0x0000ff00U) ^
521 (Te0[(temp >> 24) ] & 0x00ff0000U) ^
522 (Te1[(temp ) & 0xff] & 0xff000000U) ^
523 rcon[i];
524 rk[ 7] = rk[ 1] ^ rk[ 6];
525 rk[ 8] = rk[ 2] ^ rk[ 7];
526 rk[ 9] = rk[ 3] ^ rk[ 8];
527 if (++i == 8) {
528 return 0;
529 }
530 rk[10] = rk[ 4] ^ rk[ 9];
531 rk[11] = rk[ 5] ^ rk[10];
532 rk += 6;
533 }
534 }
535 rk[6] = GETU32(userKey + 24);
536 rk[7] = GETU32(userKey + 28);
537 if (bits == 256) {
538 while (1) {
539 temp = rk[ 7];
540 rk[ 8] = rk[ 0] ^
541 (Te2[(temp >> 8) & 0xff] & 0x000000ffU) ^
542 (Te3[(temp >> 16) & 0xff] & 0x0000ff00U) ^
543 (Te0[(temp >> 24) ] & 0x00ff0000U) ^
544 (Te1[(temp ) & 0xff] & 0xff000000U) ^
545 rcon[i];
546 rk[ 9] = rk[ 1] ^ rk[ 8];
547 rk[10] = rk[ 2] ^ rk[ 9];
548 rk[11] = rk[ 3] ^ rk[10];
549 if (++i == 7) {
550 return 0;
551 }
552 temp = rk[11];
553 rk[12] = rk[ 4] ^
554 (Te2[(temp ) & 0xff] & 0x000000ffU) ^
555 (Te3[(temp >> 8) & 0xff] & 0x0000ff00U) ^
556 (Te0[(temp >> 16) & 0xff] & 0x00ff0000U) ^
557 (Te1[(temp >> 24) ] & 0xff000000U);
558 rk[13] = rk[ 5] ^ rk[12];
559 rk[14] = rk[ 6] ^ rk[13];
560 rk[15] = rk[ 7] ^ rk[14];
561
562 rk += 8;
563 }
564 }
565 return 0;
566 }
567
568 /**
569 * Expand the cipher key into the decryption key schedule.
570 */
571 int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
572 AES_KEY *key) {
573
574 u32 *rk;
575 int i, j, status;
576 u32 temp;
577
578 /* first, start with an encryption schedule */
579 status = AES_set_encrypt_key(userKey, bits, key);
580 if (status < 0)
581 return status;
582
583 rk = key->rd_key;
584
585 /* invert the order of the round keys: */
586 for (i = 0, j = 4*(key->rounds); i < j; i += 4, j -= 4) {
587 temp = rk[i ]; rk[i ] = rk[j ]; rk[j ] = temp;
588 temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp;
589 temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp;
590 temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp;
591 }
592 /* apply the inverse MixColumn transform to all round keys but the first and the last: */
593 for (i = 1; i < (key->rounds); i++) {
594 rk += 4;
595 rk[0] =
596 Td0[Te2[(rk[0] ) & 0xff] & 0xff] ^
597 Td1[Te2[(rk[0] >> 8) & 0xff] & 0xff] ^
598 Td2[Te2[(rk[0] >> 16) & 0xff] & 0xff] ^
599 Td3[Te2[(rk[0] >> 24) ] & 0xff];
600 rk[1] =
601 Td0[Te2[(rk[1] ) & 0xff] & 0xff] ^
602 Td1[Te2[(rk[1] >> 8) & 0xff] & 0xff] ^
603 Td2[Te2[(rk[1] >> 16) & 0xff] & 0xff] ^
604 Td3[Te2[(rk[1] >> 24) ] & 0xff];
605 rk[2] =
606 Td0[Te2[(rk[2] ) & 0xff] & 0xff] ^
607 Td1[Te2[(rk[2] >> 8) & 0xff] & 0xff] ^
608 Td2[Te2[(rk[2] >> 16) & 0xff] & 0xff] ^
609 Td3[Te2[(rk[2] >> 24) ] & 0xff];
610 rk[3] =
611 Td0[Te2[(rk[3] ) & 0xff] & 0xff] ^
612 Td1[Te2[(rk[3] >> 8) & 0xff] & 0xff] ^
613 Td2[Te2[(rk[3] >> 16) & 0xff] & 0xff] ^
614 Td3[Te2[(rk[3] >> 24) ] & 0xff];
615 }
616 return 0;
617 }
618
619 /*
620 * Encrypt a single block
621 * in and out can overlap
622 */
623 void AES_encrypt(const unsigned char *in, unsigned char *out,
624 const AES_KEY *key) {
625
626 const u32 *rk;
627 u32 s0, s1, s2, s3, t[4];
628 int r;
629
630 assert(in && out && key);
631 rk = key->rd_key;
632
633 /*
634 * map byte array block to cipher state
635 * and add initial round key:
636 */
637 s0 = GETU32(in ) ^ rk[0];
638 s1 = GETU32(in + 4) ^ rk[1];
639 s2 = GETU32(in + 8) ^ rk[2];
640 s3 = GETU32(in + 12) ^ rk[3];
641
642 #if defined(AES_COMPACT_IN_OUTER_ROUNDS)
643 prefetch256(Te4);
644
645 t[0] = Te4[(s0 ) & 0xff] ^
646 Te4[(s1 >> 8) & 0xff] << 8 ^
647 Te4[(s2 >> 16) & 0xff] << 16 ^
648 Te4[(s3 >> 24) ] << 24;
649 t[1] = Te4[(s1 ) & 0xff] ^
650 Te4[(s2 >> 8) & 0xff] << 8 ^
651 Te4[(s3 >> 16) & 0xff] << 16 ^
652 Te4[(s0 >> 24) ] << 24;
653 t[2] = Te4[(s2 ) & 0xff] ^
654 Te4[(s3 >> 8) & 0xff] << 8 ^
655 Te4[(s0 >> 16) & 0xff] << 16 ^
656 Te4[(s1 >> 24) ] << 24;
657 t[3] = Te4[(s3 ) & 0xff] ^
658 Te4[(s0 >> 8) & 0xff] << 8 ^
659 Te4[(s1 >> 16) & 0xff] << 16 ^
660 Te4[(s2 >> 24) ] << 24;
661
662 /* now do the linear transform using words */
663 { int i;
664 u32 r0, r1, r2;
665
666 for (i = 0; i < 4; i++) {
667 r0 = t[i];
668 r1 = r0 & 0x80808080;
669 r2 = ((r0 & 0x7f7f7f7f) << 1) ^
670 ((r1 - (r1 >> 7)) & 0x1b1b1b1b);
671 #if defined(ROTATE)
672 t[i] = r2 ^ ROTATE(r2,24) ^ ROTATE(r0,24) ^
673 ROTATE(r0,16) ^ ROTATE(r0,8);
674 #else
675 t[i] = r2 ^ ((r2 ^ r0) << 24) ^ ((r2 ^ r0) >> 8) ^
676 (r0 << 16) ^ (r0 >> 16) ^
677 (r0 << 8) ^ (r0 >> 24);
678 #endif
679 t[i] ^= rk[4+i];
680 }
681 }
682 #else
683 t[0] = Te0[(s0 ) & 0xff] ^
684 Te1[(s1 >> 8) & 0xff] ^
685 Te2[(s2 >> 16) & 0xff] ^
686 Te3[(s3 >> 24) ] ^
687 rk[4];
688 t[1] = Te0[(s1 ) & 0xff] ^
689 Te1[(s2 >> 8) & 0xff] ^
690 Te2[(s3 >> 16) & 0xff] ^
691 Te3[(s0 >> 24) ] ^
692 rk[5];
693 t[2] = Te0[(s2 ) & 0xff] ^
694 Te1[(s3 >> 8) & 0xff] ^
695 Te2[(s0 >> 16) & 0xff] ^
696 Te3[(s1 >> 24) ] ^
697 rk[6];
698 t[3] = Te0[(s3 ) & 0xff] ^
699 Te1[(s0 >> 8) & 0xff] ^
700 Te2[(s1 >> 16) & 0xff] ^
701 Te3[(s2 >> 24) ] ^
702 rk[7];
703 #endif
704 s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];
705
706 /*
707 * Nr - 2 full rounds:
708 */
709 for (rk+=8,r=key->rounds-2; r>0; rk+=4,r--) {
710 #if defined(AES_COMPACT_IN_INNER_ROUNDS)
711 t[0] = Te4[(s0 ) & 0xff] ^
712 Te4[(s1 >> 8) & 0xff] << 8 ^
713 Te4[(s2 >> 16) & 0xff] << 16 ^
714 Te4[(s3 >> 24) ] << 24;
715 t[1] = Te4[(s1 ) & 0xff] ^
716 Te4[(s2 >> 8) & 0xff] << 8 ^
717 Te4[(s3 >> 16) & 0xff] << 16 ^
718 Te4[(s0 >> 24) ] << 24;
719 t[2] = Te4[(s2 ) & 0xff] ^
720 Te4[(s3 >> 8) & 0xff] << 8 ^
721 Te4[(s0 >> 16) & 0xff] << 16 ^
722 Te4[(s1 >> 24) ] << 24;
723 t[3] = Te4[(s3 ) & 0xff] ^
724 Te4[(s0 >> 8) & 0xff] << 8 ^
725 Te4[(s1 >> 16) & 0xff] << 16 ^
726 Te4[(s2 >> 24) ] << 24;
727
728 /* now do the linear transform using words */
729 { int i;
730 u32 r0, r1, r2;
731
732 for (i = 0; i < 4; i++) {
733 r0 = t[i];
734 r1 = r0 & 0x80808080;
735 r2 = ((r0 & 0x7f7f7f7f) << 1) ^
736 ((r1 - (r1 >> 7)) & 0x1b1b1b1b);
737 #if defined(ROTATE)
738 t[i] = r2 ^ ROTATE(r2,24) ^ ROTATE(r0,24) ^
739 ROTATE(r0,16) ^ ROTATE(r0,8);
740 #else
741 t[i] = r2 ^ ((r2 ^ r0) << 24) ^ ((r2 ^ r0) >> 8) ^
742 (r0 << 16) ^ (r0 >> 16) ^
743 (r0 << 8) ^ (r0 >> 24);
744 #endif
745 t[i] ^= rk[i];
746 }
747 }
748 #else
749 t[0] = Te0[(s0 ) & 0xff] ^
750 Te1[(s1 >> 8) & 0xff] ^
751 Te2[(s2 >> 16) & 0xff] ^
752 Te3[(s3 >> 24) ] ^
753 rk[0];
754 t[1] = Te0[(s1 ) & 0xff] ^
755 Te1[(s2 >> 8) & 0xff] ^
756 Te2[(s3 >> 16) & 0xff] ^
757 Te3[(s0 >> 24) ] ^
758 rk[1];
759 t[2] = Te0[(s2 ) & 0xff] ^
760 Te1[(s3 >> 8) & 0xff] ^
761 Te2[(s0 >> 16) & 0xff] ^
762 Te3[(s1 >> 24) ] ^
763 rk[2];
764 t[3] = Te0[(s3 ) & 0xff] ^
765 Te1[(s0 >> 8) & 0xff] ^
766 Te2[(s1 >> 16) & 0xff] ^
767 Te3[(s2 >> 24) ] ^
768 rk[3];
769 #endif
770 s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];
771 }
772 /*
773 * apply last round and
774 * map cipher state to byte array block:
775 */
776 #if defined(AES_COMPACT_IN_OUTER_ROUNDS)
777 prefetch256(Te4);
778
779 *(u32*)(out+0) =
780 Te4[(s0 ) & 0xff] ^
781 Te4[(s1 >> 8) & 0xff] << 8 ^
782 Te4[(s2 >> 16) & 0xff] << 16 ^
783 Te4[(s3 >> 24) ] << 24 ^
784 rk[0];
785 *(u32*)(out+4) =
786 Te4[(s1 ) & 0xff] ^
787 Te4[(s2 >> 8) & 0xff] << 8 ^
788 Te4[(s3 >> 16) & 0xff] << 16 ^
789 Te4[(s0 >> 24) ] << 24 ^
790 rk[1];
791 *(u32*)(out+8) =
792 Te4[(s2 ) & 0xff] ^
793 Te4[(s3 >> 8) & 0xff] << 8 ^
794 Te4[(s0 >> 16) & 0xff] << 16 ^
795 Te4[(s1 >> 24) ] << 24 ^
796 rk[2];
797 *(u32*)(out+12) =
798 Te4[(s3 ) & 0xff] ^
799 Te4[(s0 >> 8) & 0xff] << 8 ^
800 Te4[(s1 >> 16) & 0xff] << 16 ^
801 Te4[(s2 >> 24) ] << 24 ^
802 rk[3];
803 #else
804 *(u32*)(out+0) =
805 (Te2[(s0 ) & 0xff] & 0x000000ffU) ^
806 (Te3[(s1 >> 8) & 0xff] & 0x0000ff00U) ^
807 (Te0[(s2 >> 16) & 0xff] & 0x00ff0000U) ^
808 (Te1[(s3 >> 24) ] & 0xff000000U) ^
809 rk[0];
810 *(u32*)(out+4) =
811 (Te2[(s1 ) & 0xff] & 0x000000ffU) ^
812 (Te3[(s2 >> 8) & 0xff] & 0x0000ff00U) ^
813 (Te0[(s3 >> 16) & 0xff] & 0x00ff0000U) ^
814 (Te1[(s0 >> 24) ] & 0xff000000U) ^
815 rk[1];
816 *(u32*)(out+8) =
817 (Te2[(s2 ) & 0xff] & 0x000000ffU) ^
818 (Te3[(s3 >> 8) & 0xff] & 0x0000ff00U) ^
819 (Te0[(s0 >> 16) & 0xff] & 0x00ff0000U) ^
820 (Te1[(s1 >> 24) ] & 0xff000000U) ^
821 rk[2];
822 *(u32*)(out+12) =
823 (Te2[(s3 ) & 0xff] & 0x000000ffU) ^
824 (Te3[(s0 >> 8) & 0xff] & 0x0000ff00U) ^
825 (Te0[(s1 >> 16) & 0xff] & 0x00ff0000U) ^
826 (Te1[(s2 >> 24) ] & 0xff000000U) ^
827 rk[3];
828 #endif
829 }
830
831 /*
832 * Decrypt a single block
833 * in and out can overlap
834 */
835 void AES_decrypt(const unsigned char *in, unsigned char *out,
836 const AES_KEY *key) {
837
838 const u32 *rk;
839 u32 s0, s1, s2, s3, t[4];
840 int r;
841
842 assert(in && out && key);
843 rk = key->rd_key;
844
845 /*
846 * map byte array block to cipher state
847 * and add initial round key:
848 */
849 s0 = GETU32(in ) ^ rk[0];
850 s1 = GETU32(in + 4) ^ rk[1];
851 s2 = GETU32(in + 8) ^ rk[2];
852 s3 = GETU32(in + 12) ^ rk[3];
853
854 #if defined(AES_COMPACT_IN_OUTER_ROUNDS)
855 prefetch256(Td4);
856
857 t[0] = Td4[(s0 ) & 0xff] ^
858 Td4[(s3 >> 8) & 0xff] << 8 ^
859 Td4[(s2 >> 16) & 0xff] << 16 ^
860 Td4[(s1 >> 24) ] << 24;
861 t[1] = Td4[(s1 ) & 0xff] ^
862 Td4[(s0 >> 8) & 0xff] << 8 ^
863 Td4[(s3 >> 16) & 0xff] << 16 ^
864 Td4[(s2 >> 24) ] << 24;
865 t[2] = Td4[(s2 ) & 0xff] ^
866 Td4[(s1 >> 8) & 0xff] << 8 ^
867 Td4[(s0 >> 16) & 0xff] << 16 ^
868 Td4[(s3 >> 24) ] << 24;
869 t[3] = Td4[(s3 ) & 0xff] ^
870 Td4[(s2 >> 8) & 0xff] << 8 ^
871 Td4[(s1 >> 16) & 0xff] << 16 ^
872 Td4[(s0 >> 24) ] << 24;
873
874 /* now do the linear transform using words */
875 { int i;
876 u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;
877
878 for (i = 0; i < 4; i++) {
879 tp1 = t[i];
880 m = tp1 & 0x80808080;
881 tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^
882 ((m - (m >> 7)) & 0x1b1b1b1b);
883 m = tp2 & 0x80808080;
884 tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^
885 ((m - (m >> 7)) & 0x1b1b1b1b);
886 m = tp4 & 0x80808080;
887 tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^
888 ((m - (m >> 7)) & 0x1b1b1b1b);
889 tp9 = tp8 ^ tp1;
890 tpb = tp9 ^ tp2;
891 tpd = tp9 ^ tp4;
892 tpe = tp8 ^ tp4 ^ tp2;
893 #if defined(ROTATE)
894 t[i] = tpe ^ ROTATE(tpd,16) ^
895 ROTATE(tp9,8) ^ ROTATE(tpb,24);
896 #else
897 t[i] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
898 (tp9 >> 24) ^ (tp9 << 8) ^
899 (tpb >> 8) ^ (tpb << 24);
900 #endif
901 t[i] ^= rk[4+i];
902 }
903 }
904 #else
905 t[0] = Td0[(s0 ) & 0xff] ^
906 Td1[(s3 >> 8) & 0xff] ^
907 Td2[(s2 >> 16) & 0xff] ^
908 Td3[(s1 >> 24) ] ^
909 rk[4];
910 t[1] = Td0[(s1 ) & 0xff] ^
911 Td1[(s0 >> 8) & 0xff] ^
912 Td2[(s3 >> 16) & 0xff] ^
913 Td3[(s2 >> 24) ] ^
914 rk[5];
915 t[2] = Td0[(s2 ) & 0xff] ^
916 Td1[(s1 >> 8) & 0xff] ^
917 Td2[(s0 >> 16) & 0xff] ^
918 Td3[(s3 >> 24) ] ^
919 rk[6];
920 t[3] = Td0[(s3 ) & 0xff] ^
921 Td1[(s2 >> 8) & 0xff] ^
922 Td2[(s1 >> 16) & 0xff] ^
923 Td3[(s0 >> 24) ] ^
924 rk[7];
925 #endif
926 s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];
927
928 /*
929 * Nr - 2 full rounds:
930 */
931 for (rk+=8,r=key->rounds-2; r>0; rk+=4,r--) {
932 #if defined(AES_COMPACT_IN_INNER_ROUNDS)
933 t[0] = Td4[(s0 ) & 0xff] ^
934 Td4[(s3 >> 8) & 0xff] << 8 ^
935 Td4[(s2 >> 16) & 0xff] << 16 ^
936 Td4[(s1 >> 24) ] << 24;
937 t[1] = Td4[(s1 ) & 0xff] ^
938 Td4[(s0 >> 8) & 0xff] << 8 ^
939 Td4[(s3 >> 16) & 0xff] << 16 ^
940 Td4[(s2 >> 24) ] << 24;
941 t[2] = Td4[(s2 ) & 0xff] ^
942 Td4[(s1 >> 8) & 0xff] << 8 ^
943 Td4[(s0 >> 16) & 0xff] << 16 ^
944 Td4[(s3 >> 24) ] << 24;
945 t[3] = Td4[(s3 ) & 0xff] ^
946 Td4[(s2 >> 8) & 0xff] << 8 ^
947 Td4[(s1 >> 16) & 0xff] << 16 ^
948 Td4[(s0 >> 24) ] << 24;
949
950 /* now do the linear transform using words */
951 { int i;
952 u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;
953
954 for (i = 0; i < 4; i++) {
955 tp1 = t[i];
956 m = tp1 & 0x80808080;
957 tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^
958 ((m - (m >> 7)) & 0x1b1b1b1b);
959 m = tp2 & 0x80808080;
960 tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^
961 ((m - (m >> 7)) & 0x1b1b1b1b);
962 m = tp4 & 0x80808080;
963 tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^
964 ((m - (m >> 7)) & 0x1b1b1b1b);
965 tp9 = tp8 ^ tp1;
966 tpb = tp9 ^ tp2;
967 tpd = tp9 ^ tp4;
968 tpe = tp8 ^ tp4 ^ tp2;
969 #if defined(ROTATE)
970 t[i] = tpe ^ ROTATE(tpd,16) ^
971 ROTATE(tp9,8) ^ ROTATE(tpb,24);
972 #else
973 t[i] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^
974 (tp9 >> 24) ^ (tp9 << 8) ^
975 (tpb >> 8) ^ (tpb << 24);
976 #endif
977 t[i] ^= rk[i];
978 }
979 }
980 #else
981 t[0] = Td0[(s0 ) & 0xff] ^
982 Td1[(s3 >> 8) & 0xff] ^
983 Td2[(s2 >> 16) & 0xff] ^
984 Td3[(s1 >> 24) ] ^
985 rk[0];
986 t[1] = Td0[(s1 ) & 0xff] ^
987 Td1[(s0 >> 8) & 0xff] ^
988 Td2[(s3 >> 16) & 0xff] ^
989 Td3[(s2 >> 24) ] ^
990 rk[1];
991 t[2] = Td0[(s2 ) & 0xff] ^
992 Td1[(s1 >> 8) & 0xff] ^
993 Td2[(s0 >> 16) & 0xff] ^
994 Td3[(s3 >> 24) ] ^
995 rk[2];
996 t[3] = Td0[(s3 ) & 0xff] ^
997 Td1[(s2 >> 8) & 0xff] ^
998 Td2[(s1 >> 16) & 0xff] ^
999 Td3[(s0 >> 24) ] ^
1000 rk[3];
1001 #endif
1002 s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];
1003 }
1004 /*
1005 * apply last round and
1006 * map cipher state to byte array block:
1007 */
1008 prefetch256(Td4);
1009
1010 *(u32*)(out+0) =
1011 (Td4[(s0 ) & 0xff]) ^
1012 (Td4[(s3 >> 8) & 0xff] << 8) ^
1013 (Td4[(s2 >> 16) & 0xff] << 16) ^
1014 (Td4[(s1 >> 24) ] << 24) ^
1015 rk[0];
1016 *(u32*)(out+4) =
1017 (Td4[(s1 ) & 0xff]) ^
1018 (Td4[(s0 >> 8) & 0xff] << 8) ^
1019 (Td4[(s3 >> 16) & 0xff] << 16) ^
1020 (Td4[(s2 >> 24) ] << 24) ^
1021 rk[1];
1022 *(u32*)(out+8) =
1023 (Td4[(s2 ) & 0xff]) ^
1024 (Td4[(s1 >> 8) & 0xff] << 8) ^
1025 (Td4[(s0 >> 16) & 0xff] << 16) ^
1026 (Td4[(s3 >> 24) ] << 24) ^
1027 rk[2];
1028 *(u32*)(out+12) =
1029 (Td4[(s3 ) & 0xff]) ^
1030 (Td4[(s2 >> 8) & 0xff] << 8) ^
1031 (Td4[(s1 >> 16) & 0xff] << 16) ^
1032 (Td4[(s0 >> 24) ] << 24) ^
1033 rk[3];
1034 }