/*
 * linux/arch/arm64/crypto/aes-neon.S - AES cipher for ARMv8 NEON
 *
 * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

#define AES_ENTRY(func)		ENTRY(neon_ ## func)
#define AES_ENDPROC(func)	ENDPROC(neon_ ## func)

	/* multiply by polynomial 'x' in GF(2^8) */
	.macro		mul_by_x, out, in, temp, const
	sshr		\temp, \in, #7
	add		\out, \in, \in
	and		\temp, \temp, \const
	eor		\out, \out, \temp
	.endm
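	/*
	 * The arithmetic shift replicates the top bit of each byte into
	 * \temp, so the masked constant (0x1b) is xor'ed in exactly when
	 * the doubling overflows GF(2^8): e.g. 0x57 -> 0xae (plain
	 * shift), but 0xae -> 0x47 (0x5c ^ 0x1b), i.e. reduction modulo
	 * x^8 + x^4 + x^3 + x + 1.
	 */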

	/* preload the entire Sbox */
	.macro		prepare, sbox, shiftrows, temp
	adr		\temp, \sbox
	movi		v12.16b, #0x40
	ldr		q13, \shiftrows
	movi		v14.16b, #0x1b
	ld1		{v16.16b-v19.16b}, [\temp], #64
	ld1		{v20.16b-v23.16b}, [\temp], #64
	ld1		{v24.16b-v27.16b}, [\temp], #64
	ld1		{v28.16b-v31.16b}, [\temp]
	.endm
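	/*
	 * After 'prepare', v16-v31 hold the full 256-byte Sbox, v13 the
	 * ShiftRows permutation, v12 the 0x40 stride used to index the
	 * Sbox in 64-byte windows, and v14 the 0x1b reduction term for
	 * mul_by_x.
	 */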

	/* do preload for encryption */
	.macro		enc_prepare, ignore0, ignore1, temp
	prepare		.LForward_Sbox, .LForward_ShiftRows, \temp
	.endm

	.macro		enc_switch_key, ignore0, ignore1, temp
	/* do nothing */
	.endm

	/* do preload for decryption */
	.macro		dec_prepare, ignore0, ignore1, temp
	prepare		.LReverse_Sbox, .LReverse_ShiftRows, \temp
	.endm

	/* apply SubBytes transformation using the preloaded Sbox */
	.macro		sub_bytes, in
	sub		v9.16b, \in\().16b, v12.16b
	tbl		\in\().16b, {v16.16b-v19.16b}, \in\().16b
	sub		v10.16b, v9.16b, v12.16b
	tbx		\in\().16b, {v20.16b-v23.16b}, v9.16b
	sub		v11.16b, v10.16b, v12.16b
	tbx		\in\().16b, {v24.16b-v27.16b}, v10.16b
	tbx		\in\().16b, {v28.16b-v31.16b}, v11.16b
	.endm
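	/*
	 * tbl zeroes bytes whose index is out of range whereas tbx leaves
	 * them untouched, so subtracting the 0x40 stride before each
	 * lookup walks the 256-entry Sbox in four 64-byte windows, with
	 * each lookup only filling in the bytes that fall inside its
	 * window.
	 */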

	/* apply MixColumns transformation */
	.macro		mix_columns, in
	mul_by_x	v10.16b, \in\().16b, v9.16b, v14.16b
	rev32		v8.8h, \in\().8h
	eor		\in\().16b, v10.16b, \in\().16b
	shl		v9.4s, v8.4s, #24
	shl		v11.4s, \in\().4s, #24
	sri		v9.4s, v8.4s, #8
	sri		v11.4s, \in\().4s, #8
	eor		v9.16b, v9.16b, v8.16b
	eor		v10.16b, v10.16b, v9.16b
	eor		\in\().16b, v10.16b, v11.16b
	.endm
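	/*
	 * This effectively computes, for each 32-bit column a,
	 *   2.a ^ 3.(a >>> 8) ^ (a >>> 16) ^ (a >>> 24)
	 * (>>> denoting rotation within the word), using rev32 for the
	 * 16-bit rotations and shl/sri pairs for the 8-bit ones - i.e.
	 * the MixColumns matrix { 2, 3, 1, 1 } applied to each column.
	 */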

	/* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
	.macro		inv_mix_columns, in
	mul_by_x	v11.16b, \in\().16b, v10.16b, v14.16b
	mul_by_x	v11.16b, v11.16b, v10.16b, v14.16b
	eor		\in\().16b, \in\().16b, v11.16b
	rev32		v11.8h, v11.8h
	eor		\in\().16b, \in\().16b, v11.16b
	mix_columns	\in
	.endm
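	/*
	 * This works because the InvMixColumns matrix factors as
	 *   { 0xe, 0xb, 0xd, 0x9 } = { 2, 3, 1, 1 } x { 5, 0, 4, 0 }
	 * in GF(2^8)[y]/(y^4+1): the two mul_by_x and the first eor
	 * compute a ^ 4.a, the rev32/eor pair folds in (4.a >>> 16),
	 * yielding (5 + 4.y^2).a, and mix_columns then supplies the
	 * remaining { 2, 3, 1, 1 } factor.
	 */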

	.macro		do_block, enc, in, rounds, rk, rkp, i
	ld1		{v15.4s}, [\rk]
	add		\rkp, \rk, #16
	mov		\i, \rounds
1111:	eor		\in\().16b, \in\().16b, v15.16b		/* ^round key */
	tbl		\in\().16b, {\in\().16b}, v13.16b	/* ShiftRows */
	sub_bytes	\in
	ld1		{v15.4s}, [\rkp], #16
	subs		\i, \i, #1
	beq		2222f
	.if		\enc == 1
	mix_columns	\in
	.else
	inv_mix_columns	\in
	.endif
	b		1111b
2222:	eor		\in\().16b, \in\().16b, v15.16b		/* ^round key */
	.endm
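	/*
	 * One complete AES operation: an initial AddRoundKey followed by
	 * \rounds rounds of ShiftRows, SubBytes, MixColumns (omitted in
	 * the final round) and AddRoundKey. ShiftRows only permutes bytes
	 * and SubBytes substitutes each byte independently, so the two
	 * commute and this ordering matches the FIPS-197 one.
	 */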

	.macro		encrypt_block, in, rounds, rk, rkp, i
	do_block	1, \in, \rounds, \rk, \rkp, \i
	.endm

	.macro		decrypt_block, in, rounds, rk, rkp, i
	do_block	0, \in, \rounds, \rk, \rkp, \i
	.endm

	/*
	 * Interleaved versions: functionally equivalent to the
	 * ones above, but applied to 2 or 4 AES states in parallel.
	 */
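	/*
	 * Interleaving independent states gives the pipeline unrelated
	 * instructions to issue while the long-latency tbl/tbx results
	 * of the other states are still in flight.
	 */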

	.macro		sub_bytes_2x, in0, in1
	sub		v8.16b, \in0\().16b, v12.16b
	sub		v9.16b, \in1\().16b, v12.16b
	tbl		\in0\().16b, {v16.16b-v19.16b}, \in0\().16b
	tbl		\in1\().16b, {v16.16b-v19.16b}, \in1\().16b
	sub		v10.16b, v8.16b, v12.16b
	sub		v11.16b, v9.16b, v12.16b
	tbx		\in0\().16b, {v20.16b-v23.16b}, v8.16b
	tbx		\in1\().16b, {v20.16b-v23.16b}, v9.16b
	sub		v8.16b, v10.16b, v12.16b
	sub		v9.16b, v11.16b, v12.16b
	tbx		\in0\().16b, {v24.16b-v27.16b}, v10.16b
	tbx		\in1\().16b, {v24.16b-v27.16b}, v11.16b
	tbx		\in0\().16b, {v28.16b-v31.16b}, v8.16b
	tbx		\in1\().16b, {v28.16b-v31.16b}, v9.16b
	.endm

	.macro		sub_bytes_4x, in0, in1, in2, in3
	sub		v8.16b, \in0\().16b, v12.16b
	tbl		\in0\().16b, {v16.16b-v19.16b}, \in0\().16b
	sub		v9.16b, \in1\().16b, v12.16b
	tbl		\in1\().16b, {v16.16b-v19.16b}, \in1\().16b
	sub		v10.16b, \in2\().16b, v12.16b
	tbl		\in2\().16b, {v16.16b-v19.16b}, \in2\().16b
	sub		v11.16b, \in3\().16b, v12.16b
	tbl		\in3\().16b, {v16.16b-v19.16b}, \in3\().16b
	tbx		\in0\().16b, {v20.16b-v23.16b}, v8.16b
	tbx		\in1\().16b, {v20.16b-v23.16b}, v9.16b
	sub		v8.16b, v8.16b, v12.16b
	tbx		\in2\().16b, {v20.16b-v23.16b}, v10.16b
	sub		v9.16b, v9.16b, v12.16b
	tbx		\in3\().16b, {v20.16b-v23.16b}, v11.16b
	sub		v10.16b, v10.16b, v12.16b
	tbx		\in0\().16b, {v24.16b-v27.16b}, v8.16b
	sub		v11.16b, v11.16b, v12.16b
	tbx		\in1\().16b, {v24.16b-v27.16b}, v9.16b
	sub		v8.16b, v8.16b, v12.16b
	tbx		\in2\().16b, {v24.16b-v27.16b}, v10.16b
	sub		v9.16b, v9.16b, v12.16b
	tbx		\in3\().16b, {v24.16b-v27.16b}, v11.16b
	sub		v10.16b, v10.16b, v12.16b
	tbx		\in0\().16b, {v28.16b-v31.16b}, v8.16b
	sub		v11.16b, v11.16b, v12.16b
	tbx		\in1\().16b, {v28.16b-v31.16b}, v9.16b
	tbx		\in2\().16b, {v28.16b-v31.16b}, v10.16b
	tbx		\in3\().16b, {v28.16b-v31.16b}, v11.16b
	.endm

	.macro		mul_by_x_2x, out0, out1, in0, in1, tmp0, tmp1, const
	sshr		\tmp0\().16b, \in0\().16b, #7
	add		\out0\().16b, \in0\().16b, \in0\().16b
	sshr		\tmp1\().16b, \in1\().16b, #7
	and		\tmp0\().16b, \tmp0\().16b, \const\().16b
	add		\out1\().16b, \in1\().16b, \in1\().16b
	and		\tmp1\().16b, \tmp1\().16b, \const\().16b
	eor		\out0\().16b, \out0\().16b, \tmp0\().16b
	eor		\out1\().16b, \out1\().16b, \tmp1\().16b
	.endm

	.macro		mix_columns_2x, in0, in1
	mul_by_x_2x	v8, v9, \in0, \in1, v10, v11, v14
	rev32		v10.8h, \in0\().8h
	rev32		v11.8h, \in1\().8h
	eor		\in0\().16b, v8.16b, \in0\().16b
	eor		\in1\().16b, v9.16b, \in1\().16b
	shl		v12.4s, v10.4s, #24
	shl		v13.4s, v11.4s, #24
	eor		v8.16b, v8.16b, v10.16b
	sri		v12.4s, v10.4s, #8
	shl		v10.4s, \in0\().4s, #24
	eor		v9.16b, v9.16b, v11.16b
	sri		v13.4s, v11.4s, #8
	shl		v11.4s, \in1\().4s, #24
	sri		v10.4s, \in0\().4s, #8
	eor		\in0\().16b, v8.16b, v12.16b
	sri		v11.4s, \in1\().4s, #8
	eor		\in1\().16b, v9.16b, v13.16b
	eor		\in0\().16b, v10.16b, \in0\().16b
	eor		\in1\().16b, v11.16b, \in1\().16b
	.endm

	.macro		inv_mix_cols_2x, in0, in1
	mul_by_x_2x	v8, v9, \in0, \in1, v10, v11, v14
	mul_by_x_2x	v8, v9, v8, v9, v10, v11, v14
	eor		\in0\().16b, \in0\().16b, v8.16b
	eor		\in1\().16b, \in1\().16b, v9.16b
	rev32		v8.8h, v8.8h
	rev32		v9.8h, v9.8h
	eor		\in0\().16b, \in0\().16b, v8.16b
	eor		\in1\().16b, \in1\().16b, v9.16b
	mix_columns_2x	\in0, \in1
	.endm

	.macro		inv_mix_cols_4x, in0, in1, in2, in3
	mul_by_x_2x	v8, v9, \in0, \in1, v10, v11, v14
	mul_by_x_2x	v10, v11, \in2, \in3, v12, v13, v14
	mul_by_x_2x	v8, v9, v8, v9, v12, v13, v14
	mul_by_x_2x	v10, v11, v10, v11, v12, v13, v14
	eor		\in0\().16b, \in0\().16b, v8.16b
	eor		\in1\().16b, \in1\().16b, v9.16b
	eor		\in2\().16b, \in2\().16b, v10.16b
	eor		\in3\().16b, \in3\().16b, v11.16b
	rev32		v8.8h, v8.8h
	rev32		v9.8h, v9.8h
	rev32		v10.8h, v10.8h
	rev32		v11.8h, v11.8h
	eor		\in0\().16b, \in0\().16b, v8.16b
	eor		\in1\().16b, \in1\().16b, v9.16b
	eor		\in2\().16b, \in2\().16b, v10.16b
	eor		\in3\().16b, \in3\().16b, v11.16b
	mix_columns_2x	\in0, \in1
	mix_columns_2x	\in2, \in3
	.endm

	.macro		do_block_2x, enc, in0, in1, rounds, rk, rkp, i
	ld1		{v15.4s}, [\rk]
	add		\rkp, \rk, #16
	mov		\i, \rounds
1111:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
	sub_bytes_2x	\in0, \in1
	tbl		\in0\().16b, {\in0\().16b}, v13.16b	/* ShiftRows */
	tbl		\in1\().16b, {\in1\().16b}, v13.16b	/* ShiftRows */
	ld1		{v15.4s}, [\rkp], #16
	subs		\i, \i, #1
	beq		2222f
	.if		\enc == 1
	mix_columns_2x	\in0, \in1
	ldr		q13, .LForward_ShiftRows
	.else
	inv_mix_cols_2x	\in0, \in1
	ldr		q13, .LReverse_ShiftRows
	.endif
	movi		v12.16b, #0x40
	b		1111b
2222:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
	.endm
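	/*
	 * mix_columns_2x (and, via it, inv_mix_cols_2x/4x) uses v12 and
	 * v13 as scratch registers, clobbering the 0x40 stride and the
	 * ShiftRows vector, so both are reinstated at the bottom of the
	 * round loop.
	 */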

	.macro		do_block_4x, enc, in0, in1, in2, in3, rounds, rk, rkp, i
	ld1		{v15.4s}, [\rk]
	add		\rkp, \rk, #16
	mov		\i, \rounds
1111:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
	eor		\in2\().16b, \in2\().16b, v15.16b	/* ^round key */
	eor		\in3\().16b, \in3\().16b, v15.16b	/* ^round key */
	sub_bytes_4x	\in0, \in1, \in2, \in3
	tbl		\in0\().16b, {\in0\().16b}, v13.16b	/* ShiftRows */
	tbl		\in1\().16b, {\in1\().16b}, v13.16b	/* ShiftRows */
	tbl		\in2\().16b, {\in2\().16b}, v13.16b	/* ShiftRows */
	tbl		\in3\().16b, {\in3\().16b}, v13.16b	/* ShiftRows */
	ld1		{v15.4s}, [\rkp], #16
	subs		\i, \i, #1
	beq		2222f
	.if		\enc == 1
	mix_columns_2x	\in0, \in1
	mix_columns_2x	\in2, \in3
	ldr		q13, .LForward_ShiftRows
	.else
	inv_mix_cols_4x	\in0, \in1, \in2, \in3
	ldr		q13, .LReverse_ShiftRows
	.endif
	movi		v12.16b, #0x40
	b		1111b
2222:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
	eor		\in2\().16b, \in2\().16b, v15.16b	/* ^round key */
	eor		\in3\().16b, \in3\().16b, v15.16b	/* ^round key */
	.endm

	.macro		encrypt_block2x, in0, in1, rounds, rk, rkp, i
	do_block_2x	1, \in0, \in1, \rounds, \rk, \rkp, \i
	.endm

	.macro		decrypt_block2x, in0, in1, rounds, rk, rkp, i
	do_block_2x	0, \in0, \in1, \rounds, \rk, \rkp, \i
	.endm

	.macro		encrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
	do_block_4x	1, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
	.endm

	.macro		decrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
	do_block_4x	0, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
	.endm

#include "aes-modes.S"

	.text
	.align		4
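	/*
	 * The ShiftRows vectors below are loaded with 'ldr q13', a scalar
	 * load that is endian-sensitive, so a byte-reversed copy of each
	 * permutation is emitted for CPU_BE builds.
	 */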
.LForward_ShiftRows:
CPU_LE(	.byte		0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3	)
CPU_LE(	.byte		0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb	)
CPU_BE(	.byte		0xb, 0x6, 0x1, 0xc, 0x7, 0x2, 0xd, 0x8	)
CPU_BE(	.byte		0x3, 0xe, 0x9, 0x4, 0xf, 0xa, 0x5, 0x0	)

.LReverse_ShiftRows:
CPU_LE(	.byte		0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb	)
CPU_LE(	.byte		0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3	)
CPU_BE(	.byte		0x3, 0x6, 0x9, 0xc, 0xf, 0x2, 0x5, 0x8	)
CPU_BE(	.byte		0xb, 0xe, 0x1, 0x4, 0x7, 0xa, 0xd, 0x0	)

.LForward_Sbox:
	.byte		0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
	.byte		0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
	.byte		0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
	.byte		0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
	.byte		0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
	.byte		0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
	.byte		0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
	.byte		0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
	.byte		0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
	.byte		0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
	.byte		0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
	.byte		0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
	.byte		0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
	.byte		0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
	.byte		0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
	.byte		0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
	.byte		0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
	.byte		0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
	.byte		0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
	.byte		0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
	.byte		0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
	.byte		0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
	.byte		0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
	.byte		0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
	.byte		0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
	.byte		0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
	.byte		0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
	.byte		0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
	.byte		0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
	.byte		0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
	.byte		0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
	.byte		0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16

.LReverse_Sbox:
	.byte		0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
	.byte		0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
	.byte		0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
	.byte		0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
	.byte		0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
	.byte		0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
	.byte		0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
	.byte		0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
	.byte		0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
	.byte		0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
	.byte		0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
	.byte		0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
	.byte		0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
	.byte		0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
	.byte		0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
	.byte		0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
	.byte		0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
	.byte		0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
	.byte		0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
	.byte		0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
	.byte		0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
	.byte		0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
	.byte		0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
	.byte		0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
	.byte		0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
	.byte		0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
	.byte		0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
	.byte		0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
	.byte		0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
	.byte		0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
	.byte		0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
	.byte		0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d