]>
Commit | Line | Data |
---|---|---|
a35c3a9f | 1 | #!/usr/bin/env perl |
da1c088f | 2 | # Copyright 2020-2023 The OpenSSL Project Authors. All Rights Reserved. |
a35c3a9f TC |
3 | # |
4 | # Licensed under the Apache License 2.0 (the "License"). You may not use | |
5 | # this file except in compliance with the License. You can obtain a copy | |
6 | # in the file LICENSE in the source distribution or at | |
7 | # https://www.openssl.org/source/license.html | |
8 | ||
9 | use strict; | |
10 | ||
11 | my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; # last argument is the output file if it ends in ".ext" | |
12 | my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; # first argument is the flavour if it contains no "." | |
13 | my $xlate; | |
14 | # Look for arm-xlate.pl beside this script first, then in ../../perlasm. | |
15 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir=$1; | |
16 | ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or | |
17 | ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or | |
18 | die "can't locate arm-xlate.pl"; | |
19 | # Pipe everything printed below through arm-xlate.pl; stop at once if the pipe cannot be started. | |
20 | open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!"; | |
21 | *STDOUT=*OUT; | |
22 | # Emit the text stored after __END__ unchanged; data() returns it as one string. | |
23 | my $code = data(); | |
24 | print $code; | |
25 | ||
26 | close STDOUT or die "error closing STDOUT: $!"; # enforce flush | |
27 | ||
28 | sub data # Hands back whatever remains unread on the DATA handle. | |
29 | { | |
30 | local $/ = undef; # no record separator inside this sub, so one read takes the rest of DATA | |
31 | return readline(*DATA); | |
32 | } | |
33 | ||
34 | __END__ | |
da1c088f | 35 | // Copyright 2021-2023 The OpenSSL Project Authors. All Rights Reserved. |
82551af5 BA |
36 | // |
37 | // Licensed under the OpenSSL license (the "License"). You may not use | |
38 | // this file except in compliance with the License. You can obtain a copy | |
39 | // in the file LICENSE in the source distribution or at | |
40 | // https://www.openssl.org/source/license.html | |
41 | // | |
42 | // ==================================================================== | |
43 | // Written by Ben Avison <bavison@riscosopen.org> for the OpenSSL | |
44 | // project. Rights for redistribution and usage in source and binary | |
45 | // forms are granted according to the OpenSSL license. | |
46 | // ==================================================================== | |
47 | // | |
48 | // This implementation is a translation of bsaes-armv7 for AArch64. | |
49 | // No attempt has been made to carry across the build switches for | |
50 | // kernel targets, since the Linux kernel crypto support has moved on | |
51 | // from when it was based on OpenSSL. | |
52 | ||
53 | // A lot of hand-scheduling has been performed. Consequently, this code | |
54 | // doesn't factor out neatly into macros in the same way that the | |
55 | // AArch32 version did. There is little to be gained by wrapping it | |
56 | // up in Perl, so it is presented as pure assembly. | |
57 | ||
58 | ||
59 | #include "crypto/arm_arch.h" | |
60 | ||
61 | .text | |
62 | ||
a35c3a9f TC |
63 | .extern AES_cbc_encrypt |
64 | .extern AES_encrypt | |
65 | .extern AES_decrypt | |
66 | ||
82551af5 BA |
67 | .type _bsaes_decrypt8,%function |
68 | .align 4 | |
69 | // Decryption core working on the eight 128-bit vectors v0-v7 at once. On entry: | |
70 | // x9 -> key (previously expanded using _bsaes_key_convert); read as one 16-byte round 0 key, then 128 bytes per pass of .Ldec_loop, then one final 16-byte key at .Ldec_done | |
71 | // x10 = number of rounds (counted down; the flags set by "subs" at .Ldec_sbox are what the later bcc/bne branches test) | |
72 | // v0-v7 input data | |
73 | // On exit: | |
74 | // x9-x11 corrupted (x9 is advanced through the key, x11 walks the .LM0ISR/.LISR/.LISRM0 table) | |
75 | // other general-purpose registers preserved | |
76 | // v0-v7 output data | |
77 | // v11-v15 preserved (never written in this routine) | |
78 | // other SIMD registers corrupted (v8-v10 and v16-v31 are used as scratch) | |
79 | _bsaes_decrypt8: | |
80 | ldr q8, [x9], #16 | |
81 | adr x11, .LM0ISR | |
82 | movi v9.16b, #0x55 | |
83 | ldr q10, [x11], #16 | |
84 | movi v16.16b, #0x33 | |
85 | movi v17.16b, #0x0f | |
86 | sub x10, x10, #1 | |
87 | eor v0.16b, v0.16b, v8.16b | |
88 | eor v1.16b, v1.16b, v8.16b | |
89 | eor v2.16b, v2.16b, v8.16b | |
90 | eor v4.16b, v4.16b, v8.16b | |
91 | eor v3.16b, v3.16b, v8.16b | |
92 | eor v5.16b, v5.16b, v8.16b | |
93 | tbl v0.16b, {v0.16b}, v10.16b | |
94 | tbl v1.16b, {v1.16b}, v10.16b | |
95 | tbl v2.16b, {v2.16b}, v10.16b | |
96 | tbl v4.16b, {v4.16b}, v10.16b | |
97 | eor v6.16b, v6.16b, v8.16b | |
98 | eor v7.16b, v7.16b, v8.16b | |
99 | tbl v3.16b, {v3.16b}, v10.16b | |
100 | tbl v5.16b, {v5.16b}, v10.16b | |
101 | tbl v6.16b, {v6.16b}, v10.16b | |
102 | ushr v8.2d, v0.2d, #1 | |
103 | tbl v7.16b, {v7.16b}, v10.16b | |
104 | ushr v10.2d, v4.2d, #1 | |
105 | ushr v18.2d, v2.2d, #1 | |
106 | eor v8.16b, v8.16b, v1.16b | |
107 | ushr v19.2d, v6.2d, #1 | |
108 | eor v10.16b, v10.16b, v5.16b | |
109 | eor v18.16b, v18.16b, v3.16b | |
110 | and v8.16b, v8.16b, v9.16b | |
111 | eor v19.16b, v19.16b, v7.16b | |
112 | and v10.16b, v10.16b, v9.16b | |
113 | and v18.16b, v18.16b, v9.16b | |
114 | eor v1.16b, v1.16b, v8.16b | |
115 | shl v8.2d, v8.2d, #1 | |
116 | and v9.16b, v19.16b, v9.16b | |
117 | eor v5.16b, v5.16b, v10.16b | |
118 | shl v10.2d, v10.2d, #1 | |
119 | eor v3.16b, v3.16b, v18.16b | |
120 | shl v18.2d, v18.2d, #1 | |
121 | eor v0.16b, v0.16b, v8.16b | |
122 | shl v8.2d, v9.2d, #1 | |
123 | eor v7.16b, v7.16b, v9.16b | |
124 | eor v4.16b, v4.16b, v10.16b | |
125 | eor v2.16b, v2.16b, v18.16b | |
126 | ushr v9.2d, v1.2d, #2 | |
127 | eor v6.16b, v6.16b, v8.16b | |
128 | ushr v8.2d, v0.2d, #2 | |
129 | ushr v10.2d, v5.2d, #2 | |
130 | ushr v18.2d, v4.2d, #2 | |
131 | eor v9.16b, v9.16b, v3.16b | |
132 | eor v8.16b, v8.16b, v2.16b | |
133 | eor v10.16b, v10.16b, v7.16b | |
134 | eor v18.16b, v18.16b, v6.16b | |
135 | and v9.16b, v9.16b, v16.16b | |
136 | and v8.16b, v8.16b, v16.16b | |
137 | and v10.16b, v10.16b, v16.16b | |
138 | and v16.16b, v18.16b, v16.16b | |
139 | eor v3.16b, v3.16b, v9.16b | |
140 | shl v9.2d, v9.2d, #2 | |
141 | eor v2.16b, v2.16b, v8.16b | |
142 | shl v8.2d, v8.2d, #2 | |
143 | eor v7.16b, v7.16b, v10.16b | |
144 | shl v10.2d, v10.2d, #2 | |
145 | eor v6.16b, v6.16b, v16.16b | |
146 | shl v16.2d, v16.2d, #2 | |
147 | eor v1.16b, v1.16b, v9.16b | |
148 | eor v0.16b, v0.16b, v8.16b | |
149 | eor v5.16b, v5.16b, v10.16b | |
150 | eor v4.16b, v4.16b, v16.16b | |
151 | ushr v8.2d, v3.2d, #4 | |
152 | ushr v9.2d, v2.2d, #4 | |
153 | ushr v10.2d, v1.2d, #4 | |
154 | ushr v16.2d, v0.2d, #4 | |
155 | eor v8.16b, v8.16b, v7.16b | |
156 | eor v9.16b, v9.16b, v6.16b | |
157 | eor v10.16b, v10.16b, v5.16b | |
158 | eor v16.16b, v16.16b, v4.16b | |
159 | and v8.16b, v8.16b, v17.16b | |
160 | and v9.16b, v9.16b, v17.16b | |
161 | and v10.16b, v10.16b, v17.16b | |
162 | and v16.16b, v16.16b, v17.16b | |
163 | eor v7.16b, v7.16b, v8.16b | |
164 | shl v8.2d, v8.2d, #4 | |
165 | eor v6.16b, v6.16b, v9.16b | |
166 | shl v9.2d, v9.2d, #4 | |
167 | eor v5.16b, v5.16b, v10.16b | |
168 | shl v10.2d, v10.2d, #4 | |
169 | eor v4.16b, v4.16b, v16.16b | |
170 | shl v16.2d, v16.2d, #4 | |
171 | eor v3.16b, v3.16b, v8.16b | |
172 | eor v2.16b, v2.16b, v9.16b | |
173 | eor v1.16b, v1.16b, v10.16b | |
174 | eor v0.16b, v0.16b, v16.16b | |
175 | b .Ldec_sbox | |
176 | .align 4 | |
177 | .Ldec_loop: | |
178 | ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64 | |
179 | ldp q8, q9, [x9], #32 | |
180 | eor v0.16b, v16.16b, v0.16b | |
181 | ldr q10, [x9], #16 | |
182 | eor v1.16b, v17.16b, v1.16b | |
183 | ldr q16, [x9], #16 | |
184 | eor v2.16b, v18.16b, v2.16b | |
185 | eor v3.16b, v19.16b, v3.16b | |
186 | eor v4.16b, v8.16b, v4.16b | |
187 | eor v5.16b, v9.16b, v5.16b | |
188 | eor v6.16b, v10.16b, v6.16b | |
189 | eor v7.16b, v16.16b, v7.16b | |
190 | tbl v0.16b, {v0.16b}, v28.16b | |
191 | tbl v1.16b, {v1.16b}, v28.16b | |
192 | tbl v2.16b, {v2.16b}, v28.16b | |
193 | tbl v3.16b, {v3.16b}, v28.16b | |
194 | tbl v4.16b, {v4.16b}, v28.16b | |
195 | tbl v5.16b, {v5.16b}, v28.16b | |
196 | tbl v6.16b, {v6.16b}, v28.16b | |
197 | tbl v7.16b, {v7.16b}, v28.16b | |
198 | .Ldec_sbox: | |
199 | eor v1.16b, v1.16b, v4.16b | |
200 | eor v3.16b, v3.16b, v4.16b | |
201 | subs x10, x10, #1 | |
202 | eor v4.16b, v4.16b, v7.16b | |
203 | eor v2.16b, v2.16b, v7.16b | |
204 | eor v1.16b, v1.16b, v6.16b | |
205 | eor v6.16b, v6.16b, v4.16b | |
206 | eor v2.16b, v2.16b, v5.16b | |
207 | eor v0.16b, v0.16b, v1.16b | |
208 | eor v7.16b, v7.16b, v6.16b | |
209 | eor v8.16b, v6.16b, v2.16b | |
210 | and v9.16b, v4.16b, v6.16b | |
211 | eor v10.16b, v2.16b, v6.16b | |
212 | eor v3.16b, v3.16b, v0.16b | |
213 | eor v5.16b, v5.16b, v0.16b | |
214 | eor v16.16b, v7.16b, v4.16b | |
215 | eor v17.16b, v4.16b, v0.16b | |
216 | and v18.16b, v0.16b, v2.16b | |
217 | eor v19.16b, v7.16b, v4.16b | |
218 | eor v1.16b, v1.16b, v3.16b | |
219 | eor v20.16b, v3.16b, v0.16b | |
220 | eor v21.16b, v5.16b, v2.16b | |
221 | eor v22.16b, v3.16b, v7.16b | |
222 | and v8.16b, v17.16b, v8.16b | |
223 | orr v17.16b, v3.16b, v5.16b | |
224 | eor v23.16b, v1.16b, v6.16b | |
225 | eor v24.16b, v20.16b, v16.16b | |
226 | eor v25.16b, v1.16b, v5.16b | |
227 | orr v26.16b, v20.16b, v21.16b | |
228 | and v20.16b, v20.16b, v21.16b | |
229 | and v27.16b, v7.16b, v1.16b | |
230 | eor v21.16b, v21.16b, v23.16b | |
231 | orr v28.16b, v16.16b, v23.16b | |
232 | orr v29.16b, v22.16b, v25.16b | |
233 | eor v26.16b, v26.16b, v8.16b | |
234 | and v16.16b, v16.16b, v23.16b | |
235 | and v22.16b, v22.16b, v25.16b | |
236 | and v21.16b, v24.16b, v21.16b | |
237 | eor v8.16b, v28.16b, v8.16b | |
238 | eor v23.16b, v5.16b, v2.16b | |
239 | eor v24.16b, v1.16b, v6.16b | |
240 | eor v16.16b, v16.16b, v22.16b | |
241 | eor v22.16b, v3.16b, v0.16b | |
242 | eor v25.16b, v29.16b, v21.16b | |
243 | eor v21.16b, v26.16b, v21.16b | |
244 | eor v8.16b, v8.16b, v20.16b | |
245 | eor v26.16b, v23.16b, v24.16b | |
246 | eor v16.16b, v16.16b, v20.16b | |
247 | eor v28.16b, v22.16b, v19.16b | |
248 | eor v20.16b, v25.16b, v20.16b | |
249 | eor v9.16b, v21.16b, v9.16b | |
250 | eor v8.16b, v8.16b, v18.16b | |
251 | eor v18.16b, v5.16b, v1.16b | |
252 | eor v21.16b, v16.16b, v17.16b | |
253 | eor v16.16b, v16.16b, v17.16b | |
254 | eor v17.16b, v20.16b, v27.16b | |
255 | eor v20.16b, v3.16b, v7.16b | |
256 | eor v25.16b, v9.16b, v8.16b | |
257 | eor v27.16b, v0.16b, v4.16b | |
258 | and v29.16b, v9.16b, v17.16b | |
259 | eor v30.16b, v8.16b, v29.16b | |
260 | eor v31.16b, v21.16b, v29.16b | |
261 | eor v29.16b, v21.16b, v29.16b | |
262 | bsl v30.16b, v17.16b, v21.16b | |
263 | bsl v31.16b, v9.16b, v8.16b | |
264 | bsl v16.16b, v30.16b, v29.16b | |
265 | bsl v21.16b, v29.16b, v30.16b | |
266 | eor v8.16b, v31.16b, v30.16b | |
267 | and v1.16b, v1.16b, v31.16b | |
268 | and v9.16b, v16.16b, v31.16b | |
269 | and v6.16b, v6.16b, v30.16b | |
270 | eor v16.16b, v17.16b, v21.16b | |
271 | and v4.16b, v4.16b, v30.16b | |
272 | eor v17.16b, v8.16b, v30.16b | |
273 | and v21.16b, v24.16b, v8.16b | |
274 | eor v9.16b, v9.16b, v25.16b | |
275 | and v19.16b, v19.16b, v8.16b | |
276 | eor v24.16b, v30.16b, v16.16b | |
277 | eor v25.16b, v30.16b, v16.16b | |
278 | and v7.16b, v7.16b, v17.16b | |
279 | and v10.16b, v10.16b, v16.16b | |
280 | eor v29.16b, v9.16b, v16.16b | |
281 | eor v30.16b, v31.16b, v9.16b | |
282 | and v0.16b, v24.16b, v0.16b | |
283 | and v9.16b, v18.16b, v9.16b | |
284 | and v2.16b, v25.16b, v2.16b | |
285 | eor v10.16b, v10.16b, v6.16b | |
286 | eor v18.16b, v29.16b, v16.16b | |
287 | and v5.16b, v30.16b, v5.16b | |
288 | eor v24.16b, v8.16b, v29.16b | |
289 | and v25.16b, v26.16b, v29.16b | |
290 | and v26.16b, v28.16b, v29.16b | |
291 | eor v8.16b, v8.16b, v29.16b | |
292 | eor v17.16b, v17.16b, v18.16b | |
293 | eor v5.16b, v1.16b, v5.16b | |
294 | and v23.16b, v24.16b, v23.16b | |
295 | eor v21.16b, v21.16b, v25.16b | |
296 | eor v19.16b, v19.16b, v26.16b | |
297 | eor v0.16b, v4.16b, v0.16b | |
298 | and v3.16b, v17.16b, v3.16b | |
299 | eor v1.16b, v9.16b, v1.16b | |
300 | eor v9.16b, v25.16b, v23.16b | |
301 | eor v5.16b, v5.16b, v21.16b | |
302 | eor v2.16b, v6.16b, v2.16b | |
303 | and v6.16b, v8.16b, v22.16b | |
304 | eor v3.16b, v7.16b, v3.16b | |
305 | and v8.16b, v20.16b, v18.16b | |
306 | eor v10.16b, v10.16b, v9.16b | |
307 | eor v0.16b, v0.16b, v19.16b | |
308 | eor v9.16b, v1.16b, v9.16b | |
309 | eor v1.16b, v2.16b, v21.16b | |
310 | eor v3.16b, v3.16b, v19.16b | |
311 | and v16.16b, v27.16b, v16.16b | |
312 | eor v17.16b, v26.16b, v6.16b | |
313 | eor v6.16b, v8.16b, v7.16b | |
314 | eor v7.16b, v1.16b, v9.16b | |
315 | eor v1.16b, v5.16b, v3.16b | |
316 | eor v2.16b, v10.16b, v3.16b | |
317 | eor v4.16b, v16.16b, v4.16b | |
318 | eor v8.16b, v6.16b, v17.16b | |
319 | eor v5.16b, v9.16b, v3.16b | |
320 | eor v9.16b, v0.16b, v1.16b | |
321 | eor v6.16b, v7.16b, v1.16b | |
322 | eor v0.16b, v4.16b, v17.16b | |
323 | eor v4.16b, v8.16b, v7.16b | |
324 | eor v7.16b, v9.16b, v2.16b | |
325 | eor v8.16b, v3.16b, v0.16b | |
326 | eor v7.16b, v7.16b, v5.16b | |
327 | eor v3.16b, v4.16b, v7.16b | |
328 | eor v4.16b, v7.16b, v0.16b | |
329 | eor v7.16b, v8.16b, v3.16b | |
330 | bcc .Ldec_done | |
331 | ext v8.16b, v0.16b, v0.16b, #8 | |
332 | ext v9.16b, v1.16b, v1.16b, #8 | |
333 | ldr q28, [x11] // default tbl indices from .LISR (common case, x10 > 0); replaced below when x10 == 0 | |
334 | ext v10.16b, v6.16b, v6.16b, #8 | |
335 | ext v16.16b, v3.16b, v3.16b, #8 | |
336 | ext v17.16b, v5.16b, v5.16b, #8 | |
337 | ext v18.16b, v4.16b, v4.16b, #8 | |
338 | eor v8.16b, v8.16b, v0.16b | |
339 | eor v9.16b, v9.16b, v1.16b | |
340 | eor v10.16b, v10.16b, v6.16b | |
341 | eor v16.16b, v16.16b, v3.16b | |
342 | eor v17.16b, v17.16b, v5.16b | |
343 | ext v19.16b, v2.16b, v2.16b, #8 | |
344 | ext v20.16b, v7.16b, v7.16b, #8 | |
345 | eor v18.16b, v18.16b, v4.16b | |
346 | eor v6.16b, v6.16b, v8.16b | |
347 | eor v8.16b, v2.16b, v10.16b | |
348 | eor v4.16b, v4.16b, v9.16b | |
349 | eor v2.16b, v19.16b, v2.16b | |
350 | eor v9.16b, v20.16b, v7.16b | |
351 | eor v0.16b, v0.16b, v16.16b | |
352 | eor v1.16b, v1.16b, v16.16b | |
353 | eor v6.16b, v6.16b, v17.16b | |
354 | eor v8.16b, v8.16b, v16.16b | |
355 | eor v7.16b, v7.16b, v18.16b | |
356 | eor v4.16b, v4.16b, v16.16b | |
357 | eor v2.16b, v3.16b, v2.16b | |
358 | eor v1.16b, v1.16b, v17.16b | |
359 | eor v3.16b, v5.16b, v9.16b | |
360 | eor v5.16b, v8.16b, v17.16b | |
361 | eor v7.16b, v7.16b, v17.16b | |
362 | ext v8.16b, v0.16b, v0.16b, #12 | |
363 | ext v9.16b, v6.16b, v6.16b, #12 | |
364 | ext v10.16b, v4.16b, v4.16b, #12 | |
365 | ext v16.16b, v1.16b, v1.16b, #12 | |
366 | ext v17.16b, v5.16b, v5.16b, #12 | |
367 | ext v18.16b, v7.16b, v7.16b, #12 | |
368 | eor v0.16b, v0.16b, v8.16b | |
369 | eor v6.16b, v6.16b, v9.16b | |
370 | eor v4.16b, v4.16b, v10.16b | |
371 | ext v19.16b, v2.16b, v2.16b, #12 | |
372 | ext v20.16b, v3.16b, v3.16b, #12 | |
373 | eor v1.16b, v1.16b, v16.16b | |
374 | eor v5.16b, v5.16b, v17.16b | |
375 | eor v7.16b, v7.16b, v18.16b | |
376 | eor v2.16b, v2.16b, v19.16b | |
377 | eor v16.16b, v16.16b, v0.16b | |
378 | eor v3.16b, v3.16b, v20.16b | |
379 | eor v17.16b, v17.16b, v4.16b | |
380 | eor v10.16b, v10.16b, v6.16b | |
381 | ext v0.16b, v0.16b, v0.16b, #8 | |
382 | eor v9.16b, v9.16b, v1.16b | |
383 | ext v1.16b, v1.16b, v1.16b, #8 | |
384 | eor v8.16b, v8.16b, v3.16b | |
385 | eor v16.16b, v16.16b, v3.16b | |
386 | eor v18.16b, v18.16b, v5.16b | |
387 | eor v19.16b, v19.16b, v7.16b | |
388 | ext v21.16b, v5.16b, v5.16b, #8 | |
389 | ext v5.16b, v7.16b, v7.16b, #8 | |
390 | eor v7.16b, v20.16b, v2.16b | |
391 | ext v4.16b, v4.16b, v4.16b, #8 | |
392 | ext v20.16b, v3.16b, v3.16b, #8 | |
393 | eor v17.16b, v17.16b, v3.16b | |
394 | ext v2.16b, v2.16b, v2.16b, #8 | |
395 | eor v3.16b, v10.16b, v3.16b | |
396 | ext v10.16b, v6.16b, v6.16b, #8 | |
397 | eor v0.16b, v0.16b, v8.16b | |
398 | eor v1.16b, v1.16b, v16.16b | |
399 | eor v5.16b, v5.16b, v18.16b | |
400 | eor v3.16b, v3.16b, v4.16b | |
401 | eor v7.16b, v20.16b, v7.16b | |
402 | eor v6.16b, v2.16b, v19.16b | |
403 | eor v4.16b, v21.16b, v17.16b | |
404 | eor v2.16b, v10.16b, v9.16b | |
405 | bne .Ldec_loop | |
406 | ldr q28, [x11, #16]! // last round (x10 == 0): pre-indexed load moves x11 on to .LISRM0 and takes the indices from there | |
407 | b .Ldec_loop | |
408 | .align 4 | |
409 | .Ldec_done: | |
410 | ushr v8.2d, v0.2d, #1 | |
411 | movi v9.16b, #0x55 | |
412 | ldr q10, [x9] | |
413 | ushr v16.2d, v2.2d, #1 | |
414 | movi v17.16b, #0x33 | |
415 | ushr v18.2d, v6.2d, #1 | |
416 | movi v19.16b, #0x0f | |
417 | eor v8.16b, v8.16b, v1.16b | |
418 | ushr v20.2d, v3.2d, #1 | |
419 | eor v16.16b, v16.16b, v7.16b | |
420 | eor v18.16b, v18.16b, v4.16b | |
421 | and v8.16b, v8.16b, v9.16b | |
422 | eor v20.16b, v20.16b, v5.16b | |
423 | and v16.16b, v16.16b, v9.16b | |
424 | and v18.16b, v18.16b, v9.16b | |
425 | shl v21.2d, v8.2d, #1 | |
426 | eor v1.16b, v1.16b, v8.16b | |
427 | and v8.16b, v20.16b, v9.16b | |
428 | eor v7.16b, v7.16b, v16.16b | |
429 | shl v9.2d, v16.2d, #1 | |
430 | eor v4.16b, v4.16b, v18.16b | |
431 | shl v16.2d, v18.2d, #1 | |
432 | eor v0.16b, v0.16b, v21.16b | |
433 | shl v18.2d, v8.2d, #1 | |
434 | eor v5.16b, v5.16b, v8.16b | |
435 | eor v2.16b, v2.16b, v9.16b | |
436 | eor v6.16b, v6.16b, v16.16b | |
437 | ushr v8.2d, v1.2d, #2 | |
438 | eor v3.16b, v3.16b, v18.16b | |
439 | ushr v9.2d, v0.2d, #2 | |
440 | ushr v16.2d, v7.2d, #2 | |
441 | ushr v18.2d, v2.2d, #2 | |
442 | eor v8.16b, v8.16b, v4.16b | |
443 | eor v9.16b, v9.16b, v6.16b | |
444 | eor v16.16b, v16.16b, v5.16b | |
445 | eor v18.16b, v18.16b, v3.16b | |
446 | and v8.16b, v8.16b, v17.16b | |
447 | and v9.16b, v9.16b, v17.16b | |
448 | and v16.16b, v16.16b, v17.16b | |
449 | and v17.16b, v18.16b, v17.16b | |
450 | eor v4.16b, v4.16b, v8.16b | |
451 | shl v8.2d, v8.2d, #2 | |
452 | eor v6.16b, v6.16b, v9.16b | |
453 | shl v9.2d, v9.2d, #2 | |
454 | eor v5.16b, v5.16b, v16.16b | |
455 | shl v16.2d, v16.2d, #2 | |
456 | eor v3.16b, v3.16b, v17.16b | |
457 | shl v17.2d, v17.2d, #2 | |
458 | eor v1.16b, v1.16b, v8.16b | |
459 | eor v0.16b, v0.16b, v9.16b | |
460 | eor v7.16b, v7.16b, v16.16b | |
461 | eor v2.16b, v2.16b, v17.16b | |
462 | ushr v8.2d, v4.2d, #4 | |
463 | ushr v9.2d, v6.2d, #4 | |
464 | ushr v16.2d, v1.2d, #4 | |
465 | ushr v17.2d, v0.2d, #4 | |
466 | eor v8.16b, v8.16b, v5.16b | |
467 | eor v9.16b, v9.16b, v3.16b | |
468 | eor v16.16b, v16.16b, v7.16b | |
469 | eor v17.16b, v17.16b, v2.16b | |
470 | and v8.16b, v8.16b, v19.16b | |
471 | and v9.16b, v9.16b, v19.16b | |
472 | and v16.16b, v16.16b, v19.16b | |
473 | and v17.16b, v17.16b, v19.16b | |
474 | eor v5.16b, v5.16b, v8.16b | |
475 | shl v8.2d, v8.2d, #4 | |
476 | eor v3.16b, v3.16b, v9.16b | |
477 | shl v9.2d, v9.2d, #4 | |
478 | eor v7.16b, v7.16b, v16.16b | |
479 | shl v16.2d, v16.2d, #4 | |
480 | eor v2.16b, v2.16b, v17.16b | |
481 | shl v17.2d, v17.2d, #4 | |
482 | eor v4.16b, v4.16b, v8.16b | |
483 | eor v6.16b, v6.16b, v9.16b | |
484 | eor v7.16b, v7.16b, v10.16b | |
485 | eor v1.16b, v1.16b, v16.16b | |
486 | eor v2.16b, v2.16b, v10.16b | |
487 | eor v0.16b, v0.16b, v17.16b | |
488 | eor v4.16b, v4.16b, v10.16b | |
489 | eor v6.16b, v6.16b, v10.16b | |
490 | eor v3.16b, v3.16b, v10.16b | |
491 | eor v5.16b, v5.16b, v10.16b | |
492 | eor v1.16b, v1.16b, v10.16b | |
493 | eor v0.16b, v0.16b, v10.16b | |
494 | ret | |
495 | .size _bsaes_decrypt8,.-_bsaes_decrypt8 | |
496 | ||
497 | .type _bsaes_const,%object | |
498 | .align 6 | |
499 | _bsaes_const: | |
500 | // InvShiftRows constants: 16-byte index vectors that are loaded into a q register and passed to tbl | |
501 | // Used in _bsaes_decrypt8, which assumes contiguity (it takes the address of .LM0ISR and steps x11 by 16 bytes to reach .LISR and then .LISRM0) | |
502 | // .LM0ISR used with round 0 key | |
503 | // .LISR used with middle round keys | |
504 | // .LISRM0 used with final round key | |
505 | .LM0ISR: | |
506 | .quad 0x0a0e0206070b0f03, 0x0004080c0d010509 | |
507 | .LISR: | |
508 | .quad 0x0504070602010003, 0x0f0e0d0c080b0a09 | |
509 | .LISRM0: | |
510 | .quad 0x01040b0e0205080f, 0x0306090c00070a0d | |
511 | ||
512 | // ShiftRows constants, laid out in the same three-entry order as the InvShiftRows set | |
513 | // Used in _bsaes_encrypt8, which assumes contiguity (it takes the address of .LM0SR and steps x11 by 16 bytes to reach .LSR and then .LSRM0) | |
514 | // .LM0SR used with round 0 key | |
515 | // .LSR used with middle round keys | |
516 | // .LSRM0 used with final round key | |
517 | .LM0SR: | |
518 | .quad 0x0a0e02060f03070b, 0x0004080c05090d01 | |
519 | .LSR: | |
520 | .quad 0x0504070600030201, 0x0f0e0d0c0a09080b | |
521 | .LSRM0: | |
522 | .quad 0x0304090e00050a0f, 0x01060b0c0207080d | |
523 | ||
524 | .LM0_bigendian: | |
525 | .quad 0x02060a0e03070b0f, 0x0004080c0105090d | |
526 | .LM0_littleendian: | |
527 | .quad 0x0105090d0004080c, 0x03070b0f02060a0e | |
528 | ||
529 | // Used in ossl_bsaes_ctr32_encrypt_blocks, prior to dropping into | |
530 | // _bsaes_encrypt8_alt, for round 0 key in place of .LM0SR (that entry point sits after the v8/v9/x11 setup done by _bsaes_encrypt8, so its caller provides those) | |
531 | .LREVM0SR: | |
532 | .quad 0x090d01050c000408, 0x03070b0f060a0e02 | |
533 | ||
534 | .align 6 | |
535 | .size _bsaes_const,.-_bsaes_const | |
536 | ||
537 | .type _bsaes_encrypt8,%function | |
538 | .align 4 | |
539 | // On entry: | |
540 | // x9 -> key (previously expanded using _bsaes_key_convert) | |
541 | // x10 = number of rounds | |
542 | // v0-v7 input data | |
543 | // On exit: | |
544 | // x9-x11 corrupted | |
545 | // other general-purpose registers preserved | |
546 | // v0-v7 output data | |
547 | // v11-v15 preserved | |
548 | // other SIMD registers corrupted | |
549 | _bsaes_encrypt8: | |
550 | ldr q8, [x9], #16 | |
551 | adr x11, .LM0SR | |
552 | ldr q9, [x11], #16 | |
553 | _bsaes_encrypt8_alt: | |
554 | eor v0.16b, v0.16b, v8.16b | |
555 | eor v1.16b, v1.16b, v8.16b | |
556 | sub x10, x10, #1 | |
557 | eor v2.16b, v2.16b, v8.16b | |
558 | eor v4.16b, v4.16b, v8.16b | |
559 | eor v3.16b, v3.16b, v8.16b | |
560 | eor v5.16b, v5.16b, v8.16b | |
561 | tbl v0.16b, {v0.16b}, v9.16b | |
562 | tbl v1.16b, {v1.16b}, v9.16b | |
563 | tbl v2.16b, {v2.16b}, v9.16b | |
564 | tbl v4.16b, {v4.16b}, v9.16b | |
565 | eor v6.16b, v6.16b, v8.16b | |
566 | eor v7.16b, v7.16b, v8.16b | |
567 | tbl v3.16b, {v3.16b}, v9.16b | |
568 | tbl v5.16b, {v5.16b}, v9.16b | |
569 | tbl v6.16b, {v6.16b}, v9.16b | |
570 | ushr v8.2d, v0.2d, #1 | |
571 | movi v10.16b, #0x55 | |
572 | tbl v7.16b, {v7.16b}, v9.16b | |
573 | ushr v9.2d, v4.2d, #1 | |
574 | movi v16.16b, #0x33 | |
575 | ushr v17.2d, v2.2d, #1 | |
576 | eor v8.16b, v8.16b, v1.16b | |
577 | movi v18.16b, #0x0f | |
578 | ushr v19.2d, v6.2d, #1 | |
579 | eor v9.16b, v9.16b, v5.16b | |
580 | eor v17.16b, v17.16b, v3.16b | |
581 | and v8.16b, v8.16b, v10.16b | |
582 | eor v19.16b, v19.16b, v7.16b | |
583 | and v9.16b, v9.16b, v10.16b | |
584 | and v17.16b, v17.16b, v10.16b | |
585 | eor v1.16b, v1.16b, v8.16b | |
586 | shl v8.2d, v8.2d, #1 | |
587 | and v10.16b, v19.16b, v10.16b | |
588 | eor v5.16b, v5.16b, v9.16b | |
589 | shl v9.2d, v9.2d, #1 | |
590 | eor v3.16b, v3.16b, v17.16b | |
591 | shl v17.2d, v17.2d, #1 | |
592 | eor v0.16b, v0.16b, v8.16b | |
593 | shl v8.2d, v10.2d, #1 | |
594 | eor v7.16b, v7.16b, v10.16b | |
595 | eor v4.16b, v4.16b, v9.16b | |
596 | eor v2.16b, v2.16b, v17.16b | |
597 | ushr v9.2d, v1.2d, #2 | |
598 | eor v6.16b, v6.16b, v8.16b | |
599 | ushr v8.2d, v0.2d, #2 | |
600 | ushr v10.2d, v5.2d, #2 | |
601 | ushr v17.2d, v4.2d, #2 | |
602 | eor v9.16b, v9.16b, v3.16b | |
603 | eor v8.16b, v8.16b, v2.16b | |
604 | eor v10.16b, v10.16b, v7.16b | |
605 | eor v17.16b, v17.16b, v6.16b | |
606 | and v9.16b, v9.16b, v16.16b | |
607 | and v8.16b, v8.16b, v16.16b | |
608 | and v10.16b, v10.16b, v16.16b | |
609 | and v16.16b, v17.16b, v16.16b | |
610 | eor v3.16b, v3.16b, v9.16b | |
611 | shl v9.2d, v9.2d, #2 | |
612 | eor v2.16b, v2.16b, v8.16b | |
613 | shl v8.2d, v8.2d, #2 | |
614 | eor v7.16b, v7.16b, v10.16b | |
615 | shl v10.2d, v10.2d, #2 | |
616 | eor v6.16b, v6.16b, v16.16b | |
617 | shl v16.2d, v16.2d, #2 | |
618 | eor v1.16b, v1.16b, v9.16b | |
619 | eor v0.16b, v0.16b, v8.16b | |
620 | eor v5.16b, v5.16b, v10.16b | |
621 | eor v4.16b, v4.16b, v16.16b | |
622 | ushr v8.2d, v3.2d, #4 | |
623 | ushr v9.2d, v2.2d, #4 | |
624 | ushr v10.2d, v1.2d, #4 | |
625 | ushr v16.2d, v0.2d, #4 | |
626 | eor v8.16b, v8.16b, v7.16b | |
627 | eor v9.16b, v9.16b, v6.16b | |
628 | eor v10.16b, v10.16b, v5.16b | |
629 | eor v16.16b, v16.16b, v4.16b | |
630 | and v8.16b, v8.16b, v18.16b | |
631 | and v9.16b, v9.16b, v18.16b | |
632 | and v10.16b, v10.16b, v18.16b | |
633 | and v16.16b, v16.16b, v18.16b | |
634 | eor v7.16b, v7.16b, v8.16b | |
635 | shl v8.2d, v8.2d, #4 | |
636 | eor v6.16b, v6.16b, v9.16b | |
637 | shl v9.2d, v9.2d, #4 | |
638 | eor v5.16b, v5.16b, v10.16b | |
639 | shl v10.2d, v10.2d, #4 | |
640 | eor v4.16b, v4.16b, v16.16b | |
641 | shl v16.2d, v16.2d, #4 | |
642 | eor v3.16b, v3.16b, v8.16b | |
643 | eor v2.16b, v2.16b, v9.16b | |
644 | eor v1.16b, v1.16b, v10.16b | |
645 | eor v0.16b, v0.16b, v16.16b | |
646 | b .Lenc_sbox | |
647 | .align 4 | |
648 | .Lenc_loop: | |
649 | ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64 | |
650 | ldp q8, q9, [x9], #32 | |
651 | eor v0.16b, v16.16b, v0.16b | |
652 | ldr q10, [x9], #16 | |
653 | eor v1.16b, v17.16b, v1.16b | |
654 | ldr q16, [x9], #16 | |
655 | eor v2.16b, v18.16b, v2.16b | |
656 | eor v3.16b, v19.16b, v3.16b | |
657 | eor v4.16b, v8.16b, v4.16b | |
658 | eor v5.16b, v9.16b, v5.16b | |
659 | eor v6.16b, v10.16b, v6.16b | |
660 | eor v7.16b, v16.16b, v7.16b | |
661 | tbl v0.16b, {v0.16b}, v28.16b | |
662 | tbl v1.16b, {v1.16b}, v28.16b | |
663 | tbl v2.16b, {v2.16b}, v28.16b | |
664 | tbl v3.16b, {v3.16b}, v28.16b | |
665 | tbl v4.16b, {v4.16b}, v28.16b | |
666 | tbl v5.16b, {v5.16b}, v28.16b | |
667 | tbl v6.16b, {v6.16b}, v28.16b | |
668 | tbl v7.16b, {v7.16b}, v28.16b | |
669 | .Lenc_sbox: | |
670 | eor v5.16b, v5.16b, v6.16b | |
671 | eor v3.16b, v3.16b, v0.16b | |
672 | subs x10, x10, #1 | |
673 | eor v2.16b, v2.16b, v1.16b | |
674 | eor v5.16b, v5.16b, v0.16b | |
675 | eor v8.16b, v3.16b, v7.16b | |
676 | eor v6.16b, v6.16b, v2.16b | |
677 | eor v7.16b, v7.16b, v5.16b | |
678 | eor v8.16b, v8.16b, v4.16b | |
679 | eor v3.16b, v6.16b, v3.16b | |
680 | eor v4.16b, v4.16b, v5.16b | |
681 | eor v6.16b, v1.16b, v5.16b | |
682 | eor v2.16b, v2.16b, v7.16b | |
683 | eor v1.16b, v8.16b, v1.16b | |
684 | eor v8.16b, v7.16b, v4.16b | |
685 | eor v9.16b, v3.16b, v0.16b | |
686 | eor v10.16b, v7.16b, v6.16b | |
687 | eor v16.16b, v5.16b, v3.16b | |
688 | eor v17.16b, v6.16b, v2.16b | |
689 | eor v18.16b, v5.16b, v1.16b | |
690 | eor v19.16b, v2.16b, v4.16b | |
691 | eor v20.16b, v1.16b, v0.16b | |
692 | orr v21.16b, v8.16b, v9.16b | |
693 | orr v22.16b, v10.16b, v16.16b | |
694 | eor v23.16b, v8.16b, v17.16b | |
695 | eor v24.16b, v9.16b, v18.16b | |
696 | and v19.16b, v19.16b, v20.16b | |
697 | orr v20.16b, v17.16b, v18.16b | |
698 | and v8.16b, v8.16b, v9.16b | |
699 | and v9.16b, v17.16b, v18.16b | |
700 | and v17.16b, v23.16b, v24.16b | |
701 | and v10.16b, v10.16b, v16.16b | |
702 | eor v16.16b, v21.16b, v19.16b | |
703 | eor v18.16b, v20.16b, v19.16b | |
704 | and v19.16b, v2.16b, v1.16b | |
705 | and v20.16b, v6.16b, v5.16b | |
706 | eor v21.16b, v22.16b, v17.16b | |
707 | eor v9.16b, v9.16b, v10.16b | |
708 | eor v10.16b, v16.16b, v17.16b | |
709 | eor v16.16b, v18.16b, v8.16b | |
710 | and v17.16b, v4.16b, v0.16b | |
711 | orr v18.16b, v7.16b, v3.16b | |
712 | eor v21.16b, v21.16b, v8.16b | |
713 | eor v8.16b, v9.16b, v8.16b | |
714 | eor v9.16b, v10.16b, v19.16b | |
715 | eor v10.16b, v3.16b, v0.16b | |
716 | eor v16.16b, v16.16b, v17.16b | |
717 | eor v17.16b, v5.16b, v1.16b | |
718 | eor v19.16b, v21.16b, v20.16b | |
719 | eor v20.16b, v8.16b, v18.16b | |
720 | eor v8.16b, v8.16b, v18.16b | |
721 | eor v18.16b, v7.16b, v4.16b | |
722 | eor v21.16b, v9.16b, v16.16b | |
723 | eor v22.16b, v6.16b, v2.16b | |
724 | and v23.16b, v9.16b, v19.16b | |
725 | eor v24.16b, v10.16b, v17.16b | |
726 | eor v25.16b, v0.16b, v1.16b | |
727 | eor v26.16b, v7.16b, v6.16b | |
728 | eor v27.16b, v18.16b, v22.16b | |
729 | eor v28.16b, v3.16b, v5.16b | |
730 | eor v29.16b, v16.16b, v23.16b | |
731 | eor v30.16b, v20.16b, v23.16b | |
732 | eor v23.16b, v20.16b, v23.16b | |
733 | eor v31.16b, v4.16b, v2.16b | |
734 | bsl v29.16b, v19.16b, v20.16b | |
735 | bsl v30.16b, v9.16b, v16.16b | |
736 | bsl v8.16b, v29.16b, v23.16b | |
737 | bsl v20.16b, v23.16b, v29.16b | |
738 | eor v9.16b, v30.16b, v29.16b | |
739 | and v5.16b, v5.16b, v30.16b | |
740 | and v8.16b, v8.16b, v30.16b | |
741 | and v1.16b, v1.16b, v29.16b | |
742 | eor v16.16b, v19.16b, v20.16b | |
743 | and v2.16b, v2.16b, v29.16b | |
744 | eor v19.16b, v9.16b, v29.16b | |
745 | and v17.16b, v17.16b, v9.16b | |
746 | eor v8.16b, v8.16b, v21.16b | |
747 | and v20.16b, v22.16b, v9.16b | |
748 | eor v21.16b, v29.16b, v16.16b | |
749 | eor v22.16b, v29.16b, v16.16b | |
750 | and v23.16b, v25.16b, v16.16b | |
751 | and v6.16b, v6.16b, v19.16b | |
752 | eor v25.16b, v8.16b, v16.16b | |
753 | eor v29.16b, v30.16b, v8.16b | |
754 | and v4.16b, v21.16b, v4.16b | |
755 | and v8.16b, v28.16b, v8.16b | |
756 | and v0.16b, v22.16b, v0.16b | |
757 | eor v21.16b, v23.16b, v1.16b | |
758 | eor v22.16b, v9.16b, v25.16b | |
759 | eor v9.16b, v9.16b, v25.16b | |
760 | eor v23.16b, v25.16b, v16.16b | |
761 | and v3.16b, v29.16b, v3.16b | |
762 | and v24.16b, v24.16b, v25.16b | |
763 | and v25.16b, v27.16b, v25.16b | |
764 | and v10.16b, v22.16b, v10.16b | |
765 | and v9.16b, v9.16b, v18.16b | |
766 | eor v18.16b, v19.16b, v23.16b | |
767 | and v19.16b, v26.16b, v23.16b | |
768 | eor v3.16b, v5.16b, v3.16b | |
769 | eor v17.16b, v17.16b, v24.16b | |
770 | eor v10.16b, v24.16b, v10.16b | |
771 | and v16.16b, v31.16b, v16.16b | |
772 | eor v20.16b, v20.16b, v25.16b | |
773 | eor v9.16b, v25.16b, v9.16b | |
774 | eor v4.16b, v2.16b, v4.16b | |
775 | and v7.16b, v18.16b, v7.16b | |
776 | eor v18.16b, v19.16b, v6.16b | |
777 | eor v5.16b, v8.16b, v5.16b | |
778 | eor v0.16b, v1.16b, v0.16b | |
779 | eor v1.16b, v21.16b, v10.16b | |
780 | eor v8.16b, v3.16b, v17.16b | |
781 | eor v2.16b, v16.16b, v2.16b | |
782 | eor v3.16b, v6.16b, v7.16b | |
783 | eor v6.16b, v18.16b, v9.16b | |
784 | eor v4.16b, v4.16b, v20.16b | |
785 | eor v10.16b, v5.16b, v10.16b | |
786 | eor v0.16b, v0.16b, v17.16b | |
787 | eor v9.16b, v2.16b, v9.16b | |
788 | eor v3.16b, v3.16b, v20.16b | |
789 | eor v7.16b, v6.16b, v1.16b | |
790 | eor v5.16b, v8.16b, v4.16b | |
791 | eor v6.16b, v10.16b, v1.16b | |
792 | eor v2.16b, v4.16b, v0.16b | |
793 | eor v4.16b, v3.16b, v10.16b | |
794 | eor v9.16b, v9.16b, v7.16b | |
795 | eor v3.16b, v0.16b, v5.16b | |
796 | eor v0.16b, v1.16b, v4.16b | |
797 | eor v1.16b, v4.16b, v8.16b | |
798 | eor v4.16b, v9.16b, v5.16b | |
799 | eor v6.16b, v6.16b, v3.16b | |
800 | bcc .Lenc_done | |
801 | ext v8.16b, v0.16b, v0.16b, #12 | |
802 | ext v9.16b, v4.16b, v4.16b, #12 | |
803 | ldr q28, [x11] | |
804 | ext v10.16b, v6.16b, v6.16b, #12 | |
805 | ext v16.16b, v1.16b, v1.16b, #12 | |
806 | ext v17.16b, v3.16b, v3.16b, #12 | |
807 | ext v18.16b, v7.16b, v7.16b, #12 | |
808 | eor v0.16b, v0.16b, v8.16b | |
809 | eor v4.16b, v4.16b, v9.16b | |
810 | eor v6.16b, v6.16b, v10.16b | |
811 | ext v19.16b, v2.16b, v2.16b, #12 | |
812 | ext v20.16b, v5.16b, v5.16b, #12 | |
813 | eor v1.16b, v1.16b, v16.16b | |
814 | eor v3.16b, v3.16b, v17.16b | |
815 | eor v7.16b, v7.16b, v18.16b | |
816 | eor v2.16b, v2.16b, v19.16b | |
817 | eor v16.16b, v16.16b, v0.16b | |
818 | eor v5.16b, v5.16b, v20.16b | |
819 | eor v17.16b, v17.16b, v6.16b | |
820 | eor v10.16b, v10.16b, v4.16b | |
821 | ext v0.16b, v0.16b, v0.16b, #8 | |
822 | eor v9.16b, v9.16b, v1.16b | |
823 | ext v1.16b, v1.16b, v1.16b, #8 | |
824 | eor v8.16b, v8.16b, v5.16b | |
825 | eor v16.16b, v16.16b, v5.16b | |
826 | eor v18.16b, v18.16b, v3.16b | |
827 | eor v19.16b, v19.16b, v7.16b | |
828 | ext v3.16b, v3.16b, v3.16b, #8 | |
829 | ext v7.16b, v7.16b, v7.16b, #8 | |
830 | eor v20.16b, v20.16b, v2.16b | |
831 | ext v6.16b, v6.16b, v6.16b, #8 | |
832 | ext v21.16b, v5.16b, v5.16b, #8 | |
833 | eor v17.16b, v17.16b, v5.16b | |
834 | ext v2.16b, v2.16b, v2.16b, #8 | |
835 | eor v10.16b, v10.16b, v5.16b | |
836 | ext v22.16b, v4.16b, v4.16b, #8 | |
837 | eor v0.16b, v0.16b, v8.16b | |
838 | eor v1.16b, v1.16b, v16.16b | |
839 | eor v5.16b, v7.16b, v18.16b | |
840 | eor v4.16b, v3.16b, v17.16b | |
841 | eor v3.16b, v6.16b, v10.16b | |
842 | eor v7.16b, v21.16b, v20.16b | |
843 | eor v6.16b, v2.16b, v19.16b | |
844 | eor v2.16b, v22.16b, v9.16b | |
845 | bne .Lenc_loop | |
846 | ldr q28, [x11, #16]! // load from .LSRM0 on last round (x10 == 0) | |
847 | b .Lenc_loop | |
848 | .align 4 | |
849 | .Lenc_done: | |
850 | ushr v8.2d, v0.2d, #1 | |
851 | movi v9.16b, #0x55 | |
852 | ldr q10, [x9] | |
853 | ushr v16.2d, v3.2d, #1 | |
854 | movi v17.16b, #0x33 | |
855 | ushr v18.2d, v4.2d, #1 | |
856 | movi v19.16b, #0x0f | |
857 | eor v8.16b, v8.16b, v1.16b | |
858 | ushr v20.2d, v2.2d, #1 | |
859 | eor v16.16b, v16.16b, v7.16b | |
860 | eor v18.16b, v18.16b, v6.16b | |
861 | and v8.16b, v8.16b, v9.16b | |
862 | eor v20.16b, v20.16b, v5.16b | |
863 | and v16.16b, v16.16b, v9.16b | |
864 | and v18.16b, v18.16b, v9.16b | |
865 | shl v21.2d, v8.2d, #1 | |
866 | eor v1.16b, v1.16b, v8.16b | |
867 | and v8.16b, v20.16b, v9.16b | |
868 | eor v7.16b, v7.16b, v16.16b | |
869 | shl v9.2d, v16.2d, #1 | |
870 | eor v6.16b, v6.16b, v18.16b | |
871 | shl v16.2d, v18.2d, #1 | |
872 | eor v0.16b, v0.16b, v21.16b | |
873 | shl v18.2d, v8.2d, #1 | |
874 | eor v5.16b, v5.16b, v8.16b | |
875 | eor v3.16b, v3.16b, v9.16b | |
876 | eor v4.16b, v4.16b, v16.16b | |
877 | ushr v8.2d, v1.2d, #2 | |
878 | eor v2.16b, v2.16b, v18.16b | |
879 | ushr v9.2d, v0.2d, #2 | |
880 | ushr v16.2d, v7.2d, #2 | |
881 | ushr v18.2d, v3.2d, #2 | |
882 | eor v8.16b, v8.16b, v6.16b | |
883 | eor v9.16b, v9.16b, v4.16b | |
884 | eor v16.16b, v16.16b, v5.16b | |
885 | eor v18.16b, v18.16b, v2.16b | |
886 | and v8.16b, v8.16b, v17.16b | |
887 | and v9.16b, v9.16b, v17.16b | |
888 | and v16.16b, v16.16b, v17.16b | |
889 | and v17.16b, v18.16b, v17.16b | |
890 | eor v6.16b, v6.16b, v8.16b | |
891 | shl v8.2d, v8.2d, #2 | |
892 | eor v4.16b, v4.16b, v9.16b | |
893 | shl v9.2d, v9.2d, #2 | |
894 | eor v5.16b, v5.16b, v16.16b | |
895 | shl v16.2d, v16.2d, #2 | |
896 | eor v2.16b, v2.16b, v17.16b | |
897 | shl v17.2d, v17.2d, #2 | |
898 | eor v1.16b, v1.16b, v8.16b | |
899 | eor v0.16b, v0.16b, v9.16b | |
900 | eor v7.16b, v7.16b, v16.16b | |
901 | eor v3.16b, v3.16b, v17.16b | |
902 | ushr v8.2d, v6.2d, #4 | |
903 | ushr v9.2d, v4.2d, #4 | |
904 | ushr v16.2d, v1.2d, #4 | |
905 | ushr v17.2d, v0.2d, #4 | |
906 | eor v8.16b, v8.16b, v5.16b | |
907 | eor v9.16b, v9.16b, v2.16b | |
908 | eor v16.16b, v16.16b, v7.16b | |
909 | eor v17.16b, v17.16b, v3.16b | |
910 | and v8.16b, v8.16b, v19.16b | |
911 | and v9.16b, v9.16b, v19.16b | |
912 | and v16.16b, v16.16b, v19.16b | |
913 | and v17.16b, v17.16b, v19.16b | |
914 | eor v5.16b, v5.16b, v8.16b | |
915 | shl v8.2d, v8.2d, #4 | |
916 | eor v2.16b, v2.16b, v9.16b | |
917 | shl v9.2d, v9.2d, #4 | |
918 | eor v7.16b, v7.16b, v16.16b | |
919 | shl v16.2d, v16.2d, #4 | |
920 | eor v3.16b, v3.16b, v17.16b | |
921 | shl v17.2d, v17.2d, #4 | |
922 | eor v6.16b, v6.16b, v8.16b | |
923 | eor v4.16b, v4.16b, v9.16b | |
924 | eor v7.16b, v7.16b, v10.16b | |
925 | eor v1.16b, v1.16b, v16.16b | |
926 | eor v3.16b, v3.16b, v10.16b | |
927 | eor v0.16b, v0.16b, v17.16b | |
928 | eor v6.16b, v6.16b, v10.16b | |
929 | eor v4.16b, v4.16b, v10.16b | |
930 | eor v2.16b, v2.16b, v10.16b | |
931 | eor v5.16b, v5.16b, v10.16b | |
932 | eor v1.16b, v1.16b, v10.16b | |
933 | eor v0.16b, v0.16b, v10.16b | |
934 | ret | |
935 | .size _bsaes_encrypt8,.-_bsaes_encrypt8 | |
936 | ||
937 | .type _bsaes_key_convert,%function | |
938 | .align 4 | |
939 | // Convert an AES round-key array into the bit-sliced key schedule. On entry: | |
940 | // x9 -> input key (big-endian) | |
941 | // x10 = number of rounds | |
942 | // x17 -> output key (native endianness) | |
943 | // On exit: | |
944 | // x9, x10 corrupted | |
945 | // x11 -> .LM0_bigendian | |
946 | // x17 -> last quadword of output key (not written here: the caller stores the last round key there) | |
947 | // other general-purpose registers preserved | |
948 | // v2-v6 preserved | |
949 | // v7.16b[] = 0x63 | |
950 | // v8-v14 preserved | |
951 | // v15 = last round key (converted to native endianness) | |
952 | // other SIMD registers corrupted | |
953 | _bsaes_key_convert: | |
2bd5cde5 | 954 | #ifdef __AARCH64EL__ |
82551af5 BA |
955 | adr x11, .LM0_littleendian // permutation table matching the host byte order |
956 | #else | |
957 | adr x11, .LM0_bigendian | |
958 | #endif | |
959 | ldr q0, [x9], #16 // load round 0 key | |
960 | ldr q1, [x11] // .LM0: byte permutation applied to each inner round key by the tbl below | |
961 | ldr q15, [x9], #16 // load round 1 key | |
962 | ||
963 | movi v7.16b, #0x63 // compose .L63 | |
964 | movi v16.16b, #0x01 // bit masks, one per bit position of a byte | |
965 | movi v17.16b, #0x02 | |
966 | movi v18.16b, #0x04 | |
967 | movi v19.16b, #0x08 | |
968 | movi v20.16b, #0x10 | |
969 | movi v21.16b, #0x20 | |
970 | movi v22.16b, #0x40 | |
971 | movi v23.16b, #0x80 | |
972 | ||
2bd5cde5 | 973 | #ifdef __AARCH64EL__ |
82551af5 BA |
974 | rev32 v0.16b, v0.16b // byte-swap each 32-bit word of the round 0 key |
975 | #endif | |
976 | sub x10, x10, #1 // x10 = rounds - 1 = number of passes through the loop below | |
977 | str q0, [x17], #16 // save round 0 key | |
978 | ||
979 | .align 4 | |
980 | .Lkey_loop: | |
981 | tbl v0.16b, {v15.16b}, v1.16b // permute the bytes of the current round key with .LM0 | |
982 | ldr q15, [x9], #16 // load next round key | |
983 | ||
984 | eor v0.16b, v0.16b, v7.16b // XOR every byte with 0x63 | |
985 | cmtst v24.16b, v0.16b, v16.16b // each cmtst gives 0xff in byte lanes where the tested bit is set, 0x00 elsewhere | |
986 | cmtst v25.16b, v0.16b, v17.16b | |
987 | cmtst v26.16b, v0.16b, v18.16b | |
988 | cmtst v27.16b, v0.16b, v19.16b | |
989 | cmtst v28.16b, v0.16b, v20.16b | |
990 | cmtst v29.16b, v0.16b, v21.16b | |
991 | cmtst v30.16b, v0.16b, v22.16b | |
992 | cmtst v31.16b, v0.16b, v23.16b | |
993 | sub x10, x10, #1 // one more inner round key converted | |
994 | st1 {v24.16b-v27.16b}, [x17], #64 // write bit-sliced round key | |
995 | st1 {v28.16b-v31.16b}, [x17], #64 // 8 masks of 16 bytes = 128 bytes per inner round key | |
996 | cbnz x10, .Lkey_loop // repeat until x10 reaches zero | |
997 | ||
998 | // don't save last round key | |
2bd5cde5 | 999 | #ifdef __AARCH64EL__ |
82551af5 BA |
1000 | rev32 v15.16b, v15.16b // byte-swap each 32-bit word of the last round key |
1001 | adr x11, .LM0_bigendian // exit contract: x11 -> .LM0_bigendian on either byte order | |
1002 | #endif | |
1003 | ret | |
1004 | .size _bsaes_key_convert,.-_bsaes_key_convert | |
1005 | ||
1006 | .globl ossl_bsaes_cbc_encrypt | |
1007 | .type ossl_bsaes_cbc_encrypt,%function | |
1008 | .align 4 | |
1009 | // AES-CBC decryption, eight blocks per pass through the bit-sliced core (decrypt only, despite the name). On entry: | |
1010 | // x0 -> input ciphertext | |
1011 | // x1 -> output plaintext | |
1012 | // x2 = size of ciphertext and plaintext in bytes (assumed a multiple of 16) | |
1013 | // x3 -> key | |
1014 | // x4 -> 128-bit initialisation vector (or preceding 128-bit block of ciphertext if continuing after an earlier call) | |
1015 | // w5 must be == 0 | |
1016 | // On exit: | |
1017 | // Output plaintext filled in | |
1018 | // Initialisation vector overwritten with last quadword of ciphertext | |
1019 | // No output registers, usual AAPCS64 register preservation | |
1020 | ossl_bsaes_cbc_encrypt: | |
1021 | cmp x2, #128 // at least 8 blocks (128 bytes)? | |
a35c3a9f TC |
1022 | bhs .Lcbc_do_bsaes |
1023 | b AES_cbc_encrypt // no: tail-call the generic routine with the arguments untouched | |
1024 | .Lcbc_do_bsaes: | |
82551af5 BA |
1025 | |
1026 | // it is up to the caller to make sure we are called with enc == 0 | |
1027 | ||
5adddcd9 | 1028 | stp x29, x30, [sp, #-48]! |
82551af5 BA |
1029 | stp d8, d9, [sp, #16] |
1030 | stp d10, d15, [sp, #32] // 48-byte frame: x29/x30, d8, d9, d10, d15 | |
1031 | lsr x2, x2, #4 // len in 16 byte blocks | |
1032 | ||
1033 | ldr w15, [x3, #240] // get # of rounds | |
1034 | mov x14, sp // x14 = frame base, which is also the end of the key schedule for the wipe loop | |
1035 | ||
1036 | // allocate the key schedule on the stack | |
1037 | add x17, sp, #96 | |
1038 | sub x17, x17, x15, lsl #7 // 128 bytes per inner round key, less 96 bytes | |
1039 | ||
1040 | // populate the key schedule | |
1041 | mov x9, x3 // pass key | |
1042 | mov x10, x15 // pass # of rounds | |
1043 | mov sp, x17 // sp -> base of the key schedule | |
1044 | bl _bsaes_key_convert | |
1045 | ldr q6, [sp] // round 0 key as written by _bsaes_key_convert | |
1046 | str q15, [x17] // save last round key | |
1047 | eor v6.16b, v6.16b, v7.16b // fix up round 0 key (by XORing with 0x63) | |
1048 | str q6, [sp] | |
1049 | ||
1050 | ldr q15, [x4] // load IV | |
1051 | b .Lcbc_dec_loop | |
1052 | ||
1053 | .align 4 | |
1054 | .Lcbc_dec_loop: | |
1055 | subs x2, x2, #0x8 // at least 8 blocks left? | |
1056 | bmi .Lcbc_dec_loop_finish // no: handle the tail of 0 to 7 blocks | |
1057 | ||
1058 | ldr q0, [x0], #16 // load input | |
1059 | mov x9, sp // pass the key | |
1060 | ldr q1, [x0], #16 | |
1061 | mov x10, x15 // pass # of rounds | |
1062 | ldr q2, [x0], #16 | |
1063 | ldr q3, [x0], #16 | |
1064 | ldr q4, [x0], #16 | |
1065 | ldr q5, [x0], #16 | |
1066 | ldr q6, [x0], #16 | |
1067 | ldr q7, [x0], #-7*16 // eighth block; rewind x0 to the first of the eight | |
1068 | ||
1069 | bl _bsaes_decrypt8 // results are consumed below in the order v0, v1, v6, v4, v2, v7, v3, v5 | |
1070 | ||
1071 | ldr q16, [x0], #16 // reload input | |
1072 | eor v0.16b, v0.16b, v15.16b // ^= IV | |
1073 | eor v1.16b, v1.16b, v16.16b // every later block is XORed with the preceding ciphertext block | |
1074 | str q0, [x1], #16 // write output | |
1075 | ldr q0, [x0], #16 | |
1076 | str q1, [x1], #16 | |
1077 | ldr q1, [x0], #16 | |
1078 | eor v1.16b, v4.16b, v1.16b | |
1079 | ldr q4, [x0], #16 | |
1080 | eor v2.16b, v2.16b, v4.16b | |
1081 | eor v0.16b, v6.16b, v0.16b | |
1082 | ldr q4, [x0], #16 | |
1083 | str q0, [x1], #16 | |
1084 | str q1, [x1], #16 | |
1085 | eor v0.16b, v7.16b, v4.16b | |
1086 | ldr q1, [x0], #16 | |
1087 | str q2, [x1], #16 | |
1088 | ldr q2, [x0], #16 | |
1089 | ldr q15, [x0], #16 // last ciphertext block of this pass becomes the next IV | |
1090 | str q0, [x1], #16 | |
1091 | eor v0.16b, v5.16b, v2.16b | |
1092 | eor v1.16b, v3.16b, v1.16b | |
1093 | str q1, [x1], #16 | |
1094 | str q0, [x1], #16 | |
1095 | ||
1096 | b .Lcbc_dec_loop | |
1097 | ||
1098 | .Lcbc_dec_loop_finish: | |
1099 | adds x2, x2, #8 // undo the subtraction: x2 = blocks still to do (0 to 7) | |
1100 | beq .Lcbc_dec_done | |
1101 | ||
1102 | ldr q0, [x0], #16 // load input | |
1103 | cmp x2, #2 | |
1104 | blo .Lcbc_dec_one | |
1105 | ldr q1, [x0], #16 | |
1106 | mov x9, sp // pass the key | |
1107 | mov x10, x15 // pass # of rounds | |
1108 | beq .Lcbc_dec_two // flags still come from the cmp above | |
1109 | ldr q2, [x0], #16 | |
1110 | cmp x2, #4 | |
1111 | blo .Lcbc_dec_three | |
1112 | ldr q3, [x0], #16 | |
1113 | beq .Lcbc_dec_four | |
1114 | ldr q4, [x0], #16 | |
1115 | cmp x2, #6 | |
1116 | blo .Lcbc_dec_five | |
1117 | ldr q5, [x0], #16 | |
1118 | beq .Lcbc_dec_six | |
1119 | ldr q6, [x0], #-6*16 // seventh block; rewind x0 to the first remaining block | |
1120 | ||
1121 | bl _bsaes_decrypt8 // only the first seven results are used | |
1122 | ||
1123 | ldr q5, [x0], #16 // reload input | |
1124 | eor v0.16b, v0.16b, v15.16b // ^= IV | |
1125 | ldr q8, [x0], #16 | |
1126 | ldr q9, [x0], #16 | |
1127 | ldr q10, [x0], #16 | |
1128 | str q0, [x1], #16 // write output | |
1129 | ldr q0, [x0], #16 | |
1130 | eor v1.16b, v1.16b, v5.16b | |
1131 | ldr q5, [x0], #16 | |
1132 | eor v6.16b, v6.16b, v8.16b | |
1133 | ldr q15, [x0] // final ciphertext block is returned as the IV | |
1134 | eor v4.16b, v4.16b, v9.16b | |
1135 | eor v2.16b, v2.16b, v10.16b | |
1136 | str q1, [x1], #16 | |
1137 | eor v0.16b, v7.16b, v0.16b | |
1138 | str q6, [x1], #16 | |
1139 | eor v1.16b, v3.16b, v5.16b | |
1140 | str q4, [x1], #16 | |
1141 | str q2, [x1], #16 | |
1142 | str q0, [x1], #16 | |
1143 | str q1, [x1] | |
1144 | b .Lcbc_dec_done | |
1145 | .align 4 | |
1146 | .Lcbc_dec_six: | |
1147 | sub x0, x0, #0x60 // rewind x0 to the first of the six remaining blocks | |
1148 | bl _bsaes_decrypt8 | |
1149 | ldr q3, [x0], #16 // reload input | |
1150 | eor v0.16b, v0.16b, v15.16b // ^= IV | |
1151 | ldr q5, [x0], #16 | |
1152 | ldr q8, [x0], #16 | |
1153 | ldr q9, [x0], #16 | |
1154 | str q0, [x1], #16 // write output | |
1155 | ldr q0, [x0], #16 | |
1156 | eor v1.16b, v1.16b, v3.16b | |
1157 | ldr q15, [x0] // final ciphertext block is returned as the IV | |
1158 | eor v3.16b, v6.16b, v5.16b | |
1159 | eor v4.16b, v4.16b, v8.16b | |
1160 | eor v2.16b, v2.16b, v9.16b | |
1161 | str q1, [x1], #16 | |
1162 | eor v0.16b, v7.16b, v0.16b | |
1163 | str q3, [x1], #16 | |
1164 | str q4, [x1], #16 | |
1165 | str q2, [x1], #16 | |
1166 | str q0, [x1] | |
1167 | b .Lcbc_dec_done | |
1168 | .align 4 | |
1169 | .Lcbc_dec_five: | |
1170 | sub x0, x0, #0x50 // rewind x0 to the first of the five remaining blocks | |
1171 | bl _bsaes_decrypt8 | |
1172 | ldr q3, [x0], #16 // reload input | |
1173 | eor v0.16b, v0.16b, v15.16b // ^= IV | |
1174 | ldr q5, [x0], #16 | |
1175 | ldr q7, [x0], #16 | |
1176 | ldr q8, [x0], #16 | |
1177 | str q0, [x1], #16 // write output | |
1178 | ldr q15, [x0] // final ciphertext block is returned as the IV | |
1179 | eor v0.16b, v1.16b, v3.16b | |
1180 | eor v1.16b, v6.16b, v5.16b | |
1181 | eor v3.16b, v4.16b, v7.16b | |
1182 | str q0, [x1], #16 | |
1183 | eor v0.16b, v2.16b, v8.16b | |
1184 | str q1, [x1], #16 | |
1185 | str q3, [x1], #16 | |
1186 | str q0, [x1] | |
1187 | b .Lcbc_dec_done | |
1188 | .align 4 | |
1189 | .Lcbc_dec_four: | |
1190 | sub x0, x0, #0x40 // rewind x0 to the first of the four remaining blocks | |
1191 | bl _bsaes_decrypt8 | |
1192 | ldr q2, [x0], #16 // reload input | |
1193 | eor v0.16b, v0.16b, v15.16b // ^= IV | |
1194 | ldr q3, [x0], #16 | |
1195 | ldr q5, [x0], #16 | |
1196 | str q0, [x1], #16 // write output | |
1197 | ldr q15, [x0] // final ciphertext block is returned as the IV | |
1198 | eor v0.16b, v1.16b, v2.16b | |
1199 | eor v1.16b, v6.16b, v3.16b | |
1200 | eor v2.16b, v4.16b, v5.16b | |
1201 | str q0, [x1], #16 | |
1202 | str q1, [x1], #16 | |
1203 | str q2, [x1] | |
1204 | b .Lcbc_dec_done | |
1205 | .align 4 | |
1206 | .Lcbc_dec_three: | |
1207 | sub x0, x0, #0x30 // rewind x0 to the first of the three remaining blocks | |
1208 | bl _bsaes_decrypt8 | |
1209 | ldr q2, [x0], #16 // reload input | |
1210 | eor v0.16b, v0.16b, v15.16b // ^= IV | |
1211 | ldr q3, [x0], #16 | |
1212 | ldr q15, [x0] // final ciphertext block is returned as the IV | |
1213 | str q0, [x1], #16 // write output | |
1214 | eor v0.16b, v1.16b, v2.16b | |
1215 | eor v1.16b, v6.16b, v3.16b | |
1216 | str q0, [x1], #16 | |
1217 | str q1, [x1] | |
1218 | b .Lcbc_dec_done | |
1219 | .align 4 | |
1220 | .Lcbc_dec_two: | |
1221 | sub x0, x0, #0x20 // rewind x0 to the first of the two remaining blocks | |
1222 | bl _bsaes_decrypt8 | |
1223 | ldr q2, [x0], #16 // reload input | |
1224 | eor v0.16b, v0.16b, v15.16b // ^= IV | |
1225 | ldr q15, [x0] // final ciphertext block is returned as the IV | |
1226 | str q0, [x1], #16 // write output | |
1227 | eor v0.16b, v1.16b, v2.16b | |
1228 | str q0, [x1] | |
1229 | b .Lcbc_dec_done | |
1230 | .align 4 | |
1231 | .Lcbc_dec_one: | |
1232 | sub x0, x0, #0x10 // rewind x0 to the single remaining block | |
1233 | stp x1, x4, [sp, #-64]! // 64-byte scratch: x1/x4, x14, then IV and ciphertext block | |
1234 | str x14, [sp, #16] | |
1235 | stp q15, q0, [sp, #32] // keep IV and ciphertext in memory: AAPCS64 callees preserve only the low 64 bits of v8-v15 | |
1236 | mov x2, x3 // pass key | |
1237 | bl AES_decrypt | |
1238 | ldr x14, [sp, #16] | |
1239 | ldp q8, q15, [sp, #32] // v8 = IV, v15 = ciphertext block (returned as the next IV) | |
1240 | ldp x1, x4, [sp], #64 | |
1241 | ldr q0, [x1] // load result | |
1242 | eor v0.16b, v0.16b, v8.16b // ^= IV | |
1243 | str q0, [x1] // write output | |
1244 | ||
1245 | .align 4 | |
1246 | .Lcbc_dec_done: | |
1247 | movi v0.16b, #0 | |
1248 | movi v1.16b, #0 | |
1249 | .Lcbc_dec_bzero:// wipe key schedule [if any] | |
1250 | stp q0, q1, [sp], #32 // zero 32 bytes and pop them | |
1251 | cmp sp, x14 // stop once sp is back at the frame base | |
1252 | bne .Lcbc_dec_bzero | |
1253 | str q15, [x4] // return IV | |
1254 | ldp d8, d9, [sp, #16] | |
1255 | ldp d10, d15, [sp, #32] | |
5adddcd9 | 1256 | ldp x29, x30, [sp], #48 |
82551af5 BA |
1257 | ret |
1258 | .size ossl_bsaes_cbc_encrypt,.-ossl_bsaes_cbc_encrypt | |
1259 | ||
1260 | .globl ossl_bsaes_ctr32_encrypt_blocks | |
1261 | .type ossl_bsaes_ctr32_encrypt_blocks,%function | |
1262 | .align 4 | |
1263 | // AES-CTR with a 32-bit counter: XORs the input with the encrypted counter blocks. On entry: | |
1264 | // x0 -> input text (whole 16-byte blocks) | |
1265 | // x1 -> output text (whole 16-byte blocks) | |
1266 | // x2 = number of 16-byte blocks to encrypt/decrypt (> 0) | |
1267 | // x3 -> key | |
1268 | // x4 -> initial value of 128-bit counter (stored big-endian) which increments, modulo 2^32, for each block | |
1269 | // On exit: | |
1270 | // Output text filled in | |
1271 | // No output registers, usual AAPCS64 register preservation | |
1272 | ossl_bsaes_ctr32_encrypt_blocks: | |
1273 | ||
1274 | cmp x2, #8 // use plain AES for | |
1275 | blo .Lctr_enc_short // small sizes | |
1276 | ||
5adddcd9 | 1277 | stp x29, x30, [sp, #-80]! |
82551af5 BA |
1278 | stp d8, d9, [sp, #16] |
1279 | stp d10, d11, [sp, #32] | |
1280 | stp d12, d13, [sp, #48] | |
1281 | stp d14, d15, [sp, #64] // 80-byte frame: x29/x30 and d8-d15 | |
1282 | ||
1283 | ldr w15, [x3, #240] // get # of rounds | |
1284 | mov x14, sp // x14 = frame base, which is also the end of the key schedule for the wipe loop | |
1285 | ||
1286 | // allocate the key schedule on the stack | |
1287 | add x17, sp, #96 | |
1288 | sub x17, x17, x15, lsl #7 // 128 bytes per inner round key, less 96 bytes | |
1289 | ||
1290 | // populate the key schedule | |
1291 | mov x9, x3 // pass key | |
1292 | mov x10, x15 // pass # of rounds | |
1293 | mov sp, x17 // sp -> base of the key schedule | |
1294 | bl _bsaes_key_convert | |
1295 | eor v7.16b, v7.16b, v15.16b // fix up last round key | |
1296 | str q7, [x17] // save last round key | |
1297 | ||
1298 | ldr q0, [x4] // load counter | |
1299 | add x13, x11, #.LREVM0SR-.LM0_bigendian // x13 -> .LREVM0SR (x11 -> .LM0_bigendian on return from _bsaes_key_convert) | |
1300 | ldr q4, [sp] // load round0 key | |
1301 | ||
1302 | movi v8.4s, #1 // compose 1<<96 | |
1303 | movi v9.16b, #0 | |
1304 | rev32 v15.16b, v0.16b // v15 = counter with the bytes of each 32-bit word swapped | |
1305 | rev32 v0.16b, v0.16b // v0 = same value, used as the first block of the first pass | |
1306 | ext v11.16b, v9.16b, v8.16b, #4 // v11.4s = {0, 0, 0, 1}: adds 1 to the last 32-bit word only | |
1307 | rev32 v4.16b, v4.16b // byte-swap the round 0 key words in the same way | |
1308 | add v12.4s, v11.4s, v11.4s // compose 2<<96 | |
1309 | str q4, [sp] // save adjusted round0 key | |
1310 | add v13.4s, v11.4s, v12.4s // compose 3<<96 | |
1311 | add v14.4s, v12.4s, v12.4s // compose 4<<96 | |
1312 | b .Lctr_enc_loop | |
1313 | ||
1314 | .align 4 | |
1315 | .Lctr_enc_loop: | |
1316 | // Intermix prologue from _bsaes_encrypt8 to use the opportunity | |
1317 | // to flip byte order in 32-bit counter | |
1318 | ||
1319 | add v1.4s, v15.4s, v11.4s // +1 | |
1320 | add x9, sp, #0x10 // pass next round key | |
1321 | add v2.4s, v15.4s, v12.4s // +2 | |
1322 | ldr q9, [x13] // .LREVM0SR | |
1323 | ldr q8, [sp] // load round0 key | |
1324 | add v3.4s, v15.4s, v13.4s // +3 | |
1325 | mov x10, x15 // pass rounds | |
1326 | sub x11, x13, #.LREVM0SR-.LSR // pass constants | |
1327 | add v6.4s, v2.4s, v14.4s // +6 | |
1328 | add v4.4s, v15.4s, v14.4s // +4 | |
1329 | add v7.4s, v3.4s, v14.4s // +7 | |
1330 | add v15.4s, v4.4s, v14.4s // next counter | |
1331 | add v5.4s, v1.4s, v14.4s // +5 | |
1332 | ||
1333 | bl _bsaes_encrypt8_alt // results are consumed below in the order v0, v1, v4, v6, v3, v7, v2, v5 | |
1334 | ||
1335 | subs x2, x2, #8 | |
1336 | blo .Lctr_enc_loop_done // fewer than 8 blocks were left: use only part of this keystream | |
1337 | ||
1338 | ldr q16, [x0], #16 | |
1339 | ldr q17, [x0], #16 | |
1340 | eor v1.16b, v1.16b, v17.16b | |
1341 | ldr q17, [x0], #16 | |
1342 | eor v0.16b, v0.16b, v16.16b | |
1343 | eor v4.16b, v4.16b, v17.16b | |
1344 | str q0, [x1], #16 | |
1345 | ldr q16, [x0], #16 | |
1346 | str q1, [x1], #16 | |
1347 | mov v0.16b, v15.16b // first counter block of the next pass | |
1348 | str q4, [x1], #16 | |
1349 | ldr q1, [x0], #16 | |
1350 | eor v4.16b, v6.16b, v16.16b | |
1351 | eor v1.16b, v3.16b, v1.16b | |
1352 | ldr q3, [x0], #16 | |
1353 | eor v3.16b, v7.16b, v3.16b | |
1354 | ldr q6, [x0], #16 | |
1355 | eor v2.16b, v2.16b, v6.16b | |
1356 | ldr q6, [x0], #16 | |
1357 | eor v5.16b, v5.16b, v6.16b | |
1358 | str q4, [x1], #16 | |
1359 | str q1, [x1], #16 | |
1360 | str q3, [x1], #16 | |
1361 | str q2, [x1], #16 | |
1362 | str q5, [x1], #16 | |
1363 | ||
1364 | bne .Lctr_enc_loop // flags still come from the subs above: loop while blocks remain | |
1365 | b .Lctr_enc_done | |
1366 | ||
1367 | .align 4 | |
1368 | .Lctr_enc_loop_done: | |
1369 | add x2, x2, #8 // undo the subtraction: x2 = blocks still to do (1 to 7) | |
1370 | ldr q16, [x0], #16 // load input | |
1371 | eor v0.16b, v0.16b, v16.16b | |
1372 | str q0, [x1], #16 // write output | |
1373 | cmp x2, #2 | |
1374 | blo .Lctr_enc_done | |
1375 | ldr q17, [x0], #16 | |
1376 | eor v1.16b, v1.16b, v17.16b | |
1377 | str q1, [x1], #16 | |
1378 | beq .Lctr_enc_done // flags still come from the cmp above | |
1379 | ldr q18, [x0], #16 | |
1380 | eor v4.16b, v4.16b, v18.16b | |
1381 | str q4, [x1], #16 | |
1382 | cmp x2, #4 | |
1383 | blo .Lctr_enc_done | |
1384 | ldr q19, [x0], #16 | |
1385 | eor v6.16b, v6.16b, v19.16b | |
1386 | str q6, [x1], #16 | |
1387 | beq .Lctr_enc_done | |
1388 | ldr q20, [x0], #16 | |
1389 | eor v3.16b, v3.16b, v20.16b | |
1390 | str q3, [x1], #16 | |
1391 | cmp x2, #6 | |
1392 | blo .Lctr_enc_done | |
1393 | ldr q21, [x0], #16 | |
1394 | eor v7.16b, v7.16b, v21.16b | |
1395 | str q7, [x1], #16 | |
1396 | beq .Lctr_enc_done | |
1397 | ldr q22, [x0] | |
1398 | eor v2.16b, v2.16b, v22.16b | |
1399 | str q2, [x1], #16 | |
1400 | ||
1401 | .Lctr_enc_done: | |
1402 | movi v0.16b, #0 | |
1403 | movi v1.16b, #0 | |
1404 | .Lctr_enc_bzero: // wipe key schedule [if any] | |
1405 | stp q0, q1, [sp], #32 // zero 32 bytes and pop them | |
1406 | cmp sp, x14 // stop once sp is back at the frame base | |
1407 | bne .Lctr_enc_bzero | |
1408 | ||
1409 | ldp d8, d9, [sp, #16] | |
1410 | ldp d10, d11, [sp, #32] | |
1411 | ldp d12, d13, [sp, #48] | |
1412 | ldp d14, d15, [sp, #64] | |
5adddcd9 | 1413 | ldp x29, x30, [sp], #80 |
82551af5 BA |
1414 | ret |
1415 | ||
1416 | .Lctr_enc_short: | |
5adddcd9 | 1417 | stp x29, x30, [sp, #-96]! |
82551af5 BA |
1418 | stp x19, x20, [sp, #16] |
1419 | stp x21, x22, [sp, #32] | |
1420 | str x23, [sp, #48] // 96-byte frame: x29/x30, x19-x23, output scratch at +64, counter copy at +80 | |
1421 | ||
1422 | mov x19, x0 // copy arguments | |
1423 | mov x20, x1 | |
1424 | mov x21, x2 | |
1425 | mov x22, x3 | |
1426 | ldr w23, [x4, #12] // load counter .LSW | |
1427 | ldr q1, [x4] // load whole counter value | |
2bd5cde5 | 1428 | #ifdef __AARCH64EL__ |
82551af5 BA |
1429 | rev w23, w23 // counter word is stored big-endian: swap to native order for arithmetic |
1430 | #endif | |
1431 | str q1, [sp, #80] // copy counter value | |
1432 | ||
1433 | .Lctr_enc_short_loop: | |
1434 | add x0, sp, #80 // input counter value | |
1435 | add x1, sp, #64 // output on the stack | |
1436 | mov x2, x22 // key | |
1437 | ||
1438 | bl AES_encrypt | |
1439 | ||
1440 | ldr q0, [x19], #16 // load input | |
1441 | ldr q1, [sp, #64] // load encrypted counter | |
1442 | add x23, x23, #1 // only the low 32 bits are stored back, so the counter wraps modulo 2^32 | |
2bd5cde5 | 1443 | #ifdef __AARCH64EL__ |
82551af5 BA |
1444 | rev w0, w23 |
1445 | str w0, [sp, #80+12] // next counter value | |
1446 | #else | |
1447 | str w23, [sp, #80+12] // next counter value | |
1448 | #endif | |
1449 | eor v0.16b, v0.16b, v1.16b | |
1450 | str q0, [x20], #16 // store output | |
1451 | subs x21, x21, #1 | |
1452 | bne .Lctr_enc_short_loop | |
1453 | ||
1454 | movi v0.16b, #0 | |
1455 | movi v1.16b, #0 | |
1456 | stp q0, q1, [sp, #64] // zero the 32 bytes of output scratch and counter copy | |
1457 | ||
1458 | ldr x23, [sp, #48] | |
1459 | ldp x21, x22, [sp, #32] | |
1460 | ldp x19, x20, [sp, #16] | |
5adddcd9 | 1461 | ldp x29, x30, [sp], #96 |
82551af5 BA |
1462 | ret |
1463 | .size ossl_bsaes_ctr32_encrypt_blocks,.-ossl_bsaes_ctr32_encrypt_blocks | |
1464 | ||
1465 | .globl ossl_bsaes_xts_encrypt | |
1466 | .type ossl_bsaes_xts_encrypt,%function | |
1467 | .align 4 | |
1468 | // On entry: | |
1469 | // x0 -> input plaintext | |
1470 | // x1 -> output ciphertext | |
1471 | // x2 = length of text in bytes (must be at least 16) | |
1472 | // x3 -> key1 (used to encrypt the XORed plaintext blocks) | |
1473 | // x4 -> key2 (used to encrypt the initial vector to yield the initial tweak) | |
1474 | // x5 -> 16-byte initial vector (typically, sector number) | |
1475 | // On exit: | |
1476 | // Output ciphertext filled in | |
1477 | // No output registers, usual AAPCS64 register preservation | |
1478 | ossl_bsaes_xts_encrypt: | |
1479 | // Stack layout: | |
1480 | // sp -> | |
1481 | // nrounds*128-96 bytes: key schedule | |
1482 | // x19 -> | |
1483 | // 16 bytes: frame record | |
1484 | // 4*16 bytes: tweak storage across _bsaes_encrypt8 | |
1485 | // 6*8 bytes: storage for 5 callee-saved general-purpose registers | |
1486 | // 8*8 bytes: storage for 8 callee-saved SIMD registers | |
5adddcd9 | 1487 | stp x29, x30, [sp, #-192]! |
82551af5 BA |
1488 | stp x19, x20, [sp, #80] |
1489 | stp x21, x22, [sp, #96] | |
1490 | str x23, [sp, #112] | |
1491 | stp d8, d9, [sp, #128] | |
1492 | stp d10, d11, [sp, #144] | |
1493 | stp d12, d13, [sp, #160] | |
1494 | stp d14, d15, [sp, #176] | |
1495 | ||
1496 | mov x19, sp | |
1497 | mov x20, x0 | |
1498 | mov x21, x1 | |
1499 | mov x22, x2 | |
1500 | mov x23, x3 | |
1501 | ||
1502 | // generate initial tweak | |
1503 | sub sp, sp, #16 | |
1504 | mov x0, x5 // iv[] | |
1505 | mov x1, sp | |
1506 | mov x2, x4 // key2 | |
1507 | bl AES_encrypt | |
1508 | ldr q11, [sp], #16 | |
1509 | ||
1510 | ldr w1, [x23, #240] // get # of rounds | |
1511 | // allocate the key schedule on the stack | |
1512 | add x17, sp, #96 | |
1513 | sub x17, x17, x1, lsl #7 // 128 bytes per inner round key, less 96 bytes | |
1514 | ||
1515 | // populate the key schedule | |
1516 | mov x9, x23 // pass key | |
1517 | mov x10, x1 // pass # of rounds | |
1518 | mov sp, x17 | |
1519 | bl _bsaes_key_convert | |
1520 | eor v15.16b, v15.16b, v7.16b // fix up last round key | |
1521 | str q15, [x17] // save last round key | |
1522 | ||
1523 | subs x22, x22, #0x80 | |
1524 | blo .Lxts_enc_short | |
1525 | b .Lxts_enc_loop | |
1526 | ||
1527 | .align 4 | |
1528 | .Lxts_enc_loop: | |
1529 | ldr q8, .Lxts_magic | |
1530 | mov x10, x1 // pass rounds | |
1531 | add x2, x19, #16 | |
1532 | ldr q0, [x20], #16 | |
1533 | sshr v1.2d, v11.2d, #63 | |
1534 | mov x9, sp // pass key schedule | |
1535 | ldr q6, .Lxts_magic+16 | |
1536 | add v2.2d, v11.2d, v11.2d | |
1537 | cmtst v3.2d, v11.2d, v6.2d | |
1538 | and v1.16b, v1.16b, v8.16b | |
1539 | ext v1.16b, v1.16b, v1.16b, #8 | |
1540 | and v3.16b, v3.16b, v8.16b | |
1541 | ldr q4, [x20], #16 | |
1542 | eor v12.16b, v2.16b, v1.16b | |
1543 | eor v1.16b, v4.16b, v12.16b | |
1544 | eor v0.16b, v0.16b, v11.16b | |
1545 | cmtst v2.2d, v12.2d, v6.2d | |
1546 | add v4.2d, v12.2d, v12.2d | |
1547 | add x0, x19, #16 | |
1548 | ext v3.16b, v3.16b, v3.16b, #8 | |
1549 | and v2.16b, v2.16b, v8.16b | |
1550 | eor v13.16b, v4.16b, v3.16b | |
1551 | ldr q3, [x20], #16 | |
1552 | ext v4.16b, v2.16b, v2.16b, #8 | |
1553 | eor v2.16b, v3.16b, v13.16b | |
1554 | ldr q3, [x20], #16 | |
1555 | add v5.2d, v13.2d, v13.2d | |
1556 | cmtst v7.2d, v13.2d, v6.2d | |
1557 | and v7.16b, v7.16b, v8.16b | |
1558 | ldr q9, [x20], #16 | |
1559 | ext v7.16b, v7.16b, v7.16b, #8 | |
1560 | ldr q10, [x20], #16 | |
1561 | eor v14.16b, v5.16b, v4.16b | |
1562 | ldr q16, [x20], #16 | |
1563 | add v4.2d, v14.2d, v14.2d | |
1564 | eor v3.16b, v3.16b, v14.16b | |
1565 | eor v15.16b, v4.16b, v7.16b | |
1566 | add v5.2d, v15.2d, v15.2d | |
1567 | ldr q7, [x20], #16 | |
1568 | cmtst v4.2d, v14.2d, v6.2d | |
1569 | and v17.16b, v4.16b, v8.16b | |
1570 | cmtst v18.2d, v15.2d, v6.2d | |
1571 | eor v4.16b, v9.16b, v15.16b | |
1572 | ext v9.16b, v17.16b, v17.16b, #8 | |
1573 | eor v9.16b, v5.16b, v9.16b | |
1574 | add v17.2d, v9.2d, v9.2d | |
1575 | and v18.16b, v18.16b, v8.16b | |
1576 | eor v5.16b, v10.16b, v9.16b | |
1577 | str q9, [x2], #16 | |
1578 | ext v10.16b, v18.16b, v18.16b, #8 | |
1579 | cmtst v9.2d, v9.2d, v6.2d | |
1580 | and v9.16b, v9.16b, v8.16b | |
1581 | eor v10.16b, v17.16b, v10.16b | |
1582 | cmtst v17.2d, v10.2d, v6.2d | |
1583 | eor v6.16b, v16.16b, v10.16b | |
1584 | str q10, [x2], #16 | |
1585 | ext v9.16b, v9.16b, v9.16b, #8 | |
1586 | add v10.2d, v10.2d, v10.2d | |
1587 | eor v9.16b, v10.16b, v9.16b | |
1588 | str q9, [x2], #16 | |
1589 | eor v7.16b, v7.16b, v9.16b | |
1590 | add v9.2d, v9.2d, v9.2d | |
1591 | and v8.16b, v17.16b, v8.16b | |
1592 | ext v8.16b, v8.16b, v8.16b, #8 | |
1593 | eor v8.16b, v9.16b, v8.16b | |
1594 | str q8, [x2] // next round tweak | |
1595 | ||
1596 | bl _bsaes_encrypt8 | |
1597 | ||
1598 | ldr q8, [x0], #16 | |
1599 | eor v0.16b, v0.16b, v11.16b | |
1600 | eor v1.16b, v1.16b, v12.16b | |
1601 | ldr q9, [x0], #16 | |
1602 | eor v4.16b, v4.16b, v13.16b | |
1603 | eor v6.16b, v6.16b, v14.16b | |
1604 | ldr q10, [x0], #16 | |
1605 | eor v3.16b, v3.16b, v15.16b | |
1606 | subs x22, x22, #0x80 | |
1607 | str q0, [x21], #16 | |
1608 | ldr q11, [x0] // next round tweak | |
1609 | str q1, [x21], #16 | |
1610 | eor v0.16b, v7.16b, v8.16b | |
1611 | eor v1.16b, v2.16b, v9.16b | |
1612 | str q4, [x21], #16 | |
1613 | eor v2.16b, v5.16b, v10.16b | |
1614 | str q6, [x21], #16 | |
1615 | str q3, [x21], #16 | |
1616 | str q0, [x21], #16 | |
1617 | str q1, [x21], #16 | |
1618 | str q2, [x21], #16 | |
1619 | bpl .Lxts_enc_loop | |
1620 | ||
1621 | .Lxts_enc_short: | |
1622 | adds x22, x22, #0x70 | |
1623 | bmi .Lxts_enc_done | |
1624 | ||
1625 | ldr q8, .Lxts_magic | |
1626 | sshr v1.2d, v11.2d, #63 | |
1627 | add v2.2d, v11.2d, v11.2d | |
1628 | ldr q9, .Lxts_magic+16 | |
1629 | subs x22, x22, #0x10 | |
1630 | ldr q0, [x20], #16 | |
1631 | and v1.16b, v1.16b, v8.16b | |
1632 | cmtst v3.2d, v11.2d, v9.2d | |
1633 | ext v1.16b, v1.16b, v1.16b, #8 | |
1634 | and v3.16b, v3.16b, v8.16b | |
1635 | eor v12.16b, v2.16b, v1.16b | |
1636 | ext v1.16b, v3.16b, v3.16b, #8 | |
1637 | add v2.2d, v12.2d, v12.2d | |
1638 | cmtst v3.2d, v12.2d, v9.2d | |
1639 | eor v13.16b, v2.16b, v1.16b | |
1640 | and v22.16b, v3.16b, v8.16b | |
1641 | bmi .Lxts_enc_1 | |
1642 | ||
1643 | ext v2.16b, v22.16b, v22.16b, #8 | |
1644 | add v3.2d, v13.2d, v13.2d | |
1645 | ldr q1, [x20], #16 | |
1646 | cmtst v4.2d, v13.2d, v9.2d | |
1647 | subs x22, x22, #0x10 | |
1648 | eor v14.16b, v3.16b, v2.16b | |
1649 | and v23.16b, v4.16b, v8.16b | |
1650 | bmi .Lxts_enc_2 | |
1651 | ||
1652 | ext v3.16b, v23.16b, v23.16b, #8 | |
1653 | add v4.2d, v14.2d, v14.2d | |
1654 | ldr q2, [x20], #16 | |
1655 | cmtst v5.2d, v14.2d, v9.2d | |
1656 | eor v0.16b, v0.16b, v11.16b | |
1657 | subs x22, x22, #0x10 | |
1658 | eor v15.16b, v4.16b, v3.16b | |
1659 | and v24.16b, v5.16b, v8.16b | |
1660 | bmi .Lxts_enc_3 | |
1661 | ||
1662 | ext v4.16b, v24.16b, v24.16b, #8 | |
1663 | add v5.2d, v15.2d, v15.2d | |
1664 | ldr q3, [x20], #16 | |
1665 | cmtst v6.2d, v15.2d, v9.2d | |
1666 | eor v1.16b, v1.16b, v12.16b | |
1667 | subs x22, x22, #0x10 | |
1668 | eor v16.16b, v5.16b, v4.16b | |
1669 | and v25.16b, v6.16b, v8.16b | |
1670 | bmi .Lxts_enc_4 | |
1671 | ||
1672 | ext v5.16b, v25.16b, v25.16b, #8 | |
1673 | add v6.2d, v16.2d, v16.2d | |
1674 | add x0, x19, #16 | |
1675 | cmtst v7.2d, v16.2d, v9.2d | |
1676 | ldr q4, [x20], #16 | |
1677 | eor v2.16b, v2.16b, v13.16b | |
1678 | str q16, [x0], #16 | |
1679 | subs x22, x22, #0x10 | |
1680 | eor v17.16b, v6.16b, v5.16b | |
1681 | and v26.16b, v7.16b, v8.16b | |
1682 | bmi .Lxts_enc_5 | |
1683 | ||
1684 | ext v7.16b, v26.16b, v26.16b, #8 | |
1685 | add v18.2d, v17.2d, v17.2d | |
1686 | ldr q5, [x20], #16 | |
1687 | eor v3.16b, v3.16b, v14.16b | |
1688 | str q17, [x0], #16 | |
1689 | subs x22, x22, #0x10 | |
1690 | eor v18.16b, v18.16b, v7.16b | |
1691 | bmi .Lxts_enc_6 | |
1692 | ||
1693 | ldr q6, [x20], #16 | |
1694 | eor v4.16b, v4.16b, v15.16b | |
1695 | eor v5.16b, v5.16b, v16.16b | |
1696 | str q18, [x0] // next round tweak | |
1697 | mov x9, sp // pass key schedule | |
1698 | mov x10, x1 | |
1699 | add x0, x19, #16 | |
1700 | sub x22, x22, #0x10 | |
1701 | eor v6.16b, v6.16b, v17.16b | |
1702 | ||
1703 | bl _bsaes_encrypt8 | |
1704 | ||
1705 | ldr q16, [x0], #16 | |
1706 | eor v0.16b, v0.16b, v11.16b | |
1707 | eor v1.16b, v1.16b, v12.16b | |
1708 | ldr q17, [x0], #16 | |
1709 | eor v4.16b, v4.16b, v13.16b | |
1710 | eor v6.16b, v6.16b, v14.16b | |
1711 | eor v3.16b, v3.16b, v15.16b | |
1712 | ldr q11, [x0] // next round tweak | |
1713 | str q0, [x21], #16 | |
1714 | str q1, [x21], #16 | |
1715 | eor v0.16b, v7.16b, v16.16b | |
1716 | eor v1.16b, v2.16b, v17.16b | |
1717 | str q4, [x21], #16 | |
1718 | str q6, [x21], #16 | |
1719 | str q3, [x21], #16 | |
1720 | str q0, [x21], #16 | |
1721 | str q1, [x21], #16 | |
1722 | b .Lxts_enc_done | |
1723 | ||
1724 | .align 4 | |
1725 | .Lxts_enc_6: | |
1726 | eor v4.16b, v4.16b, v15.16b | |
1727 | eor v5.16b, v5.16b, v16.16b | |
1728 | mov x9, sp // pass key schedule | |
1729 | mov x10, x1 // pass rounds | |
1730 | add x0, x19, #16 | |
1731 | ||
1732 | bl _bsaes_encrypt8 | |
1733 | ||
1734 | ldr q16, [x0], #16 | |
1735 | eor v0.16b, v0.16b, v11.16b | |
1736 | eor v1.16b, v1.16b, v12.16b | |
1737 | eor v4.16b, v4.16b, v13.16b | |
1738 | eor v6.16b, v6.16b, v14.16b | |
1739 | ldr q11, [x0] // next round tweak | |
1740 | eor v3.16b, v3.16b, v15.16b | |
1741 | str q0, [x21], #16 | |
1742 | str q1, [x21], #16 | |
1743 | eor v0.16b, v7.16b, v16.16b | |
1744 | str q4, [x21], #16 | |
1745 | str q6, [x21], #16 | |
1746 | str q3, [x21], #16 | |
1747 | str q0, [x21], #16 | |
1748 | b .Lxts_enc_done | |
1749 | ||
1750 | .align 4 | |
1751 | .Lxts_enc_5: | |
1752 | eor v3.16b, v3.16b, v14.16b | |
1753 | eor v4.16b, v4.16b, v15.16b | |
1754 | mov x9, sp // pass key schedule | |
1755 | mov x10, x1 // pass rounds | |
1756 | add x0, x19, #16 | |
1757 | ||
1758 | bl _bsaes_encrypt8 | |
1759 | ||
1760 | eor v0.16b, v0.16b, v11.16b | |
1761 | eor v1.16b, v1.16b, v12.16b | |
1762 | ldr q11, [x0] // next round tweak | |
1763 | eor v4.16b, v4.16b, v13.16b | |
1764 | eor v6.16b, v6.16b, v14.16b | |
1765 | eor v3.16b, v3.16b, v15.16b | |
1766 | str q0, [x21], #16 | |
1767 | str q1, [x21], #16 | |
1768 | str q4, [x21], #16 | |
1769 | str q6, [x21], #16 | |
1770 | str q3, [x21], #16 | |
1771 | b .Lxts_enc_done | |
1772 | ||
1773 | .align 4 | |
1774 | .Lxts_enc_4: | |
1775 | eor v2.16b, v2.16b, v13.16b | |
1776 | eor v3.16b, v3.16b, v14.16b | |
1777 | mov x9, sp // pass key schedule | |
1778 | mov x10, x1 // pass rounds | |
1779 | add x0, x19, #16 | |
1780 | ||
1781 | bl _bsaes_encrypt8 | |
1782 | ||
1783 | eor v0.16b, v0.16b, v11.16b | |
1784 | eor v1.16b, v1.16b, v12.16b | |
1785 | eor v4.16b, v4.16b, v13.16b | |
1786 | eor v6.16b, v6.16b, v14.16b | |
1787 | mov v11.16b, v15.16b // next round tweak | |
1788 | str q0, [x21], #16 | |
1789 | str q1, [x21], #16 | |
1790 | str q4, [x21], #16 | |
1791 | str q6, [x21], #16 | |
1792 | b .Lxts_enc_done | |
1793 | ||
1794 | .align 4 | |
1795 | .Lxts_enc_3: | |
1796 | eor v1.16b, v1.16b, v12.16b | |
1797 | eor v2.16b, v2.16b, v13.16b | |
1798 | mov x9, sp // pass key schedule | |
1799 | mov x10, x1 // pass rounds | |
1800 | add x0, x19, #16 | |
1801 | ||
1802 | bl _bsaes_encrypt8 | |
1803 | ||
1804 | eor v0.16b, v0.16b, v11.16b | |
1805 | eor v1.16b, v1.16b, v12.16b | |
1806 | eor v4.16b, v4.16b, v13.16b | |
1807 | mov v11.16b, v14.16b // next round tweak | |
1808 | str q0, [x21], #16 | |
1809 | str q1, [x21], #16 | |
1810 | str q4, [x21], #16 | |
1811 | b .Lxts_enc_done | |
1812 | ||
1813 | .align 4 | |
1814 | .Lxts_enc_2: | |
1815 | eor v0.16b, v0.16b, v11.16b | |
1816 | eor v1.16b, v1.16b, v12.16b | |
1817 | mov x9, sp // pass key schedule | |
1818 | mov x10, x1 // pass rounds | |
1819 | add x0, x19, #16 | |
1820 | ||
1821 | bl _bsaes_encrypt8 | |
1822 | ||
1823 | eor v0.16b, v0.16b, v11.16b | |
1824 | eor v1.16b, v1.16b, v12.16b | |
1825 | mov v11.16b, v13.16b // next round tweak | |
1826 | str q0, [x21], #16 | |
1827 | str q1, [x21], #16 | |
1828 | b .Lxts_enc_done | |
1829 | ||
1830 | .align 4 | |
1831 | .Lxts_enc_1: | |
1832 | eor v0.16b, v0.16b, v11.16b | |
1833 | sub x0, sp, #16 | |
1834 | sub x1, sp, #16 | |
1835 | mov x2, x23 | |
1836 | mov v13.d[0], v11.d[1] // just in case AES_encrypt corrupts top half of callee-saved SIMD registers | |
1837 | mov v14.d[0], v12.d[1] | |
1838 | str q0, [sp, #-16]! | |
1839 | ||
1840 | bl AES_encrypt | |
1841 | ||
1842 | ldr q0, [sp], #16 | |
1843 | trn1 v13.2d, v11.2d, v13.2d | |
1844 | trn1 v11.2d, v12.2d, v14.2d // next round tweak | |
1845 | eor v0.16b, v0.16b, v13.16b | |
1846 | str q0, [x21], #16 | |
1847 | ||
1848 | .Lxts_enc_done: | |
1849 | adds x22, x22, #0x10 | |
1850 | beq .Lxts_enc_ret | |
1851 | ||
1852 | sub x6, x21, #0x10 | |
1853 | // Penultimate plaintext block produces final ciphertext part-block | |
1854 | // plus remaining part of final plaintext block. Move ciphertext part | |
a024ab98 | 1855 | // to final position and reuse penultimate ciphertext block buffer to |
82551af5 BA |
1856 | // construct final plaintext block |
1857 | .Lxts_enc_steal: | |
1858 | ldrb w0, [x20], #1 | |
1859 | ldrb w1, [x21, #-0x10] | |
1860 | strb w0, [x21, #-0x10] | |
1861 | strb w1, [x21], #1 | |
1862 | ||
1863 | subs x22, x22, #1 | |
1864 | bhi .Lxts_enc_steal | |
1865 | ||
1866 | // Finally encrypt the final plaintext block just constructed in the | |
1867 | // penultimate ciphertext block buffer, using the last tweak | |
1868 | ldr q0, [x6] | |
1869 | eor v0.16b, v0.16b, v11.16b | |
1870 | str q0, [sp, #-16]! | |
1871 | mov x0, sp | |
1872 | mov x1, sp | |
1873 | mov x2, x23 | |
1874 | mov x21, x6 | |
1875 | mov v13.d[0], v11.d[1] // just in case AES_encrypt corrupts top half of callee-saved SIMD registers | |
1876 | ||
1877 | bl AES_encrypt | |
1878 | ||
1879 | trn1 v11.2d, v11.2d, v13.2d | |
1880 | ldr q0, [sp], #16 | |
1881 | eor v0.16b, v0.16b, v11.16b | |
1882 | str q0, [x21] | |
1883 | ||
1884 | .Lxts_enc_ret: | |
1885 | ||
1886 | movi v0.16b, #0 | |
1887 | movi v1.16b, #0 | |
1888 | .Lxts_enc_bzero: // wipe key schedule | |
1889 | stp q0, q1, [sp], #32 | |
1890 | cmp sp, x19 | |
1891 | bne .Lxts_enc_bzero | |
1892 | ||
1893 | ldp x19, x20, [sp, #80] | |
1894 | ldp x21, x22, [sp, #96] | |
1895 | ldr x23, [sp, #112] | |
1896 | ldp d8, d9, [sp, #128] | |
1897 | ldp d10, d11, [sp, #144] | |
1898 | ldp d12, d13, [sp, #160] | |
1899 | ldp d14, d15, [sp, #176] | |
5adddcd9 | 1900 | ldp x29, x30, [sp], #192 |
82551af5 BA |
1901 | ret |
1902 | .size ossl_bsaes_xts_encrypt,.-ossl_bsaes_xts_encrypt | |
1903 | ||
1904 | // Tweak-doubling constants: {1, 0x87} carry/reduction words, then a bit-62 test mask per 64-bit lane. The | |
1905 | // assembler doesn't seem capable of de-duplicating these when expressed using `ldr qd,=` syntax, so assign a symbolic address | |
1906 | .align 5 | |
1907 | .Lxts_magic: | |
1908 | .quad 1, 0x87, 0x4000000000000000, 0x4000000000000000 | |
1909 | ||
1910 | .globl ossl_bsaes_xts_decrypt | |
1911 | .type ossl_bsaes_xts_decrypt,%function | |
1912 | .align 4 | |
1913 | // On entry: | |
1914 | // x0 -> input ciphertext | |
1915 | // x1 -> output plaintext | |
1916 | // x2 -> length of text in bytes (must be at least 16; a trailing partial block is handled by ciphertext stealing) | |
1917 | // x3 -> key1 (used to decrypt the XORed ciphertext blocks; round count is read from offset 240) | |
1918 | // x4 -> key2 (used to encrypt the initial vector to yield the initial tweak) | |
1919 | // x5 -> 16-byte initial vector (typically, sector number) | |
1920 | // On exit: | |
1921 | // Output plaintext filled in | |
1922 | // No output registers, usual AAPCS64 register preservation | |
1923 | ossl_bsaes_xts_decrypt: | |
1924 | // Stack layout (lowest address first): | |
1925 | // sp -> | |
1926 | // nrounds*128-96 bytes: key schedule | |
1927 | // x19 -> | |
1928 | // 16 bytes: frame record (x29, x30) | |
1929 | // 4*16 bytes: tweak storage across _bsaes_decrypt8 (starts at x19+16) | |
1930 | // 6*8 bytes: storage for 5 callee-saved general-purpose registers (x19-x23, at x19+80; last slot unused) | |
1931 | // 8*8 bytes: storage for 8 callee-saved SIMD registers (d8-d15, at x19+128) | |
5adddcd9 | 1932 | stp x29, x30, [sp, #-192]! |
82551af5 BA |
1933 | stp x19, x20, [sp, #80] |
1934 | stp x21, x22, [sp, #96] | |
1935 | str x23, [sp, #112] | |
1936 | stp d8, d9, [sp, #128] | |
1937 | stp d10, d11, [sp, #144] | |
1938 | stp d12, d13, [sp, #160] | |
1939 | stp d14, d15, [sp, #176] | |
1940 | |
1941 | mov x19, sp | |
1942 | mov x20, x0 | |
1943 | mov x21, x1 | |
1944 | mov x22, x2 | |
1945 | mov x23, x3 | |
1946 | |
1947 | // generate initial tweak: AES_encrypt(iv, key2) into a 16-byte stack scratch slot, then pop it into q11 | |
1948 | sub sp, sp, #16 | |
1949 | mov x0, x5 // iv[] | |
1950 | mov x1, sp | |
1951 | mov x2, x4 // key2 | |
1952 | bl AES_encrypt | |
1953 | ldr q11, [sp], #16 | |
1954 | |
1955 | ldr w1, [x23, #240] // get # of rounds (from key1) | |
1956 | // allocate the key schedule on the stack | |
1957 | add x17, sp, #96 | |
1958 | sub x17, x17, x1, lsl #7 // 128 bytes per inner round key, less 96 bytes | |
1959 | |
1960 | // populate the key schedule | |
1961 | mov x9, x23 // pass key | |
1962 | mov x10, x1 // pass # of rounds | |
1963 | mov sp, x17 | |
1964 | bl _bsaes_key_convert | |
1965 | ldr q6, [sp] | |
1966 | str q15, [x17] // save last round key | |
1967 | eor v6.16b, v6.16b, v7.16b // fix up round 0 key (by XORing with 0x63) | |
1968 | str q6, [sp] | |
1969 | |
1970 | sub x30, x22, #0x10 | |
1971 | tst x22, #0xf // if not multiple of 16 | |
1972 | csel x22, x30, x22, ne // subtract another 16 bytes (last full block is left for the stealing tail) | |
1973 | subs x22, x22, #0x80 | |
1974 | |
1975 | blo .Lxts_dec_short | |
1976 | b .Lxts_dec_loop | |
1977 | |
1978 | .align 4 | |
1979 | .Lxts_dec_loop: | |
1980 | ldr q8, .Lxts_magic | |
1981 | mov x10, x1 // pass rounds | |
1982 | add x2, x19, #16 | |
1983 | ldr q0, [x20], #16 | |
1984 | sshr v1.2d, v11.2d, #63 | |
1985 | mov x9, sp // pass key schedule | |
1986 | ldr q6, .Lxts_magic+16 | |
1987 | add v2.2d, v11.2d, v11.2d | |
1988 | cmtst v3.2d, v11.2d, v6.2d | |
1989 | and v1.16b, v1.16b, v8.16b | |
1990 | ext v1.16b, v1.16b, v1.16b, #8 | |
1991 | and v3.16b, v3.16b, v8.16b | |
1992 | ldr q4, [x20], #16 | |
1993 | eor v12.16b, v2.16b, v1.16b | |
1994 | eor v1.16b, v4.16b, v12.16b | |
1995 | eor v0.16b, v0.16b, v11.16b | |
1996 | cmtst v2.2d, v12.2d, v6.2d | |
1997 | add v4.2d, v12.2d, v12.2d | |
1998 | add x0, x19, #16 | |
1999 | ext v3.16b, v3.16b, v3.16b, #8 | |
2000 | and v2.16b, v2.16b, v8.16b | |
2001 | eor v13.16b, v4.16b, v3.16b | |
2002 | ldr q3, [x20], #16 | |
2003 | ext v4.16b, v2.16b, v2.16b, #8 | |
2004 | eor v2.16b, v3.16b, v13.16b | |
2005 | ldr q3, [x20], #16 | |
2006 | add v5.2d, v13.2d, v13.2d | |
2007 | cmtst v7.2d, v13.2d, v6.2d | |
2008 | and v7.16b, v7.16b, v8.16b | |
2009 | ldr q9, [x20], #16 | |
2010 | ext v7.16b, v7.16b, v7.16b, #8 | |
2011 | ldr q10, [x20], #16 | |
2012 | eor v14.16b, v5.16b, v4.16b | |
2013 | ldr q16, [x20], #16 | |
2014 | add v4.2d, v14.2d, v14.2d | |
2015 | eor v3.16b, v3.16b, v14.16b | |
2016 | eor v15.16b, v4.16b, v7.16b | |
2017 | add v5.2d, v15.2d, v15.2d | |
2018 | ldr q7, [x20], #16 | |
2019 | cmtst v4.2d, v14.2d, v6.2d | |
2020 | and v17.16b, v4.16b, v8.16b | |
2021 | cmtst v18.2d, v15.2d, v6.2d | |
2022 | eor v4.16b, v9.16b, v15.16b | |
2023 | ext v9.16b, v17.16b, v17.16b, #8 | |
2024 | eor v9.16b, v5.16b, v9.16b | |
2025 | add v17.2d, v9.2d, v9.2d | |
2026 | and v18.16b, v18.16b, v8.16b | |
2027 | eor v5.16b, v10.16b, v9.16b | |
2028 | str q9, [x2], #16 | |
2029 | ext v10.16b, v18.16b, v18.16b, #8 | |
2030 | cmtst v9.2d, v9.2d, v6.2d | |
2031 | and v9.16b, v9.16b, v8.16b | |
2032 | eor v10.16b, v17.16b, v10.16b | |
2033 | cmtst v17.2d, v10.2d, v6.2d | |
2034 | eor v6.16b, v16.16b, v10.16b | |
2035 | str q10, [x2], #16 | |
2036 | ext v9.16b, v9.16b, v9.16b, #8 | |
2037 | add v10.2d, v10.2d, v10.2d | |
2038 | eor v9.16b, v10.16b, v9.16b | |
2039 | str q9, [x2], #16 | |
2040 | eor v7.16b, v7.16b, v9.16b | |
2041 | add v9.2d, v9.2d, v9.2d | |
2042 | and v8.16b, v17.16b, v8.16b | |
2043 | ext v8.16b, v8.16b, v8.16b, #8 | |
2044 | eor v8.16b, v9.16b, v8.16b | |
2045 | str q8, [x2] // next round tweak (4th tweak-storage slot) | |
2046 | |
2047 | bl _bsaes_decrypt8 | |
2048 | |
2049 | eor v6.16b, v6.16b, v13.16b | |
2050 | eor v0.16b, v0.16b, v11.16b | |
2051 | ldr q8, [x0], #16 | |
2052 | eor v7.16b, v7.16b, v8.16b | |
2053 | str q0, [x21], #16 | |
2054 | eor v0.16b, v1.16b, v12.16b | |
2055 | ldr q1, [x0], #16 | |
2056 | eor v1.16b, v3.16b, v1.16b | |
2057 | subs x22, x22, #0x80 | |
2058 | eor v2.16b, v2.16b, v15.16b | |
2059 | eor v3.16b, v4.16b, v14.16b | |
2060 | ldr q4, [x0], #16 | |
2061 | str q0, [x21], #16 | |
2062 | ldr q11, [x0] // next round tweak (saved before the call) | |
2063 | eor v0.16b, v5.16b, v4.16b | |
2064 | str q6, [x21], #16 | |
2065 | str q3, [x21], #16 | |
2066 | str q2, [x21], #16 | |
2067 | str q7, [x21], #16 | |
2068 | str q1, [x21], #16 | |
2069 | str q0, [x21], #16 | |
2070 | bpl .Lxts_dec_loop | |
2071 | |
2072 | .Lxts_dec_short: | |
2073 | adds x22, x22, #0x70 | |
2074 | bmi .Lxts_dec_done | |
2075 | |
2076 | ldr q8, .Lxts_magic | |
2077 | sshr v1.2d, v11.2d, #63 | |
2078 | add v2.2d, v11.2d, v11.2d | |
2079 | ldr q9, .Lxts_magic+16 | |
2080 | subs x22, x22, #0x10 | |
2081 | ldr q0, [x20], #16 | |
2082 | and v1.16b, v1.16b, v8.16b | |
2083 | cmtst v3.2d, v11.2d, v9.2d | |
2084 | ext v1.16b, v1.16b, v1.16b, #8 | |
2085 | and v3.16b, v3.16b, v8.16b | |
2086 | eor v12.16b, v2.16b, v1.16b | |
2087 | ext v1.16b, v3.16b, v3.16b, #8 | |
2088 | add v2.2d, v12.2d, v12.2d | |
2089 | cmtst v3.2d, v12.2d, v9.2d | |
2090 | eor v13.16b, v2.16b, v1.16b | |
2091 | and v22.16b, v3.16b, v8.16b | |
2092 | bmi .Lxts_dec_1 | |
2093 | |
2094 | ext v2.16b, v22.16b, v22.16b, #8 | |
2095 | add v3.2d, v13.2d, v13.2d | |
2096 | ldr q1, [x20], #16 | |
2097 | cmtst v4.2d, v13.2d, v9.2d | |
2098 | subs x22, x22, #0x10 | |
2099 | eor v14.16b, v3.16b, v2.16b | |
2100 | and v23.16b, v4.16b, v8.16b | |
2101 | bmi .Lxts_dec_2 | |
2102 | |
2103 | ext v3.16b, v23.16b, v23.16b, #8 | |
2104 | add v4.2d, v14.2d, v14.2d | |
2105 | ldr q2, [x20], #16 | |
2106 | cmtst v5.2d, v14.2d, v9.2d | |
2107 | eor v0.16b, v0.16b, v11.16b | |
2108 | subs x22, x22, #0x10 | |
2109 | eor v15.16b, v4.16b, v3.16b | |
2110 | and v24.16b, v5.16b, v8.16b | |
2111 | bmi .Lxts_dec_3 | |
2112 | |
2113 | ext v4.16b, v24.16b, v24.16b, #8 | |
2114 | add v5.2d, v15.2d, v15.2d | |
2115 | ldr q3, [x20], #16 | |
2116 | cmtst v6.2d, v15.2d, v9.2d | |
2117 | eor v1.16b, v1.16b, v12.16b | |
2118 | subs x22, x22, #0x10 | |
2119 | eor v16.16b, v5.16b, v4.16b | |
2120 | and v25.16b, v6.16b, v8.16b | |
2121 | bmi .Lxts_dec_4 | |
2122 | |
2123 | ext v5.16b, v25.16b, v25.16b, #8 | |
2124 | add v6.2d, v16.2d, v16.2d | |
2125 | add x0, x19, #16 | |
2126 | cmtst v7.2d, v16.2d, v9.2d | |
2127 | ldr q4, [x20], #16 | |
2128 | eor v2.16b, v2.16b, v13.16b | |
2129 | str q16, [x0], #16 | |
2130 | subs x22, x22, #0x10 | |
2131 | eor v17.16b, v6.16b, v5.16b | |
2132 | and v26.16b, v7.16b, v8.16b | |
2133 | bmi .Lxts_dec_5 | |
2134 | |
2135 | ext v7.16b, v26.16b, v26.16b, #8 | |
2136 | add v18.2d, v17.2d, v17.2d | |
2137 | ldr q5, [x20], #16 | |
2138 | eor v3.16b, v3.16b, v14.16b | |
2139 | str q17, [x0], #16 | |
2140 | subs x22, x22, #0x10 | |
2141 | eor v18.16b, v18.16b, v7.16b | |
2142 | bmi .Lxts_dec_6 | |
2143 | |
2144 | ldr q6, [x20], #16 | |
2145 | eor v4.16b, v4.16b, v15.16b | |
2146 | eor v5.16b, v5.16b, v16.16b | |
2147 | str q18, [x0] // next round tweak | |
2148 | mov x9, sp // pass key schedule | |
2149 | mov x10, x1 | |
2150 | add x0, x19, #16 | |
2151 | sub x22, x22, #0x10 | |
2152 | eor v6.16b, v6.16b, v17.16b | |
2153 | |
2154 | bl _bsaes_decrypt8 | |
2155 | |
2156 | ldr q16, [x0], #16 | |
2157 | eor v0.16b, v0.16b, v11.16b | |
2158 | eor v1.16b, v1.16b, v12.16b | |
2159 | ldr q17, [x0], #16 | |
2160 | eor v6.16b, v6.16b, v13.16b | |
2161 | eor v4.16b, v4.16b, v14.16b | |
2162 | eor v2.16b, v2.16b, v15.16b | |
2163 | ldr q11, [x0] // next round tweak | |
2164 | str q0, [x21], #16 | |
2165 | str q1, [x21], #16 | |
2166 | eor v0.16b, v7.16b, v16.16b | |
2167 | eor v1.16b, v3.16b, v17.16b | |
2168 | str q6, [x21], #16 | |
2169 | str q4, [x21], #16 | |
2170 | str q2, [x21], #16 | |
2171 | str q0, [x21], #16 | |
2172 | str q1, [x21], #16 | |
2173 | b .Lxts_dec_done | |
2174 | |
2175 | .align 4 | |
2176 | .Lxts_dec_6: | |
2177 | eor v4.16b, v4.16b, v15.16b | |
2178 | eor v5.16b, v5.16b, v16.16b | |
2179 | mov x9, sp // pass key schedule | |
2180 | mov x10, x1 // pass rounds | |
2181 | add x0, x19, #16 | |
2182 | |
2183 | bl _bsaes_decrypt8 | |
2184 | |
2185 | ldr q16, [x0], #16 | |
2186 | eor v0.16b, v0.16b, v11.16b | |
2187 | eor v1.16b, v1.16b, v12.16b | |
2188 | eor v6.16b, v6.16b, v13.16b | |
2189 | eor v4.16b, v4.16b, v14.16b | |
2190 | ldr q11, [x0] // next round tweak | |
2191 | eor v2.16b, v2.16b, v15.16b | |
2192 | str q0, [x21], #16 | |
2193 | str q1, [x21], #16 | |
2194 | eor v0.16b, v7.16b, v16.16b | |
2195 | str q6, [x21], #16 | |
2196 | str q4, [x21], #16 | |
2197 | str q2, [x21], #16 | |
2198 | str q0, [x21], #16 | |
2199 | b .Lxts_dec_done | |
2200 | |
2201 | .align 4 | |
2202 | .Lxts_dec_5: | |
2203 | eor v3.16b, v3.16b, v14.16b | |
2204 | eor v4.16b, v4.16b, v15.16b | |
2205 | mov x9, sp // pass key schedule | |
2206 | mov x10, x1 // pass rounds | |
2207 | add x0, x19, #16 | |
2208 | |
2209 | bl _bsaes_decrypt8 | |
2210 | |
2211 | eor v0.16b, v0.16b, v11.16b | |
2212 | eor v1.16b, v1.16b, v12.16b | |
2213 | ldr q11, [x0] // next round tweak | |
2214 | eor v6.16b, v6.16b, v13.16b | |
2215 | eor v4.16b, v4.16b, v14.16b | |
2216 | eor v2.16b, v2.16b, v15.16b | |
2217 | str q0, [x21], #16 | |
2218 | str q1, [x21], #16 | |
2219 | str q6, [x21], #16 | |
2220 | str q4, [x21], #16 | |
2221 | str q2, [x21], #16 | |
2222 | b .Lxts_dec_done | |
2223 | |
2224 | .align 4 | |
2225 | .Lxts_dec_4: | |
2226 | eor v2.16b, v2.16b, v13.16b | |
2227 | eor v3.16b, v3.16b, v14.16b | |
2228 | mov x9, sp // pass key schedule | |
2229 | mov x10, x1 // pass rounds | |
2230 | add x0, x19, #16 | |
2231 | |
2232 | bl _bsaes_decrypt8 | |
2233 | |
2234 | eor v0.16b, v0.16b, v11.16b | |
2235 | eor v1.16b, v1.16b, v12.16b | |
2236 | eor v6.16b, v6.16b, v13.16b | |
2237 | eor v4.16b, v4.16b, v14.16b | |
2238 | mov v11.16b, v15.16b // next round tweak | |
2239 | str q0, [x21], #16 | |
2240 | str q1, [x21], #16 | |
2241 | str q6, [x21], #16 | |
2242 | str q4, [x21], #16 | |
2243 | b .Lxts_dec_done | |
2244 | |
2245 | .align 4 | |
2246 | .Lxts_dec_3: | |
2247 | eor v1.16b, v1.16b, v12.16b | |
2248 | eor v2.16b, v2.16b, v13.16b | |
2249 | mov x9, sp // pass key schedule | |
2250 | mov x10, x1 // pass rounds | |
2251 | add x0, x19, #16 | |
2252 | |
2253 | bl _bsaes_decrypt8 | |
2254 | |
2255 | eor v0.16b, v0.16b, v11.16b | |
2256 | eor v1.16b, v1.16b, v12.16b | |
2257 | eor v6.16b, v6.16b, v13.16b | |
2258 | mov v11.16b, v14.16b // next round tweak | |
2259 | str q0, [x21], #16 | |
2260 | str q1, [x21], #16 | |
2261 | str q6, [x21], #16 | |
2262 | b .Lxts_dec_done | |
2263 | |
2264 | .align 4 | |
2265 | .Lxts_dec_2: | |
2266 | eor v0.16b, v0.16b, v11.16b | |
2267 | eor v1.16b, v1.16b, v12.16b | |
2268 | mov x9, sp // pass key schedule | |
2269 | mov x10, x1 // pass rounds | |
2270 | add x0, x19, #16 | |
2271 | |
2272 | bl _bsaes_decrypt8 | |
2273 | |
2274 | eor v0.16b, v0.16b, v11.16b | |
2275 | eor v1.16b, v1.16b, v12.16b | |
2276 | mov v11.16b, v13.16b // next round tweak | |
2277 | str q0, [x21], #16 | |
2278 | str q1, [x21], #16 | |
2279 | b .Lxts_dec_done | |
2280 | |
2281 | .align 4 | |
2282 | .Lxts_dec_1: | |
2283 | eor v0.16b, v0.16b, v11.16b | |
2284 | sub x0, sp, #16 | |
2285 | sub x1, sp, #16 | |
2286 | mov x2, x23 | |
2287 | mov v13.d[0], v11.d[1] // just in case AES_decrypt corrupts top half of callee-saved SIMD registers | |
2288 | mov v14.d[0], v12.d[1] | |
2289 | str q0, [sp, #-16]! | |
2290 | |
2291 | bl AES_decrypt | |
2292 | |
2293 | ldr q0, [sp], #16 | |
2294 | trn1 v13.2d, v11.2d, v13.2d | |
2295 | trn1 v11.2d, v12.2d, v14.2d // next round tweak | |
2296 | eor v0.16b, v0.16b, v13.16b | |
2297 | str q0, [x21], #16 | |
2298 | |
2299 | .Lxts_dec_done: | |
2300 | adds x22, x22, #0x10 | |
2301 | beq .Lxts_dec_ret | |
2302 | |
2303 | // calculate one round of extra tweak for the stolen ciphertext (v12 = v11 doubled per 64-bit lane, carry/0x87 folded in) | |
2304 | ldr q8, .Lxts_magic | |
2305 | sshr v6.2d, v11.2d, #63 | |
2306 | and v6.16b, v6.16b, v8.16b | |
2307 | add v12.2d, v11.2d, v11.2d | |
2308 | ext v6.16b, v6.16b, v6.16b, #8 | |
2309 | eor v12.16b, v12.16b, v6.16b | |
2310 | |
2311 | // perform the final decryption with the last tweak value (v12) on the last full ciphertext block | |
2312 | ldr q0, [x20], #16 | |
2313 | eor v0.16b, v0.16b, v12.16b | |
2314 | str q0, [sp, #-16]! | |
2315 | mov x0, sp | |
2316 | mov x1, sp | |
2317 | mov x2, x23 | |
2318 | mov v13.d[0], v11.d[1] // just in case AES_decrypt corrupts top half of callee-saved SIMD registers | |
2319 | mov v14.d[0], v12.d[1] | |
2320 | |
2321 | bl AES_decrypt | |
2322 | |
2323 | trn1 v12.2d, v12.2d, v14.2d | |
2324 | trn1 v11.2d, v11.2d, v13.2d | |
2325 | ldr q0, [sp], #16 | |
2326 | eor v0.16b, v0.16b, v12.16b | |
2327 | str q0, [x21] | |
2328 | |
2329 | mov x6, x21 | |
2330 | // Penultimate ciphertext block produces final plaintext part-block | |
2331 | // plus remaining part of final ciphertext block. Move plaintext part | |
a024ab98 | 2332 | // to final position and reuse penultimate plaintext block buffer to |
82551af5 BA |
2333 | // construct final ciphertext block |
2334 | .Lxts_dec_steal: | |
2335 | ldrb w1, [x21] | |
2336 | ldrb w0, [x20], #1 | |
2337 | strb w1, [x21, #0x10] | |
2338 | strb w0, [x21], #1 | |
2339 | |
2340 | subs x22, x22, #1 | |
2341 | bhi .Lxts_dec_steal | |
2342 | |
2343 | // Finally decrypt the final ciphertext block just constructed in the | |
2344 | // penultimate plaintext block buffer, using the penultimate tweak (v11) | |
2345 | ldr q0, [x6] | |
2346 | eor v0.16b, v0.16b, v11.16b | |
2347 | str q0, [sp, #-16]! | |
2348 | mov x0, sp | |
2349 | mov x1, sp | |
2350 | mov x2, x23 | |
2351 | mov x21, x6 | |
2352 | |
2353 | bl AES_decrypt | |
2354 | |
2355 | trn1 v11.2d, v11.2d, v13.2d | |
2356 | ldr q0, [sp], #16 | |
2357 | eor v0.16b, v0.16b, v11.16b | |
2358 | str q0, [x21] | |
2359 | |
2360 | .Lxts_dec_ret: | |
2361 | |
2362 | movi v0.16b, #0 | |
2363 | movi v1.16b, #0 | |
2364 | .Lxts_dec_bzero: // wipe key schedule (32 bytes per iteration, from sp up to x19) | |
2365 | stp q0, q1, [sp], #32 | |
2366 | cmp sp, x19 | |
2367 | bne .Lxts_dec_bzero | |
2368 | |
2369 | ldp x19, x20, [sp, #80] | |
2370 | ldp x21, x22, [sp, #96] | |
2371 | ldr x23, [sp, #112] | |
2372 | ldp d8, d9, [sp, #128] | |
2373 | ldp d10, d11, [sp, #144] | |
2374 | ldp d12, d13, [sp, #160] | |
2375 | ldp d14, d15, [sp, #176] | |
5adddcd9 | 2376 | ldp x29, x30, [sp], #192 |
82551af5 BA |
2377 | ret |
2378 | .size ossl_bsaes_xts_decrypt,.-ossl_bsaes_xts_decrypt |