#!/usr/bin/env perl
# Copyright 2020-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

use strict;

my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
my $xlate;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

my $code = data();
print $code;

close STDOUT or die "error closing STDOUT: $!"; # enforce flush

sub data
{
    local $/;
    return <DATA>;
}

__END__
// Copyright 2021-2023 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the OpenSSL license (the "License"). You may not use
// this file except in compliance with the License. You can obtain a copy
// in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
//
// ====================================================================
// Written by Ben Avison <bavison@riscosopen.org> for the OpenSSL
// project. Rights for redistribution and usage in source and binary
// forms are granted according to the OpenSSL license.
// ====================================================================
//
// This implementation is a translation of bsaes-armv7 for AArch64.
// No attempt has been made to carry across the build switches for
// kernel targets, since the Linux kernel crypto support has moved on
// from when it was based on OpenSSL.

// A lot of hand-scheduling has been performed, so this code does not
// factor out neatly into macros in the same way that the AArch32
// version did. Since there is little to be gained by wrapping it up
// in Perl, it is presented as pure assembly.


#include "crypto/arm_arch.h"

.text

.extern AES_cbc_encrypt
.extern AES_encrypt
.extern AES_decrypt

.type _bsaes_decrypt8,%function
.align 4
// On entry:
// x9 -> key (previously expanded using _bsaes_key_convert)
// x10 = number of rounds
// v0-v7 input data
// On exit:
// x9-x11 corrupted
// other general-purpose registers preserved
// v0-v7 output data
// v11-v15 preserved
// other SIMD registers corrupted
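//
// The interleaved ushr/shl/eor triples below implement the classic
// SWAPMOVE bit-matrix transpose used to move the state into (and out
// of) bitsliced form. As a rough guide only, one SWAPMOVE step
// corresponds to this C sketch (hypothetical helper, not part of
// this file; assumes <stdint.h>):
//
//   static inline void swapmove(uint64_t *a, uint64_t *b,
//                               unsigned n, uint64_t mask)
//   {
//       uint64_t t = ((*a >> n) ^ *b) & mask;  // bits that differ
//       *b ^= t;                               // fix up b
//       *a ^= t << n;                          // fix up a
//   }
//
// applied with (n, mask) = (1, 0x55..55), (2, 0x33..33), (4, 0x0f..0f).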
_bsaes_decrypt8:
        ldr q8, [x9], #16
        adr x11, .LM0ISR
        movi v9.16b, #0x55
        ldr q10, [x11], #16
        movi v16.16b, #0x33
        movi v17.16b, #0x0f
        sub x10, x10, #1
        eor v0.16b, v0.16b, v8.16b
        eor v1.16b, v1.16b, v8.16b
        eor v2.16b, v2.16b, v8.16b
        eor v4.16b, v4.16b, v8.16b
        eor v3.16b, v3.16b, v8.16b
        eor v5.16b, v5.16b, v8.16b
        tbl v0.16b, {v0.16b}, v10.16b
        tbl v1.16b, {v1.16b}, v10.16b
        tbl v2.16b, {v2.16b}, v10.16b
        tbl v4.16b, {v4.16b}, v10.16b
        eor v6.16b, v6.16b, v8.16b
        eor v7.16b, v7.16b, v8.16b
        tbl v3.16b, {v3.16b}, v10.16b
        tbl v5.16b, {v5.16b}, v10.16b
        tbl v6.16b, {v6.16b}, v10.16b
        ushr v8.2d, v0.2d, #1
        tbl v7.16b, {v7.16b}, v10.16b
        ushr v10.2d, v4.2d, #1
        ushr v18.2d, v2.2d, #1
        eor v8.16b, v8.16b, v1.16b
        ushr v19.2d, v6.2d, #1
        eor v10.16b, v10.16b, v5.16b
        eor v18.16b, v18.16b, v3.16b
        and v8.16b, v8.16b, v9.16b
        eor v19.16b, v19.16b, v7.16b
        and v10.16b, v10.16b, v9.16b
        and v18.16b, v18.16b, v9.16b
        eor v1.16b, v1.16b, v8.16b
        shl v8.2d, v8.2d, #1
        and v9.16b, v19.16b, v9.16b
        eor v5.16b, v5.16b, v10.16b
        shl v10.2d, v10.2d, #1
        eor v3.16b, v3.16b, v18.16b
        shl v18.2d, v18.2d, #1
        eor v0.16b, v0.16b, v8.16b
        shl v8.2d, v9.2d, #1
        eor v7.16b, v7.16b, v9.16b
        eor v4.16b, v4.16b, v10.16b
        eor v2.16b, v2.16b, v18.16b
        ushr v9.2d, v1.2d, #2
        eor v6.16b, v6.16b, v8.16b
        ushr v8.2d, v0.2d, #2
        ushr v10.2d, v5.2d, #2
        ushr v18.2d, v4.2d, #2
        eor v9.16b, v9.16b, v3.16b
        eor v8.16b, v8.16b, v2.16b
        eor v10.16b, v10.16b, v7.16b
        eor v18.16b, v18.16b, v6.16b
        and v9.16b, v9.16b, v16.16b
        and v8.16b, v8.16b, v16.16b
        and v10.16b, v10.16b, v16.16b
        and v16.16b, v18.16b, v16.16b
        eor v3.16b, v3.16b, v9.16b
        shl v9.2d, v9.2d, #2
        eor v2.16b, v2.16b, v8.16b
        shl v8.2d, v8.2d, #2
        eor v7.16b, v7.16b, v10.16b
        shl v10.2d, v10.2d, #2
        eor v6.16b, v6.16b, v16.16b
        shl v16.2d, v16.2d, #2
        eor v1.16b, v1.16b, v9.16b
        eor v0.16b, v0.16b, v8.16b
        eor v5.16b, v5.16b, v10.16b
        eor v4.16b, v4.16b, v16.16b
        ushr v8.2d, v3.2d, #4
        ushr v9.2d, v2.2d, #4
        ushr v10.2d, v1.2d, #4
        ushr v16.2d, v0.2d, #4
        eor v8.16b, v8.16b, v7.16b
        eor v9.16b, v9.16b, v6.16b
        eor v10.16b, v10.16b, v5.16b
        eor v16.16b, v16.16b, v4.16b
        and v8.16b, v8.16b, v17.16b
        and v9.16b, v9.16b, v17.16b
        and v10.16b, v10.16b, v17.16b
        and v16.16b, v16.16b, v17.16b
        eor v7.16b, v7.16b, v8.16b
        shl v8.2d, v8.2d, #4
        eor v6.16b, v6.16b, v9.16b
        shl v9.2d, v9.2d, #4
        eor v5.16b, v5.16b, v10.16b
        shl v10.2d, v10.2d, #4
        eor v4.16b, v4.16b, v16.16b
        shl v16.2d, v16.2d, #4
        eor v3.16b, v3.16b, v8.16b
        eor v2.16b, v2.16b, v9.16b
        eor v1.16b, v1.16b, v10.16b
        eor v0.16b, v0.16b, v16.16b
        b .Ldec_sbox
.align 4
.Ldec_loop:
        ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64
        ldp q8, q9, [x9], #32
        eor v0.16b, v16.16b, v0.16b
        ldr q10, [x9], #16
        eor v1.16b, v17.16b, v1.16b
        ldr q16, [x9], #16
        eor v2.16b, v18.16b, v2.16b
        eor v3.16b, v19.16b, v3.16b
        eor v4.16b, v8.16b, v4.16b
        eor v5.16b, v9.16b, v5.16b
        eor v6.16b, v10.16b, v6.16b
        eor v7.16b, v16.16b, v7.16b
        tbl v0.16b, {v0.16b}, v28.16b
        tbl v1.16b, {v1.16b}, v28.16b
        tbl v2.16b, {v2.16b}, v28.16b
        tbl v3.16b, {v3.16b}, v28.16b
        tbl v4.16b, {v4.16b}, v28.16b
        tbl v5.16b, {v5.16b}, v28.16b
        tbl v6.16b, {v6.16b}, v28.16b
        tbl v7.16b, {v7.16b}, v28.16b
.Ldec_sbox:
        eor v1.16b, v1.16b, v4.16b
        eor v3.16b, v3.16b, v4.16b
        subs x10, x10, #1
        eor v4.16b, v4.16b, v7.16b
        eor v2.16b, v2.16b, v7.16b
        eor v1.16b, v1.16b, v6.16b
        eor v6.16b, v6.16b, v4.16b
        eor v2.16b, v2.16b, v5.16b
        eor v0.16b, v0.16b, v1.16b
        eor v7.16b, v7.16b, v6.16b
        eor v8.16b, v6.16b, v2.16b
        and v9.16b, v4.16b, v6.16b
        eor v10.16b, v2.16b, v6.16b
        eor v3.16b, v3.16b, v0.16b
        eor v5.16b, v5.16b, v0.16b
        eor v16.16b, v7.16b, v4.16b
        eor v17.16b, v4.16b, v0.16b
        and v18.16b, v0.16b, v2.16b
        eor v19.16b, v7.16b, v4.16b
        eor v1.16b, v1.16b, v3.16b
        eor v20.16b, v3.16b, v0.16b
        eor v21.16b, v5.16b, v2.16b
        eor v22.16b, v3.16b, v7.16b
        and v8.16b, v17.16b, v8.16b
        orr v17.16b, v3.16b, v5.16b
        eor v23.16b, v1.16b, v6.16b
        eor v24.16b, v20.16b, v16.16b
        eor v25.16b, v1.16b, v5.16b
        orr v26.16b, v20.16b, v21.16b
        and v20.16b, v20.16b, v21.16b
        and v27.16b, v7.16b, v1.16b
        eor v21.16b, v21.16b, v23.16b
        orr v28.16b, v16.16b, v23.16b
        orr v29.16b, v22.16b, v25.16b
        eor v26.16b, v26.16b, v8.16b
        and v16.16b, v16.16b, v23.16b
        and v22.16b, v22.16b, v25.16b
        and v21.16b, v24.16b, v21.16b
        eor v8.16b, v28.16b, v8.16b
        eor v23.16b, v5.16b, v2.16b
        eor v24.16b, v1.16b, v6.16b
        eor v16.16b, v16.16b, v22.16b
        eor v22.16b, v3.16b, v0.16b
        eor v25.16b, v29.16b, v21.16b
        eor v21.16b, v26.16b, v21.16b
        eor v8.16b, v8.16b, v20.16b
        eor v26.16b, v23.16b, v24.16b
        eor v16.16b, v16.16b, v20.16b
        eor v28.16b, v22.16b, v19.16b
        eor v20.16b, v25.16b, v20.16b
        eor v9.16b, v21.16b, v9.16b
        eor v8.16b, v8.16b, v18.16b
        eor v18.16b, v5.16b, v1.16b
        eor v21.16b, v16.16b, v17.16b
        eor v16.16b, v16.16b, v17.16b
        eor v17.16b, v20.16b, v27.16b
        eor v20.16b, v3.16b, v7.16b
        eor v25.16b, v9.16b, v8.16b
        eor v27.16b, v0.16b, v4.16b
        and v29.16b, v9.16b, v17.16b
        eor v30.16b, v8.16b, v29.16b
        eor v31.16b, v21.16b, v29.16b
        eor v29.16b, v21.16b, v29.16b
        bsl v30.16b, v17.16b, v21.16b
        bsl v31.16b, v9.16b, v8.16b
        bsl v16.16b, v30.16b, v29.16b
        bsl v21.16b, v29.16b, v30.16b
        eor v8.16b, v31.16b, v30.16b
        and v1.16b, v1.16b, v31.16b
        and v9.16b, v16.16b, v31.16b
        and v6.16b, v6.16b, v30.16b
        eor v16.16b, v17.16b, v21.16b
        and v4.16b, v4.16b, v30.16b
        eor v17.16b, v8.16b, v30.16b
        and v21.16b, v24.16b, v8.16b
        eor v9.16b, v9.16b, v25.16b
        and v19.16b, v19.16b, v8.16b
        eor v24.16b, v30.16b, v16.16b
        eor v25.16b, v30.16b, v16.16b
        and v7.16b, v7.16b, v17.16b
        and v10.16b, v10.16b, v16.16b
        eor v29.16b, v9.16b, v16.16b
        eor v30.16b, v31.16b, v9.16b
        and v0.16b, v24.16b, v0.16b
        and v9.16b, v18.16b, v9.16b
        and v2.16b, v25.16b, v2.16b
        eor v10.16b, v10.16b, v6.16b
        eor v18.16b, v29.16b, v16.16b
        and v5.16b, v30.16b, v5.16b
        eor v24.16b, v8.16b, v29.16b
        and v25.16b, v26.16b, v29.16b
        and v26.16b, v28.16b, v29.16b
        eor v8.16b, v8.16b, v29.16b
        eor v17.16b, v17.16b, v18.16b
        eor v5.16b, v1.16b, v5.16b
        and v23.16b, v24.16b, v23.16b
        eor v21.16b, v21.16b, v25.16b
        eor v19.16b, v19.16b, v26.16b
        eor v0.16b, v4.16b, v0.16b
        and v3.16b, v17.16b, v3.16b
        eor v1.16b, v9.16b, v1.16b
        eor v9.16b, v25.16b, v23.16b
        eor v5.16b, v5.16b, v21.16b
        eor v2.16b, v6.16b, v2.16b
        and v6.16b, v8.16b, v22.16b
        eor v3.16b, v7.16b, v3.16b
        and v8.16b, v20.16b, v18.16b
        eor v10.16b, v10.16b, v9.16b
        eor v0.16b, v0.16b, v19.16b
        eor v9.16b, v1.16b, v9.16b
        eor v1.16b, v2.16b, v21.16b
        eor v3.16b, v3.16b, v19.16b
        and v16.16b, v27.16b, v16.16b
        eor v17.16b, v26.16b, v6.16b
        eor v6.16b, v8.16b, v7.16b
        eor v7.16b, v1.16b, v9.16b
        eor v1.16b, v5.16b, v3.16b
        eor v2.16b, v10.16b, v3.16b
        eor v4.16b, v16.16b, v4.16b
        eor v8.16b, v6.16b, v17.16b
        eor v5.16b, v9.16b, v3.16b
        eor v9.16b, v0.16b, v1.16b
        eor v6.16b, v7.16b, v1.16b
        eor v0.16b, v4.16b, v17.16b
        eor v4.16b, v8.16b, v7.16b
        eor v7.16b, v9.16b, v2.16b
        eor v8.16b, v3.16b, v0.16b
        eor v7.16b, v7.16b, v5.16b
        eor v3.16b, v4.16b, v7.16b
        eor v4.16b, v7.16b, v0.16b
        eor v7.16b, v8.16b, v3.16b
        bcc .Ldec_done
        ext v8.16b, v0.16b, v0.16b, #8
        ext v9.16b, v1.16b, v1.16b, #8
        ldr q28, [x11] // load from .LISR in common case (x10 > 0)
        ext v10.16b, v6.16b, v6.16b, #8
        ext v16.16b, v3.16b, v3.16b, #8
        ext v17.16b, v5.16b, v5.16b, #8
        ext v18.16b, v4.16b, v4.16b, #8
        eor v8.16b, v8.16b, v0.16b
        eor v9.16b, v9.16b, v1.16b
        eor v10.16b, v10.16b, v6.16b
        eor v16.16b, v16.16b, v3.16b
        eor v17.16b, v17.16b, v5.16b
        ext v19.16b, v2.16b, v2.16b, #8
        ext v20.16b, v7.16b, v7.16b, #8
        eor v18.16b, v18.16b, v4.16b
        eor v6.16b, v6.16b, v8.16b
        eor v8.16b, v2.16b, v10.16b
        eor v4.16b, v4.16b, v9.16b
        eor v2.16b, v19.16b, v2.16b
        eor v9.16b, v20.16b, v7.16b
        eor v0.16b, v0.16b, v16.16b
        eor v1.16b, v1.16b, v16.16b
        eor v6.16b, v6.16b, v17.16b
        eor v8.16b, v8.16b, v16.16b
        eor v7.16b, v7.16b, v18.16b
        eor v4.16b, v4.16b, v16.16b
        eor v2.16b, v3.16b, v2.16b
        eor v1.16b, v1.16b, v17.16b
        eor v3.16b, v5.16b, v9.16b
        eor v5.16b, v8.16b, v17.16b
        eor v7.16b, v7.16b, v17.16b
        ext v8.16b, v0.16b, v0.16b, #12
        ext v9.16b, v6.16b, v6.16b, #12
        ext v10.16b, v4.16b, v4.16b, #12
        ext v16.16b, v1.16b, v1.16b, #12
        ext v17.16b, v5.16b, v5.16b, #12
        ext v18.16b, v7.16b, v7.16b, #12
        eor v0.16b, v0.16b, v8.16b
        eor v6.16b, v6.16b, v9.16b
        eor v4.16b, v4.16b, v10.16b
        ext v19.16b, v2.16b, v2.16b, #12
        ext v20.16b, v3.16b, v3.16b, #12
        eor v1.16b, v1.16b, v16.16b
        eor v5.16b, v5.16b, v17.16b
        eor v7.16b, v7.16b, v18.16b
        eor v2.16b, v2.16b, v19.16b
        eor v16.16b, v16.16b, v0.16b
        eor v3.16b, v3.16b, v20.16b
        eor v17.16b, v17.16b, v4.16b
        eor v10.16b, v10.16b, v6.16b
        ext v0.16b, v0.16b, v0.16b, #8
        eor v9.16b, v9.16b, v1.16b
        ext v1.16b, v1.16b, v1.16b, #8
        eor v8.16b, v8.16b, v3.16b
        eor v16.16b, v16.16b, v3.16b
        eor v18.16b, v18.16b, v5.16b
        eor v19.16b, v19.16b, v7.16b
        ext v21.16b, v5.16b, v5.16b, #8
        ext v5.16b, v7.16b, v7.16b, #8
        eor v7.16b, v20.16b, v2.16b
        ext v4.16b, v4.16b, v4.16b, #8
        ext v20.16b, v3.16b, v3.16b, #8
        eor v17.16b, v17.16b, v3.16b
        ext v2.16b, v2.16b, v2.16b, #8
        eor v3.16b, v10.16b, v3.16b
        ext v10.16b, v6.16b, v6.16b, #8
        eor v0.16b, v0.16b, v8.16b
        eor v1.16b, v1.16b, v16.16b
        eor v5.16b, v5.16b, v18.16b
        eor v3.16b, v3.16b, v4.16b
        eor v7.16b, v20.16b, v7.16b
        eor v6.16b, v2.16b, v19.16b
        eor v4.16b, v21.16b, v17.16b
        eor v2.16b, v10.16b, v9.16b
        bne .Ldec_loop
        ldr q28, [x11, #16]! // load from .LISRM0 on last round (x10 == 0)
        b .Ldec_loop
.align 4
.Ldec_done:
        ushr v8.2d, v0.2d, #1
        movi v9.16b, #0x55
        ldr q10, [x9]
        ushr v16.2d, v2.2d, #1
        movi v17.16b, #0x33
        ushr v18.2d, v6.2d, #1
        movi v19.16b, #0x0f
        eor v8.16b, v8.16b, v1.16b
        ushr v20.2d, v3.2d, #1
        eor v16.16b, v16.16b, v7.16b
        eor v18.16b, v18.16b, v4.16b
        and v8.16b, v8.16b, v9.16b
        eor v20.16b, v20.16b, v5.16b
        and v16.16b, v16.16b, v9.16b
        and v18.16b, v18.16b, v9.16b
        shl v21.2d, v8.2d, #1
        eor v1.16b, v1.16b, v8.16b
        and v8.16b, v20.16b, v9.16b
        eor v7.16b, v7.16b, v16.16b
        shl v9.2d, v16.2d, #1
        eor v4.16b, v4.16b, v18.16b
        shl v16.2d, v18.2d, #1
        eor v0.16b, v0.16b, v21.16b
        shl v18.2d, v8.2d, #1
        eor v5.16b, v5.16b, v8.16b
        eor v2.16b, v2.16b, v9.16b
        eor v6.16b, v6.16b, v16.16b
        ushr v8.2d, v1.2d, #2
        eor v3.16b, v3.16b, v18.16b
        ushr v9.2d, v0.2d, #2
        ushr v16.2d, v7.2d, #2
        ushr v18.2d, v2.2d, #2
        eor v8.16b, v8.16b, v4.16b
        eor v9.16b, v9.16b, v6.16b
        eor v16.16b, v16.16b, v5.16b
        eor v18.16b, v18.16b, v3.16b
        and v8.16b, v8.16b, v17.16b
        and v9.16b, v9.16b, v17.16b
        and v16.16b, v16.16b, v17.16b
        and v17.16b, v18.16b, v17.16b
        eor v4.16b, v4.16b, v8.16b
        shl v8.2d, v8.2d, #2
        eor v6.16b, v6.16b, v9.16b
        shl v9.2d, v9.2d, #2
        eor v5.16b, v5.16b, v16.16b
        shl v16.2d, v16.2d, #2
        eor v3.16b, v3.16b, v17.16b
        shl v17.2d, v17.2d, #2
        eor v1.16b, v1.16b, v8.16b
        eor v0.16b, v0.16b, v9.16b
        eor v7.16b, v7.16b, v16.16b
        eor v2.16b, v2.16b, v17.16b
        ushr v8.2d, v4.2d, #4
        ushr v9.2d, v6.2d, #4
        ushr v16.2d, v1.2d, #4
        ushr v17.2d, v0.2d, #4
        eor v8.16b, v8.16b, v5.16b
        eor v9.16b, v9.16b, v3.16b
        eor v16.16b, v16.16b, v7.16b
        eor v17.16b, v17.16b, v2.16b
        and v8.16b, v8.16b, v19.16b
        and v9.16b, v9.16b, v19.16b
        and v16.16b, v16.16b, v19.16b
        and v17.16b, v17.16b, v19.16b
        eor v5.16b, v5.16b, v8.16b
        shl v8.2d, v8.2d, #4
        eor v3.16b, v3.16b, v9.16b
        shl v9.2d, v9.2d, #4
        eor v7.16b, v7.16b, v16.16b
        shl v16.2d, v16.2d, #4
        eor v2.16b, v2.16b, v17.16b
        shl v17.2d, v17.2d, #4
        eor v4.16b, v4.16b, v8.16b
        eor v6.16b, v6.16b, v9.16b
        eor v7.16b, v7.16b, v10.16b
        eor v1.16b, v1.16b, v16.16b
        eor v2.16b, v2.16b, v10.16b
        eor v0.16b, v0.16b, v17.16b
        eor v4.16b, v4.16b, v10.16b
        eor v6.16b, v6.16b, v10.16b
        eor v3.16b, v3.16b, v10.16b
        eor v5.16b, v5.16b, v10.16b
        eor v1.16b, v1.16b, v10.16b
        eor v0.16b, v0.16b, v10.16b
        ret
.size _bsaes_decrypt8,.-_bsaes_decrypt8

.type _bsaes_const,%object
.align 6
_bsaes_const:
// InvShiftRows constants
// Used in _bsaes_decrypt8, which assumes contiguity
// .LM0ISR used with round 0 key
// .LISR   used with middle round keys
// .LISRM0 used with final round key
// (See the note after this object for how these index vectors are used.)
.LM0ISR:
.quad 0x0a0e0206070b0f03, 0x0004080c0d010509
.LISR:
.quad 0x0504070602010003, 0x0f0e0d0c080b0a09
.LISRM0:
.quad 0x01040b0e0205080f, 0x0306090c00070a0d

// ShiftRows constants
// Used in _bsaes_encrypt8, which assumes contiguity
// .LM0SR used with round 0 key
// .LSR   used with middle round keys
// .LSRM0 used with final round key
.LM0SR:
.quad 0x0a0e02060f03070b, 0x0004080c05090d01
.LSR:
.quad 0x0504070600030201, 0x0f0e0d0c0a09080b
.LSRM0:
.quad 0x0304090e00050a0f, 0x01060b0c0207080d

.LM0_bigendian:
.quad 0x02060a0e03070b0f, 0x0004080c0105090d
.LM0_littleendian:
.quad 0x0105090d0004080c, 0x03070b0f02060a0e

// Used in ossl_bsaes_ctr32_encrypt_blocks, prior to dropping into
// _bsaes_encrypt8_alt, for round 0 key in place of .LM0SR
.LREVM0SR:
.quad 0x090d01050c000408, 0x03070b0f060a0e02

.align 6
.size _bsaes_const,.-_bsaes_const
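
// Each pair of quadwords above forms a 16-byte index vector for the
// tbl instruction, which permutes bytes: result byte i is source byte
// idx[i], or zero when idx[i] is out of range. As a rough C model of a
// single-register tbl (illustrative only):
//
//   for (int i = 0; i < 16; i++)
//       dst[i] = (idx[i] < 16) ? src[idx[i]] : 0;
//
// so the tables above express (Inv)ShiftRows combined, where needed,
// with the byte-order adjustment applied around round 0.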

.type _bsaes_encrypt8,%function
.align 4
// On entry:
// x9 -> key (previously expanded using _bsaes_key_convert)
// x10 = number of rounds
// v0-v7 input data
// On exit:
// x9-x11 corrupted
// other general-purpose registers preserved
// v0-v7 output data
// v11-v15 preserved
// other SIMD registers corrupted
_bsaes_encrypt8:
        ldr q8, [x9], #16
        adr x11, .LM0SR
        ldr q9, [x11], #16
_bsaes_encrypt8_alt:
        eor v0.16b, v0.16b, v8.16b
        eor v1.16b, v1.16b, v8.16b
        sub x10, x10, #1
        eor v2.16b, v2.16b, v8.16b
        eor v4.16b, v4.16b, v8.16b
        eor v3.16b, v3.16b, v8.16b
        eor v5.16b, v5.16b, v8.16b
        tbl v0.16b, {v0.16b}, v9.16b
        tbl v1.16b, {v1.16b}, v9.16b
        tbl v2.16b, {v2.16b}, v9.16b
        tbl v4.16b, {v4.16b}, v9.16b
        eor v6.16b, v6.16b, v8.16b
        eor v7.16b, v7.16b, v8.16b
        tbl v3.16b, {v3.16b}, v9.16b
        tbl v5.16b, {v5.16b}, v9.16b
        tbl v6.16b, {v6.16b}, v9.16b
        ushr v8.2d, v0.2d, #1
        movi v10.16b, #0x55
        tbl v7.16b, {v7.16b}, v9.16b
        ushr v9.2d, v4.2d, #1
        movi v16.16b, #0x33
        ushr v17.2d, v2.2d, #1
        eor v8.16b, v8.16b, v1.16b
        movi v18.16b, #0x0f
        ushr v19.2d, v6.2d, #1
        eor v9.16b, v9.16b, v5.16b
        eor v17.16b, v17.16b, v3.16b
        and v8.16b, v8.16b, v10.16b
        eor v19.16b, v19.16b, v7.16b
        and v9.16b, v9.16b, v10.16b
        and v17.16b, v17.16b, v10.16b
        eor v1.16b, v1.16b, v8.16b
        shl v8.2d, v8.2d, #1
        and v10.16b, v19.16b, v10.16b
        eor v5.16b, v5.16b, v9.16b
        shl v9.2d, v9.2d, #1
        eor v3.16b, v3.16b, v17.16b
        shl v17.2d, v17.2d, #1
        eor v0.16b, v0.16b, v8.16b
        shl v8.2d, v10.2d, #1
        eor v7.16b, v7.16b, v10.16b
        eor v4.16b, v4.16b, v9.16b
        eor v2.16b, v2.16b, v17.16b
        ushr v9.2d, v1.2d, #2
        eor v6.16b, v6.16b, v8.16b
        ushr v8.2d, v0.2d, #2
        ushr v10.2d, v5.2d, #2
        ushr v17.2d, v4.2d, #2
        eor v9.16b, v9.16b, v3.16b
        eor v8.16b, v8.16b, v2.16b
        eor v10.16b, v10.16b, v7.16b
        eor v17.16b, v17.16b, v6.16b
        and v9.16b, v9.16b, v16.16b
        and v8.16b, v8.16b, v16.16b
        and v10.16b, v10.16b, v16.16b
        and v16.16b, v17.16b, v16.16b
        eor v3.16b, v3.16b, v9.16b
        shl v9.2d, v9.2d, #2
        eor v2.16b, v2.16b, v8.16b
        shl v8.2d, v8.2d, #2
        eor v7.16b, v7.16b, v10.16b
        shl v10.2d, v10.2d, #2
        eor v6.16b, v6.16b, v16.16b
        shl v16.2d, v16.2d, #2
        eor v1.16b, v1.16b, v9.16b
        eor v0.16b, v0.16b, v8.16b
        eor v5.16b, v5.16b, v10.16b
        eor v4.16b, v4.16b, v16.16b
        ushr v8.2d, v3.2d, #4
        ushr v9.2d, v2.2d, #4
        ushr v10.2d, v1.2d, #4
        ushr v16.2d, v0.2d, #4
        eor v8.16b, v8.16b, v7.16b
        eor v9.16b, v9.16b, v6.16b
        eor v10.16b, v10.16b, v5.16b
        eor v16.16b, v16.16b, v4.16b
        and v8.16b, v8.16b, v18.16b
        and v9.16b, v9.16b, v18.16b
        and v10.16b, v10.16b, v18.16b
        and v16.16b, v16.16b, v18.16b
        eor v7.16b, v7.16b, v8.16b
        shl v8.2d, v8.2d, #4
        eor v6.16b, v6.16b, v9.16b
        shl v9.2d, v9.2d, #4
        eor v5.16b, v5.16b, v10.16b
        shl v10.2d, v10.2d, #4
        eor v4.16b, v4.16b, v16.16b
        shl v16.2d, v16.2d, #4
        eor v3.16b, v3.16b, v8.16b
        eor v2.16b, v2.16b, v9.16b
        eor v1.16b, v1.16b, v10.16b
        eor v0.16b, v0.16b, v16.16b
        b .Lenc_sbox
.align 4
.Lenc_loop:
        ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64
        ldp q8, q9, [x9], #32
        eor v0.16b, v16.16b, v0.16b
        ldr q10, [x9], #16
        eor v1.16b, v17.16b, v1.16b
        ldr q16, [x9], #16
        eor v2.16b, v18.16b, v2.16b
        eor v3.16b, v19.16b, v3.16b
        eor v4.16b, v8.16b, v4.16b
        eor v5.16b, v9.16b, v5.16b
        eor v6.16b, v10.16b, v6.16b
        eor v7.16b, v16.16b, v7.16b
        tbl v0.16b, {v0.16b}, v28.16b
        tbl v1.16b, {v1.16b}, v28.16b
        tbl v2.16b, {v2.16b}, v28.16b
        tbl v3.16b, {v3.16b}, v28.16b
        tbl v4.16b, {v4.16b}, v28.16b
        tbl v5.16b, {v5.16b}, v28.16b
        tbl v6.16b, {v6.16b}, v28.16b
        tbl v7.16b, {v7.16b}, v28.16b
.Lenc_sbox:
        eor v5.16b, v5.16b, v6.16b
        eor v3.16b, v3.16b, v0.16b
        subs x10, x10, #1
        eor v2.16b, v2.16b, v1.16b
        eor v5.16b, v5.16b, v0.16b
        eor v8.16b, v3.16b, v7.16b
        eor v6.16b, v6.16b, v2.16b
        eor v7.16b, v7.16b, v5.16b
        eor v8.16b, v8.16b, v4.16b
        eor v3.16b, v6.16b, v3.16b
        eor v4.16b, v4.16b, v5.16b
        eor v6.16b, v1.16b, v5.16b
        eor v2.16b, v2.16b, v7.16b
        eor v1.16b, v8.16b, v1.16b
        eor v8.16b, v7.16b, v4.16b
        eor v9.16b, v3.16b, v0.16b
        eor v10.16b, v7.16b, v6.16b
        eor v16.16b, v5.16b, v3.16b
        eor v17.16b, v6.16b, v2.16b
        eor v18.16b, v5.16b, v1.16b
        eor v19.16b, v2.16b, v4.16b
        eor v20.16b, v1.16b, v0.16b
        orr v21.16b, v8.16b, v9.16b
        orr v22.16b, v10.16b, v16.16b
        eor v23.16b, v8.16b, v17.16b
        eor v24.16b, v9.16b, v18.16b
        and v19.16b, v19.16b, v20.16b
        orr v20.16b, v17.16b, v18.16b
        and v8.16b, v8.16b, v9.16b
        and v9.16b, v17.16b, v18.16b
        and v17.16b, v23.16b, v24.16b
        and v10.16b, v10.16b, v16.16b
        eor v16.16b, v21.16b, v19.16b
        eor v18.16b, v20.16b, v19.16b
        and v19.16b, v2.16b, v1.16b
        and v20.16b, v6.16b, v5.16b
        eor v21.16b, v22.16b, v17.16b
        eor v9.16b, v9.16b, v10.16b
        eor v10.16b, v16.16b, v17.16b
        eor v16.16b, v18.16b, v8.16b
        and v17.16b, v4.16b, v0.16b
        orr v18.16b, v7.16b, v3.16b
        eor v21.16b, v21.16b, v8.16b
        eor v8.16b, v9.16b, v8.16b
        eor v9.16b, v10.16b, v19.16b
        eor v10.16b, v3.16b, v0.16b
        eor v16.16b, v16.16b, v17.16b
        eor v17.16b, v5.16b, v1.16b
        eor v19.16b, v21.16b, v20.16b
        eor v20.16b, v8.16b, v18.16b
        eor v8.16b, v8.16b, v18.16b
        eor v18.16b, v7.16b, v4.16b
        eor v21.16b, v9.16b, v16.16b
        eor v22.16b, v6.16b, v2.16b
        and v23.16b, v9.16b, v19.16b
        eor v24.16b, v10.16b, v17.16b
        eor v25.16b, v0.16b, v1.16b
        eor v26.16b, v7.16b, v6.16b
        eor v27.16b, v18.16b, v22.16b
        eor v28.16b, v3.16b, v5.16b
        eor v29.16b, v16.16b, v23.16b
        eor v30.16b, v20.16b, v23.16b
        eor v23.16b, v20.16b, v23.16b
        eor v31.16b, v4.16b, v2.16b
        bsl v29.16b, v19.16b, v20.16b
        bsl v30.16b, v9.16b, v16.16b
        bsl v8.16b, v29.16b, v23.16b
        bsl v20.16b, v23.16b, v29.16b
        eor v9.16b, v30.16b, v29.16b
        and v5.16b, v5.16b, v30.16b
        and v8.16b, v8.16b, v30.16b
        and v1.16b, v1.16b, v29.16b
        eor v16.16b, v19.16b, v20.16b
        and v2.16b, v2.16b, v29.16b
        eor v19.16b, v9.16b, v29.16b
        and v17.16b, v17.16b, v9.16b
        eor v8.16b, v8.16b, v21.16b
        and v20.16b, v22.16b, v9.16b
        eor v21.16b, v29.16b, v16.16b
        eor v22.16b, v29.16b, v16.16b
        and v23.16b, v25.16b, v16.16b
        and v6.16b, v6.16b, v19.16b
        eor v25.16b, v8.16b, v16.16b
        eor v29.16b, v30.16b, v8.16b
        and v4.16b, v21.16b, v4.16b
        and v8.16b, v28.16b, v8.16b
        and v0.16b, v22.16b, v0.16b
        eor v21.16b, v23.16b, v1.16b
        eor v22.16b, v9.16b, v25.16b
        eor v9.16b, v9.16b, v25.16b
        eor v23.16b, v25.16b, v16.16b
        and v3.16b, v29.16b, v3.16b
        and v24.16b, v24.16b, v25.16b
        and v25.16b, v27.16b, v25.16b
        and v10.16b, v22.16b, v10.16b
        and v9.16b, v9.16b, v18.16b
        eor v18.16b, v19.16b, v23.16b
        and v19.16b, v26.16b, v23.16b
        eor v3.16b, v5.16b, v3.16b
        eor v17.16b, v17.16b, v24.16b
        eor v10.16b, v24.16b, v10.16b
        and v16.16b, v31.16b, v16.16b
        eor v20.16b, v20.16b, v25.16b
        eor v9.16b, v25.16b, v9.16b
        eor v4.16b, v2.16b, v4.16b
        and v7.16b, v18.16b, v7.16b
        eor v18.16b, v19.16b, v6.16b
        eor v5.16b, v8.16b, v5.16b
        eor v0.16b, v1.16b, v0.16b
        eor v1.16b, v21.16b, v10.16b
        eor v8.16b, v3.16b, v17.16b
        eor v2.16b, v16.16b, v2.16b
        eor v3.16b, v6.16b, v7.16b
        eor v6.16b, v18.16b, v9.16b
        eor v4.16b, v4.16b, v20.16b
        eor v10.16b, v5.16b, v10.16b
        eor v0.16b, v0.16b, v17.16b
        eor v9.16b, v2.16b, v9.16b
        eor v3.16b, v3.16b, v20.16b
        eor v7.16b, v6.16b, v1.16b
        eor v5.16b, v8.16b, v4.16b
        eor v6.16b, v10.16b, v1.16b
        eor v2.16b, v4.16b, v0.16b
        eor v4.16b, v3.16b, v10.16b
        eor v9.16b, v9.16b, v7.16b
        eor v3.16b, v0.16b, v5.16b
        eor v0.16b, v1.16b, v4.16b
        eor v1.16b, v4.16b, v8.16b
        eor v4.16b, v9.16b, v5.16b
        eor v6.16b, v6.16b, v3.16b
        bcc .Lenc_done
        ext v8.16b, v0.16b, v0.16b, #12
        ext v9.16b, v4.16b, v4.16b, #12
        ldr q28, [x11]
        ext v10.16b, v6.16b, v6.16b, #12
        ext v16.16b, v1.16b, v1.16b, #12
        ext v17.16b, v3.16b, v3.16b, #12
        ext v18.16b, v7.16b, v7.16b, #12
        eor v0.16b, v0.16b, v8.16b
        eor v4.16b, v4.16b, v9.16b
        eor v6.16b, v6.16b, v10.16b
        ext v19.16b, v2.16b, v2.16b, #12
        ext v20.16b, v5.16b, v5.16b, #12
        eor v1.16b, v1.16b, v16.16b
        eor v3.16b, v3.16b, v17.16b
        eor v7.16b, v7.16b, v18.16b
        eor v2.16b, v2.16b, v19.16b
        eor v16.16b, v16.16b, v0.16b
        eor v5.16b, v5.16b, v20.16b
        eor v17.16b, v17.16b, v6.16b
        eor v10.16b, v10.16b, v4.16b
        ext v0.16b, v0.16b, v0.16b, #8
        eor v9.16b, v9.16b, v1.16b
        ext v1.16b, v1.16b, v1.16b, #8
        eor v8.16b, v8.16b, v5.16b
        eor v16.16b, v16.16b, v5.16b
        eor v18.16b, v18.16b, v3.16b
        eor v19.16b, v19.16b, v7.16b
        ext v3.16b, v3.16b, v3.16b, #8
        ext v7.16b, v7.16b, v7.16b, #8
        eor v20.16b, v20.16b, v2.16b
        ext v6.16b, v6.16b, v6.16b, #8
        ext v21.16b, v5.16b, v5.16b, #8
        eor v17.16b, v17.16b, v5.16b
        ext v2.16b, v2.16b, v2.16b, #8
        eor v10.16b, v10.16b, v5.16b
        ext v22.16b, v4.16b, v4.16b, #8
        eor v0.16b, v0.16b, v8.16b
        eor v1.16b, v1.16b, v16.16b
        eor v5.16b, v7.16b, v18.16b
        eor v4.16b, v3.16b, v17.16b
        eor v3.16b, v6.16b, v10.16b
        eor v7.16b, v21.16b, v20.16b
        eor v6.16b, v2.16b, v19.16b
        eor v2.16b, v22.16b, v9.16b
        bne .Lenc_loop
        ldr q28, [x11, #16]! // load from .LSRM0 on last round (x10 == 0)
        b .Lenc_loop
.align 4
.Lenc_done:
        ushr v8.2d, v0.2d, #1
        movi v9.16b, #0x55
        ldr q10, [x9]
        ushr v16.2d, v3.2d, #1
        movi v17.16b, #0x33
        ushr v18.2d, v4.2d, #1
        movi v19.16b, #0x0f
        eor v8.16b, v8.16b, v1.16b
        ushr v20.2d, v2.2d, #1
        eor v16.16b, v16.16b, v7.16b
        eor v18.16b, v18.16b, v6.16b
        and v8.16b, v8.16b, v9.16b
        eor v20.16b, v20.16b, v5.16b
        and v16.16b, v16.16b, v9.16b
        and v18.16b, v18.16b, v9.16b
        shl v21.2d, v8.2d, #1
        eor v1.16b, v1.16b, v8.16b
        and v8.16b, v20.16b, v9.16b
        eor v7.16b, v7.16b, v16.16b
        shl v9.2d, v16.2d, #1
        eor v6.16b, v6.16b, v18.16b
        shl v16.2d, v18.2d, #1
        eor v0.16b, v0.16b, v21.16b
        shl v18.2d, v8.2d, #1
        eor v5.16b, v5.16b, v8.16b
        eor v3.16b, v3.16b, v9.16b
        eor v4.16b, v4.16b, v16.16b
        ushr v8.2d, v1.2d, #2
        eor v2.16b, v2.16b, v18.16b
        ushr v9.2d, v0.2d, #2
        ushr v16.2d, v7.2d, #2
        ushr v18.2d, v3.2d, #2
        eor v8.16b, v8.16b, v6.16b
        eor v9.16b, v9.16b, v4.16b
        eor v16.16b, v16.16b, v5.16b
        eor v18.16b, v18.16b, v2.16b
        and v8.16b, v8.16b, v17.16b
        and v9.16b, v9.16b, v17.16b
        and v16.16b, v16.16b, v17.16b
        and v17.16b, v18.16b, v17.16b
        eor v6.16b, v6.16b, v8.16b
        shl v8.2d, v8.2d, #2
        eor v4.16b, v4.16b, v9.16b
        shl v9.2d, v9.2d, #2
        eor v5.16b, v5.16b, v16.16b
        shl v16.2d, v16.2d, #2
        eor v2.16b, v2.16b, v17.16b
        shl v17.2d, v17.2d, #2
        eor v1.16b, v1.16b, v8.16b
        eor v0.16b, v0.16b, v9.16b
        eor v7.16b, v7.16b, v16.16b
        eor v3.16b, v3.16b, v17.16b
        ushr v8.2d, v6.2d, #4
        ushr v9.2d, v4.2d, #4
        ushr v16.2d, v1.2d, #4
        ushr v17.2d, v0.2d, #4
        eor v8.16b, v8.16b, v5.16b
        eor v9.16b, v9.16b, v2.16b
        eor v16.16b, v16.16b, v7.16b
        eor v17.16b, v17.16b, v3.16b
        and v8.16b, v8.16b, v19.16b
        and v9.16b, v9.16b, v19.16b
        and v16.16b, v16.16b, v19.16b
        and v17.16b, v17.16b, v19.16b
        eor v5.16b, v5.16b, v8.16b
        shl v8.2d, v8.2d, #4
        eor v2.16b, v2.16b, v9.16b
        shl v9.2d, v9.2d, #4
        eor v7.16b, v7.16b, v16.16b
        shl v16.2d, v16.2d, #4
        eor v3.16b, v3.16b, v17.16b
        shl v17.2d, v17.2d, #4
        eor v6.16b, v6.16b, v8.16b
        eor v4.16b, v4.16b, v9.16b
        eor v7.16b, v7.16b, v10.16b
        eor v1.16b, v1.16b, v16.16b
        eor v3.16b, v3.16b, v10.16b
        eor v0.16b, v0.16b, v17.16b
        eor v6.16b, v6.16b, v10.16b
        eor v4.16b, v4.16b, v10.16b
        eor v2.16b, v2.16b, v10.16b
        eor v5.16b, v5.16b, v10.16b
        eor v1.16b, v1.16b, v10.16b
        eor v0.16b, v0.16b, v10.16b
        ret
.size _bsaes_encrypt8,.-_bsaes_encrypt8

.type _bsaes_key_convert,%function
.align 4
// On entry:
// x9 -> input key (big-endian)
// x10 = number of rounds
// x17 -> output key (native endianness)
// On exit:
// x9, x10 corrupted
// x11 -> .LM0_bigendian
// x17 -> last quadword of output key
// other general-purpose registers preserved
// v2-v6 preserved
// v7.16b[] = 0x63
// v8-v14 preserved
// v15 = last round key (converted to native endianness)
// other SIMD registers corrupted
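//
// The cmtst ladder below bit-slices each round key (after XORing it
// with the S-box constant 0x63): for every bit position b it produces
// one vector whose byte i is 0xff when bit b of key byte i is set, and
// 0x00 otherwise. A minimal C sketch of the same expansion
// (illustrative only):
//
//   for (int b = 0; b < 8; b++)            // one output vector per bit
//       for (int i = 0; i < 16; i++)
//           out[b][i] = (key[i] & (1u << b)) ? 0xff : 0x00;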
_bsaes_key_convert:
#ifdef __AARCH64EL__
        adr x11, .LM0_littleendian
#else
        adr x11, .LM0_bigendian
#endif
        ldr q0, [x9], #16 // load round 0 key
        ldr q1, [x11] // .LM0
        ldr q15, [x9], #16 // load round 1 key

        movi v7.16b, #0x63 // compose .L63
        movi v16.16b, #0x01 // bit masks
        movi v17.16b, #0x02
        movi v18.16b, #0x04
        movi v19.16b, #0x08
        movi v20.16b, #0x10
        movi v21.16b, #0x20
        movi v22.16b, #0x40
        movi v23.16b, #0x80

#ifdef __AARCH64EL__
        rev32 v0.16b, v0.16b
#endif
        sub x10, x10, #1
        str q0, [x17], #16 // save round 0 key

.align 4
.Lkey_loop:
        tbl v0.16b, {v15.16b}, v1.16b
        ldr q15, [x9], #16 // load next round key

        eor v0.16b, v0.16b, v7.16b
        cmtst v24.16b, v0.16b, v16.16b
        cmtst v25.16b, v0.16b, v17.16b
        cmtst v26.16b, v0.16b, v18.16b
        cmtst v27.16b, v0.16b, v19.16b
        cmtst v28.16b, v0.16b, v20.16b
        cmtst v29.16b, v0.16b, v21.16b
        cmtst v30.16b, v0.16b, v22.16b
        cmtst v31.16b, v0.16b, v23.16b
        sub x10, x10, #1
        st1 {v24.16b-v27.16b}, [x17], #64 // write bit-sliced round key
        st1 {v28.16b-v31.16b}, [x17], #64
        cbnz x10, .Lkey_loop

        // don't save last round key
#ifdef __AARCH64EL__
        rev32 v15.16b, v15.16b
        adr x11, .LM0_bigendian
#endif
        ret
.size _bsaes_key_convert,.-_bsaes_key_convert

.globl ossl_bsaes_cbc_encrypt
.type ossl_bsaes_cbc_encrypt,%function
.align 4
// On entry:
// x0 -> input ciphertext
// x1 -> output plaintext
// x2 = size of ciphertext and plaintext in bytes (assumed a multiple of 16)
// x3 -> key
// x4 -> 128-bit initialisation vector (or preceding 128-bit block of ciphertext if continuing after an earlier call)
// w5 must be == 0
// On exit:
// Output plaintext filled in
// Initialisation vector overwritten with last quadword of ciphertext
// No output registers, usual AAPCS64 register preservation
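//
// Despite its name, this entry point performs CBC decryption only (the
// caller must pass enc == 0). As a reference model of what it computes,
// here is an illustrative C sketch (hypothetical helper name; assumes
// <string.h> and <openssl/aes.h>, whose AES_decrypt this file already
// declares extern):
//
//   // P[i] = AES_Decrypt(key, C[i]) ^ C[i-1], with C[-1] = IV
//   void cbc_decrypt_ref(const AES_KEY *key, const uint8_t *iv,
//                        const uint8_t *c, uint8_t *p, size_t nblocks)
//   {
//       uint8_t prev[16], cur[16];
//       memcpy(prev, iv, 16);
//       for (size_t i = 0; i < nblocks; i++) {
//           memcpy(cur, c + 16*i, 16);            // save C[i]; p may alias c
//           AES_decrypt(c + 16*i, p + 16*i, key); // one-block decrypt
//           for (int j = 0; j < 16; j++) p[16*i + j] ^= prev[j];
//           memcpy(prev, cur, 16);
//       }
//   }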
ossl_bsaes_cbc_encrypt:
        cmp x2, #128
        bhs .Lcbc_do_bsaes
        b AES_cbc_encrypt
.Lcbc_do_bsaes:

        // it is up to the caller to make sure we are called with enc == 0

        stp x29, x30, [sp, #-48]!
        stp d8, d9, [sp, #16]
        stp d10, d15, [sp, #32]
        lsr x2, x2, #4 // len in 16 byte blocks

        ldr w15, [x3, #240] // get # of rounds
        mov x14, sp

        // allocate the key schedule on the stack
        add x17, sp, #96
        sub x17, x17, x15, lsl #7 // 128 bytes per inner round key, less 96 bytes

        // populate the key schedule
        mov x9, x3 // pass key
        mov x10, x15 // pass # of rounds
        mov sp, x17 // sp is sp
        bl _bsaes_key_convert
        ldr q6, [sp]
        str q15, [x17] // save last round key
        eor v6.16b, v6.16b, v7.16b // fix up round 0 key (by XORing with 0x63)
        str q6, [sp]

        ldr q15, [x4] // load IV
        b .Lcbc_dec_loop

.align 4
.Lcbc_dec_loop:
        subs x2, x2, #0x8
        bmi .Lcbc_dec_loop_finish

        ldr q0, [x0], #16 // load input
        mov x9, sp // pass the key
        ldr q1, [x0], #16
        mov x10, x15
        ldr q2, [x0], #16
        ldr q3, [x0], #16
        ldr q4, [x0], #16
        ldr q5, [x0], #16
        ldr q6, [x0], #16
        ldr q7, [x0], #-7*16

        bl _bsaes_decrypt8

        ldr q16, [x0], #16 // reload input
        eor v0.16b, v0.16b, v15.16b // ^= IV
        eor v1.16b, v1.16b, v16.16b
        str q0, [x1], #16 // write output
        ldr q0, [x0], #16
        str q1, [x1], #16
        ldr q1, [x0], #16
        eor v1.16b, v4.16b, v1.16b
        ldr q4, [x0], #16
        eor v2.16b, v2.16b, v4.16b
        eor v0.16b, v6.16b, v0.16b
        ldr q4, [x0], #16
        str q0, [x1], #16
        str q1, [x1], #16
        eor v0.16b, v7.16b, v4.16b
        ldr q1, [x0], #16
        str q2, [x1], #16
        ldr q2, [x0], #16
        ldr q15, [x0], #16
        str q0, [x1], #16
        eor v0.16b, v5.16b, v2.16b
        eor v1.16b, v3.16b, v1.16b
        str q1, [x1], #16
        str q0, [x1], #16

        b .Lcbc_dec_loop

.Lcbc_dec_loop_finish:
        adds x2, x2, #8
        beq .Lcbc_dec_done

        ldr q0, [x0], #16 // load input
        cmp x2, #2
        blo .Lcbc_dec_one
        ldr q1, [x0], #16
        mov x9, sp // pass the key
        mov x10, x15
        beq .Lcbc_dec_two
        ldr q2, [x0], #16
        cmp x2, #4
        blo .Lcbc_dec_three
        ldr q3, [x0], #16
        beq .Lcbc_dec_four
        ldr q4, [x0], #16
        cmp x2, #6
        blo .Lcbc_dec_five
        ldr q5, [x0], #16
        beq .Lcbc_dec_six
        ldr q6, [x0], #-6*16

        bl _bsaes_decrypt8

        ldr q5, [x0], #16 // reload input
        eor v0.16b, v0.16b, v15.16b // ^= IV
        ldr q8, [x0], #16
        ldr q9, [x0], #16
        ldr q10, [x0], #16
        str q0, [x1], #16 // write output
        ldr q0, [x0], #16
        eor v1.16b, v1.16b, v5.16b
        ldr q5, [x0], #16
        eor v6.16b, v6.16b, v8.16b
        ldr q15, [x0]
        eor v4.16b, v4.16b, v9.16b
        eor v2.16b, v2.16b, v10.16b
        str q1, [x1], #16
        eor v0.16b, v7.16b, v0.16b
        str q6, [x1], #16
        eor v1.16b, v3.16b, v5.16b
        str q4, [x1], #16
        str q2, [x1], #16
        str q0, [x1], #16
        str q1, [x1]
        b .Lcbc_dec_done
.align 4
.Lcbc_dec_six:
        sub x0, x0, #0x60
        bl _bsaes_decrypt8
        ldr q3, [x0], #16 // reload input
        eor v0.16b, v0.16b, v15.16b // ^= IV
        ldr q5, [x0], #16
        ldr q8, [x0], #16
        ldr q9, [x0], #16
        str q0, [x1], #16 // write output
        ldr q0, [x0], #16
        eor v1.16b, v1.16b, v3.16b
        ldr q15, [x0]
        eor v3.16b, v6.16b, v5.16b
        eor v4.16b, v4.16b, v8.16b
        eor v2.16b, v2.16b, v9.16b
        str q1, [x1], #16
        eor v0.16b, v7.16b, v0.16b
        str q3, [x1], #16
        str q4, [x1], #16
        str q2, [x1], #16
        str q0, [x1]
        b .Lcbc_dec_done
.align 4
.Lcbc_dec_five:
        sub x0, x0, #0x50
        bl _bsaes_decrypt8
        ldr q3, [x0], #16 // reload input
        eor v0.16b, v0.16b, v15.16b // ^= IV
        ldr q5, [x0], #16
        ldr q7, [x0], #16
        ldr q8, [x0], #16
        str q0, [x1], #16 // write output
        ldr q15, [x0]
        eor v0.16b, v1.16b, v3.16b
        eor v1.16b, v6.16b, v5.16b
        eor v3.16b, v4.16b, v7.16b
        str q0, [x1], #16
        eor v0.16b, v2.16b, v8.16b
        str q1, [x1], #16
        str q3, [x1], #16
        str q0, [x1]
        b .Lcbc_dec_done
.align 4
.Lcbc_dec_four:
        sub x0, x0, #0x40
        bl _bsaes_decrypt8
        ldr q2, [x0], #16 // reload input
        eor v0.16b, v0.16b, v15.16b // ^= IV
        ldr q3, [x0], #16
        ldr q5, [x0], #16
        str q0, [x1], #16 // write output
        ldr q15, [x0]
        eor v0.16b, v1.16b, v2.16b
        eor v1.16b, v6.16b, v3.16b
        eor v2.16b, v4.16b, v5.16b
        str q0, [x1], #16
        str q1, [x1], #16
        str q2, [x1]
        b .Lcbc_dec_done
.align 4
.Lcbc_dec_three:
        sub x0, x0, #0x30
        bl _bsaes_decrypt8
        ldr q2, [x0], #16 // reload input
        eor v0.16b, v0.16b, v15.16b // ^= IV
        ldr q3, [x0], #16
        ldr q15, [x0]
        str q0, [x1], #16 // write output
        eor v0.16b, v1.16b, v2.16b
        eor v1.16b, v6.16b, v3.16b
        str q0, [x1], #16
        str q1, [x1]
        b .Lcbc_dec_done
.align 4
.Lcbc_dec_two:
        sub x0, x0, #0x20
        bl _bsaes_decrypt8
        ldr q2, [x0], #16 // reload input
        eor v0.16b, v0.16b, v15.16b // ^= IV
        ldr q15, [x0]
        str q0, [x1], #16 // write output
        eor v0.16b, v1.16b, v2.16b
        str q0, [x1]
        b .Lcbc_dec_done
.align 4
.Lcbc_dec_one:
        sub x0, x0, #0x10
        stp x1, x4, [sp, #-32]!
        str x14, [sp, #16]
        mov v8.16b, v15.16b
        mov v15.16b, v0.16b
        mov x2, x3
        bl AES_decrypt
        ldr x14, [sp, #16]
        ldp x1, x4, [sp], #32
        ldr q0, [x1] // load result
        eor v0.16b, v0.16b, v8.16b // ^= IV
        str q0, [x1] // write output

.align 4
.Lcbc_dec_done:
        movi v0.16b, #0
        movi v1.16b, #0
.Lcbc_dec_bzero: // wipe key schedule [if any]
        stp q0, q1, [sp], #32
        cmp sp, x14
        bne .Lcbc_dec_bzero
        str q15, [x4] // return IV
        ldp d8, d9, [sp, #16]
        ldp d10, d15, [sp, #32]
        ldp x29, x30, [sp], #48
        ret
.size ossl_bsaes_cbc_encrypt,.-ossl_bsaes_cbc_encrypt

.globl ossl_bsaes_ctr32_encrypt_blocks
.type ossl_bsaes_ctr32_encrypt_blocks,%function
.align 4
// On entry:
// x0 -> input text (whole 16-byte blocks)
// x1 -> output text (whole 16-byte blocks)
// x2 = number of 16-byte blocks to encrypt/decrypt (> 0)
// x3 -> key
// x4 -> initial value of 128-bit counter (stored big-endian) which increments, modulo 2^32, for each block
// On exit:
// Output text filled in
// No output registers, usual AAPCS64 register preservation
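//
// Reference model: each block is XORed with AES(K, counter), and only
// the low (last) 32 bits of the big-endian counter wrap. An illustrative
// C sketch (hypothetical name; AES_encrypt is the extern declared above):
//
//   void ctr32_ref(const AES_KEY *key, uint8_t ctr[16],
//                  const uint8_t *in, uint8_t *out, size_t nblocks)
//   {
//       for (size_t i = 0; i < nblocks; i++) {
//           uint8_t ks[16];
//           AES_encrypt(ctr, ks, key);                 // keystream block
//           for (int j = 0; j < 16; j++) out[16*i+j] = in[16*i+j] ^ ks[j];
//           for (int j = 15; j >= 12; j--)             // big-endian ++,
//               if (++ctr[j] != 0) break;              // wraps modulo 2^32
//       }
//   }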
ossl_bsaes_ctr32_encrypt_blocks:

        cmp x2, #8 // use plain AES for
        blo .Lctr_enc_short // small sizes

        stp x29, x30, [sp, #-80]!
        stp d8, d9, [sp, #16]
        stp d10, d11, [sp, #32]
        stp d12, d13, [sp, #48]
        stp d14, d15, [sp, #64]

        ldr w15, [x3, #240] // get # of rounds
        mov x14, sp

        // allocate the key schedule on the stack
        add x17, sp, #96
        sub x17, x17, x15, lsl #7 // 128 bytes per inner round key, less 96 bytes

        // populate the key schedule
        mov x9, x3 // pass key
        mov x10, x15 // pass # of rounds
        mov sp, x17 // sp is sp
        bl _bsaes_key_convert
        eor v7.16b, v7.16b, v15.16b // fix up last round key
        str q7, [x17] // save last round key

        ldr q0, [x4] // load counter
        add x13, x11, #.LREVM0SR-.LM0_bigendian
        ldr q4, [sp] // load round0 key

        movi v8.4s, #1 // compose 1<<96
        movi v9.16b, #0
        rev32 v15.16b, v0.16b
        rev32 v0.16b, v0.16b
        ext v11.16b, v9.16b, v8.16b, #4
        rev32 v4.16b, v4.16b
        add v12.4s, v11.4s, v11.4s // compose 2<<96
        str q4, [sp] // save adjusted round0 key
        add v13.4s, v11.4s, v12.4s // compose 3<<96
        add v14.4s, v12.4s, v12.4s // compose 4<<96
        b .Lctr_enc_loop

.align 4
.Lctr_enc_loop:
        // Intermix prologue from _bsaes_encrypt8 to use the opportunity
        // to flip byte order in 32-bit counter

        add v1.4s, v15.4s, v11.4s // +1
        add x9, sp, #0x10 // pass next round key
        add v2.4s, v15.4s, v12.4s // +2
        ldr q9, [x13] // .LREVM0SR
        ldr q8, [sp] // load round0 key
        add v3.4s, v15.4s, v13.4s // +3
        mov x10, x15 // pass rounds
        sub x11, x13, #.LREVM0SR-.LSR // pass constants
        add v6.4s, v2.4s, v14.4s
        add v4.4s, v15.4s, v14.4s // +4
        add v7.4s, v3.4s, v14.4s
        add v15.4s, v4.4s, v14.4s // next counter
        add v5.4s, v1.4s, v14.4s

        bl _bsaes_encrypt8_alt

        subs x2, x2, #8
        blo .Lctr_enc_loop_done

        ldr q16, [x0], #16
        ldr q17, [x0], #16
        eor v1.16b, v1.16b, v17.16b
        ldr q17, [x0], #16
        eor v0.16b, v0.16b, v16.16b
        eor v4.16b, v4.16b, v17.16b
        str q0, [x1], #16
        ldr q16, [x0], #16
        str q1, [x1], #16
        mov v0.16b, v15.16b
        str q4, [x1], #16
        ldr q1, [x0], #16
        eor v4.16b, v6.16b, v16.16b
        eor v1.16b, v3.16b, v1.16b
        ldr q3, [x0], #16
        eor v3.16b, v7.16b, v3.16b
        ldr q6, [x0], #16
        eor v2.16b, v2.16b, v6.16b
        ldr q6, [x0], #16
        eor v5.16b, v5.16b, v6.16b
        str q4, [x1], #16
        str q1, [x1], #16
        str q3, [x1], #16
        str q2, [x1], #16
        str q5, [x1], #16

        bne .Lctr_enc_loop
        b .Lctr_enc_done

.align 4
.Lctr_enc_loop_done:
        add x2, x2, #8
        ldr q16, [x0], #16 // load input
        eor v0.16b, v0.16b, v16.16b
        str q0, [x1], #16 // write output
        cmp x2, #2
        blo .Lctr_enc_done
        ldr q17, [x0], #16
        eor v1.16b, v1.16b, v17.16b
        str q1, [x1], #16
        beq .Lctr_enc_done
        ldr q18, [x0], #16
        eor v4.16b, v4.16b, v18.16b
        str q4, [x1], #16
        cmp x2, #4
        blo .Lctr_enc_done
        ldr q19, [x0], #16
        eor v6.16b, v6.16b, v19.16b
        str q6, [x1], #16
        beq .Lctr_enc_done
        ldr q20, [x0], #16
        eor v3.16b, v3.16b, v20.16b
        str q3, [x1], #16
        cmp x2, #6
        blo .Lctr_enc_done
        ldr q21, [x0], #16
        eor v7.16b, v7.16b, v21.16b
        str q7, [x1], #16
        beq .Lctr_enc_done
        ldr q22, [x0]
        eor v2.16b, v2.16b, v22.16b
        str q2, [x1], #16

.Lctr_enc_done:
        movi v0.16b, #0
        movi v1.16b, #0
.Lctr_enc_bzero: // wipe key schedule [if any]
        stp q0, q1, [sp], #32
        cmp sp, x14
        bne .Lctr_enc_bzero

        ldp d8, d9, [sp, #16]
        ldp d10, d11, [sp, #32]
        ldp d12, d13, [sp, #48]
        ldp d14, d15, [sp, #64]
        ldp x29, x30, [sp], #80
        ret

.Lctr_enc_short:
        stp x29, x30, [sp, #-96]!
        stp x19, x20, [sp, #16]
        stp x21, x22, [sp, #32]
        str x23, [sp, #48]

        mov x19, x0 // copy arguments
        mov x20, x1
        mov x21, x2
        mov x22, x3
        ldr w23, [x4, #12] // load counter LSW
        ldr q1, [x4] // load whole counter value
#ifdef __AARCH64EL__
        rev w23, w23
#endif
        str q1, [sp, #80] // copy counter value

.Lctr_enc_short_loop:
        add x0, sp, #80 // input counter value
        add x1, sp, #64 // output on the stack
        mov x2, x22 // key

        bl AES_encrypt

        ldr q0, [x19], #16 // load input
        ldr q1, [sp, #64] // load encrypted counter
        add x23, x23, #1
#ifdef __AARCH64EL__
        rev w0, w23
        str w0, [sp, #80+12] // next counter value
#else
        str w23, [sp, #80+12] // next counter value
#endif
        eor v0.16b, v0.16b, v1.16b
        str q0, [x20], #16 // store output
        subs x21, x21, #1
        bne .Lctr_enc_short_loop

        movi v0.16b, #0
        movi v1.16b, #0
        stp q0, q1, [sp, #64]

        ldr x23, [sp, #48]
        ldp x21, x22, [sp, #32]
        ldp x19, x20, [sp, #16]
        ldp x29, x30, [sp], #96
        ret
.size ossl_bsaes_ctr32_encrypt_blocks,.-ossl_bsaes_ctr32_encrypt_blocks

.globl ossl_bsaes_xts_encrypt
.type ossl_bsaes_xts_encrypt,%function
.align 4
// On entry:
// x0 -> input plaintext
// x1 -> output ciphertext
// x2 -> length of text in bytes (must be at least 16)
// x3 -> key1 (used to encrypt the XORed plaintext blocks)
// x4 -> key2 (used to encrypt the initial vector to yield the initial tweak)
// x5 -> 16-byte initial vector (typically, sector number)
// On exit:
// Output ciphertext filled in
// No output registers, usual AAPCS64 register preservation
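//
// Reference model for the XTS mode implemented below (illustrative
// sketch only):
//
//   T[0]   = AES_Encrypt(key2, IV)
//   C[i]   = AES_Encrypt(key1, P[i] ^ T[i]) ^ T[i]
//   T[i+1] = T[i] * alpha in GF(2^128)
//
// (see the note at .Lxts_magic below for the tweak doubling step);
// a trailing partial block is handled by ciphertext stealing.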
ossl_bsaes_xts_encrypt:
        // Stack layout:
        // sp ->
        //         nrounds*128-96 bytes: key schedule
        // x19 ->
        //         16 bytes: frame record
        //         4*16 bytes: tweak storage across _bsaes_encrypt8
        //         6*8 bytes: storage for 5 callee-saved general-purpose registers
        //         8*8 bytes: storage for 8 callee-saved SIMD registers
        stp x29, x30, [sp, #-192]!
        stp x19, x20, [sp, #80]
        stp x21, x22, [sp, #96]
        str x23, [sp, #112]
        stp d8, d9, [sp, #128]
        stp d10, d11, [sp, #144]
        stp d12, d13, [sp, #160]
        stp d14, d15, [sp, #176]

        mov x19, sp
        mov x20, x0
        mov x21, x1
        mov x22, x2
        mov x23, x3

        // generate initial tweak
        sub sp, sp, #16
        mov x0, x5 // iv[]
        mov x1, sp
        mov x2, x4 // key2
        bl AES_encrypt
        ldr q11, [sp], #16

        ldr w1, [x23, #240] // get # of rounds
        // allocate the key schedule on the stack
        add x17, sp, #96
        sub x17, x17, x1, lsl #7 // 128 bytes per inner round key, less 96 bytes

        // populate the key schedule
        mov x9, x23 // pass key
        mov x10, x1 // pass # of rounds
        mov sp, x17
        bl _bsaes_key_convert
        eor v15.16b, v15.16b, v7.16b // fix up last round key
        str q15, [x17] // save last round key

        subs x22, x22, #0x80
        blo .Lxts_enc_short
        b .Lxts_enc_loop

.align 4
.Lxts_enc_loop:
        ldr q8, .Lxts_magic
        mov x10, x1 // pass rounds
        add x2, x19, #16
        ldr q0, [x20], #16
        sshr v1.2d, v11.2d, #63
        mov x9, sp // pass key schedule
        ldr q6, .Lxts_magic+16
        add v2.2d, v11.2d, v11.2d
        cmtst v3.2d, v11.2d, v6.2d
        and v1.16b, v1.16b, v8.16b
        ext v1.16b, v1.16b, v1.16b, #8
        and v3.16b, v3.16b, v8.16b
        ldr q4, [x20], #16
        eor v12.16b, v2.16b, v1.16b
        eor v1.16b, v4.16b, v12.16b
        eor v0.16b, v0.16b, v11.16b
        cmtst v2.2d, v12.2d, v6.2d
        add v4.2d, v12.2d, v12.2d
        add x0, x19, #16
        ext v3.16b, v3.16b, v3.16b, #8
        and v2.16b, v2.16b, v8.16b
        eor v13.16b, v4.16b, v3.16b
        ldr q3, [x20], #16
        ext v4.16b, v2.16b, v2.16b, #8
        eor v2.16b, v3.16b, v13.16b
        ldr q3, [x20], #16
        add v5.2d, v13.2d, v13.2d
        cmtst v7.2d, v13.2d, v6.2d
        and v7.16b, v7.16b, v8.16b
        ldr q9, [x20], #16
        ext v7.16b, v7.16b, v7.16b, #8
        ldr q10, [x20], #16
        eor v14.16b, v5.16b, v4.16b
        ldr q16, [x20], #16
        add v4.2d, v14.2d, v14.2d
        eor v3.16b, v3.16b, v14.16b
        eor v15.16b, v4.16b, v7.16b
        add v5.2d, v15.2d, v15.2d
        ldr q7, [x20], #16
        cmtst v4.2d, v14.2d, v6.2d
        and v17.16b, v4.16b, v8.16b
        cmtst v18.2d, v15.2d, v6.2d
        eor v4.16b, v9.16b, v15.16b
        ext v9.16b, v17.16b, v17.16b, #8
        eor v9.16b, v5.16b, v9.16b
        add v17.2d, v9.2d, v9.2d
        and v18.16b, v18.16b, v8.16b
        eor v5.16b, v10.16b, v9.16b
        str q9, [x2], #16
        ext v10.16b, v18.16b, v18.16b, #8
        cmtst v9.2d, v9.2d, v6.2d
        and v9.16b, v9.16b, v8.16b
        eor v10.16b, v17.16b, v10.16b
        cmtst v17.2d, v10.2d, v6.2d
        eor v6.16b, v16.16b, v10.16b
        str q10, [x2], #16
        ext v9.16b, v9.16b, v9.16b, #8
        add v10.2d, v10.2d, v10.2d
        eor v9.16b, v10.16b, v9.16b
        str q9, [x2], #16
        eor v7.16b, v7.16b, v9.16b
        add v9.2d, v9.2d, v9.2d
        and v8.16b, v17.16b, v8.16b
        ext v8.16b, v8.16b, v8.16b, #8
        eor v8.16b, v9.16b, v8.16b
        str q8, [x2] // next round tweak

        bl _bsaes_encrypt8

        ldr q8, [x0], #16
        eor v0.16b, v0.16b, v11.16b
        eor v1.16b, v1.16b, v12.16b
        ldr q9, [x0], #16
        eor v4.16b, v4.16b, v13.16b
        eor v6.16b, v6.16b, v14.16b
        ldr q10, [x0], #16
        eor v3.16b, v3.16b, v15.16b
        subs x22, x22, #0x80
        str q0, [x21], #16
        ldr q11, [x0] // next round tweak
        str q1, [x21], #16
        eor v0.16b, v7.16b, v8.16b
        eor v1.16b, v2.16b, v9.16b
        str q4, [x21], #16
        eor v2.16b, v5.16b, v10.16b
        str q6, [x21], #16
        str q3, [x21], #16
        str q0, [x21], #16
        str q1, [x21], #16
        str q2, [x21], #16
        bpl .Lxts_enc_loop

.Lxts_enc_short:
        adds x22, x22, #0x70
        bmi .Lxts_enc_done

        ldr q8, .Lxts_magic
        sshr v1.2d, v11.2d, #63
        add v2.2d, v11.2d, v11.2d
        ldr q9, .Lxts_magic+16
        subs x22, x22, #0x10
        ldr q0, [x20], #16
        and v1.16b, v1.16b, v8.16b
        cmtst v3.2d, v11.2d, v9.2d
        ext v1.16b, v1.16b, v1.16b, #8
        and v3.16b, v3.16b, v8.16b
        eor v12.16b, v2.16b, v1.16b
        ext v1.16b, v3.16b, v3.16b, #8
        add v2.2d, v12.2d, v12.2d
        cmtst v3.2d, v12.2d, v9.2d
        eor v13.16b, v2.16b, v1.16b
        and v22.16b, v3.16b, v8.16b
        bmi .Lxts_enc_1

        ext v2.16b, v22.16b, v22.16b, #8
        add v3.2d, v13.2d, v13.2d
        ldr q1, [x20], #16
        cmtst v4.2d, v13.2d, v9.2d
        subs x22, x22, #0x10
        eor v14.16b, v3.16b, v2.16b
        and v23.16b, v4.16b, v8.16b
        bmi .Lxts_enc_2

        ext v3.16b, v23.16b, v23.16b, #8
        add v4.2d, v14.2d, v14.2d
        ldr q2, [x20], #16
        cmtst v5.2d, v14.2d, v9.2d
        eor v0.16b, v0.16b, v11.16b
        subs x22, x22, #0x10
        eor v15.16b, v4.16b, v3.16b
        and v24.16b, v5.16b, v8.16b
        bmi .Lxts_enc_3

        ext v4.16b, v24.16b, v24.16b, #8
        add v5.2d, v15.2d, v15.2d
        ldr q3, [x20], #16
        cmtst v6.2d, v15.2d, v9.2d
        eor v1.16b, v1.16b, v12.16b
        subs x22, x22, #0x10
        eor v16.16b, v5.16b, v4.16b
        and v25.16b, v6.16b, v8.16b
        bmi .Lxts_enc_4

        ext v5.16b, v25.16b, v25.16b, #8
        add v6.2d, v16.2d, v16.2d
        add x0, x19, #16
        cmtst v7.2d, v16.2d, v9.2d
        ldr q4, [x20], #16
        eor v2.16b, v2.16b, v13.16b
        str q16, [x0], #16
        subs x22, x22, #0x10
        eor v17.16b, v6.16b, v5.16b
        and v26.16b, v7.16b, v8.16b
        bmi .Lxts_enc_5

        ext v7.16b, v26.16b, v26.16b, #8
        add v18.2d, v17.2d, v17.2d
        ldr q5, [x20], #16
        eor v3.16b, v3.16b, v14.16b
        str q17, [x0], #16
        subs x22, x22, #0x10
        eor v18.16b, v18.16b, v7.16b
        bmi .Lxts_enc_6

        ldr q6, [x20], #16
        eor v4.16b, v4.16b, v15.16b
        eor v5.16b, v5.16b, v16.16b
        str q18, [x0] // next round tweak
        mov x9, sp // pass key schedule
        mov x10, x1
        add x0, x19, #16
        sub x22, x22, #0x10
        eor v6.16b, v6.16b, v17.16b

        bl _bsaes_encrypt8

        ldr q16, [x0], #16
        eor v0.16b, v0.16b, v11.16b
        eor v1.16b, v1.16b, v12.16b
        ldr q17, [x0], #16
        eor v4.16b, v4.16b, v13.16b
        eor v6.16b, v6.16b, v14.16b
        eor v3.16b, v3.16b, v15.16b
        ldr q11, [x0] // next round tweak
        str q0, [x21], #16
        str q1, [x21], #16
        eor v0.16b, v7.16b, v16.16b
        eor v1.16b, v2.16b, v17.16b
        str q4, [x21], #16
        str q6, [x21], #16
        str q3, [x21], #16
        str q0, [x21], #16
        str q1, [x21], #16
        b .Lxts_enc_done

.align 4
.Lxts_enc_6:
        eor v4.16b, v4.16b, v15.16b
        eor v5.16b, v5.16b, v16.16b
        mov x9, sp // pass key schedule
        mov x10, x1 // pass rounds
        add x0, x19, #16

        bl _bsaes_encrypt8

        ldr q16, [x0], #16
        eor v0.16b, v0.16b, v11.16b
        eor v1.16b, v1.16b, v12.16b
        eor v4.16b, v4.16b, v13.16b
        eor v6.16b, v6.16b, v14.16b
        ldr q11, [x0] // next round tweak
        eor v3.16b, v3.16b, v15.16b
        str q0, [x21], #16
        str q1, [x21], #16
        eor v0.16b, v7.16b, v16.16b
        str q4, [x21], #16
        str q6, [x21], #16
        str q3, [x21], #16
        str q0, [x21], #16
        b .Lxts_enc_done

.align 4
.Lxts_enc_5:
        eor v3.16b, v3.16b, v14.16b
        eor v4.16b, v4.16b, v15.16b
        mov x9, sp // pass key schedule
        mov x10, x1 // pass rounds
        add x0, x19, #16

        bl _bsaes_encrypt8

        eor v0.16b, v0.16b, v11.16b
        eor v1.16b, v1.16b, v12.16b
        ldr q11, [x0] // next round tweak
        eor v4.16b, v4.16b, v13.16b
        eor v6.16b, v6.16b, v14.16b
        eor v3.16b, v3.16b, v15.16b
        str q0, [x21], #16
        str q1, [x21], #16
        str q4, [x21], #16
        str q6, [x21], #16
        str q3, [x21], #16
        b .Lxts_enc_done

.align 4
.Lxts_enc_4:
        eor v2.16b, v2.16b, v13.16b
        eor v3.16b, v3.16b, v14.16b
        mov x9, sp // pass key schedule
        mov x10, x1 // pass rounds
        add x0, x19, #16

        bl _bsaes_encrypt8

        eor v0.16b, v0.16b, v11.16b
        eor v1.16b, v1.16b, v12.16b
        eor v4.16b, v4.16b, v13.16b
        eor v6.16b, v6.16b, v14.16b
        mov v11.16b, v15.16b // next round tweak
        str q0, [x21], #16
        str q1, [x21], #16
        str q4, [x21], #16
        str q6, [x21], #16
        b .Lxts_enc_done

.align 4
.Lxts_enc_3:
        eor v1.16b, v1.16b, v12.16b
        eor v2.16b, v2.16b, v13.16b
        mov x9, sp // pass key schedule
        mov x10, x1 // pass rounds
        add x0, x19, #16

        bl _bsaes_encrypt8

        eor v0.16b, v0.16b, v11.16b
        eor v1.16b, v1.16b, v12.16b
        eor v4.16b, v4.16b, v13.16b
        mov v11.16b, v14.16b // next round tweak
        str q0, [x21], #16
        str q1, [x21], #16
        str q4, [x21], #16
        b .Lxts_enc_done

.align 4
.Lxts_enc_2:
        eor v0.16b, v0.16b, v11.16b
        eor v1.16b, v1.16b, v12.16b
        mov x9, sp // pass key schedule
        mov x10, x1 // pass rounds
        add x0, x19, #16

        bl _bsaes_encrypt8

        eor v0.16b, v0.16b, v11.16b
        eor v1.16b, v1.16b, v12.16b
        mov v11.16b, v13.16b // next round tweak
        str q0, [x21], #16
        str q1, [x21], #16
        b .Lxts_enc_done

.align 4
.Lxts_enc_1:
        eor v0.16b, v0.16b, v11.16b
        sub x0, sp, #16
        sub x1, sp, #16
        mov x2, x23
        mov v13.d[0], v11.d[1] // just in case AES_encrypt corrupts top half of callee-saved SIMD registers
        mov v14.d[0], v12.d[1]
        str q0, [sp, #-16]!

        bl AES_encrypt

        ldr q0, [sp], #16
        trn1 v13.2d, v11.2d, v13.2d
        trn1 v11.2d, v12.2d, v14.2d // next round tweak
        eor v0.16b, v0.16b, v13.16b
        str q0, [x21], #16

.Lxts_enc_done:
        adds x22, x22, #0x10
        beq .Lxts_enc_ret

        sub x6, x21, #0x10
        // Penultimate plaintext block produces final ciphertext part-block
        // plus remaining part of final plaintext block. Move ciphertext part
        // to final position and reuse penultimate ciphertext block buffer to
        // construct final plaintext block
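        //
        // As a C sketch of the byte-stealing swap below (illustrative
        // only; `out` points just past the penultimate ciphertext block):
        //
        //   for (size_t i = 0; i < tail; i++) {
        //       uint8_t p = *in++;      // next remaining plaintext byte
        //       uint8_t c = out[i - 16]; // byte of penultimate ciphertext
        //       out[i - 16] = p;        // splice plaintext into that block
        //       out[i] = c;             // stolen byte forms final part-block
        //   }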
.Lxts_enc_steal:
        ldrb w0, [x20], #1
        ldrb w1, [x21, #-0x10]
        strb w0, [x21, #-0x10]
        strb w1, [x21], #1

        subs x22, x22, #1
        bhi .Lxts_enc_steal

        // Finally encrypt the penultimate ciphertext block using the
        // last tweak
        ldr q0, [x6]
        eor v0.16b, v0.16b, v11.16b
        str q0, [sp, #-16]!
        mov x0, sp
        mov x1, sp
        mov x2, x23
        mov x21, x6
        mov v13.d[0], v11.d[1] // just in case AES_encrypt corrupts top half of callee-saved SIMD registers

        bl AES_encrypt

        trn1 v11.2d, v11.2d, v13.2d
        ldr q0, [sp], #16
        eor v0.16b, v0.16b, v11.16b
        str q0, [x21]

.Lxts_enc_ret:

        movi v0.16b, #0
        movi v1.16b, #0
.Lxts_enc_bzero: // wipe key schedule
        stp q0, q1, [sp], #32
        cmp sp, x19
        bne .Lxts_enc_bzero

        ldp x19, x20, [sp, #80]
        ldp x21, x22, [sp, #96]
        ldr x23, [sp, #112]
        ldp d8, d9, [sp, #128]
        ldp d10, d11, [sp, #144]
        ldp d12, d13, [sp, #160]
        ldp d14, d15, [sp, #176]
        ldp x29, x30, [sp], #192
        ret
.size ossl_bsaes_xts_encrypt,.-ossl_bsaes_xts_encrypt

// The assembler doesn't seem capable of de-duplicating these when
// expressed using `ldr qd,=` syntax, so assign a symbolic address
.align 5
.Lxts_magic:
.quad 1, 0x87, 0x4000000000000000, 0x4000000000000000

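// The (1, 0x87) pair above drives the tweak update: doubling in
// GF(2^128) shifts the 128-bit tweak left by one bit and, on carry-out,
// folds in the reduction constant 0x87. A byte-oriented C sketch of the
// same step (illustrative only):
//
//   void xts_double_tweak(uint8_t t[16]) {
//       unsigned carry = 0;
//       for (int i = 0; i < 16; i++) {      // little-endian 128-bit shift
//           unsigned c = t[i] >> 7;
//           t[i] = (uint8_t)((t[i] << 1) | carry);
//           carry = c;
//       }
//       if (carry)
//           t[0] ^= 0x87;                   // reduce mod x^128+x^7+x^2+x+1
//   }
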
1910.globl ossl_bsaes_xts_decrypt
1911.type ossl_bsaes_xts_decrypt,%function
1912.align 4
1913// On entry:
1914// x0 -> input ciphertext
1915// x1 -> output plaintext
1916// x2 -> length of text in bytes (must be at least 16)
1917// x3 -> key1 (used to decrypt the XORed ciphertext blocks)
1918// x4 -> key2 (used to encrypt the initial vector to yield the initial tweak)
1919// x5 -> 16-byte initial vector (typically, sector number)
1920// On exit:
1921// Output plaintext filled in
1922// No output registers, usual AAPCS64 register preservation
1923ossl_bsaes_xts_decrypt:
1924 // Stack layout:
1925 // sp ->
1926 // nrounds*128-96 bytes: key schedule
1927 // x19 ->
1928 // 16 bytes: frame record
1929 // 4*16 bytes: tweak storage across _bsaes_decrypt8
1930 // 6*8 bytes: storage for 5 callee-saved general-purpose registers (one slot is alignment padding)
1931 // 8*8 bytes: storage for 8 callee-saved SIMD registers
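 // (16 + 4*16 + 6*8 + 8*8 = 192 bytes, matching the #-192 pre-decrement
 // below)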
1932 stp x29, x30, [sp, #-192]!
1933 stp x19, x20, [sp, #80]
1934 stp x21, x22, [sp, #96]
1935 str x23, [sp, #112]
1936 stp d8, d9, [sp, #128]
1937 stp d10, d11, [sp, #144]
1938 stp d12, d13, [sp, #160]
1939 stp d14, d15, [sp, #176]
1940
1941 mov x19, sp
1942 mov x20, x0
1943 mov x21, x1
1944 mov x22, x2
1945 mov x23, x3
1946
1947 // generate initial tweak
1948 sub sp, sp, #16
1949 mov x0, x5 // iv[]
1950 mov x1, sp
1951 mov x2, x4 // key2
1952 bl AES_encrypt
1953 ldr q11, [sp], #16
1954
1955 ldr w1, [x23, #240] // get # of rounds
1956 // allocate the key schedule on the stack
1957 add x17, sp, #96
1958 sub x17, x17, x1, lsl #7 // 128 bytes per inner round key, less 96 bytes
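 // e.g. AES-128 has 10 rounds, so the schedule occupies 10*128 - 96 = 1184 bytes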
1959
1960 // populate the key schedule
1961 mov x9, x23 // pass key
1962 mov x10, x1 // pass # of rounds
1963 mov sp, x17
1964 bl _bsaes_key_convert
1965 ldr q6, [sp]
1966 str q15, [x17] // save last round key
1967 eor v6.16b, v6.16b, v7.16b // fix up round 0 key (by XORing with 0x63)
1968 str q6, [sp]
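 // (0x63 is the affine constant of the AES S-box; the bit-sliced key
 // conversion folds it into the round keys so the bit-sliced S-box can
 // omit it, but the round 0 key is applied as a plain XOR and so must
 // shed the constant again here)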
1969
1970 sub x30, x22, #0x10
1971 tst x22, #0xf // if not multiple of 16
1972 csel x22, x30, x22, ne // subtract another 16 bytes
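 // e.g. len = 100 (0x64) is trimmed to 84, holding back one full block to
 // be processed together with the 4-byte stolen tail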
1973 subs x22, x22, #0x80
1974
1975 blo .Lxts_dec_short
1976 b .Lxts_dec_loop
1977
1978.align 4
1979.Lxts_dec_loop:
1980 ldr q8, .Lxts_magic
1981 mov x10, x1 // pass rounds
1982 add x2, x19, #16
1983 ldr q0, [x20], #16
1984 sshr v1.2d, v11.2d, #63
1985 mov x9, sp // pass key schedule
1986 ldr q6, .Lxts_magic+16
1987 add v2.2d, v11.2d, v11.2d
1988 cmtst v3.2d, v11.2d, v6.2d
1989 and v1.16b, v1.16b, v8.16b
1990 ext v1.16b, v1.16b, v1.16b, #8
1991 and v3.16b, v3.16b, v8.16b
1992 ldr q4, [x20], #16
1993 eor v12.16b, v2.16b, v1.16b
1994 eor v1.16b, v4.16b, v12.16b
1995 eor v0.16b, v0.16b, v11.16b
1996 cmtst v2.2d, v12.2d, v6.2d
1997 add v4.2d, v12.2d, v12.2d
1998 add x0, x19, #16
1999 ext v3.16b, v3.16b, v3.16b, #8
2000 and v2.16b, v2.16b, v8.16b
2001 eor v13.16b, v4.16b, v3.16b
2002 ldr q3, [x20], #16
2003 ext v4.16b, v2.16b, v2.16b, #8
2004 eor v2.16b, v3.16b, v13.16b
2005 ldr q3, [x20], #16
2006 add v5.2d, v13.2d, v13.2d
2007 cmtst v7.2d, v13.2d, v6.2d
2008 and v7.16b, v7.16b, v8.16b
2009 ldr q9, [x20], #16
2010 ext v7.16b, v7.16b, v7.16b, #8
2011 ldr q10, [x20], #16
2012 eor v14.16b, v5.16b, v4.16b
2013 ldr q16, [x20], #16
2014 add v4.2d, v14.2d, v14.2d
2015 eor v3.16b, v3.16b, v14.16b
2016 eor v15.16b, v4.16b, v7.16b
2017 add v5.2d, v15.2d, v15.2d
2018 ldr q7, [x20], #16
2019 cmtst v4.2d, v14.2d, v6.2d
2020 and v17.16b, v4.16b, v8.16b
2021 cmtst v18.2d, v15.2d, v6.2d
2022 eor v4.16b, v9.16b, v15.16b
2023 ext v9.16b, v17.16b, v17.16b, #8
2024 eor v9.16b, v5.16b, v9.16b
2025 add v17.2d, v9.2d, v9.2d
2026 and v18.16b, v18.16b, v8.16b
2027 eor v5.16b, v10.16b, v9.16b
2028 str q9, [x2], #16
2029 ext v10.16b, v18.16b, v18.16b, #8
2030 cmtst v9.2d, v9.2d, v6.2d
2031 and v9.16b, v9.16b, v8.16b
2032 eor v10.16b, v17.16b, v10.16b
2033 cmtst v17.2d, v10.2d, v6.2d
2034 eor v6.16b, v16.16b, v10.16b
2035 str q10, [x2], #16
2036 ext v9.16b, v9.16b, v9.16b, #8
2037 add v10.2d, v10.2d, v10.2d
2038 eor v9.16b, v10.16b, v9.16b
2039 str q9, [x2], #16
2040 eor v7.16b, v7.16b, v9.16b
2041 add v9.2d, v9.2d, v9.2d
2042 and v8.16b, v17.16b, v8.16b
2043 ext v8.16b, v8.16b, v8.16b, #8
2044 eor v8.16b, v9.16b, v8.16b
2045 str q8, [x2] // next round tweak
2046
2047 bl _bsaes_decrypt8
2048
2049 eor v6.16b, v6.16b, v13.16b
2050 eor v0.16b, v0.16b, v11.16b
2051 ldr q8, [x0], #16
2052 eor v7.16b, v7.16b, v8.16b
2053 str q0, [x21], #16
2054 eor v0.16b, v1.16b, v12.16b
2055 ldr q1, [x0], #16
2056 eor v1.16b, v3.16b, v1.16b
2057 subs x22, x22, #0x80
2058 eor v2.16b, v2.16b, v15.16b
2059 eor v3.16b, v4.16b, v14.16b
2060 ldr q4, [x0], #16
2061 str q0, [x21], #16
2062 ldr q11, [x0] // next round tweak
2063 eor v0.16b, v5.16b, v4.16b
2064 str q6, [x21], #16
2065 str q3, [x21], #16
2066 str q2, [x21], #16
2067 str q7, [x21], #16
2068 str q1, [x21], #16
2069 str q0, [x21], #16
2070 bpl .Lxts_dec_loop
2071
2072.Lxts_dec_short:
2073 adds x22, x22, #0x70
2074 bmi .Lxts_dec_done
2075
2076 ldr q8, .Lxts_magic
2077 sshr v1.2d, v11.2d, #63
2078 add v2.2d, v11.2d, v11.2d
2079 ldr q9, .Lxts_magic+16
2080 subs x22, x22, #0x10
2081 ldr q0, [x20], #16
2082 and v1.16b, v1.16b, v8.16b
2083 cmtst v3.2d, v11.2d, v9.2d
2084 ext v1.16b, v1.16b, v1.16b, #8
2085 and v3.16b, v3.16b, v8.16b
2086 eor v12.16b, v2.16b, v1.16b
2087 ext v1.16b, v3.16b, v3.16b, #8
2088 add v2.2d, v12.2d, v12.2d
2089 cmtst v3.2d, v12.2d, v9.2d
2090 eor v13.16b, v2.16b, v1.16b
2091 and v22.16b, v3.16b, v8.16b
2092 bmi .Lxts_dec_1
2093
2094 ext v2.16b, v22.16b, v22.16b, #8
2095 add v3.2d, v13.2d, v13.2d
2096 ldr q1, [x20], #16
2097 cmtst v4.2d, v13.2d, v9.2d
2098 subs x22, x22, #0x10
2099 eor v14.16b, v3.16b, v2.16b
2100 and v23.16b, v4.16b, v8.16b
2101 bmi .Lxts_dec_2
2102
2103 ext v3.16b, v23.16b, v23.16b, #8
2104 add v4.2d, v14.2d, v14.2d
2105 ldr q2, [x20], #16
2106 cmtst v5.2d, v14.2d, v9.2d
2107 eor v0.16b, v0.16b, v11.16b
2108 subs x22, x22, #0x10
2109 eor v15.16b, v4.16b, v3.16b
2110 and v24.16b, v5.16b, v8.16b
2111 bmi .Lxts_dec_3
2112
2113 ext v4.16b, v24.16b, v24.16b, #8
2114 add v5.2d, v15.2d, v15.2d
2115 ldr q3, [x20], #16
2116 cmtst v6.2d, v15.2d, v9.2d
2117 eor v1.16b, v1.16b, v12.16b
2118 subs x22, x22, #0x10
2119 eor v16.16b, v5.16b, v4.16b
2120 and v25.16b, v6.16b, v8.16b
2121 bmi .Lxts_dec_4
2122
2123 ext v5.16b, v25.16b, v25.16b, #8
2124 add v6.2d, v16.2d, v16.2d
2125 add x0, x19, #16
2126 cmtst v7.2d, v16.2d, v9.2d
2127 ldr q4, [x20], #16
2128 eor v2.16b, v2.16b, v13.16b
2129 str q16, [x0], #16
2130 subs x22, x22, #0x10
2131 eor v17.16b, v6.16b, v5.16b
2132 and v26.16b, v7.16b, v8.16b
2133 bmi .Lxts_dec_5
2134
2135 ext v7.16b, v26.16b, v26.16b, #8
2136 add v18.2d, v17.2d, v17.2d
2137 ldr q5, [x20], #16
2138 eor v3.16b, v3.16b, v14.16b
2139 str q17, [x0], #16
2140 subs x22, x22, #0x10
2141 eor v18.16b, v18.16b, v7.16b
2142 bmi .Lxts_dec_6
2143
2144 ldr q6, [x20], #16
2145 eor v4.16b, v4.16b, v15.16b
2146 eor v5.16b, v5.16b, v16.16b
2147 str q18, [x0] // next round tweak
2148 mov x9, sp // pass key schedule
2149 mov x10, x1 // pass rounds
2150 add x0, x19, #16
2151 sub x22, x22, #0x10
2152 eor v6.16b, v6.16b, v17.16b
2153
2154 bl _bsaes_decrypt8
2155
2156 ldr q16, [x0], #16
2157 eor v0.16b, v0.16b, v11.16b
2158 eor v1.16b, v1.16b, v12.16b
2159 ldr q17, [x0], #16
2160 eor v6.16b, v6.16b, v13.16b
2161 eor v4.16b, v4.16b, v14.16b
2162 eor v2.16b, v2.16b, v15.16b
2163 ldr q11, [x0] // next round tweak
2164 str q0, [x21], #16
2165 str q1, [x21], #16
2166 eor v0.16b, v7.16b, v16.16b
2167 eor v1.16b, v3.16b, v17.16b
2168 str q6, [x21], #16
2169 str q4, [x21], #16
2170 str q2, [x21], #16
2171 str q0, [x21], #16
2172 str q1, [x21], #16
2173 b .Lxts_dec_done
2174
2175.align 4
2176.Lxts_dec_6:
2177 eor v4.16b, v4.16b, v15.16b
2178 eor v5.16b, v5.16b, v16.16b
2179 mov x9, sp // pass key schedule
2180 mov x10, x1 // pass rounds
2181 add x0, x19, #16
2182
2183 bl _bsaes_decrypt8
2184
2185 ldr q16, [x0], #16
2186 eor v0.16b, v0.16b, v11.16b
2187 eor v1.16b, v1.16b, v12.16b
2188 eor v6.16b, v6.16b, v13.16b
2189 eor v4.16b, v4.16b, v14.16b
2190 ldr q11, [x0] // next round tweak
2191 eor v2.16b, v2.16b, v15.16b
2192 str q0, [x21], #16
2193 str q1, [x21], #16
2194 eor v0.16b, v7.16b, v16.16b
2195 str q6, [x21], #16
2196 str q4, [x21], #16
2197 str q2, [x21], #16
2198 str q0, [x21], #16
2199 b .Lxts_dec_done
2200
2201.align 4
2202.Lxts_dec_5:
2203 eor v3.16b, v3.16b, v14.16b
2204 eor v4.16b, v4.16b, v15.16b
2205 mov x9, sp // pass key schedule
2206 mov x10, x1 // pass rounds
2207 add x0, x19, #16
2208
2209 bl _bsaes_decrypt8
2210
2211 eor v0.16b, v0.16b, v11.16b
2212 eor v1.16b, v1.16b, v12.16b
2213 ldr q11, [x0] // next round tweak
2214 eor v6.16b, v6.16b, v13.16b
2215 eor v4.16b, v4.16b, v14.16b
2216 eor v2.16b, v2.16b, v15.16b
2217 str q0, [x21], #16
2218 str q1, [x21], #16
2219 str q6, [x21], #16
2220 str q4, [x21], #16
2221 str q2, [x21], #16
2222 b .Lxts_dec_done
2223
2224.align 4
2225.Lxts_dec_4:
2226 eor v2.16b, v2.16b, v13.16b
2227 eor v3.16b, v3.16b, v14.16b
2228 mov x9, sp // pass key schedule
2229 mov x10, x1 // pass rounds
2230 add x0, x19, #16
2231
2232 bl _bsaes_decrypt8
2233
2234 eor v0.16b, v0.16b, v11.16b
2235 eor v1.16b, v1.16b, v12.16b
2236 eor v6.16b, v6.16b, v13.16b
2237 eor v4.16b, v4.16b, v14.16b
2238 mov v11.16b, v15.16b // next round tweak
2239 str q0, [x21], #16
2240 str q1, [x21], #16
2241 str q6, [x21], #16
2242 str q4, [x21], #16
2243 b .Lxts_dec_done
2244
2245.align 4
2246.Lxts_dec_3:
2247 eor v1.16b, v1.16b, v12.16b
2248 eor v2.16b, v2.16b, v13.16b
2249 mov x9, sp // pass key schedule
2250 mov x10, x1 // pass rounds
2251 add x0, x19, #16
2252
2253 bl _bsaes_decrypt8
2254
2255 eor v0.16b, v0.16b, v11.16b
2256 eor v1.16b, v1.16b, v12.16b
2257 eor v6.16b, v6.16b, v13.16b
2258 mov v11.16b, v14.16b // next round tweak
2259 str q0, [x21], #16
2260 str q1, [x21], #16
2261 str q6, [x21], #16
2262 b .Lxts_dec_done
2263
2264.align 4
2265.Lxts_dec_2:
2266 eor v0.16b, v0.16b, v11.16b
2267 eor v1.16b, v1.16b, v12.16b
2268 mov x9, sp // pass key schedule
2269 mov x10, x1 // pass rounds
2270 add x0, x19, #16
2271
2272 bl _bsaes_decrypt8
2273
2274 eor v0.16b, v0.16b, v11.16b
2275 eor v1.16b, v1.16b, v12.16b
2276 mov v11.16b, v13.16b // next round tweak
2277 str q0, [x21], #16
2278 str q1, [x21], #16
2279 b .Lxts_dec_done
2280
2281.align 4
2282.Lxts_dec_1:
2283 eor v0.16b, v0.16b, v11.16b
2284 sub x0, sp, #16
2285 sub x1, sp, #16
2286 mov x2, x23
2287 mov v13.d[0], v11.d[1] // just in case AES_decrypt corrupts top half of callee-saved SIMD registers
2288 mov v14.d[0], v12.d[1]
2289 str q0, [sp, #-16]!
2290
2291 bl AES_decrypt
2292
2293 ldr q0, [sp], #16
2294 trn1 v13.2d, v11.2d, v13.2d
2295 trn1 v11.2d, v12.2d, v14.2d // next round tweak
2296 eor v0.16b, v0.16b, v13.16b
2297 str q0, [x21], #16
2298
2299.Lxts_dec_done:
2300 adds x22, x22, #0x10
2301 beq .Lxts_dec_ret
2302
2303 // calculate one round of extra tweak for the stolen ciphertext
2304 ldr q8, .Lxts_magic
2305 sshr v6.2d, v11.2d, #63
2306 and v6.16b, v6.16b, v8.16b
2307 add v12.2d, v11.2d, v11.2d
2308 ext v6.16b, v6.16b, v6.16b, #8
2309 eor v12.16b, v12.16b, v6.16b
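 // (ciphertext stealing consumes the last two tweaks in reverse order:
 // v12 decrypts the final full ciphertext block here, and v11 decrypts
 // the reconstructed block afterwards)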
2310
2311 // perform the final decryption with the last tweak value
2312 ldr q0, [x20], #16
2313 eor v0.16b, v0.16b, v12.16b
2314 str q0, [sp, #-16]!
2315 mov x0, sp
2316 mov x1, sp
2317 mov x2, x23
2318 mov v13.d[0], v11.d[1] // just in case AES_decrypt corrupts top half of callee-saved SIMD registers
2319 mov v14.d[0], v12.d[1]
2320
2321 bl AES_decrypt
2322
2323 trn1 v12.2d, v12.2d, v14.2d
2324 trn1 v11.2d, v11.2d, v13.2d
2325 ldr q0, [sp], #16
2326 eor v0.16b, v0.16b, v12.16b
2327 str q0, [x21]
2328
2329 mov x6, x21
2330 // Penultimate ciphertext block produces final plaintext part-block
2331 // plus remaining part of final ciphertext block. Move plaintext part
2332 // to final position and reuse penultimate plaintext block buffer to
2333 // construct final ciphertext block
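 //
 // A scalar C sketch of the steal loop below (illustrative only; note the
 // copy direction is reversed relative to the encrypt-side loop):
 //
 //     /* out = decrypted penultimate block, in = ciphertext tail,
 //        n = len & 15 */
 //     for (size_t i = 0; i < n; i++) {
 //         out[i + 16] = out[i]; /* plaintext byte -> final position */
 //         out[i]      = in[i];  /* ciphertext tail byte -> the block */
 //     }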
2334.Lxts_dec_steal:
2335 ldrb w1, [x21]
2336 ldrb w0, [x20], #1
2337 strb w1, [x21, #0x10]
2338 strb w0, [x21], #1
2339
2340 subs x22, x22, #1
2341 bhi .Lxts_dec_steal
2342
2343 // Finally decrypt the reconstructed final ciphertext block (sitting in
2344 // the penultimate plaintext block's buffer) using the penultimate tweak
2345 ldr q0, [x6]
2346 eor v0.16b, v0.16b, v11.16b
2347 str q0, [sp, #-16]!
2348 mov x0, sp
2349 mov x1, sp
2350 mov x2, x23
2351 mov x21, x6
2352
2353 bl AES_decrypt
2354
2355 trn1 v11.2d, v11.2d, v13.2d
2356 ldr q0, [sp], #16
2357 eor v0.16b, v0.16b, v11.16b
2358 str q0, [x21]
2359
2360.Lxts_dec_ret:
2361
2362 movi v0.16b, #0
2363 movi v1.16b, #0
2364.Lxts_dec_bzero: // wipe key schedule
2365 stp q0, q1, [sp], #32
2366 cmp sp, x19
2367 bne .Lxts_dec_bzero
2368
2369 ldp x19, x20, [sp, #80]
2370 ldp x21, x22, [sp, #96]
2371 ldr x23, [sp, #112]
2372 ldp d8, d9, [sp, #128]
2373 ldp d10, d11, [sp, #144]
2374 ldp d12, d13, [sp, #160]
2375 ldp d14, d15, [sp, #176]
2376 ldp x29, x30, [sp], #192
2377 ret
2378.size ossl_bsaes_xts_decrypt,.-ossl_bsaes_xts_decrypt