1 #! /usr/bin/env perl
2 # Copyright 2022-2023 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9 #
10 # This module implements SM4 with ASIMD and AESE on AARCH64
11 #
12 # Dec 2022
13 #
14
15 # $output is the last argument if it looks like a file (it has an extension)
16 # $flavour is the first argument if it doesn't look like a file
17 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
18 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
19
20 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
21 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
22 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
23 die "can't locate arm-xlate.pl";
24
25 open OUT,"| \"$^X\" $xlate $flavour \"$output\""
26 or die "can't call $xlate: $!";
27 *STDOUT=*OUT;
28
29 $prefix="vpsm4_ex";
30 my @vtmp=map("v$_",(0..3));
31 my @qtmp=map("q$_",(0..3));
32 my @data=map("v$_",(4..7));
33 my @datax=map("v$_",(8..11));
34 my ($rk0,$rk1)=("v12","v13");
35 my ($rka,$rkb)=("v14","v15");
36 my @vtmpx=map("v$_",(12..15));
37 my ($vtmp4,$vtmp5)=("v24","v25");
38 my ($MaskV,$TAHMatV,$TALMatV,$ATAHMatV,$ATALMatV,$ANDMaskV)=("v26","v27","v28","v29","v30","v31");
39 my ($MaskQ,$TAHMatQ,$TALMatQ,$ATAHMatQ,$ATALMatQ,$ANDMaskQ)=("q26","q27","q28","q29","q30","q31");
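# NB: @vtmpx (v12-v15) shares registers with $rk0/$rk1/$rka/$rkb above;
# the two sets are never live at the same time.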
40
41 my ($inp,$outp,$blocks,$rks)=("x0","x1","w2","x3");
42 my ($tmpw,$tmp,$wtmp0,$wtmp1,$wtmp2)=("w6","x6","w7","w8","w9");
43 my ($xtmp1,$xtmp2)=("x8","x9");
44 my ($ptr,$counter)=("x10","w11");
45 my ($word0,$word1,$word2,$word3)=("w12","w13","w14","w15");
46
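# rev32: byte-swap each 32-bit lane so the data is viewed as big-endian words
# (SM4 is defined on big-endian words); on __AARCH64EB__ builds the swap is
# unnecessary and the helper degenerates to a plain mov (or nothing at all).
# rev32_armeb below is the converse and swaps only on big-endian builds.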
47 sub rev32() {
48 my $dst = shift;
49 my $src = shift;
50
51 if ($src and ("$src" ne "$dst")) {
52 $code.=<<___;
53 #ifndef __AARCH64EB__
54 rev32 $dst.16b,$src.16b
55 #else
56 mov $dst.16b,$src.16b
57 #endif
58 ___
59 } else {
60 $code.=<<___;
61 #ifndef __AARCH64EB__
62 rev32 $dst.16b,$dst.16b
63 #endif
64 ___
65 }
66 }
67
68 sub rev32_armeb() {
69 my $dst = shift;
70 my $src = shift;
71
72 if ($src and ("$src" ne "$dst")) {
73 $code.=<<___;
74 #ifdef __AARCH64EB__
75 rev32 $dst.16b,$src.16b
76 #else
77 mov $dst.16b,$src.16b
78 #endif
79 ___
80 } else {
81 $code.=<<___;
82 #ifdef __AARCH64EB__
83 rev32 $dst.16b,$dst.16b
84 #endif
85 ___
86 }
87 }
88
89 sub rbit() {
90 my $dst = shift;
91 my $src = shift;
92 my $std = shift;
93
94 if ($src and ("$src" ne "$dst")) {
95 if ($std eq "_gb") {
96 $code.=<<___;
97 rbit $dst.16b,$src.16b
98 ___
99 } else {
100 $code.=<<___;
101 mov $dst.16b,$src.16b
102 ___
103 }
104 } else {
105 if ($std eq "_gb") {
106 $code.=<<___;
107 rbit $dst.16b,$src.16b
108 ___
109 }
110 }
111 }
112
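# transpose: 4x4 transpose of four vectors of 32-bit words (zip1/zip2 on .4s,
# then on .2d); used to switch between block-per-register and word-sliced
# layouts.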
113 sub transpose() {
114 my ($dat0,$dat1,$dat2,$dat3,$vt0,$vt1,$vt2,$vt3) = @_;
115
116 $code.=<<___;
117 zip1 $vt0.4s,$dat0.4s,$dat1.4s
118 zip2 $vt1.4s,$dat0.4s,$dat1.4s
119 zip1 $vt2.4s,$dat2.4s,$dat3.4s
120 zip2 $vt3.4s,$dat2.4s,$dat3.4s
121 zip1 $dat0.2d,$vt0.2d,$vt2.2d
122 zip2 $dat1.2d,$vt0.2d,$vt2.2d
123 zip1 $dat2.2d,$vt1.2d,$vt3.2d
124 zip2 $dat3.2d,$vt1.2d,$vt3.2d
125 ___
126 }
127
128 # matrix multiplication Mat*x = (lowerMat*x) ^ (higherMat*x)
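# The GF(2) matrix multiply is done nibble-wise: each byte of $x is split into
# its low and high 4-bit halves, each half indexes a 16-entry tbl lookup table
# (lowerMat for the low nibble, higherMat for the high nibble), and the two
# partial products are XORed together.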
129 sub mul_matrix() {
130 my $x = shift;
131 my $higherMat = shift;
132 my $lowerMat = shift;
133 my $tmp = shift;
134 $code.=<<___;
135 ushr $tmp.16b, $x.16b, 4
136 and $x.16b, $x.16b, $ANDMaskV.16b
137 tbl $x.16b, {$lowerMat.16b}, $x.16b
138 tbl $tmp.16b, {$higherMat.16b}, $tmp.16b
139 eor $x.16b, $x.16b, $tmp.16b
140 ___
141 }
142
143 # sbox operation for 4 lanes of words
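# The S-box itself is evaluated with AESE: the tbl with MaskV pre-shuffles the
# bytes to compensate for the ShiftRows step inside AESE, the TAH/TAL affine
# map takes the input into the AES field representation, AESE with an all-zero
# round key then essentially reduces to AES SubBytes (the shared GF(2^8)
# inversion), and the ATAH/ATAL affine map converts the result back into the
# SM4 S-box output.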
145 sub sbox() {
146 my $dat = shift;
147
148 $code.=<<___;
149 // optimize sbox using AESE instruction
150 tbl @vtmp[0].16b, {$dat.16b}, $MaskV.16b
151 ___
152 &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, $vtmp4);
153 $code.=<<___;
154 eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b
155 aese @vtmp[0].16b,@vtmp[1].16b
156 ___
157 &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, $vtmp4);
158 $code.=<<___;
159 mov $dat.16b,@vtmp[0].16b
160
161 // linear transformation
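// L(B) = B ^ (B <<< 2) ^ (B <<< 10) ^ (B <<< 18) ^ (B <<< 24);
// each rotate-left by n is built from ushr by (32-n) followed by sli by n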
162 ushr @vtmp[0].4s,$dat.4s,32-2
163 ushr @vtmp[1].4s,$dat.4s,32-10
164 ushr @vtmp[2].4s,$dat.4s,32-18
165 ushr @vtmp[3].4s,$dat.4s,32-24
166 sli @vtmp[0].4s,$dat.4s,2
167 sli @vtmp[1].4s,$dat.4s,10
168 sli @vtmp[2].4s,$dat.4s,18
169 sli @vtmp[3].4s,$dat.4s,24
170 eor $vtmp4.16b,@vtmp[0].16b,$dat.16b
171 eor $vtmp4.16b,$vtmp4.16b,$vtmp[1].16b
172 eor $dat.16b,@vtmp[2].16b,@vtmp[3].16b
173 eor $dat.16b,$dat.16b,$vtmp4.16b
174 ___
175 }
176
177 # sbox operation for 8 lanes of words
178 sub sbox_double() {
179 my $dat = shift;
180 my $datx = shift;
181
182 $code.=<<___;
183 // optimize sbox using AESE instruction
184 tbl @vtmp[0].16b, {$dat.16b}, $MaskV.16b
185 tbl @vtmp[1].16b, {$datx.16b}, $MaskV.16b
186 ___
187 &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, $vtmp4);
188 &mul_matrix(@vtmp[1], $TAHMatV, $TALMatV, $vtmp4);
189 $code.=<<___;
190 eor $vtmp5.16b, $vtmp5.16b, $vtmp5.16b
191 aese @vtmp[0].16b,$vtmp5.16b
192 aese @vtmp[1].16b,$vtmp5.16b
193 ___
194 &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV,$vtmp4);
195 &mul_matrix(@vtmp[1], $ATAHMatV, $ATALMatV,$vtmp4);
196 $code.=<<___;
197 mov $dat.16b,@vtmp[0].16b
198 mov $datx.16b,@vtmp[1].16b
199
200 // linear transformation
201 ushr @vtmp[0].4s,$dat.4s,32-2
202 ushr $vtmp5.4s,$datx.4s,32-2
203 ushr @vtmp[1].4s,$dat.4s,32-10
204 ushr @vtmp[2].4s,$dat.4s,32-18
205 ushr @vtmp[3].4s,$dat.4s,32-24
206 sli @vtmp[0].4s,$dat.4s,2
207 sli $vtmp5.4s,$datx.4s,2
208 sli @vtmp[1].4s,$dat.4s,10
209 sli @vtmp[2].4s,$dat.4s,18
210 sli @vtmp[3].4s,$dat.4s,24
211 eor $vtmp4.16b,@vtmp[0].16b,$dat.16b
212 eor $vtmp4.16b,$vtmp4.16b,@vtmp[1].16b
213 eor $dat.16b,@vtmp[2].16b,@vtmp[3].16b
214 eor $dat.16b,$dat.16b,$vtmp4.16b
215 ushr @vtmp[1].4s,$datx.4s,32-10
216 ushr @vtmp[2].4s,$datx.4s,32-18
217 ushr @vtmp[3].4s,$datx.4s,32-24
218 sli @vtmp[1].4s,$datx.4s,10
219 sli @vtmp[2].4s,$datx.4s,18
220 sli @vtmp[3].4s,$datx.4s,24
221 eor $vtmp4.16b,$vtmp5.16b,$datx.16b
222 eor $vtmp4.16b,$vtmp4.16b,@vtmp[1].16b
223 eor $datx.16b,@vtmp[2].16b,@vtmp[3].16b
224 eor $datx.16b,$datx.16b,$vtmp4.16b
225 ___
226 }
227
228 # sbox operation for a single word
229 sub sbox_1word () {
230 my $word = shift;
231
232 $code.=<<___;
233 mov @vtmp[3].s[0],$word
234 // optimize sbox using AESE instruction
235 tbl @vtmp[0].16b, {@vtmp[3].16b}, $MaskV.16b
236 ___
237 &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, @vtmp[2]);
238 $code.=<<___;
239 eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b
240 aese @vtmp[0].16b,@vtmp[1].16b
241 ___
242 &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, @vtmp[2]);
243 $code.=<<___;
244
245 mov $wtmp0,@vtmp[0].s[0]
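// scalar form of the linear transformation:
// L(B) = B ^ rol(B,2) ^ rol(B,10) ^ rol(B,18) ^ rol(B,24), using ror #(32-n)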
246 eor $word,$wtmp0,$wtmp0,ror #32-2
247 eor $word,$word,$wtmp0,ror #32-10
248 eor $word,$word,$wtmp0,ror #32-18
249 eor $word,$word,$wtmp0,ror #32-24
250 ___
251 }
252
253 # sm4 for one block of data, in scalar registers word0/word1/word2/word3
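# One invocation performs four SM4 rounds, one per round-key word:
#   X[i+4] = X[i] ^ T(X[i+1] ^ X[i+2] ^ X[i+3] ^ rk[i])
# The caller runs this eight times for the full 32 rounds.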
254 sub sm4_1blk () {
255 my $kptr = shift;
256
257 $code.=<<___;
258 ldp $wtmp0,$wtmp1,[$kptr],8
259 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
260 eor $tmpw,$word2,$word3
261 eor $wtmp2,$wtmp0,$word1
262 eor $tmpw,$tmpw,$wtmp2
263 ___
264 &sbox_1word($tmpw);
265 $code.=<<___;
266 eor $word0,$word0,$tmpw
267 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
268 eor $tmpw,$word2,$word3
269 eor $wtmp2,$word0,$wtmp1
270 eor $tmpw,$tmpw,$wtmp2
271 ___
272 &sbox_1word($tmpw);
273 $code.=<<___;
274 ldp $wtmp0,$wtmp1,[$kptr],8
275 eor $word1,$word1,$tmpw
276 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
277 eor $tmpw,$word0,$word1
278 eor $wtmp2,$wtmp0,$word3
279 eor $tmpw,$tmpw,$wtmp2
280 ___
281 &sbox_1word($tmpw);
282 $code.=<<___;
283 eor $word2,$word2,$tmpw
284 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
285 eor $tmpw,$word0,$word1
286 eor $wtmp2,$word2,$wtmp1
287 eor $tmpw,$tmpw,$wtmp2
288 ___
289 &sbox_1word($tmpw);
290 $code.=<<___;
291 eor $word3,$word3,$tmpw
292 ___
293 }
294
295 # sm4 for 4 lanes of data, in neon registers data0/data1/data2/data3
296 sub sm4_4blks () {
297 my $kptr = shift;
298
299 $code.=<<___;
300 ldp $wtmp0,$wtmp1,[$kptr],8
301 dup $rk0.4s,$wtmp0
302 dup $rk1.4s,$wtmp1
303
304 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
305 eor $rka.16b,@data[2].16b,@data[3].16b
306 eor $rk0.16b,@data[1].16b,$rk0.16b
307 eor $rk0.16b,$rka.16b,$rk0.16b
308 ___
309 &sbox($rk0);
310 $code.=<<___;
311 eor @data[0].16b,@data[0].16b,$rk0.16b
312
313 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
314 eor $rka.16b,$rka.16b,@data[0].16b
315 eor $rk1.16b,$rka.16b,$rk1.16b
316 ___
317 &sbox($rk1);
318 $code.=<<___;
319 ldp $wtmp0,$wtmp1,[$kptr],8
320 eor @data[1].16b,@data[1].16b,$rk1.16b
321
322 dup $rk0.4s,$wtmp0
323 dup $rk1.4s,$wtmp1
324
325 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
326 eor $rka.16b,@data[0].16b,@data[1].16b
327 eor $rk0.16b,@data[3].16b,$rk0.16b
328 eor $rk0.16b,$rka.16b,$rk0.16b
329 ___
330 &sbox($rk0);
331 $code.=<<___;
332 eor @data[2].16b,@data[2].16b,$rk0.16b
333
334 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
335 eor $rka.16b,$rka.16b,@data[2].16b
336 eor $rk1.16b,$rka.16b,$rk1.16b
337 ___
338 &sbox($rk1);
339 $code.=<<___;
340 eor @data[3].16b,@data[3].16b,$rk1.16b
341 ___
342 }
343
344 # sm4 for 8 lanes of data, in neon registers
345 # data0/data1/data2/data3 datax0/datax1/datax2/datax3
346 sub sm4_8blks () {
347 my $kptr = shift;
348
349 $code.=<<___;
350 ldp $wtmp0,$wtmp1,[$kptr],8
351 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
352 dup $rk0.4s,$wtmp0
353 eor $rka.16b,@data[2].16b,@data[3].16b
354 eor $rkb.16b,@datax[2].16b,@datax[3].16b
355 eor @vtmp[0].16b,@data[1].16b,$rk0.16b
356 eor @vtmp[1].16b,@datax[1].16b,$rk0.16b
357 eor $rk0.16b,$rka.16b,@vtmp[0].16b
358 eor $rk1.16b,$rkb.16b,@vtmp[1].16b
359 ___
360 &sbox_double($rk0,$rk1);
361 $code.=<<___;
362 eor @data[0].16b,@data[0].16b,$rk0.16b
363 eor @datax[0].16b,@datax[0].16b,$rk1.16b
364
365 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
366 dup $rk1.4s,$wtmp1
367 eor $rka.16b,$rka.16b,@data[0].16b
368 eor $rkb.16b,$rkb.16b,@datax[0].16b
369 eor $rk0.16b,$rka.16b,$rk1.16b
370 eor $rk1.16b,$rkb.16b,$rk1.16b
371 ___
372 &sbox_double($rk0,$rk1);
373 $code.=<<___;
374 ldp $wtmp0,$wtmp1,[$kptr],8
375 eor @data[1].16b,@data[1].16b,$rk0.16b
376 eor @datax[1].16b,@datax[1].16b,$rk1.16b
377
378 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
379 dup $rk0.4s,$wtmp0
380 eor $rka.16b,@data[0].16b,@data[1].16b
381 eor $rkb.16b,@datax[0].16b,@datax[1].16b
382 eor @vtmp[0].16b,@data[3].16b,$rk0.16b
383 eor @vtmp[1].16b,@datax[3].16b,$rk0.16b
384 eor $rk0.16b,$rka.16b,@vtmp[0].16b
385 eor $rk1.16b,$rkb.16b,@vtmp[1].16b
386 ___
387 &sbox_double($rk0,$rk1);
388 $code.=<<___;
389 eor @data[2].16b,@data[2].16b,$rk0.16b
390 eor @datax[2].16b,@datax[2].16b,$rk1.16b
391
392 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
393 dup $rk1.4s,$wtmp1
394 eor $rka.16b,$rka.16b,@data[2].16b
395 eor $rkb.16b,$rkb.16b,@datax[2].16b
396 eor $rk0.16b,$rka.16b,$rk1.16b
397 eor $rk1.16b,$rkb.16b,$rk1.16b
398 ___
399 &sbox_double($rk0,$rk1);
400 $code.=<<___;
401 eor @data[3].16b,@data[3].16b,$rk0.16b
402 eor @datax[3].16b,@datax[3].16b,$rk1.16b
403 ___
404 }
405
406 sub encrypt_1blk_norev() {
407 my $dat = shift;
408
409 $code.=<<___;
410 mov $ptr,$rks
411 mov $counter,#8
412 mov $word0,$dat.s[0]
413 mov $word1,$dat.s[1]
414 mov $word2,$dat.s[2]
415 mov $word3,$dat.s[3]
416 10:
417 ___
418 &sm4_1blk($ptr);
419 $code.=<<___;
420 subs $counter,$counter,#1
421 b.ne 10b
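// reverse transform R: the output is the last four state words in
// reverse order (X35,X34,X33,X32)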
422 mov $dat.s[0],$word3
423 mov $dat.s[1],$word2
424 mov $dat.s[2],$word1
425 mov $dat.s[3],$word0
426 ___
427 }
428
429 sub encrypt_1blk() {
430 my $dat = shift;
431
432 &encrypt_1blk_norev($dat);
433 &rev32($dat,$dat);
434 }
435
436 sub encrypt_4blks() {
437 $code.=<<___;
438 mov $ptr,$rks
439 mov $counter,#8
440 10:
441 ___
442 &sm4_4blks($ptr);
443 $code.=<<___;
444 subs $counter,$counter,#1
445 b.ne 10b
446 ___
447 &rev32(@vtmp[3],@data[0]);
448 &rev32(@vtmp[2],@data[1]);
449 &rev32(@vtmp[1],@data[2]);
450 &rev32(@vtmp[0],@data[3]);
451 }
452
453 sub encrypt_8blks() {
454 $code.=<<___;
455 mov $ptr,$rks
456 mov $counter,#8
457 10:
458 ___
459 &sm4_8blks($ptr);
460 $code.=<<___;
461 subs $counter,$counter,#1
462 b.ne 10b
463 ___
464 &rev32(@vtmp[3],@data[0]);
465 &rev32(@vtmp[2],@data[1]);
466 &rev32(@vtmp[1],@data[2]);
467 &rev32(@vtmp[0],@data[3]);
468 &rev32(@data[3],@datax[0]);
469 &rev32(@data[2],@datax[1]);
470 &rev32(@data[1],@datax[2]);
471 &rev32(@data[0],@datax[3]);
472 }
473
474 sub load_sbox () {
475 my $data = shift;
476
477 $code.=<<___;
478 ldr $MaskQ, .Lsbox_magic
479 ldr $TAHMatQ, .Lsbox_magic+16
480 ldr $TALMatQ, .Lsbox_magic+32
481 ldr $ATAHMatQ, .Lsbox_magic+48
482 ldr $ATALMatQ, .Lsbox_magic+64
483 ldr $ANDMaskQ, .Lsbox_magic+80
484 ___
485 }
486
487 sub mov_reg_to_vec() {
488 my $src0 = shift;
489 my $src1 = shift;
490 my $desv = shift;
491 $code.=<<___;
492 mov $desv.d[0],$src0
493 mov $desv.d[1],$src1
494 ___
495 &rev32_armeb($desv,$desv);
496 }
497
498 sub mov_vec_to_reg() {
499 my $srcv = shift;
500 my $des0 = shift;
501 my $des1 = shift;
502 $code.=<<___;
503 mov $des0,$srcv.d[0]
504 mov $des1,$srcv.d[1]
505 ___
506 }
507
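# compute_tweak: multiply the 128-bit XTS tweak by alpha (x) in GF(2^128)
# modulo x^128 + x^7 + x^2 + x + 1: a 1-bit left shift across the two 64-bit
# halves (extr), plus a conditional XOR of 0x87 derived from the sign of the
# top bit (asr #31).  compute_tweak_vec below does the same on a vector
# register; for the "_gb" flavour the tweak is rbit-reversed around the
# multiplication.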
508 sub compute_tweak() {
509 my $src0 = shift;
510 my $src1 = shift;
511 my $des0 = shift;
512 my $des1 = shift;
513 $code.=<<___;
514 mov $wtmp0,0x87
515 extr $xtmp2,$src1,$src1,#32
516 extr $des1,$src1,$src0,#63
517 and $wtmp1,$wtmp0,$wtmp2,asr#31
518 eor $des0,$xtmp1,$src0,lsl#1
519 ___
520 }
521
522 sub compute_tweak_vec() {
523 my $src = shift;
524 my $des = shift;
525 my $std = shift;
526 &rbit(@vtmp[2],$src,$std);
527 $code.=<<___;
528 ldr @qtmp[0], .Lxts_magic
529 shl $des.16b, @vtmp[2].16b, #1
530 ext @vtmp[1].16b, @vtmp[2].16b, @vtmp[2].16b,#15
531 ushr @vtmp[1].16b, @vtmp[1].16b, #7
532 mul @vtmp[1].16b, @vtmp[1].16b, @vtmp[0].16b
533 eor $des.16b, $des.16b, @vtmp[1].16b
534 ___
535 &rbit($des,$des,$std);
536 }
537
538 $code=<<___;
539 #include "arm_arch.h"
540 .arch armv8-a+crypto
541 .text
542
543 .type _${prefix}_consts,%object
544 .align 7
545 _${prefix}_consts:
546 .Lck:
547 .long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
548 .long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
549 .long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
550 .long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
551 .long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
552 .long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
553 .long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
554 .long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
555 .Lfk:
556 .quad 0x56aa3350a3b1bac6,0xb27022dc677d9197
557 .Lshuffles:
558 .quad 0x0B0A090807060504,0x030201000F0E0D0C
559 .Lxts_magic:
560 .quad 0x0101010101010187,0x0101010101010101
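// .Lsbox_magic is loaded by load_sbox() in this order: the byte-shuffle mask
// (MaskV), the TAH/TAL and ATAH/ATAL affine-map nibble tables, and the 0x0f
// nibble mask (ANDMaskV)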
561 .Lsbox_magic:
562 .quad 0x0b0e0104070a0d00,0x0306090c0f020508
563 .quad 0x62185a2042387a00,0x22581a6002783a40
564 .quad 0x15df62a89e54e923,0xc10bb67c4a803df7
565 .quad 0xb9aa6b78c1d21300,0x1407c6d56c7fbead
566 .quad 0x6404462679195b3b,0xe383c1a1fe9edcbc
567 .quad 0x0f0f0f0f0f0f0f0f,0x0f0f0f0f0f0f0f0f
568
569 .size _${prefix}_consts,.-_${prefix}_consts
570 ___
571
572 {{{
573 my ($key,$keys,$enc)=("x0","x1","w2");
574 my ($pointer,$schedules,$wtmp,$roundkey)=("x5","x6","w7","w8");
575 my ($vkey,$vfk,$vmap)=("v5","v6","v7");
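# _set_key: expand the 128-bit user key into 32 round keys.  The key is XORed
# with FK, then rk[i] = K[i+4] = K[i] ^ T'(K[i+1]^K[i+2]^K[i+3]^CK[i]), where
# T' uses the same AESE-based S-box and rotations by 13 and 23 (the ror #19/#9
# below).  For decryption (w2 == 0) the round keys are stored in reverse
# order: the output pointer starts at offset 124 and is post-decremented.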
576 $code.=<<___;
577 .type _${prefix}_set_key,%function
578 .align 4
579 _${prefix}_set_key:
580 AARCH64_VALID_CALL_TARGET
581 ld1 {$vkey.4s},[$key]
582 ___
583 &load_sbox();
584 &rev32($vkey,$vkey);
585 $code.=<<___;
586 adr $pointer,.Lshuffles
587 ld1 {$vmap.2d},[$pointer]
588 adr $pointer,.Lfk
589 ld1 {$vfk.2d},[$pointer]
590 eor $vkey.16b,$vkey.16b,$vfk.16b
591 mov $schedules,#32
592 adr $pointer,.Lck
593 movi @vtmp[0].16b,#64
594 cbnz $enc,1f
595 add $keys,$keys,124
596 1:
597 mov $wtmp,$vkey.s[1]
598 ldr $roundkey,[$pointer],#4
599 eor $roundkey,$roundkey,$wtmp
600 mov $wtmp,$vkey.s[2]
601 eor $roundkey,$roundkey,$wtmp
602 mov $wtmp,$vkey.s[3]
603 eor $roundkey,$roundkey,$wtmp
604 // optimize sbox using AESE instruction
605 mov @data[0].s[0],$roundkey
606 tbl @vtmp[0].16b, {@data[0].16b}, $MaskV.16b
607 ___
608 &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, @vtmp[2]);
609 $code.=<<___;
610 eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b
611 aese @vtmp[0].16b,@vtmp[1].16b
612 ___
613 &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, @vtmp[2]);
614 $code.=<<___;
615 mov $wtmp,@vtmp[0].s[0]
616 eor $roundkey,$wtmp,$wtmp,ror #19
617 eor $roundkey,$roundkey,$wtmp,ror #9
618 mov $wtmp,$vkey.s[0]
619 eor $roundkey,$roundkey,$wtmp
620 mov $vkey.s[0],$roundkey
621 cbz $enc,2f
622 str $roundkey,[$keys],#4
623 b 3f
624 2:
625 str $roundkey,[$keys],#-4
626 3:
627 tbl $vkey.16b,{$vkey.16b},$vmap.16b
628 subs $schedules,$schedules,#1
629 b.ne 1b
630 ret
631 .size _${prefix}_set_key,.-_${prefix}_set_key
632 ___
633 }}}
634
635
636 {{{
637 $code.=<<___;
638 .type _${prefix}_enc_4blks,%function
639 .align 4
640 _${prefix}_enc_4blks:
641 AARCH64_VALID_CALL_TARGET
642 ___
643 &encrypt_4blks();
644 $code.=<<___;
645 ret
646 .size _${prefix}_enc_4blks,.-_${prefix}_enc_4blks
647 ___
648 }}}
649
650 {{{
651 $code.=<<___;
652 .type _${prefix}_enc_8blks,%function
653 .align 4
654 _${prefix}_enc_8blks:
655 AARCH64_VALID_CALL_TARGET
656 ___
657 &encrypt_8blks();
658 $code.=<<___;
659 ret
660 .size _${prefix}_enc_8blks,.-_${prefix}_enc_8blks
661 ___
662 }}}
663
664
665 {{{
666 my ($key,$keys)=("x0","x1");
667 $code.=<<___;
668 .globl ${prefix}_set_encrypt_key
669 .type ${prefix}_set_encrypt_key,%function
670 .align 5
671 ${prefix}_set_encrypt_key:
672 AARCH64_SIGN_LINK_REGISTER
673 stp x29,x30,[sp,#-16]!
674 mov w2,1
675 bl _${prefix}_set_key
676 ldp x29,x30,[sp],#16
677 AARCH64_VALIDATE_LINK_REGISTER
678 ret
679 .size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
680 ___
681 }}}
682
683 {{{
684 my ($key,$keys)=("x0","x1");
685 $code.=<<___;
686 .globl ${prefix}_set_decrypt_key
687 .type ${prefix}_set_decrypt_key,%function
688 .align 5
689 ${prefix}_set_decrypt_key:
690 AARCH64_SIGN_LINK_REGISTER
691 stp x29,x30,[sp,#-16]!
692 mov w2,0
693 bl _${prefix}_set_key
694 ldp x29,x30,[sp],#16
695 AARCH64_VALIDATE_LINK_REGISTER
696 ret
697 .size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
698 ___
699 }}}
700
701 {{{
702 sub gen_block () {
703 my $dir = shift;
704 my ($inp,$outp,$rk)=map("x$_",(0..2));
705
706 $code.=<<___;
707 .globl ${prefix}_${dir}crypt
708 .type ${prefix}_${dir}crypt,%function
709 .align 5
710 ${prefix}_${dir}crypt:
711 AARCH64_VALID_CALL_TARGET
712 ld1 {@data[0].4s},[$inp]
713 ___
714 &load_sbox();
715 &rev32(@data[0],@data[0]);
716 $code.=<<___;
717 mov $rks,$rk
718 ___
719 &encrypt_1blk(@data[0]);
720 $code.=<<___;
721 st1 {@data[0].4s},[$outp]
722 ret
723 .size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
724 ___
725 }
726 &gen_block("en");
727 &gen_block("de");
728 }}}
729
730 {{{
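# ECB: process eight blocks per iteration (two ld4/st4 pairs) while possible,
# then four, then a 1/2/3-block tail.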
731 $code.=<<___;
732 .globl ${prefix}_ecb_encrypt
733 .type ${prefix}_ecb_encrypt,%function
734 .align 5
735 ${prefix}_ecb_encrypt:
736 AARCH64_SIGN_LINK_REGISTER
737 // convert length into blocks
738 lsr x2,x2,4
739 stp d8,d9,[sp,#-80]!
740 stp d10,d11,[sp,#16]
741 stp d12,d13,[sp,#32]
742 stp d14,d15,[sp,#48]
743 stp x29,x30,[sp,#64]
744 ___
745 &load_sbox();
746 $code.=<<___;
747 .Lecb_8_blocks_process:
748 cmp $blocks,#8
749 b.lt .Lecb_4_blocks_process
750 ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
751 ld4 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
752 ___
753 &rev32(@data[0],@data[0]);
754 &rev32(@data[1],@data[1]);
755 &rev32(@data[2],@data[2]);
756 &rev32(@data[3],@data[3]);
757 &rev32(@datax[0],@datax[0]);
758 &rev32(@datax[1],@datax[1]);
759 &rev32(@datax[2],@datax[2]);
760 &rev32(@datax[3],@datax[3]);
761 $code.=<<___;
762 bl _${prefix}_enc_8blks
763 st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
764 st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
765 subs $blocks,$blocks,#8
766 b.gt .Lecb_8_blocks_process
767 b 100f
768 .Lecb_4_blocks_process:
769 cmp $blocks,#4
770 b.lt 1f
771 ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
772 ___
773 &rev32(@data[0],@data[0]);
774 &rev32(@data[1],@data[1]);
775 &rev32(@data[2],@data[2]);
776 &rev32(@data[3],@data[3]);
777 $code.=<<___;
778 bl _${prefix}_enc_4blks
779 st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
780 sub $blocks,$blocks,#4
781 1:
782 // process last block
783 cmp $blocks,#1
784 b.lt 100f
785 b.gt 1f
786 ld1 {@data[0].4s},[$inp]
787 ___
788 &rev32(@data[0],@data[0]);
789 &encrypt_1blk(@data[0]);
790 $code.=<<___;
791 st1 {@data[0].4s},[$outp]
792 b 100f
793 1: // process last 2 blocks
794 ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp],#16
795 ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$inp],#16
796 cmp $blocks,#2
797 b.gt 1f
798 ___
799 &rev32(@data[0],@data[0]);
800 &rev32(@data[1],@data[1]);
801 &rev32(@data[2],@data[2]);
802 &rev32(@data[3],@data[3]);
803 $code.=<<___;
804 bl _${prefix}_enc_4blks
805 st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
806 st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp]
807 b 100f
808 1: // process last 3 blocks
809 ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$inp],#16
810 ___
811 &rev32(@data[0],@data[0]);
812 &rev32(@data[1],@data[1]);
813 &rev32(@data[2],@data[2]);
814 &rev32(@data[3],@data[3]);
815 $code.=<<___;
816 bl _${prefix}_enc_4blks
817 st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
818 st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp],#16
819 st4 {@vtmp[0].s-@vtmp[3].s}[2],[$outp]
820 100:
821 ldp d10,d11,[sp,#16]
822 ldp d12,d13,[sp,#32]
823 ldp d14,d15,[sp,#48]
824 ldp x29,x30,[sp,#64]
825 ldp d8,d9,[sp],#80
826 AARCH64_VALIDATE_LINK_REGISTER
827 ret
828 .size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
829 ___
830 }}}
831
832 {{{
833 my ($len,$ivp,$enc)=("x2","x4","w5");
834 my $ivec0=("v3");
835 my $ivec1=("v15");
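# CBC: encryption is inherently serial (each block is chained into the next),
# so it is done one block at a time inside a lightly unrolled 4-block loop;
# decryption has no such dependency and is done 8 or 4 blocks in parallel,
# keeping the original ciphertext around for the final XOR and IV update.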
836
837 $code.=<<___;
838 .globl ${prefix}_cbc_encrypt
839 .type ${prefix}_cbc_encrypt,%function
840 .align 5
841 ${prefix}_cbc_encrypt:
842 AARCH64_VALID_CALL_TARGET
843 lsr $len,$len,4
844 ___
845 &load_sbox();
846 $code.=<<___;
847 cbz $enc,.Ldec
848 ld1 {$ivec0.4s},[$ivp]
849 .Lcbc_4_blocks_enc:
850 cmp $blocks,#4
851 b.lt 1f
852 ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
853 eor @data[0].16b,@data[0].16b,$ivec0.16b
854 ___
855 &rev32(@data[1],@data[1]);
856 &rev32(@data[0],@data[0]);
857 &rev32(@data[2],@data[2]);
858 &rev32(@data[3],@data[3]);
859 &encrypt_1blk_norev(@data[0]);
860 $code.=<<___;
861 eor @data[1].16b,@data[1].16b,@data[0].16b
862 ___
863 &encrypt_1blk_norev(@data[1]);
864 &rev32(@data[0],@data[0]);
865
866 $code.=<<___;
867 eor @data[2].16b,@data[2].16b,@data[1].16b
868 ___
869 &encrypt_1blk_norev(@data[2]);
870 &rev32(@data[1],@data[1]);
871 $code.=<<___;
872 eor @data[3].16b,@data[3].16b,@data[2].16b
873 ___
874 &encrypt_1blk_norev(@data[3]);
875 &rev32(@data[2],@data[2]);
876 &rev32(@data[3],@data[3]);
877 $code.=<<___;
878 orr $ivec0.16b,@data[3].16b,@data[3].16b
879 st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
880 subs $blocks,$blocks,#4
881 b.ne .Lcbc_4_blocks_enc
882 b 2f
883 1:
884 subs $blocks,$blocks,#1
885 b.lt 2f
886 ld1 {@data[0].4s},[$inp],#16
887 eor $ivec0.16b,$ivec0.16b,@data[0].16b
888 ___
889 &rev32($ivec0,$ivec0);
890 &encrypt_1blk($ivec0);
891 $code.=<<___;
892 st1 {$ivec0.4s},[$outp],#16
893 b 1b
894 2:
895 // save back IV
896 st1 {$ivec0.4s},[$ivp]
897 ret
898
899 .Ldec:
900 // decryption mode starts
901 AARCH64_SIGN_LINK_REGISTER
902 stp d8,d9,[sp,#-80]!
903 stp d10,d11,[sp,#16]
904 stp d12,d13,[sp,#32]
905 stp d14,d15,[sp,#48]
906 stp x29,x30,[sp,#64]
907 .Lcbc_8_blocks_dec:
908 cmp $blocks,#8
909 b.lt 1f
910 ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
911 add $ptr,$inp,#64
912 ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$ptr]
913 ___
914 &rev32(@data[0],@data[0]);
915 &rev32(@data[1],@data[1]);
916 &rev32(@data[2],@data[2]);
917 &rev32(@data[3],$data[3]);
918 &rev32(@datax[0],@datax[0]);
919 &rev32(@datax[1],@datax[1]);
920 &rev32(@datax[2],@datax[2]);
921 &rev32(@datax[3],$datax[3]);
922 $code.=<<___;
923 bl _${prefix}_enc_8blks
924 ___
925 &transpose(@vtmp,@datax);
926 &transpose(@data,@datax);
927 $code.=<<___;
928 ld1 {$ivec1.4s},[$ivp]
929 ld1 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
930 // note ivec1 and vtmpx[3] are reusing the same register
931 // care needs to be taken to avoid conflict
932 eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
933 ld1 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
934 eor @vtmp[1].16b,@vtmp[1].16b,@datax[0].16b
935 eor @vtmp[2].16b,@vtmp[2].16b,@datax[1].16b
936 eor @vtmp[3].16b,$vtmp[3].16b,@datax[2].16b
937 // save back IV
938 st1 {$vtmpx[3].4s}, [$ivp]
939 eor @data[0].16b,@data[0].16b,$datax[3].16b
940 eor @data[1].16b,@data[1].16b,@vtmpx[0].16b
941 eor @data[2].16b,@data[2].16b,@vtmpx[1].16b
942 eor @data[3].16b,$data[3].16b,@vtmpx[2].16b
943 st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
944 st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
945 subs $blocks,$blocks,#8
946 b.gt .Lcbc_8_blocks_dec
947 b.eq 100f
948 1:
949 ld1 {$ivec1.4s},[$ivp]
950 .Lcbc_4_blocks_dec:
951 cmp $blocks,#4
952 b.lt 1f
953 ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
954 ___
955 &rev32(@data[0],@data[0]);
956 &rev32(@data[1],@data[1]);
957 &rev32(@data[2],@data[2]);
958 &rev32(@data[3],$data[3]);
959 $code.=<<___;
960 bl _${prefix}_enc_4blks
961 ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
962 ___
963 &transpose(@vtmp,@datax);
964 $code.=<<___;
965 eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
966 eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
967 orr $ivec1.16b,@data[3].16b,@data[3].16b
968 eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
969 eor @vtmp[3].16b,$vtmp[3].16b,@data[2].16b
970 st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
971 subs $blocks,$blocks,#4
972 b.gt .Lcbc_4_blocks_dec
973 // save back IV
974 st1 {@data[3].4s}, [$ivp]
975 b 100f
976 1: // last block
977 subs $blocks,$blocks,#1
978 b.lt 100f
979 b.gt 1f
980 ld1 {@data[0].4s},[$inp],#16
981 // save back IV
982 st1 {$data[0].4s}, [$ivp]
983 ___
984 &rev32(@datax[0],@data[0]);
985 &encrypt_1blk(@datax[0]);
986 $code.=<<___;
987 eor @datax[0].16b,@datax[0].16b,$ivec1.16b
988 st1 {@datax[0].4s},[$outp],#16
989 b 100f
990 1: // last two blocks
991 ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp]
992 add $ptr,$inp,#16
993 ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$ptr],#16
994 subs $blocks,$blocks,1
995 b.gt 1f
996 ___
997 &rev32(@data[0],@data[0]);
998 &rev32(@data[1],@data[1]);
999 &rev32(@data[2],@data[2]);
1000 &rev32(@data[3],@data[3]);
1001 $code.=<<___;
1002 bl _${prefix}_enc_4blks
1003 ld1 {@data[0].4s,@data[1].4s},[$inp],#32
1004 ___
1005 &transpose(@vtmp,@datax);
1006 $code.=<<___;
1007 eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
1008 eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
1009 st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
1010 // save back IV
1011 st1 {@data[1].4s}, [$ivp]
1012 b 100f
1013 1: // last 3 blocks
1014 ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$ptr]
1015 ___
1016 &rev32(@data[0],@data[0]);
1017 &rev32(@data[1],@data[1]);
1018 &rev32(@data[2],@data[2]);
1019 &rev32(@data[3],@data[3]);
1020 $code.=<<___;
1021 bl _${prefix}_enc_4blks
1022 ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
1023 ___
1024 &transpose(@vtmp,@datax);
1025 $code.=<<___;
1026 eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
1027 eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
1028 eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
1029 st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
1030 // save back IV
1031 st1 {@data[2].4s}, [$ivp]
1032 100:
1033 ldp d10,d11,[sp,#16]
1034 ldp d12,d13,[sp,#32]
1035 ldp d14,d15,[sp,#48]
1036 ldp x29,x30,[sp,#64]
1037 ldp d8,d9,[sp],#80
1038 AARCH64_VALIDATE_LINK_REGISTER
1039 ret
1040 .size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
1041 ___
1042 }}}
1043
1044 {{{
1045 my ($ivp)=("x4");
1046 my ($ctr)=("w5");
1047 my $ivec=("v3");
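# CTR32: only the low 32-bit word of the counter (kept in $ctr) is
# incremented, matching the ctr32 convention.  A single block takes a fast
# path with no register spilling; otherwise blocks are processed 8 or 4 at a
# time with the counter words held in GPRs.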
1048
1049 $code.=<<___;
1050 .globl ${prefix}_ctr32_encrypt_blocks
1051 .type ${prefix}_ctr32_encrypt_blocks,%function
1052 .align 5
1053 ${prefix}_ctr32_encrypt_blocks:
1054 AARCH64_VALID_CALL_TARGET
1055 ld1 {$ivec.4s},[$ivp]
1056 ___
1057 &rev32($ivec,$ivec);
1058 &load_sbox();
1059 $code.=<<___;
1060 cmp $blocks,#1
1061 b.ne 1f
1062 // fast processing for a single block without
1063 // the context-saving overhead
1064 ___
1065 &encrypt_1blk($ivec);
1066 $code.=<<___;
1067 ld1 {@data[0].4s},[$inp]
1068 eor @data[0].16b,@data[0].16b,$ivec.16b
1069 st1 {@data[0].4s},[$outp]
1070 ret
1071 1:
1072 AARCH64_SIGN_LINK_REGISTER
1073 stp d8,d9,[sp,#-80]!
1074 stp d10,d11,[sp,#16]
1075 stp d12,d13,[sp,#32]
1076 stp d14,d15,[sp,#48]
1077 stp x29,x30,[sp,#64]
1078 mov $word0,$ivec.s[0]
1079 mov $word1,$ivec.s[1]
1080 mov $word2,$ivec.s[2]
1081 mov $ctr,$ivec.s[3]
1082 .Lctr32_4_blocks_process:
1083 cmp $blocks,#4
1084 b.lt 1f
1085 dup @data[0].4s,$word0
1086 dup @data[1].4s,$word1
1087 dup @data[2].4s,$word2
1088 mov @data[3].s[0],$ctr
1089 add $ctr,$ctr,#1
1090 mov $data[3].s[1],$ctr
1091 add $ctr,$ctr,#1
1092 mov @data[3].s[2],$ctr
1093 add $ctr,$ctr,#1
1094 mov @data[3].s[3],$ctr
1095 add $ctr,$ctr,#1
1096 cmp $blocks,#8
1097 b.ge .Lctr32_8_blocks_process
1098 bl _${prefix}_enc_4blks
1099 ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
1100 eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
1101 eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
1102 eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
1103 eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
1104 st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
1105 subs $blocks,$blocks,#4
1106 b.ne .Lctr32_4_blocks_process
1107 b 100f
1108 .Lctr32_8_blocks_process:
1109 dup @datax[0].4s,$word0
1110 dup @datax[1].4s,$word1
1111 dup @datax[2].4s,$word2
1112 mov @datax[3].s[0],$ctr
1113 add $ctr,$ctr,#1
1114 mov $datax[3].s[1],$ctr
1115 add $ctr,$ctr,#1
1116 mov @datax[3].s[2],$ctr
1117 add $ctr,$ctr,#1
1118 mov @datax[3].s[3],$ctr
1119 add $ctr,$ctr,#1
1120 bl _${prefix}_enc_8blks
1121 ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
1122 ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
1123 eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
1124 eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
1125 eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
1126 eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
1127 eor @data[0].16b,@data[0].16b,@datax[0].16b
1128 eor @data[1].16b,@data[1].16b,@datax[1].16b
1129 eor @data[2].16b,@data[2].16b,@datax[2].16b
1130 eor @data[3].16b,@data[3].16b,@datax[3].16b
1131 st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
1132 st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
1133 subs $blocks,$blocks,#8
1134 b.ne .Lctr32_4_blocks_process
1135 b 100f
1136 1: // last block processing
1137 subs $blocks,$blocks,#1
1138 b.lt 100f
1139 b.gt 1f
1140 mov $ivec.s[0],$word0
1141 mov $ivec.s[1],$word1
1142 mov $ivec.s[2],$word2
1143 mov $ivec.s[3],$ctr
1144 ___
1145 &encrypt_1blk($ivec);
1146 $code.=<<___;
1147 ld1 {@data[0].4s},[$inp]
1148 eor @data[0].16b,@data[0].16b,$ivec.16b
1149 st1 {@data[0].4s},[$outp]
1150 b 100f
1151 1: // last 2 blocks processing
1152 dup @data[0].4s,$word0
1153 dup @data[1].4s,$word1
1154 dup @data[2].4s,$word2
1155 mov @data[3].s[0],$ctr
1156 add $ctr,$ctr,#1
1157 mov @data[3].s[1],$ctr
1158 subs $blocks,$blocks,#1
1159 b.ne 1f
1160 bl _${prefix}_enc_4blks
1161 ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
1162 ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
1163 eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
1164 eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
1165 eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
1166 eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
1167 st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
1168 st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
1169 b 100f
1170 1: // last 3 blocks processing
1171 add $ctr,$ctr,#1
1172 mov @data[3].s[2],$ctr
1173 bl _${prefix}_enc_4blks
1174 ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
1175 ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
1176 ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[2],[$inp],#16
1177 eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
1178 eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
1179 eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
1180 eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
1181 st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
1182 st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
1183 st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[2],[$outp],#16
1184 100:
1185 ldp d10,d11,[sp,#16]
1186 ldp d12,d13,[sp,#32]
1187 ldp d14,d15,[sp,#48]
1188 ldp x29,x30,[sp,#64]
1189 ldp d8,d9,[sp],#80
1190 AARCH64_VALIDATE_LINK_REGISTER
1191 ret
1192 .size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
1193 ___
1194 }}}
1195
1196
1197 {{{
1198 my ($blocks,$len)=("x2","x2");
1199 my $ivp=("x5");
1200 my @twx=map("x$_",(12..27));
1201 my ($rks1,$rks2)=("x26","x27");
1202 my $lastBlk=("x26");
1203 my $enc=("w28");
1204 my $remain=("x29");
1205
1206 my @tweak=map("v$_",(16..23));
1207 my $lastTweak=("v25");
1208
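# gen_xts_cipher emits two XTS flavours: ${prefix}_xts_encrypt and
# ${prefix}_xts_encrypt_gb; the _gb variant bit-reverses the tweak (rbit)
# around the GF(2^128) multiplication.  x3/x4 carry the data-key and tweak-key
# schedules, w6 selects encrypt (1) or decrypt (0), and a trailing partial
# block is handled with ciphertext stealing in .last_2blks_tweak /
# .only_2blks_tweak.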
1209 sub gen_xts_cipher() {
1210 my $std = shift;
1211 $code.=<<___;
1212 .globl ${prefix}_xts_encrypt${std}
1213 .type ${prefix}_xts_encrypt${std},%function
1214 .align 5
1215 ${prefix}_xts_encrypt${std}:
1216 AARCH64_SIGN_LINK_REGISTER
1217 stp x15, x16, [sp, #-0x10]!
1218 stp x17, x18, [sp, #-0x10]!
1219 stp x19, x20, [sp, #-0x10]!
1220 stp x21, x22, [sp, #-0x10]!
1221 stp x23, x24, [sp, #-0x10]!
1222 stp x25, x26, [sp, #-0x10]!
1223 stp x27, x28, [sp, #-0x10]!
1224 stp x29, x30, [sp, #-0x10]!
1225 stp d8, d9, [sp, #-0x10]!
1226 stp d10, d11, [sp, #-0x10]!
1227 stp d12, d13, [sp, #-0x10]!
1228 stp d14, d15, [sp, #-0x10]!
1229 mov $rks1,x3
1230 mov $rks2,x4
1231 mov $enc,w6
1232 ld1 {@tweak[0].4s}, [$ivp]
1233 mov $rks,$rks2
1234 ___
1235 &load_sbox();
1236 &rev32(@tweak[0],@tweak[0]);
1237 &encrypt_1blk(@tweak[0]);
1238 $code.=<<___;
1239 mov $rks,$rks1
1240 and $remain,$len,#0x0F
1241 // convert length into blocks
1242 lsr $blocks,$len,4
1243 cmp $blocks,#1
1244 b.lt .return${std}
1245
1246 cmp $remain,0
1247 // If the encryption/decryption length is a multiple of 16,
1248 // all blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
1249 b.eq .xts_encrypt_blocks${std}
1250
1251 // If the encryption/decryption length is not a multiple of 16,
1252 // the last two blocks are encrypted/decrypted in .last_2blks_tweak${std} or .only_2blks_tweak${std}
1253 // and the remaining blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
1254 subs $blocks,$blocks,#1
1255 b.eq .only_2blks_tweak${std}
1256 .xts_encrypt_blocks${std}:
1257 ___
1258 &rbit(@tweak[0],@tweak[0],$std);
1259 &rev32_armeb(@tweak[0],@tweak[0]);
1260 &mov_vec_to_reg(@tweak[0],@twx[0],@twx[1]);
1261 &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
1262 &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
1263 &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
1264 &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
1265 &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
1266 &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
1267 &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
1268 $code.=<<___;
1269 .Lxts_8_blocks_process${std}:
1270 cmp $blocks,#8
1271 ___
1272 &mov_reg_to_vec(@twx[0],@twx[1],@tweak[0]);
1273 &compute_tweak(@twx[14],@twx[15],@twx[0],@twx[1]);
1274 &mov_reg_to_vec(@twx[2],@twx[3],@tweak[1]);
1275 &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
1276 &mov_reg_to_vec(@twx[4],@twx[5],@tweak[2]);
1277 &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
1278 &mov_reg_to_vec(@twx[6],@twx[7],@tweak[3]);
1279 &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
1280 &mov_reg_to_vec(@twx[8],@twx[9],@tweak[4]);
1281 &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
1282 &mov_reg_to_vec(@twx[10],@twx[11],@tweak[5]);
1283 &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
1284 &mov_reg_to_vec(@twx[12],@twx[13],@tweak[6]);
1285 &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
1286 &mov_reg_to_vec(@twx[14],@twx[15],@tweak[7]);
1287 &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
1288 $code.=<<___;
1289 b.lt .Lxts_4_blocks_process${std}
1290 ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
1291 ___
1292 &rbit(@tweak[0],@tweak[0],$std);
1293 &rbit(@tweak[1],@tweak[1],$std);
1294 &rbit(@tweak[2],@tweak[2],$std);
1295 &rbit(@tweak[3],@tweak[3],$std);
1296 $code.=<<___;
1297 eor @data[0].16b, @data[0].16b, @tweak[0].16b
1298 eor @data[1].16b, @data[1].16b, @tweak[1].16b
1299 eor @data[2].16b, @data[2].16b, @tweak[2].16b
1300 eor @data[3].16b, @data[3].16b, @tweak[3].16b
1301 ld1 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
1302 ___
1303 &rbit(@tweak[4],@tweak[4],$std);
1304 &rbit(@tweak[5],@tweak[5],$std);
1305 &rbit(@tweak[6],@tweak[6],$std);
1306 &rbit(@tweak[7],@tweak[7],$std);
1307 $code.=<<___;
1308 eor @datax[0].16b, @datax[0].16b, @tweak[4].16b
1309 eor @datax[1].16b, @datax[1].16b, @tweak[5].16b
1310 eor @datax[2].16b, @datax[2].16b, @tweak[6].16b
1311 eor @datax[3].16b, @datax[3].16b, @tweak[7].16b
1312 ___
1313 &rev32(@data[0],@data[0]);
1314 &rev32(@data[1],@data[1]);
1315 &rev32(@data[2],@data[2]);
1316 &rev32(@data[3],@data[3]);
1317 &rev32(@datax[0],@datax[0]);
1318 &rev32(@datax[1],@datax[1]);
1319 &rev32(@datax[2],@datax[2]);
1320 &rev32(@datax[3],@datax[3]);
1321 &transpose(@data,@vtmp);
1322 &transpose(@datax,@vtmp);
1323 $code.=<<___;
1324 bl _${prefix}_enc_8blks
1325 ___
1326 &transpose(@vtmp,@datax);
1327 &transpose(@data,@datax);
1328 $code.=<<___;
1329 eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
1330 eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
1331 eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
1332 eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b
1333 eor @data[0].16b, @data[0].16b, @tweak[4].16b
1334 eor @data[1].16b, @data[1].16b, @tweak[5].16b
1335 eor @data[2].16b, @data[2].16b, @tweak[6].16b
1336 eor @data[3].16b, @data[3].16b, @tweak[7].16b
1337
1338 // save the last tweak
1339 mov $lastTweak.16b,@tweak[7].16b
1340 st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
1341 st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
1342 subs $blocks,$blocks,#8
1343 b.gt .Lxts_8_blocks_process${std}
1344 b 100f
1345 .Lxts_4_blocks_process${std}:
1346 cmp $blocks,#4
1347 b.lt 1f
1348 ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
1349 ___
1350 &rbit(@tweak[0],@tweak[0],$std);
1351 &rbit(@tweak[1],@tweak[1],$std);
1352 &rbit(@tweak[2],@tweak[2],$std);
1353 &rbit(@tweak[3],@tweak[3],$std);
1354 $code.=<<___;
1355 eor @data[0].16b, @data[0].16b, @tweak[0].16b
1356 eor @data[1].16b, @data[1].16b, @tweak[1].16b
1357 eor @data[2].16b, @data[2].16b, @tweak[2].16b
1358 eor @data[3].16b, @data[3].16b, @tweak[3].16b
1359 ___
1360 &rev32(@data[0],@data[0]);
1361 &rev32(@data[1],@data[1]);
1362 &rev32(@data[2],@data[2]);
1363 &rev32(@data[3],@data[3]);
1364 &transpose(@data,@vtmp);
1365 $code.=<<___;
1366 bl _${prefix}_enc_4blks
1367 ___
1368 &transpose(@vtmp,@data);
1369 $code.=<<___;
1370 eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
1371 eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
1372 eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
1373 eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b
1374 st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
1375 sub $blocks,$blocks,#4
1376 mov @tweak[0].16b,@tweak[4].16b
1377 mov @tweak[1].16b,@tweak[5].16b
1378 mov @tweak[2].16b,@tweak[6].16b
1379 // save the last tweak
1380 mov $lastTweak.16b,@tweak[3].16b
1381 1:
1382 // process last block
1383 cmp $blocks,#1
1384 b.lt 100f
1385 b.gt 1f
1386 ld1 {@data[0].4s},[$inp],#16
1387 ___
1388 &rbit(@tweak[0],@tweak[0],$std);
1389 $code.=<<___;
1390 eor @data[0].16b, @data[0].16b, @tweak[0].16b
1391 ___
1392 &rev32(@data[0],@data[0]);
1393 &encrypt_1blk(@data[0]);
1394 $code.=<<___;
1395 eor @data[0].16b, @data[0].16b, @tweak[0].16b
1396 st1 {@data[0].4s},[$outp],#16
1397 // save the last tweak
1398 mov $lastTweak.16b,@tweak[0].16b
1399 b 100f
1400 1: // process last 2 blocks
1401 cmp $blocks,#2
1402 b.gt 1f
1403 ld1 {@data[0].4s,@data[1].4s},[$inp],#32
1404 ___
1405 &rbit(@tweak[0],@tweak[0],$std);
1406 &rbit(@tweak[1],@tweak[1],$std);
1407 $code.=<<___;
1408 eor @data[0].16b, @data[0].16b, @tweak[0].16b
1409 eor @data[1].16b, @data[1].16b, @tweak[1].16b
1410 ___
1411 &rev32(@data[0],@data[0]);
1412 &rev32(@data[1],@data[1]);
1413 &transpose(@data,@vtmp);
1414 $code.=<<___;
1415 bl _${prefix}_enc_4blks
1416 ___
1417 &transpose(@vtmp,@data);
1418 $code.=<<___;
1419 eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
1420 eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
1421 st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
1422 // save the last tweak
1423 mov $lastTweak.16b,@tweak[1].16b
1424 b 100f
1425 1: // process last 3 blocks
1426 ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
1427 ___
1428 &rbit(@tweak[0],@tweak[0],$std);
1429 &rbit(@tweak[1],@tweak[1],$std);
1430 &rbit(@tweak[2],@tweak[2],$std);
1431 $code.=<<___;
1432 eor @data[0].16b, @data[0].16b, @tweak[0].16b
1433 eor @data[1].16b, @data[1].16b, @tweak[1].16b
1434 eor @data[2].16b, @data[2].16b, @tweak[2].16b
1435 ___
1436 &rev32(@data[0],@data[0]);
1437 &rev32(@data[1],@data[1]);
1438 &rev32(@data[2],@data[2]);
1439 &transpose(@data,@vtmp);
1440 $code.=<<___;
1441 bl _${prefix}_enc_4blks
1442 ___
1443 &transpose(@vtmp,@data);
1444 $code.=<<___;
1445 eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
1446 eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
1447 eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
1448 st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
1449 // save the last tweak
1450 mov $lastTweak.16b,@tweak[2].16b
1451 100:
1452 cmp $remain,0
1453 b.eq .return${std}
1454
1455 // This branch calculates the last two tweaks,
1456 // when the encryption/decryption length is larger than 32
1457 .last_2blks_tweak${std}:
1458 ___
1459 &rev32_armeb($lastTweak,$lastTweak);
1460 &compute_tweak_vec($lastTweak,@tweak[1],$std);
1461 &compute_tweak_vec(@tweak[1],@tweak[2],$std);
1462 $code.=<<___;
1463 b .check_dec${std}
1464
1465
1466 // This branch calculates the last two tweaks,
1467 // when the encryption/decryption length is exactly 32, which only needs two tweaks
1468 .only_2blks_tweak${std}:
1469 mov @tweak[1].16b,@tweak[0].16b
1470 ___
1471 &rev32_armeb(@tweak[1],@tweak[1]);
1472 &compute_tweak_vec(@tweak[1],@tweak[2]);
1473 $code.=<<___;
1474 b .check_dec${std}
1475
1476
1477 // Determine whether encryption or decryption is required.
1478 // The last two tweaks need to be swapped for decryption.
1479 .check_dec${std}:
1480 // encryption:1 decryption:0
1481 cmp $enc,1
1482 b.eq .prcess_last_2blks${std}
1483 mov @vtmp[0].16b,@tweak[1].16b
1484 mov @tweak[1].16b,@tweak[2].16b
1485 mov @tweak[2].16b,@vtmp[0].16b
1486
1487 .prcess_last_2blks${std}:
1488 ___
1489 &rev32_armeb(@tweak[1],@tweak[1]);
1490 &rev32_armeb(@tweak[2],@tweak[2]);
1491 $code.=<<___;
1492 ld1 {@data[0].4s},[$inp],#16
1493 eor @data[0].16b, @data[0].16b, @tweak[1].16b
1494 ___
1495 &rev32(@data[0],@data[0]);
1496 &encrypt_1blk(@data[0]);
1497 $code.=<<___;
1498 eor @data[0].16b, @data[0].16b, @tweak[1].16b
1499 st1 {@data[0].4s},[$outp],#16
1500
1501 sub $lastBlk,$outp,16
1502 .loop${std}:
1503 subs $remain,$remain,1
1504 ldrb $wtmp0,[$lastBlk,$remain]
1505 ldrb $wtmp1,[$inp,$remain]
1506 strb $wtmp1,[$lastBlk,$remain]
1507 strb $wtmp0,[$outp,$remain]
1508 b.gt .loop${std}
1509 ld1 {@data[0].4s}, [$lastBlk]
1510 eor @data[0].16b, @data[0].16b, @tweak[2].16b
1511 ___
1512 &rev32(@data[0],@data[0]);
1513 &encrypt_1blk(@data[0]);
1514 $code.=<<___;
1515 eor @data[0].16b, @data[0].16b, @tweak[2].16b
1516 st1 {@data[0].4s}, [$lastBlk]
1517 .return${std}:
1518 ldp d14, d15, [sp], #0x10
1519 ldp d12, d13, [sp], #0x10
1520 ldp d10, d11, [sp], #0x10
1521 ldp d8, d9, [sp], #0x10
1522 ldp x29, x30, [sp], #0x10
1523 ldp x27, x28, [sp], #0x10
1524 ldp x25, x26, [sp], #0x10
1525 ldp x23, x24, [sp], #0x10
1526 ldp x21, x22, [sp], #0x10
1527 ldp x19, x20, [sp], #0x10
1528 ldp x17, x18, [sp], #0x10
1529 ldp x15, x16, [sp], #0x10
1530 AARCH64_VALIDATE_LINK_REGISTER
1531 ret
1532 .size ${prefix}_xts_encrypt${std},.-${prefix}_xts_encrypt${std}
1533 ___
1534 } # end of gen_xts_cipher
1535 &gen_xts_cipher("_gb");
1536 &gen_xts_cipher("");
1537 }}}
1538
1539 ########################################
1540 open SELF,$0;
1541 while(<SELF>) {
1542 next if (/^#!/);
1543 last if (!s/^#/\/\// and !/^$/);
1544 print;
1545 }
1546 close SELF;
1547
1548 foreach(split("\n",$code)) {
1549 s/\`([^\`]*)\`/eval($1)/ge;
1550 print $_,"\n";
1551 }
1552
1553 close STDOUT or die "error closing STDOUT: $!";