2 # Copyright 2020-2023 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # This module implements SM4 with ASIMD on aarch64
# Command-line handling shared by all OpenSSL perlasm scripts:
#   $output  - last argument, when it looks like a filename (has an extension)
#   $flavour - first argument, when it does not look like a filename
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Locate the arm-xlate.pl translator either next to this script or in the
# shared perlasm directory, then pipe everything we print through it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/;
$dir = $1;
$xlate = "${dir}arm-xlate.pl";
if (!-f $xlate) {
    $xlate = "${dir}../../perlasm/arm-xlate.pl";
}
-f $xlate or die "can't locate arm-xlate.pl";

open OUT, "| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
# Register allocation for the ASIMD SM4 implementation.
my @vtmp  = ("v0", "v1", "v2", "v3");        # scratch vectors
my @qtmp  = ("q0", "q1", "q2", "q3");        # q-form aliases of @vtmp
my @data  = ("v4", "v5", "v6", "v7");        # first four blocks of data
my @datax = ("v8", "v9", "v10", "v11");      # second four blocks of data
my ($rk0, $rk1) = ("v12", "v13");            # round keys
my ($rka, $rkb) = ("v14", "v15");            # round-key accumulators
# NOTE: @vtmpx deliberately reuses v12-v15, the same registers as
# $rk0/$rk1/$rka/$rkb; the two sets must never be live simultaneously.
my @vtmpx = ("v12", "v13", "v14", "v15");
my @sbox  = map { "v$_" } (16 .. 31);        # SM4 sbox held in 16 vectors
# Scalar registers: arguments and temporaries.  $tmpw/$tmp are the w/x views
# of register 6, and $wtmp1/$xtmp1 ($wtmp2/$xtmp2) both name register 8 (9).
my ($inp, $outp, $blocks, $rks) = ("x0", "x1", "w2", "x3");
my ($tmpw, $tmp, $wtmp0, $wtmp1, $wtmp2) = ("w6", "x6", "w7", "w8", "w9");
my ($xtmp1, $xtmp2) = ("x8", "x9");
my ($ptr, $counter) = ("x10", "w11");
my ($word0, $word1, $word2, $word3) = ("w12", "w13", "w14", "w15");
48 if ($src and ("$src" ne "$dst")) {
51 rev32
$dst.16b
,$src.16b
59 rev32
$dst.16b
,$dst.16b
69 if ($src and ("$src" ne "$dst")) {
72 rev32
$dst.16b
,$src.16b
80 rev32
$dst.16b
,$dst.16b
91 if ($src and ("$src" ne "$dst")) {
94 rbit
$dst.16b
,$src.16b
104 rbit
$dst.16b
,$src.16b
111 my ($dat0,$dat1,$dat2,$dat3,$vt0,$vt1,$vt2,$vt3) = @_;
114 zip1
$vt0.4s
,$dat0.4s
,$dat1.4s
115 zip2
$vt1.4s
,$dat0.4s
,$dat1.4s
116 zip1
$vt2.4s
,$dat2.4s
,$dat3.4s
117 zip2
$vt3.4s
,$dat2.4s
,$dat3.4s
118 zip1
$dat0.2d
,$vt0.2d
,$vt2.2d
119 zip2
$dat1.2d
,$vt0.2d
,$vt2.2d
120 zip1
$dat2.2d
,$vt1.2d
,$vt3.2d
121 zip2
$dat3.2d
,$vt1.2d
,$vt3.2d
# sbox operations for 4 lanes of words
130 movi
@vtmp[0].16b
,#64
131 movi
@vtmp[1].16b
,#128
132 movi
@vtmp[2].16b
,#192
133 sub @vtmp[0].16b
,$dat.16b
,@vtmp[0].16b
134 sub @vtmp[1].16b
,$dat.16b
,@vtmp[1].16b
135 sub @vtmp[2].16b
,$dat.16b
,@vtmp[2].16b
136 tbl
$dat.16b
,{@sbox[0].16b
,@sbox[1].16b
,@sbox[2].16b
,@sbox[3].16b
},$dat.16b
137 tbl
@vtmp[0].16b
,{@sbox[4].16b
,@sbox[5].16b
,@sbox[6].16b
,@sbox[7].16b
},@vtmp[0].16b
138 tbl
@vtmp[1].16b
,{@sbox[8].16b
,@sbox[9].16b
,@sbox[10].16b
,@sbox[11].16b
},@vtmp[1].16b
139 tbl
@vtmp[2].16b
,{@sbox[12].16b
,@sbox[13].16b
,@sbox[14].16b
,@sbox[15].16b
},@vtmp[2].16b
140 add
@vtmp[0].2d
,@vtmp[0].2d
,@vtmp[1].2d
141 add
@vtmp[2].2d
,@vtmp[2].2d
,$dat.2d
142 add
$dat.2d
,@vtmp[0].2d
,@vtmp[2].2d
144 ushr
@vtmp[0].4s
,$dat.4s
,32-2
145 sli
@vtmp[0].4s
,$dat.4s
,2
146 ushr
@vtmp[2].4s
,$dat.4s
,32-10
147 eor
@vtmp[1].16b
,@vtmp[0].16b
,$dat.16b
148 sli
@vtmp[2].4s
,$dat.4s
,10
149 eor
@vtmp[1].16b
,@vtmp[2].16b
,$vtmp[1].16b
150 ushr
@vtmp[0].4s
,$dat.4s
,32-18
151 sli
@vtmp[0].4s
,$dat.4s
,18
152 ushr
@vtmp[2].4s
,$dat.4s
,32-24
153 eor
@vtmp[1].16b
,@vtmp[0].16b
,$vtmp[1].16b
154 sli
@vtmp[2].4s
,$dat.4s
,24
155 eor
$dat.16b
,@vtmp[2].16b
,@vtmp[1].16b
# sbox operation for 8 lanes of words
165 movi
@vtmp[3].16b
,#64
166 sub @vtmp[0].16b
,$dat.16b
,@vtmp[3].16b
167 sub @vtmp[1].16b
,@vtmp[0].16b
,@vtmp[3].16b
168 sub @vtmp[2].16b
,@vtmp[1].16b
,@vtmp[3].16b
169 tbl
$dat.16b
,{@sbox[0].16b
,@sbox[1].16b
,@sbox[2].16b
,@sbox[3].16b
},$dat.16b
170 tbl
@vtmp[0].16b
,{@sbox[4].16b
,@sbox[5].16b
,@sbox[6].16b
,@sbox[7].16b
},@vtmp[0].16b
171 tbl
@vtmp[1].16b
,{@sbox[8].16b
,@sbox[9].16b
,@sbox[10].16b
,@sbox[11].16b
},@vtmp[1].16b
172 tbl
@vtmp[2].16b
,{@sbox[12].16b
,@sbox[13].16b
,@sbox[14].16b
,@sbox[15].16b
},@vtmp[2].16b
173 add
@vtmp[1].2d
,@vtmp[0].2d
,@vtmp[1].2d
174 add
$dat.2d
,@vtmp[2].2d
,$dat.2d
175 add
$dat.2d
,@vtmp[1].2d
,$dat.2d
177 sub @vtmp[0].16b
,$datx.16b
,@vtmp[3].16b
178 sub @vtmp[1].16b
,@vtmp[0].16b
,@vtmp[3].16b
179 sub @vtmp[2].16b
,@vtmp[1].16b
,@vtmp[3].16b
180 tbl
$datx.16b
,{@sbox[0].16b
,@sbox[1].16b
,@sbox[2].16b
,@sbox[3].16b
},$datx.16b
181 tbl
@vtmp[0].16b
,{@sbox[4].16b
,@sbox[5].16b
,@sbox[6].16b
,@sbox[7].16b
},@vtmp[0].16b
182 tbl
@vtmp[1].16b
,{@sbox[8].16b
,@sbox[9].16b
,@sbox[10].16b
,@sbox[11].16b
},@vtmp[1].16b
183 tbl
@vtmp[2].16b
,{@sbox[12].16b
,@sbox[13].16b
,@sbox[14].16b
,@sbox[15].16b
},@vtmp[2].16b
184 add
@vtmp[1].2d
,@vtmp[0].2d
,@vtmp[1].2d
185 add
$datx.2d
,@vtmp[2].2d
,$datx.2d
186 add
$datx.2d
,@vtmp[1].2d
,$datx.2d
188 ushr
@vtmp[0].4s
,$dat.4s
,32-2
189 sli
@vtmp[0].4s
,$dat.4s
,2
190 ushr
@vtmp[2].4s
,$datx.4s
,32-2
191 eor
@vtmp[1].16b
,@vtmp[0].16b
,$dat.16b
192 sli
@vtmp[2].4s
,$datx.4s
,2
194 ushr
@vtmp[0].4s
,$dat.4s
,32-10
195 eor
@vtmp[3].16b
,@vtmp[2].16b
,$datx.16b
196 sli
@vtmp[0].4s
,$dat.4s
,10
197 ushr
@vtmp[2].4s
,$datx.4s
,32-10
198 eor
@vtmp[1].16b
,@vtmp[0].16b
,$vtmp[1].16b
199 sli
@vtmp[2].4s
,$datx.4s
,10
201 ushr
@vtmp[0].4s
,$dat.4s
,32-18
202 eor
@vtmp[3].16b
,@vtmp[2].16b
,$vtmp[3].16b
203 sli
@vtmp[0].4s
,$dat.4s
,18
204 ushr
@vtmp[2].4s
,$datx.4s
,32-18
205 eor
@vtmp[1].16b
,@vtmp[0].16b
,$vtmp[1].16b
206 sli
@vtmp[2].4s
,$datx.4s
,18
208 ushr
@vtmp[0].4s
,$dat.4s
,32-24
209 eor
@vtmp[3].16b
,@vtmp[2].16b
,$vtmp[3].16b
210 sli
@vtmp[0].4s
,$dat.4s
,24
211 ushr
@vtmp[2].4s
,$datx.4s
,32-24
212 eor
$dat.16b
,@vtmp[0].16b
,@vtmp[1].16b
213 sli
@vtmp[2].4s
,$datx.4s
,24
214 eor
$datx.16b
,@vtmp[2].16b
,@vtmp[3].16b
# sbox operation for a single word
223 movi
@vtmp[1].16b
,#64
224 movi
@vtmp[2].16b
,#128
225 movi
@vtmp[3].16b
,#192
226 mov
@vtmp[0].s
[0],$word
228 sub @vtmp[1].16b
,@vtmp[0].16b
,@vtmp[1].16b
229 sub @vtmp[2].16b
,@vtmp[0].16b
,@vtmp[2].16b
230 sub @vtmp[3].16b
,@vtmp[0].16b
,@vtmp[3].16b
232 tbl
@vtmp[0].16b
,{@sbox[0].16b
,@sbox[1].16b
,@sbox[2].16b
,@sbox[3].16b
},@vtmp[0].16b
233 tbl
@vtmp[1].16b
,{@sbox[4].16b
,@sbox[5].16b
,@sbox[6].16b
,@sbox[7].16b
},@vtmp[1].16b
234 tbl
@vtmp[2].16b
,{@sbox[8].16b
,@sbox[9].16b
,@sbox[10].16b
,@sbox[11].16b
},@vtmp[2].16b
235 tbl
@vtmp[3].16b
,{@sbox[12].16b
,@sbox[13].16b
,@sbox[14].16b
,@sbox[15].16b
},@vtmp[3].16b
237 mov
$word,@vtmp[0].s
[0]
238 mov
$wtmp0,@vtmp[1].s
[0]
239 mov
$wtmp2,@vtmp[2].s
[0]
240 add
$wtmp0,$word,$wtmp0
241 mov
$word,@vtmp[3].s
[0]
242 add
$wtmp0,$wtmp0,$wtmp2
243 add
$wtmp0,$wtmp0,$word
245 eor
$word,$wtmp0,$wtmp0,ror
#32-2
246 eor
$word,$word,$wtmp0,ror
#32-10
247 eor
$word,$word,$wtmp0,ror
#32-18
248 eor
$word,$word,$wtmp0,ror
#32-24
252 # sm4 for one block of data, in scalar registers word0/word1/word2/word3
257 ldp
$wtmp0,$wtmp1,[$kptr],8
258 // B0
^= SBOX
(B1
^ B2
^ B3
^ RK0
)
259 eor
$tmpw,$word2,$word3
260 eor
$wtmp2,$wtmp0,$word1
261 eor
$tmpw,$tmpw,$wtmp2
265 eor
$word0,$word0,$tmpw
266 // B1
^= SBOX
(B0
^ B2
^ B3
^ RK1
)
267 eor
$tmpw,$word2,$word3
268 eor
$wtmp2,$word0,$wtmp1
269 eor
$tmpw,$tmpw,$wtmp2
273 ldp
$wtmp0,$wtmp1,[$kptr],8
274 eor
$word1,$word1,$tmpw
275 // B2
^= SBOX
(B0
^ B1
^ B3
^ RK2
)
276 eor
$tmpw,$word0,$word1
277 eor
$wtmp2,$wtmp0,$word3
278 eor
$tmpw,$tmpw,$wtmp2
282 eor
$word2,$word2,$tmpw
283 // B3
^= SBOX
(B0
^ B1
^ B2
^ RK3
)
284 eor
$tmpw,$word0,$word1
285 eor
$wtmp2,$word2,$wtmp1
286 eor
$tmpw,$tmpw,$wtmp2
290 eor
$word3,$word3,$tmpw
# sm4 for 4 lanes of data, in neon registers data0/data1/data2/data3
299 ldp
$wtmp0,$wtmp1,[$kptr],8
303 // B0
^= SBOX
(B1
^ B2
^ B3
^ RK0
)
304 eor
$rka.16b
,@data[2].16b
,@data[3].16b
305 eor
$rk0.16b
,@data[1].16b
,$rk0.16b
306 eor
$rk0.16b
,$rka.16b
,$rk0.16b
310 eor
@data[0].16b
,@data[0].16b
,$rk0.16b
312 // B1
^= SBOX
(B0
^ B2
^ B3
^ RK1
)
313 eor
$rka.16b
,$rka.16b
,@data[0].16b
314 eor
$rk1.16b
,$rka.16b
,$rk1.16b
318 ldp
$wtmp0,$wtmp1,[$kptr],8
319 eor
@data[1].16b
,@data[1].16b
,$rk1.16b
324 // B2
^= SBOX
(B0
^ B1
^ B3
^ RK2
)
325 eor
$rka.16b
,@data[0].16b
,@data[1].16b
326 eor
$rk0.16b
,@data[3].16b
,$rk0.16b
327 eor
$rk0.16b
,$rka.16b
,$rk0.16b
331 eor
@data[2].16b
,@data[2].16b
,$rk0.16b
333 // B3
^= SBOX
(B0
^ B1
^ B2
^ RK3
)
334 eor
$rka.16b
,$rka.16b
,@data[2].16b
335 eor
$rk1.16b
,$rka.16b
,$rk1.16b
339 eor
@data[3].16b
,@data[3].16b
,$rk1.16b
343 # sm4 for 8 lanes of data, in neon registers
344 # data0/data1/data2/data3 datax0/datax1/datax2/datax3
349 ldp
$wtmp0,$wtmp1,[$kptr],8
350 // B0
^= SBOX
(B1
^ B2
^ B3
^ RK0
)
352 eor
$rka.16b
,@data[2].16b
,@data[3].16b
353 eor
$rkb.16b
,@datax[2].16b
,@datax[3].16b
354 eor
@vtmp[0].16b
,@data[1].16b
,$rk0.16b
355 eor
@vtmp[1].16b
,@datax[1].16b
,$rk0.16b
356 eor
$rk0.16b
,$rka.16b
,@vtmp[0].16b
357 eor
$rk1.16b
,$rkb.16b
,@vtmp[1].16b
359 &sbox_double
($rk0,$rk1);
361 eor
@data[0].16b
,@data[0].16b
,$rk0.16b
362 eor
@datax[0].16b
,@datax[0].16b
,$rk1.16b
364 // B1
^= SBOX
(B0
^ B2
^ B3
^ RK1
)
366 eor
$rka.16b
,$rka.16b
,@data[0].16b
367 eor
$rkb.16b
,$rkb.16b
,@datax[0].16b
368 eor
$rk0.16b
,$rka.16b
,$rk1.16b
369 eor
$rk1.16b
,$rkb.16b
,$rk1.16b
371 &sbox_double
($rk0,$rk1);
373 ldp
$wtmp0,$wtmp1,[$kptr],8
374 eor
@data[1].16b
,@data[1].16b
,$rk0.16b
375 eor
@datax[1].16b
,@datax[1].16b
,$rk1.16b
377 // B2
^= SBOX
(B0
^ B1
^ B3
^ RK2
)
379 eor
$rka.16b
,@data[0].16b
,@data[1].16b
380 eor
$rkb.16b
,@datax[0].16b
,@datax[1].16b
381 eor
@vtmp[0].16b
,@data[3].16b
,$rk0.16b
382 eor
@vtmp[1].16b
,@datax[3].16b
,$rk0.16b
383 eor
$rk0.16b
,$rka.16b
,@vtmp[0].16b
384 eor
$rk1.16b
,$rkb.16b
,@vtmp[1].16b
386 &sbox_double
($rk0,$rk1);
388 eor
@data[2].16b
,@data[2].16b
,$rk0.16b
389 eor
@datax[2].16b
,@datax[2].16b
,$rk1.16b
391 // B3
^= SBOX
(B0
^ B1
^ B2
^ RK3
)
393 eor
$rka.16b
,$rka.16b
,@data[2].16b
394 eor
$rkb.16b
,$rkb.16b
,@datax[2].16b
395 eor
$rk0.16b
,$rka.16b
,$rk1.16b
396 eor
$rk1.16b
,$rkb.16b
,$rk1.16b
398 &sbox_double
($rk0,$rk1);
400 eor
@data[3].16b
,@data[3].16b
,$rk0.16b
401 eor
@datax[3].16b
,@datax[3].16b
,$rk1.16b
405 sub encrypt_1blk_norev
() {
419 subs
$counter,$counter,#1
431 &encrypt_1blk_norev
($dat);
435 sub encrypt_4blks
() {
443 subs
$counter,$counter,#1
446 &rev32
(@vtmp[3],@data[0]);
447 &rev32
(@vtmp[2],@data[1]);
448 &rev32
(@vtmp[1],@data[2]);
449 &rev32
(@vtmp[0],@data[3]);
452 sub encrypt_8blks
() {
460 subs
$counter,$counter,#1
463 &rev32
(@vtmp[3],@data[0]);
464 &rev32
(@vtmp[2],@data[1]);
465 &rev32
(@vtmp[1],@data[2]);
466 &rev32
(@vtmp[0],@data[3]);
467 &rev32
(@data[3],@datax[0]);
468 &rev32
(@data[2],@datax[1]);
469 &rev32
(@data[1],@datax[2]);
470 &rev32
(@data[0],@datax[3]);
478 ld1
{@sbox[0].16b
,@sbox[1].16b
,@sbox[2].16b
,@sbox[3].16b
},[$ptr],#64
479 ld1
{@sbox[4].16b
,@sbox[5].16b
,@sbox[6].16b
,@sbox[7].16b
},[$ptr],#64
480 ld1
{@sbox[8].16b
,@sbox[9].16b
,@sbox[10].16b
,@sbox[11].16b
},[$ptr],#64
481 ld1
{@sbox[12].16b
,@sbox[13].16b
,@sbox[14].16b
,@sbox[15].16b
},[$ptr]
486 sub mov_reg_to_vec
() {
494 &rev32_armeb
($desv,$desv);
497 sub mov_vec_to_reg
() {
507 sub compute_tweak
() {
514 extr
$xtmp2,$src1,$src1,#32
515 extr
$des1,$src1,$src0,#63
516 and $wtmp1,$wtmp0,$wtmp2,asr
#31
517 eor
$des0,$xtmp1,$src0,lsl
#1
521 sub compute_tweak_vec
() {
525 &rbit
(@vtmp[2],$src,$std);
527 ldr
@qtmp[0], .Lxts_magic
528 shl
$des.16b
, @vtmp[2].16b
, #1
529 ext
@vtmp[1].16b
, @vtmp[2].16b
, @vtmp[2].16b
,#15
530 ushr
@vtmp[1].16b
, @vtmp[1].16b
, #7
531 mul
@vtmp[1].16b
, @vtmp[1].16b
, @vtmp[0].16b
532 eor
$des.16b
, $des.16b
, @vtmp[1].16b
534 &rbit
($des,$des,$std);
538 #include "arm_arch.h"
542 .type _vpsm4_consts
,%object
546 .byte
0xD6,0x90,0xE9,0xFE,0xCC,0xE1,0x3D,0xB7,0x16,0xB6,0x14,0xC2,0x28,0xFB,0x2C,0x05
547 .byte
0x2B,0x67,0x9A,0x76,0x2A,0xBE,0x04,0xC3,0xAA,0x44,0x13,0x26,0x49,0x86,0x06,0x99
548 .byte
0x9C,0x42,0x50,0xF4,0x91,0xEF,0x98,0x7A,0x33,0x54,0x0B,0x43,0xED,0xCF,0xAC,0x62
549 .byte
0xE4,0xB3,0x1C,0xA9,0xC9,0x08,0xE8,0x95,0x80,0xDF,0x94,0xFA,0x75,0x8F,0x3F,0xA6
550 .byte
0x47,0x07,0xA7,0xFC,0xF3,0x73,0x17,0xBA,0x83,0x59,0x3C,0x19,0xE6,0x85,0x4F,0xA8
551 .byte
0x68,0x6B,0x81,0xB2,0x71,0x64,0xDA,0x8B,0xF8,0xEB,0x0F,0x4B,0x70,0x56,0x9D,0x35
552 .byte
0x1E,0x24,0x0E,0x5E,0x63,0x58,0xD1,0xA2,0x25,0x22,0x7C,0x3B,0x01,0x21,0x78,0x87
553 .byte
0xD4,0x00,0x46,0x57,0x9F,0xD3,0x27,0x52,0x4C,0x36,0x02,0xE7,0xA0,0xC4,0xC8,0x9E
554 .byte
0xEA,0xBF,0x8A,0xD2,0x40,0xC7,0x38,0xB5,0xA3,0xF7,0xF2,0xCE,0xF9,0x61,0x15,0xA1
555 .byte
0xE0,0xAE,0x5D,0xA4,0x9B,0x34,0x1A,0x55,0xAD,0x93,0x32,0x30,0xF5,0x8C,0xB1,0xE3
556 .byte
0x1D,0xF6,0xE2,0x2E,0x82,0x66,0xCA,0x60,0xC0,0x29,0x23,0xAB,0x0D,0x53,0x4E,0x6F
557 .byte
0xD5,0xDB,0x37,0x45,0xDE,0xFD,0x8E,0x2F,0x03,0xFF,0x6A,0x72,0x6D,0x6C,0x5B,0x51
558 .byte
0x8D,0x1B,0xAF,0x92,0xBB,0xDD,0xBC,0x7F,0x11,0xD9,0x5C,0x41,0x1F,0x10,0x5A,0xD8
559 .byte
0x0A,0xC1,0x31,0x88,0xA5,0xCD,0x7B,0xBD,0x2D,0x74,0xD0,0x12,0xB8,0xE5,0xB4,0xB0
560 .byte
0x89,0x69,0x97,0x4A,0x0C,0x96,0x77,0x7E,0x65,0xB9,0xF1,0x09,0xC5,0x6E,0xC6,0x84
561 .byte
0x18,0xF0,0x7D,0xEC,0x3A,0xDC,0x4D,0x20,0x79,0xEE,0x5F,0x3E,0xD7,0xCB,0x39,0x48
563 .long
0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
564 .long
0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
565 .long
0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
566 .long
0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
567 .long
0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
568 .long
0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
569 .long
0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
570 .long
0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
572 .quad
0x56aa3350a3b1bac6,0xb27022dc677d9197
574 .quad
0x0B0A090807060504,0x030201000F0E0D0C
576 .quad
0x0101010101010187,0x0101010101010101
578 .size _vpsm4_consts
,.-_vpsm4_consts
# Registers used by the key-schedule routine: the three function arguments,
# scalar temporaries, and the vector registers holding the user key, the
# FK constants and the output-shuffle map.
my ($key, $keys, $enc) = ("x0", "x1", "w2");
my ($pointer, $schedules, $wtmp, $roundkey) = ("x5", "x6", "w7", "w8");
my ($vkey, $vfk, $vmap) = ("v5", "v6", "v7");
586 .type _vpsm4_set_key
,%function
589 AARCH64_VALID_CALL_TARGET
590 ld1
{$vkey.4s
},[$key]
595 adr
$pointer,.Lshuffles
596 ld1
{$vmap.2d
},[$pointer]
598 ld1
{$vfk.2d
},[$pointer]
599 eor
$vkey.16b
,$vkey.16b
,$vfk.16b
602 movi
@vtmp[0].16b
,#64
607 ldr
$roundkey,[$pointer],#4
608 eor
$roundkey,$roundkey,$wtmp
610 eor
$roundkey,$roundkey,$wtmp
612 eor
$roundkey,$roundkey,$wtmp
614 mov
@data[0].s
[0],$roundkey
615 tbl
@vtmp[1].16b
,{@sbox[0].16b
,@sbox[1].16b
,@sbox[2].16b
,@sbox[3].16b
},@data[0].16b
616 sub @data[0].16b
,@data[0].16b
,@vtmp[0].16b
617 tbx
@vtmp[1].16b
,{@sbox[4].16b
,@sbox[5].16b
,@sbox[6].16b
,@sbox[7].16b
},@data[0].16b
618 sub @data[0].16b
,@data[0].16b
,@vtmp[0].16b
619 tbx
@vtmp[1].16b
,{@sbox[8].16b
,@sbox[9].16b
,@sbox[10].16b
,@sbox[11].16b
},@data[0].16b
620 sub @data[0].16b
,@data[0].16b
,@vtmp[0].16b
621 tbx
@vtmp[1].16b
,{@sbox[12].16b
,@sbox[13].16b
,@sbox[14].16b
,@sbox[15].16b
},@data[0].16b
622 mov
$wtmp,@vtmp[1].s
[0]
623 eor
$roundkey,$wtmp,$wtmp,ror
#19
624 eor
$roundkey,$roundkey,$wtmp,ror
#9
626 eor
$roundkey,$roundkey,$wtmp
627 mov
$vkey.s
[0],$roundkey
629 str
$roundkey,[$keys],#4
632 str
$roundkey,[$keys],#-4
634 tbl
$vkey.16b
,{$vkey.16b
},$vmap.16b
635 subs
$schedules,$schedules,#1
638 .size _vpsm4_set_key
,.-_vpsm4_set_key
645 .type _vpsm4_enc_4blks
,%function
648 AARCH64_VALID_CALL_TARGET
653 .size _vpsm4_enc_4blks
,.-_vpsm4_enc_4blks
659 .type _vpsm4_enc_8blks
,%function
662 AARCH64_VALID_CALL_TARGET
667 .size _vpsm4_enc_8blks
,.-_vpsm4_enc_8blks
my ($key, $keys) = ("x0", "x1");    # user key in, key schedule out
675 .globl
${prefix
}_set_encrypt_key
676 .type
${prefix
}_set_encrypt_key
,%function
678 ${prefix
}_set_encrypt_key
:
679 AARCH64_SIGN_LINK_REGISTER
680 stp x29
,x30
,[sp
,#-16]!
684 AARCH64_VALIDATE_LINK_REGISTER
686 .size
${prefix
}_set_encrypt_key
,.-${prefix
}_set_encrypt_key
my ($key, $keys) = ("x0", "x1");    # user key in, key schedule out
693 .globl
${prefix
}_set_decrypt_key
694 .type
${prefix
}_set_decrypt_key
,%function
696 ${prefix
}_set_decrypt_key
:
697 AARCH64_SIGN_LINK_REGISTER
698 stp x29
,x30
,[sp
,#-16]!
702 AARCH64_VALIDATE_LINK_REGISTER
704 .size
${prefix
}_set_decrypt_key
,.-${prefix
}_set_decrypt_key
my ($inp, $outp, $rk) = ("x0", "x1", "x2");   # input, output, round keys
714 .globl
${prefix
}_
${dir
}crypt
715 .type
${prefix
}_
${dir
}crypt,%function
717 ${prefix
}_
${dir
}crypt:
718 AARCH64_VALID_CALL_TARGET
719 ld1
{@data[0].4s
},[$inp]
722 &rev32
(@data[0],@data[0]);
726 &encrypt_1blk
(@data[0]);
728 st1
{@data[0].4s
},[$outp]
730 .size
${prefix
}_
${dir
}crypt,.-${prefix
}_
${dir
}crypt
# Eight data vectors.  NOTE(review): v16-v23 overlap the low half of @sbox
# (v16-v31) — presumably the sbox is reloaded before it is needed again;
# confirm against the surrounding code.
my @dat = map { "v$_" } (16 .. 23);
742 .globl
${prefix
}_ecb_encrypt
743 .type
${prefix
}_ecb_encrypt
,%function
745 ${prefix
}_ecb_encrypt
:
746 AARCH64_SIGN_LINK_REGISTER
747 // convert
length into blocks
757 .Lecb_8_blocks_process
:
759 b
.lt .Lecb_4_blocks_process
760 ld4
{@data[0].4s
,@data[1].4s
,@data[2].4s
,@data[3].4s
},[$inp],#64
761 ld4
{@datax[0].4s
,$datax[1].4s
,@datax[2].4s
,@datax[3].4s
},[$inp],#64
763 &rev32
(@data[0],@data[0]);
764 &rev32
(@data[1],@data[1]);
765 &rev32
(@data[2],@data[2]);
766 &rev32
(@data[3],@data[3]);
767 &rev32
(@datax[0],@datax[0]);
768 &rev32
(@datax[1],@datax[1]);
769 &rev32
(@datax[2],@datax[2]);
770 &rev32
(@datax[3],@datax[3]);
773 st4
{@vtmp[0].4s
,@vtmp[1].4s
,@vtmp[2].4s
,@vtmp[3].4s
},[$outp],#64
774 st4
{@data[0].4s
,@data[1].4s
,@data[2].4s
,@data[3].4s
},[$outp],#64
775 subs
$blocks,$blocks,#8
776 b
.gt .Lecb_8_blocks_process
778 .Lecb_4_blocks_process
:
781 ld4
{@data[0].4s
,@data[1].4s
,@data[2].4s
,@data[3].4s
},[$inp],#64
783 &rev32
(@data[0],@data[0]);
784 &rev32
(@data[1],@data[1]);
785 &rev32
(@data[2],@data[2]);
786 &rev32
(@data[3],@data[3]);
789 st4
{@vtmp[0].4s
,@vtmp[1].4s
,@vtmp[2].4s
,@vtmp[3].4s
},[$outp],#64
790 sub $blocks,$blocks,#4
792 // process
last block
796 ld1
{@data[0].4s
},[$inp]
798 &rev32
(@data[0],@data[0]);
799 &encrypt_1blk
(@data[0]);
801 st1
{@data[0].4s
},[$outp]
803 1: // process
last 2 blocks
804 ld4
{@data[0].s
,@data[1].s
,@data[2].s
,@data[3].s
}[0],[$inp],#16
805 ld4
{@data[0].s
,@data[1].s
,@data[2].s
,@data[3].s
}[1],[$inp],#16
809 &rev32
(@data[0],@data[0]);
810 &rev32
(@data[1],@data[1]);
811 &rev32
(@data[2],@data[2]);
812 &rev32
(@data[3],@data[3]);
815 st4
{@vtmp[0].s
-@vtmp[3].s
}[0],[$outp],#16
816 st4
{@vtmp[0].s
-@vtmp[3].s
}[1],[$outp]
818 1: // process
last 3 blocks
819 ld4
{@data[0].s
,@data[1].s
,@data[2].s
,@data[3].s
}[2],[$inp],#16
821 &rev32
(@data[0],@data[0]);
822 &rev32
(@data[1],@data[1]);
823 &rev32
(@data[2],@data[2]);
824 &rev32
(@data[3],@data[3]);
827 st4
{@vtmp[0].s
-@vtmp[3].s
}[0],[$outp],#16
828 st4
{@vtmp[0].s
-@vtmp[3].s
}[1],[$outp],#16
829 st4
{@vtmp[0].s
-@vtmp[3].s
}[2],[$outp]
836 AARCH64_VALIDATE_LINK_REGISTER
838 .size
${prefix
}_ecb_encrypt
,.-${prefix
}_ecb_encrypt
my ($len, $ivp, $enc) = ("x2", "x4", "w5");   # byte length, IV pointer, enc/dec flag
848 .globl
${prefix
}_cbc_encrypt
849 .type
${prefix
}_cbc_encrypt
,%function
851 ${prefix
}_cbc_encrypt
:
852 AARCH64_VALID_CALL_TARGET
858 ld1
{$ivec0.4s
},[$ivp]
862 ld1
{@data[0].4s
,@data[1].4s
,@data[2].4s
,@data[3].4s
},[$inp],#64
863 eor
@data[0].16b
,@data[0].16b
,$ivec0.16b
865 &rev32
(@data[1],@data[1]);
866 &rev32
(@data[0],@data[0]);
867 &rev32
(@data[2],@data[2]);
868 &rev32
(@data[3],@data[3]);
869 &encrypt_1blk_norev
(@data[0]);
871 eor
@data[1].16b
,@data[1].16b
,@data[0].16b
873 &encrypt_1blk_norev
(@data[1]);
874 &rev32
(@data[0],@data[0]);
877 eor
@data[2].16b
,@data[2].16b
,@data[1].16b
879 &encrypt_1blk_norev
(@data[2]);
880 &rev32
(@data[1],@data[1]);
882 eor
@data[3].16b
,@data[3].16b
,@data[2].16b
884 &encrypt_1blk_norev
(@data[3]);
885 &rev32
(@data[2],@data[2]);
886 &rev32
(@data[3],@data[3]);
888 orr
$ivec0.16b
,@data[3].16b
,@data[3].16b
889 st1
{@data[0].4s
,@data[1].4s
,@data[2].4s
,@data[3].4s
},[$outp],#64
890 subs
$blocks,$blocks,#4
891 b
.ne .Lcbc_4_blocks_enc
894 subs
$blocks,$blocks,#1
896 ld1
{@data[0].4s
},[$inp],#16
897 eor
$ivec0.16b
,$ivec0.16b
,@data[0].16b
899 &rev32
($ivec0,$ivec0);
900 &encrypt_1blk
($ivec0);
902 st1
{$ivec0.4s
},[$outp],#16
906 st1
{$ivec0.4s
},[$ivp]
910 // decryption mode starts
911 AARCH64_SIGN_LINK_REGISTER
920 ld4
{@data[0].4s
,@data[1].4s
,@data[2].4s
,@data[3].4s
},[$inp]
922 ld4
{@datax[0].4s
,@datax[1].4s
,@datax[2].4s
,@datax[3].4s
},[$ptr]
924 &rev32
(@data[0],@data[0]);
925 &rev32
(@data[1],@data[1]);
926 &rev32
(@data[2],@data[2]);
927 &rev32
(@data[3],$data[3]);
928 &rev32
(@datax[0],@datax[0]);
929 &rev32
(@datax[1],@datax[1]);
930 &rev32
(@datax[2],@datax[2]);
931 &rev32
(@datax[3],$datax[3]);
935 &transpose
(@vtmp,@datax);
936 &transpose
(@data,@datax);
938 ld1
{$ivec1.4s
},[$ivp]
939 ld1
{@datax[0].4s
,@datax[1].4s
,@datax[2].4s
,@datax[3].4s
},[$inp],#64
940 // note ivec1
and vtmpx
[3] are reusing the same register
941 // care needs to be taken to avoid conflict
942 eor
@vtmp[0].16b
,@vtmp[0].16b
,$ivec1.16b
943 ld1
{@vtmpx[0].4s
,@vtmpx[1].4s
,@vtmpx[2].4s
,@vtmpx[3].4s
},[$inp],#64
944 eor
@vtmp[1].16b
,@vtmp[1].16b
,@datax[0].16b
945 eor
@vtmp[2].16b
,@vtmp[2].16b
,@datax[1].16b
946 eor
@vtmp[3].16b
,$vtmp[3].16b
,@datax[2].16b
948 st1
{$vtmpx[3].4s
}, [$ivp]
949 eor
@data[0].16b
,@data[0].16b
,$datax[3].16b
950 eor
@data[1].16b
,@data[1].16b
,@vtmpx[0].16b
951 eor
@data[2].16b
,@data[2].16b
,@vtmpx[1].16b
952 eor
@data[3].16b
,$data[3].16b
,@vtmpx[2].16b
953 st1
{@vtmp[0].4s
,@vtmp[1].4s
,@vtmp[2].4s
,@vtmp[3].4s
},[$outp],#64
954 st1
{@data[0].4s
,@data[1].4s
,@data[2].4s
,@data[3].4s
},[$outp],#64
955 subs
$blocks,$blocks,#8
956 b
.gt .Lcbc_8_blocks_dec
959 ld1
{$ivec1.4s
},[$ivp]
963 ld4
{@data[0].4s
,@data[1].4s
,@data[2].4s
,@data[3].4s
},[$inp]
965 &rev32
(@data[0],@data[0]);
966 &rev32
(@data[1],@data[1]);
967 &rev32
(@data[2],@data[2]);
968 &rev32
(@data[3],$data[3]);
971 ld1
{@data[0].4s
,@data[1].4s
,@data[2].4s
,@data[3].4s
},[$inp],#64
973 &transpose
(@vtmp,@datax);
975 eor
@vtmp[0].16b
,@vtmp[0].16b
,$ivec1.16b
976 eor
@vtmp[1].16b
,@vtmp[1].16b
,@data[0].16b
977 orr
$ivec1.16b
,@data[3].16b
,@data[3].16b
978 eor
@vtmp[2].16b
,@vtmp[2].16b
,@data[1].16b
979 eor
@vtmp[3].16b
,$vtmp[3].16b
,@data[2].16b
980 st1
{@vtmp[0].4s
,@vtmp[1].4s
,@vtmp[2].4s
,@vtmp[3].4s
},[$outp],#64
981 subs
$blocks,$blocks,#4
982 b
.gt .Lcbc_4_blocks_dec
984 st1
{@data[3].4s
}, [$ivp]
987 subs
$blocks,$blocks,#1
990 ld1
{@data[0].4s
},[$inp],#16
992 st1
{$data[0].4s
}, [$ivp]
994 &rev32
(@datax[0],@data[0]);
995 &encrypt_1blk
(@datax[0]);
997 eor
@datax[0].16b
,@datax[0].16b
,$ivec1.16b
998 st1
{@datax[0].4s
},[$outp],#16
1000 1: // last two blocks
1001 ld4
{@data[0].s
,@data[1].s
,@data[2].s
,@data[3].s
}[0],[$inp]
1003 ld4
{@data[0].s
,@data[1].s
,@data[2].s
,@data[3].s
}[1],[$ptr],#16
1004 subs
$blocks,$blocks,1
1007 &rev32
(@data[0],@data[0]);
1008 &rev32
(@data[1],@data[1]);
1009 &rev32
(@data[2],@data[2]);
1010 &rev32
(@data[3],@data[3]);
1013 ld1
{@data[0].4s
,@data[1].4s
},[$inp],#32
1015 &transpose
(@vtmp,@datax);
1017 eor
@vtmp[0].16b
,@vtmp[0].16b
,$ivec1.16b
1018 eor
@vtmp[1].16b
,@vtmp[1].16b
,@data[0].16b
1019 st1
{@vtmp[0].4s
,@vtmp[1].4s
},[$outp],#32
1021 st1
{@data[1].4s
}, [$ivp]
1024 ld4
{@data[0].s
,@data[1].s
,@data[2].s
,@data[3].s
}[2],[$ptr]
1026 &rev32
(@data[0],@data[0]);
1027 &rev32
(@data[1],@data[1]);
1028 &rev32
(@data[2],@data[2]);
1029 &rev32
(@data[3],@data[3]);
1032 ld1
{@data[0].4s
,@data[1].4s
,@data[2].4s
},[$inp],#48
1034 &transpose
(@vtmp,@datax);
1036 eor
@vtmp[0].16b
,@vtmp[0].16b
,$ivec1.16b
1037 eor
@vtmp[1].16b
,@vtmp[1].16b
,@data[0].16b
1038 eor
@vtmp[2].16b
,@vtmp[2].16b
,@data[1].16b
1039 st1
{@vtmp[0].4s
,@vtmp[1].4s
,@vtmp[2].4s
},[$outp],#48
1041 st1
{@data[2].4s
}, [$ivp]
1043 ldp d10
,d11
,[sp
,#16]
1044 ldp d12
,d13
,[sp
,#32]
1045 ldp d14
,d15
,[sp
,#48]
1046 ldp x29
,x30
,[sp
,#64]
1048 AARCH64_VALIDATE_LINK_REGISTER
1050 .size
${prefix
}_cbc_encrypt
,.-${prefix
}_cbc_encrypt
1060 .globl
${prefix
}_ctr32_encrypt_blocks
1061 .type
${prefix
}_ctr32_encrypt_blocks
,%function
1063 ${prefix
}_ctr32_encrypt_blocks
:
1064 AARCH64_VALID_CALL_TARGET
1065 ld1
{$ivec.4s
},[$ivp]
1067 &rev32
($ivec,$ivec);
1072 // fast processing
for one single block without
1073 // context saving overhead
1075 &encrypt_1blk
($ivec);
1077 ld1
{@data[0].4s
},[$inp]
1078 eor
@data[0].16b
,@data[0].16b
,$ivec.16b
1079 st1
{@data[0].4s
},[$outp]
1082 AARCH64_SIGN_LINK_REGISTER
1083 stp d8
,d9
,[sp
,#-80]!
1084 stp d10
,d11
,[sp
,#16]
1085 stp d12
,d13
,[sp
,#32]
1086 stp d14
,d15
,[sp
,#48]
1087 stp x29
,x30
,[sp
,#64]
1088 mov
$word0,$ivec.s
[0]
1089 mov
$word1,$ivec.s
[1]
1090 mov
$word2,$ivec.s
[2]
1092 .Lctr32_4_blocks_process
:
1095 dup
@data[0].4s
,$word0
1096 dup
@data[1].4s
,$word1
1097 dup
@data[2].4s
,$word2
1098 mov
@data[3].s
[0],$ctr
1100 mov
$data[3].s
[1],$ctr
1102 mov
@data[3].s
[2],$ctr
1104 mov
@data[3].s
[3],$ctr
1107 b
.ge .Lctr32_8_blocks_process
1109 ld4
{@vtmpx[0].4s
,@vtmpx[1].4s
,@vtmpx[2].4s
,@vtmpx[3].4s
},[$inp],#64
1110 eor
@vtmp[0].16b
,@vtmp[0].16b
,@vtmpx[0].16b
1111 eor
@vtmp[1].16b
,@vtmp[1].16b
,@vtmpx[1].16b
1112 eor
@vtmp[2].16b
,@vtmp[2].16b
,@vtmpx[2].16b
1113 eor
@vtmp[3].16b
,@vtmp[3].16b
,@vtmpx[3].16b
1114 st4
{@vtmp[0].4s
,@vtmp[1].4s
,@vtmp[2].4s
,@vtmp[3].4s
},[$outp],#64
1115 subs
$blocks,$blocks,#4
1116 b
.ne .Lctr32_4_blocks_process
1118 .Lctr32_8_blocks_process
:
1119 dup
@datax[0].4s
,$word0
1120 dup
@datax[1].4s
,$word1
1121 dup
@datax[2].4s
,$word2
1122 mov
@datax[3].s
[0],$ctr
1124 mov
$datax[3].s
[1],$ctr
1126 mov
@datax[3].s
[2],$ctr
1128 mov
@datax[3].s
[3],$ctr
1131 ld4
{@vtmpx[0].4s
,@vtmpx[1].4s
,@vtmpx[2].4s
,@vtmpx[3].4s
},[$inp],#64
1132 ld4
{@datax[0].4s
,@datax[1].4s
,@datax[2].4s
,@datax[3].4s
},[$inp],#64
1133 eor
@vtmp[0].16b
,@vtmp[0].16b
,@vtmpx[0].16b
1134 eor
@vtmp[1].16b
,@vtmp[1].16b
,@vtmpx[1].16b
1135 eor
@vtmp[2].16b
,@vtmp[2].16b
,@vtmpx[2].16b
1136 eor
@vtmp[3].16b
,@vtmp[3].16b
,@vtmpx[3].16b
1137 eor
@data[0].16b
,@data[0].16b
,@datax[0].16b
1138 eor
@data[1].16b
,@data[1].16b
,@datax[1].16b
1139 eor
@data[2].16b
,@data[2].16b
,@datax[2].16b
1140 eor
@data[3].16b
,@data[3].16b
,@datax[3].16b
1141 st4
{@vtmp[0].4s
,@vtmp[1].4s
,@vtmp[2].4s
,@vtmp[3].4s
},[$outp],#64
1142 st4
{@data[0].4s
,@data[1].4s
,@data[2].4s
,@data[3].4s
},[$outp],#64
1143 subs
$blocks,$blocks,#8
1144 b
.ne .Lctr32_4_blocks_process
1146 1: // last block processing
1147 subs
$blocks,$blocks,#1
1150 mov
$ivec.s
[0],$word0
1151 mov
$ivec.s
[1],$word1
1152 mov
$ivec.s
[2],$word2
1155 &encrypt_1blk
($ivec);
1157 ld1
{@data[0].4s
},[$inp]
1158 eor
@data[0].16b
,@data[0].16b
,$ivec.16b
1159 st1
{@data[0].4s
},[$outp]
1161 1: // last 2 blocks processing
1162 dup
@data[0].4s
,$word0
1163 dup
@data[1].4s
,$word1
1164 dup
@data[2].4s
,$word2
1165 mov
@data[3].s
[0],$ctr
1167 mov
@data[3].s
[1],$ctr
1168 subs
$blocks,$blocks,#1
1171 ld4
{@vtmpx[0].s
,@vtmpx[1].s
,@vtmpx[2].s
,@vtmpx[3].s
}[0],[$inp],#16
1172 ld4
{@vtmpx[0].s
,@vtmpx[1].s
,@vtmpx[2].s
,@vtmpx[3].s
}[1],[$inp],#16
1173 eor
@vtmp[0].16b
,@vtmp[0].16b
,@vtmpx[0].16b
1174 eor
@vtmp[1].16b
,@vtmp[1].16b
,@vtmpx[1].16b
1175 eor
@vtmp[2].16b
,@vtmp[2].16b
,@vtmpx[2].16b
1176 eor
@vtmp[3].16b
,@vtmp[3].16b
,@vtmpx[3].16b
1177 st4
{@vtmp[0].s
,@vtmp[1].s
,@vtmp[2].s
,@vtmp[3].s
}[0],[$outp],#16
1178 st4
{@vtmp[0].s
,@vtmp[1].s
,@vtmp[2].s
,@vtmp[3].s
}[1],[$outp],#16
1180 1: // last 3 blocks processing
1182 mov
@data[3].s
[2],$ctr
1184 ld4
{@vtmpx[0].s
,@vtmpx[1].s
,@vtmpx[2].s
,@vtmpx[3].s
}[0],[$inp],#16
1185 ld4
{@vtmpx[0].s
,@vtmpx[1].s
,@vtmpx[2].s
,@vtmpx[3].s
}[1],[$inp],#16
1186 ld4
{@vtmpx[0].s
,@vtmpx[1].s
,@vtmpx[2].s
,@vtmpx[3].s
}[2],[$inp],#16
1187 eor
@vtmp[0].16b
,@vtmp[0].16b
,@vtmpx[0].16b
1188 eor
@vtmp[1].16b
,@vtmp[1].16b
,@vtmpx[1].16b
1189 eor
@vtmp[2].16b
,@vtmp[2].16b
,@vtmpx[2].16b
1190 eor
@vtmp[3].16b
,@vtmp[3].16b
,@vtmpx[3].16b
1191 st4
{@vtmp[0].s
,@vtmp[1].s
,@vtmp[2].s
,@vtmp[3].s
}[0],[$outp],#16
1192 st4
{@vtmp[0].s
,@vtmp[1].s
,@vtmp[2].s
,@vtmp[3].s
}[1],[$outp],#16
1193 st4
{@vtmp[0].s
,@vtmp[1].s
,@vtmp[2].s
,@vtmp[3].s
}[2],[$outp],#16
1195 ldp d10
,d11
,[sp
,#16]
1196 ldp d12
,d13
,[sp
,#32]
1197 ldp d14
,d15
,[sp
,#48]
1198 ldp x29
,x30
,[sp
,#64]
1200 AARCH64_VALIDATE_LINK_REGISTER
1202 .size
${prefix
}_ctr32_encrypt_blocks
,.-${prefix
}_ctr32_encrypt_blocks
# XTS-mode register assignments.
my ($blocks, $len) = ("x2", "x2");   # block count and byte length share x2
my @twx = map { "x$_" } (12 .. 27);  # eight 128-bit tweaks as 16 scalar regs
# NOTE: @twx[14..15] are x26/x27, the same registers as $rks1/$rks2 and
# $lastBlk below; the overlapping uses must not be live at the same time.
my ($rks1, $rks2) = ("x26", "x27");
my $lastBlk = "x26";
1217 sub gen_xts_cipher
() {
1220 .globl
${prefix
}_xts_encrypt
${std
}
1221 .type
${prefix
}_xts_encrypt
${std
},%function
1223 ${prefix
}_xts_encrypt
${std
}:
1224 AARCH64_SIGN_LINK_REGISTER
1225 stp x15
, x16
, [sp
, #-0x10]!
1226 stp x17
, x18
, [sp
, #-0x10]!
1227 stp x19
, x20
, [sp
, #-0x10]!
1228 stp x21
, x22
, [sp
, #-0x10]!
1229 stp x23
, x24
, [sp
, #-0x10]!
1230 stp x25
, x26
, [sp
, #-0x10]!
1231 stp x27
, x28
, [sp
, #-0x10]!
1232 stp x29
, x30
, [sp
, #-0x10]!
1233 stp d8
, d9
, [sp
, #-0x10]!
1234 stp d10
, d11
, [sp
, #-0x10]!
1235 stp d12
, d13
, [sp
, #-0x10]!
1236 stp d14
, d15
, [sp
, #-0x10]!
1240 ld1
{@tweak[0].4s
}, [$ivp]
1244 &rev32
(@tweak[0],@tweak[0]);
1245 &encrypt_1blk
(@tweak[0]);
1248 and $remain,$len,#0x0F
1249 // convert
length into blocks
1255 // If the encryption
/decryption Length is N
times of
16,
1256 // the all blocks are encrypted
/decrypted
in .xts_encrypt_blocks
${std
}
1257 b
.eq .xts_encrypt_blocks
${std
}
1259 // If the encryption
/decryption
length is
not N
times of
16,
1260 // the
last two blocks are encrypted
/decrypted
in .last_2blks_tweak
${std
} or .only_2blks_tweak
${std
}
1261 // the other blocks are encrypted
/decrypted
in .xts_encrypt_blocks
${std
}
1262 subs
$blocks,$blocks,#1
1263 b
.eq .only_2blks_tweak
${std
}
1264 .xts_encrypt_blocks
${std
}:
1266 &rbit
(@tweak[0],@tweak[0],$std);
1267 &rev32_armeb
(@tweak[0],@tweak[0]);
1268 &mov_vec_to_reg
(@tweak[0],@twx[0],@twx[1]);
1269 &compute_tweak
(@twx[0],@twx[1],@twx[2],@twx[3]);
1270 &compute_tweak
(@twx[2],@twx[3],@twx[4],@twx[5]);
1271 &compute_tweak
(@twx[4],@twx[5],@twx[6],@twx[7]);
1272 &compute_tweak
(@twx[6],@twx[7],@twx[8],@twx[9]);
1273 &compute_tweak
(@twx[8],@twx[9],@twx[10],@twx[11]);
1274 &compute_tweak
(@twx[10],@twx[11],@twx[12],@twx[13]);
1275 &compute_tweak
(@twx[12],@twx[13],@twx[14],@twx[15]);
1277 .Lxts_8_blocks_process
${std
}:
1279 b
.lt .Lxts_4_blocks_process
${std
}
1281 &mov_reg_to_vec
(@twx[0],@twx[1],@vtmp[0]);
1282 &mov_reg_to_vec
(@twx[2],@twx[3],@vtmp[1]);
1283 &mov_reg_to_vec
(@twx[4],@twx[5],@vtmp[2]);
1284 &mov_reg_to_vec
(@twx[6],@twx[7],@vtmp[3]);
1285 &mov_reg_to_vec
(@twx[8],@twx[9],@vtmpx[0]);
1286 &mov_reg_to_vec
(@twx[10],@twx[11],@vtmpx[1]);
1287 &mov_reg_to_vec
(@twx[12],@twx[13],@vtmpx[2]);
1288 &mov_reg_to_vec
(@twx[14],@twx[15],@vtmpx[3]);
1290 ld1
{@data[0].4s
,@data[1].4s
,@data[2].4s
,@data[3].4s
},[$inp],#64
1292 &rbit
(@vtmp[0],@vtmp[0],$std);
1293 &rbit
(@vtmp[1],@vtmp[1],$std);
1294 &rbit
(@vtmp[2],@vtmp[2],$std);
1295 &rbit
(@vtmp[3],@vtmp[3],$std);
1297 eor
@data[0].16b
, @data[0].16b
, @vtmp[0].16b
1298 eor
@data[1].16b
, @data[1].16b
, @vtmp[1].16b
1299 eor
@data[2].16b
, @data[2].16b
, @vtmp[2].16b
1300 eor
@data[3].16b
, @data[3].16b
, @vtmp[3].16b
1301 ld1
{@datax[0].4s
,$datax[1].4s
,@datax[2].4s
,@datax[3].4s
},[$inp],#64
1303 &rbit
(@vtmpx[0],@vtmpx[0],$std);
1304 &rbit
(@vtmpx[1],@vtmpx[1],$std);
1305 &rbit
(@vtmpx[2],@vtmpx[2],$std);
1306 &rbit
(@vtmpx[3],@vtmpx[3],$std);
1308 eor
@datax[0].16b
, @datax[0].16b
, @vtmpx[0].16b
1309 eor
@datax[1].16b
, @datax[1].16b
, @vtmpx[1].16b
1310 eor
@datax[2].16b
, @datax[2].16b
, @vtmpx[2].16b
1311 eor
@datax[3].16b
, @datax[3].16b
, @vtmpx[3].16b
1313 &rev32
(@data[0],@data[0]);
1314 &rev32
(@data[1],@data[1]);
1315 &rev32
(@data[2],@data[2]);
1316 &rev32
(@data[3],@data[3]);
1317 &rev32
(@datax[0],@datax[0]);
1318 &rev32
(@datax[1],@datax[1]);
1319 &rev32
(@datax[2],@datax[2]);
1320 &rev32
(@datax[3],@datax[3]);
1321 &transpose
(@data,@vtmp);
1322 &transpose
(@datax,@vtmp);
1324 bl _
${prefix
}_enc_8blks
1326 &transpose
(@vtmp,@datax);
1327 &transpose
(@data,@datax);
1329 &mov_reg_to_vec
(@twx[0],@twx[1],@vtmpx[0]);
1330 &compute_tweak
(@twx[14],@twx[15],@twx[0],@twx[1]);
1331 &mov_reg_to_vec
(@twx[2],@twx[3],@vtmpx[1]);
1332 &compute_tweak
(@twx[0],@twx[1],@twx[2],@twx[3]);
1333 &mov_reg_to_vec
(@twx[4],@twx[5],@vtmpx[2]);
1334 &compute_tweak
(@twx[2],@twx[3],@twx[4],@twx[5]);
1335 &mov_reg_to_vec
(@twx[6],@twx[7],@vtmpx[3]);
1336 &compute_tweak
(@twx[4],@twx[5],@twx[6],@twx[7]);
1337 &mov_reg_to_vec
(@twx[8],@twx[9],@tweak[0]);
1338 &compute_tweak
(@twx[6],@twx[7],@twx[8],@twx[9]);
1339 &mov_reg_to_vec
(@twx[10],@twx[11],@tweak[1]);
1340 &compute_tweak
(@twx[8],@twx[9],@twx[10],@twx[11]);
1341 &mov_reg_to_vec
(@twx[12],@twx[13],@tweak[2]);
1342 &compute_tweak
(@twx[10],@twx[11],@twx[12],@twx[13]);
1343 &mov_reg_to_vec
(@twx[14],@twx[15],@tweak[3]);
1344 &compute_tweak
(@twx[12],@twx[13],@twx[14],@twx[15]);
1346 eor
@vtmp[0].16b
, @vtmp[0].16b
, @vtmpx[0].16b
1347 eor
@vtmp[1].16b
, @vtmp[1].16b
, @vtmpx[1].16b
1348 eor
@vtmp[2].16b
, @vtmp[2].16b
, @vtmpx[2].16b
1349 eor
@vtmp[3].16b
, @vtmp[3].16b
, @vtmpx[3].16b
1350 eor
@data[0].16b
, @data[0].16b
, @tweak[0].16b
1351 eor
@data[1].16b
, @data[1].16b
, @tweak[1].16b
1352 eor
@data[2].16b
, @data[2].16b
, @tweak[2].16b
1353 eor
@data[3].16b
, @data[3].16b
, @tweak[3].16b
1355 // save the
last tweak
1356 st1
{@tweak[3].4s
},[$ivp]
1357 st1
{@vtmp[0].4s
,@vtmp[1].4s
,@vtmp[2].4s
,@vtmp[3].4s
},[$outp],#64
1358 st1
{@data[0].4s
,@data[1].4s
,@data[2].4s
,@data[3].4s
},[$outp],#64
1359 subs
$blocks,$blocks,#8
1360 b
.gt .Lxts_8_blocks_process
${std
}
1362 .Lxts_4_blocks_process
${std
}:
1364 &mov_reg_to_vec
(@twx[0],@twx[1],@tweak[0]);
1365 &mov_reg_to_vec
(@twx[2],@twx[3],@tweak[1]);
1366 &mov_reg_to_vec
(@twx[4],@twx[5],@tweak[2]);
1367 &mov_reg_to_vec
(@twx[6],@twx[7],@tweak[3]);
1371 ld1
{@data[0].4s
,@data[1].4s
,@data[2].4s
,@data[3].4s
},[$inp],#64
1373 &rbit
(@tweak[0],@tweak[0],$std);
1374 &rbit
(@tweak[1],@tweak[1],$std);
1375 &rbit
(@tweak[2],@tweak[2],$std);
1376 &rbit
(@tweak[3],@tweak[3],$std);
1378 eor
@data[0].16b
, @data[0].16b
, @tweak[0].16b
1379 eor
@data[1].16b
, @data[1].16b
, @tweak[1].16b
1380 eor
@data[2].16b
, @data[2].16b
, @tweak[2].16b
1381 eor
@data[3].16b
, @data[3].16b
, @tweak[3].16b
1383 &rev32
(@data[0],@data[0]);
1384 &rev32
(@data[1],@data[1]);
1385 &rev32
(@data[2],@data[2]);
1386 &rev32
(@data[3],@data[3]);
1387 &transpose
(@data,@vtmp);
1389 bl _
${prefix
}_enc_4blks
1391 &transpose
(@vtmp,@data);
1393 eor
@vtmp[0].16b
, @vtmp[0].16b
, @tweak[0].16b
1394 eor
@vtmp[1].16b
, @vtmp[1].16b
, @tweak[1].16b
1395 eor
@vtmp[2].16b
, @vtmp[2].16b
, @tweak[2].16b
1396 eor
@vtmp[3].16b
, @vtmp[3].16b
, @tweak[3].16b
1397 st1
{@vtmp[0].4s
,@vtmp[1].4s
,@vtmp[2].4s
,@vtmp[3].4s
},[$outp],#64
1398 sub $blocks,$blocks,#4
1400 &mov_reg_to_vec
(@twx[8],@twx[9],@tweak[0]);
1401 &mov_reg_to_vec
(@twx[10],@twx[11],@tweak[1]);
1402 &mov_reg_to_vec
(@twx[12],@twx[13],@tweak[2]);
1404 // save the
last tweak
1405 st1
{@tweak[3].4s
},[$ivp]
1407 // process
last block
1411 ld1
{@data[0].4s
},[$inp],#16
1413 &rbit
(@tweak[0],@tweak[0],$std);
1415 eor
@data[0].16b
, @data[0].16b
, @tweak[0].16b
1417 &rev32
(@data[0],@data[0]);
1418 &encrypt_1blk
(@data[0]);
1420 eor
@data[0].16b
, @data[0].16b
, @tweak[0].16b
1421 st1
{@data[0].4s
},[$outp],#16
1422 // save the
last tweak
1423 st1
{@tweak[0].4s
},[$ivp]
1425 1: // process
last 2 blocks
1428 ld1
{@data[0].4s
,@data[1].4s
},[$inp],#32
1430 &rbit
(@tweak[0],@tweak[0],$std);
1431 &rbit
(@tweak[1],@tweak[1],$std);
1433 eor
@data[0].16b
, @data[0].16b
, @tweak[0].16b
1434 eor
@data[1].16b
, @data[1].16b
, @tweak[1].16b
1436 &rev32
(@data[0],@data[0]);
1437 &rev32
(@data[1],@data[1]);
1438 &transpose
(@data,@vtmp);
1440 bl _
${prefix
}_enc_4blks
1442 &transpose
(@vtmp,@data);
1444 eor
@vtmp[0].16b
, @vtmp[0].16b
, @tweak[0].16b
1445 eor
@vtmp[1].16b
, @vtmp[1].16b
, @tweak[1].16b
1446 st1
{@vtmp[0].4s
,@vtmp[1].4s
},[$outp],#32
1447 // save the
last tweak
1448 st1
{@tweak[1].4s
},[$ivp]
1450 1: // process
last 3 blocks
1451 ld1
{@data[0].4s
,@data[1].4s
,@data[2].4s
},[$inp],#48
1453 &rbit
(@tweak[0],@tweak[0],$std);
1454 &rbit
(@tweak[1],@tweak[1],$std);
1455 &rbit
(@tweak[2],@tweak[2],$std);
1457 eor
@data[0].16b
, @data[0].16b
, @tweak[0].16b
1458 eor
@data[1].16b
, @data[1].16b
, @tweak[1].16b
1459 eor
@data[2].16b
, @data[2].16b
, @tweak[2].16b
1461 &rev32
(@data[0],@data[0]);
1462 &rev32
(@data[1],@data[1]);
1463 &rev32
(@data[2],@data[2]);
1464 &transpose
(@data,@vtmp);
1466 bl _
${prefix
}_enc_4blks
1468 &transpose
(@vtmp,@data);
1470 eor
@vtmp[0].16b
, @vtmp[0].16b
, @tweak[0].16b
1471 eor
@vtmp[1].16b
, @vtmp[1].16b
, @tweak[1].16b
1472 eor
@vtmp[2].16b
, @vtmp[2].16b
, @tweak[2].16b
1473 st1
{@vtmp[0].4s
,@vtmp[1].4s
,@vtmp[2].4s
},[$outp],#48
1474 // save the
last tweak
1475 st1
{@tweak[2].4s
},[$ivp]
1480 // This branch calculates the
last two tweaks
,
1481 // while the encryption
/decryption
length is larger than
32
1482 .last_2blks_tweak
${std
}:
1483 ld1
{@tweak[0].4s
},[$ivp]
1485 &rev32_armeb
(@tweak[0],@tweak[0]);
1486 &compute_tweak_vec
(@tweak[0],@tweak[1],$std);
1487 &compute_tweak_vec
(@tweak[1],@tweak[2],$std);
1492 // This branch calculates the
last two tweaks
,
1493 // while the encryption
/decryption
length is equal to
32, who only need two tweaks
1494 .only_2blks_tweak
${std
}:
1495 mov
@tweak[1].16b
,@tweak[0].16b
1497 &rev32_armeb
(@tweak[1],@tweak[1]);
1498 &compute_tweak_vec
(@tweak[1],@tweak[2],$std);
1503 // Determine whether encryption
or decryption is required
.
1504 // The
last two tweaks need to be swapped
for decryption
.
1506 // encryption
:1 decryption
:0
1508 b
.eq .process_last_2blks
${std
}
1509 mov
@vtmp[0].16B
,@tweak[1].16b
1510 mov
@tweak[1].16B
,@tweak[2].16b
1511 mov
@tweak[2].16B
,@vtmp[0].16b
1513 .process_last_2blks
${std
}:
1515 &rev32_armeb
(@tweak[1],@tweak[1]);
1516 &rev32_armeb
(@tweak[2],@tweak[2]);
1518 ld1
{@data[0].4s
},[$inp],#16
1519 eor
@data[0].16b
, @data[0].16b
, @tweak[1].16b
1521 &rev32
(@data[0],@data[0]);
1522 &encrypt_1blk
(@data[0]);
1524 eor
@data[0].16b
, @data[0].16b
, @tweak[1].16b
1525 st1
{@data[0].4s
},[$outp],#16
1527 sub $lastBlk,$outp,16
1529 subs
$remain,$remain,1
1530 ldrb
$wtmp0,[$lastBlk,$remain]
1531 ldrb
$wtmp1,[$inp,$remain]
1532 strb
$wtmp1,[$lastBlk,$remain]
1533 strb
$wtmp0,[$outp,$remain]
1535 ld1
{@data[0].4s
}, [$lastBlk]
1536 eor
@data[0].16b
, @data[0].16b
, @tweak[2].16b
1538 &rev32
(@data[0],@data[0]);
1539 &encrypt_1blk
(@data[0]);
1541 eor
@data[0].16b
, @data[0].16b
, @tweak[2].16b
1542 st1
{@data[0].4s
}, [$lastBlk]
1544 ldp d14
, d15
, [sp
], #0x10
1545 ldp d12
, d13
, [sp
], #0x10
1546 ldp d10
, d11
, [sp
], #0x10
1547 ldp d8
, d9
, [sp
], #0x10
1548 ldp x29
, x30
, [sp
], #0x10
1549 ldp x27
, x28
, [sp
], #0x10
1550 ldp x25
, x26
, [sp
], #0x10
1551 ldp x23
, x24
, [sp
], #0x10
1552 ldp x21
, x22
, [sp
], #0x10
1553 ldp x19
, x20
, [sp
], #0x10
1554 ldp x17
, x18
, [sp
], #0x10
1555 ldp x15
, x16
, [sp
], #0x10
1556 AARCH64_VALIDATE_LINK_REGISTER
1558 .size
${prefix
}_xts_encrypt
${std
},.-${prefix
}_xts_encrypt
${std
}
1560 } # end of gen_xts_cipher
# Instantiate the XTS-mode cipher generator twice: once with the "_gb"
# suffix (the GB/T variant of SM4-XTS) and once with the empty suffix
# (the standard variant).  Each call emits a complete
# ${prefix}_xts_encrypt${std} routine into $code.
&gen_xts_cipher("_gb");
&gen_xts_cipher("");
########################################
# Footer boilerplate shared by the OpenSSL perlasm scripts.
#
# First, re-read this very script ($0) and forward its leading comment
# banner (license header) into the output, rewriting each leading '#'
# into a C-style '//' comment.  Stop at the first line that is neither
# a '#' comment nor blank.  NOTE(review): the extracted chunk had lost
# these preamble lines; reconstructed from the standard perlasm footer.
open SELF,$0;
while(<SELF>) {
	next if (/^#!/);		# skip the shebang line
	last if (!s/^#/\/\// and !/^$/);
	print;
}
close SELF;

# Expand the accumulated $code template: each `...` span is evaluated as
# Perl (register-name interpolation etc.), then the finished assembly is
# printed line by line to the output stream (piped through arm-xlate.pl).
foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;
	print $_,"\n";
}

# Flush the pipe to the xlate filter; a failed close here means the
# assembler-translation stage itself failed, so die loudly.
close STDOUT or die "error closing STDOUT: $!";