1 #! /usr/bin/env perl
2 # Copyright 2020-2024 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9 #
10 # This module implements SM4 with ASIMD on aarch64
11 #
12 # Feb 2022
13 #
14
15 # $output is the last argument if it looks like a file (it has an extension)
16 # $flavour is the first argument if it doesn't look like a file
17 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
18 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
19
20 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
21 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
22 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
23 die "can't locate arm-xlate.pl";
24
25 open OUT,"| \"$^X\" $xlate $flavour \"$output\""
26 or die "can't call $xlate: $!";
27 *STDOUT=*OUT;
28
29 $prefix="vpsm4";
30 my @vtmp=map("v$_",(0..3));
31 my @qtmp=map("q$_",(0..3));
32 my @data=map("v$_",(4..7));
33 my @datax=map("v$_",(8..11));
34 my ($rk0,$rk1)=("v12","v13");
35 my ($rka,$rkb)=("v14","v15");
36 my @vtmpx=map("v$_",(12..15));
37 my @sbox=map("v$_",(16..31));
38 my ($inp,$outp,$blocks,$rks)=("x0","x1","w2","x3");
39 my ($tmpw,$tmp,$wtmp0,$wtmp1,$wtmp2)=("w6","x6","w7","w8","w9");
40 my ($xtmp1,$xtmp2)=("x8","x9");
41 my ($ptr,$counter)=("x10","w11");
42 my ($word0,$word1,$word2,$word3)=("w12","w13","w14","w15");
43
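# The helpers below emit byte-order fixups:
#  - rev32 byte-swaps every 32-bit lane on little-endian builds (SM4 words are
#    big-endian) and degenerates to a plain mov under __AARCH64EB__;
#  - rev32_armeb is the mirror image, swapping only on big-endian builds;
#  - rbit bit-reverses each byte, but only for the "_gb" XTS flavour; for the
#    standard flavour it emits a mov (or nothing if source equals destination).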
44 sub rev32() {
45 my $dst = shift;
46 my $src = shift;
47
48 if ($src and ("$src" ne "$dst")) {
49 $code.=<<___;
50 #ifndef __AARCH64EB__
51 rev32 $dst.16b,$src.16b
52 #else
53 mov $dst.16b,$src.16b
54 #endif
55 ___
56 } else {
57 $code.=<<___;
58 #ifndef __AARCH64EB__
59 rev32 $dst.16b,$dst.16b
60 #endif
61 ___
62 }
63 }
64
65 sub rev32_armeb() {
66 my $dst = shift;
67 my $src = shift;
68
69 if ($src and ("$src" ne "$dst")) {
70 $code.=<<___;
71 #ifdef __AARCH64EB__
72 rev32 $dst.16b,$src.16b
73 #else
74 mov $dst.16b,$src.16b
75 #endif
76 ___
77 } else {
78 $code.=<<___;
79 #ifdef __AARCH64EB__
80 rev32 $dst.16b,$dst.16b
81 #endif
82 ___
83 }
84 }
85
86 sub rbit() {
87 my $dst = shift;
88 my $src = shift;
89 my $std = shift;
90
91 if ($src and ("$src" ne "$dst")) {
92 if ($std eq "_gb") {
93 $code.=<<___;
94 rbit $dst.16b,$src.16b
95 ___
96 } else {
97 $code.=<<___;
98 mov $dst.16b,$src.16b
99 ___
100 }
101 } else {
102 if ($std eq "_gb") {
103 $code.=<<___;
104 rbit $dst.16b,$src.16b
105 ___
106 }
107 }
108 }
109
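# transpose() emits a 4x4 transpose of 32-bit words using zip1/zip2, so that
# four blocks loaded as rows can be processed column-wise, one word of each
# block per vector register.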
110 sub transpose() {
111 my ($dat0,$dat1,$dat2,$dat3,$vt0,$vt1,$vt2,$vt3) = @_;
112
113 $code.=<<___;
114 zip1 $vt0.4s,$dat0.4s,$dat1.4s
115 zip2 $vt1.4s,$dat0.4s,$dat1.4s
116 zip1 $vt2.4s,$dat2.4s,$dat3.4s
117 zip2 $vt3.4s,$dat2.4s,$dat3.4s
118 zip1 $dat0.2d,$vt0.2d,$vt2.2d
119 zip2 $dat1.2d,$vt0.2d,$vt2.2d
120 zip1 $dat2.2d,$vt1.2d,$vt3.2d
121 zip2 $dat3.2d,$vt1.2d,$vt3.2d
122 ___
123 }
124
125 # sbox operations for 4 lanes of words
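# The 256-byte S-box is kept in v16-v31 (see load_sbox), so a full byte-wise
# lookup takes four tbl instructions: indices 0-63 index the first group of
# four table registers directly, and the biased copies (x-64, x-128, x-192)
# index the remaining three groups; out-of-range indices return 0, so the four
# partial results can simply be added together.  The ushr/sli pairs then apply
# SM4's linear transform L(B) = B ^ (B<<<2) ^ (B<<<10) ^ (B<<<18) ^ (B<<<24).
#
# A rough C model of the per-word T transform computed here (illustrative
# only, not part of the build; SboxTable stands for the .Lsbox byte table):
#
#   #define ROTL32(x,n) (((x) << (n)) | ((x) >> (32 - (n))))
#   static uint32_t sm4_T(uint32_t a) {
#       uint32_t b = 0;
#       for (int i = 0; i < 4; i++)                 /* S-box each byte */
#           b |= (uint32_t)SboxTable[(a >> (8 * i)) & 0xff] << (8 * i);
#       return b ^ ROTL32(b, 2) ^ ROTL32(b, 10) ^ ROTL32(b, 18) ^ ROTL32(b, 24);
#   }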
126 sub sbox() {
127 my $dat = shift;
128
129 $code.=<<___;
130 movi @vtmp[0].16b,#64
131 movi @vtmp[1].16b,#128
132 movi @vtmp[2].16b,#192
133 sub @vtmp[0].16b,$dat.16b,@vtmp[0].16b
134 sub @vtmp[1].16b,$dat.16b,@vtmp[1].16b
135 sub @vtmp[2].16b,$dat.16b,@vtmp[2].16b
136 tbl $dat.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$dat.16b
137 tbl @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
138 tbl @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
139 tbl @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
140 add @vtmp[0].2d,@vtmp[0].2d,@vtmp[1].2d
141 add @vtmp[2].2d,@vtmp[2].2d,$dat.2d
142 add $dat.2d,@vtmp[0].2d,@vtmp[2].2d
143
144 ushr @vtmp[0].4s,$dat.4s,32-2
145 sli @vtmp[0].4s,$dat.4s,2
146 ushr @vtmp[2].4s,$dat.4s,32-10
147 eor @vtmp[1].16b,@vtmp[0].16b,$dat.16b
148 sli @vtmp[2].4s,$dat.4s,10
149 eor @vtmp[1].16b,@vtmp[2].16b,$vtmp[1].16b
150 ushr @vtmp[0].4s,$dat.4s,32-18
151 sli @vtmp[0].4s,$dat.4s,18
152 ushr @vtmp[2].4s,$dat.4s,32-24
153 eor @vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b
154 sli @vtmp[2].4s,$dat.4s,24
155 eor $dat.16b,@vtmp[2].16b,@vtmp[1].16b
156 ___
157 }
158
159 # sbox operation for 8 lanes of words
160 sub sbox_double() {
161 my $dat = shift;
162 my $datx = shift;
163
164 $code.=<<___;
165 movi @vtmp[3].16b,#64
166 sub @vtmp[0].16b,$dat.16b,@vtmp[3].16b
167 sub @vtmp[1].16b,@vtmp[0].16b,@vtmp[3].16b
168 sub @vtmp[2].16b,@vtmp[1].16b,@vtmp[3].16b
169 tbl $dat.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$dat.16b
170 tbl @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
171 tbl @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
172 tbl @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
173 add @vtmp[1].2d,@vtmp[0].2d,@vtmp[1].2d
174 add $dat.2d,@vtmp[2].2d,$dat.2d
175 add $dat.2d,@vtmp[1].2d,$dat.2d
176
177 sub @vtmp[0].16b,$datx.16b,@vtmp[3].16b
178 sub @vtmp[1].16b,@vtmp[0].16b,@vtmp[3].16b
179 sub @vtmp[2].16b,@vtmp[1].16b,@vtmp[3].16b
180 tbl $datx.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$datx.16b
181 tbl @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
182 tbl @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
183 tbl @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
184 add @vtmp[1].2d,@vtmp[0].2d,@vtmp[1].2d
185 add $datx.2d,@vtmp[2].2d,$datx.2d
186 add $datx.2d,@vtmp[1].2d,$datx.2d
187
188 ushr @vtmp[0].4s,$dat.4s,32-2
189 sli @vtmp[0].4s,$dat.4s,2
190 ushr @vtmp[2].4s,$datx.4s,32-2
191 eor @vtmp[1].16b,@vtmp[0].16b,$dat.16b
192 sli @vtmp[2].4s,$datx.4s,2
193
194 ushr @vtmp[0].4s,$dat.4s,32-10
195 eor @vtmp[3].16b,@vtmp[2].16b,$datx.16b
196 sli @vtmp[0].4s,$dat.4s,10
197 ushr @vtmp[2].4s,$datx.4s,32-10
198 eor @vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b
199 sli @vtmp[2].4s,$datx.4s,10
200
201 ushr @vtmp[0].4s,$dat.4s,32-18
202 eor @vtmp[3].16b,@vtmp[2].16b,$vtmp[3].16b
203 sli @vtmp[0].4s,$dat.4s,18
204 ushr @vtmp[2].4s,$datx.4s,32-18
205 eor @vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b
206 sli @vtmp[2].4s,$datx.4s,18
207
208 ushr @vtmp[0].4s,$dat.4s,32-24
209 eor @vtmp[3].16b,@vtmp[2].16b,$vtmp[3].16b
210 sli @vtmp[0].4s,$dat.4s,24
211 ushr @vtmp[2].4s,$datx.4s,32-24
212 eor $dat.16b,@vtmp[0].16b,@vtmp[1].16b
213 sli @vtmp[2].4s,$datx.4s,24
214 eor $datx.16b,@vtmp[2].16b,@vtmp[3].16b
215 ___
216 }
217
218 # sbox operation for a single word
219 sub sbox_1word () {
220 my $word = shift;
221
222 $code.=<<___;
223 movi @vtmp[1].16b,#64
224 movi @vtmp[2].16b,#128
225 movi @vtmp[3].16b,#192
226 mov @vtmp[0].s[0],$word
227
228 sub @vtmp[1].16b,@vtmp[0].16b,@vtmp[1].16b
229 sub @vtmp[2].16b,@vtmp[0].16b,@vtmp[2].16b
230 sub @vtmp[3].16b,@vtmp[0].16b,@vtmp[3].16b
231
232 tbl @vtmp[0].16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},@vtmp[0].16b
233 tbl @vtmp[1].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[1].16b
234 tbl @vtmp[2].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[2].16b
235 tbl @vtmp[3].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[3].16b
236
237 mov $word,@vtmp[0].s[0]
238 mov $wtmp0,@vtmp[1].s[0]
239 mov $wtmp2,@vtmp[2].s[0]
240 add $wtmp0,$word,$wtmp0
241 mov $word,@vtmp[3].s[0]
242 add $wtmp0,$wtmp0,$wtmp2
243 add $wtmp0,$wtmp0,$word
244
245 eor $word,$wtmp0,$wtmp0,ror #32-2
246 eor $word,$word,$wtmp0,ror #32-10
247 eor $word,$word,$wtmp0,ror #32-18
248 eor $word,$word,$wtmp0,ror #32-24
249 ___
250 }
251
252 # sm4 for one block of data, in scalar registers word0/word1/word2/word3
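# sm4_1blk() (like the 4- and 8-lane variants below) emits four SM4 rounds per
# call, loading the round keys in pairs; the encrypt_* helpers loop it eight
# times for the full 32 rounds.  Roughly, in C terms (illustrative only, rk[]
# being the expanded key and sm4_T() the S-box plus linear transform sketched
# above sbox()):
#
#   for (int i = 0; i < 32; i++)
#       X[i % 4] ^= sm4_T(X[(i+1) % 4] ^ X[(i+2) % 4] ^ X[(i+3) % 4] ^ rk[i]);
#   /* the result is X[3],X[2],X[1],X[0], i.e. the words in reverse order */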
253 sub sm4_1blk () {
254 my $kptr = shift;
255
256 $code.=<<___;
257 ldp $wtmp0,$wtmp1,[$kptr],8
258 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
259 eor $tmpw,$word2,$word3
260 eor $wtmp2,$wtmp0,$word1
261 eor $tmpw,$tmpw,$wtmp2
262 ___
263 &sbox_1word($tmpw);
264 $code.=<<___;
265 eor $word0,$word0,$tmpw
266 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
267 eor $tmpw,$word2,$word3
268 eor $wtmp2,$word0,$wtmp1
269 eor $tmpw,$tmpw,$wtmp2
270 ___
271 &sbox_1word($tmpw);
272 $code.=<<___;
273 ldp $wtmp0,$wtmp1,[$kptr],8
274 eor $word1,$word1,$tmpw
275 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
276 eor $tmpw,$word0,$word1
277 eor $wtmp2,$wtmp0,$word3
278 eor $tmpw,$tmpw,$wtmp2
279 ___
280 &sbox_1word($tmpw);
281 $code.=<<___;
282 eor $word2,$word2,$tmpw
283 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
284 eor $tmpw,$word0,$word1
285 eor $wtmp2,$word2,$wtmp1
286 eor $tmpw,$tmpw,$wtmp2
287 ___
288 &sbox_1word($tmpw);
289 $code.=<<___;
290 eor $word3,$word3,$tmpw
291 ___
292 }
293
294 # sm4 for 4 lanes of data, in neon registers data0/data1/data2/data3
295 sub sm4_4blks () {
296 my $kptr = shift;
297
298 $code.=<<___;
299 ldp $wtmp0,$wtmp1,[$kptr],8
300 dup $rk0.4s,$wtmp0
301 dup $rk1.4s,$wtmp1
302
303 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
304 eor $rka.16b,@data[2].16b,@data[3].16b
305 eor $rk0.16b,@data[1].16b,$rk0.16b
306 eor $rk0.16b,$rka.16b,$rk0.16b
307 ___
308 &sbox($rk0);
309 $code.=<<___;
310 eor @data[0].16b,@data[0].16b,$rk0.16b
311
312 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
313 eor $rka.16b,$rka.16b,@data[0].16b
314 eor $rk1.16b,$rka.16b,$rk1.16b
315 ___
316 &sbox($rk1);
317 $code.=<<___;
318 ldp $wtmp0,$wtmp1,[$kptr],8
319 eor @data[1].16b,@data[1].16b,$rk1.16b
320
321 dup $rk0.4s,$wtmp0
322 dup $rk1.4s,$wtmp1
323
324 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
325 eor $rka.16b,@data[0].16b,@data[1].16b
326 eor $rk0.16b,@data[3].16b,$rk0.16b
327 eor $rk0.16b,$rka.16b,$rk0.16b
328 ___
329 &sbox($rk0);
330 $code.=<<___;
331 eor @data[2].16b,@data[2].16b,$rk0.16b
332
333 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
334 eor $rka.16b,$rka.16b,@data[2].16b
335 eor $rk1.16b,$rka.16b,$rk1.16b
336 ___
337 &sbox($rk1);
338 $code.=<<___;
339 eor @data[3].16b,@data[3].16b,$rk1.16b
340 ___
341 }
342
343 # sm4 for 8 lanes of data, in neon registers
344 # data0/data1/data2/data3 datax0/datax1/datax2/datax3
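# sm4_8blks() runs the same four rounds over two independent groups of four
# lanes (data[] and datax[]), interleaving the two streams instruction by
# instruction.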
345 sub sm4_8blks () {
346 my $kptr = shift;
347
348 $code.=<<___;
349 ldp $wtmp0,$wtmp1,[$kptr],8
350 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
351 dup $rk0.4s,$wtmp0
352 eor $rka.16b,@data[2].16b,@data[3].16b
353 eor $rkb.16b,@datax[2].16b,@datax[3].16b
354 eor @vtmp[0].16b,@data[1].16b,$rk0.16b
355 eor @vtmp[1].16b,@datax[1].16b,$rk0.16b
356 eor $rk0.16b,$rka.16b,@vtmp[0].16b
357 eor $rk1.16b,$rkb.16b,@vtmp[1].16b
358 ___
359 &sbox_double($rk0,$rk1);
360 $code.=<<___;
361 eor @data[0].16b,@data[0].16b,$rk0.16b
362 eor @datax[0].16b,@datax[0].16b,$rk1.16b
363
364 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
365 dup $rk1.4s,$wtmp1
366 eor $rka.16b,$rka.16b,@data[0].16b
367 eor $rkb.16b,$rkb.16b,@datax[0].16b
368 eor $rk0.16b,$rka.16b,$rk1.16b
369 eor $rk1.16b,$rkb.16b,$rk1.16b
370 ___
371 &sbox_double($rk0,$rk1);
372 $code.=<<___;
373 ldp $wtmp0,$wtmp1,[$kptr],8
374 eor @data[1].16b,@data[1].16b,$rk0.16b
375 eor @datax[1].16b,@datax[1].16b,$rk1.16b
376
377 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
378 dup $rk0.4s,$wtmp0
379 eor $rka.16b,@data[0].16b,@data[1].16b
380 eor $rkb.16b,@datax[0].16b,@datax[1].16b
381 eor @vtmp[0].16b,@data[3].16b,$rk0.16b
382 eor @vtmp[1].16b,@datax[3].16b,$rk0.16b
383 eor $rk0.16b,$rka.16b,@vtmp[0].16b
384 eor $rk1.16b,$rkb.16b,@vtmp[1].16b
385 ___
386 &sbox_double($rk0,$rk1);
387 $code.=<<___;
388 eor @data[2].16b,@data[2].16b,$rk0.16b
389 eor @datax[2].16b,@datax[2].16b,$rk1.16b
390
391 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
392 dup $rk1.4s,$wtmp1
393 eor $rka.16b,$rka.16b,@data[2].16b
394 eor $rkb.16b,$rkb.16b,@datax[2].16b
395 eor $rk0.16b,$rka.16b,$rk1.16b
396 eor $rk1.16b,$rkb.16b,$rk1.16b
397 ___
398 &sbox_double($rk0,$rk1);
399 $code.=<<___;
400 eor @data[3].16b,@data[3].16b,$rk0.16b
401 eor @datax[3].16b,@datax[3].16b,$rk1.16b
402 ___
403 }
404
405 sub encrypt_1blk_norev() {
406 my $dat = shift;
407
408 $code.=<<___;
409 mov $ptr,$rks
410 mov $counter,#8
411 mov $word0,$dat.s[0]
412 mov $word1,$dat.s[1]
413 mov $word2,$dat.s[2]
414 mov $word3,$dat.s[3]
415 10:
416 ___
417 &sm4_1blk($ptr);
418 $code.=<<___;
419 subs $counter,$counter,#1
420 b.ne 10b
421 mov $dat.s[0],$word3
422 mov $dat.s[1],$word2
423 mov $dat.s[2],$word1
424 mov $dat.s[3],$word0
425 ___
426 }
427
428 sub encrypt_1blk() {
429 my $dat = shift;
430
431 &encrypt_1blk_norev($dat);
432 &rev32($dat,$dat);
433 }
434
435 sub encrypt_4blks() {
436 $code.=<<___;
437 mov $ptr,$rks
438 mov $counter,#8
439 10:
440 ___
441 &sm4_4blks($ptr);
442 $code.=<<___;
443 subs $counter,$counter,#1
444 b.ne 10b
445 ___
446 &rev32(@vtmp[3],@data[0]);
447 &rev32(@vtmp[2],@data[1]);
448 &rev32(@vtmp[1],@data[2]);
449 &rev32(@vtmp[0],@data[3]);
450 }
451
452 sub encrypt_8blks() {
453 $code.=<<___;
454 mov $ptr,$rks
455 mov $counter,#8
456 10:
457 ___
458 &sm4_8blks($ptr);
459 $code.=<<___;
460 subs $counter,$counter,#1
461 b.ne 10b
462 ___
463 &rev32(@vtmp[3],@data[0]);
464 &rev32(@vtmp[2],@data[1]);
465 &rev32(@vtmp[1],@data[2]);
466 &rev32(@vtmp[0],@data[3]);
467 &rev32(@data[3],@datax[0]);
468 &rev32(@data[2],@datax[1]);
469 &rev32(@data[1],@datax[2]);
470 &rev32(@data[0],@datax[3]);
471 }
472
473 sub load_sbox () {
474 my $data = shift;
475
476 $code.=<<___;
477 adr $ptr,.Lsbox
478 ld1 {@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},[$ptr],#64
479 ld1 {@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},[$ptr],#64
480 ld1 {@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},[$ptr],#64
481 ld1 {@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},[$ptr]
482 ___
483 }
484
485
486 sub mov_reg_to_vec() {
487 my $src0 = shift;
488 my $src1 = shift;
489 my $desv = shift;
490 $code.=<<___;
491 mov $desv.d[0],$src0
492 mov $desv.d[1],$src1
493 ___
494 &rev32_armeb($desv,$desv);
495 }
496
497 sub mov_vec_to_reg() {
498 my $srcv = shift;
499 my $des0 = shift;
500 my $des1 = shift;
501 $code.=<<___;
502 mov $des0,$srcv.d[0]
503 mov $des1,$srcv.d[1]
504 ___
505 }
506
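# compute_tweak() advances an XTS tweak held in a pair of general-purpose
# registers: multiply by x in GF(2^128) with the reduction polynomial
# x^128 + x^7 + x^2 + x + 1, i.e. shift the 128-bit value left by one and, if
# the top bit was set, XOR 0x87 into the low byte.  A C sketch of the same
# operation on the low/high 64-bit halves (illustrative only):
#
#   static void xts_double(uint64_t *lo, uint64_t *hi) {
#       uint64_t carry = (uint64_t)((int64_t)*hi >> 63) & 0x87;
#       *hi = (*hi << 1) | (*lo >> 63);
#       *lo = (*lo << 1) ^ carry;
#   }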
507 sub compute_tweak() {
508 my $src0 = shift;
509 my $src1 = shift;
510 my $des0 = shift;
511 my $des1 = shift;
512 $code.=<<___;
513 mov $wtmp0,0x87
514 extr $xtmp2,$src1,$src1,#32
515 extr $des1,$src1,$src0,#63
516 and $wtmp1,$wtmp0,$wtmp2,asr#31
517 eor $des0,$xtmp1,$src0,lsl#1
518 ___
519 }
520
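# compute_tweak_vec() is the NEON version of the same doubling: each byte is
# shifted left by one, the bit carried out of each byte propagates into the
# next byte, and the bit carried out of the top byte is multiplied by 0x87
# (via the .Lxts_magic constant) and folded into the lowest byte.  For the
# "_gb" flavour the tweak is kept bit-reversed within each byte (rbit before
# and after), so the doubling operates in the bit order used by that variant.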
521 sub compute_tweak_vec() {
522 my $src = shift;
523 my $des = shift;
524 my $std = shift;
525 &rbit(@vtmp[2],$src,$std);
526 $code.=<<___;
527 ldr @qtmp[0], .Lxts_magic
528 shl $des.16b, @vtmp[2].16b, #1
529 ext @vtmp[1].16b, @vtmp[2].16b, @vtmp[2].16b,#15
530 ushr @vtmp[1].16b, @vtmp[1].16b, #7
531 mul @vtmp[1].16b, @vtmp[1].16b, @vtmp[0].16b
532 eor $des.16b, $des.16b, @vtmp[1].16b
533 ___
534 &rbit($des,$des,$std);
535 }
536
537 $code=<<___;
538 #include "arm_arch.h"
539 .arch armv8-a
540 .text
541
542 .type _vpsm4_consts,%object
543 .align 7
544 _vpsm4_consts:
545 .Lsbox:
546 .byte 0xD6,0x90,0xE9,0xFE,0xCC,0xE1,0x3D,0xB7,0x16,0xB6,0x14,0xC2,0x28,0xFB,0x2C,0x05
547 .byte 0x2B,0x67,0x9A,0x76,0x2A,0xBE,0x04,0xC3,0xAA,0x44,0x13,0x26,0x49,0x86,0x06,0x99
548 .byte 0x9C,0x42,0x50,0xF4,0x91,0xEF,0x98,0x7A,0x33,0x54,0x0B,0x43,0xED,0xCF,0xAC,0x62
549 .byte 0xE4,0xB3,0x1C,0xA9,0xC9,0x08,0xE8,0x95,0x80,0xDF,0x94,0xFA,0x75,0x8F,0x3F,0xA6
550 .byte 0x47,0x07,0xA7,0xFC,0xF3,0x73,0x17,0xBA,0x83,0x59,0x3C,0x19,0xE6,0x85,0x4F,0xA8
551 .byte 0x68,0x6B,0x81,0xB2,0x71,0x64,0xDA,0x8B,0xF8,0xEB,0x0F,0x4B,0x70,0x56,0x9D,0x35
552 .byte 0x1E,0x24,0x0E,0x5E,0x63,0x58,0xD1,0xA2,0x25,0x22,0x7C,0x3B,0x01,0x21,0x78,0x87
553 .byte 0xD4,0x00,0x46,0x57,0x9F,0xD3,0x27,0x52,0x4C,0x36,0x02,0xE7,0xA0,0xC4,0xC8,0x9E
554 .byte 0xEA,0xBF,0x8A,0xD2,0x40,0xC7,0x38,0xB5,0xA3,0xF7,0xF2,0xCE,0xF9,0x61,0x15,0xA1
555 .byte 0xE0,0xAE,0x5D,0xA4,0x9B,0x34,0x1A,0x55,0xAD,0x93,0x32,0x30,0xF5,0x8C,0xB1,0xE3
556 .byte 0x1D,0xF6,0xE2,0x2E,0x82,0x66,0xCA,0x60,0xC0,0x29,0x23,0xAB,0x0D,0x53,0x4E,0x6F
557 .byte 0xD5,0xDB,0x37,0x45,0xDE,0xFD,0x8E,0x2F,0x03,0xFF,0x6A,0x72,0x6D,0x6C,0x5B,0x51
558 .byte 0x8D,0x1B,0xAF,0x92,0xBB,0xDD,0xBC,0x7F,0x11,0xD9,0x5C,0x41,0x1F,0x10,0x5A,0xD8
559 .byte 0x0A,0xC1,0x31,0x88,0xA5,0xCD,0x7B,0xBD,0x2D,0x74,0xD0,0x12,0xB8,0xE5,0xB4,0xB0
560 .byte 0x89,0x69,0x97,0x4A,0x0C,0x96,0x77,0x7E,0x65,0xB9,0xF1,0x09,0xC5,0x6E,0xC6,0x84
561 .byte 0x18,0xF0,0x7D,0xEC,0x3A,0xDC,0x4D,0x20,0x79,0xEE,0x5F,0x3E,0xD7,0xCB,0x39,0x48
562 .Lck:
563 .long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
564 .long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
565 .long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
566 .long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
567 .long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
568 .long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
569 .long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
570 .long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
571 .Lfk:
572 .quad 0x56aa3350a3b1bac6,0xb27022dc677d9197
573 .Lshuffles:
574 .quad 0x0B0A090807060504,0x030201000F0E0D0C
575 .Lxts_magic:
576 .quad 0x0101010101010187,0x0101010101010101
577
578 .size _vpsm4_consts,.-_vpsm4_consts
579 ___
580
581 {{{
582 my ($key,$keys,$enc)=("x0","x1","w2");
583 my ($pointer,$schedules,$wtmp,$roundkey)=("x5","x6","w7","w8");
584 my ($vkey,$vfk,$vmap)=("v5","v6","v7");
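# _vpsm4_set_key expands the 128-bit user key into 32 round keys:
#   K[0..3] = MK[0..3] ^ FK[0..3]
#   rk[i]   = K[i+4] = K[i] ^ T'(K[i+1] ^ K[i+2] ^ K[i+3] ^ CK[i])
# where T' substitutes each byte through the S-box and then applies
# L'(B) = B ^ (B<<<13) ^ (B<<<23) (hence the "ror #19"/"ror #9" below).
# The tbl with .Lshuffles rotates the key window by one word per iteration.
# For decryption (w2 == 0) the same keys are produced but stored in reverse
# order, which is why $keys is first advanced to offset 124 and then written
# with a negative post-index.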
585 $code.=<<___;
586 .type _vpsm4_set_key,%function
587 .align 4
588 _vpsm4_set_key:
589 AARCH64_VALID_CALL_TARGET
590 ld1 {$vkey.4s},[$key]
591 ___
592 &load_sbox();
593 &rev32($vkey,$vkey);
594 $code.=<<___;
595 adr $pointer,.Lshuffles
596 ld1 {$vmap.2d},[$pointer]
597 adr $pointer,.Lfk
598 ld1 {$vfk.2d},[$pointer]
599 eor $vkey.16b,$vkey.16b,$vfk.16b
600 mov $schedules,#32
601 adr $pointer,.Lck
602 movi @vtmp[0].16b,#64
603 cbnz $enc,1f
604 add $keys,$keys,124
605 1:
606 mov $wtmp,$vkey.s[1]
607 ldr $roundkey,[$pointer],#4
608 eor $roundkey,$roundkey,$wtmp
609 mov $wtmp,$vkey.s[2]
610 eor $roundkey,$roundkey,$wtmp
611 mov $wtmp,$vkey.s[3]
612 eor $roundkey,$roundkey,$wtmp
613 // sbox lookup
614 mov @data[0].s[0],$roundkey
615 tbl @vtmp[1].16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},@data[0].16b
616 sub @data[0].16b,@data[0].16b,@vtmp[0].16b
617 tbx @vtmp[1].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@data[0].16b
618 sub @data[0].16b,@data[0].16b,@vtmp[0].16b
619 tbx @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@data[0].16b
620 sub @data[0].16b,@data[0].16b,@vtmp[0].16b
621 tbx @vtmp[1].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@data[0].16b
622 mov $wtmp,@vtmp[1].s[0]
623 eor $roundkey,$wtmp,$wtmp,ror #19
624 eor $roundkey,$roundkey,$wtmp,ror #9
625 mov $wtmp,$vkey.s[0]
626 eor $roundkey,$roundkey,$wtmp
627 mov $vkey.s[0],$roundkey
628 cbz $enc,2f
629 str $roundkey,[$keys],#4
630 b 3f
631 2:
632 str $roundkey,[$keys],#-4
633 3:
634 tbl $vkey.16b,{$vkey.16b},$vmap.16b
635 subs $schedules,$schedules,#1
636 b.ne 1b
637 ret
638 .size _vpsm4_set_key,.-_vpsm4_set_key
639 ___
640 }}}
641
642
643 {{{
644 $code.=<<___;
645 .type _vpsm4_enc_4blks,%function
646 .align 4
647 _vpsm4_enc_4blks:
648 AARCH64_VALID_CALL_TARGET
649 ___
650 &encrypt_4blks();
651 $code.=<<___;
652 ret
653 .size _vpsm4_enc_4blks,.-_vpsm4_enc_4blks
654 ___
655 }}}
656
657 {{{
658 $code.=<<___;
659 .type _vpsm4_enc_8blks,%function
660 .align 4
661 _vpsm4_enc_8blks:
662 AARCH64_VALID_CALL_TARGET
663 ___
664 &encrypt_8blks();
665 $code.=<<___;
666 ret
667 .size _vpsm4_enc_8blks,.-_vpsm4_enc_8blks
668 ___
669 }}}
670
671
672 {{{
673 my ($key,$keys)=("x0","x1");
674 $code.=<<___;
675 .globl ${prefix}_set_encrypt_key
676 .type ${prefix}_set_encrypt_key,%function
677 .align 5
678 ${prefix}_set_encrypt_key:
679 AARCH64_SIGN_LINK_REGISTER
680 stp x29,x30,[sp,#-16]!
681 mov w2,1
682 bl _vpsm4_set_key
683 ldp x29,x30,[sp],#16
684 AARCH64_VALIDATE_LINK_REGISTER
685 ret
686 .size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
687 ___
688 }}}
689
690 {{{
691 my ($key,$keys)=("x0","x1");
692 $code.=<<___;
693 .globl ${prefix}_set_decrypt_key
694 .type ${prefix}_set_decrypt_key,%function
695 .align 5
696 ${prefix}_set_decrypt_key:
697 AARCH64_SIGN_LINK_REGISTER
698 stp x29,x30,[sp,#-16]!
699 mov w2,0
700 bl _vpsm4_set_key
701 ldp x29,x30,[sp],#16
702 AARCH64_VALIDATE_LINK_REGISTER
703 ret
704 .size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
705 ___
706 }}}
707
708 {{{
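# gen_block() emits ${prefix}_encrypt and ${prefix}_decrypt; the two bodies
# are identical because the direction is encoded entirely in the round-key
# order produced by _vpsm4_set_key.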
709 sub gen_block () {
710 my $dir = shift;
711 my ($inp,$outp,$rk)=map("x$_",(0..2));
712
713 $code.=<<___;
714 .globl ${prefix}_${dir}crypt
715 .type ${prefix}_${dir}crypt,%function
716 .align 5
717 ${prefix}_${dir}crypt:
718 AARCH64_VALID_CALL_TARGET
719 ld1 {@data[0].4s},[$inp]
720 ___
721 &load_sbox();
722 &rev32(@data[0],@data[0]);
723 $code.=<<___;
724 mov $rks,x2
725 ___
726 &encrypt_1blk(@data[0]);
727 $code.=<<___;
728 st1 {@data[0].4s},[$outp]
729 ret
730 .size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
731 ___
732 }
733 &gen_block("en");
734 &gen_block("de");
735 }}}
736
737 {{{
738 my ($enc) = ("w4");
739 my @dat=map("v$_",(16..23));
740
741 $code.=<<___;
742 .globl ${prefix}_ecb_encrypt
743 .type ${prefix}_ecb_encrypt,%function
744 .align 5
745 ${prefix}_ecb_encrypt:
746 AARCH64_SIGN_LINK_REGISTER
747 // convert length into blocks
748 lsr x2,x2,4
749 stp d8,d9,[sp,#-80]!
750 stp d10,d11,[sp,#16]
751 stp d12,d13,[sp,#32]
752 stp d14,d15,[sp,#48]
753 stp x29,x30,[sp,#64]
754 ___
755 &load_sbox();
756 $code.=<<___;
757 .Lecb_8_blocks_process:
758 cmp $blocks,#8
759 b.lt .Lecb_4_blocks_process
760 ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
761 ld4 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
762 ___
763 &rev32(@data[0],@data[0]);
764 &rev32(@data[1],@data[1]);
765 &rev32(@data[2],@data[2]);
766 &rev32(@data[3],@data[3]);
767 &rev32(@datax[0],@datax[0]);
768 &rev32(@datax[1],@datax[1]);
769 &rev32(@datax[2],@datax[2]);
770 &rev32(@datax[3],@datax[3]);
771 $code.=<<___;
772 bl _vpsm4_enc_8blks
773 st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
774 st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
775 subs $blocks,$blocks,#8
776 b.gt .Lecb_8_blocks_process
777 b 100f
778 .Lecb_4_blocks_process:
779 cmp $blocks,#4
780 b.lt 1f
781 ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
782 ___
783 &rev32(@data[0],@data[0]);
784 &rev32(@data[1],@data[1]);
785 &rev32(@data[2],@data[2]);
786 &rev32(@data[3],@data[3]);
787 $code.=<<___;
788 bl _vpsm4_enc_4blks
789 st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
790 sub $blocks,$blocks,#4
791 1:
792 // process last block
793 cmp $blocks,#1
794 b.lt 100f
795 b.gt 1f
796 ld1 {@data[0].4s},[$inp]
797 ___
798 &rev32(@data[0],@data[0]);
799 &encrypt_1blk(@data[0]);
800 $code.=<<___;
801 st1 {@data[0].4s},[$outp]
802 b 100f
803 1: // process last 2 blocks
804 ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp],#16
805 ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$inp],#16
806 cmp $blocks,#2
807 b.gt 1f
808 ___
809 &rev32(@data[0],@data[0]);
810 &rev32(@data[1],@data[1]);
811 &rev32(@data[2],@data[2]);
812 &rev32(@data[3],@data[3]);
813 $code.=<<___;
814 bl _vpsm4_enc_4blks
815 st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
816 st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp]
817 b 100f
818 1: // process last 3 blocks
819 ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$inp],#16
820 ___
821 &rev32(@data[0],@data[0]);
822 &rev32(@data[1],@data[1]);
823 &rev32(@data[2],@data[2]);
824 &rev32(@data[3],@data[3]);
825 $code.=<<___;
826 bl _vpsm4_enc_4blks
827 st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
828 st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp],#16
829 st4 {@vtmp[0].s-@vtmp[3].s}[2],[$outp]
830 100:
831 ldp d10,d11,[sp,#16]
832 ldp d12,d13,[sp,#32]
833 ldp d14,d15,[sp,#48]
834 ldp x29,x30,[sp,#64]
835 ldp d8,d9,[sp],#80
836 AARCH64_VALIDATE_LINK_REGISTER
837 ret
838 .size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
839 ___
840 }}}
841
842 {{{
843 my ($len,$ivp,$enc)=("x2","x4","w5");
844 my $ivec0=("v3");
845 my $ivec1=("v15");
846
847 $code.=<<___;
848 .globl ${prefix}_cbc_encrypt
849 .type ${prefix}_cbc_encrypt,%function
850 .align 5
851 ${prefix}_cbc_encrypt:
852 AARCH64_VALID_CALL_TARGET
853 lsr $len,$len,4
854 ___
855 &load_sbox();
856 $code.=<<___;
857 cbz $enc,.Ldec
858 ld1 {$ivec0.4s},[$ivp]
859 .Lcbc_4_blocks_enc:
860 cmp $blocks,#4
861 b.lt 1f
862 ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
863 eor @data[0].16b,@data[0].16b,$ivec0.16b
864 ___
865 &rev32(@data[1],@data[1]);
866 &rev32(@data[0],@data[0]);
867 &rev32(@data[2],@data[2]);
868 &rev32(@data[3],@data[3]);
869 &encrypt_1blk_norev(@data[0]);
870 $code.=<<___;
871 eor @data[1].16b,@data[1].16b,@data[0].16b
872 ___
873 &encrypt_1blk_norev(@data[1]);
874 &rev32(@data[0],@data[0]);
875
876 $code.=<<___;
877 eor @data[2].16b,@data[2].16b,@data[1].16b
878 ___
879 &encrypt_1blk_norev(@data[2]);
880 &rev32(@data[1],@data[1]);
881 $code.=<<___;
882 eor @data[3].16b,@data[3].16b,@data[2].16b
883 ___
884 &encrypt_1blk_norev(@data[3]);
885 &rev32(@data[2],@data[2]);
886 &rev32(@data[3],@data[3]);
887 $code.=<<___;
888 orr $ivec0.16b,@data[3].16b,@data[3].16b
889 st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
890 subs $blocks,$blocks,#4
891 b.ne .Lcbc_4_blocks_enc
892 b 2f
893 1:
894 subs $blocks,$blocks,#1
895 b.lt 2f
896 ld1 {@data[0].4s},[$inp],#16
897 eor $ivec0.16b,$ivec0.16b,@data[0].16b
898 ___
899 &rev32($ivec0,$ivec0);
900 &encrypt_1blk($ivec0);
901 $code.=<<___;
902 st1 {$ivec0.4s},[$outp],#16
903 b 1b
904 2:
905 // save back IV
906 st1 {$ivec0.4s},[$ivp]
907 ret
908
909 .Ldec:
910 // decryption mode starts
911 AARCH64_SIGN_LINK_REGISTER
912 stp d8,d9,[sp,#-80]!
913 stp d10,d11,[sp,#16]
914 stp d12,d13,[sp,#32]
915 stp d14,d15,[sp,#48]
916 stp x29,x30,[sp,#64]
917 .Lcbc_8_blocks_dec:
918 cmp $blocks,#8
919 b.lt 1f
920 ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
921 add $ptr,$inp,#64
922 ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$ptr]
923 ___
924 &rev32(@data[0],@data[0]);
925 &rev32(@data[1],@data[1]);
926 &rev32(@data[2],@data[2]);
927 &rev32(@data[3],$data[3]);
928 &rev32(@datax[0],@datax[0]);
929 &rev32(@datax[1],@datax[1]);
930 &rev32(@datax[2],@datax[2]);
931 &rev32(@datax[3],$datax[3]);
932 $code.=<<___;
933 bl _vpsm4_enc_8blks
934 ___
935 &transpose(@vtmp,@datax);
936 &transpose(@data,@datax);
937 $code.=<<___;
938 ld1 {$ivec1.4s},[$ivp]
939 ld1 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
940         // note that ivec1 and vtmpx[3] reuse the same register,
941         // so care must be taken to avoid a conflict
942 eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
943 ld1 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
944 eor @vtmp[1].16b,@vtmp[1].16b,@datax[0].16b
945 eor @vtmp[2].16b,@vtmp[2].16b,@datax[1].16b
946 eor @vtmp[3].16b,$vtmp[3].16b,@datax[2].16b
947 // save back IV
948 st1 {$vtmpx[3].4s}, [$ivp]
949 eor @data[0].16b,@data[0].16b,$datax[3].16b
950 eor @data[1].16b,@data[1].16b,@vtmpx[0].16b
951 eor @data[2].16b,@data[2].16b,@vtmpx[1].16b
952 eor @data[3].16b,$data[3].16b,@vtmpx[2].16b
953 st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
954 st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
955 subs $blocks,$blocks,#8
956 b.gt .Lcbc_8_blocks_dec
957 b.eq 100f
958 1:
959 ld1 {$ivec1.4s},[$ivp]
960 .Lcbc_4_blocks_dec:
961 cmp $blocks,#4
962 b.lt 1f
963 ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
964 ___
965 &rev32(@data[0],@data[0]);
966 &rev32(@data[1],@data[1]);
967 &rev32(@data[2],@data[2]);
968 &rev32(@data[3],$data[3]);
969 $code.=<<___;
970 bl _vpsm4_enc_4blks
971 ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
972 ___
973 &transpose(@vtmp,@datax);
974 $code.=<<___;
975 eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
976 eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
977 orr $ivec1.16b,@data[3].16b,@data[3].16b
978 eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
979 eor @vtmp[3].16b,$vtmp[3].16b,@data[2].16b
980 st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
981 subs $blocks,$blocks,#4
982 b.gt .Lcbc_4_blocks_dec
983 // save back IV
984 st1 {@data[3].4s}, [$ivp]
985 b 100f
986 1: // last block
987 subs $blocks,$blocks,#1
988 b.lt 100f
989 b.gt 1f
990 ld1 {@data[0].4s},[$inp],#16
991 // save back IV
992 st1 {$data[0].4s}, [$ivp]
993 ___
994 &rev32(@datax[0],@data[0]);
995 &encrypt_1blk(@datax[0]);
996 $code.=<<___;
997 eor @datax[0].16b,@datax[0].16b,$ivec1.16b
998 st1 {@datax[0].4s},[$outp],#16
999 b 100f
1000 1: // last two blocks
1001 ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp]
1002 add $ptr,$inp,#16
1003 ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$ptr],#16
1004 subs $blocks,$blocks,1
1005 b.gt 1f
1006 ___
1007 &rev32(@data[0],@data[0]);
1008 &rev32(@data[1],@data[1]);
1009 &rev32(@data[2],@data[2]);
1010 &rev32(@data[3],@data[3]);
1011 $code.=<<___;
1012 bl _vpsm4_enc_4blks
1013 ld1 {@data[0].4s,@data[1].4s},[$inp],#32
1014 ___
1015 &transpose(@vtmp,@datax);
1016 $code.=<<___;
1017 eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
1018 eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
1019 st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
1020 // save back IV
1021 st1 {@data[1].4s}, [$ivp]
1022 b 100f
1023 1: // last 3 blocks
1024 ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$ptr]
1025 ___
1026 &rev32(@data[0],@data[0]);
1027 &rev32(@data[1],@data[1]);
1028 &rev32(@data[2],@data[2]);
1029 &rev32(@data[3],@data[3]);
1030 $code.=<<___;
1031 bl _vpsm4_enc_4blks
1032 ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
1033 ___
1034 &transpose(@vtmp,@datax);
1035 $code.=<<___;
1036 eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
1037 eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
1038 eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
1039 st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
1040 // save back IV
1041 st1 {@data[2].4s}, [$ivp]
1042 100:
1043 ldp d10,d11,[sp,#16]
1044 ldp d12,d13,[sp,#32]
1045 ldp d14,d15,[sp,#48]
1046 ldp x29,x30,[sp,#64]
1047 ldp d8,d9,[sp],#80
1048 AARCH64_VALIDATE_LINK_REGISTER
1049 ret
1050 .size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
1051 ___
1052 }}}
1053
1054 {{{
1055 my ($ivp)=("x4");
1056 my ($ctr)=("w5");
1057 my $ivec=("v3");
1058
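# CTR mode keeps the IV words in scalar registers and increments only the
# last (big-endian) 32-bit word per block, i.e. the usual ctr32 convention.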
1059 $code.=<<___;
1060 .globl ${prefix}_ctr32_encrypt_blocks
1061 .type ${prefix}_ctr32_encrypt_blocks,%function
1062 .align 5
1063 ${prefix}_ctr32_encrypt_blocks:
1064 AARCH64_VALID_CALL_TARGET
1065 ld1 {$ivec.4s},[$ivp]
1066 ___
1067 &rev32($ivec,$ivec);
1068 &load_sbox();
1069 $code.=<<___;
1070 cmp $blocks,#1
1071 b.ne 1f
1072         // fast processing for a single block without
1073         // context-saving overhead
1074 ___
1075 &encrypt_1blk($ivec);
1076 $code.=<<___;
1077 ld1 {@data[0].4s},[$inp]
1078 eor @data[0].16b,@data[0].16b,$ivec.16b
1079 st1 {@data[0].4s},[$outp]
1080 ret
1081 1:
1082 AARCH64_SIGN_LINK_REGISTER
1083 stp d8,d9,[sp,#-80]!
1084 stp d10,d11,[sp,#16]
1085 stp d12,d13,[sp,#32]
1086 stp d14,d15,[sp,#48]
1087 stp x29,x30,[sp,#64]
1088 mov $word0,$ivec.s[0]
1089 mov $word1,$ivec.s[1]
1090 mov $word2,$ivec.s[2]
1091 mov $ctr,$ivec.s[3]
1092 .Lctr32_4_blocks_process:
1093 cmp $blocks,#4
1094 b.lt 1f
1095 dup @data[0].4s,$word0
1096 dup @data[1].4s,$word1
1097 dup @data[2].4s,$word2
1098 mov @data[3].s[0],$ctr
1099 add $ctr,$ctr,#1
1100 mov $data[3].s[1],$ctr
1101 add $ctr,$ctr,#1
1102 mov @data[3].s[2],$ctr
1103 add $ctr,$ctr,#1
1104 mov @data[3].s[3],$ctr
1105 add $ctr,$ctr,#1
1106 cmp $blocks,#8
1107 b.ge .Lctr32_8_blocks_process
1108 bl _vpsm4_enc_4blks
1109 ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
1110 eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
1111 eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
1112 eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
1113 eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
1114 st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
1115 subs $blocks,$blocks,#4
1116 b.ne .Lctr32_4_blocks_process
1117 b 100f
1118 .Lctr32_8_blocks_process:
1119 dup @datax[0].4s,$word0
1120 dup @datax[1].4s,$word1
1121 dup @datax[2].4s,$word2
1122 mov @datax[3].s[0],$ctr
1123 add $ctr,$ctr,#1
1124 mov $datax[3].s[1],$ctr
1125 add $ctr,$ctr,#1
1126 mov @datax[3].s[2],$ctr
1127 add $ctr,$ctr,#1
1128 mov @datax[3].s[3],$ctr
1129 add $ctr,$ctr,#1
1130 bl _vpsm4_enc_8blks
1131 ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
1132 ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
1133 eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
1134 eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
1135 eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
1136 eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
1137 eor @data[0].16b,@data[0].16b,@datax[0].16b
1138 eor @data[1].16b,@data[1].16b,@datax[1].16b
1139 eor @data[2].16b,@data[2].16b,@datax[2].16b
1140 eor @data[3].16b,@data[3].16b,@datax[3].16b
1141 st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
1142 st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
1143 subs $blocks,$blocks,#8
1144 b.ne .Lctr32_4_blocks_process
1145 b 100f
1146 1: // last block processing
1147 subs $blocks,$blocks,#1
1148 b.lt 100f
1149 b.gt 1f
1150 mov $ivec.s[0],$word0
1151 mov $ivec.s[1],$word1
1152 mov $ivec.s[2],$word2
1153 mov $ivec.s[3],$ctr
1154 ___
1155 &encrypt_1blk($ivec);
1156 $code.=<<___;
1157 ld1 {@data[0].4s},[$inp]
1158 eor @data[0].16b,@data[0].16b,$ivec.16b
1159 st1 {@data[0].4s},[$outp]
1160 b 100f
1161 1: // last 2 blocks processing
1162 dup @data[0].4s,$word0
1163 dup @data[1].4s,$word1
1164 dup @data[2].4s,$word2
1165 mov @data[3].s[0],$ctr
1166 add $ctr,$ctr,#1
1167 mov @data[3].s[1],$ctr
1168 subs $blocks,$blocks,#1
1169 b.ne 1f
1170 bl _vpsm4_enc_4blks
1171 ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
1172 ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
1173 eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
1174 eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
1175 eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
1176 eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
1177 st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
1178 st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
1179 b 100f
1180 1: // last 3 blocks processing
1181 add $ctr,$ctr,#1
1182 mov @data[3].s[2],$ctr
1183 bl _vpsm4_enc_4blks
1184 ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
1185 ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
1186 ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[2],[$inp],#16
1187 eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
1188 eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
1189 eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
1190 eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
1191 st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
1192 st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
1193 st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[2],[$outp],#16
1194 100:
1195 ldp d10,d11,[sp,#16]
1196 ldp d12,d13,[sp,#32]
1197 ldp d14,d15,[sp,#48]
1198 ldp x29,x30,[sp,#64]
1199 ldp d8,d9,[sp],#80
1200 AARCH64_VALIDATE_LINK_REGISTER
1201 ret
1202 .size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
1203 ___
1204 }}}
1205
1206 {{{
1207 my ($blocks,$len)=("x2","x2");
1208 my $ivp=("x5");
1209 my @twx=map("x$_",(12..27));
1210 my ($rks1,$rks2)=("x26","x27");
1211 my $lastBlk=("x26");
1212 my $enc=("w28");
1213 my $remain=("x29");
1214
1215 my @tweak=@datax;
1216
1217 sub gen_xts_cipher() {
1218 my $std = shift;
1219 $code.=<<___;
1220 .globl ${prefix}_xts_encrypt${std}
1221 .type ${prefix}_xts_encrypt${std},%function
1222 .align 5
1223 ${prefix}_xts_encrypt${std}:
1224 AARCH64_SIGN_LINK_REGISTER
1225 stp x15, x16, [sp, #-0x10]!
1226 stp x17, x18, [sp, #-0x10]!
1227 stp x19, x20, [sp, #-0x10]!
1228 stp x21, x22, [sp, #-0x10]!
1229 stp x23, x24, [sp, #-0x10]!
1230 stp x25, x26, [sp, #-0x10]!
1231 stp x27, x28, [sp, #-0x10]!
1232 stp x29, x30, [sp, #-0x10]!
1233 stp d8, d9, [sp, #-0x10]!
1234 stp d10, d11, [sp, #-0x10]!
1235 stp d12, d13, [sp, #-0x10]!
1236 stp d14, d15, [sp, #-0x10]!
1237 mov $rks1,x3
1238 mov $rks2,x4
1239 mov $enc,w6
1240 ld1 {@tweak[0].4s}, [$ivp]
1241 mov $rks,$rks2
1242 ___
1243 &load_sbox();
1244 &rev32(@tweak[0],@tweak[0]);
1245 &encrypt_1blk(@tweak[0]);
1246 $code.=<<___;
1247 mov $rks,$rks1
1248 and $remain,$len,#0x0F
1249 // convert length into blocks
1250 lsr $blocks,$len,4
1251 cmp $blocks,#1
1252 b.lt .return${std}
1253
1254 cmp $remain,0
1255         // If the encryption/decryption length is a multiple of 16,
1256         // all blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
1257 b.eq .xts_encrypt_blocks${std}
1258
1259         // If the encryption/decryption length is not a multiple of 16,
1260 // the last two blocks are encrypted/decrypted in .last_2blks_tweak${std} or .only_2blks_tweak${std}
1261 // the other blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
1262 subs $blocks,$blocks,#1
1263 b.eq .only_2blks_tweak${std}
1264 .xts_encrypt_blocks${std}:
1265 ___
1266 &rbit(@tweak[0],@tweak[0],$std);
1267 &rev32_armeb(@tweak[0],@tweak[0]);
1268 &mov_vec_to_reg(@tweak[0],@twx[0],@twx[1]);
1269 &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
1270 &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
1271 &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
1272 &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
1273 &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
1274 &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
1275 &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
1276 $code.=<<___;
1277 .Lxts_8_blocks_process${std}:
1278 cmp $blocks,#8
1279 b.lt .Lxts_4_blocks_process${std}
1280 ___
1281 &mov_reg_to_vec(@twx[0],@twx[1],@vtmp[0]);
1282 &mov_reg_to_vec(@twx[2],@twx[3],@vtmp[1]);
1283 &mov_reg_to_vec(@twx[4],@twx[5],@vtmp[2]);
1284 &mov_reg_to_vec(@twx[6],@twx[7],@vtmp[3]);
1285 &mov_reg_to_vec(@twx[8],@twx[9],@vtmpx[0]);
1286 &mov_reg_to_vec(@twx[10],@twx[11],@vtmpx[1]);
1287 &mov_reg_to_vec(@twx[12],@twx[13],@vtmpx[2]);
1288 &mov_reg_to_vec(@twx[14],@twx[15],@vtmpx[3]);
1289 $code.=<<___;
1290 ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
1291 ___
1292 &rbit(@vtmp[0],@vtmp[0],$std);
1293 &rbit(@vtmp[1],@vtmp[1],$std);
1294 &rbit(@vtmp[2],@vtmp[2],$std);
1295 &rbit(@vtmp[3],@vtmp[3],$std);
1296 $code.=<<___;
1297 eor @data[0].16b, @data[0].16b, @vtmp[0].16b
1298 eor @data[1].16b, @data[1].16b, @vtmp[1].16b
1299 eor @data[2].16b, @data[2].16b, @vtmp[2].16b
1300 eor @data[3].16b, @data[3].16b, @vtmp[3].16b
1301 ld1 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
1302 ___
1303 &rbit(@vtmpx[0],@vtmpx[0],$std);
1304 &rbit(@vtmpx[1],@vtmpx[1],$std);
1305 &rbit(@vtmpx[2],@vtmpx[2],$std);
1306 &rbit(@vtmpx[3],@vtmpx[3],$std);
1307 $code.=<<___;
1308 eor @datax[0].16b, @datax[0].16b, @vtmpx[0].16b
1309 eor @datax[1].16b, @datax[1].16b, @vtmpx[1].16b
1310 eor @datax[2].16b, @datax[2].16b, @vtmpx[2].16b
1311 eor @datax[3].16b, @datax[3].16b, @vtmpx[3].16b
1312 ___
1313 &rev32(@data[0],@data[0]);
1314 &rev32(@data[1],@data[1]);
1315 &rev32(@data[2],@data[2]);
1316 &rev32(@data[3],@data[3]);
1317 &rev32(@datax[0],@datax[0]);
1318 &rev32(@datax[1],@datax[1]);
1319 &rev32(@datax[2],@datax[2]);
1320 &rev32(@datax[3],@datax[3]);
1321 &transpose(@data,@vtmp);
1322 &transpose(@datax,@vtmp);
1323 $code.=<<___;
1324 bl _${prefix}_enc_8blks
1325 ___
1326 &transpose(@vtmp,@datax);
1327 &transpose(@data,@datax);
1328
1329 &mov_reg_to_vec(@twx[0],@twx[1],@vtmpx[0]);
1330 &compute_tweak(@twx[14],@twx[15],@twx[0],@twx[1]);
1331 &mov_reg_to_vec(@twx[2],@twx[3],@vtmpx[1]);
1332 &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
1333 &mov_reg_to_vec(@twx[4],@twx[5],@vtmpx[2]);
1334 &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
1335 &mov_reg_to_vec(@twx[6],@twx[7],@vtmpx[3]);
1336 &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
1337 &mov_reg_to_vec(@twx[8],@twx[9],@tweak[0]);
1338 &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
1339 &mov_reg_to_vec(@twx[10],@twx[11],@tweak[1]);
1340 &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
1341 &mov_reg_to_vec(@twx[12],@twx[13],@tweak[2]);
1342 &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
1343 &mov_reg_to_vec(@twx[14],@twx[15],@tweak[3]);
1344 &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
1345 $code.=<<___;
1346 eor @vtmp[0].16b, @vtmp[0].16b, @vtmpx[0].16b
1347 eor @vtmp[1].16b, @vtmp[1].16b, @vtmpx[1].16b
1348 eor @vtmp[2].16b, @vtmp[2].16b, @vtmpx[2].16b
1349 eor @vtmp[3].16b, @vtmp[3].16b, @vtmpx[3].16b
1350 eor @data[0].16b, @data[0].16b, @tweak[0].16b
1351 eor @data[1].16b, @data[1].16b, @tweak[1].16b
1352 eor @data[2].16b, @data[2].16b, @tweak[2].16b
1353 eor @data[3].16b, @data[3].16b, @tweak[3].16b
1354
1355 // save the last tweak
1356 st1 {@tweak[3].4s},[$ivp]
1357 st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
1358 st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
1359 subs $blocks,$blocks,#8
1360 b.gt .Lxts_8_blocks_process${std}
1361 b 100f
1362 .Lxts_4_blocks_process${std}:
1363 ___
1364 &mov_reg_to_vec(@twx[0],@twx[1],@tweak[0]);
1365 &mov_reg_to_vec(@twx[2],@twx[3],@tweak[1]);
1366 &mov_reg_to_vec(@twx[4],@twx[5],@tweak[2]);
1367 &mov_reg_to_vec(@twx[6],@twx[7],@tweak[3]);
1368 $code.=<<___;
1369 cmp $blocks,#4
1370 b.lt 1f
1371 ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
1372 ___
1373 &rbit(@tweak[0],@tweak[0],$std);
1374 &rbit(@tweak[1],@tweak[1],$std);
1375 &rbit(@tweak[2],@tweak[2],$std);
1376 &rbit(@tweak[3],@tweak[3],$std);
1377 $code.=<<___;
1378 eor @data[0].16b, @data[0].16b, @tweak[0].16b
1379 eor @data[1].16b, @data[1].16b, @tweak[1].16b
1380 eor @data[2].16b, @data[2].16b, @tweak[2].16b
1381 eor @data[3].16b, @data[3].16b, @tweak[3].16b
1382 ___
1383 &rev32(@data[0],@data[0]);
1384 &rev32(@data[1],@data[1]);
1385 &rev32(@data[2],@data[2]);
1386 &rev32(@data[3],@data[3]);
1387 &transpose(@data,@vtmp);
1388 $code.=<<___;
1389 bl _${prefix}_enc_4blks
1390 ___
1391 &transpose(@vtmp,@data);
1392 $code.=<<___;
1393 eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
1394 eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
1395 eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
1396 eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b
1397 st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
1398 sub $blocks,$blocks,#4
1399 ___
1400 &mov_reg_to_vec(@twx[8],@twx[9],@tweak[0]);
1401 &mov_reg_to_vec(@twx[10],@twx[11],@tweak[1]);
1402 &mov_reg_to_vec(@twx[12],@twx[13],@tweak[2]);
1403 $code.=<<___;
1404 // save the last tweak
1405 st1 {@tweak[3].4s},[$ivp]
1406 1:
1407 // process last block
1408 cmp $blocks,#1
1409 b.lt 100f
1410 b.gt 1f
1411 ld1 {@data[0].4s},[$inp],#16
1412 ___
1413 &rbit(@tweak[0],@tweak[0],$std);
1414 $code.=<<___;
1415 eor @data[0].16b, @data[0].16b, @tweak[0].16b
1416 ___
1417 &rev32(@data[0],@data[0]);
1418 &encrypt_1blk(@data[0]);
1419 $code.=<<___;
1420 eor @data[0].16b, @data[0].16b, @tweak[0].16b
1421 st1 {@data[0].4s},[$outp],#16
1422 // save the last tweak
1423 st1 {@tweak[0].4s},[$ivp]
1424 b 100f
1425 1: // process last 2 blocks
1426 cmp $blocks,#2
1427 b.gt 1f
1428 ld1 {@data[0].4s,@data[1].4s},[$inp],#32
1429 ___
1430 &rbit(@tweak[0],@tweak[0],$std);
1431 &rbit(@tweak[1],@tweak[1],$std);
1432 $code.=<<___;
1433 eor @data[0].16b, @data[0].16b, @tweak[0].16b
1434 eor @data[1].16b, @data[1].16b, @tweak[1].16b
1435 ___
1436 &rev32(@data[0],@data[0]);
1437 &rev32(@data[1],@data[1]);
1438 &transpose(@data,@vtmp);
1439 $code.=<<___;
1440 bl _${prefix}_enc_4blks
1441 ___
1442 &transpose(@vtmp,@data);
1443 $code.=<<___;
1444 eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
1445 eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
1446 st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
1447 // save the last tweak
1448 st1 {@tweak[1].4s},[$ivp]
1449 b 100f
1450 1: // process last 3 blocks
1451 ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
1452 ___
1453 &rbit(@tweak[0],@tweak[0],$std);
1454 &rbit(@tweak[1],@tweak[1],$std);
1455 &rbit(@tweak[2],@tweak[2],$std);
1456 $code.=<<___;
1457 eor @data[0].16b, @data[0].16b, @tweak[0].16b
1458 eor @data[1].16b, @data[1].16b, @tweak[1].16b
1459 eor @data[2].16b, @data[2].16b, @tweak[2].16b
1460 ___
1461 &rev32(@data[0],@data[0]);
1462 &rev32(@data[1],@data[1]);
1463 &rev32(@data[2],@data[2]);
1464 &transpose(@data,@vtmp);
1465 $code.=<<___;
1466 bl _${prefix}_enc_4blks
1467 ___
1468 &transpose(@vtmp,@data);
1469 $code.=<<___;
1470 eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
1471 eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
1472 eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
1473 st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
1474 // save the last tweak
1475 st1 {@tweak[2].4s},[$ivp]
1476 100:
1477 cmp $remain,0
1478 b.eq .return${std}
1479
1480 // This branch calculates the last two tweaks,
1481         // when the encryption/decryption length is larger than 32
1482 .last_2blks_tweak${std}:
1483 ld1 {@tweak[0].4s},[$ivp]
1484 ___
1485 &rev32_armeb(@tweak[0],@tweak[0]);
1486 &compute_tweak_vec(@tweak[0],@tweak[1],$std);
1487 &compute_tweak_vec(@tweak[1],@tweak[2],$std);
1488 $code.=<<___;
1489 b .check_dec${std}
1490
1491
1492 // This branch calculates the last two tweaks,
1493         // when the encryption/decryption length is exactly 32, which only needs two tweaks
1494 .only_2blks_tweak${std}:
1495 mov @tweak[1].16b,@tweak[0].16b
1496 ___
1497 &rev32_armeb(@tweak[1],@tweak[1]);
1498 &compute_tweak_vec(@tweak[1],@tweak[2],$std);
1499 $code.=<<___;
1500 b .check_dec${std}
1501
1502
1503 // Determine whether encryption or decryption is required.
1504 // The last two tweaks need to be swapped for decryption.
1505 .check_dec${std}:
1506 // encryption:1 decryption:0
1507 cmp $enc,1
1508 b.eq .process_last_2blks${std}
1509         mov @vtmp[0].16b,@tweak[1].16b
1510         mov @tweak[1].16b,@tweak[2].16b
1511         mov @tweak[2].16b,@vtmp[0].16b
1512
1513 .process_last_2blks${std}:
1514 ___
1515 &rev32_armeb(@tweak[1],@tweak[1]);
1516 &rev32_armeb(@tweak[2],@tweak[2]);
1517 $code.=<<___;
1518 ld1 {@data[0].4s},[$inp],#16
1519 eor @data[0].16b, @data[0].16b, @tweak[1].16b
1520 ___
1521 &rev32(@data[0],@data[0]);
1522 &encrypt_1blk(@data[0]);
1523 $code.=<<___;
1524 eor @data[0].16b, @data[0].16b, @tweak[1].16b
1525 st1 {@data[0].4s},[$outp],#16
1526
1527 sub $lastBlk,$outp,16
1528 .loop${std}:
1529 subs $remain,$remain,1
1530 ldrb $wtmp0,[$lastBlk,$remain]
1531 ldrb $wtmp1,[$inp,$remain]
1532 strb $wtmp1,[$lastBlk,$remain]
1533 strb $wtmp0,[$outp,$remain]
1534 b.gt .loop${std}
1535 ld1 {@data[0].4s}, [$lastBlk]
1536 eor @data[0].16b, @data[0].16b, @tweak[2].16b
1537 ___
1538 &rev32(@data[0],@data[0]);
1539 &encrypt_1blk(@data[0]);
1540 $code.=<<___;
1541 eor @data[0].16b, @data[0].16b, @tweak[2].16b
1542 st1 {@data[0].4s}, [$lastBlk]
1543 .return${std}:
1544 ldp d14, d15, [sp], #0x10
1545 ldp d12, d13, [sp], #0x10
1546 ldp d10, d11, [sp], #0x10
1547 ldp d8, d9, [sp], #0x10
1548 ldp x29, x30, [sp], #0x10
1549 ldp x27, x28, [sp], #0x10
1550 ldp x25, x26, [sp], #0x10
1551 ldp x23, x24, [sp], #0x10
1552 ldp x21, x22, [sp], #0x10
1553 ldp x19, x20, [sp], #0x10
1554 ldp x17, x18, [sp], #0x10
1555 ldp x15, x16, [sp], #0x10
1556 AARCH64_VALIDATE_LINK_REGISTER
1557 ret
1558 .size ${prefix}_xts_encrypt${std},.-${prefix}_xts_encrypt${std}
1559 ___
1560 } # end of gen_xts_cipher
1561 &gen_xts_cipher("_gb");
1562 &gen_xts_cipher("");
1563 }}}
1564 ########################################
1565 open SELF,$0;
1566 while(<SELF>) {
1567 next if (/^#!/);
1568 last if (!s/^#/\/\// and !/^$/);
1569 print;
1570 }
1571 close SELF;
1572
1573 foreach(split("\n",$code)) {
1574 s/\`([^\`]*)\`/eval($1)/ge;
1575 print $_,"\n";
1576 }
1577
1578 close STDOUT or die "error closing STDOUT: $!";