#! /usr/bin/env perl
# Copyright 2020-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# This module implements SM4 with ASIMD on aarch64
#
# Feb 2022
#

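# Overview (descriptive note): the whole 256-byte SM4 S-box is kept resident
# in v16-v31 and byte substitution is done with four 64-byte TBL/TBX lookups;
# the linear transform L is built from USHR/SLI rotate pairs.  The file emits
# ${prefix}_set_encrypt_key/_set_decrypt_key, single-block en/decrypt, and
# ECB, CBC, CTR32 and XTS entry points.
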
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
	or die "can't call $xlate: $!";
*STDOUT=*OUT;

$prefix="vpsm4";
my @vtmp=map("v$_",(0..3));
my @qtmp=map("q$_",(0..3));
my @data=map("v$_",(4..7));
my @datax=map("v$_",(8..11));
my ($rk0,$rk1)=("v12","v13");
my ($rka,$rkb)=("v14","v15");
my @vtmpx=map("v$_",(12..15));
my @sbox=map("v$_",(16..31));
my ($inp,$outp,$blocks,$rks)=("x0","x1","w2","x3");
my ($tmpw,$tmp,$wtmp0,$wtmp1,$wtmp2)=("w6","x6","w7","w8","w9");
my ($xtmp1,$xtmp2)=("x8","x9");
my ($ptr,$counter)=("x10","w11");
my ($word0,$word1,$word2,$word3)=("w12","w13","w14","w15");

sub rev32() {
	my $dst = shift;
	my $src = shift;

	if ($src and ("$src" ne "$dst")) {
$code.=<<___;
#ifndef __AARCH64EB__
	rev32 $dst.16b,$src.16b
#else
	mov $dst.16b,$src.16b
#endif
___
	} else {
$code.=<<___;
#ifndef __AARCH64EB__
	rev32 $dst.16b,$dst.16b
#endif
___
	}
}
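
# rev32 emits a per-32-bit-lane byte swap: SM4 operates on big-endian words,
# so on little-endian targets every block is byte-reversed around the rounds,
# while on big-endian targets the swap degenerates to a plain register move.
# rev32_armeb below is the mirror image (it only swaps on __AARCH64EB__) and
# is used around the XTS tweak arithmetic that is done in scalar registers.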

sub rev32_armeb() {
	my $dst = shift;
	my $src = shift;

	if ($src and ("$src" ne "$dst")) {
$code.=<<___;
#ifdef __AARCH64EB__
	rev32 $dst.16b,$src.16b
#else
	mov $dst.16b,$src.16b
#endif
___
	} else {
$code.=<<___;
#ifdef __AARCH64EB__
	rev32 $dst.16b,$dst.16b
#endif
___
	}
}

sub rbit() {
	my $dst = shift;
	my $src = shift;
	my $std = shift;

	if ($src and ("$src" ne "$dst")) {
		if ($std eq "_gb") {
$code.=<<___;
	rbit $dst.16b,$src.16b
___
		} else {
$code.=<<___;
	mov $dst.16b,$src.16b
___
		}
	} else {
		if ($std eq "_gb") {
$code.=<<___;
	rbit $dst.16b,$dst.16b
___
		}
	}
}

sub transpose() {
	my ($dat0,$dat1,$dat2,$dat3,$vt0,$vt1,$vt2,$vt3) = @_;

$code.=<<___;
	zip1 $vt0.4s,$dat0.4s,$dat1.4s
	zip2 $vt1.4s,$dat0.4s,$dat1.4s
	zip1 $vt2.4s,$dat2.4s,$dat3.4s
	zip2 $vt3.4s,$dat2.4s,$dat3.4s
	zip1 $dat0.2d,$vt0.2d,$vt2.2d
	zip2 $dat1.2d,$vt0.2d,$vt2.2d
	zip1 $dat2.2d,$vt1.2d,$vt3.2d
	zip2 $dat3.2d,$vt1.2d,$vt3.2d
___
}

# sbox operation for 4 lanes of words
sub sbox() {
	my $dat = shift;

$code.=<<___;
	movi @vtmp[0].16b,#64
	movi @vtmp[1].16b,#128
	movi @vtmp[2].16b,#192
	sub @vtmp[0].16b,$dat.16b,@vtmp[0].16b
	sub @vtmp[1].16b,$dat.16b,@vtmp[1].16b
	sub @vtmp[2].16b,$dat.16b,@vtmp[2].16b
	tbl $dat.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$dat.16b
	tbl @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
	tbl @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
	tbl @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
	add @vtmp[0].2d,@vtmp[0].2d,@vtmp[1].2d
	add @vtmp[2].2d,@vtmp[2].2d,$dat.2d
	add $dat.2d,@vtmp[0].2d,@vtmp[2].2d

	ushr @vtmp[0].4s,$dat.4s,32-2
	sli @vtmp[0].4s,$dat.4s,2
	ushr @vtmp[2].4s,$dat.4s,32-10
	eor @vtmp[1].16b,@vtmp[0].16b,$dat.16b
	sli @vtmp[2].4s,$dat.4s,10
	eor @vtmp[1].16b,@vtmp[2].16b,$vtmp[1].16b
	ushr @vtmp[0].4s,$dat.4s,32-18
	sli @vtmp[0].4s,$dat.4s,18
	ushr @vtmp[2].4s,$dat.4s,32-24
	eor @vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b
	sli @vtmp[2].4s,$dat.4s,24
	eor $dat.16b,@vtmp[2].16b,@vtmp[1].16b
___
}
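
# A note on the lookups above: TBL returns 0 for out-of-range indices, so the
# 256-byte S-box split across four 64-byte tables can be searched by offsetting
# the index by 0, 64, 128 and 192 and simply adding the four partial results.
# The USHR/SLI pairs then assemble the SM4 linear transform
#	L(B) = B ^ (B <<< 2) ^ (B <<< 10) ^ (B <<< 18) ^ (B <<< 24)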

# sbox operation for 8 lanes of words
sub sbox_double() {
	my $dat = shift;
	my $datx = shift;

$code.=<<___;
	movi @vtmp[3].16b,#64
	sub @vtmp[0].16b,$dat.16b,@vtmp[3].16b
	sub @vtmp[1].16b,@vtmp[0].16b,@vtmp[3].16b
	sub @vtmp[2].16b,@vtmp[1].16b,@vtmp[3].16b
	tbl $dat.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$dat.16b
	tbl @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
	tbl @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
	tbl @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
	add @vtmp[1].2d,@vtmp[0].2d,@vtmp[1].2d
	add $dat.2d,@vtmp[2].2d,$dat.2d
	add $dat.2d,@vtmp[1].2d,$dat.2d

	sub @vtmp[0].16b,$datx.16b,@vtmp[3].16b
	sub @vtmp[1].16b,@vtmp[0].16b,@vtmp[3].16b
	sub @vtmp[2].16b,@vtmp[1].16b,@vtmp[3].16b
	tbl $datx.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$datx.16b
	tbl @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
	tbl @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
	tbl @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
	add @vtmp[1].2d,@vtmp[0].2d,@vtmp[1].2d
	add $datx.2d,@vtmp[2].2d,$datx.2d
	add $datx.2d,@vtmp[1].2d,$datx.2d

	ushr @vtmp[0].4s,$dat.4s,32-2
	sli @vtmp[0].4s,$dat.4s,2
	ushr @vtmp[2].4s,$datx.4s,32-2
	eor @vtmp[1].16b,@vtmp[0].16b,$dat.16b
	sli @vtmp[2].4s,$datx.4s,2

	ushr @vtmp[0].4s,$dat.4s,32-10
	eor @vtmp[3].16b,@vtmp[2].16b,$datx.16b
	sli @vtmp[0].4s,$dat.4s,10
	ushr @vtmp[2].4s,$datx.4s,32-10
	eor @vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b
	sli @vtmp[2].4s,$datx.4s,10

	ushr @vtmp[0].4s,$dat.4s,32-18
	eor @vtmp[3].16b,@vtmp[2].16b,$vtmp[3].16b
	sli @vtmp[0].4s,$dat.4s,18
	ushr @vtmp[2].4s,$datx.4s,32-18
	eor @vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b
	sli @vtmp[2].4s,$datx.4s,18

	ushr @vtmp[0].4s,$dat.4s,32-24
	eor @vtmp[3].16b,@vtmp[2].16b,$vtmp[3].16b
	sli @vtmp[0].4s,$dat.4s,24
	ushr @vtmp[2].4s,$datx.4s,32-24
	eor $dat.16b,@vtmp[0].16b,@vtmp[1].16b
	sli @vtmp[2].4s,$datx.4s,24
	eor $datx.16b,@vtmp[2].16b,@vtmp[3].16b
___
}

# sbox operation for one single word
sub sbox_1word () {
	my $word = shift;

$code.=<<___;
	movi @vtmp[1].16b,#64
	movi @vtmp[2].16b,#128
	movi @vtmp[3].16b,#192
	mov @vtmp[0].s[0],$word

	sub @vtmp[1].16b,@vtmp[0].16b,@vtmp[1].16b
	sub @vtmp[2].16b,@vtmp[0].16b,@vtmp[2].16b
	sub @vtmp[3].16b,@vtmp[0].16b,@vtmp[3].16b

	tbl @vtmp[0].16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},@vtmp[0].16b
	tbl @vtmp[1].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[1].16b
	tbl @vtmp[2].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[2].16b
	tbl @vtmp[3].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[3].16b

	mov $word,@vtmp[0].s[0]
	mov $wtmp0,@vtmp[1].s[0]
	mov $wtmp2,@vtmp[2].s[0]
	add $wtmp0,$word,$wtmp0
	mov $word,@vtmp[3].s[0]
	add $wtmp0,$wtmp0,$wtmp2
	add $wtmp0,$wtmp0,$word

	eor $word,$wtmp0,$wtmp0,ror #32-2
	eor $word,$word,$wtmp0,ror #32-10
	eor $word,$word,$wtmp0,ror #32-18
	eor $word,$word,$wtmp0,ror #32-24
___
}
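
# The scalar flavour expresses the same rotations with ROR: rotating a 32-bit
# word left by n is the same as rotating it right by 32-n, hence "ror #32-2",
# "ror #32-10" and so on above.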

# sm4 for one block of data, in scalar registers word0/word1/word2/word3
sub sm4_1blk () {
	my $kptr = shift;

$code.=<<___;
	ldp $wtmp0,$wtmp1,[$kptr],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor $tmpw,$word2,$word3
	eor $wtmp2,$wtmp0,$word1
	eor $tmpw,$tmpw,$wtmp2
___
	&sbox_1word($tmpw);
$code.=<<___;
	eor $word0,$word0,$tmpw
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor $tmpw,$word2,$word3
	eor $wtmp2,$word0,$wtmp1
	eor $tmpw,$tmpw,$wtmp2
___
	&sbox_1word($tmpw);
$code.=<<___;
	ldp $wtmp0,$wtmp1,[$kptr],8
	eor $word1,$word1,$tmpw
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor $tmpw,$word0,$word1
	eor $wtmp2,$wtmp0,$word3
	eor $tmpw,$tmpw,$wtmp2
___
	&sbox_1word($tmpw);
$code.=<<___;
	eor $word2,$word2,$tmpw
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor $tmpw,$word0,$word1
	eor $wtmp2,$word2,$wtmp1
	eor $tmpw,$tmpw,$wtmp2
___
	&sbox_1word($tmpw);
$code.=<<___;
	eor $word3,$word3,$tmpw
___
}
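
# Each call of sm4_1blk (and of the vector variants sm4_4blks/sm4_8blks below)
# performs four SM4 rounds, one per round key fetched with ldp, so the
# encrypt loops further down run it eight times for the full 32 rounds.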

# sm4 for 4 lanes of data, in neon registers data0/data1/data2/data3
sub sm4_4blks () {
	my $kptr = shift;

$code.=<<___;
	ldp $wtmp0,$wtmp1,[$kptr],8
	dup $rk0.4s,$wtmp0
	dup $rk1.4s,$wtmp1

	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor $rka.16b,@data[2].16b,@data[3].16b
	eor $rk0.16b,@data[1].16b,$rk0.16b
	eor $rk0.16b,$rka.16b,$rk0.16b
___
	&sbox($rk0);
$code.=<<___;
	eor @data[0].16b,@data[0].16b,$rk0.16b

	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor $rka.16b,$rka.16b,@data[0].16b
	eor $rk1.16b,$rka.16b,$rk1.16b
___
	&sbox($rk1);
$code.=<<___;
	ldp $wtmp0,$wtmp1,[$kptr],8
	eor @data[1].16b,@data[1].16b,$rk1.16b

	dup $rk0.4s,$wtmp0
	dup $rk1.4s,$wtmp1

	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor $rka.16b,@data[0].16b,@data[1].16b
	eor $rk0.16b,@data[3].16b,$rk0.16b
	eor $rk0.16b,$rka.16b,$rk0.16b
___
	&sbox($rk0);
$code.=<<___;
	eor @data[2].16b,@data[2].16b,$rk0.16b

	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor $rka.16b,$rka.16b,@data[2].16b
	eor $rk1.16b,$rka.16b,$rk1.16b
___
	&sbox($rk1);
$code.=<<___;
	eor @data[3].16b,@data[3].16b,$rk1.16b
___
}

# sm4 for 8 lanes of data, in neon registers
# data0/data1/data2/data3 datax0/datax1/datax2/datax3
sub sm4_8blks () {
	my $kptr = shift;

$code.=<<___;
	ldp $wtmp0,$wtmp1,[$kptr],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	dup $rk0.4s,$wtmp0
	eor $rka.16b,@data[2].16b,@data[3].16b
	eor $rkb.16b,@datax[2].16b,@datax[3].16b
	eor @vtmp[0].16b,@data[1].16b,$rk0.16b
	eor @vtmp[1].16b,@datax[1].16b,$rk0.16b
	eor $rk0.16b,$rka.16b,@vtmp[0].16b
	eor $rk1.16b,$rkb.16b,@vtmp[1].16b
___
	&sbox_double($rk0,$rk1);
$code.=<<___;
	eor @data[0].16b,@data[0].16b,$rk0.16b
	eor @datax[0].16b,@datax[0].16b,$rk1.16b

	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	dup $rk1.4s,$wtmp1
	eor $rka.16b,$rka.16b,@data[0].16b
	eor $rkb.16b,$rkb.16b,@datax[0].16b
	eor $rk0.16b,$rka.16b,$rk1.16b
	eor $rk1.16b,$rkb.16b,$rk1.16b
___
	&sbox_double($rk0,$rk1);
$code.=<<___;
	ldp $wtmp0,$wtmp1,[$kptr],8
	eor @data[1].16b,@data[1].16b,$rk0.16b
	eor @datax[1].16b,@datax[1].16b,$rk1.16b

	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	dup $rk0.4s,$wtmp0
	eor $rka.16b,@data[0].16b,@data[1].16b
	eor $rkb.16b,@datax[0].16b,@datax[1].16b
	eor @vtmp[0].16b,@data[3].16b,$rk0.16b
	eor @vtmp[1].16b,@datax[3].16b,$rk0.16b
	eor $rk0.16b,$rka.16b,@vtmp[0].16b
	eor $rk1.16b,$rkb.16b,@vtmp[1].16b
___
	&sbox_double($rk0,$rk1);
$code.=<<___;
	eor @data[2].16b,@data[2].16b,$rk0.16b
	eor @datax[2].16b,@datax[2].16b,$rk1.16b

	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	dup $rk1.4s,$wtmp1
	eor $rka.16b,$rka.16b,@data[2].16b
	eor $rkb.16b,$rkb.16b,@datax[2].16b
	eor $rk0.16b,$rka.16b,$rk1.16b
	eor $rk1.16b,$rkb.16b,$rk1.16b
___
	&sbox_double($rk0,$rk1);
$code.=<<___;
	eor @data[3].16b,@data[3].16b,$rk0.16b
	eor @datax[3].16b,@datax[3].16b,$rk1.16b
___
}

sub encrypt_1blk_norev() {
	my $dat = shift;

$code.=<<___;
	mov $ptr,$rks
	mov $counter,#8
	mov $word0,$dat.s[0]
	mov $word1,$dat.s[1]
	mov $word2,$dat.s[2]
	mov $word3,$dat.s[3]
10:
___
	&sm4_1blk($ptr);
$code.=<<___;
	subs $counter,$counter,#1
	b.ne 10b
	mov $dat.s[0],$word3
	mov $dat.s[1],$word2
	mov $dat.s[2],$word1
	mov $dat.s[3],$word0
___
}
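
# The four words are written back in reverse order (B3,B2,B1,B0): this is the
# final reverse transform R of SM4.  encrypt_1blk below additionally applies
# rev32 so the result ends up in memory byte order.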

sub encrypt_1blk() {
	my $dat = shift;

	&encrypt_1blk_norev($dat);
	&rev32($dat,$dat);
}

sub encrypt_4blks() {
$code.=<<___;
	mov $ptr,$rks
	mov $counter,#8
10:
___
	&sm4_4blks($ptr);
$code.=<<___;
	subs $counter,$counter,#1
	b.ne 10b
___
	&rev32(@vtmp[3],@data[0]);
	&rev32(@vtmp[2],@data[1]);
	&rev32(@vtmp[1],@data[2]);
	&rev32(@vtmp[0],@data[3]);
}

sub encrypt_8blks() {
$code.=<<___;
	mov $ptr,$rks
	mov $counter,#8
10:
___
	&sm4_8blks($ptr);
$code.=<<___;
	subs $counter,$counter,#1
	b.ne 10b
___
	&rev32(@vtmp[3],@data[0]);
	&rev32(@vtmp[2],@data[1]);
	&rev32(@vtmp[1],@data[2]);
	&rev32(@vtmp[0],@data[3]);
	&rev32(@data[3],@datax[0]);
	&rev32(@data[2],@datax[1]);
	&rev32(@data[1],@datax[2]);
	&rev32(@data[0],@datax[3]);
}

sub load_sbox () {
	my $data = shift;

$code.=<<___;
	adr $ptr,.Lsbox
	ld1 {@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},[$ptr],#64
	ld1 {@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},[$ptr],#64
	ld1 {@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},[$ptr],#64
	ld1 {@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},[$ptr]
___
}


sub mov_reg_to_vec() {
	my $src0 = shift;
	my $src1 = shift;
	my $desv = shift;
$code.=<<___;
	mov $desv.d[0],$src0
	mov $desv.d[1],$src1
___
	&rev32_armeb($desv,$desv);
}

sub mov_vec_to_reg() {
	my $srcv = shift;
	my $des0 = shift;
	my $des1 = shift;
$code.=<<___;
	mov $des0,$srcv.d[0]
	mov $des1,$srcv.d[1]
___
}

sub compute_tweak() {
	my $src0 = shift;
	my $src1 = shift;
	my $des0 = shift;
	my $des1 = shift;
$code.=<<___;
	mov $wtmp0,0x87
	extr $xtmp2,$src1,$src1,#32
	extr $des1,$src1,$src0,#63
	and $wtmp1,$wtmp0,$wtmp2,asr#31
	eor $des0,$xtmp1,$src0,lsl#1
___
}
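
# compute_tweak doubles a 128-bit XTS tweak held in two scalar registers,
# i.e. multiplies it by x in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1:
# (src1:src0) is shifted left by one bit and, if the shifted-out top bit was
# set, 0x87 is folded into the low byte ("asr #31" turns that bit into a
# 0x87/0x00 mask).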

sub compute_tweak_vec() {
	my $src = shift;
	my $des = shift;
	my $std = shift;
	&rbit(@vtmp[2],$src,$std);
$code.=<<___;
	ldr @qtmp[0], .Lxts_magic
	shl $des.16b, @vtmp[2].16b, #1
	ext @vtmp[1].16b, @vtmp[2].16b, @vtmp[2].16b,#15
	ushr @vtmp[1].16b, @vtmp[1].16b, #7
	mul @vtmp[1].16b, @vtmp[1].16b, @vtmp[0].16b
	eor $des.16b, $des.16b, @vtmp[1].16b
___
	&rbit($des,$des,$std);
}
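
# compute_tweak_vec is the NEON version of the same doubling: .Lxts_magic
# provides the 0x87 reduction constant, and for the "_gb" flavour the bytes
# are bit-reversed with rbit before and after the multiplication to match
# the bit ordering that variant of XTS expects.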

$code=<<___;
#include "arm_arch.h"
.arch armv8-a
.text

.type _vpsm4_consts,%object
.align 7
_vpsm4_consts:
.Lsbox:
	.byte 0xD6,0x90,0xE9,0xFE,0xCC,0xE1,0x3D,0xB7,0x16,0xB6,0x14,0xC2,0x28,0xFB,0x2C,0x05
	.byte 0x2B,0x67,0x9A,0x76,0x2A,0xBE,0x04,0xC3,0xAA,0x44,0x13,0x26,0x49,0x86,0x06,0x99
	.byte 0x9C,0x42,0x50,0xF4,0x91,0xEF,0x98,0x7A,0x33,0x54,0x0B,0x43,0xED,0xCF,0xAC,0x62
	.byte 0xE4,0xB3,0x1C,0xA9,0xC9,0x08,0xE8,0x95,0x80,0xDF,0x94,0xFA,0x75,0x8F,0x3F,0xA6
	.byte 0x47,0x07,0xA7,0xFC,0xF3,0x73,0x17,0xBA,0x83,0x59,0x3C,0x19,0xE6,0x85,0x4F,0xA8
	.byte 0x68,0x6B,0x81,0xB2,0x71,0x64,0xDA,0x8B,0xF8,0xEB,0x0F,0x4B,0x70,0x56,0x9D,0x35
	.byte 0x1E,0x24,0x0E,0x5E,0x63,0x58,0xD1,0xA2,0x25,0x22,0x7C,0x3B,0x01,0x21,0x78,0x87
	.byte 0xD4,0x00,0x46,0x57,0x9F,0xD3,0x27,0x52,0x4C,0x36,0x02,0xE7,0xA0,0xC4,0xC8,0x9E
	.byte 0xEA,0xBF,0x8A,0xD2,0x40,0xC7,0x38,0xB5,0xA3,0xF7,0xF2,0xCE,0xF9,0x61,0x15,0xA1
	.byte 0xE0,0xAE,0x5D,0xA4,0x9B,0x34,0x1A,0x55,0xAD,0x93,0x32,0x30,0xF5,0x8C,0xB1,0xE3
	.byte 0x1D,0xF6,0xE2,0x2E,0x82,0x66,0xCA,0x60,0xC0,0x29,0x23,0xAB,0x0D,0x53,0x4E,0x6F
	.byte 0xD5,0xDB,0x37,0x45,0xDE,0xFD,0x8E,0x2F,0x03,0xFF,0x6A,0x72,0x6D,0x6C,0x5B,0x51
	.byte 0x8D,0x1B,0xAF,0x92,0xBB,0xDD,0xBC,0x7F,0x11,0xD9,0x5C,0x41,0x1F,0x10,0x5A,0xD8
	.byte 0x0A,0xC1,0x31,0x88,0xA5,0xCD,0x7B,0xBD,0x2D,0x74,0xD0,0x12,0xB8,0xE5,0xB4,0xB0
	.byte 0x89,0x69,0x97,0x4A,0x0C,0x96,0x77,0x7E,0x65,0xB9,0xF1,0x09,0xC5,0x6E,0xC6,0x84
	.byte 0x18,0xF0,0x7D,0xEC,0x3A,0xDC,0x4D,0x20,0x79,0xEE,0x5F,0x3E,0xD7,0xCB,0x39,0x48
.Lck:
	.long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
	.long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
	.long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
	.long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
	.long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
	.long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
	.long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
	.long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
.Lfk:
	.quad 0x56aa3350a3b1bac6,0xb27022dc677d9197
.Lshuffles:
	.quad 0x0B0A090807060504,0x030201000F0E0D0C
.Lxts_magic:
	.quad 0x0101010101010187,0x0101010101010101

.size _vpsm4_consts,.-_vpsm4_consts
___
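
# Constant layout (descriptive note): .Lsbox is the SM4 S-box, .Lck holds the
# key-schedule constants CK (ck_{i,j} = (4*i + j) * 7 mod 256, packed four per
# word), .Lfk holds FK0..FK3, .Lshuffles is the TBL index pattern used to
# rotate the key register between key-schedule steps, and .Lxts_magic is the
# GF(2^128) reduction constant used by compute_tweak_vec.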

{{{
my ($key,$keys,$enc)=("x0","x1","w2");
my ($pointer,$schedules,$wtmp,$roundkey)=("x5","x6","w7","w8");
my ($vkey,$vfk,$vmap)=("v5","v6","v7");
$code.=<<___;
.type _vpsm4_set_key,%function
.align 4
_vpsm4_set_key:
	AARCH64_VALID_CALL_TARGET
	ld1 {$vkey.4s},[$key]
___
	&load_sbox();
	&rev32($vkey,$vkey);
$code.=<<___;
	adr $pointer,.Lshuffles
	ld1 {$vmap.2d},[$pointer]
	adr $pointer,.Lfk
	ld1 {$vfk.2d},[$pointer]
	eor $vkey.16b,$vkey.16b,$vfk.16b
	mov $schedules,#32
	adr $pointer,.Lck
	movi @vtmp[0].16b,#64
	cbnz $enc,1f
	add $keys,$keys,124
1:
	mov $wtmp,$vkey.s[1]
	ldr $roundkey,[$pointer],#4
	eor $roundkey,$roundkey,$wtmp
	mov $wtmp,$vkey.s[2]
	eor $roundkey,$roundkey,$wtmp
	mov $wtmp,$vkey.s[3]
	eor $roundkey,$roundkey,$wtmp
	// sbox lookup
	mov @data[0].s[0],$roundkey
	tbl @vtmp[1].16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},@data[0].16b
	sub @data[0].16b,@data[0].16b,@vtmp[0].16b
	tbx @vtmp[1].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@data[0].16b
	sub @data[0].16b,@data[0].16b,@vtmp[0].16b
	tbx @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@data[0].16b
	sub @data[0].16b,@data[0].16b,@vtmp[0].16b
	tbx @vtmp[1].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@data[0].16b
	mov $wtmp,@vtmp[1].s[0]
	eor $roundkey,$wtmp,$wtmp,ror #19
	eor $roundkey,$roundkey,$wtmp,ror #9
	mov $wtmp,$vkey.s[0]
	eor $roundkey,$roundkey,$wtmp
	mov $vkey.s[0],$roundkey
	cbz $enc,2f
	str $roundkey,[$keys],#4
	b 3f
2:
	str $roundkey,[$keys],#-4
3:
	tbl $vkey.16b,{$vkey.16b},$vmap.16b
	subs $schedules,$schedules,#1
	b.ne 1b
	ret
.size _vpsm4_set_key,.-_vpsm4_set_key
___
}}}
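
# _vpsm4_set_key expands the 128-bit user key into 32 round keys: the key is
# XORed with FK, each step mixes in the next CK constant, runs the result
# through the S-box (TBL/TBX above) and applies the key-schedule transform
# L'(B) = B ^ (B <<< 13) ^ (B <<< 23), written as "ror #19"/"ror #9".  For a
# decryption schedule the same keys are simply stored in reverse order.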


{{{
$code.=<<___;
.type _vpsm4_enc_4blks,%function
.align 4
_vpsm4_enc_4blks:
	AARCH64_VALID_CALL_TARGET
___
	&encrypt_4blks();
$code.=<<___;
	ret
.size _vpsm4_enc_4blks,.-_vpsm4_enc_4blks
___
}}}

{{{
$code.=<<___;
.type _vpsm4_enc_8blks,%function
.align 4
_vpsm4_enc_8blks:
	AARCH64_VALID_CALL_TARGET
___
	&encrypt_8blks();
$code.=<<___;
	ret
.size _vpsm4_enc_8blks,.-_vpsm4_enc_8blks
___
}}}


{{{
my ($key,$keys)=("x0","x1");
$code.=<<___;
.globl ${prefix}_set_encrypt_key
.type ${prefix}_set_encrypt_key,%function
.align 5
${prefix}_set_encrypt_key:
	AARCH64_SIGN_LINK_REGISTER
	stp x29,x30,[sp,#-16]!
	mov w2,1
	bl _vpsm4_set_key
	ldp x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
___
}}}

{{{
my ($key,$keys)=("x0","x1");
$code.=<<___;
.globl ${prefix}_set_decrypt_key
.type ${prefix}_set_decrypt_key,%function
.align 5
${prefix}_set_decrypt_key:
	AARCH64_SIGN_LINK_REGISTER
	stp x29,x30,[sp,#-16]!
	mov w2,0
	bl _vpsm4_set_key
	ldp x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}

{{{
sub gen_block () {
	my $dir = shift;
	my ($inp,$outp,$rk)=map("x$_",(0..2));

$code.=<<___;
.globl ${prefix}_${dir}crypt
.type ${prefix}_${dir}crypt,%function
.align 5
${prefix}_${dir}crypt:
	AARCH64_VALID_CALL_TARGET
	ld1 {@data[0].4s},[$inp]
___
	&load_sbox();
	&rev32(@data[0],@data[0]);
$code.=<<___;
	mov $rks,x2
___
	&encrypt_1blk(@data[0]);
$code.=<<___;
	st1 {@data[0].4s},[$outp]
	ret
.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}

{{{
my ($enc) = ("w4");
my @dat=map("v$_",(16..23));

$code.=<<___;
.globl ${prefix}_ecb_encrypt
.type ${prefix}_ecb_encrypt,%function
.align 5
${prefix}_ecb_encrypt:
	AARCH64_SIGN_LINK_REGISTER
	// convert length into blocks
	lsr x2,x2,4
	stp d8,d9,[sp,#-80]!
	stp d10,d11,[sp,#16]
	stp d12,d13,[sp,#32]
	stp d14,d15,[sp,#48]
	stp x29,x30,[sp,#64]
___
	&load_sbox();
$code.=<<___;
.Lecb_8_blocks_process:
	cmp $blocks,#8
	b.lt .Lecb_4_blocks_process
	ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
	ld4 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],@data[3]);
	&rev32(@datax[0],@datax[0]);
	&rev32(@datax[1],@datax[1]);
	&rev32(@datax[2],@datax[2]);
	&rev32(@datax[3],@datax[3]);
$code.=<<___;
	bl _vpsm4_enc_8blks
	st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
	st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
	subs $blocks,$blocks,#8
	b.gt .Lecb_8_blocks_process
	b 100f
.Lecb_4_blocks_process:
	cmp $blocks,#4
	b.lt 1f
	ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],@data[3]);
$code.=<<___;
	bl _vpsm4_enc_4blks
	st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
	sub $blocks,$blocks,#4
1:
	// process last block
	cmp $blocks,#1
	b.lt 100f
	b.gt 1f
	ld1 {@data[0].4s},[$inp]
___
	&rev32(@data[0],@data[0]);
	&encrypt_1blk(@data[0]);
$code.=<<___;
	st1 {@data[0].4s},[$outp]
	b 100f
1:	// process last 2 blocks
	ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp],#16
	ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$inp],#16
	cmp $blocks,#2
	b.gt 1f
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],@data[3]);
$code.=<<___;
	bl _vpsm4_enc_4blks
	st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
	st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp]
	b 100f
1:	// process last 3 blocks
	ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$inp],#16
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],@data[3]);
$code.=<<___;
	bl _vpsm4_enc_4blks
	st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
	st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp],#16
	st4 {@vtmp[0].s-@vtmp[3].s}[2],[$outp]
100:
	ldp d10,d11,[sp,#16]
	ldp d12,d13,[sp,#32]
	ldp d14,d15,[sp,#48]
	ldp x29,x30,[sp,#64]
	ldp d8,d9,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
___
}}}

{{{
my ($len,$ivp,$enc)=("x2","x4","w5");
my $ivec0=("v3");
my $ivec1=("v15");

$code.=<<___;
.globl ${prefix}_cbc_encrypt
.type ${prefix}_cbc_encrypt,%function
.align 5
${prefix}_cbc_encrypt:
	AARCH64_VALID_CALL_TARGET
	lsr $len,$len,4
___
	&load_sbox();
$code.=<<___;
	cbz $enc,.Ldec
	ld1 {$ivec0.4s},[$ivp]
.Lcbc_4_blocks_enc:
	cmp $blocks,#4
	b.lt 1f
	ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
	eor @data[0].16b,@data[0].16b,$ivec0.16b
___
	&rev32(@data[1],@data[1]);
	&rev32(@data[0],@data[0]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],@data[3]);
	&encrypt_1blk_norev(@data[0]);
$code.=<<___;
	eor @data[1].16b,@data[1].16b,@data[0].16b
___
	&encrypt_1blk_norev(@data[1]);
	&rev32(@data[0],@data[0]);

$code.=<<___;
	eor @data[2].16b,@data[2].16b,@data[1].16b
___
	&encrypt_1blk_norev(@data[2]);
	&rev32(@data[1],@data[1]);
$code.=<<___;
	eor @data[3].16b,@data[3].16b,@data[2].16b
___
	&encrypt_1blk_norev(@data[3]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],@data[3]);
$code.=<<___;
	orr $ivec0.16b,@data[3].16b,@data[3].16b
	st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
	subs $blocks,$blocks,#4
	b.ne .Lcbc_4_blocks_enc
	b 2f
1:
	subs $blocks,$blocks,#1
	b.lt 2f
	ld1 {@data[0].4s},[$inp],#16
	eor $ivec0.16b,$ivec0.16b,@data[0].16b
___
	&rev32($ivec0,$ivec0);
	&encrypt_1blk($ivec0);
$code.=<<___;
	st1 {$ivec0.4s},[$outp],#16
	b 1b
2:
	// save back IV
	st1 {$ivec0.4s},[$ivp]
	ret

.Ldec:
	// decryption mode starts
	AARCH64_SIGN_LINK_REGISTER
	stp d8,d9,[sp,#-80]!
	stp d10,d11,[sp,#16]
	stp d12,d13,[sp,#32]
	stp d14,d15,[sp,#48]
	stp x29,x30,[sp,#64]
.Lcbc_8_blocks_dec:
	cmp $blocks,#8
	b.lt 1f
	ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
	add $ptr,$inp,#64
	ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$ptr]
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],$data[3]);
	&rev32(@datax[0],@datax[0]);
	&rev32(@datax[1],@datax[1]);
	&rev32(@datax[2],@datax[2]);
	&rev32(@datax[3],$datax[3]);
$code.=<<___;
	bl _vpsm4_enc_8blks
___
	&transpose(@vtmp,@datax);
	&transpose(@data,@datax);
$code.=<<___;
	ld1 {$ivec1.4s},[$ivp]
	ld1 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
	// note ivec1 and vtmpx[3] are reusing the same register
	// care needs to be taken to avoid conflict
	eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
	ld1 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
	eor @vtmp[1].16b,@vtmp[1].16b,@datax[0].16b
	eor @vtmp[2].16b,@vtmp[2].16b,@datax[1].16b
	eor @vtmp[3].16b,$vtmp[3].16b,@datax[2].16b
	// save back IV
	st1 {$vtmpx[3].4s}, [$ivp]
	eor @data[0].16b,@data[0].16b,$datax[3].16b
	eor @data[1].16b,@data[1].16b,@vtmpx[0].16b
	eor @data[2].16b,@data[2].16b,@vtmpx[1].16b
	eor @data[3].16b,$data[3].16b,@vtmpx[2].16b
	st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
	st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
	subs $blocks,$blocks,#8
	b.gt .Lcbc_8_blocks_dec
	b.eq 100f
1:
	ld1 {$ivec1.4s},[$ivp]
.Lcbc_4_blocks_dec:
	cmp $blocks,#4
	b.lt 1f
	ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],$data[3]);
$code.=<<___;
	bl _vpsm4_enc_4blks
	ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
___
	&transpose(@vtmp,@datax);
$code.=<<___;
	eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
	eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
	orr $ivec1.16b,@data[3].16b,@data[3].16b
	eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
	eor @vtmp[3].16b,$vtmp[3].16b,@data[2].16b
	st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
	subs $blocks,$blocks,#4
	b.gt .Lcbc_4_blocks_dec
	// save back IV
	st1 {@data[3].4s}, [$ivp]
	b 100f
1:	// last block
	subs $blocks,$blocks,#1
	b.lt 100f
	b.gt 1f
	ld1 {@data[0].4s},[$inp],#16
	// save back IV
	st1 {$data[0].4s}, [$ivp]
___
	&rev32(@datax[0],@data[0]);
	&encrypt_1blk(@datax[0]);
$code.=<<___;
	eor @datax[0].16b,@datax[0].16b,$ivec1.16b
	st1 {@datax[0].4s},[$outp],#16
	b 100f
1:	// last two blocks
	ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp]
	add $ptr,$inp,#16
	ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$ptr],#16
	subs $blocks,$blocks,1
	b.gt 1f
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],@data[3]);
$code.=<<___;
	bl _vpsm4_enc_4blks
	ld1 {@data[0].4s,@data[1].4s},[$inp],#32
___
	&transpose(@vtmp,@datax);
$code.=<<___;
	eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
	eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
	st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
	// save back IV
	st1 {@data[1].4s}, [$ivp]
	b 100f
1:	// last 3 blocks
	ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$ptr]
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],@data[3]);
$code.=<<___;
	bl _vpsm4_enc_4blks
	ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
___
	&transpose(@vtmp,@datax);
$code.=<<___;
	eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
	eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
	eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
	st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
	// save back IV
	st1 {@data[2].4s}, [$ivp]
100:
	ldp d10,d11,[sp,#16]
	ldp d12,d13,[sp,#32]
	ldp d14,d15,[sp,#48]
	ldp x29,x30,[sp,#64]
	ldp d8,d9,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}

{{{
my ($ivp)=("x4");
my ($ctr)=("w5");
my $ivec=("v3");

$code.=<<___;
.globl ${prefix}_ctr32_encrypt_blocks
.type ${prefix}_ctr32_encrypt_blocks,%function
.align 5
${prefix}_ctr32_encrypt_blocks:
	AARCH64_VALID_CALL_TARGET
	ld1 {$ivec.4s},[$ivp]
___
	&rev32($ivec,$ivec);
	&load_sbox();
$code.=<<___;
	cmp $blocks,#1
	b.ne 1f
	// fast processing for one single block without
	// context saving overhead
___
	&encrypt_1blk($ivec);
$code.=<<___;
	ld1 {@data[0].4s},[$inp]
	eor @data[0].16b,@data[0].16b,$ivec.16b
	st1 {@data[0].4s},[$outp]
	ret
1:
	AARCH64_SIGN_LINK_REGISTER
	stp d8,d9,[sp,#-80]!
	stp d10,d11,[sp,#16]
	stp d12,d13,[sp,#32]
	stp d14,d15,[sp,#48]
	stp x29,x30,[sp,#64]
	mov $word0,$ivec.s[0]
	mov $word1,$ivec.s[1]
	mov $word2,$ivec.s[2]
	mov $ctr,$ivec.s[3]
.Lctr32_4_blocks_process:
	cmp $blocks,#4
	b.lt 1f
	dup @data[0].4s,$word0
	dup @data[1].4s,$word1
	dup @data[2].4s,$word2
	mov @data[3].s[0],$ctr
	add $ctr,$ctr,#1
	mov $data[3].s[1],$ctr
	add $ctr,$ctr,#1
	mov @data[3].s[2],$ctr
	add $ctr,$ctr,#1
	mov @data[3].s[3],$ctr
	add $ctr,$ctr,#1
	cmp $blocks,#8
	b.ge .Lctr32_8_blocks_process
	bl _vpsm4_enc_4blks
	ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
	eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
	eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
	eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
	eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
	st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
	subs $blocks,$blocks,#4
	b.ne .Lctr32_4_blocks_process
	b 100f
.Lctr32_8_blocks_process:
	dup @datax[0].4s,$word0
	dup @datax[1].4s,$word1
	dup @datax[2].4s,$word2
	mov @datax[3].s[0],$ctr
	add $ctr,$ctr,#1
	mov $datax[3].s[1],$ctr
	add $ctr,$ctr,#1
	mov @datax[3].s[2],$ctr
	add $ctr,$ctr,#1
	mov @datax[3].s[3],$ctr
	add $ctr,$ctr,#1
	bl _vpsm4_enc_8blks
	ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
	ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
	eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
	eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
	eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
	eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
	eor @data[0].16b,@data[0].16b,@datax[0].16b
	eor @data[1].16b,@data[1].16b,@datax[1].16b
	eor @data[2].16b,@data[2].16b,@datax[2].16b
	eor @data[3].16b,@data[3].16b,@datax[3].16b
	st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
	st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
	subs $blocks,$blocks,#8
	b.ne .Lctr32_4_blocks_process
	b 100f
1:	// last block processing
	subs $blocks,$blocks,#1
	b.lt 100f
	b.gt 1f
	mov $ivec.s[0],$word0
	mov $ivec.s[1],$word1
	mov $ivec.s[2],$word2
	mov $ivec.s[3],$ctr
___
	&encrypt_1blk($ivec);
$code.=<<___;
	ld1 {@data[0].4s},[$inp]
	eor @data[0].16b,@data[0].16b,$ivec.16b
	st1 {@data[0].4s},[$outp]
	b 100f
1:	// last 2 blocks processing
	dup @data[0].4s,$word0
	dup @data[1].4s,$word1
	dup @data[2].4s,$word2
	mov @data[3].s[0],$ctr
	add $ctr,$ctr,#1
	mov @data[3].s[1],$ctr
	subs $blocks,$blocks,#1
	b.ne 1f
	bl _vpsm4_enc_4blks
	ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
	ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
	eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
	eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
	eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
	eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
	st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
	st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
	b 100f
1:	// last 3 blocks processing
	add $ctr,$ctr,#1
	mov @data[3].s[2],$ctr
	bl _vpsm4_enc_4blks
	ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
	ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
	ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[2],[$inp],#16
	eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
	eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
	eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
	eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
	st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
	st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
	st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[2],[$outp],#16
100:
	ldp d10,d11,[sp,#16]
	ldp d12,d13,[sp,#32]
	ldp d14,d15,[sp,#48]
	ldp x29,x30,[sp,#64]
	ldp d8,d9,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
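
# Note on the routine above: as the _ctr32 name suggests, only the low 32-bit
# counter word is incremented between blocks; handling of any carry into the
# upper words of the IV is left to the caller, as with the other OpenSSL
# *_ctr32 assembly routines.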

{{{
my ($blocks,$len)=("x2","x2");
my $ivp=("x5");
my @twx=map("x$_",(12..27));
my ($rks1,$rks2)=("x26","x27");
my $lastBlk=("x26");
my $enc=("w28");
my $remain=("x29");

my @tweak=@datax;

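# gen_xts_cipher emits the XTS entry points (the "_gb" suffix selects the
# bit-reversed tweak variant).  A partial final block is handled with
# ciphertext stealing: the last full block is encrypted first, the tail bytes
# are swapped with the head of that ciphertext block in .loop${std}, and the
# rebuilt block is encrypted again with the final tweak; for decryption the
# last two tweaks are swapped in .check_dec${std}.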
sub gen_xts_cipher() {
	my $std = shift;
$code.=<<___;
.globl ${prefix}_xts_encrypt${std}
.type ${prefix}_xts_encrypt${std},%function
.align 5
${prefix}_xts_encrypt${std}:
	AARCH64_SIGN_LINK_REGISTER
	stp x15, x16, [sp, #-0x10]!
	stp x17, x18, [sp, #-0x10]!
	stp x19, x20, [sp, #-0x10]!
	stp x21, x22, [sp, #-0x10]!
	stp x23, x24, [sp, #-0x10]!
	stp x25, x26, [sp, #-0x10]!
	stp x27, x28, [sp, #-0x10]!
	stp x29, x30, [sp, #-0x10]!
	stp d8, d9, [sp, #-0x10]!
	stp d10, d11, [sp, #-0x10]!
	stp d12, d13, [sp, #-0x10]!
	stp d14, d15, [sp, #-0x10]!
	mov $rks1,x3
	mov $rks2,x4
	mov $enc,w6
	ld1 {@tweak[0].4s}, [$ivp]
	mov $rks,$rks2
___
	&load_sbox();
	&rev32(@tweak[0],@tweak[0]);
	&encrypt_1blk(@tweak[0]);
$code.=<<___;
	mov $rks,$rks1
	and $remain,$len,#0x0F
	// convert length into blocks
	lsr $blocks,$len,4
	cmp $blocks,#1
	b.lt .return${std}

	cmp $remain,0
	// If the encryption/decryption length is a multiple of 16,
	// all blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
	b.eq .xts_encrypt_blocks${std}

	// If the encryption/decryption length is not a multiple of 16,
	// the last two blocks are encrypted/decrypted in .last_2blks_tweak${std} or .only_2blks_tweak${std}
	// and the other blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
	subs $blocks,$blocks,#1
	b.eq .only_2blks_tweak${std}
.xts_encrypt_blocks${std}:
___
	&rbit(@tweak[0],@tweak[0],$std);
	&rev32_armeb(@tweak[0],@tweak[0]);
	&mov_vec_to_reg(@tweak[0],@twx[0],@twx[1]);
	&compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
	&compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
	&compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
	&compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
	&compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
	&compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
	&compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
$code.=<<___;
.Lxts_8_blocks_process${std}:
	cmp $blocks,#8
	b.lt .Lxts_4_blocks_process${std}
___
	&mov_reg_to_vec(@twx[0],@twx[1],@vtmp[0]);
	&mov_reg_to_vec(@twx[2],@twx[3],@vtmp[1]);
	&mov_reg_to_vec(@twx[4],@twx[5],@vtmp[2]);
	&mov_reg_to_vec(@twx[6],@twx[7],@vtmp[3]);
	&mov_reg_to_vec(@twx[8],@twx[9],@vtmpx[0]);
	&mov_reg_to_vec(@twx[10],@twx[11],@vtmpx[1]);
	&mov_reg_to_vec(@twx[12],@twx[13],@vtmpx[2]);
	&mov_reg_to_vec(@twx[14],@twx[15],@vtmpx[3]);
$code.=<<___;
	ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
___
	&rbit(@vtmp[0],@vtmp[0],$std);
	&rbit(@vtmp[1],@vtmp[1],$std);
	&rbit(@vtmp[2],@vtmp[2],$std);
	&rbit(@vtmp[3],@vtmp[3],$std);
$code.=<<___;
	eor @data[0].16b, @data[0].16b, @vtmp[0].16b
	eor @data[1].16b, @data[1].16b, @vtmp[1].16b
	eor @data[2].16b, @data[2].16b, @vtmp[2].16b
	eor @data[3].16b, @data[3].16b, @vtmp[3].16b
	ld1 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
___
	&rbit(@vtmpx[0],@vtmpx[0],$std);
	&rbit(@vtmpx[1],@vtmpx[1],$std);
	&rbit(@vtmpx[2],@vtmpx[2],$std);
	&rbit(@vtmpx[3],@vtmpx[3],$std);
$code.=<<___;
	eor @datax[0].16b, @datax[0].16b, @vtmpx[0].16b
	eor @datax[1].16b, @datax[1].16b, @vtmpx[1].16b
	eor @datax[2].16b, @datax[2].16b, @vtmpx[2].16b
	eor @datax[3].16b, @datax[3].16b, @vtmpx[3].16b
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],@data[3]);
	&rev32(@datax[0],@datax[0]);
	&rev32(@datax[1],@datax[1]);
	&rev32(@datax[2],@datax[2]);
	&rev32(@datax[3],@datax[3]);
	&transpose(@data,@vtmp);
	&transpose(@datax,@vtmp);
$code.=<<___;
	bl _${prefix}_enc_8blks
___
	&transpose(@vtmp,@datax);
	&transpose(@data,@datax);

	&mov_reg_to_vec(@twx[0],@twx[1],@vtmpx[0]);
	&compute_tweak(@twx[14],@twx[15],@twx[0],@twx[1]);
	&mov_reg_to_vec(@twx[2],@twx[3],@vtmpx[1]);
	&compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
	&mov_reg_to_vec(@twx[4],@twx[5],@vtmpx[2]);
	&compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
	&mov_reg_to_vec(@twx[6],@twx[7],@vtmpx[3]);
	&compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
	&mov_reg_to_vec(@twx[8],@twx[9],@tweak[0]);
	&compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
	&mov_reg_to_vec(@twx[10],@twx[11],@tweak[1]);
	&compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
	&mov_reg_to_vec(@twx[12],@twx[13],@tweak[2]);
	&compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
	&mov_reg_to_vec(@twx[14],@twx[15],@tweak[3]);
	&compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
$code.=<<___;
	eor @vtmp[0].16b, @vtmp[0].16b, @vtmpx[0].16b
	eor @vtmp[1].16b, @vtmp[1].16b, @vtmpx[1].16b
	eor @vtmp[2].16b, @vtmp[2].16b, @vtmpx[2].16b
	eor @vtmp[3].16b, @vtmp[3].16b, @vtmpx[3].16b
	eor @data[0].16b, @data[0].16b, @tweak[0].16b
	eor @data[1].16b, @data[1].16b, @tweak[1].16b
	eor @data[2].16b, @data[2].16b, @tweak[2].16b
	eor @data[3].16b, @data[3].16b, @tweak[3].16b

	// save the last tweak
	st1 {@tweak[3].4s},[$ivp]
	st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
	st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
	subs $blocks,$blocks,#8
	b.gt .Lxts_8_blocks_process${std}
	b 100f
.Lxts_4_blocks_process${std}:
___
	&mov_reg_to_vec(@twx[0],@twx[1],@tweak[0]);
	&mov_reg_to_vec(@twx[2],@twx[3],@tweak[1]);
	&mov_reg_to_vec(@twx[4],@twx[5],@tweak[2]);
	&mov_reg_to_vec(@twx[6],@twx[7],@tweak[3]);
$code.=<<___;
	cmp $blocks,#4
	b.lt 1f
	ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
___
	&rbit(@tweak[0],@tweak[0],$std);
	&rbit(@tweak[1],@tweak[1],$std);
	&rbit(@tweak[2],@tweak[2],$std);
	&rbit(@tweak[3],@tweak[3],$std);
$code.=<<___;
	eor @data[0].16b, @data[0].16b, @tweak[0].16b
	eor @data[1].16b, @data[1].16b, @tweak[1].16b
	eor @data[2].16b, @data[2].16b, @tweak[2].16b
	eor @data[3].16b, @data[3].16b, @tweak[3].16b
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],@data[3]);
	&transpose(@data,@vtmp);
$code.=<<___;
	bl _${prefix}_enc_4blks
___
	&transpose(@vtmp,@data);
$code.=<<___;
	eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
	eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
	eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
	eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b
	st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
	sub $blocks,$blocks,#4
___
	&mov_reg_to_vec(@twx[8],@twx[9],@tweak[0]);
	&mov_reg_to_vec(@twx[10],@twx[11],@tweak[1]);
	&mov_reg_to_vec(@twx[12],@twx[13],@tweak[2]);
$code.=<<___;
	// save the last tweak
	st1 {@tweak[3].4s},[$ivp]
1:
	// process last block
	cmp $blocks,#1
	b.lt 100f
	b.gt 1f
	ld1 {@data[0].4s},[$inp],#16
___
	&rbit(@tweak[0],@tweak[0],$std);
$code.=<<___;
	eor @data[0].16b, @data[0].16b, @tweak[0].16b
___
	&rev32(@data[0],@data[0]);
	&encrypt_1blk(@data[0]);
$code.=<<___;
	eor @data[0].16b, @data[0].16b, @tweak[0].16b
	st1 {@data[0].4s},[$outp],#16
	// save the last tweak
	st1 {@tweak[0].4s},[$ivp]
	b 100f
1:	// process last 2 blocks
	cmp $blocks,#2
	b.gt 1f
	ld1 {@data[0].4s,@data[1].4s},[$inp],#32
___
	&rbit(@tweak[0],@tweak[0],$std);
	&rbit(@tweak[1],@tweak[1],$std);
$code.=<<___;
	eor @data[0].16b, @data[0].16b, @tweak[0].16b
	eor @data[1].16b, @data[1].16b, @tweak[1].16b
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&transpose(@data,@vtmp);
$code.=<<___;
	bl _${prefix}_enc_4blks
___
	&transpose(@vtmp,@data);
$code.=<<___;
	eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
	eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
	st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
	// save the last tweak
	st1 {@tweak[1].4s},[$ivp]
	b 100f
1:	// process last 3 blocks
	ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
___
	&rbit(@tweak[0],@tweak[0],$std);
	&rbit(@tweak[1],@tweak[1],$std);
	&rbit(@tweak[2],@tweak[2],$std);
$code.=<<___;
	eor @data[0].16b, @data[0].16b, @tweak[0].16b
	eor @data[1].16b, @data[1].16b, @tweak[1].16b
	eor @data[2].16b, @data[2].16b, @tweak[2].16b
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&rev32(@data[2],@data[2]);
	&transpose(@data,@vtmp);
$code.=<<___;
	bl _${prefix}_enc_4blks
___
	&transpose(@vtmp,@data);
$code.=<<___;
	eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
	eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
	eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
	st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
	// save the last tweak
	st1 {@tweak[2].4s},[$ivp]
100:
	cmp $remain,0
	b.eq .return${std}

// This branch calculates the last two tweaks,
// when the encryption/decryption length is larger than 32
.last_2blks_tweak${std}:
	ld1 {@tweak[0].4s},[$ivp]
___
	&rev32_armeb(@tweak[0],@tweak[0]);
	&compute_tweak_vec(@tweak[0],@tweak[1],$std);
	&compute_tweak_vec(@tweak[1],@tweak[2],$std);
$code.=<<___;
	b .check_dec${std}


// This branch calculates the last two tweaks,
// when the encryption/decryption length is exactly 32, which only needs two tweaks
.only_2blks_tweak${std}:
	mov @tweak[1].16b,@tweak[0].16b
___
	&rev32_armeb(@tweak[1],@tweak[1]);
	&compute_tweak_vec(@tweak[1],@tweak[2]);
$code.=<<___;
	b .check_dec${std}


// Determine whether encryption or decryption is required.
// The last two tweaks need to be swapped for decryption.
.check_dec${std}:
	// encryption:1 decryption:0
	cmp $enc,1
	b.eq .process_last_2blks${std}
	mov @vtmp[0].16B,@tweak[1].16b
	mov @tweak[1].16B,@tweak[2].16b
	mov @tweak[2].16B,@vtmp[0].16b

.process_last_2blks${std}:
___
	&rev32_armeb(@tweak[1],@tweak[1]);
	&rev32_armeb(@tweak[2],@tweak[2]);
$code.=<<___;
	ld1 {@data[0].4s},[$inp],#16
	eor @data[0].16b, @data[0].16b, @tweak[1].16b
___
	&rev32(@data[0],@data[0]);
	&encrypt_1blk(@data[0]);
$code.=<<___;
	eor @data[0].16b, @data[0].16b, @tweak[1].16b
	st1 {@data[0].4s},[$outp],#16

	sub $lastBlk,$outp,16
.loop${std}:
	subs $remain,$remain,1
	ldrb $wtmp0,[$lastBlk,$remain]
	ldrb $wtmp1,[$inp,$remain]
	strb $wtmp1,[$lastBlk,$remain]
	strb $wtmp0,[$outp,$remain]
	b.gt .loop${std}
	ld1 {@data[0].4s}, [$lastBlk]
	eor @data[0].16b, @data[0].16b, @tweak[2].16b
___
	&rev32(@data[0],@data[0]);
	&encrypt_1blk(@data[0]);
$code.=<<___;
	eor @data[0].16b, @data[0].16b, @tweak[2].16b
	st1 {@data[0].4s}, [$lastBlk]
.return${std}:
	ldp d14, d15, [sp], #0x10
	ldp d12, d13, [sp], #0x10
	ldp d10, d11, [sp], #0x10
	ldp d8, d9, [sp], #0x10
	ldp x29, x30, [sp], #0x10
	ldp x27, x28, [sp], #0x10
	ldp x25, x26, [sp], #0x10
	ldp x23, x24, [sp], #0x10
	ldp x21, x22, [sp], #0x10
	ldp x19, x20, [sp], #0x10
	ldp x17, x18, [sp], #0x10
	ldp x15, x16, [sp], #0x10
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size ${prefix}_xts_encrypt${std},.-${prefix}_xts_encrypt${std}
___
} # end of gen_xts_cipher
&gen_xts_cipher("_gb");
&gen_xts_cipher("");
}}}
########################################
open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/\/\// and !/^$/);
	print;
}
close SELF;
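# The loop above copies this file's leading comment block into the generated
# assembly as '//' comments; the loop below expands any `...` constructs left
# in $code and prints the result.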

foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;
	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";