#! /usr/bin/env perl
# Copyright 2022 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# This module implements SM4 with ASIMD and AESE on AARCH64
#
# Dec 2022
#

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

$prefix="vpsm4_ex";
my @vtmp=map("v$_",(0..3));
my @qtmp=map("q$_",(0..3));
my @data=map("v$_",(4..7));
my @datax=map("v$_",(8..11));
my ($rk0,$rk1)=("v12","v13");
my ($rka,$rkb)=("v14","v15");
my @vtmpx=map("v$_",(12..15));
my ($vtmp4,$vtmp5)=("v24","v25");
my ($MaskV,$TAHMatV,$TALMatV,$ATAHMatV,$ATALMatV,$ANDMaskV)=("v26","v27","v28","v29","v30","v31");
my ($MaskQ,$TAHMatQ,$TALMatQ,$ATAHMatQ,$ATALMatQ,$ANDMaskQ)=("q26","q27","q28","q29","q30","q31");

my ($inp,$outp,$blocks,$rks)=("x0","x1","w2","x3");
my ($tmpw,$tmp,$wtmp0,$wtmp1,$wtmp2)=("w6","x6","w7","w8","w9");
my ($xtmp1,$xtmp2)=("x8","x9");
my ($ptr,$counter)=("x10","w11");
my ($word0,$word1,$word2,$word3)=("w12","w13","w14","w15");

sub rev32() {
	my $dst = shift;
	my $src = shift;

	if ($src and ("$src" ne "$dst")) {
		$code.=<<___;
#ifndef __AARCH64EB__
	rev32	$dst.16b,$src.16b
#else
	mov	$dst.16b,$src.16b
#endif
___
	} else {
		$code.=<<___;
#ifndef __AARCH64EB__
	rev32	$dst.16b,$dst.16b
#endif
___
	}
}

sub rev32_armeb() {
	my $dst = shift;
	my $src = shift;

	if ($src and ("$src" ne "$dst")) {
		$code.=<<___;
#ifdef __AARCH64EB__
	rev32	$dst.16b,$src.16b
#else
	mov	$dst.16b,$src.16b
#endif
___
	} else {
		$code.=<<___;
#ifdef __AARCH64EB__
	rev32	$dst.16b,$dst.16b
#endif
___
	}
}

sub rbit() {
	my $dst = shift;
	my $src = shift;
	my $std = shift;

	if ($src and ("$src" ne "$dst")) {
		if ($std eq "_gb") {
			$code.=<<___;
	rbit	$dst.16b,$src.16b
___
		} else {
			$code.=<<___;
	mov	$dst.16b,$src.16b
___
		}
	} else {
		if ($std eq "_gb") {
			$code.=<<___;
	rbit	$dst.16b,$src.16b
___
		}
	}
}

sub transpose() {
	my ($dat0,$dat1,$dat2,$dat3,$vt0,$vt1,$vt2,$vt3) = @_;

	$code.=<<___;
	zip1	$vt0.4s,$dat0.4s,$dat1.4s
	zip2	$vt1.4s,$dat0.4s,$dat1.4s
	zip1	$vt2.4s,$dat2.4s,$dat3.4s
	zip2	$vt3.4s,$dat2.4s,$dat3.4s
	zip1	$dat0.2d,$vt0.2d,$vt2.2d
	zip2	$dat1.2d,$vt0.2d,$vt2.2d
	zip1	$dat2.2d,$vt1.2d,$vt3.2d
	zip2	$dat3.2d,$vt1.2d,$vt3.2d
___
}
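
# The zip sequence above is a two-stage 4x4 transpose of 32-bit words.
# Below is a minimal pure-Perl model of it (an illustrative sketch; the
# _ref_* helpers here and later are editorial and are never called or
# emitted): stage one interleaves 32-bit lanes (zip1/zip2 .4s), stage two
# interleaves 64-bit halves (zip1/zip2 .2d), so row i of the result holds
# word i of each input vector.
sub _ref_transpose4x4 {
	my @m = @_;	# four array refs of four 32-bit words each
	my @t = (
		[ $m[0][0], $m[1][0], $m[0][1], $m[1][1] ],	# zip1 .4s
		[ $m[0][2], $m[1][2], $m[0][3], $m[1][3] ],	# zip2 .4s
		[ $m[2][0], $m[3][0], $m[2][1], $m[3][1] ],	# zip1 .4s
		[ $m[2][2], $m[3][2], $m[2][3], $m[3][3] ],	# zip2 .4s
	);
	return (
		[ @{$t[0]}[0,1], @{$t[2]}[0,1] ],	# zip1 .2d
		[ @{$t[0]}[2,3], @{$t[2]}[2,3] ],	# zip2 .2d
		[ @{$t[1]}[0,1], @{$t[3]}[0,1] ],	# zip1 .2d
		[ @{$t[1]}[2,3], @{$t[3]}[2,3] ],	# zip2 .2d
	);
}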

# matrix multiplication Mat*x = (lowerMat*x) ^ (higherMat*x)
sub mul_matrix() {
	my $x = shift;
	my $higherMat = shift;
	my $lowerMat = shift;
	my $tmp = shift;
	$code.=<<___;
	ushr	$tmp.16b, $x.16b, 4
	and	$x.16b, $x.16b, $ANDMaskV.16b
	tbl	$x.16b, {$lowerMat.16b}, $x.16b
	tbl	$tmp.16b, {$higherMat.16b}, $tmp.16b
	eor	$x.16b, $x.16b, $tmp.16b
___
}
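
# A minimal pure-Perl model of mul_matrix (illustrative sketch, never
# called): each GF(2) 8x8 matrix is stored as two 16-entry byte tables,
# one per input nibble, so per byte Mat*x = higher[x >> 4] ^ lower[x & 0xf].
# The two tbl instructions above perform exactly these nibble-indexed reads.
sub _ref_mul_matrix {
	my ($byte, $higher, $lower) = @_;	# $higher/$lower: refs to 16 bytes
	return $higher->[$byte >> 4] ^ $lower->[$byte & 0x0f];
}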

# sbox operation for 4 lanes of words
sub sbox() {
	my $dat = shift;

	$code.=<<___;
	// optimize sbox using AESE instruction
	tbl	@vtmp[0].16b, {$dat.16b}, $MaskV.16b
___
	&mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, $vtmp4);
	$code.=<<___;
	eor	@vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b
	aese	@vtmp[0].16b,@vtmp[1].16b
___
	&mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, $vtmp4);
	$code.=<<___;
	mov	$dat.16b,@vtmp[0].16b

	// linear transformation
	ushr	@vtmp[0].4s,$dat.4s,32-2
	ushr	@vtmp[1].4s,$dat.4s,32-10
	ushr	@vtmp[2].4s,$dat.4s,32-18
	ushr	@vtmp[3].4s,$dat.4s,32-24
	sli	@vtmp[0].4s,$dat.4s,2
	sli	@vtmp[1].4s,$dat.4s,10
	sli	@vtmp[2].4s,$dat.4s,18
	sli	@vtmp[3].4s,$dat.4s,24
	eor	$vtmp4.16b,@vtmp[0].16b,$dat.16b
	eor	$vtmp4.16b,$vtmp4.16b,@vtmp[1].16b
	eor	$dat.16b,@vtmp[2].16b,@vtmp[3].16b
	eor	$dat.16b,$dat.16b,$vtmp4.16b
___
}
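
# A minimal pure-Perl model of the ushr/sli pattern above (illustrative,
# never called): each ushr+sli pair assembles a 32-bit left rotation, and
# the final eors compute SM4's linear transform
# L(B) = B ^ rol(B,2) ^ rol(B,10) ^ rol(B,18) ^ rol(B,24).
sub _ref_rol32 {
	my ($x, $n) = @_;
	return (($x << $n) | ($x >> (32 - $n))) & 0xffffffff;
}
sub _ref_sm4_L {
	my $b = shift;
	return $b ^ _ref_rol32($b,2) ^ _ref_rol32($b,10)
		  ^ _ref_rol32($b,18) ^ _ref_rol32($b,24);
}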

# sbox operation for 8 lanes of words
sub sbox_double() {
	my $dat = shift;
	my $datx = shift;

	$code.=<<___;
	// optimize sbox using AESE instruction
	tbl	@vtmp[0].16b, {$dat.16b}, $MaskV.16b
	tbl	@vtmp[1].16b, {$datx.16b}, $MaskV.16b
___
	&mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, $vtmp4);
	&mul_matrix(@vtmp[1], $TAHMatV, $TALMatV, $vtmp4);
	$code.=<<___;
	eor	$vtmp5.16b, $vtmp5.16b, $vtmp5.16b
	aese	@vtmp[0].16b,$vtmp5.16b
	aese	@vtmp[1].16b,$vtmp5.16b
___
	&mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV,$vtmp4);
	&mul_matrix(@vtmp[1], $ATAHMatV, $ATALMatV,$vtmp4);
	$code.=<<___;
	mov	$dat.16b,@vtmp[0].16b
	mov	$datx.16b,@vtmp[1].16b

	// linear transformation
	ushr	@vtmp[0].4s,$dat.4s,32-2
	ushr	$vtmp5.4s,$datx.4s,32-2
	ushr	@vtmp[1].4s,$dat.4s,32-10
	ushr	@vtmp[2].4s,$dat.4s,32-18
	ushr	@vtmp[3].4s,$dat.4s,32-24
	sli	@vtmp[0].4s,$dat.4s,2
	sli	$vtmp5.4s,$datx.4s,2
	sli	@vtmp[1].4s,$dat.4s,10
	sli	@vtmp[2].4s,$dat.4s,18
	sli	@vtmp[3].4s,$dat.4s,24
	eor	$vtmp4.16b,@vtmp[0].16b,$dat.16b
	eor	$vtmp4.16b,$vtmp4.16b,@vtmp[1].16b
	eor	$dat.16b,@vtmp[2].16b,@vtmp[3].16b
	eor	$dat.16b,$dat.16b,$vtmp4.16b
	ushr	@vtmp[1].4s,$datx.4s,32-10
	ushr	@vtmp[2].4s,$datx.4s,32-18
	ushr	@vtmp[3].4s,$datx.4s,32-24
	sli	@vtmp[1].4s,$datx.4s,10
	sli	@vtmp[2].4s,$datx.4s,18
	sli	@vtmp[3].4s,$datx.4s,24
	eor	$vtmp4.16b,$vtmp5.16b,$datx.16b
	eor	$vtmp4.16b,$vtmp4.16b,@vtmp[1].16b
	eor	$datx.16b,@vtmp[2].16b,@vtmp[3].16b
	eor	$datx.16b,$datx.16b,$vtmp4.16b
___
}

# sbox operation for a single word
sub sbox_1word () {
	my $word = shift;

	$code.=<<___;
	mov	@vtmp[3].s[0],$word
	// optimize sbox using AESE instruction
	tbl	@vtmp[0].16b, {@vtmp[3].16b}, $MaskV.16b
___
	&mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, @vtmp[2]);
	$code.=<<___;
	eor	@vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b
	aese	@vtmp[0].16b,@vtmp[1].16b
___
	&mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, @vtmp[2]);
	$code.=<<___;

	mov	$wtmp0,@vtmp[0].s[0]
	eor	$word,$wtmp0,$wtmp0,ror #32-2
	eor	$word,$word,$wtmp0,ror #32-10
	eor	$word,$word,$wtmp0,ror #32-18
	eor	$word,$word,$wtmp0,ror #32-24
___
}

# sm4 for one block of data, in scalar registers word0/word1/word2/word3
sub sm4_1blk () {
	my $kptr = shift;

	$code.=<<___;
	ldp	$wtmp0,$wtmp1,[$kptr],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	$tmpw,$word2,$word3
	eor	$wtmp2,$wtmp0,$word1
	eor	$tmpw,$tmpw,$wtmp2
___
	&sbox_1word($tmpw);
	$code.=<<___;
	eor	$word0,$word0,$tmpw
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	$tmpw,$word2,$word3
	eor	$wtmp2,$word0,$wtmp1
	eor	$tmpw,$tmpw,$wtmp2
___
	&sbox_1word($tmpw);
	$code.=<<___;
	ldp	$wtmp0,$wtmp1,[$kptr],8
	eor	$word1,$word1,$tmpw
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	$tmpw,$word0,$word1
	eor	$wtmp2,$wtmp0,$word3
	eor	$tmpw,$tmpw,$wtmp2
___
	&sbox_1word($tmpw);
	$code.=<<___;
	eor	$word2,$word2,$tmpw
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	$tmpw,$word0,$word1
	eor	$wtmp2,$word2,$wtmp1
	eor	$tmpw,$tmpw,$wtmp2
___
	&sbox_1word($tmpw);
	$code.=<<___;
	eor	$word3,$word3,$tmpw
___
}
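
# A minimal pure-Perl model of the 32-round flow driven by sm4_1blk
# (illustrative sketch, never called; _ref_sbox_T is a hypothetical helper
# standing for the per-word sbox-plus-L step that sbox_1word emits). The
# generated code runs this body as eight passes of four rounds each, then
# reverses the four words.
sub _ref_sm4_rounds {
	my ($rk, @w) = @_;	# $rk: ref to 32 round keys; @w: four words
	for my $i (0 .. 31) {
		$w[$i % 4] ^= _ref_sbox_T($w[($i+1)%4] ^ $w[($i+2)%4]
					  ^ $w[($i+3)%4] ^ $rk->[$i]);
	}
	return reverse @w;	# matches the final word3..word0 reversal
}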

# sm4 for 4-lanes of data, in neon registers data0/data1/data2/data3
sub sm4_4blks () {
	my $kptr = shift;

	$code.=<<___;
	ldp	$wtmp0,$wtmp1,[$kptr],8
	dup	$rk0.4s,$wtmp0
	dup	$rk1.4s,$wtmp1

	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	$rka.16b,@data[2].16b,@data[3].16b
	eor	$rk0.16b,@data[1].16b,$rk0.16b
	eor	$rk0.16b,$rka.16b,$rk0.16b
___
	&sbox($rk0);
	$code.=<<___;
	eor	@data[0].16b,@data[0].16b,$rk0.16b

	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	$rka.16b,$rka.16b,@data[0].16b
	eor	$rk1.16b,$rka.16b,$rk1.16b
___
	&sbox($rk1);
	$code.=<<___;
	ldp	$wtmp0,$wtmp1,[$kptr],8
	eor	@data[1].16b,@data[1].16b,$rk1.16b

	dup	$rk0.4s,$wtmp0
	dup	$rk1.4s,$wtmp1

	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	$rka.16b,@data[0].16b,@data[1].16b
	eor	$rk0.16b,@data[3].16b,$rk0.16b
	eor	$rk0.16b,$rka.16b,$rk0.16b
___
	&sbox($rk0);
	$code.=<<___;
	eor	@data[2].16b,@data[2].16b,$rk0.16b

	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	$rka.16b,$rka.16b,@data[2].16b
	eor	$rk1.16b,$rka.16b,$rk1.16b
___
	&sbox($rk1);
	$code.=<<___;
	eor	@data[3].16b,@data[3].16b,$rk1.16b
___
}

# sm4 for 8 lanes of data, in neon registers
# data0/data1/data2/data3 datax0/datax1/datax2/datax3
sub sm4_8blks () {
	my $kptr = shift;

	$code.=<<___;
	ldp	$wtmp0,$wtmp1,[$kptr],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	dup	$rk0.4s,$wtmp0
	eor	$rka.16b,@data[2].16b,@data[3].16b
	eor	$rkb.16b,@datax[2].16b,@datax[3].16b
	eor	@vtmp[0].16b,@data[1].16b,$rk0.16b
	eor	@vtmp[1].16b,@datax[1].16b,$rk0.16b
	eor	$rk0.16b,$rka.16b,@vtmp[0].16b
	eor	$rk1.16b,$rkb.16b,@vtmp[1].16b
___
	&sbox_double($rk0,$rk1);
	$code.=<<___;
	eor	@data[0].16b,@data[0].16b,$rk0.16b
	eor	@datax[0].16b,@datax[0].16b,$rk1.16b

	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	dup	$rk1.4s,$wtmp1
	eor	$rka.16b,$rka.16b,@data[0].16b
	eor	$rkb.16b,$rkb.16b,@datax[0].16b
	eor	$rk0.16b,$rka.16b,$rk1.16b
	eor	$rk1.16b,$rkb.16b,$rk1.16b
___
	&sbox_double($rk0,$rk1);
	$code.=<<___;
	ldp	$wtmp0,$wtmp1,[$kptr],8
	eor	@data[1].16b,@data[1].16b,$rk0.16b
	eor	@datax[1].16b,@datax[1].16b,$rk1.16b

	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	dup	$rk0.4s,$wtmp0
	eor	$rka.16b,@data[0].16b,@data[1].16b
	eor	$rkb.16b,@datax[0].16b,@datax[1].16b
	eor	@vtmp[0].16b,@data[3].16b,$rk0.16b
	eor	@vtmp[1].16b,@datax[3].16b,$rk0.16b
	eor	$rk0.16b,$rka.16b,@vtmp[0].16b
	eor	$rk1.16b,$rkb.16b,@vtmp[1].16b
___
	&sbox_double($rk0,$rk1);
	$code.=<<___;
	eor	@data[2].16b,@data[2].16b,$rk0.16b
	eor	@datax[2].16b,@datax[2].16b,$rk1.16b

	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	dup	$rk1.4s,$wtmp1
	eor	$rka.16b,$rka.16b,@data[2].16b
	eor	$rkb.16b,$rkb.16b,@datax[2].16b
	eor	$rk0.16b,$rka.16b,$rk1.16b
	eor	$rk1.16b,$rkb.16b,$rk1.16b
___
	&sbox_double($rk0,$rk1);
	$code.=<<___;
	eor	@data[3].16b,@data[3].16b,$rk0.16b
	eor	@datax[3].16b,@datax[3].16b,$rk1.16b
___
}

sub encrypt_1blk_norev() {
	my $dat = shift;

	$code.=<<___;
	mov	$ptr,$rks
	mov	$counter,#8
	mov	$word0,$dat.s[0]
	mov	$word1,$dat.s[1]
	mov	$word2,$dat.s[2]
	mov	$word3,$dat.s[3]
10:
___
	&sm4_1blk($ptr);
	$code.=<<___;
	subs	$counter,$counter,#1
	b.ne	10b
	mov	$dat.s[0],$word3
	mov	$dat.s[1],$word2
	mov	$dat.s[2],$word1
	mov	$dat.s[3],$word0
___
}

sub encrypt_1blk() {
	my $dat = shift;

	&encrypt_1blk_norev($dat);
	&rev32($dat,$dat);
}

sub encrypt_4blks() {
	$code.=<<___;
	mov	$ptr,$rks
	mov	$counter,#8
10:
___
	&sm4_4blks($ptr);
	$code.=<<___;
	subs	$counter,$counter,#1
	b.ne	10b
___
	&rev32(@vtmp[3],@data[0]);
	&rev32(@vtmp[2],@data[1]);
	&rev32(@vtmp[1],@data[2]);
	&rev32(@vtmp[0],@data[3]);
}

sub encrypt_8blks() {
	$code.=<<___;
	mov	$ptr,$rks
	mov	$counter,#8
10:
___
	&sm4_8blks($ptr);
	$code.=<<___;
	subs	$counter,$counter,#1
	b.ne	10b
___
	&rev32(@vtmp[3],@data[0]);
	&rev32(@vtmp[2],@data[1]);
	&rev32(@vtmp[1],@data[2]);
	&rev32(@vtmp[0],@data[3]);
	&rev32(@data[3],@datax[0]);
	&rev32(@data[2],@datax[1]);
	&rev32(@data[1],@datax[2]);
	&rev32(@data[0],@datax[3]);
}

sub load_sbox () {
	my $data = shift;

	$code.=<<___;
	ldr	$MaskQ, .Lsbox_magic
	ldr	$TAHMatQ, .Lsbox_magic+16
	ldr	$TALMatQ, .Lsbox_magic+32
	ldr	$ATAHMatQ, .Lsbox_magic+48
	ldr	$ATALMatQ, .Lsbox_magic+64
	ldr	$ANDMaskQ, .Lsbox_magic+80
___
}

sub mov_reg_to_vec() {
	my $src0 = shift;
	my $src1 = shift;
	my $desv = shift;
	$code.=<<___;
	mov	$desv.d[0],$src0
	mov	$desv.d[1],$src1
___
	&rev32_armeb($desv,$desv);
}

sub mov_vec_to_reg() {
	my $srcv = shift;
	my $des0 = shift;
	my $des1 = shift;
	$code.=<<___;
	mov	$des0,$srcv.d[0]
	mov	$des1,$srcv.d[1]
___
}

sub compute_tweak() {
	my $src0 = shift;
	my $src1 = shift;
	my $des0 = shift;
	my $des1 = shift;
	$code.=<<___;
	mov	$wtmp0,0x87
	extr	$xtmp2,$src1,$src1,#32
	extr	$des1,$src1,$src0,#63
	and	$wtmp1,$wtmp0,$wtmp2,asr#31
	eor	$des0,$xtmp1,$src0,lsl#1
___
}
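
# A minimal pure-Perl model of compute_tweak (illustrative sketch, never
# called; assumes 64-bit Perl integers): the 128-bit XTS tweak, held as two
# little-endian 64-bit halves, is multiplied by x in GF(2^128) modulo
# x^128 + x^7 + x^2 + x + 1, i.e. shifted left one bit with 0x87 folded
# into the low byte whenever bit 127 falls off the top.
sub _ref_xts_double {
	my ($lo, $hi) = @_;
	my $carry = ($hi >> 63) & 1;
	my $new_hi = (($hi << 1) | ($lo >> 63)) & 0xffffffffffffffff;
	my $new_lo = (($lo << 1) ^ ($carry ? 0x87 : 0)) & 0xffffffffffffffff;
	return ($new_lo, $new_hi);
}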

sub compute_tweak_vec() {
	my $src = shift;
	my $des = shift;
	my $std = shift;
	&rbit(@vtmp[2],$src,$std);
	$code.=<<___;
	ldr	@qtmp[0], .Lxts_magic
	shl	$des.16b, @vtmp[2].16b, #1
	ext	@vtmp[1].16b, @vtmp[2].16b, @vtmp[2].16b,#15
	ushr	@vtmp[1].16b, @vtmp[1].16b, #7
	mul	@vtmp[1].16b, @vtmp[1].16b, @vtmp[0].16b
	eor	$des.16b, $des.16b, @vtmp[1].16b
___
	&rbit($des,$des,$std);
}

$code=<<___;
#include "arm_arch.h"
.arch	armv8-a+crypto
.text

.type	_${prefix}_consts,%object
.align	7
_${prefix}_consts:
.Lck:
	.long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
	.long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
	.long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
	.long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
	.long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
	.long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
	.long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
	.long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
.Lfk:
	.dword 0x56aa3350a3b1bac6,0xb27022dc677d9197
.Lshuffles:
	.dword 0x0B0A090807060504,0x030201000F0E0D0C
.Lxts_magic:
	.dword 0x0101010101010187,0x0101010101010101
.Lsbox_magic:
	.dword 0x0b0e0104070a0d00,0x0306090c0f020508
	.dword 0x62185a2042387a00,0x22581a6002783a40
	.dword 0x15df62a89e54e923,0xc10bb67c4a803df7
	.dword 0xb9aa6b78c1d21300,0x1407c6d56c7fbead
	.dword 0x6404462679195b3b,0xe383c1a1fe9edcbc
	.dword 0x0f0f0f0f0f0f0f0f,0x0f0f0f0f0f0f0f0f

.size	_${prefix}_consts,.-_${prefix}_consts
___
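
# The .Lck words are SM4's key-schedule constants: byte j of word i is
# (4*i + j) * 7 mod 256. A short generator reproduces the table above
# (illustrative sketch, never called):
sub _ref_gen_ck {
	my @ck;
	for my $i (0 .. 31) {
		my $w = 0;
		$w = ($w << 8) | ((4*$i + $_) * 7 % 256) for 0 .. 3;
		push @ck, $w;	# sprintf("%08X",$ck[0]) eq "00070E15"
	}
	return @ck;
}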

{{{
my ($key,$keys,$enc)=("x0","x1","w2");
my ($pointer,$schedules,$wtmp,$roundkey)=("x5","x6","w7","w8");
my ($vkey,$vfk,$vmap)=("v5","v6","v7");
$code.=<<___;
.type	_${prefix}_set_key,%function
.align	4
_${prefix}_set_key:
	AARCH64_VALID_CALL_TARGET
	ld1	{$vkey.4s},[$key]
___
	&load_sbox();
	&rev32($vkey,$vkey);
$code.=<<___;
	adr	$pointer,.Lshuffles
	ld1	{$vmap.2d},[$pointer]
	adr	$pointer,.Lfk
	ld1	{$vfk.2d},[$pointer]
	eor	$vkey.16b,$vkey.16b,$vfk.16b
	mov	$schedules,#32
	adr	$pointer,.Lck
	movi	@vtmp[0].16b,#64
	cbnz	$enc,1f
	add	$keys,$keys,124
1:
	mov	$wtmp,$vkey.s[1]
	ldr	$roundkey,[$pointer],#4
	eor	$roundkey,$roundkey,$wtmp
	mov	$wtmp,$vkey.s[2]
	eor	$roundkey,$roundkey,$wtmp
	mov	$wtmp,$vkey.s[3]
	eor	$roundkey,$roundkey,$wtmp
	// optimize sbox using AESE instruction
	mov	@data[0].s[0],$roundkey
	tbl	@vtmp[0].16b, {@data[0].16b}, $MaskV.16b
___
	&mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, @vtmp[2]);
$code.=<<___;
	eor	@vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b
	aese	@vtmp[0].16b,@vtmp[1].16b
___
	&mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, @vtmp[2]);
$code.=<<___;
	mov	$wtmp,@vtmp[0].s[0]
	eor	$roundkey,$wtmp,$wtmp,ror #19
	eor	$roundkey,$roundkey,$wtmp,ror #9
	mov	$wtmp,$vkey.s[0]
	eor	$roundkey,$roundkey,$wtmp
	mov	$vkey.s[0],$roundkey
	cbz	$enc,2f
	str	$roundkey,[$keys],#4
	b	3f
2:
	str	$roundkey,[$keys],#-4
3:
	tbl	$vkey.16b,{$vkey.16b},$vmap.16b
	subs	$schedules,$schedules,#1
	b.ne	1b
	ret
.size	_${prefix}_set_key,.-_${prefix}_set_key
___
}}}
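
# A minimal pure-Perl model of the key-schedule linear step emitted above
# (illustrative, never called; reuses the _ref_rol32 helper defined
# earlier): the two ror-based eors compute L'(B) = B ^ rol(B,13) ^ rol(B,23),
# since ror #19 and ror #9 on 32-bit values equal rol 13 and rol 23.
sub _ref_sm4_Lprime {
	my $b = shift;
	return $b ^ _ref_rol32($b, 13) ^ _ref_rol32($b, 23);
}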


{{{
$code.=<<___;
.type	_${prefix}_enc_4blks,%function
.align	4
_${prefix}_enc_4blks:
	AARCH64_VALID_CALL_TARGET
___
	&encrypt_4blks();
$code.=<<___;
	ret
.size	_${prefix}_enc_4blks,.-_${prefix}_enc_4blks
___
}}}

{{{
$code.=<<___;
.type	_${prefix}_enc_8blks,%function
.align	4
_${prefix}_enc_8blks:
	AARCH64_VALID_CALL_TARGET
___
	&encrypt_8blks();
$code.=<<___;
	ret
.size	_${prefix}_enc_8blks,.-_${prefix}_enc_8blks
___
}}}


{{{
my ($key,$keys)=("x0","x1");
$code.=<<___;
.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	mov	w2,1
	bl	_${prefix}_set_key
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
___
}}}

{{{
my ($key,$keys)=("x0","x1");
$code.=<<___;
.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	mov	w2,0
	bl	_${prefix}_set_key
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}

{{{
sub gen_block () {
	my $dir = shift;
	my ($inp,$outp,$rk)=map("x$_",(0..2));

	$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
	AARCH64_VALID_CALL_TARGET
	ld1	{@data[0].4s},[$inp]
___
	&load_sbox();
	&rev32(@data[0],@data[0]);
	$code.=<<___;
	mov	$rks,$rk
___
	&encrypt_1blk(@data[0]);
	$code.=<<___;
	st1	{@data[0].4s},[$outp]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}

{{{
$code.=<<___;
.globl	${prefix}_ecb_encrypt
.type	${prefix}_ecb_encrypt,%function
.align	5
${prefix}_ecb_encrypt:
	AARCH64_SIGN_LINK_REGISTER
	// convert length into blocks
	lsr	x2,x2,4
	stp	d8,d9,[sp,#-80]!
	stp	d10,d11,[sp,#16]
	stp	d12,d13,[sp,#32]
	stp	d14,d15,[sp,#48]
	stp	x29,x30,[sp,#64]
___
	&load_sbox();
$code.=<<___;
.Lecb_8_blocks_process:
	cmp	$blocks,#8
	b.lt	.Lecb_4_blocks_process
	ld4	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
	ld4	{@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],@data[3]);
	&rev32(@datax[0],@datax[0]);
	&rev32(@datax[1],@datax[1]);
	&rev32(@datax[2],@datax[2]);
	&rev32(@datax[3],@datax[3]);
$code.=<<___;
	bl	_${prefix}_enc_8blks
	st4	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
	st4	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
	subs	$blocks,$blocks,#8
	b.gt	.Lecb_8_blocks_process
	b	100f
.Lecb_4_blocks_process:
	cmp	$blocks,#4
	b.lt	1f
	ld4	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],@data[3]);
$code.=<<___;
	bl	_${prefix}_enc_4blks
	st4	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
	sub	$blocks,$blocks,#4
1:
	// process last block
	cmp	$blocks,#1
	b.lt	100f
	b.gt	1f
	ld1	{@data[0].4s},[$inp]
___
	&rev32(@data[0],@data[0]);
	&encrypt_1blk(@data[0]);
$code.=<<___;
	st1	{@data[0].4s},[$outp]
	b	100f
1:	// process last 2 blocks
	ld4	{@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp],#16
	ld4	{@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$inp],#16
	cmp	$blocks,#2
	b.gt	1f
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],@data[3]);
$code.=<<___;
	bl	_${prefix}_enc_4blks
	st4	{@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
	st4	{@vtmp[0].s-@vtmp[3].s}[1],[$outp]
	b	100f
1:	// process last 3 blocks
	ld4	{@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$inp],#16
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],@data[3]);
$code.=<<___;
	bl	_${prefix}_enc_4blks
	st4	{@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
	st4	{@vtmp[0].s-@vtmp[3].s}[1],[$outp],#16
	st4	{@vtmp[0].s-@vtmp[3].s}[2],[$outp]
100:
	ldp	d10,d11,[sp,#16]
	ldp	d12,d13,[sp,#32]
	ldp	d14,d15,[sp,#48]
	ldp	x29,x30,[sp,#64]
	ldp	d8,d9,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
___
}}}

{{{
my ($len,$ivp,$enc)=("x2","x4","w5");
my $ivec0=("v3");
my $ivec1=("v15");

$code.=<<___;
.globl	${prefix}_cbc_encrypt
.type	${prefix}_cbc_encrypt,%function
.align	5
${prefix}_cbc_encrypt:
	AARCH64_VALID_CALL_TARGET
	lsr	$len,$len,4
___
	&load_sbox();
$code.=<<___;
	cbz	$enc,.Ldec
	ld1	{$ivec0.4s},[$ivp]
.Lcbc_4_blocks_enc:
	cmp	$blocks,#4
	b.lt	1f
	ld1	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
	eor	@data[0].16b,@data[0].16b,$ivec0.16b
___
	&rev32(@data[1],@data[1]);
	&rev32(@data[0],@data[0]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],@data[3]);
	&encrypt_1blk_norev(@data[0]);
$code.=<<___;
	eor	@data[1].16b,@data[1].16b,@data[0].16b
___
	&encrypt_1blk_norev(@data[1]);
	&rev32(@data[0],@data[0]);

$code.=<<___;
	eor	@data[2].16b,@data[2].16b,@data[1].16b
___
	&encrypt_1blk_norev(@data[2]);
	&rev32(@data[1],@data[1]);
$code.=<<___;
	eor	@data[3].16b,@data[3].16b,@data[2].16b
___
	&encrypt_1blk_norev(@data[3]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],@data[3]);
$code.=<<___;
	orr	$ivec0.16b,@data[3].16b,@data[3].16b
	st1	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
	subs	$blocks,$blocks,#4
	b.ne	.Lcbc_4_blocks_enc
	b	2f
1:
	subs	$blocks,$blocks,#1
	b.lt	2f
	ld1	{@data[0].4s},[$inp],#16
	eor	$ivec0.16b,$ivec0.16b,@data[0].16b
___
	&rev32($ivec0,$ivec0);
	&encrypt_1blk($ivec0);
$code.=<<___;
	st1	{$ivec0.4s},[$outp],#16
	b	1b
2:
	// save back IV
	st1	{$ivec0.4s},[$ivp]
	ret

.Ldec:
	// decryption mode starts
	AARCH64_SIGN_LINK_REGISTER
	stp	d8,d9,[sp,#-80]!
	stp	d10,d11,[sp,#16]
	stp	d12,d13,[sp,#32]
	stp	d14,d15,[sp,#48]
	stp	x29,x30,[sp,#64]
.Lcbc_8_blocks_dec:
	cmp	$blocks,#8
	b.lt	1f
	ld4	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
	add	$ptr,$inp,#64
	ld4	{@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$ptr]
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],@data[3]);
	&rev32(@datax[0],@datax[0]);
	&rev32(@datax[1],@datax[1]);
	&rev32(@datax[2],@datax[2]);
	&rev32(@datax[3],@datax[3]);
$code.=<<___;
	bl	_${prefix}_enc_8blks
___
	&transpose(@vtmp,@datax);
	&transpose(@data,@datax);
$code.=<<___;
	ld1	{$ivec1.4s},[$ivp]
	ld1	{@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
	// note ivec1 and vtmpx[3] are reusing the same register
	// care needs to be taken to avoid conflict
	eor	@vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
	ld1	{@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
	eor	@vtmp[1].16b,@vtmp[1].16b,@datax[0].16b
	eor	@vtmp[2].16b,@vtmp[2].16b,@datax[1].16b
	eor	@vtmp[3].16b,@vtmp[3].16b,@datax[2].16b
	// save back IV
	st1	{@vtmpx[3].4s}, [$ivp]
	eor	@data[0].16b,@data[0].16b,@datax[3].16b
	eor	@data[1].16b,@data[1].16b,@vtmpx[0].16b
	eor	@data[2].16b,@data[2].16b,@vtmpx[1].16b
	eor	@data[3].16b,@data[3].16b,@vtmpx[2].16b
	st1	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
	st1	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
	subs	$blocks,$blocks,#8
	b.gt	.Lcbc_8_blocks_dec
	b.eq	100f
1:
	ld1	{$ivec1.4s},[$ivp]
.Lcbc_4_blocks_dec:
	cmp	$blocks,#4
	b.lt	1f
	ld4	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],@data[3]);
$code.=<<___;
	bl	_${prefix}_enc_4blks
	ld1	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
___
	&transpose(@vtmp,@datax);
$code.=<<___;
	eor	@vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
	eor	@vtmp[1].16b,@vtmp[1].16b,@data[0].16b
	orr	$ivec1.16b,@data[3].16b,@data[3].16b
	eor	@vtmp[2].16b,@vtmp[2].16b,@data[1].16b
	eor	@vtmp[3].16b,@vtmp[3].16b,@data[2].16b
	st1	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
	subs	$blocks,$blocks,#4
	b.gt	.Lcbc_4_blocks_dec
	// save back IV
	st1	{@data[3].4s}, [$ivp]
	b	100f
1:	// last block
	subs	$blocks,$blocks,#1
	b.lt	100f
	b.gt	1f
	ld1	{@data[0].4s},[$inp],#16
	// save back IV
	st1	{@data[0].4s}, [$ivp]
___
	&rev32(@datax[0],@data[0]);
	&encrypt_1blk(@datax[0]);
$code.=<<___;
	eor	@datax[0].16b,@datax[0].16b,$ivec1.16b
	st1	{@datax[0].4s},[$outp],#16
	b	100f
1:	// last two blocks
	ld4	{@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp]
	add	$ptr,$inp,#16
	ld4	{@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$ptr],#16
	subs	$blocks,$blocks,1
	b.gt	1f
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],@data[3]);
$code.=<<___;
	bl	_${prefix}_enc_4blks
	ld1	{@data[0].4s,@data[1].4s},[$inp],#32
___
	&transpose(@vtmp,@datax);
$code.=<<___;
	eor	@vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
	eor	@vtmp[1].16b,@vtmp[1].16b,@data[0].16b
	st1	{@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
	// save back IV
	st1	{@data[1].4s}, [$ivp]
	b	100f
1:	// last 3 blocks
	ld4	{@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$ptr]
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],@data[3]);
$code.=<<___;
	bl	_${prefix}_enc_4blks
	ld1	{@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
___
	&transpose(@vtmp,@datax);
$code.=<<___;
	eor	@vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
	eor	@vtmp[1].16b,@vtmp[1].16b,@data[0].16b
	eor	@vtmp[2].16b,@vtmp[2].16b,@data[1].16b
	st1	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
	// save back IV
	st1	{@data[2].4s}, [$ivp]
100:
	ldp	d10,d11,[sp,#16]
	ldp	d12,d13,[sp,#32]
	ldp	d14,d15,[sp,#48]
	ldp	x29,x30,[sp,#64]
	ldp	d8,d9,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}

{{{
my ($ivp)=("x4");
my ($ctr)=("w5");
my $ivec=("v3");

$code.=<<___;
.globl	${prefix}_ctr32_encrypt_blocks
.type	${prefix}_ctr32_encrypt_blocks,%function
.align	5
${prefix}_ctr32_encrypt_blocks:
	AARCH64_VALID_CALL_TARGET
	ld1	{$ivec.4s},[$ivp]
___
	&rev32($ivec,$ivec);
	&load_sbox();
$code.=<<___;
	cmp	$blocks,#1
	b.ne	1f
	// fast processing for one single block without
	// context saving overhead
___
	&encrypt_1blk($ivec);
$code.=<<___;
	ld1	{@data[0].4s},[$inp]
	eor	@data[0].16b,@data[0].16b,$ivec.16b
	st1	{@data[0].4s},[$outp]
	ret
1:
	AARCH64_SIGN_LINK_REGISTER
	stp	d8,d9,[sp,#-80]!
	stp	d10,d11,[sp,#16]
	stp	d12,d13,[sp,#32]
	stp	d14,d15,[sp,#48]
	stp	x29,x30,[sp,#64]
	mov	$word0,$ivec.s[0]
	mov	$word1,$ivec.s[1]
	mov	$word2,$ivec.s[2]
	mov	$ctr,$ivec.s[3]
.Lctr32_4_blocks_process:
	cmp	$blocks,#4
	b.lt	1f
	dup	@data[0].4s,$word0
	dup	@data[1].4s,$word1
	dup	@data[2].4s,$word2
	mov	@data[3].s[0],$ctr
	add	$ctr,$ctr,#1
	mov	@data[3].s[1],$ctr
	add	$ctr,$ctr,#1
	mov	@data[3].s[2],$ctr
	add	$ctr,$ctr,#1
	mov	@data[3].s[3],$ctr
	add	$ctr,$ctr,#1
	cmp	$blocks,#8
	b.ge	.Lctr32_8_blocks_process
	bl	_${prefix}_enc_4blks
	ld4	{@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
	eor	@vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
	eor	@vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
	eor	@vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
	eor	@vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
	st4	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
	subs	$blocks,$blocks,#4
	b.ne	.Lctr32_4_blocks_process
	b	100f
.Lctr32_8_blocks_process:
	dup	@datax[0].4s,$word0
	dup	@datax[1].4s,$word1
	dup	@datax[2].4s,$word2
	mov	@datax[3].s[0],$ctr
	add	$ctr,$ctr,#1
	mov	@datax[3].s[1],$ctr
	add	$ctr,$ctr,#1
	mov	@datax[3].s[2],$ctr
	add	$ctr,$ctr,#1
	mov	@datax[3].s[3],$ctr
	add	$ctr,$ctr,#1
	bl	_${prefix}_enc_8blks
	ld4	{@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
	ld4	{@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
	eor	@vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
	eor	@vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
	eor	@vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
	eor	@vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
	eor	@data[0].16b,@data[0].16b,@datax[0].16b
	eor	@data[1].16b,@data[1].16b,@datax[1].16b
	eor	@data[2].16b,@data[2].16b,@datax[2].16b
	eor	@data[3].16b,@data[3].16b,@datax[3].16b
	st4	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
	st4	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
	subs	$blocks,$blocks,#8
	b.ne	.Lctr32_4_blocks_process
	b	100f
1:	// last block processing
	subs	$blocks,$blocks,#1
	b.lt	100f
	b.gt	1f
	mov	$ivec.s[0],$word0
	mov	$ivec.s[1],$word1
	mov	$ivec.s[2],$word2
	mov	$ivec.s[3],$ctr
___
	&encrypt_1blk($ivec);
$code.=<<___;
	ld1	{@data[0].4s},[$inp]
	eor	@data[0].16b,@data[0].16b,$ivec.16b
	st1	{@data[0].4s},[$outp]
	b	100f
1:	// last 2 blocks processing
	dup	@data[0].4s,$word0
	dup	@data[1].4s,$word1
	dup	@data[2].4s,$word2
	mov	@data[3].s[0],$ctr
	add	$ctr,$ctr,#1
	mov	@data[3].s[1],$ctr
	subs	$blocks,$blocks,#1
	b.ne	1f
	bl	_${prefix}_enc_4blks
	ld4	{@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
	ld4	{@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
	eor	@vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
	eor	@vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
	eor	@vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
	eor	@vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
	st4	{@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
	st4	{@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
	b	100f
1:	// last 3 blocks processing
	add	$ctr,$ctr,#1
	mov	@data[3].s[2],$ctr
	bl	_${prefix}_enc_4blks
	ld4	{@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
	ld4	{@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
	ld4	{@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[2],[$inp],#16
	eor	@vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
	eor	@vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
	eor	@vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
	eor	@vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
	st4	{@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
	st4	{@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
	st4	{@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[2],[$outp],#16
100:
	ldp	d10,d11,[sp,#16]
	ldp	d12,d13,[sp,#32]
	ldp	d14,d15,[sp,#48]
	ldp	x29,x30,[sp,#64]
	ldp	d8,d9,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}


{{{
my ($blocks,$len)=("x2","x2");
my $ivp=("x5");
my @twx=map("x$_",(12..27));
my ($rks1,$rks2)=("x26","x27");
my $lastBlk=("x26");
my $enc=("w28");
my $remain=("x29");

my @tweak=map("v$_",(16..23));
my $lastTweak=("v25");

sub gen_xts_cipher() {
	my $std = shift;
	$code.=<<___;
.globl	${prefix}_xts_encrypt${std}
.type	${prefix}_xts_encrypt${std},%function
.align	5
${prefix}_xts_encrypt${std}:
	AARCH64_SIGN_LINK_REGISTER
	stp	x15, x16, [sp, #-0x10]!
	stp	x17, x18, [sp, #-0x10]!
	stp	x19, x20, [sp, #-0x10]!
	stp	x21, x22, [sp, #-0x10]!
	stp	x23, x24, [sp, #-0x10]!
	stp	x25, x26, [sp, #-0x10]!
	stp	x27, x28, [sp, #-0x10]!
	stp	x29, x30, [sp, #-0x10]!
	stp	d8, d9, [sp, #-0x10]!
	stp	d10, d11, [sp, #-0x10]!
	stp	d12, d13, [sp, #-0x10]!
	stp	d14, d15, [sp, #-0x10]!
	mov	$rks1,x3
	mov	$rks2,x4
	mov	$enc,w6
	ld1	{@tweak[0].4s}, [$ivp]
	mov	$rks,$rks2
___
	&load_sbox();
	&rev32(@tweak[0],@tweak[0]);
	&encrypt_1blk(@tweak[0]);
$code.=<<___;
	mov	$rks,$rks1
	and	$remain,$len,#0x0F
	// convert length into blocks
	lsr	$blocks,$len,4
	cmp	$blocks,#1
	b.lt	.return${std}

	cmp	$remain,0
	// If the encryption/decryption length is a multiple of 16,
	// all blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
	b.eq	.xts_encrypt_blocks${std}

	// If the encryption/decryption length is not a multiple of 16,
	// the last two blocks are encrypted/decrypted in .last_2blks_tweak${std} or .only_2blks_tweak${std}
	// and the other blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
	subs	$blocks,$blocks,#1
	b.eq	.only_2blks_tweak${std}
.xts_encrypt_blocks${std}:
___
	&rbit(@tweak[0],@tweak[0],$std);
	&rev32_armeb(@tweak[0],@tweak[0]);
	&mov_vec_to_reg(@tweak[0],@twx[0],@twx[1]);
	&compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
	&compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
	&compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
	&compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
	&compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
	&compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
	&compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
$code.=<<___;
.Lxts_8_blocks_process${std}:
	cmp	$blocks,#8
___
	&mov_reg_to_vec(@twx[0],@twx[1],@tweak[0]);
	&compute_tweak(@twx[14],@twx[15],@twx[0],@twx[1]);
	&mov_reg_to_vec(@twx[2],@twx[3],@tweak[1]);
	&compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
	&mov_reg_to_vec(@twx[4],@twx[5],@tweak[2]);
	&compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
	&mov_reg_to_vec(@twx[6],@twx[7],@tweak[3]);
	&compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
	&mov_reg_to_vec(@twx[8],@twx[9],@tweak[4]);
	&compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
	&mov_reg_to_vec(@twx[10],@twx[11],@tweak[5]);
	&compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
	&mov_reg_to_vec(@twx[12],@twx[13],@tweak[6]);
	&compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
	&mov_reg_to_vec(@twx[14],@twx[15],@tweak[7]);
	&compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
$code.=<<___;
	b.lt	.Lxts_4_blocks_process${std}
	ld1	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
___
	&rbit(@tweak[0],@tweak[0],$std);
	&rbit(@tweak[1],@tweak[1],$std);
	&rbit(@tweak[2],@tweak[2],$std);
	&rbit(@tweak[3],@tweak[3],$std);
$code.=<<___;
	eor	@data[0].16b, @data[0].16b, @tweak[0].16b
	eor	@data[1].16b, @data[1].16b, @tweak[1].16b
	eor	@data[2].16b, @data[2].16b, @tweak[2].16b
	eor	@data[3].16b, @data[3].16b, @tweak[3].16b
	ld1	{@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
___
	&rbit(@tweak[4],@tweak[4],$std);
	&rbit(@tweak[5],@tweak[5],$std);
	&rbit(@tweak[6],@tweak[6],$std);
	&rbit(@tweak[7],@tweak[7],$std);
$code.=<<___;
	eor	@datax[0].16b, @datax[0].16b, @tweak[4].16b
	eor	@datax[1].16b, @datax[1].16b, @tweak[5].16b
	eor	@datax[2].16b, @datax[2].16b, @tweak[6].16b
	eor	@datax[3].16b, @datax[3].16b, @tweak[7].16b
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],@data[3]);
	&rev32(@datax[0],@datax[0]);
	&rev32(@datax[1],@datax[1]);
	&rev32(@datax[2],@datax[2]);
	&rev32(@datax[3],@datax[3]);
	&transpose(@data,@vtmp);
	&transpose(@datax,@vtmp);
$code.=<<___;
	bl	_${prefix}_enc_8blks
___
	&transpose(@vtmp,@datax);
	&transpose(@data,@datax);
$code.=<<___;
	eor	@vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
	eor	@vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
	eor	@vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
	eor	@vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b
	eor	@data[0].16b, @data[0].16b, @tweak[4].16b
	eor	@data[1].16b, @data[1].16b, @tweak[5].16b
	eor	@data[2].16b, @data[2].16b, @tweak[6].16b
	eor	@data[3].16b, @data[3].16b, @tweak[7].16b

	// save the last tweak
	mov	$lastTweak.16b,@tweak[7].16b
	st1	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
	st1	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
	subs	$blocks,$blocks,#8
	b.gt	.Lxts_8_blocks_process${std}
	b	100f
.Lxts_4_blocks_process${std}:
	cmp	$blocks,#4
	b.lt	1f
	ld1	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
___
	&rbit(@tweak[0],@tweak[0],$std);
	&rbit(@tweak[1],@tweak[1],$std);
	&rbit(@tweak[2],@tweak[2],$std);
	&rbit(@tweak[3],@tweak[3],$std);
$code.=<<___;
	eor	@data[0].16b, @data[0].16b, @tweak[0].16b
	eor	@data[1].16b, @data[1].16b, @tweak[1].16b
	eor	@data[2].16b, @data[2].16b, @tweak[2].16b
	eor	@data[3].16b, @data[3].16b, @tweak[3].16b
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],@data[3]);
	&transpose(@data,@vtmp);
$code.=<<___;
	bl	_${prefix}_enc_4blks
___
	&transpose(@vtmp,@data);
$code.=<<___;
	eor	@vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
	eor	@vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
	eor	@vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
	eor	@vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b
	st1	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
	sub	$blocks,$blocks,#4
	mov	@tweak[0].16b,@tweak[4].16b
	mov	@tweak[1].16b,@tweak[5].16b
	mov	@tweak[2].16b,@tweak[6].16b
	// save the last tweak
	mov	$lastTweak.16b,@tweak[3].16b
1:
	// process last block
	cmp	$blocks,#1
	b.lt	100f
	b.gt	1f
	ld1	{@data[0].4s},[$inp],#16
___
	&rbit(@tweak[0],@tweak[0],$std);
$code.=<<___;
	eor	@data[0].16b, @data[0].16b, @tweak[0].16b
___
	&rev32(@data[0],@data[0]);
	&encrypt_1blk(@data[0]);
$code.=<<___;
	eor	@data[0].16b, @data[0].16b, @tweak[0].16b
	st1	{@data[0].4s},[$outp],#16
	// save the last tweak
	mov	$lastTweak.16b,@tweak[0].16b
	b	100f
1:	// process last 2 blocks
	cmp	$blocks,#2
	b.gt	1f
	ld1	{@data[0].4s,@data[1].4s},[$inp],#32
___
	&rbit(@tweak[0],@tweak[0],$std);
	&rbit(@tweak[1],@tweak[1],$std);
$code.=<<___;
	eor	@data[0].16b, @data[0].16b, @tweak[0].16b
	eor	@data[1].16b, @data[1].16b, @tweak[1].16b
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&transpose(@data,@vtmp);
$code.=<<___;
	bl	_${prefix}_enc_4blks
___
	&transpose(@vtmp,@data);
$code.=<<___;
	eor	@vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
	eor	@vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
	st1	{@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
	// save the last tweak
	mov	$lastTweak.16b,@tweak[1].16b
	b	100f
1:	// process last 3 blocks
	ld1	{@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
___
	&rbit(@tweak[0],@tweak[0],$std);
	&rbit(@tweak[1],@tweak[1],$std);
	&rbit(@tweak[2],@tweak[2],$std);
$code.=<<___;
	eor	@data[0].16b, @data[0].16b, @tweak[0].16b
	eor	@data[1].16b, @data[1].16b, @tweak[1].16b
	eor	@data[2].16b, @data[2].16b, @tweak[2].16b
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&rev32(@data[2],@data[2]);
	&transpose(@data,@vtmp);
$code.=<<___;
	bl	_${prefix}_enc_4blks
___
	&transpose(@vtmp,@data);
$code.=<<___;
	eor	@vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
	eor	@vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
	eor	@vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
	st1	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
	// save the last tweak
	mov	$lastTweak.16b,@tweak[2].16b
100:
	cmp	$remain,0
	b.eq	.return${std}

	// This branch calculates the last two tweaks,
	// used when the encryption/decryption length is larger than 32
.last_2blks_tweak${std}:
___
	&rev32_armeb($lastTweak,$lastTweak);
	&compute_tweak_vec($lastTweak,@tweak[1],$std);
	&compute_tweak_vec(@tweak[1],@tweak[2],$std);
$code.=<<___;
	b	.check_dec${std}


	// This branch calculates the last two tweaks,
	// used when the encryption/decryption length equals 32 and only two tweaks are needed
.only_2blks_tweak${std}:
	mov	@tweak[1].16b,@tweak[0].16b
___
	&rev32_armeb(@tweak[1],@tweak[1]);
	&compute_tweak_vec(@tweak[1],@tweak[2]);
$code.=<<___;
	b	.check_dec${std}


	// Determine whether encryption or decryption is required.
	// The last two tweaks need to be swapped for decryption.
.check_dec${std}:
	// encryption:1 decryption:0
	cmp	$enc,1
	b.eq	.process_last_2blks${std}
	mov	@vtmp[0].16b,@tweak[1].16b
	mov	@tweak[1].16b,@tweak[2].16b
	mov	@tweak[2].16b,@vtmp[0].16b

.process_last_2blks${std}:
___
	&rev32_armeb(@tweak[1],@tweak[1]);
	&rev32_armeb(@tweak[2],@tweak[2]);
$code.=<<___;
	ld1	{@data[0].4s},[$inp],#16
	eor	@data[0].16b, @data[0].16b, @tweak[1].16b
___
	&rev32(@data[0],@data[0]);
	&encrypt_1blk(@data[0]);
$code.=<<___;
	eor	@data[0].16b, @data[0].16b, @tweak[1].16b
	st1	{@data[0].4s},[$outp],#16

	sub	$lastBlk,$outp,16
.loop${std}:
	subs	$remain,$remain,1
	ldrb	$wtmp0,[$lastBlk,$remain]
	ldrb	$wtmp1,[$inp,$remain]
	strb	$wtmp1,[$lastBlk,$remain]
	strb	$wtmp0,[$outp,$remain]
	b.gt	.loop${std}
	ld1	{@data[0].4s}, [$lastBlk]
	eor	@data[0].16b, @data[0].16b, @tweak[2].16b
___
	&rev32(@data[0],@data[0]);
	&encrypt_1blk(@data[0]);
$code.=<<___;
	eor	@data[0].16b, @data[0].16b, @tweak[2].16b
	st1	{@data[0].4s}, [$lastBlk]
.return${std}:
	ldp	d14, d15, [sp], #0x10
	ldp	d12, d13, [sp], #0x10
	ldp	d10, d11, [sp], #0x10
	ldp	d8, d9, [sp], #0x10
	ldp	x29, x30, [sp], #0x10
	ldp	x27, x28, [sp], #0x10
	ldp	x25, x26, [sp], #0x10
	ldp	x23, x24, [sp], #0x10
	ldp	x21, x22, [sp], #0x10
	ldp	x19, x20, [sp], #0x10
	ldp	x17, x18, [sp], #0x10
	ldp	x15, x16, [sp], #0x10
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	${prefix}_xts_encrypt${std},.-${prefix}_xts_encrypt${std}
___
} # end of gen_xts_cipher
&gen_xts_cipher("_gb");
&gen_xts_cipher("");
}}}

########################################
open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/\/\// and !/^$/);
	print;
}
close SELF;

foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;
	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";