# NOTE: the following header was residue from a git-blame/HTML viewer
# ("Commit | Line | Data" table) and is not part of the program.
#! /usr/bin/env perl
# Copyright 2020-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# This module implements SM4 with ASIMD on aarch64
#
# Feb 2022
#

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Locate the perlasm translator next to this script, falling back to the
# in-tree crypto/perlasm location.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# All generated code is printed through arm-xlate.pl, which rewrites it
# into the assembler flavour requested on the command line.
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
$prefix="vpsm4";
# Scratch vector registers, with q-register aliases for 128-bit loads.
my @vtmp=map("v$_",(0..3));
my @qtmp=map("q$_",(0..3));
# Two groups of four data blocks; @datax is the second half of the 8-block path.
my @data=map("v$_",(4..7));
my @datax=map("v$_",(8..11));
# Broadcast round keys and XOR accumulators for the round function.
my ($rk0,$rk1)=("v12","v13");
my ($rka,$rkb)=("v14","v15");
# NOTE(review): @vtmpx overlaps v12-v15, i.e. the same physical registers as
# $rk0/$rk1/$rka/$rkb — code using @vtmpx must not need the round keys live.
my @vtmpx=map("v$_",(12..15));
# v16..v31 hold the 256-byte S-box as sixteen 16-byte tbl/tbx tables.
my @sbox=map("v$_",(16..31));
my ($inp,$outp,$blocks,$rks)=("x0","x1","w2","x3");
# $tmpw/$tmp and $xtmp1/$wtmp1, $xtmp2/$wtmp2 alias the same physical
# registers (w6/x6, x8/w8, x9/w9) — some sequences rely on this.
my ($tmpw,$tmp,$wtmp0,$wtmp1,$wtmp2)=("w6","x6","w7","w8","w9");
my ($xtmp1,$xtmp2)=("x8","x9");
my ($ptr,$counter)=("x10","w11");
my ($word0,$word1,$word2,$word3)=("w12","w13","w14","w15");
43 | ||
# Emit a byte-order fixup for a vector of four 32-bit words.
# On little-endian builds this is a rev32; on big-endian builds the bytes
# are already in word order, so only a mov (or nothing, in-place) is needed.
# $src may be omitted for in-place operation on $dst.
sub rev32() {
	my $dst = shift;
	my $src = shift;

	if ($src and ("$src" ne "$dst")) {
		$code.=<<___;
#ifndef __AARCH64EB__
	rev32	$dst.16b,$src.16b
#else
	mov	$dst.16b,$src.16b
#endif
___
	} else {
		$code.=<<___;
#ifndef __AARCH64EB__
	rev32	$dst.16b,$dst.16b
#endif
___
	}
}
64 | ||
# Mirror image of rev32(): the rev32 is emitted only for big-endian builds
# (used where data must be byte-swapped on BE but left alone on LE).
# In-place use emits nothing on little-endian targets.
sub rev32_armeb() {
	my ($dst, $src) = @_;

	if ($src and ("$src" ne "$dst")) {
		$code.=<<___;
#ifdef __AARCH64EB__
	rev32	$dst.16b,$src.16b
#else
	mov	$dst.16b,$src.16b
#endif
___
	} else {
		$code.=<<___;
#ifdef __AARCH64EB__
	rev32	$dst.16b,$dst.16b
#endif
___
	}
}
85 | ||
# Emit a per-byte bit reversal when generating the "_gb" (bit-reversed
# tweak) variant; for the plain variant just copy the source.
# NOTE(review): the in-place branch still interpolates $src, so callers
# must pass both names even when they are equal (a lone $dst with "_gb"
# would emit a malformed operand) — confirm against call sites.
sub rbit() {
	my $dst = shift;
	my $src = shift;
	my $std = shift;

	if ($src and ("$src" ne "$dst")) {
		if ($std eq "_gb") {
			$code.=<<___;
	rbit	$dst.16b,$src.16b
___
		} else {
			$code.=<<___;
	mov	$dst.16b,$src.16b
___
		}
	} else {
		if ($std eq "_gb") {
			$code.=<<___;
	rbit	$dst.16b,$src.16b
___
		}
	}
}
109 | ||
# In-place 4x4 transpose of 32-bit words across four vectors, using
# $vt0..$vt3 as scratch: zip at 32-bit granularity, then at 64-bit.
sub transpose() {
	my ($dat0,$dat1,$dat2,$dat3,$vt0,$vt1,$vt2,$vt3) = @_;

	$code.=<<___;
	zip1	$vt0.4s,$dat0.4s,$dat1.4s
	zip2	$vt1.4s,$dat0.4s,$dat1.4s
	zip1	$vt2.4s,$dat2.4s,$dat3.4s
	zip2	$vt3.4s,$dat2.4s,$dat3.4s
	zip1	$dat0.2d,$vt0.2d,$vt2.2d
	zip2	$dat1.2d,$vt0.2d,$vt2.2d
	zip1	$dat2.2d,$vt1.2d,$vt3.2d
	zip2	$dat3.2d,$vt1.2d,$vt3.2d
___
}
124 | ||
# sbox operations for 4-lane of words.
# tbl can index at most 64 table bytes, so the 256-entry S-box is split
# into four 4-register lookups with indices rebased by 64/128/192;
# out-of-range indices yield 0, so the add chain merges the four partial
# results (exactly one contributor per byte is nonzero).  The tail applies
# SM4's linear transform L(B)=B^(B<<<2)^(B<<<10)^(B<<<18)^(B<<<24), each
# rotation built from an ushr/sli pair.
sub sbox() {
	my $dat = shift;

	$code.=<<___;
	movi	@vtmp[0].16b,#64
	movi	@vtmp[1].16b,#128
	movi	@vtmp[2].16b,#192
	sub	@vtmp[0].16b,$dat.16b,@vtmp[0].16b
	sub	@vtmp[1].16b,$dat.16b,@vtmp[1].16b
	sub	@vtmp[2].16b,$dat.16b,@vtmp[2].16b
	tbl	$dat.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$dat.16b
	tbl	@vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
	tbl	@vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
	tbl	@vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
	add	@vtmp[0].2d,@vtmp[0].2d,@vtmp[1].2d
	add	@vtmp[2].2d,@vtmp[2].2d,$dat.2d
	add	$dat.2d,@vtmp[0].2d,@vtmp[2].2d

	ushr	@vtmp[0].4s,$dat.4s,32-2
	sli	@vtmp[0].4s,$dat.4s,2
	ushr	@vtmp[2].4s,$dat.4s,32-10
	eor	@vtmp[1].16b,@vtmp[0].16b,$dat.16b
	sli	@vtmp[2].4s,$dat.4s,10
	eor	@vtmp[1].16b,@vtmp[2].16b,$vtmp[1].16b
	ushr	@vtmp[0].4s,$dat.4s,32-18
	sli	@vtmp[0].4s,$dat.4s,18
	ushr	@vtmp[2].4s,$dat.4s,32-24
	eor	@vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b
	sli	@vtmp[2].4s,$dat.4s,24
	eor	$dat.16b,@vtmp[2].16b,@vtmp[1].16b
___
}
158 | ||
# sbox operation for 8-lane of words (two vectors $dat and $datx).
# Same split-table lookup and linear transform L as sbox(), but with the
# 64/128/192 index rebasing done by repeated subtraction of a single #64
# constant, and the two vectors' instruction streams interleaved so the
# lookups and rotations of the halves can overlap in the pipeline.
sub sbox_double() {
	my $dat = shift;
	my $datx = shift;

	$code.=<<___;
	movi	@vtmp[3].16b,#64
	sub	@vtmp[0].16b,$dat.16b,@vtmp[3].16b
	sub	@vtmp[1].16b,@vtmp[0].16b,@vtmp[3].16b
	sub	@vtmp[2].16b,@vtmp[1].16b,@vtmp[3].16b
	tbl	$dat.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$dat.16b
	tbl	@vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
	tbl	@vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
	tbl	@vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
	add	@vtmp[1].2d,@vtmp[0].2d,@vtmp[1].2d
	add	$dat.2d,@vtmp[2].2d,$dat.2d
	add	$dat.2d,@vtmp[1].2d,$dat.2d

	sub	@vtmp[0].16b,$datx.16b,@vtmp[3].16b
	sub	@vtmp[1].16b,@vtmp[0].16b,@vtmp[3].16b
	sub	@vtmp[2].16b,@vtmp[1].16b,@vtmp[3].16b
	tbl	$datx.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$datx.16b
	tbl	@vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
	tbl	@vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
	tbl	@vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
	add	@vtmp[1].2d,@vtmp[0].2d,@vtmp[1].2d
	add	$datx.2d,@vtmp[2].2d,$datx.2d
	add	$datx.2d,@vtmp[1].2d,$datx.2d

	ushr	@vtmp[0].4s,$dat.4s,32-2
	sli	@vtmp[0].4s,$dat.4s,2
	ushr	@vtmp[2].4s,$datx.4s,32-2
	eor	@vtmp[1].16b,@vtmp[0].16b,$dat.16b
	sli	@vtmp[2].4s,$datx.4s,2

	ushr	@vtmp[0].4s,$dat.4s,32-10
	eor	@vtmp[3].16b,@vtmp[2].16b,$datx.16b
	sli	@vtmp[0].4s,$dat.4s,10
	ushr	@vtmp[2].4s,$datx.4s,32-10
	eor	@vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b
	sli	@vtmp[2].4s,$datx.4s,10

	ushr	@vtmp[0].4s,$dat.4s,32-18
	eor	@vtmp[3].16b,@vtmp[2].16b,$vtmp[3].16b
	sli	@vtmp[0].4s,$dat.4s,18
	ushr	@vtmp[2].4s,$datx.4s,32-18
	eor	@vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b
	sli	@vtmp[2].4s,$datx.4s,18

	ushr	@vtmp[0].4s,$dat.4s,32-24
	eor	@vtmp[3].16b,@vtmp[2].16b,$vtmp[3].16b
	sli	@vtmp[0].4s,$dat.4s,24
	ushr	@vtmp[2].4s,$datx.4s,32-24
	eor	$dat.16b,@vtmp[0].16b,@vtmp[1].16b
	sli	@vtmp[2].4s,$datx.4s,24
	eor	$datx.16b,@vtmp[2].16b,@vtmp[3].16b
___
}
217 | ||
# sbox operation for one single word held in scalar register $word.
# Same split-table tbl lookup as sbox(), but only lane .s[0] matters; the
# four partial results are merged and the linear transform L applied in
# scalar registers, where (B <<< n) is spelled "ror #32-n".
sub sbox_1word () {
	my $word = shift;

	$code.=<<___;
	movi	@vtmp[1].16b,#64
	movi	@vtmp[2].16b,#128
	movi	@vtmp[3].16b,#192
	mov	@vtmp[0].s[0],$word

	sub	@vtmp[1].16b,@vtmp[0].16b,@vtmp[1].16b
	sub	@vtmp[2].16b,@vtmp[0].16b,@vtmp[2].16b
	sub	@vtmp[3].16b,@vtmp[0].16b,@vtmp[3].16b

	tbl	@vtmp[0].16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},@vtmp[0].16b
	tbl	@vtmp[1].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[1].16b
	tbl	@vtmp[2].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[2].16b
	tbl	@vtmp[3].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[3].16b

	mov	$word,@vtmp[0].s[0]
	mov	$wtmp0,@vtmp[1].s[0]
	mov	$wtmp2,@vtmp[2].s[0]
	add	$wtmp0,$word,$wtmp0
	mov	$word,@vtmp[3].s[0]
	add	$wtmp0,$wtmp0,$wtmp2
	add	$wtmp0,$wtmp0,$word

	eor	$word,$wtmp0,$wtmp0,ror #32-2
	eor	$word,$word,$wtmp0,ror #32-10
	eor	$word,$word,$wtmp0,ror #32-18
	eor	$word,$word,$wtmp0,ror #32-24
___
}
251 | ||
# sm4 for one block of data, in scalar registers word0/word1/word2/word3.
# Emits four SM4 rounds; each ldp pulls two 32-bit round keys from $kptr
# (post-incremented by 8 bytes).  Callers loop this 8 times for 32 rounds.
sub sm4_1blk () {
	my $kptr = shift;

	$code.=<<___;
	ldp	$wtmp0,$wtmp1,[$kptr],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	$tmpw,$word2,$word3
	eor	$wtmp2,$wtmp0,$word1
	eor	$tmpw,$tmpw,$wtmp2
___
	&sbox_1word($tmpw);
	$code.=<<___;
	eor	$word0,$word0,$tmpw
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	$tmpw,$word2,$word3
	eor	$wtmp2,$word0,$wtmp1
	eor	$tmpw,$tmpw,$wtmp2
___
	&sbox_1word($tmpw);
	$code.=<<___;
	ldp	$wtmp0,$wtmp1,[$kptr],8
	eor	$word1,$word1,$tmpw
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	$tmpw,$word0,$word1
	eor	$wtmp2,$wtmp0,$word3
	eor	$tmpw,$tmpw,$wtmp2
___
	&sbox_1word($tmpw);
	$code.=<<___;
	eor	$word2,$word2,$tmpw
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	$tmpw,$word0,$word1
	eor	$wtmp2,$word2,$wtmp1
	eor	$tmpw,$tmpw,$wtmp2
___
	&sbox_1word($tmpw);
	$code.=<<___;
	eor	$word3,$word3,$tmpw
___
}
293 | ||
# sm4 for 4-lanes of data, in neon registers data0/data1/data2/data3.
# Emits four SM4 rounds; round keys are broadcast to all lanes with dup,
# and $rka caches B2^B3 (then B0^B1) so each second round reuses the XOR.
sub sm4_4blks () {
	my $kptr = shift;

	$code.=<<___;
	ldp	$wtmp0,$wtmp1,[$kptr],8
	dup	$rk0.4s,$wtmp0
	dup	$rk1.4s,$wtmp1

	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	$rka.16b,@data[2].16b,@data[3].16b
	eor	$rk0.16b,@data[1].16b,$rk0.16b
	eor	$rk0.16b,$rka.16b,$rk0.16b
___
	&sbox($rk0);
	$code.=<<___;
	eor	@data[0].16b,@data[0].16b,$rk0.16b

	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	$rka.16b,$rka.16b,@data[0].16b
	eor	$rk1.16b,$rka.16b,$rk1.16b
___
	&sbox($rk1);
	$code.=<<___;
	ldp	$wtmp0,$wtmp1,[$kptr],8
	eor	@data[1].16b,@data[1].16b,$rk1.16b

	dup	$rk0.4s,$wtmp0
	dup	$rk1.4s,$wtmp1

	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	$rka.16b,@data[0].16b,@data[1].16b
	eor	$rk0.16b,@data[3].16b,$rk0.16b
	eor	$rk0.16b,$rka.16b,$rk0.16b
___
	&sbox($rk0);
	$code.=<<___;
	eor	@data[2].16b,@data[2].16b,$rk0.16b

	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	$rka.16b,$rka.16b,@data[2].16b
	eor	$rk1.16b,$rka.16b,$rk1.16b
___
	&sbox($rk1);
	$code.=<<___;
	eor	@data[3].16b,@data[3].16b,$rk1.16b
___
}
342 | ||
# sm4 for 8 lanes of data, in neon registers
# data0/data1/data2/data3 datax0/datax1/datax2/datax3.
# Emits four SM4 rounds over both 4-block groups at once, feeding vector
# pairs through sbox_double; $rka/$rkb cache the shared XOR for each group.
sub sm4_8blks () {
	my $kptr = shift;

	$code.=<<___;
	ldp	$wtmp0,$wtmp1,[$kptr],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	dup	$rk0.4s,$wtmp0
	eor	$rka.16b,@data[2].16b,@data[3].16b
	eor	$rkb.16b,@datax[2].16b,@datax[3].16b
	eor	@vtmp[0].16b,@data[1].16b,$rk0.16b
	eor	@vtmp[1].16b,@datax[1].16b,$rk0.16b
	eor	$rk0.16b,$rka.16b,@vtmp[0].16b
	eor	$rk1.16b,$rkb.16b,@vtmp[1].16b
___
	&sbox_double($rk0,$rk1);
	$code.=<<___;
	eor	@data[0].16b,@data[0].16b,$rk0.16b
	eor	@datax[0].16b,@datax[0].16b,$rk1.16b

	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	dup	$rk1.4s,$wtmp1
	eor	$rka.16b,$rka.16b,@data[0].16b
	eor	$rkb.16b,$rkb.16b,@datax[0].16b
	eor	$rk0.16b,$rka.16b,$rk1.16b
	eor	$rk1.16b,$rkb.16b,$rk1.16b
___
	&sbox_double($rk0,$rk1);
	$code.=<<___;
	ldp	$wtmp0,$wtmp1,[$kptr],8
	eor	@data[1].16b,@data[1].16b,$rk0.16b
	eor	@datax[1].16b,@datax[1].16b,$rk1.16b

	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	dup	$rk0.4s,$wtmp0
	eor	$rka.16b,@data[0].16b,@data[1].16b
	eor	$rkb.16b,@datax[0].16b,@datax[1].16b
	eor	@vtmp[0].16b,@data[3].16b,$rk0.16b
	eor	@vtmp[1].16b,@datax[3].16b,$rk0.16b
	eor	$rk0.16b,$rka.16b,@vtmp[0].16b
	eor	$rk1.16b,$rkb.16b,@vtmp[1].16b
___
	&sbox_double($rk0,$rk1);
	$code.=<<___;
	eor	@data[2].16b,@data[2].16b,$rk0.16b
	eor	@datax[2].16b,@datax[2].16b,$rk1.16b

	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	dup	$rk1.4s,$wtmp1
	eor	$rka.16b,$rka.16b,@data[2].16b
	eor	$rkb.16b,$rkb.16b,@datax[2].16b
	eor	$rk0.16b,$rka.16b,$rk1.16b
	eor	$rk1.16b,$rkb.16b,$rk1.16b
___
	&sbox_double($rk0,$rk1);
	$code.=<<___;
	eor	@data[3].16b,@data[3].16b,$rk0.16b
	eor	@datax[3].16b,@datax[3].16b,$rk1.16b
___
}
404 | ||
# Encrypt the one block held in vector $dat (words already in SM4 order).
# Moves the words to scalar registers, runs 8 iterations of sm4_1blk
# (32 rounds total, keys at $rks), then applies the final reverse
# transform R by storing the words back swapped.  No byte-order fixup is
# done on output; see encrypt_1blk() for the rev32'd variant.
sub encrypt_1blk_norev() {
	my $dat = shift;

	$code.=<<___;
	mov	$ptr,$rks
	mov	$counter,#8
	mov	$word0,$dat.s[0]
	mov	$word1,$dat.s[1]
	mov	$word2,$dat.s[2]
	mov	$word3,$dat.s[3]
10:
___
	&sm4_1blk($ptr);
	$code.=<<___;
	subs	$counter,$counter,#1
	b.ne	10b
	mov	$dat.s[0],$word3
	mov	$dat.s[1],$word2
	mov	$dat.s[2],$word1
	mov	$dat.s[3],$word0
___
}
427 | ||
# Encrypt one block in vector $dat and restore output byte order.
sub encrypt_1blk() {
	my ($dat) = @_;

	&encrypt_1blk_norev($dat);
	&rev32($dat, $dat);
}
434 | ||
# Encrypt 4 word-sliced blocks in @data (32 rounds, keys at $rks).
# Results land in @vtmp[3..0] — the reversed register order performs the
# final word-swap R across the transposed 4-lane layout.
sub encrypt_4blks() {
	$code.=<<___;
	mov	$ptr,$rks
	mov	$counter,#8
10:
___
	&sm4_4blks($ptr);
	$code.=<<___;
	subs	$counter,$counter,#1
	b.ne	10b
___
	&rev32(@vtmp[3],@data[0]);
	&rev32(@vtmp[2],@data[1]);
	&rev32(@vtmp[1],@data[2]);
	&rev32(@vtmp[0],@data[3]);
}
451 | ||
# Encrypt 8 word-sliced blocks (@data plus @datax).  As in encrypt_4blks,
# the reversed destination order implements the final word swap: the first
# group ends up in @vtmp[3..0], the second in @data[3..0].
sub encrypt_8blks() {
	$code.=<<___;
	mov	$ptr,$rks
	mov	$counter,#8
10:
___
	&sm4_8blks($ptr);
	$code.=<<___;
	subs	$counter,$counter,#1
	b.ne	10b
___
	&rev32(@vtmp[3],@data[0]);
	&rev32(@vtmp[2],@data[1]);
	&rev32(@vtmp[1],@data[2]);
	&rev32(@vtmp[0],@data[3]);
	&rev32(@data[3],@datax[0]);
	&rev32(@data[2],@datax[1]);
	&rev32(@data[1],@datax[2]);
	&rev32(@data[0],@datax[3]);
}
472 | ||
# Load the 256-byte SM4 S-box into v16..v31 (@sbox) as tbl/tbx tables.
# Fix: dropped the unused "my $data = shift;" — all call sites invoke
# &load_sbox() with no arguments.
sub load_sbox () {
	$code.=<<___;
	adr	$ptr,.Lsbox
	ld1	{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},[$ptr],#64
	ld1	{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},[$ptr],#64
	ld1	{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},[$ptr],#64
	ld1	{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},[$ptr]
___
}
484 | ||
c007203b XY |
485 | |
# Pack two 64-bit GPRs into the low/high doublewords of vector $desv,
# then fix word order for big-endian builds.
sub mov_reg_to_vec() {
	my ($src0, $src1, $desv) = @_;

	$code.=<<___;
	mov	$desv.d[0],$src0
	mov	$desv.d[1],$src1
___
	&rev32_armeb($desv, $desv);
}
496 | ||
# Unpack vector $srcv into two 64-bit GPRs (low doubleword first).
sub mov_vec_to_reg() {
	my ($srcv, $des0, $des1) = @_;

	$code.=<<___;
	mov	$des0,$srcv.d[0]
	mov	$des1,$srcv.d[1]
___
}
506 | ||
# Advance an XTS tweak held in two 64-bit GPRs ($src1:$src0): multiply by
# x in GF(2^128) with reduction polynomial 0x87.
#   des1 = (src1<<1) | (src0>>63)            (the extr #63)
#   des0 = (src0<<1) ^ (src1's top bit ? 0x87 : 0)
# The extr #32 rotates src1 so its high word is visible in $wtmp2, and
# "asr #31" broadcasts the original bit 63.  This relies on $xtmp2/$wtmp2
# being x9/w9 and $xtmp1/$wtmp1 being x8/w8: the 32-bit "and" into $wtmp1
# zero-extends, so $xtmp1 holds exactly the reduction constant.
sub compute_tweak() {
	my $src0 = shift;
	my $src1 = shift;
	my $des0 = shift;
	my $des1 = shift;
	$code.=<<___;
	mov	$wtmp0,0x87
	extr	$xtmp2,$src1,$src1,#32
	extr	$des1,$src1,$src0,#63
	and	$wtmp1,$wtmp0,$wtmp2,asr#31
	eor	$des0,$xtmp1,$src0,lsl#1
___
}
520 | ||
# Vector XTS tweak update: multiply $src by x in GF(2^128), result in $des.
# For the "_gb" variant the tweak is kept bit-reversed, so rbit is applied
# going in and coming out (a no-op copy otherwise).  Bytewise method: shl
# shifts every byte left by one; ext/ushr extract each byte's lost MSB from
# its neighbour; mul by .Lxts_magic (0x87 in the low byte, 0x01 elsewhere)
# turns the carry into the reduction constant at the low end and a plain
# carry everywhere else, which eor folds back in.
sub compute_tweak_vec() {
	my $src = shift;
	my $des = shift;
	my $std = shift;
	&rbit(@vtmp[2],$src,$std);
	$code.=<<___;
	ldr	@qtmp[0], .Lxts_magic
	shl	$des.16b, @vtmp[2].16b, #1
	ext	@vtmp[1].16b, @vtmp[2].16b, @vtmp[2].16b,#15
	ushr	@vtmp[1].16b, @vtmp[1].16b, #7
	mul	@vtmp[1].16b, @vtmp[1].16b, @vtmp[0].16b
	eor	$des.16b, $des.16b, @vtmp[1].16b
___
	&rbit($des,$des,$std);
}
536 | ||
4908787f DH |
# Assembly preamble plus constant tables: the SM4 S-box (.Lsbox), the key
# schedule constants CK (.Lck), the system parameter FK (.Lfk), the byte
# shuffle used to rotate key words during scheduling (.Lshuffles), and the
# GF(2^128) reduction constant for the vector XTS tweak (.Lxts_magic).
# Note this is an assignment ($code=), starting the output buffer.
$code=<<___;
#include "arm_arch.h"
.arch	armv8-a
.text

.type	_vpsm4_consts,%object
.align	7
_vpsm4_consts:
.Lsbox:
	.byte 0xD6,0x90,0xE9,0xFE,0xCC,0xE1,0x3D,0xB7,0x16,0xB6,0x14,0xC2,0x28,0xFB,0x2C,0x05
	.byte 0x2B,0x67,0x9A,0x76,0x2A,0xBE,0x04,0xC3,0xAA,0x44,0x13,0x26,0x49,0x86,0x06,0x99
	.byte 0x9C,0x42,0x50,0xF4,0x91,0xEF,0x98,0x7A,0x33,0x54,0x0B,0x43,0xED,0xCF,0xAC,0x62
	.byte 0xE4,0xB3,0x1C,0xA9,0xC9,0x08,0xE8,0x95,0x80,0xDF,0x94,0xFA,0x75,0x8F,0x3F,0xA6
	.byte 0x47,0x07,0xA7,0xFC,0xF3,0x73,0x17,0xBA,0x83,0x59,0x3C,0x19,0xE6,0x85,0x4F,0xA8
	.byte 0x68,0x6B,0x81,0xB2,0x71,0x64,0xDA,0x8B,0xF8,0xEB,0x0F,0x4B,0x70,0x56,0x9D,0x35
	.byte 0x1E,0x24,0x0E,0x5E,0x63,0x58,0xD1,0xA2,0x25,0x22,0x7C,0x3B,0x01,0x21,0x78,0x87
	.byte 0xD4,0x00,0x46,0x57,0x9F,0xD3,0x27,0x52,0x4C,0x36,0x02,0xE7,0xA0,0xC4,0xC8,0x9E
	.byte 0xEA,0xBF,0x8A,0xD2,0x40,0xC7,0x38,0xB5,0xA3,0xF7,0xF2,0xCE,0xF9,0x61,0x15,0xA1
	.byte 0xE0,0xAE,0x5D,0xA4,0x9B,0x34,0x1A,0x55,0xAD,0x93,0x32,0x30,0xF5,0x8C,0xB1,0xE3
	.byte 0x1D,0xF6,0xE2,0x2E,0x82,0x66,0xCA,0x60,0xC0,0x29,0x23,0xAB,0x0D,0x53,0x4E,0x6F
	.byte 0xD5,0xDB,0x37,0x45,0xDE,0xFD,0x8E,0x2F,0x03,0xFF,0x6A,0x72,0x6D,0x6C,0x5B,0x51
	.byte 0x8D,0x1B,0xAF,0x92,0xBB,0xDD,0xBC,0x7F,0x11,0xD9,0x5C,0x41,0x1F,0x10,0x5A,0xD8
	.byte 0x0A,0xC1,0x31,0x88,0xA5,0xCD,0x7B,0xBD,0x2D,0x74,0xD0,0x12,0xB8,0xE5,0xB4,0xB0
	.byte 0x89,0x69,0x97,0x4A,0x0C,0x96,0x77,0x7E,0x65,0xB9,0xF1,0x09,0xC5,0x6E,0xC6,0x84
	.byte 0x18,0xF0,0x7D,0xEC,0x3A,0xDC,0x4D,0x20,0x79,0xEE,0x5F,0x3E,0xD7,0xCB,0x39,0x48
.Lck:
	.long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
	.long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
	.long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
	.long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
	.long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
	.long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
	.long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
	.long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
.Lfk:
	.quad 0x56aa3350a3b1bac6,0xb27022dc677d9197
.Lshuffles:
	.quad 0x0B0A090807060504,0x030201000F0E0D0C
.Lxts_magic:
	.quad 0x0101010101010187,0x0101010101010101

.size	_vpsm4_consts,.-_vpsm4_consts
___
580 | ||
{{{
my ($key,$keys,$enc)=("x0","x1","w2");
my ($pointer,$schedules,$wtmp,$roundkey)=("x5","x6","w7","w8");
my ($vkey,$vfk,$vmap)=("v5","v6","v7");
# Key schedule: expand the 128-bit key at $key into 32 round keys at
# $keys.  $enc selects the store direction — forward for encryption,
# backward from offset 124 for decryption, so decryption reuses the same
# round code with reversed keys.  Each iteration does the schedule's
# split-table S-box lookup with tbl/tbx (indices rebased by repeated
# subtraction of #64), applies the key-schedule linear transform
# (ror #19 / ror #9 terms), and rotates the key words via the .Lshuffles
# tbl permutation.
$code.=<<___;
.type	_vpsm4_set_key,%function
.align	4
_vpsm4_set_key:
	AARCH64_VALID_CALL_TARGET
	ld1	{$vkey.4s},[$key]
___
&load_sbox();
&rev32($vkey,$vkey);
$code.=<<___;
	adr	$pointer,.Lshuffles
	ld1	{$vmap.2d},[$pointer]
	adr	$pointer,.Lfk
	ld1	{$vfk.2d},[$pointer]
	eor	$vkey.16b,$vkey.16b,$vfk.16b
	mov	$schedules,#32
	adr	$pointer,.Lck
	movi	@vtmp[0].16b,#64
	cbnz	$enc,1f
	add	$keys,$keys,124
1:
	mov	$wtmp,$vkey.s[1]
	ldr	$roundkey,[$pointer],#4
	eor	$roundkey,$roundkey,$wtmp
	mov	$wtmp,$vkey.s[2]
	eor	$roundkey,$roundkey,$wtmp
	mov	$wtmp,$vkey.s[3]
	eor	$roundkey,$roundkey,$wtmp
	// sbox lookup
	mov	@data[0].s[0],$roundkey
	tbl	@vtmp[1].16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},@data[0].16b
	sub	@data[0].16b,@data[0].16b,@vtmp[0].16b
	tbx	@vtmp[1].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@data[0].16b
	sub	@data[0].16b,@data[0].16b,@vtmp[0].16b
	tbx	@vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@data[0].16b
	sub	@data[0].16b,@data[0].16b,@vtmp[0].16b
	tbx	@vtmp[1].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@data[0].16b
	mov	$wtmp,@vtmp[1].s[0]
	eor	$roundkey,$wtmp,$wtmp,ror #19
	eor	$roundkey,$roundkey,$wtmp,ror #9
	mov	$wtmp,$vkey.s[0]
	eor	$roundkey,$roundkey,$wtmp
	mov	$vkey.s[0],$roundkey
	cbz	$enc,2f
	str	$roundkey,[$keys],#4
	b	3f
2:
	str	$roundkey,[$keys],#-4
3:
	tbl	$vkey.16b,{$vkey.16b},$vmap.16b
	subs	$schedules,$schedules,#1
	b.ne	1b
	ret
.size	_vpsm4_set_key,.-_vpsm4_set_key
___
}}}
641 | ||
642 | ||
{{{
# Internal subroutine: encrypt the 4 word-sliced blocks in @data with the
# keys at $rks; byte-swapped results are left in @vtmp (reversed order —
# see encrypt_4blks).
$code.=<<___;
.type	_vpsm4_enc_4blks,%function
.align	4
_vpsm4_enc_4blks:
	AARCH64_VALID_CALL_TARGET
___
&encrypt_4blks();
$code.=<<___;
	ret
.size	_vpsm4_enc_4blks,.-_vpsm4_enc_4blks
___
}}}
656 | ||
{{{
# Internal subroutine: encrypt the 8 word-sliced blocks in @data/@datax;
# results are left in @vtmp and @data (reversed order — see encrypt_8blks).
$code.=<<___;
.type	_vpsm4_enc_8blks,%function
.align	4
_vpsm4_enc_8blks:
	AARCH64_VALID_CALL_TARGET
___
&encrypt_8blks();
$code.=<<___;
	ret
.size	_vpsm4_enc_8blks,.-_vpsm4_enc_8blks
___
}}}
670 | ||
671 | ||
{{{
my ($key,$keys)=("x0","x1");
# Public entry: expand $key into encryption-order round keys at $keys
# (w2=1 selects the forward store direction in _vpsm4_set_key).
$code.=<<___;
.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	mov	w2,1
	bl	_vpsm4_set_key
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
___
}}}
689 | ||
{{{
my ($key,$keys)=("x0","x1");
# Public entry: expand $key into decryption-order (reversed) round keys
# at $keys (w2=0 selects the backward store direction in _vpsm4_set_key).
$code.=<<___;
.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	mov	w2,0
	bl	_vpsm4_set_key
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
707 | ||
{{{
# Emit ${prefix}_encrypt / ${prefix}_decrypt: single-block (ECB) entry
# points.  Both directions run the same encrypt code path; decryption
# works because _vpsm4_set_key stores its round keys in reverse order.
sub gen_block () {
	my $dir = shift;
	my ($inp,$outp,$rk)=map("x$_",(0..2));

	$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
	AARCH64_VALID_CALL_TARGET
	ld1	{@data[0].4s},[$inp]
___
	&load_sbox();
	&rev32(@data[0],@data[0]);
	$code.=<<___;
	mov	$rks,x2
___
	&encrypt_1blk(@data[0]);
	$code.=<<___;
	st1	{@data[0].4s},[$outp]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}
736 | ||
{{{
my ($enc) = ("w4");
my @dat=map("v$_",(16..23));

# ECB mode: consume 8 blocks per iteration while possible, then 4, then a
# 1-3 block tail.  ld4/st4 word-slice groups of four blocks so the vector
# round function operates on 4 lanes at once; the single-block tail goes
# through the scalar encrypt_1blk path.  d8-d15 are callee-saved and
# spilled because the 8-block path uses v8-v15.
$code.=<<___;
.globl	${prefix}_ecb_encrypt
.type	${prefix}_ecb_encrypt,%function
.align	5
${prefix}_ecb_encrypt:
	AARCH64_SIGN_LINK_REGISTER
	// convert length into blocks
	lsr	x2,x2,4
	stp	d8,d9,[sp,#-80]!
	stp	d10,d11,[sp,#16]
	stp	d12,d13,[sp,#32]
	stp	d14,d15,[sp,#48]
	stp	x29,x30,[sp,#64]
___
&load_sbox();
$code.=<<___;
.Lecb_8_blocks_process:
	cmp	$blocks,#8
	b.lt	.Lecb_4_blocks_process
	ld4	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
	ld4	{@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
___
&rev32(@data[0],@data[0]);
&rev32(@data[1],@data[1]);
&rev32(@data[2],@data[2]);
&rev32(@data[3],@data[3]);
&rev32(@datax[0],@datax[0]);
&rev32(@datax[1],@datax[1]);
&rev32(@datax[2],@datax[2]);
&rev32(@datax[3],@datax[3]);
$code.=<<___;
	bl	_vpsm4_enc_8blks
	st4	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
	st4	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
	subs	$blocks,$blocks,#8
	b.gt	.Lecb_8_blocks_process
	b	100f
.Lecb_4_blocks_process:
	cmp	$blocks,#4
	b.lt	1f
	ld4	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
___
&rev32(@data[0],@data[0]);
&rev32(@data[1],@data[1]);
&rev32(@data[2],@data[2]);
&rev32(@data[3],@data[3]);
$code.=<<___;
	bl	_vpsm4_enc_4blks
	st4	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
	sub	$blocks,$blocks,#4
1:
	// process last block
	cmp	$blocks,#1
	b.lt	100f
	b.gt	1f
	ld1	{@data[0].4s},[$inp]
___
&rev32(@data[0],@data[0]);
&encrypt_1blk(@data[0]);
$code.=<<___;
	st1	{@data[0].4s},[$outp]
	b	100f
1:	// process last 2 blocks
	ld4	{@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp],#16
	ld4	{@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$inp],#16
	cmp	$blocks,#2
	b.gt	1f
___
&rev32(@data[0],@data[0]);
&rev32(@data[1],@data[1]);
&rev32(@data[2],@data[2]);
&rev32(@data[3],@data[3]);
$code.=<<___;
	bl	_vpsm4_enc_4blks
	st4	{@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
	st4	{@vtmp[0].s-@vtmp[3].s}[1],[$outp]
	b	100f
1:	// process last 3 blocks
	ld4	{@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$inp],#16
___
&rev32(@data[0],@data[0]);
&rev32(@data[1],@data[1]);
&rev32(@data[2],@data[2]);
&rev32(@data[3],@data[3]);
$code.=<<___;
	bl	_vpsm4_enc_4blks
	st4	{@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
	st4	{@vtmp[0].s-@vtmp[3].s}[1],[$outp],#16
	st4	{@vtmp[0].s-@vtmp[3].s}[2],[$outp]
100:
	ldp	d10,d11,[sp,#16]
	ldp	d12,d13,[sp,#32]
	ldp	d14,d15,[sp,#48]
	ldp	x29,x30,[sp,#64]
	ldp	d8,d9,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
___
}}}
841 | ||
842 | {{{ | |
843 | my ($len,$ivp,$enc)=("x2","x4","w5"); | |
844 | my $ivec0=("v3"); | |
845 | my $ivec1=("v15"); | |
846 | ||
847 | $code.=<<___; | |
848 | .globl ${prefix}_cbc_encrypt | |
849 | .type ${prefix}_cbc_encrypt,%function | |
850 | .align 5 | |
851 | ${prefix}_cbc_encrypt: | |
852 | AARCH64_VALID_CALL_TARGET | |
853 | lsr $len,$len,4 | |
854 | ___ | |
855 | &load_sbox(); | |
856 | $code.=<<___; | |
857 | cbz $enc,.Ldec | |
858 | ld1 {$ivec0.4s},[$ivp] | |
859 | .Lcbc_4_blocks_enc: | |
860 | cmp $blocks,#4 | |
861 | b.lt 1f | |
862 | ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 | |
863 | eor @data[0].16b,@data[0].16b,$ivec0.16b | |
864 | ___ | |
865 | &rev32(@data[1],@data[1]); | |
866 | &rev32(@data[0],@data[0]); | |
867 | &rev32(@data[2],@data[2]); | |
868 | &rev32(@data[3],@data[3]); | |
869 | &encrypt_1blk_norev(@data[0]); | |
870 | $code.=<<___; | |
871 | eor @data[1].16b,@data[1].16b,@data[0].16b | |
872 | ___ | |
873 | &encrypt_1blk_norev(@data[1]); | |
874 | &rev32(@data[0],@data[0]); | |
875 | ||
876 | $code.=<<___; | |
877 | eor @data[2].16b,@data[2].16b,@data[1].16b | |
878 | ___ | |
879 | &encrypt_1blk_norev(@data[2]); | |
880 | &rev32(@data[1],@data[1]); | |
881 | $code.=<<___; | |
882 | eor @data[3].16b,@data[3].16b,@data[2].16b | |
883 | ___ | |
884 | &encrypt_1blk_norev(@data[3]); | |
885 | &rev32(@data[2],@data[2]); | |
886 | &rev32(@data[3],@data[3]); | |
887 | $code.=<<___; | |
888 | orr $ivec0.16b,@data[3].16b,@data[3].16b | |
889 | st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64 | |
890 | subs $blocks,$blocks,#4 | |
891 | b.ne .Lcbc_4_blocks_enc | |
892 | b 2f | |
893 | 1: | |
894 | subs $blocks,$blocks,#1 | |
895 | b.lt 2f | |
896 | ld1 {@data[0].4s},[$inp],#16 | |
897 | eor $ivec0.16b,$ivec0.16b,@data[0].16b | |
898 | ___ | |
899 | &rev32($ivec0,$ivec0); | |
900 | &encrypt_1blk($ivec0); | |
901 | $code.=<<___; | |
accd3bdd | 902 | st1 {$ivec0.4s},[$outp],#16 |
4908787f DH |
903 | b 1b |
904 | 2: | |
905 | // save back IV | |
accd3bdd | 906 | st1 {$ivec0.4s},[$ivp] |
4908787f DH |
907 | ret |
908 | ||
909 | .Ldec: | |
910 | // decryption mode starts | |
911 | AARCH64_SIGN_LINK_REGISTER | |
912 | stp d8,d9,[sp,#-80]! | |
913 | stp d10,d11,[sp,#16] | |
914 | stp d12,d13,[sp,#32] | |
915 | stp d14,d15,[sp,#48] | |
916 | stp x29,x30,[sp,#64] | |
917 | .Lcbc_8_blocks_dec: | |
918 | cmp $blocks,#8 | |
919 | b.lt 1f | |
920 | ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp] | |
921 | add $ptr,$inp,#64 | |
922 | ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$ptr] | |
923 | ___ | |
924 | &rev32(@data[0],@data[0]); | |
925 | &rev32(@data[1],@data[1]); | |
926 | &rev32(@data[2],@data[2]); | |
927 | &rev32(@data[3],$data[3]); | |
928 | &rev32(@datax[0],@datax[0]); | |
929 | &rev32(@datax[1],@datax[1]); | |
930 | &rev32(@datax[2],@datax[2]); | |
931 | &rev32(@datax[3],$datax[3]); | |
932 | $code.=<<___; | |
933 | bl _vpsm4_enc_8blks | |
934 | ___ | |
935 | &transpose(@vtmp,@datax); | |
936 | &transpose(@data,@datax); | |
937 | $code.=<<___; | |
accd3bdd | 938 | ld1 {$ivec1.4s},[$ivp] |
4908787f DH |
939 | ld1 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64 |
940 | // note ivec1 and vtmpx[3] are resuing the same register | |
941 | // care needs to be taken to avoid conflict | |
942 | eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b | |
943 | ld1 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64 | |
944 | eor @vtmp[1].16b,@vtmp[1].16b,@datax[0].16b | |
945 | eor @vtmp[2].16b,@vtmp[2].16b,@datax[1].16b | |
946 | eor @vtmp[3].16b,$vtmp[3].16b,@datax[2].16b | |
947 | // save back IV | |
accd3bdd | 948 | st1 {$vtmpx[3].4s}, [$ivp] |
4908787f DH |
949 | eor @data[0].16b,@data[0].16b,$datax[3].16b |
950 | eor @data[1].16b,@data[1].16b,@vtmpx[0].16b | |
951 | eor @data[2].16b,@data[2].16b,@vtmpx[1].16b | |
952 | eor @data[3].16b,$data[3].16b,@vtmpx[2].16b | |
953 | st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 | |
954 | st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64 | |
955 | subs $blocks,$blocks,#8 | |
956 | b.gt .Lcbc_8_blocks_dec | |
957 | b.eq 100f | |
958 | 1: | |
accd3bdd | 959 | ld1 {$ivec1.4s},[$ivp] |
4908787f DH |
960 | .Lcbc_4_blocks_dec: |
961 | cmp $blocks,#4 | |
962 | b.lt 1f | |
963 | ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp] | |
964 | ___ | |
965 | &rev32(@data[0],@data[0]); | |
966 | &rev32(@data[1],@data[1]); | |
967 | &rev32(@data[2],@data[2]); | |
968 | &rev32(@data[3],$data[3]); | |
969 | $code.=<<___; | |
970 | bl _vpsm4_enc_4blks | |
971 | ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 | |
972 | ___ | |
973 | &transpose(@vtmp,@datax); | |
974 | $code.=<<___; | |
975 | eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b | |
976 | eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b | |
977 | orr $ivec1.16b,@data[3].16b,@data[3].16b | |
978 | eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b | |
979 | eor @vtmp[3].16b,$vtmp[3].16b,@data[2].16b | |
980 | st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 | |
981 | subs $blocks,$blocks,#4 | |
982 | b.gt .Lcbc_4_blocks_dec | |
983 | // save back IV | |
accd3bdd | 984 | st1 {@data[3].4s}, [$ivp] |
4908787f DH |
985 | b 100f |
986 | 1: // last block | |
987 | subs $blocks,$blocks,#1 | |
988 | b.lt 100f | |
989 | b.gt 1f | |
990 | ld1 {@data[0].4s},[$inp],#16 | |
991 | // save back IV | |
accd3bdd | 992 | st1 {$data[0].4s}, [$ivp] |
4908787f DH |
993 | ___ |
994 | &rev32(@datax[0],@data[0]); | |
995 | &encrypt_1blk(@datax[0]); | |
996 | $code.=<<___; | |
997 | eor @datax[0].16b,@datax[0].16b,$ivec1.16b | |
accd3bdd | 998 | st1 {@datax[0].4s},[$outp],#16 |
4908787f DH |
999 | b 100f |
1000 | 1: // last two blocks | |
1001 | ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp] | |
1002 | add $ptr,$inp,#16 | |
1003 | ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$ptr],#16 | |
1004 | subs $blocks,$blocks,1 | |
1005 | b.gt 1f | |
1006 | ___ | |
1007 | &rev32(@data[0],@data[0]); | |
1008 | &rev32(@data[1],@data[1]); | |
1009 | &rev32(@data[2],@data[2]); | |
1010 | &rev32(@data[3],@data[3]); | |
1011 | $code.=<<___; | |
1012 | bl _vpsm4_enc_4blks | |
1013 | ld1 {@data[0].4s,@data[1].4s},[$inp],#32 | |
1014 | ___ | |
1015 | &transpose(@vtmp,@datax); | |
1016 | $code.=<<___; | |
1017 | eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b | |
1018 | eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b | |
1019 | st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32 | |
1020 | // save back IV | |
accd3bdd | 1021 | st1 {@data[1].4s}, [$ivp] |
4908787f DH |
1022 | b 100f |
1023 | 1: // last 3 blocks | |
1024 | ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$ptr] | |
1025 | ___ | |
1026 | &rev32(@data[0],@data[0]); | |
1027 | &rev32(@data[1],@data[1]); | |
1028 | &rev32(@data[2],@data[2]); | |
1029 | &rev32(@data[3],@data[3]); | |
1030 | $code.=<<___; | |
1031 | bl _vpsm4_enc_4blks | |
1032 | ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48 | |
1033 | ___ | |
1034 | &transpose(@vtmp,@datax); | |
1035 | $code.=<<___; | |
1036 | eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b | |
1037 | eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b | |
1038 | eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b | |
1039 | st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48 | |
1040 | // save back IV | |
accd3bdd | 1041 | st1 {@data[2].4s}, [$ivp] |
4908787f DH |
1042 | 100: |
1043 | ldp d10,d11,[sp,#16] | |
1044 | ldp d12,d13,[sp,#32] | |
1045 | ldp d14,d15,[sp,#48] | |
1046 | ldp x29,x30,[sp,#64] | |
1047 | ldp d8,d9,[sp],#80 | |
1048 | AARCH64_VALIDATE_LINK_REGISTER | |
1049 | ret | |
1050 | .size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt | |
1051 | ___ | |
1052 | }}} | |
1053 | ||
{{{
# CTR32 mode: ${prefix}_ctr32_encrypt_blocks
# Register interface (see the declarations near the top of this file):
#   $inp (x0)    input ciphertext/plaintext pointer
#   $outp (x1)   output pointer
#   $blocks (w2) number of 16-byte blocks to process
#   $rks (x3)    round keys
#   $ivp (x4)    16-byte counter block (IV); last 32-bit word is the counter
my ($ivp)=("x4");
my ($ctr)=("w5");
my $ivec=("v3");

$code.=<<___;
.globl ${prefix}_ctr32_encrypt_blocks
.type ${prefix}_ctr32_encrypt_blocks,%function
.align 5
${prefix}_ctr32_encrypt_blocks:
	AARCH64_VALID_CALL_TARGET
	ld1 {$ivec.4s},[$ivp]
___
	# Byte-swap the counter block into host word order and load the S-box tables.
	&rev32($ivec,$ivec);
	&load_sbox();
$code.=<<___;
	cmp $blocks,#1
	b.ne 1f
	// fast processing for one single block without
	// context saving overhead
___
	&encrypt_1blk($ivec);
$code.=<<___;
	ld1 {@data[0].4s},[$inp]
	eor @data[0].16b,@data[0].16b,$ivec.16b
	st1 {@data[0].4s},[$outp]
	ret
1:
	AARCH64_SIGN_LINK_REGISTER
	stp d8,d9,[sp,#-80]!
	stp d10,d11,[sp,#16]
	stp d12,d13,[sp,#32]
	stp d14,d15,[sp,#48]
	stp x29,x30,[sp,#64]
	mov $word0,$ivec.s[0]
	mov $word1,$ivec.s[1]
	mov $word2,$ivec.s[2]
	mov $ctr,$ivec.s[3]
.Lctr32_4_blocks_process:
	cmp $blocks,#4
	b.lt 1f
	dup @data[0].4s,$word0
	dup @data[1].4s,$word1
	dup @data[2].4s,$word2
	mov @data[3].s[0],$ctr
	add $ctr,$ctr,#1
	mov $data[3].s[1],$ctr
	add $ctr,$ctr,#1
	mov @data[3].s[2],$ctr
	add $ctr,$ctr,#1
	mov @data[3].s[3],$ctr
	add $ctr,$ctr,#1
	cmp $blocks,#8
	b.ge .Lctr32_8_blocks_process
	bl _vpsm4_enc_4blks
	ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
	eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
	eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
	eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
	eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
	st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
	subs $blocks,$blocks,#4
	b.ne .Lctr32_4_blocks_process
	b 100f
.Lctr32_8_blocks_process:
	dup @datax[0].4s,$word0
	dup @datax[1].4s,$word1
	dup @datax[2].4s,$word2
	mov @datax[3].s[0],$ctr
	add $ctr,$ctr,#1
	mov $datax[3].s[1],$ctr
	add $ctr,$ctr,#1
	mov @datax[3].s[2],$ctr
	add $ctr,$ctr,#1
	mov @datax[3].s[3],$ctr
	add $ctr,$ctr,#1
	bl _vpsm4_enc_8blks
	ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
	ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
	eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
	eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
	eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
	eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
	eor @data[0].16b,@data[0].16b,@datax[0].16b
	eor @data[1].16b,@data[1].16b,@datax[1].16b
	eor @data[2].16b,@data[2].16b,@datax[2].16b
	eor @data[3].16b,@data[3].16b,@datax[3].16b
	st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
	st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
	subs $blocks,$blocks,#8
	b.ne .Lctr32_4_blocks_process
	b 100f
1:	// last block processing
	subs $blocks,$blocks,#1
	b.lt 100f
	b.gt 1f
	mov $ivec.s[0],$word0
	mov $ivec.s[1],$word1
	mov $ivec.s[2],$word2
	mov $ivec.s[3],$ctr
___
	&encrypt_1blk($ivec);
$code.=<<___;
	ld1 {@data[0].4s},[$inp]
	eor @data[0].16b,@data[0].16b,$ivec.16b
	st1 {@data[0].4s},[$outp]
	b 100f
1:	// last 2 blocks processing
	dup @data[0].4s,$word0
	dup @data[1].4s,$word1
	dup @data[2].4s,$word2
	mov @data[3].s[0],$ctr
	add $ctr,$ctr,#1
	mov @data[3].s[1],$ctr
	subs $blocks,$blocks,#1
	b.ne 1f
	bl _vpsm4_enc_4blks
	ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
	ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
	eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
	eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
	eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
	eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
	st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
	st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
	b 100f
1:	// last 3 blocks processing
	add $ctr,$ctr,#1
	mov @data[3].s[2],$ctr
	bl _vpsm4_enc_4blks
	ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
	ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
	ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[2],[$inp],#16
	eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
	eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
	eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
	eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
	st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
	st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
	st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[2],[$outp],#16
100:
	ldp d10,d11,[sp,#16]
	ldp d12,d13,[sp,#32]
	ldp d14,d15,[sp,#48]
	ldp x29,x30,[sp,#64]
	ldp d8,d9,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
# Note for maintainers: the heredocs above occasionally mix `$data[3]` with
# `@data[3]` (likewise `$datax[3]`, `$vtmpx[...]`). In Perl string
# interpolation a single-element slice and a scalar element expand to the
# same register name, so the emitted assembly is identical either way.
}}}
c007203b XY |
1205 | |
{{{
# XTS mode. Two variants are generated by gen_xts_cipher():
#   ${prefix}_xts_encrypt_gb  - GB/T 17964-2021 variant ($std eq "_gb")
#   ${prefix}_xts_encrypt     - IEEE P1619 variant      ($std eq "")
# Register interface: $inp(x0) in, $outp(x1) out, $len(x2) byte length,
# x3/x4 two round-key schedules (data key and tweak key), $ivp(x5) tweak IV,
# w6 encrypt(1)/decrypt(0) flag.
my ($blocks,$len)=("x2","x2");
my $ivp=("x5");
# @twx spans x12..x27; note x26/x27 are reused below as $rks1/$rks2 and
# $lastBlk once the bulk tweak pipeline no longer needs them.
my @twx=map("x$_",(12..27));
my ($rks1,$rks2)=("x26","x27");
my $lastBlk=("x26");
my $enc=("w28");
my $remain=("x29");

# The tweak vectors alias @datax (v8..v11); both uses never overlap live.
my @tweak=@datax;

sub gen_xts_cipher() {
	my $std = shift;
$code.=<<___;
.globl ${prefix}_xts_encrypt${std}
.type ${prefix}_xts_encrypt${std},%function
.align 5
${prefix}_xts_encrypt${std}:
	AARCH64_SIGN_LINK_REGISTER
	stp x15, x16, [sp, #-0x10]!
	stp x17, x18, [sp, #-0x10]!
	stp x19, x20, [sp, #-0x10]!
	stp x21, x22, [sp, #-0x10]!
	stp x23, x24, [sp, #-0x10]!
	stp x25, x26, [sp, #-0x10]!
	stp x27, x28, [sp, #-0x10]!
	stp x29, x30, [sp, #-0x10]!
	stp d8, d9, [sp, #-0x10]!
	stp d10, d11, [sp, #-0x10]!
	stp d12, d13, [sp, #-0x10]!
	stp d14, d15, [sp, #-0x10]!
	mov $rks1,x3
	mov $rks2,x4
	mov $enc,w6
	ld1 {@tweak[0].4s}, [$ivp]
	mov $rks,$rks2
___
	# Encrypt the IV with the tweak-key schedule ($rks2) to obtain tweak 0.
	&load_sbox();
	&rev32(@tweak[0],@tweak[0]);
	&encrypt_1blk(@tweak[0]);
$code.=<<___;
	mov $rks,$rks1
	and $remain,$len,#0x0F
	// convert length into blocks
	lsr	$blocks,$len,4
	cmp	$blocks,#1
	b.lt .return${std}

	cmp $remain,0
	// If the encryption/decryption Length is N times of 16,
	// the all blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
	b.eq .xts_encrypt_blocks${std}

	// If the encryption/decryption length is not N times of 16,
	// the last two blocks are encrypted/decrypted in .last_2blks_tweak${std} or .only_2blks_tweak${std}
	// the other blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
	subs $blocks,$blocks,#1
	b.eq .only_2blks_tweak${std}
.xts_encrypt_blocks${std}:
___
	# Move tweak 0 into GPR pairs and precompute tweaks 1..7 by repeated
	# GF(2^128) doubling (compute_tweak); each tweak lives in a @twx pair.
	&rbit(@tweak[0],@tweak[0],$std);
	&rev32_armeb(@tweak[0],@tweak[0]);
	&mov_vec_to_reg(@tweak[0],@twx[0],@twx[1]);
	&compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
	&compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
	&compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
	&compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
	&compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
	&compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
	&compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
$code.=<<___;
.Lxts_8_blocks_process${std}:
	cmp $blocks,#8
	b.lt .Lxts_4_blocks_process${std}
___
	# Materialize the eight precomputed tweaks as vectors.
	&mov_reg_to_vec(@twx[0],@twx[1],@vtmp[0]);
	&mov_reg_to_vec(@twx[2],@twx[3],@vtmp[1]);
	&mov_reg_to_vec(@twx[4],@twx[5],@vtmp[2]);
	&mov_reg_to_vec(@twx[6],@twx[7],@vtmp[3]);
	&mov_reg_to_vec(@twx[8],@twx[9],@vtmpx[0]);
	&mov_reg_to_vec(@twx[10],@twx[11],@vtmpx[1]);
	&mov_reg_to_vec(@twx[12],@twx[13],@vtmpx[2]);
	&mov_reg_to_vec(@twx[14],@twx[15],@vtmpx[3]);
$code.=<<___;
	ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
___
	&rbit(@vtmp[0],@vtmp[0],$std);
	&rbit(@vtmp[1],@vtmp[1],$std);
	&rbit(@vtmp[2],@vtmp[2],$std);
	&rbit(@vtmp[3],@vtmp[3],$std);
$code.=<<___;
	eor @data[0].16b, @data[0].16b, @vtmp[0].16b
	eor @data[1].16b, @data[1].16b, @vtmp[1].16b
	eor @data[2].16b, @data[2].16b, @vtmp[2].16b
	eor @data[3].16b, @data[3].16b, @vtmp[3].16b
	ld1 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
___
	&rbit(@vtmpx[0],@vtmpx[0],$std);
	&rbit(@vtmpx[1],@vtmpx[1],$std);
	&rbit(@vtmpx[2],@vtmpx[2],$std);
	&rbit(@vtmpx[3],@vtmpx[3],$std);
$code.=<<___;
	eor @datax[0].16b, @datax[0].16b, @vtmpx[0].16b
	eor @datax[1].16b, @datax[1].16b, @vtmpx[1].16b
	eor @datax[2].16b, @datax[2].16b, @vtmpx[2].16b
	eor @datax[3].16b, @datax[3].16b, @vtmpx[3].16b
___
	# Byte-swap and transpose into the column layout _enc_8blks expects.
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],@data[3]);
	&rev32(@datax[0],@datax[0]);
	&rev32(@datax[1],@datax[1]);
	&rev32(@datax[2],@datax[2]);
	&rev32(@datax[3],@datax[3]);
	&transpose(@data,@vtmp);
	&transpose(@datax,@vtmp);
$code.=<<___;
	bl	_${prefix}_enc_8blks
___
	&transpose(@vtmp,@datax);
	&transpose(@data,@datax);

	# Reload tweaks 0..7 into vectors while simultaneously advancing the
	# GPR tweak pipeline by eight positions for the next iteration.
	&mov_reg_to_vec(@twx[0],@twx[1],@vtmpx[0]);
	&compute_tweak(@twx[14],@twx[15],@twx[0],@twx[1]);
	&mov_reg_to_vec(@twx[2],@twx[3],@vtmpx[1]);
	&compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
	&mov_reg_to_vec(@twx[4],@twx[5],@vtmpx[2]);
	&compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
	&mov_reg_to_vec(@twx[6],@twx[7],@vtmpx[3]);
	&compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
	&mov_reg_to_vec(@twx[8],@twx[9],@tweak[0]);
	&compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
	&mov_reg_to_vec(@twx[10],@twx[11],@tweak[1]);
	&compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
	&mov_reg_to_vec(@twx[12],@twx[13],@tweak[2]);
	&compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
	&mov_reg_to_vec(@twx[14],@twx[15],@tweak[3]);
	&compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
$code.=<<___;
	eor @vtmp[0].16b, @vtmp[0].16b, @vtmpx[0].16b
	eor @vtmp[1].16b, @vtmp[1].16b, @vtmpx[1].16b
	eor @vtmp[2].16b, @vtmp[2].16b, @vtmpx[2].16b
	eor @vtmp[3].16b, @vtmp[3].16b, @vtmpx[3].16b
	eor @data[0].16b, @data[0].16b, @tweak[0].16b
	eor @data[1].16b, @data[1].16b, @tweak[1].16b
	eor @data[2].16b, @data[2].16b, @tweak[2].16b
	eor @data[3].16b, @data[3].16b, @tweak[3].16b

	// save the last tweak
	st1 {@tweak[3].4s},[$ivp]
	st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
	st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
	subs $blocks,$blocks,#8
	b.gt .Lxts_8_blocks_process${std}
	b 100f
.Lxts_4_blocks_process${std}:
___
	&mov_reg_to_vec(@twx[0],@twx[1],@tweak[0]);
	&mov_reg_to_vec(@twx[2],@twx[3],@tweak[1]);
	&mov_reg_to_vec(@twx[4],@twx[5],@tweak[2]);
	&mov_reg_to_vec(@twx[6],@twx[7],@tweak[3]);
$code.=<<___;
	cmp $blocks,#4
	b.lt 1f
	ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
___
	&rbit(@tweak[0],@tweak[0],$std);
	&rbit(@tweak[1],@tweak[1],$std);
	&rbit(@tweak[2],@tweak[2],$std);
	&rbit(@tweak[3],@tweak[3],$std);
$code.=<<___;
	eor @data[0].16b, @data[0].16b, @tweak[0].16b
	eor @data[1].16b, @data[1].16b, @tweak[1].16b
	eor @data[2].16b, @data[2].16b, @tweak[2].16b
	eor @data[3].16b, @data[3].16b, @tweak[3].16b
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],@data[3]);
	&transpose(@data,@vtmp);
$code.=<<___;
	bl	_${prefix}_enc_4blks
___
	&transpose(@vtmp,@data);
$code.=<<___;
	eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
	eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
	eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
	eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b
	st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
	sub $blocks,$blocks,#4
___
	# Tweaks 4..6 become tweaks 0..2 for the <=3 remaining blocks.
	&mov_reg_to_vec(@twx[8],@twx[9],@tweak[0]);
	&mov_reg_to_vec(@twx[10],@twx[11],@tweak[1]);
	&mov_reg_to_vec(@twx[12],@twx[13],@tweak[2]);
$code.=<<___;
	// save the last tweak
	st1 {@tweak[3].4s},[$ivp]
1:
	// process last block
	cmp $blocks,#1
	b.lt 100f
	b.gt 1f
	ld1 {@data[0].4s},[$inp],#16
___
	&rbit(@tweak[0],@tweak[0],$std);
$code.=<<___;
	eor @data[0].16b, @data[0].16b, @tweak[0].16b
___
	&rev32(@data[0],@data[0]);
	&encrypt_1blk(@data[0]);
$code.=<<___;
	eor @data[0].16b, @data[0].16b, @tweak[0].16b
	st1 {@data[0].4s},[$outp],#16
	// save the last tweak
	st1 {@tweak[0].4s},[$ivp]
	b 100f
1:	// process last 2 blocks
	cmp $blocks,#2
	b.gt 1f
	ld1 {@data[0].4s,@data[1].4s},[$inp],#32
___
	&rbit(@tweak[0],@tweak[0],$std);
	&rbit(@tweak[1],@tweak[1],$std);
$code.=<<___;
	eor @data[0].16b, @data[0].16b, @tweak[0].16b
	eor @data[1].16b, @data[1].16b, @tweak[1].16b
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&transpose(@data,@vtmp);
$code.=<<___;
	bl	_${prefix}_enc_4blks
___
	&transpose(@vtmp,@data);
$code.=<<___;
	eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
	eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
	st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
	// save the last tweak
	st1 {@tweak[1].4s},[$ivp]
	b 100f
1:	// process last 3 blocks
	ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
___
	&rbit(@tweak[0],@tweak[0],$std);
	&rbit(@tweak[1],@tweak[1],$std);
	&rbit(@tweak[2],@tweak[2],$std);
$code.=<<___;
	eor @data[0].16b, @data[0].16b, @tweak[0].16b
	eor @data[1].16b, @data[1].16b, @tweak[1].16b
	eor @data[2].16b, @data[2].16b, @tweak[2].16b
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&rev32(@data[2],@data[2]);
	&transpose(@data,@vtmp);
$code.=<<___;
	bl	_${prefix}_enc_4blks
___
	&transpose(@vtmp,@data);
$code.=<<___;
	eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
	eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
	eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
	st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
	// save the last tweak
	st1 {@tweak[2].4s},[$ivp]
100:
	cmp $remain,0
	b.eq .return${std}

	// This branch calculates the last two tweaks,
	// while the encryption/decryption length is larger than 32
.last_2blks_tweak${std}:
	ld1 {@tweak[0].4s},[$ivp]
___
	&rev32_armeb(@tweak[0],@tweak[0]);
	&compute_tweak_vec(@tweak[0],@tweak[1],$std);
	&compute_tweak_vec(@tweak[1],@tweak[2],$std);
$code.=<<___;
	b .check_dec${std}


	// This branch calculates the last two tweaks,
	// while the encryption/decryption length is equal to 32, who only need two tweaks
.only_2blks_tweak${std}:
	mov @tweak[1].16b,@tweak[0].16b
___
	&rev32_armeb(@tweak[1],@tweak[1]);
	# NOTE(review): unlike the two compute_tweak_vec calls in
	# .last_2blks_tweak above, this one does not pass $std — confirm
	# compute_tweak_vec behaves correctly for the "_gb" variant when the
	# argument is omitted.
	&compute_tweak_vec(@tweak[1],@tweak[2]);
$code.=<<___;
	b .check_dec${std}


	// Determine whether encryption or decryption is required.
	// The last two tweaks need to be swapped for decryption.
.check_dec${std}:
	// encryption:1 decryption:0
	cmp $enc,1
	b.eq .prcess_last_2blks${std}
	mov @vtmp[0].16B,@tweak[1].16b
	mov @tweak[1].16B,@tweak[2].16b
	mov @tweak[2].16B,@vtmp[0].16b

.prcess_last_2blks${std}:
___
	# (The ".prcess_" label spelling is a typo, but it is used consistently
	# by the branch above, so the generated code assembles correctly.)
	&rev32_armeb(@tweak[1],@tweak[1]);
	&rev32_armeb(@tweak[2],@tweak[2]);
$code.=<<___;
	ld1 {@data[0].4s},[$inp],#16
	eor @data[0].16b, @data[0].16b, @tweak[1].16b
___
	&rev32(@data[0],@data[0]);
	&encrypt_1blk(@data[0]);
$code.=<<___;
	eor @data[0].16b, @data[0].16b, @tweak[1].16b
	st1 {@data[0].4s},[$outp],#16

	sub $lastBlk,$outp,16
.loop${std}:
	subs $remain,$remain,1
	ldrb $wtmp0,[$lastBlk,$remain]
	ldrb $wtmp1,[$inp,$remain]
	strb $wtmp1,[$lastBlk,$remain]
	strb $wtmp0,[$outp,$remain]
	b.gt .loop${std}
	ld1 {@data[0].4s}, [$lastBlk]
	eor @data[0].16b, @data[0].16b, @tweak[2].16b
___
	&rev32(@data[0],@data[0]);
	&encrypt_1blk(@data[0]);
$code.=<<___;
	eor @data[0].16b, @data[0].16b, @tweak[2].16b
	st1 {@data[0].4s}, [$lastBlk]
.return${std}:
	ldp d14, d15, [sp], #0x10
	ldp d12, d13, [sp], #0x10
	ldp d10, d11, [sp], #0x10
	ldp d8, d9, [sp], #0x10
	ldp x29, x30, [sp], #0x10
	ldp x27, x28, [sp], #0x10
	ldp x25, x26, [sp], #0x10
	ldp x23, x24, [sp], #0x10
	ldp x21, x22, [sp], #0x10
	ldp x19, x20, [sp], #0x10
	ldp x17, x18, [sp], #0x10
	ldp x15, x16, [sp], #0x10
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size ${prefix}_xts_encrypt${std},.-${prefix}_xts_encrypt${std}
___
} # end of gen_xts_cipher
&gen_xts_cipher("_gb");
&gen_xts_cipher("");
}}}
4908787f DH |
########################################
# Emit the final output to STDOUT (which is piped through arm-xlate.pl).
# First copy this script's own leading '#' comment block (the license
# header), rewriting '#' to '//' so it is a valid assembler comment;
# the shebang line is skipped, and copying stops at the first line that
# is neither a comment nor blank.
open my $self, '<', $0 or die "can't open $0: $!";
while (<$self>) {
	next if (/^#!/);
	last if (!s/^#/\/\// and !/^$/);
	print;
}
close $self;

# Then emit the generated assembly, evaluating any `...` escapes
# (perlasm convention for computed expressions) before printing.
foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;
	print $_,"\n";
}

# STDOUT is a pipe to the xlate filter; a failed close would mean the
# translator died, so treat it as a hard error.
close STDOUT or die "error closing STDOUT: $!";