]>
Commit | Line | Data |
---|---|---|
b1b2146d | 1 | #! /usr/bin/env perl |
da1c088f | 2 | # Copyright 2022-2023 The OpenSSL Project Authors. All Rights Reserved. |
b1b2146d DH |
3 | # |
4 | # Licensed under the Apache License 2.0 (the "License"). You may not use | |
5 | # this file except in compliance with the License. You can obtain a copy | |
6 | # in the file LICENSE in the source distribution or at | |
7 | # https://www.openssl.org/source/license.html | |
8 | # | |
9 | # | |
10 | # ChaCha20 for ARMv8 via SVE | |
11 | # | |
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Locate the arm-xlate.pl translator either next to this script or in the
# shared perlasm directory; everything printed to STDOUT is piped through it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
25 | ||
# Catch-all for mnemonics that have no Perl helper: the undefined sub's name
# (with '_' mapped to '.') is emitted verbatim as an instruction, x86-perlasm
# style.  A purely numeric final operand gets a '#' immediate prefix — the
# `$arg*1 eq $arg` test is the classic perlasm "is this a number" idiom.
sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
  $arg = "#$arg" if ($arg*1 eq $arg);
  $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
32 | ||
# Scalar argument registers of ChaCha20_ctr32_sve(out, in, len, key, counter).
my ($outp,$inp,$len,$key,$ctr) = map("x$_",(0..4));
my ($veclen) = ("x5");		# SVE vector length in 32-bit lanes (cntw)
my ($counter) = ("x6");		# scratch / round-loop counter
my ($counter_w) = ("w6");
my @xx=(7..22);
my @sxx=map("x$_",@xx);		# scalar ChaCha state, 64-bit views (mixin path)
my @sx=map("w$_",@xx);		# scalar ChaCha state, 32-bit views
my @K=map("x$_",(23..30));	# constants/key/counter words loaded at entry
# Column-major ordering of the 4x4 ChaCha state across vector registers.
my @elem=(0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15);
my @KL=map("w$_",(23..30));
my @mx=map("z$_",@elem);	# SVE views of the vector state
my @vx=map("v$_",@elem);	# NEON views of the same registers (VL128 path)
my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
    $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3) = @mx;
my ($zctr) = ("z16");		# per-lane block-counter vector
my @tt=(17..24);
my @xt=map("z$_",@tt);		# vector temporaries
my @vt=map("v$_",@tt);
my @perm=map("z$_",(25..30));
my ($rot8) = ("z31");		# tbl permutation implementing rotate-by-8
# Saved initial-state copies used by ACCUM; note slot 15 aliases $rot8 and
# slot 0..5 alias @perm — see the "bak[15] not available" comment below.
my @bak=(@perm[0],@perm[1],@perm[2],@perm[3],@perm[4],@perm[5],@xt[4],@xt[5],@xt[6],@xt[7],@xt[0],@xt[1],$zctr,@xt[2],@xt[3],$rot8);
my $debug_encoder=0;		# set to 1 to cross-check .inst encodings via gcc
55 | ||
# Emit a vector add for every (dst,src) index pair in the argument list,
# plus the matching scalar add when the "mixin" scalar block is active.
sub SVE_ADD() {
	while (@_) {
		my $x = shift;
		my $y = shift;

$code.=<<___;
	add	@mx[$x].s,@mx[$x].s,@mx[$y].s
.if mixin == 1
	add	@sx[$x],@sx[$x],@sx[$y]
.endif
___
	}
}
70 | ||
# Emit a vector exclusive-or for every (dst,src) index pair, plus the
# matching scalar eor when the "mixin" scalar block is active.
sub SVE_EOR() {
	while (@_) {
		my $x = shift;
		my $y = shift;

$code.=<<___;
	eor	@mx[$x].d,@mx[$x].d,@mx[$y].d
.if mixin == 1
	eor	@sx[$x],@sx[$x],@sx[$y]
.endif
___
	}
}
85 | ||
# Shift each listed state register left by $bits into consecutive
# temporaries @xt[$x], @xt[$x+1], ... (first half of a rotate emulation).
sub SVE_LSL() {
	my $bits = shift;
	my $x = shift;

	foreach my $y (@_) {
$code.=<<___;
	lsl	@xt[$x].s,@mx[$y].s,$bits
___
		$x++;
	}
}
99 | ||
# Shift each listed state register right by $bits in place; the scalar
# "mixin" path achieves the same rotate directly with ror.
sub SVE_LSR() {
	my $bits = shift;

	foreach my $x (@_) {
$code.=<<___;
	lsr	@mx[$x].s,@mx[$x].s,$bits
.if mixin == 1
	ror	@sx[$x],@sx[$x],$bits
.endif
___
	}
}
114 | ||
# OR the shifted temporaries @xt[$x], @xt[$x+1], ... back into the listed
# state registers (second half of the lsl/lsr rotate emulation).
sub SVE_ORR() {
	my $x = shift;

	foreach my $y (@_) {
$code.=<<___;
	orr	@mx[$y].d,@mx[$y].d,@xt[$x].d
___
		$x++;
	}
}
127 | ||
# Rotate each listed state register by 16 bits: revh swaps the 16-bit
# halves of every 32-bit lane; the scalar path uses a plain ror #16.
sub SVE_REV16() {
	foreach my $x (@_) {
$code.=<<___;
	revh	@mx[$x].s,p0/m,@mx[$x].s
.if mixin == 1
	ror	@sx[$x],@sx[$x],#16
.endif
___
	}
}
141 | ||
# Rotate each listed state register by 8 bits using a byte-table lookup
# through the $rot8 permutation; the scalar path uses ror #24.
sub SVE_ROT8() {
	foreach my $x (@_) {
$code.=<<___;
	tbl	@mx[$x].b,{@mx[$x].b},$rot8.b
.if mixin == 1
	ror	@sx[$x],@sx[$x],#24
.endif
___
	}
}
155 | ||
# SVE2 xor-and-rotate: for every (dst,src) pair emit a single xar that
# xors and rotates left by $bits (encoded as a right-rotate by 32-$bits);
# the scalar "mixin" path does the equivalent eor+ror.
sub SVE2_XAR() {
	my $bits = shift;
	my $rbits = 32-$bits;

	while (@_) {
		my $x = shift;
		my $y = shift;

$code.=<<___;
.if mixin == 1
	eor	@sx[$x],@sx[$x],@sx[$y]
.endif
	xar	@mx[$x].s,@mx[$x].s,@mx[$y].s,$rbits
.if mixin == 1
	ror	@sx[$x],@sx[$x],$rbits
.endif
___
	}
}
175 | ||
3f42f41a DH |
# Four ChaCha quarter-rounds (one per column/diagonal) interleaved, using
# the SVE2 xar instruction for the 16/12/8/7-bit rotates.
sub SVE2_QR_GROUP() {
	my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$a3,$b3,$c3,$d3) = @_;

	&SVE_ADD($a0,$b0,$a1,$b1,$a2,$b2,$a3,$b3);
	&SVE2_XAR(16,$d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);

	&SVE_ADD($c0,$d0,$c1,$d1,$c2,$d2,$c3,$d3);
	&SVE2_XAR(12,$b0,$c0,$b1,$c1,$b2,$c2,$b3,$c3);

	&SVE_ADD($a0,$b0,$a1,$b1,$a2,$b2,$a3,$b3);
	&SVE2_XAR(8,$d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);

	&SVE_ADD($c0,$d0,$c1,$d1,$c2,$d2,$c3,$d3);
	&SVE2_XAR(7,$b0,$c0,$b1,$c1,$b2,$c2,$b3,$c3);
}
191 | ||
b1b2146d | 192 | sub SVE_QR_GROUP() { |
b1b2146d DH |
193 | my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$a3,$b3,$c3,$d3) = @_; |
194 | ||
195 | &SVE_ADD($a0,$b0,$a1,$b1,$a2,$b2,$a3,$b3); | |
3f42f41a DH |
196 | &SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3); |
197 | &SVE_REV16($d0,$d1,$d2,$d3); | |
b1b2146d DH |
198 | |
199 | &SVE_ADD($c0,$d0,$c1,$d1,$c2,$d2,$c3,$d3); | |
3f42f41a DH |
200 | &SVE_EOR($b0,$c0,$b1,$c1,$b2,$c2,$b3,$c3); |
201 | &SVE_LSL(12,0,$b0,$b1,$b2,$b3); | |
202 | &SVE_LSR(20,$b0,$b1,$b2,$b3); | |
203 | &SVE_ORR(0,$b0,$b1,$b2,$b3); | |
b1b2146d DH |
204 | |
205 | &SVE_ADD($a0,$b0,$a1,$b1,$a2,$b2,$a3,$b3); | |
3f42f41a DH |
206 | &SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3); |
207 | &SVE_ROT8($d0,$d1,$d2,$d3); | |
b1b2146d DH |
208 | |
209 | &SVE_ADD($c0,$d0,$c1,$d1,$c2,$d2,$c3,$d3); | |
3f42f41a DH |
210 | &SVE_EOR($b0,$c0,$b1,$c1,$b2,$c2,$b3,$c3); |
211 | &SVE_LSL(7,0,$b0,$b1,$b2,$b3); | |
212 | &SVE_LSR(25,$b0,$b1,$b2,$b3); | |
213 | &SVE_ORR(0,$b0,$b1,$b2,$b3); | |
b1b2146d DH |
214 | } |
215 | ||
# The ChaCha20 core: 10 iterations of a column round followed by a
# diagonal round (20 rounds total), SVE flavour.
sub SVE_INNER_BLOCK() {
$code.=<<___;
	mov	$counter,#10
10:
.align	5
___
	&SVE_QR_GROUP(0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15);
	&SVE_QR_GROUP(0,5,10,15,1,6,11,12,2,7,8,13,3,4,9,14);
$code.=<<___;
	sub	$counter,$counter,1
	cbnz	$counter,10b
___
}
229 | ||
# The ChaCha20 core: 10 double rounds (20 rounds), SVE2 flavour using xar.
sub SVE2_INNER_BLOCK() {
$code.=<<___;
	mov	$counter,#10
10:
.align	5
___
	&SVE2_QR_GROUP(0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15);
	&SVE2_QR_GROUP(0,5,10,15,1,6,11,12,2,7,8,13,3,4,9,14);
$code.=<<___;
	sub	$counter,$counter,1
	cbnz	$counter,10b
___
}
243 | ||
3f42f41a DH |
# Load one vector register per remaining argument from consecutive
# vector-length-scaled offsets off $inp, byte-swapping lanes on big-endian,
# and finally advance $inp past everything that was read.
sub load_regs() {
	my $offset = shift;
	my $reg = shift;
	my $next_offset = $offset + 1;
$code.=<<___;
	ld1w	{$reg.s},p0/z,[$inp,#$offset,MUL VL]
#ifdef	__AARCH64EB__
	revb	$reg.s,p0/m,$reg.s
#endif
___
	if (@_) {
		&load_regs($next_offset, @_);
	} else {
		# last register: bump $inp by the total number of vectors read
$code.=<<___;
	addvl	$inp,$inp,$next_offset
___
	}
}
262 | ||
3f42f41a DH |
# Convenience wrapper: load the given registers starting at offset 0.
sub load() {
	&load_regs(0, @_) if (@_);
}
b1b2146d | 268 | |
3f42f41a DH |
# Store one vector register per remaining argument to consecutive
# vector-length-scaled offsets off $outp (byte-swapping lanes first on
# big-endian), then advance $outp past everything written.
sub store_regs() {
	my $offset = shift;
	my $reg = shift;
	my $next_offset = $offset + 1;
$code.=<<___;
#ifdef	__AARCH64EB__
	revb	$reg.s,p0/m,$reg.s
#endif
	st1w	{$reg.s},p0,[$outp,#$offset,MUL VL]
___
	if (@_) {
		&store_regs($next_offset, @_);
	} else {
		# last register: bump $outp by the total number of vectors written
$code.=<<___;
	addvl	$outp,$outp,$next_offset
___
	}
}
287 | ||
# Convenience wrapper: store the given registers starting at offset 0.
sub store() {
	&store_regs(0, @_) if (@_);
}
293 | ||
# Transpose two groups of four vectors (a 4x4 matrix of 32-bit lanes each)
# using the zip1/zip2 interleave pattern, via temporaries @xt[0..7].
sub transpose() {
	my $xa = shift;
	my $xb = shift;
	my $xc = shift;
	my $xd = shift;
	my $xa1 = shift;
	my $xb1 = shift;
	my $xc1 = shift;
	my $xd1 = shift;
$code.=<<___;
	zip1	@xt[0].s,$xa.s,$xb.s
	zip2	@xt[1].s,$xa.s,$xb.s
	zip1	@xt[2].s,$xc.s,$xd.s
	zip2	@xt[3].s,$xc.s,$xd.s

	zip1	@xt[4].s,$xa1.s,$xb1.s
	zip2	@xt[5].s,$xa1.s,$xb1.s
	zip1	@xt[6].s,$xc1.s,$xd1.s
	zip2	@xt[7].s,$xc1.s,$xd1.s

	zip1	$xa.d,@xt[0].d,@xt[2].d
	zip2	$xb.d,@xt[0].d,@xt[2].d
	zip1	$xc.d,@xt[1].d,@xt[3].d
	zip2	$xd.d,@xt[1].d,@xt[3].d

	zip1	$xa1.d,@xt[4].d,@xt[6].d
	zip2	$xb1.d,@xt[4].d,@xt[6].d
	zip1	$xc1.d,@xt[5].d,@xt[7].d
	zip2	$xd1.d,@xt[5].d,@xt[7].d
___
}
325 | ||
# Add the saved initial state back into state words $idx0 and $idx0+1
# (vector lanes and, under "mixin", the scalar copies, which are then
# packed pairwise into one 64-bit register).  Optional trailing arguments
# override the default backup registers @bak[$idx0]/@bak[$idx0+1].
sub ACCUM() {
	my $idx0 = shift;
	my $idx1 = $idx0 + 1;
	my $x0 = @sx[$idx0];
	my $xx0 = @sxx[$idx0];
	my $x1 = @sx[$idx1];
	my $xx1 = @sxx[$idx1];
	my $d = $idx0/2;	# index of the 64-bit input word holding both halves
	my ($tmp,$tmpw) = ($counter,$counter_w);
	my $bk0 = @_ ? shift : @bak[$idx0];
	my $bk1 = @_ ? shift : @bak[$idx1];

$code.=<<___;
.if mixin == 1
	add	@sx[$idx0],@sx[$idx0],@KL[$d]
.endif
	add	@mx[$idx0].s,@mx[$idx0].s,$bk0.s
.if mixin == 1
	add	@sxx[$idx1],@sxx[$idx1],@K[$d],lsr #32
.endif
	add	@mx[$idx1].s,@mx[$idx1].s,$bk1.s
.if mixin == 1
	add	@sxx[$idx0],@sxx[$idx0],$sxx[$idx1],lsl #32	// pack
.endif
___
}
352 | ||
# Scalar path: load the next 16 input bytes into state pair
# ($idx0, $idx0+2); only emitted when the "mixin" block is active.
sub SCA_INP() {
	my $idx0 = shift;
	my $idx1 = $idx0 + 2;
$code.=<<___;
.if mixin == 1
	ldp	@sxx[$idx0],@sxx[$idx1],[$inp],#16
.endif
___
}
362 | ||
# Add the initial state back into all 16 state words (SVE flavour).  The
# plain-SVE register budget has no spare backup register for word 15, so
# the needed broadcasts are re-materialized from @K on the fly, and
# @bak[0] is reused for the final word (its own backup is consumed first).
sub SVE_ACCUM_STATES() {
	my ($tmp,$tmpw) = ($counter,$counter_w);

$code.=<<___;
	lsr	$tmp,@K[5],#32
	dup	@bak[10].s,@KL[5]
	dup	@bak[11].s,$tmpw
	lsr	$tmp,@K[6],#32
	dup	@bak[13].s,$tmpw
	lsr	$tmp,@K[7],#32
___
	&ACCUM(0);
	&ACCUM(2);
	&SCA_INP(1);
	&ACCUM(4);
	&ACCUM(6);
	&SCA_INP(5);
	&ACCUM(8);
	&ACCUM(10);
	&SCA_INP(9);
$code.=<<___;
	dup	@bak[14].s,@KL[7]
	dup	@bak[0].s,$tmpw	// bak[15] not available for SVE
___
	&ACCUM(12);
	&ACCUM(14, @bak[14],@bak[0]);
	&SCA_INP(13);
}
391 | ||
# Add the initial state back into all 16 state words (SVE2 flavour, where
# every word has a dedicated backup register), interleaving scalar input
# loads for the mixin block.
sub SVE2_ACCUM_STATES() {
	&ACCUM(0);
	&ACCUM(2);
	&SCA_INP(1);
	&ACCUM(4);
	&ACCUM(6);
	&SCA_INP(5);
	&ACCUM(8);
	&ACCUM(10);
	&SCA_INP(9);
	&ACCUM(12);
	&ACCUM(14);
	&SCA_INP(13);
}
406 | ||
# Scalar path: xor packed keystream word pair $idx0 with the input word
# previously loaded into $idx0+1 (mixin block only).
sub SCA_EOR() {
	my $idx0 = shift;
	my $idx1 = $idx0 + 1;
$code.=<<___;
.if mixin == 1
	eor	@sxx[$idx0],@sxx[$idx0],@sxx[$idx1]
.endif
___
}
416 | ||
# Scalar path: store 16 bytes of ciphertext from state pair
# ($idx0, $idx1) and advance $outp (mixin block only).
sub SCA_SAVE() {
	my $idx0 = shift;
	my $idx1 = shift;
$code.=<<___;
.if mixin == 1
	stp	@sxx[$idx0],@sxx[$idx1],[$outp],#16
.endif
___
}
b1b2146d | 426 | |
3f42f41a DH |
# 128-bit-vector specialization: transpose the state, xor it with input
# loaded through the NEON views (@vt/@vx alias the z registers), and store
# the result with NEON multi-register st1, interleaving the scalar mixin
# xor/stores.
sub SVE_VL128_TRANSFORMS() {
	&SCA_EOR(0);
	&SCA_EOR(2);
	&SCA_EOR(4);
	&transpose($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3);
	&SCA_EOR(6);
	&SCA_EOR(8);
	&SCA_EOR(10);
	&transpose($xc0,$xc1,$xc2,$xc3,$xd0,$xd1,$xd2,$xd3);
	&SCA_EOR(12);
	&SCA_EOR(14);
$code.=<<___;
	ld1	{@vt[0].4s-@vt[3].4s},[$inp],#64
	ld1	{@vt[4].4s-@vt[7].4s},[$inp],#64
	eor	$xa0.d,$xa0.d,@xt[0].d
	eor	$xb0.d,$xb0.d,@xt[1].d
	eor	$xc0.d,$xc0.d,@xt[2].d
	eor	$xd0.d,$xd0.d,@xt[3].d
	eor	$xa1.d,$xa1.d,@xt[4].d
	eor	$xb1.d,$xb1.d,@xt[5].d
	eor	$xc1.d,$xc1.d,@xt[6].d
	eor	$xd1.d,$xd1.d,@xt[7].d
	ld1	{@vt[0].4s-@vt[3].4s},[$inp],#64
	ld1	{@vt[4].4s-@vt[7].4s},[$inp],#64
___
	&SCA_SAVE(0,2);
$code.=<<___;
	eor	$xa2.d,$xa2.d,@xt[0].d
	eor	$xb2.d,$xb2.d,@xt[1].d
___
	&SCA_SAVE(4,6);
$code.=<<___;
	eor	$xc2.d,$xc2.d,@xt[2].d
	eor	$xd2.d,$xd2.d,@xt[3].d
___
	&SCA_SAVE(8,10);
$code.=<<___;
	eor	$xa3.d,$xa3.d,@xt[4].d
	eor	$xb3.d,$xb3.d,@xt[5].d
___
	&SCA_SAVE(12,14);
$code.=<<___;
	eor	$xc3.d,$xc3.d,@xt[6].d
	eor	$xd3.d,$xd3.d,@xt[7].d
	st1	{@vx[0].4s-@vx[12].4s},[$outp],#64
	st1	{@vx[1].4s-@vx[13].4s},[$outp],#64
	st1	{@vx[2].4s-@vx[14].4s},[$outp],#64
	st1	{@vx[3].4s-@vx[15].4s},[$outp],#64
___
}
477 | ||
# Post-round output stage: byte-swap the scalar words on big-endian, bump
# the scalar block counter, then either take the VL128 fast path or do the
# generic transpose / xor-with-input / store sequence.  Finally advance
# the vector block counter in @K[6] by the vector length.
sub SVE_TRANSFORMS() {
$code.=<<___;
#ifdef	__AARCH64EB__
	rev	@sxx[0],@sxx[0]
	rev	@sxx[2],@sxx[2]
	rev	@sxx[4],@sxx[4]
	rev	@sxx[6],@sxx[6]
	rev	@sxx[8],@sxx[8]
	rev	@sxx[10],@sxx[10]
	rev	@sxx[12],@sxx[12]
	rev	@sxx[14],@sxx[14]
#endif
.if mixin == 1
	add	@K[6],@K[6],#1
.endif
	cmp	$veclen,4
	b.ne	200f
___
	&SVE_VL128_TRANSFORMS();
$code.=<<___;
	b	210f
200:
___
	&transpose($xa0,$xb0,$xc0,$xd0,$xa1,$xb1,$xc1,$xd1);
	&SCA_EOR(0);
	&SCA_EOR(2);
	&transpose($xa2,$xb2,$xc2,$xd2,$xa3,$xb3,$xc3,$xd3);
	&SCA_EOR(4);
	&SCA_EOR(6);
	&transpose($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3);
	&SCA_EOR(8);
	&SCA_EOR(10);
	&transpose($xc0,$xc1,$xc2,$xc3,$xd0,$xd1,$xd2,$xd3);
	&SCA_EOR(12);
	&SCA_EOR(14);
	&load(@xt[0],@xt[1],@xt[2],@xt[3],@xt[4],@xt[5],@xt[6],@xt[7]);
$code.=<<___;
	eor	$xa0.d,$xa0.d,@xt[0].d
	eor	$xa1.d,$xa1.d,@xt[1].d
	eor	$xa2.d,$xa2.d,@xt[2].d
	eor	$xa3.d,$xa3.d,@xt[3].d
	eor	$xb0.d,$xb0.d,@xt[4].d
	eor	$xb1.d,$xb1.d,@xt[5].d
	eor	$xb2.d,$xb2.d,@xt[6].d
	eor	$xb3.d,$xb3.d,@xt[7].d
___
	&load(@xt[0],@xt[1],@xt[2],@xt[3],@xt[4],@xt[5],@xt[6],@xt[7]);
	&SCA_SAVE(0,2);
$code.=<<___;
	eor	$xc0.d,$xc0.d,@xt[0].d
	eor	$xc1.d,$xc1.d,@xt[1].d
___
	&SCA_SAVE(4,6);
$code.=<<___;
	eor	$xc2.d,$xc2.d,@xt[2].d
	eor	$xc3.d,$xc3.d,@xt[3].d
___
	&SCA_SAVE(8,10);
$code.=<<___;
	eor	$xd0.d,$xd0.d,@xt[4].d
	eor	$xd1.d,$xd1.d,@xt[5].d
___
	&SCA_SAVE(12,14);
$code.=<<___;
	eor	$xd2.d,$xd2.d,@xt[6].d
	eor	$xd3.d,$xd3.d,@xt[7].d
___
	&store($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3);
	&store($xc0,$xc1,$xc2,$xc3,$xd0,$xd1,$xd2,$xd3);
$code.=<<___;
210:
	incw	@K[6], ALL, MUL #1
___
}
552 | ||
# Broadcast 64-bit input word @K[$idx0/2] into state words $idx0/$idx0+1
# AND into their backup registers; also seed the scalar copy under mixin.
sub SET_STATE_BAK() {
	my $idx0 = shift;
	my $idx1 = $idx0 + 1;
	my $x0 = @sx[$idx0];
	my $xx0 = @sxx[$idx0];
	my $x1 = @sx[$idx1];
	my $xx1 = @sxx[$idx1];
	my $d = $idx0/2;

$code.=<<___;
	lsr	$xx1,@K[$d],#32
	dup	@mx[$idx0].s,@KL[$d]
	dup	@bak[$idx0].s,@KL[$d]
.if mixin == 1
	mov	$x0,@KL[$d]
.endif
	dup	@mx[$idx1].s,$x1
	dup	@bak[$idx1].s,$x1
___
}
573 | ||
# Same as SET_STATE_BAK but without writing the backup registers (used on
# the plain-SVE path where no backup register is available for the word).
sub SET_STATE() {
	my $idx0 = shift;
	my $idx1 = $idx0 + 1;
	my $x0 = @sx[$idx0];
	my $xx0 = @sxx[$idx0];
	my $x1 = @sx[$idx1];
	my $xx1 = @sxx[$idx1];
	my $d = $idx0/2;

$code.=<<___;
	lsr	$xx1,@K[$d],#32
	dup	@mx[$idx0].s,@KL[$d]
.if mixin == 1
	mov	$x0,@KL[$d]
.endif
	dup	@mx[$idx1].s,$x1
___
}
b1b2146d DH |
592 | |
# Initialize the full 16-word state for the SVE path.  Words 12/13 hold
# the block counter: each lane gets counter+lane via `index` (offset by 1
# when the mixin scalar block consumes counter value @KL[6] itself).
sub SVE_LOAD_STATES() {
	&SET_STATE_BAK(0);
	&SET_STATE_BAK(2);
	&SET_STATE_BAK(4);
	&SET_STATE_BAK(6);
	&SET_STATE_BAK(8);
	&SET_STATE(10);
	&SET_STATE(14);
$code.=<<___;
.if mixin == 1
	add	@sx[13],@KL[6],#1
	mov	@sx[12],@KL[6]
	index	$zctr.s,@sx[13],1
	index	@mx[12].s,@sx[13],1
.else
	index	$zctr.s,@KL[6],1
	index	@mx[12].s,@KL[6],1
.endif
	lsr	@sxx[13],@K[6],#32
	dup	@mx[13].s,@sx[13]
___
}
615 | ||
# Initialize the full 16-word state for the SVE2 path; unlike the SVE
# variant, words 10/11 and 14/15 also get backups, as does word 13.
sub SVE2_LOAD_STATES() {
	&SET_STATE_BAK(0);
	&SET_STATE_BAK(2);
	&SET_STATE_BAK(4);
	&SET_STATE_BAK(6);
	&SET_STATE_BAK(8);
	&SET_STATE_BAK(10);
	&SET_STATE_BAK(14);

$code.=<<___;
.if mixin == 1
	add	@sx[13],@KL[6],#1
	mov	@sx[12],@KL[6]
	index	$zctr.s,@sx[13],1
	index	@mx[12].s,@sx[13],1
.else
	index	$zctr.s,@KL[6],1
	index	@mx[12].s,@KL[6],1
.endif
	lsr	@sxx[13],@K[6],#32
	dup	@mx[13].s,@sx[13]
	dup	@bak[13].s,@sx[13]
___
}
640 | ||
# Main SVE loop.  Each iteration encrypts $veclen 64-byte blocks with the
# vector engine; when at least one more whole 64-byte block remains beyond
# that, the scalar "mixin" path (mixin=1) processes it in parallel with
# the vector work, otherwise a final mixin=0 pass handles the tail.
sub chacha20_sve() {
	my ($tmp) = (@sxx[0]);

$code.=<<___;
.align	5
100:
	subs	$tmp,$len,$veclen,lsl #6
	b.lt	110f
	mov	$len,$tmp
	b.eq	101f
	cmp	$len,64
	b.lt	101f
	mixin=1
___
	&SVE_LOAD_STATES();
	&SVE_INNER_BLOCK();
	&SVE_ACCUM_STATES();
	&SVE_TRANSFORMS();
$code.=<<___;
	subs	$len,$len,64
	b.gt	100b
	b	110f
101:
	mixin=0
___
	&SVE_LOAD_STATES();
	&SVE_INNER_BLOCK();
	&SVE_ACCUM_STATES();
	&SVE_TRANSFORMS();
$code.=<<___;
110:
___
}
674 | ||
3f42f41a DH |
# Main SVE2 loop; identical control flow to chacha20_sve but built from
# the SVE2 state-load/round/accumulate helpers (xar, full backups).
sub chacha20_sve2() {
	my ($tmp) = (@sxx[0]);

$code.=<<___;
.align	5
100:
	subs	$tmp,$len,$veclen,lsl #6
	b.lt	110f
	mov	$len,$tmp
	b.eq	101f
	cmp	$len,64
	b.lt	101f
	mixin=1
___
	&SVE2_LOAD_STATES();
	&SVE2_INNER_BLOCK();
	&SVE2_ACCUM_STATES();
	&SVE_TRANSFORMS();
$code.=<<___;
	subs	$len,$len,64
	b.gt	100b
	b	110f
101:
	mixin=0
___
	&SVE2_LOAD_STATES();
	&SVE2_INNER_BLOCK();
	&SVE2_ACCUM_STATES();
	&SVE_TRANSFORMS();
$code.=<<___;
110:
___
}
708 | ||
3f42f41a | 709 | |
b1b2146d | 710 | {{{ |
3f42f41a DH |
711 | my ($tmp,$tmpw) = ("x6", "w6"); |
712 | my ($tmpw0,$tmp0,$tmpw1,$tmp1) = ("w9","x9", "w10","x10"); | |
713 | my ($sve2flag) = ("x7"); | |
714 | ||
b1b2146d DH |
715 | $code.=<<___; |
716 | #include "arm_arch.h" | |
717 | ||
718 | .arch armv8-a | |
719 | ||
b1b2146d DH |
720 | .extern OPENSSL_armcap_P |
721 | .hidden OPENSSL_armcap_P | |
b1b2146d DH |
722 | |
723 | .text | |
724 | .align 5 | |
725 | .Lchacha20_consts: | |
bcb52bcc | 726 | .quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral |
b1b2146d DH |
727 | .Lrot8: |
728 | .word 0x02010003,0x04040404,0x02010003,0x04040404 | |
729 | .globl ChaCha20_ctr32_sve | |
730 | .type ChaCha20_ctr32_sve,%function | |
731 | .align 5 | |
732 | ChaCha20_ctr32_sve: | |
733 | AARCH64_VALID_CALL_TARGET | |
bcb52bcc | 734 | cntw $veclen, ALL, MUL #1 |
3f42f41a | 735 | cmp $len,$veclen,lsl #6 |
b1b2146d | 736 | b.lt .Lreturn |
b1b2146d DH |
737 | mov $sve2flag,0 |
738 | adrp $tmp,OPENSSL_armcap_P | |
739 | ldr $tmpw,[$tmp,#:lo12:OPENSSL_armcap_P] | |
740 | tst $tmpw,#ARMV8_SVE2 | |
741 | b.eq 1f | |
742 | mov $sve2flag,1 | |
bcb52bcc | 743 | b 2f |
b1b2146d | 744 | 1: |
bcb52bcc DH |
745 | cmp $veclen,4 |
746 | b.le .Lreturn | |
747 | adr $tmp,.Lrot8 | |
748 | ldp $tmpw0,$tmpw1,[$tmp] | |
749 | index $rot8.s,$tmpw0,$tmpw1 | |
750 | 2: | |
3f42f41a DH |
751 | AARCH64_SIGN_LINK_REGISTER |
752 | stp d8,d9,[sp,-192]! | |
bcb52bcc DH |
753 | stp d10,d11,[sp,16] |
754 | stp d12,d13,[sp,32] | |
755 | stp d14,d15,[sp,48] | |
3f42f41a DH |
756 | stp x16,x17,[sp,64] |
757 | stp x18,x19,[sp,80] | |
758 | stp x20,x21,[sp,96] | |
759 | stp x22,x23,[sp,112] | |
760 | stp x24,x25,[sp,128] | |
761 | stp x26,x27,[sp,144] | |
762 | stp x28,x29,[sp,160] | |
763 | str x30,[sp,176] | |
764 | ||
bcb52bcc DH |
765 | adr $tmp,.Lchacha20_consts |
766 | ldp @K[0],@K[1],[$tmp] | |
767 | ldp @K[2],@K[3],[$key] | |
768 | ldp @K[4],@K[5],[$key, 16] | |
769 | ldp @K[6],@K[7],[$ctr] | |
bcb52bcc DH |
770 | ptrues p0.s,ALL |
771 | #ifdef __AARCH64EB__ | |
772 | ror @K[2],@K[2],#32 | |
773 | ror @K[3],@K[3],#32 | |
774 | ror @K[4],@K[4],#32 | |
775 | ror @K[5],@K[5],#32 | |
776 | ror @K[6],@K[6],#32 | |
777 | ror @K[7],@K[7],#32 | |
b1b2146d | 778 | #endif |
3f42f41a DH |
779 | cbz $sve2flag, 1f |
780 | ___ | |
781 | &chacha20_sve2(); | |
782 | $code.=<<___; | |
783 | b 2f | |
784 | 1: | |
b1b2146d | 785 | ___ |
3f42f41a | 786 | &chacha20_sve(); |
b1b2146d | 787 | $code.=<<___; |
3f42f41a DH |
788 | 2: |
789 | str @KL[6],[$ctr] | |
b1b2146d DH |
790 | ldp d10,d11,[sp,16] |
791 | ldp d12,d13,[sp,32] | |
bcb52bcc | 792 | ldp d14,d15,[sp,48] |
3f42f41a DH |
793 | ldp x16,x17,[sp,64] |
794 | ldp x18,x19,[sp,80] | |
795 | ldp x20,x21,[sp,96] | |
796 | ldp x22,x23,[sp,112] | |
797 | ldp x24,x25,[sp,128] | |
798 | ldp x26,x27,[sp,144] | |
799 | ldp x28,x29,[sp,160] | |
800 | ldr x30,[sp,176] | |
801 | ldp d8,d9,[sp],192 | |
802 | AARCH64_VALIDATE_LINK_REGISTER | |
b1b2146d DH |
803 | .Lreturn: |
804 | ret | |
805 | .size ChaCha20_ctr32_sve,.-ChaCha20_ctr32_sve | |
806 | ___ | |
807 | ||
808 | }}} | |
809 | ||
810 | ######################################## | |
811 | { | |
# Base opcode templates for hand-encoding SVE/SVE2 instructions as .inst
# words (the toolchains of the day could not assemble them directly).
# Unpredicated vector forms:
my %opcode_unpred = (
	"movprfx" => 0x0420BC00,
	"eor" => 0x04a03000,
	"add" => 0x04200000,
	"orr" => 0x04603000,
	"lsl" => 0x04209C00,
	"lsr" => 0x04209400,
	"incw" => 0x04B00000,
	"xar" => 0x04203400,
	"zip1" => 0x05206000,
	"zip2" => 0x05206400,
	"uzp1" => 0x05206800,
	"uzp2" => 0x05206C00,
	"index" => 0x04204C00,
	"mov" => 0x05203800,
	"dup" => 0x05203800,
	"cntw" => 0x04A0E000,
	"tbl" => 0x05203000);

# Unpredicated forms taking an immediate operand:
my %opcode_imm_unpred = (
	"dup" => 0x2538C000,
	"index" => 0x04204400);

# Predicated forms taking a scalar (general-purpose) operand:
my %opcode_scalar_pred = (
	"mov" => 0x0528A000,
	"cpy" => 0x0528A000,
	"st4w" => 0xE5606000,
	"st1w" => 0xE5004000,
	"ld1w" => 0xA5404000);

# Gather-load forms (vector-indexed addressing):
my %opcode_gather_pred = (
	"ld1w" => 0x85204000);

# Remaining predicated forms:
my %opcode_pred = (
	"eor" => 0x04190000,
	"add" => 0x04000000,
	"orr" => 0x04180000,
	"whilelo" => 0x25200C00,
	"whilelt" => 0x25200400,
	"cntp" => 0x25208000,
	"addvl" => 0x04205000,
	"lsl" => 0x04038000,
	"lsr" => 0x04018000,
	"sel" => 0x0520C000,
	"mov" => 0x0520C000,
	"ptrue" => 0x2518E000,
	"pfalse" => 0x2518E400,
	"ptrues" => 0x2519E000,
	"pnext" => 0x2519C400,
	"ld4w" => 0xA560E000,
	"st4w" => 0xE570E000,
	"st1w" => 0xE500E000,
	"ld1w" => 0xA540A000,
	"ld1rw" => 0x8540C000,
	"lasta" => 0x0520A000,
	"revh" => 0x05258000,
	"revb" => 0x05248000);

# Element-size field encoding for the b/h/s/d type suffixes:
my %tsize = (
	'b' => 0,
	'h' => 1,
	's' => 2,
	'd' => 3);

# sf bit for 32- vs 64-bit general-purpose registers:
my %sf = (
	"w" => 0,
	"x" => 1);

# Predicate-constraint pattern immediates (ptrue etc.):
my %pattern = (
	"POW2" => 0,
	"VL1" => 1,
	"VL2" => 2,
	"VL3" => 3,
	"VL4" => 4,
	"VL5" => 5,
	"VL6" => 6,
	"VL7" => 7,
	"VL8" => 8,
	"VL16" => 9,
	"VL32" => 10,
	"VL64" => 11,
	"VL128" => 12,
	"VL256" => 13,
	"MUL4" => 29,
	"MUL3" => 30,
	"ALL" => 31);
898 | ||
# Debug aid: write ./compile_sve.sh, a script that assembles a single
# instruction with a cross-compiler and prints its hex encoding, so our
# hand-rolled encodings can be cross-checked (see verify_inst).
sub create_verifier {
	my $filename="./compile_sve.sh";

$scripts = <<___;
#! /bin/bash
set -e
CROSS_COMPILE=\${CROSS_COMPILE:-'aarch64-none-linux-gnu-'}

[ -z "\$1" ] && exit 1
ARCH=`uname -p | xargs echo -n`

# need gcc-10 and above to compile SVE code
# change this according to your system during debugging
if [ \$ARCH == 'aarch64' ]; then
	CC=gcc-11
	OBJDUMP=objdump
else
	CC=\${CROSS_COMPILE}gcc
	OBJDUMP=\${CROSS_COMPILE}objdump
fi
TMPFILE=/tmp/\$\$
cat > \$TMPFILE.c << EOF
extern __attribute__((noinline, section("disasm_output"))) void dummy_func()
{
	asm("\$@\\t\\n");
}
int main(int argc, char *argv[])
{
}
EOF
\$CC -march=armv8.2-a+sve+sve2 -o \$TMPFILE.out \$TMPFILE.c
\$OBJDUMP -d \$TMPFILE.out | awk -F"\\n" -v RS="\\n\\n" '\$1 ~ /dummy_func/' | awk 'FNR == 2 {printf "%s",\$2}'
rm \$TMPFILE.c \$TMPFILE.out
___
	open(FH, '>', $filename) or die $!;
	print FH $scripts;
	close(FH);
	system("chmod a+x ./compile_sve.sh");
}
938 | ||
# Run the generated verifier script on one instruction and return the hex
# encoding the real toolchain produced.
sub compile_sve {
	return `./compile_sve.sh '@_'`
}
942 | ||
# Format an encoded instruction word as ".inst 0x... //mnemonic".  When
# $debug_encoder is set, also cross-check our encoding against the one a
# real compiler produces and flag any mismatch in the emitted comment.
sub verify_inst {
	my ($code,$inst)=@_;
	my $hexcode = (sprintf "%08x", $code);

	if ($debug_encoder == 1) {
		my $expect=&compile_sve($inst);
		if ($expect ne $hexcode) {
			return (sprintf "%s // Encode Error! expect [%s] actual [%s]", $inst, $expect, $hexcode);
		}
	}
	return (sprintf ".inst\t0x%s\t//%s", $hexcode, $inst);
}
955 | ||
# Map a register-name suffix to its numeric encoding: "zr" (the zero
# register) encodes as 31, anything else is already the register number.
sub reg_code {
	my $code = shift;

	# String comparison is required here: with numeric "==", "zr"
	# numifies to 0, so register "0" would wrongly encode as 31.
	if ($code eq "zr") {
		return "31";
	}
	return $code;
}
964 | ||
# Build the tszh:tszl:imm field for unpredicated shift-by-immediate forms.
# Left shifts encode esize+shift; right shifts (lsr, and xar's rotate)
# encode 2*esize-shift.  Returns the bits positioned at [23:22] and [20:16].
sub encode_size_imm() {
	my ($mnemonic, $isize, $const)=@_;
	my $esize = (8<<$tsize{$isize});

	my $tsize_imm = ($mnemonic eq "lsr" || $mnemonic eq "xar")
			    ? 2*$esize - $const
			    : $esize + $const;

	return (($tsize_imm>>5)<<22)|(($tsize_imm&0x1f)<<16);
}
975 | ||
# Same size/immediate packing as encode_size_imm but for the predicated
# shift forms, where the immediate field sits at bits [9:5] instead.
sub encode_shift_pred() {
	my ($mnemonic, $isize, $const)=@_;
	my $esize = (8<<$tsize{$isize});

	my $tsize_imm = ($mnemonic eq "lsr")
			    ? 2*$esize - $const
			    : $esize + $const;

	return (($tsize_imm>>5)<<22)|(($tsize_imm&0x1f)<<5);
}
986 | ||
# Encode an unpredicated SVE instruction from its operand string, trying
# the operand shapes in order: tbl-style (Zd, {Zn}, Zm), then Zd with a
# scalar/vector source list, then Zd with an immediate.  Falls through to
# a "fail to parse" comment if no pattern matches.
sub sve_unpred {
	my ($mnemonic,$arg)=@_;
	my $inst = (sprintf "%s %s", $mnemonic,$arg);

	if ($arg =~ m/z([0-9]+)\.([bhsd]),\s*\{\s*z([0-9]+)\.[bhsd].*\},\s*z([0-9]+)\.[bhsd].*/o) {
		return &verify_inst($opcode_unpred{$mnemonic}|$1|($3<<5)|($tsize{$2}<<22)|($4<<16),
					$inst)
	} elsif ($arg =~ m/z([0-9]+)\.([bhsd]),\s*([zwx][0-9]+.*)/o) {
		my $regd = $1;
		my $isize = $2;
		my $regs=$3;

		if (($mnemonic eq "lsl") || ($mnemonic eq "lsr")) {
			# shift-by-immediate; the immediate must be < esize
			if ($regs =~ m/z([0-9]+)[^,]*(?:,\s*#?([0-9]+))?/o
				&& ((8<<$tsize{$isize}) > $2)) {
				return &verify_inst($opcode_unpred{$mnemonic}|$regd|($1<<5)|&encode_size_imm($mnemonic,$isize,$2),
							$inst);
			}
		} elsif($regs =~ m/[wx]([0-9]+),\s*[wx]([0-9]+)/o) {
			return &verify_inst($opcode_unpred{$mnemonic}|$regd|($tsize{$isize}<<22)|($1<<5)|($2<<16), $inst);
		} elsif ($regs =~ m/[wx]([0-9]+),\s*#?([0-9]+)/o) {
			return &verify_inst($opcode_imm_unpred{$mnemonic}|$regd|($tsize{$isize}<<22)|($1<<5)|($2<<16), $inst);
		} elsif ($regs =~ m/[wx]([0-9]+)/o) {
			return &verify_inst($opcode_unpred{$mnemonic}|$regd|($tsize{$isize}<<22)|($1<<5), $inst);
		} else {
			# vector-vector forms; only some mnemonics carry a size field
			my $encoded_size = 0;
			if (($mnemonic eq "add") || ($mnemonic =~ /zip./) || ($mnemonic =~ /uzp./) ) {
				$encoded_size = ($tsize{$isize}<<22);
			}
			if ($regs =~ m/z([0-9]+)\.[bhsd],\s*z([0-9]+)\.[bhsd],\s*([0-9]+)/o &&
				$1 == $regd) {
				return &verify_inst($opcode_unpred{$mnemonic}|$regd|($2<<5)|&encode_size_imm($mnemonic,$isize,$3), $inst);
			} elsif ($regs =~ m/z([0-9]+)\.[bhsd],\s*z([0-9]+)\.[bhsd]/o) {
				return &verify_inst($opcode_unpred{$mnemonic}|$regd|$encoded_size|($1<<5)|($2<<16), $inst);
			}
		}
	} elsif ($arg =~ m/z([0-9]+)\.([bhsd]),\s*#?([0-9]+)/o) {
		return &verify_inst($opcode_imm_unpred{$mnemonic}|$1|($3<<5)|($tsize{$2}<<22),
					$inst)
	}
	sprintf "%s // fail to parse", $inst;
}
1029 | ||
# Encode a predicated SVE instruction ("op ..., pN/[mz], ..." or a
# load/store with a "{ zN.T }, pN, [addr]" operand list) into its numeric
# instruction word and hand it to verify_inst().  Falls through to a
# "// fail to parse" tag when no operand pattern matches, so the caller can
# spot unencodable lines in the generated assembly.
sub sve_pred {
	my ($mnemonic, $arg) = @_;
	my $inst = (sprintf "%s %s", $mnemonic,$arg);

	# Load/store form: "{ zN.T ... }, pG[/z], [ base ... ]".
	# NOTE: was "p([0-9])+", which captured only the LAST digit of a
	# multi-digit predicate register (e.g. p15 -> 5); fixed to "p([0-9]+)".
	if ($arg =~ m/\{\s*z([0-9]+)\.([bhsd]).*\},\s*p([0-9]+)(\/z)?,\s*\[(\s*[xs].*)\]/o) {
		my $zt = $1;
		my $size = $tsize{$2};
		my $pg = $3;
		my $addr = $5;
		my $xn = 31;		# default base: sp/xzr encoding

		if ($addr =~ m/x([0-9]+)\s*/o) {
			$xn = $1;
		}

		# ld1r[bhwd] encodes its element size elsewhere; clear it here.
		if ($mnemonic =~m/ld1r[bhwd]/o) {
			$size = 0;
		}
		if ($addr =~ m/\w+\s*,\s*x([0-9]+),.*/o) {
			# base + scalar index register
			return &verify_inst($opcode_scalar_pred{$mnemonic}|($size<<21)|$zt|($pg<<10)|($1<<16)|($xn<<5),$inst);
		} elsif ($addr =~ m/\w+\s*,\s*z([0-9]+)\.s,\s*([US]\w+)/o) {
			# gather/scatter: base + vector index, UXTW/SXTW extend
			my $xs = ($2 eq "SXTW") ? 1 : 0;
			return &verify_inst($opcode_gather_pred{$mnemonic}|($xs<<22)|$zt|($pg<<10)|($1<<16)|($xn<<5),$inst);
		} elsif($addr =~ m/\w+\s*,\s*#?([0-9]+)/o) {
			# base + immediate offset
			return &verify_inst($opcode_pred{$mnemonic}|($size<<21)|$zt|($pg<<10)|($1<<16)|($xn<<5),$inst);
		} else {
			# base register only
			return &verify_inst($opcode_pred{$mnemonic}|($size<<21)|$zt|($pg<<10)|($xn<<5),$inst);
		}
	} elsif ($arg =~ m/z([0-9]+)\.([bhsd]),\s*p([0-9]+)\/([mz]),\s*([zwx][0-9]+.*)/o) {
		# Data-processing form: "op zD.T, pG/[mz], <operands>".
		my $regd = $1;
		my $isize = $2;
		my $pg = $3;
		my $mod = $4;		# predicate qualifier: 'm' (merge) or 'z' (zero)
		my $regs = $5;

		if (($mnemonic eq "lsl") || ($mnemonic eq "lsr")) {
			# Predicated shift-by-immediate: destination must equal the
			# first source, qualifier must be /m, and the shift amount
			# must fit the element size.
			# BUGFIX: this used "$mode == 'm'" -- an undeclared variable
			# compared numerically, which is always true -- so the /m
			# requirement was never enforced.  Use the captured $mod with
			# a string compare.
			if ($regs =~ m/z([0-9]+)[^,]*(?:,\s*#?([0-9]+))?/o
				&& $regd == $1
				&& $mod eq 'm'
				&& ((8<<$tsize{$isize}) > $2)) {
				return &verify_inst($opcode_pred{$mnemonic}|$regd|($pg<<10)|&encode_shift_pred($mnemonic,$isize,$2), $inst);
			}
		} elsif($regs =~ m/[wx]([0-9]+)/o) {
			# scalar general-purpose source
			return &verify_inst($opcode_scalar_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($1<<5), $inst);
		} elsif ($regs =~ m/z([0-9]+)[^,]*(?:,\s*z([0-9]+))?/o) {
			if ($mnemonic eq "sel") {
				return &verify_inst($opcode_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($1<<5)|($2<<16), $inst);
			} elsif ($mnemonic eq "mov") {
				# predicated mov aliases with Zd also in the Zm slot
				return &verify_inst($opcode_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($1<<5)|($regd<<16), $inst);
			} elsif (length $2 > 0) {
				# two vector sources: second one goes into bits 9..5
				return &verify_inst($opcode_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($2<<5), $inst);
			} else {
				return &verify_inst($opcode_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($1<<5), $inst);
			}
		}
	} elsif ($arg =~ m/p([0-9]+)\.([bhsd]),\s*(\w+.*)/o) {
		# Predicate-destination form: "op pD.T, <operands>".
		my $pg = $1;
		my $isize = $2;
		my $regs = $3;

		if ($regs =~ m/([wx])(zr|[0-9]+),\s*[wx](zr|[0-9]+)/o) {
			# scalar pair (e.g. whilelo); sf bit selects w/x width.
			# (restored "&reg_code" here -- the text had been mangled to
			# "®_code" by an HTML-entity corruption)
			return &verify_inst($opcode_pred{$mnemonic}|($tsize{$isize}<<22)|$pg|($sf{$1}<<12)|(&reg_code($2)<<5)|(&reg_code($3)<<16), $inst);
		} elsif ($regs =~ m/p([0-9]+),\s*p([0-9]+)\.[bhsd]/o) {
			return &verify_inst($opcode_pred{$mnemonic}|($tsize{$isize}<<22)|$pg|($1<<5), $inst);
		} else {
			# pattern operand (ALL, VL1, ...)
			return &verify_inst($opcode_pred{$mnemonic}|($tsize{$isize}<<22)|$pg|($pattern{$regs}<<5), $inst);
		}
	} elsif ($arg =~ m/p([0-9]+)\.([bhsd])/o) {
		# bare predicate destination (e.g. pfalse)
		return &verify_inst($opcode_pred{$mnemonic}|$1, $inst);
	}

	sprintf "%s // fail to parse", $inst;
}
1103 | ||
# Encode the leftover SVE mnemonics that fit neither the unpredicated nor
# the predicated patterns handled above (movprfx, lasta, cntp, cnt[bhdw],
# addvl, inc[bhdw]).  Returns the instruction word via verify_inst(), or
# the original text tagged "// fail to parse" when nothing matches.
sub sve_other {
	my ($mnemonic, $arg) = @_;
	my $inst = (sprintf "%s %s", $mnemonic, $arg);

	if ($arg =~ m/x([0-9]+)[^,]*,\s*p([0-9]+)[^,]*,\s*p([0-9]+)\.([bhsd])/o) {
		# e.g. cntp xD, pG, pN.T
		return &verify_inst($opcode_pred{$mnemonic}|($tsize{$4}<<22)|$1|($2<<10)|($3<<5), $inst);
	} elsif ($arg =~ m/[wx]([0-9]+)[^,]*,\s*p([0-9]+)[^,]*,\s*z([0-9]+)\.([bhsd])/o) {
		# e.g. lasta wD/xD, pG, zN.T.
		# CLEANUP: the original captured the w/x letter with "(x|w)" and
		# OR'ed that string capture into the encoding, where it silently
		# numified to 0 (and would warn under "use warnings").  The
		# register-width letter carries no encoding bits here, so match
		# it without capturing; the emitted word is unchanged.
		return &verify_inst($opcode_pred{$mnemonic}|($tsize{$4}<<22)|($2<<10)|($3<<5)|$1, $inst);
	} elsif ($mnemonic =~ /inc[bhdw]/) {
		if ($arg =~ m/x([0-9]+)[^,]*,\s*(\w+)[^,]*,\s*MUL\s*#?([0-9]+)/o) {
			# incX xD, <pattern>, MUL #imm  (multiplier stored as imm-1)
			return &verify_inst($opcode_unpred{$mnemonic}|$1|($pattern{$2}<<5)|(2<<12)|(($3 - 1)<<16)|0xE000, $inst);
		} elsif ($arg =~ m/z([0-9]+)[^,]*,\s*(\w+)[^,]*,\s*MUL\s*#?([0-9]+)/o) {
			# incX zD, <pattern>, MUL #imm
			return &verify_inst($opcode_unpred{$mnemonic}|$1|($pattern{$2}<<5)|(($3 - 1)<<16)|0xC000, $inst);
		} elsif ($arg =~ m/x([0-9]+)/o) {
			# incX xD  (defaults: pattern ALL, multiplier 1)
			return &verify_inst($opcode_unpred{$mnemonic}|$1|(31<<5)|(0<<16)|0xE000, $inst);
		}
	} elsif ($mnemonic =~ /cnt[bhdw]/) {
		if ($arg =~ m/x([0-9]+)[^,]*,\s*(\w+)[^,]*,\s*MUL\s*#?([0-9]+)/o) {
			# cntX xD, <pattern>, MUL #imm
			return &verify_inst($opcode_unpred{$mnemonic}|$1|($pattern{$2}<<5)|(($3 - 1)<<16), $inst);
		}
	} elsif ($arg =~ m/x([0-9]+)[^,]*,\s*x([0-9]+)[^,]*,\s*#?([0-9]+)/o) {
		# e.g. addvl xD, xN, #imm
		return &verify_inst($opcode_pred{$mnemonic}|$1|($2<<16)|($3<<5), $inst);
	} elsif ($arg =~ m/z([0-9]+)[^,]*,\s*z([0-9]+)/o) {
		# e.g. movprfx zD, zN
		return &verify_inst($opcode_unpred{$mnemonic}|$1|($2<<5), $inst);
	}
	sprintf "%s // fail to parse", $inst;
}
1132 | ||
# Copy this script's own leading comment block (license/header) into the
# generated assembly, converting '#' comments into assembler '//' comments.
# Stops at the first line that is neither a comment nor blank.
# (Was a bareword filehandle with an unchecked 2-arg open; now a checked
# 3-arg open on a lexical handle -- same output.)
open my $self, '<', $0 or die "can't open $0: $!";
while (<$self>) {
	next if (/^#!/);			# skip the shebang line
	last if (!s/^#/\/\// and !/^$/);	# end of the header comment block
	print;
}
close $self;
1140 | ||
# When encoder debugging is enabled, emit the runtime self-test verifier.
&create_verifier() if ($debug_encoder == 1);
1144 | ||
# Post-process the accumulated perlasm text line by line: evaluate embedded
# Perl snippets, then rewrite SVE mnemonics through the sve_* encoders above
# (which replace them with verify_inst()'s output, or tag them
# "// fail to parse").  The substitutions are ordered from most specific to
# most general operand shape; do not reorder them.
foreach(split("\n",$code)) {
	# Evaluate `...` snippets as Perl and splice in the result.
	s/\`([^\`]*)\`/eval($1)/ge;
	# Unpredicated op: "op zN.T, <#imm | z/w/x register> ..."
	s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*[#zwx]?[0-9]+.*)/sve_unpred($1,$2)/ge;
	# Unpredicated op with a { ... } register list followed by zM.
	s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*\{.*\},\s*z[0-9]+.*)/sve_unpred($1,$2)/ge;
	# Predicated op: "op zN.T, pM..."
	s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*p[0-9].*)/sve_pred($1,$2)/ge;
	# Predicated load-and-replicate: "ldNr[bhwd] { zN... }, pM..."
	s/\b(\w+[1-4]r[bhwd])\s+(\{\s*z[0-9]+.*\},\s*p[0-9]+.*)/sve_pred($1,$2)/ge;
	# Predicated load/store: "ldN[bhwd]/stN[bhwd] { zN... }, pM..."
	s/\b(\w+[1-4][bhwd])\s+(\{\s*z[0-9]+.*\},\s*p[0-9]+.*)/sve_pred($1,$2)/ge;
	# Ops whose first operand is a predicate register: "op pN.T, ..."
	s/\b(\w+)\s+(p[0-9]+\.[bhsd].*)/sve_pred($1,$2)/ge;
	# Remaining special mnemonics handled by sve_other().
	s/\b(movprfx|lasta|cntp|cnt[bhdw]|addvl|inc[bhdw])\s+((x|z|w).*)/sve_other($1,$2)/ge;
	print $_,"\n";
}

# STDOUT is piped to the xlate post-processor (see HEAD); a failed close
# here means the pipeline itself failed, so die loudly.
close STDOUT or die "error closing STDOUT: $!";