#! /usr/bin/env perl
# Copyright 2022 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
#
# ChaCha20 for ARMv8 via SVE
#
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
  $arg = "#$arg" if ($arg*1 eq $arg);
  $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
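# Standard perlasm scaffolding: a call to any sub that is not defined in this
# file falls through to AUTOLOAD, which emits it as a single assembly line,
# with the sub name as the mnemonic ("_" becomes ".") and a numeric final
# argument prefixed with "#".  Illustrative example (not an instruction this
# particular file emits): &add("x0","x1",4) would append "\tadd\tx0,x1,#4\n"
# to $code.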

my ($outp,$inp,$len,$key,$ctr) = map("x$_",(0..4));
my ($veclen_w,$veclen,$blocks) = ("w5","x5","x6");
my ($sve2flag) = ("x7");
my ($wctr, $xctr) = ("w8", "x8");
my ($tmpw0,$tmp0,$tmpw1,$tmp1) = ("w9","x9", "w10","x10");
my ($tmp,$tmpw) = ("x10", "w10");
my ($counter) = ("x11");
my @K=map("x$_",(12..15,19..22));
my @KL=map("w$_",(12..15,19..22));
my @mx=map("z$_",(0..15));
my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
    $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3) = @mx;
my ($zctr) = ("z16");
my @xt=map("z$_",(17..24));
my @perm=map("z$_",(25..30));
my ($rot8) = ("z31");
my ($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7)=@xt;
# In SVE mode only bak0 ~ bak9 are usable (the rest are needed as scratch
# registers); in SVE2 mode all 15 backup registers are used.  There is no
# bak12: state word 12 is the block counter, which lives in $zctr instead.
my ($bak0,$bak1,$bak2,$bak3,$bak4,$bak5,$bak6,$bak7,$bak8,$bak9,$bak10,$bak11,$bak13,$bak14,$bak15)=(@perm[0],@perm[1],@perm[2],@perm[3],@perm[4],@perm[5],$xt4,$xt5,$xt6,$xt7,$xt0,$xt1,$xt2,$xt3,$rot8);
my $debug_encoder=0;
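# A sketch of the data layout used below: each z register in @mx holds one of
# the sixteen ChaCha state words, with vector lane i carrying that word for
# the i-th block of the batch, so one pass over the round function computes
# $veclen (CNTW) 64-byte blocks in parallel.  $zctr holds the per-lane block
# counters ctr, ctr+1, ctr+2, ... (see "index $zctr.s,$wctr,1" below).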

sub SVE_ADD() {
    my $x = shift;
    my $y = shift;

    $code.=<<___;
        add @mx[$x].s,@mx[$x].s,@mx[$y].s
___
    if (@_) {
        &SVE_ADD(@_);
    }
}

sub SVE_EOR() {
    my $x = shift;
    my $y = shift;

    $code.=<<___;
        eor @mx[$x].d,@mx[$x].d,@mx[$y].d
___
    if (@_) {
        &SVE_EOR(@_);
    }
}

sub SVE_LSL() {
    my $bits = shift;
    my $x = shift;
    my $y = shift;
    my $next = $x + 1;

    $code.=<<___;
        lsl @xt[$x].s,@mx[$y].s,$bits
___
    if (@_) {
        &SVE_LSL($bits,$next,@_);
    }
}

sub SVE_LSR() {
    my $bits = shift;
    my $x = shift;

    $code.=<<___;
        lsr @mx[$x].s,@mx[$x].s,$bits
___
    if (@_) {
        &SVE_LSR($bits,@_);
    }
}

sub SVE_ORR() {
    my $x = shift;
    my $y = shift;
    my $next = $x + 1;

    $code.=<<___;
        orr @mx[$y].d,@mx[$y].d,@xt[$x].d
___
    if (@_) {
        &SVE_ORR($next,@_);
    }
}

sub SVE_REV16() {
    my $x = shift;

    $code.=<<___;
        revh @mx[$x].s,p0/m,@mx[$x].s
___
    if (@_) {
        &SVE_REV16(@_);
    }
}

sub SVE_ROT8() {
    my $x = shift;

    $code.=<<___;
        tbl @mx[$x].b,{@mx[$x].b},$rot8.b
___
    if (@_) {
        &SVE_ROT8(@_);
    }
}

sub SVE2_XAR() {
    my $bits = shift;
    my $x = shift;
    my $y = shift;
    my $rbits = 32-$bits;

    $code.=<<___;
        xar @mx[$x].s,@mx[$x].s,@mx[$y].s,$rbits
___
    if (@_) {
        &SVE2_XAR($bits,@_);
    }
}
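# SVE2's XAR fuses the XOR-and-rotate of a quarter-round step:
# "xar zd.s,zd.s,zm.s,#r" computes zd = ROR32(zd ^ zm, r) per lane.  ChaCha
# rotates left by $bits, and for 32-bit lanes rotl(x,n) == rotr(x,32-n),
# hence $rbits above.  Without SVE2 the same step takes several instructions,
# which is what SVE_EOR plus SVE_LSL/SVE_LSR/SVE_ORR (or SVE_REV16/SVE_ROT8
# for the 16- and 8-bit rotates) provide in the fallback paths below.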

sub SVE_QR_GROUP() {
    my $have_sve2 = shift;
    my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$a3,$b3,$c3,$d3) = @_;

    &SVE_ADD($a0,$b0,$a1,$b1,$a2,$b2,$a3,$b3);
    if ($have_sve2 == 0) {
        &SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
        &SVE_REV16($d0,$d1,$d2,$d3);
    } else {
        &SVE2_XAR(16,$d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
    }

    &SVE_ADD($c0,$d0,$c1,$d1,$c2,$d2,$c3,$d3);
    if ($have_sve2 == 0) {
        &SVE_EOR($b0,$c0,$b1,$c1,$b2,$c2,$b3,$c3);
        &SVE_LSL(12,0,$b0,$b1,$b2,$b3);
        &SVE_LSR(20,$b0,$b1,$b2,$b3);
        &SVE_ORR(0,$b0,$b1,$b2,$b3);
    } else {
        &SVE2_XAR(12,$b0,$c0,$b1,$c1,$b2,$c2,$b3,$c3);
    }

    &SVE_ADD($a0,$b0,$a1,$b1,$a2,$b2,$a3,$b3);
    if ($have_sve2 == 0) {
        &SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
        &SVE_ROT8($d0,$d1,$d2,$d3);
    } else {
        &SVE2_XAR(8,$d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
    }

    &SVE_ADD($c0,$d0,$c1,$d1,$c2,$d2,$c3,$d3);
    if ($have_sve2 == 0) {
        &SVE_EOR($b0,$c0,$b1,$c1,$b2,$c2,$b3,$c3);
        &SVE_LSL(7,0,$b0,$b1,$b2,$b3);
        &SVE_LSR(25,$b0,$b1,$b2,$b3);
        &SVE_ORR(0,$b0,$b1,$b2,$b3);
    } else {
        &SVE2_XAR(7,$b0,$c0,$b1,$c1,$b2,$c2,$b3,$c3);
    }
}
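# SVE_QR_GROUP emits four ChaCha quarter-rounds, interleaved instruction by
# instruction to hide latency.  Each quarter-round on words (a,b,c,d) is the
# standard
#       a += b;  d ^= a;  d <<<= 16;
#       c += d;  b ^= c;  b <<<= 12;
#       a += b;  d ^= a;  d <<<=  8;
#       c += d;  b ^= c;  b <<<=  7;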

sub SVE_INNER_BLOCK() {
    $code.=<<___;
        mov $counter,#10
1:
.align 5
___
    &SVE_QR_GROUP(0,0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15);
    &SVE_QR_GROUP(0,0,5,10,15,1,6,11,12,2,7,8,13,3,4,9,14);
    $code.=<<___;
        subs $counter,$counter,1
        b.ne 1b
___
}

sub SVE2_INNER_BLOCK() {
    $code.=<<___;
        mov $counter,#10
1:
.align 5
___
    &SVE_QR_GROUP(1,0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15);
    &SVE_QR_GROUP(1,0,5,10,15,1,6,11,12,2,7,8,13,3,4,9,14);
    $code.=<<___;
        subs $counter,$counter,1
        b.ne 1b
___
}
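# Ten iterations of the loop above yield ChaCha20's 20 rounds: each pass
# emits one "column" round over tuples (0,4,8,12)..(3,7,11,15) followed by
# one "diagonal" round over (0,5,10,15)..(3,4,9,14), i.e. one double round.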

sub load() {
    my $x0 = shift;
    my $x1 = shift;
    my $x2 = shift;
    my $x3 = shift;
    my $x4 = shift;
    my $x5 = shift;
    my $x6 = shift;
    my $x7 = shift;

    $code.=<<___;
        ld1w {$x0.s},p0/z,[$inp]
        ld1w {$x1.s},p0/z,[$inp, #1, MUL VL]
        ld1w {$x2.s},p0/z,[$inp, #2, MUL VL]
        ld1w {$x3.s},p0/z,[$inp, #3, MUL VL]
        ld1w {$x4.s},p0/z,[$inp, #4, MUL VL]
        ld1w {$x5.s},p0/z,[$inp, #5, MUL VL]
        ld1w {$x6.s},p0/z,[$inp, #6, MUL VL]
        ld1w {$x7.s},p0/z,[$inp, #7, MUL VL]
        addvl $inp,$inp,#8
___
}

sub store() {
    my $x0 = shift;
    my $x1 = shift;
    my $x2 = shift;
    my $x3 = shift;
    my $x4 = shift;
    my $x5 = shift;
    my $x6 = shift;
    my $x7 = shift;

    $code.=<<___;
        st1w {$x0.s},p0,[$outp]
        st1w {$x1.s},p0,[$outp, #1, MUL VL]
        st1w {$x2.s},p0,[$outp, #2, MUL VL]
        st1w {$x3.s},p0,[$outp, #3, MUL VL]
        st1w {$x4.s},p0,[$outp, #4, MUL VL]
        st1w {$x5.s},p0,[$outp, #5, MUL VL]
        st1w {$x6.s},p0,[$outp, #6, MUL VL]
        st1w {$x7.s},p0,[$outp, #7, MUL VL]
        addvl $outp,$outp,#8
___
}

sub transpose() {
    my $xa = shift;
    my $xb = shift;
    my $xc = shift;
    my $xd = shift;

    $code.=<<___;
        zip1 $xt0.s,$xa.s,$xb.s
        zip2 $xt1.s,$xa.s,$xb.s
        zip1 $xt2.s,$xc.s,$xd.s
        zip2 $xt3.s,$xc.s,$xd.s
        zip1 $xa.d,$xt0.d,$xt2.d
        zip2 $xb.d,$xt0.d,$xt2.d
        zip1 $xc.d,$xt1.d,$xt3.d
        zip2 $xd.d,$xt1.d,$xt3.d
___
}
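# Standard 4x4 transpose built from zips: interleaving at .s and then at .d
# granularity turns four vectors of per-lane state words into vectors holding
# consecutive words of one block (within each 128-bit chunk), ready to be
# XORed against byte-contiguous input.  Sketch for one 128-bit chunk, rows
# a=(a0 a1 a2 a3) .. d=(d0 d1 d2 d3):
#       zip1/zip2 .s:  t0=(a0 b0 a1 b1)  t1=(a2 b2 a3 b3)
#                      t2=(c0 d0 c1 d1)  t3=(c2 d2 c3 d3)
#       zip1/zip2 .d:  a=(a0 b0 c0 d0)   b=(a1 b1 c1 d1)
#                      c=(a2 b2 c2 d2)   d=(a3 b3 c3 d3)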

sub SVE_ADD_STATES() {
    $code.=<<___;
        lsr $tmp1,@K[5],#32
        dup $xt0.s,@KL[5]
        dup $xt1.s,$tmpw1
        add @mx[0].s,@mx[0].s,$bak0.s
        add @mx[1].s,@mx[1].s,$bak1.s
        add @mx[2].s,@mx[2].s,$bak2.s
        add @mx[3].s,@mx[3].s,$bak3.s
        add @mx[4].s,@mx[4].s,$bak4.s
        add @mx[5].s,@mx[5].s,$bak5.s
        add @mx[6].s,@mx[6].s,$bak6.s
        add @mx[7].s,@mx[7].s,$bak7.s
        add @mx[8].s,@mx[8].s,$bak8.s
        add @mx[9].s,@mx[9].s,$bak9.s
        lsr $tmp0,@K[6],#32
        dup $xt4.s,$tmpw0
        lsr $tmp1,@K[7],#32
        dup $xt5.s,@KL[7]
        dup $xt6.s,$tmpw1
        add @mx[10].s,@mx[10].s,$xt0.s
        add @mx[11].s,@mx[11].s,$xt1.s
        add @mx[12].s,@mx[12].s,$zctr.s
        add @mx[13].s,@mx[13].s,$xt4.s
        add @mx[14].s,@mx[14].s,$xt5.s
        add @mx[15].s,@mx[15].s,$xt6.s
___
}

sub SVE2_ADD_STATES() {
    $code.=<<___;
        add @mx[0].s,@mx[0].s,$bak0.s
        add @mx[1].s,@mx[1].s,$bak1.s
        add @mx[2].s,@mx[2].s,$bak2.s
        add @mx[3].s,@mx[3].s,$bak3.s
        add @mx[4].s,@mx[4].s,$bak4.s
        add @mx[5].s,@mx[5].s,$bak5.s
        add @mx[6].s,@mx[6].s,$bak6.s
        add @mx[7].s,@mx[7].s,$bak7.s
        add @mx[8].s,@mx[8].s,$bak8.s
        add @mx[9].s,@mx[9].s,$bak9.s
        add @mx[10].s,@mx[10].s,$bak10.s
        add @mx[11].s,@mx[11].s,$bak11.s
        add @mx[12].s,@mx[12].s,$zctr.s
        add @mx[13].s,@mx[13].s,$bak13.s
        add @mx[14].s,@mx[14].s,$bak14.s
        add @mx[15].s,@mx[15].s,$bak15.s
___
}
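# Final step of the ChaCha block function: add the (broadcast) initial state
# back into the permuted working state.  In the SVE path only words 0-9 were
# kept in backup registers, so words 10-15 are re-expanded from the scalar @K
# values here; the SVE2 path kept all of them live ($bak10..$bak15).  Word 12
# is the block counter and is added from $zctr in both variants.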

sub SVE_TRANSFORMS() {
    &transpose($xa0,$xb0,$xc0,$xd0);
    &transpose($xa1,$xb1,$xc1,$xd1);
    &transpose($xa2,$xb2,$xc2,$xd2);
    &transpose($xa3,$xb3,$xc3,$xd3);
    &transpose($xa0,$xa1,$xa2,$xa3);
    &transpose($xb0,$xb1,$xb2,$xb3);
    &load($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7);
    $code.=<<___;
        eor $xa0.d,$xa0.d,$xt0.d
        eor $xa1.d,$xa1.d,$xt1.d
        eor $xa2.d,$xa2.d,$xt2.d
        eor $xa3.d,$xa3.d,$xt3.d
        eor $xb0.d,$xb0.d,$xt4.d
        eor $xb1.d,$xb1.d,$xt5.d
        eor $xb2.d,$xb2.d,$xt6.d
        eor $xb3.d,$xb3.d,$xt7.d
___
    &transpose($xc0,$xc1,$xc2,$xc3);
    &store($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3);
    &transpose($xd0,$xd1,$xd2,$xd3);
    &load($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7);
    $code.=<<___;
        eor $xc0.d,$xc0.d,$xt0.d
        eor $xc1.d,$xc1.d,$xt1.d
        eor $xc2.d,$xc2.d,$xt2.d
        eor $xc3.d,$xc3.d,$xt3.d
        eor $xd0.d,$xd0.d,$xt4.d
        eor $xd1.d,$xd1.d,$xt5.d
        eor $xd2.d,$xd2.d,$xt6.d
        eor $xd3.d,$xd3.d,$xt7.d
___
    &store($xc0,$xc1,$xc2,$xc3,$xd0,$xd1,$xd2,$xd3);
    $code.=<<___;
        incw $xctr, ALL, MUL #1
        incw $zctr.s, ALL, MUL #1
___
}
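# After the two transpose stages the keystream is XORed with the input and
# stored, completing $veclen blocks.  The two INCW instructions then advance
# the scalar and the per-lane block counters by the vector length in words,
# i.e. by the number of blocks just processed.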

sub SVE_LOAD_STATES() {
    $code.=<<___;
        lsr $tmp0,@K[0],#32
        dup @mx[0].s,@KL[0]
        dup $bak0.s,@KL[0]
        dup @mx[1].s,$tmpw0
        dup $bak1.s,$tmpw0
        lsr $tmp1,@K[1],#32
        dup @mx[2].s,@KL[1]
        dup $bak2.s,@KL[1]
        dup @mx[3].s,$tmpw1
        dup $bak3.s,$tmpw1
        lsr $tmp0,@K[2],#32
        dup @mx[4].s,@KL[2]
        dup $bak4.s,@KL[2]
        dup @mx[5].s,$tmpw0
        dup $bak5.s,$tmpw0
        lsr $tmp1,@K[3],#32
        dup @mx[6].s,@KL[3]
        dup $bak6.s,@KL[3]
        dup @mx[7].s,$tmpw1
        dup $bak7.s,$tmpw1
        lsr $tmp0,@K[4],#32
        dup @mx[8].s,@KL[4]
        dup $bak8.s,@KL[4]
        dup @mx[9].s,$tmpw0
        dup $bak9.s,$tmpw0
        lsr $tmp1,@K[5],#32
        dup @mx[10].s,@KL[5]
        dup @mx[11].s,$tmpw1
        orr @mx[12].d,$zctr.d,$zctr.d
        lsr $tmp0,@K[6],#32
        dup @mx[13].s,$tmpw0
        lsr $tmp1,@K[7],#32
        dup @mx[14].s,@KL[7]
        dup @mx[15].s,$tmpw1
___
}

sub SVE2_LOAD_STATES() {
    $code.=<<___;
        lsr $tmp0,@K[0],#32
        dup @mx[0].s,@KL[0]
        dup $bak0.s,@KL[0]
        dup @mx[1].s,$tmpw0
        dup $bak1.s,$tmpw0
        lsr $tmp1,@K[1],#32
        dup @mx[2].s,@KL[1]
        dup $bak2.s,@KL[1]
        dup @mx[3].s,$tmpw1
        dup $bak3.s,$tmpw1
        lsr $tmp0,@K[2],#32
        dup @mx[4].s,@KL[2]
        dup $bak4.s,@KL[2]
        dup @mx[5].s,$tmpw0
        dup $bak5.s,$tmpw0
        lsr $tmp1,@K[3],#32
        dup @mx[6].s,@KL[3]
        dup $bak6.s,@KL[3]
        dup @mx[7].s,$tmpw1
        dup $bak7.s,$tmpw1
        lsr $tmp0,@K[4],#32
        dup @mx[8].s,@KL[4]
        dup $bak8.s,@KL[4]
        dup @mx[9].s,$tmpw0
        dup $bak9.s,$tmpw0
        lsr $tmp1,@K[5],#32
        dup @mx[10].s,@KL[5]
        dup $bak10.s,@KL[5]
        dup @mx[11].s,$tmpw1
        dup $bak11.s,$tmpw1
        orr @mx[12].d,$zctr.d,$zctr.d
        lsr $tmp0,@K[6],#32
        dup @mx[13].s,$tmpw0
        dup $bak13.s,$tmpw0
        lsr $tmp1,@K[7],#32
        dup @mx[14].s,@KL[7]
        dup $bak14.s,@KL[7]
        dup @mx[15].s,$tmpw1
        dup $bak15.s,$tmpw1
___
}
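# State setup: each 32-bit half of the scalar @K words is broadcast (DUP)
# across a z register, so lane i of @mx[] starts as the initial state of
# block ctr+i; @mx[12] instead takes the per-lane counters from $zctr.  The
# SVE2 variant additionally snapshots words 10, 11, 13, 14 and 15 into backup
# registers so SVE2_ADD_STATES need not re-derive them from @K.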

sub sve_handle_blocks() {
    $code.=<<___;
        cbz $sve2flag,.sve_inner
___
    &SVE2_LOAD_STATES();
    &SVE2_INNER_BLOCK();
    &SVE2_ADD_STATES();
    $code.=<<___;
        b .fini_inner
.sve_inner:
___
    &SVE_LOAD_STATES();
    &SVE_INNER_BLOCK();
    &SVE_ADD_STATES();
    $code.=<<___;
.fini_inner:
___
    &SVE_TRANSFORMS();
}

sub chacha20_process() {
    $code.=<<___;
.align 5
.Loop:
        cmp $blocks,$veclen
        b.lt .Lexit
___
    &sve_handle_blocks();
    $code.=<<___;
        subs $blocks,$blocks,$veclen
        b.gt .Loop
.Lexit:
___
}
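# Main loop: each iteration consumes $veclen 64-byte blocks; once fewer than
# $veclen whole blocks remain the loop exits, and the epilogue below returns
# the number of unprocessed bytes in $len (x0).  The C caller is then
# presumably expected to finish that tail with another ChaCha implementation.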

{{{
$code.=<<___;
#include "arm_arch.h"

.arch armv8-a

.extern OPENSSL_armcap_P
.hidden OPENSSL_armcap_P

.text
.align 5
.Lchacha20_consts:
.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
.Lrot8:
.word 0x02010003,0x04040404,0x02010003,0x04040404
.globl ChaCha20_ctr32_sve
.type ChaCha20_ctr32_sve,%function
.align 5
ChaCha20_ctr32_sve:
        AARCH64_VALID_CALL_TARGET
        cntw $veclen, ALL, MUL #1
        lsr $blocks,$len,#6
        cmp $blocks,$veclen
        b.lt .Lreturn
        mov $sve2flag,0
        adrp $tmp,OPENSSL_armcap_P
        ldr $tmpw,[$tmp,#:lo12:OPENSSL_armcap_P]
        tst $tmpw,#ARMV8_SVE2
        b.eq 1f
        mov $sve2flag,1
        b 2f
1:
        cmp $veclen,4
        b.le .Lreturn
        adr $tmp,.Lrot8
        ldp $tmpw0,$tmpw1,[$tmp]
        index $rot8.s,$tmpw0,$tmpw1
2:
        stp d8,d9,[sp,-96]!
        stp d10,d11,[sp,16]
        stp d12,d13,[sp,32]
        stp d14,d15,[sp,48]
        stp x19,x20,[sp,64]
        stp x21,x22,[sp,80]
        adr $tmp,.Lchacha20_consts
        ldp @K[0],@K[1],[$tmp]
        ldp @K[2],@K[3],[$key]
        ldp @K[4],@K[5],[$key, 16]
        ldp @K[6],@K[7],[$ctr]
        ldr $wctr,[$ctr]
        index $zctr.s,$wctr,1
        ptrues p0.s,ALL
#ifdef __AARCH64EB__
        ror @K[2],@K[2],#32
        ror @K[3],@K[3],#32
        ror @K[4],@K[4],#32
        ror @K[5],@K[5],#32
        ror @K[6],@K[6],#32
        ror @K[7],@K[7],#32
#endif
___
&chacha20_process();
$code.=<<___;
        ldp d10,d11,[sp,16]
        ldp d12,d13,[sp,32]
        ldp d14,d15,[sp,48]
        ldp x19,x20,[sp,64]
        ldp x21,x22,[sp,80]
        ldp d8,d9,[sp],96
        str $wctr,[$ctr]
        and $len,$len,#63
        add $len,$len,$blocks,lsl #6
.Lreturn:
        ret
.size ChaCha20_ctr32_sve,.-ChaCha20_ctr32_sve
___

}}}
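# Notes on the constants above: .Lchacha20_consts is "expa" "nd 3" "2-by"
# "te k" packed into two 64-bit words, and .Lrot8 seeds INDEX to build the
# TBL byte-shuffle mask for the SVE rotate-by-8 path.  With start 0x02010003
# and step 0x04040404, element i of $rot8 holds byte indices (3,0,1,2)+4*i,
# and TBL with that mask maps the bytes (b0,b1,b2,b3) of every 32-bit lane to
# (b3,b0,b1,b2), which is a rotate left by 8.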

########################################
{
my %opcode_unpred = (
    "movprfx" => 0x0420BC00,
    "eor" => 0x04a03000,
    "add" => 0x04200000,
    "orr" => 0x04603000,
    "lsl" => 0x04209C00,
    "lsr" => 0x04209400,
    "incw" => 0x04B0C000,
    "xar" => 0x04203400,
    "zip1" => 0x05206000,
    "zip2" => 0x05206400,
    "uzp1" => 0x05206800,
    "uzp2" => 0x05206C00,
    "index" => 0x04204C00,
    "mov" => 0x05203800,
    "dup" => 0x05203800,
    "cntw" => 0x04A0E000,
    "tbl" => 0x05203000);

my %opcode_imm_unpred = (
    "dup" => 0x2538C000,
    "index" => 0x04204400);

my %opcode_scalar_pred = (
    "mov" => 0x0528A000,
    "cpy" => 0x0528A000,
    "st4w" => 0xE5606000,
    "st1w" => 0xE5004000,
    "ld1w" => 0xA5404000);

my %opcode_gather_pred = (
    "ld1w" => 0x85204000);

my %opcode_pred = (
    "eor" => 0x04190000,
    "add" => 0x04000000,
    "orr" => 0x04180000,
    "whilelo" => 0x25200C00,
    "whilelt" => 0x25200400,
    "cntp" => 0x25208000,
    "addvl" => 0x04205000,
    "lsl" => 0x04038000,
    "lsr" => 0x04018000,
    "sel" => 0x0520C000,
    "mov" => 0x0520C000,
    "ptrue" => 0x2518E000,
    "pfalse" => 0x2518E400,
    "ptrues" => 0x2519E000,
    "pnext" => 0x2519C400,
    "ld4w" => 0xA560E000,
    "st4w" => 0xE570E000,
    "st1w" => 0xE500E000,
    "ld1w" => 0xA540A000,
    "ld1rw" => 0x8540C000,
    "revh" => 0x05258000);

my %tsize = (
    'b' => 0,
    'h' => 1,
    's' => 2,
    'd' => 3);

my %sf = (
    "w" => 0,
    "x" => 1);

my %pattern = (
    "POW2" => 0,
    "VL1" => 1,
    "VL2" => 2,
    "VL3" => 3,
    "VL4" => 4,
    "VL5" => 5,
    "VL6" => 6,
    "VL7" => 7,
    "VL8" => 8,
    "VL16" => 9,
    "VL32" => 10,
    "VL64" => 11,
    "VL128" => 12,
    "VL256" => 13,
    "MUL4" => 29,
    "MUL3" => 30,
    "ALL" => 31);
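# Everything below is a tiny special-purpose assembler: the SVE/SVE2
# instructions used above are not understood by older binutils, so each
# mnemonic is pattern-matched and hand-encoded into a ".inst 0x...." word
# from the base opcodes in the tables above plus register/size/immediate
# fields.  Worked example (derived from %opcode_unpred): "add z0.s,z0.s,z1.s"
# encodes as 0x04200000 | (size 2 << 22) | (zm 1 << 16) | (zn 0 << 5) | zd 0
# = 0x04A10000.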

sub create_verifier {
    my $filename="./compile_sve.sh";

    my $scripts = <<___;
#! /bin/bash
set -e
CROSS_COMPILE=\${CROSS_COMPILE:-'aarch64-none-linux-gnu-'}

[ -z "\$1" ] && exit 1
ARCH=`uname -p | xargs echo -n`

# gcc-10 or later is needed to compile SVE code;
# change this according to your system during debugging
if [ \$ARCH == 'aarch64' ]; then
    CC=gcc-11
    OBJDUMP=objdump
else
    CC=\${CROSS_COMPILE}gcc
    OBJDUMP=\${CROSS_COMPILE}objdump
fi
TMPFILE=/tmp/\$\$
cat > \$TMPFILE.c << EOF
extern __attribute__((noinline, section("disasm_output"))) void dummy_func()
{
    asm("\$@\\t\\n");
}
int main(int argc, char *argv[])
{
}
EOF
\$CC -march=armv8.2-a+sve+sve2 -o \$TMPFILE.out \$TMPFILE.c
\$OBJDUMP -d \$TMPFILE.out | awk -F"\\n" -v RS="\\n\\n" '\$1 ~ /dummy_func/' | awk 'FNR == 2 {printf "%s",\$2}'
rm \$TMPFILE.c \$TMPFILE.out
___
    open(FH, '>', $filename) or die $!;
    print FH $scripts;
    close(FH);
    system("chmod a+x ./compile_sve.sh");
}

sub compile_sve {
    return `./compile_sve.sh '@_'`
}

sub verify_inst {
    my ($code,$inst)=@_;
    my $hexcode = (sprintf "%08x", $code);

    if ($debug_encoder == 1) {
        my $expect=&compile_sve($inst);
        if ($expect ne $hexcode) {
            return (sprintf "%s // Encode Error! expect [%s] actual [%s]", $inst, $expect, $hexcode);
        }
    }
    return (sprintf ".inst\t0x%s\t//%s", $hexcode, $inst);
}
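# With $debug_encoder set, every hand-encoded word is cross-checked: the
# mnemonic is assembled by the generated compile_sve.sh helper (an external
# assembler plus objdump) and the resulting encoding must match ours,
# otherwise an "Encode Error!" marker is emitted into the output.  In normal
# builds only the ".inst 0x...\t//mnemonic" line is produced.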

sub reg_code {
    my $code = shift;

    # "zr" is the zero register, encoded as 31.  (String "eq" here: a numeric
    # "==" would also be true for register 0, since "zr" numifies to 0.)
    if ($code eq "zr") {
        return "31";
    }
    return $code;
}

sub encode_size_imm() {
    my ($mnemonic, $isize, $const)=@_;
    my $esize = (8<<$tsize{$isize});
    my $tsize_imm = $esize + $const;

    if ($mnemonic eq "lsr" || $mnemonic eq "xar") {
        $tsize_imm = 2*$esize - $const;
    }
    return (($tsize_imm>>5)<<22)|(($tsize_imm&0x1f)<<16);
}
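# SVE shift-by-immediate encodings pack the element size and shift amount
# into a single tsize:imm field: left shifts encode esize + shift, right
# shifts (lsr, and xar's rotate) encode 2*esize - shift.  Worked example for
# "lsr z1.s,z1.s,#20": esize = 32, tsize_imm = 64 - 20 = 44 = 0b101100, so
# bit 22 gets 44>>5 = 1 and bits 20:16 get 44 & 0x1f = 12.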

sub encode_shift_pred() {
    my ($mnemonic, $isize, $const)=@_;
    my $esize = (8<<$tsize{$isize});
    my $tsize_imm = $esize + $const;

    if ($mnemonic eq "lsr") {
        $tsize_imm = 2*$esize - $const;
    }
    return (($tsize_imm>>5)<<22)|(($tsize_imm&0x1f)<<5);
}

sub sve_unpred {
    my ($mnemonic,$arg)=@_;
    my $inst = (sprintf "%s %s", $mnemonic,$arg);

    if ($arg =~ m/z([0-9]+)\.([bhsd]),\s*\{\s*z([0-9]+)\.[bhsd].*\},\s*z([0-9]+)\.[bhsd].*/o) {
        return &verify_inst($opcode_unpred{$mnemonic}|$1|($3<<5)|($tsize{$2}<<22)|($4<<16),
                            $inst)
    } elsif ($arg =~ m/z([0-9]+)\.([bhsd]),\s*([zwx][0-9]+.*)/o) {
        my $regd = $1;
        my $isize = $2;
        my $regs=$3;

        if (($mnemonic eq "lsl") || ($mnemonic eq "lsr")) {
            if ($regs =~ m/z([0-9]+)[^,]*(?:,\s*#?([0-9]+))?/o
                && ((8<<$tsize{$isize}) > $2)) {
                return &verify_inst($opcode_unpred{$mnemonic}|$regd|($1<<5)|&encode_size_imm($mnemonic,$isize,$2),
                                    $inst);
            }
        } elsif($regs =~ m/[wx]([0-9]+),\s*[wx]([0-9]+)/o) {
            return &verify_inst($opcode_unpred{$mnemonic}|$regd|($tsize{$isize}<<22)|($1<<5)|($2<<16), $inst);
        } elsif ($regs =~ m/[wx]([0-9]+),\s*#?([0-9]+)/o) {
            return &verify_inst($opcode_imm_unpred{$mnemonic}|$regd|($tsize{$isize}<<22)|($1<<5)|($2<<16), $inst);
        } elsif ($regs =~ m/[wx]([0-9]+)/o) {
            return &verify_inst($opcode_unpred{$mnemonic}|$regd|($tsize{$isize}<<22)|($1<<5), $inst);
        } else {
            my $encoded_size = 0;
            if (($mnemonic eq "add") || ($mnemonic =~ /zip./) || ($mnemonic =~ /uzp./) ) {
                $encoded_size = ($tsize{$isize}<<22);
            }
            if ($regs =~ m/z([0-9]+)\.[bhsd],\s*z([0-9]+)\.[bhsd],\s*([0-9]+)/o &&
                $1 == $regd) {
                return &verify_inst($opcode_unpred{$mnemonic}|$regd|($2<<5)|&encode_size_imm($mnemonic,$isize,$3), $inst);
            } elsif ($regs =~ m/z([0-9]+)\.[bhsd],\s*z([0-9]+)\.[bhsd]/o) {
                return &verify_inst($opcode_unpred{$mnemonic}|$regd|$encoded_size|($1<<5)|($2<<16), $inst);
            }
        }
    } elsif ($arg =~ m/z([0-9]+)\.([bhsd]),\s*#?([0-9]+)/o) {
        return &verify_inst($opcode_imm_unpred{$mnemonic}|$1|($3<<5)|($tsize{$2}<<22),
                            $inst)
    }
    sprintf "%s // fail to parse", $inst;
}

sub sve_pred {
    my ($mnemonic,$arg)=@_;
    my $inst = (sprintf "%s %s", $mnemonic,$arg);

    if ($arg =~ m/\{\s*z([0-9]+)\.([bhsd]).*\},\s*p([0-9]+)(\/z)?,\s*\[(\s*[xs].*)\]/o) {
        my $zt = $1;
        my $size = $tsize{$2};
        my $pg = $3;
        my $addr = $5;
        my $xn = 31;

        if ($addr =~ m/x([0-9]+)\s*/o) {
            $xn = $1;
        }

        if ($mnemonic =~m/ld1r[bhwd]/o) {
            $size = 0;
        }
        if ($addr =~ m/\w+\s*,\s*x([0-9]+),.*/o) {
            return &verify_inst($opcode_scalar_pred{$mnemonic}|($size<<21)|$zt|($pg<<10)|($1<<16)|($xn<<5),$inst);
        } elsif ($addr =~ m/\w+\s*,\s*z([0-9]+)\.s,\s*([US]\w+)/o) {
            my $xs = ($2 eq "SXTW") ? 1 : 0;
            return &verify_inst($opcode_gather_pred{$mnemonic}|($xs<<22)|$zt|($pg<<10)|($1<<16)|($xn<<5),$inst);
        } elsif($addr =~ m/\w+\s*,\s*#?([0-9]+)/o) {
            return &verify_inst($opcode_pred{$mnemonic}|($size<<21)|$zt|($pg<<10)|($1<<16)|($xn<<5),$inst);
        } else {
            return &verify_inst($opcode_pred{$mnemonic}|($size<<21)|$zt|($pg<<10)|($xn<<5),$inst);
        }
    } elsif ($arg =~ m/z([0-9]+)\.([bhsd]),\s*p([0-9]+)\/([mz]),\s*([zwx][0-9]+.*)/o) {
        my $regd = $1;
        my $isize = $2;
        my $pg = $3;
        my $mod = $4;
        my $regs = $5;

        if (($mnemonic eq "lsl") || ($mnemonic eq "lsr")) {
            if ($regs =~ m/z([0-9]+)[^,]*(?:,\s*#?([0-9]+))?/o
                && $regd == $1
                && $mod eq 'm'
                && ((8<<$tsize{$isize}) > $2)) {
                return &verify_inst($opcode_pred{$mnemonic}|$regd|($pg<<10)|&encode_shift_pred($mnemonic,$isize,$2), $inst);
            }
        } elsif($regs =~ m/[wx]([0-9]+)/o) {
            return &verify_inst($opcode_scalar_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($1<<5), $inst);
        } elsif ($regs =~ m/z([0-9]+)[^,]*(?:,\s*z([0-9]+))?/o) {
            if ($mnemonic eq "sel") {
                return &verify_inst($opcode_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($1<<5)|($2<<16), $inst);
            } elsif ($mnemonic eq "mov") {
                return &verify_inst($opcode_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($1<<5)|($regd<<16), $inst);
            } elsif (defined $2) {
                return &verify_inst($opcode_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($2<<5), $inst);
            } else {
                return &verify_inst($opcode_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($1<<5), $inst);
            }
        }
    } elsif ($arg =~ m/p([0-9]+)\.([bhsd]),\s*(\w+.*)/o) {
        my $pg = $1;
        my $isize = $2;
        my $regs = $3;

        if ($regs =~ m/([wx])(zr|[0-9]+),\s*[wx](zr|[0-9]+)/o) {
            return &verify_inst($opcode_pred{$mnemonic}|($tsize{$isize}<<22)|$pg|($sf{$1}<<12)|(&reg_code($2)<<5)|(&reg_code($3)<<16), $inst);
        } elsif ($regs =~ m/p([0-9]+),\s*p([0-9]+)\.[bhsd]/o) {
            return &verify_inst($opcode_pred{$mnemonic}|($tsize{$isize}<<22)|$pg|($1<<5), $inst);
        } else {
            return &verify_inst($opcode_pred{$mnemonic}|($tsize{$isize}<<22)|$pg|($pattern{$regs}<<5), $inst);
        }
    } elsif ($arg =~ m/p([0-9]+)\.([bhsd])/o) {
        return &verify_inst($opcode_pred{$mnemonic}|$1, $inst);
    }

    sprintf "%s // fail to parse", $inst;
}

sub sve_other {
    my ($mnemonic,$arg)=@_;
    my $inst = (sprintf "%s %s", $mnemonic,$arg);

    if ($arg =~ m/x([0-9]+)[^,]*,\s*p([0-9]+)[^,]*,\s*p([0-9]+)\.([bhsd])/o) {
        return &verify_inst($opcode_pred{$mnemonic}|($tsize{$4}<<22)|$1|($2<<10)|($3<<5), $inst);
    } elsif ($mnemonic =~ /inc[bhdw]/) {
        if ($arg =~ m/x([0-9]+)[^,]*,\s*(\w+)[^,]*,\s*MUL\s*#?([0-9]+)/o) {
            return &verify_inst($opcode_unpred{$mnemonic}|$1|($pattern{$2}<<5)|(2<<12)|(($3 - 1)<<16), $inst);
        } elsif ($arg =~ m/z([0-9]+)[^,]*,\s*(\w+)[^,]*,\s*MUL\s*#?([0-9]+)/o) {
            return &verify_inst($opcode_unpred{$mnemonic}|$1|($pattern{$2}<<5)|(($3 - 1)<<16), $inst);
        } elsif ($arg =~ m/x([0-9]+)/o) {
            return &verify_inst($opcode_unpred{$mnemonic}|$1|(31<<5)|(0<<16), $inst);
        }
    } elsif ($mnemonic =~ /cnt[bhdw]/) {
        if ($arg =~ m/x([0-9]+)[^,]*,\s*(\w+)[^,]*,\s*MUL\s*#?([0-9]+)/o) {
            return &verify_inst($opcode_unpred{$mnemonic}|$1|($pattern{$2}<<5)|(($3 - 1)<<16), $inst);
        }
    } elsif ($arg =~ m/x([0-9]+)[^,]*,\s*x([0-9]+)[^,]*,\s*#?([0-9]+)/o) {
        return &verify_inst($opcode_pred{$mnemonic}|$1|($2<<16)|($3<<5), $inst);
    } elsif ($arg =~ m/z([0-9]+)[^,]*,\s*z([0-9]+)/o) {
        return &verify_inst($opcode_unpred{$mnemonic}|$1|($2<<5), $inst);
    }
    sprintf "%s // fail to parse", $inst;
}
}

open SELF,$0;
while(<SELF>) {
    next if (/^#!/);
    last if (!s/^#/\/\// and !/^$/);
    print;
}
close SELF;

if ($debug_encoder == 1) {
    &create_verifier();
}

foreach(split("\n",$code)) {
    s/\`([^\`]*)\`/eval($1)/ge;
    s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*[#zwx]?[0-9]+.*)/sve_unpred($1,$2)/ge;
    s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*\{.*\},\s*z[0-9]+.*)/sve_unpred($1,$2)/ge;
    s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*p[0-9].*)/sve_pred($1,$2)/ge;
    s/\b(\w+[1-4]r[bhwd])\s+(\{\s*z[0-9]+.*\},\s*p[0-9]+.*)/sve_pred($1,$2)/ge;
    s/\b(\w+[1-4][bhwd])\s+(\{\s*z[0-9]+.*\},\s*p[0-9]+.*)/sve_pred($1,$2)/ge;
    s/\b(\w+)\s+(p[0-9]+\.[bhsd].*)/sve_pred($1,$2)/ge;
    s/\b(movprfx|cntp|cnt[bhdw]|addvl|inc[bhdw])\s+((x|z).*)/sve_other($1,$2)/ge;
    print $_,"\n";
}
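# Order matters in the rewrite pass above: each line of $code runs through
# the substitutions in sequence, and the patterns are specific enough that a
# line is encoded exactly once (the {...}-list load/store forms and the
# predicate forms are distinguished from the plain vector forms by their
# operand shapes).  The final alternation routes the housekeeping ops
# (movprfx, cntp, cntw, addvl, incw, ...) to sve_other.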

close STDOUT or die "error closing STDOUT: $!";