git.ipfire.org Git - thirdparty/openssl.git/commitdiff
Optimize chacha20 on aarch64 by SVE2
authorDaniel Hu <Daniel.Hu@arm.com>
Wed, 25 May 2022 09:23:40 +0000 (10:23 +0100)
committerPauli <pauli@openssl.org>
Wed, 22 Jun 2022 07:07:17 +0000 (17:07 +1000)
This patch improves the existing chacha20 SVE patch by using SVE2,
an optional architecture feature of aarch64, whose XAR
instruction can improve the performance of chacha20.

Signed-off-by: Daniel Hu <Daniel.Hu@arm.com>
Reviewed-by: Tomas Mraz <tomas@openssl.org>
Reviewed-by: Paul Dale <pauli@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/18522)

crypto/chacha/asm/chacha-armv8-sve.pl

index 6080414e0d570135f1ca84b345845b366d1975e1..dfc4548a4f4485d3a635b7bf3d5e47d0479394a9 100755 (executable)
@@ -31,17 +31,25 @@ sub AUTOLOAD()              # thunk [simplified] x86-style perlasm
 }
 
 my ($outp,$inp,$len,$key,$ctr) = map("x$_",(0..4));
-my ($state) = ("x5");
-my ($veclen_w,$veclen,$blocks) = ("w6","x6","x7");
-my ($saved_outp) = ("x8");
-my ($wctr, $xctr) = ("w9", "x9");
-my @mx=map("z$_",(0..7,16..23));
+my ($veclen_w,$veclen,$blocks) = ("w5","x5","x6");
+my ($sve2flag) = ("x7");
+my ($wctr, $xctr) = ("w8", "x8");
+my ($tmpw0,$tmp0,$tmpw1,$tmp1) = ("w9","x9", "w10","x10");
+my ($tmp,$tmpw) = ("x10", "w10");
+my ($counter) = ("x11");
+my @K=map("x$_",(12..15,19..22));
+my @KL=map("w$_",(12..15,19..22));
+my @mx=map("z$_",(0..15));
 my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
     $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3) = @mx;
-my @xt=map("z$_",(24..31,8..11));
-my ($rot8) = ("z12");
-my ($zctr) = ("z13");
-my ($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7,$xt8,$xt9,$xt10,$xt11)=@xt;
+my ($zctr) = ("z16");
+my @xt=map("z$_",(17..24));
+my @perm=map("z$_",(25..30));
+my ($rot8) = ("z31");
+my ($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7)=@xt;
+# in SVE mode we can only use bak0 ~ bak9 (the rest are used as scratch registers)
+# in SVE2 we use all 15 backup registers
+my ($bak0,$bak1,$bak2,$bak3,$bak4,$bak5,$bak6,$bak7,$bak8,$bak9,$bak10,$bak11,$bak13,$bak14,$bak15)=(@perm[0],@perm[1],@perm[2],@perm[3],@perm[4],@perm[5],$xt4,$xt5,$xt6,$xt7,$xt0,$xt1,$xt2,$xt3,$rot8);
 my $debug_encoder=0;
 
 sub SVE_ADD() {
@@ -148,8 +156,12 @@ sub SVE_QR_GROUP() {
        my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$a3,$b3,$c3,$d3) = @_;
 
        &SVE_ADD($a0,$b0,$a1,$b1,$a2,$b2,$a3,$b3);
-       &SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
-       &SVE_REV16($d0,$d1,$d2,$d3);
+       if ($have_sve2 == 0) {
+               &SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
+               &SVE_REV16($d0,$d1,$d2,$d3);
+       } else {
+               &SVE2_XAR(16,$d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
+       }
 
        &SVE_ADD($c0,$d0,$c1,$d1,$c2,$d2,$c3,$d3);
        if ($have_sve2 == 0) {
@@ -162,8 +174,12 @@ sub SVE_QR_GROUP() {
        }
 
        &SVE_ADD($a0,$b0,$a1,$b1,$a2,$b2,$a3,$b3);
-       &SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
-       &SVE_ROT8($d0,$d1,$d2,$d3);
+       if ($have_sve2 == 0) {
+               &SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
+               &SVE_ROT8($d0,$d1,$d2,$d3);
+       } else {
+               &SVE2_XAR(8,$d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
+       }
 
        &SVE_ADD($c0,$d0,$c1,$d1,$c2,$d2,$c3,$d3);
        if ($have_sve2 == 0) {
@@ -178,27 +194,32 @@ sub SVE_QR_GROUP() {
 
 sub SVE_INNER_BLOCK() {
 $code.=<<___;
-       //cbnz $sve2flag, 10f
+       mov     $counter,#10
+1:
+.align 5
 ___
        &SVE_QR_GROUP(0,0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15);
        &SVE_QR_GROUP(0,0,5,10,15,1,6,11,12,2,7,8,13,3,4,9,14);
 $code.=<<___;
-       // SVE 2 not enabled until hardware available
-#if 0
-       b 11f
-10:
+       subs    $counter,$counter,1
+       b.ne    1b
 ___
-#      &SVE_QR_GROUP(1,0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15);
-#      &SVE_QR_GROUP(1,0,5,10,15,1,6,11,12,2,7,8,13,3,4,9,14);
+}
+
+sub SVE2_INNER_BLOCK() {
 $code.=<<___;
-11:
-#endif
+       mov     $counter,#10
+1:
+.align 5
+___
+       &SVE_QR_GROUP(1,0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15);
+       &SVE_QR_GROUP(1,0,5,10,15,1,6,11,12,2,7,8,13,3,4,9,14);
+$code.=<<___;
+       subs    $counter,$counter,1
+       b.ne    1b
 ___
 }
 
-{{{
-my ($dlen,$rsize,$tmp) = ("x10","x11","x12");
-
 sub load() {
        my $x0 = shift;
        my $x1 = shift;
@@ -252,72 +273,75 @@ sub transpose() {
        my $xd = shift;
 
 $code.=<<___;
-       zip1    $xt8.s,$xa.s,$xb.s
-       zip2    $xt9.s,$xa.s,$xb.s
-       zip1    $xt10.s,$xc.s,$xd.s
-       zip2    $xt11.s,$xc.s,$xd.s
-       zip1    $xa.d,$xt8.d,$xt10.d
-       zip2    $xb.d,$xt8.d,$xt10.d
-       zip1    $xc.d,$xt9.d,$xt11.d
-       zip2    $xd.d,$xt9.d,$xt11.d
+       zip1    $xt0.s,$xa.s,$xb.s
+       zip2    $xt1.s,$xa.s,$xb.s
+       zip1    $xt2.s,$xc.s,$xd.s
+       zip2    $xt3.s,$xc.s,$xd.s
+       zip1    $xa.d,$xt0.d,$xt2.d
+       zip2    $xb.d,$xt0.d,$xt2.d
+       zip1    $xc.d,$xt1.d,$xt3.d
+       zip2    $xd.d,$xt1.d,$xt3.d
 ___
 }
 
-sub add_states() {
-       my ($tmpw0,$tmpw1,$tmpw2,$tmpw3) = ("w10","w11","w12","w13");
-
+sub SVE_ADD_STATES() {
 $code.=<<___;
-       ldp     $tmpw0,$tmpw1,[$state]
-       ldp     $tmpw2,$tmpw3,[$state,#8]
-       dup     $xt0.s,$tmpw0
+       lsr     $tmp1,@K[5],#32
+       dup     $xt0.s,@KL[5]
        dup     $xt1.s,$tmpw1
-       dup     $xt2.s,$tmpw2
-       dup     $xt3.s,$tmpw3
-       ldp     $tmpw0,$tmpw1,[$state,#16]
-       ldp     $tmpw2,$tmpw3,[$state,#24]
-       add     @mx[0].s,@mx[0].s,$xt0.s
-       add     @mx[1].s,@mx[1].s,$xt1.s
-       add     @mx[2].s,@mx[2].s,$xt2.s
-       add     @mx[3].s,@mx[3].s,$xt3.s
+       add     @mx[0].s,@mx[0].s,$bak0.s
+       add     @mx[1].s,@mx[1].s,$bak1.s
+       add     @mx[2].s,@mx[2].s,$bak2.s
+       add     @mx[3].s,@mx[3].s,$bak3.s
+       add     @mx[4].s,@mx[4].s,$bak4.s
+       add     @mx[5].s,@mx[5].s,$bak5.s
+       add     @mx[6].s,@mx[6].s,$bak6.s
+       add     @mx[7].s,@mx[7].s,$bak7.s
+       add     @mx[8].s,@mx[8].s,$bak8.s
+       add     @mx[9].s,@mx[9].s,$bak9.s
+       lsr     $tmp0,@K[6],#32
        dup     $xt4.s,$tmpw0
-       dup     $xt5.s,$tmpw1
-       dup     $xt6.s,$tmpw2
-       dup     $xt7.s,$tmpw3
-       ldp     $tmpw0,$tmpw1,[$state,#32]
-       ldp     $tmpw2,$tmpw3,[$state,#40]
-       add     @mx[4].s,@mx[4].s,$xt4.s
-       add     @mx[5].s,@mx[5].s,$xt5.s
-       add     @mx[6].s,@mx[6].s,$xt6.s
-       add     @mx[7].s,@mx[7].s,$xt7.s
-       dup     $xt0.s,$tmpw0
-       dup     $xt1.s,$tmpw1
-       dup     $xt2.s,$tmpw2
-       dup     $xt3.s,$tmpw3
-       ldp     $tmpw0,$tmpw1,[$state,#48]
-       ldp     $tmpw2,$tmpw3,[$state,#56]
-       add     @mx[8].s,@mx[8].s,$xt0.s
-       add     @mx[9].s,@mx[9].s,$xt1.s
-       add     @mx[10].s,@mx[10].s,$xt2.s
-       add     @mx[11].s,@mx[11].s,$xt3.s
-       dup     $xt5.s,$tmpw1
-       dup     $xt6.s,$tmpw2
-       dup     $xt7.s,$tmpw3
+       lsr     $tmp1,@K[7],#32
+       dup     $xt5.s,@KL[7]
+       dup     $xt6.s,$tmpw1
+       add     @mx[10].s,@mx[10].s,$xt0.s
+       add     @mx[11].s,@mx[11].s,$xt1.s
+       add     @mx[12].s,@mx[12].s,$zctr.s
+       add     @mx[13].s,@mx[13].s,$xt4.s
+       add     @mx[14].s,@mx[14].s,$xt5.s
+       add     @mx[15].s,@mx[15].s,$xt6.s
+___
+}
+
+sub SVE2_ADD_STATES() {
+$code.=<<___;
+       add     @mx[0].s,@mx[0].s,$bak0.s
+       add     @mx[1].s,@mx[1].s,$bak1.s
+       add     @mx[2].s,@mx[2].s,$bak2.s
+       add     @mx[3].s,@mx[3].s,$bak3.s
+       add     @mx[4].s,@mx[4].s,$bak4.s
+       add     @mx[5].s,@mx[5].s,$bak5.s
+       add     @mx[6].s,@mx[6].s,$bak6.s
+       add     @mx[7].s,@mx[7].s,$bak7.s
+       add     @mx[8].s,@mx[8].s,$bak8.s
+       add     @mx[9].s,@mx[9].s,$bak9.s
+       add     @mx[10].s,@mx[10].s,$bak10.s
+       add     @mx[11].s,@mx[11].s,$bak11.s
        add     @mx[12].s,@mx[12].s,$zctr.s
-       add     @mx[13].s,@mx[13].s,$xt5.s
-       add     @mx[14].s,@mx[14].s,$xt6.s
-       add     @mx[15].s,@mx[15].s,$xt7.s
+       add     @mx[13].s,@mx[13].s,$bak13.s
+       add     @mx[14].s,@mx[14].s,$bak14.s
+       add     @mx[15].s,@mx[15].s,$bak15.s
 ___
 }
 
 sub SVE_TRANSFORMS() {
-       &add_states();
        &transpose($xa0,$xb0,$xc0,$xd0);
        &transpose($xa1,$xb1,$xc1,$xd1);
        &transpose($xa2,$xb2,$xc2,$xd2);
        &transpose($xa3,$xb3,$xc3,$xd3);
-       &load($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7);
        &transpose($xa0,$xa1,$xa2,$xa3);
        &transpose($xb0,$xb1,$xb2,$xb3);
+       &load($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7);
 $code.=<<___;
        eor     $xa0.d,$xa0.d,$xt0.d
        eor     $xa1.d,$xa1.d,$xt1.d
@@ -330,8 +354,8 @@ $code.=<<___;
 ___
        &transpose($xc0,$xc1,$xc2,$xc3);
        &store($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3);
-       &load($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7);
        &transpose($xd0,$xd1,$xd2,$xd3);
+       &load($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7);
 $code.=<<___;
        eor     $xc0.d,$xc0.d,$xt0.d
        eor     $xc1.d,$xc1.d,$xt1.d
@@ -348,73 +372,111 @@ $code.=<<___;
        incw    $zctr.s, ALL, MUL #1
 ___
 }
-}}}
 
 sub SVE_LOAD_STATES() {
-       my ($tmpw0,$tmpw1,$tmpw2,$tmpw3) = ("w10","w11","w12","w13");
-
 $code.=<<___;
-       // FIXME following code are not functionally necessary
-       // but appear to enhance performance
-#if 1
-       ptrues  p2.s,ALL
-       ptrues  p2.s,ALL
-       ptrues  p2.s,ALL
-       ptrues  p2.s,ALL
-       ptrues  p2.s,ALL
-       ptrues  p2.s,ALL
-#endif
+       lsr     $tmp0,@K[0],#32
+       dup     @mx[0].s,@KL[0]
+       dup     $bak0.s,@KL[0]
+       dup     @mx[1].s,$tmpw0
+       dup     $bak1.s,$tmpw0
+       lsr     $tmp1,@K[1],#32
+       dup     @mx[2].s,@KL[1]
+       dup     $bak2.s,@KL[1]
+       dup     @mx[3].s,$tmpw1
+       dup     $bak3.s,$tmpw1
+       lsr     $tmp0,@K[2],#32
+       dup     @mx[4].s,@KL[2]
+       dup     $bak4.s,@KL[2]
+       dup     @mx[5].s,$tmpw0
+       dup     $bak5.s,$tmpw0
+       lsr     $tmp1,@K[3],#32
+       dup     @mx[6].s,@KL[3]
+       dup     $bak6.s,@KL[3]
+       dup     @mx[7].s,$tmpw1
+       dup     $bak7.s,$tmpw1
+       lsr     $tmp0,@K[4],#32
+       dup     @mx[8].s,@KL[4]
+       dup     $bak8.s,@KL[4]
+       dup     @mx[9].s,$tmpw0
+       dup     $bak9.s,$tmpw0
+       lsr     $tmp1,@K[5],#32
+       dup     @mx[10].s,@KL[5]
+       dup     @mx[11].s,$tmpw1
+       orr     @mx[12].d,$zctr.d,$zctr.d
+       lsr     $tmp0,@K[6],#32
+       dup     @mx[13].s,$tmpw0
+       lsr     $tmp1,@K[7],#32
+       dup     @mx[14].s,@KL[7]
+       dup     @mx[15].s,$tmpw1
 ___
+}
+
+sub SVE2_LOAD_STATES() {
 $code.=<<___;
-       ldp     $tmpw0,$tmpw1,[$state]
-       ldp     $tmpw2,$tmpw3,[$state,#8]
-       dup     @mx[0].s,$tmpw0
-       dup     @mx[1].s,$tmpw1
-       dup     @mx[2].s,$tmpw2
-       dup     @mx[3].s,$tmpw3
-       ldp     $tmpw0,$tmpw1,[$state,#16]
-       ldp     $tmpw2,$tmpw3,[$state,#24]
-       dup     @mx[4].s,$tmpw0
-       dup     @mx[5].s,$tmpw1
-       dup     @mx[6].s,$tmpw2
-       dup     @mx[7].s,$tmpw3
-       ldp     $tmpw0,$tmpw1,[$state,#32]
-       ldp     $tmpw2,$tmpw3,[$state,#40]
-       dup     @mx[8].s,$tmpw0
-       dup     @mx[9].s,$tmpw1
-       dup     @mx[10].s,$tmpw2
-       dup     @mx[11].s,$tmpw3
-       ldp     $tmpw0,$tmpw1,[$state, #48]
-       ldp     $tmpw2,$tmpw3,[$state,#56]
-       mov     @mx[12].s,p0/m,$zctr.s
-       dup     @mx[13].s,$tmpw1
-       dup     @mx[14].s,$tmpw2
-       dup     @mx[15].s,$tmpw3
+       lsr     $tmp0,@K[0],#32
+       dup     @mx[0].s,@KL[0]
+       dup     $bak0.s,@KL[0]
+       dup     @mx[1].s,$tmpw0
+       dup     $bak1.s,$tmpw0
+       lsr     $tmp1,@K[1],#32
+       dup     @mx[2].s,@KL[1]
+       dup     $bak2.s,@KL[1]
+       dup     @mx[3].s,$tmpw1
+       dup     $bak3.s,$tmpw1
+       lsr     $tmp0,@K[2],#32
+       dup     @mx[4].s,@KL[2]
+       dup     $bak4.s,@KL[2]
+       dup     @mx[5].s,$tmpw0
+       dup     $bak5.s,$tmpw0
+       lsr     $tmp1,@K[3],#32
+       dup     @mx[6].s,@KL[3]
+       dup     $bak6.s,@KL[3]
+       dup     @mx[7].s,$tmpw1
+       dup     $bak7.s,$tmpw1
+       lsr     $tmp0,@K[4],#32
+       dup     @mx[8].s,@KL[4]
+       dup     $bak8.s,@KL[4]
+       dup     @mx[9].s,$tmpw0
+       dup     $bak9.s,$tmpw0
+       lsr     $tmp1,@K[5],#32
+       dup     @mx[10].s,@KL[5]
+       dup     $bak10.s,@KL[5]
+       dup     @mx[11].s,$tmpw1
+       dup     $bak11.s,$tmpw1
+       orr     @mx[12].d,$zctr.d,$zctr.d
+       lsr     $tmp0,@K[6],#32
+       dup     @mx[13].s,$tmpw0
+       dup     $bak13.s,$tmpw0
+       lsr     $tmp1,@K[7],#32
+       dup     @mx[14].s,@KL[7]
+       dup     $bak14.s,@KL[7]
+       dup     @mx[15].s,$tmpw1
+       dup     $bak15.s,$tmpw1
 ___
 }
 
 sub sve_handle_blocks() {
-       my ($counter) = ("x10");
-
-       &SVE_LOAD_STATES();
 $code.=<<___;
-       mov     $counter,#10
-.align 5
-1:
+       cbz     $sve2flag,.sve_inner
 ___
-
+       &SVE2_LOAD_STATES();
+       &SVE2_INNER_BLOCK();
+       &SVE2_ADD_STATES();
+$code.=<<___;
+       b       .fini_inner
+.sve_inner:
+___
+       &SVE_LOAD_STATES();
        &SVE_INNER_BLOCK();
+       &SVE_ADD_STATES();
 $code.=<<___;
-       subs    $counter,$counter,1
-       b.ne    1b
+.fini_inner:
 ___
        &SVE_TRANSFORMS();
 }
 
 sub chacha20_process() {
-       my ($counter) = ("x10");
-       my ($tmpw) = ("w11");
-
 $code.=<<___;
 .align 5
 .Loop:
@@ -430,27 +492,18 @@ ___
 }
 
 {{{
-my ($tmp,$tmpw) = ("x10", "w10");
-my ($tmpw0,$tmpw1) = ("w11", "w12");
-my ($ptr) = ("x13");
-
 $code.=<<___;
 #include "arm_arch.h"
 
 .arch   armv8-a
 
-#if 0
 .extern        OPENSSL_armcap_P
 .hidden        OPENSSL_armcap_P
-#endif
 
 .text
 .align 5
 .Lchacha20_consts:
-       .word 0x61707865
-       .word 0x3320646e
-       .word 0x79622d32
-       .word 0x6b206574
+.quad  0x3320646e61707865,0x6b20657479622d32           // endian-neutral
 .Lrot8:
        .word 0x02010003,0x04040404,0x02010003,0x04040404
 .globl ChaCha20_ctr32_sve
@@ -458,49 +511,55 @@ $code.=<<___;
 .align 5
 ChaCha20_ctr32_sve:
        AARCH64_VALID_CALL_TARGET
-       mov     $tmp, #64
-       whilelo p0.s,xzr,$tmp
-       cntp    $veclen,p0,p0.s
-       // run Neon if we only have 128-bit SVE
-       // in the future, we need to check SVE2
-       cmp     $veclen,4
-       b.le    .Lreturn
+       cntw    $veclen, ALL, MUL #1
        lsr     $blocks,$len,#6
        cmp     $blocks,$veclen
        b.lt    .Lreturn
-       stp     d8,d9,[sp,-48]!
-       stp     d10,d11,[sp,16]
-       stp     d12,d13,[sp,32]
-       sub     sp,sp,#64
-       adr     $tmp,.Lchacha20_consts
-       ld1     {v0.4s},[$tmp]
-       adr     $tmp,.Lrot8
-       ldp     $tmpw0,$tmpw1,[$tmp]
-       ld1     {v1.4s,v2.4s},[$key]
-       ld1     {v3.4s},[$ctr]
-       ldr     $wctr,[$ctr]
-       index   $zctr.s,$wctr,1
-       index   $rot8.s,$tmpw0,$tmpw1
-       st1     {v0.4s,v1.4s,v2.4s,v3.4s},[sp]
-       mov     $state,sp
-#if 0
-       // SVE2 code not enabled until we have hardware
-       // for verification
        mov     $sve2flag,0
        adrp    $tmp,OPENSSL_armcap_P
        ldr     $tmpw,[$tmp,#:lo12:OPENSSL_armcap_P]
        tst     $tmpw,#ARMV8_SVE2
        b.eq    1f
        mov     $sve2flag,1
+       b       2f
 1:
+       cmp     $veclen,4
+       b.le    .Lreturn
+       adr     $tmp,.Lrot8
+       ldp     $tmpw0,$tmpw1,[$tmp]
+       index   $rot8.s,$tmpw0,$tmpw1
+2:
+       stp     d8,d9,[sp,-96]!
+       stp     d10,d11,[sp,16]
+       stp     d12,d13,[sp,32]
+       stp     d14,d15,[sp,48]
+       stp     x19,x20,[sp,64]
+       stp     x21,x22,[sp,80]
+       adr     $tmp,.Lchacha20_consts
+       ldp     @K[0],@K[1],[$tmp]
+       ldp     @K[2],@K[3],[$key]
+       ldp     @K[4],@K[5],[$key, 16]
+       ldp     @K[6],@K[7],[$ctr]
+       ldr     $wctr,[$ctr]
+       index   $zctr.s,$wctr,1
+       ptrues  p0.s,ALL
+#ifdef __AARCH64EB__
+       ror     @K[2],@K[2],#32
+       ror     @K[3],@K[3],#32
+       ror     @K[4],@K[4],#32
+       ror     @K[5],@K[5],#32
+       ror     @K[6],@K[6],#32
+       ror     @K[7],@K[7],#32
 #endif
 ___
        &chacha20_process();
 $code.=<<___;
-       add     sp,sp,#64
        ldp     d10,d11,[sp,16]
        ldp     d12,d13,[sp,32]
-       ldp     d8,d9,[sp],48
+       ldp     d14,d15,[sp,48]
+       ldp     x19,x20,[sp,64]
+       ldp     x21,x22,[sp,80]
+       ldp     d8,d9,[sp],96
        str     $wctr,[$ctr]
        and     $len,$len,#63
        add     $len,$len,$blocks,lsl #6
@@ -514,6 +573,7 @@ ___
 ########################################
 {
 my  %opcode_unpred = (
+       "movprfx"      => 0x0420BC00,
        "eor"          => 0x04a03000,
        "add"          => 0x04200000,
        "orr"          => 0x04603000,
@@ -528,6 +588,7 @@ my  %opcode_unpred = (
        "index"        => 0x04204C00,
        "mov"          => 0x05203800,
        "dup"          => 0x05203800,
+       "cntw"         => 0x04A0E000,
        "tbl"          => 0x05203000);
 
 my  %opcode_imm_unpred = (
@@ -564,6 +625,7 @@ my  %opcode_pred = (
        "st4w"         => 0xE570E000,
        "st1w"         => 0xE500E000,
        "ld1w"         => 0xA540A000,
+       "ld1rw"        => 0x8540C000,
        "revh"         => 0x05258000);
 
 my  %tsize = (
@@ -740,6 +802,10 @@ sub sve_pred {
                if ($addr =~ m/x([0-9]+)\s*/o) {
                        $xn = $1;
                }
+
+               if ($mnemonic =~m/ld1r[bhwd]/o) {
+                       $size = 0;
+               }
                if ($addr =~ m/\w+\s*,\s*x([0-9]+),.*/o) {
                        return &verify_inst($opcode_scalar_pred{$mnemonic}|($size<<21)|$zt|($pg<<10)|($1<<16)|($xn<<5),$inst);
                } elsif ($addr =~ m/\w+\s*,\s*z([0-9]+)\.s,\s*([US]\w+)/o) {
@@ -810,8 +876,14 @@ sub sve_other {
                } elsif ($arg =~ m/x([0-9]+)/o) {
                        return &verify_inst($opcode_unpred{$mnemonic}|$1|(31<<5)|(0<<16), $inst);
                }
+       } elsif ($mnemonic =~ /cnt[bhdw]/) {
+               if ($arg =~ m/x([0-9]+)[^,]*,\s*(\w+)[^,]*,\s*MUL\s*#?([0-9]+)/o) {
+                       return &verify_inst($opcode_unpred{$mnemonic}|$1|($pattern{$2}<<5)|(($3 - 1)<<16), $inst);
+               }
        } elsif ($arg =~ m/x([0-9]+)[^,]*,\s*x([0-9]+)[^,]*,\s*#?([0-9]+)/o) {
                return &verify_inst($opcode_pred{$mnemonic}|$1|($2<<16)|($3<<5), $inst);
+       } elsif ($arg =~ m/z([0-9]+)[^,]*,\s*z([0-9]+)/o) {
+               return &verify_inst($opcode_unpred{$mnemonic}|$1|($2<<5), $inst);
        }
        sprintf "%s // fail to parse", $inst;
 }
@@ -834,9 +906,10 @@ foreach(split("\n",$code)) {
        s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*[#zwx]?[0-9]+.*)/sve_unpred($1,$2)/ge;
        s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*\{.*\},\s*z[0-9]+.*)/sve_unpred($1,$2)/ge;
        s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*p[0-9].*)/sve_pred($1,$2)/ge;
+       s/\b(\w+[1-4]r[bhwd])\s+(\{\s*z[0-9]+.*\},\s*p[0-9]+.*)/sve_pred($1,$2)/ge;
        s/\b(\w+[1-4][bhwd])\s+(\{\s*z[0-9]+.*\},\s*p[0-9]+.*)/sve_pred($1,$2)/ge;
        s/\b(\w+)\s+(p[0-9]+\.[bhsd].*)/sve_pred($1,$2)/ge;
-       s/\b(cntp|addvl|inc[bhdw])\s+((x|z).*)/sve_other($1,$2)/ge;
+       s/\b(movprfx|cntp|cnt[bhdw]|addvl|inc[bhdw])\s+((x|z).*)/sve_other($1,$2)/ge;
        print $_,"\n";
 }