From: Xi Ruoyao Date: Sat, 25 Nov 2023 09:53:57 +0000 (+0800) Subject: LoongArch64 assembly pack: Fix ChaCha20 ABI breakage X-Git-Tag: openssl-3.3.0-alpha1~454 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=b46de72c260e7c4d9bfefa35b02295ba32ad2ac6;p=thirdparty%2Fopenssl.git LoongArch64 assembly pack: Fix ChaCha20 ABI breakage The [LP64D ABI][1] requires the floating-point registers f24-f31 (aka fs0-fs7) to be callee-saved. The low 64 bits of a LSX/LASX vector register alias with the corresponding FPR, so we must save and restore the callee-saved FPR when we write into the corresponding vector register. This ABI breakage can be easily demonstrated by injecting the use of a saved FPR into the test in bio_enc_test.c: static int test_bio_enc_chacha20(int idx) { register double fs7 asm("f31") = 114.514; asm("#optimize barrier":"+f"(fs7)); return do_test_bio_cipher(EVP_chacha20(), idx) && fs7 == 114.514; } So fix it. To make the logic simpler, jump into the scalar implementation earlier when LSX and LASX are not enumerated in AT_HWCAP, or the input is too short. [1]: https://github.com/loongson/la-abi-specs/blob/v2.20/lapcs.adoc#floating-point-registers Reviewed-by: Neil Horman Reviewed-by: Tomas Mraz (Merged from https://github.com/openssl/openssl/pull/22817) --- diff --git a/crypto/chacha/asm/chacha-loongarch64.pl b/crypto/chacha/asm/chacha-loongarch64.pl index ea9cc7ecce2..9eed5860de9 100644 --- a/crypto/chacha/asm/chacha-loongarch64.pl +++ b/crypto/chacha/asm/chacha-loongarch64.pl @@ -17,6 +17,14 @@ my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$r$_",(4..11)); my ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$x)=map("\$r$_",(12..21)); my ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8)=map("\$r$_",(23..31)); +# The saved floating-point registers in the LP64D ABI. In LoongArch +# with vector extension, the low 64 bits of a vector register alias with +# the corresponding FPR. So we must save and restore the corresponding +# FPR if we'll write into a vector register. 
The ABI only requires +# saving and restoring the FPR (i.e. 64 bits of the corresponding vector +# register), not the entire vector register. +my ($fs0,$fs1,$fs2,$fs3,$fs4,$fs5,$fs6,$fs7)=map("\$f$_",(24..31)); + # Here is the 128-bit vector register layout for LSX extension. my ($vr0,$vr1,$vr2,$vr3,$vr4,$vr5,$vr6,$vr7,$vr8,$vr9,$vr10, $vr11,$vr12,$vr13,$vr14,$vr15,$vr16,$vr17,$vr18,$vr19, @@ -66,13 +74,25 @@ ChaCha20_ctr32: la.pcrel $t0,OPENSSL_loongarch_hwcap_P ld.w $t0,$t0,0 + bleu $len,$t3,.LChaCha20_1x # goto 1x when len <= 64 + + andi $t0,$t0,LOONGARCH_HWCAP_LASX | LOONGARCH_HWCAP_LSX + beqz $t0,.LChaCha20_1x + + addi.d $sp,$sp,-64 + fst.d $fs0,$sp,0 + fst.d $fs1,$sp,8 + fst.d $fs2,$sp,16 + fst.d $fs3,$sp,24 + fst.d $fs4,$sp,32 + fst.d $fs5,$sp,40 + fst.d $fs6,$sp,48 + fst.d $fs7,$sp,56 + andi $t1,$t0,LOONGARCH_HWCAP_LASX bnez $t1,.LChaCha20_8x - andi $t2,$t0,LOONGARCH_HWCAP_LSX - bnez $t2,.LChaCha20_4x - - b .LChaCha20_1x + b .LChaCha20_4x EOF @@ -442,8 +462,6 @@ $code .= <