From: Min Zhou Date: Thu, 7 Sep 2023 03:07:53 +0000 (+0800) Subject: LoongArch64 assembly pack: add ChaCha20 modules X-Git-Tag: openssl-3.2.0-alpha2~110 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=9a41a3c6a453a90d6c6cf106480a4a72b08b14f9;p=thirdparty%2Fopenssl.git LoongArch64 assembly pack: add ChaCha20 modules This assembly implementation for ChaCha20 includes three code paths: scalar path, 128-bit LSX path and 256-bit LASX path. We prefer the LASX path or LSX path if the hardware and system support these extensions. There are 32 vector registers avaialable in the LSX and LASX extensions. So, we can load the 16 initial states and the 16 intermediate states of ChaCha into the 32 vector registers for calculating in the implementation. The test results on the 3A5000 and 3A6000 show that this assembly implementation significantly improves the performance of ChaCha20 on LoongArch based machines. The detailed test results are as following. Test with: $ openssl speed -evp chacha20 3A5000 type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes C code 178484.53k 282789.93k 311793.70k 322234.99k 324405.93k 324659.88k assembly code 223152.28k 407863.65k 989520.55k 2049192.96k 2127248.70k 2131749.55k +25% +44% +217% +536% +556% +557% 3A6000 type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes C code 214945.33k 310041.75k 340724.22k 349949.27k 352925.01k 353140.74k assembly code 299151.34k 492766.34k 2070166.02k 4300909.91k 4473978.88k 4499084.63k +39% +59% +508% +1129% +1168% +1174% Signed-off-by: Min Zhou Reviewed-by: Tomas Mraz Reviewed-by: Paul Dale (Merged from https://github.com/openssl/openssl/pull/21998) --- diff --git a/crypto/chacha/asm/chacha-loongarch64.pl b/crypto/chacha/asm/chacha-loongarch64.pl new file mode 100644 index 00000000000..ea9cc7ecce2 --- /dev/null +++ b/crypto/chacha/asm/chacha-loongarch64.pl @@ -0,0 +1,1413 @@ +#! /usr/bin/env perl +# Author: Min Zhou +# Copyright 2023 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +use strict; + +my $code; + +# Here is the scalar register layout for LoongArch. +my ($zero,$ra,$tp,$sp,$fp)=map("\$r$_",(0..3,22)); +my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$r$_",(4..11)); +my ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$x)=map("\$r$_",(12..21)); +my ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8)=map("\$r$_",(23..31)); + +# Here is the 128-bit vector register layout for LSX extension. +my ($vr0,$vr1,$vr2,$vr3,$vr4,$vr5,$vr6,$vr7,$vr8,$vr9,$vr10, + $vr11,$vr12,$vr13,$vr14,$vr15,$vr16,$vr17,$vr18,$vr19, + $vr20,$vr21,$vr22,$vr23,$vr24,$vr25,$vr26,$vr27,$vr28, + $vr29,$vr30,$vr31)=map("\$vr$_",(0..31)); + +# Here is the 256-bit vector register layout for LASX extension. +my ($xr0,$xr1,$xr2,$xr3,$xr4,$xr5,$xr6,$xr7,$xr8,$xr9,$xr10, + $xr11,$xr12,$xr13,$xr14,$xr15,$xr16,$xr17,$xr18,$xr19, + $xr20,$xr21,$xr22,$xr23,$xr24,$xr25,$xr26,$xr27,$xr28, + $xr29,$xr30,$xr31)=map("\$xr$_",(0..31)); + +my $output; +for (@ARGV) { $output=$_ if (/\w[\w\-]*\.\w+$/); } +open STDOUT,">$output"; + +# Input parameter block +my ($out, $inp, $len, $key, $counter) = ($a0, $a1, $a2, $a3, $a4); + +$code .= <