[thirdparty/openssl.git] / crypto / bn / asm / armv4-gf2m.pl

#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# May 2011
#
# The module implements the bn_GF2m_mul_2x2 polynomial multiplication
# used in bn_gf2m.c. It's a kind of low-hanging mechanical port from
# C for the time being... Except that it has two code paths: pure
# integer code suitable for any ARMv4 or later CPU, and NEON code
# suitable for ARMv7. The pure integer 1x1 multiplication subroutine
# runs in ~45 cycles on a dual-issue core such as Cortex A8, which is
# ~50% faster than compiler-generated code. For ECDH and ECDSA verify
# (but not for ECDSA sign) this means a 25%-45% improvement depending
# on key length, more for longer keys. Even though the NEON 1x1
# multiplication runs in even fewer cycles, ~30, the improvement is
# measurable only on longer keys. One has to optimize code elsewhere
# to get the NEON glow...

# Command-line handling:  armv4-gf2m.pl [flavour] [...] output-file
#
# If the first argument looks like a file name (a word character, then
# word characters or dashes, a dot and an extension) it is the output
# file and there is no flavour.  Otherwise the first argument is the
# flavour and the remaining arguments are scanned for the first one that
# looks like a file name.
$flavour = shift;
if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    # Directory part of $0 (including the trailing separator), or the
    # empty string when the script was invoked without a directory
    # component.  Set explicitly rather than relying on $1 being unset
    # after a failed match.
    $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

    # Look for the translator next to this script first, then in
    # ../../perlasm/ relative to it.
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # Pipe everything printed to STDOUT through arm-xlate.pl, run with
    # the same perl interpreter ($^X).  The check catches a failure to
    # set up the pipe/child; errors raised by the child itself only
    # become visible when STDOUT is closed.
    open STDOUT, '|-', "\"$^X\" $xlate $flavour $output"
        or die "can't call $xlate: $!";
} elsif (defined $output && $output ne "") {
    # No translation requested: write straight to the output file and
    # fail loudly instead of silently printing to the original stdout.
    open STDOUT, '>', $output
        or die "can't open $output: $!";
}
# With no output file name at all, STDOUT is left untouched so the
# generated code is printed to the original standard output.

# Dlo(REG) -- map a quad register name "qN" to "d<2N>", the name of the
# double register holding its low half (e.g. "q1" -> "d2").
# Returns the empty string when REG contains no "q" followed by a digit.
# The pattern is not anchored and accepts an optional leading "1" plus
# one digit, so callers are expected to pass a plain "q0".."q15" name.
# No prototype is declared: the sub takes one argument, and the former
# empty prototype only worked because callers use the &Dlo(...) form.
sub Dlo {
    my ($reg) = @_;
    return $reg =~ m|q([1]?[0-9])| ? "d" . ($1 * 2) : "";
}
# Dhi(REG) -- map a quad register name "qN" to "d<2N+1>", the name of
# the double register holding its high half (e.g. "q1" -> "d3").
# Returns the empty string when REG contains no "q" followed by a digit.
# The pattern is not anchored and accepts an optional leading "1" plus
# one digit, so callers are expected to pass a plain "q0".."q15" name.
# No prototype is declared: the sub takes one argument, and the former
# empty prototype only worked because callers use the &Dhi(...) form.
sub Dhi {
    my ($reg) = @_;
    return $reg =~ m|q([1]?[0-9])| ? "d" . ($1 * 2 + 1) : "";
}
# Q(REG) -- map an even-numbered double register name "dN" to "q<N/2>",
# the quad register it belongs to (e.g. "d16" -> "q8").
# Returns the empty string when the pattern does not match, e.g. for
# "d17" or "d5".
# NOTE: the pattern is not anchored, so it matches the first "d" that is
# followed by an optional 1-3 and an even digit; "d21" therefore matches
# as "d2" and yields "q1".  Callers are expected to pass even-numbered
# register names only.
# No prototype is declared: the sub takes one argument, and the former
# empty prototype only worked because callers use the &Q(...) form.
sub Q {
    my ($reg) = @_;
    return $reg =~ m|d([1-3]?[02468])| ? "q" . ($1 / 2) : "";
}

$code=<<___;
#include "arm_arch.h"

.text
.code	32

#if __ARM_ARCH__>=7
.fpu	neon

.type	mul_1x1_neon,%function
.align	5
mul_1x1_neon:
	vshl.u64	`&Dlo("q1")`,d16,#8	@ q1-q3 are slided $a
Commit	Line	Data
75359644 AP	1	#!/usr/bin/env perl
	2	#
	3	# ====================================================================
	4	# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
	5	# project. The module is, however, dual licensed under OpenSSL and
	6	# CRYPTOGAMS licenses depending on where you obtain it. For further
	7	# details see http://www.openssl.org/~appro/cryptogams/.
	8	# ====================================================================
	9	#
	10	# May 2011
	11	#
	12	# The module implements bn_GF2m_mul_2x2 polynomial multiplication
	13	# used in bn_gf2m.c. It's kind of low-hanging mechanical port from
	14	# C for the time being... Except that it has two code paths: pure
	15	# integer code suitable for any ARMv4 and later CPU and NEON code
	16	# suitable for ARMv7. Pure integer 1x1 multiplication subroutine runs
	17	# in ~45 cycles on dual-issue core such as Cortex A8, which is ~50%
	18	# faster than compiler-generated code. For ECDH and ECDSA verify (but
	19	# not for ECDSA sign) it means 25%-45% improvement depending on key
	20	# length, more for longer keys. Even though NEON 1x1 multiplication
	21	# runs in even less cycles, ~30, improvement is measurable only on
	22	# longer keys. One has to optimize code elsewhere to get NEON glow...
	23
bb98f6be AP	24	$flavour = shift;
	25	if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
	26	else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
	27
	28	if ($flavour && $flavour ne "void") {
	29	$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
	30	( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
	31	( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
	32	die "can't locate arm-xlate.pl";
	33
	34	open STDOUT,"\| \"$^X\" $xlate $flavour $output";
	35	} else {
	36	open STDOUT,">$output";
	37	}
75359644 AP	38
	39	sub Dlo() { shift=~m\|q([1]?[0-9])\|?"d".($1*2):""; }
	40	sub Dhi() { shift=~m\|q([1]?[0-9])\|?"d".($1*2+1):""; }
	41	sub Q() { shift=~m\|d([1-3]?[02468])\|?"q".($1/2):""; }
	42
	43	$code=<<___;
	44	#include "arm_arch.h"
	45
	46	.text
	47	.code 32
	48
	49	#if __ARM_ARCH__>=7
	50	.fpu neon
	51
	52	.type mul_1x1_neon,%function
	53	.align 5
	54	mul_1x1_neon:
	55	vshl.u64 `&Dlo("q1")`,d16,#8 @ q1-q3 are slided $a
	56