]> git.ipfire.org Git - thirdparty/openssl.git/blame - crypto/bn/asm/armv4-gf2m.pl
Adapt ARM assembly pack for iOS.
[thirdparty/openssl.git] / crypto / bn / asm / armv4-gf2m.pl
CommitLineData
75359644
AP
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# May 2011
11#
12# The module implements bn_GF2m_mul_2x2 polynomial multiplication
13# used in bn_gf2m.c. It's kind of low-hanging mechanical port from
14# C for the time being... Except that it has two code paths: pure
15# integer code suitable for any ARMv4 and later CPU and NEON code
16# suitable for ARMv7. Pure integer 1x1 multiplication subroutine runs
17# in ~45 cycles on dual-issue core such as Cortex A8, which is ~50%
18# faster than compiler-generated code. For ECDH and ECDSA verify (but
19# not for ECDSA sign) it means 25%-45% improvement depending on key
20# length, more for longer keys. Even though NEON 1x1 multiplication
21# runs in even fewer cycles, ~30, improvement is measurable only on
22# longer keys. One has to optimize code elsewhere to get NEON glow...
23
bb98f6be
AP
# Command-line handling.  The arguments are an optional "flavour", which
# is passed through to arm-xlate.pl, followed by the output file name.
# A first argument that looks like a file name (word characters, a dot,
# an extension) is taken as the output file and no flavour is assumed;
# otherwise arguments are consumed until one looks like a file name.
$flavour = shift;
if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    # Directory part of this script's path, including the trailing
    # separator.  List assignment is used so that a failed match yields
    # undef instead of whatever an earlier match left in $1.
    ($dir) = $0 =~ m/(.*[\/\\])[^\/\\]+$/;
    $dir = "" unless defined $dir;

    # Look for the translator next to this script, then in ../../perlasm/.
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # Everything printed to STDOUT from here on is piped through the
    # translator, run with the same perl interpreter as this script.
    open STDOUT, "|-", "\"$^X\" $xlate $flavour $output"
        or die "can't call $xlate: $!";
} else {
    # No translation requested: write straight to the output file.  When
    # no output file was named, STDOUT is left as it is; re-opening it on
    # an empty name would fail and leave STDOUT closed.
    if ($output) {
        open STDOUT, ">", $output
            or die "can't open $output: $!";
    }
}
75359644
AP
38
# Dlo(REG): given a NEON quad register name such as "q1", return the name
# of the even-numbered doubleword register "d<2*n>" (on NEON this aliases
# the low half of q<n>), e.g. "q1" -> "d2".  The pattern is unanchored and
# accepts a one- or two-digit number with an optional leading "1".
# Returns "" when REG contains no such register name.
#
# Declared without a prototype: the sub takes one argument, so the former
# empty prototype "()" only forced callers to use the &Dlo(...) form.
# Existing &Dlo(...) calls behave exactly as before.
sub Dlo {
    my ($reg) = @_;
    return $reg =~ m|q([1]?[0-9])| ? "d".($1*2) : "";
}
# Dhi(REG): given a NEON quad register name such as "q1", return the name
# of the odd-numbered doubleword register "d<2*n+1>" (on NEON this aliases
# the high half of q<n>), e.g. "q1" -> "d3".  The pattern is unanchored
# and accepts a one- or two-digit number with an optional leading "1".
# Returns "" when REG contains no such register name.
#
# Declared without a prototype: the sub takes one argument, so the former
# empty prototype "()" only forced callers to use the &Dhi(...) form.
# Existing &Dhi(...) calls behave exactly as before.
sub Dhi {
    my ($reg) = @_;
    return $reg =~ m|q([1]?[0-9])| ? "d".($1*2+1) : "";
}
# Q(REG): given an even-numbered NEON doubleword register name such as
# "d16", return the quad register name "q<n/2>", e.g. "d16" -> "q8".
# Only numbers ending in an even digit, with an optional leading 1-3,
# match; the pattern is unanchored.  Returns "" for anything else,
# including odd-numbered registers such as "d17".
#
# Declared without a prototype: the sub takes one argument, so the former
# empty prototype "()" only forced callers to use the &Q(...) form.
# Existing &Q(...) calls behave exactly as before.
sub Q {
    my ($reg) = @_;
    return $reg =~ m|d([1-3]?[02468])| ? "q".($1/2) : "";
}
42
43$code=<<___;
44#include "arm_arch.h"
45
46.text
47.code 32
48
49#if __ARM_ARCH__>=7
50.fpu neon
51
52.type mul_1x1_neon,%function
53.align 5
54mul_1x1_neon:
55 vshl.u64 `&Dlo("q1")`,d16,#8 @ q1-q3 are slided $a
56