]>
Commit | Line | Data |
---|---|---|
75359644 AP |
1 | #!/usr/bin/env perl |
2 | # | |
3 | # ==================================================================== | |
4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | |
5 | # project. The module is, however, dual licensed under OpenSSL and | |
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | |
7 | # details see http://www.openssl.org/~appro/cryptogams/. | |
8 | # ==================================================================== | |
9 | # | |
10 | # May 2011 | |
11 | # | |
12 | # The module implements bn_GF2m_mul_2x2 polynomial multiplication | |
13 | # used in bn_gf2m.c. It's kind of a low-hanging mechanical port from | |
14 | # C for the time being... Except that it has two code paths: pure | |
15 | # integer code suitable for any ARMv4 and later CPU and NEON code | |
16 | # suitable for ARMv7. Pure integer 1x1 multiplication subroutine runs | |
17 | # in ~45 cycles on dual-issue core such as Cortex A8, which is ~50% | |
18 | # faster than compiler-generated code. For ECDH and ECDSA verify (but | |
19 | # not for ECDSA sign) it means 25%-45% improvement depending on key | |
20 | # length, more for longer keys. Even though NEON 1x1 multiplication | |
21 | # runs in even fewer cycles, ~30, improvement is measurable only on | |
22 | # longer keys. One has to optimize code elsewhere to get NEON glow... | |
23 | ||
bb98f6be AP |
# Command line: [flavour] [output-file]
#
# If the first argument looks like a file name (word characters and
# dashes, a dot, then an extension) it is the output file and there is
# no flavour.  Otherwise it is the flavour, and the remaining arguments
# are consumed until one that looks like a file name (or a false value)
# is found; that one becomes $output.
$flavour = shift;
if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    # Directory part of this script's path (with trailing separator), or
    # "" when $0 has none.  Assigned from the match result directly so a
    # failed match cannot leave a stale $1 in $dir.
    $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

    # Look for arm-xlate.pl next to this script first, then in
    # ../../perlasm/ relative to it.
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # Everything printed to STDOUT from here on is piped through
    # arm-xlate.pl, run under the same perl binary as this script.
    open STDOUT,"| \"$^X\" $xlate $flavour $output"
        or die "can't call $xlate: $!";
} else {
    # No translation wanted: write straight to the output file.  With no
    # output file, leave STDOUT alone -- a failing open() would close
    # STDOUT first and everything printed afterwards would be lost.
    if (defined($output) && $output ne "") {
        open STDOUT,">",$output or die "can't open $output: $!";
    }
}
75359644 AP |
38 | |
# Dlo("qN") -- name of the D register holding the low half of NEON quad
# register qN, i.e. "d" . (2*N).  Returns "" when the argument does not
# contain a valid quad register name (q0..q15).
#
# No "()" prototype: an empty prototype declares a sub that takes no
# arguments, which made a plain Dlo("q1") call a compile-time error and
# only let the sub work through the prototype-bypassing &Dlo(...) form.
# Both call forms work now.
sub Dlo {
    my ($reg) = @_;
    # The look-ahead rejects longer numbers, so "q16" or "q20" no longer
    # match on a leading digit and yield a bogus register.
    return $reg =~ m/q(1[0-5]|[0-9])(?![0-9])/ ? "d".($1*2) : "";
}
# Dhi("qN") -- name of the D register holding the high half of NEON quad
# register qN, i.e. "d" . (2*N+1).  Returns "" when the argument does not
# contain a valid quad register name (q0..q15).
#
# No "()" prototype: an empty prototype declares a sub that takes no
# arguments, which made a plain Dhi("q1") call a compile-time error and
# only let the sub work through the prototype-bypassing &Dhi(...) form.
# Both call forms work now.
sub Dhi {
    my ($reg) = @_;
    # The look-ahead rejects longer numbers, so "q16" or "q20" no longer
    # match on a leading digit and yield a bogus register.
    return $reg =~ m/q(1[0-5]|[0-9])(?![0-9])/ ? "d".($1*2+1) : "";
}
# Q("dN") -- name of the NEON quad register whose low half is the
# even-numbered D register dN, i.e. "q" . (N/2).  Returns "" for odd or
# out-of-range D registers (valid inputs are d0, d2, ..., d30).
#
# No "()" prototype: an empty prototype declares a sub that takes no
# arguments, which made a plain Q("d0") call a compile-time error and
# only let the sub work through the prototype-bypassing &Q(...) form.
# Both call forms work now.
sub Q {
    my ($reg) = @_;
    # The look-ahead stops the pattern from backtracking onto the first
    # digit of an odd register ("d21" used to match as "d2" and return
    # "q1"); the explicit alternatives cap the range at d30.
    return $reg =~ m/d([12]?[02468]|30)(?![0-9])/ ? "q".($1/2) : "";
}
42 | ||
43 | $code=<<___; | |
44 | #include "arm_arch.h" | |
45 | ||
46 | .text | |
47 | .code 32 | |
48 | ||
49 | #if __ARM_ARCH__>=7 | |
50 | .fpu neon | |
51 | ||
52 | .type mul_1x1_neon,%function | |
53 | .align 5 | |
54 | mul_1x1_neon: | |
55 | vshl.u64 `&Dlo("q1")`,d16,#8 @ q1-q3 are slided $a | |
56 |