# Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Locate the perlasm ARM translator relative to this script's own
# directory: first the side-by-side layout, then the in-tree
# crypto/perlasm layout.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or
die "can't locate arm-xlate.pl";

# Pipe everything we print to OUT through the translator, which writes
# the final flavour-specific assembly to $output.
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
# Register allocation for the 256-bit arithmetic below.
#
# s0..s7: eight 64-bit limbs of a 512-bit intermediate, in x7..x14.
# a8..a15: 32-bit half-limb views used during reduction; note they
#   alias the SAME registers x7..x14 as s0..s7 (the reduction step
#   re-purposes the limb registers), listed in even/odd order.
# t0..t8: scratch temporaries (x3..x6, x15..x17, x19, x20 — skipping
#   x18, the platform register on AArch64).
my ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("x$_",(7..14));
my ($a8,$a10,$a12,$a14,$a9,$a11,$a13,$a15)=map("x$_",(7..14));
my ($t0,$t1,$t2,$t3)=map("x$_",(3..6));
my ($t4,$t5,$t6,$t7,$t8)=map("x$_",(15..17,19,20));
62 // Select based on carry
108 // Select based on carry
120 sub bn_mod_div_by_2
() {
127 // Save the least significant bit
162 #include "arm_arch.h"
169 .quad
0xffffffffffffffff,0xffffffff00000000,0xffffffffffffffff,0xfffffffeffffffff
170 // The order of polynomial n
172 .quad
0x53bbf40939d54123,0x7203df6b21c6052b,0xffffffffffffffff,0xfffffffeffffffff
175 .quad
0x8000000000000000,0xffffffff80000000,0xffffffffffffffff,0x7fffffff7fffffff
178 .quad
0xa9ddfa049ceaa092,0xb901efb590e30295,0xffffffffffffffff,0x7fffffff7fffffff
180 // void bn_rshift1
(BN_ULONG
*a
);
182 .type bn_rshift1
,%function
185 AARCH64_VALID_CALL_TARGET
201 .size bn_rshift1
,.-bn_rshift1
203 // void bn_sub
(BN_ULONG
*r
, const BN_ULONG
*a
, const BN_ULONG
*b
);
205 .type bn_sub
,%function
208 AARCH64_VALID_CALL_TARGET
226 .size bn_sub
,.-bn_sub
228 // void ecp_sm2p256_div_by_2
(BN_ULONG
*r
,const BN_ULONG
*a
);
229 .globl ecp_sm2p256_div_by_2
230 .type ecp_sm2p256_div_by_2
,%function
232 ecp_sm2p256_div_by_2
:
233 AARCH64_VALID_CALL_TARGET
235 &bn_mod_div_by_2
(".Lpoly_div_2");
238 .size ecp_sm2p256_div_by_2
,.-ecp_sm2p256_div_by_2
240 // void ecp_sm2p256_div_by_2_mod_ord
(BN_ULONG
*r
,const BN_ULONG
*a
);
241 .globl ecp_sm2p256_div_by_2_mod_ord
242 .type ecp_sm2p256_div_by_2_mod_ord
,%function
244 ecp_sm2p256_div_by_2_mod_ord
:
245 AARCH64_VALID_CALL_TARGET
247 &bn_mod_div_by_2
(".Lord_div_2");
250 .size ecp_sm2p256_div_by_2_mod_ord
,.-ecp_sm2p256_div_by_2_mod_ord
252 // void ecp_sm2p256_mul_by_3
(BN_ULONG
*r
,const BN_ULONG
*a
);
253 .globl ecp_sm2p256_mul_by_3
254 .type ecp_sm2p256_mul_by_3
,%function
256 ecp_sm2p256_mul_by_3
:
257 AARCH64_VALID_CALL_TARGET
324 .size ecp_sm2p256_mul_by_3
,.-ecp_sm2p256_mul_by_3
326 // void ecp_sm2p256_add
(BN_ULONG
*r
,const BN_ULONG
*a
,const BN_ULONG
*b
);
327 .globl ecp_sm2p256_add
328 .type ecp_sm2p256_add
,%function
331 AARCH64_VALID_CALL_TARGET
333 &bn_mod_add
(".Lpoly");
336 .size ecp_sm2p256_add
,.-ecp_sm2p256_add
338 // void ecp_sm2p256_sub
(BN_ULONG
*r
,const BN_ULONG
*a
,const BN_ULONG
*b
);
339 .globl ecp_sm2p256_sub
340 .type ecp_sm2p256_sub
,%function
343 AARCH64_VALID_CALL_TARGET
345 &bn_mod_sub
(".Lpoly");
348 .size ecp_sm2p256_sub
,.-ecp_sm2p256_sub
350 // void ecp_sm2p256_sub_mod_ord
(BN_ULONG
*r
,const BN_ULONG
*a
,const BN_ULONG
*b
);
351 .globl ecp_sm2p256_sub_mod_ord
352 .type ecp_sm2p256_sub_mod_ord
,%function
354 ecp_sm2p256_sub_mod_ord
:
355 AARCH64_VALID_CALL_TARGET
357 &bn_mod_sub
(".Lord");
360 .size ecp_sm2p256_sub_mod_ord
,.-ecp_sm2p256_sub_mod_ord
// a  = | s7 | ... | s0 |, where si are 64-bit quantities
//    = |a15|a14| ... |a1|a0|, where ai are 32-bit quantities
365 // | s7
| s6
| s5
| s4
|
366 // | a15
| a14
| a13
| a12
| a11
| a10
| a9
| a8
|
367 // | s3
| s2
| s1
| s0
|
368 // | a7
| a6
| a5
| a4
| a3
| a2
| a1
| a0
|
369 // =================================================
370 // | a8
| a11
| a10
| a9
| a8
| 0 | s4
| (+)
371 // | a9
| a15
| s6
| a11
| 0 | a10
| a9
| (+)
372 // | a10
| 0 | a14
| a13
| a12
| 0 | s5
| (+)
373 // | a11
| 0 | s7
| a13
| 0 | a12
| a11
| (+)
374 // | a12
| 0 | s7
| a13
| 0 | s6
| (+)
375 // | a12
| 0 | 0 | a15
| a14
| 0 | a14
| a13
| (+)
376 // | a13
| 0 | 0 | 0 | a15
| 0 | a14
| a13
| (+)
377 // | a13
| 0 | 0 | 0 | 0 | 0 | s7
| (+)
378 // | a14
| 0 | 0 | 0 | 0 | 0 | s7
| (+)
379 // | a14
| 0 | 0 | 0 | 0 | 0 | 0 | a15
| (+)
380 // | a15
| 0 | 0 | 0 | 0 | 0 | 0 | a15
| (+)
381 // | a15
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | (+)
382 // | s7
| 0 | 0 | 0 | 0 | 0 | 0 | (+)
383 // | 0 | 0 | 0 | 0 | 0 | a8
| 0 | 0 | (-)
384 // | 0 | 0 | 0 | 0 | 0 | a9
| 0 | 0 | (-)
385 // | 0 | 0 | 0 | 0 | 0 | a13
| 0 | 0 | (-)
386 // | 0 | 0 | 0 | 0 | 0 | a14
| 0 | 0 | (-)
387 // | U
[7]| U
[6]| U
[5]| U
[4]| U
[3]| U
[2]| U
[1]| U
[0]|
388 // | V
[3] | V
[2] | V
[1] | V
[0] |
390 // 1. 64-bit addition
413 // 2. 64-bit to
32-bit spread
419 and $s0,$s0,$t1 // a8
420 and $s1,$s1,$t1 // a10
421 and $s2,$s2,$t1 // a12
422 and $s3,$s3,$t1 // a14
423 lsr
$s4,$s4,#32 // a9
424 lsr
$s5,$s5,#32 // a11
425 lsr
$s6,$s6,#32 // a13
426 lsr
$s7,$s7,#32 // a15
428 // 3. 32-bit addition
429 add
$t1,$a14,$a12 // t1
<- a12
+ a14
430 add
$t2,$a15,$a13 // t2
<- a13
+ a15
431 add
$t3,$a8,$a9 // t3
<- a8
+ a9
432 add
$t4,$a14,$a10 // t4
<- a10
+ a14
433 add
$a15,$a15,$a11 // a15
<- a11
+ a15
434 add
$a12,$t2,$t1 // a12
<- a12
+ a13
+ a14
+ a15
435 add
$a10,$a10,$a12 // a10
<- a10
+ a12
+ a13
+ a14
+ a15
436 add
$a10,$a10,$a12 // a10
<- a10
+ 2*(a12
+ a13
+ a14
+ a15
)
437 add
$a10,$a10,$t3 // a10
<- a8
+ a9
+ a10
+ 2*(a12
+ a13
+ a14
+ a15
)
438 add
$a10,$a10,$a11 // a10
<- a8
+ a9
+ a10
+ a11
+ 2*(a12
+ a13
+ a14
+ a15
)
439 add
$a12,$a12,$a13 // a12
<- a12
+ 2*a13
+ a14
+ a15
440 add
$a12,$a12,$a11 // a12
<- a11
+ a12
+ 2*a13
+ a14
+ a15
441 add
$a12,$a12,$a8 // a12
<- a8
+ a11
+ a12
+ 2*a13
+ a14
+ a15
442 add
$t3,$t3,$a14 // t3
<- a8
+ a9
+ a14
443 add
$t3,$t3,$a13 // t3
<- a8
+ a9
+ a13
+ a14
444 add
$a9,$a9,$t2 // a9
<- a9
+ a13
+ a15
445 add
$a11,$a11,$a9 // a11
<- a9
+ a11
+ a13
+ a15
446 add
$a11,$a11,$t2 // a11
<- a9
+ a11
+ 2*(a13
+ a15
)
447 add
$t1,$t1,$t4 // t1
<- a10
+ a12
+ 2*a14
449 // U
[0] s5 a9
+ a11
+ 2*(a13
+ a15
)
450 // U
[1] t1 a10
+ a12
+ 2*a14
451 // U
[2] -t3 a8
+ a9
+ a13
+ a14
452 // U
[3] s2 a8
+ a11
+ a12
+ 2*a13
+ a14
+ a15
453 // U
[4] s4 a9
+ a13
+ a15
456 // U
[7] s1 a8
+ a9
+ a10
+ a11
+ 2*(a12
+ a13
+ a14
+ a15
)
458 // 4. 32-bit to
64-bit
465 // 5. 64-bit addition
// return y - p if y > p else y
513 ldp
$t3,$t4,[$t0,#16]
530 // void ecp_sm2p256_mul
(BN_ULONG
*r
, const BN_ULONG
*a
, const BN_ULONG
*b
);
531 .globl ecp_sm2p256_mul
532 .type ecp_sm2p256_mul
,%function
535 AARCH64_SIGN_LINK_REGISTER
536 // Store
scalar registers
537 stp x29
,x30
,[sp
,#-80]!
548 // ### multiplication ###
549 // ========================
552 // ------------------------
565 // ------------------------
566 // s7 s6 s5 s4 s3 s2 s1 s0
567 // ========================
573 // ### s1*s4 + s0*s5 ###
585 // ### s2*s4 + s1*s5 + s0*s6 ###
603 // ### s3*s4 + s2*s5 + s1*s6 + s0*s7 ###
628 // ### s3*s5 + s2*s6 + s1*s7 ###
647 // ### s3*s6 + s2*s7 ###
671 // result of mul
: s7 s6 s5 s4 s3 s2 s1 s0
679 // Restore
scalar registers
684 AARCH64_VALIDATE_LINK_REGISTER
686 .size ecp_sm2p256_mul
,.-ecp_sm2p256_mul
688 // void ecp_sm2p256_sqr
(BN_ULONG
*r
, const BN_ULONG
*a
);
689 .globl ecp_sm2p256_sqr
690 .type ecp_sm2p256_sqr
,%function
694 AARCH64_SIGN_LINK_REGISTER
695 // Store
scalar registers
696 stp x29
,x30
,[sp
,#-80]!
706 // ========================
709 // ------------------------
722 // ------------------------
723 // s7 s6 s5 s4 s3 s2 s1 s0
724 // ========================
736 // ### s4*s7 + s5*s6 ###
760 // ### 2*(t3,t2,s0,s3,s2,s1) ###
799 // result of mul
: s7 s6 s5 s4 s3 s2 s1 s0
807 // Restore
scalar registers
812 AARCH64_VALIDATE_LINK_REGISTER
814 .size ecp_sm2p256_sqr
,.-ecp_sm2p256_sqr
# Post-process the generated code: each `...` span is evaluated as Perl
# (this is how register-name variables are interpolated into the asm),
# then every line is printed to OUT (the pipe to arm-xlate.pl).
foreach (split("\n",$code)) {
    s/\`([^\`]*)\`/eval $1/ge;
    print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!"; # enforce flush