# Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
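# A typical invocation of this generator looks like the following (the
# flavour and file names here are only illustrative examples, not taken
# from this file):
#
#   perl ecp_sm2p256-armv8.pl linux64 ecp_sm2p256-armv8.S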
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
my ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("x$_",(7..14));
my ($a8,$a10,$a12,$a14,$a9,$a11,$a13,$a15)=map("x$_",(7..14));
my ($t0,$t1,$t2,$t3)=map("x$_",(3..6));
my ($t4,$t5,$t6,$t7,$t8)=map("x$_",(15..19));
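# Note (added for clarity): the 32-bit names $a8..$a15 above alias the same
# registers as $s0..$s7 ($a8,$a10,$a12,$a14 share x7..x10 with $s0..$s3;
# $a9,$a11,$a13,$a15 share x11..x14 with $s4..$s7).  They are a second view
# of those registers, used while the reduction works on 32-bit pieces of
# the 64-bit limbs.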
	# Select based on carry
	# Select based on carry
sub bn_mod_div_by_2() {
	# Save the least significant bit
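	# (Added note, inferred from the .Lpoly_div_2/.Lord_div_2 constants the
	# callers pass in: halving modulo an odd m is x >> 1 when x is even and
	# (x >> 1) + (m + 1)/2 when x is odd, so the saved bit selects whether
	# the (m + 1)/2 constant is added back after the shift.)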
#include "arm_arch.h"
.quad	0xffffffffffffffff,0xffffffff00000000,0xffffffffffffffff,0xfffffffeffffffff
// The order of polynomial n
.quad	0x53bbf40939d54123,0x7203df6b21c6052b,0xffffffffffffffff,0xfffffffeffffffff
.quad	0x8000000000000000,0xffffffff80000000,0xffffffffffffffff,0x7fffffff7fffffff
.quad	0xa9ddfa049ceaa092,0xb901efb590e30295,0xffffffffffffffff,0x7fffffff7fffffff
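// (Added note: the four .quad vectors above are, in order, the SM2 prime p,
// the group order n, (p + 1)/2 and (n + 1)/2, each stored as four
// little-endian 64-bit limbs.)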
// void bn_rshift1(BN_ULONG *a);
.type	bn_rshift1,%function
	AARCH64_VALID_CALL_TARGET
.size	bn_rshift1,.-bn_rshift1
// void bn_sub(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b);
.type	bn_sub,%function
	AARCH64_VALID_CALL_TARGET
.size	bn_sub,.-bn_sub
// void ecp_sm2p256_div_by_2(BN_ULONG *r,const BN_ULONG *a);
.globl	ecp_sm2p256_div_by_2
.type	ecp_sm2p256_div_by_2,%function
ecp_sm2p256_div_by_2:
	AARCH64_VALID_CALL_TARGET
	&bn_mod_div_by_2(".Lpoly_div_2");
.size	ecp_sm2p256_div_by_2,.-ecp_sm2p256_div_by_2
// void ecp_sm2p256_div_by_2_mod_ord(BN_ULONG *r,const BN_ULONG *a);
.globl	ecp_sm2p256_div_by_2_mod_ord
.type	ecp_sm2p256_div_by_2_mod_ord,%function
ecp_sm2p256_div_by_2_mod_ord:
	AARCH64_VALID_CALL_TARGET
	&bn_mod_div_by_2(".Lord_div_2");
.size	ecp_sm2p256_div_by_2_mod_ord,.-ecp_sm2p256_div_by_2_mod_ord
// void ecp_sm2p256_mul_by_3(BN_ULONG *r,const BN_ULONG *a);
.globl	ecp_sm2p256_mul_by_3
.type	ecp_sm2p256_mul_by_3,%function
ecp_sm2p256_mul_by_3:
	AARCH64_VALID_CALL_TARGET
.size	ecp_sm2p256_mul_by_3,.-ecp_sm2p256_mul_by_3
// void ecp_sm2p256_add(BN_ULONG *r,const BN_ULONG *a,const BN_ULONG *b);
.globl	ecp_sm2p256_add
.type	ecp_sm2p256_add,%function
	AARCH64_VALID_CALL_TARGET
	&bn_mod_add(".Lpoly");
.size	ecp_sm2p256_add,.-ecp_sm2p256_add
// void ecp_sm2p256_sub(BN_ULONG *r,const BN_ULONG *a,const BN_ULONG *b);
.globl	ecp_sm2p256_sub
.type	ecp_sm2p256_sub,%function
	AARCH64_VALID_CALL_TARGET
	&bn_mod_sub(".Lpoly");
.size	ecp_sm2p256_sub,.-ecp_sm2p256_sub
// void ecp_sm2p256_sub_mod_ord(BN_ULONG *r,const BN_ULONG *a,const BN_ULONG *b);
.globl	ecp_sm2p256_sub_mod_ord
.type	ecp_sm2p256_sub_mod_ord,%function
ecp_sm2p256_sub_mod_ord:
	AARCH64_VALID_CALL_TARGET
	&bn_mod_sub(".Lord");
.size	ecp_sm2p256_sub_mod_ord,.-ecp_sm2p256_sub_mod_ord
# a = | s7 | ... | s0 |, where si are 64-bit quantities
#   = |a15|a14| ... |a1|a0|, where ai are 32-bit quantities
# | s7 | s6 | s5 | s4 |
# | a15 | a14 | a13 | a12 | a11 | a10 | a9 | a8 |
# | s3 | s2 | s1 | s0 |
# | a7 | a6 | a5 | a4 | a3 | a2 | a1 | a0 |
# =================================================
# | a8 | a11 | a10 | a9 | a8 | 0 | s4 | (+)
# | a9 | a15 | s6 | a11 | 0 | a10 | a9 | (+)
# | a10 | 0 | a14 | a13 | a12 | 0 | s5 | (+)
# | a11 | 0 | s7 | a13 | 0 | a12 | a11 | (+)
# | a12 | 0 | s7 | a13 | 0 | s6 | (+)
# | a12 | 0 | 0 | a15 | a14 | 0 | a14 | a13 | (+)
# | a13 | 0 | 0 | 0 | a15 | 0 | a14 | a13 | (+)
# | a13 | 0 | 0 | 0 | 0 | 0 | s7 | (+)
# | a14 | 0 | 0 | 0 | 0 | 0 | s7 | (+)
# | a14 | 0 | 0 | 0 | 0 | 0 | 0 | a15 | (+)
# | a15 | 0 | 0 | 0 | 0 | 0 | 0 | a15 | (+)
# | a15 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | (+)
# | s7 | 0 | 0 | 0 | 0 | 0 | 0 | (+)
# | 0 | 0 | 0 | 0 | 0 | a8 | 0 | 0 | (-)
# | 0 | 0 | 0 | 0 | 0 | a9 | 0 | 0 | (-)
# | 0 | 0 | 0 | 0 | 0 | a13 | 0 | 0 | (-)
# | 0 | 0 | 0 | 0 | 0 | a14 | 0 | 0 | (-)
# | U[7]| U[6]| U[5]| U[4]| U[3]| U[2]| U[1]| U[0]|
# | V[3] | V[2] | V[1] | V[0] |
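#
# A minimal reference model of what the reduction below must compute, added
# for clarity.  It is illustrative only: the helper is never called by this
# generator and the name sm2p256_reduce_ref is ours, not part of the
# original code.  It treats the eight 64-bit limbs of the 512-bit product
# as a little-endian integer and reduces it modulo
# p = 2^256 - 2^224 - 2^96 + 2^64 - 1.
use Math::BigInt;

sub sm2p256_reduce_ref {
    my @s = @_;                         # limbs s0..s7, e.g. as "0x..." strings
    my $p = (Math::BigInt->bone() << 256) - (Math::BigInt->bone() << 224)
          - (Math::BigInt->bone() << 96) + (Math::BigInt->bone() << 64) - 1;
    my $x = Math::BigInt->bzero();
    $x += Math::BigInt->new($s[$_]) << (64 * $_) for 0 .. $#s;
    return $x->bmod($p);                # same value the assembly leaves in 4 limbs
}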
# 2. 64-bit to 32-bit spread
	and	$s0,$s0,$t1	// a8
	and	$s1,$s1,$t1	// a10
	and	$s2,$s2,$t1	// a12
	and	$s3,$s3,$t1	// a14
	lsr	$s4,$s4,#32	// a9
	lsr	$s5,$s5,#32	// a11
	lsr	$s6,$s6,#32	// a13
	lsr	$s7,$s7,#32	// a15
	add	$t1,$a14,$a12	// t1 <- a12 + a14
	add	$t2,$a15,$a13	// t2 <- a13 + a15
	add	$t3,$a8,$a9	// t3 <- a8 + a9
	add	$t4,$a14,$a10	// t4 <- a10 + a14
	add	$a15,$a15,$a11	// a15 <- a11 + a15
	add	$a12,$t2,$t1	// a12 <- a12 + a13 + a14 + a15
	add	$a10,$a10,$a12	// a10 <- a10 + a12 + a13 + a14 + a15
	add	$a10,$a10,$a12	// a10 <- a10 + 2*(a12 + a13 + a14 + a15)
	add	$a10,$a10,$t3	// a10 <- a8 + a9 + a10 + 2*(a12 + a13 + a14 + a15)
	add	$a10,$a10,$a11	// a10 <- a8 + a9 + a10 + a11 + 2*(a12 + a13 + a14 + a15)
	add	$a12,$a12,$a13	// a12 <- a12 + 2*a13 + a14 + a15
	add	$a12,$a12,$a11	// a12 <- a11 + a12 + 2*a13 + a14 + a15
	add	$a12,$a12,$a8	// a12 <- a8 + a11 + a12 + 2*a13 + a14 + a15
	add	$t3,$t3,$a14	// t3 <- a8 + a9 + a14
	add	$t3,$t3,$a13	// t3 <- a8 + a9 + a13 + a14
	add	$a9,$a9,$t2	// a9 <- a9 + a13 + a15
	add	$a11,$a11,$a9	// a11 <- a9 + a11 + a13 + a15
	add	$a11,$a11,$t2	// a11 <- a9 + a11 + 2*(a13 + a15)
	add	$t1,$t1,$t4	// t1 <- a10 + a12 + 2*a14
# U[0] s5   a9 + a11 + 2*(a13 + a15)
# U[1] t1   a10 + a12 + 2*a14
# U[2] -t3  a8 + a9 + a13 + a14
# U[3] s2   a8 + a11 + a12 + 2*a13 + a14 + a15
# U[4] s4   a9 + a13 + a15
# U[7] s1   a8 + a9 + a10 + a11 + 2*(a12 + a13 + a14 + a15)
# 4. 32-bit to 64-bit
# return y - p if y >= p, else y
	ldp	$t3,$t4,[$t0,#16]
// void ecp_sm2p256_mul(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b);
.globl	ecp_sm2p256_mul
.type	ecp_sm2p256_mul,%function
	AARCH64_SIGN_LINK_REGISTER
	# Store scalar registers
	stp	x29,x30,[sp,#-80]!
	### multiplication ###
	# ========================
	# ------------------------
	# ------------------------
	# s7 s6 s5 s4 s3 s2 s1 s0
	# ========================
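	# (Added note: schoolbook 256 x 256-bit multiply; one input is held in
	# s0..s3 and the other in s4..s7, and the partial products below
	# accumulate the 512-bit result into s0..s7, least-significant limb
	# in s0.)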
	### s1*s4 + s0*s5 ###
	### s2*s4 + s1*s5 + s0*s6 ###
	### s3*s4 + s2*s5 + s1*s6 + s0*s7 ###
	### s3*s5 + s2*s6 + s1*s7 ###
	### s3*s6 + s2*s7 ###
	# result of mul: s7 s6 s5 s4 s3 s2 s1 s0
	# Restore scalar registers
	AARCH64_VALIDATE_LINK_REGISTER
.size	ecp_sm2p256_mul,.-ecp_sm2p256_mul
// void ecp_sm2p256_sqr(BN_ULONG *r, const BN_ULONG *a);
.globl	ecp_sm2p256_sqr
.type	ecp_sm2p256_sqr,%function
	AARCH64_SIGN_LINK_REGISTER
	# Store scalar registers
	stp	x29,x30,[sp,#-80]!
	# ========================
	# ------------------------
	# ------------------------
	# s7 s6 s5 s4 s3 s2 s1 s0
	# ========================
	### s4*s7 + s5*s6 ###
	### 2*(t3,t2,s0,s3,s2,s1) ###
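	# (Added note: in a square every cross product si*sj with i != j occurs
	# twice, hence the doubling of the partial sums here.)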
	# result of mul: s7 s6 s5 s4 s3 s2 s1 s0
	# Restore scalar registers
	AARCH64_VALIDATE_LINK_REGISTER
.size	ecp_sm2p256_sqr,.-ecp_sm2p256_sqr
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;
close STDOUT or die "error closing STDOUT: $!"; # enforce flush