#! /usr/bin/env perl
# Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
	or die "can't call $xlate: $!";
*STDOUT=*OUT;

my ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("x$_",(7..14));
my ($a8,$a10,$a12,$a14,$a9,$a11,$a13,$a15)=map("x$_",(7..14));
my ($t0,$t1,$t2,$t3)=map("x$_",(3..6));
my ($t4,$t5,$t6,$t7,$t8)=map("x$_",(15..17,19,20));
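
# Note: $s0..$s7 and $a8/$a10/../$a15 alias the same registers x7-x14.
# The $a* names are used by the RDC macro once each 64-bit limb has been
# split into 32-bit halves: the even-numbered a-words are the masked low
# halves and the odd-numbered a-words the shifted-down high halves.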

sub bn_mod_add() {
	my $mod = shift;
$code.=<<___;
	// Load inputs
	ldp	$s0,$s1,[x1]
	ldp	$s2,$s3,[x1,#16]
	ldp	$s4,$s5,[x2]
	ldp	$s6,$s7,[x2,#16]

	// Addition
	adds	$s0,$s0,$s4
	adcs	$s1,$s1,$s5
	adcs	$s2,$s2,$s6
	adcs	$s3,$s3,$s7
	adc	$t4,xzr,xzr

	// Load polynomial
	adr	x2,$mod
	ldp	$s4,$s5,[x2]
	ldp	$s6,$s7,[x2,#16]

	// Back up the sum
	mov	$t0,$s0
	mov	$t1,$s1
	mov	$t2,$s2
	mov	$t3,$s3

	// Sub polynomial
	subs	$t0,$t0,$s4
	sbcs	$t1,$t1,$s5
	sbcs	$t2,$t2,$s6
	sbcs	$t3,$t3,$s7
	sbcs	$t4,$t4,xzr

	// Select based on carry
	csel	$s0,$s0,$t0,cc
	csel	$s1,$s1,$t1,cc
	csel	$s2,$s2,$t2,cc
	csel	$s3,$s3,$t3,cc

	// Store results
	stp	$s0,$s1,[x0]
	stp	$s2,$s3,[x0,#16]
___
}
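
# For reference, a minimal C model of the pattern bn_mod_add emits
# (wide add, trial subtraction of the modulus, select on the borrow).
# The real code selects with csel and is constant-time; this model is
# not, and the helper name is ours, not part of the build:
#
#   #include <stdint.h>
#   // r = (a + b) mod p, 4x64-bit little-endian limbs, a,b < p.
#   static void mod_add_ref(uint64_t r[4], const uint64_t a[4],
#                           const uint64_t b[4], const uint64_t p[4])
#   {
#       uint64_t s[4], d[4], carry = 0, borrow = 0;
#       for (int i = 0; i < 4; i++) {          // adds/adcs chain
#           unsigned __int128 t = (unsigned __int128)a[i] + b[i] + carry;
#           s[i] = (uint64_t)t;
#           carry = (uint64_t)(t >> 64);
#       }
#       for (int i = 0; i < 4; i++) {          // subs/sbcs chain
#           d[i] = s[i] - p[i] - borrow;
#           borrow = (s[i] < p[i]) || (s[i] == p[i] && borrow);
#       }
#       borrow = borrow > carry;               // sbcs t4,t4,xzr
#       for (int i = 0; i < 4; i++)            // csel ...,cc
#           r[i] = borrow ? s[i] : d[i];
#   }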

sub bn_mod_sub() {
	my $mod = shift;
$code.=<<___;
	// Load inputs
	ldp	$s0,$s1,[x1]
	ldp	$s2,$s3,[x1,#16]
	ldp	$s4,$s5,[x2]
	ldp	$s6,$s7,[x2,#16]

	// Subtraction
	subs	$s0,$s0,$s4
	sbcs	$s1,$s1,$s5
	sbcs	$s2,$s2,$s6
	sbcs	$s3,$s3,$s7
	sbc	$t4,xzr,xzr

	// Load polynomial
	adr	x2,$mod
	ldp	$s4,$s5,[x2]
	ldp	$s6,$s7,[x2,#16]

	// Back up the difference
	mov	$t0,$s0
	mov	$t1,$s1
	mov	$t2,$s2
	mov	$t3,$s3

	// Add polynomial
	adds	$t0,$t0,$s4
	adcs	$t1,$t1,$s5
	adcs	$t2,$t2,$s6
	adcs	$t3,$t3,$s7
	tst	$t4,$t4

	// Select based on borrow
	csel	$s0,$s0,$t0,eq
	csel	$s1,$s1,$t1,eq
	csel	$s2,$s2,$t2,eq
	csel	$s3,$s3,$t3,eq

	// Store results
	stp	$s0,$s1,[x0]
	stp	$s2,$s3,[x0,#16]
___
}
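
# Subtraction mirrors the sketch above, but the fixup adds the modulus
# back when the top-level subtraction borrowed.  A C model of just that
# fixup step (hypothetical helper, not part of the build):
#
#   #include <stdint.h>
#   // d[] = a - b (mod 2^256), borrow = 1 iff a < b (the sbc t4,xzr,xzr
#   // result viewed as 0 or 1); computes r = (a - b) mod p.
#   static void mod_sub_fixup(uint64_t r[4], const uint64_t d[4],
#                             uint64_t borrow, const uint64_t p[4])
#   {
#       uint64_t carry = 0;
#       for (int i = 0; i < 4; i++) {      // adds/adcs of p + csel ...,eq
#           unsigned __int128 t = (unsigned __int128)d[i] + p[i] + carry;
#           r[i] = borrow ? (uint64_t)t : d[i];
#           carry = (uint64_t)(t >> 64);
#       }
#   }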

sub bn_mod_div_by_2() {
	my $mod = shift;
$code.=<<___;
	// Load inputs
	ldp	$s0,$s1,[x1]
	ldp	$s2,$s3,[x1,#16]

	// Save the low word to check the least significant bit later
	mov	$t0,$s0

	// Right shift 1
	extr	$s0,$s1,$s0,#1
	extr	$s1,$s2,$s1,#1
	extr	$s2,$s3,$s2,#1
	lsr	$s3,$s3,#1

	// Load (m + 1) / 2 (m = p or n)
	adr	x2,$mod
	ldp	$s4,$s5,[x2]
	ldp	$s6,$s7,[x2,#16]

	// Parity check
	tst	$t0,#1
	csel	$s4,xzr,$s4,eq
	csel	$s5,xzr,$s5,eq
	csel	$s6,xzr,$s6,eq
	csel	$s7,xzr,$s7,eq

	// Add
	adds	$s0,$s0,$s4
	adcs	$s1,$s1,$s5
	adcs	$s2,$s2,$s6
	adc	$s3,$s3,$s7

	// Store results
	stp	$s0,$s1,[x0]
	stp	$s2,$s3,[x0,#16]
___
}
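
# Halving needs no extra-wide arithmetic because p and n are odd: for
# even a, a/2 mod m is simply a >> 1; for odd a, a/2 mod m = (a + m)/2 =
# (a >> 1) + (m + 1)/2, which is exactly why .Lpoly_div_2/.Lord_div_2
# store (m + 1)/2.  A one-limb C model of the same trick (hypothetical
# helper, single limb only to keep it short):
#
#   #include <stdint.h>
#   // r = a/2 mod m, for odd m < 2^63 and a < m.
#   static uint64_t half_mod_ref(uint64_t a, uint64_t m)
#   {
#       uint64_t half = a >> 1;                    // extr/lsr chain
#       uint64_t fix  = (a & 1) ? (m + 1) / 2 : 0; // tst + csel
#       return half + fix;                         // adds/adcs/adc
#   }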

{
$code.=<<___;
#include "arm_arch.h"
.arch	armv8-a
.text

.align	5
// The field modulus p
.Lpoly:
	.quad	0xffffffffffffffff,0xffffffff00000000,0xffffffffffffffff,0xfffffffeffffffff
// The curve order n
.Lord:
	.quad	0x53bbf40939d54123,0x7203df6b21c6052b,0xffffffffffffffff,0xfffffffeffffffff
// (p + 1) / 2
.Lpoly_div_2:
	.quad	0x8000000000000000,0xffffffff80000000,0xffffffffffffffff,0x7fffffff7fffffff
// (n + 1) / 2
.Lord_div_2:
	.quad	0xa9ddfa049ceaa092,0xb901efb590e30295,0xffffffffffffffff,0x7fffffff7fffffff
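
// Note: p = 2^256 - 2^224 - 2^96 + 2^64 - 1, so
//       2^256 == 2^224 + 2^96 - 2^64 + 1 (mod p).
// This identity is what the RDC macro below uses to fold the upper
// 256 bits of a 512-bit product back into the lower 256 bits.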

// void bn_rshift1(BN_ULONG *a);
.globl	bn_rshift1
.type	bn_rshift1,%function
.align	5
bn_rshift1:
	AARCH64_VALID_CALL_TARGET
	// Load inputs
	ldp	$s0,$s1,[x0]
	ldp	$s2,$s3,[x0,#16]

	// Right shift
	extr	$s0,$s1,$s0,#1
	extr	$s1,$s2,$s1,#1
	extr	$s2,$s3,$s2,#1
	lsr	$s3,$s3,#1

	// Store results
	stp	$s0,$s1,[x0]
	stp	$s2,$s3,[x0,#16]

	ret
.size	bn_rshift1,.-bn_rshift1

// void bn_sub(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b);
.globl	bn_sub
.type	bn_sub,%function
.align	5
bn_sub:
	AARCH64_VALID_CALL_TARGET
	// Load inputs
	ldp	$s0,$s1,[x1]
	ldp	$s2,$s3,[x1,#16]
	ldp	$s4,$s5,[x2]
	ldp	$s6,$s7,[x2,#16]

	// Subtraction
	subs	$s0,$s0,$s4
	sbcs	$s1,$s1,$s5
	sbcs	$s2,$s2,$s6
	sbc	$s3,$s3,$s7

	// Store results
	stp	$s0,$s1,[x0]
	stp	$s2,$s3,[x0,#16]

	ret
.size	bn_sub,.-bn_sub

// void ecp_sm2p256_div_by_2(BN_ULONG *r,const BN_ULONG *a);
.globl	ecp_sm2p256_div_by_2
.type	ecp_sm2p256_div_by_2,%function
.align	5
ecp_sm2p256_div_by_2:
	AARCH64_VALID_CALL_TARGET
___
&bn_mod_div_by_2(".Lpoly_div_2");
$code.=<<___;
	ret
.size	ecp_sm2p256_div_by_2,.-ecp_sm2p256_div_by_2

// void ecp_sm2p256_div_by_2_mod_ord(BN_ULONG *r,const BN_ULONG *a);
.globl	ecp_sm2p256_div_by_2_mod_ord
.type	ecp_sm2p256_div_by_2_mod_ord,%function
.align	5
ecp_sm2p256_div_by_2_mod_ord:
	AARCH64_VALID_CALL_TARGET
___
&bn_mod_div_by_2(".Lord_div_2");
$code.=<<___;
	ret
.size	ecp_sm2p256_div_by_2_mod_ord,.-ecp_sm2p256_div_by_2_mod_ord

// void ecp_sm2p256_mul_by_3(BN_ULONG *r,const BN_ULONG *a);
.globl	ecp_sm2p256_mul_by_3
.type	ecp_sm2p256_mul_by_3,%function
.align	5
ecp_sm2p256_mul_by_3:
	AARCH64_VALID_CALL_TARGET
	// Load inputs
	ldp	$s0,$s1,[x1]
	ldp	$s2,$s3,[x1,#16]

	// 2*a
	adds	$s0,$s0,$s0
	adcs	$s1,$s1,$s1
	adcs	$s2,$s2,$s2
	adcs	$s3,$s3,$s3
	adcs	$t4,xzr,xzr

	mov	$t0,$s0
	mov	$t1,$s1
	mov	$t2,$s2
	mov	$t3,$s3

	// Sub polynomial
	adr	x2,.Lpoly
	ldp	$s4,$s5,[x2]
	ldp	$s6,$s7,[x2,#16]
	subs	$s0,$s0,$s4
	sbcs	$s1,$s1,$s5
	sbcs	$s2,$s2,$s6
	sbcs	$s3,$s3,$s7
	sbcs	$t4,$t4,xzr

	csel	$s0,$s0,$t0,cs
	csel	$s1,$s1,$t1,cs
	csel	$s2,$s2,$t2,cs
	csel	$s3,$s3,$t3,cs
	eor	$t4,$t4,$t4

	// 3*a
	ldp	$s4,$s5,[x1]
	ldp	$s6,$s7,[x1,#16]
	adds	$s0,$s0,$s4
	adcs	$s1,$s1,$s5
	adcs	$s2,$s2,$s6
	adcs	$s3,$s3,$s7
	adcs	$t4,xzr,xzr

	mov	$t0,$s0
	mov	$t1,$s1
	mov	$t2,$s2
	mov	$t3,$s3

	// Sub polynomial
	adr	x2,.Lpoly
	ldp	$s4,$s5,[x2]
	ldp	$s6,$s7,[x2,#16]
	subs	$s0,$s0,$s4
	sbcs	$s1,$s1,$s5
	sbcs	$s2,$s2,$s6
	sbcs	$s3,$s3,$s7
	sbcs	$t4,$t4,xzr

	csel	$s0,$s0,$t0,cs
	csel	$s1,$s1,$t1,cs
	csel	$s2,$s2,$t2,cs
	csel	$s3,$s3,$t3,cs

	// Store results
	stp	$s0,$s1,[x0]
	stp	$s2,$s3,[x0,#16]

	ret
.size	ecp_sm2p256_mul_by_3,.-ecp_sm2p256_mul_by_3

// void ecp_sm2p256_add(BN_ULONG *r,const BN_ULONG *a,const BN_ULONG *b);
.globl	ecp_sm2p256_add
.type	ecp_sm2p256_add,%function
.align	5
ecp_sm2p256_add:
	AARCH64_VALID_CALL_TARGET
___
&bn_mod_add(".Lpoly");
$code.=<<___;
	ret
.size	ecp_sm2p256_add,.-ecp_sm2p256_add

// void ecp_sm2p256_sub(BN_ULONG *r,const BN_ULONG *a,const BN_ULONG *b);
.globl	ecp_sm2p256_sub
.type	ecp_sm2p256_sub,%function
.align	5
ecp_sm2p256_sub:
	AARCH64_VALID_CALL_TARGET
___
&bn_mod_sub(".Lpoly");
$code.=<<___;
	ret
.size	ecp_sm2p256_sub,.-ecp_sm2p256_sub

// void ecp_sm2p256_sub_mod_ord(BN_ULONG *r,const BN_ULONG *a,const BN_ULONG *b);
.globl	ecp_sm2p256_sub_mod_ord
.type	ecp_sm2p256_sub_mod_ord,%function
.align	5
ecp_sm2p256_sub_mod_ord:
	AARCH64_VALID_CALL_TARGET
___
&bn_mod_sub(".Lord");
$code.=<<___;
	ret
.size	ecp_sm2p256_sub_mod_ord,.-ecp_sm2p256_sub_mod_ord

.macro RDC
	// a = | s7 | ... | s0 |, where si are 64-bit quantities
	//   = |a15|a14| ... |a1|a0|, where ai are 32-bit quantities
	// |    s7     |    s6     |    s5     |    s4     |
	// | a15 | a14 | a13 | a12 | a11 | a10 | a9  | a8  |
	// |    s3     |    s2     |    s1     |    s0     |
	// | a7  | a6  | a5  | a4  | a3  | a2  | a1  | a0  |
	// =================================================
	// | a8  | a11 | a10 | a9  | a8  |  0  |    s4     | (+)
	// | a9  | a15 |    s6     | a11 |  0  | a10 | a9  | (+)
	// | a10 |  0  | a14 | a13 | a12 |  0  |    s5     | (+)
	// | a11 |  0  |    s7     | a13 |  0  | a12 | a11 | (+)
	// | a12 |  0  |    s7     | a13 |  0  |    s6     | (+)
	// | a12 |  0  |  0  | a15 | a14 |  0  | a14 | a13 | (+)
	// | a13 |  0  |  0  |  0  | a15 |  0  | a14 | a13 | (+)
	// | a13 |  0  |  0  |  0  |  0  |  0  |    s7     | (+)
	// | a14 |  0  |  0  |  0  |  0  |  0  |    s7     | (+)
	// | a14 |  0  |  0  |  0  |  0  |  0  |  0  | a15 | (+)
	// | a15 |  0  |  0  |  0  |  0  |  0  |  0  | a15 | (+)
	// | a15 |  0  |  0  |  0  |  0  |  0  |  0  |  0  | (+)
	// |    s7     |  0  |  0  |  0  |  0  |  0  |  0  | (+)
	// |  0  |  0  |  0  |  0  |  0  | a8  |  0  |  0  | (-)
	// |  0  |  0  |  0  |  0  |  0  | a9  |  0  |  0  | (-)
	// |  0  |  0  |  0  |  0  |  0  | a13 |  0  |  0  | (-)
	// |  0  |  0  |  0  |  0  |  0  | a14 |  0  |  0  | (-)
	// | U[7]| U[6]| U[5]| U[4]| U[3]| U[2]| U[1]| U[0]|
	// |    V[3]   |    V[2]   |    V[1]   |    V[0]   |
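
	// Rows marked (+) are added, rows marked (-) subtracted.  They come
	// from repeatedly folding each 32-bit word ai (i >= 8) of the upper
	// half down with the identity noted above .Lpoly:
	//   ai * 2^(32*i) == ai * 2^(32*(i-8)) * (2^224 + 2^96 - 2^64 + 1) (mod p)
	// U[] names the resulting 32-bit column sums and V[] the 64-bit
	// words they are recombined into in steps 3-5 below.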

	// 1. 64-bit addition
	// t2=s6+s7+s7
	adds	$t2,$s6,$s7
	adcs	$t1,xzr,xzr
	adds	$t2,$t2,$s7
	adcs	$t1,$t1,xzr
	// t3=s4+s5+t2
	adds	$t3,$s4,$t2
	adcs	$t4,$t1,xzr
	adds	$t3,$t3,$s5
	adcs	$t4,$t4,xzr
	// sum
	adds	$s0,$s0,$t3
	adcs	$s1,$s1,$t4
	adcs	$s2,$s2,$t2
	adcs	$s3,$s3,$s7
	adcs	$t0,xzr,xzr
	adds	$s3,$s3,$t1
	adcs	$t0,$t0,xzr

	stp	$s0,$s1,[sp,#32]
	stp	$s2,$s3,[sp,#48]

	// 2. 64-bit to 32-bit spread
	mov	$t1,#0xffffffff
	mov	$s0,$s4
	mov	$s1,$s5
	mov	$s2,$s6
	mov	$s3,$s7
	and	$s0,$s0,$t1	// a8
	and	$s1,$s1,$t1	// a10
	and	$s2,$s2,$t1	// a12
	and	$s3,$s3,$t1	// a14
	lsr	$s4,$s4,#32	// a9
	lsr	$s5,$s5,#32	// a11
	lsr	$s6,$s6,#32	// a13
	lsr	$s7,$s7,#32	// a15

	// 3. 32-bit addition
	add	$t1,$a14,$a12	// t1 <- a12 + a14
	add	$t2,$a15,$a13	// t2 <- a13 + a15
	add	$t3,$a8,$a9	// t3 <- a8 + a9
	add	$t4,$a14,$a10	// t4 <- a10 + a14
	add	$a15,$a15,$a11	// a15 <- a11 + a15
	add	$a12,$t2,$t1	// a12 <- a12 + a13 + a14 + a15
	add	$a10,$a10,$a12	// a10 <- a10 + a12 + a13 + a14 + a15
	add	$a10,$a10,$a12	// a10 <- a10 + 2*(a12 + a13 + a14 + a15)
	add	$a10,$a10,$t3	// a10 <- a8 + a9 + a10 + 2*(a12 + a13 + a14 + a15)
	add	$a10,$a10,$a11	// a10 <- a8 + a9 + a10 + a11 + 2*(a12 + a13 + a14 + a15)
	add	$a12,$a12,$a13	// a12 <- a12 + 2*a13 + a14 + a15
	add	$a12,$a12,$a11	// a12 <- a11 + a12 + 2*a13 + a14 + a15
	add	$a12,$a12,$a8	// a12 <- a8 + a11 + a12 + 2*a13 + a14 + a15
	add	$t3,$t3,$a14	// t3 <- a8 + a9 + a14
	add	$t3,$t3,$a13	// t3 <- a8 + a9 + a13 + a14
	add	$a9,$a9,$t2	// a9 <- a9 + a13 + a15
	add	$a11,$a11,$a9	// a11 <- a9 + a11 + a13 + a15
	add	$a11,$a11,$t2	// a11 <- a9 + a11 + 2*(a13 + a15)
	add	$t1,$t1,$t4	// t1 <- a10 + a12 + 2*a14

	// U[0]  s5   a9 + a11 + 2*(a13 + a15)
	// U[1]  t1   a10 + a12 + 2*a14
	// U[2] -t3   a8 + a9 + a13 + a14
	// U[3]  s2   a8 + a11 + a12 + 2*a13 + a14 + a15
	// U[4]  s4   a9 + a13 + a15
	// U[5]  t4   a10 + a14
	// U[6]  s7   a11 + a15
	// U[7]  s1   a8 + a9 + a10 + a11 + 2*(a12 + a13 + a14 + a15)

	// 4. 32-bit to 64-bit
	lsl	$s0,$t1,#32
	extr	$t1,$s2,$t1,#32
	extr	$s2,$t4,$s2,#32
	extr	$t4,$s1,$t4,#32
	lsr	$s1,$s1,#32

	// 5. 64-bit addition
	adds	$s5,$s5,$s0
	adcs	$t1,$t1,xzr
	adcs	$s4,$s4,$s2
	adcs	$s7,$s7,$t4
	adcs	$t0,$t0,$s1

	// V[0]  s5
	// V[1]  t1
	// V[2]  s4
	// V[3]  s7
	// carry t0
	// sub   t3

	// 6. Process s0-s3
	ldp	$s0,$s1,[sp,#32]
	ldp	$s2,$s3,[sp,#48]
	// add with V0-V3
	adds	$s0,$s0,$s5
	adcs	$s1,$s1,$t1
	adcs	$s2,$s2,$s4
	adcs	$s3,$s3,$s7
	adcs	$t0,$t0,xzr
	// sub with t3
	subs	$s1,$s1,$t3
	sbcs	$s2,$s2,xzr
	sbcs	$s3,$s3,xzr
	sbcs	$t0,$t0,xzr

	// 7. MOD
	// First Mod
	lsl	$t1,$t0,#32
	subs	$t2,$t1,$t0

	adds	$s0,$s0,$t0
	adcs	$s1,$s1,$t2
	adcs	$s2,$s2,xzr
	adcs	$s3,$s3,$t1

	// Last Mod
	// return y - p if y >= p else y
	mov	$s4,$s0
	mov	$s5,$s1
	mov	$s6,$s2
	mov	$s7,$s3

	adr	$t0,.Lpoly
	ldp	$t1,$t2,[$t0]
	ldp	$t3,$t4,[$t0,#16]

	adcs	$t5,xzr,xzr

	subs	$s0,$s0,$t1
	sbcs	$s1,$s1,$t2
	sbcs	$s2,$s2,$t3
	sbcs	$s3,$s3,$t4
	sbcs	$t5,$t5,xzr

	csel	$s0,$s0,$s4,cs
	csel	$s1,$s1,$s5,cs
	csel	$s2,$s2,$s6,cs
	csel	$s3,$s3,$s7,cs

.endm

// void ecp_sm2p256_mul(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b);
.globl	ecp_sm2p256_mul
.type	ecp_sm2p256_mul,%function
.align	5
ecp_sm2p256_mul:
	AARCH64_SIGN_LINK_REGISTER
	// Store scalar registers
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	x16,x17,[sp,#16]
	stp	x19,x20,[sp,#64]

	// Load inputs
	ldp	$s0,$s1,[x1]
	ldp	$s2,$s3,[x1,#16]
	ldp	$s4,$s5,[x2]
	ldp	$s6,$s7,[x2,#16]

	// ### multiplication ###
	// ========================
	//             s3 s2 s1 s0
	// *           s7 s6 s5 s4
	// ------------------------
	// +           s0 s0 s0 s0
	//              *  *  *  *
	//             s7 s6 s5 s4
	//          s1 s1 s1 s1
	//           *  *  *  *
	//          s7 s6 s5 s4
	//       s2 s2 s2 s2
	//        *  *  *  *
	//       s7 s6 s5 s4
	//    s3 s3 s3 s3
	//     *  *  *  *
	//    s7 s6 s5 s4
	// ------------------------
	// s7 s6 s5 s4 s3 s2 s1 s0
	// ========================
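
	// Each partial product below is one 64x64->128 multiply: mul
	// produces the low 64 bits and umulh the high 64 bits, and the
	// results are accumulated column by column.  A C sketch of a single
	// step, assuming a compiler with unsigned __int128:
	//
	//   #include <stdint.h>
	//   static void mul64(uint64_t a, uint64_t b,
	//                     uint64_t *lo, uint64_t *hi)
	//   {
	//       unsigned __int128 t = (unsigned __int128)a * b;
	//       *lo = (uint64_t)t;           // == mul   lo,a,b
	//       *hi = (uint64_t)(t >> 64);   // == umulh hi,a,b
	//   }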

	// ### s0*s4 ###
	mul	$t5,$s0,$s4
	umulh	$t2,$s0,$s4

	// ### s1*s4 + s0*s5 ###
	mul	$t0,$s1,$s4
	umulh	$t1,$s1,$s4
	adds	$t2,$t2,$t0
	adcs	$t3,$t1,xzr

	mul	$t0,$s0,$s5
	umulh	$t1,$s0,$s5
	adds	$t2,$t2,$t0
	adcs	$t3,$t3,$t1
	adcs	$t4,xzr,xzr

	// ### s2*s4 + s1*s5 + s0*s6 ###
	mul	$t0,$s2,$s4
	umulh	$t1,$s2,$s4
	adds	$t3,$t3,$t0
	adcs	$t4,$t4,$t1

	mul	$t0,$s1,$s5
	umulh	$t1,$s1,$s5
	adds	$t3,$t3,$t0
	adcs	$t4,$t4,$t1
	adcs	$t6,xzr,xzr

	mul	$t0,$s0,$s6
	umulh	$t1,$s0,$s6
	adds	$t3,$t3,$t0
	adcs	$t4,$t4,$t1
	adcs	$t6,$t6,xzr

	// ### s3*s4 + s2*s5 + s1*s6 + s0*s7 ###
	mul	$t0,$s3,$s4
	umulh	$t1,$s3,$s4
	adds	$t4,$t4,$t0
	adcs	$t6,$t6,$t1
	adcs	$t7,xzr,xzr

	mul	$t0,$s2,$s5
	umulh	$t1,$s2,$s5
	adds	$t4,$t4,$t0
	adcs	$t6,$t6,$t1
	adcs	$t7,$t7,xzr

	mul	$t0,$s1,$s6
	umulh	$t1,$s1,$s6
	adds	$t4,$t4,$t0
	adcs	$t6,$t6,$t1
	adcs	$t7,$t7,xzr

	mul	$t0,$s0,$s7
	umulh	$t1,$s0,$s7
	adds	$t4,$t4,$t0
	adcs	$t6,$t6,$t1
	adcs	$t7,$t7,xzr

	// ### s3*s5 + s2*s6 + s1*s7 ###
	mul	$t0,$s3,$s5
	umulh	$t1,$s3,$s5
	adds	$t6,$t6,$t0
	adcs	$t7,$t7,$t1
	adcs	$t8,xzr,xzr

	mul	$t0,$s2,$s6
	umulh	$t1,$s2,$s6
	adds	$t6,$t6,$t0
	adcs	$t7,$t7,$t1
	adcs	$t8,$t8,xzr

	mul	$t0,$s1,$s7
	umulh	$t1,$s1,$s7
	adds	$s4,$t6,$t0
	adcs	$t7,$t7,$t1
	adcs	$t8,$t8,xzr

	// ### s3*s6 + s2*s7 ###
	mul	$t0,$s3,$s6
	umulh	$t1,$s3,$s6
	adds	$t7,$t7,$t0
	adcs	$t8,$t8,$t1
	adcs	$t6,xzr,xzr

	mul	$t0,$s2,$s7
	umulh	$t1,$s2,$s7
	adds	$s5,$t7,$t0
	adcs	$t8,$t8,$t1
	adcs	$t6,$t6,xzr

	// ### s3*s7 ###
	mul	$t0,$s3,$s7
	umulh	$t1,$s3,$s7
	adds	$s6,$t8,$t0
	adcs	$s7,$t6,$t1

	mov	$s0,$t5
	mov	$s1,$t2
	mov	$s2,$t3
	mov	$s3,$t4

	// result of mul: s7 s6 s5 s4 s3 s2 s1 s0

	// ### Reduction ###
	RDC

	stp	$s0,$s1,[x0]
	stp	$s2,$s3,[x0,#16]

	// Restore scalar registers
	ldp	x16,x17,[sp,#16]
	ldp	x19,x20,[sp,#64]
	ldp	x29,x30,[sp],#80

	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_sm2p256_mul,.-ecp_sm2p256_mul

// void ecp_sm2p256_sqr(BN_ULONG *r, const BN_ULONG *a);
.globl	ecp_sm2p256_sqr
.type	ecp_sm2p256_sqr,%function
.align	5

ecp_sm2p256_sqr:
	AARCH64_SIGN_LINK_REGISTER
	// Store scalar registers
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	x16,x17,[sp,#16]
	stp	x19,x20,[sp,#64]

	// Load inputs
	ldp	$s4,$s5,[x1]
	ldp	$s6,$s7,[x1,#16]

	// ### square ###
	// ========================
	//             s7 s6 s5 s4
	// *           s7 s6 s5 s4
	// ------------------------
	// +           s4 s4 s4 s4
	//              *  *  *  *
	//             s7 s6 s5 s4
	//          s5 s5 s5 s5
	//           *  *  *  *
	//          s7 s6 s5 s4
	//       s6 s6 s6 s6
	//        *  *  *  *
	//       s7 s6 s5 s4
	//    s7 s7 s7 s7
	//     *  *  *  *
	//    s7 s6 s5 s4
	// ------------------------
	// s7 s6 s5 s4 s3 s2 s1 s0
	// ========================
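
	// Only the cross products are computed and then doubled, because for
	// limbs xi:
	//   x^2 = sum_i xi^2 * 2^(128*i) + 2 * sum_{i<j} xi*xj * 2^(64*(i+j))
	// so squaring needs 10 of the 16 64x64 multiplies a general
	// multiplication uses (4 squares + 6 doubled cross products).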

	// ### s4*s5 ###
	mul	$s1,$s4,$s5
	umulh	$s2,$s4,$s5

	// ### s4*s6 ###
	mul	$t0,$s6,$s4
	umulh	$s3,$s6,$s4
	adds	$s2,$s2,$t0
	adcs	$s3,$s3,xzr

	// ### s4*s7 + s5*s6 ###
	mul	$t0,$s7,$s4
	umulh	$t1,$s7,$s4
	adds	$s3,$s3,$t0
	adcs	$s0,$t1,xzr

	mul	$t0,$s6,$s5
	umulh	$t1,$s6,$s5
	adds	$s3,$s3,$t0
	adcs	$s0,$s0,$t1
	adcs	$t2,xzr,xzr

	// ### s5*s7 ###
	mul	$t0,$s7,$s5
	umulh	$t1,$s7,$s5
	adds	$s0,$s0,$t0
	adcs	$t2,$t2,$t1

	// ### s6*s7 ###
	mul	$t0,$s7,$s6
	umulh	$t1,$s7,$s6
	adds	$t2,$t2,$t0
	adcs	$t3,$t1,xzr

	// ### 2*(t3,t2,s0,s3,s2,s1) ###
	adds	$s1,$s1,$s1
	adcs	$s2,$s2,$s2
	adcs	$s3,$s3,$s3
	adcs	$s0,$s0,$s0
	adcs	$t2,$t2,$t2
	adcs	$t3,$t3,$t3
	adcs	$t4,xzr,xzr

	// ### s4*s4 ###
	mul	$t5,$s4,$s4
	umulh	$t6,$s4,$s4

	// ### s5*s5 ###
	mul	$s4,$s5,$s5
	umulh	$s5,$s5,$s5

	// ### s6*s6 ###
	mul	$t0,$s6,$s6
	umulh	$t1,$s6,$s6

	// ### s7*s7 ###
	mul	$t7,$s7,$s7
	umulh	$t8,$s7,$s7

	adds	$s1,$s1,$t6
	adcs	$s2,$s2,$s4
	adcs	$s3,$s3,$s5
	adcs	$s0,$s0,$t0
	adcs	$t2,$t2,$t1
	adcs	$t3,$t3,$t7
	adcs	$t4,$t4,$t8

	mov	$s4,$s0
	mov	$s0,$t5
	mov	$s5,$t2
	mov	$s6,$t3
	mov	$s7,$t4

	// result of sqr: s7 s6 s5 s4 s3 s2 s1 s0

	// ### Reduction ###
	RDC

	stp	$s0,$s1,[x0]
	stp	$s2,$s3,[x0,#16]

	// Restore scalar registers
	ldp	x16,x17,[sp,#16]
	ldp	x19,x20,[sp,#64]
	ldp	x29,x30,[sp],#80

	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_sm2p256_sqr,.-ecp_sm2p256_sqr
___
}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!"; # enforce flush